| 1 |
import cgi |
|---|
| 2 |
import os |
|---|
| 3 |
import StringIO |
|---|
| 4 |
import traceback |
|---|
| 5 |
|
|---|
| 6 |
import cherrypy |
|---|
| 7 |
from basefilter import BaseFilter |
|---|
| 8 |
|
|---|
| 9 |
|
|---|
| 10 |
class TidyFilter(BaseFilter):
    """Filter that runs the response through Tidy.

    Note that we use the standalone Tidy tool rather than the python
    mxTidy module. This is because this module doesn't seem to be
    stable and it crashes on some HTML pages (which means that the
    server would also crash).

    Configuration keys read from ``cherrypy.config``:

    * ``tidy_filter.on`` -- the filter does nothing unless this is true.
    * ``tidy_filter.tmp_dir`` -- directory holding the scratch files
      ``page.html``, ``tidy.out`` and ``tidy.err``.
    * ``tidy_filter.tidy_path`` -- path of the Tidy executable.
    * ``tidy_filter.strict_xml`` -- pass ``-xml`` to Tidy and, when Tidy
      reports nothing worth showing, also parse the body as XML.
    * ``tidy_filter.errors_to_ignore`` -- list of substrings; a Tidy
      message containing any of them is dropped.
    """

    def before_finalize(self):
        """Replace an invalid ``text/html`` body with an error report.

        The response body is collapsed, written to a scratch file and
        checked with the external Tidy tool.  If Tidy reports warnings
        or errors that are not configured to be ignored, the response
        body is replaced by a page listing those messages followed by
        the numbered lines of the original body.  Otherwise, when
        ``tidy_filter.strict_xml`` is set, the body is parsed as XML and
        a parse failure produces the same kind of report.

        Responses whose media type is not ``text/html`` are left alone
        (apart from the body having been collapsed).
        """
        if not cherrypy.config.get('tidy_filter.on', False):
            return

        # Collapsed before the content-type check, as the body has to be
        # available as a single string for everything that follows.
        original_body = cherrypy.response.collapse_body()

        header = cherrypy.response.headers.get('Content-Type', '')
        media_type, encoding = self._parse_content_type(header)
        if media_type != 'text/html':
            return

        strict_xml = cherrypy.config.get('tidy_filter.strict_xml', False)
        report = self._run_tidy(original_body, encoding, strict_xml)
        messages = self._significant_messages(report)

        new_body = None
        if messages:
            new_body = self._error_page(
                "Wrong HTML:", '\n'.join(messages), original_body)
        elif strict_xml:
            new_body = self._xml_error_page(original_body, encoding)

        if new_body is not None:
            cherrypy.response.body = new_body
            # The old Content-Length no longer matches the replaced body.
            cherrypy.response.headers.pop("Content-Length", None)

    def _parse_content_type(self, header):
        """Return ``(media_type, charset)`` from a Content-Type value.

        ``charset`` is ``''`` when the header carries none.  Only the
        charset value itself is returned: anything after a following
        ``;`` and any surrounding whitespace or quotes are removed.
        """
        media_type = header.split(';')[0].strip().lower()
        charset = ''
        marker = 'charset='
        pos = header.lower().find(marker)
        if pos != -1:
            value = header[pos + len(marker):].split(';')[0]
            charset = value.strip().strip('"\'')
        return media_type, charset

    def _run_tidy(self, body, encoding, strict_xml):
        """Run Tidy over ``body`` and return the text it reported.

        Tidy is told to write its messages to ``tidy.err`` (``-f``) and
        its output to ``tidy.out`` (``-o``); only the messages are read
        back.  Tidy's exit status is ignored.
        """
        # Local import: keeps the file-level import block untouched.
        import subprocess

        # NOTE(review): the scratch file names are fixed, so requests
        # handled concurrently with the same tmp_dir would overwrite
        # each other's files -- confirm whether that can happen here.
        tmp_dir = cherrypy.config.get('tidy_filter.tmp_dir')
        page_file = os.path.join(tmp_dir, 'page.html')
        out_file = os.path.join(tmp_dir, 'tidy.out')
        err_file = os.path.join(tmp_dir, 'tidy.err')

        page = open(page_file, 'wb')
        try:
            page.write(body)
        finally:
            page.close()

        # An argument list (no shell) keeps paths with spaces or shell
        # metacharacters intact.
        command = [cherrypy.config.get('tidy_filter.tidy_path')]
        # The charset is passed as a Tidy flag with its dashes removed,
        # e.g. "utf-8" becomes "-utf8".
        tidy_encoding = encoding.replace('-', '')
        if tidy_encoding:
            command.append('-' + tidy_encoding)
        if strict_xml:
            command.append('-xml')
        command.extend(['-f', err_file, '-o', out_file, page_file])
        subprocess.call(command)

        report = open(err_file, 'rb')
        try:
            return report.read()
        finally:
            report.close()

    def _significant_messages(self, report):
        """Return the warning/error lines of ``report`` not ignored by config."""
        ignored = cherrypy.config.get('tidy_filter.errors_to_ignore', [])
        kept = []
        for message in report.splitlines():
            if message.find('Warning') == -1 and message.find('Error') == -1:
                continue
            for fragment in ignored:
                if message.find(fragment) != -1:
                    break
            else:
                # No ignore pattern matched this message.
                kept.append(message)
        return kept

    def _xml_error_page(self, body, encoding):
        """Parse ``body`` as XML; return an error page, or None if it parses."""
        from elementtree.ElementTree import parse

        # These entity references are turned into plain text before
        # parsing (presumably so the XML parser does not reject them --
        # TODO confirm).
        for entity in ('nbsp', 'quot'):
            body = body.replace('&' + entity + ';', entity.upper())

        if encoding:
            body = """<?xml version="1.0" encoding="%s"?>""" % encoding + body

        try:
            parse(StringIO.StringIO(body))
        except Exception:
            trace = StringIO.StringIO()
            traceback.print_exc(file=trace)
            # The listing shows the body as it was handed to the parser.
            return self._error_page("Wrong XML:", trace.getvalue(), body)
        return None

    def _error_page(self, title, details, body):
        """Build the HTML report: title, escaped details, numbered body lines."""
        # Escape first, then insert the <br /> tags, so that the tags
        # themselves are not escaped.
        parts = [title, '<br />',
                 cgi.escape(details).replace('\n', '<br />'),
                 '<br /><br />']
        for index, line in enumerate(body.splitlines()):
            # Non-breaking spaces keep the source indentation visible.
            text = cgi.escape(line).replace('\t', ' ').replace(' ', '&nbsp;')
            parts.append("%03d - %s<br />" % (index + 1, text))
        return ''.join(parts)
|---|
| 111 |
|
|---|