Download Install Tutorial Docs FAQ Tools Wiki License Team IRC Planet Involvement Shop Book

root/tags/cherrypy-3.0.0/cherrypy/lib/tidy.py

Revision 1466 (checked in by fumanchu, 2 years ago)

Initial test_tidy, plus a bugfix in tidy.strict_xml and indent args.

  • Property svn:eol-style set to native
Line 
1 """Functions to run cherrypy.response through Tidy or NSGML."""
2
3 import cgi
4 import os
5 import StringIO
6 import traceback
7
8 import cherrypy
9    
def _write_file(path, data):
    """Write data to path in binary mode and close the file promptly."""
    f = open(path, 'wb')
    try:
        f.write(data)
    finally:
        f.close()


def _read_file(path):
    """Return the whole content of path, read in binary mode."""
    f = open(path, 'rb')
    try:
        return f.read()
    finally:
        f.close()


def _drop_content_length(response):
    """Delete the Content-Length header so finalize() recalcs it."""
    if response.headers.has_key("Content-Length"):
        del response.headers["Content-Length"]


def tidy(temp_dir, tidy_path, strict_xml=False, errors_to_ignore=None,
         indent=False, wrap=False, warnings=True):
    """Run cherrypy.response through Tidy.

    Only responses whose Content-Type is 'text/html' are processed; any
    other response is left untouched.

    If either 'indent' or 'wrap' are specified (and Tidy exits with status
    0), then response.body will be set to the output of tidy. Otherwise,
    only errors (including warnings, if warnings is True) will change the
    body: it is then replaced by a report built with wrong_content().

    temp_dir: directory in which 'page.html', 'tidy.out' and 'tidy.err'
        are written. The names are fixed, so concurrent calls sharing one
        temp_dir will overwrite each other's files.
    tidy_path: path to the Tidy executable.
    strict_xml: if true, pass '-xml' to Tidy and, when Tidy reports no
        errors, additionally parse the body with ElementTree.
    errors_to_ignore: optional list of substrings; a Tidy message
        containing any of them is not reported.
    indent: if true, pass '-indent' to Tidy.
    wrap: False for no wrap option, otherwise a value convertible to int
        that is passed to Tidy as '-wrap N'. Values that cannot be
        converted are ignored.
    warnings: if true, lines containing 'Warning' are reported as well as
        lines containing 'Error'.

    Note that we use the standalone Tidy tool rather than the python
    mxTidy module. This is because this module does not seem to be
    stable and it crashes on some HTML pages (which means that the
    server would also crash)
    """
    response = cherrypy.response

    # the tidy tool, by its very nature it's not generator friendly,
    # so we just collapse the body and work with it.
    orig_body = response.collapse_body()

    fct = response.headers.get('Content-Type', '')
    ct = fct.split(';')[0]
    encoding = ''
    i = fct.find('charset=')
    if i != -1:
        encoding = fct[i + 8:]

    if ct != 'text/html':
        return

    page_file = os.path.join(temp_dir, 'page.html')
    out_file = os.path.join(temp_dir, 'tidy.out')
    err_file = os.path.join(temp_dir, 'tidy.err')
    _write_file(page_file, orig_body)

    # The charset is handed to Tidy as an option with the dashes removed
    # (for example 'utf-8' becomes '-utf8').
    tidy_enc = encoding.replace('-', '')
    if tidy_enc:
        tidy_enc = '-' + tidy_enc

    # Keep the command-line fragments apart from the caller's arguments
    # so that strict_xml/indent/wrap keep their original meaning below.
    xml_flag = ''
    if strict_xml:
        xml_flag = ' -xml'

    indent_flag = ''
    if indent:
        indent_flag = ' -indent'

    wrap_flag = ''
    if wrap is not False:
        try:
            wrap_flag = ' -wrap %d' % int(wrap)
        except (TypeError, ValueError):
            # Not a usable column number: run Tidy without -wrap.
            wrap_flag = ''

    # NOTE(review): the command is a shell string built from the
    # arguments and the response charset; none of it is escaped.
    result = os.system('"%s" %s%s%s%s -f %s -o %s %s' %
                       (tidy_path, tidy_enc, xml_flag, indent_flag,
                        wrap_flag, err_file, out_file, page_file))
    use_output = bool(indent_flag or wrap_flag) and not result
    if use_output:
        output = _read_file(out_file)

    new_errs = []
    for err in _read_file(err_file).splitlines():
        is_error = err.find('Error') != -1
        is_warning = warnings and err.find('Warning') != -1
        if not (is_error or is_warning):
            continue
        for err_ign in errors_to_ignore or []:
            if err.find(err_ign) != -1:
                break
        else:
            new_errs.append(err)

    if new_errs:
        # wrong_content() HTML-escapes its header and turns newlines into
        # <br />, so join with newlines rather than with markup.
        response.body = wrong_content('\n'.join(new_errs), orig_body)
        _drop_content_length(response)
        return

    if strict_xml:
        # The HTML is OK, but is it valid XML?
        # Use elementtree to parse XML
        try:
            from elementtree.ElementTree import parse
        except ImportError:
            from xml.etree.ElementTree import parse

        # Replace these entity references with plain text before parsing.
        for tag in ['nbsp', 'quot']:
            orig_body = orig_body.replace('&' + tag + ';', tag.upper())

        if encoding:
            enctag = '<?xml version="1.0" encoding="%s"?>' % encoding
            orig_body = enctag + orig_body

        try:
            parse(StringIO.StringIO(orig_body))
        except Exception:
            # Wrong XML: report the traceback. wrong_content() converts
            # its newlines into <br /> elements.
            tb_file = StringIO.StringIO()
            traceback.print_exc(file=tb_file)
            response.body = wrong_content(tb_file.getvalue(), orig_body,
                                          "XML")
            _drop_content_length(response)
            return

    if use_output:
        response.body = [output]
        _drop_content_length(response)
117
def html_space(text):
    """Escape text, replacing space with nbsp and tab with 4 nbsp's.

    '&', '<' and '>' are replaced by their HTML entities first ('&' must
    go first so the entities themselves are not escaped again); quotes
    are left alone. The escaping is done inline rather than through
    cgi.escape, which newer Python versions no longer provide.
    """
    escaped = text.replace('&', '&amp;')
    escaped = escaped.replace('<', '&lt;').replace('>', '&gt;')
    # Expand tabs before converting spaces so each tab yields 4 nbsp's.
    return escaped.replace('\t', '    ').replace(' ', '&nbsp;')
121
def html_break(text):
    """Escape text, replacing newline with HTML br element.

    '&', '<' and '>' are replaced by their HTML entities first ('&' must
    go first so the entities themselves are not escaped again); quotes
    are left alone. The escaping is done inline rather than through
    cgi.escape, which newer Python versions no longer provide.
    """
    escaped = text.replace('&', '&amp;')
    escaped = escaped.replace('<', '&lt;').replace('>', '&gt;')
    return escaped.replace('\n', '<br />')
125
def wrong_content(header, body, content_type="HTML"):
    """Build an HTML report for a response that failed validation.

    The report starts with a 'Wrong <content_type>:' banner followed by
    the header text (escaped with html_break), then lists every line of
    body prefixed with its 1-based, zero-padded line number (escaped with
    html_space). All pieces are joined with <br /> elements.
    """
    banner = "Wrong %s:<br />%s<br />" % (content_type, html_break(header))
    numbered = ["%03d - %s" % (index + 1, html_space(source_line))
                for index, source_line in enumerate(body.splitlines())]
    return "<br />".join([banner] + numbered)
131
132
def nsgmls(temp_dir, nsgmls_path, catalog_path, errors_to_ignore=None):
    """Run cherrypy.response through nsgmls.

    Only responses whose Content-Type is 'text/html' are processed; any
    other response is left untouched. If nsgmls writes any message that
    is not ignored, response.body is replaced by a report built with
    wrong_content().

    temp_dir: directory in which 'page.html' and 'nsgmls.err' are
        written. The names are fixed, so concurrent calls sharing one
        temp_dir will overwrite each other's files.
    nsgmls_path: path to the nsgmls executable.
    catalog_path: catalog passed to nsgmls with its -c option.
    errors_to_ignore: optional list of substrings; an nsgmls message
        containing any of them is not reported.
    """
    response = cherrypy.response

    # nsgmls is given the page as a file, so it is not generator
    # friendly: we just collapse the body and work with it.
    orig_body = response.collapse_body()

    fct = response.headers.get('Content-Type', '')
    ct = fct.split(';')[0]
    if ct != 'text/html':
        return

    # Remove bits of Javascript (nsgmls doesn't seem to handle
    #   them correctly (for instance, if <a appears in your
    #   Javascript code nsgmls complains about it)
    while True:
        i = orig_body.find('<script')
        if i == -1:
            break
        j = orig_body.find('</script>', i)
        if j == -1:
            # Unterminated script element: leave the rest as it is.
            break
        orig_body = orig_body[:i] + orig_body[j + 9:]

    page_file = os.path.join(temp_dir, 'page.html')
    f = open(page_file, 'wb')
    try:
        f.write(orig_body)
    finally:
        f.close()

    err_file = os.path.join(temp_dir, 'nsgmls.err')
    # NOTE(review): the command is a shell string built from the
    # arguments; none of it is escaped.
    command = ('%s -c%s -f%s -s -E10 %s' %
               (nsgmls_path, catalog_path, err_file, page_file))
    # Backslashes are turned into forward slashes throughout the command.
    command = command.replace('\\', '/')
    os.system(command)

    f = open(err_file, 'rb')
    try:
        errs = f.read()
    finally:
        f.close()

    new_errs = []
    for err in errs.splitlines():
        for err_ign in errors_to_ignore or []:
            if err.find(err_ign) != -1:
                break
        else:
            new_errs.append(err)

    if new_errs:
        # wrong_content() HTML-escapes its header and turns newlines into
        # <br />, so join with newlines rather than with markup.
        response.body = wrong_content('\n'.join(new_errs), orig_body)
        if response.headers.has_key("Content-Length"):
            # Delete Content-Length header so finalize() recalcs it.
            del response.headers["Content-Length"]
184
Note: See TracBrowser for help on using the browser.

Hosted by WebFaction

Log in as guest/cpguest to create tickets