root / tags / v1_0_2_Build_912 / extensions / extScripting / scripts / jython / Lib / htmllib.py @ 11422
History | View | Annotate | Download (10.4 KB)
1 |
"""HTML 2.0 parser.
|
---|---|
2 |
|
3 |
See the HTML 2.0 specification:
|
4 |
http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
|
5 |
"""
|
6 |
|
7 |
|
8 |
from sgmllib import SGMLParser |
9 |
from formatter import AS_IS |
10 |
|
11 |
# Public names exported by a star-import of this module.
__all__ = ["HTMLParser"]
|
12 |
|
13 |
class HTMLParser(SGMLParser):
    """HTML 2.0 parser that renders a document through a formatter.

    Tag handlers (start_xxx / end_xxx / do_xxx) translate markup into
    calls on the formatter object supplied to the constructor.

    Instance attributes set up by __init__:

    formatter  -- the formatter object every handler writes to
    savedata   -- None, or the text collected since save_bgn()
    isindex    -- set to 1 once an <isindex> tag has been seen
    title      -- text collected between <title> and </title>, or None
    base       -- href value of the most recent <base> tag, or None
    anchor     -- href of the anchor currently open, or None
    anchorlist -- hrefs of every anchor seen so far that had one
    nofill     -- nesting depth of literal (<pre>-like) blocks
    list_stack -- stack of [kind, label, counter] entries for open lists

    Font specifications pushed on the formatter are 4-tuples; judging by
    start_i / start_b / start_tt below the slots are
    (size, italic, bold, teletype), with AS_IS meaning "leave unchanged".
    """

    # Bind the standard entity table as the class attribute 'entitydefs'.
    from htmlentitydefs import entitydefs

    def __init__(self, formatter, verbose=0):
        """Create a parser that sends its output to *formatter*.

        verbose is passed through unchanged to SGMLParser.__init__.
        """
        SGMLParser.__init__(self, verbose)
        self.formatter = formatter
        self.savedata = None
        self.isindex = 0
        self.title = None
        self.base = None
        self.anchor = None
        self.anchorlist = []
        self.nofill = 0
        self.list_stack = []

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    # shouldn't need to be overridden

    def handle_data(self, data):
        """Dispatch character data.

        While save mode is active the text is appended to the save
        buffer; otherwise it goes to the formatter as literal data inside
        a nofill block and as flowing data everywhere else.
        """
        if self.savedata is not None:
            self.savedata = self.savedata + data
        else:
            if self.nofill:
                self.formatter.add_literal_data(data)
            else:
                self.formatter.add_flowing_data(data)

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Enter save mode: subsequent data is buffered, not formatted."""
        self.savedata = ''

    def save_end(self):
        """Leave save mode and return the buffered text.

        Outside a nofill block, runs of whitespace are collapsed to single
        spaces and leading/trailing whitespace is dropped.
        """
        data = self.savedata
        self.savedata = None
        if not self.nofill:
            data = ' '.join(data.split())
        return data

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """Record the start of an anchor; only *href* is used here."""
        self.anchor = href
        if self.anchor:
            self.anchorlist.append(href)

    def anchor_end(self):
        """Close the current anchor.

        If the anchor had an href, emit a "[N]" footnote marker where N is
        the number of hrefs collected so far.
        """
        if self.anchor:
            self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt, *args):
        """Render an image by emitting its *alt* text as ordinary data."""
        self.handle_data(alt)

    # --------- Top level elements

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    # ------ Head elements

    def start_title(self, attrs):
        """Begin collecting the document title."""
        self.save_bgn()

    def end_title(self):
        """Store the collected title text in self.title."""
        self.title = self.save_end()

    def do_base(self, attrs):
        """Remember the href attribute of a <base> tag in self.base."""
        for a, v in attrs:
            if a == 'href':
                self.base = v

    def do_isindex(self, attrs):
        """Flag that the document contains an <isindex> tag."""
        self.isindex = 1

    def do_link(self, attrs):
        """Ignore <link>."""
        pass

    def do_meta(self, attrs):
        """Ignore <meta>."""
        pass

    def do_nextid(self, attrs): # Deprecated
        """Ignore <nextid>."""
        pass

    # ------ Body elements

    # --- Headings
    #
    # Every heading level ends the current paragraph and pushes a font
    # whose first slot is the tag name and whose third slot is 1; the end
    # tag ends the paragraph again and restores the previous font.

    def start_h1(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h1', 0, 1, 0))

    def end_h1(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h2(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h2', 0, 1, 0))

    def end_h2(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h3(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h3', 0, 1, 0))

    def end_h3(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h4(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h4', 0, 1, 0))

    def end_h4(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h5(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h5', 0, 1, 0))

    def end_h5(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h6(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_font(('h6', 0, 1, 0))

    def end_h6(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        """End the current paragraph."""
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        """Open a literal block: new paragraph, fourth font slot on, nofill+1."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        self.nofill = self.nofill + 1

    def end_pre(self):
        """Close a literal block; nofill never drops below zero."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()
        self.nofill = max(0, self.nofill - 1)

    def start_xmp(self, attrs):
        """Treat <xmp> like <pre> and switch the SGML parser to literal mode."""
        self.start_pre(attrs)
        self.setliteral('xmp') # Tell SGML parser

    def end_xmp(self):
        self.end_pre()

    def start_listing(self, attrs):
        """Treat <listing> like <pre> and switch the SGML parser to literal mode."""
        self.start_pre(attrs)
        self.setliteral('listing') # Tell SGML parser

    def end_listing(self):
        self.end_pre()

    def start_address(self, attrs):
        """Open <address>: paragraph break, then the same font as start_i."""
        self.formatter.end_paragraph(0)
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_font()

    def start_blockquote(self, attrs):
        """Open <blockquote>: paragraph break plus a 'blockquote' margin."""
        self.formatter.end_paragraph(1)
        self.formatter.push_margin('blockquote')

    def end_blockquote(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_margin()

    # --- List Elements

    def start_ul(self, attrs):
        """Open an unordered list labelled with '*'.

        The paragraph break argument is true only for an outermost list
        (when no other list is open).
        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        """Close the innermost list entry and restore the margin."""
        if self.list_stack: del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def do_li(self, attrs):
        """Start a list item and emit its label.

        Inside a list the innermost entry's counter is incremented and
        passed along with its label; outside any list the label is '*'
        with counter 0.
        """
        self.formatter.end_paragraph(0)
        if self.list_stack:
            [dummy, label, counter] = top = self.list_stack[-1]
            # Store the incremented counter back into the stack entry.
            top[2] = counter = counter+1
        else:
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        """Open an ordered list.

        The label defaults to '1.'; a 'type' attribute overrides it, with
        a '.' appended when the value is a single character.
        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for a, v in attrs:
            if a == 'type':
                if len(v) == 1: v = v + '.'
                label = v
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        """Close the innermost list entry and restore the margin."""
        if self.list_stack: del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_menu(self, attrs):
        self.start_ul(attrs)

    def end_menu(self):
        self.end_ul()

    def start_dir(self, attrs):
        self.start_ul(attrs)

    def end_dir(self):
        self.end_ul()

    def start_dl(self, attrs):
        """Open a definition list."""
        self.formatter.end_paragraph(1)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        """Close any open <dd>, then drop the innermost list entry."""
        self.ddpop(1)
        if self.list_stack: del self.list_stack[-1]

    def do_dt(self, attrs):
        """Start a definition term, closing a preceding <dd> if open."""
        self.ddpop()

    def do_dd(self, attrs):
        """Start a definition: close a preceding <dd>, push a 'dd' margin."""
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self, bl=0):
        """End the paragraph and, if a 'dd' entry is on top, pop it.

        bl is forwarded to formatter.end_paragraph().
        """
        self.formatter.end_paragraph(bl)
        if self.list_stack:
            if self.list_stack[-1][0] == 'dd':
                del self.list_stack[-1]
                self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    def start_cite(self, attrs): self.start_i(attrs)
    def end_cite(self): self.end_i()

    def start_code(self, attrs): self.start_tt(attrs)
    def end_code(self): self.end_tt()

    def start_em(self, attrs): self.start_i(attrs)
    def end_em(self): self.end_i()

    def start_kbd(self, attrs): self.start_tt(attrs)
    def end_kbd(self): self.end_tt()

    def start_samp(self, attrs): self.start_tt(attrs)
    def end_samp(self): self.end_tt()

    def start_strong(self, attrs): self.start_b(attrs)
    def end_strong(self): self.end_b()

    def start_var(self, attrs): self.start_i(attrs)
    def end_var(self): self.end_i()

    # Typographic Elements

    def start_i(self, attrs):
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
    def end_i(self):
        self.formatter.pop_font()

    def start_b(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
    def end_b(self):
        self.formatter.pop_font()

    def start_tt(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
    def end_tt(self):
        self.formatter.pop_font()

    def start_a(self, attrs):
        """Extract href, name and type from an <a> tag and call anchor_bgn.

        Every attribute value is stripped of surrounding whitespace; the
        type value is additionally lower-cased.  Missing attributes are
        passed as empty strings.
        """
        href = ''
        name = ''
        # Named anchor_type so the builtin type() is not shadowed.
        anchor_type = ''
        for attrname, value in attrs:
            value = value.strip()
            if attrname == 'href':
                href = value
            if attrname == 'name':
                name = value
            if attrname == 'type':
                anchor_type = value.lower()
        self.anchor_bgn(href, name, anchor_type)

    def end_a(self):
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        self.formatter.add_hor_rule()

    # --- Image

    def do_img(self, attrs):
        """Collect <img> attributes and hand them to handle_image.

        alt defaults to '(image)', the other string attributes to '', and
        width/height to 0.  A width or height that is not a valid integer
        is ignored, leaving the default in place.
        """
        align = ''
        alt = '(image)'
        ismap = ''
        src = ''
        width = 0
        height = 0
        for attrname, value in attrs:
            if attrname == 'align':
                align = value
            if attrname == 'alt':
                alt = value
            if attrname == 'ismap':
                ismap = value
            if attrname == 'src':
                src = value
            if attrname == 'width':
                # Best effort: only conversion failures are swallowed.
                try: width = int(value)
                except (TypeError, ValueError): pass
            if attrname == 'height':
                try: height = int(value)
                except (TypeError, ValueError): pass
        self.handle_image(src, alt, ismap, align, width, height)

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        """Treat the rest of the input like <pre> with no further tags."""
        self.start_pre(attrs)
        self.setnomoretags() # Tell SGML parser

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        """Silently ignore start tags that have no handler."""
        pass

    def unknown_endtag(self, tag):
        """Silently ignore end tags that have no handler."""
        pass
|
384 |
|
385 |
|
386 |
def test(args = None): |
387 |
import sys, formatter |
388 |
|
389 |
if not args: |
390 |
args = sys.argv[1:]
|
391 |
|
392 |
silent = args and args[0] == '-s' |
393 |
if silent:
|
394 |
del args[0] |
395 |
|
396 |
if args:
|
397 |
file = args[0]
|
398 |
else:
|
399 |
file = 'test.html'
|
400 |
|
401 |
if file == '-': |
402 |
f = sys.stdin |
403 |
else:
|
404 |
try:
|
405 |
f = open(file, 'r') |
406 |
except IOError, msg: |
407 |
print file, ":", msg |
408 |
sys.exit(1)
|
409 |
|
410 |
data = f.read() |
411 |
|
412 |
if f is not sys.stdin: |
413 |
f.close() |
414 |
|
415 |
if silent:
|
416 |
f = formatter.NullFormatter() |
417 |
else:
|
418 |
f = formatter.AbstractFormatter(formatter.DumbWriter()) |
419 |
|
420 |
p = HTMLParser(f) |
421 |
p.feed(data) |
422 |
p.close() |
423 |
|
424 |
|
425 |
# Script entry point: run the self-test driver when executed directly.
if __name__ == "__main__":
    test()