Statistics
| Revision:

root / tags / v1_0_2_Build_912 / extensions / extScripting / scripts / jython / Lib / htmllib.py @ 11422

History | View | Annotate | Download (10.4 KB)

"""HTML 2.0 parser.

See the HTML 2.0 specification:
http://www.w3.org/hypertext/WWW/MarkUp/html-spec/html-spec_toc.html
"""
6

    
7

    
8
from sgmllib import SGMLParser
9
from formatter import AS_IS
10

    
11
__all__ = ["HTMLParser"]
12

    
13
class HTMLParser(SGMLParser):
    """HTML 2.0 parser that renders a document through a formatter.

    Handlers are named after the tag they serve: ``start_TAG`` and
    ``end_TAG`` for container elements, ``do_TAG`` for elements without
    an end tag.  Text and layout events are forwarded to the formatter
    object given to the constructor.

    While parsing, the instance also records the document title
    (``title``), the <BASE> href (``base``), whether <ISINDEX> was seen
    (``isindex``) and the list of anchor hrefs (``anchorlist``).
    """

    # Entity table used when resolving entity references.
    from htmlentitydefs import entitydefs

    def __init__(self, formatter, verbose=0):
        """Initialise the parser.

        formatter -- object receiving data, font, margin and paragraph
                     calls produced while parsing
        verbose   -- passed unchanged to SGMLParser.__init__
        """
        SGMLParser.__init__(self, verbose)
        self.formatter = formatter
        self.savedata = None    # text captured since save_bgn(); None = off
        self.isindex = 0        # set to 1 by do_isindex()
        self.title = None       # set by end_title()
        self.base = None        # set by do_base() from the href attribute
        self.anchor = None      # href of the anchor currently open
        self.anchorlist = []    # every non-empty href passed to anchor_bgn()
        self.nofill = 0         # nesting depth of preformatted elements
        self.list_stack = []    # open lists, each as [kind, label, counter]

    # ------ Methods used internally; some may be overridden

    # --- Formatter interface, taking care of 'savedata' mode;
    # shouldn't need to be overridden

    def handle_data(self, data):
        """Send text to the save buffer if active, else to the formatter.

        Outside save mode the text is emitted as literal data when
        ``nofill`` is non-zero and as flowing data otherwise.
        """
        if self.savedata is not None:
            self.savedata = self.savedata + data
        elif self.nofill:
            self.formatter.add_literal_data(data)
        else:
            self.formatter.add_flowing_data(data)

    # --- Hooks to save data; shouldn't need to be overridden

    def save_bgn(self):
        """Start capturing text instead of sending it to the formatter."""
        self.savedata = ''

    def save_end(self):
        """Stop capturing and return the captured text.

        Unless ``nofill`` is set, runs of whitespace are collapsed to a
        single space and leading/trailing whitespace is removed.
        """
        data = self.savedata
        self.savedata = None
        if not self.nofill:
            data = ' '.join(data.split())
        return data

    # --- Hooks for anchors; should probably be overridden

    def anchor_bgn(self, href, name, type):
        """Called at the start of an anchor; records a non-empty href.

        ``name`` and ``type`` are accepted but not used here.
        """
        self.anchor = href
        if self.anchor:
            self.anchorlist.append(href)

    def anchor_end(self):
        """Called at the end of an anchor.

        If the anchor had an href, emit a footnote-style marker "[N]"
        where N is the number of hrefs collected so far.
        """
        if self.anchor:
            self.handle_data("[%d]" % len(self.anchorlist))
            self.anchor = None

    # --- Hook for images; should probably be overridden

    def handle_image(self, src, alt, *args):
        """Render an image by emitting its alternate text.

        ``src`` and the extra positional arguments are ignored here.
        """
        self.handle_data(alt)

    # --------- Top level elements

    def start_html(self, attrs): pass
    def end_html(self): pass

    def start_head(self, attrs): pass
    def end_head(self): pass

    def start_body(self, attrs): pass
    def end_body(self): pass

    # ------ Head elements

    def start_title(self, attrs):
        """Begin capturing the document title."""
        self.save_bgn()

    def end_title(self):
        """Store the captured title text in ``self.title``."""
        self.title = self.save_end()

    def do_base(self, attrs):
        """Remember the href attribute of <BASE> in ``self.base``."""
        for a, v in attrs:
            if a == 'href':
                self.base = v

    def do_isindex(self, attrs):
        """Note that the document contains <ISINDEX>."""
        self.isindex = 1

    def do_link(self, attrs):
        pass

    def do_meta(self, attrs):
        pass

    def do_nextid(self, attrs): # Deprecated
        pass

    # ------ Body elements

    # --- Headings

    def _start_heading(self, tag):
        """Open a heading: break the paragraph and push the heading font."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((tag, 0, 1, 0))

    def _end_heading(self):
        """Close a heading: break the paragraph and restore the font."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()

    def start_h1(self, attrs):
        self._start_heading('h1')

    def end_h1(self):
        self._end_heading()

    def start_h2(self, attrs):
        self._start_heading('h2')

    def end_h2(self):
        self._end_heading()

    def start_h3(self, attrs):
        self._start_heading('h3')

    def end_h3(self):
        self._end_heading()

    def start_h4(self, attrs):
        self._start_heading('h4')

    def end_h4(self):
        self._end_heading()

    def start_h5(self, attrs):
        self._start_heading('h5')

    def end_h5(self):
        self._end_heading()

    def start_h6(self, attrs):
        self._start_heading('h6')

    def end_h6(self):
        self._end_heading()

    # --- Block Structuring Elements

    def do_p(self, attrs):
        """Paragraph break."""
        self.formatter.end_paragraph(1)

    def start_pre(self, attrs):
        """Enter preformatted mode (one more level of ``nofill``)."""
        self.formatter.end_paragraph(1)
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
        self.nofill = self.nofill + 1

    def end_pre(self):
        """Leave one level of preformatted mode."""
        self.formatter.end_paragraph(1)
        self.formatter.pop_font()
        # max() keeps the depth counter from dropping below zero.
        self.nofill = max(0, self.nofill - 1)

    def start_xmp(self, attrs):
        self.start_pre(attrs)
        self.setliteral('xmp') # Tell SGML parser

    def end_xmp(self):
        self.end_pre()

    def start_listing(self, attrs):
        self.start_pre(attrs)
        self.setliteral('listing') # Tell SGML parser

    def end_listing(self):
        self.end_pre()

    def start_address(self, attrs):
        self.formatter.end_paragraph(0)
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))

    def end_address(self):
        self.formatter.end_paragraph(0)
        self.formatter.pop_font()

    def start_blockquote(self, attrs):
        self.formatter.end_paragraph(1)
        self.formatter.push_margin('blockquote')

    def end_blockquote(self):
        self.formatter.end_paragraph(1)
        self.formatter.pop_margin()

    # --- List Elements

    def start_ul(self, attrs):
        """Open an unordered list labelled with '*'."""
        # A blank line is requested only for an outermost list.
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ul')
        self.list_stack.append(['ul', '*', 0])

    def end_ul(self):
        """Close the innermost list and restore the margin."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def do_li(self, attrs):
        """Emit the label for the next item of the innermost list.

        Inside a list the list's counter is incremented first; outside
        any list a '*' label with counter 0 is used.
        """
        self.formatter.end_paragraph(0)
        if self.list_stack:
            top = self.list_stack[-1]
            label = top[1]
            counter = top[2] + 1
            top[2] = counter
        else:
            label, counter = '*', 0
        self.formatter.add_label_data(label, counter)

    def start_ol(self, attrs):
        """Open an ordered list.

        The label format defaults to '1.'; a ``type`` attribute replaces
        it, with '.' appended when the value is a single character.
        """
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.push_margin('ol')
        label = '1.'
        for a, v in attrs:
            if a == 'type':
                if len(v) == 1:
                    v = v + '.'
                label = v
        self.list_stack.append(['ol', label, 0])

    def end_ol(self):
        """Close the innermost list and restore the margin."""
        if self.list_stack:
            del self.list_stack[-1]
        self.formatter.end_paragraph(not self.list_stack)
        self.formatter.pop_margin()

    def start_menu(self, attrs):
        self.start_ul(attrs)

    def end_menu(self):
        self.end_ul()

    def start_dir(self, attrs):
        self.start_ul(attrs)

    def end_dir(self):
        self.end_ul()

    def start_dl(self, attrs):
        """Open a definition list."""
        self.formatter.end_paragraph(1)
        self.list_stack.append(['dl', '', 0])

    def end_dl(self):
        """Close a definition list, closing any open <DD> first."""
        self.ddpop(1)
        if self.list_stack:
            del self.list_stack[-1]

    def do_dt(self, attrs):
        """Definition term: close any open <DD>."""
        self.ddpop()

    def do_dd(self, attrs):
        """Definition description: close any open <DD>, then indent."""
        self.ddpop()
        self.formatter.push_margin('dd')
        self.list_stack.append(['dd', '', 0])

    def ddpop(self, bl=0):
        """End the paragraph and close the current <DD>, if one is open.

        bl -- blank-line argument handed to formatter.end_paragraph()
        """
        self.formatter.end_paragraph(bl)
        if self.list_stack and self.list_stack[-1][0] == 'dd':
            del self.list_stack[-1]
            self.formatter.pop_margin()

    # --- Phrase Markup

    # Idiomatic Elements

    def start_cite(self, attrs): self.start_i(attrs)
    def end_cite(self): self.end_i()

    def start_code(self, attrs): self.start_tt(attrs)
    def end_code(self): self.end_tt()

    def start_em(self, attrs): self.start_i(attrs)
    def end_em(self): self.end_i()

    def start_kbd(self, attrs): self.start_tt(attrs)
    def end_kbd(self): self.end_tt()

    def start_samp(self, attrs): self.start_tt(attrs)
    def end_samp(self): self.end_tt()

    def start_strong(self, attrs): self.start_b(attrs)
    def end_strong(self): self.end_b()

    def start_var(self, attrs): self.start_i(attrs)
    def end_var(self): self.end_i()

    # Typographic Elements

    def start_i(self, attrs):
        self.formatter.push_font((AS_IS, 1, AS_IS, AS_IS))
    def end_i(self):
        self.formatter.pop_font()

    def start_b(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, 1, AS_IS))
    def end_b(self):
        self.formatter.pop_font()

    def start_tt(self, attrs):
        self.formatter.push_font((AS_IS, AS_IS, AS_IS, 1))
    def end_tt(self):
        self.formatter.pop_font()

    def start_a(self, attrs):
        """Collect href/name/type from <A> and call anchor_bgn().

        Attribute values are stripped of surrounding whitespace; the
        type value is additionally lower-cased.  Missing attributes are
        passed as empty strings.
        """
        href = ''
        name = ''
        anchor_type = ''
        for attrname, value in attrs:
            value = value.strip()
            if attrname == 'href':
                href = value
            elif attrname == 'name':
                name = value
            elif attrname == 'type':
                anchor_type = value.lower()
        self.anchor_bgn(href, name, anchor_type)

    def end_a(self):
        self.anchor_end()

    # --- Line Break

    def do_br(self, attrs):
        self.formatter.add_line_break()

    # --- Horizontal Rule

    def do_hr(self, attrs):
        self.formatter.add_hor_rule()

    # --- Image

    def do_img(self, attrs):
        """Collect the <IMG> attributes and hand them to handle_image().

        Defaults: alt '(image)', width and height 0, everything else ''.
        Width and height are converted with int(); a value that cannot
        be converted leaves the default in place.
        """
        align = ''
        alt = '(image)'
        ismap = ''
        src = ''
        width = 0
        height = 0
        for attrname, value in attrs:
            if attrname == 'align':
                align = value
            elif attrname == 'alt':
                alt = value
            elif attrname == 'ismap':
                ismap = value
            elif attrname == 'src':
                src = value
            elif attrname == 'width':
                # Only conversion failures are ignored; a bare except
                # would also hide KeyboardInterrupt and SystemExit.
                try:
                    width = int(value)
                except (TypeError, ValueError):
                    pass
            elif attrname == 'height':
                try:
                    height = int(value)
                except (TypeError, ValueError):
                    pass
        self.handle_image(src, alt, ismap, align, width, height)

    # --- Really Old Unofficial Deprecated Stuff

    def do_plaintext(self, attrs):
        self.start_pre(attrs)
        self.setnomoretags() # Tell SGML parser

    # --- Unhandled tags

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass
384

    
385

    
386
def test(args = None):
387
    import sys, formatter
388

    
389
    if not args:
390
        args = sys.argv[1:]
391

    
392
    silent = args and args[0] == '-s'
393
    if silent:
394
        del args[0]
395

    
396
    if args:
397
        file = args[0]
398
    else:
399
        file = 'test.html'
400

    
401
    if file == '-':
402
        f = sys.stdin
403
    else:
404
        try:
405
            f = open(file, 'r')
406
        except IOError, msg:
407
            print file, ":", msg
408
            sys.exit(1)
409

    
410
    data = f.read()
411

    
412
    if f is not sys.stdin:
413
        f.close()
414

    
415
    if silent:
416
        f = formatter.NullFormatter()
417
    else:
418
        f = formatter.AbstractFormatter(formatter.DumbWriter())
419

    
420
    p = HTMLParser(f)
421
    p.feed(data)
422
    p.close()
423

    
424

    
425
# Script entry point: run the command-line demo.
if '__main__' == __name__:
    test()