Statistics
| Revision:

gvsig-scripting / org.gvsig.scripting / trunk / org.gvsig.scripting / org.gvsig.scripting.app / org.gvsig.scripting.app.mainplugin / src / main / resources-plugin / scripting / lib / BeautifulSoupTests.py @ 475

History | View | Annotate | Download (36.7 KB)

1
# -*- coding: utf-8 -*-
2
"""Unit tests for Beautiful Soup.
3

4
These tests make sure the Beautiful Soup works as it should. If you
5
find a bug in Beautiful Soup, the best way to express it is as a test
6
case like this that fails."""
7

    
8
import unittest
9
from BeautifulSoup import *
10

    
11
class SoupTest(unittest.TestCase):
    """Base class that gives the soup tests a parse-and-compare helper."""

    def assertSoupEquals(self, toParse, rep=None, c=BeautifulSoup):
        """Parse the given text and make sure its string rep is the other
        given text.

        toParse -- markup handed to the parser class.
        rep -- expected str() of the parsed tree; when omitted, the parsed
               tree is expected to render back to toParse itself.
        c -- parser class to instantiate (BeautifulSoup by default).
        """
        # Identity check: only a missing expectation falls back to the input.
        if rep is None:
            rep = toParse
        self.assertEqual(str(c(toParse)), rep)
19

    
20

    
21
class FollowThatTag(SoupTest):

    "Tests the various ways of fetching tags from a soup."

    def setUp(self):
        # Deliberately sloppy markup: a mixed-case tag name, mismatched
        # closing tags and an unquoted attribute value.
        ml = """
        <a id="x">1</a>
        <A id="a">2</a>
        <b id="b">3</a>
        <b href="foo" id="x">4</a>
        <ac width=100>4</ac>"""
        self.soup = BeautifulStoneSoup(ml)

    def testFindAllByName(self):
        # Calling the soup, findAll() and a SoupStrainer must all agree.
        matching = self.soup('a')
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')
        self.assertEqual(matching, self.soup.findAll('a'))
        self.assertEqual(matching, self.soup.findAll(SoupStrainer('a')))

    def testFindAllByAttribute(self):
        matching = self.soup.findAll(id='x')
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')
        self.assertEqual(matching[1].name, 'b')

        # Keyword form and attrs-dict form are interchangeable.
        matching2 = self.soup.findAll(attrs={'id' : 'x'})
        self.assertEqual(matching, matching2)

        strainer = SoupStrainer(attrs={'id' : 'x'})
        self.assertEqual(matching, self.soup.findAll(strainer))

        # None matches tags that lack the attribute.
        self.assertEqual(len(self.soup.findAll(id=None)), 1)

        self.assertEqual(len(self.soup.findAll(width=100)), 1)
        self.assertEqual(len(self.soup.findAll(junk=None)), 5)
        self.assertEqual(len(self.soup.findAll(junk=[1, None])), 5)

        self.assertEqual(len(self.soup.findAll(junk=re.compile('.*'))), 0)
        self.assertEqual(len(self.soup.findAll(junk=True)), 0)

        # True matches tags that carry the attribute, whatever its value.
        self.assertEqual(len(self.soup.findAll(junk=True)), 0)
        self.assertEqual(len(self.soup.findAll(href=True)), 1)

    def testFindallByClass(self):
        # A bare string passed as attrs is matched against the class value.
        soup = BeautifulSoup('<b class="foo">Foo</b><a class="1 23 4">Bar</a>')
        self.assertEqual(soup.find(attrs='foo').string, "Foo")
        self.assertEqual(soup.find('a', '1').string, "Bar")
        self.assertEqual(soup.find('a', '23').string, "Bar")
        self.assertEqual(soup.find('a', '4').string, "Bar")

        # '2' is only a substring of the class token '23'; it must not match.
        self.assertEqual(soup.find('a', '2'), None)

    def testFindAllByList(self):
        matching = self.soup(['a', 'ac'])
        self.assertEqual(len(matching), 3)

    def testFindAllByHash(self):
        matching = self.soup({'a' : True, 'b' : True})
        self.assertEqual(len(matching), 4)

    def testFindAllText(self):
        soup = BeautifulSoup("<html>\xbb</html>")
        self.assertEqual(soup.findAll(text=re.compile('.*')),
                         [u'\xbb'])

    def testFindAllByRE(self):
        import re
        r = re.compile('a.*')
        self.assertEqual(len(self.soup(r)), 3)

    def testFindAllByMethod(self):
        def matchTagWhereIDMatchesName(tag):
            return tag.name == tag.get('id')

        matching = self.soup.findAll(matchTagWhereIDMatchesName)
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')

    def testFindByIndex(self):
        """For when you have the tag and you want to know where it is."""
        tag = self.soup.find('a', id="a")
        self.assertEqual(self.soup.index(tag), 3)

        # It works for NavigableStrings as well.
        s = tag.string
        self.assertEqual(tag.index(s), 0)

        # If the tag isn't present, a ValueError is raised.
        soup2 = BeautifulSoup("<b></b>")
        tag2 = soup2.find('b')
        self.assertRaises(ValueError, self.soup.index, tag2)

    def testConflictingFindArguments(self):
        """The 'text' argument takes precedence."""
        soup = BeautifulSoup('Foo<b>Bar</b>Baz')
        self.assertEqual(soup.find('b', text='Baz'), 'Baz')
        self.assertEqual(soup.findAll('b', text='Baz'), ['Baz'])

        self.assertEqual(soup.find(True, text='Baz'), 'Baz')
        self.assertEqual(soup.findAll(True, text='Baz'), ['Baz'])

    def testParents(self):
        soup = BeautifulSoup('<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah')
        b = soup.b
        self.assertEqual(len(b.findParents('ul', {'id' : 'foo'})), 2)
        self.assertEqual(b.findParent('ul')['a'], 'b')

    # Shared fixture for testNext / testPrevious; the tests only read it.
    PROXIMITY_TEST = BeautifulSoup('<b id="1"><b id="2"><b id="3"><b id="4">')

    def testNext(self):
        soup = self.PROXIMITY_TEST
        b = soup.find('b', {'id' : 2})
        self.assertEqual(b.findNext('b')['id'], '3')
        self.assertEqual(b.findNext('b')['id'], '3')
        self.assertEqual(len(b.findAllNext('b')), 2)
        self.assertEqual(len(b.findAllNext('b', {'id' : 4})), 1)

    def testPrevious(self):
        soup = self.PROXIMITY_TEST
        b = soup.find('b', {'id' : 3})
        self.assertEqual(b.findPrevious('b')['id'], '2')
        self.assertEqual(b.findPrevious('b')['id'], '2')
        self.assertEqual(len(b.findAllPrevious('b')), 2)
        self.assertEqual(len(b.findAllPrevious('b', {'id' : 2})), 1)

    # Shared fixture for the sibling tests; the tests only read it.
    SIBLING_TEST = BeautifulSoup('<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote><blockquote id="2"><blockquote id="2.1"></blockquote></blockquote><blockquote id="3"><blockquote id="3.1"></blockquote></blockquote><blockquote id="4">')

    def testNextSibling(self):
        soup = self.SIBLING_TEST
        tag = 'blockquote'
        b = soup.find(tag, {'id' : 2})
        # findNext descends into children; findNextSibling stays level.
        self.assertEqual(b.findNext(tag)['id'], '2.1')
        self.assertEqual(b.findNextSibling(tag)['id'], '3')
        self.assertEqual(b.findNextSibling(tag)['id'], '3')
        self.assertEqual(len(b.findNextSiblings(tag)), 2)
        self.assertEqual(len(b.findNextSiblings(tag, {'id' : 4})), 1)

    def testPreviousSibling(self):
        soup = self.SIBLING_TEST
        tag = 'blockquote'
        b = soup.find(tag, {'id' : 3})
        self.assertEqual(b.findPrevious(tag)['id'], '2.1')
        self.assertEqual(b.findPreviousSibling(tag)['id'], '2')
        self.assertEqual(b.findPreviousSibling(tag)['id'], '2')
        self.assertEqual(len(b.findPreviousSiblings(tag)), 2)
        self.assertEqual(len(b.findPreviousSiblings(tag, id=1)), 1)

    def testTextNavigation(self):
        soup = BeautifulSoup('Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh')
        baz = soup.find(text='Baz')
        self.assertEqual(baz.findParent("i")['id'], '1')
        self.assertEqual(baz.findNext(text='Blee'), 'Blee')
        self.assertEqual(baz.findNextSibling(text='Blee'), 'Blee')
        # 'Blargh' lives outside the enclosing <b>, so it is no sibling.
        self.assertEqual(baz.findNextSibling(text='Blargh'), None)
        self.assertEqual(baz.findNextSibling('hr')['id'], '1')
178

    
179
class SiblingRivalry(SoupTest):
    "Tests the nextSibling and previousSibling navigation."

    def testSiblings(self):
        # Unclosed <li> tags: each new <li> is expected to end the previous.
        soup = BeautifulSoup("<ul><li>1<p>A</p>B<li>2<li>3</ul>")
        secondLI = soup.find('li').nextSibling
        self.assertTrue(secondLI.name == 'li' and secondLI.string == '2')
        self.assertEqual(soup.find(text='1').nextSibling.name, 'p')
        self.assertEqual(soup.find('p').nextSibling, 'B')
        # Going back one sibling and forward again lands on the same node.
        self.assertEqual(soup.find('p').nextSibling.previousSibling.nextSibling, 'B')
189

    
190
class TagsAreObjectsToo(SoupTest):
    "Tests the various built-in functions of Tag objects."

    def testLen(self):
        # len() counts direct children: text '1', the <b> tag, text '3'.
        soup = BeautifulSoup("<top>1<b>2</b>3</top>")
        self.assertEqual(len(soup.top), 3)
196

    
197
class StringEmUp(SoupTest):
    "Tests the use of 'string' as an alias for a tag's only content."

    def testString(self):
        s = BeautifulSoup("<b>foo</b>")
        self.assertEqual(s.b.string, 'foo')

    def testLackOfString(self):
        # A tag with more than one child has no single 'string'.
        s = BeautifulSoup("<b>f<i>e</i>o</b>")
        self.assertTrue(not s.b.string)

    def testStringAssign(self):
        # Assigning a plain str must store it as a NavigableString.
        s = BeautifulSoup("<b></b>")
        b = s.b
        b.string = "foo"
        string = b.string
        self.assertEqual(string, "foo")
        self.assertTrue(isinstance(string, NavigableString))
215

    
216
class AllText(SoupTest):
    "Tests the use of 'text' to get all of string content from the tag."

    def testText(self):
        soup = BeautifulSoup("<ul><li>spam</li><li>eggs</li><li>cheese</li>")
        self.assertEqual(soup.ul.text, "spameggscheese")
        # getText() accepts a separator placed between the text pieces.
        self.assertEqual(soup.ul.getText('/'), "spam/eggs/cheese")
223

    
224
class ThatsMyLimit(SoupTest):
    "Tests the limit argument."

    def testBasicLimits(self):
        s = BeautifulSoup('<br id="1" /><br id="1" /><br id="1" /><br id="1" />')
        self.assertEqual(len(s.findAll('br')), 4)
        # limit caps the result count for findAll() and the call shortcut.
        self.assertEqual(len(s.findAll('br', limit=2)), 2)
        self.assertEqual(len(s('br', limit=2)), 2)
232

    
233
class OnlyTheLonely(SoupTest):
    "Tests the parseOnlyThese argument to the constructor."

    def setUp(self):
        # Build five <a> tags (ids 1..5), each wrapping three <b> tags
        # whose ids and text are "<i>.100" .. "<i>.102": 15 <b> tags total.
        x = []
        for i in range(1, 6):
            x.append('<a id="%s">' % i)
            for j in range(100, 103):
                x.append('<b id="%s.%s">Content %s.%s</b>' % (i, j, i, j))
            x.append('</a>')
        self.x = ''.join(x)

    def testOnly(self):
        # Strain by tag name: every <b> survives.
        strainer = SoupStrainer("b")
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEqual(len(soup), 15)

        # Strain by attribute regexp.
        strainer = SoupStrainer(id=re.compile("100.*"))
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEqual(len(soup), 5)

        # Strain by text regexp.
        strainer = SoupStrainer(text=re.compile("10[01].*"))
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEqual(len(soup), 10)

        # Strain by text predicate. Written as a plain one-argument lambda:
        # the parenthesised-parameter spelling is Python-2-only syntax.
        strainer = SoupStrainer(text=lambda x: x[8] == '3')
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEqual(len(soup), 3)
260

    
261
class PickleMeThis(SoupTest):
    "Testing features like pickle and deepcopy."

    def setUp(self):
        # A small but complete HTML page is parsed once per test and the
        # resulting tree is what gets pickled / copied.
        self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
<link rev="made" href="mailto:leonardr@segfault.org">
<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
<meta name="author" content="Leonard Richardson">
</head>
<body>
<a href="foo">foo</a>
<a href="foo"><b>bar</b></a>
</body>
</html>"""

        self.soup = BeautifulSoup(self.page)

    def testPickle(self):
        # Round-trip through pickle protocol 2; class and rendering survive.
        import pickle
        restored = pickle.loads(pickle.dumps(self.soup, 2))
        self.assertEqual(restored.__class__, BeautifulSoup)
        self.assertEqual(str(restored), str(self.soup))

    def testDeepcopy(self):
        # A deep copy must render exactly like the tree it was copied from.
        from copy import deepcopy
        duplicate = deepcopy(self.soup)
        self.assertEqual(str(duplicate), str(self.soup))

    def testUnicodePickle(self):
        # Same round-trip with a non-ASCII character, via the C pickler.
        import cPickle as pickle
        markup = "".join(["<b>", chr(0xc3), "</b>"])
        tree = BeautifulSoup(markup)
        blob = pickle.dumps(tree, pickle.HIGHEST_PROTOCOL)
        restored = pickle.loads(blob)
        self.assertEqual(str(restored), str(tree))
303

    
304

    
305
class WriteOnlyCode(SoupTest):
    "Testing the modification of the tree."

    def testModifyAttributes(self):
        # Set, delete and add attributes; the rendering must follow.
        soup = BeautifulSoup('<a id="1"></a>')
        soup.a['id'] = 2
        self.assertEqual(soup.renderContents(), '<a id="2"></a>')
        del(soup.a['id'])
        self.assertEqual(soup.renderContents(), '<a></a>')
        soup.a['id2'] = 'foo'
        self.assertEqual(soup.renderContents(), '<a id2="foo"></a>')

    def testNewTagCreation(self):
        "Makes sure tags don't step on each others' toes."
        soup = BeautifulSoup()
        a = Tag(soup, 'a')
        ol = Tag(soup, 'ol')
        a['href'] = 'http://foo.com/'
        # The attribute set on <a> must not show up on the unrelated <ol>.
        self.assertRaises(KeyError, lambda : ol['href'])

    def testNewTagWithAttributes(self):
        """Makes sure new tags can be created complete with attributes."""
        soup = BeautifulSoup()
        # Attributes may be given as a list of pairs or as a dict.
        a = Tag(soup, 'a', [('href', 'foo')])
        b = Tag(soup, 'b', {'class':'bar'})
        soup.insert(0,a)
        soup.insert(1,b)
        self.assertEqual(soup.a['href'], 'foo')
        self.assertEqual(soup.b['class'], 'bar')

    def testTagReplacement(self):
        # Make sure you can replace an element with itself.
        text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
        soup = BeautifulSoup(text)
        c = soup.c
        soup.c.replaceWith(c)
        self.assertEqual(str(soup), text)

        # A very simple case
        soup = BeautifulSoup("<b>Argh!</b>")
        soup.find(text="Argh!").replaceWith("Hooray!")
        newText = soup.find(text="Hooray!")
        b = soup.b
        self.assertEqual(newText.previous, b)
        self.assertEqual(newText.parent, b)
        self.assertEqual(newText.previous.next, newText)
        self.assertEqual(newText.next, None)

        # A more complex case
        soup = BeautifulSoup("<a><b>Argh!</b><c></c><d></d></a>")
        soup.b.insert(1, "Hooray!")
        newText = soup.find(text="Hooray!")
        self.assertEqual(newText.previous, "Argh!")
        self.assertEqual(newText.previous.next, newText)

        self.assertEqual(newText.previousSibling, "Argh!")
        self.assertEqual(newText.previousSibling.nextSibling, newText)

        self.assertEqual(newText.nextSibling, None)
        self.assertEqual(newText.next, soup.c)

        # Replacing one tag with another that is already in the tree
        # moves the replacement out of its old position.
        text = "<html>There's <b>no</b> business like <b>show</b> business</html>"
        soup = BeautifulSoup(text)
        no, show = soup.findAll('b')
        show.replaceWith(no)
        self.assertEqual(str(soup), "<html>There's  business like <b>no</b> business</html>")

        # Even more complex
        soup = BeautifulSoup("<a><b>Find</b><c>lady!</c><d></d></a>")
        tag = Tag(soup, 'magictag')
        tag.insert(0, "the")
        soup.a.insert(1, tag)

        b = soup.b
        c = soup.c
        theText = tag.find(text=True)
        findText = b.find(text="Find")

        self.assertEqual(findText.next, tag)
        self.assertEqual(tag.previous, findText)
        self.assertEqual(b.nextSibling, tag)
        self.assertEqual(tag.previousSibling, b)
        self.assertEqual(tag.nextSibling, c)
        self.assertEqual(c.previousSibling, tag)

        self.assertEqual(theText.next, c)
        self.assertEqual(c.previous, theText)

        # Aand... incredibly complex.
        soup = BeautifulSoup("""<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")
        # Grab references before the tree is rearranged.
        f = soup.f
        a = soup.a
        weText = a.find(text="We")
        soup.b.replaceWith(soup.f)
        self.assertEqual(str(soup), "<a>We<f>refuse</f></a><e>to<g>service</g></e>")

        self.assertEqual(f.previous, weText)
        self.assertEqual(weText.next, f)
        self.assertEqual(f.previousSibling, weText)
        self.assertEqual(f.nextSibling, None)
        self.assertEqual(weText.nextSibling, f)

    def testReplaceWithChildren(self):
        soup = BeautifulStoneSoup(
            "<top><replace><child1/><child2/></replace></top>",
            selfClosingTags=["child1", "child2"])
        # .replaceTag is the "fooTag" spelling for the <replace> element.
        soup.replaceTag.replaceWithChildren()
        self.assertEqual(soup.top.contents[0].name, "child1")
        self.assertEqual(soup.top.contents[1].name, "child2")

    def testAppend(self):
        # Appending a tag that already lives in the tree moves it.
        doc = "<p>Don't leave me <b>here</b>.</p> <p>Don't leave me.</p>"
        soup = BeautifulSoup(doc)
        second_para = soup('p')[1]
        bold = soup.find('b')
        soup('p')[1].append(soup.find('b'))
        self.assertEqual(bold.parent, second_para)
        self.assertEqual(str(soup),
                         "<p>Don't leave me .</p> "
                         "<p>Don't leave me.<b>here</b></p>")

    def testTagExtraction(self):
        # A very simple case
        text = '<html><div id="nav">Nav crap</div>Real content here.</html>'
        soup = BeautifulSoup(text)
        extracted = soup.find("div", id="nav").extract()
        self.assertEqual(str(soup), "<html>Real content here.</html>")
        self.assertEqual(str(extracted), '<div id="nav">Nav crap</div>')

        # A simple case, a more complex test.
        text = "<doc><a>1<b>2</b></a><a>i<b>ii</b></a><a>A<b>B</b></a></doc>"
        soup = BeautifulStoneSoup(text)
        doc = soup.doc
        numbers, roman, letters = soup("a")

        # Navigation links before the extraction.
        self.assertEqual(roman.parent, doc)
        oldPrevious = roman.previous
        endOfThisTag = roman.nextSibling.previous
        self.assertEqual(oldPrevious, "2")
        self.assertEqual(roman.next, "i")
        self.assertEqual(endOfThisTag, "ii")
        self.assertEqual(roman.previousSibling, numbers)
        self.assertEqual(roman.nextSibling, letters)

        # After extraction the subtree is detached and its old
        # neighbours are linked to each other.
        roman.extract()
        self.assertEqual(roman.parent, None)
        self.assertEqual(roman.previous, None)
        self.assertEqual(roman.next, "i")
        self.assertEqual(letters.previous, '2')
        self.assertEqual(roman.previousSibling, None)
        self.assertEqual(roman.nextSibling, None)
        self.assertEqual(endOfThisTag.next, None)
        self.assertEqual(roman.b.contents[0].next, None)
        self.assertEqual(numbers.nextSibling, letters)
        self.assertEqual(letters.previousSibling, numbers)
        self.assertEqual(len(doc.contents), 2)
        self.assertEqual(doc.contents[0], numbers)
        self.assertEqual(doc.contents[1], letters)

        # A more complex case.
        text = "<a>1<b>2<c>Hollywood, baby!</c></b></a>3"
        soup = BeautifulStoneSoup(text)
        one = soup.find(text="1")
        three = soup.find(text="3")
        soup.b.extract()
        self.assertEqual(one.next, three)
        self.assertEqual(three.previous, one)
        self.assertEqual(one.parent.nextSibling, three)
        self.assertEqual(three.previousSibling, soup.a)

    def testClear(self):
        soup = BeautifulSoup("<ul><li></li><li></li></ul>")
        soup.ul.clear()
        self.assertEqual(len(soup.ul.contents), 0)
482

    
483
class TheManWithoutAttributes(SoupTest):
    "Test attribute access"

    def testHasKey(self):
        # has_key() here is the Tag method, called on the parsed <foo> tag.
        text = "<foo attr='bar'>"
        self.assertEqual(BeautifulSoup(text).foo.has_key('attr'), True)
489

    
490
class QuoteMeOnThat(SoupTest):
    "Test quoting"

    def testQuotedAttributeValues(self):
        # Single-quoted values are rendered with double quotes...
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

        # ...unless the value itself contains double quotes.
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = BeautifulSoup(text)
        self.assertEqual(soup.renderContents(), text)

        # A value holding both quote kinds: the expected output below
        # shows the single quote rendered as "&squot;".
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        newText = """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>"""
        self.assertSoupEquals(soup.renderContents(), newText)

        self.assertSoupEquals('<this is="really messed up & stuff">',
                              '<this is="really messed up &amp; stuff"></this>')

        # This is not what the original author had in mind, but it's
        # a legitimate interpretation of what they wrote.
        self.assertSoupEquals("""<a href="foo</a>, </a><a href="bar">baz</a>""",
        '<a href="foo&lt;/a&gt;, &lt;/a&gt;&lt;a href="></a>, <a href="bar">baz</a>')

        # SGMLParser generates bogus parse events when attribute values
        # contain embedded brackets, but at least Beautiful Soup fixes
        # it up a little.
        self.assertSoupEquals('<a b="<a>">', '<a b="&lt;a&gt;"></a><a>"&gt;</a>')
        self.assertSoupEquals('<a href="http://foo.com/<a> and blah and blah',
                              """<a href='"http://foo.com/'></a><a> and blah and blah</a>""")
518

    
519

    
520

    
521
class YoureSoLiteral(SoupTest):
    "Test literal mode."

    def testLiteralMode(self):
        # The '<' inside the script body must survive as plain text.
        markup = "<script>if (i<imgs.length)</script><b>Foo</b>"
        parsed = BeautifulSoup(markup)
        script_body = parsed.script.contents[0]
        self.assertEqual(script_body, "if (i<imgs.length)")
        bold_body = parsed.b.contents[0]
        self.assertEqual(bold_body, "Foo")

    def testTextArea(self):
        # Markup inside <textarea> is kept as one literal string child.
        markup = "<textarea><b>This is an example of an HTML tag</b><&<&</textarea>"
        parsed = BeautifulSoup(markup)
        first_child = parsed.textarea.contents[0]
        self.assertEqual(first_child,
                         "<b>This is an example of an HTML tag</b><&<&")
534

    
535
class OperatorOverload(SoupTest):
    "Our operators do it all! Call now!"

    def testTagNameAsFind(self):
        "Tests that referencing a tag name as a member delegates to find()."
        soup = BeautifulSoup('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
        self.assertEqual(soup.b.i, soup.find('b').find('i'))
        self.assertEqual(soup.b.i.string, 'bar')
        self.assertEqual(soup.b['id'], '1')
        self.assertEqual(soup.b.contents[0], 'foo')
        # A tag name that does not occur yields a false value.
        self.assertTrue(not soup.a)

        #Test the .fooTag variant of .foo.
        self.assertEqual(soup.bTag.iTag.string, 'bar')
        self.assertEqual(soup.b.iTag.string, 'bar')
        self.assertEqual(soup.find('b').find('i'), soup.bTag.iTag)
551

    
552
class NestableEgg(SoupTest):
    """Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!"""

    def testParaInsideBlockquote(self):
        # The </blockquote> closes the open <p> and <b> inside it, so the
        # second <p> ends up at the top level.
        soup = BeautifulSoup('<blockquote><p><b>Foo</blockquote><p>Bar')
        self.assertEqual(soup.blockquote.p.b.string, 'Foo')
        self.assertEqual(soup.blockquote.b.string, 'Foo')
        self.assertEqual(soup.find('p', recursive=False).string, 'Bar')

    def testNestedTables(self):
        text = """<table id="1"><tr><td>Here's another table:
        <table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>"""
        soup = BeautifulSoup(text)
        self.assertEqual(soup.table.table.td.string, 'Juicy text')
        self.assertEqual(len(soup.findAll('table')), 2)
        self.assertEqual(len(soup.table.findAll('table')), 1)
        # Inner table -> td -> tr -> outer table.
        self.assertEqual(soup.find('table', {'id' : 2}).parent.parent.parent.name,
                         'table')

        text = "<table><tr><td><div><table>Foo</table></div></td></tr></table>"
        soup = BeautifulSoup(text)
        self.assertEqual(soup.table.tr.td.div.table.contents[0], "Foo")

        text = """<table><thead><tr>Foo</tr></thead><tbody><tr>Bar</tr></tbody>
        <tfoot><tr>Baz</tr></tfoot></table>"""
        soup = BeautifulSoup(text)
        self.assertEqual(soup.table.thead.tr.contents[0], "Foo")

    def testBadNestedTables(self):
        # A <table> placed directly inside a <tr> must still nest.
        soup = BeautifulSoup("<table><tr><table><tr id='nested'>")
        self.assertEqual(soup.table.tr.table.tr['id'], 'nested')
583

    
584
class CleanupOnAisleFour(SoupTest):
    """Here we test cleanup of text that breaks SGMLParser or is just
    obnoxious."""

    def testSelfClosingtag(self):
        self.assertEqual(str(BeautifulSoup("Foo<br/>Bar").find('br')),
                         '<br />')

        self.assertSoupEquals('<p>test1<br/>test2</p>',
                              '<p>test1<br />test2</p>')

        # Without a hint, the unknown tag is treated as a container...
        text = '<p>test1<selfclosing>test2'
        soup = BeautifulStoneSoup(text)
        self.assertEqual(str(soup),
                         '<p>test1<selfclosing>test2</selfclosing></p>')

        # ...but it can be declared self-closing by the caller.
        soup = BeautifulStoneSoup(text, selfClosingTags='selfclosing')
        self.assertEqual(str(soup),
                         '<p>test1<selfclosing />test2</p>')

    def testSelfClosingTagOrNot(self):
        # The two parser classes disagree about <link>, as the differing
        # expected outputs below show.
        text = "<item><link>http://foo.com/</link></item>"
        self.assertEqual(BeautifulStoneSoup(text).renderContents(), text)
        self.assertEqual(BeautifulSoup(text).renderContents(),
                         '<item><link />http://foo.com/</item>')

    def testCData(self):
        xml = "<root>foo<![CDATA[foobar]]>bar</root>"
        self.assertSoupEquals(xml, xml)
        r = re.compile("foo.*bar")
        soup = BeautifulSoup(xml)
        self.assertEqual(soup.find(text=r).string, "foobar")
        self.assertEqual(soup.find(text=r).__class__, CData)

    def testComments(self):
        xml = "foo<!--foobar-->baz"
        self.assertSoupEquals(xml)
        r = re.compile("foo.*bar")
        soup = BeautifulSoup(xml)
        self.assertEqual(soup.find(text=r).string, "foobar")
        self.assertEqual(soup.find(text="foobar").__class__, Comment)

    def testDeclaration(self):
        xml = "foo<!DOCTYPE foobar>baz"
        self.assertSoupEquals(xml)
        r = re.compile(".*foo.*bar")
        soup = BeautifulSoup(xml)
        text = "DOCTYPE foobar"
        self.assertEqual(soup.find(text=r).string, text)
        self.assertEqual(soup.find(text=text).__class__, Declaration)

        namespaced_doctype = ('<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">'
                              '<html>foo</html>')
        soup = BeautifulSoup(namespaced_doctype)
        self.assertEqual(soup.contents[0],
                         'DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"')
        self.assertEqual(soup.html.contents[0], 'foo')

    def testEntityConversions(self):
        text = "&lt;&lt;sacr&eacute;&#32;bleu!&gt;&gt;"
        # Parsing without conversion must at least not blow up.
        soup = BeautifulStoneSoup(text)
        self.assertSoupEquals(text)

        xmlEnt = BeautifulStoneSoup.XML_ENTITIES
        htmlEnt = BeautifulStoneSoup.HTML_ENTITIES
        xhtmlEnt = BeautifulStoneSoup.XHTML_ENTITIES

        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEqual(str(soup), "&lt;&lt;sacr&eacute; bleu!&gt;&gt;")

        soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
        self.assertEqual(unicode(soup), u"&lt;&lt;sacr\xe9 bleu!&gt;&gt;")

        # Make sure the "XML", "HTML", and "XHTML" settings work.
        text = "&lt;&trade;&apos;"
        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEqual(unicode(soup), u"&lt;&trade;'")

        soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
        self.assertEqual(unicode(soup), u"&lt;\u2122&apos;")

        soup = BeautifulStoneSoup(text, convertEntities=xhtmlEnt)
        self.assertEqual(unicode(soup), u"&lt;\u2122'")

        # A malformed numeric reference gets its ampersand escaped.
        invalidEntity = "foo&#bar;baz"
        soup = BeautifulStoneSoup(invalidEntity,
                                  convertEntities=htmlEnt)
        self.assertEqual(str(soup), "foo&amp;#bar;baz")

        # An unknown named entity is passed through untouched.
        nonexistentEntity = "foo&bar;baz"
        soup = BeautifulStoneSoup(nonexistentEntity,
                                  convertEntities="xml")
        self.assertEqual(str(soup), nonexistentEntity)

    def testNonBreakingSpaces(self):
        soup = BeautifulSoup("<a>&nbsp;&nbsp;</a>",
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEqual(unicode(soup), u"<a>\xa0\xa0</a>")

    def testWhitespaceInDeclaration(self):
        self.assertSoupEquals('<! DOCTYPE>', '<!DOCTYPE>')

    def testJunkInDeclaration(self):
        self.assertSoupEquals('<! Foo = -8>a', '&lt;!Foo = -8&gt;a')

    def testIncompleteDeclaration(self):
        self.assertSoupEquals('a<!b <p>c', 'a&lt;!b &lt;p&gt;c')

    def testEntityReplacement(self):
        self.assertSoupEquals('<b>hello&nbsp;there</b>')

    def testEntitiesInAttributeValues(self):
        # Decimal and hex character references yield the same UTF-8 bytes.
        self.assertSoupEquals('<x t="x&#241;">', '<x t="x\xc3\xb1"></x>')
        self.assertSoupEquals('<x t="x&#xf1;">', '<x t="x\xc3\xb1"></x>')

        soup = BeautifulSoup('<x t="&gt;&trade;">',
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEqual(unicode(soup), u'<x t="&gt;\u2122"></x>')

        uri = "http://crummy.com?sacr&eacute;&amp;bleu"
        link = '<a href="%s"></a>' % uri
        soup = BeautifulSoup(link)
        self.assertEqual(unicode(soup), link)
        #self.assertEquals(unicode(soup.a['href']), uri)

        soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.assertEqual(unicode(soup),
                         link.replace("&eacute;", u"\xe9"))

        uri = "http://crummy.com?sacr&eacute;&bleu"
        link = '<a href="%s"></a>' % uri
        soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.assertEqual(unicode(soup.a['href']),
                         uri.replace("&eacute;", u"\xe9"))

    def testNakedAmpersands(self):
        html = {'convertEntities':BeautifulStoneSoup.HTML_ENTITIES}
        soup = BeautifulStoneSoup("AT&T ", **html)
        self.assertEqual(str(soup), 'AT&amp;T ')

        nakedAmpersandInASentence = "AT&T was Ma Bell"
        soup = BeautifulStoneSoup(nakedAmpersandInASentence,**html)
        self.assertEqual(str(soup),
                         nakedAmpersandInASentence.replace('&','&amp;'))

        invalidURL = '<a href="http://example.org?a=1&b=2;3">foo</a>'
        validURL = invalidURL.replace('&','&amp;')
        soup = BeautifulStoneSoup(invalidURL)
        self.assertEqual(str(soup), validURL)

        # Already-escaped input must not be escaped a second time.
        soup = BeautifulStoneSoup(validURL)
        self.assertEqual(str(soup), validURL)
739

    
740

    
741
class EncodeRed(SoupTest):
    """Tests encoding conversion, Unicode conversion, and Microsoft
    smart quote fixes."""

    def testUnicodeDammitStandalone(self):
        # UnicodeDammit is used directly here, without building a soup.
        # The single byte \x92 is expected to come back as the numeric
        # entity &#x2019;.
        markup = "<foo>\x92</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEqual(dammit.unicode, "<foo>&#x2019;</foo>")

        # With an explicit list of candidate encodings, the bytes are
        # decoded with that encoding and it is reported as the original.
        hebrew = "\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEqual(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9')
        self.assertEqual(dammit.originalEncoding, 'iso-8859-8')

    def testGarbageInGarbageOut(self):
        # Plain ASCII markup round-trips unchanged.
        asciiMarkup = "<foo>a</foo>"
        asciiSoup = BeautifulStoneSoup(asciiMarkup)
        self.assertEqual(asciiMarkup, str(asciiSoup))

        # Sanity check on the fixture itself: the UTF-8 encoding of the
        # Unicode test document.
        unicodeData = u"<foo>\u00FC</foo>"
        utf8 = unicodeData.encode("utf-8")
        self.assertEqual(utf8, '<foo>\xc3\xbc</foo>')

        # Unicode in, Unicode out.
        unicodeSoup = BeautifulStoneSoup(unicodeData)
        self.assertEqual(unicodeData, unicode(unicodeSoup))
        self.assertEqual(unicode(unicodeSoup.foo.string), u'\u00FC')

        # UTF-8 bytes in (with the encoding stated), UTF-8 bytes out.
        utf8Soup = BeautifulStoneSoup(utf8, fromEncoding='utf-8')
        self.assertEqual(utf8, str(utf8Soup))
        self.assertEqual(utf8Soup.originalEncoding, "utf-8")

        # Unicode in: str() yields the UTF-8 bytes, and no original
        # encoding is recorded.
        fromUnicodeSoup = BeautifulStoneSoup(unicodeData)
        self.assertEqual(utf8, str(fromUnicodeSoup))
        self.assertEqual(fromUnicodeSoup.originalEncoding, None)

    def testHandleInvalidCodec(self):
        # Nonsense encoding names must not break parsing; the document is
        # expected to be detected as utf-8 regardless.
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            soup = BeautifulSoup("Räksmörgås", fromEncoding=bad_encoding)
            self.assertEqual(soup.originalEncoding, 'utf-8')

    def testUnicodeSearch(self):
        # A text search with a non-ASCII Unicode string finds the match.
        html = u'<html><body><h1>Räksmörgås</h1></body></html>'
        soup = BeautifulSoup(html)
        self.assertEqual(soup.find(text=u'Räksmörgås'), u'Räksmörgås')

    def testRewrittenXMLHeader(self):
        # The XML declaration of an EUC-JP document is expected to be
        # rewritten to declare utf-8 when the soup is rendered with str().
        euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n'
        utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n"
        soup = BeautifulStoneSoup(euc_jp)
        # The explanatory text is passed as the assertion message so that a
        # mismatch is reported as a test failure rather than an error.
        self.assertEqual(soup.originalEncoding, "euc-jp",
                         "Test failed when parsing euc-jp document. "
                         "If you're running Python >=2.4, or you have "
                         "cjkcodecs installed, this is a real problem. "
                         "Otherwise, ignore it.")
        self.assertEqual(str(soup), utf8)

        # A declared windows-1252 document: the declaration is rewritten
        # and the \x92 byte is expected to become &rsquo;.
        old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>"
        new_text = "<?xml version='1.0' encoding='utf-8'?><foo>&rsquo;</foo>"
        self.assertSoupEquals(old_text, new_text)

    def testRewrittenMetaTag(self):
        no_shift_jis_html = '''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'''
        # Parsed only to check that a document without a charset meta tag
        # can be constructed; the result itself is not inspected.
        soup = BeautifulSoup(no_shift_jis_html)

        # Beautiful Soup used to try to rewrite the meta tag even if the
        # meta tag got filtered out by the strainer. This test makes
        # sure that doesn't happen.
        strainer = SoupStrainer('pre')
        soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer)
        self.assertEqual(soup.contents[0].name, 'pre')

        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type" />')
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja" />'
            '</head><body><pre>\n'
            '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
            '</pre></body></html>') % meta_tag
        soup = BeautifulSoup(shift_jis_html)
        # As above, the hint is the assertion message so a mismatch shows
        # up as a failure; the charset 'x-sjis' from the meta tag is
        # expected to be reported as 'shift-jis'.
        self.assertEqual(soup.originalEncoding, "shift-jis",
                         "Test failed when parsing shift-jis document "
                         "with meta tag '%s'. "
                         "If you're running Python >=2.4, or you have "
                         "cjkcodecs installed, this is a real problem. "
                         "Otherwise, ignore it." % meta_tag)

        # In the parsed tree the charset is held as a placeholder ...
        content_type_tag = soup.meta['content']
        self.assertEqual(content_type_tag[content_type_tag.find('charset='):],
                         'charset=%SOUP-ENCODING%')
        # ... which renders as utf-8 for a plain str() ...
        content_type = str(soup.meta)
        index = content_type.find('charset=')
        self.assertEqual(content_type[index:index + len('charset=utf-8')],
                         'charset=utf-8')
        # ... and as the requested encoding when one is passed explicitly.
        content_type = soup.meta.__str__('shift-jis')
        index = content_type.find('charset=')
        self.assertEqual(content_type[index:index + len('charset=shift-jis')],
                         'charset=shift-jis')

        # Whole-document rendering: UTF-8 by default.
        self.assertEqual(str(soup), (
                '<html><head>\n'
                '<meta content="text/html; charset=utf-8" '
                'http-equiv="Content-type" />\n'
                '<meta http-equiv="Content-language" content="ja" />'
                '</head><body><pre>\n'
                '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3'
                '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3'
                '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6'
                '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3'
                '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'
                '</pre></body></html>'))
        # Rendering back to shift-jis reproduces the input bytes, apart
        # from the charset name in the meta tag.
        self.assertEqual(soup.renderContents("shift-jis"),
                         shift_jis_html.replace('x-sjis', 'shift-jis'))

        # Same check for a document that declares ISO-Latin-1.
        isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
        soup = BeautifulSoup(isolatin)
        self.assertSoupEquals(soup.__str__("utf-8"),
                              isolatin.replace("ISO-Latin-1", "utf-8").replace("\xe9", "\xc3\xa9"))

    def testHebrew(self):
        # An ISO 8859-8 document parsed with an explicit fromEncoding is
        # expected to render as UTF-8 with lower-cased tag names.
        iso_8859_8 = '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n'
        utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n'
        soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8")
        self.assertEqual(str(soup), utf8)

    def testSmartQuotesNotSoSmartAnymore(self):
        # The bytes \x91 and \x92 are expected to be rendered as the named
        # entities &lsquo; and &rsquo;.
        self.assertSoupEquals("\x91Foo\x92 <!--blah-->",
                              '&lsquo;Foo&rsquo; <!--blah-->')

    def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self):
        smartQuotes = "Il a dit, \x8BSacr&eacute; bl&#101;u!\x9b"
        # Without entity conversion the quote bytes become named entities
        # and the entities already present are left alone.
        soup = BeautifulSoup(smartQuotes)
        self.assertEqual(str(soup),
                         'Il a dit, &lsaquo;Sacr&eacute; bl&#101;u!&rsaquo;')
        # With convertEntities="html" everything is rendered as UTF-8
        # characters instead.
        soup = BeautifulSoup(smartQuotes, convertEntities="html")
        self.assertEqual(str(soup),
                         'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba')

    def testDontSeeSmartQuotesWhereThereAreNone(self):
        # This UTF-8 byte string must pass through unchanged, i.e. none of
        # its bytes may be treated as a smart quote.
        utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        self.assertSoupEquals(utf_8)
888

    
889

    
890
class Whitewash(SoupTest):
    """Test whitespace preservation."""

    def testPreservedWhitespace(self):
        # Whitespace inside <pre> must round-trip exactly, whether the
        # element holds only spaces or text surrounded by spaces.
        self.assertSoupEquals("<pre>   </pre>")
        self.assertSoupEquals("<pre> woo  </pre>")

    def testCollapsedWhitespace(self):
        # Outside <pre>, a whitespace-only run is expected to collapse to
        # a single space.
        self.assertSoupEquals("<p>   </p>", "<p> </p>")
899

    
900

    
901
if __name__ == '__main__':
    # Run every test case defined in this module when the file is executed
    # directly as a script.
    unittest.main()