gvsig-scripting / org.gvsig.scripting / trunk / org.gvsig.scripting / org.gvsig.scripting.app / org.gvsig.scripting.app.mainplugin / src / main / resources-plugin / scripting / lib / BeautifulSoupTests.py @ 475
History | View | Annotate | Download (36.7 KB)
1 |
# -*- coding: utf-8 -*-
|
---|---|
2 |
"""Unit tests for Beautiful Soup.
|
3 |
|
4 |
These tests make sure the Beautiful Soup works as it should. If you
|
5 |
find a bug in Beautiful Soup, the best way to express it is as a test
|
6 |
case like this that fails."""
|
7 |
|
8 |
import unittest |
9 |
from BeautifulSoup import * |
10 |
|
11 |
class SoupTest(unittest.TestCase):
    """Base class for the Beautiful Soup test cases.

    Provides assertSoupEquals, which parses a piece of markup and
    checks that its serialized form matches an expected string.
    """

    def assertSoupEquals(self, toParse, rep=None, c=BeautifulSoup):
        """Parse `toParse` with parser class `c` and assert that its
        string rep is `rep` (or `toParse` itself when `rep` is omitted)."""
        # `is None` is the correct identity test; the original `== None`
        # invokes __eq__ and can misfire on objects that override it
        # (NavigableString-like values compare by content).
        if rep is None:
            rep = toParse
        self.assertEqual(str(c(toParse)), rep)
19 |
|
20 |
|
21 |
class FollowThatTag(SoupTest):

    "Tests the various ways of fetching tags from a soup."

    def setUp(self):
        # Fixture: five tags, two with id="x" (one of them also has href).
        # NOTE(review): indentation inside the markup string is
        # reconstructed; only the presence of whitespace between the
        # tags matters to these tests.
        ml = """
        <a id="x">1</a>
        <A id="a">2</a>
        <b id="b">3</a>
        <b href="foo" id="x">4</a>
        <ac width=100>4</ac>"""
        self.soup = BeautifulStoneSoup(ml)

    def testFindAllByName(self):
        # Calling the soup is shorthand for findAll; tag names are
        # matched case-insensitively (<a> and <A>).
        matching = self.soup('a')
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')
        self.assertEqual(matching, self.soup.findAll('a'))
        self.assertEqual(matching, self.soup.findAll(SoupStrainer('a')))

    def testFindAllByAttribute(self):
        matching = self.soup.findAll(id='x')
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')
        self.assertEqual(matching[1].name, 'b')

        # keyword args and an explicit attrs dict are equivalent.
        matching2 = self.soup.findAll(attrs={'id' : 'x'})
        self.assertEqual(matching, matching2)

        strainer = SoupStrainer(attrs={'id' : 'x'})
        self.assertEqual(matching, self.soup.findAll(strainer))

        # attr=None matches tags WITHOUT the attribute (only <ac>).
        self.assertEqual(len(self.soup.findAll(id=None)), 1)

        self.assertEqual(len(self.soup.findAll(width=100)), 1)
        self.assertEqual(len(self.soup.findAll(junk=None)), 5)
        self.assertEqual(len(self.soup.findAll(junk=[1, None])), 5)

        # A regex or True requires the attribute to be present.
        self.assertEqual(len(self.soup.findAll(junk=re.compile('.*'))), 0)
        self.assertEqual(len(self.soup.findAll(junk=True)), 0)

        self.assertEqual(len(self.soup.findAll(junk=True)), 0)
        self.assertEqual(len(self.soup.findAll(href=True)), 1)

    def testFindallByClass(self):
        # A bare string as attrs matches the CSS class; multi-valued
        # classes match on each whitespace-separated word.
        soup = BeautifulSoup('<b class="foo">Foo</b><a class="1 23 4">Bar</a>')
        self.assertEqual(soup.find(attrs='foo').string, "Foo")
        self.assertEqual(soup.find('a', '1').string, "Bar")
        self.assertEqual(soup.find('a', '23').string, "Bar")
        self.assertEqual(soup.find('a', '4').string, "Bar")

        # "2" is only a substring of "23", not a class word of its own.
        self.assertEqual(soup.find('a', '2'), None)

    def testFindAllByList(self):
        # A list of names matches any of them.
        matching = self.soup(['a', 'ac'])
        self.assertEqual(len(matching), 3)

    def testFindAllByHash(self):
        # A dict of names works the same way as a list.
        matching = self.soup({'a' : True, 'b' : True})
        self.assertEqual(len(matching), 4)

    def testFindAllText(self):
        soup = BeautifulSoup("<html>\xbb</html>")
        self.assertEqual(soup.findAll(text=re.compile('.*')),
                         [u'\xbb'])

    def testFindAllByRE(self):
        # A compiled regex matches against tag names.
        import re
        r = re.compile('a.*')
        self.assertEqual(len(self.soup(r)), 3)

    def testFindAllByMethod(self):
        # A callable is invoked on each Tag and keeps those returning true.
        def matchTagWhereIDMatchesName(tag):
            return tag.name == tag.get('id')

        matching = self.soup.findAll(matchTagWhereIDMatchesName)
        self.assertEqual(len(matching), 2)
        self.assertEqual(matching[0].name, 'a')

    def testFindByIndex(self):
        """For when you have the tag and you want to know where it is."""
        tag = self.soup.find('a', id="a")
        # Index 3: whitespace, <a id="x">, whitespace, then this tag.
        self.assertEqual(self.soup.index(tag), 3)

        # It works for NavigableStrings as well.
        s = tag.string
        self.assertEqual(tag.index(s), 0)

        # If the tag isn't present, a ValueError is raised.
        soup2 = BeautifulSoup("<b></b>")
        tag2 = soup2.find('b')
        self.assertRaises(ValueError, self.soup.index, tag2)

    def testConflictingFindArguments(self):
        """The 'text' argument takes precedence."""
        soup = BeautifulSoup('Foo<b>Bar</b>Baz')
        self.assertEqual(soup.find('b', text='Baz'), 'Baz')
        self.assertEqual(soup.findAll('b', text='Baz'), ['Baz'])

        self.assertEqual(soup.find(True, text='Baz'), 'Baz')
        self.assertEqual(soup.findAll(True, text='Baz'), ['Baz'])

    def testParents(self):
        # The <b> sits inside three nested <ul>s, two of which have id="foo".
        soup = BeautifulSoup('<ul id="foo"></ul><ul id="foo"><ul><ul id="foo" a="b"><b>Blah')
        b = soup.b
        self.assertEquals(len(b.findParents('ul', {'id' : 'foo'})), 2)
        self.assertEquals(b.findParent('ul')['a'], 'b')

    # Shared fixture: four nested <b> tags, ids 1..4 outermost-in.
    PROXIMITY_TEST = BeautifulSoup('<b id="1"><b id="2"><b id="3"><b id="4">')

    def testNext(self):
        soup = self.PROXIMITY_TEST
        b = soup.find('b', {'id' : 2})
        # Asserted twice on purpose: findNext must not advance state.
        self.assertEquals(b.findNext('b')['id'], '3')
        self.assertEquals(b.findNext('b')['id'], '3')
        self.assertEquals(len(b.findAllNext('b')), 2)
        self.assertEquals(len(b.findAllNext('b', {'id' : 4})), 1)

    def testPrevious(self):
        soup = self.PROXIMITY_TEST
        b = soup.find('b', {'id' : 3})
        # Asserted twice on purpose: findPrevious must not advance state.
        self.assertEquals(b.findPrevious('b')['id'], '2')
        self.assertEquals(b.findPrevious('b')['id'], '2')
        self.assertEquals(len(b.findAllPrevious('b')), 2)
        self.assertEquals(len(b.findAllPrevious('b', {'id' : 2})), 1)

    # Shared fixture: four sibling blockquotes, each (except the last)
    # containing a nested blockquote with a dotted id.
    SIBLING_TEST = BeautifulSoup('<blockquote id="1"><blockquote id="1.1"></blockquote></blockquote><blockquote id="2"><blockquote id="2.1"></blockquote></blockquote><blockquote id="3"><blockquote id="3.1"></blockquote></blockquote><blockquote id="4">')

    def testNextSibling(self):
        soup = self.SIBLING_TEST
        tag = 'blockquote'
        b = soup.find(tag, {'id' : 2})
        # findNext descends into children; findNextSibling does not.
        self.assertEquals(b.findNext(tag)['id'], '2.1')
        self.assertEquals(b.findNextSibling(tag)['id'], '3')
        self.assertEquals(b.findNextSibling(tag)['id'], '3')
        self.assertEquals(len(b.findNextSiblings(tag)), 2)
        self.assertEquals(len(b.findNextSiblings(tag, {'id' : 4})), 1)

    def testPreviousSibling(self):
        soup = self.SIBLING_TEST
        tag = 'blockquote'
        b = soup.find(tag, {'id' : 3})
        # findPrevious walks back through nested tags; the sibling
        # variants stay at the same level.
        self.assertEquals(b.findPrevious(tag)['id'], '2.1')
        self.assertEquals(b.findPreviousSibling(tag)['id'], '2')
        self.assertEquals(b.findPreviousSibling(tag)['id'], '2')
        self.assertEquals(len(b.findPreviousSiblings(tag)), 2)
        self.assertEquals(len(b.findPreviousSiblings(tag, id=1)), 1)

    def testTextNavigation(self):
        # Navigation methods work from a NavigableString too.
        soup = BeautifulSoup('Foo<b>Bar</b><i id="1"><b>Baz<br />Blee<hr id="1"/></b></i>Blargh')
        baz = soup.find(text='Baz')
        self.assertEquals(baz.findParent("i")['id'], '1')
        self.assertEquals(baz.findNext(text='Blee'), 'Blee')
        self.assertEquals(baz.findNextSibling(text='Blee'), 'Blee')
        # 'Blargh' is outside <b>, so it is not a sibling of Baz.
        self.assertEquals(baz.findNextSibling(text='Blargh'), None)
        self.assertEquals(baz.findNextSibling('hr')['id'], '1')
178 |
|
179 |
class SiblingRivalry(SoupTest):
    "Tests the nextSibling and previousSibling navigation."

    def testSiblings(self):
        soup = BeautifulSoup("<ul><li>1<p>A</p>B<li>2<li>3</ul>")
        # The parser implicitly closes each <li>, so the second <li>
        # ends up as a direct sibling of the first.
        second = soup.find('li').nextSibling
        self.assertTrue(second.name == 'li' and second.string == '2')
        # Navigation also works starting from a NavigableString.
        self.assertEqual(soup.find(text='1').nextSibling.name, 'p')
        self.assertEqual(soup.find('p').nextSibling, 'B')
        # Stepping forward, back, and forward again lands on the same node.
        self.assertEqual(
            soup.find('p').nextSibling.previousSibling.nextSibling, 'B')
189 |
|
190 |
class TagsAreObjectsToo(SoupTest):
    "Tests the various built-in functions of Tag objects."

    def testLen(self):
        # len() of a tag counts its direct children: here the string
        # "1", the <b> tag, and the string "3".
        doc = BeautifulSoup("<top>1<b>2</b>3</top>")
        self.assertEqual(len(doc.top), 3)
196 |
|
197 |
class StringEmUp(SoupTest):
    "Tests the use of 'string' as an alias for a tag's only content."

    def testString(self):
        doc = BeautifulSoup("<b>foo</b>")
        self.assertEqual(doc.b.string, 'foo')

    def testLackOfString(self):
        # .string is only meaningful when the tag has a single string
        # child; mixed content yields a false value.
        doc = BeautifulSoup("<b>f<i>e</i>o</b>")
        self.assertTrue(not doc.b.string)

    def testStringAssign(self):
        doc = BeautifulSoup("<b></b>")
        tag = doc.b
        tag.string = "foo"
        value = tag.string
        self.assertEqual(value, "foo")
        # Assigning a plain str wraps it in a NavigableString.
        self.assertTrue(isinstance(value, NavigableString))
215 |
|
216 |
class AllText(SoupTest):
    "Tests the use of 'text' to get all of string content from the tag."

    def testText(self):
        doc = BeautifulSoup("<ul><li>spam</li><li>eggs</li><li>cheese</li>")
        # .text concatenates every string beneath the tag...
        self.assertEqual(doc.ul.text, "spameggscheese")
        # ...while getText() accepts an explicit separator.
        self.assertEqual(doc.ul.getText('/'), "spam/eggs/cheese")
223 |
|
224 |
class ThatsMyLimit(SoupTest):
    "Tests the limit argument."

    def testBasicLimits(self):
        doc = BeautifulSoup('<br id="1" /><br id="1" /><br id="1" /><br id="1" />')
        # Without a limit every match is returned.
        self.assertEqual(len(doc.findAll('br')), 4)
        # limit= caps the result count, both via findAll()...
        self.assertEqual(len(doc.findAll('br', limit=2)), 2)
        # ...and via the call-the-soup shorthand.
        self.assertEqual(len(doc('br', limit=2)), 2)
232 |
|
233 |
class OnlyTheLonely(SoupTest):
    "Tests the parseOnly argument to the constructor."

    def setUp(self):
        # Build five <a> groups, each holding three <b> tags with ids
        # "1.100" .. "5.102" and matching "Content i.j" text.
        x = []
        for i in range(1,6):
            x.append('<a id="%s">' % i)
            for j in range(100,103):
                x.append('<b id="%s.%s">Content %s.%s</b>' % (i,j, i,j))
            x.append('</a>')
        self.x = ''.join(x)

    def testOnly(self):
        # Keep only <b> tags: 5 groups x 3 tags = 15 top-level elements.
        strainer = SoupStrainer("b")
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEquals(len(soup), 15)

        # Keep only tags whose id matches "100.*": one per group.
        strainer = SoupStrainer(id=re.compile("100.*"))
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEquals(len(soup), 5)

        # Keep only text nodes matching "10[01].*": two per group.
        strainer = SoupStrainer(text=re.compile("10[01].*"))
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEquals(len(soup), 10)

        # Fixed: the original read `lambda(x): x[8]=='3'`, the Python-2-only
        # tuple-parameter form removed by PEP 3113. A plain one-argument
        # lambda is byte-for-byte equivalent at runtime and forward-portable.
        # x[8] is the first digit of i in "Content i.j", so this keeps the
        # three texts of group 3.
        strainer = SoupStrainer(text=lambda x: x[8] == '3')
        soup = BeautifulSoup(self.x, parseOnlyThese=strainer)
        self.assertEquals(len(soup), 3)
260 |
|
261 |
class PickleMeThis(SoupTest):
    "Testing features like pickle and deepcopy."

    def setUp(self):
        # A small but realistic HTML page used by every test below.
        self.page = """<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN"
"http://www.w3.org/TR/REC-html40/transitional.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Beautiful Soup: We called him Tortoise because he taught us.</title>
<link rev="made" href="mailto:leonardr@segfault.org">
<meta name="Description" content="Beautiful Soup: an HTML parser optimized for screen-scraping.">
<meta name="generator" content="Markov Approximation 1.4 (module: leonardr)">
<meta name="author" content="Leonard Richardson">
</head>
<body>
<a href="foo">foo</a>
<a href="foo"><b>bar</b></a>
</body>
</html>"""
        self.soup = BeautifulSoup(self.page)

    def testPickle(self):
        import pickle
        # Round-trip through protocol 2; class and serialization survive.
        roundtripped = pickle.loads(pickle.dumps(self.soup, 2))
        self.assertEqual(roundtripped.__class__, BeautifulSoup)
        self.assertEqual(str(roundtripped), str(self.soup))

    def testDeepcopy(self):
        from copy import deepcopy
        clone = deepcopy(self.soup)
        self.assertEqual(str(clone), str(self.soup))

    def testUnicodePickle(self):
        # Non-ASCII bytes must survive the C pickler at max protocol.
        import cPickle as pickle
        markup = "<b>" + chr(0xc3) + "</b>"
        soup = BeautifulSoup(markup)
        roundtripped = pickle.loads(pickle.dumps(soup, pickle.HIGHEST_PROTOCOL))
        self.assertEqual(str(roundtripped), str(soup))
303 |
|
304 |
|
305 |
class WriteOnlyCode(SoupTest):
    "Testing the modification of the tree."

    def testModifyAttributes(self):
        soup = BeautifulSoup('<a id="1"></a>')
        # Non-string values are coerced to strings on output.
        soup.a['id'] = 2
        self.assertEqual(soup.renderContents(), '<a id="2"></a>')
        del(soup.a['id'])
        self.assertEqual(soup.renderContents(), '<a></a>')
        soup.a['id2'] = 'foo'
        self.assertEqual(soup.renderContents(), '<a id2="foo"></a>')

    def testNewTagCreation(self):
        "Makes sure tags don't step on each others' toes."
        soup = BeautifulSoup()
        a = Tag(soup, 'a')
        ol = Tag(soup, 'ol')
        a['href'] = 'http://foo.com/'
        # Setting an attribute on one new tag must not leak to another.
        self.assertRaises(KeyError, lambda : ol['href'])

    def testNewTagWithAttributes(self):
        """Makes sure new tags can be created complete with attributes."""
        soup = BeautifulSoup()
        # Attributes may be given as a list of pairs or as a dict.
        a = Tag(soup, 'a', [('href', 'foo')])
        b = Tag(soup, 'b', {'class':'bar'})
        soup.insert(0,a)
        soup.insert(1,b)
        self.assertEqual(soup.a['href'], 'foo')
        self.assertEqual(soup.b['class'], 'bar')

    def testTagReplacement(self):
        # Make sure you can replace an element with itself.
        text = "<a><b></b><c>Foo<d></d></c></a><a><e></e></a>"
        soup = BeautifulSoup(text)
        c = soup.c
        soup.c.replaceWith(c)
        self.assertEquals(str(soup), text)

        # A very simple case
        soup = BeautifulSoup("<b>Argh!</b>")
        soup.find(text="Argh!").replaceWith("Hooray!")
        newText = soup.find(text="Hooray!")
        b = soup.b
        # The replacement string is fully wired into the tree.
        self.assertEqual(newText.previous, b)
        self.assertEqual(newText.parent, b)
        self.assertEqual(newText.previous.next, newText)
        self.assertEqual(newText.next, None)

        # A more complex case
        soup = BeautifulSoup("<a><b>Argh!</b><c></c><d></d></a>")
        soup.b.insert(1, "Hooray!")
        newText = soup.find(text="Hooray!")
        self.assertEqual(newText.previous, "Argh!")
        self.assertEqual(newText.previous.next, newText)

        self.assertEqual(newText.previousSibling, "Argh!")
        self.assertEqual(newText.previousSibling.nextSibling, newText)

        self.assertEqual(newText.nextSibling, None)
        self.assertEqual(newText.next, soup.c)

        text = "<html>There's <b>no</b> business like <b>show</b> business</html>"
        soup = BeautifulSoup(text)
        no, show = soup.findAll('b')
        show.replaceWith(no)
        # NOTE(review): two spaces after "There's" — moving <b>no</b> leaves
        # both surrounding text nodes' trailing/leading spaces behind. The
        # HTML dump this was recovered from collapsed them to one; confirm
        # against the original file.
        self.assertEquals(str(soup), "<html>There's  business like <b>no</b> business</html>")

        # Even more complex
        soup = BeautifulSoup("<a><b>Find</b><c>lady!</c><d></d></a>")
        tag = Tag(soup, 'magictag')
        tag.insert(0, "the")
        soup.a.insert(1, tag)

        b = soup.b
        c = soup.c
        theText = tag.find(text=True)
        findText = b.find(text="Find")

        # The inserted tag is stitched into both the sibling chain and
        # the linear next/previous chain.
        self.assertEqual(findText.next, tag)
        self.assertEqual(tag.previous, findText)
        self.assertEqual(b.nextSibling, tag)
        self.assertEqual(tag.previousSibling, b)
        self.assertEqual(tag.nextSibling, c)
        self.assertEqual(c.previousSibling, tag)

        self.assertEqual(theText.next, c)
        self.assertEqual(c.previous, theText)

        # Aand... incredibly complex.
        soup = BeautifulSoup("""<a>We<b>reserve<c>the</c><d>right</d></b></a><e>to<f>refuse</f><g>service</g></e>""")
        f = soup.f
        a = soup.a
        c = soup.c
        e = soup.e
        weText = a.find(text="We")
        # Replacing <b> with <f> first extracts <f> from its old home.
        soup.b.replaceWith(soup.f)
        self.assertEqual(str(soup), "<a>We<f>refuse</f></a><e>to<g>service</g></e>")

        self.assertEqual(f.previous, weText)
        self.assertEqual(weText.next, f)
        self.assertEqual(f.previousSibling, weText)
        self.assertEqual(f.nextSibling, None)
        self.assertEqual(weText.nextSibling, f)

    def testReplaceWithChildren(self):
        soup = BeautifulStoneSoup(
            "<top><replace><child1/><child2/></replace></top>",
            selfClosingTags=["child1", "child2"])
        # .replaceTag is the .fooTag spelling of find('replace').
        soup.replaceTag.replaceWithChildren()
        self.assertEqual(soup.top.contents[0].name, "child1")
        self.assertEqual(soup.top.contents[1].name, "child2")

    def testAppend(self):
        doc = "<p>Don't leave me <b>here</b>.</p> <p>Don't leave me.</p>"
        soup = BeautifulSoup(doc)
        second_para = soup('p')[1]
        bold = soup.find('b')
        # append() moves the tag: it disappears from the first paragraph.
        soup('p')[1].append(soup.find('b'))
        self.assertEqual(bold.parent, second_para)
        self.assertEqual(str(soup),
                         "<p>Don't leave me .</p> "
                         "<p>Don't leave me.<b>here</b></p>")

    def testTagExtraction(self):
        # A very simple case
        text = '<html><div id="nav">Nav crap</div>Real content here.</html>'
        soup = BeautifulSoup(text)
        extracted = soup.find("div", id="nav").extract()
        self.assertEqual(str(soup), "<html>Real content here.</html>")
        self.assertEqual(str(extracted), '<div id="nav">Nav crap</div>')

        # A simple case, a more complex test.
        text = "<doc><a>1<b>2</b></a><a>i<b>ii</b></a><a>A<b>B</b></a></doc>"
        soup = BeautifulStoneSoup(text)
        doc = soup.doc
        numbers, roman, letters = soup("a")

        self.assertEqual(roman.parent, doc)
        oldPrevious = roman.previous
        endOfThisTag = roman.nextSibling.previous
        self.assertEqual(oldPrevious, "2")
        self.assertEqual(roman.next, "i")
        self.assertEqual(endOfThisTag, "ii")
        self.assertEqual(roman.previousSibling, numbers)
        self.assertEqual(roman.nextSibling, letters)

        roman.extract()
        # The extracted subtree is fully detached...
        self.assertEqual(roman.parent, None)
        self.assertEqual(roman.previous, None)
        self.assertEqual(roman.next, "i")
        self.assertEqual(letters.previous, '2')
        self.assertEqual(roman.previousSibling, None)
        self.assertEqual(roman.nextSibling, None)
        self.assertEqual(endOfThisTag.next, None)
        self.assertEqual(roman.b.contents[0].next, None)
        # ...and the remaining siblings are re-linked around the gap.
        self.assertEqual(numbers.nextSibling, letters)
        self.assertEqual(letters.previousSibling, numbers)
        self.assertEqual(len(doc.contents), 2)
        self.assertEqual(doc.contents[0], numbers)
        self.assertEqual(doc.contents[1], letters)

        # A more complex case.
        text = "<a>1<b>2<c>Hollywood, baby!</c></b></a>3"
        soup = BeautifulStoneSoup(text)
        one = soup.find(text="1")
        three = soup.find(text="3")
        toExtract = soup.b
        soup.b.extract()
        self.assertEqual(one.next, three)
        self.assertEqual(three.previous, one)
        self.assertEqual(one.parent.nextSibling, three)
        self.assertEqual(three.previousSibling, soup.a)

    def testClear(self):
        soup = BeautifulSoup("<ul><li></li><li></li></ul>")
        # clear() removes every child but keeps the tag itself.
        soup.ul.clear()
        self.assertEqual(len(soup.ul.contents), 0)
482 |
|
483 |
class TheManWithoutAttributes(SoupTest):
    "Test attribute access"

    def testHasKey(self):
        # Tags expose a dict-style has_key() over their attributes.
        markup = "<foo attr='bar'>"
        self.assertEqual(BeautifulSoup(markup).foo.has_key('attr'), True)
489 |
|
490 |
class QuoteMeOnThat(SoupTest):
    "Test quoting"
    def testQuotedAttributeValues(self):
        # Plain values are normalized to double quotes.
        self.assertSoupEquals("<foo attr='bar'></foo>",
                              '<foo attr="bar"></foo>')

        # A value containing double quotes keeps its single quotes.
        text = """<foo attr='bar "brawls" happen'>a</foo>"""
        soup = BeautifulSoup(text)
        self.assertEquals(soup.renderContents(), text)

        # A value containing BOTH quote kinds gets the single quote
        # escaped with BS3's (nonstandard) &squot; entity.
        soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
        newText = """<foo attr='Brawls happen at "Bob&squot;s Bar"'>a</foo>"""
        self.assertSoupEquals(soup.renderContents(), newText)

        # NOTE(review): the expected output's "&amp;" was rendered to a
        # bare "&" in the HTML dump this file was recovered from; BS3
        # escapes naked ampersands on output, so &amp; is restored here.
        # Confirm against the original file.
        self.assertSoupEquals('<this is="really messed up & stuff">',
                              '<this is="really messed up &amp; stuff"></this>')

        # This is not what the original author had in mind, but it's
        # a legitimate interpretation of what they wrote.
        self.assertSoupEquals("""<a href="foo</a>, </a><a href="bar">baz</a>""",
                              '<a href="foo</a>, </a><a href="></a>, <a href="bar">baz</a>')

        # SGMLParser generates bogus parse events when attribute values
        # contain embedded brackets, but at least Beautiful Soup fixes
        # it up a little.
        # NOTE(review): "&lt;a&gt;" / "&gt;" reconstructed from the
        # entity-rendered dump — confirm against the original file.
        self.assertSoupEquals('<a b="<a>">', '<a b="&lt;a&gt;"></a><a>"&gt;</a>')
        self.assertSoupEquals('<a href="http://foo.com/<a> and blah and blah',
                              """<a href='"http://foo.com/'></a><a> and blah and blah</a>""")
|
518 |
|
519 |
|
520 |
|
521 |
class YoureSoLiteral(SoupTest):
    "Test literal mode."

    def testLiteralMode(self):
        # Everything inside <script> is preserved verbatim, even a bare "<".
        markup = "<script>if (i<imgs.length)</script><b>Foo</b>"
        doc = BeautifulSoup(markup)
        self.assertEqual(doc.script.contents[0], "if (i<imgs.length)")
        self.assertEqual(doc.b.contents[0], "Foo")

    def testTextArea(self):
        # <textarea> content is likewise taken literally, not parsed.
        markup = "<textarea><b>This is an example of an HTML tag</b><&<&</textarea>"
        doc = BeautifulSoup(markup)
        self.assertEqual(doc.textarea.contents[0],
                         "<b>This is an example of an HTML tag</b><&<&")
|
534 |
|
535 |
class OperatorOverload(SoupTest):
    "Our operators do it all! Call now!"

    def testTagNameAsFind(self):
        "Tests that referencing a tag name as a member delegates to find()."
        doc = BeautifulSoup('<b id="1">foo<i>bar</i></b><b>Red herring</b>')
        self.assertEqual(doc.b.i, doc.find('b').find('i'))
        self.assertEqual(doc.b.i.string, 'bar')
        self.assertEqual(doc.b['id'], '1')
        self.assertEqual(doc.b.contents[0], 'foo')
        # A missing tag is falsy rather than an error.
        self.assertTrue(not doc.a)

        # The .fooTag spelling avoids clashes with real Tag attributes.
        self.assertEqual(doc.bTag.iTag.string, 'bar')
        self.assertEqual(doc.b.iTag.string, 'bar')
        self.assertEqual(doc.find('b').find('i'), doc.bTag.iTag)
551 |
|
552 |
class NestableEgg(SoupTest):
    """Here we test tag nesting. TEST THE NEST, DUDE! X-TREME!"""

    def testParaInsideBlockquote(self):
        doc = BeautifulSoup('<blockquote><p><b>Foo</blockquote><p>Bar')
        self.assertEqual(doc.blockquote.p.b.string, 'Foo')
        self.assertEqual(doc.blockquote.b.string, 'Foo')
        # The second <p> lands at top level, outside the blockquote.
        self.assertEqual(doc.find('p', recursive=False).string, 'Bar')

    def testNestedTables(self):
        markup = """<table id="1"><tr><td>Here's another table:
<table id="2"><tr><td>Juicy text</td></tr></table></td></tr></table>"""
        doc = BeautifulSoup(markup)
        # A <table> inside a <td> nests instead of closing the outer table.
        self.assertEqual(doc.table.table.td.string, 'Juicy text')
        self.assertEqual(len(doc.findAll('table')), 2)
        self.assertEqual(len(doc.table.findAll('table')), 1)
        self.assertEqual(
            doc.find('table', {'id': 2}).parent.parent.parent.name, 'table')

        markup = "<table><tr><td><div><table>Foo</table></div></td></tr></table>"
        doc = BeautifulSoup(markup)
        self.assertEqual(doc.table.tr.td.div.table.contents[0], "Foo")

        markup = """<table><thead><tr>Foo</tr></thead><tbody><tr>Bar</tr></tbody>
<tfoot><tr>Baz</tr></tfoot></table>"""
        doc = BeautifulSoup(markup)
        self.assertEqual(doc.table.thead.tr.contents[0], "Foo")

    def testBadNestedTables(self):
        doc = BeautifulSoup("<table><tr><table><tr id='nested'>")
        # The inner table still nests even without closing tags.
        self.assertEqual(doc.table.tr.table.tr['id'], 'nested')
583 |
|
584 |
class CleanupOnAisleFour(SoupTest):
    """Here we test cleanup of text that breaks SGMLParser or is just
    obnoxious."""
    # NOTE(review): this file was recovered from an HTML dump that rendered
    # entity references (&amp;, &lt;, &nbsp;, &eacute;, ...) to their
    # characters. The entity spellings below are reconstructed so the
    # assertions are self-consistent; confirm against the original file.

    def testSelfClosingtag(self):
        # "<br/>" is normalized to XHTML-style "<br />".
        self.assertEqual(str(BeautifulSoup("Foo<br/>Bar").find('br')),
                         '<br />')

        self.assertSoupEquals('<p>test1<br/>test2</p>',
                              '<p>test1<br />test2</p>')

        text = '<p>test1<selfclosing>test2'
        soup = BeautifulStoneSoup(text)
        self.assertEqual(str(soup),
                         '<p>test1<selfclosing>test2</selfclosing></p>')

        # Declaring the tag self-closing changes how it is balanced.
        soup = BeautifulStoneSoup(text, selfClosingTags='selfclosing')
        self.assertEqual(str(soup),
                         '<p>test1<selfclosing />test2</p>')

    def testSelfClosingTagOrNot(self):
        text = "<item><link>http://foo.com/</link></item>"
        # XML parser keeps <link> as a container; the HTML parser treats
        # it as self-closing.
        self.assertEqual(BeautifulStoneSoup(text).renderContents(), text)
        self.assertEqual(BeautifulSoup(text).renderContents(),
                         '<item><link />http://foo.com/</item>')

    def testCData(self):
        xml = "<root>foo<![CDATA[foobar]]>bar</root>"
        self.assertSoupEquals(xml, xml)
        r = re.compile("foo.*bar")
        soup = BeautifulSoup(xml)
        self.assertEquals(soup.find(text=r).string, "foobar")
        self.assertEquals(soup.find(text=r).__class__, CData)

    def testComments(self):
        xml = "foo<!--foobar-->baz"
        self.assertSoupEquals(xml)
        r = re.compile("foo.*bar")
        soup = BeautifulSoup(xml)
        self.assertEquals(soup.find(text=r).string, "foobar")
        self.assertEquals(soup.find(text="foobar").__class__, Comment)

    def testDeclaration(self):
        xml = "foo<!DOCTYPE foobar>baz"
        self.assertSoupEquals(xml)
        r = re.compile(".*foo.*bar")
        soup = BeautifulSoup(xml)
        text = "DOCTYPE foobar"
        self.assertEquals(soup.find(text=r).string, text)
        self.assertEquals(soup.find(text=text).__class__, Declaration)

        namespaced_doctype = ('<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">'
                              '<html>foo</html>')
        soup = BeautifulSoup(namespaced_doctype)
        self.assertEquals(soup.contents[0],
                          'DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd"')
        self.assertEquals(soup.html.contents[0], 'foo')

    def testEntityConversions(self):
        text = "&lt;&lt;sacr&eacute; bleu!&gt;&gt;"
        soup = BeautifulStoneSoup(text)
        self.assertSoupEquals(text)

        xmlEnt = BeautifulStoneSoup.XML_ENTITIES
        htmlEnt = BeautifulStoneSoup.HTML_ENTITIES
        xhtmlEnt = BeautifulStoneSoup.XHTML_ENTITIES

        # XML entities convert &lt;/&gt; but leave HTML-only &eacute; alone.
        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEquals(str(soup), "<<sacr&eacute; bleu!>>")

        soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
        self.assertEquals(unicode(soup), u"<<sacr\xe9 bleu!>>")

        # Make sure the "XML", "HTML", and "XHTML" settings work.
        # &trade; is HTML-only; &apos; is XML/XHTML-only.
        text = "&lt;&trade;&apos;"
        soup = BeautifulStoneSoup(text, convertEntities=xmlEnt)
        self.assertEquals(unicode(soup), u"<&trade;'")

        soup = BeautifulStoneSoup(text, convertEntities=htmlEnt)
        self.assertEquals(unicode(soup), u"<\u2122&apos;")

        soup = BeautifulStoneSoup(text, convertEntities=xhtmlEnt)
        self.assertEquals(unicode(soup), u"<\u2122'")

        # A malformed numeric entity passes through untouched.
        invalidEntity = "foo&#bar;baz"
        soup = BeautifulStoneSoup\
               (invalidEntity,
                convertEntities=htmlEnt)
        self.assertEquals(str(soup), "foo&#bar;baz")

        # An unknown named entity also passes through untouched.
        nonexistentEntity = "foo&bar;baz"
        soup = BeautifulStoneSoup\
               (nonexistentEntity,
                convertEntities="xml")
        self.assertEquals(str(soup), nonexistentEntity)


    def testNonBreakingSpaces(self):
        soup = BeautifulSoup("<a>&nbsp;&nbsp;</a>",
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEquals(unicode(soup), u"<a>\xa0\xa0</a>")

    def testWhitespaceInDeclaration(self):
        self.assertSoupEquals('<! DOCTYPE>', '<!DOCTYPE>')

    def testJunkInDeclaration(self):
        self.assertSoupEquals('<! Foo = -8>a', '<!Foo = -8>a')

    def testIncompleteDeclaration(self):
        self.assertSoupEquals('a<!b <p>c', 'a<!b <p>c')

    def testEntityReplacement(self):
        self.assertSoupEquals('<b>hello&nbsp;there</b>')

    def testEntitiesInAttributeValues(self):
        # Decimal and hex character references both decode to UTF-8 bytes.
        self.assertSoupEquals('<x t="x&#241;">', '<x t="x\xc3\xb1"></x>')
        self.assertSoupEquals('<x t="x&#xf1;">', '<x t="x\xc3\xb1"></x>')

        soup = BeautifulSoup('<x t="&gt;&trade;">',
                             convertEntities=BeautifulStoneSoup.HTML_ENTITIES)
        self.assertEquals(unicode(soup), u'<x t=">\u2122"></x>')

        uri = "http://crummy.com?sacr&eacute;&amp;bleu"
        link = '<a href="%s"></a>' % uri
        soup = BeautifulSoup(link)
        self.assertEquals(unicode(soup), link)
        #self.assertEquals(unicode(soup.a['href']), uri)

        soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.assertEquals(unicode(soup),
                          link.replace("&eacute;", u"\xe9"))

        uri = "http://crummy.com?sacr&eacute;&amp;bleu"
        link = '<a href="%s"></a>' % uri
        soup = BeautifulSoup(link, convertEntities=BeautifulSoup.HTML_ENTITIES)
        self.assertEquals(unicode(soup.a['href']),
                          uri.replace("&eacute;", u"\xe9"))

    def testNakedAmpersands(self):
        html = {'convertEntities':BeautifulStoneSoup.HTML_ENTITIES}
        # A naked "&" is escaped to "&amp;" on output.
        soup = BeautifulStoneSoup("AT&T ", **html)
        self.assertEquals(str(soup), 'AT&amp;T ')

        nakedAmpersandInASentence = "AT&T was Ma Bell"
        soup = BeautifulStoneSoup(nakedAmpersandInASentence,**html)
        self.assertEquals(str(soup), \
            nakedAmpersandInASentence.replace('&','&amp;'))

        invalidURL = '<a href="http://example.org?a=1&b=2;3">foo</a>'
        validURL = invalidURL.replace('&','&amp;')
        soup = BeautifulStoneSoup(invalidURL)
        self.assertEquals(str(soup), validURL)

        # An already-escaped URL round-trips unchanged.
        soup = BeautifulStoneSoup(validURL)
        self.assertEquals(str(soup), validURL)
739 |
|
740 |
|
741 |
class EncodeRed(SoupTest):
    """Tests encoding conversion, Unicode conversion, and Microsoft
    smart quote fixes.

    NOTE(review): this block was reconstructed from a viewer-mangled
    copy; the entity literals below (&#x2019;, &rsquo;, &lsquo;,
    &lsaquo;, &eacute;, &rsaquo;) had been unescaped to their display
    characters, which broke the assertions. They are restored here:
    UnicodeDammit/BeautifulStoneSoup convert smart quotes to XML
    entities, BeautifulSoup to HTML entities."""

    def testUnicodeDammitStandalone(self):
        # A Windows-1252 smart quote (\x92) becomes the XML entity
        # &#x2019; when UnicodeDammit runs with its defaults.
        markup = "<foo>\x92</foo>"
        dammit = UnicodeDammit(markup)
        self.assertEquals(dammit.unicode, "<foo>&#x2019;</foo>")

        # With an explicit candidate encoding, the raw bytes decode to
        # the expected Hebrew code points and the encoding is recorded.
        hebrew = "\xed\xe5\xec\xf9"
        dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
        self.assertEquals(dammit.unicode, u'\u05dd\u05d5\u05dc\u05e9')
        self.assertEquals(dammit.originalEncoding, 'iso-8859-8')

    def testGarbageInGarbageOut(self):
        # Plain ASCII goes through the parser unchanged.
        ascii = "<foo>a</foo>"
        asciiSoup = BeautifulStoneSoup(ascii)
        self.assertEquals(ascii, str(asciiSoup))

        # Sanity-check the UTF-8 byte representation of u'\u00FC'.
        unicodeData = u"<foo>\u00FC</foo>"
        utf8 = unicodeData.encode("utf-8")
        self.assertEquals(utf8, '<foo>\xc3\xbc</foo>')

        # Unicode in, Unicode out.
        unicodeSoup = BeautifulStoneSoup(unicodeData)
        self.assertEquals(unicodeData, unicode(unicodeSoup))
        self.assertEquals(unicode(unicodeSoup.foo.string), u'\u00FC')

        # UTF-8 bytes in (with a declared encoding): str() round-trips
        # and the original encoding is remembered.
        utf8Soup = BeautifulStoneSoup(utf8, fromEncoding='utf-8')
        self.assertEquals(utf8, str(utf8Soup))
        self.assertEquals(utf8Soup.originalEncoding, "utf-8")

        # Unicode in: str() renders UTF-8, but there was no original
        # byte encoding to record.
        utf8Soup = BeautifulStoneSoup(unicodeData)
        self.assertEquals(utf8, str(utf8Soup))
        self.assertEquals(utf8Soup.originalEncoding, None)

    def testHandleInvalidCodec(self):
        # Bogus fromEncoding values must be ignored; detection then
        # falls back to UTF-8 (the markup literal is UTF-8-encoded
        # because of this file's coding declaration).
        for bad_encoding in ['.utf8', '...', 'utF---16.!']:
            soup = BeautifulSoup("Räksmörgås", fromEncoding=bad_encoding)
            self.assertEquals(soup.originalEncoding, 'utf-8')

    def testUnicodeSearch(self):
        # Searching by Unicode text finds Unicode text.
        html = u'<html><body><h1>Räksmörgås</h1></body></html>'
        soup = BeautifulSoup(html)
        self.assertEqual(soup.find(text=u'Räksmörgås'), u'Räksmörgås')

    def testRewrittenXMLHeader(self):
        # An XML declaration naming euc-jp should be honored, and the
        # declaration rewritten to utf-8 on output.
        euc_jp = '<?xml version="1.0 encoding="euc-jp"?>\n<foo>\n\xa4\xb3\xa4\xec\xa4\xcfEUC-JP\xa4\xc7\xa5\xb3\xa1\xbc\xa5\xc7\xa5\xa3\xa5\xf3\xa5\xb0\xa4\xb5\xa4\xec\xa4\xbf\xc6\xfc\xcb\xdc\xb8\xec\xa4\xce\xa5\xd5\xa5\xa1\xa5\xa4\xa5\xeb\xa4\xc7\xa4\xb9\xa1\xa3\n</foo>\n'
        utf8 = "<?xml version='1.0' encoding='utf-8'?>\n<foo>\n\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafEUC-JP\xe3\x81\xa7\xe3\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n</foo>\n"
        soup = BeautifulStoneSoup(euc_jp)
        if soup.originalEncoding != "euc-jp":
            raise Exception("Test failed when parsing euc-jp document. "
                            "If you're running Python >=2.4, or you have "
                            "cjkcodecs installed, this is a real problem. "
                            "Otherwise, ignore it.")
        self.assertEquals(soup.originalEncoding, "euc-jp")
        self.assertEquals(str(soup), utf8)

        # A partial declaration is replaced with a full one, and the
        # smart quote becomes an HTML entity (assertSoupEquals parses
        # with BeautifulSoup).
        old_text = "<?xml encoding='windows-1252'><foo>\x92</foo>"
        new_text = "<?xml version='1.0' encoding='utf-8'?><foo>&rsquo;</foo>"
        self.assertSoupEquals(old_text, new_text)

    def testRewrittenMetaTag(self):
        no_shift_jis_html = '''<html><head>\n<meta http-equiv="Content-language" content="ja" /></head><body><pre>\n\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n</pre></body></html>'''
        soup = BeautifulSoup(no_shift_jis_html)

        # Beautiful Soup used to try to rewrite the meta tag even if the
        # meta tag got filtered out by the strainer. This test makes
        # sure that doesn't happen.
        strainer = SoupStrainer('pre')
        soup = BeautifulSoup(no_shift_jis_html, parseOnlyThese=strainer)
        self.assertEquals(soup.contents[0].name, 'pre')

        meta_tag = ('<meta content="text/html; charset=x-sjis" '
                    'http-equiv="Content-type" />')
        shift_jis_html = (
            '<html><head>\n%s\n'
            '<meta http-equiv="Content-language" content="ja" />'
            '</head><body><pre>\n'
            '\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
            '\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
            '\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B\n'
            '</pre></body></html>') % meta_tag
        soup = BeautifulSoup(shift_jis_html)
        if soup.originalEncoding != "shift-jis":
            raise Exception("Test failed when parsing shift-jis document "
                            "with meta tag '%s'."
                            "If you're running Python >=2.4, or you have "
                            "cjkcodecs installed, this is a real problem. "
                            "Otherwise, ignore it." % meta_tag)
        self.assertEquals(soup.originalEncoding, "shift-jis")

        # Inside the tree the charset is a placeholder that tracks the
        # soup's output encoding...
        content_type_tag = soup.meta['content']
        self.assertEquals(content_type_tag[content_type_tag.find('charset='):],
                          'charset=%SOUP-ENCODING%')
        # ...and it renders as utf-8 by default...
        content_type = str(soup.meta)
        index = content_type.find('charset=')
        self.assertEqual(content_type[index:index+len('charset=utf8')+1],
                         'charset=utf-8')
        # ...or as whatever encoding the caller renders with.
        content_type = soup.meta.__str__('shift-jis')
        index = content_type.find('charset=')
        self.assertEqual(content_type[index:index+len('charset=shift-jis')],
                         'charset=shift-jis')

        self.assertEquals(str(soup), (
            '<html><head>\n'
            '<meta content="text/html; charset=utf-8" '
            'http-equiv="Content-type" />\n'
            '<meta http-equiv="Content-language" content="ja" />'
            '</head><body><pre>\n'
            '\xe3\x81\x93\xe3\x82\x8c\xe3\x81\xafShift-JIS\xe3\x81\xa7\xe3'
            '\x82\xb3\xe3\x83\xbc\xe3\x83\x87\xe3\x82\xa3\xe3\x83\xb3\xe3'
            '\x82\xb0\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f\xe6\x97\xa5\xe6'
            '\x9c\xac\xe8\xaa\x9e\xe3\x81\xae\xe3\x83\x95\xe3\x82\xa1\xe3'
            '\x82\xa4\xe3\x83\xab\xe3\x81\xa7\xe3\x81\x99\xe3\x80\x82\n'
            '</pre></body></html>'))
        # Rendering back to shift-jis reproduces the input, modulo the
        # normalized charset name.
        self.assertEquals(soup.renderContents("shift-jis"),
                          shift_jis_html.replace('x-sjis', 'shift-jis'))

        isolatin = """<html><meta http-equiv="Content-type" content="text/html; charset=ISO-Latin-1" />Sacr\xe9 bleu!</html>"""
        soup = BeautifulSoup(isolatin)
        self.assertSoupEquals(soup.__str__("utf-8"),
                              isolatin.replace("ISO-Latin-1", "utf-8").replace("\xe9", "\xc3\xa9"))

    def testHebrew(self):
        # ISO-8859-8 bytes decode correctly when the encoding is given
        # explicitly; output is UTF-8 with lowercased HTML tags.
        iso_8859_8 = '<HEAD>\n<TITLE>Hebrew (ISO 8859-8) in Visual Directionality</TITLE>\n\n\n\n</HEAD>\n<BODY>\n<H1>Hebrew (ISO 8859-8) in Visual Directionality</H1>\n\xed\xe5\xec\xf9\n</BODY>\n'
        utf8 = '<head>\n<title>Hebrew (ISO 8859-8) in Visual Directionality</title>\n</head>\n<body>\n<h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\n\xd7\x9d\xd7\x95\xd7\x9c\xd7\xa9\n</body>\n'
        soup = BeautifulStoneSoup(iso_8859_8, fromEncoding="iso-8859-8")
        self.assertEquals(str(soup), utf8)

    def testSmartQuotesNotSoSmartAnymore(self):
        # BeautifulSoup turns Windows-1252 smart quotes into HTML
        # entities rather than leaving the raw control bytes.
        self.assertSoupEquals("\x91Foo\x92 <!--blah-->",
                              '&lsquo;Foo&rsquo; <!--blah-->')

    def testDontConvertSmartQuotesWhenAlsoConvertingEntities(self):
        smartQuotes = "Il a dit, \x8BSacr&eacute; bleu!\x9b"
        soup = BeautifulSoup(smartQuotes)
        self.assertEquals(str(soup),
                          'Il a dit, &lsaquo;Sacr&eacute; bleu!&rsaquo;')
        # When entities are being converted, smart quotes come out as
        # real UTF-8 characters instead of entities.
        soup = BeautifulSoup(smartQuotes, convertEntities="html")
        self.assertEquals(str(soup),
                          'Il a dit, \xe2\x80\xb9Sacr\xc3\xa9 bleu!\xe2\x80\xba')

    def testDontSeeSmartQuotesWhereThereAreNone(self):
        # Legitimate UTF-8 bytes must not be misread as Windows-1252
        # smart quotes.
        utf_8 = "\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
        self.assertSoupEquals(utf_8)
890 |
class Whitewash(SoupTest):
    """Test whitespace preservation.

    NOTE(review): in this copy the literals in both tests render with
    single spaces, which makes testCollapsedWhitespace compare two
    identical strings; the upstream test likely used runs of spaces
    that a viewer collapsed — verify against the original file."""

    def testPreservedWhitespace(self):
        # Whitespace inside a <pre> element must survive parsing as-is.
        for markup in ("<pre> </pre>", "<pre> woo </pre>"):
            self.assertSoupEquals(markup)

    def testCollapsedWhitespace(self):
        # Whitespace outside <pre> is expected to parse to the second
        # form.
        self.assertSoupEquals("<p> </p>", "<p> </p>")
|
900 |
|
901 |
# Allow the suite to be run directly: `python BeautifulSoupTests.py`.
if __name__ == '__main__':
    unittest.main()