| 431 | self.assertEqual(soup.p.encode("utf-8"), expected) |
| 432 | |
def test_real_iso_latin_document(self):
    # End-to-end smoke test: several encoding-related features are
    # exercised together on one small, readable document.

    # The document as text. Its META tag advertises ISO-Latin-1 because
    # the bytes handed to the parser below are encoded that way.
    unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'

    # Expected output: the same markup, except that the META tag names
    # utf-8 instead of ISO-Latin-1, and the whole thing is UTF-8 bytes
    # rather than text.
    expected = unicode_html.replace("ISO-Latin-1", "utf-8").encode("utf-8")

    # Actual output: parse the ISO-8859-1 bytes, then serialize the
    # resulting tree as UTF-8.
    latin1_bytes = unicode_html.encode("iso-8859-1")
    result = self.soup(latin1_bytes).encode("utf-8")

    # The two byte strings must match exactly.
    self.assertEqual(result, expected)
| 459 | |
| 460 | def test_real_shift_jis_document(self): |
| 461 | # Smoke test to make sure the parser can handle a document in |