Beautiful Soup can detect a charset included in a META tag, try to convert the document to that charset, and re-parse the document from the beginning.
(self, attrs)
# Matches a "charset=" declaration at the start of a line or after a ";"
# (as found in a META tag's content attribute).  Group 1 is the prefix up
# to and including "charset=", group 3 is the declared charset value.
# A raw string is used so that "\s" reaches the regex engine as-is instead
# of relying on Python passing an unrecognised string escape through.
CHARSET_RE = re.compile(r"((^|;)\s*charset=)([^;]*)", re.M)
| 1570 | |
def start_meta(self, attrs):
    """Handle a META start tag, honouring any charset it declares.

    If the tag has both an ``http-equiv`` and a ``content`` attribute and
    the content carries a ``charset=`` declaration, one of two things
    happens:

    * When an encoding has already been settled on
      (``declaredHTMLEncoding`` is set, or ``originalEncoding`` equals
      ``fromEncoding``), the charset value inside ``attrs`` is replaced
      in place by the ``%SOUP-ENCODING%`` placeholder and the resulting
      tag is flagged with ``containsSubstitutions``.
    * Otherwise, if the declared charset is non-empty and differs from
      ``originalEncoding``, it is stored in ``declaredHTMLEncoding``,
      the document is fed again through ``self._feed`` and
      ``StopParsing`` is raised to abandon the current pass.

    In every non-raising case the tag is handed to
    ``self.unknown_starttag("meta", attrs)``.

    ``attrs`` is a mutable sequence of ``(name, value)`` pairs.
    """
    equivValue = None
    contentValue = None
    contentPos = None

    # Remember the last http-equiv / content attribute seen; attribute
    # names are compared case-insensitively.
    for pos, (name, value) in enumerate(attrs):
        name = name.lower()
        if name == 'http-equiv':
            equivValue = value
        elif name == 'content':
            contentValue = value
            contentPos = pos

    # Only a meta tag carrying both attributes is worth inspecting.
    charsetMatch = None
    if equivValue and contentValue:
        charsetMatch = self.CHARSET_RE.search(contentValue)

    substituted = False
    if charsetMatch:
        encodingSettled = (self.declaredHTMLEncoding is not None or
                           self.originalEncoding == self.fromEncoding)
        if encodingSettled:
            # The encoding is already known (sniffed earlier, found on a
            # previous pass, or given explicitly and it worked), so swap
            # the declared charset for the substitution placeholder.
            rewritten = self.CHARSET_RE.sub(
                lambda found: found.group(1) + "%SOUP-ENCODING%",
                contentValue)
            originalName = attrs[contentPos][0]
            attrs[contentPos] = (originalName, rewritten)
            substituted = True
        else:
            # First pass through the document: if the tag announces a
            # different charset, start over using that information.
            declared = charsetMatch.group(3)
            if declared and declared != self.originalEncoding:
                self.declaredHTMLEncoding = declared
                self._feed(self.declaredHTMLEncoding)
                raise StopParsing

    tag = self.unknown_starttag("meta", attrs)
    if tag and substituted:
        tag.containsSubstitutions = True
| 1617 | |
class StopParsing(Exception):
    """Signal that the current parsing pass should be abandoned.

    Raised by ``start_meta`` after the document has been fed again with a
    newly declared encoding.
    """
nothing calls this directly
no test coverage detected