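The Soup object is initialized as the 'root tag', and the provided markup (which can be a string or a file-like object) is fed into the underlying parser. sgmllib will process most bad HTML, and the BeautifulSoup class has some tricks for dealing with some HTML that kills sgmllib, but Beautiful Soup can nonetheless choke or lose data if your data uses self-closing tags or declarations incorrectly.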
(self, markup="", parseOnlyThese=None, fromEncoding=None,
markupMassage=True, smartQuotesTo=XML_ENTITIES,
convertEntities=None, selfClosingTags=None, isHTML=False)
| 1076 | STRIP_ASCII_SPACES = { 9: None, 10: None, 12: None, 13: None, 32: None, } |
| 1077 | |
| 1078 | def __init__(self, markup="", parseOnlyThese=None, fromEncoding=None, |
| 1079 | markupMassage=True, smartQuotesTo=XML_ENTITIES, |
| 1080 | convertEntities=None, selfClosingTags=None, isHTML=False): |
| 1081 | """The Soup object is initialized as the 'root tag', and the |
| 1082 | provided markup (which can be a string or a file-like object) |
| 1083 | is fed into the underlying parser. |
| 1084 | |
| 1085 | sgmllib will process most bad HTML, and the BeautifulSoup |
| 1086 | class has some tricks for dealing with some HTML that kills |
| 1087 | sgmllib, but Beautiful Soup can nonetheless choke or lose data |
| 1088 | if your data uses self-closing tags or declarations |
| 1089 | incorrectly. |
| 1090 | |
| 1091 | By default, Beautiful Soup uses regexes to sanitize input, |
| 1092 | avoiding the vast majority of these problems. If the problems |
| 1093 | don't apply to you, pass in False for markupMassage, and |
| 1094 | you'll get better performance. |
| 1095 | |
| 1096 | The default parser massage techniques fix the two most common |
| 1097 | instances of invalid HTML that choke sgmllib: |
| 1098 | |
| 1099 | <br/> (No space between name of closing tag and tag close) |
| 1100 | <! --Comment--> (Extraneous whitespace in declaration) |
| 1101 | |
| 1102 | You can pass in a custom list of (RE object, replace method) |
| 1103 | tuples to get Beautiful Soup to scrub your input the way you |
| 1104 | want.""" |
| 1105 | |
| 1106 | self.parseOnlyThese = parseOnlyThese |
| 1107 | self.fromEncoding = fromEncoding |
| 1108 | self.smartQuotesTo = smartQuotesTo |
| 1109 | self.convertEntities = convertEntities |
| 1110 | # Set the rules for how we'll deal with the entities we |
| 1111 | # encounter |
| 1112 | if self.convertEntities: |
| 1113 | # It doesn't make sense to convert encoded characters to |
| 1114 | # entities even while you're converting entities to Unicode. |
| 1115 | # Just convert it all to Unicode. |
| 1116 | self.smartQuotesTo = None |
| 1117 | if convertEntities == self.HTML_ENTITIES: |
| 1118 | self.convertXMLEntities = False |
| 1119 | self.convertHTMLEntities = True |
| 1120 | self.escapeUnrecognizedEntities = True |
| 1121 | elif convertEntities == self.XHTML_ENTITIES: |
| 1122 | self.convertXMLEntities = True |
| 1123 | self.convertHTMLEntities = True |
| 1124 | self.escapeUnrecognizedEntities = False |
| 1125 | elif convertEntities == self.XML_ENTITIES: |
| 1126 | self.convertXMLEntities = True |
| 1127 | self.convertHTMLEntities = False |
| 1128 | self.escapeUnrecognizedEntities = False |
| 1129 | else: |
| 1130 | self.convertXMLEntities = False |
| 1131 | self.convertHTMLEntities = False |
| 1132 | self.escapeUnrecognizedEntities = False |
| 1133 | |
| 1134 | self.instanceSelfClosingTags = buildTagMap(None, selfClosingTags) |
| 1135 | SGMLParser.__init__(self) |
nothing calls this directly
no test coverage detected