| 60 | |
| 61 | |
| 62 | def _build_html_entities() -> dict[str, str]: |
| 63 | entities = {} |
| 64 | # Create a dictionary based on the built-in HTML5 entity dictionary. |
| 65 | # Add a limited set of HTML entities that we'll also decode if they've |
| 66 | # been case-folded to uppercase, such as decoding Ñ as "Ñ". |
| 67 | for name, char in html.entities.html5.items(): # type: ignore |
| 68 | if name.endswith(";"): |
| 69 | entities["&" + name] = char |
| 70 | |
| 71 | # Restrict the set of characters we can attempt to decode if their |
| 72 | # name has been uppercased. If we tried to handle all entity names, |
| 73 | # the results would be ambiguous. |
| 74 | if name == name.lower(): |
| 75 | name_upper = name.upper() |
| 76 | entity_upper = "&" + name_upper |
| 77 | if html.unescape(entity_upper) == entity_upper: |
| 78 | entities[entity_upper] = char.upper() |
| 79 | return entities |
| 80 | |
| 81 | |
# Matches text shaped like an HTML entity: "&", an optional "#", then 1 to 24
# ASCII letters or digits, terminated by ";".
HTML_ENTITY_RE = re.compile(r"&#?[0-9A-Za-z]{1,24};")