将原始文本标准化并生成多视图便于匹配
(self, raw_text: str)
| 90 | return self._engine is not None or self._engine_paddle is not None or self._engine_tess is not None |
| 91 | |
def process_text(self, raw_text: str) -> ProcessedText:
    """Normalize raw text and build several views of it for matching.

    Steps, in order:
      1. Fold full-width characters to their half-width equivalents.
      2. Replace characters that are commonly confused (e.g. by OCR)
         with a canonical digit.
      3. Case-fold, replace every character that is not a word
         character or whitespace with a space, and collapse whitespace.

    Args:
        raw_text: The text to normalize. May be empty or whitespace-only.

    Returns:
        A ``ProcessedText`` holding the untouched input (``original``),
        the cleaned text (``cleaned``), the cleaned text without spaces
        (``no_spaces``), the whitespace-separated tokens (``words``) and
        the individual characters of ``no_spaces`` (``chars``). For empty
        or whitespace-only input every field is empty, including
        ``original``.
    """
    if not raw_text or not raw_text.strip():
        return ProcessedText(original='', cleaned='', no_spaces='', words=[], chars=[])

    # Full-width -> half-width: the ideographic space (U+3000) becomes an
    # ASCII space, and U+FF01..U+FF5E are shifted down by 0xFEE0 onto the
    # ASCII range '!'..'~'.
    half_width_table = {code: code - 0xFEE0 for code in range(0xFF01, 0xFF5F)}
    half_width_table[0x3000] = 0x20

    # Look-alike characters -> canonical digit. Only the effective
    # substitutions are listed; the original dict literal also carried
    # identity entries ('I': 'I', 'O': 'O', ...) that were overridden by
    # later duplicate keys and therefore never took effect.
    # NOTE(review): this runs *before* casefold() and is case-asymmetric
    # ('S' -> '5' but 's' stays 's'; 'l' -> '1' but 'L' ends up as 'l').
    # Looks deliberate for glyph similarity, but it means inputs differing
    # only in case can normalize differently -- confirm against callers.
    confusion_table = str.maketrans('OolI丨|ZSB', '001111258')

    normalized = (
        raw_text.translate(half_width_table)
        .translate(confusion_table)
        .casefold()
    )

    # 1. Clean: keep CJK ideographs, word characters and whitespace.
    #    NOTE(review): \w also matches '_', so underscores survive here.
    cleaned = re.sub(r'[^\u4e00-\u9fff\w\s]', ' ', normalized)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # 2. Space-free view, used for contiguous matching.
    no_spaces = re.sub(r'\s+', '', cleaned)

    # 3. Tokens: split() without arguments already drops empty pieces and
    #    surrounding whitespace, so no extra strip/filter is needed.
    words = cleaned.split()

    # 4. Individual characters of the space-free view.
    chars = list(no_spaces)
    return ProcessedText(
        original=raw_text,
        cleaned=cleaned,
        no_spaces=no_spaces,
        words=words,
        chars=chars,
    )
| 133 | |
| 134 | def smart_text_contains(self, processed_text: ProcessedText, keyword: str) -> bool: |
| 135 | """ |
no test coverage detected