MCPcopy
hub / github.com/IPADS-SAI/MobiAgent / process_text

Method process_text

utils/advanced_ocr.py:92–132  ·  view source on GitHub ↗

将原始文本标准化并生成多视图便于匹配

(self, raw_text: str)

Source from the content-addressed store, hash-verified

90 return self._engine is not None or self._engine_paddle is not None or self._engine_tess is not None
91
def process_text(self, raw_text: str) -> ProcessedText:
    """Normalize raw text and build several views of it for matching.

    The input is converted from full-width to half-width forms, characters
    that OCR commonly confuses are folded onto a single digit, and the
    result is case-folded. Anything that is not a word character or
    whitespace is then replaced by a space and whitespace runs are
    collapsed.

    Args:
        raw_text: Text to normalize.

    Returns:
        A ProcessedText with:
          * original  - ``raw_text`` unchanged ('' for empty/blank input),
          * cleaned   - normalized text with single spaces between tokens,
          * no_spaces - ``cleaned`` with all spaces removed,
          * words     - ``cleaned`` split on spaces,
          * chars     - the individual characters of ``no_spaces``.
    """
    if not raw_text or not raw_text.strip():
        return ProcessedText(original='', cleaned='', no_spaces='', words=[], chars=[])

    # Full-width ASCII variants (U+FF01..U+FF5E) sit exactly 0xFEE0 above
    # their half-width counterparts; the ideographic space (U+3000) maps to
    # a plain space.
    half_width_table = {code: code - 0xFEE0 for code in range(0xFF01, 0xFF5F)}
    half_width_table[0x3000] = 0x20

    # Fold look-alike glyphs onto one digit. Only real substitutions are
    # listed: characters missing from the table pass through unchanged.
    # NOTE(review): this runs before casefold(), so upper-case 'S'/'B'/'Z'
    # become digits while lower-case 's'/'b'/'z' are left alone - confirm
    # that asymmetry is intended.
    confusion_table = str.maketrans({
        'O': '0', 'o': '0',
        'l': '1', 'I': '1', '丨': '1', '|': '1',
        'Z': '2',
        'S': '5',
        'B': '8',
    })

    # The half-width pass must come first so that full-width look-alikes
    # (e.g. a full-width 'O' or '|') are folded by the confusion table too.
    normalized = (
        raw_text.translate(half_width_table)
        .translate(confusion_table)
        .casefold()
    )

    # 1. Cleaned text: every character that is neither a word character nor
    #    whitespace becomes a space, then whitespace runs collapse to one
    #    space. For str patterns \w is Unicode-aware and also matches '_'.
    cleaned = re.sub(r'[^\u4e00-\u9fff\w\s]', ' ', normalized)
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()

    # 2. Words: split() already drops empty tokens and surrounding
    #    whitespace, so no per-token strip is needed.
    words = cleaned.split()

    # 3. Space-free version for contiguous matching. `cleaned` contains no
    #    whitespace other than single spaces at this point, so joining the
    #    words is the same as deleting the whitespace.
    no_spaces = ''.join(words)

    # 4. Individual characters.
    chars = list(no_spaces)
    return ProcessedText(
        original=raw_text,
        cleaned=cleaned,
        no_spaces=no_spaces,
        words=words,
        chars=chars,
    )
133
134 def smart_text_contains(self, processed_text: ProcessedText, keyword: str) -> bool:
135 """

Callers 10

recognize_imageMethod · 0.95
get_word_listMethod · 0.95
process_frame_textMethod · 0.95
get_text_similarityMethod · 0.95
extract_text_from_xmlFunction · 0.80
frame_ocrFunction · 0.80
frame_textsFunction · 0.80
smart_text_searchFunction · 0.80

Calls 1

ProcessedTextClass · 0.70

Tested by

no test coverage detected