Function detect_text_encoding

astrbot/core/computer/file_read_utils.py:170–204 · view source on GitHub ↗

(sample: bytes)

Source from the content-addressed store, hash-verified

168
169
def detect_text_encoding(sample: bytes) -> str | None:
    """Guess which candidate encoding turns ``sample`` into plausible text.

    Args:
        sample: Raw bytes to probe; may stop partway through a multibyte
            character.

    Returns:
        ``"utf-8"`` for an empty sample; otherwise the first entry of
        ``_TEXT_ENCODINGS`` whose decoded output is accepted by
        ``_looks_like_text``; ``None`` when the NUL-byte check rejects the
        sample or no candidate is accepted.
    """
    if not sample:
        return "utf-8"

    # A sample with NUL bytes and no recognised BOM is only kept when the
    # zeros are concentrated (>= 80%) in either the odd or the even byte
    # positions; otherwise it is rejected before any decoding is attempted.
    has_nul = b"\x00" in sample
    if has_nul and not sample.startswith(_UTF_BOMS):
        zero_ratios = [
            lane.count(0) / max(len(lane), 1)
            for lane in (sample[1::2], sample[0::2])
        ]
        if max(zero_ratios) < 0.8:
            return None

    sample_len = len(sample)
    for candidate in _TEXT_ENCODINGS:
        try:
            text = sample.decode(candidate)
        except UnicodeDecodeError as err:
            # A failure well before the tail is treated as a real mismatch
            # for this candidate.
            if err.start < sample_len - 4:
                continue
            # Probe samples can end in the middle of a multibyte sequence.
            # When the failure is confined to the last few bytes, drop 1..4
            # trailing bytes and retry so such text is not misclassified
            # as binary.
            text = ""
            for cut in range(1, min(4, sample_len) + 1):
                try:
                    text = sample[:-cut].decode(candidate)
                except UnicodeDecodeError:
                    continue
                break
            # Still empty: either every retry failed or nothing was left
            # after trimming, so move on to the next candidate.
            if not text:
                continue
        if _looks_like_text(text):
            return candidate

    return None
205
206
207	def read_local_text_range_sync(

_run · Method · 0.90

_probe_file · Function · 0.85

_looks_like_text · Function · 0.85

startswith · Method · 0.80

decode · Method · 0.80

no test coverage detected