Function _decode_utf8_with_mixes

src/utils/encoding_utils.py:11–48 · view source on GitHub ↗

(data)

Source from the content-addressed store, hash-verified

9
10
11	def _decode_utf8_with_mixes(data):
12	new_data = bytearray()
13	skip_count = 0
14	for index, char in enumerate(data):
15	if skip_count > 0:
16	skip_count -= 1
17	continue
18
19	if char < 127:
20	new_data.append(char)
21	continue
22
23	next_bytes_count = 0
24	valid = None
25	if 192 <= char <= 223:
26	next_bytes_count = 1
27	elif 224 <= char <= 239:
28	next_bytes_count = 2
29	elif 240 <= char <= 247:
30	next_bytes_count = 3
31
32	if next_bytes_count:
33	valid = True
34	for offset in range(0, next_bytes_count):
35	if not (128 <= data[index + offset + 1] <= 192):
36	valid = False
37	break
38
39	if valid:
40	selected_bytes = data[index:(index + next_bytes_count + 1)]
41	new_data.extend(selected_bytes)
42	skip_count = next_bytes_count
43	continue
44
45	converted_bytes = data[index:index + 1].decode('iso-8859-1').encode('utf-8')
46	new_data.extend(converted_bytes)
47
48	return new_data.decode('utf-8', errors='replace')

decodeFunction · 0.85

appendMethod · 0.80

extendMethod · 0.80

no test coverage detected