(sample: bytes)
| 168 | |
| 169 | |
def detect_text_encoding(sample: bytes) -> str | None:
    """Guess the text encoding of a probe sample, or report it as non-text.

    Args:
        sample: Raw bytes to classify. The sample may stop in the middle of a
            multibyte sequence; such a cut-off tail is tolerated.

    Returns:
        ``"utf-8"`` for an empty sample; otherwise the first entry of
        ``_TEXT_ENCODINGS`` whose decoded output satisfies
        ``_looks_like_text``. ``None`` when the sample contains NUL bytes that
        are neither preceded by one of ``_UTF_BOMS`` nor concentrated in the
        odd or even byte positions, or when no candidate encoding qualifies.
    """
    if not sample:
        return "utf-8"

    if b"\x00" in sample and not sample.startswith(_UTF_BOMS):
        # NUL bytes without a BOM are only accepted when at least 80% of the
        # odd-indexed or of the even-indexed bytes are zero (presumably the
        # layout of BOM-less 2-byte-wide text such as UTF-16 -- confirm
        # against _TEXT_ENCODINGS). Anything else is treated as binary.
        odd_bytes = sample[1::2]
        even_bytes = sample[0::2]
        odd_zero_ratio = odd_bytes.count(0) / max(len(odd_bytes), 1)
        even_zero_ratio = even_bytes.count(0) / max(len(even_bytes), 1)
        if odd_zero_ratio < 0.8 and even_zero_ratio < 0.8:
            return None

    sample_len = len(sample)
    for encoding in _TEXT_ENCODINGS:
        try:
            decoded = sample.decode(encoding)
        except UnicodeDecodeError as exc:
            # Probe samples can end in the middle of a multibyte sequence.
            # Tolerate that only when the undecodable span starts within the
            # last four bytes AND runs to the very end of the sample. An
            # invalid byte that is followed by more data is a genuine decode
            # failure, not a truncation, so this encoding is rejected.
            if exc.end < sample_len or exc.start < sample_len - 4:
                continue
            try:
                # Everything before the failing span is the intact prefix.
                decoded = sample[: exc.start].decode(encoding)
            except UnicodeDecodeError:
                continue
            if not decoded:
                # Nothing usable is left once the broken tail is dropped.
                continue
        if _looks_like_text(decoded):
            return encoding

    return None
| 205 | |
| 206 | |
| 207 | def read_local_text_range_sync( |
no test coverage detected