(make_enc: Callable[[], tiktoken.Encoding], data)
@hypothesis.given(data=st.data())
@hypothesis.settings(deadline=None, max_examples=MAX_EXAMPLES)
def test_hyp_offsets(make_enc: Callable[[], tiktoken.Encoding], data):
    """Check ``decode_with_offsets`` against the reference offsets implementation.

    Draws a list of 1 to 20 token ids, round-trips it through
    ``decode``/``encode``, and asserts that the second element returned by
    ``enc.decode_with_offsets(tokens)`` equals
    ``_token_offsets_reference(enc, tokens)``.

    Args:
        make_enc: Zero-argument factory returning the encoding under test.
        data: Hypothesis data object (from ``st.data()``) used to draw the
            token list interactively inside the test body.
    """
    enc = make_enc()

    # Collect every id present in either token table once, up front. Testing
    # membership directly on the dict ``.values()`` views would be a linear
    # scan for each candidate integer the strategy generates.
    valid_tokens = set(enc._special_tokens.values())
    valid_tokens.update(enc._mergeable_ranks.values())

    # Ids in [0, n_vocab) that appear in neither table are rejected.
    tokens_st = st.lists(
        st.integers(0, enc.n_vocab - 1).filter(lambda x: x in valid_tokens),
        min_size=1,
        max_size=20,
    )
    tokens = data.draw(tokens_st)

    # This is a dumb hack to make sure that our tokens are a valid UTF-8 string
    # We could potentially drop this, see the TODO in decode_with_offsets
    tokens = enc.encode(enc.decode(tokens, errors="ignore"), allowed_special="all")
    assert enc.decode_with_offsets(tokens)[1] == _token_offsets_reference(enc, tokens)
| 47 | |
| 48 | |
| 49 | def test_basic_offsets(): |
nothing calls this directly
no test coverage detected
searching dependent graphs…