(model_path, input, interval, add_special_tokens, skip_special_tokens)
| 15 | @pytest.mark.parametrize('add_special_tokens', [True, False]) |
| 16 | @pytest.mark.parametrize('skip_special_tokens', [True, False]) |
def test_tokenizer(model_path, input, interval, add_special_tokens, skip_special_tokens):
    """Check that chunked incremental detokenization matches a one-shot decode.

    ``input`` is encoded once and decoded in a single call to obtain the
    reference text. The same token ids are then fed to
    ``detokenize_incrementally`` as growing prefixes, advancing ``interval``
    tokens per step, and the concatenation of the returned pieces must equal
    the reference text.
    """
    tokenizer = Tokenizer(model_path, trust_remote_code=True).model
    token_ids = tokenizer.encode(input, False, add_special_tokens=add_special_tokens)
    # Reference result: decode every token id in one call.
    expected = tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

    total = len(token_ids)
    pieces = []
    state = DetokenizeState()
    for start in range(0, total, interval):
        end = start + interval
        # lmdeploy may decode nothing when concurrency is high. Imitate that
        # on non-final steps: roughly 3 times out of 10, pass the prefix
        # without this step's new tokens.
        if end < total and random.randint(1, 10) < 4:
            end = start
        text, state = tokenizer.detokenize_incrementally(token_ids[:end], state, skip_special_tokens)
        pieces.append(text)

    assert expected == ''.join(pieces), 'input string should equal to output after enc-dec'
| 32 | |
| 33 | |
| 34 | @pytest.mark.parametrize('model_path', [ |
nothing calls this directly
no test coverage detected