Convert a list of stop words into a numpy.ndarray.
(stop_words: list[int | str], tokenizer: object)
| 221 | |
| 222 | # TODO remove stop_word_offsets stuff and make it clean |
| 223 | def _stop_words(stop_words: list[int | str], tokenizer: object): |
| 224 | """Return list of stop-words to numpy.ndarray.""" |
| 225 | import numpy as np |
| 226 | if stop_words is None: |
| 227 | return None |
| 228 | assert isinstance(stop_words, list) and \ |
| 229 | all(isinstance(elem, (str, int)) for elem in stop_words), \ |
| 230 | f'stop_words must be a list but got {type(stop_words)}' |
| 231 | stop_indexes = [] |
| 232 | for stop_word in stop_words: |
| 233 | if isinstance(stop_word, str): |
| 234 | stop_indexes += tokenizer.indexes_containing_token(stop_word) |
| 235 | elif isinstance(stop_word, int): |
| 236 | stop_indexes.append(stop_word) |
| 237 | assert isinstance(stop_indexes, list) and all(isinstance(elem, int) for elem in stop_indexes), 'invalid stop_words' |
| 238 | # each id in stop_indexes represents a stop word |
| 239 | # refer to https://github.com/fauxpilot/fauxpilot/discussions/165 for |
| 240 | # detailed explanation about fastertransformer's stop_indexes |
| 241 | stop_word_offsets = range(1, len(stop_indexes) + 1) |
| 242 | stop_words = np.array([[stop_indexes, stop_word_offsets]]).astype(np.int32) |
| 243 | return stop_words |
| 244 | |
| 245 | |
| 246 | def get_hf_gen_cfg(path: str, trust_remote_code: bool = False): |
no test coverage detected