MCPcopy Index your code
hub / github.com/ddbourgin/numpy-ml / _transform

Method _transform

numpy_ml/preprocessing/nlp.py:310–337  ·  view source on GitHub ↗

Transform a single text string to a list of byte-pair IDs

(self, text)

Source from the content-addressed store, hash-verified

308 return [self._transform(string) for string in text]
309
def _transform(self, text):
    """
    Transform a single text string to a list of byte-pair IDs.

    The text is first converted by `tokenize_bytes_raw` into strings of
    space-separated integer byte values. Each such string is then encoded
    greedily from left to right: at every position the longest run of
    bytes that is a key of ``self.byte2token`` is emitted as a single ID,
    and a lone byte is emitted as its own integer value.

    Parameters
    ----------
    text : str
        The text to encode. It is tokenized using the encoding stored in
        ``self.parameters["encoding"]``.

    Returns
    -------
    encoded : list
        The IDs for `text`, in order. Multi-byte matches contribute the
        corresponding ``self.byte2token`` value; unmatched single bytes
        contribute the raw byte integer.
    """
    P = self.parameters
    _bytes = tokenize_bytes_raw(text, encoding=P["encoding"])

    encoded = []
    for w in _bytes:
        # `split()` skips empty fields, so an empty or space-padded byte
        # string yields no bytes instead of failing on `int("")`.
        # NOTE(review): whether `tokenize_bytes_raw` can return such
        # strings is not visible from here -- confirm.
        byte_vals = [int(i) for i in w.split()]

        # The window bounds must index the parsed byte list. Taking the
        # length of the raw string instead (which counts digits and
        # spaces) only causes redundant retries of the same full-length
        # candidate before the window actually starts to shrink.
        n_bytes = len(byte_vals)
        start, stop = 0, n_bytes

        while start < n_bytes:
            candidate = tuple(byte_vals[start:stop])

            if len(candidate) > 1 and candidate in self.byte2token:
                # candidate is a collection of several bytes and is in our
                # vocab
                encoded.append(self.byte2token[candidate])
                start, stop = stop, n_bytes
            elif len(candidate) == 1:
                # candidate is a single byte and should always be in our
                # vocab
                encoded.append(candidate[0])
                start, stop = stop, n_bytes
            else:
                # candidate is not in vocab, so we decrease our context
                # window by 1 and try again
                stop -= 1
    return encoded
338
339 def inverse_transform(self, codes):
340 """

Callers 1

transform · Method · 0.95

Calls 1

tokenize_bytes_raw · Function · 0.85

Tested by

no test coverage detected