Method _transform

numpy_ml/preprocessing/nlp.py:310–337 · view source on GitHub ↗

Transform a single text string to a list of byte-pair IDs

(self, text)

Source from the content-addressed store, hash-verified

308	return [self._transform(string) for string in text]
309
def _transform(self, text):
    """
    Transform a single text string to a list of byte-pair IDs.

    Each byte string returned by `tokenize_bytes_raw` is parsed into a list
    of integers and encoded greedily from left to right: at every position
    the longest run of values that is a key of ``self.byte2token`` is
    emitted as that key's token ID, and a value that begins no known
    multi-value run is emitted unchanged.

    Parameters
    ----------
    text : str
        The text to encode. It is passed to `tokenize_bytes_raw` together
        with ``self.parameters["encoding"]``.

    Returns
    -------
    encoded : list
        The IDs for `text`: values of ``self.byte2token`` for matched
        multi-value runs, and the raw integer for each unmatched single
        value. Empty if `text` yields no byte values.
    """
    P = self.parameters
    _bytes = tokenize_bytes_raw(text, encoding=P["encoding"])

    encoded = []
    for w in _bytes:
        # `split()` with no separator drops empty fields, so an empty or
        # whitespace-padded byte string contributes no IDs instead of
        # making `int("")` raise a ValueError
        w = [int(i) for i in w.split()]
        n_vals = len(w)

        # bound the window by the number of parsed values, not by the
        # character length of the unparsed string; otherwise the loop
        # re-tests the same full-width candidate needlessly
        l, r = 0, n_vals

        while l < n_vals:
            candidate = tuple(w[l:r])

            if len(candidate) > 1 and candidate in self.byte2token:
                # candidate is a collection of several bytes and is in our
                # vocab
                encoded.append(self.byte2token[candidate])
                l, r = r, n_vals
            elif len(candidate) == 1:
                # candidate is a single byte and should always be in our
                # vocab
                encoded.append(candidate[0])
                l, r = r, n_vals
            else:
                # candidate is not in vocab, so we decrease our context
                # window by 1 and try again
                r -= 1
    return encoded
338
339	def inverse_transform(self, codes):
340	"""

transform (Method) · 0.95

tokenize_bytes_raw (Function) · 0.85

no test coverage detected