Method encode

tools/preprocess_data.py:79–100 · view source on GitHub ↗

(self, json_line)

Source from the content-addressed store, hash-verified

77	return json.dumps(output), len(json_line)
78
def encode(self, json_line):
    """Tokenize the configured fields of one JSON-encoded document.

    Args:
        json_line: A string holding a single JSON object. Every key listed
            in ``self.args.json_keys`` is looked up in it; each value is
            either a single text or a list of texts (one per sentence).

    Returns:
        A tuple ``(ids, lens, size)`` where ``ids[key]`` is the flat list of
        token ids produced for that field, ``lens[key]`` holds the token
        count of every sentence that produced at least one token, and
        ``size`` is ``len(json_line)``.

        When ``self.args.append_eod`` is truthy and the field produced any
        tokens, ``Encoder.tokenizer.eod`` is appended to ``ids[key]`` and
        counted in the last entry of ``lens[key]``, so that
        ``sum(lens[key]) == len(ids[key])`` holds for every key.

    Raises:
        json.JSONDecodeError: If ``json_line`` is not valid JSON.
        KeyError: If a key from ``self.args.json_keys`` is absent.
    """
    data = json.loads(json_line)
    ids = {}
    lens = {}
    for key in self.args.json_keys:
        text = data[key]
        # A list is taken as pre-split sentences; anything else is treated
        # as a single sentence.
        sentences = text if isinstance(text, list) else [text]

        doc_ids = []
        sentence_lens = []
        for sentence in sentences:
            sentence_ids = Encoder.tokenizer.tokenize(sentence)
            # Sentences that tokenize to nothing are dropped entirely so no
            # zero-length entry ends up in the length list.
            if len(sentence_ids) > 0:
                doc_ids.extend(sentence_ids)
                sentence_lens.append(len(sentence_ids))

        if len(doc_ids) > 0 and self.args.append_eod:
            doc_ids.append(Encoder.tokenizer.eod)
            # The EOD token belongs to the final sentence; without this the
            # lengths would sum to one less than the number of ids returned.
            # doc_ids being non-empty guarantees sentence_lens is non-empty.
            sentence_lens[-1] += 1

        ids[key] = doc_ids
        lens[key] = sentence_lens
    return ids, lens, len(json_line)
101
102
103	class Partition(object):

select_prompts_based_on_similarity · Function · 0.45

prompt_selection_for_knowledge_generation · Function · 0.45

get_score · Function · 0.45

get_scores · Function · 0.45

tokenize · Method · 0.45

no test coverage detected