MCPcopy Index your code
hub / github.com/NVIDIA/Megatron-LM / encode

Method encode

tools/preprocess_data.py:79–100  ·  view source on GitHub ↗
(self, json_line)

Source from the content-addressed store, hash-verified

77 return json.dumps(output), len(json_line)
78
def encode(self, json_line):
    """Tokenize the configured fields of one JSON-encoded line.

    For every key in ``self.args.json_keys`` the field's value is treated
    as a list of sentences (a non-list value is wrapped in a one-element
    list). Each sentence is passed to ``Encoder.tokenizer.tokenize`` and
    the resulting ids are concatenated into one flat list per key.
    Sentences that tokenize to nothing are skipped entirely.

    Args:
        json_line: One JSON document, in any form ``json.loads`` accepts.

    Returns:
        A tuple ``(ids, lens, n)``:
          * ``ids[key]``  -- flat list of token ids for that field,
            followed by ``Encoder.tokenizer.eod`` when
            ``self.args.append_eod`` is truthy and the field is non-empty.
          * ``lens[key]`` -- number of ids contributed by each non-empty
            sentence; ``sum(lens[key]) == len(ids[key])`` always holds.
          * ``n``         -- ``len(json_line)``.

    Raises:
        ValueError: If ``json_line`` is not valid JSON (raised by
            ``json.loads``).
        KeyError: If the decoded object is a dict lacking a configured key.
    """
    data = json.loads(json_line)
    ids = {}
    lens = {}
    for key in self.args.json_keys:
        text = data[key]
        # A field may already be split into sentences; otherwise treat the
        # whole value as a single sentence.
        sentences = text if isinstance(text, list) else [text]
        doc_ids = []
        sentence_lens = []
        for sentence in sentences:
            sentence_ids = Encoder.tokenizer.tokenize(sentence)
            if len(sentence_ids) > 0:
                doc_ids.extend(sentence_ids)
                sentence_lens.append(len(sentence_ids))
        if len(doc_ids) > 0 and self.args.append_eod:
            doc_ids.append(Encoder.tokenizer.eod)
            # Count the EOD token toward the last sentence so the
            # per-sentence lengths still add up to len(doc_ids).
            # sentence_lens is non-empty here because it is appended to
            # every time doc_ids is extended.
            # NOTE(review): presumably downstream code uses these lengths
            # to slice doc_ids -- confirm against the caller.
            sentence_lens[-1] += 1
        ids[key] = doc_ids
        lens[key] = sentence_lens
    return ids, lens, len(json_line)
101
102
103class Partition(object):

Callers 4

get_score · Function · 0.45
get_scores · Function · 0.45

Calls 1

tokenize · Method · 0.45

Tested by

no test coverage detected