MCPcopy
hub / github.com/InternLM/InternLM / text2bin

Function text2bin

tools/tokenizer.py:74–113  ·  view source on GitHub ↗

Read content from the input file and write it to a bin file. Currently supports three input formats: 'txt', 'json' and 'jsonl'. Args: text_input_path (str): input file path ('txt', 'json' or 'jsonl'). bin_output_path (str): output bin file path.

(text_input_path: str, bin_output_path: str)

Source from the content-addressed store, hash-verified

72
73
def text2bin(text_input_path: str, bin_output_path: str):
    """
    Read content from the input file and write it to a bin file.

    Three input formats are supported, selected by the file extension:
    'txt', 'json' and 'jsonl'.

    Args:
        text_input_path (str): input file path ('.txt', '.json' or '.jsonl').
        bin_output_path (str): output bin file path. The file is opened in
            append mode, so existing content is kept.

    Raises:
        FileNotFoundError: if ``text_input_path`` is not an existing file.
        ValueError: if the file extension is not one of the supported formats.
    """
    # Check if the input file exists
    if not os.path.isfile(text_input_path):
        raise FileNotFoundError(f"{text_input_path} does not exist.")

    # Take the extension from the file name only and require a real ".":
    # an extensionless file that happens to be called "txt" is not a txt file.
    _, dot, file_format = os.path.basename(text_input_path).rpartition(".")
    if not dot or file_format not in ("txt", "json", "jsonl"):
        # Raise explicitly instead of `assert`: asserts vanish under `python -O`,
        # which would let an unsupported file silently produce an empty output.
        raise ValueError("Invalid input file type. Currently support `txt`, `json` and `jsonl`.")

    # NOTE(review): the input is decoded with the platform default encoding;
    # confirm whether an explicit encoding (e.g. UTF-8) should be passed here.
    with open(text_input_path, "r") as text_file, open(bin_output_path, "ab") as bin_file:
        if file_format == "txt":
            for line in text_file:
                # Strip any leading/trailing whitespace and skip blank lines
                stripped_line = line.strip()
                if stripped_line:
                    write_bin(stripped_line, bin_file)

        elif file_format == "json":
            # The top-level JSON value is iterated; each item is expected to be
            # one record (e.g. a list of dictionaries).
            data = json.load(text_file)
            for record in data:
                # Serialize the record back to a str before handing it over
                context = json.dumps(record)
                write_bin(context, bin_file)

        elif file_format == "jsonl":
            for line in text_file:
                # Each line is passed through unchanged (trailing newline included)
                write_bin(line, bin_file)
114
115
116def parse_args():

Callers 1

mainFunction · 0.85

Calls 2

write_binFunction · 0.85
loadMethod · 0.45

Tested by

no test coverage detected