Read content from the input file and write it to a bin file. Currently supports 3 input formats: 'txt', 'json' and 'jsonl'. Args: text_input_path (str): input file path ('txt', 'json' or 'jsonl'). bin_output_path (str): output bin file path.
(text_input_path: str, bin_output_path: str)
| 72 | |
| 73 | |
def text2bin(text_input_path: str, bin_output_path: str):
    """
    Read content from the input file and append it to a bin file.
    Currently support 3 input formats: 'txt', 'json' and 'jsonl'.

    Each non-blank line ('txt' / 'jsonl') or each top-level element ('json')
    is passed to ``write_bin`` together with the output file object. The
    output file is opened in binary append mode, so any existing content of
    ``bin_output_path`` is kept and new data is added after it.

    Args:
        text_input_path (str): input file path. The format is taken from the
            text after the last "." in the path.
        bin_output_path (str): output bin file path.

    Raises:
        FileNotFoundError: if ``text_input_path`` is not an existing file.
        ValueError: if the input format is not 'txt', 'json' or 'jsonl'.
    """
    # Check if the input file exists
    if not os.path.isfile(text_input_path):
        raise FileNotFoundError(f"{text_input_path} does not exist.")

    file_format = text_input_path.split(".")[-1]
    # Raise explicitly instead of `assert`: asserts are stripped under
    # `python -O`, and the old `assert ..., print(...)` produced an
    # AssertionError whose message was None.
    if file_format not in ("txt", "json", "jsonl"):
        raise ValueError("Invalid input file type. Currently support `txt`, `json` and `jsonl`.")

    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default locale encoding.
    with open(text_input_path, "r", encoding="utf-8") as text_file, open(bin_output_path, "ab") as bin_file:
        if file_format == "txt":
            for line in text_file:
                # Strip any leading/trailing whitespace
                stripped_line = line.strip()
                if stripped_line:
                    # Pass each non-empty line to the write_bin function
                    write_bin(stripped_line, bin_file)

        elif file_format == "json":
            data = json.load(text_file)
            # assuming data is a list of dictionaries
            for record in data:
                # the type of record is dict, transfer the dict into str
                context = json.dumps(record)
                # encode the str and write into bin
                write_bin(context, bin_file)

        else:  # file_format == "jsonl"
            for line in text_file:
                # A whitespace-only line holds no record; skip it like the
                # txt branch does. Non-blank lines are passed on unchanged.
                if line.strip():
                    # encode the str and write into bin
                    write_bin(line, bin_file)
| 114 | |
| 115 | |
| 116 | def parse_args(): |