Write bin file based on the context. Args: context (str): the context of raw file. bin_file (file handler): the opened bin file. Example: >>> write_bin("今天天气晴朗适合出门散步", "out.bin") # the output file format is 'txt' >>> out.bin >>> {"tokens": [67577, 69095, 63010, 61770, 67783, 69301, 74732]}
(context: str, bin_file)
| 14 | |
| 15 | |
def write_bin(context: str, bin_file) -> None:
    """
    Tokenize ``context`` and write it to ``bin_file`` as one JSON line.

    The record written is the encoded form of a JSON object with a single
    key, ``"tokens"``, followed by a newline. Despite the ".bin" name, the
    payload is therefore plain JSON text, one object per call.

    Args:
        context (str): the raw text to tokenize.
        bin_file (file handler): an already opened file object. It receives
            bytes, so presumably it was opened in binary mode -- confirm at
            the call site.

    Example:
        >>> with open("out.bin", "wb") as f:
        ...     write_bin("今天天气晴朗适合出门散步", f)
        >>> # out.bin then holds a line such as:
        >>> # {"tokens": [67577, 69095, 63010, 61770, 67783, 69301, 74732]}
    """
    # `tokenizer` is defined outside this function; its encode() is expected
    # to return the token ids as a list, e.g. [67577, 69095, 63010, ...].
    token_ids = tokenizer.encode(context)

    # Serialize as {"tokens": [...]} and terminate with a newline so that
    # every call contributes exactly one line to the output.
    line = json.dumps({"tokens": token_ids}) + "\n"

    # Convert the text to bytes (str.encode defaults to UTF-8) and write it.
    bin_file.write(line.encode())
| 39 | |
| 40 | |
| 41 | def prepare_meta(bin_output_path: str): |