Prepare metadata for the given bin file. Args: bin_output_path (str): Output bin file path.
(bin_output_path: str)
| 39 | |
| 40 | |
def prepare_meta(bin_output_path: str):
    """
    Prepare metadata for the given bin file.

    Every line of the bin file is parsed as a JSON object and the length of
    its "tokens" entry is recorded together with the byte offset at which the
    line starts. The resulting table is written with ``np.save`` to
    ``bin_output_path + ".meta"``, one ``(start_offset, token_count)`` row
    per line of the bin file.

    Args:
        bin_output_path (str): Output bin file path.
    """
    meta = []
    # byte offset of the line currently being processed
    cur = 0
    with open(bin_output_path, "rb") as f:
        # iterating a binary file yields the same newline-terminated chunks
        # as repeated readline() calls and stops at end of file
        for line in f:
            # obtain the token amount of each line
            length = len(json.loads(line)["tokens"])
            # meta is a list of tuple(cur, length)
            # cur: the start index of each line
            # length: the token amount of each line
            meta.append((cur, length))
            # update the cur to generate the meta information of next line
            cur += len(line)

    # Build the table with 64-bit integers first: byte offsets of files larger
    # than 2 GiB do not fit into int32. Narrow back to int32 whenever every
    # value fits, so the saved file is unchanged for inputs that already worked.
    meta = np.array(meta, dtype=np.int64)
    if meta.size == 0 or meta.max() <= np.iinfo(np.int32).max:
        meta = meta.astype(np.int32)

    # define path of the generated meta file
    meta_fp = bin_output_path + ".meta"
    # save the generated meta information; the array is fully built before the
    # file is opened so a conversion failure cannot leave a truncated meta file
    with open(meta_fp, "wb") as f:
        np.save(f, meta)
| 72 | |
| 73 | |
| 74 | def text2bin(text_input_path: str, bin_output_path: str): |