(input_file, output_file)
| 2 | import random |
| 3 | |
def sample_jsonl(input_file, output_file):
    """Write a random one-third sample of a JSONL file's lines to a new file.

    All lines of ``input_file`` are read into memory, ``len(lines) // 3`` of
    them are drawn without replacement via ``random.sample`` (so the output
    order is random, not the input order), and the drawn lines are written
    to ``output_file``, which is overwritten if it already exists.

    Inputs with fewer than three lines produce an empty output file, because
    the sample size is computed with floor division.

    Args:
        input_file: Path of the UTF-8 text file to sample from.
        output_file: Path of the UTF-8 text file to write the sample to.
    """
    # Load every line up front; random.sample needs a sequence.
    with open(input_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Draw one third of the lines (rounded down) without replacement.
    sample_size = len(lines) // 3
    sampled_lines = random.sample(lines, sample_size)

    # The last line of the input may lack a trailing newline. Because the
    # sample is shuffled, that line can land in the middle of the output and
    # would otherwise be fused with the following record, so every written
    # record is explicitly newline-terminated.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(
            line if line.endswith('\n') else line + '\n'
            for line in sampled_lines
        )
| 17 | |
| 18 | if __name__ == "__main__": |
| 19 | input_file = "input.jsonl" # 输入文件路径 |