(name: str)
| 56 | |
| 57 | |
def _prepare_dataset(name: str) -> Path:
    """Download dataset *name* and cache it as a JSONL file.

    The entry ``DATASETS[name]`` supplies the ``load_args`` / ``load_kwargs``
    passed to ``datasets.load_dataset`` and a ``format`` callable applied to
    every row. Each row is written as one line ``{"turns": [...]}``; when the
    entry's ``multi_turn`` flag is truthy the formatter's result is used as
    the turn list directly, otherwise it is wrapped in a one-element list.

    The file is first written under a per-process temporary name and then
    moved over the final path with ``os.replace``, so readers never observe
    a partially written cache file.

    Args:
        name: Key into ``DATASETS``.

    Returns:
        Path of the cached file, ``CACHE_DIR / f"{name}.jsonl"``.

    Raises:
        KeyError: If ``name`` (or a required config key) is missing.
    """
    from datasets import load_dataset

    cfg = DATASETS[name]
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    out_path = CACHE_DIR / f"{name}.jsonl"
    # The PID suffix keeps concurrent processes from clobbering each other's
    # in-progress files.
    tmp_path = out_path.with_name(f"{out_path.name}.{os.getpid()}.tmp")

    print(f"[download] {name} ...")
    dataset = load_dataset(*cfg["load_args"], **cfg["load_kwargs"])

    format_row = cfg["format"]
    multi_turn = bool(cfg.get("multi_turn"))
    num_samples = 0
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            for row in dataset:
                formatted = format_row(row)
                turns = formatted if multi_turn else [formatted]
                f.write(json.dumps({"turns": turns}) + "\n")
                num_samples += 1
        os.replace(tmp_path, out_path)
    finally:
        # After a successful replace the temp file no longer exists and this
        # is a no-op; on any failure it removes the partial file instead of
        # leaving it behind in the cache directory.
        tmp_path.unlink(missing_ok=True)

    print(f"[cached] {out_path} ({num_samples} samples)")
    return out_path
| 82 | |
| 83 | |
| 84 | def load_and_process_dataset(data_name: str) -> list[dict]: |
no outgoing calls
no test coverage detected