MCPcopy Index your code
hub / github.com/z-lab/dflash / _prepare_dataset

Function _prepare_dataset

dflash/benchmark.py:58–81  ·  view source on GitHub ↗
(name: str)

Source from the content-addressed store, hash-verified

56
57
def _prepare_dataset(name: str) -> Path:
    """Download dataset *name* and cache it as a JSON-lines file.

    The dataset is loaded with ``datasets.load_dataset`` using the
    ``load_args`` / ``load_kwargs`` stored in ``DATASETS[name]``.  Every row
    is passed through the entry's ``format`` callable and written as one
    ``{"turns": [...]}`` JSON object per line.  When the entry has a truthy
    ``multi_turn`` value the formatter's result is used as the turn list
    directly; otherwise the single result is wrapped in a one-element list.

    The file is first written to a per-process temporary path next to the
    final location and then moved into place with ``os.replace``, so readers
    never see a partially written cache file.

    Args:
        name: Key into the module-level ``DATASETS`` table.

    Returns:
        Path of the cached ``<name>.jsonl`` file inside ``CACHE_DIR``.

    Raises:
        KeyError: If ``name`` (or a required config key) is missing --
            assuming ``DATASETS`` and its entries are plain mappings.
        Exception: Anything raised while loading, formatting, or writing is
            propagated after the temporary file has been removed.
    """
    # Imported lazily; presumably so the dependency is only paid for when a
    # download is actually required -- confirm against the caller.
    from datasets import load_dataset

    cfg = DATASETS[name]
    CACHE_DIR.mkdir(parents=True, exist_ok=True)
    out_path = CACHE_DIR / f"{name}.jsonl"
    # The PID suffix keeps concurrent processes from writing to the same
    # temporary file.
    tmp_path = out_path.with_name(f"{out_path.name}.{os.getpid()}.tmp")

    print(f"[download] {name} ...")
    dataset = load_dataset(*cfg["load_args"], **cfg["load_kwargs"])

    # Loop-invariant lookups hoisted out of the per-row loop.
    format_row = cfg["format"]
    multi_turn = cfg.get("multi_turn")

    num_samples = 0
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            for row in dataset:
                turns = format_row(row) if multi_turn else [format_row(row)]
                f.write(json.dumps({"turns": turns}) + "\n")
                # One record per line, so this equals the line count of the
                # finished file without having to re-read it.
                num_samples += 1
        os.replace(tmp_path, out_path)
    except BaseException:
        # Do not leave a stale temp file behind on failure or interruption.
        tmp_path.unlink(missing_ok=True)
        raise

    print(f"[cached] {out_path} ({num_samples} samples)")
    return out_path
82
83
84def load_and_process_dataset(data_name: str) -> list[dict]:

Callers 1

load_and_process_datasetFunction · 0.85

Calls

no outgoing calls

Tested by

no test coverage detected