Load JSONL chat requests. Rows with list-type ``prompt`` (e.g. dapo-math-17k) are treated as message lists, matching ``benchmark_generate.py``. When ``tokenizer`` is provided (``--input-ids``), rows are converted to ``input_ids`` client-side.
(
dataset_dir: str | Path | None = None,
dataset_files: Sequence[str | Path] | None = None,
datasets: Sequence[str] | None = None,
num_prompts: int | None = None,
shuffle: bool = False,
seed: int = 1,
tokenizer=None,
)
| 271 | |
| 272 | |
def load_requests(
    dataset_dir: str | Path | None = None,
    dataset_files: Sequence[str | Path] | None = None,
    datasets: Sequence[str] | None = None,
    num_prompts: int | None = None,
    shuffle: bool = False,
    seed: int = 1,
    tokenizer=None,
) -> list[BenchmarkRequest]:
    """Read JSONL chat rows and turn them into benchmark requests.

    A row whose ``prompt`` is a list (e.g. dapo-math-17k) is handled as a
    message list, the same way ``benchmark_generate.py`` does. Passing a
    ``tokenizer`` (the ``--input-ids`` mode) makes the conversion to
    ``input_ids`` happen on the client side.

    Args:
        dataset_dir: Forwarded unchanged to ``_read_raw_rows``.
        dataset_files: Forwarded unchanged to ``_read_raw_rows``.
        datasets: Forwarded unchanged to ``_read_raw_rows``.
        num_prompts: When not ``None``, keep at most this many rows. Also
            forwarded to ``_read_raw_rows``.
        shuffle: When true, shuffle the rows in place before truncating.
            Also forwarded to ``_read_raw_rows``.
        seed: Seed for the private ``random.Random`` used for shuffling.
        tokenizer: Passed through to ``_normalize_row`` for every row.

    Returns:
        One normalized request per surviving row, in the final row order.

    Raises:
        ValueError: If no rows remain after shuffling and truncation.
    """
    # NOTE(review): how ``_read_raw_rows`` uses ``num_prompts``/``shuffle``
    # is not visible here -- confirm against that helper.
    entries = _read_raw_rows(
        dataset_dir=dataset_dir,
        dataset_files=dataset_files,
        datasets=datasets,
        num_prompts=num_prompts,
        shuffle=shuffle,
    )

    if shuffle:
        # A dedicated generator keeps the global ``random`` state untouched.
        rng = random.Random(seed)
        rng.shuffle(entries)

    # Truncation happens after the shuffle, so the kept subset depends on it.
    if num_prompts is not None:
        entries = entries[:num_prompts]

    if not entries:
        raise ValueError('No benchmark requests were loaded.')

    # Each entry unpacks as a (row, dataset, row_index) triple.
    requests = []
    for payload, source_name, position in entries:
        requests.append(_normalize_row(payload, source_name, position, tokenizer))
    return requests
| 303 | |
| 304 | |
| 305 | def parse_sse_line(line: bytes | str) -> SSEEvent: |
no test coverage detected