(split, expected_dataset_class, expected_dataset_length, in_memory, tmp_path)
| 534 | ) |
| 535 | @pytest.mark.parametrize("in_memory", [False, True]) |
def test_builder_as_dataset(split, expected_dataset_class, expected_dataset_length, in_memory, tmp_path):
    """Check that ``DummyBuilder.as_dataset`` loads the prepared arrow files correctly.

    A ``train`` and a ``test`` split are declared on the builder info and one
    10-row arrow file (single ``text`` string column) is written per split into
    the builder cache directory. ``as_dataset`` is then called for ``split`` and
    the result is checked for type, split names, length, features and columns.

    Args:
        split: Split selector forwarded to ``builder.as_dataset``.
        expected_dataset_class: Class the returned object must be an instance of.
        expected_dataset_length: Expected number of rows of every returned dataset.
        in_memory: Forwarded to ``as_dataset``; also selects which arrow-memory
            assertion context manager wraps the call.
        tmp_path: Pytest temporary directory used as the builder ``cache_dir``.
    """
    cache_dir = str(tmp_path)
    builder = DummyBuilder(cache_dir=cache_dir)
    os.makedirs(builder.cache_dir)

    builder.info.splits = SplitDict()
    builder.info.splits.add(SplitInfo("train", num_examples=10))
    builder.info.splits.add(SplitInfo("test", num_examples=10))

    expected_features = Features({"text": Value("string")})

    # Materialize one arrow file per declared split so as_dataset has data to read.
    for info_split in builder.info.splits:
        with ArrowWriter(
            path=os.path.join(builder.cache_dir, f"{builder.dataset_name}-{info_split}.arrow"),
            features=Features({"text": Value("string")}),
        ) as writer:
            writer.write_batch({"text": ["foo"] * 10})
            writer.finalize()

    # The memory expectation depends on whether the data is loaded in memory.
    with (assert_arrow_memory_increases() if in_memory else assert_arrow_memory_doesnt_increase()):
        dataset = builder.as_dataset(split=split, in_memory=in_memory)
    assert isinstance(dataset, expected_dataset_class)

    # Normalize both possible return types to parallel (datasets, split names) sequences.
    if isinstance(dataset, DatasetDict):
        assert list(dataset.keys()) == ["train", "test"]
        datasets = dataset.values()
        expected_splits = ["train", "test"]
    elif isinstance(dataset, Dataset):
        datasets = [dataset]
        expected_splits = [split]

    for ds, expected_split in zip(datasets, expected_splits):
        assert ds.split == expected_split
        assert len(ds) == expected_dataset_length
        assert ds.features == expected_features
        # This comparison was previously a bare expression and never checked anything.
        assert ds.column_names == ["text"]
| 568 | |
| 569 | |
| 570 | @pytest.mark.parametrize("in_memory", [False, True]) |
nothing calls this directly
no test coverage detected