MCPcopy
hub / github.com/huggingface/datasets / test_concatenate_pickle

Method test_concatenate_pickle

tests/test_arrow_dataset.py:860–910  ·  view source on GitHub ↗
(self, in_memory)

Source from the content-addressed store, hash-verified

858 del dset1, dset2, dset3
859
def test_concatenate_pickle(self, in_memory):
    """Check that a concatenation of pickled datasets can itself be pickled.

    Three small datasets are created, routed through ``self._to`` (``dset1``
    and ``dset2`` with ``in_memory``, ``dset3`` with ``not in_memory``),
    reordered with ``select``, pickled, concatenated in reverse order, and the
    concatenation is pickled once more. The test then asserts the lengths,
    the ``id`` column, the number of cache files and the merged description.

    Args:
        in_memory: Storage flag forwarded to ``self._to`` and to
            ``select(keep_in_memory=...)``; also decides whether the indices
            cache files are written under the temporary directory.
    """
    first_rows = {"id": [0, 1, 2] * 2}
    second_rows = {"id": [3, 4, 5] * 2}
    third_rows = {"id": [6, 7], "foo": ["bar", "bar"]}
    info1 = DatasetInfo(description="Dataset1")
    info2 = DatasetInfo(description="Dataset2")

    with tempfile.TemporaryDirectory() as tmp_dir:

        def indices_cache(file_name):
            # An indices cache file is only requested when not kept in memory.
            return None if in_memory else os.path.join(tmp_dir, file_name)

        dset1 = Dataset.from_dict(first_rows, info=info1)
        dset2 = Dataset.from_dict(second_rows, info=info2)
        dset3 = Dataset.from_dict(third_rows)
        schema = dset1.data.schema

        # Mix in-memory and on-disk datasets: dset3 gets the opposite flag.
        dset1, dset2 = self._to(in_memory, tmp_dir, dset1, dset2)
        dset3 = self._to(not in_memory, tmp_dir, dset3)

        # Reverse the leading rows of each dataset through an indices mapping.
        dset1, dset2, dset3 = (
            dset1.select([2, 1, 0], keep_in_memory=in_memory, indices_cache_file_name=indices_cache("i1.arrow")),
            dset2.select([2, 1, 0], keep_in_memory=in_memory, indices_cache_file_name=indices_cache("i2.arrow")),
            dset3.select([1, 0], keep_in_memory=in_memory, indices_cache_file_name=indices_cache("i3.arrow")),
        )

        # Leave dset3 with only the "id" column so it can be concatenated.
        dset3 = dset3.rename_column("foo", "new_foo")
        dset3 = dset3.remove_columns("new_foo")

        # Swap the table of the datasets that received the "not in memory"
        # flag for an Unpicklable placeholder before pickling them.
        # NOTE(review): presumably Unpicklable fails when pickled, proving
        # those tables are not serialized -- confirm against its definition.
        if in_memory:
            dset3._data.table = Unpicklable(schema=schema)
        else:
            dset1._data.table = Unpicklable(schema=schema)
            dset2._data.table = Unpicklable(schema=schema)

        dset1, dset2, dset3 = [pickle.loads(pickle.dumps(dset)) for dset in (dset1, dset2, dset3)]

        with concatenate_datasets([dset3, dset2, dset1]) as dset_concat:
            if not in_memory:
                dset_concat._data.table = Unpicklable(schema=schema)
            # The name is deliberately rebound to the unpickled copy.
            with pickle.loads(pickle.dumps(dset_concat)) as dset_concat:
                self.assertTupleEqual((len(dset1), len(dset2), len(dset3)), (3, 3, 2))
                self.assertEqual(len(dset_concat), len(dset1) + len(dset2) + len(dset3))
                self.assertListEqual(dset_concat["id"][:], [7, 6, 5, 4, 3, 2, 1, 0])
                # in_memory = True: 1 cache file for dset3
                # in_memory = False: 2 cache files for dset1 and dset2, and 1 cache file for i1.arrow
                expected_cache_files = 1 if in_memory else 3
                self.assertEqual(len(dset_concat.cache_files), expected_cache_files)
                self.assertEqual(dset_concat.info.description, "Dataset2\n\nDataset1")

        # NOTE(review): placed inside the temporary-directory block so the
        # references are dropped before cleanup; the original nesting of this
        # statement should be confirmed.
        del dset1, dset2, dset3
911
912 def test_repeat(self, in_memory):
913 with tempfile.TemporaryDirectory() as tmp_dir:

Callers

nothing calls this directly

Calls 8

_toMethod · 0.95
DatasetInfoClass · 0.90
concatenate_datasetsFunction · 0.90
UnpicklableClass · 0.85
from_dictMethod · 0.45
selectMethod · 0.45
rename_columnMethod · 0.45
remove_columnsMethod · 0.45

Tested by

no test coverage detected