MCPcopy
hub / github.com/huggingface/datasets / test_select

Method test_select

tests/test_arrow_dataset.py:2220–2312  ·  view source on GitHub ↗
(self, in_memory)

Source from the content-addressed store, hash-verified

2218 assert_arrow_metadata_are_synced_with_dataset_features(inverted_dset)
2219
2220 def test_select(self, in_memory):
2221 with tempfile.TemporaryDirectory() as tmp_dir:
2222 with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
2223 # select every two example
2224 indices = list(range(0, len(dset), 2))
2225 tmp_file = os.path.join(tmp_dir, "test.arrow")
2226 fingerprint = dset._fingerprint
2227 with dset.select(indices, indices_cache_file_name=tmp_file) as dset_select_even:
2228 self.assertIsNotNone(dset_select_even._indices) # an indices mapping is created
2229 self.assertTrue(os.path.exists(tmp_file))
2230 self.assertEqual(len(dset_select_even), 15)
2231 for row in dset_select_even:
2232 self.assertEqual(int(row["filename"][-1]) % 2, 0)
2233 self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
2234 self.assertDictEqual(dset_select_even.features, Features({"filename": Value("string")}))
2235 self.assertNotEqual(dset_select_even._fingerprint, fingerprint)
2236
2237 with tempfile.TemporaryDirectory() as tmp_dir:
2238 with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
2239 indices = list(range(0, len(dset)))
2240 with dset.select(indices) as dset_select_all:
2241 # no indices mapping, since the indices are contiguous
2242 # (in this case the arrow table is simply sliced, which is more efficient)
2243 self.assertIsNone(dset_select_all._indices)
2244 self.assertEqual(len(dset_select_all), len(dset))
2245 self.assertListEqual(list(dset_select_all), list(dset))
2246 self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
2247 self.assertDictEqual(dset_select_all.features, Features({"filename": Value("string")}))
2248 self.assertNotEqual(dset_select_all._fingerprint, fingerprint)
2249 indices = range(0, len(dset))
2250 with dset.select(indices) as dset_select_all:
2251 # same but with range
2252 self.assertIsNone(dset_select_all._indices)
2253 self.assertEqual(len(dset_select_all), len(dset))
2254 self.assertListEqual(list(dset_select_all), list(dset))
2255 self.assertDictEqual(dset.features, Features({"filename": Value("string")}))
2256 self.assertDictEqual(dset_select_all.features, Features({"filename": Value("string")}))
2257 self.assertNotEqual(dset_select_all._fingerprint, fingerprint)
2258
2259 with tempfile.TemporaryDirectory() as tmp_dir:
2260 with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
2261 bad_indices = list(range(5))
2262 bad_indices[-1] = len(dset) + 10 # out of bounds
2263 tmp_file = os.path.join(tmp_dir, "test.arrow")
2264 self.assertRaises(
2265 Exception,
2266 dset.select,
2267 indices=bad_indices,
2268 indices_cache_file_name=tmp_file,
2269 writer_batch_size=2,
2270 )
2271 self.assertFalse(os.path.exists(tmp_file))
2272
2273 with tempfile.TemporaryDirectory() as tmp_dir:
2274 with self._create_dummy_dataset(in_memory, tmp_dir) as dset:
2275 indices = iter(range(len(dset))) # iterator of contiguous indices
2276 with dset.select(indices) as dset_select_all:
2277 # no indices mapping, since the indices are contiguous

Callers

nothing calls this directly

Calls 6

_create_dummy_dataset — Method · 0.95
Features — Class · 0.90
Value — Class · 0.90
exists — Method · 0.80
select — Method · 0.45
set_format — Method · 0.45

Tested by

no test coverage detected