(self, in_memory)
| 2218 | assert_arrow_metadata_are_synced_with_dataset_features(inverted_dset) |
| 2219 | |
| 2220 | def test_select(self, in_memory): |
| 2221 | with tempfile.TemporaryDirectory() as tmp_dir: |
| 2222 | with self._create_dummy_dataset(in_memory, tmp_dir) as dset: |
| 2223 | # select every other example (even indices only) |
| 2224 | indices = list(range(0, len(dset), 2)) |
| 2225 | tmp_file = os.path.join(tmp_dir, "test.arrow") |
| 2226 | fingerprint = dset._fingerprint |
| 2227 | with dset.select(indices, indices_cache_file_name=tmp_file) as dset_select_even: |
| 2228 | self.assertIsNotNone(dset_select_even._indices) # an indices mapping is created |
| 2229 | self.assertTrue(os.path.exists(tmp_file)) |
| 2230 | self.assertEqual(len(dset_select_even), 15) |
| 2231 | for row in dset_select_even: |
| 2232 | self.assertEqual(int(row["filename"][-1]) % 2, 0) |
| 2233 | self.assertDictEqual(dset.features, Features({"filename": Value("string")})) |
| 2234 | self.assertDictEqual(dset_select_even.features, Features({"filename": Value("string")})) |
| 2235 | self.assertNotEqual(dset_select_even._fingerprint, fingerprint) |
| 2236 | |
| 2237 | with tempfile.TemporaryDirectory() as tmp_dir: |
| 2238 | with self._create_dummy_dataset(in_memory, tmp_dir) as dset: |
| 2239 | indices = list(range(0, len(dset))) |
| 2240 | with dset.select(indices) as dset_select_all: |
| 2241 | # no indices mapping, since the indices are contiguous |
| 2242 | # (in this case the arrow table is simply sliced, which is more efficient) |
| 2243 | self.assertIsNone(dset_select_all._indices) |
| 2244 | self.assertEqual(len(dset_select_all), len(dset)) |
| 2245 | self.assertListEqual(list(dset_select_all), list(dset)) |
| 2246 | self.assertDictEqual(dset.features, Features({"filename": Value("string")})) |
| 2247 | self.assertDictEqual(dset_select_all.features, Features({"filename": Value("string")})) |
| 2248 | self.assertNotEqual(dset_select_all._fingerprint, fingerprint) |
| 2249 | indices = range(0, len(dset)) |
| 2250 | with dset.select(indices) as dset_select_all: |
| 2251 | # same but with range |
| 2252 | self.assertIsNone(dset_select_all._indices) |
| 2253 | self.assertEqual(len(dset_select_all), len(dset)) |
| 2254 | self.assertListEqual(list(dset_select_all), list(dset)) |
| 2255 | self.assertDictEqual(dset.features, Features({"filename": Value("string")})) |
| 2256 | self.assertDictEqual(dset_select_all.features, Features({"filename": Value("string")})) |
| 2257 | self.assertNotEqual(dset_select_all._fingerprint, fingerprint) |
| 2258 | |
| 2259 | with tempfile.TemporaryDirectory() as tmp_dir: |
| 2260 | with self._create_dummy_dataset(in_memory, tmp_dir) as dset: |
| 2261 | bad_indices = list(range(5)) |
| 2262 | bad_indices[-1] = len(dset) + 10 # out of bounds |
| 2263 | tmp_file = os.path.join(tmp_dir, "test.arrow") |
| 2264 | self.assertRaises( |
| 2265 | Exception, |
| 2266 | dset.select, |
| 2267 | indices=bad_indices, |
| 2268 | indices_cache_file_name=tmp_file, |
| 2269 | writer_batch_size=2, |
| 2270 | ) |
| 2271 | self.assertFalse(os.path.exists(tmp_file)) |
| 2272 | |
| 2273 | with tempfile.TemporaryDirectory() as tmp_dir: |
| 2274 | with self._create_dummy_dataset(in_memory, tmp_dir) as dset: |
| 2275 | indices = iter(range(len(dset))) # iterator of contiguous indices |
| 2276 | with dset.select(indices) as dset_select_all: |
| 2277 | # no indices mapping, since the indices are contiguous |
nothing calls this directly
no test coverage detected