MCPcopy
hub / github.com/treeverse/dvc / drop_duplicates

Method drop_duplicates

dvc/compare.py:226–274  ·  view source on GitHub ↗
(  # noqa: C901
        self,
        axis: str = "rows",
        subset: Optional[Iterable[str]] = None,
        ignore_empty: bool = True,
    )

Source from the content-addressed store, hash-verified

224 self.drop(*to_drop)
225
def drop_duplicates(  # noqa: C901
    self,
    axis: str = "rows",
    subset: Optional[Iterable[str]] = None,
    ignore_empty: bool = True,
):
    """Remove redundant rows or columns from the table, in place.

    With ``axis="rows"``, every row that compares equal to an earlier row
    is removed; the first occurrence is kept. When ``subset`` is given,
    only the cells of those columns take part in the comparison.
    ``ignore_empty`` is not consulted in this mode.

    With ``axis="cols"``, a column is dropped when the string form of its
    values contains exactly one distinct value. When ``ignore_empty`` is
    true, the table's fill value is discarded before counting. When
    ``subset`` is given, only those columns are candidates for removal.

    Args:
        axis: Either ``"rows"`` or ``"cols"``.
        subset: Column names to restrict the operation to. ``None`` or an
            empty iterable means "all columns". Any iterable is accepted,
            including one-shot iterators; a bare string is treated as a
            single column name.
        ignore_empty: In ``"cols"`` mode, disregard the fill value when
            counting a column's distinct values.

    Raises:
        ValueError: If ``axis`` is neither ``"rows"`` nor ``"cols"``.
    """
    if axis not in ["rows", "cols"]:
        raise ValueError(
            f"Invalid 'axis' value {axis}.Choose one of ['rows', 'cols']"
        )

    # Materialise ``subset`` exactly once. It is membership-tested for
    # every column (and every row), which would silently exhaust a
    # generator/iterator after the first few tests.
    if subset is None:
        wanted: set[str] = set()
    elif isinstance(subset, str):
        # A lone string is one column name, not a collection of characters.
        wanted = {subset}
    else:
        wanted = set(subset)

    # The key list does not change while scanning, so fetch it once.
    names = self.keys()

    if axis == "cols":
        cols_to_drop: list[str] = []
        for n_col, col in enumerate(self.columns):
            name = names[n_col]
            if wanted and name not in wanted:
                continue
            # Cast to str because cell values (e.g. Text) may not be hashable.
            unique_vals = {str(x) for x in col}
            if ignore_empty:
                unique_vals.discard(self._fill_value)
            if len(unique_vals) == 1:
                cols_to_drop.append(name)
        self.drop(*cols_to_drop)
        return

    # axis == "rows"
    # Seen rows stay in a list (compared with ==) rather than a set because
    # cells are not guaranteed to be hashable.
    seen_rows: list[tuple] = []
    # Row indices are plain ints, so a set gives O(1) lookups when filtering.
    rows_to_drop: set[int] = set()
    for n_row, row in enumerate(self):
        if wanted:
            row = [
                cell
                for n_col, cell in enumerate(row)
                if names[n_col] in wanted
            ]

        row_key = tuple(row)
        if row_key in seen_rows:
            rows_to_drop.add(n_row)
        else:
            seen_rows.append(row_key)

    # Rebuild every column without the duplicate row positions.
    for name in names:
        self._columns[name] = Column(
            [
                x
                for n, x in enumerate(self._columns[name])
                if n not in rows_to_drop
            ]
        )
275
276
277def _normalize_float(val: float, precision: int):

Callers 5

test_drop_duplicatesFunction · 0.95
show_experimentsFunction · 0.80

Calls 4

keysMethod · 0.95
dropMethod · 0.95
ColumnClass · 0.85
appendMethod · 0.80

Tested by 4

test_drop_duplicatesFunction · 0.76