Give out a list of image file names to remove based on the similarity threshold. Does not remove the mentioned files. Args: image_dir: Path to the directory containing all the images or dictionary with keys as file names and values as numpy arrays which represent the CNN encoding for the key image file.
(
self,
image_dir: PurePath = None,
encoding_map: Dict[str, np.ndarray] = None,
min_similarity_threshold: float = 0.9,
outfile: Optional[str] = None,
recursive: Optional[bool] = False,
num_enc_workers: int = 0,
num_sim_workers: int = cpu_count(),
)
| 478 | return result |
| 479 | |
| 480 | def find_duplicates_to_remove( |
| 481 | self, |
| 482 | image_dir: PurePath = None, |
| 483 | encoding_map: Dict[str, np.ndarray] = None, |
| 484 | min_similarity_threshold: float = 0.9, |
| 485 | outfile: Optional[str] = None, |
| 486 | recursive: Optional[bool] = False, |
| 487 | num_enc_workers: int = 0, |
| 488 | num_sim_workers: int = cpu_count(), |
| 489 | ) -> List: |
| 490 | """ |
| 491 | Give out a list of image file names to remove based on the similarity threshold. Does not remove the mentioned |
| 492 | files. |
| 493 | |
| 494 | Args: |
| 495 | image_dir: Path to the directory containing all the images or dictionary with keys as file names |
| 496 | and values as numpy arrays which represent the CNN encoding for the key image file. |
| 497 | encoding_map: Optional, used instead of image_dir, a dictionary containing mapping of filenames and |
| 498 | corresponding CNN encodings. |
| 499 | min_similarity_threshold: Optional, threshold value (must be float between -1.0 and 1.0). Default is 0.9 |
| 500 | outfile: Optional, name of the file to save the results, must be a json. Default is None. |
| 501 | recursive: Optional, find images recursively in a nested image directory structure, set to False by default. |
| 502 | num_enc_workers: Optional, number of cpu cores to use for multiprocessing encoding generation (supported only on linux platform), set to 0 by default. 0 disables multiprocessing. |
| 503 | num_sim_workers: Optional, number of cpu cores to use for multiprocessing similarity computation, set to number of CPUs in the system by default. 0 disables multiprocessing. |
| 504 | |
| 505 | Returns: |
| 506 | duplicates: List of image file names that should be removed. |
| 507 | |
| 508 | Example: |
| 509 | ``` |
| 510 | from imagededup.methods import CNN |
| 511 | myencoder = CNN() |
| 512 | duplicates = myencoder.find_duplicates_to_remove(image_dir='path/to/images/directory', |
| 513 | min_similarity_threshold=0.85) |
| 514 | |
| 515 | OR |
| 516 | |
| 517 | from imagededup.methods import CNN |
| 518 | myencoder = CNN() |
| 519 | duplicates = myencoder.find_duplicates_to_remove(encoding_map=<mapping filename to cnn encodings>, |
| 520 | min_similarity_threshold=0.85, outfile='results.json') |
| 521 | ``` |
| 522 | """ |
| 523 | if image_dir or encoding_map: |
| 524 | duplicates = self.find_duplicates( |
| 525 | image_dir=image_dir, |
| 526 | encoding_map=encoding_map, |
| 527 | min_similarity_threshold=min_similarity_threshold, |
| 528 | scores=False, |
| 529 | recursive=recursive, |
| 530 | num_enc_workers=num_enc_workers, |
| 531 | num_sim_workers=num_sim_workers, |
| 532 | ) |
| 533 | |
| 534 | files_to_remove = get_files_to_remove(duplicates) |
| 535 | |
| 536 | if outfile: |
| 537 | save_json(files_to_remove, outfile) |