Add a new URL or a list of URLs to your archive.
(urls: Union[str, List[str]],
tag: str='',
depth: int=0,
update: bool=not ONLY_NEW,
update_all: bool=False,
index_only: bool=False,
overwrite: bool=False,
# duplicate: bool=False, # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
init: bool=False,
extractors: str="",
parser: str="auto",
out_dir: Path=OUTPUT_DIR)
| 591 | |
| 592 | @enforce_types |
| 593 | def add(urls: Union[str, List[str]], |
| 594 | tag: str='', |
| 595 | depth: int=0, |
| 596 | update: bool=not ONLY_NEW, |
| 597 | update_all: bool=False, |
| 598 | index_only: bool=False, |
| 599 | overwrite: bool=False, |
| 600 | # duplicate: bool=False, # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically |
| 601 | init: bool=False, |
| 602 | extractors: str="", |
| 603 | parser: str="auto", |
| 604 | out_dir: Path=OUTPUT_DIR) -> List[Link]: |
| 605 | """Add a new URL or list of URLs to your archive""" |
| 606 | |
| 607 | from core.models import Snapshot, Tag |
| 608 | |
| 609 | assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' |
| 610 | |
| 611 | extractors = extractors.split(",") if extractors else [] |
| 612 | |
| 613 | if init: |
| 614 | run_subcommand('init', stdin=None, pwd=out_dir) |
| 615 | |
| 616 | # Load list of links from the existing index |
| 617 | check_data_folder(out_dir=out_dir) |
| 618 | check_dependencies() |
| 619 | new_links: List[Link] = [] |
| 620 | all_links = load_main_index(out_dir=out_dir) |
| 621 | |
| 622 | log_importing_started(urls=urls, depth=depth, index_only=index_only) |
| 623 | if isinstance(urls, str): |
| 624 | # save verbatim stdin to sources |
| 625 | write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir) |
| 626 | elif isinstance(urls, list): |
| 627 | # save verbatim args to sources |
| 628 | write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir) |
| 629 | |
| 630 | |
| 631 | new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser) |
| 632 | |
| 633 | # If we're going one level deeper, download each link and look for more links |
| 634 | new_links_depth = [] |
| 635 | if new_links and depth == 1: |
| 636 | log_crawl_started(new_links) |
| 637 | for new_link in new_links: |
| 638 | try: |
| 639 | downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir) |
| 640 | new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url) |
| 641 | except Exception as err: |
| 642 | stderr('[!] Failed to get contents of URL {new_link.url}', err, color='red') |
| 643 | |
| 644 | imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values()) |
| 645 | |
| 646 | new_links = dedupe_links(all_links, imported_links) |
| 647 | |
| 648 | write_main_index(links=new_links, out_dir=out_dir) |
| 649 | all_links = load_main_index(out_dir=out_dir) |
| 650 |
No test coverage detected.