MCPcopy
hub / github.com/ArchiveBox/ArchiveBox / add

Function add

archivebox/main.py:593–698  ·  view source on GitHub ↗

Add a new URL or list of URLs to your archive

(urls: Union[str, List[str]],
        tag: str='',
        depth: int=0,
        update: bool=not ONLY_NEW,
        update_all: bool=False,
        index_only: bool=False,
        overwrite: bool=False,
        # duplicate: bool=False,  # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
        init: bool=False,
        extractors: str="",
        parser: str="auto",
        out_dir: Path=OUTPUT_DIR)

Source from the content-addressed store, hash-verified

591
592@enforce_types
593def add(urls: Union[str, List[str]],
594 tag: str='',
595 depth: int=0,
596 update: bool=not ONLY_NEW,
597 update_all: bool=False,
598 index_only: bool=False,
599 overwrite: bool=False,
600 # duplicate: bool=False, # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
601 init: bool=False,
602 extractors: str="",
603 parser: str="auto",
604 out_dir: Path=OUTPUT_DIR) -> List[Link]:
605 """Add a new URL or list of URLs to your archive"""
606
607 from core.models import Snapshot, Tag
608
609 assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
610
611 extractors = extractors.split(",") if extractors else []
612
613 if init:
614 run_subcommand('init', stdin=None, pwd=out_dir)
615
616 # Load list of links from the existing index
617 check_data_folder(out_dir=out_dir)
618 check_dependencies()
619 new_links: List[Link] = []
620 all_links = load_main_index(out_dir=out_dir)
621
622 log_importing_started(urls=urls, depth=depth, index_only=index_only)
623 if isinstance(urls, str):
624 # save verbatim stdin to sources
625 write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
626 elif isinstance(urls, list):
627 # save verbatim args to sources
628 write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
629
630
631 new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser)
632
633 # If we're going one level deeper, download each link and look for more links
634 new_links_depth = []
635 if new_links and depth == 1:
636 log_crawl_started(new_links)
637 for new_link in new_links:
638 try:
639 downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
640 new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
641 except Exception as err:
642 stderr('[!] Failed to get contents of URL {new_link.url}', err, color='red')
643
644 imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
645
646 new_links = dedupe_links(all_links, imported_links)
647
648 write_main_index(links=new_links, out_dir=out_dir)
649 all_links = load_main_index(out_dir=out_dir)
650

Callers 4

resnapshot_snapshot · Method · 0.90
add_view · Method · 0.90
form_valid · Method · 0.85
main · Function · 0.85

Calls 15

run_subcommand · Function · 0.85
check_data_folder · Function · 0.85
check_dependencies · Function · 0.85
load_main_index · Function · 0.85
log_importing_started · Function · 0.85
save_text_as_source · Function · 0.85
parse_links_from_source · Function · 0.85
log_crawl_started · Function · 0.85
save_file_as_source · Function · 0.85
stderr · Function · 0.85
dedupe_links · Function · 0.85
write_main_index · Function · 0.85

Tested by

no test coverage detected