hub / github.com/ArchiveBox/ArchiveBox / add

Function add

archivebox/main.py:593–698 · view source on GitHub ↗

Add a new URL or list of URLs to your archive

(urls: Union[str, List[str]],
        tag: str='',
        depth: int=0,
        update: bool=not ONLY_NEW,
        update_all: bool=False,
        index_only: bool=False,
        overwrite: bool=False,
        # duplicate: bool=False,  # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
        init: bool=False,
        extractors: str="",
        parser: str="auto",
        out_dir: Path=OUTPUT_DIR)

Source from the content-addressed store, hash-verified

591
592	@enforce_types
593	def add(urls: Union[str, List[str]],
594	tag: str='',
595	depth: int=0,
596	update: bool=not ONLY_NEW,
597	update_all: bool=False,
598	index_only: bool=False,
599	overwrite: bool=False,
600	# duplicate: bool=False, # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
601	init: bool=False,
602	extractors: str="",
603	parser: str="auto",
604	out_dir: Path=OUTPUT_DIR) -> List[Link]:
605	"""Add a new URL or list of URLs to your archive"""
606
607	from core.models import Snapshot, Tag
608
609	assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
610
611	extractors = extractors.split(",") if extractors else []
612
613	if init:
614	run_subcommand('init', stdin=None, pwd=out_dir)
615
616	# Load list of links from the existing index
617	check_data_folder(out_dir=out_dir)
618	check_dependencies()
619	new_links: List[Link] = []
620	all_links = load_main_index(out_dir=out_dir)
621
622	log_importing_started(urls=urls, depth=depth, index_only=index_only)
623	if isinstance(urls, str):
624	# save verbatim stdin to sources
625	write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
626	elif isinstance(urls, list):
627	# save verbatim args to sources
628	write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
629
630
631	new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser)
632
633	# If we're going one level deeper, download each link and look for more links
634	new_links_depth = []
635	if new_links and depth == 1:
636	log_crawl_started(new_links)
637	for new_link in new_links:
638	try:
639	downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
640	new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
641	except Exception as err:
642	stderr('[!] Failed to get contents of URL {new_link.url}', err, color='red')
643
644	imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
645
646	new_links = dedupe_links(all_links, imported_links)
647
648	write_main_index(links=new_links, out_dir=out_dir)
649	all_links = load_main_index(out_dir=out_dir)
650

Callers 4

resnapshot_snapshot · Method · 0.90

add_view · Method · 0.90

form_valid · Method · 0.85

main · Function · 0.85

Calls 15

run_subcommand · Function · 0.85

check_data_folder · Function · 0.85

check_dependencies · Function · 0.85

load_main_index · Function · 0.85

log_importing_started · Function · 0.85

save_text_as_source · Function · 0.85

parse_links_from_source · Function · 0.85

log_crawl_started · Function · 0.85

save_file_as_source · Function · 0.85

stderr · Function · 0.85

dedupe_links · Function · 0.85

write_main_index · Function · 0.85

Tested by

no test coverage detected