MCPcopy
hub / github.com/ArchiveBox/ArchiveBox / update

Function update

archivebox/main.py:780–843  ·  view source on GitHub ↗

Import any new links from subscriptions and retry any previously failed/skipped links

(resume: Optional[float]=None,
           only_new: bool=ONLY_NEW,
           index_only: bool=False,
           overwrite: bool=False,
           filter_patterns_str: Optional[str]=None,
           filter_patterns: Optional[List[str]]=None,
           filter_type: Optional[str]=None,
           status: Optional[str]=None,
           after: Optional[str]=None,
           before: Optional[str]=None,
           extractors: str="",
           out_dir: Path=OUTPUT_DIR)

Source from the content-addressed store, hash-verified

778
779@enforce_types
780def update(resume: Optional[float]=None,
781 only_new: bool=ONLY_NEW,
782 index_only: bool=False,
783 overwrite: bool=False,
784 filter_patterns_str: Optional[str]=None,
785 filter_patterns: Optional[List[str]]=None,
786 filter_type: Optional[str]=None,
787 status: Optional[str]=None,
788 after: Optional[str]=None,
789 before: Optional[str]=None,
790 extractors: str="",
791 out_dir: Path=OUTPUT_DIR) -> List[Link]:
792 """Import any new links from subscriptions and retry any previously failed/skipped links"""
793
794 check_data_folder(out_dir=out_dir)
795 check_dependencies()
796 new_links: List[Link] = [] # TODO: Remove input argument: only_new
797
798 extractors = extractors.split(",") if extractors else []
799
800 # Step 1: Filter for selected_links
801 matching_snapshots = list_links(
802 filter_patterns=filter_patterns,
803 filter_type=filter_type,
804 before=before,
805 after=after,
806 )
807
808 matching_folders = list_folders(
809 links=matching_snapshots,
810 status=status,
811 out_dir=out_dir,
812 )
813 all_links = [link for link in matching_folders.values() if link]
814
815 if index_only:
816 for link in all_links:
817 write_link_details(link, out_dir=out_dir, skip_sql_index=True)
818 index_links(all_links, out_dir=out_dir)
819 return all_links
820
821 # Step 2: Run the archive methods for each link
822 to_archive = new_links if only_new else all_links
823 if resume:
824 to_archive = [
825 link for link in to_archive
826 if link.timestamp >= str(resume)
827 ]
828 if not to_archive:
829 stderr('')
830 stderr(f'[√] Nothing found to resume after {resume}', color='green')
831 return all_links
832
833 archive_kwargs = {
834 "out_dir": out_dir,
835 }
836 if extractors:
837 archive_kwargs["methods"] = extractors

Callers 1

main · Function · 0.85

Calls 9

check_data_folder · Function · 0.85
check_dependencies · Function · 0.85
list_links · Function · 0.85
list_folders · Function · 0.85
write_link_details · Function · 0.85
index_links · Function · 0.85
stderr · Function · 0.85
archive_links · Function · 0.85
load_main_index · Function · 0.85

Tested by

no test coverage detected