MCPcopy
hub / github.com/ArchiveBox/ArchiveBox / run_parser_functions

Function run_parser_functions

archivebox/parsers/__init__.py:111–147  ·  view source on GitHub ↗
(to_parse: IO[str], timer, root_url: Optional[str]=None, parser: str="auto")

Source from the content-addressed store, hash-verified

109
110
def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None, parser: str="auto") -> Tuple[List[Link], Optional[str]]:
    """Parse links out of ``to_parse`` and report which parser produced them.

    When ``parser`` is anything other than ``"auto"``, only ``PARSERS[parser]``
    is run. Its result is returned as-is (possibly an empty list) after a
    warning is printed to stderr if nothing was found.

    When ``parser`` is ``"auto"``, every entry in ``PARSERS`` is tried in
    iteration order. Parsers that raise are skipped, and the result of the
    parser that produced the MOST links is returned (not the first one that
    succeeds). On a tie the earlier parser wins.

    Args:
        to_parse: Text stream handed unchanged to each parser function.
        timer: Object whose ``end()`` method is called on every exit path.
        root_url: Forwarded to each parser function as ``root_url``.
        parser: Key into ``PARSERS``, or ``"auto"`` to try all of them.

    Returns:
        A ``(links, parser_name)`` tuple. In auto mode ``links`` is empty and
        ``parser_name`` is ``None`` when no parser found any links.

    Raises:
        With an explicit ``parser``, any error from the ``PARSERS[parser]``
        lookup or from the selected parser function propagates to the caller.
    """
    # NOTE(review): every parser receives the same stream object; presumably
    # each parser rewinds it before reading -- confirm against the parsers.
    try:
        if parser != "auto":
            parser_name, parser_func = PARSERS[parser]
            parsed_links = list(parser_func(to_parse, root_url=root_url))
            if not parsed_links:
                stderr()
                stderr(f'[X] No links found using {parser_name} parser', color='red')
                hint('Try a different parser or double check the input?')
                stderr()
            return parsed_links, parser_name

        most_links: List[Link] = []
        best_parser_name = None

        for parser_id in PARSERS:
            parser_name, parser_func = PARSERS[parser_id]
            try:
                parsed_links = list(parser_func(to_parse, root_url=root_url))
            except Exception:
                # Best-effort: a parser that cannot handle this input format
                # is skipped. To debug why a parser was not used, print the
                # exception or re-raise it here.
                continue

            # An empty result can never beat the current best, so no separate
            # "no links found" check is needed.
            if len(parsed_links) > len(most_links):
                most_links = parsed_links
                best_parser_name = parser_name

        return most_links, best_parser_name
    finally:
        # Stop the timer on every exit path, including when an explicitly
        # selected parser raises.
        timer.end()
148
149
150@enforce_types

Callers 2

parse_links_memoryFunction · 0.85
parse_linksFunction · 0.85

Calls 3

stderrFunction · 0.85
hintFunction · 0.85
endMethod · 0.80

Tested by

no test coverage detected