Method get_urls

newspaper/extractors.py:491–511 · view source on GitHub ↗

Takes `doc_or_html`, either an HTML string or an already-parsed doc, and returns a list of URLs. The `regex` flag indicates that we don't parse via lxml and instead just search the raw HTML text.

(self, doc_or_html, titles=False, regex=False)

Source from the content-addressed store, hash-verified

489	return [a.get('href') for a in a_tags if a.get('href')]
490
def get_urls(self, doc_or_html, titles=False, regex=False):
    """Return the list of urls found in an html string or a parsed doc.

    Args:
        doc_or_html: Either an html string or an already-parsed doc.
            ``None`` is logged as critical and yields an empty list.
        titles: Forwarded unchanged to ``self._get_urls``; not used on
            the ``regex`` path.
        regex: When true, skip the parser entirely: the input is turned
            into a string, tags are stripped, and urls are pulled out of
            the remaining text with a regular expression.

    Returns:
        A list of url strings on the ``regex`` path (empty when nothing
        matches); otherwise whatever ``self._get_urls(doc, titles)``
        returns.
    """
    if doc_or_html is None:
        log.critical('Must extract urls from either html, text or doc!')
        return []

    # If we are extracting from raw text
    if regex:
        # Replace each tag with a space so that a url sitting in the text
        # is not glued to the surrounding markup.
        text = re.sub(r'<[^<]+?>', ' ', str(doc_or_html))
        # The bars between the character classes must be plain `|`
        # (alternation). Written as `\|` they match a literal pipe, which
        # makes the pattern unable to match any real url. Raw strings keep
        # the pattern free of invalid string escapes.
        found = re.findall(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*,]|'
            r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
        return [url.strip() for url in found]

    # If the doc_or_html is html, parse it into a root
    if isinstance(doc_or_html, str):
        doc = self.parser.fromstring(doc_or_html)
    else:
        doc = doc_or_html
    return self._get_urls(doc, titles)
512
513	def get_category_urls(self, source_url, doc):
514	"""Inputs source lxml root and source url, extracts domain and

get_category_urlsMethod · 0.95

feeds_to_articlesMethod · 0.80

categories_to_articlesMethod · 0.80

_get_urlsMethod · 0.95

fromstringMethod · 0.80

no test coverage detected