MCPcopy
hub / github.com/codelucas/newspaper / get_urls

Method get_urls

newspaper/extractors.py:491–511  ·  view source on GitHub ↗

Takes an HTML string or an already-parsed doc (`doc_or_html`) and returns a list of URLs. The `regex` flag indicates that we don't parse via lxml and instead just search the HTML text.

(self, doc_or_html, titles=False, regex=False)

Source from the content-addressed store, hash-verified

489 return [a.get('href') for a in a_tags if a.get('href')]
490
def get_urls(self, doc_or_html, titles=False, regex=False):
    """Return the list of urls found in an html string or a parsed doc.

    :param doc_or_html: html text or an already-parsed doc. ``None`` is
        logged as critical and yields an empty list.
    :param titles: forwarded unchanged to ``self._get_urls``; not used
        when ``regex`` is true.
    :param regex: when true, don't parse via lxml; strip the tags and
        search the remaining text for ``http://`` / ``https://`` urls.
        Urls that only occur inside a tag (e.g. in an attribute) are
        removed together with the tag and therefore not returned.
    :return: list of url strings in regex mode, otherwise whatever
        ``self._get_urls(doc, titles)`` returns.
    """
    if doc_or_html is None:
        log.critical('Must extract urls from either html, text or doc!')
        return []
    # If we are extracting from raw text
    if regex:
        # Replace every tag with a space so only the text is searched.
        text = re.sub(r'<[^<]+?>', ' ', str(doc_or_html))
        # Raw strings: `\(` and `\)` are regex escapes, not valid string
        # escapes, and would otherwise trigger an invalid-escape warning.
        found = re.findall(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'
            r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
        return [url.strip() for url in found]
    # If the doc_or_html is html, parse it into a root
    if isinstance(doc_or_html, str):
        doc = self.parser.fromstring(doc_or_html)
    else:
        doc = doc_or_html
    return self._get_urls(doc, titles)
512
513 def get_category_urls(self, source_url, doc):
514 """Inputs source lxml root and source url, extracts domain and

Callers 3

get_category_urlsMethod · 0.95
feeds_to_articlesMethod · 0.80

Calls 2

_get_urlsMethod · 0.95
fromstringMethod · 0.80

Tested by

no test coverage detected