Takes `doc_or_html`, which is either an html page or an already-parsed doc, and returns a list of urls. The `regex` flag indicates that we don't parse via lxml and just search the html text.
(self, doc_or_html, titles=False, regex=False)
| 489 | return [a.get('href') for a in a_tags if a.get('href')] |
| 490 | |
def get_urls(self, doc_or_html, titles=False, regex=False):
    """Return the list of urls found in `doc_or_html`.

    `doc_or_html` may be an html string or an already-parsed doc. With
    `regex=True` nothing is parsed: tag-like spans are blanked out and
    the remaining text is scanned for http(s) urls.

    :param doc_or_html: html string or parsed doc. ``None`` is logged
        as critical and produces an empty list.
    :param titles: forwarded unchanged to ``self._get_urls``.
    :param regex: when true, search the text with a regular expression
        instead of parsing it.
    :return: a list of url strings in regex mode; otherwise whatever
        ``self._get_urls(doc, titles)`` returns.
    """
    if doc_or_html is None:
        log.critical('Must extract urls from either html, text or doc!')
        return []

    # If we are extracting from raw text
    if regex:
        # Replace every tag-like span with a space so that text on
        # either side of a tag stays separated.
        # NOTE(review): non-string input is coerced with str(); presumably
        # callers pass html text in this mode -- confirm.
        text = re.sub(r'<[^<]+?>', ' ', str(doc_or_html))
        # Raw string literals: the pattern contains `\(` and `\)`, which
        # are invalid escapes in a normal string literal (SyntaxWarning
        # on recent Python versions). The compiled pattern is unchanged.
        # `[$-_@.&+]` starts with the character range `$` through `_`.
        found = re.findall(
            r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|'
            r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
        return [url.strip() for url in found]

    # If the doc_or_html is html, parse it into a root
    if isinstance(doc_or_html, str):
        doc = self.parser.fromstring(doc_or_html)
    else:
        doc = doc_or_html
    return self._get_urls(doc, titles)
| 512 | |
| 513 | def get_category_urls(self, source_url, doc): |
| 514 | """Inputs source lxml root and source url, extracts domain and |
no test coverage detected