MCPcopy
hub / github.com/psf/requests-html / HTML

Class HTML

requests_html.py:405–638  ·  view source on GitHub ↗

An HTML document, ready for parsing. :param url: The URL from which the HTML originated, used for ``absolute_links``. :param html: HTML from which to base the parsing upon (optional). :param default_encoding: Which encoding to default to.

Source from the content-addressed store, hash-verified

403
404
405class HTML(BaseParser):
406 """An HTML document, ready for parsing.
407
408 :param url: The URL from which the HTML originated, used for ``absolute_links``.
409 :param html: HTML from which to base the parsing upon (optional).
410 :param default_encoding: Which encoding to default to.
411 """
412
def __init__(self, *, session: Union['HTMLSession', 'AsyncHTMLSession'] = None, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING, async_: bool = False) -> None:
    """Parse ``html`` into a document and attach a session to it.

    :param session: Session to reuse. When falsy, a new one is created
        (see ``async_``).
    :param url: The URL from which the HTML originated.
    :param html: Markup to parse, as ``str`` or ``bytes``. ``str`` input is
        encoded to bytes before parsing.
    :param default_encoding: Encoding forwarded to the base parser.
    :param async_: When ``True`` and no ``session`` is given, an
        ``AsyncHTMLSession`` is created instead of an ``HTMLSession``.
    """
    # Convert incoming unicode HTML into bytes.
    # NOTE(review): this always uses DEFAULT_ENCODING, not ``default_encoding``;
    # confirm that is intended before changing it.
    if isinstance(html, str):
        html = html.encode(DEFAULT_ENCODING)

    # Prefer the document's own <html> element when the markup has one.
    element = PyQuery(html)('html')
    if not element:
        # ``html`` is bytes here, so the wrapper must be built from bytes.
        # Interpolating it into an f-string would embed its repr (b'...')
        # into the parsed document instead of the markup itself.
        element = PyQuery(b'<html>' + html + b'</html>')('html')

    super(HTML, self).__init__(
        element=element,
        html=html,
        url=url,
        default_encoding=default_encoding
    )

    # Reuse the caller's session, otherwise create one of the requested kind.
    self.session = session or (AsyncHTMLSession() if async_ else HTMLSession())
    self.page = None
    self.next_symbol = DEFAULT_NEXT_SYMBOL
429
def __repr__(self) -> str:
    """Return a short debugging representation built from ``self.url``."""
    representation = f"<HTML url={self.url!r}>"
    return representation
432
433 def next(self, fetch: bool = False, next_symbol: _NextSymbol = DEFAULT_NEXT_SYMBOL) -> _Next:
434 """Attempts to find the next page, if there is one. If ``fetch``
435 is ``True``, returns :class:`HTML <HTML>` object of
436 next page. If ``fetch`` is ``False`` (default), simply returns the next URL.
437
438 """
439
440 def get_next():
441 candidates = self.find('a', containing=next_symbol)
442
443 for candidate in candidates:
444 if candidate.attrs.get('href'):
445 # Support 'next' rel (e.g. reddit).
446 if 'next' in candidate.attrs.get('rel', []):
447 return candidate.attrs['href']
448
449 # Support 'next' in classnames.
450 for _class in candidate.attrs.get('class', []):
451 if 'next' in _class:
452 return candidate.attrs['href']
453
454 if 'page' in candidate.attrs['href']:
455 return candidate.attrs['href']
456
457 try:
458 # Resort to the last candidate.
459 return candidates[-1].attrs['href']
460 except IndexError:
461 return None
462

Callers 9

test_html_loadingFunction · 0.90
test_absolute_linksFunction · 0.90
test_bare_renderFunction · 0.90
test_bare_arenderFunction · 0.90
test_bare_js_evalFunction · 0.90
test_bare_js_async_evalFunction · 0.90
renderMethod · 0.85
arenderMethod · 0.85
htmlMethod · 0.85

Calls

no outgoing calls

Tested by 6

test_html_loadingFunction · 0.72
test_absolute_linksFunction · 0.72
test_bare_renderFunction · 0.72
test_bare_arenderFunction · 0.72
test_bare_js_evalFunction · 0.72
test_bare_js_async_evalFunction · 0.72

Used in the wild real call sites across dependent graphs

searching dependent graphs…