An HTML document, ready for parsing. :param url: The URL from which the HTML originated, used for ``absolute_links``. :param html: HTML from which to base the parsing upon (optional). :param default_encoding: Which encoding to default to.
| 403 | |
| 404 | |
| 405 | class HTML(BaseParser): |
| 406 | """An HTML document, ready for parsing. |
| 407 | |
| 408 | :param url: The URL from which the HTML originated, used for ``absolute_links``. |
| 409 | :param html: HTML from which to base the parsing upon (optional). |
| 410 | :param default_encoding: Which encoding to default to. |
| 411 | """ |
| 412 | |
def __init__(self, *, session: Union['HTMLSession', 'AsyncHTMLSession', None] = None, url: str = DEFAULT_URL, html: _HTML, default_encoding: str = DEFAULT_ENCODING, async_: bool = False) -> None:
    """Build an HTML document from raw markup.

    :param session: Session to attach to the document. When falsy, a new
        ``AsyncHTMLSession`` (if ``async_`` is true) or ``HTMLSession`` is created.
    :param url: The URL from which the HTML originated.
    :param html: Markup to parse, as ``str`` or ``bytes``.
    :param default_encoding: Encoding forwarded to the base parser.
    :param async_: Selects which session type is created when ``session`` is not given.
    """

    # Convert incoming unicode HTML into bytes.
    if isinstance(html, str):
        html = html.encode(DEFAULT_ENCODING)

    # Prefer the <html> element found in the markup itself.
    element = PyQuery(html)('html')
    if not element:
        # No <html> element was found: wrap the markup in one. ``html`` is
        # bytes at this point, so it must be wrapped with bytes; formatting
        # it into an f-string would embed its ``b'...'`` repr in the document.
        if isinstance(html, bytes):
            wrapped = b'<html>' + html + b'</html>'
        else:
            wrapped = f'<html>{html}</html>'
        element = PyQuery(wrapped)('html')

    super().__init__(
        element=element,
        html=html,
        url=url,
        default_encoding=default_encoding
    )

    # Only build a session when the caller did not supply one.
    if not session:
        session = AsyncHTMLSession() if async_ else HTMLSession()
    self.session = session
    self.page = None
    self.next_symbol = DEFAULT_NEXT_SYMBOL
| 429 | |
def __repr__(self) -> str:
    """Return a short debug representation showing the document's URL."""
    template = "<HTML url={!r}>"
    return template.format(self.url)
| 432 | |
| 433 | def next(self, fetch: bool = False, next_symbol: _NextSymbol = DEFAULT_NEXT_SYMBOL) -> _Next: |
| 434 | """Attempts to find the next page, if there is one. If ``fetch`` |
| 435 | is ``True``, returns :class:`HTML <HTML>` object of |
| 436 | next page. If ``fetch`` is ``False`` (default), simply returns the next URL. |
| 437 | |
| 438 | """ |
| 439 | |
def get_next():
    """Return the ``href`` of the most likely "next page" anchor, or ``None``.

    Candidates are the anchors returned by
    ``self.find('a', containing=next_symbol)``.
    """
    candidates = self.find('a', containing=next_symbol)

    for candidate in candidates:
        href = candidate.attrs.get('href')
        if not href:
            # Anchors without a usable href cannot point to a next page.
            continue

        # Support 'next' rel (e.g. reddit).
        if 'next' in candidate.attrs.get('rel', []):
            return href

        # Support 'next' in classnames.
        if any('next' in _class for _class in candidate.attrs.get('class', [])):
            return href

        if 'page' in href:
            return href

    try:
        # Resort to the last candidate.
        last = candidates[-1]
    except IndexError:
        return None
    # The last candidate may have no href at all; return None rather than
    # raising KeyError.
    return last.attrs.get('href')
| 462 |
no outgoing calls
searching dependent graphs…