An asynchronous generator function for iterating through paginated API data. This function continuously makes requests to a specified API URL, incrementing the page number or applying a custom pagination function, and yields the received data one page at a time. It is well-suited for APIs that provide paginated results.
(self, url, page_size=100, _json=True, next_key=None, iter_key=None, **requests_kwargs)
| 1357 | return getattr(r, "is_success", False) or getattr(r, "status_code", 0) == 404 |
| 1358 | |
| 1359 | async def api_page_iter(self, url, page_size=100, _json=True, next_key=None, iter_key=None, **requests_kwargs): |
| 1360 | """ |
| 1361 | An asynchronous generator function for iterating through paginated API data. |
| 1362 | |
| 1363 | This function continuously makes requests to a specified API URL, incrementing the page number |
| 1364 | or applying a custom pagination function, and yields the received data one page at a time. |
| 1365 | It is well-suited for APIs that provide paginated results. |
| 1366 | |
| 1367 | Args: |
| 1368 | url (str): The initial API URL. Can contain placeholders for 'page', 'page_size', and 'offset'. |
| 1369 | page_size (int, optional): The number of items per page. Defaults to 100. |
| 1370 | json (bool, optional): If True, attempts to deserialize the response content to a JSON object. Defaults to True. |
| 1371 | next_key (callable, optional): A function that takes the last page's data and returns the URL for the next page. Defaults to None. |
| 1372 | iter_key (callable, optional): A function that builds each new request based on the page number, page size, and offset. Defaults to a simple implementation that autoreplaces {page} and {page_size} in the url. |
| 1373 | **requests_kwargs: Arbitrary keyword arguments that will be forwarded to the HTTP request function. |
| 1374 | |
| 1375 | Yields: |
| 1376 | dict or httpx.Response: If 'json' is True, yields a dictionary containing the parsed JSON data. Otherwise, yields the raw HTTP response. |
| 1377 | |
| 1378 | Note: |
| 1379 | The loop will continue indefinitely unless manually stopped. Make sure to break out of the loop once the last page has been received. |
| 1380 | |
| 1381 | Examples: |
| 1382 | >>> agen = api_page_iter('https://api.example.com/data?page={page}&page_size={page_size}') |
| 1383 | >>> try: |
| 1384 | >>> async for page in agen: |
| 1385 | >>> subdomains = page["subdomains"] |
| 1386 | >>> self.hugesuccess(subdomains) |
| 1387 | >>> if not subdomains: |
| 1388 | >>> break |
| 1389 | >>> finally: |
| 1390 | >>> await agen.aclose() |
| 1391 | """ |
| 1392 | page = 1 |
| 1393 | offset = 0 |
| 1394 | result = None |
| 1395 | if iter_key is None: |
| 1396 | iter_key = self._prepare_api_iter_req |
| 1397 | while 1: |
| 1398 | if result and callable(next_key): |
| 1399 | try: |
| 1400 | new_url = next_key(result) |
| 1401 | except Exception as e: |
| 1402 | self.debug(f"Failed to extract next page of results from {url}: {e}") |
| 1403 | self.debug(traceback.format_exc()) |
| 1404 | else: |
| 1405 | new_url, new_kwargs = iter_key(url, page, page_size, offset, **requests_kwargs) |
| 1406 | result = await self.api_request(new_url, **new_kwargs) |
| 1407 | if result is None: |
| 1408 | self.verbose(f"api_page_iter() got no response for {new_url}") |
| 1409 | break |
| 1410 | try: |
| 1411 | if _json: |
| 1412 | result = result.json() |
| 1413 | yield result |
| 1414 | except Exception: |
| 1415 | self.warning(f'Error in api_page_iter() for url: "{new_url}"') |
| 1416 | self.trace(traceback.format_exc()) |