Handle server connection and redirections.

Parameters
----------
url : str
    URL to fetch.
http_head : bool
    If True, send only HTTP HEAD request. Default is False.

Returns
-------
FetchResult
    (url, title, desc, keywords, mime, bad, fetch_status)
(
url: str,
http_head: bool = False
)
| 4345 | |
| 4346 | |
| 4347 | def fetch_data( |
| 4348 | url: str, |
| 4349 | http_head: bool = False |
| 4350 | ) -> FetchResult: |
| 4351 | """Handle server connection and redirections. |
| 4352 | |
| 4353 | Parameters |
| 4354 | ---------- |
| 4355 | url : str |
| 4356 | URL to fetch. |
| 4357 | http_head : bool |
| 4358 | If True, send only HTTP HEAD request. Default is False. |
| 4359 | |
| 4360 | Returns |
| 4361 | ------- |
| 4362 | FetchResult |
| 4363 | (url, title, desc, keywords, mime, bad, fetch_status) |
| 4364 | """ |
| 4365 | |
| 4366 | page_status = None |
| 4367 | page_url = url |
| 4368 | page_title = '' |
| 4369 | page_desc = '' |
| 4370 | page_keys = '' |
| 4371 | exception = False |
| 4372 | |
| 4373 | if is_nongeneric_url(url) or is_bad_url(url): |
| 4374 | return FetchResult(url, bad=True) |
| 4375 | |
| 4376 | if is_ignored_mime(url) or http_head: |
| 4377 | method = 'HEAD' |
| 4378 | else: |
| 4379 | method = 'GET' |
| 4380 | |
| 4381 | if not MYHEADERS: |
| 4382 | gen_headers() |
| 4383 | |
| 4384 | try: |
| 4385 | manager = get_PoolManager() |
| 4386 | |
| 4387 | while True: |
| 4388 | resp = manager.request(method, url, retries=Retry(redirect=10)) |
| 4389 | page_status = resp.status |
| 4390 | |
| 4391 | if resp.status == 200: |
| 4392 | if method == 'GET': |
| 4393 | for retry in resp.retries.history: |
| 4394 | if retry.status not in PERMANENT_REDIRECTS: |
| 4395 | break |
| 4396 | page_status, page_url = retry.status, retry.redirect_location |
| 4397 | page_title, page_desc, page_keys = get_data_from_page(resp) |
| 4398 | elif resp.status == 403 and url.endswith('/'): |
| 4399 | # HTTP response Forbidden |
| 4400 | # Handle URLs in the form of https://www.domain.com/ |
| 4401 | # which fail when trying to fetch resource '/' |
| 4402 | # retry without trailing '/' |
| 4403 | |
| 4404 | LOGDBG('Received status 403: retrying...') |
no test coverage detected