Make a GET request with response size limiting: a response whose Content-Length exceeds the configured maximum is skipped.
(
self,
url: str,
params: Optional[Dict] = None,
headers: Optional[Dict] = None,
allow_redirects: bool = True,
**kwargs
)
| 133 | } |
| 134 | |
| 135 | def get( |
| 136 | self, |
| 137 | url: str, |
| 138 | params: Optional[Dict] = None, |
| 139 | headers: Optional[Dict] = None, |
| 140 | allow_redirects: bool = True, |
| 141 | **kwargs |
| 142 | ) -> Optional[requests.Response]: |
| 143 | """Make GET request with response size limiting.""" |
| 144 | try: |
| 145 | if self.rate_limiter: |
| 146 | self.rate_limiter.acquire() |
| 147 | if self.jitter_max > 0: |
| 148 | time.sleep(random.uniform(self.jitter_min, self.jitter_max)) |
| 149 | if self.ua_rotation: |
| 150 | self.session.headers['User-Agent'] = random.choice(self.ua_pool) |
| 151 | if self.proxy_pool: |
| 152 | proxy = self.proxy_pool[self.proxy_index % len(self.proxy_pool)] |
| 153 | self.session.proxies = {'http': proxy, 'https': proxy} |
| 154 | self.proxy_index += 1 |
| 155 | # Stream response to check size |
| 156 | response = self.session.get( |
| 157 | url, |
| 158 | params=params, |
| 159 | headers=headers, |
| 160 | timeout=self.timeout, |
| 161 | verify=self.verify_ssl, |
| 162 | allow_redirects=allow_redirects, |
| 163 | stream=True, # Enable streaming to check content length |
| 164 | **kwargs |
| 165 | ) |
| 166 | |
| 167 | # Check content length before downloading |
| 168 | content_length = response.headers.get('Content-Length') |
| 169 | if content_length and int(content_length) > self.max_response_size: |
| 170 | logger.warning( |
| 171 | f"Response too large for {url}: {content_length} bytes " |
| 172 | f"(max: {self.max_response_size}). Skipping." |
| 173 | ) |
| 174 | response.close() |
| 175 | return None |
| 176 | |
| 177 | # Read response in chunks with size limit and timeout protection |
| 178 | content = b'' |
| 179 | start_time = time.time() |
| 180 | read_timeout = self.timeout * 2 # Give extra time for reading (2x request timeout) |
| 181 | |
| 182 | for chunk in response.iter_content(chunk_size=8192): |
| 183 | # Check if reading is taking too long |
| 184 | if time.time() - start_time > read_timeout: |
| 185 | logger.warning( |
| 186 | f"Response reading timeout exceeded for {url} after {read_timeout}s. Truncating." |
| 187 | ) |
| 188 | response.close() |
| 189 | if content: |
| 190 | response._content = content |
| 191 | return response |
| 192 | return None |