Visits the next link in Crawler._queue. If the link is on a domain recently visited (< Crawler.delay) it is skipped. Parses the content at the link for new links and adds them to the queue, according to their Crawler.priority(). Visited links (and content) are passed to Crawler.visit().
(self, method=DEPTH, **kwargs)
| 3163 | return self.pop(remove=False) |
| 3164 | |
def crawl(self, method=DEPTH, **kwargs):
    """ Visits the next link in Crawler._queue.
        If the link is on a domain recently visited (< Crawler.delay) it is skipped.
        Parses the content at the link for new links and adds them to the queue,
        according to their Crawler.priority().
        Visited links (and content) are passed to Crawler.visit().
        Links that raise URLError, or whose MIME-type is not text/html,
        are passed to Crawler.fail() instead.
        The given method is forwarded to Crawler.priority();
        optional keyword arguments are forwarded to URL.download()
        (with "unicode" defaulting to True).
        Returns True if a link was processed (visited or failed),
        False if Crawler.pop() returned no link or the link was already visited.
    """
    link = self.pop()
    if link is None:
        return False
    if link.url in self.visited:
        # Nothing happened, we already visited this link.
        return False
    url = URL(link.url)
    if url.mimetype == "text/html":
        try:
            kwargs.setdefault("unicode", True)
            html = url.download(**kwargs)
            for new in self.parse(html, url=link.url):
                # abs() takes a base= keyword here, so it is the module's URL helper
                # rather than the builtin; relative links are resolved against
                # the redirected URL when there is one.
                new.url = abs(new.url, base=url.redirect or link.url)
                new.url = self.normalize(new.url)
                # 1) Parse new links from HTML web pages.
                # 2) Schedule unknown links for a visit.
                # 3) Only links that are not already queued are queued.
                # 4) Only links for which Crawler.follow() is True are queued.
                # 5) Only links on Crawler.domains are queued.
                if new.url in self.visited:
                    continue
                if new.url in self._queued:
                    continue
                # Explicit identity test: only a literal False rejects the link.
                if self.follow(new) is False:
                    continue
                # NOTE(review): this is a plain suffix match, so a domain that merely
                # ends with an allowed name would also pass - confirm this is intended.
                if self.domains and not base(new.url).endswith(tuple(self.domains)):
                    continue
                # 6) Limit the queue (remove tail), unless you are Google.
                #    Trimming only starts once the queue exceeds QUEUE by 25%,
                #    after which the lookup of queued URLs is rebuilt to match.
                if self.QUEUE is not None and \
                   self.QUEUE * 1.25 < len(self._queue):
                    self._queue = self._queue[:self.QUEUE]
                    self._queued.clear()
                    self._queued.update(dict((q[2].url, True) for q in self._queue))
                # 7) Position in the queue is determined by Crawler.priority().
                # 8) Equal ranks are sorted FIFO or FILO.
                self.push(new, priority=self.priority(new, method=method), sort=self.sort)
            self.visit(link, source=html)
        except URLError:
            # URL can not be reached (HTTP404NotFound, URLTimeout).
            self.fail(link)
    else:
        # URL MIME-type is not HTML, don't know how to handle.
        self.fail(link)
    # Log the current time visited for the domain (see Crawler.pop()).
    # Log the URL as visited (failed links are logged as well, so they are not retried).
    self.history[base(link.url)] = time.time()
    self.visited[link.url] = True
    return True
| 3221 | |
| 3222 | def normalize(self, url): |