MCPcopy Index your code
hub / github.com/clips/pattern / crawl

Method crawl

pattern/web/__init__.py:3165–3220  ·  view source on GitHub ↗

Visits the next link in Crawler._queue. If the link is on a domain recently visited (< Crawler.delay) it is skipped. Parses the content at the link for new links and adds them to the queue, according to their Crawler.priority(). Visited links (and content) are passed to Crawler.visit().

(self, method=DEPTH, **kwargs)

Source from the content-addressed store, hash-verified

3163 return self.pop(remove=False)
3164
def crawl(self, method=DEPTH, **kwargs):
    """ Visits the next link in Crawler._queue.
        If the link is on a domain recently visited (< Crawler.delay) it is skipped.
        Parses the content at the link for new links and adds them to the queue,
        according to their Crawler.priority().
        Visited links (and content) are passed to Crawler.visit().
        Links that can not be downloaded (URLError) or whose MIME-type is not
        "text/html" are passed to Crawler.fail() instead.
        The given method is forwarded to Crawler.priority().
        Optional keyword arguments are forwarded to URL.download()
        ("unicode" defaults to True if not given).
        Returns True if a link was processed (visited or failed),
        False if there was no link to process or the link was already visited.
    """
    link = self.pop()
    if link is None:
        return False
    if link.url in self.visited:
        # Nothing happened, we already visited this link.
        return False
    url = URL(link.url)
    if url.mimetype == "text/html":
        try:
            kwargs.setdefault("unicode", True)
            html = url.download(**kwargs)
            for new in self.parse(html, url=link.url):
                # Resolve the new link against the page's final location (after
                # a redirect, if any). Note: abs() is called with a base keyword,
                # so this is the module's URL helper, not the builtin abs().
                new.url = abs(new.url, base=url.redirect or link.url)
                new.url = self.normalize(new.url)
                # 1) Parse new links from HTML web pages.
                # 2) Schedule unknown links for a visit.
                # 3) Only links that are not already queued are queued.
                # 4) Only links for which Crawler.follow() is True are queued.
                # 5) Only links on Crawler.domains are queued.
                if new.url == link.url:
                    # The page links to itself. It is only logged in
                    # Crawler.visited after this loop, so the check below
                    # would not catch it and it could be queued once more.
                    continue
                if new.url in self.visited:
                    continue
                if new.url in self._queued:
                    continue
                if self.follow(new) is False:
                    continue
                if self.domains and not base(new.url).endswith(tuple(self.domains)):
                    continue
                # 6) Limit the queue (remove tail), unless you are Google.
                #    The queue may grow to 1.25 x Crawler.QUEUE before it is cut
                #    back to Crawler.QUEUE items, so it is not sliced on each push.
                #    Crawler._queued is then rebuilt from the remaining items
                #    (the link is the third element of each queue item).
                if self.QUEUE is not None and \
                   self.QUEUE * 1.25 < len(self._queue):
                    self._queue = self._queue[:self.QUEUE]
                    self._queued.clear()
                    self._queued.update(dict((q[2].url, True) for q in self._queue))
                # 7) Position in the queue is determined by Crawler.priority().
                # 8) Equal ranks are sorted FIFO or FILO.
                self.push(new, priority=self.priority(new, method=method), sort=self.sort)
            self.visit(link, source=html)
        except URLError:
            # URL can not be reached (HTTP404NotFound, URLTimeout).
            self.fail(link)
    else:
        # URL MIME-type is not HTML, don't know how to handle.
        self.fail(link)
    # Log the current time visited for the domain (see Crawler.pop()).
    # Log the URL as visited (this also applies to links that failed).
    self.history[base(link.url)] = time.time()
    self.visited[link.url] = True
    return True
3221
3222 def normalize(self, url):

Callers 5

crawlFunction · 0.95
test_crawler_crawlMethod · 0.95
test_crawler_delayMethod · 0.95
test_crawler_breadthMethod · 0.95
13-crawler.pyFile · 0.80

Calls 15

popMethod · 0.95
downloadMethod · 0.95
normalizeMethod · 0.95
followMethod · 0.95
pushMethod · 0.95
priorityMethod · 0.95
visitMethod · 0.95
failMethod · 0.95
URLClass · 0.85
lenFunction · 0.85
absFunction · 0.70
baseFunction · 0.70

Tested by 3

test_crawler_crawlMethod · 0.76
test_crawler_delayMethod · 0.76
test_crawler_breadthMethod · 0.76