Returns a generator that yields (Link, source)-tuples of visited pages. When the crawler is busy, it yields (None, None). When the crawler is done, the generator is exhausted and iteration stops.
(links=[], domains=[], delay=20.0, parser=HTMLLinkParser().parse, sort=FIFO, method=DEPTH, **kwargs)
# Functional approach to crawling.

def crawl(links=None, domains=None, delay=20.0, parser=HTMLLinkParser().parse, sort=FIFO, method=DEPTH, **kwargs):
    """ Returns a generator that yields (Link, source)-tuples of visited pages.
        When the crawler is busy, it yields (None, None).
        When the crawler is done, the generator is exhausted (iteration simply stops).
        - links, domains, delay, parser, sort: passed on to Crawler() in that order
          (links and domains default to a new empty list when None),
        - method: passed on to Crawler.crawl() on each step,
        - kwargs: extra keyword arguments (e.g., throttle) passed on to Crawler.crawl().
    """
    # The scenarios below define "busy":
    # - crawl(delay=10, throttle=0)
    #   The crawler will wait 10 seconds before visiting the same subdomain.
    #   The crawler will not throttle downloads, so the next link is visited instantly.
    #   So sometimes (None, None) is returned while it waits for an available subdomain.
    # - crawl(delay=0, throttle=10)
    #   The crawler will halt 10 seconds after each visit.
    #   The crawler will not delay before visiting the same subdomain.
    #   So usually a result is returned each crawl.next(), but each call takes 10 seconds.
    # - asynchronous(crawl().next)
    #   AsynchronousRequest.value is set to (Link, source) once AsynchronousRequest.done=True.
    #   The program will not halt in the meantime (i.e., the next crawl is threaded).
    # None is used as default instead of a shared mutable [],
    # so that separate crawl() calls never share the same list object.
    if links is None:
        links = []
    if domains is None:
        domains = []
    crawler = Crawler(links, domains, delay, parser, sort)
    # Define Crawler.visit() on-the-fly:
    # it stores the visited (link, source) on the crawler so it can be yielded below.
    bind(crawler, "visit",
        lambda crawler, link, source=None:
            setattr(crawler, "crawled", (link, source)))
    while not crawler.done:
        # Reset before each step: if this step does not call visit(),
        # (None, None) is yielded (i.e., the crawler is "busy").
        crawler.crawled = (None, None)
        crawler.crawl(method, **kwargs)
        yield crawler.crawled

#for link, source in crawl("http://www.nodebox.net/", delay=0, throttle=10):
#    print link
nothing calls this directly
no test coverage detected
searching dependent graphs…