MCPcopy
hub / github.com/square/retrofit / Crawler

Class Crawler

samples/src/main/java/com/example/retrofit/Crawler.java:48–158  ·  view source on GitHub ↗

A simple web crawler that uses a Retrofit service to turn URLs into webpages.

Source from the content-addressed store, hash-verified

46
47/** A simple web crawler that uses a Retrofit service to turn URLs into webpages. */
48public final class Crawler {
49 private final Set<HttpUrl> fetchedUrls =
50 Collections.synchronizedSet(new LinkedHashSet<HttpUrl>());
51 private final ConcurrentHashMap<String, AtomicInteger> hostnames = new ConcurrentHashMap<>();
52 private final PageService pageService;
53
/**
 * Creates a crawler that fetches pages through the given service.
 *
 * @param pageService the service used to fetch pages; stored as-is, with no null check
 */
54 public Crawler(PageService pageService) {
55 this.pageService = pageService;
56 }
57
58 public void crawlPage(HttpUrl url) {
59 // Skip hosts that we've visited many times.
60 AtomicInteger hostnameCount = new AtomicInteger();
61 AtomicInteger previous = hostnames.putIfAbsent(url.host(), hostnameCount);
62 if (previous != null) hostnameCount = previous;
63 if (hostnameCount.incrementAndGet() > 100) return;
64
65 // Asynchronously visit URL.
66 pageService
67 .get(url)
68 .enqueue(
69 new Callback<Page>() {
70 @Override
71 public void onResponse(Call<Page> call, Response<Page> response) {
72 if (!response.isSuccessful()) {
73 System.out.println(call.request().url() + ": failed: " + response.code());
74 return;
75 }
76
77 // Print this page's URL and title.
78 Page page = response.body();
79 HttpUrl base = response.raw().request().url();
80 System.out.println(base + ": " + page.title);
81
82 // Enqueue its links for visiting.
83 for (String link : page.links) {
84 HttpUrl linkUrl = base.resolve(link);
85 if (linkUrl != null && fetchedUrls.add(linkUrl)) {
86 crawlPage(linkUrl);
87 }
88 }
89 }
90
91 @Override
92 public void onFailure(Call<Page> call, Throwable t) {
93 System.out.println(call.request().url() + ": failed: " + t);
94 }
95 });
96 }
97
98 public static void main(String... args) throws Exception {
99 Dispatcher dispatcher = new Dispatcher(Executors.newFixedThreadPool(20));
100 dispatcher.setMaxRequests(20);
101 dispatcher.setMaxRequestsPerHost(1);
102
103 OkHttpClient okHttpClient =
104 new OkHttpClient.Builder()
105 .dispatcher(dispatcher)

Callers

nothing calls this directly

Calls

no outgoing calls

Tested by

no test coverage detected

Used in the wild: real call sites across dependent graphs

searching dependent graphs…