Entry point of a crawler. A spider consists of four modules: Downloader, Scheduler, PageProcessor and Pipeline. Each module is a field of Spider and is defined as an interface, so you can customize a spider by supplying different implementations of them. Examples: A simple craw
| 60 | * @since 0.1.0 |
| 61 | */ |
| 62 | public class Spider implements Runnable, Task { |
| 63 | |
| 64 | protected Downloader downloader; |
| 65 | /** Pipeline list; initialized to an empty ArrayList. */ |
| 66 | protected List<Pipeline> pipelines = new ArrayList<Pipeline>(); |
| 67 | |
| 68 | protected PageProcessor pageProcessor; |
| 69 | |
| 70 | protected List<Request> startRequests; |
| 71 | |
| 72 | protected Site site; |
| 73 | |
| 74 | protected String uuid; |
| 75 | |
| 76 | protected SpiderScheduler scheduler; |
| 77 | /** Logger obtained from getClass(), so a subclass logs under its own runtime class name. */ |
| 78 | protected Logger logger = LoggerFactory.getLogger(getClass()); |
| 79 | |
| 80 | protected CountableThreadPool threadPool; |
| 81 | |
| 82 | protected ExecutorService executorService; |
| 83 | /** Defaults to 1. NOTE(review): presumably the worker-thread count used to size threadPool; confirm. */ |
| 84 | protected int threadNum = 1; |
| 85 | /** State holder; starts at STAT_INIT. NOTE(review): presumably only ever holds a STAT_* value; confirm. */ |
| 86 | protected AtomicInteger stat = new AtomicInteger(STAT_INIT); |
| 87 | /** Defaults to true; declared volatile. NOTE(review): presumably read/written across threads; confirm. */ |
| 88 | protected volatile boolean exitWhenComplete = true; |
| 89 | /** State codes: STAT_INIT = 0, STAT_RUNNING = 1, STAT_STOPPED = 2. */ |
| 90 | protected final static int STAT_INIT = 0; |
| 91 | |
| 92 | protected final static int STAT_RUNNING = 1; |
| 93 | |
| 94 | protected final static int STAT_STOPPED = 2; |
| 95 | /** Defaults to true. */ |
| 96 | protected boolean spawnUrl = true; |
| 97 | /** Defaults to true. */ |
| 98 | protected boolean destroyWhenExit = true; |
| 99 | /** Not initialized here, so it is null until assigned elsewhere. */ |
| 100 | private List<SpiderListener> spiderListeners; |
| 101 | /** Counter starting at 0. NOTE(review): the increment site is not visible in this chunk. */ |
| 102 | private final AtomicLong pageCount = new AtomicLong(0); |
| 103 | |
| 104 | private Date startTime; |
| 105 | /** Defaults to 30000. NOTE(review): unit is presumably milliseconds; confirm against usage. */ |
| 106 | private long emptySleepTime = 30000; |
| 107 | |
| 108 | /** |
| 109 | * Static factory: creates a new Spider by passing the given pageProcessor to the Spider constructor. |
| 110 | * |
| 111 | * @param pageProcessor the PageProcessor handed to the new Spider's constructor |
| 112 | * @return a newly constructed Spider |
| 113 | * @see PageProcessor |
| 114 | */ |
| 115 | public static Spider create(PageProcessor pageProcessor) { |
| 116 | return new Spider(pageProcessor); |
| 117 | } |
| 118 | |
| 119 | /** |
nothing calls this directly
no outgoing calls
no test coverage detected