({
urls, timeout, depth, saveToFile: saveToFile = false,
batchSize,
minPageCrawlTime,
maxPageCrawlTime,
program,
} = {})
| 1874 | } |
| 1875 | |
| 1876 | export async function startCrawl({ |
| 1877 | urls, timeout, depth, saveToFile: saveToFile = false, |
| 1878 | batchSize, |
| 1879 | minPageCrawlTime, |
| 1880 | maxPageCrawlTime, |
| 1881 | program, |
| 1882 | } = {}) { |
| 1883 | if ( State.crawling ) { |
| 1884 | console.log('Already crawling...'); |
| 1885 | return; |
| 1886 | } |
| 1887 | if ( saveToFile ) { |
| 1888 | logName = `crawl-${(new Date).toISOString()}.urls.txt`; |
| 1889 | logStream = Fs.createWriteStream(Path.resolve(args.CONFIG_DIR, logName), {flags:'as+'}); |
| 1890 | } |
| 1891 | console.log('StartCrawl', {urls, timeout, depth, batchSize, saveToFile, minPageCrawlTime, maxPageCrawlTime, program}); |
| 1892 | State.crawling = true; |
| 1893 | State.crawlDepth = depth; |
| 1894 | State.crawlTimeout = timeout; |
| 1895 | State.visited = new Set(); |
| 1896 | Object.assign(State,{ |
| 1897 | batchSize, |
| 1898 | minPageCrawlTime, |
| 1899 | maxPageCrawlTime |
| 1900 | }); |
| 1901 | const batch_sz = State.batchSize || BATCH_SIZE; |
| 1902 | let totalBytes = 0; |
| 1903 | setTimeout(async () => { |
| 1904 | try { |
| 1905 | while(urls.length >= batch_sz) { |
| 1906 | const jobs = []; |
| 1907 | const batch = urls.splice(urls.length-batch_sz,batch_sz); |
| 1908 | console.log({urls, batch}); |
| 1909 | for( let i = 0; i < batch_sz; i++ ) { |
| 1910 | const {depth,url} = batch.shift(); |
| 1911 | const pr = archiveAndIndexURL( |
| 1912 | url, |
| 1913 | {crawl: true, depth, timeout, createIfMissing:true, getLinks: depth >= 1, program} |
| 1914 | ); |
| 1915 | jobs.push(pr); |
| 1916 | } |
| 1917 | const links = (await Promise.all(jobs)).flat().filter(({url}) => !Q.has(url)); |
| 1918 | if ( links.length ) { |
| 1919 | urls.push(...links); |
| 1920 | links.forEach(({url}) => Q.add(url)); |
| 1921 | } |
| 1922 | } |
| 1923 | while(urls.length) { |
| 1924 | const {depth,url} = urls.pop(); |
| 1925 | const links = (await archiveAndIndexURL( |
| 1926 | url, |
| 1927 | {crawl: true, depth, timeout, createIfMissing:true, getLinks: depth >= 1, program} |
| 1928 | )).filter(({url}) => !Q.has(url)); |
| 1929 | console.log(links, Q); |
| 1930 | if ( links.length ) { |
| 1931 | urls.push(...links); |
| 1932 | links.forEach(({url}) => Q.add(url)); |
| 1933 | } |
no test coverage detected