MCPcopy
hub / github.com/DO-SAY-GO/dn / startCrawl

Function startCrawl

src/archivist.js:1876–1952  ·  view source on GitHub ↗
({
    urls, timeout, depth, saveToFile: saveToFile = false,
    batchSize,
    minPageCrawlTime, 
    maxPageCrawlTime,
    program,
  } = {})

Source from the content-addressed store, hash-verified

1874 }
1875
1876 export async function startCrawl({
1877 urls, timeout, depth, saveToFile: saveToFile = false,
1878 batchSize,
1879 minPageCrawlTime,
1880 maxPageCrawlTime,
1881 program,
1882 } = {}) {
1883 if ( State.crawling ) {
1884 console.log('Already crawling...');
1885 return;
1886 }
1887 if ( saveToFile ) {
1888 logName = `crawl-${(new Date).toISOString()}.urls.txt`;
1889 logStream = Fs.createWriteStream(Path.resolve(args.CONFIG_DIR, logName), {flags:'as+'});
1890 }
1891 console.log('StartCrawl', {urls, timeout, depth, batchSize, saveToFile, minPageCrawlTime, maxPageCrawlTime, program});
1892 State.crawling = true;
1893 State.crawlDepth = depth;
1894 State.crawlTimeout = timeout;
1895 State.visited = new Set();
1896 Object.assign(State,{
1897 batchSize,
1898 minPageCrawlTime,
1899 maxPageCrawlTime
1900 });
1901 const batch_sz = State.batchSize || BATCH_SIZE;
1902 let totalBytes = 0;
1903 setTimeout(async () => {
1904 try {
1905 while(urls.length >= batch_sz) {
1906 const jobs = [];
1907 const batch = urls.splice(urls.length-batch_sz,batch_sz);
1908 console.log({urls, batch});
1909 for( let i = 0; i < batch_sz; i++ ) {
1910 const {depth,url} = batch.shift();
1911 const pr = archiveAndIndexURL(
1912 url,
1913 {crawl: true, depth, timeout, createIfMissing:true, getLinks: depth >= 1, program}
1914 );
1915 jobs.push(pr);
1916 }
1917 const links = (await Promise.all(jobs)).flat().filter(({url}) => !Q.has(url));
1918 if ( links.length ) {
1919 urls.push(...links);
1920 links.forEach(({url}) => Q.add(url));
1921 }
1922 }
1923 while(urls.length) {
1924 const {depth,url} = urls.pop();
1925 const links = (await archiveAndIndexURL(
1926 url,
1927 {crawl: true, depth, timeout, createIfMissing:true, getLinks: depth >= 1, program}
1928 )).filter(({url}) => !Q.has(url));
1929 console.log(links, Q);
1930 if ( links.length ) {
1931 urls.push(...links);
1932 links.forEach(({url}) => Q.add(url));
1933 }

Callers 1

addHandlers · Function · 0.90

Calls 2

untilTrue · Function · 0.90
archiveAndIndexURL · Function · 0.85

Tested by

no test coverage detected