MCPcopy
hub / github.com/steel-dev/steel-browser / handleScrape

Function handleScrape

api/src/modules/actions/actions.controller.ts:22–338  ·  view source on GitHub ↗
(
  sessionService: SessionService,
  browserService: CDPService,
  request: ScrapeRequest,
  reply: FastifyReply,
)

Source from the content-addressed store, hash-verified

20import { safeGoto } from "../../utils/scrape/safeGoTo.js";
21
22export const handleScrape = async (
23 sessionService: SessionService,
24 browserService: CDPService,
25 request: ScrapeRequest,
26 reply: FastifyReply,
27) => {
28 const startTime = Date.now();
29 let times: Record<string, number> = {};
30 const { url, format, screenshot, pdf, proxyUrl, logUrl, delay, removeBase64Images } =
31 request.body;
32
33 let proxy: IProxyServer | null = null;
34 let context: BrowserContext | null = null;
35
36 try {
37 if (proxyUrl) {
38 proxy = await sessionService.proxyFactory(proxyUrl);
39 await proxy.listen();
40 }
41 times.proxyTime = Date.now() - startTime;
42
43 let page: Page;
44 let response: HTTPResponse | null = null;
45 let pdfResponse: HTTPResponse | null = null;
46 let isPdfNavigation = false;
47
48 if (!browserService.isRunning()) {
49 await browserService.launch();
50 }
51
52 if (proxy) {
53 // If a proxy is used, we proceed with browser navigation; implementing proxy-aware Node fetch
54 // would require an HTTP agent and is outside current scope.
55 context = await browserService.createBrowserContext(proxy.url);
56 page = await context.newPage();
57 times.proxyPageTime = Date.now() - startTime - times.proxyTime;
58 } else {
59 page = await browserService.getPrimaryPage();
60 times.pageTime = Date.now() - startTime - times.proxyTime;
61 }
62
63 // PDF retrieval will use node fetch with session cookies; removed CDP tracking
64
65 let normalizedUrl: string | null = null;
66 if (url) {
67 normalizedUrl = normalizeUrl(url);
68 if (!normalizedUrl) {
69 throw new Error(`Invalid URL: ${url}`);
70 }
71 }
72
73 const safeResponse = normalizedUrl
74 ? await safeGoto(page, normalizedUrl, {
75 timeout: 30000,
76 waitUntil: "domcontentloaded",
77 })
78 : { response: null, isPdf: false, pdfResponse: null };
79

Callers 2

routes · Function · 0.85
routes · Function · 0.85

Calls 15

isJsonContentType · Function · 0.90
jsonToMarkdown · Function · 0.90
normalizeUrl · Function · 0.85
safeGoto · Function · 0.85
convertPdfWithMupdf · Function · 0.85
getMetaByName · Function · 0.85
getMetaByProperty · Function · 0.85
extractJsonLd · Function · 0.85
cleanHtml · Function · 0.85
getDefuddleContent · Function · 0.85
stripBase64Images · Function · 0.85

Tested by

no test coverage detected