| 323 | } |
| 324 | |
| 325 | export async function readUrl({ |
| 326 | url, |
| 327 | max_chars = DEFAULT_MAX_CHARS, |
| 328 | fetch: fetchImpl = globalThis.fetch, |
| 329 | }: { |
| 330 | url: string |
| 331 | max_chars?: number |
| 332 | fetch?: FetchLike |
| 333 | }): Promise<ReadUrlOutput> { |
| 334 | let parsedUrl: URL |
| 335 | try { |
| 336 | parsedUrl = new URL(url) |
| 337 | } catch { |
| 338 | return errorResult(url, 'Invalid URL') |
| 339 | } |
| 340 | |
| 341 | if (!isAllowedUrl(parsedUrl)) { |
| 342 | return errorResult(url, 'Only http:// and https:// URLs are supported') |
| 343 | } |
| 344 | |
| 345 | const controller = new AbortController() |
| 346 | const timeout = setTimeout(() => controller.abort(), FETCH_TIMEOUT_MS) |
| 347 | |
| 348 | try { |
| 349 | const response = await fetchImpl(parsedUrl.toString(), { |
| 350 | redirect: 'follow', |
| 351 | signal: controller.signal, |
| 352 | headers: { |
| 353 | accept: |
| 354 | 'text/html,application/xhtml+xml,application/json,text/plain;q=0.9,*/*;q=0.8', |
| 355 | 'accept-language': 'en-US,en;q=0.9', |
| 356 | 'user-agent': USER_AGENT, |
| 357 | }, |
| 358 | }) |
| 359 | |
| 360 | if (!response.ok) { |
| 361 | return errorResult( |
| 362 | url, |
| 363 | `Failed to fetch URL: ${response.status} ${response.statusText}`, |
| 364 | ) |
| 365 | } |
| 366 | |
| 367 | const contentType = getHeader(response.headers, 'content-type') ?? '' |
| 368 | if (contentType && !isSupportedContentType(contentType)) { |
| 369 | return errorResult( |
| 370 | url, |
| 371 | `Unsupported content type: ${contentType || 'unknown'}`, |
| 372 | ) |
| 373 | } |
| 374 | |
| 375 | const body = await readResponseBody(response, MAX_RESPONSE_BYTES) |
| 376 | const extracted = extractTextByContentType(contentType, body) |
| 377 | const truncated = truncateText(extracted.text, max_chars) |
| 378 | |
| 379 | if (!truncated.text) { |
| 380 | return errorResult(url, 'No readable text found at URL') |
| 381 | } |
| 382 | |