MCPcopy
hub / github.com/steel-dev/steel-browser / buildHtmlLikeMetadataFromPdf

Function buildHtmlLikeMetadataFromPdf

api/src/utils/scrape/pdfToHtml.ts:57–128  ·  view source on GitHub ↗
(
  pdfMeta: any,
  opts: { urlSource?: string | null; statusCode?: number; htmlForFallback?: string | null },
)

Source from the content-addressed store, hash-verified

55};
56
57export function buildHtmlLikeMetadataFromPdf(
58 pdfMeta: any,
59 opts: { urlSource?: string | null; statusCode?: number; htmlForFallback?: string | null },
60): HtmlLikeMetadata {
61 const { urlSource = null, statusCode = 200, htmlForFallback = null } = opts;
62
63 // Try to get a title from meta, fallback to <title> in converted HTML
64 let htmlTitle: string | null = null;
65 if (htmlForFallback) {
66 const $ = loadHtml(htmlForFallback);
67 const t = $("title").first().text()?.trim();
68 htmlTitle = t || null;
69 }
70
71 const title = pdfMeta?.title || htmlTitle || null;
72 const author = pdfMeta?.author || null;
73 const description = pdfMeta?.subject || null;
74
75 // Keywords might be array or string depending on library
76 let keywords: string | null = null;
77 if (Array.isArray(pdfMeta?.keywords)) {
78 keywords = pdfMeta.keywords.join(", ");
79 } else if (typeof pdfMeta?.keywords === "string") {
80 keywords = pdfMeta.keywords;
81 }
82
83 // XMP/DC language if exposed; often not present
84 const language = pdfMeta?.language || pdfMeta?.["dc:language"] || null;
85
86 const publishedTime =
87 parsePdfDate(pdfMeta?.creationDate || pdfMeta?.CreationDate || pdfMeta?.["xmp:CreateDate"]) ||
88 null;
89 const modifiedTime =
90 parsePdfDate(pdfMeta?.modDate || pdfMeta?.ModDate || pdfMeta?.["xmp:ModifyDate"]) || null;
91
92 let origin: string | null = null;
93 let host: string | null = null;
94 if (urlSource) {
95 try {
96 const u = new URL(urlSource);
97 origin = u.origin;
98 host = u.hostname;
99 } catch {}
100 }
101
102 return {
103 title,
104 language,
105 urlSource,
106 timestamp: new Date().toISOString(),
107
108 description,
109 keywords,
110 author,
111
112 ogTitle: title,
113 ogDescription: description,
114 ogImage: null,

Callers 1

handleScrape · Function · 0.85

Calls 1

parsePdfDate · Function · 0.85

Tested by

no test coverage detected