MCPcopy
hub / github.com/simstudioai/sim / parseBuffer

Method parseBuffer

apps/sim/lib/file-parsers/html-parser.ts:24–78  ·  view source on GitHub ↗
(buffer: Buffer)

Source from the content-addressed store, hash-verified

22 }
23
/**
 * Parse an HTML document held in a buffer into plain text plus metadata.
 *
 * The buffer is decoded as UTF-8 and loaded with cheerio. The title and the
 * description meta tag are read first, then non-content elements are
 * removed before text, headings and links are extracted.
 *
 * @param buffer - Raw HTML bytes; decoded as UTF-8.
 * @returns The sanitized text content and metadata: title, description,
 *   character/word/token counts, headings, at most 50 links, and
 *   image/table/list presence flags and counts.
 * @throws Error with the message prefix "Failed to parse HTML buffer: " if
 *   any step fails; the underlying error is logged before rethrowing.
 */
async parseBuffer(buffer: Buffer): Promise<FileParseResult> {
  try {
    logger.info('Parsing HTML buffer, size:', buffer.length)

    const htmlContent = buffer.toString('utf-8')
    const $ = cheerio.load(htmlContent)

    // Extract meta information before removing tags: the <meta> elements
    // are deleted by the cleanup step below.
    const title = $('title').text().trim()
    const metaDescription = $('meta[name="description"]').attr('content') || ''

    $('script, style, noscript, meta, link, iframe, object, embed, svg').remove()

    // Drop comment nodes that are direct children of the document root.
    // NOTE(review): comments nested deeper in the tree are left in place;
    // confirm whether extractStructuredText relies on them being absent.
    $.root()
      .contents()
      .filter((_index, node) => node.type === 'comment')
      .remove()

    const content = this.extractStructuredText($)

    const sanitizedContent = sanitizeTextForUTF8(content)

    const characterCount = sanitizedContent.length
    const wordCount = sanitizedContent.split(/\s+/).filter((word) => word.length > 0).length
    // Rough estimate: one token per four characters, rounded up.
    const estimatedTokenCount = Math.ceil(characterCount / 4)

    const headings = this.extractHeadings($)

    const links = this.extractLinks($)

    // Query each element kind once; counts reflect the document after the
    // cleanup above (e.g. <svg> and <iframe> subtrees are already gone).
    const imageCount = $('img').length
    const tableCount = $('table').length
    const listCount = $('ul, ol').length

    return {
      content: sanitizedContent,
      metadata: {
        title,
        metaDescription,
        characterCount,
        wordCount,
        tokenCount: estimatedTokenCount,
        headings,
        // Cap the number of links carried in the metadata.
        links: links.slice(0, 50),
        hasImages: imageCount > 0,
        imageCount,
        hasTable: tableCount > 0,
        tableCount,
        hasList: listCount > 0,
        listCount,
      },
    }
  } catch (error) {
    logger.error('HTML buffer parsing error:', error)
    // A thrown value is not guaranteed to be an Error; narrow it so the
    // rethrown message never reads "... undefined".
    const reason = error instanceof Error ? error.message : String(error)
    throw new Error(`Failed to parse HTML buffer: ${reason}`)
  }
}
79
80 /**
81 * Extract structured text content preserving document hierarchy

Callers 1

parseFileMethod · 0.95

Calls 12

extractStructuredTextMethod · 0.95
extractHeadingsMethod · 0.95
extractLinksMethod · 0.95
sanitizeTextForUTF8Function · 0.90
infoMethod · 0.80
loadMethod · 0.80
textMethod · 0.80
attrMethod · 0.80
removeMethod · 0.80
errorMethod · 0.80
$Function · 0.50
toStringMethod · 0.45

Tested by

no test coverage detected