MCPcopy
hub / github.com/simstudioai/sim / HtmlParser

Class HtmlParser

apps/sim/lib/file-parsers/html-parser.ts:9–283  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

// Module-scoped logger created with the label 'HtmlParser'; the parser methods below use it for info and error output.
7const logger = createLogger('HtmlParser')
8
9export class HtmlParser implements FileParser {
/**
 * Reads an HTML file from disk and hands its bytes to `parseBuffer`.
 *
 * @param filePath - Path of the HTML file to read; an empty value is rejected.
 * @returns The result produced by `parseBuffer` for the file's contents.
 * @throws Error with a `Failed to parse HTML file: …` message when the path
 *   is empty or `readFile` rejects.
 */
async parseFile(filePath: string): Promise<FileParseResult> {
  try {
    if (!filePath) {
      throw new Error('No file path provided')
    }

    const buffer = await readFile(filePath)
    // Returned without `await`: a rejection from parseBuffer reaches the
    // caller as-is rather than being re-wrapped by the catch below.
    return this.parseBuffer(buffer)
  } catch (error) {
    logger.error('HTML file error:', error)
    // Narrow instead of asserting: a thrown non-Error value has no `.message`,
    // which would otherwise surface as "undefined" in the wrapped message.
    const reason = error instanceof Error ? error.message : String(error)
    throw new Error(`Failed to parse HTML file: ${reason}`)
  }
}
23
24 async parseBuffer(buffer: Buffer): Promise<FileParseResult> {
25 try {
26 logger.info('Parsing HTML buffer, size:', buffer.length)
27
28 const htmlContent = buffer.toString('utf-8')
29 const $ = cheerio.load(htmlContent)
30
31 // Extract meta information before removing tags
32 const title = $('title').text().trim()
33 const metaDescription = $('meta[name="description"]').attr('content') || ''
34
35 $('script, style, noscript, meta, link, iframe, object, embed, svg').remove()
36
37 $.root()
38 .contents()
39 .filter(function () {
40 return this.type === 'comment'
41 })
42 .remove()
43
44 const content = this.extractStructuredText($)
45
46 const sanitizedContent = sanitizeTextForUTF8(content)
47
48 const characterCount = sanitizedContent.length
49 const wordCount = sanitizedContent.split(/\s+/).filter((word) => word.length > 0).length
50 const estimatedTokenCount = Math.ceil(characterCount / 4)
51
52 const headings = this.extractHeadings($)
53
54 const links = this.extractLinks($)
55
56 return {
57 content: sanitizedContent,
58 metadata: {
59 title,
60 metaDescription,
61 characterCount,
62 wordCount,
63 tokenCount: estimatedTokenCount,
64 headings,
65 links: links.slice(0, 50),
66 hasImages: $('img').length > 0,

Callers

nothing calls this directly

Calls

no outgoing calls

Tested by

no test coverage detected