hub / github.com/simstudioai/sim / HtmlParser

Class HtmlParser

apps/sim/lib/file-parsers/html-parser.ts:9–283 · view source on GitHub ↗

Source from the content-addressed store, hash-verified

// Module-level logger labelled 'HtmlParser'; the parser methods below use it for info and error output.
7	const logger = createLogger('HtmlParser')
8
9	export class HtmlParser implements FileParser {
/**
 * Reads an HTML file from disk and hands its bytes to `parseBuffer`.
 *
 * @param filePath - Path of the HTML file to read; must be non-empty.
 * @returns The result produced by `parseBuffer` for the file's contents.
 * @throws Error with the prefix "Failed to parse HTML file:" when the path is
 *   empty or reading the file fails.
 */
async parseFile(filePath: string): Promise<FileParseResult> {
  try {
    if (!filePath) {
      throw new Error('No file path provided')
    }

    const buffer = await readFile(filePath)
    // Not awaited here, so a rejection from parseBuffer propagates unchanged
    // rather than being logged and re-wrapped by the catch below.
    return this.parseBuffer(buffer)
  } catch (error) {
    logger.error('HTML file error:', error)
    // A thrown value is not guaranteed to be an Error instance; narrow it
    // instead of asserting, so the message never reads "undefined" and a
    // thrown null/undefined cannot crash the handler itself.
    const reason = error instanceof Error ? error.message : String(error)
    throw new Error(`Failed to parse HTML file: ${reason}`)
  }
}
23
24	async parseBuffer(buffer: Buffer): Promise<FileParseResult> {
25	try {
26	logger.info('Parsing HTML buffer, size:', buffer.length)
27
28	const htmlContent = buffer.toString('utf-8')
29	const $ = cheerio.load(htmlContent)
30
31	// Extract meta information before removing tags
32	const title = $('title').text().trim()
33	const metaDescription = $('meta[name="description"]').attr('content') \|\| ''
34
35	$('script, style, noscript, meta, link, iframe, object, embed, svg').remove()
36
37	$.root()
38	.contents()
39	.filter(function () {
40	return this.type === 'comment'
41	})
42	.remove()
43
44	const content = this.extractStructuredText($)
45
46	const sanitizedContent = sanitizeTextForUTF8(content)
47
48	const characterCount = sanitizedContent.length
49	const wordCount = sanitizedContent.split(/\s+/).filter((word) => word.length > 0).length
50	const estimatedTokenCount = Math.ceil(characterCount / 4)
51
52	const headings = this.extractHeadings($)
53
54	const links = this.extractLinks($)
55
56	return {
57	content: sanitizedContent,
58	metadata: {
59	title,
60	metaDescription,
61	characterCount,
62	wordCount,
63	tokenCount: estimatedTokenCount,
64	headings,
65	links: links.slice(0, 50),
66	hasImages: $('img').length > 0,

Callers

nothing calls this directly

Calls

no outgoing calls

Tested by

no test coverage detected