// Module-level logger tagged 'HtmlParser'; the parser methods below use it
// for their info and error messages.
const logger = createLogger('HtmlParser')
| 8 | |
| 9 | export class HtmlParser implements FileParser { |
/**
 * Reads the HTML file at `filePath` and hands its bytes to `parseBuffer`.
 *
 * @param filePath - Path of the HTML file to read; must be a non-empty string.
 * @returns The result produced by `parseBuffer` for the file's contents.
 * @throws Error with a `Failed to parse HTML file: ...` message when the path
 *   is empty or reading the file fails.
 */
async parseFile(filePath: string): Promise<FileParseResult> {
  try {
    if (!filePath) {
      throw new Error('No file path provided')
    }

    const buffer = await readFile(filePath)
    // The promise is returned without `await`, so a rejection coming from
    // parseBuffer reaches the caller unchanged and does not pass through the
    // catch block below (no second log entry, no second message prefix).
    return this.parseBuffer(buffer)
  } catch (error) {
    logger.error('HTML file error:', error)
    // Anything can be thrown in JavaScript; narrow before reading `.message`
    // so a non-Error value does not produce "...: undefined".
    const reason = error instanceof Error ? error.message : String(error)
    throw new Error(`Failed to parse HTML file: ${reason}`)
  }
}
| 23 | |
| 24 | async parseBuffer(buffer: Buffer): Promise<FileParseResult> { |
| 25 | try { |
| 26 | logger.info('Parsing HTML buffer, size:', buffer.length) |
| 27 | |
| 28 | const htmlContent = buffer.toString('utf-8') |
| 29 | const $ = cheerio.load(htmlContent) |
| 30 | |
| 31 | // Extract meta information before removing tags |
| 32 | const title = $('title').text().trim() |
| 33 | const metaDescription = $('meta[name="description"]').attr('content') || '' |
| 34 | |
| 35 | $('script, style, noscript, meta, link, iframe, object, embed, svg').remove() |
| 36 | |
| 37 | $.root() |
| 38 | .contents() |
| 39 | .filter(function () { |
| 40 | return this.type === 'comment' |
| 41 | }) |
| 42 | .remove() |
| 43 | |
| 44 | const content = this.extractStructuredText($) |
| 45 | |
| 46 | const sanitizedContent = sanitizeTextForUTF8(content) |
| 47 | |
| 48 | const characterCount = sanitizedContent.length |
| 49 | const wordCount = sanitizedContent.split(/\s+/).filter((word) => word.length > 0).length |
| 50 | const estimatedTokenCount = Math.ceil(characterCount / 4) |
| 51 | |
| 52 | const headings = this.extractHeadings($) |
| 53 | |
| 54 | const links = this.extractLinks($) |
| 55 | |
| 56 | return { |
| 57 | content: sanitizedContent, |
| 58 | metadata: { |
| 59 | title, |
| 60 | metaDescription, |
| 61 | characterCount, |
| 62 | wordCount, |
| 63 | tokenCount: estimatedTokenCount, |
| 64 | headings, |
| 65 | links: links.slice(0, 50), |
| 66 | hasImages: $('img').length > 0, |
nothing calls this directly
no outgoing calls
no test coverage detected