(buffer: Buffer)
| 22 | } |
| 23 | |
/**
 * Parse an in-memory HTML document into plain text plus descriptive metadata.
 *
 * The buffer is decoded as UTF-8 and loaded with cheerio. The title and meta
 * description are captured first, then non-content elements (script, style,
 * noscript, meta, link, iframe, object, embed, svg) and comment nodes are
 * removed before text, headings and links are extracted.
 *
 * @param buffer - Raw HTML bytes; decoded as UTF-8.
 * @returns The sanitized text content and metadata: title, meta description,
 *   character count, word count, an estimated token count (characters / 4,
 *   rounded up), headings, at most 50 links, and presence/count of images,
 *   tables and lists.
 * @throws Error whose message is prefixed with "Failed to parse HTML buffer: "
 *   when any step of the parsing fails.
 */
async parseBuffer(buffer: Buffer): Promise<FileParseResult> {
  // Upper bound on the number of links reported in the metadata.
  const maxLinks = 50

  try {
    logger.info('Parsing HTML buffer, size:', buffer.length)

    const htmlContent = buffer.toString('utf-8')
    const $ = cheerio.load(htmlContent)

    // Capture meta information before the tags carrying it are stripped below.
    // Inline <svg> graphics can carry their own <title>; exclude those and take
    // only the first remaining match so the page title is not a concatenation
    // of every <title> element in the document.
    const title = $('title').not('svg title').first().text().trim()
    const metaDescription = $('meta[name="description"]').attr('content') || ''

    $('script, style, noscript, meta, link, iframe, object, embed, svg').remove()

    // Remove comment nodes at every depth; contents() of the root alone would
    // only reach comments that are direct children of the document.
    $.root()
      .find('*')
      .addBack()
      .contents()
      .filter((_, node) => node.type === 'comment')
      .remove()

    const content = this.extractStructuredText($)

    const sanitizedContent = sanitizeTextForUTF8(content)

    const characterCount = sanitizedContent.length
    const wordCount = sanitizedContent.split(/\s+/).filter((word) => word.length > 0).length
    // Rough heuristic: about four characters per token.
    const estimatedTokenCount = Math.ceil(characterCount / 4)

    const headings = this.extractHeadings($)

    const links = this.extractLinks($)

    // Query each selector once and derive both the flag and the count from it.
    const imageCount = $('img').length
    const tableCount = $('table').length
    const listCount = $('ul, ol').length

    return {
      content: sanitizedContent,
      metadata: {
        title,
        metaDescription,
        characterCount,
        wordCount,
        tokenCount: estimatedTokenCount,
        headings,
        links: links.slice(0, maxLinks),
        hasImages: imageCount > 0,
        imageCount,
        hasTable: tableCount > 0,
        tableCount,
        hasList: listCount > 0,
        listCount,
      },
    }
  } catch (error) {
    logger.error('HTML buffer parsing error:', error)
    // The caught value is not guaranteed to be an Error; narrow before reading
    // .message so non-Error throwables still produce a meaningful reason.
    const reason = error instanceof Error ? error.message : String(error)
    throw new Error(`Failed to parse HTML buffer: ${reason}`)
  }
}
| 79 | |
| 80 | /** |
| 81 | * Extract structured text content preserving document hierarchy |
no test coverage detected