| 18 | } |
| 19 | |
| 20 | export class DocxParser implements FileParser { |
/**
 * Reads a DOCX file from disk and parses its bytes via `parseBuffer`.
 *
 * @param filePath - Path of the DOCX file to read; must be non-empty.
 * @returns The result produced by `parseBuffer` for the file's contents.
 * @throws Error whose message starts with "Failed to parse DOCX file:" when
 *   the path is empty or reading the file fails.
 */
async parseFile(filePath: string): Promise<FileParseResult> {
  try {
    if (!filePath) {
      throw new Error('No file path provided')
    }

    const buffer = await readFile(filePath)
    // Returned without `await`, so a rejection from parseBuffer propagates to
    // the caller unchanged rather than passing through the catch below.
    return this.parseBuffer(buffer)
  } catch (error) {
    logger.error('DOCX file error:', error)
    // A thrown value is not guaranteed to be an Error instance; narrow it so
    // the message never ends in "undefined" for non-Error throwables.
    const reason = error instanceof Error ? error.message : String(error)
    throw new Error(`Failed to parse DOCX file: ${reason}`)
  }
}
| 34 | |
| 35 | async parseBuffer(buffer: Buffer): Promise<FileParseResult> { |
| 36 | try { |
| 37 | if (!buffer || buffer.length === 0) { |
| 38 | throw new Error('Empty buffer provided') |
| 39 | } |
| 40 | |
| 41 | assertOoxmlArchiveWithinLimits(buffer) |
| 42 | |
| 43 | try { |
| 44 | const result = await mammoth.extractRawText({ buffer }) |
| 45 | |
| 46 | if (result.value && result.value.trim().length > 0) { |
| 47 | let htmlResult: MammothResult = { value: '', messages: [] } |
| 48 | try { |
| 49 | htmlResult = await mammoth.convertToHtml({ buffer }) |
| 50 | } catch { |
| 51 | // HTML conversion is optional |
| 52 | } |
| 53 | |
| 54 | return { |
| 55 | content: sanitizeTextForUTF8(result.value), |
| 56 | metadata: { |
| 57 | extractionMethod: 'mammoth', |
| 58 | messages: [...result.messages, ...htmlResult.messages], |
| 59 | html: htmlResult.value, |
| 60 | }, |
| 61 | } |
| 62 | } |
| 63 | } catch (mammothError) { |
| 64 | logger.warn('mammoth failed, trying officeparser:', mammothError) |
| 65 | } |
| 66 | |
| 67 | try { |
| 68 | const officeParser = await import('officeparser') |
| 69 | const result = await officeParser.parseOfficeAsync(buffer) |
| 70 | |
| 71 | if (result) { |
| 72 | const resultString = typeof result === 'string' ? result : String(result) |
| 73 | const content = sanitizeTextForUTF8(resultString.trim()) |
| 74 | |
| 75 | if (content.length > 0) { |
| 76 | return { |
| 77 | content, |
nothing calls this directly
no outgoing calls
no test coverage detected