(buffer: Buffer)
| 76 | } |
| 77 | |
/**
 * Best-effort text recovery used when structured PowerPoint parsing is not
 * available: decodes the start of the raw buffer as UTF-8 and keeps only the
 * runs that look like human-readable text.
 *
 * @param buffer Raw file bytes; at most the first 200000 bytes are decoded.
 * @returns The recovered text (or a fixed "unable to extract" message when
 *   nothing usable was found) together with metadata marking the result as a
 *   fallback extraction. `characterCount` is the length of the returned
 *   `content`, which is the message length when extraction found nothing.
 */
private fallbackExtraction(buffer: Buffer): FileParseResult {
  logger.info('Using fallback text extraction for PowerPoint file')

  // Only the leading portion of the file is decoded.
  const byteLimit = Math.min(buffer.length, 200000)
  const decoded = buffer.toString('utf8', 0, byteLimit)

  // Candidate runs: four or more consecutive printable-ASCII/whitespace characters.
  const candidateRuns = decoded.match(/[\x20-\x7E\s]{4,}/g)

  // Stays undefined when the decoded text contains no candidate run at all.
  let readableText: string | undefined
  if (candidateRuns) {
    const keptRuns: string[] = []
    for (const run of candidateRuns) {
      // Discard runs that are too short once surrounding whitespace is removed.
      if (run.trim().length <= 10) continue
      // Discard runs without a single ASCII letter.
      if (!/[a-zA-Z]/.test(run)) continue
      // Discard runs made up solely of control characters.
      if (/^[\x00-\x1F]*$/.test(run)) continue
      // Discard runs made up solely of non-word, non-space characters.
      if (/^[^\w\s]*$/.test(run)) continue
      keptRuns.push(run)
    }
    // Join the surviving runs and collapse every whitespace sequence to one space.
    readableText = keptRuns.join(' ').replace(/\s+/g, ' ').trim()
  }

  const failureMessage =
    'Unable to extract text from PowerPoint file. Please ensure the file contains readable text content.'

  // An empty string is treated the same as "nothing found".
  const content = readableText ? sanitizeTextForUTF8(readableText) : failureMessage

  const metadata = {
    extractionMethod: 'fallback',
    characterCount: content.length,
    warning: 'Basic text extraction used',
  }

  return { content, metadata }
}
| 109 | } |
no test coverage detected