(buffer: Buffer)
| 80 | } |
| 81 | |
/**
 * Best-effort text extraction from a raw file buffer.
 *
 * Two strategies are tried in order:
 *  1. If the buffer does not start with the bytes 0xD0 0xCF and its UTF-8
 *     decoding is more than 90% printable ASCII (plus tab/CR/LF), the whole
 *     decoded text is returned as-is (after sanitizing).
 *  2. Otherwise the first 100,000 bytes are decoded as UTF-8 and scanned for
 *     runs of printable ASCII/whitespace that look like prose; those runs are
 *     joined with single spaces.
 *
 * If neither strategy yields text, `content` is a fixed message asking the
 * user to convert the file to DOCX.
 *
 * @param buffer - Raw bytes of the file to extract text from.
 * @returns The extracted (or placeholder) content together with metadata
 *   naming the extraction method, the length of the returned content, and a
 *   warning describing the degraded extraction.
 */
private fallbackExtraction(buffer: Buffer): FileParseResult {
  // 0xD0 0xCF are the first two bytes of the OLE2 compound-file signature
  // (D0 CF 11 E0 A1 B1 1A E1) that legacy binary .doc files start with.
  const isBinaryDoc = buffer.length >= 2 && buffer[0] === 0xd0 && buffer[1] === 0xcf

  if (!isBinaryDoc) {
    const textContent = buffer.toString('utf8').trim()

    if (textContent.length > 0) {
      // Share of characters that are printable ASCII or common whitespace.
      // NOTE(review): predominantly non-ASCII text will not pass this check
      // and falls through to the run-scanning strategy below.
      const printableChars = textContent.match(/[\x20-\x7E\n\r\t]/g)?.length ?? 0
      const isProbablyText = printableChars / textContent.length > 0.9

      if (isProbablyText) {
        const content = sanitizeTextForUTF8(textContent)

        return {
          content,
          metadata: {
            extractionMethod: 'plaintext-fallback',
            // Report the length of what is actually returned, matching the
            // generic fallback path below.
            characterCount: content.length,
            warning: 'File is not a valid DOC format, extracted as plain text',
          },
        }
      }
    }
  }

  // Only the first 100,000 bytes are scanned to bound the work done here.
  const text = buffer.toString('utf8', 0, Math.min(buffer.length, 100000))

  // Keep runs of 4+ printable/whitespace characters that have more than 10
  // non-padding characters and at least one ASCII letter. If nothing matches,
  // the optional chain short-circuits and `readableText` is undefined.
  const readableText = text
    .match(/[\x20-\x7E\s]{4,}/g)
    ?.filter((chunk) => chunk.trim().length > 10 && /[a-zA-Z]/.test(chunk))
    .join(' ')
    .replace(/\s+/g, ' ')
    .trim()

  // An empty string (all chunks filtered out) is treated like "no match".
  const content = readableText
    ? sanitizeTextForUTF8(readableText)
    : 'Unable to extract text from DOC file. Please convert to DOCX format for better results.'

  return {
    content,
    metadata: {
      extractionMethod: 'fallback',
      characterCount: content.length,
      warning: 'Basic text extraction used. For better results, convert to DOCX format.',
    },
  }
}
| 130 | } |
no test coverage detected