Method fallbackExtraction

apps/sim/lib/file-parsers/doc-parser.ts:82–129 · view source on GitHub ↗

(buffer: Buffer)

Source from the content-addressed store, hash-verified

80	}
81
/**
 * Best-effort text extraction that works directly on the raw bytes.
 *
 * Two strategies are tried in order:
 *
 * 1. Plain text: when the buffer does NOT start with the bytes `0xD0 0xCF`
 *    (the leading bytes of the OLE2 compound-file signature used by binary
 *    `.doc` files) and more than 90% of its trimmed UTF-8 decoding is
 *    printable ASCII / tab / CR / LF, the whole decoding is returned.
 * 2. Readable runs: otherwise the first 100,000 bytes are decoded and every
 *    run of 4+ printable/whitespace characters that is longer than 10
 *    characters after trimming and contains at least one ASCII letter is
 *    kept; the runs are joined with single spaces.
 *
 * In both cases `metadata.characterCount` is the length of the returned
 * `content` string.
 *
 * @param buffer - Raw bytes of the file.
 * @returns The extracted text, or a fixed explanatory message when no readable
 *   run is found, together with metadata naming the strategy that was used.
 */
private fallbackExtraction(buffer: Buffer): FileParseResult {
  const isBinaryDoc = buffer.length >= 2 && buffer[0] === 0xd0 && buffer[1] === 0xcf

  if (!isBinaryDoc) {
    const textContent = buffer.toString('utf8').trim()

    if (textContent.length > 0) {
      // Count code units one by one instead of `match(/.../g)`, which would
      // allocate an array entry per character of the whole file.
      let printableChars = 0
      for (let i = 0; i < textContent.length; i++) {
        const code = textContent.charCodeAt(i)
        const isPrintable =
          (code >= 0x20 && code <= 0x7e) || code === 0x0a || code === 0x0d || code === 0x09
        if (isPrintable) {
          printableChars++
        }
      }

      const isProbablyText = printableChars / textContent.length > 0.9

      if (isProbablyText) {
        const content = sanitizeTextForUTF8(textContent)

        return {
          content,
          metadata: {
            extractionMethod: 'plaintext-fallback',
            // Measure the sanitized string so the count always describes `content`.
            characterCount: content.length,
            warning: 'File is not a valid DOC format, extracted as plain text',
          },
        }
      }
    }
  }

  // Only the first 100,000 bytes are scanned.
  const text = buffer.toString('utf8', 0, Math.min(buffer.length, 100000))

  // Optional chaining short-circuits the whole chain to `undefined` when the
  // regex finds no run at all. A chunk that contains an ASCII letter can never
  // consist solely of control characters, so no separate control-char check is
  // needed in the filter.
  const readableText = text
    .match(/[\x20-\x7E\s]{4,}/g)
    ?.filter((chunk) => chunk.trim().length > 10 && /[a-zA-Z]/.test(chunk))
    .join(' ')
    .replace(/\s+/g, ' ')
    .trim()

  // An empty string (no chunk survived the filter) is treated like "no match".
  const content = readableText
    ? sanitizeTextForUTF8(readableText)
    : 'Unable to extract text from DOC file. Please convert to DOCX format for better results.'

  return {
    content,
    metadata: {
      extractionMethod: 'fallback',
      characterCount: content.length,
      warning: 'Basic text extraction used. For better results, convert to DOCX format.',
    },
  }
}
130	}

parseBufferMethod · 0.95

sanitizeTextForUTF8Function · 0.90

joinMethod · 0.80

testMethod · 0.80

replaceMethod · 0.65

toStringMethod · 0.45

no test coverage detected