MCPcopy Index your code
hub / github.com/simstudioai/sim / fallbackExtraction

Method fallbackExtraction

apps/sim/lib/file-parsers/doc-parser.ts:82–129  ·  view source on GitHub ↗
(buffer: Buffer)

Source from the content-addressed store, hash-verified

80 }
81
/**
 * Best-effort text extraction from a raw buffer, without any DOC parser.
 *
 * Two strategies are tried in order:
 *  1. If the buffer does not begin with the bytes 0xD0 0xCF (the first two
 *     bytes of the OLE2 compound-file signature), it is decoded as UTF-8 and
 *     returned as plain text when more than 90% of its characters are
 *     printable.
 *  2. Otherwise the first 100,000 bytes are decoded and scanned for runs of
 *     printable ASCII / whitespace; runs that look like real text are joined
 *     with single spaces.
 *
 * @param buffer - Raw file bytes.
 * @returns The extracted text (or a fixed "unable to extract" message when
 *   nothing usable was found) plus metadata naming the strategy that was used.
 */
private fallbackExtraction(buffer: Buffer): FileParseResult {
  const isBinaryDoc = buffer.length >= 2 && buffer[0] === 0xd0 && buffer[1] === 0xcf

  if (!isBinaryDoc) {
    const textContent = buffer.toString('utf8').trim()

    if (textContent.length > 0) {
      // Count what is clearly NOT text instead of whitelisting ASCII: C0/C1
      // control characters (except tab, LF, CR) and U+FFFD, which the UTF-8
      // decoder emits for every invalid byte sequence. For pure-ASCII input this
      // gives the same ratio as an ASCII whitelist, but valid non-Latin UTF-8
      // text is no longer misclassified as binary.
      const unprintableChars =
        textContent.match(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F\uFFFD]/g)?.length ?? 0
      const isProbablyText = (textContent.length - unprintableChars) / textContent.length > 0.9

      if (isProbablyText) {
        const content = sanitizeTextForUTF8(textContent)

        return {
          content,
          metadata: {
            extractionMethod: 'plaintext-fallback',
            // Measure the string actually returned, as the fallback branch does.
            characterCount: content.length,
            warning: 'File is not a valid DOC format, extracted as plain text',
          },
        }
      }
    }
  }

  // Only the first 100,000 bytes are inspected.
  const text = buffer.toString('utf8', 0, Math.min(buffer.length, 100000))

  // Keep runs of 4+ printable/whitespace characters, then drop runs that are
  // short after trimming or contain no ASCII letter at all.
  const readableText = text
    .match(/[\x20-\x7E\s]{4,}/g)
    ?.filter((chunk) => chunk.trim().length > 10 && /[a-zA-Z]/.test(chunk))
    .join(' ')
    .replace(/\s+/g, ' ')
    .trim()

  // An empty or missing result falls back to a fixed user-facing message.
  const content = readableText
    ? sanitizeTextForUTF8(readableText)
    : 'Unable to extract text from DOC file. Please convert to DOCX format for better results.'

  return {
    content,
    metadata: {
      extractionMethod: 'fallback',
      characterCount: content.length,
      warning: 'Basic text extraction used. For better results, convert to DOCX format.',
    },
  }
}
130}

Callers 1

parseBuffer · Method · 0.95

Calls 5

sanitizeTextForUTF8 · Function · 0.90
join · Method · 0.80
test · Method · 0.80
replace · Method · 0.65
toString · Method · 0.45

Tested by

no test coverage detected