MCPcopy
hub / github.com/simstudioai/sim / DocxParser

Class DocxParser

apps/sim/lib/file-parsers/docx-parser.ts:20–110  ·  view source on GitHub ↗

Source from the content-addressed store, hash-verified

18}
19
20export class DocxParser implements FileParser {
/**
 * Reads a DOCX file from disk and hands its bytes to `parseBuffer`.
 *
 * @param filePath - Path of the DOCX file to read. Must be a non-empty string.
 * @returns The promise produced by `parseBuffer` for the file's contents.
 * @throws Error with the message prefix `Failed to parse DOCX file:` when the
 *   path is empty or the file cannot be read.
 */
async parseFile(filePath: string): Promise<FileParseResult> {
  try {
    if (!filePath) {
      throw new Error('No file path provided')
    }

    const buffer = await readFile(filePath)
    // The promise is returned without `await`, so a rejection coming from
    // parseBuffer propagates to the caller as-is and is not re-wrapped below.
    return this.parseBuffer(buffer)
  } catch (error) {
    logger.error('DOCX file error:', error)
    // Narrow the unknown catch value: a thrown non-Error (string, plain
    // object) has no `.message`, which would otherwise render as "undefined".
    const reason = error instanceof Error ? error.message : String(error)
    throw new Error(`Failed to parse DOCX file: ${reason}`)
  }
}
34
35 async parseBuffer(buffer: Buffer): Promise<FileParseResult> {
36 try {
37 if (!buffer || buffer.length === 0) {
38 throw new Error('Empty buffer provided')
39 }
40
41 assertOoxmlArchiveWithinLimits(buffer)
42
43 try {
44 const result = await mammoth.extractRawText({ buffer })
45
46 if (result.value && result.value.trim().length > 0) {
47 let htmlResult: MammothResult = { value: '', messages: [] }
48 try {
49 htmlResult = await mammoth.convertToHtml({ buffer })
50 } catch {
51 // HTML conversion is optional
52 }
53
54 return {
55 content: sanitizeTextForUTF8(result.value),
56 metadata: {
57 extractionMethod: 'mammoth',
58 messages: [...result.messages, ...htmlResult.messages],
59 html: htmlResult.value,
60 },
61 }
62 }
63 } catch (mammothError) {
64 logger.warn('mammoth failed, trying officeparser:', mammothError)
65 }
66
67 try {
68 const officeParser = await import('officeparser')
69 const result = await officeParser.parseOfficeAsync(buffer)
70
71 if (result) {
72 const resultString = typeof result === 'string' ? result : String(result)
73 const content = sanitizeTextForUTF8(resultString.trim())
74
75 if (content.length > 0) {
76 return {
77 content,

Callers

nothing calls this directly

Calls

no outgoing calls

Tested by

no test coverage detected