MCPcopy Index your code
hub / github.com/simstudioai/sim / extractDocText

Function extractDocText

apps/sim/lib/copilot/tools/server/files/doc-extract.ts:28–113  ·  view source on GitHub ↗
(args: { binary: Buffer; ext: string })

Source from the content-addressed store, hash-verified

26 * failure or an unparseable document.
27 */
28export async function extractDocText(args: { binary: Buffer; ext: string }): Promise<DocExtract> {
29 const ext = args.ext.toLowerCase()
30 if (!isExtractableDocExt(ext)) {
31 throw new Error(`Cannot extract text from .${ext} (supported: pdf, pptx, docx, xlsx)`)
32 }
33
34 const script = `
35import json
36ext = ${JSON.stringify(ext)}
37inp = f"/home/user/input.{ext}"
38out = []
39
40if ext == "pdf":
41 import pdfplumber
42 with pdfplumber.open(inp) as pdf:
43 for i, page in enumerate(pdf.pages, 1):
44 out.append(f"--- Page {i} ---")
45 out.append(page.extract_text() or "")
46 for t in (page.extract_tables() or []):
47 out.append("[table] " + json.dumps(t, ensure_ascii=False))
48elif ext == "pptx":
49 from pptx import Presentation
50 prs = Presentation(inp)
51 for i, slide in enumerate(prs.slides, 1):
52 out.append(f"--- Slide {i} ---")
53 for shape in slide.shapes:
54 if shape.has_text_frame and shape.text_frame.text.strip():
55 out.append(shape.text_frame.text)
56 if shape.has_table:
57 for row in shape.table.rows:
58 out.append(" | ".join(c.text for c in row.cells))
59 nf = slide.notes_slide.notes_text_frame if slide.has_notes_slide else None
60 notes = nf.text if nf is not None else ""
61 if notes.strip():
62 out.append("[notes] " + notes)
63elif ext == "docx":
64 import docx
65 d = docx.Document(inp)
66 for p in d.paragraphs:
67 if p.text.strip():
68 out.append(p.text)
69 for tbl in d.tables:
70 for row in tbl.rows:
71 out.append(" | ".join(c.text for c in row.cells))
72elif ext == "xlsx":
73 import openpyxl
74 wb = openpyxl.load_workbook(inp, data_only=True)
75 for ws in wb.worksheets:
76 out.append(f"--- Sheet {ws.title} ---")
77 # Cap rows so an inflated used-range can't blow up memory/output.
78 for ri, row in enumerate(ws.iter_rows(values_only=True)):
79 if ri >= 5000:
80 out.append("[... more rows truncated]")
81 break
82 out.append(",".join("" if v is None else str(v) for v in row))
83
84# Bound the transferred text so a decompression bomb can't return gigabytes.
85# Headroom over MAX_EXTRACT_CHARS so the TS-side truncation flag can still fire.

Callers 1

readFileContent (Method) · 0.90

Calls 3

executeInE2B (Function) · 0.90
isExtractableDocExt (Function) · 0.85
toString (Method) · 0.45

Tested by

no test coverage detected