hub / github.com/simstudioai/sim / extractDocText

Function extractDocText

apps/sim/lib/copilot/tools/server/files/doc-extract.ts:28–113 · view source on GitHub ↗

(args: { binary: Buffer; ext: string })

Source from the content-addressed store, hash-verified

26	* failure or an unparseable document.
27	*/
28	export async function extractDocText(args: { binary: Buffer; ext: string }): Promise<DocExtract> {
29	const ext = args.ext.toLowerCase()
30	if (!isExtractableDocExt(ext)) {
31	throw new Error(`Cannot extract text from .${ext} (supported: pdf, pptx, docx, xlsx)`)
32	}
33
34	const script = `
35	import json
36	ext = ${JSON.stringify(ext)}
37	inp = f"/home/user/input.{ext}"
38	out = []
39
40	if ext == "pdf":
41	import pdfplumber
42	with pdfplumber.open(inp) as pdf:
43	for i, page in enumerate(pdf.pages, 1):
44	out.append(f"--- Page {i} ---")
45	out.append(page.extract_text() or "")
46	for t in (page.extract_tables() or []):
47	out.append("[table] " + json.dumps(t, ensure_ascii=False))
48	elif ext == "pptx":
49	from pptx import Presentation
50	prs = Presentation(inp)
51	for i, slide in enumerate(prs.slides, 1):
52	out.append(f"--- Slide {i} ---")
53	for shape in slide.shapes:
54	if shape.has_text_frame and shape.text_frame.text.strip():
55	out.append(shape.text_frame.text)
56	if shape.has_table:
57	for row in shape.table.rows:
58	out.append(" \| ".join(c.text for c in row.cells))
59	nf = slide.notes_slide.notes_text_frame if slide.has_notes_slide else None
60	notes = nf.text if nf is not None else ""
61	if notes.strip():
62	out.append("[notes] " + notes)
63	elif ext == "docx":
64	import docx
65	d = docx.Document(inp)
66	for p in d.paragraphs:
67	if p.text.strip():
68	out.append(p.text)
69	for tbl in d.tables:
70	for row in tbl.rows:
71	out.append(" \| ".join(c.text for c in row.cells))
72	elif ext == "xlsx":
73	import openpyxl
74	wb = openpyxl.load_workbook(inp, data_only=True)
75	for ws in wb.worksheets:
76	out.append(f"--- Sheet {ws.title} ---")
77	# Cap rows so an inflated used-range can't blow up memory/output.
78	for ri, row in enumerate(ws.iter_rows(values_only=True)):
79	if ri >= 5000:
80	out.append("[... more rows truncated]")
81	break
82	out.append(",".join("" if v is None else str(v) for v in row))
83
84	# Bound the transferred text so a decompression bomb can't return gigabytes.
85	# Headroom over MAX_EXTRACT_CHARS so the TS-side truncation flag can still fire.

Callers 1

readFileContent (Method) · 0.90

Calls 3

executeInE2B (Function) · 0.90

isExtractableDocExt (Function) · 0.85

toString (Method) · 0.45

Tested by

no test coverage detected