(args: { binary: Buffer; ext: string })
| 26 | * failure or an unparseable document. |
| 27 | */ |
| 28 | export async function extractDocText(args: { binary: Buffer; ext: string }): Promise<DocExtract> { |
| 29 | const ext = args.ext.toLowerCase() |
| 30 | if (!isExtractableDocExt(ext)) { |
| 31 | throw new Error(`Cannot extract text from .${ext} (supported: pdf, pptx, docx, xlsx)`) |
| 32 | } |
| 33 | |
| 34 | const script = ` |
| 35 | import json |
| 36 | ext = ${JSON.stringify(ext)} |
| 37 | inp = f"/home/user/input.{ext}" |
| 38 | out = [] |
| 39 | |
| 40 | if ext == "pdf": |
| 41 | import pdfplumber |
| 42 | with pdfplumber.open(inp) as pdf: |
| 43 | for i, page in enumerate(pdf.pages, 1): |
| 44 | out.append(f"--- Page {i} ---") |
| 45 | out.append(page.extract_text() or "") |
| 46 | for t in (page.extract_tables() or []): |
| 47 | out.append("[table] " + json.dumps(t, ensure_ascii=False)) |
| 48 | elif ext == "pptx": |
| 49 | from pptx import Presentation |
| 50 | prs = Presentation(inp) |
| 51 | for i, slide in enumerate(prs.slides, 1): |
| 52 | out.append(f"--- Slide {i} ---") |
| 53 | for shape in slide.shapes: |
| 54 | if shape.has_text_frame and shape.text_frame.text.strip(): |
| 55 | out.append(shape.text_frame.text) |
| 56 | if shape.has_table: |
| 57 | for row in shape.table.rows: |
| 58 | out.append(" | ".join(c.text for c in row.cells)) |
| 59 | nf = slide.notes_slide.notes_text_frame if slide.has_notes_slide else None |
| 60 | notes = nf.text if nf is not None else "" |
| 61 | if notes.strip(): |
| 62 | out.append("[notes] " + notes) |
| 63 | elif ext == "docx": |
| 64 | import docx |
| 65 | d = docx.Document(inp) |
| 66 | for p in d.paragraphs: |
| 67 | if p.text.strip(): |
| 68 | out.append(p.text) |
| 69 | for tbl in d.tables: |
| 70 | for row in tbl.rows: |
| 71 | out.append(" | ".join(c.text for c in row.cells)) |
| 72 | elif ext == "xlsx": |
| 73 | import openpyxl |
| 74 | wb = openpyxl.load_workbook(inp, data_only=True) |
| 75 | for ws in wb.worksheets: |
| 76 | out.append(f"--- Sheet {ws.title} ---") |
| 77 | # Cap rows so an inflated used-range can't blow up memory/output. |
| 78 | for ri, row in enumerate(ws.iter_rows(values_only=True)): |
| 79 | if ri >= 5000: |
| 80 | out.append("[... more rows truncated]") |
| 81 | break |
| 82 | out.append(",".join("" if v is None else str(v) for v in row)) |
| 83 | |
| 84 | # Bound the transferred text so a decompression bomb can't return gigabytes. |
| 85 | # Headroom over MAX_EXTRACT_CHARS so the TS-side truncation flag can still fire. |
no test coverage detected