MCPcopy
hub / github.com/impira/docquery / main

Function main

src/docquery/cmd/scan.py:52–103  ·  view source on GitHub ↗
(args)

Source from the content-addressed store, hash-verified

50
51
def main(args):
    """Load one or more documents and print answers to each question.

    Reads the following attributes from ``args``:

    * ``path`` -- a single document location, or a directory that is walked
      recursively so that every file beneath it is loaded.
    * ``ocr``, ``use_embedded_text`` -- forwarded to ``load_document``.
    * ``checkpoint`` -- model for the document-question-answering pipeline.
    * ``classify``, ``classify_checkpoint`` -- when ``classify`` is truthy, a
      document-classification pipeline is also built and its top label is
      printed for every document.
    * ``questions`` -- iterable of question strings asked of every document.

    Documents that raise ``UnsupportedDocument`` while loading are logged and
    skipped.  If nothing could be loaded the function returns before any
    pipeline is created.

    Returns:
        None.  Results are written to stdout, one line per (document,
        question) pair, with columns padded so the output lines up.

    Raises:
        Exception: whatever the question-answering pipeline raises; the
            failing document and question are logged before re-raising.
    """
    paths = []
    if pathlib.Path(args.path).is_dir():
        # os.walk already separates directories (including symlinks to
        # directories) from files, so no per-file is_dir() check is needed.
        for root, _dirs, files in os.walk(args.path):
            root_path = pathlib.Path(root)
            paths.extend(root_path / fname for fname in files)
    else:
        # Keep the caller's value untouched: it is handed to load_document
        # as-is (via str()), and wrapping it in Path could alter the string.
        paths.append(args.path)

    docs = []
    for p in paths:
        try:
            log.info(f"Loading {p}")
            docs.append((p, load_document(str(p), ocr_reader=args.ocr, use_embedded_text=args.use_embedded_text)))
        except UnsupportedDocument as e:
            log.warning(f"Cannot load {p}: {e}. Skipping...")

    log.info(f"Done loading {len(docs)} file(s).")
    if not docs:
        return

    log.info("Loading pipelines.")

    nlp = pipeline("document-question-answering", model=args.checkpoint)
    if args.classify:
        classify = pipeline("document-classification", model=args.classify_checkpoint)

    log.info("Ready to start evaluating!")

    # Column widths for aligned output; docs is non-empty here, questions may
    # be empty (hence the default).
    max_fname_len = max(len(str(p)) for (p, _) in docs)
    max_question_len = max((len(q) for q in args.questions), default=0)

    for i, (p, d) in enumerate(docs):
        # Blank line between documents only when each one prints several rows.
        if i > 0 and len(args.questions) > 1:
            print("")

        if args.classify:
            prediction = classify(**d.context)[0]
            print(f"{str(p):<{max_fname_len}} Document Type: {prediction['label']}")

        for q in args.questions:
            try:
                response = nlp(question=q, **d.context)
            except Exception:
                # Add context about which input failed, then propagate.
                log.error(f"Failed while processing {str(p)} on question: '{q}'")
                raise

            # The pipeline may return a list of candidates; keep the first
            # one, or None when the list is empty.
            if isinstance(response, list):
                response = response[0] if response else None

            answer = response["answer"] if response is not None else "NULL"
            print(f"{str(p):<{max_fname_len}} {q:<{max_question_len}}: {answer}")

Callers

nothing calls this directly

Calls 2

load_documentFunction · 0.85
pipelineFunction · 0.85

Tested by

no test coverage detected