Function main

src/docquery/cmd/scan.py:52–103 · view source on GitHub ↗

(args)

Source from the content-addressed store, hash-verified

50
51
def main(args):
    """Load documents from ``args.path`` and print an answer for every question.

    ``args.path`` may name a single document or a directory. A directory is
    walked recursively and every file found beneath it is loaded. Documents
    that raise ``UnsupportedDocument`` while loading are skipped with a
    warning. Each remaining document is optionally classified and then queried
    with every entry of ``args.questions``; results are printed to stdout in
    left-aligned columns.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``path``, ``ocr``, ``use_embedded_text``, ``checkpoint``,
            ``classify``, ``classify_checkpoint`` and ``questions``.

    Returns:
        None. Returns early, before any pipeline is created, when no document
        could be loaded.

    Raises:
        Exception: Any error raised while answering a question is logged
            together with the offending document and question, then
            re-raised unchanged. Errors from the classification pipeline
            propagate without extra logging.
    """
    paths = []
    if pathlib.Path(args.path).is_dir():
        for root, _dirs, files in os.walk(args.path):
            # os.walk() only reports non-directory entries in ``files``, so
            # no additional is_dir() filtering is required here.
            paths.extend(pathlib.Path(root) / fname for fname in files)
    else:
        # Appended unchanged (not wrapped in pathlib.Path) so that str(p)
        # below reproduces exactly what was passed in.
        paths.append(args.path)

    docs = []
    for p in paths:
        try:
            log.info(f"Loading {p}")
            docs.append((p, load_document(str(p), ocr_reader=args.ocr, use_embedded_text=args.use_embedded_text)))
        except UnsupportedDocument as e:
            log.warning(f"Cannot load {p}: {e}. Skipping...")

    log.info(f"Done loading {len(docs)} file(s).")
    if not docs:
        # Nothing to evaluate: skip the pipeline set-up entirely.
        return

    log.info("Loading pipelines.")

    nlp = pipeline("document-question-answering", model=args.checkpoint)
    if args.classify:
        # The classifier is only built when requested; it is used under the
        # same ``args.classify`` guard inside the loop below.
        classify = pipeline("document-classification", model=args.classify_checkpoint)

    log.info("Ready to start evaluating!")

    # Column widths for aligned output; ``docs`` is non-empty at this point.
    max_fname_len = max(len(str(p)) for (p, _) in docs)
    max_question_len = max((len(q) for q in args.questions), default=0)
    for i, (p, d) in enumerate(docs):
        # Separate per-document groups with a blank line, but only when each
        # document yields more than one question line.
        if i > 0 and len(args.questions) > 1:
            print("")

        if args.classify:
            doc_type = classify(**d.context)[0]
            print(f"{str(p):<{max_fname_len}} Document Type: {doc_type['label']}")

        for q in args.questions:
            try:
                response = nlp(question=q, **d.context)
                if isinstance(response, list):
                    # A list result is reduced to its first entry; an empty
                    # list is reported as "NULL" below.
                    response = response[0] if response else None
            except Exception:
                log.error(f"Failed while processing {str(p)} on question: '{q}'")
                raise

            answer = response["answer"] if response is not None else "NULL"
            print(f"{str(p):<{max_fname_len}} {q:<{max_question_len}}: {answer}")

Callers

nothing calls this directly

Calls 2

load_documentFunction · 0.85

pipelineFunction · 0.85

Tested by

no test coverage detected