hub / github.com/impira/docquery / _generate_document_output

Method _generate_document_output

src/docquery/document.py:64–100 · view source on GitHub ↗

(
        images: List["Image.Image"],
        words_by_page: List[List[str]],
        boxes_by_page: List[List[List[int]]],
        dimensions_by_page: List[Tuple[int, int]],
    )

Source from the content-addressed store, hash-verified

62
63	@staticmethod
def _generate_document_output(
    images: List["Image.Image"],
    words_by_page: List[List[str]],
    boxes_by_page: List[List[List[int]]],
    dimensions_by_page: List[Tuple[int, int]],
) -> Dict[str, List[Tuple["Image.Image", List[Any]]]]:
    """Pair each page image with its words and normalized bounding boxes.

    Each box is ``[x1, y1, x2, y2]``, where ``(x1, y1)`` is the top-left
    corner of the box and ``(x2, y2)`` is the bottom-right corner. Every
    coordinate is divided by the page width (x) or height (y), scaled to a
    fixed 0-1000 range, truncated to ``int`` and clamped to ``[0, 1000]``.
    The scaling is derived from the preprocessing code for LayoutLM.

    Args:
        images: One image per page; passed through unchanged.
        words_by_page: The words of each page.
        boxes_by_page: For each page, one ``[x1, y1, x2, y2]`` box per word.
        dimensions_by_page: ``(width, height)`` of each page.

    Returns:
        ``{"image": pages}``, where every page is ``(image, word_boxes)`` and
        ``word_boxes`` is a list of ``(word, normalized_box)`` tuples.

    Raises:
        AssertionError: If the per-page lists differ in length, or if a page
            does not have exactly one box per word.
    """
    # These checks are raised explicitly rather than written as `assert`
    # statements so they still run under `python -O`. Without them, `zip`
    # below would silently truncate to the shortest input and drop pages.
    # AssertionError is kept so the exception type seen by callers is unchanged.
    page_count = len(images)
    for label, per_page in (
        ("dimensions_by_page", dimensions_by_page),
        ("words_by_page", words_by_page),
        ("boxes_by_page", boxes_by_page),
    ):
        if len(per_page) != page_count:
            raise AssertionError(
                f"images has {page_count} page(s) but {label} has {len(per_page)}"
            )

    processed_pages = []
    for image, words, boxes, dimensions in zip(
        images, words_by_page, boxes_by_page, dimensions_by_page
    ):
        width, height = dimensions

        normalized_boxes = [
            [
                # Clamp so boxes that spill outside the page stay in range.
                max(min(int(1000 * (coord / extent)), 1000), 0)
                for coord, extent in (
                    (box[0], width),
                    (box[1], height),
                    (box[2], width),
                    (box[3], height),
                )
            ]
            for box in boxes
        ]
        # Same reasoning as above: without this check `zip` would drop the
        # surplus words or boxes without any error.
        if len(words) != len(normalized_boxes):
            raise AssertionError("Not as many words as there are bounding boxes")

        processed_pages.append((image, list(zip(words, normalized_boxes))))

    return {"image": processed_pages}
101
102
103	class PDFDocument(Document):

Callers 3

contextMethod · 0.80

Calls

no outgoing calls

Tested by

no test coverage detected