MCPcopy
hub / github.com/impira/docquery / _generate_document_output

Method _generate_document_output

src/docquery/document.py:64–100  ·  view source on GitHub ↗
(
        images: List["Image.Image"],
        words_by_page: List[List[str]],
        boxes_by_page: List[List[List[int]]],
        dimensions_by_page: List[Tuple[int, int]],
    )

Source from the content-addressed store, hash-verified

62
@staticmethod
def _generate_document_output(
    images: List["Image.Image"],
    words_by_page: List[List[str]],
    boxes_by_page: List[List[List[int]]],
    dimensions_by_page: List[Tuple[int, int]],
) -> Dict[str, List[Tuple["Image.Image", List[Any]]]]:
    """Pair every page image with its words and normalized bounding boxes.

    Each box is ``[x1, y1, x2, y2]`` where ``(x1, y1)`` is the top-left corner
    and ``(x2, y2)`` is the bottom-right corner. Coordinates are rescaled from
    page units onto a fixed 0-1000 grid (truncated to ``int`` and clamped to
    that range), so distances between boxes are comparable across pages of
    different sizes. The scaling is derived from the preprocessing code for
    LayoutLM.

    Args:
        images: One image per page; passed through untouched.
        words_by_page: The words found on each page.
        boxes_by_page: For each page, one ``[x1, y1, x2, y2]`` box per word.
        dimensions_by_page: ``(width, height)`` of each page.

    Returns:
        ``{"image": [(image, [(word, normalized_box), ...]), ...]}`` with one
        entry per page, in input order.

    Raises:
        AssertionError: If the four per-page lists differ in length, or a page
            has a different number of words and boxes.
        ZeroDivisionError: If a page that has boxes reports a zero width or
            height.
    """
    # These checks are raised explicitly rather than written as ``assert``
    # statements: ``python -O`` strips asserts, after which ``zip`` below would
    # silently truncate mismatched inputs. AssertionError is kept so callers
    # observe the same exception type as before.
    page_count = len(images)
    if not (len(dimensions_by_page) == len(words_by_page) == len(boxes_by_page) == page_count):
        raise AssertionError(
            "images, words_by_page, boxes_by_page and dimensions_by_page "
            "must all contain one entry per page"
        )

    scale = 1000  # upper bound of the normalized coordinate grid
    processed_pages = []
    for image, words, boxes, dimensions in zip(images, words_by_page, boxes_by_page, dimensions_by_page):
        width, height = dimensions  # dimensions are (width, height)

        normalized_boxes = [
            [
                # Clamp so boxes that spill outside the page stay on the grid.
                max(min(coord, scale), 0)
                for coord in (
                    int(scale * (box[0] / width)),
                    int(scale * (box[1] / height)),
                    int(scale * (box[2] / width)),
                    int(scale * (box[3] / height)),
                )
            ]
            for box in boxes
        ]
        if len(words) != len(normalized_boxes):
            raise AssertionError("Not as many words as there are bounding boxes")

        processed_pages.append((image, list(zip(words, normalized_boxes))))

    return {"image": processed_pages}
101
102
103class PDFDocument(Document):

Callers 3

contextMethod · 0.80
contextMethod · 0.80
contextMethod · 0.80

Calls

no outgoing calls

Tested by

no test coverage detected