| 62 | |
@staticmethod
def _generate_document_output(
    images: List["Image.Image"],
    words_by_page: List[List[str]],
    boxes_by_page: List[List[List[int]]],
    dimensions_by_page: List[Tuple[int, int]],
) -> Dict[str, List[Tuple["Image.Image", List[Any]]]]:
    """Pair every page image with its words and normalized bounding boxes.

    Each box is ``[x1, y1, x2, y2]`` where ``(x1, y1)`` is the top-left corner
    and ``(x2, y2)`` is the bottom-right corner. Coordinates are rescaled
    against the page ``(width, height)`` onto a fixed 0-1000 scale and clamped
    to that range. The scaling is derived from the preprocessing code for
    LayoutLM.

    Args:
        images: One image per page; passed through untouched.
        words_by_page: The words found on each page.
        boxes_by_page: One box per word, per page, in page coordinates.
        dimensions_by_page: ``(width, height)`` of each page.

    Returns:
        ``{"image": pages}`` where each entry of ``pages`` is
        ``(image, [(word, normalized_box), ...])``.

    Raises:
        AssertionError: If the per-page lists differ in length, or if a page
            does not have exactly one box per word.
    """
    page_count = len(images)
    assert page_count == len(dimensions_by_page)
    assert page_count == len(words_by_page)
    assert page_count == len(boxes_by_page)

    pages = []
    per_page = zip(images, words_by_page, boxes_by_page, dimensions_by_page)
    for page_image, page_words, page_boxes, (page_width, page_height) in per_page:
        scaled_boxes = []
        for box in page_boxes:
            # Scale x against the width and y against the height, truncating
            # to int before clamping.
            raw_coords = (
                int(1000 * (box[0] / page_width)),
                int(1000 * (box[1] / page_height)),
                int(1000 * (box[2] / page_width)),
                int(1000 * (box[3] / page_height)),
            )
            # Boxes that spill past the page edge are pinned to [0, 1000].
            scaled_boxes.append([min(max(coord, 0), 1000) for coord in raw_coords])

        assert len(page_words) == len(scaled_boxes), "Not as many words as there are bounding boxes"
        pages.append((page_image, list(zip(page_words, scaled_boxes))))

    return {"image": pages}
| 101 | |
| 102 | |
| 103 | class PDFDocument(Document): |