(self, document: Document)
| 83 | self.detection_model = detection_model |
| 84 | |
| 85 | def __call__(self, document: Document): |
| 86 | filepath = document.filepath # Path to original pdf file |
| 87 | |
| 88 | table_data = [] |
| 89 | for page in document.pages: |
| 90 | for block in page.contained_blocks(document, self.block_types): |
| 91 | if block.block_type == BlockTypes.Table: |
| 92 | block.polygon = block.polygon.expand(0.01, 0.01) |
| 93 | image = block.get_image(document, highres=True) |
| 94 | image_poly = block.polygon.rescale( |
| 95 | (page.polygon.width, page.polygon.height), |
| 96 | page.get_image(highres=True).size, |
| 97 | ) |
| 98 | |
| 99 | table_data.append( |
| 100 | { |
| 101 | "block_id": block.id, |
| 102 | "page_id": page.page_id, |
| 103 | "table_image": image, |
| 104 | "table_bbox": image_poly.bbox, |
| 105 | "img_size": page.get_image(highres=True).size, |
| 106 | "ocr_block": any( |
| 107 | [ |
| 108 | page.text_extraction_method in ["surya"], |
| 109 | page.ocr_errors_detected, |
| 110 | ] |
| 111 | ), |
| 112 | } |
| 113 | ) |
| 114 | |
| 115 | # Detect tables and cells |
| 116 | self.table_rec_model.disable_tqdm = self.disable_tqdm |
| 117 | tables: List[TableResult] = self.table_rec_model( |
| 118 | [t["table_image"] for t in table_data], |
| 119 | batch_size=self.get_table_rec_batch_size(), |
| 120 | ) |
| 121 | assert len(tables) == len(table_data), ( |
| 122 | "Number of table results should match the number of tables" |
| 123 | ) |
| 124 | |
| 125 | # Assign cell text if we don't need OCR |
| 126 | # We do this at a line level |
| 127 | extract_blocks = [t for t in table_data if not t["ocr_block"]] |
| 128 | self.assign_pdftext_lines( |
| 129 | extract_blocks, filepath |
| 130 | ) # Handle tables where good text exists in the PDF |
| 131 | self.assign_text_to_cells(tables, table_data) |
| 132 | |
| 133 | # Assign OCR lines if needed - we do this at a cell level |
| 134 | self.assign_ocr_lines(tables, table_data) |
| 135 | |
| 136 | self.split_combined_rows(tables) # Split up rows that were combined |
| 137 | self.combine_dollar_column(tables) # Combine columns that are just dollar signs |
| 138 | |
| 139 | # Assign table cells to the table |
| 140 | table_idx = 0 |
| 141 | for page in document.pages: |
| 142 | for block in page.contained_blocks(document, self.block_types): |
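No direct call sites were found; note that a `__call__` method is invoked implicitly whenever the instance itself is called, so explicit references are not expected.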
No test coverage was detected for this method.