(self, image, thres=0.2)
| 268 | return ','.join(dense_cap_prompt) |
| 269 | |
def parse_ocr(self, image, thres=0.2):
    """Run OCR on ``image`` and describe each detected text with its box centre.

    Args:
        image: Image reference accepted by ``get_image_shape`` and
            ``load_image`` (exact accepted types are defined by those
            helpers, which are not visible here).
        thres: Detections whose third field (unpacked as the confidence
            below) is not strictly greater than this value are dropped.

    Returns:
        A string with one line per kept detection, formatted as
        ``("<text>": X:<cx>, Y:<cy>)`` where ``cx``/``cy`` are the mean of
        the four box corner coordinates, rounded to zero decimals. The
        string is empty when nothing passes the threshold.
    """
    # The shape is not used below; the call is kept so that any checks the
    # helper performs on the input still run before the image is loaded.
    _width, _height = get_image_shape(image)
    pixels = load_image(image, return_type='numpy')

    # Each detection is expected to be a (box, text, confidence) triple.
    # NOTE(review): presumably an EasyOCR-style reader -- confirm.
    detections = self.ocr_reader.readtext(pixels)
    kept = [det for det in detections if det[2] > thres]
    print('Process OCR Text:\n', kept)

    lines = []
    for box, text, _score in kept:
        corner_a, corner_b, corner_c, corner_d = box
        # Centre of the quadrilateral = average of its four corners.
        center_x = (corner_a[0] + corner_b[0] + corner_c[0] + corner_d[0]) / 4
        center_y = (corner_a[1] + corner_b[1] + corner_c[1] + corner_d[1]) / 4
        lines.append('(\"{}\": X:{:.0f}, Y:{:.0f})'.format(text, center_x, center_y))

    # NOTE: an optional post-processing step that asked self.text_refiner.llm
    # to group the individual words into phrases was tried here and is
    # currently disabled; the raw per-word listing is returned instead.
    return '\n'.join(lines)
| 292 | |
| 293 | def inference_cap_everything(self, image, verbose=False): |
| 294 | image = load_image(image, return_type='pil') |
no test coverage detected