Process either an image path or a PIL Image object. Args: image_source: either a file path (str) or a PIL Image object. ...
(image_source: Union[str, Image.Image], model=None, BOX_TRESHOLD=0.01, output_coord_in_ratio=False, ocr_bbox=None, text_scale=0.4, text_padding=5, draw_bbox_config=None, caption_model_processor=None, ocr_text=[], use_local_semantics=True, iou_threshold=0.9,prompt=None, scale_img=False, imgsz=None, batch_size=64)
| 405 | return area |
| 406 | |
| 407 | def get_som_labeled_img(image_source: Union[str, Image.Image], model=None, BOX_TRESHOLD=0.01, output_coord_in_ratio=False, ocr_bbox=None, text_scale=0.4, text_padding=5, draw_bbox_config=None, caption_model_processor=None, ocr_text=[], use_local_semantics=True, iou_threshold=0.9,prompt=None, scale_img=False, imgsz=None, batch_size=64): |
| 408 | """Process either an image path or Image object |
| 409 | |
| 410 | Args: |
| 411 | image_source: Either a file path (str) or PIL Image object |
| 412 | ... |
| 413 | """ |
| 414 | if isinstance(image_source, str): |
| 415 | image_source = Image.open(image_source).convert("RGB") |
| 416 | |
| 417 | w, h = image_source.size |
| 418 | if not imgsz: |
| 419 | imgsz = (h, w) |
| 420 | # print('image size:', w, h) |
| 421 | xyxy, logits, phrases = predict_yolo(model=model, image=image_source, box_threshold=BOX_TRESHOLD, imgsz=imgsz, scale_img=scale_img, iou_threshold=0.1) |
| 422 | xyxy = xyxy / torch.Tensor([w, h, w, h]).to(xyxy.device) |
| 423 | image_source = np.asarray(image_source) |
| 424 | phrases = [str(i) for i in range(len(phrases))] |
| 425 | |
| 426 | # annotate the image with labels |
| 427 | if ocr_bbox: |
| 428 | ocr_bbox = torch.tensor(ocr_bbox) / torch.Tensor([w, h, w, h]) |
| 429 | ocr_bbox=ocr_bbox.tolist() |
| 430 | else: |
| 431 | print('no ocr bbox!!!') |
| 432 | ocr_bbox = None |
| 433 | |
| 434 | ocr_bbox_elem = [{'type': 'text', 'bbox':box, 'interactivity':False, 'content':txt,} for box, txt in zip(ocr_bbox, ocr_text) if int_box_area(box, w, h) > 0] |
| 435 | xyxy_elem = [{'type': 'icon', 'bbox':box, 'interactivity':True, 'content':None} for box in xyxy.tolist() if int_box_area(box, w, h) > 0] |
| 436 | filtered_boxes = remove_overlap_new(boxes=xyxy_elem, iou_threshold=iou_threshold, ocr_bbox=ocr_bbox_elem) |
| 437 | |
| 438 | # sort the filtered_boxes so that the one with 'content': None is at the end, and get the index of the first 'content': None |
| 439 | filtered_boxes_elem = sorted(filtered_boxes, key=lambda x: x['content'] is None) |
| 440 | # get the index of the first 'content': None |
| 441 | starting_idx = next((i for i, box in enumerate(filtered_boxes_elem) if box['content'] is None), -1) |
| 442 | filtered_boxes = torch.tensor([box['bbox'] for box in filtered_boxes_elem]) |
| 443 | print('len(filtered_boxes):', len(filtered_boxes), starting_idx) |
| 444 | |
| 445 | # get parsed icon local semantics |
| 446 | time1 = time.time() |
| 447 | if use_local_semantics: |
| 448 | caption_model = caption_model_processor['model'] |
| 449 | if 'phi3_v' in caption_model.config.model_type: |
| 450 | parsed_content_icon = get_parsed_content_icon_phi3v(filtered_boxes, ocr_bbox, image_source, caption_model_processor) |
| 451 | else: |
| 452 | parsed_content_icon = get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_model_processor, prompt=prompt,batch_size=batch_size) |
| 453 | ocr_text = [f"Text Box ID {i}: {txt}" for i, txt in enumerate(ocr_text)] |
| 454 | icon_start = len(ocr_text) |
| 455 | parsed_content_icon_ls = [] |
| 456 | # fill the filtered_boxes_elem None content with parsed_content_icon in order |
| 457 | for i, box in enumerate(filtered_boxes_elem): |
| 458 | if box['content'] is None: |
| 459 | box['content'] = parsed_content_icon.pop(0) |
| 460 | for i, txt in enumerate(parsed_content_icon): |
| 461 | parsed_content_icon_ls.append(f"Icon Box ID {str(i+icon_start)}: {txt}") |
| 462 | parsed_content_merged = ocr_text + parsed_content_icon_ls |
| 463 | else: |
| 464 | ocr_text = [f"Text Box ID {i}: {txt}" for i, txt in enumerate(ocr_text)] |
no test coverage detected