MCPcopy
hub / github.com/microsoft/Magma / get_som_labeled_img

Function get_som_labeled_img

agents/ui_agent/util/utils.py:407–486  ·  view source on GitHub ↗

Process either an image path or a PIL Image object. Args: image_source: either a file path (str) or a PIL Image object ...

(image_source: Union[str, Image.Image], model=None, BOX_TRESHOLD=0.01, output_coord_in_ratio=False, ocr_bbox=None, text_scale=0.4, text_padding=5, draw_bbox_config=None, caption_model_processor=None, ocr_text=[], use_local_semantics=True, iou_threshold=0.9,prompt=None, scale_img=False, imgsz=None, batch_size=64)

Source from the content-addressed store, hash-verified

405 return area
406
407def get_som_labeled_img(image_source: Union[str, Image.Image], model=None, BOX_TRESHOLD=0.01, output_coord_in_ratio=False, ocr_bbox=None, text_scale=0.4, text_padding=5, draw_bbox_config=None, caption_model_processor=None, ocr_text=[], use_local_semantics=True, iou_threshold=0.9,prompt=None, scale_img=False, imgsz=None, batch_size=64):
408 """Process either an image path or Image object
409
410 Args:
411 image_source: Either a file path (str) or PIL Image object
412 ...
413 """
414 if isinstance(image_source, str):
415 image_source = Image.open(image_source).convert("RGB")
416
417 w, h = image_source.size
418 if not imgsz:
419 imgsz = (h, w)
420 # print('image size:', w, h)
421 xyxy, logits, phrases = predict_yolo(model=model, image=image_source, box_threshold=BOX_TRESHOLD, imgsz=imgsz, scale_img=scale_img, iou_threshold=0.1)
422 xyxy = xyxy / torch.Tensor([w, h, w, h]).to(xyxy.device)
423 image_source = np.asarray(image_source)
424 phrases = [str(i) for i in range(len(phrases))]
425
426 # annotate the image with labels
427 if ocr_bbox:
428 ocr_bbox = torch.tensor(ocr_bbox) / torch.Tensor([w, h, w, h])
429 ocr_bbox=ocr_bbox.tolist()
430 else:
431 print('no ocr bbox!!!')
432 ocr_bbox = None
433
434 ocr_bbox_elem = [{'type': 'text', 'bbox':box, 'interactivity':False, 'content':txt,} for box, txt in zip(ocr_bbox, ocr_text) if int_box_area(box, w, h) > 0]
435 xyxy_elem = [{'type': 'icon', 'bbox':box, 'interactivity':True, 'content':None} for box in xyxy.tolist() if int_box_area(box, w, h) > 0]
436 filtered_boxes = remove_overlap_new(boxes=xyxy_elem, iou_threshold=iou_threshold, ocr_bbox=ocr_bbox_elem)
437
438 # sort the filtered_boxes so that the one with 'content': None is at the end, and get the index of the first 'content': None
439 filtered_boxes_elem = sorted(filtered_boxes, key=lambda x: x['content'] is None)
440 # get the index of the first 'content': None
441 starting_idx = next((i for i, box in enumerate(filtered_boxes_elem) if box['content'] is None), -1)
442 filtered_boxes = torch.tensor([box['bbox'] for box in filtered_boxes_elem])
443 print('len(filtered_boxes):', len(filtered_boxes), starting_idx)
444
445 # get parsed icon local semantics
446 time1 = time.time()
447 if use_local_semantics:
448 caption_model = caption_model_processor['model']
449 if 'phi3_v' in caption_model.config.model_type:
450 parsed_content_icon = get_parsed_content_icon_phi3v(filtered_boxes, ocr_bbox, image_source, caption_model_processor)
451 else:
452 parsed_content_icon = get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_model_processor, prompt=prompt,batch_size=batch_size)
453 ocr_text = [f"Text Box ID {i}: {txt}" for i, txt in enumerate(ocr_text)]
454 icon_start = len(ocr_text)
455 parsed_content_icon_ls = []
456 # fill the filtered_boxes_elem None content with parsed_content_icon in order
457 for i, box in enumerate(filtered_boxes_elem):
458 if box['content'] is None:
459 box['content'] = parsed_content_icon.pop(0)
460 for i, txt in enumerate(parsed_content_icon):
461 parsed_content_icon_ls.append(f"Icon Box ID {str(i+icon_start)}: {txt}")
462 parsed_content_merged = ocr_text + parsed_content_icon_ls
463 else:
464 ocr_text = [f"Text Box ID {i}: {txt}" for i, txt in enumerate(ocr_text)]

Callers 2

process · Function · 0.90
parse · Method · 0.90

Calls 7

predict_yolo · Function · 0.85
int_box_area · Function · 0.85
remove_overlap_new · Function · 0.85
get_parsed_content_icon · Function · 0.85
annotate · Function · 0.85
decode · Method · 0.80

Tested by

no test coverage detected