hub / github.com/microsoft/Magma / get_som_labeled_img

Function get_som_labeled_img

agents/ui_agent/util/utils.py:407–486 · view source on GitHub ↗

Process either an image path or an Image object. Args: image_source: Either a file path (str) or a PIL Image object ...

(image_source: Union[str, Image.Image], model=None, BOX_TRESHOLD=0.01, output_coord_in_ratio=False, ocr_bbox=None, text_scale=0.4, text_padding=5, draw_bbox_config=None, caption_model_processor=None, ocr_text=[], use_local_semantics=True, iou_threshold=0.9,prompt=None, scale_img=False, imgsz=None, batch_size=64)

Source from the content-addressed store, hash-verified

405	return area
406
407	def get_som_labeled_img(image_source: Union[str, Image.Image], model=None, BOX_TRESHOLD=0.01, output_coord_in_ratio=False, ocr_bbox=None, text_scale=0.4, text_padding=5, draw_bbox_config=None, caption_model_processor=None, ocr_text=[], use_local_semantics=True, iou_threshold=0.9,prompt=None, scale_img=False, imgsz=None, batch_size=64):
408	"""Process either an image path or Image object
409
410	Args:
411	image_source: Either a file path (str) or PIL Image object
412	...
413	"""
414	if isinstance(image_source, str):
415	image_source = Image.open(image_source).convert("RGB")
416
417	w, h = image_source.size
418	if not imgsz:
419	imgsz = (h, w)
420	# print('image size:', w, h)
421	xyxy, logits, phrases = predict_yolo(model=model, image=image_source, box_threshold=BOX_TRESHOLD, imgsz=imgsz, scale_img=scale_img, iou_threshold=0.1)
422	xyxy = xyxy / torch.Tensor([w, h, w, h]).to(xyxy.device)
423	image_source = np.asarray(image_source)
424	phrases = [str(i) for i in range(len(phrases))]
425
426	# annotate the image with labels
427	if ocr_bbox:
428	ocr_bbox = torch.tensor(ocr_bbox) / torch.Tensor([w, h, w, h])
429	ocr_bbox=ocr_bbox.tolist()
430	else:
431	print('no ocr bbox!!!')
432	ocr_bbox = None
433
434	ocr_bbox_elem = [{'type': 'text', 'bbox':box, 'interactivity':False, 'content':txt,} for box, txt in zip(ocr_bbox, ocr_text) if int_box_area(box, w, h) > 0]
435	xyxy_elem = [{'type': 'icon', 'bbox':box, 'interactivity':True, 'content':None} for box in xyxy.tolist() if int_box_area(box, w, h) > 0]
436	filtered_boxes = remove_overlap_new(boxes=xyxy_elem, iou_threshold=iou_threshold, ocr_bbox=ocr_bbox_elem)
437
438	# sort the filtered_boxes so that the one with 'content': None is at the end, and get the index of the first 'content': None
439	filtered_boxes_elem = sorted(filtered_boxes, key=lambda x: x['content'] is None)
440	# get the index of the first 'content': None
441	starting_idx = next((i for i, box in enumerate(filtered_boxes_elem) if box['content'] is None), -1)
442	filtered_boxes = torch.tensor([box['bbox'] for box in filtered_boxes_elem])
443	print('len(filtered_boxes):', len(filtered_boxes), starting_idx)
444
445	# get parsed icon local semantics
446	time1 = time.time()
447	if use_local_semantics:
448	caption_model = caption_model_processor['model']
449	if 'phi3_v' in caption_model.config.model_type:
450	parsed_content_icon = get_parsed_content_icon_phi3v(filtered_boxes, ocr_bbox, image_source, caption_model_processor)
451	else:
452	parsed_content_icon = get_parsed_content_icon(filtered_boxes, starting_idx, image_source, caption_model_processor, prompt=prompt,batch_size=batch_size)
453	ocr_text = [f"Text Box ID {i}: {txt}" for i, txt in enumerate(ocr_text)]
454	icon_start = len(ocr_text)
455	parsed_content_icon_ls = []
456	# fill the filtered_boxes_elem None content with parsed_content_icon in order
457	for i, box in enumerate(filtered_boxes_elem):
458	if box['content'] is None:
459	box['content'] = parsed_content_icon.pop(0)
460	for i, txt in enumerate(parsed_content_icon):
461	parsed_content_icon_ls.append(f"Icon Box ID {str(i+icon_start)}: {txt}")
462	parsed_content_merged = ocr_text + parsed_content_icon_ls
463	else:
464	ocr_text = [f"Text Box ID {i}: {txt}" for i, txt in enumerate(ocr_text)]

Callers 2

process · Function · 0.90

parse · Method · 0.90

Calls 7

predict_yolo · Function · 0.85

int_box_area · Function · 0.85

remove_overlap_new · Function · 0.85

get_parsed_content_icon_phi3v · Function · 0.85

get_parsed_content_icon · Function · 0.85

annotate · Function · 0.85

decode · Method · 0.80

Tested by

no test coverage detected