(self, image, topN=10, reference_caption=[], verbose=False)
| 217 | return out_list |
| 218 | |
def parse_dense_caption(self, image, topN=10, reference_caption=None, verbose=False):
    """Run dense captioning on ``image`` and summarize the kept regions as text.

    The image is sent through ``self.inference`` with the ``'everything'``
    prompt type and ``is_densecap=True``. The returned detections are then
    filtered by a length-normalized perplexity score and by CLIP score, and
    each surviving detection is rendered as
    ``(<caption>: X:<cx>, Y:<cy>, Width:<w>, Height:<h>)`` where ``cx``/``cy``
    are the center of its bounding box.

    Args:
        image: Image handed unchanged to ``self.inference``. In verbose mode
            it is also passed to ``np.array`` and ``load_image``.
        topN: Forwarded to ``self.inference`` as ``args['topN']``.
        reference_caption: Forwarded as ``args['reference_caption']``.
            ``None`` (the default) means a fresh empty list for this call,
            so state can never leak between calls through a shared default.
        verbose: Forwarded to ``self.inference``. When true, a mask overlay
            and a bounding-box visualization are also written under
            ``result/``.

    Returns:
        str: The per-region descriptions joined with ``','``; an empty
        string when no detection survives the filters.
    """
    if reference_caption is None:
        reference_caption = []

    prompt = {'prompt_type': ['everything']}
    densecap_args = {
        'return_ppl': True,
        'clip_filter': True,
        'reference_caption': reference_caption,
        # Alternative prompt kept from the original source:
        # 'Question: what does the image show? Answer:'
        'text_prompt': "",
        'seg_crop_mode': 'w_bg',  # the original also listed 'wo_bg' as an alternative
        'disable_regular_box': False,
        'topN': topN,
        'min_ppl_score': -1.8,
        'min_clip_score': 0.30,
        'min_mask_area': 2500,
    }

    dense_captions = self.inference(image, prompt,
                                    controls=None,
                                    disable_gpt=True,
                                    verbose=verbose,
                                    is_densecap=True,
                                    args=densecap_args)
    print('Process Dense Captioning: \n', dense_captions)

    # Thresholds are read back from the args dict after inference, exactly
    # as the two separate filter passes did before.
    min_ppl_score = densecap_args['min_ppl_score']
    min_clip_score = densecap_args['min_clip_score']
    dense_captions = [
        cap for cap in dense_captions
        # Perplexity score is normalized by (1 + number of caption words).
        if cap['ppl_score'] / (1 + len(cap['generated_captions']['raw_caption'].split())) >= min_ppl_score
        and cap['clip_score'] >= min_clip_score
    ]

    dense_cap_prompt = []
    for cap in dense_captions:
        # bbox is unpacked as (x, y, w, h); the prompt reports the box center.
        x, y, w, h = cap['bbox']
        cx, cy = x + w / 2, y + h / 2
        dense_cap_prompt.append(
            "({}: X:{:.0f}, Y:{:.0f}, Width:{:.0f}, Height:{:.0f})".format(
                cap['generated_captions']['raw_caption'], cx, cy, w, h))

    if verbose:
        import os

        # Saving below fails if the output directory is missing.
        os.makedirs('result', exist_ok=True)

        # NOTE(review): item['mask'] presumably is a PIL image (it exposes
        # .convert) - confirm against self.inference.
        all_masks = [np.array(item['mask'].convert('P')) for item in dense_captions]
        new_image = mask_painter_foreground_all(np.array(image), all_masks, background_alpha=0.4)
        save_path = 'result/dense_caption_mask.png'
        Image.fromarray(new_image).save(save_path)
        print(f'Dense captioning mask saved in {save_path}')

        # Timestamped name so successive visualizations do not overwrite each other.
        vis_path = 'result/dense_caption_vis_{}.png'.format(time.time())
        dense_cap_painter_input = [{'bbox': xywh_to_x1y1x2y2(cap['bbox']),
                                    'caption': cap['generated_captions']['raw_caption']}
                                   for cap in dense_captions]
        draw_bbox(load_image(image, return_type='numpy'), vis_path, dense_cap_painter_input, show_caption=True)
        print(f'Dense Captioning visualization saved in {vis_path}')

    return ','.join(dense_cap_prompt)
| 269 | |
| 270 | def parse_ocr(self, image, thres=0.2): |
| 271 | width, height = get_image_shape(image) |
no test coverage detected