@spaces.GPU
@torch.inference_mode()
def get_som_response(instruction, image_som):
    """Query the Magma model with an instruction and an image; return its text reply.

    The instruction is inserted into ``magma_som_prompt``, wrapped in a
    system + user conversation, rendered with the tokenizer's chat template,
    and sent to ``magam_model.generate`` together with the image using greedy
    decoding (at most 128 new tokens).

    Args:
        instruction: Value substituted into ``magma_som_prompt`` via
            ``str.format``.
        image_som: Image handed to ``magma_processor``. Presumably a
            set-of-mark annotated screenshot, judging by the name -- confirm
            against the caller.

    Returns:
        The decoded model continuation with special tokens removed and
        surrounding whitespace stripped.
    """
    prompt = magma_som_prompt.format(instruction)
    if magam_model.config.mm_use_image_start_end:
        # The model config asks for explicit start/end markers around the image token.
        prompt = prompt.replace('<image>', '<image_start><image><image_end>')

    convs = [
        {"role": "system", "content": "You are agent that can see, talk and act."},
        {"role": "user", "content": prompt},
    ]
    prompt = magma_processor.tokenizer.apply_chat_template(
        convs,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = magma_processor(images=[image_som], texts=prompt, return_tensors="pt")
    # Add a leading dimension to the image tensors (presumably the batch axis
    # the model expects -- TODO confirm against the Magma processor output).
    inputs['pixel_values'] = inputs['pixel_values'].unsqueeze(0)
    inputs['image_sizes'] = inputs['image_sizes'].unsqueeze(0)
    inputs = inputs.to(dtype).to(DEVICE)

    magam_model.generation_config.pad_token_id = magma_processor.tokenizer.pad_token_id
    # The decorator already runs this function under torch.inference_mode(),
    # so no nested context is needed. ``temperature`` is omitted on purpose:
    # it is ignored when do_sample=False and only triggers a warning.
    output_ids = magam_model.generate(
        **inputs,
        do_sample=False,
        num_beams=1,
        max_new_tokens=128,
        use_cache=True,
    )

    # Drop the echoed prompt at the token level rather than by string
    # replacement on decoded text: decoding does not always round-trip the
    # prompt byte-for-byte, and str.replace would also delete any later
    # occurrence of the prompt text inside the reply.
    input_ids = inputs['input_ids']
    prompt_len = input_ids.shape[-1]
    if output_ids.shape[-1] >= prompt_len and torch.equal(
        output_ids[:, :prompt_len], input_ids
    ):
        output_ids = output_ids[:, prompt_len:]

    response = magma_processor.batch_decode(output_ids, skip_special_tokens=True)[0]
    return response.strip()
| 107 | |
| 108 | @spaces.GPU |
| 109 | @torch.inference_mode() |