@spaces.GPU
@torch.inference_mode()
def get_qa_response(instruction, image):
    """Generate the model's text answer to ``instruction`` about ``image``.

    The instruction is substituted into ``magma_qa_prompt``, wrapped in a
    system + user conversation, rendered with the tokenizer's chat template,
    and passed together with the image to ``magam_model.generate`` using
    greedy decoding (``do_sample=False``, ``num_beams=1``) capped at 128 new
    tokens.

    Args:
        instruction: Value inserted into ``magma_qa_prompt`` via ``.format``.
        image: A single image handed to ``magma_processor`` as ``images=[image]``
            (presumably a PIL image -- confirm against the caller).

    Returns:
        The decoded continuation with special tokens skipped and surrounding
        whitespace stripped.
    """
    prompt = magma_qa_prompt.format(instruction)
    if magam_model.config.mm_use_image_start_end:
        # The model config asks for explicit start/end markers around the
        # image placeholder.
        prompt = prompt.replace('<image>', '<image_start><image><image_end>')

    convs = [
        {"role": "system", "content": "You are agent that can see, talk and act."},
        {"role": "user", "content": prompt},
    ]
    chat_prompt = magma_processor.tokenizer.apply_chat_template(
        convs,
        tokenize=False,
        add_generation_prompt=True,
    )

    inputs = magma_processor(images=[image], texts=chat_prompt, return_tensors="pt")
    # Add a leading dimension to the image tensors before generation
    # (presumably the batch axis the model expects -- confirm against the
    # model's forward signature).
    inputs['pixel_values'] = inputs['pixel_values'].unsqueeze(0)
    inputs['image_sizes'] = inputs['image_sizes'].unsqueeze(0)
    inputs = inputs.to(dtype).to(DEVICE)

    magam_model.generation_config.pad_token_id = magma_processor.tokenizer.pad_token_id
    # No nested ``torch.inference_mode()`` context here: the decorator above
    # already runs the whole function body in inference mode.
    output_ids = magam_model.generate(
        **inputs,
        temperature=0.0,
        do_sample=False,
        num_beams=1,
        max_new_tokens=128,
        use_cache=True,
    )

    # Drop the prompt at the token level instead of string-replacing the
    # decoded prompt: a textual replace depends on the prompt decoding
    # round-tripping exactly and would also delete any repetition of the
    # prompt text inside the answer.
    # NOTE(review): assumes ``generate`` returns prompt ids followed by the
    # continuation -- confirm for this model.
    prompt_len = inputs['input_ids'].shape[-1]
    new_ids = output_ids[:, prompt_len:]
    response = magma_processor.batch_decode(new_ids, skip_special_tokens=True)[0]
    return response.strip()
| 144 | |
| 145 | @spaces.GPU |
| 146 | @torch.inference_mode() |