(self, tokens)
| 107 | |
| 108 | |
def _mask_through_first_end_token(tokens, end_token):
    """Return a mask that is 1 up to and including each row's first ``end_token``.

    Rows that never contain ``end_token`` are all ones. The result has the
    same shape, dtype and device as ``tokens`` (a 2-D integer tensor).
    """
    is_end = (tokens == end_token).to(torch.long)
    # Number of end tokens strictly before each position; a position is kept
    # only while no end token has been seen earlier in its row.
    ends_before = torch.cumsum(is_end, dim=1) - is_end
    return (ends_before == 0).to(tokens.dtype)


def patched_SDClipModel_forward(self, tokens):
    """Run the wrapped text transformer on ``tokens`` and return its features.

    Args:
        tokens: Nested sequence of token ids; it is passed through
            ``self.set_up_textual_embeddings`` and then converted to a 2-D
            ``torch.LongTensor`` on the embedding weights' device.

    Returns:
        A ``(z, pooled_output)`` tuple. ``z`` is the float hidden state
        selected by ``self.layer`` ("last", "pooled", or otherwise
        ``hidden_states[self.layer_idx]``, optionally passed through the final
        layer norm). ``pooled_output`` is the float pooler output, multiplied
        by ``self.text_projection`` when that is set, or ``None`` when the
        transformer output has no ``pooler_output`` attribute.
    """
    backup_embeds = self.transformer.get_input_embeddings()
    device = backup_embeds.weight.device
    try:
        # NOTE(review): set_up_textual_embeddings presumably installs
        # temporary input embeddings on the transformer (its body is not
        # visible here) -- hence the unconditional restore below.
        tokens = self.set_up_textual_embeddings(tokens, backup_embeds)
        tokens = torch.LongTensor(tokens).to(device)

        attention_mask = None
        if self.enable_attention_masks:
            # Read the embedding table that is installed *now*, not the
            # backup: the id of the last row is treated as the end marker.
            max_token = self.transformer.get_input_embeddings().weight.shape[0] - 1
            attention_mask = _mask_through_first_end_token(tokens, max_token)

        outputs = self.transformer(
            input_ids=tokens,
            attention_mask=attention_mask,
            output_hidden_states=self.layer == "hidden",
        )
    finally:
        # Always put the original embeddings back, even when the forward pass
        # raises, so a failed call cannot leave the model patched.
        self.transformer.set_input_embeddings(backup_embeds)

    if self.layer == "last":
        z = outputs.last_hidden_state
    elif self.layer == "pooled":
        z = outputs.pooler_output[:, None, :]
    else:
        z = outputs.hidden_states[self.layer_idx]
        if self.layer_norm_hidden_state:
            z = self.transformer.text_model.final_layer_norm(z)

    if hasattr(outputs, "pooler_output"):
        pooled_output = outputs.pooler_output.float()
    else:
        pooled_output = None

    if self.text_projection is not None and pooled_output is not None:
        pooled_output = pooled_output.float().to(self.text_projection.device) @ self.text_projection.float()

    return z.float(), pooled_output
| 147 | |
| 148 | |
| 149 | def patched_ClipVisionModel__init__(self, json_config): |
nothing calls this directly
no test coverage detected