Args: batch_encoding (dict): Batch encoding from tokenizer. use_attention_mask (bool): Whether to use attention mask. If None, use self.use_attention_mask. Defaults to None. output_hidden_states (bool): Whether to output hidden states. If False, return the value of self.output_key. If True, return the entire output.
(
self,
batch_encoding,
use_attention_mask=None,
output_hidden_states=False,
do_sample=None,
hidden_state_skip_layer=None,
return_texts=False,
data_type="image",
semantic_images=None,
device=None,
)
| 290 | raise ValueError(f"Unsupported tokenize_input_type: {tokenize_input_type}") |
| 291 | |
| 292 | def encode( |
| 293 | self, |
| 294 | batch_encoding, |
| 295 | use_attention_mask=None, |
| 296 | output_hidden_states=False, |
| 297 | do_sample=None, |
| 298 | hidden_state_skip_layer=None, |
| 299 | return_texts=False, |
| 300 | data_type="image", |
| 301 | semantic_images=None, |
| 302 | device=None, |
| 303 | ): |
| 304 | """ |
| 305 | Args: |
| 306 | batch_encoding (dict): Batch encoding from tokenizer. |
| 307 | use_attention_mask (bool): Whether to use attention mask. If None, use self.use_attention_mask. |
| 308 | Defaults to None. |
| 309 | output_hidden_states (bool): Whether to output hidden states. If False, return the value of |
| 310 | self.output_key. If True, return the entire output. If self.hidden_state_skip_layer is set, |
| 311 | output_hidden_states will be set True. Defaults to False. |
| 312 | do_sample (bool): Whether to sample from the model. Used for Decoder-Only LLMs. Defaults to None. |
| 313 | When self.reproduce is False, do_sample is set to True by default. |
| 314 | hidden_state_skip_layer (int): Number of final hidden-state layers to skip. 0 means the last layer. |
| 315 | If None, self.output_key will be used. Defaults to None. |
| 316 | semantic_images (PIL.Image): The reference images for i2v models. |
| 317 | return_texts (bool): Whether to return the decoded texts. Defaults to False. |
| 318 | """ |
| 319 | device = self.model.device if device is None else device |
| 320 | use_attention_mask = use_default(use_attention_mask, self.use_attention_mask) |
| 321 | hidden_state_skip_layer = use_default( |
| 322 | hidden_state_skip_layer, self.hidden_state_skip_layer |
| 323 | ) |
| 324 | do_sample = use_default(do_sample, not self.reproduce) |
| 325 | if not self.i2v_mode: |
| 326 | attention_mask = ( |
| 327 | batch_encoding["attention_mask"].to(device) |
| 328 | if use_attention_mask |
| 329 | else None |
| 330 | ) |
| 331 | outputs = self.model( |
| 332 | input_ids=batch_encoding["input_ids"].to(device), |
| 333 | attention_mask=attention_mask, |
| 334 | output_hidden_states=output_hidden_states |
| 335 | or hidden_state_skip_layer is not None, |
| 336 | ) |
| 337 | if hidden_state_skip_layer is not None: |
| 338 | last_hidden_state = outputs.hidden_states[ |
| 339 | -(hidden_state_skip_layer + 1) |
| 340 | ] |
| 341 | # Real last hidden state already has layer norm applied. So here we only apply it |
| 342 | # for intermediate layers. |
| 343 | if hidden_state_skip_layer > 0 and self.apply_final_norm: |
| 344 | last_hidden_state = self.model.final_layer_norm(last_hidden_state) |
| 345 | else: |
| 346 | last_hidden_state = outputs[self.output_key] |
| 347 | |
| 348 | # Remove hidden states of instruction tokens, only keep prompt tokens. |
| 349 | if self.use_template: |
no test coverage detected