@brief: Prepare input Tensors for the model; the given sizes are used to determine the ranges of the dimensions when using TRT dynamic shapes. @return: a list containing values which can be fed into self.forward()
(self,
max_batch_size,
max_beam_width,
max_decoder_input_len,
max_seq_len,
max_encoder_input_len,
gather_context_logits: bool = False,
lora_target_modules: List[str] = None,
use_cache=True,
*args,
**kwargs)
| 1339 | return hidden_states |
| 1340 | |
| 1341 | def prepare_inputs(self, |
| 1342 | max_batch_size, |
| 1343 | max_beam_width, |
| 1344 | max_decoder_input_len, |
| 1345 | max_seq_len, |
| 1346 | max_encoder_input_len, |
| 1347 | gather_context_logits: bool = False, |
| 1348 | lora_target_modules: List[str] = None, |
| 1349 | use_cache=True, |
| 1350 | *args, |
| 1351 | **kwargs): |
| 1352 | '''@brief: Prepare inputs Tensors for the model, the given sizes are used to determine the |
| 1353 | ranges of the dimensions of when using TRT dynamic shapes. |
| 1354 | |
| 1355 | @return: a list contains values which can be fed into the self.forward() |
| 1356 | ''' |
| 1357 | |
| 1358 | # Prepare inputs |
| 1359 | max_output_len = max_decoder_input_len + max_seq_len |
| 1360 | |
| 1361 | head_size = self.head_size |
| 1362 | num_kv_heads = (self.num_kv_heads + self.mapping.tp_size - |
| 1363 | 1) // self.mapping.tp_size |
| 1364 | |
| 1365 | encoder_head_size = self.encoder_head_size |
| 1366 | encoder_num_kv_heads = (self.encoder_num_kv_heads + self.mapping.tp_size |
| 1367 | - 1) // self.mapping.tp_size |
| 1368 | |
| 1369 | bb_range = [ |
| 1370 | 1, (max_batch_size * max_beam_width + 1) // 2, |
| 1371 | max_batch_size * max_beam_width |
| 1372 | ] |
| 1373 | bs_range = [1, (max_batch_size + 1) // 2, max_batch_size] |
| 1374 | beam_width_range = [1, (max_beam_width + 1) // 2, max_beam_width] |
| 1375 | inlen_range = [ |
| 1376 | 1, 1, max_decoder_input_len |
| 1377 | ] # context phase >= 1 (if forced_input_ids), generation phase = 1 |
| 1378 | encoder_inlen_range = [ |
| 1379 | 1, (max_encoder_input_len + 1) // 2, max_encoder_input_len |
| 1380 | ] |
| 1381 | mask_len_range = [1, (max_output_len + 1) // 2 + 1, max_output_len + 1] |
| 1382 | max_output_len_range = [0, (max_output_len + 1) // 2, max_output_len] |
| 1383 | |
| 1384 | encoder_num_tokens_range = [ |
| 1385 | 0, # 0 for generation phase, >0 for context phase |
| 1386 | (max_encoder_input_len * max_batch_size + 1) // 2, |
| 1387 | max_encoder_input_len * max_batch_size, |
| 1388 | ] |
| 1389 | decoder_num_tokens_range = [ |
| 1390 | 1, |
| 1391 | max_batch_size * max_beam_width, |
| 1392 | max(max_decoder_input_len * max_batch_size, |
| 1393 | max_beam_width * max_batch_size), |
| 1394 | ] |
| 1395 | |
| 1396 | # No enable_two_optimization_profiles support yet |
| 1397 | |
| 1398 | encoder_input_len_range = [ |
nothing calls this directly
no test coverage detected