Add argument cache_block_seq_len to parser.
(parser)
| 551 | |
| 552 | @staticmethod |
| 553 | def cache_block_seq_len(parser): |
| 554 | """Add the ``--cache-block-seq-len`` option (int, default 64) to ``parser`` and return the result of ``parser.add_argument``.""" |
| 555 | |
| 556 | return parser.add_argument('--cache-block-seq-len', |
| 557 | type=int, |
| 558 | default=64, |
| 559 | help='The length of the token sequence in a k/v block. ' |
| 560 | 'For Turbomind Engine, if the GPU compute capability ' |
| 561 | 'is >= 8.0, it should be a multiple of 32, otherwise ' |
| 562 | 'it should be a multiple of 64. For Pytorch Engine, ' |
| 563 | 'if Lora Adapter is specified, this parameter will ' |
| 564 | 'be ignored') |
| 565 | |
| 566 | @staticmethod |
| 567 | def kernel_block_size(parser): |
no outgoing calls
no test coverage detected