The output of the model inference.
| 40 | |
@dataclass
class InferOutput:
    """The output of the model inference.

    Only ``session_id``, ``resp`` and ``token_ids`` are required; every other
    field has a default. Fields that default to ``None`` are annotated as
    ``X | None`` so the annotation agrees with the default value.
    """

    session_id: int
    resp: Response
    token_ids: np.ndarray | list[int]
    # free-form extra payload; None when nothing is attached
    meta: Any = None
    # presumably True once generation for this session has ended -- confirm with producer
    finish: bool = False
    # None unless logits / logprobs are attached by the producer
    logits: torch.Tensor | None = None
    logprobs: torch.Tensor | None = None

    # send cache blocks back for migration in Disaggregated LLM Serving
    # when Prefill Engine is Done.
    cache_block_ids: list[int] | None = None

    # for logging
    req_metrics: RequestMetrics | None = None

    # expert ids
    routed_experts: torch.Tensor | None = None
| 62 | |
| 63 | |
| 64 | def _build_seq_meta(model_config: ModelConfig, cache_config: CacheConfig, seq_strategy: Any, sampling_strategy: Any): |
no outgoing calls