(request, prompt, max_tokens, worker_addr)
| 156 | |
| 157 | |
async def check_length(request, prompt, max_tokens, worker_addr):
    """Work out how many tokens may be generated for ``prompt``.

    Queries the worker at ``worker_addr`` for the context length of
    ``request.model`` and for the token count of ``prompt``, then caps the
    requested ``max_tokens`` by the space left in the context window.

    Returns:
        A ``(length, error)`` pair. On success ``length`` is the allowed
        number of tokens and ``error`` is ``None``. When no room is left,
        ``length`` is ``None`` and ``error`` is the response built by
        ``create_error_response`` with ``ErrorCode.CONTEXT_OVERFLOW``.
    """
    # The model worker does not support max_tokens=None, so anything that is
    # not a positive int is replaced by a very large budget; the context
    # window computed below then becomes the effective limit.
    if isinstance(max_tokens, int) and max_tokens > 0:
        requested = max_tokens
    else:
        requested = 1024 * 1024

    details_url = worker_addr + "/model_details"
    context_len = await fetch_remote(
        details_url, {"model": request.model}, "context_length"
    )

    count_url = worker_addr + "/count_token"
    count_payload = {"model": request.model, "prompt": prompt}
    token_num = await fetch_remote(count_url, count_payload, "count")

    # Space remaining in the context window after the prompt itself.
    remaining = context_len - token_num
    length = min(requested, remaining)

    if length <= 0:
        message = f"This model's maximum context length is {context_len} tokens. However, your messages resulted in {token_num} tokens. Please reduce the length of the messages."
        overflow = create_error_response(ErrorCode.CONTEXT_OVERFLOW, message)
        return None, overflow

    return length, None
| 181 | |
| 182 | |
| 183 | def check_requests(request) -> Optional[JSONResponse]: |
no test coverage detected
searching dependent graphs…