AdmissionControl runs after RouteModel so the limit applies to the SERVED model — a router fanout that lands on a saturated downstream model gets rejected even though the requested router-model has slack. On reject: HTTP 503, Retry-After header, error JSON. An audit row goes into the shared event store.
(limiter *admission.Limiter, events pii.EventStore)
| 27 | // Models without limits.max_concurrent (the common case) hit a fast |
| 28 | // no-op path — Acquire returns immediately for max <= 0. |
| 29 | func AdmissionControl(limiter *admission.Limiter, events pii.EventStore) echo.MiddlewareFunc { |
| 30 | return func(next echo.HandlerFunc) echo.HandlerFunc { |
| 31 | return func(c echo.Context) error { |
| 32 | cfg, ok := c.Get(CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig) |
| 33 | if !ok || cfg == nil { |
| 34 | return next(c) |
| 35 | } |
| 36 | max := cfg.Limits.MaxConcurrent |
| 37 | release, ok := limiter.Acquire(cfg.Name, max) |
| 38 | if !ok { |
| 39 | retryAfter := admission.RetryAfter(cfg.Limits.RetryAfterSeconds) |
| 40 | recordAdmissionRejection(events, cfg.Name, retryAfter) |
| 41 | c.Response().Header().Set("Retry-After", strconv.Itoa(int(retryAfter.Seconds()))) |
| 42 | return c.JSON(http.StatusServiceUnavailable, map[string]any{ |
| 43 | "error": map[string]any{ |
| 44 | "type": "admission_rejected", |
| 45 | "message": fmt.Sprintf("model %q is at capacity (max_concurrent=%d); retry after %s", cfg.Name, max, retryAfter), |
| 46 | }, |
| 47 | }) |
| 48 | } |
| 49 | defer release() |
| 50 | return next(c) |
| 51 | } |
| 52 | } |
| 53 | } |
| 54 | |
| 55 | // admissionEventSeq scopes IDs across the process so rapid |
| 56 | // rejections under load get unique row IDs without coordinating |