(opts ...Option)
| 344 | } |
| 345 | |
| 346 | func (ml *ModelLoader) Load(opts ...Option) (grpc.Backend, error) { |
| 347 | o := NewOptions(opts...) |
| 348 | |
| 349 | ml.mu.Lock() |
| 350 | distributed := ml.modelRouter != nil |
| 351 | ml.mu.Unlock() |
| 352 | |
| 353 | // In distributed mode, SmartRouter must run per inference request so |
| 354 | // PickBestReplica (core/services/nodes/replicapicker.go) picks the |
| 355 | // least-loaded replica each time. Bypass the local cache and the local |
| 356 | // LRU / concurrency-group watchdog enforcement: both are scoped to the |
| 357 | // in-process Model store, which in distributed mode only holds stubs for |
| 358 | // remote replicas. SmartRouter handles cluster-wide eviction |
| 359 | // (evictLRUAndFreeNode) and concurrency-group anti-affinity |
| 360 | // (narrowByGroupAntiAffinity) at the scheduler layer. |
| 361 | // |
| 362 | // TODO(distributed-cache): see LoadModel for the rotating-replica-cache |
| 363 | // integration point that would let hot paths skip the per-request DB |
| 364 | // round-trip without giving up the shared PickBestReplica policy. |
| 365 | if distributed { |
| 366 | client, err := ml.backendLoader(opts...) |
| 367 | if err != nil { |
| 368 | return nil, err |
| 369 | } |
| 370 | if m := ml.CheckIsLoaded(o.modelID); m != nil && m.Process() == nil { |
| 371 | client = newConnectionEvictingClient(client, o.modelID, func() { |
| 372 | if err := ml.ShutdownModel(o.modelID); err != nil { |
| 373 | xlog.Warn("Failed to shut down remote model after connection error", "model", o.modelID, "error", err) |
| 374 | } |
| 375 | }) |
| 376 | } |
| 377 | return client, nil |
| 378 | } |
| 379 | |
| 380 | // Return early if we have a model already loaded |
| 381 | // (avoid looping through all the backends) |
| 382 | if m := ml.CheckIsLoaded(o.modelID); m != nil { |
| 383 | xlog.Debug("Model already loaded", "model", o.modelID) |
| 384 | // Update last used time for LRU tracking |
| 385 | ml.updateModelLastUsed(m) |
| 386 | client := m.GRPC(o.parallelRequests, ml.wd) |
| 387 | // Wrap remote models so connection errors during inference trigger eviction |
| 388 | if m.Process() == nil { |
| 389 | client = newConnectionEvictingClient(client, o.modelID, func() { |
| 390 | ml.ShutdownModel(o.modelID) |
| 391 | }) |
| 392 | } |
| 393 | return client, nil |
| 394 | } |
| 395 | |
| 396 | // Evict any loaded model that shares a concurrency group with the |
| 397 | // requested one before applying the global LRU cap — group eviction may |
| 398 | // already make room, and otherwise LRU might evict an unrelated model |
| 399 | // only for the group check to immediately evict another. |
| 400 | ml.enforceGroupExclusivity(o.modelID) |
| 401 | |
| 402 | // Enforce LRU limit before loading a new model |
| 403 | ml.enforceLRULimit() |
nothing calls this directly
no test coverage detected