(ctx context.Context, s string, messages schema.Messages, images, videos, audios []string, loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool, tools string, toolChoice string, logprobs *int, topLogprobs *int, logitBias map[string]float64, metadata map[string]string)
// ModelInferenceFunc is a package-level variable initialized to ModelInference.
// NOTE(review): presumably an indirection that lets callers or tests swap the
// implementation - confirm against its usages.
| 72 | var ModelInferenceFunc = ModelInference |
| 73 | |
| 74 | func ModelInference(ctx context.Context, s string, messages schema.Messages, images, videos, audios []string, loader *model.ModelLoader, c *config.ModelConfig, cl *config.ModelConfigLoader, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool, tools string, toolChoice string, logprobs *int, topLogprobs *int, logitBias map[string]float64, metadata map[string]string) (func() (LLMResponse, error), error) { |
| 75 | modelFile := c.Model |
| 76 | |
| 77 | // When gallery autoloading is enabled, check whether the model is among the listed models; if it is not, try to install it from the gallery. |
| 78 | if o.AutoloadGalleries { // experimental |
| 79 | modelNames, err := galleryop.ListModels(cl, loader, nil, galleryop.SKIP_ALWAYS) |
| 80 | if err != nil { |
| 81 | return nil, err |
| 82 | } |
| 83 | modelName := c.Name |
| 84 | if modelName == "" { |
| 85 | modelName = c.Model |
| 86 | } |
| 87 | if !slices.Contains(modelNames, modelName) { |
| 88 | utils.ResetDownloadTimers() |
| 89 | // the model is not among the listed models, so try to install it from the gallery |
| 90 | err := gallery.InstallModelFromGallery(ctx, o.Galleries, o.BackendGalleries, o.SystemState, loader, modelName, gallery.GalleryModel{}, utils.DisplayDownloadFunction, o.EnforcePredownloadScans, o.AutoloadBackendGalleries, o.RequireBackendIntegrity) |
| 91 | if err != nil { |
| 92 | xlog.Error("failed to install model from gallery", "error", err, "model", modelFile) |
| 93 | //return nil, err |
| 94 | } |
| 95 | } |
| 96 | } |
| 97 | |
| 98 | // Make the rendered prompt's prefix chain available to the distributed router |
| 99 | // for prefix-cache-aware node selection. No-op in single-process mode. The |
| 100 | // model id MUST match the id ModelOptions feeds to model.WithModelID, so both |
| 101 | // use the shared config.ModelConfig.ModelID() helper (Name with a fallback to |
| 102 | // Model) or the chain salt and the tracking key would diverge. |
| 103 | // |
| 104 | // s is empty for UseTokenizerTemplate models (the backend tokenizes the |
| 105 | // structured messages itself), so fall back to a prefix-stable serialization |
| 106 | // of the messages - otherwise prefix routing would silently degrade to |
| 107 | // round-robin for the bulk of modern chat models. |
| 108 | chainSource := s |
| 109 | if chainSource == "" { |
| 110 | chainSource = messagesPrefixSource(messages) |
| 111 | } |
| 112 | ctx = distributedhdr.MaybeWithPrefixChain(ctx, c.ModelID(), chainSource) |
| 113 | |
| 114 | opts := ModelOptions(*c, o, model.WithContext(ctx)) |
| 115 | inferenceModel, err := loader.Load(opts...) |
| 116 | if err != nil { |
| 117 | recordModelLoadFailure(o, c.Name, c.Backend, err, map[string]any{"model_file": modelFile}) |
| 118 | return nil, err |
| 119 | } |
| 120 | |
| 121 | // Probe the backend for model-scoped metadata after LoadModel succeeds. |
| 122 | // Two signals are captured: thinking-mode detection (only meaningful when the |
| 123 | // tokenizer template path is active) and the multimodal media marker (needed |
| 124 | // by custom chat templates so markers line up with what mtmd expects). |
| 125 | // We probe whenever any of those slots is still empty. |
| 126 | shouldProbeThinking := needsThinkingProbe(c) |
| 127 | needsMarkerProbe := c.MediaMarker == "" |
| 128 | if shouldProbeThinking || needsMarkerProbe { |
| 129 | modelOpts := grpcModelOpts(*c, o.SystemState.Model.ModelsPath) |
| 130 | config.DetectThinkingSupportFromBackend(ctx, c, inferenceModel, modelOpts) |
| 131 | // Update the config in the loader so it persists for future requests |
no test coverage detected