Loads a language model and its tokenizer using MLX. Args: request: the load model request; context: the gRPC context. Returns: backend_pb2.Result: the load model result.
(self, request, context)
| 52 | return backend_pb2.Reply(message=bytes("OK", 'utf-8')) |
| 53 | |
| 54 | async def LoadModel(self, request, context): |
| 55 | """ |
| 56 | Loads a language model using MLX. |
| 57 | |
| 58 | Args: |
| 59 | request: The load model request. |
| 60 | context: The gRPC context. |
| 61 | |
| 62 | Returns: |
| 63 | backend_pb2.Result: The load model result. |
| 64 | """ |
| 65 | try: |
| 66 | print(f"Loading MLX model: {request.Model}", file=sys.stderr) |
| 67 | print(f"Request: {request}", file=sys.stderr) |
| 68 | |
| 69 | # Parse Options[] key:value strings into a typed dict (shared helper) |
| 70 | self.options = parse_options(request.Options) |
| 71 | print(f"Options: {self.options}", file=sys.stderr) |
| 72 | |
| 73 | # Build tokenizer config for MLX using options |
| 74 | tokenizer_config = {} |
| 75 | |
| 76 | # Handle trust_remote_code from request or options |
| 77 | if request.TrustRemoteCode or self.options.get("trust_remote_code", False): |
| 78 | tokenizer_config["trust_remote_code"] = True |
| 79 | |
| 80 | # Handle EOS token from options |
| 81 | if "eos_token" in self.options: |
| 82 | tokenizer_config["eos_token"] = self.options["eos_token"] |
| 83 | |
| 84 | # Handle other tokenizer config options |
| 85 | for key in ["pad_token", "bos_token", "unk_token", "sep_token", "cls_token", "mask_token"]: |
| 86 | if key in self.options: |
| 87 | tokenizer_config[key] = self.options[key] |
| 88 | |
| 89 | # Load model and tokenizer using MLX |
| 90 | if tokenizer_config: |
| 91 | print(f"Loading with tokenizer_config: {tokenizer_config}", file=sys.stderr) |
| 92 | self.model, self.tokenizer = load(request.Model, tokenizer_config=tokenizer_config) |
| 93 | else: |
| 94 | self.model, self.tokenizer = load(request.Model) |
| 95 | |
| 96 | # mlx_lm.load() returns a TokenizerWrapper that detects tool |
| 97 | # calling and thinking markers from the chat template / vocab. |
| 98 | # mlx-lm >= 0.30 also exposes a parser callable on the wrapper; |
| 99 | # earlier versions don't (we fall back to json.loads inside |
| 100 | # _tool_module_from_tokenizer below). |
| 101 | has_tools = bool(getattr(self.tokenizer, "has_tool_calling", False)) |
| 102 | has_thinking = bool(getattr(self.tokenizer, "has_thinking", False)) |
| 103 | tcs = getattr(self.tokenizer, "tool_call_start", None) |
| 104 | tce = getattr(self.tokenizer, "tool_call_end", None) |
| 105 | print( |
| 106 | f"MLX tokenizer capabilities: has_tool_calling={has_tools} " |
| 107 | f"has_thinking={has_thinking} tool_call_start={tcs!r} tool_call_end={tce!r}", |
| 108 | file=sys.stderr, |
| 109 | ) |
| 110 | |
| 111 | # Initialize thread-safe LRU prompt cache for efficient generation |
nothing calls this directly
no test coverage detected