Generator that yields partial *response* strings as they arrive. Example: `for tok in client.stream_completion("qwen2", "Hello"): print(tok, end="", flush=True)`
(
self,
model: str,
prompt: str,
*,
images: List[Image.Image] | None = None,
enable_thinking: bool | None = None,
)
| 119 | # Streaming variant – yields token chunks in real time |
| 120 | # ------------------------------------------------------------- |
| 121 | def stream_completion( |
| 122 | self, |
| 123 | model: str, |
| 124 | prompt: str, |
| 125 | *, |
| 126 | images: List[Image.Image] | None = None, |
| 127 | enable_thinking: bool | None = None, |
| 128 | ): |
| 129 | """Generator that yields partial *response* strings as they arrive. |
| 130 | |
| 131 | Example: |
| 132 | |
| 133 | for tok in client.stream_completion("qwen2", "Hello"): |
| 134 | print(tok, end="", flush=True) |
| 135 | """ |
| 136 | payload: Dict[str, Any] = {"model": model, "prompt": prompt, "stream": True} |
| 137 | if images: |
| 138 | payload["images"] = [self._image_to_base64(img) for img in images] |
| 139 | if enable_thinking is not None: |
| 140 | payload["chat_template_kwargs"] = {"enable_thinking": enable_thinking} |
| 141 | |
| 142 | with requests.post(f"{self.api_url}/generate", json=payload, stream=True) as resp: |
| 143 | resp.raise_for_status() |
| 144 | for raw_line in resp.iter_lines(): |
| 145 | if not raw_line: |
| 146 | # Keep-alive newline |
| 147 | continue |
| 148 | try: |
| 149 | data = json.loads(raw_line.decode()) |
| 150 | except json.JSONDecodeError: |
| 151 | continue |
| 152 | # The Ollama streaming API sends objects like {"response":"Hi","done":false} |
| 153 | chunk = data.get("response", "") |
| 154 | if chunk: |
| 155 | yield chunk |
| 156 | if data.get("done"): |
| 157 | break |
| 158 | |
| 159 | if __name__ == '__main__': |
| 160 | # This test now requires a VLM model like 'llava' or 'qwen-vl' to be pulled. |
no test coverage detected