v0.1.4: greedy-decode and capture model-token IDs at decode time. Drives the patched ``llama-completion`` binary with ``REFRACT_TRAJECTORY=<tmpfile>`` set; the binary writes one JSONL record per sampled token (``{"step":N,"token_id":ID}``). We read the file back, return the ID sequence, and delete the file.
(
model: Path,
prompt: str,
kv: KVConfig,
n_predict: int = 128,
ctx: int = 512,
n_gpu_layers: int = 99,
seed: int = 42,
temperature: float = 0.0,
timeout: float = 300.0,
apply_chat_template: bool = True,
system: Optional[str] = None,
reasoning: str = "off",
)
| 546 | |
| 547 | |
| 548 | def run_completion_trajectory( |
| 549 | model: Path, |
| 550 | prompt: str, |
| 551 | kv: KVConfig, |
| 552 | n_predict: int = 128, |
| 553 | ctx: int = 512, |
| 554 | n_gpu_layers: int = 99, |
| 555 | seed: int = 42, |
| 556 | temperature: float = 0.0, |
| 557 | timeout: float = 300.0, |
| 558 | apply_chat_template: bool = True, |
| 559 | system: Optional[str] = None, |
| 560 | reasoning: str = "off", |
| 561 | ) -> tuple[list[int], dict]: |
| 562 | """v0.1.4: greedy-decode and capture model-token IDs at decode time. |
| 563 | |
| 564 | Drives the patched ``llama-completion`` binary with |
| 565 | ``REFRACT_TRAJECTORY=<tmpfile>`` set; the binary writes one JSONL record |
| 566 | per sampled token (``{"step":N,"token_id":ID}``). We read the file back, |
| 567 | return the ID sequence, and delete the file. |
| 568 | |
| 569 | Returns (token_ids, metadata). The token IDs are the model's own |
| 570 | sampled tokens — no detokenize→retokenize round-trip, no whitespace-vs- |
| 571 | model-token unit mismatch. This is the v0.1.4 fix for GTM's structural |
| 572 | weakness (LIMITATIONS.md §1, §5). |
| 573 | |
| 574 | Requires the ``llama-completion`` binary to be built from the patched |
| 575 | ``tools/completion/completion.cpp`` (REFRACT v0.1.4 patch). If the |
| 576 | binary lacks the patch, the trajectory file will be empty and this |
| 577 | function returns ``([], meta)``. |
| 578 | """ |
| 579 | # v0.3.1: dispatch to active backend if non-llamacpp is set. |
| 580 | if _ACTIVE_BACKEND is not None and getattr(_ACTIVE_BACKEND, "name", None) != "llamacpp": |
| 581 | res = _ACTIVE_BACKEND.run_completion_trajectory( |
| 582 | model=model, prompt=prompt, kv_config_str=kv.label(), |
| 583 | n_predict=n_predict, ctx=ctx, n_gpu_layers=n_gpu_layers, |
| 584 | seed=seed, temperature=temperature, timeout=timeout, |
| 585 | apply_chat_template=apply_chat_template, system=system, |
| 586 | ) |
| 587 | return res.token_ids, res.metadata |
| 588 | |
| 589 | bin_path = _bin("llama-completion") |
| 590 | |
| 591 | fd, traj_path = tempfile.mkstemp(prefix="refract-traj-", suffix=".jsonl") |
| 592 | os.close(fd) |
| 593 | os.unlink(traj_path) # patched binary creates it itself |
| 594 | |
| 595 | cmd: list[str] = [ |
| 596 | str(bin_path), |
| 597 | "-m", str(model), |
| 598 | "-p", prompt, |
| 599 | "-n", str(n_predict), |
| 600 | "-c", str(ctx), |
| 601 | "-ngl", str(n_gpu_layers), |
| 602 | "--seed", str(seed), |
| 603 | "--temp", str(temperature), |
| 604 | "-no-cnv", |
| 605 | "--no-display-prompt", |