| 16 | |
| 17 | |
| 18 | class MetricsLogger: |
| 19 | def __init__( |
| 20 | self, |
| 21 | stage: str, |
| 22 | log_dir: str, |
| 23 | *, |
| 24 | use_wandb: bool = False, |
| 25 | wandb_project: str = "train-llm-from-scratch-posttrain", |
| 26 | config: dict | None = None, |
| 27 | run_name: str | None = None, |
| 28 | ) -> None: |
| 29 | os.makedirs(log_dir, exist_ok=True) |
| 30 | self.stage = stage |
| 31 | # No Date.now in scripts here; use time.time() (allowed) for unique filenames. |
| 32 | stamp = int(time.time()) |
| 33 | self.path = os.path.join(log_dir, f"{stage}_{stamp}.jsonl") |
| 34 | self._fh = open(self.path, "a") |
| 35 | self._wandb = None |
| 36 | if use_wandb: |
| 37 | try: |
| 38 | import wandb |
| 39 | |
| 40 | wandb.init(project=wandb_project, name=run_name or f"{stage}-{stamp}", config=config or {}) |
| 41 | self._wandb = wandb |
| 42 | except Exception as e: # noqa: BLE001 - logging must never crash training |
| 43 | print(f"[logger] wandb disabled ({e}); JSONL logging only -> {self.path}") |
| 44 | print(f"[logger] stage={stage} -> {self.path}") |
| 45 | |
| 46 | def log(self, step: int, metrics: dict[str, Any]) -> None: |
| 47 | record = {"step": step, "wall": time.time(), **metrics} |
| 48 | self._fh.write(json.dumps(record) + "\n") |
| 49 | self._fh.flush() |
| 50 | if self._wandb is not None: |
| 51 | self._wandb.log(metrics, step=step) |
| 52 | |
| 53 | def close(self) -> None: |
| 54 | try: |
| 55 | self._fh.close() |
| 56 | finally: |
| 57 | if self._wandb is not None: |
| 58 | self._wandb.finish() |