feat(10): structured logging for the trainer

The trainer was nearly silent: outside an example count and a save line it printed nothing through tokenizer load, the base-model load, example building, or the training loop, and trajectories dropped for length or empty mask vanished without a trace. Add _log.py (a shared stderr logging setup so stdout stays clean for results) and a module logger. sekft-train gains -v/--verbose and -q/--quiet. Log the run config and each phase; report dataset accounting (keepers -> usable, with counts dropped for over-length and empty-mask and a warning when any are dropped); inside the build loop, a per-trajectory debug line and a progress line every 100; raise transformers' verbosity during training so the per-step curve shows. Prints in train() and inspect() are routed through the logger.
2026-06-17 23:47:42 +02:00 · 2026-06-17 23:47:42 +02:00 · e4e88c5c18
commit e4e88c5c18
parent 47b84a0dce
2 changed files with 70 additions and 11 deletions
--- a/src/tiararodney/sekft/_log.py
+++ b/src/tiararodney/sekft/_log.py
@ -0,0 +1,20 @@
 """Console logging setup shared by the sekft entry points.
 Logs go to stderr so stdout stays clean for a command's actual output (metrics
 JSON, a path a caller might capture). Call :func:`setup` once at the top of a
 ``main()``; modules then log through ``logging.getLogger("sekft.<area>")``.
 """
 from __future__ import annotations
 import logging
 def setup(verbose: bool = False, quiet: bool = False) -> None:
    """Configure root logging to stderr. ``quiet`` shows warnings and worse,
    ``verbose`` adds debug; the default is info."""
    level = logging.WARNING if quiet else logging.DEBUG if verbose else logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s %(levelname)-5s %(name)s  %(message)s",
        datefmt="%H:%M:%S",
    )
--- a/src/tiararodney/sekft/sft.py
+++ b/src/tiararodney/sekft/sft.py
@ -28,10 +28,16 @@ from __future__ import annotations
 import argparse
 import json
 import logging
 from collections.abc import Iterator
 from pathlib import Path
 from typing import Any
 from ._log import setup as _setup_logging
 log = logging.getLogger("sekft.train")
 def normalize_for_template(messages: list[dict[str, str]]) -> list[dict[str, str]]:
    """Canonicalise a trajectory for instruct chat templates that have no system
    role and require strict user/assistant alternation (Mistral and friends):
@ -109,21 +115,44 @@ def train(data_dir: Path, base: str, out: Path, epochs: float, lr: float,
    from peft import LoraConfig, get_peft_model
    from transformers import (AutoModelForCausalLM, AutoTokenizer,
                              DataCollatorForSeq2Seq, Trainer, TrainingArguments)
    from transformers.utils import logging as hf_logging
    # Surface the Trainer's own per-step curve (loss/lr/grad_norm); it is at
    # WARNING by default, which is most of why training looks silent.
    hf_logging.set_verbosity_info()
    log.info("base=%s  data=%s  out=%s", base, data_dir, out)
    log.info("loading tokenizer: %s", base)
    tok = AutoTokenizer.from_pretrained(base)
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
-    rows = []
+    log.info("building masked examples from %s ...", data_dir)
    rows: list[dict[str, list[Any]]] = []
    n_seen = n_long = n_empty = 0
    for turns in iter_keepers(data_dir):
        n_seen += 1
        ex = build_masked_example(turns, tok)
-        if len(ex["input_ids"]) <= max_len and any(l != -100 for l in ex["labels"]):
+        log.debug("  trajectory %d: %d turns -> %d tokens, %d trained",
-            rows.append(ex)
+                  n_seen, len(turns), len(ex["input_ids"]), mask_stats(ex)[0])
        if n_seen % 100 == 0:
            log.info("  ... %d trajectories processed, %d usable", n_seen, len(rows))
        if len(ex["input_ids"]) > max_len:
            n_long += 1
            continue
        if not any(l != -100 for l in ex["labels"]):
            n_empty += 1
            continue
        rows.append(ex)
    if not rows:
        raise SystemExit(f"no usable keeper trajectories in {data_dir}")
-    print(f"examples: {len(rows)}; "
+    trained = sum(mask_stats(r)[0] for r in rows)
-          f"trained/total tokens: {sum(mask_stats(r)[0] for r in rows)}"
+    total = sum(mask_stats(r)[1] for r in rows)
-          f"/{sum(mask_stats(r)[1] for r in rows)}")
+    log.info("dataset: %d keepers -> %d usable; %d trained / %d tokens (%.1f%% assistant)",
             n_seen, len(rows), trained, total, 100 * trained / total)
    if n_long or n_empty:
        log.warning("dropped %d trajectories: %d over --max-len %d, %d empty-mask",
                    n_long + n_empty, n_long, max_len, n_empty)
    ds = Dataset.from_list(rows)
    # 4-bit (QLoRA) shrinks the base from ~14 GB to ~4 GB to move across the
@ -136,6 +165,8 @@ def train(data_dir: Path, base: str, out: Path, epochs: float, lr: float,
            load_in_4bit=True, bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True,
        )
    log.info("loading base model: %s (%s)", base,
             "4-bit QLoRA" if load_4bit else "fp16")
    model = AutoModelForCausalLM.from_pretrained(
        base, dtype=torch.float16, quantization_config=quant)
    if load_4bit:
@ -148,7 +179,9 @@ def train(data_dir: Path, base: str, out: Path, epochs: float, lr: float,
        r=lora_r, lora_alpha=lora_r * 2, lora_dropout=0.05, task_type="CAUSAL_LM",
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    ))
-    model.print_trainable_parameters()
+    n_train, n_all = model.get_nb_trainable_parameters()
    log.info("LoRA r=%d: %d trainable / %d params (%.3f%%)",
             lora_r, n_train, n_all, 100 * n_train / n_all)
    args = TrainingArguments(
        output_dir=str(out), per_device_train_batch_size=batch,
@ -161,18 +194,21 @@ def train(data_dir: Path, base: str, out: Path, epochs: float, lr: float,
        model=model, args=args, train_dataset=ds,
        data_collator=DataCollatorForSeq2Seq(tok, padding=True, label_pad_token_id=-100),
    )
    log.info("training: %g epochs, lr=%g, batch=%d x accum=%d (effective %d), max_len=%d",
             epochs, lr, batch, accum, batch * accum, max_len)
    trainer.train()
    model.save_pretrained(str(out))
    tok.save_pretrained(str(out))
    # durable, greppable record of the curve (loss/lr/grad_norm per step).
    (out / "log_history.jsonl").write_text(
        "\n".join(json.dumps(r) for r in trainer.state.log_history))
-    print(f"saved LoRA adapter + log_history.jsonl -> {out}  "
+    log.info("saved LoRA adapter + log_history.jsonl -> %s  (tensorboard: --logdir %s)",
-          f"(tensorboard: --logdir {out / 'runs'})")
+             out, out / "runs")
 def inspect(data_dir: Path, base: str) -> None:
    from transformers import AutoTokenizer
    log.info("loading tokenizer: %s", base)
    tok = AutoTokenizer.from_pretrained(base)
    n = tt = tr = 0
    for turns in iter_keepers(data_dir):
@ -181,8 +217,8 @@ def inspect(data_dir: Path, base: str) -> None:
        tr += t; tt += total; n += 1
    if not n:
        raise SystemExit(f"no keeper trajectories in {data_dir}")
-    print(f"{n} keeper trajectories; {tr}/{tt} tokens trained "
+    log.info("%d keeper trajectories; %d/%d tokens trained (%.1f%% assistant, rest masked)",
-          f"({100*tr/tt:.1f}% assistant, rest masked)")
+             n, tr, tt, 100 * tr / tt)
 def main() -> None:
@ -199,7 +235,10 @@ def main() -> None:
    ap.add_argument("--lora-r", type=int, default=16)
    ap.add_argument("--load-4bit", action="store_true",
                    help="QLoRA: load base in 4-bit (less to move over the link, less VRAM)")
    ap.add_argument("-v", "--verbose", action="store_true", help="debug-level logging")
    ap.add_argument("-q", "--quiet", action="store_true", help="warnings and errors only")
    ns = ap.parse_args()
    _setup_logging(verbose=ns.verbose, quiet=ns.quiet)
    if ns.inspect:
        inspect(ns.data, ns.base)
    else: