feat(10): structured logging for the trainer

The trainer was nearly silent: outside an example count and a save line it
printed nothing through tokenizer load, the base-model load, example building, or
the training loop, and trajectories dropped for length or empty mask vanished
without a trace.

Add _log.py (a shared stderr logging setup so stdout stays clean for results)
and a module logger. sekft-train gains -v/--verbose and -q/--quiet. Log the run
config and each phase; report dataset accounting (keepers -> usable, with counts
dropped for over-length and empty-mask and a warning when any are dropped);
inside the build loop, a per-trajectory debug line and a progress line every 100;
raise transformers' verbosity during training so the per-step curve shows. Prints
in train() and inspect() are routed through the logger.
This commit is contained in:
Tiara Rodney 2026-06-17 23:47:42 +02:00
parent 47b84a0dce
commit e4e88c5c18
Signed by: tiara
GPG key ID: 5CD8EC1D46106723
2 changed files with 70 additions and 11 deletions

View file

@ -0,0 +1,20 @@
"""Console logging setup shared by the sekft entry points.
Logs go to stderr so stdout stays clean for a command's actual output (metrics
JSON, a path a caller might capture). Call :func:`setup` once at the top of a
``main()``; modules then log through ``logging.getLogger("sekft.<area>")``.
"""
from __future__ import annotations
import logging
def setup(verbose: bool = False, quiet: bool = False) -> None:
"""Configure root logging to stderr. ``quiet`` shows warnings and worse,
``verbose`` adds debug; the default is info."""
level = logging.WARNING if quiet else logging.DEBUG if verbose else logging.INFO
logging.basicConfig(
level=level,
format="%(asctime)s %(levelname)-5s %(name)s %(message)s",
datefmt="%H:%M:%S",
)

View file

@ -28,10 +28,16 @@ from __future__ import annotations
import argparse
import json
import logging
from collections.abc import Iterator
from pathlib import Path
from typing import Any
from ._log import setup as _setup_logging
log = logging.getLogger("sekft.train")
def normalize_for_template(messages: list[dict[str, str]]) -> list[dict[str, str]]:
"""Canonicalise a trajectory for instruct chat templates that have no system
role and require strict user/assistant alternation (Mistral and friends):
@ -109,21 +115,44 @@ def train(data_dir: Path, base: str, out: Path, epochs: float, lr: float,
from peft import LoraConfig, get_peft_model
from transformers import (AutoModelForCausalLM, AutoTokenizer,
DataCollatorForSeq2Seq, Trainer, TrainingArguments)
from transformers.utils import logging as hf_logging
# Surface the Trainer's own per-step curve (loss/lr/grad_norm); it is at
# WARNING by default, which is most of why training looks silent.
hf_logging.set_verbosity_info()
log.info("base=%s data=%s out=%s", base, data_dir, out)
log.info("loading tokenizer: %s", base)
tok = AutoTokenizer.from_pretrained(base)
if tok.pad_token is None:
tok.pad_token = tok.eos_token
rows = []
log.info("building masked examples from %s ...", data_dir)
rows: list[dict[str, list[Any]]] = []
n_seen = n_long = n_empty = 0
for turns in iter_keepers(data_dir):
n_seen += 1
ex = build_masked_example(turns, tok)
if len(ex["input_ids"]) <= max_len and any(l != -100 for l in ex["labels"]):
log.debug(" trajectory %d: %d turns -> %d tokens, %d trained",
n_seen, len(turns), len(ex["input_ids"]), mask_stats(ex)[0])
if n_seen % 100 == 0:
log.info(" ... %d trajectories processed, %d usable", n_seen, len(rows))
if len(ex["input_ids"]) > max_len:
n_long += 1
continue
if not any(l != -100 for l in ex["labels"]):
n_empty += 1
continue
rows.append(ex)
if not rows:
raise SystemExit(f"no usable keeper trajectories in {data_dir}")
print(f"examples: {len(rows)}; "
f"trained/total tokens: {sum(mask_stats(r)[0] for r in rows)}"
f"/{sum(mask_stats(r)[1] for r in rows)}")
trained = sum(mask_stats(r)[0] for r in rows)
total = sum(mask_stats(r)[1] for r in rows)
log.info("dataset: %d keepers -> %d usable; %d trained / %d tokens (%.1f%% assistant)",
n_seen, len(rows), trained, total, 100 * trained / total)
if n_long or n_empty:
log.warning("dropped %d trajectories: %d over --max-len %d, %d empty-mask",
n_long + n_empty, n_long, max_len, n_empty)
ds = Dataset.from_list(rows)
# 4-bit (QLoRA) shrinks the base from ~14 GB to ~4 GB to move across the
@ -136,6 +165,8 @@ def train(data_dir: Path, base: str, out: Path, epochs: float, lr: float,
load_in_4bit=True, bnb_4bit_quant_type="nf4",
bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True,
)
log.info("loading base model: %s (%s)", base,
"4-bit QLoRA" if load_4bit else "fp16")
model = AutoModelForCausalLM.from_pretrained(
base, dtype=torch.float16, quantization_config=quant)
if load_4bit:
@ -148,7 +179,9 @@ def train(data_dir: Path, base: str, out: Path, epochs: float, lr: float,
r=lora_r, lora_alpha=lora_r * 2, lora_dropout=0.05, task_type="CAUSAL_LM",
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
))
model.print_trainable_parameters()
n_train, n_all = model.get_nb_trainable_parameters()
log.info("LoRA r=%d: %d trainable / %d params (%.3f%%)",
lora_r, n_train, n_all, 100 * n_train / n_all)
args = TrainingArguments(
output_dir=str(out), per_device_train_batch_size=batch,
@ -161,18 +194,21 @@ def train(data_dir: Path, base: str, out: Path, epochs: float, lr: float,
model=model, args=args, train_dataset=ds,
data_collator=DataCollatorForSeq2Seq(tok, padding=True, label_pad_token_id=-100),
)
log.info("training: %g epochs, lr=%g, batch=%d x accum=%d (effective %d), max_len=%d",
epochs, lr, batch, accum, batch * accum, max_len)
trainer.train()
model.save_pretrained(str(out))
tok.save_pretrained(str(out))
# durable, greppable record of the curve (loss/lr/grad_norm per step).
(out / "log_history.jsonl").write_text(
"\n".join(json.dumps(r) for r in trainer.state.log_history))
print(f"saved LoRA adapter + log_history.jsonl -> {out} "
f"(tensorboard: --logdir {out / 'runs'})")
log.info("saved LoRA adapter + log_history.jsonl -> %s (tensorboard: --logdir %s)",
out, out / "runs")
def inspect(data_dir: Path, base: str) -> None:
from transformers import AutoTokenizer
log.info("loading tokenizer: %s", base)
tok = AutoTokenizer.from_pretrained(base)
n = tt = tr = 0
for turns in iter_keepers(data_dir):
@ -181,8 +217,8 @@ def inspect(data_dir: Path, base: str) -> None:
tr += t; tt += total; n += 1
if not n:
raise SystemExit(f"no keeper trajectories in {data_dir}")
print(f"{n} keeper trajectories; {tr}/{tt} tokens trained "
f"({100*tr/tt:.1f}% assistant, rest masked)")
log.info("%d keeper trajectories; %d/%d tokens trained (%.1f%% assistant, rest masked)",
n, tr, tt, 100 * tr / tt)
def main() -> None:
@ -199,7 +235,10 @@ def main() -> None:
ap.add_argument("--lora-r", type=int, default=16)
ap.add_argument("--load-4bit", action="store_true",
help="QLoRA: load base in 4-bit (less to move over the link, less VRAM)")
ap.add_argument("-v", "--verbose", action="store_true", help="debug-level logging")
ap.add_argument("-q", "--quiet", action="store_true", help="warnings and errors only")
ns = ap.parse_args()
_setup_logging(verbose=ns.verbose, quiet=ns.quiet)
if ns.inspect:
inspect(ns.data, ns.base)
else: