From e4e88c5c18feae444c4ef8ba606ca20dfad26c5d Mon Sep 17 00:00:00 2001 From: Tiara Rodney Date: Wed, 17 Jun 2026 23:47:42 +0200 Subject: [PATCH] feat(10): structured logging for the trainer The trainer was nearly silent: outside an example count and a save line it printed nothing through tokenizer load, the base-model load, example building, or the training loop, and trajectories dropped for length or empty mask vanished without a trace. Add _log.py (a shared stderr logging setup so stdout stays clean for results) and a module logger. sekft-train gains -v/--verbose and -q/--quiet. Log the run config and each phase; report dataset accounting (keepers -> usable, with counts dropped for over-length and empty-mask and a warning when any are dropped); inside the build loop, a per-trajectory debug line and a progress line every 100; raise transformers' verbosity during training so the per-step curve shows. Prints in train() and inspect() are routed through the logger. --- src/tiararodney/sekft/_log.py | 20 ++++++++++++ src/tiararodney/sekft/sft.py | 61 ++++++++++++++++++++++++++++------- 2 files changed, 70 insertions(+), 11 deletions(-) create mode 100644 src/tiararodney/sekft/_log.py diff --git a/src/tiararodney/sekft/_log.py b/src/tiararodney/sekft/_log.py new file mode 100644 index 0000000..8cb4d4f --- /dev/null +++ b/src/tiararodney/sekft/_log.py @@ -0,0 +1,20 @@ +"""Console logging setup shared by the sekft entry points. + +Logs go to stderr so stdout stays clean for a command's actual output (metrics +JSON, a path a caller might capture). Call :func:`setup` once at the top of a +``main()``; modules then log through ``logging.getLogger("sekft.")``. +""" +from __future__ import annotations + +import logging + + +def setup(verbose: bool = False, quiet: bool = False) -> None: + """Configure root logging to stderr. ``quiet`` shows warnings and worse, + ``verbose`` adds debug; the default is info.""" + level = logging.WARNING if quiet else logging.DEBUG if verbose else logging.INFO + logging.basicConfig( + level=level, + format="%(asctime)s %(levelname)-5s %(name)s %(message)s", + datefmt="%H:%M:%S", + ) diff --git a/src/tiararodney/sekft/sft.py b/src/tiararodney/sekft/sft.py index 0716f69..6acfd0f 100644 --- a/src/tiararodney/sekft/sft.py +++ b/src/tiararodney/sekft/sft.py @@ -28,10 +28,16 @@ from __future__ import annotations import argparse import json +import logging from collections.abc import Iterator from pathlib import Path from typing import Any +from ._log import setup as _setup_logging + +log = logging.getLogger("sekft.train") + + def normalize_for_template(messages: list[dict[str, str]]) -> list[dict[str, str]]: """Canonicalise a trajectory for instruct chat templates that have no system role and require strict user/assistant alternation (Mistral and friends): @@ -109,21 +115,44 @@ def train(data_dir: Path, base: str, out: Path, epochs: float, lr: float, from peft import LoraConfig, get_peft_model from transformers import (AutoModelForCausalLM, AutoTokenizer, DataCollatorForSeq2Seq, Trainer, TrainingArguments) + from transformers.utils import logging as hf_logging + # Surface the Trainer's own per-step curve (loss/lr/grad_norm); it is at + # WARNING by default, which is most of why training looks silent. + hf_logging.set_verbosity_info() + + log.info("base=%s data=%s out=%s", base, data_dir, out) + log.info("loading tokenizer: %s", base) tok = AutoTokenizer.from_pretrained(base) if tok.pad_token is None: tok.pad_token = tok.eos_token - rows = [] + log.info("building masked examples from %s ...", data_dir) + rows: list[dict[str, list[Any]]] = [] + n_seen = n_long = n_empty = 0 for turns in iter_keepers(data_dir): + n_seen += 1 ex = build_masked_example(turns, tok) - if len(ex["input_ids"]) <= max_len and any(l != -100 for l in ex["labels"]): - rows.append(ex) + log.debug(" trajectory %d: %d turns -> %d tokens, %d trained", + n_seen, len(turns), len(ex["input_ids"]), mask_stats(ex)[0]) + if n_seen % 100 == 0: + log.info(" ... %d trajectories processed, %d usable", n_seen, len(rows)) + if len(ex["input_ids"]) > max_len: + n_long += 1 + continue + if not any(l != -100 for l in ex["labels"]): + n_empty += 1 + continue + rows.append(ex) if not rows: raise SystemExit(f"no usable keeper trajectories in {data_dir}") - print(f"examples: {len(rows)}; " - f"trained/total tokens: {sum(mask_stats(r)[0] for r in rows)}" - f"/{sum(mask_stats(r)[1] for r in rows)}") + trained = sum(mask_stats(r)[0] for r in rows) + total = sum(mask_stats(r)[1] for r in rows) + log.info("dataset: %d keepers -> %d usable; %d trained / %d tokens (%.1f%% assistant)", + n_seen, len(rows), trained, total, 100 * trained / total) + if n_long or n_empty: + log.warning("dropped %d trajectories: %d over --max-len %d, %d empty-mask", + n_long + n_empty, n_long, max_len, n_empty) ds = Dataset.from_list(rows) # 4-bit (QLoRA) shrinks the base from ~14 GB to ~4 GB to move across the @@ -136,6 +165,8 @@ def train(data_dir: Path, base: str, out: Path, epochs: float, lr: float, load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.float16, bnb_4bit_use_double_quant=True, ) + log.info("loading base model: %s (%s)", base, + "4-bit QLoRA" if load_4bit else "fp16") model = AutoModelForCausalLM.from_pretrained( base, dtype=torch.float16, quantization_config=quant) if load_4bit: @@ -148,7 +179,9 @@ def train(data_dir: Path, base: str, out: Path, epochs: float, lr: float, r=lora_r, lora_alpha=lora_r * 2, lora_dropout=0.05, task_type="CAUSAL_LM", target_modules=["q_proj", "k_proj", "v_proj", "o_proj"], )) - model.print_trainable_parameters() + n_train, n_all = model.get_nb_trainable_parameters() + log.info("LoRA r=%d: %d trainable / %d params (%.3f%%)", + lora_r, n_train, n_all, 100 * n_train / n_all) args = TrainingArguments( output_dir=str(out), per_device_train_batch_size=batch, @@ -161,18 +194,21 @@ def train(data_dir: Path, base: str, out: Path, epochs: float, lr: float, model=model, args=args, train_dataset=ds, data_collator=DataCollatorForSeq2Seq(tok, padding=True, label_pad_token_id=-100), ) + log.info("training: %g epochs, lr=%g, batch=%d x accum=%d (effective %d), max_len=%d", + epochs, lr, batch, accum, batch * accum, max_len) trainer.train() model.save_pretrained(str(out)) tok.save_pretrained(str(out)) # durable, greppable record of the curve (loss/lr/grad_norm per step). (out / "log_history.jsonl").write_text( "\n".join(json.dumps(r) for r in trainer.state.log_history)) - print(f"saved LoRA adapter + log_history.jsonl -> {out} " - f"(tensorboard: --logdir {out / 'runs'})") + log.info("saved LoRA adapter + log_history.jsonl -> %s (tensorboard: --logdir %s)", + out, out / "runs") def inspect(data_dir: Path, base: str) -> None: from transformers import AutoTokenizer + log.info("loading tokenizer: %s", base) tok = AutoTokenizer.from_pretrained(base) n = tt = tr = 0 for turns in iter_keepers(data_dir): @@ -181,8 +217,8 @@ def inspect(data_dir: Path, base: str) -> None: tr += t; tt += total; n += 1 if not n: raise SystemExit(f"no keeper trajectories in {data_dir}") - print(f"{n} keeper trajectories; {tr}/{tt} tokens trained " - f"({100*tr/tt:.1f}% assistant, rest masked)") + log.info("%d keeper trajectories; %d/%d tokens trained (%.1f%% assistant, rest masked)", + n, tr, tt, 100 * tr / tt) def main() -> None: @@ -199,7 +235,10 @@ def main() -> None: ap.add_argument("--lora-r", type=int, default=16) ap.add_argument("--load-4bit", action="store_true", help="QLoRA: load base in 4-bit (less to move over the link, less VRAM)") + ap.add_argument("-v", "--verbose", action="store_true", help="debug-level logging") + ap.add_argument("-q", "--quiet", action="store_true", help="warnings and errors only") ns = ap.parse_args() + _setup_logging(verbose=ns.verbose, quiet=ns.quiet) if ns.inspect: inspect(ns.data, ns.base) else: