From 4987d951ce22a54f131ae9e051bca39ba875a202 Mon Sep 17 00:00:00 2001
From: Tiara Rodney <tiara.rodney@byteb4rb1e.me>
Date: Thu, 18 Jun 2026 12:37:01 +0200
Subject: [PATCH] bugfix(15): normalise apply_chat_template's BatchEncoding
 (transformers 5.x)

apply_chat_template returns a BatchEncoding ({input_ids: [...]}) on transformers
>= 5 where 4.x returned a bare list[int]. build_masked_example used that mapping
as if it were the id list, so len/slicing acted on the dict rather than the
tokens and the prefix-differencing spuriously raised "chat template is not
additive" on every real model. Extract the id sequence via a _render_ids helper;
verified the assistant-only mask against mistralai/Mistral-7B-Instruct-v0.2. The
fake tokenizer returned a bare list and missed this, so a BatchEncoding-returning
variant now guards it.
---
 CHANGELOG.md                 | 13 +++++++++++++
 src/tiararodney/sekft/sft.py | 17 +++++++++++++++--
 tests/unit/test_sft.py       | 23 +++++++++++++++++++++++
 3 files changed, 51 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4408334..0f331f5 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -7,6 +7,18 @@ are documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
 and the project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.0.1] - 2026-06-18
+
+### Fixed
+- `build_masked_example` could not derive the assistant mask on transformers
+  ≥ 5: `apply_chat_template` now returns a `BatchEncoding` (`{input_ids: [...]}`)
+  where 4.x returned a bare `list[int]`, so the mapping was used as the id list and
+  the prefix-differencing spuriously raised "chat template is not additive" on
+  every real model. The id sequence is now extracted either way; verified the
+  assistant-only mask against `mistralai/Mistral-7B-Instruct-v0.2`. The
+  fake-tokenizer test gained a `BatchEncoding`-returning variant so this can't
+  regress.
+
 ## [1.0.0] - 2026-06-18
 
 First release: the training and evaluation pipeline that turns posix-sdc
@@ -38,4 +50,5 @@ trajectories into a fine-tuned shell operator.
   mypy-strict codebase; an optional `[gpu]` extra (torch / transformers / peft);
   and a dependency on `posix-sdc[hub]`. Released under GPL-2.0.
 
+[1.0.1]: https://git.code.tiararodney.com/tiara/sekft/compare/v1.0.0...v1.0.1
 [1.0.0]: https://git.code.tiararodney.com/tiara/sekft/releases/tag/v1.0.0
diff --git a/src/tiararodney/sekft/sft.py b/src/tiararodney/sekft/sft.py
index ce0e478..4e40362 100644
--- a/src/tiararodney/sekft/sft.py
+++ b/src/tiararodney/sekft/sft.py
@@ -62,6 +62,19 @@ def normalize_for_template(messages: list[dict[str, str]]) -> list[dict[str, str
     return out
 
 
+def _render_ids(tokenizer: Any, msgs: list[dict[str, str]]) -> Any:
+    """Render ``msgs`` via the chat template; return the flat token-id sequence.
+
+    ``apply_chat_template`` returns a ``BatchEncoding`` (``{input_ids: [...]}``)
+    on transformers >= 5, where 4.x returned a bare ``list[int]``. Anything with a
+    ``keys`` attribute is unwrapped to its ``"input_ids"`` entry; any other result
+    is returned unchanged. Callers then diff tokens, not a mapping (whose ``len``
+    is its key count, which spuriously trips the not-additive guard).
+    """
+    out = tokenizer.apply_chat_template(msgs, add_generation_prompt=False)
+    return out["input_ids"] if hasattr(out, "keys") else out
+
+
 def build_masked_example(messages: list[dict[str, str]], tokenizer: Any) -> dict[str, list[Any]]:
     """Tokenize a trajectory with the tokenizer's OWN chat template and build an
     assistant-only loss mask.
@@ -76,11 +89,11 @@ def build_masked_example(messages: list[dict[str, str]], tokenizer: Any) -> dict
     non-additive one raises rather than silently mis-mask.
     """
     msgs = normalize_for_template(messages)
-    ids = tokenizer.apply_chat_template(msgs, add_generation_prompt=False)
+    ids = _render_ids(tokenizer, msgs)
     labels = [-100] * len(ids)
     prev: list[int] = []
     for i, m in enumerate(msgs):
-        upto = tokenizer.apply_chat_template(msgs[:i + 1], add_generation_prompt=False)
+        upto = _render_ids(tokenizer, msgs[:i + 1])
         if ids[:len(upto)] != upto or upto[:len(prev)] != prev:
             raise ValueError("chat template is not additive; cannot derive an "
                              "assistant loss mask by token-prefix differencing")
diff --git a/tests/unit/test_sft.py b/tests/unit/test_sft.py
index d24eef0..d4bf179 100644
--- a/tests/unit/test_sft.py
+++ b/tests/unit/test_sft.py
@@ -27,6 +27,15 @@ class FakeTok:
         return toks
 
 
+class FakeTokBatchEncoding(FakeTok):
+    """FakeTok whose ``apply_chat_template`` wraps its tokens as ``{"input_ids": ...}``,
+    a plain-dict stand-in for the BatchEncoding that transformers >= 5 returns."""
+
+    def apply_chat_template(self, msgs: list[dict[str, str]], add_generation_prompt: bool = False,
+                            return_tensors: Any = None) -> dict[str, list[str]]:
+        return {"input_ids": super().apply_chat_template(msgs, add_generation_prompt, return_tensors)}
+
+
 def test_normalize_folds_system_and_merges_consecutive() -> None:
     raw = [
         {"role": "system", "content": "orient"},
@@ -63,6 +72,20 @@ def test_mask_trains_assistant_turns_only() -> None:
     assert {"orient", "login", "out"} <= set(masked)       # environment masked
 
 
+def test_mask_handles_batchencoding_return() -> None:
+    # transformers >= 5 returns a BatchEncoding ({input_ids: [...]}) rather than a
+    # bare list[int]; both return shapes must yield the same masked example.
+    # Regression for the 5.x bug that made every real template look "not additive".
+    raw = [
+        {"role": "user", "content": "login"},
+        {"role": "assistant", "content": "cat f"},
+        {"role": "user", "content": "out"},
+        {"role": "assistant", "content": "exit"},
+    ]
+    assert (sft.build_masked_example(raw, FakeTokBatchEncoding())
+            == sft.build_masked_example(raw, FakeTok()))
+
+
 def test_mask_raises_on_non_additive_template() -> None:
     class BadTok:
         def apply_chat_template(self, msgs: list[dict[str, str]], add_generation_prompt: bool = False,