Merge branch 'develop'

This commit is contained in:
Tiara Rodney 2026-06-18 12:37:18 +02:00
commit a76470e55d
Signed by: tiara
GPG key ID: 5CD8EC1D46106723
4 changed files with 73 additions and 2 deletions

View file

@ -7,6 +7,18 @@ are documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
and the project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html). and the project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [1.0.1] - 2026-06-18
### Fixed
- `build_masked_example` could not derive the assistant mask on transformers
≥ 5: `apply_chat_template` now returns a `BatchEncoding` (`{input_ids: [...]}`)
where 4.x returned a bare `list[int]`, so the render was treated as a dict and
the prefix-differencing spuriously raised "chat template is not additive" on
every real model. The id sequence is now extracted either way; verified the
assistant-only mask against `mistralai/Mistral-7B-Instruct-v0.2`. The
fake-tokenizer test gained a `BatchEncoding`-returning variant so this can't
regress.
## [1.0.0] - 2026-06-18 ## [1.0.0] - 2026-06-18
First release: the training and evaluation pipeline that turns posix-sdc First release: the training and evaluation pipeline that turns posix-sdc
@ -38,4 +50,5 @@ trajectories into a fine-tuned shell operator.
mypy-strict codebase; an optional `[gpu]` extra (torch / transformers / peft); mypy-strict codebase; an optional `[gpu]` extra (torch / transformers / peft);
and a dependency on `posix-sdc[hub]`. Released under GPL-2.0. and a dependency on `posix-sdc[hub]`. Released under GPL-2.0.
[1.0.1]: https://git.code.tiararodney.com/tiara/sekft/compare/v1.0.0...v1.0.1
[1.0.0]: https://git.code.tiararodney.com/tiara/sekft/releases/tag/v1.0.0 [1.0.0]: https://git.code.tiararodney.com/tiara/sekft/releases/tag/v1.0.0

22
TODO
View file

@ -249,3 +249,25 @@ Description: The lock committed with the triplet (#13) predated the published
and its transitive deps into the lock. Commit the refreshed and its transitive deps into the lock. Commit the refreshed
Pipfile.lock so the next machine installs the published wheel with Pipfile.lock so the next machine installs the published wheel with
the Hub path available. the Hub path available.
--ISSUE
Content-Type: application/issue
ID: 15
Type: bugfix
Title: apply_chat_template returns BatchEncoding on transformers 5.x
Status: done
Priority: high
Created: 2026-06-18
Module: sekft
Relationships:
Description: build_masked_example assumed apply_chat_template returns a flat
list[int] (transformers 4.x). On transformers 5.x it returns a
BatchEncoding ({input_ids: [...]}), so ids was a dict, len(ids) was
the key count, and the prefix-differencing spuriously raised 'chat
template is not additive' on every real model (verified against
mistralai/Mistral-7B-Instruct-v0.2). The masking logic is sound and
the Mistral template is additive; only the return type needs
normalising. Add a _render_ids helper that extracts input_ids when
the result is dict-like, and use it for both renders. The
fake-tokenizer test returned a bare list and missed this, so add a
BatchEncoding-returning fake and assert the mask matches.

View file

@ -62,6 +62,19 @@ def normalize_for_template(messages: list[dict[str, str]]) -> list[dict[str, str
return out return out
def _render_ids(tokenizer: Any, msgs: list[dict[str, str]]) -> Any:
"""Token ids for a rendered conversation, as a flat sequence.
``apply_chat_template`` returns a ``BatchEncoding`` (``{input_ids: [...]}``)
on transformers >= 5, where 4.x returned a bare ``list[int]``. Normalise to
the id sequence either way, so the prefix-differencing below diffs tokens and
not a dict (a dict makes ``len`` the key count and spuriously trips the
not-additive guard).
"""
out = tokenizer.apply_chat_template(msgs, add_generation_prompt=False)
return out["input_ids"] if hasattr(out, "keys") else out
def build_masked_example(messages: list[dict[str, str]], tokenizer: Any) -> dict[str, list[Any]]: def build_masked_example(messages: list[dict[str, str]], tokenizer: Any) -> dict[str, list[Any]]:
"""Tokenize a trajectory with the tokenizer's OWN chat template and build an """Tokenize a trajectory with the tokenizer's OWN chat template and build an
assistant-only loss mask. assistant-only loss mask.
@ -76,11 +89,11 @@ def build_masked_example(messages: list[dict[str, str]], tokenizer: Any) -> dict
non-additive one raises rather than silently mis-mask. non-additive one raises rather than silently mis-mask.
""" """
msgs = normalize_for_template(messages) msgs = normalize_for_template(messages)
ids = tokenizer.apply_chat_template(msgs, add_generation_prompt=False) ids = _render_ids(tokenizer, msgs)
labels = [-100] * len(ids) labels = [-100] * len(ids)
prev: list[int] = [] prev: list[int] = []
for i, m in enumerate(msgs): for i, m in enumerate(msgs):
upto = tokenizer.apply_chat_template(msgs[:i + 1], add_generation_prompt=False) upto = _render_ids(tokenizer, msgs[:i + 1])
if ids[:len(upto)] != upto or upto[:len(prev)] != prev: if ids[:len(upto)] != upto or upto[:len(prev)] != prev:
raise ValueError("chat template is not additive; cannot derive an " raise ValueError("chat template is not additive; cannot derive an "
"assistant loss mask by token-prefix differencing") "assistant loss mask by token-prefix differencing")

View file

@ -27,6 +27,15 @@ class FakeTok:
return toks return toks
class FakeTokBatchEncoding(FakeTok):
"""Like FakeTok, but returns a dict as transformers >= 5's
``apply_chat_template`` does (a BatchEncoding), to exercise the id-extraction."""
def apply_chat_template(self, msgs: list[dict[str, str]], add_generation_prompt: bool = False,
return_tensors: Any = None) -> dict[str, list[str]]:
return {"input_ids": super().apply_chat_template(msgs, add_generation_prompt, return_tensors)}
def test_normalize_folds_system_and_merges_consecutive() -> None: def test_normalize_folds_system_and_merges_consecutive() -> None:
raw = [ raw = [
{"role": "system", "content": "orient"}, {"role": "system", "content": "orient"},
@ -63,6 +72,20 @@ def test_mask_trains_assistant_turns_only() -> None:
assert {"orient", "login", "out"} <= set(masked) # environment masked assert {"orient", "login", "out"} <= set(masked) # environment masked
def test_mask_handles_batchencoding_return() -> None:
# transformers >= 5 returns a BatchEncoding ({input_ids: [...]}) rather than a
# bare list[int]; the mask must come out identical. Regression for the 5.x bug
# that made every real template look "not additive".
raw = [
{"role": "user", "content": "login"},
{"role": "assistant", "content": "cat f"},
{"role": "user", "content": "out"},
{"role": "assistant", "content": "exit"},
]
assert (sft.build_masked_example(raw, FakeTokBatchEncoding())
== sft.build_masked_example(raw, FakeTok()))
def test_mask_raises_on_non_additive_template() -> None: def test_mask_raises_on_non_additive_template() -> None:
class BadTok: class BadTok:
def apply_chat_template(self, msgs: list[dict[str, str]], add_generation_prompt: bool = False, def apply_chat_template(self, msgs: list[dict[str, str]], add_generation_prompt: bool = False,