diff --git a/CHANGELOG.md b/CHANGELOG.md index 0f331f5..4408334 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,18 +7,6 @@ are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and the project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [1.0.1] - 2026-06-18 - -### Fixed -- `build_masked_example` could not derive the assistant mask on transformers - ≥ 5: `apply_chat_template` now returns a `BatchEncoding` (`{input_ids: [...]}`) - where 4.x returned a bare `list[int]`, so the render was treated as a dict and - the prefix-differencing spuriously raised "chat template is not additive" on - every real model. The id sequence is now extracted either way; verified the - assistant-only mask against `mistralai/Mistral-7B-Instruct-v0.2`. The - fake-tokenizer test gained a `BatchEncoding`-returning variant so this can't - regress. - ## [1.0.0] - 2026-06-18 First release: the training and evaluation pipeline that turns posix-sdc @@ -50,5 +38,4 @@ trajectories into a fine-tuned shell operator. mypy-strict codebase; an optional `[gpu]` extra (torch / transformers / peft); and a dependency on `posix-sdc[hub]`. Released under GPL-2.0. -[1.0.1]: https://git.code.tiararodney.com/tiara/sekft/compare/v1.0.0...v1.0.1 [1.0.0]: https://git.code.tiararodney.com/tiara/sekft/releases/tag/v1.0.0 diff --git a/TODO b/TODO index 63768b4..4f2c672 100644 --- a/TODO +++ b/TODO @@ -249,25 +249,3 @@ Description: The lock committed with the triplet (#13) predated the published and its transitive deps into the lock. Commit the refreshed Pipfile.lock so the next machine installs the published wheel with the Hub path available. - ---ISSUE -Content-Type: application/issue -ID: 15 -Type: bugfix -Title: apply_chat_template returns BatchEncoding on transformers 5.x -Status: done -Priority: high -Created: 2026-06-18 -Module: sekft -Relationships: -Description: build_masked_example assumed apply_chat_template returns a flat - list[int] (transformers 4.x). On transformers 5.x it returns a - BatchEncoding ({input_ids: [...]}), so ids was a dict, len(ids) was - the key count, and the prefix-differencing spuriously raised 'chat - template is not additive' on every real model (verified against - mistralai/Mistral-7B-Instruct-v0.2). The masking logic is sound and - the Mistral template is additive; only the return type needs - normalising. Add a _render_ids helper that extracts input_ids when - the result is dict-like, and use it for both renders. The - fake-tokenizer test returned a bare list and missed this, so add a - BatchEncoding-returning fake and assert the mask matches. diff --git a/src/tiararodney/sekft/sft.py b/src/tiararodney/sekft/sft.py index 4e40362..ce0e478 100644 --- a/src/tiararodney/sekft/sft.py +++ b/src/tiararodney/sekft/sft.py @@ -62,19 +62,6 @@ def normalize_for_template(messages: list[dict[str, str]]) -> list[dict[str, str return out -def _render_ids(tokenizer: Any, msgs: list[dict[str, str]]) -> Any: - """Token ids for a rendered conversation, as a flat sequence. - - ``apply_chat_template`` returns a ``BatchEncoding`` (``{input_ids: [...]}``) - on transformers >= 5, where 4.x returned a bare ``list[int]``. Normalise to - the id sequence either way, so the prefix-differencing below diffs tokens and - not a dict (a dict makes ``len`` the key count and spuriously trips the - not-additive guard). - """ - out = tokenizer.apply_chat_template(msgs, add_generation_prompt=False) - return out["input_ids"] if hasattr(out, "keys") else out - - def build_masked_example(messages: list[dict[str, str]], tokenizer: Any) -> dict[str, list[Any]]: """Tokenize a trajectory with the tokenizer's OWN chat template and build an assistant-only loss mask. @@ -89,11 +76,11 @@ def build_masked_example(messages: list[dict[str, str]], tokenizer: Any) -> dict non-additive one raises rather than silently mis-mask. """ msgs = normalize_for_template(messages) - ids = _render_ids(tokenizer, msgs) + ids = tokenizer.apply_chat_template(msgs, add_generation_prompt=False) labels = [-100] * len(ids) prev: list[int] = [] for i, m in enumerate(msgs): - upto = _render_ids(tokenizer, msgs[:i + 1]) + upto = tokenizer.apply_chat_template(msgs[:i + 1], add_generation_prompt=False) if ids[:len(upto)] != upto or upto[:len(prev)] != prev: raise ValueError("chat template is not additive; cannot derive an " "assistant loss mask by token-prefix differencing") diff --git a/tests/unit/test_sft.py b/tests/unit/test_sft.py index d4bf179..d24eef0 100644 --- a/tests/unit/test_sft.py +++ b/tests/unit/test_sft.py @@ -27,15 +27,6 @@ class FakeTok: return toks -class FakeTokBatchEncoding(FakeTok): - """Like FakeTok, but returns a dict as transformers >= 5's - ``apply_chat_template`` does (a BatchEncoding), to exercise the id-extraction.""" - - def apply_chat_template(self, msgs: list[dict[str, str]], add_generation_prompt: bool = False, - return_tensors: Any = None) -> dict[str, list[str]]: - return {"input_ids": super().apply_chat_template(msgs, add_generation_prompt, return_tensors)} - - def test_normalize_folds_system_and_merges_consecutive() -> None: raw = [ {"role": "system", "content": "orient"}, @@ -72,20 +63,6 @@ def test_mask_trains_assistant_turns_only() -> None: assert {"orient", "login", "out"} <= set(masked) # environment masked -def test_mask_handles_batchencoding_return() -> None: - # transformers >= 5 returns a BatchEncoding ({input_ids: [...]}) rather than a - # bare list[int]; the mask must come out identical. Regression for the 5.x bug - # that made every real template look "not additive". - raw = [ - {"role": "user", "content": "login"}, - {"role": "assistant", "content": "cat f"}, - {"role": "user", "content": "out"}, - {"role": "assistant", "content": "exit"}, - ] - assert (sft.build_masked_example(raw, FakeTokBatchEncoding()) - == sft.build_masked_example(raw, FakeTok())) - - def test_mask_raises_on_non_additive_template() -> None: class BadTok: def apply_chat_template(self, msgs: list[dict[str, str]], add_generation_prompt: bool = False,