Merge branch 'bugfix/15'
bugfix(15): normalise apply_chat_template's BatchEncoding (transformers 5.x)
This commit is contained in:
commit
e1f8ef8d1a
4 changed files with 52 additions and 3 deletions
13
CHANGELOG.md
13
CHANGELOG.md
|
|
@@ -7,6 +7,18 @@ are documented in this file.
|
||||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
|
||||||
and the project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
and the project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||||
|
|
||||||
|
## [1.0.1] - 2026-06-18
|
||||||
|
|
||||||
|
### Fixed
|
||||||
|
- `build_masked_example` could not derive the assistant mask on transformers
|
||||||
|
≥ 5: `apply_chat_template` now returns a `BatchEncoding` (`{"input_ids": [...]}`)
|
||||||
|
where 4.x returned a bare `list[int]`, so the dict-like render was diffed as if it were the token list and
|
||||||
|
the prefix-differencing spuriously raised "chat template is not additive" on
|
||||||
|
every real model. The id sequence is now extracted either way; verified the
|
||||||
|
assistant-only mask against `mistralai/Mistral-7B-Instruct-v0.2`. The
|
||||||
|
fake-tokenizer test gained a `BatchEncoding`-returning variant so this can't
|
||||||
|
regress.
|
||||||
|
|
||||||
## [1.0.0] - 2026-06-18
|
## [1.0.0] - 2026-06-18
|
||||||
|
|
||||||
First release: the training and evaluation pipeline that turns posix-sdc
|
First release: the training and evaluation pipeline that turns posix-sdc
|
||||||
|
|
@@ -38,4 +50,5 @@ trajectories into a fine-tuned shell operator.
|
||||||
mypy-strict codebase; an optional `[gpu]` extra (torch / transformers / peft);
|
mypy-strict codebase; an optional `[gpu]` extra (torch / transformers / peft);
|
||||||
and a dependency on `posix-sdc[hub]`. Released under GPL-2.0.
|
and a dependency on `posix-sdc[hub]`. Released under GPL-2.0.
|
||||||
|
|
||||||
|
[1.0.1]: https://git.code.tiararodney.com/tiara/sekft/compare/v1.0.0...v1.0.1
|
||||||
[1.0.0]: https://git.code.tiararodney.com/tiara/sekft/releases/tag/v1.0.0
|
[1.0.0]: https://git.code.tiararodney.com/tiara/sekft/releases/tag/v1.0.0
|
||||||
|
|
|
||||||
2
TODO
2
TODO
|
|
@@ -255,7 +255,7 @@ Content-Type: application/issue
|
||||||
ID: 15
|
ID: 15
|
||||||
Type: bugfix
|
Type: bugfix
|
||||||
Title: apply_chat_template returns BatchEncoding on transformers 5.x
|
Title: apply_chat_template returns BatchEncoding on transformers 5.x
|
||||||
Status: in-progress
|
Status: done
|
||||||
Priority: high
|
Priority: high
|
||||||
Created: 2026-06-18
|
Created: 2026-06-18
|
||||||
Module: sekft
|
Module: sekft
|
||||||
|
|
|
||||||
|
|
@@ -62,6 +62,19 @@ def normalize_for_template(messages: list[dict[str, str]]) -> list[dict[str, str
|
||||||
return out
|
return out
|
||||||
|
|
||||||
|
|
||||||
|
def _render_ids(tokenizer: Any, msgs: list[dict[str, str]]) -> Any:
|
||||||
|
"""Token ids for a rendered conversation, as a flat sequence.
|
||||||
|
|
||||||
|
``apply_chat_template`` returns a ``BatchEncoding`` (``{input_ids: [...]}``)
|
||||||
|
on transformers >= 5, where 4.x returned a bare ``list[int]``. Normalise to
|
||||||
|
the id sequence either way, so the prefix-differencing below diffs tokens and
|
||||||
|
not a dict (a dict makes ``len`` the key count and spuriously trips the
|
||||||
|
not-additive guard).
|
||||||
|
"""
|
||||||
|
out = tokenizer.apply_chat_template(msgs, add_generation_prompt=False)
|
||||||
|
return out["input_ids"] if hasattr(out, "keys") else out
|
||||||
|
|
||||||
|
|
||||||
def build_masked_example(messages: list[dict[str, str]], tokenizer: Any) -> dict[str, list[Any]]:
|
def build_masked_example(messages: list[dict[str, str]], tokenizer: Any) -> dict[str, list[Any]]:
|
||||||
"""Tokenize a trajectory with the tokenizer's OWN chat template and build an
|
"""Tokenize a trajectory with the tokenizer's OWN chat template and build an
|
||||||
assistant-only loss mask.
|
assistant-only loss mask.
|
||||||
|
|
@@ -76,11 +89,11 @@ def build_masked_example(messages: list[dict[str, str]], tokenizer: Any) -> dict
|
||||||
non-additive one raises rather than silently mis-mask.
|
non-additive one raises rather than silently mis-mask.
|
||||||
"""
|
"""
|
||||||
msgs = normalize_for_template(messages)
|
msgs = normalize_for_template(messages)
|
||||||
ids = tokenizer.apply_chat_template(msgs, add_generation_prompt=False)
|
ids = _render_ids(tokenizer, msgs)
|
||||||
labels = [-100] * len(ids)
|
labels = [-100] * len(ids)
|
||||||
prev: list[int] = []
|
prev: list[int] = []
|
||||||
for i, m in enumerate(msgs):
|
for i, m in enumerate(msgs):
|
||||||
upto = tokenizer.apply_chat_template(msgs[:i + 1], add_generation_prompt=False)
|
upto = _render_ids(tokenizer, msgs[:i + 1])
|
||||||
if ids[:len(upto)] != upto or upto[:len(prev)] != prev:
|
if ids[:len(upto)] != upto or upto[:len(prev)] != prev:
|
||||||
raise ValueError("chat template is not additive; cannot derive an "
|
raise ValueError("chat template is not additive; cannot derive an "
|
||||||
"assistant loss mask by token-prefix differencing")
|
"assistant loss mask by token-prefix differencing")
|
||||||
|
|
|
||||||
|
|
@@ -27,6 +27,15 @@ class FakeTok:
|
||||||
return toks
|
return toks
|
||||||
|
|
||||||
|
|
||||||
|
class FakeTokBatchEncoding(FakeTok):
|
||||||
|
"""Like FakeTok, but returns a dict as transformers >= 5's
|
||||||
|
``apply_chat_template`` does (a BatchEncoding), to exercise the id-extraction."""
|
||||||
|
|
||||||
|
def apply_chat_template(self, msgs: list[dict[str, str]], add_generation_prompt: bool = False,
|
||||||
|
return_tensors: Any = None) -> dict[str, list[str]]:
|
||||||
|
return {"input_ids": super().apply_chat_template(msgs, add_generation_prompt, return_tensors)}
|
||||||
|
|
||||||
|
|
||||||
def test_normalize_folds_system_and_merges_consecutive() -> None:
|
def test_normalize_folds_system_and_merges_consecutive() -> None:
|
||||||
raw = [
|
raw = [
|
||||||
{"role": "system", "content": "orient"},
|
{"role": "system", "content": "orient"},
|
||||||
|
|
@@ -63,6 +72,20 @@ def test_mask_trains_assistant_turns_only() -> None:
|
||||||
assert {"orient", "login", "out"} <= set(masked) # environment masked
|
assert {"orient", "login", "out"} <= set(masked) # environment masked
|
||||||
|
|
||||||
|
|
||||||
|
def test_mask_handles_batchencoding_return() -> None:
|
||||||
|
# transformers >= 5 returns a BatchEncoding ({input_ids: [...]}) rather than a
|
||||||
|
# bare list[int]; the mask must come out identical. Regression for the 5.x bug
|
||||||
|
# that made every real template look "not additive".
|
||||||
|
raw = [
|
||||||
|
{"role": "user", "content": "login"},
|
||||||
|
{"role": "assistant", "content": "cat f"},
|
||||||
|
{"role": "user", "content": "out"},
|
||||||
|
{"role": "assistant", "content": "exit"},
|
||||||
|
]
|
||||||
|
assert (sft.build_masked_example(raw, FakeTokBatchEncoding())
|
||||||
|
== sft.build_masked_example(raw, FakeTok()))
|
||||||
|
|
||||||
|
|
||||||
def test_mask_raises_on_non_additive_template() -> None:
|
def test_mask_raises_on_non_additive_template() -> None:
|
||||||
class BadTok:
|
class BadTok:
|
||||||
def apply_chat_template(self, msgs: list[dict[str, str]], add_generation_prompt: bool = False,
|
def apply_chat_template(self, msgs: list[dict[str, str]], add_generation_prompt: bool = False,
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue