diff --git a/CHANGELOG.md b/CHANGELOG.md index 814f7a6..0f331f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,26 +7,6 @@ are documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and the project follows [Semantic Versioning](https://semver.org/spec/v2.0.0.html). -## [1.0.3] - 2026-06-18 - -### Changed -- The README intro now states up front that this is **not tool-calling**: sekft - trains shell operation, not function-calling; the model is given no typed tool - API or JSON-schema action list, and writes plain-text commands at a real prompt - with the whole system as its action space. - -## [1.0.2] - 2026-06-18 - -### Fixed -- The generation operators (`sekft-eval`, `sekft-resident`) passed the - `BatchEncoding` from `apply_chat_template(..., return_tensors="pt")` straight - to `model.generate`, which does `inputs.shape[0]` and raised `AttributeError` - on transformers ≥ 5 — the holdout eval crashed on its first scenario. 1.0.1 - fixed only the trainer's masking; this sweeps the generation path too. A shared - `_input_ids` helper and a `render_prompt_ids` function now extract the id - tensor for both operators, with unit tests for the BatchEncoding and bare - shapes. - ## [1.0.1] - 2026-06-18 ### Fixed @@ -70,7 +50,5 @@ trajectories into a fine-tuned shell operator. mypy-strict codebase; an optional `[gpu]` extra (torch / transformers / peft); and a dependency on `posix-sdc[hub]`. Released under GPL-2.0. -[1.0.3]: https://git.code.tiararodney.com/tiara/sekft/compare/v1.0.2...v1.0.3 -[1.0.2]: https://git.code.tiararodney.com/tiara/sekft/compare/v1.0.1...v1.0.2 [1.0.1]: https://git.code.tiararodney.com/tiara/sekft/compare/v1.0.0...v1.0.1 [1.0.0]: https://git.code.tiararodney.com/tiara/sekft/releases/tag/v1.0.0 diff --git a/README.md b/README.md index 355962f..22e2e67 100644 --- a/README.md +++ b/README.md @@ -5,12 +5,6 @@ land with **no imperative**, discover where directives live, learn the provider from its own self-documentation, do the work, and terminate (`exit` on success, `panic` when genuinely blocked). -> **Not tool-calling.** sekft trains shell operation, not function-calling. The -> model is given no typed tool API and no JSON-schema action list; it writes -> plain-text commands at a real prompt, with the whole system as its action -> space, discovered like a person would (`--help`, `man`, `ls`) rather than -> enumerated up front. - sekft is the **training half**. The dataset and the synthetic-data factory live in [`posix-sdc`](../posix-sdc) (`tiararodney.posix-sdc`), which this package depends on. Here live the trainer, the behavioural evaluator, and the diff --git a/TODO b/TODO index ad3c85e..63768b4 100644 --- a/TODO +++ b/TODO @@ -271,56 +271,3 @@ Description: build_masked_example assumed apply_chat_template returns a flat the result is dict-like, and use it for both renders. The fake-tokenizer test returned a bare list and missed this, so add a BatchEncoding-returning fake and assert the mask matches. - ---ISSUE -Content-Type: application/issue -ID: 16 -Type: bugfix -Title: generation operators pass BatchEncoding to generate (transformers 5.x) -Status: done -Priority: high -Created: 2026-06-18 -Module: sekft -Relationships: -Description: The same transformers 5.x return-type change that broke - build_masked_example (#15) also breaks the generation path: - apply_chat_template(add_generation_prompt=True, - return_tensors='pt') returns a BatchEncoding, and eval.py and - resident.py pass it straight to model.generate(), which does - inputs_tensor.shape[0] -> AttributeError (the holdout eval crashed - here on scenario 1). #15 only fixed the trainer. Factor the id - extraction into a shared _input_ids helper, add - render_prompt_ids(tokenizer, messages, device) in sft.py, and use - it in both operators. Add a unit test for _input_ids covering the - BatchEncoding and bare-sequence cases. This is the sweep I should - have done at #15. - ---ISSUE -Content-Type: application/issue -ID: 17 -Type: feature -Title: docs: state up front that this is not tool-calling -Status: done -Priority: medium -Created: 2026-06-18 -Module: sekft -Relationships: -Description: Add a prominent clarification to the README intro that sekft trains - shell operation, not function-calling: the model is given no typed - tool API or JSON-schema action list; it writes plain-text commands - at a real prompt with the whole system as its action space, - discovered like a person does. - ---ISSUE -Content-Type: application/issue -ID: 18 -Type: feature -Title: docs: deliver the not-tool-calling intro clarification (1.0.3) -Status: done -Priority: medium -Created: 2026-06-18 -Module: sekft -Relationships: -Description: Deliver the not-tool-calling clarification to the README intro and - add the 1.0.3 changelog entry. The prior issue's merge carried only - the todo status; the step-4 work commit was skipped. diff --git a/src/tiararodney/sekft/eval.py b/src/tiararodney/sekft/eval.py index efec04d..5d5964c 100644 --- a/src/tiararodney/sekft/eval.py +++ b/src/tiararodney/sekft/eval.py @@ -28,7 +28,7 @@ from tiararodney.posix_sdc.factory.dashdocker import DashDocker, available from tiararodney.posix_sdc.factory.rollout import rollout from tiararodney.posix_sdc.schema import Scenario -from .sft import render_prompt_ids +from .sft import normalize_for_template def make_local_operator(base: str, adapter: str, max_new_tokens: int = 64, @@ -49,7 +49,9 @@ def make_local_operator(base: str, adapter: str, max_new_tokens: int = 64, model.eval() def operator(messages: list[dict[str, str]]) -> str: - ids = render_prompt_ids(tok, messages, model.device) + msgs = normalize_for_template(messages) + ids = tok.apply_chat_template( + msgs, add_generation_prompt=True, return_tensors="pt").to(model.device) with torch.no_grad(): out = model.generate( ids, max_new_tokens=max_new_tokens, diff --git a/src/tiararodney/sekft/resident.py b/src/tiararodney/sekft/resident.py index e9a8292..2d7bc3b 100644 --- a/src/tiararodney/sekft/resident.py +++ b/src/tiararodney/sekft/resident.py @@ -32,7 +32,7 @@ from peft import (LoraConfig, PeftModel, get_peft_model, from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, DataCollatorForSeq2Seq, Trainer, TrainingArguments) -from .sft import build_masked_example, iter_keepers, render_prompt_ids +from .sft import build_masked_example, iter_keepers, normalize_for_template LORA_TARGETS = ["q_proj", "k_proj", "v_proj", "o_proj"] @@ -132,7 +132,9 @@ class Resident: pm.eval() def operator(messages: list[dict[str, str]]) -> str: - ids = render_prompt_ids(self.tok, messages, pm.device) + msgs = normalize_for_template(messages) + ids = self.tok.apply_chat_template( + msgs, add_generation_prompt=True, return_tensors="pt").to(pm.device) with torch.no_grad(): o = pm.generate(ids, max_new_tokens=64, do_sample=temperature > 0, temperature=max(temperature, 1e-2), diff --git a/src/tiararodney/sekft/sft.py b/src/tiararodney/sekft/sft.py index 916e5d7..4e40362 100644 --- a/src/tiararodney/sekft/sft.py +++ b/src/tiararodney/sekft/sft.py @@ -62,33 +62,17 @@ def normalize_for_template(messages: list[dict[str, str]]) -> list[dict[str, str return out -def _input_ids(enc: Any) -> Any: - """The id sequence from an ``apply_chat_template`` result. transformers >= 5 - returns a ``BatchEncoding`` (``{input_ids: ...}``) where 4.x returned the - bare ``list[int]`` / tensor; return the ids either way. Passing the dict on - unfixed breaks everything downstream: the trainer's prefix-differencing sees - ``len`` as the key count, and ``model.generate`` does ``inputs.shape[0]`` on - a dict and raises ``AttributeError``.""" - return enc["input_ids"] if hasattr(enc, "keys") else enc - - def _render_ids(tokenizer: Any, msgs: list[dict[str, str]]) -> Any: - """Token ids for a rendered conversation (no generation prompt), as a flat - sequence — see :func:`_input_ids` for the BatchEncoding normalisation.""" - return _input_ids(tokenizer.apply_chat_template(msgs, add_generation_prompt=False)) + """Token ids for a rendered conversation, as a flat sequence. - -def render_prompt_ids(tokenizer: Any, messages: list[dict[str, str]], - device: Any = None) -> Any: - """The tokenized generation prompt for an operator: canonicalise the turns, - append the assistant generation prompt, and return the ``input_ids`` tensor - (extracted from the BatchEncoding on transformers >= 5), moved to ``device`` - if given. Shared by the eval and resident operators so neither feeds a - BatchEncoding to ``model.generate``.""" - enc = tokenizer.apply_chat_template( - normalize_for_template(messages), add_generation_prompt=True, return_tensors="pt") - ids = _input_ids(enc) - return ids.to(device) if device is not None else ids + ``apply_chat_template`` returns a ``BatchEncoding`` (``{input_ids: [...]}``) + on transformers >= 5, where 4.x returned a bare ``list[int]``. Normalise to + the id sequence either way, so the prefix-differencing below diffs tokens and + not a dict (a dict makes ``len`` the key count and spuriously trips the + not-additive guard). + """ + out = tokenizer.apply_chat_template(msgs, add_generation_prompt=False) + return out["input_ids"] if hasattr(out, "keys") else out def build_masked_example(messages: list[dict[str, str]], tokenizer: Any) -> dict[str, list[Any]]: diff --git a/tests/unit/test_sft.py b/tests/unit/test_sft.py index 2df7130..d4bf179 100644 --- a/tests/unit/test_sft.py +++ b/tests/unit/test_sft.py @@ -86,21 +86,6 @@ def test_mask_handles_batchencoding_return() -> None: == sft.build_masked_example(raw, FakeTok())) -def test_input_ids_extracts_from_batchencoding_or_passthrough() -> None: - # BatchEncoding (transformers 5.x) -> its input_ids; bare list/tensor (4.x) -> itself - assert sft._input_ids({"input_ids": [1, 2, 3], "attention_mask": [1, 1, 1]}) == [1, 2, 3] - assert sft._input_ids([4, 5, 6]) == [4, 5, 6] - - -def test_render_prompt_ids_normalises_and_appends_generation_prompt() -> None: - # the generation operators rely on this: fold + append , return ids - # (not a BatchEncoding) so model.generate doesn't choke on a dict. - raw = [{"role": "system", "content": "orient"}, {"role": "user", "content": "go"}] - ids = sft.render_prompt_ids(FakeTok(), raw) - assert ids[-1] == "" # generation prompt appended - assert {"orient", "go"} <= set(ids) # system folded into the user turn - - def test_mask_raises_on_non_additive_template() -> None: class BadTok: def apply_chat_template(self, msgs: list[dict[str, str]], add_generation_prompt: bool = False,