Merge branch 'feature/3'
This commit is contained in:
commit
a374e3a69c
2 changed files with 102 additions and 1 deletions
2
TODO
2
TODO
|
|
@ -52,7 +52,7 @@ Content-Type: application/issue
|
||||||
ID: 3
|
ID: 3
|
||||||
Type: feature
|
Type: feature
|
||||||
Title: Behavioural evaluator
|
Title: Behavioural evaluator
|
||||||
Status: in-progress
|
Status: done
|
||||||
Priority: medium
|
Priority: medium
|
||||||
Created: 2026-06-16
|
Created: 2026-06-16
|
||||||
Module: sekft
|
Module: sekft
|
||||||
|
|
|
||||||
101
src/tiararodney/sekft/eval.py
Normal file
101
src/tiararodney/sekft/eval.py
Normal file
|
|
@ -0,0 +1,101 @@
|
||||||
|
"""Behavioural eval: the metric that matters.
|
||||||
|
|
||||||
|
Train loss says nothing about whether the model operates the shell and leaves.
|
||||||
|
This loads a fine-tuned model (base + LoRA adapter), drops it into held-out
|
||||||
|
scenarios with NO scaffold (the trained behaviour must stand on its own), and
|
||||||
|
reports the rates that count: does it reach command-mode, does it terminate,
|
||||||
|
does the checker pass.
|
||||||
|
|
||||||
|
python eval.py --base <hf-dir> --adapter ./ckpt-mistral-r16 \
|
||||||
|
--scenarios ./holdout-scenarios --n 10
|
||||||
|
|
||||||
|
Reuses the rollout loop with a *local* operator: the model formats and
|
||||||
|
generates in the same role-delimited render it was trained on (train == eval ==
|
||||||
|
deploy, or the prompts go out of distribution). Prerequisites on the box: torch
|
||||||
|
+ transformers + peft, the ``sekft-dash`` image, and held-out SCENARIO bundles
|
||||||
|
(from ``generate.py`` -- not trajectories; the eval stands up and verifies each).
|
||||||
|
"""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from tiararodney.posix_sdc.factory.dashdocker import DashDocker, available
|
||||||
|
from tiararodney.posix_sdc.factory.rollout import rollout
|
||||||
|
from tiararodney.posix_sdc.schema import Scenario
|
||||||
|
|
||||||
|
from .sft import normalize_for_template
|
||||||
|
|
||||||
|
|
||||||
|
def make_local_operator(base: str, adapter: str, max_new_tokens: int = 64,
|
||||||
|
temperature: float = 0.7):
|
||||||
|
"""A ``messages -> command`` callable backed by base + LoRA adapter.
|
||||||
|
|
||||||
|
Renders the conversation exactly as the model was trained, appends the
|
||||||
|
assistant header, generates one turn, and cuts at the first stop marker.
|
||||||
|
"""
|
||||||
|
import torch
|
||||||
|
from peft import PeftModel
|
||||||
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
|
|
||||||
|
tok = AutoTokenizer.from_pretrained(adapter)
|
||||||
|
model = AutoModelForCausalLM.from_pretrained(
|
||||||
|
base, torch_dtype=torch.float16, device_map="auto")
|
||||||
|
model = PeftModel.from_pretrained(model, adapter)
|
||||||
|
model.eval()
|
||||||
|
|
||||||
|
def operator(messages):
|
||||||
|
msgs = normalize_for_template(messages)
|
||||||
|
ids = tok.apply_chat_template(
|
||||||
|
msgs, add_generation_prompt=True, return_tensors="pt").to(model.device)
|
||||||
|
with torch.no_grad():
|
||||||
|
out = model.generate(
|
||||||
|
ids, max_new_tokens=max_new_tokens,
|
||||||
|
do_sample=temperature > 0, temperature=max(temperature, 1e-2),
|
||||||
|
eos_token_id=tok.eos_token_id, pad_token_id=tok.eos_token_id)
|
||||||
|
return tok.decode(out[0][ids.shape[1]:], skip_special_tokens=True).strip()
|
||||||
|
|
||||||
|
return operator
|
||||||
|
|
||||||
|
|
||||||
|
def evaluate(base: str, adapter: str, scenarios_dir: Path, n: int,
|
||||||
|
max_steps: int, temperature: float) -> dict:
|
||||||
|
if not available():
|
||||||
|
raise SystemExit("sekft-dash image unavailable; `docker build -t sekft-dash .`")
|
||||||
|
operator = make_local_operator(base, adapter, temperature=temperature)
|
||||||
|
backend = DashDocker()
|
||||||
|
rows = []
|
||||||
|
for f in sorted(scenarios_dir.glob("*.json"))[:n]:
|
||||||
|
sc = Scenario.from_dict(json.loads(f.read_text()))
|
||||||
|
tj = rollout(sc, backend, max_steps=max_steps, temperature=temperature,
|
||||||
|
operator=operator, use_scaffold=False)
|
||||||
|
rows.append(tj)
|
||||||
|
print(f" {sc.id}: {tj.outcome} (terminal={tj.terminal} "
|
||||||
|
f"verified={tj.verified} steps={tj.steps})")
|
||||||
|
d = len(rows) or 1
|
||||||
|
return {
|
||||||
|
"n": len(rows),
|
||||||
|
"operate_rate": round(sum(t.steps > 0 and t.meta.get("clean") for t in rows) / d, 3),
|
||||||
|
"terminate_rate": round(sum(t.terminal in ("exit", "panic") for t in rows) / d, 3),
|
||||||
|
"verified_rate": round(sum(t.verified for t in rows) / d, 3),
|
||||||
|
"clean_rate": round(sum(t.keep for t in rows) / d, 3),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
|
||||||
|
ap = argparse.ArgumentParser(description="Behavioural eval of a tuned model.")
|
||||||
|
ap.add_argument("--base", required=True)
|
||||||
|
ap.add_argument("--adapter", required=True)
|
||||||
|
ap.add_argument("--scenarios", type=Path, required=True)
|
||||||
|
ap.add_argument("--n", type=int, default=10)
|
||||||
|
ap.add_argument("--max-steps", type=int, default=30)
|
||||||
|
ap.add_argument("--temperature", type=float, default=0.7)
|
||||||
|
ns = ap.parse_args()
|
||||||
|
m = evaluate(ns.base, ns.adapter, ns.scenarios, ns.n, ns.max_steps, ns.temperature)
|
||||||
|
print("\n=== behavioural metrics ===")
|
||||||
|
print(json.dumps(m, indent=2))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Loading…
Add table
Add a link
Reference in a new issue