feat(12): load training data from a raw dir, a curated jsonl, or the Hub
Previously, iter_keepers read only raw per-trajectory .json files -- one of three input shapes. Add load_turns(data, hub, revision), which yields assistant-bearing turns from a raw rollout dir (keep-filtered), a curated .jsonl corpus (one record per line), or the published corpus via posix-sdc's load_trajectories (the in-repo data/ of a checkout, else the Hugging Face Hub). sekft-train gains --hub and --revision and dispatches --data by dir-vs-.jsonl; train() and inspect() use it. Raw-rollout reading stays sekft-local; the curated and Hub paths reuse posix-sdc's loader, imported lazily so the raw/jsonl paths need neither posix-sdc nor huggingface_hub installed. Unit tests cover the raw-dir and jsonl dispatch.
This commit is contained in:
parent
d78a8028d2
commit
414e963825
2 changed files with 84 additions and 10 deletions
35
tests/unit/test_load.py
Normal file
35
tests/unit/test_load.py
Normal file
|
|
@ -0,0 +1,35 @@
|
|||
"""Unit tests for the trainer's three-source data loader (raw dir / curated
|
||||
jsonl). The Hub path delegates to posix-sdc and is covered there."""
|
||||
from __future__ import annotations
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
from tiararodney.sekft import sft
|
||||
|
||||
|
||||
def test_load_turns_from_raw_dir(tmp_path: Path) -> None:
    """Loading a directory of per-trajectory ``.json`` files keeps only ``keep`` ones.

    Two trajectory files are written; only the one flagged ``keep: True`` is
    expected to come back from ``sft.load_turns``.
    """
    fixtures = {
        "a.json": {
            "keep": True,
            "turns": [{"role": "assistant", "content": "ls"}],
        },
        # not kept -> excluded
        "b.json": {
            "keep": False,
            "turns": [{"role": "assistant", "content": "rm -rf /"}],
        },
    }
    for filename, record in fixtures.items():
        (tmp_path / filename).write_text(json.dumps(record))

    loaded = list(sft.load_turns(tmp_path))

    assert len(loaded) == 1
    first_turns = loaded[0]
    assert first_turns[0]["content"] == "ls"
|
||||
|
||||
|
||||
def test_load_turns_from_jsonl(tmp_path: Path) -> None:
    """Loading a ``.jsonl`` file yields one turn list per line, in file order."""
    commands = ("ls", "cat x")
    corpus = tmp_path / "corpus.jsonl"

    # One JSON record per line, with a trailing newline after the last record.
    records = [
        json.dumps({"turns": [{"role": "assistant", "content": command}]})
        for command in commands
    ]
    corpus.write_text("\n".join(records) + "\n")

    loaded = list(sft.load_turns(corpus))

    contents = [turns[0]["content"] for turns in loaded]
    assert contents == ["ls", "cat x"]
|
||||
|
||||
|
||||
def test_load_turns_rejects_other_paths(tmp_path: Path) -> None:
    """A regular file without the ``.jsonl`` suffix makes the loader exit."""
    unsupported = tmp_path / "notes.txt"
    unsupported.write_text("hi")

    # Consume the result inside the context manager so that a lazily raised
    # SystemExit (e.g. from a generator body) is still caught here.
    with pytest.raises(SystemExit):
        tuple(sft.load_turns(unsupported))
|
||||
Loading…
Add table
Add a link
Reference in a new issue