Merge pull request 'feat: eval package scaffold — CLI + schemas + types + task loader' (#85) from feat/69-eval-scaffold into main

feat: eval package scaffold — CLI + schemas + types + task loader (#85)
2026-06-05 00:23:56 +00:00
parent b94234652a ae81e4b5ac
commit f373945304
41 changed files with 2317 additions and 1 deletions
@@ -0,0 +1,196 @@
+import type { StepEntry } from "@united-workforce/protocol";
+import { beforeEach, describe, expect, test, vi } from "vitest";
+
+import {
+  runFrontmatterJudge,
+  runHallucinationJudge,
+  runTokenStatsJudge,
+  runUpstreamJudge,
+} from "../src/judge/builtin/index.js";
+
+// Mock the shared read-steps helper so the judges never shell out to `uwf`.
+vi.mock("../src/judge/builtin/read-steps.js", () => ({
+  readThreadSteps: vi.fn(),
+}));
+
+import { readThreadSteps } from "../src/judge/builtin/read-steps.js";
+
+const mockedReadSteps = vi.mocked(readThreadSteps);
+
+/**
+ * Build a `StepEntry` fixture for the judge tests.
+ *
+ * Starts from a default step (role "worker", agent "hermes", an output whose
+ * frontmatter carries `$status: done`, and `usage: null`) and spreads
+ * `overrides` last, so any field the caller supplies replaces the default.
+ */
+function makeStep(overrides: Partial<StepEntry>): StepEntry {
+  return {
+    hash: "HASH000000000",
+    role: "worker",
+    output: "---\n$status: done\n---\n\nbody",
+    detail: "DETAIL0000000",
+    agent: "hermes",
+    timestamp: 0,
+    durationMs: 0,
+    usage: null,
+    ...overrides,
+  };
+}
+
+beforeEach(() => {
+  mockedReadSteps.mockReset();
+});
+
+describe("frontmatter-compliance judge", () => {
+  test("all steps have valid frontmatter → score 1.0", async () => {
+    mockedReadSteps.mockReturnValue([
+      makeStep({ role: "a", output: "---\n$status: done\n---\n\nwork" }),
+      makeStep({ role: "b", output: "---\n$status: needs_input\n---\nmore" }),
+    ]);
+
+    const result = await runFrontmatterJudge("T1");
+    const data = result.data as { stepsTotal: number; stepsValid: number; invalidSteps: unknown[] };
+
+    expect(result.score).toBe(1.0);
+    expect(data.stepsTotal).toBe(2);
+    expect(data.stepsValid).toBe(2);
+    expect(data.invalidSteps).toHaveLength(0);
+  });
+
+  test("some steps missing $status → partial score", async () => {
+    mockedReadSteps.mockReturnValue([
+      makeStep({ role: "a", output: "---\n$status: done\n---\nok" }),
+      makeStep({ role: "b", output: "---\nfoo: bar\n---\nmissing status" }),
+      makeStep({ role: "c", output: "no frontmatter at all" }),
+    ]);
+
+    const result = await runFrontmatterJudge("T2");
+    const data = result.data as {
+      stepsTotal: number;
+      stepsValid: number;
+      invalidSteps: Array<{ stepIndex: number; role: string; errors: string[] }>;
+    };
+
+    expect(result.score).toBeCloseTo(1 / 3, 10);
+    expect(data.stepsTotal).toBe(3);
+    expect(data.stepsValid).toBe(1);
+    expect(data.invalidSteps).toHaveLength(2);
+    expect(data.invalidSteps[0]).toMatchObject({ stepIndex: 1, role: "b" });
+    expect(data.invalidSteps[1]).toMatchObject({ stepIndex: 2, role: "c" });
+  });
+
+  test("no steps → score 0 (0/0 edge case)", async () => {
+    mockedReadSteps.mockReturnValue([]);
+
+    const result = await runFrontmatterJudge("T3");
+    const data = result.data as { stepsTotal: number; stepsValid: number; invalidSteps: unknown[] };
+
+    expect(result.score).toBe(0);
+    expect(data.stepsTotal).toBe(0);
+    expect(data.stepsValid).toBe(0);
+    expect(data.invalidSteps).toHaveLength(0);
+  });
+
+  test("empty-string $status counts as invalid", async () => {
+    mockedReadSteps.mockReturnValue([makeStep({ role: "a", output: '---\n$status: ""\n---\nx' })]);
+
+    const result = await runFrontmatterJudge("T4");
+    expect(result.score).toBe(0);
+  });
+});
+
+describe("token-stats judge", () => {
+  test("steps with usage → sums correctly", async () => {
+    mockedReadSteps.mockReturnValue([
+      makeStep({
+        role: "a",
+        usage: { turns: 2, inputTokens: 100, outputTokens: 50, duration: 1.5 },
+      }),
+      makeStep({
+        role: "b",
+        usage: { turns: 3, inputTokens: 200, outputTokens: 75, duration: 2.0 },
+      }),
+    ]);
+
+    const result = await runTokenStatsJudge("T1");
+    const data = result.data as {
+      totalInput: number;
+      totalOutput: number;
+      totalTurns: number;
+      perStep: Array<{ role: string; inputTokens: number; outputTokens: number; turns: number }>;
+    };
+
+    expect(result.score).toBe(1.0);
+    expect(data.totalInput).toBe(300);
+    expect(data.totalOutput).toBe(125);
+    expect(data.totalTurns).toBe(5);
+    expect(data.perStep).toHaveLength(2);
+    expect(data.perStep[0]).toEqual({
+      role: "a",
+      inputTokens: 100,
+      outputTokens: 50,
+      turns: 2,
+      duration: 1.5,
+    });
+  });
+
+  test("steps with null usage → zeros", async () => {
+    mockedReadSteps.mockReturnValue([
+      makeStep({ role: "a", usage: null }),
+      makeStep({ role: "b", usage: null }),
+    ]);
+
+    const result = await runTokenStatsJudge("T2");
+    const data = result.data as {
+      totalInput: number;
+      totalOutput: number;
+      totalTurns: number;
+      perStep: Array<{
+        inputTokens: number;
+        outputTokens: number;
+        turns: number;
+        duration: number;
+      }>;
+    };
+
+    expect(result.score).toBe(1.0);
+    expect(data.totalInput).toBe(0);
+    expect(data.totalOutput).toBe(0);
+    expect(data.totalTurns).toBe(0);
+    expect(data.perStep[0]).toEqual({
+      role: "a",
+      inputTokens: 0,
+      outputTokens: 0,
+      turns: 0,
+      duration: 0,
+    });
+  });
+
+  test("empty steps → all zeros, score 1.0", async () => {
+    mockedReadSteps.mockReturnValue([]);
+
+    const result = await runTokenStatsJudge("T3");
+    const data = result.data as {
+      totalInput: number;
+      totalOutput: number;
+      totalTurns: number;
+      perStep: unknown[];
+    };
+
+    expect(result.score).toBe(1.0);
+    expect(data.totalInput).toBe(0);
+    expect(data.totalOutput).toBe(0);
+    expect(data.totalTurns).toBe(0);
+    expect(data.perStep).toHaveLength(0);
+  });
+});
+
+describe("LLM-as-judge stubs", () => {
+  test("upstream-consumption returns a stub", async () => {
+    const result = await runUpstreamJudge("T1");
+    expect(result.score).toBe(0);
+    expect(result.data).toEqual({ perStep: [] });
+    expect(result.schema.title).toBe("@uwf/eval-judge-upstream");
+  });
+
+  test("hallucination returns a stub", async () => {
+    const result = await runHallucinationJudge("T1");
+    expect(result.score).toBe(0);
+    expect(result.data).toEqual({ perStep: [] });
+    expect(result.schema.title).toBe("@uwf/eval-judge-hallucination");
+  });
+});
@@ -0,0 +1,152 @@
+import { bootstrap, createMemoryStore } from "@ocas/core";
+import { describe, expect, test } from "vitest";
+import type { JudgeRunner } from "../src/runner/index.js";
+import { collect, computeOverall } from "../src/runner/index.js";
+import type { EvalRunConfig, EvalStore } from "../src/storage/index.js";
+import type { JudgeEntry, TaskManifest } from "../src/task/index.js";
+
+/**
+ * Build a `JudgeEntry` fixture.
+ *
+ * Builtin judges get `entry: null`; non-builtin judges get the path
+ * `dist/judges/<name>.js`. `schema` is always `null`.
+ */
+function makeJudge(name: string, weight: number, builtin: boolean): JudgeEntry {
+  return {
+    name,
+    weight,
+    builtin,
+    entry: builtin ? null : `dist/judges/${name}.js`,
+    schema: null,
+  };
+}
+
+/**
+ * Build a `TaskManifest` fixture named "fix-off-by-one" that uses the
+ * "solve-issue" workflow, fixed limits (10 steps, 30 minutes) and the given judges.
+ */
+function makeManifest(judges: JudgeEntry[]): TaskManifest {
+  return {
+    name: "fix-off-by-one",
+    description: "test task",
+    workflow: "solve-issue",
+    prompt: "Fix the bug",
+    limits: { maxSteps: 10, timeoutMinutes: 30 },
+    judges,
+  };
+}
+
+/**
+ * Create a fresh in-memory store, run `bootstrap` on it, and wrap it as an
+ * `EvalStore` whose `varStore` is the store's own `var` member.
+ */
+function makeEvalStore(): EvalStore {
+  const store = createMemoryStore();
+  bootstrap(store);
+  return { store, varStore: store.var };
+}
+
+const CONFIG: EvalRunConfig = {
+  agent: "hermes",
+  model: "claude-sonnet-4",
+  engineVersion: "test",
+};
+
+/**
+ * Returns a `JudgeRunner` that yields a fixed score per judge name.
+ *
+ * Names missing from `scores` score 0. The task dir, work dir and thread id
+ * arguments are ignored; `data` echoes the judge name and `schema` is a bare
+ * `{ type: "object" }`.
+ */
+function scriptedRunner(scores: Record<string, number>): JudgeRunner {
+  return async (_taskDir, _workDir, _threadId, judge) => ({
+    score: scores[judge.name] ?? 0,
+    data: { judged: judge.name },
+    schema: { type: "object" },
+  });
+}
+
+describe("computeOverall", () => {
+  test("computes the weighted average correctly", () => {
+    const overall = computeOverall([
+      { score: 0.8, weight: 0.3 },
+      { score: 0.6, weight: 0.3 },
+      { score: 1.0, weight: 0.4 },
+    ]);
+    // 0.24 + 0.18 + 0.4 = 0.82
+    expect(overall).toBeCloseTo(0.82, 10);
+  });
+
+  test("a weight-0 judge does not affect the result", () => {
+    const withInformational = computeOverall([
+      { score: 1.0, weight: 1.0 },
+      { score: 0.0, weight: 0.0 },
+    ]);
+    expect(withInformational).toBe(1.0);
+  });
+
+  test("returns 0 when total weight is 0", () => {
+    expect(computeOverall([{ score: 0.5, weight: 0 }])).toBe(0);
+  });
+});
+
+describe("collect", () => {
+  test("computes weighted score correctly across judges", async () => {
+    const evalStore = makeEvalStore();
+    const manifest = makeManifest([
+      makeJudge("test-pass", 0.6, false),
+      makeJudge("code-quality", 0.4, false),
+    ]);
+    const runJudge = scriptedRunner({ "test-pass": 1.0, "code-quality": 0.5 });
+
+    const result = await collect(
+      {
+        evalStore,
+        taskDir: "/tmp/task",
+        workDir: "/tmp/work",
+        threadId: "THREAD123",
+        manifest,
+        config: CONFIG,
+      },
+      runJudge,
+    );
+
+    // 1.0 * 0.6 + 0.5 * 0.4 = 0.8
+    expect(result.overall).toBeCloseTo(0.8, 10);
+    expect(result.runHash).toBeTruthy();
+    expect(result.judges).toHaveLength(2);
+    expect(result.judges[0]).toEqual({ name: "test-pass", score: 1.0, weight: 0.6 });
+
+    const latest = evalStore.varStore.list({
+      exactName: "@uwf/eval/fix-off-by-one/latest",
+    });
+    expect(latest[0]?.value).toBe(result.runHash);
+  });
+
+  test("handles a judge with weight 0 (informational)", async () => {
+    const evalStore = makeEvalStore();
+    const manifest = makeManifest([
+      makeJudge("test-pass", 1.0, false),
+      makeJudge("token-stats", 0, true),
+    ]);
+    // token-stats is builtin → default runner would score 0; give scripted score
+    // that would skew the result if it were counted.
+    const runJudge = scriptedRunner({ "test-pass": 0.5, "token-stats": 1.0 });
+
+    const result = await collect(
+      {
+        evalStore,
+        taskDir: "/tmp/task",
+        workDir: "/tmp/work",
+        threadId: "THREAD123",
+        manifest,
+        config: CONFIG,
+      },
+      runJudge,
+    );
+
+    // Only test-pass (weight 1.0) counts → overall = 0.5
+    expect(result.overall).toBeCloseTo(0.5, 10);
+    expect(result.judges).toHaveLength(2);
+    const tokenStats = result.judges.find((j) => j.name === "token-stats");
+    expect(tokenStats?.weight).toBe(0);
+  });
+
+  test("unknown builtin judge name throws via the default runner", async () => {
+    const evalStore = makeEvalStore();
+    const manifest = makeManifest([makeJudge("not-a-real-judge", 1.0, true)]);
+
+    // Use the default runner (no injected runner) → builtin dispatch → unknown name throws.
+    await expect(
+      collect({
+        evalStore,
+        taskDir: "/tmp/task",
+        workDir: "/tmp/work",
+        threadId: "THREAD123",
+        manifest,
+        config: CONFIG,
+      }),
+    ).rejects.toThrow(/unknown builtin judge/);
+  });
+});
@@ -0,0 +1,171 @@
+import { bootstrap, createMemoryStore, putSchema } from "@ocas/core";
+import type { CasRef } from "@united-workforce/protocol";
+import { describe, expect, test } from "vitest";
+
+import {
+  formatDiff,
+  formatList,
+  formatReport,
+  readEvalEntries,
+  readEvalRun,
+  selectEntries,
+} from "../src/commands/index.js";
+import type { EvalRunPayload, EvalStore } from "../src/storage/index.js";
+import { EVAL_RUN_SCHEMA, setEvalLatest } from "../src/storage/index.js";
+
+/**
+ * Create a fresh in-memory store, run `bootstrap` on it, and wrap it as an
+ * `EvalStore` whose `varStore` is the store's own `var` member.
+ */
+function makeEvalStore(): EvalStore {
+  const store = createMemoryStore();
+  bootstrap(store);
+  return { store, varStore: store.var };
+}
+
+/**
+ * Build an `EvalRunPayload` fixture.
+ *
+ * `judges` defaults to two entries (frontmatter-compliance at weight 0.6 and
+ * token-stats at weight 0) and `config` defaults to hermes / claude-sonnet-4 /
+ * engine 1.0.0; `threadId` is always "THREAD0123456789". Passing `undefined`
+ * for `judges` keeps its default while still allowing `config` to be overridden.
+ */
+function makePayload(
+  task: string,
+  overall: number,
+  timestamp: number,
+  judges: EvalRunPayload["judges"] = [
+    {
+      name: "frontmatter-compliance",
+      score: 1.0,
+      weight: 0.6,
+      dataHash: "AAAAAAAAAAAAA" as CasRef,
+    },
+    { name: "token-stats", score: 0.5, weight: 0, dataHash: "BBBBBBBBBBBBB" as CasRef },
+  ],
+  config: EvalRunPayload["config"] = {
+    agent: "hermes",
+    model: "claude-sonnet-4",
+    engineVersion: "1.0.0",
+  },
+): EvalRunPayload {
+  return { task, config, threadId: "THREAD0123456789", judges, overall, timestamp };
+}
+
+/**
+ * Store an eval-run node in CAS and index it under @uwf/eval/<task>/latest.
+ *
+ * Registers `EVAL_RUN_SCHEMA` via `putSchema`, puts the payload under the resulting schema
+ * hash, points the task's latest entry at the new node via `setEvalLatest`, and returns the
+ * node hash.
+ */
+function storeRun(evalStore: EvalStore, payload: EvalRunPayload): string {
+  const schemaHash = putSchema(evalStore.store, EVAL_RUN_SCHEMA);
+  const hash = evalStore.store.cas.put(schemaHash, payload);
+  setEvalLatest(evalStore.varStore, payload.task, hash);
+  return hash;
+}
+
+describe("formatReport", () => {
+  test("includes task, overall, config and judges", () => {
+    const payload = makePayload("fix-off-by-one", 0.8, Date.UTC(2026, 0, 2, 3, 4, 5));
+    const output = formatReport(payload, "RUNHASH123456");
+
+    expect(output).toContain("fix-off-by-one");
+    expect(output).toContain("0.8000");
+    expect(output).toContain("hermes");
+    expect(output).toContain("claude-sonnet-4");
+    expect(output).toContain("1.0.0");
+    expect(output).toContain("frontmatter-compliance");
+    expect(output).toContain("token-stats");
+    expect(output).toContain("THREAD0123456789");
+    expect(output).toContain("RUNHASH123456");
+  });
+
+  test("round-trips a stored run via readEvalRun", () => {
+    const evalStore = makeEvalStore();
+    const payload = makePayload("fix-off-by-one", 0.75, Date.now());
+    const hash = storeRun(evalStore, payload);
+
+    const loaded = readEvalRun(evalStore, hash);
+    expect(loaded).not.toBeNull();
+    const output = formatReport(loaded as EvalRunPayload, hash);
+    expect(output).toContain("fix-off-by-one");
+    expect(output).toContain("0.7500");
+  });
+
+  test("readEvalRun returns null for a missing hash", () => {
+    const evalStore = makeEvalStore();
+    expect(readEvalRun(evalStore, "NOPENOPENOPE0")).toBeNull();
+  });
+});
+
+describe("list", () => {
+  test("lists eval runs stored under different tasks", () => {
+    const evalStore = makeEvalStore();
+    storeRun(evalStore, makePayload("fix-off-by-one", 0.8, 2000));
+    storeRun(evalStore, makePayload("write-docs", 0.6, 1000));
+
+    const entries = readEvalEntries(evalStore);
+    expect(entries).toHaveLength(2);
+
+    const output = formatList(selectEntries(entries, null, 20));
+    expect(output).toContain("fix-off-by-one");
+    expect(output).toContain("write-docs");
+  });
+
+  test("sorts newest-first by timestamp", () => {
+    const evalStore = makeEvalStore();
+    storeRun(evalStore, makePayload("old-task", 0.5, 1000));
+    storeRun(evalStore, makePayload("new-task", 0.5, 2000));
+
+    const selected = selectEntries(readEvalEntries(evalStore), null, 20);
+    expect(selected[0]?.task).toBe("new-task");
+    expect(selected[1]?.task).toBe("old-task");
+  });
+
+  test("--task filter only shows the matching task", () => {
+    const evalStore = makeEvalStore();
+    storeRun(evalStore, makePayload("fix-off-by-one", 0.8, 2000));
+    storeRun(evalStore, makePayload("write-docs", 0.6, 1000));
+
+    const output = formatList(selectEntries(readEvalEntries(evalStore), "write-docs", 20));
+    expect(output).toContain("write-docs");
+    expect(output).not.toContain("fix-off-by-one");
+  });
+
+  test("--limit caps the number of rows", () => {
+    const evalStore = makeEvalStore();
+    storeRun(evalStore, makePayload("task-a", 0.8, 3000));
+    storeRun(evalStore, makePayload("task-b", 0.6, 2000));
+    storeRun(evalStore, makePayload("task-c", 0.4, 1000));
+
+    const selected = selectEntries(readEvalEntries(evalStore), null, 2);
+    expect(selected).toHaveLength(2);
+    expect(selected.map((e) => e.task)).toEqual(["task-a", "task-b"]);
+  });
+
+  test("empty store renders a placeholder", () => {
+    const evalStore = makeEvalStore();
+    const output = formatList(selectEntries(readEvalEntries(evalStore), null, 20));
+    expect(output).toContain("(no eval runs found)");
+  });
+});
+
+describe("formatDiff", () => {
+  test("shows an upward delta when B scores higher", () => {
+    const a = makePayload("fix-off-by-one", 0.6, 1000);
+    const b = makePayload("fix-off-by-one", 0.8, 2000);
+    const output = formatDiff(a, "HASHA00000000", b, "HASHB00000000");
+
+    expect(output).toContain("▲");
+    expect(output).toContain("HASHA00000000");
+    expect(output).toContain("HASHB00000000");
+  });
+
+  test("shows a downward delta when B scores lower", () => {
+    const a = makePayload("fix-off-by-one", 0.9, 1000);
+    const b = makePayload("fix-off-by-one", 0.4, 2000);
+    const output = formatDiff(a, "HASHA00000000", b, "HASHB00000000");
+    expect(output).toContain("▼");
+  });
+
+  test("marks differing config values", () => {
+    const a = makePayload("fix-off-by-one", 0.6, 1000, undefined, {
+      agent: "hermes",
+      model: "claude-sonnet-4",
+      engineVersion: "1.0.0",
+    });
+    const b = makePayload("fix-off-by-one", 0.6, 2000, undefined, {
+      agent: "claude-code",
+      model: "claude-sonnet-4",
+      engineVersion: "1.0.0",
+    });
+    const output = formatDiff(a, "HASHA00000000", b, "HASHB00000000");
+    expect(output).toContain("≠");
+    expect(output).toContain("claude-code");
+  });
+});
@@ -0,0 +1,74 @@
+import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+
+import { afterEach, beforeEach, describe, expect, test } from "vitest";
+
+import { prepare } from "../src/runner/index.js";
+
+const TASK_YAML = `
+name: fix-off-by-one
+description: Fix an off-by-one error
+workflow: solve-issue
+prompt: "Fix the bug"
+limits:
+  maxSteps: 12
+  timeoutMinutes: 20
+judges:
+  - name: frontmatter-compliance
+    weight: 0.5
+    builtin: true
+  - name: test-pass
+    weight: 0.5
+    entry: dist/judges/test-pass.js
+`;
+
+let taskDir: string;
+
+beforeEach(async () => {
+  taskDir = await mkdtemp(join(tmpdir(), "uwf-eval-task-"));
+  await writeFile(join(taskDir, "task.yaml"), TASK_YAML, "utf8");
+  const fixtureDir = join(taskDir, "fixture");
+  await mkdir(join(fixtureDir, "src"), { recursive: true });
+  await writeFile(join(fixtureDir, "src", "calc.ts"), "export const add = (a, b) => a + b + 1;\n");
+  await writeFile(join(fixtureDir, "package.json"), '{ "name": "fixture" }\n');
+});
+
+afterEach(async () => {
+  await rm(taskDir, { recursive: true, force: true });
+});
+
+// Suite for prepare(): manifest parsing and work-dir creation. Every temp directory a test
+// creates (including the work dir prepare() returns) is removed in a `finally`, so a failing
+// assertion or a throwing prepare() cannot leave directories behind in the OS temp dir.
+describe("prepare", () => {
+  /** Remove a temp directory tree; `force` makes an already-missing path a no-op. */
+  const removeDir = (dir: string): Promise<void> => rm(dir, { recursive: true, force: true });
+
+  test("returns the parsed manifest", async () => {
+    const result = await prepare(taskDir);
+    try {
+      expect(result.taskDir).toBe(taskDir);
+      expect(result.manifest.name).toBe("fix-off-by-one");
+      expect(result.manifest.workflow).toBe("solve-issue");
+      expect(result.manifest.limits.maxSteps).toBe(12);
+      expect(result.manifest.judges).toHaveLength(2);
+    } finally {
+      // This test never inspects the work dir, but prepare() still returned one — drop it.
+      await removeDir(result.workDir);
+    }
+  });
+
+  test("copies fixture into a fresh temp work dir", async () => {
+    const result = await prepare(taskDir);
+    try {
+      expect(result.workDir).not.toBe(taskDir);
+      expect(result.workDir.startsWith(tmpdir())).toBe(true);
+
+      const calc = await readFile(join(result.workDir, "src", "calc.ts"), "utf8");
+      expect(calc).toContain("export const add");
+      const pkg = await readFile(join(result.workDir, "package.json"), "utf8");
+      expect(pkg).toContain("fixture");
+    } finally {
+      await removeDir(result.workDir);
+    }
+  });
+
+  test("creates an empty work dir when no fixture/ exists", async () => {
+    const noFixtureDir = await mkdtemp(join(tmpdir(), "uwf-eval-nofix-"));
+    // Stays undefined if prepare() throws, in which case there is no work dir to remove.
+    let workDir: string | undefined;
+    try {
+      await writeFile(join(noFixtureDir, "task.yaml"), TASK_YAML, "utf8");
+
+      const result = await prepare(noFixtureDir);
+      workDir = result.workDir;
+      expect(result.workDir.startsWith(tmpdir())).toBe(true);
+    } finally {
+      await removeDir(noFixtureDir);
+      if (workDir !== undefined) {
+        await removeDir(workDir);
+      }
+    }
+  });
+});
@@ -0,0 +1,63 @@
+import { describe, expect, test } from "vitest";
+import {
+  EVAL_JUDGE_FRONTMATTER_SCHEMA,
+  EVAL_JUDGE_HALLUCINATION_SCHEMA,
+  EVAL_JUDGE_TOKEN_STATS_SCHEMA,
+  EVAL_JUDGE_UPSTREAM_SCHEMA,
+  EVAL_RUN_SCHEMA,
+} from "../src/storage/index.js";
+
+describe("OCAS schema definitions", () => {
+  test("eval-run schema has correct title and required fields", () => {
+    expect(EVAL_RUN_SCHEMA.title).toBe("@uwf/eval-run");
+    const required = EVAL_RUN_SCHEMA.required as string[];
+    expect(required).toContain("task");
+    expect(required).toContain("config");
+    expect(required).toContain("threadId");
+    expect(required).toContain("judges");
+    expect(required).toContain("overall");
+    expect(required).toContain("timestamp");
+  });
+
+  test("frontmatter judge schema has correct title", () => {
+    expect(EVAL_JUDGE_FRONTMATTER_SCHEMA.title).toBe("@uwf/eval-judge-frontmatter");
+    const required = EVAL_JUDGE_FRONTMATTER_SCHEMA.required as string[];
+    expect(required).toContain("stepsTotal");
+    expect(required).toContain("stepsValid");
+    expect(required).toContain("invalidSteps");
+  });
+
+  test("upstream judge schema has correct title", () => {
+    expect(EVAL_JUDGE_UPSTREAM_SCHEMA.title).toBe("@uwf/eval-judge-upstream");
+    const required = EVAL_JUDGE_UPSTREAM_SCHEMA.required as string[];
+    expect(required).toContain("perStep");
+  });
+
+  test("hallucination judge schema has correct title", () => {
+    expect(EVAL_JUDGE_HALLUCINATION_SCHEMA.title).toBe("@uwf/eval-judge-hallucination");
+    const required = EVAL_JUDGE_HALLUCINATION_SCHEMA.required as string[];
+    expect(required).toContain("perStep");
+  });
+
+  test("token-stats judge schema has correct title", () => {
+    expect(EVAL_JUDGE_TOKEN_STATS_SCHEMA.title).toBe("@uwf/eval-judge-token-stats");
+    const required = EVAL_JUDGE_TOKEN_STATS_SCHEMA.required as string[];
+    expect(required).toContain("totalInput");
+    expect(required).toContain("totalOutput");
+    expect(required).toContain("totalTurns");
+    expect(required).toContain("perStep");
+  });
+
+  test("all schemas have type object at root", () => {
+    const schemas = [
+      EVAL_RUN_SCHEMA,
+      EVAL_JUDGE_FRONTMATTER_SCHEMA,
+      EVAL_JUDGE_UPSTREAM_SCHEMA,
+      EVAL_JUDGE_HALLUCINATION_SCHEMA,
+      EVAL_JUDGE_TOKEN_STATS_SCHEMA,
+    ];
+    for (const s of schemas) {
+      expect(s.type).toBe("object");
+    }
+  });
+});
@@ -0,0 +1,163 @@
+import { describe, expect, test } from "vitest";
+import { parseTaskManifest } from "../src/task/index.js";
+
+const VALID_YAML = `
+name: fix-off-by-one
+description: Fix an off-by-one error in a calculator
+workflow: solve-issue
+prompt: "Fix the bug: add(1,2) returns 4 instead of 3"
+limits:
+  maxSteps: 15
+  timeoutMinutes: 30
+judges:
+  - name: frontmatter-compliance
+    weight: 0.15
+    builtin: true
+  - name: test-pass
+    weight: 0.3
+    entry: dist/judges/test-pass.js
+    schema: schemas/test-pass.json
+`;
+
+describe("parseTaskManifest", () => {
+  test("parses valid task.yaml", () => {
+    const manifest = parseTaskManifest(VALID_YAML);
+    expect(manifest.name).toBe("fix-off-by-one");
+    expect(manifest.description).toBe("Fix an off-by-one error in a calculator");
+    expect(manifest.workflow).toBe("solve-issue");
+    expect(manifest.prompt).toBe("Fix the bug: add(1,2) returns 4 instead of 3");
+    expect(manifest.limits).toEqual({ maxSteps: 15, timeoutMinutes: 30 });
+    expect(manifest.judges).toHaveLength(2);
+  });
+
+  test("parses builtin judge", () => {
+    const manifest = parseTaskManifest(VALID_YAML);
+    const builtin = manifest.judges[0];
+    expect(builtin).toBeDefined();
+    expect(builtin!.name).toBe("frontmatter-compliance");
+    expect(builtin!.weight).toBe(0.15);
+    expect(builtin!.builtin).toBe(true);
+    expect(builtin!.entry).toBeNull();
+  });
+
+  test("parses custom judge with entry + schema", () => {
+    const manifest = parseTaskManifest(VALID_YAML);
+    const custom = manifest.judges[1];
+    expect(custom).toBeDefined();
+    expect(custom!.name).toBe("test-pass");
+    expect(custom!.weight).toBe(0.3);
+    expect(custom!.builtin).toBe(false);
+    expect(custom!.entry).toBe("dist/judges/test-pass.js");
+    expect(custom!.schema).toBe("schemas/test-pass.json");
+  });
+
+  test("defaults limits when omitted", () => {
+    const yaml = `
+name: minimal
+workflow: solve-issue
+prompt: do something
+judges:
+  - name: check
+    builtin: true
+`;
+    const manifest = parseTaskManifest(yaml);
+    expect(manifest.limits).toEqual({ maxSteps: 20, timeoutMinutes: 30 });
+  });
+
+  test("defaults description to empty string", () => {
+    const yaml = `
+name: no-desc
+workflow: solve-issue
+prompt: do something
+judges:
+  - name: check
+    builtin: true
+`;
+    const manifest = parseTaskManifest(yaml);
+    expect(manifest.description).toBe("");
+  });
+
+  test("rejects missing name", () => {
+    const yaml = `
+workflow: solve-issue
+prompt: do something
+judges:
+  - name: check
+    builtin: true
+`;
+    expect(() => parseTaskManifest(yaml)).toThrow("name is required");
+  });
+
+  test("rejects missing workflow", () => {
+    const yaml = `
+name: test
+prompt: do something
+judges:
+  - name: check
+    builtin: true
+`;
+    expect(() => parseTaskManifest(yaml)).toThrow("workflow is required");
+  });
+
+  test("rejects missing prompt", () => {
+    const yaml = `
+name: test
+workflow: solve-issue
+judges:
+  - name: check
+    builtin: true
+`;
+    expect(() => parseTaskManifest(yaml)).toThrow("prompt is required");
+  });
+
+  test("rejects empty judges array", () => {
+    const yaml = `
+name: test
+workflow: solve-issue
+prompt: do something
+judges: []
+`;
+    expect(() => parseTaskManifest(yaml)).toThrow("at least one judge");
+  });
+
+  test("rejects non-builtin judge without entry", () => {
+    const yaml = `
+name: test
+workflow: solve-issue
+prompt: do something
+judges:
+  - name: custom-check
+    weight: 0.5
+`;
+    expect(() => parseTaskManifest(yaml)).toThrow("non-builtin judge must have entry");
+  });
+
+  test("rejects non-object YAML root", () => {
+    expect(() => parseTaskManifest("just a string")).toThrow("must be a YAML mapping");
+  });
+
+  test("rejects judge without name", () => {
+    const yaml = `
+name: test
+workflow: solve-issue
+prompt: do something
+judges:
+  - weight: 0.5
+    builtin: true
+`;
+    expect(() => parseTaskManifest(yaml)).toThrow("name is required");
+  });
+
+  test("defaults weight to 0 when omitted", () => {
+    const yaml = `
+name: test
+workflow: solve-issue
+prompt: do something
+judges:
+  - name: token-stats
+    builtin: true
+`;
+    const manifest = parseTaskManifest(yaml);
+    expect(manifest.judges[0]!.weight).toBe(0);
+  });
+});
@@ -0,0 +1,46 @@
+{
+  "name": "@united-workforce/eval",
+  "version": "0.1.0",
+  "private": true,
+  "files": [
+    "src",
+    "dist",
+    "package.json"
+  ],
+  "type": "module",
+  "bin": {
+    "uwf-eval": "./dist/cli.js"
+  },
+  "exports": {
+    ".": {
+      "types": "./dist/index.d.ts",
+      "import": "./dist/index.js"
+    }
+  },
+  "scripts": {
+    "prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1",
+    "test": "vitest run __tests__/",
+    "test:ci": "vitest run __tests__/"
+  },
+  "dependencies": {
+    "@ocas/core": "^0.3.0",
+    "@ocas/fs": "^0.3.0",
+    "@united-workforce/protocol": "workspace:^",
+    "@united-workforce/util": "workspace:^",
+    "commander": "^14.0.3",
+    "yaml": "^2.9.0"
+  },
+  "devDependencies": {
+    "typescript": "^5.8.3"
+  },
+  "repository": {
+    "type": "git",
+    "url": "https://git.shazhou.work/shazhou/united-workforce.git",
+    "directory": "packages/eval"
+  },
+  "homepage": "https://git.shazhou.work/shazhou/united-workforce#readme",
+  "bugs": {
+    "url": "https://git.shazhou.work/shazhou/united-workforce/issues"
+  },
+  "license": "MIT"
+}
@@ -0,0 +1,22 @@
+#!/usr/bin/env node
+import { Command } from "commander";
+import {
+  registerDiffCommand,
+  registerListCommand,
+  registerReportCommand,
+  registerRunCommand,
+} from "./commands/index.js";
+
+// CLI entry point: build the root command, attach every subcommand, then parse argv.
+const program = new Command();
+
+program
+  .name("uwf-eval")
+  .description("Evaluate uwf workflow quality with real agents")
+  .version("0.1.0");
+
+registerRunCommand(program);
+registerReportCommand(program);
+registerDiffCommand(program);
+registerListCommand(program);
+
+// Subcommand actions can be async (the diff action is), so use parseAsync(): parse() returns
+// before such handlers settle, and a rejection escaping one would surface as an unhandled
+// rejection instead of a clean message and a non-zero exit code.
+program.parseAsync().catch((e: unknown) => {
+  const message = e instanceof Error ? e.message : String(e);
+  process.stderr.write(`${message}\n`);
+  process.exitCode = 1;
+});
@@ -0,0 +1,38 @@
+import { createLogger } from "@united-workforce/util";
+import type { Command } from "commander";
+
+import { createEvalStore } from "../storage/index.js";
+import { formatDiff } from "./format.js";
+import { readEvalRun } from "./read.js";
+
+const log = createLogger({ sink: { kind: "stderr" } });
+const LOG_DIFF = "D3WZ8N5T";
+
+/**
+ * Register the `diff <hash1> <hash2>` subcommand on `program`.
+ *
+ * The action opens the eval store, loads both runs with `readEvalRun`, and writes the
+ * `formatDiff` output to stdout. If either hash is not found, or anything throws, a message is
+ * written to stderr and `process.exitCode` is set to 1 (the process is not exited directly).
+ */
+export function registerDiffCommand(program: Command): void {
+  program
+    .command("diff <hash1> <hash2>")
+    .description("Compare two eval runs side-by-side")
+    .action(async (hash1: string, hash2: string) => {
+      try {
+        const evalStore = await createEvalStore();
+        const payloadA = readEvalRun(evalStore, hash1);
+        if (payloadA === null) {
+          process.stderr.write(`eval run not found: ${hash1}\n`);
+          process.exitCode = 1;
+          return;
+        }
+        const payloadB = readEvalRun(evalStore, hash2);
+        if (payloadB === null) {
+          process.stderr.write(`eval run not found: ${hash2}\n`);
+          process.exitCode = 1;
+          return;
+        }
+        log(LOG_DIFF, `diff a=${hash1} b=${hash2}`);
+        process.stdout.write(formatDiff(payloadA, hash1, payloadB, hash2));
+      } catch (e) {
+        const message = e instanceof Error ? e.message : String(e);
+        process.stderr.write(`${message}\n`);
+        process.exitCode = 1;
+      }
+    });
+}
@@ -0,0 +1,148 @@
+import type { EvalRunPayload } from "../storage/index.js";
+import type { EvalListEntry } from "./types.js";
+
+/** Column width for task and judge names. */
+const NAME_WIDTH = 28;
+/** Column width for score cells (also used for the config labels in the diff view). */
+const SCORE_WIDTH = 10;
+/** Column width for the ISO-8601 timestamp column in the list view. */
+const TIMESTAMP_WIDTH = 26;
+
+/** Render a 0..1 score (or weight) with exactly four decimal places. */
+function formatScore(value: number): string {
+  const fractionDigits = 4;
+  return value.toFixed(fractionDigits);
+}
+
+/** Convert epoch milliseconds into an ISO-8601 timestamp string. */
+function formatTimestamp(ms: number): string {
+  const instant = new Date(ms);
+  return instant.toISOString();
+}
+
+/**
+ * Pad `value` on the right with spaces up to `width` characters. A value that
+ * already fills (or overflows) the column gets a single trailing space so it
+ * never touches the next cell.
+ */
+function pad(value: string, width: number): string {
+  if (value.length < width) {
+    return value.padEnd(width);
+  }
+  return `${value} `;
+}
+
+/**
+ * Directional indicator for a score delta (B relative to A).
+ *
+ * Returns `▲ +x` for an increase, `▼ -x` for a decrease and `= 0.0000` when
+ * there is no change. The direction is decided at display precision: a delta
+ * that rounds to zero (for example floating-point noise from subtracting two
+ * scores) is reported as unchanged rather than as `▲ +0.0000` / `▼ -0.0000`.
+ */
+function formatDelta(delta: number): string {
+  const zero = formatScore(0);
+  // NaN is neither positive nor negative; it is reported as "no change".
+  if (!(delta > 0) && !(delta < 0)) {
+    return `= ${zero}`;
+  }
+  const magnitude = formatScore(Math.abs(delta));
+  if (magnitude === zero) {
+    return `= ${zero}`;
+  }
+  return delta > 0 ? `▲ +${magnitude}` : `▼ -${magnitude}`;
+}
+
+/**
+ * Build the human-readable report for one eval run: task, overall score,
+ * timestamp, run config, a per-judge score/weight table, then the thread id
+ * and the run hash. The result ends with a newline.
+ */
+export function formatReport(payload: EvalRunPayload, runHash: string): string {
+  const { config } = payload;
+
+  // One table row per judge, aligned under the NAME / SCORE / WEIGHT header.
+  const judgeRows = payload.judges.map(
+    (judge) =>
+      `  ${pad(judge.name, NAME_WIDTH)}${pad(formatScore(judge.score), SCORE_WIDTH)}${formatScore(judge.weight)}`,
+  );
+
+  const report = [
+    "=== Eval Report ===",
+    `Task:       ${payload.task}`,
+    `Overall:    ${formatScore(payload.overall)}`,
+    `Timestamp:  ${formatTimestamp(payload.timestamp)}`,
+    "",
+    "Config:",
+    `  Agent:    ${config.agent}`,
+    `  Model:    ${config.model}`,
+    `  Engine:   ${config.engineVersion}`,
+    "",
+    "Judges:",
+    `  ${pad("NAME", NAME_WIDTH)}${pad("SCORE", SCORE_WIDTH)}WEIGHT`,
+    ...judgeRows,
+    "",
+    `Thread:     ${payload.threadId}`,
+    `Run:        ${runHash}`,
+  ];
+  return `${report.join("\n")}\n`;
+}
+
+/**
+ * Build a side-by-side comparison of two eval runs: a header naming both
+ * runs, the overall scores with their delta, one config row per field, and a
+ * judge table. A judge present in only one run shows `—` in the other column
+ * and no delta. The result ends with a newline.
+ */
+export function formatDiff(
+  payloadA: EvalRunPayload,
+  hashA: string,
+  payloadB: EvalRunPayload,
+  hashB: string,
+): string {
+  const headerSection = [
+    "=== Eval Diff ===",
+    `A: ${hashA}  (${payloadA.task})`,
+    `B: ${hashB}  (${payloadB.task})`,
+    "",
+  ];
+
+  const overallSection = [
+    "Overall:",
+    `  A=${formatScore(payloadA.overall)}  B=${formatScore(payloadB.overall)}  ${formatDelta(payloadB.overall - payloadA.overall)}`,
+    "",
+  ];
+
+  const configSection = [
+    "Config:",
+    configLine("Agent", payloadA.config.agent, payloadB.config.agent),
+    configLine("Model", payloadA.config.model, payloadB.config.model),
+    configLine("Engine", payloadA.config.engineVersion, payloadB.config.engineVersion),
+    "",
+  ];
+
+  // Score lookup by judge name for each side.
+  const lookupA = new Map(payloadA.judges.map((judge) => [judge.name, judge.score]));
+  const lookupB = new Map(payloadB.judges.map((judge) => [judge.name, judge.score]));
+  const judgeRows = unionJudgeNames(payloadA, payloadB).map((name) => {
+    const left = lookupA.get(name);
+    const right = lookupB.get(name);
+    const leftCell = left === undefined ? "—" : formatScore(left);
+    const rightCell = right === undefined ? "—" : formatScore(right);
+    // A delta only makes sense when both runs scored this judge.
+    let change = "";
+    if (left !== undefined && right !== undefined) {
+      change = formatDelta(right - left);
+    }
+    return `  ${pad(name, NAME_WIDTH)}${pad(leftCell, SCORE_WIDTH)}${pad(rightCell, SCORE_WIDTH)}${change}`;
+  });
+
+  const judgeSection = [
+    "Judges:",
+    `  ${pad("NAME", NAME_WIDTH)}${pad("A", SCORE_WIDTH)}${pad("B", SCORE_WIDTH)}DELTA`,
+    ...judgeRows,
+  ];
+
+  const all = [...headerSection, ...overallSection, ...configSection, ...judgeSection];
+  return `${all.join("\n")}\n`;
+}
+
+/**
+ * Build the table shown by the `list` command: a header row followed by one
+ * row per entry, or a single placeholder row when there are no entries. The
+ * result ends with a newline.
+ */
+export function formatList(entries: ReadonlyArray<EvalListEntry>): string {
+  const header = `  ${pad("TASK", NAME_WIDTH)}${pad("OVERALL", SCORE_WIDTH)}${pad("TIMESTAMP", TIMESTAMP_WIDTH)}HASH`;
+  const rows =
+    entries.length === 0
+      ? ["  (no eval runs found)"]
+      : entries.map(
+          (entry) =>
+            `  ${pad(entry.task, NAME_WIDTH)}${pad(formatScore(entry.overall), SCORE_WIDTH)}${pad(formatTimestamp(entry.timestamp), TIMESTAMP_WIDTH)}${entry.hash}`,
+        );
+  return `${[header, ...rows].join("\n")}\n`;
+}
+
+/**
+ * Select the entries to display: apply the optional task filter, order the
+ * matches newest-first, then apply the optional result limit.
+ *
+ * Filtering happens before sorting so only matching entries are sorted; the
+ * result order is the same because the sort is stable. The input array is
+ * never mutated.
+ *
+ * @param entries - All indexed eval runs.
+ * @param task - Task name to keep, or `null` to keep every task.
+ * @param limit - Maximum number of rows, or `null` for no limit. A negative
+ *   limit yields an empty result instead of silently dropping tail entries.
+ * @returns A new array of the selected entries.
+ */
+export function selectEntries(
+  entries: ReadonlyArray<EvalListEntry>,
+  task: string | null,
+  limit: number | null,
+): EvalListEntry[] {
+  const matching = task === null ? [...entries] : entries.filter((entry) => entry.task === task);
+  matching.sort((a, b) => b.timestamp - a.timestamp);
+  return limit === null ? matching : matching.slice(0, Math.max(0, limit));
+}
+
+/**
+ * Collect every judge name that appears in either run, without duplicates:
+ * names from A in A's order, followed by names that only B has.
+ */
+function unionJudgeNames(payloadA: EvalRunPayload, payloadB: EvalRunPayload): string[] {
+  // A Set keeps first-insertion order, which gives "A first, then B-only".
+  const ordered = new Set<string>();
+  for (const { name } of payloadA.judges) {
+    ordered.add(name);
+  }
+  for (const { name } of payloadB.judges) {
+    ordered.add(name);
+  }
+  return Array.from(ordered);
+}
+
+/**
+ * Build one config comparison row: the padded label, a `=` marker when both
+ * values match or `≠` when they differ, then both values.
+ */
+function configLine(label: string, valueA: string, valueB: string): string {
+  const labelCell = pad(`${label}:`, SCORE_WIDTH);
+  const marker = valueA !== valueB ? "≠" : "=";
+  return `  ${labelCell}${marker} A=${valueA}  B=${valueB}`;
+}
@@ -0,0 +1,7 @@
+export { registerDiffCommand } from "./diff.js";
+export { formatDiff, formatList, formatReport, selectEntries } from "./format.js";
+export { registerListCommand } from "./list.js";
+export { readEvalEntries, readEvalRun } from "./read.js";
+export { registerReportCommand } from "./report.js";
+export { registerRunCommand } from "./run.js";
+export type { EvalListEntry } from "./types.js";
@@ -0,0 +1,43 @@
+import { createLogger } from "@united-workforce/util";
+import type { Command } from "commander";
+
+import { createEvalStore } from "../storage/index.js";
+import { formatList, selectEntries } from "./format.js";
+import { readEvalEntries } from "./read.js";
+
+/** Logger for this command; its sink is stderr, so stdout carries only the rendered table. */
+const log = createLogger({ sink: { kind: "stderr" } });
+/** Event code passed as the first argument to `log` for list invocations. */
+const LOG_LIST = "L5KX9R2B";
+
+/** Raw option values handed to the `list` action by commander. */
+type ListCliOptions = {
+  /** Value of `--task`; undefined when the flag is not given. */
+  task: string | undefined;
+  /** Value of `--limit` as typed on the command line (defaults to "20"); parsed by the action. */
+  limit: string;
+};
+
+/**
+ * Register the `list` subcommand on `program`.
+ *
+ * The action validates `--limit`, reads every indexed eval run from the eval
+ * store, applies the optional `--task` filter and the limit, and writes the
+ * resulting table to stdout. An invalid limit or any thrown error is reported
+ * on stderr and sets the process exit code to 1.
+ */
+export function registerListCommand(program: Command): void {
+  program
+    .command("list")
+    .description("List past eval runs")
+    .option("--task <name>", "filter by task name")
+    .option("--limit <n>", "max results", "20")
+    .action(async (opts: ListCliOptions) => {
+      // `Number.parseInt` alone accepts trailing garbage ("10x" → 10) and
+      // truncates "1.5" to 1, so require the whole value to be decimal digits.
+      const rawLimit = opts.limit.trim();
+      const limit = /^\d+$/.test(rawLimit) ? Number.parseInt(rawLimit, 10) : Number.NaN;
+      if (!Number.isSafeInteger(limit) || limit < 1) {
+        process.stderr.write("--limit must be a positive integer\n");
+        process.exitCode = 1;
+        return;
+      }
+
+      try {
+        const evalStore = await createEvalStore();
+        const entries = readEvalEntries(evalStore);
+        const task = opts.task ?? null;
+        const selected = selectEntries(entries, task, limit);
+        log(LOG_LIST, `list task=${task ?? "*"} found=${entries.length} shown=${selected.length}`);
+        process.stdout.write(formatList(selected));
+      } catch (e) {
+        const message = e instanceof Error ? e.message : String(e);
+        process.stderr.write(`${message}\n`);
+        process.exitCode = 1;
+      }
+    });
+}
@@ -0,0 +1,41 @@
+import type { EvalRunPayload, EvalStore } from "../storage/index.js";
+import type { EvalListEntry } from "./types.js";
+
+/** Name prefix shared by eval run pointer variables (`@uwf/eval/<task>/latest`). */
+const EVAL_VAR_PREFIX = "@uwf/eval/";
+/** Name suffix of the pointer variables that `readEvalEntries` scans for. */
+const EVAL_VAR_SUFFIX = "/latest";
+
+/**
+ * Look up `hash` in the eval store's CAS and return the node's payload typed
+ * as an eval run, or null when the lookup returns null. The payload is cast,
+ * not validated.
+ */
+export function readEvalRun(evalStore: EvalStore, hash: string): EvalRunPayload | null {
+  const { cas } = evalStore.store;
+  const node = cas.get(hash);
+  return node === null ? null : (node.payload as EvalRunPayload);
+}
+
+/**
+ * Build the list of indexed eval runs. Every variable whose name starts with
+ * `@uwf/eval/` and ends with `/latest` is treated as a pointer to an eval-run
+ * CAS node; pointers whose node is absent are skipped. Entries are returned
+ * in the order the variable store lists them.
+ */
+export function readEvalEntries(evalStore: EvalStore): EvalListEntry[] {
+  const { store, varStore } = evalStore;
+  const entries: EvalListEntry[] = [];
+  for (const { name, value } of varStore.list()) {
+    const isEvalPointer = name.startsWith(EVAL_VAR_PREFIX) && name.endsWith(EVAL_VAR_SUFFIX);
+    if (!isEvalPointer) {
+      continue;
+    }
+    const node = store.cas.get(value);
+    if (node !== null) {
+      // The payload is cast, not validated.
+      const { task, overall, timestamp } = node.payload as EvalRunPayload;
+      entries.push({ task, overall, timestamp, hash: value });
+    }
+  }
+  return entries;
+}
@@ -0,0 +1,32 @@
+import { createLogger } from "@united-workforce/util";
+import type { Command } from "commander";
+
+import { createEvalStore } from "../storage/index.js";
+import { formatReport } from "./format.js";
+import { readEvalRun } from "./read.js";
+
+/** Logger for this command; its sink is stderr, so stdout carries only the rendered report. */
+const log = createLogger({ sink: { kind: "stderr" } });
+/** Event code passed as the first argument to `log` for report invocations. */
+const LOG_REPORT = "R7QP2M4K";
+
+/**
+ * Register the `report <hash>` subcommand on `program`.
+ *
+ * The action loads the eval run from the eval store and writes its formatted
+ * report to stdout. A missing run, or any thrown error, is reported on stderr
+ * and sets the process exit code to 1.
+ */
+export function registerReportCommand(program: Command): void {
+  // Happy path plus the "not found" case; thrown errors are handled by the caller.
+  const showReport = async (hash: string): Promise<void> => {
+    const evalStore = await createEvalStore();
+    const payload = readEvalRun(evalStore, hash);
+    if (payload === null) {
+      process.stderr.write(`eval run not found: ${hash}\n`);
+      process.exitCode = 1;
+      return;
+    }
+    log(LOG_REPORT, `report task=${payload.task} hash=${hash}`);
+    process.stdout.write(formatReport(payload, hash));
+  };
+
+  program
+    .command("report <hash>")
+    .description("Show eval run results")
+    .action(async (hash: string) => {
+      try {
+        await showReport(hash);
+      } catch (e) {
+        const message = e instanceof Error ? e.message : String(e);
+        process.stderr.write(`${message}\n`);
+        process.exitCode = 1;
+      }
+    });
+}
@@ -0,0 +1,84 @@
+import { resolve } from "node:path";
+
+import type { Command } from "commander";
+import type { RunResult } from "../runner/index.js";
+import { collect, execute, getEngineVersion, prepare } from "../runner/index.js";
+import type { EvalRunConfig } from "../storage/index.js";
+import { createEvalStore } from "../storage/index.js";
+
+/** Raw option values handed to the `run` action by commander. */
+type RunCliOptions = {
+  /** Value of `--agent` (defaults to "hermes"). */
+  agent: string;
+  /** Value of `--model`; undefined when the flag is not given. */
+  model: string | undefined;
+  /** Value of `--count` as typed on the command line (defaults to "1"); parsed by the action. */
+  count: string;
+};
+
+/**
+ * Perform one full eval run: prepare the task, execute its workflow, then run
+ * the judges and store the result.
+ *
+ * `model` and `engineVersion` are only recorded in the stored run config;
+ * they are not passed to the execute phase here.
+ *
+ * @returns The stored run's hash, overall score, task name and judge summaries.
+ */
+async function runOnce(
+  taskDir: string,
+  agent: string,
+  model: string,
+  engineVersion: string,
+): Promise<RunResult> {
+  const { manifest, workDir, taskDir: preparedTaskDir } = await prepare(taskDir);
+
+  const executed = await execute({
+    workDir,
+    workflow: manifest.workflow,
+    prompt: manifest.prompt,
+    agent,
+    maxSteps: manifest.limits.maxSteps,
+  });
+
+  const evalStore = await createEvalStore();
+  const runConfig: EvalRunConfig = { agent, model, engineVersion };
+  const { runHash, overall, judges } = await collect({
+    evalStore,
+    taskDir: preparedTaskDir,
+    workDir,
+    threadId: executed.threadId,
+    manifest,
+    config: runConfig,
+  });
+
+  return { runHash, overall, task: manifest.name, judges };
+}
+
+/**
+ * Register the `run <task>` subcommand on `program`.
+ *
+ * The action validates `--count`, then performs that many eval runs one after
+ * another against the resolved task path. It prints a single JSON line to
+ * stdout: the run result object when the count is 1, otherwise an array of
+ * results. An invalid count or any thrown error is reported on stderr and
+ * sets the process exit code to 1; results of earlier runs are not printed
+ * when a later run fails.
+ */
+export function registerRunCommand(program: Command): void {
+  program
+    .command("run <task>")
+    .description("Run eval on a task directory or tarball")
+    .option("--agent <name>", "agent adapter to use", "hermes")
+    .option("--model <model>", "model override")
+    .option("--count <n>", "number of eval runs", "1")
+    .action(async (task: string, opts: RunCliOptions) => {
+      const taskDir = resolve(task);
+      const agent = opts.agent;
+      // An absent --model is recorded as the empty string.
+      const model = opts.model ?? "";
+      // `Number.parseInt` alone accepts trailing garbage ("3x" → 3) and
+      // truncates "1.5" to 1, so require the whole value to be decimal digits.
+      const rawCount = opts.count.trim();
+      const count = /^\d+$/.test(rawCount) ? Number.parseInt(rawCount, 10) : Number.NaN;
+      if (!Number.isSafeInteger(count) || count < 1) {
+        process.stderr.write("--count must be a positive integer\n");
+        process.exitCode = 1;
+        return;
+      }
+
+      const engineVersion = getEngineVersion();
+
+      try {
+        const results: RunResult[] = [];
+        for (let i = 0; i < count; i++) {
+          results.push(await runOnce(taskDir, agent, model, engineVersion));
+        }
+        const output = count === 1 ? results[0] : results;
+        process.stdout.write(`${JSON.stringify(output)}\n`);
+      } catch (e) {
+        const message = e instanceof Error ? e.message : String(e);
+        process.stderr.write(`${message}\n`);
+        process.exitCode = 1;
+      }
+    });
+}
@@ -0,0 +1,9 @@
+import type { CasRef } from "@united-workforce/protocol";
+
+/** Summary row for the `list` command: one indexed eval run. */
+export type EvalListEntry = {
+  /** Task name copied from the eval-run payload. */
+  task: string;
+  /** Overall score copied from the eval-run payload. */
+  overall: number;
+  /** Run timestamp in epoch milliseconds, copied from the eval-run payload. */
+  timestamp: number;
+  /** CAS hash of the eval-run node (the value of the `latest` pointer variable). */
+  hash: CasRef;
+};
@@ -0,0 +1,34 @@
+// Judge types
+export type { JudgeInput, JudgeOutput } from "./judge/index.js";
+export type {
+  CollectInput,
+  CollectResult,
+  ExecuteInput,
+  ExecuteResult,
+  JudgeRunner,
+  JudgeRunOutput,
+  JudgeSummary,
+  PrepareResult,
+  RunOptions,
+  RunResult,
+} from "./runner/index.js";
+// Runner (prepare → execute → collect)
+export { collect, computeOverall, execute, getEngineVersion, prepare } from "./runner/index.js";
+export type {
+  EvalJudgeRecord,
+  EvalRunConfig,
+  EvalRunPayload,
+  EvalStore,
+} from "./storage/index.js";
+// Storage schemas and types
+export {
+  createEvalStore,
+  EVAL_JUDGE_FRONTMATTER_SCHEMA,
+  EVAL_JUDGE_HALLUCINATION_SCHEMA,
+  EVAL_JUDGE_TOKEN_STATS_SCHEMA,
+  EVAL_JUDGE_UPSTREAM_SCHEMA,
+  EVAL_RUN_SCHEMA,
+  setEvalLatest,
+} from "./storage/index.js";
+export type { JudgeEntry, TaskLimits, TaskManifest } from "./task/index.js";
+export { loadTaskManifest, parseTaskManifest } from "./task/index.js";
@@ -0,0 +1,95 @@
+import { createLogger } from "@united-workforce/util";
+import { parse as parseYaml } from "yaml";
+
+import { EVAL_JUDGE_FRONTMATTER_SCHEMA } from "../../storage/index.js";
+import { readThreadSteps } from "./read-steps.js";
+import type { BuiltinJudgeOutput } from "./types.js";
+
+/** Logger for this judge; its sink is stderr. */
+const log = createLogger({ sink: { kind: "stderr" } });
+
+/** Event code passed to `log` when the judge reports its result. */
+const LOG_RESULT = "F2QH7R4M";
+
+/** Delimiter that opens and closes a frontmatter block. */
+const FENCE = "---";
+
+/** One step whose output failed frontmatter validation. */
+type InvalidStep = {
+  /** Zero-based index into the step list returned by `readThreadSteps`. */
+  stepIndex: number;
+  /** Role of the failing step. */
+  role: string;
+  /** Validation errors for this step (never empty for a recorded step). */
+  errors: string[];
+};
+
+/**
+ * Pull the YAML text out of a step output's frontmatter block.
+ *
+ * The output must be a string that begins with `---` followed by a newline;
+ * the block ends at the first newline that is immediately followed by `---`.
+ * Returns the text between the two fences, or null when the output is not a
+ * string, does not start with a fence, or has no closing fence.
+ */
+function extractFrontmatterYaml(output: unknown): string | null {
+  const opener = `${FENCE}\n`;
+  if (typeof output !== "string" || !output.startsWith(opener)) {
+    return null;
+  }
+  const afterOpener = output.slice(opener.length);
+  const closerAt = afterOpener.indexOf(`\n${FENCE}`);
+  return closerAt === -1 ? null : afterOpener.slice(0, closerAt);
+}
+
+/**
+ * Check one step output's frontmatter. Returns an empty list when the output
+ * has a frontmatter block that parses to a YAML mapping with a non-empty
+ * string `$status`; otherwise returns a single-element list describing the
+ * first check that failed.
+ */
+function validateStepFrontmatter(output: unknown): string[] {
+  const yamlText = extractFrontmatterYaml(output);
+  if (yamlText === null) {
+    return ["output does not begin with a valid '---' frontmatter block"];
+  }
+
+  let document: unknown;
+  try {
+    document = parseYaml(yamlText);
+  } catch (e) {
+    const reason = e instanceof Error ? e.message : String(e);
+    return [`frontmatter YAML failed to parse: ${reason}`];
+  }
+
+  const isMapping = typeof document === "object" && document !== null && !Array.isArray(document);
+  if (!isMapping) {
+    return ["frontmatter is not a YAML mapping"];
+  }
+
+  const { $status } = document as Record<string, unknown>;
+  const hasStatus = typeof $status === "string" && $status.trim() !== "";
+  return hasStatus ? [] : ["$status field is missing or not a non-empty string"];
+}
+
+/**
+ * Deterministic judge that checks every step output for valid YAML
+ * frontmatter carrying a non-empty `$status`.
+ *
+ * The score is the fraction of valid steps (0 when the thread has no steps).
+ * The returned data lists the total, the valid count and, for each invalid
+ * step, its index, role and errors.
+ */
+export async function runFrontmatterJudge(threadId: string): Promise<BuiltinJudgeOutput> {
+  const steps = readThreadSteps(threadId);
+
+  const invalidSteps: InvalidStep[] = [];
+  steps.forEach((step, stepIndex) => {
+    const errors = validateStepFrontmatter(step.output);
+    if (errors.length > 0) {
+      invalidSteps.push({ stepIndex, role: step.role, errors });
+    }
+  });
+
+  const stepsTotal = steps.length;
+  const stepsValid = stepsTotal - invalidSteps.length;
+  const score = stepsTotal === 0 ? 0 : stepsValid / stepsTotal;
+
+  log(LOG_RESULT, `frontmatter thread=${threadId} valid=${stepsValid}/${stepsTotal}`);
+
+  const data = { stepsTotal, stepsValid, invalidSteps };
+  return { score, data, schema: EVAL_JUDGE_FRONTMATTER_SCHEMA };
+}
@@ -0,0 +1,17 @@
+import { EVAL_JUDGE_HALLUCINATION_SCHEMA } from "../../storage/index.js";
+import type { BuiltinJudgeOutput } from "./types.js";
+
+/**
+ * LLM-as-judge placeholder for hallucination detection (claims in a step's
+ * output that are not grounded in the available context).
+ *
+ * TODO: LLM-as-judge — needs provider config to call LLM API. Until that path
+ * exists this ignores the thread and always returns score 0 with an empty
+ * `perStep` list.
+ */
+export async function runHallucinationJudge(_threadId: string): Promise<BuiltinJudgeOutput> {
+  const placeholder: BuiltinJudgeOutput = {
+    score: 0,
+    data: { perStep: [] },
+    schema: EVAL_JUDGE_HALLUCINATION_SCHEMA,
+  };
+  return placeholder;
+}
@@ -0,0 +1,6 @@
+export { runFrontmatterJudge } from "./frontmatter.js";
+export { runHallucinationJudge } from "./hallucination.js";
+export { readThreadSteps } from "./read-steps.js";
+export { runTokenStatsJudge } from "./token-stats.js";
+export type { BuiltinJudge, BuiltinJudgeOutput } from "./types.js";
+export { runUpstreamJudge } from "./upstream.js";
@@ -0,0 +1,14 @@
+import { execFileSync } from "node:child_process";
+
+import type { StepEntry, ThreadStepsOutput } from "@united-workforce/protocol";
+
+/**
+ * Shell out to `uwf step list <threadId>` and return the parsed step entries,
+ * excluding the leading start entry.
+ *
+ * The binary is `uwf` unless the `UWF_BIN` environment variable is set to a
+ * non-empty value, matching how the runner resolves the CLI. The output
+ * buffer is raised to 50 MiB because the listing contains every step's
+ * output and can exceed the 1 MiB default.
+ *
+ * @throws If the command fails, its stdout is not valid JSON, or the parsed
+ *   value has no `steps` array.
+ */
+export function readThreadSteps(threadId: string): StepEntry[] {
+  const override = process.env.UWF_BIN;
+  const bin = override !== undefined && override !== "" ? override : "uwf";
+  const stdout = execFileSync(bin, ["step", "list", threadId], {
+    encoding: "utf8",
+    stdio: ["ignore", "pipe", "pipe"],
+    maxBuffer: 50 * 1024 * 1024,
+  }).trim();
+  const parsed: unknown = JSON.parse(stdout);
+  if (
+    typeof parsed !== "object" ||
+    parsed === null ||
+    !Array.isArray((parsed as { steps?: unknown }).steps)
+  ) {
+    throw new Error(`uwf step list output for thread ${threadId} has no steps array`);
+  }
+  // steps[0] is the StartEntry; the rest are StepEntry records.
+  return (parsed as ThreadStepsOutput).steps.slice(1) as StepEntry[];
+}
@@ -0,0 +1,53 @@
+import { createLogger } from "@united-workforce/util";
+
+import { EVAL_JUDGE_TOKEN_STATS_SCHEMA } from "../../storage/index.js";
+import { readThreadSteps } from "./read-steps.js";
+import type { BuiltinJudgeOutput } from "./types.js";
+
+/** Logger for this judge; its sink is stderr. */
+const log = createLogger({ sink: { kind: "stderr" } });
+
+/** Event code passed to `log` when the judge reports its totals. */
+const LOG_RESULT = "T7KQ3M9P";
+
+/** Usage figures reported for one step; every number is 0 when the step has null usage. */
+type PerStepStats = {
+  /** Role of the step. */
+  role: string;
+  /** Copied from the step's `usage.inputTokens`. */
+  inputTokens: number;
+  /** Copied from the step's `usage.outputTokens`. */
+  outputTokens: number;
+  /** Copied from the step's `usage.turns`. */
+  turns: number;
+  /** Copied from the step's `usage.duration` (unit not visible here — TODO confirm). */
+  duration: number;
+};
+
+/**
+ * Informational judge that reports token usage for a thread. It records the
+ * usage figures of every step and sums input tokens, output tokens and turns
+ * across the thread. A step with null usage counts as all zeros. The score is
+ * always 1.0, so this judge never penalizes a run.
+ */
+export async function runTokenStatsJudge(threadId: string): Promise<BuiltinJudgeOutput> {
+  const steps = readThreadSteps(threadId);
+
+  // Stand-in figures for steps that carry no usage record.
+  const noUsage = { inputTokens: 0, outputTokens: 0, turns: 0, duration: 0 };
+
+  const perStep = steps.map((step): PerStepStats => {
+    const source = step.usage !== null ? step.usage : noUsage;
+    const { inputTokens, outputTokens, turns, duration } = source;
+    return { role: step.role, inputTokens, outputTokens, turns, duration };
+  });
+
+  let totalInput = 0;
+  let totalOutput = 0;
+  let totalTurns = 0;
+  for (const row of perStep) {
+    totalInput += row.inputTokens;
+    totalOutput += row.outputTokens;
+    totalTurns += row.turns;
+  }
+
+  log(LOG_RESULT, `token-stats thread=${threadId} in=${totalInput} out=${totalOutput}`);
+
+  const data = { totalInput, totalOutput, totalTurns, perStep };
+  return { score: 1.0, data, schema: EVAL_JUDGE_TOKEN_STATS_SCHEMA };
+}
@@ -0,0 +1,16 @@
+import type { JSONSchema } from "@ocas/core";
+
+/**
+ * Output produced by a builtin judge. Structurally identical to the runner's
+ * `JudgeRunOutput`; defined locally to keep the judge module free of a
+ * dependency on the runner module.
+ */
+export type BuiltinJudgeOutput = {
+  /** Score assigned by the judge. */
+  score: number;
+  /** Judge-specific structured result, described by `schema`. */
+  data: unknown;
+  /** Schema describing `data`, used when persisting to CAS. */
+  schema: JSONSchema;
+};
+
+/**
+ * A builtin judge analyzes a thread's steps and returns a scored result.
+ * It receives only the thread id.
+ */
+export type BuiltinJudge = (threadId: string) => Promise<BuiltinJudgeOutput>;
@@ -0,0 +1,17 @@
+import { EVAL_JUDGE_UPSTREAM_SCHEMA } from "../../storage/index.js";
+import type { BuiltinJudgeOutput } from "./types.js";
+
+/**
+ * LLM-as-judge placeholder for upstream consumption (how well each role
+ * consumed the relevant outputs from upstream steps).
+ *
+ * TODO: LLM-as-judge — needs provider config to call LLM API. Until that path
+ * exists this ignores the thread and always returns score 0 with an empty
+ * `perStep` list.
+ */
+export async function runUpstreamJudge(_threadId: string): Promise<BuiltinJudgeOutput> {
+  const placeholder: BuiltinJudgeOutput = {
+    score: 0,
+    data: { perStep: [] },
+    schema: EVAL_JUDGE_UPSTREAM_SCHEMA,
+  };
+  return placeholder;
+}
@@ -0,0 +1,10 @@
+export {
+  type BuiltinJudge,
+  type BuiltinJudgeOutput,
+  readThreadSteps,
+  runFrontmatterJudge,
+  runHallucinationJudge,
+  runTokenStatsJudge,
+  runUpstreamJudge,
+} from "./builtin/index.js";
+export type { JudgeInput, JudgeOutput } from "./types.js";
@@ -0,0 +1,15 @@
+/**
+ * Output shape every judge must produce on stdout (JSON). The runner parses
+ * the last line of the judge script's stdout, so the JSON object must be on a
+ * single final line.
+ */
+export type JudgeOutput<T = unknown> = {
+  /** Score between 0.0 and 1.0. */
+  score: number;
+  /** Judge-specific structured data, stored in CAS with its own schema. */
+  data: T;
+};
+
+/**
+ * Input context passed to judge scripts via argv. The runner invokes a task
+ * judge as `node <entry> <cwd> <threadId>`.
+ */
+export type JudgeInput = {
+  /** Working directory where the task was executed. */
+  cwd: string;
+  /** Thread ID of the eval run. */
+  threadId: string;
+};
@@ -0,0 +1,172 @@
+import { execFileSync } from "node:child_process";
+import { readFile } from "node:fs/promises";
+import { resolve } from "node:path";
+
+import type { JSONSchema, Store } from "@ocas/core";
+import { putSchema } from "@ocas/core";
+import type { CasRef } from "@united-workforce/protocol";
+import { createLogger } from "@united-workforce/util";
+
+import type { JudgeOutput } from "../judge/index.js";
+import {
+  runFrontmatterJudge,
+  runHallucinationJudge,
+  runTokenStatsJudge,
+  runUpstreamJudge,
+} from "../judge/index.js";
+import type { EvalJudgeRecord, EvalRunPayload } from "../storage/index.js";
+import { EVAL_RUN_SCHEMA, setEvalLatest } from "../storage/index.js";
+import type { JudgeEntry } from "../task/index.js";
+import type {
+  CollectInput,
+  CollectResult,
+  JudgeRunner,
+  JudgeRunOutput,
+  JudgeSummary,
+} from "./types.js";
+
+/** Logger for the collect phase; its sink is stderr. */
+const log = createLogger({ sink: { kind: "stderr" } });
+
+/** Event code logged once per judge, after its data has been stored. */
+const LOG_JUDGE = "CT6N3P2K";
+/** Event code logged once the eval-run record has been stored and indexed. */
+const LOG_STORED = "CT9V2Q7M";
+
+/** Permissive schema for judge data without a dedicated schema (e.g. builtin placeholders). */
+const GENERIC_DATA_SCHEMA: JSONSchema = { type: "object" };
+
+/**
+ * Weighted mean of the judge scores: sum of `score * weight` divided by the
+ * sum of weights. A judge with weight 0 is informational and adds nothing to
+ * either sum. Returns 0 when the total weight is not positive (including an
+ * empty judge list).
+ */
+export function computeOverall(judges: ReadonlyArray<{ score: number; weight: number }>): number {
+  const totalWeight = judges.reduce((sum, judge) => sum + judge.weight, 0);
+  const weightedSum = judges.reduce((sum, judge) => sum + judge.score * judge.weight, 0);
+  if (totalWeight > 0) {
+    return weightedSum / totalWeight;
+  }
+  return 0;
+}
+
+/**
+ * Run a task-provided judge script: `node <entry> <cwd> <threadId>`.
+ *
+ * The entry path is resolved against `taskDir`. The last line of the
+ * script's stdout is parsed as JSON and must be an object with a finite
+ * numeric `score`. When the judge declares a schema file it is loaded from
+ * `taskDir`; otherwise the permissive generic schema is used.
+ *
+ * @throws If the judge has no entry, the script fails, its last stdout line
+ *   is not valid JSON, or the parsed output lacks a finite numeric score.
+ */
+async function runTaskJudge(
+  taskDir: string,
+  workDir: string,
+  threadId: string,
+  judge: JudgeEntry,
+): Promise<JudgeRunOutput> {
+  if (judge.entry === null) {
+    throw new Error(`judge "${judge.name}" is not builtin but has no entry`);
+  }
+  const entryPath = resolve(taskDir, judge.entry);
+
+  let stdout: string;
+  try {
+    stdout = execFileSync("node", [entryPath, workDir, threadId], {
+      encoding: "utf8",
+      stdio: ["ignore", "pipe", "pipe"],
+      maxBuffer: 50 * 1024 * 1024,
+    });
+  } catch (e) {
+    const message = e instanceof Error ? e.message : String(e);
+    throw new Error(`judge "${judge.name}" failed: ${message}`);
+  }
+
+  // Only the final line counts, so a judge may print other output before its result.
+  const line = stdout.trim().split("\n").pop()?.trim() ?? "";
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(line);
+  } catch {
+    throw new Error(`judge "${judge.name}" stdout is not valid JSON: ${line || "(empty)"}`);
+  }
+
+  // JSON `null` and primitives have no `score`; guard before reading it so
+  // they fail with the judge error below instead of a TypeError.
+  const score =
+    typeof parsed === "object" && parsed !== null
+      ? (parsed as Partial<JudgeOutput>).score
+      : undefined;
+  // `JSON.parse` turns an overflowing literal such as 1e999 into Infinity,
+  // which would poison the weighted overall score.
+  if (typeof score !== "number" || !Number.isFinite(score)) {
+    throw new Error(`judge "${judge.name}" output missing numeric score`);
+  }
+  const data = (parsed as JudgeOutput).data;
+
+  const schema =
+    judge.schema !== null ? await loadSchema(resolve(taskDir, judge.schema)) : GENERIC_DATA_SCHEMA;
+  return { score, data, schema };
+}
+
+/** Read the file at `path` as UTF-8 text and parse it as a JSON Schema document. */
+async function loadSchema(path: string): Promise<JSONSchema> {
+  const raw = await readFile(path, { encoding: "utf8" });
+  const schema = JSON.parse(raw) as JSONSchema;
+  return schema;
+}
+
+/**
+ * Look up the builtin judge registered under `name` and run it against the
+ * thread. Throws when no builtin judge has that name.
+ */
+async function runBuiltinJudge(name: string, threadId: string): Promise<JudgeRunOutput> {
+  // A Map (rather than a plain object) so names like "constructor" cannot
+  // resolve to inherited properties.
+  const builtins = new Map<string, (id: string) => Promise<JudgeRunOutput>>([
+    ["frontmatter-compliance", runFrontmatterJudge],
+    ["upstream-consumption", runUpstreamJudge],
+    ["hallucination", runHallucinationJudge],
+    ["token-stats", runTokenStatsJudge],
+  ]);
+  const run = builtins.get(name);
+  if (run === undefined) {
+    throw new Error(`unknown builtin judge "${name}"`);
+  }
+  return run(threadId);
+}
+
+/**
+ * Judge runner used when `collect` is not given one: a judge flagged
+ * `builtin` is dispatched by its name, any other judge is run by spawning its
+ * entry script.
+ */
+const defaultJudgeRunner: JudgeRunner = async (taskDir, workDir, threadId, judge) =>
+  judge.builtin
+    ? runBuiltinJudge(judge.name, threadId)
+    : runTaskJudge(taskDir, workDir, threadId, judge);
+
+/**
+ * Store `schema` first, then store `data` under the resulting schema hash,
+ * and return the hash of the stored data.
+ */
+async function storeJudgeData(store: Store, schema: JSONSchema, data: unknown): Promise<CasRef> {
+  const schemaHash = await putSchema(store, schema);
+  const dataHash = await store.cas.put(schemaHash, data);
+  return dataHash as CasRef;
+}
+
+/**
+ * Collect phase of an eval run.
+ *
+ * Runs the manifest's judges one at a time in manifest order, storing each
+ * judge's data in CAS as it completes. It then computes the weighted overall
+ * score, stores the eval-run record (timestamped with the current time), and
+ * points `@uwf/eval/<task>/latest` at it.
+ *
+ * @param input - Stores, directories, thread id, manifest and run config.
+ * @param runJudge - Judge execution strategy; defaults to the builtin/script runner.
+ * @returns The run hash, the overall score and a name/score/weight summary per judge.
+ */
+export async function collect(
+  input: CollectInput,
+  runJudge: JudgeRunner = defaultJudgeRunner,
+): Promise<CollectResult> {
+  const { manifest, threadId } = input;
+  const { store, varStore } = input.evalStore;
+
+  const records: EvalJudgeRecord[] = [];
+  for (const judge of manifest.judges) {
+    const { score, schema, data } = await runJudge(input.taskDir, input.workDir, threadId, judge);
+    const dataHash = await storeJudgeData(store, schema, data);
+    const { name, weight } = judge;
+    records.push({ name, score, weight, dataHash });
+    log(LOG_JUDGE, `judge=${name} score=${score} weight=${weight}`);
+  }
+
+  const overall = computeOverall(records);
+
+  const payload: EvalRunPayload = {
+    task: manifest.name,
+    config: input.config,
+    threadId,
+    judges: records,
+    overall,
+    timestamp: Date.now(),
+  };
+
+  const runSchemaHash = await putSchema(store, EVAL_RUN_SCHEMA);
+  const runHash = (await store.cas.put(runSchemaHash, payload)) as string;
+  setEvalLatest(varStore, manifest.name, runHash);
+  log(LOG_STORED, `stored eval-run task=${manifest.name} hash=${runHash} overall=${overall}`);
+
+  // The summary omits each record's data hash.
+  const judges: JudgeSummary[] = records.map(({ name, score, weight }) => ({ name, score, weight }));
+  return { runHash, overall, judges };
+}
@@ -0,0 +1,87 @@
+import { execFileSync } from "node:child_process";
+
+import { createLogger } from "@united-workforce/util";
+
+import type { ExecuteInput, ExecuteResult } from "./types.js";
+
+/** Logger for the execute phase; its sink is stderr. */
+const log = createLogger({ sink: { kind: "stderr" } });
+
+/** Event code logged after `uwf thread start` returns a thread id. */
+const LOG_START = "EX5M2T9V";
+/** Event code logged after `uwf thread exec` returns. */
+const LOG_EXEC = "EX7Q4K2N";
+
+/**
+ * Name or path of the uwf CLI binary to spawn: the `UWF_BIN` environment
+ * variable when it is set to a non-empty value (useful for testing),
+ * otherwise `uwf`.
+ */
+function uwfBin(): string {
+  const { UWF_BIN: override } = process.env;
+  if (override === undefined || override === "") {
+    return "uwf";
+  }
+  return override;
+}
+
+/**
+ * Run a uwf subcommand synchronously in `cwd` and return its trimmed stdout.
+ *
+ * @throws An `Error` of the form `uwf <args...> failed: <detail>`, naming at
+ *   most the first two arguments. The detail is the command's stderr when it
+ *   wrote any; otherwise it is the underlying error message, so spawn
+ *   failures such as a missing binary are not reduced to a bare "failed".
+ */
+function runUwf(args: string[], cwd: string): string {
+  try {
+    return execFileSync(uwfBin(), args, {
+      encoding: "utf8",
+      stdio: ["ignore", "pipe", "pipe"],
+      maxBuffer: 50 * 1024 * 1024,
+      cwd,
+    }).trim();
+  } catch (e) {
+    const err = e as NodeJS.ErrnoException & { stderr?: Buffer | string | null };
+    const stderr =
+      err.stderr == null
+        ? ""
+        : typeof err.stderr === "string"
+          ? err.stderr
+          : err.stderr.toString("utf8");
+    // Prefer what the command itself reported; fall back to the spawn error.
+    const cause = stderr.trim() !== "" ? stderr.trim() : e instanceof Error ? e.message.trim() : "";
+    const detail = cause !== "" ? `: ${cause}` : "";
+    throw new Error(`uwf ${args.slice(0, 2).join(" ")} failed${detail}`);
+  }
+}
+
+/**
+ * Parse the thread ID from `uwf thread start` JSON output (`{ workflow, thread }`).
+ *
+ * @param stdout - Trimmed stdout of `uwf thread start`.
+ * @returns The non-empty `thread` string.
+ * @throws If `stdout` is not valid JSON, or the parsed value is not an object
+ *   with a non-empty string `thread` (this includes JSON `null` and
+ *   primitives, which previously surfaced as a TypeError).
+ */
+function parseThreadId(stdout: string): string {
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(stdout);
+  } catch {
+    throw new Error(`uwf thread start did not emit valid JSON: ${stdout || "(empty)"}`);
+  }
+  // Guard before the property read: `null` and primitives have no `thread`.
+  const thread =
+    typeof parsed === "object" && parsed !== null
+      ? (parsed as Record<string, unknown>).thread
+      : undefined;
+  if (typeof thread !== "string" || thread === "") {
+    throw new Error(`uwf thread start output missing thread id: ${stdout}`);
+  }
+  return thread;
+}
+
+/**
+ * Execute phase of an eval run. Starts a thread for the workflow with
+ * `uwf thread start`, then runs it with `uwf thread exec` for at most
+ * `maxSteps` steps using the given agent. Both commands are spawned through
+ * the uwf CLI with the working directory set to `workDir`.
+ *
+ * @returns The id of the thread that was started and executed.
+ */
+export async function execute(input: ExecuteInput): Promise<ExecuteResult> {
+  const { workDir, workflow, prompt, agent, maxSteps } = input;
+
+  const startArgs = ["thread", "start", workflow, "-p", prompt, "--cwd", workDir];
+  const threadId = parseThreadId(runUwf(startArgs, workDir));
+  log(LOG_START, `thread started thread=${threadId} workflow=${workflow}`);
+
+  const execArgs = ["thread", "exec", threadId, "--agent", agent, "-c", String(maxSteps)];
+  runUwf(execArgs, workDir);
+  log(LOG_EXEC, `thread executed thread=${threadId} maxSteps=${maxSteps}`);
+
+  return { threadId };
+}
+
+/**
+ * Ask the uwf CLI for its version with `uwf -V` and return the trimmed
+ * output. Any failure to run the command yields "unknown" instead of an error.
+ */
+export function getEngineVersion(): string {
+  let version = "unknown";
+  try {
+    version = execFileSync(uwfBin(), ["-V"], {
+      encoding: "utf8",
+      stdio: ["ignore", "pipe", "ignore"],
+    }).trim();
+  } catch {
+    // Best effort: keep the "unknown" fallback.
+  }
+  return version;
+}
@@ -0,0 +1,15 @@
+export { collect, computeOverall } from "./collect.js";
+export { execute, getEngineVersion } from "./execute.js";
+export { prepare } from "./prepare.js";
+export type {
+  CollectInput,
+  CollectResult,
+  ExecuteInput,
+  ExecuteResult,
+  JudgeRunner,
+  JudgeRunOutput,
+  JudgeSummary,
+  PrepareResult,
+  RunOptions,
+  RunResult,
+} from "./types.js";
@@ -0,0 +1,45 @@
+import { access, cp, mkdir, mkdtemp } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+
+import { createLogger } from "@united-workforce/util";
+
+import { loadTaskManifest } from "../task/index.js";
+import type { PrepareResult } from "./types.js";
+
+/** Logger for the prepare phase; its sink is stderr. */
+const log = createLogger({ sink: { kind: "stderr" } });
+
+/** Event code logged after the task manifest has been loaded. */
+const LOG_PREPARE = "PRE4K2NQ";
+/** Event code logged for the fixture step, whether or not a fixture directory was found. */
+const LOG_FIXTURE = "PRE7M3VX";
+
+/** Resolve to true when `access` succeeds for `path`, and to false when it rejects. */
+async function pathExists(path: string): Promise<boolean> {
+  return access(path).then(
+    () => true,
+    () => false,
+  );
+}
+
+/**
+ * Prepare phase of an eval run. Loads the task manifest from `taskDir`,
+ * creates a fresh temp working directory (prefixed `uwf-eval-`), and, when
+ * the task has a `fixture` directory, copies its contents into that working
+ * directory. Without a fixture the working directory is left empty.
+ *
+ * @returns The task dir as given, the new working dir and the parsed manifest.
+ */
+export async function prepare(taskDir: string): Promise<PrepareResult> {
+  const manifest = await loadTaskManifest(taskDir);
+  log(LOG_PREPARE, `loaded task manifest name=${manifest.name} workflow=${manifest.workflow}`);
+
+  const workDir = await mkdtemp(join(tmpdir(), "uwf-eval-"));
+  const prepared: PrepareResult = { taskDir, workDir, manifest };
+
+  const fixtureDir = join(taskDir, "fixture");
+  const hasFixture = await pathExists(fixtureDir);
+  if (!hasFixture) {
+    await mkdir(workDir, { recursive: true });
+    log(LOG_FIXTURE, `no fixture/ found, using empty workDir=${workDir}`);
+    return prepared;
+  }
+
+  await cp(fixtureDir, workDir, { recursive: true });
+  log(LOG_FIXTURE, `copied fixture into workDir=${workDir}`);
+  return prepared;
+}
@@ -0,0 +1,85 @@
+import type { JSONSchema } from "@ocas/core";
+
+import type { EvalRunConfig, EvalStore } from "../storage/index.js";
+import type { JudgeEntry, TaskManifest } from "../task/index.js";
+
+/** Result of the prepare phase: task dir, temp working dir, parsed manifest. */
+export type PrepareResult = {
+  taskDir: string;
+  workDir: string;
+  manifest: TaskManifest;
+};
+
+/** Input to the execute phase. */
+export type ExecuteInput = {
+  /** Working directory the workflow runs in (the prepared temp dir). */
+  workDir: string;
+  /** Workflow name or path (from task.yaml). */
+  workflow: string;
+  /** Initial prompt for the thread. */
+  prompt: string;
+  /** Agent adapter to use. */
+  agent: string;
+  /** Maximum number of steps to execute. */
+  maxSteps: number;
+};
+
+/** Result of the execute phase. */
+export type ExecuteResult = {
+  threadId: string;
+};
+
+/** Output produced by running a single judge. */
+export type JudgeRunOutput = {
+  score: number;
+  data: unknown;
+  /** Schema describing `data`, used when persisting to CAS. */
+  schema: JSONSchema;
+};
+
+/** Pluggable judge execution strategy (injectable for testing). */
+export type JudgeRunner = (
+  taskDir: string,
+  workDir: string,
+  threadId: string,
+  judge: JudgeEntry,
+) => Promise<JudgeRunOutput>;
+
+/** Input to the collect phase. */
+export type CollectInput = {
+  evalStore: EvalStore;
+  taskDir: string;
+  workDir: string;
+  threadId: string;
+  manifest: TaskManifest;
+  config: EvalRunConfig;
+};
+
+/** A single judge's summarized result in the run output. */
+export type JudgeSummary = {
+  name: string;
+  score: number;
+  weight: number;
+};
+
+/** Result of the collect phase. */
+export type CollectResult = {
+  runHash: string;
+  overall: number;
+  judges: JudgeSummary[];
+};
+
+/** Options for a full eval run (from CLI flags). */
+export type RunOptions = {
+  agent: string;
+  model: string;
+  count: number;
+};
+
+/** Final result of a full eval run. */
+export type RunResult = {
+  runHash: string;
+  overall: number;
+  task: string;
+  judges: JudgeSummary[];
+};
@@ -0,0 +1,9 @@
+export {
+  EVAL_JUDGE_FRONTMATTER_SCHEMA,
+  EVAL_JUDGE_HALLUCINATION_SCHEMA,
+  EVAL_JUDGE_TOKEN_STATS_SCHEMA,
+  EVAL_JUDGE_UPSTREAM_SCHEMA,
+  EVAL_RUN_SCHEMA,
+} from "./schemas.js";
+export { createEvalStore, setEvalLatest } from "./store.js";
+export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload, EvalStore } from "./types.js";
@@ -0,0 +1,123 @@
+import type { JSONSchema } from "@ocas/core";
+
+/**
+ * JSON Schema (title `@uwf/eval-run`) for a stored eval run record.
+ * Required fields: `task`, `config` (agent / model / engineVersion),
+ * `threadId`, `judges` (name / score / weight / dataHash per entry),
+ * `overall` and an integer `timestamp`.
+ */
+export const EVAL_RUN_SCHEMA: JSONSchema = {
+  title: "@uwf/eval-run",
+  type: "object",
+  required: ["task", "config", "threadId", "judges", "overall", "timestamp"],
+  properties: {
+    task: { type: "string" },
+    config: {
+      type: "object",
+      required: ["agent", "model", "engineVersion"],
+      properties: {
+        agent: { type: "string" },
+        model: { type: "string" },
+        engineVersion: { type: "string" },
+      },
+    },
+    threadId: { type: "string" },
+    judges: {
+      type: "array",
+      items: {
+        type: "object",
+        required: ["name", "score", "weight", "dataHash"],
+        properties: {
+          name: { type: "string" },
+          score: { type: "number" },
+          weight: { type: "number" },
+          dataHash: { type: "string" },
+        },
+      },
+    },
+    overall: { type: "number" },
+    timestamp: { type: "integer" },
+  },
+};
+
+/**
+ * JSON Schema (title `@uwf/eval-judge-frontmatter`) for the frontmatter judge's data:
+ * step counts (`stepsTotal`, `stepsValid`) plus one `invalidSteps` entry
+ * (`stepIndex`, `role`, `errors`) per step that failed validation.
+ */
+export const EVAL_JUDGE_FRONTMATTER_SCHEMA: JSONSchema = {
+  title: "@uwf/eval-judge-frontmatter",
+  type: "object",
+  required: ["stepsTotal", "stepsValid", "invalidSteps"],
+  properties: {
+    stepsTotal: { type: "integer" },
+    stepsValid: { type: "integer" },
+    invalidSteps: {
+      type: "array",
+      items: {
+        type: "object",
+        required: ["stepIndex", "role", "errors"],
+        properties: {
+          stepIndex: { type: "integer" },
+          role: { type: "string" },
+          errors: { type: "array", items: { type: "string" } },
+        },
+      },
+    },
+  },
+};
+
+/**
+ * JSON Schema (title `@uwf/eval-judge-upstream`) for the upstream judge's data:
+ * a `perStep` list where each entry carries the step `role`, the string lists
+ * `consumed` and `missed`, and a numeric `score`.
+ */
+export const EVAL_JUDGE_UPSTREAM_SCHEMA: JSONSchema = {
+  title: "@uwf/eval-judge-upstream",
+  type: "object",
+  required: ["perStep"],
+  properties: {
+    perStep: {
+      type: "array",
+      items: {
+        type: "object",
+        required: ["role", "consumed", "missed", "score"],
+        properties: {
+          role: { type: "string" },
+          consumed: { type: "array", items: { type: "string" } },
+          missed: { type: "array", items: { type: "string" } },
+          score: { type: "number" },
+        },
+      },
+    },
+  },
+};
+
+/**
+ * JSON Schema (title `@uwf/eval-judge-hallucination`) for the hallucination judge's
+ * data: a `perStep` list where each entry carries the step `role`, a string list of
+ * `hallucinations`, and a numeric `score`.
+ */
+export const EVAL_JUDGE_HALLUCINATION_SCHEMA: JSONSchema = {
+  title: "@uwf/eval-judge-hallucination",
+  type: "object",
+  required: ["perStep"],
+  properties: {
+    perStep: {
+      type: "array",
+      items: {
+        type: "object",
+        required: ["role", "hallucinations", "score"],
+        properties: {
+          role: { type: "string" },
+          hallucinations: { type: "array", items: { type: "string" } },
+          score: { type: "number" },
+        },
+      },
+    },
+  },
+};
+
+/**
+ * JSON Schema (title `@uwf/eval-judge-token-stats`) for the token-stats judge's data:
+ * integer totals (`totalInput`, `totalOutput`, `totalTurns`) plus a `perStep` list
+ * with `role`, integer `inputTokens` / `outputTokens` / `turns`, and a numeric
+ * `duration` (unit not specified by the schema).
+ */
+export const EVAL_JUDGE_TOKEN_STATS_SCHEMA: JSONSchema = {
+  title: "@uwf/eval-judge-token-stats",
+  type: "object",
+  required: ["totalInput", "totalOutput", "totalTurns", "perStep"],
+  properties: {
+    totalInput: { type: "integer" },
+    totalOutput: { type: "integer" },
+    totalTurns: { type: "integer" },
+    perStep: {
+      type: "array",
+      items: {
+        type: "object",
+        required: ["role", "inputTokens", "outputTokens", "turns", "duration"],
+        properties: {
+          role: { type: "string" },
+          inputTokens: { type: "integer" },
+          outputTokens: { type: "integer" },
+          turns: { type: "integer" },
+          duration: { type: "number" },
+        },
+      },
+    },
+  },
+};
@@ -0,0 +1,42 @@
+import { mkdir } from "node:fs/promises";
+import { homedir } from "node:os";
+import { join } from "node:path";
+import type { VarStore } from "@ocas/core";
+import { bootstrap, type Store } from "@ocas/core";
+import { createFsStore, createSqliteVarStore } from "@ocas/fs";
+
+import type { EvalStore } from "./types.js";
+
+/** Variable name prefix for eval run pointers (`@uwf/eval/<task>/latest`). */
+const EVAL_VAR_PREFIX = "@uwf/eval/";
+
+/**
+ * Locate the global CAS directory shared by all uwf and ocas tools.
+ * A non-empty `OCAS_HOME` wins; otherwise the default is `~/.ocas`
+ * (matches uwf CLI's getGlobalCasDir).
+ */
+function getGlobalCasDir(): string {
+  const override = process.env.OCAS_HOME;
+  if (override === undefined || override === "") {
+    return join(homedir(), ".ocas");
+  }
+  return override;
+}
+
+/**
+ * Open the unified OCAS store on the filesystem, creating the global CAS
+ * directory if needed. The SQLite variable store lives at `<casDir>/vars`.
+ * Shares the same CAS + variable backend as the uwf CLI.
+ */
+export async function createEvalStore(): Promise<EvalStore> {
+  const root = getGlobalCasDir();
+  await mkdir(root, { recursive: true });
+
+  const cas = createFsStore(root);
+  const sqlite = createSqliteVarStore(join(root, "vars"), cas);
+  const varStore = sqlite.var;
+
+  const store: Store = { cas, var: varStore, tag: sqlite.tag };
+  bootstrap(store);
+
+  return { store, varStore };
+}
+
+/** Point the `@uwf/eval/<task>/latest` variable at the given run hash. */
+export function setEvalLatest(varStore: VarStore, taskName: string, runHash: string): void {
+  const latestKey = `${EVAL_VAR_PREFIX}${taskName}/latest`;
+  varStore.set(latestKey, runHash);
+}
@@ -0,0 +1,33 @@
+import type { Store, VarStore } from "@ocas/core";
+import type { CasRef } from "@united-workforce/protocol";
+
+/** Handle to the OCAS store used for eval persistence. */
+export type EvalStore = {
+  store: Store;
+  varStore: VarStore;
+};
+
+/** A single judge result within an eval run. */
+export type EvalJudgeRecord = {
+  name: string;
+  score: number;
+  weight: number;
+  dataHash: CasRef;
+};
+
+/** Config snapshot for an eval run. */
+export type EvalRunConfig = {
+  agent: string;
+  model: string;
+  engineVersion: string;
+};
+
+/** Full eval run record stored in CAS. */
+export type EvalRunPayload = {
+  task: string;
+  config: EvalRunConfig;
+  threadId: string;
+  judges: EvalJudgeRecord[];
+  overall: number;
+  timestamp: number;
+};
@@ -0,0 +1,2 @@
+export { loadTaskManifest, parseTaskManifest } from "./loader.js";
+export type { JudgeEntry, TaskLimits, TaskManifest } from "./types.js";
@@ -0,0 +1,74 @@
+import { readFile } from "node:fs/promises";
+import { join } from "node:path";
+import { parse as parseYaml } from "yaml";
+import type { JudgeEntry, TaskLimits, TaskManifest } from "./types.js";
+
+/** Type guard: true only for non-null, non-array values whose `typeof` is "object". */
+function isRecord(value: unknown): value is Record<string, unknown> {
+  if (value === null || Array.isArray(value)) {
+    return false;
+  }
+  return typeof value === "object";
+}
+
+/**
+ * Validate one element of the `judges` list and normalise it into a JudgeEntry.
+ *
+ * - `name` must be a non-empty string.
+ * - `weight` may be omitted (defaults to 0). When present it must be a finite,
+ *   non-negative number: NaN, ±Infinity, negatives and non-numeric values are
+ *   rejected instead of being passed on or silently replaced by 0.
+ * - `builtin` is true only for the literal boolean `true`.
+ * - `entry` / `schema` become `null` unless they are strings; a non-builtin
+ *   judge must provide `entry`.
+ *
+ * @param raw - Untrusted value taken from the parsed YAML.
+ * @param index - Position in the `judges` list, used in error messages.
+ * @throws Error when the entry is malformed.
+ */
+function parseJudgeEntry(raw: unknown, index: number): JudgeEntry {
+  if (!isRecord(raw)) {
+    throw new Error(`judges[${index}]: expected object`);
+  }
+  const name = raw.name;
+  if (typeof name !== "string" || name === "") {
+    throw new Error(`judges[${index}]: name is required`);
+  }
+  // A bare `weight:` key parses to null in YAML; treat it like an omitted key.
+  const rawWeight = raw.weight;
+  let weight = 0;
+  if (rawWeight !== undefined && rawWeight !== null) {
+    if (typeof rawWeight !== "number" || !Number.isFinite(rawWeight) || rawWeight < 0) {
+      throw new Error(`judges[${index}] "${name}": weight must be a finite, non-negative number`);
+    }
+    weight = rawWeight;
+  }
+  const builtin = raw.builtin === true;
+  const entry = typeof raw.entry === "string" ? raw.entry : null;
+  const schema = typeof raw.schema === "string" ? raw.schema : null;
+  if (!builtin && entry === null) {
+    throw new Error(`judges[${index}] "${name}": non-builtin judge must have entry`);
+  }
+  return { name, weight, builtin, entry, schema };
+}
+
+/**
+ * Read the `limits` mapping from the manifest. A missing or non-numeric
+ * `maxSteps` becomes 20 and a missing or non-numeric `timeoutMinutes` becomes 30;
+ * a non-mapping `limits` value yields both defaults.
+ */
+function parseLimits(raw: unknown): TaskLimits {
+  const source: Record<string, unknown> = isRecord(raw) ? raw : {};
+  const { maxSteps, timeoutMinutes } = source;
+  return {
+    maxSteps: typeof maxSteps === "number" ? maxSteps : 20,
+    timeoutMinutes: typeof timeoutMinutes === "number" ? timeoutMinutes : 30,
+  };
+}
+
+/**
+ * Turn the text of a task.yaml into a validated TaskManifest.
+ * Throws when the document is not a mapping, when `name`, `workflow` or
+ * `prompt` is missing or empty, or when `judges` is not a non-empty list.
+ * A non-string `description` becomes the empty string.
+ */
+export function parseTaskManifest(yamlText: string): TaskManifest {
+  const doc = parseYaml(yamlText) as unknown;
+  if (!isRecord(doc)) {
+    throw new Error("task.yaml must be a YAML mapping");
+  }
+
+  const { name, workflow, prompt } = doc;
+  if (typeof name !== "string" || name === "") {
+    throw new Error("task.yaml: name is required");
+  }
+  if (typeof workflow !== "string" || workflow === "") {
+    throw new Error("task.yaml: workflow is required");
+  }
+  if (typeof prompt !== "string" || prompt === "") {
+    throw new Error("task.yaml: prompt is required");
+  }
+
+  const judgeList = doc.judges;
+  if (!Array.isArray(judgeList) || judgeList.length === 0) {
+    throw new Error("task.yaml: at least one judge is required");
+  }
+
+  return {
+    name,
+    description: typeof doc.description === "string" ? doc.description : "",
+    workflow,
+    prompt,
+    limits: parseLimits(doc.limits),
+    judges: judgeList.map((item, position) => parseJudgeEntry(item, position)),
+  };
+}
+
+/** Read `task.yaml` from `taskDir` as UTF-8 and parse it into a TaskManifest. */
+export async function loadTaskManifest(taskDir: string): Promise<TaskManifest> {
+  const manifestText = await readFile(join(taskDir, "task.yaml"), "utf8");
+  return parseTaskManifest(manifestText);
+}
@@ -0,0 +1,28 @@
+/** Judge entry in task.yaml */
+export type JudgeEntry = {
+  name: string;
+  weight: number; // 0 when task.yaml omits it
+  builtin: boolean; // true only when task.yaml sets the literal boolean `true`
+  /** Path to judge entry script (relative to task root). Required for non-builtin judges. */
+  entry: string | null;
+  /**
+   * Path to OCAS schema JSON for judge data. Intended for non-builtin judges, but
+   * the task loader only rejects a missing `entry`, so this can be `null` for them too.
+   */
+  schema: string | null;
+};
+
+/**
+ * Limits for eval execution.
+ * When task.yaml leaves a field unset, the task loader falls back to
+ * `maxSteps: 20` and `timeoutMinutes: 30`.
+ */
+export type TaskLimits = {
+  maxSteps: number;
+  timeoutMinutes: number;
+};
+
+/** Parsed task.yaml manifest. */
+export type TaskManifest = {
+  name: string;
+  description: string;
+  /** Workflow name or relative path to .yaml file. */
+  workflow: string;
+  /** Initial prompt for thread start. */
+  prompt: string;
+  limits: TaskLimits;
+  judges: JudgeEntry[];
+};
@@ -0,0 +1,9 @@
+{
+  "extends": "../../tsconfig.json",
+  "compilerOptions": {
+    "rootDir": "src",
+    "outDir": "dist"
+  },
+  "include": ["src"],
+  "references": [{ "path": "../protocol" }, { "path": "../util" }]
+}
@@ -228,6 +228,31 @@ importers:
        specifier: ^8.0.13
        version: 8.0.16(@types/node@25.9.1)(esbuild@0.27.7)(jiti@2.7.0)(yaml@2.9.0)

+  packages/eval:
+    dependencies:
+      '@ocas/core':
+        specifier: ^0.3.0
+        version: 0.3.0
+      '@ocas/fs':
+        specifier: ^0.3.0
+        version: 0.3.0
+      '@united-workforce/protocol':
+        specifier: workspace:^
+        version: link:../protocol
+      '@united-workforce/util':
+        specifier: workspace:^
+        version: link:../util
+      commander:
+        specifier: ^14.0.3
+        version: 14.0.3
+      yaml:
+        specifier: ^2.9.0
+        version: 2.9.0
+    devDependencies:
+      typescript:
+        specifier: ^5.8.3
+        version: 5.9.3
+
  packages/protocol:
    dependencies:
      '@ocas/core':
@@ -25,6 +25,7 @@
    { "path": "packages/agent-builtin" },
    { "path": "packages/agent-mock" },
    { "path": "packages/agent-claude-code" },
-    { "path": "packages/cli" }
+    { "path": "packages/cli" },
+    { "path": "packages/eval" }
  ]
 }