diff --git a/packages/eval/__tests__/commands.test.ts b/packages/eval/__tests__/commands.test.ts
new file mode 100644
index 0000000..2b498b1
--- /dev/null
+++ b/packages/eval/__tests__/commands.test.ts
@@ -0,0 +1,179 @@
+import { bootstrap, createMemoryStore, putSchema } from "@ocas/core";
+import type { CasRef } from "@united-workforce/protocol";
+import { describe, expect, test } from "vitest";
+
+import {
+  formatDiff,
+  formatList,
+  formatReport,
+  readEvalEntries,
+  readEvalRun,
+  selectEntries,
+} from "../src/commands/index.js";
+import type { EvalRunPayload, EvalStore } from "../src/storage/index.js";
+import { EVAL_RUN_SCHEMA, setEvalLatest } from "../src/storage/index.js";
+
+function makeEvalStore(): EvalStore {
+  const store = createMemoryStore();
+  bootstrap(store);
+  return { store, varStore: store.var };
+}
+
+function makePayload(
+  task: string,
+  overall: number,
+  timestamp: number,
+  judges: EvalRunPayload["judges"] = [
+    {
+      name: "frontmatter-compliance",
+      score: 1.0,
+      weight: 0.6,
+      dataHash: "AAAAAAAAAAAAA" as CasRef,
+    },
+    { name: "token-stats", score: 0.5, weight: 0, dataHash: "BBBBBBBBBBBBB" as CasRef },
+  ],
+  config: EvalRunPayload["config"] = {
+    agent: "hermes",
+    model: "claude-sonnet-4",
+    engineVersion: "1.0.0",
+  },
+): EvalRunPayload {
+  return { task, config, threadId: "THREAD0123456789", judges, overall, timestamp };
+}
+
+/** Store an eval-run node in CAS and index it under @uwf/eval/<task>/latest. */
+function storeRun(evalStore: EvalStore, payload: EvalRunPayload): string {
+  const schemaHash = putSchema(evalStore.store, EVAL_RUN_SCHEMA);
+  const hash = evalStore.store.cas.put(schemaHash, payload);
+  setEvalLatest(evalStore.varStore, payload.task, hash);
+  return hash;
+}
+
+describe("formatReport", () => {
+  test("includes task, overall, config and judges", () => {
+    const payload = makePayload("fix-off-by-one", 0.8, Date.UTC(2026, 0, 2, 3, 4, 5));
+    const output = formatReport(payload, "RUNHASH123456");
+
+    expect(output).toContain("fix-off-by-one");
+    expect(output).toContain("0.8000");
+    expect(output).toContain("hermes");
+    expect(output).toContain("claude-sonnet-4");
+    expect(output).toContain("1.0.0");
+    expect(output).toContain("frontmatter-compliance");
+    expect(output).toContain("token-stats");
+    expect(output).toContain("THREAD0123456789");
+    expect(output).toContain("RUNHASH123456");
+  });
+
+  test("round-trips a stored run via readEvalRun", () => {
+    const evalStore = makeEvalStore();
+    const payload = makePayload("fix-off-by-one", 0.75, Date.now());
+    const hash = storeRun(evalStore, payload);
+
+    const loaded = readEvalRun(evalStore, hash);
+    expect(loaded).not.toBeNull();
+    const output = formatReport(loaded as EvalRunPayload, hash);
+    expect(output).toContain("fix-off-by-one");
+    expect(output).toContain("0.7500");
+  });
+
+  test("readEvalRun returns null for a missing hash", () => {
+    const evalStore = makeEvalStore();
+    expect(readEvalRun(evalStore, "NOPENOPENOPE0")).toBeNull();
+  });
+});
+
+describe("list", () => {
+  test("lists eval runs stored under different tasks", () => {
+    const evalStore = makeEvalStore();
+    storeRun(evalStore, makePayload("fix-off-by-one", 0.8, 2000));
+    storeRun(evalStore, makePayload("write-docs", 0.6, 1000));
+
+    const entries = readEvalEntries(evalStore);
+    expect(entries).toHaveLength(2);
+
+    const output = formatList(selectEntries(entries, null, 20));
+    expect(output).toContain("fix-off-by-one");
+    expect(output).toContain("write-docs");
+  });
+
+  test("sorts newest-first by timestamp", () => {
+    const evalStore = makeEvalStore();
+    storeRun(evalStore, makePayload("old-task", 0.5, 1000));
+    storeRun(evalStore, makePayload("new-task", 0.5, 2000));
+
+    const selected = selectEntries(readEvalEntries(evalStore), null, 20);
+    expect(selected[0]?.task).toBe("new-task");
+    expect(selected[1]?.task).toBe("old-task");
+  });
+
+  test("--task filter only shows the matching task", () => {
+    const evalStore = makeEvalStore();
+    storeRun(evalStore, makePayload("fix-off-by-one", 0.8, 2000));
+    storeRun(evalStore, makePayload("write-docs", 0.6, 1000));
+
+    const output = formatList(selectEntries(readEvalEntries(evalStore), "write-docs", 20));
+    expect(output).toContain("write-docs");
+    expect(output).not.toContain("fix-off-by-one");
+  });
+
+  test("--limit caps the number of rows", () => {
+    const evalStore = makeEvalStore();
+    storeRun(evalStore, makePayload("task-a", 0.8, 3000));
+    storeRun(evalStore, makePayload("task-b", 0.6, 2000));
+    storeRun(evalStore, makePayload("task-c", 0.4, 1000));
+
+    const selected = selectEntries(readEvalEntries(evalStore), null, 2);
+    expect(selected).toHaveLength(2);
+    expect(selected.map((e) => e.task)).toEqual(["task-a", "task-b"]);
+  });
+
+  test("empty store renders a placeholder", () => {
+    const evalStore = makeEvalStore();
+    const output = formatList(selectEntries(readEvalEntries(evalStore), null, 20));
+    expect(output).toContain("(no eval runs found)");
+  });
+});
+
+describe("formatDiff", () => {
+  test("shows an upward delta when B scores higher", () => {
+    const a = makePayload("fix-off-by-one", 0.6, 1000);
+    const b = makePayload("fix-off-by-one", 0.8, 2000);
+    const output = formatDiff(a, "HASHA00000000", b, "HASHB00000000");
+
+    expect(output).toContain("▲");
+    expect(output).toContain("HASHA00000000");
+    expect(output).toContain("HASHB00000000");
+  });
+
+  test("shows a downward delta when B scores lower", () => {
+    const a = makePayload("fix-off-by-one", 0.9, 1000);
+    const b = makePayload("fix-off-by-one", 0.4, 2000);
+    const output = formatDiff(a, "HASHA00000000", b, "HASHB00000000");
+    expect(output).toContain("▼");
+  });
+
+  test("marks differing config values", () => {
+    const a = makePayload("fix-off-by-one", 0.6, 1000, undefined, {
+      agent: "hermes",
+      model: "claude-sonnet-4",
+      engineVersion: "1.0.0",
+    });
+    const b = makePayload("fix-off-by-one", 0.6, 2000, undefined, {
+      agent: "claude-code",
+      model: "claude-sonnet-4",
+      engineVersion: "1.0.0",
+    });
+    const output = formatDiff(a, "HASHA00000000", b, "HASHB00000000");
+    expect(output).toContain("≠");
+    expect(output).toContain("claude-code");
+  });
+
+  test("treats a delta that rounds to zero as unchanged", () => {
+    const a = makePayload("fix-off-by-one", 0.1 + 0.2, 1000);
+    const b = makePayload("fix-off-by-one", 0.3, 2000);
+    const output = formatDiff(a, "HASHA00000000", b, "HASHB00000000");
+    expect(output).not.toContain("▲");
+    expect(output).not.toContain("▼");
+  });
+});
diff --git a/packages/eval/src/commands/diff.ts b/packages/eval/src/commands/diff.ts
index fa443d8..cb292ac 100644
--- a/packages/eval/src/commands/diff.ts
+++ b/packages/eval/src/commands/diff.ts
@@ -1,11 +1,38 @@
+import { createLogger } from "@united-workforce/util";
 import type { Command } from "commander";
 
+import { createEvalStore } from "../storage/index.js";
+import { formatDiff } from "./format.js";
+import { readEvalRun } from "./read.js";
+
+const log = createLogger({ sink: { kind: "stderr" } });
+const LOG_DIFF = "D3WZ8N5T";
+
 export function registerDiffCommand(program: Command): void {
   program
     .command("diff <hash1> <hash2>")
     .description("Compare two eval runs side-by-side")
-    .action(async (_hash1: string, _hash2: string) => {
-      process.stderr.write("uwf-eval diff: not yet implemented\n");
-      process.exitCode = 1;
+    .action(async (hash1: string, hash2: string) => {
+      try {
+        const evalStore = await createEvalStore();
+        const payloadA = readEvalRun(evalStore, hash1);
+        if (payloadA === null) {
+          process.stderr.write(`eval run not found: ${hash1}\n`);
+          process.exitCode = 1;
+          return;
+        }
+        const payloadB = readEvalRun(evalStore, hash2);
+        if (payloadB === null) {
+          process.stderr.write(`eval run not found: ${hash2}\n`);
+          process.exitCode = 1;
+          return;
+        }
+        log(LOG_DIFF, `diff a=${hash1} b=${hash2}`);
+        process.stdout.write(formatDiff(payloadA, hash1, payloadB, hash2));
+      } catch (e) {
+        const message = e instanceof Error ? e.message : String(e);
+        process.stderr.write(`${message}\n`);
+        process.exitCode = 1;
+      }
     });
 }
diff --git a/packages/eval/src/commands/format.ts b/packages/eval/src/commands/format.ts
new file mode 100644
index 0000000..880e733
--- /dev/null
+++ b/packages/eval/src/commands/format.ts
@@ -0,0 +1,155 @@
+import type { EvalRunPayload } from "../storage/index.js";
+import type { EvalListEntry } from "./types.js";
+
+const NAME_WIDTH = 28;
+const SCORE_WIDTH = 10;
+const TIMESTAMP_WIDTH = 26;
+
+/** Format a 0..1 score (or weight) with fixed precision. */
+function formatScore(value: number): string {
+  return value.toFixed(4);
+}
+
+/** Human-readable ISO-8601 timestamp from epoch milliseconds. */
+function formatTimestamp(ms: number): string {
+  return new Date(ms).toISOString();
+}
+
+/** Right-pad to a fixed column width (with a trailing space if already full). */
+function pad(value: string, width: number): string {
+  return value.length >= width ? `${value} ` : value.padEnd(width);
+}
+
+/**
+ * Directional indicator for a score delta (B relative to A).
+ *
+ * The direction is decided on the value as displayed (4 decimals), so
+ * floating-point noise never renders as `▲ +0.0000` or `▼ -0.0000`.
+ */
+function formatDelta(delta: number): string {
+  const shown = formatScore(delta);
+  const rounded = Number(shown);
+  if (rounded > 0) {
+    return `▲ +${shown}`;
+  }
+  if (rounded < 0) {
+    return `▼ ${shown}`;
+  }
+  return `= ${formatScore(0)}`;
+}
+
+/** Render a single eval run as a human-readable report. */
+export function formatReport(payload: EvalRunPayload, runHash: string): string {
+  const lines: string[] = [];
+  lines.push("=== Eval Report ===");
+  lines.push(`Task: ${payload.task}`);
+  lines.push(`Overall: ${formatScore(payload.overall)}`);
+  lines.push(`Timestamp: ${formatTimestamp(payload.timestamp)}`);
+  lines.push("");
+  lines.push("Config:");
+  lines.push(` Agent: ${payload.config.agent}`);
+  lines.push(` Model: ${payload.config.model}`);
+  lines.push(` Engine: ${payload.config.engineVersion}`);
+  lines.push("");
+  lines.push("Judges:");
+  lines.push(` ${pad("NAME", NAME_WIDTH)}${pad("SCORE", SCORE_WIDTH)}WEIGHT`);
+  for (const judge of payload.judges) {
+    lines.push(
+      ` ${pad(judge.name, NAME_WIDTH)}${pad(formatScore(judge.score), SCORE_WIDTH)}${formatScore(judge.weight)}`,
+    );
+  }
+  lines.push("");
+  lines.push(`Thread: ${payload.threadId}`);
+  lines.push(`Run: ${runHash}`);
+  return `${lines.join("\n")}\n`;
+}
+
+/** Render a side-by-side comparison of two eval runs. */
+export function formatDiff(
+  payloadA: EvalRunPayload,
+  hashA: string,
+  payloadB: EvalRunPayload,
+  hashB: string,
+): string {
+  const lines: string[] = [];
+  lines.push("=== Eval Diff ===");
+  lines.push(`A: ${hashA} (${payloadA.task})`);
+  lines.push(`B: ${hashB} (${payloadB.task})`);
+  lines.push("");
+
+  const overallDelta = payloadB.overall - payloadA.overall;
+  lines.push("Overall:");
+  lines.push(
+    ` A=${formatScore(payloadA.overall)} B=${formatScore(payloadB.overall)} ${formatDelta(overallDelta)}`,
+  );
+  lines.push("");
+
+  lines.push("Config:");
+  lines.push(configLine("Agent", payloadA.config.agent, payloadB.config.agent));
+  lines.push(configLine("Model", payloadA.config.model, payloadB.config.model));
+  lines.push(configLine("Engine", payloadA.config.engineVersion, payloadB.config.engineVersion));
+  lines.push("");
+
+  lines.push("Judges:");
+  lines.push(` ${pad("NAME", NAME_WIDTH)}${pad("A", SCORE_WIDTH)}${pad("B", SCORE_WIDTH)}DELTA`);
+  const scoresA = new Map(payloadA.judges.map((judge) => [judge.name, judge.score]));
+  const scoresB = new Map(payloadB.judges.map((judge) => [judge.name, judge.score]));
+  for (const name of unionJudgeNames(payloadA, payloadB)) {
+    const scoreA = scoresA.get(name);
+    const scoreB = scoresB.get(name);
+    const cellA = scoreA === undefined ? "—" : formatScore(scoreA);
+    const cellB = scoreB === undefined ? "—" : formatScore(scoreB);
+    const delta = scoreA !== undefined && scoreB !== undefined ? formatDelta(scoreB - scoreA) : "";
+    lines.push(
+      ` ${pad(name, NAME_WIDTH)}${pad(cellA, SCORE_WIDTH)}${pad(cellB, SCORE_WIDTH)}${delta}`,
+    );
+  }
+  return `${lines.join("\n")}\n`;
+}
+
+/** Render a table of indexed eval runs. */
+export function formatList(entries: ReadonlyArray<EvalListEntry>): string {
+  const lines: string[] = [];
+  lines.push(
+    ` ${pad("TASK", NAME_WIDTH)}${pad("OVERALL", SCORE_WIDTH)}${pad("TIMESTAMP", TIMESTAMP_WIDTH)}HASH`,
+  );
+  if (entries.length === 0) {
+    lines.push(" (no eval runs found)");
+  }
+  for (const entry of entries) {
+    lines.push(
+      ` ${pad(entry.task, NAME_WIDTH)}${pad(formatScore(entry.overall), SCORE_WIDTH)}${pad(formatTimestamp(entry.timestamp), TIMESTAMP_WIDTH)}${entry.hash}`,
+    );
+  }
+  return `${lines.join("\n")}\n`;
+}
+
+/** Sort newest-first, then apply optional task filter and result limit. */
+export function selectEntries(
+  entries: ReadonlyArray<EvalListEntry>,
+  task: string | null,
+  limit: number | null,
+): EvalListEntry[] {
+  const sorted = [...entries].sort((a, b) => b.timestamp - a.timestamp);
+  const filtered = task !== null ? sorted.filter((entry) => entry.task === task) : sorted;
+  return limit !== null ? filtered.slice(0, limit) : filtered;
+}
+
+/** Ordered union of judge names: A's order first, then B-only names. */
+function unionJudgeNames(payloadA: EvalRunPayload, payloadB: EvalRunPayload): string[] {
+  const names: string[] = [];
+  const seen = new Set<string>();
+  for (const judge of [...payloadA.judges, ...payloadB.judges]) {
+    if (!seen.has(judge.name)) {
+      seen.add(judge.name);
+      names.push(judge.name);
+    }
+  }
+  return names;
+}
+
+/** One config row: `=` when equal, `≠` otherwise. */
+function configLine(label: string, valueA: string, valueB: string): string {
+  const marker = valueA === valueB ? "=" : "≠";
+  return ` ${pad(`${label}:`, SCORE_WIDTH)}${marker} A=${valueA} B=${valueB}`;
+}
diff --git a/packages/eval/src/commands/index.ts b/packages/eval/src/commands/index.ts
index 0dded07..cec7ad6 100644
--- a/packages/eval/src/commands/index.ts
+++ b/packages/eval/src/commands/index.ts
@@ -1,4 +1,7 @@
 export { registerDiffCommand } from "./diff.js";
+export { formatDiff, formatList, formatReport, selectEntries } from "./format.js";
 export { registerListCommand } from "./list.js";
+export { readEvalEntries, readEvalRun } from "./read.js";
 export { registerReportCommand } from "./report.js";
 export { registerRunCommand } from "./run.js";
+export type { EvalListEntry } from "./types.js";
diff --git a/packages/eval/src/commands/list.ts b/packages/eval/src/commands/list.ts
index 9c4bd0c..1556f94 100644
--- a/packages/eval/src/commands/list.ts
+++ b/packages/eval/src/commands/list.ts
@@ -1,13 +1,57 @@
+import { createLogger } from "@united-workforce/util";
 import type { Command } from "commander";
 
+import { createEvalStore } from "../storage/index.js";
+import { formatList, selectEntries } from "./format.js";
+import { readEvalEntries } from "./read.js";
+
+const log = createLogger({ sink: { kind: "stderr" } });
+const LOG_LIST = "L5KX9R2B";
+
+type ListCliOptions = {
+  task: string | undefined;
+  limit: string;
+};
+
+/**
+ * Parse `--limit` strictly: plain decimal digits denoting an integer >= 1.
+ * `Number.parseInt` alone would accept trailing garbage ("5abc" -> 5) and
+ * silently truncate fractions ("1.9" -> 1). Returns null when invalid.
+ */
+function parseLimit(raw: string): number | null {
+  const text = raw.trim();
+  if (!/^\d+$/.test(text)) {
+    return null;
+  }
+  const limit = Number(text);
+  return Number.isSafeInteger(limit) && limit >= 1 ? limit : null;
+}
+
 export function registerListCommand(program: Command): void {
   program
     .command("list")
     .description("List past eval runs")
     .option("--task <name>", "filter by task name")
     .option("--limit <n>", "max results", "20")
-    .action(async (_opts: Record<string, unknown>) => {
-      process.stderr.write("uwf-eval list: not yet implemented\n");
-      process.exitCode = 1;
+    .action(async (opts: ListCliOptions) => {
+      const limit = parseLimit(opts.limit);
+      if (limit === null) {
+        process.stderr.write("--limit must be a positive integer\n");
+        process.exitCode = 1;
+        return;
+      }
+
+      try {
+        const evalStore = await createEvalStore();
+        const entries = readEvalEntries(evalStore);
+        const task = opts.task ?? null;
+        const selected = selectEntries(entries, task, limit);
+        log(LOG_LIST, `list task=${task ?? "*"} found=${entries.length} shown=${selected.length}`);
+        process.stdout.write(formatList(selected));
+      } catch (e) {
+        const message = e instanceof Error ? e.message : String(e);
+        process.stderr.write(`${message}\n`);
+        process.exitCode = 1;
+      }
     });
 }
diff --git a/packages/eval/src/commands/read.ts b/packages/eval/src/commands/read.ts
new file mode 100644
index 0000000..a44bbec
--- /dev/null
+++ b/packages/eval/src/commands/read.ts
@@ -0,0 +1,88 @@
+import type { EvalRunPayload, EvalStore } from "../storage/index.js";
+import type { EvalListEntry } from "./types.js";
+
+/** Variable prefix and suffix for eval run pointers (`@uwf/eval/<task>/latest`). */
+const EVAL_VAR_PREFIX = "@uwf/eval/";
+const EVAL_VAR_SUFFIX = "/latest";
+
+/** True for any non-null object, so its properties can be probed safely. */
+function isRecord(value: unknown): value is Record<string, unknown> {
+  return typeof value === "object" && value !== null;
+}
+
+/**
+ * Structural check that a CAS payload has every field the formatters read.
+ * Nodes are stored under a schema hash, so a hash given on the command line
+ * may resolve to a node that is not an eval run at all.
+ */
+function isEvalRunPayload(value: unknown): value is EvalRunPayload {
+  if (!isRecord(value)) {
+    return false;
+  }
+  const { config, judges } = value;
+  if (!isRecord(config) || !Array.isArray(judges)) {
+    return false;
+  }
+  return (
+    typeof value.task === "string" &&
+    typeof value.threadId === "string" &&
+    typeof value.overall === "number" &&
+    typeof value.timestamp === "number" &&
+    typeof config.agent === "string" &&
+    typeof config.model === "string" &&
+    typeof config.engineVersion === "string" &&
+    judges.every(
+      (judge: unknown) =>
+        isRecord(judge) &&
+        typeof judge.name === "string" &&
+        typeof judge.score === "number" &&
+        typeof judge.weight === "number",
+    )
+  );
+}
+
+/**
+ * Read a single eval-run payload from CAS. Returns null when the node is
+ * absent or its payload is not shaped like an eval run.
+ */
+export function readEvalRun(evalStore: EvalStore, hash: string): EvalRunPayload | null {
+  const node = evalStore.store.cas.get(hash);
+  if (node === null) {
+    return null;
+  }
+  const payload: unknown = node.payload;
+  return isEvalRunPayload(payload) ? payload : null;
+}
+
+/**
+ * Read every indexed eval run by scanning `@uwf/eval/*\/latest` variables and
+ * loading the referenced CAS node. Dangling pointers and nodes that are not
+ * eval runs are skipped.
+ */
+export function readEvalEntries(evalStore: EvalStore): EvalListEntry[] {
+  const minNameLength = EVAL_VAR_PREFIX.length + EVAL_VAR_SUFFIX.length;
+  const entries: EvalListEntry[] = [];
+  for (const variable of evalStore.varStore.list()) {
+    const { name } = variable;
+    // The length guard rejects `@uwf/eval/latest`, where prefix and suffix
+    // would otherwise both match by sharing the same `/`.
+    if (
+      name.length < minNameLength ||
+      !name.startsWith(EVAL_VAR_PREFIX) ||
+      !name.endsWith(EVAL_VAR_SUFFIX)
+    ) {
+      continue;
+    }
+    const payload = readEvalRun(evalStore, variable.value);
+    if (payload === null) {
+      continue;
+    }
+    entries.push({
+      task: payload.task,
+      overall: payload.overall,
+      timestamp: payload.timestamp,
+      hash: variable.value,
+    });
+  }
+  return entries;
+}
diff --git a/packages/eval/src/commands/report.ts b/packages/eval/src/commands/report.ts
index db3e8d6..32c6551 100644
--- a/packages/eval/src/commands/report.ts
+++ b/packages/eval/src/commands/report.ts
@@ -1,11 +1,32 @@
+import { createLogger } from "@united-workforce/util";
 import type { Command } from "commander";
 
+import { createEvalStore } from "../storage/index.js";
+import { formatReport } from "./format.js";
+import { readEvalRun } from "./read.js";
+
+const log = createLogger({ sink: { kind: "stderr" } });
+const LOG_REPORT = "R7QP2M4K";
+
 export function registerReportCommand(program: Command): void {
   program
     .command("report <hash>")
     .description("Show eval run results")
-    .action(async (_hash: string) => {
-      process.stderr.write("uwf-eval report: not yet implemented\n");
-      process.exitCode = 1;
+    .action(async (hash: string) => {
+      try {
+        const evalStore = await createEvalStore();
+        const payload = readEvalRun(evalStore, hash);
+        if (payload === null) {
+          process.stderr.write(`eval run not found: ${hash}\n`);
+          process.exitCode = 1;
+          return;
+        }
+        log(LOG_REPORT, `report task=${payload.task} hash=${hash}`);
+        process.stdout.write(formatReport(payload, hash));
+      } catch (e) {
+        const message = e instanceof Error ? e.message : String(e);
+        process.stderr.write(`${message}\n`);
+        process.exitCode = 1;
+      }
     });
 }
diff --git a/packages/eval/src/commands/types.ts b/packages/eval/src/commands/types.ts
new file mode 100644
index 0000000..2603bcc
--- /dev/null
+++ b/packages/eval/src/commands/types.ts
@@ -0,0 +1,9 @@
+import type { CasRef } from "@united-workforce/protocol";
+
+/** Summary row for the `list` command: one indexed eval run. */
+export type EvalListEntry = {
+  task: string;
+  overall: number;
+  timestamp: number;
+  hash: CasRef;
+};