Implement the three read commands for the eval framework:
- report: read an eval-run from CAS and render formatted text (task, overall, config, judges table, thread ID)
- diff: side-by-side comparison with ▲/▼ delta indicators and config change markers
- list: scan @uwf/eval/*/latest variables, sort by timestamp descending, with a --task filter and a --limit cap on rows

Architecture: pure formatting functions (format.ts) + data access (read.ts) + thin CLI handlers. Types live in types.ts.

11 new tests (formatReport, formatDiff, formatList, selectEntries).

Refs #72
This commit is contained in:
@@ -0,0 +1,171 @@
|
|||||||
|
import { bootstrap, createMemoryStore, putSchema } from "@ocas/core";
|
||||||
|
import type { CasRef } from "@united-workforce/protocol";
|
||||||
|
import { describe, expect, test } from "vitest";
|
||||||
|
|
||||||
|
import {
|
||||||
|
formatDiff,
|
||||||
|
formatList,
|
||||||
|
formatReport,
|
||||||
|
readEvalEntries,
|
||||||
|
readEvalRun,
|
||||||
|
selectEntries,
|
||||||
|
} from "../src/commands/index.js";
|
||||||
|
import type { EvalRunPayload, EvalStore } from "../src/storage/index.js";
|
||||||
|
import { EVAL_RUN_SCHEMA, setEvalLatest } from "../src/storage/index.js";
|
||||||
|
|
||||||
|
/** Build a bootstrapped in-memory store and expose it in the EvalStore shape. */
function makeEvalStore(): EvalStore {
  const memory = createMemoryStore();
  bootstrap(memory);
  const evalStore: EvalStore = { store: memory, varStore: memory.var };
  return evalStore;
}
|
||||||
|
|
||||||
|
/**
 * Build an eval-run payload fixture.
 *
 * `judges` and `config` fall back to fixed two-judge / hermes defaults when
 * omitted (or passed as `undefined`); the thread ID is always the same constant.
 */
function makePayload(
  task: string,
  overall: number,
  timestamp: number,
  judges: EvalRunPayload["judges"] = [
    {
      name: "frontmatter-compliance",
      score: 1.0,
      weight: 0.6,
      dataHash: "AAAAAAAAAAAAA" as CasRef,
    },
    { name: "token-stats", score: 0.5, weight: 0, dataHash: "BBBBBBBBBBBBB" as CasRef },
  ],
  config: EvalRunPayload["config"] = {
    agent: "hermes",
    model: "claude-sonnet-4",
    engineVersion: "1.0.0",
  },
): EvalRunPayload {
  const fixture: EvalRunPayload = {
    task,
    config,
    threadId: "THREAD0123456789",
    judges,
    overall,
    timestamp,
  };
  return fixture;
}
|
||||||
|
|
||||||
|
/**
 * Persist `payload` as an eval-run CAS node, point the task's latest-run
 * variable at it via `setEvalLatest`, and return the node hash.
 */
function storeRun(evalStore: EvalStore, payload: EvalRunPayload): string {
  const { store, varStore } = evalStore;
  const runHash = store.cas.put(putSchema(store, EVAL_RUN_SCHEMA), payload);
  setEvalLatest(varStore, payload.task, runHash);
  return runHash;
}
|
||||||
|
|
||||||
|
// Report rendering plus the CAS read path that feeds it.
describe("formatReport", () => {
  test("includes task, overall, config and judges", () => {
    const when = Date.UTC(2026, 0, 2, 3, 4, 5);
    const rendered = formatReport(makePayload("fix-off-by-one", 0.8, when), "RUNHASH123456");

    // Every one of these fragments must appear somewhere in the report.
    const fragments = [
      "fix-off-by-one",
      "0.8000",
      "hermes",
      "claude-sonnet-4",
      "1.0.0",
      "frontmatter-compliance",
      "token-stats",
      "THREAD0123456789",
      "RUNHASH123456",
    ];
    for (const fragment of fragments) {
      expect(rendered).toContain(fragment);
    }
  });

  test("round-trips a stored run via readEvalRun", () => {
    const evalStore = makeEvalStore();
    const hash = storeRun(evalStore, makePayload("fix-off-by-one", 0.75, Date.now()));

    const loaded = readEvalRun(evalStore, hash);
    expect(loaded).not.toBeNull();

    const rendered = formatReport(loaded as EvalRunPayload, hash);
    expect(rendered).toContain("fix-off-by-one");
    expect(rendered).toContain("0.7500");
  });

  test("readEvalRun returns null for a missing hash", () => {
    const missing = readEvalRun(makeEvalStore(), "NOPENOPENOPE0");
    expect(missing).toBeNull();
  });
});
|
||||||
|
|
||||||
|
// Listing: variable scan, ordering, task filter, row limit and empty output.
describe("list", () => {
  test("lists eval runs stored under different tasks", () => {
    const evalStore = makeEvalStore();
    storeRun(evalStore, makePayload("fix-off-by-one", 0.8, 2000));
    storeRun(evalStore, makePayload("write-docs", 0.6, 1000));

    const entries = readEvalEntries(evalStore);
    expect(entries).toHaveLength(2);

    const rendered = formatList(selectEntries(entries, null, 20));
    for (const task of ["fix-off-by-one", "write-docs"]) {
      expect(rendered).toContain(task);
    }
  });

  test("sorts newest-first by timestamp", () => {
    const evalStore = makeEvalStore();
    storeRun(evalStore, makePayload("old-task", 0.5, 1000));
    storeRun(evalStore, makePayload("new-task", 0.5, 2000));

    const [first, second] = selectEntries(readEvalEntries(evalStore), null, 20);
    expect(first?.task).toBe("new-task");
    expect(second?.task).toBe("old-task");
  });

  test("--task filter only shows the matching task", () => {
    const evalStore = makeEvalStore();
    storeRun(evalStore, makePayload("fix-off-by-one", 0.8, 2000));
    storeRun(evalStore, makePayload("write-docs", 0.6, 1000));

    const matching = selectEntries(readEvalEntries(evalStore), "write-docs", 20);
    const rendered = formatList(matching);
    expect(rendered).toContain("write-docs");
    expect(rendered).not.toContain("fix-off-by-one");
  });

  test("--limit caps the number of rows", () => {
    const evalStore = makeEvalStore();
    storeRun(evalStore, makePayload("task-a", 0.8, 3000));
    storeRun(evalStore, makePayload("task-b", 0.6, 2000));
    storeRun(evalStore, makePayload("task-c", 0.4, 1000));

    const capped = selectEntries(readEvalEntries(evalStore), null, 2);
    expect(capped).toHaveLength(2);
    expect(capped.map((entry) => entry.task)).toEqual(["task-a", "task-b"]);
  });

  test("empty store renders a placeholder", () => {
    const nothing = selectEntries(readEvalEntries(makeEvalStore()), null, 20);
    expect(formatList(nothing)).toContain("(no eval runs found)");
  });
});
|
||||||
|
|
||||||
|
// Diff rendering: delta direction markers and config-change markers.
describe("formatDiff", () => {
  const HASH_A = "HASHA00000000";
  const HASH_B = "HASHB00000000";

  test("shows an upward delta when B scores higher", () => {
    const rendered = formatDiff(
      makePayload("fix-off-by-one", 0.6, 1000),
      HASH_A,
      makePayload("fix-off-by-one", 0.8, 2000),
      HASH_B,
    );

    expect(rendered).toContain("▲");
    expect(rendered).toContain("HASHA00000000");
    expect(rendered).toContain("HASHB00000000");
  });

  test("shows a downward delta when B scores lower", () => {
    const rendered = formatDiff(
      makePayload("fix-off-by-one", 0.9, 1000),
      HASH_A,
      makePayload("fix-off-by-one", 0.4, 2000),
      HASH_B,
    );
    expect(rendered).toContain("▼");
  });

  test("marks differing config values", () => {
    // Same scores on both sides; only the agent differs.
    const before = makePayload("fix-off-by-one", 0.6, 1000, undefined, {
      agent: "hermes",
      model: "claude-sonnet-4",
      engineVersion: "1.0.0",
    });
    const after = makePayload("fix-off-by-one", 0.6, 2000, undefined, {
      agent: "claude-code",
      model: "claude-sonnet-4",
      engineVersion: "1.0.0",
    });

    const rendered = formatDiff(before, HASH_A, after, HASH_B);
    expect(rendered).toContain("≠");
    expect(rendered).toContain("claude-code");
  });
});
|
||||||
@@ -1,11 +1,38 @@
|
|||||||
|
import { createLogger } from "@united-workforce/util";
|
||||||
import type { Command } from "commander";
|
import type { Command } from "commander";
|
||||||
|
|
||||||
|
import { createEvalStore } from "../storage/index.js";
|
||||||
|
import { formatDiff } from "./format.js";
|
||||||
|
import { readEvalRun } from "./read.js";
|
||||||
|
|
||||||
|
const log = createLogger({ sink: { kind: "stderr" } });
|
||||||
|
const LOG_DIFF = "D3WZ8N5T";
|
||||||
|
|
||||||
/**
 * Register the `diff <hash1> <hash2>` subcommand on `program`.
 *
 * The handler loads both eval runs and writes the formatted comparison to
 * stdout. A missing run, or any thrown error, is reported on stderr and sets
 * `process.exitCode` to 1.
 */
export function registerDiffCommand(program: Command): void {
  // Shared failure path: one line on stderr plus a non-zero exit code.
  const fail = (message: string): void => {
    process.stderr.write(`${message}\n`);
    process.exitCode = 1;
  };

  program
    .command("diff <hash1> <hash2>")
    .description("Compare two eval runs side-by-side")
    .action(async (hash1: string, hash2: string) => {
      try {
        const evalStore = await createEvalStore();

        const payloadA = readEvalRun(evalStore, hash1);
        if (payloadA === null) {
          fail(`eval run not found: ${hash1}`);
          return;
        }

        // The second run is only looked up once the first one is known to exist.
        const payloadB = readEvalRun(evalStore, hash2);
        if (payloadB === null) {
          fail(`eval run not found: ${hash2}`);
          return;
        }

        log(LOG_DIFF, `diff a=${hash1} b=${hash2}`);
        process.stdout.write(formatDiff(payloadA, hash1, payloadB, hash2));
      } catch (e) {
        fail(e instanceof Error ? e.message : String(e));
      }
    });
}
|
||||||
|
|||||||
@@ -0,0 +1,148 @@
|
|||||||
|
import type { EvalRunPayload } from "../storage/index.js";
|
||||||
|
import type { EvalListEntry } from "./types.js";
|
||||||
|
|
||||||
|
/** Column width, in characters, for judge/task names. */
const NAME_WIDTH = 28;
/** Column width, in characters, for score cells. */
const SCORE_WIDTH = 10;
/** Column width, in characters, for timestamps. */
const TIMESTAMP_WIDTH = 26;
|
||||||
|
|
||||||
|
/** Render a score or weight with exactly four decimal places (0.5 -> "0.5000"). */
function formatScore(value: number): string {
  const fixed = value.toFixed(4);
  return fixed;
}
|
||||||
|
|
||||||
|
/** Convert epoch milliseconds into an ISO-8601 UTC string via `Date#toISOString`. */
function formatTimestamp(ms: number): string {
  const instant = new Date(ms);
  return instant.toISOString();
}
|
||||||
|
|
||||||
|
/**
 * Pad `value` on the right with spaces up to `width` characters.
 * A value that already fills (or overflows) the column gets a single trailing
 * space instead, so adjacent cells never touch.
 */
function pad(value: string, width: number): string {
  if (value.length < width) {
    return value.padEnd(width);
  }
  return `${value} `;
}
|
||||||
|
|
||||||
|
/**
 * Directional indicator for a score delta (B relative to A).
 *
 * The direction is decided on the value as displayed, not on the raw float:
 * a delta that rounds to zero at the printed precision (floating-point noise,
 * or a genuinely sub-precision change) renders as `= 0.0000` rather than the
 * contradictory `▲ +0.0000` / `▼ -0.0000`. A NaN delta also renders as `=`.
 *
 * @param delta - Score of B minus score of A.
 * @returns `▲ +x`, `▼ -x` or `= 0.0000`, with `x` formatted by `formatScore`.
 */
function formatDelta(delta: number): string {
  const zero = formatScore(0);
  // Format the magnitude once; the sign is re-attached explicitly below.
  const magnitude = formatScore(Math.abs(delta));
  if (Number.isNaN(delta) || magnitude === zero) {
    return `= ${zero}`;
  }
  return delta > 0 ? `▲ +${magnitude}` : `▼ -${magnitude}`;
}
|
||||||
|
|
||||||
|
/**
 * Build the text report for one eval run: header fields, config, a judges
 * table (name, score, weight), then the thread ID and run hash.
 * The result always ends with a newline.
 */
export function formatReport(payload: EvalRunPayload, runHash: string): string {
  const { config, judges } = payload;

  const judgeRows = judges.map(
    (judge) =>
      ` ${pad(judge.name, NAME_WIDTH)}${pad(formatScore(judge.score), SCORE_WIDTH)}${formatScore(judge.weight)}`,
  );

  const report = [
    "=== Eval Report ===",
    `Task: ${payload.task}`,
    `Overall: ${formatScore(payload.overall)}`,
    `Timestamp: ${formatTimestamp(payload.timestamp)}`,
    "",
    "Config:",
    ` Agent: ${config.agent}`,
    ` Model: ${config.model}`,
    ` Engine: ${config.engineVersion}`,
    "",
    "Judges:",
    ` ${pad("NAME", NAME_WIDTH)}${pad("SCORE", SCORE_WIDTH)}WEIGHT`,
    ...judgeRows,
    "",
    `Thread: ${payload.threadId}`,
    `Run: ${runHash}`,
  ];
  return `${report.join("\n")}\n`;
}
|
||||||
|
|
||||||
|
/**
 * Build the text comparison of run A against run B: both hashes and tasks,
 * the overall scores with their delta, one marked row per config field, and a
 * per-judge table over the union of judge names.
 *
 * A judge present in only one run shows `—` on the other side and no delta.
 * The result always ends with a newline.
 */
export function formatDiff(
  payloadA: EvalRunPayload,
  hashA: string,
  payloadB: EvalRunPayload,
  hashB: string,
): string {
  // Index scores by judge name; a repeated name keeps its last score.
  const byNameA = new Map<string, number>();
  for (const judge of payloadA.judges) {
    byNameA.set(judge.name, judge.score);
  }
  const byNameB = new Map<string, number>();
  for (const judge of payloadB.judges) {
    byNameB.set(judge.name, judge.score);
  }

  const cell = (score: number | undefined): string =>
    score === undefined ? "—" : formatScore(score);

  const judgeRows = unionJudgeNames(payloadA, payloadB).map((name) => {
    const left = byNameA.get(name);
    const right = byNameB.get(name);
    const delta = left === undefined || right === undefined ? "" : formatDelta(right - left);
    return ` ${pad(name, NAME_WIDTH)}${pad(cell(left), SCORE_WIDTH)}${pad(cell(right), SCORE_WIDTH)}${delta}`;
  });

  const { config: configA } = payloadA;
  const { config: configB } = payloadB;

  const out = [
    "=== Eval Diff ===",
    `A: ${hashA} (${payloadA.task})`,
    `B: ${hashB} (${payloadB.task})`,
    "",
    "Overall:",
    ` A=${formatScore(payloadA.overall)} B=${formatScore(payloadB.overall)} ${formatDelta(payloadB.overall - payloadA.overall)}`,
    "",
    "Config:",
    configLine("Agent", configA.agent, configB.agent),
    configLine("Model", configA.model, configB.model),
    configLine("Engine", configA.engineVersion, configB.engineVersion),
    "",
    "Judges:",
    ` ${pad("NAME", NAME_WIDTH)}${pad("A", SCORE_WIDTH)}${pad("B", SCORE_WIDTH)}DELTA`,
    ...judgeRows,
  ];
  return `${out.join("\n")}\n`;
}
|
||||||
|
|
||||||
|
/**
 * Build the text table for `list`: a header row, then one row per entry
 * (task, overall score, ISO timestamp, hash). An empty input yields the header
 * plus a placeholder line. The result always ends with a newline.
 */
export function formatList(entries: ReadonlyArray<EvalListEntry>): string {
  const header = ` ${pad("TASK", NAME_WIDTH)}${pad("OVERALL", SCORE_WIDTH)}${pad("TIMESTAMP", TIMESTAMP_WIDTH)}HASH`;

  const body =
    entries.length === 0
      ? [" (no eval runs found)"]
      : entries.map(
          (entry) =>
            ` ${pad(entry.task, NAME_WIDTH)}${pad(formatScore(entry.overall), SCORE_WIDTH)}${pad(formatTimestamp(entry.timestamp), TIMESTAMP_WIDTH)}${entry.hash}`,
        );

  return `${[header, ...body].join("\n")}\n`;
}
|
||||||
|
|
||||||
|
/**
 * Pick the rows to display: optional task filter, newest-first ordering, then
 * an optional cap on the row count.
 *
 * The input array is never mutated. Filtering runs before sorting so only
 * matching rows are sorted; the sort is stable, so the result is the same as
 * sorting first.
 *
 * @param entries - Candidate rows, in any order.
 * @param task - Exact task name to keep, or `null` to keep every task.
 * @param limit - Maximum number of rows, or `null` for no cap. A negative
 *   value is treated as 0 (no rows) rather than being handed to `slice`, where
 *   a negative end index would silently drop rows from the tail instead.
 * @returns A new array of the selected rows, newest first.
 */
export function selectEntries(
  entries: ReadonlyArray<EvalListEntry>,
  task: string | null,
  limit: number | null,
): EvalListEntry[] {
  const matching = task === null ? [...entries] : entries.filter((entry) => entry.task === task);
  matching.sort((a, b) => b.timestamp - a.timestamp);
  if (limit === null) {
    return matching;
  }
  return matching.slice(0, Math.max(0, limit));
}
|
||||||
|
|
||||||
|
/**
 * Distinct judge names across both runs, in first-seen order: every name from
 * A in A's order, followed by the names that appear only in B.
 */
function unionJudgeNames(payloadA: EvalRunPayload, payloadB: EvalRunPayload): string[] {
  const everyName = [...payloadA.judges, ...payloadB.judges].map((judge) => judge.name);
  // A Set iterates in insertion order, so spreading it de-duplicates while
  // keeping the position of each name's first occurrence.
  return [...new Set(everyName)];
}
|
||||||
|
|
||||||
|
/**
 * Format one config comparison row: the padded label, then `=` when both sides
 * hold the same value or `≠` when they differ, then both values.
 */
function configLine(label: string, valueA: string, valueB: string): string {
  const paddedLabel = pad(`${label}:`, SCORE_WIDTH);
  const marker = valueA !== valueB ? "≠" : "=";
  return ` ${paddedLabel}${marker} A=${valueA} B=${valueB}`;
}
|
||||||
@@ -1,4 +1,7 @@
|
|||||||
export { registerDiffCommand } from "./diff.js";
|
export { registerDiffCommand } from "./diff.js";
|
||||||
|
export { formatDiff, formatList, formatReport, selectEntries } from "./format.js";
|
||||||
export { registerListCommand } from "./list.js";
|
export { registerListCommand } from "./list.js";
|
||||||
|
export { readEvalEntries, readEvalRun } from "./read.js";
|
||||||
export { registerReportCommand } from "./report.js";
|
export { registerReportCommand } from "./report.js";
|
||||||
export { registerRunCommand } from "./run.js";
|
export { registerRunCommand } from "./run.js";
|
||||||
|
export type { EvalListEntry } from "./types.js";
|
||||||
|
|||||||
@@ -1,13 +1,43 @@
|
|||||||
|
import { createLogger } from "@united-workforce/util";
|
||||||
import type { Command } from "commander";
|
import type { Command } from "commander";
|
||||||
|
|
||||||
|
import { createEvalStore } from "../storage/index.js";
|
||||||
|
import { formatList, selectEntries } from "./format.js";
|
||||||
|
import { readEvalEntries } from "./read.js";
|
||||||
|
|
||||||
|
const log = createLogger({ sink: { kind: "stderr" } });
|
||||||
|
const LOG_LIST = "L5KX9R2B";
|
||||||
|
|
||||||
|
/** Options object the `list` action receives from the CLI parser. */
type ListCliOptions = {
  /** Value of `--task`, or `undefined` when the flag is not given. */
  task: string | undefined;
  /** Raw, still-unparsed `--limit` value; the option is registered with default "20". */
  limit: string;
};
|
||||||
|
|
||||||
/**
 * Register the `list` subcommand on `program`.
 *
 * The handler validates `--limit`, reads every indexed eval run, applies the
 * optional `--task` filter and the limit, and writes the table to stdout.
 * An invalid limit, or any thrown error, is reported on stderr and sets
 * `process.exitCode` to 1.
 */
export function registerListCommand(program: Command): void {
  program
    .command("list")
    .description("List past eval runs")
    .option("--task <name>", "filter by task name")
    .option("--limit <n>", "max results", "20")
    .action(async (opts: ListCliOptions) => {
      // Accept plain decimal digits only. Number.parseInt would quietly take
      // the numeric prefix of inputs such as "10abc", "2.5" or "1e3".
      const rawLimit = opts.limit.trim();
      const limit = /^\d+$/.test(rawLimit) ? Number(rawLimit) : Number.NaN;
      if (!Number.isSafeInteger(limit) || limit < 1) {
        process.stderr.write("--limit must be a positive integer\n");
        process.exitCode = 1;
        return;
      }

      try {
        const evalStore = await createEvalStore();
        const entries = readEvalEntries(evalStore);
        const task = opts.task ?? null;
        const selected = selectEntries(entries, task, limit);
        log(LOG_LIST, `list task=${task ?? "*"} found=${entries.length} shown=${selected.length}`);
        process.stdout.write(formatList(selected));
      } catch (e) {
        const message = e instanceof Error ? e.message : String(e);
        process.stderr.write(`${message}\n`);
        process.exitCode = 1;
      }
    });
}
|
||||||
|
|||||||
@@ -0,0 +1,41 @@
|
|||||||
|
import type { EvalRunPayload, EvalStore } from "../storage/index.js";
|
||||||
|
import type { EvalListEntry } from "./types.js";
|
||||||
|
|
||||||
|
/** Leading part of an eval run pointer variable name (`@uwf/eval/<task>/latest`). */
const EVAL_VAR_PREFIX = "@uwf/eval/";
/** Trailing part of an eval run pointer variable name (`@uwf/eval/<task>/latest`). */
const EVAL_VAR_SUFFIX = "/latest";
|
||||||
|
|
||||||
|
/**
 * Look up `hash` in CAS and return the node's payload as an eval run, or
 * `null` when CAS has no node for that hash.
 *
 * NOTE(review): the payload is cast without any shape or schema check, so a
 * hash that resolves to some other kind of node is returned as-is — confirm
 * whether callers need validation here.
 */
export function readEvalRun(evalStore: EvalStore, hash: string): EvalRunPayload | null {
  const node = evalStore.store.cas.get(hash);
  return node === null ? null : (node.payload as EvalRunPayload);
}
|
||||||
|
|
||||||
|
/**
 * Collect a summary row for every indexed eval run.
 *
 * Walks all variables, keeps those named `@uwf/eval/*\/latest`, and loads the
 * CAS node each one points at. Pointers whose node is missing are skipped.
 * Task, overall and timestamp come from the node payload; the hash is the
 * variable's value.
 */
export function readEvalEntries(evalStore: EvalStore): EvalListEntry[] {
  const { store, varStore } = evalStore;
  const rows: EvalListEntry[] = [];

  for (const { name, value } of varStore.list()) {
    const isEvalPointer = name.startsWith(EVAL_VAR_PREFIX) && name.endsWith(EVAL_VAR_SUFFIX);
    if (!isEvalPointer) {
      continue;
    }

    const node = store.cas.get(value);
    if (node === null) {
      // Dangling pointer: the variable exists but its node does not.
      continue;
    }

    const { task, overall, timestamp } = node.payload as EvalRunPayload;
    rows.push({ task, overall, timestamp, hash: value });
  }

  return rows;
}
|
||||||
@@ -1,11 +1,32 @@
|
|||||||
|
import { createLogger } from "@united-workforce/util";
|
||||||
import type { Command } from "commander";
|
import type { Command } from "commander";
|
||||||
|
|
||||||
|
import { createEvalStore } from "../storage/index.js";
|
||||||
|
import { formatReport } from "./format.js";
|
||||||
|
import { readEvalRun } from "./read.js";
|
||||||
|
|
||||||
|
const log = createLogger({ sink: { kind: "stderr" } });
|
||||||
|
const LOG_REPORT = "R7QP2M4K";
|
||||||
|
|
||||||
/**
 * Register the `report <hash>` subcommand on `program`.
 *
 * The handler loads the eval run and writes the formatted report to stdout.
 * A missing run, or any thrown error, is reported on stderr and sets
 * `process.exitCode` to 1.
 */
export function registerReportCommand(program: Command): void {
  // Shared failure path: one line on stderr plus a non-zero exit code.
  const fail = (message: string): void => {
    process.stderr.write(`${message}\n`);
    process.exitCode = 1;
  };

  program
    .command("report <hash>")
    .description("Show eval run results")
    .action(async (hash: string) => {
      try {
        const evalStore = await createEvalStore();
        const payload = readEvalRun(evalStore, hash);
        if (payload === null) {
          fail(`eval run not found: ${hash}`);
          return;
        }
        log(LOG_REPORT, `report task=${payload.task} hash=${hash}`);
        process.stdout.write(formatReport(payload, hash));
      } catch (e) {
        fail(e instanceof Error ? e.message : String(e));
      }
    });
}
|
||||||
|
|||||||
@@ -0,0 +1,9 @@
|
|||||||
|
import type { CasRef } from "@united-workforce/protocol";
|
||||||
|
|
||||||
|
/** One row of `list` output: the summary of a single indexed eval run. */
export type EvalListEntry = {
  /** Task name of the run. */
  task: string;
  /** Overall score of the run. */
  overall: number;
  /** Run timestamp (rendered by `list` as epoch milliseconds). */
  timestamp: number;
  /** CAS reference of the eval-run node. */
  hash: CasRef;
};
|
||||||
Reference in New Issue
Block a user