From 99619d85db2e829217037a66ac04d3ac0fc0632a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=8F=E6=A9=98?= Date: Thu, 4 Jun 2026 23:42:16 +0000 Subject: [PATCH 1/4] feat: eval package scaffold with CLI, schemas, types, task loader New package @united-workforce/eval (uwf-eval CLI): - CLI skeleton: run/report/diff/list subcommands (stubs) - 5 OCAS schemas: eval-run, judge-frontmatter, judge-upstream, judge-hallucination, judge-token-stats - TaskManifest type + parser/validator for task.yaml - JudgeOutput/JudgeInput types for judge contract - EvalRunPayload/EvalRunConfig/EvalJudgeRecord storage types - 19 unit tests: task loader validation + schema definitions Refs #69 --- packages/eval/__tests__/schemas.test.ts | 63 ++++++++ packages/eval/__tests__/task-loader.test.ts | 163 ++++++++++++++++++++ packages/eval/package.json | 46 ++++++ packages/eval/src/cli.ts | 22 +++ packages/eval/src/commands/diff.ts | 11 ++ packages/eval/src/commands/index.ts | 4 + packages/eval/src/commands/list.ts | 13 ++ packages/eval/src/commands/report.ts | 11 ++ packages/eval/src/commands/run.ts | 14 ++ packages/eval/src/index.ts | 15 ++ packages/eval/src/judge/index.ts | 1 + packages/eval/src/judge/types.ts | 15 ++ packages/eval/src/storage/index.ts | 8 + packages/eval/src/storage/schemas.ts | 123 +++++++++++++++ packages/eval/src/storage/types.ts | 26 ++++ packages/eval/src/task/index.ts | 2 + packages/eval/src/task/loader.ts | 74 +++++++++ packages/eval/src/task/types.ts | 28 ++++ packages/eval/tsconfig.json | 9 ++ pnpm-lock.yaml | 25 +++ tsconfig.json | 3 +- 21 files changed, 675 insertions(+), 1 deletion(-) create mode 100644 packages/eval/__tests__/schemas.test.ts create mode 100644 packages/eval/__tests__/task-loader.test.ts create mode 100644 packages/eval/package.json create mode 100644 packages/eval/src/cli.ts create mode 100644 packages/eval/src/commands/diff.ts create mode 100644 packages/eval/src/commands/index.ts create mode 100644 packages/eval/src/commands/list.ts create mode 100644 packages/eval/src/commands/report.ts create mode 100644 packages/eval/src/commands/run.ts create mode 100644 packages/eval/src/index.ts create mode 100644 packages/eval/src/judge/index.ts create mode 100644 packages/eval/src/judge/types.ts create mode 100644 packages/eval/src/storage/index.ts create mode 100644 packages/eval/src/storage/schemas.ts create mode 100644 packages/eval/src/storage/types.ts create mode 100644 packages/eval/src/task/index.ts create mode 100644 packages/eval/src/task/loader.ts create mode 100644 packages/eval/src/task/types.ts create mode 100644 packages/eval/tsconfig.json diff --git a/packages/eval/__tests__/schemas.test.ts b/packages/eval/__tests__/schemas.test.ts new file mode 100644 index 0000000..5e760d4 --- /dev/null +++ b/packages/eval/__tests__/schemas.test.ts @@ -0,0 +1,63 @@ +import { describe, expect, test } from "vitest"; +import { + EVAL_JUDGE_FRONTMATTER_SCHEMA, + EVAL_JUDGE_HALLUCINATION_SCHEMA, + EVAL_JUDGE_TOKEN_STATS_SCHEMA, + EVAL_JUDGE_UPSTREAM_SCHEMA, + EVAL_RUN_SCHEMA, +} from "../src/storage/index.js"; + +describe("OCAS schema definitions", () => { + test("eval-run schema has correct title and required fields", () => { + expect(EVAL_RUN_SCHEMA.title).toBe("@uwf/eval-run"); + const required = EVAL_RUN_SCHEMA.required as string[]; + expect(required).toContain("task"); + expect(required).toContain("config"); + expect(required).toContain("threadId"); + expect(required).toContain("judges"); + expect(required).toContain("overall"); + expect(required).toContain("timestamp"); + }); + + test("frontmatter judge schema has correct title", () => { + expect(EVAL_JUDGE_FRONTMATTER_SCHEMA.title).toBe("@uwf/eval-judge-frontmatter"); + const required = EVAL_JUDGE_FRONTMATTER_SCHEMA.required as string[]; + expect(required).toContain("stepsTotal"); + expect(required).toContain("stepsValid"); + expect(required).toContain("invalidSteps"); + }); + + test("upstream judge schema has correct title", () => { + expect(EVAL_JUDGE_UPSTREAM_SCHEMA.title).toBe("@uwf/eval-judge-upstream"); + const required = EVAL_JUDGE_UPSTREAM_SCHEMA.required as string[]; + expect(required).toContain("perStep"); + }); + + test("hallucination judge schema has correct title", () => { + expect(EVAL_JUDGE_HALLUCINATION_SCHEMA.title).toBe("@uwf/eval-judge-hallucination"); + const required = EVAL_JUDGE_HALLUCINATION_SCHEMA.required as string[]; + expect(required).toContain("perStep"); + }); + + test("token-stats judge schema has correct title", () => { + expect(EVAL_JUDGE_TOKEN_STATS_SCHEMA.title).toBe("@uwf/eval-judge-token-stats"); + const required = EVAL_JUDGE_TOKEN_STATS_SCHEMA.required as string[]; + expect(required).toContain("totalInput"); + expect(required).toContain("totalOutput"); + expect(required).toContain("totalTurns"); + expect(required).toContain("perStep"); + }); + + test("all schemas have type object at root", () => { + const schemas = [ + EVAL_RUN_SCHEMA, + EVAL_JUDGE_FRONTMATTER_SCHEMA, + EVAL_JUDGE_UPSTREAM_SCHEMA, + EVAL_JUDGE_HALLUCINATION_SCHEMA, + EVAL_JUDGE_TOKEN_STATS_SCHEMA, + ]; + for (const s of schemas) { + expect(s.type).toBe("object"); + } + }); +}); diff --git a/packages/eval/__tests__/task-loader.test.ts b/packages/eval/__tests__/task-loader.test.ts new file mode 100644 index 0000000..bbbc857 --- /dev/null +++ b/packages/eval/__tests__/task-loader.test.ts @@ -0,0 +1,163 @@ +import { describe, expect, test } from "vitest"; +import { parseTaskManifest } from "../src/task/index.js"; + +const VALID_YAML = ` +name: fix-off-by-one +description: Fix an off-by-one error in a calculator +workflow: solve-issue +prompt: "Fix the bug: add(1,2) returns 4 instead of 3" +limits: + maxSteps: 15 + timeoutMinutes: 30 +judges: + - name: frontmatter-compliance + weight: 0.15 + builtin: true + - name: test-pass + weight: 0.3 + entry: dist/judges/test-pass.js + schema: schemas/test-pass.json +`; + +describe("parseTaskManifest", () => { + test("parses valid task.yaml", () => { + const manifest = parseTaskManifest(VALID_YAML); + expect(manifest.name).toBe("fix-off-by-one"); + expect(manifest.description).toBe("Fix an off-by-one error in a calculator"); + expect(manifest.workflow).toBe("solve-issue"); + expect(manifest.prompt).toBe("Fix the bug: add(1,2) returns 4 instead of 3"); + expect(manifest.limits).toEqual({ maxSteps: 15, timeoutMinutes: 30 }); + expect(manifest.judges).toHaveLength(2); + }); + + test("parses builtin judge", () => { + const manifest = parseTaskManifest(VALID_YAML); + const builtin = manifest.judges[0]; + expect(builtin).toBeDefined(); + expect(builtin!.name).toBe("frontmatter-compliance"); + expect(builtin!.weight).toBe(0.15); + expect(builtin!.builtin).toBe(true); + expect(builtin!.entry).toBeNull(); + }); + + test("parses custom judge with entry + schema", () => { + const manifest = parseTaskManifest(VALID_YAML); + const custom = manifest.judges[1]; + expect(custom).toBeDefined(); + expect(custom!.name).toBe("test-pass"); + expect(custom!.weight).toBe(0.3); + expect(custom!.builtin).toBe(false); + expect(custom!.entry).toBe("dist/judges/test-pass.js"); + expect(custom!.schema).toBe("schemas/test-pass.json"); + }); + + test("defaults limits when omitted", () => { + const yaml = ` +name: minimal +workflow: solve-issue +prompt: do something +judges: + - name: check + builtin: true +`; + const manifest = parseTaskManifest(yaml); + expect(manifest.limits).toEqual({ maxSteps: 20, timeoutMinutes: 30 }); + }); + + test("defaults description to empty string", () => { + const yaml = ` +name: no-desc +workflow: solve-issue +prompt: do something +judges: + - name: check + builtin: true +`; + const manifest = parseTaskManifest(yaml); + expect(manifest.description).toBe(""); + }); + + test("rejects missing name", () => { + const yaml = ` +workflow: solve-issue +prompt: do something +judges: + - name: check + builtin: true +`; + expect(() => parseTaskManifest(yaml)).toThrow("name is required"); + }); + + test("rejects missing workflow", () => { + const yaml = ` +name: test +prompt: do something +judges: + - name: check + builtin: true +`; + expect(() => parseTaskManifest(yaml)).toThrow("workflow is required"); + }); + + test("rejects missing prompt", () => { + const yaml = ` +name: test +workflow: solve-issue +judges: + - name: check + builtin: true +`; + expect(() => parseTaskManifest(yaml)).toThrow("prompt is required"); + }); + + test("rejects empty judges array", () => { + const yaml = ` +name: test +workflow: solve-issue +prompt: do something +judges: [] +`; + expect(() => parseTaskManifest(yaml)).toThrow("at least one judge"); + }); + + test("rejects non-builtin judge without entry", () => { + const yaml = ` +name: test +workflow: solve-issue +prompt: do something +judges: + - name: custom-check + weight: 0.5 +`; + expect(() => parseTaskManifest(yaml)).toThrow("non-builtin judge must have entry"); + }); + + test("rejects non-object YAML root", () => { + expect(() => parseTaskManifest("just a string")).toThrow("must be a YAML mapping"); + }); + + test("rejects judge without name", () => { + const yaml = ` +name: test +workflow: solve-issue +prompt: do something +judges: + - weight: 0.5 + builtin: true +`; + expect(() => parseTaskManifest(yaml)).toThrow("name is required"); + }); + + test("defaults weight to 0 when omitted", () => { + const yaml = ` +name: test +workflow: solve-issue +prompt: do something +judges: + - name: token-stats + builtin: true +`; + const manifest = parseTaskManifest(yaml); + expect(manifest.judges[0]!.weight).toBe(0); + }); +}); diff --git a/packages/eval/package.json b/packages/eval/package.json new file mode 100644 index 0000000..696b2a2 --- /dev/null +++ b/packages/eval/package.json @@ -0,0 +1,46 @@ +{ + "name": "@united-workforce/eval", + "version": "0.1.0", + "private": true, + "files": [ + "src", + "dist", + "package.json" + ], + "type": "module", + "bin": { + "uwf-eval": "./dist/cli.js" + }, + "exports": { + ".": { + "types": "./dist/index.d.ts", + "import": "./dist/index.js" + } + }, + "scripts": { + "prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1", + "test": "vitest run __tests__/", + "test:ci": "vitest run __tests__/" + }, + "dependencies": { + "@ocas/core": "^0.3.0", + "@ocas/fs": "^0.3.0", + "@united-workforce/protocol": "workspace:^", + "@united-workforce/util": "workspace:^", + "commander": "^14.0.3", + "yaml": "^2.9.0" + }, + "devDependencies": { + "typescript": "^5.8.3" + }, + "repository": { + "type": "git", + "url": "https://git.shazhou.work/shazhou/united-workforce.git", + "directory": "packages/eval" + }, + "homepage": "https://git.shazhou.work/shazhou/united-workforce#readme", + "bugs": { + "url": "https://git.shazhou.work/shazhou/united-workforce/issues" + }, + "license": "MIT" +} diff --git a/packages/eval/src/cli.ts b/packages/eval/src/cli.ts new file mode 100644 index 0000000..00c89b5 --- /dev/null +++ b/packages/eval/src/cli.ts @@ -0,0 +1,22 @@ +#!/usr/bin/env node +import { Command } from "commander"; +import { + registerDiffCommand, + registerListCommand, + registerReportCommand, + registerRunCommand, +} from "./commands/index.js"; + +const program = new Command(); + +program + .name("uwf-eval") + .description("Evaluate uwf workflow quality with real agents") + .version("0.1.0"); + +registerRunCommand(program); +registerReportCommand(program); +registerDiffCommand(program); +registerListCommand(program); + +program.parse(); diff --git a/packages/eval/src/commands/diff.ts b/packages/eval/src/commands/diff.ts new file mode 100644 index 0000000..fa443d8 --- /dev/null +++ b/packages/eval/src/commands/diff.ts @@ -0,0 +1,11 @@ +import type { Command } from "commander"; + +export function registerDiffCommand(program: Command): void { + program + .command("diff ") + .description("Compare two eval runs side-by-side") + .action(async (_hash1: string, _hash2: string) => { + process.stderr.write("uwf-eval diff: not yet implemented\n"); + process.exitCode = 1; + }); +} diff --git a/packages/eval/src/commands/index.ts b/packages/eval/src/commands/index.ts new file mode 100644 index 0000000..0dded07 --- /dev/null +++ b/packages/eval/src/commands/index.ts @@ -0,0 +1,4 @@ +export { registerDiffCommand } from "./diff.js"; +export { registerListCommand } from "./list.js"; +export { registerReportCommand } from "./report.js"; +export { registerRunCommand } from "./run.js"; diff --git a/packages/eval/src/commands/list.ts b/packages/eval/src/commands/list.ts new file mode 100644 index 0000000..9c4bd0c --- /dev/null +++ b/packages/eval/src/commands/list.ts @@ -0,0 +1,13 @@ +import type { Command } from "commander"; + +export function registerListCommand(program: Command): void { + program + .command("list") + .description("List past eval runs") + .option("--task ", "filter by task name") + .option("--limit ", "max results", "20") + .action(async (_opts: Record) => { + process.stderr.write("uwf-eval list: not yet implemented\n"); + process.exitCode = 1; + }); +} diff --git a/packages/eval/src/commands/report.ts b/packages/eval/src/commands/report.ts new file mode 100644 index 0000000..db3e8d6 --- /dev/null +++ b/packages/eval/src/commands/report.ts @@ -0,0 +1,11 @@ +import type { Command } from "commander"; + +export function registerReportCommand(program: Command): void { + program + .command("report ") + .description("Show eval run results") + .action(async (_hash: string) => { + process.stderr.write("uwf-eval report: not yet implemented\n"); + process.exitCode = 1; + }); +} diff --git a/packages/eval/src/commands/run.ts b/packages/eval/src/commands/run.ts new file mode 100644 index 0000000..4bc6b08 --- /dev/null +++ b/packages/eval/src/commands/run.ts @@ -0,0 +1,14 @@ +import type { Command } from "commander"; + +export function registerRunCommand(program: Command): void { + program + .command("run ") + .description("Run eval on a task directory or tarball") + .option("--agent ", "agent adapter to use", "hermes") + .option("--model ", "model override") + .option("--count ", "number of eval runs", "1") + .action(async (_task: string, _opts: Record) => { + process.stderr.write("uwf-eval run: not yet implemented\n"); + process.exitCode = 1; + }); +} diff --git a/packages/eval/src/index.ts b/packages/eval/src/index.ts new file mode 100644 index 0000000..69c6e69 --- /dev/null +++ b/packages/eval/src/index.ts @@ -0,0 +1,15 @@ +// Task manifest + +// Judge types +export type { JudgeInput, JudgeOutput } from "./judge/index.js"; +export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload } from "./storage/index.js"; +// Storage schemas and types +export { + EVAL_JUDGE_FRONTMATTER_SCHEMA, + EVAL_JUDGE_HALLUCINATION_SCHEMA, + EVAL_JUDGE_TOKEN_STATS_SCHEMA, + EVAL_JUDGE_UPSTREAM_SCHEMA, + EVAL_RUN_SCHEMA, +} from "./storage/index.js"; +export type { JudgeEntry, TaskLimits, TaskManifest } from "./task/index.js"; +export { loadTaskManifest, parseTaskManifest } from "./task/index.js"; diff --git a/packages/eval/src/judge/index.ts b/packages/eval/src/judge/index.ts new file mode 100644 index 0000000..ebb4dfb --- /dev/null +++ b/packages/eval/src/judge/index.ts @@ -0,0 +1 @@ +export type { JudgeInput, JudgeOutput } from "./types.js"; diff --git a/packages/eval/src/judge/types.ts b/packages/eval/src/judge/types.ts new file mode 100644 index 0000000..93b506f --- /dev/null +++ b/packages/eval/src/judge/types.ts @@ -0,0 +1,15 @@ +/** Output shape every judge must produce on stdout (JSON). */ +export type JudgeOutput = { + /** Score between 0.0 and 1.0. */ + score: number; + /** Judge-specific structured data, stored in CAS with its own schema. */ + data: T; +}; + +/** Input context passed to judge scripts via argv. */ +export type JudgeInput = { + /** Working directory where the task was executed. */ + cwd: string; + /** Thread ID of the eval run. */ + threadId: string; +}; diff --git a/packages/eval/src/storage/index.ts b/packages/eval/src/storage/index.ts new file mode 100644 index 0000000..8b0d554 --- /dev/null +++ b/packages/eval/src/storage/index.ts @@ -0,0 +1,8 @@ +export { + EVAL_JUDGE_FRONTMATTER_SCHEMA, + EVAL_JUDGE_HALLUCINATION_SCHEMA, + EVAL_JUDGE_TOKEN_STATS_SCHEMA, + EVAL_JUDGE_UPSTREAM_SCHEMA, + EVAL_RUN_SCHEMA, +} from "./schemas.js"; +export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload } from "./types.js"; diff --git a/packages/eval/src/storage/schemas.ts b/packages/eval/src/storage/schemas.ts new file mode 100644 index 0000000..a7809c6 --- /dev/null +++ b/packages/eval/src/storage/schemas.ts @@ -0,0 +1,123 @@ +import type { JSONSchema } from "@ocas/core"; + +export const EVAL_RUN_SCHEMA: JSONSchema = { + title: "@uwf/eval-run", + type: "object", + required: ["task", "config", "threadId", "judges", "overall", "timestamp"], + properties: { + task: { type: "string" }, + config: { + type: "object", + required: ["agent", "model", "engineVersion"], + properties: { + agent: { type: "string" }, + model: { type: "string" }, + engineVersion: { type: "string" }, + }, + }, + threadId: { type: "string" }, + judges: { + type: "array", + items: { + type: "object", + required: ["name", "score", "weight", "dataHash"], + properties: { + name: { type: "string" }, + score: { type: "number" }, + weight: { type: "number" }, + dataHash: { type: "string" }, + }, + }, + }, + overall: { type: "number" }, + timestamp: { type: "integer" }, + }, +}; + +export const EVAL_JUDGE_FRONTMATTER_SCHEMA: JSONSchema = { + title: "@uwf/eval-judge-frontmatter", + type: "object", + required: ["stepsTotal", "stepsValid", "invalidSteps"], + properties: { + stepsTotal: { type: "integer" }, + stepsValid: { type: "integer" }, + invalidSteps: { + type: "array", + items: { + type: "object", + required: ["stepIndex", "role", "errors"], + properties: { + stepIndex: { type: "integer" }, + role: { type: "string" }, + errors: { type: "array", items: { type: "string" } }, + }, + }, + }, + }, +}; + +export const EVAL_JUDGE_UPSTREAM_SCHEMA: JSONSchema = { + title: "@uwf/eval-judge-upstream", + type: "object", + required: ["perStep"], + properties: { + perStep: { + type: "array", + items: { + type: "object", + required: ["role", "consumed", "missed", "score"], + properties: { + role: { type: "string" }, + consumed: { type: "array", items: { type: "string" } }, + missed: { type: "array", items: { type: "string" } }, + score: { type: "number" }, + }, + }, + }, + }, +}; + +export const EVAL_JUDGE_HALLUCINATION_SCHEMA: JSONSchema = { + title: "@uwf/eval-judge-hallucination", + type: "object", + required: ["perStep"], + properties: { + perStep: { + type: "array", + items: { + type: "object", + required: ["role", "hallucinations", "score"], + properties: { + role: { type: "string" }, + hallucinations: { type: "array", items: { type: "string" } }, + score: { type: "number" }, + }, + }, + }, + }, +}; + +export const EVAL_JUDGE_TOKEN_STATS_SCHEMA: JSONSchema = { + title: "@uwf/eval-judge-token-stats", + type: "object", + required: ["totalInput", "totalOutput", "totalTurns", "perStep"], + properties: { + totalInput: { type: "integer" }, + totalOutput: { type: "integer" }, + totalTurns: { type: "integer" }, + perStep: { + type: "array", + items: { + type: "object", + required: ["role", "inputTokens", "outputTokens", "turns", "duration"], + properties: { + role: { type: "string" }, + inputTokens: { type: "integer" }, + outputTokens: { type: "integer" }, + turns: { type: "integer" }, + duration: { type: "number" }, + }, + }, + }, + }, +}; diff --git a/packages/eval/src/storage/types.ts b/packages/eval/src/storage/types.ts new file mode 100644 index 0000000..862e45c --- /dev/null +++ b/packages/eval/src/storage/types.ts @@ -0,0 +1,26 @@ +import type { CasRef } from "@united-workforce/protocol"; + +/** A single judge result within an eval run. */ +export type EvalJudgeRecord = { + name: string; + score: number; + weight: number; + dataHash: CasRef; +}; + +/** Config snapshot for an eval run. */ +export type EvalRunConfig = { + agent: string; + model: string; + engineVersion: string; +}; + +/** Full eval run record stored in CAS. */ +export type EvalRunPayload = { + task: string; + config: EvalRunConfig; + threadId: string; + judges: EvalJudgeRecord[]; + overall: number; + timestamp: number; +}; diff --git a/packages/eval/src/task/index.ts b/packages/eval/src/task/index.ts new file mode 100644 index 0000000..67009e3 --- /dev/null +++ b/packages/eval/src/task/index.ts @@ -0,0 +1,2 @@ +export { loadTaskManifest, parseTaskManifest } from "./loader.js"; +export type { JudgeEntry, TaskLimits, TaskManifest } from "./types.js"; diff --git a/packages/eval/src/task/loader.ts b/packages/eval/src/task/loader.ts new file mode 100644 index 0000000..1257c3b --- /dev/null +++ b/packages/eval/src/task/loader.ts @@ -0,0 +1,74 @@ +import { readFile } from "node:fs/promises"; +import { join } from "node:path"; +import { parse as parseYaml } from "yaml"; +import type { JudgeEntry, TaskLimits, TaskManifest } from "./types.js"; + +function isRecord(value: unknown): value is Record { + return typeof value === "object" && value !== null && !Array.isArray(value); +} + +function parseJudgeEntry(raw: unknown, index: number): JudgeEntry { + if (!isRecord(raw)) { + throw new Error(`judges[${index}]: expected object`); + } + const name = raw.name; + if (typeof name !== "string" || name === "") { + throw new Error(`judges[${index}]: name is required`); + } + const weight = typeof raw.weight === "number" ? raw.weight : 0; + const builtin = raw.builtin === true; + const entry = typeof raw.entry === "string" ? raw.entry : null; + const schema = typeof raw.schema === "string" ? raw.schema : null; + if (!builtin && entry === null) { + throw new Error(`judges[${index}] "${name}": non-builtin judge must have entry`); + } + return { name, weight, builtin, entry, schema }; +} + +function parseLimits(raw: unknown): TaskLimits { + if (!isRecord(raw)) { + return { maxSteps: 20, timeoutMinutes: 30 }; + } + return { + maxSteps: typeof raw.maxSteps === "number" ? raw.maxSteps : 20, + timeoutMinutes: typeof raw.timeoutMinutes === "number" ? raw.timeoutMinutes : 30, + }; +} + +/** Parse and validate a task.yaml file into a TaskManifest. */ +export function parseTaskManifest(yamlText: string): TaskManifest { + const raw = parseYaml(yamlText) as unknown; + if (!isRecord(raw)) { + throw new Error("task.yaml must be a YAML mapping"); + } + const name = raw.name; + if (typeof name !== "string" || name === "") { + throw new Error("task.yaml: name is required"); + } + const description = typeof raw.description === "string" ? raw.description : ""; + const workflow = raw.workflow; + if (typeof workflow !== "string" || workflow === "") { + throw new Error("task.yaml: workflow is required"); + } + const prompt = raw.prompt; + if (typeof prompt !== "string" || prompt === "") { + throw new Error("task.yaml: prompt is required"); + } + const limits = parseLimits(raw.limits); + const judgesRaw = raw.judges; + if (!Array.isArray(judgesRaw) || judgesRaw.length === 0) { + throw new Error("task.yaml: at least one judge is required"); + } + const judges: JudgeEntry[] = []; + for (let i = 0; i < judgesRaw.length; i++) { + judges.push(parseJudgeEntry(judgesRaw[i], i)); + } + return { name, description, workflow, prompt, limits, judges }; +} + +/** Load and parse task.yaml from a directory. */ +export async function loadTaskManifest(taskDir: string): Promise { + const yamlPath = join(taskDir, "task.yaml"); + const text = await readFile(yamlPath, "utf8"); + return parseTaskManifest(text); +} diff --git a/packages/eval/src/task/types.ts b/packages/eval/src/task/types.ts new file mode 100644 index 0000000..80f27e5 --- /dev/null +++ b/packages/eval/src/task/types.ts @@ -0,0 +1,28 @@ +/** Judge entry in task.yaml */ +export type JudgeEntry = { + name: string; + weight: number; + builtin: boolean; + /** Path to judge entry script (relative to task root). Required for non-builtin judges. */ + entry: string | null; + /** Path to OCAS schema JSON for judge data. Required for non-builtin judges. */ + schema: string | null; +}; + +/** Limits for eval execution. */ +export type TaskLimits = { + maxSteps: number; + timeoutMinutes: number; +}; + +/** Parsed task.yaml manifest. */ +export type TaskManifest = { + name: string; + description: string; + /** Workflow name or relative path to .yaml file. */ + workflow: string; + /** Initial prompt for thread start. */ + prompt: string; + limits: TaskLimits; + judges: JudgeEntry[]; +}; diff --git a/packages/eval/tsconfig.json b/packages/eval/tsconfig.json new file mode 100644 index 0000000..e8bf6d5 --- /dev/null +++ b/packages/eval/tsconfig.json @@ -0,0 +1,9 @@ +{ + "extends": "../../tsconfig.json", + "compilerOptions": { + "rootDir": "src", + "outDir": "dist" + }, + "include": ["src"], + "references": [{ "path": "../protocol" }, { "path": "../util" }] +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 11093aa..299e83d 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -228,6 +228,31 @@ importers: specifier: ^8.0.13 version: 8.0.16(@types/node@25.9.1)(esbuild@0.27.7)(jiti@2.7.0)(yaml@2.9.0) + packages/eval: + dependencies: + '@ocas/core': + specifier: ^0.3.0 + version: 0.3.0 + '@ocas/fs': + specifier: ^0.3.0 + version: 0.3.0 + '@united-workforce/protocol': + specifier: workspace:^ + version: link:../protocol + '@united-workforce/util': + specifier: workspace:^ + version: link:../util + commander: + specifier: ^14.0.3 + version: 14.0.3 + yaml: + specifier: ^2.9.0 + version: 2.9.0 + devDependencies: + typescript: + specifier: ^5.8.3 + version: 5.9.3 + packages/protocol: dependencies: '@ocas/core': diff --git a/tsconfig.json b/tsconfig.json index 76e1129..7cba641 100644 --- a/tsconfig.json +++ b/tsconfig.json @@ -25,6 +25,7 @@ { "path": "packages/agent-builtin" }, { "path": "packages/agent-mock" }, { "path": "packages/agent-claude-code" }, - { "path": "packages/cli" } + { "path": "packages/cli" }, + { "path": "packages/eval" } ] } From fae9e9ed3a7a87e28e3ec40f0c4daa77613f147e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=8F=E6=A9=98?= Date: Thu, 4 Jun 2026 23:59:21 +0000 Subject: [PATCH 2/4] =?UTF-8?q?feat:=20eval=20run=20command=20=E2=80=94=20?= =?UTF-8?q?prepare,=20execute,=20collect=20pipeline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the uwf-eval run command with 3-phase pipeline: - prepare: read task.yaml, copy fixture/ to temp workdir - execute: shell out to uwf thread start + exec - collect: run judges, compute weighted score, store CAS node, set @uwf/eval//latest variable Changes: - src/runner/ — types, prepare, execute, collect, index - src/storage/store.ts — createEvalStore(), setEvalLatest() - src/commands/run.ts — full pipeline wiring with --agent/--model/--count - 9 new tests (prepare + collect + weighted scoring) Builtin judges return placeholder score 0 (Phase 1c). Refs #70 --- packages/eval/__tests__/collect.test.ts | 157 ++++++++++++++++++++++++ packages/eval/__tests__/prepare.test.ts | 74 +++++++++++ packages/eval/src/commands/run.ts | 76 +++++++++++- packages/eval/src/index.ts | 25 +++- packages/eval/src/runner/collect.ts | 150 ++++++++++++++++++++++ packages/eval/src/runner/execute.ts | 87 +++++++++++++ packages/eval/src/runner/index.ts | 15 +++ packages/eval/src/runner/prepare.ts | 45 +++++++ packages/eval/src/runner/types.ts | 85 +++++++++++++ packages/eval/src/storage/index.ts | 3 +- packages/eval/src/storage/store.ts | 42 +++++++ packages/eval/src/storage/types.ts | 7 ++ 12 files changed, 759 insertions(+), 7 deletions(-) create mode 100644 packages/eval/__tests__/collect.test.ts create mode 100644 packages/eval/__tests__/prepare.test.ts create mode 100644 packages/eval/src/runner/collect.ts create mode 100644 packages/eval/src/runner/execute.ts create mode 100644 packages/eval/src/runner/index.ts create mode 100644 packages/eval/src/runner/prepare.ts create mode 100644 packages/eval/src/runner/types.ts create mode 100644 packages/eval/src/storage/store.ts diff --git a/packages/eval/__tests__/collect.test.ts b/packages/eval/__tests__/collect.test.ts new file mode 100644 index 0000000..7622d09 --- /dev/null +++ b/packages/eval/__tests__/collect.test.ts @@ -0,0 +1,157 @@ +import { bootstrap, createMemoryStore } from "@ocas/core"; +import { describe, expect, test } from "vitest"; +import type { JudgeRunner } from "../src/runner/index.js"; +import { collect, computeOverall } from "../src/runner/index.js"; +import type { EvalRunConfig, EvalStore } from "../src/storage/index.js"; +import type { JudgeEntry, TaskManifest } from "../src/task/index.js"; + +function makeJudge(name: string, weight: number, builtin: boolean): JudgeEntry { + return { + name, + weight, + builtin, + entry: builtin ? null : `dist/judges/${name}.js`, + schema: null, + }; +} + +function makeManifest(judges: JudgeEntry[]): TaskManifest { + return { + name: "fix-off-by-one", + description: "test task", + workflow: "solve-issue", + prompt: "Fix the bug", + limits: { maxSteps: 10, timeoutMinutes: 30 }, + judges, + }; +} + +function makeEvalStore(): EvalStore { + const store = createMemoryStore(); + bootstrap(store); + return { store, varStore: store.var }; +} + +const CONFIG: EvalRunConfig = { + agent: "hermes", + model: "claude-sonnet-4", + engineVersion: "test", +}; + +/** Returns a fixed score per judge name. */ +function scriptedRunner(scores: Record): JudgeRunner { + return async (_taskDir, _workDir, _threadId, judge) => ({ + score: scores[judge.name] ?? 0, + data: { judged: judge.name }, + schema: { type: "object" }, + }); +} + +describe("computeOverall", () => { + test("computes the weighted average correctly", () => { + const overall = computeOverall([ + { score: 0.8, weight: 0.3 }, + { score: 0.6, weight: 0.3 }, + { score: 1.0, weight: 0.4 }, + ]); + // 0.24 + 0.18 + 0.4 = 0.82 + expect(overall).toBeCloseTo(0.82, 10); + }); + + test("a weight-0 judge does not affect the result", () => { + const withInformational = computeOverall([ + { score: 1.0, weight: 1.0 }, + { score: 0.0, weight: 0.0 }, + ]); + expect(withInformational).toBe(1.0); + }); + + test("returns 0 when total weight is 0", () => { + expect(computeOverall([{ score: 0.5, weight: 0 }])).toBe(0); + }); +}); + +describe("collect", () => { + test("computes weighted score correctly across judges", async () => { + const evalStore = makeEvalStore(); + const manifest = makeManifest([ + makeJudge("test-pass", 0.6, false), + makeJudge("code-quality", 0.4, false), + ]); + const runJudge = scriptedRunner({ "test-pass": 1.0, "code-quality": 0.5 }); + + const result = await collect( + { + evalStore, + taskDir: "/tmp/task", + workDir: "/tmp/work", + threadId: "THREAD123", + manifest, + config: CONFIG, + }, + runJudge, + ); + + // 1.0 * 0.6 + 0.5 * 0.4 = 0.8 + expect(result.overall).toBeCloseTo(0.8, 10); + expect(result.runHash).toBeTruthy(); + expect(result.judges).toHaveLength(2); + expect(result.judges[0]).toEqual({ name: "test-pass", score: 1.0, weight: 0.6 }); + + const latest = evalStore.varStore.list({ + exactName: "@uwf/eval/fix-off-by-one/latest", + }); + expect(latest[0]?.value).toBe(result.runHash); + }); + + test("handles a judge with weight 0 (informational)", async () => { + const evalStore = makeEvalStore(); + const manifest = makeManifest([ + makeJudge("test-pass", 1.0, false), + makeJudge("token-stats", 0, true), + ]); + // token-stats is builtin → default runner would score 0; give scripted score + // that would skew the result if it were counted. + const runJudge = scriptedRunner({ "test-pass": 0.5, "token-stats": 1.0 }); + + const result = await collect( + { + evalStore, + taskDir: "/tmp/task", + workDir: "/tmp/work", + threadId: "THREAD123", + manifest, + config: CONFIG, + }, + runJudge, + ); + + // Only test-pass (weight 1.0) counts → overall = 0.5 + expect(result.overall).toBeCloseTo(0.5, 10); + expect(result.judges).toHaveLength(2); + const tokenStats = result.judges.find((j) => j.name === "token-stats"); + expect(tokenStats?.weight).toBe(0); + }); + + test("builtin judges are skipped with placeholder score 0", async () => { + const evalStore = makeEvalStore(); + const manifest = makeManifest([makeJudge("frontmatter-compliance", 1.0, true)]); + + // Use the default runner (no injected runner) → builtin skipped → score 0. + const result = await collect({ + evalStore, + taskDir: "/tmp/task", + workDir: "/tmp/work", + threadId: "THREAD123", + manifest, + config: CONFIG, + }); + + expect(result.overall).toBe(0); + expect(result.judges[0]).toEqual({ + name: "frontmatter-compliance", + score: 0, + weight: 1.0, + }); + }); +}); diff --git a/packages/eval/__tests__/prepare.test.ts b/packages/eval/__tests__/prepare.test.ts new file mode 100644 index 0000000..ed923f6 --- /dev/null +++ b/packages/eval/__tests__/prepare.test.ts @@ -0,0 +1,74 @@ +import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; + +import { afterEach, beforeEach, describe, expect, test } from "vitest"; + +import { prepare } from "../src/runner/index.js"; + +const TASK_YAML = ` +name: fix-off-by-one +description: Fix an off-by-one error +workflow: solve-issue +prompt: "Fix the bug" +limits: + maxSteps: 12 + timeoutMinutes: 20 +judges: + - name: frontmatter-compliance + weight: 0.5 + builtin: true + - name: test-pass + weight: 0.5 + entry: dist/judges/test-pass.js +`; + +let taskDir: string; + +beforeEach(async () => { + taskDir = await mkdtemp(join(tmpdir(), "uwf-eval-task-")); + await writeFile(join(taskDir, "task.yaml"), TASK_YAML, "utf8"); + const fixtureDir = join(taskDir, "fixture"); + await mkdir(join(fixtureDir, "src"), { recursive: true }); + await writeFile(join(fixtureDir, "src", "calc.ts"), "export const add = (a, b) => a + b + 1;\n"); + await writeFile(join(fixtureDir, "package.json"), '{ "name": "fixture" }\n'); +}); + +afterEach(async () => { + await rm(taskDir, { recursive: true, force: true }); +}); + +describe("prepare", () => { + test("returns the parsed manifest", async () => { + const result = await prepare(taskDir); + expect(result.taskDir).toBe(taskDir); + expect(result.manifest.name).toBe("fix-off-by-one"); + expect(result.manifest.workflow).toBe("solve-issue"); + expect(result.manifest.limits.maxSteps).toBe(12); + expect(result.manifest.judges).toHaveLength(2); + }); + + test("copies fixture into a fresh temp work dir", async () => { + const result = await prepare(taskDir); + expect(result.workDir).not.toBe(taskDir); + expect(result.workDir.startsWith(tmpdir())).toBe(true); + + const calc = await readFile(join(result.workDir, "src", "calc.ts"), "utf8"); + expect(calc).toContain("export const add"); + const pkg = await readFile(join(result.workDir, "package.json"), "utf8"); + expect(pkg).toContain("fixture"); + + await rm(result.workDir, { recursive: true, force: true }); + }); + + test("creates an empty work dir when no fixture/ exists", async () => { + const noFixtureDir = await mkdtemp(join(tmpdir(), "uwf-eval-nofix-")); + await writeFile(join(noFixtureDir, "task.yaml"), TASK_YAML, "utf8"); + + const result = await prepare(noFixtureDir); + expect(result.workDir.startsWith(tmpdir())).toBe(true); + + await rm(noFixtureDir, { recursive: true, force: true }); + await rm(result.workDir, { recursive: true, force: true }); + }); +}); diff --git a/packages/eval/src/commands/run.ts b/packages/eval/src/commands/run.ts index 4bc6b08..a066419 100644 --- a/packages/eval/src/commands/run.ts +++ b/packages/eval/src/commands/run.ts @@ -1,4 +1,52 @@ +import { resolve } from "node:path"; + import type { Command } from "commander"; +import type { RunResult } from "../runner/index.js"; +import { collect, execute, getEngineVersion, prepare } from "../runner/index.js"; +import type { EvalRunConfig } from "../storage/index.js"; +import { createEvalStore } from "../storage/index.js"; + +type RunCliOptions = { + agent: string; + model: string | undefined; + count: string; +}; + +async function runOnce( + taskDir: string, + agent: string, + model: string, + engineVersion: string, +): Promise { + const prepared = await prepare(taskDir); + const { manifest, workDir } = prepared; + + const { threadId } = await execute({ + workDir, + workflow: manifest.workflow, + prompt: manifest.prompt, + agent, + maxSteps: manifest.limits.maxSteps, + }); + + const evalStore = await createEvalStore(); + const config: EvalRunConfig = { agent, model, engineVersion }; + const collected = await collect({ + evalStore, + taskDir: prepared.taskDir, + workDir, + threadId, + manifest, + config, + }); + + return { + runHash: collected.runHash, + overall: collected.overall, + task: manifest.name, + judges: collected.judges, + }; +} export function registerRunCommand(program: Command): void { program @@ -7,8 +55,30 @@ export function registerRunCommand(program: Command): void { .option("--agent ", "agent adapter to use", "hermes") .option("--model ", "model override") .option("--count ", "number of eval runs", "1") - .action(async (_task: string, _opts: Record) => { - process.stderr.write("uwf-eval run: not yet implemented\n"); - process.exitCode = 1; + .action(async (task: string, opts: RunCliOptions) => { + const taskDir = resolve(task); + const agent = opts.agent; + const model = opts.model ?? ""; + const count = Number.parseInt(opts.count, 10); + if (!Number.isInteger(count) || count < 1) { + process.stderr.write("--count must be a positive integer\n"); + process.exitCode = 1; + return; + } + + const engineVersion = getEngineVersion(); + + try { + const results: RunResult[] = []; + for (let i = 0; i < count; i++) { + results.push(await runOnce(taskDir, agent, model, engineVersion)); + } + const output = count === 1 ? results[0] : results; + process.stdout.write(`${JSON.stringify(output)}\n`); + } catch (e) { + const message = e instanceof Error ? e.message : String(e); + process.stderr.write(`${message}\n`); + process.exitCode = 1; + } }); } diff --git a/packages/eval/src/index.ts b/packages/eval/src/index.ts index 69c6e69..dadd36f 100644 --- a/packages/eval/src/index.ts +++ b/packages/eval/src/index.ts @@ -1,15 +1,34 @@ -// Task manifest - // Judge types export type { JudgeInput, JudgeOutput } from "./judge/index.js"; -export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload } from "./storage/index.js"; +export type { + CollectInput, + CollectResult, + ExecuteInput, + ExecuteResult, + JudgeRunner, + JudgeRunOutput, + JudgeSummary, + PrepareResult, + RunOptions, + RunResult, +} from "./runner/index.js"; +// Runner (prepare → execute → collect) +export { collect, computeOverall, execute, getEngineVersion, prepare } from "./runner/index.js"; +export type { + EvalJudgeRecord, + EvalRunConfig, + EvalRunPayload, + EvalStore, +} from "./storage/index.js"; // Storage schemas and types export { + createEvalStore, EVAL_JUDGE_FRONTMATTER_SCHEMA, EVAL_JUDGE_HALLUCINATION_SCHEMA, EVAL_JUDGE_TOKEN_STATS_SCHEMA, EVAL_JUDGE_UPSTREAM_SCHEMA, EVAL_RUN_SCHEMA, + setEvalLatest, } from "./storage/index.js"; export type { JudgeEntry, TaskLimits, TaskManifest } from "./task/index.js"; export { loadTaskManifest, parseTaskManifest } from "./task/index.js"; diff --git a/packages/eval/src/runner/collect.ts b/packages/eval/src/runner/collect.ts new file mode 100644 index 0000000..b487dc8 --- /dev/null +++ b/packages/eval/src/runner/collect.ts @@ -0,0 +1,150 @@ +import { execFileSync } from "node:child_process"; +import { readFile } from "node:fs/promises"; +import { resolve } from "node:path"; + +import type { JSONSchema, Store } from "@ocas/core"; +import { putSchema } from "@ocas/core"; +import type { CasRef } from "@united-workforce/protocol"; +import { createLogger } from "@united-workforce/util"; + +import type { JudgeOutput } from "../judge/index.js"; +import type { EvalJudgeRecord, EvalRunPayload } from "../storage/index.js"; +import { EVAL_RUN_SCHEMA, setEvalLatest } from "../storage/index.js"; +import type { JudgeEntry } from "../task/index.js"; +import type { + CollectInput, + CollectResult, + JudgeRunner, + JudgeRunOutput, + JudgeSummary, +} from "./types.js"; + +const log = createLogger({ sink: { kind: "stderr" } }); + +const LOG_JUDGE = "CT6N3P2K"; +const LOG_STORED = "CT9V2Q7M"; + +/** Permissive schema for judge data without a dedicated schema (e.g. builtin placeholders). */ +const GENERIC_DATA_SCHEMA: JSONSchema = { type: "object" }; + +/** + * Compute the weighted overall score. Judges with weight 0 are informational + * and do not affect the result (they contribute 0 to both numerator and + * denominator). Returns 0 when total weight is 0. + */ +export function computeOverall(judges: ReadonlyArray<{ score: number; weight: number }>): number { + let totalWeight = 0; + let weighted = 0; + for (const judge of judges) { + totalWeight += judge.weight; + weighted += judge.score * judge.weight; + } + return totalWeight > 0 ? weighted / totalWeight : 0; +} + +/** Run a task-provided judge script: `node `. */ +async function runTaskJudge( + taskDir: string, + workDir: string, + threadId: string, + judge: JudgeEntry, +): Promise { + if (judge.entry === null) { + throw new Error(`judge "${judge.name}" is not builtin but has no entry`); + } + const entryPath = resolve(taskDir, judge.entry); + + let stdout: string; + try { + stdout = execFileSync("node", [entryPath, workDir, threadId], { + encoding: "utf8", + stdio: ["ignore", "pipe", "pipe"], + maxBuffer: 50 * 1024 * 1024, + }); + } catch (e) { + const message = e instanceof Error ? e.message : String(e); + throw new Error(`judge "${judge.name}" failed: ${message}`); + } + + const line = stdout.trim().split("\n").pop()?.trim() ?? ""; + let parsed: unknown; + try { + parsed = JSON.parse(line); + } catch { + throw new Error(`judge "${judge.name}" stdout is not valid JSON: ${line || "(empty)"}`); + } + const output = parsed as JudgeOutput; + if (typeof output.score !== "number") { + throw new Error(`judge "${judge.name}" output missing numeric score`); + } + + const schema = + judge.schema !== null ? await loadSchema(resolve(taskDir, judge.schema)) : GENERIC_DATA_SCHEMA; + return { score: output.score, data: output.data, schema }; +} + +/** Load and parse an OCAS JSON Schema file. */ +async function loadSchema(path: string): Promise { + const text = await readFile(path, "utf8"); + return JSON.parse(text) as JSONSchema; +} + +/** + * Default judge runner. Builtin judges are skipped for now (placeholder score 0 + * with empty data); task judges spawn their entry script. + */ +const defaultJudgeRunner: JudgeRunner = async (taskDir, workDir, threadId, judge) => { + if (judge.builtin) { + return { score: 0, data: {}, schema: GENERIC_DATA_SCHEMA }; + } + return runTaskJudge(taskDir, workDir, threadId, judge); +}; + +/** Persist judge data to CAS under its schema and return the CAS hash. */ +async function storeJudgeData(store: Store, schema: JSONSchema, data: unknown): Promise { + const schemaHash = await putSchema(store, schema); + return (await store.cas.put(schemaHash, data)) as CasRef; +} + +/** + * Run all judges, store their data and the overall eval-run record in CAS, then + * index the run under `@uwf/eval//latest`. + */ +export async function collect( + input: CollectInput, + runJudge: JudgeRunner = defaultJudgeRunner, +): Promise { + const { evalStore, taskDir, workDir, threadId, manifest, config } = input; + const { store, varStore } = evalStore; + + const records: EvalJudgeRecord[] = []; + for (const judge of manifest.judges) { + const result = await runJudge(taskDir, workDir, threadId, judge); + const dataHash = await storeJudgeData(store, result.schema, result.data); + records.push({ name: judge.name, score: result.score, weight: judge.weight, dataHash }); + log(LOG_JUDGE, `judge=${judge.name} score=${result.score} weight=${judge.weight}`); + } + + const overall = computeOverall(records); + + const payload: EvalRunPayload = { + task: manifest.name, + config, + threadId, + judges: records, + overall, + timestamp: Date.now(), + }; + + const schemaHash = await putSchema(store, EVAL_RUN_SCHEMA); + const runHash = (await store.cas.put(schemaHash, payload)) as string; + setEvalLatest(varStore, manifest.name, runHash); + log(LOG_STORED, `stored eval-run task=${manifest.name} hash=${runHash} overall=${overall}`); + + const judges: JudgeSummary[] = records.map((r) => ({ + name: r.name, + score: r.score, + weight: r.weight, + })); + return { runHash, overall, judges }; +} diff --git a/packages/eval/src/runner/execute.ts b/packages/eval/src/runner/execute.ts new file mode 100644 index 0000000..d0b35a3 --- /dev/null +++ b/packages/eval/src/runner/execute.ts @@ -0,0 +1,87 @@ +import { execFileSync } from "node:child_process"; + +import { createLogger } from "@united-workforce/util"; + +import type { ExecuteInput, ExecuteResult } from "./types.js"; + +const log = createLogger({ sink: { kind: "stderr" } }); + +const LOG_START = "EX5M2T9V"; +const LOG_EXEC = "EX7Q4K2N"; + +/** Resolve the uwf CLI binary. Override with `UWF_BIN` for testing. */ +function uwfBin(): string { + const override = process.env.UWF_BIN; + return override !== undefined && override !== "" ? override : "uwf"; +} + +/** Run a uwf subcommand and return trimmed stdout. */ +function runUwf(args: string[], cwd: string): string { + try { + return execFileSync(uwfBin(), args, { + encoding: "utf8", + stdio: ["ignore", "pipe", "pipe"], + maxBuffer: 50 * 1024 * 1024, + cwd, + }).trim(); + } catch (e) { + const err = e as NodeJS.ErrnoException & { stderr?: Buffer | string | null }; + const stderr = + err.stderr == null + ? "" + : typeof err.stderr === "string" + ? err.stderr + : err.stderr.toString("utf8"); + const detail = stderr.trim() !== "" ? `: ${stderr.trim()}` : ""; + throw new Error(`uwf ${args[0]} ${args[1]} failed${detail}`); + } +} + +/** Parse the thread ID from `uwf thread start` JSON output (`{ workflow, thread }`). */ +function parseThreadId(stdout: string): string { + let parsed: unknown; + try { + parsed = JSON.parse(stdout); + } catch { + throw new Error(`uwf thread start did not emit valid JSON: ${stdout || "(empty)"}`); + } + const obj = parsed as Record; + const thread = obj.thread; + if (typeof thread !== "string" || thread === "") { + throw new Error(`uwf thread start output missing thread id: ${stdout}`); + } + return thread; +} + +/** + * Execute a workflow: create a thread, then run it for up to `maxSteps` steps. + * Shells out to the uwf CLI rather than importing it directly. + */ +export async function execute(input: ExecuteInput): Promise { + const startOut = runUwf( + ["thread", "start", input.workflow, "-p", input.prompt, "--cwd", input.workDir], + input.workDir, + ); + const threadId = parseThreadId(startOut); + log(LOG_START, `thread started thread=${threadId} workflow=${input.workflow}`); + + runUwf( + ["thread", "exec", threadId, "--agent", input.agent, "-c", String(input.maxSteps)], + input.workDir, + ); + log(LOG_EXEC, `thread executed thread=${threadId} maxSteps=${input.maxSteps}`); + + return { threadId }; +} + +/** Best-effort lookup of the uwf engine version (`uwf -V`); "unknown" on failure. */ +export function getEngineVersion(): string { + try { + return execFileSync(uwfBin(), ["-V"], { + encoding: "utf8", + stdio: ["ignore", "pipe", "ignore"], + }).trim(); + } catch { + return "unknown"; + } +} diff --git a/packages/eval/src/runner/index.ts b/packages/eval/src/runner/index.ts new file mode 100644 index 0000000..7b2f95f --- /dev/null +++ b/packages/eval/src/runner/index.ts @@ -0,0 +1,15 @@ +export { collect, computeOverall } from "./collect.js"; +export { execute, getEngineVersion } from "./execute.js"; +export { prepare } from "./prepare.js"; +export type { + CollectInput, + CollectResult, + ExecuteInput, + ExecuteResult, + JudgeRunner, + JudgeRunOutput, + JudgeSummary, + PrepareResult, + RunOptions, + RunResult, +} from "./types.js"; diff --git a/packages/eval/src/runner/prepare.ts b/packages/eval/src/runner/prepare.ts new file mode 100644 index 0000000..4ee5ceb --- /dev/null +++ b/packages/eval/src/runner/prepare.ts @@ -0,0 +1,45 @@ +import { access, cp, mkdir, mkdtemp } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; + +import { createLogger } from "@united-workforce/util"; + +import { loadTaskManifest } from "../task/index.js"; +import type { PrepareResult } from "./types.js"; + +const log = createLogger({ sink: { kind: "stderr" } }); + +const LOG_PREPARE = "PRE4K2NQ"; +const LOG_FIXTURE = "PRE7M3VX"; + +/** Check whether a path exists. */ +async function pathExists(path: string): Promise { + try { + await access(path); + return true; + } catch { + return false; + } +} + +/** + * Prepare a task for execution: read its manifest and copy the fixture + * directory into a fresh temp working directory. + */ +export async function prepare(taskDir: string): Promise { + const manifest = await loadTaskManifest(taskDir); + log(LOG_PREPARE, `loaded task manifest name=${manifest.name} workflow=${manifest.workflow}`); + + const workDir = await mkdtemp(join(tmpdir(), "uwf-eval-")); + + const fixtureDir = join(taskDir, "fixture"); + if (await pathExists(fixtureDir)) { + await cp(fixtureDir, workDir, { recursive: true }); + log(LOG_FIXTURE, `copied fixture into workDir=${workDir}`); + } else { + await mkdir(workDir, { recursive: true }); + log(LOG_FIXTURE, `no fixture/ found, using empty workDir=${workDir}`); + } + + return { taskDir, workDir, manifest }; +} diff --git a/packages/eval/src/runner/types.ts b/packages/eval/src/runner/types.ts new file mode 100644 index 0000000..e785e19 --- /dev/null +++ b/packages/eval/src/runner/types.ts @@ -0,0 +1,85 @@ +import type { JSONSchema } from "@ocas/core"; + +import type { EvalRunConfig, EvalStore } from "../storage/index.js"; +import type { JudgeEntry, TaskManifest } from "../task/index.js"; + +/** Result of the prepare phase: task dir, temp working dir, parsed manifest. */ +export type PrepareResult = { + taskDir: string; + workDir: string; + manifest: TaskManifest; +}; + +/** Input to the execute phase. */ +export type ExecuteInput = { + /** Working directory the workflow runs in (the prepared temp dir). */ + workDir: string; + /** Workflow name or path (from task.yaml). */ + workflow: string; + /** Initial prompt for the thread. */ + prompt: string; + /** Agent adapter to use. */ + agent: string; + /** Maximum number of steps to execute. */ + maxSteps: number; +}; + +/** Result of the execute phase. */ +export type ExecuteResult = { + threadId: string; +}; + +/** Output produced by running a single judge. */ +export type JudgeRunOutput = { + score: number; + data: unknown; + /** Schema describing `data`, used when persisting to CAS. */ + schema: JSONSchema; +}; + +/** Pluggable judge execution strategy (injectable for testing). */ +export type JudgeRunner = ( + taskDir: string, + workDir: string, + threadId: string, + judge: JudgeEntry, +) => Promise; + +/** Input to the collect phase. */ +export type CollectInput = { + evalStore: EvalStore; + taskDir: string; + workDir: string; + threadId: string; + manifest: TaskManifest; + config: EvalRunConfig; +}; + +/** A single judge's summarized result in the run output. */ +export type JudgeSummary = { + name: string; + score: number; + weight: number; +}; + +/** Result of the collect phase. */ +export type CollectResult = { + runHash: string; + overall: number; + judges: JudgeSummary[]; +}; + +/** Options for a full eval run (from CLI flags). */ +export type RunOptions = { + agent: string; + model: string; + count: number; +}; + +/** Final result of a full eval run. */ +export type RunResult = { + runHash: string; + overall: number; + task: string; + judges: JudgeSummary[]; +}; diff --git a/packages/eval/src/storage/index.ts b/packages/eval/src/storage/index.ts index 8b0d554..53ea2d2 100644 --- a/packages/eval/src/storage/index.ts +++ b/packages/eval/src/storage/index.ts @@ -5,4 +5,5 @@ export { EVAL_JUDGE_UPSTREAM_SCHEMA, EVAL_RUN_SCHEMA, } from "./schemas.js"; -export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload } from "./types.js"; +export { createEvalStore, setEvalLatest } from "./store.js"; +export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload, EvalStore } from "./types.js"; diff --git a/packages/eval/src/storage/store.ts b/packages/eval/src/storage/store.ts new file mode 100644 index 0000000..ce69224 --- /dev/null +++ b/packages/eval/src/storage/store.ts @@ -0,0 +1,42 @@ +import { mkdir } from "node:fs/promises"; +import { homedir } from "node:os"; +import { join } from "node:path"; +import type { VarStore } from "@ocas/core"; +import { bootstrap, type Store } from "@ocas/core"; +import { createFsStore, createSqliteVarStore } from "@ocas/fs"; + +import type { EvalStore } from "./types.js"; + +/** Variable name prefix for eval run pointers (`@uwf/eval//latest`). */ +const EVAL_VAR_PREFIX = "@uwf/eval/"; + +/** + * Resolve the global CAS directory shared by all uwf and ocas tools. + * Priority: `OCAS_HOME` → default ~/.ocas (matches uwf CLI's getGlobalCasDir). + */ +function getGlobalCasDir(): string { + const primary = process.env.OCAS_HOME; + if (primary !== undefined && primary !== "") { + return primary; + } + return join(homedir(), ".ocas"); +} + +/** + * Open the unified OCAS store on the filesystem. + * Shares the same CAS + variable backend as the uwf CLI. + */ +export async function createEvalStore(): Promise { + const casDir = getGlobalCasDir(); + await mkdir(casDir, { recursive: true }); + const cas = createFsStore(casDir); + const { var: varStore, tag } = createSqliteVarStore(join(casDir, "vars"), cas); + const store: Store = { cas, var: varStore, tag }; + bootstrap(store); + return { store, varStore }; +} + +/** Set the `@uwf/eval//latest` variable to point at a run hash. */ +export function setEvalLatest(varStore: VarStore, taskName: string, runHash: string): void { + varStore.set(`${EVAL_VAR_PREFIX}${taskName}/latest`, runHash); +} diff --git a/packages/eval/src/storage/types.ts b/packages/eval/src/storage/types.ts index 862e45c..1348eb7 100644 --- a/packages/eval/src/storage/types.ts +++ b/packages/eval/src/storage/types.ts @@ -1,5 +1,12 @@ +import type { Store, VarStore } from "@ocas/core"; import type { CasRef } from "@united-workforce/protocol"; +/** Handle to the OCAS store used for eval persistence. */ +export type EvalStore = { + store: Store; + varStore: VarStore; +}; + /** A single judge result within an eval run. */ export type EvalJudgeRecord = { name: string; From 8c26f1671647d07ec11729939458fe2712c03cf0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=8F=E6=A9=98?= Date: Fri, 5 Jun 2026 00:09:06 +0000 Subject: [PATCH 3/4] =?UTF-8?q?feat:=20builtin=20judges=20=E2=80=94=20fron?= =?UTF-8?q?tmatter=20+=20token-stats=20(deterministic)=20+=20upstream/hall?= =?UTF-8?q?ucination=20(stubs)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement 4 builtin judges for eval framework: - frontmatter-compliance: validates YAML frontmatter with $status field, score = stepsValid / stepsTotal - token-stats: aggregates Usage from step nodes, always score 1.0 (informational only) - upstream-consumption: LLM-as-judge stub (score 0, TODO) - hallucination: LLM-as-judge stub (score 0, TODO) Infrastructure: - judge/builtin/read-steps.ts — shell out to uwf step list - judge/builtin/types.ts — BuiltinJudge, BuiltinJudgeOutput - runner/collect.ts — dispatch builtin judges by name 9 new tests (frontmatter validation + token aggregation) Refs #71 --- .../eval/__tests__/builtin-judges.test.ts | 196 ++++++++++++++++++ packages/eval/__tests__/collect.test.ts | 31 ++- .../eval/src/judge/builtin/frontmatter.ts | 95 +++++++++ .../eval/src/judge/builtin/hallucination.ts | 17 ++ packages/eval/src/judge/builtin/index.ts | 6 + packages/eval/src/judge/builtin/read-steps.ts | 14 ++ .../eval/src/judge/builtin/token-stats.ts | 53 +++++ packages/eval/src/judge/builtin/types.ts | 16 ++ packages/eval/src/judge/builtin/upstream.ts | 17 ++ packages/eval/src/judge/index.ts | 9 + packages/eval/src/runner/collect.ts | 28 ++- 11 files changed, 461 insertions(+), 21 deletions(-) create mode 100644 packages/eval/__tests__/builtin-judges.test.ts create mode 100644 packages/eval/src/judge/builtin/frontmatter.ts create mode 100644 packages/eval/src/judge/builtin/hallucination.ts create mode 100644 packages/eval/src/judge/builtin/index.ts create mode 100644 packages/eval/src/judge/builtin/read-steps.ts create mode 100644 packages/eval/src/judge/builtin/token-stats.ts create mode 100644 packages/eval/src/judge/builtin/types.ts create mode 100644 packages/eval/src/judge/builtin/upstream.ts diff --git a/packages/eval/__tests__/builtin-judges.test.ts b/packages/eval/__tests__/builtin-judges.test.ts new file mode 100644 index 0000000..9b4768d --- /dev/null +++ b/packages/eval/__tests__/builtin-judges.test.ts @@ -0,0 +1,196 @@ +import type { StepEntry } from "@united-workforce/protocol"; +import { beforeEach, describe, expect, test, vi } from "vitest"; + +import { + runFrontmatterJudge, + runHallucinationJudge, + runTokenStatsJudge, + runUpstreamJudge, +} from "../src/judge/builtin/index.js"; + +// Mock the shared read-steps helper so the judges never shell out to `uwf`. +vi.mock("../src/judge/builtin/read-steps.js", () => ({ + readThreadSteps: vi.fn(), +})); + +import { readThreadSteps } from "../src/judge/builtin/read-steps.js"; + +const mockedReadSteps = vi.mocked(readThreadSteps); + +function makeStep(overrides: Partial): StepEntry { + return { + hash: "HASH000000000", + role: "worker", + output: "---\n$status: done\n---\n\nbody", + detail: "DETAIL0000000", + agent: "hermes", + timestamp: 0, + durationMs: 0, + usage: null, + ...overrides, + }; +} + +beforeEach(() => { + mockedReadSteps.mockReset(); +}); + +describe("frontmatter-compliance judge", () => { + test("all steps have valid frontmatter → score 1.0", async () => { + mockedReadSteps.mockReturnValue([ + makeStep({ role: "a", output: "---\n$status: done\n---\n\nwork" }), + makeStep({ role: "b", output: "---\n$status: needs_input\n---\nmore" }), + ]); + + const result = await runFrontmatterJudge("T1"); + const data = result.data as { stepsTotal: number; stepsValid: number; invalidSteps: unknown[] }; + + expect(result.score).toBe(1.0); + expect(data.stepsTotal).toBe(2); + expect(data.stepsValid).toBe(2); + expect(data.invalidSteps).toHaveLength(0); + }); + + test("some steps missing $status → partial score", async () => { + mockedReadSteps.mockReturnValue([ + makeStep({ role: "a", output: "---\n$status: done\n---\nok" }), + makeStep({ role: "b", output: "---\nfoo: bar\n---\nmissing status" }), + makeStep({ role: "c", output: "no frontmatter at all" }), + ]); + + const result = await runFrontmatterJudge("T2"); + const data = result.data as { + stepsTotal: number; + stepsValid: number; + invalidSteps: Array<{ stepIndex: number; role: string; errors: string[] }>; + }; + + expect(result.score).toBeCloseTo(1 / 3, 10); + expect(data.stepsTotal).toBe(3); + expect(data.stepsValid).toBe(1); + expect(data.invalidSteps).toHaveLength(2); + expect(data.invalidSteps[0]).toMatchObject({ stepIndex: 1, role: "b" }); + expect(data.invalidSteps[1]).toMatchObject({ stepIndex: 2, role: "c" }); + }); + + test("no steps → score 0 (0/0 edge case)", async () => { + mockedReadSteps.mockReturnValue([]); + + const result = await runFrontmatterJudge("T3"); + const data = result.data as { stepsTotal: number; stepsValid: number; invalidSteps: unknown[] }; + + expect(result.score).toBe(0); + expect(data.stepsTotal).toBe(0); + expect(data.stepsValid).toBe(0); + expect(data.invalidSteps).toHaveLength(0); + }); + + test("empty-string $status counts as invalid", async () => { + mockedReadSteps.mockReturnValue([makeStep({ role: "a", output: '---\n$status: ""\n---\nx' })]); + + const result = await runFrontmatterJudge("T4"); + expect(result.score).toBe(0); + }); +}); + +describe("token-stats judge", () => { + test("steps with usage → sums correctly", async () => { + mockedReadSteps.mockReturnValue([ + makeStep({ + role: "a", + usage: { turns: 2, inputTokens: 100, outputTokens: 50, duration: 1.5 }, + }), + makeStep({ + role: "b", + usage: { turns: 3, inputTokens: 200, outputTokens: 75, duration: 2.0 }, + }), + ]); + + const result = await runTokenStatsJudge("T1"); + const data = result.data as { + totalInput: number; + totalOutput: number; + totalTurns: number; + perStep: Array<{ role: string; inputTokens: number; outputTokens: number; turns: number }>; + }; + + expect(result.score).toBe(1.0); + expect(data.totalInput).toBe(300); + expect(data.totalOutput).toBe(125); + expect(data.totalTurns).toBe(5); + expect(data.perStep).toHaveLength(2); + expect(data.perStep[0]).toEqual({ + role: "a", + inputTokens: 100, + outputTokens: 50, + turns: 2, + duration: 1.5, + }); + }); + + test("steps with null usage → zeros", async () => { + mockedReadSteps.mockReturnValue([ + makeStep({ role: "a", usage: null }), + makeStep({ role: "b", usage: null }), + ]); + + const result = await runTokenStatsJudge("T2"); + const data = result.data as { + totalInput: number; + totalOutput: number; + totalTurns: number; + perStep: Array<{ + inputTokens: number; + outputTokens: number; + turns: number; + duration: number; + }>; + }; + + expect(result.score).toBe(1.0); + expect(data.totalInput).toBe(0); + expect(data.totalOutput).toBe(0); + expect(data.totalTurns).toBe(0); + expect(data.perStep[0]).toEqual({ + role: "a", + inputTokens: 0, + outputTokens: 0, + turns: 0, + duration: 0, + }); + }); + + test("empty steps → all zeros, score 1.0", async () => { + mockedReadSteps.mockReturnValue([]); + + const result = await runTokenStatsJudge("T3"); + const data = result.data as { + totalInput: number; + totalOutput: number; + totalTurns: number; + perStep: unknown[]; + }; + + expect(result.score).toBe(1.0); + expect(data.totalInput).toBe(0); + expect(data.totalOutput).toBe(0); + expect(data.totalTurns).toBe(0); + expect(data.perStep).toHaveLength(0); + }); +}); + +describe("LLM-as-judge stubs", () => { + test("upstream-consumption returns a stub", async () => { + const result = await runUpstreamJudge("T1"); + expect(result.score).toBe(0); + expect(result.data).toEqual({ perStep: [] }); + expect(result.schema.title).toBe("@uwf/eval-judge-upstream"); + }); + + test("hallucination returns a stub", async () => { + const result = await runHallucinationJudge("T1"); + expect(result.score).toBe(0); + expect(result.data).toEqual({ perStep: [] }); + expect(result.schema.title).toBe("@uwf/eval-judge-hallucination"); + }); +}); diff --git a/packages/eval/__tests__/collect.test.ts b/packages/eval/__tests__/collect.test.ts index 7622d09..5d22c14 100644 --- a/packages/eval/__tests__/collect.test.ts +++ b/packages/eval/__tests__/collect.test.ts @@ -133,25 +133,20 @@ describe("collect", () => { expect(tokenStats?.weight).toBe(0); }); - test("builtin judges are skipped with placeholder score 0", async () => { + test("unknown builtin judge name throws via the default runner", async () => { const evalStore = makeEvalStore(); - const manifest = makeManifest([makeJudge("frontmatter-compliance", 1.0, true)]); + const manifest = makeManifest([makeJudge("not-a-real-judge", 1.0, true)]); - // Use the default runner (no injected runner) → builtin skipped → score 0. - const result = await collect({ - evalStore, - taskDir: "/tmp/task", - workDir: "/tmp/work", - threadId: "THREAD123", - manifest, - config: CONFIG, - }); - - expect(result.overall).toBe(0); - expect(result.judges[0]).toEqual({ - name: "frontmatter-compliance", - score: 0, - weight: 1.0, - }); + // Use the default runner (no injected runner) → builtin dispatch → unknown name throws. + await expect( + collect({ + evalStore, + taskDir: "/tmp/task", + workDir: "/tmp/work", + threadId: "THREAD123", + manifest, + config: CONFIG, + }), + ).rejects.toThrow(/unknown builtin judge/); }); }); diff --git a/packages/eval/src/judge/builtin/frontmatter.ts b/packages/eval/src/judge/builtin/frontmatter.ts new file mode 100644 index 0000000..46ab5a3 --- /dev/null +++ b/packages/eval/src/judge/builtin/frontmatter.ts @@ -0,0 +1,95 @@ +import { createLogger } from "@united-workforce/util"; +import { parse as parseYaml } from "yaml"; + +import { EVAL_JUDGE_FRONTMATTER_SCHEMA } from "../../storage/index.js"; +import { readThreadSteps } from "./read-steps.js"; +import type { BuiltinJudgeOutput } from "./types.js"; + +const log = createLogger({ sink: { kind: "stderr" } }); + +const LOG_RESULT = "F2QH7R4M"; + +const FENCE = "---"; + +type InvalidStep = { + stepIndex: number; + role: string; + errors: string[]; +}; + +/** + * Extract the YAML frontmatter block from a step output. Returns the inner YAML + * string when the output starts with a `---\n` block closed by a `\n---` fence, + * otherwise null. + */ +function extractFrontmatterYaml(output: unknown): string | null { + if (typeof output !== "string") { + return null; + } + if (!output.startsWith(`${FENCE}\n`)) { + return null; + } + const rest = output.slice(FENCE.length + 1); + const closeIndex = rest.indexOf(`\n${FENCE}`); + if (closeIndex === -1) { + return null; + } + return rest.slice(0, closeIndex); +} + +/** Validate a single step's frontmatter, returning a list of errors (empty = valid). */ +function validateStepFrontmatter(output: unknown): string[] { + const yaml = extractFrontmatterYaml(output); + if (yaml === null) { + return ["output does not begin with a valid '---' frontmatter block"]; + } + + let parsed: unknown; + try { + parsed = parseYaml(yaml); + } catch (e) { + const message = e instanceof Error ? e.message : String(e); + return [`frontmatter YAML failed to parse: ${message}`]; + } + + if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) { + return ["frontmatter is not a YAML mapping"]; + } + + const status = (parsed as Record).$status; + if (typeof status !== "string" || status.trim() === "") { + return ["$status field is missing or not a non-empty string"]; + } + + return []; +} + +/** + * Deterministic judge: every step's agent output must contain valid YAML + * frontmatter with a non-empty `$status` field. Score = stepsValid / stepsTotal + * (0 when there are no steps). + */ +export async function runFrontmatterJudge(threadId: string): Promise { + const steps = readThreadSteps(threadId); + + const invalidSteps: InvalidStep[] = []; + for (let i = 0; i < steps.length; i++) { + const step = steps[i]; + const errors = validateStepFrontmatter(step.output); + if (errors.length > 0) { + invalidSteps.push({ stepIndex: i, role: step.role, errors }); + } + } + + const stepsTotal = steps.length; + const stepsValid = stepsTotal - invalidSteps.length; + const score = stepsTotal > 0 ? stepsValid / stepsTotal : 0; + + log(LOG_RESULT, `frontmatter thread=${threadId} valid=${stepsValid}/${stepsTotal}`); + + return { + score, + data: { stepsTotal, stepsValid, invalidSteps }, + schema: EVAL_JUDGE_FRONTMATTER_SCHEMA, + }; +} diff --git a/packages/eval/src/judge/builtin/hallucination.ts b/packages/eval/src/judge/builtin/hallucination.ts new file mode 100644 index 0000000..702b743 --- /dev/null +++ b/packages/eval/src/judge/builtin/hallucination.ts @@ -0,0 +1,17 @@ +import { EVAL_JUDGE_HALLUCINATION_SCHEMA } from "../../storage/index.js"; +import type { BuiltinJudgeOutput } from "./types.js"; + +/** + * LLM-as-judge: detects claims in each step's output that are not grounded in + * the available context (hallucinations). + * + * TODO: LLM-as-judge — needs provider config to call LLM API. Returns a stub + * (score 0, empty perStep) until the LLM call path is wired up. + */ +export async function runHallucinationJudge(_threadId: string): Promise { + return { + score: 0, + data: { perStep: [] }, + schema: EVAL_JUDGE_HALLUCINATION_SCHEMA, + }; +} diff --git a/packages/eval/src/judge/builtin/index.ts b/packages/eval/src/judge/builtin/index.ts new file mode 100644 index 0000000..a7dad57 --- /dev/null +++ b/packages/eval/src/judge/builtin/index.ts @@ -0,0 +1,6 @@ +export { runFrontmatterJudge } from "./frontmatter.js"; +export { runHallucinationJudge } from "./hallucination.js"; +export { readThreadSteps } from "./read-steps.js"; +export { runTokenStatsJudge } from "./token-stats.js"; +export type { BuiltinJudge, BuiltinJudgeOutput } from "./types.js"; +export { runUpstreamJudge } from "./upstream.js"; diff --git a/packages/eval/src/judge/builtin/read-steps.ts b/packages/eval/src/judge/builtin/read-steps.ts new file mode 100644 index 0000000..38221b9 --- /dev/null +++ b/packages/eval/src/judge/builtin/read-steps.ts @@ -0,0 +1,14 @@ +import { execFileSync } from "node:child_process"; + +import type { StepEntry, ThreadStepsOutput } from "@united-workforce/protocol"; + +/** Shell out to `uwf step list` and return the parsed step entries (excludes start entry). */ +export function readThreadSteps(threadId: string): StepEntry[] { + const stdout = execFileSync("uwf", ["step", "list", threadId], { + encoding: "utf8", + stdio: ["ignore", "pipe", "pipe"], + }).trim(); + const parsed = JSON.parse(stdout) as ThreadStepsOutput; + // steps[0] is the StartEntry; the rest are StepEntry records. + return parsed.steps.slice(1) as StepEntry[]; +} diff --git a/packages/eval/src/judge/builtin/token-stats.ts b/packages/eval/src/judge/builtin/token-stats.ts new file mode 100644 index 0000000..cd7396d --- /dev/null +++ b/packages/eval/src/judge/builtin/token-stats.ts @@ -0,0 +1,53 @@ +import { createLogger } from "@united-workforce/util"; + +import { EVAL_JUDGE_TOKEN_STATS_SCHEMA } from "../../storage/index.js"; +import { readThreadSteps } from "./read-steps.js"; +import type { BuiltinJudgeOutput } from "./types.js"; + +const log = createLogger({ sink: { kind: "stderr" } }); + +const LOG_RESULT = "T7KQ3M9P"; + +type PerStepStats = { + role: string; + inputTokens: number; + outputTokens: number; + turns: number; + duration: number; +}; + +/** + * Informational judge: aggregate token usage across every step. Always scores + * 1.0 — it never penalizes a run, it only reports usage. Steps with null usage + * contribute zeros. + */ +export async function runTokenStatsJudge(threadId: string): Promise { + const steps = readThreadSteps(threadId); + + let totalInput = 0; + let totalOutput = 0; + let totalTurns = 0; + const perStep: PerStepStats[] = []; + + for (const step of steps) { + const usage = step.usage; + const inputTokens = usage !== null ? usage.inputTokens : 0; + const outputTokens = usage !== null ? usage.outputTokens : 0; + const turns = usage !== null ? usage.turns : 0; + const duration = usage !== null ? usage.duration : 0; + + totalInput += inputTokens; + totalOutput += outputTokens; + totalTurns += turns; + + perStep.push({ role: step.role, inputTokens, outputTokens, turns, duration }); + } + + log(LOG_RESULT, `token-stats thread=${threadId} in=${totalInput} out=${totalOutput}`); + + return { + score: 1.0, + data: { totalInput, totalOutput, totalTurns, perStep }, + schema: EVAL_JUDGE_TOKEN_STATS_SCHEMA, + }; +} diff --git a/packages/eval/src/judge/builtin/types.ts b/packages/eval/src/judge/builtin/types.ts new file mode 100644 index 0000000..1d21037 --- /dev/null +++ b/packages/eval/src/judge/builtin/types.ts @@ -0,0 +1,16 @@ +import type { JSONSchema } from "@ocas/core"; + +/** + * Output produced by a builtin judge. Structurally identical to the runner's + * `JudgeRunOutput`; defined locally to keep the judge module free of a + * dependency on the runner module. + */ +export type BuiltinJudgeOutput = { + score: number; + data: unknown; + /** Schema describing `data`, used when persisting to CAS. */ + schema: JSONSchema; +}; + +/** A builtin judge analyzes a thread's steps and returns a scored result. */ +export type BuiltinJudge = (threadId: string) => Promise; diff --git a/packages/eval/src/judge/builtin/upstream.ts b/packages/eval/src/judge/builtin/upstream.ts new file mode 100644 index 0000000..0fb548f --- /dev/null +++ b/packages/eval/src/judge/builtin/upstream.ts @@ -0,0 +1,17 @@ +import { EVAL_JUDGE_UPSTREAM_SCHEMA } from "../../storage/index.js"; +import type { BuiltinJudgeOutput } from "./types.js"; + +/** + * LLM-as-judge: measures how well each role consumed the relevant outputs from + * upstream steps. + * + * TODO: LLM-as-judge — needs provider config to call LLM API. Returns a stub + * (score 0, empty perStep) until the LLM call path is wired up. + */ +export async function runUpstreamJudge(_threadId: string): Promise { + return { + score: 0, + data: { perStep: [] }, + schema: EVAL_JUDGE_UPSTREAM_SCHEMA, + }; +} diff --git a/packages/eval/src/judge/index.ts b/packages/eval/src/judge/index.ts index ebb4dfb..84c63e5 100644 --- a/packages/eval/src/judge/index.ts +++ b/packages/eval/src/judge/index.ts @@ -1 +1,10 @@ +export { + type BuiltinJudge, + type BuiltinJudgeOutput, + readThreadSteps, + runFrontmatterJudge, + runHallucinationJudge, + runTokenStatsJudge, + runUpstreamJudge, +} from "./builtin/index.js"; export type { JudgeInput, JudgeOutput } from "./types.js"; diff --git a/packages/eval/src/runner/collect.ts b/packages/eval/src/runner/collect.ts index b487dc8..da745f7 100644 --- a/packages/eval/src/runner/collect.ts +++ b/packages/eval/src/runner/collect.ts @@ -8,6 +8,12 @@ import type { CasRef } from "@united-workforce/protocol"; import { createLogger } from "@united-workforce/util"; import type { JudgeOutput } from "../judge/index.js"; +import { + runFrontmatterJudge, + runHallucinationJudge, + runTokenStatsJudge, + runUpstreamJudge, +} from "../judge/index.js"; import type { EvalJudgeRecord, EvalRunPayload } from "../storage/index.js"; import { EVAL_RUN_SCHEMA, setEvalLatest } from "../storage/index.js"; import type { JudgeEntry } from "../task/index.js"; @@ -89,13 +95,29 @@ async function loadSchema(path: string): Promise { return JSON.parse(text) as JSONSchema; } +/** Dispatch a builtin judge by name. Throws on an unknown builtin name. */ +async function runBuiltinJudge(name: string, threadId: string): Promise { + switch (name) { + case "frontmatter-compliance": + return runFrontmatterJudge(threadId); + case "upstream-consumption": + return runUpstreamJudge(threadId); + case "hallucination": + return runHallucinationJudge(threadId); + case "token-stats": + return runTokenStatsJudge(threadId); + default: + throw new Error(`unknown builtin judge "${name}"`); + } +} + /** - * Default judge runner. Builtin judges are skipped for now (placeholder score 0 - * with empty data); task judges spawn their entry script. + * Default judge runner. Builtin judges are dispatched by name; task judges spawn + * their entry script. */ const defaultJudgeRunner: JudgeRunner = async (taskDir, workDir, threadId, judge) => { if (judge.builtin) { - return { score: 0, data: {}, schema: GENERIC_DATA_SCHEMA }; + return runBuiltinJudge(judge.name, threadId); } return runTaskJudge(taskDir, workDir, threadId, judge); }; From ae81e4b5ac39a08b8425991fb6e6c3f7ba880869 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=8F=E6=A9=98?= Date: Fri, 5 Jun 2026 00:19:25 +0000 Subject: [PATCH 4/4] feat: eval report, diff, list commands MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the 3 read commands for eval framework: - report: read eval-run from CAS, render formatted text (task, overall, config, judges table, thread ID) - diff: side-by-side comparison with ▲/▼ delta indicators and config change markers - list: scan @uwf/eval/*/latest variables, sort by timestamp desc, --task filter, --limit pagination Architecture: pure formatting functions (format.ts) + data access (read.ts) + thin CLI handlers. Types in types.ts. 11 new tests (formatReport, formatDiff, formatList, selectEntries) Refs #72 --- packages/eval/__tests__/commands.test.ts | 171 +++++++++++++++++++++++ packages/eval/src/commands/diff.ts | 33 ++++- packages/eval/src/commands/format.ts | 148 ++++++++++++++++++++ packages/eval/src/commands/index.ts | 3 + packages/eval/src/commands/list.ts | 36 ++++- packages/eval/src/commands/read.ts | 41 ++++++ packages/eval/src/commands/report.ts | 27 +++- packages/eval/src/commands/types.ts | 9 ++ 8 files changed, 459 insertions(+), 9 deletions(-) create mode 100644 packages/eval/__tests__/commands.test.ts create mode 100644 packages/eval/src/commands/format.ts create mode 100644 packages/eval/src/commands/read.ts create mode 100644 packages/eval/src/commands/types.ts diff --git a/packages/eval/__tests__/commands.test.ts b/packages/eval/__tests__/commands.test.ts new file mode 100644 index 0000000..2b498b1 --- /dev/null +++ b/packages/eval/__tests__/commands.test.ts @@ -0,0 +1,171 @@ +import { bootstrap, createMemoryStore, putSchema } from "@ocas/core"; +import type { CasRef } from "@united-workforce/protocol"; +import { describe, expect, test } from "vitest"; + +import { + formatDiff, + formatList, + formatReport, + readEvalEntries, + readEvalRun, + selectEntries, +} from "../src/commands/index.js"; +import type { EvalRunPayload, EvalStore } from "../src/storage/index.js"; +import { EVAL_RUN_SCHEMA, setEvalLatest } from "../src/storage/index.js"; + +function makeEvalStore(): EvalStore { + const store = createMemoryStore(); + bootstrap(store); + return { store, varStore: store.var }; +} + +function makePayload( + task: string, + overall: number, + timestamp: number, + judges: EvalRunPayload["judges"] = [ + { + name: "frontmatter-compliance", + score: 1.0, + weight: 0.6, + dataHash: "AAAAAAAAAAAAA" as CasRef, + }, + { name: "token-stats", score: 0.5, weight: 0, dataHash: "BBBBBBBBBBBBB" as CasRef }, + ], + config: EvalRunPayload["config"] = { + agent: "hermes", + model: "claude-sonnet-4", + engineVersion: "1.0.0", + }, +): EvalRunPayload { + return { task, config, threadId: "THREAD0123456789", judges, overall, timestamp }; +} + +/** Store an eval-run node in CAS and index it under @uwf/eval//latest. */ +function storeRun(evalStore: EvalStore, payload: EvalRunPayload): string { + const schemaHash = putSchema(evalStore.store, EVAL_RUN_SCHEMA); + const hash = evalStore.store.cas.put(schemaHash, payload); + setEvalLatest(evalStore.varStore, payload.task, hash); + return hash; +} + +describe("formatReport", () => { + test("includes task, overall, config and judges", () => { + const payload = makePayload("fix-off-by-one", 0.8, Date.UTC(2026, 0, 2, 3, 4, 5)); + const output = formatReport(payload, "RUNHASH123456"); + + expect(output).toContain("fix-off-by-one"); + expect(output).toContain("0.8000"); + expect(output).toContain("hermes"); + expect(output).toContain("claude-sonnet-4"); + expect(output).toContain("1.0.0"); + expect(output).toContain("frontmatter-compliance"); + expect(output).toContain("token-stats"); + expect(output).toContain("THREAD0123456789"); + expect(output).toContain("RUNHASH123456"); + }); + + test("round-trips a stored run via readEvalRun", () => { + const evalStore = makeEvalStore(); + const payload = makePayload("fix-off-by-one", 0.75, Date.now()); + const hash = storeRun(evalStore, payload); + + const loaded = readEvalRun(evalStore, hash); + expect(loaded).not.toBeNull(); + const output = formatReport(loaded as EvalRunPayload, hash); + expect(output).toContain("fix-off-by-one"); + expect(output).toContain("0.7500"); + }); + + test("readEvalRun returns null for a missing hash", () => { + const evalStore = makeEvalStore(); + expect(readEvalRun(evalStore, "NOPENOPENOPE0")).toBeNull(); + }); +}); + +describe("list", () => { + test("lists eval runs stored under different tasks", () => { + const evalStore = makeEvalStore(); + storeRun(evalStore, makePayload("fix-off-by-one", 0.8, 2000)); + storeRun(evalStore, makePayload("write-docs", 0.6, 1000)); + + const entries = readEvalEntries(evalStore); + expect(entries).toHaveLength(2); + + const output = formatList(selectEntries(entries, null, 20)); + expect(output).toContain("fix-off-by-one"); + expect(output).toContain("write-docs"); + }); + + test("sorts newest-first by timestamp", () => { + const evalStore = makeEvalStore(); + storeRun(evalStore, makePayload("old-task", 0.5, 1000)); + storeRun(evalStore, makePayload("new-task", 0.5, 2000)); + + const selected = selectEntries(readEvalEntries(evalStore), null, 20); + expect(selected[0]?.task).toBe("new-task"); + expect(selected[1]?.task).toBe("old-task"); + }); + + test("--task filter only shows the matching task", () => { + const evalStore = makeEvalStore(); + storeRun(evalStore, makePayload("fix-off-by-one", 0.8, 2000)); + storeRun(evalStore, makePayload("write-docs", 0.6, 1000)); + + const output = formatList(selectEntries(readEvalEntries(evalStore), "write-docs", 20)); + expect(output).toContain("write-docs"); + expect(output).not.toContain("fix-off-by-one"); + }); + + test("--limit caps the number of rows", () => { + const evalStore = makeEvalStore(); + storeRun(evalStore, makePayload("task-a", 0.8, 3000)); + storeRun(evalStore, makePayload("task-b", 0.6, 2000)); + storeRun(evalStore, makePayload("task-c", 0.4, 1000)); + + const selected = selectEntries(readEvalEntries(evalStore), null, 2); + expect(selected).toHaveLength(2); + expect(selected.map((e) => e.task)).toEqual(["task-a", "task-b"]); + }); + + test("empty store renders a placeholder", () => { + const evalStore = makeEvalStore(); + const output = formatList(selectEntries(readEvalEntries(evalStore), null, 20)); + expect(output).toContain("(no eval runs found)"); + }); +}); + +describe("formatDiff", () => { + test("shows an upward delta when B scores higher", () => { + const a = makePayload("fix-off-by-one", 0.6, 1000); + const b = makePayload("fix-off-by-one", 0.8, 2000); + const output = formatDiff(a, "HASHA00000000", b, "HASHB00000000"); + + expect(output).toContain("▲"); + expect(output).toContain("HASHA00000000"); + expect(output).toContain("HASHB00000000"); + }); + + test("shows a downward delta when B scores lower", () => { + const a = makePayload("fix-off-by-one", 0.9, 1000); + const b = makePayload("fix-off-by-one", 0.4, 2000); + const output = formatDiff(a, "HASHA00000000", b, "HASHB00000000"); + expect(output).toContain("▼"); + }); + + test("marks differing config values", () => { + const a = makePayload("fix-off-by-one", 0.6, 1000, undefined, { + agent: "hermes", + model: "claude-sonnet-4", + engineVersion: "1.0.0", + }); + const b = makePayload("fix-off-by-one", 0.6, 2000, undefined, { + agent: "claude-code", + model: "claude-sonnet-4", + engineVersion: "1.0.0", + }); + const output = formatDiff(a, "HASHA00000000", b, "HASHB00000000"); + expect(output).toContain("≠"); + expect(output).toContain("claude-code"); + }); +}); diff --git a/packages/eval/src/commands/diff.ts b/packages/eval/src/commands/diff.ts index fa443d8..cb292ac 100644 --- a/packages/eval/src/commands/diff.ts +++ b/packages/eval/src/commands/diff.ts @@ -1,11 +1,38 @@ +import { createLogger } from "@united-workforce/util"; import type { Command } from "commander"; +import { createEvalStore } from "../storage/index.js"; +import { formatDiff } from "./format.js"; +import { readEvalRun } from "./read.js"; + +const log = createLogger({ sink: { kind: "stderr" } }); +const LOG_DIFF = "D3WZ8N5T"; + export function registerDiffCommand(program: Command): void { program .command("diff ") .description("Compare two eval runs side-by-side") - .action(async (_hash1: string, _hash2: string) => { - process.stderr.write("uwf-eval diff: not yet implemented\n"); - process.exitCode = 1; + .action(async (hash1: string, hash2: string) => { + try { + const evalStore = await createEvalStore(); + const payloadA = readEvalRun(evalStore, hash1); + if (payloadA === null) { + process.stderr.write(`eval run not found: ${hash1}\n`); + process.exitCode = 1; + return; + } + const payloadB = readEvalRun(evalStore, hash2); + if (payloadB === null) { + process.stderr.write(`eval run not found: ${hash2}\n`); + process.exitCode = 1; + return; + } + log(LOG_DIFF, `diff a=${hash1} b=${hash2}`); + process.stdout.write(formatDiff(payloadA, hash1, payloadB, hash2)); + } catch (e) { + const message = e instanceof Error ? e.message : String(e); + process.stderr.write(`${message}\n`); + process.exitCode = 1; + } }); } diff --git a/packages/eval/src/commands/format.ts b/packages/eval/src/commands/format.ts new file mode 100644 index 0000000..880e733 --- /dev/null +++ b/packages/eval/src/commands/format.ts @@ -0,0 +1,148 @@ +import type { EvalRunPayload } from "../storage/index.js"; +import type { EvalListEntry } from "./types.js"; + +const NAME_WIDTH = 28; +const SCORE_WIDTH = 10; +const TIMESTAMP_WIDTH = 26; + +/** Format a 0..1 score (or weight) with fixed precision. */ +function formatScore(value: number): string { + return value.toFixed(4); +} + +/** Human-readable ISO-8601 timestamp from epoch milliseconds. */ +function formatTimestamp(ms: number): string { + return new Date(ms).toISOString(); +} + +/** Right-pad to a fixed column width (with a trailing space if already full). */ +function pad(value: string, width: number): string { + return value.length >= width ? `${value} ` : value.padEnd(width); +} + +/** Directional indicator for a score delta (B relative to A). */ +function formatDelta(delta: number): string { + if (delta > 0) { + return `▲ +${formatScore(delta)}`; + } + if (delta < 0) { + return `▼ ${formatScore(delta)}`; + } + return `= ${formatScore(0)}`; +} + +/** Render a single eval run as a human-readable report. */ +export function formatReport(payload: EvalRunPayload, runHash: string): string { + const lines: string[] = []; + lines.push("=== Eval Report ==="); + lines.push(`Task: ${payload.task}`); + lines.push(`Overall: ${formatScore(payload.overall)}`); + lines.push(`Timestamp: ${formatTimestamp(payload.timestamp)}`); + lines.push(""); + lines.push("Config:"); + lines.push(` Agent: ${payload.config.agent}`); + lines.push(` Model: ${payload.config.model}`); + lines.push(` Engine: ${payload.config.engineVersion}`); + lines.push(""); + lines.push("Judges:"); + lines.push(` ${pad("NAME", NAME_WIDTH)}${pad("SCORE", SCORE_WIDTH)}WEIGHT`); + for (const judge of payload.judges) { + lines.push( + ` ${pad(judge.name, NAME_WIDTH)}${pad(formatScore(judge.score), SCORE_WIDTH)}${formatScore(judge.weight)}`, + ); + } + lines.push(""); + lines.push(`Thread: ${payload.threadId}`); + lines.push(`Run: ${runHash}`); + return `${lines.join("\n")}\n`; +} + +/** Render a side-by-side comparison of two eval runs. */ +export function formatDiff( + payloadA: EvalRunPayload, + hashA: string, + payloadB: EvalRunPayload, + hashB: string, +): string { + const lines: string[] = []; + lines.push("=== Eval Diff ==="); + lines.push(`A: ${hashA} (${payloadA.task})`); + lines.push(`B: ${hashB} (${payloadB.task})`); + lines.push(""); + + const overallDelta = payloadB.overall - payloadA.overall; + lines.push("Overall:"); + lines.push( + ` A=${formatScore(payloadA.overall)} B=${formatScore(payloadB.overall)} ${formatDelta(overallDelta)}`, + ); + lines.push(""); + + lines.push("Config:"); + lines.push(configLine("Agent", payloadA.config.agent, payloadB.config.agent)); + lines.push(configLine("Model", payloadA.config.model, payloadB.config.model)); + lines.push(configLine("Engine", payloadA.config.engineVersion, payloadB.config.engineVersion)); + lines.push(""); + + lines.push("Judges:"); + lines.push(` ${pad("NAME", NAME_WIDTH)}${pad("A", SCORE_WIDTH)}${pad("B", SCORE_WIDTH)}DELTA`); + const scoresA = new Map(payloadA.judges.map((judge) => [judge.name, judge.score])); + const scoresB = new Map(payloadB.judges.map((judge) => [judge.name, judge.score])); + for (const name of unionJudgeNames(payloadA, payloadB)) { + const scoreA = scoresA.get(name); + const scoreB = scoresB.get(name); + const cellA = scoreA === undefined ? "—" : formatScore(scoreA); + const cellB = scoreB === undefined ? "—" : formatScore(scoreB); + const delta = scoreA !== undefined && scoreB !== undefined ? formatDelta(scoreB - scoreA) : ""; + lines.push( + ` ${pad(name, NAME_WIDTH)}${pad(cellA, SCORE_WIDTH)}${pad(cellB, SCORE_WIDTH)}${delta}`, + ); + } + return `${lines.join("\n")}\n`; +} + +/** Render a table of indexed eval runs. */ +export function formatList(entries: ReadonlyArray): string { + const lines: string[] = []; + lines.push( + ` ${pad("TASK", NAME_WIDTH)}${pad("OVERALL", SCORE_WIDTH)}${pad("TIMESTAMP", TIMESTAMP_WIDTH)}HASH`, + ); + if (entries.length === 0) { + lines.push(" (no eval runs found)"); + } + for (const entry of entries) { + lines.push( + ` ${pad(entry.task, NAME_WIDTH)}${pad(formatScore(entry.overall), SCORE_WIDTH)}${pad(formatTimestamp(entry.timestamp), TIMESTAMP_WIDTH)}${entry.hash}`, + ); + } + return `${lines.join("\n")}\n`; +} + +/** Sort newest-first, then apply optional task filter and result limit. */ +export function selectEntries( + entries: ReadonlyArray, + task: string | null, + limit: number | null, +): EvalListEntry[] { + const sorted = [...entries].sort((a, b) => b.timestamp - a.timestamp); + const filtered = task !== null ? sorted.filter((entry) => entry.task === task) : sorted; + return limit !== null ? filtered.slice(0, limit) : filtered; +} + +/** Ordered union of judge names: A's order first, then B-only names. */ +function unionJudgeNames(payloadA: EvalRunPayload, payloadB: EvalRunPayload): string[] { + const names: string[] = []; + const seen = new Set(); + for (const judge of [...payloadA.judges, ...payloadB.judges]) { + if (!seen.has(judge.name)) { + seen.add(judge.name); + names.push(judge.name); + } + } + return names; +} + +/** One config row: `=` when equal, `≠` otherwise. */ +function configLine(label: string, valueA: string, valueB: string): string { + const marker = valueA === valueB ? "=" : "≠"; + return ` ${pad(`${label}:`, SCORE_WIDTH)}${marker} A=${valueA} B=${valueB}`; +} diff --git a/packages/eval/src/commands/index.ts b/packages/eval/src/commands/index.ts index 0dded07..cec7ad6 100644 --- a/packages/eval/src/commands/index.ts +++ b/packages/eval/src/commands/index.ts @@ -1,4 +1,7 @@ export { registerDiffCommand } from "./diff.js"; +export { formatDiff, formatList, formatReport, selectEntries } from "./format.js"; export { registerListCommand } from "./list.js"; +export { readEvalEntries, readEvalRun } from "./read.js"; export { registerReportCommand } from "./report.js"; export { registerRunCommand } from "./run.js"; +export type { EvalListEntry } from "./types.js"; diff --git a/packages/eval/src/commands/list.ts b/packages/eval/src/commands/list.ts index 9c4bd0c..1556f94 100644 --- a/packages/eval/src/commands/list.ts +++ b/packages/eval/src/commands/list.ts @@ -1,13 +1,43 @@ +import { createLogger } from "@united-workforce/util"; import type { Command } from "commander"; +import { createEvalStore } from "../storage/index.js"; +import { formatList, selectEntries } from "./format.js"; +import { readEvalEntries } from "./read.js"; + +const log = createLogger({ sink: { kind: "stderr" } }); +const LOG_LIST = "L5KX9R2B"; + +type ListCliOptions = { + task: string | undefined; + limit: string; +}; + export function registerListCommand(program: Command): void { program .command("list") .description("List past eval runs") .option("--task ", "filter by task name") .option("--limit ", "max results", "20") - .action(async (_opts: Record) => { - process.stderr.write("uwf-eval list: not yet implemented\n"); - process.exitCode = 1; + .action(async (opts: ListCliOptions) => { + const limit = Number.parseInt(opts.limit, 10); + if (!Number.isInteger(limit) || limit < 1) { + process.stderr.write("--limit must be a positive integer\n"); + process.exitCode = 1; + return; + } + + try { + const evalStore = await createEvalStore(); + const entries = readEvalEntries(evalStore); + const task = opts.task ?? null; + const selected = selectEntries(entries, task, limit); + log(LOG_LIST, `list task=${task ?? "*"} found=${entries.length} shown=${selected.length}`); + process.stdout.write(formatList(selected)); + } catch (e) { + const message = e instanceof Error ? e.message : String(e); + process.stderr.write(`${message}\n`); + process.exitCode = 1; + } }); } diff --git a/packages/eval/src/commands/read.ts b/packages/eval/src/commands/read.ts new file mode 100644 index 0000000..a44bbec --- /dev/null +++ b/packages/eval/src/commands/read.ts @@ -0,0 +1,41 @@ +import type { EvalRunPayload, EvalStore } from "../storage/index.js"; +import type { EvalListEntry } from "./types.js"; + +/** Variable prefix and suffix for eval run pointers (`@uwf/eval//latest`). */ +const EVAL_VAR_PREFIX = "@uwf/eval/"; +const EVAL_VAR_SUFFIX = "/latest"; + +/** Read a single eval-run payload from CAS. Returns null when the node is absent. */ +export function readEvalRun(evalStore: EvalStore, hash: string): EvalRunPayload | null { + const node = evalStore.store.cas.get(hash); + if (node === null) { + return null; + } + return node.payload as EvalRunPayload; +} + +/** + * Read every indexed eval run by scanning `@uwf/eval/*\/latest` variables and + * loading the referenced CAS node. Dangling pointers are skipped. + */ +export function readEvalEntries(evalStore: EvalStore): EvalListEntry[] { + const { store, varStore } = evalStore; + const entries: EvalListEntry[] = []; + for (const variable of varStore.list()) { + if (!variable.name.startsWith(EVAL_VAR_PREFIX) || !variable.name.endsWith(EVAL_VAR_SUFFIX)) { + continue; + } + const node = store.cas.get(variable.value); + if (node === null) { + continue; + } + const payload = node.payload as EvalRunPayload; + entries.push({ + task: payload.task, + overall: payload.overall, + timestamp: payload.timestamp, + hash: variable.value, + }); + } + return entries; +} diff --git a/packages/eval/src/commands/report.ts b/packages/eval/src/commands/report.ts index db3e8d6..32c6551 100644 --- a/packages/eval/src/commands/report.ts +++ b/packages/eval/src/commands/report.ts @@ -1,11 +1,32 @@ +import { createLogger } from "@united-workforce/util"; import type { Command } from "commander"; +import { createEvalStore } from "../storage/index.js"; +import { formatReport } from "./format.js"; +import { readEvalRun } from "./read.js"; + +const log = createLogger({ sink: { kind: "stderr" } }); +const LOG_REPORT = "R7QP2M4K"; + export function registerReportCommand(program: Command): void { program .command("report ") .description("Show eval run results") - .action(async (_hash: string) => { - process.stderr.write("uwf-eval report: not yet implemented\n"); - process.exitCode = 1; + .action(async (hash: string) => { + try { + const evalStore = await createEvalStore(); + const payload = readEvalRun(evalStore, hash); + if (payload === null) { + process.stderr.write(`eval run not found: ${hash}\n`); + process.exitCode = 1; + return; + } + log(LOG_REPORT, `report task=${payload.task} hash=${hash}`); + process.stdout.write(formatReport(payload, hash)); + } catch (e) { + const message = e instanceof Error ? e.message : String(e); + process.stderr.write(`${message}\n`); + process.exitCode = 1; + } }); } diff --git a/packages/eval/src/commands/types.ts b/packages/eval/src/commands/types.ts new file mode 100644 index 0000000..2603bcc --- /dev/null +++ b/packages/eval/src/commands/types.ts @@ -0,0 +1,9 @@ +import type { CasRef } from "@united-workforce/protocol"; + +/** Summary row for the `list` command: one indexed eval run. */ +export type EvalListEntry = { + task: string; + overall: number; + timestamp: number; + hash: CasRef; +};