feat: eval package scaffold — CLI + schemas + types + task loader #85

Merged
xiaomo merged 4 commits from feat/69-eval-scaffold into main 2026-06-05 00:23:57 +00:00
21 changed files with 675 additions and 1 deletions
Showing only changes of commit 99619d85db - Show all commits
+63
View File
@@ -0,0 +1,63 @@
import { describe, expect, test } from "vitest";
import {
EVAL_JUDGE_FRONTMATTER_SCHEMA,
EVAL_JUDGE_HALLUCINATION_SCHEMA,
EVAL_JUDGE_TOKEN_STATS_SCHEMA,
EVAL_JUDGE_UPSTREAM_SCHEMA,
EVAL_RUN_SCHEMA,
} from "../src/storage/index.js";
/** Sanity checks for the exported schema constants: title, required keys, and root type. */
describe("OCAS schema definitions", () => {
  // Asserts that every key in `keys` is listed in the schema's `required` array.
  const expectRequired = (schema: typeof EVAL_RUN_SCHEMA, keys: readonly string[]): void => {
    const required = schema.required as string[];
    for (const key of keys) {
      expect(required).toContain(key);
    }
  };

  test("eval-run schema has correct title and required fields", () => {
    expect(EVAL_RUN_SCHEMA.title).toBe("@uwf/eval-run");
    expectRequired(EVAL_RUN_SCHEMA, [
      "task",
      "config",
      "threadId",
      "judges",
      "overall",
      "timestamp",
    ]);
  });

  test("frontmatter judge schema has correct title", () => {
    expect(EVAL_JUDGE_FRONTMATTER_SCHEMA.title).toBe("@uwf/eval-judge-frontmatter");
    expectRequired(EVAL_JUDGE_FRONTMATTER_SCHEMA, ["stepsTotal", "stepsValid", "invalidSteps"]);
  });

  test("upstream judge schema has correct title", () => {
    expect(EVAL_JUDGE_UPSTREAM_SCHEMA.title).toBe("@uwf/eval-judge-upstream");
    expectRequired(EVAL_JUDGE_UPSTREAM_SCHEMA, ["perStep"]);
  });

  test("hallucination judge schema has correct title", () => {
    expect(EVAL_JUDGE_HALLUCINATION_SCHEMA.title).toBe("@uwf/eval-judge-hallucination");
    expectRequired(EVAL_JUDGE_HALLUCINATION_SCHEMA, ["perStep"]);
  });

  test("token-stats judge schema has correct title", () => {
    expect(EVAL_JUDGE_TOKEN_STATS_SCHEMA.title).toBe("@uwf/eval-judge-token-stats");
    expectRequired(EVAL_JUDGE_TOKEN_STATS_SCHEMA, [
      "totalInput",
      "totalOutput",
      "totalTurns",
      "perStep",
    ]);
  });

  test("all schemas have type object at root", () => {
    const rootTypes = [
      EVAL_RUN_SCHEMA,
      EVAL_JUDGE_FRONTMATTER_SCHEMA,
      EVAL_JUDGE_UPSTREAM_SCHEMA,
      EVAL_JUDGE_HALLUCINATION_SCHEMA,
      EVAL_JUDGE_TOKEN_STATS_SCHEMA,
    ].map((schema) => schema.type);
    for (const rootType of rootTypes) {
      expect(rootType).toBe("object");
    }
  });
});
+163
View File
@@ -0,0 +1,163 @@
import { describe, expect, test } from "vitest";
import { parseTaskManifest } from "../src/task/index.js";
/**
 * Complete task.yaml fixture: explicit limits, one builtin judge and one
 * custom judge with `entry` + `schema`.
 *
 * The indentation inside the literal is significant: keys nested under
 * `limits:` and under each `judges` item must stay indented, otherwise they
 * parse as top-level keys.
 */
const VALID_YAML = `
name: fix-off-by-one
description: Fix an off-by-one error in a calculator
workflow: solve-issue
prompt: "Fix the bug: add(1,2) returns 4 instead of 3"
limits:
  maxSteps: 15
  timeoutMinutes: 30
judges:
  - name: frontmatter-compliance
    weight: 0.15
    builtin: true
  - name: test-pass
    weight: 0.3
    entry: dist/judges/test-pass.js
    schema: schemas/test-pass.json
`;
/**
 * Behavioural tests for `parseTaskManifest`: field parsing, defaults, and the
 * validation errors raised for malformed task.yaml documents.
 *
 * The inline YAML fixtures are whitespace-sensitive: keys belonging to a
 * judge must stay indented under its `- name:` item, otherwise they parse as
 * top-level keys and the test exercises the wrong code path.
 */
describe("parseTaskManifest", () => {
  test("parses valid task.yaml", () => {
    const manifest = parseTaskManifest(VALID_YAML);
    expect(manifest.name).toBe("fix-off-by-one");
    expect(manifest.description).toBe("Fix an off-by-one error in a calculator");
    expect(manifest.workflow).toBe("solve-issue");
    expect(manifest.prompt).toBe("Fix the bug: add(1,2) returns 4 instead of 3");
    expect(manifest.limits).toEqual({ maxSteps: 15, timeoutMinutes: 30 });
    expect(manifest.judges).toHaveLength(2);
  });

  test("parses builtin judge", () => {
    const manifest = parseTaskManifest(VALID_YAML);
    // toMatchObject fails on undefined, so no non-null assertion is needed.
    expect(manifest.judges[0]).toMatchObject({
      name: "frontmatter-compliance",
      weight: 0.15,
      builtin: true,
      entry: null,
    });
  });

  test("parses custom judge with entry + schema", () => {
    const manifest = parseTaskManifest(VALID_YAML);
    expect(manifest.judges[1]).toMatchObject({
      name: "test-pass",
      weight: 0.3,
      builtin: false,
      entry: "dist/judges/test-pass.js",
      schema: "schemas/test-pass.json",
    });
  });

  test("defaults limits when omitted", () => {
    const yaml = `
name: minimal
workflow: solve-issue
prompt: do something
judges:
  - name: check
    builtin: true
`;
    const manifest = parseTaskManifest(yaml);
    expect(manifest.limits).toEqual({ maxSteps: 20, timeoutMinutes: 30 });
  });

  test("defaults description to empty string", () => {
    const yaml = `
name: no-desc
workflow: solve-issue
prompt: do something
judges:
  - name: check
    builtin: true
`;
    const manifest = parseTaskManifest(yaml);
    expect(manifest.description).toBe("");
  });

  test("rejects missing name", () => {
    const yaml = `
workflow: solve-issue
prompt: do something
judges:
  - name: check
    builtin: true
`;
    // Pinned to the manifest-level message so a judge-level "name is required"
    // error cannot satisfy this test by accident.
    expect(() => parseTaskManifest(yaml)).toThrow("task.yaml: name is required");
  });

  test("rejects missing workflow", () => {
    const yaml = `
name: test
prompt: do something
judges:
  - name: check
    builtin: true
`;
    expect(() => parseTaskManifest(yaml)).toThrow("workflow is required");
  });

  test("rejects missing prompt", () => {
    const yaml = `
name: test
workflow: solve-issue
judges:
  - name: check
    builtin: true
`;
    expect(() => parseTaskManifest(yaml)).toThrow("prompt is required");
  });

  test("rejects empty judges array", () => {
    const yaml = `
name: test
workflow: solve-issue
prompt: do something
judges: []
`;
    expect(() => parseTaskManifest(yaml)).toThrow("at least one judge");
  });

  test("rejects non-builtin judge without entry", () => {
    const yaml = `
name: test
workflow: solve-issue
prompt: do something
judges:
  - name: custom-check
    weight: 0.5
`;
    expect(() => parseTaskManifest(yaml)).toThrow("non-builtin judge must have entry");
  });

  test("rejects non-object YAML root", () => {
    expect(() => parseTaskManifest("just a string")).toThrow("must be a YAML mapping");
  });

  test("rejects judge without name", () => {
    const yaml = `
name: test
workflow: solve-issue
prompt: do something
judges:
  - weight: 0.5
    builtin: true
`;
    // Pinned to the judge-level message; the bare "name is required" substring
    // would also match the manifest-level error.
    expect(() => parseTaskManifest(yaml)).toThrow("judges[0]: name is required");
  });

  test("defaults weight to 0 when omitted", () => {
    const yaml = `
name: test
workflow: solve-issue
prompt: do something
judges:
  - name: token-stats
    builtin: true
`;
    const manifest = parseTaskManifest(yaml);
    expect(manifest.judges[0]?.weight).toBe(0);
  });
});
+46
View File
@@ -0,0 +1,46 @@
{
"name": "@united-workforce/eval",
"version": "0.1.0",
"private": true,
"files": [
"src",
"dist",
"package.json"
],
"type": "module",
"bin": {
"uwf-eval": "./dist/cli.js"
},
"exports": {
".": {
"types": "./dist/index.d.ts",
"import": "./dist/index.js"
}
},
"scripts": {
"prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1",
"test": "vitest run __tests__/",
"test:ci": "vitest run __tests__/"
},
"dependencies": {
"@ocas/core": "^0.3.0",
"@ocas/fs": "^0.3.0",
"@united-workforce/protocol": "workspace:^",
"@united-workforce/util": "workspace:^",
"commander": "^14.0.3",
"yaml": "^2.9.0"
},
"devDependencies": {
"typescript": "^5.8.3"
},
"repository": {
"type": "git",
"url": "https://git.shazhou.work/shazhou/united-workforce.git",
"directory": "packages/eval"
},
"homepage": "https://git.shazhou.work/shazhou/united-workforce#readme",
"bugs": {
"url": "https://git.shazhou.work/shazhou/united-workforce/issues"
},
"license": "MIT"
}
+22
View File
@@ -0,0 +1,22 @@
#!/usr/bin/env node
import { Command } from "commander";
import {
registerDiffCommand,
registerListCommand,
registerReportCommand,
registerRunCommand,
} from "./commands/index.js";
// Root command; each register* helper attaches one subcommand to it.
const program = new Command();
program
  .name("uwf-eval")
  .description("Evaluate uwf workflow quality with real agents")
  .version("0.1.0");

registerRunCommand(program);
registerReportCommand(program);
registerDiffCommand(program);
registerListCommand(program);

// Every subcommand action is async, so parse with parseAsync(): it waits for
// the selected action to settle, whereas parse() returns before the action's
// promise resolves. A rejected action is reported on stderr and turned into a
// failing exit code instead of surfacing as an unhandled rejection.
program.parseAsync().catch((error: unknown) => {
  const message = error instanceof Error ? error.message : String(error);
  process.stderr.write(`uwf-eval: ${message}\n`);
  process.exitCode = 1;
});
+11
View File
@@ -0,0 +1,11 @@
import type { Command } from "commander";
/**
 * Attach the `diff <hash1> <hash2>` subcommand to the CLI.
 *
 * Placeholder: the action only writes a "not yet implemented" notice to
 * stderr and sets the process exit code to 1.
 */
export function registerDiffCommand(program: Command): void {
  const reportUnimplemented = async (_hash1: string, _hash2: string): Promise<void> => {
    process.stderr.write("uwf-eval diff: not yet implemented\n");
    process.exitCode = 1;
  };

  const diffCommand = program.command("diff <hash1> <hash2>");
  diffCommand.description("Compare two eval runs side-by-side");
  diffCommand.action(reportUnimplemented);
}
+4
View File
@@ -0,0 +1,4 @@
export { registerDiffCommand } from "./diff.js";
export { registerListCommand } from "./list.js";
export { registerReportCommand } from "./report.js";
export { registerRunCommand } from "./run.js";
+13
View File
@@ -0,0 +1,13 @@
import type { Command } from "commander";
/**
 * Attach the `list` subcommand, with its `--task` and `--limit` options, to the CLI.
 *
 * Placeholder: the action only writes a "not yet implemented" notice to
 * stderr and sets the process exit code to 1.
 */
export function registerListCommand(program: Command): void {
  const reportUnimplemented = async (_opts: Record<string, unknown>): Promise<void> => {
    process.stderr.write("uwf-eval list: not yet implemented\n");
    process.exitCode = 1;
  };

  const listCommand = program.command("list");
  listCommand.description("List past eval runs");
  listCommand.option("--task <name>", "filter by task name");
  // The default is the string "20"; no numeric parser is attached to the option.
  listCommand.option("--limit <n>", "max results", "20");
  listCommand.action(reportUnimplemented);
}
}
+11
View File
@@ -0,0 +1,11 @@
import type { Command } from "commander";
/**
 * Attach the `report <hash>` subcommand to the CLI.
 *
 * Placeholder: the action only writes a "not yet implemented" notice to
 * stderr and sets the process exit code to 1.
 */
export function registerReportCommand(program: Command): void {
  const reportUnimplemented = async (_hash: string): Promise<void> => {
    process.stderr.write("uwf-eval report: not yet implemented\n");
    process.exitCode = 1;
  };

  const reportCommand = program.command("report <hash>");
  reportCommand.description("Show eval run results");
  reportCommand.action(reportUnimplemented);
}
}
+14
View File
@@ -0,0 +1,14 @@
import type { Command } from "commander";
/**
 * Attach the `run <task>` subcommand, with its `--agent`, `--model` and
 * `--count` options, to the CLI.
 *
 * Placeholder: the action only writes a "not yet implemented" notice to
 * stderr and sets the process exit code to 1.
 */
export function registerRunCommand(program: Command): void {
  const reportUnimplemented = async (
    _task: string,
    _opts: Record<string, unknown>,
  ): Promise<void> => {
    process.stderr.write("uwf-eval run: not yet implemented\n");
    process.exitCode = 1;
  };

  const runCommand = program.command("run <task>");
  runCommand.description("Run eval on a task directory or tarball");
  runCommand.option("--agent <name>", "agent adapter to use", "hermes");
  runCommand.option("--model <model>", "model override");
  // The default is the string "1"; no numeric parser is attached to the option.
  runCommand.option("--count <n>", "number of eval runs", "1");
  runCommand.action(reportUnimplemented);
}
}
+15
View File
@@ -0,0 +1,15 @@
// Judge types
export type { JudgeInput, JudgeOutput } from "./judge/index.js";
// Storage types and schemas
export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload } from "./storage/index.js";
export {
EVAL_JUDGE_FRONTMATTER_SCHEMA,
EVAL_JUDGE_HALLUCINATION_SCHEMA,
EVAL_JUDGE_TOKEN_STATS_SCHEMA,
EVAL_JUDGE_UPSTREAM_SCHEMA,
EVAL_RUN_SCHEMA,
} from "./storage/index.js";
// Task manifest types and loader
export type { JudgeEntry, TaskLimits, TaskManifest } from "./task/index.js";
export { loadTaskManifest, parseTaskManifest } from "./task/index.js";
+1
View File
@@ -0,0 +1 @@
export type { JudgeInput, JudgeOutput } from "./types.js";
+15
View File
@@ -0,0 +1,15 @@
/**
 * Output shape every judge must produce on stdout (JSON).
 *
 * @typeParam T - Shape of the judge-specific `data` payload; defaults to `unknown`.
 */
export type JudgeOutput<T = unknown> = {
/** Score between 0.0 and 1.0. */
score: number;
/** Judge-specific structured data, stored in CAS with its own schema. */
data: T;
};
/**
 * Input context passed to judge scripts via argv.
 *
 * NOTE(review): how these fields are encoded on the command line is not
 * defined in this file — confirm against the judge runner.
 */
export type JudgeInput = {
/** Working directory where the task was executed. */
cwd: string;
/** Thread ID of the eval run. */
threadId: string;
};
+8
View File
@@ -0,0 +1,8 @@
export {
EVAL_JUDGE_FRONTMATTER_SCHEMA,
EVAL_JUDGE_HALLUCINATION_SCHEMA,
EVAL_JUDGE_TOKEN_STATS_SCHEMA,
EVAL_JUDGE_UPSTREAM_SCHEMA,
EVAL_RUN_SCHEMA,
} from "./schemas.js";
export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload } from "./types.js";
+123
View File
@@ -0,0 +1,123 @@
import type { JSONSchema } from "@ocas/core";
/**
 * JSON Schema (title `@uwf/eval-run`) for an eval run record.
 *
 * Requires `task`, `config`, `threadId`, `judges`, `overall` and `timestamp`;
 * the field set matches the `EvalRunPayload` type in `./types.ts`.
 */
export const EVAL_RUN_SCHEMA: JSONSchema = {
title: "@uwf/eval-run",
type: "object",
required: ["task", "config", "threadId", "judges", "overall", "timestamp"],
properties: {
task: { type: "string" },
// Config snapshot: all three string fields are required.
config: {
type: "object",
required: ["agent", "model", "engineVersion"],
properties: {
agent: { type: "string" },
model: { type: "string" },
engineVersion: { type: "string" },
},
},
threadId: { type: "string" },
// One record per judge; `dataHash` is typed as a plain string here.
judges: {
type: "array",
items: {
type: "object",
required: ["name", "score", "weight", "dataHash"],
properties: {
name: { type: "string" },
score: { type: "number" },
weight: { type: "number" },
dataHash: { type: "string" },
},
},
},
overall: { type: "number" },
// Integer timestamp; the unit is not stated in this schema.
timestamp: { type: "integer" },
},
};
/**
 * JSON Schema (title `@uwf/eval-judge-frontmatter`) for the frontmatter
 * judge's data: integer step counts plus a list of invalid steps, each with
 * its index, role and error strings.
 */
export const EVAL_JUDGE_FRONTMATTER_SCHEMA: JSONSchema = {
title: "@uwf/eval-judge-frontmatter",
type: "object",
required: ["stepsTotal", "stepsValid", "invalidSteps"],
properties: {
stepsTotal: { type: "integer" },
stepsValid: { type: "integer" },
// One entry per invalid step.
invalidSteps: {
type: "array",
items: {
type: "object",
required: ["stepIndex", "role", "errors"],
properties: {
stepIndex: { type: "integer" },
role: { type: "string" },
errors: { type: "array", items: { type: "string" } },
},
},
},
},
};
/**
 * JSON Schema (title `@uwf/eval-judge-upstream`) for the upstream judge's
 * data: a required `perStep` array whose items carry a role, `consumed` and
 * `missed` string lists, and a numeric score.
 */
export const EVAL_JUDGE_UPSTREAM_SCHEMA: JSONSchema = {
title: "@uwf/eval-judge-upstream",
type: "object",
required: ["perStep"],
properties: {
perStep: {
type: "array",
items: {
type: "object",
required: ["role", "consumed", "missed", "score"],
properties: {
role: { type: "string" },
consumed: { type: "array", items: { type: "string" } },
missed: { type: "array", items: { type: "string" } },
score: { type: "number" },
},
},
},
},
};
/**
 * JSON Schema (title `@uwf/eval-judge-hallucination`) for the hallucination
 * judge's data: a required `perStep` array whose items carry a role, a list
 * of hallucination strings, and a numeric score.
 */
export const EVAL_JUDGE_HALLUCINATION_SCHEMA: JSONSchema = {
title: "@uwf/eval-judge-hallucination",
type: "object",
required: ["perStep"],
properties: {
perStep: {
type: "array",
items: {
type: "object",
required: ["role", "hallucinations", "score"],
properties: {
role: { type: "string" },
hallucinations: { type: "array", items: { type: "string" } },
score: { type: "number" },
},
},
},
},
};
/**
 * JSON Schema (title `@uwf/eval-judge-token-stats`) for the token-stats
 * judge's data: integer totals for input, output and turns, plus a `perStep`
 * array with the same counters per role.
 */
export const EVAL_JUDGE_TOKEN_STATS_SCHEMA: JSONSchema = {
title: "@uwf/eval-judge-token-stats",
type: "object",
required: ["totalInput", "totalOutput", "totalTurns", "perStep"],
properties: {
totalInput: { type: "integer" },
totalOutput: { type: "integer" },
totalTurns: { type: "integer" },
perStep: {
type: "array",
items: {
type: "object",
required: ["role", "inputTokens", "outputTokens", "turns", "duration"],
properties: {
role: { type: "string" },
inputTokens: { type: "integer" },
outputTokens: { type: "integer" },
turns: { type: "integer" },
// Only field typed `number` rather than `integer`; unit not stated here — TODO confirm.
duration: { type: "number" },
},
},
},
},
};
+26
View File
@@ -0,0 +1,26 @@
import type { CasRef } from "@united-workforce/protocol";
/**
 * A single judge result within an eval run.
 *
 * Same field names as the `judges` items in `EVAL_RUN_SCHEMA`.
 */
export type EvalJudgeRecord = {
/** Judge name. */
name: string;
/** Score recorded for this judge. */
score: number;
/** Weight recorded for this judge. */
weight: number;
/** CAS reference; presumably points at the judge's stored data payload — TODO confirm. */
dataHash: CasRef;
};
/**
 * Config snapshot for an eval run.
 *
 * Same field names as the `config` object in `EVAL_RUN_SCHEMA`, where all
 * three are required strings.
 */
export type EvalRunConfig = {
agent: string;
model: string;
engineVersion: string;
};
/**
 * Full eval run record stored in CAS.
 *
 * Same field names as the top-level properties of `EVAL_RUN_SCHEMA`.
 */
export type EvalRunPayload = {
task: string;
config: EvalRunConfig;
threadId: string;
/** One record per judge. */
judges: EvalJudgeRecord[];
overall: number;
/** Declared as `integer` in EVAL_RUN_SCHEMA; unit not stated here — TODO confirm. */
timestamp: number;
};
+2
View File
@@ -0,0 +1,2 @@
export { loadTaskManifest, parseTaskManifest } from "./loader.js";
export type { JudgeEntry, TaskLimits, TaskManifest } from "./types.js";
+74
View File
@@ -0,0 +1,74 @@
import { readFile } from "node:fs/promises";
import { join } from "node:path";
import { parse as parseYaml } from "yaml";
import type { JudgeEntry, TaskLimits, TaskManifest } from "./types.js";
/** Type guard: true for any non-null, non-array value whose `typeof` is "object". */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
/**
 * Validate one element of the `judges` list and normalise it into a JudgeEntry.
 *
 * - `name` must be a non-empty string.
 * - `weight` defaults to 0 when omitted (or null); when present it must be a
 *   finite, non-negative number. Previously any non-number was silently
 *   replaced by 0 and NaN/Infinity/negative values were accepted.
 * - `builtin` is true only for the literal boolean `true`.
 * - `entry` and `schema` are kept when they are strings, otherwise null.
 *
 * @param raw - Parsed YAML value for this list element.
 * @param index - Position in the `judges` list, used in error messages.
 * @returns The normalised judge entry.
 * @throws Error when the element is not a mapping, has no name, has an
 *   invalid weight, or is non-builtin without an `entry`.
 */
function parseJudgeEntry(raw: unknown, index: number): JudgeEntry {
  if (!isRecord(raw)) {
    throw new Error(`judges[${index}]: expected object`);
  }

  const name = raw.name;
  if (typeof name !== "string" || name === "") {
    throw new Error(`judges[${index}]: name is required`);
  }

  // Only a genuinely absent weight falls back to 0; a present-but-invalid one
  // is an authoring mistake and must not silently zero the judge.
  const rawWeight = raw.weight;
  let weight = 0;
  if (rawWeight !== undefined && rawWeight !== null) {
    if (typeof rawWeight !== "number" || !Number.isFinite(rawWeight) || rawWeight < 0) {
      throw new Error(`judges[${index}] "${name}": weight must be a finite, non-negative number`);
    }
    weight = rawWeight;
  }

  const builtin = raw.builtin === true;
  const entry = typeof raw.entry === "string" ? raw.entry : null;
  const schema = typeof raw.schema === "string" ? raw.schema : null;

  if (!builtin && entry === null) {
    throw new Error(`judges[${index}] "${name}": non-builtin judge must have entry`);
  }

  return { name, weight, builtin, entry, schema };
}
/**
 * Read one numeric limit, falling back to `fallback` only when it is absent.
 *
 * @throws Error when the value is present but not a finite number greater than 0.
 */
function readPositiveLimit(value: unknown, field: string, fallback: number): number {
  if (value === undefined || value === null) {
    return fallback;
  }
  if (typeof value !== "number" || !Number.isFinite(value) || value <= 0) {
    throw new Error(`task.yaml: limits.${field} must be a positive number`);
  }
  return value;
}

/**
 * Normalise the optional `limits` section of task.yaml.
 *
 * An omitted section, or an omitted field inside it, yields the defaults
 * (`maxSteps: 20`, `timeoutMinutes: 30`). A value that is present but unusable
 * is rejected rather than silently replaced by a default: a non-mapping
 * `limits`, a non-number, NaN/Infinity, zero or a negative number.
 *
 * @param raw - Parsed YAML value of the `limits` key (may be undefined).
 * @returns Limits with every field populated.
 * @throws Error when `limits` or one of its fields is present but invalid.
 */
function parseLimits(raw: unknown): TaskLimits {
  if (raw === undefined || raw === null) {
    return { maxSteps: 20, timeoutMinutes: 30 };
  }
  if (!isRecord(raw)) {
    throw new Error("task.yaml: limits must be a mapping");
  }
  return {
    maxSteps: readPositiveLimit(raw.maxSteps, "maxSteps", 20),
    timeoutMinutes: readPositiveLimit(raw.timeoutMinutes, "timeoutMinutes", 30),
  };
}
/**
 * Turn the text of a task.yaml file into a validated TaskManifest.
 *
 * Validation runs in a fixed order — root mapping, `name`, `workflow`,
 * `prompt`, `limits`, `judges` — and the first failing check throws.
 * `description` falls back to an empty string when it is not a string.
 *
 * @param yamlText - Raw contents of task.yaml.
 * @returns The parsed manifest.
 * @throws Error when the root is not a mapping, a required field is missing
 *   or empty, `judges` is not a non-empty array, or a judge entry is invalid.
 */
export function parseTaskManifest(yamlText: string): TaskManifest {
  const doc: unknown = parseYaml(yamlText);
  if (!isRecord(doc)) {
    throw new Error("task.yaml must be a YAML mapping");
  }
  const fields: Record<string, unknown> = doc;

  // Returns the value when it is a non-empty string, otherwise throws `message`.
  const requireText = (value: unknown, message: string): string => {
    if (typeof value === "string" && value !== "") {
      return value;
    }
    throw new Error(message);
  };

  const name = requireText(fields.name, "task.yaml: name is required");
  const workflow = requireText(fields.workflow, "task.yaml: workflow is required");
  const prompt = requireText(fields.prompt, "task.yaml: prompt is required");

  let description = "";
  if (typeof fields.description === "string") {
    description = fields.description;
  }

  const limits = parseLimits(fields.limits);

  const judgeList = fields.judges;
  if (!Array.isArray(judgeList) || judgeList.length === 0) {
    throw new Error("task.yaml: at least one judge is required");
  }
  const judges: JudgeEntry[] = judgeList.map((item: unknown, position: number) =>
    parseJudgeEntry(item, position),
  );

  return { name, description, workflow, prompt, limits, judges };
}
/**
 * Read `task.yaml` from `taskDir` as UTF-8 text and parse it into a TaskManifest.
 *
 * @param taskDir - Directory containing task.yaml.
 * @returns The parsed manifest; rejects if the file cannot be read or fails validation.
 */
export async function loadTaskManifest(taskDir: string): Promise<TaskManifest> {
  const manifestText = await readFile(join(taskDir, "task.yaml"), "utf8");
  return parseTaskManifest(manifestText);
}
+28
View File
@@ -0,0 +1,28 @@
/** Judge entry in task.yaml, as normalised by the task loader. */
export type JudgeEntry = {
/** Judge name; the loader rejects a missing or empty name. */
name: string;
/** Weight of this judge; the loader uses 0 when task.yaml omits it. */
weight: number;
/** True only when task.yaml sets `builtin: true`. */
builtin: boolean;
/** Path to judge entry script (relative to task root). Required for non-builtin judges. */
entry: string | null;
/**
 * Path to OCAS schema JSON for judge data. Required for non-builtin judges.
 *
 * NOTE(review): the loader only rejects a non-builtin judge that lacks
 * `entry`; a missing `schema` is currently accepted and stored as null —
 * confirm whether it should be enforced.
 */
schema: string | null;
};
/** Limits for eval execution. */
export type TaskLimits = {
/** Step limit; the loader uses 20 when task.yaml omits it. */
maxSteps: number;
/** Timeout in minutes; the loader uses 30 when task.yaml omits it. */
timeoutMinutes: number;
};
/** Parsed task.yaml manifest. */
export type TaskManifest = {
/** Task name; required and non-empty. */
name: string;
/** Free-text description; empty string when task.yaml has none. */
description: string;
/** Workflow name or relative path to .yaml file. */
workflow: string;
/** Initial prompt for thread start. */
prompt: string;
/** Execution limits, with loader defaults filled in. */
limits: TaskLimits;
/** Judges to run; the loader rejects an empty list. */
judges: JudgeEntry[];
};
+9
View File
@@ -0,0 +1,9 @@
{
"extends": "../../tsconfig.json",
"compilerOptions": {
"rootDir": "src",
"outDir": "dist"
},
"include": ["src"],
"references": [{ "path": "../protocol" }, { "path": "../util" }]
}
+25
View File
@@ -228,6 +228,31 @@ importers:
specifier: ^8.0.13
version: 8.0.16(@types/node@25.9.1)(esbuild@0.27.7)(jiti@2.7.0)(yaml@2.9.0)
packages/eval:
dependencies:
'@ocas/core':
specifier: ^0.3.0
version: 0.3.0
'@ocas/fs':
specifier: ^0.3.0
version: 0.3.0
'@united-workforce/protocol':
specifier: workspace:^
version: link:../protocol
'@united-workforce/util':
specifier: workspace:^
version: link:../util
commander:
specifier: ^14.0.3
version: 14.0.3
yaml:
specifier: ^2.9.0
version: 2.9.0
devDependencies:
typescript:
specifier: ^5.8.3
version: 5.9.3
packages/protocol:
dependencies:
'@ocas/core':
+2 -1
View File
@@ -25,6 +25,7 @@
{ "path": "packages/agent-builtin" },
{ "path": "packages/agent-mock" },
{ "path": "packages/agent-claude-code" },
{ "path": "packages/cli" }
{ "path": "packages/cli" },
{ "path": "packages/eval" }
]
}