feat: eval package scaffold with CLI, schemas, types, task loader

New package @united-workforce/eval (uwf-eval CLI): - CLI skeleton: run/report/diff/list subcommands (stubs) - 5 OCAS schemas: eval-run, judge-frontmatter, judge-upstream, judge-hallucination, judge-token-stats - TaskManifest type + parser/validator for task.yaml - JudgeOutput/JudgeInput types for judge contract - EvalRunPayload/EvalRunConfig/EvalJudgeRecord storage types - 19 unit tests: task loader validation + schema definitions Refs #69
2026-06-04 23:42:16 +00:00
parent b94234652a
commit 99619d85db
21 changed files with 675 additions and 1 deletions
@@ -0,0 +1,63 @@
 import { describe, expect, test } from "vitest";
 import {
  EVAL_JUDGE_FRONTMATTER_SCHEMA,
  EVAL_JUDGE_HALLUCINATION_SCHEMA,
  EVAL_JUDGE_TOKEN_STATS_SCHEMA,
  EVAL_JUDGE_UPSTREAM_SCHEMA,
  EVAL_RUN_SCHEMA,
 } from "../src/storage/index.js";
 describe("OCAS schema definitions", () => {
  // Asserts that every named field is listed in the schema's top-level `required` array.
  const expectRequired = (schema: typeof EVAL_RUN_SCHEMA, fields: readonly string[]): void => {
    const declared = schema.required as string[];
    for (const field of fields) {
      expect(declared).toContain(field);
    }
  };
  test("eval-run schema has correct title and required fields", () => {
    expect(EVAL_RUN_SCHEMA.title).toBe("@uwf/eval-run");
    expectRequired(EVAL_RUN_SCHEMA, ["task", "config", "threadId", "judges", "overall", "timestamp"]);
  });
  test("frontmatter judge schema has correct title", () => {
    expect(EVAL_JUDGE_FRONTMATTER_SCHEMA.title).toBe("@uwf/eval-judge-frontmatter");
    expectRequired(EVAL_JUDGE_FRONTMATTER_SCHEMA, ["stepsTotal", "stepsValid", "invalidSteps"]);
  });
  test("upstream judge schema has correct title", () => {
    expect(EVAL_JUDGE_UPSTREAM_SCHEMA.title).toBe("@uwf/eval-judge-upstream");
    expectRequired(EVAL_JUDGE_UPSTREAM_SCHEMA, ["perStep"]);
  });
  test("hallucination judge schema has correct title", () => {
    expect(EVAL_JUDGE_HALLUCINATION_SCHEMA.title).toBe("@uwf/eval-judge-hallucination");
    expectRequired(EVAL_JUDGE_HALLUCINATION_SCHEMA, ["perStep"]);
  });
  test("token-stats judge schema has correct title", () => {
    expect(EVAL_JUDGE_TOKEN_STATS_SCHEMA.title).toBe("@uwf/eval-judge-token-stats");
    expectRequired(EVAL_JUDGE_TOKEN_STATS_SCHEMA, [
      "totalInput",
      "totalOutput",
      "totalTurns",
      "perStep",
    ]);
  });
  test("all schemas have type object at root", () => {
    // Every exported schema describes an object-shaped record.
    [
      EVAL_RUN_SCHEMA,
      EVAL_JUDGE_FRONTMATTER_SCHEMA,
      EVAL_JUDGE_UPSTREAM_SCHEMA,
      EVAL_JUDGE_HALLUCINATION_SCHEMA,
      EVAL_JUDGE_TOKEN_STATS_SCHEMA,
    ].forEach((schema) => {
      expect(schema.type).toBe("object");
    });
  });
 });
@@ -0,0 +1,163 @@
 import { describe, expect, test } from "vitest";
 import { parseTaskManifest } from "../src/task/index.js";
 // Baseline manifest fixture shared by the happy-path tests: explicit limits plus
 // two judges — one builtin (no entry/schema) and one custom with entry + schema.
 const VALID_YAML = `
 name: fix-off-by-one
 description: Fix an off-by-one error in a calculator
 workflow: solve-issue
 prompt: "Fix the bug: add(1,2) returns 4 instead of 3"
 limits:
  maxSteps: 15
  timeoutMinutes: 30
 judges:
  - name: frontmatter-compliance
    weight: 0.15
    builtin: true
  - name: test-pass
    weight: 0.3
    entry: dist/judges/test-pass.js
    schema: schemas/test-pass.json
 `;
 // Covers parseTaskManifest: happy-path parsing, defaulting of optional fields,
 // and the error thrown for each required field that is missing or malformed.
 describe("parseTaskManifest", () => {
  // --- Happy path against the shared VALID_YAML fixture ---
  test("parses valid task.yaml", () => {
    const manifest = parseTaskManifest(VALID_YAML);
    expect(manifest.name).toBe("fix-off-by-one");
    expect(manifest.description).toBe("Fix an off-by-one error in a calculator");
    expect(manifest.workflow).toBe("solve-issue");
    expect(manifest.prompt).toBe("Fix the bug: add(1,2) returns 4 instead of 3");
    expect(manifest.limits).toEqual({ maxSteps: 15, timeoutMinutes: 30 });
    expect(manifest.judges).toHaveLength(2);
  });
  test("parses builtin judge", () => {
    const manifest = parseTaskManifest(VALID_YAML);
    const builtin = manifest.judges[0];
    expect(builtin).toBeDefined();
    expect(builtin!.name).toBe("frontmatter-compliance");
    expect(builtin!.weight).toBe(0.15);
    expect(builtin!.builtin).toBe(true);
    // A builtin judge without an `entry` key is normalized to null, not undefined.
    expect(builtin!.entry).toBeNull();
  });
  test("parses custom judge with entry + schema", () => {
    const manifest = parseTaskManifest(VALID_YAML);
    const custom = manifest.judges[1];
    expect(custom).toBeDefined();
    expect(custom!.name).toBe("test-pass");
    expect(custom!.weight).toBe(0.3);
    // `builtin` is false whenever the key is absent.
    expect(custom!.builtin).toBe(false);
    expect(custom!.entry).toBe("dist/judges/test-pass.js");
    expect(custom!.schema).toBe("schemas/test-pass.json");
  });
  // --- Defaults for optional top-level fields ---
  test("defaults limits when omitted", () => {
    const yaml = `
 name: minimal
 workflow: solve-issue
 prompt: do something
 judges:
  - name: check
    builtin: true
 `;
    const manifest = parseTaskManifest(yaml);
    expect(manifest.limits).toEqual({ maxSteps: 20, timeoutMinutes: 30 });
  });
  test("defaults description to empty string", () => {
    const yaml = `
 name: no-desc
 workflow: solve-issue
 prompt: do something
 judges:
  - name: check
    builtin: true
 `;
    const manifest = parseTaskManifest(yaml);
    expect(manifest.description).toBe("");
  });
  // --- Required top-level fields ---
  test("rejects missing name", () => {
    const yaml = `
 workflow: solve-issue
 prompt: do something
 judges:
  - name: check
    builtin: true
 `;
    expect(() => parseTaskManifest(yaml)).toThrow("name is required");
  });
  test("rejects missing workflow", () => {
    const yaml = `
 name: test
 prompt: do something
 judges:
  - name: check
    builtin: true
 `;
    expect(() => parseTaskManifest(yaml)).toThrow("workflow is required");
  });
  test("rejects missing prompt", () => {
    const yaml = `
 name: test
 workflow: solve-issue
 judges:
  - name: check
    builtin: true
 `;
    expect(() => parseTaskManifest(yaml)).toThrow("prompt is required");
  });
  // --- Judge list validation ---
  test("rejects empty judges array", () => {
    const yaml = `
 name: test
 workflow: solve-issue
 prompt: do something
 judges: []
 `;
    expect(() => parseTaskManifest(yaml)).toThrow("at least one judge");
  });
  test("rejects non-builtin judge without entry", () => {
    const yaml = `
 name: test
 workflow: solve-issue
 prompt: do something
 judges:
  - name: custom-check
    weight: 0.5
 `;
    expect(() => parseTaskManifest(yaml)).toThrow("non-builtin judge must have entry");
  });
  // A bare scalar parses as a string, which is not a mapping.
  test("rejects non-object YAML root", () => {
    expect(() => parseTaskManifest("just a string")).toThrow("must be a YAML mapping");
  });
  // The judge-level error message also contains "name is required".
  test("rejects judge without name", () => {
    const yaml = `
 name: test
 workflow: solve-issue
 prompt: do something
 judges:
  - weight: 0.5
    builtin: true
 `;
    expect(() => parseTaskManifest(yaml)).toThrow("name is required");
  });
  test("defaults weight to 0 when omitted", () => {
    const yaml = `
 name: test
 workflow: solve-issue
 prompt: do something
 judges:
  - name: token-stats
    builtin: true
 `;
    const manifest = parseTaskManifest(yaml);
    expect(manifest.judges[0]!.weight).toBe(0);
  });
 });
@@ -0,0 +1,46 @@
 {
  "name": "@united-workforce/eval",
  "version": "0.1.0",
  "private": true,
  "files": [
    "src",
    "dist",
    "package.json"
  ],
  "type": "module",
  "bin": {
    "uwf-eval": "./dist/cli.js"
  },
  "exports": {
    ".": {
      "types": "./dist/index.d.ts",
      "import": "./dist/index.js"
    }
  },
  "scripts": {
    "prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1",
    "test": "vitest run __tests__/",
    "test:ci": "vitest run __tests__/"
  },
  "dependencies": {
    "@ocas/core": "^0.3.0",
    "@ocas/fs": "^0.3.0",
    "@united-workforce/protocol": "workspace:^",
    "@united-workforce/util": "workspace:^",
    "commander": "^14.0.3",
    "yaml": "^2.9.0"
  },
  "devDependencies": {
    "typescript": "^5.8.3"
  },
  "repository": {
    "type": "git",
    "url": "https://git.shazhou.work/shazhou/united-workforce.git",
    "directory": "packages/eval"
  },
  "homepage": "https://git.shazhou.work/shazhou/united-workforce#readme",
  "bugs": {
    "url": "https://git.shazhou.work/shazhou/united-workforce/issues"
  },
  "license": "MIT"
 }
@@ -0,0 +1,22 @@
 #!/usr/bin/env node
 import { Command } from "commander";
 import {
  registerDiffCommand,
  registerListCommand,
  registerReportCommand,
  registerRunCommand,
 } from "./commands/index.js";
 // CLI entry point: builds the root `uwf-eval` command, attaches the four
 // subcommands, then parses process.argv.
 const program = new Command();
 program
  .name("uwf-eval")
  .description("Evaluate uwf workflow quality with real agents")
  .version("0.1.0");
 registerRunCommand(program);
 registerReportCommand(program);
 registerDiffCommand(program);
 registerListCommand(program);
 // Every subcommand registers an async action handler, so the program must be
 // started with parseAsync(): plain parse() does not wait for the handler's
 // promise, and a rejection would surface as an unhandled rejection instead of
 // a clean error message and exit code.
 program.parseAsync().catch((err: unknown) => {
  const message = err instanceof Error ? err.message : String(err);
  process.stderr.write(`uwf-eval: ${message}\n`);
  process.exitCode = 1;
 });
@@ -0,0 +1,11 @@
 import type { Command } from "commander";
 /**
  * Attaches the `diff <hash1> <hash2>` subcommand to the given program.
  *
  * The action is a stub for now: it writes a "not yet implemented" notice to
  * stderr and sets a failing process exit code.
  */
 export function registerDiffCommand(program: Command): void {
  const diff = program.command("diff <hash1> <hash2>");
  diff.description("Compare two eval runs side-by-side");
  diff.action(async (_left: string, _right: string) => {
    const notice = "uwf-eval diff: not yet implemented\n";
    process.stderr.write(notice);
    process.exitCode = 1;
  });
 }
@@ -0,0 +1,4 @@
 export { registerDiffCommand } from "./diff.js";
 export { registerListCommand } from "./list.js";
 export { registerReportCommand } from "./report.js";
 export { registerRunCommand } from "./run.js";
@@ -0,0 +1,13 @@
 import type { Command } from "commander";
 /**
  * Attaches the `list` subcommand to the given program, with a `--task` filter
  * and a `--limit` option whose default is the string "20".
  *
  * The action is a stub for now: it writes a "not yet implemented" notice to
  * stderr and sets a failing process exit code.
  */
 export function registerListCommand(program: Command): void {
  const list = program.command("list");
  list.description("List past eval runs");
  list.option("--task <name>", "filter by task name");
  list.option("--limit <n>", "max results", "20");
  list.action(async (_options: Record<string, unknown>) => {
    const notice = "uwf-eval list: not yet implemented\n";
    process.stderr.write(notice);
    process.exitCode = 1;
  });
 }
@@ -0,0 +1,11 @@
 import type { Command } from "commander";
 /**
  * Attaches the `report <hash>` subcommand to the given program.
  *
  * The action is a stub for now: it writes a "not yet implemented" notice to
  * stderr and sets a failing process exit code.
  */
 export function registerReportCommand(program: Command): void {
  const report = program.command("report <hash>");
  report.description("Show eval run results");
  report.action(async (_runHash: string) => {
    const notice = "uwf-eval report: not yet implemented\n";
    process.stderr.write(notice);
    process.exitCode = 1;
  });
 }
@@ -0,0 +1,14 @@
 import type { Command } from "commander";
 /**
  * Attaches the `run <task>` subcommand to the given program, with `--agent`
  * (default "hermes"), `--model`, and `--count` (default is the string "1").
  *
  * The action is a stub for now: it writes a "not yet implemented" notice to
  * stderr and sets a failing process exit code.
  */
 export function registerRunCommand(program: Command): void {
  const run = program.command("run <task>");
  run.description("Run eval on a task directory or tarball");
  run.option("--agent <name>", "agent adapter to use", "hermes");
  run.option("--model <model>", "model override");
  run.option("--count <n>", "number of eval runs", "1");
  run.action(async (_taskPath: string, _options: Record<string, unknown>) => {
    const notice = "uwf-eval run: not yet implemented\n";
    process.stderr.write(notice);
    process.exitCode = 1;
  });
 }
@@ -0,0 +1,15 @@
 // Judge contract types
 export type { JudgeInput, JudgeOutput } from "./judge/index.js";
 // Storage record types and OCAS schemas
 export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload } from "./storage/index.js";
 export {
  EVAL_JUDGE_FRONTMATTER_SCHEMA,
  EVAL_JUDGE_HALLUCINATION_SCHEMA,
  EVAL_JUDGE_TOKEN_STATS_SCHEMA,
  EVAL_JUDGE_UPSTREAM_SCHEMA,
  EVAL_RUN_SCHEMA,
 } from "./storage/index.js";
 // Task manifest types and loaders
 export type { JudgeEntry, TaskLimits, TaskManifest } from "./task/index.js";
 export { loadTaskManifest, parseTaskManifest } from "./task/index.js";
@@ -0,0 +1 @@
 export type { JudgeInput, JudgeOutput } from "./types.js";
@@ -0,0 +1,15 @@
 /**
  * Output shape every judge must produce on stdout (JSON).
  *
  * @typeParam T - Type of the judge-specific `data` payload; defaults to `unknown`.
  */
 export type JudgeOutput<T = unknown> = {
  /** Score between 0.0 and 1.0. */
  score: number;
  /** Judge-specific structured data, stored in CAS with its own schema. */
  data: T;
 };
 /**
  * Input context passed to judge scripts via argv.
  *
  * NOTE(review): how the two fields are encoded on the command line (separate
  * arguments vs. a serialized blob) is not defined here — confirm against the
  * runner once it exists.
  */
 export type JudgeInput = {
  /** Working directory where the task was executed. */
  cwd: string;
  /** Thread ID of the eval run. */
  threadId: string;
 };
@@ -0,0 +1,8 @@
 export {
  EVAL_JUDGE_FRONTMATTER_SCHEMA,
  EVAL_JUDGE_HALLUCINATION_SCHEMA,
  EVAL_JUDGE_TOKEN_STATS_SCHEMA,
  EVAL_JUDGE_UPSTREAM_SCHEMA,
  EVAL_RUN_SCHEMA,
 } from "./schemas.js";
 export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload } from "./types.js";
@@ -0,0 +1,123 @@
 import type { JSONSchema } from "@ocas/core";
 /**
  * JSON Schema (title "@uwf/eval-run") for a stored eval run record.
  *
  * All six top-level fields are required. Field names and primitive types line
  * up with the `EvalRunPayload`, `EvalRunConfig` and `EvalJudgeRecord` types
  * exported from this storage module.
  */
 export const EVAL_RUN_SCHEMA: JSONSchema = {
  title: "@uwf/eval-run",
  type: "object",
  required: ["task", "config", "threadId", "judges", "overall", "timestamp"],
  properties: {
    task: { type: "string" },
    // Snapshot of the configuration the run was executed with.
    config: {
      type: "object",
      required: ["agent", "model", "engineVersion"],
      properties: {
        agent: { type: "string" },
        model: { type: "string" },
        engineVersion: { type: "string" },
      },
    },
    threadId: { type: "string" },
    // One item per judge; `dataHash` is a plain string at the schema level.
    judges: {
      type: "array",
      items: {
        type: "object",
        required: ["name", "score", "weight", "dataHash"],
        properties: {
          name: { type: "string" },
          score: { type: "number" },
          weight: { type: "number" },
          dataHash: { type: "string" },
        },
      },
    },
    overall: { type: "number" },
    // Integer timestamp; NOTE(review): unit (ms vs. s) is not fixed here — confirm.
    timestamp: { type: "integer" },
  },
 };
 /**
  * JSON Schema (title "@uwf/eval-judge-frontmatter") with two integer counters
  * and a list of invalid steps, each carrying its index, role and error strings.
  *
  * NOTE(review): presumably the `data` payload of the frontmatter judge —
  * confirm once that judge is implemented.
  */
 export const EVAL_JUDGE_FRONTMATTER_SCHEMA: JSONSchema = {
  title: "@uwf/eval-judge-frontmatter",
  type: "object",
  required: ["stepsTotal", "stepsValid", "invalidSteps"],
  properties: {
    stepsTotal: { type: "integer" },
    stepsValid: { type: "integer" },
    invalidSteps: {
      type: "array",
      items: {
        type: "object",
        required: ["stepIndex", "role", "errors"],
        properties: {
          stepIndex: { type: "integer" },
          role: { type: "string" },
          errors: { type: "array", items: { type: "string" } },
        },
      },
    },
  },
 };
 /**
  * JSON Schema (title "@uwf/eval-judge-upstream") holding a required `perStep`
  * array; each item has a role, two string lists (`consumed`, `missed`) and a
  * numeric score.
  *
  * NOTE(review): presumably the `data` payload of the upstream judge — confirm
  * once that judge is implemented.
  */
 export const EVAL_JUDGE_UPSTREAM_SCHEMA: JSONSchema = {
  title: "@uwf/eval-judge-upstream",
  type: "object",
  required: ["perStep"],
  properties: {
    perStep: {
      type: "array",
      items: {
        type: "object",
        required: ["role", "consumed", "missed", "score"],
        properties: {
          role: { type: "string" },
          consumed: { type: "array", items: { type: "string" } },
          missed: { type: "array", items: { type: "string" } },
          score: { type: "number" },
        },
      },
    },
  },
 };
 /**
  * JSON Schema (title "@uwf/eval-judge-hallucination") holding a required
  * `perStep` array; each item has a role, a list of hallucination strings and a
  * numeric score.
  *
  * NOTE(review): presumably the `data` payload of the hallucination judge —
  * confirm once that judge is implemented.
  */
 export const EVAL_JUDGE_HALLUCINATION_SCHEMA: JSONSchema = {
  title: "@uwf/eval-judge-hallucination",
  type: "object",
  required: ["perStep"],
  properties: {
    perStep: {
      type: "array",
      items: {
        type: "object",
        required: ["role", "hallucinations", "score"],
        properties: {
          role: { type: "string" },
          hallucinations: { type: "array", items: { type: "string" } },
          score: { type: "number" },
        },
      },
    },
  },
 };
 /**
  * JSON Schema (title "@uwf/eval-judge-token-stats") with three integer totals
  * and a `perStep` breakdown of token counts, turns and duration per role.
  *
  * NOTE(review): presumably the `data` payload of the token-stats judge —
  * confirm once that judge is implemented.
  */
 export const EVAL_JUDGE_TOKEN_STATS_SCHEMA: JSONSchema = {
  title: "@uwf/eval-judge-token-stats",
  type: "object",
  required: ["totalInput", "totalOutput", "totalTurns", "perStep"],
  properties: {
    totalInput: { type: "integer" },
    totalOutput: { type: "integer" },
    totalTurns: { type: "integer" },
    perStep: {
      type: "array",
      items: {
        type: "object",
        required: ["role", "inputTokens", "outputTokens", "turns", "duration"],
        properties: {
          role: { type: "string" },
          inputTokens: { type: "integer" },
          outputTokens: { type: "integer" },
          turns: { type: "integer" },
          // The only non-integer field; NOTE(review): unit is not fixed here — confirm.
          duration: { type: "number" },
        },
      },
    },
  },
 };
@@ -0,0 +1,26 @@
 import type { CasRef } from "@united-workforce/protocol";
 /**
  * A single judge result within an eval run.
  *
  * Has the same four fields that EVAL_RUN_SCHEMA requires on each `judges` item.
  */
 export type EvalJudgeRecord = {
  /** Name of the judge that produced this result. */
  name: string;
  /** Numeric score for this judge. */
  score: number;
  /** Weight of this judge; presumably taken from its task.yaml entry — TODO confirm. */
  weight: number;
  /** CAS reference; presumably points at the judge's stored data payload — TODO confirm. */
  dataHash: CasRef;
 };
 /**
  * Config snapshot for an eval run.
  *
  * Has the same three string fields that EVAL_RUN_SCHEMA requires under `config`.
  */
 export type EvalRunConfig = {
  /** Agent identifier; presumably the adapter selected via the CLI `--agent` option — TODO confirm. */
  agent: string;
  /** Model identifier used for the run. */
  model: string;
  /** Version string of the engine that executed the run. */
  engineVersion: string;
 };
 /**
  * Full eval run record stored in CAS.
  *
  * Field names match the properties that EVAL_RUN_SCHEMA marks as required.
  */
 export type EvalRunPayload = {
  /** Task identifier; presumably the `name` from task.yaml — TODO confirm. */
  task: string;
  /** Configuration the run was executed with. */
  config: EvalRunConfig;
  /** Thread ID of the eval run. */
  threadId: string;
  /** One record per judge that scored the run. */
  judges: EvalJudgeRecord[];
  /** Overall score; NOTE(review): the aggregation formula is not defined here — confirm. */
  overall: number;
  /** Integer per the schema; NOTE(review): unit (ms vs. s) is not fixed here — confirm. */
  timestamp: number;
 };
@@ -0,0 +1,2 @@
 export { loadTaskManifest, parseTaskManifest } from "./loader.js";
 export type { JudgeEntry, TaskLimits, TaskManifest } from "./types.js";
@@ -0,0 +1,74 @@
 import { readFile } from "node:fs/promises";
 import { join } from "node:path";
 import { parse as parseYaml } from "yaml";
 import type { JudgeEntry, TaskLimits, TaskManifest } from "./types.js";
 /** Type guard: true for any non-null, non-array object value. */
 function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || typeof value !== "object") {
    return false;
  }
  return !Array.isArray(value);
 }
 /**
  * Validates one element of the `judges` list from task.yaml and normalizes it
  * into a JudgeEntry.
  *
  * @param raw - The parsed YAML value found at `judges[index]`.
  * @param index - Position in the list; only used to build error messages.
  * @returns The entry with defaults applied: `weight` 0, `builtin` false,
  *   `entry` and `schema` null.
  * @throws Error if `raw` is not a mapping, if `name` is missing or empty, or if
  *   a non-builtin judge has no usable `entry`.
  */
 function parseJudgeEntry(raw: unknown, index: number): JudgeEntry {
  if (!isRecord(raw)) {
    throw new Error(`judges[${index}]: expected object`);
  }
  const name = raw.name;
  if (typeof name !== "string" || name === "") {
    throw new Error(`judges[${index}]: name is required`);
  }
  // YAML can yield NaN/Infinity (`.nan`, `.inf`); a non-finite weight gets the
  // same default as a missing or wrongly typed one.
  const weight = typeof raw.weight === "number" && Number.isFinite(raw.weight) ? raw.weight : 0;
  // Only a literal `true` marks a judge as builtin.
  const builtin = raw.builtin === true;
  // An empty string counts as "not provided", mirroring the `name` check, so
  // `entry: ""` cannot slip past the non-builtin requirement below.
  const entry = typeof raw.entry === "string" && raw.entry !== "" ? raw.entry : null;
  const schema = typeof raw.schema === "string" && raw.schema !== "" ? raw.schema : null;
  if (!builtin && entry === null) {
    throw new Error(`judges[${index}] "${name}": non-builtin judge must have entry`);
  }
  // NOTE(review): `schema` is not enforced for non-builtin judges here — confirm
  // whether it should be.
  return { name, weight, builtin, entry, schema };
 }
 /**
  * Normalizes the optional `limits` mapping from task.yaml.
  *
  * Never throws: a missing or non-mapping `limits`, or any individual field
  * that is not a usable number, falls back to the default for that field
  * (maxSteps 20, timeoutMinutes 30).
  *
  * @param raw - The parsed YAML value found at `limits` (may be undefined).
  * @returns Limits with every field set to a finite, positive number.
  */
 function parseLimits(raw: unknown): TaskLimits {
  const defaults: TaskLimits = { maxSteps: 20, timeoutMinutes: 30 };
  if (!isRecord(raw)) {
    return defaults;
  }
  // `typeof x === "number"` alone also admits NaN, ±Infinity, zero and negative
  // values (YAML can produce `.nan` / `.inf`), none of which can meaningfully
  // bound a run, so they are treated like any other unusable value.
  const isUsableLimit = (value: unknown): value is number =>
    typeof value === "number" && Number.isFinite(value) && value > 0;
  return {
    maxSteps: isUsableLimit(raw.maxSteps) ? raw.maxSteps : defaults.maxSteps,
    timeoutMinutes: isUsableLimit(raw.timeoutMinutes) ? raw.timeoutMinutes : defaults.timeoutMinutes,
  };
 }
 /**
  * Parse and validate the text of a task.yaml file into a TaskManifest.
  *
  * `name`, `workflow` and `prompt` must be non-empty strings and `judges` must
  * be a non-empty list; `description` defaults to "" and `limits` to the
  * loader's defaults.
  *
  * @param yamlText - Raw YAML source of the manifest.
  * @returns The validated manifest.
  * @throws Error when the root is not a mapping, a required field is missing or
  *   empty, the judge list is missing or empty, or a judge entry is invalid.
  */
 export function parseTaskManifest(yamlText: string): TaskManifest {
  const doc: unknown = parseYaml(yamlText);
  if (!isRecord(doc)) {
    throw new Error("task.yaml must be a YAML mapping");
  }
  const isNonEmptyString = (value: unknown): value is string =>
    typeof value === "string" && value !== "";
  // Required scalars are checked in declaration order so the first missing one is reported.
  const { name, workflow, prompt } = doc;
  if (!isNonEmptyString(name)) {
    throw new Error("task.yaml: name is required");
  }
  let description = "";
  if (typeof doc.description === "string") {
    description = doc.description;
  }
  if (!isNonEmptyString(workflow)) {
    throw new Error("task.yaml: workflow is required");
  }
  if (!isNonEmptyString(prompt)) {
    throw new Error("task.yaml: prompt is required");
  }
  const limits = parseLimits(doc.limits);
  const judgeList = doc.judges;
  if (!Array.isArray(judgeList) || judgeList.length === 0) {
    throw new Error("task.yaml: at least one judge is required");
  }
  const judges: JudgeEntry[] = [];
  for (const [position, item] of judgeList.entries()) {
    judges.push(parseJudgeEntry(item, position));
  }
  return { name, description, workflow, prompt, limits, judges };
 }
 /**
  * Load and parse `task.yaml` from a task directory.
  *
  * @param taskDir - Directory that contains the `task.yaml` file.
  * @returns The validated manifest.
  * @throws Whatever the file read rejects with, or the validation Error raised
  *   while parsing.
  */
 export async function loadTaskManifest(taskDir: string): Promise<TaskManifest> {
  const manifestText = await readFile(join(taskDir, "task.yaml"), "utf8");
  return parseTaskManifest(manifestText);
 }
@@ -0,0 +1,28 @@
 /**
  * Judge entry in task.yaml, after normalization by the loader.
  *
  * Optional keys are always present here: absent values become `0`, `false`
  * or `null` rather than `undefined`.
  */
 export type JudgeEntry = {
  /** Judge name; the loader rejects a missing or empty name. */
  name: string;
  /** Weight of this judge; the loader defaults it to 0 when not given as a number. */
  weight: number;
  /** True only when task.yaml sets `builtin: true`. */
  builtin: boolean;
  /** Path to judge entry script (relative to task root). Required for non-builtin judges. */
  entry: string | null;
  /**
   * Path to OCAS schema JSON for judge data. Required for non-builtin judges.
   * NOTE(review): the loader currently enforces only `entry`, not `schema`.
   */
  schema: string | null;
 };
 /** Limits for eval execution. */
 export type TaskLimits = {
  /** Maximum number of steps; the loader defaults this to 20. */
  maxSteps: number;
  /** Timeout in minutes; the loader defaults this to 30. */
  timeoutMinutes: number;
 };
 /** Parsed task.yaml manifest. */
 export type TaskManifest = {
  /** Task name; required and non-empty. */
  name: string;
  /** Free-text description; the loader defaults this to an empty string. */
  description: string;
  /** Workflow name or relative path to .yaml file. */
  workflow: string;
  /** Initial prompt for thread start. */
  prompt: string;
  /** Execution limits, with loader defaults applied when omitted. */
  limits: TaskLimits;
  /** Judges to run; the loader requires at least one. */
  judges: JudgeEntry[];
 };
@@ -0,0 +1,9 @@
 {
  "extends": "../../tsconfig.json",
  "compilerOptions": {
    "rootDir": "src",
    "outDir": "dist"
  },
  "include": ["src"],
  "references": [{ "path": "../protocol" }, { "path": "../util" }]
 }
@@ -228,6 +228,31 @@ importers:
        specifier: ^8.0.13
        version: 8.0.16(@types/node@25.9.1)(esbuild@0.27.7)(jiti@2.7.0)(yaml@2.9.0)
  packages/eval:
    dependencies:
      '@ocas/core':
        specifier: ^0.3.0
        version: 0.3.0
      '@ocas/fs':
        specifier: ^0.3.0
        version: 0.3.0
      '@united-workforce/protocol':
        specifier: workspace:^
        version: link:../protocol
      '@united-workforce/util':
        specifier: workspace:^
        version: link:../util
      commander:
        specifier: ^14.0.3
        version: 14.0.3
      yaml:
        specifier: ^2.9.0
        version: 2.9.0
    devDependencies:
      typescript:
        specifier: ^5.8.3
        version: 5.9.3
  packages/protocol:
    dependencies:
      '@ocas/core':
@@ -25,6 +25,7 @@
    { "path": "packages/agent-builtin" },
    { "path": "packages/agent-mock" },
    { "path": "packages/agent-claude-code" },
-    { "path": "packages/cli" }
+    { "path": "packages/cli" },
    { "path": "packages/eval" }
  ]
 }
		`@@ -0,0 +1 @@`
							`export type { JudgeInput, JudgeOutput } from "./types.js";`
		`@@ -0,0 +1,2 @@`
							`export { loadTaskManifest, parseTaskManifest } from "./loader.js";`
							`export type { JudgeEntry, TaskLimits, TaskManifest } from "./types.js";`