2026-06-05 00:23:57 +00:00
21 changed files with 675 additions and 1 deletions
@@ -0,0 +1,63 @@
+import { describe, expect, test } from "vitest";
+import {
+  EVAL_JUDGE_FRONTMATTER_SCHEMA,
+  EVAL_JUDGE_HALLUCINATION_SCHEMA,
+  EVAL_JUDGE_TOKEN_STATS_SCHEMA,
+  EVAL_JUDGE_UPSTREAM_SCHEMA,
+  EVAL_RUN_SCHEMA,
+} from "../src/storage/index.js";
+
+// Shape checks for the exported schema constants: each must carry its expected
+// title, list its mandatory fields under `required`, and be an object schema.
+describe("OCAS schema definitions", () => {
+  test("eval-run schema has correct title and required fields", () => {
+    expect(EVAL_RUN_SCHEMA.title).toBe("@uwf/eval-run");
+    const required = EVAL_RUN_SCHEMA.required as string[];
+    for (const field of ["task", "config", "threadId", "judges", "overall", "timestamp"]) {
+      expect(required).toContain(field);
+    }
+  });
+
+  test("frontmatter judge schema has correct title", () => {
+    expect(EVAL_JUDGE_FRONTMATTER_SCHEMA.title).toBe("@uwf/eval-judge-frontmatter");
+    const required = EVAL_JUDGE_FRONTMATTER_SCHEMA.required as string[];
+    for (const field of ["stepsTotal", "stepsValid", "invalidSteps"]) {
+      expect(required).toContain(field);
+    }
+  });
+
+  test("upstream judge schema has correct title", () => {
+    expect(EVAL_JUDGE_UPSTREAM_SCHEMA.title).toBe("@uwf/eval-judge-upstream");
+    const required = EVAL_JUDGE_UPSTREAM_SCHEMA.required as string[];
+    expect(required).toContain("perStep");
+  });
+
+  test("hallucination judge schema has correct title", () => {
+    expect(EVAL_JUDGE_HALLUCINATION_SCHEMA.title).toBe("@uwf/eval-judge-hallucination");
+    const required = EVAL_JUDGE_HALLUCINATION_SCHEMA.required as string[];
+    expect(required).toContain("perStep");
+  });
+
+  test("token-stats judge schema has correct title", () => {
+    expect(EVAL_JUDGE_TOKEN_STATS_SCHEMA.title).toBe("@uwf/eval-judge-token-stats");
+    const required = EVAL_JUDGE_TOKEN_STATS_SCHEMA.required as string[];
+    for (const field of ["totalInput", "totalOutput", "totalTurns", "perStep"]) {
+      expect(required).toContain(field);
+    }
+  });
+
+  test("all schemas have type object at root", () => {
+    const allSchemas = [
+      EVAL_RUN_SCHEMA,
+      EVAL_JUDGE_FRONTMATTER_SCHEMA,
+      EVAL_JUDGE_UPSTREAM_SCHEMA,
+      EVAL_JUDGE_HALLUCINATION_SCHEMA,
+      EVAL_JUDGE_TOKEN_STATS_SCHEMA,
+    ];
+    allSchemas.forEach((schema) => {
+      expect(schema.type).toBe("object");
+    });
+  });
+});
@@ -0,0 +1,163 @@
+import { describe, expect, test } from "vitest";
+import { parseTaskManifest } from "../src/task/index.js";
+
+// Fully-populated manifest shared by the happy-path tests: explicit limits,
+// one builtin judge and one custom judge with both entry and schema.
+const VALID_YAML = `
+name: fix-off-by-one
+description: Fix an off-by-one error in a calculator
+workflow: solve-issue
+prompt: "Fix the bug: add(1,2) returns 4 instead of 3"
+limits:
+  maxSteps: 15
+  timeoutMinutes: 30
+judges:
+  - name: frontmatter-compliance
+    weight: 0.15
+    builtin: true
+  - name: test-pass
+    weight: 0.3
+    entry: dist/judges/test-pass.js
+    schema: schemas/test-pass.json
+`;
+
+// Covers parsing, defaulting and validation errors of parseTaskManifest.
+describe("parseTaskManifest", () => {
+  test("parses valid task.yaml", () => {
+    const manifest = parseTaskManifest(VALID_YAML);
+    expect(manifest.name).toBe("fix-off-by-one");
+    expect(manifest.description).toBe("Fix an off-by-one error in a calculator");
+    expect(manifest.workflow).toBe("solve-issue");
+    expect(manifest.prompt).toBe("Fix the bug: add(1,2) returns 4 instead of 3");
+    expect(manifest.limits).toEqual({ maxSteps: 15, timeoutMinutes: 30 });
+    expect(manifest.judges).toHaveLength(2);
+  });
+
+  test("parses builtin judge", () => {
+    const manifest = parseTaskManifest(VALID_YAML);
+    const builtin = manifest.judges[0];
+    expect(builtin).toBeDefined();
+    expect(builtin!.name).toBe("frontmatter-compliance");
+    expect(builtin!.weight).toBe(0.15);
+    expect(builtin!.builtin).toBe(true);
+    expect(builtin!.entry).toBeNull();
+    // Omitted optional paths are normalized to null, not left undefined.
+    expect(builtin!.schema).toBeNull();
+  });
+
+  test("parses custom judge with entry + schema", () => {
+    const manifest = parseTaskManifest(VALID_YAML);
+    const custom = manifest.judges[1];
+    expect(custom).toBeDefined();
+    expect(custom!.name).toBe("test-pass");
+    expect(custom!.weight).toBe(0.3);
+    expect(custom!.builtin).toBe(false);
+    expect(custom!.entry).toBe("dist/judges/test-pass.js");
+    expect(custom!.schema).toBe("schemas/test-pass.json");
+  });
+
+  test("defaults limits when omitted", () => {
+    const yaml = `
+name: minimal
+workflow: solve-issue
+prompt: do something
+judges:
+  - name: check
+    builtin: true
+`;
+    const manifest = parseTaskManifest(yaml);
+    expect(manifest.limits).toEqual({ maxSteps: 20, timeoutMinutes: 30 });
+  });
+
+  test("defaults description to empty string", () => {
+    const yaml = `
+name: no-desc
+workflow: solve-issue
+prompt: do something
+judges:
+  - name: check
+    builtin: true
+`;
+    const manifest = parseTaskManifest(yaml);
+    expect(manifest.description).toBe("");
+  });
+
+  test("rejects missing name", () => {
+    const yaml = `
+workflow: solve-issue
+prompt: do something
+judges:
+  - name: check
+    builtin: true
+`;
+    expect(() => parseTaskManifest(yaml)).toThrow("name is required");
+  });
+
+  test("rejects missing workflow", () => {
+    const yaml = `
+name: test
+prompt: do something
+judges:
+  - name: check
+    builtin: true
+`;
+    expect(() => parseTaskManifest(yaml)).toThrow("workflow is required");
+  });
+
+  test("rejects missing prompt", () => {
+    const yaml = `
+name: test
+workflow: solve-issue
+judges:
+  - name: check
+    builtin: true
+`;
+    expect(() => parseTaskManifest(yaml)).toThrow("prompt is required");
+  });
+
+  test("rejects empty judges array", () => {
+    const yaml = `
+name: test
+workflow: solve-issue
+prompt: do something
+judges: []
+`;
+    expect(() => parseTaskManifest(yaml)).toThrow("at least one judge");
+  });
+
+  test("rejects non-builtin judge without entry", () => {
+    const yaml = `
+name: test
+workflow: solve-issue
+prompt: do something
+judges:
+  - name: custom-check
+    weight: 0.5
+`;
+    expect(() => parseTaskManifest(yaml)).toThrow("non-builtin judge must have entry");
+  });
+
+  test("rejects non-object YAML root", () => {
+    expect(() => parseTaskManifest("just a string")).toThrow("must be a YAML mapping");
+  });
+
+  test("rejects judge without name", () => {
+    const yaml = `
+name: test
+workflow: solve-issue
+prompt: do something
+judges:
+  - weight: 0.5
+    builtin: true
+`;
+    // Match the judge-scoped message: a bare "name is required" would also be
+    // satisfied by the top-level manifest name error.
+    expect(() => parseTaskManifest(yaml)).toThrow("judges[0]: name is required");
+  });
+
+  test("defaults weight to 0 when omitted", () => {
+    const yaml = `
+name: test
+workflow: solve-issue
+prompt: do something
+judges:
+  - name: token-stats
+    builtin: true
+`;
+    const manifest = parseTaskManifest(yaml);
+    expect(manifest.judges[0]!.weight).toBe(0);
+  });
+});
@@ -0,0 +1,46 @@
+{
+  "name": "@united-workforce/eval",
+  "version": "0.1.0",
+  "private": true,
+  "files": [
+    "src",
+    "dist",
+    "package.json"
+  ],
+  "type": "module",
+  "bin": {
+    "uwf-eval": "./dist/cli.js"
+  },
+  "exports": {
+    ".": {
+      "types": "./dist/index.d.ts",
+      "import": "./dist/index.js"
+    }
+  },
+  "scripts": {
+    "prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1",
+    "test": "vitest run __tests__/",
+    "test:ci": "vitest run __tests__/"
+  },
+  "dependencies": {
+    "@ocas/core": "^0.3.0",
+    "@ocas/fs": "^0.3.0",
+    "@united-workforce/protocol": "workspace:^",
+    "@united-workforce/util": "workspace:^",
+    "commander": "^14.0.3",
+    "yaml": "^2.9.0"
+  },
+  "devDependencies": {
+    "typescript": "^5.8.3"
+  },
+  "repository": {
+    "type": "git",
+    "url": "https://git.shazhou.work/shazhou/united-workforce.git",
+    "directory": "packages/eval"
+  },
+  "homepage": "https://git.shazhou.work/shazhou/united-workforce#readme",
+  "bugs": {
+    "url": "https://git.shazhou.work/shazhou/united-workforce/issues"
+  },
+  "license": "MIT"
+}
@@ -0,0 +1,22 @@
+#!/usr/bin/env node
+import { Command } from "commander";
+import {
+  registerDiffCommand,
+  registerListCommand,
+  registerReportCommand,
+  registerRunCommand,
+} from "./commands/index.js";
+
+// Root CLI program; subcommands attach themselves via the register* helpers.
+const program = new Command();
+
+program
+  .name("uwf-eval")
+  .description("Evaluate uwf workflow quality with real agents")
+  .version("0.1.0");
+
+// Registration order is the order subcommands are listed in --help.
+registerRunCommand(program);
+registerReportCommand(program);
+registerDiffCommand(program);
+registerListCommand(program);
+
+// Every subcommand uses an async action handler, so the async parser is needed:
+// parse() neither waits for those handlers nor observes their rejections.
+program.parseAsync().catch((error: unknown) => {
+  const message = error instanceof Error ? error.message : String(error);
+  process.stderr.write(`uwf-eval: ${message}\n`);
+  process.exitCode = 1;
+});
@@ -0,0 +1,11 @@
+import type { Command } from "commander";
+
+/**
+ * Attach the `diff <hash1> <hash2>` subcommand to `program`.
+ *
+ * The handler is a placeholder: it ignores both hashes, writes a
+ * "not yet implemented" notice to stderr and sets a failing exit code.
+ */
+export function registerDiffCommand(program: Command): void {
+  const notImplemented = async (_hash1: string, _hash2: string): Promise<void> => {
+    process.stderr.write("uwf-eval diff: not yet implemented\n");
+    process.exitCode = 1;
+  };
+
+  const diffCommand = program.command("diff <hash1> <hash2>");
+  diffCommand.description("Compare two eval runs side-by-side");
+  diffCommand.action(notImplemented);
+}
@@ -0,0 +1,4 @@
+export { registerDiffCommand } from "./diff.js";
+export { registerListCommand } from "./list.js";
+export { registerReportCommand } from "./report.js";
+export { registerRunCommand } from "./run.js";
@@ -0,0 +1,13 @@
+import type { Command } from "commander";
+
+/**
+ * Attach the `list` subcommand (with `--task` and `--limit` options) to `program`.
+ *
+ * The handler is a placeholder: it ignores the parsed options, writes a
+ * "not yet implemented" notice to stderr and sets a failing exit code.
+ */
+export function registerListCommand(program: Command): void {
+  const notImplemented = async (_opts: Record<string, unknown>): Promise<void> => {
+    process.stderr.write("uwf-eval list: not yet implemented\n");
+    process.exitCode = 1;
+  };
+
+  const listCommand = program.command("list");
+  listCommand.description("List past eval runs");
+  listCommand.option("--task <name>", "filter by task name");
+  // The default is the string "20"; no numeric coercion is configured here.
+  listCommand.option("--limit <n>", "max results", "20");
+  listCommand.action(notImplemented);
+}
@@ -0,0 +1,11 @@
+import type { Command } from "commander";
+
+/**
+ * Attach the `report <hash>` subcommand to `program`.
+ *
+ * The handler is a placeholder: it ignores the hash, writes a
+ * "not yet implemented" notice to stderr and sets a failing exit code.
+ */
+export function registerReportCommand(program: Command): void {
+  const notImplemented = async (_hash: string): Promise<void> => {
+    process.stderr.write("uwf-eval report: not yet implemented\n");
+    process.exitCode = 1;
+  };
+
+  const reportCommand = program.command("report <hash>");
+  reportCommand.description("Show eval run results");
+  reportCommand.action(notImplemented);
+}
@@ -0,0 +1,14 @@
+import type { Command } from "commander";
+
+/**
+ * Attach the `run <task>` subcommand (with `--agent`, `--model` and `--count`
+ * options) to `program`.
+ *
+ * The handler is a placeholder: it ignores the task and options, writes a
+ * "not yet implemented" notice to stderr and sets a failing exit code.
+ */
+export function registerRunCommand(program: Command): void {
+  const notImplemented = async (
+    _task: string,
+    _opts: Record<string, unknown>,
+  ): Promise<void> => {
+    process.stderr.write("uwf-eval run: not yet implemented\n");
+    process.exitCode = 1;
+  };
+
+  const runCommand = program.command("run <task>");
+  runCommand.description("Run eval on a task directory or tarball");
+  runCommand.option("--agent <name>", "agent adapter to use", "hermes");
+  runCommand.option("--model <model>", "model override");
+  // The default is the string "1"; no numeric coercion is configured here.
+  runCommand.option("--count <n>", "number of eval runs", "1");
+  runCommand.action(notImplemented);
+}
@@ -0,0 +1,15 @@
+// Re-exports only; grouped by the sub-module each name comes from.
+
+// Judge types
+export type { JudgeInput, JudgeOutput } from "./judge/index.js";
+// Storage types
+export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload } from "./storage/index.js";
+// Storage schemas
+export {
+  EVAL_JUDGE_FRONTMATTER_SCHEMA,
+  EVAL_JUDGE_HALLUCINATION_SCHEMA,
+  EVAL_JUDGE_TOKEN_STATS_SCHEMA,
+  EVAL_JUDGE_UPSTREAM_SCHEMA,
+  EVAL_RUN_SCHEMA,
+} from "./storage/index.js";
+// Task manifest types and loaders
+export type { JudgeEntry, TaskLimits, TaskManifest } from "./task/index.js";
+export { loadTaskManifest, parseTaskManifest } from "./task/index.js";
@@ -0,0 +1 @@
+export type { JudgeInput, JudgeOutput } from "./types.js";
@@ -0,0 +1,15 @@
+/**
+ * Output shape every judge must produce on stdout (JSON).
+ *
+ * @typeParam T - Shape of the judge-specific `data` payload; defaults to `unknown`.
+ */
+export type JudgeOutput<T = unknown> = {
+  /** Score between 0.0 and 1.0 (a documented convention; the type only requires a number). */
+  score: number;
+  /** Judge-specific structured data, stored in CAS with its own schema. */
+  data: T;
+};
+
+/**
+ * Input context passed to judge scripts via argv.
+ *
+ * NOTE(review): how these fields are encoded on the command line is not
+ * defined by this type — confirm against the code that launches judges.
+ */
+export type JudgeInput = {
+  /** Working directory where the task was executed. */
+  cwd: string;
+  /** Thread ID of the eval run. */
+  threadId: string;
+};
@@ -0,0 +1,8 @@
+export {
+  EVAL_JUDGE_FRONTMATTER_SCHEMA,
+  EVAL_JUDGE_HALLUCINATION_SCHEMA,
+  EVAL_JUDGE_TOKEN_STATS_SCHEMA,
+  EVAL_JUDGE_UPSTREAM_SCHEMA,
+  EVAL_RUN_SCHEMA,
+} from "./schemas.js";
+export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload } from "./types.js";
@@ -0,0 +1,123 @@
+import type { JSONSchema } from "@ocas/core";
+
+/**
+ * JSON Schema (title `@uwf/eval-run`) for a complete eval run record.
+ *
+ * Its fields mirror the `EvalRunPayload` type exported from the same storage
+ * module: `task`, `config` (agent / model / engineVersion), `threadId`,
+ * `judges` (name / score / weight / dataHash per entry), `overall` and
+ * `timestamp`. All of them are required.
+ *
+ * `timestamp` must be an integer; `score`, `weight` and `overall` are plain
+ * numbers with no minimum or maximum declared.
+ */
+export const EVAL_RUN_SCHEMA: JSONSchema = {
+  title: "@uwf/eval-run",
+  type: "object",
+  required: ["task", "config", "threadId", "judges", "overall", "timestamp"],
+  properties: {
+    task: { type: "string" },
+    config: {
+      type: "object",
+      required: ["agent", "model", "engineVersion"],
+      properties: {
+        agent: { type: "string" },
+        model: { type: "string" },
+        engineVersion: { type: "string" },
+      },
+    },
+    threadId: { type: "string" },
+    judges: {
+      type: "array",
+      items: {
+        type: "object",
+        required: ["name", "score", "weight", "dataHash"],
+        properties: {
+          name: { type: "string" },
+          score: { type: "number" },
+          weight: { type: "number" },
+          dataHash: { type: "string" },
+        },
+      },
+    },
+    overall: { type: "number" },
+    timestamp: { type: "integer" },
+  },
+};
+
+/**
+ * JSON Schema (title `@uwf/eval-judge-frontmatter`) for the frontmatter
+ * judge's data payload.
+ *
+ * Requires integer counters `stepsTotal` and `stepsValid`, plus an
+ * `invalidSteps` array whose entries each carry an integer `stepIndex`, a
+ * `role` string and a list of `errors` strings.
+ */
+export const EVAL_JUDGE_FRONTMATTER_SCHEMA: JSONSchema = {
+  title: "@uwf/eval-judge-frontmatter",
+  type: "object",
+  required: ["stepsTotal", "stepsValid", "invalidSteps"],
+  properties: {
+    stepsTotal: { type: "integer" },
+    stepsValid: { type: "integer" },
+    invalidSteps: {
+      type: "array",
+      items: {
+        type: "object",
+        required: ["stepIndex", "role", "errors"],
+        properties: {
+          stepIndex: { type: "integer" },
+          role: { type: "string" },
+          errors: { type: "array", items: { type: "string" } },
+        },
+      },
+    },
+  },
+};
+
+/**
+ * JSON Schema (title `@uwf/eval-judge-upstream`) for the upstream judge's
+ * data payload.
+ *
+ * Requires a `perStep` array; each entry carries a `role` string, string
+ * lists `consumed` and `missed`, and a numeric `score`.
+ */
+export const EVAL_JUDGE_UPSTREAM_SCHEMA: JSONSchema = {
+  title: "@uwf/eval-judge-upstream",
+  type: "object",
+  required: ["perStep"],
+  properties: {
+    perStep: {
+      type: "array",
+      items: {
+        type: "object",
+        required: ["role", "consumed", "missed", "score"],
+        properties: {
+          role: { type: "string" },
+          consumed: { type: "array", items: { type: "string" } },
+          missed: { type: "array", items: { type: "string" } },
+          score: { type: "number" },
+        },
+      },
+    },
+  },
+};
+
+/**
+ * JSON Schema (title `@uwf/eval-judge-hallucination`) for the hallucination
+ * judge's data payload.
+ *
+ * Requires a `perStep` array; each entry carries a `role` string, a list of
+ * `hallucinations` strings and a numeric `score`.
+ */
+export const EVAL_JUDGE_HALLUCINATION_SCHEMA: JSONSchema = {
+  title: "@uwf/eval-judge-hallucination",
+  type: "object",
+  required: ["perStep"],
+  properties: {
+    perStep: {
+      type: "array",
+      items: {
+        type: "object",
+        required: ["role", "hallucinations", "score"],
+        properties: {
+          role: { type: "string" },
+          hallucinations: { type: "array", items: { type: "string" } },
+          score: { type: "number" },
+        },
+      },
+    },
+  },
+};
+
+/**
+ * JSON Schema (title `@uwf/eval-judge-token-stats`) for the token-stats
+ * judge's data payload.
+ *
+ * Requires integer totals `totalInput`, `totalOutput` and `totalTurns`, plus
+ * a `perStep` array whose entries carry a `role` string, integer
+ * `inputTokens` / `outputTokens` / `turns`, and a numeric `duration`.
+ *
+ * NOTE(review): the unit of `duration` is not stated in the schema — confirm
+ * with the judge that produces it.
+ */
+export const EVAL_JUDGE_TOKEN_STATS_SCHEMA: JSONSchema = {
+  title: "@uwf/eval-judge-token-stats",
+  type: "object",
+  required: ["totalInput", "totalOutput", "totalTurns", "perStep"],
+  properties: {
+    totalInput: { type: "integer" },
+    totalOutput: { type: "integer" },
+    totalTurns: { type: "integer" },
+    perStep: {
+      type: "array",
+      items: {
+        type: "object",
+        required: ["role", "inputTokens", "outputTokens", "turns", "duration"],
+        properties: {
+          role: { type: "string" },
+          inputTokens: { type: "integer" },
+          outputTokens: { type: "integer" },
+          turns: { type: "integer" },
+          duration: { type: "number" },
+        },
+      },
+    },
+  },
+};
@@ -0,0 +1,26 @@
+import type { CasRef } from "@united-workforce/protocol";
+
+/** A single judge result within an eval run. */
+export type EvalJudgeRecord = {
+  /** Judge name. */
+  name: string;
+  /** Score reported by the judge. */
+  score: number;
+  /** Weight attached to this judge. */
+  weight: number;
+  /** CAS reference; presumably points at the judge's stored data payload — confirm. */
+  dataHash: CasRef;
+};
+
+/** Config snapshot for an eval run; all three fields are plain strings. */
+export type EvalRunConfig = {
+  /** Agent identifier. */
+  agent: string;
+  /** Model identifier. */
+  model: string;
+  /** Engine version string. */
+  engineVersion: string;
+};
+
+/** Full eval run record stored in CAS. */
+export type EvalRunPayload = {
+  /** Task identifier (a string; presumably the task name — confirm). */
+  task: string;
+  /** Config snapshot the run was executed with. */
+  config: EvalRunConfig;
+  /** Thread ID of the eval run. */
+  threadId: string;
+  /** One record per judge that produced a result. */
+  judges: EvalJudgeRecord[];
+  /** Overall score; presumably combined from the judge scores and weights — confirm. */
+  overall: number;
+  /** Run timestamp; unit not specified here (presumably epoch milliseconds — confirm). */
+  timestamp: number;
+};
@@ -0,0 +1,2 @@
+export { loadTaskManifest, parseTaskManifest } from "./loader.js";
+export type { JudgeEntry, TaskLimits, TaskManifest } from "./types.js";
@@ -0,0 +1,74 @@
+import { readFile } from "node:fs/promises";
+import { join } from "node:path";
+import { parse as parseYaml } from "yaml";
+import type { JudgeEntry, TaskLimits, TaskManifest } from "./types.js";
+
+/** Type guard: true for any non-null, non-array value whose `typeof` is "object". */
+function isRecord(value: unknown): value is Record<string, unknown> {
+  if (value === null || Array.isArray(value)) {
+    return false;
+  }
+  return typeof value === "object";
+}
+
+/**
+ * Validate one element of the `judges` list and normalize it to a JudgeEntry.
+ *
+ * Normalization rules:
+ * - `name` must be a non-empty string.
+ * - `weight` defaults to 0 when it is not a number; a numeric weight must be
+ *   finite and non-negative.
+ * - `builtin` is true only for the literal boolean `true`.
+ * - `entry` and `schema` become `null` unless they are strings.
+ * - A non-builtin judge must provide `entry` (`schema` is not enforced).
+ *
+ * @param raw - The parsed YAML value for this judge.
+ * @param index - Position in the `judges` list, used in error messages.
+ * @returns The normalized judge entry.
+ * @throws Error when the value is not a mapping, the name is missing, the
+ *   weight is an unusable number, or a non-builtin judge has no entry.
+ */
+function parseJudgeEntry(raw: unknown, index: number): JudgeEntry {
+  if (!isRecord(raw)) {
+    throw new Error(`judges[${index}]: expected object`);
+  }
+  const name = raw.name;
+  if (typeof name !== "string" || name === "") {
+    throw new Error(`judges[${index}]: name is required`);
+  }
+  const weight = typeof raw.weight === "number" ? raw.weight : 0;
+  // YAML can yield NaN / Infinity (`.nan`, `.inf`); such a weight, or a
+  // negative one, would corrupt any weighted aggregate downstream.
+  if (!Number.isFinite(weight) || weight < 0) {
+    throw new Error(`judges[${index}] "${name}": weight must be a finite non-negative number`);
+  }
+  const builtin = raw.builtin === true;
+  const entry = typeof raw.entry === "string" ? raw.entry : null;
+  const schema = typeof raw.schema === "string" ? raw.schema : null;
+  if (!builtin && entry === null) {
+    throw new Error(`judges[${index}] "${name}": non-builtin judge must have entry`);
+  }
+  return { name, weight, builtin, entry, schema };
+}
+
+/**
+ * Normalize the optional `limits` mapping of a task manifest.
+ *
+ * Defaults are `maxSteps: 20` and `timeoutMinutes: 30`. They apply when
+ * `limits` is not a mapping and, per field, when the field is not a number.
+ * A numeric value must be usable: `maxSteps` a positive integer and
+ * `timeoutMinutes` a positive finite number.
+ *
+ * @param raw - The parsed YAML value of `limits` (may be undefined).
+ * @returns The effective limits.
+ * @throws Error when a numeric limit is NaN, infinite, zero, negative, or
+ *   (for `maxSteps`) fractional.
+ */
+function parseLimits(raw: unknown): TaskLimits {
+  const limits: TaskLimits = { maxSteps: 20, timeoutMinutes: 30 };
+  if (!isRecord(raw)) {
+    return limits;
+  }
+  const maxSteps = raw.maxSteps;
+  if (typeof maxSteps === "number") {
+    // Number.isInteger also rejects NaN and +/-Infinity.
+    if (!Number.isInteger(maxSteps) || maxSteps <= 0) {
+      throw new Error("task.yaml: limits.maxSteps must be a positive integer");
+    }
+    limits.maxSteps = maxSteps;
+  }
+  const timeoutMinutes = raw.timeoutMinutes;
+  if (typeof timeoutMinutes === "number") {
+    if (!Number.isFinite(timeoutMinutes) || timeoutMinutes <= 0) {
+      throw new Error("task.yaml: limits.timeoutMinutes must be a positive number");
+    }
+    limits.timeoutMinutes = timeoutMinutes;
+  }
+  return limits;
+}
+
+/**
+ * Parse the text of a task.yaml file and validate it into a TaskManifest.
+ *
+ * Checks run in a fixed order — root mapping, `name`, `workflow`, `prompt`,
+ * limits, then `judges` — and the first failing check throws.
+ * `description` falls back to an empty string when it is not a string.
+ *
+ * @param yamlText - Raw YAML source of the manifest.
+ * @returns The validated manifest.
+ * @throws Error when the root is not a mapping, a required string is missing
+ *   or empty, the judges list is missing or empty, or a judge entry is invalid.
+ */
+export function parseTaskManifest(yamlText: string): TaskManifest {
+  const doc: unknown = parseYaml(yamlText);
+  if (!isRecord(doc)) {
+    throw new Error("task.yaml must be a YAML mapping");
+  }
+
+  const { name, workflow, prompt, judges: judgeList } = doc;
+  if (typeof name !== "string" || name.length === 0) {
+    throw new Error("task.yaml: name is required");
+  }
+  if (typeof workflow !== "string" || workflow.length === 0) {
+    throw new Error("task.yaml: workflow is required");
+  }
+  if (typeof prompt !== "string" || prompt.length === 0) {
+    throw new Error("task.yaml: prompt is required");
+  }
+
+  // Limits are resolved before the judges list is inspected, as in the
+  // field order of the manifest.
+  const limits = parseLimits(doc.limits);
+  if (!Array.isArray(judgeList) || judgeList.length === 0) {
+    throw new Error("task.yaml: at least one judge is required");
+  }
+
+  return {
+    name,
+    description: typeof doc.description === "string" ? doc.description : "",
+    workflow,
+    prompt,
+    limits,
+    judges: judgeList.map((entry: unknown, position: number) => parseJudgeEntry(entry, position)),
+  };
+}
+
+/**
+ * Read `task.yaml` from `taskDir` as UTF-8 text and parse it.
+ *
+ * @param taskDir - Directory expected to contain a `task.yaml` file.
+ * @returns The validated manifest; rejects if the file cannot be read or
+ *   the manifest is invalid.
+ */
+export async function loadTaskManifest(taskDir: string): Promise<TaskManifest> {
+  const manifestText = await readFile(join(taskDir, "task.yaml"), "utf8");
+  return parseTaskManifest(manifestText);
+}
@@ -0,0 +1,28 @@
+/** Judge entry in task.yaml, as normalized by the manifest loader. */
+export type JudgeEntry = {
+  /** Judge name. */
+  name: string;
+  /** Judge weight; the loader uses 0 when task.yaml gives no numeric weight. */
+  weight: number;
+  /** True only when task.yaml sets `builtin: true`. */
+  builtin: boolean;
+  /** Path to judge entry script (relative to task root). Required for non-builtin judges; null otherwise when absent. */
+  entry: string | null;
+  /**
+   * Path to OCAS schema JSON for judge data; null when absent.
+   * NOTE(review): intended to be required for non-builtin judges, but the
+   * loader only enforces `entry`, not `schema` — confirm which is correct.
+   */
+  schema: string | null;
+};
+
+/** Limits for eval execution. */
+export type TaskLimits = {
+  /** Step limit; the manifest loader defaults this to 20. */
+  maxSteps: number;
+  /** Timeout in minutes; the manifest loader defaults this to 30. */
+  timeoutMinutes: number;
+};
+
+/** Parsed task.yaml manifest. */
+export type TaskManifest = {
+  /** Task name; the loader requires a non-empty string. */
+  name: string;
+  /** Free-text description; the loader uses an empty string when it is omitted. */
+  description: string;
+  /** Workflow name or relative path to .yaml file. */
+  workflow: string;
+  /** Initial prompt for thread start. */
+  prompt: string;
+  /** Execution limits, with loader defaults applied. */
+  limits: TaskLimits;
+  /** Judges to run; the loader requires at least one. */
+  judges: JudgeEntry[];
+};
@@ -0,0 +1,9 @@
+{
+  "extends": "../../tsconfig.json",
+  "compilerOptions": {
+    "rootDir": "src",
+    "outDir": "dist"
+  },
+  "include": ["src"],
+  "references": [{ "path": "../protocol" }, { "path": "../util" }]
+}
@@ -228,6 +228,31 @@ importers:
        specifier: ^8.0.13
        version: 8.0.16(@types/node@25.9.1)(esbuild@0.27.7)(jiti@2.7.0)(yaml@2.9.0)

+  packages/eval:
+    dependencies:
+      '@ocas/core':
+        specifier: ^0.3.0
+        version: 0.3.0
+      '@ocas/fs':
+        specifier: ^0.3.0
+        version: 0.3.0
+      '@united-workforce/protocol':
+        specifier: workspace:^
+        version: link:../protocol
+      '@united-workforce/util':
+        specifier: workspace:^
+        version: link:../util
+      commander:
+        specifier: ^14.0.3
+        version: 14.0.3
+      yaml:
+        specifier: ^2.9.0
+        version: 2.9.0
+    devDependencies:
+      typescript:
+        specifier: ^5.8.3
+        version: 5.9.3
+
  packages/protocol:
    dependencies:
      '@ocas/core':
@@ -25,6 +25,7 @@
    { "path": "packages/agent-builtin" },
    { "path": "packages/agent-mock" },
    { "path": "packages/agent-claude-code" },
-    { "path": "packages/cli" }
+    { "path": "packages/cli" },
+    { "path": "packages/eval" }
  ]
 }
				`@@ -0,0 +1 @@`
				`export type { JudgeInput, JudgeOutput } from "./types.js";`