feat: eval package scaffold with CLI, schemas, types, task loader
CI / check (pull_request) Successful in 1m42s
CI / check (pull_request) Successful in 1m42s
New package @united-workforce/eval (uwf-eval CLI):
- CLI skeleton: run/report/diff/list subcommands (stubs)
- 5 OCAS schemas: eval-run, judge-frontmatter, judge-upstream, judge-hallucination, judge-token-stats
- TaskManifest type + parser/validator for task.yaml
- JudgeOutput/JudgeInput types for the judge contract
- EvalRunPayload/EvalRunConfig/EvalJudgeRecord storage types
- 19 unit tests: task loader validation + schema definitions

Refs #69
This commit is contained in:
@@ -0,0 +1,63 @@
|
|||||||
|
import { describe, expect, test } from "vitest";
|
||||||
|
import {
|
||||||
|
EVAL_JUDGE_FRONTMATTER_SCHEMA,
|
||||||
|
EVAL_JUDGE_HALLUCINATION_SCHEMA,
|
||||||
|
EVAL_JUDGE_TOKEN_STATS_SCHEMA,
|
||||||
|
EVAL_JUDGE_UPSTREAM_SCHEMA,
|
||||||
|
EVAL_RUN_SCHEMA,
|
||||||
|
} from "../src/storage/index.js";
|
||||||
|
|
||||||
|
describe("OCAS schema definitions", () => {
  /** Asserts that every listed field name appears in the schema's `required` array. */
  function expectRequired(schema: { required?: unknown }, fields: readonly string[]): void {
    const required = schema.required as string[];
    for (const field of fields) {
      expect(required).toContain(field);
    }
  }

  test("eval-run schema has correct title and required fields", () => {
    expect(EVAL_RUN_SCHEMA.title).toBe("@uwf/eval-run");
    expectRequired(EVAL_RUN_SCHEMA, [
      "task",
      "config",
      "threadId",
      "judges",
      "overall",
      "timestamp",
    ]);
  });

  test("frontmatter judge schema has correct title", () => {
    expect(EVAL_JUDGE_FRONTMATTER_SCHEMA.title).toBe("@uwf/eval-judge-frontmatter");
    expectRequired(EVAL_JUDGE_FRONTMATTER_SCHEMA, ["stepsTotal", "stepsValid", "invalidSteps"]);
  });

  test("upstream judge schema has correct title", () => {
    expect(EVAL_JUDGE_UPSTREAM_SCHEMA.title).toBe("@uwf/eval-judge-upstream");
    expectRequired(EVAL_JUDGE_UPSTREAM_SCHEMA, ["perStep"]);
  });

  test("hallucination judge schema has correct title", () => {
    expect(EVAL_JUDGE_HALLUCINATION_SCHEMA.title).toBe("@uwf/eval-judge-hallucination");
    expectRequired(EVAL_JUDGE_HALLUCINATION_SCHEMA, ["perStep"]);
  });

  test("token-stats judge schema has correct title", () => {
    expect(EVAL_JUDGE_TOKEN_STATS_SCHEMA.title).toBe("@uwf/eval-judge-token-stats");
    expectRequired(EVAL_JUDGE_TOKEN_STATS_SCHEMA, [
      "totalInput",
      "totalOutput",
      "totalTurns",
      "perStep",
    ]);
  });

  test("all schemas have type object at root", () => {
    // Every stored payload is a JSON object, so each root schema must say so.
    const allSchemas = [
      EVAL_RUN_SCHEMA,
      EVAL_JUDGE_FRONTMATTER_SCHEMA,
      EVAL_JUDGE_UPSTREAM_SCHEMA,
      EVAL_JUDGE_HALLUCINATION_SCHEMA,
      EVAL_JUDGE_TOKEN_STATS_SCHEMA,
    ];
    allSchemas.forEach((schema) => {
      expect(schema.type).toBe("object");
    });
  });
});
|
||||||
@@ -0,0 +1,163 @@
|
|||||||
|
import { describe, expect, test } from "vitest";
|
||||||
|
import { parseTaskManifest } from "../src/task/index.js";
|
||||||
|
|
||||||
|
/**
 * Complete, valid task manifest fixture: explicit limits plus one builtin
 * judge and one custom judge (with entry script and schema path).
 *
 * The nested fields must be indented: without indentation `maxSteps`,
 * `builtin`, `entry`, etc. parse as root-level keys instead of belonging to
 * `limits` or to a judge item.
 */
const VALID_YAML = `
name: fix-off-by-one
description: Fix an off-by-one error in a calculator
workflow: solve-issue
prompt: "Fix the bug: add(1,2) returns 4 instead of 3"
limits:
  maxSteps: 15
  timeoutMinutes: 30
judges:
  - name: frontmatter-compliance
    weight: 0.15
    builtin: true
  - name: test-pass
    weight: 0.3
    entry: dist/judges/test-pass.js
    schema: schemas/test-pass.json
`;
|
||||||
|
|
||||||
|
// Unit tests for the task.yaml parser. Inline fixtures keep their YAML
// nesting (two-space indents) so judge fields stay attached to their judge
// item rather than becoming root-level keys.
describe("parseTaskManifest", () => {
  test("parses valid task.yaml", () => {
    const manifest = parseTaskManifest(VALID_YAML);
    expect(manifest.name).toBe("fix-off-by-one");
    expect(manifest.description).toBe("Fix an off-by-one error in a calculator");
    expect(manifest.workflow).toBe("solve-issue");
    expect(manifest.prompt).toBe("Fix the bug: add(1,2) returns 4 instead of 3");
    expect(manifest.limits).toEqual({ maxSteps: 15, timeoutMinutes: 30 });
    expect(manifest.judges).toHaveLength(2);
  });

  test("parses builtin judge", () => {
    const manifest = parseTaskManifest(VALID_YAML);
    const builtin = manifest.judges[0];
    expect(builtin).toBeDefined();
    expect(builtin!.name).toBe("frontmatter-compliance");
    expect(builtin!.weight).toBe(0.15);
    expect(builtin!.builtin).toBe(true);
    expect(builtin!.entry).toBeNull();
  });

  test("parses custom judge with entry + schema", () => {
    const manifest = parseTaskManifest(VALID_YAML);
    const custom = manifest.judges[1];
    expect(custom).toBeDefined();
    expect(custom!.name).toBe("test-pass");
    expect(custom!.weight).toBe(0.3);
    expect(custom!.builtin).toBe(false);
    expect(custom!.entry).toBe("dist/judges/test-pass.js");
    expect(custom!.schema).toBe("schemas/test-pass.json");
  });

  test("defaults limits when omitted", () => {
    const yaml = `
name: minimal
workflow: solve-issue
prompt: do something
judges:
  - name: check
    builtin: true
`;
    const manifest = parseTaskManifest(yaml);
    expect(manifest.limits).toEqual({ maxSteps: 20, timeoutMinutes: 30 });
  });

  test("defaults description to empty string", () => {
    const yaml = `
name: no-desc
workflow: solve-issue
prompt: do something
judges:
  - name: check
    builtin: true
`;
    const manifest = parseTaskManifest(yaml);
    expect(manifest.description).toBe("");
  });

  test("rejects missing name", () => {
    const yaml = `
workflow: solve-issue
prompt: do something
judges:
  - name: check
    builtin: true
`;
    expect(() => parseTaskManifest(yaml)).toThrow("name is required");
  });

  test("rejects missing workflow", () => {
    const yaml = `
name: test
prompt: do something
judges:
  - name: check
    builtin: true
`;
    expect(() => parseTaskManifest(yaml)).toThrow("workflow is required");
  });

  test("rejects missing prompt", () => {
    const yaml = `
name: test
workflow: solve-issue
judges:
  - name: check
    builtin: true
`;
    expect(() => parseTaskManifest(yaml)).toThrow("prompt is required");
  });

  test("rejects empty judges array", () => {
    const yaml = `
name: test
workflow: solve-issue
prompt: do something
judges: []
`;
    expect(() => parseTaskManifest(yaml)).toThrow("at least one judge");
  });

  test("rejects non-builtin judge without entry", () => {
    const yaml = `
name: test
workflow: solve-issue
prompt: do something
judges:
  - name: custom-check
    weight: 0.5
`;
    expect(() => parseTaskManifest(yaml)).toThrow("non-builtin judge must have entry");
  });

  test("rejects non-object YAML root", () => {
    expect(() => parseTaskManifest("just a string")).toThrow("must be a YAML mapping");
  });

  test("rejects judge without name", () => {
    const yaml = `
name: test
workflow: solve-issue
prompt: do something
judges:
  - weight: 0.5
    builtin: true
`;
    expect(() => parseTaskManifest(yaml)).toThrow("name is required");
  });

  test("defaults weight to 0 when omitted", () => {
    const yaml = `
name: test
workflow: solve-issue
prompt: do something
judges:
  - name: token-stats
    builtin: true
`;
    const manifest = parseTaskManifest(yaml);
    expect(manifest.judges[0]!.weight).toBe(0);
  });
});
|
||||||
@@ -0,0 +1,46 @@
|
|||||||
|
{
|
||||||
|
"name": "@united-workforce/eval",
|
||||||
|
"version": "0.1.0",
|
||||||
|
"private": true,
|
||||||
|
"files": [
|
||||||
|
"src",
|
||||||
|
"dist",
|
||||||
|
"package.json"
|
||||||
|
],
|
||||||
|
"type": "module",
|
||||||
|
"bin": {
|
||||||
|
"uwf-eval": "./dist/cli.js"
|
||||||
|
},
|
||||||
|
"exports": {
|
||||||
|
".": {
|
||||||
|
"types": "./dist/index.d.ts",
|
||||||
|
"import": "./dist/index.js"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"scripts": {
|
||||||
|
"prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1",
|
||||||
|
"test": "vitest run __tests__/",
|
||||||
|
"test:ci": "vitest run __tests__/"
|
||||||
|
},
|
||||||
|
"dependencies": {
|
||||||
|
"@ocas/core": "^0.3.0",
|
||||||
|
"@ocas/fs": "^0.3.0",
|
||||||
|
"@united-workforce/protocol": "workspace:^",
|
||||||
|
"@united-workforce/util": "workspace:^",
|
||||||
|
"commander": "^14.0.3",
|
||||||
|
"yaml": "^2.9.0"
|
||||||
|
},
|
||||||
|
"devDependencies": {
|
||||||
|
"typescript": "^5.8.3"
|
||||||
|
},
|
||||||
|
"repository": {
|
||||||
|
"type": "git",
|
||||||
|
"url": "https://git.shazhou.work/shazhou/united-workforce.git",
|
||||||
|
"directory": "packages/eval"
|
||||||
|
},
|
||||||
|
"homepage": "https://git.shazhou.work/shazhou/united-workforce#readme",
|
||||||
|
"bugs": {
|
||||||
|
"url": "https://git.shazhou.work/shazhou/united-workforce/issues"
|
||||||
|
},
|
||||||
|
"license": "MIT"
|
||||||
|
}
|
||||||
@@ -0,0 +1,22 @@
|
|||||||
|
#!/usr/bin/env node
|
||||||
|
import { Command } from "commander";
|
||||||
|
import {
|
||||||
|
registerDiffCommand,
|
||||||
|
registerListCommand,
|
||||||
|
registerReportCommand,
|
||||||
|
registerRunCommand,
|
||||||
|
} from "./commands/index.js";
|
||||||
|
|
||||||
|
// Root command: name, description and version reported by --help / --version.
const program = new Command();

program
  .name("uwf-eval")
  .description("Evaluate uwf workflow quality with real agents")
  .version("0.1.0");

// Each register* helper attaches one subcommand to the root program.
registerRunCommand(program);
registerReportCommand(program);
registerDiffCommand(program);
registerListCommand(program);

// All subcommand actions are async, so parse with parseAsync(): plain parse()
// does not wait for the promises the actions return, and a rejection would
// surface as an unhandled rejection instead of a clean CLI error.
program.parseAsync().catch((error: unknown) => {
  const message = error instanceof Error ? error.message : String(error);
  process.stderr.write(`uwf-eval: ${message}\n`);
  process.exitCode = 1;
});
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
import type { Command } from "commander";
|
||||||
|
|
||||||
|
/**
 * Registers the `diff <hash1> <hash2>` subcommand on the given program.
 *
 * The command is currently a stub: its action reports on stderr that the
 * feature is not implemented and sets a failing process exit code.
 *
 * @param program - Commander program (or parent command) to attach to.
 */
export function registerDiffCommand(program: Command): void {
  const diff = program.command("diff <hash1> <hash2>");
  diff.description("Compare two eval runs side-by-side");
  diff.action(async (_hash1: string, _hash2: string) => {
    process.stderr.write("uwf-eval diff: not yet implemented\n");
    process.exitCode = 1;
  });
}
|
||||||
@@ -0,0 +1,4 @@
|
|||||||
|
export { registerDiffCommand } from "./diff.js";
|
||||||
|
export { registerListCommand } from "./list.js";
|
||||||
|
export { registerReportCommand } from "./report.js";
|
||||||
|
export { registerRunCommand } from "./run.js";
|
||||||
@@ -0,0 +1,13 @@
|
|||||||
|
import type { Command } from "commander";
|
||||||
|
|
||||||
|
/**
 * Registers the `list` subcommand on the given program.
 *
 * Declares the `--task <name>` filter and the `--limit <n>` option (default
 * value is the string "20"). The command is currently a stub: its action
 * reports on stderr that the feature is not implemented and sets a failing
 * process exit code.
 *
 * @param program - Commander program (or parent command) to attach to.
 */
export function registerListCommand(program: Command): void {
  const list = program.command("list");
  list.description("List past eval runs");
  list.option("--task <name>", "filter by task name");
  list.option("--limit <n>", "max results", "20");
  list.action(async (_opts: Record<string, unknown>) => {
    process.stderr.write("uwf-eval list: not yet implemented\n");
    process.exitCode = 1;
  });
}
|
||||||
@@ -0,0 +1,11 @@
|
|||||||
|
import type { Command } from "commander";
|
||||||
|
|
||||||
|
/**
 * Registers the `report <hash>` subcommand on the given program.
 *
 * The command is currently a stub: its action reports on stderr that the
 * feature is not implemented and sets a failing process exit code.
 *
 * @param program - Commander program (or parent command) to attach to.
 */
export function registerReportCommand(program: Command): void {
  const report = program.command("report <hash>");
  report.description("Show eval run results");
  report.action(async (_hash: string) => {
    process.stderr.write("uwf-eval report: not yet implemented\n");
    process.exitCode = 1;
  });
}
|
||||||
@@ -0,0 +1,14 @@
|
|||||||
|
import type { Command } from "commander";
|
||||||
|
|
||||||
|
/**
 * Registers the `run <task>` subcommand on the given program.
 *
 * Declares `--agent <name>` (default "hermes"), `--model <model>` and
 * `--count <n>` (default value is the string "1"). The command is currently
 * a stub: its action reports on stderr that the feature is not implemented
 * and sets a failing process exit code.
 *
 * @param program - Commander program (or parent command) to attach to.
 */
export function registerRunCommand(program: Command): void {
  const run = program.command("run <task>");
  run.description("Run eval on a task directory or tarball");
  run.option("--agent <name>", "agent adapter to use", "hermes");
  run.option("--model <model>", "model override");
  run.option("--count <n>", "number of eval runs", "1");
  run.action(async (_task: string, _opts: Record<string, unknown>) => {
    process.stderr.write("uwf-eval run: not yet implemented\n");
    process.exitCode = 1;
  });
}
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
// Judge contract types
export type { JudgeInput, JudgeOutput } from "./judge/index.js";
// Storage record types and OCAS schemas
export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload } from "./storage/index.js";
export {
  EVAL_JUDGE_FRONTMATTER_SCHEMA,
  EVAL_JUDGE_HALLUCINATION_SCHEMA,
  EVAL_JUDGE_TOKEN_STATS_SCHEMA,
  EVAL_JUDGE_UPSTREAM_SCHEMA,
  EVAL_RUN_SCHEMA,
} from "./storage/index.js";
// Task manifest types and loader
export type { JudgeEntry, TaskLimits, TaskManifest } from "./task/index.js";
export { loadTaskManifest, parseTaskManifest } from "./task/index.js";
|
||||||
@@ -0,0 +1 @@
|
|||||||
|
export type { JudgeInput, JudgeOutput } from "./types.js";
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
/**
 * JSON document every judge must print on stdout.
 *
 * @typeParam T - Shape of the judge-specific `data` payload.
 */
export type JudgeOutput<T = unknown> = {
  /** Score in the range 0.0 to 1.0. */
  score: number;
  /** Structured, judge-specific data; stored in CAS under the judge's own schema. */
  data: T;
};
|
||||||
|
|
||||||
|
/** Context handed to a judge script via argv. */
export type JudgeInput = {
  /** Directory in which the task was executed. */
  cwd: string;
  /** ID of the thread that belongs to the eval run. */
  threadId: string;
};
|
||||||
@@ -0,0 +1,8 @@
|
|||||||
|
export {
|
||||||
|
EVAL_JUDGE_FRONTMATTER_SCHEMA,
|
||||||
|
EVAL_JUDGE_HALLUCINATION_SCHEMA,
|
||||||
|
EVAL_JUDGE_TOKEN_STATS_SCHEMA,
|
||||||
|
EVAL_JUDGE_UPSTREAM_SCHEMA,
|
||||||
|
EVAL_RUN_SCHEMA,
|
||||||
|
} from "./schemas.js";
|
||||||
|
export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload } from "./types.js";
|
||||||
@@ -0,0 +1,123 @@
|
|||||||
|
import type { JSONSchema } from "@ocas/core";
|
||||||
|
|
||||||
|
/**
 * JSON Schema for a stored eval run record (`@uwf/eval-run`).
 *
 * Mirrors the EvalRunPayload shape: task name, config snapshot, thread ID,
 * per-judge results, overall score and timestamp. All six are required.
 */
export const EVAL_RUN_SCHEMA: JSONSchema = {
  title: "@uwf/eval-run",
  type: "object",
  required: ["task", "config", "threadId", "judges", "overall", "timestamp"],
  properties: {
    task: { type: "string" },
    // Snapshot of the agent/model/engine the run was executed with.
    config: {
      type: "object",
      required: ["agent", "model", "engineVersion"],
      properties: {
        agent: { type: "string" },
        model: { type: "string" },
        engineVersion: { type: "string" },
      },
    },
    threadId: { type: "string" },
    // One entry per judge; `dataHash` is a string reference to the judge's data.
    judges: {
      type: "array",
      items: {
        type: "object",
        required: ["name", "score", "weight", "dataHash"],
        properties: {
          name: { type: "string" },
          score: { type: "number" },
          weight: { type: "number" },
          dataHash: { type: "string" },
        },
      },
    },
    overall: { type: "number" },
    timestamp: { type: "integer" },
  },
};
|
||||||
|
|
||||||
|
/**
 * JSON Schema for the frontmatter judge's data (`@uwf/eval-judge-frontmatter`).
 *
 * Holds the total and valid step counts plus one entry per invalid step,
 * each listing its index, role and error messages.
 */
export const EVAL_JUDGE_FRONTMATTER_SCHEMA: JSONSchema = {
  title: "@uwf/eval-judge-frontmatter",
  type: "object",
  required: ["stepsTotal", "stepsValid", "invalidSteps"],
  properties: {
    stepsTotal: { type: "integer" },
    stepsValid: { type: "integer" },
    invalidSteps: {
      type: "array",
      items: {
        type: "object",
        required: ["stepIndex", "role", "errors"],
        properties: {
          stepIndex: { type: "integer" },
          role: { type: "string" },
          errors: {
            type: "array",
            items: { type: "string" },
          },
        },
      },
    },
  },
};
|
||||||
|
|
||||||
|
/**
 * JSON Schema for the upstream judge's data (`@uwf/eval-judge-upstream`).
 *
 * A single required `perStep` array; each item records a role, the lists of
 * consumed and missed strings, and a numeric score.
 */
export const EVAL_JUDGE_UPSTREAM_SCHEMA: JSONSchema = {
  title: "@uwf/eval-judge-upstream",
  type: "object",
  required: ["perStep"],
  properties: {
    perStep: {
      type: "array",
      items: {
        type: "object",
        required: ["role", "consumed", "missed", "score"],
        properties: {
          role: { type: "string" },
          consumed: {
            type: "array",
            items: { type: "string" },
          },
          missed: {
            type: "array",
            items: { type: "string" },
          },
          score: { type: "number" },
        },
      },
    },
  },
};
|
||||||
|
|
||||||
|
/**
 * JSON Schema for the hallucination judge's data (`@uwf/eval-judge-hallucination`).
 *
 * A single required `perStep` array; each item records a role, a list of
 * hallucination strings, and a numeric score.
 */
export const EVAL_JUDGE_HALLUCINATION_SCHEMA: JSONSchema = {
  title: "@uwf/eval-judge-hallucination",
  type: "object",
  required: ["perStep"],
  properties: {
    perStep: {
      type: "array",
      items: {
        type: "object",
        required: ["role", "hallucinations", "score"],
        properties: {
          role: { type: "string" },
          hallucinations: {
            type: "array",
            items: { type: "string" },
          },
          score: { type: "number" },
        },
      },
    },
  },
};
|
||||||
|
|
||||||
|
/**
 * JSON Schema for the token-stats judge's data (`@uwf/eval-judge-token-stats`).
 *
 * Holds run-wide integer totals (input tokens, output tokens, turns) and a
 * `perStep` array with the same counters per role plus a numeric duration.
 */
export const EVAL_JUDGE_TOKEN_STATS_SCHEMA: JSONSchema = {
  title: "@uwf/eval-judge-token-stats",
  type: "object",
  required: ["totalInput", "totalOutput", "totalTurns", "perStep"],
  properties: {
    totalInput: { type: "integer" },
    totalOutput: { type: "integer" },
    totalTurns: { type: "integer" },
    perStep: {
      type: "array",
      items: {
        type: "object",
        required: ["role", "inputTokens", "outputTokens", "turns", "duration"],
        properties: {
          role: { type: "string" },
          inputTokens: { type: "integer" },
          outputTokens: { type: "integer" },
          turns: { type: "integer" },
          // NOTE(review): the unit of `duration` is not visible here — confirm.
          duration: { type: "number" },
        },
      },
    },
  },
};
|
||||||
@@ -0,0 +1,26 @@
|
|||||||
|
import type { CasRef } from "@united-workforce/protocol";
|
||||||
|
|
||||||
|
/** Result of one judge inside an eval run. */
export type EvalJudgeRecord = {
  /** Judge name. */
  name: string;
  /** Score the judge produced. */
  score: number;
  /** Weight assigned to this judge. */
  weight: number;
  /** CAS reference to the judge's structured data. */
  dataHash: CasRef;
};
|
||||||
|
|
||||||
|
/** Snapshot of the configuration an eval run was executed with. */
export type EvalRunConfig = {
  /** Agent used for the run. */
  agent: string;
  /** Model used for the run. */
  model: string;
  /** Version string of the engine. */
  engineVersion: string;
};
|
||||||
|
|
||||||
|
/** Complete eval run record as stored in CAS. */
export type EvalRunPayload = {
  /** Task the run belongs to. */
  task: string;
  /** Configuration snapshot for the run. */
  config: EvalRunConfig;
  /** Thread ID of the run. */
  threadId: string;
  /** One record per judge. */
  judges: Array<EvalJudgeRecord>;
  /** Overall score of the run. NOTE(review): aggregation method is not visible here. */
  overall: number;
  /** Timestamp of the run. NOTE(review): unit (ms vs. s since epoch) is not visible here. */
  timestamp: number;
};
|
||||||
@@ -0,0 +1,2 @@
|
|||||||
|
export { loadTaskManifest, parseTaskManifest } from "./loader.js";
|
||||||
|
export type { JudgeEntry, TaskLimits, TaskManifest } from "./types.js";
|
||||||
@@ -0,0 +1,74 @@
|
|||||||
|
import { readFile } from "node:fs/promises";
|
||||||
|
import { join } from "node:path";
|
||||||
|
import { parse as parseYaml } from "yaml";
|
||||||
|
import type { JudgeEntry, TaskLimits, TaskManifest } from "./types.js";
|
||||||
|
|
||||||
|
/** Type guard: true for any non-null object that is not an array. */
function isRecord(value: unknown): value is Record<string, unknown> {
  if (value === null || Array.isArray(value)) {
    return false;
  }
  return typeof value === "object";
}
|
||||||
|
|
||||||
|
/**
 * Validates one raw `judges[]` item from task.yaml and normalizes it into a
 * JudgeEntry.
 *
 * - `name` must be a non-empty string.
 * - `weight` defaults to 0 when omitted (or null); when present it must be a
 *   finite, non-negative number. Anything else is rejected instead of being
 *   silently replaced by 0.
 * - `builtin` is true only for the literal boolean `true`.
 * - `entry` and `schema` are kept only when they are non-empty strings,
 *   otherwise they become null; an empty path therefore counts as missing.
 *
 * @param raw - Unvalidated value parsed from YAML.
 * @param index - Position in the judges array, used to prefix error messages.
 * @returns The normalized judge entry.
 * @throws Error when the item is not a mapping, has no name, has an invalid
 *   weight, or is non-builtin without an entry.
 */
function parseJudgeEntry(raw: unknown, index: number): JudgeEntry {
  if (!isRecord(raw)) {
    throw new Error(`judges[${index}]: expected object`);
  }
  const name = raw.name;
  if (typeof name !== "string" || name === "") {
    throw new Error(`judges[${index}]: name is required`);
  }
  const label = `judges[${index}] "${name}"`;

  // Only a missing weight falls back to 0; a wrong type (e.g. a quoted
  // "0.5"), NaN, Infinity or a negative value is a manifest error.
  const weight = raw.weight ?? 0;
  if (typeof weight !== "number" || !Number.isFinite(weight) || weight < 0) {
    throw new Error(`${label}: weight must be a non-negative finite number`);
  }

  const builtin = raw.builtin === true;
  const entry = typeof raw.entry === "string" && raw.entry !== "" ? raw.entry : null;
  const schema = typeof raw.schema === "string" && raw.schema !== "" ? raw.schema : null;
  if (!builtin && entry === null) {
    throw new Error(`${label}: non-builtin judge must have entry`);
  }
  return { name, weight, builtin, entry, schema };
}
|
||||||
|
|
||||||
|
/**
 * Normalizes the optional `limits` section of task.yaml.
 *
 * Omitted (or null) limits, and omitted individual fields, fall back to the
 * defaults `maxSteps: 20` and `timeoutMinutes: 30`. Values that are present
 * but invalid are rejected instead of being silently replaced or accepted.
 *
 * @param raw - Unvalidated `limits` value parsed from YAML.
 * @returns Limits with every field populated.
 * @throws Error when `limits` is present but not a mapping, when `maxSteps`
 *   is not a positive integer, or when `timeoutMinutes` is not a positive
 *   finite number.
 */
function parseLimits(raw: unknown): TaskLimits {
  const defaults: TaskLimits = { maxSteps: 20, timeoutMinutes: 30 };
  if (raw === undefined || raw === null) {
    return defaults;
  }
  if (!isRecord(raw)) {
    throw new Error("task.yaml: limits must be a mapping");
  }

  // A step count must be a whole number of at least 1.
  const maxSteps = raw.maxSteps ?? defaults.maxSteps;
  if (typeof maxSteps !== "number" || !Number.isInteger(maxSteps) || maxSteps <= 0) {
    throw new Error("task.yaml: limits.maxSteps must be a positive integer");
  }

  // Fractional minutes are allowed; NaN, Infinity, zero and negatives are not.
  const timeoutMinutes = raw.timeoutMinutes ?? defaults.timeoutMinutes;
  if (
    typeof timeoutMinutes !== "number" ||
    !Number.isFinite(timeoutMinutes) ||
    timeoutMinutes <= 0
  ) {
    throw new Error("task.yaml: limits.timeoutMinutes must be a positive number");
  }

  return { maxSteps, timeoutMinutes };
}
|
||||||
|
|
||||||
|
/**
 * Parses task.yaml text and validates it into a TaskManifest.
 *
 * Validation runs in a fixed order (name, workflow, prompt, limits, judges)
 * and stops at the first problem. `description` is optional and becomes an
 * empty string when it is missing or not a string.
 *
 * @param yamlText - Raw contents of a task.yaml file.
 * @returns The validated manifest.
 * @throws Error when the root is not a mapping, a required string field is
 *   missing or empty, the judges list is missing or empty, or a judge entry
 *   is invalid.
 */
export function parseTaskManifest(yamlText: string): TaskManifest {
  const parsed = parseYaml(yamlText) as unknown;
  if (!isRecord(parsed)) {
    throw new Error("task.yaml must be a YAML mapping");
  }
  const doc: Record<string, unknown> = parsed;

  // Reads a required top-level field that must be a non-empty string.
  const requireText = (key: "name" | "workflow" | "prompt"): string => {
    const value = doc[key];
    if (typeof value !== "string" || value === "") {
      throw new Error(`task.yaml: ${key} is required`);
    }
    return value;
  };

  const name = requireText("name");
  const workflow = requireText("workflow");
  const prompt = requireText("prompt");

  let description = "";
  if (typeof doc.description === "string") {
    description = doc.description;
  }

  const limits = parseLimits(doc.limits);

  const rawJudges = doc.judges;
  if (!Array.isArray(rawJudges) || rawJudges.length === 0) {
    throw new Error("task.yaml: at least one judge is required");
  }
  const judges: JudgeEntry[] = rawJudges.map((item: unknown, position: number) =>
    parseJudgeEntry(item, position),
  );

  return { name, description, workflow, prompt, limits, judges };
}
|
||||||
|
|
||||||
|
/**
 * Reads `task.yaml` from the given directory (as UTF-8) and parses it.
 *
 * @param taskDir - Directory that contains task.yaml.
 * @returns The validated manifest.
 */
export async function loadTaskManifest(taskDir: string): Promise<TaskManifest> {
  const contents = await readFile(join(taskDir, "task.yaml"), "utf8");
  return parseTaskManifest(contents);
}
|
||||||
@@ -0,0 +1,28 @@
|
|||||||
|
/** One item of the `judges` list in task.yaml. */
export type JudgeEntry = {
  /** Judge name. */
  name: string;
  /** Weight assigned to this judge. */
  weight: number;
  /** Whether this is a builtin judge. */
  builtin: boolean;
  /** Path to the judge entry script, relative to the task root. Required for non-builtin judges; null otherwise. */
  entry: null | string;
  /**
   * Path to the OCAS schema JSON for the judge's data, or null.
   * Meant to be supplied for non-builtin judges.
   * NOTE(review): confirm whether the loader should enforce this the way it enforces `entry`.
   */
  schema: null | string;
};
|
||||||
|
|
||||||
|
/** Execution limits applied to an eval run. */
export type TaskLimits = {
  /** Maximum number of steps. */
  maxSteps: number;
  /** Timeout in minutes. */
  timeoutMinutes: number;
};
|
||||||
|
|
||||||
|
/** Validated contents of a task.yaml file. */
export type TaskManifest = {
  /** Task name. */
  name: string;
  /** Free-form description of the task. */
  description: string;
  /** Name of a workflow, or a relative path to a workflow .yaml file. */
  workflow: string;
  /** Prompt used to start the thread. */
  prompt: string;
  /** Execution limits for the run. */
  limits: TaskLimits;
  /** Judges to run against the result. */
  judges: Array<JudgeEntry>;
};
|
||||||
@@ -0,0 +1,9 @@
|
|||||||
|
{
|
||||||
|
"extends": "../../tsconfig.json",
|
||||||
|
"compilerOptions": {
|
||||||
|
"rootDir": "src",
|
||||||
|
"outDir": "dist"
|
||||||
|
},
|
||||||
|
"include": ["src"],
|
||||||
|
"references": [{ "path": "../protocol" }, { "path": "../util" }]
|
||||||
|
}
|
||||||
Generated
+25
@@ -228,6 +228,31 @@ importers:
|
|||||||
specifier: ^8.0.13
|
specifier: ^8.0.13
|
||||||
version: 8.0.16(@types/node@25.9.1)(esbuild@0.27.7)(jiti@2.7.0)(yaml@2.9.0)
|
version: 8.0.16(@types/node@25.9.1)(esbuild@0.27.7)(jiti@2.7.0)(yaml@2.9.0)
|
||||||
|
|
||||||
|
packages/eval:
|
||||||
|
dependencies:
|
||||||
|
'@ocas/core':
|
||||||
|
specifier: ^0.3.0
|
||||||
|
version: 0.3.0
|
||||||
|
'@ocas/fs':
|
||||||
|
specifier: ^0.3.0
|
||||||
|
version: 0.3.0
|
||||||
|
'@united-workforce/protocol':
|
||||||
|
specifier: workspace:^
|
||||||
|
version: link:../protocol
|
||||||
|
'@united-workforce/util':
|
||||||
|
specifier: workspace:^
|
||||||
|
version: link:../util
|
||||||
|
commander:
|
||||||
|
specifier: ^14.0.3
|
||||||
|
version: 14.0.3
|
||||||
|
yaml:
|
||||||
|
specifier: ^2.9.0
|
||||||
|
version: 2.9.0
|
||||||
|
devDependencies:
|
||||||
|
typescript:
|
||||||
|
specifier: ^5.8.3
|
||||||
|
version: 5.9.3
|
||||||
|
|
||||||
packages/protocol:
|
packages/protocol:
|
||||||
dependencies:
|
dependencies:
|
||||||
'@ocas/core':
|
'@ocas/core':
|
||||||
|
|||||||
+2
-1
@@ -25,6 +25,7 @@
|
|||||||
{ "path": "packages/agent-builtin" },
|
{ "path": "packages/agent-builtin" },
|
||||||
{ "path": "packages/agent-mock" },
|
{ "path": "packages/agent-mock" },
|
||||||
{ "path": "packages/agent-claude-code" },
|
{ "path": "packages/agent-claude-code" },
|
||||||
{ "path": "packages/cli" }
|
{ "path": "packages/cli" },
|
||||||
|
{ "path": "packages/eval" }
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user