feat: eval run command — prepare, execute, collect pipeline
CI / check (pull_request) Successful in 1m45s
CI / check (pull_request) Successful in 1m45s
Implement the uwf-eval run <task-dir> command with 3-phase pipeline: - prepare: read task.yaml, copy fixture/ to temp workdir - execute: shell out to uwf thread start + exec - collect: run judges, compute weighted score, store CAS node, set @uwf/eval/<task>/latest variable Changes: - src/runner/ — types, prepare, execute, collect, index - src/storage/store.ts — createEvalStore(), setEvalLatest() - src/commands/run.ts — full pipeline wiring with --agent/--model/--count - 9 new tests (prepare + collect + weighted scoring) Builtin judges return placeholder score 0 (Phase 1c). Refs #70
This commit is contained in:
@@ -0,0 +1,157 @@
|
|||||||
|
import { bootstrap, createMemoryStore } from "@ocas/core";
|
||||||
|
import { describe, expect, test } from "vitest";
|
||||||
|
import type { JudgeRunner } from "../src/runner/index.js";
|
||||||
|
import { collect, computeOverall } from "../src/runner/index.js";
|
||||||
|
import type { EvalRunConfig, EvalStore } from "../src/storage/index.js";
|
||||||
|
import type { JudgeEntry, TaskManifest } from "../src/task/index.js";
|
||||||
|
|
||||||
|
function makeJudge(name: string, weight: number, builtin: boolean): JudgeEntry {
|
||||||
|
return {
|
||||||
|
name,
|
||||||
|
weight,
|
||||||
|
builtin,
|
||||||
|
entry: builtin ? null : `dist/judges/${name}.js`,
|
||||||
|
schema: null,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function makeManifest(judges: JudgeEntry[]): TaskManifest {
|
||||||
|
return {
|
||||||
|
name: "fix-off-by-one",
|
||||||
|
description: "test task",
|
||||||
|
workflow: "solve-issue",
|
||||||
|
prompt: "Fix the bug",
|
||||||
|
limits: { maxSteps: 10, timeoutMinutes: 30 },
|
||||||
|
judges,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
function makeEvalStore(): EvalStore {
|
||||||
|
const store = createMemoryStore();
|
||||||
|
bootstrap(store);
|
||||||
|
return { store, varStore: store.var };
|
||||||
|
}
|
||||||
|
|
||||||
|
const CONFIG: EvalRunConfig = {
|
||||||
|
agent: "hermes",
|
||||||
|
model: "claude-sonnet-4",
|
||||||
|
engineVersion: "test",
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Returns a fixed score per judge name. */
|
||||||
|
function scriptedRunner(scores: Record<string, number>): JudgeRunner {
|
||||||
|
return async (_taskDir, _workDir, _threadId, judge) => ({
|
||||||
|
score: scores[judge.name] ?? 0,
|
||||||
|
data: { judged: judge.name },
|
||||||
|
schema: { type: "object" },
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
describe("computeOverall", () => {
|
||||||
|
test("computes the weighted average correctly", () => {
|
||||||
|
const overall = computeOverall([
|
||||||
|
{ score: 0.8, weight: 0.3 },
|
||||||
|
{ score: 0.6, weight: 0.3 },
|
||||||
|
{ score: 1.0, weight: 0.4 },
|
||||||
|
]);
|
||||||
|
// 0.24 + 0.18 + 0.4 = 0.82
|
||||||
|
expect(overall).toBeCloseTo(0.82, 10);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("a weight-0 judge does not affect the result", () => {
|
||||||
|
const withInformational = computeOverall([
|
||||||
|
{ score: 1.0, weight: 1.0 },
|
||||||
|
{ score: 0.0, weight: 0.0 },
|
||||||
|
]);
|
||||||
|
expect(withInformational).toBe(1.0);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("returns 0 when total weight is 0", () => {
|
||||||
|
expect(computeOverall([{ score: 0.5, weight: 0 }])).toBe(0);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
describe("collect", () => {
|
||||||
|
test("computes weighted score correctly across judges", async () => {
|
||||||
|
const evalStore = makeEvalStore();
|
||||||
|
const manifest = makeManifest([
|
||||||
|
makeJudge("test-pass", 0.6, false),
|
||||||
|
makeJudge("code-quality", 0.4, false),
|
||||||
|
]);
|
||||||
|
const runJudge = scriptedRunner({ "test-pass": 1.0, "code-quality": 0.5 });
|
||||||
|
|
||||||
|
const result = await collect(
|
||||||
|
{
|
||||||
|
evalStore,
|
||||||
|
taskDir: "/tmp/task",
|
||||||
|
workDir: "/tmp/work",
|
||||||
|
threadId: "THREAD123",
|
||||||
|
manifest,
|
||||||
|
config: CONFIG,
|
||||||
|
},
|
||||||
|
runJudge,
|
||||||
|
);
|
||||||
|
|
||||||
|
// 1.0 * 0.6 + 0.5 * 0.4 = 0.8
|
||||||
|
expect(result.overall).toBeCloseTo(0.8, 10);
|
||||||
|
expect(result.runHash).toBeTruthy();
|
||||||
|
expect(result.judges).toHaveLength(2);
|
||||||
|
expect(result.judges[0]).toEqual({ name: "test-pass", score: 1.0, weight: 0.6 });
|
||||||
|
|
||||||
|
const latest = evalStore.varStore.list({
|
||||||
|
exactName: "@uwf/eval/fix-off-by-one/latest",
|
||||||
|
});
|
||||||
|
expect(latest[0]?.value).toBe(result.runHash);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("handles a judge with weight 0 (informational)", async () => {
|
||||||
|
const evalStore = makeEvalStore();
|
||||||
|
const manifest = makeManifest([
|
||||||
|
makeJudge("test-pass", 1.0, false),
|
||||||
|
makeJudge("token-stats", 0, true),
|
||||||
|
]);
|
||||||
|
// token-stats is builtin → default runner would score 0; give scripted score
|
||||||
|
// that would skew the result if it were counted.
|
||||||
|
const runJudge = scriptedRunner({ "test-pass": 0.5, "token-stats": 1.0 });
|
||||||
|
|
||||||
|
const result = await collect(
|
||||||
|
{
|
||||||
|
evalStore,
|
||||||
|
taskDir: "/tmp/task",
|
||||||
|
workDir: "/tmp/work",
|
||||||
|
threadId: "THREAD123",
|
||||||
|
manifest,
|
||||||
|
config: CONFIG,
|
||||||
|
},
|
||||||
|
runJudge,
|
||||||
|
);
|
||||||
|
|
||||||
|
// Only test-pass (weight 1.0) counts → overall = 0.5
|
||||||
|
expect(result.overall).toBeCloseTo(0.5, 10);
|
||||||
|
expect(result.judges).toHaveLength(2);
|
||||||
|
const tokenStats = result.judges.find((j) => j.name === "token-stats");
|
||||||
|
expect(tokenStats?.weight).toBe(0);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("builtin judges are skipped with placeholder score 0", async () => {
|
||||||
|
const evalStore = makeEvalStore();
|
||||||
|
const manifest = makeManifest([makeJudge("frontmatter-compliance", 1.0, true)]);
|
||||||
|
|
||||||
|
// Use the default runner (no injected runner) → builtin skipped → score 0.
|
||||||
|
const result = await collect({
|
||||||
|
evalStore,
|
||||||
|
taskDir: "/tmp/task",
|
||||||
|
workDir: "/tmp/work",
|
||||||
|
threadId: "THREAD123",
|
||||||
|
manifest,
|
||||||
|
config: CONFIG,
|
||||||
|
});
|
||||||
|
|
||||||
|
expect(result.overall).toBe(0);
|
||||||
|
expect(result.judges[0]).toEqual({
|
||||||
|
name: "frontmatter-compliance",
|
||||||
|
score: 0,
|
||||||
|
weight: 1.0,
|
||||||
|
});
|
||||||
|
});
|
||||||
|
});
|
||||||
@@ -0,0 +1,74 @@
|
|||||||
|
import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
|
||||||
|
import { tmpdir } from "node:os";
|
||||||
|
import { join } from "node:path";
|
||||||
|
|
||||||
|
import { afterEach, beforeEach, describe, expect, test } from "vitest";
|
||||||
|
|
||||||
|
import { prepare } from "../src/runner/index.js";
|
||||||
|
|
||||||
|
const TASK_YAML = `
|
||||||
|
name: fix-off-by-one
|
||||||
|
description: Fix an off-by-one error
|
||||||
|
workflow: solve-issue
|
||||||
|
prompt: "Fix the bug"
|
||||||
|
limits:
|
||||||
|
maxSteps: 12
|
||||||
|
timeoutMinutes: 20
|
||||||
|
judges:
|
||||||
|
- name: frontmatter-compliance
|
||||||
|
weight: 0.5
|
||||||
|
builtin: true
|
||||||
|
- name: test-pass
|
||||||
|
weight: 0.5
|
||||||
|
entry: dist/judges/test-pass.js
|
||||||
|
`;
|
||||||
|
|
||||||
|
let taskDir: string;
|
||||||
|
|
||||||
|
beforeEach(async () => {
|
||||||
|
taskDir = await mkdtemp(join(tmpdir(), "uwf-eval-task-"));
|
||||||
|
await writeFile(join(taskDir, "task.yaml"), TASK_YAML, "utf8");
|
||||||
|
const fixtureDir = join(taskDir, "fixture");
|
||||||
|
await mkdir(join(fixtureDir, "src"), { recursive: true });
|
||||||
|
await writeFile(join(fixtureDir, "src", "calc.ts"), "export const add = (a, b) => a + b + 1;\n");
|
||||||
|
await writeFile(join(fixtureDir, "package.json"), '{ "name": "fixture" }\n');
|
||||||
|
});
|
||||||
|
|
||||||
|
afterEach(async () => {
|
||||||
|
await rm(taskDir, { recursive: true, force: true });
|
||||||
|
});
|
||||||
|
|
||||||
|
describe("prepare", () => {
|
||||||
|
test("returns the parsed manifest", async () => {
|
||||||
|
const result = await prepare(taskDir);
|
||||||
|
expect(result.taskDir).toBe(taskDir);
|
||||||
|
expect(result.manifest.name).toBe("fix-off-by-one");
|
||||||
|
expect(result.manifest.workflow).toBe("solve-issue");
|
||||||
|
expect(result.manifest.limits.maxSteps).toBe(12);
|
||||||
|
expect(result.manifest.judges).toHaveLength(2);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("copies fixture into a fresh temp work dir", async () => {
|
||||||
|
const result = await prepare(taskDir);
|
||||||
|
expect(result.workDir).not.toBe(taskDir);
|
||||||
|
expect(result.workDir.startsWith(tmpdir())).toBe(true);
|
||||||
|
|
||||||
|
const calc = await readFile(join(result.workDir, "src", "calc.ts"), "utf8");
|
||||||
|
expect(calc).toContain("export const add");
|
||||||
|
const pkg = await readFile(join(result.workDir, "package.json"), "utf8");
|
||||||
|
expect(pkg).toContain("fixture");
|
||||||
|
|
||||||
|
await rm(result.workDir, { recursive: true, force: true });
|
||||||
|
});
|
||||||
|
|
||||||
|
test("creates an empty work dir when no fixture/ exists", async () => {
|
||||||
|
const noFixtureDir = await mkdtemp(join(tmpdir(), "uwf-eval-nofix-"));
|
||||||
|
await writeFile(join(noFixtureDir, "task.yaml"), TASK_YAML, "utf8");
|
||||||
|
|
||||||
|
const result = await prepare(noFixtureDir);
|
||||||
|
expect(result.workDir.startsWith(tmpdir())).toBe(true);
|
||||||
|
|
||||||
|
await rm(noFixtureDir, { recursive: true, force: true });
|
||||||
|
await rm(result.workDir, { recursive: true, force: true });
|
||||||
|
});
|
||||||
|
});
|
||||||
@@ -1,4 +1,52 @@
|
|||||||
|
import { resolve } from "node:path";
|
||||||
|
|
||||||
import type { Command } from "commander";
|
import type { Command } from "commander";
|
||||||
|
import type { RunResult } from "../runner/index.js";
|
||||||
|
import { collect, execute, getEngineVersion, prepare } from "../runner/index.js";
|
||||||
|
import type { EvalRunConfig } from "../storage/index.js";
|
||||||
|
import { createEvalStore } from "../storage/index.js";
|
||||||
|
|
||||||
|
type RunCliOptions = {
|
||||||
|
agent: string;
|
||||||
|
model: string | undefined;
|
||||||
|
count: string;
|
||||||
|
};
|
||||||
|
|
||||||
|
async function runOnce(
|
||||||
|
taskDir: string,
|
||||||
|
agent: string,
|
||||||
|
model: string,
|
||||||
|
engineVersion: string,
|
||||||
|
): Promise<RunResult> {
|
||||||
|
const prepared = await prepare(taskDir);
|
||||||
|
const { manifest, workDir } = prepared;
|
||||||
|
|
||||||
|
const { threadId } = await execute({
|
||||||
|
workDir,
|
||||||
|
workflow: manifest.workflow,
|
||||||
|
prompt: manifest.prompt,
|
||||||
|
agent,
|
||||||
|
maxSteps: manifest.limits.maxSteps,
|
||||||
|
});
|
||||||
|
|
||||||
|
const evalStore = await createEvalStore();
|
||||||
|
const config: EvalRunConfig = { agent, model, engineVersion };
|
||||||
|
const collected = await collect({
|
||||||
|
evalStore,
|
||||||
|
taskDir: prepared.taskDir,
|
||||||
|
workDir,
|
||||||
|
threadId,
|
||||||
|
manifest,
|
||||||
|
config,
|
||||||
|
});
|
||||||
|
|
||||||
|
return {
|
||||||
|
runHash: collected.runHash,
|
||||||
|
overall: collected.overall,
|
||||||
|
task: manifest.name,
|
||||||
|
judges: collected.judges,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
export function registerRunCommand(program: Command): void {
|
export function registerRunCommand(program: Command): void {
|
||||||
program
|
program
|
||||||
@@ -7,8 +55,30 @@ export function registerRunCommand(program: Command): void {
|
|||||||
.option("--agent <name>", "agent adapter to use", "hermes")
|
.option("--agent <name>", "agent adapter to use", "hermes")
|
||||||
.option("--model <model>", "model override")
|
.option("--model <model>", "model override")
|
||||||
.option("--count <n>", "number of eval runs", "1")
|
.option("--count <n>", "number of eval runs", "1")
|
||||||
.action(async (_task: string, _opts: Record<string, unknown>) => {
|
.action(async (task: string, opts: RunCliOptions) => {
|
||||||
process.stderr.write("uwf-eval run: not yet implemented\n");
|
const taskDir = resolve(task);
|
||||||
process.exitCode = 1;
|
const agent = opts.agent;
|
||||||
|
const model = opts.model ?? "";
|
||||||
|
const count = Number.parseInt(opts.count, 10);
|
||||||
|
if (!Number.isInteger(count) || count < 1) {
|
||||||
|
process.stderr.write("--count must be a positive integer\n");
|
||||||
|
process.exitCode = 1;
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
const engineVersion = getEngineVersion();
|
||||||
|
|
||||||
|
try {
|
||||||
|
const results: RunResult[] = [];
|
||||||
|
for (let i = 0; i < count; i++) {
|
||||||
|
results.push(await runOnce(taskDir, agent, model, engineVersion));
|
||||||
|
}
|
||||||
|
const output = count === 1 ? results[0] : results;
|
||||||
|
process.stdout.write(`${JSON.stringify(output)}\n`);
|
||||||
|
} catch (e) {
|
||||||
|
const message = e instanceof Error ? e.message : String(e);
|
||||||
|
process.stderr.write(`${message}\n`);
|
||||||
|
process.exitCode = 1;
|
||||||
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -1,15 +1,34 @@
|
|||||||
// Task manifest
|
|
||||||
|
|
||||||
// Judge types
|
// Judge types
|
||||||
export type { JudgeInput, JudgeOutput } from "./judge/index.js";
|
export type { JudgeInput, JudgeOutput } from "./judge/index.js";
|
||||||
export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload } from "./storage/index.js";
|
export type {
|
||||||
|
CollectInput,
|
||||||
|
CollectResult,
|
||||||
|
ExecuteInput,
|
||||||
|
ExecuteResult,
|
||||||
|
JudgeRunner,
|
||||||
|
JudgeRunOutput,
|
||||||
|
JudgeSummary,
|
||||||
|
PrepareResult,
|
||||||
|
RunOptions,
|
||||||
|
RunResult,
|
||||||
|
} from "./runner/index.js";
|
||||||
|
// Runner (prepare → execute → collect)
|
||||||
|
export { collect, computeOverall, execute, getEngineVersion, prepare } from "./runner/index.js";
|
||||||
|
export type {
|
||||||
|
EvalJudgeRecord,
|
||||||
|
EvalRunConfig,
|
||||||
|
EvalRunPayload,
|
||||||
|
EvalStore,
|
||||||
|
} from "./storage/index.js";
|
||||||
// Storage schemas and types
|
// Storage schemas and types
|
||||||
export {
|
export {
|
||||||
|
createEvalStore,
|
||||||
EVAL_JUDGE_FRONTMATTER_SCHEMA,
|
EVAL_JUDGE_FRONTMATTER_SCHEMA,
|
||||||
EVAL_JUDGE_HALLUCINATION_SCHEMA,
|
EVAL_JUDGE_HALLUCINATION_SCHEMA,
|
||||||
EVAL_JUDGE_TOKEN_STATS_SCHEMA,
|
EVAL_JUDGE_TOKEN_STATS_SCHEMA,
|
||||||
EVAL_JUDGE_UPSTREAM_SCHEMA,
|
EVAL_JUDGE_UPSTREAM_SCHEMA,
|
||||||
EVAL_RUN_SCHEMA,
|
EVAL_RUN_SCHEMA,
|
||||||
|
setEvalLatest,
|
||||||
} from "./storage/index.js";
|
} from "./storage/index.js";
|
||||||
export type { JudgeEntry, TaskLimits, TaskManifest } from "./task/index.js";
|
export type { JudgeEntry, TaskLimits, TaskManifest } from "./task/index.js";
|
||||||
export { loadTaskManifest, parseTaskManifest } from "./task/index.js";
|
export { loadTaskManifest, parseTaskManifest } from "./task/index.js";
|
||||||
|
|||||||
@@ -0,0 +1,150 @@
|
|||||||
|
import { execFileSync } from "node:child_process";
|
||||||
|
import { readFile } from "node:fs/promises";
|
||||||
|
import { resolve } from "node:path";
|
||||||
|
|
||||||
|
import type { JSONSchema, Store } from "@ocas/core";
|
||||||
|
import { putSchema } from "@ocas/core";
|
||||||
|
import type { CasRef } from "@united-workforce/protocol";
|
||||||
|
import { createLogger } from "@united-workforce/util";
|
||||||
|
|
||||||
|
import type { JudgeOutput } from "../judge/index.js";
|
||||||
|
import type { EvalJudgeRecord, EvalRunPayload } from "../storage/index.js";
|
||||||
|
import { EVAL_RUN_SCHEMA, setEvalLatest } from "../storage/index.js";
|
||||||
|
import type { JudgeEntry } from "../task/index.js";
|
||||||
|
import type {
|
||||||
|
CollectInput,
|
||||||
|
CollectResult,
|
||||||
|
JudgeRunner,
|
||||||
|
JudgeRunOutput,
|
||||||
|
JudgeSummary,
|
||||||
|
} from "./types.js";
|
||||||
|
|
||||||
|
const log = createLogger({ sink: { kind: "stderr" } });
|
||||||
|
|
||||||
|
const LOG_JUDGE = "CT6N3P2K";
|
||||||
|
const LOG_STORED = "CT9V2Q7M";
|
||||||
|
|
||||||
|
/** Permissive schema for judge data without a dedicated schema (e.g. builtin placeholders). */
|
||||||
|
const GENERIC_DATA_SCHEMA: JSONSchema = { type: "object" };
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Compute the weighted overall score. Judges with weight 0 are informational
|
||||||
|
* and do not affect the result (they contribute 0 to both numerator and
|
||||||
|
* denominator). Returns 0 when total weight is 0.
|
||||||
|
*/
|
||||||
|
export function computeOverall(judges: ReadonlyArray<{ score: number; weight: number }>): number {
|
||||||
|
let totalWeight = 0;
|
||||||
|
let weighted = 0;
|
||||||
|
for (const judge of judges) {
|
||||||
|
totalWeight += judge.weight;
|
||||||
|
weighted += judge.score * judge.weight;
|
||||||
|
}
|
||||||
|
return totalWeight > 0 ? weighted / totalWeight : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Run a task-provided judge script: `node <entry> <cwd> <threadId>`. */
|
||||||
|
async function runTaskJudge(
|
||||||
|
taskDir: string,
|
||||||
|
workDir: string,
|
||||||
|
threadId: string,
|
||||||
|
judge: JudgeEntry,
|
||||||
|
): Promise<JudgeRunOutput> {
|
||||||
|
if (judge.entry === null) {
|
||||||
|
throw new Error(`judge "${judge.name}" is not builtin but has no entry`);
|
||||||
|
}
|
||||||
|
const entryPath = resolve(taskDir, judge.entry);
|
||||||
|
|
||||||
|
let stdout: string;
|
||||||
|
try {
|
||||||
|
stdout = execFileSync("node", [entryPath, workDir, threadId], {
|
||||||
|
encoding: "utf8",
|
||||||
|
stdio: ["ignore", "pipe", "pipe"],
|
||||||
|
maxBuffer: 50 * 1024 * 1024,
|
||||||
|
});
|
||||||
|
} catch (e) {
|
||||||
|
const message = e instanceof Error ? e.message : String(e);
|
||||||
|
throw new Error(`judge "${judge.name}" failed: ${message}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const line = stdout.trim().split("\n").pop()?.trim() ?? "";
|
||||||
|
let parsed: unknown;
|
||||||
|
try {
|
||||||
|
parsed = JSON.parse(line);
|
||||||
|
} catch {
|
||||||
|
throw new Error(`judge "${judge.name}" stdout is not valid JSON: ${line || "(empty)"}`);
|
||||||
|
}
|
||||||
|
const output = parsed as JudgeOutput;
|
||||||
|
if (typeof output.score !== "number") {
|
||||||
|
throw new Error(`judge "${judge.name}" output missing numeric score`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const schema =
|
||||||
|
judge.schema !== null ? await loadSchema(resolve(taskDir, judge.schema)) : GENERIC_DATA_SCHEMA;
|
||||||
|
return { score: output.score, data: output.data, schema };
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Load and parse an OCAS JSON Schema file. */
|
||||||
|
async function loadSchema(path: string): Promise<JSONSchema> {
|
||||||
|
const text = await readFile(path, "utf8");
|
||||||
|
return JSON.parse(text) as JSONSchema;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Default judge runner. Builtin judges are skipped for now (placeholder score 0
|
||||||
|
* with empty data); task judges spawn their entry script.
|
||||||
|
*/
|
||||||
|
const defaultJudgeRunner: JudgeRunner = async (taskDir, workDir, threadId, judge) => {
|
||||||
|
if (judge.builtin) {
|
||||||
|
return { score: 0, data: {}, schema: GENERIC_DATA_SCHEMA };
|
||||||
|
}
|
||||||
|
return runTaskJudge(taskDir, workDir, threadId, judge);
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Persist judge data to CAS under its schema and return the CAS hash. */
|
||||||
|
async function storeJudgeData(store: Store, schema: JSONSchema, data: unknown): Promise<CasRef> {
|
||||||
|
const schemaHash = await putSchema(store, schema);
|
||||||
|
return (await store.cas.put(schemaHash, data)) as CasRef;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Run all judges, store their data and the overall eval-run record in CAS, then
|
||||||
|
* index the run under `@uwf/eval/<task>/latest`.
|
||||||
|
*/
|
||||||
|
export async function collect(
|
||||||
|
input: CollectInput,
|
||||||
|
runJudge: JudgeRunner = defaultJudgeRunner,
|
||||||
|
): Promise<CollectResult> {
|
||||||
|
const { evalStore, taskDir, workDir, threadId, manifest, config } = input;
|
||||||
|
const { store, varStore } = evalStore;
|
||||||
|
|
||||||
|
const records: EvalJudgeRecord[] = [];
|
||||||
|
for (const judge of manifest.judges) {
|
||||||
|
const result = await runJudge(taskDir, workDir, threadId, judge);
|
||||||
|
const dataHash = await storeJudgeData(store, result.schema, result.data);
|
||||||
|
records.push({ name: judge.name, score: result.score, weight: judge.weight, dataHash });
|
||||||
|
log(LOG_JUDGE, `judge=${judge.name} score=${result.score} weight=${judge.weight}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const overall = computeOverall(records);
|
||||||
|
|
||||||
|
const payload: EvalRunPayload = {
|
||||||
|
task: manifest.name,
|
||||||
|
config,
|
||||||
|
threadId,
|
||||||
|
judges: records,
|
||||||
|
overall,
|
||||||
|
timestamp: Date.now(),
|
||||||
|
};
|
||||||
|
|
||||||
|
const schemaHash = await putSchema(store, EVAL_RUN_SCHEMA);
|
||||||
|
const runHash = (await store.cas.put(schemaHash, payload)) as string;
|
||||||
|
setEvalLatest(varStore, manifest.name, runHash);
|
||||||
|
log(LOG_STORED, `stored eval-run task=${manifest.name} hash=${runHash} overall=${overall}`);
|
||||||
|
|
||||||
|
const judges: JudgeSummary[] = records.map((r) => ({
|
||||||
|
name: r.name,
|
||||||
|
score: r.score,
|
||||||
|
weight: r.weight,
|
||||||
|
}));
|
||||||
|
return { runHash, overall, judges };
|
||||||
|
}
|
||||||
@@ -0,0 +1,87 @@
|
|||||||
|
import { execFileSync } from "node:child_process";
|
||||||
|
|
||||||
|
import { createLogger } from "@united-workforce/util";
|
||||||
|
|
||||||
|
import type { ExecuteInput, ExecuteResult } from "./types.js";
|
||||||
|
|
||||||
|
const log = createLogger({ sink: { kind: "stderr" } });
|
||||||
|
|
||||||
|
const LOG_START = "EX5M2T9V";
|
||||||
|
const LOG_EXEC = "EX7Q4K2N";
|
||||||
|
|
||||||
|
/** Resolve the uwf CLI binary. Override with `UWF_BIN` for testing. */
|
||||||
|
function uwfBin(): string {
|
||||||
|
const override = process.env.UWF_BIN;
|
||||||
|
return override !== undefined && override !== "" ? override : "uwf";
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Run a uwf subcommand and return trimmed stdout. */
|
||||||
|
function runUwf(args: string[], cwd: string): string {
|
||||||
|
try {
|
||||||
|
return execFileSync(uwfBin(), args, {
|
||||||
|
encoding: "utf8",
|
||||||
|
stdio: ["ignore", "pipe", "pipe"],
|
||||||
|
maxBuffer: 50 * 1024 * 1024,
|
||||||
|
cwd,
|
||||||
|
}).trim();
|
||||||
|
} catch (e) {
|
||||||
|
const err = e as NodeJS.ErrnoException & { stderr?: Buffer | string | null };
|
||||||
|
const stderr =
|
||||||
|
err.stderr == null
|
||||||
|
? ""
|
||||||
|
: typeof err.stderr === "string"
|
||||||
|
? err.stderr
|
||||||
|
: err.stderr.toString("utf8");
|
||||||
|
const detail = stderr.trim() !== "" ? `: ${stderr.trim()}` : "";
|
||||||
|
throw new Error(`uwf ${args[0]} ${args[1]} failed${detail}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Parse the thread ID from `uwf thread start` JSON output (`{ workflow, thread }`). */
|
||||||
|
function parseThreadId(stdout: string): string {
|
||||||
|
let parsed: unknown;
|
||||||
|
try {
|
||||||
|
parsed = JSON.parse(stdout);
|
||||||
|
} catch {
|
||||||
|
throw new Error(`uwf thread start did not emit valid JSON: ${stdout || "(empty)"}`);
|
||||||
|
}
|
||||||
|
const obj = parsed as Record<string, unknown>;
|
||||||
|
const thread = obj.thread;
|
||||||
|
if (typeof thread !== "string" || thread === "") {
|
||||||
|
throw new Error(`uwf thread start output missing thread id: ${stdout}`);
|
||||||
|
}
|
||||||
|
return thread;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Execute a workflow: create a thread, then run it for up to `maxSteps` steps.
|
||||||
|
* Shells out to the uwf CLI rather than importing it directly.
|
||||||
|
*/
|
||||||
|
export async function execute(input: ExecuteInput): Promise<ExecuteResult> {
|
||||||
|
const startOut = runUwf(
|
||||||
|
["thread", "start", input.workflow, "-p", input.prompt, "--cwd", input.workDir],
|
||||||
|
input.workDir,
|
||||||
|
);
|
||||||
|
const threadId = parseThreadId(startOut);
|
||||||
|
log(LOG_START, `thread started thread=${threadId} workflow=${input.workflow}`);
|
||||||
|
|
||||||
|
runUwf(
|
||||||
|
["thread", "exec", threadId, "--agent", input.agent, "-c", String(input.maxSteps)],
|
||||||
|
input.workDir,
|
||||||
|
);
|
||||||
|
log(LOG_EXEC, `thread executed thread=${threadId} maxSteps=${input.maxSteps}`);
|
||||||
|
|
||||||
|
return { threadId };
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Best-effort lookup of the uwf engine version (`uwf -V`); "unknown" on failure. */
|
||||||
|
export function getEngineVersion(): string {
|
||||||
|
try {
|
||||||
|
return execFileSync(uwfBin(), ["-V"], {
|
||||||
|
encoding: "utf8",
|
||||||
|
stdio: ["ignore", "pipe", "ignore"],
|
||||||
|
}).trim();
|
||||||
|
} catch {
|
||||||
|
return "unknown";
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -0,0 +1,15 @@
|
|||||||
|
export { collect, computeOverall } from "./collect.js";
|
||||||
|
export { execute, getEngineVersion } from "./execute.js";
|
||||||
|
export { prepare } from "./prepare.js";
|
||||||
|
export type {
|
||||||
|
CollectInput,
|
||||||
|
CollectResult,
|
||||||
|
ExecuteInput,
|
||||||
|
ExecuteResult,
|
||||||
|
JudgeRunner,
|
||||||
|
JudgeRunOutput,
|
||||||
|
JudgeSummary,
|
||||||
|
PrepareResult,
|
||||||
|
RunOptions,
|
||||||
|
RunResult,
|
||||||
|
} from "./types.js";
|
||||||
@@ -0,0 +1,45 @@
|
|||||||
|
import { access, cp, mkdir, mkdtemp } from "node:fs/promises";
|
||||||
|
import { tmpdir } from "node:os";
|
||||||
|
import { join } from "node:path";
|
||||||
|
|
||||||
|
import { createLogger } from "@united-workforce/util";
|
||||||
|
|
||||||
|
import { loadTaskManifest } from "../task/index.js";
|
||||||
|
import type { PrepareResult } from "./types.js";
|
||||||
|
|
||||||
|
const log = createLogger({ sink: { kind: "stderr" } });
|
||||||
|
|
||||||
|
const LOG_PREPARE = "PRE4K2NQ";
|
||||||
|
const LOG_FIXTURE = "PRE7M3VX";
|
||||||
|
|
||||||
|
/** Check whether a path exists. */
|
||||||
|
async function pathExists(path: string): Promise<boolean> {
|
||||||
|
try {
|
||||||
|
await access(path);
|
||||||
|
return true;
|
||||||
|
} catch {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Prepare a task for execution: read its manifest and copy the fixture
|
||||||
|
* directory into a fresh temp working directory.
|
||||||
|
*/
|
||||||
|
export async function prepare(taskDir: string): Promise<PrepareResult> {
|
||||||
|
const manifest = await loadTaskManifest(taskDir);
|
||||||
|
log(LOG_PREPARE, `loaded task manifest name=${manifest.name} workflow=${manifest.workflow}`);
|
||||||
|
|
||||||
|
const workDir = await mkdtemp(join(tmpdir(), "uwf-eval-"));
|
||||||
|
|
||||||
|
const fixtureDir = join(taskDir, "fixture");
|
||||||
|
if (await pathExists(fixtureDir)) {
|
||||||
|
await cp(fixtureDir, workDir, { recursive: true });
|
||||||
|
log(LOG_FIXTURE, `copied fixture into workDir=${workDir}`);
|
||||||
|
} else {
|
||||||
|
await mkdir(workDir, { recursive: true });
|
||||||
|
log(LOG_FIXTURE, `no fixture/ found, using empty workDir=${workDir}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
return { taskDir, workDir, manifest };
|
||||||
|
}
|
||||||
@@ -0,0 +1,85 @@
|
|||||||
|
import type { JSONSchema } from "@ocas/core";
|
||||||
|
|
||||||
|
import type { EvalRunConfig, EvalStore } from "../storage/index.js";
|
||||||
|
import type { JudgeEntry, TaskManifest } from "../task/index.js";
|
||||||
|
|
||||||
|
/** Result of the prepare phase: task dir, temp working dir, parsed manifest. */
|
||||||
|
export type PrepareResult = {
|
||||||
|
taskDir: string;
|
||||||
|
workDir: string;
|
||||||
|
manifest: TaskManifest;
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Input to the execute phase. */
|
||||||
|
export type ExecuteInput = {
|
||||||
|
/** Working directory the workflow runs in (the prepared temp dir). */
|
||||||
|
workDir: string;
|
||||||
|
/** Workflow name or path (from task.yaml). */
|
||||||
|
workflow: string;
|
||||||
|
/** Initial prompt for the thread. */
|
||||||
|
prompt: string;
|
||||||
|
/** Agent adapter to use. */
|
||||||
|
agent: string;
|
||||||
|
/** Maximum number of steps to execute. */
|
||||||
|
maxSteps: number;
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Result of the execute phase. */
|
||||||
|
export type ExecuteResult = {
|
||||||
|
threadId: string;
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Output produced by running a single judge. */
|
||||||
|
export type JudgeRunOutput = {
|
||||||
|
score: number;
|
||||||
|
data: unknown;
|
||||||
|
/** Schema describing `data`, used when persisting to CAS. */
|
||||||
|
schema: JSONSchema;
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Pluggable judge execution strategy (injectable for testing). */
|
||||||
|
export type JudgeRunner = (
|
||||||
|
taskDir: string,
|
||||||
|
workDir: string,
|
||||||
|
threadId: string,
|
||||||
|
judge: JudgeEntry,
|
||||||
|
) => Promise<JudgeRunOutput>;
|
||||||
|
|
||||||
|
/** Input to the collect phase. */
|
||||||
|
export type CollectInput = {
|
||||||
|
evalStore: EvalStore;
|
||||||
|
taskDir: string;
|
||||||
|
workDir: string;
|
||||||
|
threadId: string;
|
||||||
|
manifest: TaskManifest;
|
||||||
|
config: EvalRunConfig;
|
||||||
|
};
|
||||||
|
|
||||||
|
/** A single judge's summarized result in the run output. */
|
||||||
|
export type JudgeSummary = {
|
||||||
|
name: string;
|
||||||
|
score: number;
|
||||||
|
weight: number;
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Result of the collect phase. */
|
||||||
|
export type CollectResult = {
|
||||||
|
runHash: string;
|
||||||
|
overall: number;
|
||||||
|
judges: JudgeSummary[];
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Options for a full eval run (from CLI flags). */
|
||||||
|
export type RunOptions = {
|
||||||
|
agent: string;
|
||||||
|
model: string;
|
||||||
|
count: number;
|
||||||
|
};
|
||||||
|
|
||||||
|
/** Final result of a full eval run. */
|
||||||
|
export type RunResult = {
|
||||||
|
runHash: string;
|
||||||
|
overall: number;
|
||||||
|
task: string;
|
||||||
|
judges: JudgeSummary[];
|
||||||
|
};
|
||||||
@@ -5,4 +5,5 @@ export {
|
|||||||
EVAL_JUDGE_UPSTREAM_SCHEMA,
|
EVAL_JUDGE_UPSTREAM_SCHEMA,
|
||||||
EVAL_RUN_SCHEMA,
|
EVAL_RUN_SCHEMA,
|
||||||
} from "./schemas.js";
|
} from "./schemas.js";
|
||||||
export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload } from "./types.js";
|
export { createEvalStore, setEvalLatest } from "./store.js";
|
||||||
|
export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload, EvalStore } from "./types.js";
|
||||||
|
|||||||
@@ -0,0 +1,42 @@
|
|||||||
|
import { mkdir } from "node:fs/promises";
|
||||||
|
import { homedir } from "node:os";
|
||||||
|
import { join } from "node:path";
|
||||||
|
import type { VarStore } from "@ocas/core";
|
||||||
|
import { bootstrap, type Store } from "@ocas/core";
|
||||||
|
import { createFsStore, createSqliteVarStore } from "@ocas/fs";
|
||||||
|
|
||||||
|
import type { EvalStore } from "./types.js";
|
||||||
|
|
||||||
|
/** Variable name prefix for eval run pointers (`@uwf/eval/<task>/latest`). */
|
||||||
|
const EVAL_VAR_PREFIX = "@uwf/eval/";
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Resolve the global CAS directory shared by all uwf and ocas tools.
|
||||||
|
* Priority: `OCAS_HOME` → default ~/.ocas (matches uwf CLI's getGlobalCasDir).
|
||||||
|
*/
|
||||||
|
function getGlobalCasDir(): string {
|
||||||
|
const primary = process.env.OCAS_HOME;
|
||||||
|
if (primary !== undefined && primary !== "") {
|
||||||
|
return primary;
|
||||||
|
}
|
||||||
|
return join(homedir(), ".ocas");
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Open the unified OCAS store on the filesystem.
|
||||||
|
* Shares the same CAS + variable backend as the uwf CLI.
|
||||||
|
*/
|
||||||
|
export async function createEvalStore(): Promise<EvalStore> {
|
||||||
|
const casDir = getGlobalCasDir();
|
||||||
|
await mkdir(casDir, { recursive: true });
|
||||||
|
const cas = createFsStore(casDir);
|
||||||
|
const { var: varStore, tag } = createSqliteVarStore(join(casDir, "vars"), cas);
|
||||||
|
const store: Store = { cas, var: varStore, tag };
|
||||||
|
bootstrap(store);
|
||||||
|
return { store, varStore };
|
||||||
|
}
|
||||||
|
|
||||||
|
/** Set the `@uwf/eval/<task>/latest` variable to point at a run hash. */
|
||||||
|
export function setEvalLatest(varStore: VarStore, taskName: string, runHash: string): void {
|
||||||
|
varStore.set(`${EVAL_VAR_PREFIX}${taskName}/latest`, runHash);
|
||||||
|
}
|
||||||
@@ -1,5 +1,12 @@
|
|||||||
|
import type { Store, VarStore } from "@ocas/core";
|
||||||
import type { CasRef } from "@united-workforce/protocol";
|
import type { CasRef } from "@united-workforce/protocol";
|
||||||
|
|
||||||
|
/** Handle to the OCAS store used for eval persistence. */
|
||||||
|
export type EvalStore = {
|
||||||
|
store: Store;
|
||||||
|
varStore: VarStore;
|
||||||
|
};
|
||||||
|
|
||||||
/** A single judge result within an eval run. */
|
/** A single judge result within an eval run. */
|
||||||
export type EvalJudgeRecord = {
|
export type EvalJudgeRecord = {
|
||||||
name: string;
|
name: string;
|
||||||
|
|||||||
Reference in New Issue
Block a user