From fae9e9ed3a7a87e28e3ec40f0c4daa77613f147e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=8F=E6=A9=98?= Date: Thu, 4 Jun 2026 23:59:21 +0000 Subject: [PATCH] =?UTF-8?q?feat:=20eval=20run=20command=20=E2=80=94=20prep?= =?UTF-8?q?are,=20execute,=20collect=20pipeline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement the uwf-eval run command with 3-phase pipeline: - prepare: read task.yaml, copy fixture/ to temp workdir - execute: shell out to uwf thread start + exec - collect: run judges, compute weighted score, store CAS node, set @uwf/eval//latest variable Changes: - src/runner/ — types, prepare, execute, collect, index - src/storage/store.ts — createEvalStore(), setEvalLatest() - src/commands/run.ts — full pipeline wiring with --agent/--model/--count - 9 new tests (prepare + collect + weighted scoring) Builtin judges return placeholder score 0 (Phase 1c). Refs #70 --- packages/eval/__tests__/collect.test.ts | 157 ++++++++++++++++++++++++ packages/eval/__tests__/prepare.test.ts | 74 +++++++++++ packages/eval/src/commands/run.ts | 76 +++++++++++- packages/eval/src/index.ts | 25 +++- packages/eval/src/runner/collect.ts | 150 ++++++++++++++++++++++ packages/eval/src/runner/execute.ts | 87 +++++++++++++ packages/eval/src/runner/index.ts | 15 +++ packages/eval/src/runner/prepare.ts | 45 +++++++ packages/eval/src/runner/types.ts | 85 +++++++++++++ packages/eval/src/storage/index.ts | 3 +- packages/eval/src/storage/store.ts | 42 +++++++ packages/eval/src/storage/types.ts | 7 ++ 12 files changed, 759 insertions(+), 7 deletions(-) create mode 100644 packages/eval/__tests__/collect.test.ts create mode 100644 packages/eval/__tests__/prepare.test.ts create mode 100644 packages/eval/src/runner/collect.ts create mode 100644 packages/eval/src/runner/execute.ts create mode 100644 packages/eval/src/runner/index.ts create mode 100644 packages/eval/src/runner/prepare.ts create mode 100644 packages/eval/src/runner/types.ts create mode 100644 packages/eval/src/storage/store.ts diff --git a/packages/eval/__tests__/collect.test.ts b/packages/eval/__tests__/collect.test.ts new file mode 100644 index 0000000..7622d09 --- /dev/null +++ b/packages/eval/__tests__/collect.test.ts @@ -0,0 +1,157 @@ +import { bootstrap, createMemoryStore } from "@ocas/core"; +import { describe, expect, test } from "vitest"; +import type { JudgeRunner } from "../src/runner/index.js"; +import { collect, computeOverall } from "../src/runner/index.js"; +import type { EvalRunConfig, EvalStore } from "../src/storage/index.js"; +import type { JudgeEntry, TaskManifest } from "../src/task/index.js"; + +function makeJudge(name: string, weight: number, builtin: boolean): JudgeEntry { + return { + name, + weight, + builtin, + entry: builtin ? null : `dist/judges/${name}.js`, + schema: null, + }; +} + +function makeManifest(judges: JudgeEntry[]): TaskManifest { + return { + name: "fix-off-by-one", + description: "test task", + workflow: "solve-issue", + prompt: "Fix the bug", + limits: { maxSteps: 10, timeoutMinutes: 30 }, + judges, + }; +} + +function makeEvalStore(): EvalStore { + const store = createMemoryStore(); + bootstrap(store); + return { store, varStore: store.var }; +} + +const CONFIG: EvalRunConfig = { + agent: "hermes", + model: "claude-sonnet-4", + engineVersion: "test", +}; + +/** Returns a fixed score per judge name. */ +function scriptedRunner(scores: Record): JudgeRunner { + return async (_taskDir, _workDir, _threadId, judge) => ({ + score: scores[judge.name] ?? 0, + data: { judged: judge.name }, + schema: { type: "object" }, + }); +} + +describe("computeOverall", () => { + test("computes the weighted average correctly", () => { + const overall = computeOverall([ + { score: 0.8, weight: 0.3 }, + { score: 0.6, weight: 0.3 }, + { score: 1.0, weight: 0.4 }, + ]); + // 0.24 + 0.18 + 0.4 = 0.82 + expect(overall).toBeCloseTo(0.82, 10); + }); + + test("a weight-0 judge does not affect the result", () => { + const withInformational = computeOverall([ + { score: 1.0, weight: 1.0 }, + { score: 0.0, weight: 0.0 }, + ]); + expect(withInformational).toBe(1.0); + }); + + test("returns 0 when total weight is 0", () => { + expect(computeOverall([{ score: 0.5, weight: 0 }])).toBe(0); + }); +}); + +describe("collect", () => { + test("computes weighted score correctly across judges", async () => { + const evalStore = makeEvalStore(); + const manifest = makeManifest([ + makeJudge("test-pass", 0.6, false), + makeJudge("code-quality", 0.4, false), + ]); + const runJudge = scriptedRunner({ "test-pass": 1.0, "code-quality": 0.5 }); + + const result = await collect( + { + evalStore, + taskDir: "/tmp/task", + workDir: "/tmp/work", + threadId: "THREAD123", + manifest, + config: CONFIG, + }, + runJudge, + ); + + // 1.0 * 0.6 + 0.5 * 0.4 = 0.8 + expect(result.overall).toBeCloseTo(0.8, 10); + expect(result.runHash).toBeTruthy(); + expect(result.judges).toHaveLength(2); + expect(result.judges[0]).toEqual({ name: "test-pass", score: 1.0, weight: 0.6 }); + + const latest = evalStore.varStore.list({ + exactName: "@uwf/eval/fix-off-by-one/latest", + }); + expect(latest[0]?.value).toBe(result.runHash); + }); + + test("handles a judge with weight 0 (informational)", async () => { + const evalStore = makeEvalStore(); + const manifest = makeManifest([ + makeJudge("test-pass", 1.0, false), + makeJudge("token-stats", 0, true), + ]); + // token-stats is builtin → default runner would score 0; give scripted score + // that would skew the result if it were counted. + const runJudge = scriptedRunner({ "test-pass": 0.5, "token-stats": 1.0 }); + + const result = await collect( + { + evalStore, + taskDir: "/tmp/task", + workDir: "/tmp/work", + threadId: "THREAD123", + manifest, + config: CONFIG, + }, + runJudge, + ); + + // Only test-pass (weight 1.0) counts → overall = 0.5 + expect(result.overall).toBeCloseTo(0.5, 10); + expect(result.judges).toHaveLength(2); + const tokenStats = result.judges.find((j) => j.name === "token-stats"); + expect(tokenStats?.weight).toBe(0); + }); + + test("builtin judges are skipped with placeholder score 0", async () => { + const evalStore = makeEvalStore(); + const manifest = makeManifest([makeJudge("frontmatter-compliance", 1.0, true)]); + + // Use the default runner (no injected runner) → builtin skipped → score 0. + const result = await collect({ + evalStore, + taskDir: "/tmp/task", + workDir: "/tmp/work", + threadId: "THREAD123", + manifest, + config: CONFIG, + }); + + expect(result.overall).toBe(0); + expect(result.judges[0]).toEqual({ + name: "frontmatter-compliance", + score: 0, + weight: 1.0, + }); + }); +}); diff --git a/packages/eval/__tests__/prepare.test.ts b/packages/eval/__tests__/prepare.test.ts new file mode 100644 index 0000000..ed923f6 --- /dev/null +++ b/packages/eval/__tests__/prepare.test.ts @@ -0,0 +1,74 @@ +import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; + +import { afterEach, beforeEach, describe, expect, test } from "vitest"; + +import { prepare } from "../src/runner/index.js"; + +const TASK_YAML = ` +name: fix-off-by-one +description: Fix an off-by-one error +workflow: solve-issue +prompt: "Fix the bug" +limits: + maxSteps: 12 + timeoutMinutes: 20 +judges: + - name: frontmatter-compliance + weight: 0.5 + builtin: true + - name: test-pass + weight: 0.5 + entry: dist/judges/test-pass.js +`; + +let taskDir: string; + +beforeEach(async () => { + taskDir = await mkdtemp(join(tmpdir(), "uwf-eval-task-")); + await writeFile(join(taskDir, "task.yaml"), TASK_YAML, "utf8"); + const fixtureDir = join(taskDir, "fixture"); + await mkdir(join(fixtureDir, "src"), { recursive: true }); + await writeFile(join(fixtureDir, "src", "calc.ts"), "export const add = (a, b) => a + b + 1;\n"); + await writeFile(join(fixtureDir, "package.json"), '{ "name": "fixture" }\n'); +}); + +afterEach(async () => { + await rm(taskDir, { recursive: true, force: true }); +}); + +describe("prepare", () => { + test("returns the parsed manifest", async () => { + const result = await prepare(taskDir); + expect(result.taskDir).toBe(taskDir); + expect(result.manifest.name).toBe("fix-off-by-one"); + expect(result.manifest.workflow).toBe("solve-issue"); + expect(result.manifest.limits.maxSteps).toBe(12); + expect(result.manifest.judges).toHaveLength(2); + }); + + test("copies fixture into a fresh temp work dir", async () => { + const result = await prepare(taskDir); + expect(result.workDir).not.toBe(taskDir); + expect(result.workDir.startsWith(tmpdir())).toBe(true); + + const calc = await readFile(join(result.workDir, "src", "calc.ts"), "utf8"); + expect(calc).toContain("export const add"); + const pkg = await readFile(join(result.workDir, "package.json"), "utf8"); + expect(pkg).toContain("fixture"); + + await rm(result.workDir, { recursive: true, force: true }); + }); + + test("creates an empty work dir when no fixture/ exists", async () => { + const noFixtureDir = await mkdtemp(join(tmpdir(), "uwf-eval-nofix-")); + await writeFile(join(noFixtureDir, "task.yaml"), TASK_YAML, "utf8"); + + const result = await prepare(noFixtureDir); + expect(result.workDir.startsWith(tmpdir())).toBe(true); + + await rm(noFixtureDir, { recursive: true, force: true }); + await rm(result.workDir, { recursive: true, force: true }); + }); +}); diff --git a/packages/eval/src/commands/run.ts b/packages/eval/src/commands/run.ts index 4bc6b08..a066419 100644 --- a/packages/eval/src/commands/run.ts +++ b/packages/eval/src/commands/run.ts @@ -1,4 +1,52 @@ +import { resolve } from "node:path"; + import type { Command } from "commander"; +import type { RunResult } from "../runner/index.js"; +import { collect, execute, getEngineVersion, prepare } from "../runner/index.js"; +import type { EvalRunConfig } from "../storage/index.js"; +import { createEvalStore } from "../storage/index.js"; + +type RunCliOptions = { + agent: string; + model: string | undefined; + count: string; +}; + +async function runOnce( + taskDir: string, + agent: string, + model: string, + engineVersion: string, +): Promise { + const prepared = await prepare(taskDir); + const { manifest, workDir } = prepared; + + const { threadId } = await execute({ + workDir, + workflow: manifest.workflow, + prompt: manifest.prompt, + agent, + maxSteps: manifest.limits.maxSteps, + }); + + const evalStore = await createEvalStore(); + const config: EvalRunConfig = { agent, model, engineVersion }; + const collected = await collect({ + evalStore, + taskDir: prepared.taskDir, + workDir, + threadId, + manifest, + config, + }); + + return { + runHash: collected.runHash, + overall: collected.overall, + task: manifest.name, + judges: collected.judges, + }; +} export function registerRunCommand(program: Command): void { program @@ -7,8 +55,30 @@ export function registerRunCommand(program: Command): void { .option("--agent ", "agent adapter to use", "hermes") .option("--model ", "model override") .option("--count ", "number of eval runs", "1") - .action(async (_task: string, _opts: Record) => { - process.stderr.write("uwf-eval run: not yet implemented\n"); - process.exitCode = 1; + .action(async (task: string, opts: RunCliOptions) => { + const taskDir = resolve(task); + const agent = opts.agent; + const model = opts.model ?? ""; + const count = Number.parseInt(opts.count, 10); + if (!Number.isInteger(count) || count < 1) { + process.stderr.write("--count must be a positive integer\n"); + process.exitCode = 1; + return; + } + + const engineVersion = getEngineVersion(); + + try { + const results: RunResult[] = []; + for (let i = 0; i < count; i++) { + results.push(await runOnce(taskDir, agent, model, engineVersion)); + } + const output = count === 1 ? results[0] : results; + process.stdout.write(`${JSON.stringify(output)}\n`); + } catch (e) { + const message = e instanceof Error ? e.message : String(e); + process.stderr.write(`${message}\n`); + process.exitCode = 1; + } }); } diff --git a/packages/eval/src/index.ts b/packages/eval/src/index.ts index 69c6e69..dadd36f 100644 --- a/packages/eval/src/index.ts +++ b/packages/eval/src/index.ts @@ -1,15 +1,34 @@ -// Task manifest - // Judge types export type { JudgeInput, JudgeOutput } from "./judge/index.js"; -export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload } from "./storage/index.js"; +export type { + CollectInput, + CollectResult, + ExecuteInput, + ExecuteResult, + JudgeRunner, + JudgeRunOutput, + JudgeSummary, + PrepareResult, + RunOptions, + RunResult, +} from "./runner/index.js"; +// Runner (prepare → execute → collect) +export { collect, computeOverall, execute, getEngineVersion, prepare } from "./runner/index.js"; +export type { + EvalJudgeRecord, + EvalRunConfig, + EvalRunPayload, + EvalStore, +} from "./storage/index.js"; // Storage schemas and types export { + createEvalStore, EVAL_JUDGE_FRONTMATTER_SCHEMA, EVAL_JUDGE_HALLUCINATION_SCHEMA, EVAL_JUDGE_TOKEN_STATS_SCHEMA, EVAL_JUDGE_UPSTREAM_SCHEMA, EVAL_RUN_SCHEMA, + setEvalLatest, } from "./storage/index.js"; export type { JudgeEntry, TaskLimits, TaskManifest } from "./task/index.js"; export { loadTaskManifest, parseTaskManifest } from "./task/index.js"; diff --git a/packages/eval/src/runner/collect.ts b/packages/eval/src/runner/collect.ts new file mode 100644 index 0000000..b487dc8 --- /dev/null +++ b/packages/eval/src/runner/collect.ts @@ -0,0 +1,150 @@ +import { execFileSync } from "node:child_process"; +import { readFile } from "node:fs/promises"; +import { resolve } from "node:path"; + +import type { JSONSchema, Store } from "@ocas/core"; +import { putSchema } from "@ocas/core"; +import type { CasRef } from "@united-workforce/protocol"; +import { createLogger } from "@united-workforce/util"; + +import type { JudgeOutput } from "../judge/index.js"; +import type { EvalJudgeRecord, EvalRunPayload } from "../storage/index.js"; +import { EVAL_RUN_SCHEMA, setEvalLatest } from "../storage/index.js"; +import type { JudgeEntry } from "../task/index.js"; +import type { + CollectInput, + CollectResult, + JudgeRunner, + JudgeRunOutput, + JudgeSummary, +} from "./types.js"; + +const log = createLogger({ sink: { kind: "stderr" } }); + +const LOG_JUDGE = "CT6N3P2K"; +const LOG_STORED = "CT9V2Q7M"; + +/** Permissive schema for judge data without a dedicated schema (e.g. builtin placeholders). */ +const GENERIC_DATA_SCHEMA: JSONSchema = { type: "object" }; + +/** + * Compute the weighted overall score. Judges with weight 0 are informational + * and do not affect the result (they contribute 0 to both numerator and + * denominator). Returns 0 when total weight is 0. + */ +export function computeOverall(judges: ReadonlyArray<{ score: number; weight: number }>): number { + let totalWeight = 0; + let weighted = 0; + for (const judge of judges) { + totalWeight += judge.weight; + weighted += judge.score * judge.weight; + } + return totalWeight > 0 ? weighted / totalWeight : 0; +} + +/** Run a task-provided judge script: `node `. */ +async function runTaskJudge( + taskDir: string, + workDir: string, + threadId: string, + judge: JudgeEntry, +): Promise { + if (judge.entry === null) { + throw new Error(`judge "${judge.name}" is not builtin but has no entry`); + } + const entryPath = resolve(taskDir, judge.entry); + + let stdout: string; + try { + stdout = execFileSync("node", [entryPath, workDir, threadId], { + encoding: "utf8", + stdio: ["ignore", "pipe", "pipe"], + maxBuffer: 50 * 1024 * 1024, + }); + } catch (e) { + const message = e instanceof Error ? e.message : String(e); + throw new Error(`judge "${judge.name}" failed: ${message}`); + } + + const line = stdout.trim().split("\n").pop()?.trim() ?? ""; + let parsed: unknown; + try { + parsed = JSON.parse(line); + } catch { + throw new Error(`judge "${judge.name}" stdout is not valid JSON: ${line || "(empty)"}`); + } + const output = parsed as JudgeOutput; + if (typeof output.score !== "number") { + throw new Error(`judge "${judge.name}" output missing numeric score`); + } + + const schema = + judge.schema !== null ? await loadSchema(resolve(taskDir, judge.schema)) : GENERIC_DATA_SCHEMA; + return { score: output.score, data: output.data, schema }; +} + +/** Load and parse an OCAS JSON Schema file. */ +async function loadSchema(path: string): Promise { + const text = await readFile(path, "utf8"); + return JSON.parse(text) as JSONSchema; +} + +/** + * Default judge runner. Builtin judges are skipped for now (placeholder score 0 + * with empty data); task judges spawn their entry script. + */ +const defaultJudgeRunner: JudgeRunner = async (taskDir, workDir, threadId, judge) => { + if (judge.builtin) { + return { score: 0, data: {}, schema: GENERIC_DATA_SCHEMA }; + } + return runTaskJudge(taskDir, workDir, threadId, judge); +}; + +/** Persist judge data to CAS under its schema and return the CAS hash. */ +async function storeJudgeData(store: Store, schema: JSONSchema, data: unknown): Promise { + const schemaHash = await putSchema(store, schema); + return (await store.cas.put(schemaHash, data)) as CasRef; +} + +/** + * Run all judges, store their data and the overall eval-run record in CAS, then + * index the run under `@uwf/eval//latest`. + */ +export async function collect( + input: CollectInput, + runJudge: JudgeRunner = defaultJudgeRunner, +): Promise { + const { evalStore, taskDir, workDir, threadId, manifest, config } = input; + const { store, varStore } = evalStore; + + const records: EvalJudgeRecord[] = []; + for (const judge of manifest.judges) { + const result = await runJudge(taskDir, workDir, threadId, judge); + const dataHash = await storeJudgeData(store, result.schema, result.data); + records.push({ name: judge.name, score: result.score, weight: judge.weight, dataHash }); + log(LOG_JUDGE, `judge=${judge.name} score=${result.score} weight=${judge.weight}`); + } + + const overall = computeOverall(records); + + const payload: EvalRunPayload = { + task: manifest.name, + config, + threadId, + judges: records, + overall, + timestamp: Date.now(), + }; + + const schemaHash = await putSchema(store, EVAL_RUN_SCHEMA); + const runHash = (await store.cas.put(schemaHash, payload)) as string; + setEvalLatest(varStore, manifest.name, runHash); + log(LOG_STORED, `stored eval-run task=${manifest.name} hash=${runHash} overall=${overall}`); + + const judges: JudgeSummary[] = records.map((r) => ({ + name: r.name, + score: r.score, + weight: r.weight, + })); + return { runHash, overall, judges }; +} diff --git a/packages/eval/src/runner/execute.ts b/packages/eval/src/runner/execute.ts new file mode 100644 index 0000000..d0b35a3 --- /dev/null +++ b/packages/eval/src/runner/execute.ts @@ -0,0 +1,87 @@ +import { execFileSync } from "node:child_process"; + +import { createLogger } from "@united-workforce/util"; + +import type { ExecuteInput, ExecuteResult } from "./types.js"; + +const log = createLogger({ sink: { kind: "stderr" } }); + +const LOG_START = "EX5M2T9V"; +const LOG_EXEC = "EX7Q4K2N"; + +/** Resolve the uwf CLI binary. Override with `UWF_BIN` for testing. */ +function uwfBin(): string { + const override = process.env.UWF_BIN; + return override !== undefined && override !== "" ? override : "uwf"; +} + +/** Run a uwf subcommand and return trimmed stdout. */ +function runUwf(args: string[], cwd: string): string { + try { + return execFileSync(uwfBin(), args, { + encoding: "utf8", + stdio: ["ignore", "pipe", "pipe"], + maxBuffer: 50 * 1024 * 1024, + cwd, + }).trim(); + } catch (e) { + const err = e as NodeJS.ErrnoException & { stderr?: Buffer | string | null }; + const stderr = + err.stderr == null + ? "" + : typeof err.stderr === "string" + ? err.stderr + : err.stderr.toString("utf8"); + const detail = stderr.trim() !== "" ? `: ${stderr.trim()}` : ""; + throw new Error(`uwf ${args[0]} ${args[1]} failed${detail}`); + } +} + +/** Parse the thread ID from `uwf thread start` JSON output (`{ workflow, thread }`). */ +function parseThreadId(stdout: string): string { + let parsed: unknown; + try { + parsed = JSON.parse(stdout); + } catch { + throw new Error(`uwf thread start did not emit valid JSON: ${stdout || "(empty)"}`); + } + const obj = parsed as Record; + const thread = obj.thread; + if (typeof thread !== "string" || thread === "") { + throw new Error(`uwf thread start output missing thread id: ${stdout}`); + } + return thread; +} + +/** + * Execute a workflow: create a thread, then run it for up to `maxSteps` steps. + * Shells out to the uwf CLI rather than importing it directly. + */ +export async function execute(input: ExecuteInput): Promise { + const startOut = runUwf( + ["thread", "start", input.workflow, "-p", input.prompt, "--cwd", input.workDir], + input.workDir, + ); + const threadId = parseThreadId(startOut); + log(LOG_START, `thread started thread=${threadId} workflow=${input.workflow}`); + + runUwf( + ["thread", "exec", threadId, "--agent", input.agent, "-c", String(input.maxSteps)], + input.workDir, + ); + log(LOG_EXEC, `thread executed thread=${threadId} maxSteps=${input.maxSteps}`); + + return { threadId }; +} + +/** Best-effort lookup of the uwf engine version (`uwf -V`); "unknown" on failure. */ +export function getEngineVersion(): string { + try { + return execFileSync(uwfBin(), ["-V"], { + encoding: "utf8", + stdio: ["ignore", "pipe", "ignore"], + }).trim(); + } catch { + return "unknown"; + } +} diff --git a/packages/eval/src/runner/index.ts b/packages/eval/src/runner/index.ts new file mode 100644 index 0000000..7b2f95f --- /dev/null +++ b/packages/eval/src/runner/index.ts @@ -0,0 +1,15 @@ +export { collect, computeOverall } from "./collect.js"; +export { execute, getEngineVersion } from "./execute.js"; +export { prepare } from "./prepare.js"; +export type { + CollectInput, + CollectResult, + ExecuteInput, + ExecuteResult, + JudgeRunner, + JudgeRunOutput, + JudgeSummary, + PrepareResult, + RunOptions, + RunResult, +} from "./types.js"; diff --git a/packages/eval/src/runner/prepare.ts b/packages/eval/src/runner/prepare.ts new file mode 100644 index 0000000..4ee5ceb --- /dev/null +++ b/packages/eval/src/runner/prepare.ts @@ -0,0 +1,45 @@ +import { access, cp, mkdir, mkdtemp } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; + +import { createLogger } from "@united-workforce/util"; + +import { loadTaskManifest } from "../task/index.js"; +import type { PrepareResult } from "./types.js"; + +const log = createLogger({ sink: { kind: "stderr" } }); + +const LOG_PREPARE = "PRE4K2NQ"; +const LOG_FIXTURE = "PRE7M3VX"; + +/** Check whether a path exists. */ +async function pathExists(path: string): Promise { + try { + await access(path); + return true; + } catch { + return false; + } +} + +/** + * Prepare a task for execution: read its manifest and copy the fixture + * directory into a fresh temp working directory. + */ +export async function prepare(taskDir: string): Promise { + const manifest = await loadTaskManifest(taskDir); + log(LOG_PREPARE, `loaded task manifest name=${manifest.name} workflow=${manifest.workflow}`); + + const workDir = await mkdtemp(join(tmpdir(), "uwf-eval-")); + + const fixtureDir = join(taskDir, "fixture"); + if (await pathExists(fixtureDir)) { + await cp(fixtureDir, workDir, { recursive: true }); + log(LOG_FIXTURE, `copied fixture into workDir=${workDir}`); + } else { + await mkdir(workDir, { recursive: true }); + log(LOG_FIXTURE, `no fixture/ found, using empty workDir=${workDir}`); + } + + return { taskDir, workDir, manifest }; +} diff --git a/packages/eval/src/runner/types.ts b/packages/eval/src/runner/types.ts new file mode 100644 index 0000000..e785e19 --- /dev/null +++ b/packages/eval/src/runner/types.ts @@ -0,0 +1,85 @@ +import type { JSONSchema } from "@ocas/core"; + +import type { EvalRunConfig, EvalStore } from "../storage/index.js"; +import type { JudgeEntry, TaskManifest } from "../task/index.js"; + +/** Result of the prepare phase: task dir, temp working dir, parsed manifest. */ +export type PrepareResult = { + taskDir: string; + workDir: string; + manifest: TaskManifest; +}; + +/** Input to the execute phase. */ +export type ExecuteInput = { + /** Working directory the workflow runs in (the prepared temp dir). */ + workDir: string; + /** Workflow name or path (from task.yaml). */ + workflow: string; + /** Initial prompt for the thread. */ + prompt: string; + /** Agent adapter to use. */ + agent: string; + /** Maximum number of steps to execute. */ + maxSteps: number; +}; + +/** Result of the execute phase. */ +export type ExecuteResult = { + threadId: string; +}; + +/** Output produced by running a single judge. */ +export type JudgeRunOutput = { + score: number; + data: unknown; + /** Schema describing `data`, used when persisting to CAS. */ + schema: JSONSchema; +}; + +/** Pluggable judge execution strategy (injectable for testing). */ +export type JudgeRunner = ( + taskDir: string, + workDir: string, + threadId: string, + judge: JudgeEntry, +) => Promise; + +/** Input to the collect phase. */ +export type CollectInput = { + evalStore: EvalStore; + taskDir: string; + workDir: string; + threadId: string; + manifest: TaskManifest; + config: EvalRunConfig; +}; + +/** A single judge's summarized result in the run output. */ +export type JudgeSummary = { + name: string; + score: number; + weight: number; +}; + +/** Result of the collect phase. */ +export type CollectResult = { + runHash: string; + overall: number; + judges: JudgeSummary[]; +}; + +/** Options for a full eval run (from CLI flags). */ +export type RunOptions = { + agent: string; + model: string; + count: number; +}; + +/** Final result of a full eval run. */ +export type RunResult = { + runHash: string; + overall: number; + task: string; + judges: JudgeSummary[]; +}; diff --git a/packages/eval/src/storage/index.ts b/packages/eval/src/storage/index.ts index 8b0d554..53ea2d2 100644 --- a/packages/eval/src/storage/index.ts +++ b/packages/eval/src/storage/index.ts @@ -5,4 +5,5 @@ export { EVAL_JUDGE_UPSTREAM_SCHEMA, EVAL_RUN_SCHEMA, } from "./schemas.js"; -export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload } from "./types.js"; +export { createEvalStore, setEvalLatest } from "./store.js"; +export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload, EvalStore } from "./types.js"; diff --git a/packages/eval/src/storage/store.ts b/packages/eval/src/storage/store.ts new file mode 100644 index 0000000..ce69224 --- /dev/null +++ b/packages/eval/src/storage/store.ts @@ -0,0 +1,42 @@ +import { mkdir } from "node:fs/promises"; +import { homedir } from "node:os"; +import { join } from "node:path"; +import type { VarStore } from "@ocas/core"; +import { bootstrap, type Store } from "@ocas/core"; +import { createFsStore, createSqliteVarStore } from "@ocas/fs"; + +import type { EvalStore } from "./types.js"; + +/** Variable name prefix for eval run pointers (`@uwf/eval//latest`). */ +const EVAL_VAR_PREFIX = "@uwf/eval/"; + +/** + * Resolve the global CAS directory shared by all uwf and ocas tools. + * Priority: `OCAS_HOME` → default ~/.ocas (matches uwf CLI's getGlobalCasDir). + */ +function getGlobalCasDir(): string { + const primary = process.env.OCAS_HOME; + if (primary !== undefined && primary !== "") { + return primary; + } + return join(homedir(), ".ocas"); +} + +/** + * Open the unified OCAS store on the filesystem. + * Shares the same CAS + variable backend as the uwf CLI. + */ +export async function createEvalStore(): Promise { + const casDir = getGlobalCasDir(); + await mkdir(casDir, { recursive: true }); + const cas = createFsStore(casDir); + const { var: varStore, tag } = createSqliteVarStore(join(casDir, "vars"), cas); + const store: Store = { cas, var: varStore, tag }; + bootstrap(store); + return { store, varStore }; +} + +/** Set the `@uwf/eval//latest` variable to point at a run hash. */ +export function setEvalLatest(varStore: VarStore, taskName: string, runHash: string): void { + varStore.set(`${EVAL_VAR_PREFIX}${taskName}/latest`, runHash); +} diff --git a/packages/eval/src/storage/types.ts b/packages/eval/src/storage/types.ts index 862e45c..1348eb7 100644 --- a/packages/eval/src/storage/types.ts +++ b/packages/eval/src/storage/types.ts @@ -1,5 +1,12 @@ +import type { Store, VarStore } from "@ocas/core"; import type { CasRef } from "@united-workforce/protocol"; +/** Handle to the OCAS store used for eval persistence. */ +export type EvalStore = { + store: Store; + varStore: VarStore; +}; + /** A single judge result within an eval run. */ export type EvalJudgeRecord = { name: string;