feat: eval run command — prepare, execute, collect pipeline
CI / check (pull_request) Successful in 1m45s

Implement the uwf-eval run <task-dir> command with 3-phase pipeline:

- prepare: read task.yaml, copy fixture/ to temp workdir
- execute: shell out to uwf thread start + exec
- collect: run judges, compute weighted score, store CAS node,
  set @uwf/eval/<task>/latest variable

Changes:
- src/runner/ — types, prepare, execute, collect, index
- src/storage/store.ts — createEvalStore(), setEvalLatest()
- src/commands/run.ts — full pipeline wiring with --agent/--model/--count
- 9 new tests (prepare + collect + weighted scoring)

Builtin judges return placeholder score 0 (Phase 1c).

Refs #70
This commit is contained in:
2026-06-04 23:59:21 +00:00
parent 99619d85db
commit fae9e9ed3a
12 changed files with 759 additions and 7 deletions
+157
View File
@@ -0,0 +1,157 @@
import { bootstrap, createMemoryStore } from "@ocas/core";
import { describe, expect, test } from "vitest";
import type { JudgeRunner } from "../src/runner/index.js";
import { collect, computeOverall } from "../src/runner/index.js";
import type { EvalRunConfig, EvalStore } from "../src/storage/index.js";
import type { JudgeEntry, TaskManifest } from "../src/task/index.js";
function makeJudge(name: string, weight: number, builtin: boolean): JudgeEntry {
return {
name,
weight,
builtin,
entry: builtin ? null : `dist/judges/${name}.js`,
schema: null,
};
}
function makeManifest(judges: JudgeEntry[]): TaskManifest {
return {
name: "fix-off-by-one",
description: "test task",
workflow: "solve-issue",
prompt: "Fix the bug",
limits: { maxSteps: 10, timeoutMinutes: 30 },
judges,
};
}
function makeEvalStore(): EvalStore {
const store = createMemoryStore();
bootstrap(store);
return { store, varStore: store.var };
}
const CONFIG: EvalRunConfig = {
agent: "hermes",
model: "claude-sonnet-4",
engineVersion: "test",
};
/** Returns a fixed score per judge name. */
function scriptedRunner(scores: Record<string, number>): JudgeRunner {
return async (_taskDir, _workDir, _threadId, judge) => ({
score: scores[judge.name] ?? 0,
data: { judged: judge.name },
schema: { type: "object" },
});
}
describe("computeOverall", () => {
test("computes the weighted average correctly", () => {
const overall = computeOverall([
{ score: 0.8, weight: 0.3 },
{ score: 0.6, weight: 0.3 },
{ score: 1.0, weight: 0.4 },
]);
// 0.24 + 0.18 + 0.4 = 0.82
expect(overall).toBeCloseTo(0.82, 10);
});
test("a weight-0 judge does not affect the result", () => {
const withInformational = computeOverall([
{ score: 1.0, weight: 1.0 },
{ score: 0.0, weight: 0.0 },
]);
expect(withInformational).toBe(1.0);
});
test("returns 0 when total weight is 0", () => {
expect(computeOverall([{ score: 0.5, weight: 0 }])).toBe(0);
});
});
describe("collect", () => {
test("computes weighted score correctly across judges", async () => {
const evalStore = makeEvalStore();
const manifest = makeManifest([
makeJudge("test-pass", 0.6, false),
makeJudge("code-quality", 0.4, false),
]);
const runJudge = scriptedRunner({ "test-pass": 1.0, "code-quality": 0.5 });
const result = await collect(
{
evalStore,
taskDir: "/tmp/task",
workDir: "/tmp/work",
threadId: "THREAD123",
manifest,
config: CONFIG,
},
runJudge,
);
// 1.0 * 0.6 + 0.5 * 0.4 = 0.8
expect(result.overall).toBeCloseTo(0.8, 10);
expect(result.runHash).toBeTruthy();
expect(result.judges).toHaveLength(2);
expect(result.judges[0]).toEqual({ name: "test-pass", score: 1.0, weight: 0.6 });
const latest = evalStore.varStore.list({
exactName: "@uwf/eval/fix-off-by-one/latest",
});
expect(latest[0]?.value).toBe(result.runHash);
});
test("handles a judge with weight 0 (informational)", async () => {
const evalStore = makeEvalStore();
const manifest = makeManifest([
makeJudge("test-pass", 1.0, false),
makeJudge("token-stats", 0, true),
]);
// token-stats is builtin → default runner would score 0; give scripted score
// that would skew the result if it were counted.
const runJudge = scriptedRunner({ "test-pass": 0.5, "token-stats": 1.0 });
const result = await collect(
{
evalStore,
taskDir: "/tmp/task",
workDir: "/tmp/work",
threadId: "THREAD123",
manifest,
config: CONFIG,
},
runJudge,
);
// Only test-pass (weight 1.0) counts → overall = 0.5
expect(result.overall).toBeCloseTo(0.5, 10);
expect(result.judges).toHaveLength(2);
const tokenStats = result.judges.find((j) => j.name === "token-stats");
expect(tokenStats?.weight).toBe(0);
});
test("builtin judges are skipped with placeholder score 0", async () => {
const evalStore = makeEvalStore();
const manifest = makeManifest([makeJudge("frontmatter-compliance", 1.0, true)]);
// Use the default runner (no injected runner) → builtin skipped → score 0.
const result = await collect({
evalStore,
taskDir: "/tmp/task",
workDir: "/tmp/work",
threadId: "THREAD123",
manifest,
config: CONFIG,
});
expect(result.overall).toBe(0);
expect(result.judges[0]).toEqual({
name: "frontmatter-compliance",
score: 0,
weight: 1.0,
});
});
});
+74
View File
@@ -0,0 +1,74 @@
import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, beforeEach, describe, expect, test } from "vitest";
import { prepare } from "../src/runner/index.js";
const TASK_YAML = `
name: fix-off-by-one
description: Fix an off-by-one error
workflow: solve-issue
prompt: "Fix the bug"
limits:
maxSteps: 12
timeoutMinutes: 20
judges:
- name: frontmatter-compliance
weight: 0.5
builtin: true
- name: test-pass
weight: 0.5
entry: dist/judges/test-pass.js
`;
let taskDir: string;
beforeEach(async () => {
taskDir = await mkdtemp(join(tmpdir(), "uwf-eval-task-"));
await writeFile(join(taskDir, "task.yaml"), TASK_YAML, "utf8");
const fixtureDir = join(taskDir, "fixture");
await mkdir(join(fixtureDir, "src"), { recursive: true });
await writeFile(join(fixtureDir, "src", "calc.ts"), "export const add = (a, b) => a + b + 1;\n");
await writeFile(join(fixtureDir, "package.json"), '{ "name": "fixture" }\n');
});
afterEach(async () => {
await rm(taskDir, { recursive: true, force: true });
});
describe("prepare", () => {
test("returns the parsed manifest", async () => {
const result = await prepare(taskDir);
expect(result.taskDir).toBe(taskDir);
expect(result.manifest.name).toBe("fix-off-by-one");
expect(result.manifest.workflow).toBe("solve-issue");
expect(result.manifest.limits.maxSteps).toBe(12);
expect(result.manifest.judges).toHaveLength(2);
});
test("copies fixture into a fresh temp work dir", async () => {
const result = await prepare(taskDir);
expect(result.workDir).not.toBe(taskDir);
expect(result.workDir.startsWith(tmpdir())).toBe(true);
const calc = await readFile(join(result.workDir, "src", "calc.ts"), "utf8");
expect(calc).toContain("export const add");
const pkg = await readFile(join(result.workDir, "package.json"), "utf8");
expect(pkg).toContain("fixture");
await rm(result.workDir, { recursive: true, force: true });
});
test("creates an empty work dir when no fixture/ exists", async () => {
const noFixtureDir = await mkdtemp(join(tmpdir(), "uwf-eval-nofix-"));
await writeFile(join(noFixtureDir, "task.yaml"), TASK_YAML, "utf8");
const result = await prepare(noFixtureDir);
expect(result.workDir.startsWith(tmpdir())).toBe(true);
await rm(noFixtureDir, { recursive: true, force: true });
await rm(result.workDir, { recursive: true, force: true });
});
});
+72 -2
View File
@@ -1,4 +1,52 @@
import { resolve } from "node:path";
import type { Command } from "commander"; import type { Command } from "commander";
import type { RunResult } from "../runner/index.js";
import { collect, execute, getEngineVersion, prepare } from "../runner/index.js";
import type { EvalRunConfig } from "../storage/index.js";
import { createEvalStore } from "../storage/index.js";
type RunCliOptions = {
agent: string;
model: string | undefined;
count: string;
};
async function runOnce(
taskDir: string,
agent: string,
model: string,
engineVersion: string,
): Promise<RunResult> {
const prepared = await prepare(taskDir);
const { manifest, workDir } = prepared;
const { threadId } = await execute({
workDir,
workflow: manifest.workflow,
prompt: manifest.prompt,
agent,
maxSteps: manifest.limits.maxSteps,
});
const evalStore = await createEvalStore();
const config: EvalRunConfig = { agent, model, engineVersion };
const collected = await collect({
evalStore,
taskDir: prepared.taskDir,
workDir,
threadId,
manifest,
config,
});
return {
runHash: collected.runHash,
overall: collected.overall,
task: manifest.name,
judges: collected.judges,
};
}
export function registerRunCommand(program: Command): void { export function registerRunCommand(program: Command): void {
program program
@@ -7,8 +55,30 @@ export function registerRunCommand(program: Command): void {
.option("--agent <name>", "agent adapter to use", "hermes") .option("--agent <name>", "agent adapter to use", "hermes")
.option("--model <model>", "model override") .option("--model <model>", "model override")
.option("--count <n>", "number of eval runs", "1") .option("--count <n>", "number of eval runs", "1")
.action(async (_task: string, _opts: Record<string, unknown>) => { .action(async (task: string, opts: RunCliOptions) => {
process.stderr.write("uwf-eval run: not yet implemented\n"); const taskDir = resolve(task);
const agent = opts.agent;
const model = opts.model ?? "";
const count = Number.parseInt(opts.count, 10);
if (!Number.isInteger(count) || count < 1) {
process.stderr.write("--count must be a positive integer\n");
process.exitCode = 1; process.exitCode = 1;
return;
}
const engineVersion = getEngineVersion();
try {
const results: RunResult[] = [];
for (let i = 0; i < count; i++) {
results.push(await runOnce(taskDir, agent, model, engineVersion));
}
const output = count === 1 ? results[0] : results;
process.stdout.write(`${JSON.stringify(output)}\n`);
} catch (e) {
const message = e instanceof Error ? e.message : String(e);
process.stderr.write(`${message}\n`);
process.exitCode = 1;
}
}); });
} }
+22 -3
View File
@@ -1,15 +1,34 @@
// Task manifest
// Judge types // Judge types
export type { JudgeInput, JudgeOutput } from "./judge/index.js"; export type { JudgeInput, JudgeOutput } from "./judge/index.js";
export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload } from "./storage/index.js"; export type {
CollectInput,
CollectResult,
ExecuteInput,
ExecuteResult,
JudgeRunner,
JudgeRunOutput,
JudgeSummary,
PrepareResult,
RunOptions,
RunResult,
} from "./runner/index.js";
// Runner (prepare → execute → collect)
export { collect, computeOverall, execute, getEngineVersion, prepare } from "./runner/index.js";
export type {
EvalJudgeRecord,
EvalRunConfig,
EvalRunPayload,
EvalStore,
} from "./storage/index.js";
// Storage schemas and types // Storage schemas and types
export { export {
createEvalStore,
EVAL_JUDGE_FRONTMATTER_SCHEMA, EVAL_JUDGE_FRONTMATTER_SCHEMA,
EVAL_JUDGE_HALLUCINATION_SCHEMA, EVAL_JUDGE_HALLUCINATION_SCHEMA,
EVAL_JUDGE_TOKEN_STATS_SCHEMA, EVAL_JUDGE_TOKEN_STATS_SCHEMA,
EVAL_JUDGE_UPSTREAM_SCHEMA, EVAL_JUDGE_UPSTREAM_SCHEMA,
EVAL_RUN_SCHEMA, EVAL_RUN_SCHEMA,
setEvalLatest,
} from "./storage/index.js"; } from "./storage/index.js";
export type { JudgeEntry, TaskLimits, TaskManifest } from "./task/index.js"; export type { JudgeEntry, TaskLimits, TaskManifest } from "./task/index.js";
export { loadTaskManifest, parseTaskManifest } from "./task/index.js"; export { loadTaskManifest, parseTaskManifest } from "./task/index.js";
+150
View File
@@ -0,0 +1,150 @@
import { execFileSync } from "node:child_process";
import { readFile } from "node:fs/promises";
import { resolve } from "node:path";
import type { JSONSchema, Store } from "@ocas/core";
import { putSchema } from "@ocas/core";
import type { CasRef } from "@united-workforce/protocol";
import { createLogger } from "@united-workforce/util";
import type { JudgeOutput } from "../judge/index.js";
import type { EvalJudgeRecord, EvalRunPayload } from "../storage/index.js";
import { EVAL_RUN_SCHEMA, setEvalLatest } from "../storage/index.js";
import type { JudgeEntry } from "../task/index.js";
import type {
CollectInput,
CollectResult,
JudgeRunner,
JudgeRunOutput,
JudgeSummary,
} from "./types.js";
const log = createLogger({ sink: { kind: "stderr" } });
const LOG_JUDGE = "CT6N3P2K";
const LOG_STORED = "CT9V2Q7M";
/** Permissive schema for judge data without a dedicated schema (e.g. builtin placeholders). */
const GENERIC_DATA_SCHEMA: JSONSchema = { type: "object" };
/**
* Compute the weighted overall score. Judges with weight 0 are informational
* and do not affect the result (they contribute 0 to both numerator and
* denominator). Returns 0 when total weight is 0.
*/
export function computeOverall(judges: ReadonlyArray<{ score: number; weight: number }>): number {
let totalWeight = 0;
let weighted = 0;
for (const judge of judges) {
totalWeight += judge.weight;
weighted += judge.score * judge.weight;
}
return totalWeight > 0 ? weighted / totalWeight : 0;
}
/** Run a task-provided judge script: `node <entry> <cwd> <threadId>`. */
async function runTaskJudge(
taskDir: string,
workDir: string,
threadId: string,
judge: JudgeEntry,
): Promise<JudgeRunOutput> {
if (judge.entry === null) {
throw new Error(`judge "${judge.name}" is not builtin but has no entry`);
}
const entryPath = resolve(taskDir, judge.entry);
let stdout: string;
try {
stdout = execFileSync("node", [entryPath, workDir, threadId], {
encoding: "utf8",
stdio: ["ignore", "pipe", "pipe"],
maxBuffer: 50 * 1024 * 1024,
});
} catch (e) {
const message = e instanceof Error ? e.message : String(e);
throw new Error(`judge "${judge.name}" failed: ${message}`);
}
const line = stdout.trim().split("\n").pop()?.trim() ?? "";
let parsed: unknown;
try {
parsed = JSON.parse(line);
} catch {
throw new Error(`judge "${judge.name}" stdout is not valid JSON: ${line || "(empty)"}`);
}
const output = parsed as JudgeOutput;
if (typeof output.score !== "number") {
throw new Error(`judge "${judge.name}" output missing numeric score`);
}
const schema =
judge.schema !== null ? await loadSchema(resolve(taskDir, judge.schema)) : GENERIC_DATA_SCHEMA;
return { score: output.score, data: output.data, schema };
}
/** Load and parse an OCAS JSON Schema file. */
async function loadSchema(path: string): Promise<JSONSchema> {
const text = await readFile(path, "utf8");
return JSON.parse(text) as JSONSchema;
}
/**
* Default judge runner. Builtin judges are skipped for now (placeholder score 0
* with empty data); task judges spawn their entry script.
*/
const defaultJudgeRunner: JudgeRunner = async (taskDir, workDir, threadId, judge) => {
if (judge.builtin) {
return { score: 0, data: {}, schema: GENERIC_DATA_SCHEMA };
}
return runTaskJudge(taskDir, workDir, threadId, judge);
};
/** Persist judge data to CAS under its schema and return the CAS hash. */
async function storeJudgeData(store: Store, schema: JSONSchema, data: unknown): Promise<CasRef> {
const schemaHash = await putSchema(store, schema);
return (await store.cas.put(schemaHash, data)) as CasRef;
}
/**
* Run all judges, store their data and the overall eval-run record in CAS, then
* index the run under `@uwf/eval/<task>/latest`.
*/
export async function collect(
input: CollectInput,
runJudge: JudgeRunner = defaultJudgeRunner,
): Promise<CollectResult> {
const { evalStore, taskDir, workDir, threadId, manifest, config } = input;
const { store, varStore } = evalStore;
const records: EvalJudgeRecord[] = [];
for (const judge of manifest.judges) {
const result = await runJudge(taskDir, workDir, threadId, judge);
const dataHash = await storeJudgeData(store, result.schema, result.data);
records.push({ name: judge.name, score: result.score, weight: judge.weight, dataHash });
log(LOG_JUDGE, `judge=${judge.name} score=${result.score} weight=${judge.weight}`);
}
const overall = computeOverall(records);
const payload: EvalRunPayload = {
task: manifest.name,
config,
threadId,
judges: records,
overall,
timestamp: Date.now(),
};
const schemaHash = await putSchema(store, EVAL_RUN_SCHEMA);
const runHash = (await store.cas.put(schemaHash, payload)) as string;
setEvalLatest(varStore, manifest.name, runHash);
log(LOG_STORED, `stored eval-run task=${manifest.name} hash=${runHash} overall=${overall}`);
const judges: JudgeSummary[] = records.map((r) => ({
name: r.name,
score: r.score,
weight: r.weight,
}));
return { runHash, overall, judges };
}
+87
View File
@@ -0,0 +1,87 @@
import { execFileSync } from "node:child_process";
import { createLogger } from "@united-workforce/util";
import type { ExecuteInput, ExecuteResult } from "./types.js";
const log = createLogger({ sink: { kind: "stderr" } });
const LOG_START = "EX5M2T9V";
const LOG_EXEC = "EX7Q4K2N";
/** Resolve the uwf CLI binary. Override with `UWF_BIN` for testing. */
function uwfBin(): string {
const override = process.env.UWF_BIN;
return override !== undefined && override !== "" ? override : "uwf";
}
/** Run a uwf subcommand and return trimmed stdout. */
function runUwf(args: string[], cwd: string): string {
try {
return execFileSync(uwfBin(), args, {
encoding: "utf8",
stdio: ["ignore", "pipe", "pipe"],
maxBuffer: 50 * 1024 * 1024,
cwd,
}).trim();
} catch (e) {
const err = e as NodeJS.ErrnoException & { stderr?: Buffer | string | null };
const stderr =
err.stderr == null
? ""
: typeof err.stderr === "string"
? err.stderr
: err.stderr.toString("utf8");
const detail = stderr.trim() !== "" ? `: ${stderr.trim()}` : "";
throw new Error(`uwf ${args[0]} ${args[1]} failed${detail}`);
}
}
/** Parse the thread ID from `uwf thread start` JSON output (`{ workflow, thread }`). */
function parseThreadId(stdout: string): string {
let parsed: unknown;
try {
parsed = JSON.parse(stdout);
} catch {
throw new Error(`uwf thread start did not emit valid JSON: ${stdout || "(empty)"}`);
}
const obj = parsed as Record<string, unknown>;
const thread = obj.thread;
if (typeof thread !== "string" || thread === "") {
throw new Error(`uwf thread start output missing thread id: ${stdout}`);
}
return thread;
}
/**
* Execute a workflow: create a thread, then run it for up to `maxSteps` steps.
* Shells out to the uwf CLI rather than importing it directly.
*/
export async function execute(input: ExecuteInput): Promise<ExecuteResult> {
const startOut = runUwf(
["thread", "start", input.workflow, "-p", input.prompt, "--cwd", input.workDir],
input.workDir,
);
const threadId = parseThreadId(startOut);
log(LOG_START, `thread started thread=${threadId} workflow=${input.workflow}`);
runUwf(
["thread", "exec", threadId, "--agent", input.agent, "-c", String(input.maxSteps)],
input.workDir,
);
log(LOG_EXEC, `thread executed thread=${threadId} maxSteps=${input.maxSteps}`);
return { threadId };
}
/** Best-effort lookup of the uwf engine version (`uwf -V`); "unknown" on failure. */
export function getEngineVersion(): string {
try {
return execFileSync(uwfBin(), ["-V"], {
encoding: "utf8",
stdio: ["ignore", "pipe", "ignore"],
}).trim();
} catch {
return "unknown";
}
}
+15
View File
@@ -0,0 +1,15 @@
export { collect, computeOverall } from "./collect.js";
export { execute, getEngineVersion } from "./execute.js";
export { prepare } from "./prepare.js";
export type {
CollectInput,
CollectResult,
ExecuteInput,
ExecuteResult,
JudgeRunner,
JudgeRunOutput,
JudgeSummary,
PrepareResult,
RunOptions,
RunResult,
} from "./types.js";
+45
View File
@@ -0,0 +1,45 @@
import { access, cp, mkdir, mkdtemp } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { createLogger } from "@united-workforce/util";
import { loadTaskManifest } from "../task/index.js";
import type { PrepareResult } from "./types.js";
const log = createLogger({ sink: { kind: "stderr" } });
const LOG_PREPARE = "PRE4K2NQ";
const LOG_FIXTURE = "PRE7M3VX";
/** Check whether a path exists. */
async function pathExists(path: string): Promise<boolean> {
try {
await access(path);
return true;
} catch {
return false;
}
}
/**
* Prepare a task for execution: read its manifest and copy the fixture
* directory into a fresh temp working directory.
*/
export async function prepare(taskDir: string): Promise<PrepareResult> {
const manifest = await loadTaskManifest(taskDir);
log(LOG_PREPARE, `loaded task manifest name=${manifest.name} workflow=${manifest.workflow}`);
const workDir = await mkdtemp(join(tmpdir(), "uwf-eval-"));
const fixtureDir = join(taskDir, "fixture");
if (await pathExists(fixtureDir)) {
await cp(fixtureDir, workDir, { recursive: true });
log(LOG_FIXTURE, `copied fixture into workDir=${workDir}`);
} else {
await mkdir(workDir, { recursive: true });
log(LOG_FIXTURE, `no fixture/ found, using empty workDir=${workDir}`);
}
return { taskDir, workDir, manifest };
}
+85
View File
@@ -0,0 +1,85 @@
import type { JSONSchema } from "@ocas/core";
import type { EvalRunConfig, EvalStore } from "../storage/index.js";
import type { JudgeEntry, TaskManifest } from "../task/index.js";
/** Result of the prepare phase: task dir, temp working dir, parsed manifest. */
export type PrepareResult = {
taskDir: string;
workDir: string;
manifest: TaskManifest;
};
/** Input to the execute phase. */
export type ExecuteInput = {
/** Working directory the workflow runs in (the prepared temp dir). */
workDir: string;
/** Workflow name or path (from task.yaml). */
workflow: string;
/** Initial prompt for the thread. */
prompt: string;
/** Agent adapter to use. */
agent: string;
/** Maximum number of steps to execute. */
maxSteps: number;
};
/** Result of the execute phase. */
export type ExecuteResult = {
threadId: string;
};
/** Output produced by running a single judge. */
export type JudgeRunOutput = {
score: number;
data: unknown;
/** Schema describing `data`, used when persisting to CAS. */
schema: JSONSchema;
};
/** Pluggable judge execution strategy (injectable for testing). */
export type JudgeRunner = (
taskDir: string,
workDir: string,
threadId: string,
judge: JudgeEntry,
) => Promise<JudgeRunOutput>;
/** Input to the collect phase. */
export type CollectInput = {
evalStore: EvalStore;
taskDir: string;
workDir: string;
threadId: string;
manifest: TaskManifest;
config: EvalRunConfig;
};
/** A single judge's summarized result in the run output. */
export type JudgeSummary = {
name: string;
score: number;
weight: number;
};
/** Result of the collect phase. */
export type CollectResult = {
runHash: string;
overall: number;
judges: JudgeSummary[];
};
/** Options for a full eval run (from CLI flags). */
export type RunOptions = {
agent: string;
model: string;
count: number;
};
/** Final result of a full eval run. */
export type RunResult = {
runHash: string;
overall: number;
task: string;
judges: JudgeSummary[];
};
+2 -1
View File
@@ -5,4 +5,5 @@ export {
EVAL_JUDGE_UPSTREAM_SCHEMA, EVAL_JUDGE_UPSTREAM_SCHEMA,
EVAL_RUN_SCHEMA, EVAL_RUN_SCHEMA,
} from "./schemas.js"; } from "./schemas.js";
export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload } from "./types.js"; export { createEvalStore, setEvalLatest } from "./store.js";
export type { EvalJudgeRecord, EvalRunConfig, EvalRunPayload, EvalStore } from "./types.js";
+42
View File
@@ -0,0 +1,42 @@
import { mkdir } from "node:fs/promises";
import { homedir } from "node:os";
import { join } from "node:path";
import type { VarStore } from "@ocas/core";
import { bootstrap, type Store } from "@ocas/core";
import { createFsStore, createSqliteVarStore } from "@ocas/fs";
import type { EvalStore } from "./types.js";
/** Variable name prefix for eval run pointers (`@uwf/eval/<task>/latest`). */
const EVAL_VAR_PREFIX = "@uwf/eval/";
/**
* Resolve the global CAS directory shared by all uwf and ocas tools.
* Priority: `OCAS_HOME` → default ~/.ocas (matches uwf CLI's getGlobalCasDir).
*/
function getGlobalCasDir(): string {
const primary = process.env.OCAS_HOME;
if (primary !== undefined && primary !== "") {
return primary;
}
return join(homedir(), ".ocas");
}
/**
* Open the unified OCAS store on the filesystem.
* Shares the same CAS + variable backend as the uwf CLI.
*/
export async function createEvalStore(): Promise<EvalStore> {
const casDir = getGlobalCasDir();
await mkdir(casDir, { recursive: true });
const cas = createFsStore(casDir);
const { var: varStore, tag } = createSqliteVarStore(join(casDir, "vars"), cas);
const store: Store = { cas, var: varStore, tag };
bootstrap(store);
return { store, varStore };
}
/** Set the `@uwf/eval/<task>/latest` variable to point at a run hash. */
export function setEvalLatest(varStore: VarStore, taskName: string, runHash: string): void {
varStore.set(`${EVAL_VAR_PREFIX}${taskName}/latest`, runHash);
}
+7
View File
@@ -1,5 +1,12 @@
import type { Store, VarStore } from "@ocas/core";
import type { CasRef } from "@united-workforce/protocol"; import type { CasRef } from "@united-workforce/protocol";
/** Handle to the OCAS store used for eval persistence. */
export type EvalStore = {
store: Store;
varStore: VarStore;
};
/** A single judge result within an eval run. */ /** A single judge result within an eval run. */
export type EvalJudgeRecord = { export type EvalJudgeRecord = {
name: string; name: string;