feat: eval run command — prepare, execute, collect pipeline
CI / check (pull_request) Successful in 1m45s
CI / check (pull_request) Successful in 1m45s
Implement the `uwf-eval run <task-dir>` command with a 3-phase pipeline:

- prepare: read task.yaml, copy fixture/ to temp workdir
- execute: shell out to uwf thread start + exec
- collect: run judges, compute weighted score, store CAS node, set `@uwf/eval/<task>/latest` variable

Changes:

- src/runner/ — types, prepare, execute, collect, index
- src/storage/store.ts — createEvalStore(), setEvalLatest()
- src/commands/run.ts — full pipeline wiring with --agent/--model/--count
- 9 new tests (prepare + collect + weighted scoring)

Builtin judges return placeholder score 0 (Phase 1c).

Refs #70
This commit is contained in:
@@ -0,0 +1,150 @@
|
||||
import { execFileSync } from "node:child_process";
|
||||
import { readFile } from "node:fs/promises";
|
||||
import { resolve } from "node:path";
|
||||
|
||||
import type { JSONSchema, Store } from "@ocas/core";
|
||||
import { putSchema } from "@ocas/core";
|
||||
import type { CasRef } from "@united-workforce/protocol";
|
||||
import { createLogger } from "@united-workforce/util";
|
||||
|
||||
import type { JudgeOutput } from "../judge/index.js";
|
||||
import type { EvalJudgeRecord, EvalRunPayload } from "../storage/index.js";
|
||||
import { EVAL_RUN_SCHEMA, setEvalLatest } from "../storage/index.js";
|
||||
import type { JudgeEntry } from "../task/index.js";
|
||||
import type {
|
||||
CollectInput,
|
||||
CollectResult,
|
||||
JudgeRunner,
|
||||
JudgeRunOutput,
|
||||
JudgeSummary,
|
||||
} from "./types.js";
|
||||
|
||||
// Module-level logger, configured with a stderr sink.
const log = createLogger({ sink: { kind: "stderr" } });
|
||||
|
||||
// Tag passed to `log` for the per-judge result line (name, score, weight).
const LOG_JUDGE = "CT6N3P2K";
|
||||
// Tag passed to `log` for the line emitted after the eval-run record is stored.
const LOG_STORED = "CT9V2Q7M";
|
||||
|
||||
/** Permissive schema for judge data without a dedicated schema (e.g. builtin placeholders). */
|
||||
// Fallback used when a task judge declares no schema, and for builtin placeholder results.
const GENERIC_DATA_SCHEMA: JSONSchema = { type: "object" };
|
||||
|
||||
/**
 * Compute the weighted overall score: `sum(score * weight) / sum(weight)`.
 *
 * Judges with weight 0 are informational and are skipped entirely, so they
 * cannot affect the result even when their score is NaN or ±Infinity (a plain
 * `score * 0` evaluates to NaN for those values and would poison the sum).
 *
 * @param judges - Scored judges together with their weights.
 * @returns The weighted mean, or 0 when the total weight is not positive
 *   (which includes an empty list).
 */
export function computeOverall(judges: ReadonlyArray<{ score: number; weight: number }>): number {
  let totalWeight = 0;
  let weighted = 0;
  for (const { score, weight } of judges) {
    // Zero-weight judges contribute nothing to either the numerator or the denominator.
    if (weight === 0) {
      continue;
    }
    totalWeight += weight;
    weighted += score * weight;
  }
  return totalWeight > 0 ? weighted / totalWeight : 0;
}
|
||||
|
||||
/**
 * Run a task-provided judge script: `node <entry> <cwd> <threadId>`.
 *
 * The entry path is resolved against `taskDir` and the script is executed
 * synchronously. Only the last line of the trimmed stdout is parsed as JSON,
 * so anything the script prints on earlier lines is ignored.
 *
 * @param taskDir - Directory that `judge.entry` and `judge.schema` are resolved against.
 * @param workDir - Passed to the script as its first argument.
 * @param threadId - Passed to the script as its second argument.
 * @param judge - Judge definition; `entry` must be non-null.
 * @returns The judge's score and data, plus the schema loaded from
 *   `judge.schema`, or the generic object schema when `judge.schema` is null.
 * @throws Error when the judge has no entry, the process fails, the last
 *   stdout line is not valid JSON, the JSON is not an object, or the score is
 *   missing or not a finite number.
 */
async function runTaskJudge(
  taskDir: string,
  workDir: string,
  threadId: string,
  judge: JudgeEntry,
): Promise<JudgeRunOutput> {
  if (judge.entry === null) {
    throw new Error(`judge "${judge.name}" is not builtin but has no entry`);
  }
  const entryPath = resolve(taskDir, judge.entry);

  let stdout: string;
  try {
    stdout = execFileSync("node", [entryPath, workDir, threadId], {
      encoding: "utf8",
      stdio: ["ignore", "pipe", "pipe"],
      maxBuffer: 50 * 1024 * 1024,
    });
  } catch (e) {
    const message = e instanceof Error ? e.message : String(e);
    throw new Error(`judge "${judge.name}" failed: ${message}`);
  }

  const line = stdout.trim().split("\n").pop()?.trim() ?? "";
  let parsed: unknown;
  try {
    parsed = JSON.parse(line);
  } catch {
    throw new Error(`judge "${judge.name}" stdout is not valid JSON: ${line || "(empty)"}`);
  }

  // `null` and primitives are valid JSON; reject them here so the property
  // access below cannot throw a bare TypeError.
  if (typeof parsed !== "object" || parsed === null) {
    throw new Error(`judge "${judge.name}" output is not a JSON object: ${line}`);
  }
  const output = parsed as JudgeOutput;
  if (typeof output.score !== "number") {
    throw new Error(`judge "${judge.name}" output missing numeric score`);
  }
  // JSON.parse turns overflowing literals such as 1e999 into Infinity.
  if (!Number.isFinite(output.score)) {
    throw new Error(`judge "${judge.name}" output score is not a finite number`);
  }

  const schema =
    judge.schema !== null ? await loadSchema(resolve(taskDir, judge.schema)) : GENERIC_DATA_SCHEMA;
  return { score: output.score, data: output.data, schema };
}
|
||||
|
||||
/**
 * Read the file at `path` as UTF-8 text and parse it as JSON.
 *
 * The parsed value is returned as a `JSONSchema` without further validation;
 * read and parse errors propagate to the caller unchanged.
 */
async function loadSchema(path: string): Promise<JSONSchema> {
  const raw = await readFile(path, { encoding: "utf8" });
  const schema: JSONSchema = JSON.parse(raw);
  return schema;
}
|
||||
|
||||
/**
 * Default judge runner.
 *
 * Builtin judges are not executed yet: they resolve to a placeholder result
 * (score 0, empty data, generic object schema). Every other judge is delegated
 * to `runTaskJudge`, which spawns the judge's entry script.
 */
const defaultJudgeRunner: JudgeRunner = async (taskDir, workDir, threadId, judge) =>
  judge.builtin
    ? { score: 0, data: {}, schema: GENERIC_DATA_SCHEMA }
    : runTaskJudge(taskDir, workDir, threadId, judge);
|
||||
|
||||
/**
 * Register `schema` in the store, then write `data` to CAS under the
 * resulting schema hash.
 *
 * @returns The value returned by `store.cas.put`, typed as a `CasRef`.
 */
async function storeJudgeData(store: Store, schema: JSONSchema, data: unknown): Promise<CasRef> {
  const schemaRef = await putSchema(store, schema);
  const stored = await store.cas.put(schemaRef, data);
  return stored as CasRef;
}
|
||||
|
||||
/**
 * Collect phase of an eval run.
 *
 * Runs every judge listed in the manifest one after another, stores each
 * judge's data in CAS, computes the weighted overall score, writes the
 * eval-run record to CAS under `EVAL_RUN_SCHEMA`, and finally points the
 * task's "latest" entry at the new record via `setEvalLatest`.
 *
 * @param input - Stores, directories, thread id, task manifest and run config.
 * @param runJudge - Judge runner to use; defaults to `defaultJudgeRunner`.
 * @returns The hash of the stored run record, the overall score, and a
 *   per-judge summary (name, score, weight).
 */
export async function collect(
  input: CollectInput,
  runJudge: JudgeRunner = defaultJudgeRunner,
): Promise<CollectResult> {
  const { evalStore, taskDir, workDir, threadId, manifest, config } = input;
  const { store, varStore } = evalStore;

  // Judges run sequentially; each result is persisted before the next judge starts.
  const records: EvalJudgeRecord[] = [];
  for (const judge of manifest.judges) {
    const { score, schema, data } = await runJudge(taskDir, workDir, threadId, judge);
    const dataHash = await storeJudgeData(store, schema, data);
    const { name, weight } = judge;
    records.push({ name, score, weight, dataHash });
    log(LOG_JUDGE, `judge=${name} score=${score} weight=${weight}`);
  }

  const overall = computeOverall(records);
  const payload: EvalRunPayload = {
    task: manifest.name,
    config,
    threadId,
    judges: records,
    overall,
    timestamp: Date.now(),
  };

  const runSchemaHash = await putSchema(store, EVAL_RUN_SCHEMA);
  const runHash = (await store.cas.put(runSchemaHash, payload)) as string;
  setEvalLatest(varStore, manifest.name, runHash);
  log(LOG_STORED, `stored eval-run task=${manifest.name} hash=${runHash} overall=${overall}`);

  // The summary omits the CAS data hash carried by the stored records.
  const judges: JudgeSummary[] = records.map(({ name, score, weight }) => ({ name, score, weight }));
  return { runHash, overall, judges };
}
|
||||
Reference in New Issue
Block a user