feat: eval run command — prepare, execute, collect pipeline
CI / check (pull_request) Successful in 1m45s
CI / check (pull_request) Successful in 1m45s
Implement the uwf-eval run <task-dir> command with 3-phase pipeline: - prepare: read task.yaml, copy fixture/ to temp workdir - execute: shell out to uwf thread start + exec - collect: run judges, compute weighted score, store CAS node, set @uwf/eval/<task>/latest variable Changes: - src/runner/ — types, prepare, execute, collect, index - src/storage/store.ts — createEvalStore(), setEvalLatest() - src/commands/run.ts — full pipeline wiring with --agent/--model/--count - 9 new tests (prepare + collect + weighted scoring) Builtin judges return placeholder score 0 (Phase 1c). Refs #70
This commit is contained in:
@@ -0,0 +1,74 @@
|
||||
import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
|
||||
import { afterEach, beforeEach, describe, expect, test } from "vitest";
|
||||
|
||||
import { prepare } from "../src/runner/index.js";
|
||||
|
||||
const TASK_YAML = `
|
||||
name: fix-off-by-one
|
||||
description: Fix an off-by-one error
|
||||
workflow: solve-issue
|
||||
prompt: "Fix the bug"
|
||||
limits:
|
||||
maxSteps: 12
|
||||
timeoutMinutes: 20
|
||||
judges:
|
||||
- name: frontmatter-compliance
|
||||
weight: 0.5
|
||||
builtin: true
|
||||
- name: test-pass
|
||||
weight: 0.5
|
||||
entry: dist/judges/test-pass.js
|
||||
`;
|
||||
|
||||
let taskDir: string;
|
||||
|
||||
beforeEach(async () => {
|
||||
taskDir = await mkdtemp(join(tmpdir(), "uwf-eval-task-"));
|
||||
await writeFile(join(taskDir, "task.yaml"), TASK_YAML, "utf8");
|
||||
const fixtureDir = join(taskDir, "fixture");
|
||||
await mkdir(join(fixtureDir, "src"), { recursive: true });
|
||||
await writeFile(join(fixtureDir, "src", "calc.ts"), "export const add = (a, b) => a + b + 1;\n");
|
||||
await writeFile(join(fixtureDir, "package.json"), '{ "name": "fixture" }\n');
|
||||
});
|
||||
|
||||
afterEach(async () => {
|
||||
await rm(taskDir, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
describe("prepare", () => {
|
||||
test("returns the parsed manifest", async () => {
|
||||
const result = await prepare(taskDir);
|
||||
expect(result.taskDir).toBe(taskDir);
|
||||
expect(result.manifest.name).toBe("fix-off-by-one");
|
||||
expect(result.manifest.workflow).toBe("solve-issue");
|
||||
expect(result.manifest.limits.maxSteps).toBe(12);
|
||||
expect(result.manifest.judges).toHaveLength(2);
|
||||
});
|
||||
|
||||
test("copies fixture into a fresh temp work dir", async () => {
|
||||
const result = await prepare(taskDir);
|
||||
expect(result.workDir).not.toBe(taskDir);
|
||||
expect(result.workDir.startsWith(tmpdir())).toBe(true);
|
||||
|
||||
const calc = await readFile(join(result.workDir, "src", "calc.ts"), "utf8");
|
||||
expect(calc).toContain("export const add");
|
||||
const pkg = await readFile(join(result.workDir, "package.json"), "utf8");
|
||||
expect(pkg).toContain("fixture");
|
||||
|
||||
await rm(result.workDir, { recursive: true, force: true });
|
||||
});
|
||||
|
||||
test("creates an empty work dir when no fixture/ exists", async () => {
|
||||
const noFixtureDir = await mkdtemp(join(tmpdir(), "uwf-eval-nofix-"));
|
||||
await writeFile(join(noFixtureDir, "task.yaml"), TASK_YAML, "utf8");
|
||||
|
||||
const result = await prepare(noFixtureDir);
|
||||
expect(result.workDir.startsWith(tmpdir())).toBe(true);
|
||||
|
||||
await rm(noFixtureDir, { recursive: true, force: true });
|
||||
await rm(result.workDir, { recursive: true, force: true });
|
||||
});
|
||||
});
|
||||
Reference in New Issue
Block a user