feat: eval package scaffold with CLI, schemas, types, task loader
CI / check (pull_request) Successful in 1m42s
CI / check (pull_request) Successful in 1m42s
New package @united-workforce/eval (uwf-eval CLI):

- CLI skeleton: run/report/diff/list subcommands (stubs)
- 5 OCAS schemas: eval-run, judge-frontmatter, judge-upstream, judge-hallucination, judge-token-stats
- TaskManifest type + parser/validator for task.yaml
- JudgeOutput/JudgeInput types for judge contract
- EvalRunPayload/EvalRunConfig/EvalJudgeRecord storage types
- 19 unit tests: task loader validation + schema definitions

Refs #69
This commit is contained in:
@@ -0,0 +1,63 @@
|
||||
import { describe, expect, test } from "vitest";
|
||||
import {
|
||||
EVAL_JUDGE_FRONTMATTER_SCHEMA,
|
||||
EVAL_JUDGE_HALLUCINATION_SCHEMA,
|
||||
EVAL_JUDGE_TOKEN_STATS_SCHEMA,
|
||||
EVAL_JUDGE_UPSTREAM_SCHEMA,
|
||||
EVAL_RUN_SCHEMA,
|
||||
} from "../src/storage/index.js";
|
||||
|
||||
/**
 * Structural checks for the exported OCAS schema constants: each schema's
 * `title`, the field names listed in its `required` array, and that every
 * schema declares `type: "object"` at its root.
 */
describe("OCAS schema definitions", () => {
  test("eval-run schema has correct title and required fields", () => {
    expect(EVAL_RUN_SCHEMA.title).toBe("@uwf/eval-run");
    // arrayContaining asserts a subset match in one shot, so a failure shows
    // the whole `required` list and no type assertion is needed.
    expect(EVAL_RUN_SCHEMA.required).toEqual(
      expect.arrayContaining([
        "task",
        "config",
        "threadId",
        "judges",
        "overall",
        "timestamp",
      ]),
    );
  });

  test("frontmatter judge schema has correct title", () => {
    expect(EVAL_JUDGE_FRONTMATTER_SCHEMA.title).toBe("@uwf/eval-judge-frontmatter");
    expect(EVAL_JUDGE_FRONTMATTER_SCHEMA.required).toEqual(
      expect.arrayContaining(["stepsTotal", "stepsValid", "invalidSteps"]),
    );
  });

  test("upstream judge schema has correct title", () => {
    expect(EVAL_JUDGE_UPSTREAM_SCHEMA.title).toBe("@uwf/eval-judge-upstream");
    expect(EVAL_JUDGE_UPSTREAM_SCHEMA.required).toEqual(
      expect.arrayContaining(["perStep"]),
    );
  });

  test("hallucination judge schema has correct title", () => {
    expect(EVAL_JUDGE_HALLUCINATION_SCHEMA.title).toBe("@uwf/eval-judge-hallucination");
    expect(EVAL_JUDGE_HALLUCINATION_SCHEMA.required).toEqual(
      expect.arrayContaining(["perStep"]),
    );
  });

  test("token-stats judge schema has correct title", () => {
    expect(EVAL_JUDGE_TOKEN_STATS_SCHEMA.title).toBe("@uwf/eval-judge-token-stats");
    expect(EVAL_JUDGE_TOKEN_STATS_SCHEMA.required).toEqual(
      expect.arrayContaining(["totalInput", "totalOutput", "totalTurns", "perStep"]),
    );
  });

  test("all schemas have type object at root", () => {
    const schemas = [
      EVAL_RUN_SCHEMA,
      EVAL_JUDGE_FRONTMATTER_SCHEMA,
      EVAL_JUDGE_UPSTREAM_SCHEMA,
      EVAL_JUDGE_HALLUCINATION_SCHEMA,
      EVAL_JUDGE_TOKEN_STATS_SCHEMA,
    ];
    for (const schema of schemas) {
      expect(schema.type).toBe("object");
    }
  });
});
|
||||
@@ -0,0 +1,163 @@
|
||||
import { describe, expect, test } from "vitest";
|
||||
import { parseTaskManifest } from "../src/task/index.js";
|
||||
|
||||
/**
 * Complete task.yaml fixture shared by the happy-path tests: every top-level
 * field is present, with one builtin judge and one custom judge that
 * supplies `entry` and `schema`.
 *
 * Indentation inside the literal is significant: `limits` children and the
 * `judges` list items must be nested for the YAML to yield a limits mapping
 * and a two-element judges array.
 */
const VALID_YAML = `
name: fix-off-by-one
description: Fix an off-by-one error in a calculator
workflow: solve-issue
prompt: "Fix the bug: add(1,2) returns 4 instead of 3"
limits:
  maxSteps: 15
  timeoutMinutes: 30
judges:
  - name: frontmatter-compliance
    weight: 0.15
    builtin: true
  - name: test-pass
    weight: 0.3
    entry: dist/judges/test-pass.js
    schema: schemas/test-pass.json
`;
|
||||
|
||||
/**
 * Tests for `parseTaskManifest`: parsing of a complete manifest, the defaults
 * applied to omitted optional fields (limits, description, judge weight), and
 * the error messages thrown for invalid manifests.
 *
 * Inline YAML fixtures start at column 0 inside their template literals
 * because YAML indentation is significant.
 */
describe("parseTaskManifest", () => {
  test("parses valid task.yaml", () => {
    const manifest = parseTaskManifest(VALID_YAML);
    expect(manifest.name).toBe("fix-off-by-one");
    expect(manifest.description).toBe("Fix an off-by-one error in a calculator");
    expect(manifest.workflow).toBe("solve-issue");
    expect(manifest.prompt).toBe("Fix the bug: add(1,2) returns 4 instead of 3");
    expect(manifest.limits).toEqual({ maxSteps: 15, timeoutMinutes: 30 });
    expect(manifest.judges).toHaveLength(2);
  });

  test("parses builtin judge", () => {
    const manifest = parseTaskManifest(VALID_YAML);
    const builtin = manifest.judges[0];
    expect(builtin).toBeDefined();
    // toMatchObject checks all fields at once without non-null assertions;
    // `entry: null` only matches an actual null, not undefined.
    expect(builtin).toMatchObject({
      name: "frontmatter-compliance",
      weight: 0.15,
      builtin: true,
      entry: null,
    });
  });

  test("parses custom judge with entry + schema", () => {
    const manifest = parseTaskManifest(VALID_YAML);
    const custom = manifest.judges[1];
    expect(custom).toBeDefined();
    expect(custom).toMatchObject({
      name: "test-pass",
      weight: 0.3,
      builtin: false,
      entry: "dist/judges/test-pass.js",
      schema: "schemas/test-pass.json",
    });
  });

  test("defaults limits when omitted", () => {
    const yaml = `
name: minimal
workflow: solve-issue
prompt: do something
judges:
  - name: check
    builtin: true
`;
    const manifest = parseTaskManifest(yaml);
    expect(manifest.limits).toEqual({ maxSteps: 20, timeoutMinutes: 30 });
  });

  test("defaults description to empty string", () => {
    const yaml = `
name: no-desc
workflow: solve-issue
prompt: do something
judges:
  - name: check
    builtin: true
`;
    const manifest = parseTaskManifest(yaml);
    expect(manifest.description).toBe("");
  });

  test("rejects missing name", () => {
    const yaml = `
workflow: solve-issue
prompt: do something
judges:
  - name: check
    builtin: true
`;
    expect(() => parseTaskManifest(yaml)).toThrow("name is required");
  });

  test("rejects missing workflow", () => {
    const yaml = `
name: test
prompt: do something
judges:
  - name: check
    builtin: true
`;
    expect(() => parseTaskManifest(yaml)).toThrow("workflow is required");
  });

  test("rejects missing prompt", () => {
    const yaml = `
name: test
workflow: solve-issue
judges:
  - name: check
    builtin: true
`;
    expect(() => parseTaskManifest(yaml)).toThrow("prompt is required");
  });

  test("rejects empty judges array", () => {
    const yaml = `
name: test
workflow: solve-issue
prompt: do something
judges: []
`;
    expect(() => parseTaskManifest(yaml)).toThrow("at least one judge");
  });

  test("rejects non-builtin judge without entry", () => {
    const yaml = `
name: test
workflow: solve-issue
prompt: do something
judges:
  - name: custom-check
    weight: 0.5
`;
    expect(() => parseTaskManifest(yaml)).toThrow("non-builtin judge must have entry");
  });

  test("rejects non-object YAML root", () => {
    expect(() => parseTaskManifest("just a string")).toThrow("must be a YAML mapping");
  });

  test("rejects judge without name", () => {
    // The manifest-level name is present, so the "name is required" error
    // can only come from the judge entry.
    const yaml = `
name: test
workflow: solve-issue
prompt: do something
judges:
  - weight: 0.5
    builtin: true
`;
    expect(() => parseTaskManifest(yaml)).toThrow("name is required");
  });

  test("defaults weight to 0 when omitted", () => {
    const yaml = `
name: test
workflow: solve-issue
prompt: do something
judges:
  - name: token-stats
    builtin: true
`;
    const manifest = parseTaskManifest(yaml);
    // Optional chaining: a missing judge yields undefined, which fails toBe(0).
    expect(manifest.judges[0]?.weight).toBe(0);
  });
});
|
||||
Reference in New Issue
Block a user