diff --git a/packages/eval/__tests__/builtin-judges.test.ts b/packages/eval/__tests__/builtin-judges.test.ts
new file mode 100644
index 0000000..9b4768d
--- /dev/null
+++ b/packages/eval/__tests__/builtin-judges.test.ts
@@ -0,0 +1,204 @@
+import type { StepEntry } from "@united-workforce/protocol";
+import { beforeEach, describe, expect, test, vi } from "vitest";
+
+import {
+  runFrontmatterJudge,
+  runHallucinationJudge,
+  runTokenStatsJudge,
+  runUpstreamJudge,
+} from "../src/judge/builtin/index.js";
+
+// Mock the shared read-steps helper so the judges never shell out to `uwf`.
+vi.mock("../src/judge/builtin/read-steps.js", () => ({
+  readThreadSteps: vi.fn(),
+}));
+
+import { readThreadSteps } from "../src/judge/builtin/read-steps.js";
+
+const mockedReadSteps = vi.mocked(readThreadSteps);
+
+/** Build a StepEntry whose defaults carry valid frontmatter; `overrides` replaces fields. */
+function makeStep(overrides: Partial<StepEntry>): StepEntry {
+  return {
+    hash: "HASH000000000",
+    role: "worker",
+    output: "---\n$status: done\n---\n\nbody",
+    detail: "DETAIL0000000",
+    agent: "hermes",
+    timestamp: 0,
+    durationMs: 0,
+    usage: null,
+    ...overrides,
+  };
+}
+
+beforeEach(() => {
+  mockedReadSteps.mockReset();
+});
+
+describe("frontmatter-compliance judge", () => {
+  test("all steps have valid frontmatter → score 1.0", async () => {
+    mockedReadSteps.mockReturnValue([
+      makeStep({ role: "a", output: "---\n$status: done\n---\n\nwork" }),
+      makeStep({ role: "b", output: "---\n$status: needs_input\n---\nmore" }),
+    ]);
+
+    const result = await runFrontmatterJudge("T1");
+    const data = result.data as { stepsTotal: number; stepsValid: number; invalidSteps: unknown[] };
+
+    expect(result.score).toBe(1.0);
+    expect(data.stepsTotal).toBe(2);
+    expect(data.stepsValid).toBe(2);
+    expect(data.invalidSteps).toHaveLength(0);
+  });
+
+  test("some steps missing $status → partial score", async () => {
+    mockedReadSteps.mockReturnValue([
+      makeStep({ role: "a", output: "---\n$status: done\n---\nok" }),
+      makeStep({ role: "b", output: "---\nfoo: bar\n---\nmissing status" }),
+      makeStep({ role: "c", output: "no frontmatter at all" }),
+    ]);
+
+    const result = await runFrontmatterJudge("T2");
+    const data = result.data as {
+      stepsTotal: number;
+      stepsValid: number;
+      invalidSteps: Array<{ stepIndex: number; role: string; errors: string[] }>;
+    };
+
+    expect(result.score).toBeCloseTo(1 / 3, 10);
+    expect(data.stepsTotal).toBe(3);
+    expect(data.stepsValid).toBe(1);
+    expect(data.invalidSteps).toHaveLength(2);
+    expect(data.invalidSteps[0]).toMatchObject({ stepIndex: 1, role: "b" });
+    expect(data.invalidSteps[1]).toMatchObject({ stepIndex: 2, role: "c" });
+  });
+
+  test("no steps → score 0 (0/0 edge case)", async () => {
+    mockedReadSteps.mockReturnValue([]);
+
+    const result = await runFrontmatterJudge("T3");
+    const data = result.data as { stepsTotal: number; stepsValid: number; invalidSteps: unknown[] };
+
+    expect(result.score).toBe(0);
+    expect(data.stepsTotal).toBe(0);
+    expect(data.stepsValid).toBe(0);
+    expect(data.invalidSteps).toHaveLength(0);
+  });
+
+  test("empty-string $status counts as invalid", async () => {
+    mockedReadSteps.mockReturnValue([makeStep({ role: "a", output: '---\n$status: ""\n---\nx' })]);
+
+    const result = await runFrontmatterJudge("T4");
+    expect(result.score).toBe(0);
+  });
+});
+
+describe("token-stats judge", () => {
+  test("steps with usage → sums correctly", async () => {
+    mockedReadSteps.mockReturnValue([
+      makeStep({
+        role: "a",
+        usage: { turns: 2, inputTokens: 100, outputTokens: 50, duration: 1.5 },
+      }),
+      makeStep({
+        role: "b",
+        usage: { turns: 3, inputTokens: 200, outputTokens: 75, duration: 2.0 },
+      }),
+    ]);
+
+    const result = await runTokenStatsJudge("T1");
+    const data = result.data as {
+      totalInput: number;
+      totalOutput: number;
+      totalTurns: number;
+      perStep: Array<{
+        role: string;
+        inputTokens: number;
+        outputTokens: number;
+        turns: number;
+        duration: number;
+      }>;
+    };
+
+    expect(result.score).toBe(1.0);
+    expect(data.totalInput).toBe(300);
+    expect(data.totalOutput).toBe(125);
+    expect(data.totalTurns).toBe(5);
+    expect(data.perStep).toHaveLength(2);
+    expect(data.perStep[0]).toEqual({
+      role: "a",
+      inputTokens: 100,
+      outputTokens: 50,
+      turns: 2,
+      duration: 1.5,
+    });
+  });
+
+  test("steps with null usage → zeros", async () => {
+    mockedReadSteps.mockReturnValue([
+      makeStep({ role: "a", usage: null }),
+      makeStep({ role: "b", usage: null }),
+    ]);
+
+    const result = await runTokenStatsJudge("T2");
+    const data = result.data as {
+      totalInput: number;
+      totalOutput: number;
+      totalTurns: number;
+      perStep: Array<{
+        role: string;
+        inputTokens: number;
+        outputTokens: number;
+        turns: number;
+        duration: number;
+      }>;
+    };
+
+    expect(result.score).toBe(1.0);
+    expect(data.totalInput).toBe(0);
+    expect(data.totalOutput).toBe(0);
+    expect(data.totalTurns).toBe(0);
+    expect(data.perStep[0]).toEqual({
+      role: "a",
+      inputTokens: 0,
+      outputTokens: 0,
+      turns: 0,
+      duration: 0,
+    });
+  });
+
+  test("empty steps → all zeros, score 1.0", async () => {
+    mockedReadSteps.mockReturnValue([]);
+
+    const result = await runTokenStatsJudge("T3");
+    const data = result.data as {
+      totalInput: number;
+      totalOutput: number;
+      totalTurns: number;
+      perStep: unknown[];
+    };
+
+    expect(result.score).toBe(1.0);
+    expect(data.totalInput).toBe(0);
+    expect(data.totalOutput).toBe(0);
+    expect(data.totalTurns).toBe(0);
+    expect(data.perStep).toHaveLength(0);
+  });
+});
+
+describe("LLM-as-judge stubs", () => {
+  test("upstream-consumption returns a stub", async () => {
+    const result = await runUpstreamJudge("T1");
+    expect(result.score).toBe(0);
+    expect(result.data).toEqual({ perStep: [] });
+    expect(result.schema.title).toBe("@uwf/eval-judge-upstream");
+  });
+
+  test("hallucination returns a stub", async () => {
+    const result = await runHallucinationJudge("T1");
+    expect(result.score).toBe(0);
+    expect(result.data).toEqual({ perStep: [] });
+    expect(result.schema.title).toBe("@uwf/eval-judge-hallucination");
+  });
+});
diff --git
a/packages/eval/__tests__/collect.test.ts b/packages/eval/__tests__/collect.test.ts
index 7622d09..5d22c14 100644
--- a/packages/eval/__tests__/collect.test.ts
+++ b/packages/eval/__tests__/collect.test.ts
@@ -133,25 +133,20 @@ describe("collect", () => {
     expect(tokenStats?.weight).toBe(0);
   });
 
-  test("builtin judges are skipped with placeholder score 0", async () => {
+  test("unknown builtin judge name throws via the default runner", async () => {
     const evalStore = makeEvalStore();
-    const manifest = makeManifest([makeJudge("frontmatter-compliance", 1.0, true)]);
+    const manifest = makeManifest([makeJudge("not-a-real-judge", 1.0, true)]);
 
-    // Use the default runner (no injected runner) → builtin skipped → score 0.
-    const result = await collect({
-      evalStore,
-      taskDir: "/tmp/task",
-      workDir: "/tmp/work",
-      threadId: "THREAD123",
-      manifest,
-      config: CONFIG,
-    });
-
-    expect(result.overall).toBe(0);
-    expect(result.judges[0]).toEqual({
-      name: "frontmatter-compliance",
-      score: 0,
-      weight: 1.0,
-    });
+    // Use the default runner (no injected runner) → builtin dispatch → unknown name throws.
+    await expect(
+      collect({
+        evalStore,
+        taskDir: "/tmp/task",
+        workDir: "/tmp/work",
+        threadId: "THREAD123",
+        manifest,
+        config: CONFIG,
+      }),
+    ).rejects.toThrow(/unknown builtin judge/);
   });
 });
diff --git a/packages/eval/src/judge/builtin/frontmatter.ts b/packages/eval/src/judge/builtin/frontmatter.ts
new file mode 100644
index 0000000..46ab5a3
--- /dev/null
+++ b/packages/eval/src/judge/builtin/frontmatter.ts
@@ -0,0 +1,114 @@
+import { createLogger } from "@united-workforce/util";
+import { parse as parseYaml } from "yaml";
+
+import { EVAL_JUDGE_FRONTMATTER_SCHEMA } from "../../storage/index.js";
+import { readThreadSteps } from "./read-steps.js";
+import type { BuiltinJudgeOutput } from "./types.js";
+
+const log = createLogger({ sink: { kind: "stderr" } });
+
+const LOG_RESULT = "F2QH7R4M";
+
+const FENCE = "---";
+
+/** Line terminator accepted in step output: LF or CRLF. */
+const LINE_BREAK = /\r?\n/;
+
+type InvalidStep = {
+  stepIndex: number;
+  role: string;
+  errors: string[];
+};
+
+/**
+ * Extract the YAML frontmatter block from a step output.
+ *
+ * The block must open with a `---` fence on the very first line and is closed
+ * by the next line that consists solely of `---` (trailing whitespace allowed).
+ * LF and CRLF line endings are both accepted, and an empty block yields "".
+ *
+ * @param output - Raw step output; a non-string value has no frontmatter.
+ * @returns The YAML between the fences with lines joined by `\n`, or null when
+ *   the output does not start with a complete fenced block.
+ */
+function extractFrontmatterYaml(output: unknown): string | null {
+  if (typeof output !== "string") {
+    return null;
+  }
+  const lines = output.split(LINE_BREAK);
+  if (lines[0] !== FENCE) {
+    return null;
+  }
+  // Compare whole lines so YAML content that merely starts with `---` (such as
+  // `----` or `---x`) is not mistaken for the closing fence.
+  const closeIndex = lines.findIndex((line, i) => i > 0 && line.trimEnd() === FENCE);
+  if (closeIndex === -1) {
+    return null;
+  }
+  return lines.slice(1, closeIndex).join("\n");
+}
+
+/**
+ * Validate a single step's frontmatter.
+ *
+ * @param output - Raw step output to inspect.
+ * @returns A list of human-readable errors; an empty list means the step is valid.
+ */
+function validateStepFrontmatter(output: unknown): string[] {
+  const yaml = extractFrontmatterYaml(output);
+  if (yaml === null) {
+    return ["output does not begin with a valid '---' frontmatter block"];
+  }
+
+  let parsed: unknown;
+  try {
+    parsed = parseYaml(yaml);
+  } catch (e) {
+    const message = e instanceof Error ? e.message : String(e);
+    return [`frontmatter YAML failed to parse: ${message}`];
+  }
+
+  if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
+    return ["frontmatter is not a YAML mapping"];
+  }
+
+  const status = (parsed as Record<string, unknown>).$status;
+  if (typeof status !== "string" || status.trim() === "") {
+    return ["$status field is missing or not a non-empty string"];
+  }
+
+  return [];
+}
+
+/**
+ * Deterministic judge: every step's agent output must contain valid YAML
+ * frontmatter with a non-empty `$status` field.
+ *
+ * @param threadId - Thread whose steps are read via `readThreadSteps`.
+ * @returns Score = stepsValid / stepsTotal (0 when there are no steps), plus
+ *   the per-step validation errors in `data.invalidSteps`.
+ */
+export async function runFrontmatterJudge(threadId: string): Promise<BuiltinJudgeOutput> {
+  const steps = readThreadSteps(threadId);
+
+  const invalidSteps: InvalidStep[] = [];
+  // `entries()` pairs each step with its index, avoiding an unchecked `steps[i]`.
+  for (const [stepIndex, step] of steps.entries()) {
+    const errors = validateStepFrontmatter(step.output);
+    if (errors.length > 0) {
+      invalidSteps.push({ stepIndex, role: step.role, errors });
+    }
+  }
+
+  const stepsTotal = steps.length;
+  const stepsValid = stepsTotal - invalidSteps.length;
+  const score = stepsTotal > 0 ? stepsValid / stepsTotal : 0;
+
+  log(LOG_RESULT, `frontmatter thread=${threadId} valid=${stepsValid}/${stepsTotal}`);
+
+  return {
+    score,
+    data: { stepsTotal, stepsValid, invalidSteps },
+    schema: EVAL_JUDGE_FRONTMATTER_SCHEMA,
+  };
+}
diff --git a/packages/eval/src/judge/builtin/hallucination.ts b/packages/eval/src/judge/builtin/hallucination.ts
new file mode 100644
index 0000000..702b743
--- /dev/null
+++ b/packages/eval/src/judge/builtin/hallucination.ts
@@ -0,0 +1,17 @@
+import { EVAL_JUDGE_HALLUCINATION_SCHEMA } from "../../storage/index.js";
+import type { BuiltinJudgeOutput } from "./types.js";
+
+/**
+ * LLM-as-judge: detects claims in each step's output that are not grounded in
+ * the available context (hallucinations).
+ *
+ * TODO: LLM-as-judge — needs provider config to call LLM API.
Returns a stub
+ * (score 0, empty perStep) until the LLM call path is wired up.
+ */
+export async function runHallucinationJudge(_threadId: string): Promise<BuiltinJudgeOutput> {
+  return {
+    score: 0,
+    data: { perStep: [] },
+    schema: EVAL_JUDGE_HALLUCINATION_SCHEMA,
+  };
+}
diff --git a/packages/eval/src/judge/builtin/index.ts b/packages/eval/src/judge/builtin/index.ts
new file mode 100644
index 0000000..a7dad57
--- /dev/null
+++ b/packages/eval/src/judge/builtin/index.ts
@@ -0,0 +1,6 @@
+export { runFrontmatterJudge } from "./frontmatter.js";
+export { runHallucinationJudge } from "./hallucination.js";
+export { readThreadSteps } from "./read-steps.js";
+export { runTokenStatsJudge } from "./token-stats.js";
+export type { BuiltinJudge, BuiltinJudgeOutput } from "./types.js";
+export { runUpstreamJudge } from "./upstream.js";
diff --git a/packages/eval/src/judge/builtin/read-steps.ts b/packages/eval/src/judge/builtin/read-steps.ts
new file mode 100644
index 0000000..38221b9
--- /dev/null
+++ b/packages/eval/src/judge/builtin/read-steps.ts
@@ -0,0 +1,31 @@
+import { execFileSync } from "node:child_process";
+
+import type { StepEntry, ThreadStepsOutput } from "@united-workforce/protocol";
+
+/** Cap on captured `uwf step list` stdout; Node's default for execFileSync is 1 MiB. */
+const MAX_STDOUT_BYTES = 64 * 1024 * 1024;
+
+/**
+ * Shell out to `uwf step list <threadId>` and return the parsed step entries.
+ *
+ * @param threadId - Thread whose steps are listed.
+ * @returns Every entry after the first; `steps[0]` is the StartEntry and is dropped.
+ * @throws If `uwf` fails, its stdout is not valid JSON, or the JSON has no
+ *   `steps` array.
+ */
+export function readThreadSteps(threadId: string): StepEntry[] {
+  const stdout = execFileSync("uwf", ["step", "list", threadId], {
+    encoding: "utf8",
+    stdio: ["ignore", "pipe", "pipe"],
+    maxBuffer: MAX_STDOUT_BYTES,
+  }).trim();
+
+  // The payload crosses a process boundary, so check its shape instead of
+  // trusting the cast; a malformed reply then fails with a clear message.
+  const parsed = JSON.parse(stdout) as Partial<ThreadStepsOutput> | null;
+  const steps = parsed?.steps;
+  if (!Array.isArray(steps)) {
+    throw new Error(`uwf step list ${threadId}: expected a JSON object with a "steps" array`);
+  }
+  return steps.slice(1) as StepEntry[];
+}
diff --git a/packages/eval/src/judge/builtin/token-stats.ts b/packages/eval/src/judge/builtin/token-stats.ts
new file mode 100644
index 0000000..cd7396d
--- /dev/null
+++ b/packages/eval/src/judge/builtin/token-stats.ts
@@ -0,0 +1,57 @@
+import { createLogger } from "@united-workforce/util";
+
+import { EVAL_JUDGE_TOKEN_STATS_SCHEMA } from "../../storage/index.js";
+import { readThreadSteps } from "./read-steps.js";
+import type { BuiltinJudgeOutput } from "./types.js";
+
+const log = createLogger({ sink: { kind: "stderr" } });
+
+const LOG_RESULT = "T7KQ3M9P";
+
+type PerStepStats = {
+  role: string;
+  inputTokens: number;
+  outputTokens: number;
+  turns: number;
+  duration: number;
+};
+
+/** Usage figures reported for a step that recorded none. */
+const ZERO_USAGE = { inputTokens: 0, outputTokens: 0, turns: 0, duration: 0 };
+
+/**
+ * Informational judge: aggregate token usage across every step. Always scores
+ * 1.0 — it never penalizes a run, it only reports usage. Steps without usage
+ * contribute zeros.
+ *
+ * @param threadId - Thread whose steps are read via `readThreadSteps`.
+ * @returns Score 1.0 with the totals and a per-step breakdown in `data`.
+ */
+export async function runTokenStatsJudge(threadId: string): Promise<BuiltinJudgeOutput> {
+  const steps = readThreadSteps(threadId);
+
+  let totalInput = 0;
+  let totalOutput = 0;
+  let totalTurns = 0;
+  const perStep: PerStepStats[] = [];
+
+  for (const step of steps) {
+    // `??` covers a missing (undefined) `usage` as well as an explicit null; the
+    // step list is parsed from external JSON, so the field may be absent.
+    const { inputTokens, outputTokens, turns, duration } = step.usage ?? ZERO_USAGE;
+
+    totalInput += inputTokens;
+    totalOutput += outputTokens;
+    totalTurns += turns;
+
+    perStep.push({ role: step.role, inputTokens, outputTokens, turns, duration });
+  }
+
+  log(LOG_RESULT, `token-stats thread=${threadId} in=${totalInput} out=${totalOutput}`);
+
+  return {
+    score: 1.0,
+    data: { totalInput, totalOutput, totalTurns, perStep },
+    schema: EVAL_JUDGE_TOKEN_STATS_SCHEMA,
+  };
+}
diff --git a/packages/eval/src/judge/builtin/types.ts b/packages/eval/src/judge/builtin/types.ts
new file mode 100644
index 0000000..1d21037
--- /dev/null
+++ b/packages/eval/src/judge/builtin/types.ts
@@ -0,0 +1,16 @@
+import type { JSONSchema } from "@ocas/core";
+
+/**
+ * Output produced by a builtin judge. Structurally identical to the runner's
+ * `JudgeRunOutput`; defined locally to keep the judge module free of a
+ * dependency on the runner module.
+ */
+export type BuiltinJudgeOutput = {
+  score: number;
+  data: unknown;
+  /** Schema describing `data`, used when persisting to CAS. */
+  schema: JSONSchema;
+};
+
+/** A builtin judge analyzes a thread's steps and returns a scored result. */
+export type BuiltinJudge = (threadId: string) => Promise<BuiltinJudgeOutput>;
diff --git a/packages/eval/src/judge/builtin/upstream.ts b/packages/eval/src/judge/builtin/upstream.ts
new file mode 100644
index 0000000..0fb548f
--- /dev/null
+++ b/packages/eval/src/judge/builtin/upstream.ts
@@ -0,0 +1,17 @@
+import { EVAL_JUDGE_UPSTREAM_SCHEMA } from "../../storage/index.js";
+import type { BuiltinJudgeOutput } from "./types.js";
+
+/**
+ * LLM-as-judge: measures how well each role consumed the relevant outputs from
+ * upstream steps.
+ *
+ * TODO: LLM-as-judge — needs provider config to call LLM API. Returns a stub
+ * (score 0, empty perStep) until the LLM call path is wired up.
+ */
+export async function runUpstreamJudge(_threadId: string): Promise<BuiltinJudgeOutput> {
+  return {
+    score: 0,
+    data: { perStep: [] },
+    schema: EVAL_JUDGE_UPSTREAM_SCHEMA,
+  };
+}
diff --git a/packages/eval/src/judge/index.ts b/packages/eval/src/judge/index.ts
index ebb4dfb..84c63e5 100644
--- a/packages/eval/src/judge/index.ts
+++ b/packages/eval/src/judge/index.ts
@@ -1 +1,10 @@
+export {
+  type BuiltinJudge,
+  type BuiltinJudgeOutput,
+  readThreadSteps,
+  runFrontmatterJudge,
+  runHallucinationJudge,
+  runTokenStatsJudge,
+  runUpstreamJudge,
+} from "./builtin/index.js";
 export type { JudgeInput, JudgeOutput } from "./types.js";
diff --git a/packages/eval/src/runner/collect.ts b/packages/eval/src/runner/collect.ts
index b487dc8..da745f7 100644
--- a/packages/eval/src/runner/collect.ts
+++ b/packages/eval/src/runner/collect.ts
@@ -8,6 +8,12 @@ import type { CasRef } from "@united-workforce/protocol";
 import { createLogger } from "@united-workforce/util";
 
 import type { JudgeOutput } from "../judge/index.js";
+import {
+  runFrontmatterJudge,
+  runHallucinationJudge,
+  runTokenStatsJudge,
+  runUpstreamJudge,
+} from "../judge/index.js";
 import type { EvalJudgeRecord, EvalRunPayload } from "../storage/index.js";
 import { EVAL_RUN_SCHEMA, setEvalLatest } from "../storage/index.js";
 import type { JudgeEntry } from "../task/index.js";
@@ -89,13 +95,29 @@ async function loadSchema(path: string): Promise<JSONSchema> {
   return JSON.parse(text) as JSONSchema;
 }
 
+/** Dispatch a builtin judge by name. Throws on an unknown builtin name. */
+async function runBuiltinJudge(name: string, threadId: string): Promise<JudgeRunOutput> {
+  switch (name) {
+    case "frontmatter-compliance":
+      return runFrontmatterJudge(threadId);
+    case "upstream-consumption":
+      return runUpstreamJudge(threadId);
+    case "hallucination":
+      return runHallucinationJudge(threadId);
+    case "token-stats":
+      return runTokenStatsJudge(threadId);
+    default:
+      throw new Error(`unknown builtin judge "${name}"`);
+  }
+}
+
 /**
- * Default judge runner. Builtin judges are skipped for now (placeholder score 0
- * with empty data); task judges spawn their entry script.
+ * Default judge runner. Builtin judges are dispatched by name; task judges spawn
+ * their entry script.
  */
 const defaultJudgeRunner: JudgeRunner = async (taskDir, workDir, threadId, judge) => {
   if (judge.builtin) {
-    return { score: 0, data: {}, schema: GENERIC_DATA_SCHEMA };
+    return runBuiltinJudge(judge.name, threadId);
   }
   return runTaskJudge(taskDir, workDir, threadId, judge);
 };