feat: builtin judges — frontmatter + token-stats (deterministic) + upstream/hallucination (stubs)
CI / check (pull_request) Successful in 1m45s
CI / check (pull_request) Successful in 1m45s
Implement 4 builtin judges for the eval framework:

- frontmatter-compliance: validates YAML frontmatter with a `$status` field; score = stepsValid / stepsTotal
- token-stats: aggregates Usage from step nodes; always scores 1.0 (informational only)
- upstream-consumption: LLM-as-judge stub (score 0, TODO)
- hallucination: LLM-as-judge stub (score 0, TODO)

Infrastructure:

- judge/builtin/read-steps.ts — shells out to `uwf step list`
- judge/builtin/types.ts — BuiltinJudge, BuiltinJudgeOutput
- runner/collect.ts — dispatches builtin judges by name

9 new tests (frontmatter validation + token aggregation).

Refs #71
This commit is contained in:
@@ -0,0 +1,196 @@
|
|||||||
|
import type { StepEntry } from "@united-workforce/protocol";
|
||||||
|
import { beforeEach, describe, expect, test, vi } from "vitest";
|
||||||
|
|
||||||
|
import {
|
||||||
|
runFrontmatterJudge,
|
||||||
|
runHallucinationJudge,
|
||||||
|
runTokenStatsJudge,
|
||||||
|
runUpstreamJudge,
|
||||||
|
} from "../src/judge/builtin/index.js";
|
||||||
|
|
||||||
|
// Swap the read-steps module for a stub so that no judge under test spawns the
// real `uwf` CLI; each test supplies its own steps through the mock.
vi.mock("../src/judge/builtin/read-steps.js", () => {
  const readThreadSteps = vi.fn();
  return { readThreadSteps };
});
|
||||||
|
|
||||||
|
import { readThreadSteps } from "../src/judge/builtin/read-steps.js";
|
||||||
|
|
||||||
|
const mockedReadSteps = vi.mocked(readThreadSteps);
|
||||||
|
|
||||||
|
/**
 * Build a `StepEntry` fixture. Every field starts from a fixed placeholder
 * (the default output carries a valid `$status: done` frontmatter); any field
 * present in `overrides` replaces its placeholder.
 */
function makeStep(overrides: Partial<StepEntry>): StepEntry {
  const defaults: StepEntry = {
    hash: "HASH000000000",
    role: "worker",
    output: "---\n$status: done\n---\n\nbody",
    detail: "DETAIL0000000",
    agent: "hermes",
    timestamp: 0,
    durationMs: 0,
    usage: null,
  };
  return { ...defaults, ...overrides };
}
|
||||||
|
|
||||||
|
// Reset the shared mock before every test so a return value configured by one
// test never leaks into the next. The hook deliberately returns nothing.
beforeEach(function resetReadStepsMock(): void {
  mockedReadSteps.mockReset();
});
|
||||||
|
|
||||||
|
describe("frontmatter-compliance judge", () => {
  type InvalidStepReport = { stepIndex: number; role: string; errors: string[] };
  type FrontmatterData = {
    stepsTotal: number;
    stepsValid: number;
    invalidSteps: InvalidStepReport[];
  };

  /** Serve `steps` from the mocked reader, run the judge, and return score + typed data. */
  async function judge(threadId: string, steps: StepEntry[]) {
    mockedReadSteps.mockReturnValue(steps);
    const result = await runFrontmatterJudge(threadId);
    return { score: result.score, data: result.data as FrontmatterData };
  }

  test("all steps have valid frontmatter → score 1.0", async () => {
    const { score, data } = await judge("T1", [
      makeStep({ role: "a", output: "---\n$status: done\n---\n\nwork" }),
      makeStep({ role: "b", output: "---\n$status: needs_input\n---\nmore" }),
    ]);

    expect(score).toBe(1.0);
    expect(data.stepsTotal).toBe(2);
    expect(data.stepsValid).toBe(2);
    expect(data.invalidSteps).toHaveLength(0);
  });

  test("some steps missing $status → partial score", async () => {
    const { score, data } = await judge("T2", [
      makeStep({ role: "a", output: "---\n$status: done\n---\nok" }),
      makeStep({ role: "b", output: "---\nfoo: bar\n---\nmissing status" }),
      makeStep({ role: "c", output: "no frontmatter at all" }),
    ]);

    // Only the first of the three steps is compliant.
    expect(score).toBeCloseTo(1 / 3, 10);
    expect(data.stepsTotal).toBe(3);
    expect(data.stepsValid).toBe(1);
    expect(data.invalidSteps).toHaveLength(2);

    const [second, third] = data.invalidSteps;
    expect(second).toMatchObject({ stepIndex: 1, role: "b" });
    expect(third).toMatchObject({ stepIndex: 2, role: "c" });
  });

  test("no steps → score 0 (0/0 edge case)", async () => {
    const { score, data } = await judge("T3", []);

    expect(score).toBe(0);
    expect(data.stepsTotal).toBe(0);
    expect(data.stepsValid).toBe(0);
    expect(data.invalidSteps).toHaveLength(0);
  });

  test("empty-string $status counts as invalid", async () => {
    const { score } = await judge("T4", [makeStep({ role: "a", output: '---\n$status: ""\n---\nx' })]);

    expect(score).toBe(0);
  });
});
|
||||||
|
|
||||||
|
describe("token-stats judge", () => {
  type StepUsageRow = {
    role: string;
    inputTokens: number;
    outputTokens: number;
    turns: number;
    duration: number;
  };
  type TokenStatsData = {
    totalInput: number;
    totalOutput: number;
    totalTurns: number;
    perStep: StepUsageRow[];
  };

  /** Serve `steps` from the mocked reader, run the judge, and return score + typed data. */
  async function judge(threadId: string, steps: StepEntry[]) {
    mockedReadSteps.mockReturnValue(steps);
    const result = await runTokenStatsJudge(threadId);
    return { score: result.score, data: result.data as TokenStatsData };
  }

  test("steps with usage → sums correctly", async () => {
    const { score, data } = await judge("T1", [
      makeStep({
        role: "a",
        usage: { turns: 2, inputTokens: 100, outputTokens: 50, duration: 1.5 },
      }),
      makeStep({
        role: "b",
        usage: { turns: 3, inputTokens: 200, outputTokens: 75, duration: 2.0 },
      }),
    ]);

    expect(score).toBe(1.0);
    expect(data.totalInput).toBe(300);
    expect(data.totalOutput).toBe(125);
    expect(data.totalTurns).toBe(5);
    expect(data.perStep).toHaveLength(2);
    expect(data.perStep[0]).toEqual({
      role: "a",
      inputTokens: 100,
      outputTokens: 50,
      turns: 2,
      duration: 1.5,
    });
  });

  test("steps with null usage → zeros", async () => {
    const { score, data } = await judge("T2", [
      makeStep({ role: "a", usage: null }),
      makeStep({ role: "b", usage: null }),
    ]);

    expect(score).toBe(1.0);
    expect(data.totalInput).toBe(0);
    expect(data.totalOutput).toBe(0);
    expect(data.totalTurns).toBe(0);
    expect(data.perStep[0]).toEqual({
      role: "a",
      inputTokens: 0,
      outputTokens: 0,
      turns: 0,
      duration: 0,
    });
  });

  test("empty steps → all zeros, score 1.0", async () => {
    const { score, data } = await judge("T3", []);

    expect(score).toBe(1.0);
    expect(data.totalInput).toBe(0);
    expect(data.totalOutput).toBe(0);
    expect(data.totalTurns).toBe(0);
    expect(data.perStep).toHaveLength(0);
  });
});
|
||||||
|
|
||||||
|
describe("LLM-as-judge stubs", () => {
  // Both stubs are expected to report score 0 with an empty per-step list; they
  // differ only in the schema attached to the result.
  test("upstream-consumption returns a stub", async () => {
    const { score, data, schema } = await runUpstreamJudge("T1");

    expect(score).toBe(0);
    expect(data).toEqual({ perStep: [] });
    expect(schema.title).toBe("@uwf/eval-judge-upstream");
  });

  test("hallucination returns a stub", async () => {
    const { score, data, schema } = await runHallucinationJudge("T1");

    expect(score).toBe(0);
    expect(data).toEqual({ perStep: [] });
    expect(schema.title).toBe("@uwf/eval-judge-hallucination");
  });
});
|
||||||
@@ -133,25 +133,20 @@ describe("collect", () => {
|
|||||||
expect(tokenStats?.weight).toBe(0);
|
expect(tokenStats?.weight).toBe(0);
|
||||||
});
|
});
|
||||||
|
|
||||||
test("unknown builtin judge name throws via the default runner", async () => {
  // No runner is injected, so collect() uses its default runner; the builtin
  // dispatch does not recognise the name and the returned promise rejects.
  const pending = collect({
    evalStore: makeEvalStore(),
    taskDir: "/tmp/task",
    workDir: "/tmp/work",
    threadId: "THREAD123",
    manifest: makeManifest([makeJudge("not-a-real-judge", 1.0, true)]),
    config: CONFIG,
  });

  await expect(pending).rejects.toThrow(/unknown builtin judge/);
});
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -0,0 +1,95 @@
|
|||||||
|
import { createLogger } from "@united-workforce/util";
|
||||||
|
import { parse as parseYaml } from "yaml";
|
||||||
|
|
||||||
|
import { EVAL_JUDGE_FRONTMATTER_SCHEMA } from "../../storage/index.js";
|
||||||
|
import { readThreadSteps } from "./read-steps.js";
|
||||||
|
import type { BuiltinJudgeOutput } from "./types.js";
|
||||||
|
|
||||||
|
const log = createLogger({ sink: { kind: "stderr" } });
|
||||||
|
|
||||||
|
const LOG_RESULT = "F2QH7R4M";
|
||||||
|
|
||||||
|
/** Marker line that opens and closes a frontmatter block. */
const FENCE = "---";

/** One step that failed validation: its position, its role, and why it failed. */
type InvalidStep = {
  stepIndex: number;
  role: string;
  errors: string[];
};

/**
 * Extract the YAML frontmatter block from a step output.
 *
 * The output must open with a `---` line and contain a later `---` line that
 * closes the block. A fence only counts when it is a line of its own (trailing
 * whitespace is tolerated), so content such as `---oops` or `----` does not
 * terminate the block. Both LF and CRLF line endings are accepted.
 *
 * @param output - The step output; anything that is not a string yields null.
 * @returns The YAML between the fences, with lines joined by `\n` (the empty
 *   string for an empty block), or null when there is no opening fence or no
 *   closing fence.
 */
function extractFrontmatterYaml(output: unknown): string | null {
  if (typeof output !== "string") {
    return null;
  }

  const lines = output.split(/\r?\n/);
  const isFence = (line: string): boolean => line.trimEnd() === FENCE;

  // `split` always yields at least one element; the fallback only satisfies
  // stricter index-access checking.
  if (!isFence(lines[0] ?? "")) {
    return null;
  }

  // Search from the second line: the first line is the opening fence itself.
  const closeLine = lines.findIndex((line, index) => index > 0 && isFence(line));
  if (closeLine === -1) {
    return null;
  }

  return lines.slice(1, closeLine).join("\n");
}
|
||||||
|
|
||||||
|
/**
 * Check one step output for a usable frontmatter block.
 *
 * @returns A list of problems; an empty list means the output has a YAML
 *   mapping frontmatter whose `$status` is a non-blank string.
 */
function validateStepFrontmatter(output: unknown): string[] {
  const block = extractFrontmatterYaml(output);
  if (block === null) {
    return ["output does not begin with a valid '---' frontmatter block"];
  }

  let doc: unknown;
  try {
    doc = parseYaml(block);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return [`frontmatter YAML failed to parse: ${reason}`];
  }

  // Scalars, null and sequences all parse successfully but are not mappings.
  const isMapping = typeof doc === "object" && doc !== null && !Array.isArray(doc);
  if (!isMapping) {
    return ["frontmatter is not a YAML mapping"];
  }

  const { $status } = doc as Record<string, unknown>;
  const hasStatus = typeof $status === "string" && $status.trim() !== "";
  return hasStatus ? [] : ["$status field is missing or not a non-empty string"];
}
|
||||||
|
|
||||||
|
/**
 * Deterministic judge for frontmatter compliance.
 *
 * Reads the thread's steps, validates each step's output, and scores the thread
 * as the fraction of steps that passed. A thread with no steps scores 0. The
 * result data lists every failing step with its index, role and errors.
 */
export async function runFrontmatterJudge(threadId: string): Promise<BuiltinJudgeOutput> {
  const steps = readThreadSteps(threadId);

  const invalidSteps: InvalidStep[] = [];
  for (const [stepIndex, step] of steps.entries()) {
    const errors = validateStepFrontmatter(step.output);
    if (errors.length > 0) {
      invalidSteps.push({ stepIndex, role: step.role, errors });
    }
  }

  const stepsTotal = steps.length;
  const stepsValid = stepsTotal - invalidSteps.length;
  // Guard the 0/0 case: an empty thread is scored 0 rather than NaN.
  const score = stepsTotal === 0 ? 0 : stepsValid / stepsTotal;

  log(LOG_RESULT, `frontmatter thread=${threadId} valid=${stepsValid}/${stepsTotal}`);

  const data = { stepsTotal, stepsValid, invalidSteps };
  return { score, data, schema: EVAL_JUDGE_FRONTMATTER_SCHEMA };
}
|
||||||
@@ -0,0 +1,17 @@
|
|||||||
|
import { EVAL_JUDGE_HALLUCINATION_SCHEMA } from "../../storage/index.js";
|
||||||
|
import type { BuiltinJudgeOutput } from "./types.js";
|
||||||
|
|
||||||
|
/**
 * LLM-as-judge for hallucinations: claims in a step's output that are not
 * grounded in the available context.
 *
 * TODO: LLM-as-judge — needs provider config to call LLM API. Until that call
 * path exists this is a stub: the thread id is ignored and the result is always
 * score 0 with an empty `perStep` list.
 */
export async function runHallucinationJudge(_threadId: string): Promise<BuiltinJudgeOutput> {
  const stub: BuiltinJudgeOutput = {
    score: 0,
    data: { perStep: [] },
    schema: EVAL_JUDGE_HALLUCINATION_SCHEMA,
  };
  return stub;
}
|
||||||
@@ -0,0 +1,6 @@
|
|||||||
|
export { runFrontmatterJudge } from "./frontmatter.js";
|
||||||
|
export { runHallucinationJudge } from "./hallucination.js";
|
||||||
|
export { readThreadSteps } from "./read-steps.js";
|
||||||
|
export { runTokenStatsJudge } from "./token-stats.js";
|
||||||
|
export type { BuiltinJudge, BuiltinJudgeOutput } from "./types.js";
|
||||||
|
export { runUpstreamJudge } from "./upstream.js";
|
||||||
@@ -0,0 +1,14 @@
|
|||||||
|
import { execFileSync } from "node:child_process";
|
||||||
|
|
||||||
|
import type { StepEntry, ThreadStepsOutput } from "@united-workforce/protocol";
|
||||||
|
|
||||||
|
/**
 * Upper bound on the captured stdout of `uwf step list`. Node's default for
 * `execFileSync` is 1 MiB, which a thread with large step outputs can exceed.
 */
const STEP_LIST_MAX_BUFFER = 64 * 1024 * 1024;

/**
 * Shell out to `uwf step list <threadId>` and return the parsed step entries.
 *
 * The first element of the reported `steps` array is the start entry and is
 * dropped; the remaining elements are returned as step entries.
 *
 * @param threadId - Thread whose steps are listed; passed as a single argv
 *   element, never through a shell.
 * @returns The step entries after the start entry (possibly empty).
 * @throws Error when the `uwf` command cannot be run or exits unsuccessfully.
 * @throws SyntaxError when the command's stdout is not valid JSON.
 * @throws TypeError when the JSON has no `steps` array.
 */
export function readThreadSteps(threadId: string): StepEntry[] {
  const stdout = execFileSync("uwf", ["step", "list", threadId], {
    encoding: "utf8",
    stdio: ["ignore", "pipe", "pipe"],
    maxBuffer: STEP_LIST_MAX_BUFFER,
  }).trim();

  let parsed: unknown;
  try {
    parsed = JSON.parse(stdout);
  } catch (e) {
    const message = e instanceof Error ? e.message : String(e);
    throw new SyntaxError(`uwf step list ${threadId}: output is not valid JSON: ${message}`);
  }

  // Validate the shape before slicing so a malformed payload fails with a
  // message that names the command instead of a bare property-access error.
  const steps = (parsed as Partial<ThreadStepsOutput> | null)?.steps;
  if (!Array.isArray(steps)) {
    throw new TypeError(`uwf step list ${threadId}: output has no "steps" array`);
  }

  // steps[0] is the StartEntry; the rest are StepEntry records.
  return steps.slice(1) as StepEntry[];
}
|
||||||
@@ -0,0 +1,53 @@
|
|||||||
|
import { createLogger } from "@united-workforce/util";
|
||||||
|
|
||||||
|
import { EVAL_JUDGE_TOKEN_STATS_SCHEMA } from "../../storage/index.js";
|
||||||
|
import { readThreadSteps } from "./read-steps.js";
|
||||||
|
import type { BuiltinJudgeOutput } from "./types.js";
|
||||||
|
|
||||||
|
const log = createLogger({ sink: { kind: "stderr" } });
|
||||||
|
|
||||||
|
const LOG_RESULT = "T7KQ3M9P";
|
||||||
|
|
||||||
|
/** Usage figures recorded for one step; a step without usage is all zeros. */
type PerStepStats = {
  role: string;
  inputTokens: number;
  outputTokens: number;
  turns: number;
  duration: number;
};

/**
 * Informational judge that reports token usage for a thread.
 *
 * The score is always 1.0, so this judge never penalizes a run. The result data
 * holds the summed input tokens, output tokens and turns, plus one row per step
 * in step order. A step whose usage is null contributes a row of zeros.
 */
export async function runTokenStatsJudge(threadId: string): Promise<BuiltinJudgeOutput> {
  const perStep: PerStepStats[] = readThreadSteps(threadId).map((step) => {
    const { usage } = step;
    return {
      role: step.role,
      inputTokens: usage === null ? 0 : usage.inputTokens,
      outputTokens: usage === null ? 0 : usage.outputTokens,
      turns: usage === null ? 0 : usage.turns,
      duration: usage === null ? 0 : usage.duration,
    };
  });

  /** Sum one numeric column across all rows, in step order. */
  const sum = (pick: (row: PerStepStats) => number): number =>
    perStep.reduce((acc, row) => acc + pick(row), 0);

  const totalInput = sum((row) => row.inputTokens);
  const totalOutput = sum((row) => row.outputTokens);
  const totalTurns = sum((row) => row.turns);

  log(LOG_RESULT, `token-stats thread=${threadId} in=${totalInput} out=${totalOutput}`);

  const data = { totalInput, totalOutput, totalTurns, perStep };
  return { score: 1.0, data, schema: EVAL_JUDGE_TOKEN_STATS_SCHEMA };
}
|
||||||
@@ -0,0 +1,16 @@
|
|||||||
|
import type { JSONSchema } from "@ocas/core";
|
||||||
|
|
||||||
|
/**
 * Result returned by every builtin judge.
 *
 * It has the same structure as the runner's `JudgeRunOutput` but is declared
 * here so that the judge module does not depend on the runner module.
 */
export type BuiltinJudgeOutput = {
  /** Numeric score assigned by the judge. */
  score: number;
  /** Judge-specific payload; its shape is described by `schema`. */
  data: unknown;
  /** Schema describing `data`, used when persisting to CAS. */
  schema: JSONSchema;
};

/**
 * Signature shared by all builtin judges: given a thread id, asynchronously
 * produce a scored result for that thread.
 */
export type BuiltinJudge = (threadId: string) => Promise<BuiltinJudgeOutput>;
|
||||||
@@ -0,0 +1,17 @@
|
|||||||
|
import { EVAL_JUDGE_UPSTREAM_SCHEMA } from "../../storage/index.js";
|
||||||
|
import type { BuiltinJudgeOutput } from "./types.js";
|
||||||
|
|
||||||
|
/**
 * LLM-as-judge for upstream consumption: how well each role used the relevant
 * outputs of the steps that ran before it.
 *
 * TODO: LLM-as-judge — needs provider config to call LLM API. Until that call
 * path exists this is a stub: the thread id is ignored and the result is always
 * score 0 with an empty `perStep` list.
 */
export async function runUpstreamJudge(_threadId: string): Promise<BuiltinJudgeOutput> {
  const stub: BuiltinJudgeOutput = {
    score: 0,
    data: { perStep: [] },
    schema: EVAL_JUDGE_UPSTREAM_SCHEMA,
  };
  return stub;
}
|
||||||
@@ -1 +1,10 @@
|
|||||||
|
export {
|
||||||
|
type BuiltinJudge,
|
||||||
|
type BuiltinJudgeOutput,
|
||||||
|
readThreadSteps,
|
||||||
|
runFrontmatterJudge,
|
||||||
|
runHallucinationJudge,
|
||||||
|
runTokenStatsJudge,
|
||||||
|
runUpstreamJudge,
|
||||||
|
} from "./builtin/index.js";
|
||||||
export type { JudgeInput, JudgeOutput } from "./types.js";
|
export type { JudgeInput, JudgeOutput } from "./types.js";
|
||||||
|
|||||||
@@ -8,6 +8,12 @@ import type { CasRef } from "@united-workforce/protocol";
|
|||||||
import { createLogger } from "@united-workforce/util";
|
import { createLogger } from "@united-workforce/util";
|
||||||
|
|
||||||
import type { JudgeOutput } from "../judge/index.js";
|
import type { JudgeOutput } from "../judge/index.js";
|
||||||
|
import {
|
||||||
|
runFrontmatterJudge,
|
||||||
|
runHallucinationJudge,
|
||||||
|
runTokenStatsJudge,
|
||||||
|
runUpstreamJudge,
|
||||||
|
} from "../judge/index.js";
|
||||||
import type { EvalJudgeRecord, EvalRunPayload } from "../storage/index.js";
|
import type { EvalJudgeRecord, EvalRunPayload } from "../storage/index.js";
|
||||||
import { EVAL_RUN_SCHEMA, setEvalLatest } from "../storage/index.js";
|
import { EVAL_RUN_SCHEMA, setEvalLatest } from "../storage/index.js";
|
||||||
import type { JudgeEntry } from "../task/index.js";
|
import type { JudgeEntry } from "../task/index.js";
|
||||||
@@ -89,13 +95,29 @@ async function loadSchema(path: string): Promise<JSONSchema> {
|
|||||||
return JSON.parse(text) as JSONSchema;
|
return JSON.parse(text) as JSONSchema;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Run the builtin judge registered under `name` for the given thread.
 *
 * @throws Error (as a rejected promise) when `name` is not a known builtin.
 */
async function runBuiltinJudge(name: string, threadId: string): Promise<JudgeRunOutput> {
  // A Map keeps the lookup to exactly the registered names.
  const registry = new Map<string, (threadId: string) => Promise<JudgeRunOutput>>([
    ["frontmatter-compliance", runFrontmatterJudge],
    ["upstream-consumption", runUpstreamJudge],
    ["hallucination", runHallucinationJudge],
    ["token-stats", runTokenStatsJudge],
  ]);

  const judge = registry.get(name);
  if (judge === undefined) {
    throw new Error(`unknown builtin judge "${name}"`);
  }
  return judge(threadId);
}
|
||||||
|
|
||||||
/**
 * Judge runner used when the caller does not inject one. A judge flagged as
 * builtin is dispatched by its name; any other judge is run through its task
 * entry script.
 */
const defaultJudgeRunner: JudgeRunner = async (taskDir, workDir, threadId, judge) =>
  judge.builtin
    ? runBuiltinJudge(judge.name, threadId)
    : runTaskJudge(taskDir, workDir, threadId, judge);
|
||||||
|
|||||||
Reference in New Issue
Block a user