feat: builtin judges — frontmatter + token-stats (deterministic) + upstream/hallucination (stubs)
CI / check (pull_request) Successful in 1m45s
CI / check (pull_request) Successful in 1m45s
Implement four builtin judges for the eval framework:
- frontmatter-compliance: validates YAML frontmatter with a $status field; score = stepsValid / stepsTotal
- token-stats: aggregates Usage from step nodes; always scores 1.0 (informational only)
- upstream-consumption: LLM-as-judge stub (score 0, TODO)
- hallucination: LLM-as-judge stub (score 0, TODO)

Infrastructure:
- judge/builtin/read-steps.ts — shells out to `uwf step list`
- judge/builtin/types.ts — BuiltinJudge, BuiltinJudgeOutput
- runner/collect.ts — dispatches builtin judges by name

9 new tests (frontmatter validation + token aggregation)

Refs #71
This commit is contained in:
@@ -8,6 +8,12 @@ import type { CasRef } from "@united-workforce/protocol";
|
||||
import { createLogger } from "@united-workforce/util";
|
||||
|
||||
import type { JudgeOutput } from "../judge/index.js";
|
||||
import {
|
||||
runFrontmatterJudge,
|
||||
runHallucinationJudge,
|
||||
runTokenStatsJudge,
|
||||
runUpstreamJudge,
|
||||
} from "../judge/index.js";
|
||||
import type { EvalJudgeRecord, EvalRunPayload } from "../storage/index.js";
|
||||
import { EVAL_RUN_SCHEMA, setEvalLatest } from "../storage/index.js";
|
||||
import type { JudgeEntry } from "../task/index.js";
|
||||
@@ -89,13 +95,29 @@ async function loadSchema(path: string): Promise<JSONSchema> {
|
||||
return JSON.parse(text) as JSONSchema;
|
||||
}
|
||||
|
||||
/**
 * Routes a builtin judge request to its implementation, selected by judge name.
 *
 * Recognised names: "frontmatter-compliance", "upstream-consumption",
 * "hallucination" and "token-stats".
 *
 * @param name - Name of the builtin judge to run.
 * @param threadId - Forwarded unchanged to the selected judge.
 * @returns The output produced by the selected judge.
 * @throws Error if `name` is not one of the recognised builtin judges (surfaces
 *   as a rejected promise because this function is async).
 */
async function runBuiltinJudge(name: string, threadId: string): Promise<JudgeRunOutput> {
  if (name === "frontmatter-compliance") {
    return runFrontmatterJudge(threadId);
  }
  if (name === "upstream-consumption") {
    return runUpstreamJudge(threadId);
  }
  if (name === "hallucination") {
    return runHallucinationJudge(threadId);
  }
  if (name === "token-stats") {
    return runTokenStatsJudge(threadId);
  }
  // No match: fail loudly instead of inventing a score for an unknown judge.
  throw new Error(`unknown builtin judge "${name}"`);
}
|
||||
|
||||
/**
 * Default judge runner. Builtin judges are dispatched by name; task judges spawn
 * their entry script.
 *
 * @param taskDir - Forwarded to `runTaskJudge` for task judges; unused for builtins.
 * @param workDir - Forwarded to `runTaskJudge` for task judges; unused for builtins.
 * @param threadId - Passed to whichever judge implementation is selected.
 * @param judge - Judge entry; `judge.builtin` selects the builtin path and
 *   `judge.name` picks the builtin implementation.
 * @returns The selected judge's output.
 * @throws Error (as a rejected promise) when `judge.builtin` is set but
 *   `judge.name` is not a known builtin judge.
 */
const defaultJudgeRunner: JudgeRunner = async (taskDir, workDir, threadId, judge) => {
  if (judge.builtin) {
    // The former placeholder result (score 0, empty data) is gone: returning it
    // here would make the real builtin dispatch below unreachable.
    return runBuiltinJudge(judge.name, threadId);
  }
  return runTaskJudge(taskDir, workDir, threadId, judge);
};
|
||||
|
||||
Reference in New Issue
Block a user