8c26f16716
CI / check (pull_request) Successful in 1m45s
Implement 4 builtin judges for eval framework: - frontmatter-compliance: validates YAML frontmatter with $status field, score = stepsValid / stepsTotal - token-stats: aggregates Usage from step nodes, always score 1.0 (informational only) - upstream-consumption: LLM-as-judge stub (score 0, TODO) - hallucination: LLM-as-judge stub (score 0, TODO) Infrastructure: - judge/builtin/read-steps.ts — shell out to uwf step list - judge/builtin/types.ts — BuiltinJudge, BuiltinJudgeOutput - runner/collect.ts — dispatch builtin judges by name 9 new tests (frontmatter validation + token aggregation) Refs #71
18 lines
605 B
TypeScript
18 lines
605 B
TypeScript
import { EVAL_JUDGE_HALLUCINATION_SCHEMA } from "../../storage/index.js";
|
|
import type { BuiltinJudgeOutput } from "./types.js";
|
|
|
|
/**
 * Built-in "hallucination" judge (LLM-as-judge).
 *
 * Intended purpose: flag claims in each step's output that are not grounded
 * in the context available to that step.
 *
 * TODO: LLM-as-judge — requires provider config before the LLM API can be
 * called. Until that call path is wired up, this resolves to a fixed
 * placeholder result: a score of 0 and an empty `perStep` list.
 *
 * @param _threadId - Thread identifier; not read by the current stub.
 * @returns The placeholder judge output, tagged with
 *   `EVAL_JUDGE_HALLUCINATION_SCHEMA`.
 */
export async function runHallucinationJudge(_threadId: string): Promise<BuiltinJudgeOutput> {
  // Placeholder verdict: nothing is evaluated yet, so there are no per-step entries.
  const placeholder: BuiltinJudgeOutput = {
    score: 0,
    data: { perStep: [] },
    schema: EVAL_JUDGE_HALLUCINATION_SCHEMA,
  };

  return placeholder;
}
|