8c26f16716
CI / check (pull_request) Successful in 1m45s
Implement 4 builtin judges for eval framework: - frontmatter-compliance: validates YAML frontmatter with $status field, score = stepsValid / stepsTotal - token-stats: aggregates Usage from step nodes, always score 1.0 (informational only) - upstream-consumption: LLM-as-judge stub (score 0, TODO) - hallucination: LLM-as-judge stub (score 0, TODO) Infrastructure: - judge/builtin/read-steps.ts — shell out to uwf step list - judge/builtin/types.ts — BuiltinJudge, BuiltinJudgeOutput - runner/collect.ts — dispatch builtin judges by name 9 new tests (frontmatter validation + token aggregation) Refs #71
54 lines
1.5 KiB
TypeScript
54 lines
1.5 KiB
TypeScript
import { createLogger } from "@united-workforce/util";
|
|
|
|
import { EVAL_JUDGE_TOKEN_STATS_SCHEMA } from "../../storage/index.js";
|
|
import { readThreadSteps } from "./read-steps.js";
|
|
import type { BuiltinJudgeOutput } from "./types.js";
|
|
|
|
// Module-level logger for this judge, created with a stderr sink.
const log = createLogger({ sink: { kind: "stderr" } });
|
|
|
|
// Tag passed as the first argument to `log` when the judge reports its totals.
// NOTE(review): presumably a unique, grep-able event id — confirm against the logger's conventions.
const LOG_RESULT = "T7KQ3M9P";
|
|
|
|
/**
 * Usage figures reported for a single step of a thread, as emitted in the
 * `perStep` array of the token-stats judge output.
 */
type PerStepStats = {
  /** Role of the step, copied from the step record. */
  role: string;
  /** Input tokens reported for the step; 0 when the step has no usage. */
  inputTokens: number;
  /** Output tokens reported for the step; 0 when the step has no usage. */
  outputTokens: number;
  /** Turn count reported for the step; 0 when the step has no usage. */
  turns: number;
  /**
   * Duration reported for the step; 0 when the step has no usage.
   * NOTE(review): unit is not visible from this file — confirm (ms vs s).
   */
  duration: number;
};
|
|
|
|
/**
 * Token-usage judge (informational only).
 *
 * Reads the steps of `threadId` through `readThreadSteps`, records one
 * `PerStepStats` row per step, and sums input tokens, output tokens and turns
 * over all rows. A step whose `usage` is `null` is recorded as an all-zero
 * row. Duration is reported per step only; no total is computed for it.
 *
 * One log entry tagged `LOG_RESULT` is written with the input/output totals.
 * The score is fixed at 1.0, so this judge reports usage without ever
 * penalizing a run.
 *
 * @param threadId - Thread whose steps are read.
 * @returns Score 1.0, the totals and per-step rows under `data`, and
 *   `EVAL_JUDGE_TOKEN_STATS_SCHEMA` as `schema`.
 */
export async function runTokenStatsJudge(threadId: string): Promise<BuiltinJudgeOutput> {
  const perStep: PerStepStats[] = [];
  const totals = { input: 0, output: 0, turns: 0 };

  for (const { usage, role } of readThreadSteps(threadId)) {
    // Missing usage is reported as zeros rather than skipped, so every step
    // still gets a row in the output.
    const row: PerStepStats =
      usage === null
        ? { role, inputTokens: 0, outputTokens: 0, turns: 0, duration: 0 }
        : {
            role,
            inputTokens: usage.inputTokens,
            outputTokens: usage.outputTokens,
            turns: usage.turns,
            duration: usage.duration,
          };

    totals.input += row.inputTokens;
    totals.output += row.outputTokens;
    totals.turns += row.turns;
    perStep.push(row);
  }

  log(LOG_RESULT, `token-stats thread=${threadId} in=${totals.input} out=${totals.output}`);

  return {
    score: 1.0,
    data: {
      totalInput: totals.input,
      totalOutput: totals.output,
      totalTurns: totals.turns,
      perStep,
    },
    schema: EVAL_JUDGE_TOKEN_STATS_SCHEMA,
  };
}
|