Files
united-workforce/packages/eval/src/judge/builtin/token-stats.ts
T
xiaoju 8c26f16716
CI / check (pull_request) Successful in 1m45s
feat: builtin judges — frontmatter + token-stats (deterministic) + upstream/hallucination (stubs)
Implement 4 builtin judges for eval framework:

- frontmatter-compliance: validates YAML frontmatter with $status field,
  score = stepsValid / stepsTotal
- token-stats: aggregates Usage from step nodes, always score 1.0
  (informational only)
- upstream-consumption: LLM-as-judge stub (score 0, TODO)
- hallucination: LLM-as-judge stub (score 0, TODO)

Infrastructure:
- judge/builtin/read-steps.ts — shell out to uwf step list
- judge/builtin/types.ts — BuiltinJudge, BuiltinJudgeOutput
- runner/collect.ts — dispatch builtin judges by name

9 new tests (frontmatter validation + token aggregation)

Refs #71
2026-06-05 00:09:06 +00:00

54 lines
1.5 KiB
TypeScript

import { createLogger } from "@united-workforce/util";
import { EVAL_JUDGE_TOKEN_STATS_SCHEMA } from "../../storage/index.js";
import { readThreadSteps } from "./read-steps.js";
import type { BuiltinJudgeOutput } from "./types.js";
// Module-level logger, created with a sink of kind "stderr".
const log = createLogger({ sink: { kind: "stderr" } });
// Fixed code passed as the first argument to `log` when the aggregated
// token totals are reported.
// NOTE(review): presumably an opaque log-event identifier — confirm against
// the logger's conventions in @united-workforce/util.
const LOG_RESULT = "T7KQ3M9P";
/**
 * Usage figures recorded for a single step; one entry per step is placed in
 * the `perStep` array of the token-stats judge output. Every numeric field is
 * 0 when the step's usage is null.
 */
type PerStepStats = {
// Role of the step, copied from `step.role`.
role: string;
// Input token count taken from the step's usage.
inputTokens: number;
// Output token count taken from the step's usage.
outputTokens: number;
// Turn count taken from the step's usage.
// NOTE(review): presumably the number of model turns — confirm.
turns: number;
// Duration taken from the step's usage.
// NOTE(review): unit is not visible in this file — confirm.
duration: number;
};
/**
 * Token-usage reporter for a thread. Visits every step returned by
 * `readThreadSteps`, accumulates input tokens, output tokens and turns into
 * running totals, and records a per-step breakdown (which also carries the
 * step's duration). A step whose usage is null is counted as all zeros.
 *
 * This judge is purely informational: the score is fixed at 1.0, so it never
 * marks a run down.
 *
 * @param threadId - Identifier of the thread whose steps are read.
 * @returns Score 1.0, the totals together with the per-step stats, and the
 *   token-stats schema.
 */
export async function runTokenStatsJudge(threadId: string): Promise<BuiltinJudgeOutput> {
  // Stand-in used for steps that carry no usage record.
  const noUsage = { inputTokens: 0, outputTokens: 0, turns: 0, duration: 0 };

  const totals = { totalInput: 0, totalOutput: 0, totalTurns: 0 };
  const perStep: PerStepStats[] = [];

  for (const step of readThreadSteps(threadId)) {
    const { usage } = step;
    const { inputTokens, outputTokens, turns, duration } = usage !== null ? usage : noUsage;

    totals.totalInput += inputTokens;
    totals.totalOutput += outputTokens;
    totals.totalTurns += turns;

    // Duration is reported per step only; it is not folded into the totals.
    perStep.push({ role: step.role, inputTokens, outputTokens, turns, duration });
  }

  const { totalInput, totalOutput } = totals;
  log(LOG_RESULT, `token-stats thread=${threadId} in=${totalInput} out=${totalOutput}`);

  return {
    score: 1.0,
    data: { ...totals, perStep },
    schema: EVAL_JUDGE_TOKEN_STATS_SCHEMA,
  };
}