8c26f16716
CI / check (pull_request) Successful in 1m45s
Implement 4 builtin judges for eval framework: - frontmatter-compliance: validates YAML frontmatter with $status field, score = stepsValid / stepsTotal - token-stats: aggregates Usage from step nodes, always score 1.0 (informational only) - upstream-consumption: LLM-as-judge stub (score 0, TODO) - hallucination: LLM-as-judge stub (score 0, TODO) Infrastructure: - judge/builtin/read-steps.ts — shell out to uwf step list - judge/builtin/types.ts — BuiltinJudge, BuiltinJudgeOutput - runner/collect.ts — dispatch builtin judges by name 9 new tests (frontmatter validation + token aggregation) Refs #71
15 lines
606 B
TypeScript
15 lines
606 B
TypeScript
import { execFileSync } from "node:child_process";
|
|
|
|
import type { StepEntry, ThreadStepsOutput } from "@united-workforce/protocol";
|
|
|
|
/**
 * Shell out to `uwf step list <threadId>` and return the parsed step entries.
 *
 * The command's stdout is expected to be a JSON object with a `steps` array
 * whose first element is the StartEntry; that element is dropped and only the
 * remaining StepEntry records are returned.
 *
 * @param threadId - Thread identifier passed verbatim as the last CLI argument.
 * @returns The step entries after the start entry (empty when there are none).
 * @throws SyntaxError when stdout is not valid JSON.
 * @throws TypeError when the JSON payload has no `steps` array.
 * @throws Error propagated from `execFileSync` when the `uwf` process cannot be
 *   run successfully.
 */
export function readThreadSteps(threadId: string): StepEntry[] {
  const stdout = execFileSync("uwf", ["step", "list", threadId], {
    encoding: "utf8",
    // stdin is not needed; stdout/stderr are captured instead of inherited.
    stdio: ["ignore", "pipe", "pipe"],
  }).trim();

  // Parse into `unknown` first: the CLI output is external input and must be
  // checked before it is treated as ThreadStepsOutput.
  let parsed: unknown;
  try {
    parsed = JSON.parse(stdout);
  } catch (err: unknown) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new SyntaxError(
      `uwf step list ${threadId}: output is not valid JSON (${reason})`,
    );
  }

  const steps = (parsed as Partial<ThreadStepsOutput> | null)?.steps;
  if (!Array.isArray(steps)) {
    throw new TypeError(
      `uwf step list ${threadId}: expected a JSON object with a "steps" array`,
    );
  }

  // steps[0] is the StartEntry; the rest are StepEntry records.
  return steps.slice(1) as StepEntry[];
}
|