From 99f40c2488ddbda1956e03b0eb9233e44bd3b809 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=8F=E6=A9=98?= Date: Thu, 4 Jun 2026 15:41:07 +0000 Subject: [PATCH] feat: add $usage field to adapter protocol MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add Usage type to protocol (turns, inputTokens, outputTokens, duration) - Add usage to StepRecord, StepNodePayload, StepEntry, STEP_NODE_SCHEMA - Thread usage through util-agent extract pipeline (writeStepNode → persistStep → createAgent) - All adapters return usage: null as placeholder (mock, hermes, claude-code, builtin) - 746 tests pass, no breaking changes (usage not in schema required array) Fixes #74 Refs #68 --- .hermes/plans/2026-06-04-eval-framework.md | 226 ++++++++++++++++++ packages/agent-builtin/src/agent.ts | 9 +- packages/agent-claude-code/src/claude-code.ts | 2 +- packages/agent-hermes/src/hermes.ts | 4 +- packages/agent-mock/src/mock-agent.ts | 1 + .../cli/src/__tests__/step-show-json.test.ts | 1 + .../cli/src/__tests__/step-timing.test.ts | 2 + packages/cli/src/commands/step.ts | 1 + packages/protocol/src/__tests__/types.test.ts | 1 + packages/protocol/src/index.ts | 1 + packages/protocol/src/schemas.ts | 16 ++ packages/protocol/src/types.ts | 12 + packages/util-agent/src/context.ts | 1 + packages/util-agent/src/run.ts | 9 +- packages/util-agent/src/types.ts | 10 +- 15 files changed, 290 insertions(+), 6 deletions(-) create mode 100644 .hermes/plans/2026-06-04-eval-framework.md diff --git a/.hermes/plans/2026-06-04-eval-framework.md b/.hermes/plans/2026-06-04-eval-framework.md new file mode 100644 index 0000000..882be16 --- /dev/null +++ b/.hermes/plans/2026-06-04-eval-framework.md @@ -0,0 +1,226 @@ +# Eval Framework Implementation Plan + +## Goal + +Build `uwf-eval` CLI + eval task infrastructure for evaluating uwf workflow quality with real agents. 
+ +## Architecture + +``` +uwf-eval (runner) task package (npm) OCAS (storage) + │ │ │ + ├─ unpack tarball ───────► fixture/ → tmp cwd │ + ├─ read task.yaml │ │ + ├─ uwf thread start/exec │ │ + ├─ run judges ───────────► dist/judges/*.js │ + ├─ collect scores │ │ + └─ store results ─────────────────────────────────────► CAS nodes + variables +``` + +### Key Design Decisions + +- **uwf-eval is NOT part of uwf** — separate package, shells out to uwf CLI +- **Task = npm package** — fixture + task.yaml + judge scripts, distributable as tarball +- **Judge = Node script** — `node <judge-script> <cwd> <threadId>`, outputs `{score, data}` JSON +- **Every output is OCAS typed** — eval-run, judge results all have registered schemas +- **Builtin judges** — frontmatter compliance, upstream consumption, hallucination, token stats +- **Task-specific judges** — bundled in the task package, custom schema per judge + +## Deliverables + +### Phase 1: Foundation (`@united-workforce/eval`) + +New package in the uwf monorepo. + +``` +packages/eval/ + src/ + cli.ts # uwf-eval entry point + commands/ + run.ts # uwf-eval run + report.ts # uwf-eval report + diff.ts # uwf-eval diff + list.ts # uwf-eval list + runner/ + prepare.ts # unpack tarball/dir → tmp cwd + execute.ts # shell out to uwf thread start/exec + collect.ts # run judges, collect scores + judge/ + types.ts # JudgeInput, JudgeOutput types + builtin/ + frontmatter.ts # frontmatter compliance check + upstream.ts # upstream info consumption (LLM-as-judge) + hallucination.ts # hallucination detection (LLM-as-judge) + token-stats.ts # token usage from $usage field (#68) + storage/ + schemas.ts # OCAS schema definitions + store.ts # CAS read/write helpers + index.ts # variable indexing (@uwf/eval/*) + task/ + types.ts # TaskManifest type (task.yaml) + loader.ts # parse task.yaml, validate + package.json + tsconfig.json +``` + +#### OCAS Schemas to Register + +1. 
`@uwf/eval-run` — full eval execution record + ``` + { task, config: {agent, model, engineVersion}, threadId, + judges: [{name, score, weight, dataHash}], overall, timestamp } + ``` + +2. `@uwf/eval-judge-frontmatter` — frontmatter judge data + ``` + { stepsTotal, stepsValid, invalidSteps: [{stepIndex, role, errors: string[]}] } + ``` + +3. `@uwf/eval-judge-upstream` — upstream consumption judge data + ``` + { perStep: [{role, consumed: string[], missed: string[], score}] } + ``` + +4. `@uwf/eval-judge-hallucination` — hallucination judge data + ``` + { perStep: [{role, hallucinations: string[], score}] } + ``` + +5. `@uwf/eval-judge-token-stats` — token stats (not scored, informational) + ``` + { totalInput, totalOutput, totalTurns, perStep: [{role, input, output, turns, duration}] } + ``` + +#### CLI Design + +```bash +# Run eval +uwf-eval run <task> [--agent hermes] [--model claude-sonnet-4] [--count 20] + +# View results +uwf-eval report <run> # render via ocas render +uwf-eval diff <run-a> <run-b> # side-by-side comparison +uwf-eval list # list past runs +``` + +### Phase 2: Task Package Scaffold + +Template for creating eval tasks. Also serves as the first real task. 
+ +``` +eval-tasks/ # shazhou/uwf-eval-tasks monorepo + packages/ + _template/ # copypaste template + package.json + task.yaml + fixture/ + src/judges/ + tsconfig.json + fix-off-by-one/ # first real task + package.json # @uwf-eval/fix-off-by-one + task.yaml + fixture/ + src/calc.ts # buggy calculator + src/calc.test.ts # test that exposes the bug + package.json + src/judges/ + test-pass.ts # runs pnpm test, checks exit code + code-quality.ts # LLM judge: minimal change, correct fix + schemas/ + test-pass.json # OCAS schema for test-pass data + code-quality.json # OCAS schema for code-quality data + tsconfig.json + pnpm-workspace.yaml + tsconfig.json + biome.json +``` + +#### task.yaml Format + +```yaml +name: fix-off-by-one +description: Fix an off-by-one error in a calculator's add function +workflow: solve-issue # registered workflow name, or relative path to .yaml +prompt: "Fix the bug: add(1,2) returns 4 instead of 3" +limits: + maxSteps: 15 + timeoutMinutes: 30 +judges: + - name: frontmatter-compliance + weight: 0.15 + builtin: true + - name: upstream-consumption + weight: 0.15 + builtin: true + - name: hallucination + weight: 0.1 + builtin: true + - name: token-stats + weight: 0 # informational, not scored + builtin: true + - name: test-pass + weight: 0.3 + entry: dist/judges/test-pass.js + schema: schemas/test-pass.json + - name: code-quality + weight: 0.3 + entry: dist/judges/code-quality.js + schema: schemas/code-quality.json +``` + +#### Judge Script Contract + +```typescript +// Input: process.argv = [node, script, cwd, threadId] +// Output: stdout JSON +// Exit 0 = success, non-zero = judge error (not low score) + +import type { JudgeOutput } from "@united-workforce/eval"; + +const result: JudgeOutput = { + score: 1.0, // 0.0 - 1.0 + data: { // typed per judge schema + command: "pnpm test", + exitCode: 0, + output: "3 tests passed" + } +}; + +console.log(JSON.stringify(result)); +``` + +### Phase 3: Prerequisite — $usage in Adapter Protocol (#68) + 
+Blocked by #68: the token-stats judge needs `$usage` in step nodes. + +Phases 1 and 2 can proceed without it — the token-stats judge just returns zeros until adapters report usage. + +## Implementation Order + +1. **Phase 1a**: `@united-workforce/eval` package scaffold + CLI skeleton + OCAS schemas +2. **Phase 1b**: `run` command — prepare, execute, collect flow +3. **Phase 1c**: Builtin judges — frontmatter (deterministic), upstream + hallucination (LLM-as-judge) +4. **Phase 2a**: Create `shazhou/uwf-eval-tasks` monorepo with proman +5. **Phase 2b**: First task `fix-off-by-one` with fixture repo + 2 custom judges +6. **Phase 2c**: End-to-end test: `uwf-eval run packages/fix-off-by-one --agent hermes` +7. **Phase 1d**: `report`, `diff`, `list` commands (read from CAS, render via ocas render) + +## Dependencies + +- `@ocas/core` + `@ocas/fs` — CAS storage +- `@united-workforce/protocol` — step node types +- `commander` — CLI framework (consistent with uwf) +- LLM API access — for LLM-as-judge (upstream, hallucination, task-specific quality judges) + +## Open Questions + +1. **LLM-as-judge provider config** — reuse uwf's `~/.uwf/config.yaml` provider settings, or use a separate config? +2. **Workflow file location** — task.yaml references a workflow. Should the workflow YAML be inside the tarball, or should task.yaml reference a registered workflow by name? +3. **Non-coding tasks** — the debate workflow has no fixture repo. task.yaml needs `fixture: null`, or the task simply omits the `fixture/` dir; the runner then creates an empty cwd. +4. **Parallel judge execution** — judges are independent, so they can run in parallel. Is it worth the complexity? + +## Risks + +- LLM-as-judge consistency — the same input may get different scores. Mitigation: run each judge multiple times and take the average? Or accept the variance. +- Token cost of judges — each LLM judge call costs tokens. A 10-step workflow with 2 LLM judges means 20 LLM calls just for judging. Acceptable? +- Fixture repo drift — if the fixture evolves, old eval runs become non-comparable. 
Pin fixture version in task.yaml. diff --git a/packages/agent-builtin/src/agent.ts b/packages/agent-builtin/src/agent.ts index 6efa2ad..2854669 100644 --- a/packages/agent-builtin/src/agent.ts +++ b/packages/agent-builtin/src/agent.ts @@ -82,7 +82,13 @@ async function runBuiltinWithMessages( if (loopResult.turnCount === 0) { log("5RWTK9NB", "no turns produced, returning empty output"); - return { output: "", detailHash: "", sessionId: session.sessionId, assembledPrompt: "" }; + return { + output: "", + detailHash: "", + sessionId: session.sessionId, + assembledPrompt: "", + usage: null, + }; } // Read jsonl → persist turns to CAS → store detail @@ -99,6 +105,7 @@ async function runBuiltinWithMessages( detailHash, sessionId: session.sessionId, assembledPrompt: "", + usage: null, }; } diff --git a/packages/agent-claude-code/src/claude-code.ts b/packages/agent-claude-code/src/claude-code.ts index c04b0f7..516e285 100644 --- a/packages/agent-claude-code/src/claude-code.ts +++ b/packages/agent-claude-code/src/claude-code.ts @@ -145,7 +145,7 @@ async function processClaudeOutput( ); } - return { output, detailHash, sessionId, assembledPrompt }; + return { output, detailHash, sessionId, assembledPrompt, usage: null }; } // Truly unparseable output - provide enhanced error message diff --git a/packages/agent-hermes/src/hermes.ts b/packages/agent-hermes/src/hermes.ts index 8191cea..23c1a43 100644 --- a/packages/agent-hermes/src/hermes.ts +++ b/packages/agent-hermes/src/hermes.ts @@ -118,7 +118,7 @@ export function createHermesAgent(resumeDisabled: boolean): () => Promise await setCachedSessionId(ctx.threadId, ctx.role, sessionId, ctx.storageRoot); } - return { output: text, detailHash, sessionId, assembledPrompt: fullPrompt }; + return { output: text, detailHash, sessionId, assembledPrompt: fullPrompt, usage: null }; } async function runHermes(ctx: AgentContext): Promise { @@ -149,7 +149,7 @@ export function createHermesAgent(resumeDisabled: boolean): () => Promise // so 
the agent sees the full conversation history (crucial for retries). const { text, sessionId } = await client.prompt(message); const { detailHash } = await storePromptResult(store, sessionId); - return { output: text, detailHash, sessionId, assembledPrompt: "" }; + return { output: text, detailHash, sessionId, assembledPrompt: "", usage: null }; } const agentMain = createAgent({ diff --git a/packages/agent-mock/src/mock-agent.ts b/packages/agent-mock/src/mock-agent.ts index 1044232..09713d5 100644 --- a/packages/agent-mock/src/mock-agent.ts +++ b/packages/agent-mock/src/mock-agent.ts @@ -103,6 +103,7 @@ export function createMockAgent(mockDataPath: string): () => Promise { detailHash, sessionId, assembledPrompt: "", + usage: null, }; lastResult = result; return result; diff --git a/packages/cli/src/__tests__/step-show-json.test.ts b/packages/cli/src/__tests__/step-show-json.test.ts index 21d446f..e7b6076 100644 --- a/packages/cli/src/__tests__/step-show-json.test.ts +++ b/packages/cli/src/__tests__/step-show-json.test.ts @@ -118,6 +118,7 @@ async function createTestStep( completedAtMs: Date.now() + 1000, assembledPrompt: null, cwd: "/tmp", + usage: null, }; return store.cas.put(schemas.stepNode, stepPayload); } diff --git a/packages/cli/src/__tests__/step-timing.test.ts b/packages/cli/src/__tests__/step-timing.test.ts index 9d84c89..f18c3dc 100644 --- a/packages/cli/src/__tests__/step-timing.test.ts +++ b/packages/cli/src/__tests__/step-timing.test.ts @@ -96,6 +96,7 @@ describe("protocol types", () => { completedAtMs: 2000, assembledPrompt: null, cwd: "/test/path", + usage: null, }; expect(record.startedAtMs).toBe(1000); expect(record.completedAtMs).toBe(2000); @@ -110,6 +111,7 @@ describe("protocol types", () => { agent: "uwf-test", timestamp: 123, durationMs: 5000, + usage: null, }; expect(entry.durationMs).toBe(5000); }); diff --git a/packages/cli/src/commands/step.ts b/packages/cli/src/commands/step.ts index 6786d4f..12070b9 100644 --- 
a/packages/cli/src/commands/step.ts +++ b/packages/cli/src/commands/step.ts @@ -66,6 +66,7 @@ export async function cmdStepList( agent: item.payload.agent, timestamp: item.timestamp, durationMs: item.payload.completedAtMs - item.payload.startedAtMs, + usage: item.payload.usage ?? null, }); } diff --git a/packages/protocol/src/__tests__/types.test.ts b/packages/protocol/src/__tests__/types.test.ts index ed8a1bc..3234e65 100644 --- a/packages/protocol/src/__tests__/types.test.ts +++ b/packages/protocol/src/__tests__/types.test.ts @@ -27,6 +27,7 @@ describe("Protocol types for thread/edge location", () => { completedAtMs: Date.now() + 1000, assembledPrompt: null, cwd: "/home/user/project", + usage: null, }; expect(record.cwd).toBe("/home/user/project"); diff --git a/packages/protocol/src/index.ts b/packages/protocol/src/index.ts index d62c170..bf4ac8c 100644 --- a/packages/protocol/src/index.ts +++ b/packages/protocol/src/index.ts @@ -44,6 +44,7 @@ export type { ThreadStatus, ThreadStepsOutput, ThreadsIndex, + Usage, WorkflowConfig, WorkflowName, WorkflowPayload, diff --git a/packages/protocol/src/schemas.ts b/packages/protocol/src/schemas.ts index 13c6e09..902e91b 100644 --- a/packages/protocol/src/schemas.ts +++ b/packages/protocol/src/schemas.ts @@ -91,6 +91,22 @@ export const STEP_NODE_SCHEMA: JSONSchema = { assembledPrompt: { anyOf: [{ type: "string", format: "ocas_ref" }, { type: "null" }], }, + usage: { + anyOf: [ + { + type: "object", + required: ["turns", "inputTokens", "outputTokens", "duration"], + properties: { + turns: { type: "integer" }, + inputTokens: { type: "integer" }, + outputTokens: { type: "integer" }, + duration: { type: "number" }, + }, + additionalProperties: false, + }, + { type: "null" }, + ], + }, }, additionalProperties: false, }; diff --git a/packages/protocol/src/types.ts b/packages/protocol/src/types.ts index 333eeac..9823b36 100644 --- a/packages/protocol/src/types.ts +++ b/packages/protocol/src/types.ts @@ -22,6 +22,17 @@ export type 
StepRecord = { cwd: string; /** CAS ref to the fully assembled prompt sent to the agent. null for legacy steps. */ assembledPrompt: CasRef | null; + /** Token usage statistics reported by the agent adapter. null for legacy steps. */ + usage: Usage | null; +}; + +/** Token usage statistics reported by agent adapters. */ +export type Usage = { + turns: number; + inputTokens: number; + outputTokens: number; + /** Wall-clock duration in seconds. */ + duration: number; }; // ── 4.2 Workflow 定义 ─────────────────────────────────────────────── @@ -131,6 +142,7 @@ export type StepEntry = { agent: string; timestamp: number; durationMs: number; + usage: Usage | null; }; /** uwf thread steps — start entry */ diff --git a/packages/util-agent/src/context.ts b/packages/util-agent/src/context.ts index 3103cdb..0c4f488 100644 --- a/packages/util-agent/src/context.ts +++ b/packages/util-agent/src/context.ts @@ -132,6 +132,7 @@ async function buildHistory( completedAtMs: step.completedAtMs, cwd: step.cwd ?? "", assembledPrompt: step.assembledPrompt ?? null, + usage: step.usage ?? 
null, content, }); } diff --git a/packages/util-agent/src/run.ts b/packages/util-agent/src/run.ts index 0b8e951..9887b5f 100644 --- a/packages/util-agent/src/run.ts +++ b/packages/util-agent/src/run.ts @@ -1,5 +1,5 @@ import { getSchema, validate } from "@ocas/core"; -import type { CasRef, StepNodePayload, ThreadId } from "@united-workforce/protocol"; +import type { CasRef, StepNodePayload, ThreadId, Usage } from "@united-workforce/protocol"; import { config as loadDotenv } from "dotenv"; import { buildOutputFormatInstruction } from "./build-output-format-instruction.js"; import { buildContextWithMeta } from "./context.js"; @@ -65,6 +65,7 @@ async function writeStepNode(options: { startedAtMs: number; completedAtMs: number; assembledPromptHash: CasRef | null; + usage: Usage | null; }): Promise { const payload: StepNodePayload = { start: options.startHash, @@ -78,6 +79,7 @@ async function writeStepNode(options: { completedAtMs: options.completedAtMs, cwd: process.cwd(), assembledPrompt: options.assembledPromptHash, + usage: options.usage, }; const hash = await options.store.cas.put(options.schemas.stepNode, payload); const node = options.store.cas.get(hash); @@ -117,6 +119,7 @@ async function persistStep(options: { startedAtMs: number; completedAtMs: number; assembledPromptHash: CasRef | null; + usage: Usage | null; }): Promise { const { store, schemas, chain, headHash } = options.ctx.meta; return writeStepNode({ @@ -132,6 +135,7 @@ async function persistStep(options: { startedAtMs: options.startedAtMs, completedAtMs: options.completedAtMs, assembledPromptHash: options.assembledPromptHash, + usage: options.usage, }); } @@ -200,6 +204,7 @@ export function createAgent(options: AgentOptions): () => Promise { ); } const completedAtMs = Date.now(); + const usage = agentResult.usage; // Store the assembled prompt in CAS for later inspection via `step read --prompt` const promptText = agentResult.assembledPrompt; @@ -220,6 +225,7 @@ export function createAgent(options: 
AgentOptions): () => Promise { startedAtMs, completedAtMs, assembledPromptHash, + usage, }); const adapterOutput: AdapterOutput = { @@ -230,6 +236,7 @@ export function createAgent(options: AgentOptions): () => Promise { body: extracted.body, startedAtMs, completedAtMs, + usage, }; process.stdout.write(`${JSON.stringify(adapterOutput)}\n`); }; diff --git a/packages/util-agent/src/types.ts b/packages/util-agent/src/types.ts index 103f19d..bd3e823 100644 --- a/packages/util-agent/src/types.ts +++ b/packages/util-agent/src/types.ts @@ -1,5 +1,10 @@ import type { Store } from "@ocas/core"; -import type { ModeratorContext, ThreadId, WorkflowPayload } from "@united-workforce/protocol"; +import type { + ModeratorContext, + ThreadId, + Usage, + WorkflowPayload, +} from "@united-workforce/protocol"; export type AgentContext = ModeratorContext & { threadId: ThreadId; @@ -33,6 +38,8 @@ export type AgentRunResult = { sessionId: string; /** The fully assembled prompt that was sent to the agent. */ assembledPrompt: string; + /** Token usage statistics for this run. null when the adapter does not report usage. */ + usage: Usage | null; }; export type AgentContinueFn = ( @@ -51,6 +58,7 @@ export type AdapterOutput = { body: string; startedAtMs: number; completedAtMs: number; + usage: Usage | null; }; export type AgentOptions = {