From 99f40c2488ddbda1956e03b0eb9233e44bd3b809 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B0=8F=E6=A9=98?= <xiaoju@shazhou.work>
Date: Thu, 4 Jun 2026 15:41:07 +0000
Subject: [PATCH] feat: add $usage field to adapter protocol
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add Usage type to protocol (turns, inputTokens, outputTokens, duration)
- Add usage to StepRecord, StepNodePayload, StepEntry, STEP_NODE_SCHEMA
- Thread usage through util-agent extract pipeline (createAgent → persistStep → writeStepNode)
- All adapters return usage: null as placeholder (mock, hermes, claude-code, builtin)
- 746 tests pass, no breaking changes (usage not in schema required array)

Fixes #74
Refs #68
---
 .hermes/plans/2026-06-04-eval-framework.md    | 226 ++++++++++++++++++
 packages/agent-builtin/src/agent.ts           |   9 +-
 packages/agent-claude-code/src/claude-code.ts |   2 +-
 packages/agent-hermes/src/hermes.ts           |   4 +-
 packages/agent-mock/src/mock-agent.ts         |   1 +
 .../cli/src/__tests__/step-show-json.test.ts  |   1 +
 .../cli/src/__tests__/step-timing.test.ts     |   2 +
 packages/cli/src/commands/step.ts             |   1 +
 packages/protocol/src/__tests__/types.test.ts |   1 +
 packages/protocol/src/index.ts                |   1 +
 packages/protocol/src/schemas.ts              |  16 ++
 packages/protocol/src/types.ts                |  12 +
 packages/util-agent/src/context.ts            |   1 +
 packages/util-agent/src/run.ts                |   9 +-
 packages/util-agent/src/types.ts              |  10 +-
 15 files changed, 290 insertions(+), 6 deletions(-)
 create mode 100644 .hermes/plans/2026-06-04-eval-framework.md
diff --git a/.hermes/plans/2026-06-04-eval-framework.md b/.hermes/plans/2026-06-04-eval-framework.md
new file mode 100644
index 0000000..882be16
--- /dev/null
+++ b/.hermes/plans/2026-06-04-eval-framework.md
@@ -0,0 +1,226 @@
+# Eval Framework Implementation Plan
+
+## Goal
+
+Build `uwf-eval` CLI + eval task infrastructure for evaluating uwf workflow quality with real agents.
+
+## Architecture
+
+```
+uwf-eval (runner)          task package (npm)          OCAS (storage)
+  │                          │                           │
+  ├─ unpack tarball ───────► fixture/ → tmp cwd          │
+  ├─ read task.yaml          │                           │
+  ├─ uwf thread start/exec   │                           │
+  ├─ run judges ───────────► dist/judges/*.js            │
+  ├─ collect scores          │                           │
+  └─ store results ─────────────────────────────────────► CAS nodes + variables
+```
+
+### Key Design Decisions
+
+- **uwf-eval is NOT part of uwf** — separate package, shells out to uwf CLI
+- **Task = npm package** — fixture + task.yaml + judge scripts, distributable as tarball
+- **Judge = Node script** — `node <entry> <cwd> <thread-id>`, outputs `{score, data}` JSON
+- **Every output is OCAS typed** — eval-run, judge results all have registered schemas
+- **Builtin judges** — frontmatter compliance, upstream consumption, hallucination, token stats
+- **Task-specific judges** — bundled in the task package, custom schema per judge
+
+## Deliverables
+
+### Phase 1: Foundation (`@united-workforce/eval`)
+
+New package in the uwf monorepo.
+
+```
+packages/eval/
+  src/
+    cli.ts                    # uwf-eval entry point
+    commands/
+      run.ts                  # uwf-eval run
+      report.ts               # uwf-eval report <hash>
+      diff.ts                 # uwf-eval diff <hash> <hash>
+      list.ts                 # uwf-eval list
+    runner/
+      prepare.ts              # unpack tarball/dir → tmp cwd
+      execute.ts              # shell out to uwf thread start/exec
+      collect.ts              # run judges, collect scores
+    judge/
+      types.ts                # JudgeInput, JudgeOutput types
+      builtin/
+        frontmatter.ts        # frontmatter compliance check
+        upstream.ts           # upstream info consumption (LLM-as-judge)
+        hallucination.ts      # hallucination detection (LLM-as-judge)
+        token-stats.ts        # token usage from $usage field (#68)
+    storage/
+      schemas.ts              # OCAS schema definitions
+      store.ts                # CAS read/write helpers
+      index.ts                # variable indexing (@uwf/eval/*)
+    task/
+      types.ts                # TaskManifest type (task.yaml)
+      loader.ts               # parse task.yaml, validate
+  package.json
+  tsconfig.json
+```
+
+#### OCAS Schemas to Register
+
+1. `@uwf/eval-run` — full eval execution record
+   ```
+   { task, config: {agent, model, engineVersion}, threadId,
+     judges: [{name, score, weight, dataHash}], overall, timestamp }
+   ```
+
+2. `@uwf/eval-judge-frontmatter` — frontmatter judge data
+   ```
+   { stepsTotal, stepsValid, invalidSteps: [{stepIndex, role, errors: string[]}] }
+   ```
+
+3. `@uwf/eval-judge-upstream` — upstream consumption judge data
+   ```
+   { perStep: [{role, consumed: string[], missed: string[], score}] }
+   ```
+
+4. `@uwf/eval-judge-hallucination` — hallucination judge data
+   ```
+   { perStep: [{role, hallucinations: string[], score}] }
+   ```
+
+5. `@uwf/eval-judge-token-stats` — token stats (not scored, informational)
+   ```
+   { totalInput, totalOutput, totalTurns, perStep: [{role, input, output, turns, duration}] }
+   ```
+
+#### CLI Design
+
+```bash
+# Run eval
+uwf-eval run <task-dir-or-tarball> [--agent hermes] [--model claude-sonnet-4] [--count 20]
+
+# View results
+uwf-eval report <run-hash>        # render via ocas render
+uwf-eval diff <hash1> <hash2>     # side-by-side comparison
+uwf-eval list                     # list past runs
+```
+
+### Phase 2: Task Package Scaffold
+
+Template for creating eval tasks. Also serves as the first real task.
+
+```
+eval-tasks/                        # shazhou/uwf-eval-tasks monorepo
+  packages/
+    _template/                     # copy-paste template
+      package.json
+      task.yaml
+      fixture/
+      src/judges/
+      tsconfig.json
+    fix-off-by-one/                # first real task
+      package.json                 # @uwf-eval/fix-off-by-one
+      task.yaml
+      fixture/
+        src/calc.ts                # buggy calculator
+        src/calc.test.ts           # test that exposes the bug
+        package.json
+      src/judges/
+        test-pass.ts               # runs pnpm test, checks exit code
+        code-quality.ts            # LLM judge: minimal change, correct fix
+      schemas/
+        test-pass.json             # OCAS schema for test-pass data
+        code-quality.json          # OCAS schema for code-quality data
+      tsconfig.json
+  pnpm-workspace.yaml
+  tsconfig.json
+  biome.json
+```
+
+#### task.yaml Format
+
+```yaml
+name: fix-off-by-one
+description: Fix an off-by-one error in a calculator's add function
+workflow: solve-issue              # registered workflow name, or relative path to .yaml
+prompt: "Fix the bug: add(1,2) returns 4 instead of 3"
+limits:
+  maxSteps: 15
+  timeoutMinutes: 30
+judges:
+  - name: frontmatter-compliance
+    weight: 0.15
+    builtin: true
+  - name: upstream-consumption
+    weight: 0.15
+    builtin: true
+  - name: hallucination
+    weight: 0.1
+    builtin: true
+  - name: token-stats
+    weight: 0                      # informational, not scored
+    builtin: true
+  - name: test-pass
+    weight: 0.3
+    entry: dist/judges/test-pass.js
+    schema: schemas/test-pass.json
+  - name: code-quality
+    weight: 0.3
+    entry: dist/judges/code-quality.js
+    schema: schemas/code-quality.json
+```
+
+#### Judge Script Contract
+
+```typescript
+// Input: process.argv = [node, script, cwd, threadId]
+// Output: stdout JSON
+// Exit 0 = success, non-zero = judge error (not low score)
+
+import type { JudgeOutput } from "@united-workforce/eval";
+
+const result: JudgeOutput<TestPassData> = {
+  score: 1.0,      // 0.0 - 1.0
+  data: {           // typed per judge schema
+    command: "pnpm test",
+    exitCode: 0,
+    output: "3 tests passed"
+  }
+};
+
+console.log(JSON.stringify(result));
+```
+
+### Phase 3: Prerequisite — $usage in Adapter Protocol (#68)
+
+Blocked by #68. Token stats judge needs `$usage` in step nodes.
+
+Can proceed with Phase 1+2 without it — token-stats judge just returns zeros until adapters report usage.
+
+## Implementation Order
+
+1. **Phase 1a**: `@united-workforce/eval` package scaffold + CLI skeleton + OCAS schemas
+2. **Phase 1b**: `run` command — prepare, execute, collect flow
+3. **Phase 1c**: Builtin judges — frontmatter (deterministic), upstream + hallucination (LLM-as-judge)
+4. **Phase 2a**: Create `shazhou/uwf-eval-tasks` monorepo with proman
+5. **Phase 2b**: First task `fix-off-by-one` with fixture repo + 2 custom judges
+6. **Phase 2c**: End-to-end test: `uwf-eval run packages/fix-off-by-one --agent hermes`
+7. **Phase 1d**: `report`, `diff`, `list` commands (read from CAS, render via ocas render)
+
+## Dependencies
+
+- `@ocas/core` + `@ocas/fs` — CAS storage
+- `@united-workforce/protocol` — step node types
+- `commander` — CLI framework (consistent with uwf)
+- LLM API access — for LLM-as-judge (upstream, hallucination, task-specific quality judges)
+
+## Open Questions
+
+1. **LLM-as-judge provider config** — reuse uwf's `~/.uwf/config.yaml` provider settings? Or separate config?
+2. **Workflow file location** — task.yaml references a workflow. Should the workflow YAML be inside the tarball, or reference a registered workflow by name?
+3. **Non-coding tasks** — debate workflow has no fixture repo. Either task.yaml sets `fixture: null` or the package simply omits the `fixture/` dir; in both cases the runner creates an empty cwd.
+4. **Parallel judge execution** — judges are independent, can run in parallel. Worth the complexity?
+
+## Risks
+
+- LLM-as-judge consistency — the same input may get different scores. Possible mitigation: run each judge multiple times and average the scores, or simply accept the variance?
+- Token cost of judges — each LLM judge call costs tokens. A 10-step workflow with 2 LLM judges means 20 LLM calls just for judging. Acceptable?
+- Fixture repo drift — if the fixture evolves, old eval runs become non-comparable. Pin fixture version in task.yaml.
diff --git a/packages/agent-builtin/src/agent.ts b/packages/agent-builtin/src/agent.ts
index 6efa2ad..2854669 100644
--- a/packages/agent-builtin/src/agent.ts
+++ b/packages/agent-builtin/src/agent.ts
@@ -82,7 +82,13 @@ async function runBuiltinWithMessages(
 
   if (loopResult.turnCount === 0) {
     log("5RWTK9NB", "no turns produced, returning empty output");
-    return { output: "", detailHash: "", sessionId: session.sessionId, assembledPrompt: "" };
+    return {
+      output: "",
+      detailHash: "",
+      sessionId: session.sessionId,
+      assembledPrompt: "",
+      usage: null,
+    };
   }
 
   // Read jsonl → persist turns to CAS → store detail
@@ -99,6 +105,7 @@ async function runBuiltinWithMessages(
     detailHash,
     sessionId: session.sessionId,
     assembledPrompt: "",
+    usage: null,
   };
 }
 
diff --git a/packages/agent-claude-code/src/claude-code.ts b/packages/agent-claude-code/src/claude-code.ts
index c04b0f7..516e285 100644
--- a/packages/agent-claude-code/src/claude-code.ts
+++ b/packages/agent-claude-code/src/claude-code.ts
@@ -145,7 +145,7 @@ async function processClaudeOutput(
       );
     }
 
-    return { output, detailHash, sessionId, assembledPrompt };
+    return { output, detailHash, sessionId, assembledPrompt, usage: null };
   }
 
   // Truly unparseable output - provide enhanced error message
diff --git a/packages/agent-hermes/src/hermes.ts b/packages/agent-hermes/src/hermes.ts
index 8191cea..23c1a43 100644
--- a/packages/agent-hermes/src/hermes.ts
+++ b/packages/agent-hermes/src/hermes.ts
@@ -118,7 +118,7 @@ export function createHermesAgent(resumeDisabled: boolean): () => Promise<void>
       await setCachedSessionId(ctx.threadId, ctx.role, sessionId, ctx.storageRoot);
     }
 
-    return { output: text, detailHash, sessionId, assembledPrompt: fullPrompt };
+    return { output: text, detailHash, sessionId, assembledPrompt: fullPrompt, usage: null };
   }
 
   async function runHermes(ctx: AgentContext): Promise<AgentRunResult> {
@@ -149,7 +149,7 @@ export function createHermesAgent(resumeDisabled: boolean): () => Promise<void>
     // so the agent sees the full conversation history (crucial for retries).
     const { text, sessionId } = await client.prompt(message);
     const { detailHash } = await storePromptResult(store, sessionId);
-    return { output: text, detailHash, sessionId, assembledPrompt: "" };
+    return { output: text, detailHash, sessionId, assembledPrompt: "", usage: null };
   }
 
   const agentMain = createAgent({
diff --git a/packages/agent-mock/src/mock-agent.ts b/packages/agent-mock/src/mock-agent.ts
index 1044232..09713d5 100644
--- a/packages/agent-mock/src/mock-agent.ts
+++ b/packages/agent-mock/src/mock-agent.ts
@@ -103,6 +103,7 @@ export function createMockAgent(mockDataPath: string): () => Promise<void> {
       detailHash,
       sessionId,
       assembledPrompt: "",
+      usage: null,
     };
     lastResult = result;
     return result;
diff --git a/packages/cli/src/__tests__/step-show-json.test.ts b/packages/cli/src/__tests__/step-show-json.test.ts
index 21d446f..e7b6076 100644
--- a/packages/cli/src/__tests__/step-show-json.test.ts
+++ b/packages/cli/src/__tests__/step-show-json.test.ts
@@ -118,6 +118,7 @@ async function createTestStep(
     completedAtMs: Date.now() + 1000,
     assembledPrompt: null,
     cwd: "/tmp",
+    usage: null,
   };
   return store.cas.put(schemas.stepNode, stepPayload);
 }
diff --git a/packages/cli/src/__tests__/step-timing.test.ts b/packages/cli/src/__tests__/step-timing.test.ts
index 9d84c89..f18c3dc 100644
--- a/packages/cli/src/__tests__/step-timing.test.ts
+++ b/packages/cli/src/__tests__/step-timing.test.ts
@@ -96,6 +96,7 @@ describe("protocol types", () => {
       completedAtMs: 2000,
       assembledPrompt: null,
       cwd: "/test/path",
+      usage: null,
     };
     expect(record.startedAtMs).toBe(1000);
     expect(record.completedAtMs).toBe(2000);
@@ -110,6 +111,7 @@ describe("protocol types", () => {
       agent: "uwf-test",
       timestamp: 123,
       durationMs: 5000,
+      usage: null,
     };
     expect(entry.durationMs).toBe(5000);
   });
diff --git a/packages/cli/src/commands/step.ts b/packages/cli/src/commands/step.ts
index 6786d4f..12070b9 100644
--- a/packages/cli/src/commands/step.ts
+++ b/packages/cli/src/commands/step.ts
@@ -66,6 +66,7 @@ export async function cmdStepList(
       agent: item.payload.agent,
       timestamp: item.timestamp,
       durationMs: item.payload.completedAtMs - item.payload.startedAtMs,
+      usage: item.payload.usage ?? null,
     });
   }
 
diff --git a/packages/protocol/src/__tests__/types.test.ts b/packages/protocol/src/__tests__/types.test.ts
index ed8a1bc..3234e65 100644
--- a/packages/protocol/src/__tests__/types.test.ts
+++ b/packages/protocol/src/__tests__/types.test.ts
@@ -27,6 +27,7 @@ describe("Protocol types for thread/edge location", () => {
         completedAtMs: Date.now() + 1000,
         assembledPrompt: null,
         cwd: "/home/user/project",
+        usage: null,
       };
 
       expect(record.cwd).toBe("/home/user/project");
diff --git a/packages/protocol/src/index.ts b/packages/protocol/src/index.ts
index d62c170..bf4ac8c 100644
--- a/packages/protocol/src/index.ts
+++ b/packages/protocol/src/index.ts
@@ -44,6 +44,7 @@ export type {
   ThreadStatus,
   ThreadStepsOutput,
   ThreadsIndex,
+  Usage,
   WorkflowConfig,
   WorkflowName,
   WorkflowPayload,
diff --git a/packages/protocol/src/schemas.ts b/packages/protocol/src/schemas.ts
index 13c6e09..902e91b 100644
--- a/packages/protocol/src/schemas.ts
+++ b/packages/protocol/src/schemas.ts
@@ -91,6 +91,22 @@ export const STEP_NODE_SCHEMA: JSONSchema = {
     assembledPrompt: {
       anyOf: [{ type: "string", format: "ocas_ref" }, { type: "null" }],
     },
+    usage: {
+      anyOf: [
+        {
+          type: "object",
+          required: ["turns", "inputTokens", "outputTokens", "duration"],
+          properties: {
+            turns: { type: "integer" },
+            inputTokens: { type: "integer" },
+            outputTokens: { type: "integer" },
+            duration: { type: "number" },
+          },
+          additionalProperties: false,
+        },
+        { type: "null" },
+      ],
+    },
   },
   additionalProperties: false,
 };
diff --git a/packages/protocol/src/types.ts b/packages/protocol/src/types.ts
index 333eeac..9823b36 100644
--- a/packages/protocol/src/types.ts
+++ b/packages/protocol/src/types.ts
@@ -22,6 +22,17 @@ export type StepRecord = {
   cwd: string;
   /** CAS ref to the fully assembled prompt sent to the agent. null for legacy steps. */
   assembledPrompt: CasRef | null;
+  /** Token usage statistics reported by the agent adapter. null for legacy steps. */
+  usage: Usage | null;
+};
+
+/** Token usage statistics reported by agent adapters. */
+export type Usage = {
+  turns: number;
+  inputTokens: number;
+  outputTokens: number;
+  /** Wall-clock duration in seconds. */
+  duration: number;
 };
 
 // ── 4.2 Workflow 定义 ───────────────────────────────────────────────
@@ -131,6 +142,7 @@ export type StepEntry = {
   agent: string;
   timestamp: number;
   durationMs: number;
+  usage: Usage | null;
 };
 
 /** uwf thread steps — start entry */
diff --git a/packages/util-agent/src/context.ts b/packages/util-agent/src/context.ts
index 3103cdb..0c4f488 100644
--- a/packages/util-agent/src/context.ts
+++ b/packages/util-agent/src/context.ts
@@ -132,6 +132,7 @@ async function buildHistory(
       completedAtMs: step.completedAtMs,
       cwd: step.cwd ?? "",
       assembledPrompt: step.assembledPrompt ?? null,
+      usage: step.usage ?? null,
       content,
     });
   }
diff --git a/packages/util-agent/src/run.ts b/packages/util-agent/src/run.ts
index 0b8e951..9887b5f 100644
--- a/packages/util-agent/src/run.ts
+++ b/packages/util-agent/src/run.ts
@@ -1,5 +1,5 @@
 import { getSchema, validate } from "@ocas/core";
-import type { CasRef, StepNodePayload, ThreadId } from "@united-workforce/protocol";
+import type { CasRef, StepNodePayload, ThreadId, Usage } from "@united-workforce/protocol";
 import { config as loadDotenv } from "dotenv";
 import { buildOutputFormatInstruction } from "./build-output-format-instruction.js";
 import { buildContextWithMeta } from "./context.js";
@@ -65,6 +65,7 @@ async function writeStepNode(options: {
   startedAtMs: number;
   completedAtMs: number;
   assembledPromptHash: CasRef | null;
+  usage: Usage | null;
 }): Promise<CasRef> {
   const payload: StepNodePayload = {
     start: options.startHash,
@@ -78,6 +79,7 @@ async function writeStepNode(options: {
     completedAtMs: options.completedAtMs,
     cwd: process.cwd(),
     assembledPrompt: options.assembledPromptHash,
+    usage: options.usage,
   };
   const hash = await options.store.cas.put(options.schemas.stepNode, payload);
   const node = options.store.cas.get(hash);
@@ -117,6 +119,7 @@ async function persistStep(options: {
   startedAtMs: number;
   completedAtMs: number;
   assembledPromptHash: CasRef | null;
+  usage: Usage | null;
 }): Promise<CasRef> {
   const { store, schemas, chain, headHash } = options.ctx.meta;
   return writeStepNode({
@@ -132,6 +135,7 @@ async function persistStep(options: {
     startedAtMs: options.startedAtMs,
     completedAtMs: options.completedAtMs,
     assembledPromptHash: options.assembledPromptHash,
+    usage: options.usage,
   });
 }
 
@@ -200,6 +204,7 @@ export function createAgent(options: AgentOptions): () => Promise<void> {
       );
     }
     const completedAtMs = Date.now();
+    const usage = agentResult.usage;
 
     // Store the assembled prompt in CAS for later inspection via `step read --prompt`
     const promptText = agentResult.assembledPrompt;
@@ -220,6 +225,7 @@ export function createAgent(options: AgentOptions): () => Promise<void> {
       startedAtMs,
       completedAtMs,
       assembledPromptHash,
+      usage,
     });
 
     const adapterOutput: AdapterOutput = {
@@ -230,6 +236,7 @@ export function createAgent(options: AgentOptions): () => Promise<void> {
       body: extracted.body,
       startedAtMs,
       completedAtMs,
+      usage,
     };
     process.stdout.write(`${JSON.stringify(adapterOutput)}\n`);
   };
diff --git a/packages/util-agent/src/types.ts b/packages/util-agent/src/types.ts
index 103f19d..bd3e823 100644
--- a/packages/util-agent/src/types.ts
+++ b/packages/util-agent/src/types.ts
@@ -1,5 +1,10 @@
 import type { Store } from "@ocas/core";
-import type { ModeratorContext, ThreadId, WorkflowPayload } from "@united-workforce/protocol";
+import type {
+  ModeratorContext,
+  ThreadId,
+  Usage,
+  WorkflowPayload,
+} from "@united-workforce/protocol";
 
 export type AgentContext = ModeratorContext & {
   threadId: ThreadId;
@@ -33,6 +38,8 @@ export type AgentRunResult = {
   sessionId: string;
   /** The fully assembled prompt that was sent to the agent. */
   assembledPrompt: string;
+  /** Token usage statistics for this run. null when the adapter does not report usage. */
+  usage: Usage | null;
 };
 
 export type AgentContinueFn = (
@@ -51,6 +58,7 @@ export type AdapterOutput = {
   body: string;
   startedAtMs: number;
   completedAtMs: number;
+  usage: Usage | null;
 };
 
 export type AgentOptions = {