2026-06-04 22:14:13 +00:00
15 changed files with 290 additions and 6 deletions
@@ -0,0 +1,226 @@
+# Eval Framework Implementation Plan
+
+## Goal
+
+Build `uwf-eval` CLI + eval task infrastructure for evaluating uwf workflow quality with real agents.
+
+## Architecture
+
+```
+uwf-eval (runner)          task package (npm)          OCAS (storage)
+  │                          │                           │
+  ├─ unpack tarball ───────► fixture/ → tmp cwd          │
+  ├─ read task.yaml          │                           │
+  ├─ uwf thread start/exec  │                           │
+  ├─ run judges ───────────► dist/judges/*.js            │
+  ├─ collect scores          │                           │
+  └─ store results ─────────────────────────────────────► CAS nodes + variables
+```
+
+### Key Design Decisions
+
+- **uwf-eval is NOT part of uwf** — separate package, shells out to uwf CLI
+- **Task = npm package** — fixture + task.yaml + judge scripts, distributable as tarball
+- **Judge = Node script** — `node <entry> <cwd> <thread-id>`, outputs `{score, data}` JSON
+- **Every output is OCAS typed** — eval-run, judge results all have registered schemas
+- **Builtin judges** — frontmatter compliance, upstream consumption, hallucination, token stats
+- **Task-specific judges** — bundled in the task package, custom schema per judge
+
+## Deliverables
+
+### Phase 1: Foundation (`@united-workforce/eval`)
+
+New package in the uwf monorepo.
+
+```
+packages/eval/
+  src/
+    cli.ts                    # uwf-eval entry point
+    commands/
+      run.ts                  # uwf-eval run
+      report.ts               # uwf-eval report <hash>
+      diff.ts                 # uwf-eval diff <hash> <hash>
+      list.ts                 # uwf-eval list
+    runner/
+      prepare.ts              # unpack tarball/dir → tmp cwd
+      execute.ts              # shell out to uwf thread start/exec
+      collect.ts              # run judges, collect scores
+    judge/
+      types.ts                # JudgeInput, JudgeOutput types
+      builtin/
+        frontmatter.ts        # frontmatter compliance check
+        upstream.ts           # upstream info consumption (LLM-as-judge)
+        hallucination.ts      # hallucination detection (LLM-as-judge)
+        token-stats.ts        # token usage from $usage field (#68)
+    storage/
+      schemas.ts              # OCAS schema definitions
+      store.ts                # CAS read/write helpers
+      index.ts                # variable indexing (@uwf/eval/*)
+    task/
+      types.ts                # TaskManifest type (task.yaml)
+      loader.ts               # parse task.yaml, validate
+  package.json
+  tsconfig.json
+```
+
+#### OCAS Schemas to Register
+
+1. `@uwf/eval-run` — full eval execution record
+   ```
+   { task, config: {agent, model, engineVersion}, threadId,
+     judges: [{name, score, weight, dataHash}], overall, timestamp }
+   ```
+
+2. `@uwf/eval-judge-frontmatter` — frontmatter judge data
+   ```
+   { stepsTotal, stepsValid, invalidSteps: [{stepIndex, role, errors: string[]}] }
+   ```
+
+3. `@uwf/eval-judge-upstream` — upstream consumption judge data
+   ```
+   { perStep: [{role, consumed: string[], missed: string[], score}] }
+   ```
+
+4. `@uwf/eval-judge-hallucination` — hallucination judge data
+   ```
+   { perStep: [{role, hallucinations: string[], score}] }
+   ```
+
+5. `@uwf/eval-judge-token-stats` — token stats (not scored, informational)
+   ```
+   { totalInput, totalOutput, totalTurns, perStep: [{role, input, output, turns, duration}] }
+   ```
+
+#### CLI Design
+
+```bash
+# Run eval
+uwf-eval run <task-dir-or-tarball> [--agent hermes] [--model claude-sonnet-4] [--count 20]
+
+# View results
+uwf-eval report <run-hash>        # render via ocas render
+uwf-eval diff <hash1> <hash2>     # side-by-side comparison
+uwf-eval list                     # list past runs
+```
+
+### Phase 2: Task Package Scaffold
+
+Template for creating eval tasks. Also serves as the first real task.
+
+```
+eval-tasks/                        # shazhou/uwf-eval-tasks monorepo
+  packages/
+    _template/                     # copypaste template
+      package.json
+      task.yaml
+      fixture/
+      src/judges/
+      tsconfig.json
+    fix-off-by-one/                # first real task
+      package.json                 # @uwf-eval/fix-off-by-one
+      task.yaml
+      fixture/
+        src/calc.ts                # buggy calculator
+        src/calc.test.ts           # test that exposes the bug
+        package.json
+      src/judges/
+        test-pass.ts               # runs pnpm test, checks exit code
+        code-quality.ts            # LLM judge: minimal change, correct fix
+      schemas/
+        test-pass.json             # OCAS schema for test-pass data
+        code-quality.json          # OCAS schema for code-quality data
+      tsconfig.json
+  pnpm-workspace.yaml
+  tsconfig.json
+  biome.json
+```
+
+#### task.yaml Format
+
+```yaml
+name: fix-off-by-one
+description: Fix an off-by-one error in a calculator's add function
+workflow: solve-issue              # registered workflow name, or relative path to .yaml
+prompt: "Fix the bug: add(1,2) returns 4 instead of 3"
+limits:
+  maxSteps: 15
+  timeoutMinutes: 30
+judges:
+  - name: frontmatter-compliance
+    weight: 0.15
+    builtin: true
+  - name: upstream-consumption
+    weight: 0.15
+    builtin: true
+  - name: hallucination
+    weight: 0.1
+    builtin: true
+  - name: token-stats
+    weight: 0                      # informational, not scored
+    builtin: true
+  - name: test-pass
+    weight: 0.3
+    entry: dist/judges/test-pass.js
+    schema: schemas/test-pass.json
+  - name: code-quality
+    weight: 0.3
+    entry: dist/judges/code-quality.js
+    schema: schemas/code-quality.json
+```
+
+#### Judge Script Contract
+
+```typescript
+// Input: process.argv = [node, script, cwd, threadId]
+// Output: stdout JSON
+// Exit 0 = success, non-zero = judge error (not low score)
+
+import type { JudgeOutput } from "@united-workforce/eval";
+
+const result: JudgeOutput<TestPassData> = {
+  score: 1.0,      // 0.0 - 1.0
+  data: {           // typed per judge schema
+    command: "pnpm test",
+    exitCode: 0,
+    output: "3 tests passed"
+  }
+};
+
+console.log(JSON.stringify(result));
+```
+
+### Phase 3: Prerequisite — $usage in Adapter Protocol (#68)
+
+Blocked by #68. Token stats judge needs `$usage` in step nodes.
+
+Can proceed with Phase 1+2 without it — token-stats judge just returns zeros until adapters report usage.
+
+## Implementation Order
+
+1. **Phase 1a**: `@united-workforce/eval` package scaffold + CLI skeleton + OCAS schemas
+2. **Phase 1b**: `run` command — prepare, execute, collect flow
+3. **Phase 1c**: Builtin judges — frontmatter (deterministic), upstream + hallucination (LLM-as-judge)
+4. **Phase 2a**: Create `shazhou/uwf-eval-tasks` monorepo with proman
+5. **Phase 2b**: First task `fix-off-by-one` with fixture repo + 2 custom judges
+6. **Phase 2c**: End-to-end test: `uwf-eval run packages/fix-off-by-one --agent hermes`
+7. **Phase 1d**: `report`, `diff`, `list` commands (read from CAS, render via ocas render)
+
+## Dependencies
+
+- `@ocas/core` + `@ocas/fs` — CAS storage
+- `@united-workforce/protocol` — step node types
+- `commander` — CLI framework (consistent with uwf)
+- LLM API access — for LLM-as-judge (upstream, hallucination, task-specific quality judges)
+
+## Open Questions
+
+1. **LLM-as-judge provider config** — reuse uwf's `~/.uwf/config.yaml` provider settings? Or separate config?
+2. **Workflow file location** — task.yaml references a workflow. Should the workflow YAML be inside the tarball, or reference a registered workflow by name?
+3. **Non-coding tasks** — debate workflow has no fixture repo. task.yaml needs `fixture: null` or simply omit the `fixture/` dir. Runner creates empty cwd.
+4. **Parallel judge execution** — judges are independent, can run in parallel. Worth the complexity?
+
+## Risks
+
+- LLM-as-judge consistency — same input may get different scores. Mitigation: run judge multiple times, take average? Or accept variance.
+- Token cost of judges — each LLM judge call costs tokens. For a 10-step workflow with 2 LLM judges = 20 LLM calls just for judging. Acceptable?
+- Fixture repo drift — if the fixture evolves, old eval runs become non-comparable. Pin fixture version in task.yaml.
@@ -82,7 +82,13 @@ async function runBuiltinWithMessages(

  if (loopResult.turnCount === 0) {
    log("5RWTK9NB", "no turns produced, returning empty output");
-    return { output: "", detailHash: "", sessionId: session.sessionId, assembledPrompt: "" };
+    return {
+      output: "",
+      detailHash: "",
+      sessionId: session.sessionId,
+      assembledPrompt: "",
+      usage: null,
+    };
  }

  // Read jsonl → persist turns to CAS → store detail
@@ -99,6 +105,7 @@ async function runBuiltinWithMessages(
    detailHash,
    sessionId: session.sessionId,
    assembledPrompt: "",
+    usage: null,
  };
 }

@@ -145,7 +145,7 @@ async function processClaudeOutput(
      );
    }

-    return { output, detailHash, sessionId, assembledPrompt };
+    return { output, detailHash, sessionId, assembledPrompt, usage: null };
  }

  // Truly unparseable output - provide enhanced error message
@@ -118,7 +118,7 @@ export function createHermesAgent(resumeDisabled: boolean): () => Promise<void>
      await setCachedSessionId(ctx.threadId, ctx.role, sessionId, ctx.storageRoot);
    }

-    return { output: text, detailHash, sessionId, assembledPrompt: fullPrompt };
+    return { output: text, detailHash, sessionId, assembledPrompt: fullPrompt, usage: null };
  }

  async function runHermes(ctx: AgentContext): Promise<AgentRunResult> {
@@ -149,7 +149,7 @@ export function createHermesAgent(resumeDisabled: boolean): () => Promise<void>
    // so the agent sees the full conversation history (crucial for retries).
    const { text, sessionId } = await client.prompt(message);
    const { detailHash } = await storePromptResult(store, sessionId);
-    return { output: text, detailHash, sessionId, assembledPrompt: "" };
+    return { output: text, detailHash, sessionId, assembledPrompt: "", usage: null };
  }

  const agentMain = createAgent({
@@ -103,6 +103,7 @@ export function createMockAgent(mockDataPath: string): () => Promise<void> {
      detailHash,
      sessionId,
      assembledPrompt: "",
+      usage: null,
    };
    lastResult = result;
    return result;
@@ -118,6 +118,7 @@ async function createTestStep(
    completedAtMs: Date.now() + 1000,
    assembledPrompt: null,
    cwd: "/tmp",
+    usage: null,
  };
  return store.cas.put(schemas.stepNode, stepPayload);
 }
@@ -96,6 +96,7 @@ describe("protocol types", () => {
      completedAtMs: 2000,
      assembledPrompt: null,
      cwd: "/test/path",
+      usage: null,
    };
    expect(record.startedAtMs).toBe(1000);
    expect(record.completedAtMs).toBe(2000);
@@ -110,6 +111,7 @@ describe("protocol types", () => {
      agent: "uwf-test",
      timestamp: 123,
      durationMs: 5000,
+      usage: null,
    };
    expect(entry.durationMs).toBe(5000);
  });
@@ -66,6 +66,7 @@ export async function cmdStepList(
      agent: item.payload.agent,
      timestamp: item.timestamp,
      durationMs: item.payload.completedAtMs - item.payload.startedAtMs,
+      usage: item.payload.usage ?? null,
    });
  }

@@ -27,6 +27,7 @@ describe("Protocol types for thread/edge location", () => {
        completedAtMs: Date.now() + 1000,
        assembledPrompt: null,
        cwd: "/home/user/project",
+        usage: null,
      };

      expect(record.cwd).toBe("/home/user/project");
@@ -44,6 +44,7 @@ export type {
  ThreadStatus,
  ThreadStepsOutput,
  ThreadsIndex,
+  Usage,
  WorkflowConfig,
  WorkflowName,
  WorkflowPayload,
@@ -91,6 +91,22 @@ export const STEP_NODE_SCHEMA: JSONSchema = {
    assembledPrompt: {
      anyOf: [{ type: "string", format: "ocas_ref" }, { type: "null" }],
    },
+    usage: {
+      anyOf: [
+        {
+          type: "object",
+          required: ["turns", "inputTokens", "outputTokens", "duration"],
+          properties: {
+            turns: { type: "integer" },
+            inputTokens: { type: "integer" },
+            outputTokens: { type: "integer" },
+            duration: { type: "number" },
+          },
+          additionalProperties: false,
+        },
+        { type: "null" },
+      ],
+    },
  },
  additionalProperties: false,
 };
@@ -22,6 +22,17 @@ export type StepRecord = {
  cwd: string;
  /** CAS ref to the fully assembled prompt sent to the agent. null for legacy steps. */
  assembledPrompt: CasRef | null;
+  /** Token usage statistics reported by the agent adapter. null for legacy steps. */
+  usage: Usage | null;
+};
+
+/** Token usage statistics reported by agent adapters. */
+export type Usage = {
+  turns: number;
+  inputTokens: number;
+  outputTokens: number;
+  /** Wall-clock duration in seconds. */
+  duration: number;
 };

 // ── 4.2 Workflow 定义 ───────────────────────────────────────────────
@@ -131,6 +142,7 @@ export type StepEntry = {
  agent: string;
  timestamp: number;
  durationMs: number;
+  usage: Usage | null;
 };

 /** uwf thread steps — start entry */
@@ -132,6 +132,7 @@ async function buildHistory(
      completedAtMs: step.completedAtMs,
      cwd: step.cwd ?? "",
      assembledPrompt: step.assembledPrompt ?? null,
+      usage: step.usage ?? null,
      content,
    });
  }
@@ -1,5 +1,5 @@
 import { getSchema, validate } from "@ocas/core";
-import type { CasRef, StepNodePayload, ThreadId } from "@united-workforce/protocol";
+import type { CasRef, StepNodePayload, ThreadId, Usage } from "@united-workforce/protocol";
 import { config as loadDotenv } from "dotenv";
 import { buildOutputFormatInstruction } from "./build-output-format-instruction.js";
 import { buildContextWithMeta } from "./context.js";
@@ -65,6 +65,7 @@ async function writeStepNode(options: {
  startedAtMs: number;
  completedAtMs: number;
  assembledPromptHash: CasRef | null;
+  usage: Usage | null;
 }): Promise<CasRef> {
  const payload: StepNodePayload = {
    start: options.startHash,
@@ -78,6 +79,7 @@ async function writeStepNode(options: {
    completedAtMs: options.completedAtMs,
    cwd: process.cwd(),
    assembledPrompt: options.assembledPromptHash,
+    usage: options.usage,
  };
  const hash = await options.store.cas.put(options.schemas.stepNode, payload);
  const node = options.store.cas.get(hash);
@@ -117,6 +119,7 @@ async function persistStep(options: {
  startedAtMs: number;
  completedAtMs: number;
  assembledPromptHash: CasRef | null;
+  usage: Usage | null;
 }): Promise<CasRef> {
  const { store, schemas, chain, headHash } = options.ctx.meta;
  return writeStepNode({
@@ -132,6 +135,7 @@ async function persistStep(options: {
    startedAtMs: options.startedAtMs,
    completedAtMs: options.completedAtMs,
    assembledPromptHash: options.assembledPromptHash,
+    usage: options.usage,
  });
 }

@@ -200,6 +204,7 @@ export function createAgent(options: AgentOptions): () => Promise<void> {
      );
    }
    const completedAtMs = Date.now();
+    const usage = agentResult.usage;

    // Store the assembled prompt in CAS for later inspection via `step read --prompt`
    const promptText = agentResult.assembledPrompt;
@@ -220,6 +225,7 @@ export function createAgent(options: AgentOptions): () => Promise<void> {
      startedAtMs,
      completedAtMs,
      assembledPromptHash,
+      usage,
    });

    const adapterOutput: AdapterOutput = {
@@ -230,6 +236,7 @@ export function createAgent(options: AgentOptions): () => Promise<void> {
      body: extracted.body,
      startedAtMs,
      completedAtMs,
+      usage,
    };
    process.stdout.write(`${JSON.stringify(adapterOutput)}\n`);
  };
@@ -1,5 +1,10 @@
 import type { Store } from "@ocas/core";
-import type { ModeratorContext, ThreadId, WorkflowPayload } from "@united-workforce/protocol";
+import type {
+  ModeratorContext,
+  ThreadId,
+  Usage,
+  WorkflowPayload,
+} from "@united-workforce/protocol";

 export type AgentContext = ModeratorContext & {
  threadId: ThreadId;
@@ -33,6 +38,8 @@ export type AgentRunResult = {
  sessionId: string;
  /** The fully assembled prompt that was sent to the agent. */
  assembledPrompt: string;
+  /** Token usage statistics for this run. null when the adapter does not report usage. */
+  usage: Usage | null;
 };

 export type AgentContinueFn = (
@@ -51,6 +58,7 @@ export type AdapterOutput = {
  body: string;
  startedAtMs: number;
  completedAtMs: number;
+  usage: Usage | null;
 };

 export type AgentOptions = {