Compare commits

..

2 Commits

Author SHA1 Message Date
xiaoju ede428bff2 test: E2E integration tests with uwf-mock agent (#33)
CI / check (pull_request) Failing after 2m30s
Three scenarios testing the full CLI pipeline:
1. Linear workflow (planner → worker → $END): CAS chain integrity
2. Loop workflow (developer ↔ reviewer): moderator routing through cycles
3. Role mismatch detection: agent catches routing bugs

Uses workflow add → thread start → thread exec with uwf-mock,
verifying CAS state, thread lifecycle, and error handling.

Refs #33
2026-06-04 07:44:48 +00:00
xiaoju 6850826abe feat: add agent-mock package for deterministic E2E testing (#33)
CI / check (pull_request) Failing after 3m14s
New package @united-workforce/agent-mock (uwf-mock CLI):
- Reads pre-scripted outputs from a YAML mock data file (--mock-data)
- Counts existing CAS chain steps to determine step index
- Validates expected role matches actual moderator routing
- Stores minimal detail node in CAS for valid step refs
- Zero LLM, instant execution, 100% deterministic

Usage in config.yaml:
  agents:
    mock:
      command: uwf-mock
      args: ["--mock-data", "./fixtures/scenario.yaml"]

Refs #33
2026-06-04 06:50:49 +00:00
148 changed files with 1260 additions and 5043 deletions
+8
View File
@@ -0,0 +1,8 @@
# Changesets
Hello and welcome! This folder has been automatically generated by `@changesets/cli`, a build tool that works
with multi-package repos, or single-package repos to help you version and publish your code. You can
find the full documentation for it [in our repository](https://github.com/changesets/changesets).
We have a quick list of common questions to get you started engaging with this project in
[our documentation](https://github.com/changesets/changesets/blob/main/docs/common-questions.md).
+11
View File
@@ -0,0 +1,11 @@
{
"$schema": "https://unpkg.com/@changesets/config@3.1.4/schema.json",
"changelog": "@changesets/cli/changelog",
"commit": false,
"fixed": [["@united-workforce/*"]],
"linked": [],
"access": "public",
"baseBranch": "main",
"updateInternalDependencies": "patch",
"ignore": ["@united-workforce/dashboard"]
}
+30
View File
@@ -0,0 +1,30 @@
{
"mode": "exit",
"tag": "alpha",
"initialVersions": {
"@uncaged/cli": "0.4.5",
"@uncaged/workflow-agent-cursor": "0.4.5",
"@uncaged/agent-hermes": "0.4.5",
"@uncaged/workflow-agent-llm": "0.4.5",
"@uncaged/workflow-agent-react": "0.4.5",
"@uncaged/workflow-cas": "0.4.5",
"@uncaged/dashboard": "0.1.0",
"@uncaged/workflow-execute": "0.4.5",
"@uncaged/workflow-gateway": "0.4.5",
"@uncaged/protocol": "0.4.5",
"@uncaged/workflow-reactor": "0.4.5",
"@uncaged/workflow-register": "0.4.5",
"@uncaged/workflow-runtime": "0.4.5",
"@uncaged/workflow-template-develop": "0.4.5",
"@uncaged/workflow-template-solve-issue": "0.4.5",
"@uncaged/util": "0.4.5",
"@uncaged/util-agent": "0.4.5"
},
"changesets": [
"env-api-unify",
"fix-internal-deps",
"fix-publish-src",
"fix-workspace-deps",
"rfc-252-agent-fn"
]
}
+5 -7
View File
@@ -12,17 +12,15 @@ jobs:
steps:
- uses: actions/checkout@v4
- uses: actions/setup-node@v4
with:
node-version: 22
- uses: oven-sh/setup-bun@v2
- run: corepack enable && pnpm install
- run: bun install
- name: Build
run: pnpm run build
run: bun run build
- name: Lint
run: pnpm run check
run: bun run check
- name: Test
run: pnpm run test:ci
run: bun run test:ci
-226
View File
@@ -1,226 +0,0 @@
# Eval Framework Implementation Plan
## Goal
Build `uwf-eval` CLI + eval task infrastructure for evaluating uwf workflow quality with real agents.
## Architecture
```
uwf-eval (runner) task package (npm) OCAS (storage)
│ │ │
├─ unpack tarball ───────► fixture/ → tmp cwd │
├─ read task.yaml │ │
├─ uwf thread start/exec │ │
├─ run judges ───────────► dist/judges/*.js │
├─ collect scores │ │
└─ store results ─────────────────────────────────────► CAS nodes + variables
```
### Key Design Decisions
- **uwf-eval is NOT part of uwf** — separate package, shells out to uwf CLI
- **Task = npm package** — fixture + task.yaml + judge scripts, distributable as tarball
- **Judge = Node script** — `node <entry> <cwd> <thread-id>`, outputs `{score, data}` JSON
- **Every output is OCAS typed** — eval-run, judge results all have registered schemas
- **Builtin judges** — frontmatter compliance, upstream consumption, hallucination, token stats
- **Task-specific judges** — bundled in the task package, custom schema per judge
## Deliverables
### Phase 1: Foundation (`@united-workforce/eval`)
New package in the uwf monorepo.
```
packages/eval/
src/
cli.ts # uwf-eval entry point
commands/
run.ts # uwf-eval run
report.ts # uwf-eval report <hash>
diff.ts # uwf-eval diff <hash> <hash>
list.ts # uwf-eval list
runner/
prepare.ts # unpack tarball/dir → tmp cwd
execute.ts # shell out to uwf thread start/exec
collect.ts # run judges, collect scores
judge/
types.ts # JudgeInput, JudgeOutput types
builtin/
frontmatter.ts # frontmatter compliance check
upstream.ts # upstream info consumption (LLM-as-judge)
hallucination.ts # hallucination detection (LLM-as-judge)
token-stats.ts # token usage from $usage field (#68)
storage/
schemas.ts # OCAS schema definitions
store.ts # CAS read/write helpers
index.ts # variable indexing (@uwf/eval/*)
task/
types.ts # TaskManifest type (task.yaml)
loader.ts # parse task.yaml, validate
package.json
tsconfig.json
```
#### OCAS Schemas to Register
1. `@uwf/eval-run` — full eval execution record
```
{ task, config: {agent, model, engineVersion}, threadId,
judges: [{name, score, weight, dataHash}], overall, timestamp }
```
2. `@uwf/eval-judge-frontmatter` — frontmatter judge data
```
{ stepsTotal, stepsValid, invalidSteps: [{stepIndex, role, errors: string[]}] }
```
3. `@uwf/eval-judge-upstream` — upstream consumption judge data
```
{ perStep: [{role, consumed: string[], missed: string[], score}] }
```
4. `@uwf/eval-judge-hallucination` — hallucination judge data
```
{ perStep: [{role, hallucinations: string[], score}] }
```
5. `@uwf/eval-judge-token-stats` — token stats (not scored, informational)
```
{ totalInput, totalOutput, totalTurns, perStep: [{role, input, output, turns, duration}] }
```
#### CLI Design
```bash
# Run eval
uwf-eval run <task-dir-or-tarball> [--agent hermes] [--model claude-sonnet-4] [--count 20]
# View results
uwf-eval report <run-hash> # render via ocas render
uwf-eval diff <hash1> <hash2> # side-by-side comparison
uwf-eval list # list past runs
```
### Phase 2: Task Package Scaffold
Template for creating eval tasks. Also serves as the first real task.
```
eval-tasks/ # shazhou/uwf-eval-tasks monorepo
packages/
_template/ # copypaste template
package.json
task.yaml
fixture/
src/judges/
tsconfig.json
fix-off-by-one/ # first real task
package.json # @uwf-eval/fix-off-by-one
task.yaml
fixture/
src/calc.ts # buggy calculator
src/calc.test.ts # test that exposes the bug
package.json
src/judges/
test-pass.ts # runs pnpm test, checks exit code
code-quality.ts # LLM judge: minimal change, correct fix
schemas/
test-pass.json # OCAS schema for test-pass data
code-quality.json # OCAS schema for code-quality data
tsconfig.json
pnpm-workspace.yaml
tsconfig.json
biome.json
```
#### task.yaml Format
```yaml
name: fix-off-by-one
description: Fix an off-by-one error in a calculator's add function
workflow: solve-issue # registered workflow name, or relative path to .yaml
prompt: "Fix the bug: add(1,2) returns 4 instead of 3"
limits:
maxSteps: 15
timeoutMinutes: 30
judges:
- name: frontmatter-compliance
weight: 0.15
builtin: true
- name: upstream-consumption
weight: 0.15
builtin: true
- name: hallucination
weight: 0.1
builtin: true
- name: token-stats
weight: 0 # informational, not scored
builtin: true
- name: test-pass
weight: 0.3
entry: dist/judges/test-pass.js
schema: schemas/test-pass.json
- name: code-quality
weight: 0.3
entry: dist/judges/code-quality.js
schema: schemas/code-quality.json
```
#### Judge Script Contract
```typescript
// Input: process.argv = [node, script, cwd, threadId]
// Output: stdout JSON
// Exit 0 = success, non-zero = judge error (not low score)
import type { JudgeOutput } from "@united-workforce/eval";
const result: JudgeOutput<TestPassData> = {
score: 1.0, // 0.0 - 1.0
data: { // typed per judge schema
command: "pnpm test",
exitCode: 0,
output: "3 tests passed"
}
};
console.log(JSON.stringify(result));
```
### Phase 3: Prerequisite — $usage in Adapter Protocol (#68)
Blocked by #68. Token stats judge needs `$usage` in step nodes.
Can proceed with Phase 1+2 without it — token-stats judge just returns zeros until adapters report usage.
## Implementation Order
1. **Phase 1a**: `@united-workforce/eval` package scaffold + CLI skeleton + OCAS schemas
2. **Phase 1b**: `run` command — prepare, execute, collect flow
3. **Phase 1c**: Builtin judges — frontmatter (deterministic), upstream + hallucination (LLM-as-judge)
4. **Phase 2a**: Create `shazhou/uwf-eval-tasks` monorepo with proman
5. **Phase 2b**: First task `fix-off-by-one` with fixture repo + 2 custom judges
6. **Phase 2c**: End-to-end test: `uwf-eval run packages/fix-off-by-one --agent hermes`
7. **Phase 1d**: `report`, `diff`, `list` commands (read from CAS, render via ocas render)
## Dependencies
- `@ocas/core` + `@ocas/fs` — CAS storage
- `@united-workforce/protocol` — step node types
- `commander` — CLI framework (consistent with uwf)
- LLM API access — for LLM-as-judge (upstream, hallucination, task-specific quality judges)
## Open Questions
1. **LLM-as-judge provider config** — reuse uwf's `~/.uwf/config.yaml` provider settings? Or separate config?
2. **Workflow file location** — task.yaml references a workflow. Should the workflow YAML be inside the tarball, or reference a registered workflow by name?
3. **Non-coding tasks** — the debate workflow has no fixture repo. task.yaml needs `fixture: null`, or the task can simply omit the `fixture/` dir. The runner then creates an empty cwd.
4. **Parallel judge execution** — judges are independent, can run in parallel. Worth the complexity?
## Risks
- LLM-as-judge consistency — same input may get different scores. Mitigation: run judge multiple times, take average? Or accept variance.
- Token cost of judges — each LLM judge call costs tokens. A 10-step workflow with 2 LLM judges means 20 LLM calls just for judging. Acceptable?
- Fixture repo drift — if the fixture evolves, old eval runs become non-comparable. Pin fixture version in task.yaml.
-25
View File
@@ -1,25 +0,0 @@
# Changelog
## 0.1.0 (2026-06-05)
Initial release of `@united-workforce/*` — a stateless workflow engine for AI agent orchestration.
### Packages
- **@united-workforce/protocol** — shared types (WorkflowPayload, StepNode, etc.)
- **@united-workforce/util** — Crockford Base32, ULID, structured logger, frontmatter parsing
- **@united-workforce/util-agent** — agent factory, context builder, extract pipeline
- **@united-workforce/cli** — `uwf` CLI (thread lifecycle, status-based moderator, workflow registry)
- **@united-workforce/eval** — `uwf-eval` CLI (prepare → execute → collect eval pipeline)
- **@united-workforce/agent-hermes** — `uwf-hermes` adapter (Hermes Agent)
- **@united-workforce/agent-claude-code** — `uwf-claude-code` adapter (Claude Code CLI)
- **@united-workforce/agent-builtin** — `uwf-builtin` adapter (built-in LLM agent)
- **@united-workforce/agent-mock** — `uwf-mock` adapter (deterministic test agent)
### Highlights
- Status-based graph routing (no LLM moderator cost)
- CAS-backed immutable thread chains (`@ocas/core`)
- Real token usage tracking (Hermes + Claude Code)
- Eval framework with built-in judges (frontmatter, token-stats, test-pass)
- `$SUSPEND` / resume for human-in-the-loop workflows
+16 -17
View File
@@ -222,42 +222,41 @@ Test files (`__tests__/**`) are exempt.
| Tool | Purpose |
|------|---------|
| **pnpm** | Package manager |
| **bun** | Package manager + runtime |
| **TypeScript** | Type checking (strict mode) |
| **Biome** | Lint + format (replaces ESLint + Prettier) |
| **vitest** | Test runner (all packages) |
| **vitest** | Test runner (`cli` uses vitest; other packages use `bun test`) |
### Development Workflow
```bash
# ── Setup ──
pnpm install # install all workspace dependencies
bun install # install all workspace dependencies
# ── Daily development ──
pnpm run build # build all packages (dependency order)
pnpm run check # biome check + lint-log-tags
pnpm run typecheck # tsc --build
pnpm run test # run tests across all packages
bun run build # tsc --build (all packages, dependency order)
bun run check # tsc --build + biome check + lint-log-tags
bun run format # biome format --write
bun test # run tests across all packages
# ── Before committing ──
pnpm run check # must pass — lint + log tag validation
pnpm run typecheck # must pass — type checking
pnpm run test # must pass — all package tests
bun run check # must pass — typecheck + lint + log tag validation
bun test # must pass — all package tests
```
### Publishing
All public `@united-workforce/*` packages are published to **npmjs.org** with **independent versioning**.
All public `@united-workforce/*` packages are published to **npmjs.org** with **fixed mode** (all packages share the same version number).
```bash
# 1. Add a changeset describing the change
npx changeset
bun changeset
# 2. Bump versions + generate CHANGELOGs
proman bump
# 2. Bump all package versions + generate CHANGELOGs
bun version
# 3. Build, test, and publish
proman publish
# 3. Build, test, and publish (runs scripts/publish-all.mjs)
bun release
# Or publish manually with a tag:
node scripts/publish-all.mjs --tag alpha
@@ -266,7 +265,7 @@ node scripts/publish-all.mjs --dry-run # preview without publishing
- `workspace:^` dependencies resolve to `^x.y.z` on publish
- Publish order defined in `scripts/publish-all.mjs` (dependency order)
- Changesets config: `.changeset/config.json` (independent versioning, public access)
- Changesets config: `.changeset/config.json` (fixed mode, public access)
### End-to-end: Author → Register → Run
+1 -1
View File
@@ -470,7 +470,7 @@ Use the `ocas` CLI for direct CAS operations (`~/.ocas/` store, shared with `uwf
| Tool | Purpose |
|------|---------|
| **pnpm** | Package manager |
| **bun** | Package manager + runtime |
| **TypeScript** | Type checking (strict mode) |
| **Biome** | Lint + format |
| **vitest** | Test runner |
+3 -3
View File
@@ -17,7 +17,7 @@ The root README should have these sections in order:
4. **Packages** — table with ALL packages from packages/ directory, columns: Package, Description, Type (cli/lib/agent/app)
5. **Quick Start** — install, build, register workflow, start thread, run step
6. **CLI Reference** — brief command list, detailed usage in cli README
7. **Development** — pnpm install / build / check / test
7. **Development** — bun install / build / check / test
## Per-Package README Structure
@@ -26,7 +26,7 @@ Each package README should have:
1. **Title** — package name
2. **One-line description** — matching package.json
3. **Overview** — what it does, where it sits in the architecture, dependencies
4. **Installation** — pnpm add (for libs) or "included as binary" (for cli/agents)
4. **Installation** — bun add (for libs) or "included as binary" (for cli/agents)
5. **API** (lib packages) — all exports from src/index.ts with type signatures, grouped by category, minimal usage examples
6. **CLI Usage** (cli/agent packages) — command reference with examples
7. **Internal Structure** — brief src/ file organization
@@ -56,7 +56,7 @@ For each package read:
- All relative links work
- Package names match package.json
- No references to removed/renamed packages
- pnpm run build still passes
- bun run build still passes
## Guidelines
+2 -2
View File
@@ -23,7 +23,7 @@ roles:
type: object
properties:
$status:
enum: ["done"]
enum: ["_"]
thesis:
type: string
keyPoints:
@@ -37,4 +37,4 @@ graph:
$START:
_: { role: "analyst", prompt: "Analyze the topic in the task and produce a structured summary with key points." }
analyst:
done: { role: "$END", prompt: "Analysis complete. Finish the workflow." }
_: { role: "$END", prompt: "Analysis complete. Finish the workflow." }
-30
View File
@@ -1,30 +0,0 @@
name: eval-simple
description: "Single-role eval workflow: fixer takes prompt, fixes code, done."
roles:
fixer:
description: "Fixes the code based on the prompt"
goal: |
You are a code fixer. Read the prompt, understand the bug, fix it, and verify by running the tests.
capabilities:
- code-editing
- test-running
procedure: |
1. Read the prompt to understand what needs to be fixed
2. Fix the bug in the source code
3. Run the tests mentioned in the prompt to verify
4. Output $status=done when tests pass
output: "Describe what you fixed and confirm tests pass. Set $status to done."
frontmatter:
type: object
properties:
$status:
type: string
enum: [done]
summary:
type: string
required: [$status, summary]
graph:
$START:
_: { role: "fixer", prompt: "Fix the code issue described in the task prompt." }
fixer:
done: { role: "$END", prompt: "Fix complete." }
@@ -1,8 +1,8 @@
import { mkdtemp, rm } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { createMemoryStore } from "@ocas/core";
import { afterEach, beforeEach, describe, expect, test } from "vitest";
import { createMemoryStore } from "@ocas/core";
import { storeBuiltinDetail } from "../src/detail.js";
import { appendSessionTurn, initSessionDir } from "../src/session.js";
import type { BuiltinTurnPayload } from "../src/types.js";
@@ -1,51 +1,51 @@
import { mkdir, rm, writeFile } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterAll, beforeAll, describe, expect, it } from "vitest";
import { describe, it, expect, beforeAll, afterAll } from "vitest";
import { readFileTool } from "../src/tools/read-file.js";
import { writeFile, mkdir, rm } from "node:fs/promises";
import { join } from "node:path";
import { tmpdir } from "node:os";
const testDir = join(tmpdir(), `read-file-test-${Date.now()}`);
const ctx = { cwd: testDir, storageRoot: testDir };
beforeAll(async () => {
await mkdir(testDir, { recursive: true });
await writeFile(join(testDir, "hello.txt"), "hello world", "utf8");
await mkdir(testDir, { recursive: true });
await writeFile(join(testDir, "hello.txt"), "hello world", "utf8");
});
afterAll(async () => {
await rm(testDir, { recursive: true, force: true });
await rm(testDir, { recursive: true, force: true });
});
describe("readFileTool", () => {
it("reads a file successfully", async () => {
const result = await readFileTool.execute({ path: "hello.txt" }, ctx);
expect(result).toBe("hello world");
});
it("reads a file successfully", async () => {
const result = await readFileTool.execute({ path: "hello.txt" }, ctx);
expect(result).toBe("hello world");
});
it("returns error for non-existent file", async () => {
const result = await readFileTool.execute({ path: "nope.txt" }, ctx);
expect(result).toMatch(/^Error:/);
});
it("returns error for non-existent file", async () => {
const result = await readFileTool.execute({ path: "nope.txt" }, ctx);
expect(result).toMatch(/^Error:/);
});
it("returns error for directory", async () => {
const result = await readFileTool.execute({ path: "." }, ctx);
expect(result).toBe("Error: not a file");
});
it("returns error for directory", async () => {
const result = await readFileTool.execute({ path: "." }, ctx);
expect(result).toBe("Error: not a file");
});
it("returns error when path is not a string", async () => {
const result = await readFileTool.execute({ path: 123 }, ctx);
expect(result).toBe("Error: path must be a string");
});
it("returns error when path is not a string", async () => {
const result = await readFileTool.execute({ path: 123 }, ctx);
expect(result).toBe("Error: path must be a string");
});
it("returns error when args is null", async () => {
const result = await readFileTool.execute(null, ctx);
expect(result).toBe("Error: path must be a string");
});
it("returns error when args is null", async () => {
const result = await readFileTool.execute(null, ctx);
expect(result).toBe("Error: path must be a string");
});
it("returns error for file exceeding 512KB limit", async () => {
const bigFile = join(testDir, "big.txt");
await writeFile(bigFile, Buffer.alloc(512 * 1024 + 1, 65));
const result = await readFileTool.execute({ path: "big.txt" }, ctx);
expect(result).toMatch(/Error:.*limit/);
});
it("returns error for file exceeding 512KB limit", async () => {
const bigFile = join(testDir, "big.txt");
await writeFile(bigFile, Buffer.alloc(512 * 1024 + 1, 65));
const result = await readFileTool.execute({ path: "big.txt" }, ctx);
expect(result).toMatch(/Error:.*limit/);
});
});
@@ -1,38 +1,38 @@
import { tmpdir } from "node:os";
import { describe, expect, it } from "vitest";
import { describe, it, expect } from "vitest";
import { runCommandTool } from "../src/tools/run-command.js";
import { tmpdir } from "node:os";
const ctx = { cwd: tmpdir(), storageRoot: tmpdir() };
describe("runCommandTool", () => {
it("runs echo command and checks stdout", async () => {
const result = await runCommandTool.execute({ command: "echo hello" }, ctx);
expect(result).toContain("hello");
expect(result).toContain("stdout");
});
it("runs echo command and checks stdout", async () => {
const result = await runCommandTool.execute({ command: "echo hello" }, ctx);
expect(result).toContain("hello");
expect(result).toContain("stdout");
});
it("returns exit code", async () => {
const result = await runCommandTool.execute({ command: "exit 0" }, ctx);
expect(result).toContain("exit_code: 0");
});
it("returns exit code", async () => {
const result = await runCommandTool.execute({ command: "exit 0" }, ctx);
expect(result).toContain("exit_code: 0");
});
it("returns non-zero exit code", async () => {
const result = await runCommandTool.execute({ command: "exit 42" }, ctx);
expect(result).toContain("exit_code: 42");
});
it("returns non-zero exit code", async () => {
const result = await runCommandTool.execute({ command: "exit 42" }, ctx);
expect(result).toContain("exit_code: 42");
});
it("returns error when command is not a string", async () => {
const result = await runCommandTool.execute({ command: 123 }, ctx);
expect(result).toBe("Error: command must be a string");
});
it("returns error when command is not a string", async () => {
const result = await runCommandTool.execute({ command: 123 }, ctx);
expect(result).toBe("Error: command must be a string");
});
it("returns error when args is null", async () => {
const result = await runCommandTool.execute(null, ctx);
expect(result).toBe("Error: command must be a string");
});
it("returns error when args is null", async () => {
const result = await runCommandTool.execute(null, ctx);
expect(result).toBe("Error: command must be a string");
});
it("custom cwd works", async () => {
const result = await runCommandTool.execute({ command: "pwd", cwd: "/tmp" }, ctx);
expect(result).toContain("/tmp");
});
it("custom cwd works", async () => {
const result = await runCommandTool.execute({ command: "pwd", cwd: "/tmp" }, ctx);
expect(result).toContain("/tmp");
});
});
@@ -3,13 +3,13 @@ import { mkdtemp, rm } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, beforeEach, describe, expect, test } from "vitest";
import type { BuiltinTurnPayload } from "../src/types.js";
import {
appendSessionTurn,
initSessionDir,
readSessionTurns,
removeSession,
} from "../src/session.js";
import type { BuiltinTurnPayload } from "../src/types.js";
describe("session", () => {
let storageRoot: string;
@@ -1,43 +1,43 @@
import { readFile, rm } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterAll, describe, expect, it } from "vitest";
import { describe, it, expect, afterAll } from "vitest";
import { writeFileTool } from "../src/tools/write-file.js";
import { readFile, rm } from "node:fs/promises";
import { join } from "node:path";
import { tmpdir } from "node:os";
const testDir = join(tmpdir(), `write-file-test-${Date.now()}`);
const ctx = { cwd: testDir, storageRoot: testDir };
afterAll(async () => {
await rm(testDir, { recursive: true, force: true });
await rm(testDir, { recursive: true, force: true });
});
describe("writeFileTool", () => {
it("writes file successfully", async () => {
const result = await writeFileTool.execute({ path: "out.txt", content: "hi" }, ctx);
expect(result).toMatch(/Wrote 2 bytes/);
const content = await readFile(join(testDir, "out.txt"), "utf8");
expect(content).toBe("hi");
});
it("writes file successfully", async () => {
const result = await writeFileTool.execute({ path: "out.txt", content: "hi" }, ctx);
expect(result).toMatch(/Wrote 2 bytes/);
const content = await readFile(join(testDir, "out.txt"), "utf8");
expect(content).toBe("hi");
});
it("creates parent directories", async () => {
const result = await writeFileTool.execute({ path: "a/b/c.txt", content: "nested" }, ctx);
expect(result).toMatch(/Wrote/);
const content = await readFile(join(testDir, "a/b/c.txt"), "utf8");
expect(content).toBe("nested");
});
it("creates parent directories", async () => {
const result = await writeFileTool.execute({ path: "a/b/c.txt", content: "nested" }, ctx);
expect(result).toMatch(/Wrote/);
const content = await readFile(join(testDir, "a/b/c.txt"), "utf8");
expect(content).toBe("nested");
});
it("returns error when path is not a string", async () => {
const result = await writeFileTool.execute({ path: 123, content: "x" }, ctx);
expect(result).toBe("Error: path and content must be strings");
});
it("returns error when path is not a string", async () => {
const result = await writeFileTool.execute({ path: 123, content: "x" }, ctx);
expect(result).toBe("Error: path and content must be strings");
});
it("returns error when content is not a string", async () => {
const result = await writeFileTool.execute({ path: "x.txt", content: 42 }, ctx);
expect(result).toBe("Error: path and content must be strings");
});
it("returns error when content is not a string", async () => {
const result = await writeFileTool.execute({ path: "x.txt", content: 42 }, ctx);
expect(result).toBe("Error: path and content must be strings");
});
it("returns error when args is null", async () => {
const result = await writeFileTool.execute(null, ctx);
expect(result).toBe("Error: path and content must be strings");
});
it("returns error when args is null", async () => {
const result = await writeFileTool.execute(null, ctx);
expect(result).toBe("Error: path and content must be strings");
});
});
+3 -2
View File
@@ -1,6 +1,6 @@
{
"name": "@united-workforce/agent-builtin",
"version": "0.1.1",
"version": "0.5.0",
"files": [
"src",
"dist",
@@ -8,7 +8,7 @@
],
"type": "module",
"bin": {
"uwf-builtin": "./dist/cli.js"
"uwf-builtin": "./src/cli.ts"
},
"exports": {
".": {
@@ -17,6 +17,7 @@
}
},
"scripts": {
"prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1",
"test": "vitest run __tests__/",
"test:ci": "vitest run __tests__/"
},
+1 -8
View File
@@ -82,13 +82,7 @@ async function runBuiltinWithMessages(
if (loopResult.turnCount === 0) {
log("5RWTK9NB", "no turns produced, returning empty output");
return {
output: "",
detailHash: "",
sessionId: session.sessionId,
assembledPrompt: "",
usage: null,
};
return { output: "", detailHash: "", sessionId: session.sessionId, assembledPrompt: "" };
}
// Read jsonl → persist turns to CAS → store detail
@@ -105,7 +99,6 @@ async function runBuiltinWithMessages(
detailHash,
sessionId: session.sessionId,
assembledPrompt: "",
usage: null,
};
}
-7
View File
@@ -1,12 +1,5 @@
#!/usr/bin/env node
// eslint-disable-next-line -- dynamic import for version
const pkg = await import("../package.json", { with: { type: "json" } });
if (process.argv.includes("--version") || process.argv.includes("-V")) {
process.stdout.write(`${pkg.default.version}\n`);
process.exit(0);
}
import { createBuiltinAgent } from "./agent.js";
const main = createBuiltinAgent();
+3 -3
View File
@@ -1,6 +1,6 @@
{
"name": "@united-workforce/agent-claude-code",
"version": "0.1.1",
"version": "0.1.0",
"files": [
"src",
"dist",
@@ -8,7 +8,7 @@
],
"type": "module",
"bin": {
"uwf-claude-code": "./dist/cli.js"
"uwf-claude-code": "./src/cli.ts"
},
"exports": {
".": {
@@ -17,12 +17,12 @@
}
},
"scripts": {
"prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1",
"test": "vitest run __tests__/",
"test:ci": "vitest run __tests__/"
},
"dependencies": {
"@ocas/core": "^0.3.0",
"@united-workforce/protocol": "workspace:^",
"@united-workforce/util": "workspace:^",
"@united-workforce/util-agent": "workspace:^"
},
@@ -1,6 +1,5 @@
import { spawn } from "node:child_process";
import type { Store } from "@ocas/core";
import type { Usage } from "@united-workforce/protocol";
import { createLogger } from "@united-workforce/util";
import {
type AgentContext,
@@ -146,14 +145,7 @@ async function processClaudeOutput(
);
}
const usage: Usage = {
turns: parsed.numTurns,
inputTokens: parsed.usage.inputTokens,
outputTokens: parsed.usage.outputTokens,
duration: Math.round(parsed.durationMs / 1000),
};
return { output, detailHash, sessionId, assembledPrompt, usage };
return { output, detailHash, sessionId, assembledPrompt };
}
// Truly unparseable output - provide enhanced error message
-7
View File
@@ -1,12 +1,5 @@
#!/usr/bin/env node
// eslint-disable-next-line -- dynamic import for version
const pkg = await import("../package.json", { with: { type: "json" } });
if (process.argv.includes("--version") || process.argv.includes("-V")) {
process.stdout.write(`${pkg.default.version}\n`);
process.exit(0);
}
import { createClaudeCodeAgent } from "./claude-code.js";
const model = process.env.CLAUDE_MODEL ?? null;
+1 -1
View File
@@ -2,5 +2,5 @@
"extends": "../../tsconfig.json",
"compilerOptions": { "rootDir": "src", "outDir": "dist" },
"include": ["src"],
"references": [{ "path": "../protocol" }, { "path": "../util-agent" }]
"references": [{ "path": "../util-agent" }]
}
-18
View File
@@ -1,18 +0,0 @@
# @united-workforce/agent-hermes
## 0.1.1
### Patch Changes
- 8085d1d: fix: read token usage from ACP PromptResponse instead of DB
Token counts (inputTokens, outputTokens) now come from the ACP
`PromptResponse.usage` field, which is populated synchronously from
`run_conversation()` return data — no WAL race condition.
Turns (assistant message count) still come from the DB via
`snapshotTurns()` before/after delta.
Previously both tokens and turns were read from the Hermes state DB
after the ACP prompt returned, but due to WAL write lag the DB often
had incomplete token data at read time (e.g. 235 vs actual 26,080).
@@ -0,0 +1,55 @@
import { afterEach, beforeEach, describe, expect, it } from "vitest";
import { HermesAcpClient } from "../../src/acp-client.js";
const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
/**
 * Exercises HermesAcpClient through its public connect()/prompt() API.
 * No mocks are installed in this suite, so each test is given a generous timeout.
 */
describe("HermesAcpClient", () => {
  // Shared per-test budget (two minutes), expressed in milliseconds.
  const TEST_TIMEOUT_MS = 2 * 60 * 1000;

  let client: HermesAcpClient;

  beforeEach(() => {
    client = new HermesAcpClient();
  });

  afterEach(async () => {
    await client.close();
  });

  // Options go in the second position (`it(name, options, fn)`): passing an
  // options object as the third argument is deprecated in Vitest 3, and the
  // other E2E suites in this repo already use this order.
  it("connect() returns a UUID sessionId", { timeout: TEST_TIMEOUT_MS }, async () => {
    const sessionId = await client.connect(process.cwd());
    expect(typeof sessionId).toBe("string");
    expect(sessionId).toMatch(UUID_RE);
  });

  it("prompt() returns a non-empty text response", { timeout: TEST_TIMEOUT_MS }, async () => {
    await client.connect(process.cwd());
    const result = await client.prompt("Reply with exactly the word: PONG");
    expect(typeof result.text).toBe("string");
    expect(result.text.length).toBeGreaterThan(0);
    expect(typeof result.sessionId).toBe("string");
    expect(result.sessionId).toMatch(UUID_RE);
  });

  it(
    "prompt() can be called twice on the same session (resume)",
    { timeout: TEST_TIMEOUT_MS },
    async () => {
      await client.connect(process.cwd());
      const first = await client.prompt("Say the word ALPHA and nothing else.");
      expect(first.text.length).toBeGreaterThan(0);
      const second = await client.prompt("Now say the word BETA and nothing else.");
      expect(second.text.length).toBeGreaterThan(0);
      // Both prompts must report the same session: no new session is created in between.
      expect(first.sessionId).toBe(second.sessionId);
    },
  );
});
@@ -0,0 +1,56 @@
import { afterEach, describe, expect, it } from "vitest";
import { HermesAcpClient } from "../../src/acp-client.js";
/**
 * E2E test for cross-process session resume.
 *
 * Simulates the workflow re-entry scenario:
 *   1. Client A: connect → prompt → close (developer first run)
 *   2. Client B: resume(sessionId) → prompt (developer re-entry after reviewer reject)
 *
 * This is what happens when uwf thread step spawns uwf-hermes twice for the same role.
 */
describe("HermesAcpClient cross-process resume", () => {
  // Every client created by a test is tracked here so afterEach can close it.
  const clients: HermesAcpClient[] = [];

  afterEach(async () => {
    // Empty the tracking list up front: if one close() rejects, the next test
    // still starts with a clean list instead of inheriting stale clients.
    const toClose = clients.splice(0);
    for (const c of toClose) {
      await c.close();
    }
  });

  // TODO(#435): flaky — depends on live LLM; mock or move to integration suite
  // Options are passed in the second position; the third-argument object form
  // is deprecated in Vitest 3.
  it.skip(
    "resume() after close — second prompt returns non-empty text",
    { timeout: 3 * 60 * 1000 },
    async () => {
      // --- Client A: first run ---
      const clientA = new HermesAcpClient();
      clients.push(clientA);
      await clientA.connect(process.cwd());
      const first = await clientA.prompt(
        "Remember the secret code: WATERMELON. Reply with exactly: ACKNOWLEDGED",
      );
      expect(first.text.length).toBeGreaterThan(0);
      const sessionId = first.sessionId;

      // Close client A (simulates uwf-hermes process exit)
      await clientA.close();

      // --- Client B: resume (simulates re-entry) ---
      const clientB = new HermesAcpClient();
      clients.push(clientB);
      await clientB.resume(sessionId, process.cwd());
      const second = await clientB.prompt(
        "What was the secret code I told you earlier? Reply with just the code word.",
      );

      // The critical assertion: resumed session produces non-empty output
      expect(second.text.length).toBeGreaterThan(0);
      expect(second.sessionId).toBe(sessionId);
    },
  );
});
@@ -140,9 +140,7 @@ function createTestDb(dbPath: string): TestDb {
db.exec(`CREATE TABLE sessions (
id TEXT PRIMARY KEY,
model TEXT NOT NULL,
started_at INTEGER NOT NULL,
input_tokens INTEGER DEFAULT 0,
output_tokens INTEGER DEFAULT 0
started_at INTEGER NOT NULL
)`);
db.exec(`CREATE TABLE messages (
id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -1,122 +0,0 @@
import { describe, expect, test } from "vitest";
import type { AcpUsage } from "../src/acp-client.js";
import { buildUsage, snapshotTurns } from "../src/hermes.js";
import type { HermesSessionJson } from "../src/types.js";
/**
 * Build a HermesSessionJson fixture for tests.
 *
 * Starts from an empty session (no messages, zero token counts) and applies
 * `overrides` on top, so a test only spells out the fields it cares about.
 */
function makeSession(overrides: Partial<HermesSessionJson> = {}): HermesSessionJson {
  const defaults: HermesSessionJson = {
    session_id: "test-session",
    model: "test-model",
    session_start: "2026-01-01T00:00:00Z",
    messages: [],
    inputTokens: 0,
    outputTokens: 0,
  };
  // Later spreads win, so any key present in `overrides` replaces the default.
  return { ...defaults, ...overrides };
}
// snapshotTurns reports how many assistant messages a session contains.
describe("snapshotTurns", () => {
  test("returns zero for null session", () => {
    expect(snapshotTurns(null)).toEqual({ turns: 0 });
  });

  test("returns zero for empty session", () => {
    const emptySession = makeSession();
    expect(snapshotTurns(emptySession)).toEqual({ turns: 0 });
  });

  test("counts assistant messages as turns", () => {
    // Two of the five messages are from the assistant; token counts are set
    // to non-zero values and do not appear in the expected snapshot.
    const session = makeSession({
      messages: [
        { role: "user", content: "hello", reasoning: null, tool_calls: null },
        { role: "assistant", content: "hi", reasoning: null, tool_calls: null },
        { role: "user", content: "do X", reasoning: null, tool_calls: null },
        { role: "tool", content: "result", reasoning: null, tool_calls: null },
        { role: "assistant", content: "done", reasoning: null, tool_calls: null },
      ],
      inputTokens: 1000,
      outputTokens: 500,
    });
    expect(snapshotTurns(session)).toEqual({ turns: 2 });
  });

  test("ignores non-assistant messages for turn count", () => {
    const session = makeSession({
      messages: [
        { role: "user", content: "hello", reasoning: null, tool_calls: null },
        { role: "tool", content: "result", reasoning: null, tool_calls: null },
      ],
    });
    const { turns } = snapshotTurns(session);
    expect(turns).toBe(0);
  });
});
// buildUsage merges ACP token counts, a before/after turn delta and a duration.
describe("buildUsage", () => {
  // Default ACP usage shared by the cases that do not care about token values.
  const acpUsage: AcpUsage = { inputTokens: 5000, outputTokens: 2000, totalTokens: 7000 };

  test("first visit: tokens from ACP, turns from DB delta", () => {
    const usage = buildUsage(acpUsage, { turns: 0 }, { turns: 3 }, 12.5);
    expect(usage).toEqual({
      turns: 3,
      inputTokens: 5000,
      outputTokens: 2000,
      duration: 13,
    });
  });

  test("re-entry: turn delta computed correctly, tokens from ACP", () => {
    const acpDelta: AcpUsage = { inputTokens: 8000, outputTokens: 3500, totalTokens: 11500 };
    const usage = buildUsage(acpDelta, { turns: 2 }, { turns: 4 }, 7.3);
    expect(usage).toEqual({
      turns: 2,
      inputTokens: 8000,
      outputTokens: 3500,
      duration: 7,
    });
  });

  test("floors negative turn deltas at 0, then defaults to 1", () => {
    // 3 - 5 = -2 → floored to 0 → the `|| 1` fallback yields 1.
    const { turns } = buildUsage(acpUsage, { turns: 5 }, { turns: 3 }, 1.0);
    expect(turns).toBe(1);
  });

  test("zero turns delta defaults to 1 (at least one turn happened)", () => {
    // 3 - 3 = 0 → the `|| 1` fallback yields 1.
    const { turns } = buildUsage(acpUsage, { turns: 3 }, { turns: 3 }, 5.0);
    expect(turns).toBe(1);
  });

  test("null ACP usage yields zero tokens", () => {
    const usage = buildUsage(null, { turns: 0 }, { turns: 2 }, 10.0);
    expect(usage).toEqual({
      turns: 2,
      inputTokens: 0,
      outputTokens: 0,
      duration: 10,
    });
  });

  test("duration is rounded", () => {
    const before = { turns: 0 };
    const after = { turns: 1 };
    const roundedDuration = (seconds: number) =>
      buildUsage(acpUsage, before, after, seconds).duration;
    expect(roundedDuration(3.7)).toBe(4);
    expect(roundedDuration(3.2)).toBe(3);
    expect(roundedDuration(0.0)).toBe(0);
  });
});
+3 -2
View File
@@ -1,6 +1,6 @@
{
"name": "@united-workforce/agent-hermes",
"version": "0.1.2",
"version": "0.5.0",
"files": [
"src",
"dist",
@@ -8,7 +8,7 @@
],
"type": "module",
"bin": {
"uwf-hermes": "./dist/cli.js"
"uwf-hermes": "./src/cli.ts"
},
"exports": {
".": {
@@ -17,6 +17,7 @@
}
},
"scripts": {
"prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1",
"test": "vitest run __tests__/",
"test:ci": "vitest run __tests__/"
},
+1 -40
View File
@@ -1,16 +1,6 @@
import type { ChildProcess } from "node:child_process";
import { spawn } from "node:child_process";
import { readFileSync } from "node:fs";
import { dirname, join } from "node:path";
import { createInterface } from "node:readline";
import { fileURLToPath } from "node:url";
const __dirname = dirname(fileURLToPath(import.meta.url));
const OWN_VERSION = (
JSON.parse(readFileSync(join(__dirname, "..", "package.json"), "utf-8")) as {
version: string;
}
).version;
const HERMES_COMMAND = "hermes";
const PROTOCOL_VERSION = 1;
@@ -27,17 +17,9 @@ type PendingRequest = {
reject: (reason: Error) => void;
};
/** Token usage returned by ACP PromptResponse. */
export type AcpUsage = {
inputTokens: number;
outputTokens: number;
totalTokens: number;
};
export type AcpPromptResult = {
text: string;
sessionId: string;
usage: AcpUsage | null;
};
export class HermesAcpClient {
@@ -90,11 +72,6 @@ export class HermesAcpClient {
return sessionId;
}
/** Return the current session ID, or null if not connected. */
getSessionId(): string | null {
return this.sessionId;
}
/** Send prompt and collect final assistant text from ACP stream chunks. */
async prompt(text: string): Promise<AcpPromptResult> {
if (this.sessionId === null) {
@@ -114,25 +91,9 @@ export class HermesAcpClient {
);
}
// Extract token usage from ACP PromptResponse.result.usage (camelCase wire format)
const result = (response as { result?: Record<string, unknown> }).result;
const rawUsage = result?.usage as Record<string, unknown> | undefined;
const usage: AcpUsage | null =
rawUsage !== undefined &&
typeof rawUsage.inputTokens === "number" &&
typeof rawUsage.outputTokens === "number" &&
typeof rawUsage.totalTokens === "number"
? {
inputTokens: rawUsage.inputTokens,
outputTokens: rawUsage.outputTokens,
totalTokens: rawUsage.totalTokens,
}
: null;
return {
text: this.messageChunks.join(""),
sessionId: this.sessionId,
usage,
};
}
@@ -309,7 +270,7 @@ export class HermesAcpClient {
private async initialize(): Promise<void> {
const initResponse = await this.sendRequest("initialize", {
protocolVersion: PROTOCOL_VERSION,
clientInfo: { name: "uwf-hermes", version: OWN_VERSION },
clientInfo: { name: "uwf", version: "0.1.0" },
capabilities: {},
});
-7
View File
@@ -1,12 +1,5 @@
#!/usr/bin/env node
// eslint-disable-next-line -- dynamic import for version
const pkg = await import("../package.json", { with: { type: "json" } });
if (process.argv.includes("--version") || process.argv.includes("-V")) {
process.stdout.write(`${pkg.default.version}\n`);
process.exit(0);
}
import { createHermesAgent } from "./hermes.js";
import { isResumeDisabled } from "./session-cache.js";
+8 -80
View File
@@ -1,5 +1,4 @@
import type { Store } from "@ocas/core";
import type { Usage } from "@united-workforce/protocol";
import { createLogger } from "@united-workforce/util";
import {
type AgentContext,
@@ -8,50 +7,13 @@ import {
buildRolePrompt,
createAgent,
} from "@united-workforce/util-agent";
import type { AcpUsage } from "./acp-client.js";
import { HermesAcpClient } from "./acp-client.js";
import { getCachedSessionId, setCachedSessionId } from "./session-cache.js";
import { loadHermesSession, storeHermesSessionDetail } from "./session-detail.js";
import type { HermesSessionJson } from "./types.js";
const log = createLogger({ sink: { kind: "stderr" } });
/** Session metrics captured before and after a prompt call so a delta can be taken. */
type TurnsSnapshot = {
  turns: number;
};

/** Snapshot returned when there is no session to inspect. */
const ZERO_TURNS: TurnsSnapshot = { turns: 0 };

/**
 * Count the assistant messages in a session.
 *
 * @param session - Loaded session, or null when none is available.
 * @returns The shared ZERO_TURNS snapshot for a null session; otherwise a new
 *   snapshot whose `turns` is the number of messages with role "assistant".
 */
export function snapshotTurns(session: HermesSessionJson | null): TurnsSnapshot {
  if (session === null) {
    return ZERO_TURNS;
  }
  let assistantCount = 0;
  for (const message of session.messages) {
    if (message.role === "assistant") {
      assistantCount += 1;
    }
  }
  return { turns: assistantCount };
}
/**
 * Assemble a Usage record from ACP token data and a DB turn delta.
 *
 * Tokens are taken from the ACP PromptResponse usage (zero when it is absent).
 * Turns are the difference between the before/after DB snapshots; the DB may
 * lag slightly due to WAL, which is accepted here.
 *
 * @param acpUsage - Token usage reported by ACP, or null if none was returned.
 * @param beforeTurns - Turn snapshot taken before the prompt.
 * @param afterTurns - Turn snapshot taken after the prompt.
 * @param durationSec - Elapsed time in seconds; rounded to the nearest integer.
 * @returns Usage with `turns` of at least 1.
 */
export function buildUsage(
  acpUsage: AcpUsage | null,
  beforeTurns: TurnsSnapshot,
  afterTurns: TurnsSnapshot,
  durationSec: number,
): Usage {
  const turnDelta = afterTurns.turns - beforeTurns.turns;
  // A zero or negative delta is reported as a single turn.
  const turns = turnDelta > 0 ? turnDelta : 1;
  const inputTokens = acpUsage?.inputTokens ?? 0;
  const outputTokens = acpUsage?.outputTokens ?? 0;
  const duration = Math.round(durationSec);
  return { turns, inputTokens, outputTokens, duration };
}
/** Assemble system prompt, task, and prior step outputs for Hermes. */
export function buildHermesPrompt(ctx: AgentContext): string {
const parts: string[] = [];
@@ -146,45 +108,25 @@ export function createHermesAgent(resumeDisabled: boolean): () => Promise<void>
void client.close();
});
async function runPrompt(
ctx: AgentContext,
useContinuation: boolean,
beforeTurns: TurnsSnapshot,
): Promise<AgentRunResult> {
async function runPrompt(ctx: AgentContext, useContinuation: boolean): Promise<AgentRunResult> {
const effectiveCtx = useContinuation ? ctx : { ...ctx, isFirstVisit: true };
const fullPrompt = buildHermesPrompt(effectiveCtx);
const startMs = Date.now();
const { text, sessionId, usage: acpUsage } = await client.prompt(fullPrompt);
const durationSec = (Date.now() - startMs) / 1000;
const { text, sessionId } = await client.prompt(fullPrompt);
const { detailHash } = await storePromptResult(ctx.store, sessionId);
if (!resumeDisabled) {
await setCachedSessionId(ctx.threadId, ctx.role, sessionId, ctx.storageRoot);
}
// Turns from DB (may lag slightly due to WAL, but acceptable)
const afterSession = await loadHermesSession(sessionId);
const afterTurns = snapshotTurns(afterSession);
const usage = buildUsage(acpUsage, beforeTurns, afterTurns, durationSec);
return { output: text, detailHash, sessionId, assembledPrompt: fullPrompt, usage };
return { output: text, detailHash, sessionId, assembledPrompt: fullPrompt };
}
async function runHermes(ctx: AgentContext): Promise<AgentRunResult> {
const cwd = process.cwd();
const attempt = await prepareSession(client, ctx, cwd, resumeDisabled);
// Snapshot before prompt: for resumed sessions, captures cumulative state
// so we can compute the turn delta. For new sessions, this is ZERO_TURNS.
const currentSessionId = client.getSessionId();
const beforeSession =
attempt.resumed && currentSessionId !== null
? await loadHermesSession(currentSessionId)
: null;
const beforeTurns = snapshotTurns(beforeSession);
try {
return await runPrompt(ctx, attempt.useContinuation, beforeTurns);
return await runPrompt(ctx, attempt.useContinuation);
} catch (error) {
if (!attempt.resumed) {
throw error;
@@ -194,8 +136,7 @@ export function createHermesAgent(resumeDisabled: boolean): () => Promise<void>
log("8FQW2R6N", `continuation prompt failed, retrying with initial prompt: ${message}`);
await client.close();
await client.connect(cwd);
// Fresh session after retry — reset snapshot to zero
return runPrompt(ctx, false, ZERO_TURNS);
return runPrompt(ctx, false);
}
}
@@ -206,22 +147,9 @@ export function createHermesAgent(resumeDisabled: boolean): () => Promise<void>
): Promise<AgentRunResult> {
// Client is already connected from runHermes — same ACP session,
// so the agent sees the full conversation history (crucial for retries).
// Snapshot turns before the continuation prompt for delta computation.
const currentSessionId = client.getSessionId();
const beforeSession =
currentSessionId !== null ? await loadHermesSession(currentSessionId) : null;
const beforeTurns = snapshotTurns(beforeSession);
const startMs = Date.now();
const { text, sessionId, usage: acpUsage } = await client.prompt(message);
const durationSec = (Date.now() - startMs) / 1000;
const { text, sessionId } = await client.prompt(message);
const { detailHash } = await storePromptResult(store, sessionId);
const afterSession = await loadHermesSession(sessionId);
const afterTurns = snapshotTurns(afterSession);
const usage = buildUsage(acpUsage, beforeTurns, afterTurns, durationSec);
return { output: text, detailHash, sessionId, assembledPrompt: "", usage };
return { output: text, detailHash, sessionId, assembledPrompt: "" };
}
const agentMain = createAgent({
+1 -7
View File
@@ -1,8 +1,2 @@
export type { AcpUsage } from "./acp-client.js";
export { HermesAcpClient } from "./acp-client.js";
export {
buildHermesPrompt,
buildUsage,
createHermesAgent,
snapshotTurns,
} from "./hermes.js";
export { buildHermesPrompt, createHermesAgent } from "./hermes.js";
+2 -8
View File
@@ -106,7 +106,7 @@ function parseSessionJson(raw: unknown): HermesSessionJson | null {
messages.push(msg);
}
}
return { session_id, model, session_start, messages, inputTokens: 0, outputTokens: 0 };
return { session_id, model, session_start, messages };
}
export function getHermesDbPath(): string {
@@ -117,8 +117,6 @@ type DbSessionRow = {
id: string;
model: string;
started_at: number;
input_tokens: number;
output_tokens: number;
};
type DbMessageRow = {
@@ -158,9 +156,7 @@ export function loadHermesSessionFromDb(
try {
db = new DatabaseSync(resolvedPath, { readOnly: true });
const session = db
.prepare(
"SELECT id, model, started_at, input_tokens, output_tokens FROM sessions WHERE id = ?",
)
.prepare("SELECT id, model, started_at FROM sessions WHERE id = ?")
.get(sessionId) as DbSessionRow | null;
if (session === null) {
return null;
@@ -185,8 +181,6 @@ export function loadHermesSessionFromDb(
model: session.model,
session_start: new Date(session.started_at * 1000).toISOString(),
messages,
inputTokens: session.input_tokens ?? 0,
outputTokens: session.output_tokens ?? 0,
};
} catch {
return null;
-2
View File
@@ -40,6 +40,4 @@ export type HermesSessionJson = {
model: string;
session_start: string;
messages: HermesSessionMessage[];
inputTokens: number;
outputTokens: number;
};
+2 -1
View File
@@ -1,6 +1,6 @@
{
"name": "@united-workforce/agent-mock",
"version": "0.1.1",
"version": "0.5.0",
"files": [
"src",
"dist",
@@ -17,6 +17,7 @@
}
},
"scripts": {
"prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1",
"test": "vitest run __tests__/",
"test:ci": "vitest run __tests__/"
},
-7
View File
@@ -1,12 +1,5 @@
#!/usr/bin/env node
// eslint-disable-next-line -- dynamic import for version
const pkg = await import("../package.json", { with: { type: "json" } });
if (process.argv.includes("--version") || process.argv.includes("-V")) {
process.stdout.write(`${pkg.default.version}\n`);
process.exit(0);
}
import { createMockAgent } from "./mock-agent.js";
const USAGE = "usage: uwf-mock --mock-data <path> --thread <id> --role <role> --prompt <text>";
-1
View File
@@ -103,7 +103,6 @@ export function createMockAgent(mockDataPath: string): () => Promise<void> {
detailHash,
sessionId,
assembledPrompt: "",
usage: { turns: 1, inputTokens: 0, outputTokens: 0, duration: 0 },
};
lastResult = result;
return result;
-9
View File
@@ -1,9 +0,0 @@
# @united-workforce/cli
## 0.1.1
### Patch Changes
- 850a3b2: fix: resolve --agent override via config alias before raw command
`resolveAgentConfig()` now checks `config.agents[alias]` first before falling back to `parseAgentOverride()`. Eval CLI default `--agent` changed from `"hermes"` to `"uwf-hermes"`.
+2 -1
View File
@@ -1,6 +1,6 @@
{
"name": "@united-workforce/cli",
"version": "0.1.1",
"version": "0.5.0",
"files": [
"src",
"dist",
@@ -22,6 +22,7 @@
"yaml": "^2.8.4"
},
"scripts": {
"prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1",
"test": "vitest run src/",
"test:ci": "vitest run src/"
},
+48 -27
View File
@@ -6,7 +6,13 @@ import type { CasRef, ThreadId } from "@united-workforce/protocol";
import { describe, expect, test } from "vitest";
import { createMarker, deleteMarker } from "../background/index.js";
import { cmdThreadList, cmdThreadShow, cmdThreadStart } from "../commands/thread.js";
import { completeThread, createUwfStore, loadActiveThreads, setThread } from "../store.js";
import {
addHistoryEntry,
createUwfStore,
deleteThread,
loadAllThreads,
setThread,
} from "../store.js";
const OUTPUT_SCHEMA = {
type: "object" as const,
@@ -42,7 +48,7 @@ roles:
type: object
required: ["$status"]
properties:
$status: { type: string, enum: ["done"] }
$status: { type: string }
graph:
$START:
_:
@@ -59,7 +65,7 @@ graph:
prompt: "Try again"
location: null
roleB:
done:
_:
role: $END
prompt: "Done"
location: null
@@ -92,7 +98,7 @@ roles:
type: object
required: ["$status"]
properties:
$status: { type: string, enum: ["done"] }
$status: { type: string }
roleC:
description: Fail role
goal: Do C
@@ -104,7 +110,7 @@ roles:
type: object
required: ["$status"]
properties:
$status: { type: string, enum: ["done"] }
$status: { type: string }
graph:
$START:
_:
@@ -121,12 +127,12 @@ graph:
prompt: "Do C (fail)"
location: null
roleB:
done:
_:
role: $END
prompt: "Done"
location: null
roleC:
done:
_:
role: $END
prompt: "Done"
location: null
@@ -147,7 +153,7 @@ roles:
type: object
required: ["$status"]
properties:
$status: { type: string, enum: ["done"] }
$status: { type: string }
graph:
$START:
_:
@@ -155,7 +161,7 @@ graph:
prompt: "Work"
location: null
worker:
done:
_:
role: $END
prompt: "Done"
location: null
@@ -169,7 +175,7 @@ async function insertStepNode(
outputPayload: Record<string, unknown>,
): Promise<void> {
const uwf = await createUwfStore(storageRoot);
const index = loadActiveThreads(uwf.varStore);
const index = loadAllThreads(uwf.varStore);
const headEntry = index[threadId];
if (headEntry === undefined) throw new Error(`thread ${threadId} not in index`);
const head = headEntry.head;
@@ -200,13 +206,7 @@ async function insertStepNode(
assembledPrompt: null,
})) as CasRef;
setThread(uwf.varStore, threadId, {
head: stepHash,
status: "idle",
suspendedRole: null,
suspendMessage: null,
completedAt: null,
});
setThread(uwf.varStore, threadId, { head: stepHash, suspendedRole: null, suspendMessage: null });
}
describe("currentRole field", () => {
@@ -282,12 +282,19 @@ describe("currentRole field", () => {
try {
const wf = join(tmpDir, "test-current-role.yaml");
await writeFile(wf, SIMPLE_WORKFLOW_YAML, "utf8");
const { thread } = await cmdThreadStart(storageRoot, wf, "test", tmpDir);
const { thread, workflow } = await cmdThreadStart(storageRoot, wf, "test", tmpDir);
const tid = thread as ThreadId;
const uwfForIndex = await createUwfStore(storageRoot);
loadActiveThreads(uwfForIndex.varStore)[tid]!.head;
completeThread(uwfForIndex.varStore, tid, "completed");
const head = loadAllThreads(uwfForIndex.varStore)[tid]!.head;
deleteThread(uwfForIndex.varStore, tid);
addHistoryEntry(uwfForIndex.varStore, {
thread: tid,
workflow,
head,
completedAt: Date.now(),
reason: "completed",
});
const result = await cmdThreadShow(storageRoot, tid);
expect(result.status).toBe("completed");
@@ -303,12 +310,19 @@ describe("currentRole field", () => {
try {
const wf = join(tmpDir, "test-current-role.yaml");
await writeFile(wf, SIMPLE_WORKFLOW_YAML, "utf8");
const { thread } = await cmdThreadStart(storageRoot, wf, "test", tmpDir);
const { thread, workflow } = await cmdThreadStart(storageRoot, wf, "test", tmpDir);
const tid = thread as ThreadId;
const uwfForIndex = await createUwfStore(storageRoot);
loadActiveThreads(uwfForIndex.varStore)[tid]!.head;
completeThread(uwfForIndex.varStore, tid, "cancelled");
const head = loadAllThreads(uwfForIndex.varStore)[tid]!.head;
deleteThread(uwfForIndex.varStore, tid);
addHistoryEntry(uwfForIndex.varStore, {
thread: tid,
workflow,
head,
completedAt: Date.now(),
reason: "cancelled",
});
const result = await cmdThreadShow(storageRoot, tid);
expect(result.status).toBe("cancelled");
@@ -361,8 +375,15 @@ describe("currentRole field", () => {
const comp = await cmdThreadStart(storageRoot, wf, "completed", tmpDir);
const compId = comp.thread as ThreadId;
const uwfForIndex = await createUwfStore(storageRoot);
const _compHead = loadActiveThreads(uwfForIndex.varStore)[compId]!.head;
completeThread(uwfForIndex.varStore, compId, "completed");
const compHead = loadAllThreads(uwfForIndex.varStore)[compId]!.head;
deleteThread(uwfForIndex.varStore, compId);
addHistoryEntry(uwfForIndex.varStore, {
thread: compId,
workflow: comp.workflow,
head: compHead,
completedAt: Date.now(),
reason: "completed",
});
const list = await cmdThreadList(storageRoot, null, null, null, 0, 100);
@@ -426,8 +447,8 @@ describe("currentRole field", () => {
await writeFile(wf, SINGLE_ROLE_WORKFLOW_YAML, "utf8");
const { thread } = await cmdThreadStart(storageRoot, wf, "test", tmpDir);
// worker → done maps to $END
await insertStepNode(storageRoot, thread as ThreadId, "worker", { $status: "done" });
// worker → _ maps to $END
await insertStepNode(storageRoot, thread as ThreadId, "worker", {});
const result = await cmdThreadShow(storageRoot, thread as ThreadId);
expect(result.currentRole).toBe(null);
+14 -219
View File
@@ -10,7 +10,7 @@ import { afterEach, beforeAll, beforeEach, describe, expect, test } from "vitest
import { stringify } from "yaml";
import { cmdThreadStart } from "../commands/thread.js";
import { cmdWorkflowAdd } from "../commands/workflow.js";
import { createUwfStore, getThread } from "../store.js";
import { createUwfStore, findHistoryEntry, getThread } from "../store.js";
// ── paths ──────────────────────────────────────────────────────────────────
@@ -106,13 +106,9 @@ async function addWorkflow(workflowFixture: string, workflowName: string): Promi
type ExecResult = { stdout: string; stderr: string; exitCode: number };
function runExec(threadId: string, count: number | null = null): ExecResult {
const args = [CLI_PATH, "thread", "exec", threadId];
if (count !== null) {
args.push("--count", String(count));
}
function runExec(threadId: string): ExecResult {
try {
const stdout = execFileSync(process.execPath, args, {
const stdout = execFileSync(process.execPath, [CLI_PATH, "thread", "exec", threadId], {
encoding: "utf8",
stdio: ["ignore", "pipe", "pipe"],
env: { ...process.env, UWF_HOME: uwfHome, OCAS_HOME: casDir },
@@ -130,38 +126,11 @@ function runExec(threadId: string, count: number | null = null): ExecResult {
}
}
/**
 * Run `uwf thread resume <threadId> -p <prompt>` through the built CLI.
 *
 * The CLI is spawned synchronously with the current Node binary, using the
 * suite's UWF_HOME / OCAS_HOME and working directory, and a 30 s timeout.
 *
 * @returns Captured stdout with exit code 0 on success. On failure, whatever
 *   stdout/stderr the thrown error carries, with the exit code defaulting to 1
 *   when the error has no status.
 */
function runResume(threadId: string, prompt: string): ExecResult {
  const cliArgs = [CLI_PATH, "thread", "resume", threadId, "-p", prompt];
  try {
    const stdout = execFileSync(process.execPath, cliArgs, {
      encoding: "utf8",
      stdio: ["ignore", "pipe", "pipe"],
      env: { ...process.env, UWF_HOME: uwfHome, OCAS_HOME: casDir },
      cwd: tmpDir,
      timeout: 30000,
    });
    return { stdout, stderr: "", exitCode: 0 };
  } catch (e: unknown) {
    // execFileSync attaches the child's output and exit status to the thrown error.
    const failure = e as NodeJS.ErrnoException & {
      stdout?: string;
      stderr?: string;
      status?: number;
    };
    const stdout = failure.stdout ?? "";
    const stderr = failure.stderr ?? "";
    const exitCode = failure.status ?? 1;
    return { stdout, stderr, exitCode };
  }
}
type StepOutputJson = {
thread: string;
head: string;
status: string;
currentRole: string | null;
suspendedRole: string | null;
suspendMessage: string | null;
done: boolean;
};
@@ -229,25 +198,19 @@ describe("E2E mock-agent: full uwf pipeline", () => {
expect(getStatus(store, s1.output)).toBe("ready");
expect(getStatus(store, s2.output)).toBe("done");
// Mock agent reports usage stats in step nodes.
expect(s1.usage).toEqual({ turns: 1, inputTokens: 0, outputTokens: 0, duration: 0 });
expect(s2.usage).toEqual({ turns: 1, inputTokens: 0, outputTokens: 0, duration: 0 });
// The start node points at the registered workflow.
const startNode = store.cas.get(startHash as CasRef);
expect((startNode!.payload as StartNodePayload).workflow).toBe(workflowHash);
// Thread is completed: status changed to "completed", head updated.
// Thread is completed: removed from active index, present in history.
const uwf = await createUwfStore(uwfHome);
const finalEntry = getThread(uwf.varStore, threadId);
expect(finalEntry).not.toBeNull();
expect(finalEntry!.status).toBe("completed");
expect(finalEntry!.head).toBe(step2.head);
expect(getThread(uwf.varStore, threadId)).toBeNull();
const hist = findHistoryEntry(uwf.varStore, threadId);
expect(hist).not.toBeNull();
expect(hist!.head).toBe(step2.head);
});
test("2. branching workflow loops developer→reviewer→developer→reviewer→$END", {
timeout: 30_000,
}, async () => {
test("2. branching workflow loops developer→reviewer→developer→reviewer→$END", async () => {
await writeMockConfig("e2e-loop.mock.yaml");
const workflowHash = await addWorkflow("e2e-loop.workflow.yaml", "test-loop");
@@ -300,14 +263,11 @@ describe("E2E mock-agent: full uwf pipeline", () => {
expect(getStatus(store, n4.output)).toBe("approved");
const uwf = await createUwfStore(uwfHome);
const finalEntry = getThread(uwf.varStore, threadId);
expect(finalEntry).not.toBeNull();
expect(finalEntry!.status).toBe("completed");
expect(getThread(uwf.varStore, threadId)).toBeNull();
expect(findHistoryEntry(uwf.varStore, threadId)).not.toBeNull();
});
test("3. role mismatch in mock data makes the agent exit with an error", {
timeout: 30_000,
}, async () => {
test("3. role mismatch in mock data makes the agent exit with an error", async () => {
// Reuses the linear workflow but with a mock whose step[1].role is wrong.
await writeMockConfig("e2e-mismatch.mock.yaml");
const workflowHash = await addWorkflow("e2e-linear.workflow.yaml", "test-linear");
@@ -327,172 +287,7 @@ describe("E2E mock-agent: full uwf pipeline", () => {
// The thread remains active (no step node was written for the failed step).
const uwf = await createUwfStore(uwfHome);
const entry = getThread(uwf.varStore, threadId);
expect(entry).not.toBeNull();
expect(entry!.status).not.toBe("completed");
expect(entry!.head).toBe(step1.head);
});
// Scenario 4: suspend/resume round trip.
// The first exec must leave the thread suspended on the planner role with a
// suspend message; `uwf thread resume` must then run planner again and finish
// the thread. Both the CLI JSON output and the stored thread entry are checked.
test("4. planner $SUSPEND then resume re-runs planner and reaches $END", {
timeout: 30_000,
}, async () => {
await writeMockConfig("e2e-suspend.mock.yaml");
const workflowHash = await addWorkflow("e2e-suspend.workflow.yaml", "test-suspend");
const start = await cmdThreadStart(uwfHome, workflowHash, "Analyze the task", uwfHome, tmpDir);
const threadId = start.thread;
// Step 1 → planner emits insufficient_info → thread suspends.
const step1 = execStep(threadId);
expect(step1.status).toBe("suspended");
expect(step1.done).toBe(false);
expect(step1.currentRole).toBeNull();
expect(step1.suspendedRole).toBe("planner");
expect(step1.suspendMessage).toBe("Need more info: missing requirements");
// Thread index entry reflects the suspension with rendered metadata.
// A fresh store handle is opened so the assertions read persisted state.
const suspendedEntry = getThread((await createUwfStore(uwfHome)).varStore, threadId);
expect(suspendedEntry).not.toBeNull();
expect(suspendedEntry!.status).toBe("suspended");
expect(suspendedEntry!.suspendedRole).toBe("planner");
expect(suspendedEntry!.suspendMessage).toBe("Need more info: missing requirements");
// Resume re-runs the planner role; the second scripted step is `ready` → $END.
const resume = runResume(threadId, "Here are the requirements");
expect(resume.exitCode).toBe(0);
// The resume command prints a single JSON step result on stdout.
const resumeOut = JSON.parse(resume.stdout.trim()) as StepOutputJson;
expect(resumeOut.status).toBe("completed");
expect(resumeOut.done).toBe(true);
expect(resumeOut.currentRole).toBeNull();
expect(resumeOut.suspendedRole).toBeNull();
// CAS chain: suspended planner step → resumed planner step.
const store = await openStore(casDir);
const s1 = getStepNode(store, step1.head);
const s2 = getStepNode(store, resumeOut.head);
expect(s1.role).toBe("planner");
expect(s2.role).toBe("planner");
// The resumed step links back to the suspended one.
expect(s2.prev).toBe(step1.head);
expect(getStatus(store, s1.output)).toBe("insufficient_info");
expect(getStatus(store, s2.output)).toBe("ready");
// The thread entry is still present after completion and points at the resumed step.
const finalEntry = getThread((await createUwfStore(uwfHome)).varStore, threadId);
expect(finalEntry).not.toBeNull();
expect(finalEntry!.status).toBe("completed");
expect(finalEntry!.head).toBe(resumeOut.head);
});
// Scenario 5: multi-step exec.
// One `thread exec --count 3` call must run all three roles in order and
// report one result per step; the CAS chain and final thread entry are then
// checked against those results.
test("5. --count 3 runs the whole linear pipeline in one invocation", {
timeout: 30_000,
}, async () => {
await writeMockConfig("e2e-count.mock.yaml");
const workflowHash = await addWorkflow("e2e-count.workflow.yaml", "test-count");
const start = await cmdThreadStart(uwfHome, workflowHash, "Ship the feature", uwfHome, tmpDir);
const threadId = start.thread;
// Single invocation with --count 3 → moderator drives analyst → developer → reviewer → $END.
const { stdout, stderr, exitCode } = runExec(threadId, 3);
// stderr is included in the failure message to make a non-zero exit diagnosable.
expect(exitCode, `stderr: ${stderr}`).toBe(0);
// Multi-step exec emits a JSON array (one entry per executed step).
const results = JSON.parse(stdout.trim()) as StepOutputJson[];
expect(Array.isArray(results)).toBe(true);
expect(results).toHaveLength(3);
// After each non-final step, currentRole names the role that runs next.
expect(results[0].status).toBe("idle");
expect(results[0].currentRole).toBe("developer");
expect(results[1].status).toBe("idle");
expect(results[1].currentRole).toBe("reviewer");
expect(results[2].status).toBe("completed");
expect(results[2].done).toBe(true);
// Verify the CAS chain holds 3 step nodes in the correct order.
const store = await openStore(casDir);
const n1 = getStepNode(store, results[0].head);
const n2 = getStepNode(store, results[1].head);
const n3 = getStepNode(store, results[2].head);
expect([n1.role, n2.role, n3.role]).toEqual(["analyst", "developer", "reviewer"]);
// The first step has no predecessor; each later step links to the one before it.
expect(n1.prev).toBeNull();
expect(n2.prev).toBe(results[0].head);
expect(n3.prev).toBe(results[1].head);
// All three steps reference the same start node.
expect(new Set([n1.start, n2.start, n3.start]).size).toBe(1);
const finalEntry = getThread((await createUwfStore(uwfHome)).varStore, threadId);
expect(finalEntry).not.toBeNull();
expect(finalEntry!.status).toBe("completed");
expect(finalEntry!.head).toBe(results[2].head);
});
// Scenario 6: templated edge prompt.
// The planner step runs first, then the worker step; the worker step node
// must store the edge prompt with the branch and repo path already substituted.
test("6. mustache edge prompt renders planner variables into the worker step", {
timeout: 30_000,
}, async () => {
await writeMockConfig("e2e-mustache.mock.yaml");
const workflowHash = await addWorkflow("e2e-mustache.workflow.yaml", "test-mustache");
const start = await cmdThreadStart(uwfHome, workflowHash, "Plan the task", uwfHome, tmpDir);
const threadId = start.thread;
// Step 1 → planner emits branch + repoPath.
const step1 = execStep(threadId);
expect(step1.status).toBe("idle");
expect(step1.currentRole).toBe("worker");
// Step 2 → worker; the moderator renders the templated edge prompt before spawning it.
const step2 = execStep(threadId);
expect(step2.done).toBe(true);
expect(step2.status).toBe("completed");
const store = await openStore(casDir);
const plannerStep = getStepNode(store, step1.head);
expect(getStatus(store, plannerStep.output)).toBe("ready");
// The worker step's edgePrompt is the mustache-rendered template.
const workerStep = getStepNode(store, step2.head);
expect(workerStep.role).toBe("worker");
// The two toContain checks pinpoint which variable is missing on failure;
// the final toBe pins the complete rendered string.
expect(workerStep.edgePrompt).toContain("fix/42-auth");
expect(workerStep.edgePrompt).toContain("/tmp/my-repo");
expect(workerStep.edgePrompt).toBe("Work on branch fix/42-auth in /tmp/my-repo");
});
// Scenario 7: a thread that already reached $END can be resumed. Resuming
// re-evaluates $START, so the planner runs a second time and the new step is
// chained onto the step that originally completed the thread.
test("7. completed thread can be resumed (衔尾蛇: end → start)", {
  timeout: 30_000,
}, async () => {
  // Reuse the suspend workflow (planner with ready → $END), but mock data
  // goes straight to ready on first run, then ready again after resume.
  await writeMockConfig("e2e-completed-resume.mock.yaml");
  const workflowHash = await addWorkflow("e2e-suspend.workflow.yaml", "test-suspend");
  const start = await cmdThreadStart(uwfHome, workflowHash, "Do the work", uwfHome, tmpDir);
  const threadId = start.thread;

  // Step 1: planner outputs ready → $END → thread completed.
  const step1 = execStep(threadId);
  expect(step1.done).toBe(true);
  expect(step1.status).toBe("completed");
  const uwf1 = await createUwfStore(uwfHome);
  const entry1 = getThread(uwf1.varStore, threadId);
  expect(entry1).not.toBeNull();
  expect(entry1!.status).toBe("completed");

  // Resume the completed thread — should re-evaluate $START → planner.
  const resumeResult = runResume(threadId, "Additional context for round 2");
  expect(resumeResult.exitCode).toBe(0);

  // After the resume step, planner ran again (step index 1 in mock) → ready → $END.
  // A fresh store handle is opened so the index is read after the resume.
  const uwf2 = await createUwfStore(uwfHome);
  const entry2 = getThread(uwf2.varStore, threadId);
  expect(entry2).not.toBeNull();
  expect(entry2!.status).toBe("completed");
  // Head must have advanced past the step that first completed the thread.
  expect(entry2!.head).not.toBe(step1.head);

  // CAS chain: the resumed step links back to step1 (chain is preserved across resume).
  const store = await openStore(casDir);
  const resumeOutput = JSON.parse(resumeResult.stdout.trim());
  const step2Node = getStepNode(store, resumeOutput.head);
  expect(step2Node.role).toBe("planner");
  expect(step2Node.prev).toBe(step1.head);
});
});
@@ -1,15 +0,0 @@
# Mock agent script: each entry under `steps` is one scripted agent turn, giving
# the role expected to run and the raw output (frontmatter + body) it returns.
# NOTE(review): presumably the e2e-completed-resume mock fixture — confirm the file name.
steps:
  # Step 0: planner → ready → $END (thread completes)
  - role: planner
    output: |
      ---
      $status: ready
      ---
      Initial plan complete.
  # Step 1: after resume, planner runs again from $START → ready → $END again
  - role: planner
    output: |
      ---
      $status: ready
      ---
      Revised plan after resume.
@@ -1,19 +0,0 @@
# Mock agent script with one scripted turn per role: analyst, then developer,
# then reviewer. Each `output` is the raw agent reply (frontmatter + body).
# NOTE(review): the statuses match the `test-count` workflow edges — confirm the pairing.
steps:
  - role: analyst
    output: |
      ---
      $status: analyzed
      ---
      Analysis complete.
  - role: developer
    output: |
      ---
      $status: implemented
      ---
      Implementation complete.
  - role: reviewer
    output: |
      ---
      $status: approved
      ---
      Approved.
@@ -1,45 +0,0 @@
# Workflow fixture: a strictly linear three-role pipeline.
# Each role's `frontmatter` schema admits exactly one `$status` value, and the
# graph maps that status to the next role until the reviewer routes to $END.
name: test-count
description: 3-step linear pipeline (analyst -> developer -> reviewer -> $END)
roles:
  analyst:
    description: Analyzes the task
    goal: Analyze the task
    capabilities: []
    procedure: Analyze it
    output: Output the analysis and set $status to analyzed
    frontmatter:
      oneOf:
        - properties:
            $status: { const: analyzed }
          required: [$status]
  developer:
    description: Implements the change
    goal: Implement the change
    capabilities: []
    procedure: Write code
    output: Output the implementation and set $status to implemented
    frontmatter:
      oneOf:
        - properties:
            $status: { const: implemented }
          required: [$status]
  reviewer:
    description: Reviews the change
    goal: Review the change
    capabilities: []
    procedure: Review code
    output: Approve and set $status to approved
    frontmatter:
      oneOf:
        - properties:
            $status: { const: approved }
          required: [$status]
# Edges are keyed by the source role, then by the `$status` it emitted.
graph:
  $START:
    _: { role: analyst, prompt: 'Analyze the task' }
  analyst:
    analyzed: { role: developer, prompt: 'Implement the change' }
  developer:
    implemented: { role: reviewer, prompt: 'Review the change' }
  reviewer:
    approved: { role: '$END', prompt: 'Done' }
@@ -1,15 +0,0 @@
# Mock agent script: the planner turn emits `branch` and `repoPath` in its
# frontmatter; the worker turn then reports `done`.
# NOTE(review): these values are the ones the mustache E2E test expects in the
# rendered worker prompt — confirm this is the e2e-mustache mock fixture.
steps:
  - role: planner
    output: |
      ---
      $status: ready
      branch: fix/42-auth
      repoPath: /tmp/my-repo
      ---
      Planned the work.
  - role: worker
    output: |
      ---
      $status: done
      ---
      Work complete.
@@ -1,34 +0,0 @@
# Workflow fixture: the planner must emit `branch` and `repoPath` alongside
# `$status: ready`; the planner → worker edge prompt is a mustache template
# that consumes both fields.
name: test-mustache
description: Planner emits template variables consumed by the worker edge prompt
roles:
  planner:
    description: Plans work and emits branch + repo path
    goal: Plan the task
    capabilities: []
    procedure: Decide the branch and repo path
    output: Set $status to ready and emit branch and repoPath
    frontmatter:
      oneOf:
        - properties:
            $status: { const: ready }
            branch: { type: string }
            repoPath: { type: string }
          required: [$status, branch, repoPath]
  worker:
    description: Works on the planned branch
    goal: Do the work
    capabilities: []
    procedure: Do it
    output: Output the result and set $status to done
    frontmatter:
      oneOf:
        - properties:
            $status: { const: done }
          required: [$status]
graph:
  $START:
    _: { role: planner, prompt: 'Plan the task' }
  planner:
    # Triple braces: values are inserted without HTML escaping.
    ready: { role: worker, prompt: 'Work on branch {{{branch}}} in {{{repoPath}}}' }
  worker:
    done: { role: '$END', prompt: 'Complete' }
@@ -1,14 +0,0 @@
# Mock agent script: the planner first reports `insufficient_info` (with the
# `reason` that status requires), then reports `ready` on its second turn.
steps:
  - role: planner
    output: |
      ---
      $status: insufficient_info
      reason: missing requirements
      ---
      I need more information before I can plan this.
  - role: planner
    output: |
      ---
      $status: ready
      ---
      I now have what I need. Ready to proceed.
@@ -1,24 +0,0 @@
# Workflow fixture: a single planner role with two possible outcomes.
# `insufficient_info` (which must carry a `reason`) routes to $SUSPEND;
# `ready` routes to $END.
name: test-suspend
description: Planner can suspend for more info or finish when ready
roles:
  planner:
    description: Plans work and may request more info
    goal: Analyze the task
    capabilities: []
    procedure: Analyze the task and decide if more info is needed
    output: Set $status to insufficient_info (with reason) or ready
    frontmatter:
      oneOf:
        - properties:
            $status: { const: insufficient_info }
            reason: { type: string }
          required: [$status, reason]
        - properties:
            $status: { const: ready }
          required: [$status]
graph:
  $START:
    _: { role: planner, prompt: 'Analyze the task' }
  planner:
    # The suspend prompt interpolates the planner's `reason` field.
    insufficient_info: { role: '$SUSPEND', prompt: 'Need more info: {{{reason}}}' }
    ready: { role: '$END', prompt: 'Done' }
@@ -8,10 +8,10 @@ const solveIssueGraph: WorkflowPayload["graph"] = {
_: { role: "planner", prompt: "Start planning from the issue in the task.", location: null },
},
planner: {
planned: { role: "developer", prompt: "Implement the plan: {{plan}}", location: null },
_: { role: "developer", prompt: "Implement the plan: {{plan}}", location: null },
},
developer: {
implemented: { role: "reviewer", prompt: "Review the changes: {{summary}}", location: null },
_: { role: "reviewer", prompt: "Review the changes: {{summary}}", location: null },
},
reviewer: {
approved: { role: "$END", prompt: "Done.", location: null },
@@ -112,7 +112,7 @@ describe("evaluate", () => {
test("mustache template rendering with simple fields", () => {
const result = evaluate(solveIssueGraph, "planner", {
$status: "planned",
$status: "_",
plan: "Add auth middleware",
});
expect(result).toEqual({
@@ -139,11 +139,11 @@ describe("evaluate", () => {
test("triple mustache also works for unescaped output", () => {
const graph: Record<string, Record<string, Target>> = {
reviewer: {
rejected: { role: "developer", prompt: "Fix: {{{comments}}}", location: null },
_: { role: "developer", prompt: "Fix: {{{comments}}}", location: null },
},
};
const result = evaluate(graph, "reviewer", {
$status: "rejected",
$status: "_",
comments: "<script>alert(1)</script>",
});
expect(result).toEqual({
@@ -152,22 +152,24 @@ describe("evaluate", () => {
});
});
test("missing $status → error (no unit fallback)", () => {
test("missing $status defaults to _ (unit routing)", () => {
const result = evaluate(solveIssueGraph, "planner", {
plan: "Add auth middleware",
});
expect(result.ok).toBe(false);
if (!result.ok) {
expect(result.error.message).toBe(
'agent output for role "planner" is missing required "$status" string',
);
}
expect(result).toEqual({
ok: true,
value: {
role: "developer",
prompt: "Implement the plan: Add auth middleware",
location: null,
},
});
});
test("mustache template with nested object paths", () => {
const graph: Record<string, Record<string, Target>> = {
reviewer: {
rejected: {
_: {
role: "developer",
prompt: "Address: {{review.comments}}",
location: null,
@@ -175,7 +177,7 @@ describe("evaluate", () => {
},
};
const result = evaluate(graph, "reviewer", {
$status: "rejected",
$status: "_",
review: { comments: "refactor the handler" },
});
expect(result).toEqual({
+40 -63
View File
@@ -6,124 +6,101 @@ import { describe, expect, test } from "vitest";
const __dirname = dirname(fileURLToPath(import.meta.url));
import {
cmdPromptAdapterDeveloping,
cmdPromptBootstrap,
cmdPromptAdapter,
cmdPromptAuthor,
cmdPromptDeveloper,
cmdPromptList,
cmdPromptSetup,
cmdPromptUsage,
cmdPromptUsageReference,
cmdPromptWorkflowAuthoring,
cmdPromptUser,
} from "../commands/prompt.js";
describe("prompt commands", () => {
test("prompt list returns new prompt names", () => {
test("prompt list returns all prompt names", () => {
const result = cmdPromptList();
expect(result).toBeInstanceOf(Array);
expect(result).toContain("usage");
expect(result).toContain("workflow-authoring");
expect(result).toContain("adapter-developing");
expect(result).toContain("bootstrap");
expect(result).not.toContain("user");
expect(result).not.toContain("author");
expect(result).not.toContain("developer");
expect(result).not.toContain("adapter");
expect(result).toContain("user");
expect(result).toContain("author");
expect(result).toContain("developer");
expect(result).toContain("adapter");
for (const name of result) {
expect(name).toMatch(/^\S+$/);
}
});
test("prompt usage-reference returns non-empty markdown string with frontmatter", () => {
const result = cmdPromptUsageReference();
test("prompt user returns non-empty markdown string", () => {
const result = cmdPromptUser();
expect(typeof result).toBe("string");
expect(result).toContain("uwf");
expect(result).toContain("thread");
expect(result).toContain("workflow");
expect(result).toContain("Quick Start");
expect(result).toContain("---");
expect(result).toContain("name:");
expect(result).toContain("version:");
expect(result.length).toBeGreaterThan(500);
});
test("prompt workflow-authoring returns non-empty markdown string with frontmatter", () => {
const result = cmdPromptWorkflowAuthoring();
test("prompt author returns non-empty markdown string", () => {
const result = cmdPromptAuthor();
expect(typeof result).toBe("string");
expect(result).toContain("frontmatter");
expect(result).toContain("graph");
expect(result).toContain("$START");
expect(result).toContain("$END");
expect(result).toContain("$status");
expect(result).toContain("---");
expect(result).toContain("name:");
expect(result).toContain("version:");
expect(result.length).toBeGreaterThan(500);
});
test("prompt adapter-developing returns non-empty markdown string with frontmatter", () => {
const result = cmdPromptAdapterDeveloping();
test("prompt developer returns non-empty markdown string", () => {
const result = cmdPromptDeveloper();
expect(typeof result).toBe("string");
expect(result).toContain("Monorepo");
expect(result).toContain("CAS");
expect(result).toContain("Biome");
expect(result.length).toBeGreaterThan(500);
});
test("prompt adapter returns non-empty markdown string", () => {
const result = cmdPromptAdapter();
expect(typeof result).toBe("string");
expect(result).toContain("createAgent");
expect(result).toContain("AgentContext");
expect(result).toContain("frontmatter");
expect(result).toContain("---");
expect(result).toContain("name:");
expect(result).toContain("version:");
expect(result.length).toBeGreaterThan(500);
});
test("prompt bootstrap returns non-empty skill with frontmatter", () => {
const result = cmdPromptBootstrap();
expect(typeof result).toBe("string");
expect(result).toContain("uwf");
expect(result).toContain("---");
expect(result.length).toBeGreaterThan(100);
});
test("prompt usage combines remaining references (no developer)", () => {
test("prompt usage combines all references", () => {
const result = cmdPromptUsage();
expect(typeof result).toBe("string");
expect(result).toContain("Usage Reference");
expect(result).toContain("Workflow Authoring Reference");
expect(result).toContain("Adapter Developing Reference");
expect(result).not.toContain("Developer Reference");
expect(result).toContain("User Reference");
expect(result).toContain("Author Reference");
expect(result).toContain("Developer Reference");
expect(result).toContain("Adapter Reference");
expect(result).toContain("---");
expect(result.length).toBeGreaterThan(2000);
});
test("prompt setup returns simplified setup instructions", () => {
test("prompt setup returns setup instructions", () => {
const result = cmdPromptSetup();
expect(typeof result).toBe("string");
expect(result).toContain("uwf Skill Setup");
expect(result).toContain("uwf prompt bootstrap");
expect(result).toContain("uwf prompt usage");
expect(result).toContain("uwf prompt setup");
expect(result).toContain("SKILL.md");
expect(result).toContain("version");
expect(result).not.toMatch(/\bbun (install|run|test|changeset|version|release)\b/);
});
test("prompt setup references new subcommand names", () => {
const result = cmdPromptSetup();
expect(result).toContain("uwf prompt usage");
expect(result).toContain("uwf prompt workflow-authoring");
expect(result).toContain("uwf prompt adapter-developing");
expect(result).not.toContain("uwf prompt user");
expect(result).not.toContain("uwf prompt author");
expect(result).not.toContain("uwf prompt developer");
expect(result).not.toMatch(/uwf prompt adapter\b(?!-developing)/);
});
test("prompt help subcommand is suppressed", { timeout: 30_000 }, () => {
const cliPath = join(__dirname, "..", "..", "dist", "cli.js");
const output = execFileSync("node", [cliPath, "prompt", "--help"], {
test("prompt help subcommand is suppressed", () => {
const output = execFileSync("npx", ["tsx", "src/cli.ts", "prompt", "--help"], {
cwd: join(__dirname, "..", ".."),
encoding: "utf-8",
env: { ...process.env },
env: { ...process.env, PATH: `/opt/homebrew/bin:${process.env.PATH}` },
});
expect(output).not.toMatch(/help\s+\[command\]/i);
expect(output).toContain("usage");
expect(output).toContain("setup");
expect(output).toContain("workflow-authoring");
expect(output).toContain("adapter-developing");
expect(output).toContain("bootstrap");
expect(output).toContain("user");
expect(output).toContain("author");
expect(output).toContain("developer");
expect(output).toContain("adapter");
expect(output).toContain("list");
expect(output).not.toContain("developer");
});
});
@@ -4,7 +4,7 @@ import { join } from "node:path";
import { type CasRef, createThreadIndexEntry, type ThreadId } from "@united-workforce/protocol";
import { afterEach, beforeEach, describe, expect, test } from "vitest";
import { resolveHeadHash } from "../commands/shared.js";
import { completeThread, createUwfStore, setThread } from "../store.js";
import { addHistoryEntry, createUwfStore, setThread } from "../store.js";
let tmpDir: string;
@@ -31,13 +31,19 @@ describe("resolveHeadHash", () => {
expect(result).toBe(headHash);
});
test("finds completed thread", async () => {
test("falls back to history variable when thread not in active index", async () => {
const threadId = "01JTEST0000000000000000002" as ThreadId;
const workflowHash = "workflow_hash_789" as CasRef;
const uwf = await createUwfStore(tmpDir);
const headHash = (await uwf.store.cas.put(uwf.schemas.text, "completed-head")) as CasRef;
setThread(uwf.varStore, threadId, createThreadIndexEntry(headHash));
completeThread(uwf.varStore, threadId, "completed");
addHistoryEntry(uwf.varStore, {
thread: threadId,
workflow: workflowHash,
head: headHash,
completedAt: Date.now(),
reason: null,
});
const result = await resolveHeadHash(tmpDir, threadId);
@@ -48,36 +54,58 @@ describe("resolveHeadHash", () => {
// calls fail() which does process.exit(1), terminating the test runner.
// The error behavior is tested in integration tests below via CLI invocation.
test("prioritizes active thread", async () => {
test("prioritizes active thread over history when thread exists in both", async () => {
const threadId = "01JTEST0000000000000000004" as ThreadId;
const workflowHash = "workflow_hash_xyz" as CasRef;
const uwf = await createUwfStore(tmpDir);
const activeHead = (await uwf.store.cas.put(uwf.schemas.text, "active-v2")) as CasRef;
const historicalHash = (await uwf.store.cas.put(uwf.schemas.text, "historical-v1")) as CasRef;
setThread(uwf.varStore, threadId, createThreadIndexEntry(activeHead));
addHistoryEntry(uwf.varStore, {
thread: threadId,
workflow: workflowHash,
head: historicalHash,
completedAt: Date.now(),
reason: null,
});
const result = await resolveHeadHash(tmpDir, threadId);
// Should return the active head
// Should return the active head, not the historical one
expect(result).toBe(activeHead);
});
test("finds thread from multiple completed threads", async () => {
test("finds thread from multiple history entries", async () => {
const threadId1 = "01JTEST0000000000000000005" as ThreadId;
const threadId2 = "01JTEST0000000000000000006" as ThreadId;
const threadId3 = "01JTEST0000000000000000007" as ThreadId;
const workflowHash = "workflow_hash_abc" as CasRef;
const uwf = await createUwfStore(tmpDir);
const hash1 = (await uwf.store.cas.put(uwf.schemas.text, "hash-thread1")) as CasRef;
const hash2 = (await uwf.store.cas.put(uwf.schemas.text, "hash-thread2")) as CasRef;
const hash3 = (await uwf.store.cas.put(uwf.schemas.text, "hash-thread3")) as CasRef;
setThread(uwf.varStore, threadId1, createThreadIndexEntry(hash1));
completeThread(uwf.varStore, threadId1, "completed");
setThread(uwf.varStore, threadId2, createThreadIndexEntry(hash2));
completeThread(uwf.varStore, threadId2, "completed");
setThread(uwf.varStore, threadId3, createThreadIndexEntry(hash3));
completeThread(uwf.varStore, threadId3, "completed");
addHistoryEntry(uwf.varStore, {
thread: threadId1,
workflow: workflowHash,
head: hash1,
completedAt: Date.now() - 2000,
reason: null,
});
addHistoryEntry(uwf.varStore, {
thread: threadId2,
workflow: workflowHash,
head: hash2,
completedAt: Date.now() - 1000,
reason: null,
});
addHistoryEntry(uwf.varStore, {
thread: threadId3,
workflow: workflowHash,
head: hash3,
completedAt: Date.now(),
reason: null,
});
const result = await resolveHeadHash(tmpDir, threadId2);
@@ -118,7 +118,6 @@ async function createTestStep(
completedAtMs: Date.now() + 1000,
assembledPrompt: null,
cwd: "/tmp",
usage: null,
};
return store.cas.put(schemas.stepNode, stepPayload);
}
@@ -96,7 +96,6 @@ describe("protocol types", () => {
completedAtMs: 2000,
assembledPrompt: null,
cwd: "/test/path",
usage: null,
};
expect(record.startedAtMs).toBe(1000);
expect(record.completedAtMs).toBe(2000);
@@ -111,7 +110,6 @@ describe("protocol types", () => {
agent: "uwf-test",
timestamp: 123,
durationMs: 5000,
usage: null,
};
expect(entry.durationMs).toBe(5000);
});
@@ -254,7 +252,7 @@ describe("thread read timing", () => {
},
graph: {
$START: { _: { role: "worker", prompt: "go", location: null } },
worker: { done: { role: "$END", prompt: "", location: null } },
worker: { _: { role: "$END", prompt: "", location: null } },
},
});
@@ -320,7 +318,7 @@ describe("thread read timing", () => {
},
graph: {
$START: { _: { role: "worker", prompt: "go", location: null } },
worker: { done: { role: "$END", prompt: "", location: null } },
worker: { _: { role: "$END", prompt: "", location: null } },
},
});
@@ -226,15 +226,19 @@ describe("Global CAS directory", () => {
const uwf = await createUwfStore(storageRoot);
const threadId = "thread-123" as ThreadId;
const headHash = await uwf.store.cas.put(uwf.schemas.text, "history-head");
const { completeThread, setThread, getThread } = await import("../store.js");
const { createThreadIndexEntry } = await import("@united-workforce/protocol");
const { addHistoryEntry, findHistoryEntry } = await import("../store.js");
addHistoryEntry(uwf.varStore, {
thread: threadId,
workflow: "workflow-456",
head: headHash,
completedAt: Date.now(),
reason: "completed",
});
setThread(uwf.varStore, threadId, createThreadIndexEntry(headHash));
completeThread(uwf.varStore, threadId, "completed");
const entry = getThread(uwf.varStore, threadId);
const entry = findHistoryEntry(uwf.varStore, threadId);
expect(entry?.thread).toBe(threadId);
expect(entry?.workflow).toBe("workflow-456");
expect(entry?.head).toBe(headHash);
expect(entry?.status).toBe("completed");
const { access } = await import("node:fs/promises");
await access(join(globalCasDir, "vars"));
@@ -270,12 +274,15 @@ describe("Global CAS directory", () => {
);
const uwf = await createUwfStore(storageRoot);
const { getThread } = await import("../store.js");
const entry = getThread(uwf.varStore, threadId);
expect(entry).not.toBeNull();
expect(entry?.head).toBe(headHash);
expect(entry?.status).toBe("cancelled");
expect(entry?.completedAt).toBe(completedAt);
const { findHistoryEntry } = await import("../store.js");
const entry = findHistoryEntry(uwf.varStore, threadId);
expect(entry).toEqual({
thread: threadId,
workflow: workflowHash,
head: headHash,
completedAt,
reason: "cancelled",
});
await expect(access(historyPath)).rejects.toThrow();
const migratedContent = await readFile(`${historyPath}.migrated`, "utf8");
@@ -1,235 +0,0 @@
import { mkdir, mkdtemp } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join } from "node:path";
import type { CasRef, ThreadId } from "@united-workforce/protocol";
import { describe, expect, test } from "vitest";
import {
completeThread,
createUwfStore,
getThread,
loadActiveThreads,
loadHistoryThreads,
setThread,
} from "../store.js";
/**
 * Opens a uwf store rooted at `storageRoot` for a test.
 *
 * Creates `<storageRoot>/cas` and exports it as `OCAS_HOME` before the store is
 * created. Note that the environment variable is left set after the call.
 */
async function makeUwfStore(storageRoot: string) {
  const casHome = join(storageRoot, "cas");
  await mkdir(casHome, { recursive: true });
  // NOTE(review): presumably createUwfStore reads OCAS_HOME to locate the CAS — confirm.
  process.env.OCAS_HOME = casHome;
  const uwfStore = createUwfStore(storageRoot);
  return uwfStore;
}
/**
 * Stores `label` as a text node in the store's CAS and returns the resulting
 * reference, for use as a thread head in tests.
 */
async function seedThreadHead(
  uwf: Awaited<ReturnType<typeof createUwfStore>>,
  label: string,
): Promise<CasRef> {
  const ref = await uwf.store.cas.put(uwf.schemas.text, label);
  return ref as CasRef;
}
// Covers the thread index API: filtering by status (loadActiveThreads /
// loadHistoryThreads), terminal transitions via completeThread, and round-tripping
// of the status / completedAt fields through setThread → getThread.
describe("unified thread storage", () => {
  /** Entry shape accepted by `setThread`. */
  type ThreadEntry = Parameters<typeof setThread>[2];

  /** Opens a store rooted in a fresh temp directory whose name starts with `prefix`. */
  async function freshStore(prefix: string) {
    const tmpDir = await mkdtemp(join(tmpdir(), prefix));
    return makeUwfStore(tmpDir);
  }

  /**
   * Builds a thread entry with no suspend metadata and no completion time;
   * individual fields can be replaced through `overrides`.
   */
  function makeEntry(
    head: CasRef,
    status: ThreadEntry["status"],
    overrides: Partial<ThreadEntry> = {},
  ): ThreadEntry {
    return {
      head,
      status,
      suspendedRole: null,
      suspendMessage: null,
      completedAt: null,
      ...overrides,
    };
  }

  test("loadActiveThreads excludes completed threads", async () => {
    const uwf = await freshStore("uwf-active-test-");
    const threadId1 = "01JTEST000000000000ACTIVE1" as ThreadId;
    const threadId2 = "01JTEST000000000000ACTIVE2" as ThreadId;
    const head1 = await seedThreadHead(uwf, "active-head");
    const head2 = await seedThreadHead(uwf, "completed-head");

    setThread(uwf.varStore, threadId1, makeEntry(head1, "idle"));
    setThread(uwf.varStore, threadId2, makeEntry(head2, "completed", { completedAt: Date.now() }));

    const active = loadActiveThreads(uwf.varStore);
    expect(Object.keys(active)).toHaveLength(1);
    expect(active[threadId1]).toBeDefined();
    expect(active[threadId2]).toBeUndefined();
  });

  test("loadActiveThreads excludes cancelled threads", async () => {
    const uwf = await freshStore("uwf-active-test-");
    const threadId1 = "01JTEST000000000000ACTIVE3" as ThreadId;
    const threadId2 = "01JTEST000000000000ACTIVE4" as ThreadId;
    const head1 = await seedThreadHead(uwf, "active-head");
    const head2 = await seedThreadHead(uwf, "cancelled-head");

    setThread(uwf.varStore, threadId1, makeEntry(head1, "idle"));
    setThread(uwf.varStore, threadId2, makeEntry(head2, "cancelled", { completedAt: Date.now() }));

    const active = loadActiveThreads(uwf.varStore);
    expect(Object.keys(active)).toHaveLength(1);
    expect(active[threadId1]).toBeDefined();
    expect(active[threadId2]).toBeUndefined();
  });

  test("loadHistoryThreads only returns completed and cancelled", async () => {
    const uwf = await freshStore("uwf-history-test-");
    const threadId1 = "01JTEST000000000000HISTOR1" as ThreadId;
    const threadId2 = "01JTEST000000000000HISTOR2" as ThreadId;
    const threadId3 = "01JTEST000000000000HISTOR3" as ThreadId;
    const head1 = await seedThreadHead(uwf, "active-head");
    const head2 = await seedThreadHead(uwf, "completed-head");
    const head3 = await seedThreadHead(uwf, "cancelled-head");

    setThread(uwf.varStore, threadId1, makeEntry(head1, "idle"));
    setThread(uwf.varStore, threadId2, makeEntry(head2, "completed", { completedAt: Date.now() }));
    setThread(uwf.varStore, threadId3, makeEntry(head3, "cancelled", { completedAt: Date.now() }));

    const history = loadHistoryThreads(uwf.varStore);
    expect(Object.keys(history)).toHaveLength(2);
    expect(history[threadId1]).toBeUndefined();
    expect(history[threadId2]).toBeDefined();
    expect(history[threadId3]).toBeDefined();
  });

  test("completeThread marks thread as completed", async () => {
    const uwf = await freshStore("uwf-complete-test-");
    const threadId = "01JTEST000000000000COMPLE1" as ThreadId;
    const head = await seedThreadHead(uwf, "active-head");
    setThread(uwf.varStore, threadId, makeEntry(head, "idle"));

    completeThread(uwf.varStore, threadId, "completed");

    const entry = getThread(uwf.varStore, threadId);
    expect(entry).not.toBeNull();
    expect(entry?.status).toBe("completed");
    // completedAt starts out as null, so require an actual number rather than
    // mere definedness (which null would also satisfy).
    expect(entry?.completedAt).toEqual(expect.any(Number));
    expect(entry?.completedAt).toBeGreaterThan(0);
  });

  test("completeThread marks thread as cancelled", async () => {
    const uwf = await freshStore("uwf-complete-test-");
    const threadId = "01JTEST000000000000COMPLE2" as ThreadId;
    const head = await seedThreadHead(uwf, "active-head");
    setThread(uwf.varStore, threadId, makeEntry(head, "idle"));

    completeThread(uwf.varStore, threadId, "cancelled");

    const entry = getThread(uwf.varStore, threadId);
    expect(entry).not.toBeNull();
    expect(entry?.status).toBe("cancelled");
    // Same reasoning as above: null must not count as "set".
    expect(entry?.completedAt).toEqual(expect.any(Number));
    expect(entry?.completedAt).toBeGreaterThan(0);
  });

  test("completeThread clears suspend metadata", async () => {
    const uwf = await freshStore("uwf-complete-test-");
    const threadId = "01JTEST000000000000COMPLE3" as ThreadId;
    const head = await seedThreadHead(uwf, "suspended-head");
    setThread(
      uwf.varStore,
      threadId,
      makeEntry(head, "suspended", { suspendedRole: "test-role", suspendMessage: "test message" }),
    );

    completeThread(uwf.varStore, threadId, "completed");

    const entry = getThread(uwf.varStore, threadId);
    expect(entry).not.toBeNull();
    expect(entry?.status).toBe("completed");
    expect(entry?.suspendedRole).toBeNull();
    expect(entry?.suspendMessage).toBeNull();
  });

  test("completeThread handles non-existent thread gracefully", async () => {
    const uwf = await freshStore("uwf-complete-test-");
    const threadId = "01JTEST000000000000NOEXIST" as ThreadId;

    // Should not throw
    completeThread(uwf.varStore, threadId, "completed");

    // ...and must not create an entry as a side effect.
    const entry = getThread(uwf.varStore, threadId);
    expect(entry).toBeNull();
  });

  test("status and completedAt tags are persisted and loaded", async () => {
    const uwf = await freshStore("uwf-tags-test-");
    const threadId = "01JTEST000000000000TAGTEST" as ThreadId;
    const head = await seedThreadHead(uwf, "test-head");
    const now = Date.now();
    setThread(uwf.varStore, threadId, makeEntry(head, "completed", { completedAt: now }));

    const entry = getThread(uwf.varStore, threadId);
    expect(entry).not.toBeNull();
    expect(entry?.status).toBe("completed");
    expect(entry?.completedAt).toBe(now);
  });
});
@@ -3,13 +3,7 @@ import { tmpdir } from "node:os";
import { join } from "node:path";
import type { CasRef, ThreadId } from "@united-workforce/protocol";
import { describe, expect, test } from "vitest";
import {
completeThread,
createUwfStore,
getThread,
loadHistoryThreads,
setThread,
} from "../store.js";
import { addHistoryEntry, createUwfStore, loadAllHistory } from "../store.js";
async function makeUwfStore(storageRoot: string) {
const casDir = join(storageRoot, "cas");
@@ -26,113 +20,88 @@ async function seedHistoryHead(
}
describe("thread cancel status", () => {
test("cancelled thread has status 'cancelled'", async () => {
test("cancelled history entry has reason 'cancelled'", async () => {
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-cancel-test-"));
const threadId = "01JTEST000000000000CANCEL1" as ThreadId;
const uwf = await makeUwfStore(tmpDir);
const head = await seedHistoryHead(uwf, "cancelled-head");
setThread(uwf.varStore, threadId, {
addHistoryEntry(uwf.varStore, {
thread: threadId,
workflow: "test-workflow",
head,
status: "idle",
suspendedRole: null,
suspendMessage: null,
completedAt: null,
completedAt: Date.now(),
reason: "cancelled",
});
completeThread(uwf.varStore, threadId, "cancelled");
const entry = getThread(uwf.varStore, threadId);
expect(entry).not.toBeNull();
expect(entry?.status).toBe("cancelled");
const history = loadAllHistory(uwf.varStore);
expect(history).toHaveLength(1);
expect(history[0]?.reason).toBe("cancelled");
});
test("completed thread has status 'completed'", async () => {
test("completed history entry has reason 'completed'", async () => {
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-cancel-test-"));
const threadId = "01JTEST000000000000CANCEL2" as ThreadId;
const uwf = await makeUwfStore(tmpDir);
const head = await seedHistoryHead(uwf, "completed-head");
setThread(uwf.varStore, threadId, {
addHistoryEntry(uwf.varStore, {
thread: threadId,
workflow: "test-workflow",
head,
status: "idle",
suspendedRole: null,
suspendMessage: null,
completedAt: null,
completedAt: Date.now(),
reason: "completed",
});
completeThread(uwf.varStore, threadId, "completed");
const entry = getThread(uwf.varStore, threadId);
expect(entry).not.toBeNull();
expect(entry?.status).toBe("completed");
const history = loadAllHistory(uwf.varStore);
expect(history).toHaveLength(1);
expect(history[0]?.reason).toBe("completed");
});
test("loadHistoryThreads returns completed and cancelled", async () => {
test("history entry with null reason is stored as completed", async () => {
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-cancel-test-"));
const threadId = "01JTEST000000000000CANCEL3" as ThreadId;
const uwf = await makeUwfStore(tmpDir);
const head = await seedHistoryHead(uwf, "legacy-head");
addHistoryEntry(uwf.varStore, {
thread: threadId,
workflow: "test-workflow",
head,
completedAt: Date.now(),
reason: null,
});
const history = loadAllHistory(uwf.varStore);
expect(history).toHaveLength(1);
expect(history[0]?.reason).toBe("completed");
});
test("mixed completed and cancelled entries preserve distinct reasons", async () => {
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-cancel-test-"));
const uwf = await makeUwfStore(tmpDir);
const head1 = await seedHistoryHead(uwf, "head1");
const head2 = await seedHistoryHead(uwf, "head2");
const threadId1 = "01JTEST000000000000CANCEL4" as ThreadId;
setThread(uwf.varStore, threadId1, {
addHistoryEntry(uwf.varStore, {
thread: "01JTEST000000000000CANCEL4" as ThreadId,
workflow: "test-workflow",
head: head1,
status: "idle",
suspendedRole: null,
suspendMessage: null,
completedAt: null,
completedAt: Date.now(),
reason: "completed",
});
completeThread(uwf.varStore, threadId1, "completed");
const threadId2 = "01JTEST000000000000CANCEL5" as ThreadId;
setThread(uwf.varStore, threadId2, {
addHistoryEntry(uwf.varStore, {
thread: "01JTEST000000000000CANCEL5" as ThreadId,
workflow: "test-workflow",
head: head2,
status: "idle",
suspendedRole: null,
suspendMessage: null,
completedAt: null,
completedAt: Date.now(),
reason: "cancelled",
});
completeThread(uwf.varStore, threadId2, "cancelled");
const history = loadHistoryThreads(uwf.varStore);
expect(Object.keys(history)).toHaveLength(2);
const statuses = Object.values(history)
.map((entry) => entry.status)
.sort();
expect(statuses).toEqual(["cancelled", "completed"]);
});
test("mixed completed and cancelled entries preserve distinct statuses", async () => {
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-cancel-test-"));
const uwf = await makeUwfStore(tmpDir);
const head1 = await seedHistoryHead(uwf, "head1");
const head2 = await seedHistoryHead(uwf, "head2");
const threadId1 = "01JTEST000000000000CANCEL6" as ThreadId;
setThread(uwf.varStore, threadId1, {
head: head1,
status: "idle",
suspendedRole: null,
suspendMessage: null,
completedAt: null,
});
completeThread(uwf.varStore, threadId1, "completed");
const threadId2 = "01JTEST000000000000CANCEL7" as ThreadId;
setThread(uwf.varStore, threadId2, {
head: head2,
status: "idle",
suspendedRole: null,
suspendMessage: null,
completedAt: null,
});
completeThread(uwf.varStore, threadId2, "cancelled");
const history = loadHistoryThreads(uwf.varStore);
expect(Object.keys(history)).toHaveLength(2);
const statuses = Object.values(history)
.map((entry) => entry.status)
.sort();
expect(statuses).toEqual(["cancelled", "completed"]);
const history = loadAllHistory(uwf.varStore);
expect(history).toHaveLength(2);
const reasons = history.map((entry) => entry.reason).sort();
expect(reasons).toEqual(["cancelled", "completed"]);
});
});
@@ -10,8 +10,9 @@ import { cmdThreadList } from "../commands/thread.js";
import { parseTimeInput } from "../commands/thread-time-parser.js";
import type { UwfStore } from "../store.js";
import {
completeThread as completeThreadInStore,
addHistoryEntry,
createUwfStore,
deleteThread,
loadAllThreads,
setThread,
} from "../store.js";
@@ -72,11 +73,18 @@ async function markThreadRunning(storageRoot: string, threadId: ThreadId, workfl
async function completeThread(
storageRoot: string,
threadId: ThreadId,
_workflowHash: CasRef,
_headHash: CasRef,
workflowHash: CasRef,
headHash: CasRef,
) {
const uwfIdx = await createUwfStore(storageRoot);
completeThreadInStore(uwfIdx.varStore, threadId, "completed");
deleteThread(uwfIdx.varStore, threadId);
addHistoryEntry(uwfIdx.varStore, {
thread: threadId,
workflow: workflowHash,
head: headHash,
completedAt: Date.now(),
reason: null,
});
}
// ── test setup ────────────────────────────────────────────────────────────────
@@ -492,10 +500,8 @@ describe("edge cases", () => {
)) as CasRef;
index["INVALID_ULID_FORMAT_HERE" as ThreadId] = {
head: placeholderHead,
status: "idle",
suspendedRole: null,
suspendMessage: null,
completedAt: null,
};
for (const [tid, ent] of Object.entries(index)) {
setThread(uwfIdx.varStore, tid as ThreadId, ent);
@@ -54,7 +54,7 @@ roles:
type: object
required: ["$status"]
properties:
$status: { type: string, enum: ["ready"] }
$status: { type: string }
graph:
$START:
_:
@@ -62,7 +62,7 @@ graph:
prompt: "Plan the work"
location: null
planner:
ready:
_:
role: $END
prompt: "Done"
location: null
@@ -110,7 +110,7 @@ roles:
type: object
required: ["$status"]
properties:
$status: { type: string, enum: ["ready"] }
$status: { type: string }
graph:
$START:
_:
@@ -118,7 +118,7 @@ graph:
prompt: "Plan"
location: null
planner:
ready:
_:
role: $END
prompt: "Done"
location: null
@@ -153,7 +153,7 @@ roles:
type: object
required: ["$status"]
properties:
$status: { type: string, enum: ["ready"] }
$status: { type: string }
graph:
$START:
_:
@@ -161,7 +161,7 @@ graph:
prompt: "Plan"
location: null
planner:
ready:
_:
role: $END
prompt: "Done"
location: null
@@ -79,7 +79,7 @@ async function setupSuspendedThread(mode: MockAgentMode): Promise<{
},
ok: { role: "reviewer", prompt: "Review the work", location: null },
},
reviewer: { done: { role: "$END", prompt: "Done", location: null } },
reviewer: { _: { role: "$END", prompt: "Done", location: null } },
},
});
@@ -118,10 +118,8 @@ async function setupSuspendedThread(mode: MockAgentMode): Promise<{
await seedThreads(tmpDir, {
[THREAD_ID]: {
head: stepHash,
status: "suspended",
suspendedRole: "worker",
suspendMessage: SUSPEND_MESSAGE,
completedAt: null,
},
});
@@ -234,7 +232,7 @@ describe("uwf thread resume", () => {
},
graph: {
$START: { _: { role: "worker", prompt: "Start", location: null } },
worker: { done: { role: "$END", prompt: "Done", location: null } },
worker: { _: { role: "$END", prompt: "Done", location: null } },
},
});
@@ -249,7 +247,7 @@ describe("uwf thread resume", () => {
const result = runUwf(["thread", "resume", THREAD_ID], casDir);
expect(result.status).not.toBe(0);
expect(result.stderr).toContain("thread cannot be resumed");
expect(result.stderr).toContain("thread is not suspended");
});
test("resume suspended thread executes step and becomes idle", async () => {
@@ -349,10 +347,8 @@ describe("uwf thread resume", () => {
const uwfAfterFirst = await createUwfStore(tmpDir);
expect(getThread(uwfAfterFirst.varStore, THREAD_ID)).toEqual({
head: firstResume.head,
status: "suspended",
suspendedRole: "worker",
suspendMessage: SUSPEND_MESSAGE,
completedAt: null,
});
const { mockAgentPath: okMockAgentPath } = await setupOkMockAgent(
@@ -448,263 +444,3 @@ echo '${adapterJson}'
return { mockAgentPath };
}
describe("uwf thread resume - completed threads", () => {
test("resume completed thread starts from $START role", async () => {
const casDir = join(tmpDir, "cas");
await mkdir(casDir, { recursive: true });
const store = await openStore(casDir);
const schemas = await registerUwfSchemas(store);
const outputSchemaHash = await putSchema(store, OUTPUT_SCHEMA);
const workflowHash = await store.cas.put(schemas.workflow, {
name: "test-completed-resume",
description: "completed thread resume test",
roles: {
worker: {
description: "Worker role",
goal: "Work",
capabilities: [],
procedure: "work",
output: "result",
frontmatter: outputSchemaHash,
},
reviewer: {
description: "Reviewer role",
goal: "Review",
capabilities: [],
procedure: "review",
output: "result",
frontmatter: outputSchemaHash,
},
},
graph: {
$START: { _: { role: "worker", prompt: "Start work", location: null } },
worker: { done: { role: "reviewer", prompt: "Review the work", location: null } },
reviewer: { done: { role: "$END", prompt: "Done", location: null } },
},
});
const startHash = await store.cas.put(schemas.startNode, {
workflow: workflowHash,
prompt: "Initial task",
cwd: tmpDir,
});
process.env.OCAS_HOME = casDir;
const workerOutputHash = await store.cas.put(outputSchemaHash, { $status: "done" });
const reviewerOutputHash = await store.cas.put(outputSchemaHash, { $status: "done" });
const detailHash = await store.cas.put(schemas.text, "mock detail");
const workerStepHash = await store.cas.put(schemas.stepNode, {
start: startHash,
prev: null,
role: "worker",
output: workerOutputHash,
detail: detailHash,
agent: "uwf-mock",
edgePrompt: "Start work",
startedAtMs: 1716600000000,
completedAtMs: 1716600001000,
cwd: tmpDir,
assembledPrompt: null,
});
const reviewerStepHash = await store.cas.put(schemas.stepNode, {
start: startHash,
prev: workerStepHash,
role: "reviewer",
output: reviewerOutputHash,
detail: detailHash,
agent: "uwf-mock",
edgePrompt: "Review the work",
startedAtMs: 1716600001000,
completedAtMs: 1716600002000,
cwd: tmpDir,
assembledPrompt: null,
});
await seedThreads(tmpDir, {
[THREAD_ID]: {
head: reviewerStepHash,
status: "completed",
suspendedRole: null,
suspendMessage: null,
completedAt: 1716600002000,
},
});
// Verify the status was actually set
const { createUwfStore, getThread } = await import("../store.js");
const verifyUwf = await createUwfStore(tmpDir);
const verifyEntry = getThread(verifyUwf.varStore, THREAD_ID);
console.log("Seeded entry status:", verifyEntry?.status);
console.log("Seeded entry:", JSON.stringify(verifyEntry, null, 2));
const promptCapturePath = join(tmpDir, "captured-prompt-completed.txt");
const mockAgentPath = join(tmpDir, "mock-agent-completed.sh");
const newWorkerStepHash = await store.cas.put(schemas.stepNode, {
start: startHash,
prev: reviewerStepHash,
role: "worker",
output: workerOutputHash,
detail: detailHash,
agent: "uwf-mock",
edgePrompt: "Start work",
startedAtMs: 1716600003000,
completedAtMs: 1716600004000,
cwd: tmpDir,
assembledPrompt: null,
});
const adapterJson = JSON.stringify({
stepHash: newWorkerStepHash,
detailHash,
role: "worker",
frontmatter: { $status: "done" },
body: "",
startedAtMs: 1716600003000,
completedAtMs: 1716600004000,
});
await writeFile(
mockAgentPath,
`#!/bin/sh
prompt=""
while [ $# -gt 0 ]; do
if [ "$1" = "--prompt" ]; then
prompt="$2"
shift 2
else
shift
fi
done
printf '%s' "$prompt" > '${promptCapturePath}'
echo '${adapterJson}'
`,
{ mode: 0o755 },
);
const configPath = join(tmpDir, "config.yaml");
await writeFile(
configPath,
`defaultAgent: uwf-hermes\ndefaultModel: test-model\nagentOverrides: null\nagents: {}\nproviders: {}\nmodels: {}\n`,
);
const result = runUwf(
["thread", "resume", THREAD_ID, "-p", "Additional context", "--agent", mockAgentPath],
casDir,
);
if (result.status !== 0) {
console.error("Command failed:", result.stderr);
}
expect(result.status).toBe(0);
const cliOutput = JSON.parse(result.stdout.trim());
expect(cliOutput.status).toBe("idle");
expect(cliOutput.currentRole).toBe("reviewer");
expect(cliOutput.done).toBe(false);
const capturedPrompt = await readFile(promptCapturePath, "utf8");
expect(capturedPrompt).toContain("Previous run completed");
expect(capturedPrompt).toContain("Additional context");
const storeModule = await import("../store.js");
const uwf2 = await storeModule.createUwfStore(tmpDir);
const entry2 = storeModule.getThread(uwf2.varStore, THREAD_ID);
expect(entry2?.status).toBe("idle");
expect(entry2?.completedAt).toBeNull();
});
test("resume cancelled thread returns error", async () => {
const casDir = join(tmpDir, "cas");
await mkdir(casDir, { recursive: true });
const store = await openStore(casDir);
const schemas = await registerUwfSchemas(store);
const workflowHash = await store.cas.put(schemas.workflow, {
name: "cancelled-workflow",
description: "cancelled thread",
roles: {
worker: {
description: "Worker",
goal: "Work",
capabilities: [],
procedure: "work",
output: "result",
frontmatter: await putSchema(store, OUTPUT_SCHEMA),
},
},
graph: {
$START: { _: { role: "worker", prompt: "Start", location: null } },
worker: { done: { role: "$END", prompt: "Done", location: null } },
},
});
const startHash = await store.cas.put(schemas.startNode, {
workflow: workflowHash,
prompt: "task",
cwd: tmpDir,
});
process.env.OCAS_HOME = casDir;
await seedThreads(tmpDir, {
[THREAD_ID]: {
head: startHash,
status: "cancelled",
suspendedRole: null,
suspendMessage: null,
completedAt: null,
},
});
const result = runUwf(["thread", "resume", THREAD_ID], casDir);
expect(result.status).not.toBe(0);
expect(result.stderr).toContain("thread cannot be resumed");
expect(result.stderr).toContain("cancelled");
});
test("resume idle thread returns error", async () => {
const casDir = join(tmpDir, "cas");
await mkdir(casDir, { recursive: true });
const store = await openStore(casDir);
const schemas = await registerUwfSchemas(store);
const workflowHash = await store.cas.put(schemas.workflow, {
name: "idle-workflow",
description: "idle thread",
roles: {
worker: {
description: "Worker",
goal: "Work",
capabilities: [],
procedure: "work",
output: "result",
frontmatter: await putSchema(store, OUTPUT_SCHEMA),
},
},
graph: {
$START: { _: { role: "worker", prompt: "Start", location: null } },
worker: { done: { role: "$END", prompt: "Done", location: null } },
},
});
const startHash = await store.cas.put(schemas.startNode, {
workflow: workflowHash,
prompt: "task",
cwd: tmpDir,
});
process.env.OCAS_HOME = casDir;
await seedThreads(tmpDir, { [THREAD_ID]: startHash });
const result = runUwf(["thread", "resume", THREAD_ID], casDir);
expect(result.status).not.toBe(0);
expect(result.stderr).toContain("thread cannot be resumed");
expect(result.stderr).toContain("idle");
});
});
@@ -6,7 +6,13 @@ import type { CasRef, ThreadId } from "@united-workforce/protocol";
import { describe, expect, test } from "vitest";
import { createMarker, deleteMarker } from "../background/index.js";
import { cmdThreadShow, cmdThreadStart } from "../commands/thread.js";
import { completeThread, createUwfStore, loadAllThreads, setThread } from "../store.js";
import {
addHistoryEntry,
createUwfStore,
deleteThread,
loadAllThreads,
setThread,
} from "../store.js";
const OUTPUT_SCHEMA = {
type: "object" as const,
@@ -31,7 +37,7 @@ roles:
type: object
required: ["$status"]
properties:
$status: { type: string, enum: ["ready"] }
$status: { type: string }
graph:
$START:
_:
@@ -39,7 +45,7 @@ graph:
prompt: "Plan the work"
location: null
planner:
ready:
_:
role: $END
prompt: "Done"
location: null
@@ -112,13 +118,7 @@ async function insertStepNode(
assembledPrompt: null,
})) as CasRef;
setThread(uwf.varStore, threadId, {
head: stepHash,
status: "idle",
suspendedRole: null,
suspendMessage: null,
completedAt: null,
});
setThread(uwf.varStore, threadId, { head: stepHash, suspendedRole: null, suspendMessage: null });
}
describe("thread show status field", () => {
@@ -200,7 +200,7 @@ describe("thread show status field", () => {
// Create a thread
const startResult = await cmdThreadStart(storageRoot, workflowPath, "test prompt", tmpDir);
const threadId = startResult.thread as ThreadId;
const _workflow = startResult.workflow;
const workflow = startResult.workflow;
// Get the head hash before moving to history
const uwfForIndex = await createUwfStore(storageRoot);
@@ -208,7 +208,15 @@ describe("thread show status field", () => {
const head = index[threadId]!.head;
if (!head) throw new Error("Thread not found in index");
completeThread(uwfForIndex.varStore, threadId, "completed");
deleteThread(uwfForIndex.varStore, threadId);
addHistoryEntry(uwfForIndex.varStore, {
thread: threadId,
workflow,
head,
completedAt: Date.now(),
reason: "completed",
});
const result = await cmdThreadShow(storageRoot, threadId);
@@ -229,7 +237,7 @@ describe("thread show status field", () => {
// Create a thread
const startResult = await cmdThreadStart(storageRoot, workflowPath, "test prompt", tmpDir);
const threadId = startResult.thread as ThreadId;
const _workflow = startResult.workflow;
const workflow = startResult.workflow;
// Get the head hash before moving to history
const uwfForIndex = await createUwfStore(storageRoot);
@@ -237,7 +245,15 @@ describe("thread show status field", () => {
const head = index[threadId]!.head;
if (!head) throw new Error("Thread not found in index");
completeThread(uwfForIndex.varStore, threadId, "cancelled");
deleteThread(uwfForIndex.varStore, threadId);
addHistoryEntry(uwfForIndex.varStore, {
thread: threadId,
workflow,
head,
completedAt: Date.now(),
reason: "cancelled",
});
const result = await cmdThreadShow(storageRoot, threadId);
@@ -258,7 +274,7 @@ describe("thread show status field", () => {
// Create a thread
const startResult = await cmdThreadStart(storageRoot, workflowPath, "test prompt", tmpDir);
const threadId = startResult.thread as ThreadId;
const _workflow = startResult.workflow;
const workflow = startResult.workflow;
// Get the head hash before moving to history
const uwfForIndex = await createUwfStore(storageRoot);
@@ -266,7 +282,15 @@ describe("thread show status field", () => {
const head = index[threadId]!.head;
if (!head) throw new Error("Thread not found in index");
completeThread(uwfForIndex.varStore, threadId, "completed");
deleteThread(uwfForIndex.varStore, threadId);
addHistoryEntry(uwfForIndex.varStore, {
thread: threadId,
workflow,
head,
completedAt: Date.now(),
reason: null,
});
const result = await cmdThreadShow(storageRoot, threadId);
@@ -54,7 +54,7 @@ roles:
type: object
required: ["$status"]
properties:
$status: { type: string, enum: ["ready"] }
$status: { type: string }
graph:
$START:
_:
@@ -62,7 +62,7 @@ graph:
prompt: "Plan the work"
location: null
planner:
ready:
_:
role: $END
prompt: "Done"
location: null
@@ -2,28 +2,19 @@ import { execFileSync } from "node:child_process";
import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";
import { describe, expect, test } from "vitest";
import { validateCount } from "../commands/thread.js";
const CLI_PATH = join(dirname(fileURLToPath(import.meta.url)), "..", "..", "dist", "cli.js");
const CLI_PATH = join(dirname(fileURLToPath(import.meta.url)), "..", "cli.js");
function runCli(args: string[]): {
stdout: string;
stderr: string;
exitCode: number;
} {
function runCli(args: string[]): { stdout: string; stderr: string; exitCode: number } {
try {
const stdout = execFileSync("node", [CLI_PATH, ...args], {
const stdout = execFileSync("npx", ["tsx", CLI_PATH, ...args], {
encoding: "utf8",
env: { ...process.env, UWF_HOME: "/tmp/uwf-test-nonexistent" },
stdio: ["ignore", "pipe", "pipe"],
});
return { stdout, stderr: "", exitCode: 0 };
} catch (e: unknown) {
const err = e as NodeJS.ErrnoException & {
stdout?: string;
stderr?: string;
status?: number;
};
const err = e as NodeJS.ErrnoException & { stdout?: string; stderr?: string; status?: number };
return {
stdout: err.stdout ?? "",
stderr: err.stderr ?? "",
@@ -32,39 +23,50 @@ function runCli(args: string[]): {
}
}
describe("thread exec --count CLI parsing", { timeout: 30_000 }, () => {
describe("thread exec --count CLI parsing", () => {
test("--help shows -c/--count option", () => {
const result = runCli(["thread", "exec", "--help"]);
const combined = result.stdout + result.stderr;
expect(combined).toContain("--count");
expect(combined).toContain("-c");
expect(result.stdout).toContain("--count");
expect(result.stdout).toContain("-c");
});
test("description says 'one or more steps'", () => {
const result = runCli(["thread", "exec", "--help"]);
const combined = result.stdout + result.stderr;
expect(combined).toContain("one or more steps");
expect(result.stdout).toContain("one or more steps");
});
});
describe("validateCount", () => {
test("count=0 throws validation error", () => {
expect(() => validateCount(0)).toThrow("positive integer");
describe("cmdThreadExec count logic", () => {
test("count=0 fails with validation error", () => {
const result = runCli(["thread", "exec", "FAKE_THREAD_ID", "-c", "0"]);
expect(result.exitCode).not.toBe(0);
expect(result.stderr).toContain("positive integer");
});
test("negative count throws validation error", () => {
expect(() => validateCount(-1)).toThrow("positive integer");
test("negative count fails with validation error", () => {
const result = runCli(["thread", "exec", "FAKE_THREAD_ID", "-c", "-1"]);
expect(result.exitCode).not.toBe(0);
expect(result.stderr).toContain("positive integer");
});
test("non-integer count throws validation error", () => {
expect(() => validateCount(1.5)).toThrow("positive integer");
test("non-integer count fails with validation error", () => {
const result = runCli(["thread", "exec", "FAKE_THREAD_ID", "-c", "1.5"]);
expect(result.exitCode).not.toBe(0);
expect(result.stderr).toContain("positive integer");
});
test("count=1 passes validation", () => {
expect(() => validateCount(1)).not.toThrow();
test("count=1 is the default (no -c flag)", () => {
// Without -c, it should attempt to run 1 step (failing on missing thread, not on count validation)
const result = runCli(["thread", "exec", "FAKE_THREAD_ID"]);
expect(result.exitCode).not.toBe(0);
// Should NOT contain "positive integer" error — should fail on thread lookup instead
expect(result.stderr).not.toContain("positive integer");
});
test("count=3 passes validation", () => {
expect(() => validateCount(3)).not.toThrow();
test("count=3 passes validation (fails on thread lookup)", () => {
const result = runCli(["thread", "exec", "FAKE_THREAD_ID", "-c", "3"]);
expect(result.exitCode).not.toBe(0);
// Should NOT contain "positive integer" error — should fail on thread/storage lookup
expect(result.stderr).not.toContain("positive integer");
});
});
@@ -160,10 +160,8 @@ describe("suspend step CAS chain and threads.yaml metadata", () => {
const threadEntry = getThread(uwf.varStore, threadId);
expect(threadEntry).toEqual({
head: stepHash,
status: "suspended",
suspendedRole: "worker",
suspendMessage: "Please clarify: Which API?",
completedAt: null,
});
const showResult = await cmdThreadShow(tmpDir, threadId);
+24 -27
View File
@@ -11,7 +11,7 @@ import {
THREAD_READ_DEFAULT_QUOTA,
} from "../commands/thread.js";
import type { UwfStore } from "../store.js";
import { completeThread, createUwfStore, setThread } from "../store.js";
import { addHistoryEntry, createUwfStore } from "../store.js";
import { seedThreads } from "./thread-test-helpers.js";
// ── schemas used in tests ────────────────────────────────────────────────────
@@ -745,14 +745,13 @@ describe("cmdStepList with completed threads", () => {
const threadId = "01JTEST0000000000000000A2" as ThreadId;
// Thread is NOT in active index (simulating completed thread)
// But it IS in history variable store
setThread(uwf.varStore, threadId, {
addHistoryEntry(uwf.varStore, {
thread: threadId,
workflow: workflowHash,
head: step2Hash,
status: "idle",
suspendedRole: null,
suspendMessage: null,
completedAt: null,
completedAt: Date.now(),
reason: null,
});
completeThread(uwf.varStore, threadId, "completed");
const result = await cmdStepList(tmpDir, threadId);
@@ -873,15 +872,14 @@ describe("cmdStepShow with completed threads", () => {
const threadId = "01JTEST0000000000000000B2" as ThreadId;
// Thread is NOT in active index
// But it IS in the unified store with completed status
setThread(uwf.varStore, threadId, {
// But it IS in history variable store
addHistoryEntry(uwf.varStore, {
thread: threadId,
workflow: workflowHash,
head: stepHash,
status: "idle",
suspendedRole: null,
suspendMessage: null,
completedAt: null,
completedAt: Date.now(),
reason: null,
});
completeThread(uwf.varStore, threadId, "completed");
const result = await cmdStepShow(tmpDir, stepHash);
@@ -936,15 +934,15 @@ describe("cmdThreadRead with completed threads", () => {
});
const threadId = "01JTEST0000000000000000C1" as ThreadId;
// Thread is in store with completed status
setThread(uwf.varStore, threadId, {
// Thread is NOT in active index
// But it IS in history variable store
addHistoryEntry(uwf.varStore, {
thread: threadId,
workflow: workflowHash,
head: stepHash,
status: "idle",
suspendedRole: null,
suspendMessage: null,
completedAt: null,
completedAt: Date.now(),
reason: null,
});
completeThread(uwf.varStore, threadId, "completed");
const markdown = await cmdThreadRead(tmpDir, threadId, THREAD_READ_DEFAULT_QUOTA, null, false);
@@ -1000,14 +998,13 @@ describe("cmdThreadRead with completed threads", () => {
});
const threadId = "01JTEST0000000000000000C2" as ThreadId;
setThread(uwf.varStore, threadId, {
addHistoryEntry(uwf.varStore, {
thread: threadId,
workflow: workflowHash,
head: step3Hash,
status: "idle",
suspendedRole: null,
suspendMessage: null,
completedAt: null,
completedAt: Date.now(),
reason: null,
});
completeThread(uwf.varStore, threadId, "completed");
const markdown = await cmdThreadRead(
tmpDir,
@@ -17,7 +17,7 @@ function makeWorkflow(overrides?: Partial<WorkflowPayload>): WorkflowPayload {
frontmatter: {
type: "object",
properties: {
$status: { enum: ["done"] },
$status: { enum: ["_"] },
plan: { type: "string" },
},
required: ["$status", "plan"],
@@ -52,7 +52,7 @@ function makeWorkflow(overrides?: Partial<WorkflowPayload>): WorkflowPayload {
},
graph: {
$START: { _: { role: "writer", prompt: "Begin writing", location: null } },
writer: { done: { role: "reviewer", prompt: "Review this: {{{plan}}}", location: null } },
writer: { _: { role: "reviewer", prompt: "Review this: {{{plan}}}", location: null } },
reviewer: {
approved: { role: "$END", prompt: "Done: {{{summary}}}", location: null },
rejected: { role: "writer", prompt: "Fix: {{{reason}}}", location: null },
@@ -82,7 +82,7 @@ describe("Suite 1: Role Reference Integrity", () => {
output: "None",
frontmatter: {
type: "object",
properties: { $status: { enum: ["done"] } },
properties: { $status: { enum: ["_"] } },
required: ["$status"],
} as unknown as string,
};
@@ -173,11 +173,11 @@ describe("Suite 2: Graph Structure", () => {
output: "Isolated",
frontmatter: {
type: "object",
properties: { $status: { enum: ["done"] } },
properties: { $status: { enum: ["_"] } },
required: ["$status"],
} as unknown as string,
};
wf.graph.isolated = { done: { role: "$END", prompt: "done", location: null } };
wf.graph.isolated = { _: { role: "$END", prompt: "done", location: null } };
const errors = validateWorkflow(wf);
expect(errors.some((e) => e.includes('role "isolated" is not reachable from $START'))).toBe(
true,
@@ -186,34 +186,34 @@ describe("Suite 2: Graph Structure", () => {
test("2.6 edge target references invalid role", () => {
const wf = makeWorkflow();
wf.graph.writer = { done: { role: "ghost", prompt: "Go to ghost", location: null } };
wf.graph.writer = { _: { role: "ghost", prompt: "Go to ghost", location: null } };
const errors = validateWorkflow(wf);
expect(errors.some((e) => e.includes('unknown target role "ghost"'))).toBe(true);
});
});
describe("Suite 3: Status-Edge Consistency", () => {
test("3.1 user role using _ graph key is rejected", () => {
test("3.1 single-exit role with multiple graph keys", () => {
const wf = makeWorkflow();
wf.graph.writer = { _: { role: "reviewer", prompt: "Review", location: null } };
wf.graph.writer = {
_: { role: "reviewer", prompt: "Review", location: null },
extra: { role: "$END", prompt: "Done", location: null },
};
const errors = validateWorkflow(wf);
expect(
errors.some((e) =>
e.includes('role "writer" must use explicit $status keys in graph, not "_"'),
e.includes('role "writer" is single-exit but has status keys other than "_"'),
),
).toBe(true);
});
test("3.2 user role graph key not matching $status enum", () => {
test("3.2 single-exit role missing _ key", () => {
const wf = makeWorkflow();
wf.graph.writer = { wrong: { role: "reviewer", prompt: "Review", location: null } };
wf.graph.writer = { done: { role: "reviewer", prompt: "Review", location: null } };
const errors = validateWorkflow(wf);
expect(errors.some((e) => e.includes('role "writer" graph has extra status keys: wrong'))).toBe(
true,
);
expect(errors.some((e) => e.includes('role "writer" graph is missing status keys: done'))).toBe(
true,
);
expect(
errors.some((e) => e.includes('role "writer" is single-exit but graph has no "_" key')),
).toBe(true);
});
test("3.3 multi-exit role with extra statuses", () => {
@@ -244,11 +244,9 @@ describe("Suite 3: Status-Edge Consistency", () => {
const wf = makeWorkflow();
wf.graph.reviewer = { _: { role: "$END", prompt: "Done", location: null } };
const errors = validateWorkflow(wf);
expect(
errors.some((e) =>
e.includes('role "reviewer" must use explicit $status keys in graph, not "_"'),
),
).toBe(true);
expect(errors.some((e) => e.includes('role "reviewer" is multi-exit but graph uses "_"'))).toBe(
true,
);
});
});
@@ -316,20 +314,20 @@ describe("Suite 3b: Enum-Based Multi-Exit", () => {
expect(errors.some((e) => e.includes("missing status keys: rejected"))).toBe(true);
});
test("3b.4 enum with single explicit value passes", () => {
test("3b.4 enum with single value (not multi-exit) treated as single-exit", () => {
const wf = makeWorkflow();
wf.roles.writer = {
...wf.roles.writer,
frontmatter: {
type: "object",
properties: {
$status: { enum: ["ready"] },
$status: { enum: ["_"] },
plan: { type: "string" },
},
required: ["$status", "plan"],
} as unknown as string,
};
wf.graph.writer = { ready: { role: "reviewer", prompt: "Review: {{{plan}}}", location: null } };
wf.graph.writer = { _: { role: "reviewer", prompt: "Review: {{{plan}}}", location: null } };
const errors = validateWorkflow(wf);
expect(errors).toEqual([]);
});
@@ -357,15 +355,13 @@ describe("Suite 3b: Enum-Based Multi-Exit", () => {
});
describe("Suite 4: Mustache Template Variable Existence", () => {
test("4.1 prompt references nonexistent variable (enum status)", () => {
test("4.1 prompt references nonexistent variable (single-exit)", () => {
const wf = makeWorkflow();
wf.graph.writer = {
done: { role: "reviewer", prompt: "Review: {{{branch}}}", location: null },
};
wf.graph.writer = { _: { role: "reviewer", prompt: "Review: {{{branch}}}", location: null } };
const errors = validateWorkflow(wf);
expect(
errors.some(
(e) => e.includes('prompt variable "branch"') && e.includes('role "writer" frontmatter'),
errors.some((e) =>
e.includes('prompt variable "branch" not found in role "writer" frontmatter'),
),
).toBe(true);
});
@@ -392,7 +388,7 @@ describe("Suite 4: Mustache Template Variable Existence", () => {
test("4.4 $status variable is always valid", () => {
const wf = makeWorkflow();
wf.graph.writer = { done: { role: "reviewer", prompt: "Status: {{$status}}", location: null } };
wf.graph.writer = { _: { role: "reviewer", prompt: "Status: {{$status}}", location: null } };
const errors = validateWorkflow(wf);
expect(errors).toEqual([]);
});
@@ -460,14 +456,14 @@ describe("Suite 6: Multiple Errors Collection", () => {
output: "None",
frontmatter: {
type: "object",
properties: { $status: { enum: ["done"] } },
properties: { $status: { enum: ["_"] } },
required: ["$status"],
} as unknown as string,
};
// unknown graph reference
wf.graph.nonexistent = { done: { role: "$END", prompt: "done", location: null } };
wf.graph.nonexistent = { _: { role: "$END", prompt: "done", location: null } };
// bad mustache var
wf.graph.writer = { done: { role: "reviewer", prompt: "{{{badvar}}}", location: null } };
wf.graph.writer = { _: { role: "reviewer", prompt: "{{{badvar}}}", location: null } };
const errors = validateWorkflow(wf);
expect(errors.length).toBeGreaterThanOrEqual(3);
});
@@ -31,7 +31,7 @@ function makeMinimalPayload(name: string, description: string): WorkflowPayload
frontmatter: {
type: "object",
properties: {
$status: { type: "string", enum: ["done"] },
$status: { type: "string" },
},
required: ["$status"],
} as unknown as CasRef,
@@ -39,7 +39,7 @@ function makeMinimalPayload(name: string, description: string): WorkflowPayload
},
graph: {
$START: { _: { role: "worker", prompt: "start working", location: null } },
worker: { done: { role: "$END", prompt: "done", location: null } },
worker: { _: { role: "$END", prompt: "done", location: null } },
},
};
}
+20 -12
View File
@@ -5,13 +5,14 @@ import { Command } from "commander";
import { cmdConfigGet, cmdConfigList, cmdConfigSet } from "./commands/config.js";
import { cmdLogClean, cmdLogList, cmdLogShow } from "./commands/log.js";
import {
cmdPromptAdapterDeveloping,
cmdPromptAdapter,
cmdPromptAuthor,
cmdPromptBootstrap,
cmdPromptDeveloper,
cmdPromptList,
cmdPromptSetup,
cmdPromptUsage,
cmdPromptUsageReference,
cmdPromptWorkflowAuthoring,
cmdPromptUser,
} from "./commands/prompt.js";
import { cmdSetup, cmdSetupInteractive } from "./commands/setup.js";
import { cmdStepFork, cmdStepList, cmdStepRead, cmdStepShow } from "./commands/step.js";
@@ -522,24 +523,31 @@ prompt
});
prompt
.command("usage-reference")
.description("Print the usage reference (CLI guide + typical workflows)")
.command("adapter")
.description("Print the adapter reference (building agent adapters)")
.action(() => {
console.log(cmdPromptUsageReference());
console.log(cmdPromptAdapter());
});
prompt
.command("workflow-authoring")
.description("Print the workflow authoring reference (YAML design guide)")
.command("author")
.description("Print the author reference (workflow YAML design guide)")
.action(() => {
console.log(cmdPromptWorkflowAuthoring());
console.log(cmdPromptAuthor());
});
prompt
.command("adapter-developing")
.description("Print the adapter developing reference (building agent adapters)")
.command("developer")
.description("Print the developer reference (coding conventions + architecture)")
.action(() => {
console.log(cmdPromptAdapterDeveloping());
console.log(cmdPromptDeveloper());
});
prompt
.command("user")
.description("Print the user reference (CLI guide + typical workflows)")
.action(() => {
console.log(cmdPromptUser());
});
prompt
+43 -23
View File
@@ -1,21 +1,24 @@
import {
generateAdapterDevelopingReference,
generateAdapterReference,
generateAuthorReference,
generateBootstrapReference,
generateUsageReference,
generateWorkflowAuthoringReference,
generateDeveloperReference,
generateUserReference,
} from "@united-workforce/util";
export {
generateAdapterDevelopingReference as cmdPromptAdapterDeveloping,
generateAdapterReference as cmdPromptAdapter,
generateAuthorReference as cmdPromptAuthor,
generateBootstrapReference as cmdPromptBootstrap,
generateUsageReference as cmdPromptUsageReference,
generateWorkflowAuthoringReference as cmdPromptWorkflowAuthoring,
generateDeveloperReference as cmdPromptDeveloper,
generateUserReference as cmdPromptUser,
};
const PROMPT_ENTRIES: ReadonlyArray<{ name: string; generate: () => string }> = [
{ name: "usage", generate: generateUsageReference },
{ name: "workflow-authoring", generate: generateWorkflowAuthoringReference },
{ name: "adapter-developing", generate: generateAdapterDevelopingReference },
{ name: "user", generate: generateUserReference },
{ name: "author", generate: generateAuthorReference },
{ name: "developer", generate: generateDeveloperReference },
{ name: "adapter", generate: generateAdapterReference },
{ name: "bootstrap", generate: generateBootstrapReference },
];
@@ -24,9 +27,7 @@ export function cmdPromptList(): ReadonlyArray<string> {
}
export function cmdPromptUsage(): string {
return PROMPT_ENTRIES.filter((e) => e.name !== "bootstrap")
.map((e) => e.generate())
.join("\n\n---\n\n");
return PROMPT_ENTRIES.map((e) => e.generate()).join("\n\n---\n\n");
}
export function cmdPromptSetup(): string {
@@ -41,7 +42,7 @@ so that you know how to use the \`uwf\` CLI for workflow orchestration.
\`\`\`bash
uwf --help
\`\`\`
If not installed: \`npm install -g @united-workforce/cli\`
If not installed: \`bun add -g @united-workforce/cli\`
2. **Clean up old versions of the skill** (if any exist):
- Look for any existing uwf/workflow skill files in your skill directories
@@ -50,24 +51,43 @@ so that you know how to use the \`uwf\` CLI for workflow orchestration.
- If the version matches the current CLI version (\`uwf --version\`), you're done — no update needed.
- If the version is older, **delete the old skill file** before proceeding.
3. **Install the bootstrap skill:**
3. **Get the usage skill content:**
\`\`\`bash
uwf prompt bootstrap > ~/.hermes/skills/uwf/SKILL.md
uwf prompt usage
\`\`\`
The bootstrap prompt already includes complete YAML frontmatter no editing needed.
4. **Verify** the skill is loadable by your agent framework.
4. **Create a SKILL.md file** in your skill directory (e.g. \`~/.hermes/skills/uwf/SKILL.md\`).
The output of \`uwf prompt usage\` is the skill body WITHOUT frontmatter.
You must prepend the following YAML frontmatter:
\`\`\`yaml
---
name: uwf
description: "Use when orchestrating multi-agent workflows with uwf (United Workforce). Covers CLI usage, workflow YAML authoring, moderator logic, and agent development."
version: <VERSION>
author: United Workforce
license: MIT
metadata:
hermes:
tags: [workflow, agents, orchestration, cli, yaml]
---
\`\`\`
Replace \`<VERSION>\` with the installed CLI version (from \`uwf --version\`).
5. **Verify** the skill is loadable by your agent framework.
## Individual prompts
Each prompt outputs a complete SKILL.md with frontmatter pipe directly to a file:
You can also get individual reference sections:
\`\`\`bash
uwf prompt list # list available prompt names
uwf prompt usage > ~/.hermes/skills/uwf-usage/SKILL.md # CLI usage guide
uwf prompt workflow-authoring > ~/.hermes/skills/uwf-workflow-authoring/SKILL.md
uwf prompt adapter-developing > ~/.hermes/skills/uwf-adapter-developing/SKILL.md
uwf prompt bootstrap > ~/.hermes/skills/uwf/SKILL.md # bootstrap skill
uwf prompt list # list available prompt names
uwf prompt user # user reference (CLI guide + typical workflows)
uwf prompt author # author reference (workflow YAML design guide)
uwf prompt developer # developer reference (coding conventions + architecture)
uwf prompt adapter # adapter reference (building agent adapters)
uwf prompt bootstrap # bootstrap skill YAML for Hermes agents
\`\`\`
## Notes
+5 -1
View File
@@ -6,7 +6,7 @@ import type {
StepNodePayload,
ThreadId,
} from "@united-workforce/protocol";
import { createUwfStore, getThread, type UwfStore } from "../store.js";
import { createUwfStore, findHistoryEntry, getThread, type UwfStore } from "../store.js";
type ChainState = {
startHash: CasRef;
@@ -207,6 +207,10 @@ async function resolveHeadHash(storageRoot: string, threadId: ThreadId): Promise
if (entry !== null) {
return entry.head;
}
const hist = findHistoryEntry(uwf.varStore, threadId);
if (hist !== null) {
return hist.head;
}
fail(`thread not found: ${threadId}`);
}
-3
View File
@@ -66,7 +66,6 @@ export async function cmdStepList(
agent: item.payload.agent,
timestamp: item.timestamp,
durationMs: item.payload.completedAtMs - item.payload.startedAtMs,
usage: item.payload.usage ?? null,
});
}
@@ -115,10 +114,8 @@ export async function cmdStepFork(
const newThreadId = generateUlid(Date.now()) as ThreadId;
setThread(uwf.varStore, newThreadId, {
head: stepHash,
status: "idle",
suspendedRole: null,
suspendMessage: null,
completedAt: null,
});
return {
+111 -126
View File
@@ -38,14 +38,17 @@ import { createMarker, deleteMarker, isThreadRunning } from "../background/index
import { createIncludeTag } from "../include.js";
import { evaluate, isSuspendResult } from "../moderator/index.js";
import {
completeThread,
addHistoryEntry,
createUwfStore,
deleteThread,
findHistoryEntry,
getThread,
loadActiveThreads,
loadHistoryThreads,
loadAllHistory,
loadAllThreads,
loadWorkflowRegistry,
resolveWorkflowHash,
setThread,
type ThreadHistoryLine,
type UwfStore,
} from "../store.js";
import { checkWorkflowFilenameConsistency, isCasRef, parseWorkflowPayload } from "../validate.js";
@@ -482,55 +485,61 @@ export async function cmdThreadShow(
): Promise<ThreadShowOutput> {
const uwf = await createUwfStore(storageRoot);
const entry = getThread(uwf.varStore, threadId);
if (entry === null) {
fail(`thread not found: ${threadId}`);
}
if (entry !== null) {
const activeHead = entry.head;
const workflow = resolveWorkflowFromHead(uwf, activeHead);
if (workflow === null) {
fail(`failed to resolve workflow from head: ${activeHead}`);
}
const activeHead = entry.head;
const workflow = resolveWorkflowFromHead(uwf, activeHead);
if (workflow === null) {
fail(`failed to resolve workflow from head: ${activeHead}`);
}
const status = await resolveActiveThreadStatus(
storageRoot,
threadId,
uwf,
activeHead,
workflow,
);
const currentRole = resolveCurrentRole(uwf, activeHead, workflow);
const suspendFields = resolveSuspendFieldsForShow(entry, status, uwf, activeHead, workflow);
const hint =
status === "suspended"
? `Thread is suspended. Resume with: uwf thread resume ${threadId}`
: null;
// Determine if this is a completed/cancelled thread
if (entry.status === "completed" || entry.status === "cancelled") {
const hint = null;
return {
workflow,
thread: threadId,
head: activeHead,
status: entry.status,
currentRole: null,
suspendedRole: null,
suspendMessage: null,
done: true,
status,
currentRole,
suspendedRole: suspendFields.suspendedRole,
suspendMessage: suspendFields.suspendMessage,
done: false,
background: null,
hint,
};
}
// Active thread
const status = await resolveActiveThreadStatus(storageRoot, threadId, uwf, activeHead, workflow);
const currentRole = resolveCurrentRole(uwf, activeHead, workflow);
const suspendFields = resolveSuspendFieldsForShow(entry, status, uwf, activeHead, workflow);
const hist = findHistoryEntry(uwf.varStore, threadId);
if (hist !== null) {
const status: ThreadStatus = hist.reason === "cancelled" ? "cancelled" : "completed";
const hint =
status === "suspended"
? `Thread is suspended. Resume with: uwf thread resume ${threadId}`
: null;
return {
workflow: hist.workflow,
thread: threadId,
head: hist.head,
status,
currentRole: null,
suspendedRole: null,
suspendMessage: null,
done: true,
background: null,
hint: null,
};
}
return {
workflow,
thread: threadId,
head: activeHead,
status,
currentRole,
suspendedRole: suspendFields.suspendedRole,
suspendMessage: suspendFields.suspendMessage,
done: false,
background: null,
hint,
};
fail(`thread not found: ${threadId}`);
}
export type ThreadListItemWithStatus = ThreadListItem & {
@@ -585,20 +594,19 @@ async function collectActiveThreads(
}
function collectCompletedThreads(
uwf: UwfStore,
varStore: VarStore,
activeIds: Set<ThreadId>,
): ThreadListItemWithStatus[] {
const items: ThreadListItemWithStatus[] = [];
const history = loadHistoryThreads(uwf.varStore);
const history = loadAllHistory(varStore);
const seen = new Set<ThreadId>(); // Deduplication (issue #470)
for (const [threadId, entry] of Object.entries(history)) {
if (!activeIds.has(threadId as ThreadId) && !seen.has(threadId as ThreadId)) {
seen.add(threadId as ThreadId);
const status = entry.status;
const workflow = resolveWorkflowFromHead(uwf, entry.head);
for (const entry of history) {
if (!activeIds.has(entry.thread) && !seen.has(entry.thread)) {
seen.add(entry.thread);
const status = entry.reason === "cancelled" ? "cancelled" : "completed";
items.push({
thread: threadId as ThreadId,
workflow: workflow ?? "",
thread: entry.thread,
workflow: entry.workflow,
head: entry.head,
status,
currentRole: null,
@@ -651,7 +659,7 @@ export async function cmdThreadList(
take: number | null,
): Promise<ThreadListItemWithStatus[]> {
const uwf = await createUwfStore(storageRoot);
const index = loadActiveThreads(uwf.varStore);
const index = loadAllThreads(uwf.varStore);
// Collect active threads
let items = await collectActiveThreads(storageRoot, uwf, index);
@@ -663,7 +671,7 @@ export async function cmdThreadList(
statusFilter.includes("cancelled");
if (includeCompleted) {
const activeIds = new Set(items.map((i) => i.thread));
const completedItems = collectCompletedThreads(uwf, activeIds);
const completedItems = collectCompletedThreads(uwf.varStore, activeIds);
items = items.concat(completedItems);
}
@@ -961,12 +969,6 @@ function resolveAgentConfig(
agentOverride: string | null,
): AgentConfig {
if (agentOverride !== null) {
// Try config alias first (e.g. "hermes" → config.agents.hermes),
// then fall back to raw command name (e.g. "uwf-hermes" or "/usr/bin/agent").
const fromAlias = config.agents[agentOverride as AgentAlias];
if (fromAlias !== undefined) {
return fromAlias;
}
return parseAgentOverride(agentOverride);
}
@@ -1033,11 +1035,17 @@ function spawnAgent(
return obj as unknown as AdapterOutput;
}
function archiveThread(uwf: UwfStore, threadId: ThreadId, _workflow: CasRef, _head: CasRef): void {
completeThread(uwf.varStore, threadId, "completed");
function archiveThread(uwf: UwfStore, threadId: ThreadId, workflow: CasRef, head: CasRef): void {
deleteThread(uwf.varStore, threadId);
addHistoryEntry(uwf.varStore, {
thread: threadId,
workflow,
head,
completedAt: Date.now(),
reason: "completed",
});
}
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: orchestration function with inherent branching
export async function cmdThreadResume(
storageRoot: string,
threadId: ThreadId,
@@ -1059,87 +1067,43 @@ export async function cmdThreadResume(
const chain = walkChain(uwf, headHash);
const workflowHash = chain.start.workflow;
// Check entry.status first for completed/cancelled (like in cmdThreadShow)
let status: ThreadStatus;
if (entry.status === "completed" || entry.status === "cancelled") {
status = entry.status;
} else {
status = await resolveActiveThreadStatus(storageRoot, threadId, uwf, headHash, workflowHash);
const status = await resolveActiveThreadStatus(
storageRoot,
threadId,
uwf,
headHash,
workflowHash,
);
if (status !== "suspended") {
fail(`thread is not suspended: ${threadId} (status: ${status})`);
}
if (status !== "suspended" && status !== "completed") {
fail(`thread cannot be resumed: ${threadId} (status: ${status})`);
const suspendFields = resolveSuspendFieldsForShow(entry, status, uwf, headHash, workflowHash);
if (suspendFields.suspendedRole === null) {
fail(`thread is suspended but suspendedRole is missing: ${threadId}`);
}
if (suspendFields.suspendMessage === null) {
fail(`thread is suspended but suspendMessage is missing: ${threadId}`);
}
const resumePrompt = buildResumePrompt(suspendFields.suspendMessage, supplement);
const plog = createProcessLogger({
storageRoot,
context: { thread: threadId, workflow: workflowHash },
});
if (status === "suspended") {
const suspendFields = resolveSuspendFieldsForShow(entry, status, uwf, headHash, workflowHash);
if (suspendFields.suspendedRole === null) {
fail(`thread is suspended but suspendedRole is missing: ${threadId}`);
}
if (suspendFields.suspendMessage === null) {
fail(`thread is suspended but suspendMessage is missing: ${threadId}`);
}
const resumePrompt = buildResumePrompt(suspendFields.suspendMessage, supplement);
plog.log(
PL_THREAD_RESUME,
`resume role=${suspendFields.suspendedRole} supplement=${supplement !== null}`,
null,
);
return cmdThreadStepOnce(storageRoot, threadId, agentOverride, plog, {
role: suspendFields.suspendedRole,
prompt: resumePrompt,
});
}
// status === "completed"
const workflow = loadWorkflowPayload(uwf, workflowHash);
const startResult = evaluate(workflow.graph, START_ROLE, {});
if (!startResult.ok) {
fail(`failed to evaluate $START: ${startResult.error.message}`);
}
if (isSuspendResult(startResult.value)) {
fail("workflow cannot start with $SUSPEND");
}
if (startResult.value.role === END_ROLE) {
fail("workflow cannot start with $END");
}
const startRole = startResult.value.role;
const completedPromptPrefix = "Previous run completed. Resuming with additional context.";
const completedResumePrompt =
supplement !== null && supplement !== ""
? `${completedPromptPrefix}\n\n${supplement}`
: completedPromptPrefix;
const updatedEntry = { ...entry, status: "idle" as const, completedAt: null };
setThread(uwf.varStore, threadId, updatedEntry);
plog.log(
PL_THREAD_RESUME,
`resume completed role=${startRole} supplement=${supplement !== null}`,
`resume role=${suspendFields.suspendedRole} supplement=${supplement !== null}`,
null,
);
return cmdThreadStepOnce(storageRoot, threadId, agentOverride, plog, {
role: startRole,
prompt: completedResumePrompt,
role: suspendFields.suspendedRole,
prompt: resumePrompt,
});
}
/**
 * Guard for the `--count` option: only integers greater than or equal to 1 pass.
 *
 * @param count - The value supplied for `--count`.
 * @throws Error when `count` is below 1 or is not an integer (NaN and ±Infinity
 *   are rejected because `Number.isInteger` returns false for them).
 */
export function validateCount(count: number): void {
  const isPositiveInteger = Number.isInteger(count) && count >= 1;
  if (isPositiveInteger) {
    return;
  }
  throw new Error(`--count must be a positive integer, got: ${count}`);
}
export async function cmdThreadExec(
storageRoot: string,
threadId: ThreadId,
@@ -1148,7 +1112,9 @@ export async function cmdThreadExec(
background: boolean,
backgroundWorker: boolean,
): Promise<StepOutput[]> {
validateCount(count);
if (count < 1 || !Number.isInteger(count)) {
fail(`--count must be a positive integer, got: ${count}`);
}
// Check if thread is already running in background (unless we ARE the background worker)
if (!backgroundWorker) {
@@ -1283,7 +1249,7 @@ function resolveResumeStepTarget(
}
async function resolveModeratorStepTarget(
_storageRoot: string,
storageRoot: string,
threadId: ThreadId,
entry: ThreadIndexEntry,
headHash: CasRef,
@@ -1352,7 +1318,7 @@ async function resolveModeratorStepTarget(
}
async function finalizeAgentStep(
_storageRoot: string,
storageRoot: string,
threadId: ThreadId,
workflowHash: CasRef,
workflow: WorkflowPayload,
@@ -1484,6 +1450,10 @@ async function resolveHeadHash(storageRoot: string, threadId: ThreadId): Promise
if (entry !== null) {
return entry.head;
}
const hist = findHistoryEntry(uwf.varStore, threadId);
if (hist !== null) {
return hist.head;
}
fail(`thread not found: ${threadId}`);
}
@@ -1563,6 +1533,7 @@ export async function cmdThreadCancel(
if (entry === null) {
fail(`thread not active: ${threadId}`);
}
const head = entry.head;
// Check if thread is running in background and terminate it
const runningMarker = await isThreadRunning(storageRoot, threadId);
@@ -1575,7 +1546,21 @@ export async function cmdThreadCancel(
await deleteMarker(storageRoot, threadId);
}
completeThread(uwf.varStore, threadId, "cancelled");
const workflow = resolveWorkflowFromHead(uwf, head);
if (workflow === null) {
fail(`failed to resolve workflow from head: ${head}`);
}
deleteThread(uwf.varStore, threadId);
const historyEntry: ThreadHistoryLine = {
thread: threadId,
workflow,
head,
completedAt: Date.now(),
reason: "cancelled",
};
addHistoryEntry(uwf.varStore, historyEntry);
return { thread: threadId, cancelled: true };
}
+7 -13
View File
@@ -8,8 +8,7 @@ mustache.escape = (text: string) => text;
const START_ROLE = "$START";
const SUSPEND_ROLE = "$SUSPEND";
// $START is a special entry node with no agent output — it always uses this key.
const START_STATUS = "_";
const UNIT_STATUS = "_";
type LastOutput = Record<string, unknown>;
@@ -20,17 +19,12 @@ export function evaluate(
lastRole: string,
lastOutput: LastOutput,
): Result<EvaluateResult, Error> {
let status: string;
if (lastRole === START_ROLE) {
status = START_STATUS;
} else if (typeof lastOutput[STATUS_KEY] === "string") {
status = lastOutput[STATUS_KEY] as string;
} else {
return {
ok: false,
error: new Error(`agent output for role "${lastRole}" is missing required "$status" string`),
};
}
const status =
lastRole === START_ROLE
? UNIT_STATUS
: typeof lastOutput[STATUS_KEY] === "string"
? (lastOutput[STATUS_KEY] as string)
: UNIT_STATUS;
const roleTargets = graph[lastRole];
if (roleTargets === undefined) {
+56 -94
View File
@@ -6,7 +6,13 @@ import { join } from "node:path";
import { bootstrap, type Hash, type Store, type VarStore } from "@ocas/core";
import { createFsStore, createSqliteVarStore } from "@ocas/fs";
import type { CasRef, ThreadId, ThreadIndexEntry, ThreadsIndex } from "@united-workforce/protocol";
import type {
CasRef,
ThreadId,
ThreadIndexEntry,
ThreadListItem,
ThreadsIndex,
} from "@united-workforce/protocol";
import { parseThreadsIndex } from "@united-workforce/protocol";
import { parse } from "yaml";
@@ -20,6 +26,9 @@ export const REGISTRY_VAR_PREFIX = "@uwf/registry/";
/** Variable name prefix for active thread entries (`@uwf/thread/<thread-id>`). */
export const THREAD_VAR_PREFIX = "@uwf/thread/";
/** Variable name prefix for completed/cancelled thread history (`@uwf/history/<thread-id>`). */
export const HISTORY_VAR_PREFIX = "@uwf/history/";
/** A workflow entry discovered from the project-local .workflows/ directory. */
export type ProjectWorkflowEntry = {
/** Workflow name (from YAML `name` field, equals filename stem). */
@@ -147,6 +156,11 @@ export function getThreadsPath(storageRoot: string): string {
return join(storageRoot, "threads.yaml");
}
/**
 * History record for a thread that has been archived or cancelled.
 * Persisted as a `@uwf/history/<thread-id>` variable: the variable value holds
 * `head`, and `workflow`, `completedAt` and `reason` are stored as string tags.
 */
export type ThreadHistoryLine = ThreadListItem & {
/** Numeric completion timestamp; the archive and cancel paths set it from `Date.now()`. */
completedAt: number;
/**
 * Why the thread ended. Loaders yield `null` when the stored tag is neither
 * "completed" nor "cancelled"; `addHistoryEntry` writes `null` as "completed".
 */
reason: "completed" | "cancelled" | null;
};
export type UwfStore = {
storageRoot: string;
store: Store;
@@ -165,7 +179,6 @@ export async function createUwfStore(storageRoot: string): Promise<UwfStore> {
await migrateWorkflowRegistryIfNeeded(storageRoot, varStore);
await migrateThreadsIndexIfNeeded(storageRoot, varStore);
await migrateHistoryIfNeeded(storageRoot, varStore);
migrateHistoryVarsToThreadVars(varStore);
return { storageRoot, store, schemas, varStore };
}
@@ -286,10 +299,8 @@ function threadVarName(threadId: ThreadId): string {
function entryFromVariable(v: { value: string; tags: Record<string, string> }): ThreadIndexEntry {
return {
head: v.value as CasRef,
status: (v.tags.status ?? "idle") as ThreadIndexEntry["status"],
suspendedRole: v.tags.suspendedRole ?? null,
suspendMessage: v.tags.suspendMessage ?? null,
completedAt: v.tags.completedAt !== undefined ? Number(v.tags.completedAt) : null,
};
}
@@ -320,74 +331,21 @@ export function setThread(varStore: VarStore, threadId: ThreadId, entry: ThreadI
// Head CAS nodes may use different schemas (StartNode vs StepNode) — clear all variants first.
varStore.remove(name);
const tags: Record<string, string> = {};
if (entry.status !== "idle") {
tags.status = entry.status;
}
if (entry.suspendedRole !== null) {
tags.suspendedRole = entry.suspendedRole;
}
if (entry.suspendMessage !== null) {
tags.suspendMessage = entry.suspendMessage;
}
if (entry.completedAt !== null) {
tags.completedAt = String(entry.completedAt);
}
varStore.set(name, entry.head, { tags });
}
/** Load only active threads (status not in completed/cancelled). */
export function loadActiveThreads(varStore: VarStore): ThreadsIndex {
const all = loadAllThreads(varStore);
const active: ThreadsIndex = {};
for (const [threadId, entry] of Object.entries(all)) {
if (entry.status !== "completed" && entry.status !== "cancelled") {
active[threadId as ThreadId] = entry;
}
}
return active;
/** Remove an active thread entry (on complete/cancel). */
export function deleteThread(varStore: VarStore, threadId: ThreadId): void {
varStore.remove(threadVarName(threadId));
}
/** Load only completed/cancelled threads (history). */
export function loadHistoryThreads(varStore: VarStore): ThreadsIndex {
const all = loadAllThreads(varStore);
const history: ThreadsIndex = {};
for (const [threadId, entry] of Object.entries(all)) {
if (entry.status === "completed" || entry.status === "cancelled") {
history[threadId as ThreadId] = entry;
}
}
return history;
}
/** Complete a thread by marking it completed or cancelled. */
export function completeThread(
varStore: VarStore,
threadId: ThreadId,
reason: "completed" | "cancelled",
): void {
const entry = getThread(varStore, threadId);
if (entry === null) {
return;
}
const completed = {
head: entry.head,
status: reason,
suspendedRole: null,
suspendMessage: null,
completedAt: Date.now(),
} as ThreadIndexEntry;
setThread(varStore, threadId, completed);
}
type LegacyHistoryEntry = {
thread: ThreadId;
workflow: CasRef;
head: CasRef;
completedAt: number;
reason: "completed" | "cancelled" | null;
};
function parseLegacyHistoryJsonlLine(trimmed: string): LegacyHistoryEntry | null {
function parseHistoryJsonlLine(trimmed: string): ThreadHistoryLine | null {
let raw: unknown;
try {
raw = JSON.parse(trimmed) as unknown;
@@ -421,7 +379,7 @@ function parseLegacyHistoryJsonlLine(trimmed: string): LegacyHistoryEntry | null
return null;
}
/** One-time migration: `~/.uwf/history.jsonl` → `@uwf/thread/*` variables with status tags. */
/** One-time migration: `~/.uwf/history.jsonl` → `@uwf/history/*` variables. */
export async function migrateHistoryIfNeeded(
storageRoot: string,
varStore: VarStore,
@@ -437,43 +395,47 @@ export async function migrateHistoryIfNeeded(
if (trimmed === "") {
continue;
}
const entry = parseLegacyHistoryJsonlLine(trimmed);
const entry = parseHistoryJsonlLine(trimmed);
if (entry !== null) {
const status = entry.reason === "cancelled" ? "cancelled" : "completed";
const threadEntry: ThreadIndexEntry = {
head: entry.head,
status: status as ThreadIndexEntry["status"],
suspendedRole: null,
suspendMessage: null,
completedAt: entry.completedAt,
};
setThread(varStore, entry.thread, threadEntry);
addHistoryEntry(varStore, entry);
}
}
await rename(path, `${path}.migrated`);
}
/** Migrate `@uwf/history/*` variables to `@uwf/thread/*` with status tags. */
export function migrateHistoryVarsToThreadVars(varStore: VarStore): void {
const LEGACY_HISTORY_VAR_PREFIX = "@uwf/history/";
const vars = varStore.list({ namePrefix: LEGACY_HISTORY_VAR_PREFIX });
for (const v of vars) {
const threadId = v.name.slice(LEGACY_HISTORY_VAR_PREFIX.length) as ThreadId;
const reason = v.tags.reason;
const status = reason === "cancelled" ? "cancelled" : "completed";
const completedAt = Number(v.tags.completedAt ?? Date.now());
const threadEntry: ThreadIndexEntry = {
head: v.value as CasRef,
status: status as ThreadIndexEntry["status"],
suspendedRole: null,
suspendMessage: null,
completedAt,
};
setThread(varStore, threadId, threadEntry);
varStore.remove(v.name);
}
/**
 * Read every `@uwf/history/*` variable and convert it into a history record.
 *
 * The thread id is the variable name with the history prefix stripped, the
 * variable value is the head ref, and the remaining fields come from tags.
 * Missing tags fall back to `""` (workflow) and `0` (completedAt); a `reason`
 * tag other than "completed" / "cancelled" becomes `null`.
 *
 * @param varStore - Variable store to list from.
 * @returns One record per history variable, in the order `list` returned them.
 */
export function loadAllHistory(varStore: VarStore): ThreadHistoryLine[] {
  const records: ThreadHistoryLine[] = [];
  for (const variable of varStore.list({ namePrefix: HISTORY_VAR_PREFIX })) {
    const { workflow, completedAt, reason } = variable.tags;
    records.push({
      thread: variable.name.slice(HISTORY_VAR_PREFIX.length) as ThreadId,
      workflow: workflow ?? "",
      head: variable.value as CasRef,
      completedAt: Number(completedAt ?? "0"),
      reason: reason === "completed" || reason === "cancelled" ? reason : null,
    });
  }
  return records;
}
/**
 * Look up the history record of a single thread.
 *
 * Variables are listed by name prefix, so the result may include other names
 * that merely start with this thread's variable name; only the exact-name
 * match is used.
 *
 * @param varStore - Variable store to search.
 * @param threadId - Thread whose history record is wanted.
 * @returns The record, or `null` when no history variable exists for the thread.
 */
export function findHistoryEntry(varStore: VarStore, threadId: ThreadId): ThreadHistoryLine | null {
  const varName = `${HISTORY_VAR_PREFIX}${threadId}`;
  const match = varStore
    .list({ namePrefix: varName })
    .find((candidate) => candidate.name === varName);
  if (match === undefined) {
    return null;
  }

  const { workflow, completedAt, reason } = match.tags;
  return {
    thread: threadId,
    workflow: workflow ?? "",
    head: match.value as CasRef,
    completedAt: Number(completedAt ?? "0"),
    reason: reason === "completed" || reason === "cancelled" ? reason : null,
  };
}
/**
 * Persist a history record as the `@uwf/history/<thread-id>` variable.
 *
 * The head ref becomes the variable value; workflow, completion time and
 * reason are stored as string tags. A `null` reason is written as "completed".
 *
 * @param varStore - Variable store to write to.
 * @param entry - History record to persist.
 */
export function addHistoryEntry(varStore: VarStore, entry: ThreadHistoryLine): void {
  const { thread, head, workflow, completedAt, reason } = entry;
  const tags: Record<string, string> = {
    workflow,
    completedAt: String(completedAt),
    reason: reason ?? "completed",
  };
  varStore.set(`${HISTORY_VAR_PREFIX}${thread}`, head, { tags });
}
+46 -19
View File
@@ -24,13 +24,17 @@ function isOneOfSchema(fm: unknown): fm is SchemaObj & { oneOf: SchemaObj[] } {
return Array.isArray(obj.oneOf);
}
/** Check if a frontmatter schema declares "$status" as an enum (the required form for user roles). */
function hasStatusEnum(fm: unknown): boolean {
/** Check if a frontmatter schema uses enum-based multi-exit ($status with multiple enum values). */
function isEnumMultiExit(fm: unknown): boolean {
if (typeof fm !== "object" || fm === null) return false;
const obj = fm as SchemaObj;
const props = obj.properties as Record<string, SchemaObj> | undefined;
if (!props?.$status) return false;
return Array.isArray(props.$status.enum);
const statusDef = props.$status;
if (!Array.isArray(statusDef.enum)) return false;
// Filter out "_" (wildcard) — if remaining values > 1, it's multi-exit
const statuses = (statusDef.enum as string[]).filter((s) => s !== "_");
return statuses.length > 1;
}
/** Extract status values from an enum-based $status field. */
@@ -39,7 +43,7 @@ function getEnumStatuses(fm: SchemaObj): string[] {
if (!props?.$status) return [];
const statusDef = props.$status;
if (!Array.isArray(statusDef.enum)) return [];
return statusDef.enum as string[];
return (statusDef.enum as string[]).filter((s) => s !== "_");
}
/** Get property names from a schema object. */
@@ -190,19 +194,15 @@ function checkOneOfDiscriminant(
}
}
/** Check status-edge consistency for a user role. "_" is reserved for $START and rejected here. */
function checkStatusEdges(
/** Check status-edge consistency for a multi-exit role. */
function checkMultiExitEdges(
roleName: string,
graphKeys: Set<string>,
statusSet: Set<string>,
errors: string[],
): void {
if (graphKeys.has("_")) {
errors.push(`role "${roleName}" must use explicit $status keys in graph, not "_"`);
return;
}
if (statusSet.has("_")) {
errors.push(`role "${roleName}" $status enum must use explicit values, not "_"`);
errors.push(`role "${roleName}" is multi-exit but graph uses "_"`);
return;
}
@@ -255,23 +255,50 @@ function checkRoleConsistency(payload: WorkflowPayload, errors: string[]): void
const statuses = getOneOfStatuses(variants);
checkOneOfDiscriminant(roleName, variants, statuses, errors);
checkStatusEdges(roleName, graphKeys, new Set(statuses), errors);
checkMultiExitEdges(roleName, graphKeys, new Set(statuses), errors);
checkMultiExitMustache(roleName, graphEntry, variants, errors);
} else if (hasStatusEnum(fm)) {
} else if (isEnumMultiExit(fm)) {
const statuses = getEnumStatuses(fm as SchemaObj);
checkStatusEdges(roleName, graphKeys, new Set(statuses), errors);
checkMultiExitEdges(roleName, graphKeys, new Set(statuses), errors);
// For enum-based schemas, mustache vars come from the flat properties
checkEnumMustache(roleName, graphEntry, fm as SchemaObj, errors);
checkSingleExitMustache(roleName, graphEntry, fm as SchemaObj, errors);
} else {
errors.push(
`role "${roleName}" must define "$status" as an enum (or oneOf const) in frontmatter`,
);
checkSingleExitRole(roleName, graphKeys, graphEntry, fm as SchemaObj | null, errors);
}
}
}
/**
 * Validate a single-exit role: its graph entry should contain only the "_" edge,
 * and every mustache variable in that edge's prompt must be a frontmatter property.
 *
 * Problems are appended to `errors`; nothing is thrown. A role with no graph
 * keys at all produces no error here.
 *
 * @param roleName - Role being checked (used in error messages).
 * @param graphKeys - Status keys present in the role's graph entry.
 * @param graphEntry - The role's status → target map.
 * @param fm - Frontmatter schema of the role; when falsy no property names are known.
 * @param errors - Accumulator that receives error messages.
 */
function checkSingleExitRole(
  roleName: string,
  graphKeys: Set<string>,
  graphEntry: Record<string, { role: string; prompt: string }>,
  fm: SchemaObj | null,
  errors: string[],
): void {
  const hasUnitKey = graphKeys.has("_");
  if (!hasUnitKey && graphKeys.size >= 1) {
    errors.push(`role "${roleName}" is single-exit but graph has no "_" key`);
  } else if (hasUnitKey && graphKeys.size > 1) {
    errors.push(`role "${roleName}" is single-exit but has status keys other than "_"`);
  }

  const unitTarget = graphEntry._;
  if (!unitTarget) {
    return;
  }

  const promptVars = extractMustacheVars(unitTarget.prompt);
  // Truthiness check (not `!== null`) is kept so a missing frontmatter value
  // never reaches getPropertyNames.
  const knownProps = fm ? getPropertyNames(fm) : new Set<string>();
  for (const variable of promptVars) {
    // "$status" is always allowed in prompts, whether or not the schema declares it.
    const isDeclared = variable === "$status" || knownProps.has(variable);
    if (!isDeclared) {
      errors.push(`prompt variable "${variable}" not found in role "${roleName}" frontmatter`);
    }
  }
}
/** Check mustache vars in all edge prompts against flat schema properties. */
function checkEnumMustache(
function checkSingleExitMustache(
roleName: string,
graphEntry: Record<string, { role: string; prompt: string }>,
fm: SchemaObj,
+3 -13
View File
@@ -57,18 +57,9 @@ function isGraph(value: unknown): boolean {
if (!isRecord(value)) {
return false;
}
return Object.entries(value).every(([node, statusMap]) => {
if (!isRecord(statusMap)) {
return false;
}
return Object.entries(statusMap).every(([status, target]) => {
// "_" is only valid as a status key for the $START entry node.
if (status === "_" && node !== "$START") {
return false;
}
return isTarget(target);
});
});
return Object.values(value).every(
(statusMap) => isRecord(statusMap) && Object.values(statusMap).every((t) => isTarget(t)),
);
}
/**
@@ -105,7 +96,6 @@ export function checkWorkflowFilenameConsistency(
}
/** Validate YAML-parsed workflow document shape (outputSchema may be inline JSON Schema). */
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: validation function with many field checks
export function parseWorkflowPayload(raw: unknown): WorkflowPayload | null {
if (!isRecord(raw)) {
return null;
+1 -1
View File
@@ -1,6 +1,6 @@
{
"name": "@united-workforce/dashboard",
"version": "0.1.0",
"version": "0.5.0-alpha.4",
"private": true,
"type": "module",
"scripts": {
-9
View File
@@ -1,9 +0,0 @@
# @united-workforce/eval
## 0.1.2
### Patch Changes
- 850a3b2: fix: resolve --agent override via config alias before raw command
`resolveAgentConfig()` now checks `config.agents[alias]` first before falling back to `parseAgentOverride()`. Eval CLI default `--agent` changed from `"hermes"` to `"uwf-hermes"`.
@@ -1,219 +0,0 @@
import type { StepEntry } from "@united-workforce/protocol";
import { beforeEach, describe, expect, test, vi } from "vitest";
import {
runFrontmatterJudge,
runHallucinationJudge,
runTokenStatsJudge,
runUpstreamJudge,
} from "../src/judge/builtin/index.js";
// Mock the shared read-steps helper so the judges never shell out to `uwf`.
vi.mock("../src/judge/builtin/read-steps.js", () => ({
readThreadSteps: vi.fn(),
}));
import { readThreadSteps } from "../src/judge/builtin/read-steps.js";
const mockedReadSteps = vi.mocked(readThreadSteps);
/**
 * Build a step fixture: a baseline entry with any fields from `overrides`
 * replacing the corresponding defaults.
 *
 * @param overrides - Fields to replace on the baseline step.
 * @returns A fresh object on every call.
 */
function makeStep(overrides: Partial<StepEntry>): StepEntry {
  const defaults: StepEntry = {
    hash: "HASH000000000",
    role: "worker",
    output: "---\n$status: done\n---\n\nbody",
    detail: "DETAIL0000000",
    agent: "hermes",
    timestamp: 0,
    durationMs: 0,
    usage: null,
  };
  return { ...defaults, ...overrides };
}
beforeEach(() => {
mockedReadSteps.mockReset();
});
// Exercises runFrontmatterJudge against stubbed thread steps (readThreadSteps is mocked).
describe("frontmatter-compliance judge", () => {
  type InvalidStep = { stepIndex: number; role: string; errors: string[] };
  type FrontmatterData<TInvalid> = {
    stepsTotal: number;
    stepsValid: number;
    invalidSteps: TInvalid[];
  };

  test("all steps have valid frontmatter → score 1.0", async () => {
    const steps = [
      makeStep({ role: "a", output: "---\n$status: done\n---\n\nwork" }),
      makeStep({ role: "b", output: "---\n$status: needs_input\n---\nmore" }),
    ];
    mockedReadSteps.mockReturnValue(steps);

    const verdict = await runFrontmatterJudge("T1");
    const summary = verdict.data as FrontmatterData<unknown>;

    expect(verdict.score).toBe(1.0);
    expect(summary.stepsTotal).toBe(2);
    expect(summary.stepsValid).toBe(2);
    expect(summary.invalidSteps).toHaveLength(0);
  });

  test("some steps missing $status → partial score", async () => {
    const steps = [
      makeStep({ role: "a", output: "---\n$status: done\n---\nok" }),
      makeStep({ role: "b", output: "---\nfoo: bar\n---\nmissing status" }),
      makeStep({ role: "c", output: "no frontmatter at all" }),
    ];
    mockedReadSteps.mockReturnValue(steps);

    const verdict = await runFrontmatterJudge("T2");
    const summary = verdict.data as FrontmatterData<InvalidStep>;

    // One valid step out of three.
    expect(verdict.score).toBeCloseTo(1 / 3, 10);
    expect(summary.stepsTotal).toBe(3);
    expect(summary.stepsValid).toBe(1);
    expect(summary.invalidSteps).toHaveLength(2);
    expect(summary.invalidSteps[0]).toMatchObject({ stepIndex: 1, role: "b" });
    expect(summary.invalidSteps[1]).toMatchObject({ stepIndex: 2, role: "c" });
  });

  test("no steps → score 0 (0/0 edge case)", async () => {
    mockedReadSteps.mockReturnValue([]);

    const verdict = await runFrontmatterJudge("T3");
    const summary = verdict.data as FrontmatterData<unknown>;

    expect(verdict.score).toBe(0);
    expect(summary.stepsTotal).toBe(0);
    expect(summary.stepsValid).toBe(0);
    expect(summary.invalidSteps).toHaveLength(0);
  });

  test("empty-string $status counts as invalid", async () => {
    const emptyStatus = makeStep({ role: "a", output: '---\n$status: ""\n---\nx' });
    mockedReadSteps.mockReturnValue([emptyStatus]);

    const verdict = await runFrontmatterJudge("T4");

    expect(verdict.score).toBe(0);
  });

  test("parsed object output with $status → score 1.0", async () => {
    // Outputs that arrive already parsed into objects are force-cast to fit StepEntry.output.
    const steps = [
      makeStep({ role: "a", output: { $status: "done", summary: "fixed" } as unknown as string }),
      makeStep({ role: "b", output: { $status: "reviewed" } as unknown as string }),
    ];
    mockedReadSteps.mockReturnValue(steps);

    const verdict = await runFrontmatterJudge("T5");
    const summary = verdict.data as FrontmatterData<unknown>;

    expect(verdict.score).toBe(1.0);
    expect(summary.stepsTotal).toBe(2);
    expect(summary.stepsValid).toBe(2);
  });

  test("parsed object output missing $status → score 0", async () => {
    const noStatus = makeStep({
      role: "a",
      output: { summary: "no status field" } as unknown as string,
    });
    mockedReadSteps.mockReturnValue([noStatus]);

    const verdict = await runFrontmatterJudge("T6");

    expect(verdict.score).toBe(0);
  });
});
// Exercises runTokenStatsJudge: totals are summed over steps and the score is always 1.0 here.
describe("token-stats judge", () => {
  type StepUsageRow = {
    role: string;
    inputTokens: number;
    outputTokens: number;
    turns: number;
    duration: number;
  };
  type TokenStatsData<TRow> = {
    totalInput: number;
    totalOutput: number;
    totalTurns: number;
    perStep: TRow[];
  };

  test("steps with usage → sums correctly", async () => {
    const first = makeStep({
      role: "a",
      usage: { turns: 2, inputTokens: 100, outputTokens: 50, duration: 1.5 },
    });
    const second = makeStep({
      role: "b",
      usage: { turns: 3, inputTokens: 200, outputTokens: 75, duration: 2.0 },
    });
    mockedReadSteps.mockReturnValue([first, second]);

    const verdict = await runTokenStatsJudge("T1");
    const stats = verdict.data as TokenStatsData<StepUsageRow>;

    expect(verdict.score).toBe(1.0);
    expect(stats.totalInput).toBe(300);
    expect(stats.totalOutput).toBe(125);
    expect(stats.totalTurns).toBe(5);
    expect(stats.perStep).toHaveLength(2);
    expect(stats.perStep[0]).toEqual({
      role: "a",
      inputTokens: 100,
      outputTokens: 50,
      turns: 2,
      duration: 1.5,
    });
  });

  test("steps with null usage → zeros", async () => {
    const withoutUsage = ["a", "b"].map((role) => makeStep({ role, usage: null }));
    mockedReadSteps.mockReturnValue(withoutUsage);

    const verdict = await runTokenStatsJudge("T2");
    const stats = verdict.data as TokenStatsData<StepUsageRow>;

    expect(verdict.score).toBe(1.0);
    expect(stats.totalInput).toBe(0);
    expect(stats.totalOutput).toBe(0);
    expect(stats.totalTurns).toBe(0);
    expect(stats.perStep[0]).toEqual({
      role: "a",
      inputTokens: 0,
      outputTokens: 0,
      turns: 0,
      duration: 0,
    });
  });

  test("empty steps → all zeros, score 1.0", async () => {
    mockedReadSteps.mockReturnValue([]);

    const verdict = await runTokenStatsJudge("T3");
    const stats = verdict.data as TokenStatsData<unknown>;

    expect(verdict.score).toBe(1.0);
    expect(stats.totalInput).toBe(0);
    expect(stats.totalOutput).toBe(0);
    expect(stats.totalTurns).toBe(0);
    expect(stats.perStep).toHaveLength(0);
  });
});
// The two LLM-backed judges are pinned here as stubs: score 0, empty perStep, fixed schema title.
describe("LLM-as-judge stubs", () => {
  test("upstream-consumption returns a stub", async () => {
    const stub = await runUpstreamJudge("T1");

    expect(stub.score).toBe(0);
    expect(stub.data).toEqual({ perStep: [] });
    expect(stub.schema.title).toBe("@uwf/eval-judge-upstream");
  });

  test("hallucination returns a stub", async () => {
    const stub = await runHallucinationJudge("T1");

    expect(stub.score).toBe(0);
    expect(stub.data).toEqual({ perStep: [] });
    expect(stub.schema.title).toBe("@uwf/eval-judge-hallucination");
  });
});
-152
View File
@@ -1,152 +0,0 @@
import { bootstrap, createMemoryStore } from "@ocas/core";
import { describe, expect, test } from "vitest";
import type { JudgeRunner } from "../src/runner/index.js";
import { collect, computeOverall } from "../src/runner/index.js";
import type { EvalRunConfig, EvalStore } from "../src/storage/index.js";
import type { JudgeEntry, TaskManifest } from "../src/task/index.js";
/**
 * Build a JudgeEntry fixture.
 *
 * Builtin judges get a null `entry`; custom judges point at
 * `dist/judges/<name>.js`. `schema` is always null.
 */
function makeJudge(name: string, weight: number, builtin: boolean): JudgeEntry {
  const entry = builtin ? null : `dist/judges/${name}.js`;
  return { name, weight, builtin, entry, schema: null };
}
/** Build a TaskManifest fixture for the "fix-off-by-one" task around the given judges. */
function makeManifest(judges: JudgeEntry[]): TaskManifest {
  const limits = { maxSteps: 10, timeoutMinutes: 30 };
  const manifest: TaskManifest = {
    name: "fix-off-by-one",
    description: "test task",
    workflow: "solve-issue",
    prompt: "Fix the bug",
    limits,
    judges,
  };
  return manifest;
}
/** Create a bootstrapped in-memory store and expose it (plus its `var` store) as an EvalStore. */
function makeEvalStore(): EvalStore {
  const memory = createMemoryStore();
  bootstrap(memory);
  const evalStore: EvalStore = { store: memory, varStore: memory.var };
  return evalStore;
}
/** Run configuration shared by every `collect` call in this suite. */
const CONFIG: EvalRunConfig = {
  agent: "hermes",
  model: "claude-sonnet-4",
  engineVersion: "test",
};
/**
 * Build a JudgeRunner that ignores its directories and thread id and answers
 * with the scripted score for the judge's name (0 when the name is not scripted).
 */
function scriptedRunner(scores: Record<string, number>): JudgeRunner {
  return async (_taskDir, _workDir, _threadId, judge) => {
    const scripted = scores[judge.name];
    return {
      score: scripted ?? 0,
      data: { judged: judge.name },
      schema: { type: "object" },
    };
  };
}
// computeOverall: weighted average of judge scores, with zero-weight judges ignored.
describe("computeOverall", () => {
  test("computes the weighted average correctly", () => {
    const weighted = [
      { score: 0.8, weight: 0.3 },
      { score: 0.6, weight: 0.3 },
      { score: 1.0, weight: 0.4 },
    ];

    // 0.24 + 0.18 + 0.4 = 0.82
    expect(computeOverall(weighted)).toBeCloseTo(0.82, 10);
  });

  test("a weight-0 judge does not affect the result", () => {
    const scored = { score: 1.0, weight: 1.0 };
    const informational = { score: 0.0, weight: 0.0 };

    expect(computeOverall([scored, informational])).toBe(1.0);
  });

  test("returns 0 when total weight is 0", () => {
    const onlyInformational = [{ score: 0.5, weight: 0 }];

    expect(computeOverall(onlyInformational)).toBe(0);
  });
});
// Covers collect(): per-judge results, the weighted overall, and the `latest` pointer it records.
describe("collect", () => {
  /** Assemble the input object for collect() with this suite's fixed dirs, thread id and config. */
  const inputFor = (evalStore: EvalStore, manifest: TaskManifest) => ({
    evalStore,
    taskDir: "/tmp/task",
    workDir: "/tmp/work",
    threadId: "THREAD123",
    manifest,
    config: CONFIG,
  });

  test("computes weighted score correctly across judges", async () => {
    const evalStore = makeEvalStore();
    const manifest = makeManifest([
      makeJudge("test-pass", 0.6, false),
      makeJudge("code-quality", 0.4, false),
    ]);
    const runJudge = scriptedRunner({ "test-pass": 1.0, "code-quality": 0.5 });

    const outcome = await collect(inputFor(evalStore, manifest), runJudge);

    // 1.0 * 0.6 + 0.5 * 0.4 = 0.8
    expect(outcome.overall).toBeCloseTo(0.8, 10);
    expect(outcome.runHash).toBeTruthy();
    expect(outcome.judges).toHaveLength(2);
    expect(outcome.judges[0]).toEqual({ name: "test-pass", score: 1.0, weight: 0.6 });

    // The task's `latest` variable must point at the run that was just stored.
    const [latest] = evalStore.varStore.list({
      exactName: "@uwf/eval/fix-off-by-one/latest",
    });
    expect(latest?.value).toBe(outcome.runHash);
  });

  test("handles a judge with weight 0 (informational)", async () => {
    const evalStore = makeEvalStore();
    const manifest = makeManifest([
      makeJudge("test-pass", 1.0, false),
      makeJudge("token-stats", 0, true),
    ]);
    // token-stats gets a scripted score that would skew the overall if its
    // weight of 0 were not honoured.
    const runJudge = scriptedRunner({ "test-pass": 0.5, "token-stats": 1.0 });

    const outcome = await collect(inputFor(evalStore, manifest), runJudge);

    // Only test-pass (weight 1.0) counts → overall = 0.5
    expect(outcome.overall).toBeCloseTo(0.5, 10);
    expect(outcome.judges).toHaveLength(2);
    const informational = outcome.judges.find((entry) => entry.name === "token-stats");
    expect(informational?.weight).toBe(0);
  });

  test("unknown builtin judge name throws via the default runner", async () => {
    const evalStore = makeEvalStore();
    const manifest = makeManifest([makeJudge("not-a-real-judge", 1.0, true)]);

    // No runner is injected, so collect() uses its default runner; the builtin
    // dispatch is expected to reject the unknown name.
    const pending = collect(inputFor(evalStore, manifest));

    await expect(pending).rejects.toThrow(/unknown builtin judge/);
  });
});
-171
View File
@@ -1,171 +0,0 @@
import { bootstrap, createMemoryStore, putSchema } from "@ocas/core";
import type { CasRef } from "@united-workforce/protocol";
import { describe, expect, test } from "vitest";
import {
formatDiff,
formatList,
formatReport,
readEvalEntries,
readEvalRun,
selectEntries,
} from "../src/commands/index.js";
import type { EvalRunPayload, EvalStore } from "../src/storage/index.js";
import { EVAL_RUN_SCHEMA, setEvalLatest } from "../src/storage/index.js";
/** Create a bootstrapped in-memory store and expose it (plus its `var` store) as an EvalStore. */
function makeEvalStore(): EvalStore {
  const memory = createMemoryStore();
  bootstrap(memory);
  const evalStore: EvalStore = { store: memory, varStore: memory.var };
  return evalStore;
}
/**
 * Build an EvalRunPayload fixture.
 *
 * `judges` defaults to one weighted judge plus one zero-weight judge, and
 * `config` to a hermes / claude-sonnet-4 / 1.0.0 run; the thread id is fixed.
 */
function makePayload(
  task: string,
  overall: number,
  timestamp: number,
  judges: EvalRunPayload["judges"] = [
    {
      name: "frontmatter-compliance",
      score: 1.0,
      weight: 0.6,
      dataHash: "AAAAAAAAAAAAA" as CasRef,
    },
    { name: "token-stats", score: 0.5, weight: 0, dataHash: "BBBBBBBBBBBBB" as CasRef },
  ],
  config: EvalRunPayload["config"] = {
    agent: "hermes",
    model: "claude-sonnet-4",
    engineVersion: "1.0.0",
  },
): EvalRunPayload {
  const payload: EvalRunPayload = {
    task,
    config,
    threadId: "THREAD0123456789",
    judges,
    overall,
    timestamp,
  };
  return payload;
}
/**
 * Put `payload` into CAS under the eval-run schema, point
 * `@uwf/eval/<task>/latest` at it, and return the new node's hash.
 */
function storeRun(evalStore: EvalStore, payload: EvalRunPayload): string {
  const { store, varStore } = evalStore;
  const runHash = store.cas.put(putSchema(store, EVAL_RUN_SCHEMA), payload);
  setEvalLatest(varStore, payload.task, runHash);
  return runHash;
}
// formatReport rendering, plus the readEvalRun round-trip it is normally fed from.
describe("formatReport", () => {
  test("includes task, overall, config and judges", () => {
    const when = Date.UTC(2026, 0, 2, 3, 4, 5);
    const report = formatReport(makePayload("fix-off-by-one", 0.8, when), "RUNHASH123456");

    const expectedFragments = [
      "fix-off-by-one",
      "0.8000",
      "hermes",
      "claude-sonnet-4",
      "1.0.0",
      "frontmatter-compliance",
      "token-stats",
      "THREAD0123456789",
      "RUNHASH123456",
    ];
    for (const fragment of expectedFragments) {
      expect(report).toContain(fragment);
    }
  });

  test("round-trips a stored run via readEvalRun", () => {
    const evalStore = makeEvalStore();
    const runHash = storeRun(evalStore, makePayload("fix-off-by-one", 0.75, Date.now()));

    const loaded = readEvalRun(evalStore, runHash);
    expect(loaded).not.toBeNull();

    const report = formatReport(loaded as EvalRunPayload, runHash);
    expect(report).toContain("fix-off-by-one");
    expect(report).toContain("0.7500");
  });

  test("readEvalRun returns null for a missing hash", () => {
    const emptyStore = makeEvalStore();

    expect(readEvalRun(emptyStore, "NOPENOPENOPE0")).toBeNull();
  });
});
// The `list` pipeline: readEvalEntries → selectEntries (sort / filter / limit) → formatList.
describe("list", () => {
  test("lists eval runs stored under different tasks", () => {
    const evalStore = makeEvalStore();
    storeRun(evalStore, makePayload("fix-off-by-one", 0.8, 2000));
    storeRun(evalStore, makePayload("write-docs", 0.6, 1000));

    const indexed = readEvalEntries(evalStore);
    expect(indexed).toHaveLength(2);

    const table = formatList(selectEntries(indexed, null, 20));
    expect(table).toContain("fix-off-by-one");
    expect(table).toContain("write-docs");
  });

  test("sorts newest-first by timestamp", () => {
    const evalStore = makeEvalStore();
    storeRun(evalStore, makePayload("old-task", 0.5, 1000));
    storeRun(evalStore, makePayload("new-task", 0.5, 2000));

    const [newest, oldest] = selectEntries(readEvalEntries(evalStore), null, 20);

    expect(newest?.task).toBe("new-task");
    expect(oldest?.task).toBe("old-task");
  });

  test("--task filter only shows the matching task", () => {
    const evalStore = makeEvalStore();
    storeRun(evalStore, makePayload("fix-off-by-one", 0.8, 2000));
    storeRun(evalStore, makePayload("write-docs", 0.6, 1000));

    const onlyDocs = selectEntries(readEvalEntries(evalStore), "write-docs", 20);
    const table = formatList(onlyDocs);

    expect(table).toContain("write-docs");
    expect(table).not.toContain("fix-off-by-one");
  });

  test("--limit caps the number of rows", () => {
    const evalStore = makeEvalStore();
    storeRun(evalStore, makePayload("task-a", 0.8, 3000));
    storeRun(evalStore, makePayload("task-b", 0.6, 2000));
    storeRun(evalStore, makePayload("task-c", 0.4, 1000));

    const capped = selectEntries(readEvalEntries(evalStore), null, 2);

    expect(capped).toHaveLength(2);
    expect(capped.map((entry) => entry.task)).toEqual(["task-a", "task-b"]);
  });

  test("empty store renders a placeholder", () => {
    const emptyStore = makeEvalStore();

    const table = formatList(selectEntries(readEvalEntries(emptyStore), null, 20));

    expect(table).toContain("(no eval runs found)");
  });
});
// formatDiff: direction markers for score deltas and the `≠` marker for differing config.
describe("formatDiff", () => {
  const HASH_A = "HASHA00000000";
  const HASH_B = "HASHB00000000";

  test("shows an upward delta when B scores higher", () => {
    const lower = makePayload("fix-off-by-one", 0.6, 1000);
    const higher = makePayload("fix-off-by-one", 0.8, 2000);

    const rendered = formatDiff(lower, HASH_A, higher, HASH_B);

    expect(rendered).toContain("▲");
    expect(rendered).toContain("HASHA00000000");
    expect(rendered).toContain("HASHB00000000");
  });

  test("shows a downward delta when B scores lower", () => {
    const higher = makePayload("fix-off-by-one", 0.9, 1000);
    const lower = makePayload("fix-off-by-one", 0.4, 2000);

    const rendered = formatDiff(higher, HASH_A, lower, HASH_B);

    expect(rendered).toContain("▼");
  });

  test("marks differing config values", () => {
    // Same scores on both sides; only the agent differs.
    const withHermes = makePayload("fix-off-by-one", 0.6, 1000, undefined, {
      agent: "hermes",
      model: "claude-sonnet-4",
      engineVersion: "1.0.0",
    });
    const withClaudeCode = makePayload("fix-off-by-one", 0.6, 2000, undefined, {
      agent: "claude-code",
      model: "claude-sonnet-4",
      engineVersion: "1.0.0",
    });

    const rendered = formatDiff(withHermes, HASH_A, withClaudeCode, HASH_B);

    expect(rendered).toContain("≠");
    expect(rendered).toContain("claude-code");
  });
});
-74
View File
@@ -1,74 +0,0 @@
import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, beforeEach, describe, expect, test } from "vitest";
import { prepare } from "../src/runner/index.js";
// Task manifest fixture written to task.yaml: explicit limits, one builtin judge, one custom judge.
// Nested keys must be indented — at column 0 they would parse as top-level keys and leave
// `limits` empty and the judges without `weight` / `builtin` / `entry`.
const TASK_YAML = `
name: fix-off-by-one
description: Fix an off-by-one error
workflow: solve-issue
prompt: "Fix the bug"
limits:
  maxSteps: 12
  timeoutMinutes: 20
judges:
  - name: frontmatter-compliance
    weight: 0.5
    builtin: true
  - name: test-pass
    weight: 0.5
    entry: dist/judges/test-pass.js
`;
// Scratch task directory for the current test; created in beforeEach, removed in afterEach.
let taskDir: string;

// Lay out a minimal task: task.yaml plus a fixture/ tree with src/calc.ts and package.json.
beforeEach(async () => {
  const root = await mkdtemp(join(tmpdir(), "uwf-eval-task-"));
  taskDir = root;
  await writeFile(join(root, "task.yaml"), TASK_YAML, "utf8");

  const fixtureSrc = join(root, "fixture", "src");
  await mkdir(fixtureSrc, { recursive: true });
  await writeFile(join(fixtureSrc, "calc.ts"), "export const add = (a, b) => a + b + 1;\n");
  await writeFile(join(root, "fixture", "package.json"), '{ "name": "fixture" }\n');
});
// Remove the scratch task directory; `force` keeps this quiet if it is already gone.
afterEach(async function removeTaskDir() {
  await rm(taskDir, { force: true, recursive: true });
});
// prepare(): parses task.yaml and materialises a fresh temp work dir (copying fixture/ when present).
// Every test removes the work dir it caused prepare() to create, in `finally`, so a failing
// assertion cannot leave temp directories behind.
describe("prepare", () => {
  /** Recursively remove a directory created during a test; a missing path is not an error. */
  const removeDir = (dir: string): Promise<void> => rm(dir, { recursive: true, force: true });

  test("returns the parsed manifest", async () => {
    const result = await prepare(taskDir);
    try {
      expect(result.taskDir).toBe(taskDir);
      expect(result.manifest.name).toBe("fix-off-by-one");
      expect(result.manifest.workflow).toBe("solve-issue");
      expect(result.manifest.limits.maxSteps).toBe(12);
      expect(result.manifest.judges).toHaveLength(2);
    } finally {
      // prepare() creates a work dir on every call; this test previously leaked it.
      await removeDir(result.workDir);
    }
  });

  test("copies fixture into a fresh temp work dir", async () => {
    const result = await prepare(taskDir);
    try {
      expect(result.workDir).not.toBe(taskDir);
      expect(result.workDir.startsWith(tmpdir())).toBe(true);

      const calc = await readFile(join(result.workDir, "src", "calc.ts"), "utf8");
      expect(calc).toContain("export const add");
      const pkg = await readFile(join(result.workDir, "package.json"), "utf8");
      expect(pkg).toContain("fixture");
    } finally {
      await removeDir(result.workDir);
    }
  });

  test("creates an empty work dir when no fixture/ exists", async () => {
    const noFixtureDir = await mkdtemp(join(tmpdir(), "uwf-eval-nofix-"));
    let workDir: string | null = null;
    try {
      await writeFile(join(noFixtureDir, "task.yaml"), TASK_YAML, "utf8");
      const result = await prepare(noFixtureDir);
      workDir = result.workDir;
      expect(result.workDir.startsWith(tmpdir())).toBe(true);
    } finally {
      await removeDir(noFixtureDir);
      if (workDir !== null) {
        await removeDir(workDir);
      }
    }
  });
});
-63
View File
@@ -1,63 +0,0 @@
import { describe, expect, test } from "vitest";
import {
EVAL_JUDGE_FRONTMATTER_SCHEMA,
EVAL_JUDGE_HALLUCINATION_SCHEMA,
EVAL_JUDGE_TOKEN_STATS_SCHEMA,
EVAL_JUDGE_UPSTREAM_SCHEMA,
EVAL_RUN_SCHEMA,
} from "../src/storage/index.js";
// Pins the title, required fields and root type of each exported eval schema.
describe("OCAS schema definitions", () => {
  test("eval-run schema has correct title and required fields", () => {
    expect(EVAL_RUN_SCHEMA.title).toBe("@uwf/eval-run");
    const required = EVAL_RUN_SCHEMA.required as string[];
    for (const field of ["task", "config", "threadId", "judges", "overall", "timestamp"]) {
      expect(required).toContain(field);
    }
  });

  test("frontmatter judge schema has correct title", () => {
    expect(EVAL_JUDGE_FRONTMATTER_SCHEMA.title).toBe("@uwf/eval-judge-frontmatter");
    const required = EVAL_JUDGE_FRONTMATTER_SCHEMA.required as string[];
    for (const field of ["stepsTotal", "stepsValid", "invalidSteps"]) {
      expect(required).toContain(field);
    }
  });

  test("upstream judge schema has correct title", () => {
    expect(EVAL_JUDGE_UPSTREAM_SCHEMA.title).toBe("@uwf/eval-judge-upstream");
    const required = EVAL_JUDGE_UPSTREAM_SCHEMA.required as string[];
    expect(required).toContain("perStep");
  });

  test("hallucination judge schema has correct title", () => {
    expect(EVAL_JUDGE_HALLUCINATION_SCHEMA.title).toBe("@uwf/eval-judge-hallucination");
    const required = EVAL_JUDGE_HALLUCINATION_SCHEMA.required as string[];
    expect(required).toContain("perStep");
  });

  test("token-stats judge schema has correct title", () => {
    expect(EVAL_JUDGE_TOKEN_STATS_SCHEMA.title).toBe("@uwf/eval-judge-token-stats");
    const required = EVAL_JUDGE_TOKEN_STATS_SCHEMA.required as string[];
    for (const field of ["totalInput", "totalOutput", "totalTurns", "perStep"]) {
      expect(required).toContain(field);
    }
  });

  test("all schemas have type object at root", () => {
    const rootTypes = [
      EVAL_RUN_SCHEMA,
      EVAL_JUDGE_FRONTMATTER_SCHEMA,
      EVAL_JUDGE_UPSTREAM_SCHEMA,
      EVAL_JUDGE_HALLUCINATION_SCHEMA,
      EVAL_JUDGE_TOKEN_STATS_SCHEMA,
    ].map((schema) => schema.type);

    for (const rootType of rootTypes) {
      expect(rootType).toBe("object");
    }
  });
});
-163
View File
@@ -1,163 +0,0 @@
import { describe, expect, test } from "vitest";
import { parseTaskManifest } from "../src/task/index.js";
// Fully-populated manifest fixture: explicit limits, one builtin judge, one custom judge
// with entry + schema. Nested keys must be indented — at column 0 they would parse as
// top-level keys instead of belonging to `limits` or to a judge.
const VALID_YAML = `
name: fix-off-by-one
description: Fix an off-by-one error in a calculator
workflow: solve-issue
prompt: "Fix the bug: add(1,2) returns 4 instead of 3"
limits:
  maxSteps: 15
  timeoutMinutes: 30
judges:
  - name: frontmatter-compliance
    weight: 0.15
    builtin: true
  - name: test-pass
    weight: 0.3
    entry: dist/judges/test-pass.js
    schema: schemas/test-pass.json
`;
// parseTaskManifest: happy path, defaults, and the validation error for each malformed manifest.
// Inline fixtures keep judge fields indented under their list item; at column 0 they would
// parse as top-level keys and every judge would look like a non-builtin one without an entry.
describe("parseTaskManifest", () => {
  test("parses valid task.yaml", () => {
    const manifest = parseTaskManifest(VALID_YAML);
    expect(manifest.name).toBe("fix-off-by-one");
    expect(manifest.description).toBe("Fix an off-by-one error in a calculator");
    expect(manifest.workflow).toBe("solve-issue");
    expect(manifest.prompt).toBe("Fix the bug: add(1,2) returns 4 instead of 3");
    expect(manifest.limits).toEqual({ maxSteps: 15, timeoutMinutes: 30 });
    expect(manifest.judges).toHaveLength(2);
  });

  test("parses builtin judge", () => {
    const manifest = parseTaskManifest(VALID_YAML);
    const builtin = manifest.judges[0];
    expect(builtin).toBeDefined();
    expect(builtin?.name).toBe("frontmatter-compliance");
    expect(builtin?.weight).toBe(0.15);
    expect(builtin?.builtin).toBe(true);
    expect(builtin?.entry).toBeNull();
  });

  test("parses custom judge with entry + schema", () => {
    const manifest = parseTaskManifest(VALID_YAML);
    const custom = manifest.judges[1];
    expect(custom).toBeDefined();
    expect(custom?.name).toBe("test-pass");
    expect(custom?.weight).toBe(0.3);
    expect(custom?.builtin).toBe(false);
    expect(custom?.entry).toBe("dist/judges/test-pass.js");
    expect(custom?.schema).toBe("schemas/test-pass.json");
  });

  test("defaults limits when omitted", () => {
    const yaml = `
name: minimal
workflow: solve-issue
prompt: do something
judges:
  - name: check
    builtin: true
`;
    const manifest = parseTaskManifest(yaml);
    expect(manifest.limits).toEqual({ maxSteps: 20, timeoutMinutes: 30 });
  });

  test("defaults description to empty string", () => {
    const yaml = `
name: no-desc
workflow: solve-issue
prompt: do something
judges:
  - name: check
    builtin: true
`;
    const manifest = parseTaskManifest(yaml);
    expect(manifest.description).toBe("");
  });

  test("rejects missing name", () => {
    const yaml = `
workflow: solve-issue
prompt: do something
judges:
  - name: check
    builtin: true
`;
    expect(() => parseTaskManifest(yaml)).toThrow("name is required");
  });

  test("rejects missing workflow", () => {
    const yaml = `
name: test
prompt: do something
judges:
  - name: check
    builtin: true
`;
    expect(() => parseTaskManifest(yaml)).toThrow("workflow is required");
  });

  test("rejects missing prompt", () => {
    const yaml = `
name: test
workflow: solve-issue
judges:
  - name: check
    builtin: true
`;
    expect(() => parseTaskManifest(yaml)).toThrow("prompt is required");
  });

  test("rejects empty judges array", () => {
    const yaml = `
name: test
workflow: solve-issue
prompt: do something
judges: []
`;
    expect(() => parseTaskManifest(yaml)).toThrow("at least one judge");
  });

  test("rejects non-builtin judge without entry", () => {
    const yaml = `
name: test
workflow: solve-issue
prompt: do something
judges:
  - name: custom-check
    weight: 0.5
`;
    expect(() => parseTaskManifest(yaml)).toThrow("non-builtin judge must have entry");
  });

  test("rejects non-object YAML root", () => {
    expect(() => parseTaskManifest("just a string")).toThrow("must be a YAML mapping");
  });

  test("rejects judge without name", () => {
    const yaml = `
name: test
workflow: solve-issue
prompt: do something
judges:
  - weight: 0.5
    builtin: true
`;
    expect(() => parseTaskManifest(yaml)).toThrow("name is required");
  });

  test("defaults weight to 0 when omitted", () => {
    const yaml = `
name: test
workflow: solve-issue
prompt: do something
judges:
  - name: token-stats
    builtin: true
`;
    const manifest = parseTaskManifest(yaml);
    expect(manifest.judges[0]?.weight).toBe(0);
  });
});
-45
View File
@@ -1,45 +0,0 @@
{
"name": "@united-workforce/eval",
"version": "0.1.3",
"private": false,
"files": [
"src",
"dist",
"package.json"
],
"type": "module",
"bin": {
"uwf-eval": "./dist/cli.js"
},
"exports": {
".": {
"types": "./dist/index.d.ts",
"import": "./dist/index.js"
}
},
"scripts": {
"test": "vitest run __tests__/",
"test:ci": "vitest run __tests__/"
},
"dependencies": {
"@ocas/core": "^0.3.0",
"@ocas/fs": "^0.3.0",
"@united-workforce/protocol": "workspace:^",
"@united-workforce/util": "workspace:^",
"commander": "^14.0.3",
"yaml": "^2.9.0"
},
"devDependencies": {
"typescript": "^5.8.3"
},
"repository": {
"type": "git",
"url": "https://git.shazhou.work/shazhou/united-workforce.git",
"directory": "packages/eval"
},
"homepage": "https://git.shazhou.work/shazhou/united-workforce#readme",
"bugs": {
"url": "https://git.shazhou.work/shazhou/united-workforce/issues"
},
"license": "MIT"
}
-25
View File
@@ -1,25 +0,0 @@
#!/usr/bin/env node
import { Command } from "commander";
import {
registerDiffCommand,
registerListCommand,
registerReportCommand,
registerRunCommand,
} from "./commands/index.js";
// eslint-disable-next-line -- dynamic import for version
const pkg = await import("../package.json", { with: { type: "json" } });

// Root `uwf-eval` command; each register* call below attaches one subcommand to it.
const program = new Command();
program
  .name("uwf-eval")
  .description("Evaluate uwf workflow quality with real agents")
  .version(pkg.default.version, "-V, --version");

registerRunCommand(program);
registerReportCommand(program);
registerDiffCommand(program);
registerListCommand(program);

// Subcommand actions are async (e.g. `diff`, `list`), so use parseAsync: it awaits the
// selected action instead of returning while the handler's promise is still pending.
await program.parseAsync();
-38
View File
@@ -1,38 +0,0 @@
import { createLogger } from "@united-workforce/util";
import type { Command } from "commander";
import { createEvalStore } from "../storage/index.js";
import { formatDiff } from "./format.js";
import { readEvalRun } from "./read.js";
// Logger for the `diff` command, configured with a stderr sink (the rendered diff goes to stdout).
const log = createLogger({ sink: { kind: "stderr" } });
// Passed as the first argument to log() for this command.
// NOTE(review): presumably a stable log-site identifier — confirm against the logger's contract.
const LOG_DIFF = "D3WZ8N5T";
/**
 * Attach the `diff <hash1> <hash2>` subcommand to `program`.
 *
 * The action loads both eval runs and writes the comparison to stdout. A
 * missing run, or any thrown error, is reported on stderr and sets
 * `process.exitCode` to 1 rather than throwing.
 */
export function registerDiffCommand(program: Command): void {
  /** Report a failure on stderr and mark the process as failed. */
  const fail = (message: string): void => {
    process.stderr.write(`${message}\n`);
    process.exitCode = 1;
  };

  const runDiff = async (hash1: string, hash2: string): Promise<void> => {
    try {
      const evalStore = await createEvalStore();

      // Run B is only looked up once run A is known to exist.
      const left = readEvalRun(evalStore, hash1);
      if (left === null) {
        fail(`eval run not found: ${hash1}`);
        return;
      }
      const right = readEvalRun(evalStore, hash2);
      if (right === null) {
        fail(`eval run not found: ${hash2}`);
        return;
      }

      log(LOG_DIFF, `diff a=${hash1} b=${hash2}`);
      process.stdout.write(formatDiff(left, hash1, right, hash2));
    } catch (e) {
      fail(e instanceof Error ? e.message : String(e));
    }
  };

  program
    .command("diff <hash1> <hash2>")
    .description("Compare two eval runs side-by-side")
    .action(runDiff);
}
-148
View File
@@ -1,148 +0,0 @@
import type { EvalRunPayload } from "../storage/index.js";
import type { EvalListEntry } from "./types.js";
// Column widths (in characters) passed to pad() when laying out the text tables below.
// Width of task-name and judge-name columns.
const NAME_WIDTH = 28;
// Width of score columns (also reused for config labels).
const SCORE_WIDTH = 10;
// Width of the ISO timestamp column in the list table.
const TIMESTAMP_WIDTH = 26;
/** Format a 0..1 score (or weight) with fixed four-decimal precision. */
function formatScore(value: number): string {
  return value.toFixed(4);
}

/** Human-readable ISO-8601 timestamp from epoch milliseconds. */
function formatTimestamp(ms: number): string {
  return new Date(ms).toISOString();
}

/**
 * Right-pad `value` to a fixed column width. A value that already fills (or
 * overflows) the column gets a single trailing space so it never touches the
 * next column.
 */
function pad(value: string, width: number): string {
  return value.length >= width ? `${value} ` : value.padEnd(width);
}

/**
 * Directional indicator for a score delta (B relative to A).
 *
 * @returns `▲ +d` when B is higher, `▼ -d` when B is lower, `= 0.0000` otherwise.
 */
function formatDelta(delta: number): string {
  if (delta > 0) {
    return `▲ +${formatScore(delta)}`;
  }
  if (delta < 0) {
    // formatScore keeps the minus sign; the arrow mirrors the upward case.
    return `▼ ${formatScore(delta)}`;
  }
  return `= ${formatScore(0)}`;
}
/**
 * Render one eval run as a plain-text report: header, config, a judge table
 * (name / score / weight), then the thread id and run hash. The result ends
 * with a newline.
 */
export function formatReport(payload: EvalRunPayload, runHash: string): string {
  const { config, judges } = payload;

  const judgeRows = judges.map(
    (judge) =>
      ` ${pad(judge.name, NAME_WIDTH)}${pad(formatScore(judge.score), SCORE_WIDTH)}${formatScore(judge.weight)}`,
  );

  const lines: string[] = [
    "=== Eval Report ===",
    `Task: ${payload.task}`,
    `Overall: ${formatScore(payload.overall)}`,
    `Timestamp: ${formatTimestamp(payload.timestamp)}`,
    "",
    "Config:",
    ` Agent: ${config.agent}`,
    ` Model: ${config.model}`,
    ` Engine: ${config.engineVersion}`,
    "",
    "Judges:",
    ` ${pad("NAME", NAME_WIDTH)}${pad("SCORE", SCORE_WIDTH)}WEIGHT`,
    ...judgeRows,
    "",
    `Thread: ${payload.threadId}`,
    `Run: ${runHash}`,
  ];
  return `${lines.join("\n")}\n`;
}
/**
 * Render a side-by-side comparison of two eval runs: overall scores with a
 * delta, a config section marking differences, and one row per judge that
 * appears in either run. A judge missing from one side shows `—` and no delta.
 */
export function formatDiff(
  payloadA: EvalRunPayload,
  hashA: string,
  payloadB: EvalRunPayload,
  hashB: string,
): string {
  const scoresByName = (payload: EvalRunPayload) =>
    new Map(payload.judges.map((judge) => [judge.name, judge.score]));
  const cell = (score: number | undefined): string =>
    score === undefined ? "—" : formatScore(score);

  const scoresA = scoresByName(payloadA);
  const scoresB = scoresByName(payloadB);

  const judgeRows = unionJudgeNames(payloadA, payloadB).map((name) => {
    const scoreA = scoresA.get(name);
    const scoreB = scoresB.get(name);
    // A delta only makes sense when both runs scored this judge.
    const delta = scoreA === undefined || scoreB === undefined ? "" : formatDelta(scoreB - scoreA);
    return ` ${pad(name, NAME_WIDTH)}${pad(cell(scoreA), SCORE_WIDTH)}${pad(cell(scoreB), SCORE_WIDTH)}${delta}`;
  });

  const overallDelta = payloadB.overall - payloadA.overall;
  const lines: string[] = [
    "=== Eval Diff ===",
    `A: ${hashA} (${payloadA.task})`,
    `B: ${hashB} (${payloadB.task})`,
    "",
    "Overall:",
    ` A=${formatScore(payloadA.overall)} B=${formatScore(payloadB.overall)} ${formatDelta(overallDelta)}`,
    "",
    "Config:",
    configLine("Agent", payloadA.config.agent, payloadB.config.agent),
    configLine("Model", payloadA.config.model, payloadB.config.model),
    configLine("Engine", payloadA.config.engineVersion, payloadB.config.engineVersion),
    "",
    "Judges:",
    ` ${pad("NAME", NAME_WIDTH)}${pad("A", SCORE_WIDTH)}${pad("B", SCORE_WIDTH)}DELTA`,
    ...judgeRows,
  ];
  return `${lines.join("\n")}\n`;
}
/**
 * Render indexed eval runs as a table (task / overall / timestamp / hash).
 * With no entries, a placeholder row follows the header. Ends with a newline.
 */
export function formatList(entries: ReadonlyArray<EvalListEntry>): string {
  const header = ` ${pad("TASK", NAME_WIDTH)}${pad("OVERALL", SCORE_WIDTH)}${pad("TIMESTAMP", TIMESTAMP_WIDTH)}HASH`;

  const rows = entries.map(
    (entry) =>
      ` ${pad(entry.task, NAME_WIDTH)}${pad(formatScore(entry.overall), SCORE_WIDTH)}${pad(formatTimestamp(entry.timestamp), TIMESTAMP_WIDTH)}${entry.hash}`,
  );
  const body = entries.length === 0 ? [" (no eval runs found)"] : rows;

  return `${[header, ...body].join("\n")}\n`;
}
/**
 * Pick the entries to display: newest first, optionally restricted to one
 * task, optionally truncated to `limit` rows. The input array is not mutated.
 *
 * @param task  Task name to keep, or null for all tasks.
 * @param limit Maximum number of rows, or null for no cap.
 */
export function selectEntries(
  entries: ReadonlyArray<EvalListEntry>,
  task: string | null,
  limit: number | null,
): EvalListEntry[] {
  // Copy before sorting so the caller's array keeps its order.
  let selected = entries.slice().sort((left, right) => right.timestamp - left.timestamp);
  if (task !== null) {
    selected = selected.filter((candidate) => candidate.task === task);
  }
  if (limit !== null) {
    selected = selected.slice(0, limit);
  }
  return selected;
}
/**
 * Ordered union of judge names: A's names in A's order, followed by names
 * that only appear in B. Duplicates are dropped (first occurrence wins).
 */
function unionJudgeNames(payloadA: EvalRunPayload, payloadB: EvalRunPayload): string[] {
  const inOrder = [...payloadA.judges, ...payloadB.judges].map((judge) => judge.name);
  // A Set iterates in insertion order, so spreading it keeps first-seen order.
  return [...new Set<string>(inOrder)];
}
/** One config row for the diff view: marked `=` when both values match, `≠` when they differ. */
function configLine(label: string, valueA: string, valueB: string): string {
  const heading = pad(`${label}:`, SCORE_WIDTH);
  const marker = valueA !== valueB ? "≠" : "=";
  return ` ${heading}${marker} A=${valueA} B=${valueB}`;
}
-7
View File
@@ -1,7 +0,0 @@
export { registerDiffCommand } from "./diff.js";
export { formatDiff, formatList, formatReport, selectEntries } from "./format.js";
export { registerListCommand } from "./list.js";
export { readEvalEntries, readEvalRun } from "./read.js";
export { registerReportCommand } from "./report.js";
export { registerRunCommand } from "./run.js";
export type { EvalListEntry } from "./types.js";
-43
View File
@@ -1,43 +0,0 @@
import { createLogger } from "@united-workforce/util";
import type { Command } from "commander";
import { createEvalStore } from "../storage/index.js";
import { formatList, selectEntries } from "./format.js";
import { readEvalEntries } from "./read.js";
// Logger for the `list` command, configured with a stderr sink (the rendered table goes to stdout).
const log = createLogger({ sink: { kind: "stderr" } });
// Passed as the first argument to log() for this command.
// NOTE(review): presumably a stable log-site identifier — confirm against the logger's contract.
const LOG_LIST = "L5KX9R2B";
/** Raw option values received by the `list` action. */
type ListCliOptions = {
  /** `--task` value; undefined when the flag is omitted (the option declares no default). */
  task: string | undefined;
  /** `--limit` value as an unparsed string (the option defaults to "20"). */
  limit: string;
};
/**
 * Register the `list` subcommand on `program`.
 *
 * The command reads the indexed eval runs, optionally filters them by
 * `--task`, keeps at most `--limit` of the newest ones and writes the
 * formatted table to stdout. Invalid `--limit` values and any error raised
 * while reading or formatting are reported on stderr and set
 * `process.exitCode` to 1 instead of throwing.
 *
 * @param program Commander command that receives the subcommand.
 */
export function registerListCommand(program: Command): void {
  program
    .command("list")
    .description("List past eval runs")
    .option("--task <name>", "filter by task name")
    .option("--limit <n>", "max results", "20")
    .action(async (opts: ListCliOptions) => {
      // Number.parseInt on its own stops at the first non-digit, so inputs
      // such as "5abc" or "2.9" would silently become 5 and 2. Require the
      // whole value to be decimal digits before converting.
      const limitText = opts.limit.trim();
      const limit = /^\d+$/.test(limitText) ? Number.parseInt(limitText, 10) : Number.NaN;
      if (!Number.isSafeInteger(limit) || limit < 1) {
        process.stderr.write("--limit must be a positive integer\n");
        process.exitCode = 1;
        return;
      }
      try {
        const evalStore = await createEvalStore();
        const entries = readEvalEntries(evalStore);
        const task = opts.task ?? null;
        const selected = selectEntries(entries, task, limit);
        log(LOG_LIST, `list task=${task ?? "*"} found=${entries.length} shown=${selected.length}`);
        process.stdout.write(formatList(selected));
      } catch (e) {
        const message = e instanceof Error ? e.message : String(e);
        process.stderr.write(`${message}\n`);
        process.exitCode = 1;
      }
    });
}
-41
View File
@@ -1,41 +0,0 @@
import type { EvalRunPayload, EvalStore } from "../storage/index.js";
import type { EvalListEntry } from "./types.js";
/**
 * Variable prefix and suffix for eval run pointers (`@uwf/eval/<task>/latest`).
 * `readEvalEntries` only follows variables whose name both starts with the
 * prefix and ends with the suffix.
 */
const EVAL_VAR_PREFIX = "@uwf/eval/";
const EVAL_VAR_SUFFIX = "/latest";
/**
 * Look up one eval-run payload in CAS by its hash.
 *
 * @param evalStore Store whose `store.cas` is queried.
 * @param hash CAS key of the eval-run node.
 * @returns The node's payload, or null when CAS has no node for `hash`.
 */
export function readEvalRun(evalStore: EvalStore, hash: string): EvalRunPayload | null {
  const found = evalStore.store.cas.get(hash);
  // The payload is cast, not validated: the shape is trusted to match EvalRunPayload.
  return found === null ? null : (found.payload as EvalRunPayload);
}
/**
 * Build the list of indexed eval runs.
 *
 * Every variable returned by `varStore.list()` whose name matches
 * `@uwf/eval/<task>/latest` is treated as a pointer: its value is looked up in
 * CAS and summarised as one entry. Pointers whose CAS node is absent
 * (dangling) are skipped silently.
 */
export function readEvalEntries(evalStore: EvalStore): EvalListEntry[] {
  const { store, varStore } = evalStore;
  const entries: EvalListEntry[] = [];
  for (const variable of varStore.list()) {
    const isEvalPointer =
      variable.name.startsWith(EVAL_VAR_PREFIX) && variable.name.endsWith(EVAL_VAR_SUFFIX);
    if (!isEvalPointer) {
      continue;
    }
    const hash = variable.value;
    const node = store.cas.get(hash);
    if (node !== null) {
      // The payload is cast, not validated.
      const { task, overall, timestamp } = node.payload as EvalRunPayload;
      entries.push({ task, overall, timestamp, hash });
    }
  }
  return entries;
}
-32
View File
@@ -1,32 +0,0 @@
import { createLogger } from "@united-workforce/util";
import type { Command } from "commander";
import { createEvalStore } from "../storage/index.js";
import { formatReport } from "./format.js";
import { readEvalRun } from "./read.js";
/** Logger used by the `report` command; its sink is configured as stderr. */
const log = createLogger({ sink: { kind: "stderr" } });
/** Tag passed as the first argument to `log` when a `report` invocation is reported. */
const LOG_REPORT = "R7QP2M4K";
/**
 * Register the `report <hash>` subcommand on `program`.
 *
 * The command loads the eval run stored under `hash` and writes its formatted
 * report to stdout. An unknown hash, or any error raised along the way, is
 * reported on stderr and sets `process.exitCode` to 1.
 *
 * @param program Commander command that receives the subcommand.
 */
export function registerReportCommand(program: Command): void {
  const showReport = async (hash: string): Promise<void> => {
    try {
      const evalStore = await createEvalStore();
      const payload = readEvalRun(evalStore, hash);
      if (payload !== null) {
        log(LOG_REPORT, `report task=${payload.task} hash=${hash}`);
        process.stdout.write(formatReport(payload, hash));
      } else {
        process.stderr.write(`eval run not found: ${hash}\n`);
        process.exitCode = 1;
      }
    } catch (e) {
      const message = e instanceof Error ? e.message : String(e);
      process.stderr.write(`${message}\n`);
      process.exitCode = 1;
    }
  };

  program.command("report <hash>").description("Show eval run results").action(showReport);
}
-84
View File
@@ -1,84 +0,0 @@
import { resolve } from "node:path";
import type { Command } from "commander";
import type { RunResult } from "../runner/index.js";
import { collect, execute, getEngineVersion, prepare } from "../runner/index.js";
import type { EvalRunConfig } from "../storage/index.js";
import { createEvalStore } from "../storage/index.js";
/** Raw option values handed to the `run` action handler. */
type RunCliOptions = {
/** Value of `--agent <name>` (default "uwf-hermes"). */
agent: string;
/** Value of `--model <model>`; the option has no default, so it may be undefined. */
model: string | undefined;
/** Value of `--count <n>` as text (default "1"); the action converts it to an integer. */
count: string;
};
/**
 * Perform a single eval run: prepare the task, execute its workflow, then
 * collect the results into the eval store.
 *
 * @param taskDir Task location handed to `prepare`.
 * @param agent Agent name passed to `execute` and recorded in the run config.
 * @param model Model string recorded in the run config.
 * @param engineVersion Engine version recorded in the run config.
 * @returns The run hash, overall score, task name and judge results.
 */
async function runOnce(
  taskDir: string,
  agent: string,
  model: string,
  engineVersion: string,
): Promise<RunResult> {
  const prepared = await prepare(taskDir);
  const manifest = prepared.manifest;
  const workDir = prepared.workDir;

  const executed = await execute({
    workDir,
    workflow: manifest.workflow,
    prompt: manifest.prompt,
    agent,
    maxSteps: manifest.limits.maxSteps,
  });

  // The store is opened only after execution has finished, as before.
  const evalStore = await createEvalStore();
  const config: EvalRunConfig = { agent, model, engineVersion };
  const { runHash, overall, judges } = await collect({
    evalStore,
    taskDir: prepared.taskDir,
    workDir,
    threadId: executed.threadId,
    manifest,
    config,
  });

  return { runHash, overall, task: manifest.name, judges };
}
/**
 * Register the `run <task>` subcommand on `program`.
 *
 * The command resolves `task` to an absolute path, performs `--count`
 * sequential eval runs and writes one JSON line to stdout: the single result
 * object when the count is 1, otherwise an array of results. When `--model`
 * is omitted an empty string is used as the model. An invalid `--count`, or
 * any error raised during a run, is reported on stderr and sets
 * `process.exitCode` to 1.
 *
 * @param program Commander command that receives the subcommand.
 */
export function registerRunCommand(program: Command): void {
  program
    .command("run <task>")
    .description("Run eval on a task directory or tarball")
    .option("--agent <name>", "agent adapter to use", "uwf-hermes")
    .option("--model <model>", "model override")
    .option("--count <n>", "number of eval runs", "1")
    .action(async (task: string, opts: RunCliOptions) => {
      const taskDir = resolve(task);
      const agent = opts.agent;
      const model = opts.model ?? "";
      // Number.parseInt on its own stops at the first non-digit, so inputs
      // such as "3x" or "1.5" would silently become 3 and 1. Require the
      // whole value to be decimal digits before converting.
      const countText = opts.count.trim();
      const count = /^\d+$/.test(countText) ? Number.parseInt(countText, 10) : Number.NaN;
      if (!Number.isSafeInteger(count) || count < 1) {
        process.stderr.write("--count must be a positive integer\n");
        process.exitCode = 1;
        return;
      }
      try {
        // Resolved inside the try so that a failure here is reported the same
        // way as any other run error instead of escaping the handler.
        const engineVersion = getEngineVersion();
        const results: RunResult[] = [];
        // Runs are intentionally sequential: each one awaits the previous.
        for (let i = 0; i < count; i++) {
          results.push(await runOnce(taskDir, agent, model, engineVersion));
        }
        const output = count === 1 ? results[0] : results;
        process.stdout.write(`${JSON.stringify(output)}\n`);
      } catch (e) {
        const message = e instanceof Error ? e.message : String(e);
        process.stderr.write(`${message}\n`);
        process.exitCode = 1;
      }
    });
}
-9
View File
@@ -1,9 +0,0 @@
import type { CasRef } from "@united-workforce/protocol";
/** Summary row for the `list` command: one indexed eval run. */
export type EvalListEntry = {
/** Name of the task the run belongs to; `selectEntries` filters on it. */
task: string;
/** Overall score of the run. */
overall: number;
/** Run time used for newest-first ordering. NOTE(review): unit/epoch is not visible here — confirm against the writer. */
timestamp: number;
/** CAS reference under which the run node is stored. */
hash: CasRef;
};
-34
View File
@@ -1,34 +0,0 @@
// Judge types
export type { JudgeInput, JudgeOutput } from "./judge/index.js";
export type {
CollectInput,
CollectResult,
ExecuteInput,
ExecuteResult,
JudgeRunner,
JudgeRunOutput,
JudgeSummary,
PrepareResult,
RunOptions,
RunResult,
} from "./runner/index.js";
// Runner (prepare → execute → collect)
export { collect, computeOverall, execute, getEngineVersion, prepare } from "./runner/index.js";
export type {
EvalJudgeRecord,
EvalRunConfig,
EvalRunPayload,
EvalStore,
} from "./storage/index.js";
// Storage schemas and types
export {
createEvalStore,
EVAL_JUDGE_FRONTMATTER_SCHEMA,
EVAL_JUDGE_HALLUCINATION_SCHEMA,
EVAL_JUDGE_TOKEN_STATS_SCHEMA,
EVAL_JUDGE_UPSTREAM_SCHEMA,
EVAL_RUN_SCHEMA,
setEvalLatest,
} from "./storage/index.js";
export type { JudgeEntry, TaskLimits, TaskManifest } from "./task/index.js";
export { loadTaskManifest, parseTaskManifest } from "./task/index.js";
@@ -1,105 +0,0 @@
import { createLogger } from "@united-workforce/util";
import { parse as parseYaml } from "yaml";
import { EVAL_JUDGE_FRONTMATTER_SCHEMA } from "../../storage/index.js";
import { readThreadSteps } from "./read-steps.js";
import type { BuiltinJudgeOutput } from "./types.js";
/** Logger used by the frontmatter judge; its sink is configured as stderr. */
const log = createLogger({ sink: { kind: "stderr" } });
/** Tag passed as the first argument to `log` with the per-thread result summary. */
const LOG_RESULT = "F2QH7R4M";
/** Delimiter that opens and closes a frontmatter block. */
const FENCE = "---";
/** A step whose output failed frontmatter validation. */
type InvalidStep = {
/** Zero-based index of the step in the array returned by `readThreadSteps`. */
stepIndex: number;
/** Role recorded on the step. */
role: string;
/** Validation error messages for the step (never empty for a recorded invalid step). */
errors: string[];
};
/**
 * Pull the YAML text out of a leading frontmatter block.
 *
 * The output must be a string that starts with the fence followed by a
 * newline; the block ends at the first later occurrence of a newline followed
 * by the fence. Returns the text between the two, or null when the output is
 * not a string, does not open with a fence, or has no closing fence.
 *
 * NOTE: the closing fence is matched as a substring, so it is not required to
 * be alone on its line, and a block with no content between the fences
 * (`---\n---`) yields null because no newline precedes the closing fence.
 */
function extractFrontmatterYaml(output: unknown): string | null {
  const opener = `${FENCE}\n`;
  if (typeof output !== "string" || !output.startsWith(opener)) {
    return null;
  }
  const body = output.slice(opener.length);
  const end = body.indexOf(`\n${FENCE}`);
  return end === -1 ? null : body.slice(0, end);
}
/** True for non-null, non-array objects, i.e. values usable as a key/value mapping. */
function isPlainMapping(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null && !Array.isArray(value);
}

/** Errors for a mapping's `$status` field: it must be a string with non-whitespace content. */
function checkStatusField(record: Record<string, unknown>): string[] {
  const status = record.$status;
  const isValid = typeof status === "string" && status.trim() !== "";
  return isValid ? [] : ["$status field is missing or not a non-empty string"];
}

/**
 * Validate one step's frontmatter and return its errors (empty = valid).
 *
 * Two input shapes are accepted:
 * - an already-parsed object (CAS stores the extracted output as a JSON object
 *   after the extract pipeline), whose `$status` is checked directly;
 * - a raw markdown string, whose leading `---` block is extracted, parsed as
 *   YAML, required to be a mapping, and then checked for `$status`.
 */
function validateStepFrontmatter(output: unknown): string[] {
  if (isPlainMapping(output)) {
    return checkStatusField(output);
  }

  const yaml = extractFrontmatterYaml(output);
  if (yaml === null) {
    return ["output does not begin with a valid '---' frontmatter block"];
  }

  let parsed: unknown;
  try {
    parsed = parseYaml(yaml);
  } catch (e) {
    const message = e instanceof Error ? e.message : String(e);
    return [`frontmatter YAML failed to parse: ${message}`];
  }

  return isPlainMapping(parsed) ? checkStatusField(parsed) : ["frontmatter is not a YAML mapping"];
}
/**
 * Deterministic judge over a thread's steps.
 *
 * Each step's output is validated with `validateStepFrontmatter`; a step
 * counts as valid when that returns no errors. The score is
 * `stepsValid / stepsTotal`, or 0 when the thread has no steps. The returned
 * data lists the totals and every invalid step with its index, role and
 * errors.
 *
 * @param threadId Thread whose steps are read via `readThreadSteps`.
 */
export async function runFrontmatterJudge(threadId: string): Promise<BuiltinJudgeOutput> {
  const steps = readThreadSteps(threadId);

  const invalidSteps: InvalidStep[] = [];
  steps.forEach((step, stepIndex) => {
    const errors = validateStepFrontmatter(step.output);
    if (errors.length > 0) {
      invalidSteps.push({ stepIndex, role: step.role, errors });
    }
  });

  const stepsTotal = steps.length;
  const stepsValid = stepsTotal - invalidSteps.length;
  // Guard the division: an empty thread scores 0 rather than NaN.
  const score = stepsTotal === 0 ? 0 : stepsValid / stepsTotal;

  log(LOG_RESULT, `frontmatter thread=${threadId} valid=${stepsValid}/${stepsTotal}`);

  return {
    score,
    data: { stepsTotal, stepsValid, invalidSteps },
    schema: EVAL_JUDGE_FRONTMATTER_SCHEMA,
  };
}
@@ -1,17 +0,0 @@
import { EVAL_JUDGE_HALLUCINATION_SCHEMA } from "../../storage/index.js";
import type { BuiltinJudgeOutput } from "./types.js";
/**
 * LLM-as-judge placeholder for hallucination detection, i.e. claims in a
 * step's output that are not grounded in the available context.
 *
 * TODO: an LLM-as-judge needs provider config to call an LLM API. Until that
 * call path is wired up this returns a fixed stub — score 0 and an empty
 * `perStep` list — and ignores the thread id.
 */
export async function runHallucinationJudge(_threadId: string): Promise<BuiltinJudgeOutput> {
  const stub: BuiltinJudgeOutput = {
    score: 0,
    data: { perStep: [] },
    schema: EVAL_JUDGE_HALLUCINATION_SCHEMA,
  };
  return stub;
}
-6
View File
@@ -1,6 +0,0 @@
export { runFrontmatterJudge } from "./frontmatter.js";
export { runHallucinationJudge } from "./hallucination.js";
export { readThreadSteps } from "./read-steps.js";
export { runTokenStatsJudge } from "./token-stats.js";
export type { BuiltinJudge, BuiltinJudgeOutput } from "./types.js";
export { runUpstreamJudge } from "./upstream.js";
@@ -1,14 +0,0 @@
import { execFileSync } from "node:child_process";
import type { StepEntry, ThreadStepsOutput } from "@united-workforce/protocol";
/**
 * Shell out to `uwf step list <threadId>` and return the parsed step entries,
 * excluding the leading start entry.
 *
 * The command runs synchronously without a shell; its stdout is parsed as
 * JSON.
 *
 * @param threadId Thread whose steps are listed.
 * @returns Every element of the output's `steps` array after the first.
 * @throws If the command fails, its stdout is not valid JSON, or the parsed
 *   value has no `steps` array.
 */
export function readThreadSteps(threadId: string): StepEntry[] {
  const stdout = execFileSync("uwf", ["step", "list", threadId], {
    encoding: "utf8",
    stdio: ["ignore", "pipe", "pipe"],
  }).trim();
  // The CLI output is external input: check the one field relied on here
  // instead of failing later with an opaque TypeError on `.slice`.
  const parsed = JSON.parse(stdout) as Partial<ThreadStepsOutput> | null;
  const steps = parsed?.steps;
  if (!Array.isArray(steps)) {
    throw new Error(`uwf step list ${threadId}: output has no "steps" array`);
  }
  // steps[0] is the StartEntry; the rest are StepEntry records.
  return steps.slice(1) as StepEntry[];
}
@@ -1,53 +0,0 @@
import { createLogger } from "@united-workforce/util";
import { EVAL_JUDGE_TOKEN_STATS_SCHEMA } from "../../storage/index.js";
import { readThreadSteps } from "./read-steps.js";
import type { BuiltinJudgeOutput } from "./types.js";
/** Logger used by the token-stats judge; its sink is configured as stderr. */
const log = createLogger({ sink: { kind: "stderr" } });
/** Tag passed as the first argument to `log` with the per-thread totals. */
const LOG_RESULT = "T7KQ3M9P";
/** Usage figures reported for one step; all numbers are 0 when the step has null usage. */
type PerStepStats = {
/** Role recorded on the step. */
role: string;
/** Input token count copied from the step's usage. */
inputTokens: number;
/** Output token count copied from the step's usage. */
outputTokens: number;
/** Turn count copied from the step's usage. */
turns: number;
/** Duration copied from the step's usage. NOTE(review): unit is not visible here — confirm. */
duration: number;
};
/**
 * Informational judge: aggregate token usage across every step of a thread.
 *
 * Always scores 1.0 — it never penalizes a run, it only reports usage. Steps
 * whose usage is null contribute zeros. Input tokens, output tokens and turns
 * are totalled; duration is reported per step only.
 *
 * @param threadId Thread whose steps are read via `readThreadSteps`.
 */
export async function runTokenStatsJudge(threadId: string): Promise<BuiltinJudgeOutput> {
  const perStep: PerStepStats[] = [];
  let totalInput = 0;
  let totalOutput = 0;
  let totalTurns = 0;

  for (const step of readThreadSteps(threadId)) {
    const usage = step.usage;
    const stats: PerStepStats =
      usage !== null
        ? {
            role: step.role,
            inputTokens: usage.inputTokens,
            outputTokens: usage.outputTokens,
            turns: usage.turns,
            duration: usage.duration,
          }
        : { role: step.role, inputTokens: 0, outputTokens: 0, turns: 0, duration: 0 };
    totalInput += stats.inputTokens;
    totalOutput += stats.outputTokens;
    totalTurns += stats.turns;
    perStep.push(stats);
  }

  log(LOG_RESULT, `token-stats thread=${threadId} in=${totalInput} out=${totalOutput}`);

  return {
    score: 1.0,
    data: { totalInput, totalOutput, totalTurns, perStep },
    schema: EVAL_JUDGE_TOKEN_STATS_SCHEMA,
  };
}

Some files were not shown because too many files have changed in this diff Show More