Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| ede428bff2 | |||
| 6850826abe |
@@ -0,0 +1,8 @@
|
||||
# Changesets
|
||||
|
||||
Hello and welcome! This folder has been automatically generated by `@changesets/cli`, a build tool that works
|
||||
with multi-package repos, or single-package repos to help you version and publish your code. You can
|
||||
find the full documentation for it [in our repository](https://github.com/changesets/changesets).
|
||||
|
||||
We have a quick list of common questions to get you started engaging with this project in
|
||||
[our documentation](https://github.com/changesets/changesets/blob/main/docs/common-questions.md).
|
||||
@@ -0,0 +1,11 @@
|
||||
{
|
||||
"$schema": "https://unpkg.com/@changesets/config@3.1.4/schema.json",
|
||||
"changelog": "@changesets/cli/changelog",
|
||||
"commit": false,
|
||||
"fixed": [["@united-workforce/*"]],
|
||||
"linked": [],
|
||||
"access": "public",
|
||||
"baseBranch": "main",
|
||||
"updateInternalDependencies": "patch",
|
||||
"ignore": ["@united-workforce/dashboard"]
|
||||
}
|
||||
@@ -0,0 +1,30 @@
|
||||
{
|
||||
"mode": "exit",
|
||||
"tag": "alpha",
|
||||
"initialVersions": {
|
||||
"@uncaged/cli": "0.4.5",
|
||||
"@uncaged/workflow-agent-cursor": "0.4.5",
|
||||
"@uncaged/agent-hermes": "0.4.5",
|
||||
"@uncaged/workflow-agent-llm": "0.4.5",
|
||||
"@uncaged/workflow-agent-react": "0.4.5",
|
||||
"@uncaged/workflow-cas": "0.4.5",
|
||||
"@uncaged/dashboard": "0.1.0",
|
||||
"@uncaged/workflow-execute": "0.4.5",
|
||||
"@uncaged/workflow-gateway": "0.4.5",
|
||||
"@uncaged/protocol": "0.4.5",
|
||||
"@uncaged/workflow-reactor": "0.4.5",
|
||||
"@uncaged/workflow-register": "0.4.5",
|
||||
"@uncaged/workflow-runtime": "0.4.5",
|
||||
"@uncaged/workflow-template-develop": "0.4.5",
|
||||
"@uncaged/workflow-template-solve-issue": "0.4.5",
|
||||
"@uncaged/util": "0.4.5",
|
||||
"@uncaged/util-agent": "0.4.5"
|
||||
},
|
||||
"changesets": [
|
||||
"env-api-unify",
|
||||
"fix-internal-deps",
|
||||
"fix-publish-src",
|
||||
"fix-workspace-deps",
|
||||
"rfc-252-agent-fn"
|
||||
]
|
||||
}
|
||||
@@ -12,17 +12,15 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 22
|
||||
- uses: oven-sh/setup-bun@v2
|
||||
|
||||
- run: corepack enable && pnpm install
|
||||
- run: bun install
|
||||
|
||||
- name: Build
|
||||
run: pnpm run build
|
||||
run: bun run build
|
||||
|
||||
- name: Lint
|
||||
run: pnpm run check
|
||||
run: bun run check
|
||||
|
||||
- name: Test
|
||||
run: pnpm run test:ci
|
||||
run: bun run test:ci
|
||||
|
||||
@@ -1,226 +0,0 @@
|
||||
# Eval Framework Implementation Plan
|
||||
|
||||
## Goal
|
||||
|
||||
Build `uwf-eval` CLI + eval task infrastructure for evaluating uwf workflow quality with real agents.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
uwf-eval (runner) task package (npm) OCAS (storage)
|
||||
│ │ │
|
||||
├─ unpack tarball ───────► fixture/ → tmp cwd │
|
||||
├─ read task.yaml │ │
|
||||
├─ uwf thread start/exec │ │
|
||||
├─ run judges ───────────► dist/judges/*.js │
|
||||
├─ collect scores │ │
|
||||
└─ store results ─────────────────────────────────────► CAS nodes + variables
|
||||
```
|
||||
|
||||
### Key Design Decisions
|
||||
|
||||
- **uwf-eval is NOT part of uwf** — separate package, shells out to uwf CLI
|
||||
- **Task = npm package** — fixture + task.yaml + judge scripts, distributable as tarball
|
||||
- **Judge = Node script** — `node <entry> <cwd> <thread-id>`, outputs `{score, data}` JSON
|
||||
- **Every output is OCAS typed** — eval-run, judge results all have registered schemas
|
||||
- **Builtin judges** — frontmatter compliance, upstream consumption, hallucination, token stats
|
||||
- **Task-specific judges** — bundled in the task package, custom schema per judge
|
||||
|
||||
## Deliverables
|
||||
|
||||
### Phase 1: Foundation (`@united-workforce/eval`)
|
||||
|
||||
New package in the uwf monorepo.
|
||||
|
||||
```
|
||||
packages/eval/
|
||||
src/
|
||||
cli.ts # uwf-eval entry point
|
||||
commands/
|
||||
run.ts # uwf-eval run
|
||||
report.ts # uwf-eval report <hash>
|
||||
diff.ts # uwf-eval diff <hash> <hash>
|
||||
list.ts # uwf-eval list
|
||||
runner/
|
||||
prepare.ts # unpack tarball/dir → tmp cwd
|
||||
execute.ts # shell out to uwf thread start/exec
|
||||
collect.ts # run judges, collect scores
|
||||
judge/
|
||||
types.ts # JudgeInput, JudgeOutput types
|
||||
builtin/
|
||||
frontmatter.ts # frontmatter compliance check
|
||||
upstream.ts # upstream info consumption (LLM-as-judge)
|
||||
hallucination.ts # hallucination detection (LLM-as-judge)
|
||||
token-stats.ts # token usage from $usage field (#68)
|
||||
storage/
|
||||
schemas.ts # OCAS schema definitions
|
||||
store.ts # CAS read/write helpers
|
||||
index.ts # variable indexing (@uwf/eval/*)
|
||||
task/
|
||||
types.ts # TaskManifest type (task.yaml)
|
||||
loader.ts # parse task.yaml, validate
|
||||
package.json
|
||||
tsconfig.json
|
||||
```
|
||||
|
||||
#### OCAS Schemas to Register
|
||||
|
||||
1. `@uwf/eval-run` — full eval execution record
|
||||
```
|
||||
{ task, config: {agent, model, engineVersion}, threadId,
|
||||
judges: [{name, score, weight, dataHash}], overall, timestamp }
|
||||
```
|
||||
|
||||
2. `@uwf/eval-judge-frontmatter` — frontmatter judge data
|
||||
```
|
||||
{ stepsTotal, stepsValid, invalidSteps: [{stepIndex, role, errors: string[]}] }
|
||||
```
|
||||
|
||||
3. `@uwf/eval-judge-upstream` — upstream consumption judge data
|
||||
```
|
||||
{ perStep: [{role, consumed: string[], missed: string[], score}] }
|
||||
```
|
||||
|
||||
4. `@uwf/eval-judge-hallucination` — hallucination judge data
|
||||
```
|
||||
{ perStep: [{role, hallucinations: string[], score}] }
|
||||
```
|
||||
|
||||
5. `@uwf/eval-judge-token-stats` — token stats (not scored, informational)
|
||||
```
|
||||
{ totalInput, totalOutput, totalTurns, perStep: [{role, input, output, turns, duration}] }
|
||||
```
|
||||
|
||||
#### CLI Design
|
||||
|
||||
```bash
|
||||
# Run eval
|
||||
uwf-eval run <task-dir-or-tarball> [--agent hermes] [--model claude-sonnet-4] [--count 20]
|
||||
|
||||
# View results
|
||||
uwf-eval report <run-hash> # render via ocas render
|
||||
uwf-eval diff <hash1> <hash2> # side-by-side comparison
|
||||
uwf-eval list # list past runs
|
||||
```
|
||||
|
||||
### Phase 2: Task Package Scaffold
|
||||
|
||||
Template for creating eval tasks. Also serves as the first real task.
|
||||
|
||||
```
|
||||
eval-tasks/ # shazhou/uwf-eval-tasks monorepo
|
||||
packages/
|
||||
_template/ # copy-paste template
|
||||
package.json
|
||||
task.yaml
|
||||
fixture/
|
||||
src/judges/
|
||||
tsconfig.json
|
||||
fix-off-by-one/ # first real task
|
||||
package.json # @uwf-eval/fix-off-by-one
|
||||
task.yaml
|
||||
fixture/
|
||||
src/calc.ts # buggy calculator
|
||||
src/calc.test.ts # test that exposes the bug
|
||||
package.json
|
||||
src/judges/
|
||||
test-pass.ts # runs pnpm test, checks exit code
|
||||
code-quality.ts # LLM judge: minimal change, correct fix
|
||||
schemas/
|
||||
test-pass.json # OCAS schema for test-pass data
|
||||
code-quality.json # OCAS schema for code-quality data
|
||||
tsconfig.json
|
||||
pnpm-workspace.yaml
|
||||
tsconfig.json
|
||||
biome.json
|
||||
```
|
||||
|
||||
#### task.yaml Format
|
||||
|
||||
```yaml
|
||||
name: fix-off-by-one
|
||||
description: Fix an off-by-one error in a calculator's add function
|
||||
workflow: solve-issue # registered workflow name, or relative path to .yaml
|
||||
prompt: "Fix the bug: add(1,2) returns 4 instead of 3"
|
||||
limits:
|
||||
maxSteps: 15
|
||||
timeoutMinutes: 30
|
||||
judges:
|
||||
- name: frontmatter-compliance
|
||||
weight: 0.15
|
||||
builtin: true
|
||||
- name: upstream-consumption
|
||||
weight: 0.15
|
||||
builtin: true
|
||||
- name: hallucination
|
||||
weight: 0.1
|
||||
builtin: true
|
||||
- name: token-stats
|
||||
weight: 0 # informational, not scored
|
||||
builtin: true
|
||||
- name: test-pass
|
||||
weight: 0.3
|
||||
entry: dist/judges/test-pass.js
|
||||
schema: schemas/test-pass.json
|
||||
- name: code-quality
|
||||
weight: 0.3
|
||||
entry: dist/judges/code-quality.js
|
||||
schema: schemas/code-quality.json
|
||||
```
|
||||
|
||||
#### Judge Script Contract
|
||||
|
||||
```typescript
|
||||
// Input: process.argv = [node, script, cwd, threadId]
|
||||
// Output: stdout JSON
|
||||
// Exit 0 = success, non-zero = judge error (not low score)
|
||||
|
||||
import type { JudgeOutput } from "@united-workforce/eval";
|
||||
|
||||
const result: JudgeOutput<TestPassData> = {
|
||||
score: 1.0, // 0.0 - 1.0
|
||||
data: { // typed per judge schema
|
||||
command: "pnpm test",
|
||||
exitCode: 0,
|
||||
output: "3 tests passed"
|
||||
}
|
||||
};
|
||||
|
||||
console.log(JSON.stringify(result));
|
||||
```
|
||||
|
||||
### Phase 3: Prerequisite — $usage in Adapter Protocol (#68)
|
||||
|
||||
Blocked by #68. Token stats judge needs `$usage` in step nodes.
|
||||
|
||||
Phases 1 and 2 can proceed without it — the token-stats judge just returns zeros until adapters report usage.
|
||||
|
||||
## Implementation Order
|
||||
|
||||
1. **Phase 1a**: `@united-workforce/eval` package scaffold + CLI skeleton + OCAS schemas
|
||||
2. **Phase 1b**: `run` command — prepare, execute, collect flow
|
||||
3. **Phase 1c**: Builtin judges — frontmatter (deterministic), upstream + hallucination (LLM-as-judge)
|
||||
4. **Phase 2a**: Create `shazhou/uwf-eval-tasks` monorepo with proman
|
||||
5. **Phase 2b**: First task `fix-off-by-one` with fixture repo + 2 custom judges
|
||||
6. **Phase 2c**: End-to-end test: `uwf-eval run packages/fix-off-by-one --agent hermes`
|
||||
7. **Phase 1d**: `report`, `diff`, `list` commands (read from CAS, render via ocas render)
|
||||
|
||||
## Dependencies
|
||||
|
||||
- `@ocas/core` + `@ocas/fs` — CAS storage
|
||||
- `@united-workforce/protocol` — step node types
|
||||
- `commander` — CLI framework (consistent with uwf)
|
||||
- LLM API access — for LLM-as-judge (upstream, hallucination, task-specific quality judges)
|
||||
|
||||
## Open Questions
|
||||
|
||||
1. **LLM-as-judge provider config** — reuse uwf's `~/.uwf/config.yaml` provider settings? Or separate config?
|
||||
2. **Workflow file location** — task.yaml references a workflow. Should the workflow YAML be inside the tarball, or reference a registered workflow by name?
|
||||
3. **Non-coding tasks** — the debate workflow has no fixture repo. task.yaml either sets `fixture: null` or simply omits the `fixture/` dir; the runner then creates an empty cwd.
|
||||
4. **Parallel judge execution** — judges are independent, can run in parallel. Worth the complexity?
|
||||
|
||||
## Risks
|
||||
|
||||
- LLM-as-judge consistency — same input may get different scores. Mitigation: run judge multiple times, take average? Or accept variance.
|
||||
- Token cost of judges — each LLM judge call costs tokens. A 10-step workflow with 2 LLM judges means 20 LLM calls just for judging. Acceptable?
|
||||
- Fixture repo drift — if the fixture evolves, old eval runs become non-comparable. Pin fixture version in task.yaml.
|
||||
@@ -264,8 +264,7 @@ roles:
|
||||
|
||||
graph:
|
||||
$START:
|
||||
new: { role: "bootstrap", prompt: "Set up the Docker container and verify uwf is runnable." }
|
||||
resume: { role: "bootstrap", prompt: "Review the previous run output and continue the walkthrough." }
|
||||
_: { role: "bootstrap", prompt: "Set up the Docker container and verify uwf is runnable." }
|
||||
bootstrap:
|
||||
pass: { role: "config-and-registry", prompt: "Container {{{containerName}}} is ready. Validate config and workflow registration." }
|
||||
fail: { role: "$END", prompt: "Bootstrap failed: {{{error}}}. No container was created." }
|
||||
|
||||
@@ -227,8 +227,7 @@ roles:
|
||||
required: [$status, error]
|
||||
graph:
|
||||
$START:
|
||||
new: { role: "planner", prompt: "Analyze the issue and produce an implementation plan." }
|
||||
resume: { role: "planner", prompt: "Review the previous run output and continue the work." }
|
||||
_: { role: "planner", prompt: "Analyze the issue and produce an implementation plan." }
|
||||
planner:
|
||||
insufficient_info: { role: "$SUSPEND", prompt: "信息不足,需要补充:{{{reason}}}" }
|
||||
ready: { role: "developer", prompt: "Implement the TDD test spec (CAS hash: {{{plan}}}) in repo {{{repoPath}}}. Repo remote: {{{repoRemote}}}." }
|
||||
|
||||
@@ -1,25 +0,0 @@
|
||||
# Changelog
|
||||
|
||||
## 0.1.0 (2026-06-05)
|
||||
|
||||
Initial release of `@united-workforce/*` — a stateless workflow engine for AI agent orchestration.
|
||||
|
||||
### Packages
|
||||
|
||||
- **@united-workforce/protocol** — shared types (WorkflowPayload, StepNode, etc.)
|
||||
- **@united-workforce/util** — Crockford Base32, ULID, structured logger, frontmatter parsing
|
||||
- **@united-workforce/util-agent** — agent factory, context builder, extract pipeline
|
||||
- **@united-workforce/cli** — `uwf` CLI (thread lifecycle, status-based moderator, workflow registry)
|
||||
- **@united-workforce/eval** — `uwf-eval` CLI (prepare → execute → collect eval pipeline)
|
||||
- **@united-workforce/agent-hermes** — `uwf-hermes` adapter (Hermes Agent)
|
||||
- **@united-workforce/agent-claude-code** — `uwf-claude-code` adapter (Claude Code CLI)
|
||||
- **@united-workforce/agent-builtin** — `uwf-builtin` adapter (built-in LLM agent)
|
||||
- **@united-workforce/agent-mock** — `uwf-mock` adapter (deterministic test agent)
|
||||
|
||||
### Highlights
|
||||
|
||||
- Status-based graph routing (no LLM moderator cost)
|
||||
- CAS-backed immutable thread chains (`@ocas/core`)
|
||||
- Real token usage tracking (Hermes + Claude Code)
|
||||
- Eval framework with built-in judges (frontmatter, token-stats, test-pass)
|
||||
- `$SUSPEND` / resume for human-in-the-loop workflows
|
||||
@@ -222,42 +222,41 @@ Test files (`__tests__/**`) are exempt.
|
||||
|
||||
| Tool | Purpose |
|
||||
|------|---------|
|
||||
| **pnpm** | Package manager |
|
||||
| **bun** | Package manager + runtime |
|
||||
| **TypeScript** | Type checking (strict mode) |
|
||||
| **Biome** | Lint + format (replaces ESLint + Prettier) |
|
||||
| **vitest** | Test runner (all packages) |
|
||||
| **vitest** | Test runner (`cli` uses vitest; other packages use `bun test`) |
|
||||
|
||||
### Development Workflow
|
||||
|
||||
```bash
|
||||
# ── Setup ──
|
||||
pnpm install # install all workspace dependencies
|
||||
bun install # install all workspace dependencies
|
||||
|
||||
# ── Daily development ──
|
||||
pnpm run build # build all packages (dependency order)
|
||||
pnpm run check # biome check + lint-log-tags
|
||||
pnpm run typecheck # tsc --build
|
||||
pnpm run test # run tests across all packages
|
||||
bun run build # tsc --build (all packages, dependency order)
|
||||
bun run check # tsc --build + biome check + lint-log-tags
|
||||
bun run format # biome format --write
|
||||
bun test # run tests across all packages
|
||||
|
||||
# ── Before committing ──
|
||||
pnpm run check # must pass — lint + log tag validation
|
||||
pnpm run typecheck # must pass — type checking
|
||||
pnpm run test # must pass — all package tests
|
||||
bun run check # must pass — typecheck + lint + log tag validation
|
||||
bun test # must pass — all package tests
|
||||
```
|
||||
|
||||
### Publishing
|
||||
|
||||
All public `@united-workforce/*` packages are published to **npmjs.org** with **independent versioning**.
|
||||
All public `@united-workforce/*` packages are published to **npmjs.org** in **fixed mode** (all packages share the same version number).
|
||||
|
||||
```bash
|
||||
# 1. Add a changeset describing the change
|
||||
npx changeset
|
||||
bun changeset
|
||||
|
||||
# 2. Bump versions + generate CHANGELOGs
|
||||
proman bump
|
||||
# 2. Bump all package versions + generate CHANGELOGs
|
||||
bun version
|
||||
|
||||
# 3. Build, test, and publish
|
||||
proman publish
|
||||
# 3. Build, test, and publish (runs scripts/publish-all.mjs)
|
||||
bun release
|
||||
|
||||
# Or publish manually with a tag:
|
||||
node scripts/publish-all.mjs --tag alpha
|
||||
@@ -266,7 +265,7 @@ node scripts/publish-all.mjs --dry-run # preview without publishing
|
||||
|
||||
- `workspace:^` dependencies resolve to `^x.y.z` on publish
|
||||
- Publish order defined in `scripts/publish-all.mjs` (dependency order)
|
||||
- Changesets config: `.changeset/config.json` (independent versioning, public access)
|
||||
- Changesets config: `.changeset/config.json` (fixed mode, public access)
|
||||
|
||||
### End-to-end: Author → Register → Run
|
||||
|
||||
|
||||
@@ -470,7 +470,7 @@ Use the `ocas` CLI for direct CAS operations (`~/.ocas/` store, shared with `uwf
|
||||
|
||||
| Tool | Purpose |
|
||||
|------|---------|
|
||||
| **pnpm** | Package manager |
|
||||
| **bun** | Package manager + runtime |
|
||||
| **TypeScript** | Type checking (strict mode) |
|
||||
| **Biome** | Lint + format |
|
||||
| **vitest** | Test runner |
|
||||
|
||||
+3
-3
@@ -17,7 +17,7 @@ The root README should have these sections in order:
|
||||
4. **Packages** — table with ALL packages from packages/ directory, columns: Package, Description, Type (cli/lib/agent/app)
|
||||
5. **Quick Start** — install, build, register workflow, start thread, run step
|
||||
6. **CLI Reference** — brief command list, detailed usage in cli README
|
||||
7. **Development** — pnpm install / build / check / test
|
||||
7. **Development** — bun install / build / check / test
|
||||
|
||||
## Per-Package README Structure
|
||||
|
||||
@@ -26,7 +26,7 @@ Each package README should have:
|
||||
1. **Title** — package name
|
||||
2. **One-line description** — matching package.json
|
||||
3. **Overview** — what it does, where it sits in the architecture, dependencies
|
||||
4. **Installation** — pnpm add (for libs) or "included as binary" (for cli/agents)
|
||||
4. **Installation** — bun add (for libs) or "included as binary" (for cli/agents)
|
||||
5. **API** (lib packages) — all exports from src/index.ts with type signatures, grouped by category, minimal usage examples
|
||||
6. **CLI Usage** (cli/agent packages) — command reference with examples
|
||||
7. **Internal Structure** — brief src/ file organization
|
||||
@@ -56,7 +56,7 @@ For each package read:
|
||||
- All relative links work
|
||||
- Package names match package.json
|
||||
- No references to removed/renamed packages
|
||||
- pnpm run build still passes
|
||||
- bun run build still passes
|
||||
|
||||
## Guidelines
|
||||
|
||||
|
||||
@@ -200,7 +200,7 @@ payload:
|
||||
|
||||
- `roles` — 内联定义,每个 role 的 `meta` 是独立的 ocas_ref(指向 ocas 内置 JSON Schema 节点)
|
||||
- `graph` — `Record<Role | "$START", Record<Status, Target>>`,每个 Target = `{ role, prompt }`
|
||||
- Status 来自上一个 role 输出的 `$status` 字段,`$START` 使用 `new`(首次启动)和 `resume`(恢复已完成的 thread)作为 status
|
||||
- Status 来自上一个 role 输出的 `$status` 字段,`$START` 用 `_` 作为初始 status
|
||||
- Prompt 模板使用 Mustache 渲染,变量来自 lastOutput
|
||||
- 不含 agent binding — agent 配置在 `~/.uwf/config.yaml` 中管理
|
||||
|
||||
@@ -208,7 +208,7 @@ Moderator 的求值逻辑:
|
||||
|
||||
```typescript
|
||||
evaluate(graph, lastRole, lastOutput) → { role, prompt }
|
||||
// 1. status = lastOutput.$status (e.g. "new" for $START first run, "resume" for completed thread resume)
|
||||
// 1. status = lastRole === "$START" ? "_" : lastOutput.status
|
||||
// 2. target = graph[lastRole][status]
|
||||
// 3. prompt = mustache.render(target.prompt, lastOutput)
|
||||
```
|
||||
@@ -422,8 +422,8 @@ type StepNodePayload = StepRecord & {
|
||||
Moderator 使用 `evaluate(graph, lastRole, lastOutput)` 进行同步 status-based routing:
|
||||
|
||||
```typescript
|
||||
// graph[lastRole][lastOutput.$status] → Target { role, prompt }
|
||||
// $START 使用 "new"(首次启动)和 "resume"(恢复已完成 thread)作为 status
|
||||
// graph[lastRole][lastOutput.status] → Target { role, prompt }
|
||||
// $START 角色使用 "_" 作为初始 status
|
||||
// prompt 通过 Mustache 模板渲染,变量来自 lastOutput
|
||||
```
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ roles:
|
||||
type: object
|
||||
properties:
|
||||
$status:
|
||||
enum: ["done"]
|
||||
enum: ["_"]
|
||||
thesis:
|
||||
type: string
|
||||
keyPoints:
|
||||
@@ -35,7 +35,6 @@ roles:
|
||||
required: [$status, thesis, keyPoints]
|
||||
graph:
|
||||
$START:
|
||||
new: { role: "analyst", prompt: "Analyze the topic in the task and produce a structured summary with key points." }
|
||||
resume: { role: "analyst", prompt: "Review the previous analysis output and continue with additional context." }
|
||||
_: { role: "analyst", prompt: "Analyze the topic in the task and produce a structured summary with key points." }
|
||||
analyst:
|
||||
done: { role: "$END", prompt: "Analysis complete. Finish the workflow." }
|
||||
_: { role: "$END", prompt: "Analysis complete. Finish the workflow." }
|
||||
|
||||
+55
-124
@@ -1,131 +1,62 @@
|
||||
name: debate
|
||||
description: "Multi-role structured debate with critical thinking framework and host summary."
|
||||
|
||||
# Shared frontmatter schema for debater roles (YAML anchor)
|
||||
x-debater-frontmatter: &debater-frontmatter
|
||||
type: object
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: speak }
|
||||
argument: { type: string }
|
||||
required: [$status, argument]
|
||||
- properties:
|
||||
$status: { const: conceded }
|
||||
reason: { type: string }
|
||||
required: [$status, reason]
|
||||
- properties:
|
||||
$status: { const: final }
|
||||
closing: { type: string }
|
||||
required: [$status, closing]
|
||||
|
||||
name: "debate"
|
||||
description: "Structured debate between two sides. Tests cross-process session resume."
|
||||
roles:
|
||||
proponent:
|
||||
description: "Argues FOR the proposition"
|
||||
goal: "Build a compelling case for the proposition through logical reasoning and evidence"
|
||||
capabilities: []
|
||||
against:
|
||||
description: "Argues against the proposition"
|
||||
goal: |
|
||||
You are a skilled debater arguing AGAINST the proposition.
|
||||
Be logical, cite evidence, and directly address your opponent's points.
|
||||
Keep each argument concise (under 200 words).
|
||||
capabilities:
|
||||
- argumentation
|
||||
- critical-thinking
|
||||
procedure: |
|
||||
You are an experienced scholar arguing FOR the proposition.
|
||||
|
||||
## Critical Thinking Framework (execute before every speech)
|
||||
|
||||
### A. Pre-speech reflection (internal, do not output)
|
||||
- Does every step in my argument chain hold? Any hidden assumptions or logical gaps?
|
||||
- If I were my opponent, how would I attack this? Where am I weakest?
|
||||
- Does my evidence actually support my claim, or could it backfire?
|
||||
- Should I go on offense or defense this round?
|
||||
|
||||
### B. Evidence discipline
|
||||
- Verify key numbers — watch for order-of-magnitude errors
|
||||
- Assess data freshness — fast-moving fields have short half-lives
|
||||
- Distinguish primary data from secondary citations, expert opinion, and common assumptions
|
||||
|
||||
### C. Anti-fragility
|
||||
- Anticipate counterarguments; preemptively strengthen or strategically abandon weak points
|
||||
- Catch logical gaps, data misuse, or outdated claims in your opponent's reasoning
|
||||
|
||||
## Rules
|
||||
1. Check Thread Progress to see how many times you have spoken.
|
||||
2. On your 3rd speech, you MUST output $status: final (closing statement).
|
||||
3. If genuinely convinced by the opponent, output $status: conceded.
|
||||
4. Otherwise output $status: speak and counter the opponent's points.
|
||||
5. Be rigorous, cite evidence, stay concise.
|
||||
output: "Debate argument"
|
||||
frontmatter: *debater-frontmatter
|
||||
|
||||
opponent:
|
||||
description: "Argues AGAINST the proposition"
|
||||
goal: "Build a compelling case against the proposition through logical reasoning and evidence"
|
||||
capabilities: []
|
||||
procedure: |
|
||||
You are an experienced scholar arguing AGAINST the proposition.
|
||||
|
||||
## Critical Thinking Framework (execute before every speech)
|
||||
|
||||
### A. Pre-speech reflection (internal, do not output)
|
||||
- Does every step in my argument chain hold? Any hidden assumptions or logical gaps?
|
||||
- If I were my opponent, how would I attack this? Where am I weakest?
|
||||
- Does my evidence actually support my claim, or could it backfire?
|
||||
- Should I go on offense or defense this round?
|
||||
|
||||
### B. Evidence discipline
|
||||
- Verify key numbers — watch for order-of-magnitude errors
|
||||
- Assess data freshness — fast-moving fields have short half-lives
|
||||
- Distinguish primary data from secondary citations, expert opinion, and common assumptions
|
||||
|
||||
### C. Anti-fragility
|
||||
- Anticipate counterarguments; preemptively strengthen or strategically abandon weak points
|
||||
- Catch logical gaps, data misuse, or outdated claims in your opponent's reasoning
|
||||
|
||||
## Rules
|
||||
1. Check Thread Progress to see how many times you have spoken.
|
||||
2. On your 3rd speech, or when the proponent has issued a final statement, you MUST output $status: final.
|
||||
3. If genuinely convinced by the proponent, output $status: conceded.
|
||||
4. Otherwise output $status: speak and counter the proponent's points.
|
||||
5. Be rigorous, cite evidence, stay concise.
|
||||
output: "Debate argument"
|
||||
frontmatter: *debater-frontmatter
|
||||
|
||||
host:
|
||||
description: "Debate moderator — delivers impartial summary and verdict"
|
||||
goal: "Objectively review the debate, analyze both sides, and deliver a verdict"
|
||||
capabilities: []
|
||||
procedure: |
|
||||
You are an experienced academic debate moderator.
|
||||
|
||||
## Task
|
||||
1. Outline each side's core arguments
|
||||
2. Evaluate reasoning quality and evidence use
|
||||
3. Highlight the most impactful exchanges
|
||||
4. Analyze the deeper significance of the topic
|
||||
5. Deliver an overall verdict
|
||||
|
||||
## Style
|
||||
- Impartial but with independent judgment
|
||||
- Substantive, not superficial
|
||||
output: "Debate summary report"
|
||||
1. If this is the opening, present your strongest argument against the proposition.
|
||||
2. If responding to the other side, directly counter their points with evidence and logic.
|
||||
3. If you find yourself genuinely convinced by the other side, you may concede.
|
||||
output: |
|
||||
Provide your argument in the frontmatter.
|
||||
Set status to "conceded" ONLY if you are genuinely convinced and wish to stop debating.
|
||||
Otherwise set status to "continue".
|
||||
frontmatter:
|
||||
type: object
|
||||
properties:
|
||||
$status: { const: done }
|
||||
summary: { type: string }
|
||||
highlights: { type: string }
|
||||
verdict: { type: string }
|
||||
required: [$status, summary, highlights, verdict]
|
||||
|
||||
$status:
|
||||
enum: ["continue", "conceded"]
|
||||
argument:
|
||||
type: string
|
||||
required: [$status, argument]
|
||||
for:
|
||||
description: "Argues for the proposition"
|
||||
goal: |
|
||||
You are a skilled debater arguing FOR the proposition.
|
||||
Be logical, cite evidence, and directly address your opponent's points.
|
||||
Keep each argument concise (under 200 words).
|
||||
capabilities:
|
||||
- argumentation
|
||||
- critical-thinking
|
||||
procedure: |
|
||||
1. Read the opposing side's latest argument carefully.
|
||||
2. Counter their points with evidence and logic.
|
||||
3. If you find yourself genuinely convinced by the other side, you may concede.
|
||||
output: |
|
||||
Provide your argument in the frontmatter.
|
||||
Set status to "conceded" ONLY if you are genuinely convinced and wish to stop debating.
|
||||
Otherwise set status to "continue".
|
||||
frontmatter:
|
||||
type: object
|
||||
properties:
|
||||
$status:
|
||||
enum: ["continue", "conceded"]
|
||||
argument:
|
||||
type: string
|
||||
required: [$status, argument]
|
||||
graph:
|
||||
$START:
|
||||
new: { role: proponent, prompt: "The debate begins. You are arguing FOR the proposition. Present your opening argument." }
|
||||
resume: { role: proponent, prompt: "The debate continues." }
|
||||
|
||||
proponent:
|
||||
speak: { role: opponent, prompt: "Proponent argues:\n\n{{{argument}}}\n\nYou are the opponent. Counter this argument." }
|
||||
conceded: { role: host, prompt: "The proponent conceded: {{{reason}}}\n\nPlease summarize the debate." }
|
||||
final: { role: opponent, prompt: "Proponent's closing statement:\n\n{{{closing}}}\n\nYou are the opponent. Deliver your final response." }
|
||||
|
||||
opponent:
|
||||
speak: { role: proponent, prompt: "Opponent argues:\n\n{{{argument}}}\n\nYou are the proponent. Counter this argument." }
|
||||
conceded: { role: host, prompt: "The opponent conceded: {{{reason}}}\n\nPlease summarize the debate." }
|
||||
final: { role: host, prompt: "Opponent's closing statement:\n\n{{{closing}}}\n\nThe debate is over. Please summarize." }
|
||||
|
||||
host:
|
||||
done: { role: "$END", prompt: "Summary complete." }
|
||||
_: { role: "against", prompt: "Present your opening argument against the proposition." }
|
||||
against:
|
||||
conceded: { role: "$END", prompt: "The against side conceded. Debate over." }
|
||||
continue: { role: "for", prompt: "Counter the opposing argument: {{{argument}}}" }
|
||||
for:
|
||||
conceded: { role: "$END", prompt: "The for side conceded. Debate over." }
|
||||
continue: { role: "against", prompt: "Counter the opposing argument: {{{argument}}}" }
|
||||
|
||||
@@ -1,31 +0,0 @@
|
||||
name: eval-simple
|
||||
description: "Single-role eval workflow: fixer takes prompt, fixes code, done."
|
||||
roles:
|
||||
fixer:
|
||||
description: "Fixes the code based on the prompt"
|
||||
goal: |
|
||||
You are a code fixer. Read the prompt, understand the bug, fix it, and verify by running the tests.
|
||||
capabilities:
|
||||
- code-editing
|
||||
- test-running
|
||||
procedure: |
|
||||
1. Read the prompt to understand what needs to be fixed
|
||||
2. Fix the bug in the source code
|
||||
3. Run the tests mentioned in the prompt to verify
|
||||
4. Output $status=done when tests pass
|
||||
output: "Describe what you fixed and confirm tests pass. Set $status to done."
|
||||
frontmatter:
|
||||
type: object
|
||||
properties:
|
||||
$status:
|
||||
type: string
|
||||
enum: [done]
|
||||
summary:
|
||||
type: string
|
||||
required: [$status, summary]
|
||||
graph:
|
||||
$START:
|
||||
new: { role: "fixer", prompt: "Fix the code issue described in the task prompt." }
|
||||
resume: { role: "fixer", prompt: "Review the previous run output and continue fixing the code issue." }
|
||||
fixer:
|
||||
done: { role: "$END", prompt: "Fix complete." }
|
||||
@@ -215,8 +215,7 @@ roles:
|
||||
required: [$status, error]
|
||||
graph:
|
||||
$START:
|
||||
new: { role: "planner", prompt: "Analyze the issue and produce an implementation plan." }
|
||||
resume: { role: "planner", prompt: "Review the previous run output and continue the work." }
|
||||
_: { role: "planner", prompt: "Analyze the issue and produce an implementation plan." }
|
||||
planner:
|
||||
insufficient_info: { role: "$SUSPEND", prompt: "信息不足,需要补充:{{{reason}}}" }
|
||||
ready: { role: "developer", prompt: "Implement the TDD test spec (CAS hash: {{{plan}}}) in repo {{{repoPath}}}." }
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
import { mkdtemp, rm } from "node:fs/promises";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { createMemoryStore } from "@ocas/core";
|
||||
import { afterEach, beforeEach, describe, expect, test } from "vitest";
|
||||
import { createMemoryStore } from "@ocas/core";
|
||||
import { storeBuiltinDetail } from "../src/detail.js";
|
||||
import { appendSessionTurn, initSessionDir } from "../src/session.js";
|
||||
import type { BuiltinTurnPayload } from "../src/types.js";
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
import { mkdir, rm, writeFile } from "node:fs/promises";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { afterAll, beforeAll, describe, expect, it } from "vitest";
|
||||
import { describe, it, expect, beforeAll, afterAll } from "vitest";
|
||||
import { readFileTool } from "../src/tools/read-file.js";
|
||||
import { writeFile, mkdir, rm } from "node:fs/promises";
|
||||
import { join } from "node:path";
|
||||
import { tmpdir } from "node:os";
|
||||
|
||||
const testDir = join(tmpdir(), `read-file-test-${Date.now()}`);
|
||||
const ctx = { cwd: testDir, storageRoot: testDir };
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import { tmpdir } from "node:os";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { runCommandTool } from "../src/tools/run-command.js";
|
||||
import { tmpdir } from "node:os";
|
||||
|
||||
const ctx = { cwd: tmpdir(), storageRoot: tmpdir() };
|
||||
|
||||
|
||||
@@ -3,13 +3,13 @@ import { mkdtemp, rm } from "node:fs/promises";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { afterEach, beforeEach, describe, expect, test } from "vitest";
|
||||
import type { BuiltinTurnPayload } from "../src/types.js";
|
||||
import {
|
||||
appendSessionTurn,
|
||||
initSessionDir,
|
||||
readSessionTurns,
|
||||
removeSession,
|
||||
} from "../src/session.js";
|
||||
import type { BuiltinTurnPayload } from "../src/types.js";
|
||||
|
||||
describe("session", () => {
|
||||
let storageRoot: string;
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
import { readFile, rm } from "node:fs/promises";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { afterAll, describe, expect, it } from "vitest";
|
||||
import { describe, it, expect, afterAll } from "vitest";
|
||||
import { writeFileTool } from "../src/tools/write-file.js";
|
||||
import { readFile, rm } from "node:fs/promises";
|
||||
import { join } from "node:path";
|
||||
import { tmpdir } from "node:os";
|
||||
|
||||
const testDir = join(tmpdir(), `write-file-test-${Date.now()}`);
|
||||
const ctx = { cwd: testDir, storageRoot: testDir };
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@united-workforce/agent-builtin",
|
||||
"version": "0.1.2",
|
||||
"version": "0.5.0",
|
||||
"files": [
|
||||
"src",
|
||||
"dist",
|
||||
@@ -8,7 +8,7 @@
|
||||
],
|
||||
"type": "module",
|
||||
"bin": {
|
||||
"uwf-builtin": "./dist/cli.js"
|
||||
"uwf-builtin": "./src/cli.ts"
|
||||
},
|
||||
"exports": {
|
||||
".": {
|
||||
@@ -17,6 +17,7 @@
|
||||
}
|
||||
},
|
||||
"scripts": {
|
||||
"prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1",
|
||||
"test": "vitest run __tests__/",
|
||||
"test:ci": "vitest run __tests__/"
|
||||
},
|
||||
|
||||
@@ -82,13 +82,7 @@ async function runBuiltinWithMessages(
|
||||
|
||||
if (loopResult.turnCount === 0) {
|
||||
log("5RWTK9NB", "no turns produced, returning empty output");
|
||||
return {
|
||||
output: "",
|
||||
detailHash: "",
|
||||
sessionId: session.sessionId,
|
||||
assembledPrompt: "",
|
||||
usage: null,
|
||||
};
|
||||
return { output: "", detailHash: "", sessionId: session.sessionId, assembledPrompt: "" };
|
||||
}
|
||||
|
||||
// Read jsonl → persist turns to CAS → store detail
|
||||
@@ -105,7 +99,6 @@ async function runBuiltinWithMessages(
|
||||
detailHash,
|
||||
sessionId: session.sessionId,
|
||||
assembledPrompt: "",
|
||||
usage: null,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -1,11 +1,4 @@
|
||||
#!/usr/bin/env -S node --disable-warning=ExperimentalWarning
|
||||
|
||||
// eslint-disable-next-line -- dynamic import for version
|
||||
const pkg = await import("../package.json", { with: { type: "json" } });
|
||||
if (process.argv.includes("--version") || process.argv.includes("-V")) {
|
||||
process.stdout.write(`${pkg.default.version}\n`);
|
||||
process.exit(0);
|
||||
}
|
||||
#!/usr/bin/env node
|
||||
|
||||
import { createBuiltinAgent } from "./agent.js";
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@united-workforce/agent-claude-code",
|
||||
"version": "0.1.3",
|
||||
"version": "0.1.0",
|
||||
"files": [
|
||||
"src",
|
||||
"dist",
|
||||
@@ -8,7 +8,7 @@
|
||||
],
|
||||
"type": "module",
|
||||
"bin": {
|
||||
"uwf-claude-code": "./dist/cli.js"
|
||||
"uwf-claude-code": "./src/cli.ts"
|
||||
},
|
||||
"exports": {
|
||||
".": {
|
||||
@@ -17,12 +17,12 @@
|
||||
}
|
||||
},
|
||||
"scripts": {
|
||||
"prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1",
|
||||
"test": "vitest run __tests__/",
|
||||
"test:ci": "vitest run __tests__/"
|
||||
},
|
||||
"dependencies": {
|
||||
"@ocas/core": "^0.3.0",
|
||||
"@united-workforce/protocol": "workspace:^",
|
||||
"@united-workforce/util": "workspace:^",
|
||||
"@united-workforce/util-agent": "workspace:^"
|
||||
},
|
||||
|
||||
@@ -1,13 +1,11 @@
|
||||
import { spawn } from "node:child_process";
|
||||
import type { Store } from "@ocas/core";
|
||||
import type { Usage } from "@united-workforce/protocol";
|
||||
import { createLogger } from "@united-workforce/util";
|
||||
import {
|
||||
type AgentContext,
|
||||
type AgentRunResult,
|
||||
buildContinuationPrompt,
|
||||
buildRolePrompt,
|
||||
buildThreadProgress,
|
||||
createAgent,
|
||||
getCachedSessionId,
|
||||
setCachedSessionId,
|
||||
@@ -28,10 +26,6 @@ export function buildClaudeCodePrompt(ctx: AgentContext): string {
|
||||
if (ctx.outputFormatInstruction !== undefined && ctx.outputFormatInstruction !== "") {
|
||||
parts.push(ctx.outputFormatInstruction, "");
|
||||
}
|
||||
|
||||
// Inject thread progress so the agent knows step count and role visit count
|
||||
parts.push(buildThreadProgress(ctx.steps, ctx.role), "");
|
||||
|
||||
parts.push(rolePrompt, "", "## Task", ctx.start.prompt);
|
||||
|
||||
if (!ctx.isFirstVisit) {
|
||||
@@ -151,14 +145,7 @@ async function processClaudeOutput(
|
||||
);
|
||||
}
|
||||
|
||||
const usage: Usage = {
|
||||
turns: parsed.numTurns,
|
||||
inputTokens: parsed.usage.inputTokens,
|
||||
outputTokens: parsed.usage.outputTokens,
|
||||
duration: Math.round(parsed.durationMs / 1000),
|
||||
};
|
||||
|
||||
return { output, detailHash, sessionId, assembledPrompt, usage };
|
||||
return { output, detailHash, sessionId, assembledPrompt };
|
||||
}
|
||||
|
||||
// Truly unparseable output - provide enhanced error message
|
||||
|
||||
@@ -1,11 +1,4 @@
|
||||
#!/usr/bin/env -S node --disable-warning=ExperimentalWarning
|
||||
|
||||
// eslint-disable-next-line -- dynamic import for version
|
||||
const pkg = await import("../package.json", { with: { type: "json" } });
|
||||
if (process.argv.includes("--version") || process.argv.includes("-V")) {
|
||||
process.stdout.write(`${pkg.default.version}\n`);
|
||||
process.exit(0);
|
||||
}
|
||||
#!/usr/bin/env node
|
||||
|
||||
import { createClaudeCodeAgent } from "./claude-code.js";
|
||||
|
||||
|
||||
@@ -2,5 +2,5 @@
|
||||
"extends": "../../tsconfig.json",
|
||||
"compilerOptions": { "rootDir": "src", "outDir": "dist" },
|
||||
"include": ["src"],
|
||||
"references": [{ "path": "../protocol" }, { "path": "../util-agent" }]
|
||||
"references": [{ "path": "../util-agent" }]
|
||||
}
|
||||
|
||||
@@ -1,18 +0,0 @@
|
||||
# @united-workforce/agent-hermes
|
||||
|
||||
## 0.1.1
|
||||
|
||||
### Patch Changes
|
||||
|
||||
- 8085d1d: fix: read token usage from ACP PromptResponse instead of DB
|
||||
|
||||
Token counts (inputTokens, outputTokens) now come from the ACP
|
||||
`PromptResponse.usage` field, which is populated synchronously from
|
||||
`run_conversation()` return data — no WAL race condition.
|
||||
|
||||
Turns (assistant message count) still come from the DB via
|
||||
`snapshotTurns()` before/after delta.
|
||||
|
||||
Previously both tokens and turns were read from the Hermes state DB
|
||||
after the ACP prompt returned, but due to WAL write lag the DB often
|
||||
had incomplete token data at read time (e.g. 235 vs actual 26,080).
|
||||
@@ -0,0 +1,55 @@
|
||||
import { afterEach, beforeEach, describe, expect, it } from "vitest";
|
||||
import { HermesAcpClient } from "../../src/acp-client.js";
|
||||
|
||||
const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
|
||||
|
||||
describe("HermesAcpClient", () => {
|
||||
let client: HermesAcpClient;
|
||||
|
||||
beforeEach(() => {
|
||||
client = new HermesAcpClient();
|
||||
});
|
||||
|
||||
afterEach(async () => {
|
||||
await client.close();
|
||||
});
|
||||
|
||||
it(
|
||||
"connect() returns a UUID sessionId",
|
||||
async () => {
|
||||
const sessionId = await client.connect(process.cwd());
|
||||
expect(typeof sessionId).toBe("string");
|
||||
expect(sessionId).toMatch(UUID_RE);
|
||||
},
|
||||
{ timeout: 2 * 60 * 1000 },
|
||||
);
|
||||
|
||||
it(
|
||||
"prompt() returns a non-empty text response",
|
||||
async () => {
|
||||
await client.connect(process.cwd());
|
||||
const result = await client.prompt("Reply with exactly the word: PONG");
|
||||
expect(typeof result.text).toBe("string");
|
||||
expect(result.text.length).toBeGreaterThan(0);
|
||||
expect(typeof result.sessionId).toBe("string");
|
||||
expect(result.sessionId).toMatch(UUID_RE);
|
||||
},
|
||||
{ timeout: 2 * 60 * 1000 },
|
||||
);
|
||||
|
||||
it(
|
||||
"prompt() can be called twice on the same session (resume)",
|
||||
async () => {
|
||||
await client.connect(process.cwd());
|
||||
|
||||
const first = await client.prompt("Say the word ALPHA and nothing else.");
|
||||
expect(first.text.length).toBeGreaterThan(0);
|
||||
|
||||
const second = await client.prompt("Now say the word BETA and nothing else.");
|
||||
expect(second.text.length).toBeGreaterThan(0);
|
||||
|
||||
expect(first.sessionId).toBe(second.sessionId);
|
||||
},
|
||||
{ timeout: 2 * 60 * 1000 },
|
||||
);
|
||||
});
|
||||
@@ -0,0 +1,56 @@
|
||||
import { afterEach, describe, expect, it } from "vitest";
|
||||
import { HermesAcpClient } from "../../src/acp-client.js";
|
||||
|
||||
/**
|
||||
* E2E test for cross-process session resume.
|
||||
*
|
||||
* Simulates the workflow re-entry scenario:
|
||||
* 1. Client A: connect → prompt → close (developer first run)
|
||||
* 2. Client B: resume(sessionId) → prompt (developer re-entry after reviewer reject)
|
||||
*
|
||||
* This is what happens when uwf thread step spawns uwf-hermes twice for the same role.
|
||||
*/
|
||||
describe("HermesAcpClient cross-process resume", () => {
|
||||
const clients: HermesAcpClient[] = [];
|
||||
|
||||
afterEach(async () => {
|
||||
for (const c of clients) {
|
||||
await c.close();
|
||||
}
|
||||
clients.length = 0;
|
||||
});
|
||||
|
||||
// TODO(#435): flaky — depends on live LLM; mock or move to integration suite
|
||||
it.skip(
|
||||
"resume() after close — second prompt returns non-empty text",
|
||||
async () => {
|
||||
// --- Client A: first run ---
|
||||
const clientA = new HermesAcpClient();
|
||||
clients.push(clientA);
|
||||
|
||||
await clientA.connect(process.cwd());
|
||||
const first = await clientA.prompt(
|
||||
"Remember the secret code: WATERMELON. Reply with exactly: ACKNOWLEDGED",
|
||||
);
|
||||
expect(first.text.length).toBeGreaterThan(0);
|
||||
const sessionId = first.sessionId;
|
||||
|
||||
// Close client A (simulates uwf-hermes process exit)
|
||||
await clientA.close();
|
||||
|
||||
// --- Client B: resume (simulates re-entry) ---
|
||||
const clientB = new HermesAcpClient();
|
||||
clients.push(clientB);
|
||||
|
||||
await clientB.resume(sessionId, process.cwd());
|
||||
const second = await clientB.prompt(
|
||||
"What was the secret code I told you earlier? Reply with just the code word.",
|
||||
);
|
||||
|
||||
// The critical assertion: resumed session produces non-empty output
|
||||
expect(second.text.length).toBeGreaterThan(0);
|
||||
expect(second.sessionId).toBe(sessionId);
|
||||
},
|
||||
{ timeout: 3 * 60 * 1000 },
|
||||
);
|
||||
});
|
||||
@@ -15,8 +15,7 @@ describe("Issue #551 — bin entry & engines", () => {
|
||||
const pkg = JSON.parse(readFileSync(join(PKG_ROOT, "package.json"), "utf-8"));
|
||||
const binPath = pkg.bin["uwf-hermes"];
|
||||
const content = readFileSync(join(PKG_ROOT, binPath), "utf-8");
|
||||
expect(content.startsWith("#!/usr/bin/env")).toBe(true);
|
||||
expect(content).toContain("node");
|
||||
expect(content.startsWith("#!/usr/bin/env node")).toBe(true);
|
||||
});
|
||||
|
||||
test("README.md explains uwf-hermes is an adapter", () => {
|
||||
|
||||
@@ -140,9 +140,7 @@ function createTestDb(dbPath: string): TestDb {
|
||||
db.exec(`CREATE TABLE sessions (
|
||||
id TEXT PRIMARY KEY,
|
||||
model TEXT NOT NULL,
|
||||
started_at INTEGER NOT NULL,
|
||||
input_tokens INTEGER DEFAULT 0,
|
||||
output_tokens INTEGER DEFAULT 0
|
||||
started_at INTEGER NOT NULL
|
||||
)`);
|
||||
db.exec(`CREATE TABLE messages (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
|
||||
@@ -1,122 +0,0 @@
|
||||
import { describe, expect, test } from "vitest";
|
||||
import type { AcpUsage } from "../src/acp-client.js";
|
||||
import { buildUsage, snapshotTurns } from "../src/hermes.js";
|
||||
import type { HermesSessionJson } from "../src/types.js";
|
||||
|
||||
function makeSession(overrides: Partial<HermesSessionJson> = {}): HermesSessionJson {
|
||||
return {
|
||||
session_id: "test-session",
|
||||
model: "test-model",
|
||||
session_start: "2026-01-01T00:00:00Z",
|
||||
messages: [],
|
||||
inputTokens: 0,
|
||||
outputTokens: 0,
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
describe("snapshotTurns", () => {
|
||||
test("returns zero for null session", () => {
|
||||
const result = snapshotTurns(null);
|
||||
expect(result).toEqual({ turns: 0 });
|
||||
});
|
||||
|
||||
test("returns zero for empty session", () => {
|
||||
const result = snapshotTurns(makeSession());
|
||||
expect(result).toEqual({ turns: 0 });
|
||||
});
|
||||
|
||||
test("counts assistant messages as turns", () => {
|
||||
const result = snapshotTurns(
|
||||
makeSession({
|
||||
messages: [
|
||||
{ role: "user", content: "hello", reasoning: null, tool_calls: null },
|
||||
{ role: "assistant", content: "hi", reasoning: null, tool_calls: null },
|
||||
{ role: "user", content: "do X", reasoning: null, tool_calls: null },
|
||||
{ role: "tool", content: "result", reasoning: null, tool_calls: null },
|
||||
{ role: "assistant", content: "done", reasoning: null, tool_calls: null },
|
||||
],
|
||||
inputTokens: 1000,
|
||||
outputTokens: 500,
|
||||
}),
|
||||
);
|
||||
expect(result).toEqual({ turns: 2 });
|
||||
});
|
||||
|
||||
test("ignores non-assistant messages for turn count", () => {
|
||||
const result = snapshotTurns(
|
||||
makeSession({
|
||||
messages: [
|
||||
{ role: "user", content: "hello", reasoning: null, tool_calls: null },
|
||||
{ role: "tool", content: "result", reasoning: null, tool_calls: null },
|
||||
],
|
||||
}),
|
||||
);
|
||||
expect(result.turns).toBe(0);
|
||||
});
|
||||
});
|
||||
|
||||
describe("buildUsage", () => {
|
||||
const acpUsage: AcpUsage = { inputTokens: 5000, outputTokens: 2000, totalTokens: 7000 };
|
||||
|
||||
test("first visit: tokens from ACP, turns from DB delta", () => {
|
||||
const beforeTurns = { turns: 0 };
|
||||
const afterTurns = { turns: 3 };
|
||||
const result = buildUsage(acpUsage, beforeTurns, afterTurns, 12.5);
|
||||
expect(result).toEqual({
|
||||
turns: 3,
|
||||
inputTokens: 5000,
|
||||
outputTokens: 2000,
|
||||
duration: 13,
|
||||
});
|
||||
});
|
||||
|
||||
test("re-entry: turn delta computed correctly, tokens from ACP", () => {
|
||||
const beforeTurns = { turns: 2 };
|
||||
const afterTurns = { turns: 4 };
|
||||
const acpDelta: AcpUsage = { inputTokens: 8000, outputTokens: 3500, totalTokens: 11500 };
|
||||
const result = buildUsage(acpDelta, beforeTurns, afterTurns, 7.3);
|
||||
expect(result).toEqual({
|
||||
turns: 2,
|
||||
inputTokens: 8000,
|
||||
outputTokens: 3500,
|
||||
duration: 7,
|
||||
});
|
||||
});
|
||||
|
||||
test("floors negative turn deltas at 0, then defaults to 1", () => {
|
||||
const beforeTurns = { turns: 5 };
|
||||
const afterTurns = { turns: 3 };
|
||||
const result = buildUsage(acpUsage, beforeTurns, afterTurns, 1.0);
|
||||
// turns would be negative (-2), floored to 0, then || 1 gives 1
|
||||
expect(result.turns).toBe(1);
|
||||
});
|
||||
|
||||
test("zero turns delta defaults to 1 (at least one turn happened)", () => {
|
||||
const beforeTurns = { turns: 3 };
|
||||
const afterTurns = { turns: 3 };
|
||||
const result = buildUsage(acpUsage, beforeTurns, afterTurns, 5.0);
|
||||
// turns delta is 0, || 1 gives 1
|
||||
expect(result.turns).toBe(1);
|
||||
});
|
||||
|
||||
test("null ACP usage yields zero tokens", () => {
|
||||
const beforeTurns = { turns: 0 };
|
||||
const afterTurns = { turns: 2 };
|
||||
const result = buildUsage(null, beforeTurns, afterTurns, 10.0);
|
||||
expect(result).toEqual({
|
||||
turns: 2,
|
||||
inputTokens: 0,
|
||||
outputTokens: 0,
|
||||
duration: 10,
|
||||
});
|
||||
});
|
||||
|
||||
test("duration is rounded", () => {
|
||||
const beforeTurns = { turns: 0 };
|
||||
const afterTurns = { turns: 1 };
|
||||
expect(buildUsage(acpUsage, beforeTurns, afterTurns, 3.7).duration).toBe(4);
|
||||
expect(buildUsage(acpUsage, beforeTurns, afterTurns, 3.2).duration).toBe(3);
|
||||
expect(buildUsage(acpUsage, beforeTurns, afterTurns, 0.0).duration).toBe(0);
|
||||
});
|
||||
});
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@united-workforce/agent-hermes",
|
||||
"version": "0.1.4",
|
||||
"version": "0.5.0",
|
||||
"files": [
|
||||
"src",
|
||||
"dist",
|
||||
@@ -8,7 +8,7 @@
|
||||
],
|
||||
"type": "module",
|
||||
"bin": {
|
||||
"uwf-hermes": "./dist/cli.js"
|
||||
"uwf-hermes": "./src/cli.ts"
|
||||
},
|
||||
"exports": {
|
||||
".": {
|
||||
@@ -17,6 +17,7 @@
|
||||
}
|
||||
},
|
||||
"scripts": {
|
||||
"prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1",
|
||||
"test": "vitest run __tests__/",
|
||||
"test:ci": "vitest run __tests__/"
|
||||
},
|
||||
|
||||
@@ -1,22 +1,8 @@
|
||||
import type { ChildProcess } from "node:child_process";
|
||||
import { spawn } from "node:child_process";
|
||||
import { readFileSync } from "node:fs";
|
||||
import { dirname, join } from "node:path";
|
||||
import { createInterface } from "node:readline";
|
||||
import { fileURLToPath } from "node:url";
|
||||
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
const OWN_VERSION = (
|
||||
JSON.parse(readFileSync(join(__dirname, "..", "package.json"), "utf-8")) as {
|
||||
version: string;
|
||||
}
|
||||
).version;
|
||||
|
||||
/** Resolve hermes binary: `UWF_HERMES_BIN` override → default `"hermes"` via PATH. */
|
||||
function resolveHermesCommand(): string {
|
||||
const override = process.env.UWF_HERMES_BIN;
|
||||
return override !== undefined && override !== "" ? override : "hermes";
|
||||
}
|
||||
const HERMES_COMMAND = "hermes";
|
||||
const PROTOCOL_VERSION = 1;
|
||||
|
||||
type JsonRpcResponse = {
|
||||
@@ -31,17 +17,9 @@ type PendingRequest = {
|
||||
reject: (reason: Error) => void;
|
||||
};
|
||||
|
||||
/** Token usage returned by ACP PromptResponse. */
|
||||
export type AcpUsage = {
|
||||
inputTokens: number;
|
||||
outputTokens: number;
|
||||
totalTokens: number;
|
||||
};
|
||||
|
||||
export type AcpPromptResult = {
|
||||
text: string;
|
||||
sessionId: string;
|
||||
usage: AcpUsage | null;
|
||||
};
|
||||
|
||||
export class HermesAcpClient {
|
||||
@@ -94,11 +72,6 @@ export class HermesAcpClient {
|
||||
return sessionId;
|
||||
}
|
||||
|
||||
/** Return the current session ID, or null if not connected. */
|
||||
getSessionId(): string | null {
|
||||
return this.sessionId;
|
||||
}
|
||||
|
||||
/** Send prompt and collect final assistant text from ACP stream chunks. */
|
||||
async prompt(text: string): Promise<AcpPromptResult> {
|
||||
if (this.sessionId === null) {
|
||||
@@ -118,25 +91,9 @@ export class HermesAcpClient {
|
||||
);
|
||||
}
|
||||
|
||||
// Extract token usage from ACP PromptResponse.result.usage (camelCase wire format)
|
||||
const result = (response as { result?: Record<string, unknown> }).result;
|
||||
const rawUsage = result?.usage as Record<string, unknown> | undefined;
|
||||
const usage: AcpUsage | null =
|
||||
rawUsage !== undefined &&
|
||||
typeof rawUsage.inputTokens === "number" &&
|
||||
typeof rawUsage.outputTokens === "number" &&
|
||||
typeof rawUsage.totalTokens === "number"
|
||||
? {
|
||||
inputTokens: rawUsage.inputTokens,
|
||||
outputTokens: rawUsage.outputTokens,
|
||||
totalTokens: rawUsage.totalTokens,
|
||||
}
|
||||
: null;
|
||||
|
||||
return {
|
||||
text: this.messageChunks.join(""),
|
||||
sessionId: this.sessionId,
|
||||
usage,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -275,8 +232,7 @@ export class HermesAcpClient {
|
||||
return;
|
||||
}
|
||||
|
||||
const hermesCommand = resolveHermesCommand();
|
||||
const child = spawn(hermesCommand, ["acp"], {
|
||||
const child = spawn(HERMES_COMMAND, ["acp"], {
|
||||
env: process.env,
|
||||
shell: false,
|
||||
stdio: ["pipe", "pipe", "pipe"],
|
||||
@@ -314,7 +270,7 @@ export class HermesAcpClient {
|
||||
private async initialize(): Promise<void> {
|
||||
const initResponse = await this.sendRequest("initialize", {
|
||||
protocolVersion: PROTOCOL_VERSION,
|
||||
clientInfo: { name: "uwf-hermes", version: OWN_VERSION },
|
||||
clientInfo: { name: "uwf", version: "0.1.0" },
|
||||
capabilities: {},
|
||||
});
|
||||
|
||||
|
||||
@@ -1,11 +1,4 @@
|
||||
#!/usr/bin/env -S node --disable-warning=ExperimentalWarning
|
||||
|
||||
// eslint-disable-next-line -- dynamic import for version
|
||||
const pkg = await import("../package.json", { with: { type: "json" } });
|
||||
if (process.argv.includes("--version") || process.argv.includes("-V")) {
|
||||
process.stdout.write(`${pkg.default.version}\n`);
|
||||
process.exit(0);
|
||||
}
|
||||
#!/usr/bin/env node
|
||||
|
||||
import { createHermesAgent } from "./hermes.js";
|
||||
import { isResumeDisabled } from "./session-cache.js";
|
||||
|
||||
@@ -1,58 +1,19 @@
|
||||
import type { Store } from "@ocas/core";
|
||||
import type { Usage } from "@united-workforce/protocol";
|
||||
import { createLogger } from "@united-workforce/util";
|
||||
import {
|
||||
type AgentContext,
|
||||
type AgentRunResult,
|
||||
buildContinuationPrompt,
|
||||
buildRolePrompt,
|
||||
buildThreadProgress,
|
||||
createAgent,
|
||||
} from "@united-workforce/util-agent";
|
||||
import type { AcpUsage } from "./acp-client.js";
|
||||
|
||||
import { HermesAcpClient } from "./acp-client.js";
|
||||
import { getCachedSessionId, setCachedSessionId } from "./session-cache.js";
|
||||
import { loadHermesSession, storeHermesSessionDetail } from "./session-detail.js";
|
||||
import type { HermesSessionJson } from "./types.js";
|
||||
|
||||
const log = createLogger({ sink: { kind: "stderr" } });
|
||||
|
||||
/** Snapshot of session metrics taken before and after a prompt call. */
|
||||
type TurnsSnapshot = {
|
||||
turns: number;
|
||||
};
|
||||
|
||||
const ZERO_TURNS: TurnsSnapshot = { turns: 0 };
|
||||
|
||||
/** Extract assistant turn count from a session. Returns zero for null sessions. */
|
||||
export function snapshotTurns(session: HermesSessionJson | null): TurnsSnapshot {
|
||||
if (session === null) {
|
||||
return ZERO_TURNS;
|
||||
}
|
||||
return {
|
||||
turns: session.messages.filter((m) => m.role === "assistant").length,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Build Usage from ACP token data + DB turn delta.
|
||||
* Tokens come from ACP PromptResponse (synchronous, accurate).
|
||||
* Turns come from DB before/after snapshots (may have WAL lag, but acceptable).
|
||||
*/
|
||||
export function buildUsage(
|
||||
acpUsage: AcpUsage | null,
|
||||
beforeTurns: TurnsSnapshot,
|
||||
afterTurns: TurnsSnapshot,
|
||||
durationSec: number,
|
||||
): Usage {
|
||||
return {
|
||||
turns: Math.max(0, afterTurns.turns - beforeTurns.turns) || 1,
|
||||
inputTokens: acpUsage?.inputTokens ?? 0,
|
||||
outputTokens: acpUsage?.outputTokens ?? 0,
|
||||
duration: Math.round(durationSec),
|
||||
};
|
||||
}
|
||||
|
||||
/** Assemble system prompt, task, and prior step outputs for Hermes. */
|
||||
export function buildHermesPrompt(ctx: AgentContext): string {
|
||||
const parts: string[] = [];
|
||||
@@ -61,9 +22,6 @@ export function buildHermesPrompt(ctx: AgentContext): string {
|
||||
parts.push(ctx.outputFormatInstruction, "");
|
||||
}
|
||||
|
||||
// Inject thread progress so the agent knows step count and role visit count
|
||||
parts.push(buildThreadProgress(ctx.steps, ctx.role), "");
|
||||
|
||||
if (!ctx.isFirstVisit) {
|
||||
// Re-entry: show only steps since last visit, meta only
|
||||
parts.push(buildContinuationPrompt(ctx.steps, ctx.role, ctx.edgePrompt));
|
||||
@@ -150,45 +108,25 @@ export function createHermesAgent(resumeDisabled: boolean): () => Promise<void>
|
||||
void client.close();
|
||||
});
|
||||
|
||||
async function runPrompt(
|
||||
ctx: AgentContext,
|
||||
useContinuation: boolean,
|
||||
beforeTurns: TurnsSnapshot,
|
||||
): Promise<AgentRunResult> {
|
||||
async function runPrompt(ctx: AgentContext, useContinuation: boolean): Promise<AgentRunResult> {
|
||||
const effectiveCtx = useContinuation ? ctx : { ...ctx, isFirstVisit: true };
|
||||
const fullPrompt = buildHermesPrompt(effectiveCtx);
|
||||
const startMs = Date.now();
|
||||
const { text, sessionId, usage: acpUsage } = await client.prompt(fullPrompt);
|
||||
const durationSec = (Date.now() - startMs) / 1000;
|
||||
const { text, sessionId } = await client.prompt(fullPrompt);
|
||||
const { detailHash } = await storePromptResult(ctx.store, sessionId);
|
||||
|
||||
if (!resumeDisabled) {
|
||||
await setCachedSessionId(ctx.threadId, ctx.role, sessionId, ctx.storageRoot);
|
||||
}
|
||||
|
||||
// Turns from DB (may lag slightly due to WAL, but acceptable)
|
||||
const afterSession = await loadHermesSession(sessionId);
|
||||
const afterTurns = snapshotTurns(afterSession);
|
||||
const usage = buildUsage(acpUsage, beforeTurns, afterTurns, durationSec);
|
||||
|
||||
return { output: text, detailHash, sessionId, assembledPrompt: fullPrompt, usage };
|
||||
return { output: text, detailHash, sessionId, assembledPrompt: fullPrompt };
|
||||
}
|
||||
|
||||
async function runHermes(ctx: AgentContext): Promise<AgentRunResult> {
|
||||
const cwd = process.cwd();
|
||||
const attempt = await prepareSession(client, ctx, cwd, resumeDisabled);
|
||||
|
||||
// Snapshot before prompt: for resumed sessions, captures cumulative state
|
||||
// so we can compute the turn delta. For new sessions, this is ZERO_TURNS.
|
||||
const currentSessionId = client.getSessionId();
|
||||
const beforeSession =
|
||||
attempt.resumed && currentSessionId !== null
|
||||
? await loadHermesSession(currentSessionId)
|
||||
: null;
|
||||
const beforeTurns = snapshotTurns(beforeSession);
|
||||
|
||||
try {
|
||||
return await runPrompt(ctx, attempt.useContinuation, beforeTurns);
|
||||
return await runPrompt(ctx, attempt.useContinuation);
|
||||
} catch (error) {
|
||||
if (!attempt.resumed) {
|
||||
throw error;
|
||||
@@ -198,8 +136,7 @@ export function createHermesAgent(resumeDisabled: boolean): () => Promise<void>
|
||||
log("8FQW2R6N", `continuation prompt failed, retrying with initial prompt: ${message}`);
|
||||
await client.close();
|
||||
await client.connect(cwd);
|
||||
// Fresh session after retry — reset snapshot to zero
|
||||
return runPrompt(ctx, false, ZERO_TURNS);
|
||||
return runPrompt(ctx, false);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -210,22 +147,9 @@ export function createHermesAgent(resumeDisabled: boolean): () => Promise<void>
|
||||
): Promise<AgentRunResult> {
|
||||
// Client is already connected from runHermes — same ACP session,
|
||||
// so the agent sees the full conversation history (crucial for retries).
|
||||
// Snapshot turns before the continuation prompt for delta computation.
|
||||
const currentSessionId = client.getSessionId();
|
||||
const beforeSession =
|
||||
currentSessionId !== null ? await loadHermesSession(currentSessionId) : null;
|
||||
const beforeTurns = snapshotTurns(beforeSession);
|
||||
|
||||
const startMs = Date.now();
|
||||
const { text, sessionId, usage: acpUsage } = await client.prompt(message);
|
||||
const durationSec = (Date.now() - startMs) / 1000;
|
||||
const { text, sessionId } = await client.prompt(message);
|
||||
const { detailHash } = await storePromptResult(store, sessionId);
|
||||
|
||||
const afterSession = await loadHermesSession(sessionId);
|
||||
const afterTurns = snapshotTurns(afterSession);
|
||||
const usage = buildUsage(acpUsage, beforeTurns, afterTurns, durationSec);
|
||||
|
||||
return { output: text, detailHash, sessionId, assembledPrompt: "", usage };
|
||||
return { output: text, detailHash, sessionId, assembledPrompt: "" };
|
||||
}
|
||||
|
||||
const agentMain = createAgent({
|
||||
|
||||
@@ -1,8 +1,2 @@
|
||||
export type { AcpUsage } from "./acp-client.js";
|
||||
export { HermesAcpClient } from "./acp-client.js";
|
||||
export {
|
||||
buildHermesPrompt,
|
||||
buildUsage,
|
||||
createHermesAgent,
|
||||
snapshotTurns,
|
||||
} from "./hermes.js";
|
||||
export { buildHermesPrompt, createHermesAgent } from "./hermes.js";
|
||||
|
||||
@@ -106,7 +106,7 @@ function parseSessionJson(raw: unknown): HermesSessionJson | null {
|
||||
messages.push(msg);
|
||||
}
|
||||
}
|
||||
return { session_id, model, session_start, messages, inputTokens: 0, outputTokens: 0 };
|
||||
return { session_id, model, session_start, messages };
|
||||
}
|
||||
|
||||
export function getHermesDbPath(): string {
|
||||
@@ -117,8 +117,6 @@ type DbSessionRow = {
|
||||
id: string;
|
||||
model: string;
|
||||
started_at: number;
|
||||
input_tokens: number;
|
||||
output_tokens: number;
|
||||
};
|
||||
|
||||
type DbMessageRow = {
|
||||
@@ -158,9 +156,7 @@ export function loadHermesSessionFromDb(
|
||||
try {
|
||||
db = new DatabaseSync(resolvedPath, { readOnly: true });
|
||||
const session = db
|
||||
.prepare(
|
||||
"SELECT id, model, started_at, input_tokens, output_tokens FROM sessions WHERE id = ?",
|
||||
)
|
||||
.prepare("SELECT id, model, started_at FROM sessions WHERE id = ?")
|
||||
.get(sessionId) as DbSessionRow | null;
|
||||
if (session === null) {
|
||||
return null;
|
||||
@@ -185,8 +181,6 @@ export function loadHermesSessionFromDb(
|
||||
model: session.model,
|
||||
session_start: new Date(session.started_at * 1000).toISOString(),
|
||||
messages,
|
||||
inputTokens: session.input_tokens ?? 0,
|
||||
outputTokens: session.output_tokens ?? 0,
|
||||
};
|
||||
} catch {
|
||||
return null;
|
||||
|
||||
@@ -40,6 +40,4 @@ export type HermesSessionJson = {
|
||||
model: string;
|
||||
session_start: string;
|
||||
messages: HermesSessionMessage[];
|
||||
inputTokens: number;
|
||||
outputTokens: number;
|
||||
};
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@united-workforce/agent-mock",
|
||||
"version": "0.1.2",
|
||||
"version": "0.5.0",
|
||||
"files": [
|
||||
"src",
|
||||
"dist",
|
||||
@@ -17,6 +17,7 @@
|
||||
}
|
||||
},
|
||||
"scripts": {
|
||||
"prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1",
|
||||
"test": "vitest run __tests__/",
|
||||
"test:ci": "vitest run __tests__/"
|
||||
},
|
||||
|
||||
@@ -1,11 +1,4 @@
|
||||
#!/usr/bin/env -S node --disable-warning=ExperimentalWarning
|
||||
|
||||
// eslint-disable-next-line -- dynamic import for version
|
||||
const pkg = await import("../package.json", { with: { type: "json" } });
|
||||
if (process.argv.includes("--version") || process.argv.includes("-V")) {
|
||||
process.stdout.write(`${pkg.default.version}\n`);
|
||||
process.exit(0);
|
||||
}
|
||||
#!/usr/bin/env node
|
||||
|
||||
import { createMockAgent } from "./mock-agent.js";
|
||||
|
||||
|
||||
@@ -103,7 +103,6 @@ export function createMockAgent(mockDataPath: string): () => Promise<void> {
|
||||
detailHash,
|
||||
sessionId,
|
||||
assembledPrompt: "",
|
||||
usage: { turns: 1, inputTokens: 0, outputTokens: 0, duration: 0 },
|
||||
};
|
||||
lastResult = result;
|
||||
return result;
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
# @united-workforce/cli
|
||||
|
||||
## 0.1.1
|
||||
|
||||
### Patch Changes
|
||||
|
||||
- 850a3b2: fix: resolve --agent override via config alias before raw command
|
||||
|
||||
`resolveAgentConfig()` now checks `config.agents[alias]` first before falling back to `parseAgentOverride()`. Eval CLI default `--agent` changed from `"hermes"` to `"uwf-hermes"`.
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@united-workforce/cli",
|
||||
"version": "0.3.0",
|
||||
"version": "0.5.0",
|
||||
"files": [
|
||||
"src",
|
||||
"dist",
|
||||
@@ -22,6 +22,7 @@
|
||||
"yaml": "^2.8.4"
|
||||
},
|
||||
"scripts": {
|
||||
"prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1",
|
||||
"test": "vitest run src/",
|
||||
"test:ci": "vitest run src/"
|
||||
},
|
||||
|
||||
@@ -58,10 +58,7 @@ describe("C1: adapter JSON round-trip integration", () => {
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "Do the work", location: null },
|
||||
resume: { role: "worker", prompt: "Resume the work", location: null },
|
||||
},
|
||||
$START: { _: { role: "worker", prompt: "Do the work", location: null } },
|
||||
worker: { done: { role: "$END", prompt: "completed", location: null } },
|
||||
},
|
||||
});
|
||||
|
||||
@@ -6,7 +6,13 @@ import type { CasRef, ThreadId } from "@united-workforce/protocol";
|
||||
import { describe, expect, test } from "vitest";
|
||||
import { createMarker, deleteMarker } from "../background/index.js";
|
||||
import { cmdThreadList, cmdThreadShow, cmdThreadStart } from "../commands/thread.js";
|
||||
import { completeThread, createUwfStore, loadActiveThreads, setThread } from "../store.js";
|
||||
import {
|
||||
addHistoryEntry,
|
||||
createUwfStore,
|
||||
deleteThread,
|
||||
loadAllThreads,
|
||||
setThread,
|
||||
} from "../store.js";
|
||||
|
||||
const OUTPUT_SCHEMA = {
|
||||
type: "object" as const,
|
||||
@@ -28,13 +34,9 @@ roles:
|
||||
$status: "ready"
|
||||
frontmatter:
|
||||
type: object
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: "ready" }
|
||||
required: ["$status"]
|
||||
- properties:
|
||||
$status: { const: "not-ready" }
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { type: string, enum: ["ready", "not-ready"] }
|
||||
roleB:
|
||||
description: Second role
|
||||
goal: Do B
|
||||
@@ -46,17 +48,13 @@ roles:
|
||||
type: object
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { const: "done" }
|
||||
$status: { type: string }
|
||||
graph:
|
||||
$START:
|
||||
new:
|
||||
_:
|
||||
role: roleA
|
||||
prompt: "Do A"
|
||||
location: null
|
||||
resume:
|
||||
role: roleA
|
||||
prompt: "Resume A"
|
||||
location: null
|
||||
roleA:
|
||||
ready:
|
||||
role: roleB
|
||||
@@ -67,7 +65,7 @@ graph:
|
||||
prompt: "Try again"
|
||||
location: null
|
||||
roleB:
|
||||
done:
|
||||
_:
|
||||
role: $END
|
||||
prompt: "Done"
|
||||
location: null
|
||||
@@ -86,13 +84,9 @@ roles:
|
||||
$status: "pass"
|
||||
frontmatter:
|
||||
type: object
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: "pass" }
|
||||
required: ["$status"]
|
||||
- properties:
|
||||
$status: { const: "fail" }
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { type: string, enum: ["pass", "fail"] }
|
||||
roleB:
|
||||
description: Pass role
|
||||
goal: Do B
|
||||
@@ -104,7 +98,7 @@ roles:
|
||||
type: object
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { const: "done" }
|
||||
$status: { type: string }
|
||||
roleC:
|
||||
description: Fail role
|
||||
goal: Do C
|
||||
@@ -116,17 +110,13 @@ roles:
|
||||
type: object
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { const: "done" }
|
||||
$status: { type: string }
|
||||
graph:
|
||||
$START:
|
||||
new:
|
||||
_:
|
||||
role: roleA
|
||||
prompt: "Do A"
|
||||
location: null
|
||||
resume:
|
||||
role: roleA
|
||||
prompt: "Resume A"
|
||||
location: null
|
||||
roleA:
|
||||
pass:
|
||||
role: roleB
|
||||
@@ -137,12 +127,12 @@ graph:
|
||||
prompt: "Do C (fail)"
|
||||
location: null
|
||||
roleB:
|
||||
done:
|
||||
_:
|
||||
role: $END
|
||||
prompt: "Done"
|
||||
location: null
|
||||
roleC:
|
||||
done:
|
||||
_:
|
||||
role: $END
|
||||
prompt: "Done"
|
||||
location: null
|
||||
@@ -163,19 +153,15 @@ roles:
|
||||
type: object
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { const: "done" }
|
||||
$status: { type: string }
|
||||
graph:
|
||||
$START:
|
||||
new:
|
||||
_:
|
||||
role: worker
|
||||
prompt: "Work"
|
||||
location: null
|
||||
resume:
|
||||
role: worker
|
||||
prompt: "Resume work"
|
||||
location: null
|
||||
worker:
|
||||
done:
|
||||
_:
|
||||
role: $END
|
||||
prompt: "Done"
|
||||
location: null
|
||||
@@ -189,7 +175,7 @@ async function insertStepNode(
|
||||
outputPayload: Record<string, unknown>,
|
||||
): Promise<void> {
|
||||
const uwf = await createUwfStore(storageRoot);
|
||||
const index = loadActiveThreads(uwf.varStore);
|
||||
const index = loadAllThreads(uwf.varStore);
|
||||
const headEntry = index[threadId];
|
||||
if (headEntry === undefined) throw new Error(`thread ${threadId} not in index`);
|
||||
const head = headEntry.head;
|
||||
@@ -220,13 +206,7 @@ async function insertStepNode(
|
||||
assembledPrompt: null,
|
||||
})) as CasRef;
|
||||
|
||||
setThread(uwf.varStore, threadId, {
|
||||
head: stepHash,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
setThread(uwf.varStore, threadId, { head: stepHash, suspendedRole: null, suspendMessage: null });
|
||||
}
|
||||
|
||||
describe("currentRole field", () => {
|
||||
@@ -302,12 +282,19 @@ describe("currentRole field", () => {
|
||||
try {
|
||||
const wf = join(tmpDir, "test-current-role.yaml");
|
||||
await writeFile(wf, SIMPLE_WORKFLOW_YAML, "utf8");
|
||||
const { thread } = await cmdThreadStart(storageRoot, wf, "test", tmpDir);
|
||||
const { thread, workflow } = await cmdThreadStart(storageRoot, wf, "test", tmpDir);
|
||||
const tid = thread as ThreadId;
|
||||
|
||||
const uwfForIndex = await createUwfStore(storageRoot);
|
||||
loadActiveThreads(uwfForIndex.varStore)[tid]!.head;
|
||||
completeThread(uwfForIndex.varStore, tid, "completed");
|
||||
const head = loadAllThreads(uwfForIndex.varStore)[tid]!.head;
|
||||
deleteThread(uwfForIndex.varStore, tid);
|
||||
addHistoryEntry(uwfForIndex.varStore, {
|
||||
thread: tid,
|
||||
workflow,
|
||||
head,
|
||||
completedAt: Date.now(),
|
||||
reason: "completed",
|
||||
});
|
||||
|
||||
const result = await cmdThreadShow(storageRoot, tid);
|
||||
expect(result.status).toBe("completed");
|
||||
@@ -323,12 +310,19 @@ describe("currentRole field", () => {
|
||||
try {
|
||||
const wf = join(tmpDir, "test-current-role.yaml");
|
||||
await writeFile(wf, SIMPLE_WORKFLOW_YAML, "utf8");
|
||||
const { thread } = await cmdThreadStart(storageRoot, wf, "test", tmpDir);
|
||||
const { thread, workflow } = await cmdThreadStart(storageRoot, wf, "test", tmpDir);
|
||||
const tid = thread as ThreadId;
|
||||
|
||||
const uwfForIndex = await createUwfStore(storageRoot);
|
||||
loadActiveThreads(uwfForIndex.varStore)[tid]!.head;
|
||||
completeThread(uwfForIndex.varStore, tid, "cancelled");
|
||||
const head = loadAllThreads(uwfForIndex.varStore)[tid]!.head;
|
||||
deleteThread(uwfForIndex.varStore, tid);
|
||||
addHistoryEntry(uwfForIndex.varStore, {
|
||||
thread: tid,
|
||||
workflow,
|
||||
head,
|
||||
completedAt: Date.now(),
|
||||
reason: "cancelled",
|
||||
});
|
||||
|
||||
const result = await cmdThreadShow(storageRoot, tid);
|
||||
expect(result.status).toBe("cancelled");
|
||||
@@ -381,8 +375,15 @@ describe("currentRole field", () => {
|
||||
const comp = await cmdThreadStart(storageRoot, wf, "completed", tmpDir);
|
||||
const compId = comp.thread as ThreadId;
|
||||
const uwfForIndex = await createUwfStore(storageRoot);
|
||||
const _compHead = loadActiveThreads(uwfForIndex.varStore)[compId]!.head;
|
||||
completeThread(uwfForIndex.varStore, compId, "completed");
|
||||
const compHead = loadAllThreads(uwfForIndex.varStore)[compId]!.head;
|
||||
deleteThread(uwfForIndex.varStore, compId);
|
||||
addHistoryEntry(uwfForIndex.varStore, {
|
||||
thread: compId,
|
||||
workflow: comp.workflow,
|
||||
head: compHead,
|
||||
completedAt: Date.now(),
|
||||
reason: "completed",
|
||||
});
|
||||
|
||||
const list = await cmdThreadList(storageRoot, null, null, null, 0, 100);
|
||||
|
||||
@@ -446,8 +447,8 @@ describe("currentRole field", () => {
|
||||
await writeFile(wf, SINGLE_ROLE_WORKFLOW_YAML, "utf8");
|
||||
|
||||
const { thread } = await cmdThreadStart(storageRoot, wf, "test", tmpDir);
|
||||
// worker → done maps to $END
|
||||
await insertStepNode(storageRoot, thread as ThreadId, "worker", { $status: "done" });
|
||||
// worker → _ maps to $END
|
||||
await insertStepNode(storageRoot, thread as ThreadId, "worker", {});
|
||||
|
||||
const result = await cmdThreadShow(storageRoot, thread as ThreadId);
|
||||
expect(result.currentRole).toBe(null);
|
||||
|
||||
@@ -10,7 +10,7 @@ import { afterEach, beforeAll, beforeEach, describe, expect, test } from "vitest
|
||||
import { stringify } from "yaml";
|
||||
import { cmdThreadStart } from "../commands/thread.js";
|
||||
import { cmdWorkflowAdd } from "../commands/workflow.js";
|
||||
import { createUwfStore, getThread } from "../store.js";
|
||||
import { createUwfStore, findHistoryEntry, getThread } from "../store.js";
|
||||
|
||||
// ── paths ──────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -106,13 +106,9 @@ async function addWorkflow(workflowFixture: string, workflowName: string): Promi
|
||||
|
||||
type ExecResult = { stdout: string; stderr: string; exitCode: number };
|
||||
|
||||
function runExec(threadId: string, count: number | null = null): ExecResult {
|
||||
const args = [CLI_PATH, "thread", "exec", threadId];
|
||||
if (count !== null) {
|
||||
args.push("--count", String(count));
|
||||
}
|
||||
function runExec(threadId: string): ExecResult {
|
||||
try {
|
||||
const stdout = execFileSync(process.execPath, args, {
|
||||
const stdout = execFileSync(process.execPath, [CLI_PATH, "thread", "exec", threadId], {
|
||||
encoding: "utf8",
|
||||
stdio: ["ignore", "pipe", "pipe"],
|
||||
env: { ...process.env, UWF_HOME: uwfHome, OCAS_HOME: casDir },
|
||||
@@ -130,38 +126,11 @@ function runExec(threadId: string, count: number | null = null): ExecResult {
|
||||
}
|
||||
}
|
||||
|
||||
/** Invoke `uwf thread resume <threadId> -p <prompt>` through the built CLI. */
|
||||
function runResume(threadId: string, prompt: string): ExecResult {
|
||||
try {
|
||||
const stdout = execFileSync(
|
||||
process.execPath,
|
||||
[CLI_PATH, "thread", "resume", threadId, "-p", prompt],
|
||||
{
|
||||
encoding: "utf8",
|
||||
stdio: ["ignore", "pipe", "pipe"],
|
||||
env: { ...process.env, UWF_HOME: uwfHome, OCAS_HOME: casDir },
|
||||
cwd: tmpDir,
|
||||
timeout: 30000,
|
||||
},
|
||||
);
|
||||
return { stdout, stderr: "", exitCode: 0 };
|
||||
} catch (e: unknown) {
|
||||
const err = e as NodeJS.ErrnoException & {
|
||||
stdout?: string;
|
||||
stderr?: string;
|
||||
status?: number;
|
||||
};
|
||||
return { stdout: err.stdout ?? "", stderr: err.stderr ?? "", exitCode: err.status ?? 1 };
|
||||
}
|
||||
}
|
||||
|
||||
type StepOutputJson = {
|
||||
thread: string;
|
||||
head: string;
|
||||
status: string;
|
||||
currentRole: string | null;
|
||||
suspendedRole: string | null;
|
||||
suspendMessage: string | null;
|
||||
done: boolean;
|
||||
};
|
||||
|
||||
@@ -229,25 +198,19 @@ describe("E2E mock-agent: full uwf pipeline", () => {
|
||||
expect(getStatus(store, s1.output)).toBe("ready");
|
||||
expect(getStatus(store, s2.output)).toBe("done");
|
||||
|
||||
// Mock agent reports usage stats in step nodes.
|
||||
expect(s1.usage).toEqual({ turns: 1, inputTokens: 0, outputTokens: 0, duration: 0 });
|
||||
expect(s2.usage).toEqual({ turns: 1, inputTokens: 0, outputTokens: 0, duration: 0 });
|
||||
|
||||
// The start node points at the registered workflow.
|
||||
const startNode = store.cas.get(startHash as CasRef);
|
||||
expect((startNode!.payload as StartNodePayload).workflow).toBe(workflowHash);
|
||||
|
||||
// Thread is completed: status changed to "completed", head updated.
|
||||
// Thread is completed: removed from active index, present in history.
|
||||
const uwf = await createUwfStore(uwfHome);
|
||||
const finalEntry = getThread(uwf.varStore, threadId);
|
||||
expect(finalEntry).not.toBeNull();
|
||||
expect(finalEntry!.status).toBe("completed");
|
||||
expect(finalEntry!.head).toBe(step2.head);
|
||||
expect(getThread(uwf.varStore, threadId)).toBeNull();
|
||||
const hist = findHistoryEntry(uwf.varStore, threadId);
|
||||
expect(hist).not.toBeNull();
|
||||
expect(hist!.head).toBe(step2.head);
|
||||
});
|
||||
|
||||
test("2. branching workflow loops developer→reviewer→developer→reviewer→$END", {
|
||||
timeout: 30_000,
|
||||
}, async () => {
|
||||
test("2. branching workflow loops developer→reviewer→developer→reviewer→$END", async () => {
|
||||
await writeMockConfig("e2e-loop.mock.yaml");
|
||||
const workflowHash = await addWorkflow("e2e-loop.workflow.yaml", "test-loop");
|
||||
|
||||
@@ -300,14 +263,11 @@ describe("E2E mock-agent: full uwf pipeline", () => {
|
||||
expect(getStatus(store, n4.output)).toBe("approved");
|
||||
|
||||
const uwf = await createUwfStore(uwfHome);
|
||||
const finalEntry = getThread(uwf.varStore, threadId);
|
||||
expect(finalEntry).not.toBeNull();
|
||||
expect(finalEntry!.status).toBe("completed");
|
||||
expect(getThread(uwf.varStore, threadId)).toBeNull();
|
||||
expect(findHistoryEntry(uwf.varStore, threadId)).not.toBeNull();
|
||||
});
|
||||
|
||||
test("3. role mismatch in mock data makes the agent exit with an error", {
|
||||
timeout: 30_000,
|
||||
}, async () => {
|
||||
test("3. role mismatch in mock data makes the agent exit with an error", async () => {
|
||||
// Reuses the linear workflow but with a mock whose step[1].role is wrong.
|
||||
await writeMockConfig("e2e-mismatch.mock.yaml");
|
||||
const workflowHash = await addWorkflow("e2e-linear.workflow.yaml", "test-linear");
|
||||
@@ -327,172 +287,7 @@ describe("E2E mock-agent: full uwf pipeline", () => {
|
||||
|
||||
// The thread remains active (no step node was written for the failed step).
|
||||
const uwf = await createUwfStore(uwfHome);
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
expect(entry).not.toBeNull();
|
||||
expect(entry!.status).not.toBe("completed");
|
||||
expect(entry!.head).toBe(step1.head);
|
||||
});
|
||||
|
||||
test("4. planner $SUSPEND then resume re-runs planner and reaches $END", {
|
||||
timeout: 30_000,
|
||||
}, async () => {
|
||||
await writeMockConfig("e2e-suspend.mock.yaml");
|
||||
const workflowHash = await addWorkflow("e2e-suspend.workflow.yaml", "test-suspend");
|
||||
|
||||
const start = await cmdThreadStart(uwfHome, workflowHash, "Analyze the task", uwfHome, tmpDir);
|
||||
const threadId = start.thread;
|
||||
|
||||
// Step 1 → planner emits insufficient_info → thread suspends.
|
||||
const step1 = execStep(threadId);
|
||||
expect(step1.status).toBe("suspended");
|
||||
expect(step1.done).toBe(false);
|
||||
expect(step1.currentRole).toBeNull();
|
||||
expect(step1.suspendedRole).toBe("planner");
|
||||
expect(step1.suspendMessage).toBe("Need more info: missing requirements");
|
||||
|
||||
// Thread index entry reflects the suspension with rendered metadata.
|
||||
const suspendedEntry = getThread((await createUwfStore(uwfHome)).varStore, threadId);
|
||||
expect(suspendedEntry).not.toBeNull();
|
||||
expect(suspendedEntry!.status).toBe("suspended");
|
||||
expect(suspendedEntry!.suspendedRole).toBe("planner");
|
||||
expect(suspendedEntry!.suspendMessage).toBe("Need more info: missing requirements");
|
||||
|
||||
// Resume re-runs the planner role; the second scripted step is `ready` → $END.
|
||||
const resume = runResume(threadId, "Here are the requirements");
|
||||
expect(resume.exitCode).toBe(0);
|
||||
const resumeOut = JSON.parse(resume.stdout.trim()) as StepOutputJson;
|
||||
expect(resumeOut.status).toBe("completed");
|
||||
expect(resumeOut.done).toBe(true);
|
||||
expect(resumeOut.currentRole).toBeNull();
|
||||
expect(resumeOut.suspendedRole).toBeNull();
|
||||
|
||||
// CAS chain: suspended planner step → resumed planner step.
|
||||
const store = await openStore(casDir);
|
||||
const s1 = getStepNode(store, step1.head);
|
||||
const s2 = getStepNode(store, resumeOut.head);
|
||||
expect(s1.role).toBe("planner");
|
||||
expect(s2.role).toBe("planner");
|
||||
expect(s2.prev).toBe(step1.head);
|
||||
expect(getStatus(store, s1.output)).toBe("insufficient_info");
|
||||
expect(getStatus(store, s2.output)).toBe("ready");
|
||||
|
||||
const finalEntry = getThread((await createUwfStore(uwfHome)).varStore, threadId);
|
||||
expect(finalEntry).not.toBeNull();
|
||||
expect(finalEntry!.status).toBe("completed");
|
||||
expect(finalEntry!.head).toBe(resumeOut.head);
|
||||
});
|
||||
|
||||
test("5. --count 3 runs the whole linear pipeline in one invocation", {
|
||||
timeout: 30_000,
|
||||
}, async () => {
|
||||
await writeMockConfig("e2e-count.mock.yaml");
|
||||
const workflowHash = await addWorkflow("e2e-count.workflow.yaml", "test-count");
|
||||
|
||||
const start = await cmdThreadStart(uwfHome, workflowHash, "Ship the feature", uwfHome, tmpDir);
|
||||
const threadId = start.thread;
|
||||
|
||||
// Single invocation with --count 3 → moderator drives analyst → developer → reviewer → $END.
|
||||
const { stdout, stderr, exitCode } = runExec(threadId, 3);
|
||||
expect(exitCode, `stderr: ${stderr}`).toBe(0);
|
||||
|
||||
// Multi-step exec emits a JSON array (one entry per executed step).
|
||||
const results = JSON.parse(stdout.trim()) as StepOutputJson[];
|
||||
expect(Array.isArray(results)).toBe(true);
|
||||
expect(results).toHaveLength(3);
|
||||
|
||||
expect(results[0].status).toBe("idle");
|
||||
expect(results[0].currentRole).toBe("developer");
|
||||
expect(results[1].status).toBe("idle");
|
||||
expect(results[1].currentRole).toBe("reviewer");
|
||||
expect(results[2].status).toBe("completed");
|
||||
expect(results[2].done).toBe(true);
|
||||
|
||||
// Verify the CAS chain holds 3 step nodes in the correct order.
|
||||
const store = await openStore(casDir);
|
||||
const n1 = getStepNode(store, results[0].head);
|
||||
const n2 = getStepNode(store, results[1].head);
|
||||
const n3 = getStepNode(store, results[2].head);
|
||||
expect([n1.role, n2.role, n3.role]).toEqual(["analyst", "developer", "reviewer"]);
|
||||
expect(n1.prev).toBeNull();
|
||||
expect(n2.prev).toBe(results[0].head);
|
||||
expect(n3.prev).toBe(results[1].head);
|
||||
expect(new Set([n1.start, n2.start, n3.start]).size).toBe(1);
|
||||
|
||||
const finalEntry = getThread((await createUwfStore(uwfHome)).varStore, threadId);
|
||||
expect(finalEntry).not.toBeNull();
|
||||
expect(finalEntry!.status).toBe("completed");
|
||||
expect(finalEntry!.head).toBe(results[2].head);
|
||||
});
|
||||
|
||||
test("6. mustache edge prompt renders planner variables into the worker step", {
|
||||
timeout: 30_000,
|
||||
}, async () => {
|
||||
await writeMockConfig("e2e-mustache.mock.yaml");
|
||||
const workflowHash = await addWorkflow("e2e-mustache.workflow.yaml", "test-mustache");
|
||||
|
||||
const start = await cmdThreadStart(uwfHome, workflowHash, "Plan the task", uwfHome, tmpDir);
|
||||
const threadId = start.thread;
|
||||
|
||||
// Step 1 → planner emits branch + repoPath.
|
||||
const step1 = execStep(threadId);
|
||||
expect(step1.status).toBe("idle");
|
||||
expect(step1.currentRole).toBe("worker");
|
||||
|
||||
// Step 2 → worker; the moderator renders the templated edge prompt before spawning it.
|
||||
const step2 = execStep(threadId);
|
||||
expect(step2.done).toBe(true);
|
||||
expect(step2.status).toBe("completed");
|
||||
|
||||
const store = await openStore(casDir);
|
||||
const plannerStep = getStepNode(store, step1.head);
|
||||
expect(getStatus(store, plannerStep.output)).toBe("ready");
|
||||
|
||||
// The worker step's edgePrompt is the mustache-rendered template.
|
||||
const workerStep = getStepNode(store, step2.head);
|
||||
expect(workerStep.role).toBe("worker");
|
||||
expect(workerStep.edgePrompt).toContain("fix/42-auth");
|
||||
expect(workerStep.edgePrompt).toContain("/tmp/my-repo");
|
||||
expect(workerStep.edgePrompt).toBe("Work on branch fix/42-auth in /tmp/my-repo");
|
||||
});
|
||||
|
||||
test("7. completed thread can be resumed (衔尾蛇: end → start)", {
|
||||
timeout: 30_000,
|
||||
}, async () => {
|
||||
// Reuse the suspend workflow (planner with ready → $END), but mock data
|
||||
// goes straight to ready on first run, then ready again after resume.
|
||||
await writeMockConfig("e2e-completed-resume.mock.yaml");
|
||||
const workflowHash = await addWorkflow("e2e-suspend.workflow.yaml", "test-suspend");
|
||||
|
||||
const start = await cmdThreadStart(uwfHome, workflowHash, "Do the work", uwfHome, tmpDir);
|
||||
const threadId = start.thread;
|
||||
|
||||
// Step 1: planner outputs ready → $END → thread completed.
|
||||
const step1 = execStep(threadId);
|
||||
expect(step1.done).toBe(true);
|
||||
expect(step1.status).toBe("completed");
|
||||
|
||||
const uwf1 = await createUwfStore(uwfHome);
|
||||
const entry1 = getThread(uwf1.varStore, threadId);
|
||||
expect(entry1).not.toBeNull();
|
||||
expect(entry1!.status).toBe("completed");
|
||||
|
||||
// Resume the completed thread — should re-evaluate $START → planner.
|
||||
const resumeResult = runResume(threadId, "Additional context for round 2");
|
||||
expect(resumeResult.exitCode).toBe(0);
|
||||
|
||||
// After resume step, planner ran again (step index 1 in mock) → ready → $END.
|
||||
const uwf2 = await createUwfStore(uwfHome);
|
||||
const entry2 = getThread(uwf2.varStore, threadId);
|
||||
expect(entry2).not.toBeNull();
|
||||
expect(entry2!.status).toBe("completed");
|
||||
// Head should have advanced (not the same as step1).
|
||||
expect(entry2!.head).not.toBe(step1.head);
|
||||
|
||||
// CAS chain: step2.prev === step1 head (chain is preserved across resume).
|
||||
const store = await openStore(casDir);
|
||||
const resumeOutput = JSON.parse(resumeResult.stdout.trim());
|
||||
const step2Node = getStepNode(store, resumeOutput.head);
|
||||
expect(step2Node.role).toBe("planner");
|
||||
expect(step2Node.prev).toBe(step1.head);
|
||||
expect(getThread(uwf.varStore, threadId)).not.toBeNull();
|
||||
expect(getThread(uwf.varStore, threadId)!.head).toBe(step1.head);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -1,15 +0,0 @@
|
||||
steps:
|
||||
# Step 0: planner → ready → $END (thread completes)
|
||||
- role: planner
|
||||
output: |
|
||||
---
|
||||
$status: ready
|
||||
---
|
||||
Initial plan complete.
|
||||
# Step 1: after resume, planner runs again from $START → ready → $END again
|
||||
- role: planner
|
||||
output: |
|
||||
---
|
||||
$status: ready
|
||||
---
|
||||
Revised plan after resume.
|
||||
@@ -1,19 +0,0 @@
|
||||
steps:
|
||||
- role: analyst
|
||||
output: |
|
||||
---
|
||||
$status: analyzed
|
||||
---
|
||||
Analysis complete.
|
||||
- role: developer
|
||||
output: |
|
||||
---
|
||||
$status: implemented
|
||||
---
|
||||
Implementation complete.
|
||||
- role: reviewer
|
||||
output: |
|
||||
---
|
||||
$status: approved
|
||||
---
|
||||
Approved.
|
||||
@@ -1,46 +0,0 @@
|
||||
name: test-count
|
||||
description: 3-step linear pipeline (analyst -> developer -> reviewer -> $END)
|
||||
roles:
|
||||
analyst:
|
||||
description: Analyzes the task
|
||||
goal: Analyze the task
|
||||
capabilities: []
|
||||
procedure: Analyze it
|
||||
output: Output the analysis and set $status to analyzed
|
||||
frontmatter:
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: analyzed }
|
||||
required: [$status]
|
||||
developer:
|
||||
description: Implements the change
|
||||
goal: Implement the change
|
||||
capabilities: []
|
||||
procedure: Write code
|
||||
output: Output the implementation and set $status to implemented
|
||||
frontmatter:
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: implemented }
|
||||
required: [$status]
|
||||
reviewer:
|
||||
description: Reviews the change
|
||||
goal: Review the change
|
||||
capabilities: []
|
||||
procedure: Review code
|
||||
output: Approve and set $status to approved
|
||||
frontmatter:
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: approved }
|
||||
required: [$status]
|
||||
graph:
|
||||
$START:
|
||||
new: { role: analyst, prompt: 'Analyze the task' }
|
||||
resume: { role: analyst, prompt: 'Review the previous run output and continue the work.' }
|
||||
analyst:
|
||||
analyzed: { role: developer, prompt: 'Implement the change' }
|
||||
developer:
|
||||
implemented: { role: reviewer, prompt: 'Review the change' }
|
||||
reviewer:
|
||||
approved: { role: '$END', prompt: 'Done' }
|
||||
@@ -25,8 +25,7 @@ roles:
|
||||
required: [$status]
|
||||
graph:
|
||||
$START:
|
||||
new: { role: planner, prompt: 'Plan the task' }
|
||||
resume: { role: planner, prompt: 'Review the previous run output and continue the work.' }
|
||||
_: { role: planner, prompt: 'Plan the task' }
|
||||
planner:
|
||||
ready: { role: worker, prompt: 'Do the work' }
|
||||
worker:
|
||||
|
||||
@@ -28,8 +28,7 @@ roles:
|
||||
required: [$status]
|
||||
graph:
|
||||
$START:
|
||||
new: { role: developer, prompt: 'Implement the change' }
|
||||
resume: { role: developer, prompt: 'Review the previous run output and continue the work.' }
|
||||
_: { role: developer, prompt: 'Implement the change' }
|
||||
developer:
|
||||
review_needed: { role: reviewer, prompt: 'Review the change' }
|
||||
reviewer:
|
||||
|
||||
@@ -1,15 +0,0 @@
|
||||
steps:
|
||||
- role: planner
|
||||
output: |
|
||||
---
|
||||
$status: ready
|
||||
branch: fix/42-auth
|
||||
repoPath: /tmp/my-repo
|
||||
---
|
||||
Planned the work.
|
||||
- role: worker
|
||||
output: |
|
||||
---
|
||||
$status: done
|
||||
---
|
||||
Work complete.
|
||||
@@ -1,35 +0,0 @@
|
||||
name: test-mustache
|
||||
description: Planner emits template variables consumed by the worker edge prompt
|
||||
roles:
|
||||
planner:
|
||||
description: Plans work and emits branch + repo path
|
||||
goal: Plan the task
|
||||
capabilities: []
|
||||
procedure: Decide the branch and repo path
|
||||
output: Set $status to ready and emit branch and repoPath
|
||||
frontmatter:
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: ready }
|
||||
branch: { type: string }
|
||||
repoPath: { type: string }
|
||||
required: [$status, branch, repoPath]
|
||||
worker:
|
||||
description: Works on the planned branch
|
||||
goal: Do the work
|
||||
capabilities: []
|
||||
procedure: Do it
|
||||
output: Output the result and set $status to done
|
||||
frontmatter:
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: done }
|
||||
required: [$status]
|
||||
graph:
|
||||
$START:
|
||||
new: { role: planner, prompt: 'Plan the task' }
|
||||
resume: { role: planner, prompt: 'Review the previous run output and continue the work.' }
|
||||
planner:
|
||||
ready: { role: worker, prompt: 'Work on branch {{{branch}}} in {{{repoPath}}}' }
|
||||
worker:
|
||||
done: { role: '$END', prompt: 'Complete' }
|
||||
@@ -1,14 +0,0 @@
|
||||
steps:
|
||||
- role: planner
|
||||
output: |
|
||||
---
|
||||
$status: insufficient_info
|
||||
reason: missing requirements
|
||||
---
|
||||
I need more information before I can plan this.
|
||||
- role: planner
|
||||
output: |
|
||||
---
|
||||
$status: ready
|
||||
---
|
||||
I now have what I need. Ready to proceed.
|
||||
@@ -1,25 +0,0 @@
|
||||
name: test-suspend
|
||||
description: Planner can suspend for more info or finish when ready
|
||||
roles:
|
||||
planner:
|
||||
description: Plans work and may request more info
|
||||
goal: Analyze the task
|
||||
capabilities: []
|
||||
procedure: Analyze the task and decide if more info is needed
|
||||
output: Set $status to insufficient_info (with reason) or ready
|
||||
frontmatter:
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: insufficient_info }
|
||||
reason: { type: string }
|
||||
required: [$status, reason]
|
||||
- properties:
|
||||
$status: { const: ready }
|
||||
required: [$status]
|
||||
graph:
|
||||
$START:
|
||||
new: { role: planner, prompt: 'Analyze the task' }
|
||||
resume: { role: planner, prompt: 'Review the previous run output and continue the work.' }
|
||||
planner:
|
||||
insufficient_info: { role: '$SUSPEND', prompt: 'Need more info: {{{reason}}}' }
|
||||
ready: { role: '$END', prompt: 'Done' }
|
||||
@@ -5,18 +5,13 @@ import { evaluate } from "../moderator/evaluate.js";
|
||||
|
||||
const solveIssueGraph: WorkflowPayload["graph"] = {
|
||||
$START: {
|
||||
new: { role: "planner", prompt: "Start planning from the issue in the task.", location: null },
|
||||
resume: {
|
||||
role: "planner",
|
||||
prompt: "Review the previous run output and continue the work.",
|
||||
location: null,
|
||||
},
|
||||
_: { role: "planner", prompt: "Start planning from the issue in the task.", location: null },
|
||||
},
|
||||
planner: {
|
||||
planned: { role: "developer", prompt: "Implement the plan: {{plan}}", location: null },
|
||||
_: { role: "developer", prompt: "Implement the plan: {{plan}}", location: null },
|
||||
},
|
||||
developer: {
|
||||
implemented: { role: "reviewer", prompt: "Review the changes: {{summary}}", location: null },
|
||||
_: { role: "reviewer", prompt: "Review the changes: {{summary}}", location: null },
|
||||
},
|
||||
reviewer: {
|
||||
approved: { role: "$END", prompt: "Done.", location: null },
|
||||
@@ -25,8 +20,8 @@ const solveIssueGraph: WorkflowPayload["graph"] = {
|
||||
};
|
||||
|
||||
describe("evaluate", () => {
|
||||
test("$START → first role (status new)", () => {
|
||||
const result = evaluate(solveIssueGraph, "$START", { $status: "new" });
|
||||
test("$START → first role (unit status _)", () => {
|
||||
const result = evaluate(solveIssueGraph, "$START", { $status: "_" });
|
||||
expect(result).toEqual({
|
||||
ok: true,
|
||||
value: {
|
||||
@@ -37,18 +32,6 @@ describe("evaluate", () => {
|
||||
});
|
||||
});
|
||||
|
||||
test("$START → first role (status resume)", () => {
|
||||
const result = evaluate(solveIssueGraph, "$START", { $status: "resume" });
|
||||
expect(result).toEqual({
|
||||
ok: true,
|
||||
value: {
|
||||
role: "planner",
|
||||
prompt: "Review the previous run output and continue the work.",
|
||||
location: null,
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
test("status-based routing (reviewer rejected → developer)", () => {
|
||||
const result = evaluate(solveIssueGraph, "reviewer", {
|
||||
$status: "rejected",
|
||||
@@ -112,7 +95,7 @@ describe("evaluate", () => {
|
||||
});
|
||||
|
||||
test("missing role in graph → error", () => {
|
||||
const result = evaluate(solveIssueGraph, "unknown-role", { $status: "new" });
|
||||
const result = evaluate(solveIssueGraph, "unknown-role", { $status: "_" });
|
||||
expect(result.ok).toBe(false);
|
||||
if (!result.ok) {
|
||||
expect(result.error.message).toBe('no transitions defined for role "unknown-role"');
|
||||
@@ -129,7 +112,7 @@ describe("evaluate", () => {
|
||||
|
||||
test("mustache template rendering with simple fields", () => {
|
||||
const result = evaluate(solveIssueGraph, "planner", {
|
||||
$status: "planned",
|
||||
$status: "_",
|
||||
plan: "Add auth middleware",
|
||||
});
|
||||
expect(result).toEqual({
|
||||
@@ -156,11 +139,11 @@ describe("evaluate", () => {
|
||||
test("triple mustache also works for unescaped output", () => {
|
||||
const graph: Record<string, Record<string, Target>> = {
|
||||
reviewer: {
|
||||
rejected: { role: "developer", prompt: "Fix: {{{comments}}}", location: null },
|
||||
_: { role: "developer", prompt: "Fix: {{{comments}}}", location: null },
|
||||
},
|
||||
};
|
||||
const result = evaluate(graph, "reviewer", {
|
||||
$status: "rejected",
|
||||
$status: "_",
|
||||
comments: "<script>alert(1)</script>",
|
||||
});
|
||||
expect(result).toEqual({
|
||||
@@ -169,22 +152,24 @@ describe("evaluate", () => {
|
||||
});
|
||||
});
|
||||
|
||||
test("missing $status → error (no unit fallback)", () => {
|
||||
test("missing $status defaults to _ (unit routing)", () => {
|
||||
const result = evaluate(solveIssueGraph, "planner", {
|
||||
plan: "Add auth middleware",
|
||||
});
|
||||
expect(result.ok).toBe(false);
|
||||
if (!result.ok) {
|
||||
expect(result.error.message).toBe(
|
||||
'agent output for role "planner" is missing required "$status" string',
|
||||
);
|
||||
}
|
||||
expect(result).toEqual({
|
||||
ok: true,
|
||||
value: {
|
||||
role: "developer",
|
||||
prompt: "Implement the plan: Add auth middleware",
|
||||
location: null,
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
test("mustache template with nested object paths", () => {
|
||||
const graph: Record<string, Record<string, Target>> = {
|
||||
reviewer: {
|
||||
rejected: {
|
||||
_: {
|
||||
role: "developer",
|
||||
prompt: "Address: {{review.comments}}",
|
||||
location: null,
|
||||
@@ -192,7 +177,7 @@ describe("evaluate", () => {
|
||||
},
|
||||
};
|
||||
const result = evaluate(graph, "reviewer", {
|
||||
$status: "rejected",
|
||||
$status: "_",
|
||||
review: { comments: "refactor the handler" },
|
||||
});
|
||||
expect(result).toEqual({
|
||||
|
||||
@@ -6,107 +6,101 @@ import { describe, expect, test } from "vitest";
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
|
||||
import {
|
||||
cmdPromptAdapterDeveloping,
|
||||
cmdPromptBootstrap,
|
||||
cmdPromptAdapter,
|
||||
cmdPromptAuthor,
|
||||
cmdPromptDeveloper,
|
||||
cmdPromptList,
|
||||
cmdPromptSetup,
|
||||
cmdPromptUsage,
|
||||
cmdPromptWorkflowAuthoring,
|
||||
cmdPromptUser,
|
||||
} from "../commands/prompt.js";
|
||||
|
||||
describe("prompt commands", () => {
|
||||
test("prompt list returns prompt names (no bootstrap)", () => {
|
||||
test("prompt list returns all prompt names", () => {
|
||||
const result = cmdPromptList();
|
||||
expect(result).toBeInstanceOf(Array);
|
||||
expect(result).toContain("usage");
|
||||
expect(result).toContain("workflow-authoring");
|
||||
expect(result).toContain("adapter-developing");
|
||||
expect(result).not.toContain("bootstrap");
|
||||
expect(result).toContain("user");
|
||||
expect(result).toContain("author");
|
||||
expect(result).toContain("developer");
|
||||
expect(result).toContain("adapter");
|
||||
for (const name of result) {
|
||||
expect(name).toMatch(/^\S+$/);
|
||||
}
|
||||
});
|
||||
|
||||
test("prompt usage returns only the usage reference with frontmatter", () => {
|
||||
const result = cmdPromptUsage();
|
||||
test("prompt user returns non-empty markdown string", () => {
|
||||
const result = cmdPromptUser();
|
||||
expect(typeof result).toBe("string");
|
||||
expect(result).toContain("uwf");
|
||||
expect(result).toContain("thread");
|
||||
expect(result).toContain("workflow");
|
||||
expect(result).toContain("Quick Start");
|
||||
expect(result).toContain("---");
|
||||
expect(result).toContain("name:");
|
||||
expect(result).toContain("version:");
|
||||
// Should NOT contain other references
|
||||
expect(result).not.toContain("Workflow Authoring Reference");
|
||||
expect(result).not.toContain("Adapter Developing Reference");
|
||||
expect(result.length).toBeGreaterThan(500);
|
||||
});
|
||||
|
||||
test("prompt workflow-authoring returns non-empty markdown string with frontmatter", () => {
|
||||
const result = cmdPromptWorkflowAuthoring();
|
||||
test("prompt author returns non-empty markdown string", () => {
|
||||
const result = cmdPromptAuthor();
|
||||
expect(typeof result).toBe("string");
|
||||
expect(result).toContain("frontmatter");
|
||||
expect(result).toContain("graph");
|
||||
expect(result).toContain("$START");
|
||||
expect(result).toContain("$END");
|
||||
expect(result).toContain("$status");
|
||||
expect(result).toContain("---");
|
||||
expect(result).toContain("name:");
|
||||
expect(result).toContain("version:");
|
||||
expect(result.length).toBeGreaterThan(500);
|
||||
});
|
||||
|
||||
test("prompt adapter-developing returns non-empty markdown string with frontmatter", () => {
|
||||
const result = cmdPromptAdapterDeveloping();
|
||||
test("prompt developer returns non-empty markdown string", () => {
|
||||
const result = cmdPromptDeveloper();
|
||||
expect(typeof result).toBe("string");
|
||||
expect(result).toContain("Monorepo");
|
||||
expect(result).toContain("CAS");
|
||||
expect(result).toContain("Biome");
|
||||
expect(result.length).toBeGreaterThan(500);
|
||||
});
|
||||
|
||||
test("prompt adapter returns non-empty markdown string", () => {
|
||||
const result = cmdPromptAdapter();
|
||||
expect(typeof result).toBe("string");
|
||||
expect(result).toContain("createAgent");
|
||||
expect(result).toContain("AgentContext");
|
||||
expect(result).toContain("frontmatter");
|
||||
expect(result).toContain("---");
|
||||
expect(result).toContain("name:");
|
||||
expect(result).toContain("version:");
|
||||
expect(result.length).toBeGreaterThan(500);
|
||||
});
|
||||
|
||||
test("prompt bootstrap returns framework-agnostic setup instructions", () => {
|
||||
const result = cmdPromptBootstrap();
|
||||
test("prompt usage combines all references", () => {
|
||||
const result = cmdPromptUsage();
|
||||
expect(typeof result).toBe("string");
|
||||
// Skills installation
|
||||
expect(result).toContain("uwf prompt usage");
|
||||
expect(result).toContain("uwf prompt workflow-authoring");
|
||||
expect(result).toContain("uwf prompt adapter-developing");
|
||||
expect(result).toContain("uwf-usage");
|
||||
expect(result).toContain("uwf-workflow-authoring");
|
||||
expect(result).toContain("uwf-adapter-developing");
|
||||
// Fresh install scenario
|
||||
expect(result).toContain("Fresh Install");
|
||||
expect(result).toContain("uwf setup");
|
||||
expect(result).toContain("--provider");
|
||||
expect(result).toContain("--api-key");
|
||||
expect(result).toContain("agent adapter");
|
||||
// Upgrade scenario
|
||||
expect(result).toContain("Upgrade");
|
||||
expect(result).toContain("Migrate");
|
||||
// Should NOT contain Hermes-specific paths
|
||||
expect(result).not.toContain("~/.hermes/skills/");
|
||||
expect(result).not.toContain("> ~/.hermes/");
|
||||
expect(result.length).toBeGreaterThan(100);
|
||||
expect(result).toContain("User Reference");
|
||||
expect(result).toContain("Author Reference");
|
||||
expect(result).toContain("Developer Reference");
|
||||
expect(result).toContain("Adapter Reference");
|
||||
expect(result).toContain("---");
|
||||
expect(result.length).toBeGreaterThan(2000);
|
||||
});
|
||||
|
||||
test("prompt help subcommand is suppressed", { timeout: 30_000 }, () => {
|
||||
const cliPath = join(__dirname, "..", "..", "dist", "cli.js");
|
||||
const output = execFileSync("node", [cliPath, "prompt", "--help"], {
|
||||
test("prompt setup returns setup instructions", () => {
|
||||
const result = cmdPromptSetup();
|
||||
expect(typeof result).toBe("string");
|
||||
expect(result).toContain("uwf Skill Setup");
|
||||
expect(result).toContain("uwf prompt usage");
|
||||
expect(result).toContain("uwf prompt setup");
|
||||
expect(result).toContain("SKILL.md");
|
||||
expect(result).toContain("version");
|
||||
});
|
||||
|
||||
test("prompt help subcommand is suppressed", () => {
|
||||
const output = execFileSync("npx", ["tsx", "src/cli.ts", "prompt", "--help"], {
|
||||
cwd: join(__dirname, "..", ".."),
|
||||
encoding: "utf-8",
|
||||
env: { ...process.env },
|
||||
env: { ...process.env, PATH: `/opt/homebrew/bin:${process.env.PATH}` },
|
||||
});
|
||||
expect(output).not.toMatch(/help\s+\[command\]/i);
|
||||
expect(output).toContain("usage");
|
||||
expect(output).toContain("bootstrap");
|
||||
expect(output).toContain("workflow-authoring");
|
||||
expect(output).toContain("adapter-developing");
|
||||
expect(output).toContain("setup");
|
||||
expect(output).toContain("user");
|
||||
expect(output).toContain("author");
|
||||
expect(output).toContain("developer");
|
||||
expect(output).toContain("adapter");
|
||||
expect(output).toContain("list");
|
||||
// Removed subcommands should not appear as command names
|
||||
expect(output).not.toMatch(/^\s+setup\s/m);
|
||||
expect(output).not.toContain("usage-reference");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -4,7 +4,7 @@ import { join } from "node:path";
|
||||
import { type CasRef, createThreadIndexEntry, type ThreadId } from "@united-workforce/protocol";
|
||||
import { afterEach, beforeEach, describe, expect, test } from "vitest";
|
||||
import { resolveHeadHash } from "../commands/shared.js";
|
||||
import { completeThread, createUwfStore, setThread } from "../store.js";
|
||||
import { addHistoryEntry, createUwfStore, setThread } from "../store.js";
|
||||
|
||||
let tmpDir: string;
|
||||
|
||||
@@ -31,13 +31,19 @@ describe("resolveHeadHash", () => {
|
||||
expect(result).toBe(headHash);
|
||||
});
|
||||
|
||||
test("finds completed thread", async () => {
|
||||
test("falls back to history variable when thread not in active index", async () => {
|
||||
const threadId = "01JTEST0000000000000000002" as ThreadId;
|
||||
const workflowHash = "workflow_hash_789" as CasRef;
|
||||
|
||||
const uwf = await createUwfStore(tmpDir);
|
||||
const headHash = (await uwf.store.cas.put(uwf.schemas.text, "completed-head")) as CasRef;
|
||||
setThread(uwf.varStore, threadId, createThreadIndexEntry(headHash));
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow: workflowHash,
|
||||
head: headHash,
|
||||
completedAt: Date.now(),
|
||||
reason: null,
|
||||
});
|
||||
|
||||
const result = await resolveHeadHash(tmpDir, threadId);
|
||||
|
||||
@@ -48,36 +54,58 @@ describe("resolveHeadHash", () => {
|
||||
// calls fail() which does process.exit(1), terminating the test runner.
|
||||
// The error behavior is tested in integration tests below via CLI invocation.
|
||||
|
||||
test("prioritizes active thread", async () => {
|
||||
test("prioritizes active thread over history when thread exists in both", async () => {
|
||||
const threadId = "01JTEST0000000000000000004" as ThreadId;
|
||||
const workflowHash = "workflow_hash_xyz" as CasRef;
|
||||
|
||||
const uwf = await createUwfStore(tmpDir);
|
||||
const activeHead = (await uwf.store.cas.put(uwf.schemas.text, "active-v2")) as CasRef;
|
||||
const historicalHash = (await uwf.store.cas.put(uwf.schemas.text, "historical-v1")) as CasRef;
|
||||
setThread(uwf.varStore, threadId, createThreadIndexEntry(activeHead));
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow: workflowHash,
|
||||
head: historicalHash,
|
||||
completedAt: Date.now(),
|
||||
reason: null,
|
||||
});
|
||||
|
||||
const result = await resolveHeadHash(tmpDir, threadId);
|
||||
|
||||
// Should return the active head
|
||||
// Should return the active head, not the historical one
|
||||
expect(result).toBe(activeHead);
|
||||
});
|
||||
|
||||
test("finds thread from multiple completed threads", async () => {
|
||||
test("finds thread from multiple history entries", async () => {
|
||||
const threadId1 = "01JTEST0000000000000000005" as ThreadId;
|
||||
const threadId2 = "01JTEST0000000000000000006" as ThreadId;
|
||||
const threadId3 = "01JTEST0000000000000000007" as ThreadId;
|
||||
const workflowHash = "workflow_hash_abc" as CasRef;
|
||||
const uwf = await createUwfStore(tmpDir);
|
||||
const hash1 = (await uwf.store.cas.put(uwf.schemas.text, "hash-thread1")) as CasRef;
|
||||
const hash2 = (await uwf.store.cas.put(uwf.schemas.text, "hash-thread2")) as CasRef;
|
||||
const hash3 = (await uwf.store.cas.put(uwf.schemas.text, "hash-thread3")) as CasRef;
|
||||
|
||||
setThread(uwf.varStore, threadId1, createThreadIndexEntry(hash1));
|
||||
completeThread(uwf.varStore, threadId1, "completed");
|
||||
|
||||
setThread(uwf.varStore, threadId2, createThreadIndexEntry(hash2));
|
||||
completeThread(uwf.varStore, threadId2, "completed");
|
||||
|
||||
setThread(uwf.varStore, threadId3, createThreadIndexEntry(hash3));
|
||||
completeThread(uwf.varStore, threadId3, "completed");
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId1,
|
||||
workflow: workflowHash,
|
||||
head: hash1,
|
||||
completedAt: Date.now() - 2000,
|
||||
reason: null,
|
||||
});
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId2,
|
||||
workflow: workflowHash,
|
||||
head: hash2,
|
||||
completedAt: Date.now() - 1000,
|
||||
reason: null,
|
||||
});
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId3,
|
||||
workflow: workflowHash,
|
||||
head: hash3,
|
||||
completedAt: Date.now(),
|
||||
reason: null,
|
||||
});
|
||||
|
||||
const result = await resolveHeadHash(tmpDir, threadId2);
|
||||
|
||||
|
||||
@@ -118,7 +118,6 @@ async function createTestStep(
|
||||
completedAtMs: Date.now() + 1000,
|
||||
assembledPrompt: null,
|
||||
cwd: "/tmp",
|
||||
usage: null,
|
||||
};
|
||||
return store.cas.put(schemas.stepNode, stepPayload);
|
||||
}
|
||||
|
||||
@@ -96,7 +96,6 @@ describe("protocol types", () => {
|
||||
completedAtMs: 2000,
|
||||
assembledPrompt: null,
|
||||
cwd: "/test/path",
|
||||
usage: null,
|
||||
};
|
||||
expect(record.startedAtMs).toBe(1000);
|
||||
expect(record.completedAtMs).toBe(2000);
|
||||
@@ -111,7 +110,6 @@ describe("protocol types", () => {
|
||||
agent: "uwf-test",
|
||||
timestamp: 123,
|
||||
durationMs: 5000,
|
||||
usage: null,
|
||||
};
|
||||
expect(entry.durationMs).toBe(5000);
|
||||
});
|
||||
@@ -253,11 +251,8 @@ describe("thread read timing", () => {
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "go", location: null },
|
||||
resume: { role: "worker", prompt: "resume", location: null },
|
||||
},
|
||||
worker: { done: { role: "$END", prompt: "", location: null } },
|
||||
$START: { _: { role: "worker", prompt: "go", location: null } },
|
||||
worker: { _: { role: "$END", prompt: "", location: null } },
|
||||
},
|
||||
});
|
||||
|
||||
@@ -322,11 +317,8 @@ describe("thread read timing", () => {
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "go", location: null },
|
||||
resume: { role: "worker", prompt: "resume", location: null },
|
||||
},
|
||||
worker: { done: { role: "$END", prompt: "", location: null } },
|
||||
$START: { _: { role: "worker", prompt: "go", location: null } },
|
||||
worker: { _: { role: "$END", prompt: "", location: null } },
|
||||
},
|
||||
});
|
||||
|
||||
|
||||
@@ -226,15 +226,19 @@ describe("Global CAS directory", () => {
|
||||
const uwf = await createUwfStore(storageRoot);
|
||||
const threadId = "thread-123" as ThreadId;
|
||||
const headHash = await uwf.store.cas.put(uwf.schemas.text, "history-head");
|
||||
const { completeThread, setThread, getThread } = await import("../store.js");
|
||||
const { createThreadIndexEntry } = await import("@united-workforce/protocol");
|
||||
const { addHistoryEntry, findHistoryEntry } = await import("../store.js");
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow: "workflow-456",
|
||||
head: headHash,
|
||||
completedAt: Date.now(),
|
||||
reason: "completed",
|
||||
});
|
||||
|
||||
setThread(uwf.varStore, threadId, createThreadIndexEntry(headHash));
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
const entry = findHistoryEntry(uwf.varStore, threadId);
|
||||
expect(entry?.thread).toBe(threadId);
|
||||
expect(entry?.workflow).toBe("workflow-456");
|
||||
expect(entry?.head).toBe(headHash);
|
||||
expect(entry?.status).toBe("completed");
|
||||
|
||||
const { access } = await import("node:fs/promises");
|
||||
await access(join(globalCasDir, "vars"));
|
||||
@@ -270,12 +274,15 @@ describe("Global CAS directory", () => {
|
||||
);
|
||||
|
||||
const uwf = await createUwfStore(storageRoot);
|
||||
const { getThread } = await import("../store.js");
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
expect(entry).not.toBeNull();
|
||||
expect(entry?.head).toBe(headHash);
|
||||
expect(entry?.status).toBe("cancelled");
|
||||
expect(entry?.completedAt).toBe(completedAt);
|
||||
const { findHistoryEntry } = await import("../store.js");
|
||||
const entry = findHistoryEntry(uwf.varStore, threadId);
|
||||
expect(entry).toEqual({
|
||||
thread: threadId,
|
||||
workflow: workflowHash,
|
||||
head: headHash,
|
||||
completedAt,
|
||||
reason: "cancelled",
|
||||
});
|
||||
|
||||
await expect(access(historyPath)).rejects.toThrow();
|
||||
const migratedContent = await readFile(`${historyPath}.migrated`, "utf8");
|
||||
|
||||
@@ -1,235 +0,0 @@
|
||||
import { mkdir, mkdtemp } from "node:fs/promises";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import type { CasRef, ThreadId } from "@united-workforce/protocol";
|
||||
import { describe, expect, test } from "vitest";
|
||||
import {
|
||||
completeThread,
|
||||
createUwfStore,
|
||||
getThread,
|
||||
loadActiveThreads,
|
||||
loadHistoryThreads,
|
||||
setThread,
|
||||
} from "../store.js";
|
||||
|
||||
async function makeUwfStore(storageRoot: string) {
|
||||
const casDir = join(storageRoot, "cas");
|
||||
await mkdir(casDir, { recursive: true });
|
||||
process.env.OCAS_HOME = casDir;
|
||||
return createUwfStore(storageRoot);
|
||||
}
|
||||
|
||||
async function seedThreadHead(
|
||||
uwf: Awaited<ReturnType<typeof createUwfStore>>,
|
||||
label: string,
|
||||
): Promise<CasRef> {
|
||||
return (await uwf.store.cas.put(uwf.schemas.text, label)) as CasRef;
|
||||
}
|
||||
|
||||
describe("unified thread storage", () => {
|
||||
test("loadActiveThreads excludes completed threads", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-active-test-"));
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
|
||||
const threadId1 = "01JTEST000000000000ACTIVE1" as ThreadId;
|
||||
const threadId2 = "01JTEST000000000000ACTIVE2" as ThreadId;
|
||||
const head1 = await seedThreadHead(uwf, "active-head");
|
||||
const head2 = await seedThreadHead(uwf, "completed-head");
|
||||
|
||||
setThread(uwf.varStore, threadId1, {
|
||||
head: head1,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
|
||||
setThread(uwf.varStore, threadId2, {
|
||||
head: head2,
|
||||
status: "completed",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: Date.now(),
|
||||
});
|
||||
|
||||
const active = loadActiveThreads(uwf.varStore);
|
||||
expect(Object.keys(active)).toHaveLength(1);
|
||||
expect(active[threadId1]).toBeDefined();
|
||||
expect(active[threadId2]).toBeUndefined();
|
||||
});
|
||||
|
||||
test("loadActiveThreads excludes cancelled threads", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-active-test-"));
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
|
||||
const threadId1 = "01JTEST000000000000ACTIVE3" as ThreadId;
|
||||
const threadId2 = "01JTEST000000000000ACTIVE4" as ThreadId;
|
||||
const head1 = await seedThreadHead(uwf, "active-head");
|
||||
const head2 = await seedThreadHead(uwf, "cancelled-head");
|
||||
|
||||
setThread(uwf.varStore, threadId1, {
|
||||
head: head1,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
|
||||
setThread(uwf.varStore, threadId2, {
|
||||
head: head2,
|
||||
status: "cancelled",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: Date.now(),
|
||||
});
|
||||
|
||||
const active = loadActiveThreads(uwf.varStore);
|
||||
expect(Object.keys(active)).toHaveLength(1);
|
||||
expect(active[threadId1]).toBeDefined();
|
||||
expect(active[threadId2]).toBeUndefined();
|
||||
});
|
||||
|
||||
test("loadHistoryThreads only returns completed and cancelled", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-history-test-"));
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
|
||||
const threadId1 = "01JTEST000000000000HISTOR1" as ThreadId;
|
||||
const threadId2 = "01JTEST000000000000HISTOR2" as ThreadId;
|
||||
const threadId3 = "01JTEST000000000000HISTOR3" as ThreadId;
|
||||
const head1 = await seedThreadHead(uwf, "active-head");
|
||||
const head2 = await seedThreadHead(uwf, "completed-head");
|
||||
const head3 = await seedThreadHead(uwf, "cancelled-head");
|
||||
|
||||
setThread(uwf.varStore, threadId1, {
|
||||
head: head1,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
|
||||
setThread(uwf.varStore, threadId2, {
|
||||
head: head2,
|
||||
status: "completed",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: Date.now(),
|
||||
});
|
||||
|
||||
setThread(uwf.varStore, threadId3, {
|
||||
head: head3,
|
||||
status: "cancelled",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: Date.now(),
|
||||
});
|
||||
|
||||
const history = loadHistoryThreads(uwf.varStore);
|
||||
expect(Object.keys(history)).toHaveLength(2);
|
||||
expect(history[threadId1]).toBeUndefined();
|
||||
expect(history[threadId2]).toBeDefined();
|
||||
expect(history[threadId3]).toBeDefined();
|
||||
});
|
||||
|
||||
test("completeThread marks thread as completed", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-complete-test-"));
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
const threadId = "01JTEST000000000000COMPLE1" as ThreadId;
|
||||
const head = await seedThreadHead(uwf, "active-head");
|
||||
|
||||
setThread(uwf.varStore, threadId, {
|
||||
head,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
expect(entry).not.toBeNull();
|
||||
expect(entry?.status).toBe("completed");
|
||||
expect(entry?.completedAt).toBeDefined();
|
||||
expect(entry?.completedAt).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
test("completeThread marks thread as cancelled", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-complete-test-"));
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
const threadId = "01JTEST000000000000COMPLE2" as ThreadId;
|
||||
const head = await seedThreadHead(uwf, "active-head");
|
||||
|
||||
setThread(uwf.varStore, threadId, {
|
||||
head,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
|
||||
completeThread(uwf.varStore, threadId, "cancelled");
|
||||
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
expect(entry).not.toBeNull();
|
||||
expect(entry?.status).toBe("cancelled");
|
||||
expect(entry?.completedAt).toBeDefined();
|
||||
expect(entry?.completedAt).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
test("completeThread clears suspend metadata", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-complete-test-"));
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
const threadId = "01JTEST000000000000COMPLE3" as ThreadId;
|
||||
const head = await seedThreadHead(uwf, "suspended-head");
|
||||
|
||||
setThread(uwf.varStore, threadId, {
|
||||
head,
|
||||
status: "suspended",
|
||||
suspendedRole: "test-role",
|
||||
suspendMessage: "test message",
|
||||
completedAt: null,
|
||||
});
|
||||
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
expect(entry).not.toBeNull();
|
||||
expect(entry?.status).toBe("completed");
|
||||
expect(entry?.suspendedRole).toBeNull();
|
||||
expect(entry?.suspendMessage).toBeNull();
|
||||
});
|
||||
|
||||
test("completeThread handles non-existent thread gracefully", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-complete-test-"));
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
const threadId = "01JTEST000000000000NOEXIST" as ThreadId;
|
||||
|
||||
// Should not throw
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
expect(entry).toBeNull();
|
||||
});
|
||||
|
||||
test("status and completedAt tags are persisted and loaded", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-tags-test-"));
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
const threadId = "01JTEST000000000000TAGTEST" as ThreadId;
|
||||
const head = await seedThreadHead(uwf, "test-head");
|
||||
const now = Date.now();
|
||||
|
||||
setThread(uwf.varStore, threadId, {
|
||||
head,
|
||||
status: "completed",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: now,
|
||||
});
|
||||
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
expect(entry).not.toBeNull();
|
||||
expect(entry?.status).toBe("completed");
|
||||
expect(entry?.completedAt).toBe(now);
|
||||
});
|
||||
});
|
||||
@@ -3,13 +3,7 @@ import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import type { CasRef, ThreadId } from "@united-workforce/protocol";
|
||||
import { describe, expect, test } from "vitest";
|
||||
import {
|
||||
completeThread,
|
||||
createUwfStore,
|
||||
getThread,
|
||||
loadHistoryThreads,
|
||||
setThread,
|
||||
} from "../store.js";
|
||||
import { addHistoryEntry, createUwfStore, loadAllHistory } from "../store.js";
|
||||
|
||||
async function makeUwfStore(storageRoot: string) {
|
||||
const casDir = join(storageRoot, "cas");
|
||||
@@ -26,113 +20,88 @@ async function seedHistoryHead(
|
||||
}
|
||||
|
||||
describe("thread cancel status", () => {
|
||||
test("cancelled thread has status 'cancelled'", async () => {
|
||||
test("cancelled history entry has reason 'cancelled'", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-cancel-test-"));
|
||||
const threadId = "01JTEST000000000000CANCEL1" as ThreadId;
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
const head = await seedHistoryHead(uwf, "cancelled-head");
|
||||
|
||||
setThread(uwf.varStore, threadId, {
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow: "test-workflow",
|
||||
head,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
completedAt: Date.now(),
|
||||
reason: "cancelled",
|
||||
});
|
||||
|
||||
completeThread(uwf.varStore, threadId, "cancelled");
|
||||
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
expect(entry).not.toBeNull();
|
||||
expect(entry?.status).toBe("cancelled");
|
||||
const history = loadAllHistory(uwf.varStore);
|
||||
expect(history).toHaveLength(1);
|
||||
expect(history[0]?.reason).toBe("cancelled");
|
||||
});
|
||||
|
||||
test("completed thread has status 'completed'", async () => {
|
||||
test("completed history entry has reason 'completed'", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-cancel-test-"));
|
||||
const threadId = "01JTEST000000000000CANCEL2" as ThreadId;
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
const head = await seedHistoryHead(uwf, "completed-head");
|
||||
|
||||
setThread(uwf.varStore, threadId, {
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow: "test-workflow",
|
||||
head,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
completedAt: Date.now(),
|
||||
reason: "completed",
|
||||
});
|
||||
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
expect(entry).not.toBeNull();
|
||||
expect(entry?.status).toBe("completed");
|
||||
const history = loadAllHistory(uwf.varStore);
|
||||
expect(history).toHaveLength(1);
|
||||
expect(history[0]?.reason).toBe("completed");
|
||||
});
|
||||
|
||||
test("loadHistoryThreads returns completed and cancelled", async () => {
|
||||
test("history entry with null reason is stored as completed", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-cancel-test-"));
|
||||
const threadId = "01JTEST000000000000CANCEL3" as ThreadId;
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
const head = await seedHistoryHead(uwf, "legacy-head");
|
||||
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow: "test-workflow",
|
||||
head,
|
||||
completedAt: Date.now(),
|
||||
reason: null,
|
||||
});
|
||||
|
||||
const history = loadAllHistory(uwf.varStore);
|
||||
expect(history).toHaveLength(1);
|
||||
expect(history[0]?.reason).toBe("completed");
|
||||
});
|
||||
|
||||
test("mixed completed and cancelled entries preserve distinct reasons", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-cancel-test-"));
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
const head1 = await seedHistoryHead(uwf, "head1");
|
||||
const head2 = await seedHistoryHead(uwf, "head2");
|
||||
|
||||
const threadId1 = "01JTEST000000000000CANCEL4" as ThreadId;
|
||||
setThread(uwf.varStore, threadId1, {
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: "01JTEST000000000000CANCEL4" as ThreadId,
|
||||
workflow: "test-workflow",
|
||||
head: head1,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
completedAt: Date.now(),
|
||||
reason: "completed",
|
||||
});
|
||||
completeThread(uwf.varStore, threadId1, "completed");
|
||||
|
||||
const threadId2 = "01JTEST000000000000CANCEL5" as ThreadId;
|
||||
setThread(uwf.varStore, threadId2, {
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: "01JTEST000000000000CANCEL5" as ThreadId,
|
||||
workflow: "test-workflow",
|
||||
head: head2,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
completeThread(uwf.varStore, threadId2, "cancelled");
|
||||
|
||||
const history = loadHistoryThreads(uwf.varStore);
|
||||
expect(Object.keys(history)).toHaveLength(2);
|
||||
const statuses = Object.values(history)
|
||||
.map((entry) => entry.status)
|
||||
.sort();
|
||||
expect(statuses).toEqual(["cancelled", "completed"]);
|
||||
completedAt: Date.now(),
|
||||
reason: "cancelled",
|
||||
});
|
||||
|
||||
test("mixed completed and cancelled entries preserve distinct statuses", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-cancel-test-"));
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
const head1 = await seedHistoryHead(uwf, "head1");
|
||||
const head2 = await seedHistoryHead(uwf, "head2");
|
||||
|
||||
const threadId1 = "01JTEST000000000000CANCEL6" as ThreadId;
|
||||
setThread(uwf.varStore, threadId1, {
|
||||
head: head1,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
completeThread(uwf.varStore, threadId1, "completed");
|
||||
|
||||
const threadId2 = "01JTEST000000000000CANCEL7" as ThreadId;
|
||||
setThread(uwf.varStore, threadId2, {
|
||||
head: head2,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
completeThread(uwf.varStore, threadId2, "cancelled");
|
||||
|
||||
const history = loadHistoryThreads(uwf.varStore);
|
||||
expect(Object.keys(history)).toHaveLength(2);
|
||||
const statuses = Object.values(history)
|
||||
.map((entry) => entry.status)
|
||||
.sort();
|
||||
expect(statuses).toEqual(["cancelled", "completed"]);
|
||||
const history = loadAllHistory(uwf.varStore);
|
||||
expect(history).toHaveLength(2);
|
||||
const reasons = history.map((entry) => entry.reason).sort();
|
||||
expect(reasons).toEqual(["cancelled", "completed"]);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -10,8 +10,9 @@ import { cmdThreadList } from "../commands/thread.js";
|
||||
import { parseTimeInput } from "../commands/thread-time-parser.js";
|
||||
import type { UwfStore } from "../store.js";
|
||||
import {
|
||||
completeThread as completeThreadInStore,
|
||||
addHistoryEntry,
|
||||
createUwfStore,
|
||||
deleteThread,
|
||||
loadAllThreads,
|
||||
setThread,
|
||||
} from "../store.js";
|
||||
@@ -72,11 +73,18 @@ async function markThreadRunning(storageRoot: string, threadId: ThreadId, workfl
|
||||
async function completeThread(
|
||||
storageRoot: string,
|
||||
threadId: ThreadId,
|
||||
_workflowHash: CasRef,
|
||||
_headHash: CasRef,
|
||||
workflowHash: CasRef,
|
||||
headHash: CasRef,
|
||||
) {
|
||||
const uwfIdx = await createUwfStore(storageRoot);
|
||||
completeThreadInStore(uwfIdx.varStore, threadId, "completed");
|
||||
deleteThread(uwfIdx.varStore, threadId);
|
||||
addHistoryEntry(uwfIdx.varStore, {
|
||||
thread: threadId,
|
||||
workflow: workflowHash,
|
||||
head: headHash,
|
||||
completedAt: Date.now(),
|
||||
reason: null,
|
||||
});
|
||||
}
|
||||
|
||||
// ── test setup ────────────────────────────────────────────────────────────────
|
||||
@@ -492,10 +500,8 @@ describe("edge cases", () => {
|
||||
)) as CasRef;
|
||||
index["INVALID_ULID_FORMAT_HERE" as ThreadId] = {
|
||||
head: placeholderHead,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
};
|
||||
for (const [tid, ent] of Object.entries(index)) {
|
||||
setThread(uwfIdx.varStore, tid as ThreadId, ent);
|
||||
|
||||
@@ -54,19 +54,15 @@ roles:
|
||||
type: object
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { const: "ready" }
|
||||
$status: { type: string }
|
||||
graph:
|
||||
$START:
|
||||
new:
|
||||
_:
|
||||
role: planner
|
||||
prompt: "Plan the work"
|
||||
location: null
|
||||
resume:
|
||||
role: planner
|
||||
prompt: "Resume the work"
|
||||
location: null
|
||||
planner:
|
||||
ready:
|
||||
_:
|
||||
role: $END
|
||||
prompt: "Done"
|
||||
location: null
|
||||
@@ -114,19 +110,15 @@ roles:
|
||||
type: object
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { const: "ready" }
|
||||
$status: { type: string }
|
||||
graph:
|
||||
$START:
|
||||
new:
|
||||
_:
|
||||
role: planner
|
||||
prompt: "Plan"
|
||||
location: null
|
||||
resume:
|
||||
role: planner
|
||||
prompt: "Resume"
|
||||
location: null
|
||||
planner:
|
||||
ready:
|
||||
_:
|
||||
role: $END
|
||||
prompt: "Done"
|
||||
location: null
|
||||
@@ -161,19 +153,15 @@ roles:
|
||||
type: object
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { const: "ready" }
|
||||
$status: { type: string }
|
||||
graph:
|
||||
$START:
|
||||
new:
|
||||
_:
|
||||
role: planner
|
||||
prompt: "Plan"
|
||||
location: null
|
||||
resume:
|
||||
role: planner
|
||||
prompt: "Resume"
|
||||
location: null
|
||||
planner:
|
||||
ready:
|
||||
_:
|
||||
role: $END
|
||||
prompt: "Done"
|
||||
location: null
|
||||
|
||||
@@ -70,10 +70,7 @@ async function setupSuspendedThread(mode: MockAgentMode): Promise<{
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "Start work", location: null },
|
||||
resume: { role: "worker", prompt: "Resume the work", location: null },
|
||||
},
|
||||
$START: { _: { role: "worker", prompt: "Start work", location: null } },
|
||||
worker: {
|
||||
needs_input: {
|
||||
role: "$SUSPEND",
|
||||
@@ -82,7 +79,7 @@ async function setupSuspendedThread(mode: MockAgentMode): Promise<{
|
||||
},
|
||||
ok: { role: "reviewer", prompt: "Review the work", location: null },
|
||||
},
|
||||
reviewer: { done: { role: "$END", prompt: "Done", location: null } },
|
||||
reviewer: { _: { role: "$END", prompt: "Done", location: null } },
|
||||
},
|
||||
});
|
||||
|
||||
@@ -121,10 +118,8 @@ async function setupSuspendedThread(mode: MockAgentMode): Promise<{
|
||||
await seedThreads(tmpDir, {
|
||||
[THREAD_ID]: {
|
||||
head: stepHash,
|
||||
status: "suspended",
|
||||
suspendedRole: "worker",
|
||||
suspendMessage: SUSPEND_MESSAGE,
|
||||
completedAt: null,
|
||||
},
|
||||
});
|
||||
|
||||
@@ -236,11 +231,8 @@ describe("uwf thread resume", () => {
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "Start", location: null },
|
||||
resume: { role: "worker", prompt: "Resume", location: null },
|
||||
},
|
||||
worker: { done: { role: "$END", prompt: "Done", location: null } },
|
||||
$START: { _: { role: "worker", prompt: "Start", location: null } },
|
||||
worker: { _: { role: "$END", prompt: "Done", location: null } },
|
||||
},
|
||||
});
|
||||
|
||||
@@ -255,7 +247,7 @@ describe("uwf thread resume", () => {
|
||||
|
||||
const result = runUwf(["thread", "resume", THREAD_ID], casDir);
|
||||
expect(result.status).not.toBe(0);
|
||||
expect(result.stderr).toContain("thread cannot be resumed");
|
||||
expect(result.stderr).toContain("thread is not suspended");
|
||||
});
|
||||
|
||||
test("resume suspended thread executes step and becomes idle", async () => {
|
||||
@@ -355,10 +347,8 @@ describe("uwf thread resume", () => {
|
||||
const uwfAfterFirst = await createUwfStore(tmpDir);
|
||||
expect(getThread(uwfAfterFirst.varStore, THREAD_ID)).toEqual({
|
||||
head: firstResume.head,
|
||||
status: "suspended",
|
||||
suspendedRole: "worker",
|
||||
suspendMessage: SUSPEND_MESSAGE,
|
||||
completedAt: null,
|
||||
});
|
||||
|
||||
const { mockAgentPath: okMockAgentPath } = await setupOkMockAgent(
|
||||
@@ -454,272 +444,3 @@ echo '${adapterJson}'
|
||||
|
||||
return { mockAgentPath };
|
||||
}
|
||||
|
||||
describe("uwf thread resume - completed threads", () => {
|
||||
test("resume completed thread starts from $START role", async () => {
|
||||
const casDir = join(tmpDir, "cas");
|
||||
await mkdir(casDir, { recursive: true });
|
||||
const store = await openStore(casDir);
|
||||
const schemas = await registerUwfSchemas(store);
|
||||
const outputSchemaHash = await putSchema(store, OUTPUT_SCHEMA);
|
||||
|
||||
const workflowHash = await store.cas.put(schemas.workflow, {
|
||||
name: "test-completed-resume",
|
||||
description: "completed thread resume test",
|
||||
roles: {
|
||||
worker: {
|
||||
description: "Worker role",
|
||||
goal: "Work",
|
||||
capabilities: [],
|
||||
procedure: "work",
|
||||
output: "result",
|
||||
frontmatter: outputSchemaHash,
|
||||
},
|
||||
reviewer: {
|
||||
description: "Reviewer role",
|
||||
goal: "Review",
|
||||
capabilities: [],
|
||||
procedure: "review",
|
||||
output: "result",
|
||||
frontmatter: outputSchemaHash,
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "Start work", location: null },
|
||||
resume: { role: "worker", prompt: "Resume the work", location: null },
|
||||
},
|
||||
worker: { done: { role: "reviewer", prompt: "Review the work", location: null } },
|
||||
reviewer: { done: { role: "$END", prompt: "Done", location: null } },
|
||||
},
|
||||
});
|
||||
|
||||
const startHash = await store.cas.put(schemas.startNode, {
|
||||
workflow: workflowHash,
|
||||
prompt: "Initial task",
|
||||
cwd: tmpDir,
|
||||
});
|
||||
|
||||
process.env.OCAS_HOME = casDir;
|
||||
|
||||
const workerOutputHash = await store.cas.put(outputSchemaHash, { $status: "done" });
|
||||
const reviewerOutputHash = await store.cas.put(outputSchemaHash, { $status: "done" });
|
||||
const detailHash = await store.cas.put(schemas.text, "mock detail");
|
||||
|
||||
const workerStepHash = await store.cas.put(schemas.stepNode, {
|
||||
start: startHash,
|
||||
prev: null,
|
||||
role: "worker",
|
||||
output: workerOutputHash,
|
||||
detail: detailHash,
|
||||
agent: "uwf-mock",
|
||||
edgePrompt: "Start work",
|
||||
startedAtMs: 1716600000000,
|
||||
completedAtMs: 1716600001000,
|
||||
cwd: tmpDir,
|
||||
assembledPrompt: null,
|
||||
});
|
||||
|
||||
const reviewerStepHash = await store.cas.put(schemas.stepNode, {
|
||||
start: startHash,
|
||||
prev: workerStepHash,
|
||||
role: "reviewer",
|
||||
output: reviewerOutputHash,
|
||||
detail: detailHash,
|
||||
agent: "uwf-mock",
|
||||
edgePrompt: "Review the work",
|
||||
startedAtMs: 1716600001000,
|
||||
completedAtMs: 1716600002000,
|
||||
cwd: tmpDir,
|
||||
assembledPrompt: null,
|
||||
});
|
||||
|
||||
await seedThreads(tmpDir, {
|
||||
[THREAD_ID]: {
|
||||
head: reviewerStepHash,
|
||||
status: "completed",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: 1716600002000,
|
||||
},
|
||||
});
|
||||
|
||||
// Verify the status was actually set
|
||||
const { createUwfStore, getThread } = await import("../store.js");
|
||||
const verifyUwf = await createUwfStore(tmpDir);
|
||||
const verifyEntry = getThread(verifyUwf.varStore, THREAD_ID);
|
||||
console.log("Seeded entry status:", verifyEntry?.status);
|
||||
console.log("Seeded entry:", JSON.stringify(verifyEntry, null, 2));
|
||||
|
||||
const promptCapturePath = join(tmpDir, "captured-prompt-completed.txt");
|
||||
const mockAgentPath = join(tmpDir, "mock-agent-completed.sh");
|
||||
|
||||
const newWorkerStepHash = await store.cas.put(schemas.stepNode, {
|
||||
start: startHash,
|
||||
prev: reviewerStepHash,
|
||||
role: "worker",
|
||||
output: workerOutputHash,
|
||||
detail: detailHash,
|
||||
agent: "uwf-mock",
|
||||
edgePrompt: "Start work",
|
||||
startedAtMs: 1716600003000,
|
||||
completedAtMs: 1716600004000,
|
||||
cwd: tmpDir,
|
||||
assembledPrompt: null,
|
||||
});
|
||||
|
||||
const adapterJson = JSON.stringify({
|
||||
stepHash: newWorkerStepHash,
|
||||
detailHash,
|
||||
role: "worker",
|
||||
frontmatter: { $status: "done" },
|
||||
body: "",
|
||||
startedAtMs: 1716600003000,
|
||||
completedAtMs: 1716600004000,
|
||||
});
|
||||
|
||||
await writeFile(
|
||||
mockAgentPath,
|
||||
`#!/bin/sh
|
||||
prompt=""
|
||||
while [ $# -gt 0 ]; do
|
||||
if [ "$1" = "--prompt" ]; then
|
||||
prompt="$2"
|
||||
shift 2
|
||||
else
|
||||
shift
|
||||
fi
|
||||
done
|
||||
printf '%s' "$prompt" > '${promptCapturePath}'
|
||||
echo '${adapterJson}'
|
||||
`,
|
||||
{ mode: 0o755 },
|
||||
);
|
||||
|
||||
const configPath = join(tmpDir, "config.yaml");
|
||||
await writeFile(
|
||||
configPath,
|
||||
`defaultAgent: uwf-hermes\ndefaultModel: test-model\nagentOverrides: null\nagents: {}\nproviders: {}\nmodels: {}\n`,
|
||||
);
|
||||
|
||||
const result = runUwf(
|
||||
["thread", "resume", THREAD_ID, "-p", "Additional context", "--agent", mockAgentPath],
|
||||
casDir,
|
||||
);
|
||||
|
||||
if (result.status !== 0) {
|
||||
console.error("Command failed:", result.stderr);
|
||||
}
|
||||
|
||||
expect(result.status).toBe(0);
|
||||
|
||||
const cliOutput = JSON.parse(result.stdout.trim());
|
||||
expect(cliOutput.status).toBe("idle");
|
||||
expect(cliOutput.currentRole).toBe("reviewer");
|
||||
expect(cliOutput.done).toBe(false);
|
||||
|
||||
const capturedPrompt = await readFile(promptCapturePath, "utf8");
|
||||
expect(capturedPrompt).toContain("Resume the work");
|
||||
expect(capturedPrompt).toContain("Additional context");
|
||||
|
||||
const storeModule = await import("../store.js");
|
||||
const uwf2 = await storeModule.createUwfStore(tmpDir);
|
||||
const entry2 = storeModule.getThread(uwf2.varStore, THREAD_ID);
|
||||
expect(entry2?.status).toBe("idle");
|
||||
expect(entry2?.completedAt).toBeNull();
|
||||
});
|
||||
|
||||
test("resume cancelled thread returns error", async () => {
|
||||
const casDir = join(tmpDir, "cas");
|
||||
await mkdir(casDir, { recursive: true });
|
||||
const store = await openStore(casDir);
|
||||
const schemas = await registerUwfSchemas(store);
|
||||
|
||||
const workflowHash = await store.cas.put(schemas.workflow, {
|
||||
name: "cancelled-workflow",
|
||||
description: "cancelled thread",
|
||||
roles: {
|
||||
worker: {
|
||||
description: "Worker",
|
||||
goal: "Work",
|
||||
capabilities: [],
|
||||
procedure: "work",
|
||||
output: "result",
|
||||
frontmatter: await putSchema(store, OUTPUT_SCHEMA),
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "Start", location: null },
|
||||
resume: { role: "worker", prompt: "Resume", location: null },
|
||||
},
|
||||
worker: { done: { role: "$END", prompt: "Done", location: null } },
|
||||
},
|
||||
});
|
||||
|
||||
const startHash = await store.cas.put(schemas.startNode, {
|
||||
workflow: workflowHash,
|
||||
prompt: "task",
|
||||
cwd: tmpDir,
|
||||
});
|
||||
|
||||
process.env.OCAS_HOME = casDir;
|
||||
await seedThreads(tmpDir, {
|
||||
[THREAD_ID]: {
|
||||
head: startHash,
|
||||
status: "cancelled",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
},
|
||||
});
|
||||
|
||||
const result = runUwf(["thread", "resume", THREAD_ID], casDir);
|
||||
expect(result.status).not.toBe(0);
|
||||
expect(result.stderr).toContain("thread cannot be resumed");
|
||||
expect(result.stderr).toContain("cancelled");
|
||||
});
|
||||
|
||||
test("resume idle thread returns error", async () => {
|
||||
const casDir = join(tmpDir, "cas");
|
||||
await mkdir(casDir, { recursive: true });
|
||||
const store = await openStore(casDir);
|
||||
const schemas = await registerUwfSchemas(store);
|
||||
|
||||
const workflowHash = await store.cas.put(schemas.workflow, {
|
||||
name: "idle-workflow",
|
||||
description: "idle thread",
|
||||
roles: {
|
||||
worker: {
|
||||
description: "Worker",
|
||||
goal: "Work",
|
||||
capabilities: [],
|
||||
procedure: "work",
|
||||
output: "result",
|
||||
frontmatter: await putSchema(store, OUTPUT_SCHEMA),
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "Start", location: null },
|
||||
resume: { role: "worker", prompt: "Resume", location: null },
|
||||
},
|
||||
worker: { done: { role: "$END", prompt: "Done", location: null } },
|
||||
},
|
||||
});
|
||||
|
||||
const startHash = await store.cas.put(schemas.startNode, {
|
||||
workflow: workflowHash,
|
||||
prompt: "task",
|
||||
cwd: tmpDir,
|
||||
});
|
||||
|
||||
process.env.OCAS_HOME = casDir;
|
||||
await seedThreads(tmpDir, { [THREAD_ID]: startHash });
|
||||
|
||||
const result = runUwf(["thread", "resume", THREAD_ID], casDir);
|
||||
expect(result.status).not.toBe(0);
|
||||
expect(result.stderr).toContain("thread cannot be resumed");
|
||||
expect(result.stderr).toContain("idle");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -6,7 +6,13 @@ import type { CasRef, ThreadId } from "@united-workforce/protocol";
|
||||
import { describe, expect, test } from "vitest";
|
||||
import { createMarker, deleteMarker } from "../background/index.js";
|
||||
import { cmdThreadShow, cmdThreadStart } from "../commands/thread.js";
|
||||
import { completeThread, createUwfStore, loadAllThreads, setThread } from "../store.js";
|
||||
import {
|
||||
addHistoryEntry,
|
||||
createUwfStore,
|
||||
deleteThread,
|
||||
loadAllThreads,
|
||||
setThread,
|
||||
} from "../store.js";
|
||||
|
||||
const OUTPUT_SCHEMA = {
|
||||
type: "object" as const,
|
||||
@@ -31,19 +37,15 @@ roles:
|
||||
type: object
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { const: "ready" }
|
||||
$status: { type: string }
|
||||
graph:
|
||||
$START:
|
||||
new:
|
||||
_:
|
||||
role: planner
|
||||
prompt: "Plan the work"
|
||||
location: null
|
||||
resume:
|
||||
role: planner
|
||||
prompt: "Resume the work"
|
||||
location: null
|
||||
planner:
|
||||
ready:
|
||||
_:
|
||||
role: $END
|
||||
prompt: "Done"
|
||||
location: null
|
||||
@@ -70,14 +72,10 @@ roles:
|
||||
question: { type: string }
|
||||
graph:
|
||||
$START:
|
||||
new:
|
||||
_:
|
||||
role: worker
|
||||
prompt: "Start work"
|
||||
location: null
|
||||
resume:
|
||||
role: worker
|
||||
prompt: "Resume work"
|
||||
location: null
|
||||
worker:
|
||||
needs_input:
|
||||
role: $SUSPEND
|
||||
@@ -120,13 +118,7 @@ async function insertStepNode(
|
||||
assembledPrompt: null,
|
||||
})) as CasRef;
|
||||
|
||||
setThread(uwf.varStore, threadId, {
|
||||
head: stepHash,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
setThread(uwf.varStore, threadId, { head: stepHash, suspendedRole: null, suspendMessage: null });
|
||||
}
|
||||
|
||||
describe("thread show status field", () => {
|
||||
@@ -208,7 +200,7 @@ describe("thread show status field", () => {
|
||||
// Create a thread
|
||||
const startResult = await cmdThreadStart(storageRoot, workflowPath, "test prompt", tmpDir);
|
||||
const threadId = startResult.thread as ThreadId;
|
||||
const _workflow = startResult.workflow;
|
||||
const workflow = startResult.workflow;
|
||||
|
||||
// Get the head hash before moving to history
|
||||
const uwfForIndex = await createUwfStore(storageRoot);
|
||||
@@ -216,7 +208,15 @@ describe("thread show status field", () => {
|
||||
const head = index[threadId]!.head;
|
||||
if (!head) throw new Error("Thread not found in index");
|
||||
|
||||
completeThread(uwfForIndex.varStore, threadId, "completed");
|
||||
deleteThread(uwfForIndex.varStore, threadId);
|
||||
|
||||
addHistoryEntry(uwfForIndex.varStore, {
|
||||
thread: threadId,
|
||||
workflow,
|
||||
head,
|
||||
completedAt: Date.now(),
|
||||
reason: "completed",
|
||||
});
|
||||
|
||||
const result = await cmdThreadShow(storageRoot, threadId);
|
||||
|
||||
@@ -237,7 +237,7 @@ describe("thread show status field", () => {
|
||||
// Create a thread
|
||||
const startResult = await cmdThreadStart(storageRoot, workflowPath, "test prompt", tmpDir);
|
||||
const threadId = startResult.thread as ThreadId;
|
||||
const _workflow = startResult.workflow;
|
||||
const workflow = startResult.workflow;
|
||||
|
||||
// Get the head hash before moving to history
|
||||
const uwfForIndex = await createUwfStore(storageRoot);
|
||||
@@ -245,7 +245,15 @@ describe("thread show status field", () => {
|
||||
const head = index[threadId]!.head;
|
||||
if (!head) throw new Error("Thread not found in index");
|
||||
|
||||
completeThread(uwfForIndex.varStore, threadId, "cancelled");
|
||||
deleteThread(uwfForIndex.varStore, threadId);
|
||||
|
||||
addHistoryEntry(uwfForIndex.varStore, {
|
||||
thread: threadId,
|
||||
workflow,
|
||||
head,
|
||||
completedAt: Date.now(),
|
||||
reason: "cancelled",
|
||||
});
|
||||
|
||||
const result = await cmdThreadShow(storageRoot, threadId);
|
||||
|
||||
@@ -266,7 +274,7 @@ describe("thread show status field", () => {
|
||||
// Create a thread
|
||||
const startResult = await cmdThreadStart(storageRoot, workflowPath, "test prompt", tmpDir);
|
||||
const threadId = startResult.thread as ThreadId;
|
||||
const _workflow = startResult.workflow;
|
||||
const workflow = startResult.workflow;
|
||||
|
||||
// Get the head hash before moving to history
|
||||
const uwfForIndex = await createUwfStore(storageRoot);
|
||||
@@ -274,7 +282,15 @@ describe("thread show status field", () => {
|
||||
const head = index[threadId]!.head;
|
||||
if (!head) throw new Error("Thread not found in index");
|
||||
|
||||
completeThread(uwfForIndex.varStore, threadId, "completed");
|
||||
deleteThread(uwfForIndex.varStore, threadId);
|
||||
|
||||
addHistoryEntry(uwfForIndex.varStore, {
|
||||
thread: threadId,
|
||||
workflow,
|
||||
head,
|
||||
completedAt: Date.now(),
|
||||
reason: null,
|
||||
});
|
||||
|
||||
const result = await cmdThreadShow(storageRoot, threadId);
|
||||
|
||||
|
||||
@@ -54,19 +54,15 @@ roles:
|
||||
type: object
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { const: "ready" }
|
||||
$status: { type: string }
|
||||
graph:
|
||||
$START:
|
||||
new:
|
||||
_:
|
||||
role: planner
|
||||
prompt: "Plan the work"
|
||||
location: null
|
||||
resume:
|
||||
role: planner
|
||||
prompt: "Resume the work"
|
||||
location: null
|
||||
planner:
|
||||
ready:
|
||||
_:
|
||||
role: $END
|
||||
prompt: "Done"
|
||||
location: null
|
||||
|
||||
@@ -2,28 +2,19 @@ import { execFileSync } from "node:child_process";
|
||||
import { dirname, join } from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import { describe, expect, test } from "vitest";
|
||||
import { validateCount } from "../commands/thread.js";
|
||||
|
||||
const CLI_PATH = join(dirname(fileURLToPath(import.meta.url)), "..", "..", "dist", "cli.js");
|
||||
const CLI_PATH = join(dirname(fileURLToPath(import.meta.url)), "..", "cli.js");
|
||||
|
||||
function runCli(args: string[]): {
|
||||
stdout: string;
|
||||
stderr: string;
|
||||
exitCode: number;
|
||||
} {
|
||||
function runCli(args: string[]): { stdout: string; stderr: string; exitCode: number } {
|
||||
try {
|
||||
const stdout = execFileSync("node", [CLI_PATH, ...args], {
|
||||
const stdout = execFileSync("npx", ["tsx", CLI_PATH, ...args], {
|
||||
encoding: "utf8",
|
||||
env: { ...process.env, UWF_HOME: "/tmp/uwf-test-nonexistent" },
|
||||
stdio: ["ignore", "pipe", "pipe"],
|
||||
});
|
||||
return { stdout, stderr: "", exitCode: 0 };
|
||||
} catch (e: unknown) {
|
||||
const err = e as NodeJS.ErrnoException & {
|
||||
stdout?: string;
|
||||
stderr?: string;
|
||||
status?: number;
|
||||
};
|
||||
const err = e as NodeJS.ErrnoException & { stdout?: string; stderr?: string; status?: number };
|
||||
return {
|
||||
stdout: err.stdout ?? "",
|
||||
stderr: err.stderr ?? "",
|
||||
@@ -32,39 +23,50 @@ function runCli(args: string[]): {
|
||||
}
|
||||
}
|
||||
|
||||
describe("thread exec --count CLI parsing", { timeout: 30_000 }, () => {
|
||||
describe("thread exec --count CLI parsing", () => {
|
||||
test("--help shows -c/--count option", () => {
|
||||
const result = runCli(["thread", "exec", "--help"]);
|
||||
const combined = result.stdout + result.stderr;
|
||||
expect(combined).toContain("--count");
|
||||
expect(combined).toContain("-c");
|
||||
expect(result.stdout).toContain("--count");
|
||||
expect(result.stdout).toContain("-c");
|
||||
});
|
||||
|
||||
test("description says 'one or more steps'", () => {
|
||||
const result = runCli(["thread", "exec", "--help"]);
|
||||
const combined = result.stdout + result.stderr;
|
||||
expect(combined).toContain("one or more steps");
|
||||
expect(result.stdout).toContain("one or more steps");
|
||||
});
|
||||
});
|
||||
|
||||
describe("validateCount", () => {
|
||||
test("count=0 throws validation error", () => {
|
||||
expect(() => validateCount(0)).toThrow("positive integer");
|
||||
describe("cmdThreadExec count logic", () => {
|
||||
test("count=0 fails with validation error", () => {
|
||||
const result = runCli(["thread", "exec", "FAKE_THREAD_ID", "-c", "0"]);
|
||||
expect(result.exitCode).not.toBe(0);
|
||||
expect(result.stderr).toContain("positive integer");
|
||||
});
|
||||
|
||||
test("negative count throws validation error", () => {
|
||||
expect(() => validateCount(-1)).toThrow("positive integer");
|
||||
test("negative count fails with validation error", () => {
|
||||
const result = runCli(["thread", "exec", "FAKE_THREAD_ID", "-c", "-1"]);
|
||||
expect(result.exitCode).not.toBe(0);
|
||||
expect(result.stderr).toContain("positive integer");
|
||||
});
|
||||
|
||||
test("non-integer count throws validation error", () => {
|
||||
expect(() => validateCount(1.5)).toThrow("positive integer");
|
||||
test("non-integer count fails with validation error", () => {
|
||||
const result = runCli(["thread", "exec", "FAKE_THREAD_ID", "-c", "1.5"]);
|
||||
expect(result.exitCode).not.toBe(0);
|
||||
expect(result.stderr).toContain("positive integer");
|
||||
});
|
||||
|
||||
test("count=1 passes validation", () => {
|
||||
expect(() => validateCount(1)).not.toThrow();
|
||||
test("count=1 is the default (no -c flag)", () => {
|
||||
// Without -c, it should attempt to run 1 step (failing on missing thread, not on count validation)
|
||||
const result = runCli(["thread", "exec", "FAKE_THREAD_ID"]);
|
||||
expect(result.exitCode).not.toBe(0);
|
||||
// Should NOT contain "positive integer" error — should fail on thread lookup instead
|
||||
expect(result.stderr).not.toContain("positive integer");
|
||||
});
|
||||
|
||||
test("count=3 passes validation", () => {
|
||||
expect(() => validateCount(3)).not.toThrow();
|
||||
test("count=3 passes validation (fails on thread lookup)", () => {
|
||||
const result = runCli(["thread", "exec", "FAKE_THREAD_ID", "-c", "3"]);
|
||||
expect(result.exitCode).not.toBe(0);
|
||||
// Should NOT contain "positive integer" error — should fail on thread/storage lookup
|
||||
expect(result.stderr).not.toContain("positive integer");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -58,10 +58,7 @@ describe("suspend step CAS chain and threads.yaml metadata", () => {
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "Start work", location: null },
|
||||
resume: { role: "worker", prompt: "Resume work", location: null },
|
||||
},
|
||||
$START: { _: { role: "worker", prompt: "Start work", location: null } },
|
||||
worker: {
|
||||
needs_input: {
|
||||
role: "$SUSPEND",
|
||||
@@ -163,10 +160,8 @@ describe("suspend step CAS chain and threads.yaml metadata", () => {
|
||||
const threadEntry = getThread(uwf.varStore, threadId);
|
||||
expect(threadEntry).toEqual({
|
||||
head: stepHash,
|
||||
status: "suspended",
|
||||
suspendedRole: "worker",
|
||||
suspendMessage: "Please clarify: Which API?",
|
||||
completedAt: null,
|
||||
});
|
||||
|
||||
const showResult = await cmdThreadShow(tmpDir, threadId);
|
||||
|
||||
@@ -55,10 +55,7 @@ describe("suspended thread display", () => {
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "Start work", location: null },
|
||||
resume: { role: "worker", prompt: "Resume work", location: null },
|
||||
},
|
||||
$START: { _: { role: "worker", prompt: "Start work", location: null } },
|
||||
worker: {
|
||||
needs_input: {
|
||||
role: "$SUSPEND",
|
||||
@@ -165,10 +162,7 @@ describe("suspended thread display", () => {
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "Start work", location: null },
|
||||
resume: { role: "worker", prompt: "Resume work", location: null },
|
||||
},
|
||||
$START: { _: { role: "worker", prompt: "Start work", location: null } },
|
||||
worker: {
|
||||
needs_input: {
|
||||
role: "$SUSPEND",
|
||||
@@ -254,10 +248,7 @@ describe("suspended thread display", () => {
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "Start work", location: null },
|
||||
resume: { role: "worker", prompt: "Resume work", location: null },
|
||||
},
|
||||
$START: { _: { role: "worker", prompt: "Start work", location: null } },
|
||||
},
|
||||
});
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ import {
|
||||
THREAD_READ_DEFAULT_QUOTA,
|
||||
} from "../commands/thread.js";
|
||||
import type { UwfStore } from "../store.js";
|
||||
import { completeThread, createUwfStore, setThread } from "../store.js";
|
||||
import { addHistoryEntry, createUwfStore } from "../store.js";
|
||||
import { seedThreads } from "./thread-test-helpers.js";
|
||||
|
||||
// ── schemas used in tests ────────────────────────────────────────────────────
|
||||
@@ -745,14 +745,13 @@ describe("cmdStepList with completed threads", () => {
|
||||
const threadId = "01JTEST0000000000000000A2" as ThreadId;
|
||||
// Thread is NOT in active index (simulating completed thread)
|
||||
// But it IS in history variable store
|
||||
setThread(uwf.varStore, threadId, {
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow: workflowHash,
|
||||
head: step2Hash,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
completedAt: Date.now(),
|
||||
reason: null,
|
||||
});
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
|
||||
const result = await cmdStepList(tmpDir, threadId);
|
||||
|
||||
@@ -873,15 +872,14 @@ describe("cmdStepShow with completed threads", () => {
|
||||
|
||||
const threadId = "01JTEST0000000000000000B2" as ThreadId;
|
||||
// Thread is NOT in active index
|
||||
// But it IS in the unified store with completed status
|
||||
setThread(uwf.varStore, threadId, {
|
||||
// But it IS in history variable store
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow: workflowHash,
|
||||
head: stepHash,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
completedAt: Date.now(),
|
||||
reason: null,
|
||||
});
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
|
||||
const result = await cmdStepShow(tmpDir, stepHash);
|
||||
|
||||
@@ -936,15 +934,15 @@ describe("cmdThreadRead with completed threads", () => {
|
||||
});
|
||||
|
||||
const threadId = "01JTEST0000000000000000C1" as ThreadId;
|
||||
// Thread is in store with completed status
|
||||
setThread(uwf.varStore, threadId, {
|
||||
// Thread is NOT in active index
|
||||
// But it IS in history variable store
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow: workflowHash,
|
||||
head: stepHash,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
completedAt: Date.now(),
|
||||
reason: null,
|
||||
});
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
|
||||
const markdown = await cmdThreadRead(tmpDir, threadId, THREAD_READ_DEFAULT_QUOTA, null, false);
|
||||
|
||||
@@ -1000,14 +998,13 @@ describe("cmdThreadRead with completed threads", () => {
|
||||
});
|
||||
|
||||
const threadId = "01JTEST0000000000000000C2" as ThreadId;
|
||||
setThread(uwf.varStore, threadId, {
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow: workflowHash,
|
||||
head: step3Hash,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
completedAt: Date.now(),
|
||||
reason: null,
|
||||
});
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
|
||||
const markdown = await cmdThreadRead(
|
||||
tmpDir,
|
||||
|
||||
@@ -17,7 +17,7 @@ function makeWorkflow(overrides?: Partial<WorkflowPayload>): WorkflowPayload {
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: {
|
||||
$status: { const: "done" },
|
||||
$status: { enum: ["_"] },
|
||||
plan: { type: "string" },
|
||||
},
|
||||
required: ["$status", "plan"],
|
||||
@@ -51,11 +51,8 @@ function makeWorkflow(overrides?: Partial<WorkflowPayload>): WorkflowPayload {
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: {
|
||||
new: { role: "writer", prompt: "Begin writing", location: null },
|
||||
resume: { role: "writer", prompt: "Review previous output and continue", location: null },
|
||||
},
|
||||
writer: { done: { role: "reviewer", prompt: "Review this: {{{plan}}}", location: null } },
|
||||
$START: { _: { role: "writer", prompt: "Begin writing", location: null } },
|
||||
writer: { _: { role: "reviewer", prompt: "Review this: {{{plan}}}", location: null } },
|
||||
reviewer: {
|
||||
approved: { role: "$END", prompt: "Done: {{{summary}}}", location: null },
|
||||
rejected: { role: "writer", prompt: "Fix: {{{reason}}}", location: null },
|
||||
@@ -85,7 +82,7 @@ describe("Suite 1: Role Reference Integrity", () => {
|
||||
output: "None",
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: { $status: { const: "done" } },
|
||||
properties: { $status: { enum: ["_"] } },
|
||||
required: ["$status"],
|
||||
} as unknown as string,
|
||||
};
|
||||
@@ -138,38 +135,27 @@ describe("Suite 2: Graph Structure", () => {
|
||||
expect(errors.some((e) => e.includes("$START must be defined in graph"))).toBe(true);
|
||||
});
|
||||
|
||||
test("2.2 $START missing resume edge", () => {
|
||||
test("2.2 $START has multiple status keys", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.graph.$START = {
|
||||
new: { role: "writer", prompt: "Begin", location: null },
|
||||
_: { role: "writer", prompt: "Begin", location: null },
|
||||
other: { role: "reviewer", prompt: "Also", location: null },
|
||||
};
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(
|
||||
errors.some((e) => e.includes('$START must have edges with statuses "new" and "resume"')),
|
||||
errors.some((e) => e.includes('$START must have exactly one edge with status "_"')),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
test("2.3 $START missing new edge", () => {
|
||||
test("2.3 $START edge uses non-_ status", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.graph.$START = {
|
||||
resume: { role: "writer", prompt: "Resume", location: null },
|
||||
};
|
||||
wf.graph.$START = { ready: { role: "writer", prompt: "Begin", location: null } };
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(
|
||||
errors.some((e) => e.includes('$START must have edges with statuses "new" and "resume"')),
|
||||
errors.some((e) => e.includes('$START must have exactly one edge with status "_"')),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
test("2.3b $START with new and resume passes", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.graph.$START = {
|
||||
new: { role: "writer", prompt: "Begin", location: null },
|
||||
resume: { role: "writer", prompt: "Resume", location: null },
|
||||
};
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors.some((e) => e.includes("$START must have edges"))).toBe(false);
|
||||
});
|
||||
|
||||
test("2.4 $END has outgoing edges", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.graph.$END = { _: { role: "writer", prompt: "Loop", location: null } };
|
||||
@@ -187,11 +173,11 @@ describe("Suite 2: Graph Structure", () => {
|
||||
output: "Isolated",
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: { $status: { const: "done" } },
|
||||
properties: { $status: { enum: ["_"] } },
|
||||
required: ["$status"],
|
||||
} as unknown as string,
|
||||
};
|
||||
wf.graph.isolated = { done: { role: "$END", prompt: "done", location: null } };
|
||||
wf.graph.isolated = { _: { role: "$END", prompt: "done", location: null } };
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors.some((e) => e.includes('role "isolated" is not reachable from $START'))).toBe(
|
||||
true,
|
||||
@@ -200,37 +186,34 @@ describe("Suite 2: Graph Structure", () => {
|
||||
|
||||
test("2.6 edge target references invalid role", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.graph.writer = { done: { role: "ghost", prompt: "Go to ghost", location: null } };
|
||||
wf.graph.writer = { _: { role: "ghost", prompt: "Go to ghost", location: null } };
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors.some((e) => e.includes('unknown target role "ghost"'))).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe("Suite 3: Status-Edge Consistency", () => {
|
||||
test("3.1 user role using _ graph key is treated as an unknown status", () => {
|
||||
// "_" is no longer special-cased — it's just a status key that does not
|
||||
// match the role's $status enum, so it surfaces as extra/missing keys.
|
||||
test("3.1 single-exit role with multiple graph keys", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.graph.writer = { _: { role: "reviewer", prompt: "Review", location: null } };
|
||||
wf.graph.writer = {
|
||||
_: { role: "reviewer", prompt: "Review", location: null },
|
||||
extra: { role: "$END", prompt: "Done", location: null },
|
||||
};
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors.some((e) => e.includes('role "writer" graph has extra status keys: _'))).toBe(
|
||||
true,
|
||||
);
|
||||
expect(errors.some((e) => e.includes('role "writer" graph is missing status keys: done'))).toBe(
|
||||
true,
|
||||
);
|
||||
expect(
|
||||
errors.some((e) =>
|
||||
e.includes('role "writer" is single-exit but has status keys other than "_"'),
|
||||
),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
test("3.2 user role graph key not matching $status enum", () => {
|
||||
test("3.2 single-exit role missing _ key", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.graph.writer = { wrong: { role: "reviewer", prompt: "Review", location: null } };
|
||||
wf.graph.writer = { done: { role: "reviewer", prompt: "Review", location: null } };
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors.some((e) => e.includes('role "writer" graph has extra status keys: wrong'))).toBe(
|
||||
true,
|
||||
);
|
||||
expect(errors.some((e) => e.includes('role "writer" graph is missing status keys: done'))).toBe(
|
||||
true,
|
||||
);
|
||||
expect(
|
||||
errors.some((e) => e.includes('role "writer" is single-exit but graph has no "_" key')),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
test("3.3 multi-exit role with extra statuses", () => {
|
||||
@@ -257,23 +240,18 @@ describe("Suite 3: Status-Edge Consistency", () => {
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
test("3.5 multi-exit role with _ key is treated as an unknown status", () => {
|
||||
test("3.5 multi-exit role with _ key", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.graph.reviewer = { _: { role: "$END", prompt: "Done", location: null } };
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors.some((e) => e.includes('role "reviewer" graph has extra status keys: _'))).toBe(
|
||||
expect(errors.some((e) => e.includes('role "reviewer" is multi-exit but graph uses "_"'))).toBe(
|
||||
true,
|
||||
);
|
||||
expect(
|
||||
errors.some((e) =>
|
||||
e.includes('role "reviewer" graph is missing status keys: approved, rejected'),
|
||||
),
|
||||
).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe("Suite 3b: Enum-Based $status is Rejected", () => {
|
||||
test("3b.1 enum multi-exit is rejected (must use oneOf + const)", () => {
|
||||
describe("Suite 3b: Enum-Based Multi-Exit", () => {
|
||||
test("3b.1 enum multi-exit passes with matching graph keys", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.roles.reviewer = {
|
||||
...wf.roles.reviewer,
|
||||
@@ -291,102 +269,99 @@ describe("Suite 3b: Enum-Based $status is Rejected", () => {
|
||||
rejected: { role: "writer", prompt: "Fix: {{{comments}}}", location: null },
|
||||
};
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors.some((e) => e.includes("must define") && e.includes("const"))).toBe(true);
|
||||
expect(errors).toEqual([]);
|
||||
});
|
||||
|
||||
test("3b.2 enum single-exit is rejected (must use const)", () => {
|
||||
test("3b.2 enum multi-exit with extra graph key", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.roles.writer = {
|
||||
...wf.roles.writer,
|
||||
wf.roles.reviewer = {
|
||||
...wf.roles.reviewer,
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: {
|
||||
$status: { enum: ["ready"] },
|
||||
plan: { type: "string" },
|
||||
$status: { enum: ["approved", "rejected"] },
|
||||
comments: { type: "string" },
|
||||
},
|
||||
required: ["$status", "plan"],
|
||||
required: ["$status", "comments"],
|
||||
} as unknown as string,
|
||||
};
|
||||
wf.graph.writer = { ready: { role: "reviewer", prompt: "Review: {{{plan}}}", location: null } };
|
||||
wf.graph.reviewer = {
|
||||
approved: { role: "$END", prompt: "Done", location: null },
|
||||
rejected: { role: "writer", prompt: "Fix", location: null },
|
||||
timeout: { role: "$END", prompt: "Timed out", location: null },
|
||||
};
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors.some((e) => e.includes("must define") && e.includes("const"))).toBe(true);
|
||||
});
|
||||
expect(errors.some((e) => e.includes("extra status keys: timeout"))).toBe(true);
|
||||
});
|
||||
|
||||
describe("Suite 3c: Const-Based Flat Schema", () => {
|
||||
test("3c.1 flat schema with const $status passes validation", () => {
|
||||
test("3b.3 enum multi-exit with missing graph key", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.roles.reviewer = {
|
||||
...wf.roles.reviewer,
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: {
|
||||
$status: { enum: ["approved", "rejected"] },
|
||||
comments: { type: "string" },
|
||||
},
|
||||
required: ["$status", "comments"],
|
||||
} as unknown as string,
|
||||
};
|
||||
wf.graph.reviewer = {
|
||||
approved: { role: "$END", prompt: "Done", location: null },
|
||||
};
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors.some((e) => e.includes("missing status keys: rejected"))).toBe(true);
|
||||
});
|
||||
|
||||
test("3b.4 enum with single value (not multi-exit) treated as single-exit", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.roles.writer = {
|
||||
...wf.roles.writer,
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: {
|
||||
$status: { const: "done" },
|
||||
$status: { enum: ["_"] },
|
||||
plan: { type: "string" },
|
||||
},
|
||||
required: ["$status", "plan"],
|
||||
} as unknown as string,
|
||||
};
|
||||
wf.graph.writer = { _: { role: "reviewer", prompt: "Review: {{{plan}}}", location: null } };
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors).toEqual([]);
|
||||
});
|
||||
|
||||
test("3c.2 flat schema with const $status detects extra graph key", () => {
|
||||
test("3b.5 enum multi-exit mustache var not in frontmatter", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.roles.writer = {
|
||||
...wf.roles.writer,
|
||||
wf.roles.reviewer = {
|
||||
...wf.roles.reviewer,
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: {
|
||||
$status: { const: "done" },
|
||||
plan: { type: "string" },
|
||||
$status: { enum: ["approved", "rejected"] },
|
||||
comments: { type: "string" },
|
||||
},
|
||||
required: ["$status", "plan"],
|
||||
required: ["$status", "comments"],
|
||||
} as unknown as string,
|
||||
};
|
||||
wf.graph.writer = {
|
||||
done: { role: "reviewer", prompt: "Review.", location: null },
|
||||
extra: { role: "$END", prompt: "Nope.", location: null },
|
||||
wf.graph.reviewer = {
|
||||
approved: { role: "$END", prompt: "Done: {{{nonexistent}}}", location: null },
|
||||
rejected: { role: "writer", prompt: "Fix: {{{comments}}}", location: null },
|
||||
};
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors.some((e) => e.includes("extra status keys") && e.includes("extra"))).toBe(true);
|
||||
});
|
||||
|
||||
test("3c.3 flat schema with const $status validates mustache vars", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.roles.writer = {
|
||||
...wf.roles.writer,
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: {
|
||||
$status: { const: "done" },
|
||||
plan: { type: "string" },
|
||||
},
|
||||
required: ["$status", "plan"],
|
||||
} as unknown as string,
|
||||
};
|
||||
wf.graph.writer = {
|
||||
done: { role: "reviewer", prompt: "Review: {{{nonexistent}}}", location: null },
|
||||
};
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(
|
||||
errors.some(
|
||||
(e) => e.includes('prompt variable "nonexistent"') && e.includes('role "writer"'),
|
||||
),
|
||||
).toBe(true);
|
||||
expect(errors.some((e) => e.includes("nonexistent") && e.includes("not found"))).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe("Suite 4: Mustache Template Variable Existence", () => {
|
||||
test("4.1 prompt references nonexistent variable (enum status)", () => {
|
||||
test("4.1 prompt references nonexistent variable (single-exit)", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.graph.writer = {
|
||||
done: { role: "reviewer", prompt: "Review: {{{branch}}}", location: null },
|
||||
};
|
||||
wf.graph.writer = { _: { role: "reviewer", prompt: "Review: {{{branch}}}", location: null } };
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(
|
||||
errors.some(
|
||||
(e) => e.includes('prompt variable "branch"') && e.includes('role "writer" frontmatter'),
|
||||
errors.some((e) =>
|
||||
e.includes('prompt variable "branch" not found in role "writer" frontmatter'),
|
||||
),
|
||||
).toBe(true);
|
||||
});
|
||||
@@ -413,7 +388,7 @@ describe("Suite 4: Mustache Template Variable Existence", () => {
|
||||
|
||||
test("4.4 $status variable is always valid", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.graph.writer = { done: { role: "reviewer", prompt: "Status: {{$status}}", location: null } };
|
||||
wf.graph.writer = { _: { role: "reviewer", prompt: "Status: {{$status}}", location: null } };
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors).toEqual([]);
|
||||
});
|
||||
@@ -481,14 +456,14 @@ describe("Suite 6: Multiple Errors Collection", () => {
|
||||
output: "None",
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: { $status: { const: "done" } },
|
||||
properties: { $status: { enum: ["_"] } },
|
||||
required: ["$status"],
|
||||
} as unknown as string,
|
||||
};
|
||||
// unknown graph reference
|
||||
wf.graph.nonexistent = { done: { role: "$END", prompt: "done", location: null } };
|
||||
wf.graph.nonexistent = { _: { role: "$END", prompt: "done", location: null } };
|
||||
// bad mustache var
|
||||
wf.graph.writer = { done: { role: "reviewer", prompt: "{{{badvar}}}", location: null } };
|
||||
wf.graph.writer = { _: { role: "reviewer", prompt: "{{{badvar}}}", location: null } };
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors.length).toBeGreaterThanOrEqual(3);
|
||||
});
|
||||
|
||||
@@ -31,18 +31,15 @@ function makeMinimalPayload(name: string, description: string): WorkflowPayload
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: {
|
||||
$status: { const: "done" },
|
||||
$status: { type: "string" },
|
||||
},
|
||||
required: ["$status"],
|
||||
} as unknown as CasRef,
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "start working", location: null },
|
||||
resume: { role: "worker", prompt: "resume working", location: null },
|
||||
},
|
||||
worker: { done: { role: "$END", prompt: "done", location: null } },
|
||||
$START: { _: { role: "worker", prompt: "start working", location: null } },
|
||||
worker: { _: { role: "$END", prompt: "done", location: null } },
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
+48
-28
@@ -1,17 +1,20 @@
|
||||
#!/usr/bin/env -S node --disable-warning=ExperimentalWarning
|
||||
#!/usr/bin/env node
|
||||
|
||||
import type { CasRef, ThreadId, ThreadStatus } from "@united-workforce/protocol";
|
||||
import { Command } from "commander";
|
||||
import { cmdConfigGet, cmdConfigList, cmdConfigSet } from "./commands/config.js";
|
||||
import { cmdLogClean, cmdLogList, cmdLogShow } from "./commands/log.js";
|
||||
import {
|
||||
cmdPromptAdapterDeveloping,
|
||||
cmdPromptAdapter,
|
||||
cmdPromptAuthor,
|
||||
cmdPromptBootstrap,
|
||||
cmdPromptDeveloper,
|
||||
cmdPromptList,
|
||||
cmdPromptSetup,
|
||||
cmdPromptUsage,
|
||||
cmdPromptWorkflowAuthoring,
|
||||
cmdPromptUser,
|
||||
} from "./commands/prompt.js";
|
||||
import { cmdSetup, cmdSetupInteractive, resolvePresetBaseUrl } from "./commands/setup.js";
|
||||
import { cmdSetup, cmdSetupInteractive } from "./commands/setup.js";
|
||||
import { cmdStepFork, cmdStepList, cmdStepRead, cmdStepShow } from "./commands/step.js";
|
||||
import {
|
||||
cmdThreadCancel,
|
||||
@@ -507,32 +510,53 @@ prompt.addHelpCommand(false);
|
||||
|
||||
prompt
|
||||
.command("usage")
|
||||
.description("Print the usage reference (CLI guide + typical workflows)")
|
||||
.description("Print the complete skill content (all references combined)")
|
||||
.action(() => {
|
||||
console.log(cmdPromptUsage());
|
||||
});
|
||||
|
||||
prompt
|
||||
.command("setup")
|
||||
.description("Print setup instructions for installing the uwf skill")
|
||||
.action(() => {
|
||||
console.log(cmdPromptSetup());
|
||||
});
|
||||
|
||||
prompt
|
||||
.command("adapter")
|
||||
.description("Print the adapter reference (building agent adapters)")
|
||||
.action(() => {
|
||||
console.log(cmdPromptAdapter());
|
||||
});
|
||||
|
||||
prompt
|
||||
.command("author")
|
||||
.description("Print the author reference (workflow YAML design guide)")
|
||||
.action(() => {
|
||||
console.log(cmdPromptAuthor());
|
||||
});
|
||||
|
||||
prompt
|
||||
.command("developer")
|
||||
.description("Print the developer reference (coding conventions + architecture)")
|
||||
.action(() => {
|
||||
console.log(cmdPromptDeveloper());
|
||||
});
|
||||
|
||||
prompt
|
||||
.command("user")
|
||||
.description("Print the user reference (CLI guide + typical workflows)")
|
||||
.action(() => {
|
||||
console.log(cmdPromptUser());
|
||||
});
|
||||
|
||||
prompt
|
||||
.command("bootstrap")
|
||||
.description("Print setup instructions for installing uwf skills")
|
||||
.description("Print the bootstrap skill YAML for Hermes agents")
|
||||
.action(() => {
|
||||
console.log(cmdPromptBootstrap());
|
||||
});
|
||||
|
||||
prompt
|
||||
.command("workflow-authoring")
|
||||
.description("Print the workflow authoring reference (YAML design guide)")
|
||||
.action(() => {
|
||||
console.log(cmdPromptWorkflowAuthoring());
|
||||
});
|
||||
|
||||
prompt
|
||||
.command("adapter-developing")
|
||||
.description("Print the adapter developing reference (building agent adapters)")
|
||||
.action(() => {
|
||||
console.log(cmdPromptAdapterDeveloping());
|
||||
});
|
||||
|
||||
prompt
|
||||
.command("list")
|
||||
.description("List all available prompt names")
|
||||
@@ -542,7 +566,7 @@ prompt
|
||||
|
||||
program
|
||||
.command("setup")
|
||||
.description("Configure provider, model, and agent. Run without options for interactive wizard.")
|
||||
.description("Configure provider, model, and agent")
|
||||
.option("--provider <name>", "Provider name")
|
||||
.option("--base-url <url>", "OpenAI-compatible API base URL")
|
||||
.option("--api-key <key>", "API key")
|
||||
@@ -558,14 +582,10 @@ program
|
||||
}) => {
|
||||
const storageRoot = resolveStorageRoot();
|
||||
runAction(async () => {
|
||||
// Resolve preset base-url when provider is known but --base-url is omitted
|
||||
const resolvedBaseUrl =
|
||||
opts.baseUrl ??
|
||||
(opts.provider !== undefined ? resolvePresetBaseUrl(opts.provider) : null);
|
||||
if (opts.provider && resolvedBaseUrl && opts.apiKey && opts.model) {
|
||||
if (opts.provider && opts.baseUrl && opts.apiKey && opts.model) {
|
||||
const result = await cmdSetup({
|
||||
provider: opts.provider,
|
||||
baseUrl: resolvedBaseUrl,
|
||||
baseUrl: opts.baseUrl,
|
||||
apiKey: opts.apiKey,
|
||||
model: opts.model,
|
||||
agent: opts.agent ?? undefined,
|
||||
@@ -576,7 +596,7 @@ program
|
||||
await cmdSetupInteractive(storageRoot);
|
||||
} else {
|
||||
throw new Error(
|
||||
"Non-interactive setup requires: --provider, --api-key, --model (--base-url is optional for preset providers)",
|
||||
"Non-interactive setup requires all of: --provider, --base-url, --api-key, --model",
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
@@ -1,330 +1,101 @@
|
||||
import { readFileSync } from "node:fs";
|
||||
import { dirname, join } from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import {
|
||||
generateAdapterDevelopingReference,
|
||||
generateUsageReference,
|
||||
generateWorkflowAuthoringReference,
|
||||
generateAdapterReference,
|
||||
generateAuthorReference,
|
||||
generateBootstrapReference,
|
||||
generateDeveloperReference,
|
||||
generateUserReference,
|
||||
} from "@united-workforce/util";
|
||||
|
||||
// CLI package version (for bootstrap prompt — uwf --version prints this)
|
||||
// Walk up from __dirname to find the nearest package.json (works from both src/ and dist/)
|
||||
function _findCliVersion(): string {
|
||||
let dir = dirname(fileURLToPath(import.meta.url));
|
||||
for (let i = 0; i < 5; i++) {
|
||||
const candidate = join(dir, "package.json");
|
||||
try {
|
||||
const pkg = JSON.parse(readFileSync(candidate, "utf-8")) as {
|
||||
name?: string;
|
||||
version?: string;
|
||||
};
|
||||
if (pkg.name === "@united-workforce/cli") {
|
||||
return pkg.version ?? "0.0.0";
|
||||
}
|
||||
} catch {
|
||||
// not found, keep walking
|
||||
}
|
||||
dir = dirname(dir);
|
||||
}
|
||||
return "0.0.0";
|
||||
}
|
||||
const CLI_VERSION = _findCliVersion();
|
||||
|
||||
export {
|
||||
generateAdapterDevelopingReference as cmdPromptAdapterDeveloping,
|
||||
generateUsageReference as cmdPromptUsage,
|
||||
generateWorkflowAuthoringReference as cmdPromptWorkflowAuthoring,
|
||||
generateAdapterReference as cmdPromptAdapter,
|
||||
generateAuthorReference as cmdPromptAuthor,
|
||||
generateBootstrapReference as cmdPromptBootstrap,
|
||||
generateDeveloperReference as cmdPromptDeveloper,
|
||||
generateUserReference as cmdPromptUser,
|
||||
};
|
||||
|
||||
const PROMPT_ENTRIES: ReadonlyArray<{ name: string; generate: () => string }> = [
|
||||
{ name: "usage", generate: generateUsageReference },
|
||||
{ name: "workflow-authoring", generate: generateWorkflowAuthoringReference },
|
||||
{ name: "adapter-developing", generate: generateAdapterDevelopingReference },
|
||||
{ name: "user", generate: generateUserReference },
|
||||
{ name: "author", generate: generateAuthorReference },
|
||||
{ name: "developer", generate: generateDeveloperReference },
|
||||
{ name: "adapter", generate: generateAdapterReference },
|
||||
{ name: "bootstrap", generate: generateBootstrapReference },
|
||||
];
|
||||
|
||||
export function cmdPromptList(): ReadonlyArray<string> {
|
||||
return PROMPT_ENTRIES.map((e) => e.name);
|
||||
}
|
||||
|
||||
export function cmdPromptBootstrap(): string {
|
||||
return `# uwf Bootstrap
|
||||
export function cmdPromptUsage(): string {
|
||||
return PROMPT_ENTRIES.map((e) => e.generate()).join("\n\n---\n\n");
|
||||
}
|
||||
|
||||
Set up or upgrade uwf (United Workforce) — from zero to running your first workflow.
|
||||
export function cmdPromptSetup(): string {
|
||||
return `# uwf Skill Setup
|
||||
|
||||
## Scenario A: Fresh Install
|
||||
You are being asked to install or update the uwf (United Workforce) skill
|
||||
so that you know how to use the \`uwf\` CLI for workflow orchestration.
|
||||
|
||||
### Step 0 — Environment pre-flight check
|
||||
|
||||
Run ALL checks below BEFORE installing anything. If any check fails, fix it first — do NOT skip ahead.
|
||||
## Steps
|
||||
|
||||
1. **Check if uwf CLI is installed:**
|
||||
\`\`\`bash
|
||||
# 1. Node.js
|
||||
node --version # need v20+
|
||||
# FIX: install via nvm (https://github.com/nvm-sh/nvm) or your OS package manager
|
||||
uwf --help
|
||||
\`\`\`
|
||||
If not installed: \`bun add -g @united-workforce/cli\`
|
||||
|
||||
# 2. Package manager — pnpm (preferred) or npm
|
||||
pnpm --version || npm --version
|
||||
# FIX for pnpm: corepack enable pnpm (Node 20+)
|
||||
# or: npm install -g pnpm
|
||||
# If pnpm is not an option, npm works too — adjust install commands below
|
||||
2. **Clean up old versions of the skill** (if any exist):
|
||||
- Look for any existing uwf/workflow skill files in your skill directories
|
||||
(e.g. \`~/.hermes/skills/uwf/SKILL.md\` or similar paths).
|
||||
- If a skill exists, read its \`version\` from the YAML frontmatter.
|
||||
- If the version matches the current CLI version (\`uwf --version\`), you're done — no update needed.
|
||||
- If the version is older, **delete the old skill file** before proceeding.
|
||||
|
||||
# 3. Global bin directory is in PATH
|
||||
# For pnpm:
|
||||
pnpm bin -g # prints the global bin directory
|
||||
# For npm:
|
||||
npm prefix -g # global prefix; bin is <prefix>/bin
|
||||
# The directory printed above MUST be in your PATH.
|
||||
# Test: echo \\$PATH | tr ':' '\\\\n'
|
||||
# FIX: add the missing directory to your shell's startup file
|
||||
# (e.g. ~/.bashrc, ~/.zshrc, ~/.profile, or fish config):
|
||||
# export PATH="<global-bin-dir>:$PATH"
|
||||
3. **Get the usage skill content:**
|
||||
\`\`\`bash
|
||||
uwf prompt usage
|
||||
\`\`\`
|
||||
|
||||
**All checks must pass before continuing.** If you had to modify PATH, verify the change persists by opening a new shell or sourcing your shell config.
|
||||
4. **Create a SKILL.md file** in your skill directory (e.g. \`~/.hermes/skills/uwf/SKILL.md\`).
|
||||
The output of \`uwf prompt usage\` is the skill body WITHOUT frontmatter.
|
||||
You must prepend the following YAML frontmatter:
|
||||
|
||||
### Step 1 — Discover agents and install adapter
|
||||
|
||||
**First, detect which supported agents are already installed on the user's machine:**
|
||||
|
||||
\`\`\`bash
|
||||
# Check for Hermes Agent
|
||||
which hermes 2>/dev/null && hermes --version
|
||||
|
||||
# Check for Claude Code
|
||||
which claude 2>/dev/null && claude --version # should show "X.Y.Z (Claude Code)"
|
||||
\`\`\`
|
||||
|
||||
**Based on the results:**
|
||||
|
||||
- **Only hermes found** → install \`uwf-hermes\` adapter
|
||||
- **Only claude found** → install \`uwf-claude-code\` adapter
|
||||
- **Both found** → ask the user which agent they want uwf to use as default
|
||||
- **Neither found** → the user must install at least one agent first:
|
||||
- Hermes Agent: https://hermes-agent.nousresearch.com/docs
|
||||
- Claude Code: \`npm install -g @anthropic-ai/claude-code\`
|
||||
|
||||
**Install the uwf CLI and the chosen adapter** using pnpm or npm:
|
||||
|
||||
\`\`\`bash
|
||||
# CLI (required)
|
||||
pnpm add -g @united-workforce/cli # or: npm install -g @united-workforce/cli
|
||||
|
||||
# Adapter — install the one matching the detected agent:
|
||||
pnpm add -g @united-workforce/agent-hermes # or: npm i -g @united-workforce/agent-hermes
|
||||
pnpm add -g @united-workforce/agent-claude-code # or: npm i -g @united-workforce/agent-claude-code
|
||||
\`\`\`
|
||||
|
||||
**⚠ Adapter versions are independent from CLI versions.** Do NOT try to match adapter version to CLI version. Just install \`@latest\` (the default).
|
||||
|
||||
**After installing, verify that \`uwf\` and the adapter are available in PATH:**
|
||||
|
||||
\`\`\`bash
|
||||
uwf --version # should print ${CLI_VERSION}
|
||||
uwf-hermes --version # or: uwf-claude-code --version
|
||||
\`\`\`
|
||||
|
||||
If either command is not found, the global bin directory is not in the current shell's PATH. **You must fix this before continuing:**
|
||||
|
||||
1. Find where the binary was installed:
|
||||
\`\`\`bash
|
||||
find ~/.local ~/.hermes /usr/local -name uwf -type f 2>/dev/null
|
||||
npm prefix -g # global prefix — bin is <prefix>/bin
|
||||
\`\`\`
|
||||
2. Add the directory to PATH permanently by appending to the user's shell config (e.g. \`~/.bashrc\`, \`~/.zshrc\`, \`~/.profile\`, or fish config):
|
||||
\`\`\`bash
|
||||
export PATH="<global-bin-dir>:$PATH"
|
||||
\`\`\`
|
||||
3. Source the updated config or open a new shell, then re-verify the commands work.
|
||||
|
||||
**uwf-hermes** also requires the Hermes ACP plugin. Verify with \`hermes acp --help\`. If not available, install it:
|
||||
\`\`\`bash
|
||||
# Option A: install into hermes venv (recommended)
|
||||
source ~/.hermes/hermes-agent/.venv/bin/activate && pip install 'hermes-agent[acp]'
|
||||
|
||||
# Option B: pipx
|
||||
pipx install 'hermes-agent[acp]'
|
||||
|
||||
# Option C: if installed from source
|
||||
pip install -e '.[acp]'
|
||||
\`\`\`
|
||||
|
||||
### Step 2 — Configure provider and model
|
||||
|
||||
uwf needs an LLM provider to run agents. **Ask the user** for their provider, API key, and model, then run:
|
||||
|
||||
\`\`\`bash
|
||||
uwf setup --provider <name> --api-key <key> --model <model> --agent <adapter-command>
|
||||
\`\`\`
|
||||
|
||||
**Note:** \`--agent\` takes the adapter **command name** (e.g. \`uwf-hermes\`), not the npm package name.
|
||||
|
||||
**Preset providers** — when using a preset name, \`--base-url\` is auto-filled and can be omitted:
|
||||
|
||||
| Provider | Name | Default base URL |
|
||||
|----------|------|-----------------|
|
||||
| OpenAI | \`openai\` | https://api.openai.com/v1 |
|
||||
| xAI | \`xai\` | https://api.x.ai/v1 |
|
||||
| OpenRouter | \`openrouter\` | https://openrouter.ai/api/v1 |
|
||||
| Venice | \`venice\` | https://api.venice.ai/api/v1 |
|
||||
| Dashscope | \`dashscope\` | https://dashscope.aliyuncs.com/compatible-mode/v1 |
|
||||
| DeepSeek | \`deepseek\` | https://api.deepseek.com/v1 |
|
||||
| SiliconFlow | \`siliconflow\` | https://api.siliconflow.cn/v1 |
|
||||
| VolcEngine | \`volcengine\` | https://ark.cn-beijing.volces.com/api/v3 |
|
||||
| Kimi (Moonshot) | \`kimi\` | https://api.moonshot.cn/v1 |
|
||||
| GLM (Zhipu AI) | \`glm\` | https://open.bigmodel.cn/api/paas/v4 |
|
||||
| StepFun | \`stepfun\` | https://api.stepfun.com/v1 |
|
||||
| MiniMax | \`minimax\` | https://api.minimax.io/v1 |
|
||||
| Ollama (local) | \`ollama\` | http://localhost:11434/v1 |
|
||||
|
||||
For **non-preset providers**, you must specify \`--base-url\` manually.
|
||||
|
||||
Example:
|
||||
\`\`\`bash
|
||||
uwf setup --provider openrouter --api-key sk-or-... --model anthropic/claude-sonnet-4 --agent uwf-hermes
|
||||
\`\`\`
|
||||
|
||||
If the user doesn't know what to choose, suggest \`openrouter\` with \`anthropic/claude-sonnet-4\` as a sensible default.
|
||||
|
||||
Config is saved to \`~/.uwf/config.yaml\`. Verify with \`cat ~/.uwf/config.yaml\`.
|
||||
|
||||
### Step 3 — Install skills
|
||||
|
||||
Run each command and create a skill from its output:
|
||||
|
||||
\`\`\`bash
|
||||
uwf prompt usage # → save as skill "uwf-usage"
|
||||
uwf prompt workflow-authoring # → save as skill "uwf-workflow-authoring"
|
||||
uwf prompt adapter-developing # → save as skill "uwf-adapter-developing"
|
||||
\`\`\`
|
||||
|
||||
Each command outputs a complete SKILL.md with YAML frontmatter. Use your agent framework's skill creation API to save them (e.g. \`skill_manage(action='create', name='uwf-usage', content=<output>)\`).
|
||||
|
||||
Verify skills are installed by listing them (e.g. \`skills_list()\`) and confirming all three appear.
|
||||
|
||||
**⚠ After saving all skills, start a new session** so the agent loads the updated skill content. Skills saved in the current session are not active until the next session.
|
||||
|
||||
### Step 4 — Verify end-to-end
|
||||
|
||||
Create a minimal workflow file to test your setup:
|
||||
|
||||
\`\`\`bash
|
||||
cat > /tmp/hello.yaml << 'YAML'
|
||||
name: hello
|
||||
description: Minimal smoke test
|
||||
roles:
|
||||
greeter:
|
||||
description: "Greet the user"
|
||||
goal: "Respond with a friendly greeting"
|
||||
capabilities: []
|
||||
procedure: "Write a short greeting based on the prompt."
|
||||
output: "A greeting message."
|
||||
frontmatter:
|
||||
type: object
|
||||
properties:
|
||||
$status: { const: done }
|
||||
message: { type: string }
|
||||
required: [$status, message]
|
||||
graph:
|
||||
$START:
|
||||
new: { role: greeter, prompt: "Say hello to the user." }
|
||||
resume: { role: greeter, prompt: "Greet the user again." }
|
||||
greeter:
|
||||
done: { role: "$END", prompt: "Done." }
|
||||
YAML
|
||||
\`\`\`
|
||||
|
||||
Then run:
|
||||
|
||||
\`\`\`bash
|
||||
uwf thread start /tmp/hello.yaml -p "Hello, world!"
|
||||
uwf thread exec <thread-id>
|
||||
uwf thread show <thread-id>
|
||||
\`\`\`
|
||||
|
||||
If the thread reaches \`$END\` with status \`completed\`, the setup is working.
|
||||
|
||||
## Scenario B: Upgrade from Previous Version
|
||||
|
||||
### Step 1 — Update packages
|
||||
|
||||
\`\`\`bash
|
||||
# Using pnpm
|
||||
pnpm add -g @united-workforce/cli@latest
|
||||
|
||||
# Using npm
|
||||
npm install -g @united-workforce/cli@latest
|
||||
\`\`\`
|
||||
|
||||
\`\`\`bash
|
||||
uwf --version # should print ${CLI_VERSION}
|
||||
\`\`\`
|
||||
|
||||
Also update your adapter(s):
|
||||
|
||||
\`\`\`bash
|
||||
# pnpm
|
||||
pnpm add -g @united-workforce/agent-hermes@latest
|
||||
|
||||
# npm
|
||||
npm install -g @united-workforce/agent-hermes@latest
|
||||
\`\`\`
|
||||
|
||||
### Step 2 — Regenerate skills
|
||||
|
||||
Skill content is bundled with the CLI — always regenerate after upgrading:
|
||||
|
||||
\`\`\`bash
|
||||
uwf prompt usage # → update skill "uwf-usage"
|
||||
uwf prompt workflow-authoring # → update skill "uwf-workflow-authoring"
|
||||
uwf prompt adapter-developing # → update skill "uwf-adapter-developing"
|
||||
\`\`\`
|
||||
|
||||
**⚠ After updating skills, start a new session** to load the new skill content.
|
||||
|
||||
### Step 3 — Migrate workflow YAML files (if needed)
|
||||
|
||||
Check the changelog for breaking changes. Known migrations:
|
||||
|
||||
- **v0.2.0**: \`$START._\` → \`$START.new\` + \`$START.resume\`. All workflow YAML files must be updated:
|
||||
\`\`\`yaml
|
||||
# Before (v0.1.x)
|
||||
$START:
|
||||
_: { role: planner, prompt: "..." }
|
||||
|
||||
# After (v0.2.0+)
|
||||
$START:
|
||||
new: { role: planner, prompt: "..." }
|
||||
resume: { role: planner, prompt: "Review previous run and continue." }
|
||||
---
|
||||
name: uwf
|
||||
description: "Use when orchestrating multi-agent workflows with uwf (United Workforce). Covers CLI usage, workflow YAML authoring, moderator logic, and agent development."
|
||||
version: <VERSION>
|
||||
author: United Workforce
|
||||
license: MIT
|
||||
metadata:
|
||||
hermes:
|
||||
tags: [workflow, agents, orchestration, cli, yaml]
|
||||
---
|
||||
\`\`\`
|
||||
|
||||
Update all \`.workflow/\` and \`.workflows/\` YAML files in your projects. \`uwf workflow add\` will reject files with the old \`_\` syntax.
|
||||
Replace \`<VERSION>\` with the installed CLI version (from \`uwf --version\`).
|
||||
|
||||
- **v0.2.1**: \`$status: { enum: [value] }\` → \`$status: { const: "value" }\`. The validator no longer accepts \`enum\` for \`$status\`. Update all workflow YAML files:
|
||||
\`\`\`yaml
|
||||
# Before (v0.2.0)
|
||||
$status: { enum: [done] }
|
||||
$status: { type: string, enum: ["ready", "failed"] }
|
||||
5. **Verify** the skill is loadable by your agent framework.
|
||||
|
||||
# After (v0.2.1+)
|
||||
$status: { const: "done" }
|
||||
# For multi-exit, use oneOf with const (unchanged)
|
||||
\`\`\`
|
||||
## Individual prompts
|
||||
|
||||
### Step 4 — Verify
|
||||
|
||||
\`\`\`bash
|
||||
uwf thread start <your-workflow> -p "upgrade test"
|
||||
uwf thread exec <thread-id>
|
||||
\`\`\`
|
||||
|
||||
## Available prompts
|
||||
You can also get individual reference sections:
|
||||
|
||||
\`\`\`bash
|
||||
uwf prompt list # list available prompt names
|
||||
uwf prompt usage # CLI usage guide
|
||||
uwf prompt workflow-authoring # workflow YAML design guide
|
||||
uwf prompt adapter-developing # building agent adapters
|
||||
uwf prompt bootstrap # this guide
|
||||
uwf prompt user # user reference (CLI guide + typical workflows)
|
||||
uwf prompt author # author reference (workflow YAML design guide)
|
||||
uwf prompt developer # developer reference (coding conventions + architecture)
|
||||
uwf prompt adapter # adapter reference (building agent adapters)
|
||||
uwf prompt bootstrap # bootstrap skill YAML for Hermes agents
|
||||
\`\`\`
|
||||
|
||||
## Notes
|
||||
|
||||
- The skill content is bundled with the CLI and versioned with it — always use
|
||||
\`uwf prompt usage\` to get the content matching your installed version.
|
||||
- Do NOT hand-edit the skill body. If the CLI is updated, re-run \`uwf prompt setup\`
|
||||
and follow the steps again.
|
||||
- When upgrading, always delete the old skill first to avoid stale instructions.
|
||||
`;
|
||||
}
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
import { execFileSync } from "node:child_process";
|
||||
import { existsSync, mkdirSync, readdirSync, readFileSync, statSync, writeFileSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import { stdin as input, stdout as output } from "node:process";
|
||||
@@ -73,12 +72,6 @@ const PRESET_PROVIDERS = [
|
||||
{ name: "ollama", label: "Ollama (local)", baseUrl: "http://localhost:11434/v1" },
|
||||
] as const;
|
||||
|
||||
/** Look up the base URL for a preset provider name. Returns null if not a preset. */
|
||||
export function resolvePresetBaseUrl(providerName: string): string | null {
|
||||
const preset = PRESET_PROVIDERS.find((p) => p.name === providerName);
|
||||
return preset !== undefined ? preset.baseUrl : null;
|
||||
}
|
||||
|
||||
type SetupArgs = {
|
||||
provider: string;
|
||||
baseUrl: string;
|
||||
@@ -182,6 +175,7 @@ export async function _discoverAgents(): Promise<string[]> {
|
||||
|
||||
async function _tryWhichDiscovery(): Promise<string[] | null> {
|
||||
try {
|
||||
const { execFileSync } = await import("node:child_process");
|
||||
const text = execFileSync("which", ["-a", "uwf-hermes", "uwf-claude-code", "uwf-cursor"], {
|
||||
encoding: "utf-8",
|
||||
stdio: ["pipe", "pipe", "pipe"],
|
||||
@@ -397,37 +391,6 @@ function mergeConfig(existing: Record<string, unknown>, args: SetupArgs): Record
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if the configured adapter binary (and its dependencies) are in PATH.
|
||||
* Returns warnings array — empty means all good.
|
||||
*/
|
||||
export function _checkAdapterAvailability(agentName: string): string[] {
|
||||
const warnings: string[] = [];
|
||||
const binary = `uwf-${agentName}`;
|
||||
|
||||
try {
|
||||
execFileSync("which", [binary], { encoding: "utf8", stdio: ["pipe", "pipe", "pipe"] });
|
||||
} catch {
|
||||
warnings.push(
|
||||
`${binary} not found in PATH. Install it: pnpm add -g @united-workforce/agent-${agentName}`,
|
||||
);
|
||||
return warnings; // skip dependency check if adapter itself is missing
|
||||
}
|
||||
|
||||
// uwf-hermes depends on hermes CLI
|
||||
if (agentName === "hermes") {
|
||||
try {
|
||||
execFileSync("which", ["hermes"], { encoding: "utf8", stdio: ["pipe", "pipe", "pipe"] });
|
||||
} catch {
|
||||
warnings.push(
|
||||
'hermes CLI not found in PATH (required by uwf-hermes). Fix: export PATH="$HOME/.hermes/hermes-agent/.venv/bin:$PATH"',
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
return warnings;
|
||||
}
|
||||
|
||||
/**
|
||||
* Non-interactive setup. All required args provided via CLI flags.
|
||||
*/
|
||||
@@ -442,26 +405,15 @@ export async function cmdSetup(args: SetupArgs): Promise<Record<string, unknown>
|
||||
|
||||
writeFileSync(configPath, stringify(merged, { indent: 2 }), "utf8");
|
||||
|
||||
// Print config path to stderr (stdout is reserved for JSON output)
|
||||
console.error(`Config saved to ${configPath} ✓`);
|
||||
|
||||
// Validate model connectivity
|
||||
const validation = await validateModel(args.baseUrl, args.apiKey, args.model);
|
||||
|
||||
// Check adapter availability
|
||||
const agentName = _agentNameFromBinary(args.agent ?? "hermes");
|
||||
const adapterWarnings = _checkAdapterAvailability(agentName);
|
||||
for (const w of adapterWarnings) {
|
||||
console.error(`⚠ ${w}`);
|
||||
}
|
||||
|
||||
return {
|
||||
configPath,
|
||||
provider: args.provider,
|
||||
model: args.model,
|
||||
defaultAgent: merged.defaultAgent,
|
||||
validation,
|
||||
adapterWarnings,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ import type {
|
||||
StepNodePayload,
|
||||
ThreadId,
|
||||
} from "@united-workforce/protocol";
|
||||
import { createUwfStore, getThread, type UwfStore } from "../store.js";
|
||||
import { createUwfStore, findHistoryEntry, getThread, type UwfStore } from "../store.js";
|
||||
|
||||
type ChainState = {
|
||||
startHash: CasRef;
|
||||
@@ -207,6 +207,10 @@ async function resolveHeadHash(storageRoot: string, threadId: ThreadId): Promise
|
||||
if (entry !== null) {
|
||||
return entry.head;
|
||||
}
|
||||
const hist = findHistoryEntry(uwf.varStore, threadId);
|
||||
if (hist !== null) {
|
||||
return hist.head;
|
||||
}
|
||||
fail(`thread not found: ${threadId}`);
|
||||
}
|
||||
|
||||
|
||||
@@ -66,7 +66,6 @@ export async function cmdStepList(
|
||||
agent: item.payload.agent,
|
||||
timestamp: item.timestamp,
|
||||
durationMs: item.payload.completedAtMs - item.payload.startedAtMs,
|
||||
usage: item.payload.usage ?? null,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -115,10 +114,8 @@ export async function cmdStepFork(
|
||||
const newThreadId = generateUlid(Date.now()) as ThreadId;
|
||||
setThread(uwf.varStore, newThreadId, {
|
||||
head: stepHash,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
|
||||
return {
|
||||
|
||||
@@ -38,14 +38,17 @@ import { createMarker, deleteMarker, isThreadRunning } from "../background/index
|
||||
import { createIncludeTag } from "../include.js";
|
||||
import { evaluate, isSuspendResult } from "../moderator/index.js";
|
||||
import {
|
||||
completeThread,
|
||||
addHistoryEntry,
|
||||
createUwfStore,
|
||||
deleteThread,
|
||||
findHistoryEntry,
|
||||
getThread,
|
||||
loadActiveThreads,
|
||||
loadHistoryThreads,
|
||||
loadAllHistory,
|
||||
loadAllThreads,
|
||||
loadWorkflowRegistry,
|
||||
resolveWorkflowHash,
|
||||
setThread,
|
||||
type ThreadHistoryLine,
|
||||
type UwfStore,
|
||||
} from "../store.js";
|
||||
import { checkWorkflowFilenameConsistency, isCasRef, parseWorkflowPayload } from "../validate.js";
|
||||
@@ -482,35 +485,20 @@ export async function cmdThreadShow(
|
||||
): Promise<ThreadShowOutput> {
|
||||
const uwf = await createUwfStore(storageRoot);
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
if (entry === null) {
|
||||
fail(`thread not found: ${threadId}`);
|
||||
}
|
||||
|
||||
if (entry !== null) {
|
||||
const activeHead = entry.head;
|
||||
const workflow = resolveWorkflowFromHead(uwf, activeHead);
|
||||
if (workflow === null) {
|
||||
fail(`failed to resolve workflow from head: ${activeHead}`);
|
||||
}
|
||||
|
||||
// Determine if this is a completed/cancelled thread
|
||||
if (entry.status === "completed" || entry.status === "cancelled") {
|
||||
const hint = null;
|
||||
return {
|
||||
const status = await resolveActiveThreadStatus(
|
||||
storageRoot,
|
||||
threadId,
|
||||
uwf,
|
||||
activeHead,
|
||||
workflow,
|
||||
thread: threadId,
|
||||
head: activeHead,
|
||||
status: entry.status,
|
||||
currentRole: null,
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
done: true,
|
||||
background: null,
|
||||
hint,
|
||||
};
|
||||
}
|
||||
|
||||
// Active thread
|
||||
const status = await resolveActiveThreadStatus(storageRoot, threadId, uwf, activeHead, workflow);
|
||||
);
|
||||
const currentRole = resolveCurrentRole(uwf, activeHead, workflow);
|
||||
const suspendFields = resolveSuspendFieldsForShow(entry, status, uwf, activeHead, workflow);
|
||||
|
||||
@@ -533,6 +521,27 @@ export async function cmdThreadShow(
|
||||
};
|
||||
}
|
||||
|
||||
const hist = findHistoryEntry(uwf.varStore, threadId);
|
||||
if (hist !== null) {
|
||||
const status: ThreadStatus = hist.reason === "cancelled" ? "cancelled" : "completed";
|
||||
|
||||
return {
|
||||
workflow: hist.workflow,
|
||||
thread: threadId,
|
||||
head: hist.head,
|
||||
status,
|
||||
currentRole: null,
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
done: true,
|
||||
background: null,
|
||||
hint: null,
|
||||
};
|
||||
}
|
||||
|
||||
fail(`thread not found: ${threadId}`);
|
||||
}
|
||||
|
||||
export type ThreadListItemWithStatus = ThreadListItem & {
|
||||
status: ThreadStatus;
|
||||
currentRole: string | null;
|
||||
@@ -585,20 +594,19 @@ async function collectActiveThreads(
|
||||
}
|
||||
|
||||
function collectCompletedThreads(
|
||||
uwf: UwfStore,
|
||||
varStore: VarStore,
|
||||
activeIds: Set<ThreadId>,
|
||||
): ThreadListItemWithStatus[] {
|
||||
const items: ThreadListItemWithStatus[] = [];
|
||||
const history = loadHistoryThreads(uwf.varStore);
|
||||
const history = loadAllHistory(varStore);
|
||||
const seen = new Set<ThreadId>(); // Deduplication (issue #470)
|
||||
for (const [threadId, entry] of Object.entries(history)) {
|
||||
if (!activeIds.has(threadId as ThreadId) && !seen.has(threadId as ThreadId)) {
|
||||
seen.add(threadId as ThreadId);
|
||||
const status = entry.status;
|
||||
const workflow = resolveWorkflowFromHead(uwf, entry.head);
|
||||
for (const entry of history) {
|
||||
if (!activeIds.has(entry.thread) && !seen.has(entry.thread)) {
|
||||
seen.add(entry.thread);
|
||||
const status = entry.reason === "cancelled" ? "cancelled" : "completed";
|
||||
items.push({
|
||||
thread: threadId as ThreadId,
|
||||
workflow: workflow ?? "",
|
||||
thread: entry.thread,
|
||||
workflow: entry.workflow,
|
||||
head: entry.head,
|
||||
status,
|
||||
currentRole: null,
|
||||
@@ -651,7 +659,7 @@ export async function cmdThreadList(
|
||||
take: number | null,
|
||||
): Promise<ThreadListItemWithStatus[]> {
|
||||
const uwf = await createUwfStore(storageRoot);
|
||||
const index = loadActiveThreads(uwf.varStore);
|
||||
const index = loadAllThreads(uwf.varStore);
|
||||
|
||||
// Collect active threads
|
||||
let items = await collectActiveThreads(storageRoot, uwf, index);
|
||||
@@ -663,7 +671,7 @@ export async function cmdThreadList(
|
||||
statusFilter.includes("cancelled");
|
||||
if (includeCompleted) {
|
||||
const activeIds = new Set(items.map((i) => i.thread));
|
||||
const completedItems = collectCompletedThreads(uwf, activeIds);
|
||||
const completedItems = collectCompletedThreads(uwf.varStore, activeIds);
|
||||
items = items.concat(completedItems);
|
||||
}
|
||||
|
||||
@@ -911,7 +919,7 @@ function resolveEvaluateArgs(
|
||||
chain: ChainState,
|
||||
): { lastRole: string; lastOutput: EvaluateLastOutput } {
|
||||
if (chain.headIsStart) {
|
||||
return { lastRole: START_ROLE, lastOutput: { [STATUS_KEY]: "new" } };
|
||||
return { lastRole: START_ROLE, lastOutput: { [STATUS_KEY]: "_" } };
|
||||
}
|
||||
|
||||
const lastStep = chain.stepsNewestFirst[0];
|
||||
@@ -961,12 +969,6 @@ function resolveAgentConfig(
|
||||
agentOverride: string | null,
|
||||
): AgentConfig {
|
||||
if (agentOverride !== null) {
|
||||
// Try config alias first (e.g. "hermes" → config.agents.hermes),
|
||||
// then fall back to raw command name (e.g. "uwf-hermes" or "/usr/bin/agent").
|
||||
const fromAlias = config.agents[agentOverride as AgentAlias];
|
||||
if (fromAlias !== undefined) {
|
||||
return fromAlias;
|
||||
}
|
||||
return parseAgentOverride(agentOverride);
|
||||
}
|
||||
|
||||
@@ -1004,12 +1006,6 @@ function spawnAgent(
|
||||
});
|
||||
} catch (e) {
|
||||
const err = e as NodeJS.ErrnoException & { stderr?: Buffer | string | null };
|
||||
if (err.code === "ENOENT") {
|
||||
failStep(
|
||||
plog,
|
||||
`"${agent.command}" not found in PATH. Install it or check your PATH config. Run: which ${agent.command}`,
|
||||
);
|
||||
}
|
||||
const stderr =
|
||||
err.stderr == null
|
||||
? ""
|
||||
@@ -1039,8 +1035,15 @@ function spawnAgent(
|
||||
return obj as unknown as AdapterOutput;
|
||||
}
|
||||
|
||||
function archiveThread(uwf: UwfStore, threadId: ThreadId, _workflow: CasRef, _head: CasRef): void {
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
function archiveThread(uwf: UwfStore, threadId: ThreadId, workflow: CasRef, head: CasRef): void {
|
||||
deleteThread(uwf.varStore, threadId);
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow,
|
||||
head,
|
||||
completedAt: Date.now(),
|
||||
reason: "completed",
|
||||
});
|
||||
}
|
||||
|
||||
export async function cmdThreadResume(
|
||||
@@ -1064,24 +1067,17 @@ export async function cmdThreadResume(
|
||||
const chain = walkChain(uwf, headHash);
|
||||
const workflowHash = chain.start.workflow;
|
||||
|
||||
// Check entry.status first for completed/cancelled (like in cmdThreadShow)
|
||||
let status: ThreadStatus;
|
||||
if (entry.status === "completed" || entry.status === "cancelled") {
|
||||
status = entry.status;
|
||||
} else {
|
||||
status = await resolveActiveThreadStatus(storageRoot, threadId, uwf, headHash, workflowHash);
|
||||
}
|
||||
|
||||
if (status !== "suspended" && status !== "completed") {
|
||||
fail(`thread cannot be resumed: ${threadId} (status: ${status})`);
|
||||
}
|
||||
|
||||
const plog = createProcessLogger({
|
||||
const status = await resolveActiveThreadStatus(
|
||||
storageRoot,
|
||||
context: { thread: threadId, workflow: workflowHash },
|
||||
});
|
||||
threadId,
|
||||
uwf,
|
||||
headHash,
|
||||
workflowHash,
|
||||
);
|
||||
if (status !== "suspended") {
|
||||
fail(`thread is not suspended: ${threadId} (status: ${status})`);
|
||||
}
|
||||
|
||||
if (status === "suspended") {
|
||||
const suspendFields = resolveSuspendFieldsForShow(entry, status, uwf, headHash, workflowHash);
|
||||
if (suspendFields.suspendedRole === null) {
|
||||
fail(`thread is suspended but suspendedRole is missing: ${threadId}`);
|
||||
@@ -1091,6 +1087,10 @@ export async function cmdThreadResume(
|
||||
}
|
||||
|
||||
const resumePrompt = buildResumePrompt(suspendFields.suspendMessage, supplement);
|
||||
const plog = createProcessLogger({
|
||||
storageRoot,
|
||||
context: { thread: threadId, workflow: workflowHash },
|
||||
});
|
||||
|
||||
plog.log(
|
||||
PL_THREAD_RESUME,
|
||||
@@ -1104,43 +1104,6 @@ export async function cmdThreadResume(
|
||||
});
|
||||
}
|
||||
|
||||
// status === "completed"
|
||||
const workflow = loadWorkflowPayload(uwf, workflowHash);
|
||||
const startResult = evaluate(workflow.graph, START_ROLE, { [STATUS_KEY]: "resume" });
|
||||
if (!startResult.ok) {
|
||||
fail(`failed to evaluate $START: ${startResult.error.message}`);
|
||||
}
|
||||
if (isSuspendResult(startResult.value)) {
|
||||
fail("workflow cannot start with $SUSPEND");
|
||||
}
|
||||
if (startResult.value.role === END_ROLE) {
|
||||
fail("workflow cannot start with $END");
|
||||
}
|
||||
|
||||
const startRole = startResult.value.role;
|
||||
const completedResumePrompt = buildResumePrompt(startResult.value.prompt, supplement);
|
||||
|
||||
const updatedEntry = { ...entry, status: "idle" as const, completedAt: null };
|
||||
setThread(uwf.varStore, threadId, updatedEntry);
|
||||
|
||||
plog.log(
|
||||
PL_THREAD_RESUME,
|
||||
`resume completed role=${startRole} supplement=${supplement !== null}`,
|
||||
null,
|
||||
);
|
||||
|
||||
return cmdThreadStepOnce(storageRoot, threadId, agentOverride, plog, {
|
||||
role: startRole,
|
||||
prompt: completedResumePrompt,
|
||||
});
|
||||
}
|
||||
|
||||
export function validateCount(count: number): void {
|
||||
if (count < 1 || !Number.isInteger(count)) {
|
||||
throw new Error(`--count must be a positive integer, got: ${count}`);
|
||||
}
|
||||
}
|
||||
|
||||
export async function cmdThreadExec(
|
||||
storageRoot: string,
|
||||
threadId: ThreadId,
|
||||
@@ -1149,7 +1112,9 @@ export async function cmdThreadExec(
|
||||
background: boolean,
|
||||
backgroundWorker: boolean,
|
||||
): Promise<StepOutput[]> {
|
||||
validateCount(count);
|
||||
if (count < 1 || !Number.isInteger(count)) {
|
||||
fail(`--count must be a positive integer, got: ${count}`);
|
||||
}
|
||||
|
||||
// Check if thread is already running in background (unless we ARE the background worker)
|
||||
if (!backgroundWorker) {
|
||||
@@ -1284,7 +1249,7 @@ function resolveResumeStepTarget(
|
||||
}
|
||||
|
||||
async function resolveModeratorStepTarget(
|
||||
_storageRoot: string,
|
||||
storageRoot: string,
|
||||
threadId: ThreadId,
|
||||
entry: ThreadIndexEntry,
|
||||
headHash: CasRef,
|
||||
@@ -1353,7 +1318,7 @@ async function resolveModeratorStepTarget(
|
||||
}
|
||||
|
||||
async function finalizeAgentStep(
|
||||
_storageRoot: string,
|
||||
storageRoot: string,
|
||||
threadId: ThreadId,
|
||||
workflowHash: CasRef,
|
||||
workflow: WorkflowPayload,
|
||||
@@ -1485,6 +1450,10 @@ async function resolveHeadHash(storageRoot: string, threadId: ThreadId): Promise
|
||||
if (entry !== null) {
|
||||
return entry.head;
|
||||
}
|
||||
const hist = findHistoryEntry(uwf.varStore, threadId);
|
||||
if (hist !== null) {
|
||||
return hist.head;
|
||||
}
|
||||
fail(`thread not found: ${threadId}`);
|
||||
}
|
||||
|
||||
@@ -1564,6 +1533,7 @@ export async function cmdThreadCancel(
|
||||
if (entry === null) {
|
||||
fail(`thread not active: ${threadId}`);
|
||||
}
|
||||
const head = entry.head;
|
||||
|
||||
// Check if thread is running in background and terminate it
|
||||
const runningMarker = await isThreadRunning(storageRoot, threadId);
|
||||
@@ -1576,7 +1546,21 @@ export async function cmdThreadCancel(
|
||||
await deleteMarker(storageRoot, threadId);
|
||||
}
|
||||
|
||||
completeThread(uwf.varStore, threadId, "cancelled");
|
||||
const workflow = resolveWorkflowFromHead(uwf, head);
|
||||
if (workflow === null) {
|
||||
fail(`failed to resolve workflow from head: ${head}`);
|
||||
}
|
||||
|
||||
deleteThread(uwf.varStore, threadId);
|
||||
|
||||
const historyEntry: ThreadHistoryLine = {
|
||||
thread: threadId,
|
||||
workflow,
|
||||
head,
|
||||
completedAt: Date.now(),
|
||||
reason: "cancelled",
|
||||
};
|
||||
addHistoryEntry(uwf.varStore, historyEntry);
|
||||
|
||||
return { thread: threadId, cancelled: true };
|
||||
}
|
||||
|
||||
@@ -6,11 +6,11 @@ describe("Edge prompt template variable resolution", () => {
|
||||
test("returns error when rendered prompt is empty string", () => {
|
||||
const graph = {
|
||||
$START: {
|
||||
new: { role: "classifier", prompt: "{{{userPrompt}}}", location: null },
|
||||
_: { role: "classifier", prompt: "{{{userPrompt}}}", location: null },
|
||||
},
|
||||
};
|
||||
|
||||
const result = evaluate(graph, "$START", { $status: "new" });
|
||||
const result = evaluate(graph, "$START", {});
|
||||
|
||||
expect(result.ok).toBe(false);
|
||||
if (!result.ok) {
|
||||
@@ -22,11 +22,11 @@ describe("Edge prompt template variable resolution", () => {
|
||||
test("returns error when rendered prompt is whitespace-only", () => {
|
||||
const graph = {
|
||||
$START: {
|
||||
new: { role: "classifier", prompt: " {{{userPrompt}}} ", location: null },
|
||||
_: { role: "classifier", prompt: " {{{userPrompt}}} ", location: null },
|
||||
},
|
||||
};
|
||||
|
||||
const result = evaluate(graph, "$START", { $status: "new" });
|
||||
const result = evaluate(graph, "$START", {});
|
||||
|
||||
expect(result.ok).toBe(false);
|
||||
if (!result.ok) {
|
||||
@@ -38,11 +38,11 @@ describe("Edge prompt template variable resolution", () => {
|
||||
test("succeeds when all template variables resolve to non-empty values", () => {
|
||||
const graph = {
|
||||
$START: {
|
||||
new: { role: "classifier", prompt: "{{{userPrompt}}}", location: null },
|
||||
_: { role: "classifier", prompt: "{{{userPrompt}}}", location: null },
|
||||
},
|
||||
};
|
||||
|
||||
const result = evaluate(graph, "$START", { $status: "new", userPrompt: "Fix the bug" });
|
||||
const result = evaluate(graph, "$START", { userPrompt: "Fix the bug" });
|
||||
|
||||
expect(result.ok).toBe(true);
|
||||
if (result.ok) {
|
||||
@@ -53,11 +53,11 @@ describe("Edge prompt template variable resolution", () => {
|
||||
test("succeeds with static (no-variable) prompt", () => {
|
||||
const graph = {
|
||||
$START: {
|
||||
new: { role: "classifier", prompt: "Classify this input", location: null },
|
||||
_: { role: "classifier", prompt: "Classify this input", location: null },
|
||||
},
|
||||
};
|
||||
|
||||
const result = evaluate(graph, "$START", { $status: "new" });
|
||||
const result = evaluate(graph, "$START", {});
|
||||
|
||||
expect(result.ok).toBe(true);
|
||||
if (result.ok) {
|
||||
@@ -68,11 +68,11 @@ describe("Edge prompt template variable resolution", () => {
|
||||
test("succeeds when prompt has mix of static text and unresolved variables", () => {
|
||||
const graph = {
|
||||
$START: {
|
||||
new: { role: "classifier", prompt: "Please handle: {{{userPrompt}}}", location: null },
|
||||
_: { role: "classifier", prompt: "Please handle: {{{userPrompt}}}", location: null },
|
||||
},
|
||||
};
|
||||
|
||||
const result = evaluate(graph, "$START", { $status: "new" });
|
||||
const result = evaluate(graph, "$START", {});
|
||||
|
||||
expect(result.ok).toBe(true);
|
||||
if (result.ok) {
|
||||
@@ -83,11 +83,11 @@ describe("Edge prompt template variable resolution", () => {
|
||||
test("returns error when ALL variables missing and no static text remains", () => {
|
||||
const graph = {
|
||||
$START: {
|
||||
new: { role: "classifier", prompt: "{{{a}}}{{{b}}}", location: null },
|
||||
_: { role: "classifier", prompt: "{{{a}}}{{{b}}}", location: null },
|
||||
},
|
||||
};
|
||||
|
||||
const result = evaluate(graph, "$START", { $status: "new" });
|
||||
const result = evaluate(graph, "$START", {});
|
||||
|
||||
expect(result.ok).toBe(false);
|
||||
});
|
||||
|
||||
@@ -6,7 +6,9 @@ import type { EvaluateResult, Result } from "./types.js";
|
||||
// Disable HTML escaping — prompts are plain text, not HTML.
|
||||
mustache.escape = (text: string) => text;
|
||||
|
||||
const START_ROLE = "$START";
|
||||
const SUSPEND_ROLE = "$SUSPEND";
|
||||
const UNIT_STATUS = "_";
|
||||
|
||||
type LastOutput = Record<string, unknown>;
|
||||
|
||||
@@ -17,15 +19,12 @@ export function evaluate(
|
||||
lastRole: string,
|
||||
lastOutput: LastOutput,
|
||||
): Result<EvaluateResult, Error> {
|
||||
let status: string;
|
||||
if (typeof lastOutput[STATUS_KEY] === "string") {
|
||||
status = lastOutput[STATUS_KEY] as string;
|
||||
} else {
|
||||
return {
|
||||
ok: false,
|
||||
error: new Error(`agent output for role "${lastRole}" is missing required "$status" string`),
|
||||
};
|
||||
}
|
||||
const status =
|
||||
lastRole === START_ROLE
|
||||
? UNIT_STATUS
|
||||
: typeof lastOutput[STATUS_KEY] === "string"
|
||||
? (lastOutput[STATUS_KEY] as string)
|
||||
: UNIT_STATUS;
|
||||
|
||||
const roleTargets = graph[lastRole];
|
||||
if (roleTargets === undefined) {
|
||||
|
||||
+53
-91
@@ -6,7 +6,13 @@ import { join } from "node:path";
|
||||
|
||||
import { bootstrap, type Hash, type Store, type VarStore } from "@ocas/core";
|
||||
import { createFsStore, createSqliteVarStore } from "@ocas/fs";
|
||||
import type { CasRef, ThreadId, ThreadIndexEntry, ThreadsIndex } from "@united-workforce/protocol";
|
||||
import type {
|
||||
CasRef,
|
||||
ThreadId,
|
||||
ThreadIndexEntry,
|
||||
ThreadListItem,
|
||||
ThreadsIndex,
|
||||
} from "@united-workforce/protocol";
|
||||
import { parseThreadsIndex } from "@united-workforce/protocol";
|
||||
import { parse } from "yaml";
|
||||
|
||||
@@ -20,6 +26,9 @@ export const REGISTRY_VAR_PREFIX = "@uwf/registry/";
|
||||
/** Variable name prefix for active thread entries (`@uwf/thread/<thread-id>`). */
|
||||
export const THREAD_VAR_PREFIX = "@uwf/thread/";
|
||||
|
||||
/** Variable name prefix for completed/cancelled thread history (`@uwf/history/<thread-id>`). */
|
||||
export const HISTORY_VAR_PREFIX = "@uwf/history/";
|
||||
|
||||
/** A workflow entry discovered from the project-local .workflows/ directory. */
|
||||
export type ProjectWorkflowEntry = {
|
||||
/** Workflow name (from YAML `name` field, equals filename stem). */
|
||||
@@ -147,6 +156,11 @@ export function getThreadsPath(storageRoot: string): string {
|
||||
return join(storageRoot, "threads.yaml");
|
||||
}
|
||||
|
||||
export type ThreadHistoryLine = ThreadListItem & {
|
||||
completedAt: number;
|
||||
reason: "completed" | "cancelled" | null;
|
||||
};
|
||||
|
||||
export type UwfStore = {
|
||||
storageRoot: string;
|
||||
store: Store;
|
||||
@@ -165,7 +179,6 @@ export async function createUwfStore(storageRoot: string): Promise<UwfStore> {
|
||||
await migrateWorkflowRegistryIfNeeded(storageRoot, varStore);
|
||||
await migrateThreadsIndexIfNeeded(storageRoot, varStore);
|
||||
await migrateHistoryIfNeeded(storageRoot, varStore);
|
||||
migrateHistoryVarsToThreadVars(varStore);
|
||||
return { storageRoot, store, schemas, varStore };
|
||||
}
|
||||
|
||||
@@ -286,10 +299,8 @@ function threadVarName(threadId: ThreadId): string {
|
||||
function entryFromVariable(v: { value: string; tags: Record<string, string> }): ThreadIndexEntry {
|
||||
return {
|
||||
head: v.value as CasRef,
|
||||
status: (v.tags.status ?? "idle") as ThreadIndexEntry["status"],
|
||||
suspendedRole: v.tags.suspendedRole ?? null,
|
||||
suspendMessage: v.tags.suspendMessage ?? null,
|
||||
completedAt: v.tags.completedAt !== undefined ? Number(v.tags.completedAt) : null,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -320,74 +331,21 @@ export function setThread(varStore: VarStore, threadId: ThreadId, entry: ThreadI
|
||||
// Head CAS nodes may use different schemas (StartNode vs StepNode) — clear all variants first.
|
||||
varStore.remove(name);
|
||||
const tags: Record<string, string> = {};
|
||||
if (entry.status !== "idle") {
|
||||
tags.status = entry.status;
|
||||
}
|
||||
if (entry.suspendedRole !== null) {
|
||||
tags.suspendedRole = entry.suspendedRole;
|
||||
}
|
||||
if (entry.suspendMessage !== null) {
|
||||
tags.suspendMessage = entry.suspendMessage;
|
||||
}
|
||||
if (entry.completedAt !== null) {
|
||||
tags.completedAt = String(entry.completedAt);
|
||||
}
|
||||
varStore.set(name, entry.head, { tags });
|
||||
}
|
||||
|
||||
/** Load only active threads (status not in completed/cancelled). */
|
||||
export function loadActiveThreads(varStore: VarStore): ThreadsIndex {
|
||||
const all = loadAllThreads(varStore);
|
||||
const active: ThreadsIndex = {};
|
||||
for (const [threadId, entry] of Object.entries(all)) {
|
||||
if (entry.status !== "completed" && entry.status !== "cancelled") {
|
||||
active[threadId as ThreadId] = entry;
|
||||
}
|
||||
}
|
||||
return active;
|
||||
/** Remove an active thread entry (on complete/cancel). */
|
||||
export function deleteThread(varStore: VarStore, threadId: ThreadId): void {
|
||||
varStore.remove(threadVarName(threadId));
|
||||
}
|
||||
|
||||
/** Load only completed/cancelled threads (history). */
|
||||
export function loadHistoryThreads(varStore: VarStore): ThreadsIndex {
|
||||
const all = loadAllThreads(varStore);
|
||||
const history: ThreadsIndex = {};
|
||||
for (const [threadId, entry] of Object.entries(all)) {
|
||||
if (entry.status === "completed" || entry.status === "cancelled") {
|
||||
history[threadId as ThreadId] = entry;
|
||||
}
|
||||
}
|
||||
return history;
|
||||
}
|
||||
|
||||
/** Complete a thread by marking it completed or cancelled. */
|
||||
export function completeThread(
|
||||
varStore: VarStore,
|
||||
threadId: ThreadId,
|
||||
reason: "completed" | "cancelled",
|
||||
): void {
|
||||
const entry = getThread(varStore, threadId);
|
||||
if (entry === null) {
|
||||
return;
|
||||
}
|
||||
const completed = {
|
||||
head: entry.head,
|
||||
status: reason,
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: Date.now(),
|
||||
} as ThreadIndexEntry;
|
||||
setThread(varStore, threadId, completed);
|
||||
}
|
||||
|
||||
type LegacyHistoryEntry = {
|
||||
thread: ThreadId;
|
||||
workflow: CasRef;
|
||||
head: CasRef;
|
||||
completedAt: number;
|
||||
reason: "completed" | "cancelled" | null;
|
||||
};
|
||||
|
||||
function parseLegacyHistoryJsonlLine(trimmed: string): LegacyHistoryEntry | null {
|
||||
function parseHistoryJsonlLine(trimmed: string): ThreadHistoryLine | null {
|
||||
let raw: unknown;
|
||||
try {
|
||||
raw = JSON.parse(trimmed) as unknown;
|
||||
@@ -421,7 +379,7 @@ function parseLegacyHistoryJsonlLine(trimmed: string): LegacyHistoryEntry | null
|
||||
return null;
|
||||
}
|
||||
|
||||
/** One-time migration: `~/.uwf/history.jsonl` → `@uwf/thread/*` variables with status tags. */
|
||||
/** One-time migration: `~/.uwf/history.jsonl` → `@uwf/history/*` variables. */
|
||||
export async function migrateHistoryIfNeeded(
|
||||
storageRoot: string,
|
||||
varStore: VarStore,
|
||||
@@ -437,43 +395,47 @@ export async function migrateHistoryIfNeeded(
|
||||
if (trimmed === "") {
|
||||
continue;
|
||||
}
|
||||
const entry = parseLegacyHistoryJsonlLine(trimmed);
|
||||
const entry = parseHistoryJsonlLine(trimmed);
|
||||
if (entry !== null) {
|
||||
const status = entry.reason === "cancelled" ? "cancelled" : "completed";
|
||||
const threadEntry: ThreadIndexEntry = {
|
||||
head: entry.head,
|
||||
status: status as ThreadIndexEntry["status"],
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: entry.completedAt,
|
||||
};
|
||||
setThread(varStore, entry.thread, threadEntry);
|
||||
addHistoryEntry(varStore, entry);
|
||||
}
|
||||
}
|
||||
|
||||
await rename(path, `${path}.migrated`);
|
||||
}
|
||||
|
||||
/** Migrate `@uwf/history/*` variables to `@uwf/thread/*` with status tags. */
|
||||
export function migrateHistoryVarsToThreadVars(varStore: VarStore): void {
|
||||
const LEGACY_HISTORY_VAR_PREFIX = "@uwf/history/";
|
||||
const vars = varStore.list({ namePrefix: LEGACY_HISTORY_VAR_PREFIX });
|
||||
|
||||
for (const v of vars) {
|
||||
const threadId = v.name.slice(LEGACY_HISTORY_VAR_PREFIX.length) as ThreadId;
|
||||
const reason = v.tags.reason;
|
||||
const status = reason === "cancelled" ? "cancelled" : "completed";
|
||||
const completedAt = Number(v.tags.completedAt ?? Date.now());
|
||||
|
||||
const threadEntry: ThreadIndexEntry = {
|
||||
export function loadAllHistory(varStore: VarStore): ThreadHistoryLine[] {
|
||||
const vars = varStore.list({ namePrefix: HISTORY_VAR_PREFIX });
|
||||
return vars.map((v) => ({
|
||||
thread: v.name.slice(HISTORY_VAR_PREFIX.length) as ThreadId,
|
||||
workflow: v.tags.workflow ?? "",
|
||||
head: v.value as CasRef,
|
||||
status: status as ThreadIndexEntry["status"],
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt,
|
||||
};
|
||||
completedAt: Number(v.tags.completedAt ?? "0"),
|
||||
reason: v.tags.reason === "completed" || v.tags.reason === "cancelled" ? v.tags.reason : null,
|
||||
}));
|
||||
}
|
||||
|
||||
setThread(varStore, threadId, threadEntry);
|
||||
varStore.remove(v.name);
|
||||
export function findHistoryEntry(varStore: VarStore, threadId: ThreadId): ThreadHistoryLine | null {
|
||||
const vars = varStore.list({ namePrefix: `${HISTORY_VAR_PREFIX}${threadId}` });
|
||||
const v = vars.find((entry) => entry.name === `${HISTORY_VAR_PREFIX}${threadId}`);
|
||||
if (v === undefined) {
|
||||
return null;
|
||||
}
|
||||
return {
|
||||
thread: threadId,
|
||||
workflow: v.tags.workflow ?? "",
|
||||
head: v.value as CasRef,
|
||||
completedAt: Number(v.tags.completedAt ?? "0"),
|
||||
reason: v.tags.reason === "completed" || v.tags.reason === "cancelled" ? v.tags.reason : null,
|
||||
};
|
||||
}
|
||||
|
||||
export function addHistoryEntry(varStore: VarStore, entry: ThreadHistoryLine): void {
|
||||
varStore.set(`${HISTORY_VAR_PREFIX}${entry.thread}`, entry.head, {
|
||||
tags: {
|
||||
workflow: entry.workflow,
|
||||
completedAt: String(entry.completedAt),
|
||||
reason: entry.reason ?? "completed",
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
@@ -24,22 +24,26 @@ function isOneOfSchema(fm: unknown): fm is SchemaObj & { oneOf: SchemaObj[] } {
|
||||
return Array.isArray(obj.oneOf);
|
||||
}
|
||||
|
||||
/** Check if a frontmatter schema declares "$status" as const (flat schema form). */
|
||||
function hasStatusConst(fm: unknown): boolean {
|
||||
/** Check if a frontmatter schema uses enum-based multi-exit ($status with multiple enum values). */
|
||||
function isEnumMultiExit(fm: unknown): boolean {
|
||||
if (typeof fm !== "object" || fm === null) return false;
|
||||
const obj = fm as SchemaObj;
|
||||
const props = obj.properties as Record<string, SchemaObj> | undefined;
|
||||
if (!props?.$status) return false;
|
||||
return typeof props.$status.const === "string";
|
||||
const statusDef = props.$status;
|
||||
if (!Array.isArray(statusDef.enum)) return false;
|
||||
// Filter out "_" (wildcard) — if remaining values > 1, it's multi-exit
|
||||
const statuses = (statusDef.enum as string[]).filter((s) => s !== "_");
|
||||
return statuses.length > 1;
|
||||
}
|
||||
|
||||
/** Extract status values from a const-based $status field. */
|
||||
function getConstStatuses(fm: SchemaObj): string[] {
|
||||
/** Extract status values from an enum-based $status field. */
|
||||
function getEnumStatuses(fm: SchemaObj): string[] {
|
||||
const props = fm.properties as Record<string, SchemaObj> | undefined;
|
||||
if (!props?.$status) return [];
|
||||
const statusDef = props.$status;
|
||||
if (typeof statusDef.const === "string") return [statusDef.const];
|
||||
return [];
|
||||
if (!Array.isArray(statusDef.enum)) return [];
|
||||
return (statusDef.enum as string[]).filter((s) => s !== "_");
|
||||
}
|
||||
|
||||
/** Get property names from a schema object. */
|
||||
@@ -97,9 +101,9 @@ function checkGraphStructure(payload: WorkflowPayload, errors: string[]): void {
|
||||
if (!graphNodes.has("$START")) {
|
||||
errors.push("$START must be defined in graph");
|
||||
} else {
|
||||
const startKeys = new Set(Object.keys(payload.graph.$START));
|
||||
if (!startKeys.has("new") || !startKeys.has("resume")) {
|
||||
errors.push('$START must have edges with statuses "new" and "resume"');
|
||||
const startKeys = Object.keys(payload.graph.$START);
|
||||
if (startKeys.length !== 1 || startKeys[0] !== "_") {
|
||||
errors.push('$START must have exactly one edge with status "_"');
|
||||
}
|
||||
}
|
||||
|
||||
@@ -190,13 +194,18 @@ function checkOneOfDiscriminant(
|
||||
}
|
||||
}
|
||||
|
||||
/** Check status-edge consistency for a user role. */
|
||||
function checkStatusEdges(
|
||||
/** Check status-edge consistency for a multi-exit role. */
|
||||
function checkMultiExitEdges(
|
||||
roleName: string,
|
||||
graphKeys: Set<string>,
|
||||
statusSet: Set<string>,
|
||||
errors: string[],
|
||||
): void {
|
||||
if (graphKeys.has("_")) {
|
||||
errors.push(`role "${roleName}" is multi-exit but graph uses "_"`);
|
||||
return;
|
||||
}
|
||||
|
||||
const extraKeys = [...graphKeys].filter((k) => !statusSet.has(k));
|
||||
const missingKeys = [...statusSet].filter((k) => !graphKeys.has(k));
|
||||
if (extraKeys.length > 0) {
|
||||
@@ -246,23 +255,50 @@ function checkRoleConsistency(payload: WorkflowPayload, errors: string[]): void
|
||||
const statuses = getOneOfStatuses(variants);
|
||||
|
||||
checkOneOfDiscriminant(roleName, variants, statuses, errors);
|
||||
checkStatusEdges(roleName, graphKeys, new Set(statuses), errors);
|
||||
checkMultiExitEdges(roleName, graphKeys, new Set(statuses), errors);
|
||||
checkMultiExitMustache(roleName, graphEntry, variants, errors);
|
||||
} else if (hasStatusConst(fm)) {
|
||||
const statuses = getConstStatuses(fm as SchemaObj);
|
||||
checkStatusEdges(roleName, graphKeys, new Set(statuses), errors);
|
||||
// For const-based flat schemas, mustache vars come from the flat properties
|
||||
checkFlatMustache(roleName, graphEntry, fm as SchemaObj, errors);
|
||||
} else if (isEnumMultiExit(fm)) {
|
||||
const statuses = getEnumStatuses(fm as SchemaObj);
|
||||
checkMultiExitEdges(roleName, graphKeys, new Set(statuses), errors);
|
||||
// For enum-based schemas, mustache vars come from the flat properties
|
||||
checkSingleExitMustache(roleName, graphEntry, fm as SchemaObj, errors);
|
||||
} else {
|
||||
errors.push(
|
||||
`role "${roleName}" must define "$status" as const (or oneOf with const) in frontmatter`,
|
||||
);
|
||||
checkSingleExitRole(roleName, graphKeys, graphEntry, fm as SchemaObj | null, errors);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Check single-exit role status and mustache. */
|
||||
function checkSingleExitRole(
|
||||
roleName: string,
|
||||
graphKeys: Set<string>,
|
||||
graphEntry: Record<string, { role: string; prompt: string }>,
|
||||
fm: SchemaObj | null,
|
||||
errors: string[],
|
||||
): void {
|
||||
if (graphKeys.size > 1 || (graphKeys.size === 1 && !graphKeys.has("_"))) {
|
||||
if (!graphKeys.has("_")) {
|
||||
errors.push(`role "${roleName}" is single-exit but graph has no "_" key`);
|
||||
} else {
|
||||
errors.push(`role "${roleName}" is single-exit but has status keys other than "_"`);
|
||||
}
|
||||
}
|
||||
|
||||
const singleTarget = graphEntry._;
|
||||
if (!singleTarget) return;
|
||||
|
||||
const vars = extractMustacheVars(singleTarget.prompt);
|
||||
const propNames = fm ? getPropertyNames(fm) : new Set<string>();
|
||||
for (const v of vars) {
|
||||
if (v === "$status") continue;
|
||||
if (!propNames.has(v)) {
|
||||
errors.push(`prompt variable "${v}" not found in role "${roleName}" frontmatter`);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Check mustache vars in all edge prompts against flat schema properties. */
|
||||
function checkFlatMustache(
|
||||
function checkSingleExitMustache(
|
||||
roleName: string,
|
||||
graphEntry: Record<string, { role: string; prompt: string }>,
|
||||
fm: SchemaObj,
|
||||
|
||||
@@ -57,18 +57,9 @@ function isGraph(value: unknown): boolean {
|
||||
if (!isRecord(value)) {
|
||||
return false;
|
||||
}
|
||||
return Object.values(value).every((statusMap) => {
|
||||
if (!isRecord(statusMap)) {
|
||||
return false;
|
||||
}
|
||||
return Object.entries(statusMap).every(([status, target]) => {
|
||||
// "_" is no longer a valid status key anywhere — $START uses "new"/"resume".
|
||||
if (status === "_") {
|
||||
return false;
|
||||
}
|
||||
return isTarget(target);
|
||||
});
|
||||
});
|
||||
return Object.values(value).every(
|
||||
(statusMap) => isRecord(statusMap) && Object.values(statusMap).every((t) => isTarget(t)),
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -99,13 +90,12 @@ export function checkWorkflowFilenameConsistency(
|
||||
): string | null {
|
||||
const expected = workflowNameFromPath(filePath);
|
||||
if (payload.name !== expected) {
|
||||
return `workflow name mismatch: file "${basename(filePath)}" implies name "${expected}" but YAML declares name "${payload.name}". Either rename the file to "${payload.name}.yaml" or change the YAML \`name\` field to "${expected}"`;
|
||||
return `workflow name mismatch: file "${basename(filePath)}" implies name "${expected}" but YAML declares name "${payload.name}"`;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Validate YAML-parsed workflow document shape (outputSchema may be inline JSON Schema). */
|
||||
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: validation function with many field checks
|
||||
export function parseWorkflowPayload(raw: unknown): WorkflowPayload | null {
|
||||
if (!isRecord(raw)) {
|
||||
return null;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@united-workforce/dashboard",
|
||||
"version": "0.1.0",
|
||||
"version": "0.5.0-alpha.4",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
|
||||
@@ -1,9 +0,0 @@
|
||||
# @united-workforce/eval
|
||||
|
||||
## 0.1.2
|
||||
|
||||
### Patch Changes
|
||||
|
||||
- 850a3b2: fix: resolve --agent override via config alias before raw command
|
||||
|
||||
`resolveAgentConfig()` now checks `config.agents[alias]` first before falling back to `parseAgentOverride()`. Eval CLI default `--agent` changed from `"hermes"` to `"uwf-hermes"`.
|
||||
@@ -1,219 +0,0 @@
|
||||
import type { StepEntry } from "@united-workforce/protocol";
|
||||
import { beforeEach, describe, expect, test, vi } from "vitest";
|
||||
|
||||
import {
|
||||
runFrontmatterJudge,
|
||||
runHallucinationJudge,
|
||||
runTokenStatsJudge,
|
||||
runUpstreamJudge,
|
||||
} from "../src/judge/builtin/index.js";
|
||||
|
||||
// Mock the shared read-steps helper so the judges never shell out to `uwf`.
|
||||
vi.mock("../src/judge/builtin/read-steps.js", () => ({
|
||||
readThreadSteps: vi.fn(),
|
||||
}));
|
||||
|
||||
import { readThreadSteps } from "../src/judge/builtin/read-steps.js";
|
||||
|
||||
const mockedReadSteps = vi.mocked(readThreadSteps);
|
||||
|
||||
function makeStep(overrides: Partial<StepEntry>): StepEntry {
|
||||
return {
|
||||
hash: "HASH000000000",
|
||||
role: "worker",
|
||||
output: "---\n$status: done\n---\n\nbody",
|
||||
detail: "DETAIL0000000",
|
||||
agent: "hermes",
|
||||
timestamp: 0,
|
||||
durationMs: 0,
|
||||
usage: null,
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
beforeEach(() => {
|
||||
mockedReadSteps.mockReset();
|
||||
});
|
||||
|
||||
describe("frontmatter-compliance judge", () => {
|
||||
test("all steps have valid frontmatter → score 1.0", async () => {
|
||||
mockedReadSteps.mockReturnValue([
|
||||
makeStep({ role: "a", output: "---\n$status: done\n---\n\nwork" }),
|
||||
makeStep({ role: "b", output: "---\n$status: needs_input\n---\nmore" }),
|
||||
]);
|
||||
|
||||
const result = await runFrontmatterJudge("T1");
|
||||
const data = result.data as { stepsTotal: number; stepsValid: number; invalidSteps: unknown[] };
|
||||
|
||||
expect(result.score).toBe(1.0);
|
||||
expect(data.stepsTotal).toBe(2);
|
||||
expect(data.stepsValid).toBe(2);
|
||||
expect(data.invalidSteps).toHaveLength(0);
|
||||
});
|
||||
|
||||
test("some steps missing $status → partial score", async () => {
|
||||
mockedReadSteps.mockReturnValue([
|
||||
makeStep({ role: "a", output: "---\n$status: done\n---\nok" }),
|
||||
makeStep({ role: "b", output: "---\nfoo: bar\n---\nmissing status" }),
|
||||
makeStep({ role: "c", output: "no frontmatter at all" }),
|
||||
]);
|
||||
|
||||
const result = await runFrontmatterJudge("T2");
|
||||
const data = result.data as {
|
||||
stepsTotal: number;
|
||||
stepsValid: number;
|
||||
invalidSteps: Array<{ stepIndex: number; role: string; errors: string[] }>;
|
||||
};
|
||||
|
||||
expect(result.score).toBeCloseTo(1 / 3, 10);
|
||||
expect(data.stepsTotal).toBe(3);
|
||||
expect(data.stepsValid).toBe(1);
|
||||
expect(data.invalidSteps).toHaveLength(2);
|
||||
expect(data.invalidSteps[0]).toMatchObject({ stepIndex: 1, role: "b" });
|
||||
expect(data.invalidSteps[1]).toMatchObject({ stepIndex: 2, role: "c" });
|
||||
});
|
||||
|
||||
test("no steps → score 0 (0/0 edge case)", async () => {
|
||||
mockedReadSteps.mockReturnValue([]);
|
||||
|
||||
const result = await runFrontmatterJudge("T3");
|
||||
const data = result.data as { stepsTotal: number; stepsValid: number; invalidSteps: unknown[] };
|
||||
|
||||
expect(result.score).toBe(0);
|
||||
expect(data.stepsTotal).toBe(0);
|
||||
expect(data.stepsValid).toBe(0);
|
||||
expect(data.invalidSteps).toHaveLength(0);
|
||||
});
|
||||
|
||||
test("empty-string $status counts as invalid", async () => {
|
||||
mockedReadSteps.mockReturnValue([makeStep({ role: "a", output: '---\n$status: ""\n---\nx' })]);
|
||||
|
||||
const result = await runFrontmatterJudge("T4");
|
||||
expect(result.score).toBe(0);
|
||||
});
|
||||
|
||||
test("parsed object output with $status → score 1.0", async () => {
|
||||
mockedReadSteps.mockReturnValue([
|
||||
makeStep({ role: "a", output: { $status: "done", summary: "fixed" } as unknown as string }),
|
||||
makeStep({ role: "b", output: { $status: "reviewed" } as unknown as string }),
|
||||
]);
|
||||
|
||||
const result = await runFrontmatterJudge("T5");
|
||||
const data = result.data as { stepsTotal: number; stepsValid: number; invalidSteps: unknown[] };
|
||||
|
||||
expect(result.score).toBe(1.0);
|
||||
expect(data.stepsTotal).toBe(2);
|
||||
expect(data.stepsValid).toBe(2);
|
||||
});
|
||||
|
||||
test("parsed object output missing $status → score 0", async () => {
|
||||
mockedReadSteps.mockReturnValue([
|
||||
makeStep({ role: "a", output: { summary: "no status field" } as unknown as string }),
|
||||
]);
|
||||
|
||||
const result = await runFrontmatterJudge("T6");
|
||||
expect(result.score).toBe(0);
|
||||
});
|
||||
});
|
||||
|
||||
describe("token-stats judge", () => {
|
||||
test("steps with usage → sums correctly", async () => {
|
||||
mockedReadSteps.mockReturnValue([
|
||||
makeStep({
|
||||
role: "a",
|
||||
usage: { turns: 2, inputTokens: 100, outputTokens: 50, duration: 1.5 },
|
||||
}),
|
||||
makeStep({
|
||||
role: "b",
|
||||
usage: { turns: 3, inputTokens: 200, outputTokens: 75, duration: 2.0 },
|
||||
}),
|
||||
]);
|
||||
|
||||
const result = await runTokenStatsJudge("T1");
|
||||
const data = result.data as {
|
||||
totalInput: number;
|
||||
totalOutput: number;
|
||||
totalTurns: number;
|
||||
perStep: Array<{ role: string; inputTokens: number; outputTokens: number; turns: number }>;
|
||||
};
|
||||
|
||||
expect(result.score).toBe(1.0);
|
||||
expect(data.totalInput).toBe(300);
|
||||
expect(data.totalOutput).toBe(125);
|
||||
expect(data.totalTurns).toBe(5);
|
||||
expect(data.perStep).toHaveLength(2);
|
||||
expect(data.perStep[0]).toEqual({
|
||||
role: "a",
|
||||
inputTokens: 100,
|
||||
outputTokens: 50,
|
||||
turns: 2,
|
||||
duration: 1.5,
|
||||
});
|
||||
});
|
||||
|
||||
test("steps with null usage → zeros", async () => {
|
||||
mockedReadSteps.mockReturnValue([
|
||||
makeStep({ role: "a", usage: null }),
|
||||
makeStep({ role: "b", usage: null }),
|
||||
]);
|
||||
|
||||
const result = await runTokenStatsJudge("T2");
|
||||
const data = result.data as {
|
||||
totalInput: number;
|
||||
totalOutput: number;
|
||||
totalTurns: number;
|
||||
perStep: Array<{
|
||||
inputTokens: number;
|
||||
outputTokens: number;
|
||||
turns: number;
|
||||
duration: number;
|
||||
}>;
|
||||
};
|
||||
|
||||
expect(result.score).toBe(1.0);
|
||||
expect(data.totalInput).toBe(0);
|
||||
expect(data.totalOutput).toBe(0);
|
||||
expect(data.totalTurns).toBe(0);
|
||||
expect(data.perStep[0]).toEqual({
|
||||
role: "a",
|
||||
inputTokens: 0,
|
||||
outputTokens: 0,
|
||||
turns: 0,
|
||||
duration: 0,
|
||||
});
|
||||
});
|
||||
|
||||
test("empty steps → all zeros, score 1.0", async () => {
|
||||
mockedReadSteps.mockReturnValue([]);
|
||||
|
||||
const result = await runTokenStatsJudge("T3");
|
||||
const data = result.data as {
|
||||
totalInput: number;
|
||||
totalOutput: number;
|
||||
totalTurns: number;
|
||||
perStep: unknown[];
|
||||
};
|
||||
|
||||
expect(result.score).toBe(1.0);
|
||||
expect(data.totalInput).toBe(0);
|
||||
expect(data.totalOutput).toBe(0);
|
||||
expect(data.totalTurns).toBe(0);
|
||||
expect(data.perStep).toHaveLength(0);
|
||||
});
|
||||
});
|
||||
|
||||
describe("LLM-as-judge stubs", () => {
|
||||
test("upstream-consumption returns a stub", async () => {
|
||||
const result = await runUpstreamJudge("T1");
|
||||
expect(result.score).toBe(0);
|
||||
expect(result.data).toEqual({ perStep: [] });
|
||||
expect(result.schema.title).toBe("@uwf/eval-judge-upstream");
|
||||
});
|
||||
|
||||
test("hallucination returns a stub", async () => {
|
||||
const result = await runHallucinationJudge("T1");
|
||||
expect(result.score).toBe(0);
|
||||
expect(result.data).toEqual({ perStep: [] });
|
||||
expect(result.schema.title).toBe("@uwf/eval-judge-hallucination");
|
||||
});
|
||||
});
|
||||
@@ -1,152 +0,0 @@
|
||||
import { bootstrap, createMemoryStore } from "@ocas/core";
|
||||
import { describe, expect, test } from "vitest";
|
||||
import type { JudgeRunner } from "../src/runner/index.js";
|
||||
import { collect, computeOverall } from "../src/runner/index.js";
|
||||
import type { EvalRunConfig, EvalStore } from "../src/storage/index.js";
|
||||
import type { JudgeEntry, TaskManifest } from "../src/task/index.js";
|
||||
|
||||
function makeJudge(name: string, weight: number, builtin: boolean): JudgeEntry {
|
||||
return {
|
||||
name,
|
||||
weight,
|
||||
builtin,
|
||||
entry: builtin ? null : `dist/judges/${name}.js`,
|
||||
schema: null,
|
||||
};
|
||||
}
|
||||
|
||||
function makeManifest(judges: JudgeEntry[]): TaskManifest {
|
||||
return {
|
||||
name: "fix-off-by-one",
|
||||
description: "test task",
|
||||
workflow: "solve-issue",
|
||||
prompt: "Fix the bug",
|
||||
limits: { maxSteps: 10, timeoutMinutes: 30 },
|
||||
judges,
|
||||
};
|
||||
}
|
||||
|
||||
function makeEvalStore(): EvalStore {
|
||||
const store = createMemoryStore();
|
||||
bootstrap(store);
|
||||
return { store, varStore: store.var };
|
||||
}
|
||||
|
||||
const CONFIG: EvalRunConfig = {
|
||||
agent: "hermes",
|
||||
model: "claude-sonnet-4",
|
||||
engineVersion: "test",
|
||||
};
|
||||
|
||||
/** Returns a fixed score per judge name. */
|
||||
function scriptedRunner(scores: Record<string, number>): JudgeRunner {
|
||||
return async (_taskDir, _workDir, _threadId, judge) => ({
|
||||
score: scores[judge.name] ?? 0,
|
||||
data: { judged: judge.name },
|
||||
schema: { type: "object" },
|
||||
});
|
||||
}
|
||||
|
||||
describe("computeOverall", () => {
|
||||
test("computes the weighted average correctly", () => {
|
||||
const overall = computeOverall([
|
||||
{ score: 0.8, weight: 0.3 },
|
||||
{ score: 0.6, weight: 0.3 },
|
||||
{ score: 1.0, weight: 0.4 },
|
||||
]);
|
||||
// 0.24 + 0.18 + 0.4 = 0.82
|
||||
expect(overall).toBeCloseTo(0.82, 10);
|
||||
});
|
||||
|
||||
test("a weight-0 judge does not affect the result", () => {
|
||||
const withInformational = computeOverall([
|
||||
{ score: 1.0, weight: 1.0 },
|
||||
{ score: 0.0, weight: 0.0 },
|
||||
]);
|
||||
expect(withInformational).toBe(1.0);
|
||||
});
|
||||
|
||||
test("returns 0 when total weight is 0", () => {
|
||||
expect(computeOverall([{ score: 0.5, weight: 0 }])).toBe(0);
|
||||
});
|
||||
});
|
||||
|
||||
describe("collect", () => {
|
||||
test("computes weighted score correctly across judges", async () => {
|
||||
const evalStore = makeEvalStore();
|
||||
const manifest = makeManifest([
|
||||
makeJudge("test-pass", 0.6, false),
|
||||
makeJudge("code-quality", 0.4, false),
|
||||
]);
|
||||
const runJudge = scriptedRunner({ "test-pass": 1.0, "code-quality": 0.5 });
|
||||
|
||||
const result = await collect(
|
||||
{
|
||||
evalStore,
|
||||
taskDir: "/tmp/task",
|
||||
workDir: "/tmp/work",
|
||||
threadId: "THREAD123",
|
||||
manifest,
|
||||
config: CONFIG,
|
||||
},
|
||||
runJudge,
|
||||
);
|
||||
|
||||
// 1.0 * 0.6 + 0.5 * 0.4 = 0.8
|
||||
expect(result.overall).toBeCloseTo(0.8, 10);
|
||||
expect(result.runHash).toBeTruthy();
|
||||
expect(result.judges).toHaveLength(2);
|
||||
expect(result.judges[0]).toEqual({ name: "test-pass", score: 1.0, weight: 0.6 });
|
||||
|
||||
const latest = evalStore.varStore.list({
|
||||
exactName: "@uwf/eval/fix-off-by-one/latest",
|
||||
});
|
||||
expect(latest[0]?.value).toBe(result.runHash);
|
||||
});
|
||||
|
||||
test("handles a judge with weight 0 (informational)", async () => {
|
||||
const evalStore = makeEvalStore();
|
||||
const manifest = makeManifest([
|
||||
makeJudge("test-pass", 1.0, false),
|
||||
makeJudge("token-stats", 0, true),
|
||||
]);
|
||||
// token-stats is builtin → default runner would score 0; give scripted score
|
||||
// that would skew the result if it were counted.
|
||||
const runJudge = scriptedRunner({ "test-pass": 0.5, "token-stats": 1.0 });
|
||||
|
||||
const result = await collect(
|
||||
{
|
||||
evalStore,
|
||||
taskDir: "/tmp/task",
|
||||
workDir: "/tmp/work",
|
||||
threadId: "THREAD123",
|
||||
manifest,
|
||||
config: CONFIG,
|
||||
},
|
||||
runJudge,
|
||||
);
|
||||
|
||||
// Only test-pass (weight 1.0) counts → overall = 0.5
|
||||
expect(result.overall).toBeCloseTo(0.5, 10);
|
||||
expect(result.judges).toHaveLength(2);
|
||||
const tokenStats = result.judges.find((j) => j.name === "token-stats");
|
||||
expect(tokenStats?.weight).toBe(0);
|
||||
});
|
||||
|
||||
test("unknown builtin judge name throws via the default runner", async () => {
|
||||
const evalStore = makeEvalStore();
|
||||
const manifest = makeManifest([makeJudge("not-a-real-judge", 1.0, true)]);
|
||||
|
||||
// Use the default runner (no injected runner) → builtin dispatch → unknown name throws.
|
||||
await expect(
|
||||
collect({
|
||||
evalStore,
|
||||
taskDir: "/tmp/task",
|
||||
workDir: "/tmp/work",
|
||||
threadId: "THREAD123",
|
||||
manifest,
|
||||
config: CONFIG,
|
||||
}),
|
||||
).rejects.toThrow(/unknown builtin judge/);
|
||||
});
|
||||
});
|
||||
@@ -1,171 +0,0 @@
|
||||
import { bootstrap, createMemoryStore, putSchema } from "@ocas/core";
|
||||
import type { CasRef } from "@united-workforce/protocol";
|
||||
import { describe, expect, test } from "vitest";
|
||||
|
||||
import {
|
||||
formatDiff,
|
||||
formatList,
|
||||
formatReport,
|
||||
readEvalEntries,
|
||||
readEvalRun,
|
||||
selectEntries,
|
||||
} from "../src/commands/index.js";
|
||||
import type { EvalRunPayload, EvalStore } from "../src/storage/index.js";
|
||||
import { EVAL_RUN_SCHEMA, setEvalLatest } from "../src/storage/index.js";
|
||||
|
||||
function makeEvalStore(): EvalStore {
|
||||
const store = createMemoryStore();
|
||||
bootstrap(store);
|
||||
return { store, varStore: store.var };
|
||||
}
|
||||
|
||||
function makePayload(
|
||||
task: string,
|
||||
overall: number,
|
||||
timestamp: number,
|
||||
judges: EvalRunPayload["judges"] = [
|
||||
{
|
||||
name: "frontmatter-compliance",
|
||||
score: 1.0,
|
||||
weight: 0.6,
|
||||
dataHash: "AAAAAAAAAAAAA" as CasRef,
|
||||
},
|
||||
{ name: "token-stats", score: 0.5, weight: 0, dataHash: "BBBBBBBBBBBBB" as CasRef },
|
||||
],
|
||||
config: EvalRunPayload["config"] = {
|
||||
agent: "hermes",
|
||||
model: "claude-sonnet-4",
|
||||
engineVersion: "1.0.0",
|
||||
},
|
||||
): EvalRunPayload {
|
||||
return { task, config, threadId: "THREAD0123456789", judges, overall, timestamp };
|
||||
}
|
||||
|
||||
/** Store an eval-run node in CAS and index it under @uwf/eval/<task>/latest. */
|
||||
function storeRun(evalStore: EvalStore, payload: EvalRunPayload): string {
|
||||
const schemaHash = putSchema(evalStore.store, EVAL_RUN_SCHEMA);
|
||||
const hash = evalStore.store.cas.put(schemaHash, payload);
|
||||
setEvalLatest(evalStore.varStore, payload.task, hash);
|
||||
return hash;
|
||||
}
|
||||
|
||||
// Tests for the single-run report renderer and the CAS read helper it is fed from.
describe("formatReport", () => {
  test("includes task, overall, config and judges", () => {
    const runHash = "RUNHASH123456";
    const output = formatReport(
      makePayload("fix-off-by-one", 0.8, Date.UTC(2026, 0, 2, 3, 4, 5)),
      runHash,
    );

    // Every fragment below must appear somewhere in the rendered report.
    const expectedFragments = [
      "fix-off-by-one",
      "0.8000",
      "hermes",
      "claude-sonnet-4",
      "1.0.0",
      "frontmatter-compliance",
      "token-stats",
      "THREAD0123456789",
      runHash,
    ];
    for (const fragment of expectedFragments) {
      expect(output).toContain(fragment);
    }
  });

  test("round-trips a stored run via readEvalRun", () => {
    const evalStore = makeEvalStore();
    const hash = storeRun(evalStore, makePayload("fix-off-by-one", 0.75, Date.now()));

    const loaded = readEvalRun(evalStore, hash);
    expect(loaded).not.toBeNull();

    const output = formatReport(loaded as EvalRunPayload, hash);
    for (const fragment of ["fix-off-by-one", "0.7500"]) {
      expect(output).toContain(fragment);
    }
  });

  test("readEvalRun returns null for a missing hash", () => {
    const missing = readEvalRun(makeEvalStore(), "NOPENOPENOPE0");
    expect(missing).toBeNull();
  });
});
|
||||
|
||||
// Tests for listing indexed eval runs: reading entries, ordering, filtering and limiting.
describe("list", () => {
  /** Create a fresh store and persist one run per `[task, overall, timestamp]` row, in order. */
  const seededStore = (rows: ReadonlyArray<readonly [string, number, number]>) => {
    const evalStore = makeEvalStore();
    for (const [task, overall, timestamp] of rows) {
      storeRun(evalStore, makePayload(task, overall, timestamp));
    }
    return evalStore;
  };

  test("lists eval runs stored under different tasks", () => {
    const evalStore = seededStore([
      ["fix-off-by-one", 0.8, 2000],
      ["write-docs", 0.6, 1000],
    ]);

    const entries = readEvalEntries(evalStore);
    expect(entries).toHaveLength(2);

    const output = formatList(selectEntries(entries, null, 20));
    expect(output).toContain("fix-off-by-one");
    expect(output).toContain("write-docs");
  });

  test("sorts newest-first by timestamp", () => {
    // Stored oldest-first so the expected order differs from insertion order.
    const evalStore = seededStore([
      ["old-task", 0.5, 1000],
      ["new-task", 0.5, 2000],
    ]);

    const selected = selectEntries(readEvalEntries(evalStore), null, 20);
    expect(selected[0]?.task).toBe("new-task");
    expect(selected[1]?.task).toBe("old-task");
  });

  test("--task filter only shows the matching task", () => {
    const evalStore = seededStore([
      ["fix-off-by-one", 0.8, 2000],
      ["write-docs", 0.6, 1000],
    ]);

    const output = formatList(selectEntries(readEvalEntries(evalStore), "write-docs", 20));
    expect(output).toContain("write-docs");
    expect(output).not.toContain("fix-off-by-one");
  });

  test("--limit caps the number of rows", () => {
    const evalStore = seededStore([
      ["task-a", 0.8, 3000],
      ["task-b", 0.6, 2000],
      ["task-c", 0.4, 1000],
    ]);

    const selected = selectEntries(readEvalEntries(evalStore), null, 2);
    expect(selected).toHaveLength(2);
    expect(selected.map((e) => e.task)).toEqual(["task-a", "task-b"]);
  });

  test("empty store renders a placeholder", () => {
    const entries = readEvalEntries(makeEvalStore());
    const output = formatList(selectEntries(entries, null, 20));
    expect(output).toContain("(no eval runs found)");
  });
});
|
||||
|
||||
// Tests for the two-run comparison renderer.
describe("formatDiff", () => {
  const HASH_A = "HASHA00000000";
  const HASH_B = "HASHB00000000";
  const TASK = "fix-off-by-one";

  test("shows an upward delta when B scores higher", () => {
    const output = formatDiff(
      makePayload(TASK, 0.6, 1000),
      HASH_A,
      makePayload(TASK, 0.8, 2000),
      HASH_B,
    );

    for (const fragment of ["▲", HASH_A, HASH_B]) {
      expect(output).toContain(fragment);
    }
  });

  test("shows a downward delta when B scores lower", () => {
    const output = formatDiff(
      makePayload(TASK, 0.9, 1000),
      HASH_A,
      makePayload(TASK, 0.4, 2000),
      HASH_B,
    );
    expect(output).toContain("▼");
  });

  test("marks differing config values", () => {
    // Both runs share model and engine version; only the agent differs.
    const configA = { agent: "hermes", model: "claude-sonnet-4", engineVersion: "1.0.0" };
    const configB = { ...configA, agent: "claude-code" };

    const output = formatDiff(
      makePayload(TASK, 0.6, 1000, undefined, configA),
      HASH_A,
      makePayload(TASK, 0.6, 2000, undefined, configB),
      HASH_B,
    );
    expect(output).toContain("≠");
    expect(output).toContain("claude-code");
  });
});
|
||||
@@ -1,74 +0,0 @@
|
||||
import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
|
||||
import { afterEach, beforeEach, describe, expect, test } from "vitest";
|
||||
|
||||
import { prepare } from "../src/runner/index.js";
|
||||
|
||||
const TASK_YAML = `
|
||||
name: fix-off-by-one
|
||||
description: Fix an off-by-one error
|
||||
workflow: solve-issue
|
||||
prompt: "Fix the bug"
|
||||
limits:
|
||||
maxSteps: 12
|
||||
timeoutMinutes: 20
|
||||
judges:
|
||||
- name: frontmatter-compliance
|
||||
weight: 0.5
|
||||
builtin: true
|
||||
- name: test-pass
|
||||
weight: 0.5
|
||||
entry: dist/judges/test-pass.js
|
||||
`;
|
||||
|
||||
// Per-test scratch task directory; recreated before and removed after every test.
let taskDir: string;

// Lay out a task directory containing task.yaml and a small fixture/ tree.
beforeEach(async () => {
  taskDir = await mkdtemp(join(tmpdir(), "uwf-eval-task-"));
  await writeFile(join(taskDir, "task.yaml"), TASK_YAML, "utf8");

  const fixtureSrcDir = join(taskDir, "fixture", "src");
  // `recursive` also creates the intermediate fixture/ directory.
  await mkdir(fixtureSrcDir, { recursive: true });

  const calcSource = "export const add = (a, b) => a + b + 1;\n";
  await writeFile(join(fixtureSrcDir, "calc.ts"), calcSource);

  const fixturePackageJson = '{ "name": "fixture" }\n';
  await writeFile(join(taskDir, "fixture", "package.json"), fixturePackageJson);
});

// Remove the scratch task directory; `force` tolerates it already being gone.
afterEach(async () => {
  await rm(taskDir, { recursive: true, force: true });
});
|
||||
|
||||
// Tests for `prepare`. Each call returns a `workDir` under the OS temp dir, so
// every test removes it in a `finally` block — a failed assertion must not
// leave directories behind.
describe("prepare", () => {
  test("returns the parsed manifest", async () => {
    const result = await prepare(taskDir);
    try {
      expect(result.taskDir).toBe(taskDir);
      expect(result.manifest.name).toBe("fix-off-by-one");
      expect(result.manifest.workflow).toBe("solve-issue");
      expect(result.manifest.limits.maxSteps).toBe(12);
      expect(result.manifest.judges).toHaveLength(2);
    } finally {
      await rm(result.workDir, { recursive: true, force: true });
    }
  });

  test("copies fixture into a fresh temp work dir", async () => {
    const result = await prepare(taskDir);
    try {
      expect(result.workDir).not.toBe(taskDir);
      expect(result.workDir.startsWith(tmpdir())).toBe(true);

      const calc = await readFile(join(result.workDir, "src", "calc.ts"), "utf8");
      expect(calc).toContain("export const add");
      const pkg = await readFile(join(result.workDir, "package.json"), "utf8");
      expect(pkg).toContain("fixture");
    } finally {
      await rm(result.workDir, { recursive: true, force: true });
    }
  });

  test("creates an empty work dir when no fixture/ exists", async () => {
    const noFixtureDir = await mkdtemp(join(tmpdir(), "uwf-eval-nofix-"));
    try {
      await writeFile(join(noFixtureDir, "task.yaml"), TASK_YAML, "utf8");

      const result = await prepare(noFixtureDir);
      try {
        expect(result.workDir.startsWith(tmpdir())).toBe(true);
      } finally {
        await rm(result.workDir, { recursive: true, force: true });
      }
    } finally {
      // Runs even when `prepare` itself throws.
      await rm(noFixtureDir, { recursive: true, force: true });
    }
  });
});
|
||||
@@ -1,63 +0,0 @@
|
||||
import { describe, expect, test } from "vitest";
|
||||
import {
|
||||
EVAL_JUDGE_FRONTMATTER_SCHEMA,
|
||||
EVAL_JUDGE_HALLUCINATION_SCHEMA,
|
||||
EVAL_JUDGE_TOKEN_STATS_SCHEMA,
|
||||
EVAL_JUDGE_UPSTREAM_SCHEMA,
|
||||
EVAL_RUN_SCHEMA,
|
||||
} from "../src/storage/index.js";
|
||||
|
||||
// Structural checks on the exported schema constants: title, required fields, root type.
describe("OCAS schema definitions", () => {
  test("eval-run schema has correct title and required fields", () => {
    expect(EVAL_RUN_SCHEMA.title).toBe("@uwf/eval-run");
    const required = EVAL_RUN_SCHEMA.required as string[];
    for (const field of ["task", "config", "threadId", "judges", "overall", "timestamp"]) {
      expect(required).toContain(field);
    }
  });

  test("frontmatter judge schema has correct title", () => {
    expect(EVAL_JUDGE_FRONTMATTER_SCHEMA.title).toBe("@uwf/eval-judge-frontmatter");
    const required = EVAL_JUDGE_FRONTMATTER_SCHEMA.required as string[];
    for (const field of ["stepsTotal", "stepsValid", "invalidSteps"]) {
      expect(required).toContain(field);
    }
  });

  test("upstream judge schema has correct title", () => {
    expect(EVAL_JUDGE_UPSTREAM_SCHEMA.title).toBe("@uwf/eval-judge-upstream");
    const required = EVAL_JUDGE_UPSTREAM_SCHEMA.required as string[];
    expect(required).toContain("perStep");
  });

  test("hallucination judge schema has correct title", () => {
    expect(EVAL_JUDGE_HALLUCINATION_SCHEMA.title).toBe("@uwf/eval-judge-hallucination");
    const required = EVAL_JUDGE_HALLUCINATION_SCHEMA.required as string[];
    expect(required).toContain("perStep");
  });

  test("token-stats judge schema has correct title", () => {
    expect(EVAL_JUDGE_TOKEN_STATS_SCHEMA.title).toBe("@uwf/eval-judge-token-stats");
    const required = EVAL_JUDGE_TOKEN_STATS_SCHEMA.required as string[];
    for (const field of ["totalInput", "totalOutput", "totalTurns", "perStep"]) {
      expect(required).toContain(field);
    }
  });

  test("all schemas have type object at root", () => {
    const rootTypes = [
      EVAL_RUN_SCHEMA,
      EVAL_JUDGE_FRONTMATTER_SCHEMA,
      EVAL_JUDGE_UPSTREAM_SCHEMA,
      EVAL_JUDGE_HALLUCINATION_SCHEMA,
      EVAL_JUDGE_TOKEN_STATS_SCHEMA,
    ].map((schema) => schema.type);

    for (const rootType of rootTypes) {
      expect(rootType).toBe("object");
    }
  });
});
|
||||
@@ -1,163 +0,0 @@
|
||||
import { describe, expect, test } from "vitest";
|
||||
import { parseTaskManifest } from "../src/task/index.js";
|
||||
|
||||
const VALID_YAML = `
|
||||
name: fix-off-by-one
|
||||
description: Fix an off-by-one error in a calculator
|
||||
workflow: solve-issue
|
||||
prompt: "Fix the bug: add(1,2) returns 4 instead of 3"
|
||||
limits:
|
||||
maxSteps: 15
|
||||
timeoutMinutes: 30
|
||||
judges:
|
||||
- name: frontmatter-compliance
|
||||
weight: 0.15
|
||||
builtin: true
|
||||
- name: test-pass
|
||||
weight: 0.3
|
||||
entry: dist/judges/test-pass.js
|
||||
schema: schemas/test-pass.json
|
||||
`;
|
||||
|
||||
describe("parseTaskManifest", () => {
|
||||
test("parses valid task.yaml", () => {
|
||||
const manifest = parseTaskManifest(VALID_YAML);
|
||||
expect(manifest.name).toBe("fix-off-by-one");
|
||||
expect(manifest.description).toBe("Fix an off-by-one error in a calculator");
|
||||
expect(manifest.workflow).toBe("solve-issue");
|
||||
expect(manifest.prompt).toBe("Fix the bug: add(1,2) returns 4 instead of 3");
|
||||
expect(manifest.limits).toEqual({ maxSteps: 15, timeoutMinutes: 30 });
|
||||
expect(manifest.judges).toHaveLength(2);
|
||||
});
|
||||
|
||||
test("parses builtin judge", () => {
|
||||
const manifest = parseTaskManifest(VALID_YAML);
|
||||
const builtin = manifest.judges[0];
|
||||
expect(builtin).toBeDefined();
|
||||
expect(builtin!.name).toBe("frontmatter-compliance");
|
||||
expect(builtin!.weight).toBe(0.15);
|
||||
expect(builtin!.builtin).toBe(true);
|
||||
expect(builtin!.entry).toBeNull();
|
||||
});
|
||||
|
||||
test("parses custom judge with entry + schema", () => {
|
||||
const manifest = parseTaskManifest(VALID_YAML);
|
||||
const custom = manifest.judges[1];
|
||||
expect(custom).toBeDefined();
|
||||
expect(custom!.name).toBe("test-pass");
|
||||
expect(custom!.weight).toBe(0.3);
|
||||
expect(custom!.builtin).toBe(false);
|
||||
expect(custom!.entry).toBe("dist/judges/test-pass.js");
|
||||
expect(custom!.schema).toBe("schemas/test-pass.json");
|
||||
});
|
||||
|
||||
test("defaults limits when omitted", () => {
|
||||
const yaml = `
|
||||
name: minimal
|
||||
workflow: solve-issue
|
||||
prompt: do something
|
||||
judges:
|
||||
- name: check
|
||||
builtin: true
|
||||
`;
|
||||
const manifest = parseTaskManifest(yaml);
|
||||
expect(manifest.limits).toEqual({ maxSteps: 20, timeoutMinutes: 30 });
|
||||
});
|
||||
|
||||
test("defaults description to empty string", () => {
|
||||
const yaml = `
|
||||
name: no-desc
|
||||
workflow: solve-issue
|
||||
prompt: do something
|
||||
judges:
|
||||
- name: check
|
||||
builtin: true
|
||||
`;
|
||||
const manifest = parseTaskManifest(yaml);
|
||||
expect(manifest.description).toBe("");
|
||||
});
|
||||
|
||||
test("rejects missing name", () => {
|
||||
const yaml = `
|
||||
workflow: solve-issue
|
||||
prompt: do something
|
||||
judges:
|
||||
- name: check
|
||||
builtin: true
|
||||
`;
|
||||
expect(() => parseTaskManifest(yaml)).toThrow("name is required");
|
||||
});
|
||||
|
||||
test("rejects missing workflow", () => {
|
||||
const yaml = `
|
||||
name: test
|
||||
prompt: do something
|
||||
judges:
|
||||
- name: check
|
||||
builtin: true
|
||||
`;
|
||||
expect(() => parseTaskManifest(yaml)).toThrow("workflow is required");
|
||||
});
|
||||
|
||||
test("rejects missing prompt", () => {
|
||||
const yaml = `
|
||||
name: test
|
||||
workflow: solve-issue
|
||||
judges:
|
||||
- name: check
|
||||
builtin: true
|
||||
`;
|
||||
expect(() => parseTaskManifest(yaml)).toThrow("prompt is required");
|
||||
});
|
||||
|
||||
test("rejects empty judges array", () => {
|
||||
const yaml = `
|
||||
name: test
|
||||
workflow: solve-issue
|
||||
prompt: do something
|
||||
judges: []
|
||||
`;
|
||||
expect(() => parseTaskManifest(yaml)).toThrow("at least one judge");
|
||||
});
|
||||
|
||||
test("rejects non-builtin judge without entry", () => {
|
||||
const yaml = `
|
||||
name: test
|
||||
workflow: solve-issue
|
||||
prompt: do something
|
||||
judges:
|
||||
- name: custom-check
|
||||
weight: 0.5
|
||||
`;
|
||||
expect(() => parseTaskManifest(yaml)).toThrow("non-builtin judge must have entry");
|
||||
});
|
||||
|
||||
test("rejects non-object YAML root", () => {
|
||||
expect(() => parseTaskManifest("just a string")).toThrow("must be a YAML mapping");
|
||||
});
|
||||
|
||||
test("rejects judge without name", () => {
|
||||
const yaml = `
|
||||
name: test
|
||||
workflow: solve-issue
|
||||
prompt: do something
|
||||
judges:
|
||||
- weight: 0.5
|
||||
builtin: true
|
||||
`;
|
||||
expect(() => parseTaskManifest(yaml)).toThrow("name is required");
|
||||
});
|
||||
|
||||
test("defaults weight to 0 when omitted", () => {
|
||||
const yaml = `
|
||||
name: test
|
||||
workflow: solve-issue
|
||||
prompt: do something
|
||||
judges:
|
||||
- name: token-stats
|
||||
builtin: true
|
||||
`;
|
||||
const manifest = parseTaskManifest(yaml);
|
||||
expect(manifest.judges[0]!.weight).toBe(0);
|
||||
});
|
||||
});
|
||||
@@ -1,45 +0,0 @@
|
||||
{
|
||||
"name": "@united-workforce/eval",
|
||||
"version": "0.1.3",
|
||||
"private": false,
|
||||
"files": [
|
||||
"src",
|
||||
"dist",
|
||||
"package.json"
|
||||
],
|
||||
"type": "module",
|
||||
"bin": {
|
||||
"uwf-eval": "./dist/cli.js"
|
||||
},
|
||||
"exports": {
|
||||
".": {
|
||||
"types": "./dist/index.d.ts",
|
||||
"import": "./dist/index.js"
|
||||
}
|
||||
},
|
||||
"scripts": {
|
||||
"test": "vitest run __tests__/",
|
||||
"test:ci": "vitest run __tests__/"
|
||||
},
|
||||
"dependencies": {
|
||||
"@ocas/core": "^0.3.0",
|
||||
"@ocas/fs": "^0.3.0",
|
||||
"@united-workforce/protocol": "workspace:^",
|
||||
"@united-workforce/util": "workspace:^",
|
||||
"commander": "^14.0.3",
|
||||
"yaml": "^2.9.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"typescript": "^5.8.3"
|
||||
},
|
||||
"repository": {
|
||||
"type": "git",
|
||||
"url": "https://git.shazhou.work/shazhou/united-workforce.git",
|
||||
"directory": "packages/eval"
|
||||
},
|
||||
"homepage": "https://git.shazhou.work/shazhou/united-workforce#readme",
|
||||
"bugs": {
|
||||
"url": "https://git.shazhou.work/shazhou/united-workforce/issues"
|
||||
},
|
||||
"license": "MIT"
|
||||
}
|
||||
@@ -1,25 +0,0 @@
|
||||
#!/usr/bin/env node
|
||||
import { Command } from "commander";
|
||||
import {
|
||||
registerDiffCommand,
|
||||
registerListCommand,
|
||||
registerReportCommand,
|
||||
registerRunCommand,
|
||||
} from "./commands/index.js";
|
||||
|
||||
// eslint-disable-next-line -- dynamic import for version
const pkg = await import("../package.json", { with: { type: "json" } });

// Root command; the version string comes from this package's package.json.
const program = new Command();

program
  .name("uwf-eval")
  .description("Evaluate uwf workflow quality with real agents")
  .version(pkg.default.version, "-V, --version");

// Each register* call attaches one subcommand to the root program.
registerRunCommand(program);
registerReportCommand(program);
registerDiffCommand(program);
registerListCommand(program);

// Subcommand actions are async, so use parseAsync and await it: a plain
// parse() returns before the handler's promise settles and never observes
// a rejection from it.
await program.parseAsync();
|
||||
@@ -1,38 +0,0 @@
|
||||
import { createLogger } from "@united-workforce/util";
|
||||
import type { Command } from "commander";
|
||||
|
||||
import { createEvalStore } from "../storage/index.js";
|
||||
import { formatDiff } from "./format.js";
|
||||
import { readEvalRun } from "./read.js";
|
||||
|
||||
const log = createLogger({ sink: { kind: "stderr" } });
|
||||
const LOG_DIFF = "D3WZ8N5T";
|
||||
|
||||
/**
 * Attach the `diff <hash1> <hash2>` subcommand to `program`.
 *
 * The action loads both eval runs and writes their comparison to stdout.
 * A missing run or a thrown error is reported on stderr and sets the process
 * exit code to 1; the process is not terminated explicitly.
 *
 * @param program - Commander program to register the subcommand on.
 */
export function registerDiffCommand(program: Command): void {
  /** Print one line to stderr and mark the process as failed. */
  const fail = (message: string): void => {
    process.stderr.write(`${message}\n`);
    process.exitCode = 1;
  };

  const runDiff = async (hash1: string, hash2: string): Promise<void> => {
    try {
      const evalStore = await createEvalStore();

      const payloadA = readEvalRun(evalStore, hash1);
      if (payloadA === null) {
        fail(`eval run not found: ${hash1}`);
        return;
      }

      const payloadB = readEvalRun(evalStore, hash2);
      if (payloadB === null) {
        fail(`eval run not found: ${hash2}`);
        return;
      }

      log(LOG_DIFF, `diff a=${hash1} b=${hash2}`);
      process.stdout.write(formatDiff(payloadA, hash1, payloadB, hash2));
    } catch (e) {
      fail(e instanceof Error ? e.message : String(e));
    }
  };

  program
    .command("diff <hash1> <hash2>")
    .description("Compare two eval runs side-by-side")
    .action(runDiff);
}
|
||||
@@ -1,148 +0,0 @@
|
||||
import type { EvalRunPayload } from "../storage/index.js";
|
||||
import type { EvalListEntry } from "./types.js";
|
||||
|
||||
const NAME_WIDTH = 28;
|
||||
const SCORE_WIDTH = 10;
|
||||
const TIMESTAMP_WIDTH = 26;
|
||||
|
||||
/**
 * Render a score or weight as fixed-point text with four fractional digits.
 *
 * @param value - Number to render (scores and weights are expected in 0..1).
 * @returns The value formatted by `toFixed(4)`, e.g. `0.8` becomes `"0.8000"`.
 */
function formatScore(value: number): string {
  const fractionDigits = 4;
  const rendered = value.toFixed(fractionDigits);
  return rendered;
}
|
||||
|
||||
/**
 * Human-readable ISO-8601 timestamp from epoch milliseconds.
 *
 * `Date.prototype.toISOString` throws a RangeError for an invalid date
 * (NaN, or a value outside the representable range). A single bad stored
 * timestamp would otherwise abort rendering of a whole report or list, so
 * such values render as a placeholder instead.
 *
 * @param ms - Milliseconds since the Unix epoch.
 * @returns The ISO-8601 UTC string, or `"(invalid timestamp)"` when `ms`
 *   does not denote a valid date.
 */
function formatTimestamp(ms: number): string {
  const date = new Date(ms);
  // getTime() is NaN exactly when the Date is invalid.
  if (Number.isNaN(date.getTime())) {
    return "(invalid timestamp)";
  }
  return date.toISOString();
}
|
||||
|
||||
/**
 * Left-align `value` in a column of `width` characters.
 *
 * A value that already fills or overflows the column gets exactly one
 * trailing space, so adjacent columns never touch.
 *
 * @param value - Cell text.
 * @param width - Column width in UTF-16 code units.
 * @returns The padded cell text.
 */
function pad(value: string, width: number): string {
  if (value.length >= width) {
    return value + " ";
  }
  return value.padEnd(width);
}
|
||||
|
||||
/**
 * Directional indicator for a score delta (B relative to A).
 *
 * The direction is decided on the delta after rounding to display precision,
 * so the arrow always agrees with the printed magnitude. A sub-precision
 * difference (for example floating-point noise such as `0.1 + 0.2 - 0.3`)
 * renders as `= 0.0000` rather than `▲ +0.0000` or `▼ -0.0000`.
 *
 * @param delta - Score of B minus score of A.
 * @returns `▲ +x` for an increase, `▼ -x` for a decrease, `= 0.0000` otherwise.
 */
function formatDelta(delta: number): string {
  const text = formatScore(delta);
  // Parse the displayed text back so direction is judged at display precision.
  const shown = Number(text);
  if (shown > 0) {
    return `▲ +${text}`;
  }
  if (shown < 0) {
    // `text` already carries its own minus sign.
    return `▼ ${text}`;
  }
  return `= ${formatScore(0)}`;
}
|
||||
|
||||
/**
 * Build the plain-text report for one eval run.
 *
 * Sections, in order: header (task, overall score, timestamp), config,
 * a judge table (name, score, weight), then the thread id and run hash.
 *
 * @param payload - The eval run to describe.
 * @param runHash - Hash identifying the run; printed on the last line.
 * @returns The report text, newline-terminated.
 */
export function formatReport(payload: EvalRunPayload, runHash: string): string {
  const { config } = payload;

  const report: string[] = [
    "=== Eval Report ===",
    `Task: ${payload.task}`,
    `Overall: ${formatScore(payload.overall)}`,
    `Timestamp: ${formatTimestamp(payload.timestamp)}`,
    "",
    "Config:",
    ` Agent: ${config.agent}`,
    ` Model: ${config.model}`,
    ` Engine: ${config.engineVersion}`,
    "",
    "Judges:",
    ` ${pad("NAME", NAME_WIDTH)}${pad("SCORE", SCORE_WIDTH)}WEIGHT`,
    // One table row per judge, in payload order.
    ...payload.judges.map(
      ({ name, score, weight }) =>
        ` ${pad(name, NAME_WIDTH)}${pad(formatScore(score), SCORE_WIDTH)}${formatScore(weight)}`,
    ),
    "",
    `Thread: ${payload.threadId}`,
    `Run: ${runHash}`,
  ];

  return `${report.join("\n")}\n`;
}
|
||||
|
||||
/**
 * Build the plain-text comparison of two eval runs.
 *
 * Sections, in order: the two run identities, the overall scores with their
 * delta (B minus A), the config values with an equality marker, and a
 * per-judge table. A judge present in only one run shows `—` for the missing
 * side and no delta.
 *
 * @param payloadA - Baseline run.
 * @param hashA - Hash of the baseline run.
 * @param payloadB - Run compared against the baseline.
 * @param hashB - Hash of the compared run.
 * @returns The comparison text, newline-terminated.
 */
export function formatDiff(
  payloadA: EvalRunPayload,
  hashA: string,
  payloadB: EvalRunPayload,
  hashB: string,
): string {
  /** Index a run's judge scores by judge name (a later duplicate name wins). */
  const scoresByName = (payload: EvalRunPayload): Map<string, number> => {
    const scores = new Map<string, number>();
    for (const { name, score } of payload.judges) {
      scores.set(name, score);
    }
    return scores;
  };
  /** Table cell for a score that may be absent from one side. */
  const cell = (score: number | undefined): string =>
    score === undefined ? "—" : formatScore(score);

  const fromA = scoresByName(payloadA);
  const fromB = scoresByName(payloadB);

  const judgeRows = unionJudgeNames(payloadA, payloadB).map((name) => {
    const a = fromA.get(name);
    const b = fromB.get(name);
    // A delta is only meaningful when both runs scored this judge.
    const change = a === undefined || b === undefined ? "" : formatDelta(b - a);
    return ` ${pad(name, NAME_WIDTH)}${pad(cell(a), SCORE_WIDTH)}${pad(cell(b), SCORE_WIDTH)}${change}`;
  });

  const output: string[] = [
    "=== Eval Diff ===",
    `A: ${hashA} (${payloadA.task})`,
    `B: ${hashB} (${payloadB.task})`,
    "",
    "Overall:",
    ` A=${formatScore(payloadA.overall)} B=${formatScore(payloadB.overall)} ${formatDelta(payloadB.overall - payloadA.overall)}`,
    "",
    "Config:",
    configLine("Agent", payloadA.config.agent, payloadB.config.agent),
    configLine("Model", payloadA.config.model, payloadB.config.model),
    configLine("Engine", payloadA.config.engineVersion, payloadB.config.engineVersion),
    "",
    "Judges:",
    ` ${pad("NAME", NAME_WIDTH)}${pad("A", SCORE_WIDTH)}${pad("B", SCORE_WIDTH)}DELTA`,
    ...judgeRows,
  ];

  return `${output.join("\n")}\n`;
}
|
||||
|
||||
/**
 * Build the plain-text table of indexed eval runs.
 *
 * The header row is always present. An empty input yields a single
 * placeholder row instead of data rows.
 *
 * @param entries - Rows to print, already ordered and filtered by the caller.
 * @returns The table text, newline-terminated.
 */
export function formatList(entries: ReadonlyArray<EvalListEntry>): string {
  const header = ` ${pad("TASK", NAME_WIDTH)}${pad("OVERALL", SCORE_WIDTH)}${pad("TIMESTAMP", TIMESTAMP_WIDTH)}HASH`;

  const body =
    entries.length === 0
      ? [" (no eval runs found)"]
      : entries.map(
          ({ task, overall, timestamp, hash }) =>
            ` ${pad(task, NAME_WIDTH)}${pad(formatScore(overall), SCORE_WIDTH)}${pad(formatTimestamp(timestamp), TIMESTAMP_WIDTH)}${hash}`,
        );

  return `${[header, ...body].join("\n")}\n`;
}
|
||||
|
||||
/**
 * Apply the optional task filter, sort newest-first, then cap the result.
 *
 * The input array is never mutated. Filtering happens before sorting so only
 * matching entries are sorted; because the sort is stable, the result is the
 * same as sorting first.
 *
 * @param entries - Candidate entries, in any order.
 * @param task - Keep only entries for this task, or `null` for all tasks.
 * @param limit - Maximum number of entries to return, or `null` for no cap.
 *   A negative limit returns no entries: without the clamp, `slice(0, -n)`
 *   would instead drop the `n` oldest entries.
 * @returns A new array, ordered by descending timestamp.
 */
export function selectEntries(
  entries: ReadonlyArray<EvalListEntry>,
  task: string | null,
  limit: number | null,
): EvalListEntry[] {
  const matching = task === null ? [...entries] : entries.filter((entry) => entry.task === task);
  // `matching` is a fresh array in both branches, so sorting in place is safe.
  matching.sort((a, b) => b.timestamp - a.timestamp);
  if (limit === null) {
    return matching;
  }
  return matching.slice(0, Math.max(0, limit));
}
|
||||
|
||||
/**
 * Distinct judge names across both runs, in first-appearance order:
 * every name from A in A's order, followed by names that occur only in B.
 *
 * @param payloadA - First run; its judge order takes precedence.
 * @param payloadB - Second run; contributes only names not already seen.
 * @returns The de-duplicated list of judge names.
 */
function unionJudgeNames(payloadA: EvalRunPayload, payloadB: EvalRunPayload): string[] {
  const allNames = [...payloadA.judges, ...payloadB.judges].map((judge) => judge.name);
  // A Set iterates in insertion order, which gives the required ordering.
  return Array.from(new Set<string>(allNames));
}
|
||||
|
||||
/**
 * Render one config comparison row.
 *
 * @param label - Field label; printed with a trailing colon in a padded column.
 * @param valueA - Value from run A.
 * @param valueB - Value from run B.
 * @returns The row text, marked `=` when both values are strictly equal and `≠` otherwise.
 */
function configLine(label: string, valueA: string, valueB: string): string {
  const labelCell = pad(`${label}:`, SCORE_WIDTH);
  const marker = valueA !== valueB ? "≠" : "=";
  return " " + labelCell + marker + " A=" + valueA + " B=" + valueB;
}
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user