Compare commits
140 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 69ec8c2c5e | |||
| 81aa282c92 | |||
| a620defbcf | |||
| 439891f6b6 | |||
| df244c52e8 | |||
| cb6e0d6a11 | |||
| 9d0c6df62c | |||
| 0f5bb1f191 | |||
| 00d960daba | |||
| 3a26285872 | |||
| 13c0812944 | |||
| 2e7e5f6ec4 | |||
| 88c077d439 | |||
| aaadab4445 | |||
| adf7837975 | |||
| 513846f4ab | |||
| aee123cc82 | |||
| 8ddada5879 | |||
| aa732f5466 | |||
| e354fc4341 | |||
| 0e7e3ea44b | |||
| aa454c85dd | |||
| 6dd7d521be | |||
| 950dc056d8 | |||
| d360b85374 | |||
| 509dfad857 | |||
| 58b84e3b3c | |||
| f821ac99f4 | |||
| 2c4700c49f | |||
| 4410afcd4a | |||
| a0e254a681 | |||
| dd77b40f6c | |||
| 5ed6f68e4b | |||
| 1ed0bf1f76 | |||
| d97840cf8d | |||
| b560818f1a | |||
| f989dee85b | |||
| 7e4a59de7e | |||
| 68079cc003 | |||
| 1a37928bb9 | |||
| 57511a93fe | |||
| adc3982a4a | |||
| 4580388270 | |||
| caba82fe36 | |||
| 6aee2ed5ef | |||
| 709b9dc1e5 | |||
| 7a788a9d90 | |||
| e5af5e9027 | |||
| fde87b6274 | |||
| a33f12c74f | |||
| 0ad10b9b6d | |||
| 3be92bfac2 | |||
| 8d6f480b0f | |||
| 5450bc1230 | |||
| f1f122b0b1 | |||
| 57ae6d1755 | |||
| d64d150071 | |||
| c5eb8b79d1 | |||
| 36a3ca6a08 | |||
| eb0b7b514f | |||
| a47871ec4e | |||
| 5851e5d162 | |||
| 61dfb40933 | |||
| fbfd31a042 | |||
| d99a376b60 | |||
| a536efee00 | |||
| 9260d81084 | |||
| c8d884072a | |||
| abeb465f46 | |||
| 28427a973f | |||
| 794f9db568 | |||
| cd585a26f1 | |||
| 1cf8f350d0 | |||
| 427568a21d | |||
| d3a2353acf | |||
| 8085d1d6e0 | |||
| 8764d7bda3 | |||
| 850a3b2f25 | |||
| 3d6a517e83 | |||
| 825f0c641a | |||
| 81bbe1178f | |||
| a0e139935e | |||
| a08775896f | |||
| c892b9125b | |||
| 8c5e12c5c8 | |||
| 5edb67b79d | |||
| 3d8df5c8e2 | |||
| 63cb4d3645 | |||
| f373945304 | |||
| ae81e4b5ac | |||
| 8c26f16716 | |||
| fae9e9ed3a | |||
| 99619d85db | |||
| b94234652a | |||
| 1593dbb521 | |||
| d1c523c442 | |||
| 4283e6766b | |||
| 4e4fb61ff5 | |||
| be92cb2dd2 | |||
| 7681e8b8e2 | |||
| 780005ad65 | |||
| 248ac710fd | |||
| 172c232e61 | |||
| 5fe97591de | |||
| 99f40c2488 | |||
| bf489c59a5 | |||
| 9908d069ec | |||
| 83bcda60ff | |||
| 17f7f44c43 | |||
| 3401873051 | |||
| 7fc02e50c0 | |||
| 18170a4313 | |||
| 1ce0b9b9ee | |||
| 8bf5b88172 | |||
| 9fbdd1dd2c | |||
| 66c2e2a79b | |||
| 58b58d511e | |||
| 596c05bfcc | |||
| d26f54e8ea | |||
| 883bd79bcb | |||
| 63454a4cfd | |||
| 5fe492c011 | |||
| 9f5891169e | |||
| 0470d9445a | |||
| 07128b89af | |||
| 1fdeb716ca | |||
| 1b99f0e2c1 | |||
| f56e24cf82 | |||
| 974c2b8f1b | |||
| 6e7276425d | |||
| dbb7885ffd | |||
| cd7e4e77ff | |||
| 64a8bab5ce | |||
| 80e8efb05e | |||
| 75fb752a82 | |||
| 06af1dc668 | |||
| bbea89c067 | |||
| bda3e3a861 | |||
| ca7b68ca5f | |||
| 23e2ae9eb4 |
@@ -1,8 +0,0 @@
|
||||
# Changesets
|
||||
|
||||
Hello and welcome! This folder has been automatically generated by `@changesets/cli`, a build tool that works
|
||||
with multi-package repos, or single-package repos to help you version and publish your code. You can
|
||||
find the full documentation for it [in our repository](https://github.com/changesets/changesets).
|
||||
|
||||
We have a quick list of common questions to get you started engaging with this project in
|
||||
[our documentation](https://github.com/changesets/changesets/blob/main/docs/common-questions.md).
|
||||
@@ -1,11 +0,0 @@
|
||||
{
|
||||
"$schema": "https://unpkg.com/@changesets/config@3.1.4/schema.json",
|
||||
"changelog": "@changesets/cli/changelog",
|
||||
"commit": false,
|
||||
"fixed": [["@united-workforce/*"]],
|
||||
"linked": [],
|
||||
"access": "public",
|
||||
"baseBranch": "main",
|
||||
"updateInternalDependencies": "patch",
|
||||
"ignore": ["@united-workforce/dashboard"]
|
||||
}
|
||||
@@ -1,30 +0,0 @@
|
||||
{
|
||||
"mode": "exit",
|
||||
"tag": "alpha",
|
||||
"initialVersions": {
|
||||
"@uncaged/cli": "0.4.5",
|
||||
"@uncaged/workflow-agent-cursor": "0.4.5",
|
||||
"@uncaged/agent-hermes": "0.4.5",
|
||||
"@uncaged/workflow-agent-llm": "0.4.5",
|
||||
"@uncaged/workflow-agent-react": "0.4.5",
|
||||
"@uncaged/workflow-cas": "0.4.5",
|
||||
"@uncaged/dashboard": "0.1.0",
|
||||
"@uncaged/workflow-execute": "0.4.5",
|
||||
"@uncaged/workflow-gateway": "0.4.5",
|
||||
"@uncaged/protocol": "0.4.5",
|
||||
"@uncaged/workflow-reactor": "0.4.5",
|
||||
"@uncaged/workflow-register": "0.4.5",
|
||||
"@uncaged/workflow-runtime": "0.4.5",
|
||||
"@uncaged/workflow-template-develop": "0.4.5",
|
||||
"@uncaged/workflow-template-solve-issue": "0.4.5",
|
||||
"@uncaged/util": "0.4.5",
|
||||
"@uncaged/util-agent": "0.4.5"
|
||||
},
|
||||
"changesets": [
|
||||
"env-api-unify",
|
||||
"fix-internal-deps",
|
||||
"fix-publish-src",
|
||||
"fix-workspace-deps",
|
||||
"rfc-252-agent-fn"
|
||||
]
|
||||
}
|
||||
@@ -12,15 +12,17 @@ jobs:
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: oven-sh/setup-bun@v2
|
||||
- uses: actions/setup-node@v4
|
||||
with:
|
||||
node-version: 22
|
||||
|
||||
- run: bun install
|
||||
- run: corepack enable && pnpm install
|
||||
|
||||
- name: Build
|
||||
run: bun run build
|
||||
run: pnpm run build
|
||||
|
||||
- name: Lint
|
||||
run: bun run check
|
||||
run: pnpm run check
|
||||
|
||||
- name: Test
|
||||
run: bun run test:ci
|
||||
run: pnpm run test:ci
|
||||
|
||||
@@ -0,0 +1,226 @@
|
||||
# Eval Framework Implementation Plan
|
||||
|
||||
## Goal
|
||||
|
||||
Build `uwf-eval` CLI + eval task infrastructure for evaluating uwf workflow quality with real agents.
|
||||
|
||||
## Architecture
|
||||
|
||||
```
|
||||
uwf-eval (runner) task package (npm) OCAS (storage)
|
||||
│ │ │
|
||||
├─ unpack tarball ───────► fixture/ → tmp cwd │
|
||||
├─ read task.yaml │ │
|
||||
├─ uwf thread start/exec │ │
|
||||
├─ run judges ───────────► dist/judges/*.js │
|
||||
├─ collect scores │ │
|
||||
└─ store results ─────────────────────────────────────► CAS nodes + variables
|
||||
```
|
||||
|
||||
### Key Design Decisions
|
||||
|
||||
- **uwf-eval is NOT part of uwf** — separate package, shells out to uwf CLI
|
||||
- **Task = npm package** — fixture + task.yaml + judge scripts, distributable as tarball
|
||||
- **Judge = Node script** — `node <entry> <cwd> <thread-id>`, outputs `{score, data}` JSON
|
||||
- **Every output is OCAS typed** — eval-run, judge results all have registered schemas
|
||||
- **Builtin judges** — frontmatter compliance, upstream consumption, hallucination, token stats
|
||||
- **Task-specific judges** — bundled in the task package, custom schema per judge
|
||||
|
||||
## Deliverables
|
||||
|
||||
### Phase 1: Foundation (`@united-workforce/eval`)
|
||||
|
||||
New package in the uwf monorepo.
|
||||
|
||||
```
|
||||
packages/eval/
|
||||
src/
|
||||
cli.ts # uwf-eval entry point
|
||||
commands/
|
||||
run.ts # uwf-eval run
|
||||
report.ts # uwf-eval report <hash>
|
||||
diff.ts # uwf-eval diff <hash> <hash>
|
||||
list.ts # uwf-eval list
|
||||
runner/
|
||||
prepare.ts # unpack tarball/dir → tmp cwd
|
||||
execute.ts # shell out to uwf thread start/exec
|
||||
collect.ts # run judges, collect scores
|
||||
judge/
|
||||
types.ts # JudgeInput, JudgeOutput types
|
||||
builtin/
|
||||
frontmatter.ts # frontmatter compliance check
|
||||
upstream.ts # upstream info consumption (LLM-as-judge)
|
||||
hallucination.ts # hallucination detection (LLM-as-judge)
|
||||
token-stats.ts # token usage from $usage field (#68)
|
||||
storage/
|
||||
schemas.ts # OCAS schema definitions
|
||||
store.ts # CAS read/write helpers
|
||||
index.ts # variable indexing (@uwf/eval/*)
|
||||
task/
|
||||
types.ts # TaskManifest type (task.yaml)
|
||||
loader.ts # parse task.yaml, validate
|
||||
package.json
|
||||
tsconfig.json
|
||||
```
|
||||
|
||||
#### OCAS Schemas to Register
|
||||
|
||||
1. `@uwf/eval-run` — full eval execution record
|
||||
```
|
||||
{ task, config: {agent, model, engineVersion}, threadId,
|
||||
judges: [{name, score, weight, dataHash}], overall, timestamp }
|
||||
```
|
||||
|
||||
2. `@uwf/eval-judge-frontmatter` — frontmatter judge data
|
||||
```
|
||||
{ stepsTotal, stepsValid, invalidSteps: [{stepIndex, role, errors: string[]}] }
|
||||
```
|
||||
|
||||
3. `@uwf/eval-judge-upstream` — upstream consumption judge data
|
||||
```
|
||||
{ perStep: [{role, consumed: string[], missed: string[], score}] }
|
||||
```
|
||||
|
||||
4. `@uwf/eval-judge-hallucination` — hallucination judge data
|
||||
```
|
||||
{ perStep: [{role, hallucinations: string[], score}] }
|
||||
```
|
||||
|
||||
5. `@uwf/eval-judge-token-stats` — token stats (not scored, informational)
|
||||
```
|
||||
{ totalInput, totalOutput, totalTurns, perStep: [{role, input, output, turns, duration}] }
|
||||
```
|
||||
|
||||
#### CLI Design
|
||||
|
||||
```bash
|
||||
# Run eval
|
||||
uwf-eval run <task-dir-or-tarball> [--agent hermes] [--model claude-sonnet-4] [--count 20]
|
||||
|
||||
# View results
|
||||
uwf-eval report <run-hash> # render via ocas render
|
||||
uwf-eval diff <hash1> <hash2> # side-by-side comparison
|
||||
uwf-eval list # list past runs
|
||||
```
|
||||
|
||||
### Phase 2: Task Package Scaffold
|
||||
|
||||
Template for creating eval tasks. Also serves as the first real task.
|
||||
|
||||
```
|
||||
eval-tasks/ # shazhou/uwf-eval-tasks monorepo
|
||||
packages/
|
||||
_template/ # copypaste template
|
||||
package.json
|
||||
task.yaml
|
||||
fixture/
|
||||
src/judges/
|
||||
tsconfig.json
|
||||
fix-off-by-one/ # first real task
|
||||
package.json # @uwf-eval/fix-off-by-one
|
||||
task.yaml
|
||||
fixture/
|
||||
src/calc.ts # buggy calculator
|
||||
src/calc.test.ts # test that exposes the bug
|
||||
package.json
|
||||
src/judges/
|
||||
test-pass.ts # runs pnpm test, checks exit code
|
||||
code-quality.ts # LLM judge: minimal change, correct fix
|
||||
schemas/
|
||||
test-pass.json # OCAS schema for test-pass data
|
||||
code-quality.json # OCAS schema for code-quality data
|
||||
tsconfig.json
|
||||
pnpm-workspace.yaml
|
||||
tsconfig.json
|
||||
biome.json
|
||||
```
|
||||
|
||||
#### task.yaml Format
|
||||
|
||||
```yaml
|
||||
name: fix-off-by-one
|
||||
description: Fix an off-by-one error in a calculator's add function
|
||||
workflow: solve-issue # registered workflow name, or relative path to .yaml
|
||||
prompt: "Fix the bug: add(1,2) returns 4 instead of 3"
|
||||
limits:
|
||||
maxSteps: 15
|
||||
timeoutMinutes: 30
|
||||
judges:
|
||||
- name: frontmatter-compliance
|
||||
weight: 0.15
|
||||
builtin: true
|
||||
- name: upstream-consumption
|
||||
weight: 0.15
|
||||
builtin: true
|
||||
- name: hallucination
|
||||
weight: 0.1
|
||||
builtin: true
|
||||
- name: token-stats
|
||||
weight: 0 # informational, not scored
|
||||
builtin: true
|
||||
- name: test-pass
|
||||
weight: 0.3
|
||||
entry: dist/judges/test-pass.js
|
||||
schema: schemas/test-pass.json
|
||||
- name: code-quality
|
||||
weight: 0.3
|
||||
entry: dist/judges/code-quality.js
|
||||
schema: schemas/code-quality.json
|
||||
```
|
||||
|
||||
#### Judge Script Contract
|
||||
|
||||
```typescript
|
||||
// Input: process.argv = [node, script, cwd, threadId]
|
||||
// Output: stdout JSON
|
||||
// Exit 0 = success, non-zero = judge error (not low score)
|
||||
|
||||
import type { JudgeOutput } from "@united-workforce/eval";
|
||||
|
||||
const result: JudgeOutput<TestPassData> = {
|
||||
score: 1.0, // 0.0 - 1.0
|
||||
data: { // typed per judge schema
|
||||
command: "pnpm test",
|
||||
exitCode: 0,
|
||||
output: "3 tests passed"
|
||||
}
|
||||
};
|
||||
|
||||
console.log(JSON.stringify(result));
|
||||
```
|
||||
|
||||
### Phase 3: Prerequisite — $usage in Adapter Protocol (#68)
|
||||
|
||||
Blocked by #68. Token stats judge needs `$usage` in step nodes.
|
||||
|
||||
Phases 1 and 2 can proceed without it — the token-stats judge simply returns zeros until adapters report usage.
|
||||
|
||||
## Implementation Order
|
||||
|
||||
1. **Phase 1a**: `@united-workforce/eval` package scaffold + CLI skeleton + OCAS schemas
|
||||
2. **Phase 1b**: `run` command — prepare, execute, collect flow
|
||||
3. **Phase 1c**: Builtin judges — frontmatter (deterministic), upstream + hallucination (LLM-as-judge)
|
||||
4. **Phase 2a**: Create `shazhou/uwf-eval-tasks` monorepo with proman
|
||||
5. **Phase 2b**: First task `fix-off-by-one` with fixture repo + 2 custom judges
|
||||
6. **Phase 2c**: End-to-end test: `uwf-eval run packages/fix-off-by-one --agent hermes`
|
||||
7. **Phase 1d**: `report`, `diff`, `list` commands (read from CAS, render via ocas render)
|
||||
|
||||
## Dependencies
|
||||
|
||||
- `@ocas/core` + `@ocas/fs` — CAS storage
|
||||
- `@united-workforce/protocol` — step node types
|
||||
- `commander` — CLI framework (consistent with uwf)
|
||||
- LLM API access — for LLM-as-judge (upstream, hallucination, task-specific quality judges)
|
||||
|
||||
## Open Questions
|
||||
|
||||
1. **LLM-as-judge provider config** — reuse uwf's `~/.uwf/config.yaml` provider settings? Or separate config?
|
||||
2. **Workflow file location** — task.yaml references a workflow. Should the workflow YAML be inside the tarball, or reference a registered workflow by name?
|
||||
3. **Non-coding tasks** — the debate workflow has no fixture repo. task.yaml should either set `fixture: null` or the task package should simply omit the `fixture/` dir; the runner then creates an empty cwd.
|
||||
4. **Parallel judge execution** — judges are independent, can run in parallel. Worth the complexity?
|
||||
|
||||
## Risks
|
||||
|
||||
- LLM-as-judge consistency — same input may get different scores. Mitigation: run judge multiple times, take average? Or accept variance.
|
||||
- Token cost of judges — each LLM judge call costs tokens. A 10-step workflow with 2 LLM judges means 20 LLM calls just for judging. Acceptable?
|
||||
- Fixture repo drift — if the fixture evolves, old eval runs become non-comparable. Mitigation: pin the fixture version in task.yaml.
|
||||
@@ -1,246 +0,0 @@
|
||||
name: "solve-issue"
|
||||
description: "TDD-driven issue resolution for small, focused changes. Loop protection relies on engine maxRounds."
|
||||
roles:
|
||||
planner:
|
||||
description: "Analyzes issue and outputs a TDD test spec"
|
||||
goal: "You are a planning agent. You analyze Gitea issues and produce a TDD test specification that downstream roles will implement and verify."
|
||||
capabilities:
|
||||
- issue-analysis
|
||||
- planning
|
||||
procedure: |
|
||||
On first run (no previous steps):
|
||||
1. Read the issue and all comments from Gitea using `tea issues <number> -r <owner/repo>`
|
||||
2. Look for project conventions files (CLAUDE.md, CONTRIBUTING.md, .cursor/rules/) in the repo
|
||||
3. Assess whether the issue has enough information to produce a test spec
|
||||
4. If insufficient info: comment on the issue via `echo "..." | tea comment <number> -r <owner/repo>` (skip if you already commented), then output $status=insufficient_info
|
||||
5. If sufficient: produce a detailed TDD test spec in markdown covering all scenarios
|
||||
|
||||
On subsequent runs (bounced back by tester with fix_spec):
|
||||
1. Read the tester's output from the previous step to understand what's wrong with the spec
|
||||
2. Revise the test spec accordingly
|
||||
|
||||
After producing the test spec:
|
||||
1. The test spec is stored in CAS automatically by the uwf pipeline (agents do not need to call `ocas put` directly)
|
||||
2. Put the plan hash in frontmatter.plan (required when $status=ready)
|
||||
3. Set repoPath to the absolute path of the repository root
|
||||
|
||||
IMPORTANT: Extract the repo remote (owner/repo) from git:
|
||||
```bash
|
||||
git remote get-url origin | sed 's|.*[:/]\([^/]*/[^.]*\).*|\1|'
|
||||
```
|
||||
Store the result as repoRemote in your frontmatter output so downstream roles can use it for tea/API calls.
|
||||
output: "Output a brief summary of the test spec. Set $status to ready (with plan hash and repoPath) or insufficient_info."
|
||||
frontmatter:
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: "ready" }
|
||||
plan: { type: string }
|
||||
repoPath: { type: string }
|
||||
repoRemote: { type: string }
|
||||
required: [$status, plan, repoPath, repoRemote]
|
||||
- properties:
|
||||
$status: { const: "insufficient_info" }
|
||||
reason: { type: string }
|
||||
required: [$status, reason]
|
||||
developer:
|
||||
description: "TDD implementation per test spec"
|
||||
goal: "You are a developer agent. You implement code changes following TDD — write tests first, then implementation."
|
||||
capabilities:
|
||||
- coding
|
||||
procedure: |
|
||||
IMPORTANT: Always work in a git worktree, NEVER modify the main working directory directly.
|
||||
The repo path and other details are provided in your task prompt.
|
||||
|
||||
Before starting any work, set up an isolated worktree:
|
||||
1. cd into the repo path provided in your task prompt
|
||||
2. `git fetch origin` to get latest refs
|
||||
3. First time (no existing branch):
|
||||
- `git worktree add .worktrees/fix/<issue-number>-<short-slug> -b fix/<issue-number>-<short-slug> origin/main`
|
||||
- `cd .worktrees/fix/<issue-number>-<short-slug> && bun install`
|
||||
4. If bounced back from reviewer or tester (branch already exists):
|
||||
- cd into the existing worktree under `.worktrees/fix/<issue-number>-<short-slug>`
|
||||
- `git fetch origin && git rebase origin/main`
|
||||
5. ALL subsequent work must happen inside the worktree directory.
|
||||
|
||||
Then implement TDD:
|
||||
6. Read the test spec from CAS: `ocas get <plan hash>` (find the hash from the planner's output in your task prompt)
|
||||
7. If bounced back from reviewer or tester: read the previous role's feedback in your task prompt
|
||||
8. Write tests first based on the spec
|
||||
9. Implement the code to make tests pass
|
||||
10. Ensure `bun run build` passes with no errors
|
||||
11. Run `bun test` to verify all tests pass
|
||||
- If tests fail on first run:
|
||||
* Read the test output carefully for missing imports or setup issues
|
||||
* Check if you're running tests from the correct working directory (package root vs workspace root)
|
||||
* Fix the immediate issue and rerun ONCE
|
||||
* If tests still fail after 2 attempts: check the test spec for ambiguities
|
||||
* If stuck after 3 test cycles: set $status=failed with detailed error report rather than continuing blind retries
|
||||
12. MANDATORY VERIFICATION before reporting done:
|
||||
- Run `git branch --show-current` and confirm branch name matches expected
|
||||
- Run `git status` and verify changed files exist
|
||||
- Run `ls -la <key-implementation-files>` to verify they exist on disk
|
||||
- If ANY verification fails: retry the implementation, do NOT report done
|
||||
|
||||
If you cannot complete the implementation (e.g. the issue is too complex, blocked by external factors,
|
||||
or repeated attempts fail), set $status=failed with a reason.
|
||||
output: "List all files changed and provide a summary. Set $status to done (with branch/worktree), or failed (with reason)."
|
||||
frontmatter:
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: "done" }
|
||||
branch: { type: string }
|
||||
worktree: { type: string }
|
||||
repoRemote: { type: string }
|
||||
required: [$status, branch, worktree]
|
||||
- properties:
|
||||
$status: { const: "failed" }
|
||||
reason: { type: string }
|
||||
required: [$status, reason]
|
||||
reviewer:
|
||||
description: "Code standards compliance check"
|
||||
goal: "You are a code reviewer. You verify code standards compliance — NOT functionality (that's the tester's job)."
|
||||
capabilities:
|
||||
- code-review
|
||||
- static-analysis
|
||||
procedure: |
|
||||
The worktree path is provided in your task prompt. cd into it first.
|
||||
|
||||
CRITICAL: You MUST execute every verification command below. Do NOT report results without running the actual commands. Do NOT rely on prior context or assumptions.
|
||||
|
||||
Before reviewing, verify the worktree and branch exist:
|
||||
0. Run `cd <worktree-path> && pwd` to confirm the path is accessible
|
||||
- If the cd fails: the worktree truly doesn't exist, reject with that reason
|
||||
- If the cd succeeds: proceed with step 1 below
|
||||
1. Run `git branch --show-current` — confirm the branch name references the issue number being worked on
|
||||
2. If the branch doesn't correspond to the issue, flag it in your output and reject
|
||||
|
||||
Then perform code review:
|
||||
Hard checks (must all pass):
|
||||
3. `bun run build` — no build errors
|
||||
4. `bunx biome check` — no lint violations
|
||||
5. TypeScript strict mode — no type errors
|
||||
|
||||
Soft checks (review against project conventions if CLAUDE.md / .cursor/rules exist):
|
||||
- Naming conventions, module boundaries, code style
|
||||
- No `console.log` in production code
|
||||
- No dynamic imports in production code
|
||||
|
||||
Only review standards compliance. Do NOT test functionality.
|
||||
If rejecting, you MUST explain the specific reason in your output.
|
||||
output: "Explain your decision with specific file/line references. Set $status to approved (with branch/worktree) or rejected (with comments)."
|
||||
frontmatter:
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: "approved" }
|
||||
branch: { type: string }
|
||||
worktree: { type: string }
|
||||
repoRemote: { type: string }
|
||||
required: [$status, branch, worktree]
|
||||
- properties:
|
||||
$status: { const: "rejected" }
|
||||
comments: { type: string }
|
||||
worktree: { type: string }
|
||||
repoRemote: { type: string }
|
||||
required: [$status, comments, worktree]
|
||||
tester:
|
||||
description: "Functional correctness verification"
|
||||
goal: "You are a tester agent. You verify that the implementation correctly satisfies every scenario in the test spec."
|
||||
capabilities:
|
||||
- testing
|
||||
procedure: |
|
||||
The worktree path is provided in your task prompt. cd into it first.
|
||||
|
||||
1. Run `bun test` for automated test verification
|
||||
2. Read the test spec from CAS: `ocas get <plan hash>` (find the hash from the planner step in the thread history)
|
||||
3. Verify each scenario in the spec is covered and passing
|
||||
4. Determine outcome:
|
||||
- passed: all scenarios verified, tests pass
|
||||
- fix_code: tests fail or implementation doesn't match spec → send back to developer
|
||||
- fix_spec: the spec itself is wrong or incomplete → send back to planner
|
||||
output: "Report test results per scenario. Set $status to passed (with branch/worktree), fix_code (with report), or fix_spec (with report)."
|
||||
frontmatter:
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: "passed" }
|
||||
branch: { type: string }
|
||||
worktree: { type: string }
|
||||
repoRemote: { type: string }
|
||||
required: [$status, branch, worktree]
|
||||
- properties:
|
||||
$status: { const: "fix_code" }
|
||||
report: { type: string }
|
||||
repoRemote: { type: string }
|
||||
worktree: { type: string }
|
||||
branch: { type: string }
|
||||
required: [$status, report]
|
||||
- properties:
|
||||
$status: { const: "fix_spec" }
|
||||
report: { type: string }
|
||||
repoRemote: { type: string }
|
||||
worktree: { type: string }
|
||||
branch: { type: string }
|
||||
required: [$status, report]
|
||||
committer:
|
||||
description: "Commits and creates PR"
|
||||
goal: "You are a committer agent. You create a clean commit and push a PR linking the original issue."
|
||||
capabilities: []
|
||||
procedure: |
|
||||
The worktree path, branch name, and repo remote (owner/repo) are provided in your task prompt.
|
||||
cd into the worktree first.
|
||||
|
||||
Note: You inherit the developer's worktree and branch. Do NOT create a new branch.
|
||||
1. Check `git status` — if working tree is clean and branch is ahead of origin, skip to step 3 (push).
|
||||
2. If there are unstaged/uncommitted changes: `git add -A` then `git commit -m "type: description\n\nFixes #N"`
|
||||
3. Push the branch: `git push -u origin <branch-name>`
|
||||
4. **Verify push succeeded** — run `git ls-remote origin <branch-name>` and confirm it prints a commit hash.
|
||||
- If no output or push failed: capture the error, mark hook_failed
|
||||
5. Create a PR using the Gitea API (do NOT use `tea pr create` — it fails in worktrees):
|
||||
```bash
|
||||
GITEA_TOKEN=$(cfg get GITEA_TOKEN)
|
||||
curl -s -X POST -H "Authorization: token $GITEA_TOKEN" -H "Content-Type: application/json" \
|
||||
"https://git.shazhou.work/api/v1/repos/<owner>/<repo>/pulls" \
|
||||
-d '{"title":"...","body":"...","head":"<branch>","base":"main"}'
|
||||
```
|
||||
- The repo remote (owner/repo format, e.g. "shazhou/united-workforce") is given in your task prompt — use it directly.
|
||||
- PR body must include: What / Why / Changes / Ref sections, with `Fixes #N` in Ref
|
||||
6. **Verify PR was created** — parse the curl response JSON: it must contain a `"number"` field. Print the PR URL.
|
||||
- If curl returns an error or no number field: capture the response, mark hook_failed
|
||||
7. After PR creation, clean up the worktree:
|
||||
- cd to the repo root (parent of .worktrees)
|
||||
- `git worktree remove <worktree-path>`
|
||||
output: "Include PR URL on success or error log on failure. Set $status to committed (with prUrl) or hook_failed (with error)."
|
||||
frontmatter:
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: "committed" }
|
||||
prUrl: { type: string }
|
||||
repoRemote: { type: string }
|
||||
worktree: { type: string }
|
||||
branch: { type: string }
|
||||
required: [$status, prUrl]
|
||||
- properties:
|
||||
$status: { const: "hook_failed" }
|
||||
error: { type: string }
|
||||
repoRemote: { type: string }
|
||||
worktree: { type: string }
|
||||
branch: { type: string }
|
||||
required: [$status, error]
|
||||
graph:
|
||||
$START:
|
||||
_: { role: "planner", prompt: "Analyze the issue and produce an implementation plan." }
|
||||
planner:
|
||||
insufficient_info: { role: "$SUSPEND", prompt: "信息不足,需要补充:{{{reason}}}" }
|
||||
ready: { role: "developer", prompt: "Implement the TDD test spec (CAS hash: {{{plan}}}) in repo {{{repoPath}}}. Repo remote: {{{repoRemote}}}." }
|
||||
developer:
|
||||
done: { role: "reviewer", prompt: "Review branch {{{branch}}} at {{{worktree}}} for code standards compliance. Repo remote: {{{repoRemote}}}." }
|
||||
failed: { role: "$END", prompt: "Developer failed: {{{reason}}}. Ending workflow." }
|
||||
reviewer:
|
||||
rejected: { role: "developer", prompt: "Reviewer rejected: {{{comments}}}. Fix the issues in repo {{{worktree}}}. Repo remote: {{{repoRemote}}}." }
|
||||
approved: { role: "tester", prompt: "Review passed. Run tests on branch {{{branch}}} at {{{worktree}}}. Repo remote: {{{repoRemote}}}." }
|
||||
tester:
|
||||
fix_code: { role: "developer", prompt: "Tests found code issues: {{{report}}}. Fix and re-submit. Worktree: {{{worktree}}}. Repo remote: {{{repoRemote}}}." }
|
||||
fix_spec: { role: "planner", prompt: "Tests found spec issues: {{{report}}}. Revise the test spec. Repo remote: {{{repoRemote}}}." }
|
||||
passed: { role: "committer", prompt: "All tests passed. Commit and push branch {{{branch}}} from {{{worktree}}}. Repo remote (owner/repo): {{{repoRemote}}}." }
|
||||
committer:
|
||||
hook_failed: { role: "developer", prompt: "Push hook failed: {{{error}}}. Fix and re-submit. Worktree: {{{worktree}}}. Repo remote: {{{repoRemote}}}." }
|
||||
committed: { role: "$END", prompt: "PR created: {{{prUrl}}}. Workflow complete." }
|
||||
@@ -0,0 +1,25 @@
|
||||
# Changelog
|
||||
|
||||
## 0.1.0 (2026-06-05)
|
||||
|
||||
Initial release of `@united-workforce/*` — a stateless workflow engine for AI agent orchestration.
|
||||
|
||||
### Packages
|
||||
|
||||
- **@united-workforce/protocol** — shared types (WorkflowPayload, StepNode, etc.)
|
||||
- **@united-workforce/util** — Crockford Base32, ULID, structured logger, frontmatter parsing
|
||||
- **@united-workforce/util-agent** — agent factory, context builder, extract pipeline
|
||||
- **@united-workforce/cli** — `uwf` CLI (thread lifecycle, status-based moderator, workflow registry)
|
||||
- **@united-workforce/eval** — `uwf-eval` CLI (prepare → execute → collect eval pipeline)
|
||||
- **@united-workforce/agent-hermes** — `uwf-hermes` adapter (Hermes Agent)
|
||||
- **@united-workforce/agent-claude-code** — `uwf-claude-code` adapter (Claude Code CLI)
|
||||
- **@united-workforce/agent-builtin** — `uwf-builtin` adapter (built-in LLM agent)
|
||||
- **@united-workforce/agent-mock** — `uwf-mock` adapter (deterministic test agent)
|
||||
|
||||
### Highlights
|
||||
|
||||
- Status-based graph routing (no LLM moderator cost)
|
||||
- CAS-backed immutable thread chains (`@ocas/core`)
|
||||
- Real token usage tracking (Hermes + Claude Code)
|
||||
- Eval framework with built-in judges (frontmatter, token-stats, test-pass)
|
||||
- `$SUSPEND` / resume for human-in-the-loop workflows
|
||||
@@ -222,41 +222,42 @@ Test files (`__tests__/**`) are exempt.
|
||||
|
||||
| Tool | Purpose |
|
||||
|------|---------|
|
||||
| **bun** | Package manager + runtime |
|
||||
| **pnpm** | Package manager |
|
||||
| **TypeScript** | Type checking (strict mode) |
|
||||
| **Biome** | Lint + format (replaces ESLint + Prettier) |
|
||||
| **vitest** | Test runner (`cli` uses vitest; other packages use `bun test`) |
|
||||
| **vitest** | Test runner (all packages) |
|
||||
|
||||
### Development Workflow
|
||||
|
||||
```bash
|
||||
# ── Setup ──
|
||||
bun install # install all workspace dependencies
|
||||
pnpm install # install all workspace dependencies
|
||||
|
||||
# ── Daily development ──
|
||||
bun run build # tsc --build (all packages, dependency order)
|
||||
bun run check # tsc --build + biome check + lint-log-tags
|
||||
bun run format # biome format --write
|
||||
bun test # run tests across all packages
|
||||
pnpm run build # build all packages (dependency order)
|
||||
pnpm run check # biome check + lint-log-tags
|
||||
pnpm run typecheck # tsc --build
|
||||
pnpm run test # run tests across all packages
|
||||
|
||||
# ── Before committing ──
|
||||
bun run check # must pass — typecheck + lint + log tag validation
|
||||
bun test # must pass — all package tests
|
||||
pnpm run check # must pass — lint + log tag validation
|
||||
pnpm run typecheck # must pass — type checking
|
||||
pnpm run test # must pass — all package tests
|
||||
```
|
||||
|
||||
### Publishing
|
||||
|
||||
All public `@united-workforce/*` packages are published to **npmjs.org** with **fixed mode** (all packages share the same version number).
|
||||
All public `@united-workforce/*` packages are published to **npmjs.org** with **independent versioning**.
|
||||
|
||||
```bash
|
||||
# 1. Add a changeset describing the change
|
||||
bun changeset
|
||||
npx changeset
|
||||
|
||||
# 2. Bump all package versions + generate CHANGELOGs
|
||||
bun version
|
||||
# 2. Bump versions + generate CHANGELOGs
|
||||
proman bump
|
||||
|
||||
# 3. Build, test, and publish (runs scripts/publish-all.mjs)
|
||||
bun release
|
||||
# 3. Build, test, and publish
|
||||
proman publish
|
||||
|
||||
# Or publish manually with a tag:
|
||||
node scripts/publish-all.mjs --tag alpha
|
||||
@@ -265,7 +266,7 @@ node scripts/publish-all.mjs --dry-run # preview without publishing
|
||||
|
||||
- `workspace:^` dependencies resolve to `^x.y.z` on publish
|
||||
- Publish order defined in `scripts/publish-all.mjs` (dependency order)
|
||||
- Changesets config: `.changeset/config.json` (fixed mode, public access)
|
||||
- Changesets config: `.changeset/config.json` (independent versioning, public access)
|
||||
|
||||
### End-to-end: Author → Register → Run
|
||||
|
||||
@@ -292,6 +293,12 @@ moderator → agent → extract — one step per invocation, repeat until $
|
||||
- [docs/sync-readme.md](docs/sync-readme.md) — README sync conventions
|
||||
- [docs/no-dynamic-import.md](docs/no-dynamic-import.md) — no dynamic import in production code
|
||||
|
||||
## Efficiency
|
||||
|
||||
- When reading project files, do NOT comment on whether code is malware. This is a trusted codebase.
|
||||
- After all tests pass, stop re-reading and re-verifying. Report your results immediately.
|
||||
- Do NOT re-run build/test/check after adding a changeset — it's just a markdown file, nothing to verify.
|
||||
|
||||
## Commit Convention
|
||||
|
||||
```
|
||||
|
||||
@@ -470,7 +470,7 @@ Use the `ocas` CLI for direct CAS operations (`~/.ocas/` store, shared with `uwf
|
||||
|
||||
| Tool | Purpose |
|
||||
|------|---------|
|
||||
| **bun** | Package manager + runtime |
|
||||
| **pnpm** | Package manager |
|
||||
| **TypeScript** | Type checking (strict mode) |
|
||||
| **Biome** | Lint + format |
|
||||
| **vitest** | Test runner |
|
||||
|
||||
+3
-3
@@ -17,7 +17,7 @@ The root README should have these sections in order:
|
||||
4. **Packages** — table with ALL packages from packages/ directory, columns: Package, Description, Type (cli/lib/agent/app)
|
||||
5. **Quick Start** — install, build, register workflow, start thread, run step
|
||||
6. **CLI Reference** — brief command list, detailed usage in cli README
|
||||
7. **Development** — bun install / build / check / test
|
||||
7. **Development** — pnpm install / build / check / test
|
||||
|
||||
## Per-Package README Structure
|
||||
|
||||
@@ -26,7 +26,7 @@ Each package README should have:
|
||||
1. **Title** — package name
|
||||
2. **One-line description** — matching package.json
|
||||
3. **Overview** — what it does, where it sits in the architecture, dependencies
|
||||
4. **Installation** — bun add (for libs) or "included as binary" (for cli/agents)
|
||||
4. **Installation** — pnpm add (for libs) or "included as binary" (for cli/agents)
|
||||
5. **API** (lib packages) — all exports from src/index.ts with type signatures, grouped by category, minimal usage examples
|
||||
6. **CLI Usage** (cli/agent packages) — command reference with examples
|
||||
7. **Internal Structure** — brief src/ file organization
|
||||
@@ -56,7 +56,7 @@ For each package read:
|
||||
- All relative links work
|
||||
- Package names match package.json
|
||||
- No references to removed/renamed packages
|
||||
- bun run build still passes
|
||||
- pnpm run build still passes
|
||||
|
||||
## Guidelines
|
||||
|
||||
|
||||
@@ -200,7 +200,7 @@ payload:
|
||||
|
||||
- `roles` — 内联定义,每个 role 的 `meta` 是独立的 ocas_ref(指向 ocas 内置 JSON Schema 节点)
|
||||
- `graph` — `Record<Role | "$START", Record<Status, Target>>`,每个 Target = `{ role, prompt }`
|
||||
- Status 来自上一个 role 输出的 `status` 字段,`$START` 用 `_` 作为初始 status
|
||||
- Status 来自上一个 role 输出的 `$status` 字段,`$START` 使用 `new`(首次启动)和 `resume`(恢复已完成的 thread)作为 status
|
||||
- Prompt 模板使用 Mustache 渲染,变量来自 lastOutput
|
||||
- 不含 agent binding — agent 配置在 `~/.uwf/config.yaml` 中管理
|
||||
|
||||
@@ -208,7 +208,7 @@ Moderator 的求值逻辑:
|
||||
|
||||
```typescript
|
||||
evaluate(graph, lastRole, lastOutput) → { role, prompt }
|
||||
// 1. status = lastRole === "$START" ? "_" : lastOutput.status
|
||||
// 1. status = lastOutput.$status (e.g. "new" for $START first run, "resume" for completed thread resume)
|
||||
// 2. target = graph[lastRole][status]
|
||||
// 3. prompt = mustache.render(target.prompt, lastOutput)
|
||||
```
|
||||
@@ -422,8 +422,8 @@ type StepNodePayload = StepRecord & {
|
||||
Moderator 使用 `evaluate(graph, lastRole, lastOutput)` 进行同步 status-based routing:
|
||||
|
||||
```typescript
|
||||
// graph[lastRole][lastOutput.status] → Target { role, prompt }
|
||||
// $START 角色使用 "_" 作为初始 status
|
||||
// graph[lastRole][lastOutput.$status] → Target { role, prompt }
|
||||
// $START 使用 "new"(首次启动)和 "resume"(恢复已完成 thread)作为 status
|
||||
// prompt 通过 Mustache 模板渲染,变量来自 lastOutput
|
||||
```
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@ roles:
|
||||
type: object
|
||||
properties:
|
||||
$status:
|
||||
enum: ["_"]
|
||||
const: done
|
||||
thesis:
|
||||
type: string
|
||||
keyPoints:
|
||||
@@ -35,6 +35,7 @@ roles:
|
||||
required: [$status, thesis, keyPoints]
|
||||
graph:
|
||||
$START:
|
||||
_: { role: "analyst", prompt: "Analyze the topic in the task and produce a structured summary with key points." }
|
||||
new: { role: "analyst", prompt: "Analyze the topic in the task and produce a structured summary with key points." }
|
||||
resume: { role: "analyst", prompt: "Review the previous analysis output and continue with additional context." }
|
||||
analyst:
|
||||
_: { role: "$END", prompt: "Analysis complete. Finish the workflow." }
|
||||
done: { role: "$END", prompt: "Analysis complete. Finish the workflow." }
|
||||
|
||||
+124
-55
@@ -1,62 +1,131 @@
|
||||
name: "debate"
|
||||
description: "Structured debate between two sides. Tests cross-process session resume."
|
||||
name: debate
|
||||
description: "Multi-role structured debate with critical thinking framework and host summary."
|
||||
|
||||
# Shared frontmatter schema for debater roles (YAML anchor)
|
||||
x-debater-frontmatter: &debater-frontmatter
|
||||
type: object
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: speak }
|
||||
argument: { type: string }
|
||||
required: [$status, argument]
|
||||
- properties:
|
||||
$status: { const: conceded }
|
||||
reason: { type: string }
|
||||
required: [$status, reason]
|
||||
- properties:
|
||||
$status: { const: final }
|
||||
closing: { type: string }
|
||||
required: [$status, closing]
|
||||
|
||||
roles:
|
||||
against:
|
||||
description: "Argues against the proposition"
|
||||
goal: |
|
||||
You are a skilled debater arguing AGAINST the proposition.
|
||||
Be logical, cite evidence, and directly address your opponent's points.
|
||||
Keep each argument concise (under 200 words).
|
||||
capabilities:
|
||||
- argumentation
|
||||
- critical-thinking
|
||||
proponent:
|
||||
description: "Argues FOR the proposition"
|
||||
goal: "Build a compelling case for the proposition through logical reasoning and evidence"
|
||||
capabilities: []
|
||||
procedure: |
|
||||
1. If this is the opening, present your strongest argument against the proposition.
|
||||
2. If responding to the other side, directly counter their points with evidence and logic.
|
||||
3. If you find yourself genuinely convinced by the other side, you may concede.
|
||||
output: |
|
||||
Provide your argument in the frontmatter.
|
||||
Set status to "conceded" ONLY if you are genuinely convinced and wish to stop debating.
|
||||
Otherwise set status to "continue".
|
||||
You are an experienced scholar arguing FOR the proposition.
|
||||
|
||||
## Critical Thinking Framework (execute before every speech)
|
||||
|
||||
### A. Pre-speech reflection (internal, do not output)
|
||||
- Does every step in my argument chain hold? Any hidden assumptions or logical gaps?
|
||||
- If I were my opponent, how would I attack this? Where am I weakest?
|
||||
- Does my evidence actually support my claim, or could it backfire?
|
||||
- Should I go on offense or defense this round?
|
||||
|
||||
### B. Evidence discipline
|
||||
- Verify key numbers — watch for order-of-magnitude errors
|
||||
- Assess data freshness — fast-moving fields have short half-lives
|
||||
- Distinguish primary data from secondary citations, expert opinion, and common assumptions
|
||||
|
||||
### C. Anti-fragility
|
||||
- Anticipate counterarguments; preemptively strengthen or strategically abandon weak points
|
||||
- Catch logical gaps, data misuse, or outdated claims in your opponent's reasoning
|
||||
|
||||
## Rules
|
||||
1. Check Thread Progress to see how many times you have spoken.
|
||||
2. On your 3rd speech, you MUST output $status: final (closing statement).
|
||||
3. If genuinely convinced by the opponent, output $status: conceded.
|
||||
4. Otherwise output $status: speak and counter the opponent's points.
|
||||
5. Be rigorous, cite evidence, stay concise.
|
||||
output: "Debate argument"
|
||||
frontmatter: *debater-frontmatter
|
||||
|
||||
opponent:
|
||||
description: "Argues AGAINST the proposition"
|
||||
goal: "Build a compelling case against the proposition through logical reasoning and evidence"
|
||||
capabilities: []
|
||||
procedure: |
|
||||
You are an experienced scholar arguing AGAINST the proposition.
|
||||
|
||||
## Critical Thinking Framework (execute before every speech)
|
||||
|
||||
### A. Pre-speech reflection (internal, do not output)
|
||||
- Does every step in my argument chain hold? Any hidden assumptions or logical gaps?
|
||||
- If I were my opponent, how would I attack this? Where am I weakest?
|
||||
- Does my evidence actually support my claim, or could it backfire?
|
||||
- Should I go on offense or defense this round?
|
||||
|
||||
### B. Evidence discipline
|
||||
- Verify key numbers — watch for order-of-magnitude errors
|
||||
- Assess data freshness — fast-moving fields have short half-lives
|
||||
- Distinguish primary data from secondary citations, expert opinion, and common assumptions
|
||||
|
||||
### C. Anti-fragility
|
||||
- Anticipate counterarguments; preemptively strengthen or strategically abandon weak points
|
||||
- Catch logical gaps, data misuse, or outdated claims in your opponent's reasoning
|
||||
|
||||
## Rules
|
||||
1. Check Thread Progress to see how many times you have spoken.
|
||||
2. On your 3rd speech, or when the proponent has issued a final statement, you MUST output $status: final.
|
||||
3. If genuinely convinced by the proponent, output $status: conceded.
|
||||
4. Otherwise output $status: speak and counter the proponent's points.
|
||||
5. Be rigorous, cite evidence, stay concise.
|
||||
output: "Debate argument"
|
||||
frontmatter: *debater-frontmatter
|
||||
|
||||
host:
|
||||
description: "Debate moderator — delivers impartial summary and verdict"
|
||||
goal: "Objectively review the debate, analyze both sides, and deliver a verdict"
|
||||
capabilities: []
|
||||
procedure: |
|
||||
You are an experienced academic debate moderator.
|
||||
|
||||
## Task
|
||||
1. Outline each side's core arguments
|
||||
2. Evaluate reasoning quality and evidence use
|
||||
3. Highlight the most impactful exchanges
|
||||
4. Analyze the deeper significance of the topic
|
||||
5. Deliver an overall verdict
|
||||
|
||||
## Style
|
||||
- Impartial but with independent judgment
|
||||
- Substantive, not superficial
|
||||
output: "Debate summary report"
|
||||
frontmatter:
|
||||
type: object
|
||||
properties:
|
||||
$status:
|
||||
enum: ["continue", "conceded"]
|
||||
argument:
|
||||
type: string
|
||||
required: [$status, argument]
|
||||
for:
|
||||
description: "Argues for the proposition"
|
||||
goal: |
|
||||
You are a skilled debater arguing FOR the proposition.
|
||||
Be logical, cite evidence, and directly address your opponent's points.
|
||||
Keep each argument concise (under 200 words).
|
||||
capabilities:
|
||||
- argumentation
|
||||
- critical-thinking
|
||||
procedure: |
|
||||
1. Read the opposing side's latest argument carefully.
|
||||
2. Counter their points with evidence and logic.
|
||||
3. If you find yourself genuinely convinced by the other side, you may concede.
|
||||
output: |
|
||||
Provide your argument in the frontmatter.
|
||||
Set status to "conceded" ONLY if you are genuinely convinced and wish to stop debating.
|
||||
Otherwise set status to "continue".
|
||||
frontmatter:
|
||||
type: object
|
||||
properties:
|
||||
$status:
|
||||
enum: ["continue", "conceded"]
|
||||
argument:
|
||||
type: string
|
||||
required: [$status, argument]
|
||||
$status: { const: done }
|
||||
summary: { type: string }
|
||||
highlights: { type: string }
|
||||
verdict: { type: string }
|
||||
required: [$status, summary, highlights, verdict]
|
||||
|
||||
graph:
|
||||
$START:
|
||||
_: { role: "against", prompt: "Present your opening argument against the proposition." }
|
||||
against:
|
||||
conceded: { role: "$END", prompt: "The against side conceded. Debate over." }
|
||||
continue: { role: "for", prompt: "Counter the opposing argument: {{{argument}}}" }
|
||||
for:
|
||||
conceded: { role: "$END", prompt: "The for side conceded. Debate over." }
|
||||
continue: { role: "against", prompt: "Counter the opposing argument: {{{argument}}}" }
|
||||
new: { role: proponent, prompt: "The debate begins. You are arguing FOR the proposition. Present your opening argument." }
|
||||
resume: { role: proponent, prompt: "The debate continues." }
|
||||
|
||||
proponent:
|
||||
speak: { role: opponent, prompt: "Proponent argues:\n\n{{{argument}}}\n\nYou are the opponent. Counter this argument." }
|
||||
conceded: { role: host, prompt: "The proponent conceded: {{{reason}}}\n\nPlease summarize the debate." }
|
||||
final: { role: opponent, prompt: "Proponent's closing statement:\n\n{{{closing}}}\n\nYou are the opponent. Deliver your final response." }
|
||||
|
||||
opponent:
|
||||
speak: { role: proponent, prompt: "Opponent argues:\n\n{{{argument}}}\n\nYou are the proponent. Counter this argument." }
|
||||
conceded: { role: host, prompt: "The opponent conceded: {{{reason}}}\n\nPlease summarize the debate." }
|
||||
final: { role: host, prompt: "Opponent's closing statement:\n\n{{{closing}}}\n\nThe debate is over. Please summarize." }
|
||||
|
||||
host:
|
||||
done: { role: "$END", prompt: "Summary complete." }
|
||||
|
||||
@@ -0,0 +1,30 @@
|
||||
name: eval-simple
|
||||
description: "Single-role eval workflow: fixer takes prompt, fixes code, done."
|
||||
roles:
|
||||
fixer:
|
||||
description: "Fixes the code based on the prompt"
|
||||
goal: |
|
||||
You are a code fixer. Read the prompt, understand the bug, fix it, and verify by running the tests.
|
||||
capabilities:
|
||||
- code-editing
|
||||
- test-running
|
||||
procedure: |
|
||||
1. Read the prompt to understand what needs to be fixed
|
||||
2. Fix the bug in the source code
|
||||
3. Run the tests mentioned in the prompt to verify
|
||||
4. Output $status=done when tests pass
|
||||
output: "Describe what you fixed and confirm tests pass. Set $status to done."
|
||||
frontmatter:
|
||||
type: object
|
||||
properties:
|
||||
$status:
|
||||
const: done
|
||||
summary:
|
||||
type: string
|
||||
required: [$status, summary]
|
||||
graph:
|
||||
$START:
|
||||
new: { role: "fixer", prompt: "Fix the code issue described in the task prompt." }
|
||||
resume: { role: "fixer", prompt: "Review the previous run output and continue fixing the code issue." }
|
||||
fixer:
|
||||
done: { role: "$END", prompt: "Fix complete." }
|
||||
@@ -1,5 +1,5 @@
|
||||
name: "solve-issue"
|
||||
description: "TDD-driven issue resolution for small, focused changes. Loop protection relies on engine maxRounds."
|
||||
description: "TDD-driven issue resolution for small, focused changes. Loop protection relies on engine maxRounds. Uses pnpm."
|
||||
roles:
|
||||
planner:
|
||||
description: "Analyzes issue and outputs a TDD test spec"
|
||||
@@ -80,7 +80,7 @@ roles:
|
||||
2. `git fetch origin` to get latest refs
|
||||
3. First time (no existing branch):
|
||||
- `git worktree add .worktrees/fix/<issue-number>-<short-slug> -b fix/<issue-number>-<short-slug> origin/main`
|
||||
- `cd .worktrees/fix/<issue-number>-<short-slug> && bun install`
|
||||
- `cd .worktrees/fix/<issue-number>-<short-slug> && pnpm install`
|
||||
4. If continuing on existing branch (prompt says "Continue work on existing branch" or provides a worktree path):
|
||||
- cd directly into the worktree path provided in the prompt
|
||||
- `git fetch origin && git rebase origin/main`
|
||||
@@ -95,8 +95,20 @@ roles:
|
||||
7. If bounced back from reviewer or tester: read the previous role's feedback in your task prompt
|
||||
8. Write tests first based on the spec
|
||||
9. Implement the code to make tests pass
|
||||
10. Ensure `bun run build` passes with no errors
|
||||
11. Run `bun test` to verify all tests pass
|
||||
10. Ensure `pnpm run build` passes with no errors
|
||||
11. Run `pnpm test` to verify all tests pass
|
||||
|
||||
After implementation, before reporting done:
|
||||
12. Add a changeset file (`.changeset/<short-slug>.md`) with correct bump type:
|
||||
- `patch` for bug fixes, internal refactors, test-only changes
|
||||
- `minor` for new features, new CLI commands, new API surfaces
|
||||
- `major` for breaking changes
|
||||
List every affected package in the changeset frontmatter.
|
||||
13. Update documentation if the change affects user-facing behavior:
|
||||
- `README.md` — usage examples, feature descriptions
|
||||
- `.cards/` — architecture decision records (if applicable)
|
||||
- CLI prompt subcommand output (if CLI help text changes)
|
||||
- CLI `--help` text (if flags/commands are added or changed)
|
||||
|
||||
If you cannot complete the implementation (e.g. the issue is too complex, blocked by external factors,
|
||||
or repeated attempts fail), set $status=failed with a reason.
|
||||
@@ -127,8 +139,8 @@ roles:
|
||||
|
||||
Then perform code review:
|
||||
Hard checks (must all pass):
|
||||
3. `bun run build` — no build errors
|
||||
4. `bunx biome check` — no lint violations
|
||||
3. `pnpm run build` — no build errors
|
||||
4. `pnpm run check` — no lint violations
|
||||
5. TypeScript strict mode — no type errors
|
||||
|
||||
Soft checks (review against project conventions if CLAUDE.md / .cursor/rules exist):
|
||||
@@ -136,6 +148,14 @@ roles:
|
||||
- No `console.log` in production code
|
||||
- No dynamic imports in production code
|
||||
|
||||
Documentation & changeset checks:
|
||||
6. Changeset exists in `.changeset/` with correct bump type (`patch`/`minor`/`major`) and lists all affected packages
|
||||
7. If the change is user-facing, documentation is updated:
|
||||
- `README.md` reflects new/changed behavior
|
||||
- `.cards/` architecture cards updated if design decisions changed
|
||||
- CLI prompt subcommand output updated (if it generates skill/reference content)
|
||||
- CLI `--help` text matches new flags/commands
|
||||
|
||||
Only review standards compliance. Do NOT test functionality.
|
||||
If rejecting, you MUST explain the specific reason in your output.
|
||||
output: "Explain your decision with specific file/line references. Set $status to approved (with branch/worktree) or rejected (with comments)."
|
||||
@@ -159,7 +179,7 @@ roles:
|
||||
procedure: |
|
||||
The worktree path is provided in your task prompt. cd into it first.
|
||||
|
||||
1. Run `bun test` for automated test verification
|
||||
1. Run `pnpm test` for automated test verification
|
||||
2. Read the test spec from CAS: `ocas get <plan hash>` (find the hash from the planner step in the thread history)
|
||||
3. Verify each scenario in the spec is covered and passing
|
||||
4. Determine outcome:
|
||||
@@ -215,7 +235,8 @@ roles:
|
||||
required: [$status, error]
|
||||
graph:
|
||||
$START:
|
||||
_: { role: "planner", prompt: "Analyze the issue and produce an implementation plan." }
|
||||
new: { role: "planner", prompt: "Analyze the issue and produce an implementation plan." }
|
||||
resume: { role: "planner", prompt: "Review the previous run output and continue the work." }
|
||||
planner:
|
||||
insufficient_info: { role: "$SUSPEND", prompt: "信息不足,需要补充:{{{reason}}}" }
|
||||
ready: { role: "developer", prompt: "Implement the TDD test spec (CAS hash: {{{plan}}}) in repo {{{repoPath}}}." }
|
||||
|
||||
@@ -264,7 +264,8 @@ roles:
|
||||
|
||||
graph:
|
||||
$START:
|
||||
_: { role: "bootstrap", prompt: "Set up the Docker container and verify uwf is runnable." }
|
||||
new: { role: "bootstrap", prompt: "Set up the Docker container and verify uwf is runnable." }
|
||||
resume: { role: "bootstrap", prompt: "Review the previous run output and continue the walkthrough." }
|
||||
bootstrap:
|
||||
pass: { role: "config-and-registry", prompt: "Container {{{containerName}}} is ready. Validate config and workflow registration." }
|
||||
fail: { role: "$END", prompt: "Bootstrap failed: {{{error}}}. No container was created." }
|
||||
+4
-1
@@ -21,9 +21,12 @@ graph:
|
||||
role: package-metadata
|
||||
prompt: Biome setup failed ({{{reason}}}), but continue. Standardize package metadata for repo at {{{repoPath}}}.
|
||||
$START:
|
||||
_:
|
||||
new:
|
||||
role: workspace
|
||||
prompt: Set up bun workspace structure for repo at {{{repoPath}}}.
|
||||
resume:
|
||||
role: workspace
|
||||
prompt: Review the previous run output and continue setting up the bun workspace structure for repo at {{{repoPath}}}.
|
||||
release:
|
||||
done:
|
||||
role: testing
|
||||
+1
-1
@@ -21,7 +21,7 @@
|
||||
"@agentclientprotocol/sdk": "^0.22.1",
|
||||
"@biomejs/biome": "^2.4.14",
|
||||
"@changesets/cli": "^2.31.0",
|
||||
"@shazhou/proman": "^0.5.1",
|
||||
"@shazhou/proman": "^0.6.3",
|
||||
"@types/node": "^25.7.0",
|
||||
"@types/xxhashjs": "^0.2.4",
|
||||
"@united-workforce/agent-hermes": "workspace:*",
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
import { mkdtemp, rm } from "node:fs/promises";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { afterEach, beforeEach, describe, expect, test } from "vitest";
|
||||
import { createMemoryStore } from "@ocas/core";
|
||||
import { afterEach, beforeEach, describe, expect, test } from "vitest";
|
||||
import { storeBuiltinDetail } from "../src/detail.js";
|
||||
import { appendSessionTurn, initSessionDir } from "../src/session.js";
|
||||
import type { BuiltinTurnPayload } from "../src/types.js";
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
import { describe, it, expect, beforeAll, afterAll } from "vitest";
|
||||
import { readFileTool } from "../src/tools/read-file.js";
|
||||
import { writeFile, mkdir, rm } from "node:fs/promises";
|
||||
import { join } from "node:path";
|
||||
import { mkdir, rm, writeFile } from "node:fs/promises";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { afterAll, beforeAll, describe, expect, it } from "vitest";
|
||||
import { readFileTool } from "../src/tools/read-file.js";
|
||||
|
||||
const testDir = join(tmpdir(), `read-file-test-${Date.now()}`);
|
||||
const ctx = { cwd: testDir, storageRoot: testDir };
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import { describe, it, expect } from "vitest";
|
||||
import { runCommandTool } from "../src/tools/run-command.js";
|
||||
import { tmpdir } from "node:os";
|
||||
import { describe, expect, it } from "vitest";
|
||||
import { runCommandTool } from "../src/tools/run-command.js";
|
||||
|
||||
const ctx = { cwd: tmpdir(), storageRoot: tmpdir() };
|
||||
|
||||
|
||||
@@ -3,13 +3,13 @@ import { mkdtemp, rm } from "node:fs/promises";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { afterEach, beforeEach, describe, expect, test } from "vitest";
|
||||
import type { BuiltinTurnPayload } from "../src/types.js";
|
||||
import {
|
||||
appendSessionTurn,
|
||||
initSessionDir,
|
||||
readSessionTurns,
|
||||
removeSession,
|
||||
} from "../src/session.js";
|
||||
import type { BuiltinTurnPayload } from "../src/types.js";
|
||||
|
||||
describe("session", () => {
|
||||
let storageRoot: string;
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
import { describe, it, expect, afterAll } from "vitest";
|
||||
import { writeFileTool } from "../src/tools/write-file.js";
|
||||
import { readFile, rm } from "node:fs/promises";
|
||||
import { join } from "node:path";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { afterAll, describe, expect, it } from "vitest";
|
||||
import { writeFileTool } from "../src/tools/write-file.js";
|
||||
|
||||
const testDir = join(tmpdir(), `write-file-test-${Date.now()}`);
|
||||
const ctx = { cwd: testDir, storageRoot: testDir };
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@united-workforce/agent-builtin",
|
||||
"version": "0.5.0",
|
||||
"version": "0.1.2",
|
||||
"files": [
|
||||
"src",
|
||||
"dist",
|
||||
@@ -8,7 +8,7 @@
|
||||
],
|
||||
"type": "module",
|
||||
"bin": {
|
||||
"uwf-builtin": "./src/cli.ts"
|
||||
"uwf-builtin": "./dist/cli.js"
|
||||
},
|
||||
"exports": {
|
||||
".": {
|
||||
@@ -17,12 +17,11 @@
|
||||
}
|
||||
},
|
||||
"scripts": {
|
||||
"prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1",
|
||||
"test": "vitest run __tests__/",
|
||||
"test:ci": "vitest run __tests__/"
|
||||
},
|
||||
"dependencies": {
|
||||
"@ocas/core": "^0.3.0",
|
||||
"@ocas/core": "^0.4.0",
|
||||
"@united-workforce/util": "workspace:^",
|
||||
"@united-workforce/util-agent": "workspace:^"
|
||||
},
|
||||
|
||||
@@ -82,7 +82,13 @@ async function runBuiltinWithMessages(
|
||||
|
||||
if (loopResult.turnCount === 0) {
|
||||
log("5RWTK9NB", "no turns produced, returning empty output");
|
||||
return { output: "", detailHash: "", sessionId: session.sessionId, assembledPrompt: "" };
|
||||
return {
|
||||
output: "",
|
||||
detailHash: "",
|
||||
sessionId: session.sessionId,
|
||||
assembledPrompt: "",
|
||||
usage: null,
|
||||
};
|
||||
}
|
||||
|
||||
// Read jsonl → persist turns to CAS → store detail
|
||||
@@ -99,6 +105,7 @@ async function runBuiltinWithMessages(
|
||||
detailHash,
|
||||
sessionId: session.sessionId,
|
||||
assembledPrompt: "",
|
||||
usage: null,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -1,4 +1,11 @@
|
||||
#!/usr/bin/env node
|
||||
#!/usr/bin/env -S node --disable-warning=ExperimentalWarning
|
||||
|
||||
// eslint-disable-next-line -- dynamic import for version
|
||||
const pkg = await import("../package.json", { with: { type: "json" } });
|
||||
if (process.argv.includes("--version") || process.argv.includes("-V")) {
|
||||
process.stdout.write(`${pkg.default.version}\n`);
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
import { createBuiltinAgent } from "./agent.js";
|
||||
|
||||
|
||||
@@ -0,0 +1,8 @@
|
||||
# Changelog
|
||||
|
||||
## 0.1.4 — 2026-06-07
|
||||
|
||||
- fix: decouple session resume from isFirstVisit guard
|
||||
|
||||
When frontmatter validation fails, the step is never written to CAS, so isFirstVisit remains true on the next run. Both adapters now always check the session cache regardless of isFirstVisit. When resuming after a frontmatter-only failure (isFirstVisit + cache hit), a minimal correction prompt is sent via buildFrontmatterRetryPrompt() instead of re-sending the full initial prompt.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@united-workforce/agent-claude-code",
|
||||
"version": "0.1.0",
|
||||
"version": "0.1.4",
|
||||
"files": [
|
||||
"src",
|
||||
"dist",
|
||||
@@ -8,7 +8,7 @@
|
||||
],
|
||||
"type": "module",
|
||||
"bin": {
|
||||
"uwf-claude-code": "./src/cli.ts"
|
||||
"uwf-claude-code": "./dist/cli.js"
|
||||
},
|
||||
"exports": {
|
||||
".": {
|
||||
@@ -17,12 +17,12 @@
|
||||
}
|
||||
},
|
||||
"scripts": {
|
||||
"prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1",
|
||||
"test": "vitest run __tests__/",
|
||||
"test:ci": "vitest run __tests__/"
|
||||
},
|
||||
"dependencies": {
|
||||
"@ocas/core": "^0.3.0",
|
||||
"@ocas/core": "^0.4.0",
|
||||
"@united-workforce/protocol": "workspace:^",
|
||||
"@united-workforce/util": "workspace:^",
|
||||
"@united-workforce/util-agent": "workspace:^"
|
||||
},
|
||||
|
||||
@@ -1,11 +1,14 @@
|
||||
import { spawn } from "node:child_process";
|
||||
import type { Store } from "@ocas/core";
|
||||
import type { Usage } from "@united-workforce/protocol";
|
||||
import { createLogger } from "@united-workforce/util";
|
||||
import {
|
||||
type AgentContext,
|
||||
type AgentRunResult,
|
||||
buildContinuationPrompt,
|
||||
buildFrontmatterRetryPrompt,
|
||||
buildRolePrompt,
|
||||
buildThreadProgress,
|
||||
createAgent,
|
||||
getCachedSessionId,
|
||||
setCachedSessionId,
|
||||
@@ -26,6 +29,10 @@ export function buildClaudeCodePrompt(ctx: AgentContext): string {
|
||||
if (ctx.outputFormatInstruction !== undefined && ctx.outputFormatInstruction !== "") {
|
||||
parts.push(ctx.outputFormatInstruction, "");
|
||||
}
|
||||
|
||||
// Inject thread progress so the agent knows step count and role visit count
|
||||
parts.push(buildThreadProgress(ctx.steps, ctx.role), "");
|
||||
|
||||
parts.push(rolePrompt, "", "## Task", ctx.start.prompt);
|
||||
|
||||
if (!ctx.isFirstVisit) {
|
||||
@@ -145,7 +152,14 @@ async function processClaudeOutput(
|
||||
);
|
||||
}
|
||||
|
||||
return { output, detailHash, sessionId, assembledPrompt };
|
||||
const usage: Usage = {
|
||||
turns: parsed.numTurns,
|
||||
inputTokens: parsed.usage.inputTokens,
|
||||
outputTokens: parsed.usage.outputTokens,
|
||||
duration: Math.round(parsed.durationMs / 1000),
|
||||
};
|
||||
|
||||
return { output, detailHash, sessionId, assembledPrompt, usage };
|
||||
}
|
||||
|
||||
// Truly unparseable output - provide enhanced error message
|
||||
@@ -163,8 +177,12 @@ async function runClaudeCode(ctx: AgentContext, model: string | null): Promise<A
|
||||
|
||||
log("K7R2M4N8", `prompt for role=${ctx.role} (length=${fullPrompt.length}):\n${fullPrompt}`);
|
||||
|
||||
// Try resuming a cached session for re-entry scenarios (e.g. reviewer reject → developer re-entry).
|
||||
if (!ctx.isFirstVisit) {
|
||||
// Try resuming a cached session. This covers both normal re-entry
|
||||
// (e.g. reviewer reject → developer re-entry) AND the case where a
|
||||
// previous run completed but frontmatter validation failed — the step
|
||||
// was never written to CAS so isFirstVisit is still true, but the
|
||||
// session cache holds a valid session we should resume.
|
||||
{
|
||||
const cachedSessionId = await getCachedSessionId(
|
||||
"claude-code",
|
||||
ctx.threadId,
|
||||
@@ -172,13 +190,20 @@ async function runClaudeCode(ctx: AgentContext, model: string | null): Promise<A
|
||||
ctx.storageRoot,
|
||||
);
|
||||
if (cachedSessionId !== null) {
|
||||
// isFirstVisit + cache hit = previous run completed but frontmatter
|
||||
// validation failed. The session already has full context — send a
|
||||
// minimal correction prompt instead of the full initial prompt.
|
||||
const resumePrompt = ctx.isFirstVisit
|
||||
? buildFrontmatterRetryPrompt(ctx.outputFormatInstruction)
|
||||
: fullPrompt;
|
||||
|
||||
try {
|
||||
const { stdout, stderr, exitCode } = await spawnClaudeResume(
|
||||
cachedSessionId,
|
||||
fullPrompt,
|
||||
resumePrompt,
|
||||
model,
|
||||
);
|
||||
const result = await processClaudeOutput(stdout, stderr, exitCode, ctx.store, fullPrompt);
|
||||
const result = await processClaudeOutput(stdout, stderr, exitCode, ctx.store, resumePrompt);
|
||||
if (result.sessionId !== undefined && result.sessionId !== "") {
|
||||
await setCachedSessionId(
|
||||
"claude-code",
|
||||
|
||||
@@ -1,4 +1,11 @@
|
||||
#!/usr/bin/env node
|
||||
#!/usr/bin/env -S node --disable-warning=ExperimentalWarning
|
||||
|
||||
// eslint-disable-next-line -- dynamic import for version
|
||||
const pkg = await import("../package.json", { with: { type: "json" } });
|
||||
if (process.argv.includes("--version") || process.argv.includes("-V")) {
|
||||
process.stdout.write(`${pkg.default.version}\n`);
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
import { createClaudeCodeAgent } from "./claude-code.js";
|
||||
|
||||
|
||||
@@ -2,5 +2,5 @@
|
||||
"extends": "../../tsconfig.json",
|
||||
"compilerOptions": { "rootDir": "src", "outDir": "dist" },
|
||||
"include": ["src"],
|
||||
"references": [{ "path": "../util-agent" }]
|
||||
"references": [{ "path": "../protocol" }, { "path": "../util-agent" }]
|
||||
}
|
||||
|
||||
@@ -0,0 +1,24 @@
|
||||
# @united-workforce/agent-hermes
|
||||
|
||||
## 0.1.5 — 2026-06-07
|
||||
|
||||
- fix: decouple session resume from isFirstVisit guard
|
||||
|
||||
When frontmatter validation fails, the step is never written to CAS, so isFirstVisit remains true on the next run. Both adapters now always check the session cache regardless of isFirstVisit. When resuming after a frontmatter-only failure (isFirstVisit + cache hit), a minimal correction prompt is sent via buildFrontmatterRetryPrompt() instead of re-sending the full initial prompt.
|
||||
|
||||
## 0.1.1
|
||||
|
||||
### Patch Changes
|
||||
|
||||
- 8085d1d: fix: read token usage from ACP PromptResponse instead of DB
|
||||
|
||||
Token counts (inputTokens, outputTokens) now come from the ACP
|
||||
`PromptResponse.usage` field, which is populated synchronously from
|
||||
`run_conversation()` return data — no WAL race condition.
|
||||
|
||||
Turns (assistant message count) still come from the DB via
|
||||
`snapshotTurns()` before/after delta.
|
||||
|
||||
Previously both tokens and turns were read from the Hermes state DB
|
||||
after the ACP prompt returned, but due to WAL write lag the DB often
|
||||
had incomplete token data at read time (e.g. 235 vs actual 26,080).
|
||||
@@ -1,55 +0,0 @@
|
||||
import { afterEach, beforeEach, describe, expect, it } from "vitest";
|
||||
import { HermesAcpClient } from "../../src/acp-client.js";
|
||||
|
||||
const UUID_RE = /^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$/i;
|
||||
|
||||
describe("HermesAcpClient", () => {
|
||||
let client: HermesAcpClient;
|
||||
|
||||
beforeEach(() => {
|
||||
client = new HermesAcpClient();
|
||||
});
|
||||
|
||||
afterEach(async () => {
|
||||
await client.close();
|
||||
});
|
||||
|
||||
it(
|
||||
"connect() returns a UUID sessionId",
|
||||
async () => {
|
||||
const sessionId = await client.connect(process.cwd());
|
||||
expect(typeof sessionId).toBe("string");
|
||||
expect(sessionId).toMatch(UUID_RE);
|
||||
},
|
||||
{ timeout: 2 * 60 * 1000 },
|
||||
);
|
||||
|
||||
it(
|
||||
"prompt() returns a non-empty text response",
|
||||
async () => {
|
||||
await client.connect(process.cwd());
|
||||
const result = await client.prompt("Reply with exactly the word: PONG");
|
||||
expect(typeof result.text).toBe("string");
|
||||
expect(result.text.length).toBeGreaterThan(0);
|
||||
expect(typeof result.sessionId).toBe("string");
|
||||
expect(result.sessionId).toMatch(UUID_RE);
|
||||
},
|
||||
{ timeout: 2 * 60 * 1000 },
|
||||
);
|
||||
|
||||
it(
|
||||
"prompt() can be called twice on the same session (resume)",
|
||||
async () => {
|
||||
await client.connect(process.cwd());
|
||||
|
||||
const first = await client.prompt("Say the word ALPHA and nothing else.");
|
||||
expect(first.text.length).toBeGreaterThan(0);
|
||||
|
||||
const second = await client.prompt("Now say the word BETA and nothing else.");
|
||||
expect(second.text.length).toBeGreaterThan(0);
|
||||
|
||||
expect(first.sessionId).toBe(second.sessionId);
|
||||
},
|
||||
{ timeout: 2 * 60 * 1000 },
|
||||
);
|
||||
});
|
||||
@@ -1,56 +0,0 @@
|
||||
import { afterEach, describe, expect, it } from "vitest";
|
||||
import { HermesAcpClient } from "../../src/acp-client.js";
|
||||
|
||||
/**
|
||||
* E2E test for cross-process session resume.
|
||||
*
|
||||
* Simulates the workflow re-entry scenario:
|
||||
* 1. Client A: connect → prompt → close (developer first run)
|
||||
* 2. Client B: resume(sessionId) → prompt (developer re-entry after reviewer reject)
|
||||
*
|
||||
* This is what happens when uwf thread step spawns uwf-hermes twice for the same role.
|
||||
*/
|
||||
describe("HermesAcpClient cross-process resume", () => {
|
||||
const clients: HermesAcpClient[] = [];
|
||||
|
||||
afterEach(async () => {
|
||||
for (const c of clients) {
|
||||
await c.close();
|
||||
}
|
||||
clients.length = 0;
|
||||
});
|
||||
|
||||
// TODO(#435): flaky — depends on live LLM; mock or move to integration suite
|
||||
it.skip(
|
||||
"resume() after close — second prompt returns non-empty text",
|
||||
async () => {
|
||||
// --- Client A: first run ---
|
||||
const clientA = new HermesAcpClient();
|
||||
clients.push(clientA);
|
||||
|
||||
await clientA.connect(process.cwd());
|
||||
const first = await clientA.prompt(
|
||||
"Remember the secret code: WATERMELON. Reply with exactly: ACKNOWLEDGED",
|
||||
);
|
||||
expect(first.text.length).toBeGreaterThan(0);
|
||||
const sessionId = first.sessionId;
|
||||
|
||||
// Close client A (simulates uwf-hermes process exit)
|
||||
await clientA.close();
|
||||
|
||||
// --- Client B: resume (simulates re-entry) ---
|
||||
const clientB = new HermesAcpClient();
|
||||
clients.push(clientB);
|
||||
|
||||
await clientB.resume(sessionId, process.cwd());
|
||||
const second = await clientB.prompt(
|
||||
"What was the secret code I told you earlier? Reply with just the code word.",
|
||||
);
|
||||
|
||||
// The critical assertion: resumed session produces non-empty output
|
||||
expect(second.text.length).toBeGreaterThan(0);
|
||||
expect(second.sessionId).toBe(sessionId);
|
||||
},
|
||||
{ timeout: 3 * 60 * 1000 },
|
||||
);
|
||||
});
|
||||
@@ -15,7 +15,8 @@ describe("Issue #551 — bin entry & engines", () => {
|
||||
const pkg = JSON.parse(readFileSync(join(PKG_ROOT, "package.json"), "utf-8"));
|
||||
const binPath = pkg.bin["uwf-hermes"];
|
||||
const content = readFileSync(join(PKG_ROOT, binPath), "utf-8");
|
||||
expect(content.startsWith("#!/usr/bin/env node")).toBe(true);
|
||||
expect(content.startsWith("#!/usr/bin/env")).toBe(true);
|
||||
expect(content).toContain("node");
|
||||
});
|
||||
|
||||
test("README.md explains uwf-hermes is an adapter", () => {
|
||||
|
||||
@@ -140,7 +140,9 @@ function createTestDb(dbPath: string): TestDb {
|
||||
db.exec(`CREATE TABLE sessions (
|
||||
id TEXT PRIMARY KEY,
|
||||
model TEXT NOT NULL,
|
||||
started_at INTEGER NOT NULL
|
||||
started_at INTEGER NOT NULL,
|
||||
input_tokens INTEGER DEFAULT 0,
|
||||
output_tokens INTEGER DEFAULT 0
|
||||
)`);
|
||||
db.exec(`CREATE TABLE messages (
|
||||
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
||||
|
||||
@@ -0,0 +1,122 @@
|
||||
import { describe, expect, test } from "vitest";
|
||||
import type { AcpUsage } from "../src/acp-client.js";
|
||||
import { buildUsage, snapshotTurns } from "../src/hermes.js";
|
||||
import type { HermesSessionJson } from "../src/types.js";
|
||||
|
||||
function makeSession(overrides: Partial<HermesSessionJson> = {}): HermesSessionJson {
|
||||
return {
|
||||
session_id: "test-session",
|
||||
model: "test-model",
|
||||
session_start: "2026-01-01T00:00:00Z",
|
||||
messages: [],
|
||||
inputTokens: 0,
|
||||
outputTokens: 0,
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
describe("snapshotTurns", () => {
|
||||
test("returns zero for null session", () => {
|
||||
const result = snapshotTurns(null);
|
||||
expect(result).toEqual({ turns: 0 });
|
||||
});
|
||||
|
||||
test("returns zero for empty session", () => {
|
||||
const result = snapshotTurns(makeSession());
|
||||
expect(result).toEqual({ turns: 0 });
|
||||
});
|
||||
|
||||
test("counts assistant messages as turns", () => {
|
||||
const result = snapshotTurns(
|
||||
makeSession({
|
||||
messages: [
|
||||
{ role: "user", content: "hello", reasoning: null, tool_calls: null },
|
||||
{ role: "assistant", content: "hi", reasoning: null, tool_calls: null },
|
||||
{ role: "user", content: "do X", reasoning: null, tool_calls: null },
|
||||
{ role: "tool", content: "result", reasoning: null, tool_calls: null },
|
||||
{ role: "assistant", content: "done", reasoning: null, tool_calls: null },
|
||||
],
|
||||
inputTokens: 1000,
|
||||
outputTokens: 500,
|
||||
}),
|
||||
);
|
||||
expect(result).toEqual({ turns: 2 });
|
||||
});
|
||||
|
||||
test("ignores non-assistant messages for turn count", () => {
|
||||
const result = snapshotTurns(
|
||||
makeSession({
|
||||
messages: [
|
||||
{ role: "user", content: "hello", reasoning: null, tool_calls: null },
|
||||
{ role: "tool", content: "result", reasoning: null, tool_calls: null },
|
||||
],
|
||||
}),
|
||||
);
|
||||
expect(result.turns).toBe(0);
|
||||
});
|
||||
});
|
||||
|
||||
describe("buildUsage", () => {
|
||||
const acpUsage: AcpUsage = { inputTokens: 5000, outputTokens: 2000, totalTokens: 7000 };
|
||||
|
||||
test("first visit: tokens from ACP, turns from DB delta", () => {
|
||||
const beforeTurns = { turns: 0 };
|
||||
const afterTurns = { turns: 3 };
|
||||
const result = buildUsage(acpUsage, beforeTurns, afterTurns, 12.5);
|
||||
expect(result).toEqual({
|
||||
turns: 3,
|
||||
inputTokens: 5000,
|
||||
outputTokens: 2000,
|
||||
duration: 13,
|
||||
});
|
||||
});
|
||||
|
||||
test("re-entry: turn delta computed correctly, tokens from ACP", () => {
|
||||
const beforeTurns = { turns: 2 };
|
||||
const afterTurns = { turns: 4 };
|
||||
const acpDelta: AcpUsage = { inputTokens: 8000, outputTokens: 3500, totalTokens: 11500 };
|
||||
const result = buildUsage(acpDelta, beforeTurns, afterTurns, 7.3);
|
||||
expect(result).toEqual({
|
||||
turns: 2,
|
||||
inputTokens: 8000,
|
||||
outputTokens: 3500,
|
||||
duration: 7,
|
||||
});
|
||||
});
|
||||
|
||||
test("floors negative turn deltas at 0, then defaults to 1", () => {
|
||||
const beforeTurns = { turns: 5 };
|
||||
const afterTurns = { turns: 3 };
|
||||
const result = buildUsage(acpUsage, beforeTurns, afterTurns, 1.0);
|
||||
// turns would be negative (-2), floored to 0, then || 1 gives 1
|
||||
expect(result.turns).toBe(1);
|
||||
});
|
||||
|
||||
test("zero turns delta defaults to 1 (at least one turn happened)", () => {
|
||||
const beforeTurns = { turns: 3 };
|
||||
const afterTurns = { turns: 3 };
|
||||
const result = buildUsage(acpUsage, beforeTurns, afterTurns, 5.0);
|
||||
// turns delta is 0, || 1 gives 1
|
||||
expect(result.turns).toBe(1);
|
||||
});
|
||||
|
||||
test("null ACP usage yields zero tokens", () => {
|
||||
const beforeTurns = { turns: 0 };
|
||||
const afterTurns = { turns: 2 };
|
||||
const result = buildUsage(null, beforeTurns, afterTurns, 10.0);
|
||||
expect(result).toEqual({
|
||||
turns: 2,
|
||||
inputTokens: 0,
|
||||
outputTokens: 0,
|
||||
duration: 10,
|
||||
});
|
||||
});
|
||||
|
||||
test("duration is rounded", () => {
|
||||
const beforeTurns = { turns: 0 };
|
||||
const afterTurns = { turns: 1 };
|
||||
expect(buildUsage(acpUsage, beforeTurns, afterTurns, 3.7).duration).toBe(4);
|
||||
expect(buildUsage(acpUsage, beforeTurns, afterTurns, 3.2).duration).toBe(3);
|
||||
expect(buildUsage(acpUsage, beforeTurns, afterTurns, 0.0).duration).toBe(0);
|
||||
});
|
||||
});
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@united-workforce/agent-hermes",
|
||||
"version": "0.5.0",
|
||||
"version": "0.1.5",
|
||||
"files": [
|
||||
"src",
|
||||
"dist",
|
||||
@@ -8,7 +8,7 @@
|
||||
],
|
||||
"type": "module",
|
||||
"bin": {
|
||||
"uwf-hermes": "./src/cli.ts"
|
||||
"uwf-hermes": "./dist/cli.js"
|
||||
},
|
||||
"exports": {
|
||||
".": {
|
||||
@@ -17,12 +17,11 @@
|
||||
}
|
||||
},
|
||||
"scripts": {
|
||||
"prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1",
|
||||
"test": "vitest run __tests__/",
|
||||
"test:ci": "vitest run __tests__/"
|
||||
},
|
||||
"dependencies": {
|
||||
"@ocas/core": "^0.3.0",
|
||||
"@ocas/core": "^0.4.0",
|
||||
"@united-workforce/protocol": "workspace:^",
|
||||
"@united-workforce/util": "workspace:^",
|
||||
"@united-workforce/util-agent": "workspace:^"
|
||||
|
||||
@@ -1,8 +1,22 @@
|
||||
import type { ChildProcess } from "node:child_process";
|
||||
import { spawn } from "node:child_process";
|
||||
import { readFileSync } from "node:fs";
|
||||
import { dirname, join } from "node:path";
|
||||
import { createInterface } from "node:readline";
|
||||
import { fileURLToPath } from "node:url";
|
||||
|
||||
const HERMES_COMMAND = "hermes";
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
const OWN_VERSION = (
|
||||
JSON.parse(readFileSync(join(__dirname, "..", "package.json"), "utf-8")) as {
|
||||
version: string;
|
||||
}
|
||||
).version;
|
||||
|
||||
/** Resolve hermes binary: `UWF_HERMES_BIN` override → default `"hermes"` via PATH. */
|
||||
function resolveHermesCommand(): string {
|
||||
const override = process.env.UWF_HERMES_BIN;
|
||||
return override !== undefined && override !== "" ? override : "hermes";
|
||||
}
|
||||
const PROTOCOL_VERSION = 1;
|
||||
|
||||
type JsonRpcResponse = {
|
||||
@@ -17,9 +31,17 @@ type PendingRequest = {
|
||||
reject: (reason: Error) => void;
|
||||
};
|
||||
|
||||
/** Token usage returned by ACP PromptResponse. */
|
||||
export type AcpUsage = {
|
||||
inputTokens: number;
|
||||
outputTokens: number;
|
||||
totalTokens: number;
|
||||
};
|
||||
|
||||
export type AcpPromptResult = {
|
||||
text: string;
|
||||
sessionId: string;
|
||||
usage: AcpUsage | null;
|
||||
};
|
||||
|
||||
export class HermesAcpClient {
|
||||
@@ -72,6 +94,11 @@ export class HermesAcpClient {
|
||||
return sessionId;
|
||||
}
|
||||
|
||||
/** Return the current session ID, or null if not connected. */
|
||||
getSessionId(): string | null {
|
||||
return this.sessionId;
|
||||
}
|
||||
|
||||
/** Send prompt and collect final assistant text from ACP stream chunks. */
|
||||
async prompt(text: string): Promise<AcpPromptResult> {
|
||||
if (this.sessionId === null) {
|
||||
@@ -91,9 +118,25 @@ export class HermesAcpClient {
|
||||
);
|
||||
}
|
||||
|
||||
// Extract token usage from ACP PromptResponse.result.usage (camelCase wire format)
|
||||
const result = (response as { result?: Record<string, unknown> }).result;
|
||||
const rawUsage = result?.usage as Record<string, unknown> | undefined;
|
||||
const usage: AcpUsage | null =
|
||||
rawUsage !== undefined &&
|
||||
typeof rawUsage.inputTokens === "number" &&
|
||||
typeof rawUsage.outputTokens === "number" &&
|
||||
typeof rawUsage.totalTokens === "number"
|
||||
? {
|
||||
inputTokens: rawUsage.inputTokens,
|
||||
outputTokens: rawUsage.outputTokens,
|
||||
totalTokens: rawUsage.totalTokens,
|
||||
}
|
||||
: null;
|
||||
|
||||
return {
|
||||
text: this.messageChunks.join(""),
|
||||
sessionId: this.sessionId,
|
||||
usage,
|
||||
};
|
||||
}
|
||||
|
||||
@@ -232,7 +275,8 @@ export class HermesAcpClient {
|
||||
return;
|
||||
}
|
||||
|
||||
const child = spawn(HERMES_COMMAND, ["acp"], {
|
||||
const hermesCommand = resolveHermesCommand();
|
||||
const child = spawn(hermesCommand, ["acp"], {
|
||||
env: process.env,
|
||||
shell: false,
|
||||
stdio: ["pipe", "pipe", "pipe"],
|
||||
@@ -270,7 +314,7 @@ export class HermesAcpClient {
|
||||
private async initialize(): Promise<void> {
|
||||
const initResponse = await this.sendRequest("initialize", {
|
||||
protocolVersion: PROTOCOL_VERSION,
|
||||
clientInfo: { name: "uwf", version: "0.1.0" },
|
||||
clientInfo: { name: "uwf-hermes", version: OWN_VERSION },
|
||||
capabilities: {},
|
||||
});
|
||||
|
||||
|
||||
@@ -1,4 +1,11 @@
|
||||
#!/usr/bin/env node
|
||||
#!/usr/bin/env -S node --disable-warning=ExperimentalWarning
|
||||
|
||||
// eslint-disable-next-line -- dynamic import for version
|
||||
const pkg = await import("../package.json", { with: { type: "json" } });
|
||||
if (process.argv.includes("--version") || process.argv.includes("-V")) {
|
||||
process.stdout.write(`${pkg.default.version}\n`);
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
import { createHermesAgent } from "./hermes.js";
|
||||
import { isResumeDisabled } from "./session-cache.js";
|
||||
|
||||
@@ -1,19 +1,59 @@
|
||||
import type { Store } from "@ocas/core";
|
||||
import type { Usage } from "@united-workforce/protocol";
|
||||
import { createLogger } from "@united-workforce/util";
|
||||
import {
|
||||
type AgentContext,
|
||||
type AgentRunResult,
|
||||
buildContinuationPrompt,
|
||||
buildFrontmatterRetryPrompt,
|
||||
buildRolePrompt,
|
||||
buildThreadProgress,
|
||||
createAgent,
|
||||
} from "@united-workforce/util-agent";
|
||||
|
||||
import type { AcpUsage } from "./acp-client.js";
|
||||
import { HermesAcpClient } from "./acp-client.js";
|
||||
import { getCachedSessionId, setCachedSessionId } from "./session-cache.js";
|
||||
import { loadHermesSession, storeHermesSessionDetail } from "./session-detail.js";
|
||||
import type { HermesSessionJson } from "./types.js";
|
||||
|
||||
const log = createLogger({ sink: { kind: "stderr" } });
|
||||
|
||||
/** Snapshot of session metrics taken before and after a prompt call. */
|
||||
type TurnsSnapshot = {
|
||||
turns: number;
|
||||
};
|
||||
|
||||
const ZERO_TURNS: TurnsSnapshot = { turns: 0 };
|
||||
|
||||
/** Extract assistant turn count from a session. Returns zero for null sessions. */
|
||||
export function snapshotTurns(session: HermesSessionJson | null): TurnsSnapshot {
|
||||
if (session === null) {
|
||||
return ZERO_TURNS;
|
||||
}
|
||||
return {
|
||||
turns: session.messages.filter((m) => m.role === "assistant").length,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Build Usage from ACP token data + DB turn delta.
|
||||
* Tokens come from ACP PromptResponse (synchronous, accurate).
|
||||
* Turns come from DB before/after snapshots (may have WAL lag, but acceptable).
|
||||
*/
|
||||
export function buildUsage(
|
||||
acpUsage: AcpUsage | null,
|
||||
beforeTurns: TurnsSnapshot,
|
||||
afterTurns: TurnsSnapshot,
|
||||
durationSec: number,
|
||||
): Usage {
|
||||
return {
|
||||
turns: Math.max(0, afterTurns.turns - beforeTurns.turns) || 1,
|
||||
inputTokens: acpUsage?.inputTokens ?? 0,
|
||||
outputTokens: acpUsage?.outputTokens ?? 0,
|
||||
duration: Math.round(durationSec),
|
||||
};
|
||||
}
|
||||
|
||||
/** Assemble system prompt, task, and prior step outputs for Hermes. */
|
||||
export function buildHermesPrompt(ctx: AgentContext): string {
|
||||
const parts: string[] = [];
|
||||
@@ -22,6 +62,9 @@ export function buildHermesPrompt(ctx: AgentContext): string {
|
||||
parts.push(ctx.outputFormatInstruction, "");
|
||||
}
|
||||
|
||||
// Inject thread progress so the agent knows step count and role visit count
|
||||
parts.push(buildThreadProgress(ctx.steps, ctx.role), "");
|
||||
|
||||
if (!ctx.isFirstVisit) {
|
||||
// Re-entry: show only steps since last visit, meta only
|
||||
parts.push(buildContinuationPrompt(ctx.steps, ctx.role, ctx.edgePrompt));
|
||||
@@ -60,6 +103,8 @@ async function storePromptResult(store: Store, sessionId: string): Promise<{ det
|
||||
type PromptAttempt = {
|
||||
useContinuation: boolean;
|
||||
resumed: boolean;
|
||||
/** True when resuming after a frontmatter-only failure (isFirstVisit + cache hit). */
|
||||
frontmatterRetry: boolean;
|
||||
};
|
||||
|
||||
async function prepareSession(
|
||||
@@ -68,28 +113,36 @@ async function prepareSession(
|
||||
cwd: string,
|
||||
resumeDisabled: boolean,
|
||||
): Promise<PromptAttempt> {
|
||||
if (ctx.isFirstVisit || resumeDisabled) {
|
||||
if (resumeDisabled) {
|
||||
await client.connect(cwd);
|
||||
return { useContinuation: false, resumed: false };
|
||||
return { useContinuation: false, resumed: false, frontmatterRetry: false };
|
||||
}
|
||||
|
||||
// Check session cache regardless of isFirstVisit. A previous run may
|
||||
// have completed and cached its session but failed frontmatter
|
||||
// validation — the step never got written to CAS so isFirstVisit is
|
||||
// still true, yet we should resume the existing session.
|
||||
const cachedSessionId = await getCachedSessionId(ctx.threadId, ctx.role, ctx.storageRoot);
|
||||
if (cachedSessionId === null) {
|
||||
log("6RWK3N8Q", `no cached session for ${ctx.threadId}:${ctx.role}, starting new session`);
|
||||
await client.connect(cwd);
|
||||
return { useContinuation: false, resumed: false };
|
||||
return { useContinuation: false, resumed: false, frontmatterRetry: false };
|
||||
}
|
||||
|
||||
try {
|
||||
await client.resume(cachedSessionId, cwd);
|
||||
log("9MHT4V2P", `resumed hermes session ${cachedSessionId} for ${ctx.threadId}:${ctx.role}`);
|
||||
return { useContinuation: true, resumed: true };
|
||||
return {
|
||||
useContinuation: !ctx.isFirstVisit,
|
||||
resumed: true,
|
||||
frontmatterRetry: ctx.isFirstVisit,
|
||||
};
|
||||
} catch (error) {
|
||||
const message = error instanceof Error ? error.message : String(error);
|
||||
log("3XPN7K4W", `session resume failed, falling back to new session: ${message}`);
|
||||
await client.close();
|
||||
await client.connect(cwd);
|
||||
return { useContinuation: false, resumed: false };
|
||||
return { useContinuation: false, resumed: false, frontmatterRetry: false };
|
||||
}
|
||||
}
|
||||
|
||||
@@ -108,25 +161,48 @@ export function createHermesAgent(resumeDisabled: boolean): () => Promise<void>
|
||||
void client.close();
|
||||
});
|
||||
|
||||
async function runPrompt(ctx: AgentContext, useContinuation: boolean): Promise<AgentRunResult> {
|
||||
const effectiveCtx = useContinuation ? ctx : { ...ctx, isFirstVisit: true };
|
||||
const fullPrompt = buildHermesPrompt(effectiveCtx);
|
||||
const { text, sessionId } = await client.prompt(fullPrompt);
|
||||
async function runPrompt(
|
||||
ctx: AgentContext,
|
||||
useContinuation: boolean,
|
||||
beforeTurns: TurnsSnapshot,
|
||||
frontmatterRetry: boolean,
|
||||
): Promise<AgentRunResult> {
|
||||
// Frontmatter retry: session has full context, just re-output the format.
|
||||
const fullPrompt = frontmatterRetry
|
||||
? buildFrontmatterRetryPrompt(ctx.outputFormatInstruction)
|
||||
: buildHermesPrompt(useContinuation ? ctx : { ...ctx, isFirstVisit: true });
|
||||
const startMs = Date.now();
|
||||
const { text, sessionId, usage: acpUsage } = await client.prompt(fullPrompt);
|
||||
const durationSec = (Date.now() - startMs) / 1000;
|
||||
const { detailHash } = await storePromptResult(ctx.store, sessionId);
|
||||
|
||||
if (!resumeDisabled) {
|
||||
await setCachedSessionId(ctx.threadId, ctx.role, sessionId, ctx.storageRoot);
|
||||
}
|
||||
|
||||
return { output: text, detailHash, sessionId, assembledPrompt: fullPrompt };
|
||||
// Turns from DB (may lag slightly due to WAL, but acceptable)
|
||||
const afterSession = await loadHermesSession(sessionId);
|
||||
const afterTurns = snapshotTurns(afterSession);
|
||||
const usage = buildUsage(acpUsage, beforeTurns, afterTurns, durationSec);
|
||||
|
||||
return { output: text, detailHash, sessionId, assembledPrompt: fullPrompt, usage };
|
||||
}
|
||||
|
||||
async function runHermes(ctx: AgentContext): Promise<AgentRunResult> {
|
||||
const cwd = process.cwd();
|
||||
const attempt = await prepareSession(client, ctx, cwd, resumeDisabled);
|
||||
|
||||
// Snapshot before prompt: for resumed sessions, captures cumulative state
|
||||
// so we can compute the turn delta. For new sessions, this is ZERO_TURNS.
|
||||
const currentSessionId = client.getSessionId();
|
||||
const beforeSession =
|
||||
attempt.resumed && currentSessionId !== null
|
||||
? await loadHermesSession(currentSessionId)
|
||||
: null;
|
||||
const beforeTurns = snapshotTurns(beforeSession);
|
||||
|
||||
try {
|
||||
return await runPrompt(ctx, attempt.useContinuation);
|
||||
return await runPrompt(ctx, attempt.useContinuation, beforeTurns, attempt.frontmatterRetry);
|
||||
} catch (error) {
|
||||
if (!attempt.resumed) {
|
||||
throw error;
|
||||
@@ -136,7 +212,8 @@ export function createHermesAgent(resumeDisabled: boolean): () => Promise<void>
|
||||
log("8FQW2R6N", `continuation prompt failed, retrying with initial prompt: ${message}`);
|
||||
await client.close();
|
||||
await client.connect(cwd);
|
||||
return runPrompt(ctx, false);
|
||||
// Fresh session after retry — reset snapshot to zero
|
||||
return runPrompt(ctx, false, ZERO_TURNS, false);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -147,9 +224,22 @@ export function createHermesAgent(resumeDisabled: boolean): () => Promise<void>
|
||||
): Promise<AgentRunResult> {
|
||||
// Client is already connected from runHermes — same ACP session,
|
||||
// so the agent sees the full conversation history (crucial for retries).
|
||||
const { text, sessionId } = await client.prompt(message);
|
||||
// Snapshot turns before the continuation prompt for delta computation.
|
||||
const currentSessionId = client.getSessionId();
|
||||
const beforeSession =
|
||||
currentSessionId !== null ? await loadHermesSession(currentSessionId) : null;
|
||||
const beforeTurns = snapshotTurns(beforeSession);
|
||||
|
||||
const startMs = Date.now();
|
||||
const { text, sessionId, usage: acpUsage } = await client.prompt(message);
|
||||
const durationSec = (Date.now() - startMs) / 1000;
|
||||
const { detailHash } = await storePromptResult(store, sessionId);
|
||||
return { output: text, detailHash, sessionId, assembledPrompt: "" };
|
||||
|
||||
const afterSession = await loadHermesSession(sessionId);
|
||||
const afterTurns = snapshotTurns(afterSession);
|
||||
const usage = buildUsage(acpUsage, beforeTurns, afterTurns, durationSec);
|
||||
|
||||
return { output: text, detailHash, sessionId, assembledPrompt: "", usage };
|
||||
}
|
||||
|
||||
const agentMain = createAgent({
|
||||
|
||||
@@ -1,2 +1,8 @@
|
||||
export type { AcpUsage } from "./acp-client.js";
|
||||
export { HermesAcpClient } from "./acp-client.js";
|
||||
export { buildHermesPrompt, createHermesAgent } from "./hermes.js";
|
||||
export {
|
||||
buildHermesPrompt,
|
||||
buildUsage,
|
||||
createHermesAgent,
|
||||
snapshotTurns,
|
||||
} from "./hermes.js";
|
||||
|
||||
@@ -106,7 +106,7 @@ function parseSessionJson(raw: unknown): HermesSessionJson | null {
|
||||
messages.push(msg);
|
||||
}
|
||||
}
|
||||
return { session_id, model, session_start, messages };
|
||||
return { session_id, model, session_start, messages, inputTokens: 0, outputTokens: 0 };
|
||||
}
|
||||
|
||||
export function getHermesDbPath(): string {
|
||||
@@ -117,6 +117,8 @@ type DbSessionRow = {
|
||||
id: string;
|
||||
model: string;
|
||||
started_at: number;
|
||||
input_tokens: number;
|
||||
output_tokens: number;
|
||||
};
|
||||
|
||||
type DbMessageRow = {
|
||||
@@ -156,7 +158,9 @@ export function loadHermesSessionFromDb(
|
||||
try {
|
||||
db = new DatabaseSync(resolvedPath, { readOnly: true });
|
||||
const session = db
|
||||
.prepare("SELECT id, model, started_at FROM sessions WHERE id = ?")
|
||||
.prepare(
|
||||
"SELECT id, model, started_at, input_tokens, output_tokens FROM sessions WHERE id = ?",
|
||||
)
|
||||
.get(sessionId) as DbSessionRow | null;
|
||||
if (session === null) {
|
||||
return null;
|
||||
@@ -181,6 +185,8 @@ export function loadHermesSessionFromDb(
|
||||
model: session.model,
|
||||
session_start: new Date(session.started_at * 1000).toISOString(),
|
||||
messages,
|
||||
inputTokens: session.input_tokens ?? 0,
|
||||
outputTokens: session.output_tokens ?? 0,
|
||||
};
|
||||
} catch {
|
||||
return null;
|
||||
|
||||
@@ -40,4 +40,6 @@ export type HermesSessionJson = {
|
||||
model: string;
|
||||
session_start: string;
|
||||
messages: HermesSessionMessage[];
|
||||
inputTokens: number;
|
||||
outputTokens: number;
|
||||
};
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@united-workforce/agent-mock",
|
||||
"version": "0.5.0",
|
||||
"version": "0.1.2",
|
||||
"files": [
|
||||
"src",
|
||||
"dist",
|
||||
@@ -17,12 +17,11 @@
|
||||
}
|
||||
},
|
||||
"scripts": {
|
||||
"prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1",
|
||||
"test": "vitest run __tests__/",
|
||||
"test:ci": "vitest run __tests__/"
|
||||
},
|
||||
"dependencies": {
|
||||
"@ocas/core": "^0.3.0",
|
||||
"@ocas/core": "^0.4.0",
|
||||
"@united-workforce/protocol": "workspace:^",
|
||||
"@united-workforce/util": "workspace:^",
|
||||
"@united-workforce/util-agent": "workspace:^",
|
||||
|
||||
@@ -1,4 +1,11 @@
|
||||
#!/usr/bin/env node
|
||||
#!/usr/bin/env -S node --disable-warning=ExperimentalWarning
|
||||
|
||||
// eslint-disable-next-line -- dynamic import for version
|
||||
const pkg = await import("../package.json", { with: { type: "json" } });
|
||||
if (process.argv.includes("--version") || process.argv.includes("-V")) {
|
||||
process.stdout.write(`${pkg.default.version}\n`);
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
import { createMockAgent } from "./mock-agent.js";
|
||||
|
||||
|
||||
@@ -103,6 +103,7 @@ export function createMockAgent(mockDataPath: string): () => Promise<void> {
|
||||
detailHash,
|
||||
sessionId,
|
||||
assembledPrompt: "",
|
||||
usage: { turns: 1, inputTokens: 0, outputTokens: 0, duration: 0 },
|
||||
};
|
||||
lastResult = result;
|
||||
return result;
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
# @united-workforce/cli
|
||||
|
||||
## 0.1.1
|
||||
|
||||
### Patch Changes
|
||||
|
||||
- 850a3b2: fix: resolve --agent override via config alias before raw command
|
||||
|
||||
`resolveAgentConfig()` now checks `config.agents[alias]` first before falling back to `parseAgentOverride()`. Eval CLI default `--agent` changed from `"hermes"` to `"uwf-hermes"`.
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@united-workforce/cli",
|
||||
"version": "0.5.0",
|
||||
"version": "0.3.0",
|
||||
"files": [
|
||||
"src",
|
||||
"dist",
|
||||
@@ -11,8 +11,8 @@
|
||||
"uwf": "./dist/cli.js"
|
||||
},
|
||||
"dependencies": {
|
||||
"@ocas/core": "^0.3.0",
|
||||
"@ocas/fs": "^0.3.0",
|
||||
"@ocas/core": "^0.4.0",
|
||||
"@ocas/fs": "^0.4.0",
|
||||
"@united-workforce/protocol": "workspace:^",
|
||||
"@united-workforce/util": "workspace:^",
|
||||
"@united-workforce/util-agent": "workspace:^",
|
||||
@@ -22,7 +22,6 @@
|
||||
"yaml": "^2.8.4"
|
||||
},
|
||||
"scripts": {
|
||||
"prepublishOnly": "echo 'Use pnpm run release from repo root' && exit 1",
|
||||
"test": "vitest run src/",
|
||||
"test:ci": "vitest run src/"
|
||||
},
|
||||
|
||||
@@ -58,7 +58,10 @@ describe("C1: adapter JSON round-trip integration", () => {
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: { _: { role: "worker", prompt: "Do the work", location: null } },
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "Do the work", location: null },
|
||||
resume: { role: "worker", prompt: "Resume the work", location: null },
|
||||
},
|
||||
worker: { done: { role: "$END", prompt: "completed", location: null } },
|
||||
},
|
||||
});
|
||||
|
||||
@@ -6,13 +6,7 @@ import type { CasRef, ThreadId } from "@united-workforce/protocol";
|
||||
import { describe, expect, test } from "vitest";
|
||||
import { createMarker, deleteMarker } from "../background/index.js";
|
||||
import { cmdThreadList, cmdThreadShow, cmdThreadStart } from "../commands/thread.js";
|
||||
import {
|
||||
addHistoryEntry,
|
||||
createUwfStore,
|
||||
deleteThread,
|
||||
loadAllThreads,
|
||||
setThread,
|
||||
} from "../store.js";
|
||||
import { completeThread, createUwfStore, loadActiveThreads, setThread } from "../store.js";
|
||||
|
||||
const OUTPUT_SCHEMA = {
|
||||
type: "object" as const,
|
||||
@@ -34,9 +28,13 @@ roles:
|
||||
$status: "ready"
|
||||
frontmatter:
|
||||
type: object
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: "ready" }
|
||||
required: ["$status"]
|
||||
- properties:
|
||||
$status: { const: "not-ready" }
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { type: string, enum: ["ready", "not-ready"] }
|
||||
roleB:
|
||||
description: Second role
|
||||
goal: Do B
|
||||
@@ -48,13 +46,17 @@ roles:
|
||||
type: object
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { type: string }
|
||||
$status: { const: "done" }
|
||||
graph:
|
||||
$START:
|
||||
_:
|
||||
new:
|
||||
role: roleA
|
||||
prompt: "Do A"
|
||||
location: null
|
||||
resume:
|
||||
role: roleA
|
||||
prompt: "Resume A"
|
||||
location: null
|
||||
roleA:
|
||||
ready:
|
||||
role: roleB
|
||||
@@ -65,7 +67,7 @@ graph:
|
||||
prompt: "Try again"
|
||||
location: null
|
||||
roleB:
|
||||
_:
|
||||
done:
|
||||
role: $END
|
||||
prompt: "Done"
|
||||
location: null
|
||||
@@ -84,9 +86,13 @@ roles:
|
||||
$status: "pass"
|
||||
frontmatter:
|
||||
type: object
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: "pass" }
|
||||
required: ["$status"]
|
||||
- properties:
|
||||
$status: { const: "fail" }
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { type: string, enum: ["pass", "fail"] }
|
||||
roleB:
|
||||
description: Pass role
|
||||
goal: Do B
|
||||
@@ -98,7 +104,7 @@ roles:
|
||||
type: object
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { type: string }
|
||||
$status: { const: "done" }
|
||||
roleC:
|
||||
description: Fail role
|
||||
goal: Do C
|
||||
@@ -110,13 +116,17 @@ roles:
|
||||
type: object
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { type: string }
|
||||
$status: { const: "done" }
|
||||
graph:
|
||||
$START:
|
||||
_:
|
||||
new:
|
||||
role: roleA
|
||||
prompt: "Do A"
|
||||
location: null
|
||||
resume:
|
||||
role: roleA
|
||||
prompt: "Resume A"
|
||||
location: null
|
||||
roleA:
|
||||
pass:
|
||||
role: roleB
|
||||
@@ -127,12 +137,12 @@ graph:
|
||||
prompt: "Do C (fail)"
|
||||
location: null
|
||||
roleB:
|
||||
_:
|
||||
done:
|
||||
role: $END
|
||||
prompt: "Done"
|
||||
location: null
|
||||
roleC:
|
||||
_:
|
||||
done:
|
||||
role: $END
|
||||
prompt: "Done"
|
||||
location: null
|
||||
@@ -153,15 +163,19 @@ roles:
|
||||
type: object
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { type: string }
|
||||
$status: { const: "done" }
|
||||
graph:
|
||||
$START:
|
||||
_:
|
||||
new:
|
||||
role: worker
|
||||
prompt: "Work"
|
||||
location: null
|
||||
resume:
|
||||
role: worker
|
||||
prompt: "Resume work"
|
||||
location: null
|
||||
worker:
|
||||
_:
|
||||
done:
|
||||
role: $END
|
||||
prompt: "Done"
|
||||
location: null
|
||||
@@ -175,7 +189,7 @@ async function insertStepNode(
|
||||
outputPayload: Record<string, unknown>,
|
||||
): Promise<void> {
|
||||
const uwf = await createUwfStore(storageRoot);
|
||||
const index = loadAllThreads(uwf.varStore);
|
||||
const index = loadActiveThreads(uwf.varStore);
|
||||
const headEntry = index[threadId];
|
||||
if (headEntry === undefined) throw new Error(`thread ${threadId} not in index`);
|
||||
const head = headEntry.head;
|
||||
@@ -206,7 +220,13 @@ async function insertStepNode(
|
||||
assembledPrompt: null,
|
||||
})) as CasRef;
|
||||
|
||||
setThread(uwf.varStore, threadId, { head: stepHash, suspendedRole: null, suspendMessage: null });
|
||||
setThread(uwf.varStore, threadId, {
|
||||
head: stepHash,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
}
|
||||
|
||||
describe("currentRole field", () => {
|
||||
@@ -282,19 +302,12 @@ describe("currentRole field", () => {
|
||||
try {
|
||||
const wf = join(tmpDir, "test-current-role.yaml");
|
||||
await writeFile(wf, SIMPLE_WORKFLOW_YAML, "utf8");
|
||||
const { thread, workflow } = await cmdThreadStart(storageRoot, wf, "test", tmpDir);
|
||||
const { thread } = await cmdThreadStart(storageRoot, wf, "test", tmpDir);
|
||||
const tid = thread as ThreadId;
|
||||
|
||||
const uwfForIndex = await createUwfStore(storageRoot);
|
||||
const head = loadAllThreads(uwfForIndex.varStore)[tid]!.head;
|
||||
deleteThread(uwfForIndex.varStore, tid);
|
||||
addHistoryEntry(uwfForIndex.varStore, {
|
||||
thread: tid,
|
||||
workflow,
|
||||
head,
|
||||
completedAt: Date.now(),
|
||||
reason: "completed",
|
||||
});
|
||||
loadActiveThreads(uwfForIndex.varStore)[tid]!.head;
|
||||
completeThread(uwfForIndex.varStore, tid, "completed");
|
||||
|
||||
const result = await cmdThreadShow(storageRoot, tid);
|
||||
expect(result.status).toBe("completed");
|
||||
@@ -310,19 +323,12 @@ describe("currentRole field", () => {
|
||||
try {
|
||||
const wf = join(tmpDir, "test-current-role.yaml");
|
||||
await writeFile(wf, SIMPLE_WORKFLOW_YAML, "utf8");
|
||||
const { thread, workflow } = await cmdThreadStart(storageRoot, wf, "test", tmpDir);
|
||||
const { thread } = await cmdThreadStart(storageRoot, wf, "test", tmpDir);
|
||||
const tid = thread as ThreadId;
|
||||
|
||||
const uwfForIndex = await createUwfStore(storageRoot);
|
||||
const head = loadAllThreads(uwfForIndex.varStore)[tid]!.head;
|
||||
deleteThread(uwfForIndex.varStore, tid);
|
||||
addHistoryEntry(uwfForIndex.varStore, {
|
||||
thread: tid,
|
||||
workflow,
|
||||
head,
|
||||
completedAt: Date.now(),
|
||||
reason: "cancelled",
|
||||
});
|
||||
loadActiveThreads(uwfForIndex.varStore)[tid]!.head;
|
||||
completeThread(uwfForIndex.varStore, tid, "cancelled");
|
||||
|
||||
const result = await cmdThreadShow(storageRoot, tid);
|
||||
expect(result.status).toBe("cancelled");
|
||||
@@ -375,15 +381,8 @@ describe("currentRole field", () => {
|
||||
const comp = await cmdThreadStart(storageRoot, wf, "completed", tmpDir);
|
||||
const compId = comp.thread as ThreadId;
|
||||
const uwfForIndex = await createUwfStore(storageRoot);
|
||||
const compHead = loadAllThreads(uwfForIndex.varStore)[compId]!.head;
|
||||
deleteThread(uwfForIndex.varStore, compId);
|
||||
addHistoryEntry(uwfForIndex.varStore, {
|
||||
thread: compId,
|
||||
workflow: comp.workflow,
|
||||
head: compHead,
|
||||
completedAt: Date.now(),
|
||||
reason: "completed",
|
||||
});
|
||||
const _compHead = loadActiveThreads(uwfForIndex.varStore)[compId]!.head;
|
||||
completeThread(uwfForIndex.varStore, compId, "completed");
|
||||
|
||||
const list = await cmdThreadList(storageRoot, null, null, null, 0, 100);
|
||||
|
||||
@@ -447,8 +446,8 @@ describe("currentRole field", () => {
|
||||
await writeFile(wf, SINGLE_ROLE_WORKFLOW_YAML, "utf8");
|
||||
|
||||
const { thread } = await cmdThreadStart(storageRoot, wf, "test", tmpDir);
|
||||
// worker → _ maps to $END
|
||||
await insertStepNode(storageRoot, thread as ThreadId, "worker", {});
|
||||
// worker → done maps to $END
|
||||
await insertStepNode(storageRoot, thread as ThreadId, "worker", { $status: "done" });
|
||||
|
||||
const result = await cmdThreadShow(storageRoot, thread as ThreadId);
|
||||
expect(result.currentRole).toBe(null);
|
||||
|
||||
@@ -10,7 +10,7 @@ import { afterEach, beforeAll, beforeEach, describe, expect, test } from "vitest
|
||||
import { stringify } from "yaml";
|
||||
import { cmdThreadStart } from "../commands/thread.js";
|
||||
import { cmdWorkflowAdd } from "../commands/workflow.js";
|
||||
import { createUwfStore, findHistoryEntry, getThread } from "../store.js";
|
||||
import { createUwfStore, getThread } from "../store.js";
|
||||
|
||||
// ── paths ──────────────────────────────────────────────────────────────────
|
||||
|
||||
@@ -106,9 +106,13 @@ async function addWorkflow(workflowFixture: string, workflowName: string): Promi
|
||||
|
||||
type ExecResult = { stdout: string; stderr: string; exitCode: number };
|
||||
|
||||
function runExec(threadId: string): ExecResult {
|
||||
function runExec(threadId: string, count: number | null = null): ExecResult {
|
||||
const args = [CLI_PATH, "thread", "exec", threadId];
|
||||
if (count !== null) {
|
||||
args.push("--count", String(count));
|
||||
}
|
||||
try {
|
||||
const stdout = execFileSync(process.execPath, [CLI_PATH, "thread", "exec", threadId], {
|
||||
const stdout = execFileSync(process.execPath, args, {
|
||||
encoding: "utf8",
|
||||
stdio: ["ignore", "pipe", "pipe"],
|
||||
env: { ...process.env, UWF_HOME: uwfHome, OCAS_HOME: casDir },
|
||||
@@ -126,11 +130,38 @@ function runExec(threadId: string): ExecResult {
|
||||
}
|
||||
}
|
||||
|
||||
/** Invoke `uwf thread resume <threadId> -p <prompt>` through the built CLI. */
|
||||
function runResume(threadId: string, prompt: string): ExecResult {
|
||||
try {
|
||||
const stdout = execFileSync(
|
||||
process.execPath,
|
||||
[CLI_PATH, "thread", "resume", threadId, "-p", prompt],
|
||||
{
|
||||
encoding: "utf8",
|
||||
stdio: ["ignore", "pipe", "pipe"],
|
||||
env: { ...process.env, UWF_HOME: uwfHome, OCAS_HOME: casDir },
|
||||
cwd: tmpDir,
|
||||
timeout: 30000,
|
||||
},
|
||||
);
|
||||
return { stdout, stderr: "", exitCode: 0 };
|
||||
} catch (e: unknown) {
|
||||
const err = e as NodeJS.ErrnoException & {
|
||||
stdout?: string;
|
||||
stderr?: string;
|
||||
status?: number;
|
||||
};
|
||||
return { stdout: err.stdout ?? "", stderr: err.stderr ?? "", exitCode: err.status ?? 1 };
|
||||
}
|
||||
}
|
||||
|
||||
type StepOutputJson = {
|
||||
thread: string;
|
||||
head: string;
|
||||
status: string;
|
||||
currentRole: string | null;
|
||||
suspendedRole: string | null;
|
||||
suspendMessage: string | null;
|
||||
done: boolean;
|
||||
};
|
||||
|
||||
@@ -198,19 +229,25 @@ describe("E2E mock-agent: full uwf pipeline", () => {
|
||||
expect(getStatus(store, s1.output)).toBe("ready");
|
||||
expect(getStatus(store, s2.output)).toBe("done");
|
||||
|
||||
// Mock agent reports usage stats in step nodes.
|
||||
expect(s1.usage).toEqual({ turns: 1, inputTokens: 0, outputTokens: 0, duration: 0 });
|
||||
expect(s2.usage).toEqual({ turns: 1, inputTokens: 0, outputTokens: 0, duration: 0 });
|
||||
|
||||
// The start node points at the registered workflow.
|
||||
const startNode = store.cas.get(startHash as CasRef);
|
||||
expect((startNode!.payload as StartNodePayload).workflow).toBe(workflowHash);
|
||||
|
||||
// Thread is completed: removed from active index, present in history.
|
||||
// Thread is completed: status changed to "completed", head updated.
|
||||
const uwf = await createUwfStore(uwfHome);
|
||||
expect(getThread(uwf.varStore, threadId)).toBeNull();
|
||||
const hist = findHistoryEntry(uwf.varStore, threadId);
|
||||
expect(hist).not.toBeNull();
|
||||
expect(hist!.head).toBe(step2.head);
|
||||
const finalEntry = getThread(uwf.varStore, threadId);
|
||||
expect(finalEntry).not.toBeNull();
|
||||
expect(finalEntry!.status).toBe("completed");
|
||||
expect(finalEntry!.head).toBe(step2.head);
|
||||
});
|
||||
|
||||
test("2. branching workflow loops developer→reviewer→developer→reviewer→$END", async () => {
|
||||
test("2. branching workflow loops developer→reviewer→developer→reviewer→$END", {
|
||||
timeout: 30_000,
|
||||
}, async () => {
|
||||
await writeMockConfig("e2e-loop.mock.yaml");
|
||||
const workflowHash = await addWorkflow("e2e-loop.workflow.yaml", "test-loop");
|
||||
|
||||
@@ -263,11 +300,14 @@ describe("E2E mock-agent: full uwf pipeline", () => {
|
||||
expect(getStatus(store, n4.output)).toBe("approved");
|
||||
|
||||
const uwf = await createUwfStore(uwfHome);
|
||||
expect(getThread(uwf.varStore, threadId)).toBeNull();
|
||||
expect(findHistoryEntry(uwf.varStore, threadId)).not.toBeNull();
|
||||
const finalEntry = getThread(uwf.varStore, threadId);
|
||||
expect(finalEntry).not.toBeNull();
|
||||
expect(finalEntry!.status).toBe("completed");
|
||||
});
|
||||
|
||||
test("3. role mismatch in mock data makes the agent exit with an error", async () => {
|
||||
test("3. role mismatch in mock data makes the agent exit with an error", {
|
||||
timeout: 30_000,
|
||||
}, async () => {
|
||||
// Reuses the linear workflow but with a mock whose step[1].role is wrong.
|
||||
await writeMockConfig("e2e-mismatch.mock.yaml");
|
||||
const workflowHash = await addWorkflow("e2e-linear.workflow.yaml", "test-linear");
|
||||
@@ -287,7 +327,172 @@ describe("E2E mock-agent: full uwf pipeline", () => {
|
||||
|
||||
// The thread remains active (no step node was written for the failed step).
|
||||
const uwf = await createUwfStore(uwfHome);
|
||||
expect(getThread(uwf.varStore, threadId)).not.toBeNull();
|
||||
expect(getThread(uwf.varStore, threadId)!.head).toBe(step1.head);
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
expect(entry).not.toBeNull();
|
||||
expect(entry!.status).not.toBe("completed");
|
||||
expect(entry!.head).toBe(step1.head);
|
||||
});
|
||||
|
||||
test("4. planner $SUSPEND then resume re-runs planner and reaches $END", {
|
||||
timeout: 30_000,
|
||||
}, async () => {
|
||||
await writeMockConfig("e2e-suspend.mock.yaml");
|
||||
const workflowHash = await addWorkflow("e2e-suspend.workflow.yaml", "test-suspend");
|
||||
|
||||
const start = await cmdThreadStart(uwfHome, workflowHash, "Analyze the task", uwfHome, tmpDir);
|
||||
const threadId = start.thread;
|
||||
|
||||
// Step 1 → planner emits insufficient_info → thread suspends.
|
||||
const step1 = execStep(threadId);
|
||||
expect(step1.status).toBe("suspended");
|
||||
expect(step1.done).toBe(false);
|
||||
expect(step1.currentRole).toBeNull();
|
||||
expect(step1.suspendedRole).toBe("planner");
|
||||
expect(step1.suspendMessage).toBe("Need more info: missing requirements");
|
||||
|
||||
// Thread index entry reflects the suspension with rendered metadata.
|
||||
const suspendedEntry = getThread((await createUwfStore(uwfHome)).varStore, threadId);
|
||||
expect(suspendedEntry).not.toBeNull();
|
||||
expect(suspendedEntry!.status).toBe("suspended");
|
||||
expect(suspendedEntry!.suspendedRole).toBe("planner");
|
||||
expect(suspendedEntry!.suspendMessage).toBe("Need more info: missing requirements");
|
||||
|
||||
// Resume re-runs the planner role; the second scripted step is `ready` → $END.
|
||||
const resume = runResume(threadId, "Here are the requirements");
|
||||
expect(resume.exitCode).toBe(0);
|
||||
const resumeOut = JSON.parse(resume.stdout.trim()) as StepOutputJson;
|
||||
expect(resumeOut.status).toBe("completed");
|
||||
expect(resumeOut.done).toBe(true);
|
||||
expect(resumeOut.currentRole).toBeNull();
|
||||
expect(resumeOut.suspendedRole).toBeNull();
|
||||
|
||||
// CAS chain: suspended planner step → resumed planner step.
|
||||
const store = await openStore(casDir);
|
||||
const s1 = getStepNode(store, step1.head);
|
||||
const s2 = getStepNode(store, resumeOut.head);
|
||||
expect(s1.role).toBe("planner");
|
||||
expect(s2.role).toBe("planner");
|
||||
expect(s2.prev).toBe(step1.head);
|
||||
expect(getStatus(store, s1.output)).toBe("insufficient_info");
|
||||
expect(getStatus(store, s2.output)).toBe("ready");
|
||||
|
||||
const finalEntry = getThread((await createUwfStore(uwfHome)).varStore, threadId);
|
||||
expect(finalEntry).not.toBeNull();
|
||||
expect(finalEntry!.status).toBe("completed");
|
||||
expect(finalEntry!.head).toBe(resumeOut.head);
|
||||
});
|
||||
|
||||
test("5. --count 3 runs the whole linear pipeline in one invocation", {
|
||||
timeout: 30_000,
|
||||
}, async () => {
|
||||
await writeMockConfig("e2e-count.mock.yaml");
|
||||
const workflowHash = await addWorkflow("e2e-count.workflow.yaml", "test-count");
|
||||
|
||||
const start = await cmdThreadStart(uwfHome, workflowHash, "Ship the feature", uwfHome, tmpDir);
|
||||
const threadId = start.thread;
|
||||
|
||||
// Single invocation with --count 3 → moderator drives analyst → developer → reviewer → $END.
|
||||
const { stdout, stderr, exitCode } = runExec(threadId, 3);
|
||||
expect(exitCode, `stderr: ${stderr}`).toBe(0);
|
||||
|
||||
// Multi-step exec emits a JSON array (one entry per executed step).
|
||||
const results = JSON.parse(stdout.trim()) as StepOutputJson[];
|
||||
expect(Array.isArray(results)).toBe(true);
|
||||
expect(results).toHaveLength(3);
|
||||
|
||||
expect(results[0].status).toBe("idle");
|
||||
expect(results[0].currentRole).toBe("developer");
|
||||
expect(results[1].status).toBe("idle");
|
||||
expect(results[1].currentRole).toBe("reviewer");
|
||||
expect(results[2].status).toBe("completed");
|
||||
expect(results[2].done).toBe(true);
|
||||
|
||||
// Verify the CAS chain holds 3 step nodes in the correct order.
|
||||
const store = await openStore(casDir);
|
||||
const n1 = getStepNode(store, results[0].head);
|
||||
const n2 = getStepNode(store, results[1].head);
|
||||
const n3 = getStepNode(store, results[2].head);
|
||||
expect([n1.role, n2.role, n3.role]).toEqual(["analyst", "developer", "reviewer"]);
|
||||
expect(n1.prev).toBeNull();
|
||||
expect(n2.prev).toBe(results[0].head);
|
||||
expect(n3.prev).toBe(results[1].head);
|
||||
expect(new Set([n1.start, n2.start, n3.start]).size).toBe(1);
|
||||
|
||||
const finalEntry = getThread((await createUwfStore(uwfHome)).varStore, threadId);
|
||||
expect(finalEntry).not.toBeNull();
|
||||
expect(finalEntry!.status).toBe("completed");
|
||||
expect(finalEntry!.head).toBe(results[2].head);
|
||||
});
|
||||
|
||||
test("6. mustache edge prompt renders planner variables into the worker step", {
|
||||
timeout: 30_000,
|
||||
}, async () => {
|
||||
await writeMockConfig("e2e-mustache.mock.yaml");
|
||||
const workflowHash = await addWorkflow("e2e-mustache.workflow.yaml", "test-mustache");
|
||||
|
||||
const start = await cmdThreadStart(uwfHome, workflowHash, "Plan the task", uwfHome, tmpDir);
|
||||
const threadId = start.thread;
|
||||
|
||||
// Step 1 → planner emits branch + repoPath.
|
||||
const step1 = execStep(threadId);
|
||||
expect(step1.status).toBe("idle");
|
||||
expect(step1.currentRole).toBe("worker");
|
||||
|
||||
// Step 2 → worker; the moderator renders the templated edge prompt before spawning it.
|
||||
const step2 = execStep(threadId);
|
||||
expect(step2.done).toBe(true);
|
||||
expect(step2.status).toBe("completed");
|
||||
|
||||
const store = await openStore(casDir);
|
||||
const plannerStep = getStepNode(store, step1.head);
|
||||
expect(getStatus(store, plannerStep.output)).toBe("ready");
|
||||
|
||||
// The worker step's edgePrompt is the mustache-rendered template.
|
||||
const workerStep = getStepNode(store, step2.head);
|
||||
expect(workerStep.role).toBe("worker");
|
||||
expect(workerStep.edgePrompt).toContain("fix/42-auth");
|
||||
expect(workerStep.edgePrompt).toContain("/tmp/my-repo");
|
||||
expect(workerStep.edgePrompt).toBe("Work on branch fix/42-auth in /tmp/my-repo");
|
||||
});
|
||||
|
||||
test("7. completed thread can be resumed (衔尾蛇: end → start)", {
|
||||
timeout: 30_000,
|
||||
}, async () => {
|
||||
// Reuse the suspend workflow (planner with ready → $END), but mock data
|
||||
// goes straight to ready on first run, then ready again after resume.
|
||||
await writeMockConfig("e2e-completed-resume.mock.yaml");
|
||||
const workflowHash = await addWorkflow("e2e-suspend.workflow.yaml", "test-suspend");
|
||||
|
||||
const start = await cmdThreadStart(uwfHome, workflowHash, "Do the work", uwfHome, tmpDir);
|
||||
const threadId = start.thread;
|
||||
|
||||
// Step 1: planner outputs ready → $END → thread completed.
|
||||
const step1 = execStep(threadId);
|
||||
expect(step1.done).toBe(true);
|
||||
expect(step1.status).toBe("completed");
|
||||
|
||||
const uwf1 = await createUwfStore(uwfHome);
|
||||
const entry1 = getThread(uwf1.varStore, threadId);
|
||||
expect(entry1).not.toBeNull();
|
||||
expect(entry1!.status).toBe("completed");
|
||||
|
||||
// Resume the completed thread — should re-evaluate $START → planner.
|
||||
const resumeResult = runResume(threadId, "Additional context for round 2");
|
||||
expect(resumeResult.exitCode).toBe(0);
|
||||
|
||||
// After resume step, planner ran again (step index 1 in mock) → ready → $END.
|
||||
const uwf2 = await createUwfStore(uwfHome);
|
||||
const entry2 = getThread(uwf2.varStore, threadId);
|
||||
expect(entry2).not.toBeNull();
|
||||
expect(entry2!.status).toBe("completed");
|
||||
// Head should have advanced (not the same as step1).
|
||||
expect(entry2!.head).not.toBe(step1.head);
|
||||
|
||||
// CAS chain: step2.prev === step1 head (chain is preserved across resume).
|
||||
const store = await openStore(casDir);
|
||||
const resumeOutput = JSON.parse(resumeResult.stdout.trim());
|
||||
const step2Node = getStepNode(store, resumeOutput.head);
|
||||
expect(step2Node.role).toBe("planner");
|
||||
expect(step2Node.prev).toBe(step1.head);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
steps:
|
||||
# Step 0: planner → ready → $END (thread completes)
|
||||
- role: planner
|
||||
output: |
|
||||
---
|
||||
$status: ready
|
||||
---
|
||||
Initial plan complete.
|
||||
# Step 1: after resume, planner runs again from $START → ready → $END again
|
||||
- role: planner
|
||||
output: |
|
||||
---
|
||||
$status: ready
|
||||
---
|
||||
Revised plan after resume.
|
||||
@@ -0,0 +1,19 @@
|
||||
steps:
|
||||
- role: analyst
|
||||
output: |
|
||||
---
|
||||
$status: analyzed
|
||||
---
|
||||
Analysis complete.
|
||||
- role: developer
|
||||
output: |
|
||||
---
|
||||
$status: implemented
|
||||
---
|
||||
Implementation complete.
|
||||
- role: reviewer
|
||||
output: |
|
||||
---
|
||||
$status: approved
|
||||
---
|
||||
Approved.
|
||||
@@ -0,0 +1,46 @@
|
||||
name: test-count
|
||||
description: 3-step linear pipeline (analyst -> developer -> reviewer -> $END)
|
||||
roles:
|
||||
analyst:
|
||||
description: Analyzes the task
|
||||
goal: Analyze the task
|
||||
capabilities: []
|
||||
procedure: Analyze it
|
||||
output: Output the analysis and set $status to analyzed
|
||||
frontmatter:
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: analyzed }
|
||||
required: [$status]
|
||||
developer:
|
||||
description: Implements the change
|
||||
goal: Implement the change
|
||||
capabilities: []
|
||||
procedure: Write code
|
||||
output: Output the implementation and set $status to implemented
|
||||
frontmatter:
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: implemented }
|
||||
required: [$status]
|
||||
reviewer:
|
||||
description: Reviews the change
|
||||
goal: Review the change
|
||||
capabilities: []
|
||||
procedure: Review code
|
||||
output: Approve and set $status to approved
|
||||
frontmatter:
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: approved }
|
||||
required: [$status]
|
||||
graph:
|
||||
$START:
|
||||
new: { role: analyst, prompt: 'Analyze the task' }
|
||||
resume: { role: analyst, prompt: 'Review the previous run output and continue the work.' }
|
||||
analyst:
|
||||
analyzed: { role: developer, prompt: 'Implement the change' }
|
||||
developer:
|
||||
implemented: { role: reviewer, prompt: 'Review the change' }
|
||||
reviewer:
|
||||
approved: { role: '$END', prompt: 'Done' }
|
||||
@@ -25,7 +25,8 @@ roles:
|
||||
required: [$status]
|
||||
graph:
|
||||
$START:
|
||||
_: { role: planner, prompt: 'Plan the task' }
|
||||
new: { role: planner, prompt: 'Plan the task' }
|
||||
resume: { role: planner, prompt: 'Review the previous run output and continue the work.' }
|
||||
planner:
|
||||
ready: { role: worker, prompt: 'Do the work' }
|
||||
worker:
|
||||
|
||||
@@ -28,7 +28,8 @@ roles:
|
||||
required: [$status]
|
||||
graph:
|
||||
$START:
|
||||
_: { role: developer, prompt: 'Implement the change' }
|
||||
new: { role: developer, prompt: 'Implement the change' }
|
||||
resume: { role: developer, prompt: 'Review the previous run output and continue the work.' }
|
||||
developer:
|
||||
review_needed: { role: reviewer, prompt: 'Review the change' }
|
||||
reviewer:
|
||||
|
||||
@@ -0,0 +1,15 @@
|
||||
steps:
|
||||
- role: planner
|
||||
output: |
|
||||
---
|
||||
$status: ready
|
||||
branch: fix/42-auth
|
||||
repoPath: /tmp/my-repo
|
||||
---
|
||||
Planned the work.
|
||||
- role: worker
|
||||
output: |
|
||||
---
|
||||
$status: done
|
||||
---
|
||||
Work complete.
|
||||
@@ -0,0 +1,35 @@
|
||||
name: test-mustache
|
||||
description: Planner emits template variables consumed by the worker edge prompt
|
||||
roles:
|
||||
planner:
|
||||
description: Plans work and emits branch + repo path
|
||||
goal: Plan the task
|
||||
capabilities: []
|
||||
procedure: Decide the branch and repo path
|
||||
output: Set $status to ready and emit branch and repoPath
|
||||
frontmatter:
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: ready }
|
||||
branch: { type: string }
|
||||
repoPath: { type: string }
|
||||
required: [$status, branch, repoPath]
|
||||
worker:
|
||||
description: Works on the planned branch
|
||||
goal: Do the work
|
||||
capabilities: []
|
||||
procedure: Do it
|
||||
output: Output the result and set $status to done
|
||||
frontmatter:
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: done }
|
||||
required: [$status]
|
||||
graph:
|
||||
$START:
|
||||
new: { role: planner, prompt: 'Plan the task' }
|
||||
resume: { role: planner, prompt: 'Review the previous run output and continue the work.' }
|
||||
planner:
|
||||
ready: { role: worker, prompt: 'Work on branch {{{branch}}} in {{{repoPath}}}' }
|
||||
worker:
|
||||
done: { role: '$END', prompt: 'Complete' }
|
||||
@@ -0,0 +1,14 @@
|
||||
steps:
|
||||
- role: planner
|
||||
output: |
|
||||
---
|
||||
$status: insufficient_info
|
||||
reason: missing requirements
|
||||
---
|
||||
I need more information before I can plan this.
|
||||
- role: planner
|
||||
output: |
|
||||
---
|
||||
$status: ready
|
||||
---
|
||||
I now have what I need. Ready to proceed.
|
||||
@@ -0,0 +1,25 @@
|
||||
name: test-suspend
|
||||
description: Planner can suspend for more info or finish when ready
|
||||
roles:
|
||||
planner:
|
||||
description: Plans work and may request more info
|
||||
goal: Analyze the task
|
||||
capabilities: []
|
||||
procedure: Analyze the task and decide if more info is needed
|
||||
output: Set $status to insufficient_info (with reason) or ready
|
||||
frontmatter:
|
||||
oneOf:
|
||||
- properties:
|
||||
$status: { const: insufficient_info }
|
||||
reason: { type: string }
|
||||
required: [$status, reason]
|
||||
- properties:
|
||||
$status: { const: ready }
|
||||
required: [$status]
|
||||
graph:
|
||||
$START:
|
||||
new: { role: planner, prompt: 'Analyze the task' }
|
||||
resume: { role: planner, prompt: 'Review the previous run output and continue the work.' }
|
||||
planner:
|
||||
insufficient_info: { role: '$SUSPEND', prompt: 'Need more info: {{{reason}}}' }
|
||||
ready: { role: '$END', prompt: 'Done' }
|
||||
@@ -5,13 +5,18 @@ import { evaluate } from "../moderator/evaluate.js";
|
||||
|
||||
const solveIssueGraph: WorkflowPayload["graph"] = {
|
||||
$START: {
|
||||
_: { role: "planner", prompt: "Start planning from the issue in the task.", location: null },
|
||||
new: { role: "planner", prompt: "Start planning from the issue in the task.", location: null },
|
||||
resume: {
|
||||
role: "planner",
|
||||
prompt: "Review the previous run output and continue the work.",
|
||||
location: null,
|
||||
},
|
||||
},
|
||||
planner: {
|
||||
_: { role: "developer", prompt: "Implement the plan: {{plan}}", location: null },
|
||||
planned: { role: "developer", prompt: "Implement the plan: {{plan}}", location: null },
|
||||
},
|
||||
developer: {
|
||||
_: { role: "reviewer", prompt: "Review the changes: {{summary}}", location: null },
|
||||
implemented: { role: "reviewer", prompt: "Review the changes: {{summary}}", location: null },
|
||||
},
|
||||
reviewer: {
|
||||
approved: { role: "$END", prompt: "Done.", location: null },
|
||||
@@ -20,8 +25,8 @@ const solveIssueGraph: WorkflowPayload["graph"] = {
|
||||
};
|
||||
|
||||
describe("evaluate", () => {
|
||||
test("$START → first role (unit status _)", () => {
|
||||
const result = evaluate(solveIssueGraph, "$START", { $status: "_" });
|
||||
test("$START → first role (status new)", () => {
|
||||
const result = evaluate(solveIssueGraph, "$START", { $status: "new" });
|
||||
expect(result).toEqual({
|
||||
ok: true,
|
||||
value: {
|
||||
@@ -32,6 +37,18 @@ describe("evaluate", () => {
|
||||
});
|
||||
});
|
||||
|
||||
test("$START → first role (status resume)", () => {
|
||||
const result = evaluate(solveIssueGraph, "$START", { $status: "resume" });
|
||||
expect(result).toEqual({
|
||||
ok: true,
|
||||
value: {
|
||||
role: "planner",
|
||||
prompt: "Review the previous run output and continue the work.",
|
||||
location: null,
|
||||
},
|
||||
});
|
||||
});
|
||||
|
||||
test("status-based routing (reviewer rejected → developer)", () => {
|
||||
const result = evaluate(solveIssueGraph, "reviewer", {
|
||||
$status: "rejected",
|
||||
@@ -95,7 +112,7 @@ describe("evaluate", () => {
|
||||
});
|
||||
|
||||
test("missing role in graph → error", () => {
|
||||
const result = evaluate(solveIssueGraph, "unknown-role", { $status: "_" });
|
||||
const result = evaluate(solveIssueGraph, "unknown-role", { $status: "new" });
|
||||
expect(result.ok).toBe(false);
|
||||
if (!result.ok) {
|
||||
expect(result.error.message).toBe('no transitions defined for role "unknown-role"');
|
||||
@@ -112,7 +129,7 @@ describe("evaluate", () => {
|
||||
|
||||
test("mustache template rendering with simple fields", () => {
|
||||
const result = evaluate(solveIssueGraph, "planner", {
|
||||
$status: "_",
|
||||
$status: "planned",
|
||||
plan: "Add auth middleware",
|
||||
});
|
||||
expect(result).toEqual({
|
||||
@@ -139,11 +156,11 @@ describe("evaluate", () => {
|
||||
test("triple mustache also works for unescaped output", () => {
|
||||
const graph: Record<string, Record<string, Target>> = {
|
||||
reviewer: {
|
||||
_: { role: "developer", prompt: "Fix: {{{comments}}}", location: null },
|
||||
rejected: { role: "developer", prompt: "Fix: {{{comments}}}", location: null },
|
||||
},
|
||||
};
|
||||
const result = evaluate(graph, "reviewer", {
|
||||
$status: "_",
|
||||
$status: "rejected",
|
||||
comments: "<script>alert(1)</script>",
|
||||
});
|
||||
expect(result).toEqual({
|
||||
@@ -152,24 +169,22 @@ describe("evaluate", () => {
|
||||
});
|
||||
});
|
||||
|
||||
test("missing $status defaults to _ (unit routing)", () => {
|
||||
test("missing $status → error (no unit fallback)", () => {
|
||||
const result = evaluate(solveIssueGraph, "planner", {
|
||||
plan: "Add auth middleware",
|
||||
});
|
||||
expect(result).toEqual({
|
||||
ok: true,
|
||||
value: {
|
||||
role: "developer",
|
||||
prompt: "Implement the plan: Add auth middleware",
|
||||
location: null,
|
||||
},
|
||||
});
|
||||
expect(result.ok).toBe(false);
|
||||
if (!result.ok) {
|
||||
expect(result.error.message).toBe(
|
||||
'agent output for role "planner" is missing required "$status" string',
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
test("mustache template with nested object paths", () => {
|
||||
const graph: Record<string, Record<string, Target>> = {
|
||||
reviewer: {
|
||||
_: {
|
||||
rejected: {
|
||||
role: "developer",
|
||||
prompt: "Address: {{review.comments}}",
|
||||
location: null,
|
||||
@@ -177,7 +192,7 @@ describe("evaluate", () => {
|
||||
},
|
||||
};
|
||||
const result = evaluate(graph, "reviewer", {
|
||||
$status: "_",
|
||||
$status: "rejected",
|
||||
review: { comments: "refactor the handler" },
|
||||
});
|
||||
expect(result).toEqual({
|
||||
|
||||
@@ -6,101 +6,107 @@ import { describe, expect, test } from "vitest";
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
|
||||
import {
|
||||
cmdPromptAdapter,
|
||||
cmdPromptAuthor,
|
||||
cmdPromptDeveloper,
|
||||
cmdPromptAdapterDeveloping,
|
||||
cmdPromptBootstrap,
|
||||
cmdPromptList,
|
||||
cmdPromptSetup,
|
||||
cmdPromptUsage,
|
||||
cmdPromptUser,
|
||||
cmdPromptWorkflowAuthoring,
|
||||
} from "../commands/prompt.js";
|
||||
|
||||
describe("prompt commands", () => {
|
||||
test("prompt list returns all prompt names", () => {
|
||||
test("prompt list returns prompt names (no bootstrap)", () => {
|
||||
const result = cmdPromptList();
|
||||
expect(result).toBeInstanceOf(Array);
|
||||
expect(result).toContain("user");
|
||||
expect(result).toContain("author");
|
||||
expect(result).toContain("developer");
|
||||
expect(result).toContain("adapter");
|
||||
expect(result).toContain("usage");
|
||||
expect(result).toContain("workflow-authoring");
|
||||
expect(result).toContain("adapter-developing");
|
||||
expect(result).not.toContain("bootstrap");
|
||||
for (const name of result) {
|
||||
expect(name).toMatch(/^\S+$/);
|
||||
}
|
||||
});
|
||||
|
||||
test("prompt user returns non-empty markdown string", () => {
|
||||
const result = cmdPromptUser();
|
||||
test("prompt usage returns only the usage reference with frontmatter", () => {
|
||||
const result = cmdPromptUsage();
|
||||
expect(typeof result).toBe("string");
|
||||
expect(result).toContain("uwf");
|
||||
expect(result).toContain("thread");
|
||||
expect(result).toContain("workflow");
|
||||
expect(result).toContain("Quick Start");
|
||||
expect(result).toContain("---");
|
||||
expect(result).toContain("name:");
|
||||
expect(result).toContain("version:");
|
||||
// Should NOT contain other references
|
||||
expect(result).not.toContain("Workflow Authoring Reference");
|
||||
expect(result).not.toContain("Adapter Developing Reference");
|
||||
expect(result.length).toBeGreaterThan(500);
|
||||
});
|
||||
|
||||
test("prompt author returns non-empty markdown string", () => {
|
||||
const result = cmdPromptAuthor();
|
||||
test("prompt workflow-authoring returns non-empty markdown string with frontmatter", () => {
|
||||
const result = cmdPromptWorkflowAuthoring();
|
||||
expect(typeof result).toBe("string");
|
||||
expect(result).toContain("frontmatter");
|
||||
expect(result).toContain("graph");
|
||||
expect(result).toContain("$START");
|
||||
expect(result).toContain("$END");
|
||||
expect(result).toContain("$status");
|
||||
expect(result).toContain("---");
|
||||
expect(result).toContain("name:");
|
||||
expect(result).toContain("version:");
|
||||
expect(result.length).toBeGreaterThan(500);
|
||||
});
|
||||
|
||||
test("prompt developer returns non-empty markdown string", () => {
|
||||
const result = cmdPromptDeveloper();
|
||||
expect(typeof result).toBe("string");
|
||||
expect(result).toContain("Monorepo");
|
||||
expect(result).toContain("CAS");
|
||||
expect(result).toContain("Biome");
|
||||
expect(result.length).toBeGreaterThan(500);
|
||||
});
|
||||
|
||||
test("prompt adapter returns non-empty markdown string", () => {
|
||||
const result = cmdPromptAdapter();
|
||||
test("prompt adapter-developing returns non-empty markdown string with frontmatter", () => {
|
||||
const result = cmdPromptAdapterDeveloping();
|
||||
expect(typeof result).toBe("string");
|
||||
expect(result).toContain("createAgent");
|
||||
expect(result).toContain("AgentContext");
|
||||
expect(result).toContain("frontmatter");
|
||||
expect(result).toContain("---");
|
||||
expect(result).toContain("name:");
|
||||
expect(result).toContain("version:");
|
||||
expect(result.length).toBeGreaterThan(500);
|
||||
});
|
||||
|
||||
test("prompt usage combines all references", () => {
|
||||
const result = cmdPromptUsage();
|
||||
test("prompt bootstrap returns framework-agnostic setup instructions", () => {
|
||||
const result = cmdPromptBootstrap();
|
||||
expect(typeof result).toBe("string");
|
||||
expect(result).toContain("User Reference");
|
||||
expect(result).toContain("Author Reference");
|
||||
expect(result).toContain("Developer Reference");
|
||||
expect(result).toContain("Adapter Reference");
|
||||
expect(result).toContain("---");
|
||||
expect(result.length).toBeGreaterThan(2000);
|
||||
});
|
||||
|
||||
test("prompt setup returns setup instructions", () => {
|
||||
const result = cmdPromptSetup();
|
||||
expect(typeof result).toBe("string");
|
||||
expect(result).toContain("uwf Skill Setup");
|
||||
// Skills installation
|
||||
expect(result).toContain("uwf prompt usage");
|
||||
expect(result).toContain("uwf prompt setup");
|
||||
expect(result).toContain("SKILL.md");
|
||||
expect(result).toContain("version");
|
||||
expect(result).toContain("uwf prompt workflow-authoring");
|
||||
expect(result).toContain("uwf prompt adapter-developing");
|
||||
expect(result).toContain("uwf-usage");
|
||||
expect(result).toContain("uwf-workflow-authoring");
|
||||
expect(result).toContain("uwf-adapter-developing");
|
||||
// Fresh install scenario
|
||||
expect(result).toContain("Fresh Install");
|
||||
expect(result).toContain("uwf setup");
|
||||
expect(result).toContain("--provider");
|
||||
expect(result).toContain("--api-key");
|
||||
expect(result).toContain("agent adapter");
|
||||
// Upgrade scenario
|
||||
expect(result).toContain("Upgrade");
|
||||
expect(result).toContain("Migrate");
|
||||
// Should NOT contain Hermes-specific paths
|
||||
expect(result).not.toContain("~/.hermes/skills/");
|
||||
expect(result).not.toContain("> ~/.hermes/");
|
||||
expect(result.length).toBeGreaterThan(100);
|
||||
});
|
||||
|
||||
test("prompt help subcommand is suppressed", () => {
|
||||
const output = execFileSync("npx", ["tsx", "src/cli.ts", "prompt", "--help"], {
|
||||
cwd: join(__dirname, "..", ".."),
|
||||
test("prompt help subcommand is suppressed", { timeout: 30_000 }, () => {
|
||||
const cliPath = join(__dirname, "..", "..", "dist", "cli.js");
|
||||
const output = execFileSync("node", [cliPath, "prompt", "--help"], {
|
||||
encoding: "utf-8",
|
||||
env: { ...process.env, PATH: `/opt/homebrew/bin:${process.env.PATH}` },
|
||||
env: { ...process.env },
|
||||
});
|
||||
expect(output).not.toMatch(/help\s+\[command\]/i);
|
||||
expect(output).toContain("usage");
|
||||
expect(output).toContain("setup");
|
||||
expect(output).toContain("user");
|
||||
expect(output).toContain("author");
|
||||
expect(output).toContain("developer");
|
||||
expect(output).toContain("adapter");
|
||||
expect(output).toContain("bootstrap");
|
||||
expect(output).toContain("workflow-authoring");
|
||||
expect(output).toContain("adapter-developing");
|
||||
expect(output).toContain("list");
|
||||
// Removed subcommands should not appear as command names
|
||||
expect(output).not.toMatch(/^\s+setup\s/m);
|
||||
expect(output).not.toContain("usage-reference");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -4,7 +4,7 @@ import { join } from "node:path";
|
||||
import { type CasRef, createThreadIndexEntry, type ThreadId } from "@united-workforce/protocol";
|
||||
import { afterEach, beforeEach, describe, expect, test } from "vitest";
|
||||
import { resolveHeadHash } from "../commands/shared.js";
|
||||
import { addHistoryEntry, createUwfStore, setThread } from "../store.js";
|
||||
import { completeThread, createUwfStore, setThread } from "../store.js";
|
||||
|
||||
let tmpDir: string;
|
||||
|
||||
@@ -31,19 +31,13 @@ describe("resolveHeadHash", () => {
|
||||
expect(result).toBe(headHash);
|
||||
});
|
||||
|
||||
test("falls back to history variable when thread not in active index", async () => {
|
||||
test("finds completed thread", async () => {
|
||||
const threadId = "01JTEST0000000000000000002" as ThreadId;
|
||||
const workflowHash = "workflow_hash_789" as CasRef;
|
||||
|
||||
const uwf = await createUwfStore(tmpDir);
|
||||
const headHash = (await uwf.store.cas.put(uwf.schemas.text, "completed-head")) as CasRef;
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow: workflowHash,
|
||||
head: headHash,
|
||||
completedAt: Date.now(),
|
||||
reason: null,
|
||||
});
|
||||
setThread(uwf.varStore, threadId, createThreadIndexEntry(headHash));
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
|
||||
const result = await resolveHeadHash(tmpDir, threadId);
|
||||
|
||||
@@ -54,58 +48,36 @@ describe("resolveHeadHash", () => {
|
||||
// calls fail() which does process.exit(1), terminating the test runner.
|
||||
// The error behavior is tested in integration tests below via CLI invocation.
|
||||
|
||||
test("prioritizes active thread over history when thread exists in both", async () => {
|
||||
test("prioritizes active thread", async () => {
|
||||
const threadId = "01JTEST0000000000000000004" as ThreadId;
|
||||
const workflowHash = "workflow_hash_xyz" as CasRef;
|
||||
|
||||
const uwf = await createUwfStore(tmpDir);
|
||||
const activeHead = (await uwf.store.cas.put(uwf.schemas.text, "active-v2")) as CasRef;
|
||||
const historicalHash = (await uwf.store.cas.put(uwf.schemas.text, "historical-v1")) as CasRef;
|
||||
setThread(uwf.varStore, threadId, createThreadIndexEntry(activeHead));
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow: workflowHash,
|
||||
head: historicalHash,
|
||||
completedAt: Date.now(),
|
||||
reason: null,
|
||||
});
|
||||
|
||||
const result = await resolveHeadHash(tmpDir, threadId);
|
||||
|
||||
// Should return the active head, not the historical one
|
||||
// Should return the active head
|
||||
expect(result).toBe(activeHead);
|
||||
});
|
||||
|
||||
test("finds thread from multiple history entries", async () => {
|
||||
test("finds thread from multiple completed threads", async () => {
|
||||
const threadId1 = "01JTEST0000000000000000005" as ThreadId;
|
||||
const threadId2 = "01JTEST0000000000000000006" as ThreadId;
|
||||
const threadId3 = "01JTEST0000000000000000007" as ThreadId;
|
||||
const workflowHash = "workflow_hash_abc" as CasRef;
|
||||
const uwf = await createUwfStore(tmpDir);
|
||||
const hash1 = (await uwf.store.cas.put(uwf.schemas.text, "hash-thread1")) as CasRef;
|
||||
const hash2 = (await uwf.store.cas.put(uwf.schemas.text, "hash-thread2")) as CasRef;
|
||||
const hash3 = (await uwf.store.cas.put(uwf.schemas.text, "hash-thread3")) as CasRef;
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId1,
|
||||
workflow: workflowHash,
|
||||
head: hash1,
|
||||
completedAt: Date.now() - 2000,
|
||||
reason: null,
|
||||
});
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId2,
|
||||
workflow: workflowHash,
|
||||
head: hash2,
|
||||
completedAt: Date.now() - 1000,
|
||||
reason: null,
|
||||
});
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId3,
|
||||
workflow: workflowHash,
|
||||
head: hash3,
|
||||
completedAt: Date.now(),
|
||||
reason: null,
|
||||
});
|
||||
|
||||
setThread(uwf.varStore, threadId1, createThreadIndexEntry(hash1));
|
||||
completeThread(uwf.varStore, threadId1, "completed");
|
||||
|
||||
setThread(uwf.varStore, threadId2, createThreadIndexEntry(hash2));
|
||||
completeThread(uwf.varStore, threadId2, "completed");
|
||||
|
||||
setThread(uwf.varStore, threadId3, createThreadIndexEntry(hash3));
|
||||
completeThread(uwf.varStore, threadId3, "completed");
|
||||
|
||||
const result = await resolveHeadHash(tmpDir, threadId2);
|
||||
|
||||
|
||||
@@ -21,11 +21,11 @@ describe("solve-issue workflow: Gitea API PR creation", () => {
|
||||
"..",
|
||||
"..",
|
||||
"..",
|
||||
".workflows",
|
||||
"examples",
|
||||
"solve-issue.yaml",
|
||||
);
|
||||
|
||||
test("committer procedure should use curl API instead of tea pr create", async () => {
|
||||
test("committer procedure should create PR via tea pr create", async () => {
|
||||
const yamlContent = await readFile(workflowPath, "utf-8");
|
||||
const workflow = parse(yamlContent) as WorkflowPayload;
|
||||
|
||||
@@ -33,25 +33,22 @@ describe("solve-issue workflow: Gitea API PR creation", () => {
|
||||
const committerProcedure = workflow.roles.committer?.procedure;
|
||||
expect(committerProcedure).toBeDefined();
|
||||
|
||||
// Verify the procedure uses curl API, not tea pr create
|
||||
expect(committerProcedure).toContain("curl");
|
||||
expect(committerProcedure).toContain("api/v1/repos");
|
||||
expect(committerProcedure).toContain("/pulls");
|
||||
|
||||
// Verify it explicitly warns against tea pr create
|
||||
expect(committerProcedure).toMatch(/do NOT use.*tea pr create/i);
|
||||
// Verify the procedure uses tea pr create for PR creation
|
||||
expect(committerProcedure).toContain("tea pr create");
|
||||
expect(committerProcedure).toContain("git push");
|
||||
expect(committerProcedure).toContain("Fixes #N");
|
||||
});
|
||||
|
||||
test("committer procedure should reference repoRemote from task prompt", async () => {
|
||||
test("committer procedure should extract owner/repo from git remote", async () => {
|
||||
const yamlContent = await readFile(workflowPath, "utf-8");
|
||||
const workflow = parse(yamlContent) as WorkflowPayload;
|
||||
|
||||
const committerProcedure = workflow.roles.committer?.procedure;
|
||||
expect(committerProcedure).toBeDefined();
|
||||
|
||||
// Verify the procedure mentions repoRemote is provided in task prompt
|
||||
expect(committerProcedure).toMatch(/repo remote.*provided.*task prompt/i);
|
||||
expect(committerProcedure).toMatch(/owner\/repo/i);
|
||||
// Verify the procedure extracts owner/repo from remote
|
||||
expect(committerProcedure).toContain("git remote get-url origin");
|
||||
expect(committerProcedure).toContain("hook_failed");
|
||||
});
|
||||
|
||||
test("committer procedure should include error handling for curl failures", async () => {
|
||||
@@ -100,45 +97,42 @@ describe("solve-issue workflow: Gitea API PR creation", () => {
|
||||
expect(committedVariant.required).toContain("$status");
|
||||
});
|
||||
|
||||
test("developer procedure should include mandatory verification step", async () => {
|
||||
test("developer procedure should include worktree setup", async () => {
|
||||
const yamlContent = await readFile(workflowPath, "utf-8");
|
||||
const workflow = parse(yamlContent) as WorkflowPayload;
|
||||
|
||||
const developerProcedure = workflow.roles.developer?.procedure;
|
||||
expect(developerProcedure).toBeDefined();
|
||||
|
||||
// Verify the procedure includes mandatory verification step
|
||||
expect(developerProcedure).toContain("MANDATORY VERIFICATION");
|
||||
expect(developerProcedure).toContain("git branch --show-current");
|
||||
expect(developerProcedure).toContain("git status");
|
||||
expect(developerProcedure).toMatch(/ls -la|verify.*exist/i);
|
||||
// Verify the procedure includes worktree setup
|
||||
expect(developerProcedure).toContain("IMPORTANT");
|
||||
expect(developerProcedure).toContain("git worktree add");
|
||||
expect(developerProcedure).toContain("pnpm install");
|
||||
});
|
||||
|
||||
test("reviewer procedure should enforce worktree path verification", async () => {
|
||||
test("reviewer procedure should verify branch and run checks", async () => {
|
||||
const yamlContent = await readFile(workflowPath, "utf-8");
|
||||
const workflow = parse(yamlContent) as WorkflowPayload;
|
||||
|
||||
const reviewerProcedure = workflow.roles.reviewer?.procedure;
|
||||
expect(reviewerProcedure).toBeDefined();
|
||||
|
||||
// Verify the procedure includes critical enforcement
|
||||
expect(reviewerProcedure).toContain("CRITICAL");
|
||||
expect(reviewerProcedure).toMatch(/cd.*pwd/);
|
||||
expect(reviewerProcedure).toContain(
|
||||
"Do NOT report results without running the actual commands",
|
||||
);
|
||||
// Verify the procedure includes branch verification and build checks
|
||||
expect(reviewerProcedure).toContain("git branch --show-current");
|
||||
expect(reviewerProcedure).toContain("pnpm run build");
|
||||
expect(reviewerProcedure).toContain("pnpm run check");
|
||||
});
|
||||
|
||||
test("developer procedure should include test debugging escalation", async () => {
|
||||
test("developer procedure should include changeset and failure handling", async () => {
|
||||
const yamlContent = await readFile(workflowPath, "utf-8");
|
||||
const workflow = parse(yamlContent) as WorkflowPayload;
|
||||
|
||||
const developerProcedure = workflow.roles.developer?.procedure;
|
||||
expect(developerProcedure).toBeDefined();
|
||||
|
||||
// Verify the procedure includes test failure guidance
|
||||
expect(developerProcedure).toMatch(/tests fail.*first run/i);
|
||||
expect(developerProcedure).toMatch(/3 test cycles|after 3 attempts/i);
|
||||
// Verify the procedure includes changeset requirement and failure path
|
||||
expect(developerProcedure).toContain(".changeset/");
|
||||
expect(developerProcedure).toContain("$status=failed");
|
||||
expect(developerProcedure).toContain("pnpm test");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -118,6 +118,7 @@ async function createTestStep(
|
||||
completedAtMs: Date.now() + 1000,
|
||||
assembledPrompt: null,
|
||||
cwd: "/tmp",
|
||||
usage: null,
|
||||
};
|
||||
return store.cas.put(schemas.stepNode, stepPayload);
|
||||
}
|
||||
|
||||
@@ -96,6 +96,7 @@ describe("protocol types", () => {
|
||||
completedAtMs: 2000,
|
||||
assembledPrompt: null,
|
||||
cwd: "/test/path",
|
||||
usage: null,
|
||||
};
|
||||
expect(record.startedAtMs).toBe(1000);
|
||||
expect(record.completedAtMs).toBe(2000);
|
||||
@@ -110,6 +111,7 @@ describe("protocol types", () => {
|
||||
agent: "uwf-test",
|
||||
timestamp: 123,
|
||||
durationMs: 5000,
|
||||
usage: null,
|
||||
};
|
||||
expect(entry.durationMs).toBe(5000);
|
||||
});
|
||||
@@ -251,8 +253,11 @@ describe("thread read timing", () => {
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: { _: { role: "worker", prompt: "go", location: null } },
|
||||
worker: { _: { role: "$END", prompt: "", location: null } },
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "go", location: null },
|
||||
resume: { role: "worker", prompt: "resume", location: null },
|
||||
},
|
||||
worker: { done: { role: "$END", prompt: "", location: null } },
|
||||
},
|
||||
});
|
||||
|
||||
@@ -317,8 +322,11 @@ describe("thread read timing", () => {
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: { _: { role: "worker", prompt: "go", location: null } },
|
||||
worker: { _: { role: "$END", prompt: "", location: null } },
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "go", location: null },
|
||||
resume: { role: "worker", prompt: "resume", location: null },
|
||||
},
|
||||
worker: { done: { role: "$END", prompt: "", location: null } },
|
||||
},
|
||||
});
|
||||
|
||||
|
||||
@@ -226,19 +226,15 @@ describe("Global CAS directory", () => {
|
||||
const uwf = await createUwfStore(storageRoot);
|
||||
const threadId = "thread-123" as ThreadId;
|
||||
const headHash = await uwf.store.cas.put(uwf.schemas.text, "history-head");
|
||||
const { addHistoryEntry, findHistoryEntry } = await import("../store.js");
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow: "workflow-456",
|
||||
head: headHash,
|
||||
completedAt: Date.now(),
|
||||
reason: "completed",
|
||||
});
|
||||
const { completeThread, setThread, getThread } = await import("../store.js");
|
||||
const { createThreadIndexEntry } = await import("@united-workforce/protocol");
|
||||
|
||||
const entry = findHistoryEntry(uwf.varStore, threadId);
|
||||
expect(entry?.thread).toBe(threadId);
|
||||
expect(entry?.workflow).toBe("workflow-456");
|
||||
setThread(uwf.varStore, threadId, createThreadIndexEntry(headHash));
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
expect(entry?.head).toBe(headHash);
|
||||
expect(entry?.status).toBe("completed");
|
||||
|
||||
const { access } = await import("node:fs/promises");
|
||||
await access(join(globalCasDir, "vars"));
|
||||
@@ -274,15 +270,12 @@ describe("Global CAS directory", () => {
|
||||
);
|
||||
|
||||
const uwf = await createUwfStore(storageRoot);
|
||||
const { findHistoryEntry } = await import("../store.js");
|
||||
const entry = findHistoryEntry(uwf.varStore, threadId);
|
||||
expect(entry).toEqual({
|
||||
thread: threadId,
|
||||
workflow: workflowHash,
|
||||
head: headHash,
|
||||
completedAt,
|
||||
reason: "cancelled",
|
||||
});
|
||||
const { getThread } = await import("../store.js");
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
expect(entry).not.toBeNull();
|
||||
expect(entry?.head).toBe(headHash);
|
||||
expect(entry?.status).toBe("cancelled");
|
||||
expect(entry?.completedAt).toBe(completedAt);
|
||||
|
||||
await expect(access(historyPath)).rejects.toThrow();
|
||||
const migratedContent = await readFile(`${historyPath}.migrated`, "utf8");
|
||||
|
||||
@@ -0,0 +1,235 @@
|
||||
import { mkdir, mkdtemp } from "node:fs/promises";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import type { CasRef, ThreadId } from "@united-workforce/protocol";
|
||||
import { describe, expect, test } from "vitest";
|
||||
import {
|
||||
completeThread,
|
||||
createUwfStore,
|
||||
getThread,
|
||||
loadActiveThreads,
|
||||
loadHistoryThreads,
|
||||
setThread,
|
||||
} from "../store.js";
|
||||
|
||||
async function makeUwfStore(storageRoot: string) {
|
||||
const casDir = join(storageRoot, "cas");
|
||||
await mkdir(casDir, { recursive: true });
|
||||
process.env.OCAS_HOME = casDir;
|
||||
return createUwfStore(storageRoot);
|
||||
}
|
||||
|
||||
async function seedThreadHead(
|
||||
uwf: Awaited<ReturnType<typeof createUwfStore>>,
|
||||
label: string,
|
||||
): Promise<CasRef> {
|
||||
return (await uwf.store.cas.put(uwf.schemas.text, label)) as CasRef;
|
||||
}
|
||||
|
||||
describe("unified thread storage", () => {
|
||||
test("loadActiveThreads excludes completed threads", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-active-test-"));
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
|
||||
const threadId1 = "01JTEST000000000000ACTIVE1" as ThreadId;
|
||||
const threadId2 = "01JTEST000000000000ACTIVE2" as ThreadId;
|
||||
const head1 = await seedThreadHead(uwf, "active-head");
|
||||
const head2 = await seedThreadHead(uwf, "completed-head");
|
||||
|
||||
setThread(uwf.varStore, threadId1, {
|
||||
head: head1,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
|
||||
setThread(uwf.varStore, threadId2, {
|
||||
head: head2,
|
||||
status: "completed",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: Date.now(),
|
||||
});
|
||||
|
||||
const active = loadActiveThreads(uwf.varStore);
|
||||
expect(Object.keys(active)).toHaveLength(1);
|
||||
expect(active[threadId1]).toBeDefined();
|
||||
expect(active[threadId2]).toBeUndefined();
|
||||
});
|
||||
|
||||
test("loadActiveThreads excludes cancelled threads", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-active-test-"));
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
|
||||
const threadId1 = "01JTEST000000000000ACTIVE3" as ThreadId;
|
||||
const threadId2 = "01JTEST000000000000ACTIVE4" as ThreadId;
|
||||
const head1 = await seedThreadHead(uwf, "active-head");
|
||||
const head2 = await seedThreadHead(uwf, "cancelled-head");
|
||||
|
||||
setThread(uwf.varStore, threadId1, {
|
||||
head: head1,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
|
||||
setThread(uwf.varStore, threadId2, {
|
||||
head: head2,
|
||||
status: "cancelled",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: Date.now(),
|
||||
});
|
||||
|
||||
const active = loadActiveThreads(uwf.varStore);
|
||||
expect(Object.keys(active)).toHaveLength(1);
|
||||
expect(active[threadId1]).toBeDefined();
|
||||
expect(active[threadId2]).toBeUndefined();
|
||||
});
|
||||
|
||||
test("loadHistoryThreads only returns completed and cancelled", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-history-test-"));
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
|
||||
const threadId1 = "01JTEST000000000000HISTOR1" as ThreadId;
|
||||
const threadId2 = "01JTEST000000000000HISTOR2" as ThreadId;
|
||||
const threadId3 = "01JTEST000000000000HISTOR3" as ThreadId;
|
||||
const head1 = await seedThreadHead(uwf, "active-head");
|
||||
const head2 = await seedThreadHead(uwf, "completed-head");
|
||||
const head3 = await seedThreadHead(uwf, "cancelled-head");
|
||||
|
||||
setThread(uwf.varStore, threadId1, {
|
||||
head: head1,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
|
||||
setThread(uwf.varStore, threadId2, {
|
||||
head: head2,
|
||||
status: "completed",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: Date.now(),
|
||||
});
|
||||
|
||||
setThread(uwf.varStore, threadId3, {
|
||||
head: head3,
|
||||
status: "cancelled",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: Date.now(),
|
||||
});
|
||||
|
||||
const history = loadHistoryThreads(uwf.varStore);
|
||||
expect(Object.keys(history)).toHaveLength(2);
|
||||
expect(history[threadId1]).toBeUndefined();
|
||||
expect(history[threadId2]).toBeDefined();
|
||||
expect(history[threadId3]).toBeDefined();
|
||||
});
|
||||
|
||||
test("completeThread marks thread as completed", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-complete-test-"));
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
const threadId = "01JTEST000000000000COMPLE1" as ThreadId;
|
||||
const head = await seedThreadHead(uwf, "active-head");
|
||||
|
||||
setThread(uwf.varStore, threadId, {
|
||||
head,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
expect(entry).not.toBeNull();
|
||||
expect(entry?.status).toBe("completed");
|
||||
expect(entry?.completedAt).toBeDefined();
|
||||
expect(entry?.completedAt).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
test("completeThread marks thread as cancelled", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-complete-test-"));
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
const threadId = "01JTEST000000000000COMPLE2" as ThreadId;
|
||||
const head = await seedThreadHead(uwf, "active-head");
|
||||
|
||||
setThread(uwf.varStore, threadId, {
|
||||
head,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
|
||||
completeThread(uwf.varStore, threadId, "cancelled");
|
||||
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
expect(entry).not.toBeNull();
|
||||
expect(entry?.status).toBe("cancelled");
|
||||
expect(entry?.completedAt).toBeDefined();
|
||||
expect(entry?.completedAt).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
test("completeThread clears suspend metadata", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-complete-test-"));
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
const threadId = "01JTEST000000000000COMPLE3" as ThreadId;
|
||||
const head = await seedThreadHead(uwf, "suspended-head");
|
||||
|
||||
setThread(uwf.varStore, threadId, {
|
||||
head,
|
||||
status: "suspended",
|
||||
suspendedRole: "test-role",
|
||||
suspendMessage: "test message",
|
||||
completedAt: null,
|
||||
});
|
||||
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
expect(entry).not.toBeNull();
|
||||
expect(entry?.status).toBe("completed");
|
||||
expect(entry?.suspendedRole).toBeNull();
|
||||
expect(entry?.suspendMessage).toBeNull();
|
||||
});
|
||||
|
||||
test("completeThread handles non-existent thread gracefully", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-complete-test-"));
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
const threadId = "01JTEST000000000000NOEXIST" as ThreadId;
|
||||
|
||||
// Should not throw
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
expect(entry).toBeNull();
|
||||
});
|
||||
|
||||
test("status and completedAt tags are persisted and loaded", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-tags-test-"));
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
const threadId = "01JTEST000000000000TAGTEST" as ThreadId;
|
||||
const head = await seedThreadHead(uwf, "test-head");
|
||||
const now = Date.now();
|
||||
|
||||
setThread(uwf.varStore, threadId, {
|
||||
head,
|
||||
status: "completed",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: now,
|
||||
});
|
||||
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
expect(entry).not.toBeNull();
|
||||
expect(entry?.status).toBe("completed");
|
||||
expect(entry?.completedAt).toBe(now);
|
||||
});
|
||||
});
|
||||
@@ -3,7 +3,13 @@ import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import type { CasRef, ThreadId } from "@united-workforce/protocol";
|
||||
import { describe, expect, test } from "vitest";
|
||||
import { addHistoryEntry, createUwfStore, loadAllHistory } from "../store.js";
|
||||
import {
|
||||
completeThread,
|
||||
createUwfStore,
|
||||
getThread,
|
||||
loadHistoryThreads,
|
||||
setThread,
|
||||
} from "../store.js";
|
||||
|
||||
async function makeUwfStore(storageRoot: string) {
|
||||
const casDir = join(storageRoot, "cas");
|
||||
@@ -20,88 +26,113 @@ async function seedHistoryHead(
|
||||
}
|
||||
|
||||
describe("thread cancel status", () => {
|
||||
test("cancelled history entry has reason 'cancelled'", async () => {
|
||||
test("cancelled thread has status 'cancelled'", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-cancel-test-"));
|
||||
const threadId = "01JTEST000000000000CANCEL1" as ThreadId;
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
const head = await seedHistoryHead(uwf, "cancelled-head");
|
||||
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow: "test-workflow",
|
||||
setThread(uwf.varStore, threadId, {
|
||||
head,
|
||||
completedAt: Date.now(),
|
||||
reason: "cancelled",
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
|
||||
const history = loadAllHistory(uwf.varStore);
|
||||
expect(history).toHaveLength(1);
|
||||
expect(history[0]?.reason).toBe("cancelled");
|
||||
completeThread(uwf.varStore, threadId, "cancelled");
|
||||
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
expect(entry).not.toBeNull();
|
||||
expect(entry?.status).toBe("cancelled");
|
||||
});
|
||||
|
||||
test("completed history entry has reason 'completed'", async () => {
|
||||
test("completed thread has status 'completed'", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-cancel-test-"));
|
||||
const threadId = "01JTEST000000000000CANCEL2" as ThreadId;
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
const head = await seedHistoryHead(uwf, "completed-head");
|
||||
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow: "test-workflow",
|
||||
setThread(uwf.varStore, threadId, {
|
||||
head,
|
||||
completedAt: Date.now(),
|
||||
reason: "completed",
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
|
||||
const history = loadAllHistory(uwf.varStore);
|
||||
expect(history).toHaveLength(1);
|
||||
expect(history[0]?.reason).toBe("completed");
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
expect(entry).not.toBeNull();
|
||||
expect(entry?.status).toBe("completed");
|
||||
});
|
||||
|
||||
test("history entry with null reason is stored as completed", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-cancel-test-"));
|
||||
const threadId = "01JTEST000000000000CANCEL3" as ThreadId;
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
const head = await seedHistoryHead(uwf, "legacy-head");
|
||||
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow: "test-workflow",
|
||||
head,
|
||||
completedAt: Date.now(),
|
||||
reason: null,
|
||||
});
|
||||
|
||||
const history = loadAllHistory(uwf.varStore);
|
||||
expect(history).toHaveLength(1);
|
||||
expect(history[0]?.reason).toBe("completed");
|
||||
});
|
||||
|
||||
test("mixed completed and cancelled entries preserve distinct reasons", async () => {
|
||||
test("loadHistoryThreads returns completed and cancelled", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-cancel-test-"));
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
const head1 = await seedHistoryHead(uwf, "head1");
|
||||
const head2 = await seedHistoryHead(uwf, "head2");
|
||||
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: "01JTEST000000000000CANCEL4" as ThreadId,
|
||||
workflow: "test-workflow",
|
||||
const threadId1 = "01JTEST000000000000CANCEL4" as ThreadId;
|
||||
setThread(uwf.varStore, threadId1, {
|
||||
head: head1,
|
||||
completedAt: Date.now(),
|
||||
reason: "completed",
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
completeThread(uwf.varStore, threadId1, "completed");
|
||||
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: "01JTEST000000000000CANCEL5" as ThreadId,
|
||||
workflow: "test-workflow",
|
||||
const threadId2 = "01JTEST000000000000CANCEL5" as ThreadId;
|
||||
setThread(uwf.varStore, threadId2, {
|
||||
head: head2,
|
||||
completedAt: Date.now(),
|
||||
reason: "cancelled",
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
completeThread(uwf.varStore, threadId2, "cancelled");
|
||||
|
||||
const history = loadHistoryThreads(uwf.varStore);
|
||||
expect(Object.keys(history)).toHaveLength(2);
|
||||
const statuses = Object.values(history)
|
||||
.map((entry) => entry.status)
|
||||
.sort();
|
||||
expect(statuses).toEqual(["cancelled", "completed"]);
|
||||
});
|
||||
|
||||
const history = loadAllHistory(uwf.varStore);
|
||||
expect(history).toHaveLength(2);
|
||||
const reasons = history.map((entry) => entry.reason).sort();
|
||||
expect(reasons).toEqual(["cancelled", "completed"]);
|
||||
test("mixed completed and cancelled entries preserve distinct statuses", async () => {
|
||||
const tmpDir = await mkdtemp(join(tmpdir(), "uwf-cancel-test-"));
|
||||
const uwf = await makeUwfStore(tmpDir);
|
||||
const head1 = await seedHistoryHead(uwf, "head1");
|
||||
const head2 = await seedHistoryHead(uwf, "head2");
|
||||
|
||||
const threadId1 = "01JTEST000000000000CANCEL6" as ThreadId;
|
||||
setThread(uwf.varStore, threadId1, {
|
||||
head: head1,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
completeThread(uwf.varStore, threadId1, "completed");
|
||||
|
||||
const threadId2 = "01JTEST000000000000CANCEL7" as ThreadId;
|
||||
setThread(uwf.varStore, threadId2, {
|
||||
head: head2,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
completeThread(uwf.varStore, threadId2, "cancelled");
|
||||
|
||||
const history = loadHistoryThreads(uwf.varStore);
|
||||
expect(Object.keys(history)).toHaveLength(2);
|
||||
const statuses = Object.values(history)
|
||||
.map((entry) => entry.status)
|
||||
.sort();
|
||||
expect(statuses).toEqual(["cancelled", "completed"]);
|
||||
});
|
||||
});
|
||||
|
||||
@@ -10,9 +10,8 @@ import { cmdThreadList } from "../commands/thread.js";
|
||||
import { parseTimeInput } from "../commands/thread-time-parser.js";
|
||||
import type { UwfStore } from "../store.js";
|
||||
import {
|
||||
addHistoryEntry,
|
||||
completeThread as completeThreadInStore,
|
||||
createUwfStore,
|
||||
deleteThread,
|
||||
loadAllThreads,
|
||||
setThread,
|
||||
} from "../store.js";
|
||||
@@ -73,18 +72,11 @@ async function markThreadRunning(storageRoot: string, threadId: ThreadId, workfl
|
||||
async function completeThread(
|
||||
storageRoot: string,
|
||||
threadId: ThreadId,
|
||||
workflowHash: CasRef,
|
||||
headHash: CasRef,
|
||||
_workflowHash: CasRef,
|
||||
_headHash: CasRef,
|
||||
) {
|
||||
const uwfIdx = await createUwfStore(storageRoot);
|
||||
deleteThread(uwfIdx.varStore, threadId);
|
||||
addHistoryEntry(uwfIdx.varStore, {
|
||||
thread: threadId,
|
||||
workflow: workflowHash,
|
||||
head: headHash,
|
||||
completedAt: Date.now(),
|
||||
reason: null,
|
||||
});
|
||||
completeThreadInStore(uwfIdx.varStore, threadId, "completed");
|
||||
}
|
||||
|
||||
// ── test setup ────────────────────────────────────────────────────────────────
|
||||
@@ -500,8 +492,10 @@ describe("edge cases", () => {
|
||||
)) as CasRef;
|
||||
index["INVALID_ULID_FORMAT_HERE" as ThreadId] = {
|
||||
head: placeholderHead,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
};
|
||||
for (const [tid, ent] of Object.entries(index)) {
|
||||
setThread(uwfIdx.varStore, tid as ThreadId, ent);
|
||||
|
||||
@@ -54,15 +54,19 @@ roles:
|
||||
type: object
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { type: string }
|
||||
$status: { const: "ready" }
|
||||
graph:
|
||||
$START:
|
||||
_:
|
||||
new:
|
||||
role: planner
|
||||
prompt: "Plan the work"
|
||||
location: null
|
||||
resume:
|
||||
role: planner
|
||||
prompt: "Resume the work"
|
||||
location: null
|
||||
planner:
|
||||
_:
|
||||
ready:
|
||||
role: $END
|
||||
prompt: "Done"
|
||||
location: null
|
||||
@@ -110,15 +114,19 @@ roles:
|
||||
type: object
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { type: string }
|
||||
$status: { const: "ready" }
|
||||
graph:
|
||||
$START:
|
||||
_:
|
||||
new:
|
||||
role: planner
|
||||
prompt: "Plan"
|
||||
location: null
|
||||
resume:
|
||||
role: planner
|
||||
prompt: "Resume"
|
||||
location: null
|
||||
planner:
|
||||
_:
|
||||
ready:
|
||||
role: $END
|
||||
prompt: "Done"
|
||||
location: null
|
||||
@@ -153,15 +161,19 @@ roles:
|
||||
type: object
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { type: string }
|
||||
$status: { const: "ready" }
|
||||
graph:
|
||||
$START:
|
||||
_:
|
||||
new:
|
||||
role: planner
|
||||
prompt: "Plan"
|
||||
location: null
|
||||
resume:
|
||||
role: planner
|
||||
prompt: "Resume"
|
||||
location: null
|
||||
planner:
|
||||
_:
|
||||
ready:
|
||||
role: $END
|
||||
prompt: "Done"
|
||||
location: null
|
||||
|
||||
@@ -70,7 +70,10 @@ async function setupSuspendedThread(mode: MockAgentMode): Promise<{
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: { _: { role: "worker", prompt: "Start work", location: null } },
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "Start work", location: null },
|
||||
resume: { role: "worker", prompt: "Resume the work", location: null },
|
||||
},
|
||||
worker: {
|
||||
needs_input: {
|
||||
role: "$SUSPEND",
|
||||
@@ -79,7 +82,7 @@ async function setupSuspendedThread(mode: MockAgentMode): Promise<{
|
||||
},
|
||||
ok: { role: "reviewer", prompt: "Review the work", location: null },
|
||||
},
|
||||
reviewer: { _: { role: "$END", prompt: "Done", location: null } },
|
||||
reviewer: { done: { role: "$END", prompt: "Done", location: null } },
|
||||
},
|
||||
});
|
||||
|
||||
@@ -118,8 +121,10 @@ async function setupSuspendedThread(mode: MockAgentMode): Promise<{
|
||||
await seedThreads(tmpDir, {
|
||||
[THREAD_ID]: {
|
||||
head: stepHash,
|
||||
status: "suspended",
|
||||
suspendedRole: "worker",
|
||||
suspendMessage: SUSPEND_MESSAGE,
|
||||
completedAt: null,
|
||||
},
|
||||
});
|
||||
|
||||
@@ -231,8 +236,11 @@ describe("uwf thread resume", () => {
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: { _: { role: "worker", prompt: "Start", location: null } },
|
||||
worker: { _: { role: "$END", prompt: "Done", location: null } },
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "Start", location: null },
|
||||
resume: { role: "worker", prompt: "Resume", location: null },
|
||||
},
|
||||
worker: { done: { role: "$END", prompt: "Done", location: null } },
|
||||
},
|
||||
});
|
||||
|
||||
@@ -247,7 +255,7 @@ describe("uwf thread resume", () => {
|
||||
|
||||
const result = runUwf(["thread", "resume", THREAD_ID], casDir);
|
||||
expect(result.status).not.toBe(0);
|
||||
expect(result.stderr).toContain("thread is not suspended");
|
||||
expect(result.stderr).toContain("thread cannot be resumed");
|
||||
});
|
||||
|
||||
test("resume suspended thread executes step and becomes idle", async () => {
|
||||
@@ -347,8 +355,10 @@ describe("uwf thread resume", () => {
|
||||
const uwfAfterFirst = await createUwfStore(tmpDir);
|
||||
expect(getThread(uwfAfterFirst.varStore, THREAD_ID)).toEqual({
|
||||
head: firstResume.head,
|
||||
status: "suspended",
|
||||
suspendedRole: "worker",
|
||||
suspendMessage: SUSPEND_MESSAGE,
|
||||
completedAt: null,
|
||||
});
|
||||
|
||||
const { mockAgentPath: okMockAgentPath } = await setupOkMockAgent(
|
||||
@@ -444,3 +454,272 @@ echo '${adapterJson}'
|
||||
|
||||
return { mockAgentPath };
|
||||
}
|
||||
|
||||
describe("uwf thread resume - completed threads", () => {
|
||||
test("resume completed thread starts from $START role", async () => {
|
||||
const casDir = join(tmpDir, "cas");
|
||||
await mkdir(casDir, { recursive: true });
|
||||
const store = await openStore(casDir);
|
||||
const schemas = await registerUwfSchemas(store);
|
||||
const outputSchemaHash = await putSchema(store, OUTPUT_SCHEMA);
|
||||
|
||||
const workflowHash = await store.cas.put(schemas.workflow, {
|
||||
name: "test-completed-resume",
|
||||
description: "completed thread resume test",
|
||||
roles: {
|
||||
worker: {
|
||||
description: "Worker role",
|
||||
goal: "Work",
|
||||
capabilities: [],
|
||||
procedure: "work",
|
||||
output: "result",
|
||||
frontmatter: outputSchemaHash,
|
||||
},
|
||||
reviewer: {
|
||||
description: "Reviewer role",
|
||||
goal: "Review",
|
||||
capabilities: [],
|
||||
procedure: "review",
|
||||
output: "result",
|
||||
frontmatter: outputSchemaHash,
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "Start work", location: null },
|
||||
resume: { role: "worker", prompt: "Resume the work", location: null },
|
||||
},
|
||||
worker: { done: { role: "reviewer", prompt: "Review the work", location: null } },
|
||||
reviewer: { done: { role: "$END", prompt: "Done", location: null } },
|
||||
},
|
||||
});
|
||||
|
||||
const startHash = await store.cas.put(schemas.startNode, {
|
||||
workflow: workflowHash,
|
||||
prompt: "Initial task",
|
||||
cwd: tmpDir,
|
||||
});
|
||||
|
||||
process.env.OCAS_HOME = casDir;
|
||||
|
||||
const workerOutputHash = await store.cas.put(outputSchemaHash, { $status: "done" });
|
||||
const reviewerOutputHash = await store.cas.put(outputSchemaHash, { $status: "done" });
|
||||
const detailHash = await store.cas.put(schemas.text, "mock detail");
|
||||
|
||||
const workerStepHash = await store.cas.put(schemas.stepNode, {
|
||||
start: startHash,
|
||||
prev: null,
|
||||
role: "worker",
|
||||
output: workerOutputHash,
|
||||
detail: detailHash,
|
||||
agent: "uwf-mock",
|
||||
edgePrompt: "Start work",
|
||||
startedAtMs: 1716600000000,
|
||||
completedAtMs: 1716600001000,
|
||||
cwd: tmpDir,
|
||||
assembledPrompt: null,
|
||||
});
|
||||
|
||||
const reviewerStepHash = await store.cas.put(schemas.stepNode, {
|
||||
start: startHash,
|
||||
prev: workerStepHash,
|
||||
role: "reviewer",
|
||||
output: reviewerOutputHash,
|
||||
detail: detailHash,
|
||||
agent: "uwf-mock",
|
||||
edgePrompt: "Review the work",
|
||||
startedAtMs: 1716600001000,
|
||||
completedAtMs: 1716600002000,
|
||||
cwd: tmpDir,
|
||||
assembledPrompt: null,
|
||||
});
|
||||
|
||||
await seedThreads(tmpDir, {
|
||||
[THREAD_ID]: {
|
||||
head: reviewerStepHash,
|
||||
status: "completed",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: 1716600002000,
|
||||
},
|
||||
});
|
||||
|
||||
// Verify the status was actually set
|
||||
const { createUwfStore, getThread } = await import("../store.js");
|
||||
const verifyUwf = await createUwfStore(tmpDir);
|
||||
const verifyEntry = getThread(verifyUwf.varStore, THREAD_ID);
|
||||
console.log("Seeded entry status:", verifyEntry?.status);
|
||||
console.log("Seeded entry:", JSON.stringify(verifyEntry, null, 2));
|
||||
|
||||
const promptCapturePath = join(tmpDir, "captured-prompt-completed.txt");
|
||||
const mockAgentPath = join(tmpDir, "mock-agent-completed.sh");
|
||||
|
||||
const newWorkerStepHash = await store.cas.put(schemas.stepNode, {
|
||||
start: startHash,
|
||||
prev: reviewerStepHash,
|
||||
role: "worker",
|
||||
output: workerOutputHash,
|
||||
detail: detailHash,
|
||||
agent: "uwf-mock",
|
||||
edgePrompt: "Start work",
|
||||
startedAtMs: 1716600003000,
|
||||
completedAtMs: 1716600004000,
|
||||
cwd: tmpDir,
|
||||
assembledPrompt: null,
|
||||
});
|
||||
|
||||
const adapterJson = JSON.stringify({
|
||||
stepHash: newWorkerStepHash,
|
||||
detailHash,
|
||||
role: "worker",
|
||||
frontmatter: { $status: "done" },
|
||||
body: "",
|
||||
startedAtMs: 1716600003000,
|
||||
completedAtMs: 1716600004000,
|
||||
});
|
||||
|
||||
await writeFile(
|
||||
mockAgentPath,
|
||||
`#!/bin/sh
|
||||
prompt=""
|
||||
while [ $# -gt 0 ]; do
|
||||
if [ "$1" = "--prompt" ]; then
|
||||
prompt="$2"
|
||||
shift 2
|
||||
else
|
||||
shift
|
||||
fi
|
||||
done
|
||||
printf '%s' "$prompt" > '${promptCapturePath}'
|
||||
echo '${adapterJson}'
|
||||
`,
|
||||
{ mode: 0o755 },
|
||||
);
|
||||
|
||||
const configPath = join(tmpDir, "config.yaml");
|
||||
await writeFile(
|
||||
configPath,
|
||||
`defaultAgent: uwf-hermes\ndefaultModel: test-model\nagentOverrides: null\nagents: {}\nproviders: {}\nmodels: {}\n`,
|
||||
);
|
||||
|
||||
const result = runUwf(
|
||||
["thread", "resume", THREAD_ID, "-p", "Additional context", "--agent", mockAgentPath],
|
||||
casDir,
|
||||
);
|
||||
|
||||
if (result.status !== 0) {
|
||||
console.error("Command failed:", result.stderr);
|
||||
}
|
||||
|
||||
expect(result.status).toBe(0);
|
||||
|
||||
const cliOutput = JSON.parse(result.stdout.trim());
|
||||
expect(cliOutput.status).toBe("idle");
|
||||
expect(cliOutput.currentRole).toBe("reviewer");
|
||||
expect(cliOutput.done).toBe(false);
|
||||
|
||||
const capturedPrompt = await readFile(promptCapturePath, "utf8");
|
||||
expect(capturedPrompt).toContain("Resume the work");
|
||||
expect(capturedPrompt).toContain("Additional context");
|
||||
|
||||
const storeModule = await import("../store.js");
|
||||
const uwf2 = await storeModule.createUwfStore(tmpDir);
|
||||
const entry2 = storeModule.getThread(uwf2.varStore, THREAD_ID);
|
||||
expect(entry2?.status).toBe("idle");
|
||||
expect(entry2?.completedAt).toBeNull();
|
||||
});
|
||||
|
||||
test("resume cancelled thread returns error", async () => {
|
||||
const casDir = join(tmpDir, "cas");
|
||||
await mkdir(casDir, { recursive: true });
|
||||
const store = await openStore(casDir);
|
||||
const schemas = await registerUwfSchemas(store);
|
||||
|
||||
const workflowHash = await store.cas.put(schemas.workflow, {
|
||||
name: "cancelled-workflow",
|
||||
description: "cancelled thread",
|
||||
roles: {
|
||||
worker: {
|
||||
description: "Worker",
|
||||
goal: "Work",
|
||||
capabilities: [],
|
||||
procedure: "work",
|
||||
output: "result",
|
||||
frontmatter: await putSchema(store, OUTPUT_SCHEMA),
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "Start", location: null },
|
||||
resume: { role: "worker", prompt: "Resume", location: null },
|
||||
},
|
||||
worker: { done: { role: "$END", prompt: "Done", location: null } },
|
||||
},
|
||||
});
|
||||
|
||||
const startHash = await store.cas.put(schemas.startNode, {
|
||||
workflow: workflowHash,
|
||||
prompt: "task",
|
||||
cwd: tmpDir,
|
||||
});
|
||||
|
||||
process.env.OCAS_HOME = casDir;
|
||||
await seedThreads(tmpDir, {
|
||||
[THREAD_ID]: {
|
||||
head: startHash,
|
||||
status: "cancelled",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
},
|
||||
});
|
||||
|
||||
const result = runUwf(["thread", "resume", THREAD_ID], casDir);
|
||||
expect(result.status).not.toBe(0);
|
||||
expect(result.stderr).toContain("thread cannot be resumed");
|
||||
expect(result.stderr).toContain("cancelled");
|
||||
});
|
||||
|
||||
test("resume idle thread returns error", async () => {
|
||||
const casDir = join(tmpDir, "cas");
|
||||
await mkdir(casDir, { recursive: true });
|
||||
const store = await openStore(casDir);
|
||||
const schemas = await registerUwfSchemas(store);
|
||||
|
||||
const workflowHash = await store.cas.put(schemas.workflow, {
|
||||
name: "idle-workflow",
|
||||
description: "idle thread",
|
||||
roles: {
|
||||
worker: {
|
||||
description: "Worker",
|
||||
goal: "Work",
|
||||
capabilities: [],
|
||||
procedure: "work",
|
||||
output: "result",
|
||||
frontmatter: await putSchema(store, OUTPUT_SCHEMA),
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "Start", location: null },
|
||||
resume: { role: "worker", prompt: "Resume", location: null },
|
||||
},
|
||||
worker: { done: { role: "$END", prompt: "Done", location: null } },
|
||||
},
|
||||
});
|
||||
|
||||
const startHash = await store.cas.put(schemas.startNode, {
|
||||
workflow: workflowHash,
|
||||
prompt: "task",
|
||||
cwd: tmpDir,
|
||||
});
|
||||
|
||||
process.env.OCAS_HOME = casDir;
|
||||
await seedThreads(tmpDir, { [THREAD_ID]: startHash });
|
||||
|
||||
const result = runUwf(["thread", "resume", THREAD_ID], casDir);
|
||||
expect(result.status).not.toBe(0);
|
||||
expect(result.stderr).toContain("thread cannot be resumed");
|
||||
expect(result.stderr).toContain("idle");
|
||||
});
|
||||
});
|
||||
|
||||
@@ -6,13 +6,7 @@ import type { CasRef, ThreadId } from "@united-workforce/protocol";
|
||||
import { describe, expect, test } from "vitest";
|
||||
import { createMarker, deleteMarker } from "../background/index.js";
|
||||
import { cmdThreadShow, cmdThreadStart } from "../commands/thread.js";
|
||||
import {
|
||||
addHistoryEntry,
|
||||
createUwfStore,
|
||||
deleteThread,
|
||||
loadAllThreads,
|
||||
setThread,
|
||||
} from "../store.js";
|
||||
import { completeThread, createUwfStore, loadAllThreads, setThread } from "../store.js";
|
||||
|
||||
const OUTPUT_SCHEMA = {
|
||||
type: "object" as const,
|
||||
@@ -37,15 +31,19 @@ roles:
|
||||
type: object
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { type: string }
|
||||
$status: { const: "ready" }
|
||||
graph:
|
||||
$START:
|
||||
_:
|
||||
new:
|
||||
role: planner
|
||||
prompt: "Plan the work"
|
||||
location: null
|
||||
resume:
|
||||
role: planner
|
||||
prompt: "Resume the work"
|
||||
location: null
|
||||
planner:
|
||||
_:
|
||||
ready:
|
||||
role: $END
|
||||
prompt: "Done"
|
||||
location: null
|
||||
@@ -72,10 +70,14 @@ roles:
|
||||
question: { type: string }
|
||||
graph:
|
||||
$START:
|
||||
_:
|
||||
new:
|
||||
role: worker
|
||||
prompt: "Start work"
|
||||
location: null
|
||||
resume:
|
||||
role: worker
|
||||
prompt: "Resume work"
|
||||
location: null
|
||||
worker:
|
||||
needs_input:
|
||||
role: $SUSPEND
|
||||
@@ -118,7 +120,13 @@ async function insertStepNode(
|
||||
assembledPrompt: null,
|
||||
})) as CasRef;
|
||||
|
||||
setThread(uwf.varStore, threadId, { head: stepHash, suspendedRole: null, suspendMessage: null });
|
||||
setThread(uwf.varStore, threadId, {
|
||||
head: stepHash,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
}
|
||||
|
||||
describe("thread show status field", () => {
|
||||
@@ -200,7 +208,7 @@ describe("thread show status field", () => {
|
||||
// Create a thread
|
||||
const startResult = await cmdThreadStart(storageRoot, workflowPath, "test prompt", tmpDir);
|
||||
const threadId = startResult.thread as ThreadId;
|
||||
const workflow = startResult.workflow;
|
||||
const _workflow = startResult.workflow;
|
||||
|
||||
// Get the head hash before moving to history
|
||||
const uwfForIndex = await createUwfStore(storageRoot);
|
||||
@@ -208,15 +216,7 @@ describe("thread show status field", () => {
|
||||
const head = index[threadId]!.head;
|
||||
if (!head) throw new Error("Thread not found in index");
|
||||
|
||||
deleteThread(uwfForIndex.varStore, threadId);
|
||||
|
||||
addHistoryEntry(uwfForIndex.varStore, {
|
||||
thread: threadId,
|
||||
workflow,
|
||||
head,
|
||||
completedAt: Date.now(),
|
||||
reason: "completed",
|
||||
});
|
||||
completeThread(uwfForIndex.varStore, threadId, "completed");
|
||||
|
||||
const result = await cmdThreadShow(storageRoot, threadId);
|
||||
|
||||
@@ -237,7 +237,7 @@ describe("thread show status field", () => {
|
||||
// Create a thread
|
||||
const startResult = await cmdThreadStart(storageRoot, workflowPath, "test prompt", tmpDir);
|
||||
const threadId = startResult.thread as ThreadId;
|
||||
const workflow = startResult.workflow;
|
||||
const _workflow = startResult.workflow;
|
||||
|
||||
// Get the head hash before moving to history
|
||||
const uwfForIndex = await createUwfStore(storageRoot);
|
||||
@@ -245,15 +245,7 @@ describe("thread show status field", () => {
|
||||
const head = index[threadId]!.head;
|
||||
if (!head) throw new Error("Thread not found in index");
|
||||
|
||||
deleteThread(uwfForIndex.varStore, threadId);
|
||||
|
||||
addHistoryEntry(uwfForIndex.varStore, {
|
||||
thread: threadId,
|
||||
workflow,
|
||||
head,
|
||||
completedAt: Date.now(),
|
||||
reason: "cancelled",
|
||||
});
|
||||
completeThread(uwfForIndex.varStore, threadId, "cancelled");
|
||||
|
||||
const result = await cmdThreadShow(storageRoot, threadId);
|
||||
|
||||
@@ -274,7 +266,7 @@ describe("thread show status field", () => {
|
||||
// Create a thread
|
||||
const startResult = await cmdThreadStart(storageRoot, workflowPath, "test prompt", tmpDir);
|
||||
const threadId = startResult.thread as ThreadId;
|
||||
const workflow = startResult.workflow;
|
||||
const _workflow = startResult.workflow;
|
||||
|
||||
// Get the head hash before moving to history
|
||||
const uwfForIndex = await createUwfStore(storageRoot);
|
||||
@@ -282,15 +274,7 @@ describe("thread show status field", () => {
|
||||
const head = index[threadId]!.head;
|
||||
if (!head) throw new Error("Thread not found in index");
|
||||
|
||||
deleteThread(uwfForIndex.varStore, threadId);
|
||||
|
||||
addHistoryEntry(uwfForIndex.varStore, {
|
||||
thread: threadId,
|
||||
workflow,
|
||||
head,
|
||||
completedAt: Date.now(),
|
||||
reason: null,
|
||||
});
|
||||
completeThread(uwfForIndex.varStore, threadId, "completed");
|
||||
|
||||
const result = await cmdThreadShow(storageRoot, threadId);
|
||||
|
||||
|
||||
@@ -54,15 +54,19 @@ roles:
|
||||
type: object
|
||||
required: ["$status"]
|
||||
properties:
|
||||
$status: { type: string }
|
||||
$status: { const: "ready" }
|
||||
graph:
|
||||
$START:
|
||||
_:
|
||||
new:
|
||||
role: planner
|
||||
prompt: "Plan the work"
|
||||
location: null
|
||||
resume:
|
||||
role: planner
|
||||
prompt: "Resume the work"
|
||||
location: null
|
||||
planner:
|
||||
_:
|
||||
ready:
|
||||
role: $END
|
||||
prompt: "Done"
|
||||
location: null
|
||||
|
||||
@@ -2,19 +2,28 @@ import { execFileSync } from "node:child_process";
|
||||
import { dirname, join } from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import { describe, expect, test } from "vitest";
|
||||
import { validateCount } from "../commands/thread.js";
|
||||
|
||||
const CLI_PATH = join(dirname(fileURLToPath(import.meta.url)), "..", "cli.js");
|
||||
const CLI_PATH = join(dirname(fileURLToPath(import.meta.url)), "..", "..", "dist", "cli.js");
|
||||
|
||||
function runCli(args: string[]): { stdout: string; stderr: string; exitCode: number } {
|
||||
function runCli(args: string[]): {
|
||||
stdout: string;
|
||||
stderr: string;
|
||||
exitCode: number;
|
||||
} {
|
||||
try {
|
||||
const stdout = execFileSync("npx", ["tsx", CLI_PATH, ...args], {
|
||||
const stdout = execFileSync("node", [CLI_PATH, ...args], {
|
||||
encoding: "utf8",
|
||||
env: { ...process.env, UWF_HOME: "/tmp/uwf-test-nonexistent" },
|
||||
stdio: ["ignore", "pipe", "pipe"],
|
||||
});
|
||||
return { stdout, stderr: "", exitCode: 0 };
|
||||
} catch (e: unknown) {
|
||||
const err = e as NodeJS.ErrnoException & { stdout?: string; stderr?: string; status?: number };
|
||||
const err = e as NodeJS.ErrnoException & {
|
||||
stdout?: string;
|
||||
stderr?: string;
|
||||
status?: number;
|
||||
};
|
||||
return {
|
||||
stdout: err.stdout ?? "",
|
||||
stderr: err.stderr ?? "",
|
||||
@@ -23,50 +32,39 @@ function runCli(args: string[]): { stdout: string; stderr: string; exitCode: num
|
||||
}
|
||||
}
|
||||
|
||||
describe("thread exec --count CLI parsing", () => {
|
||||
describe("thread exec --count CLI parsing", { timeout: 30_000 }, () => {
|
||||
test("--help shows -c/--count option", () => {
|
||||
const result = runCli(["thread", "exec", "--help"]);
|
||||
expect(result.stdout).toContain("--count");
|
||||
expect(result.stdout).toContain("-c");
|
||||
const combined = result.stdout + result.stderr;
|
||||
expect(combined).toContain("--count");
|
||||
expect(combined).toContain("-c");
|
||||
});
|
||||
|
||||
test("description says 'one or more steps'", () => {
|
||||
const result = runCli(["thread", "exec", "--help"]);
|
||||
expect(result.stdout).toContain("one or more steps");
|
||||
const combined = result.stdout + result.stderr;
|
||||
expect(combined).toContain("one or more steps");
|
||||
});
|
||||
});
|
||||
|
||||
describe("cmdThreadExec count logic", () => {
|
||||
test("count=0 fails with validation error", () => {
|
||||
const result = runCli(["thread", "exec", "FAKE_THREAD_ID", "-c", "0"]);
|
||||
expect(result.exitCode).not.toBe(0);
|
||||
expect(result.stderr).toContain("positive integer");
|
||||
describe("validateCount", () => {
|
||||
test("count=0 throws validation error", () => {
|
||||
expect(() => validateCount(0)).toThrow("positive integer");
|
||||
});
|
||||
|
||||
test("negative count fails with validation error", () => {
|
||||
const result = runCli(["thread", "exec", "FAKE_THREAD_ID", "-c", "-1"]);
|
||||
expect(result.exitCode).not.toBe(0);
|
||||
expect(result.stderr).toContain("positive integer");
|
||||
test("negative count throws validation error", () => {
|
||||
expect(() => validateCount(-1)).toThrow("positive integer");
|
||||
});
|
||||
|
||||
test("non-integer count fails with validation error", () => {
|
||||
const result = runCli(["thread", "exec", "FAKE_THREAD_ID", "-c", "1.5"]);
|
||||
expect(result.exitCode).not.toBe(0);
|
||||
expect(result.stderr).toContain("positive integer");
|
||||
test("non-integer count throws validation error", () => {
|
||||
expect(() => validateCount(1.5)).toThrow("positive integer");
|
||||
});
|
||||
|
||||
test("count=1 is the default (no -c flag)", () => {
|
||||
// Without -c, it should attempt to run 1 step (failing on missing thread, not on count validation)
|
||||
const result = runCli(["thread", "exec", "FAKE_THREAD_ID"]);
|
||||
expect(result.exitCode).not.toBe(0);
|
||||
// Should NOT contain "positive integer" error — should fail on thread lookup instead
|
||||
expect(result.stderr).not.toContain("positive integer");
|
||||
test("count=1 passes validation", () => {
|
||||
expect(() => validateCount(1)).not.toThrow();
|
||||
});
|
||||
|
||||
test("count=3 passes validation (fails on thread lookup)", () => {
|
||||
const result = runCli(["thread", "exec", "FAKE_THREAD_ID", "-c", "3"]);
|
||||
expect(result.exitCode).not.toBe(0);
|
||||
// Should NOT contain "positive integer" error — should fail on thread/storage lookup
|
||||
expect(result.stderr).not.toContain("positive integer");
|
||||
test("count=3 passes validation", () => {
|
||||
expect(() => validateCount(3)).not.toThrow();
|
||||
});
|
||||
});
|
||||
|
||||
@@ -58,7 +58,10 @@ describe("suspend step CAS chain and threads.yaml metadata", () => {
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: { _: { role: "worker", prompt: "Start work", location: null } },
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "Start work", location: null },
|
||||
resume: { role: "worker", prompt: "Resume work", location: null },
|
||||
},
|
||||
worker: {
|
||||
needs_input: {
|
||||
role: "$SUSPEND",
|
||||
@@ -160,8 +163,10 @@ describe("suspend step CAS chain and threads.yaml metadata", () => {
|
||||
const threadEntry = getThread(uwf.varStore, threadId);
|
||||
expect(threadEntry).toEqual({
|
||||
head: stepHash,
|
||||
status: "suspended",
|
||||
suspendedRole: "worker",
|
||||
suspendMessage: "Please clarify: Which API?",
|
||||
completedAt: null,
|
||||
});
|
||||
|
||||
const showResult = await cmdThreadShow(tmpDir, threadId);
|
||||
|
||||
@@ -55,7 +55,10 @@ describe("suspended thread display", () => {
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: { _: { role: "worker", prompt: "Start work", location: null } },
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "Start work", location: null },
|
||||
resume: { role: "worker", prompt: "Resume work", location: null },
|
||||
},
|
||||
worker: {
|
||||
needs_input: {
|
||||
role: "$SUSPEND",
|
||||
@@ -162,7 +165,10 @@ describe("suspended thread display", () => {
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: { _: { role: "worker", prompt: "Start work", location: null } },
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "Start work", location: null },
|
||||
resume: { role: "worker", prompt: "Resume work", location: null },
|
||||
},
|
||||
worker: {
|
||||
needs_input: {
|
||||
role: "$SUSPEND",
|
||||
@@ -248,7 +254,10 @@ describe("suspended thread display", () => {
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: { _: { role: "worker", prompt: "Start work", location: null } },
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "Start work", location: null },
|
||||
resume: { role: "worker", prompt: "Resume work", location: null },
|
||||
},
|
||||
},
|
||||
});
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ import {
|
||||
THREAD_READ_DEFAULT_QUOTA,
|
||||
} from "../commands/thread.js";
|
||||
import type { UwfStore } from "../store.js";
|
||||
import { addHistoryEntry, createUwfStore } from "../store.js";
|
||||
import { completeThread, createUwfStore, setThread } from "../store.js";
|
||||
import { seedThreads } from "./thread-test-helpers.js";
|
||||
|
||||
// ── schemas used in tests ────────────────────────────────────────────────────
|
||||
@@ -745,13 +745,14 @@ describe("cmdStepList with completed threads", () => {
|
||||
const threadId = "01JTEST0000000000000000A2" as ThreadId;
|
||||
// Thread is NOT in active index (simulating completed thread)
|
||||
// But it IS in history variable store
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow: workflowHash,
|
||||
setThread(uwf.varStore, threadId, {
|
||||
head: step2Hash,
|
||||
completedAt: Date.now(),
|
||||
reason: null,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
|
||||
const result = await cmdStepList(tmpDir, threadId);
|
||||
|
||||
@@ -872,14 +873,15 @@ describe("cmdStepShow with completed threads", () => {
|
||||
|
||||
const threadId = "01JTEST0000000000000000B2" as ThreadId;
|
||||
// Thread is NOT in active index
|
||||
// But it IS in history variable store
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow: workflowHash,
|
||||
// But it IS in the unified store with completed status
|
||||
setThread(uwf.varStore, threadId, {
|
||||
head: stepHash,
|
||||
completedAt: Date.now(),
|
||||
reason: null,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
|
||||
const result = await cmdStepShow(tmpDir, stepHash);
|
||||
|
||||
@@ -934,15 +936,15 @@ describe("cmdThreadRead with completed threads", () => {
|
||||
});
|
||||
|
||||
const threadId = "01JTEST0000000000000000C1" as ThreadId;
|
||||
// Thread is NOT in active index
|
||||
// But it IS in history variable store
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow: workflowHash,
|
||||
// Thread is in store with completed status
|
||||
setThread(uwf.varStore, threadId, {
|
||||
head: stepHash,
|
||||
completedAt: Date.now(),
|
||||
reason: null,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
|
||||
const markdown = await cmdThreadRead(tmpDir, threadId, THREAD_READ_DEFAULT_QUOTA, null, false);
|
||||
|
||||
@@ -998,13 +1000,14 @@ describe("cmdThreadRead with completed threads", () => {
|
||||
});
|
||||
|
||||
const threadId = "01JTEST0000000000000000C2" as ThreadId;
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow: workflowHash,
|
||||
setThread(uwf.varStore, threadId, {
|
||||
head: step3Hash,
|
||||
completedAt: Date.now(),
|
||||
reason: null,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
|
||||
const markdown = await cmdThreadRead(
|
||||
tmpDir,
|
||||
|
||||
@@ -17,7 +17,7 @@ function makeWorkflow(overrides?: Partial<WorkflowPayload>): WorkflowPayload {
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: {
|
||||
$status: { enum: ["_"] },
|
||||
$status: { const: "done" },
|
||||
plan: { type: "string" },
|
||||
},
|
||||
required: ["$status", "plan"],
|
||||
@@ -51,8 +51,11 @@ function makeWorkflow(overrides?: Partial<WorkflowPayload>): WorkflowPayload {
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: { _: { role: "writer", prompt: "Begin writing", location: null } },
|
||||
writer: { _: { role: "reviewer", prompt: "Review this: {{{plan}}}", location: null } },
|
||||
$START: {
|
||||
new: { role: "writer", prompt: "Begin writing", location: null },
|
||||
resume: { role: "writer", prompt: "Review previous output and continue", location: null },
|
||||
},
|
||||
writer: { done: { role: "reviewer", prompt: "Review this: {{{plan}}}", location: null } },
|
||||
reviewer: {
|
||||
approved: { role: "$END", prompt: "Done: {{{summary}}}", location: null },
|
||||
rejected: { role: "writer", prompt: "Fix: {{{reason}}}", location: null },
|
||||
@@ -82,7 +85,7 @@ describe("Suite 1: Role Reference Integrity", () => {
|
||||
output: "None",
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: { $status: { enum: ["_"] } },
|
||||
properties: { $status: { const: "done" } },
|
||||
required: ["$status"],
|
||||
} as unknown as string,
|
||||
};
|
||||
@@ -135,27 +138,38 @@ describe("Suite 2: Graph Structure", () => {
|
||||
expect(errors.some((e) => e.includes("$START must be defined in graph"))).toBe(true);
|
||||
});
|
||||
|
||||
test("2.2 $START has multiple status keys", () => {
|
||||
test("2.2 $START missing resume edge", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.graph.$START = {
|
||||
_: { role: "writer", prompt: "Begin", location: null },
|
||||
other: { role: "reviewer", prompt: "Also", location: null },
|
||||
new: { role: "writer", prompt: "Begin", location: null },
|
||||
};
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(
|
||||
errors.some((e) => e.includes('$START must have exactly one edge with status "_"')),
|
||||
errors.some((e) => e.includes('$START must have edges with statuses "new" and "resume"')),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
test("2.3 $START edge uses non-_ status", () => {
|
||||
test("2.3 $START missing new edge", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.graph.$START = { ready: { role: "writer", prompt: "Begin", location: null } };
|
||||
wf.graph.$START = {
|
||||
resume: { role: "writer", prompt: "Resume", location: null },
|
||||
};
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(
|
||||
errors.some((e) => e.includes('$START must have exactly one edge with status "_"')),
|
||||
errors.some((e) => e.includes('$START must have edges with statuses "new" and "resume"')),
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
test("2.3b $START with new and resume passes", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.graph.$START = {
|
||||
new: { role: "writer", prompt: "Begin", location: null },
|
||||
resume: { role: "writer", prompt: "Resume", location: null },
|
||||
};
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors.some((e) => e.includes("$START must have edges"))).toBe(false);
|
||||
});
|
||||
|
||||
test("2.4 $END has outgoing edges", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.graph.$END = { _: { role: "writer", prompt: "Loop", location: null } };
|
||||
@@ -173,11 +187,11 @@ describe("Suite 2: Graph Structure", () => {
|
||||
output: "Isolated",
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: { $status: { enum: ["_"] } },
|
||||
properties: { $status: { const: "done" } },
|
||||
required: ["$status"],
|
||||
} as unknown as string,
|
||||
};
|
||||
wf.graph.isolated = { _: { role: "$END", prompt: "done", location: null } };
|
||||
wf.graph.isolated = { done: { role: "$END", prompt: "done", location: null } };
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors.some((e) => e.includes('role "isolated" is not reachable from $START'))).toBe(
|
||||
true,
|
||||
@@ -186,34 +200,37 @@ describe("Suite 2: Graph Structure", () => {
|
||||
|
||||
test("2.6 edge target references invalid role", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.graph.writer = { _: { role: "ghost", prompt: "Go to ghost", location: null } };
|
||||
wf.graph.writer = { done: { role: "ghost", prompt: "Go to ghost", location: null } };
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors.some((e) => e.includes('unknown target role "ghost"'))).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe("Suite 3: Status-Edge Consistency", () => {
|
||||
test("3.1 single-exit role with multiple graph keys", () => {
|
||||
test("3.1 user role using _ graph key is treated as an unknown status", () => {
|
||||
// "_" is no longer special-cased — it's just a status key that does not
|
||||
// match the role's $status enum, so it surfaces as extra/missing keys.
|
||||
const wf = makeWorkflow();
|
||||
wf.graph.writer = {
|
||||
_: { role: "reviewer", prompt: "Review", location: null },
|
||||
extra: { role: "$END", prompt: "Done", location: null },
|
||||
};
|
||||
wf.graph.writer = { _: { role: "reviewer", prompt: "Review", location: null } };
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(
|
||||
errors.some((e) =>
|
||||
e.includes('role "writer" is single-exit but has status keys other than "_"'),
|
||||
),
|
||||
).toBe(true);
|
||||
expect(errors.some((e) => e.includes('role "writer" graph has extra status keys: _'))).toBe(
|
||||
true,
|
||||
);
|
||||
expect(errors.some((e) => e.includes('role "writer" graph is missing status keys: done'))).toBe(
|
||||
true,
|
||||
);
|
||||
});
|
||||
|
||||
test("3.2 single-exit role missing _ key", () => {
|
||||
test("3.2 user role graph key not matching $status enum", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.graph.writer = { done: { role: "reviewer", prompt: "Review", location: null } };
|
||||
wf.graph.writer = { wrong: { role: "reviewer", prompt: "Review", location: null } };
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(
|
||||
errors.some((e) => e.includes('role "writer" is single-exit but graph has no "_" key')),
|
||||
).toBe(true);
|
||||
expect(errors.some((e) => e.includes('role "writer" graph has extra status keys: wrong'))).toBe(
|
||||
true,
|
||||
);
|
||||
expect(errors.some((e) => e.includes('role "writer" graph is missing status keys: done'))).toBe(
|
||||
true,
|
||||
);
|
||||
});
|
||||
|
||||
test("3.3 multi-exit role with extra statuses", () => {
|
||||
@@ -240,18 +257,23 @@ describe("Suite 3: Status-Edge Consistency", () => {
|
||||
).toBe(true);
|
||||
});
|
||||
|
||||
test("3.5 multi-exit role with _ key", () => {
|
||||
test("3.5 multi-exit role with _ key is treated as an unknown status", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.graph.reviewer = { _: { role: "$END", prompt: "Done", location: null } };
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors.some((e) => e.includes('role "reviewer" is multi-exit but graph uses "_"'))).toBe(
|
||||
expect(errors.some((e) => e.includes('role "reviewer" graph has extra status keys: _'))).toBe(
|
||||
true,
|
||||
);
|
||||
expect(
|
||||
errors.some((e) =>
|
||||
e.includes('role "reviewer" graph is missing status keys: approved, rejected'),
|
||||
),
|
||||
).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe("Suite 3b: Enum-Based Multi-Exit", () => {
|
||||
test("3b.1 enum multi-exit passes with matching graph keys", () => {
|
||||
describe("Suite 3b: Enum-Based $status is Rejected", () => {
|
||||
test("3b.1 enum multi-exit is rejected (must use oneOf + const)", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.roles.reviewer = {
|
||||
...wf.roles.reviewer,
|
||||
@@ -269,99 +291,102 @@ describe("Suite 3b: Enum-Based Multi-Exit", () => {
|
||||
rejected: { role: "writer", prompt: "Fix: {{{comments}}}", location: null },
|
||||
};
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors).toEqual([]);
|
||||
expect(errors.some((e) => e.includes("must define") && e.includes("const"))).toBe(true);
|
||||
});
|
||||
|
||||
test("3b.2 enum multi-exit with extra graph key", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.roles.reviewer = {
|
||||
...wf.roles.reviewer,
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: {
|
||||
$status: { enum: ["approved", "rejected"] },
|
||||
comments: { type: "string" },
|
||||
},
|
||||
required: ["$status", "comments"],
|
||||
} as unknown as string,
|
||||
};
|
||||
wf.graph.reviewer = {
|
||||
approved: { role: "$END", prompt: "Done", location: null },
|
||||
rejected: { role: "writer", prompt: "Fix", location: null },
|
||||
timeout: { role: "$END", prompt: "Timed out", location: null },
|
||||
};
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors.some((e) => e.includes("extra status keys: timeout"))).toBe(true);
|
||||
});
|
||||
|
||||
test("3b.3 enum multi-exit with missing graph key", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.roles.reviewer = {
|
||||
...wf.roles.reviewer,
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: {
|
||||
$status: { enum: ["approved", "rejected"] },
|
||||
comments: { type: "string" },
|
||||
},
|
||||
required: ["$status", "comments"],
|
||||
} as unknown as string,
|
||||
};
|
||||
wf.graph.reviewer = {
|
||||
approved: { role: "$END", prompt: "Done", location: null },
|
||||
};
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors.some((e) => e.includes("missing status keys: rejected"))).toBe(true);
|
||||
});
|
||||
|
||||
test("3b.4 enum with single value (not multi-exit) treated as single-exit", () => {
|
||||
test("3b.2 enum single-exit is rejected (must use const)", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.roles.writer = {
|
||||
...wf.roles.writer,
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: {
|
||||
$status: { enum: ["_"] },
|
||||
$status: { enum: ["ready"] },
|
||||
plan: { type: "string" },
|
||||
},
|
||||
required: ["$status", "plan"],
|
||||
} as unknown as string,
|
||||
};
|
||||
wf.graph.writer = { ready: { role: "reviewer", prompt: "Review: {{{plan}}}", location: null } };
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors.some((e) => e.includes("must define") && e.includes("const"))).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe("Suite 3c: Const-Based Flat Schema", () => {
|
||||
test("3c.1 flat schema with const $status passes validation", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.roles.writer = {
|
||||
...wf.roles.writer,
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: {
|
||||
$status: { const: "done" },
|
||||
plan: { type: "string" },
|
||||
},
|
||||
required: ["$status", "plan"],
|
||||
} as unknown as string,
|
||||
};
|
||||
wf.graph.writer = { _: { role: "reviewer", prompt: "Review: {{{plan}}}", location: null } };
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors).toEqual([]);
|
||||
});
|
||||
|
||||
test("3b.5 enum multi-exit mustache var not in frontmatter", () => {
|
||||
test("3c.2 flat schema with const $status detects extra graph key", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.roles.reviewer = {
|
||||
...wf.roles.reviewer,
|
||||
wf.roles.writer = {
|
||||
...wf.roles.writer,
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: {
|
||||
$status: { enum: ["approved", "rejected"] },
|
||||
comments: { type: "string" },
|
||||
$status: { const: "done" },
|
||||
plan: { type: "string" },
|
||||
},
|
||||
required: ["$status", "comments"],
|
||||
required: ["$status", "plan"],
|
||||
} as unknown as string,
|
||||
};
|
||||
wf.graph.reviewer = {
|
||||
approved: { role: "$END", prompt: "Done: {{{nonexistent}}}", location: null },
|
||||
rejected: { role: "writer", prompt: "Fix: {{{comments}}}", location: null },
|
||||
wf.graph.writer = {
|
||||
done: { role: "reviewer", prompt: "Review.", location: null },
|
||||
extra: { role: "$END", prompt: "Nope.", location: null },
|
||||
};
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors.some((e) => e.includes("nonexistent") && e.includes("not found"))).toBe(true);
|
||||
expect(errors.some((e) => e.includes("extra status keys") && e.includes("extra"))).toBe(true);
|
||||
});
|
||||
|
||||
test("3c.3 flat schema with const $status validates mustache vars", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.roles.writer = {
|
||||
...wf.roles.writer,
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: {
|
||||
$status: { const: "done" },
|
||||
plan: { type: "string" },
|
||||
},
|
||||
required: ["$status", "plan"],
|
||||
} as unknown as string,
|
||||
};
|
||||
wf.graph.writer = {
|
||||
done: { role: "reviewer", prompt: "Review: {{{nonexistent}}}", location: null },
|
||||
};
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(
|
||||
errors.some(
|
||||
(e) => e.includes('prompt variable "nonexistent"') && e.includes('role "writer"'),
|
||||
),
|
||||
).toBe(true);
|
||||
});
|
||||
});
|
||||
|
||||
describe("Suite 4: Mustache Template Variable Existence", () => {
|
||||
test("4.1 prompt references nonexistent variable (single-exit)", () => {
|
||||
test("4.1 prompt references nonexistent variable (enum status)", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.graph.writer = { _: { role: "reviewer", prompt: "Review: {{{branch}}}", location: null } };
|
||||
wf.graph.writer = {
|
||||
done: { role: "reviewer", prompt: "Review: {{{branch}}}", location: null },
|
||||
};
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(
|
||||
errors.some((e) =>
|
||||
e.includes('prompt variable "branch" not found in role "writer" frontmatter'),
|
||||
errors.some(
|
||||
(e) => e.includes('prompt variable "branch"') && e.includes('role "writer" frontmatter'),
|
||||
),
|
||||
).toBe(true);
|
||||
});
|
||||
@@ -388,7 +413,7 @@ describe("Suite 4: Mustache Template Variable Existence", () => {
|
||||
|
||||
test("4.4 $status variable is always valid", () => {
|
||||
const wf = makeWorkflow();
|
||||
wf.graph.writer = { _: { role: "reviewer", prompt: "Status: {{$status}}", location: null } };
|
||||
wf.graph.writer = { done: { role: "reviewer", prompt: "Status: {{$status}}", location: null } };
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors).toEqual([]);
|
||||
});
|
||||
@@ -456,14 +481,14 @@ describe("Suite 6: Multiple Errors Collection", () => {
|
||||
output: "None",
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: { $status: { enum: ["_"] } },
|
||||
properties: { $status: { const: "done" } },
|
||||
required: ["$status"],
|
||||
} as unknown as string,
|
||||
};
|
||||
// unknown graph reference
|
||||
wf.graph.nonexistent = { _: { role: "$END", prompt: "done", location: null } };
|
||||
wf.graph.nonexistent = { done: { role: "$END", prompt: "done", location: null } };
|
||||
// bad mustache var
|
||||
wf.graph.writer = { _: { role: "reviewer", prompt: "{{{badvar}}}", location: null } };
|
||||
wf.graph.writer = { done: { role: "reviewer", prompt: "{{{badvar}}}", location: null } };
|
||||
const errors = validateWorkflow(wf);
|
||||
expect(errors.length).toBeGreaterThanOrEqual(3);
|
||||
});
|
||||
|
||||
@@ -31,15 +31,18 @@ function makeMinimalPayload(name: string, description: string): WorkflowPayload
|
||||
frontmatter: {
|
||||
type: "object",
|
||||
properties: {
|
||||
$status: { type: "string" },
|
||||
$status: { const: "done" },
|
||||
},
|
||||
required: ["$status"],
|
||||
} as unknown as CasRef,
|
||||
},
|
||||
},
|
||||
graph: {
|
||||
$START: { _: { role: "worker", prompt: "start working", location: null } },
|
||||
worker: { _: { role: "$END", prompt: "done", location: null } },
|
||||
$START: {
|
||||
new: { role: "worker", prompt: "start working", location: null },
|
||||
resume: { role: "worker", prompt: "resume working", location: null },
|
||||
},
|
||||
worker: { done: { role: "$END", prompt: "done", location: null } },
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
+28
-48
@@ -1,20 +1,17 @@
|
||||
#!/usr/bin/env node
|
||||
#!/usr/bin/env -S node --disable-warning=ExperimentalWarning
|
||||
|
||||
import type { CasRef, ThreadId, ThreadStatus } from "@united-workforce/protocol";
|
||||
import { Command } from "commander";
|
||||
import { cmdConfigGet, cmdConfigList, cmdConfigSet } from "./commands/config.js";
|
||||
import { cmdLogClean, cmdLogList, cmdLogShow } from "./commands/log.js";
|
||||
import {
|
||||
cmdPromptAdapter,
|
||||
cmdPromptAuthor,
|
||||
cmdPromptAdapterDeveloping,
|
||||
cmdPromptBootstrap,
|
||||
cmdPromptDeveloper,
|
||||
cmdPromptList,
|
||||
cmdPromptSetup,
|
||||
cmdPromptUsage,
|
||||
cmdPromptUser,
|
||||
cmdPromptWorkflowAuthoring,
|
||||
} from "./commands/prompt.js";
|
||||
import { cmdSetup, cmdSetupInteractive } from "./commands/setup.js";
|
||||
import { cmdSetup, cmdSetupInteractive, resolvePresetBaseUrl } from "./commands/setup.js";
|
||||
import { cmdStepFork, cmdStepList, cmdStepRead, cmdStepShow } from "./commands/step.js";
|
||||
import {
|
||||
cmdThreadCancel,
|
||||
@@ -510,53 +507,32 @@ prompt.addHelpCommand(false);
|
||||
|
||||
prompt
|
||||
.command("usage")
|
||||
.description("Print the complete skill content (all references combined)")
|
||||
.description("Print the usage reference (CLI guide + typical workflows)")
|
||||
.action(() => {
|
||||
console.log(cmdPromptUsage());
|
||||
});
|
||||
|
||||
prompt
|
||||
.command("setup")
|
||||
.description("Print setup instructions for installing the uwf skill")
|
||||
.action(() => {
|
||||
console.log(cmdPromptSetup());
|
||||
});
|
||||
|
||||
prompt
|
||||
.command("adapter")
|
||||
.description("Print the adapter reference (building agent adapters)")
|
||||
.action(() => {
|
||||
console.log(cmdPromptAdapter());
|
||||
});
|
||||
|
||||
prompt
|
||||
.command("author")
|
||||
.description("Print the author reference (workflow YAML design guide)")
|
||||
.action(() => {
|
||||
console.log(cmdPromptAuthor());
|
||||
});
|
||||
|
||||
prompt
|
||||
.command("developer")
|
||||
.description("Print the developer reference (coding conventions + architecture)")
|
||||
.action(() => {
|
||||
console.log(cmdPromptDeveloper());
|
||||
});
|
||||
|
||||
prompt
|
||||
.command("user")
|
||||
.description("Print the user reference (CLI guide + typical workflows)")
|
||||
.action(() => {
|
||||
console.log(cmdPromptUser());
|
||||
});
|
||||
|
||||
prompt
|
||||
.command("bootstrap")
|
||||
.description("Print the bootstrap skill YAML for Hermes agents")
|
||||
.description("Print setup instructions for installing uwf skills")
|
||||
.action(() => {
|
||||
console.log(cmdPromptBootstrap());
|
||||
});
|
||||
|
||||
prompt
|
||||
.command("workflow-authoring")
|
||||
.description("Print the workflow authoring reference (YAML design guide)")
|
||||
.action(() => {
|
||||
console.log(cmdPromptWorkflowAuthoring());
|
||||
});
|
||||
|
||||
prompt
|
||||
.command("adapter-developing")
|
||||
.description("Print the adapter developing reference (building agent adapters)")
|
||||
.action(() => {
|
||||
console.log(cmdPromptAdapterDeveloping());
|
||||
});
|
||||
|
||||
prompt
|
||||
.command("list")
|
||||
.description("List all available prompt names")
|
||||
@@ -566,7 +542,7 @@ prompt
|
||||
|
||||
program
|
||||
.command("setup")
|
||||
.description("Configure provider, model, and agent")
|
||||
.description("Configure provider, model, and agent. Run without options for interactive wizard.")
|
||||
.option("--provider <name>", "Provider name")
|
||||
.option("--base-url <url>", "OpenAI-compatible API base URL")
|
||||
.option("--api-key <key>", "API key")
|
||||
@@ -582,10 +558,14 @@ program
|
||||
}) => {
|
||||
const storageRoot = resolveStorageRoot();
|
||||
runAction(async () => {
|
||||
if (opts.provider && opts.baseUrl && opts.apiKey && opts.model) {
|
||||
// Resolve preset base-url when provider is known but --base-url is omitted
|
||||
const resolvedBaseUrl =
|
||||
opts.baseUrl ??
|
||||
(opts.provider !== undefined ? resolvePresetBaseUrl(opts.provider) : null);
|
||||
if (opts.provider && resolvedBaseUrl && opts.apiKey && opts.model) {
|
||||
const result = await cmdSetup({
|
||||
provider: opts.provider,
|
||||
baseUrl: opts.baseUrl,
|
||||
baseUrl: resolvedBaseUrl,
|
||||
apiKey: opts.apiKey,
|
||||
model: opts.model,
|
||||
agent: opts.agent ?? undefined,
|
||||
@@ -596,7 +576,7 @@ program
|
||||
await cmdSetupInteractive(storageRoot);
|
||||
} else {
|
||||
throw new Error(
|
||||
"Non-interactive setup requires all of: --provider, --base-url, --api-key, --model",
|
||||
"Non-interactive setup requires: --provider, --api-key, --model (--base-url is optional for preset providers)",
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
@@ -1,101 +1,330 @@
|
||||
import { readFileSync } from "node:fs";
|
||||
import { dirname, join } from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import {
|
||||
generateAdapterReference,
|
||||
generateAuthorReference,
|
||||
generateBootstrapReference,
|
||||
generateDeveloperReference,
|
||||
generateUserReference,
|
||||
generateAdapterDevelopingReference,
|
||||
generateUsageReference,
|
||||
generateWorkflowAuthoringReference,
|
||||
} from "@united-workforce/util";
|
||||
|
||||
// CLI package version (for bootstrap prompt — uwf --version prints this)
|
||||
// Walk up from __dirname to find the nearest package.json (works from both src/ and dist/)
|
||||
function _findCliVersion(): string {
|
||||
let dir = dirname(fileURLToPath(import.meta.url));
|
||||
for (let i = 0; i < 5; i++) {
|
||||
const candidate = join(dir, "package.json");
|
||||
try {
|
||||
const pkg = JSON.parse(readFileSync(candidate, "utf-8")) as {
|
||||
name?: string;
|
||||
version?: string;
|
||||
};
|
||||
if (pkg.name === "@united-workforce/cli") {
|
||||
return pkg.version ?? "0.0.0";
|
||||
}
|
||||
} catch {
|
||||
// not found, keep walking
|
||||
}
|
||||
dir = dirname(dir);
|
||||
}
|
||||
return "0.0.0";
|
||||
}
|
||||
const CLI_VERSION = _findCliVersion();
|
||||
|
||||
export {
|
||||
generateAdapterReference as cmdPromptAdapter,
|
||||
generateAuthorReference as cmdPromptAuthor,
|
||||
generateBootstrapReference as cmdPromptBootstrap,
|
||||
generateDeveloperReference as cmdPromptDeveloper,
|
||||
generateUserReference as cmdPromptUser,
|
||||
generateAdapterDevelopingReference as cmdPromptAdapterDeveloping,
|
||||
generateUsageReference as cmdPromptUsage,
|
||||
generateWorkflowAuthoringReference as cmdPromptWorkflowAuthoring,
|
||||
};
|
||||
|
||||
const PROMPT_ENTRIES: ReadonlyArray<{ name: string; generate: () => string }> = [
|
||||
{ name: "user", generate: generateUserReference },
|
||||
{ name: "author", generate: generateAuthorReference },
|
||||
{ name: "developer", generate: generateDeveloperReference },
|
||||
{ name: "adapter", generate: generateAdapterReference },
|
||||
{ name: "bootstrap", generate: generateBootstrapReference },
|
||||
{ name: "usage", generate: generateUsageReference },
|
||||
{ name: "workflow-authoring", generate: generateWorkflowAuthoringReference },
|
||||
{ name: "adapter-developing", generate: generateAdapterDevelopingReference },
|
||||
];
|
||||
|
||||
export function cmdPromptList(): ReadonlyArray<string> {
|
||||
return PROMPT_ENTRIES.map((e) => e.name);
|
||||
}
|
||||
|
||||
export function cmdPromptUsage(): string {
|
||||
return PROMPT_ENTRIES.map((e) => e.generate()).join("\n\n---\n\n");
|
||||
}
|
||||
export function cmdPromptBootstrap(): string {
|
||||
return `# uwf Bootstrap
|
||||
|
||||
export function cmdPromptSetup(): string {
|
||||
return `# uwf Skill Setup
|
||||
Set up or upgrade uwf (United Workforce) — from zero to running your first workflow.
|
||||
|
||||
You are being asked to install or update the uwf (United Workforce) skill
|
||||
so that you know how to use the \`uwf\` CLI for workflow orchestration.
|
||||
## Scenario A: Fresh Install
|
||||
|
||||
## Steps
|
||||
### Step 0 — Environment pre-flight check
|
||||
|
||||
Run ALL checks below BEFORE installing anything. If any check fails, fix it first — do NOT skip ahead.
|
||||
|
||||
1. **Check if uwf CLI is installed:**
|
||||
\`\`\`bash
|
||||
uwf --help
|
||||
\`\`\`
|
||||
If not installed: \`bun add -g @united-workforce/cli\`
|
||||
# 1. Node.js
|
||||
node --version # need v20+
|
||||
# FIX: install via nvm (https://github.com/nvm-sh/nvm) or your OS package manager
|
||||
|
||||
2. **Clean up old versions of the skill** (if any exist):
|
||||
- Look for any existing uwf/workflow skill files in your skill directories
|
||||
(e.g. \`~/.hermes/skills/uwf/SKILL.md\` or similar paths).
|
||||
- If a skill exists, read its \`version\` from the YAML frontmatter.
|
||||
- If the version matches the current CLI version (\`uwf --version\`), you're done — no update needed.
|
||||
- If the version is older, **delete the old skill file** before proceeding.
|
||||
# 2. Package manager — pnpm (preferred) or npm
|
||||
pnpm --version || npm --version
|
||||
# FIX for pnpm: corepack enable pnpm (Node 20+)
|
||||
# or: npm install -g pnpm
|
||||
# If pnpm is not an option, npm works too — adjust install commands below
|
||||
|
||||
3. **Get the usage skill content:**
|
||||
\`\`\`bash
|
||||
uwf prompt usage
|
||||
# 3. Global bin directory is in PATH
|
||||
# For pnpm:
|
||||
pnpm bin -g # prints the global bin directory
|
||||
# For npm:
|
||||
npm prefix -g # global prefix; bin is <prefix>/bin
|
||||
# The directory printed above MUST be in your PATH.
|
||||
# Test: echo \\$PATH | tr ':' '\\\\n'
|
||||
# FIX: add the missing directory to your shell's startup file
|
||||
# (e.g. ~/.bashrc, ~/.zshrc, ~/.profile, or fish config):
|
||||
# export PATH="<global-bin-dir>:$PATH"
|
||||
\`\`\`
|
||||
|
||||
4. **Create a SKILL.md file** in your skill directory (e.g. \`~/.hermes/skills/uwf/SKILL.md\`).
|
||||
The output of \`uwf prompt usage\` is the skill body WITHOUT frontmatter.
|
||||
You must prepend the following YAML frontmatter:
|
||||
**All checks must pass before continuing.** If you had to modify PATH, verify the change persists by opening a new shell or sourcing your shell config.
|
||||
|
||||
### Step 1 — Discover agents and install adapter
|
||||
|
||||
**First, detect which supported agents are already installed on the user's machine:**
|
||||
|
||||
\`\`\`bash
|
||||
# Check for Hermes Agent
|
||||
which hermes 2>/dev/null && hermes --version
|
||||
|
||||
# Check for Claude Code
|
||||
which claude 2>/dev/null && claude --version # should show "X.Y.Z (Claude Code)"
|
||||
\`\`\`
|
||||
|
||||
**Based on the results:**
|
||||
|
||||
- **Only hermes found** → install \`uwf-hermes\` adapter
|
||||
- **Only claude found** → install \`uwf-claude-code\` adapter
|
||||
- **Both found** → ask the user which agent they want uwf to use as default
|
||||
- **Neither found** → the user must install at least one agent first:
|
||||
- Hermes Agent: https://hermes-agent.nousresearch.com/docs
|
||||
- Claude Code: \`npm install -g @anthropic-ai/claude-code\`
|
||||
|
||||
**Install the uwf CLI and the chosen adapter** using pnpm or npm:
|
||||
|
||||
\`\`\`bash
|
||||
# CLI (required)
|
||||
pnpm add -g @united-workforce/cli # or: npm install -g @united-workforce/cli
|
||||
|
||||
# Adapter — install the one matching the detected agent:
|
||||
pnpm add -g @united-workforce/agent-hermes # or: npm i -g @united-workforce/agent-hermes
|
||||
pnpm add -g @united-workforce/agent-claude-code # or: npm i -g @united-workforce/agent-claude-code
|
||||
\`\`\`
|
||||
|
||||
**⚠ Adapter versions are independent from CLI versions.** Do NOT try to match adapter version to CLI version. Just install \`@latest\` (the default).
|
||||
|
||||
**After installing, verify that \`uwf\` and the adapter are available in PATH:**
|
||||
|
||||
\`\`\`bash
|
||||
uwf --version # should print ${CLI_VERSION}
|
||||
uwf-hermes --version # or: uwf-claude-code --version
|
||||
\`\`\`
|
||||
|
||||
If either command is not found, the global bin directory is not in the current shell's PATH. **You must fix this before continuing:**
|
||||
|
||||
1. Find where the binary was installed:
|
||||
\`\`\`bash
|
||||
find ~/.local ~/.hermes /usr/local -name uwf -type f 2>/dev/null
|
||||
npm prefix -g # global prefix — bin is <prefix>/bin
|
||||
\`\`\`
|
||||
2. Add the directory to PATH permanently by appending to the user's shell config (e.g. \`~/.bashrc\`, \`~/.zshrc\`, \`~/.profile\`, or fish config):
|
||||
\`\`\`bash
|
||||
export PATH="<global-bin-dir>:$PATH"
|
||||
\`\`\`
|
||||
3. Source the updated config or open a new shell, then re-verify the commands work.
|
||||
|
||||
**uwf-hermes** also requires the Hermes ACP plugin. Verify with \`hermes acp --help\`. If not available, install it:
|
||||
\`\`\`bash
|
||||
# Option A: install into hermes venv (recommended)
|
||||
source ~/.hermes/hermes-agent/.venv/bin/activate && pip install hermes-agent[acp]
|
||||
|
||||
# Option B: pipx
|
||||
pipx install 'hermes-agent[acp]'
|
||||
|
||||
# Option C: if installed from source
|
||||
pip install -e '.[acp]'
|
||||
\`\`\`
|
||||
|
||||
### Step 2 — Configure provider and model
|
||||
|
||||
uwf needs an LLM provider to run agents. **Ask the user** for their provider, API key, and model, then run:
|
||||
|
||||
\`\`\`bash
|
||||
uwf setup --provider <name> --api-key <key> --model <model> --agent <adapter-command>
|
||||
\`\`\`
|
||||
|
||||
**Note:** \`--agent\` takes the adapter **command name** (e.g. \`uwf-hermes\`), not the npm package name.
|
||||
|
||||
**Preset providers** — when using a preset name, \`--base-url\` is auto-filled and can be omitted:
|
||||
|
||||
| Provider | Name | Default base URL |
|
||||
|----------|------|-----------------|
|
||||
| OpenAI | \`openai\` | https://api.openai.com/v1 |
|
||||
| xAI | \`xai\` | https://api.x.ai/v1 |
|
||||
| OpenRouter | \`openrouter\` | https://openrouter.ai/api/v1 |
|
||||
| Venice | \`venice\` | https://api.venice.ai/api/v1 |
|
||||
| Dashscope | \`dashscope\` | https://dashscope.aliyuncs.com/compatible-mode/v1 |
|
||||
| DeepSeek | \`deepseek\` | https://api.deepseek.com/v1 |
|
||||
| SiliconFlow | \`siliconflow\` | https://api.siliconflow.cn/v1 |
|
||||
| VolcEngine | \`volcengine\` | https://ark.cn-beijing.volces.com/api/v3 |
|
||||
| Kimi (Moonshot) | \`kimi\` | https://api.moonshot.cn/v1 |
|
||||
| GLM (Zhipu AI) | \`glm\` | https://open.bigmodel.cn/api/paas/v4 |
|
||||
| StepFun | \`stepfun\` | https://api.stepfun.com/v1 |
|
||||
| MiniMax | \`minimax\` | https://api.minimax.io/v1 |
|
||||
| Ollama (local) | \`ollama\` | http://localhost:11434/v1 |
|
||||
|
||||
For **non-preset providers**, you must specify \`--base-url\` manually.
|
||||
|
||||
Example:
|
||||
\`\`\`bash
|
||||
uwf setup --provider openrouter --api-key sk-or-... --model anthropic/claude-sonnet-4 --agent uwf-hermes
|
||||
\`\`\`
|
||||
|
||||
If the user doesn't know what to choose, suggest \`openrouter\` with \`anthropic/claude-sonnet-4\` as a sensible default.
|
||||
|
||||
Config is saved to \`~/.uwf/config.yaml\`. Verify with \`cat ~/.uwf/config.yaml\`.
|
||||
|
||||
### Step 3 — Install skills
|
||||
|
||||
Run each command and create a skill from its output:
|
||||
|
||||
\`\`\`bash
|
||||
uwf prompt usage # → save as skill "uwf-usage"
|
||||
uwf prompt workflow-authoring # → save as skill "uwf-workflow-authoring"
|
||||
uwf prompt adapter-developing # → save as skill "uwf-adapter-developing"
|
||||
\`\`\`
|
||||
|
||||
Each command outputs a complete SKILL.md with YAML frontmatter. Use your agent framework's skill creation API to save them (e.g. \`skill_manage(action='create', name='uwf-usage', content=<output>)\`).
|
||||
|
||||
Verify skills are installed by listing them (e.g. \`skills_list()\`) and confirming all three appear.
|
||||
|
||||
**⚠ After saving all skills, start a new session** so the agent loads the updated skill content. Skills saved in the current session are not active until the next session.
|
||||
|
||||
### Step 4 — Verify end-to-end
|
||||
|
||||
Create a minimal workflow file to test your setup:
|
||||
|
||||
\`\`\`bash
|
||||
cat > /tmp/hello.yaml << 'YAML'
|
||||
name: hello
|
||||
description: Minimal smoke test
|
||||
roles:
|
||||
greeter:
|
||||
description: "Greet the user"
|
||||
goal: "Respond with a friendly greeting"
|
||||
capabilities: []
|
||||
procedure: "Write a short greeting based on the prompt."
|
||||
output: "A greeting message."
|
||||
frontmatter:
|
||||
type: object
|
||||
properties:
|
||||
$status: { const: done }
|
||||
message: { type: string }
|
||||
required: [$status, message]
|
||||
graph:
|
||||
$START:
|
||||
new: { role: greeter, prompt: "Say hello to the user." }
|
||||
resume: { role: greeter, prompt: "Greet the user again." }
|
||||
greeter:
|
||||
done: { role: "$END", prompt: "Done." }
|
||||
YAML
|
||||
\`\`\`
|
||||
|
||||
Then run:
|
||||
|
||||
\`\`\`bash
|
||||
uwf thread start /tmp/hello.yaml -p "Hello, world!"
|
||||
uwf thread exec <thread-id>
|
||||
uwf thread show <thread-id>
|
||||
\`\`\`
|
||||
|
||||
If the thread reaches \`$END\` with status \`completed\`, the setup is working.
|
||||
|
||||
## Scenario B: Upgrade from Previous Version
|
||||
|
||||
### Step 1 — Update packages
|
||||
|
||||
\`\`\`bash
|
||||
# Using pnpm
|
||||
pnpm add -g @united-workforce/cli@latest
|
||||
|
||||
# Using npm
|
||||
npm install -g @united-workforce/cli@latest
|
||||
\`\`\`
|
||||
|
||||
\`\`\`bash
|
||||
uwf --version # should print ${CLI_VERSION}
|
||||
\`\`\`
|
||||
|
||||
Also update your adapter(s):
|
||||
|
||||
\`\`\`bash
|
||||
# pnpm
|
||||
pnpm add -g @united-workforce/agent-hermes@latest
|
||||
|
||||
# npm
|
||||
npm install -g @united-workforce/agent-hermes@latest
|
||||
\`\`\`
|
||||
|
||||
### Step 2 — Regenerate skills
|
||||
|
||||
Skill content is bundled with the CLI — always regenerate after upgrading:
|
||||
|
||||
\`\`\`bash
|
||||
uwf prompt usage # → update skill "uwf-usage"
|
||||
uwf prompt workflow-authoring # → update skill "uwf-workflow-authoring"
|
||||
uwf prompt adapter-developing # → update skill "uwf-adapter-developing"
|
||||
\`\`\`
|
||||
|
||||
**⚠ After updating skills, start a new session** to load the new skill content.
|
||||
|
||||
### Step 3 — Migrate workflow YAML files (if needed)
|
||||
|
||||
Check the changelog for breaking changes. Known migrations:
|
||||
|
||||
- **v0.2.0**: \`$START._\` → \`$START.new\` + \`$START.resume\`. All workflow YAML files must be updated:
|
||||
\`\`\`yaml
|
||||
---
|
||||
name: uwf
|
||||
description: "Use when orchestrating multi-agent workflows with uwf (United Workforce). Covers CLI usage, workflow YAML authoring, moderator logic, and agent development."
|
||||
version: <VERSION>
|
||||
author: United Workforce
|
||||
license: MIT
|
||||
metadata:
|
||||
hermes:
|
||||
tags: [workflow, agents, orchestration, cli, yaml]
|
||||
---
|
||||
# Before (v0.1.x)
|
||||
$START:
|
||||
_: { role: planner, prompt: "..." }
|
||||
|
||||
# After (v0.2.0+)
|
||||
$START:
|
||||
new: { role: planner, prompt: "..." }
|
||||
resume: { role: planner, prompt: "Review previous run and continue." }
|
||||
\`\`\`
|
||||
|
||||
Replace \`<VERSION>\` with the installed CLI version (from \`uwf --version\`).
|
||||
Update all \`.workflow/\` and \`.workflows/\` YAML files in your projects. \`uwf workflow add\` will reject files with the old \`_\` syntax.
|
||||
|
||||
5. **Verify** the skill is loadable by your agent framework.
|
||||
- **v0.2.1**: \`$status: { enum: [value] }\` → \`$status: { const: "value" }\`. The validator no longer accepts \`enum\` for \`$status\`. Update all workflow YAML files:
|
||||
\`\`\`yaml
|
||||
# Before (v0.2.0)
|
||||
$status: { enum: [done] }
|
||||
$status: { type: string, enum: ["ready", "failed"] }
|
||||
|
||||
## Individual prompts
|
||||
# After (v0.2.1+)
|
||||
$status: { const: "done" }
|
||||
# For multi-exit, use oneOf with const (unchanged)
|
||||
\`\`\`
|
||||
|
||||
You can also get individual reference sections:
|
||||
### Step 4 — Verify
|
||||
|
||||
\`\`\`bash
|
||||
uwf thread start <your-workflow> -p "upgrade test"
|
||||
uwf thread exec <thread-id>
|
||||
\`\`\`
|
||||
|
||||
## Available prompts
|
||||
|
||||
\`\`\`bash
|
||||
uwf prompt list # list available prompt names
|
||||
uwf prompt user # user reference (CLI guide + typical workflows)
|
||||
uwf prompt author # author reference (workflow YAML design guide)
|
||||
uwf prompt developer # developer reference (coding conventions + architecture)
|
||||
uwf prompt adapter # adapter reference (building agent adapters)
|
||||
uwf prompt bootstrap # bootstrap skill YAML for Hermes agents
|
||||
uwf prompt usage # CLI usage guide
|
||||
uwf prompt workflow-authoring # workflow YAML design guide
|
||||
uwf prompt adapter-developing # building agent adapters
|
||||
uwf prompt bootstrap # this guide
|
||||
\`\`\`
|
||||
|
||||
## Notes
|
||||
|
||||
- The skill content is bundled with the CLI and versioned with it — always use
|
||||
\`uwf prompt usage\` to get the content matching your installed version.
|
||||
- Do NOT hand-edit the skill body. If the CLI is updated, re-run \`uwf prompt setup\`
|
||||
and follow the steps again.
|
||||
- When upgrading, always delete the old skill first to avoid stale instructions.
|
||||
`;
|
||||
}
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import { execFileSync } from "node:child_process";
|
||||
import { existsSync, mkdirSync, readdirSync, readFileSync, statSync, writeFileSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import { stdin as input, stdout as output } from "node:process";
|
||||
@@ -72,6 +73,12 @@ const PRESET_PROVIDERS = [
|
||||
{ name: "ollama", label: "Ollama (local)", baseUrl: "http://localhost:11434/v1" },
|
||||
] as const;
|
||||
|
||||
/**
 * Resolve a provider name to the base URL of the matching preset.
 *
 * @param providerName - Name compared (strict equality) against each preset's `name`.
 * @returns The `baseUrl` of the first preset whose name matches, or `null` when no
 *   preset has that name.
 */
export function resolvePresetBaseUrl(providerName: string): string | null {
  for (const candidate of PRESET_PROVIDERS) {
    if (candidate.name === providerName) {
      return candidate.baseUrl;
    }
  }
  // Not a preset.
  return null;
}
|
||||
|
||||
type SetupArgs = {
|
||||
provider: string;
|
||||
baseUrl: string;
|
||||
@@ -175,7 +182,6 @@ export async function _discoverAgents(): Promise<string[]> {
|
||||
|
||||
async function _tryWhichDiscovery(): Promise<string[] | null> {
|
||||
try {
|
||||
const { execFileSync } = await import("node:child_process");
|
||||
const text = execFileSync("which", ["-a", "uwf-hermes", "uwf-claude-code", "uwf-cursor"], {
|
||||
encoding: "utf-8",
|
||||
stdio: ["pipe", "pipe", "pipe"],
|
||||
@@ -391,6 +397,37 @@ function mergeConfig(existing: Record<string, unknown>, args: SetupArgs): Record
|
||||
};
|
||||
}
|
||||
|
||||
/**
 * Report PATH problems for the adapter that backs `agentName`.
 *
 * The adapter binary `uwf-<agentName>` is looked up with `which`. When the agent is
 * "hermes", the `hermes` CLI is looked up as well, but only if the adapter itself
 * was found.
 *
 * @param agentName - Agent name without the `uwf-` prefix (e.g. "hermes").
 * @returns Human-readable warnings; an empty array means every lookup succeeded.
 */
export function _checkAdapterAvailability(agentName: string): string[] {
  // execFileSync throws when `which` fails, which is treated as "not in PATH".
  const isOnPath = (command: string): boolean => {
    try {
      execFileSync("which", [command], { encoding: "utf8", stdio: ["pipe", "pipe", "pipe"] });
      return true;
    } catch {
      return false;
    }
  };

  const adapterBinary = `uwf-${agentName}`;
  if (!isOnPath(adapterBinary)) {
    // The adapter is missing, so its dependency check is skipped.
    return [
      `${adapterBinary} not found in PATH. Install it: pnpm add -g @united-workforce/agent-${agentName}`,
    ];
  }

  // uwf-hermes depends on the hermes CLI.
  const needsHermesCli = agentName === "hermes";
  if (needsHermesCli && !isOnPath("hermes")) {
    return [
      'hermes CLI not found in PATH (required by uwf-hermes). Fix: export PATH="$HOME/.hermes/hermes-agent/.venv/bin:$PATH"',
    ];
  }

  return [];
}
|
||||
|
||||
/**
|
||||
* Non-interactive setup. All required args provided via CLI flags.
|
||||
*/
|
||||
@@ -405,15 +442,26 @@ export async function cmdSetup(args: SetupArgs): Promise<Record<string, unknown>
|
||||
|
||||
writeFileSync(configPath, stringify(merged, { indent: 2 }), "utf8");
|
||||
|
||||
// Print config path to stderr (stdout is reserved for JSON output)
|
||||
console.error(`Config saved to ${configPath} ✓`);
|
||||
|
||||
// Validate model connectivity
|
||||
const validation = await validateModel(args.baseUrl, args.apiKey, args.model);
|
||||
|
||||
// Check adapter availability
|
||||
const agentName = _agentNameFromBinary(args.agent ?? "hermes");
|
||||
const adapterWarnings = _checkAdapterAvailability(agentName);
|
||||
for (const w of adapterWarnings) {
|
||||
console.error(`⚠ ${w}`);
|
||||
}
|
||||
|
||||
return {
|
||||
configPath,
|
||||
provider: args.provider,
|
||||
model: args.model,
|
||||
defaultAgent: merged.defaultAgent,
|
||||
validation,
|
||||
adapterWarnings,
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ import type {
|
||||
StepNodePayload,
|
||||
ThreadId,
|
||||
} from "@united-workforce/protocol";
|
||||
import { createUwfStore, findHistoryEntry, getThread, type UwfStore } from "../store.js";
|
||||
import { createUwfStore, getThread, type UwfStore } from "../store.js";
|
||||
|
||||
type ChainState = {
|
||||
startHash: CasRef;
|
||||
@@ -207,10 +207,6 @@ async function resolveHeadHash(storageRoot: string, threadId: ThreadId): Promise
|
||||
if (entry !== null) {
|
||||
return entry.head;
|
||||
}
|
||||
const hist = findHistoryEntry(uwf.varStore, threadId);
|
||||
if (hist !== null) {
|
||||
return hist.head;
|
||||
}
|
||||
fail(`thread not found: ${threadId}`);
|
||||
}
|
||||
|
||||
|
||||
@@ -66,6 +66,7 @@ export async function cmdStepList(
|
||||
agent: item.payload.agent,
|
||||
timestamp: item.timestamp,
|
||||
durationMs: item.payload.completedAtMs - item.payload.startedAtMs,
|
||||
usage: item.payload.usage ?? null,
|
||||
});
|
||||
}
|
||||
|
||||
@@ -114,8 +115,10 @@ export async function cmdStepFork(
|
||||
const newThreadId = generateUlid(Date.now()) as ThreadId;
|
||||
setThread(uwf.varStore, newThreadId, {
|
||||
head: stepHash,
|
||||
status: "idle",
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: null,
|
||||
});
|
||||
|
||||
return {
|
||||
|
||||
@@ -38,17 +38,14 @@ import { createMarker, deleteMarker, isThreadRunning } from "../background/index
|
||||
import { createIncludeTag } from "../include.js";
|
||||
import { evaluate, isSuspendResult } from "../moderator/index.js";
|
||||
import {
|
||||
addHistoryEntry,
|
||||
completeThread,
|
||||
createUwfStore,
|
||||
deleteThread,
|
||||
findHistoryEntry,
|
||||
getThread,
|
||||
loadAllHistory,
|
||||
loadAllThreads,
|
||||
loadActiveThreads,
|
||||
loadHistoryThreads,
|
||||
loadWorkflowRegistry,
|
||||
resolveWorkflowHash,
|
||||
setThread,
|
||||
type ThreadHistoryLine,
|
||||
type UwfStore,
|
||||
} from "../store.js";
|
||||
import { checkWorkflowFilenameConsistency, isCasRef, parseWorkflowPayload } from "../validate.js";
|
||||
@@ -485,20 +482,35 @@ export async function cmdThreadShow(
|
||||
): Promise<ThreadShowOutput> {
|
||||
const uwf = await createUwfStore(storageRoot);
|
||||
const entry = getThread(uwf.varStore, threadId);
|
||||
if (entry !== null) {
|
||||
if (entry === null) {
|
||||
fail(`thread not found: ${threadId}`);
|
||||
}
|
||||
|
||||
const activeHead = entry.head;
|
||||
const workflow = resolveWorkflowFromHead(uwf, activeHead);
|
||||
if (workflow === null) {
|
||||
fail(`failed to resolve workflow from head: ${activeHead}`);
|
||||
}
|
||||
|
||||
const status = await resolveActiveThreadStatus(
|
||||
storageRoot,
|
||||
threadId,
|
||||
uwf,
|
||||
activeHead,
|
||||
// Determine if this is a completed/cancelled thread
|
||||
if (entry.status === "completed" || entry.status === "cancelled") {
|
||||
const hint = null;
|
||||
return {
|
||||
workflow,
|
||||
);
|
||||
thread: threadId,
|
||||
head: activeHead,
|
||||
status: entry.status,
|
||||
currentRole: null,
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
done: true,
|
||||
background: null,
|
||||
hint,
|
||||
};
|
||||
}
|
||||
|
||||
// Active thread
|
||||
const status = await resolveActiveThreadStatus(storageRoot, threadId, uwf, activeHead, workflow);
|
||||
const currentRole = resolveCurrentRole(uwf, activeHead, workflow);
|
||||
const suspendFields = resolveSuspendFieldsForShow(entry, status, uwf, activeHead, workflow);
|
||||
|
||||
@@ -521,27 +533,6 @@ export async function cmdThreadShow(
|
||||
};
|
||||
}
|
||||
|
||||
const hist = findHistoryEntry(uwf.varStore, threadId);
|
||||
if (hist !== null) {
|
||||
const status: ThreadStatus = hist.reason === "cancelled" ? "cancelled" : "completed";
|
||||
|
||||
return {
|
||||
workflow: hist.workflow,
|
||||
thread: threadId,
|
||||
head: hist.head,
|
||||
status,
|
||||
currentRole: null,
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
done: true,
|
||||
background: null,
|
||||
hint: null,
|
||||
};
|
||||
}
|
||||
|
||||
fail(`thread not found: ${threadId}`);
|
||||
}
|
||||
|
||||
export type ThreadListItemWithStatus = ThreadListItem & {
|
||||
status: ThreadStatus;
|
||||
currentRole: string | null;
|
||||
@@ -594,19 +585,20 @@ async function collectActiveThreads(
|
||||
}
|
||||
|
||||
function collectCompletedThreads(
|
||||
varStore: VarStore,
|
||||
uwf: UwfStore,
|
||||
activeIds: Set<ThreadId>,
|
||||
): ThreadListItemWithStatus[] {
|
||||
const items: ThreadListItemWithStatus[] = [];
|
||||
const history = loadAllHistory(varStore);
|
||||
const history = loadHistoryThreads(uwf.varStore);
|
||||
const seen = new Set<ThreadId>(); // Deduplication (issue #470)
|
||||
for (const entry of history) {
|
||||
if (!activeIds.has(entry.thread) && !seen.has(entry.thread)) {
|
||||
seen.add(entry.thread);
|
||||
const status = entry.reason === "cancelled" ? "cancelled" : "completed";
|
||||
for (const [threadId, entry] of Object.entries(history)) {
|
||||
if (!activeIds.has(threadId as ThreadId) && !seen.has(threadId as ThreadId)) {
|
||||
seen.add(threadId as ThreadId);
|
||||
const status = entry.status;
|
||||
const workflow = resolveWorkflowFromHead(uwf, entry.head);
|
||||
items.push({
|
||||
thread: entry.thread,
|
||||
workflow: entry.workflow,
|
||||
thread: threadId as ThreadId,
|
||||
workflow: workflow ?? "",
|
||||
head: entry.head,
|
||||
status,
|
||||
currentRole: null,
|
||||
@@ -659,7 +651,7 @@ export async function cmdThreadList(
|
||||
take: number | null,
|
||||
): Promise<ThreadListItemWithStatus[]> {
|
||||
const uwf = await createUwfStore(storageRoot);
|
||||
const index = loadAllThreads(uwf.varStore);
|
||||
const index = loadActiveThreads(uwf.varStore);
|
||||
|
||||
// Collect active threads
|
||||
let items = await collectActiveThreads(storageRoot, uwf, index);
|
||||
@@ -671,7 +663,7 @@ export async function cmdThreadList(
|
||||
statusFilter.includes("cancelled");
|
||||
if (includeCompleted) {
|
||||
const activeIds = new Set(items.map((i) => i.thread));
|
||||
const completedItems = collectCompletedThreads(uwf.varStore, activeIds);
|
||||
const completedItems = collectCompletedThreads(uwf, activeIds);
|
||||
items = items.concat(completedItems);
|
||||
}
|
||||
|
||||
@@ -919,7 +911,7 @@ function resolveEvaluateArgs(
|
||||
chain: ChainState,
|
||||
): { lastRole: string; lastOutput: EvaluateLastOutput } {
|
||||
if (chain.headIsStart) {
|
||||
return { lastRole: START_ROLE, lastOutput: { [STATUS_KEY]: "_" } };
|
||||
return { lastRole: START_ROLE, lastOutput: { [STATUS_KEY]: "new" } };
|
||||
}
|
||||
|
||||
const lastStep = chain.stepsNewestFirst[0];
|
||||
@@ -969,6 +961,12 @@ function resolveAgentConfig(
|
||||
agentOverride: string | null,
|
||||
): AgentConfig {
|
||||
if (agentOverride !== null) {
|
||||
// Try config alias first (e.g. "hermes" → config.agents.hermes),
|
||||
// then fall back to raw command name (e.g. "uwf-hermes" or "/usr/bin/agent").
|
||||
const fromAlias = config.agents[agentOverride as AgentAlias];
|
||||
if (fromAlias !== undefined) {
|
||||
return fromAlias;
|
||||
}
|
||||
return parseAgentOverride(agentOverride);
|
||||
}
|
||||
|
||||
@@ -1006,6 +1004,12 @@ function spawnAgent(
|
||||
});
|
||||
} catch (e) {
|
||||
const err = e as NodeJS.ErrnoException & { stderr?: Buffer | string | null };
|
||||
if (err.code === "ENOENT") {
|
||||
failStep(
|
||||
plog,
|
||||
`"${agent.command}" not found in PATH. Install it or check your PATH config. Run: which ${agent.command}`,
|
||||
);
|
||||
}
|
||||
const stderr =
|
||||
err.stderr == null
|
||||
? ""
|
||||
@@ -1035,15 +1039,8 @@ function spawnAgent(
|
||||
return obj as unknown as AdapterOutput;
|
||||
}
|
||||
|
||||
function archiveThread(uwf: UwfStore, threadId: ThreadId, workflow: CasRef, head: CasRef): void {
|
||||
deleteThread(uwf.varStore, threadId);
|
||||
addHistoryEntry(uwf.varStore, {
|
||||
thread: threadId,
|
||||
workflow,
|
||||
head,
|
||||
completedAt: Date.now(),
|
||||
reason: "completed",
|
||||
});
|
||||
function archiveThread(uwf: UwfStore, threadId: ThreadId, _workflow: CasRef, _head: CasRef): void {
|
||||
completeThread(uwf.varStore, threadId, "completed");
|
||||
}
|
||||
|
||||
export async function cmdThreadResume(
|
||||
@@ -1067,17 +1064,24 @@ export async function cmdThreadResume(
|
||||
const chain = walkChain(uwf, headHash);
|
||||
const workflowHash = chain.start.workflow;
|
||||
|
||||
const status = await resolveActiveThreadStatus(
|
||||
storageRoot,
|
||||
threadId,
|
||||
uwf,
|
||||
headHash,
|
||||
workflowHash,
|
||||
);
|
||||
if (status !== "suspended") {
|
||||
fail(`thread is not suspended: ${threadId} (status: ${status})`);
|
||||
// Check entry.status first for completed/cancelled (like in cmdThreadShow)
|
||||
let status: ThreadStatus;
|
||||
if (entry.status === "completed" || entry.status === "cancelled") {
|
||||
status = entry.status;
|
||||
} else {
|
||||
status = await resolveActiveThreadStatus(storageRoot, threadId, uwf, headHash, workflowHash);
|
||||
}
|
||||
|
||||
if (status !== "suspended" && status !== "completed") {
|
||||
fail(`thread cannot be resumed: ${threadId} (status: ${status})`);
|
||||
}
|
||||
|
||||
const plog = createProcessLogger({
|
||||
storageRoot,
|
||||
context: { thread: threadId, workflow: workflowHash },
|
||||
});
|
||||
|
||||
if (status === "suspended") {
|
||||
const suspendFields = resolveSuspendFieldsForShow(entry, status, uwf, headHash, workflowHash);
|
||||
if (suspendFields.suspendedRole === null) {
|
||||
fail(`thread is suspended but suspendedRole is missing: ${threadId}`);
|
||||
@@ -1087,10 +1091,6 @@ export async function cmdThreadResume(
|
||||
}
|
||||
|
||||
const resumePrompt = buildResumePrompt(suspendFields.suspendMessage, supplement);
|
||||
const plog = createProcessLogger({
|
||||
storageRoot,
|
||||
context: { thread: threadId, workflow: workflowHash },
|
||||
});
|
||||
|
||||
plog.log(
|
||||
PL_THREAD_RESUME,
|
||||
@@ -1104,6 +1104,43 @@ export async function cmdThreadResume(
|
||||
});
|
||||
}
|
||||
|
||||
// status === "completed"
|
||||
const workflow = loadWorkflowPayload(uwf, workflowHash);
|
||||
const startResult = evaluate(workflow.graph, START_ROLE, { [STATUS_KEY]: "resume" });
|
||||
if (!startResult.ok) {
|
||||
fail(`failed to evaluate $START: ${startResult.error.message}`);
|
||||
}
|
||||
if (isSuspendResult(startResult.value)) {
|
||||
fail("workflow cannot start with $SUSPEND");
|
||||
}
|
||||
if (startResult.value.role === END_ROLE) {
|
||||
fail("workflow cannot start with $END");
|
||||
}
|
||||
|
||||
const startRole = startResult.value.role;
|
||||
const completedResumePrompt = buildResumePrompt(startResult.value.prompt, supplement);
|
||||
|
||||
const updatedEntry = { ...entry, status: "idle" as const, completedAt: null };
|
||||
setThread(uwf.varStore, threadId, updatedEntry);
|
||||
|
||||
plog.log(
|
||||
PL_THREAD_RESUME,
|
||||
`resume completed role=${startRole} supplement=${supplement !== null}`,
|
||||
null,
|
||||
);
|
||||
|
||||
return cmdThreadStepOnce(storageRoot, threadId, agentOverride, plog, {
|
||||
role: startRole,
|
||||
prompt: completedResumePrompt,
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Validate the `--count` option.
 *
 * @param count - Value to check.
 * @throws Error when `count` is not an integer or is less than 1.
 */
export function validateCount(count: number): void {
  const isPositiveInteger = Number.isInteger(count) && count >= 1;
  if (isPositiveInteger) {
    return;
  }
  throw new Error(`--count must be a positive integer, got: ${count}`);
}
|
||||
|
||||
export async function cmdThreadExec(
|
||||
storageRoot: string,
|
||||
threadId: ThreadId,
|
||||
@@ -1112,9 +1149,7 @@ export async function cmdThreadExec(
|
||||
background: boolean,
|
||||
backgroundWorker: boolean,
|
||||
): Promise<StepOutput[]> {
|
||||
if (count < 1 || !Number.isInteger(count)) {
|
||||
fail(`--count must be a positive integer, got: ${count}`);
|
||||
}
|
||||
validateCount(count);
|
||||
|
||||
// Check if thread is already running in background (unless we ARE the background worker)
|
||||
if (!backgroundWorker) {
|
||||
@@ -1249,7 +1284,7 @@ function resolveResumeStepTarget(
|
||||
}
|
||||
|
||||
async function resolveModeratorStepTarget(
|
||||
storageRoot: string,
|
||||
_storageRoot: string,
|
||||
threadId: ThreadId,
|
||||
entry: ThreadIndexEntry,
|
||||
headHash: CasRef,
|
||||
@@ -1318,7 +1353,7 @@ async function resolveModeratorStepTarget(
|
||||
}
|
||||
|
||||
async function finalizeAgentStep(
|
||||
storageRoot: string,
|
||||
_storageRoot: string,
|
||||
threadId: ThreadId,
|
||||
workflowHash: CasRef,
|
||||
workflow: WorkflowPayload,
|
||||
@@ -1450,10 +1485,6 @@ async function resolveHeadHash(storageRoot: string, threadId: ThreadId): Promise
|
||||
if (entry !== null) {
|
||||
return entry.head;
|
||||
}
|
||||
const hist = findHistoryEntry(uwf.varStore, threadId);
|
||||
if (hist !== null) {
|
||||
return hist.head;
|
||||
}
|
||||
fail(`thread not found: ${threadId}`);
|
||||
}
|
||||
|
||||
@@ -1533,7 +1564,6 @@ export async function cmdThreadCancel(
|
||||
if (entry === null) {
|
||||
fail(`thread not active: ${threadId}`);
|
||||
}
|
||||
const head = entry.head;
|
||||
|
||||
// Check if thread is running in background and terminate it
|
||||
const runningMarker = await isThreadRunning(storageRoot, threadId);
|
||||
@@ -1546,21 +1576,7 @@ export async function cmdThreadCancel(
|
||||
await deleteMarker(storageRoot, threadId);
|
||||
}
|
||||
|
||||
const workflow = resolveWorkflowFromHead(uwf, head);
|
||||
if (workflow === null) {
|
||||
fail(`failed to resolve workflow from head: ${head}`);
|
||||
}
|
||||
|
||||
deleteThread(uwf.varStore, threadId);
|
||||
|
||||
const historyEntry: ThreadHistoryLine = {
|
||||
thread: threadId,
|
||||
workflow,
|
||||
head,
|
||||
completedAt: Date.now(),
|
||||
reason: "cancelled",
|
||||
};
|
||||
addHistoryEntry(uwf.varStore, historyEntry);
|
||||
completeThread(uwf.varStore, threadId, "cancelled");
|
||||
|
||||
return { thread: threadId, cancelled: true };
|
||||
}
|
||||
|
||||
@@ -6,11 +6,11 @@ describe("Edge prompt template variable resolution", () => {
|
||||
test("returns error when rendered prompt is empty string", () => {
|
||||
const graph = {
|
||||
$START: {
|
||||
_: { role: "classifier", prompt: "{{{userPrompt}}}", location: null },
|
||||
new: { role: "classifier", prompt: "{{{userPrompt}}}", location: null },
|
||||
},
|
||||
};
|
||||
|
||||
const result = evaluate(graph, "$START", {});
|
||||
const result = evaluate(graph, "$START", { $status: "new" });
|
||||
|
||||
expect(result.ok).toBe(false);
|
||||
if (!result.ok) {
|
||||
@@ -22,11 +22,11 @@ describe("Edge prompt template variable resolution", () => {
|
||||
test("returns error when rendered prompt is whitespace-only", () => {
|
||||
const graph = {
|
||||
$START: {
|
||||
_: { role: "classifier", prompt: " {{{userPrompt}}} ", location: null },
|
||||
new: { role: "classifier", prompt: " {{{userPrompt}}} ", location: null },
|
||||
},
|
||||
};
|
||||
|
||||
const result = evaluate(graph, "$START", {});
|
||||
const result = evaluate(graph, "$START", { $status: "new" });
|
||||
|
||||
expect(result.ok).toBe(false);
|
||||
if (!result.ok) {
|
||||
@@ -38,11 +38,11 @@ describe("Edge prompt template variable resolution", () => {
|
||||
test("succeeds when all template variables resolve to non-empty values", () => {
|
||||
const graph = {
|
||||
$START: {
|
||||
_: { role: "classifier", prompt: "{{{userPrompt}}}", location: null },
|
||||
new: { role: "classifier", prompt: "{{{userPrompt}}}", location: null },
|
||||
},
|
||||
};
|
||||
|
||||
const result = evaluate(graph, "$START", { userPrompt: "Fix the bug" });
|
||||
const result = evaluate(graph, "$START", { $status: "new", userPrompt: "Fix the bug" });
|
||||
|
||||
expect(result.ok).toBe(true);
|
||||
if (result.ok) {
|
||||
@@ -53,11 +53,11 @@ describe("Edge prompt template variable resolution", () => {
|
||||
test("succeeds with static (no-variable) prompt", () => {
|
||||
const graph = {
|
||||
$START: {
|
||||
_: { role: "classifier", prompt: "Classify this input", location: null },
|
||||
new: { role: "classifier", prompt: "Classify this input", location: null },
|
||||
},
|
||||
};
|
||||
|
||||
const result = evaluate(graph, "$START", {});
|
||||
const result = evaluate(graph, "$START", { $status: "new" });
|
||||
|
||||
expect(result.ok).toBe(true);
|
||||
if (result.ok) {
|
||||
@@ -68,11 +68,11 @@ describe("Edge prompt template variable resolution", () => {
|
||||
test("succeeds when prompt has mix of static text and unresolved variables", () => {
|
||||
const graph = {
|
||||
$START: {
|
||||
_: { role: "classifier", prompt: "Please handle: {{{userPrompt}}}", location: null },
|
||||
new: { role: "classifier", prompt: "Please handle: {{{userPrompt}}}", location: null },
|
||||
},
|
||||
};
|
||||
|
||||
const result = evaluate(graph, "$START", {});
|
||||
const result = evaluate(graph, "$START", { $status: "new" });
|
||||
|
||||
expect(result.ok).toBe(true);
|
||||
if (result.ok) {
|
||||
@@ -83,11 +83,11 @@ describe("Edge prompt template variable resolution", () => {
|
||||
test("returns error when ALL variables missing and no static text remains", () => {
|
||||
const graph = {
|
||||
$START: {
|
||||
_: { role: "classifier", prompt: "{{{a}}}{{{b}}}", location: null },
|
||||
new: { role: "classifier", prompt: "{{{a}}}{{{b}}}", location: null },
|
||||
},
|
||||
};
|
||||
|
||||
const result = evaluate(graph, "$START", {});
|
||||
const result = evaluate(graph, "$START", { $status: "new" });
|
||||
|
||||
expect(result.ok).toBe(false);
|
||||
});
|
||||
|
||||
@@ -6,9 +6,7 @@ import type { EvaluateResult, Result } from "./types.js";
|
||||
// Disable HTML escaping — prompts are plain text, not HTML.
|
||||
mustache.escape = (text: string) => text;
|
||||
|
||||
const START_ROLE = "$START";
|
||||
const SUSPEND_ROLE = "$SUSPEND";
|
||||
const UNIT_STATUS = "_";
|
||||
|
||||
type LastOutput = Record<string, unknown>;
|
||||
|
||||
@@ -19,12 +17,15 @@ export function evaluate(
|
||||
lastRole: string,
|
||||
lastOutput: LastOutput,
|
||||
): Result<EvaluateResult, Error> {
|
||||
const status =
|
||||
lastRole === START_ROLE
|
||||
? UNIT_STATUS
|
||||
: typeof lastOutput[STATUS_KEY] === "string"
|
||||
? (lastOutput[STATUS_KEY] as string)
|
||||
: UNIT_STATUS;
|
||||
let status: string;
|
||||
if (typeof lastOutput[STATUS_KEY] === "string") {
|
||||
status = lastOutput[STATUS_KEY] as string;
|
||||
} else {
|
||||
return {
|
||||
ok: false,
|
||||
error: new Error(`agent output for role "${lastRole}" is missing required "$status" string`),
|
||||
};
|
||||
}
|
||||
|
||||
const roleTargets = graph[lastRole];
|
||||
if (roleTargets === undefined) {
|
||||
|
||||
+90
-52
@@ -6,13 +6,7 @@ import { join } from "node:path";
|
||||
|
||||
import { bootstrap, type Hash, type Store, type VarStore } from "@ocas/core";
|
||||
import { createFsStore, createSqliteVarStore } from "@ocas/fs";
|
||||
import type {
|
||||
CasRef,
|
||||
ThreadId,
|
||||
ThreadIndexEntry,
|
||||
ThreadListItem,
|
||||
ThreadsIndex,
|
||||
} from "@united-workforce/protocol";
|
||||
import type { CasRef, ThreadId, ThreadIndexEntry, ThreadsIndex } from "@united-workforce/protocol";
|
||||
import { parseThreadsIndex } from "@united-workforce/protocol";
|
||||
import { parse } from "yaml";
|
||||
|
||||
@@ -26,9 +20,6 @@ export const REGISTRY_VAR_PREFIX = "@uwf/registry/";
|
||||
/** Variable name prefix for active thread entries (`@uwf/thread/<thread-id>`). */
|
||||
export const THREAD_VAR_PREFIX = "@uwf/thread/";
|
||||
|
||||
/** Variable name prefix for completed/cancelled thread history (`@uwf/history/<thread-id>`). */
|
||||
export const HISTORY_VAR_PREFIX = "@uwf/history/";
|
||||
|
||||
/** A workflow entry discovered from the project-local .workflows/ directory. */
|
||||
export type ProjectWorkflowEntry = {
|
||||
/** Workflow name (from YAML `name` field, equals filename stem). */
|
||||
@@ -156,11 +147,6 @@ export function getThreadsPath(storageRoot: string): string {
|
||||
return join(storageRoot, "threads.yaml");
|
||||
}
|
||||
|
||||
export type ThreadHistoryLine = ThreadListItem & {
|
||||
completedAt: number;
|
||||
reason: "completed" | "cancelled" | null;
|
||||
};
|
||||
|
||||
export type UwfStore = {
|
||||
storageRoot: string;
|
||||
store: Store;
|
||||
@@ -179,6 +165,7 @@ export async function createUwfStore(storageRoot: string): Promise<UwfStore> {
|
||||
await migrateWorkflowRegistryIfNeeded(storageRoot, varStore);
|
||||
await migrateThreadsIndexIfNeeded(storageRoot, varStore);
|
||||
await migrateHistoryIfNeeded(storageRoot, varStore);
|
||||
migrateHistoryVarsToThreadVars(varStore);
|
||||
return { storageRoot, store, schemas, varStore };
|
||||
}
|
||||
|
||||
@@ -299,8 +286,10 @@ function threadVarName(threadId: ThreadId): string {
|
||||
/**
 * Rebuild a thread index entry from a stored variable.
 *
 * The variable's value becomes the entry's `head`; every other field is read from the tags.
 * A missing `status` tag yields "idle"; missing `suspendedRole`, `suspendMessage` or
 * `completedAt` tags yield `null`. A present `completedAt` tag is converted with `Number(...)`.
 */
function entryFromVariable(v: { value: string; tags: Record<string, string> }): ThreadIndexEntry {
  const { status, suspendedRole, suspendMessage, completedAt } = v.tags;
  const completedAtValue = completedAt === undefined ? null : Number(completedAt);
  return {
    head: v.value as CasRef,
    status: (status ?? "idle") as ThreadIndexEntry["status"],
    suspendedRole: suspendedRole ?? null,
    suspendMessage: suspendMessage ?? null,
    completedAt: completedAtValue,
  };
}
|
||||
|
||||
@@ -331,21 +320,74 @@ export function setThread(varStore: VarStore, threadId: ThreadId, entry: ThreadI
|
||||
// Head CAS nodes may use different schemas (StartNode vs StepNode) — clear all variants first.
|
||||
varStore.remove(name);
|
||||
const tags: Record<string, string> = {};
|
||||
if (entry.status !== "idle") {
|
||||
tags.status = entry.status;
|
||||
}
|
||||
if (entry.suspendedRole !== null) {
|
||||
tags.suspendedRole = entry.suspendedRole;
|
||||
}
|
||||
if (entry.suspendMessage !== null) {
|
||||
tags.suspendMessage = entry.suspendMessage;
|
||||
}
|
||||
if (entry.completedAt !== null) {
|
||||
tags.completedAt = String(entry.completedAt);
|
||||
}
|
||||
varStore.set(name, entry.head, { tags });
|
||||
}
|
||||
|
||||
/** Remove an active thread entry (on complete/cancel). */
|
||||
export function deleteThread(varStore: VarStore, threadId: ThreadId): void {
|
||||
varStore.remove(threadVarName(threadId));
|
||||
/**
 * Load the threads that are still active.
 *
 * Reads every thread via `loadAllThreads` and keeps those whose status is neither
 * "completed" nor "cancelled".
 */
export function loadActiveThreads(varStore: VarStore): ThreadsIndex {
  const result: ThreadsIndex = {};
  for (const [id, candidate] of Object.entries(loadAllThreads(varStore))) {
    const isFinished = candidate.status === "completed" || candidate.status === "cancelled";
    if (isFinished) {
      continue;
    }
    result[id as ThreadId] = candidate;
  }
  return result;
}
|
||||
|
||||
function parseHistoryJsonlLine(trimmed: string): ThreadHistoryLine | null {
|
||||
/**
 * Load the threads that make up the history.
 *
 * Reads every thread via `loadAllThreads` and keeps those whose status is
 * "completed" or "cancelled".
 */
export function loadHistoryThreads(varStore: VarStore): ThreadsIndex {
  const result: ThreadsIndex = {};
  for (const [id, candidate] of Object.entries(loadAllThreads(varStore))) {
    const isActive = candidate.status !== "completed" && candidate.status !== "cancelled";
    if (isActive) {
      continue;
    }
    result[id as ThreadId] = candidate;
  }
  return result;
}
|
||||
|
||||
/**
 * Mark a thread as finished, either "completed" or "cancelled".
 *
 * Does nothing when `getThread` finds no entry for `threadId`. Otherwise the entry is rewritten
 * through `setThread` with the same head, `reason` as its status, both suspension fields
 * cleared, and `completedAt` set to the current `Date.now()` value.
 */
export function completeThread(
  varStore: VarStore,
  threadId: ThreadId,
  reason: "completed" | "cancelled",
): void {
  const current = getThread(varStore, threadId);
  if (current !== null) {
    const finished = {
      head: current.head,
      status: reason,
      suspendedRole: null,
      suspendMessage: null,
      completedAt: Date.now(),
    } as ThreadIndexEntry;
    setThread(varStore, threadId, finished);
  }
}
|
||||
|
||||
/** One record of the legacy `history.jsonl` file, as produced by `parseLegacyHistoryJsonlLine`. */
type LegacyHistoryEntry = {
  /** Id of the thread the record belongs to. */
  thread: ThreadId;
  /** Workflow reference recorded with the thread. */
  workflow: CasRef;
  /** Head reference of the thread at the time it was recorded. */
  head: CasRef;
  /** Completion timestamp carried over from the legacy record. */
  completedAt: number;
  /** Why the thread ended; `null` when the legacy record did not say. */
  reason: "completed" | "cancelled" | null;
};
|
||||
|
||||
function parseLegacyHistoryJsonlLine(trimmed: string): LegacyHistoryEntry | null {
|
||||
let raw: unknown;
|
||||
try {
|
||||
raw = JSON.parse(trimmed) as unknown;
|
||||
@@ -379,7 +421,7 @@ function parseHistoryJsonlLine(trimmed: string): ThreadHistoryLine | null {
|
||||
return null;
|
||||
}
|
||||
|
||||
/** One-time migration: `~/.uwf/history.jsonl` → `@uwf/history/*` variables. */
|
||||
/** One-time migration: `~/.uwf/history.jsonl` → `@uwf/thread/*` variables with status tags. */
|
||||
export async function migrateHistoryIfNeeded(
|
||||
storageRoot: string,
|
||||
varStore: VarStore,
|
||||
@@ -395,47 +437,43 @@ export async function migrateHistoryIfNeeded(
|
||||
if (trimmed === "") {
|
||||
continue;
|
||||
}
|
||||
const entry = parseHistoryJsonlLine(trimmed);
|
||||
const entry = parseLegacyHistoryJsonlLine(trimmed);
|
||||
if (entry !== null) {
|
||||
addHistoryEntry(varStore, entry);
|
||||
const status = entry.reason === "cancelled" ? "cancelled" : "completed";
|
||||
const threadEntry: ThreadIndexEntry = {
|
||||
head: entry.head,
|
||||
status: status as ThreadIndexEntry["status"],
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt: entry.completedAt,
|
||||
};
|
||||
setThread(varStore, entry.thread, threadEntry);
|
||||
}
|
||||
}
|
||||
|
||||
await rename(path, `${path}.migrated`);
|
||||
}
|
||||
|
||||
export function loadAllHistory(varStore: VarStore): ThreadHistoryLine[] {
|
||||
const vars = varStore.list({ namePrefix: HISTORY_VAR_PREFIX });
|
||||
return vars.map((v) => ({
|
||||
thread: v.name.slice(HISTORY_VAR_PREFIX.length) as ThreadId,
|
||||
workflow: v.tags.workflow ?? "",
|
||||
head: v.value as CasRef,
|
||||
completedAt: Number(v.tags.completedAt ?? "0"),
|
||||
reason: v.tags.reason === "completed" || v.tags.reason === "cancelled" ? v.tags.reason : null,
|
||||
}));
|
||||
}
|
||||
/** Migrate `@uwf/history/*` variables to `@uwf/thread/*` with status tags. */
|
||||
export function migrateHistoryVarsToThreadVars(varStore: VarStore): void {
|
||||
const LEGACY_HISTORY_VAR_PREFIX = "@uwf/history/";
|
||||
const vars = varStore.list({ namePrefix: LEGACY_HISTORY_VAR_PREFIX });
|
||||
|
||||
export function findHistoryEntry(varStore: VarStore, threadId: ThreadId): ThreadHistoryLine | null {
|
||||
const vars = varStore.list({ namePrefix: `${HISTORY_VAR_PREFIX}${threadId}` });
|
||||
const v = vars.find((entry) => entry.name === `${HISTORY_VAR_PREFIX}${threadId}`);
|
||||
if (v === undefined) {
|
||||
return null;
|
||||
}
|
||||
return {
|
||||
thread: threadId,
|
||||
workflow: v.tags.workflow ?? "",
|
||||
for (const v of vars) {
|
||||
const threadId = v.name.slice(LEGACY_HISTORY_VAR_PREFIX.length) as ThreadId;
|
||||
const reason = v.tags.reason;
|
||||
const status = reason === "cancelled" ? "cancelled" : "completed";
|
||||
const completedAt = Number(v.tags.completedAt ?? Date.now());
|
||||
|
||||
const threadEntry: ThreadIndexEntry = {
|
||||
head: v.value as CasRef,
|
||||
completedAt: Number(v.tags.completedAt ?? "0"),
|
||||
reason: v.tags.reason === "completed" || v.tags.reason === "cancelled" ? v.tags.reason : null,
|
||||
status: status as ThreadIndexEntry["status"],
|
||||
suspendedRole: null,
|
||||
suspendMessage: null,
|
||||
completedAt,
|
||||
};
|
||||
}
|
||||
|
||||
export function addHistoryEntry(varStore: VarStore, entry: ThreadHistoryLine): void {
|
||||
varStore.set(`${HISTORY_VAR_PREFIX}${entry.thread}`, entry.head, {
|
||||
tags: {
|
||||
workflow: entry.workflow,
|
||||
completedAt: String(entry.completedAt),
|
||||
reason: entry.reason ?? "completed",
|
||||
},
|
||||
});
|
||||
setThread(varStore, threadId, threadEntry);
|
||||
varStore.remove(v.name);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -24,26 +24,22 @@ function isOneOfSchema(fm: unknown): fm is SchemaObj & { oneOf: SchemaObj[] } {
|
||||
return Array.isArray(obj.oneOf);
|
||||
}
|
||||
|
||||
/** Check if a frontmatter schema uses enum-based multi-exit ($status with multiple enum values). */
|
||||
function isEnumMultiExit(fm: unknown): boolean {
|
||||
/** Check if a frontmatter schema declares "$status" as const (flat schema form). */
|
||||
function hasStatusConst(fm: unknown): boolean {
|
||||
if (typeof fm !== "object" || fm === null) return false;
|
||||
const obj = fm as SchemaObj;
|
||||
const props = obj.properties as Record<string, SchemaObj> | undefined;
|
||||
if (!props?.$status) return false;
|
||||
const statusDef = props.$status;
|
||||
if (!Array.isArray(statusDef.enum)) return false;
|
||||
// Filter out "_" (wildcard) — if remaining values > 1, it's multi-exit
|
||||
const statuses = (statusDef.enum as string[]).filter((s) => s !== "_");
|
||||
return statuses.length > 1;
|
||||
return typeof props.$status.const === "string";
|
||||
}
|
||||
|
||||
/** Extract status values from an enum-based $status field. */
|
||||
function getEnumStatuses(fm: SchemaObj): string[] {
|
||||
/** Extract status values from a const-based $status field. */
|
||||
function getConstStatuses(fm: SchemaObj): string[] {
|
||||
const props = fm.properties as Record<string, SchemaObj> | undefined;
|
||||
if (!props?.$status) return [];
|
||||
const statusDef = props.$status;
|
||||
if (!Array.isArray(statusDef.enum)) return [];
|
||||
return (statusDef.enum as string[]).filter((s) => s !== "_");
|
||||
if (typeof statusDef.const === "string") return [statusDef.const];
|
||||
return [];
|
||||
}
|
||||
|
||||
/** Get property names from a schema object. */
|
||||
@@ -101,9 +97,9 @@ function checkGraphStructure(payload: WorkflowPayload, errors: string[]): void {
|
||||
if (!graphNodes.has("$START")) {
|
||||
errors.push("$START must be defined in graph");
|
||||
} else {
|
||||
const startKeys = Object.keys(payload.graph.$START);
|
||||
if (startKeys.length !== 1 || startKeys[0] !== "_") {
|
||||
errors.push('$START must have exactly one edge with status "_"');
|
||||
const startKeys = new Set(Object.keys(payload.graph.$START));
|
||||
if (!startKeys.has("new") || !startKeys.has("resume")) {
|
||||
errors.push('$START must have edges with statuses "new" and "resume"');
|
||||
}
|
||||
}
|
||||
|
||||
@@ -194,18 +190,13 @@ function checkOneOfDiscriminant(
|
||||
}
|
||||
}
|
||||
|
||||
/** Check status-edge consistency for a multi-exit role. */
|
||||
function checkMultiExitEdges(
|
||||
/** Check status-edge consistency for a user role. */
|
||||
function checkStatusEdges(
|
||||
roleName: string,
|
||||
graphKeys: Set<string>,
|
||||
statusSet: Set<string>,
|
||||
errors: string[],
|
||||
): void {
|
||||
if (graphKeys.has("_")) {
|
||||
errors.push(`role "${roleName}" is multi-exit but graph uses "_"`);
|
||||
return;
|
||||
}
|
||||
|
||||
const extraKeys = [...graphKeys].filter((k) => !statusSet.has(k));
|
||||
const missingKeys = [...statusSet].filter((k) => !graphKeys.has(k));
|
||||
if (extraKeys.length > 0) {
|
||||
@@ -255,50 +246,23 @@ function checkRoleConsistency(payload: WorkflowPayload, errors: string[]): void
|
||||
const statuses = getOneOfStatuses(variants);
|
||||
|
||||
checkOneOfDiscriminant(roleName, variants, statuses, errors);
|
||||
checkMultiExitEdges(roleName, graphKeys, new Set(statuses), errors);
|
||||
checkStatusEdges(roleName, graphKeys, new Set(statuses), errors);
|
||||
checkMultiExitMustache(roleName, graphEntry, variants, errors);
|
||||
} else if (isEnumMultiExit(fm)) {
|
||||
const statuses = getEnumStatuses(fm as SchemaObj);
|
||||
checkMultiExitEdges(roleName, graphKeys, new Set(statuses), errors);
|
||||
// For enum-based schemas, mustache vars come from the flat properties
|
||||
checkSingleExitMustache(roleName, graphEntry, fm as SchemaObj, errors);
|
||||
} else if (hasStatusConst(fm)) {
|
||||
const statuses = getConstStatuses(fm as SchemaObj);
|
||||
checkStatusEdges(roleName, graphKeys, new Set(statuses), errors);
|
||||
// For const-based flat schemas, mustache vars come from the flat properties
|
||||
checkFlatMustache(roleName, graphEntry, fm as SchemaObj, errors);
|
||||
} else {
|
||||
checkSingleExitRole(roleName, graphKeys, graphEntry, fm as SchemaObj | null, errors);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Check single-exit role status and mustache. */
|
||||
function checkSingleExitRole(
|
||||
roleName: string,
|
||||
graphKeys: Set<string>,
|
||||
graphEntry: Record<string, { role: string; prompt: string }>,
|
||||
fm: SchemaObj | null,
|
||||
errors: string[],
|
||||
): void {
|
||||
if (graphKeys.size > 1 || (graphKeys.size === 1 && !graphKeys.has("_"))) {
|
||||
if (!graphKeys.has("_")) {
|
||||
errors.push(`role "${roleName}" is single-exit but graph has no "_" key`);
|
||||
} else {
|
||||
errors.push(`role "${roleName}" is single-exit but has status keys other than "_"`);
|
||||
}
|
||||
}
|
||||
|
||||
const singleTarget = graphEntry._;
|
||||
if (!singleTarget) return;
|
||||
|
||||
const vars = extractMustacheVars(singleTarget.prompt);
|
||||
const propNames = fm ? getPropertyNames(fm) : new Set<string>();
|
||||
for (const v of vars) {
|
||||
if (v === "$status") continue;
|
||||
if (!propNames.has(v)) {
|
||||
errors.push(`prompt variable "${v}" not found in role "${roleName}" frontmatter`);
|
||||
errors.push(
|
||||
`role "${roleName}" must define "$status" as const (or oneOf with const) in frontmatter`,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Check mustache vars in all edge prompts against flat schema properties. */
|
||||
function checkSingleExitMustache(
|
||||
function checkFlatMustache(
|
||||
roleName: string,
|
||||
graphEntry: Record<string, { role: string; prompt: string }>,
|
||||
fm: SchemaObj,
|
||||
|
||||
@@ -57,9 +57,18 @@ function isGraph(value: unknown): boolean {
|
||||
if (!isRecord(value)) {
|
||||
return false;
|
||||
}
|
||||
return Object.values(value).every(
|
||||
(statusMap) => isRecord(statusMap) && Object.values(statusMap).every((t) => isTarget(t)),
|
||||
);
|
||||
return Object.values(value).every((statusMap) => {
|
||||
if (!isRecord(statusMap)) {
|
||||
return false;
|
||||
}
|
||||
return Object.entries(statusMap).every(([status, target]) => {
|
||||
// "_" is no longer a valid status key anywhere — $START uses "new"/"resume".
|
||||
if (status === "_") {
|
||||
return false;
|
||||
}
|
||||
return isTarget(target);
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -90,12 +99,13 @@ export function checkWorkflowFilenameConsistency(
|
||||
): string | null {
|
||||
const expected = workflowNameFromPath(filePath);
|
||||
if (payload.name !== expected) {
|
||||
return `workflow name mismatch: file "${basename(filePath)}" implies name "${expected}" but YAML declares name "${payload.name}"`;
|
||||
return `workflow name mismatch: file "${basename(filePath)}" implies name "${expected}" but YAML declares name "${payload.name}". Either rename the file to "${payload.name}.yaml" or change the YAML \`name\` field to "${expected}"`;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
/** Validate YAML-parsed workflow document shape (outputSchema may be inline JSON Schema). */
|
||||
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: validation function with many field checks
|
||||
export function parseWorkflowPayload(raw: unknown): WorkflowPayload | null {
|
||||
if (!isRecord(raw)) {
|
||||
return null;
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "@united-workforce/dashboard",
|
||||
"version": "0.5.0-alpha.4",
|
||||
"version": "0.1.0",
|
||||
"private": true,
|
||||
"type": "module",
|
||||
"scripts": {
|
||||
|
||||
@@ -0,0 +1,9 @@
|
||||
# @united-workforce/eval
|
||||
|
||||
## 0.1.2
|
||||
|
||||
### Patch Changes
|
||||
|
||||
- 850a3b2: fix: resolve --agent override via config alias before raw command
|
||||
|
||||
`resolveAgentConfig()` now checks `config.agents[alias]` first before falling back to `parseAgentOverride()`. Eval CLI default `--agent` changed from `"hermes"` to `"uwf-hermes"`.
|
||||
@@ -0,0 +1,219 @@
|
||||
import type { StepEntry } from "@united-workforce/protocol";
|
||||
import { beforeEach, describe, expect, test, vi } from "vitest";
|
||||
|
||||
import {
|
||||
runFrontmatterJudge,
|
||||
runHallucinationJudge,
|
||||
runTokenStatsJudge,
|
||||
runUpstreamJudge,
|
||||
} from "../src/judge/builtin/index.js";
|
||||
|
||||
// Mock the shared read-steps helper so the judges never shell out to `uwf`.
|
||||
vi.mock("../src/judge/builtin/read-steps.js", () => ({
|
||||
readThreadSteps: vi.fn(),
|
||||
}));
|
||||
|
||||
import { readThreadSteps } from "../src/judge/builtin/read-steps.js";
|
||||
|
||||
const mockedReadSteps = vi.mocked(readThreadSteps);
|
||||
|
||||
/**
 * Build a `StepEntry` fixture for the judge tests.
 *
 * Starts from a baseline step (role "worker", frontmatter output carrying `$status: done`,
 * `usage: null`) and spreads `overrides` on top, so any key present in `overrides` wins.
 */
function makeStep(overrides: Partial<StepEntry>): StepEntry {
  const baseline: StepEntry = {
    hash: "HASH000000000",
    role: "worker",
    output: "---\n$status: done\n---\n\nbody",
    detail: "DETAIL0000000",
    agent: "hermes",
    timestamp: 0,
    durationMs: 0,
    usage: null,
  };
  return { ...baseline, ...overrides };
}
|
||||
|
||||
beforeEach(() => {
|
||||
mockedReadSteps.mockReset();
|
||||
});
|
||||
|
||||
describe("frontmatter-compliance judge", () => {
  /** Shape of `result.data` that these tests assert against. */
  type ComplianceData = {
    stepsTotal: number;
    stepsValid: number;
    invalidSteps: Array<{ stepIndex: number; role: string; errors: string[] }>;
  };

  /** Feed `steps` to the mocked step reader, run the judge, and return its score with typed data. */
  async function judgeSteps(
    threadId: Parameters<typeof runFrontmatterJudge>[0],
    steps: StepEntry[],
  ) {
    mockedReadSteps.mockReturnValue(steps);
    const { score, data } = await runFrontmatterJudge(threadId);
    return { score, data: data as ComplianceData };
  }

  test("all steps have valid frontmatter → score 1.0", async () => {
    const { score, data } = await judgeSteps("T1", [
      makeStep({ role: "a", output: "---\n$status: done\n---\n\nwork" }),
      makeStep({ role: "b", output: "---\n$status: needs_input\n---\nmore" }),
    ]);

    expect(score).toBe(1.0);
    expect(data.stepsTotal).toBe(2);
    expect(data.stepsValid).toBe(2);
    expect(data.invalidSteps).toHaveLength(0);
  });

  test("some steps missing $status → partial score", async () => {
    const { score, data } = await judgeSteps("T2", [
      makeStep({ role: "a", output: "---\n$status: done\n---\nok" }),
      makeStep({ role: "b", output: "---\nfoo: bar\n---\nmissing status" }),
      makeStep({ role: "c", output: "no frontmatter at all" }),
    ]);

    // Only the first of the three steps carries a $status.
    expect(score).toBeCloseTo(1 / 3, 10);
    expect(data.stepsTotal).toBe(3);
    expect(data.stepsValid).toBe(1);
    expect(data.invalidSteps).toHaveLength(2);
    expect(data.invalidSteps[0]).toMatchObject({ stepIndex: 1, role: "b" });
    expect(data.invalidSteps[1]).toMatchObject({ stepIndex: 2, role: "c" });
  });

  test("no steps → score 0 (0/0 edge case)", async () => {
    const { score, data } = await judgeSteps("T3", []);

    expect(score).toBe(0);
    expect(data.stepsTotal).toBe(0);
    expect(data.stepsValid).toBe(0);
    expect(data.invalidSteps).toHaveLength(0);
  });

  test("empty-string $status counts as invalid", async () => {
    const { score } = await judgeSteps("T4", [
      makeStep({ role: "a", output: '---\n$status: ""\n---\nx' }),
    ]);

    expect(score).toBe(0);
  });

  test("parsed object output with $status → score 1.0", async () => {
    // The outputs are already-parsed objects, cast so they fit the fixture's `output` field.
    const { score, data } = await judgeSteps("T5", [
      makeStep({ role: "a", output: { $status: "done", summary: "fixed" } as unknown as string }),
      makeStep({ role: "b", output: { $status: "reviewed" } as unknown as string }),
    ]);

    expect(score).toBe(1.0);
    expect(data.stepsTotal).toBe(2);
    expect(data.stepsValid).toBe(2);
  });

  test("parsed object output missing $status → score 0", async () => {
    const { score } = await judgeSteps("T6", [
      makeStep({ role: "a", output: { summary: "no status field" } as unknown as string }),
    ]);

    expect(score).toBe(0);
  });
});
|
||||
|
||||
describe("token-stats judge", () => {
  /** One per-step row of `result.data.perStep` as asserted below. */
  type StepUsageRow = {
    role: string;
    inputTokens: number;
    outputTokens: number;
    turns: number;
    duration: number;
  };

  /** Shape of `result.data` that these tests assert against. */
  type TokenStatsData = {
    totalInput: number;
    totalOutput: number;
    totalTurns: number;
    perStep: StepUsageRow[];
  };

  /** Feed `steps` to the mocked step reader, run the judge, and return its score with typed data. */
  async function judgeSteps(
    threadId: Parameters<typeof runTokenStatsJudge>[0],
    steps: StepEntry[],
  ) {
    mockedReadSteps.mockReturnValue(steps);
    const { score, data } = await runTokenStatsJudge(threadId);
    return { score, data: data as TokenStatsData };
  }

  test("steps with usage → sums correctly", async () => {
    const { score, data } = await judgeSteps("T1", [
      makeStep({
        role: "a",
        usage: { turns: 2, inputTokens: 100, outputTokens: 50, duration: 1.5 },
      }),
      makeStep({
        role: "b",
        usage: { turns: 3, inputTokens: 200, outputTokens: 75, duration: 2.0 },
      }),
    ]);

    expect(score).toBe(1.0);
    expect(data.totalInput).toBe(300);
    expect(data.totalOutput).toBe(125);
    expect(data.totalTurns).toBe(5);
    expect(data.perStep).toHaveLength(2);
    expect(data.perStep[0]).toEqual({
      role: "a",
      inputTokens: 100,
      outputTokens: 50,
      turns: 2,
      duration: 1.5,
    });
  });

  test("steps with null usage → zeros", async () => {
    const { score, data } = await judgeSteps("T2", [
      makeStep({ role: "a", usage: null }),
      makeStep({ role: "b", usage: null }),
    ]);

    expect(score).toBe(1.0);
    expect(data.totalInput).toBe(0);
    expect(data.totalOutput).toBe(0);
    expect(data.totalTurns).toBe(0);
    expect(data.perStep[0]).toEqual({
      role: "a",
      inputTokens: 0,
      outputTokens: 0,
      turns: 0,
      duration: 0,
    });
  });

  test("empty steps → all zeros, score 1.0", async () => {
    const { score, data } = await judgeSteps("T3", []);

    expect(score).toBe(1.0);
    expect(data.totalInput).toBe(0);
    expect(data.totalOutput).toBe(0);
    expect(data.totalTurns).toBe(0);
    expect(data.perStep).toHaveLength(0);
  });
});
|
||||
|
||||
describe("LLM-as-judge stubs", () => {
  test("upstream-consumption returns a stub", async () => {
    const { score, data, schema } = await runUpstreamJudge("T1");

    // The stub reports a zero score, no per-step data, and its own schema title.
    expect(score).toBe(0);
    expect(data).toEqual({ perStep: [] });
    expect(schema.title).toBe("@uwf/eval-judge-upstream");
  });

  test("hallucination returns a stub", async () => {
    const { score, data, schema } = await runHallucinationJudge("T1");

    expect(score).toBe(0);
    expect(data).toEqual({ perStep: [] });
    expect(schema.title).toBe("@uwf/eval-judge-hallucination");
  });
});
|
||||
@@ -0,0 +1,152 @@
|
||||
import { bootstrap, createMemoryStore } from "@ocas/core";
|
||||
import { describe, expect, test } from "vitest";
|
||||
import type { JudgeRunner } from "../src/runner/index.js";
|
||||
import { collect, computeOverall } from "../src/runner/index.js";
|
||||
import type { EvalRunConfig, EvalStore } from "../src/storage/index.js";
|
||||
import type { JudgeEntry, TaskManifest } from "../src/task/index.js";
|
||||
|
||||
/**
 * Build a `JudgeEntry` fixture.
 *
 * Builtin judges get `entry: null`; all others point at `dist/judges/<name>.js`.
 * `schema` is always `null`.
 */
function makeJudge(name: string, weight: number, builtin: boolean): JudgeEntry {
  let entry: string | null = null;
  if (!builtin) {
    entry = `dist/judges/${name}.js`;
  }
  return { name, weight, builtin, entry, schema: null };
}
|
||||
|
||||
/** Build the "fix-off-by-one" `TaskManifest` fixture around the given judges. */
function makeManifest(judges: JudgeEntry[]): TaskManifest {
  const limits = { maxSteps: 10, timeoutMinutes: 30 };
  const manifest: TaskManifest = {
    name: "fix-off-by-one",
    description: "test task",
    workflow: "solve-issue",
    prompt: "Fix the bug",
    limits,
    judges,
  };
  return manifest;
}
|
||||
|
||||
/** Create a bootstrapped in-memory store and pair it with its variable store. */
function makeEvalStore(): EvalStore {
  const memoryStore = createMemoryStore();
  bootstrap(memoryStore);
  const { var: varStore } = memoryStore;
  return { store: memoryStore, varStore };
}
|
||||
|
||||
const CONFIG: EvalRunConfig = {
|
||||
agent: "hermes",
|
||||
model: "claude-sonnet-4",
|
||||
engineVersion: "test",
|
||||
};
|
||||
|
||||
/**
 * Build a judge runner that answers from a lookup table.
 *
 * The runner ignores the task dir, work dir and thread id. It resolves to the score listed for
 * the judge's name (0 when the name is not listed), with `data` naming the judge and a bare
 * object schema.
 */
function scriptedRunner(scores: Record<string, number>): JudgeRunner {
  const runner: JudgeRunner = async (_taskDir, _workDir, _threadId, judge) => {
    const { name } = judge;
    return {
      score: scores[name] ?? 0,
      data: { judged: name },
      schema: { type: "object" },
    };
  };
  return runner;
}
|
||||
|
||||
describe("computeOverall", () => {
  test("computes the weighted average correctly", () => {
    const scored = [
      { score: 0.8, weight: 0.3 },
      { score: 0.6, weight: 0.3 },
      { score: 1.0, weight: 0.4 },
    ];

    // 0.8 * 0.3 + 0.6 * 0.3 + 1.0 * 0.4 = 0.24 + 0.18 + 0.4 = 0.82
    expect(computeOverall(scored)).toBeCloseTo(0.82, 10);
  });

  test("a weight-0 judge does not affect the result", () => {
    const counted = { score: 1.0, weight: 1.0 };
    const informational = { score: 0.0, weight: 0.0 };

    expect(computeOverall([counted, informational])).toBe(1.0);
  });

  test("returns 0 when total weight is 0", () => {
    const onlyInformational = [{ score: 0.5, weight: 0 }];

    expect(computeOverall(onlyInformational)).toBe(0);
  });
});
|
||||
|
||||
describe("collect", () => {
  /** Assemble the `collect` input shared by every test in this suite. */
  function collectInput(evalStore: EvalStore, manifest: TaskManifest): Parameters<typeof collect>[0] {
    return {
      evalStore,
      taskDir: "/tmp/task",
      workDir: "/tmp/work",
      threadId: "THREAD123",
      manifest,
      config: CONFIG,
    };
  }

  test("computes weighted score correctly across judges", async () => {
    const evalStore = makeEvalStore();
    const manifest = makeManifest([
      makeJudge("test-pass", 0.6, false),
      makeJudge("code-quality", 0.4, false),
    ]);
    const runJudge = scriptedRunner({ "test-pass": 1.0, "code-quality": 0.5 });

    const result = await collect(collectInput(evalStore, manifest), runJudge);

    // 1.0 * 0.6 + 0.5 * 0.4 = 0.8
    expect(result.overall).toBeCloseTo(0.8, 10);
    expect(result.runHash).toBeTruthy();
    expect(result.judges).toHaveLength(2);
    expect(result.judges[0]).toEqual({ name: "test-pass", score: 1.0, weight: 0.6 });

    // The run must also be indexed as the task's latest eval.
    const latest = evalStore.varStore.list({
      exactName: "@uwf/eval/fix-off-by-one/latest",
    });
    expect(latest[0]?.value).toBe(result.runHash);
  });

  test("handles a judge with weight 0 (informational)", async () => {
    const evalStore = makeEvalStore();
    const manifest = makeManifest([
      makeJudge("test-pass", 1.0, false),
      makeJudge("token-stats", 0, true),
    ]);
    // token-stats is builtin → default runner would score 0; give scripted score
    // that would skew the result if it were counted.
    const runJudge = scriptedRunner({ "test-pass": 0.5, "token-stats": 1.0 });

    const result = await collect(collectInput(evalStore, manifest), runJudge);

    // Only test-pass (weight 1.0) counts → overall = 0.5
    expect(result.overall).toBeCloseTo(0.5, 10);
    expect(result.judges).toHaveLength(2);
    const tokenStats = result.judges.find((j) => j.name === "token-stats");
    expect(tokenStats?.weight).toBe(0);
  });

  test("unknown builtin judge name throws via the default runner", async () => {
    const evalStore = makeEvalStore();
    const manifest = makeManifest([makeJudge("not-a-real-judge", 1.0, true)]);

    // Use the default runner (no injected runner) → builtin dispatch → unknown name throws.
    const pending = collect(collectInput(evalStore, manifest));
    await expect(pending).rejects.toThrow(/unknown builtin judge/);
  });
});
|
||||
@@ -0,0 +1,171 @@
|
||||
import { bootstrap, createMemoryStore, putSchema } from "@ocas/core";
|
||||
import type { CasRef } from "@united-workforce/protocol";
|
||||
import { describe, expect, test } from "vitest";
|
||||
|
||||
import {
|
||||
formatDiff,
|
||||
formatList,
|
||||
formatReport,
|
||||
readEvalEntries,
|
||||
readEvalRun,
|
||||
selectEntries,
|
||||
} from "../src/commands/index.js";
|
||||
import type { EvalRunPayload, EvalStore } from "../src/storage/index.js";
|
||||
import { EVAL_RUN_SCHEMA, setEvalLatest } from "../src/storage/index.js";
|
||||
|
||||
/** Create a bootstrapped in-memory store and pair it with its variable store. */
function makeEvalStore(): EvalStore {
  const memoryStore = createMemoryStore();
  bootstrap(memoryStore);
  const { var: varStore } = memoryStore;
  return { store: memoryStore, varStore };
}
|
||||
|
||||
/**
 * Build an `EvalRunPayload` fixture.
 *
 * `task`, `overall` and `timestamp` are required. `judges` defaults to a weighted
 * "frontmatter-compliance" judge plus a weight-0 "token-stats" judge, and `config` defaults to
 * the hermes / claude-sonnet-4 / 1.0.0 combination. The thread id is fixed.
 */
function makePayload(
  task: string,
  overall: number,
  timestamp: number,
  judges: EvalRunPayload["judges"] = [
    {
      name: "frontmatter-compliance",
      score: 1.0,
      weight: 0.6,
      dataHash: "AAAAAAAAAAAAA" as CasRef,
    },
    { name: "token-stats", score: 0.5, weight: 0, dataHash: "BBBBBBBBBBBBB" as CasRef },
  ],
  config: EvalRunPayload["config"] = {
    agent: "hermes",
    model: "claude-sonnet-4",
    engineVersion: "1.0.0",
  },
): EvalRunPayload {
  const threadId = "THREAD0123456789";
  // Key order is kept as-is because the payload is stored in CAS by `storeRun`.
  const payload: EvalRunPayload = { task, config, threadId, judges, overall, timestamp };
  return payload;
}
|
||||
|
||||
/**
 * Persist an eval run and mark it as the latest for its task.
 *
 * Registers `EVAL_RUN_SCHEMA`, puts `payload` into CAS under that schema, passes the resulting
 * hash to `setEvalLatest` for `payload.task`, and returns the hash.
 */
function storeRun(evalStore: EvalStore, payload: EvalRunPayload): string {
  const { store, varStore } = evalStore;
  const runHash = store.cas.put(putSchema(store, EVAL_RUN_SCHEMA), payload);
  setEvalLatest(varStore, payload.task, runHash);
  return runHash;
}
|
||||
|
||||
describe("formatReport", () => {
  test("includes task, overall, config and judges", () => {
    const payload = makePayload("fix-off-by-one", 0.8, Date.UTC(2026, 0, 2, 3, 4, 5));
    const output = formatReport(payload, "RUNHASH123456");

    // Every fragment below must appear somewhere in the rendered report.
    const expectedFragments = [
      "fix-off-by-one",
      "0.8000",
      "hermes",
      "claude-sonnet-4",
      "1.0.0",
      "frontmatter-compliance",
      "token-stats",
      "THREAD0123456789",
      "RUNHASH123456",
    ];
    for (const fragment of expectedFragments) {
      expect(output).toContain(fragment);
    }
  });

  test("round-trips a stored run via readEvalRun", () => {
    const evalStore = makeEvalStore();
    const hash = storeRun(evalStore, makePayload("fix-off-by-one", 0.75, Date.now()));

    const loaded = readEvalRun(evalStore, hash);
    expect(loaded).not.toBeNull();

    const output = formatReport(loaded as EvalRunPayload, hash);
    expect(output).toContain("fix-off-by-one");
    expect(output).toContain("0.7500");
  });

  test("readEvalRun returns null for a missing hash", () => {
    const missing = readEvalRun(makeEvalStore(), "NOPENOPENOPE0");

    expect(missing).toBeNull();
  });
});
|
||||
|
||||
// Tests for the listing pipeline: readEvalEntries() -> selectEntries() -> formatList().
describe("list", () => {
  /**
   * Build a fresh eval store and store one run per `[task, overall, timestamp]`
   * tuple, in the order given.
   */
  const seededStore = (...runs: Array<[string, number, number]>) => {
    const evalStore = makeEvalStore();
    for (const [task, overall, timestamp] of runs) {
      storeRun(evalStore, makePayload(task, overall, timestamp));
    }
    return evalStore;
  };

  test("lists eval runs stored under different tasks", () => {
    const evalStore = seededStore(["fix-off-by-one", 0.8, 2000], ["write-docs", 0.6, 1000]);

    const entries = readEvalEntries(evalStore);
    expect(entries).toHaveLength(2);

    const listing = formatList(selectEntries(entries, null, 20));
    for (const task of ["fix-off-by-one", "write-docs"]) {
      expect(listing).toContain(task);
    }
  });

  test("sorts newest-first by timestamp", () => {
    // Stored oldest-first on purpose, so the expected order differs from store order.
    const evalStore = seededStore(["old-task", 0.5, 1000], ["new-task", 0.5, 2000]);

    const [first, second] = selectEntries(readEvalEntries(evalStore), null, 20);
    expect(first?.task).toBe("new-task");
    expect(second?.task).toBe("old-task");
  });

  test("--task filter only shows the matching task", () => {
    const evalStore = seededStore(["fix-off-by-one", 0.8, 2000], ["write-docs", 0.6, 1000]);

    const filtered = selectEntries(readEvalEntries(evalStore), "write-docs", 20);
    const listing = formatList(filtered);
    expect(listing).toContain("write-docs");
    expect(listing).not.toContain("fix-off-by-one");
  });

  test("--limit caps the number of rows", () => {
    const evalStore = seededStore(["task-a", 0.8, 3000], ["task-b", 0.6, 2000], ["task-c", 0.4, 1000]);

    const limited = selectEntries(readEvalEntries(evalStore), null, 2);
    expect(limited).toHaveLength(2);
    // The two kept rows are the ones with the highest timestamps.
    const tasks = limited.map((entry) => entry.task);
    expect(tasks).toEqual(["task-a", "task-b"]);
  });

  test("empty store renders a placeholder", () => {
    const noEntries = selectEntries(readEvalEntries(seededStore()), null, 20);
    expect(formatList(noEntries)).toContain("(no eval runs found)");
  });
});
|
||||
|
||||
// Tests for formatDiff(): score-delta markers and config mismatch markers.
describe("formatDiff", () => {
  const HASH_A = "HASHA00000000";
  const HASH_B = "HASHB00000000";

  /** Diff two runs of the same task that differ only in overall score (A is the older run). */
  const diffByScore = (overallA: number, overallB: number) =>
    formatDiff(
      makePayload("fix-off-by-one", overallA, 1000),
      HASH_A,
      makePayload("fix-off-by-one", overallB, 2000),
      HASH_B,
    );

  test("shows an upward delta when B scores higher", () => {
    const rendered = diffByScore(0.6, 0.8);

    expect(rendered).toContain("▲");
    expect(rendered).toContain(HASH_A);
    expect(rendered).toContain(HASH_B);
  });

  test("shows a downward delta when B scores lower", () => {
    expect(diffByScore(0.9, 0.4)).toContain("▼");
  });

  test("marks differing config values", () => {
    const configA = {
      agent: "hermes",
      model: "claude-sonnet-4",
      engineVersion: "1.0.0",
    };
    // Only the agent differs between the two runs.
    const configB = { ...configA, agent: "claude-code" };

    // `undefined` keeps makePayload's default judges for both runs.
    const runA = makePayload("fix-off-by-one", 0.6, 1000, undefined, configA);
    const runB = makePayload("fix-off-by-one", 0.6, 2000, undefined, configB);

    const rendered = formatDiff(runA, HASH_A, runB, HASH_B);
    expect(rendered).toContain("≠");
    expect(rendered).toContain("claude-code");
  });
});
|
||||
@@ -0,0 +1,74 @@
|
||||
import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
|
||||
import { afterEach, beforeEach, describe, expect, test } from "vitest";
|
||||
|
||||
import { prepare } from "../src/runner/index.js";
|
||||
|
||||
/**
 * task.yaml fixture written into the scratch task directory before each test.
 *
 * YAML block structure is indentation-based: `maxSteps` / `timeoutMinutes`
 * must be nested under `limits`, and each judge's fields must line up under
 * its `- name:` entry, otherwise they parse as unrelated top-level keys.
 */
const TASK_YAML = `
name: fix-off-by-one
description: Fix an off-by-one error
workflow: solve-issue
prompt: "Fix the bug"
limits:
  maxSteps: 12
  timeoutMinutes: 20
judges:
  - name: frontmatter-compliance
    weight: 0.5
    builtin: true
  - name: test-pass
    weight: 0.5
    entry: dist/judges/test-pass.js
`;
|
||||
|
||||
/** Scratch task directory: created by `beforeEach`, removed by `afterEach`. */
let taskDir: string;

// Lay out a task directory on disk: task.yaml plus a small fixture/ tree.
beforeEach(async () => {
  taskDir = await mkdtemp(join(tmpdir(), "uwf-eval-task-"));

  const fixtureRoot = join(taskDir, "fixture");
  const fixtureSrc = join(fixtureRoot, "src");
  await mkdir(fixtureSrc, { recursive: true });

  // The three files are independent of each other once the directories exist.
  await Promise.all([
    writeFile(join(taskDir, "task.yaml"), TASK_YAML, "utf8"),
    writeFile(join(fixtureSrc, "calc.ts"), "export const add = (a, b) => a + b + 1;\n"),
    writeFile(join(fixtureRoot, "package.json"), '{ "name": "fixture" }\n'),
  ]);
});

// `force: true` keeps the removal from failing if the directory is already gone.
afterEach(() => rm(taskDir, { recursive: true, force: true }));
|
||||
|
||||
// Tests for prepare(): manifest parsing plus creation of a temp work dir.
describe("prepare", () => {
  // Directories created by a test outside of `taskDir`. They are removed in
  // the hook below rather than at the end of each test body, so nothing is
  // left behind in the OS temp dir when an assertion throws part-way through.
  const scratchDirs: string[] = [];

  afterEach(async () => {
    // splice(0) empties the registry so each test starts with a clean list.
    const stale = scratchDirs.splice(0);
    await Promise.all(stale.map((dir) => rm(dir, { recursive: true, force: true })));
  });

  /** Run prepare() and register the work dir it returns for cleanup. */
  async function prepareTracked(dir: string) {
    const result = await prepare(dir);
    scratchDirs.push(result.workDir);
    return result;
  }

  test("returns the parsed manifest", async () => {
    const result = await prepareTracked(taskDir);

    expect(result.taskDir).toBe(taskDir);
    expect(result.manifest.name).toBe("fix-off-by-one");
    expect(result.manifest.workflow).toBe("solve-issue");
    expect(result.manifest.limits.maxSteps).toBe(12);
    expect(result.manifest.judges).toHaveLength(2);
  });

  test("copies fixture into a fresh temp work dir", async () => {
    const result = await prepareTracked(taskDir);

    expect(result.workDir).not.toBe(taskDir);
    expect(result.workDir.startsWith(tmpdir())).toBe(true);

    // Files written under fixture/ by beforeEach must appear at the work dir root.
    const calc = await readFile(join(result.workDir, "src", "calc.ts"), "utf8");
    expect(calc).toContain("export const add");
    const pkg = await readFile(join(result.workDir, "package.json"), "utf8");
    expect(pkg).toContain("fixture");
  });

  test("creates an empty work dir when no fixture/ exists", async () => {
    // A task directory holding only task.yaml, with no fixture/ subdirectory.
    const noFixtureDir = await mkdtemp(join(tmpdir(), "uwf-eval-nofix-"));
    scratchDirs.push(noFixtureDir);
    await writeFile(join(noFixtureDir, "task.yaml"), TASK_YAML, "utf8");

    const result = await prepareTracked(noFixtureDir);
    expect(result.workDir.startsWith(tmpdir())).toBe(true);
  });
});
|
||||
@@ -0,0 +1,63 @@
|
||||
import { describe, expect, test } from "vitest";
|
||||
import {
|
||||
EVAL_JUDGE_FRONTMATTER_SCHEMA,
|
||||
EVAL_JUDGE_HALLUCINATION_SCHEMA,
|
||||
EVAL_JUDGE_TOKEN_STATS_SCHEMA,
|
||||
EVAL_JUDGE_UPSTREAM_SCHEMA,
|
||||
EVAL_RUN_SCHEMA,
|
||||
} from "../src/storage/index.js";
|
||||
|
||||
// Shape checks for the exported eval schemas: title, required fields, root type.
describe("OCAS schema definitions", () => {
  /** Assert that every name in `fields` is listed in `schema.required`. */
  const expectRequired = (schema: { required?: unknown }, fields: string[]) => {
    const required = schema.required as string[];
    for (const field of fields) {
      expect(required).toContain(field);
    }
  };

  test("eval-run schema has correct title and required fields", () => {
    expect(EVAL_RUN_SCHEMA.title).toBe("@uwf/eval-run");
    expectRequired(EVAL_RUN_SCHEMA, [
      "task",
      "config",
      "threadId",
      "judges",
      "overall",
      "timestamp",
    ]);
  });

  test("frontmatter judge schema has correct title", () => {
    expect(EVAL_JUDGE_FRONTMATTER_SCHEMA.title).toBe("@uwf/eval-judge-frontmatter");
    expectRequired(EVAL_JUDGE_FRONTMATTER_SCHEMA, ["stepsTotal", "stepsValid", "invalidSteps"]);
  });

  test("upstream judge schema has correct title", () => {
    expect(EVAL_JUDGE_UPSTREAM_SCHEMA.title).toBe("@uwf/eval-judge-upstream");
    expectRequired(EVAL_JUDGE_UPSTREAM_SCHEMA, ["perStep"]);
  });

  test("hallucination judge schema has correct title", () => {
    expect(EVAL_JUDGE_HALLUCINATION_SCHEMA.title).toBe("@uwf/eval-judge-hallucination");
    expectRequired(EVAL_JUDGE_HALLUCINATION_SCHEMA, ["perStep"]);
  });

  test("token-stats judge schema has correct title", () => {
    expect(EVAL_JUDGE_TOKEN_STATS_SCHEMA.title).toBe("@uwf/eval-judge-token-stats");
    expectRequired(EVAL_JUDGE_TOKEN_STATS_SCHEMA, [
      "totalInput",
      "totalOutput",
      "totalTurns",
      "perStep",
    ]);
  });

  test("all schemas have type object at root", () => {
    [
      EVAL_RUN_SCHEMA,
      EVAL_JUDGE_FRONTMATTER_SCHEMA,
      EVAL_JUDGE_UPSTREAM_SCHEMA,
      EVAL_JUDGE_HALLUCINATION_SCHEMA,
      EVAL_JUDGE_TOKEN_STATS_SCHEMA,
    ].forEach((schema) => {
      expect(schema.type).toBe("object");
    });
  });
});
|
||||
@@ -0,0 +1,163 @@
|
||||
import { describe, expect, test } from "vitest";
|
||||
import { parseTaskManifest } from "../src/task/index.js";
|
||||
|
||||
/**
 * A complete, valid task manifest: one builtin judge and one custom judge
 * that carries both `entry` and `schema`.
 *
 * YAML block structure is indentation-based, so the `limits` children and the
 * per-judge fields must be nested as written here to parse as intended.
 */
const VALID_YAML = `
name: fix-off-by-one
description: Fix an off-by-one error in a calculator
workflow: solve-issue
prompt: "Fix the bug: add(1,2) returns 4 instead of 3"
limits:
  maxSteps: 15
  timeoutMinutes: 30
judges:
  - name: frontmatter-compliance
    weight: 0.15
    builtin: true
  - name: test-pass
    weight: 0.3
    entry: dist/judges/test-pass.js
    schema: schemas/test-pass.json
`;
|
||||
|
||||
// Tests for parseTaskManifest(): happy path, defaults, and validation errors.
//
// The inline YAML fixtures are kept flush-left inside their template literals
// so the only indentation in each string is the YAML nesting itself (judge
// fields aligned under their `- name:` / `- weight:` list entry).
describe("parseTaskManifest", () => {
  test("parses valid task.yaml", () => {
    const manifest = parseTaskManifest(VALID_YAML);
    expect(manifest.name).toBe("fix-off-by-one");
    expect(manifest.description).toBe("Fix an off-by-one error in a calculator");
    expect(manifest.workflow).toBe("solve-issue");
    expect(manifest.prompt).toBe("Fix the bug: add(1,2) returns 4 instead of 3");
    expect(manifest.limits).toEqual({ maxSteps: 15, timeoutMinutes: 30 });
    expect(manifest.judges).toHaveLength(2);
  });

  test("parses builtin judge", () => {
    const manifest = parseTaskManifest(VALID_YAML);
    const builtin = manifest.judges[0];
    // toBeDefined() guards the element; `?.` then avoids non-null assertions.
    expect(builtin).toBeDefined();
    expect(builtin?.name).toBe("frontmatter-compliance");
    expect(builtin?.weight).toBe(0.15);
    expect(builtin?.builtin).toBe(true);
    expect(builtin?.entry).toBeNull();
  });

  test("parses custom judge with entry + schema", () => {
    const manifest = parseTaskManifest(VALID_YAML);
    const custom = manifest.judges[1];
    expect(custom).toBeDefined();
    expect(custom?.name).toBe("test-pass");
    expect(custom?.weight).toBe(0.3);
    expect(custom?.builtin).toBe(false);
    expect(custom?.entry).toBe("dist/judges/test-pass.js");
    expect(custom?.schema).toBe("schemas/test-pass.json");
  });

  test("defaults limits when omitted", () => {
    const yaml = `
name: minimal
workflow: solve-issue
prompt: do something
judges:
  - name: check
    builtin: true
`;
    const manifest = parseTaskManifest(yaml);
    expect(manifest.limits).toEqual({ maxSteps: 20, timeoutMinutes: 30 });
  });

  test("defaults description to empty string", () => {
    const yaml = `
name: no-desc
workflow: solve-issue
prompt: do something
judges:
  - name: check
    builtin: true
`;
    const manifest = parseTaskManifest(yaml);
    expect(manifest.description).toBe("");
  });

  test("rejects missing name", () => {
    const yaml = `
workflow: solve-issue
prompt: do something
judges:
  - name: check
    builtin: true
`;
    expect(() => parseTaskManifest(yaml)).toThrow("name is required");
  });

  test("rejects missing workflow", () => {
    const yaml = `
name: test
prompt: do something
judges:
  - name: check
    builtin: true
`;
    expect(() => parseTaskManifest(yaml)).toThrow("workflow is required");
  });

  test("rejects missing prompt", () => {
    const yaml = `
name: test
workflow: solve-issue
judges:
  - name: check
    builtin: true
`;
    expect(() => parseTaskManifest(yaml)).toThrow("prompt is required");
  });

  test("rejects empty judges array", () => {
    const yaml = `
name: test
workflow: solve-issue
prompt: do something
judges: []
`;
    expect(() => parseTaskManifest(yaml)).toThrow("at least one judge");
  });

  test("rejects non-builtin judge without entry", () => {
    // No `builtin` and no `entry` on the judge.
    const yaml = `
name: test
workflow: solve-issue
prompt: do something
judges:
  - name: custom-check
    weight: 0.5
`;
    expect(() => parseTaskManifest(yaml)).toThrow("non-builtin judge must have entry");
  });

  test("rejects non-object YAML root", () => {
    // A bare scalar is valid YAML but not a mapping.
    expect(() => parseTaskManifest("just a string")).toThrow("must be a YAML mapping");
  });

  test("rejects judge without name", () => {
    // The manifest itself has a name; only the judge entry lacks one.
    const yaml = `
name: test
workflow: solve-issue
prompt: do something
judges:
  - weight: 0.5
    builtin: true
`;
    expect(() => parseTaskManifest(yaml)).toThrow("name is required");
  });

  test("defaults weight to 0 when omitted", () => {
    const yaml = `
name: test
workflow: solve-issue
prompt: do something
judges:
  - name: token-stats
    builtin: true
`;
    const manifest = parseTaskManifest(yaml);
    expect(manifest.judges[0]?.weight).toBe(0);
  });
});
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user