diff --git a/packages/cli/src/__tests__/e2e-mock-agent.test.ts b/packages/cli/src/__tests__/e2e-mock-agent.test.ts
new file mode 100644
index 0000000..e71e488
--- /dev/null
+++ b/packages/cli/src/__tests__/e2e-mock-agent.test.ts
@@ -0,0 +1,336 @@
+import { execFileSync } from "node:child_process";
+import { existsSync } from "node:fs";
+import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { dirname, join } from "node:path";
+import { fileURLToPath } from "node:url";
+import { openStore } from "@ocas/fs";
+import type { CasRef, StartNodePayload, StepNodePayload } from "@united-workforce/protocol";
+import { afterEach, beforeAll, beforeEach, describe, expect, test } from "vitest";
+import { stringify } from "yaml";
+import { cmdThreadStart } from "../commands/thread.js";
+import { cmdWorkflowAdd } from "../commands/workflow.js";
+import { createUwfStore, getThread } from "../store.js";
+
+// ── paths ──────────────────────────────────────────────────────────────────
+
+const TEST_DIR = dirname(fileURLToPath(import.meta.url));
+const FIXTURES_DIR = join(TEST_DIR, "fixtures");
+const CLI_PATH = join(TEST_DIR, "..", "..", "dist", "cli.js");
+const REPO_ROOT = join(TEST_DIR, "..", "..", "..", "..");
+const AGENT_MOCK_DIR = join(REPO_ROOT, "packages", "agent-mock");
+const AGENT_MOCK_CLI = join(AGENT_MOCK_DIR, "dist", "cli.js");
+
+// ── shared fixture state ─────────────────────────────────────────────────────
+
+let tmpDir: string;
+let uwfHome: string;
+let casDir: string;
+let savedEnv: { uwf: string | undefined; ocas: string | undefined };
+
+/**
+ * The mock agent runs from its built `dist/cli.js`. When the test suite runs
+ * standalone (no prior `pnpm run build`), build it on demand so the E2E run is
+ * self-contained.
+ *
+ * NOTE(review): only the mock agent is built here. `runExec` spawns `CLI_PATH`
+ * without an existence check, so the CLI's own `dist/cli.js` presumably has to
+ * be built beforehand — confirm against the test runner setup.
+ */
+beforeAll(() => {
+  if (existsSync(AGENT_MOCK_CLI)) {
+    return;
+  }
+  execFileSync(
+    process.execPath,
+    [
+      join(REPO_ROOT, "node_modules", "typescript", "bin", "tsc"),
+      "--build",
+      "--force",
+      AGENT_MOCK_DIR,
+    ],
+    { cwd: REPO_ROOT, stdio: "ignore" },
+  );
+}, 120000);
+
+beforeEach(async () => {
+  tmpDir = await mkdtemp(join(tmpdir(), "cli-e2e-mock-"));
+  uwfHome = join(tmpDir, "uwf");
+  casDir = join(tmpDir, "ocas");
+  await mkdir(uwfHome, { recursive: true });
+  await mkdir(casDir, { recursive: true });
+  // Programmatic CLI APIs (cmdWorkflowAdd, cmdThreadStart) read the global CAS
+  // directory from OCAS_HOME and the storage root from UWF_HOME.
+  savedEnv = { uwf: process.env.UWF_HOME, ocas: process.env.OCAS_HOME };
+  process.env.UWF_HOME = uwfHome;
+  process.env.OCAS_HOME = casDir;
+});
+
+/**
+ * Restore one environment variable to the value captured in `beforeEach`.
+ *
+ * Node coerces every value assigned to `process.env` to a string, so writing
+ * `undefined` back would leave the literal string "undefined" behind. A
+ * variable that was originally unset therefore has to be deleted instead.
+ */
+function restoreEnv(name: string, value: string | undefined): void {
+  if (value === undefined) {
+    delete process.env[name];
+  } else {
+    process.env[name] = value;
+  }
+}
+
+afterEach(async () => {
+  restoreEnv("UWF_HOME", savedEnv.uwf);
+  restoreEnv("OCAS_HOME", savedEnv.ocas);
+  await rm(tmpDir, { recursive: true, force: true });
+});
+
+// ── helpers ──────────────────────────────────────────────────────────────────
+
+/**
+ * Write a `config.yaml` into UWF_HOME that wires the default agent to the mock
+ * agent. The mock data path is baked into the agent args so the CLI's
+ * `thread exec` (without an `--agent` override) resolves it from config.
+ */
+async function writeMockConfig(mockDataFixture: string): Promise<void> {
+  const config = {
+    defaultAgent: "mock",
+    defaultModel: "test",
+    providers: {},
+    models: {},
+    agentOverrides: null,
+    agents: {
+      mock: {
+        command: process.execPath,
+        args: [AGENT_MOCK_CLI, "--mock-data", join(FIXTURES_DIR, mockDataFixture)],
+      },
+    },
+  };
+  await writeFile(join(uwfHome, "config.yaml"), stringify(config));
+}
+
+/**
+ * `cmdWorkflowAdd` enforces filename↔name consistency, so copy the fixture into
+ * UWF_HOME under `<workflowName>.yaml` before registering it.
+ *
+ * @returns the `hash` field of the `cmdWorkflowAdd` result.
+ */
+async function addWorkflow(
+  workflowFixture: string,
+  workflowName: string,
+): Promise<Awaited<ReturnType<typeof cmdWorkflowAdd>>["hash"]> {
+  const text = await readFile(join(FIXTURES_DIR, workflowFixture), "utf8");
+  const filePath = join(uwfHome, `${workflowName}.yaml`);
+  await writeFile(filePath, text);
+  const result = await cmdWorkflowAdd(uwfHome, filePath);
+  return result.hash;
+}
+
+type ExecResult = { stdout: string; stderr: string; exitCode: number };
+
+/**
+ * Run `thread exec <threadId>` through the built CLI in a child process.
+ *
+ * Never throws: a failed or timed-out run is reported through `exitCode`,
+ * together with whatever stdout/stderr the thrown error carries.
+ */
+function runExec(threadId: string): ExecResult {
+  try {
+    const stdout = execFileSync(process.execPath, [CLI_PATH, "thread", "exec", threadId], {
+      encoding: "utf8",
+      stdio: ["ignore", "pipe", "pipe"],
+      env: { ...process.env, UWF_HOME: uwfHome, OCAS_HOME: casDir },
+      cwd: tmpDir,
+      timeout: 30000,
+    });
+    return { stdout, stderr: "", exitCode: 0 };
+  } catch (e: unknown) {
+    const err = e as NodeJS.ErrnoException & {
+      stdout?: string;
+      stderr?: string;
+      status?: number;
+    };
+    // No numeric `status` (e.g. the child was killed on timeout) is reported as exit code 1.
+    return { stdout: err.stdout ?? "", stderr: err.stderr ?? "", exitCode: err.status ?? 1 };
+  }
+}
+
+type StepOutputJson = {
+  thread: string;
+  head: string;
+  status: string;
+  currentRole: string | null;
+  done: boolean;
+};
+
+/**
+ * Run one `thread exec` step and parse its stdout as `StepOutputJson`.
+ * Throws with the captured stdout/stderr when the CLI exits non-zero.
+ */
+function execStep(threadId: string): StepOutputJson {
+  const { stdout, stderr, exitCode } = runExec(threadId);
+  if (exitCode !== 0) {
+    throw new Error(`thread exec failed (code ${exitCode})\nstdout: ${stdout}\nstderr: ${stderr}`);
+  }
+  return JSON.parse(stdout.trim()) as StepOutputJson;
+}
+
+/** Store handle as returned by `openStore`. */
+type CasStore = Awaited<ReturnType<typeof openStore>>;
+
+/** Fetch a CAS node by hash, assert it is not null, and return its payload as a step node. */
+function getStepNode(store: CasStore, hash: string): StepNodePayload {
+  const node = store.cas.get(hash as CasRef);
+  expect(node).not.toBeNull();
+  return node!.payload as StepNodePayload;
+}
+
+/** Read the `$status` field from the payload of the node stored at `outputRef`. */
+function getStatus(store: CasStore, outputRef: CasRef): unknown {
+  const node = store.cas.get(outputRef);
+  expect(node).not.toBeNull();
+  return (node!.payload as Record<string, unknown>).$status;
+}
+
+// ── scenarios ─────────────────────────────────────────────────────────────────
+
+describe("E2E mock-agent: full uwf pipeline", () => {
+  test("1. linear workflow runs planner then worker and reaches $END", async () => {
+    await writeMockConfig("e2e-linear.mock.yaml");
+    const workflowHash = await addWorkflow("e2e-linear.workflow.yaml", "test-linear");
+
+    const start = await cmdThreadStart(uwfHome, workflowHash, "Build the thing", uwfHome, tmpDir);
+    const threadId = start.thread;
+
+    // Capture the start node hash (thread head before any step).
+    const startHash = getThread((await createUwfStore(uwfHome)).varStore, threadId)?.head;
+    expect(startHash).toBeDefined();
+
+    // Step 1 → planner.
+    const step1 = execStep(threadId);
+    expect(step1.thread).toBe(threadId);
+    expect(step1.done).toBe(false);
+    expect(step1.status).toBe("idle");
+    expect(step1.currentRole).toBe("worker");
+
+    // Step 2 → worker → $END (thread completes).
+    const step2 = execStep(threadId);
+    expect(step2.done).toBe(true);
+    expect(step2.status).toBe("completed");
+    expect(step2.currentRole).toBeNull();
+
+    // Verify CAS chain integrity: start → step1 → step2.
+    const store = await openStore(casDir);
+    const s1 = getStepNode(store, step1.head);
+    const s2 = getStepNode(store, step2.head);
+
+    expect(s1.role).toBe("planner");
+    expect(s1.prev).toBeNull();
+    expect(s1.start).toBe(startHash);
+
+    expect(s2.role).toBe("worker");
+    expect(s2.prev).toBe(step1.head);
+    expect(s2.start).toBe(s1.start);
+
+    // Output frontmatter statuses persisted correctly.
+    expect(getStatus(store, s1.output)).toBe("ready");
+    expect(getStatus(store, s2.output)).toBe("done");
+
+    // The start node points at the registered workflow.
+    const startNode = store.cas.get(startHash as CasRef);
+    expect((startNode!.payload as StartNodePayload).workflow).toBe(workflowHash);
+
+    // Thread is completed: status changed to "completed", head updated.
+    const uwf = await createUwfStore(uwfHome);
+    const finalEntry = getThread(uwf.varStore, threadId);
+    expect(finalEntry).not.toBeNull();
+    expect(finalEntry!.status).toBe("completed");
+    expect(finalEntry!.head).toBe(step2.head);
+  });
+
+  test("2. branching workflow loops developer→reviewer→developer→reviewer→$END", async () => {
+    await writeMockConfig("e2e-loop.mock.yaml");
+    const workflowHash = await addWorkflow("e2e-loop.workflow.yaml", "test-loop");
+
+    const start = await cmdThreadStart(uwfHome, workflowHash, "Implement feature", uwfHome, tmpDir);
+    const threadId = start.thread;
+
+    // 4 steps: developer, reviewer (rejected → loop), developer, reviewer (approved → $END).
+    const s1 = execStep(threadId);
+    expect(s1.status).toBe("idle");
+    expect(s1.currentRole).toBe("reviewer");
+
+    const s2 = execStep(threadId);
+    expect(s2.status).toBe("idle");
+    // reviewer rejected → loops back to developer.
+    expect(s2.currentRole).toBe("developer");
+
+    const s3 = execStep(threadId);
+    expect(s3.status).toBe("idle");
+    expect(s3.currentRole).toBe("reviewer");
+
+    const s4 = execStep(threadId);
+    expect(s4.done).toBe(true);
+    expect(s4.status).toBe("completed");
+
+    // Verify the chain order and roles.
+    const store = await openStore(casDir);
+    const n1 = getStepNode(store, s1.head);
+    const n2 = getStepNode(store, s2.head);
+    const n3 = getStepNode(store, s3.head);
+    const n4 = getStepNode(store, s4.head);
+
+    expect([n1.role, n2.role, n3.role, n4.role]).toEqual([
+      "developer",
+      "reviewer",
+      "developer",
+      "reviewer",
+    ]);
+    expect(n1.prev).toBeNull();
+    expect(n2.prev).toBe(s1.head);
+    expect(n3.prev).toBe(s2.head);
+    expect(n4.prev).toBe(s3.head);
+
+    // All steps share the same start node.
+    expect(new Set([n1.start, n2.start, n3.start, n4.start]).size).toBe(1);
+
+    // Statuses drove the loop routing.
+    expect(getStatus(store, n1.output)).toBe("review_needed");
+    expect(getStatus(store, n2.output)).toBe("rejected");
+    expect(getStatus(store, n3.output)).toBe("review_needed");
+    expect(getStatus(store, n4.output)).toBe("approved");
+
+    const uwf = await createUwfStore(uwfHome);
+    const finalEntry = getThread(uwf.varStore, threadId);
+    expect(finalEntry).not.toBeNull();
+    expect(finalEntry!.status).toBe("completed");
+  });
+
+  test("3. role mismatch in mock data makes the agent exit with an error", async () => {
+    // Reuses the linear workflow but with a mock whose step[1].role is wrong.
+    await writeMockConfig("e2e-mismatch.mock.yaml");
+    const workflowHash = await addWorkflow("e2e-linear.workflow.yaml", "test-linear");
+
+    const start = await cmdThreadStart(uwfHome, workflowHash, "Build the thing", uwfHome, tmpDir);
+    const threadId = start.thread;
+
+    // Step 1 (planner) matches and succeeds.
+    const step1 = execStep(threadId);
+    expect(step1.status).toBe("idle");
+    expect(step1.currentRole).toBe("worker");
+
+    // Step 2: moderator routes to "worker" but mock step[1].role is "planner".
+    const result = runExec(threadId);
+    expect(result.exitCode).not.toBe(0);
+    expect(`${result.stdout}\n${result.stderr}`).toMatch(/expected role "planner"/);
+
+    // The thread remains active (no step node was written for the failed step).
+    const uwf = await createUwfStore(uwfHome);
+    const entry = getThread(uwf.varStore, threadId);
+    expect(entry).not.toBeNull();
+    expect(entry!.status).not.toBe("completed");
+    expect(entry!.head).toBe(step1.head);
+  });
+});
diff --git a/packages/cli/src/__tests__/fixtures/e2e-linear.mock.yaml b/packages/cli/src/__tests__/fixtures/e2e-linear.mock.yaml
new file mode 100644
index 0000000..66a5135
--- /dev/null
+++ b/packages/cli/src/__tests__/fixtures/e2e-linear.mock.yaml
@@ -0,0 +1,13 @@
+steps:
+  - role: planner
+    output: |
+      ---
+      $status: ready
+      ---
+      Planning complete.
+  - role: worker
+    output: |
+      ---
+      $status: done
+      ---
+      Work complete.
diff --git a/packages/cli/src/__tests__/fixtures/e2e-linear.workflow.yaml b/packages/cli/src/__tests__/fixtures/e2e-linear.workflow.yaml
new file mode 100644
index 0000000..9a4a638
--- /dev/null
+++ b/packages/cli/src/__tests__/fixtures/e2e-linear.workflow.yaml
@@ -0,0 +1,32 @@
+name: test-linear
+description: Simple 2-step linear test (planner -> worker -> $END)
+roles:
+  planner:
+    description: Plans work
+    goal: Plan the task
+    capabilities: []
+    procedure: Plan it
+    output: Output a plan and set $status to ready
+    frontmatter:
+      oneOf:
+        - properties:
+            $status: { const: ready }
+          required: [$status]
+  worker:
+    description: Does work
+    goal: Do the work
+    capabilities: []
+    procedure: Do it
+    output: Output the result and set $status to done
+    frontmatter:
+      oneOf:
+        - properties:
+            $status: { const: done }
+          required: [$status]
+graph:
+  $START:
+    _: { role: planner, prompt: 'Plan the task' }
+  planner:
+    ready: { role: worker, prompt: 'Do the work' }
+  worker:
+    done: { role: '$END', prompt: 'Done' }
diff --git a/packages/cli/src/__tests__/fixtures/e2e-loop.mock.yaml b/packages/cli/src/__tests__/fixtures/e2e-loop.mock.yaml
new file mode 100644
index 0000000..e2fa37d
--- /dev/null
+++ b/packages/cli/src/__tests__/fixtures/e2e-loop.mock.yaml
@@ -0,0 +1,25 @@
+steps:
+  - role: developer
+    output: |
+      ---
+      $status: review_needed
+      ---
+      First implementation.
+  - role: reviewer
+    output: |
+      ---
+      $status: rejected
+      ---
+      Needs changes, sending back.
+  - role: developer
+    output: |
+      ---
+      $status: review_needed
+      ---
+      Second implementation addressing feedback.
+  - role: reviewer
+    output: |
+      ---
+      $status: approved
+      ---
+      Looks good, approved.
diff --git a/packages/cli/src/__tests__/fixtures/e2e-loop.workflow.yaml b/packages/cli/src/__tests__/fixtures/e2e-loop.workflow.yaml
new file mode 100644
index 0000000..604452a
--- /dev/null
+++ b/packages/cli/src/__tests__/fixtures/e2e-loop.workflow.yaml
@@ -0,0 +1,36 @@
+name: test-loop
+description: Branching test where the reviewer can reject and loop back to the developer
+roles:
+  developer:
+    description: Implements changes
+    goal: Implement the change
+    capabilities: []
+    procedure: Write code
+    output: Summarize the change and set $status to review_needed
+    frontmatter:
+      oneOf:
+        - properties:
+            $status: { const: review_needed }
+          required: [$status]
+  reviewer:
+    description: Reviews changes
+    goal: Review the change
+    capabilities: []
+    procedure: Review code
+    output: Approve or reject; set $status to approved or rejected
+    frontmatter:
+      oneOf: # the reviewer may emit either of the two statuses below
+        - properties:
+            $status: { const: rejected }
+          required: [$status]
+        - properties:
+            $status: { const: approved }
+          required: [$status]
+graph: # outer key = role that just ran, inner key = the $status it emitted
+  $START:
+    _: { role: developer, prompt: 'Implement the change' }
+  developer:
+    review_needed: { role: reviewer, prompt: 'Review the change' }
+  reviewer:
+    rejected: { role: developer, prompt: 'Fix the issues and resubmit' } # loops back to developer
+    approved: { role: '$END', prompt: 'Approved, done' } # routes to $END
diff --git a/packages/cli/src/__tests__/fixtures/e2e-mismatch.mock.yaml b/packages/cli/src/__tests__/fixtures/e2e-mismatch.mock.yaml
new file mode 100644
index 0000000..d397d90
--- /dev/null
+++ b/packages/cli/src/__tests__/fixtures/e2e-mismatch.mock.yaml
@@ -0,0 +1,16 @@
+# Reuses the test-linear workflow. The moderator routes step 0 -> planner and
+# step 1 -> worker, but step[1].role below is "planner", so the mock agent must
+# detect the role mismatch on the second step and exit with an error.
+steps:
+  - role: planner # step[0]: matches the role the workflow routes to first
+    output: |
+      ---
+      $status: ready
+      ---
+      Planning complete.
+  - role: planner # step[1]: deliberately wrong; the workflow routes to worker here
+    output: |
+      ---
+      $status: done
+      ---
+      This step claims to be planner, but the moderator routes to worker.