test: E2E integration tests with uwf-mock agent (#33)

Three scenarios testing the full CLI pipeline: 1. Linear workflow (planner → worker → $END): CAS chain integrity 2. Loop workflow (developer ↔ reviewer): moderator routing through cycles 3. Role mismatch detection: agent catches routing bugs Uses workflow add → thread start → thread exec with uwf-mock, verifying CAS state, thread lifecycle, and error handling. Refs #33
2026-06-04 07:44:48 +00:00
parent 6850826abe
commit ede428bff2
6 changed files with 415 additions and 0 deletions
@@ -0,0 +1,293 @@
+import { execFileSync } from "node:child_process";
+import { existsSync } from "node:fs";
+import { mkdir, mkdtemp, readFile, rm, writeFile } from "node:fs/promises";
+import { tmpdir } from "node:os";
+import { dirname, join } from "node:path";
+import { fileURLToPath } from "node:url";
+import { openStore } from "@ocas/fs";
+import type { CasRef, StartNodePayload, StepNodePayload } from "@united-workforce/protocol";
+import { afterEach, beforeAll, beforeEach, describe, expect, test } from "vitest";
+import { stringify } from "yaml";
+import { cmdThreadStart } from "../commands/thread.js";
+import { cmdWorkflowAdd } from "../commands/workflow.js";
+import { createUwfStore, findHistoryEntry, getThread } from "../store.js";
+
+// ── paths ──────────────────────────────────────────────────────────────────
+
+const TEST_DIR = dirname(fileURLToPath(import.meta.url));
+const FIXTURES_DIR = join(TEST_DIR, "fixtures");
+const CLI_PATH = join(TEST_DIR, "..", "..", "dist", "cli.js");
+const REPO_ROOT = join(TEST_DIR, "..", "..", "..", "..");
+const AGENT_MOCK_DIR = join(REPO_ROOT, "packages", "agent-mock");
+const AGENT_MOCK_CLI = join(AGENT_MOCK_DIR, "dist", "cli.js");
+
+// ── shared fixture state ─────────────────────────────────────────────────────
+
+let tmpDir: string;
+let uwfHome: string;
+let casDir: string;
+let savedEnv: { uwf: string | undefined; ocas: string | undefined };
+
+/**
+ * The mock agent runs from its built `dist/cli.js`. When the test suite runs
+ * standalone (no prior `pnpm run build`), build it on demand so the E2E run is
+ * self-contained.
+ */
+beforeAll(() => {
+  if (existsSync(AGENT_MOCK_CLI)) {
+    return;
+  }
+  execFileSync(
+    process.execPath,
+    [
+      join(REPO_ROOT, "node_modules", "typescript", "bin", "tsc"),
+      "--build",
+      "--force",
+      AGENT_MOCK_DIR,
+    ],
+    { cwd: REPO_ROOT, stdio: "ignore" },
+  );
+}, 120000);
+
+beforeEach(async () => {
+  tmpDir = await mkdtemp(join(tmpdir(), "cli-e2e-mock-"));
+  uwfHome = join(tmpDir, "uwf");
+  casDir = join(tmpDir, "ocas");
+  await mkdir(uwfHome, { recursive: true });
+  await mkdir(casDir, { recursive: true });
+  // Programmatic CLI APIs (cmdWorkflowAdd, cmdThreadStart) read the global CAS
+  // directory from OCAS_HOME and the storage root from UWF_HOME.
+  savedEnv = { uwf: process.env.UWF_HOME, ocas: process.env.OCAS_HOME };
+  process.env.UWF_HOME = uwfHome;
+  process.env.OCAS_HOME = casDir;
+});
+
+afterEach(async () => {
+  process.env.UWF_HOME = savedEnv.uwf;
+  process.env.OCAS_HOME = savedEnv.ocas;
+  await rm(tmpDir, { recursive: true, force: true });
+});
+
+// ── helpers ──────────────────────────────────────────────────────────────────
+
+/**
+ * Write a `config.yaml` into UWF_HOME that wires the default agent to the mock
+ * agent. The mock data path is baked into the agent args so the CLI's
+ * `thread exec` (without an `--agent` override) resolves it from config.
+ */
+async function writeMockConfig(mockDataFixture: string): Promise<void> {
+  const config = {
+    defaultAgent: "mock",
+    defaultModel: "test",
+    providers: {},
+    models: {},
+    agentOverrides: null,
+    agents: {
+      mock: {
+        command: process.execPath,
+        args: [AGENT_MOCK_CLI, "--mock-data", join(FIXTURES_DIR, mockDataFixture)],
+      },
+    },
+  };
+  await writeFile(join(uwfHome, "config.yaml"), stringify(config));
+}
+
+/**
+ * `cmdWorkflowAdd` enforces filename↔name consistency, so copy the fixture into
+ * UWF_HOME under `<workflow-name>.yaml` before registering it.
+ */
+async function addWorkflow(workflowFixture: string, workflowName: string): Promise<CasRef> {
+  const text = await readFile(join(FIXTURES_DIR, workflowFixture), "utf8");
+  const filePath = join(uwfHome, `${workflowName}.yaml`);
+  await writeFile(filePath, text);
+  const result = await cmdWorkflowAdd(uwfHome, filePath);
+  return result.hash;
+}
+
+type ExecResult = { stdout: string; stderr: string; exitCode: number };
+
+function runExec(threadId: string): ExecResult {
+  try {
+    const stdout = execFileSync(process.execPath, [CLI_PATH, "thread", "exec", threadId], {
+      encoding: "utf8",
+      stdio: ["ignore", "pipe", "pipe"],
+      env: { ...process.env, UWF_HOME: uwfHome, OCAS_HOME: casDir },
+      cwd: tmpDir,
+      timeout: 30000,
+    });
+    return { stdout, stderr: "", exitCode: 0 };
+  } catch (e: unknown) {
+    const err = e as NodeJS.ErrnoException & {
+      stdout?: string;
+      stderr?: string;
+      status?: number;
+    };
+    return { stdout: err.stdout ?? "", stderr: err.stderr ?? "", exitCode: err.status ?? 1 };
+  }
+}
+
+type StepOutputJson = {
+  thread: string;
+  head: string;
+  status: string;
+  currentRole: string | null;
+  done: boolean;
+};
+
+function execStep(threadId: string): StepOutputJson {
+  const { stdout, stderr, exitCode } = runExec(threadId);
+  if (exitCode !== 0) {
+    throw new Error(`thread exec failed (code ${exitCode})\nstdout: ${stdout}\nstderr: ${stderr}`);
+  }
+  return JSON.parse(stdout.trim()) as StepOutputJson;
+}
+
+function getStepNode(store: Awaited<ReturnType<typeof openStore>>, hash: string): StepNodePayload {
+  const node = store.cas.get(hash as CasRef);
+  expect(node).not.toBeNull();
+  return node!.payload as StepNodePayload;
+}
+
+function getStatus(store: Awaited<ReturnType<typeof openStore>>, outputRef: CasRef): unknown {
+  const node = store.cas.get(outputRef);
+  expect(node).not.toBeNull();
+  return (node!.payload as Record<string, unknown>).$status;
+}
+
+// ── scenarios ─────────────────────────────────────────────────────────────────
+
+describe("E2E mock-agent: full uwf pipeline", () => {
+  test("1. linear workflow runs planner then worker and reaches $END", async () => {
+    await writeMockConfig("e2e-linear.mock.yaml");
+    const workflowHash = await addWorkflow("e2e-linear.workflow.yaml", "test-linear");
+
+    const start = await cmdThreadStart(uwfHome, workflowHash, "Build the thing", uwfHome, tmpDir);
+    const threadId = start.thread;
+
+    // Capture the start node hash (thread head before any step).
+    const startHash = getThread((await createUwfStore(uwfHome)).varStore, threadId)?.head;
+    expect(startHash).toBeDefined();
+
+    // Step 1 → planner.
+    const step1 = execStep(threadId);
+    expect(step1.thread).toBe(threadId);
+    expect(step1.done).toBe(false);
+    expect(step1.status).toBe("idle");
+    expect(step1.currentRole).toBe("worker");
+
+    // Step 2 → worker → $END (thread archived to history).
+    const step2 = execStep(threadId);
+    expect(step2.done).toBe(true);
+    expect(step2.status).toBe("completed");
+    expect(step2.currentRole).toBeNull();
+
+    // Verify CAS chain integrity: start → step1 → step2.
+    const store = await openStore(casDir);
+    const s1 = getStepNode(store, step1.head);
+    const s2 = getStepNode(store, step2.head);
+
+    expect(s1.role).toBe("planner");
+    expect(s1.prev).toBeNull();
+    expect(s1.start).toBe(startHash);
+
+    expect(s2.role).toBe("worker");
+    expect(s2.prev).toBe(step1.head);
+    expect(s2.start).toBe(s1.start);
+
+    // Output frontmatter statuses persisted correctly.
+    expect(getStatus(store, s1.output)).toBe("ready");
+    expect(getStatus(store, s2.output)).toBe("done");
+
+    // The start node points at the registered workflow.
+    const startNode = store.cas.get(startHash as CasRef);
+    expect((startNode!.payload as StartNodePayload).workflow).toBe(workflowHash);
+
+    // Thread is completed: removed from active index, present in history.
+    const uwf = await createUwfStore(uwfHome);
+    expect(getThread(uwf.varStore, threadId)).toBeNull();
+    const hist = findHistoryEntry(uwf.varStore, threadId);
+    expect(hist).not.toBeNull();
+    expect(hist!.head).toBe(step2.head);
+  });
+
+  test("2. branching workflow loops developer→reviewer→developer→reviewer→$END", async () => {
+    await writeMockConfig("e2e-loop.mock.yaml");
+    const workflowHash = await addWorkflow("e2e-loop.workflow.yaml", "test-loop");
+
+    const start = await cmdThreadStart(uwfHome, workflowHash, "Implement feature", uwfHome, tmpDir);
+    const threadId = start.thread;
+
+    // 4 steps: developer, reviewer (rejected → loop), developer, reviewer (approved → $END).
+    const s1 = execStep(threadId);
+    expect(s1.status).toBe("idle");
+    expect(s1.currentRole).toBe("reviewer");
+
+    const s2 = execStep(threadId);
+    expect(s2.status).toBe("idle");
+    // reviewer rejected → loops back to developer.
+    expect(s2.currentRole).toBe("developer");
+
+    const s3 = execStep(threadId);
+    expect(s3.status).toBe("idle");
+    expect(s3.currentRole).toBe("reviewer");
+
+    const s4 = execStep(threadId);
+    expect(s4.done).toBe(true);
+    expect(s4.status).toBe("completed");
+
+    // Verify the chain order and roles.
+    const store = await openStore(casDir);
+    const n1 = getStepNode(store, s1.head);
+    const n2 = getStepNode(store, s2.head);
+    const n3 = getStepNode(store, s3.head);
+    const n4 = getStepNode(store, s4.head);
+
+    expect([n1.role, n2.role, n3.role, n4.role]).toEqual([
+      "developer",
+      "reviewer",
+      "developer",
+      "reviewer",
+    ]);
+    expect(n1.prev).toBeNull();
+    expect(n2.prev).toBe(s1.head);
+    expect(n3.prev).toBe(s2.head);
+    expect(n4.prev).toBe(s3.head);
+
+    // All steps share the same start node.
+    expect(new Set([n1.start, n2.start, n3.start, n4.start]).size).toBe(1);
+
+    // Statuses drove the loop routing.
+    expect(getStatus(store, n1.output)).toBe("review_needed");
+    expect(getStatus(store, n2.output)).toBe("rejected");
+    expect(getStatus(store, n3.output)).toBe("review_needed");
+    expect(getStatus(store, n4.output)).toBe("approved");
+
+    const uwf = await createUwfStore(uwfHome);
+    expect(getThread(uwf.varStore, threadId)).toBeNull();
+    expect(findHistoryEntry(uwf.varStore, threadId)).not.toBeNull();
+  });
+
+  test("3. role mismatch in mock data makes the agent exit with an error", async () => {
+    // Reuses the linear workflow but with a mock whose step[1].role is wrong.
+    await writeMockConfig("e2e-mismatch.mock.yaml");
+    const workflowHash = await addWorkflow("e2e-linear.workflow.yaml", "test-linear");
+
+    const start = await cmdThreadStart(uwfHome, workflowHash, "Build the thing", uwfHome, tmpDir);
+    const threadId = start.thread;
+
+    // Step 1 (planner) matches and succeeds.
+    const step1 = execStep(threadId);
+    expect(step1.status).toBe("idle");
+    expect(step1.currentRole).toBe("worker");
+
+    // Step 2: moderator routes to "worker" but mock step[1].role is "planner".
+    const result = runExec(threadId);
+    expect(result.exitCode).not.toBe(0);
+    expect(`${result.stdout}\n${result.stderr}`).toMatch(/expected role "planner"/);
+
+    // The thread remains active (no step node was written for the failed step).
+    const uwf = await createUwfStore(uwfHome);
+    expect(getThread(uwf.varStore, threadId)).not.toBeNull();
+    expect(getThread(uwf.varStore, threadId)!.head).toBe(step1.head);
+  });
+});
@@ -0,0 +1,13 @@
+steps:
+  - role: planner
+    output: |
+      ---
+      $status: ready
+      ---
+      Planning complete.
+  - role: worker
+    output: |
+      ---
+      $status: done
+      ---
+      Work complete.
@@ -0,0 +1,32 @@
+name: test-linear
+description: Simple 2-step linear test (planner -> worker -> $END)
+roles:
+  planner:
+    description: Plans work
+    goal: Plan the task
+    capabilities: []
+    procedure: Plan it
+    output: Output a plan and set $status to ready
+    frontmatter:
+      oneOf:
+        - properties:
+            $status: { const: ready }
+          required: [$status]
+  worker:
+    description: Does work
+    goal: Do the work
+    capabilities: []
+    procedure: Do it
+    output: Output the result and set $status to done
+    frontmatter:
+      oneOf:
+        - properties:
+            $status: { const: done }
+          required: [$status]
+graph:
+  $START:
+    _: { role: planner, prompt: 'Plan the task' }
+  planner:
+    ready: { role: worker, prompt: 'Do the work' }
+  worker:
+    done: { role: '$END', prompt: 'Done' }
@@ -0,0 +1,25 @@
+steps:
+  - role: developer
+    output: |
+      ---
+      $status: review_needed
+      ---
+      First implementation.
+  - role: reviewer
+    output: |
+      ---
+      $status: rejected
+      ---
+      Needs changes, sending back.
+  - role: developer
+    output: |
+      ---
+      $status: review_needed
+      ---
+      Second implementation addressing feedback.
+  - role: reviewer
+    output: |
+      ---
+      $status: approved
+      ---
+      Looks good, approved.
@@ -0,0 +1,36 @@
+name: test-loop
+description: Branching test where the reviewer can reject and loop back to the developer
+roles:
+  developer:
+    description: Implements changes
+    goal: Implement the change
+    capabilities: []
+    procedure: Write code
+    output: Summarize the change and set $status to review_needed
+    frontmatter:
+      oneOf:
+        - properties:
+            $status: { const: review_needed }
+          required: [$status]
+  reviewer:
+    description: Reviews changes
+    goal: Review the change
+    capabilities: []
+    procedure: Review code
+    output: Approve or reject; set $status to approved or rejected
+    frontmatter:
+      oneOf:
+        - properties:
+            $status: { const: rejected }
+          required: [$status]
+        - properties:
+            $status: { const: approved }
+          required: [$status]
+graph:
+  $START:
+    _: { role: developer, prompt: 'Implement the change' }
+  developer:
+    review_needed: { role: reviewer, prompt: 'Review the change' }
+  reviewer:
+    rejected: { role: developer, prompt: 'Fix the issues and resubmit' }
+    approved: { role: '$END', prompt: 'Approved, done' }
@@ -0,0 +1,16 @@
+# Reuses the test-linear workflow. The moderator routes step 0 -> planner and
+# step 1 -> worker, but step[1].role below is "planner", so the mock agent must
+# detect the role mismatch on the second step and exit with an error.
+steps:
+  - role: planner
+    output: |
+      ---
+      $status: ready
+      ---
+      Planning complete.
+  - role: planner
+    output: |
+      ---
+      $status: done
+      ---
+      This step claims to be planner, but the moderator routes to worker.