improve: solve-issue — add mandatory verification and escalation steps

Fixes hallucination issues observed in thread 06F7FSTXQGY3D5CY5YPQFK2Y3W:

1. Developer self-verification (critical): Added step 12, which requires verifying the branch, file existence, and git status before reporting a done status. Prevents hallucinated completions that were never backed by actual tool execution.
2. Reviewer hard-check enforcement (critical): Added a critical warning and a step 0 that requires cd/pwd verification before the review. Prevents false rejections based on assumptions rather than actual path checks.
3. Test debugging escalation (medium): Added structured debugging guidance with an escalation path after 3 test cycles. Prevents infinite retry loops by providing a strategy and fail-fast guidance.

Also added 3 test cases to verify that the new procedure steps exist.

Based on change plan 9EVZPDTS16PMG, which analyzed execution anomalies that resulted in 58% waste (13 of 23 minutes).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-30 08:40:34 +00:00
parent 0fcab06b80
commit 0ece23f03e
2 changed files with 59 additions and 1 deletion
@@ -61,6 +61,17 @@ roles:
      9. Implement the code to make tests pass
      10. Ensure `bun run build` passes with no errors
      11. Run `bun test` to verify all tests pass
+          - If tests fail on first run:
+            * Read the test output carefully for missing imports or setup issues
+            * Check if you're running tests from the correct working directory (package root vs workspace root)
+            * Fix the immediate issue and rerun ONCE
+            * If tests still fail after 2 attempts: check the test spec for ambiguities
+            * If stuck after 3 test cycles: set $status=failed with detailed error report rather than continuing blind retries
+      12. MANDATORY VERIFICATION before reporting done:
+          - Run `git branch --show-current` and confirm branch name matches expected
+          - Run `git status` and verify changed files exist
+          - Run `ls -la <key-implementation-files>` to verify they exist on disk
+          - If ANY verification fails: fix the problem and re-run these checks; do NOT report done until every check passes

      If you cannot complete the implementation (e.g. the issue is too complex, blocked by external factors,
      or repeated attempts fail), set $status=failed with a reason.
@@ -85,7 +96,12 @@ roles:
    procedure: |
      The worktree path is provided in your task prompt. cd into it first.

-      Before reviewing, verify the git branch:
+      CRITICAL: You MUST execute every verification command below. Do NOT report results without running the actual commands. Do NOT rely on prior context or assumptions.
+
+      Before reviewing, verify the worktree and branch exist:
+      0. Run `cd <worktree-path> && pwd` to confirm the path is accessible
+         - If the cd fails: the worktree does not exist or is not accessible; reject with that reason
+         - If the cd succeeds: proceed with step 1 below
      1. Run `git branch --show-current` — confirm the branch name references the issue number being worked on
      2. If the branch doesn't correspond to the issue, flag it in your output and reject

@@ -103,4 +103,46 @@ describe("solve-issue workflow: tea pr create worktree fix", () => {
    expect(committedVariant).toBeDefined();
    expect(committedVariant.required).toContain("$status");
  });
+
+  test("developer procedure should include mandatory verification step", async () => {
+    const yamlContent = await readFile(workflowPath, "utf-8");
+    const workflow = parse(yamlContent) as WorkflowPayload;
+
+    const developerProcedure = workflow.roles.developer?.procedure;
+    expect(developerProcedure).toBeDefined();
+
+    // Require every verification command by name; a loose "verify … exist" match would pass without the `ls -la` step.
+    expect(developerProcedure).toContain("MANDATORY VERIFICATION");
+    expect(developerProcedure).toContain("git branch --show-current");
+    expect(developerProcedure).toContain("git status");
+    expect(developerProcedure).toContain("ls -la");
+  });
+
+  test("reviewer procedure should enforce worktree path verification", async () => {
+    // Read the workflow definition from disk and parse it in one step.
+    const parsed = parse(await readFile(workflowPath, "utf-8")) as WorkflowPayload;
+    const procedureText = parsed.roles.reviewer?.procedure;
+    // The reviewer role must define a procedure at all.
+    expect(procedureText).toBeDefined();
+
+    // The procedure must contain the CRITICAL warning, a `cd … pwd` check on a
+    // single line, and the sentence forbidding unverified reporting.
+    const mustRunCommands = "Do NOT report results without running the actual commands";
+    expect(procedureText).toContain("CRITICAL");
+    expect(procedureText).toMatch(/cd.*pwd/);
+    expect(procedureText).toContain(mustRunCommands);
+  });
+
+  test("developer procedure should include test debugging escalation", async () => {
+    const yamlContent = await readFile(workflowPath, "utf-8");
+    const workflow = parse(yamlContent) as WorkflowPayload;
+
+    const developerProcedure = workflow.roles.developer?.procedure;
+    expect(developerProcedure).toBeDefined();
+
+    // Tie $status=failed to the escalation bullet itself; the pre-existing fallback paragraph already contains that token.
+    expect(developerProcedure).toMatch(/tests fail.*first run/i);
+    expect(developerProcedure).toMatch(/3 test cycles|after 3 attempts/i);
+    expect(developerProcedure).toMatch(/(3 test cycles|after 3 attempts).*\$status=failed/i);
+  });
 });