From 0ece23f03e821abf5deb13fa9e5037745323bfa4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=B0=8F=E6=A9=98?= <xiaoju@shazhou.work>
Date: Sat, 30 May 2026 08:40:34 +0000
Subject: [PATCH] =?UTF-8?q?improve:=20solve-issue=20=E2=80=94=20add=20mand?=
 =?UTF-8?q?atory=20verification=20and=20escalation=20steps?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes hallucination issues observed in thread 06F7FSTXQGY3D5CY5YPQFK2Y3W:

1. Developer self-verification (critical): Added step 12 requiring
   mandatory verification of branch, file existence, and git status
   before reporting done status. Prevents hallucinated completions
   without actual tool execution.

2. Reviewer hard-check enforcement (critical): Added critical warning
   and step 0 requiring cd/pwd verification before review. Prevents
   false rejections based on assumptions without actual path checks.

3. Test debugging escalation (medium): Added structured debugging
   guidance with escalation path after 3 test cycles. Prevents
   infinite retry loops by providing strategy and fail-fast guidance.

Also added 3 test cases to verify the new procedure steps exist.

Based on change plan 9EVZPDTS16PMG analyzing execution anomalies
that resulted in roughly 57% waste (13 of 23 minutes).

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 .workflows/solve-issue.yaml                   | 18 +++++++-
 .../solve-issue-tea-worktree.test.ts          | 42 +++++++++++++++++++
 2 files changed, 59 insertions(+), 1 deletion(-)
diff --git a/.workflows/solve-issue.yaml b/.workflows/solve-issue.yaml
index 2bc614e..4c1a79c 100644
--- a/.workflows/solve-issue.yaml
+++ b/.workflows/solve-issue.yaml
@@ -61,6 +61,17 @@ roles:
       9. Implement the code to make tests pass
       10. Ensure `bun run build` passes with no errors
       11. Run `bun test` to verify all tests pass
+          - If tests fail on first run:
+            * Read the test output carefully for missing imports or setup issues
+            * Check if you're running tests from the correct working directory (package root vs workspace root)
+            * Fix the immediate issue and rerun ONCE
+            * If tests still fail after 2 test cycles: check the test spec for ambiguities
+            * If stuck after 3 test cycles: set $status=failed with detailed error report rather than continuing blind retries
+      12. MANDATORY VERIFICATION before reporting done:
+          - Run `git branch --show-current` and confirm branch name matches expected
+          - Run `git status` and verify changed files exist
+          - Run `ls -la <key-implementation-files>` to verify they exist on disk
+          - If ANY verification fails: fix the underlying problem and re-run every check above; do NOT report done until all of them pass
 
       If you cannot complete the implementation (e.g. the issue is too complex, blocked by external factors,
       or repeated attempts fail), set $status=failed with a reason.
@@ -85,7 +96,12 @@ roles:
     procedure: |
       The worktree path is provided in your task prompt. cd into it first.
 
-      Before reviewing, verify the git branch:
+      CRITICAL: You MUST execute every verification command below. Do NOT report results without running the actual commands. Do NOT rely on prior context or assumptions.
+
+      Before reviewing, verify the worktree and branch exist:
+      0. Run `cd <worktree-path> && pwd` to confirm the path is accessible
+         - If the cd fails: the worktree truly doesn't exist, reject with that reason
+         - If the cd succeeds: proceed with step 1 below
       1. Run `git branch --show-current` — confirm the branch name references the issue number being worked on
       2. If the branch doesn't correspond to the issue, flag it in your output and reject
 
diff --git a/packages/cli-workflow/src/__tests__/solve-issue-tea-worktree.test.ts b/packages/cli-workflow/src/__tests__/solve-issue-tea-worktree.test.ts
index e9ae476..1a0e3b6 100644
--- a/packages/cli-workflow/src/__tests__/solve-issue-tea-worktree.test.ts
+++ b/packages/cli-workflow/src/__tests__/solve-issue-tea-worktree.test.ts
@@ -103,4 +103,46 @@ describe("solve-issue workflow: tea pr create worktree fix", () => {
     expect(committedVariant).toBeDefined();
     expect(committedVariant.required).toContain("$status");
   });
+
+  test("developer procedure should include mandatory verification step", async () => {
+    const yamlContent = await readFile(workflowPath, "utf-8");
+    const workflow = parse(yamlContent) as WorkflowPayload;
+
+    const developerProcedure = workflow.roles.developer?.procedure;
+    expect(developerProcedure).toBeDefined();
+
+    // Pin each step-12 command; "verify changed files exist" must not satisfy ls -la.
+    expect(developerProcedure).toContain("MANDATORY VERIFICATION");
+    expect(developerProcedure).toContain("git branch --show-current");
+    expect(developerProcedure).toContain("git status");
+    expect(developerProcedure).toContain("ls -la");
+  });
+
+  test("reviewer procedure should enforce worktree path verification", async () => {
+    const yamlContent = await readFile(workflowPath, "utf-8");
+    const workflow = parse(yamlContent) as WorkflowPayload;
+
+    const reviewerProcedure = workflow.roles.reviewer?.procedure;
+    expect(reviewerProcedure).toBeDefined();
+
+    // Require the CRITICAL banner, cd and pwd on one line, and the exact warning sentence.
+    expect(reviewerProcedure).toContain("CRITICAL");
+    expect(reviewerProcedure).toMatch(/cd.*pwd/);
+    expect(reviewerProcedure).toContain(
+      "Do NOT report results without running the actual commands",
+    );
+  });
+
+  test("developer procedure should include test debugging escalation", async () => {
+    const yamlContent = await readFile(workflowPath, "utf-8");
+    const workflow = parse(yamlContent) as WorkflowPayload;
+
+    const developerProcedure = workflow.roles.developer?.procedure;
+    expect(developerProcedure).toBeDefined();
+
+    // Pin the escalation bullet itself; the fallback paragraph also says $status=failed.
+    expect(developerProcedure).toMatch(/tests fail.*first run/i);
+    expect(developerProcedure).toMatch(/3 test cycles|after 3 attempts/i);
+    expect(developerProcedure).toMatch(/3 test cycles.*\$status=failed/i);
+  });
 });