fix: detect crashed threads as failed instead of stuck running

- resolveThreadListStatus() checks CAS chain for __end__ node
- Stale .running markers no longer cause false 'running' status
- Distinguish 'failed' (returnCode != 0) from 'completed'
- Worker signal handlers (SIGINT/SIGTERM) clean up .running files
- listRunningThreads filters out terminated threads with stale markers

Fixes #170

小橘 <xiaoju@shazhou.work>
This commit is contained in:
2026-05-09 12:28:33 +00:00
parent d0803019b5
commit d6fe3f844c
4 changed files with 122 additions and 9 deletions
+19 -1
View File
@@ -1,3 +1,4 @@
import { unlinkSync } from "node:fs";
import { mkdir, unlink, writeFile } from "node:fs/promises";
import { createServer, type Socket } from "node:net";
import { dirname, join } from "node:path";
@@ -382,6 +383,23 @@ async function main(): Promise<void> {
let activeThreads = 0;
let shutdownTimer: ReturnType<typeof setTimeout> | null = null;
function cleanupAllRunningMarkersSync(): void {
for (const threadId of threads.keys()) {
try {
unlinkSync(join(storageRoot, "logs", hash, `${threadId}.running`));
} catch {
// ignore missing file or other fs errors
}
}
}
for (const sig of ["SIGINT", "SIGTERM"] as const) {
process.on(sig, () => {
cleanupAllRunningMarkersSync();
process.exit(sig === "SIGINT" ? 130 : 143);
});
}
const cas = createCasStore(getGlobalCasDir(storageRoot));
const workerCtlPath = join(storageRoot, "workers", `${hash}.json`);
@@ -498,8 +516,8 @@ async function main(): Promise<void> {
const message = e instanceof Error ? e.message : String(e);
bootLog("Q3MN8YKW", `thread ${threadId} failed: ${message}`);
} finally {
threads.delete(threadId);
await unlink(runningPath).catch(() => {});
threads.delete(threadId);
bumpDone();
socket?.end();
}