fix: correct survival rules data structure mismatches to prevent silent failures

- Fix processWatchdog to read Record<string, boolean> from process-alive watcher
- Fix resourceGuard to read flat {diskPct, memoryPct} from system-resource watcher
- Fix networkWatchdog to check {dnsOk, httpOk} from network watcher
- Add proper SurvivalSnapshot interface replacing Rule<any, SurvivalEffect>
- Fix rebuildHealth panicCount to count rollback-config events from DB
- Update rollback-code executor to use upulse rollback instead of git checkout
- Update tests with correct mock data structures

All survival rules now access correct data fields, preventing silent failures.
This commit is contained in:
2026-04-14 13:46:16 +00:00
parent 6b9d913ec3
commit 5e6951404b
5 changed files with 89 additions and 54 deletions
@@ -97,10 +97,9 @@ describe('executeSurvivalEffect', () => {
await executeSurvivalEffect(effect);
expect(mockExecSync).toHaveBeenCalledWith(
'cd ~/.upulse/engine && git checkout abc123',
{ timeout: 30000 },
);
expect(mockExecSync).toHaveBeenCalledWith('upulse rollback abc123', {
timeout: 30000,
});
});
test('should rollback config for all services', async () => {
+1 -3
View File
@@ -65,9 +65,7 @@ export async function executeSurvivalEffect(
case 'rollback-code': {
const to = effect.to as string;
try {
execSync(`cd ~/.upulse/engine && git checkout ${to}`, {
timeout: 30000,
});
execSync(`upulse rollback ${to}`, { timeout: 30000 });
} catch (err) {
console.error(`[survival] Failed to rollback to ${to}:`, err);
}
+12 -1
View File
@@ -60,11 +60,22 @@ export function rebuildHealth(store: PulseStore): HealthSnapshot {
};
}
// Count rollback-config events as panic events
const panicCount = recentEvents.filter((ev) => {
if (!ev.meta) return false;
try {
const meta = JSON.parse(ev.meta);
return meta.type === 'rollback-config';
} catch {
return false;
}
}).length;
return {
lastRestart,
lastGc: null, // TODO: rebuild from events
lastNotify: null, // TODO: rebuild from events
panicCount: 0, // TODO: rebuild from events
panicCount,
lastPromote,
recentErrorCount: errorEvents.length,
};
+33 -30
View File
@@ -113,10 +113,11 @@ describe('Survival Rules', () => {
const curr = {
processes: {
data: {
processes: [
{ name: 'openclaw-service', status: 'dead' },
{ name: 'other-service', status: 'running' },
],
// Record<string, boolean>: name → alive
processes: {
'openclaw-service': false,
'other-service': true,
},
},
},
health: {
@@ -138,7 +139,9 @@ describe('Survival Rules', () => {
const curr = {
processes: {
data: {
processes: [{ name: 'openclaw-service', status: 'dead' }],
processes: {
'openclaw-service': false,
},
},
},
health: {
@@ -161,13 +164,13 @@ describe('Survival Rules', () => {
const curr = {
processes: {
data: {
processes: [
{ name: 'openclaw-service', status: 'running' },
{ name: 'litellm-service', status: 'running' },
{ name: 'upulse-daemon', status: 'running' },
{ name: 'sshd', status: 'running' },
{ name: 'systemd', status: 'running' },
],
processes: {
'openclaw-service': true,
'litellm-service': true,
'upulse-daemon': true,
sshd: true,
systemd: true,
},
},
},
};
@@ -187,10 +190,8 @@ describe('Survival Rules', () => {
test('should trigger gc and clear cache when disk > 95%', async () => {
const curr = {
system: {
data: {
disk: { usagePercent: 96 },
memory: { usagePercent: 50 },
},
// system-resource watcher returns flat { diskPct, memoryPct, cpuPct, swapPct }
data: { diskPct: 96, memoryPct: 50, cpuPct: 20, swapPct: 0 },
},
};
@@ -210,10 +211,7 @@ describe('Survival Rules', () => {
test('should archive sessions and restart when memory > 90%', async () => {
const curr = {
system: {
data: {
disk: { usagePercent: 50 },
memory: { usagePercent: 92 },
},
data: { diskPct: 50, memoryPct: 92, cpuPct: 20, swapPct: 0 },
},
};
@@ -232,10 +230,7 @@ describe('Survival Rules', () => {
test('should pass through when resources are normal', async () => {
const curr = {
system: {
data: {
disk: { usagePercent: 50 },
memory: { usagePercent: 60 },
},
data: { diskPct: 50, memoryPct: 60, cpuPct: 20, swapPct: 0 },
},
};
@@ -319,9 +314,8 @@ describe('Survival Rules', () => {
test('should notify when network is down but not bypass', async () => {
const curr = {
network: {
data: {
connected: false,
},
// network watcher returns { dnsOk, httpOk, latencyMs }
data: { dnsOk: false, httpOk: false, latencyMs: 5000 },
},
};
@@ -342,9 +336,18 @@ describe('Survival Rules', () => {
test('should pass through when network is healthy', async () => {
const curr = {
network: {
data: {
connected: true,
},
data: { dnsOk: true, httpOk: true, latencyMs: 50 },
},
};
await networkWatchdog({}, curr, mockInner);
expect(mockInner).toHaveBeenCalledWith({}, curr);
});
test('should pass through when only DNS fails but HTTP is ok', async () => {
const curr = {
network: {
data: { dnsOk: false, httpOk: true, latencyMs: 100 },
},
};
+40 -16
View File
@@ -5,7 +5,12 @@
* Rules are applied in array order (first element is outermost layer).
*/
import type { Rule } from '../index.js';
import type { Rule, Sensed } from '../index.js';
import type { ErrorLogData } from '../watchers/error-log.js';
import type { LlmHealthData } from '../watchers/llm-health.js';
import type { NetworkData } from '../watchers/network.js';
import type { ProcessAliveData } from '../watchers/process-alive.js';
import type { SystemResourceData } from '../watchers/system-resource.js';
import {
ESSENTIAL_PROCESSES,
MAX_RESTART_COUNT,
@@ -13,12 +18,24 @@ import {
ROLLBACK_WINDOW_MS,
} from './constants.js';
import type { SurvivalEffect } from './executors.js';
import type { HealthSnapshot } from './health.js';
// Define proper type for survival snapshot
export interface SurvivalSnapshot {
timestamp: number;
system?: Sensed<SystemResourceData>;
processes?: Sensed<ProcessAliveData>;
network?: Sensed<NetworkData>;
errorLog?: Sensed<ErrorLogData>;
llm?: Sensed<LlmHealthData>;
health?: HealthSnapshot;
}
/**
* Panic rollback - outermost fallback
* If panicCount >= 3, rollback all configs and bypass inner layers
*/
export const panicRollback: Rule<any, SurvivalEffect> = async (
export const panicRollback: Rule<SurvivalSnapshot, SurvivalEffect> = async (
prev,
curr,
inner,
@@ -43,7 +60,7 @@ export const panicRollback: Rule<any, SurvivalEffect> = async (
/**
* Auto rollback after recent promote if too many errors
*/
export const autoRollback: Rule<any, SurvivalEffect> = async (
export const autoRollback: Rule<SurvivalSnapshot, SurvivalEffect> = async (
prev,
curr,
inner,
@@ -74,11 +91,12 @@ export const autoRollback: Rule<any, SurvivalEffect> = async (
/**
* Process watchdog - restart dead essential processes
*/
export const processWatchdog: Rule<any, SurvivalEffect> = async (
export const processWatchdog: Rule<SurvivalSnapshot, SurvivalEffect> = async (
prev,
curr,
inner,
) => {
// processes.data.processes is Record<string, boolean> (name → alive)
const processes = curr.processes?.data?.processes;
if (!processes) {
return await inner(prev, curr);
@@ -89,14 +107,19 @@ export const processWatchdog: Rule<any, SurvivalEffect> = async (
// Check each essential process
for (const serviceName of ESSENTIAL_PROCESSES) {
const process = processes.find((p: any) =>
p.name?.toLowerCase().includes(serviceName.toLowerCase()),
// Check if Record contains matching key and value is false
const isAlive = Object.entries(processes).some(
([name, alive]) =>
name.toLowerCase().includes(serviceName.toLowerCase()) && alive,
);
const isMonitored = Object.keys(processes).some((k) =>
k.toLowerCase().includes(serviceName.toLowerCase()),
);
if (!process || process.status === 'dead') {
if (isMonitored && !isAlive) {
// Process is being monitored but is dead
hasDeadProcess = true;
// Check if we've already restarted this service too many times
const restartCount = curr.health?.lastRestart?.[serviceName]?.count ?? 0;
if (restartCount < MAX_RESTART_COUNT) {
@@ -105,7 +128,6 @@ export const processWatchdog: Rule<any, SurvivalEffect> = async (
service: serviceName,
});
} else {
// Too many restarts, just notify
effects.push({
type: 'notify-owner',
message: `[CRITICAL] Process ${serviceName} failed ${restartCount} times, needs manual intervention`,
@@ -126,7 +148,7 @@ export const processWatchdog: Rule<any, SurvivalEffect> = async (
/**
* Resource guard - handle disk/memory pressure
*/
export const resourceGuard: Rule<any, SurvivalEffect> = async (
export const resourceGuard: Rule<SurvivalSnapshot, SurvivalEffect> = async (
prev,
curr,
inner,
@@ -136,8 +158,9 @@ export const resourceGuard: Rule<any, SurvivalEffect> = async (
return await inner(prev, curr);
}
const diskUsage = system.disk?.usagePercent ?? 0;
const memUsage = system.memory?.usagePercent ?? 0;
// system-resource watcher returns flat { diskPct, memoryPct, cpuPct, swapPct }
const diskUsage = system.diskPct ?? 0;
const memUsage = system.memoryPct ?? 0;
const effects: SurvivalEffect[] = [];
@@ -167,7 +190,7 @@ export const resourceGuard: Rule<any, SurvivalEffect> = async (
/**
* LLM watchdog - monitor LLM service health
*/
export const llmWatchdog: Rule<any, SurvivalEffect> = async (
export const llmWatchdog: Rule<SurvivalSnapshot, SurvivalEffect> = async (
prev,
curr,
inner,
@@ -203,14 +226,15 @@ export const llmWatchdog: Rule<any, SurvivalEffect> = async (
/**
* Network watchdog - monitor connectivity (notify-only stub)
*/
export const networkWatchdog: Rule<any, SurvivalEffect> = async (
export const networkWatchdog: Rule<SurvivalSnapshot, SurvivalEffect> = async (
prev,
curr,
inner,
) => {
const network = curr.network?.data;
if (network && network.connected === false) {
// network watcher returns { dnsOk, httpOk, latencyMs }; connectivity is lost when both fail
if (network && !network.dnsOk && !network.httpOk) {
const effects: SurvivalEffect[] = [
{
type: 'notify-owner',
@@ -230,7 +254,7 @@ export const networkWatchdog: Rule<any, SurvivalEffect> = async (
/**
* Error escalation - accelerate patrol on errors (notify-only stub)
*/
export const errorEscalate: Rule<any, SurvivalEffect> = async (
export const errorEscalate: Rule<SurvivalSnapshot, SurvivalEffect> = async (
prev,
curr,
inner,