fix: correct survival rules data structure mismatches to prevent silent failures
- Fix processWatchdog to read Record<string, boolean> from process-alive watcher
- Fix resourceGuard to read flat {diskPct, memoryPct} from system-resource watcher
- Fix networkWatchdog to check {dnsOk, httpOk} from network watcher
- Add proper SurvivalSnapshot interface replacing Rule<any, SurvivalEffect>
- Fix rebuildHealth panicCount to count rollback-config events from DB
- Update rollback-code executor to use upulse rollback instead of git checkout
- Update tests with correct mock data structures
All survival rules now access correct data fields, preventing silent failures.
This commit is contained in:
@@ -97,10 +97,9 @@ describe('executeSurvivalEffect', () => {
|
||||
|
||||
await executeSurvivalEffect(effect);
|
||||
|
||||
expect(mockExecSync).toHaveBeenCalledWith(
|
||||
'cd ~/.upulse/engine && git checkout abc123',
|
||||
{ timeout: 30000 },
|
||||
);
|
||||
expect(mockExecSync).toHaveBeenCalledWith('upulse rollback abc123', {
|
||||
timeout: 30000,
|
||||
});
|
||||
});
|
||||
|
||||
test('should rollback config for all services', async () => {
|
||||
|
||||
@@ -65,9 +65,7 @@ export async function executeSurvivalEffect(
|
||||
case 'rollback-code': {
|
||||
const to = effect.to as string;
|
||||
try {
|
||||
execSync(`cd ~/.upulse/engine && git checkout ${to}`, {
|
||||
timeout: 30000,
|
||||
});
|
||||
execSync(`upulse rollback ${to}`, { timeout: 30000 });
|
||||
} catch (err) {
|
||||
console.error(`[survival] Failed to rollback to ${to}:`, err);
|
||||
}
|
||||
|
||||
@@ -60,11 +60,22 @@ export function rebuildHealth(store: PulseStore): HealthSnapshot {
|
||||
};
|
||||
}
|
||||
|
||||
// Count rollback-config events as panic events
|
||||
const panicCount = recentEvents.filter((ev) => {
|
||||
if (!ev.meta) return false;
|
||||
try {
|
||||
const meta = JSON.parse(ev.meta);
|
||||
return meta.type === 'rollback-config';
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}).length;
|
||||
|
||||
return {
|
||||
lastRestart,
|
||||
lastGc: null, // TODO: rebuild from events
|
||||
lastNotify: null, // TODO: rebuild from events
|
||||
panicCount: 0, // TODO: rebuild from events
|
||||
panicCount,
|
||||
lastPromote,
|
||||
recentErrorCount: errorEvents.length,
|
||||
};
|
||||
|
||||
@@ -113,10 +113,11 @@ describe('Survival Rules', () => {
|
||||
const curr = {
|
||||
processes: {
|
||||
data: {
|
||||
processes: [
|
||||
{ name: 'openclaw-service', status: 'dead' },
|
||||
{ name: 'other-service', status: 'running' },
|
||||
],
|
||||
// Record<string, boolean>: name → alive
|
||||
processes: {
|
||||
'openclaw-service': false,
|
||||
'other-service': true,
|
||||
},
|
||||
},
|
||||
},
|
||||
health: {
|
||||
@@ -138,7 +139,9 @@ describe('Survival Rules', () => {
|
||||
const curr = {
|
||||
processes: {
|
||||
data: {
|
||||
processes: [{ name: 'openclaw-service', status: 'dead' }],
|
||||
processes: {
|
||||
'openclaw-service': false,
|
||||
},
|
||||
},
|
||||
},
|
||||
health: {
|
||||
@@ -161,13 +164,13 @@ describe('Survival Rules', () => {
|
||||
const curr = {
|
||||
processes: {
|
||||
data: {
|
||||
processes: [
|
||||
{ name: 'openclaw-service', status: 'running' },
|
||||
{ name: 'litellm-service', status: 'running' },
|
||||
{ name: 'upulse-daemon', status: 'running' },
|
||||
{ name: 'sshd', status: 'running' },
|
||||
{ name: 'systemd', status: 'running' },
|
||||
],
|
||||
processes: {
|
||||
'openclaw-service': true,
|
||||
'litellm-service': true,
|
||||
'upulse-daemon': true,
|
||||
sshd: true,
|
||||
systemd: true,
|
||||
},
|
||||
},
|
||||
},
|
||||
};
|
||||
@@ -187,10 +190,8 @@ describe('Survival Rules', () => {
|
||||
test('should trigger gc and clear cache when disk > 95%', async () => {
|
||||
const curr = {
|
||||
system: {
|
||||
data: {
|
||||
disk: { usagePercent: 96 },
|
||||
memory: { usagePercent: 50 },
|
||||
},
|
||||
// system-resource watcher returns flat { diskPct, memoryPct, cpuPct, swapPct }
|
||||
data: { diskPct: 96, memoryPct: 50, cpuPct: 20, swapPct: 0 },
|
||||
},
|
||||
};
|
||||
|
||||
@@ -210,10 +211,7 @@ describe('Survival Rules', () => {
|
||||
test('should archive sessions and restart when memory > 90%', async () => {
|
||||
const curr = {
|
||||
system: {
|
||||
data: {
|
||||
disk: { usagePercent: 50 },
|
||||
memory: { usagePercent: 92 },
|
||||
},
|
||||
data: { diskPct: 50, memoryPct: 92, cpuPct: 20, swapPct: 0 },
|
||||
},
|
||||
};
|
||||
|
||||
@@ -232,10 +230,7 @@ describe('Survival Rules', () => {
|
||||
test('should pass through when resources are normal', async () => {
|
||||
const curr = {
|
||||
system: {
|
||||
data: {
|
||||
disk: { usagePercent: 50 },
|
||||
memory: { usagePercent: 60 },
|
||||
},
|
||||
data: { diskPct: 50, memoryPct: 60, cpuPct: 20, swapPct: 0 },
|
||||
},
|
||||
};
|
||||
|
||||
@@ -319,9 +314,8 @@ describe('Survival Rules', () => {
|
||||
test('should notify when network is down but not bypass', async () => {
|
||||
const curr = {
|
||||
network: {
|
||||
data: {
|
||||
connected: false,
|
||||
},
|
||||
// network watcher returns { dnsOk, httpOk, latencyMs }
|
||||
data: { dnsOk: false, httpOk: false, latencyMs: 5000 },
|
||||
},
|
||||
};
|
||||
|
||||
@@ -342,9 +336,18 @@ describe('Survival Rules', () => {
|
||||
test('should pass through when network is healthy', async () => {
|
||||
const curr = {
|
||||
network: {
|
||||
data: {
|
||||
connected: true,
|
||||
},
|
||||
data: { dnsOk: true, httpOk: true, latencyMs: 50 },
|
||||
},
|
||||
};
|
||||
|
||||
await networkWatchdog({}, curr, mockInner);
|
||||
expect(mockInner).toHaveBeenCalledWith({}, curr);
|
||||
});
|
||||
|
||||
test('should pass through when only DNS fails but HTTP is ok', async () => {
|
||||
const curr = {
|
||||
network: {
|
||||
data: { dnsOk: false, httpOk: true, latencyMs: 100 },
|
||||
},
|
||||
};
|
||||
|
||||
|
||||
@@ -5,7 +5,12 @@
|
||||
* Rules are applied in array order (first element is outermost layer).
|
||||
*/
|
||||
|
||||
import type { Rule } from '../index.js';
|
||||
import type { Rule, Sensed } from '../index.js';
|
||||
import type { ErrorLogData } from '../watchers/error-log.js';
|
||||
import type { LlmHealthData } from '../watchers/llm-health.js';
|
||||
import type { NetworkData } from '../watchers/network.js';
|
||||
import type { ProcessAliveData } from '../watchers/process-alive.js';
|
||||
import type { SystemResourceData } from '../watchers/system-resource.js';
|
||||
import {
|
||||
ESSENTIAL_PROCESSES,
|
||||
MAX_RESTART_COUNT,
|
||||
@@ -13,12 +18,24 @@ import {
|
||||
ROLLBACK_WINDOW_MS,
|
||||
} from './constants.js';
|
||||
import type { SurvivalEffect } from './executors.js';
|
||||
import type { HealthSnapshot } from './health.js';
|
||||
|
||||
// Define proper type for survival snapshot
|
||||
export interface SurvivalSnapshot {
|
||||
timestamp: number;
|
||||
system?: Sensed<SystemResourceData>;
|
||||
processes?: Sensed<ProcessAliveData>;
|
||||
network?: Sensed<NetworkData>;
|
||||
errorLog?: Sensed<ErrorLogData>;
|
||||
llm?: Sensed<LlmHealthData>;
|
||||
health?: HealthSnapshot;
|
||||
}
|
||||
|
||||
/**
|
||||
* Panic rollback - outermost fallback
|
||||
* If panicCount >= 3, rollback all configs and bypass inner layers
|
||||
*/
|
||||
export const panicRollback: Rule<any, SurvivalEffect> = async (
|
||||
export const panicRollback: Rule<SurvivalSnapshot, SurvivalEffect> = async (
|
||||
prev,
|
||||
curr,
|
||||
inner,
|
||||
@@ -43,7 +60,7 @@ export const panicRollback: Rule<any, SurvivalEffect> = async (
|
||||
/**
|
||||
* Auto rollback after recent promote if too many errors
|
||||
*/
|
||||
export const autoRollback: Rule<any, SurvivalEffect> = async (
|
||||
export const autoRollback: Rule<SurvivalSnapshot, SurvivalEffect> = async (
|
||||
prev,
|
||||
curr,
|
||||
inner,
|
||||
@@ -74,11 +91,12 @@ export const autoRollback: Rule<any, SurvivalEffect> = async (
|
||||
/**
|
||||
* Process watchdog - restart dead essential processes
|
||||
*/
|
||||
export const processWatchdog: Rule<any, SurvivalEffect> = async (
|
||||
export const processWatchdog: Rule<SurvivalSnapshot, SurvivalEffect> = async (
|
||||
prev,
|
||||
curr,
|
||||
inner,
|
||||
) => {
|
||||
// processes.data.processes is Record<string, boolean> (name → alive)
|
||||
const processes = curr.processes?.data?.processes;
|
||||
if (!processes) {
|
||||
return await inner(prev, curr);
|
||||
@@ -89,14 +107,19 @@ export const processWatchdog: Rule<any, SurvivalEffect> = async (
|
||||
|
||||
// Check each essential process
|
||||
for (const serviceName of ESSENTIAL_PROCESSES) {
|
||||
const process = processes.find((p: any) =>
|
||||
p.name?.toLowerCase().includes(serviceName.toLowerCase()),
|
||||
// Check if Record contains matching key and value is false
|
||||
const isAlive = Object.entries(processes).some(
|
||||
([name, alive]) =>
|
||||
name.toLowerCase().includes(serviceName.toLowerCase()) && alive,
|
||||
);
|
||||
const isMonitored = Object.keys(processes).some((k) =>
|
||||
k.toLowerCase().includes(serviceName.toLowerCase()),
|
||||
);
|
||||
|
||||
if (!process || process.status === 'dead') {
|
||||
if (isMonitored && !isAlive) {
|
||||
// Process is being monitored but is dead
|
||||
hasDeadProcess = true;
|
||||
|
||||
// Check if we've already restarted this service too many times
|
||||
const restartCount = curr.health?.lastRestart?.[serviceName]?.count ?? 0;
|
||||
|
||||
if (restartCount < MAX_RESTART_COUNT) {
|
||||
@@ -105,7 +128,6 @@ export const processWatchdog: Rule<any, SurvivalEffect> = async (
|
||||
service: serviceName,
|
||||
});
|
||||
} else {
|
||||
// Too many restarts, just notify
|
||||
effects.push({
|
||||
type: 'notify-owner',
|
||||
message: `[CRITICAL] Process ${serviceName} failed ${restartCount} times, needs manual intervention`,
|
||||
@@ -126,7 +148,7 @@ export const processWatchdog: Rule<any, SurvivalEffect> = async (
|
||||
/**
|
||||
* Resource guard - handle disk/memory pressure
|
||||
*/
|
||||
export const resourceGuard: Rule<any, SurvivalEffect> = async (
|
||||
export const resourceGuard: Rule<SurvivalSnapshot, SurvivalEffect> = async (
|
||||
prev,
|
||||
curr,
|
||||
inner,
|
||||
@@ -136,8 +158,9 @@ export const resourceGuard: Rule<any, SurvivalEffect> = async (
|
||||
return await inner(prev, curr);
|
||||
}
|
||||
|
||||
const diskUsage = system.disk?.usagePercent ?? 0;
|
||||
const memUsage = system.memory?.usagePercent ?? 0;
|
||||
// system-resource watcher returns flat { diskPct, memoryPct, cpuPct, swapPct }
|
||||
const diskUsage = system.diskPct ?? 0;
|
||||
const memUsage = system.memoryPct ?? 0;
|
||||
|
||||
const effects: SurvivalEffect[] = [];
|
||||
|
||||
@@ -167,7 +190,7 @@ export const resourceGuard: Rule<any, SurvivalEffect> = async (
|
||||
/**
|
||||
* LLM watchdog - monitor LLM service health
|
||||
*/
|
||||
export const llmWatchdog: Rule<any, SurvivalEffect> = async (
|
||||
export const llmWatchdog: Rule<SurvivalSnapshot, SurvivalEffect> = async (
|
||||
prev,
|
||||
curr,
|
||||
inner,
|
||||
@@ -203,14 +226,15 @@ export const llmWatchdog: Rule<any, SurvivalEffect> = async (
|
||||
/**
|
||||
* Network watchdog - monitor connectivity (notify-only stub)
|
||||
*/
|
||||
export const networkWatchdog: Rule<any, SurvivalEffect> = async (
|
||||
export const networkWatchdog: Rule<SurvivalSnapshot, SurvivalEffect> = async (
|
||||
prev,
|
||||
curr,
|
||||
inner,
|
||||
) => {
|
||||
const network = curr.network?.data;
|
||||
|
||||
if (network && network.connected === false) {
|
||||
// network watcher returns { dnsOk, httpOk, latencyMs }; connectivity is lost when both fail
|
||||
if (network && !network.dnsOk && !network.httpOk) {
|
||||
const effects: SurvivalEffect[] = [
|
||||
{
|
||||
type: 'notify-owner',
|
||||
@@ -230,7 +254,7 @@ export const networkWatchdog: Rule<any, SurvivalEffect> = async (
|
||||
/**
|
||||
* Error escalation - accelerate patrol on errors (notify-only stub)
|
||||
*/
|
||||
export const errorEscalate: Rule<any, SurvivalEffect> = async (
|
||||
export const errorEscalate: Rule<SurvivalSnapshot, SurvivalEffect> = async (
|
||||
prev,
|
||||
curr,
|
||||
inner,
|
||||
|
||||
Reference in New Issue
Block a user