Compare commits

..

No commits in common. "master" and "feat/restart-gateway-workflow" have entirely different histories.

6 changed files with 161 additions and 16 deletions

View File

@ -12,15 +12,6 @@ senses:
timeout: 30s
workflows:
develop-sense:
restart-gateway:
concurrency: 1
overflow: queue
develop-workflow:
concurrency: 1
overflow: queue
solve-issue:
concurrency: 1
overflow: queue
extract-knowledge:
concurrency: 1
overflow: queue
overflow: drop

View File

@ -0,0 +1,14 @@
-- Migration: 0001_init
-- Creates the hermes_gateway_health table for hermes-gateway-health sense.
-- IF NOT EXISTS makes this statement a no-op when the table is already present.
-- Every column is NOT NULL and none declares a DEFAULT, so an INSERT must supply
-- a value for every column except id.
CREATE TABLE IF NOT EXISTS hermes_gateway_health (
-- Surrogate row key, assigned automatically on insert.
id INTEGER PRIMARY KEY AUTOINCREMENT,
-- Presumably the sample timestamp; the unit is not visible here -- confirm against the writer.
ts INTEGER NOT NULL,
-- Presumably a 0/1 flag (no boolean column type is used) -- confirm against the writer.
alive INTEGER NOT NULL,
main_pid INTEGER NOT NULL,
rss_bytes INTEGER NOT NULL,
cpu_percent REAL NOT NULL,
uptime_sec INTEGER NOT NULL,
active_sessions INTEGER NOT NULL,
child_process_count INTEGER NOT NULL
);

View File

@ -0,0 +1,7 @@
-- Migration: 0002_add_http_probe
-- HTTP reachability columns for hermes-gateway-health sense.
-- Each column is added as NOT NULL with an explicit DEFAULT: SQLite only accepts
-- ADD COLUMN ... NOT NULL when a non-NULL default is given, and rows that existed
-- before this migration read back that default (0 or '').
-- These statements are not guarded, so running the migration twice fails on the
-- already-existing column.
ALTER TABLE hermes_gateway_health ADD COLUMN http_ok INTEGER NOT NULL DEFAULT 0;
ALTER TABLE hermes_gateway_health ADD COLUMN http_status_code INTEGER NOT NULL DEFAULT 0;
ALTER TABLE hermes_gateway_health ADD COLUMN http_latency_ms INTEGER NOT NULL DEFAULT 0;
ALTER TABLE hermes_gateway_health ADD COLUMN http_error TEXT NOT NULL DEFAULT '';

View File

@ -1,4 +1,6 @@
import { execFile } from "node:child_process";
export { hermesGatewayHealth as table } from "./schema.ts";
/** Keep subprocess deadlines slightly under typical sense timeout (30s). */
const EXEC_TIMEOUT_MS = 25_000;
@ -7,7 +9,7 @@ const HTTP_TIMEOUT_MS = Math.min(23_000, EXEC_TIMEOUT_MS - 2000);
const HTTP_ERROR_MAX_LEN = 256;
/** How many consecutive failures before triggering a restart. */
/** How many consecutive failures before triggering a restart workflow. */
const FAILURE_THRESHOLD = 3;
type SenseState = {
@ -337,7 +339,7 @@ export async function compute(prevState: SenseState) {
const cooldown = prevState.restartCooldownMs;
const cooldownElapsed = now - lastRestartTs >= cooldown;
// --- trigger restart? ---
// --- trigger restart workflow? ---
const shouldRestart =
consecutiveFailures >= FAILURE_THRESHOLD && cooldownElapsed;
@ -361,9 +363,14 @@ export async function compute(prevState: SenseState) {
consecutiveFailures,
};
const trigger = shouldRestart
? { command: "systemctl --user restart hermes-gateway" }
const workflow = shouldRestart
? {
name: "restart-gateway",
maxRounds: 3,
prompt: `Hermes gateway is down (${consecutiveFailures} consecutive failures). Last HTTP error: "${httpError}". systemd active+running: ${systemdActiveRunning}, process alive: ${psOk}. Restart the gateway and verify it comes back.`,
dryRun: false,
}
: null;
return { state: nextState, signal, trigger };
return { state: nextState, signal, workflow };
}

View File

@ -0,0 +1,17 @@
import { integer, real, sqliteTable, text } from "drizzle-orm/sqlite-core";
/**
 * Drizzle table definition for `hermes_gateway_health`.
 *
 * Maps camelCase property names to the snake_case column names. Every column
 * is declared `notNull()` and only `id` is generated automatically.
 *
 * NOTE(review): none of the `http_*` columns declares a `.default(...)` here,
 * so inserts typed against this table must always provide them. If the
 * underlying DDL defines defaults for those columns, confirm whether the
 * difference is intentional.
 */
export const hermesGatewayHealth = sqliteTable("hermes_gateway_health", {
// Auto-incrementing primary key.
id: integer("id").primaryKey({ autoIncrement: true }),
// Presumably the sample timestamp; unit not visible here — TODO confirm.
ts: integer("ts").notNull(),
// Stored as an integer; presumably a 0/1 flag — TODO confirm.
alive: integer("alive").notNull(),
mainPid: integer("main_pid").notNull(),
rssBytes: integer("rss_bytes").notNull(),
cpuPercent: real("cpu_percent").notNull(),
uptimeSec: integer("uptime_sec").notNull(),
activeSessions: integer("active_sessions").notNull(),
childProcessCount: integer("child_process_count").notNull(),
// HTTP probe results; httpOk is stored as an integer, presumably a 0/1 flag — TODO confirm.
httpOk: integer("http_ok").notNull(),
httpStatusCode: integer("http_status_code").notNull(),
httpLatencyMs: integer("http_latency_ms").notNull(),
httpError: text("http_error").notNull(),
});

View File

@ -0,0 +1,109 @@
import { execFile } from "node:child_process";
import type { RoleResult, ThreadContext, WorkflowDefinition } from "@uncaged/nerve-core";
import { END } from "@uncaged/nerve-core";
const EXEC_TIMEOUT_MS = 30_000;
const VERIFY_DELAY_MS = 5_000;
/** Outcome of one subprocess run as reported by `exec`. */
type ExecResult = {
// 0 when the command succeeded; non-zero otherwise (-1 when no numeric status is available).
exitCode: number;
// Text captured from the child's standard output.
stdout: string;
// Text reported for the child's standard error.
stderr: string;
};
/**
 * Runs `file` with `args` through `execFile` and reports the outcome.
 *
 * The returned promise never rejects: failures are folded into the result so
 * callers can branch on `exitCode` instead of using try/catch.
 *
 * @param file - Executable to launch.
 * @param args - Arguments passed to the executable.
 * @returns Captured stdout/stderr plus an exit code: `0` on success, the
 *   child's own exit status when it exited non-zero, or `-1` when no numeric
 *   status exists (spawn failure such as ENOENT, timeout, killed by signal,
 *   or output exceeding `maxBuffer`).
 */
function exec(file: string, args: string[]): Promise<ExecResult> {
  return new Promise((resolve) => {
    execFile(
      file,
      args,
      {
        encoding: "utf8",
        timeout: EXEC_TIMEOUT_MS,
        maxBuffer: 4 * 1024 * 1024,
      },
      (err, stdout, stderr) => {
        let exitCode = 0;
        if (err) {
          // execFile exposes the child's exit status on `err.code`. It is a
          // string (e.g. "ENOENT") for spawn errors and null/undefined when
          // the child was killed, so only a numeric value is a real status.
          const code: unknown = err.code;
          exitCode = typeof code === "number" ? code : -1;
        }

        let stderrText = String(stderr ?? "");
        if (err && exitCode === -1 && stderrText === "") {
          // The child produced no diagnostics of its own (it may never have
          // started); fall back to the error message so callers can report why.
          stderrText = err.message;
        }

        resolve({
          exitCode,
          stdout: String(stdout ?? ""),
          stderr: stderrText,
        });
      },
    );
  });
}
/**
 * Returns a promise that resolves (with no value) once a `setTimeout` of
 * `ms` milliseconds fires. It never rejects.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/** Metadata attached to the result of the `restarter` role. */
type RestartMeta = {
// The command that was run, as a human-readable string.
action: string;
// Exit code reported by `exec` for that command.
exitCode: number;
// stdout and stderr joined by a newline, trimmed and capped at 500 characters.
output: string;
};
/** Metadata attached to the result of the `verifier` role. */
type VerifyMeta = {
// True only when activeState is "active" and subState is "running".
alive: boolean;
// Value of systemd's ActiveState property, or "unknown" when it was not reported.
activeState: string;
// Value of systemd's SubState property, or "unknown" when it was not reported.
subState: string;
};
/**
 * Workflow role: runs `systemctl --user restart hermes-gateway`.
 *
 * The context argument is not used. `content` states whether the command
 * succeeded (exit code 0) or failed, including the exit code and trimmed
 * stderr on failure. `meta` records the command, its exit code, and the
 * combined stdout/stderr truncated to 500 characters.
 */
async function restarter(_ctx: ThreadContext): Promise<RoleResult<RestartMeta>> {
  const { exitCode, stdout, stderr } = await exec("systemctl", [
    "--user",
    "restart",
    "hermes-gateway",
  ]);

  let content: string;
  if (exitCode === 0) {
    content = "Gateway restart command succeeded.";
  } else {
    content = `Gateway restart failed (exit ${exitCode}): ${stderr.trim()}`;
  }

  // Keep the recorded output small; only the first 500 characters are stored.
  const output = `${stdout}\n${stderr}`.trim().slice(0, 500);

  return {
    content,
    meta: {
      action: "systemctl --user restart hermes-gateway",
      exitCode,
      output,
    },
  };
}
/**
 * Workflow role: checks whether the `hermes-gateway` user unit is running.
 *
 * Waits `VERIFY_DELAY_MS`, then reads the unit's `ActiveState` and `SubState`
 * via `systemctl --user show`. The context argument is not used. The gateway
 * counts as alive only when the pair is exactly `active` / `running`; a
 * property missing from the output is reported as `"unknown"`.
 */
async function verifier(_ctx: ThreadContext): Promise<RoleResult<VerifyMeta>> {
  // Give the unit a moment to come up before sampling its state.
  await sleep(VERIFY_DELAY_MS);

  const showArgs = [
    "--user",
    "--no-pager",
    "show",
    "hermes-gateway",
    "-p", "ActiveState",
    "-p", "SubState",
  ];
  const { stdout } = await exec("systemctl", showArgs);

  // Collect `Key=Value` lines; a later duplicate of a key overrides an earlier one.
  const props = new Map<string, string>();
  for (const rawLine of stdout.split("\n")) {
    const entry = rawLine.trim();
    const eq = entry.indexOf("=");
    if (eq > 0) {
      props.set(entry.slice(0, eq), entry.slice(eq + 1));
    }
  }

  const activeState = props.get("ActiveState") ?? "unknown";
  const subState = props.get("SubState") ?? "unknown";
  const alive = activeState === "active" && subState === "running";

  let content: string;
  if (alive) {
    content = `Gateway recovered: ${activeState} (${subState}).`;
  } else {
    content = `Gateway still down: ${activeState} (${subState}). May need manual intervention.`;
  }

  return { content, meta: { alive, activeState, subState } };
}
/**
 * The `restart-gateway` workflow: one `restarter` step followed by one
 * `verifier` step, after which the workflow ends.
 */
export const workflow: WorkflowDefinition<Record<"restarter", RestartMeta> & Record<"verifier", VerifyMeta>> = {
  name: "restart-gateway",
  roles: { restarter, verifier },
  moderator(ctx) {
    // The next role is chosen solely from how many steps `ctx.steps` already holds.
    switch (ctx.steps.length) {
      case 0:
        return "restarter";
      case 1:
        return "verifier";
      default:
        return END;
    }
  },
};