Compare commits
No commits in common. "master" and "feat/restart-gateway-workflow" have entirely different histories.
master
...
feat/restart-gateway-workflow
13
nerve.yaml
13
nerve.yaml
@ -12,15 +12,6 @@ senses:
|
|||||||
timeout: 30s
|
timeout: 30s
|
||||||
|
|
||||||
workflows:
|
workflows:
|
||||||
develop-sense:
|
restart-gateway:
|
||||||
concurrency: 1
|
concurrency: 1
|
||||||
overflow: queue
|
overflow: drop
|
||||||
develop-workflow:
|
|
||||||
concurrency: 1
|
|
||||||
overflow: queue
|
|
||||||
solve-issue:
|
|
||||||
concurrency: 1
|
|
||||||
overflow: queue
|
|
||||||
extract-knowledge:
|
|
||||||
concurrency: 1
|
|
||||||
overflow: queue
|
|
||||||
|
|||||||
14
senses/hermes-gateway-health/migrations/0001_init.sql
Normal file
14
senses/hermes-gateway-health/migrations/0001_init.sql
Normal file
@ -0,0 +1,14 @@
|
|||||||
|
-- Migration: 0001_init
-- Creates the hermes_gateway_health table for hermes-gateway-health sense.
--
-- Every column is NOT NULL, so a writer must supply a value for each field
-- on every insert. IF NOT EXISTS makes re-running this migration a no-op
-- when the table is already present.

CREATE TABLE IF NOT EXISTS hermes_gateway_health (
  -- Surrogate row id assigned by SQLite.
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  -- Sample timestamp. NOTE(review): unit (seconds vs. milliseconds) is not visible here — confirm against the writer.
  ts INTEGER NOT NULL,
  -- Stored as an integer; presumably a 0/1 flag — verify against the writer.
  alive INTEGER NOT NULL,
  main_pid INTEGER NOT NULL,
  rss_bytes INTEGER NOT NULL,
  -- The only floating-point column in the table.
  cpu_percent REAL NOT NULL,
  uptime_sec INTEGER NOT NULL,
  active_sessions INTEGER NOT NULL,
  child_process_count INTEGER NOT NULL
);
|
||||||
@ -0,0 +1,7 @@
|
|||||||
|
-- Migration: 0002_add_http_probe
-- HTTP reachability columns for hermes-gateway-health sense.
--
-- Each column is added as NOT NULL with an explicit DEFAULT: SQLite only
-- accepts ADD COLUMN ... NOT NULL when a non-NULL default is supplied, and
-- the default is what rows written before this migration will report.

-- Integer column; presumably a 0/1 flag — verify against the writer.
ALTER TABLE hermes_gateway_health ADD COLUMN http_ok INTEGER NOT NULL DEFAULT 0;
ALTER TABLE hermes_gateway_health ADD COLUMN http_status_code INTEGER NOT NULL DEFAULT 0;
ALTER TABLE hermes_gateway_health ADD COLUMN http_latency_ms INTEGER NOT NULL DEFAULT 0;
-- Empty string (not NULL) is the default error text.
ALTER TABLE hermes_gateway_health ADD COLUMN http_error TEXT NOT NULL DEFAULT '';
|
||||||
@ -1,4 +1,6 @@
|
|||||||
import { execFile } from "node:child_process";
|
import { execFile } from "node:child_process";
|
||||||
|
export { hermesGatewayHealth as table } from "./schema.ts";
|
||||||
|
|
||||||
/** Keep subprocess deadlines slightly under typical sense timeout (30s). */
|
/** Keep subprocess deadlines slightly under typical sense timeout (30s). */
|
||||||
const EXEC_TIMEOUT_MS = 25_000;
|
const EXEC_TIMEOUT_MS = 25_000;
|
||||||
|
|
||||||
@ -7,7 +9,7 @@ const HTTP_TIMEOUT_MS = Math.min(23_000, EXEC_TIMEOUT_MS - 2000);
|
|||||||
|
|
||||||
const HTTP_ERROR_MAX_LEN = 256;
|
const HTTP_ERROR_MAX_LEN = 256;
|
||||||
|
|
||||||
/** How many consecutive failures before triggering a restart. */
|
/** How many consecutive failures before triggering a restart workflow. */
|
||||||
const FAILURE_THRESHOLD = 3;
|
const FAILURE_THRESHOLD = 3;
|
||||||
|
|
||||||
type SenseState = {
|
type SenseState = {
|
||||||
@ -337,7 +339,7 @@ export async function compute(prevState: SenseState) {
|
|||||||
const cooldown = prevState.restartCooldownMs;
|
const cooldown = prevState.restartCooldownMs;
|
||||||
const cooldownElapsed = now - lastRestartTs >= cooldown;
|
const cooldownElapsed = now - lastRestartTs >= cooldown;
|
||||||
|
|
||||||
// --- trigger restart? ---
|
// --- trigger restart workflow? ---
|
||||||
const shouldRestart =
|
const shouldRestart =
|
||||||
consecutiveFailures >= FAILURE_THRESHOLD && cooldownElapsed;
|
consecutiveFailures >= FAILURE_THRESHOLD && cooldownElapsed;
|
||||||
|
|
||||||
@ -361,9 +363,14 @@ export async function compute(prevState: SenseState) {
|
|||||||
consecutiveFailures,
|
consecutiveFailures,
|
||||||
};
|
};
|
||||||
|
|
||||||
const trigger = shouldRestart
|
const workflow = shouldRestart
|
||||||
? { command: "systemctl --user restart hermes-gateway" }
|
? {
|
||||||
|
name: "restart-gateway",
|
||||||
|
maxRounds: 3,
|
||||||
|
prompt: `Hermes gateway is down (${consecutiveFailures} consecutive failures). Last HTTP error: "${httpError}". systemd active+running: ${systemdActiveRunning}, process alive: ${psOk}. Restart the gateway and verify it comes back.`,
|
||||||
|
dryRun: false,
|
||||||
|
}
|
||||||
: null;
|
: null;
|
||||||
|
|
||||||
return { state: nextState, signal, trigger };
|
return { state: nextState, signal, workflow };
|
||||||
}
|
}
|
||||||
|
|||||||
17
senses/hermes-gateway-health/src/schema.ts
Normal file
17
senses/hermes-gateway-health/src/schema.ts
Normal file
@ -0,0 +1,17 @@
|
|||||||
|
import { integer, real, sqliteTable, text } from "drizzle-orm/sqlite-core";
|
||||||
|
|
||||||
|
/**
 * Drizzle table definition for `hermes_gateway_health`.
 *
 * Column names and nullability mirror the SQL migrations for this sense. The
 * four `http_*` columns were introduced by a later `ALTER TABLE` with explicit
 * `DEFAULT` values, so the same defaults are declared here to keep the ORM
 * schema and the database DDL in agreement.
 */
export const hermesGatewayHealth = sqliteTable("hermes_gateway_health", {
  id: integer("id").primaryKey({ autoIncrement: true }),
  ts: integer("ts").notNull(),
  alive: integer("alive").notNull(),
  mainPid: integer("main_pid").notNull(),
  rssBytes: integer("rss_bytes").notNull(),
  cpuPercent: real("cpu_percent").notNull(),
  uptimeSec: integer("uptime_sec").notNull(),
  activeSessions: integer("active_sessions").notNull(),
  childProcessCount: integer("child_process_count").notNull(),
  // Defaults below match `DEFAULT 0` / `DEFAULT ''` in the http-probe migration.
  httpOk: integer("http_ok").notNull().default(0),
  httpStatusCode: integer("http_status_code").notNull().default(0),
  httpLatencyMs: integer("http_latency_ms").notNull().default(0),
  httpError: text("http_error").notNull().default(""),
});
|
||||||
109
workflows/restart-gateway/src/index.ts
Normal file
109
workflows/restart-gateway/src/index.ts
Normal file
@ -0,0 +1,109 @@
|
|||||||
|
import { execFile } from "node:child_process";
|
||||||
|
import type { RoleResult, ThreadContext, WorkflowDefinition } from "@uncaged/nerve-core";
|
||||||
|
import { END } from "@uncaged/nerve-core";
|
||||||
|
|
||||||
|
/** Deadline, in milliseconds, applied to every subprocess started through `exec`. */
const EXEC_TIMEOUT_MS = 30_000;
/** Pause, in milliseconds, the verifier role waits before probing the unit state. */
const VERIFY_DELAY_MS = 5_000;

/** Normalised outcome of one subprocess run. */
type ExecResult = {
  /**
   * 0 on success; the child's own exit status when it exited non-zero;
   * -1 when the failure carries no numeric status (e.g. the binary could not
   * be spawned, or the child was killed rather than exiting).
   */
  exitCode: number;
  stdout: string;
  stderr: string;
};

/**
 * Run `file` with `args` (no shell) and report the outcome without rejecting.
 *
 * @param file - Executable name or path.
 * @param args - Argument vector passed verbatim to the executable.
 * @returns A promise that always resolves; failures are encoded in `exitCode`.
 */
function exec(file: string, args: string[]): Promise<ExecResult> {
  return new Promise((resolve) => {
    execFile(
      file,
      args,
      {
        encoding: "utf8",
        timeout: EXEC_TIMEOUT_MS,
        maxBuffer: 4 * 1024 * 1024,
      },
      (err, stdout, stderr) => {
        // execFile reports the child's exit status on `error.code` (there is no
        // `status` field on these errors). `code` can also be a string such as
        // "ENOENT", or absent when the child was killed, hence the type check.
        const code: unknown = (err as { code?: unknown } | null)?.code;
        let exitCode = 0;
        if (err) {
          exitCode = typeof code === "number" ? code : -1;
        }
        resolve({
          exitCode,
          stdout: String(stdout ?? ""),
          stderr: String(stderr ?? ""),
        });
      },
    );
  });
}
|
||||||
|
|
||||||
|
/**
 * Promise-based delay.
 *
 * @param ms - Delay handed to `setTimeout`, in milliseconds.
 * @returns A promise that resolves (with no value) once the timer fires.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
|
||||||
|
|
||||||
|
/** Metadata attached to the result of the `restarter` role. */
type RestartMeta = {
  /** Human-readable form of the command that was run. */
  action: string;
  /** Exit code reported by `exec` for the restart command. */
  exitCode: number;
  /** Combined stdout/stderr of the restart command, truncated by the producer. */
  output: string;
};

/** Metadata attached to the result of the `verifier` role. */
type VerifyMeta = {
  /** True only when the unit reported ActiveState=active and SubState=running. */
  alive: boolean;
  /** Value of systemd's ActiveState property, or "unknown" if it was not found. */
  activeState: string;
  /** Value of systemd's SubState property, or "unknown" if it was not found. */
  subState: string;
};
|
||||||
|
|
||||||
|
/**
 * Role: issue `systemctl --user restart hermes-gateway` and report the outcome.
 *
 * The thread context is not consulted. The result's `content` is a one-line
 * summary; `meta` carries the exit code and at most the first 500 characters
 * of the combined stdout/stderr.
 */
async function restarter(_ctx: ThreadContext): Promise<RoleResult<RestartMeta>> {
  const { exitCode, stdout, stderr } = await exec("systemctl", [
    "--user",
    "restart",
    "hermes-gateway",
  ]);

  let content: string;
  if (exitCode === 0) {
    content = "Gateway restart command succeeded.";
  } else {
    content = `Gateway restart failed (exit ${exitCode}): ${stderr.trim()}`;
  }

  // Cap the captured output so the metadata stays small.
  const output = `${stdout}\n${stderr}`.trim().slice(0, 500);

  return {
    content,
    meta: {
      action: "systemctl --user restart hermes-gateway",
      exitCode,
      output,
    },
  };
}
|
||||||
|
|
||||||
|
/**
 * Role: after a short pause, ask systemd for the unit's ActiveState/SubState
 * and report whether the gateway is back.
 *
 * The thread context is not consulted. A property that does not appear in the
 * `systemctl show` output is reported as "unknown", which also makes `alive`
 * false.
 */
async function verifier(_ctx: ThreadContext): Promise<RoleResult<VerifyMeta>> {
  // Give the service a few seconds to come up before probing it.
  await sleep(VERIFY_DELAY_MS);

  const probe = await exec("systemctl", [
    "--user",
    "--no-pager",
    "show",
    "hermes-gateway",
    "-p", "ActiveState",
    "-p", "SubState",
  ]);

  const entries = probe.stdout.split("\n").map((raw) => raw.trim());

  // Last occurrence of `key=` wins; "unknown" when the key never appears.
  const readProperty = (key: string): string => {
    const prefix = `${key}=`;
    let value = "unknown";
    for (const entry of entries) {
      if (entry.startsWith(prefix)) {
        value = entry.slice(prefix.length);
      }
    }
    return value;
  };

  const activeState = readProperty("ActiveState");
  const subState = readProperty("SubState");
  const alive = activeState === "active" && subState === "running";

  let content: string;
  if (alive) {
    content = `Gateway recovered: ${activeState} (${subState}).`;
  } else {
    content = `Gateway still down: ${activeState} (${subState}). May need manual intervention.`;
  }

  return { content, meta: { alive, activeState, subState } };
}
|
||||||
|
|
||||||
|
/**
 * The `restart-gateway` workflow: a fixed two-step sequence.
 *
 * The moderator picks the next role purely from how many steps have already
 * been recorded: none -> `restarter`, one -> `verifier`, anything else -> END.
 */
export const workflow: WorkflowDefinition<Record<"restarter", RestartMeta> & Record<"verifier", VerifyMeta>> = {
  name: "restart-gateway",
  roles: { restarter, verifier },
  moderator(ctx) {
    switch (ctx.steps.length) {
      case 0:
        // Nothing has run yet: restart first.
        return "restarter";
      case 1:
        // Restart has run: verify next.
        return "verifier";
      default:
        return END;
    }
  },
};
|
||||||
Loading…
x
Reference in New Issue
Block a user