Compare commits
3 Commits
feat/resta
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| a4625a4559 | |||
| c71212a0ce | |||
| 8186a23ceb |
13
nerve.yaml
13
nerve.yaml
@ -12,6 +12,15 @@ senses:
|
||||
timeout: 30s
|
||||
|
||||
workflows:
|
||||
restart-gateway:
|
||||
develop-sense:
|
||||
concurrency: 1
|
||||
overflow: drop
|
||||
overflow: queue
|
||||
develop-workflow:
|
||||
concurrency: 1
|
||||
overflow: queue
|
||||
solve-issue:
|
||||
concurrency: 1
|
||||
overflow: queue
|
||||
extract-knowledge:
|
||||
concurrency: 1
|
||||
overflow: queue
|
||||
|
||||
@ -1,14 +0,0 @@
|
||||
-- Migration: 0001_init
|
||||
-- Creates the hermes_gateway_health table for hermes-gateway-health sense.
|
||||
|
||||
-- IF NOT EXISTS keeps this migration safe to re-run against a database that
-- already has the table. Every sample column is NOT NULL, so each inserted row
-- must supply a value for all of them.
CREATE TABLE IF NOT EXISTS hermes_gateway_health (
  id INTEGER PRIMARY KEY AUTOINCREMENT,
  ts INTEGER NOT NULL,
  alive INTEGER NOT NULL,
  main_pid INTEGER NOT NULL,
  rss_bytes INTEGER NOT NULL,
  cpu_percent REAL NOT NULL,
  uptime_sec INTEGER NOT NULL,
  active_sessions INTEGER NOT NULL,
  child_process_count INTEGER NOT NULL
);
|
||||
@ -1,7 +0,0 @@
|
||||
-- Migration: 0002_add_http_probe
|
||||
-- HTTP reachability columns for hermes-gateway-health sense.
|
||||
|
||||
-- Each new column is NOT NULL with an explicit DEFAULT: SQLite requires a
-- non-NULL default when adding a NOT NULL column, and rows that already exist
-- take that default value.
ALTER TABLE hermes_gateway_health ADD COLUMN http_ok INTEGER NOT NULL DEFAULT 0;
ALTER TABLE hermes_gateway_health ADD COLUMN http_status_code INTEGER NOT NULL DEFAULT 0;
ALTER TABLE hermes_gateway_health ADD COLUMN http_latency_ms INTEGER NOT NULL DEFAULT 0;
ALTER TABLE hermes_gateway_health ADD COLUMN http_error TEXT NOT NULL DEFAULT '';
|
||||
@ -1,6 +1,4 @@
|
||||
import { execFile } from "node:child_process";
|
||||
export { hermesGatewayHealth as table } from "./schema.ts";
|
||||
|
||||
/** Keep subprocess deadlines slightly under typical sense timeout (30s). */
|
||||
const EXEC_TIMEOUT_MS = 25_000;
|
||||
|
||||
@ -9,7 +7,7 @@ const HTTP_TIMEOUT_MS = Math.min(23_000, EXEC_TIMEOUT_MS - 2000);
|
||||
|
||||
const HTTP_ERROR_MAX_LEN = 256;
|
||||
|
||||
/** How many consecutive failures before triggering a restart workflow. */
|
||||
/** How many consecutive failures before triggering a restart. */
|
||||
const FAILURE_THRESHOLD = 3;
|
||||
|
||||
type SenseState = {
|
||||
@ -339,7 +337,7 @@ export async function compute(prevState: SenseState) {
|
||||
const cooldown = prevState.restartCooldownMs;
|
||||
const cooldownElapsed = now - lastRestartTs >= cooldown;
|
||||
|
||||
// --- trigger restart workflow? ---
|
||||
// --- trigger restart? ---
|
||||
const shouldRestart =
|
||||
consecutiveFailures >= FAILURE_THRESHOLD && cooldownElapsed;
|
||||
|
||||
@ -363,14 +361,9 @@ export async function compute(prevState: SenseState) {
|
||||
consecutiveFailures,
|
||||
};
|
||||
|
||||
const workflow = shouldRestart
|
||||
? {
|
||||
name: "restart-gateway",
|
||||
maxRounds: 3,
|
||||
prompt: `Hermes gateway is down (${consecutiveFailures} consecutive failures). Last HTTP error: "${httpError}". systemd active+running: ${systemdActiveRunning}, process alive: ${psOk}. Restart the gateway and verify it comes back.`,
|
||||
dryRun: false,
|
||||
}
|
||||
const trigger = shouldRestart
|
||||
? { command: "systemctl --user restart hermes-gateway" }
|
||||
: null;
|
||||
|
||||
return { state: nextState, signal, workflow };
|
||||
return { state: nextState, signal, trigger };
|
||||
}
|
||||
|
||||
@ -1,17 +0,0 @@
|
||||
import { integer, real, sqliteTable, text } from "drizzle-orm/sqlite-core";
|
||||
|
||||
/**
 * Drizzle table definition for `hermes_gateway_health`.
 *
 * Column names and types mirror the SQL migrations. The four `http_*` columns
 * carry the same defaults that migration 0002 declares (`0` / `''`), so the
 * schema and the database agree on what an omitted value becomes.
 */
export const hermesGatewayHealth = sqliteTable("hermes_gateway_health", {
  id: integer("id").primaryKey({ autoIncrement: true }),
  ts: integer("ts").notNull(),
  alive: integer("alive").notNull(),
  mainPid: integer("main_pid").notNull(),
  rssBytes: integer("rss_bytes").notNull(),
  cpuPercent: real("cpu_percent").notNull(),
  uptimeSec: integer("uptime_sec").notNull(),
  activeSessions: integer("active_sessions").notNull(),
  childProcessCount: integer("child_process_count").notNull(),
  httpOk: integer("http_ok").notNull().default(0),
  httpStatusCode: integer("http_status_code").notNull().default(0),
  httpLatencyMs: integer("http_latency_ms").notNull().default(0),
  httpError: text("http_error").notNull().default(""),
});
|
||||
@ -1,109 +0,0 @@
|
||||
import { execFile } from "node:child_process";
|
||||
import type { RoleResult, ThreadContext, WorkflowDefinition } from "@uncaged/nerve-core";
|
||||
import { END } from "@uncaged/nerve-core";
|
||||
|
||||
const EXEC_TIMEOUT_MS = 30_000;
|
||||
const VERIFY_DELAY_MS = 5_000;
|
||||
|
||||
type ExecResult = {
|
||||
exitCode: number;
|
||||
stdout: string;
|
||||
stderr: string;
|
||||
};
|
||||
|
||||
/**
 * Maps an `execFile` callback error to a numeric exit code.
 *
 * Node reports the child's exit status on `error.code` (not `error.status`).
 * `code` can also be a string such as `"ENOENT"` when the process never ran,
 * or absent when the child was killed (e.g. by the timeout); those cases
 * yield `-1`.
 */
function execExitCode(err: unknown): number {
  if (err == null) return 0;
  const code: unknown = (err as { code?: unknown }).code;
  return typeof code === "number" ? code : -1;
}

/**
 * Runs `file` with `args` through `execFile` and captures its output.
 *
 * The returned promise always resolves and never rejects: failures are
 * reported through `exitCode` so callers can branch without try/catch.
 *
 * @param file - Executable to run.
 * @param args - Arguments passed to the executable.
 * @returns `exitCode` is `0` on success, the child's exit status on a
 *   non-zero exit, and `-1` when no numeric status is available (spawn
 *   failure, timeout kill, buffer overflow). `stdout` / `stderr` are always
 *   strings.
 */
function exec(file: string, args: string[]): Promise<ExecResult> {
  return new Promise((resolve) => {
    execFile(
      file,
      args,
      {
        encoding: "utf8",
        timeout: EXEC_TIMEOUT_MS,
        maxBuffer: 4 * 1024 * 1024,
      } as Parameters<typeof execFile>[2],
      (err: unknown, stdout: unknown, stderr: unknown) => {
        resolve({
          exitCode: execExitCode(err),
          stdout: String(stdout ?? ""),
          stderr: String(stderr ?? ""),
        });
      },
    );
  });
}
|
||||
|
||||
/**
 * Resolves after a `setTimeout` of `ms` milliseconds; never rejects.
 *
 * @param ms - Delay in milliseconds.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((resolve) => {
    setTimeout(resolve, ms);
  });
}
|
||||
|
||||
type RestartMeta = {
|
||||
action: string;
|
||||
exitCode: number;
|
||||
output: string;
|
||||
};
|
||||
|
||||
type VerifyMeta = {
|
||||
alive: boolean;
|
||||
activeState: string;
|
||||
subState: string;
|
||||
};
|
||||
|
||||
/**
 * Workflow role that restarts the `hermes-gateway` user unit via `systemctl`.
 *
 * @param _ctx - Thread context (unused).
 * @returns `content` summarising success or failure; `meta` carries the
 *   command that was run, its exit code, and up to 500 characters of the
 *   combined stdout/stderr.
 */
async function restarter(_ctx: ThreadContext): Promise<RoleResult<RestartMeta>> {
  const args = ["--user", "restart", "hermes-gateway"];
  const r = await exec("systemctl", args);

  const content =
    r.exitCode === 0
      ? "Gateway restart command succeeded."
      : `Gateway restart failed (exit ${r.exitCode}): ${r.stderr.trim()}`;

  return {
    content,
    meta: {
      // Derived from the argv actually executed so the two cannot drift apart.
      action: `systemctl ${args.join(" ")}`,
      exitCode: r.exitCode,
      output: `${r.stdout}\n${r.stderr}`.trim().slice(0, 500),
    },
  };
}
|
||||
|
||||
/**
 * Workflow role that checks whether `hermes-gateway` is running again.
 *
 * Waits `VERIFY_DELAY_MS`, then reads `ActiveState` and `SubState` from
 * `systemctl --user show`. A property missing from the output stays
 * `"unknown"`, which is reported as still down.
 *
 * @param _ctx - Thread context (unused).
 * @returns `meta.alive` is true only for `active` + `running`.
 */
async function verifier(_ctx: ThreadContext): Promise<RoleResult<VerifyMeta>> {
  // Give the service a moment to come up before probing it.
  await sleep(VERIFY_DELAY_MS);

  const r = await exec("systemctl", [
    "--user",
    "--no-pager",
    "show",
    "hermes-gateway",
    "-p", "ActiveState",
    "-p", "SubState",
  ]);

  let activeState = "unknown";
  let subState = "unknown";
  // Output is one `Key=Value` pair per line.
  for (const line of r.stdout.split("\n")) {
    const t = line.trim();
    if (t.startsWith("ActiveState=")) activeState = t.slice("ActiveState=".length);
    if (t.startsWith("SubState=")) subState = t.slice("SubState=".length);
  }

  const alive = activeState === "active" && subState === "running";

  return {
    content: alive
      ? `Gateway recovered: ${activeState} (${subState}).`
      : `Gateway still down: ${activeState} (${subState}). May need manual intervention.`,
    meta: { alive, activeState, subState },
  };
}
|
||||
|
||||
/**
 * `restart-gateway` workflow: one restart step followed by one verification
 * step, then the workflow ends.
 */
export const workflow: WorkflowDefinition<Record<"restarter", RestartMeta> & Record<"verifier", VerifyMeta>> = {
  name: "restart-gateway",
  roles: { restarter, verifier },
  moderator(ctx) {
    // No steps yet -> restart; one step done -> verify; anything after -> stop.
    if (ctx.steps.length === 0) return "restarter";
    if (ctx.steps.length === 1) return "verifier";
    return END;
  },
};
|
||||
Loading…
x
Reference in New Issue
Block a user