Compare commits
3 Commits
feat/resta
...
master
| Author | SHA1 | Date | |
|---|---|---|---|
| a4625a4559 | |||
| c71212a0ce | |||
| 8186a23ceb |
13
nerve.yaml
13
nerve.yaml
@ -12,6 +12,15 @@ senses:
|
|||||||
timeout: 30s
|
timeout: 30s
|
||||||
|
|
||||||
workflows:
|
workflows:
|
||||||
restart-gateway:
|
develop-sense:
|
||||||
concurrency: 1
|
concurrency: 1
|
||||||
overflow: drop
|
overflow: queue
|
||||||
|
develop-workflow:
|
||||||
|
concurrency: 1
|
||||||
|
overflow: queue
|
||||||
|
solve-issue:
|
||||||
|
concurrency: 1
|
||||||
|
overflow: queue
|
||||||
|
extract-knowledge:
|
||||||
|
concurrency: 1
|
||||||
|
overflow: queue
|
||||||
|
|||||||
@ -1,14 +0,0 @@
|
|||||||
-- Migration: 0001_init
-- Initial schema for the hermes-gateway-health sense.
-- Every column is mandatory; `id` is an auto-incrementing surrogate key.
-- NOTE(review): units are implied by the column names only (bytes, percent,
-- seconds) -- confirm against the writer before relying on them.

CREATE TABLE IF NOT EXISTS hermes_gateway_health (
    id                  INTEGER PRIMARY KEY AUTOINCREMENT,
    ts                  INTEGER NOT NULL,
    alive               INTEGER NOT NULL,
    main_pid            INTEGER NOT NULL,
    rss_bytes           INTEGER NOT NULL,
    cpu_percent         REAL    NOT NULL,
    uptime_sec          INTEGER NOT NULL,
    active_sessions     INTEGER NOT NULL,
    child_process_count INTEGER NOT NULL
);
|
|
||||||
@ -1,7 +0,0 @@
|
|||||||
-- Migration: 0002_add_http_probe
-- Extends hermes_gateway_health with the result of an HTTP reachability probe.
-- Each new column is NOT NULL with a neutral default (0 or the empty string),
-- so rows written before this migration remain valid.

ALTER TABLE hermes_gateway_health
    ADD COLUMN http_ok INTEGER NOT NULL DEFAULT 0;

ALTER TABLE hermes_gateway_health
    ADD COLUMN http_status_code INTEGER NOT NULL DEFAULT 0;

ALTER TABLE hermes_gateway_health
    ADD COLUMN http_latency_ms INTEGER NOT NULL DEFAULT 0;

ALTER TABLE hermes_gateway_health
    ADD COLUMN http_error TEXT NOT NULL DEFAULT '';
|
|
||||||
@ -1,6 +1,4 @@
|
|||||||
import { execFile } from "node:child_process";
|
import { execFile } from "node:child_process";
|
||||||
export { hermesGatewayHealth as table } from "./schema.ts";
|
|
||||||
|
|
||||||
/** Keep subprocess deadlines slightly under typical sense timeout (30s). */
|
/** Keep subprocess deadlines slightly under typical sense timeout (30s). */
|
||||||
const EXEC_TIMEOUT_MS = 25_000;
|
const EXEC_TIMEOUT_MS = 25_000;
|
||||||
|
|
||||||
@ -9,7 +7,7 @@ const HTTP_TIMEOUT_MS = Math.min(23_000, EXEC_TIMEOUT_MS - 2000);
|
|||||||
|
|
||||||
const HTTP_ERROR_MAX_LEN = 256;
|
const HTTP_ERROR_MAX_LEN = 256;
|
||||||
|
|
||||||
/** How many consecutive failures before triggering a restart workflow. */
|
/** How many consecutive failures before triggering a restart. */
|
||||||
const FAILURE_THRESHOLD = 3;
|
const FAILURE_THRESHOLD = 3;
|
||||||
|
|
||||||
type SenseState = {
|
type SenseState = {
|
||||||
@ -339,7 +337,7 @@ export async function compute(prevState: SenseState) {
|
|||||||
const cooldown = prevState.restartCooldownMs;
|
const cooldown = prevState.restartCooldownMs;
|
||||||
const cooldownElapsed = now - lastRestartTs >= cooldown;
|
const cooldownElapsed = now - lastRestartTs >= cooldown;
|
||||||
|
|
||||||
// --- trigger restart workflow? ---
|
// --- trigger restart? ---
|
||||||
const shouldRestart =
|
const shouldRestart =
|
||||||
consecutiveFailures >= FAILURE_THRESHOLD && cooldownElapsed;
|
consecutiveFailures >= FAILURE_THRESHOLD && cooldownElapsed;
|
||||||
|
|
||||||
@ -363,14 +361,9 @@ export async function compute(prevState: SenseState) {
|
|||||||
consecutiveFailures,
|
consecutiveFailures,
|
||||||
};
|
};
|
||||||
|
|
||||||
const workflow = shouldRestart
|
const trigger = shouldRestart
|
||||||
? {
|
? { command: "systemctl --user restart hermes-gateway" }
|
||||||
name: "restart-gateway",
|
|
||||||
maxRounds: 3,
|
|
||||||
prompt: `Hermes gateway is down (${consecutiveFailures} consecutive failures). Last HTTP error: "${httpError}". systemd active+running: ${systemdActiveRunning}, process alive: ${psOk}. Restart the gateway and verify it comes back.`,
|
|
||||||
dryRun: false,
|
|
||||||
}
|
|
||||||
: null;
|
: null;
|
||||||
|
|
||||||
return { state: nextState, signal, workflow };
|
return { state: nextState, signal, trigger };
|
||||||
}
|
}
|
||||||
|
|||||||
@ -1,17 +0,0 @@
|
|||||||
import { integer, real, sqliteTable, text } from "drizzle-orm/sqlite-core";
|
|
||||||
|
|
||||||
/** Physical SQLite table name backing {@link hermesGatewayHealth}. */
const HERMES_GATEWAY_HEALTH_TABLE = "hermes_gateway_health";

/**
 * Drizzle table definition for the `hermes_gateway_health` table.
 *
 * Property names are the camelCase view of the snake_case column names.
 * Every column is declared `notNull()`, and `id` is the auto-incrementing
 * primary key.
 */
export const hermesGatewayHealth = sqliteTable(HERMES_GATEWAY_HEALTH_TABLE, {
  // Row identity and sample time.
  id: integer("id").primaryKey({ autoIncrement: true }),
  ts: integer("ts").notNull(),

  // Process-level readings.
  alive: integer("alive").notNull(),
  mainPid: integer("main_pid").notNull(),
  rssBytes: integer("rss_bytes").notNull(),
  cpuPercent: real("cpu_percent").notNull(),
  uptimeSec: integer("uptime_sec").notNull(),
  activeSessions: integer("active_sessions").notNull(),
  childProcessCount: integer("child_process_count").notNull(),

  // HTTP probe readings.
  httpOk: integer("http_ok").notNull(),
  httpStatusCode: integer("http_status_code").notNull(),
  httpLatencyMs: integer("http_latency_ms").notNull(),
  httpError: text("http_error").notNull(),
});
|
|
||||||
@ -1,109 +0,0 @@
|
|||||||
import { execFile } from "node:child_process";
|
|
||||||
import type { RoleResult, ThreadContext, WorkflowDefinition } from "@uncaged/nerve-core";
|
|
||||||
import { END } from "@uncaged/nerve-core";
|
|
||||||
|
|
||||||
/** Upper bound, in milliseconds, for any single subprocess started from this module. */
const EXEC_TIMEOUT_MS = 30_000;
/** Delay, in milliseconds, the verifier waits before inspecting the unit state. */
const VERIFY_DELAY_MS = 5_000;

/** Outcome of a finished subprocess. Failures are reported here, never thrown. */
type ExecResult = {
  /** 0 on success, the child's exit status on a non-zero exit, -1 otherwise. */
  exitCode: number;
  /** Captured standard output (empty string when none). */
  stdout: string;
  /** Captured standard error (empty string when none). */
  stderr: string;
};

/**
 * Runs `file` with `args` via `execFile` (no shell) and resolves with the
 * exit code plus captured output.
 *
 * The returned promise never rejects: every failure mode is folded into
 * `exitCode`.
 *
 * @param file - Executable name or path.
 * @param args - Arguments passed verbatim to the executable.
 * @returns `exitCode` is 0 on success, the child's numeric exit status when it
 *   exited non-zero, and -1 when no numeric status exists (spawn failure such
 *   as ENOENT, timeout/kill by signal, or maxBuffer overflow).
 */
function exec(file: string, args: string[]): Promise<ExecResult> {
  return new Promise((resolve) => {
    execFile(
      file,
      args,
      {
        encoding: "utf8",
        timeout: EXEC_TIMEOUT_MS,
        maxBuffer: 4 * 1024 * 1024,
      } as Parameters<typeof execFile>[2],
      (err, stdout, stderr) => {
        // execFile reports the child's exit status in `err.code`; `status` only
        // exists on the spawnSync-style result objects. `code` is a string
        // (e.g. "ENOENT") or null for spawn errors and signal kills, so only a
        // numeric value is treated as a real exit status.
        const code = err == null ? 0 : (err as { code?: unknown }).code;
        const exitCode = typeof code === "number" ? code : -1;
        resolve({
          exitCode,
          stdout: String(stdout ?? ""),
          stderr: String(stderr ?? ""),
        });
      },
    );
  });
}
|
|
||||||
|
|
||||||
/**
 * Resolves (with no value) once `ms` milliseconds have elapsed.
 *
 * @param ms - Delay handed to `setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((wake) => {
    setTimeout(wake, ms);
  });
}
|
|
||||||
|
|
||||||
/** Metadata attached to the result of the `restarter` role. */
type RestartMeta = {
  /** The command that was issued, as a human-readable string. */
  action: string;
  /** Exit code reported for that command. */
  exitCode: number;
  /** Captured command output. */
  output: string;
};

/** Metadata attached to the result of the `verifier` role. */
type VerifyMeta = {
  /** Whether the unit was observed as up. */
  alive: boolean;
  /** Value of the unit's `ActiveState` property ("unknown" when not reported). */
  activeState: string;
  /** Value of the unit's `SubState` property ("unknown" when not reported). */
  subState: string;
};
|
|
||||||
|
|
||||||
/**
 * Role: restart the `hermes-gateway` user unit through systemctl.
 *
 * The thread context is not consulted. The result's `content` says whether the
 * command succeeded (exit code 0) or failed; `meta` records the command, its
 * exit code, and the combined stdout/stderr trimmed and capped at 500 characters.
 */
async function restarter(_ctx: ThreadContext): Promise<RoleResult<RestartMeta>> {
  const { exitCode, stdout, stderr } = await exec("systemctl", [
    "--user",
    "restart",
    "hermes-gateway",
  ]);

  let content: string;
  if (exitCode === 0) {
    content = "Gateway restart command succeeded.";
  } else {
    content = `Gateway restart failed (exit ${exitCode}): ${stderr.trim()}`;
  }

  // Keep the stored output small: trim first, then cap the length.
  const output = `${stdout}\n${stderr}`.trim().slice(0, 500);

  return {
    content,
    meta: {
      action: "systemctl --user restart hermes-gateway",
      exitCode,
      output,
    },
  };
}
|
|
||||||
|
|
||||||
/**
 * Role: check whether the `hermes-gateway` user unit is up.
 *
 * Waits `VERIFY_DELAY_MS`, then reads the unit's `ActiveState` and `SubState`
 * via `systemctl show`. The unit counts as alive only when the pair is
 * `active` / `running`; a property missing from the output stays "unknown".
 * The thread context is not consulted.
 */
async function verifier(_ctx: ThreadContext): Promise<RoleResult<VerifyMeta>> {
  // Give the service a moment to come up before probing it.
  await sleep(VERIFY_DELAY_MS);

  const shown = await exec("systemctl", [
    "--user",
    "--no-pager",
    "show",
    "hermes-gateway",
    "-p", "ActiveState",
    "-p", "SubState",
  ]);

  const activeKey = "ActiveState=";
  const subKey = "SubState=";
  const unit = { activeState: "unknown", subState: "unknown" };

  // Output is `Key=Value` per line; if a key repeats, the last one wins.
  shown.stdout
    .split("\n")
    .map((raw) => raw.trim())
    .forEach((entry) => {
      if (entry.startsWith(activeKey)) unit.activeState = entry.slice(activeKey.length);
      if (entry.startsWith(subKey)) unit.subState = entry.slice(subKey.length);
    });

  const { activeState, subState } = unit;
  const alive = activeState === "active" && subState === "running";

  let content: string;
  if (alive) {
    content = `Gateway recovered: ${activeState} (${subState}).`;
  } else {
    content = `Gateway still down: ${activeState} (${subState}). May need manual intervention.`;
  }

  return { content, meta: { alive, activeState, subState } };
}
|
|
||||||
|
|
||||||
/**
 * The `restart-gateway` workflow: a fixed two-step sequence.
 *
 * The moderator picks the next role purely from how many steps have already
 * run: none -> `restarter`, one -> `verifier`, anything more -> `END`.
 */
export const workflow: WorkflowDefinition<Record<"restarter", RestartMeta> & Record<"verifier", VerifyMeta>> = {
  name: "restart-gateway",
  roles: { restarter, verifier },
  moderator(ctx) {
    switch (ctx.steps.length) {
      case 0:
        // Nothing has run yet: issue the restart.
        return "restarter";
      case 1:
        // Restart was issued: check the outcome.
        return "verifier";
      default:
        return END;
    }
  },
};
|
|
||||||
Loading…
x
Reference in New Issue
Block a user