Compare commits

...

3 Commits

Author SHA1 Message Date
a4625a4559 fix: restore CLI-triggered workflows, only remove restart-gateway
The previous commit incorrectly deleted all workflows. Only restart-gateway
should be removed (replaced by direct shell trigger). Other workflows
(solve-issue, extract-knowledge, develop-sense, develop-workflow) are
CLI-triggered and independent of sense coupling.
2026-05-02 13:55:27 +00:00
c71212a0ce refactor: sense triggers shell command directly, remove workflow
- SenseTrigger is now { command: string } — no workflow coupling
- Restart gateway via direct systemctl command instead of workflow
- Remove workflows/ directory and workflow config from nerve.yaml
2026-05-02 13:44:22 +00:00
8186a23ceb chore: remove unused schema and migrations 2026-05-02 09:38:22 +00:00
6 changed files with 16 additions and 161 deletions

View File

@ -12,6 +12,15 @@ senses:
timeout: 30s
workflows:
restart-gateway:
develop-sense:
concurrency: 1
overflow: drop
overflow: queue
develop-workflow:
concurrency: 1
overflow: queue
solve-issue:
concurrency: 1
overflow: queue
extract-knowledge:
concurrency: 1
overflow: queue

View File

@ -1,14 +0,0 @@
-- Migration: 0001_init
-- Creates the hermes_gateway_health table for hermes-gateway-health sense.
-- One row per recorded health sample. Every column is NOT NULL, so a writer
-- must supply a value for each field on insert (only `id` is auto-assigned).
CREATE TABLE IF NOT EXISTS hermes_gateway_health (
-- Surrogate key assigned by SQLite.
id INTEGER PRIMARY KEY AUTOINCREMENT,
-- Sample timestamp. NOTE(review): epoch unit (s vs ms) is not visible here; confirm against the writer.
ts INTEGER NOT NULL,
-- Presumably a 0/1 liveness flag stored as INTEGER; confirm against the writer.
alive INTEGER NOT NULL,
main_pid INTEGER NOT NULL,
-- Units below are as implied by the column names (bytes, percent, seconds).
rss_bytes INTEGER NOT NULL,
cpu_percent REAL NOT NULL,
uptime_sec INTEGER NOT NULL,
active_sessions INTEGER NOT NULL,
child_process_count INTEGER NOT NULL
);

View File

@ -1,7 +0,0 @@
-- Migration: 0002_add_http_probe
-- HTTP reachability columns for hermes-gateway-health sense.
-- SQLite only allows ADD COLUMN with NOT NULL when a non-NULL DEFAULT is given;
-- rows that already exist read back that default (0 or '') for the new columns.
ALTER TABLE hermes_gateway_health ADD COLUMN http_ok INTEGER NOT NULL DEFAULT 0;
ALTER TABLE hermes_gateway_health ADD COLUMN http_status_code INTEGER NOT NULL DEFAULT 0;
-- Latency unit is milliseconds per the column name.
ALTER TABLE hermes_gateway_health ADD COLUMN http_latency_ms INTEGER NOT NULL DEFAULT 0;
-- Empty string is the default error text.
ALTER TABLE hermes_gateway_health ADD COLUMN http_error TEXT NOT NULL DEFAULT '';

View File

@ -1,6 +1,4 @@
import { execFile } from "node:child_process";
export { hermesGatewayHealth as table } from "./schema.ts";
/** Keep subprocess deadlines slightly under typical sense timeout (30s). */
const EXEC_TIMEOUT_MS = 25_000;
@ -9,7 +7,7 @@ const HTTP_TIMEOUT_MS = Math.min(23_000, EXEC_TIMEOUT_MS - 2000);
const HTTP_ERROR_MAX_LEN = 256;
/** How many consecutive failures before triggering a restart workflow. */
/** How many consecutive failures before triggering a restart. */
const FAILURE_THRESHOLD = 3;
type SenseState = {
@ -339,7 +337,7 @@ export async function compute(prevState: SenseState) {
const cooldown = prevState.restartCooldownMs;
const cooldownElapsed = now - lastRestartTs >= cooldown;
// --- trigger restart workflow? ---
// --- trigger restart? ---
const shouldRestart =
consecutiveFailures >= FAILURE_THRESHOLD && cooldownElapsed;
@ -363,14 +361,9 @@ export async function compute(prevState: SenseState) {
consecutiveFailures,
};
const workflow = shouldRestart
? {
name: "restart-gateway",
maxRounds: 3,
prompt: `Hermes gateway is down (${consecutiveFailures} consecutive failures). Last HTTP error: "${httpError}". systemd active+running: ${systemdActiveRunning}, process alive: ${psOk}. Restart the gateway and verify it comes back.`,
dryRun: false,
}
const trigger = shouldRestart
? { command: "systemctl --user restart hermes-gateway" }
: null;
return { state: nextState, signal, workflow };
return { state: nextState, signal, trigger };
}

View File

@ -1,17 +0,0 @@
import { integer, real, sqliteTable, text } from "drizzle-orm/sqlite-core";
/**
 * Drizzle ORM definition of the `hermes_gateway_health` SQLite table.
 *
 * Each property is the camelCase alias of the snake_case column named in its
 * builder call. Every column is declared `notNull()`; no column declares a
 * default here, so inserts through this schema must provide all fields
 * except the auto-incrementing `id`.
 */
export const hermesGatewayHealth = sqliteTable("hermes_gateway_health", {
// Auto-incrementing integer primary key.
id: integer("id").primaryKey({ autoIncrement: true }),
// Sample timestamp. NOTE(review): epoch unit is not visible here; confirm against the writer.
ts: integer("ts").notNull(),
// Presumably a 0/1 flag stored as an integer; confirm against the writer.
alive: integer("alive").notNull(),
mainPid: integer("main_pid").notNull(),
// Units below are as implied by the column names (bytes, percent, seconds).
rssBytes: integer("rss_bytes").notNull(),
cpuPercent: real("cpu_percent").notNull(),
uptimeSec: integer("uptime_sec").notNull(),
activeSessions: integer("active_sessions").notNull(),
childProcessCount: integer("child_process_count").notNull(),
// HTTP probe columns (these are the columns added by migration 0002_add_http_probe).
httpOk: integer("http_ok").notNull(),
httpStatusCode: integer("http_status_code").notNull(),
httpLatencyMs: integer("http_latency_ms").notNull(),
httpError: text("http_error").notNull(),
});

View File

@ -1,109 +0,0 @@
import { execFile } from "node:child_process";
import type { RoleResult, ThreadContext, WorkflowDefinition } from "@uncaged/nerve-core";
import { END } from "@uncaged/nerve-core";
/** Upper bound, in milliseconds, for any single subprocess started through `exec`. */
const EXEC_TIMEOUT_MS = 30_000;
/** How long, in milliseconds, the verifier role sleeps before querying the unit state. */
const VERIFY_DELAY_MS = 5_000;

/** Outcome of a finished subprocess as reported by `exec`. */
type ExecResult = {
  /**
   * Exit status of the child. `0` on success; `-1` when no numeric status is
   * available (spawn failure such as ENOENT, timeout, or death by signal).
   */
  exitCode: number;
  stdout: string;
  stderr: string;
};

/**
 * Runs `file` with `args` (no shell) and resolves with its exit code and
 * captured output.
 *
 * The returned promise never rejects: every failure mode is folded into
 * `exitCode` so callers can branch on a plain number.
 *
 * @param file - Executable name or path passed to `execFile`.
 * @param args - Argument vector for the executable.
 * @returns Exit code plus stdout/stderr decoded as UTF-8.
 */
function exec(file: string, args: string[]): Promise<ExecResult> {
  return new Promise((resolve) => {
    execFile(
      file,
      args,
      {
        encoding: "utf8",
        timeout: EXEC_TIMEOUT_MS,
        maxBuffer: 4 * 1024 * 1024,
        // The cast keeps this literal assignable to execFile's options parameter.
      } as Parameters<typeof execFile>[2],
      (err, stdout, stderr) => {
        // execFile reports the child's exit status on `error.code`. The field is
        // a string (e.g. "ENOENT") for spawn errors and null when the child was
        // killed, so only a numeric value is treated as a real exit code.
        const code: unknown = err == null ? 0 : (err as { code?: unknown }).code;
        resolve({
          exitCode: typeof code === "number" ? code : -1,
          stdout: String(stdout ?? ""),
          stderr: String(stderr ?? ""),
        });
      },
    );
  });
}
/**
 * Resolves after roughly `ms` milliseconds.
 *
 * @param ms - Delay handed to `setTimeout`.
 */
function sleep(ms: number): Promise<void> {
  return new Promise<void>((done) => {
    setTimeout(done, ms);
  });
}
/** Metadata attached to the result of the `restarter` role. */
type RestartMeta = {
// The command line that was run, recorded as a plain string.
action: string;
// Exit code reported by `exec` for that command.
exitCode: number;
// stdout and stderr joined by a newline, trimmed and capped at 500 characters.
output: string;
};
/** Metadata attached to the result of the `verifier` role. */
type VerifyMeta = {
// True only when activeState is "active" and subState is "running".
alive: boolean;
// Value of the unit's ActiveState property, or "unknown" if it was not found in the output.
activeState: string;
// Value of the unit's SubState property, or "unknown" if it was not found in the output.
subState: string;
};
/**
 * Workflow role that restarts the `hermes-gateway` user unit via systemctl.
 *
 * The thread context is not consulted. The result's `content` says whether
 * the command exited with 0, and `meta` records the command, its exit code
 * and the first 500 characters of its combined output.
 */
async function restarter(_ctx: ThreadContext): Promise<RoleResult<RestartMeta>> {
  const { exitCode, stdout, stderr } = await exec("systemctl", ["--user", "restart", "hermes-gateway"]);

  let content: string;
  if (exitCode === 0) {
    content = "Gateway restart command succeeded.";
  } else {
    content = `Gateway restart failed (exit ${exitCode}): ${stderr.trim()}`;
  }

  // Both streams are kept, separated by a newline, then bounded in size.
  const output = [stdout, stderr].join("\n").trim().slice(0, 500);

  return {
    content,
    meta: {
      action: "systemctl --user restart hermes-gateway",
      exitCode,
      output,
    },
  };
}
/**
 * Workflow role that checks whether the `hermes-gateway` user unit is up.
 *
 * It first sleeps for `VERIFY_DELAY_MS`, then reads the unit's `ActiveState`
 * and `SubState` properties via `systemctl show`. The gateway counts as alive
 * only when those are exactly "active" and "running"; a property missing from
 * the output is reported as "unknown".
 */
async function verifier(_ctx: ThreadContext): Promise<RoleResult<VerifyMeta>> {
  // Give the service a moment to start before inspecting it.
  await sleep(VERIFY_DELAY_MS);

  const { stdout } = await exec("systemctl", [
    "--user",
    "--no-pager",
    "show",
    "hermes-gateway",
    "-p", "ActiveState",
    "-p", "SubState",
  ]);

  // Collect `Key=Value` lines; a later duplicate of a key overrides an earlier one.
  const props = new Map<string, string>();
  for (const rawLine of stdout.split("\n")) {
    const entry = rawLine.trim();
    const eq = entry.indexOf("=");
    if (eq > 0) {
      props.set(entry.slice(0, eq), entry.slice(eq + 1));
    }
  }

  const activeState = props.get("ActiveState") ?? "unknown";
  const subState = props.get("SubState") ?? "unknown";
  const alive = activeState === "active" && subState === "running";

  let content: string;
  if (alive) {
    content = `Gateway recovered: ${activeState} (${subState}).`;
  } else {
    content = `Gateway still down: ${activeState} (${subState}). May need manual intervention.`;
  }

  return { content, meta: { alive, activeState, subState } };
}
/**
 * The "restart-gateway" workflow: a fixed two-step sequence.
 *
 * The moderator picks the next role purely from how many steps have already
 * run: none yet -> `restarter`, one -> `verifier`, anything else -> `END`.
 */
export const workflow: WorkflowDefinition<Record<"restarter", RestartMeta> & Record<"verifier", VerifyMeta>> = {
  name: "restart-gateway",
  roles: { restarter, verifier },
  moderator(ctx) {
    switch (ctx.steps.length) {
      case 0:
        return "restarter";
      case 1:
        return "verifier";
      default:
        return END;
    }
  },
};