From 29d47bd9c4d6768b1dc17c84b378c75a956aa3d0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=B0=8F=E6=A9=98?= Date: Sat, 2 May 2026 05:38:44 +0000 Subject: [PATCH] feat: add restart-gateway workflow, remove unused senses MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove 4 data-only senses (linux-system-health, worker-process-metrics, hermes-session-message-stats, git-workspace-status) — none triggered workflows - Refactor hermes-gateway-health sense: add state tracking, trigger restart-gateway workflow after 3 consecutive failures (with 5min cooldown) - Add restart-gateway workflow: restarter role (systemctl restart) + verifier role (check service came back) - Simplify nerve.yaml to single sense + single workflow --- nerve.yaml | 33 +--- .../migrations/0001_init.sql | 13 -- senses/git-workspace-status/src/index.ts | 76 --------- senses/git-workspace-status/src/schema.ts | 13 -- senses/hermes-gateway-health/src/index.ts | 146 +++++++----------- .../migrations/0001_init.sql | 13 -- .../hermes-session-message-stats/src/index.ts | 117 -------------- .../src/schema.ts | 12 -- .../migrations/0001_init.sql | 16 -- .../migrations/0002_add_tcp_stats.sql | 6 - senses/linux-system-health/src/index.ts | 88 ----------- senses/linux-system-health/src/schema.ts | 22 --- .../migrations/0001_init.sql | 11 -- senses/worker-process-metrics/src/index.ts | 26 ---- senses/worker-process-metrics/src/schema.ts | 10 -- workflows/restart-gateway/src/index.ts | 109 +++++++++++++ 16 files changed, 168 insertions(+), 543 deletions(-) delete mode 100644 senses/git-workspace-status/migrations/0001_init.sql delete mode 100644 senses/git-workspace-status/src/index.ts delete mode 100644 senses/git-workspace-status/src/schema.ts delete mode 100644 senses/hermes-session-message-stats/migrations/0001_init.sql delete mode 100644 senses/hermes-session-message-stats/src/index.ts delete mode 100644 senses/hermes-session-message-stats/src/schema.ts delete mode 
100644 senses/linux-system-health/migrations/0001_init.sql delete mode 100644 senses/linux-system-health/migrations/0002_add_tcp_stats.sql delete mode 100644 senses/linux-system-health/src/index.ts delete mode 100644 senses/linux-system-health/src/schema.ts delete mode 100644 senses/worker-process-metrics/migrations/0001_init.sql delete mode 100644 senses/worker-process-metrics/src/index.ts delete mode 100644 senses/worker-process-metrics/src/schema.ts create mode 100644 workflows/restart-gateway/src/index.ts diff --git a/nerve.yaml b/nerve.yaml index df951cb..a230e85 100644 --- a/nerve.yaml +++ b/nerve.yaml @@ -5,42 +5,13 @@ extract: model: qwen-plus senses: - linux-system-health: - group: system - interval: 30s - throttle: 10s - timeout: 15s hermes-gateway-health: group: system interval: 2m throttle: 30s timeout: 30s - hermes-session-message-stats: - group: hermes - interval: 15m - throttle: 30s - timeout: 60s - worker-process-metrics: - group: system - interval: 1m - throttle: 15s - timeout: 5s - git-workspace-status: - group: workspace - interval: 2m - throttle: 30s - timeout: 15s workflows: - develop-sense: + restart-gateway: concurrency: 1 - overflow: queue - develop-workflow: - concurrency: 1 - overflow: queue - solve-issue: - concurrency: 1 - overflow: queue - extract-knowledge: - concurrency: 1 - overflow: queue + overflow: drop diff --git a/senses/git-workspace-status/migrations/0001_init.sql b/senses/git-workspace-status/migrations/0001_init.sql deleted file mode 100644 index 5b090ce..0000000 --- a/senses/git-workspace-status/migrations/0001_init.sql +++ /dev/null @@ -1,13 +0,0 @@ --- Migration: 0001_init --- Creates the snapshots table for git-workspace-status sense. 
- -CREATE TABLE IF NOT EXISTS snapshots ( - ts INTEGER PRIMARY KEY, - branch TEXT NOT NULL, - head_short TEXT NOT NULL, - porcelain_lines INTEGER NOT NULL, - has_upstream INTEGER NOT NULL, - ahead_count INTEGER NOT NULL, - behind_count INTEGER NOT NULL, - git_error TEXT NOT NULL -); diff --git a/senses/git-workspace-status/src/index.ts b/senses/git-workspace-status/src/index.ts deleted file mode 100644 index 27901c0..0000000 --- a/senses/git-workspace-status/src/index.ts +++ /dev/null @@ -1,76 +0,0 @@ -import { execFileSync } from "node:child_process"; -import { resolve } from "node:path"; -export { snapshots as table } from "./schema.ts"; - -const GIT_TIMEOUT_MS = 15_000; - -function workspaceRoot(): string { - const raw = process.env.GIT_WORKSPACE_ROOT; - return raw ? resolve(raw) : resolve(process.cwd()); -} - -function gitErrorMessage(err: unknown): string { - if (err instanceof Error) { - const m = err.message.trim(); - return m.length > 200 ? `${m.slice(0, 197)}...` : m; - } - return String(err); -} - -function runGit(cwd: string, args: string[]): string { - return execFileSync("git", args, { - cwd, - encoding: "utf8", - timeout: GIT_TIMEOUT_MS, - maxBuffer: 2 * 1024 * 1024, - }).trimEnd(); -} - -function countPorcelainLines(output: string): number { - if (!output) return 0; - return output.split("\n").filter((line) => line.length > 0).length; -} - -export async function compute() { - const root = workspaceRoot(); - const ts = Date.now(); - - let branch = ""; - let headShort = ""; - let porcelainLines = 0; - let hasUpstream = 0; - let aheadCount = 0; - let behindCount = 0; - let gitError = ""; - - try { - const inside = runGit(root, ["rev-parse", "--is-inside-work-tree"]).trim(); - if (inside !== "true") { - gitError = "not a git work tree"; - return { signal: { ts, branch, headShort, porcelainLines, hasUpstream, aheadCount, behindCount, gitError }, workflow: null }; - } - - branch = runGit(root, ["rev-parse", "--abbrev-ref", "HEAD"]); - headShort = 
runGit(root, ["rev-parse", "--short", "HEAD"]); - porcelainLines = countPorcelainLines(runGit(root, ["status", "--porcelain"])); - - try { - runGit(root, ["rev-parse", "--abbrev-ref", "@{upstream}"]); - hasUpstream = 1; - const lb = runGit(root, ["rev-list", "--left-right", "--count", "HEAD...@{upstream}"]); - const parts = lb.split(/[\t\s]+/).filter(Boolean); - if (parts.length >= 2) { - aheadCount = Number.parseInt(parts[0], 10) || 0; - behindCount = Number.parseInt(parts[1], 10) || 0; - } - } catch { - hasUpstream = 0; - aheadCount = 0; - behindCount = 0; - } - } catch (e) { - gitError = gitErrorMessage(e); - } - - return { signal: { ts, branch, headShort, porcelainLines, hasUpstream, aheadCount, behindCount, gitError }, workflow: null }; -} diff --git a/senses/git-workspace-status/src/schema.ts b/senses/git-workspace-status/src/schema.ts deleted file mode 100644 index b39de2c..0000000 --- a/senses/git-workspace-status/src/schema.ts +++ /dev/null @@ -1,13 +0,0 @@ -import { integer, sqliteTable, text } from "drizzle-orm/sqlite-core"; - -export const snapshots = sqliteTable("snapshots", { - ts: integer("ts").primaryKey(), - branch: text("branch").notNull(), - headShort: text("head_short").notNull(), - porcelainLines: integer("porcelain_lines").notNull(), - hasUpstream: integer("has_upstream").notNull(), - aheadCount: integer("ahead_count").notNull(), - behindCount: integer("behind_count").notNull(), - /** Empty string when the snapshot succeeded; otherwise a short error summary. */ - gitError: text("git_error").notNull(), -}); diff --git a/senses/hermes-gateway-health/src/index.ts b/senses/hermes-gateway-health/src/index.ts index 4df345e..a403e6e 100644 --- a/senses/hermes-gateway-health/src/index.ts +++ b/senses/hermes-gateway-health/src/index.ts @@ -9,6 +9,22 @@ const HTTP_TIMEOUT_MS = Math.min(23_000, EXEC_TIMEOUT_MS - 2000); const HTTP_ERROR_MAX_LEN = 256; +/** How many consecutive failures before triggering a restart workflow. 
*/ +const FAILURE_THRESHOLD = 3; + +type SenseState = { + consecutiveFailures: number; + lastRestartTs: number; + /** Minimum ms between restart attempts to avoid restart loops. */ + restartCooldownMs: number; +}; + +export const initialState: SenseState = { + consecutiveFailures: 0, + lastRestartTs: 0, + restartCooldownMs: 300_000, // 5 minutes +}; + function gatewayProbeUrl(): string { const u = process.env.HERMES_GATEWAY_HEALTH_URL ?? @@ -26,17 +42,13 @@ function truncateHttpError(err: unknown): string { return s.length > HTTP_ERROR_MAX_LEN ? s.slice(0, HTTP_ERROR_MAX_LEN) : s; } -interface HttpProbeResult { +type HttpProbeResult = { httpOk: number; httpStatusCode: number; httpLatencyMs: number; httpError: string; -} +}; -/** - * GET the gateway URL; success = HTTP 200–399. - * URL must be set via HERMES_GATEWAY_HEALTH_URL or NERVE_HERMES_GATEWAY_URL. - */ async function probeGatewayHttp(url: string): Promise { if (!url) { return { @@ -74,10 +86,6 @@ async function probeGatewayHttp(url: string): Promise { } } -/** - * When `ps` lacks `etimes` (wall-clock seconds since start), parse `etime` - * ([[dd-]hh:]mm:ss) into seconds. See ps(1) `etime` field description. 
- */ function etimeToSeconds(etime: string): number { let s = String(etime).trim(); if (!s) return 0; @@ -102,12 +110,12 @@ function etimeToSeconds(etime: string): number { return 0; } -interface ExecResult { +type ExecResult = { exitCode: number; errCode: string | undefined; stdout: string; stderr: string; -} +}; function execFileUtf8(file: string, args: string[], opts: Record = {}): Promise { return new Promise((resolve) => { @@ -216,11 +224,11 @@ async function processExists(mainPid: number): Promise { return r.stdout.trim().length > 0; } -interface PsMetrics { +type PsMetrics = { rssBytes: number; cpuPercent: number; uptimeSec: number; -} +}; async function readPsMetrics(mainPid: number): Promise { if (mainPid <= 0) { @@ -265,61 +273,12 @@ async function readPsMetrics(mainPid: number): Promise { return { rssBytes, cpuPercent, uptimeSec }; } -function parseActiveSessionsFromHermesStats(text: string): number { - const src = String(text); - const patterns = [ - /^\s*Active\s+sessions?:\s*(\d+)/gim, - /^\s*active\s+sessions?:\s*(\d+)/gim, - /^\s*Total\s+sessions?:\s*(\d+)/gim, - ]; - for (const re of patterns) { - re.lastIndex = 0; - const m = re.exec(src); - if (m) { - const n = Math.trunc(Number.parseInt(m[1], 10)); - return Number.isFinite(n) ? 
n : 0; - } - } - return 0; -} - -async function readActiveSessions(): Promise { - try { - const r = await execFileUtf8("hermes", ["sessions", "stats"]); - if (r.errCode === "ENOENT") return 0; - return parseActiveSessionsFromHermesStats(`${r.stdout}\n${r.stderr}`); - } catch { - return 0; - } -} - -async function countDirectChildren(mainPid: number): Promise { - if (mainPid <= 0) return 0; - try { - const r = await execFileUtf8("ps", [ - "--no-headers", - "-o", - "pid", - "--ppid", - String(mainPid), - ]); - if (r.errCode === "ENOENT") return 0; - const lines = r.stdout - .split("\n") - .map((l) => l.trim()) - .filter(Boolean); - return lines.length; - } catch { - return 0; - } -} - -export async function compute() { - const ts = Date.now(); +export async function compute(prevState: SenseState) { + const now = Date.now(); + // --- probe gateway --- let mainPid = 0; let systemdActiveRunning = false; - try { const st = await readSystemdState(); mainPid = st.mainPid; @@ -354,22 +313,6 @@ export async function compute() { const alive = systemdActiveRunning && mainPid > 0 && psOk ? 1 : 0; - let activeSessions = 0; - try { - activeSessions = await readActiveSessions(); - } catch { - activeSessions = 0; - } - - let childProcessCount = 0; - if (alive && mainPid > 0) { - try { - childProcessCount = await countDirectChildren(mainPid); - } catch { - childProcessCount = 0; - } - } - let httpOk = 0; let httpStatusCode = 0; let httpLatencyMs = 0; @@ -387,22 +330,47 @@ export async function compute() { httpError = "probe_failed"; } - const storedMainPid = mainPid > 0 ? mainPid : 0; + // --- decide health --- + const healthy = alive === 1 && httpOk === 1; - const row = { - ts, + // --- state machine: track consecutive failures --- + const consecutiveFailures = healthy ? 
0 : prevState.consecutiveFailures + 1; + const lastRestartTs = prevState.lastRestartTs; + const cooldown = prevState.restartCooldownMs; + const cooldownElapsed = now - lastRestartTs >= cooldown; + + // --- trigger restart workflow? --- + const shouldRestart = + consecutiveFailures >= FAILURE_THRESHOLD && cooldownElapsed; + + const nextState: SenseState = { + consecutiveFailures, + lastRestartTs: shouldRestart ? now : lastRestartTs, + restartCooldownMs: cooldown, + }; + + const signal = { + ts: now, alive, - mainPid: storedMainPid, + mainPid: mainPid > 0 ? mainPid : 0, rssBytes: alive ? rssBytes : 0, cpuPercent: alive ? cpuPercent : 0, uptimeSec: alive ? uptimeSec : 0, - activeSessions, - childProcessCount: alive ? childProcessCount : 0, httpOk, httpStatusCode, httpLatencyMs, httpError, + consecutiveFailures, }; - return { signal: row, workflow: null }; + const workflow = shouldRestart + ? { + name: "restart-gateway", + maxRounds: 3, + prompt: `Hermes gateway is down (${consecutiveFailures} consecutive failures). Last HTTP error: "${httpError}". systemd active+running: ${systemdActiveRunning}, process alive: ${psOk}. Restart the gateway and verify it comes back.`, + dryRun: false, + } + : null; + + return { state: nextState, signal, workflow }; } diff --git a/senses/hermes-session-message-stats/migrations/0001_init.sql b/senses/hermes-session-message-stats/migrations/0001_init.sql deleted file mode 100644 index bcfa113..0000000 --- a/senses/hermes-session-message-stats/migrations/0001_init.sql +++ /dev/null @@ -1,13 +0,0 @@ --- Migration: 0001_init --- Creates the hermes_session_message_stats table for hermes-session-message-stats sense. 
- -CREATE TABLE IF NOT EXISTS hermes_session_message_stats ( - id INTEGER PRIMARY KEY AUTOINCREMENT, - ts INTEGER NOT NULL, - total_user_messages INTEGER NOT NULL, - total_assistant_messages INTEGER NOT NULL, - total_tool_messages INTEGER NOT NULL, - total_messages INTEGER NOT NULL, - active_sessions INTEGER NOT NULL, - measurement_window_seconds INTEGER NOT NULL -); diff --git a/senses/hermes-session-message-stats/src/index.ts b/senses/hermes-session-message-stats/src/index.ts deleted file mode 100644 index 263d1bf..0000000 --- a/senses/hermes-session-message-stats/src/index.ts +++ /dev/null @@ -1,117 +0,0 @@ -import { createReadStream } from "node:fs"; -import { readdir } from "node:fs/promises"; -import { homedir } from "node:os"; -import { join } from "node:path"; -import { createInterface } from "node:readline"; -export { hermesSessionMessageStats as table } from "./schema.ts"; - -const MEASUREMENT_WINDOW_MS = 900_000; -const MEASUREMENT_WINDOW_SECONDS = 900; - -interface MessageCounts { - user: number; - assistant: number; - tool: number; - fileHadActivity: boolean; -} - -async function aggregateJsonlFile(filePath: string, cutoffMs: number, nowMs: number): Promise { - let user = 0; - let assistant = 0; - let tool = 0; - let fileHadActivity = false; - - const input = createReadStream(filePath, { encoding: "utf8" }); - const rl = createInterface({ input, crlfDelay: Infinity }); - try { - for await (const line of rl) { - const trimmed = line.trim(); - if (!trimmed) continue; - let obj: unknown; - try { - obj = JSON.parse(trimmed); - } catch { - continue; - } - if ( - typeof obj !== "object" || obj === null || - typeof (obj as Record).role !== "string" || - typeof (obj as Record).timestamp !== "string" - ) { - continue; - } - const record = obj as { role: string; timestamp: string }; - const t = Date.parse(record.timestamp); - if (!Number.isFinite(t) || t < cutoffMs || t > nowMs) continue; - - const roleNorm = record.role.trim().toLowerCase(); - if (roleNorm === 
"user") { - user++; - fileHadActivity = true; - } else if (roleNorm === "assistant") { - assistant++; - fileHadActivity = true; - } else if (roleNorm === "tool") { - tool++; - fileHadActivity = true; - } - } - } finally { - rl.close(); - } - - return { user, assistant, tool, fileHadActivity }; -} - -export async function compute() { - const nowMs = Date.now(); - const cutoffMs = nowMs - MEASUREMENT_WINDOW_MS; - const ts = nowMs; - - let totalUserMessages = 0; - let totalAssistantMessages = 0; - let totalToolMessages = 0; - let activeSessions = 0; - - const sessionsDir = join(homedir(), ".hermes", "sessions"); - let files: string[] = []; - try { - const entries = await readdir(sessionsDir, { withFileTypes: true }); - files = entries - .filter((e) => e.isFile() && e.name.endsWith(".jsonl")) - .map((e) => join(sessionsDir, e.name)); - } catch (err) { - if (err && typeof err === "object" && "code" in err && (err as NodeJS.ErrnoException).code === "ENOENT") { - files = []; - } else { - throw err; - } - } - - for (const filePath of files) { - const { user, assistant, tool, fileHadActivity } = await aggregateJsonlFile( - filePath, - cutoffMs, - nowMs, - ); - totalUserMessages += user; - totalAssistantMessages += assistant; - totalToolMessages += tool; - if (fileHadActivity) activeSessions++; - } - - const totalMessages = - totalUserMessages + totalAssistantMessages + totalToolMessages; - - const row = { - ts, - totalUserMessages, - totalAssistantMessages, - totalToolMessages, - totalMessages, - activeSessions, - measurementWindowSeconds: MEASUREMENT_WINDOW_SECONDS, - }; - - return { signal: row, workflow: null }; -} diff --git a/senses/hermes-session-message-stats/src/schema.ts b/senses/hermes-session-message-stats/src/schema.ts deleted file mode 100644 index bef4403..0000000 --- a/senses/hermes-session-message-stats/src/schema.ts +++ /dev/null @@ -1,12 +0,0 @@ -import { integer, sqliteTable } from "drizzle-orm/sqlite-core"; - -export const hermesSessionMessageStats = 
sqliteTable("hermes_session_message_stats", { - id: integer("id").primaryKey({ autoIncrement: true }), - ts: integer("ts").notNull(), - totalUserMessages: integer("total_user_messages").notNull(), - totalAssistantMessages: integer("total_assistant_messages").notNull(), - totalToolMessages: integer("total_tool_messages").notNull(), - totalMessages: integer("total_messages").notNull(), - activeSessions: integer("active_sessions").notNull(), - measurementWindowSeconds: integer("measurement_window_seconds").notNull(), -}); diff --git a/senses/linux-system-health/migrations/0001_init.sql b/senses/linux-system-health/migrations/0001_init.sql deleted file mode 100644 index 895f665..0000000 --- a/senses/linux-system-health/migrations/0001_init.sql +++ /dev/null @@ -1,16 +0,0 @@ --- Migration: 0001_init --- Creates the snapshots table for linux-system-health sense. - -CREATE TABLE IF NOT EXISTS snapshots ( - ts INTEGER PRIMARY KEY, - cpu_load_1m REAL NOT NULL, - cpu_load_5m REAL NOT NULL, - cpu_load_15m REAL NOT NULL, - mem_total_mb INTEGER NOT NULL, - mem_used_mb INTEGER NOT NULL, - mem_used_pct REAL NOT NULL, - disk_total_gb REAL NOT NULL, - disk_used_gb REAL NOT NULL, - disk_used_pct REAL NOT NULL, - uptime_sec INTEGER NOT NULL -); diff --git a/senses/linux-system-health/migrations/0002_add_tcp_stats.sql b/senses/linux-system-health/migrations/0002_add_tcp_stats.sql deleted file mode 100644 index 5efd2d5..0000000 --- a/senses/linux-system-health/migrations/0002_add_tcp_stats.sql +++ /dev/null @@ -1,6 +0,0 @@ -ALTER TABLE snapshots ADD COLUMN sockets_used INTEGER; -ALTER TABLE snapshots ADD COLUMN tcp_inuse INTEGER; -ALTER TABLE snapshots ADD COLUMN tcp_orphan INTEGER; -ALTER TABLE snapshots ADD COLUMN tcp_tw INTEGER; -ALTER TABLE snapshots ADD COLUMN tcp_alloc INTEGER; -ALTER TABLE snapshots ADD COLUMN tcp_mem_pages INTEGER; diff --git a/senses/linux-system-health/src/index.ts b/senses/linux-system-health/src/index.ts deleted file mode 100644 index f255505..0000000 --- 
a/senses/linux-system-health/src/index.ts +++ /dev/null @@ -1,88 +0,0 @@ -import { loadavg, totalmem, freemem, uptime } from "node:os"; -import { execSync } from "node:child_process"; -import { readFile } from "node:fs/promises"; -export { snapshots as table } from "./schema.ts"; - -const SOCKSTAT_PATH = "/proc/net/sockstat"; - -interface SockstatResult { - socketsUsed: number; - tcpInuse: number; - tcpOrphan: number; - tcpTw: number; - tcpAlloc: number; - tcpMemPages: number; -} - -function parseSockstat(content: string): SockstatResult { - let socketsUsed = 0, tcpInuse = 0, tcpOrphan = 0, tcpTw = 0, tcpAlloc = 0, tcpMemPages = 0; - - for (const line of content.split("\n")) { - const trimmed = line.trim(); - if (trimmed.startsWith("sockets:")) { - const parts = trimmed.split(/\s+/); - const idx = parts.indexOf("used"); - if (idx !== -1 && idx + 1 < parts.length) { - socketsUsed = Number.parseInt(parts[idx + 1], 10) || 0; - } - } else if (trimmed.startsWith("TCP:")) { - const parts = trimmed.split(/\s+/); - const map: Record = {}; - for (let i = 1; i + 1 < parts.length; i += 2) { - map[parts[i]] = Number.parseInt(parts[i + 1], 10) || 0; - } - tcpInuse = map.inuse ?? 0; - tcpOrphan = map.orphan ?? 0; - tcpTw = map.tw ?? 0; - tcpAlloc = map.alloc ?? 0; - tcpMemPages = map.mem ?? 
0; - } - } - - return { socketsUsed, tcpInuse, tcpOrphan, tcpTw, tcpAlloc, tcpMemPages }; -} - -export async function compute() { - const [load1, load5, load15] = loadavg(); - - const memTotal = totalmem(); - const memFree = freemem(); - const memUsed = memTotal - memFree; - const memTotalMB = Math.round(memTotal / 1024 / 1024); - const memUsedMB = Math.round(memUsed / 1024 / 1024); - const memUsedPct = Math.round((memUsed / memTotal) * 10000) / 100; - - let diskTotalGB = 0, diskUsedGB = 0, diskUsedPct = 0; - try { - const df = execSync("df -B1 / | tail -1", { encoding: "utf-8" }).trim(); - const parts = df.split(/\s+/); - const total = Number(parts[1]); - const used = Number(parts[2]); - diskTotalGB = Math.round(total / 1024 / 1024 / 1024 * 100) / 100; - diskUsedGB = Math.round(used / 1024 / 1024 / 1024 * 100) / 100; - diskUsedPct = total > 0 ? Math.round((used / total) * 10000) / 100 : 0; - } catch {} - - let tcp: SockstatResult = { socketsUsed: 0, tcpInuse: 0, tcpOrphan: 0, tcpTw: 0, tcpAlloc: 0, tcpMemPages: 0 }; - try { - const content = await readFile(SOCKSTAT_PATH, "utf8"); - tcp = parseSockstat(content); - } catch {} - - const ts = Date.now(); - const uptimeSec = Math.round(uptime()); - - const data = { - ts, cpuLoad1m: load1, cpuLoad5m: load5, cpuLoad15m: load15, - memTotalMB, memUsedMB, memUsedPct, - diskTotalGB, diskUsedGB, diskUsedPct, - uptimeSec, - socketsUsed: tcp.socketsUsed, - tcpInuse: tcp.tcpInuse, - tcpOrphan: tcp.tcpOrphan, - tcpTw: tcp.tcpTw, - tcpAlloc: tcp.tcpAlloc, - tcpMemPages: tcp.tcpMemPages, - }; - return { signal: data, workflow: null }; -} diff --git a/senses/linux-system-health/src/schema.ts b/senses/linux-system-health/src/schema.ts deleted file mode 100644 index de382a2..0000000 --- a/senses/linux-system-health/src/schema.ts +++ /dev/null @@ -1,22 +0,0 @@ -import { integer, real, sqliteTable, text } from "drizzle-orm/sqlite-core"; - -export const snapshots = sqliteTable("snapshots", { - ts: integer("ts").primaryKey(), - cpuLoad1m: 
real("cpu_load_1m").notNull(), - cpuLoad5m: real("cpu_load_5m").notNull(), - cpuLoad15m: real("cpu_load_15m").notNull(), - memTotalMB: integer("mem_total_mb").notNull(), - memUsedMB: integer("mem_used_mb").notNull(), - memUsedPct: real("mem_used_pct").notNull(), - diskTotalGB: real("disk_total_gb").notNull(), - diskUsedGB: real("disk_used_gb").notNull(), - diskUsedPct: real("disk_used_pct").notNull(), - uptimeSec: integer("uptime_sec").notNull(), - // TCP socket stats (merged from linux-tcp-socket-stats) - socketsUsed: integer("sockets_used"), - tcpInuse: integer("tcp_inuse"), - tcpOrphan: integer("tcp_orphan"), - tcpTw: integer("tcp_tw"), - tcpAlloc: integer("tcp_alloc"), - tcpMemPages: integer("tcp_mem_pages"), -}); diff --git a/senses/worker-process-metrics/migrations/0001_init.sql b/senses/worker-process-metrics/migrations/0001_init.sql deleted file mode 100644 index 989f19d..0000000 --- a/senses/worker-process-metrics/migrations/0001_init.sql +++ /dev/null @@ -1,11 +0,0 @@ --- Migration: 0001_init --- Creates the worker_process_metrics table for worker-process-metrics sense. 
-
-CREATE TABLE IF NOT EXISTS worker_process_metrics (
-  ts INTEGER PRIMARY KEY,
-  pid INTEGER NOT NULL,
-  uptime_sec REAL NOT NULL,
-  heap_used_mb REAL NOT NULL,
-  rss_mb REAL NOT NULL,
-  external_mb REAL NOT NULL
-);
diff --git a/senses/worker-process-metrics/src/index.ts b/senses/worker-process-metrics/src/index.ts
deleted file mode 100644
index b584519..0000000
--- a/senses/worker-process-metrics/src/index.ts
+++ /dev/null
@@ -1,26 +0,0 @@
-export { workerProcessMetrics as table } from "./schema.ts";
-
-function round2(n: number): number {
-  return Math.round(n * 100) / 100;
-}
-
-export async function compute() {
-  const ts = Date.now();
-  const pid = process.pid;
-  const uptimeSec = process.uptime();
-  const m = process.memoryUsage();
-  const heapUsedMB = round2(m.heapUsed / 1024 / 1024);
-  const rssMB = round2(m.rss / 1024 / 1024);
-  const externalMB = round2(m.external / 1024 / 1024);
-
-  const row = {
-    ts,
-    pid,
-    uptimeSec,
-    heapUsedMB,
-    rssMB,
-    externalMB,
-  };
-
-  return { signal: row, workflow: null };
-}
diff --git a/senses/worker-process-metrics/src/schema.ts b/senses/worker-process-metrics/src/schema.ts
deleted file mode 100644
index 59045f2..0000000
--- a/senses/worker-process-metrics/src/schema.ts
+++ /dev/null
@@ -1,10 +0,0 @@
-import { integer, real, sqliteTable } from "drizzle-orm/sqlite-core";
-
-export const workerProcessMetrics = sqliteTable("worker_process_metrics", {
-  ts: integer("ts").primaryKey(),
-  pid: integer("pid").notNull(),
-  uptimeSec: real("uptime_sec").notNull(),
-  heapUsedMB: real("heap_used_mb").notNull(),
-  rssMB: real("rss_mb").notNull(),
-  externalMB: real("external_mb").notNull(),
-});
diff --git a/workflows/restart-gateway/src/index.ts b/workflows/restart-gateway/src/index.ts
new file mode 100644
index 0000000..9311007
--- /dev/null
+++ b/workflows/restart-gateway/src/index.ts
@@ -0,0 +1,125 @@
+import { execFile } from "node:child_process";
+import type { RoleResult, ThreadContext, WorkflowDefinition } from "@uncaged/nerve-core";
+import { END } from "@uncaged/nerve-core";
+
+/** Timeout applied to every systemctl invocation made by this workflow. */
+const EXEC_TIMEOUT_MS = 30_000;
+/** Pause before the verifier inspects the unit state. */
+const VERIFY_DELAY_MS = 5_000;
+
+/** Outcome of one child-process run; `exitCode` is -1 when no numeric exit status is available. */
+type ExecResult = {
+  exitCode: number;
+  stdout: string;
+  stderr: string;
+};
+
+/**
+ * Runs `file` with `args` and resolves with its exit code and captured output.
+ * Never rejects: every failure is folded into `exitCode`.
+ */
+function exec(file: string, args: string[]): Promise<ExecResult> {
+  return new Promise((resolve) => {
+    execFile(
+      file,
+      args,
+      {
+        encoding: "utf8",
+        timeout: EXEC_TIMEOUT_MS,
+        maxBuffer: 4 * 1024 * 1024,
+      } as Parameters<typeof execFile>[2],
+      (err, stdout, stderr) => {
+        // execFile() puts the child's exit status on `err.code`; `status` only
+        // exists on spawnSync()-style results. `code` is a string such as
+        // "ENOENT" when the spawn fails and null when the child is killed
+        // (e.g. by the timeout), so anything non-numeric is reported as -1.
+        const code: unknown = err ? (err as { code?: unknown }).code : 0;
+        resolve({
+          exitCode: typeof code === "number" ? code : -1,
+          stdout: String(stdout ?? ""),
+          stderr: String(stderr ?? ""),
+        });
+      },
+    );
+  });
+}
+
+/** Resolves after `ms` milliseconds. */
+function sleep(ms: number): Promise<void> {
+  return new Promise((r) => setTimeout(r, ms));
+}
+
+/** Metadata recorded by the restarter role. */
+type RestartMeta = {
+  action: string;
+  exitCode: number;
+  output: string;
+};
+
+/** Metadata recorded by the verifier role. */
+type VerifyMeta = {
+  alive: boolean;
+  activeState: string;
+  subState: string;
+};
+
+/** Restarts the hermes-gateway user unit and reports the command outcome. */
+async function restarter(_ctx: ThreadContext): Promise<RoleResult<RestartMeta>> {
+  const r = await exec("systemctl", ["--user", "restart", "hermes-gateway"]);
+  return {
+    content: r.exitCode === 0
+      ? "Gateway restart command succeeded."
+      : `Gateway restart failed (exit ${r.exitCode}): ${r.stderr.trim()}`,
+    meta: {
+      action: "systemctl --user restart hermes-gateway",
+      exitCode: r.exitCode,
+      // Cap the stored output at 500 characters.
+      output: `${r.stdout}\n${r.stderr}`.trim().slice(0, 500),
+    },
+  };
+}
+
+/** Waits VERIFY_DELAY_MS, then reports whether the unit is active and running. */
+async function verifier(_ctx: ThreadContext): Promise<RoleResult<VerifyMeta>> {
+  // Wait a few seconds for the service to come up
+  await sleep(VERIFY_DELAY_MS);
+
+  const r = await exec("systemctl", [
+    "--user",
+    "--no-pager",
+    "show",
+    "hermes-gateway",
+    "-p", "ActiveState",
+    "-p", "SubState",
+  ]);
+
+  // Both stay "unknown" when systemctl prints no matching property line.
+  let activeState = "unknown";
+  let subState = "unknown";
+  for (const line of r.stdout.split("\n")) {
+    const t = line.trim();
+    if (t.startsWith("ActiveState=")) activeState = t.slice("ActiveState=".length);
+    if (t.startsWith("SubState=")) subState = t.slice("SubState=".length);
+  }
+
+  const alive = activeState === "active" && subState === "running";
+
+  return {
+    content: alive
+      ? `Gateway recovered: ${activeState} (${subState}).`
+      : `Gateway still down: ${activeState} (${subState}). May need manual intervention.`,
+    meta: { alive, activeState, subState },
+  };
+}
+
+/** Two-step workflow: run the restarter, then the verifier, then stop. */
+export const workflow: WorkflowDefinition<Record<"restarter", RestartMeta> & Record<"verifier", VerifyMeta>> = {
+  name: "restart-gateway",
+  roles: { restarter, verifier },
+  moderator(ctx) {
+    // Round 0: restart. Round 1: verify. Done.
+    if (ctx.steps.length === 0) return "restarter";
+    if (ctx.steps.length === 1) return "verifier";
+    return END;
+  },
+};