import { execFile } from "node:child_process";
import { hermesGatewayHealth } from "./schema.ts";

/** Keep subprocess deadlines slightly under the typical sense timeout (30s). */
const EXEC_TIMEOUT_MS = 25_000;

/** HTTP probe stays below EXEC_TIMEOUT_MS and the sense timeout (30s). */
const HTTP_TIMEOUT_MS = Math.min(23_000, EXEC_TIMEOUT_MS - 2000);

/** Upper bound on the length of the stored `httpError` string. */
const HTTP_ERROR_MAX_LEN = 256;

/**
 * Resolve the URL to probe from the environment.
 *
 * `HERMES_GATEWAY_HEALTH_URL` wins over `NERVE_HERMES_GATEWAY_URL`; a variable
 * that is unset, empty, or whitespace-only is skipped so that a blank primary
 * variable does not mask a valid fallback.
 *
 * @returns The trimmed URL, or `""` when neither variable holds a value.
 */
function gatewayProbeUrl() {
  const candidates = [
    process.env.HERMES_GATEWAY_HEALTH_URL,
    process.env.NERVE_HERMES_GATEWAY_URL,
  ];
  for (const candidate of candidates) {
    const url = String(candidate ?? "").trim();
    if (url) return url;
  }
  return "";
}

/**
 * Reduce an arbitrary thrown value to a short diagnostic string.
 *
 * Preference order:
 *  1. a string `code` on the error itself (e.g. `"ECONNREFUSED"`);
 *  2. a string `code` on `err.cause` (where `fetch` puts the socket error);
 *  3. the error `name` when `code` is numeric and the name is specific
 *     (a timeout then reads `"TimeoutError"` instead of a bare number);
 *  4. any other truthy `code`, stringified;
 *  5. `message`, or the value itself, or `"error"`.
 *
 * @param err Anything caught from the HTTP probe, or a `{ message }` object.
 * @returns A non-empty string of at most HTTP_ERROR_MAX_LEN characters.
 */
function truncateHttpError(err) {
  let raw;
  if (err && typeof err === "object") {
    const causeCode =
      err.cause && typeof err.cause === "object" ? err.cause.code : undefined;
    if (typeof err.code === "string" && err.code) {
      raw = err.code;
    } else if (typeof causeCode === "string" && causeCode) {
      raw = causeCode;
    } else if (
      typeof err.code === "number" &&
      typeof err.name === "string" &&
      err.name &&
      err.name !== "Error"
    ) {
      raw = err.name;
    } else if (err.code) {
      raw = String(err.code);
    }
  }
  if (raw === undefined) {
    raw = String(err?.message ?? err ?? "error");
  }
  const text = raw.trim() || "error";
  return text.length > HTTP_ERROR_MAX_LEN
    ? text.slice(0, HTTP_ERROR_MAX_LEN)
    : text;
}

/**
 * GET the gateway URL; success = HTTP 200–399.
 * URL must be set via HERMES_GATEWAY_HEALTH_URL or NERVE_HERMES_GATEWAY_URL.
 *
 * Never throws: every failure is folded into the returned record.
 *
 * @param url Absolute URL to probe; a falsy value yields `httpError: "missing_url"`.
 * @returns `{ httpOk, httpStatusCode, httpLatencyMs, httpError }` where
 *   `httpOk` is 1 or 0 and `httpError` is `""` on success.
 */
async function probeGatewayHttp(url) {
  if (!url) {
    return {
      httpOk: 0,
      httpStatusCode: 0,
      httpLatencyMs: 0,
      httpError: "missing_url",
    };
  }

  const startedAt = Date.now();
  try {
    const res = await fetch(url, {
      method: "GET",
      signal: AbortSignal.timeout(HTTP_TIMEOUT_MS),
      redirect: "follow",
    });
    // Latency is time-to-headers; the body is never read.
    const httpLatencyMs = Date.now() - startedAt;
    // Only the status matters. Cancel the unread body so the underlying
    // connection is released promptly; a failure to cancel is irrelevant.
    void res.body?.cancel().catch(() => {});

    const code = res.status;
    const ok = code >= 200 && code < 400;
    return {
      httpOk: ok ? 1 : 0,
      httpStatusCode: code,
      httpLatencyMs,
      httpError: ok ? "" : truncateHttpError({ message: `HTTP ${code}` }),
    };
  } catch (err) {
    return {
      httpOk: 0,
      httpStatusCode: 0,
      httpLatencyMs: Date.now() - startedAt,
      httpError: truncateHttpError(err),
    };
  }
}

/**
 * When `ps` lacks `etimes` (wall-clock seconds since start), parse `etime`
 * ([[dd-]hh:]mm:ss) into seconds. See ps(1) `etime` field description.
 *
 * @param etime Raw `etime` field text.
 * @returns Elapsed seconds, or 0 when the text is empty or not parseable.
 */
function etimeToSeconds(etime) {
  let rest = String(etime).trim();
  if (!rest) return 0;

  let days = 0;
  const dash = rest.indexOf("-");
  if (dash !== -1) {
    const parsedDays = Number.parseInt(rest.slice(0, dash), 10);
    days = Number.isFinite(parsedDays) ? parsedDays : 0;
    rest = rest.slice(dash + 1);
  }

  const fields = rest
    .split(":")
    .map((field) => Number.parseInt(String(field).trim(), 10));
  if (fields.length > 3 || fields.some((n) => !Number.isFinite(n))) return 0;

  // Fold left in base 60: ss, mm:ss and hh:mm:ss all reduce the same way.
  let seconds = 0;
  for (const field of fields) {
    seconds = seconds * 60 + field;
  }
  return Math.trunc(days * 86_400 + seconds);
}

/**
 * Run a command with `execFile` and always resolve (never reject on a
 * non-zero exit, spawn failure, or timeout).
 *
 * @param file Executable name or path.
 * @param args Argument vector.
 * @param opts Extra `execFile` options; they override the defaults
 *   (utf8 encoding, 8 MiB buffer, EXEC_TIMEOUT_MS deadline).
 * @returns `{ exitCode, errCode, stdout, stderr }`. `exitCode` is 0 on
 *   success, the process exit status on a non-zero exit, and -1 when there is
 *   no numeric status (spawn failure such as ENOENT, or kill by signal or
 *   timeout). `errCode` is the raw `error.code` (e.g. `"ENOENT"`).
 */
function execFileUtf8(file, args, opts = {}) {
  return new Promise((resolve) => {
    execFile(
      file,
      args,
      {
        encoding: "utf8",
        maxBuffer: 8 * 1024 * 1024,
        timeout: EXEC_TIMEOUT_MS,
        ...opts,
      },
      (err, stdout, stderr) => {
        // For the execFile callback the exit status is reported in
        // `error.code` (a number); a string code means the spawn failed.
        let exitCode = 0;
        if (err) {
          exitCode = typeof err.code === "number" ? err.code : -1;
        }
        resolve({
          exitCode,
          errCode: err?.code,
          stdout: String(stdout ?? ""),
          stderr: String(stderr ?? ""),
        });
      },
    );
  });
}

/**
 * Extract the number following `Main PID:` from `systemctl status` text.
 *
 * @returns The PID, or 0 when the label is absent or the value is zero.
 */
function parseMainPidFromStatus(text) {
  const found = text.match(/Main PID:\s*(\d+)/i);
  if (!found) return 0;
  return Math.trunc(Number.parseInt(found[1], 10)) || 0;
}

/**
 * Parse the `Active: <state> (<sub-state>)` line of `systemctl status` text.
 *
 * The first `Active:` line that carries a parenthesised sub-state decides the
 * result; lines without one are skipped.
 *
 * @returns `active` is true for state `active`; `subRunning` is true when the
 *   sub-state contains `running`. Both are false when no line matches.
 */
function parseActiveLineFromStatus(text) {
  for (const line of text.split("\n")) {
    if (!/^\s*Active:/i.test(line)) continue;
    const found = line.match(/Active:\s*(\S+)\s*\(([^)]*)\)/i);
    if (!found) continue;
    return {
      active: found[1].toLowerCase() === "active",
      subRunning: found[2].toLowerCase().includes("running"),
    };
  }
  return { active: false, subRunning: false };
}

/**
 * Parse `KEY=value` lines as printed by `systemctl show -p ...`.
 *
 * Only `MainPID`, `ActiveState` and `SubState` are read; if a key repeats the
 * last occurrence wins.
 *
 * @returns `{ mainPid, active, subRunning }` with 0 / false defaults.
 */
function parseSystemctlShow(text) {
  const pidKey = "MainPID=";
  const activeKey = "ActiveState=";
  const subKey = "SubState=";

  let mainPid = 0;
  let active = false;
  let subRunning = false;

  for (const rawLine of text.split("\n")) {
    const line = rawLine.trim();
    if (line.startsWith(pidKey)) {
      mainPid =
        Math.trunc(Number.parseInt(line.slice(pidKey.length), 10)) || 0;
    } else if (line.startsWith(activeKey)) {
      active = line.slice(activeKey.length).trim().toLowerCase() === "active";
    } else if (line.startsWith(subKey)) {
      subRunning =
        line.slice(subKey.length).trim().toLowerCase() === "running";
    }
  }
  return { mainPid, active, subRunning };
}

/**
 * Ask the user-level systemd instance about the `hermes-gateway` unit.
 *
 * `systemctl status` is parsed first; whenever it does not yield a PID and an
 * active/running state, `systemctl show` is consulted and fills in only the
 * pieces that are still missing.
 *
 * @returns `{ mainPid, systemdActiveRunning }`.
 */
async function readSystemdState() {
  const status = await execFileUtf8("systemctl", [
    "--user",
    "--no-pager",
    "status",
    "hermes-gateway",
  ]);
  const statusText = `${status.stdout}\n${status.stderr}`.trim();

  let mainPid = parseMainPidFromStatus(statusText);
  let { active, subRunning } = parseActiveLineFromStatus(statusText);

  const complete = mainPid > 0 && active && subRunning;
  if (!complete) {
    const show = await execFileUtf8("systemctl", [
      "--user",
      "--no-pager",
      "show",
      "hermes-gateway",
      "-p",
      "MainPID",
      "-p",
      "ActiveState",
      "-p",
      "SubState",
    ]);
    const shown = parseSystemctlShow(`${show.stdout}\n${show.stderr}`);
    if (mainPid <= 0 && shown.mainPid > 0) mainPid = shown.mainPid;
    if (!active) active = shown.active;
    if (!subRunning) subRunning = shown.subRunning;
  }

  return { mainPid, systemdActiveRunning: active && subRunning };
}

/**
 * Check whether `ps -p <pid>` reports the process.
 *
 * @returns false for a non-positive PID, when `ps` is not installed, or when
 *   `ps` prints nothing.
 */
async function processExists(mainPid) {
  if (mainPid <= 0) return false;
  const r = await execFileUtf8("ps", ["-p", String(mainPid), "-o", "pid="]);
  if (r.errCode === "ENOENT") return false;
  return r.stdout.trim().length > 0;
}

/**
 * Read resident set size, CPU percentage and uptime for a PID from `ps`.
 *
 * `etimes` (seconds) is tried first and `etime` ([[dd-]hh:]mm:ss) second.
 * Each column is requested with its own `-o` flag: ps(1) notes that the result
 * of a combined `-o a=,b=` argument varies between ps personalities and
 * recommends multiple `-o` options. The fallback is attempted whenever the
 * first call yields fewer than three fields, including when `ps` rejects
 * `etimes` outright and prints nothing to stdout.
 *
 * @returns `{ rssBytes, cpuPercent, uptimeSec }`; all zero when the PID is not
 *   positive, `ps` is missing, or neither form yields three fields.
 *   `rssBytes` is the `rss` column multiplied by 1024; `cpuPercent` is rounded
 *   to two decimals.
 */
async function readPsMetrics(mainPid) {
  const noMetrics = () => ({ rssBytes: 0, cpuPercent: 0, uptimeSec: 0 });
  if (mainPid <= 0) return noMetrics();

  for (const uptimeColumn of ["etimes=", "etime="]) {
    const r = await execFileUtf8("ps", [
      "-p",
      String(mainPid),
      "-o",
      "rss=",
      "-o",
      "%cpu=",
      "-o",
      uptimeColumn,
    ]);
    if (r.errCode === "ENOENT") return noMetrics();

    const parts = r.stdout.trim().split(/\s+/).filter(Boolean);
    if (parts.length < 3) continue;

    const rssKiB = Number(parts[0]);
    const cpu = Number(parts[1]);

    let uptimeSec;
    if (uptimeColumn === "etimes=") {
      const etimes = Number(parts[2]);
      uptimeSec = Number.isFinite(etimes) ? Math.trunc(etimes) : 0;
    } else {
      uptimeSec = etimeToSeconds(parts.slice(2).join(" "));
    }

    return {
      rssBytes: Number.isFinite(rssKiB) ? Math.trunc(rssKiB * 1024) : 0,
      cpuPercent: Number.isFinite(cpu) ? Math.round(cpu * 100) / 100 : 0,
      uptimeSec,
    };
  }
  return noMetrics();
}

/**
 * Pull a session count out of `hermes sessions stats` output.
 *
 * An `Active session(s):` line is preferred; a `Total session(s):` line is
 * used only when no active line exists. Matching is case-insensitive and
 * anchored at the start of a line.
 *
 * @returns The first matching count, or 0 when nothing matches.
 */
function parseActiveSessionsFromHermesStats(text) {
  const src = String(text);
  const patterns = [
    /^\s*Active\s+sessions?:\s*(\d+)/im,
    /^\s*Total\s+sessions?:\s*(\d+)/im,
  ];
  for (const pattern of patterns) {
    const found = pattern.exec(src);
    if (found) {
      const count = Math.trunc(Number.parseInt(found[1], 10));
      return Number.isFinite(count) ? count : 0;
    }
  }
  return 0;
}

/**
 * Run `hermes sessions stats` and parse the session count from its combined
 * stdout and stderr.
 *
 * @returns The count, or 0 when the CLI is missing or anything fails.
 */
async function readActiveSessions() {
  try {
    const r = await execFileUtf8("hermes", ["sessions", "stats"]);
    if (r.errCode === "ENOENT") return 0;
    return parseActiveSessionsFromHermesStats(`${r.stdout}\n${r.stderr}`);
  } catch {
    return 0;
  }
}

/**
 * Count processes whose parent PID is `mainPid`, via `ps --ppid`.
 *
 * @returns The number of non-blank output lines; 0 for a non-positive PID,
 *   when `ps` is missing, or on any failure.
 */
async function countDirectChildren(mainPid) {
  if (mainPid <= 0) return 0;
  try {
    const r = await execFileUtf8("ps", [
      "--no-headers",
      "-o",
      "pid",
      "--ppid",
      String(mainPid),
    ]);
    if (r.errCode === "ENOENT") return 0;
    return r.stdout.split("\n").filter((line) => line.trim()).length;
  } catch {
    return 0;
  }
}

/**
 * Take one health sample of the Hermes gateway, store it, and return it.
 *
 * Three independent probes run concurrently so the total wall time is bounded
 * by the slowest one rather than by the sum of their deadlines:
 *  - process: systemd state, then `ps` existence, metrics and child count;
 *  - sessions: `hermes sessions stats`;
 *  - HTTP: GET against the configured health URL.
 * Each probe is best-effort: a failure yields zero / empty values for its
 * fields and never aborts the sample.
 *
 * `alive` is 1 only when systemd reports active+running, a positive main PID
 * is known, and `ps` sees that PID. Process metrics and the child count are
 * stored as 0 whenever `alive` is 0.
 *
 * @param db Handle exposing `insert(table).values(row)`; the row is inserted
 *   into `hermesGatewayHealth`. A rejection from the insert propagates.
 * @param _peers Unused.
 * @returns A copy of the inserted row.
 */
export async function compute(db, _peers) {
  const ts = Date.now();

  const probeProcess = async () => {
    let mainPid = 0;
    let systemdActiveRunning = false;
    try {
      const st = await readSystemdState();
      mainPid = st.mainPid;
      systemdActiveRunning = st.systemdActiveRunning;
    } catch {
      mainPid = 0;
      systemdActiveRunning = false;
    }

    let psOk = false;
    try {
      psOk = await processExists(mainPid);
    } catch {
      psOk = false;
    }

    let rssBytes = 0;
    let cpuPercent = 0;
    let uptimeSec = 0;
    if (psOk) {
      try {
        const m = await readPsMetrics(mainPid);
        rssBytes = m.rssBytes;
        cpuPercent = m.cpuPercent;
        uptimeSec = m.uptimeSec;
      } catch {
        rssBytes = 0;
        cpuPercent = 0;
        uptimeSec = 0;
      }
    }

    const alive = systemdActiveRunning && mainPid > 0 && psOk ? 1 : 0;

    let childProcessCount = 0;
    if (alive) {
      try {
        childProcessCount = await countDirectChildren(mainPid);
      } catch {
        childProcessCount = 0;
      }
    }

    return { alive, mainPid, rssBytes, cpuPercent, uptimeSec, childProcessCount };
  };

  const probeSessions = async () => {
    try {
      return await readActiveSessions();
    } catch {
      return 0;
    }
  };

  const probeHttp = async () => {
    try {
      return await probeGatewayHttp(gatewayProbeUrl());
    } catch {
      return {
        httpOk: 0,
        httpStatusCode: 0,
        httpLatencyMs: 0,
        httpError: "probe_failed",
      };
    }
  };

  // None of the three closures can reject, so Promise.all cannot short-circuit.
  const [proc, activeSessions, http] = await Promise.all([
    probeProcess(),
    probeSessions(),
    probeHttp(),
  ]);

  const { alive } = proc;
  const row = {
    ts,
    alive,
    mainPid: proc.mainPid > 0 ? proc.mainPid : 0,
    rssBytes: alive ? proc.rssBytes : 0,
    cpuPercent: alive ? proc.cpuPercent : 0,
    uptimeSec: alive ? proc.uptimeSec : 0,
    activeSessions,
    childProcessCount: alive ? proc.childProcessCount : 0,
    httpOk: http.httpOk,
    httpStatusCode: http.httpStatusCode,
    httpLatencyMs: http.httpLatencyMs,
    httpError: http.httpError,
  };

  await db.insert(hermesGatewayHealth).values(row);

  return { ...row };
}