fix: harden gateway restart recovery
- store gateway PID metadata and validate the live process before trusting gateway.pid - auto-refresh outdated systemd user units before start/restart so installs pick up --replace fixes - sweep stray manual gateway processes after service stops - add regression tests for PID validation and service drift recovery
This commit is contained in:
parent
917adcbaf4
commit
eb8316ea69
4 changed files with 251 additions and 15 deletions
|
|
@ -11,10 +11,14 @@ that will be useful when we add named profiles (multiple agents running
|
|||
concurrently under distinct configurations).
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
_GATEWAY_KIND = "hermes-gateway"
|
||||
|
||||
|
||||
def _get_pid_path() -> Path:
|
||||
"""Return the path to the gateway PID file, respecting HERMES_HOME."""
|
||||
|
|
@ -22,11 +26,82 @@ def _get_pid_path() -> Path:
|
|||
return home / "gateway.pid"
|
||||
|
||||
|
||||
def _get_process_start_time(pid: int) -> Optional[int]:
|
||||
"""Return the kernel start time for a process when available."""
|
||||
stat_path = Path(f"/proc/{pid}/stat")
|
||||
try:
|
||||
# Field 22 in /proc/<pid>/stat is process start time (clock ticks).
|
||||
return int(stat_path.read_text().split()[21])
|
||||
except (FileNotFoundError, IndexError, PermissionError, ValueError, OSError):
|
||||
return None
|
||||
|
||||
|
||||
def _read_process_cmdline(pid: int) -> Optional[str]:
|
||||
"""Return the process command line as a space-separated string."""
|
||||
cmdline_path = Path(f"/proc/{pid}/cmdline")
|
||||
try:
|
||||
raw = cmdline_path.read_bytes()
|
||||
except (FileNotFoundError, PermissionError, OSError):
|
||||
return None
|
||||
|
||||
if not raw:
|
||||
return None
|
||||
return raw.replace(b"\x00", b" ").decode("utf-8", errors="ignore").strip()
|
||||
|
||||
|
||||
def _looks_like_gateway_process(pid: int) -> bool:
|
||||
"""Return True when the live PID still looks like the Hermes gateway."""
|
||||
cmdline = _read_process_cmdline(pid)
|
||||
if not cmdline:
|
||||
# If we cannot inspect the process, fall back to the liveness check.
|
||||
return True
|
||||
|
||||
patterns = (
|
||||
"hermes_cli.main gateway",
|
||||
"hermes gateway",
|
||||
"gateway/run.py",
|
||||
)
|
||||
return any(pattern in cmdline for pattern in patterns)
|
||||
|
||||
|
||||
def _build_pid_record() -> dict:
|
||||
return {
|
||||
"pid": os.getpid(),
|
||||
"kind": _GATEWAY_KIND,
|
||||
"argv": list(sys.argv),
|
||||
"start_time": _get_process_start_time(os.getpid()),
|
||||
}
|
||||
|
||||
|
||||
def _read_pid_record() -> Optional[dict]:
|
||||
pid_path = _get_pid_path()
|
||||
if not pid_path.exists():
|
||||
return None
|
||||
|
||||
raw = pid_path.read_text().strip()
|
||||
if not raw:
|
||||
return None
|
||||
|
||||
try:
|
||||
payload = json.loads(raw)
|
||||
except json.JSONDecodeError:
|
||||
try:
|
||||
return {"pid": int(raw)}
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
if isinstance(payload, int):
|
||||
return {"pid": payload}
|
||||
if isinstance(payload, dict):
|
||||
return payload
|
||||
return None
|
||||
|
||||
|
||||
def write_pid_file() -> None:
|
||||
"""Write the current process PID to the gateway PID file."""
|
||||
"""Write the current process PID and metadata to the gateway PID file."""
|
||||
pid_path = _get_pid_path()
|
||||
pid_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
pid_path.write_text(str(os.getpid()))
|
||||
pid_path.write_text(json.dumps(_build_pid_record()))
|
||||
|
||||
|
||||
def remove_pid_file() -> None:
|
||||
|
|
@ -43,18 +118,35 @@ def get_running_pid() -> Optional[int]:
|
|||
Checks the PID file and verifies the process is actually alive.
|
||||
Cleans up stale PID files automatically.
|
||||
"""
|
||||
pid_path = _get_pid_path()
|
||||
if not pid_path.exists():
|
||||
return None
|
||||
try:
|
||||
pid = int(pid_path.read_text().strip())
|
||||
os.kill(pid, 0) # signal 0 = existence check, no actual signal sent
|
||||
return pid
|
||||
except (ValueError, ProcessLookupError, PermissionError):
|
||||
# Stale PID file — process is gone
|
||||
record = _read_pid_record()
|
||||
if not record:
|
||||
remove_pid_file()
|
||||
return None
|
||||
|
||||
try:
|
||||
pid = int(record["pid"])
|
||||
except (KeyError, TypeError, ValueError):
|
||||
remove_pid_file()
|
||||
return None
|
||||
|
||||
try:
|
||||
os.kill(pid, 0) # signal 0 = existence check, no actual signal sent
|
||||
except (ProcessLookupError, PermissionError):
|
||||
remove_pid_file()
|
||||
return None
|
||||
|
||||
recorded_start = record.get("start_time")
|
||||
current_start = _get_process_start_time(pid)
|
||||
if recorded_start is not None and current_start is not None and current_start != recorded_start:
|
||||
remove_pid_file()
|
||||
return None
|
||||
|
||||
if not _looks_like_gateway_process(pid):
|
||||
remove_pid_file()
|
||||
return None
|
||||
|
||||
return pid
|
||||
|
||||
|
||||
def is_gateway_running() -> bool:
|
||||
"""Check if the gateway daemon is currently running."""
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue