fix: harden gateway restart recovery
- store gateway PID metadata and validate the live process before trusting gateway.pid - auto-refresh outdated systemd user units before start/restart so installs pick up --replace fixes - sweep stray manual gateway processes after service stops - add regression tests for PID validation and service drift recovery
This commit is contained in:
parent
917adcbaf4
commit
eb8316ea69
4 changed files with 251 additions and 15 deletions
|
|
@ -11,10 +11,14 @@ that will be useful when we add named profiles (multiple agents running
|
||||||
concurrently under distinct configurations).
|
concurrently under distinct configurations).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
import os
|
import os
|
||||||
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
|
||||||
|
_GATEWAY_KIND = "hermes-gateway"
|
||||||
|
|
||||||
|
|
||||||
def _get_pid_path() -> Path:
|
def _get_pid_path() -> Path:
|
||||||
"""Return the path to the gateway PID file, respecting HERMES_HOME."""
|
"""Return the path to the gateway PID file, respecting HERMES_HOME."""
|
||||||
|
|
@ -22,11 +26,82 @@ def _get_pid_path() -> Path:
|
||||||
return home / "gateway.pid"
|
return home / "gateway.pid"
|
||||||
|
|
||||||
|
|
||||||
|
def _get_process_start_time(pid: int) -> Optional[int]:
|
||||||
|
"""Return the kernel start time for a process when available."""
|
||||||
|
stat_path = Path(f"/proc/{pid}/stat")
|
||||||
|
try:
|
||||||
|
# Field 22 in /proc/<pid>/stat is process start time (clock ticks).
|
||||||
|
return int(stat_path.read_text().split()[21])
|
||||||
|
except (FileNotFoundError, IndexError, PermissionError, ValueError, OSError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _read_process_cmdline(pid: int) -> Optional[str]:
|
||||||
|
"""Return the process command line as a space-separated string."""
|
||||||
|
cmdline_path = Path(f"/proc/{pid}/cmdline")
|
||||||
|
try:
|
||||||
|
raw = cmdline_path.read_bytes()
|
||||||
|
except (FileNotFoundError, PermissionError, OSError):
|
||||||
|
return None
|
||||||
|
|
||||||
|
if not raw:
|
||||||
|
return None
|
||||||
|
return raw.replace(b"\x00", b" ").decode("utf-8", errors="ignore").strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _looks_like_gateway_process(pid: int) -> bool:
|
||||||
|
"""Return True when the live PID still looks like the Hermes gateway."""
|
||||||
|
cmdline = _read_process_cmdline(pid)
|
||||||
|
if not cmdline:
|
||||||
|
# If we cannot inspect the process, fall back to the liveness check.
|
||||||
|
return True
|
||||||
|
|
||||||
|
patterns = (
|
||||||
|
"hermes_cli.main gateway",
|
||||||
|
"hermes gateway",
|
||||||
|
"gateway/run.py",
|
||||||
|
)
|
||||||
|
return any(pattern in cmdline for pattern in patterns)
|
||||||
|
|
||||||
|
|
||||||
|
def _build_pid_record() -> dict:
|
||||||
|
return {
|
||||||
|
"pid": os.getpid(),
|
||||||
|
"kind": _GATEWAY_KIND,
|
||||||
|
"argv": list(sys.argv),
|
||||||
|
"start_time": _get_process_start_time(os.getpid()),
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _read_pid_record() -> Optional[dict]:
|
||||||
|
pid_path = _get_pid_path()
|
||||||
|
if not pid_path.exists():
|
||||||
|
return None
|
||||||
|
|
||||||
|
raw = pid_path.read_text().strip()
|
||||||
|
if not raw:
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
payload = json.loads(raw)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
try:
|
||||||
|
return {"pid": int(raw)}
|
||||||
|
except ValueError:
|
||||||
|
return None
|
||||||
|
|
||||||
|
if isinstance(payload, int):
|
||||||
|
return {"pid": payload}
|
||||||
|
if isinstance(payload, dict):
|
||||||
|
return payload
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
def write_pid_file() -> None:
|
def write_pid_file() -> None:
|
||||||
"""Write the current process PID to the gateway PID file."""
|
"""Write the current process PID and metadata to the gateway PID file."""
|
||||||
pid_path = _get_pid_path()
|
pid_path = _get_pid_path()
|
||||||
pid_path.parent.mkdir(parents=True, exist_ok=True)
|
pid_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
pid_path.write_text(str(os.getpid()))
|
pid_path.write_text(json.dumps(_build_pid_record()))
|
||||||
|
|
||||||
|
|
||||||
def remove_pid_file() -> None:
|
def remove_pid_file() -> None:
|
||||||
|
|
@ -43,18 +118,35 @@ def get_running_pid() -> Optional[int]:
|
||||||
Checks the PID file and verifies the process is actually alive.
|
Checks the PID file and verifies the process is actually alive.
|
||||||
Cleans up stale PID files automatically.
|
Cleans up stale PID files automatically.
|
||||||
"""
|
"""
|
||||||
pid_path = _get_pid_path()
|
record = _read_pid_record()
|
||||||
if not pid_path.exists():
|
if not record:
|
||||||
return None
|
|
||||||
try:
|
|
||||||
pid = int(pid_path.read_text().strip())
|
|
||||||
os.kill(pid, 0) # signal 0 = existence check, no actual signal sent
|
|
||||||
return pid
|
|
||||||
except (ValueError, ProcessLookupError, PermissionError):
|
|
||||||
# Stale PID file — process is gone
|
|
||||||
remove_pid_file()
|
remove_pid_file()
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
pid = int(record["pid"])
|
||||||
|
except (KeyError, TypeError, ValueError):
|
||||||
|
remove_pid_file()
|
||||||
|
return None
|
||||||
|
|
||||||
|
try:
|
||||||
|
os.kill(pid, 0) # signal 0 = existence check, no actual signal sent
|
||||||
|
except (ProcessLookupError, PermissionError):
|
||||||
|
remove_pid_file()
|
||||||
|
return None
|
||||||
|
|
||||||
|
recorded_start = record.get("start_time")
|
||||||
|
current_start = _get_process_start_time(pid)
|
||||||
|
if recorded_start is not None and current_start is not None and current_start != recorded_start:
|
||||||
|
remove_pid_file()
|
||||||
|
return None
|
||||||
|
|
||||||
|
if not _looks_like_gateway_process(pid):
|
||||||
|
remove_pid_file()
|
||||||
|
return None
|
||||||
|
|
||||||
|
return pid
|
||||||
|
|
||||||
|
|
||||||
def is_gateway_running() -> bool:
|
def is_gateway_running() -> bool:
|
||||||
"""Check if the gateway daemon is currently running."""
|
"""Check if the gateway daemon is currently running."""
|
||||||
|
|
|
||||||
|
|
@ -251,6 +251,34 @@ StandardError=journal
|
||||||
WantedBy=default.target
|
WantedBy=default.target
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
def _normalize_service_definition(text: str) -> str:
|
||||||
|
return "\n".join(line.rstrip() for line in text.strip().splitlines())
|
||||||
|
|
||||||
|
|
||||||
|
def systemd_unit_is_current() -> bool:
|
||||||
|
unit_path = get_systemd_unit_path()
|
||||||
|
if not unit_path.exists():
|
||||||
|
return False
|
||||||
|
|
||||||
|
installed = unit_path.read_text(encoding="utf-8")
|
||||||
|
expected = generate_systemd_unit()
|
||||||
|
return _normalize_service_definition(installed) == _normalize_service_definition(expected)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def refresh_systemd_unit_if_needed() -> bool:
|
||||||
|
"""Rewrite the installed user unit when the generated definition has changed."""
|
||||||
|
unit_path = get_systemd_unit_path()
|
||||||
|
if not unit_path.exists() or systemd_unit_is_current():
|
||||||
|
return False
|
||||||
|
|
||||||
|
unit_path.write_text(generate_systemd_unit(), encoding="utf-8")
|
||||||
|
subprocess.run(["systemctl", "--user", "daemon-reload"], check=True)
|
||||||
|
print("↻ Updated gateway service definition to match the current Hermes install")
|
||||||
|
return True
|
||||||
|
|
||||||
|
|
||||||
def systemd_install(force: bool = False):
|
def systemd_install(force: bool = False):
|
||||||
unit_path = get_systemd_unit_path()
|
unit_path = get_systemd_unit_path()
|
||||||
|
|
||||||
|
|
@ -289,17 +317,22 @@ def systemd_uninstall():
|
||||||
print("✓ Service uninstalled")
|
print("✓ Service uninstalled")
|
||||||
|
|
||||||
def systemd_start():
|
def systemd_start():
|
||||||
|
refresh_systemd_unit_if_needed()
|
||||||
subprocess.run(["systemctl", "--user", "start", SERVICE_NAME], check=True)
|
subprocess.run(["systemctl", "--user", "start", SERVICE_NAME], check=True)
|
||||||
print("✓ Service started")
|
print("✓ Service started")
|
||||||
|
|
||||||
|
|
||||||
def systemd_stop():
|
def systemd_stop():
|
||||||
subprocess.run(["systemctl", "--user", "stop", SERVICE_NAME], check=True)
|
subprocess.run(["systemctl", "--user", "stop", SERVICE_NAME], check=True)
|
||||||
print("✓ Service stopped")
|
print("✓ Service stopped")
|
||||||
|
|
||||||
|
|
||||||
def systemd_restart():
|
def systemd_restart():
|
||||||
|
refresh_systemd_unit_if_needed()
|
||||||
subprocess.run(["systemctl", "--user", "restart", SERVICE_NAME], check=True)
|
subprocess.run(["systemctl", "--user", "restart", SERVICE_NAME], check=True)
|
||||||
print("✓ Service restarted")
|
print("✓ Service restarted")
|
||||||
|
|
||||||
|
|
||||||
def systemd_status(deep: bool = False):
|
def systemd_status(deep: bool = False):
|
||||||
# Check if service unit file exists
|
# Check if service unit file exists
|
||||||
unit_path = get_systemd_unit_path()
|
unit_path = get_systemd_unit_path()
|
||||||
|
|
@ -308,6 +341,11 @@ def systemd_status(deep: bool = False):
|
||||||
print(" Run: hermes gateway install")
|
print(" Run: hermes gateway install")
|
||||||
return
|
return
|
||||||
|
|
||||||
|
if not systemd_unit_is_current():
|
||||||
|
print("⚠ Installed gateway service definition is outdated")
|
||||||
|
print(" Run: hermes gateway restart # auto-refreshes the unit")
|
||||||
|
print()
|
||||||
|
|
||||||
# Show detailed status first
|
# Show detailed status first
|
||||||
subprocess.run(
|
subprocess.run(
|
||||||
["systemctl", "--user", "status", SERVICE_NAME, "--no-pager"],
|
["systemctl", "--user", "status", SERVICE_NAME, "--no-pager"],
|
||||||
|
|
@ -1079,7 +1117,7 @@ def gateway_command(args):
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
elif subcmd == "stop":
|
elif subcmd == "stop":
|
||||||
# Try service first, fall back to killing processes directly
|
# Try service first, then sweep any stray/manual gateway processes.
|
||||||
service_available = False
|
service_available = False
|
||||||
|
|
||||||
if is_linux() and get_systemd_unit_path().exists():
|
if is_linux() and get_systemd_unit_path().exists():
|
||||||
|
|
@ -1094,14 +1132,15 @@ def gateway_command(args):
|
||||||
service_available = True
|
service_available = True
|
||||||
except subprocess.CalledProcessError:
|
except subprocess.CalledProcessError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
killed = kill_gateway_processes()
|
||||||
if not service_available:
|
if not service_available:
|
||||||
# Kill gateway processes directly
|
|
||||||
killed = kill_gateway_processes()
|
|
||||||
if killed:
|
if killed:
|
||||||
print(f"✓ Stopped {killed} gateway process(es)")
|
print(f"✓ Stopped {killed} gateway process(es)")
|
||||||
else:
|
else:
|
||||||
print("✗ No gateway processes found")
|
print("✗ No gateway processes found")
|
||||||
|
elif killed:
|
||||||
|
print(f"✓ Stopped {killed} additional manual gateway process(es)")
|
||||||
|
|
||||||
elif subcmd == "restart":
|
elif subcmd == "restart":
|
||||||
# Try service first, fall back to killing and restarting
|
# Try service first, fall back to killing and restarting
|
||||||
|
|
|
||||||
27
tests/gateway/test_status.py
Normal file
27
tests/gateway/test_status.py
Normal file
|
|
@ -0,0 +1,27 @@
|
||||||
|
"""Tests for gateway runtime status tracking."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
|
||||||
|
from gateway import status
|
||||||
|
|
||||||
|
|
||||||
|
class TestGatewayPidState:
|
||||||
|
def test_write_pid_file_records_gateway_metadata(self, tmp_path, monkeypatch):
|
||||||
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||||
|
|
||||||
|
status.write_pid_file()
|
||||||
|
|
||||||
|
payload = json.loads((tmp_path / "gateway.pid").read_text())
|
||||||
|
assert payload["pid"] == os.getpid()
|
||||||
|
assert payload["kind"] == "hermes-gateway"
|
||||||
|
assert isinstance(payload["argv"], list)
|
||||||
|
assert payload["argv"]
|
||||||
|
|
||||||
|
def test_get_running_pid_rejects_live_non_gateway_pid(self, tmp_path, monkeypatch):
|
||||||
|
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
|
||||||
|
pid_path = tmp_path / "gateway.pid"
|
||||||
|
pid_path.write_text(str(os.getpid()))
|
||||||
|
|
||||||
|
assert status.get_running_pid() is None
|
||||||
|
assert not pid_path.exists()
|
||||||
78
tests/hermes_cli/test_gateway_service.py
Normal file
78
tests/hermes_cli/test_gateway_service.py
Normal file
|
|
@ -0,0 +1,78 @@
|
||||||
|
"""Tests for gateway service management helpers."""
|
||||||
|
|
||||||
|
from types import SimpleNamespace
|
||||||
|
|
||||||
|
import hermes_cli.gateway as gateway_cli
|
||||||
|
|
||||||
|
|
||||||
|
class TestSystemdServiceRefresh:
|
||||||
|
def test_systemd_start_refreshes_outdated_unit(self, tmp_path, monkeypatch):
|
||||||
|
unit_path = tmp_path / "hermes-gateway.service"
|
||||||
|
unit_path.write_text("old unit\n", encoding="utf-8")
|
||||||
|
|
||||||
|
monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda: unit_path)
|
||||||
|
monkeypatch.setattr(gateway_cli, "generate_systemd_unit", lambda: "new unit\n")
|
||||||
|
|
||||||
|
calls = []
|
||||||
|
|
||||||
|
def fake_run(cmd, check=True, **kwargs):
|
||||||
|
calls.append(cmd)
|
||||||
|
return SimpleNamespace(returncode=0, stdout="", stderr="")
|
||||||
|
|
||||||
|
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||||
|
|
||||||
|
gateway_cli.systemd_start()
|
||||||
|
|
||||||
|
assert unit_path.read_text(encoding="utf-8") == "new unit\n"
|
||||||
|
assert calls[:2] == [
|
||||||
|
["systemctl", "--user", "daemon-reload"],
|
||||||
|
["systemctl", "--user", "start", gateway_cli.SERVICE_NAME],
|
||||||
|
]
|
||||||
|
|
||||||
|
def test_systemd_restart_refreshes_outdated_unit(self, tmp_path, monkeypatch):
|
||||||
|
unit_path = tmp_path / "hermes-gateway.service"
|
||||||
|
unit_path.write_text("old unit\n", encoding="utf-8")
|
||||||
|
|
||||||
|
monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda: unit_path)
|
||||||
|
monkeypatch.setattr(gateway_cli, "generate_systemd_unit", lambda: "new unit\n")
|
||||||
|
|
||||||
|
calls = []
|
||||||
|
|
||||||
|
def fake_run(cmd, check=True, **kwargs):
|
||||||
|
calls.append(cmd)
|
||||||
|
return SimpleNamespace(returncode=0, stdout="", stderr="")
|
||||||
|
|
||||||
|
monkeypatch.setattr(gateway_cli.subprocess, "run", fake_run)
|
||||||
|
|
||||||
|
gateway_cli.systemd_restart()
|
||||||
|
|
||||||
|
assert unit_path.read_text(encoding="utf-8") == "new unit\n"
|
||||||
|
assert calls[:2] == [
|
||||||
|
["systemctl", "--user", "daemon-reload"],
|
||||||
|
["systemctl", "--user", "restart", gateway_cli.SERVICE_NAME],
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
class TestGatewayStopCleanup:
|
||||||
|
def test_stop_sweeps_manual_gateway_processes_after_service_stop(self, tmp_path, monkeypatch):
|
||||||
|
unit_path = tmp_path / "hermes-gateway.service"
|
||||||
|
unit_path.write_text("unit\n", encoding="utf-8")
|
||||||
|
|
||||||
|
monkeypatch.setattr(gateway_cli, "is_linux", lambda: True)
|
||||||
|
monkeypatch.setattr(gateway_cli, "is_macos", lambda: False)
|
||||||
|
monkeypatch.setattr(gateway_cli, "get_systemd_unit_path", lambda: unit_path)
|
||||||
|
|
||||||
|
service_calls = []
|
||||||
|
kill_calls = []
|
||||||
|
|
||||||
|
monkeypatch.setattr(gateway_cli, "systemd_stop", lambda: service_calls.append("stop"))
|
||||||
|
monkeypatch.setattr(
|
||||||
|
gateway_cli,
|
||||||
|
"kill_gateway_processes",
|
||||||
|
lambda force=False: kill_calls.append(force) or 2,
|
||||||
|
)
|
||||||
|
|
||||||
|
gateway_cli.gateway_command(SimpleNamespace(gateway_command="stop"))
|
||||||
|
|
||||||
|
assert service_calls == ["stop"]
|
||||||
|
assert kill_calls == [False]
|
||||||
Loading…
Add table
Add a link
Reference in a new issue