fix: prevent duplicate gateway instances from running simultaneously

start_gateway() now checks for an existing running instance via PID file
before starting. If another gateway is already running under the same
HERMES_HOME, it refuses to start with a clear error message directing the
user to 'hermes gateway restart' or 'hermes gateway stop'.

Also fixes gateway/status.py to respect the HERMES_HOME env var instead of
hardcoding ~/.hermes. This scopes the PID file per HERMES_HOME directory,
which lays the groundwork for future multi-profile support where distinct
HERMES_HOME directories can run concurrent gateway instances independently.
This commit is contained in:
teknium1 2026-03-05 20:35:33 -08:00
parent 2317d115cd
commit 014a5b712d
2 changed files with 57 additions and 14 deletions

View file

@ -2389,6 +2389,27 @@ async def start_gateway(config: Optional[GatewayConfig] = None) -> bool:
Returns True if the gateway ran successfully, False if it failed to start. Returns True if the gateway ran successfully, False if it failed to start.
A False return causes a non-zero exit code so systemd can auto-restart. A False return causes a non-zero exit code so systemd can auto-restart.
""" """
# ── Duplicate-instance guard ──────────────────────────────────────
# Prevent two gateways from running under the same HERMES_HOME.
# The PID file is scoped to HERMES_HOME, so future multi-profile
# setups (each profile using a distinct HERMES_HOME) will naturally
# allow concurrent instances without tripping this guard.
from gateway.status import get_running_pid
existing_pid = get_running_pid()
if existing_pid is not None and existing_pid != os.getpid():
hermes_home = os.getenv("HERMES_HOME", "~/.hermes")
logger.error(
"Another gateway instance is already running (PID %d, HERMES_HOME=%s). "
"Use 'hermes gateway restart' to replace it, or 'hermes gateway stop' first.",
existing_pid, hermes_home,
)
print(
f"\n❌ Gateway already running (PID {existing_pid}).\n"
f" Use 'hermes gateway restart' to replace it,\n"
f" or 'hermes gateway stop' to kill it first.\n"
)
return False
# Configure rotating file log so gateway output is persisted for debugging # Configure rotating file log so gateway output is persisted for debugging
log_dir = _hermes_home / 'logs' log_dir = _hermes_home / 'logs'
log_dir.mkdir(parents=True, exist_ok=True) log_dir.mkdir(parents=True, exist_ok=True)

View file

@ -3,37 +3,59 @@ Gateway runtime status helpers.
Provides PID-file based detection of whether the gateway daemon is running, Provides PID-file based detection of whether the gateway daemon is running,
used by send_message's check_fn to gate availability in the CLI. used by send_message's check_fn to gate availability in the CLI.
The PID file lives at ``{HERMES_HOME}/gateway.pid``. HERMES_HOME defaults to
``~/.hermes`` but can be overridden via the environment variable. This means
separate HERMES_HOME directories naturally get separate PID files — a property
that will be useful when we add named profiles (multiple agents running
concurrently under distinct configurations).
""" """
import os import os
from pathlib import Path from pathlib import Path
from typing import Optional
_PID_FILE = Path.home() / ".hermes" / "gateway.pid"
def _get_pid_path() -> Path:
    """Locate the gateway PID file.

    The file is ``gateway.pid`` inside the directory named by the
    ``HERMES_HOME`` environment variable, or inside ``~/.hermes`` when that
    variable is not set.
    """
    default_home = Path.home() / ".hermes"
    hermes_home = os.getenv("HERMES_HOME", default_home)
    return Path(hermes_home) / "gateway.pid"
def write_pid_file() -> None:
    """Record this process's PID in the gateway PID file.

    The PID file's parent directory is created first if it does not exist.
    """
    target = _get_pid_path()
    target.parent.mkdir(parents=True, exist_ok=True)
    current_pid = os.getpid()
    target.write_text(str(current_pid))
def remove_pid_file() -> None:
    """Delete the gateway PID file, ignoring the case where it is absent.

    This is best-effort cleanup: any error raised while resolving or
    deleting the file is swallowed so callers never fail on it.
    """
    try:
        pid_file = _get_pid_path()
        pid_file.unlink(missing_ok=True)
    except Exception:
        # Deliberately ignored -- failing to clean up must not propagate.
        pass
def get_running_pid() -> Optional[int]:
    """Return the PID of a running gateway instance, or ``None``.

    Reads the PID file and probes the recorded process with signal 0.
    A PID file that is malformed, unreadable due to permissions, holds a
    non-positive or out-of-range PID, or names a process that cannot be
    signalled is treated as stale and removed automatically.

    Returns:
        The PID recorded in the PID file if the probe succeeds, otherwise
        ``None``.
    """
    pid_path = _get_pid_path()
    try:
        pid = int(pid_path.read_text().strip())
    except FileNotFoundError:
        # No PID file -- or it was removed between two calls. Reading
        # directly (instead of checking exists() first) avoids that race.
        return None
    except (ValueError, PermissionError):
        # Corrupt or unreadable PID file -- treat as stale.
        remove_pid_file()
        return None

    if pid <= 0:
        # kill() gives 0 and negative PIDs process-group / broadcast
        # semantics, so probing them would "succeed" without identifying
        # any single gateway process. Such a file can only be corrupt.
        remove_pid_file()
        return None

    try:
        os.kill(pid, 0)  # signal 0 = existence check, no actual signal sent
    except (ProcessLookupError, PermissionError, OverflowError):
        # ProcessLookupError: the process is gone.
        # OverflowError: the recorded number is too large to be a real PID.
        # PermissionError: a process with this PID exists but we may not
        # signal it; kept as "stale" to match existing behaviour --
        # NOTE(review): presumably it cannot be this user's gateway; confirm.
        remove_pid_file()
        return None
    return pid
def is_gateway_running() -> bool:
    """Report whether the gateway daemon is currently running.

    Delegates to ``get_running_pid()``: the gateway counts as running
    exactly when that call yields a PID.
    """
    running_pid = get_running_pid()
    return running_pid is not None