fix: resolve systemd restart loop with --replace flag (#576)

When running under systemd, the gateway could enter restart loops in two
scenarios:

1. The previous gateway process hasn't fully exited when systemd starts
   a new one, causing 'Gateway already running (PID ...)' → exit 1 →
   restart → same error → infinite loop.

2. The interactive CLI exits immediately in non-TTY mode, and systemd
   keeps restarting it.

Changes:

- Add --replace flag to 'hermes gateway run' that gracefully kills any
  existing gateway instance (SIGTERM → wait 10s → SIGKILL) before
  starting, preventing the PID-lock deadlock.

- Update the generated systemd unit template to use --replace by default,
  add ExecStop for clean shutdown, and set KillMode=mixed,
  KillSignal=SIGTERM and TimeoutStopSec=15 for proper process management.

- Behavior without --replace is otherwise unchanged: the gateway still
  prints the "already running" error and exits non-zero; the message now
  also mentions the --replace option.

Fixes #576
This commit is contained in:
d 🔹 2026-03-07 18:08:12 +00:00
parent 23e84de830
commit ee5daba061
3 changed files with 78 additions and 19 deletions

View file

@ -2437,34 +2437,77 @@ def _start_cron_ticker(stop_event: threading.Event, adapters=None, interval: int
logger.info("Cron ticker stopped") logger.info("Cron ticker stopped")
async def start_gateway(config: Optional[GatewayConfig] = None) -> bool: async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool = False) -> bool:
""" """
Start the gateway and run until interrupted. Start the gateway and run until interrupted.
This is the main entry point for running the gateway. This is the main entry point for running the gateway.
Returns True if the gateway ran successfully, False if it failed to start. Returns True if the gateway ran successfully, False if it failed to start.
A False return causes a non-zero exit code so systemd can auto-restart. A False return causes a non-zero exit code so systemd can auto-restart.
Args:
config: Optional gateway configuration override.
replace: If True, kill any existing gateway instance before starting.
Useful for systemd services to avoid restart-loop deadlocks
when the previous process hasn't fully exited yet.
""" """
# ── Duplicate-instance guard ────────────────────────────────────── # ── Duplicate-instance guard ──────────────────────────────────────
# Prevent two gateways from running under the same HERMES_HOME. # Prevent two gateways from running under the same HERMES_HOME.
# The PID file is scoped to HERMES_HOME, so future multi-profile # The PID file is scoped to HERMES_HOME, so future multi-profile
# setups (each profile using a distinct HERMES_HOME) will naturally # setups (each profile using a distinct HERMES_HOME) will naturally
# allow concurrent instances without tripping this guard. # allow concurrent instances without tripping this guard.
from gateway.status import get_running_pid import time as _time
from gateway.status import get_running_pid, remove_pid_file
existing_pid = get_running_pid() existing_pid = get_running_pid()
if existing_pid is not None and existing_pid != os.getpid(): if existing_pid is not None and existing_pid != os.getpid():
hermes_home = os.getenv("HERMES_HOME", "~/.hermes") if replace:
logger.error( logger.info(
"Another gateway instance is already running (PID %d, HERMES_HOME=%s). " "Replacing existing gateway instance (PID %d) with --replace.",
"Use 'hermes gateway restart' to replace it, or 'hermes gateway stop' first.", existing_pid,
existing_pid, hermes_home, )
) try:
print( os.kill(existing_pid, signal.SIGTERM)
f"\n❌ Gateway already running (PID {existing_pid}).\n" except ProcessLookupError:
f" Use 'hermes gateway restart' to replace it,\n" pass # Already gone
f" or 'hermes gateway stop' to kill it first.\n" except PermissionError:
) logger.error(
return False "Permission denied killing PID %d. Cannot replace.",
existing_pid,
)
return False
# Wait up to 10 seconds for the old process to exit
for _ in range(20):
try:
os.kill(existing_pid, 0)
_time.sleep(0.5)
except (ProcessLookupError, PermissionError):
break # Process is gone
else:
# Still alive after 10s — force kill
logger.warning(
"Old gateway (PID %d) did not exit after SIGTERM, sending SIGKILL.",
existing_pid,
)
try:
os.kill(existing_pid, signal.SIGKILL)
_time.sleep(0.5)
except (ProcessLookupError, PermissionError):
pass
remove_pid_file()
else:
hermes_home = os.getenv("HERMES_HOME", "~/.hermes")
logger.error(
"Another gateway instance is already running (PID %d, HERMES_HOME=%s). "
"Use 'hermes gateway restart' to replace it, or 'hermes gateway stop' first.",
existing_pid, hermes_home,
)
print(
f"\n❌ Gateway already running (PID {existing_pid}).\n"
f" Use 'hermes gateway restart' to replace it,\n"
f" or 'hermes gateway stop' to kill it first.\n"
f" Or use 'hermes gateway run --replace' to auto-replace.\n"
)
return False
# Sync bundled skills on gateway start (fast -- skips unchanged) # Sync bundled skills on gateway start (fast -- skips unchanged)
try: try:

View file

@ -154,19 +154,25 @@ def get_hermes_cli_path() -> str:
# ============================================================================= # =============================================================================
def generate_systemd_unit() -> str: def generate_systemd_unit() -> str:
import shutil
python_path = get_python_path() python_path = get_python_path()
working_dir = str(PROJECT_ROOT) working_dir = str(PROJECT_ROOT)
hermes_cli = shutil.which("hermes") or f"{python_path} -m hermes_cli.main"
return f"""[Unit] return f"""[Unit]
Description={SERVICE_DESCRIPTION} Description={SERVICE_DESCRIPTION}
After=network.target After=network.target
[Service] [Service]
Type=simple Type=simple
ExecStart={python_path} -m hermes_cli.main gateway run ExecStart={python_path} -m hermes_cli.main gateway run --replace
ExecStop={hermes_cli} gateway stop
WorkingDirectory={working_dir} WorkingDirectory={working_dir}
Restart=on-failure Restart=on-failure
RestartSec=10 RestartSec=10
KillMode=mixed
KillSignal=SIGTERM
TimeoutStopSec=15
StandardOutput=journal StandardOutput=journal
StandardError=journal StandardError=journal
@ -377,8 +383,15 @@ def launchd_status(deep: bool = False):
# Gateway Runner # Gateway Runner
# ============================================================================= # =============================================================================
def run_gateway(verbose: bool = False): def run_gateway(verbose: bool = False, replace: bool = False):
"""Run the gateway in foreground.""" """Run the gateway in foreground.
Args:
verbose: Enable verbose logging output.
replace: If True, kill any existing gateway instance before starting.
This prevents systemd restart loops when the old process
hasn't fully exited yet.
"""
sys.path.insert(0, str(PROJECT_ROOT)) sys.path.insert(0, str(PROJECT_ROOT))
from gateway.run import start_gateway from gateway.run import start_gateway
@ -393,7 +406,7 @@ def run_gateway(verbose: bool = False):
# Exit with code 1 if gateway fails to connect any platform, # Exit with code 1 if gateway fails to connect any platform,
# so systemd Restart=on-failure will retry on transient errors # so systemd Restart=on-failure will retry on transient errors
success = asyncio.run(start_gateway()) success = asyncio.run(start_gateway(replace=replace))
if not success: if not success:
sys.exit(1) sys.exit(1)
@ -765,7 +778,8 @@ def gateway_command(args):
# Default to run if no subcommand # Default to run if no subcommand
if subcmd is None or subcmd == "run": if subcmd is None or subcmd == "run":
verbose = getattr(args, 'verbose', False) verbose = getattr(args, 'verbose', False)
run_gateway(verbose) replace = getattr(args, 'replace', False)
run_gateway(verbose, replace=replace)
return return
if subcmd == "setup": if subcmd == "setup":

View file

@ -1315,6 +1315,8 @@ For more help on a command:
# gateway run (default) # gateway run (default)
gateway_run = gateway_subparsers.add_parser("run", help="Run gateway in foreground") gateway_run = gateway_subparsers.add_parser("run", help="Run gateway in foreground")
gateway_run.add_argument("-v", "--verbose", action="store_true") gateway_run.add_argument("-v", "--verbose", action="store_true")
gateway_run.add_argument("--replace", action="store_true",
help="Replace any existing gateway instance (useful for systemd)")
# gateway start # gateway start
gateway_start = gateway_subparsers.add_parser("start", help="Start gateway service") gateway_start = gateway_subparsers.add_parser("start", help="Start gateway service")