Merge PR #614: fix: resolve systemd restart loop with --replace flag

Authored by voidborne-d. Fixes #576.

Adds a --replace flag to 'hermes gateway run' that terminates any existing
gateway instance (SIGTERM with SIGKILL fallback) before starting.
Updates the systemd unit template with --replace, ExecStop, KillMode,
KillSignal, and TimeoutStopSec for robust service management.
This commit is contained in:
teknium1 2026-03-07 16:33:27 -08:00
commit 39ee3512cb
3 changed files with 78 additions and 19 deletions

View file

@ -2459,34 +2459,77 @@ def _start_cron_ticker(stop_event: threading.Event, adapters=None, interval: int
logger.info("Cron ticker stopped") logger.info("Cron ticker stopped")
async def start_gateway(config: Optional[GatewayConfig] = None) -> bool: async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool = False) -> bool:
""" """
Start the gateway and run until interrupted. Start the gateway and run until interrupted.
This is the main entry point for running the gateway. This is the main entry point for running the gateway.
Returns True if the gateway ran successfully, False if it failed to start. Returns True if the gateway ran successfully, False if it failed to start.
A False return causes a non-zero exit code so systemd can auto-restart. A False return causes a non-zero exit code so systemd can auto-restart.
Args:
config: Optional gateway configuration override.
replace: If True, kill any existing gateway instance before starting.
Useful for systemd services to avoid restart-loop deadlocks
when the previous process hasn't fully exited yet.
""" """
# ── Duplicate-instance guard ────────────────────────────────────── # ── Duplicate-instance guard ──────────────────────────────────────
# Prevent two gateways from running under the same HERMES_HOME. # Prevent two gateways from running under the same HERMES_HOME.
# The PID file is scoped to HERMES_HOME, so future multi-profile # The PID file is scoped to HERMES_HOME, so future multi-profile
# setups (each profile using a distinct HERMES_HOME) will naturally # setups (each profile using a distinct HERMES_HOME) will naturally
# allow concurrent instances without tripping this guard. # allow concurrent instances without tripping this guard.
from gateway.status import get_running_pid import time as _time
from gateway.status import get_running_pid, remove_pid_file
existing_pid = get_running_pid() existing_pid = get_running_pid()
if existing_pid is not None and existing_pid != os.getpid(): if existing_pid is not None and existing_pid != os.getpid():
hermes_home = os.getenv("HERMES_HOME", "~/.hermes") if replace:
logger.error( logger.info(
"Another gateway instance is already running (PID %d, HERMES_HOME=%s). " "Replacing existing gateway instance (PID %d) with --replace.",
"Use 'hermes gateway restart' to replace it, or 'hermes gateway stop' first.", existing_pid,
existing_pid, hermes_home, )
) try:
print( os.kill(existing_pid, signal.SIGTERM)
f"\n❌ Gateway already running (PID {existing_pid}).\n" except ProcessLookupError:
f" Use 'hermes gateway restart' to replace it,\n" pass # Already gone
f" or 'hermes gateway stop' to kill it first.\n" except PermissionError:
) logger.error(
return False "Permission denied killing PID %d. Cannot replace.",
existing_pid,
)
return False
# Wait up to 10 seconds for the old process to exit
for _ in range(20):
try:
os.kill(existing_pid, 0)
_time.sleep(0.5)
except (ProcessLookupError, PermissionError):
break # Process is gone
else:
# Still alive after 10s — force kill
logger.warning(
"Old gateway (PID %d) did not exit after SIGTERM, sending SIGKILL.",
existing_pid,
)
try:
os.kill(existing_pid, signal.SIGKILL)
_time.sleep(0.5)
except (ProcessLookupError, PermissionError):
pass
remove_pid_file()
else:
hermes_home = os.getenv("HERMES_HOME", "~/.hermes")
logger.error(
"Another gateway instance is already running (PID %d, HERMES_HOME=%s). "
"Use 'hermes gateway restart' to replace it, or 'hermes gateway stop' first.",
existing_pid, hermes_home,
)
print(
f"\n❌ Gateway already running (PID {existing_pid}).\n"
f" Use 'hermes gateway restart' to replace it,\n"
f" or 'hermes gateway stop' to kill it first.\n"
f" Or use 'hermes gateway run --replace' to auto-replace.\n"
)
return False
# Sync bundled skills on gateway start (fast -- skips unchanged) # Sync bundled skills on gateway start (fast -- skips unchanged)
try: try:

View file

@ -154,19 +154,25 @@ def get_hermes_cli_path() -> str:
# ============================================================================= # =============================================================================
def generate_systemd_unit() -> str: def generate_systemd_unit() -> str:
import shutil
python_path = get_python_path() python_path = get_python_path()
working_dir = str(PROJECT_ROOT) working_dir = str(PROJECT_ROOT)
hermes_cli = shutil.which("hermes") or f"{python_path} -m hermes_cli.main"
return f"""[Unit] return f"""[Unit]
Description={SERVICE_DESCRIPTION} Description={SERVICE_DESCRIPTION}
After=network.target After=network.target
[Service] [Service]
Type=simple Type=simple
ExecStart={python_path} -m hermes_cli.main gateway run ExecStart={python_path} -m hermes_cli.main gateway run --replace
ExecStop={hermes_cli} gateway stop
WorkingDirectory={working_dir} WorkingDirectory={working_dir}
Restart=on-failure Restart=on-failure
RestartSec=10 RestartSec=10
KillMode=mixed
KillSignal=SIGTERM
TimeoutStopSec=15
StandardOutput=journal StandardOutput=journal
StandardError=journal StandardError=journal
@ -377,8 +383,15 @@ def launchd_status(deep: bool = False):
# Gateway Runner # Gateway Runner
# ============================================================================= # =============================================================================
def run_gateway(verbose: bool = False): def run_gateway(verbose: bool = False, replace: bool = False):
"""Run the gateway in foreground.""" """Run the gateway in foreground.
Args:
verbose: Enable verbose logging output.
replace: If True, kill any existing gateway instance before starting.
This prevents systemd restart loops when the old process
hasn't fully exited yet.
"""
sys.path.insert(0, str(PROJECT_ROOT)) sys.path.insert(0, str(PROJECT_ROOT))
from gateway.run import start_gateway from gateway.run import start_gateway
@ -393,7 +406,7 @@ def run_gateway(verbose: bool = False):
# Exit with code 1 if gateway fails to connect any platform, # Exit with code 1 if gateway fails to connect any platform,
# so systemd Restart=on-failure will retry on transient errors # so systemd Restart=on-failure will retry on transient errors
success = asyncio.run(start_gateway()) success = asyncio.run(start_gateway(replace=replace))
if not success: if not success:
sys.exit(1) sys.exit(1)
@ -765,7 +778,8 @@ def gateway_command(args):
# Default to run if no subcommand # Default to run if no subcommand
if subcmd is None or subcmd == "run": if subcmd is None or subcmd == "run":
verbose = getattr(args, 'verbose', False) verbose = getattr(args, 'verbose', False)
run_gateway(verbose) replace = getattr(args, 'replace', False)
run_gateway(verbose, replace=replace)
return return
if subcmd == "setup": if subcmd == "setup":

View file

@ -1315,6 +1315,8 @@ For more help on a command:
# gateway run (default) # gateway run (default)
gateway_run = gateway_subparsers.add_parser("run", help="Run gateway in foreground") gateway_run = gateway_subparsers.add_parser("run", help="Run gateway in foreground")
gateway_run.add_argument("-v", "--verbose", action="store_true") gateway_run.add_argument("-v", "--verbose", action="store_true")
gateway_run.add_argument("--replace", action="store_true",
help="Replace any existing gateway instance (useful for systemd)")
# gateway start # gateway start
gateway_start = gateway_subparsers.add_parser("start", help="Start gateway service") gateway_start = gateway_subparsers.add_parser("start", help="Start gateway service")