fix: resolve systemd restart loop with --replace flag (#576)
When running under systemd, the gateway could enter a restart loop in two scenarios:

1. The previous gateway process hasn't fully exited when systemd starts a new one, causing 'Gateway already running (PID ...)' → exit 1 → restart → same error → infinite loop.
2. The interactive CLI exits immediately in non-TTY mode, and systemd keeps restarting it.

Changes:

- Add a --replace flag to 'hermes gateway run' that gracefully kills any existing gateway instance (SIGTERM → wait 10s → SIGKILL) before starting, preventing the PID-lock deadlock.
- Update the generated systemd unit template to use --replace by default, add ExecStop for clean shutdown, and set KillMode=mixed and TimeoutStopSec=15 for proper process management.
- Existing behavior (without --replace) is unchanged: it still prints the error message and exits, and now also mentions the --replace option.

Fixes #576
This commit is contained in:
parent
23e84de830
commit
ee5daba061
3 changed files with 78 additions and 19 deletions
|
|
@ -2437,34 +2437,77 @@ def _start_cron_ticker(stop_event: threading.Event, adapters=None, interval: int
|
||||||
logger.info("Cron ticker stopped")
|
logger.info("Cron ticker stopped")
|
||||||
|
|
||||||
|
|
||||||
async def start_gateway(config: Optional[GatewayConfig] = None) -> bool:
|
async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool = False) -> bool:
|
||||||
"""
|
"""
|
||||||
Start the gateway and run until interrupted.
|
Start the gateway and run until interrupted.
|
||||||
|
|
||||||
This is the main entry point for running the gateway.
|
This is the main entry point for running the gateway.
|
||||||
Returns True if the gateway ran successfully, False if it failed to start.
|
Returns True if the gateway ran successfully, False if it failed to start.
|
||||||
A False return causes a non-zero exit code so systemd can auto-restart.
|
A False return causes a non-zero exit code so systemd can auto-restart.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config: Optional gateway configuration override.
|
||||||
|
replace: If True, kill any existing gateway instance before starting.
|
||||||
|
Useful for systemd services to avoid restart-loop deadlocks
|
||||||
|
when the previous process hasn't fully exited yet.
|
||||||
"""
|
"""
|
||||||
# ── Duplicate-instance guard ──────────────────────────────────────
|
# ── Duplicate-instance guard ──────────────────────────────────────
|
||||||
# Prevent two gateways from running under the same HERMES_HOME.
|
# Prevent two gateways from running under the same HERMES_HOME.
|
||||||
# The PID file is scoped to HERMES_HOME, so future multi-profile
|
# The PID file is scoped to HERMES_HOME, so future multi-profile
|
||||||
# setups (each profile using a distinct HERMES_HOME) will naturally
|
# setups (each profile using a distinct HERMES_HOME) will naturally
|
||||||
# allow concurrent instances without tripping this guard.
|
# allow concurrent instances without tripping this guard.
|
||||||
from gateway.status import get_running_pid
|
import time as _time
|
||||||
|
from gateway.status import get_running_pid, remove_pid_file
|
||||||
existing_pid = get_running_pid()
|
existing_pid = get_running_pid()
|
||||||
if existing_pid is not None and existing_pid != os.getpid():
|
if existing_pid is not None and existing_pid != os.getpid():
|
||||||
hermes_home = os.getenv("HERMES_HOME", "~/.hermes")
|
if replace:
|
||||||
logger.error(
|
logger.info(
|
||||||
"Another gateway instance is already running (PID %d, HERMES_HOME=%s). "
|
"Replacing existing gateway instance (PID %d) with --replace.",
|
||||||
"Use 'hermes gateway restart' to replace it, or 'hermes gateway stop' first.",
|
existing_pid,
|
||||||
existing_pid, hermes_home,
|
)
|
||||||
)
|
try:
|
||||||
print(
|
os.kill(existing_pid, signal.SIGTERM)
|
||||||
f"\n❌ Gateway already running (PID {existing_pid}).\n"
|
except ProcessLookupError:
|
||||||
f" Use 'hermes gateway restart' to replace it,\n"
|
pass # Already gone
|
||||||
f" or 'hermes gateway stop' to kill it first.\n"
|
except PermissionError:
|
||||||
)
|
logger.error(
|
||||||
return False
|
"Permission denied killing PID %d. Cannot replace.",
|
||||||
|
existing_pid,
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
# Wait up to 10 seconds for the old process to exit
|
||||||
|
for _ in range(20):
|
||||||
|
try:
|
||||||
|
os.kill(existing_pid, 0)
|
||||||
|
_time.sleep(0.5)
|
||||||
|
except (ProcessLookupError, PermissionError):
|
||||||
|
break # Process is gone
|
||||||
|
else:
|
||||||
|
# Still alive after 10s — force kill
|
||||||
|
logger.warning(
|
||||||
|
"Old gateway (PID %d) did not exit after SIGTERM, sending SIGKILL.",
|
||||||
|
existing_pid,
|
||||||
|
)
|
||||||
|
try:
|
||||||
|
os.kill(existing_pid, signal.SIGKILL)
|
||||||
|
_time.sleep(0.5)
|
||||||
|
except (ProcessLookupError, PermissionError):
|
||||||
|
pass
|
||||||
|
remove_pid_file()
|
||||||
|
else:
|
||||||
|
hermes_home = os.getenv("HERMES_HOME", "~/.hermes")
|
||||||
|
logger.error(
|
||||||
|
"Another gateway instance is already running (PID %d, HERMES_HOME=%s). "
|
||||||
|
"Use 'hermes gateway restart' to replace it, or 'hermes gateway stop' first.",
|
||||||
|
existing_pid, hermes_home,
|
||||||
|
)
|
||||||
|
print(
|
||||||
|
f"\n❌ Gateway already running (PID {existing_pid}).\n"
|
||||||
|
f" Use 'hermes gateway restart' to replace it,\n"
|
||||||
|
f" or 'hermes gateway stop' to kill it first.\n"
|
||||||
|
f" Or use 'hermes gateway run --replace' to auto-replace.\n"
|
||||||
|
)
|
||||||
|
return False
|
||||||
|
|
||||||
# Sync bundled skills on gateway start (fast -- skips unchanged)
|
# Sync bundled skills on gateway start (fast -- skips unchanged)
|
||||||
try:
|
try:
|
||||||
|
|
|
||||||
|
|
@ -154,19 +154,25 @@ def get_hermes_cli_path() -> str:
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
def generate_systemd_unit() -> str:
|
def generate_systemd_unit() -> str:
|
||||||
|
import shutil
|
||||||
python_path = get_python_path()
|
python_path = get_python_path()
|
||||||
working_dir = str(PROJECT_ROOT)
|
working_dir = str(PROJECT_ROOT)
|
||||||
|
|
||||||
|
hermes_cli = shutil.which("hermes") or f"{python_path} -m hermes_cli.main"
|
||||||
return f"""[Unit]
|
return f"""[Unit]
|
||||||
Description={SERVICE_DESCRIPTION}
|
Description={SERVICE_DESCRIPTION}
|
||||||
After=network.target
|
After=network.target
|
||||||
|
|
||||||
[Service]
|
[Service]
|
||||||
Type=simple
|
Type=simple
|
||||||
ExecStart={python_path} -m hermes_cli.main gateway run
|
ExecStart={python_path} -m hermes_cli.main gateway run --replace
|
||||||
|
ExecStop={hermes_cli} gateway stop
|
||||||
WorkingDirectory={working_dir}
|
WorkingDirectory={working_dir}
|
||||||
Restart=on-failure
|
Restart=on-failure
|
||||||
RestartSec=10
|
RestartSec=10
|
||||||
|
KillMode=mixed
|
||||||
|
KillSignal=SIGTERM
|
||||||
|
TimeoutStopSec=15
|
||||||
StandardOutput=journal
|
StandardOutput=journal
|
||||||
StandardError=journal
|
StandardError=journal
|
||||||
|
|
||||||
|
|
@ -377,8 +383,15 @@ def launchd_status(deep: bool = False):
|
||||||
# Gateway Runner
|
# Gateway Runner
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|
||||||
def run_gateway(verbose: bool = False):
|
def run_gateway(verbose: bool = False, replace: bool = False):
|
||||||
"""Run the gateway in foreground."""
|
"""Run the gateway in foreground.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
verbose: Enable verbose logging output.
|
||||||
|
replace: If True, kill any existing gateway instance before starting.
|
||||||
|
This prevents systemd restart loops when the old process
|
||||||
|
hasn't fully exited yet.
|
||||||
|
"""
|
||||||
sys.path.insert(0, str(PROJECT_ROOT))
|
sys.path.insert(0, str(PROJECT_ROOT))
|
||||||
|
|
||||||
from gateway.run import start_gateway
|
from gateway.run import start_gateway
|
||||||
|
|
@ -393,7 +406,7 @@ def run_gateway(verbose: bool = False):
|
||||||
|
|
||||||
# Exit with code 1 if gateway fails to connect any platform,
|
# Exit with code 1 if gateway fails to connect any platform,
|
||||||
# so systemd Restart=on-failure will retry on transient errors
|
# so systemd Restart=on-failure will retry on transient errors
|
||||||
success = asyncio.run(start_gateway())
|
success = asyncio.run(start_gateway(replace=replace))
|
||||||
if not success:
|
if not success:
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
@ -765,7 +778,8 @@ def gateway_command(args):
|
||||||
# Default to run if no subcommand
|
# Default to run if no subcommand
|
||||||
if subcmd is None or subcmd == "run":
|
if subcmd is None or subcmd == "run":
|
||||||
verbose = getattr(args, 'verbose', False)
|
verbose = getattr(args, 'verbose', False)
|
||||||
run_gateway(verbose)
|
replace = getattr(args, 'replace', False)
|
||||||
|
run_gateway(verbose, replace=replace)
|
||||||
return
|
return
|
||||||
|
|
||||||
if subcmd == "setup":
|
if subcmd == "setup":
|
||||||
|
|
|
||||||
|
|
@ -1315,6 +1315,8 @@ For more help on a command:
|
||||||
# gateway run (default)
|
# gateway run (default)
|
||||||
gateway_run = gateway_subparsers.add_parser("run", help="Run gateway in foreground")
|
gateway_run = gateway_subparsers.add_parser("run", help="Run gateway in foreground")
|
||||||
gateway_run.add_argument("-v", "--verbose", action="store_true")
|
gateway_run.add_argument("-v", "--verbose", action="store_true")
|
||||||
|
gateway_run.add_argument("--replace", action="store_true",
|
||||||
|
help="Replace any existing gateway instance (useful for systemd)")
|
||||||
|
|
||||||
# gateway start
|
# gateway start
|
||||||
gateway_start = gateway_subparsers.add_parser("start", help="Start gateway service")
|
gateway_start = gateway_subparsers.add_parser("start", help="Start gateway service")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue