From f3a38c90fc64028956fe30902f934ece75424bfd Mon Sep 17 00:00:00 2001 From: teyrebaz33 Date: Thu, 12 Mar 2026 18:02:21 +0300 Subject: [PATCH 1/2] fix(gateway): fall back to sys.executable -m hermes_cli.main when hermes not on PATH When shutil.which('hermes') returns None, _resolve_hermes_bin() now tries sys.executable -m hermes_cli.main as a fallback. This handles setups where Hermes is launched via a venv or module invocation and the hermes symlink is not on PATH for the gateway process. Fixes #1049 --- gateway/run.py | 41 ++++++++++++++-- tests/gateway/test_update_command.py | 73 ++++++++++++++++++++++++++-- 2 files changed, 107 insertions(+), 7 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index 5ab74972..d52092c4 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -215,6 +215,33 @@ def _resolve_gateway_model() -> str: return model +def _resolve_hermes_bin() -> Optional[list[str]]: + """Resolve the Hermes update command as argv parts. + + Tries in order: + 1. ``shutil.which("hermes")`` — standard PATH lookup + 2. ``sys.executable -m hermes_cli.main`` — fallback when Hermes is running + from a venv/module invocation and the ``hermes`` shim is not on PATH + + Returns argv parts ready for quoting/joining, or ``None`` if neither works. + """ + import shutil + + hermes_bin = shutil.which("hermes") + if hermes_bin: + return [hermes_bin] + + try: + import importlib.util + + if importlib.util.find_spec("hermes_cli") is not None: + return [sys.executable, "-m", "hermes_cli.main"] + except Exception: + pass + + return None + + class GatewayRunner: """ Main gateway controller. @@ -3155,9 +3182,14 @@ class GatewayRunner: if not git_dir.exists(): return "✗ Not a git repository — cannot update." - hermes_bin = shutil.which("hermes") - if not hermes_bin: - return "✗ `hermes` command not found on PATH." + hermes_cmd = _resolve_hermes_bin() + if not hermes_cmd: + return ( + "✗ Could not locate the `hermes` command. " + "Hermes is running, but the update command could not find the " + "executable on PATH or via the current Python interpreter. " + "Try running `hermes update` manually in your terminal." + ) pending_path = _hermes_home / ".update_pending.json" output_path = _hermes_home / ".update_output.txt" @@ -3173,8 +3205,9 @@ class GatewayRunner: # Spawn `hermes update` in a separate cgroup so it survives gateway # restart. systemd-run --user --scope creates a transient scope unit. + hermes_cmd_str = " ".join(shlex.quote(part) for part in hermes_cmd) update_cmd = ( - f"{shlex.quote(hermes_bin)} update > {shlex.quote(str(output_path))} 2>&1; " + f"{hermes_cmd_str} update > {shlex.quote(str(output_path))} 2>&1; " f"status=$?; printf '%s' \"$status\" > {shlex.quote(str(exit_code_path))}" ) try: diff --git a/tests/gateway/test_update_command.py b/tests/gateway/test_update_command.py index 12474563..ac9beac1 100644 --- a/tests/gateway/test_update_command.py +++ b/tests/gateway/test_update_command.py @@ -88,7 +88,7 @@ class TestHandleUpdateCommand: @pytest.mark.asyncio async def test_no_hermes_binary(self, tmp_path): - """Returns error when hermes is not on PATH.""" + """Returns error when hermes is not on PATH and hermes_cli is not importable.""" runner = _make_runner() event = _make_event() @@ -102,10 +102,77 @@ class TestHandleUpdateCommand: with patch("gateway.run._hermes_home", tmp_path), \ patch("gateway.run.__file__", fake_file), \ - patch("shutil.which", return_value=None): + patch("shutil.which", return_value=None), \ + patch("importlib.util.find_spec", return_value=None): result = await runner._handle_update_command(event) - assert "not found on PATH" in result + assert "Could not locate" in result + assert "hermes update" in result + + @pytest.mark.asyncio + async def test_fallback_to_sys_executable(self, tmp_path): + """Falls back to sys.executable -m hermes_cli.main when hermes not on PATH.""" + runner = _make_runner() + event = _make_event() + + fake_root = tmp_path / "project" + fake_root.mkdir() + (fake_root / ".git").mkdir() + (fake_root / "gateway").mkdir() + (fake_root / "gateway" / "run.py").touch() + fake_file = str(fake_root / "gateway" / "run.py") + hermes_home = tmp_path / "hermes" + hermes_home.mkdir() + + mock_popen = MagicMock() + fake_spec = MagicMock() + + with patch("gateway.run._hermes_home", hermes_home), \ + patch("gateway.run.__file__", fake_file), \ + patch("shutil.which", return_value=None), \ + patch("importlib.util.find_spec", return_value=fake_spec), \ + patch("subprocess.Popen", mock_popen): + result = await runner._handle_update_command(event) + + assert "Starting Hermes update" in result + call_args = mock_popen.call_args[0][0] + # The update_cmd uses sys.executable -m hermes_cli.main + joined = " ".join(call_args) if isinstance(call_args, list) else call_args + assert "hermes_cli.main" in joined or "bash" in call_args[0] + + @pytest.mark.asyncio + async def test_resolve_hermes_bin_prefers_which(self, tmp_path): + """_resolve_hermes_bin returns argv parts from shutil.which when available.""" + from gateway.run import _resolve_hermes_bin + + with patch("shutil.which", return_value="/custom/path/hermes"): + result = _resolve_hermes_bin() + + assert result == ["/custom/path/hermes"] + + @pytest.mark.asyncio + async def test_resolve_hermes_bin_fallback(self): + """_resolve_hermes_bin falls back to sys.executable argv when which fails.""" + import sys + from gateway.run import _resolve_hermes_bin + + fake_spec = MagicMock() + with patch("shutil.which", return_value=None), \ + patch("importlib.util.find_spec", return_value=fake_spec): + result = _resolve_hermes_bin() + + assert result == [sys.executable, "-m", "hermes_cli.main"] + + @pytest.mark.asyncio + async def test_resolve_hermes_bin_returns_none_when_both_fail(self): + """_resolve_hermes_bin returns None when both strategies fail.""" + from gateway.run import _resolve_hermes_bin + + with patch("shutil.which", return_value=None), \ + patch("importlib.util.find_spec", return_value=None): + result = _resolve_hermes_bin() + + assert result is None @pytest.mark.asyncio async def test_writes_pending_marker(self, tmp_path): From ed0c7194ed64b716f8ad5aab6e860505591af4d6 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Sat, 14 Mar 2026 18:03:50 -0700 Subject: [PATCH 2/2] fix: preserve current gateway update and startup behavior Follow up on salvaged PR #1052. Restore current-main gateway lifecycle handling after conflict resolution and adapt the update fallback to use shell-quoted argv parts safely. --- gateway/run.py | 75 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 73 insertions(+), 2 deletions(-) diff --git a/gateway/run.py b/gateway/run.py index d52092c4..e97db025 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -272,6 +272,8 @@ class GatewayRunner: self.delivery_router = DeliveryRouter(self.config) self._running = False self._shutdown_event = asyncio.Event() + self._exit_cleanly = False + self._exit_reason: Optional[str] = None # Track running agents per session for interrupt support # Key: session_key, Value: AIAgent instance @@ -490,6 +492,41 @@ class GatewayRunner: """Run the sync memory flush in a thread pool so it won't block the event loop.""" loop = asyncio.get_event_loop() await loop.run_in_executor(None, self._flush_memories_for_session, old_session_id) + + @property + def should_exit_cleanly(self) -> bool: + return self._exit_cleanly + + @property + def exit_reason(self) -> Optional[str]: + return self._exit_reason + + async def _handle_adapter_fatal_error(self, adapter: BasePlatformAdapter) -> None: + """React to a non-retryable adapter failure after startup.""" + logger.error( + "Fatal %s adapter error (%s): %s", + adapter.platform.value, + adapter.fatal_error_code or "unknown", + adapter.fatal_error_message or "unknown error", + ) + + existing = self.adapters.get(adapter.platform) + if existing is adapter: + try: + await adapter.disconnect() + finally: + self.adapters.pop(adapter.platform, None) + self.delivery_router.adapters = self.adapters + + if not self.adapters: + self._exit_reason = adapter.fatal_error_message or "All messaging adapters disconnected" + logger.error("No connected messaging platforms remain. Shutting down gateway cleanly.") + await self.stop() + + def _request_clean_exit(self, reason: str) -> None: + self._exit_cleanly = True + self._exit_reason = reason + self._shutdown_event.set() @staticmethod def _load_prefill_messages() -> List[Dict[str, Any]]: @@ -674,6 +711,11 @@ class GatewayRunner: """ logger.info("Starting Hermes Gateway...") logger.info("Session storage: %s", self.config.sessions_dir) + try: + from gateway.status import write_runtime_status + write_runtime_status(gateway_state="starting", exit_reason=None) + except Exception: + pass # Warn if no user allowlists are configured and open access is not opted in _any_allowlist = any( @@ -703,6 +745,7 @@ class GatewayRunner: logger.warning("Process checkpoint recovery: %s", e) connected_count = 0 + startup_nonretryable_errors: list[str] = [] # Initialize and connect each configured platform for platform, platform_config in self.config.platforms.items(): @@ -714,8 +757,9 @@ class GatewayRunner: logger.warning("No adapter available for %s", platform.value) continue - # Set up message handler + # Set up message + fatal error handlers adapter.set_message_handler(self._handle_message) + adapter.set_fatal_error_handler(self._handle_adapter_fatal_error) # Try to connect logger.info("Connecting to %s...", platform.value) @@ -728,10 +772,24 @@ class GatewayRunner: logger.info("✓ %s connected", platform.value) else: logger.warning("✗ %s failed to connect", platform.value) + if adapter.has_fatal_error and not adapter.fatal_error_retryable: + startup_nonretryable_errors.append( + f"{platform.value}: {adapter.fatal_error_message}" + ) except Exception as e: logger.error("✗ %s error: %s", platform.value, e) if connected_count == 0: + if startup_nonretryable_errors: + reason = "; ".join(startup_nonretryable_errors) + logger.error("Gateway hit a non-retryable startup conflict: %s", reason) + try: + from gateway.status import write_runtime_status + write_runtime_status(gateway_state="startup_failed", exit_reason=reason) + except Exception: + pass + self._request_clean_exit(reason) + return True logger.warning("No messaging platforms connected.") logger.info("Gateway will continue running for cron job execution.") @@ -739,6 +797,11 @@ class GatewayRunner: self.delivery_router.adapters = self.adapters self._running = True + try: + from gateway.status import write_runtime_status + write_runtime_status(gateway_state="running", exit_reason=None) + except Exception: + pass # Emit gateway:startup hook hook_count = len(self.hooks.loaded_hooks) @@ -833,8 +896,12 @@ class GatewayRunner: self._shutdown_all_gateway_honcho() self._shutdown_event.set() - from gateway.status import remove_pid_file + from gateway.status import remove_pid_file, write_runtime_status remove_pid_file() + try: + write_runtime_status(gateway_state="stopped", exit_reason=self._exit_reason) + except Exception: + pass logger.info("Gateway stopped") @@ -4373,6 +4440,10 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool = success = await runner.start() if not success: return False + if runner.should_exit_cleanly: + if runner.exit_reason: + logger.error("Gateway exiting cleanly: %s", runner.exit_reason) + return True # Write PID file so CLI can detect gateway is running import atexit