fix: auto-detect D-Bus session bus for systemctl --user on headless servers (#1601)
* fix: Anthropic OAuth compatibility — Claude Code identity fingerprinting Anthropic routes OAuth/subscription requests based on Claude Code's identity markers. Without them, requests get intermittent 500 errors (~25% failure rate observed). This matches what pi-ai (clawdbot) and OpenCode both implement for OAuth compatibility. Changes (OAuth tokens only — API key users unaffected): 1. Headers: user-agent 'claude-cli/2.1.2 (external, cli)' + x-app 'cli' 2. System prompt: prepend 'You are Claude Code, Anthropic's official CLI' 3. System prompt sanitization: replace Hermes/Nous references 4. Tool names: prefix with 'mcp_' (Claude Code convention for non-native tools) 5. Tool name stripping: remove 'mcp_' prefix from response tool calls Before: 9/12 OK, 1 hard fail, 4 needed retries (~25% error rate) After: 16/16 OK, 0 failures, 0 retries (0% error rate) * fix: auto-detect DBUS_SESSION_BUS_ADDRESS for systemctl --user on headless servers On SSH sessions to headless servers, DBUS_SESSION_BUS_ADDRESS and XDG_RUNTIME_DIR may not be set even when the user's systemd instance is running via linger. This causes 'systemctl --user' to fail with 'Failed to connect to bus: No medium found', breaking gateway restart/start/stop as a service and falling back to foreground mode. Add _ensure_user_systemd_env() that detects the standard D-Bus socket at /run/user/<UID>/bus and sets the env vars before any systemctl --user call. Called from _systemctl_cmd() so all existing call sites benefit automatically with zero changes. Fixes: gateway restart falling back to foreground on headless servers * fix: show linger guidance when gateway restart fails during update and gateway restart When systemctl --user restart fails during 'hermes update' or 'hermes gateway restart', check linger status and tell the user exactly what to run (sudo -S -p '' loginctl enable-linger) instead of silently falling back to foreground mode. 
Also applies _ensure_user_systemd_env() to the raw systemctl calls in cmd_update so they work properly on SSH sessions where D-Bus env vars are missing.
This commit is contained in:
parent
ce430fed4c
commit
60e38e82ec
3 changed files with 136 additions and 2 deletions
|
|
@ -150,7 +150,31 @@ def get_systemd_unit_path(system: bool = False) -> Path:
|
|||
return Path.home() / ".config" / "systemd" / "user" / f"{name}.service"
|
||||
|
||||
|
||||
def _ensure_user_systemd_env() -> None:
    """Ensure DBUS_SESSION_BUS_ADDRESS and XDG_RUNTIME_DIR are set for systemctl --user.

    On headless servers (SSH sessions), these env vars may be missing even when
    the user's systemd instance is running (via linger). Without them,
    ``systemctl --user`` fails with "Failed to connect to bus: No medium found".
    We detect the standard socket path and set the vars so all subsequent
    subprocess calls inherit them.

    A variable that is present but empty is treated the same as a missing one:
    an empty runtime dir would otherwise make the bus lookup resolve to a
    relative ``bus`` path in the current working directory.

    Non-empty values already present in the environment are never overwritten.
    Nothing is set when the expected directory / socket does not exist.
    """
    default_runtime_dir = f"/run/user/{os.getuid()}"

    # Only fill in XDG_RUNTIME_DIR when the conventional per-user directory
    # is actually there; otherwise leave the environment untouched.
    if not os.environ.get("XDG_RUNTIME_DIR"):
        if Path(default_runtime_dir).exists():
            os.environ["XDG_RUNTIME_DIR"] = default_runtime_dir

    if not os.environ.get("DBUS_SESSION_BUS_ADDRESS"):
        # ``or`` (rather than a .get() default) also covers an empty value,
        # which must not be joined with "bus" as a relative path.
        xdg_runtime = os.environ.get("XDG_RUNTIME_DIR") or default_runtime_dir
        bus_path = Path(xdg_runtime) / "bus"
        if bus_path.exists():
            os.environ["DBUS_SESSION_BUS_ADDRESS"] = f"unix:path={bus_path}"
|
||||
|
||||
|
||||
def _systemctl_cmd(system: bool = False) -> list[str]:
    """Return the base ``systemctl`` argv for the requested service manager.

    With ``system=True`` the plain ``["systemctl"]`` prefix is returned.
    Otherwise the user-manager prefix ``["systemctl", "--user"]`` is returned,
    after first calling ``_ensure_user_systemd_env()`` so the command is
    built with the user-session environment prepared.
    """
    if system:
        return ["systemctl"]

    # User mode: prepare the session environment before handing back the argv.
    _ensure_user_systemd_env()
    return ["systemctl", "--user"]
|
||||
|
||||
|
||||
|
|
@ -1546,6 +1570,22 @@ def gateway_command(args):
|
|||
pass
|
||||
|
||||
if not service_available:
|
||||
# systemd/launchd restart failed — check if linger is the issue
|
||||
if is_linux():
|
||||
linger_ok, _detail = get_systemd_linger_status()
|
||||
if linger_ok is not True:
|
||||
import getpass
|
||||
_username = getpass.getuser()
|
||||
print()
|
||||
print("⚠ Cannot restart gateway as a service — linger is not enabled.")
|
||||
print(" The gateway user service requires linger to function on headless servers.")
|
||||
print()
|
||||
print(f" Run: sudo loginctl enable-linger {_username}")
|
||||
print()
|
||||
print(" Then restart the gateway:")
|
||||
print(" hermes gateway restart")
|
||||
return
|
||||
|
||||
# Manual restart: kill existing processes
|
||||
killed = kill_gateway_processes()
|
||||
if killed:
|
||||
|
|
|
|||
|
|
@ -2307,8 +2307,9 @@ def cmd_update(args):
|
|||
try:
|
||||
from gateway.status import get_running_pid, remove_pid_file
|
||||
from hermes_cli.gateway import (
|
||||
get_service_name, get_launchd_plist_path, is_macos,
|
||||
get_service_name, get_launchd_plist_path, is_macos, is_linux,
|
||||
refresh_launchd_plist_if_needed,
|
||||
_ensure_user_systemd_env, get_systemd_linger_status,
|
||||
)
|
||||
import signal as _signal
|
||||
|
||||
|
|
@ -2318,6 +2319,7 @@ def cmd_update(args):
|
|||
has_launchd_service = False
|
||||
|
||||
try:
|
||||
_ensure_user_systemd_env()
|
||||
check = subprocess.run(
|
||||
["systemctl", "--user", "is-active", _gw_service_name],
|
||||
capture_output=True, text=True, timeout=5,
|
||||
|
|
@ -2366,7 +2368,20 @@ def cmd_update(args):
|
|||
print("✓ Gateway restarted.")
|
||||
else:
|
||||
print(f"⚠ Gateway restart failed: {restart.stderr.strip()}")
|
||||
print(" Try manually: hermes gateway restart")
|
||||
# Check if linger is the issue
|
||||
if is_linux():
|
||||
linger_ok, _detail = get_systemd_linger_status()
|
||||
if linger_ok is not True:
|
||||
import getpass
|
||||
_username = getpass.getuser()
|
||||
print()
|
||||
print(" Linger must be enabled for the gateway user service to function.")
|
||||
print(f" Run: sudo loginctl enable-linger {_username}")
|
||||
print()
|
||||
print(" Then restart the gateway:")
|
||||
print(" hermes gateway restart")
|
||||
else:
|
||||
print(" Try manually: hermes gateway restart")
|
||||
elif has_launchd_service:
|
||||
# Refresh the plist first (picks up --replace and other
|
||||
# changes from the update we just pulled).
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
"""Tests for gateway service management helpers."""
|
||||
|
||||
import os
|
||||
from types import SimpleNamespace
|
||||
|
||||
import hermes_cli.gateway as gateway_cli
|
||||
|
|
@ -156,3 +157,81 @@ class TestGatewaySystemServiceRouting:
|
|||
gateway_cli.gateway_command(SimpleNamespace(gateway_command="status", deep=False, system=False))
|
||||
|
||||
assert calls == [(False, False)]
|
||||
|
||||
|
||||
class TestEnsureUserSystemdEnv:
    """Tests for _ensure_user_systemd_env() D-Bus session bus auto-detection."""

    @staticmethod
    def _unset_env(monkeypatch, name):
        """Remove *name* from the environment in a way teardown can undo.

        ``monkeypatch.delenv(name, raising=False)`` records nothing when the
        variable is already absent, so a value written by the code under test
        would survive the test and leak into later ones. Setting a placeholder
        first makes monkeypatch track the variable and restore its original
        state (present or absent) on teardown.
        """
        monkeypatch.setenv(name, "")
        monkeypatch.delenv(name)

    def test_sets_xdg_runtime_dir_when_missing(self, tmp_path, monkeypatch):
        self._unset_env(monkeypatch, "XDG_RUNTIME_DIR")
        self._unset_env(monkeypatch, "DBUS_SESSION_BUS_ADDRESS")
        monkeypatch.setattr(os, "getuid", lambda: 42)

        # Redirect /run/user/42 to our tmp dir (which exists). A plain factory
        # function is used instead of a Path subclass overriding __new__:
        # since Python 3.12 pathlib builds the path in __init__ from the
        # original arguments, so a __new__-based redirect is silently ignored.
        from pathlib import Path as RealPath

        def fake_path(*args):
            if args and str(args[0]) == "/run/user/42":
                return RealPath(tmp_path)
            return RealPath(*args)

        monkeypatch.setattr(gateway_cli, "Path", fake_path)

        gateway_cli._ensure_user_systemd_env()

        # Function sets the canonical string, not the fake path
        assert os.environ.get("XDG_RUNTIME_DIR") == "/run/user/42"

    def test_sets_dbus_address_when_bus_socket_exists(self, tmp_path, monkeypatch):
        runtime = tmp_path / "runtime"
        runtime.mkdir()
        bus_socket = runtime / "bus"
        bus_socket.touch()  # simulate the socket file

        monkeypatch.setenv("XDG_RUNTIME_DIR", str(runtime))
        self._unset_env(monkeypatch, "DBUS_SESSION_BUS_ADDRESS")
        monkeypatch.setattr(os, "getuid", lambda: 99)

        gateway_cli._ensure_user_systemd_env()

        assert os.environ["DBUS_SESSION_BUS_ADDRESS"] == f"unix:path={bus_socket}"

    def test_preserves_existing_env_vars(self, monkeypatch):
        monkeypatch.setenv("XDG_RUNTIME_DIR", "/custom/runtime")
        monkeypatch.setenv("DBUS_SESSION_BUS_ADDRESS", "unix:path=/custom/bus")

        gateway_cli._ensure_user_systemd_env()

        assert os.environ["XDG_RUNTIME_DIR"] == "/custom/runtime"
        assert os.environ["DBUS_SESSION_BUS_ADDRESS"] == "unix:path=/custom/bus"

    def test_no_dbus_when_bus_socket_missing(self, tmp_path, monkeypatch):
        runtime = tmp_path / "runtime"
        runtime.mkdir()
        # no bus socket created

        monkeypatch.setenv("XDG_RUNTIME_DIR", str(runtime))
        self._unset_env(monkeypatch, "DBUS_SESSION_BUS_ADDRESS")
        monkeypatch.setattr(os, "getuid", lambda: 99)

        gateway_cli._ensure_user_systemd_env()

        assert "DBUS_SESSION_BUS_ADDRESS" not in os.environ

    def test_systemctl_cmd_calls_ensure_for_user_mode(self, monkeypatch):
        calls = []
        monkeypatch.setattr(gateway_cli, "_ensure_user_systemd_env", lambda: calls.append("called"))

        result = gateway_cli._systemctl_cmd(system=False)
        assert result == ["systemctl", "--user"]
        assert calls == ["called"]

    def test_systemctl_cmd_skips_ensure_for_system_mode(self, monkeypatch):
        calls = []
        monkeypatch.setattr(gateway_cli, "_ensure_user_systemd_env", lambda: calls.append("called"))

        result = gateway_cli._systemctl_cmd(system=True)
        assert result == ["systemctl"]
        assert calls == []
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue