fix(gateway): restart on retryable startup failures (#1517)

This commit is contained in:
JP Lew 2026-03-16 17:56:31 +05:30 committed by GitHub
parent dfe72b9d97
commit 17e87478d2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 149 additions and 3 deletions

View file

@ -265,6 +265,8 @@ class TelegramAdapter(BasePlatformAdapter):
release_scoped_lock("telegram-bot-token", self._token_lock_identity)
except Exception:
pass
message = f"Telegram startup failed: {e}"
self._set_fatal_error("telegram_connect_error", message, retryable=True)
logger.error("[%s] Failed to connect to Telegram: %s", self.name, e, exc_info=True)
return False

View file

@ -831,12 +831,15 @@ class GatewayRunner:
logger.warning("Process checkpoint recovery: %s", e)
connected_count = 0
enabled_platform_count = 0
startup_nonretryable_errors: list[str] = []
startup_retryable_errors: list[str] = []
# Initialize and connect each configured platform
for platform, platform_config in self.config.platforms.items():
if not platform_config.enabled:
continue
enabled_platform_count += 1
adapter = self._create_adapter(platform, platform_config)
if not adapter:
@ -858,12 +861,22 @@ class GatewayRunner:
logger.info("%s connected", platform.value)
else:
logger.warning("%s failed to connect", platform.value)
if adapter.has_fatal_error and not adapter.fatal_error_retryable:
startup_nonretryable_errors.append(
if adapter.has_fatal_error:
target = (
startup_retryable_errors
if adapter.fatal_error_retryable
else startup_nonretryable_errors
)
target.append(
f"{platform.value}: {adapter.fatal_error_message}"
)
else:
startup_retryable_errors.append(
f"{platform.value}: failed to connect"
)
except Exception as e:
logger.error("%s error: %s", platform.value, e)
startup_retryable_errors.append(f"{platform.value}: {e}")
if connected_count == 0:
if startup_nonretryable_errors:
@ -876,7 +889,16 @@ class GatewayRunner:
pass
self._request_clean_exit(reason)
return True
logger.warning("No messaging platforms connected.")
if enabled_platform_count > 0:
reason = "; ".join(startup_retryable_errors) or "all configured messaging platforms failed to connect"
logger.error("Gateway failed to connect any configured messaging platform: %s", reason)
try:
from gateway.status import write_runtime_status
write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
except Exception:
pass
return False
logger.warning("No messaging platforms enabled.")
logger.info("Gateway will continue running for cron job execution.")
# Update delivery router with adapters

View file

@ -0,0 +1,89 @@
import pytest
from gateway.config import GatewayConfig, Platform, PlatformConfig
from gateway.platforms.base import BasePlatformAdapter
from gateway.run import GatewayRunner
from gateway.status import read_runtime_status
class _RetryableFailureAdapter(BasePlatformAdapter):
def __init__(self):
super().__init__(PlatformConfig(enabled=True, token="***"), Platform.TELEGRAM)
async def connect(self) -> bool:
self._set_fatal_error(
"telegram_connect_error",
"Telegram startup failed: temporary DNS resolution failure.",
retryable=True,
)
return False
async def disconnect(self) -> None:
self._mark_disconnected()
async def send(self, chat_id, content, reply_to=None, metadata=None):
raise NotImplementedError
async def get_chat_info(self, chat_id):
return {"id": chat_id}
class _DisabledAdapter(BasePlatformAdapter):
def __init__(self):
super().__init__(PlatformConfig(enabled=False, token="***"), Platform.TELEGRAM)
async def connect(self) -> bool:
raise AssertionError("connect should not be called for disabled platforms")
async def disconnect(self) -> None:
self._mark_disconnected()
async def send(self, chat_id, content, reply_to=None, metadata=None):
raise NotImplementedError
async def get_chat_info(self, chat_id):
return {"id": chat_id}
@pytest.mark.asyncio
async def test_runner_returns_failure_for_retryable_startup_errors(monkeypatch, tmp_path):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
config = GatewayConfig(
platforms={
Platform.TELEGRAM: PlatformConfig(enabled=True, token="***")
},
sessions_dir=tmp_path / "sessions",
)
runner = GatewayRunner(config)
monkeypatch.setattr(runner, "_create_adapter", lambda platform, platform_config: _RetryableFailureAdapter())
ok = await runner.start()
assert ok is False
assert runner.should_exit_cleanly is False
state = read_runtime_status()
assert state["gateway_state"] == "startup_failed"
assert "temporary DNS resolution failure" in state["exit_reason"]
assert state["platforms"]["telegram"]["state"] == "fatal"
assert state["platforms"]["telegram"]["error_code"] == "telegram_connect_error"
@pytest.mark.asyncio
async def test_runner_allows_cron_only_mode_when_no_platforms_are_enabled(monkeypatch, tmp_path):
monkeypatch.setenv("HERMES_HOME", str(tmp_path))
config = GatewayConfig(
platforms={
Platform.TELEGRAM: PlatformConfig(enabled=False, token="***")
},
sessions_dir=tmp_path / "sessions",
)
runner = GatewayRunner(config)
ok = await runner.start()
assert ok is True
assert runner.should_exit_cleanly is False
assert runner.adapters == {}
state = read_runtime_status()
assert state["gateway_state"] == "running"

View file

@ -100,6 +100,39 @@ async def test_polling_conflict_stops_polling_and_notifies_handler(monkeypatch):
fatal_handler.assert_awaited_once()
@pytest.mark.asyncio
async def test_connect_marks_retryable_fatal_error_for_startup_network_failure(monkeypatch):
adapter = TelegramAdapter(PlatformConfig(enabled=True, token="***"))
monkeypatch.setattr(
"gateway.status.acquire_scoped_lock",
lambda scope, identity, metadata=None: (True, None),
)
monkeypatch.setattr(
"gateway.status.release_scoped_lock",
lambda scope, identity: None,
)
builder = MagicMock()
builder.token.return_value = builder
app = SimpleNamespace(
bot=SimpleNamespace(),
updater=SimpleNamespace(),
add_handler=MagicMock(),
initialize=AsyncMock(side_effect=RuntimeError("Temporary failure in name resolution")),
start=AsyncMock(),
)
builder.build.return_value = app
monkeypatch.setattr("gateway.platforms.telegram.Application", SimpleNamespace(builder=MagicMock(return_value=builder)))
ok = await adapter.connect()
assert ok is False
assert adapter.fatal_error_code == "telegram_connect_error"
assert adapter.fatal_error_retryable is True
assert "Temporary failure in name resolution" in adapter.fatal_error_message
@pytest.mark.asyncio
async def test_disconnect_skips_inactive_updater_and_app(monkeypatch):
adapter = TelegramAdapter(PlatformConfig(enabled=True, token="***"))