fix(gateway): restart on retryable startup failures (#1517)
This commit is contained in:
parent
dfe72b9d97
commit
17e87478d2
4 changed files with 149 additions and 3 deletions
|
|
@ -265,6 +265,8 @@ class TelegramAdapter(BasePlatformAdapter):
|
|||
release_scoped_lock("telegram-bot-token", self._token_lock_identity)
|
||||
except Exception:
|
||||
pass
|
||||
message = f"Telegram startup failed: {e}"
|
||||
self._set_fatal_error("telegram_connect_error", message, retryable=True)
|
||||
logger.error("[%s] Failed to connect to Telegram: %s", self.name, e, exc_info=True)
|
||||
return False
|
||||
|
||||
|
|
|
|||
|
|
@ -831,12 +831,15 @@ class GatewayRunner:
|
|||
logger.warning("Process checkpoint recovery: %s", e)
|
||||
|
||||
connected_count = 0
|
||||
enabled_platform_count = 0
|
||||
startup_nonretryable_errors: list[str] = []
|
||||
startup_retryable_errors: list[str] = []
|
||||
|
||||
# Initialize and connect each configured platform
|
||||
for platform, platform_config in self.config.platforms.items():
|
||||
if not platform_config.enabled:
|
||||
continue
|
||||
enabled_platform_count += 1
|
||||
|
||||
adapter = self._create_adapter(platform, platform_config)
|
||||
if not adapter:
|
||||
|
|
@ -858,12 +861,22 @@ class GatewayRunner:
|
|||
logger.info("✓ %s connected", platform.value)
|
||||
else:
|
||||
logger.warning("✗ %s failed to connect", platform.value)
|
||||
if adapter.has_fatal_error and not adapter.fatal_error_retryable:
|
||||
startup_nonretryable_errors.append(
|
||||
if adapter.has_fatal_error:
|
||||
target = (
|
||||
startup_retryable_errors
|
||||
if adapter.fatal_error_retryable
|
||||
else startup_nonretryable_errors
|
||||
)
|
||||
target.append(
|
||||
f"{platform.value}: {adapter.fatal_error_message}"
|
||||
)
|
||||
else:
|
||||
startup_retryable_errors.append(
|
||||
f"{platform.value}: failed to connect"
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error("✗ %s error: %s", platform.value, e)
|
||||
startup_retryable_errors.append(f"{platform.value}: {e}")
|
||||
|
||||
if connected_count == 0:
|
||||
if startup_nonretryable_errors:
|
||||
|
|
@ -876,7 +889,16 @@ class GatewayRunner:
|
|||
pass
|
||||
self._request_clean_exit(reason)
|
||||
return True
|
||||
logger.warning("No messaging platforms connected.")
|
||||
if enabled_platform_count > 0:
|
||||
reason = "; ".join(startup_retryable_errors) or "all configured messaging platforms failed to connect"
|
||||
logger.error("Gateway failed to connect any configured messaging platform: %s", reason)
|
||||
try:
|
||||
from gateway.status import write_runtime_status
|
||||
write_runtime_status(gateway_state="startup_failed", exit_reason=reason)
|
||||
except Exception:
|
||||
pass
|
||||
return False
|
||||
logger.warning("No messaging platforms enabled.")
|
||||
logger.info("Gateway will continue running for cron job execution.")
|
||||
|
||||
# Update delivery router with adapters
|
||||
|
|
|
|||
89
tests/gateway/test_runner_startup_failures.py
Normal file
89
tests/gateway/test_runner_startup_failures.py
Normal file
|
|
@ -0,0 +1,89 @@
|
|||
import pytest
|
||||
|
||||
from gateway.config import GatewayConfig, Platform, PlatformConfig
|
||||
from gateway.platforms.base import BasePlatformAdapter
|
||||
from gateway.run import GatewayRunner
|
||||
from gateway.status import read_runtime_status
|
||||
|
||||
|
||||
class _RetryableFailureAdapter(BasePlatformAdapter):
    """Stub Telegram adapter whose ``connect`` always fails with a retryable fatal error."""

    def __init__(self):
        stub_config = PlatformConfig(enabled=True, token="***")
        super().__init__(stub_config, Platform.TELEGRAM)

    async def connect(self) -> bool:
        """Record a retryable fatal error on the adapter and return ``False``."""
        error_code = "telegram_connect_error"
        error_message = "Telegram startup failed: temporary DNS resolution failure."
        self._set_fatal_error(error_code, error_message, retryable=True)
        return False

    async def disconnect(self) -> None:
        """Delegate to ``self._mark_disconnected()``."""
        self._mark_disconnected()

    async def send(self, chat_id, content, reply_to=None, metadata=None):
        """Sending is not implemented by this stub."""
        raise NotImplementedError

    async def get_chat_info(self, chat_id):
        """Return a minimal mapping that echoes ``chat_id`` under the ``"id"`` key."""
        return dict(id=chat_id)
|
||||
|
||||
|
||||
class _DisabledAdapter(BasePlatformAdapter):
    """Stub Telegram adapter built from a disabled config; ``connect`` must never run."""

    def __init__(self):
        disabled_config = PlatformConfig(enabled=False, token="***")
        super().__init__(disabled_config, Platform.TELEGRAM)

    async def connect(self) -> bool:
        """Fail loudly: reaching this method means a disabled platform was connected."""
        raise AssertionError("connect should not be called for disabled platforms")

    async def disconnect(self) -> None:
        """Delegate to ``self._mark_disconnected()``."""
        self._mark_disconnected()

    async def send(self, chat_id, content, reply_to=None, metadata=None):
        """Sending is not implemented by this stub."""
        raise NotImplementedError

    async def get_chat_info(self, chat_id):
        """Return a minimal mapping that echoes ``chat_id`` under the ``"id"`` key."""
        return dict(id=chat_id)
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_runner_returns_failure_for_retryable_startup_errors(monkeypatch, tmp_path):
    """``start()`` returns False and records ``startup_failed`` when the only enabled platform fails retryably."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    def _failing_adapter_factory(platform, platform_config):
        # Every adapter the runner asks for fails to connect with a retryable error.
        return _RetryableFailureAdapter()

    telegram_config = PlatformConfig(enabled=True, token="***")
    gateway_config = GatewayConfig(
        platforms={Platform.TELEGRAM: telegram_config},
        sessions_dir=tmp_path / "sessions",
    )
    runner = GatewayRunner(gateway_config)
    monkeypatch.setattr(runner, "_create_adapter", _failing_adapter_factory)

    started = await runner.start()

    assert started is False
    assert runner.should_exit_cleanly is False

    status = read_runtime_status()
    assert status["gateway_state"] == "startup_failed"
    assert "temporary DNS resolution failure" in status["exit_reason"]

    telegram_status = status["platforms"]["telegram"]
    assert telegram_status["state"] == "fatal"
    assert telegram_status["error_code"] == "telegram_connect_error"
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_runner_allows_cron_only_mode_when_no_platforms_are_enabled(monkeypatch, tmp_path):
    """With every platform disabled, ``start()`` succeeds, registers no adapters, and reports ``running``."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))

    disabled_telegram = PlatformConfig(enabled=False, token="***")
    gateway_config = GatewayConfig(
        platforms={Platform.TELEGRAM: disabled_telegram},
        sessions_dir=tmp_path / "sessions",
    )
    runner = GatewayRunner(gateway_config)

    started = await runner.start()

    assert started is True
    assert runner.should_exit_cleanly is False
    assert runner.adapters == {}

    status = read_runtime_status()
    assert status["gateway_state"] == "running"
|
||||
|
|
@ -100,6 +100,39 @@ async def test_polling_conflict_stops_polling_and_notifies_handler(monkeypatch):
|
|||
fatal_handler.assert_awaited_once()
|
||||
|
||||
|
||||
@pytest.mark.asyncio
async def test_connect_marks_retryable_fatal_error_for_startup_network_failure(monkeypatch):
    """A failure raised by ``Application.initialize()`` makes ``connect()`` return False with a retryable fatal error."""
    adapter = TelegramAdapter(PlatformConfig(enabled=True, token="***"))

    def _always_acquire(scope, identity, metadata=None):
        # Report the scoped lock as acquired.
        return (True, None)

    def _noop_release(scope, identity):
        return None

    monkeypatch.setattr("gateway.status.acquire_scoped_lock", _always_acquire)
    monkeypatch.setattr("gateway.status.release_scoped_lock", _noop_release)

    # Fake application whose initialize() raises the simulated network error.
    startup_error = RuntimeError("Temporary failure in name resolution")
    fake_app = SimpleNamespace(
        bot=SimpleNamespace(),
        updater=SimpleNamespace(),
        add_handler=MagicMock(),
        initialize=AsyncMock(side_effect=startup_error),
        start=AsyncMock(),
    )

    # Builder returns itself from token() and yields the fake app from build().
    fake_builder = MagicMock()
    fake_builder.token.return_value = fake_builder
    fake_builder.build.return_value = fake_app
    fake_application = SimpleNamespace(builder=MagicMock(return_value=fake_builder))
    monkeypatch.setattr("gateway.platforms.telegram.Application", fake_application)

    connected = await adapter.connect()

    assert connected is False
    assert adapter.fatal_error_code == "telegram_connect_error"
    assert adapter.fatal_error_retryable is True
    assert "Temporary failure in name resolution" in adapter.fatal_error_message
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_disconnect_skips_inactive_updater_and_app(monkeypatch):
|
||||
adapter = TelegramAdapter(PlatformConfig(enabled=True, token="***"))
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue