fix(gateway): restart on whatsapp bridge child exit (#2334)

Co-authored-by: Frederico Ribeiro <fr@tecompanytea.com>
This commit is contained in:
Teknium 2026-03-21 09:38:52 -07:00 committed by GitHub
parent e6299960cc
commit 8304a7716d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 160 additions and 5 deletions

View file

@ -196,7 +196,7 @@ class WhatsAppAdapter(BasePlatformAdapter):
bridge_status = data.get("status", "unknown")
if bridge_status == "connected":
print(f"[{self.name}] Using existing bridge (status: {bridge_status})")
self._running = True
self._mark_connected()
self._bridge_process = None # Not managed by us
asyncio.create_task(self._poll_messages())
return True
@ -306,7 +306,7 @@ class WhatsAppAdapter(BasePlatformAdapter):
# Start message polling task
asyncio.create_task(self._poll_messages())
self._running = True
self._mark_connected()
print(f"[{self.name}] Bridge started on port {self._bridge_port}")
return True
@ -324,6 +324,23 @@ class WhatsAppAdapter(BasePlatformAdapter):
pass
self._bridge_log_fh = None
async def _check_managed_bridge_exit(self) -> Optional[str]:
    """Return a fatal error message if the managed bridge child exited.

    Returns:
        ``None`` when no bridge process handle is held, or when the
        handle's ``poll()`` reports no exit code yet. Otherwise the
        recorded fatal error message, falling back to a message built
        from the exit code.

    The first call that observes the exit logs it, records a retryable
    ``whatsapp_bridge_exited`` fatal error, closes the bridge log and
    awaits the fatal-error notification. Calls made once a fatal error
    is already recorded only return the message.
    """
    bridge = self._bridge_process
    if bridge is None:
        # No process handle is held, so there is no child to check.
        return None

    returncode = bridge.poll()
    if returncode is None:
        # No exit code available yet; treated as "still running".
        return None

    message = f"WhatsApp bridge process exited unexpectedly (code {returncode})."
    if self.has_fatal_error:
        # A fatal error is already recorded: skip logging and notification.
        return self.fatal_error_message or message

    logger.error("[%s] %s", self.name, message)
    self._set_fatal_error("whatsapp_bridge_exited", message, retryable=True)
    self._close_bridge_log()
    await self._notify_fatal_error()
    return self.fatal_error_message or message
async def disconnect(self) -> None:
"""Stop the WhatsApp bridge and clean up any orphaned processes."""
if self._bridge_process:
@ -352,7 +369,7 @@ class WhatsAppAdapter(BasePlatformAdapter):
# Bridge was not started by us, don't kill it
print(f"[{self.name}] Disconnecting (external bridge left running)")
self._running = False
self._mark_disconnected()
self._bridge_process = None
self._close_bridge_log()
print(f"[{self.name}] Disconnected")
@ -367,6 +384,9 @@ class WhatsAppAdapter(BasePlatformAdapter):
"""Send a message via the WhatsApp bridge."""
if not self._running:
return SendResult(success=False, error="Not connected")
bridge_exit = await self._check_managed_bridge_exit()
if bridge_exit:
return SendResult(success=False, error=bridge_exit)
try:
import aiohttp
@ -412,6 +432,9 @@ class WhatsAppAdapter(BasePlatformAdapter):
"""Edit a previously sent message via the WhatsApp bridge."""
if not self._running:
return SendResult(success=False, error="Not connected")
bridge_exit = await self._check_managed_bridge_exit()
if bridge_exit:
return SendResult(success=False, error=bridge_exit)
try:
import aiohttp
async with aiohttp.ClientSession() as session:
@ -443,6 +466,9 @@ class WhatsAppAdapter(BasePlatformAdapter):
"""Send any media file via bridge /send-media endpoint."""
if not self._running:
return SendResult(success=False, error="Not connected")
bridge_exit = await self._check_managed_bridge_exit()
if bridge_exit:
return SendResult(success=False, error=bridge_exit)
try:
import aiohttp
@ -531,6 +557,8 @@ class WhatsAppAdapter(BasePlatformAdapter):
"""Send typing indicator via bridge."""
if not self._running:
return
if await self._check_managed_bridge_exit():
return
try:
import aiohttp
@ -548,6 +576,8 @@ class WhatsAppAdapter(BasePlatformAdapter):
"""Get information about a WhatsApp chat."""
if not self._running:
return {"name": "Unknown", "type": "dm"}
if await self._check_managed_bridge_exit():
return {"name": chat_id, "type": "dm"}
try:
import aiohttp
@ -578,6 +608,10 @@ class WhatsAppAdapter(BasePlatformAdapter):
return
while self._running:
bridge_exit = await self._check_managed_bridge_exit()
if bridge_exit:
print(f"[{self.name}] {bridge_exit}")
break
try:
async with aiohttp.ClientSession() as session:
async with session.get(
@ -593,6 +627,10 @@ class WhatsAppAdapter(BasePlatformAdapter):
except asyncio.CancelledError:
break
except Exception as e:
bridge_exit = await self._check_managed_bridge_exit()
if bridge_exit:
print(f"[{self.name}] {bridge_exit}")
break
print(f"[{self.name}] Poll error: {e}")
await asyncio.sleep(5)
@ -674,4 +712,3 @@ class WhatsAppAdapter(BasePlatformAdapter):
except Exception as e:
print(f"[{self.name}] Error building event: {e}")
return None

View file

@ -336,6 +336,7 @@ class GatewayRunner:
self._running = False
self._shutdown_event = asyncio.Event()
self._exit_cleanly = False
self._exit_with_failure = False
self._exit_reason: Optional[str] = None
# Track running agents per session for interrupt support
@ -591,6 +592,10 @@ class GatewayRunner:
def should_exit_cleanly(self) -> bool:
return self._exit_cleanly
@property
def should_exit_with_failure(self) -> bool:
    """Expose the ``_exit_with_failure`` flag as a read-only attribute."""
    return self._exit_with_failure
@property
def exit_reason(self) -> Optional[str]:
    """Expose the stored ``_exit_reason`` value (``None`` when unset)."""
    return self._exit_reason
@ -643,7 +648,11 @@ class GatewayRunner:
if not self.adapters:
self._exit_reason = adapter.fatal_error_message or "All messaging adapters disconnected"
logger.error("No connected messaging platforms remain. Shutting down gateway cleanly.")
if adapter.fatal_error_retryable:
self._exit_with_failure = True
logger.error("No connected messaging platforms remain. Shutting down gateway for service restart.")
else:
logger.error("No connected messaging platforms remain. Shutting down gateway cleanly.")
await self.stop()
def _request_clean_exit(self, reason: str) -> None:
@ -5266,6 +5275,11 @@ async def start_gateway(config: Optional[GatewayConfig] = None, replace: bool =
# Wait for shutdown
await runner.wait_for_shutdown()
if runner.should_exit_with_failure:
if runner.exit_reason:
logger.error("Gateway exiting with failure: %s", runner.exit_reason)
return False
# Stop cron ticker cleanly
cron_stop.set()