fix: notify gateway users when updates finish or fail

This commit is contained in:
clabbe-bot 2026-03-11 20:43:31 +01:00 committed by teknium1
parent cf3dceafe1
commit 3126c60885
2 changed files with 201 additions and 20 deletions

View file

@ -17,6 +17,7 @@ import asyncio
import logging import logging
import os import os
import re import re
import shlex
import sys import sys
import signal import signal
import threading import threading
@ -674,8 +675,17 @@ class GatewayRunner:
except Exception as e: except Exception as e:
logger.warning("Channel directory build failed: %s", e) logger.warning("Channel directory build failed: %s", e)
# Check if we're restarting after a /update command # Check if we're restarting after a /update command. If the update is
await self._send_update_notification() # still running, keep watching so we notify once it actually finishes.
notified = await self._send_update_notification()
if not notified and any(
path.exists()
for path in (
_hermes_home / ".update_pending.json",
_hermes_home / ".update_pending.claimed.json",
)
):
self._schedule_update_notification_watch()
# Start background session expiry watcher for proactive memory flushing # Start background session expiry watcher for proactive memory flushing
asyncio.create_task(self._session_expiry_watcher()) asyncio.create_task(self._session_expiry_watcher())
@ -2714,9 +2724,9 @@ class GatewayRunner:
"""Handle /update command — update Hermes Agent to the latest version. """Handle /update command — update Hermes Agent to the latest version.
Spawns ``hermes update`` in a separate systemd scope so it survives the Spawns ``hermes update`` in a separate systemd scope so it survives the
gateway restart that ``hermes update`` triggers at the end. A marker gateway restart that ``hermes update`` may trigger at the end. Marker
file is written so the *new* gateway process can notify the user of the files are written so either the current gateway process or the next one
result on startup. can notify the user when the update finishes.
""" """
import json import json
import shutil import shutil
@ -2733,9 +2743,9 @@ class GatewayRunner:
if not hermes_bin: if not hermes_bin:
return "✗ `hermes` command not found on PATH." return "✗ `hermes` command not found on PATH."
# Write marker so the restarted gateway can notify this chat
pending_path = _hermes_home / ".update_pending.json" pending_path = _hermes_home / ".update_pending.json"
output_path = _hermes_home / ".update_output.txt" output_path = _hermes_home / ".update_output.txt"
exit_code_path = _hermes_home / ".update_exit_code"
pending = { pending = {
"platform": event.source.platform.value, "platform": event.source.platform.value,
"chat_id": event.source.chat_id, "chat_id": event.source.chat_id,
@ -2743,10 +2753,14 @@ class GatewayRunner:
"timestamp": datetime.now().isoformat(), "timestamp": datetime.now().isoformat(),
} }
pending_path.write_text(json.dumps(pending)) pending_path.write_text(json.dumps(pending))
exit_code_path.unlink(missing_ok=True)
# Spawn `hermes update` in a separate cgroup so it survives gateway # Spawn `hermes update` in a separate cgroup so it survives gateway
# restart. systemd-run --user --scope creates a transient scope unit. # restart. systemd-run --user --scope creates a transient scope unit.
update_cmd = f"{hermes_bin} update > {output_path} 2>&1" update_cmd = (
f"{shlex.quote(hermes_bin)} update > {shlex.quote(str(output_path))} 2>&1; "
f"status=$?; printf '%s' \"$status\" > {shlex.quote(str(exit_code_path))}"
)
try: try:
systemd_run = shutil.which("systemd-run") systemd_run = shutil.which("systemd-run")
if systemd_run: if systemd_run:
@ -2768,26 +2782,91 @@ class GatewayRunner:
) )
except Exception as e: except Exception as e:
pending_path.unlink(missing_ok=True) pending_path.unlink(missing_ok=True)
exit_code_path.unlink(missing_ok=True)
return f"✗ Failed to start update: {e}" return f"✗ Failed to start update: {e}"
self._schedule_update_notification_watch()
return "⚕ Starting Hermes update… I'll notify you when it's done." return "⚕ Starting Hermes update… I'll notify you when it's done."
async def _send_update_notification(self) -> None: def _schedule_update_notification_watch(self) -> None:
"""If the gateway is starting after a ``/update``, notify the user.""" """Ensure a background task is watching for update completion."""
existing_task = getattr(self, "_update_notification_task", None)
if existing_task and not existing_task.done():
return
try:
self._update_notification_task = asyncio.create_task(
self._watch_for_update_completion()
)
except RuntimeError:
logger.debug("Skipping update notification watcher: no running event loop")
async def _watch_for_update_completion(
self,
poll_interval: float = 2.0,
timeout: float = 1800.0,
) -> None:
"""Wait for ``hermes update`` to finish, then send its notification."""
pending_path = _hermes_home / ".update_pending.json"
claimed_path = _hermes_home / ".update_pending.claimed.json"
exit_code_path = _hermes_home / ".update_exit_code"
loop = asyncio.get_running_loop()
deadline = loop.time() + timeout
while (pending_path.exists() or claimed_path.exists()) and loop.time() < deadline:
if exit_code_path.exists():
await self._send_update_notification()
return
await asyncio.sleep(poll_interval)
if (pending_path.exists() or claimed_path.exists()) and not exit_code_path.exists():
logger.warning("Update watcher timed out waiting for completion marker")
exit_code_path.write_text("124")
await self._send_update_notification()
async def _send_update_notification(self) -> bool:
"""If an update finished, notify the user.
Returns False when the update is still running so a caller can retry
later. Returns True after a definitive send/skip decision.
"""
import json import json
import re as _re import re as _re
pending_path = _hermes_home / ".update_pending.json" pending_path = _hermes_home / ".update_pending.json"
claimed_path = _hermes_home / ".update_pending.claimed.json"
output_path = _hermes_home / ".update_output.txt" output_path = _hermes_home / ".update_output.txt"
exit_code_path = _hermes_home / ".update_exit_code"
if not pending_path.exists(): if not pending_path.exists() and not claimed_path.exists():
return return False
cleanup = True
active_pending_path = claimed_path
try: try:
pending = json.loads(pending_path.read_text()) if pending_path.exists():
try:
pending_path.replace(claimed_path)
except FileNotFoundError:
if not claimed_path.exists():
return True
elif not claimed_path.exists():
return True
pending = json.loads(claimed_path.read_text())
platform_str = pending.get("platform") platform_str = pending.get("platform")
chat_id = pending.get("chat_id") chat_id = pending.get("chat_id")
if not exit_code_path.exists():
logger.info("Update notification deferred: update still running")
cleanup = False
active_pending_path = pending_path
claimed_path.replace(pending_path)
return False
exit_code_raw = exit_code_path.read_text().strip() or "1"
exit_code = int(exit_code_raw)
# Read the captured update output # Read the captured update output
output = "" output = ""
if output_path.exists(): if output_path.exists():
@ -2801,19 +2880,34 @@ class GatewayRunner:
# Strip ANSI escape codes for clean display # Strip ANSI escape codes for clean display
output = _re.sub(r'\x1b\[[0-9;]*m', '', output).strip() output = _re.sub(r'\x1b\[[0-9;]*m', '', output).strip()
if output: if output:
# Truncate if too long for a single message
if len(output) > 3500: if len(output) > 3500:
output = "" + output[-3500:] output = "" + output[-3500:]
msg = f"✅ Hermes update finished — gateway restarted.\n\n```\n{output}\n```" if exit_code == 0:
msg = f"✅ Hermes update finished.\n\n```\n{output}\n```"
else:
msg = f"❌ Hermes update failed.\n\n```\n{output}\n```"
else: else:
msg = "✅ Hermes update finished — gateway restarted successfully." if exit_code == 0:
msg = "✅ Hermes update finished successfully."
else:
msg = "❌ Hermes update failed. Check the gateway logs or run `hermes update` manually for details."
await adapter.send(chat_id, msg) await adapter.send(chat_id, msg)
logger.info("Sent post-update notification to %s:%s", platform_str, chat_id) logger.info(
"Sent post-update notification to %s:%s (exit=%s)",
platform_str,
chat_id,
exit_code,
)
except Exception as e: except Exception as e:
logger.warning("Post-update notification failed: %s", e) logger.warning("Post-update notification failed: %s", e)
finally: finally:
pending_path.unlink(missing_ok=True) if cleanup:
output_path.unlink(missing_ok=True) active_pending_path.unlink(missing_ok=True)
claimed_path.unlink(missing_ok=True)
output_path.unlink(missing_ok=True)
exit_code_path.unlink(missing_ok=True)
return True
def _set_session_env(self, context: SessionContext) -> None: def _set_session_env(self, context: SessionContext) -> None:
"""Set environment variables for the current session.""" """Set environment variables for the current session."""

View file

@ -133,6 +133,7 @@ class TestHandleUpdateCommand:
assert data["platform"] == "telegram" assert data["platform"] == "telegram"
assert data["chat_id"] == "99999" assert data["chat_id"] == "99999"
assert "timestamp" in data assert "timestamp" in data
assert not (hermes_home / ".update_exit_code").exists()
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_spawns_systemd_run(self, tmp_path): async def test_spawns_systemd_run(self, tmp_path):
@ -160,6 +161,7 @@ class TestHandleUpdateCommand:
call_args = mock_popen.call_args[0][0] call_args = mock_popen.call_args[0][0]
assert call_args[0] == "/usr/bin/systemd-run" assert call_args[0] == "/usr/bin/systemd-run"
assert "--scope" in call_args assert "--scope" in call_args
assert ".update_exit_code" in call_args[-1]
assert "Starting Hermes update" in result assert "Starting Hermes update" in result
@pytest.mark.asyncio @pytest.mark.asyncio
@ -196,6 +198,7 @@ class TestHandleUpdateCommand:
call_args = mock_popen.call_args[0][0] call_args = mock_popen.call_args[0][0]
assert call_args[0] == "bash" assert call_args[0] == "bash"
assert "nohup" in call_args[2] assert "nohup" in call_args[2]
assert ".update_exit_code" in call_args[2]
assert "Starting Hermes update" in result assert "Starting Hermes update" in result
@pytest.mark.asyncio @pytest.mark.asyncio
@ -222,6 +225,7 @@ class TestHandleUpdateCommand:
assert "Failed to start update" in result assert "Failed to start update" in result
# Pending file should be cleaned up # Pending file should be cleaned up
assert not (hermes_home / ".update_pending.json").exists() assert not (hermes_home / ".update_pending.json").exists()
assert not (hermes_home / ".update_exit_code").exists()
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_returns_user_friendly_message(self, tmp_path): async def test_returns_user_friendly_message(self, tmp_path):
@ -266,6 +270,53 @@ class TestSendUpdateNotification:
# Should not raise # Should not raise
await runner._send_update_notification() await runner._send_update_notification()
@pytest.mark.asyncio
async def test_defers_notification_while_update_still_running(self, tmp_path):
"""Returns False and keeps marker files when the update has not exited yet."""
runner = _make_runner()
hermes_home = tmp_path / "hermes"
hermes_home.mkdir()
pending_path = hermes_home / ".update_pending.json"
pending_path.write_text(json.dumps({
"platform": "telegram", "chat_id": "67890", "user_id": "12345",
}))
(hermes_home / ".update_output.txt").write_text("still running")
mock_adapter = AsyncMock()
runner.adapters = {Platform.TELEGRAM: mock_adapter}
with patch("gateway.run._hermes_home", hermes_home):
result = await runner._send_update_notification()
assert result is False
mock_adapter.send.assert_not_called()
assert pending_path.exists()
@pytest.mark.asyncio
async def test_recovers_from_claimed_pending_file(self, tmp_path):
"""A claimed pending file from a crashed notifier is still deliverable."""
runner = _make_runner()
hermes_home = tmp_path / "hermes"
hermes_home.mkdir()
claimed_path = hermes_home / ".update_pending.claimed.json"
claimed_path.write_text(json.dumps({
"platform": "telegram", "chat_id": "67890", "user_id": "12345",
}))
(hermes_home / ".update_output.txt").write_text("done")
(hermes_home / ".update_exit_code").write_text("0")
mock_adapter = AsyncMock()
runner.adapters = {Platform.TELEGRAM: mock_adapter}
with patch("gateway.run._hermes_home", hermes_home):
result = await runner._send_update_notification()
assert result is True
mock_adapter.send.assert_called_once()
assert not claimed_path.exists()
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_sends_notification_with_output(self, tmp_path): async def test_sends_notification_with_output(self, tmp_path):
"""Sends update output to the correct platform and chat.""" """Sends update output to the correct platform and chat."""
@ -284,6 +335,7 @@ class TestSendUpdateNotification:
(hermes_home / ".update_output.txt").write_text( (hermes_home / ".update_output.txt").write_text(
"→ Found 3 new commit(s)\n✓ Code updated!\n✓ Update complete!" "→ Found 3 new commit(s)\n✓ Code updated!\n✓ Update complete!"
) )
(hermes_home / ".update_exit_code").write_text("0")
# Mock the adapter # Mock the adapter
mock_adapter = AsyncMock() mock_adapter = AsyncMock()
@ -310,6 +362,7 @@ class TestSendUpdateNotification:
(hermes_home / ".update_output.txt").write_text( (hermes_home / ".update_output.txt").write_text(
"\x1b[32m✓ Code updated!\x1b[0m\n\x1b[1mDone\x1b[0m" "\x1b[32m✓ Code updated!\x1b[0m\n\x1b[1mDone\x1b[0m"
) )
(hermes_home / ".update_exit_code").write_text("0")
mock_adapter = AsyncMock() mock_adapter = AsyncMock()
runner.adapters = {Platform.TELEGRAM: mock_adapter} runner.adapters = {Platform.TELEGRAM: mock_adapter}
@ -331,6 +384,7 @@ class TestSendUpdateNotification:
pending = {"platform": "telegram", "chat_id": "111", "user_id": "222"} pending = {"platform": "telegram", "chat_id": "111", "user_id": "222"}
(hermes_home / ".update_pending.json").write_text(json.dumps(pending)) (hermes_home / ".update_pending.json").write_text(json.dumps(pending))
(hermes_home / ".update_output.txt").write_text("x" * 5000) (hermes_home / ".update_output.txt").write_text("x" * 5000)
(hermes_home / ".update_exit_code").write_text("0")
mock_adapter = AsyncMock() mock_adapter = AsyncMock()
runner.adapters = {Platform.TELEGRAM: mock_adapter} runner.adapters = {Platform.TELEGRAM: mock_adapter}
@ -344,6 +398,29 @@ class TestSendUpdateNotification:
# Total message should not be absurdly long # Total message should not be absurdly long
assert len(sent_text) < 4500 assert len(sent_text) < 4500
@pytest.mark.asyncio
async def test_sends_failure_message_when_update_fails(self, tmp_path):
"""Non-zero exit codes produce a failure notification with captured output."""
runner = _make_runner()
hermes_home = tmp_path / "hermes"
hermes_home.mkdir()
pending = {"platform": "telegram", "chat_id": "111", "user_id": "222"}
(hermes_home / ".update_pending.json").write_text(json.dumps(pending))
(hermes_home / ".update_output.txt").write_text("Traceback: boom")
(hermes_home / ".update_exit_code").write_text("1")
mock_adapter = AsyncMock()
runner.adapters = {Platform.TELEGRAM: mock_adapter}
with patch("gateway.run._hermes_home", hermes_home):
result = await runner._send_update_notification()
assert result is True
sent_text = mock_adapter.send.call_args[0][1]
assert "update failed" in sent_text.lower()
assert "Traceback: boom" in sent_text
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_sends_generic_message_when_no_output(self, tmp_path): async def test_sends_generic_message_when_no_output(self, tmp_path):
"""Sends a success message even if the output file is missing.""" """Sends a success message even if the output file is missing."""
@ -354,6 +431,7 @@ class TestSendUpdateNotification:
pending = {"platform": "telegram", "chat_id": "111", "user_id": "222"} pending = {"platform": "telegram", "chat_id": "111", "user_id": "222"}
(hermes_home / ".update_pending.json").write_text(json.dumps(pending)) (hermes_home / ".update_pending.json").write_text(json.dumps(pending))
# No .update_output.txt created # No .update_output.txt created
(hermes_home / ".update_exit_code").write_text("0")
mock_adapter = AsyncMock() mock_adapter = AsyncMock()
runner.adapters = {Platform.TELEGRAM: mock_adapter} runner.adapters = {Platform.TELEGRAM: mock_adapter}
@ -362,7 +440,7 @@ class TestSendUpdateNotification:
await runner._send_update_notification() await runner._send_update_notification()
sent_text = mock_adapter.send.call_args[0][1] sent_text = mock_adapter.send.call_args[0][1]
assert "restarted successfully" in sent_text assert "finished successfully" in sent_text
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_cleans_up_files_after_notification(self, tmp_path): async def test_cleans_up_files_after_notification(self, tmp_path):
@ -373,10 +451,12 @@ class TestSendUpdateNotification:
pending_path = hermes_home / ".update_pending.json" pending_path = hermes_home / ".update_pending.json"
output_path = hermes_home / ".update_output.txt" output_path = hermes_home / ".update_output.txt"
exit_code_path = hermes_home / ".update_exit_code"
pending_path.write_text(json.dumps({ pending_path.write_text(json.dumps({
"platform": "telegram", "chat_id": "111", "user_id": "222", "platform": "telegram", "chat_id": "111", "user_id": "222",
})) }))
output_path.write_text("✓ Done") output_path.write_text("✓ Done")
exit_code_path.write_text("0")
mock_adapter = AsyncMock() mock_adapter = AsyncMock()
runner.adapters = {Platform.TELEGRAM: mock_adapter} runner.adapters = {Platform.TELEGRAM: mock_adapter}
@ -386,6 +466,7 @@ class TestSendUpdateNotification:
assert not pending_path.exists() assert not pending_path.exists()
assert not output_path.exists() assert not output_path.exists()
assert not exit_code_path.exists()
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_cleans_up_on_error(self, tmp_path): async def test_cleans_up_on_error(self, tmp_path):
@ -396,10 +477,12 @@ class TestSendUpdateNotification:
pending_path = hermes_home / ".update_pending.json" pending_path = hermes_home / ".update_pending.json"
output_path = hermes_home / ".update_output.txt" output_path = hermes_home / ".update_output.txt"
exit_code_path = hermes_home / ".update_exit_code"
pending_path.write_text(json.dumps({ pending_path.write_text(json.dumps({
"platform": "telegram", "chat_id": "111", "user_id": "222", "platform": "telegram", "chat_id": "111", "user_id": "222",
})) }))
output_path.write_text("✓ Done") output_path.write_text("✓ Done")
exit_code_path.write_text("0")
# Adapter send raises # Adapter send raises
mock_adapter = AsyncMock() mock_adapter = AsyncMock()
@ -412,6 +495,7 @@ class TestSendUpdateNotification:
# Files should still be cleaned up (finally block) # Files should still be cleaned up (finally block)
assert not pending_path.exists() assert not pending_path.exists()
assert not output_path.exists() assert not output_path.exists()
assert not exit_code_path.exists()
@pytest.mark.asyncio @pytest.mark.asyncio
async def test_handles_corrupt_pending_file(self, tmp_path): async def test_handles_corrupt_pending_file(self, tmp_path):
@ -440,8 +524,10 @@ class TestSendUpdateNotification:
pending = {"platform": "discord", "chat_id": "111", "user_id": "222"} pending = {"platform": "discord", "chat_id": "111", "user_id": "222"}
pending_path = hermes_home / ".update_pending.json" pending_path = hermes_home / ".update_pending.json"
output_path = hermes_home / ".update_output.txt" output_path = hermes_home / ".update_output.txt"
exit_code_path = hermes_home / ".update_exit_code"
pending_path.write_text(json.dumps(pending)) pending_path.write_text(json.dumps(pending))
output_path.write_text("Done") output_path.write_text("Done")
exit_code_path.write_text("0")
# Only telegram adapter available, but pending says discord # Only telegram adapter available, but pending says discord
mock_adapter = AsyncMock() mock_adapter = AsyncMock()
@ -454,6 +540,7 @@ class TestSendUpdateNotification:
mock_adapter.send.assert_not_called() mock_adapter.send.assert_not_called()
# Files should still be cleaned up # Files should still be cleaned up
assert not pending_path.exists() assert not pending_path.exists()
assert not exit_code_path.exists()
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------