Merge pull request #1290 from NousResearch/hermes/hermes-f48b210a

fix(send_message): salvage and complete MEDIA delivery from #971
2026-03-14 04:12:54 -07:00 · 2026-03-14 04:12:54 -07:00 · b42ee3050e
commit b42ee3050e
parent 9525db913f 5c9a84219d
4 changed files with 347 additions and 28 deletions
--- a/gateway/platforms/base.py
+++ b/gateway/platforms/base.py
@ -618,16 +618,22 @@ class BasePlatformAdapter(ABC):
        has_voice_tag = "[[audio_as_voice]]" in content
        cleaned = cleaned.replace("[[audio_as_voice]]", "")
        
-        # Extract MEDIA:<path> tags (path may contain spaces)
-        media_pattern = r'MEDIA:(\S+)'
-        for match in re.finditer(media_pattern, content):
-            path = match.group(1).strip()
+        # Extract MEDIA:<path> tags, allowing optional whitespace after the colon
+        # and quoted/backticked paths for LLM-formatted outputs.
+        media_pattern = re.compile(
+            r'''[`"']?MEDIA:\s*(?P<path>`[^`\n]+`|"[^"\n]+"|'[^'\n]+'|\S+)[`"']?'''
+        )
+        for match in media_pattern.finditer(content):
+            path = match.group("path").strip()
+            if len(path) >= 2 and path[0] == path[-1] and path[0] in "`\"'":
+                path = path[1:-1].strip()
+            path = path.lstrip("`\"'").rstrip("`\"',.;:)}]")
            if path:
                media.append((path, has_voice_tag))
-        
-        # Remove MEDIA tags from content
+
+        # Remove MEDIA tags from content (including surrounding quote/backtick wrappers)
        if media:
-            cleaned = re.sub(media_pattern, '', cleaned)
+            cleaned = media_pattern.sub('', cleaned)
            cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip()
        
        return media, cleaned
--- a/tests/gateway/test_platform_base.py
+++ b/tests/gateway/test_platform_base.py
@ -258,6 +258,29 @@ class TestExtractMedia:
        _, cleaned = BasePlatformAdapter.extract_media(content)
        assert "\n\n\n" not in cleaned

+    def test_media_tag_allows_optional_whitespace_after_colon(self):
+        content = "MEDIA: /path/to/audio.ogg"
+        media, cleaned = BasePlatformAdapter.extract_media(content)
+        assert media == [("/path/to/audio.ogg", False)]
+        assert cleaned == ""
+
+    def test_media_tag_strips_wrapping_quotes_and_backticks(self):
+        content = "MEDIA: `/path/to/file.png`\nMEDIA:\"/path/to/file2.png\"\nMEDIA:'/path/to/file3.png'"
+        media, cleaned = BasePlatformAdapter.extract_media(content)
+        assert media == [
+            ("/path/to/file.png", False),
+            ("/path/to/file2.png", False),
+            ("/path/to/file3.png", False),
+        ]
+        assert cleaned == ""
+
+    def test_media_tag_supports_quoted_paths_with_spaces(self):
+        content = "Here\nMEDIA: '/tmp/my image.png'\nAfter"
+        media, cleaned = BasePlatformAdapter.extract_media(content)
+        assert media == [("/tmp/my image.png", False)]
+        assert "Here" in cleaned
+        assert "After" in cleaned
+

 # ---------------------------------------------------------------------------
 # truncate_message
--- a/tests/tools/test_send_message_tool.py
+++ b/tests/tools/test_send_message_tool.py
@ -2,11 +2,13 @@

 import asyncio
 import json
+import sys
+from pathlib import Path
 from types import SimpleNamespace
-from unittest.mock import AsyncMock, patch
+from unittest.mock import AsyncMock, MagicMock, patch

 from gateway.config import Platform
-from tools.send_message_tool import send_message_tool
+from tools.send_message_tool import _send_telegram, send_message_tool


 def _run_async_immediately(coro):
@ -14,13 +16,18 @@ def _run_async_immediately(coro):


 def _make_config():
-    telegram_cfg = SimpleNamespace(enabled=True, token="fake-token", extra={})
+    telegram_cfg = SimpleNamespace(enabled=True, token="***", extra={})
    return SimpleNamespace(
        platforms={Platform.TELEGRAM: telegram_cfg},
        get_home_channel=lambda _platform: None,
    ), telegram_cfg


+def _install_telegram_mock(monkeypatch, bot):
+    telegram_mod = SimpleNamespace(Bot=lambda token: bot)
+    monkeypatch.setitem(sys.modules, "telegram", telegram_mod)
+
+
 class TestSendMessageTool:
    def test_sends_to_explicit_telegram_topic_target(self):
        config, telegram_cfg = _make_config()
@ -41,7 +48,14 @@ class TestSendMessageTool:
            )

        assert result["success"] is True
-        send_mock.assert_awaited_once_with(Platform.TELEGRAM, telegram_cfg, "-1001", "hello", thread_id="17585")
+        send_mock.assert_awaited_once_with(
+            Platform.TELEGRAM,
+            telegram_cfg,
+            "-1001",
+            "hello",
+            thread_id="17585",
+            media_files=[],
+        )
        mirror_mock.assert_called_once_with("telegram", "-1001", "hello", source_label="cli", thread_id="17585")

    def test_resolved_telegram_topic_name_preserves_thread_id(self):
@ -64,4 +78,154 @@ class TestSendMessageTool:
            )

        assert result["success"] is True
-        send_mock.assert_awaited_once_with(Platform.TELEGRAM, telegram_cfg, "-1001", "hello", thread_id="17585")
+        send_mock.assert_awaited_once_with(
+            Platform.TELEGRAM,
+            telegram_cfg,
+            "-1001",
+            "hello",
+            thread_id="17585",
+            media_files=[],
+        )
+
+    def test_media_only_message_uses_placeholder_for_mirroring(self):
+        config, telegram_cfg = _make_config()
+
+        with patch("gateway.config.load_gateway_config", return_value=config), \
+             patch("tools.interrupt.is_interrupted", return_value=False), \
+             patch("model_tools._run_async", side_effect=_run_async_immediately), \
+             patch("tools.send_message_tool._send_to_platform", new=AsyncMock(return_value={"success": True})) as send_mock, \
+             patch("gateway.mirror.mirror_to_session", return_value=True) as mirror_mock:
+            result = json.loads(
+                send_message_tool(
+                    {
+                        "action": "send",
+                        "target": "telegram:-1001",
+                        "message": "MEDIA:/tmp/example.ogg",
+                    }
+                )
+            )
+
+        assert result["success"] is True
+        send_mock.assert_awaited_once_with(
+            Platform.TELEGRAM,
+            telegram_cfg,
+            "-1001",
+            "",
+            thread_id=None,
+            media_files=[("/tmp/example.ogg", False)],
+        )
+        mirror_mock.assert_called_once_with(
+            "telegram",
+            "-1001",
+            "[Sent audio attachment]",
+            source_label="cli",
+            thread_id=None,
+        )
+
+
+class TestSendTelegramMediaDelivery:
+    def test_sends_text_then_photo_for_media_tag(self, tmp_path, monkeypatch):
+        image_path = tmp_path / "photo.png"
+        image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 32)
+
+        bot = MagicMock()
+        bot.send_message = AsyncMock(return_value=SimpleNamespace(message_id=1))
+        bot.send_photo = AsyncMock(return_value=SimpleNamespace(message_id=2))
+        bot.send_video = AsyncMock()
+        bot.send_voice = AsyncMock()
+        bot.send_audio = AsyncMock()
+        bot.send_document = AsyncMock()
+        _install_telegram_mock(monkeypatch, bot)
+
+        result = asyncio.run(
+            _send_telegram(
+                "token",
+                "12345",
+                "Hello there",
+                media_files=[(str(image_path), False)],
+            )
+        )
+
+        assert result["success"] is True
+        assert result["message_id"] == "2"
+        bot.send_message.assert_awaited_once()
+        bot.send_photo.assert_awaited_once()
+        sent_text = bot.send_message.await_args.kwargs["text"]
+        assert "MEDIA:" not in sent_text
+        assert sent_text == "Hello there"
+
+    def test_sends_voice_for_ogg_with_voice_directive(self, tmp_path, monkeypatch):
+        voice_path = tmp_path / "voice.ogg"
+        voice_path.write_bytes(b"OggS" + b"\x00" * 32)
+
+        bot = MagicMock()
+        bot.send_message = AsyncMock()
+        bot.send_photo = AsyncMock()
+        bot.send_video = AsyncMock()
+        bot.send_voice = AsyncMock(return_value=SimpleNamespace(message_id=7))
+        bot.send_audio = AsyncMock()
+        bot.send_document = AsyncMock()
+        _install_telegram_mock(monkeypatch, bot)
+
+        result = asyncio.run(
+            _send_telegram(
+                "token",
+                "12345",
+                "",
+                media_files=[(str(voice_path), True)],
+            )
+        )
+
+        assert result["success"] is True
+        bot.send_voice.assert_awaited_once()
+        bot.send_audio.assert_not_awaited()
+        bot.send_message.assert_not_awaited()
+
+    def test_sends_audio_for_mp3(self, tmp_path, monkeypatch):
+        audio_path = tmp_path / "clip.mp3"
+        audio_path.write_bytes(b"ID3" + b"\x00" * 32)
+
+        bot = MagicMock()
+        bot.send_message = AsyncMock()
+        bot.send_photo = AsyncMock()
+        bot.send_video = AsyncMock()
+        bot.send_voice = AsyncMock()
+        bot.send_audio = AsyncMock(return_value=SimpleNamespace(message_id=8))
+        bot.send_document = AsyncMock()
+        _install_telegram_mock(monkeypatch, bot)
+
+        result = asyncio.run(
+            _send_telegram(
+                "token",
+                "12345",
+                "",
+                media_files=[(str(audio_path), False)],
+            )
+        )
+
+        assert result["success"] is True
+        bot.send_audio.assert_awaited_once()
+        bot.send_voice.assert_not_awaited()
+
+    def test_missing_media_returns_error_without_leaking_raw_tag(self, monkeypatch):
+        bot = MagicMock()
+        bot.send_message = AsyncMock()
+        bot.send_photo = AsyncMock()
+        bot.send_video = AsyncMock()
+        bot.send_voice = AsyncMock()
+        bot.send_audio = AsyncMock()
+        bot.send_document = AsyncMock()
+        _install_telegram_mock(monkeypatch, bot)
+
+        result = asyncio.run(
+            _send_telegram(
+                "token",
+                "12345",
+                "",
+                media_files=[("/tmp/does-not-exist.png", False)],
+            )
+        )
+
+        assert "error" in result
+        assert "No deliverable text or media remained" in result["error"]
+        bot.send_message.assert_not_awaited()
--- a/tools/send_message_tool.py
+++ b/tools/send_message_tool.py
@ -14,6 +14,10 @@ import time
 logger = logging.getLogger(__name__)

 _TELEGRAM_TOPIC_TARGET_RE = re.compile(r"^\s*(-?\d+)(?::(\d+))?\s*$")
+_IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".gif"}
+_VIDEO_EXTS = {".mp4", ".mov", ".avi", ".mkv", ".3gp"}
+_AUDIO_EXTS = {".ogg", ".opus", ".mp3", ".wav", ".m4a"}
+_VOICE_EXTS = {".ogg", ".opus"}


 SEND_MESSAGE_SCHEMA = {
@ -130,6 +134,11 @@ def _handle_send(args):
    if not pconfig or not pconfig.enabled:
        return json.dumps({"error": f"Platform '{platform_name}' is not configured. Set up credentials in ~/.hermes/gateway.json or environment variables."})

+    from gateway.platforms.base import BasePlatformAdapter
+
+    media_files, cleaned_message = BasePlatformAdapter.extract_media(message)
+    mirror_text = cleaned_message.strip() or _describe_media_for_mirror(media_files)
+
    used_home_channel = False
    if not chat_id:
        home = config.get_home_channel(platform)
@ -145,16 +154,25 @@ def _handle_send(args):

    try:
        from model_tools import _run_async
-        result = _run_async(_send_to_platform(platform, pconfig, chat_id, message, thread_id=thread_id))
+        result = _run_async(
+            _send_to_platform(
+                platform,
+                pconfig,
+                chat_id,
+                cleaned_message,
+                thread_id=thread_id,
+                media_files=media_files,
+            )
+        )
        if used_home_channel and isinstance(result, dict) and result.get("success"):
            result["note"] = f"Sent to {platform_name} home channel (chat_id: {chat_id})"

        # Mirror the sent message into the target's gateway session
-        if isinstance(result, dict) and result.get("success"):
+        if isinstance(result, dict) and result.get("success") and mirror_text:
            try:
                from gateway.mirror import mirror_to_session
                source_label = os.getenv("HERMES_SESSION_PLATFORM", "cli")
-                if mirror_to_session(platform_name, chat_id, message, source_label=source_label, thread_id=thread_id):
+                if mirror_to_session(platform_name, chat_id, mirror_text, source_label=source_label, thread_id=thread_id):
                    result["mirrored"] = True
            except Exception:
                pass
@ -175,32 +193,140 @@ def _parse_target_ref(platform_name: str, target_ref: str):
    return None, None, False


-async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None):
+def _describe_media_for_mirror(media_files):
+    """Return a human-readable mirror summary when a message only contains media."""
+    if not media_files:
+        return ""
+    if len(media_files) == 1:
+        media_path, is_voice = media_files[0]
+        ext = os.path.splitext(media_path)[1].lower()
+        if is_voice and ext in _VOICE_EXTS:
+            return "[Sent voice message]"
+        if ext in _IMAGE_EXTS:
+            return "[Sent image attachment]"
+        if ext in _VIDEO_EXTS:
+            return "[Sent video attachment]"
+        if ext in _AUDIO_EXTS:
+            return "[Sent audio attachment]"
+        return "[Sent document attachment]"
+    return f"[Sent {len(media_files)} media attachments]"
+
+
+async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None, media_files=None):
    """Route a message to the appropriate platform sender."""
    from gateway.config import Platform
+
+    media_files = media_files or []
    if platform == Platform.TELEGRAM:
-        return await _send_telegram(pconfig.token, chat_id, message, thread_id=thread_id)
-    elif platform == Platform.DISCORD:
-        return await _send_discord(pconfig.token, chat_id, message)
+        return await _send_telegram(
+            pconfig.token,
+            chat_id,
+            message,
+            media_files=media_files,
+            thread_id=thread_id,
+        )
+    if media_files and not message.strip():
+        return {
+            "error": (
+                f"send_message MEDIA delivery is currently only supported for telegram; "
+                f"target {platform.value} had only media attachments"
+            )
+        }
+    warning = None
+    if media_files:
+        warning = (
+            f"MEDIA attachments were omitted for {platform.value}; "
+            "native send_message media delivery is currently only supported for telegram"
+        )
+
+    if platform == Platform.DISCORD:
+        result = await _send_discord(pconfig.token, chat_id, message)
    elif platform == Platform.SLACK:
-        return await _send_slack(pconfig.token, chat_id, message)
+        result = await _send_slack(pconfig.token, chat_id, message)
    elif platform == Platform.SIGNAL:
-        return await _send_signal(pconfig.extra, chat_id, message)
+        result = await _send_signal(pconfig.extra, chat_id, message)
    elif platform == Platform.EMAIL:
-        return await _send_email(pconfig.extra, chat_id, message)
-    return {"error": f"Direct sending not yet implemented for {platform.value}"}
+        result = await _send_email(pconfig.extra, chat_id, message)
+    else:
+        result = {"error": f"Direct sending not yet implemented for {platform.value}"}
+
+    if warning and isinstance(result, dict) and result.get("success"):
+        warnings = list(result.get("warnings", []))
+        warnings.append(warning)
+        result["warnings"] = warnings
+    return result


-async def _send_telegram(token, chat_id, message, thread_id=None):
+async def _send_telegram(token, chat_id, message, media_files=None, thread_id=None):
    """Send via Telegram Bot API (one-shot, no polling needed)."""
    try:
        from telegram import Bot
+
        bot = Bot(token=token)
-        send_kwargs = {"chat_id": int(chat_id), "text": message}
+        int_chat_id = int(chat_id)
+        media_files = media_files or []
+        thread_kwargs = {}
        if thread_id is not None:
-            send_kwargs["message_thread_id"] = int(thread_id)
-        msg = await bot.send_message(**send_kwargs)
-        return {"success": True, "platform": "telegram", "chat_id": chat_id, "message_id": str(msg.message_id)}
+            thread_kwargs["message_thread_id"] = int(thread_id)
+
+        last_msg = None
+        warnings = []
+
+        if message.strip():
+            last_msg = await bot.send_message(
+                chat_id=int_chat_id, text=message, **thread_kwargs
+            )
+
+        for media_path, is_voice in media_files:
+            if not os.path.exists(media_path):
+                warning = f"Media file not found, skipping: {media_path}"
+                logger.warning(warning)
+                warnings.append(warning)
+                continue
+
+            ext = os.path.splitext(media_path)[1].lower()
+            try:
+                with open(media_path, "rb") as f:
+                    if ext in _IMAGE_EXTS:
+                        last_msg = await bot.send_photo(
+                            chat_id=int_chat_id, photo=f, **thread_kwargs
+                        )
+                    elif ext in _VIDEO_EXTS:
+                        last_msg = await bot.send_video(
+                            chat_id=int_chat_id, video=f, **thread_kwargs
+                        )
+                    elif ext in _VOICE_EXTS and is_voice:
+                        last_msg = await bot.send_voice(
+                            chat_id=int_chat_id, voice=f, **thread_kwargs
+                        )
+                    elif ext in _AUDIO_EXTS:
+                        last_msg = await bot.send_audio(
+                            chat_id=int_chat_id, audio=f, **thread_kwargs
+                        )
+                    else:
+                        last_msg = await bot.send_document(
+                            chat_id=int_chat_id, document=f, **thread_kwargs
+                        )
+            except Exception as e:
+                warning = f"Failed to send media {media_path}: {e}"
+                logger.error(warning)
+                warnings.append(warning)
+
+        if last_msg is None:
+            error = "No deliverable text or media remained after processing MEDIA tags"
+            if warnings:
+                return {"error": error, "warnings": warnings}
+            return {"error": error}
+
+        result = {
+            "success": True,
+            "platform": "telegram",
+            "chat_id": chat_id,
+            "message_id": str(last_msg.message_id),
+        }
+        if warnings:
+            result["warnings"] = warnings
+        return result
    except ImportError:
        return {"error": "python-telegram-bot not installed. Run: pip install python-telegram-bot"}
    except Exception as e: