Merge pull request #1290 from NousResearch/hermes/hermes-f48b210a

fix(send_message): salvage and complete MEDIA delivery from #971
This commit is contained in:
Teknium 2026-03-14 04:12:54 -07:00 committed by GitHub
commit b42ee3050e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 347 additions and 28 deletions

View file

@ -618,16 +618,22 @@ class BasePlatformAdapter(ABC):
has_voice_tag = "[[audio_as_voice]]" in content
cleaned = cleaned.replace("[[audio_as_voice]]", "")
# Extract MEDIA:<path> tags (path may contain spaces)
media_pattern = r'MEDIA:(\S+)'
for match in re.finditer(media_pattern, content):
path = match.group(1).strip()
# Extract MEDIA:<path> tags, allowing optional whitespace after the colon
# and quoted/backticked paths for LLM-formatted outputs.
media_pattern = re.compile(
r'''[`"']?MEDIA:\s*(?P<path>`[^`\n]+`|"[^"\n]+"|'[^'\n]+'|\S+)[`"']?'''
)
for match in media_pattern.finditer(content):
path = match.group("path").strip()
if len(path) >= 2 and path[0] == path[-1] and path[0] in "`\"'":
path = path[1:-1].strip()
path = path.lstrip("`\"'").rstrip("`\"',.;:)}]")
if path:
media.append((path, has_voice_tag))
# Remove MEDIA tags from content
# Remove MEDIA tags from content (including surrounding quote/backtick wrappers)
if media:
cleaned = re.sub(media_pattern, '', cleaned)
cleaned = media_pattern.sub('', cleaned)
cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip()
return media, cleaned

View file

@ -258,6 +258,29 @@ class TestExtractMedia:
_, cleaned = BasePlatformAdapter.extract_media(content)
assert "\n\n\n" not in cleaned
def test_media_tag_allows_optional_whitespace_after_colon(self):
content = "MEDIA: /path/to/audio.ogg"
media, cleaned = BasePlatformAdapter.extract_media(content)
assert media == [("/path/to/audio.ogg", False)]
assert cleaned == ""
def test_media_tag_strips_wrapping_quotes_and_backticks(self):
content = "MEDIA: `/path/to/file.png`\nMEDIA:\"/path/to/file2.png\"\nMEDIA:'/path/to/file3.png'"
media, cleaned = BasePlatformAdapter.extract_media(content)
assert media == [
("/path/to/file.png", False),
("/path/to/file2.png", False),
("/path/to/file3.png", False),
]
assert cleaned == ""
def test_media_tag_supports_quoted_paths_with_spaces(self):
content = "Here\nMEDIA: '/tmp/my image.png'\nAfter"
media, cleaned = BasePlatformAdapter.extract_media(content)
assert media == [("/tmp/my image.png", False)]
assert "Here" in cleaned
assert "After" in cleaned
# ---------------------------------------------------------------------------
# truncate_message

View file

@ -2,11 +2,13 @@
import asyncio
import json
import sys
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import AsyncMock, patch
from unittest.mock import AsyncMock, MagicMock, patch
from gateway.config import Platform
from tools.send_message_tool import send_message_tool
from tools.send_message_tool import _send_telegram, send_message_tool
def _run_async_immediately(coro):
@ -14,13 +16,18 @@ def _run_async_immediately(coro):
def _make_config():
telegram_cfg = SimpleNamespace(enabled=True, token="fake-token", extra={})
telegram_cfg = SimpleNamespace(enabled=True, token="***", extra={})
return SimpleNamespace(
platforms={Platform.TELEGRAM: telegram_cfg},
get_home_channel=lambda _platform: None,
), telegram_cfg
def _install_telegram_mock(monkeypatch, bot):
telegram_mod = SimpleNamespace(Bot=lambda token: bot)
monkeypatch.setitem(sys.modules, "telegram", telegram_mod)
class TestSendMessageTool:
def test_sends_to_explicit_telegram_topic_target(self):
config, telegram_cfg = _make_config()
@ -41,7 +48,14 @@ class TestSendMessageTool:
)
assert result["success"] is True
send_mock.assert_awaited_once_with(Platform.TELEGRAM, telegram_cfg, "-1001", "hello", thread_id="17585")
send_mock.assert_awaited_once_with(
Platform.TELEGRAM,
telegram_cfg,
"-1001",
"hello",
thread_id="17585",
media_files=[],
)
mirror_mock.assert_called_once_with("telegram", "-1001", "hello", source_label="cli", thread_id="17585")
def test_resolved_telegram_topic_name_preserves_thread_id(self):
@ -64,4 +78,154 @@ class TestSendMessageTool:
)
assert result["success"] is True
send_mock.assert_awaited_once_with(Platform.TELEGRAM, telegram_cfg, "-1001", "hello", thread_id="17585")
send_mock.assert_awaited_once_with(
Platform.TELEGRAM,
telegram_cfg,
"-1001",
"hello",
thread_id="17585",
media_files=[],
)
def test_media_only_message_uses_placeholder_for_mirroring(self):
config, telegram_cfg = _make_config()
with patch("gateway.config.load_gateway_config", return_value=config), \
patch("tools.interrupt.is_interrupted", return_value=False), \
patch("model_tools._run_async", side_effect=_run_async_immediately), \
patch("tools.send_message_tool._send_to_platform", new=AsyncMock(return_value={"success": True})) as send_mock, \
patch("gateway.mirror.mirror_to_session", return_value=True) as mirror_mock:
result = json.loads(
send_message_tool(
{
"action": "send",
"target": "telegram:-1001",
"message": "MEDIA:/tmp/example.ogg",
}
)
)
assert result["success"] is True
send_mock.assert_awaited_once_with(
Platform.TELEGRAM,
telegram_cfg,
"-1001",
"",
thread_id=None,
media_files=[("/tmp/example.ogg", False)],
)
mirror_mock.assert_called_once_with(
"telegram",
"-1001",
"[Sent audio attachment]",
source_label="cli",
thread_id=None,
)
class TestSendTelegramMediaDelivery:
def test_sends_text_then_photo_for_media_tag(self, tmp_path, monkeypatch):
image_path = tmp_path / "photo.png"
image_path.write_bytes(b"\x89PNG\r\n\x1a\n" + b"\x00" * 32)
bot = MagicMock()
bot.send_message = AsyncMock(return_value=SimpleNamespace(message_id=1))
bot.send_photo = AsyncMock(return_value=SimpleNamespace(message_id=2))
bot.send_video = AsyncMock()
bot.send_voice = AsyncMock()
bot.send_audio = AsyncMock()
bot.send_document = AsyncMock()
_install_telegram_mock(monkeypatch, bot)
result = asyncio.run(
_send_telegram(
"token",
"12345",
"Hello there",
media_files=[(str(image_path), False)],
)
)
assert result["success"] is True
assert result["message_id"] == "2"
bot.send_message.assert_awaited_once()
bot.send_photo.assert_awaited_once()
sent_text = bot.send_message.await_args.kwargs["text"]
assert "MEDIA:" not in sent_text
assert sent_text == "Hello there"
def test_sends_voice_for_ogg_with_voice_directive(self, tmp_path, monkeypatch):
voice_path = tmp_path / "voice.ogg"
voice_path.write_bytes(b"OggS" + b"\x00" * 32)
bot = MagicMock()
bot.send_message = AsyncMock()
bot.send_photo = AsyncMock()
bot.send_video = AsyncMock()
bot.send_voice = AsyncMock(return_value=SimpleNamespace(message_id=7))
bot.send_audio = AsyncMock()
bot.send_document = AsyncMock()
_install_telegram_mock(monkeypatch, bot)
result = asyncio.run(
_send_telegram(
"token",
"12345",
"",
media_files=[(str(voice_path), True)],
)
)
assert result["success"] is True
bot.send_voice.assert_awaited_once()
bot.send_audio.assert_not_awaited()
bot.send_message.assert_not_awaited()
def test_sends_audio_for_mp3(self, tmp_path, monkeypatch):
audio_path = tmp_path / "clip.mp3"
audio_path.write_bytes(b"ID3" + b"\x00" * 32)
bot = MagicMock()
bot.send_message = AsyncMock()
bot.send_photo = AsyncMock()
bot.send_video = AsyncMock()
bot.send_voice = AsyncMock()
bot.send_audio = AsyncMock(return_value=SimpleNamespace(message_id=8))
bot.send_document = AsyncMock()
_install_telegram_mock(monkeypatch, bot)
result = asyncio.run(
_send_telegram(
"token",
"12345",
"",
media_files=[(str(audio_path), False)],
)
)
assert result["success"] is True
bot.send_audio.assert_awaited_once()
bot.send_voice.assert_not_awaited()
def test_missing_media_returns_error_without_leaking_raw_tag(self, monkeypatch):
bot = MagicMock()
bot.send_message = AsyncMock()
bot.send_photo = AsyncMock()
bot.send_video = AsyncMock()
bot.send_voice = AsyncMock()
bot.send_audio = AsyncMock()
bot.send_document = AsyncMock()
_install_telegram_mock(monkeypatch, bot)
result = asyncio.run(
_send_telegram(
"token",
"12345",
"",
media_files=[("/tmp/does-not-exist.png", False)],
)
)
assert "error" in result
assert "No deliverable text or media remained" in result["error"]
bot.send_message.assert_not_awaited()

View file

@ -14,6 +14,10 @@ import time
logger = logging.getLogger(__name__)
_TELEGRAM_TOPIC_TARGET_RE = re.compile(r"^\s*(-?\d+)(?::(\d+))?\s*$")
_IMAGE_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".gif"}
_VIDEO_EXTS = {".mp4", ".mov", ".avi", ".mkv", ".3gp"}
_AUDIO_EXTS = {".ogg", ".opus", ".mp3", ".wav", ".m4a"}
_VOICE_EXTS = {".ogg", ".opus"}
SEND_MESSAGE_SCHEMA = {
@ -130,6 +134,11 @@ def _handle_send(args):
if not pconfig or not pconfig.enabled:
return json.dumps({"error": f"Platform '{platform_name}' is not configured. Set up credentials in ~/.hermes/gateway.json or environment variables."})
from gateway.platforms.base import BasePlatformAdapter
media_files, cleaned_message = BasePlatformAdapter.extract_media(message)
mirror_text = cleaned_message.strip() or _describe_media_for_mirror(media_files)
used_home_channel = False
if not chat_id:
home = config.get_home_channel(platform)
@ -145,16 +154,25 @@ def _handle_send(args):
try:
from model_tools import _run_async
result = _run_async(_send_to_platform(platform, pconfig, chat_id, message, thread_id=thread_id))
result = _run_async(
_send_to_platform(
platform,
pconfig,
chat_id,
cleaned_message,
thread_id=thread_id,
media_files=media_files,
)
)
if used_home_channel and isinstance(result, dict) and result.get("success"):
result["note"] = f"Sent to {platform_name} home channel (chat_id: {chat_id})"
# Mirror the sent message into the target's gateway session
if isinstance(result, dict) and result.get("success"):
if isinstance(result, dict) and result.get("success") and mirror_text:
try:
from gateway.mirror import mirror_to_session
source_label = os.getenv("HERMES_SESSION_PLATFORM", "cli")
if mirror_to_session(platform_name, chat_id, message, source_label=source_label, thread_id=thread_id):
if mirror_to_session(platform_name, chat_id, mirror_text, source_label=source_label, thread_id=thread_id):
result["mirrored"] = True
except Exception:
pass
@ -175,32 +193,140 @@ def _parse_target_ref(platform_name: str, target_ref: str):
return None, None, False
async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None):
def _describe_media_for_mirror(media_files):
"""Return a human-readable mirror summary when a message only contains media."""
if not media_files:
return ""
if len(media_files) == 1:
media_path, is_voice = media_files[0]
ext = os.path.splitext(media_path)[1].lower()
if is_voice and ext in _VOICE_EXTS:
return "[Sent voice message]"
if ext in _IMAGE_EXTS:
return "[Sent image attachment]"
if ext in _VIDEO_EXTS:
return "[Sent video attachment]"
if ext in _AUDIO_EXTS:
return "[Sent audio attachment]"
return "[Sent document attachment]"
return f"[Sent {len(media_files)} media attachments]"
async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None, media_files=None):
"""Route a message to the appropriate platform sender."""
from gateway.config import Platform
media_files = media_files or []
if platform == Platform.TELEGRAM:
return await _send_telegram(pconfig.token, chat_id, message, thread_id=thread_id)
elif platform == Platform.DISCORD:
return await _send_discord(pconfig.token, chat_id, message)
return await _send_telegram(
pconfig.token,
chat_id,
message,
media_files=media_files,
thread_id=thread_id,
)
if media_files and not message.strip():
return {
"error": (
f"send_message MEDIA delivery is currently only supported for telegram; "
f"target {platform.value} had only media attachments"
)
}
warning = None
if media_files:
warning = (
f"MEDIA attachments were omitted for {platform.value}; "
"native send_message media delivery is currently only supported for telegram"
)
if platform == Platform.DISCORD:
result = await _send_discord(pconfig.token, chat_id, message)
elif platform == Platform.SLACK:
return await _send_slack(pconfig.token, chat_id, message)
result = await _send_slack(pconfig.token, chat_id, message)
elif platform == Platform.SIGNAL:
return await _send_signal(pconfig.extra, chat_id, message)
result = await _send_signal(pconfig.extra, chat_id, message)
elif platform == Platform.EMAIL:
return await _send_email(pconfig.extra, chat_id, message)
return {"error": f"Direct sending not yet implemented for {platform.value}"}
result = await _send_email(pconfig.extra, chat_id, message)
else:
result = {"error": f"Direct sending not yet implemented for {platform.value}"}
if warning and isinstance(result, dict) and result.get("success"):
warnings = list(result.get("warnings", []))
warnings.append(warning)
result["warnings"] = warnings
return result
async def _send_telegram(token, chat_id, message, thread_id=None):
async def _send_telegram(token, chat_id, message, media_files=None, thread_id=None):
"""Send via Telegram Bot API (one-shot, no polling needed)."""
try:
from telegram import Bot
bot = Bot(token=token)
send_kwargs = {"chat_id": int(chat_id), "text": message}
int_chat_id = int(chat_id)
media_files = media_files or []
thread_kwargs = {}
if thread_id is not None:
send_kwargs["message_thread_id"] = int(thread_id)
msg = await bot.send_message(**send_kwargs)
return {"success": True, "platform": "telegram", "chat_id": chat_id, "message_id": str(msg.message_id)}
thread_kwargs["message_thread_id"] = int(thread_id)
last_msg = None
warnings = []
if message.strip():
last_msg = await bot.send_message(
chat_id=int_chat_id, text=message, **thread_kwargs
)
for media_path, is_voice in media_files:
if not os.path.exists(media_path):
warning = f"Media file not found, skipping: {media_path}"
logger.warning(warning)
warnings.append(warning)
continue
ext = os.path.splitext(media_path)[1].lower()
try:
with open(media_path, "rb") as f:
if ext in _IMAGE_EXTS:
last_msg = await bot.send_photo(
chat_id=int_chat_id, photo=f, **thread_kwargs
)
elif ext in _VIDEO_EXTS:
last_msg = await bot.send_video(
chat_id=int_chat_id, video=f, **thread_kwargs
)
elif ext in _VOICE_EXTS and is_voice:
last_msg = await bot.send_voice(
chat_id=int_chat_id, voice=f, **thread_kwargs
)
elif ext in _AUDIO_EXTS:
last_msg = await bot.send_audio(
chat_id=int_chat_id, audio=f, **thread_kwargs
)
else:
last_msg = await bot.send_document(
chat_id=int_chat_id, document=f, **thread_kwargs
)
except Exception as e:
warning = f"Failed to send media {media_path}: {e}"
logger.error(warning)
warnings.append(warning)
if last_msg is None:
error = "No deliverable text or media remained after processing MEDIA tags"
if warnings:
return {"error": error, "warnings": warnings}
return {"error": error}
result = {
"success": True,
"platform": "telegram",
"chat_id": chat_id,
"message_id": str(last_msg.message_id),
}
if warnings:
result["warnings"] = warnings
return result
except ImportError:
return {"error": "python-telegram-bot not installed. Run: pip install python-telegram-bot"}
except Exception as e: