From cbe4c23efa064c6572af6bed547c989b509a2508 Mon Sep 17 00:00:00 2001
From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com>
Date: Wed, 11 Mar 2026 00:24:29 +0300
Subject: [PATCH] fix: Discord voice bubble + edge-tts mp3/ogg format mismatch

- Send Discord voice messages with flags=8192 and waveform metadata
  so they render as native voice bubbles instead of file attachments
- Use a .mp3 output path for TTS so the edge-tts opus conversion runs
  (edge-tts always outputs mp3; the conversion was skipped for .ogg paths)
- Use actual file_path from TTS result after potential opus conversion
---
 gateway/platforms/discord.py | 63 ++++++++++++++++++++++++++++++++++--
 gateway/run.py               | 25 ++++++++------
 2 files changed, 75 insertions(+), 13 deletions(-)

diff --git a/gateway/platforms/discord.py b/gateway/platforms/discord.py
index d472aead..717fc921 100644
--- a/gateway/platforms/discord.py
+++ b/gateway/platforms/discord.py
@@ -297,9 +297,66 @@ class DiscordAdapter(BasePlatformAdapter):
     ) -> SendResult:
         """Send audio as a Discord file attachment."""
         try:
-            return await self._send_file_attachment(chat_id, audio_path, caption)
-        except FileNotFoundError:
-            return SendResult(success=False, error=f"Audio file not found: {audio_path}")
+            import io
+
+            channel = self._client.get_channel(int(chat_id))
+            if not channel:
+                channel = await self._client.fetch_channel(int(chat_id))
+            if not channel:
+                return SendResult(success=False, error=f"Channel {chat_id} not found")
+
+            if not os.path.exists(audio_path):
+                return SendResult(success=False, error=f"Audio file not found: {audio_path}")
+
+            filename = os.path.basename(audio_path)
+
+            with open(audio_path, "rb") as f:
+                file_data = f.read()
+
+            # Try sending as a native voice message via raw API (flags=8192).
+            try:
+                import base64
+
+                duration_secs = 5.0
+                try:
+                    from mutagen.oggopus import OggOpus
+                    info = OggOpus(audio_path)
+                    duration_secs = info.info.length
+                except Exception:
+                    duration_secs = max(1.0, len(file_data) / 2000.0)
+
+                waveform_bytes = bytes([128] * 256)
+                waveform_b64 = base64.b64encode(waveform_bytes).decode()
+
+                import json as _json
+                payload = _json.dumps({
+                    "flags": 8192,
+                    "attachments": [{
+                        "id": "0",
+                        "filename": "voice-message.ogg",
+                        "duration_secs": round(duration_secs, 2),
+                        "waveform": waveform_b64,
+                    }],
+                })
+                form = [
+                    {"name": "payload_json", "value": payload},
+                    {
+                        "name": "files[0]",
+                        "value": file_data,
+                        "filename": "voice-message.ogg",
+                        "content_type": "audio/ogg",
+                    },
+                ]
+                msg_data = await self._client.http.request(
+                    discord.http.Route("POST", "/channels/{channel_id}/messages", channel_id=channel.id),
+                    form=form,
+                )
+                return SendResult(success=True, message_id=str(msg_data["id"]))
+            except Exception as voice_err:
+                logger.debug("Voice message flag failed, falling back to file: %s", voice_err)
+                file = discord.File(io.BytesIO(file_data), filename=filename)
+                msg = await channel.send(file=file)
+                return SendResult(success=True, message_id=str(msg.id))
         except Exception as e:  # pragma: no cover - defensive logging
             logger.error("[%s] Failed to send audio, falling back to base adapter: %s", self.name, e, exc_info=True)
             return await super().send_voice(chat_id, audio_path, caption, reply_to, metadata=metadata)
diff --git a/gateway/run.py b/gateway/run.py
index 18757f93..79e5c3bc 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -2158,18 +2158,22 @@ class GatewayRunner:
             if not tts_text:
                 return
 
-            ogg_path = os.path.join(
+            # Use .mp3 extension so edge-tts conversion to opus works correctly.
+            # The TTS tool may convert to .ogg — use file_path from result.
+            audio_path = os.path.join(
                 tempfile.gettempdir(), "hermes_voice",
-                f"tts_reply_{int(time.time())}_{id(event) % 10000}.ogg",
+                f"tts_reply_{int(time.time())}_{id(event) % 10000}.mp3",
             )
-            os.makedirs(os.path.dirname(ogg_path), exist_ok=True)
+            os.makedirs(os.path.dirname(audio_path), exist_ok=True)
 
             result_json = await asyncio.to_thread(
-                text_to_speech_tool, text=tts_text, output_path=ogg_path
+                text_to_speech_tool, text=tts_text, output_path=audio_path
             )
             result = json.loads(result_json)
 
-            if not result.get("success") or not os.path.isfile(ogg_path):
+            # Use the actual file path from result (may differ after opus conversion)
+            actual_path = result.get("file_path", audio_path)
+            if not result.get("success") or not os.path.isfile(actual_path):
                 logger.warning("Auto voice reply TTS failed: %s", result.get("error"))
                 return
 
@@ -2177,7 +2181,7 @@ class GatewayRunner:
             if adapter and hasattr(adapter, "send_voice"):
                 send_kwargs: Dict[str, Any] = {
                     "chat_id": event.source.chat_id,
-                    "audio_path": ogg_path,
+                    "audio_path": actual_path,
                     "reply_to": event.message_id,
                 }
                 if event.source.thread_id:
@@ -2188,10 +2192,11 @@ class GatewayRunner:
                 if "metadata" not in sig.parameters:
                     send_kwargs.pop("metadata", None)
                 await adapter.send_voice(**send_kwargs)
-            try:
-                os.unlink(ogg_path)
-            except OSError:
-                pass
+            for p in {audio_path, actual_path}:
+                try:
+                    os.unlink(p)
+                except OSError:
+                    pass
         except Exception as e:
             logger.warning("Auto voice reply failed: %s", e)