From cbe4c23efa064c6572af6bed547c989b509a2508 Mon Sep 17 00:00:00 2001 From: 0xbyt4 <35742124+0xbyt4@users.noreply.github.com> Date: Wed, 11 Mar 2026 00:24:29 +0300 Subject: [PATCH] fix: Discord voice bubble + edge-tts mp3/ogg format mismatch - Send Discord voice messages with flags=8192 and waveform metadata so they render as native voice bubbles instead of file attachments - Use .mp3 output path for TTS so edge-tts opus conversion works correctly (edge always outputs mp3, convert was skipped for .ogg) - Use actual file_path from TTS result after potential opus conversion --- gateway/platforms/discord.py | 63 ++++++++++++++++++++++++++++++++++-- gateway/run.py | 25 ++++++++------ 2 files changed, 75 insertions(+), 13 deletions(-) diff --git a/gateway/platforms/discord.py b/gateway/platforms/discord.py index d472aead..717fc921 100644 --- a/gateway/platforms/discord.py +++ b/gateway/platforms/discord.py @@ -297,9 +297,66 @@ class DiscordAdapter(BasePlatformAdapter): ) -> SendResult: """Send audio as a Discord file attachment.""" try: - return await self._send_file_attachment(chat_id, audio_path, caption) - except FileNotFoundError: - return SendResult(success=False, error=f"Audio file not found: {audio_path}") + import io + + channel = self._client.get_channel(int(chat_id)) + if not channel: + channel = await self._client.fetch_channel(int(chat_id)) + if not channel: + return SendResult(success=False, error=f"Channel {chat_id} not found") + + if not os.path.exists(audio_path): + return SendResult(success=False, error=f"Audio file not found: {audio_path}") + + filename = os.path.basename(audio_path) + + with open(audio_path, "rb") as f: + file_data = f.read() + + # Try sending as a native voice message via raw API (flags=8192). + try: + import base64 + + duration_secs = 5.0 + try: + from mutagen.oggopus import OggOpus + info = OggOpus(audio_path) + duration_secs = info.info.length + except Exception: + duration_secs = max(1.0, len(file_data) / 2000.0) + + waveform_bytes = bytes([128] * 256) + waveform_b64 = base64.b64encode(waveform_bytes).decode() + + import json as _json + payload = _json.dumps({ + "flags": 8192, + "attachments": [{ + "id": "0", + "filename": "voice-message.ogg", + "duration_secs": round(duration_secs, 2), + "waveform": waveform_b64, + }], + }) + form = [ + {"name": "payload_json", "value": payload}, + { + "name": "files[0]", + "value": file_data, + "filename": "voice-message.ogg", + "content_type": "audio/ogg", + }, + ] + msg_data = await self._client.http.request( + discord.http.Route("POST", "/channels/{channel_id}/messages", channel_id=channel.id), + form=form, + ) + return SendResult(success=True, message_id=str(msg_data["id"])) + except Exception as voice_err: + logger.debug("Voice message flag failed, falling back to file: %s", voice_err) + file = discord.File(io.BytesIO(file_data), filename=filename) + msg = await channel.send(file=file) + return SendResult(success=True, message_id=str(msg.id)) except Exception as e: # pragma: no cover - defensive logging logger.error("[%s] Failed to send audio, falling back to base adapter: %s", self.name, e, exc_info=True) return await super().send_voice(chat_id, audio_path, caption, reply_to, metadata=metadata) diff --git a/gateway/run.py b/gateway/run.py index 18757f93..79e5c3bc 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -2158,18 +2158,22 @@ class GatewayRunner: if not tts_text: return - ogg_path = os.path.join( + # Use .mp3 extension so edge-tts conversion to opus works correctly. + # The TTS tool may convert to .ogg — use file_path from result. + audio_path = os.path.join( tempfile.gettempdir(), "hermes_voice", - f"tts_reply_{int(time.time())}_{id(event) % 10000}.ogg", + f"tts_reply_{int(time.time())}_{id(event) % 10000}.mp3", ) - os.makedirs(os.path.dirname(ogg_path), exist_ok=True) + os.makedirs(os.path.dirname(audio_path), exist_ok=True) result_json = await asyncio.to_thread( - text_to_speech_tool, text=tts_text, output_path=ogg_path + text_to_speech_tool, text=tts_text, output_path=audio_path ) result = json.loads(result_json) - if not result.get("success") or not os.path.isfile(ogg_path): + # Use the actual file path from result (may differ after opus conversion) + actual_path = result.get("file_path", audio_path) + if not result.get("success") or not os.path.isfile(actual_path): logger.warning("Auto voice reply TTS failed: %s", result.get("error")) return @@ -2177,7 +2181,7 @@ class GatewayRunner: if adapter and hasattr(adapter, "send_voice"): send_kwargs: Dict[str, Any] = { "chat_id": event.source.chat_id, - "audio_path": ogg_path, + "audio_path": actual_path, "reply_to": event.message_id, } if event.source.thread_id: @@ -2188,10 +2192,11 @@ class GatewayRunner: if "metadata" not in sig.parameters: send_kwargs.pop("metadata", None) await adapter.send_voice(**send_kwargs) - try: - os.unlink(ogg_path) - except OSError: - pass + for p in {audio_path, actual_path}: + try: + os.unlink(p) + except OSError: + pass except Exception as e: logger.warning("Auto voice reply failed: %s", e)