fix: Discord voice bubble + edge-tts mp3/ogg format mismatch
- Send Discord voice messages with flags=8192 and waveform metadata so they render as native voice bubbles instead of file attachments.
- Use an .mp3 output path for TTS so the edge-tts Opus conversion works correctly (edge-tts always outputs MP3, and the conversion was skipped when the output path ended in .ogg).
- Use the actual file_path from the TTS result, which may differ from the requested path after Opus conversion.
This commit is contained in:
parent
f6cf4ca826
commit
cbe4c23efa
2 changed files with 75 additions and 13 deletions
|
|
@ -297,9 +297,66 @@ class DiscordAdapter(BasePlatformAdapter):
|
|||
) -> SendResult:
|
||||
"""Send audio as a Discord file attachment."""
|
||||
try:
|
||||
return await self._send_file_attachment(chat_id, audio_path, caption)
|
||||
except FileNotFoundError:
|
||||
return SendResult(success=False, error=f"Audio file not found: {audio_path}")
|
||||
import io
|
||||
|
||||
channel = self._client.get_channel(int(chat_id))
|
||||
if not channel:
|
||||
channel = await self._client.fetch_channel(int(chat_id))
|
||||
if not channel:
|
||||
return SendResult(success=False, error=f"Channel {chat_id} not found")
|
||||
|
||||
if not os.path.exists(audio_path):
|
||||
return SendResult(success=False, error=f"Audio file not found: {audio_path}")
|
||||
|
||||
filename = os.path.basename(audio_path)
|
||||
|
||||
with open(audio_path, "rb") as f:
|
||||
file_data = f.read()
|
||||
|
||||
# Try sending as a native voice message via raw API (flags=8192).
|
||||
try:
|
||||
import base64
|
||||
|
||||
duration_secs = 5.0
|
||||
try:
|
||||
from mutagen.oggopus import OggOpus
|
||||
info = OggOpus(audio_path)
|
||||
duration_secs = info.info.length
|
||||
except Exception:
|
||||
duration_secs = max(1.0, len(file_data) / 2000.0)
|
||||
|
||||
waveform_bytes = bytes([128] * 256)
|
||||
waveform_b64 = base64.b64encode(waveform_bytes).decode()
|
||||
|
||||
import json as _json
|
||||
payload = _json.dumps({
|
||||
"flags": 8192,
|
||||
"attachments": [{
|
||||
"id": "0",
|
||||
"filename": "voice-message.ogg",
|
||||
"duration_secs": round(duration_secs, 2),
|
||||
"waveform": waveform_b64,
|
||||
}],
|
||||
})
|
||||
form = [
|
||||
{"name": "payload_json", "value": payload},
|
||||
{
|
||||
"name": "files[0]",
|
||||
"value": file_data,
|
||||
"filename": "voice-message.ogg",
|
||||
"content_type": "audio/ogg",
|
||||
},
|
||||
]
|
||||
msg_data = await self._client.http.request(
|
||||
discord.http.Route("POST", "/channels/{channel_id}/messages", channel_id=channel.id),
|
||||
form=form,
|
||||
)
|
||||
return SendResult(success=True, message_id=str(msg_data["id"]))
|
||||
except Exception as voice_err:
|
||||
logger.debug("Voice message flag failed, falling back to file: %s", voice_err)
|
||||
file = discord.File(io.BytesIO(file_data), filename=filename)
|
||||
msg = await channel.send(file=file)
|
||||
return SendResult(success=True, message_id=str(msg.id))
|
||||
except Exception as e: # pragma: no cover - defensive logging
|
||||
logger.error("[%s] Failed to send audio, falling back to base adapter: %s", self.name, e, exc_info=True)
|
||||
return await super().send_voice(chat_id, audio_path, caption, reply_to, metadata=metadata)
|
||||
|
|
|
|||
|
|
@ -2158,18 +2158,22 @@ class GatewayRunner:
|
|||
if not tts_text:
|
||||
return
|
||||
|
||||
ogg_path = os.path.join(
|
||||
# Use .mp3 extension so edge-tts conversion to opus works correctly.
|
||||
# The TTS tool may convert to .ogg — use file_path from result.
|
||||
audio_path = os.path.join(
|
||||
tempfile.gettempdir(), "hermes_voice",
|
||||
f"tts_reply_{int(time.time())}_{id(event) % 10000}.ogg",
|
||||
f"tts_reply_{int(time.time())}_{id(event) % 10000}.mp3",
|
||||
)
|
||||
os.makedirs(os.path.dirname(ogg_path), exist_ok=True)
|
||||
os.makedirs(os.path.dirname(audio_path), exist_ok=True)
|
||||
|
||||
result_json = await asyncio.to_thread(
|
||||
text_to_speech_tool, text=tts_text, output_path=ogg_path
|
||||
text_to_speech_tool, text=tts_text, output_path=audio_path
|
||||
)
|
||||
result = json.loads(result_json)
|
||||
|
||||
if not result.get("success") or not os.path.isfile(ogg_path):
|
||||
# Use the actual file path from result (may differ after opus conversion)
|
||||
actual_path = result.get("file_path", audio_path)
|
||||
if not result.get("success") or not os.path.isfile(actual_path):
|
||||
logger.warning("Auto voice reply TTS failed: %s", result.get("error"))
|
||||
return
|
||||
|
||||
|
|
@ -2177,7 +2181,7 @@ class GatewayRunner:
|
|||
if adapter and hasattr(adapter, "send_voice"):
|
||||
send_kwargs: Dict[str, Any] = {
|
||||
"chat_id": event.source.chat_id,
|
||||
"audio_path": ogg_path,
|
||||
"audio_path": actual_path,
|
||||
"reply_to": event.message_id,
|
||||
}
|
||||
if event.source.thread_id:
|
||||
|
|
@ -2188,10 +2192,11 @@ class GatewayRunner:
|
|||
if "metadata" not in sig.parameters:
|
||||
send_kwargs.pop("metadata", None)
|
||||
await adapter.send_voice(**send_kwargs)
|
||||
try:
|
||||
os.unlink(ogg_path)
|
||||
except OSError:
|
||||
pass
|
||||
for p in {audio_path, actual_path}:
|
||||
try:
|
||||
os.unlink(p)
|
||||
except OSError:
|
||||
pass
|
||||
except Exception as e:
|
||||
logger.warning("Auto voice reply failed: %s", e)
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue