feat: add Discord voice channel listening — STT transcription and agent response pipeline

Phase 2 of voice channel support: bot listens to users speaking in VC,
transcribes speech via Groq Whisper, and processes through the agent pipeline.

- Add VoiceReceiver class for RTP packet capture, NaCl/DAVE decryption, Opus decode
- Add silence detection and per-user PCM buffering
- Wire voice input callback from adapter to GatewayRunner
- Fix adapter dict key: use Platform.DISCORD enum instead of string
- Fix guild_id extraction for synthetic voice events via SimpleNamespace raw_message
- Pause/resume receiver during TTS playback to prevent echo
This commit is contained in:
0xbyt4 2026-03-11 04:34:58 +03:00
parent cc974904f8
commit c0c358d051
2 changed files with 454 additions and 17 deletions

View file

@ -1608,6 +1608,8 @@ class GatewayRunner:
(voice_mode == "all")
or (voice_mode == "voice_only" and is_voice_input)
)
logger.info("Voice reply check: chat_id=%s, voice_mode=%s, is_voice=%s, should_reply=%s, has_response=%s",
chat_id, voice_mode, is_voice_input, should_voice_reply, bool(response))
if should_voice_reply and response and not response.startswith("Error:"):
# Skip if agent already called TTS tool (avoid double voice)
has_agent_tts = any(
@ -1618,6 +1620,7 @@ class GatewayRunner:
)
for msg in agent_messages
)
logger.info("Voice reply: has_agent_tts=%s, calling _send_voice_reply", has_agent_tts)
if not has_agent_tts:
await self._send_voice_reply(event, response)
@ -2201,9 +2204,12 @@ class GatewayRunner:
adapter._voice_text_channels[guild_id] = int(event.source.chat_id)
self._voice_mode[event.source.chat_id] = "all"
self._save_voice_modes()
# Wire voice input callback so the adapter can deliver transcripts
if hasattr(adapter, "_voice_input_callback"):
adapter._voice_input_callback = self._handle_voice_channel_input
return (
f"Joined voice channel **{voice_channel.name}**.\n"
f"I'll speak my replies here. Use /voice leave to disconnect."
f"I'll speak my replies and listen to you. Use /voice leave to disconnect."
)
return "Failed to join voice channel. Check bot permissions (Connect + Speak)."
@ -2223,6 +2229,49 @@ class GatewayRunner:
self._save_voice_modes()
return "Left voice channel."
async def _handle_voice_channel_input(
self, guild_id: int, user_id: int, transcript: str
):
"""Handle transcribed voice from a user in a voice channel.
Creates a synthetic MessageEvent and processes it through the
adapter's full message pipeline (session, typing, agent, TTS reply).
"""
adapter = self.adapters.get(Platform.DISCORD)
if not adapter:
return
text_ch_id = adapter._voice_text_channels.get(guild_id)
if not text_ch_id:
return
# Show transcript in text channel
try:
channel = adapter._client.get_channel(text_ch_id)
if channel:
await channel.send(f"**[Voice]** <@{user_id}>: {transcript}")
except Exception:
pass
# Build a synthetic MessageEvent and feed through the normal pipeline
source = SessionSource(
platform=Platform.DISCORD,
chat_id=str(text_ch_id),
user_id=str(user_id),
user_name=str(user_id),
)
# Use SimpleNamespace as raw_message so _get_guild_id() can extract
# guild_id and _send_voice_reply() plays audio in the voice channel.
from types import SimpleNamespace
event = MessageEvent(
source=source,
text=transcript,
message_type=MessageType.VOICE,
raw_message=SimpleNamespace(guild_id=guild_id, guild=None),
)
await adapter.handle_message(event)
async def _send_voice_reply(self, event: MessageEvent, text: str) -> None:
"""Generate TTS audio and send as a voice message before the text reply."""
try: