fix: 8 voice pipeline bugs with tests proving each fix

1. VoiceReceiver.stop() now acquires _lock before clearing shared state
   to prevent race with _on_packet on the socket reader thread
2. _packet_debug_count moved from class-level to instance-level to avoid
   cross-instance race condition in multi-guild setups
3. play_in_voice_channel uses asyncio.get_running_loop() instead of
   deprecated asyncio.get_event_loop()
4. _send_voice_reply uses uuid for filenames instead of time-based names
   that can collide when two replies happen in the same second
5. Voice timeout now notifies runner via _on_voice_disconnect callback
   so runner cleans up _voice_mode state (prevents orphaned TTS replies)
6. play_in_voice_channel adds PLAYBACK_TIMEOUT (120s) to prevent
   infinite blocking when FFmpeg callback is never called
7. _send_voice_reply moves temp file cleanup to finally block so files
   are always cleaned up even when send_voice/play raises
8. Base adapter auto-TTS wraps play_tts in try/finally with os.remove
   to clean up generated audio files after playback

18 new tests (120 total voice tests)
This commit is contained in:
0xbyt4 2026-03-11 23:57:42 +03:00
parent c925d2ee76
commit 9722bd8be0
4 changed files with 517 additions and 26 deletions

View file

@ -752,11 +752,17 @@ class BasePlatformAdapter(ABC):
# Play TTS audio before text (voice-first experience)
if _tts_path and Path(_tts_path).exists():
await self.play_tts(
chat_id=event.source.chat_id,
audio_path=_tts_path,
metadata=_thread_metadata,
)
try:
await self.play_tts(
chat_id=event.source.chat_id,
audio_path=_tts_path,
metadata=_thread_metadata,
)
finally:
try:
os.remove(_tts_path)
except OSError:
pass
# Send the text portion
if text_content:

View file

@ -108,6 +108,9 @@ class VoiceReceiver:
# Pause flag: don't capture while bot is playing TTS
self._paused = False
# Debug logging counter (instance-level to avoid cross-instance races)
self._packet_debug_count = 0
# ------------------------------------------------------------------
# Lifecycle
# ------------------------------------------------------------------
@ -131,10 +134,11 @@ class VoiceReceiver:
self._vc._connection.remove_socket_listener(self._on_packet)
except Exception:
pass
self._buffers.clear()
self._last_packet_time.clear()
self._decoders.clear()
self._ssrc_to_user.clear()
with self._lock:
self._buffers.clear()
self._last_packet_time.clear()
self._decoders.clear()
self._ssrc_to_user.clear()
logger.info("VoiceReceiver stopped")
def pause(self):
@ -188,15 +192,13 @@ class VoiceReceiver:
# Packet handler (called from SocketReader thread)
# ------------------------------------------------------------------
_packet_debug_count = 0 # class-level counter for debug logging
def _on_packet(self, data: bytes):
if not self._running or self._paused:
return
# Log first few raw packets for debugging
VoiceReceiver._packet_debug_count += 1
if VoiceReceiver._packet_debug_count <= 5:
self._packet_debug_count += 1
if self._packet_debug_count <= 5:
logger.info(
"Raw UDP packet: len=%d, first_bytes=%s",
len(data), data[:4].hex() if len(data) >= 4 else "short",
@ -209,7 +211,7 @@ class VoiceReceiver:
# Lower bits may vary (padding, extension, CSRC count).
# Payload type (byte 1 lower 7 bits) = 0x78 (120) for voice.
if (data[0] >> 6) != 2 or (data[1] & 0x7F) != 0x78:
if VoiceReceiver._packet_debug_count <= 5:
if self._packet_debug_count <= 5:
logger.info("Skipped non-RTP: byte0=0x%02x byte1=0x%02x", data[0], data[1])
return
@ -235,7 +237,7 @@ class VoiceReceiver:
ext_words = struct.unpack_from(">H", data, ext_preamble_offset + 2)[0]
ext_data_len = ext_words * 4
if VoiceReceiver._packet_debug_count <= 10:
if self._packet_debug_count <= 10:
with self._lock:
known_user = self._ssrc_to_user.get(ssrc, "unknown")
logger.info(
@ -258,7 +260,7 @@ class VoiceReceiver:
box = nacl.secret.Aead(self._secret_key)
decrypted = box.decrypt(encrypted, header, bytes(nonce))
except Exception as e:
if VoiceReceiver._packet_debug_count <= 10:
if self._packet_debug_count <= 10:
logger.warning("NaCl decrypt failed: %s (hdr=%d, enc=%d)", e, header_size, len(encrypted))
return
@ -271,7 +273,7 @@ class VoiceReceiver:
with self._lock:
user_id = self._ssrc_to_user.get(ssrc, 0)
if user_id == 0:
if VoiceReceiver._packet_debug_count <= 10:
if self._packet_debug_count <= 10:
logger.warning("DAVE skip: unknown user for ssrc=%d", ssrc)
return # unknown user, can't DAVE-decrypt
try:
@ -280,7 +282,7 @@ class VoiceReceiver:
user_id, davey.MediaType.audio, decrypted
)
except Exception as e:
if VoiceReceiver._packet_debug_count <= 10:
if self._packet_debug_count <= 10:
logger.warning("DAVE decrypt failed for ssrc=%d: %s", ssrc, e)
return
@ -394,6 +396,7 @@ class DiscordAdapter(BasePlatformAdapter):
self._voice_receivers: Dict[int, VoiceReceiver] = {} # guild_id -> VoiceReceiver
self._voice_listen_tasks: Dict[int, asyncio.Task] = {} # guild_id -> listen loop
self._voice_input_callback: Optional[Callable] = None # set by run.py
self._on_voice_disconnect: Optional[Callable] = None # set by run.py
async def connect(self) -> bool:
"""Connect to Discord and start receiving events."""
@ -751,6 +754,9 @@ class DiscordAdapter(BasePlatformAdapter):
task.cancel()
self._voice_text_channels.pop(guild_id, None)
# Maximum seconds to wait for voice playback before giving up
PLAYBACK_TIMEOUT = 120
async def play_in_voice_channel(self, guild_id: int, audio_path: str) -> bool:
"""Play an audio file in the connected voice channel."""
vc = self._voice_clients.get(guild_id)
@ -763,12 +769,17 @@ class DiscordAdapter(BasePlatformAdapter):
receiver.pause()
try:
# Wait for current playback to finish
# Wait for current playback to finish (with timeout)
wait_start = time.monotonic()
while vc.is_playing():
if time.monotonic() - wait_start > self.PLAYBACK_TIMEOUT:
logger.warning("Timed out waiting for previous playback to finish")
vc.stop()
break
await asyncio.sleep(0.1)
done = asyncio.Event()
loop = asyncio.get_event_loop()
loop = asyncio.get_running_loop()
def _after(error):
if error:
@ -778,7 +789,11 @@ class DiscordAdapter(BasePlatformAdapter):
source = discord.FFmpegPCMAudio(audio_path)
source = discord.PCMVolumeTransformer(source, volume=1.0)
vc.play(source, after=_after)
await done.wait()
try:
await asyncio.wait_for(done.wait(), timeout=self.PLAYBACK_TIMEOUT)
except asyncio.TimeoutError:
logger.warning("Voice playback timed out after %ds", self.PLAYBACK_TIMEOUT)
vc.stop()
self._reset_voice_timeout(guild_id)
return True
finally:
@ -814,6 +829,12 @@ class DiscordAdapter(BasePlatformAdapter):
return
text_ch_id = self._voice_text_channels.get(guild_id)
await self.leave_voice_channel(guild_id)
# Notify the runner so it can clean up voice_mode state
if self._on_voice_disconnect and text_ch_id:
try:
self._on_voice_disconnect(str(text_ch_id))
except Exception:
pass
if text_ch_id and self._client:
ch = self._client.get_channel(text_ch_id)
if ch: