fix: voice pipeline thread safety and error handling bugs

- Add lock protection around VoiceReceiver buffer writes in _on_packet to prevent race condition with check_silence on different threads - Wire _voice_input_callback BEFORE join_voice_channel to avoid losing voice input during the join window - Add try/except around leave_voice_channel to ensure state cleanup (voice_mode, callback) even if leave raises an exception - Guard against empty text after markdown stripping in base.py auto-TTS - Add 11 tests proving each bug and verifying the fix
2026-03-11 23:36:47 +03:00 · 2026-03-11 23:36:47 +03:00 · c925d2ee76
commit c925d2ee76
parent 34c324ff59
4 changed files with 275 additions and 23 deletions
--- a/gateway/platforms/base.py
+++ b/gateway/platforms/base.py
@ -739,7 +739,9 @@ class BasePlatformAdapter(ABC):
                        from tools.tts_tool import text_to_speech_tool, check_tts_requirements
                        if check_tts_requirements():
                            import json as _json
-                            speech_text = re.sub(r'[*_`#\[\]()]', '', text_content)[:4000]
+                            speech_text = re.sub(r'[*_`#\[\]()]', '', text_content)[:4000].strip()
+                            if not speech_text:
+                                raise ValueError("Empty text after markdown cleanup")
                            tts_result_str = await asyncio.to_thread(
                                text_to_speech_tool, text=speech_text
                            )
--- a/gateway/platforms/discord.py
+++ b/gateway/platforms/discord.py
@ -289,8 +289,9 @@ class VoiceReceiver:
            if ssrc not in self._decoders:
                self._decoders[ssrc] = discord.opus.Decoder()
            pcm = self._decoders[ssrc].decode(decrypted)
-            self._buffers[ssrc].extend(pcm)
-            self._last_packet_time[ssrc] = time.monotonic()
+            with self._lock:
+                self._buffers[ssrc].extend(pcm)
+                self._last_packet_time[ssrc] = time.monotonic()
        except Exception:
            return

@ -305,24 +306,25 @@ class VoiceReceiver:

        with self._lock:
            ssrc_user_map = dict(self._ssrc_to_user)
+            ssrc_list = list(self._buffers.keys())

-        for ssrc in list(self._buffers.keys()):
-            last_time = self._last_packet_time.get(ssrc, now)
-            silence_duration = now - last_time
-            buf = self._buffers[ssrc]
-            # 48kHz, 16-bit, stereo = 192000 bytes/sec
-            buf_duration = len(buf) / (self.SAMPLE_RATE * self.CHANNELS * 2)
+            for ssrc in ssrc_list:
+                last_time = self._last_packet_time.get(ssrc, now)
+                silence_duration = now - last_time
+                buf = self._buffers[ssrc]
+                # 48kHz, 16-bit, stereo = 192000 bytes/sec
+                buf_duration = len(buf) / (self.SAMPLE_RATE * self.CHANNELS * 2)

-            if silence_duration >= self.SILENCE_THRESHOLD and buf_duration >= self.MIN_SPEECH_DURATION:
-                user_id = ssrc_user_map.get(ssrc, 0)
-                if user_id:
-                    completed.append((user_id, bytes(buf)))
-                self._buffers[ssrc] = bytearray()
-                self._last_packet_time.pop(ssrc, None)
-            elif silence_duration >= self.SILENCE_THRESHOLD * 2:
-                # Stale buffer with no valid user — discard
-                self._buffers.pop(ssrc, None)
-                self._last_packet_time.pop(ssrc, None)
+                if silence_duration >= self.SILENCE_THRESHOLD and buf_duration >= self.MIN_SPEECH_DURATION:
+                    user_id = ssrc_user_map.get(ssrc, 0)
+                    if user_id:
+                        completed.append((user_id, bytes(buf)))
+                    self._buffers[ssrc] = bytearray()
+                    self._last_packet_time.pop(ssrc, None)
+                elif silence_duration >= self.SILENCE_THRESHOLD * 2:
+                    # Stale buffer with no valid user — discard
+                    self._buffers.pop(ssrc, None)
+                    self._last_packet_time.pop(ssrc, None)

        return completed

--- a/gateway/run.py
+++ b/gateway/run.py
@ -2190,23 +2190,28 @@ class GatewayRunner:
        if not voice_channel:
            return "You need to be in a voice channel first."

+        # Wire callback BEFORE join so voice input arriving immediately
+        # after connection is not lost.
+        if hasattr(adapter, "_voice_input_callback"):
+            adapter._voice_input_callback = self._handle_voice_channel_input
+
        try:
            success = await adapter.join_voice_channel(voice_channel)
        except Exception as e:
            logger.warning("Failed to join voice channel: %s", e)
+            adapter._voice_input_callback = None
            return f"Failed to join voice channel: {e}"

        if success:
            adapter._voice_text_channels[guild_id] = int(event.source.chat_id)
            self._voice_mode[event.source.chat_id] = "all"
            self._save_voice_modes()
-            # Wire voice input callback so the adapter can deliver transcripts
-            if hasattr(adapter, "_voice_input_callback"):
-                adapter._voice_input_callback = self._handle_voice_channel_input
            return (
                f"Joined voice channel **{voice_channel.name}**.\n"
                f"I'll speak my replies and listen to you. Use /voice leave to disconnect."
            )
+        # Join failed — clear callback
+        adapter._voice_input_callback = None
        return "Failed to join voice channel. Check bot permissions (Connect + Speak)."

    async def _handle_voice_channel_leave(self, event: MessageEvent) -> str:
@ -2220,9 +2225,15 @@ class GatewayRunner:
        if not hasattr(adapter, "is_in_voice_channel") or not adapter.is_in_voice_channel(guild_id):
            return "Not in a voice channel."

-        await adapter.leave_voice_channel(guild_id)
+        try:
+            await adapter.leave_voice_channel(guild_id)
+        except Exception as e:
+            logger.warning("Error leaving voice channel: %s", e)
+        # Always clean up state even if leave raised an exception
        self._voice_mode.pop(event.source.chat_id, None)
        self._save_voice_modes()
+        if hasattr(adapter, "_voice_input_callback"):
+            adapter._voice_input_callback = None
        return "Left voice channel."

    async def _handle_voice_channel_input(