fix: address PR review round 5 — streaming guard, VC auth, history prefix, auto-TTS control

1. Gate _streaming_api_call to chat_completions mode only — Anthropic and
   Codex fall back to _interruptible_api_call. Preserve Anthropic base_url
   across all client rebuild paths (interrupt, fallback, 401 refresh).

2. Discord VC synthetic events now use chat_type="channel" instead of
   defaulting to "dm" — prevents session bleed into DM context.
   Authorization runs before echoing transcript. Sanitize @everyone/@here
   in voice transcripts.

3. CLI voice prefix ("[Voice input...]") is now API-call-local only —
   stripped from returned history so it never persists to session DB or
   resumed sessions.

4. /voice off now disables base adapter auto-TTS via _auto_tts_disabled_chats
   set — voice input no longer triggers TTS when voice mode is off.
This commit is contained in:
0xbyt4 2026-03-14 10:31:49 +03:00
parent 35748a2fb0
commit cc0a453476
5 changed files with 59 additions and 22 deletions

View file

@ -351,6 +351,8 @@ class BasePlatformAdapter(ABC):
# Key: session_key (e.g., chat_id), Value: (event, asyncio.Event for interrupt)
self._active_sessions: Dict[str, asyncio.Event] = {}
self._pending_messages: Dict[str, MessageEvent] = {}
# Chats where auto-TTS on voice input is disabled (set by /voice off)
self._auto_tts_disabled_chats: set = set()
@property
def name(self) -> str:
@ -733,8 +735,12 @@ class BasePlatformAdapter(ABC):
logger.info("[%s] extract_images found %d image(s) in response (%d chars)", self.name, len(images), len(response))
# Auto-TTS: if voice message, generate audio FIRST (before sending text)
# Skipped when the chat has voice mode disabled (/voice off)
_tts_path = None
if event.message_type == MessageType.VOICE and text_content and not media_files:
if (event.message_type == MessageType.VOICE
and text_content
and not media_files
and event.source.chat_id not in self._auto_tts_disabled_chats):
try:
from tools.tts_tool import text_to_speech_tool, check_tts_requirements
if check_tts_requirements():

View file

@ -2119,9 +2119,13 @@ class GatewayRunner:
args = event.get_command_args().strip().lower()
chat_id = event.source.chat_id
adapter = self.adapters.get(event.source.platform)
if args in ("on", "enable"):
self._voice_mode[chat_id] = "voice_only"
self._save_voice_modes()
if adapter:
adapter._auto_tts_disabled_chats.discard(chat_id)
return (
"Voice mode enabled.\n"
"I'll reply with voice when you send voice messages.\n"
@ -2130,10 +2134,14 @@ class GatewayRunner:
elif args in ("off", "disable"):
self._voice_mode.pop(chat_id, None)
self._save_voice_modes()
if adapter:
adapter._auto_tts_disabled_chats.add(chat_id)
return "Voice mode disabled. Text-only replies."
elif args == "tts":
self._voice_mode[chat_id] = "all"
self._save_voice_modes()
if adapter:
adapter._auto_tts_disabled_chats.discard(chat_id)
return (
"Auto-TTS enabled.\n"
"All replies will include a voice message."
@ -2171,10 +2179,14 @@ class GatewayRunner:
if current == "off":
self._voice_mode[chat_id] = "voice_only"
self._save_voice_modes()
if adapter:
adapter._auto_tts_disabled_chats.discard(chat_id)
return "Voice mode enabled."
else:
self._voice_mode.pop(chat_id, None)
self._save_voice_modes()
if adapter:
adapter._auto_tts_disabled_chats.add(chat_id)
return "Voice mode disabled."
async def _handle_voice_channel_join(self, event: MessageEvent) -> str:
@ -2211,6 +2223,7 @@ class GatewayRunner:
adapter._voice_text_channels[guild_id] = int(event.source.chat_id)
self._voice_mode[event.source.chat_id] = "all"
self._save_voice_modes()
adapter._auto_tts_disabled_chats.discard(event.source.chat_id)
return (
f"Joined voice channel **{voice_channel.name}**.\n"
f"I'll speak my replies and listen to you. Use /voice leave to disconnect."
@ -2265,21 +2278,28 @@ class GatewayRunner:
if not text_ch_id:
return
# Show transcript in text channel
try:
channel = adapter._client.get_channel(text_ch_id)
if channel:
await channel.send(f"**[Voice]** <@{user_id}>: {transcript}")
except Exception:
pass
# Build a synthetic MessageEvent and feed through the normal pipeline
# Check authorization before processing voice input
source = SessionSource(
platform=Platform.DISCORD,
chat_id=str(text_ch_id),
user_id=str(user_id),
user_name=str(user_id),
chat_type="channel",
)
if not self._is_user_authorized(source):
logger.debug("Unauthorized voice input from user %d, ignoring", user_id)
return
# Show transcript in text channel (after auth, with mention sanitization)
try:
channel = adapter._client.get_channel(text_ch_id)
if channel:
safe_text = transcript[:2000].replace("@everyone", "@\u200beveryone").replace("@here", "@\u200bhere")
await channel.send(f"**[Voice]** <@{user_id}>: {safe_text}")
except Exception:
pass
# Build a synthetic MessageEvent and feed through the normal pipeline
# Use SimpleNamespace as raw_message so _get_guild_id() can extract
# guild_id and _send_voice_reply() plays audio in the voice channel.
from types import SimpleNamespace