fix: address PR review round 5 — streaming guard, VC auth, history prefix, auto-TTS control
1. Gate _streaming_api_call to chat_completions mode only — Anthropic and
Codex fall back to _interruptible_api_call. Preserve Anthropic base_url
across all client rebuild paths (interrupt, fallback, 401 refresh).
2. Discord VC synthetic events now use chat_type="channel" instead of
defaulting to "dm" — prevents session bleed into DM context.
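Authorization now runs before the transcript is echoed to the text channel,
so unauthorized voice input is ignored without being posted. @everyone/@here
in voice transcripts are sanitized (a zero-width space is inserted after the "@").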
3. The CLI voice prefix ("[Voice input...]") is now local to the API call —
it is stripped from the returned history so it never persists to the session
DB or to resumed sessions.
4. /voice off now disables the base adapter's auto-TTS by adding the chat to the
_auto_tts_disabled_chats set — voice input no longer triggers TTS when voice mode is off.
This commit is contained in:
parent
35748a2fb0
commit
cc0a453476
5 changed files with 59 additions and 22 deletions
|
|
@ -351,6 +351,8 @@ class BasePlatformAdapter(ABC):
|
|||
# Key: session_key (e.g., chat_id), Value: asyncio.Event used to signal an interrupt
|
||||
self._active_sessions: Dict[str, asyncio.Event] = {}
|
||||
self._pending_messages: Dict[str, MessageEvent] = {}
|
||||
# Chats where auto-TTS on voice input is disabled (set by /voice off)
|
||||
self._auto_tts_disabled_chats: set = set()
|
||||
|
||||
@property
|
||||
def name(self) -> str:
|
||||
|
|
@ -733,8 +735,12 @@ class BasePlatformAdapter(ABC):
|
|||
logger.info("[%s] extract_images found %d image(s) in response (%d chars)", self.name, len(images), len(response))
|
||||
|
||||
# Auto-TTS: if voice message, generate audio FIRST (before sending text)
|
||||
# Skipped when the chat has voice mode disabled (/voice off)
|
||||
_tts_path = None
|
||||
if event.message_type == MessageType.VOICE and text_content and not media_files:
|
||||
if (event.message_type == MessageType.VOICE
|
||||
and text_content
|
||||
and not media_files
|
||||
and event.source.chat_id not in self._auto_tts_disabled_chats):
|
||||
try:
|
||||
from tools.tts_tool import text_to_speech_tool, check_tts_requirements
|
||||
if check_tts_requirements():
|
||||
|
|
|
|||
|
|
@ -2119,9 +2119,13 @@ class GatewayRunner:
|
|||
args = event.get_command_args().strip().lower()
|
||||
chat_id = event.source.chat_id
|
||||
|
||||
adapter = self.adapters.get(event.source.platform)
|
||||
|
||||
if args in ("on", "enable"):
|
||||
self._voice_mode[chat_id] = "voice_only"
|
||||
self._save_voice_modes()
|
||||
if adapter:
|
||||
adapter._auto_tts_disabled_chats.discard(chat_id)
|
||||
return (
|
||||
"Voice mode enabled.\n"
|
||||
"I'll reply with voice when you send voice messages.\n"
|
||||
|
|
@ -2130,10 +2134,14 @@ class GatewayRunner:
|
|||
elif args in ("off", "disable"):
|
||||
self._voice_mode.pop(chat_id, None)
|
||||
self._save_voice_modes()
|
||||
if adapter:
|
||||
adapter._auto_tts_disabled_chats.add(chat_id)
|
||||
return "Voice mode disabled. Text-only replies."
|
||||
elif args == "tts":
|
||||
self._voice_mode[chat_id] = "all"
|
||||
self._save_voice_modes()
|
||||
if adapter:
|
||||
adapter._auto_tts_disabled_chats.discard(chat_id)
|
||||
return (
|
||||
"Auto-TTS enabled.\n"
|
||||
"All replies will include a voice message."
|
||||
|
|
@ -2171,10 +2179,14 @@ class GatewayRunner:
|
|||
if current == "off":
|
||||
self._voice_mode[chat_id] = "voice_only"
|
||||
self._save_voice_modes()
|
||||
if adapter:
|
||||
adapter._auto_tts_disabled_chats.discard(chat_id)
|
||||
return "Voice mode enabled."
|
||||
else:
|
||||
self._voice_mode.pop(chat_id, None)
|
||||
self._save_voice_modes()
|
||||
if adapter:
|
||||
adapter._auto_tts_disabled_chats.add(chat_id)
|
||||
return "Voice mode disabled."
|
||||
|
||||
async def _handle_voice_channel_join(self, event: MessageEvent) -> str:
|
||||
|
|
@ -2211,6 +2223,7 @@ class GatewayRunner:
|
|||
adapter._voice_text_channels[guild_id] = int(event.source.chat_id)
|
||||
self._voice_mode[event.source.chat_id] = "all"
|
||||
self._save_voice_modes()
|
||||
adapter._auto_tts_disabled_chats.discard(event.source.chat_id)
|
||||
return (
|
||||
f"Joined voice channel **{voice_channel.name}**.\n"
|
||||
f"I'll speak my replies and listen to you. Use /voice leave to disconnect."
|
||||
|
|
@ -2265,21 +2278,28 @@ class GatewayRunner:
|
|||
if not text_ch_id:
|
||||
return
|
||||
|
||||
# Show transcript in text channel
|
||||
try:
|
||||
channel = adapter._client.get_channel(text_ch_id)
|
||||
if channel:
|
||||
await channel.send(f"**[Voice]** <@{user_id}>: {transcript}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Build a synthetic MessageEvent and feed through the normal pipeline
|
||||
# Check authorization before processing voice input
|
||||
source = SessionSource(
|
||||
platform=Platform.DISCORD,
|
||||
chat_id=str(text_ch_id),
|
||||
user_id=str(user_id),
|
||||
user_name=str(user_id),
|
||||
chat_type="channel",
|
||||
)
|
||||
if not self._is_user_authorized(source):
|
||||
logger.debug("Unauthorized voice input from user %d, ignoring", user_id)
|
||||
return
|
||||
|
||||
# Show transcript in text channel (after auth, with mention sanitization)
|
||||
try:
|
||||
channel = adapter._client.get_channel(text_ch_id)
|
||||
if channel:
|
||||
safe_text = transcript[:2000].replace("@everyone", "@\u200beveryone").replace("@here", "@\u200bhere")
|
||||
await channel.send(f"**[Voice]** <@{user_id}>: {safe_text}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Build a synthetic MessageEvent and feed through the normal pipeline
|
||||
# Use SimpleNamespace as raw_message so _get_guild_id() can extract
|
||||
# guild_id and _send_voice_reply() plays audio in the voice channel.
|
||||
from types import SimpleNamespace
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue