Add Text-to-Speech (TTS) functionality with multiple providers

Add tool previews Add AGENTS and SOUL.md support Add Exec Approval
2026-02-12 10:05:08 -08:00 · 2026-02-12 10:05:08 -08:00 · f5be6177b2
commit f5be6177b2
parent 89c6f24d48
18 changed files with 1200 additions and 21 deletions
--- a/gateway/platforms/base.py
+++ b/gateway/platforms/base.py
@ -240,6 +240,61 @@ class BasePlatformAdapter(ABC):
        
        return images, cleaned
    
+    async def send_voice(
+        self,
+        chat_id: str,
+        audio_path: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """
+        Send an audio file as a native voice message via the platform API.
+        
+        Override in subclasses to send audio as voice bubbles (Telegram)
+        or file attachments (Discord). Default falls back to sending the
+        file path as text.
+        """
+        text = f"🔊 Audio: {audio_path}"
+        if caption:
+            text = f"{caption}\n{text}"
+        return await self.send(chat_id=chat_id, content=text, reply_to=reply_to)
+    
+    @staticmethod
+    def extract_media(content: str) -> Tuple[List[Tuple[str, bool]], str]:
+        """
+        Extract MEDIA:<path> tags and [[audio_as_voice]] directives from response text.
+        
+        The TTS tool returns responses like:
+            [[audio_as_voice]]
+            MEDIA:/path/to/audio.ogg
+        
+        Args:
+            content: The response text to scan.
+        
+        Returns:
+            Tuple of (list of (path, is_voice) pairs, cleaned content with tags removed).
+        """
+        media = []
+        cleaned = content
+        
+        # Check for [[audio_as_voice]] directive
+        has_voice_tag = "[[audio_as_voice]]" in content
+        cleaned = cleaned.replace("[[audio_as_voice]]", "")
+        
+        # Extract MEDIA:<path> tags (path may contain spaces)
+        media_pattern = r'MEDIA:(\S+)'
+        for match in re.finditer(media_pattern, content):
+            path = match.group(1).strip()
+            if path:
+                media.append((path, has_voice_tag))
+        
+        # Remove MEDIA tags from content
+        if media:
+            cleaned = re.sub(media_pattern, '', cleaned)
+            cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip()
+        
+        return media, cleaned
+    
    async def _keep_typing(self, chat_id: str, interval: float = 2.0) -> None:
        """
        Continuously send typing indicator until cancelled.
@ -294,10 +349,13 @@ class BasePlatformAdapter(ABC):
            
            # Send response if any
            if response:
+                # Extract MEDIA:<path> tags (from TTS tool) before other processing
+                media_files, response = self.extract_media(response)
+                
                # Extract image URLs and send them as native platform attachments
                images, text_content = self.extract_images(response)
                
-                # Send the text portion first (if any remains after extracting images)
+                # Send the text portion first (if any remains after extractions)
                if text_content:
                    result = await self.send(
                        chat_id=event.source.chat_id,
@ -329,6 +387,18 @@ class BasePlatformAdapter(ABC):
                            print(f"[{self.name}] Failed to send image: {img_result.error}")
                    except Exception as img_err:
                        print(f"[{self.name}] Error sending image: {img_err}")
+                
+                # Send extracted audio/voice files as native attachments
+                for audio_path, is_voice in media_files:
+                    try:
+                        voice_result = await self.send_voice(
+                            chat_id=event.source.chat_id,
+                            audio_path=audio_path,
+                        )
+                        if not voice_result.success:
+                            print(f"[{self.name}] Failed to send voice: {voice_result.error}")
+                    except Exception as voice_err:
+                        print(f"[{self.name}] Error sending voice: {voice_err}")
            
            # Check if there's a pending message that was queued during our processing
            if session_key in self._pending_messages:
--- a/gateway/platforms/discord.py
+++ b/gateway/platforms/discord.py
@ -174,6 +174,44 @@ class DiscordAdapter(BasePlatformAdapter):
        except Exception as e:
            return SendResult(success=False, error=str(e))
    
+    async def send_voice(
+        self,
+        chat_id: str,
+        audio_path: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """Send audio as a Discord file attachment."""
+        if not self._client:
+            return SendResult(success=False, error="Not connected")
+        
+        try:
+            import io
+            
+            channel = self._client.get_channel(int(chat_id))
+            if not channel:
+                channel = await self._client.fetch_channel(int(chat_id))
+            if not channel:
+                return SendResult(success=False, error=f"Channel {chat_id} not found")
+            
+            if not os.path.exists(audio_path):
+                return SendResult(success=False, error=f"Audio file not found: {audio_path}")
+            
+            # Determine filename from path
+            filename = os.path.basename(audio_path)
+            
+            with open(audio_path, "rb") as f:
+                file = discord.File(io.BytesIO(f.read()), filename=filename)
+                msg = await channel.send(
+                    content=caption if caption else None,
+                    file=file,
+                )
+                return SendResult(success=True, message_id=str(msg.id))
+        
+        except Exception as e:
+            print(f"[{self.name}] Failed to send audio: {e}")
+            return await super().send_voice(chat_id, audio_path, caption, reply_to)
+    
    async def send_image(
        self,
        chat_id: str,
--- a/gateway/platforms/telegram.py
+++ b/gateway/platforms/telegram.py
@ -174,6 +174,44 @@ class TelegramAdapter(BasePlatformAdapter):
        except Exception as e:
            return SendResult(success=False, error=str(e))
    
+    async def send_voice(
+        self,
+        chat_id: str,
+        audio_path: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """Send audio as a native Telegram voice message or audio file."""
+        if not self._bot:
+            return SendResult(success=False, error="Not connected")
+        
+        try:
+            import os
+            if not os.path.exists(audio_path):
+                return SendResult(success=False, error=f"Audio file not found: {audio_path}")
+            
+            with open(audio_path, "rb") as audio_file:
+                # .ogg files -> send as voice (round playable bubble)
+                if audio_path.endswith(".ogg") or audio_path.endswith(".opus"):
+                    msg = await self._bot.send_voice(
+                        chat_id=int(chat_id),
+                        voice=audio_file,
+                        caption=caption[:1024] if caption else None,
+                        reply_to_message_id=int(reply_to) if reply_to else None,
+                    )
+                else:
+                    # .mp3 and others -> send as audio file
+                    msg = await self._bot.send_audio(
+                        chat_id=int(chat_id),
+                        audio=audio_file,
+                        caption=caption[:1024] if caption else None,
+                        reply_to_message_id=int(reply_to) if reply_to else None,
+                    )
+            return SendResult(success=True, message_id=str(msg.message_id))
+        except Exception as e:
+            print(f"[{self.name}] Failed to send voice/audio: {e}")
+            return await super().send_voice(chat_id, audio_path, caption, reply_to)
+    
    async def send_image(
        self,
        chat_id: str,