Add Text-to-Speech (TTS) functionality with multiple providers

Add tool previews. Add AGENTS and SOUL.md support. Add Exec Approval.
2026-02-12 10:05:08 -08:00 · 2026-02-12 10:05:08 -08:00 · f5be6177b2
commit f5be6177b2
parent 89c6f24d48
18 changed files with 1200 additions and 21 deletions
--- a/gateway/platforms/base.py
+++ b/gateway/platforms/base.py
@ -240,6 +240,61 @@ class BasePlatformAdapter(ABC):
        
        return images, cleaned
    
+    async def send_voice(
+        self,
+        chat_id: str,
+        audio_path: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """
+        Deliver an audio file to a chat; this base version is a text fallback.
+        
+        Adapters with native audio support override this method. Here the
+        file path is posted as an ordinary text message through self.send(),
+        with the caption (when non-empty) on the line above it.
+        """
+        fallback = f"🔊 Audio: {audio_path}"
+        # An empty or None caption adds nothing to the message.
+        content = f"{caption}\n{fallback}" if caption else fallback
+        return await self.send(chat_id=chat_id, content=content, reply_to=reply_to)
+    
+    @staticmethod
+    def extract_media(content: str) -> Tuple[List[Tuple[str, bool]], str]:
+        """
+        Extract MEDIA:<path> tags and [[audio_as_voice]] directives from response text.
+        
+        The TTS tool returns responses like:
+            [[audio_as_voice]]
+            MEDIA:/path/to/audio.ogg
+        
+        A path runs to the next whitespace, so it cannot contain spaces. Quotes,
+        backticks and sentence punctuation adjoining the path are dropped. The
+        voice flag applies to every path when the directive appears anywhere.
+        
+        Args:
+            content: The response text to scan.
+        
+        Returns:
+            Tuple of (list of (path, is_voice) pairs, cleaned content with tags
+            removed). Content without any tag or directive is returned unchanged.
+        """
+        voice_tag = "[[audio_as_voice]]"
+        media_pattern = r'MEDIA:(\S+)'
+        has_voice_tag = voice_tag in content
+        
+        media = []
+        for match in re.finditer(media_pattern, content):
+            path = match.group(1).lstrip("`\"'").rstrip("`\"'.,;")
+            if path:
+                media.append((path, has_voice_tag))
+        if not (media or has_voice_tag):
+            return media, content
+        
+        # Tidy whitespace whenever something was removed, directive-only case included.
+        cleaned = re.sub(media_pattern, '', content.replace(voice_tag, ""))
+        return media, re.sub(r'\n{3,}', '\n\n', cleaned).strip()
+    
    async def _keep_typing(self, chat_id: str, interval: float = 2.0) -> None:
        """
        Continuously send typing indicator until cancelled.
@ -294,10 +349,13 @@ class BasePlatformAdapter(ABC):
            
            # Send response if any
            if response:
+                # Extract MEDIA:<path> tags (from TTS tool) before other processing
+                media_files, response = self.extract_media(response)
+                
                # Extract image URLs and send them as native platform attachments
                images, text_content = self.extract_images(response)
                
-                # Send the text portion first (if any remains after extracting images)
+                # Send the text portion first (if any remains after extractions)
                if text_content:
                    result = await self.send(
                        chat_id=event.source.chat_id,
@ -329,6 +387,18 @@ class BasePlatformAdapter(ABC):
                            print(f"[{self.name}] Failed to send image: {img_result.error}")
                    except Exception as img_err:
                        print(f"[{self.name}] Error sending image: {img_err}")
+                
+                # Send extracted audio/voice files as native attachments
+                for audio_path, is_voice in media_files:
+                    try:
+                        voice_result = await self.send_voice(
+                            chat_id=event.source.chat_id,
+                            audio_path=audio_path,
+                        )
+                        if not voice_result.success:
+                            print(f"[{self.name}] Failed to send voice: {voice_result.error}")
+                    except Exception as voice_err:
+                        print(f"[{self.name}] Error sending voice: {voice_err}")
            
            # Check if there's a pending message that was queued during our processing
            if session_key in self._pending_messages:
--- a/gateway/platforms/discord.py
+++ b/gateway/platforms/discord.py
@ -174,6 +174,44 @@ class DiscordAdapter(BasePlatformAdapter):
        except Exception as e:
            return SendResult(success=False, error=str(e))
    
+    async def send_voice(
+        self,
+        chat_id: str,
+        audio_path: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """
+        Send audio as a Discord file attachment.
+        
+        Returns a failed SendResult when the client is not connected, the
+        channel cannot be resolved, or audio_path is not an existing file.
+        Any exception while sending is printed and the base-class text
+        fallback is used instead; reply_to is only passed to that fallback.
+        """
+        if not self._client:
+            return SendResult(success=False, error="Not connected")
+        
+        try:
+            import os  # local import keeps this method self-contained
+            # Try the client's channel cache first; fetch from the API on a miss.
+            channel = self._client.get_channel(int(chat_id))
+            if not channel:
+                channel = await self._client.fetch_channel(int(chat_id))
+            if not channel:
+                return SendResult(success=False, error=f"Channel {chat_id} not found")
+            
+            if not os.path.isfile(audio_path):
+                return SendResult(success=False, error=f"Audio file not found: {audio_path}")
+            
+            # discord.File opens the path itself, so no in-memory copy is needed.
+            file = discord.File(audio_path, filename=os.path.basename(audio_path))
+            msg = await channel.send(content=caption or None, file=file)
+            return SendResult(success=True, message_id=str(msg.id))
+        except Exception as e:
+            print(f"[{self.name}] Failed to send audio: {e}")
+            return await super().send_voice(chat_id, audio_path, caption, reply_to)
+    
    async def send_image(
        self,
        chat_id: str,
--- a/gateway/platforms/telegram.py
+++ b/gateway/platforms/telegram.py
@ -174,6 +174,44 @@ class TelegramAdapter(BasePlatformAdapter):
        except Exception as e:
            return SendResult(success=False, error=str(e))
    
+    async def send_voice(
+        self,
+        chat_id: str,
+        audio_path: str,
+        caption: Optional[str] = None,
+        reply_to: Optional[str] = None,
+    ) -> SendResult:
+        """
+        Send audio as a native Telegram voice message or audio file.
+        
+        Files ending in .ogg/.opus (any letter case) are sent with send_voice,
+        everything else with send_audio; captions are cut to 1024 characters.
+        A missing file or disconnected bot yields a failed SendResult; any
+        exception while sending is printed and the base text fallback is used.
+        """
+        if not self._bot:
+            return SendResult(success=False, error="Not connected")
+        
+        try:
+            import os
+            if not os.path.exists(audio_path):
+                return SendResult(success=False, error=f"Audio file not found: {audio_path}")
+            # Keyword arguments shared by both Bot calls.
+            common = {
+                "chat_id": int(chat_id),
+                "caption": caption[:1024] if caption else None,
+                "reply_to_message_id": int(reply_to) if reply_to else None,
+            }
+            with open(audio_path, "rb") as audio_file:
+                if audio_path.lower().endswith((".ogg", ".opus")):
+                    msg = await self._bot.send_voice(voice=audio_file, **common)
+                else:
+                    msg = await self._bot.send_audio(audio=audio_file, **common)
+            return SendResult(success=True, message_id=str(msg.message_id))
+        except Exception as e:
+            print(f"[{self.name}] Failed to send voice/audio: {e}")
+            return await super().send_voice(chat_id, audio_path, caption, reply_to)
+    
    async def send_image(
        self,
        chat_id: str,
--- a/gateway/run.py
+++ b/gateway/run.py
@ -35,6 +35,9 @@ load_dotenv()
 # Gateway runs in quiet mode - suppress debug output and use cwd directly (no temp dirs)
 os.environ["HERMES_QUIET"] = "1"

+# Enable interactive exec approval for dangerous commands on messaging platforms
+os.environ["HERMES_EXEC_ASK"] = "1"
+
 # Set terminal working directory for messaging platforms
 # Uses MESSAGING_CWD if set, otherwise defaults to home directory
 # This is separate from CLI which uses the directory where `hermes` is run
@ -77,6 +80,10 @@ class GatewayRunner:
        # Key: session_key, Value: AIAgent instance
        self._running_agents: Dict[str, Any] = {}
        self._pending_messages: Dict[str, str] = {}  # Queued messages during interrupt
+        
+        # Track pending exec approvals per session
+        # Key: session_key, Value: {"command": str, "pattern_key": str}
+        self._pending_approvals: Dict[str, Dict[str, str]] = {}
    
    async def start(self) -> bool:
        """
@ -246,6 +253,25 @@ class GatewayRunner:
        if command == "stop":
            return await self._handle_stop_command(event)
        
+        # Check for pending exec approval responses
+        session_key_preview = f"agent:main:{source.platform.value}:{source.chat_type}:{source.chat_id}" if source.chat_type != "dm" else f"agent:main:{source.platform.value}:dm"
+        if session_key_preview in self._pending_approvals:
+            user_text = event.text.strip().lower()
+            if user_text in ("yes", "y", "approve", "ok", "go", "do it"):
+                approval = self._pending_approvals.pop(session_key_preview)
+                cmd = approval["command"]
+                pattern_key = approval.get("pattern_key", "")
+                print(f"[gateway] ✅ User approved dangerous command: {cmd[:60]}...")
+                # Approve for session and re-run via terminal_tool with force=True
+                from tools.terminal_tool import terminal_tool, _session_approved_patterns
+                _session_approved_patterns.add(pattern_key)
+                result = terminal_tool(command=cmd, force=True)
+                return f"✅ Command approved and executed.\n\n```\n{result[:3500]}\n```"
+            elif user_text in ("no", "n", "deny", "cancel", "nope"):
+                self._pending_approvals.pop(session_key_preview)
+                return "❌ Command denied."
+            # If it's not clearly an approval/denial, fall through to normal processing
+        
        # Get or create session
        session_entry = self.session_store.get_or_create_session(source)
        session_key = session_entry.session_key
@ -282,6 +308,17 @@ class GatewayRunner:
                session_key=session_key
            )
            
+            # Check if the agent encountered a dangerous command needing approval
+            # The terminal tool stores the last pending approval globally
+            try:
+                from tools.terminal_tool import _last_pending_approval
+                if _last_pending_approval:
+                    self._pending_approvals[session_key] = _last_pending_approval.copy()
+                    # Clear the global so it doesn't leak to other sessions
+                    _last_pending_approval.clear()
+            except Exception:
+                pass
+            
            # Append to transcript
            self.session_store.append_to_transcript(
                session_entry.session_id,
@ -418,23 +455,35 @@ class GatewayRunner:
                return
            last_tool[0] = tool_name
            
-            # Build progress message
+            # Build progress message with primary argument preview
            tool_emojis = {
                "terminal": "💻",
                "web_search": "🔍",
                "web_extract": "📄",
                "read_file": "📖",
                "write_file": "✍️",
+                "patch": "🔧",
+                "search": "🔎",
                "list_directory": "📂",
                "image_generate": "🎨",
+                "text_to_speech": "🔊",
                "browser_navigate": "🌐",
                "browser_click": "👆",
+                "browser_type": "⌨️",
+                "browser_snapshot": "📸",
                "moa_query": "🧠",
+                "mixture_of_agents": "🧠",
+                "vision_analyze": "👁️",
+                "skill_view": "📚",
+                "skills_list": "📋",
            }
            emoji = tool_emojis.get(tool_name, "⚙️")
            
-            if tool_name == "terminal" and preview:
-                msg = f"{emoji} `{preview}`..."
+            if preview:
+                # Truncate preview to keep messages clean
+                if len(preview) > 40:
+                    preview = preview[:37] + "..."
+                msg = f"{emoji} {tool_name}... \"{preview}\""
            else:
                msg = f"{emoji} {tool_name}..."