Add messaging platform enhancements: STT, stickers, Discord UX, Slack, pairing, hooks

Major feature additions inspired by OpenClaw/ClawdBot integration analysis: Voice Message Transcription (STT): - Auto-transcribe voice/audio messages via OpenAI Whisper API - Download voice to ~/.hermes/audio_cache/ on Telegram/Discord/WhatsApp - Inject transcript as text so all models can understand voice input - Configurable model (whisper-1, gpt-4o-mini-transcribe, gpt-4o-transcribe) Telegram Sticker Understanding: - Describe static stickers via vision tool with JSON-backed cache - Cache keyed by file_unique_id avoids redundant API calls - Animated/video stickers get emoji-based fallback description Discord Rich UX: - Native slash commands (/ask, /reset, /status, /stop) via app_commands - Button-based exec approvals (Allow Once / Always Allow / Deny) - ExecApprovalView with user authorization and timeout handling Slack Integration: - Full SlackAdapter using slack-bolt with Socket Mode - DMs, channel messages (mention-gated), /hermes slash command - File attachment handling with bot-token-authenticated downloads DM Pairing System: - Code-based user authorization as alternative to static allowlists - 8-char codes from unambiguous alphabet, 1-hour expiry - Rate limiting, lockout after failed attempts, chmod 0600 on data - CLI: hermes pairing list/approve/revoke/clear-pending Event Hook System: - File-based hook discovery from ~/.hermes/hooks/ - HOOK.yaml + handler.py per hook, sync/async handler support - Events: gateway:startup, session:start/reset, agent:start/step/end - Wildcard matching (command:* catches all command events) Cross-Channel Messaging: - send_message agent tool for delivering to any connected platform - Enables cron job delivery and cross-platform notifications Human-Like Response Pacing: - Configurable delays between message chunks (off/natural/custom) - HERMES_HUMAN_DELAY_MODE env var with min/max ms settings Warm Injection Message Style: - Retrofitted image vision messages with friendly kawaii-consistent tone - All new injection messages (STT, stickers, errors) use warm style Also: updated config migration to prompt for optional keys interactively, bumped config version, updated README, AGENTS.md, .env.example, cli-config.yaml.example, install scripts, pyproject.toml, and toolsets.
2026-02-15 21:38:59 -08:00 · 2026-02-15 21:38:59 -08:00 · 69aa35a51c
commit 69aa35a51c
parent 5404a8fcd8
23 changed files with 2080 additions and 32 deletions
--- a/gateway/run.py
+++ b/gateway/run.py
@ -85,6 +85,14 @@ class GatewayRunner:
        # Track pending exec approvals per session
        # Key: session_key, Value: {"command": str, "pattern_key": str}
        self._pending_approvals: Dict[str, Dict[str, str]] = {}
+        
+        # DM pairing store for code-based user authorization
+        from gateway.pairing import PairingStore
+        self.pairing_store = PairingStore()
+        
+        # Event hook system
+        from gateway.hooks import HookRegistry
+        self.hooks = HookRegistry()
    
    async def start(self) -> bool:
        """
@ -95,6 +103,9 @@ class GatewayRunner:
        print("[gateway] Starting Hermes Gateway...")
        print(f"[gateway] Session storage: {self.config.sessions_dir}")
        
+        # Discover and load event hooks
+        self.hooks.discover_and_load()
+        
        connected_count = 0
        
        # Initialize and connect each configured platform
@ -131,6 +142,15 @@ class GatewayRunner:
        self.delivery_router.adapters = self.adapters
        
        self._running = True
+        
+        # Emit gateway:startup hook
+        hook_count = len(self.hooks.loaded_hooks)
+        if hook_count:
+            print(f"[gateway] {hook_count} hook(s) loaded")
+        await self.hooks.emit("gateway:startup", {
+            "platforms": [p.value for p in self.adapters.keys()],
+        })
+        
        print(f"[gateway] Gateway running with {connected_count} platform(s)")
        print("[gateway] Press Ctrl+C to stop")
        
@ -183,18 +203,23 @@ class GatewayRunner:
                return None
            return WhatsAppAdapter(config)
        
+        elif platform == Platform.SLACK:
+            from gateway.platforms.slack import SlackAdapter, check_slack_requirements
+            if not check_slack_requirements():
+                print(f"[gateway] Slack: slack-bolt not installed. Run: pip install 'hermes-agent[slack]'")
+                return None
+            return SlackAdapter(config)
+        
        return None
    
    def _is_user_authorized(self, source: SessionSource) -> bool:
        """
        Check if a user is authorized to use the bot.
        
-        Authorization is checked via environment variables:
-        - GATEWAY_ALLOWED_USERS: Comma-separated list of user IDs (all platforms)
-        - TELEGRAM_ALLOWED_USERS: Telegram-specific user IDs
-        - DISCORD_ALLOWED_USERS: Discord-specific user IDs
-        
-        If no allowlist is configured, all users are allowed (open access).
+        Checks in order:
+        1. Environment variable allowlists (TELEGRAM_ALLOWED_USERS, etc.)
+        2. DM pairing approved list
+        3. If no allowlists AND no pairing approvals exist, allow all (open access)
        """
        user_id = source.user_id
        if not user_id:
@ -205,12 +230,18 @@ class GatewayRunner:
            Platform.TELEGRAM: "TELEGRAM_ALLOWED_USERS",
            Platform.DISCORD: "DISCORD_ALLOWED_USERS",
            Platform.WHATSAPP: "WHATSAPP_ALLOWED_USERS",
+            Platform.SLACK: "SLACK_ALLOWED_USERS",
        }
        
        platform_allowlist = os.getenv(platform_env_map.get(source.platform, ""))
        global_allowlist = os.getenv("GATEWAY_ALLOWED_USERS", "")
        
-        # If no allowlists configured, allow all (backward compatible)
+        # Check pairing store (always checked, regardless of allowlists)
+        platform_name = source.platform.value if source.platform else ""
+        if self.pairing_store.is_approved(platform_name, user_id):
+            return True
+        
+        # If no allowlists configured and no pairing approvals, allow all (backward compatible)
        if not platform_allowlist and not global_allowlist:
            return True
        
@ -241,7 +272,31 @@ class GatewayRunner:
        # Check if user is authorized
        if not self._is_user_authorized(source):
            print(f"[gateway] Unauthorized user: {source.user_id} ({source.user_name}) on {source.platform.value}")
-            return None  # Silently ignore unauthorized users
+            # In DMs: offer pairing code. In groups: silently ignore.
+            if source.chat_type == "dm":
+                platform_name = source.platform.value if source.platform else "unknown"
+                code = self.pairing_store.generate_code(
+                    platform_name, source.user_id, source.user_name or ""
+                )
+                if code:
+                    adapter = self.adapters.get(source.platform)
+                    if adapter:
+                        await adapter.send(
+                            source.chat_id,
+                            f"Hi~ I don't recognize you yet!\n\n"
+                            f"Here's your pairing code: `{code}`\n\n"
+                            f"Ask the bot owner to run:\n"
+                            f"`hermes pairing approve {platform_name} {code}`"
+                        )
+                else:
+                    adapter = self.adapters.get(source.platform)
+                    if adapter:
+                        await adapter.send(
+                            source.chat_id,
+                            "Too many pairing requests right now~ "
+                            "Please try again later!"
+                        )
+            return None
        
        # Check for commands
        command = event.get_command()
@ -327,7 +382,34 @@ class GatewayRunner:
                    message_text, image_paths
                )
        
+        # -----------------------------------------------------------------
+        # Auto-transcribe voice/audio messages sent by the user
+        # -----------------------------------------------------------------
+        if event.media_urls:
+            audio_paths = []
+            for i, path in enumerate(event.media_urls):
+                mtype = event.media_types[i] if i < len(event.media_types) else ""
+                is_audio = (
+                    mtype.startswith("audio/")
+                    or event.message_type in (MessageType.VOICE, MessageType.AUDIO)
+                )
+                if is_audio:
+                    audio_paths.append(path)
+            if audio_paths:
+                message_text = await self._enrich_message_with_transcription(
+                    message_text, audio_paths
+                )
+        
        try:
+            # Emit agent:start hook
+            hook_ctx = {
+                "platform": source.platform.value if source.platform else "",
+                "user_id": source.user_id,
+                "session_id": session_entry.session_id,
+                "message": message_text[:500],
+            }
+            await self.hooks.emit("agent:start", hook_ctx)
+            
            # Run the agent
            response = await self._run_agent(
                message=message_text,
@ -338,6 +420,12 @@ class GatewayRunner:
                session_key=session_key
            )
            
+            # Emit agent:end hook
+            await self.hooks.emit("agent:end", {
+                **hook_ctx,
+                "response": (response or "")[:500],
+            })
+            
            # Check if the agent encountered a dangerous command needing approval
            # The terminal tool stores the last pending approval globally
            try:
@ -382,6 +470,13 @@ class GatewayRunner:
        # Reset the session
        new_entry = self.session_store.reset_session(session_key)
        
+        # Emit session:reset hook
+        await self.hooks.emit("session:reset", {
+            "platform": source.platform.value if source.platform else "",
+            "user_id": source.user_id,
+            "session_key": session_key,
+        })
+        
        if new_entry:
            return "✨ Session reset! I've started fresh with no memory of our previous conversation."
        else:
@ -482,23 +577,22 @@ class GatewayRunner:
                if result.get("success"):
                    description = result.get("analysis", "")
                    enriched_parts.append(
-                        f"[User sent an image. Vision analysis:\n{description}]\n"
-                        f"[To examine this image further, use vision_analyze with "
-                        f"image_url: {path}]"
+                        f"[The user sent an image~ Here's what I can see:\n{description}]\n"
+                        f"[If you need a closer look, use vision_analyze with "
+                        f"image_url: {path} ~]"
                    )
                else:
-                    # Analysis failed -- still tell the model the image exists
                    enriched_parts.append(
-                        f"[User sent an image but automatic analysis failed. "
-                        f"You can try analyzing it with vision_analyze using "
-                        f"image_url: {path}]"
+                        "[The user sent an image but I couldn't quite see it "
+                        "this time (>_<) You can try looking at it yourself "
+                        f"with vision_analyze using image_url: {path}]"
                    )
            except Exception as e:
                print(f"[gateway] Vision auto-analysis error: {e}", flush=True)
                enriched_parts.append(
-                    f"[User sent an image but automatic analysis encountered an error. "
-                    f"You can try analyzing it with vision_analyze using "
-                    f"image_url: {path}]"
+                    f"[The user sent an image but something went wrong when I "
+                    f"tried to look at it~ You can try examining it yourself "
+                    f"with vision_analyze using image_url: {path}]"
                )

        # Combine: vision descriptions first, then the user's original text
@ -509,6 +603,63 @@ class GatewayRunner:
            return prefix
        return user_text

+    async def _enrich_message_with_transcription(
+        self,
+        user_text: str,
+        audio_paths: List[str],
+    ) -> str:
+        """
+        Auto-transcribe user voice/audio messages using OpenAI Whisper API
+        and prepend the transcript to the message text.
+
+        Args:
+            user_text:   The user's original caption / message text.
+            audio_paths: List of local file paths to cached audio files.
+
+        Returns:
+            The enriched message string with transcriptions prepended.
+        """
+        from tools.transcription_tools import transcribe_audio
+        import asyncio
+
+        enriched_parts = []
+        for path in audio_paths:
+            try:
+                print(f"[gateway] Transcribing user voice: {path}", flush=True)
+                result = await asyncio.to_thread(transcribe_audio, path)
+                if result["success"]:
+                    transcript = result["transcript"]
+                    enriched_parts.append(
+                        f'[The user sent a voice message~ '
+                        f'Here\'s what they said: "{transcript}"]'
+                    )
+                else:
+                    error = result.get("error", "unknown error")
+                    if "OPENAI_API_KEY" in error:
+                        enriched_parts.append(
+                            "[The user sent a voice message but I can't listen "
+                            "to it right now~ OPENAI_API_KEY isn't set up yet "
+                            "(';w;') Let them know!]"
+                        )
+                    else:
+                        enriched_parts.append(
+                            "[The user sent a voice message but I had trouble "
+                            f"transcribing it~ ({error})]"
+                        )
+            except Exception as e:
+                print(f"[gateway] Transcription error: {e}", flush=True)
+                enriched_parts.append(
+                    "[The user sent a voice message but something went wrong "
+                    "when I tried to listen to it~ Let them know!]"
+                )
+
+        if enriched_parts:
+            prefix = "\n\n".join(enriched_parts)
+            if user_text:
+                return f"{prefix}\n\n{user_text}"
+            return prefix
+        return user_text
+
    async def _run_agent(
        self,
        message: str,