feat: add voice channel awareness — inject participant and speaking state into agent context

2026-03-14 02:14:34 +03:00 · 2026-03-14 02:14:34 +03:00 · 1ad5e0ed15
commit 1ad5e0ed15
parent 49f3f0fc62
3 changed files with 230 additions and 6 deletions
--- a/gateway/platforms/discord.py
+++ b/gateway/platforms/discord.py
@ -441,6 +441,7 @@ class DiscordAdapter(BasePlatformAdapter):
            intents.dm_messages = True
            intents.guild_messages = True
            intents.members = True
+            intents.voice_states = True
            
            # Create bot
            self._client = commands.Bot(
@ -494,7 +495,40 @@ class DiscordAdapter(BasePlatformAdapter):
                    # "all" falls through to handle_message
                
                await self._handle_message(message)
-            
+
+            @self._client.event
+            async def on_voice_state_update(member, before, after):
+                """Log voice channel joins, leaves and moves of other members.
+
+                Nothing is logged for guilds without an entry in the adapter's
+                voice-client map, for the bot's own member, or when the channel
+                did not change.
+                """
+                tracked = adapter_self._voice_clients
+                if not tracked:
+                    return
+                guild_id = member.guild.id
+                if guild_id not in tracked:
+                    return
+                # The bot's own voice state changes are not logged.
+                if member == adapter_self._client.user:
+                    return
+
+                src, dst = before.channel, after.channel
+                if src is None and dst is not None:
+                    action = "joined " + dst.name
+                elif src is not None and dst is None:
+                    action = "left " + src.name
+                elif src is not None and dst is not None and src != dst:
+                    action = f"moved {src.name} -> {dst.name}"
+                else:
+                    return  # no channel change, nothing to report
+
+                logger.info(
+                    "Voice state: %s (%d) %s (guild %d)",
+                    member.display_name, member.id, action, guild_id,
+                )
+
            # Register slash commands
            self._register_slash_commands()
            
@ -864,6 +898,75 @@ class DiscordAdapter(BasePlatformAdapter):
        vc = self._voice_clients.get(guild_id)
        return vc is not None and vc.is_connected()

+    def get_voice_channel_info(self, guild_id: int) -> Optional[Dict[str, Any]]:
+        """Return voice channel awareness info for the given guild.
+
+        Returns None if the bot is not in a voice channel.  Otherwise
+        returns a dict with ``channel_name``, ``member_count``,
+        ``members`` and ``speaking_count``.  Each member entry holds
+        ``user_id``, ``display_name``, ``is_bot`` and ``is_speaking``;
+        the bot's own member is left out of the list and the counts.
+        """
+        vc = self._voice_clients.get(guild_id)
+        if not vc or not vc.is_connected():
+            return None
+
+        channel = vc.channel
+        if not channel:
+            return None
+
+        # Users whose SSRC delivered audio within the last 2 seconds count
+        # as speaking.  The receiver's maps are only read under its lock.
+        speaking_user_ids: set = set()
+        receiver = self._voice_receivers.get(guild_id)
+        if receiver:
+            import time as _time
+            now = _time.monotonic()
+            with receiver._lock:
+                for ssrc, last_t in receiver._last_packet_time.items():
+                    uid = receiver._ssrc_to_user.get(ssrc)
+                    if uid and now - last_t < 2.0:
+                        speaking_user_ids.add(uid)
+
+        # channel.members includes the bot itself; skip it.
+        bot_user = self._client.user if self._client else None
+        members_info = [
+            {
+                "user_id": m.id,
+                "display_name": m.display_name,
+                "is_bot": m.bot,
+                "is_speaking": m.id in speaking_user_ids,
+            }
+            for m in channel.members
+            if not (bot_user and m.id == bot_user.id)
+        ]
+
+        return {
+            "channel_name": channel.name,
+            "member_count": len(members_info),
+            "members": members_info,
+            # Count listed members only so this always agrees with the
+            # ``is_speaking`` flags (a recent SSRC may map to a departed user).
+            "speaking_count": sum(m["is_speaking"] for m in members_info),
+        }
+
+    def get_voice_channel_context(self, guild_id: int) -> str:
+        """Render the guild's voice channel state as prompt-ready text.
+
+        Returns "" when get_voice_channel_info() yields nothing; otherwise a
+        header line followed by one line per participant, speakers marked.
+        """
+        snapshot = self.get_voice_channel_info(guild_id)
+        if not snapshot:
+            return ""
+
+        lines = [f"[Voice channel: #{snapshot['channel_name']} — {snapshot['member_count']} participant(s)]"]
+        lines += [
+            f"  - {entry['display_name']}{' (speaking)' if entry['is_speaking'] else ''}"
+            for entry in snapshot["members"]
+        ]
+        return "\n".join(lines)
+
    # ------------------------------------------------------------------
    # Voice listening (Phase 2)
    # ------------------------------------------------------------------
--- a/gateway/run.py
+++ b/gateway/run.py
@ -1408,6 +1408,19 @@ class GatewayRunner:
                        f"or ignore to skip."
                    )
        
+        # -----------------------------------------------------------------
+        # Voice channel awareness — inject current voice channel state
+        # into context so the agent knows who is in the channel and who
+        # is speaking, without needing a separate tool call.
+        # -----------------------------------------------------------------
+        if source.platform == Platform.DISCORD:
+            adapter = self.adapters.get(Platform.DISCORD)
+            guild_id = self._get_guild_id(event)
+            if guild_id and adapter and hasattr(adapter, "get_voice_channel_context"):
+                vc_context = adapter.get_voice_channel_context(guild_id)
+                if vc_context:
+                    context_prompt += f"\n\n{vc_context}"
+
        # -----------------------------------------------------------------
        # Auto-analyze images sent by the user
        #
@ -2156,11 +2169,18 @@ class GatewayRunner:
            # Append voice channel info if connected
            adapter = self.adapters.get(event.source.platform)
            guild_id = self._get_guild_id(event)
-            if guild_id and hasattr(adapter, "is_in_voice_channel"):
-                if adapter.is_in_voice_channel(guild_id):
-                    vc = adapter._voice_clients.get(guild_id)
-                    ch_name = vc.channel.name if vc and vc.channel else "unknown"
-                    return f"Voice mode: {labels.get(mode, mode)}\nVoice channel: {ch_name}"
+            if guild_id and hasattr(adapter, "get_voice_channel_info"):
+                info = adapter.get_voice_channel_info(guild_id)
+                if info:
+                    lines = [
+                        f"Voice mode: {labels.get(mode, mode)}",
+                        f"Voice channel: #{info['channel_name']}",
+                        f"Participants: {info['member_count']}",
+                    ]
+                    for m in info["members"]:
+                        status = " (speaking)" if m.get("is_speaking") else ""
+                        lines.append(f"  - {m['display_name']}{status}")
+                    return "\n".join(lines)
            return f"Voice mode: {labels.get(mode, mode)}"
        else:
            # Toggle: off → on, on/all → off
--- a/tests/gateway/test_voice_command.py
+++ b/tests/gateway/test_voice_command.py
@ -1852,3 +1852,104 @@ class TestAutoTtsTempFileCleanup:
        assert finally_idx > 0, "play_tts must be in a try/finally block"
        assert remove_idx > 0, "finally block must call os.remove on _tts_path"
        assert remove_idx > finally_idx, "os.remove must be inside the finally block"
+
+
+# =====================================================================
+# Voice channel awareness (get_voice_channel_info / context)
+# =====================================================================
+
+
+class TestVoiceChannelAwareness:
+    """Tests for get_voice_channel_info() and get_voice_channel_context()."""
+
+    def _make_adapter(self):
+        """Build a DiscordAdapter with only the voice-related state set."""
+        from gateway.platforms.discord import DiscordAdapter
+        # object.__new__ bypasses __init__, so every attribute is set by hand.
+        adapter = object.__new__(DiscordAdapter)
+        adapter._voice_clients = {}
+        adapter._voice_text_channels = {}
+        adapter._voice_receivers = {}
+        adapter._client = MagicMock()
+        adapter._client.user = SimpleNamespace(id=99999, name="HermesBot")
+        return adapter
+
+    def _make_member(self, user_id, display_name, is_bot=False):
+        """Return a stand-in exposing the member attributes the adapter reads."""
+        return SimpleNamespace(
+            id=user_id, display_name=display_name, bot=is_bot,
+        )
+
+    def _connect(self, adapter, channel_name, members, guild_id=111):
+        """Register a connected mock voice client on the adapter and return it."""
+        vc = MagicMock()
+        vc.is_connected.return_value = True
+        vc.channel.name = channel_name
+        vc.channel.members = members
+        adapter._voice_clients[guild_id] = vc
+        return vc
+
+    def test_returns_none_when_not_connected(self):
+        adapter = self._make_adapter()
+        assert adapter.get_voice_channel_info(111) is None
+
+    def test_returns_none_when_vc_disconnected(self):
+        adapter = self._make_adapter()
+        vc = self._connect(adapter, "voice", [])
+        vc.is_connected.return_value = False
+        assert adapter.get_voice_channel_info(111) is None
+
+    def test_returns_none_when_channel_missing(self):
+        adapter = self._make_adapter()
+        vc = self._connect(adapter, "voice", [])
+        vc.channel = None
+        assert adapter.get_voice_channel_info(111) is None
+
+    def test_returns_info_with_members(self):
+        adapter = self._make_adapter()
+        members = [
+            self._make_member(99999, "HermesBot", is_bot=True),
+            self._make_member(1001, "Alice"),
+            self._make_member(1002, "Bob"),
+        ]
+        self._connect(adapter, "general-voice", members)
+
+        info = adapter.get_voice_channel_info(111)
+        assert info is not None
+        assert info["channel_name"] == "general-voice"
+        assert info["member_count"] == 2  # bot excluded
+        names = [m["display_name"] for m in info["members"]]
+        assert names == ["Alice", "Bob"]
+
+    def test_speaking_detection(self):
+        adapter = self._make_adapter()
+        members = [self._make_member(1001, "Alice"), self._make_member(1002, "Bob")]
+        self._connect(adapter, "voice", members)
+
+        # Alice's SSRC has a fresh packet; Bob's last packet is 10 s old.
+        import time as _time
+        now = _time.monotonic()
+        receiver = MagicMock()
+        receiver._lock = threading.Lock()
+        receiver._last_packet_time = {100: now, 200: now - 10.0}
+        receiver._ssrc_to_user = {100: 1001, 200: 1002}
+        adapter._voice_receivers[111] = receiver
+
+        info = adapter.get_voice_channel_info(111)
+        speaking = {m["display_name"]: m["is_speaking"] for m in info["members"]}
+        assert speaking == {"Alice": True, "Bob": False}
+        assert info["speaking_count"] == 1
+
+    def test_context_string_format(self):
+        adapter = self._make_adapter()
+        self._connect(adapter, "chat-room", [self._make_member(1001, "Alice")])
+
+        ctx = adapter.get_voice_channel_context(111)
+        assert "#chat-room" in ctx
+        assert "1 participant" in ctx
+        assert "Alice" in ctx
+        assert "(speaking)" not in ctx  # no receiver registered for the guild
+
+    def test_context_empty_when_not_connected(self):
+        adapter = self._make_adapter()
+        assert adapter.get_voice_channel_context(111) == ""