merge: salvage PR #327 voice mode branch

Merge contributor branch feature/voice-mode onto current main for follow-up fixes.
This commit is contained in:
teknium1 2026-03-14 06:03:07 -07:00
commit 523a1b6faf
37 changed files with 9248 additions and 228 deletions

View file

@ -14,13 +14,16 @@ Usage:
"""
import asyncio
import json
import logging
import os
import re
import shlex
import sys
import signal
import tempfile
import threading
import time
from logging.handlers import RotatingFileHandler
from pathlib import Path
from datetime import datetime
@ -281,6 +284,9 @@ class GatewayRunner:
from gateway.hooks import HookRegistry
self.hooks = HookRegistry()
# Per-chat voice reply mode: "off" | "voice_only" | "all"
self._voice_mode: Dict[str, str] = self._load_voice_modes()
def _get_or_create_gateway_honcho(self, session_key: str):
"""Return a persistent Honcho manager/config pair for this gateway session."""
if not hasattr(self, "_honcho_managers"):
@ -336,6 +342,27 @@ class GatewayRunner:
for session_key in list(managers.keys()):
self._shutdown_gateway_honcho(session_key)
# -- Voice mode persistence ------------------------------------------
_VOICE_MODE_PATH = _hermes_home / "gateway_voice_mode.json"
def _load_voice_modes(self) -> Dict[str, str]:
try:
return json.loads(self._VOICE_MODE_PATH.read_text())
except (FileNotFoundError, json.JSONDecodeError, OSError):
return {}
def _save_voice_modes(self) -> None:
try:
self._VOICE_MODE_PATH.parent.mkdir(parents=True, exist_ok=True)
self._VOICE_MODE_PATH.write_text(
json.dumps(self._voice_mode, indent=2)
)
except OSError as e:
logger.warning("Failed to save voice modes: %s", e)
# -----------------------------------------------------------------
def _flush_memories_for_session(self, old_session_id: str):
"""Prompt the agent to save memories/skills before context is lost.
@ -737,7 +764,7 @@ class GatewayRunner:
logger.info("Stopping gateway...")
self._running = False
for platform, adapter in self.adapters.items():
for platform, adapter in list(self.adapters.items()):
try:
await adapter.disconnect()
logger.info("%s disconnected", platform.value)
@ -897,7 +924,7 @@ class GatewayRunner:
7. Return response
"""
source = event.source
# Check if user is authorized
if not self._is_user_authorized(source):
logger.warning("Unauthorized user: %s (%s) on %s", source.user_id, source.user_name, source.platform.value)
@ -949,7 +976,7 @@ class GatewayRunner:
"personality", "retry", "undo", "sethome", "set-home",
"compress", "usage", "insights", "reload-mcp", "reload_mcp",
"update", "title", "resume", "provider", "rollback",
"background", "reasoning"}
"background", "reasoning", "voice"}
if command and command in _known_commands:
await self.hooks.emit(f"command:{command}", {
"platform": source.platform.value if source.platform else "",
@ -1020,7 +1047,10 @@ class GatewayRunner:
if command == "reasoning":
return await self._handle_reasoning_command(event)
if command == "voice":
return await self._handle_voice_command(event)
# User-defined quick commands (bypass agent loop, no LLM call)
if command:
if isinstance(self.config, dict):
@ -1377,6 +1407,19 @@ class GatewayRunner:
f"or ignore to skip."
)
# -----------------------------------------------------------------
# Voice channel awareness — inject current voice channel state
# into context so the agent knows who is in the channel and who
# is speaking, without needing a separate tool call.
# -----------------------------------------------------------------
if source.platform == Platform.DISCORD:
adapter = self.adapters.get(Platform.DISCORD)
guild_id = self._get_guild_id(event)
if guild_id and adapter and hasattr(adapter, "get_voice_channel_context"):
vc_context = adapter.get_voice_channel_context(guild_id)
if vc_context:
context_prompt += f"\n\n{vc_context}"
# -----------------------------------------------------------------
# Auto-analyze images sent by the user
#
@ -1583,7 +1626,11 @@ class GatewayRunner:
session_entry.session_key,
last_prompt_tokens=agent_result.get("last_prompt_tokens", 0),
)
# Auto voice reply: send TTS audio before the text response
if self._should_send_voice_reply(event, response, agent_messages):
await self._send_voice_reply(event, response)
return response
except Exception as e:
@ -1692,6 +1739,7 @@ class GatewayRunner:
"`/reasoning [level|show|hide]` — Set reasoning effort or toggle display",
"`/rollback [number]` — List or restore filesystem checkpoints",
"`/background <prompt>` — Run a prompt in a separate background session",
"`/voice [on|off|tts|status]` — Toggle voice reply mode",
"`/reload-mcp` — Reload MCP servers from config",
"`/update` — Update Hermes Agent to the latest version",
"`/help` — Show this message",
@ -2067,6 +2115,334 @@ class GatewayRunner:
f"Cron jobs and cross-platform messages will be delivered here."
)
@staticmethod
def _get_guild_id(event: MessageEvent) -> Optional[int]:
"""Extract Discord guild_id from the raw message object."""
raw = getattr(event, "raw_message", None)
if raw is None:
return None
# Slash command interaction
if hasattr(raw, "guild_id") and raw.guild_id:
return int(raw.guild_id)
# Regular message
if hasattr(raw, "guild") and raw.guild:
return raw.guild.id
return None
async def _handle_voice_command(self, event: MessageEvent) -> str:
"""Handle /voice [on|off|tts|channel|leave|status] command."""
args = event.get_command_args().strip().lower()
chat_id = event.source.chat_id
adapter = self.adapters.get(event.source.platform)
if args in ("on", "enable"):
self._voice_mode[chat_id] = "voice_only"
self._save_voice_modes()
if adapter:
adapter._auto_tts_disabled_chats.discard(chat_id)
return (
"Voice mode enabled.\n"
"I'll reply with voice when you send voice messages.\n"
"Use /voice tts to get voice replies for all messages."
)
elif args in ("off", "disable"):
self._voice_mode.pop(chat_id, None)
self._save_voice_modes()
if adapter:
adapter._auto_tts_disabled_chats.add(chat_id)
return "Voice mode disabled. Text-only replies."
elif args == "tts":
self._voice_mode[chat_id] = "all"
self._save_voice_modes()
if adapter:
adapter._auto_tts_disabled_chats.discard(chat_id)
return (
"Auto-TTS enabled.\n"
"All replies will include a voice message."
)
elif args in ("channel", "join"):
return await self._handle_voice_channel_join(event)
elif args == "leave":
return await self._handle_voice_channel_leave(event)
elif args == "status":
mode = self._voice_mode.get(chat_id, "off")
labels = {
"off": "Off (text only)",
"voice_only": "On (voice reply to voice messages)",
"all": "TTS (voice reply to all messages)",
}
# Append voice channel info if connected
adapter = self.adapters.get(event.source.platform)
guild_id = self._get_guild_id(event)
if guild_id and hasattr(adapter, "get_voice_channel_info"):
info = adapter.get_voice_channel_info(guild_id)
if info:
lines = [
f"Voice mode: {labels.get(mode, mode)}",
f"Voice channel: #{info['channel_name']}",
f"Participants: {info['member_count']}",
]
for m in info["members"]:
status = " (speaking)" if m.get("is_speaking") else ""
lines.append(f" - {m['display_name']}{status}")
return "\n".join(lines)
return f"Voice mode: {labels.get(mode, mode)}"
else:
# Toggle: off → on, on/all → off
current = self._voice_mode.get(chat_id, "off")
if current == "off":
self._voice_mode[chat_id] = "voice_only"
self._save_voice_modes()
if adapter:
adapter._auto_tts_disabled_chats.discard(chat_id)
return "Voice mode enabled."
else:
self._voice_mode.pop(chat_id, None)
self._save_voice_modes()
if adapter:
adapter._auto_tts_disabled_chats.add(chat_id)
return "Voice mode disabled."
async def _handle_voice_channel_join(self, event: MessageEvent) -> str:
"""Join the user's current Discord voice channel."""
adapter = self.adapters.get(event.source.platform)
if not hasattr(adapter, "join_voice_channel"):
return "Voice channels are not supported on this platform."
guild_id = self._get_guild_id(event)
if not guild_id:
return "This command only works in a Discord server."
voice_channel = await adapter.get_user_voice_channel(
guild_id, event.source.user_id
)
if not voice_channel:
return "You need to be in a voice channel first."
# Wire callbacks BEFORE join so voice input arriving immediately
# after connection is not lost.
if hasattr(adapter, "_voice_input_callback"):
adapter._voice_input_callback = self._handle_voice_channel_input
if hasattr(adapter, "_on_voice_disconnect"):
adapter._on_voice_disconnect = self._handle_voice_timeout_cleanup
try:
success = await adapter.join_voice_channel(voice_channel)
except Exception as e:
logger.warning("Failed to join voice channel: %s", e)
adapter._voice_input_callback = None
return f"Failed to join voice channel: {e}"
if success:
adapter._voice_text_channels[guild_id] = int(event.source.chat_id)
self._voice_mode[event.source.chat_id] = "all"
self._save_voice_modes()
adapter._auto_tts_disabled_chats.discard(event.source.chat_id)
return (
f"Joined voice channel **{voice_channel.name}**.\n"
f"I'll speak my replies and listen to you. Use /voice leave to disconnect."
)
# Join failed — clear callback
adapter._voice_input_callback = None
return "Failed to join voice channel. Check bot permissions (Connect + Speak)."
async def _handle_voice_channel_leave(self, event: MessageEvent) -> str:
"""Leave the Discord voice channel."""
adapter = self.adapters.get(event.source.platform)
guild_id = self._get_guild_id(event)
if not guild_id or not hasattr(adapter, "leave_voice_channel"):
return "Not in a voice channel."
if not hasattr(adapter, "is_in_voice_channel") or not adapter.is_in_voice_channel(guild_id):
return "Not in a voice channel."
try:
await adapter.leave_voice_channel(guild_id)
except Exception as e:
logger.warning("Error leaving voice channel: %s", e)
# Always clean up state even if leave raised an exception
self._voice_mode.pop(event.source.chat_id, None)
self._save_voice_modes()
if hasattr(adapter, "_voice_input_callback"):
adapter._voice_input_callback = None
return "Left voice channel."
def _handle_voice_timeout_cleanup(self, chat_id: str) -> None:
"""Called by the adapter when a voice channel times out.
Cleans up runner-side voice_mode state that the adapter cannot reach.
"""
self._voice_mode.pop(chat_id, None)
self._save_voice_modes()
async def _handle_voice_channel_input(
self, guild_id: int, user_id: int, transcript: str
):
"""Handle transcribed voice from a user in a voice channel.
Creates a synthetic MessageEvent and processes it through the
adapter's full message pipeline (session, typing, agent, TTS reply).
"""
adapter = self.adapters.get(Platform.DISCORD)
if not adapter:
return
text_ch_id = adapter._voice_text_channels.get(guild_id)
if not text_ch_id:
return
# Check authorization before processing voice input
source = SessionSource(
platform=Platform.DISCORD,
chat_id=str(text_ch_id),
user_id=str(user_id),
user_name=str(user_id),
chat_type="channel",
)
if not self._is_user_authorized(source):
logger.debug("Unauthorized voice input from user %d, ignoring", user_id)
return
# Show transcript in text channel (after auth, with mention sanitization)
try:
channel = adapter._client.get_channel(text_ch_id)
if channel:
safe_text = transcript[:2000].replace("@everyone", "@\u200beveryone").replace("@here", "@\u200bhere")
await channel.send(f"**[Voice]** <@{user_id}>: {safe_text}")
except Exception:
pass
# Build a synthetic MessageEvent and feed through the normal pipeline
# Use SimpleNamespace as raw_message so _get_guild_id() can extract
# guild_id and _send_voice_reply() plays audio in the voice channel.
from types import SimpleNamespace
event = MessageEvent(
source=source,
text=transcript,
message_type=MessageType.VOICE,
raw_message=SimpleNamespace(guild_id=guild_id, guild=None),
)
await adapter.handle_message(event)
def _should_send_voice_reply(
self,
event: MessageEvent,
response: str,
agent_messages: list,
) -> bool:
"""Decide whether the runner should send a TTS voice reply.
Returns False when:
- voice_mode is off for this chat
- response is empty or an error
- agent already called text_to_speech tool (dedup)
- voice input and base adapter auto-TTS already handled it (skip_double)
Exception: Discord voice channel base play_tts is a no-op there,
so the runner must handle VC playback.
"""
if not response or response.startswith("Error:"):
return False
chat_id = event.source.chat_id
voice_mode = self._voice_mode.get(chat_id, "off")
is_voice_input = (event.message_type == MessageType.VOICE)
should = (
(voice_mode == "all")
or (voice_mode == "voice_only" and is_voice_input)
)
if not should:
return False
# Dedup: agent already called TTS tool
has_agent_tts = any(
msg.get("role") == "assistant"
and any(
tc.get("function", {}).get("name") == "text_to_speech"
for tc in (msg.get("tool_calls") or [])
)
for msg in agent_messages
)
if has_agent_tts:
return False
# Dedup: base adapter auto-TTS already handles voice input.
# Exception: Discord voice channel — play_tts override is a no-op,
# so the runner must handle VC playback.
skip_double = is_voice_input
if skip_double:
adapter = self.adapters.get(event.source.platform)
guild_id = self._get_guild_id(event)
if (guild_id and adapter
and hasattr(adapter, "is_in_voice_channel")
and adapter.is_in_voice_channel(guild_id)):
skip_double = False
if skip_double:
return False
return True
async def _send_voice_reply(self, event: MessageEvent, text: str) -> None:
"""Generate TTS audio and send as a voice message before the text reply."""
import uuid as _uuid
audio_path = None
actual_path = None
try:
from tools.tts_tool import text_to_speech_tool, _strip_markdown_for_tts
tts_text = _strip_markdown_for_tts(text[:4000])
if not tts_text:
return
# Use .mp3 extension so edge-tts conversion to opus works correctly.
# The TTS tool may convert to .ogg — use file_path from result.
audio_path = os.path.join(
tempfile.gettempdir(), "hermes_voice",
f"tts_reply_{_uuid.uuid4().hex[:12]}.mp3",
)
os.makedirs(os.path.dirname(audio_path), exist_ok=True)
result_json = await asyncio.to_thread(
text_to_speech_tool, text=tts_text, output_path=audio_path
)
result = json.loads(result_json)
# Use the actual file path from result (may differ after opus conversion)
actual_path = result.get("file_path", audio_path)
if not result.get("success") or not os.path.isfile(actual_path):
logger.warning("Auto voice reply TTS failed: %s", result.get("error"))
return
adapter = self.adapters.get(event.source.platform)
# If connected to a voice channel, play there instead of sending a file
guild_id = self._get_guild_id(event)
if (guild_id
and hasattr(adapter, "play_in_voice_channel")
and hasattr(adapter, "is_in_voice_channel")
and adapter.is_in_voice_channel(guild_id)):
await adapter.play_in_voice_channel(guild_id, actual_path)
elif adapter and hasattr(adapter, "send_voice"):
send_kwargs: Dict[str, Any] = {
"chat_id": event.source.chat_id,
"audio_path": actual_path,
"reply_to": event.message_id,
}
if event.source.thread_id:
send_kwargs["metadata"] = {"thread_id": event.source.thread_id}
await adapter.send_voice(**send_kwargs)
except Exception as e:
logger.warning("Auto voice reply failed: %s", e, exc_info=True)
finally:
for p in {audio_path, actual_path} - {None}:
try:
os.unlink(p)
except OSError:
pass
async def _handle_rollback_command(self, event: MessageEvent) -> str:
"""Handle /rollback command — list or restore filesystem checkpoints."""
from tools.checkpoint_manager import CheckpointManager, format_checkpoint_list
@ -3011,14 +3387,16 @@ class GatewayRunner:
Returns:
The enriched message string with transcriptions prepended.
"""
from tools.transcription_tools import transcribe_audio
from tools.transcription_tools import transcribe_audio, get_stt_model_from_config
import asyncio
stt_model = get_stt_model_from_config()
enriched_parts = []
for path in audio_paths:
try:
logger.debug("Transcribing user voice: %s", path)
result = await asyncio.to_thread(transcribe_audio, path)
result = await asyncio.to_thread(transcribe_audio, path, model=stt_model)
if result["success"]:
transcript = result["transcript"]
enriched_parts.append(
@ -3027,10 +3405,10 @@ class GatewayRunner:
)
else:
error = result.get("error", "unknown error")
if "OPENAI_API_KEY" in error or "VOICE_TOOLS_OPENAI_KEY" in error:
if "No STT provider" in error or "not set" in error:
enriched_parts.append(
"[The user sent a voice message but I can't listen "
"to it right now~ VOICE_TOOLS_OPENAI_KEY isn't set up yet "
"to it right now~ No STT provider is configured "
"(';w;') Let them know!]"
)
else:
@ -3180,7 +3558,7 @@ class GatewayRunner:
Platform.HOMEASSISTANT: "hermes-homeassistant",
Platform.EMAIL: "hermes-email",
}
# Try to load platform_toolsets from config
platform_toolsets_config = {}
try:
@ -3192,7 +3570,7 @@ class GatewayRunner:
platform_toolsets_config = user_config.get("platform_toolsets", {})
except Exception as e:
logger.debug("Could not load platform_toolsets config: %s", e)
# Map platform enum to config key
platform_config_key = {
Platform.LOCAL: "cli",