merge: salvage PR #327 voice mode branch
Merge contributor branch feature/voice-mode onto current main for follow-up fixes.
This commit is contained in:
commit
523a1b6faf
37 changed files with 9248 additions and 228 deletions
400
gateway/run.py
400
gateway/run.py
|
|
@ -14,13 +14,16 @@ Usage:
|
|||
"""
|
||||
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import shlex
|
||||
import sys
|
||||
import signal
|
||||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
from logging.handlers import RotatingFileHandler
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
|
|
@ -281,6 +284,9 @@ class GatewayRunner:
|
|||
from gateway.hooks import HookRegistry
|
||||
self.hooks = HookRegistry()
|
||||
|
||||
# Per-chat voice reply mode: "off" | "voice_only" | "all"
|
||||
self._voice_mode: Dict[str, str] = self._load_voice_modes()
|
||||
|
||||
def _get_or_create_gateway_honcho(self, session_key: str):
|
||||
"""Return a persistent Honcho manager/config pair for this gateway session."""
|
||||
if not hasattr(self, "_honcho_managers"):
|
||||
|
|
@ -336,6 +342,27 @@ class GatewayRunner:
|
|||
for session_key in list(managers.keys()):
|
||||
self._shutdown_gateway_honcho(session_key)
|
||||
|
||||
# -- Voice mode persistence ------------------------------------------
|
||||
|
||||
_VOICE_MODE_PATH = _hermes_home / "gateway_voice_mode.json"
|
||||
|
||||
def _load_voice_modes(self) -> Dict[str, str]:
|
||||
try:
|
||||
return json.loads(self._VOICE_MODE_PATH.read_text())
|
||||
except (FileNotFoundError, json.JSONDecodeError, OSError):
|
||||
return {}
|
||||
|
||||
def _save_voice_modes(self) -> None:
|
||||
try:
|
||||
self._VOICE_MODE_PATH.parent.mkdir(parents=True, exist_ok=True)
|
||||
self._VOICE_MODE_PATH.write_text(
|
||||
json.dumps(self._voice_mode, indent=2)
|
||||
)
|
||||
except OSError as e:
|
||||
logger.warning("Failed to save voice modes: %s", e)
|
||||
|
||||
# -----------------------------------------------------------------
|
||||
|
||||
def _flush_memories_for_session(self, old_session_id: str):
|
||||
"""Prompt the agent to save memories/skills before context is lost.
|
||||
|
||||
|
|
@ -737,7 +764,7 @@ class GatewayRunner:
|
|||
logger.info("Stopping gateway...")
|
||||
self._running = False
|
||||
|
||||
for platform, adapter in self.adapters.items():
|
||||
for platform, adapter in list(self.adapters.items()):
|
||||
try:
|
||||
await adapter.disconnect()
|
||||
logger.info("✓ %s disconnected", platform.value)
|
||||
|
|
@ -897,7 +924,7 @@ class GatewayRunner:
|
|||
7. Return response
|
||||
"""
|
||||
source = event.source
|
||||
|
||||
|
||||
# Check if user is authorized
|
||||
if not self._is_user_authorized(source):
|
||||
logger.warning("Unauthorized user: %s (%s) on %s", source.user_id, source.user_name, source.platform.value)
|
||||
|
|
@ -949,7 +976,7 @@ class GatewayRunner:
|
|||
"personality", "retry", "undo", "sethome", "set-home",
|
||||
"compress", "usage", "insights", "reload-mcp", "reload_mcp",
|
||||
"update", "title", "resume", "provider", "rollback",
|
||||
"background", "reasoning"}
|
||||
"background", "reasoning", "voice"}
|
||||
if command and command in _known_commands:
|
||||
await self.hooks.emit(f"command:{command}", {
|
||||
"platform": source.platform.value if source.platform else "",
|
||||
|
|
@ -1020,7 +1047,10 @@ class GatewayRunner:
|
|||
|
||||
if command == "reasoning":
|
||||
return await self._handle_reasoning_command(event)
|
||||
|
||||
|
||||
if command == "voice":
|
||||
return await self._handle_voice_command(event)
|
||||
|
||||
# User-defined quick commands (bypass agent loop, no LLM call)
|
||||
if command:
|
||||
if isinstance(self.config, dict):
|
||||
|
|
@ -1377,6 +1407,19 @@ class GatewayRunner:
|
|||
f"or ignore to skip."
|
||||
)
|
||||
|
||||
# -----------------------------------------------------------------
|
||||
# Voice channel awareness — inject current voice channel state
|
||||
# into context so the agent knows who is in the channel and who
|
||||
# is speaking, without needing a separate tool call.
|
||||
# -----------------------------------------------------------------
|
||||
if source.platform == Platform.DISCORD:
|
||||
adapter = self.adapters.get(Platform.DISCORD)
|
||||
guild_id = self._get_guild_id(event)
|
||||
if guild_id and adapter and hasattr(adapter, "get_voice_channel_context"):
|
||||
vc_context = adapter.get_voice_channel_context(guild_id)
|
||||
if vc_context:
|
||||
context_prompt += f"\n\n{vc_context}"
|
||||
|
||||
# -----------------------------------------------------------------
|
||||
# Auto-analyze images sent by the user
|
||||
#
|
||||
|
|
@ -1583,7 +1626,11 @@ class GatewayRunner:
|
|||
session_entry.session_key,
|
||||
last_prompt_tokens=agent_result.get("last_prompt_tokens", 0),
|
||||
)
|
||||
|
||||
|
||||
# Auto voice reply: send TTS audio before the text response
|
||||
if self._should_send_voice_reply(event, response, agent_messages):
|
||||
await self._send_voice_reply(event, response)
|
||||
|
||||
return response
|
||||
|
||||
except Exception as e:
|
||||
|
|
@ -1692,6 +1739,7 @@ class GatewayRunner:
|
|||
"`/reasoning [level|show|hide]` — Set reasoning effort or toggle display",
|
||||
"`/rollback [number]` — List or restore filesystem checkpoints",
|
||||
"`/background <prompt>` — Run a prompt in a separate background session",
|
||||
"`/voice [on|off|tts|status]` — Toggle voice reply mode",
|
||||
"`/reload-mcp` — Reload MCP servers from config",
|
||||
"`/update` — Update Hermes Agent to the latest version",
|
||||
"`/help` — Show this message",
|
||||
|
|
@ -2067,6 +2115,334 @@ class GatewayRunner:
|
|||
f"Cron jobs and cross-platform messages will be delivered here."
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def _get_guild_id(event: MessageEvent) -> Optional[int]:
|
||||
"""Extract Discord guild_id from the raw message object."""
|
||||
raw = getattr(event, "raw_message", None)
|
||||
if raw is None:
|
||||
return None
|
||||
# Slash command interaction
|
||||
if hasattr(raw, "guild_id") and raw.guild_id:
|
||||
return int(raw.guild_id)
|
||||
# Regular message
|
||||
if hasattr(raw, "guild") and raw.guild:
|
||||
return raw.guild.id
|
||||
return None
|
||||
|
||||
async def _handle_voice_command(self, event: MessageEvent) -> str:
|
||||
"""Handle /voice [on|off|tts|channel|leave|status] command."""
|
||||
args = event.get_command_args().strip().lower()
|
||||
chat_id = event.source.chat_id
|
||||
|
||||
adapter = self.adapters.get(event.source.platform)
|
||||
|
||||
if args in ("on", "enable"):
|
||||
self._voice_mode[chat_id] = "voice_only"
|
||||
self._save_voice_modes()
|
||||
if adapter:
|
||||
adapter._auto_tts_disabled_chats.discard(chat_id)
|
||||
return (
|
||||
"Voice mode enabled.\n"
|
||||
"I'll reply with voice when you send voice messages.\n"
|
||||
"Use /voice tts to get voice replies for all messages."
|
||||
)
|
||||
elif args in ("off", "disable"):
|
||||
self._voice_mode.pop(chat_id, None)
|
||||
self._save_voice_modes()
|
||||
if adapter:
|
||||
adapter._auto_tts_disabled_chats.add(chat_id)
|
||||
return "Voice mode disabled. Text-only replies."
|
||||
elif args == "tts":
|
||||
self._voice_mode[chat_id] = "all"
|
||||
self._save_voice_modes()
|
||||
if adapter:
|
||||
adapter._auto_tts_disabled_chats.discard(chat_id)
|
||||
return (
|
||||
"Auto-TTS enabled.\n"
|
||||
"All replies will include a voice message."
|
||||
)
|
||||
elif args in ("channel", "join"):
|
||||
return await self._handle_voice_channel_join(event)
|
||||
elif args == "leave":
|
||||
return await self._handle_voice_channel_leave(event)
|
||||
elif args == "status":
|
||||
mode = self._voice_mode.get(chat_id, "off")
|
||||
labels = {
|
||||
"off": "Off (text only)",
|
||||
"voice_only": "On (voice reply to voice messages)",
|
||||
"all": "TTS (voice reply to all messages)",
|
||||
}
|
||||
# Append voice channel info if connected
|
||||
adapter = self.adapters.get(event.source.platform)
|
||||
guild_id = self._get_guild_id(event)
|
||||
if guild_id and hasattr(adapter, "get_voice_channel_info"):
|
||||
info = adapter.get_voice_channel_info(guild_id)
|
||||
if info:
|
||||
lines = [
|
||||
f"Voice mode: {labels.get(mode, mode)}",
|
||||
f"Voice channel: #{info['channel_name']}",
|
||||
f"Participants: {info['member_count']}",
|
||||
]
|
||||
for m in info["members"]:
|
||||
status = " (speaking)" if m.get("is_speaking") else ""
|
||||
lines.append(f" - {m['display_name']}{status}")
|
||||
return "\n".join(lines)
|
||||
return f"Voice mode: {labels.get(mode, mode)}"
|
||||
else:
|
||||
# Toggle: off → on, on/all → off
|
||||
current = self._voice_mode.get(chat_id, "off")
|
||||
if current == "off":
|
||||
self._voice_mode[chat_id] = "voice_only"
|
||||
self._save_voice_modes()
|
||||
if adapter:
|
||||
adapter._auto_tts_disabled_chats.discard(chat_id)
|
||||
return "Voice mode enabled."
|
||||
else:
|
||||
self._voice_mode.pop(chat_id, None)
|
||||
self._save_voice_modes()
|
||||
if adapter:
|
||||
adapter._auto_tts_disabled_chats.add(chat_id)
|
||||
return "Voice mode disabled."
|
||||
|
||||
async def _handle_voice_channel_join(self, event: MessageEvent) -> str:
|
||||
"""Join the user's current Discord voice channel."""
|
||||
adapter = self.adapters.get(event.source.platform)
|
||||
if not hasattr(adapter, "join_voice_channel"):
|
||||
return "Voice channels are not supported on this platform."
|
||||
|
||||
guild_id = self._get_guild_id(event)
|
||||
if not guild_id:
|
||||
return "This command only works in a Discord server."
|
||||
|
||||
voice_channel = await adapter.get_user_voice_channel(
|
||||
guild_id, event.source.user_id
|
||||
)
|
||||
if not voice_channel:
|
||||
return "You need to be in a voice channel first."
|
||||
|
||||
# Wire callbacks BEFORE join so voice input arriving immediately
|
||||
# after connection is not lost.
|
||||
if hasattr(adapter, "_voice_input_callback"):
|
||||
adapter._voice_input_callback = self._handle_voice_channel_input
|
||||
if hasattr(adapter, "_on_voice_disconnect"):
|
||||
adapter._on_voice_disconnect = self._handle_voice_timeout_cleanup
|
||||
|
||||
try:
|
||||
success = await adapter.join_voice_channel(voice_channel)
|
||||
except Exception as e:
|
||||
logger.warning("Failed to join voice channel: %s", e)
|
||||
adapter._voice_input_callback = None
|
||||
return f"Failed to join voice channel: {e}"
|
||||
|
||||
if success:
|
||||
adapter._voice_text_channels[guild_id] = int(event.source.chat_id)
|
||||
self._voice_mode[event.source.chat_id] = "all"
|
||||
self._save_voice_modes()
|
||||
adapter._auto_tts_disabled_chats.discard(event.source.chat_id)
|
||||
return (
|
||||
f"Joined voice channel **{voice_channel.name}**.\n"
|
||||
f"I'll speak my replies and listen to you. Use /voice leave to disconnect."
|
||||
)
|
||||
# Join failed — clear callback
|
||||
adapter._voice_input_callback = None
|
||||
return "Failed to join voice channel. Check bot permissions (Connect + Speak)."
|
||||
|
||||
async def _handle_voice_channel_leave(self, event: MessageEvent) -> str:
|
||||
"""Leave the Discord voice channel."""
|
||||
adapter = self.adapters.get(event.source.platform)
|
||||
guild_id = self._get_guild_id(event)
|
||||
|
||||
if not guild_id or not hasattr(adapter, "leave_voice_channel"):
|
||||
return "Not in a voice channel."
|
||||
|
||||
if not hasattr(adapter, "is_in_voice_channel") or not adapter.is_in_voice_channel(guild_id):
|
||||
return "Not in a voice channel."
|
||||
|
||||
try:
|
||||
await adapter.leave_voice_channel(guild_id)
|
||||
except Exception as e:
|
||||
logger.warning("Error leaving voice channel: %s", e)
|
||||
# Always clean up state even if leave raised an exception
|
||||
self._voice_mode.pop(event.source.chat_id, None)
|
||||
self._save_voice_modes()
|
||||
if hasattr(adapter, "_voice_input_callback"):
|
||||
adapter._voice_input_callback = None
|
||||
return "Left voice channel."
|
||||
|
||||
def _handle_voice_timeout_cleanup(self, chat_id: str) -> None:
|
||||
"""Called by the adapter when a voice channel times out.
|
||||
|
||||
Cleans up runner-side voice_mode state that the adapter cannot reach.
|
||||
"""
|
||||
self._voice_mode.pop(chat_id, None)
|
||||
self._save_voice_modes()
|
||||
|
||||
async def _handle_voice_channel_input(
|
||||
self, guild_id: int, user_id: int, transcript: str
|
||||
):
|
||||
"""Handle transcribed voice from a user in a voice channel.
|
||||
|
||||
Creates a synthetic MessageEvent and processes it through the
|
||||
adapter's full message pipeline (session, typing, agent, TTS reply).
|
||||
"""
|
||||
adapter = self.adapters.get(Platform.DISCORD)
|
||||
if not adapter:
|
||||
return
|
||||
|
||||
text_ch_id = adapter._voice_text_channels.get(guild_id)
|
||||
if not text_ch_id:
|
||||
return
|
||||
|
||||
# Check authorization before processing voice input
|
||||
source = SessionSource(
|
||||
platform=Platform.DISCORD,
|
||||
chat_id=str(text_ch_id),
|
||||
user_id=str(user_id),
|
||||
user_name=str(user_id),
|
||||
chat_type="channel",
|
||||
)
|
||||
if not self._is_user_authorized(source):
|
||||
logger.debug("Unauthorized voice input from user %d, ignoring", user_id)
|
||||
return
|
||||
|
||||
# Show transcript in text channel (after auth, with mention sanitization)
|
||||
try:
|
||||
channel = adapter._client.get_channel(text_ch_id)
|
||||
if channel:
|
||||
safe_text = transcript[:2000].replace("@everyone", "@\u200beveryone").replace("@here", "@\u200bhere")
|
||||
await channel.send(f"**[Voice]** <@{user_id}>: {safe_text}")
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Build a synthetic MessageEvent and feed through the normal pipeline
|
||||
# Use SimpleNamespace as raw_message so _get_guild_id() can extract
|
||||
# guild_id and _send_voice_reply() plays audio in the voice channel.
|
||||
from types import SimpleNamespace
|
||||
event = MessageEvent(
|
||||
source=source,
|
||||
text=transcript,
|
||||
message_type=MessageType.VOICE,
|
||||
raw_message=SimpleNamespace(guild_id=guild_id, guild=None),
|
||||
)
|
||||
|
||||
await adapter.handle_message(event)
|
||||
|
||||
def _should_send_voice_reply(
|
||||
self,
|
||||
event: MessageEvent,
|
||||
response: str,
|
||||
agent_messages: list,
|
||||
) -> bool:
|
||||
"""Decide whether the runner should send a TTS voice reply.
|
||||
|
||||
Returns False when:
|
||||
- voice_mode is off for this chat
|
||||
- response is empty or an error
|
||||
- agent already called text_to_speech tool (dedup)
|
||||
- voice input and base adapter auto-TTS already handled it (skip_double)
|
||||
Exception: Discord voice channel — base play_tts is a no-op there,
|
||||
so the runner must handle VC playback.
|
||||
"""
|
||||
if not response or response.startswith("Error:"):
|
||||
return False
|
||||
|
||||
chat_id = event.source.chat_id
|
||||
voice_mode = self._voice_mode.get(chat_id, "off")
|
||||
is_voice_input = (event.message_type == MessageType.VOICE)
|
||||
|
||||
should = (
|
||||
(voice_mode == "all")
|
||||
or (voice_mode == "voice_only" and is_voice_input)
|
||||
)
|
||||
if not should:
|
||||
return False
|
||||
|
||||
# Dedup: agent already called TTS tool
|
||||
has_agent_tts = any(
|
||||
msg.get("role") == "assistant"
|
||||
and any(
|
||||
tc.get("function", {}).get("name") == "text_to_speech"
|
||||
for tc in (msg.get("tool_calls") or [])
|
||||
)
|
||||
for msg in agent_messages
|
||||
)
|
||||
if has_agent_tts:
|
||||
return False
|
||||
|
||||
# Dedup: base adapter auto-TTS already handles voice input.
|
||||
# Exception: Discord voice channel — play_tts override is a no-op,
|
||||
# so the runner must handle VC playback.
|
||||
skip_double = is_voice_input
|
||||
if skip_double:
|
||||
adapter = self.adapters.get(event.source.platform)
|
||||
guild_id = self._get_guild_id(event)
|
||||
if (guild_id and adapter
|
||||
and hasattr(adapter, "is_in_voice_channel")
|
||||
and adapter.is_in_voice_channel(guild_id)):
|
||||
skip_double = False
|
||||
if skip_double:
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
async def _send_voice_reply(self, event: MessageEvent, text: str) -> None:
|
||||
"""Generate TTS audio and send as a voice message before the text reply."""
|
||||
import uuid as _uuid
|
||||
audio_path = None
|
||||
actual_path = None
|
||||
try:
|
||||
from tools.tts_tool import text_to_speech_tool, _strip_markdown_for_tts
|
||||
|
||||
tts_text = _strip_markdown_for_tts(text[:4000])
|
||||
if not tts_text:
|
||||
return
|
||||
|
||||
# Use .mp3 extension so edge-tts conversion to opus works correctly.
|
||||
# The TTS tool may convert to .ogg — use file_path from result.
|
||||
audio_path = os.path.join(
|
||||
tempfile.gettempdir(), "hermes_voice",
|
||||
f"tts_reply_{_uuid.uuid4().hex[:12]}.mp3",
|
||||
)
|
||||
os.makedirs(os.path.dirname(audio_path), exist_ok=True)
|
||||
|
||||
result_json = await asyncio.to_thread(
|
||||
text_to_speech_tool, text=tts_text, output_path=audio_path
|
||||
)
|
||||
result = json.loads(result_json)
|
||||
|
||||
# Use the actual file path from result (may differ after opus conversion)
|
||||
actual_path = result.get("file_path", audio_path)
|
||||
if not result.get("success") or not os.path.isfile(actual_path):
|
||||
logger.warning("Auto voice reply TTS failed: %s", result.get("error"))
|
||||
return
|
||||
|
||||
adapter = self.adapters.get(event.source.platform)
|
||||
|
||||
# If connected to a voice channel, play there instead of sending a file
|
||||
guild_id = self._get_guild_id(event)
|
||||
if (guild_id
|
||||
and hasattr(adapter, "play_in_voice_channel")
|
||||
and hasattr(adapter, "is_in_voice_channel")
|
||||
and adapter.is_in_voice_channel(guild_id)):
|
||||
await adapter.play_in_voice_channel(guild_id, actual_path)
|
||||
elif adapter and hasattr(adapter, "send_voice"):
|
||||
send_kwargs: Dict[str, Any] = {
|
||||
"chat_id": event.source.chat_id,
|
||||
"audio_path": actual_path,
|
||||
"reply_to": event.message_id,
|
||||
}
|
||||
if event.source.thread_id:
|
||||
send_kwargs["metadata"] = {"thread_id": event.source.thread_id}
|
||||
await adapter.send_voice(**send_kwargs)
|
||||
except Exception as e:
|
||||
logger.warning("Auto voice reply failed: %s", e, exc_info=True)
|
||||
finally:
|
||||
for p in {audio_path, actual_path} - {None}:
|
||||
try:
|
||||
os.unlink(p)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
async def _handle_rollback_command(self, event: MessageEvent) -> str:
|
||||
"""Handle /rollback command — list or restore filesystem checkpoints."""
|
||||
from tools.checkpoint_manager import CheckpointManager, format_checkpoint_list
|
||||
|
|
@ -3011,14 +3387,16 @@ class GatewayRunner:
|
|||
Returns:
|
||||
The enriched message string with transcriptions prepended.
|
||||
"""
|
||||
from tools.transcription_tools import transcribe_audio
|
||||
from tools.transcription_tools import transcribe_audio, get_stt_model_from_config
|
||||
import asyncio
|
||||
|
||||
stt_model = get_stt_model_from_config()
|
||||
|
||||
enriched_parts = []
|
||||
for path in audio_paths:
|
||||
try:
|
||||
logger.debug("Transcribing user voice: %s", path)
|
||||
result = await asyncio.to_thread(transcribe_audio, path)
|
||||
result = await asyncio.to_thread(transcribe_audio, path, model=stt_model)
|
||||
if result["success"]:
|
||||
transcript = result["transcript"]
|
||||
enriched_parts.append(
|
||||
|
|
@ -3027,10 +3405,10 @@ class GatewayRunner:
|
|||
)
|
||||
else:
|
||||
error = result.get("error", "unknown error")
|
||||
if "OPENAI_API_KEY" in error or "VOICE_TOOLS_OPENAI_KEY" in error:
|
||||
if "No STT provider" in error or "not set" in error:
|
||||
enriched_parts.append(
|
||||
"[The user sent a voice message but I can't listen "
|
||||
"to it right now~ VOICE_TOOLS_OPENAI_KEY isn't set up yet "
|
||||
"to it right now~ No STT provider is configured "
|
||||
"(';w;') Let them know!]"
|
||||
)
|
||||
else:
|
||||
|
|
@ -3180,7 +3558,7 @@ class GatewayRunner:
|
|||
Platform.HOMEASSISTANT: "hermes-homeassistant",
|
||||
Platform.EMAIL: "hermes-email",
|
||||
}
|
||||
|
||||
|
||||
# Try to load platform_toolsets from config
|
||||
platform_toolsets_config = {}
|
||||
try:
|
||||
|
|
@ -3192,7 +3570,7 @@ class GatewayRunner:
|
|||
platform_toolsets_config = user_config.get("platform_toolsets", {})
|
||||
except Exception as e:
|
||||
logger.debug("Could not load platform_toolsets config: %s", e)
|
||||
|
||||
|
||||
# Map platform enum to config key
|
||||
platform_config_key = {
|
||||
Platform.LOCAL: "cli",
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue