Add Text-to-Speech (TTS) functionality with multiple providers
Add tool previews Add AGENTS and SOUL.md support Add Exec Approval
This commit is contained in:
parent
89c6f24d48
commit
f5be6177b2
18 changed files with 1200 additions and 21 deletions
|
|
@ -240,6 +240,61 @@ class BasePlatformAdapter(ABC):
|
|||
|
||||
return images, cleaned
|
||||
|
||||
async def send_voice(
|
||||
self,
|
||||
chat_id: str,
|
||||
audio_path: str,
|
||||
caption: Optional[str] = None,
|
||||
reply_to: Optional[str] = None,
|
||||
) -> SendResult:
|
||||
"""
|
||||
Send an audio file as a native voice message via the platform API.
|
||||
|
||||
Override in subclasses to send audio as voice bubbles (Telegram)
|
||||
or file attachments (Discord). Default falls back to sending the
|
||||
file path as text.
|
||||
"""
|
||||
text = f"🔊 Audio: {audio_path}"
|
||||
if caption:
|
||||
text = f"{caption}\n{text}"
|
||||
return await self.send(chat_id=chat_id, content=text, reply_to=reply_to)
|
||||
|
||||
@staticmethod
|
||||
def extract_media(content: str) -> Tuple[List[Tuple[str, bool]], str]:
|
||||
"""
|
||||
Extract MEDIA:<path> tags and [[audio_as_voice]] directives from response text.
|
||||
|
||||
The TTS tool returns responses like:
|
||||
[[audio_as_voice]]
|
||||
MEDIA:/path/to/audio.ogg
|
||||
|
||||
Args:
|
||||
content: The response text to scan.
|
||||
|
||||
Returns:
|
||||
Tuple of (list of (path, is_voice) pairs, cleaned content with tags removed).
|
||||
"""
|
||||
media = []
|
||||
cleaned = content
|
||||
|
||||
# Check for [[audio_as_voice]] directive
|
||||
has_voice_tag = "[[audio_as_voice]]" in content
|
||||
cleaned = cleaned.replace("[[audio_as_voice]]", "")
|
||||
|
||||
# Extract MEDIA:<path> tags (path may contain spaces)
|
||||
media_pattern = r'MEDIA:(\S+)'
|
||||
for match in re.finditer(media_pattern, content):
|
||||
path = match.group(1).strip()
|
||||
if path:
|
||||
media.append((path, has_voice_tag))
|
||||
|
||||
# Remove MEDIA tags from content
|
||||
if media:
|
||||
cleaned = re.sub(media_pattern, '', cleaned)
|
||||
cleaned = re.sub(r'\n{3,}', '\n\n', cleaned).strip()
|
||||
|
||||
return media, cleaned
|
||||
|
||||
async def _keep_typing(self, chat_id: str, interval: float = 2.0) -> None:
|
||||
"""
|
||||
Continuously send typing indicator until cancelled.
|
||||
|
|
@ -294,10 +349,13 @@ class BasePlatformAdapter(ABC):
|
|||
|
||||
# Send response if any
|
||||
if response:
|
||||
# Extract MEDIA:<path> tags (from TTS tool) before other processing
|
||||
media_files, response = self.extract_media(response)
|
||||
|
||||
# Extract image URLs and send them as native platform attachments
|
||||
images, text_content = self.extract_images(response)
|
||||
|
||||
# Send the text portion first (if any remains after extracting images)
|
||||
# Send the text portion first (if any remains after extractions)
|
||||
if text_content:
|
||||
result = await self.send(
|
||||
chat_id=event.source.chat_id,
|
||||
|
|
@ -329,6 +387,18 @@ class BasePlatformAdapter(ABC):
|
|||
print(f"[{self.name}] Failed to send image: {img_result.error}")
|
||||
except Exception as img_err:
|
||||
print(f"[{self.name}] Error sending image: {img_err}")
|
||||
|
||||
# Send extracted audio/voice files as native attachments
|
||||
for audio_path, is_voice in media_files:
|
||||
try:
|
||||
voice_result = await self.send_voice(
|
||||
chat_id=event.source.chat_id,
|
||||
audio_path=audio_path,
|
||||
)
|
||||
if not voice_result.success:
|
||||
print(f"[{self.name}] Failed to send voice: {voice_result.error}")
|
||||
except Exception as voice_err:
|
||||
print(f"[{self.name}] Error sending voice: {voice_err}")
|
||||
|
||||
# Check if there's a pending message that was queued during our processing
|
||||
if session_key in self._pending_messages:
|
||||
|
|
|
|||
|
|
@ -174,6 +174,44 @@ class DiscordAdapter(BasePlatformAdapter):
|
|||
except Exception as e:
|
||||
return SendResult(success=False, error=str(e))
|
||||
|
||||
async def send_voice(
|
||||
self,
|
||||
chat_id: str,
|
||||
audio_path: str,
|
||||
caption: Optional[str] = None,
|
||||
reply_to: Optional[str] = None,
|
||||
) -> SendResult:
|
||||
"""Send audio as a Discord file attachment."""
|
||||
if not self._client:
|
||||
return SendResult(success=False, error="Not connected")
|
||||
|
||||
try:
|
||||
import io
|
||||
|
||||
channel = self._client.get_channel(int(chat_id))
|
||||
if not channel:
|
||||
channel = await self._client.fetch_channel(int(chat_id))
|
||||
if not channel:
|
||||
return SendResult(success=False, error=f"Channel {chat_id} not found")
|
||||
|
||||
if not os.path.exists(audio_path):
|
||||
return SendResult(success=False, error=f"Audio file not found: {audio_path}")
|
||||
|
||||
# Determine filename from path
|
||||
filename = os.path.basename(audio_path)
|
||||
|
||||
with open(audio_path, "rb") as f:
|
||||
file = discord.File(io.BytesIO(f.read()), filename=filename)
|
||||
msg = await channel.send(
|
||||
content=caption if caption else None,
|
||||
file=file,
|
||||
)
|
||||
return SendResult(success=True, message_id=str(msg.id))
|
||||
|
||||
except Exception as e:
|
||||
print(f"[{self.name}] Failed to send audio: {e}")
|
||||
return await super().send_voice(chat_id, audio_path, caption, reply_to)
|
||||
|
||||
async def send_image(
|
||||
self,
|
||||
chat_id: str,
|
||||
|
|
|
|||
|
|
@ -174,6 +174,44 @@ class TelegramAdapter(BasePlatformAdapter):
|
|||
except Exception as e:
|
||||
return SendResult(success=False, error=str(e))
|
||||
|
||||
async def send_voice(
|
||||
self,
|
||||
chat_id: str,
|
||||
audio_path: str,
|
||||
caption: Optional[str] = None,
|
||||
reply_to: Optional[str] = None,
|
||||
) -> SendResult:
|
||||
"""Send audio as a native Telegram voice message or audio file."""
|
||||
if not self._bot:
|
||||
return SendResult(success=False, error="Not connected")
|
||||
|
||||
try:
|
||||
import os
|
||||
if not os.path.exists(audio_path):
|
||||
return SendResult(success=False, error=f"Audio file not found: {audio_path}")
|
||||
|
||||
with open(audio_path, "rb") as audio_file:
|
||||
# .ogg files -> send as voice (round playable bubble)
|
||||
if audio_path.endswith(".ogg") or audio_path.endswith(".opus"):
|
||||
msg = await self._bot.send_voice(
|
||||
chat_id=int(chat_id),
|
||||
voice=audio_file,
|
||||
caption=caption[:1024] if caption else None,
|
||||
reply_to_message_id=int(reply_to) if reply_to else None,
|
||||
)
|
||||
else:
|
||||
# .mp3 and others -> send as audio file
|
||||
msg = await self._bot.send_audio(
|
||||
chat_id=int(chat_id),
|
||||
audio=audio_file,
|
||||
caption=caption[:1024] if caption else None,
|
||||
reply_to_message_id=int(reply_to) if reply_to else None,
|
||||
)
|
||||
return SendResult(success=True, message_id=str(msg.message_id))
|
||||
except Exception as e:
|
||||
print(f"[{self.name}] Failed to send voice/audio: {e}")
|
||||
return await super().send_voice(chat_id, audio_path, caption, reply_to)
|
||||
|
||||
async def send_image(
|
||||
self,
|
||||
chat_id: str,
|
||||
|
|
|
|||
|
|
@ -35,6 +35,9 @@ load_dotenv()
|
|||
# Gateway runs in quiet mode - suppress debug output and use cwd directly (no temp dirs)
|
||||
os.environ["HERMES_QUIET"] = "1"
|
||||
|
||||
# Enable interactive exec approval for dangerous commands on messaging platforms
|
||||
os.environ["HERMES_EXEC_ASK"] = "1"
|
||||
|
||||
# Set terminal working directory for messaging platforms
|
||||
# Uses MESSAGING_CWD if set, otherwise defaults to home directory
|
||||
# This is separate from CLI which uses the directory where `hermes` is run
|
||||
|
|
@ -77,6 +80,10 @@ class GatewayRunner:
|
|||
# Key: session_key, Value: AIAgent instance
|
||||
self._running_agents: Dict[str, Any] = {}
|
||||
self._pending_messages: Dict[str, str] = {} # Queued messages during interrupt
|
||||
|
||||
# Track pending exec approvals per session
|
||||
# Key: session_key, Value: {"command": str, "pattern_key": str}
|
||||
self._pending_approvals: Dict[str, Dict[str, str]] = {}
|
||||
|
||||
async def start(self) -> bool:
|
||||
"""
|
||||
|
|
@ -246,6 +253,25 @@ class GatewayRunner:
|
|||
if command == "stop":
|
||||
return await self._handle_stop_command(event)
|
||||
|
||||
# Check for pending exec approval responses
|
||||
session_key_preview = f"agent:main:{source.platform.value}:{source.chat_type}:{source.chat_id}" if source.chat_type != "dm" else f"agent:main:{source.platform.value}:dm"
|
||||
if session_key_preview in self._pending_approvals:
|
||||
user_text = event.text.strip().lower()
|
||||
if user_text in ("yes", "y", "approve", "ok", "go", "do it"):
|
||||
approval = self._pending_approvals.pop(session_key_preview)
|
||||
cmd = approval["command"]
|
||||
pattern_key = approval.get("pattern_key", "")
|
||||
print(f"[gateway] ✅ User approved dangerous command: {cmd[:60]}...")
|
||||
# Approve for session and re-run via terminal_tool with force=True
|
||||
from tools.terminal_tool import terminal_tool, _session_approved_patterns
|
||||
_session_approved_patterns.add(pattern_key)
|
||||
result = terminal_tool(command=cmd, force=True)
|
||||
return f"✅ Command approved and executed.\n\n```\n{result[:3500]}\n```"
|
||||
elif user_text in ("no", "n", "deny", "cancel", "nope"):
|
||||
self._pending_approvals.pop(session_key_preview)
|
||||
return "❌ Command denied."
|
||||
# If it's not clearly an approval/denial, fall through to normal processing
|
||||
|
||||
# Get or create session
|
||||
session_entry = self.session_store.get_or_create_session(source)
|
||||
session_key = session_entry.session_key
|
||||
|
|
@ -282,6 +308,17 @@ class GatewayRunner:
|
|||
session_key=session_key
|
||||
)
|
||||
|
||||
# Check if the agent encountered a dangerous command needing approval
|
||||
# The terminal tool stores the last pending approval globally
|
||||
try:
|
||||
from tools.terminal_tool import _last_pending_approval
|
||||
if _last_pending_approval:
|
||||
self._pending_approvals[session_key] = _last_pending_approval.copy()
|
||||
# Clear the global so it doesn't leak to other sessions
|
||||
_last_pending_approval.clear()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Append to transcript
|
||||
self.session_store.append_to_transcript(
|
||||
session_entry.session_id,
|
||||
|
|
@ -418,23 +455,35 @@ class GatewayRunner:
|
|||
return
|
||||
last_tool[0] = tool_name
|
||||
|
||||
# Build progress message
|
||||
# Build progress message with primary argument preview
|
||||
tool_emojis = {
|
||||
"terminal": "💻",
|
||||
"web_search": "🔍",
|
||||
"web_extract": "📄",
|
||||
"read_file": "📖",
|
||||
"write_file": "✍️",
|
||||
"patch": "🔧",
|
||||
"search": "🔎",
|
||||
"list_directory": "📂",
|
||||
"image_generate": "🎨",
|
||||
"text_to_speech": "🔊",
|
||||
"browser_navigate": "🌐",
|
||||
"browser_click": "👆",
|
||||
"browser_type": "⌨️",
|
||||
"browser_snapshot": "📸",
|
||||
"moa_query": "🧠",
|
||||
"mixture_of_agents": "🧠",
|
||||
"vision_analyze": "👁️",
|
||||
"skill_view": "📚",
|
||||
"skills_list": "📋",
|
||||
}
|
||||
emoji = tool_emojis.get(tool_name, "⚙️")
|
||||
|
||||
if tool_name == "terminal" and preview:
|
||||
msg = f"{emoji} `{preview}`..."
|
||||
if preview:
|
||||
# Truncate preview to keep messages clean
|
||||
if len(preview) > 40:
|
||||
preview = preview[:37] + "..."
|
||||
msg = f"{emoji} {tool_name}... \"{preview}\""
|
||||
else:
|
||||
msg = f"{emoji} {tool_name}..."
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue