Add Text-to-Speech (TTS) support with Edge TTS and ElevenLabs integration
- Updated `pyproject.toml` to include Edge TTS and ElevenLabs as dependencies.
- Enhanced documentation to detail voice message capabilities across platforms and TTS provider options.
- Modified the GatewayRunner to handle MEDIA tags from TTS tool responses, ensuring proper delivery of audio messages.
This commit is contained in:
parent
84718d183a
commit
586b0a7047
4 changed files with 62 additions and 7 deletions
|
|
@ -307,6 +307,28 @@ This is intentional: CLI users are in a terminal and expect the agent to work in
|
||||||
|
|
||||||
If the agent hits the max iteration limit while working, instead of a generic error, it asks the model to summarize what it found so far. This gives you a useful response even when the task couldn't be fully completed.
|
If the agent hits the max iteration limit while working, instead of a generic error, it asks the model to summarize what it found so far. This gives you a useful response even when the task couldn't be fully completed.
|
||||||
|
|
||||||
|
## Voice Messages (TTS)
|
||||||
|
|
||||||
|
The `text_to_speech` tool generates audio that is delivered in whatever form each platform supports — a native voice message where available, otherwise an audio file:
|
||||||
|
|
||||||
|
| Platform | Delivery | Format |
|
||||||
|
|----------|----------|--------|
|
||||||
|
| Telegram | Voice bubble (plays inline) | Opus `.ogg` (converted from MP3 via ffmpeg) |
|
||||||
|
| Discord | Audio file attachment | MP3 |
|
||||||
|
| WhatsApp | Audio file attachment | MP3 |
|
||||||
|
| CLI | Saved to `~/voice-memos/` | MP3 (or Opus if ffmpeg available) |
|
||||||
|
|
||||||
|
**Providers:**
|
||||||
|
- **Edge TTS** (default) — Free, no API key, 322 voices in 74 languages
|
||||||
|
- **ElevenLabs** — Premium quality, requires `ELEVENLABS_API_KEY`
|
||||||
|
- **OpenAI TTS** — Good quality, requires `OPENAI_API_KEY`
|
||||||
|
|
||||||
|
Voice and provider are configured by the user in `~/.hermes/config.yaml` under the `tts:` key. The model only sends text; it does not choose the voice.
|
||||||
|
|
||||||
|
The tool returns a `MEDIA:<path>` tag, which the gateway's send pipeline intercepts in order to deliver the file as a native audio message. If the `[[audio_as_voice]]` directive is also present (meaning the audio is available in Opus format), Telegram sends it as a voice bubble instead of an audio file.
|
||||||
|
|
||||||
|
> **Note:** Telegram voice bubbles require `ffmpeg` for Opus conversion (Edge TTS outputs MP3). Install with `apt install ffmpeg` or `brew install ffmpeg`. Without ffmpeg, audio is sent as a regular file.
|
||||||
|
|
||||||
## Cron Job Delivery
|
## Cron Job Delivery
|
||||||
|
|
||||||
When scheduling cron jobs, you can specify where the output should be delivered:
|
When scheduling cron jobs, you can specify where the output should be delivered:
|
||||||
|
|
|
||||||
|
|
@ -40,11 +40,15 @@ async def web_search(query: str) -> dict:
|
||||||
|----------|--------|-------|
|
|----------|--------|-------|
|
||||||
| **Web** | `web_tools.py` | `web_search`, `web_extract`, `web_crawl` |
|
| **Web** | `web_tools.py` | `web_search`, `web_extract`, `web_crawl` |
|
||||||
| **Terminal** | `terminal_tool.py` | `terminal` (local/docker/singularity/modal/ssh backends) |
|
| **Terminal** | `terminal_tool.py` | `terminal` (local/docker/singularity/modal/ssh backends) |
|
||||||
|
| **File** | `file_tools.py` | `read_file`, `write_file`, `patch`, `search` |
|
||||||
| **Browser** | `browser_tool.py` | `browser_navigate`, `browser_click`, `browser_type`, etc. |
|
| **Browser** | `browser_tool.py` | `browser_navigate`, `browser_click`, `browser_type`, etc. |
|
||||||
| **Vision** | `vision_tools.py` | `vision_analyze` |
|
| **Vision** | `vision_tools.py` | `vision_analyze` |
|
||||||
| **Image Gen** | `image_generation_tool.py` | `image_generate` |
|
| **Image Gen** | `image_generation_tool.py` | `image_generate` |
|
||||||
|
| **TTS** | `tts_tool.py` | `text_to_speech` (Edge TTS free / ElevenLabs / OpenAI) |
|
||||||
| **Reasoning** | `mixture_of_agents_tool.py` | `mixture_of_agents` |
|
| **Reasoning** | `mixture_of_agents_tool.py` | `mixture_of_agents` |
|
||||||
| **Skills** | `skills_tool.py` | `skills_categories`, `skills_list`, `skill_view` |
|
| **Skills** | `skills_tool.py` | `skills_list`, `skill_view` |
|
||||||
|
| **Cronjob** | `cronjob_tools.py` | `schedule_cronjob`, `list_cronjobs`, `remove_cronjob` |
|
||||||
|
| **RL Training** | `rl_training_tool.py` | `rl_list_environments`, `rl_start_training`, `rl_check_status`, etc. |
|
||||||
|
|
||||||
## Tool Registration
|
## Tool Registration
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,7 @@ Usage:
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
import signal
|
import signal
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
@ -583,13 +584,37 @@ class GatewayRunner:
|
||||||
|
|
||||||
# Return final response, or a message if something went wrong
|
# Return final response, or a message if something went wrong
|
||||||
final_response = result.get("final_response")
|
final_response = result.get("final_response")
|
||||||
if final_response:
|
if not final_response:
|
||||||
return final_response
|
if result.get("error"):
|
||||||
elif result.get("error"):
|
return f"⚠️ {result['error']}"
|
||||||
# Agent couldn't recover - show the error
|
|
||||||
return f"⚠️ {result['error']}"
|
|
||||||
else:
|
|
||||||
return "(No response generated)"
|
return "(No response generated)"
|
||||||
|
|
||||||
|
# Scan tool results in the conversation for MEDIA:<path> tags.
|
||||||
|
# The TTS tool (and potentially other media-producing tools) embed
|
||||||
|
# MEDIA: tags in their JSON responses, but the model's final reply
|
||||||
|
# typically doesn't include them -- it just says "here you go".
|
||||||
|
# We collect those tags and append them to the final response so
|
||||||
|
# the adapter's extract_media() can find and deliver the files.
|
||||||
|
media_tags = []
|
||||||
|
for msg in result.get("messages", []):
|
||||||
|
if msg.get("role") == "tool" or (msg.get("role") == "function"):
|
||||||
|
content = msg.get("content", "")
|
||||||
|
if "MEDIA:" in content:
|
||||||
|
# Extract MEDIA: tags from tool result (may be inside JSON).
|
||||||
|
# Strip trailing JSON artifacts like quotes and commas that
|
||||||
|
# get caught by the \S+ when the tag is inside a JSON string.
|
||||||
|
for match in re.finditer(r'MEDIA:(\S+)', content):
|
||||||
|
path = match.group(1).strip().rstrip('",}')
|
||||||
|
if path:
|
||||||
|
media_tags.append(f"MEDIA:{path}")
|
||||||
|
# Also capture the [[audio_as_voice]] directive
|
||||||
|
if "[[audio_as_voice]]" in content:
|
||||||
|
media_tags.insert(0, "[[audio_as_voice]]")
|
||||||
|
|
||||||
|
if media_tags:
|
||||||
|
final_response = final_response + "\n" + "\n".join(media_tags)
|
||||||
|
|
||||||
|
return final_response
|
||||||
|
|
||||||
# Start progress message sender if enabled
|
# Start progress message sender if enabled
|
||||||
progress_task = None
|
progress_task = None
|
||||||
|
|
|
||||||
|
|
@ -27,6 +27,8 @@ dependencies = [
|
||||||
# Tools
|
# Tools
|
||||||
"firecrawl-py",
|
"firecrawl-py",
|
||||||
"fal-client",
|
"fal-client",
|
||||||
|
# Text-to-speech (Edge TTS is free, no API key needed)
|
||||||
|
"edge-tts",
|
||||||
# mini-swe-agent deps (terminal tool)
|
# mini-swe-agent deps (terminal tool)
|
||||||
"litellm>=1.75.5",
|
"litellm>=1.75.5",
|
||||||
"typer",
|
"typer",
|
||||||
|
|
@ -39,12 +41,14 @@ dev = ["pytest", "pytest-asyncio"]
|
||||||
messaging = ["python-telegram-bot>=20.0", "discord.py>=2.0", "aiohttp>=3.9.0"]
|
messaging = ["python-telegram-bot>=20.0", "discord.py>=2.0", "aiohttp>=3.9.0"]
|
||||||
cron = ["croniter"]
|
cron = ["croniter"]
|
||||||
cli = ["simple-term-menu"]
|
cli = ["simple-term-menu"]
|
||||||
|
tts-premium = ["elevenlabs"]
|
||||||
all = [
|
all = [
|
||||||
"hermes-agent[modal]",
|
"hermes-agent[modal]",
|
||||||
"hermes-agent[messaging]",
|
"hermes-agent[messaging]",
|
||||||
"hermes-agent[cron]",
|
"hermes-agent[cron]",
|
||||||
"hermes-agent[cli]",
|
"hermes-agent[cli]",
|
||||||
"hermes-agent[dev]",
|
"hermes-agent[dev]",
|
||||||
|
"hermes-agent[tts-premium]",
|
||||||
]
|
]
|
||||||
|
|
||||||
[project.scripts]
|
[project.scripts]
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue