feat: add NeuTTS optional skill + local TTS provider backend

* feat(skills): add bundled neutts optional skill

Add NeuTTS optional skill with CLI scaffold, bootstrap helper, and
sample voice profile. Also fixes skills_hub.py to handle binary
assets (WAV files) during skill installation.

Changes:
- optional-skills/mlops/models/neutts/ — skill + CLI scaffold
- tools/skills_hub.py — binary asset support (read_bytes, write_bytes)
- tests/tools/test_skills_hub.py — regression tests for binary assets

* feat(tts): add NeuTTS as local TTS provider backend

Add NeuTTS as a fourth TTS provider option alongside Edge, ElevenLabs,
and OpenAI. NeuTTS runs fully on-device via neutts_cli — no API key
needed.

Provider behavior:
- Explicit: set tts.provider to 'neutts' in config.yaml
- Fallback: when Edge TTS is unavailable and neutts_cli is installed,
  automatically falls back to NeuTTS instead of failing
- check_tts_requirements() now includes NeuTTS in availability checks

NeuTTS outputs WAV natively. For Telegram voice bubbles, ffmpeg
converts to Opus (same pattern as Edge TTS).

Changes:
- tools/tts_tool.py — _generate_neutts(), _check_neutts_available(),
  provider dispatch, fallback logic, Opus conversion
- hermes_cli/config.py — tts.neutts config defaults

---------

Co-authored-by: unmodeled-tyler <unmodeled.tyler@proton.me>
This commit is contained in:
Teknium 2026-03-17 02:13:34 -07:00 committed by GitHub
parent 766f4aae2b
commit cb0deb5f9d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 1359 additions and 24 deletions

View file

@ -2,10 +2,11 @@
"""
Text-to-Speech Tool Module
Supports three TTS providers:
Supports four TTS providers:
- Edge TTS (default, free, no API key): Microsoft Edge neural voices
- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
- OpenAI TTS: Good quality, needs OPENAI_API_KEY
- NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed
Output formats:
- Opus (.ogg) for Telegram voice bubbles (requires ffmpeg for Edge TTS)
@ -72,6 +73,7 @@ DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"
DEFAULT_ELEVENLABS_STREAMING_MODEL_ID = "eleven_flash_v2_5"
DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts"
DEFAULT_OPENAI_VOICE = "alloy"
DEFAULT_NEUTTS_VOICE = "" # empty = use neutts_cli default voice
DEFAULT_OUTPUT_DIR = str(Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")) / "audio_cache")
MAX_TEXT_LENGTH = 4000
@ -258,6 +260,59 @@ def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any]
return output_path
# ===========================================================================
# NeuTTS (local, on-device TTS via neutts_cli)
# ===========================================================================
def _check_neutts_available() -> bool:
"""Check if neutts_cli is importable (installed locally)."""
try:
import importlib.util
return importlib.util.find_spec("neutts_cli") is not None
except Exception:
return False
def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
"""Generate speech using the local NeuTTS CLI.
Calls neutts_cli.cli synth via subprocess. Outputs WAV by default;
the caller handles conversion to .ogg for Telegram if needed.
"""
import sys
neutts_config = tts_config.get("neutts", {})
voice = neutts_config.get("voice", DEFAULT_NEUTTS_VOICE)
# NeuTTS outputs WAV natively — use a .wav path for generation,
# let the caller convert to the final format afterward.
wav_path = output_path
if not output_path.endswith(".wav"):
wav_path = output_path.rsplit(".", 1)[0] + ".wav"
cmd = [sys.executable, "-m", "neutts_cli.cli", "synth", "--text", text, "--out", wav_path]
if voice:
cmd.extend(["--voice", voice])
result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
if result.returncode != 0:
stderr = result.stderr.strip()
raise RuntimeError(f"NeuTTS synthesis failed: {stderr or 'unknown error'}")
# If the caller wanted .mp3 or .ogg, convert from WAV
if wav_path != output_path:
ffmpeg = shutil.which("ffmpeg")
if ffmpeg:
conv_cmd = [ffmpeg, "-i", wav_path, "-y", "-loglevel", "error", output_path]
subprocess.run(conv_cmd, check=True, timeout=30)
os.remove(wav_path)
else:
# No ffmpeg — just rename the WAV to the expected path
os.rename(wav_path, output_path)
return output_path
# ===========================================================================
# Main tool function
# ===========================================================================
@ -342,26 +397,45 @@ def text_to_speech_tool(
logger.info("Generating speech with OpenAI TTS...")
_generate_openai_tts(text, file_str, tts_config)
elif provider == "neutts":
if not _check_neutts_available():
return json.dumps({
"success": False,
"error": "NeuTTS provider selected but neutts_cli is not installed. "
"Install the NeuTTS skill and run the bootstrap helper first."
}, ensure_ascii=False)
logger.info("Generating speech with NeuTTS (local)...")
_generate_neutts(text, file_str, tts_config)
else:
# Default: Edge TTS (free)
# Default: Edge TTS (free), with NeuTTS as local fallback
edge_available = True
try:
_import_edge_tts()
except ImportError:
edge_available = False
if edge_available:
logger.info("Generating speech with Edge TTS...")
try:
loop = asyncio.get_running_loop()
import concurrent.futures
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
pool.submit(
lambda: asyncio.run(_generate_edge_tts(text, file_str, tts_config))
).result(timeout=60)
except RuntimeError:
asyncio.run(_generate_edge_tts(text, file_str, tts_config))
elif _check_neutts_available():
logger.info("Edge TTS not available, falling back to NeuTTS (local)...")
provider = "neutts"
_generate_neutts(text, file_str, tts_config)
else:
return json.dumps({
"success": False,
"error": "Edge TTS not available. Run: pip install edge-tts"
"error": "No TTS provider available. Install edge-tts (pip install edge-tts) "
"or set up NeuTTS for local synthesis."
}, ensure_ascii=False)
logger.info("Generating speech with Edge TTS...")
# Edge TTS is async, run it
try:
loop = asyncio.get_running_loop()
import concurrent.futures
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
pool.submit(
lambda: asyncio.run(_generate_edge_tts(text, file_str, tts_config))
).result(timeout=60)
except RuntimeError:
asyncio.run(_generate_edge_tts(text, file_str, tts_config))
# Check the file was actually created
if not os.path.exists(file_str) or os.path.getsize(file_str) == 0:
@ -370,9 +444,10 @@ def text_to_speech_tool(
"error": f"TTS generation produced no output (provider: {provider})"
}, ensure_ascii=False)
# Try Opus conversion for Telegram compatibility (Edge TTS only outputs MP3)
# Try Opus conversion for Telegram compatibility
# Edge TTS outputs MP3, NeuTTS outputs WAV — both need ffmpeg conversion
voice_compatible = False
if provider == "edge" and file_str.endswith(".mp3"):
if provider in ("edge", "neutts") and not file_str.endswith(".ogg"):
opus_path = _convert_to_opus(file_str)
if opus_path:
file_str = opus_path
@ -444,6 +519,8 @@ def check_tts_requirements() -> bool:
return True
except ImportError:
pass
if _check_neutts_available():
return True
return False