feat: add NeuTTS optional skill + local TTS provider backend

* feat(skills): add bundled neutts optional skill

Add NeuTTS optional skill with CLI scaffold, bootstrap helper, and
sample voice profile. Also fixes skills_hub.py to handle binary
assets (WAV files) during skill installation.

Changes:
- optional-skills/mlops/models/neutts/ — skill + CLI scaffold
- tools/skills_hub.py — binary asset support (read_bytes, write_bytes)
- tests/tools/test_skills_hub.py — regression tests for binary assets

* feat(tts): add NeuTTS as local TTS provider backend

Add NeuTTS as a fourth TTS provider option alongside Edge, ElevenLabs,
and OpenAI. NeuTTS runs fully on-device via neutts_cli — no API key
needed.

Provider behavior:
- Explicit: set tts.provider to 'neutts' in config.yaml
- Fallback: when Edge TTS is unavailable and neutts_cli is installed,
  automatically falls back to NeuTTS instead of failing
- check_tts_requirements() now includes NeuTTS in availability checks

NeuTTS outputs WAV natively. For Telegram voice bubbles, ffmpeg
converts to Opus (same pattern as Edge TTS).

Changes:
- tools/tts_tool.py — _generate_neutts(), _check_neutts_available(),
  provider dispatch, fallback logic, Opus conversion
- hermes_cli/config.py — tts.neutts config defaults

---------

Co-authored-by: unmodeled-tyler <unmodeled.tyler@proton.me>
This commit is contained in:
Teknium 2026-03-17 02:13:34 -07:00 committed by GitHub
parent 766f4aae2b
commit cb0deb5f9d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 1359 additions and 24 deletions

View file

@ -25,7 +25,7 @@ from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime, timezone
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple
from typing import Any, Dict, List, Optional, Tuple, Union
from urllib.parse import urlparse, urlunparse
import httpx
@ -77,7 +77,7 @@ class SkillMeta:
class SkillBundle:
"""A downloaded skill ready for quarantine/scanning/installation."""
name: str
files: Dict[str, str] # relative_path -> text content
files: Dict[str, Union[str, bytes]] # relative_path -> file content
source: str
identifier: str
trust_level: str
@ -1940,13 +1940,18 @@ class OptionalSkillSource(SkillSource):
else:
skill_dir = resolved
files: Dict[str, str] = {}
files: Dict[str, Union[str, bytes]] = {}
for f in skill_dir.rglob("*"):
if f.is_file() and not f.name.startswith("."):
if (
f.is_file()
and not f.name.startswith(".")
and "__pycache__" not in f.parts
and f.suffix != ".pyc"
):
rel_path = str(f.relative_to(skill_dir))
try:
files[rel_path] = f.read_text(encoding="utf-8")
except (OSError, UnicodeDecodeError):
files[rel_path] = f.read_bytes()
except OSError:
continue
if not files:
@ -2257,7 +2262,10 @@ def quarantine_bundle(bundle: SkillBundle) -> Path:
for rel_path, file_content in bundle.files.items():
file_dest = dest / rel_path
file_dest.parent.mkdir(parents=True, exist_ok=True)
file_dest.write_text(file_content, encoding="utf-8")
if isinstance(file_content, bytes):
file_dest.write_bytes(file_content)
else:
file_dest.write_text(file_content, encoding="utf-8")
return dest

View file

@ -2,10 +2,11 @@
"""
Text-to-Speech Tool Module
Supports three TTS providers:
Supports four TTS providers:
- Edge TTS (default, free, no API key): Microsoft Edge neural voices
- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
- OpenAI TTS: Good quality, needs OPENAI_API_KEY
- NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed
Output formats:
- Opus (.ogg) for Telegram voice bubbles (requires ffmpeg for Edge TTS and NeuTTS)
@ -72,6 +73,7 @@ DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"
DEFAULT_ELEVENLABS_STREAMING_MODEL_ID = "eleven_flash_v2_5"
DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts"
DEFAULT_OPENAI_VOICE = "alloy"
DEFAULT_NEUTTS_VOICE = "" # empty = use neutts_cli default voice
DEFAULT_OUTPUT_DIR = str(Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")) / "audio_cache")
MAX_TEXT_LENGTH = 4000
@ -258,6 +260,59 @@ def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any]
return output_path
# ===========================================================================
# NeuTTS (local, on-device TTS via neutts_cli)
# ===========================================================================
def _check_neutts_available() -> bool:
"""Check if neutts_cli is importable (installed locally)."""
try:
import importlib.util
return importlib.util.find_spec("neutts_cli") is not None
except Exception:
return False
def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """Generate speech using the local NeuTTS CLI.

    Runs ``neutts_cli.cli synth`` in a subprocess with the current Python
    interpreter. NeuTTS writes WAV, so synthesis always targets a ``.wav``
    path. When ``output_path`` has a different extension the WAV is
    converted with ffmpeg if it is on PATH, otherwise the WAV is simply
    moved to ``output_path``. The caller handles conversion to .ogg for
    Telegram if needed.

    Args:
        text: Text to synthesize.
        output_path: Destination path for the generated audio.
        tts_config: TTS configuration; the optional ``neutts`` mapping may
            carry a ``voice`` entry passed to the CLI as ``--voice``.

    Returns:
        ``output_path``.

    Raises:
        RuntimeError: The NeuTTS CLI exited with a non-zero status.
        subprocess.TimeoutExpired: Synthesis or conversion timed out.
        subprocess.CalledProcessError: ffmpeg conversion failed.
    """
    import sys

    # A bare ``neutts:`` key in the config yields None; treat it as empty.
    neutts_config = tts_config.get("neutts") or {}
    voice = neutts_config.get("voice", DEFAULT_NEUTTS_VOICE)

    # NeuTTS outputs WAV natively — generate to a .wav path and convert to
    # the requested format afterward. splitext only strips an extension of
    # the final path component, so dots in directory names are left alone.
    wav_path = output_path
    if not output_path.endswith(".wav"):
        wav_path = os.path.splitext(output_path)[0] + ".wav"

    cmd = [sys.executable, "-m", "neutts_cli.cli", "synth", "--text", text, "--out", wav_path]
    if voice:
        cmd.extend(["--voice", voice])

    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    if result.returncode != 0:
        stderr = result.stderr.strip()
        raise RuntimeError(f"NeuTTS synthesis failed: {stderr or 'unknown error'}")

    # If the caller wanted .mp3 or .ogg, convert from WAV
    if wav_path != output_path:
        ffmpeg = shutil.which("ffmpeg")
        if ffmpeg:
            conv_cmd = [ffmpeg, "-i", wav_path, "-y", "-loglevel", "error", output_path]
            try:
                subprocess.run(conv_cmd, check=True, timeout=30)
            finally:
                # Drop the intermediate WAV even when conversion fails, and
                # never let cleanup mask the conversion error.
                try:
                    os.remove(wav_path)
                except OSError:
                    pass
        else:
            # No ffmpeg — just move the WAV to the expected path.
            # os.replace overwrites an existing target on every platform.
            os.replace(wav_path, output_path)

    return output_path
# ===========================================================================
# Main tool function
# ===========================================================================
@ -342,26 +397,45 @@ def text_to_speech_tool(
logger.info("Generating speech with OpenAI TTS...")
_generate_openai_tts(text, file_str, tts_config)
elif provider == "neutts":
if not _check_neutts_available():
return json.dumps({
"success": False,
"error": "NeuTTS provider selected but neutts_cli is not installed. "
"Install the NeuTTS skill and run the bootstrap helper first."
}, ensure_ascii=False)
logger.info("Generating speech with NeuTTS (local)...")
_generate_neutts(text, file_str, tts_config)
else:
# Default: Edge TTS (free)
# Default: Edge TTS (free), with NeuTTS as local fallback
edge_available = True
try:
_import_edge_tts()
except ImportError:
edge_available = False
if edge_available:
logger.info("Generating speech with Edge TTS...")
try:
loop = asyncio.get_running_loop()
import concurrent.futures
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
pool.submit(
lambda: asyncio.run(_generate_edge_tts(text, file_str, tts_config))
).result(timeout=60)
except RuntimeError:
asyncio.run(_generate_edge_tts(text, file_str, tts_config))
elif _check_neutts_available():
logger.info("Edge TTS not available, falling back to NeuTTS (local)...")
provider = "neutts"
_generate_neutts(text, file_str, tts_config)
else:
return json.dumps({
"success": False,
"error": "Edge TTS not available. Run: pip install edge-tts"
"error": "No TTS provider available. Install edge-tts (pip install edge-tts) "
"or set up NeuTTS for local synthesis."
}, ensure_ascii=False)
logger.info("Generating speech with Edge TTS...")
# Edge TTS is async, run it
try:
loop = asyncio.get_running_loop()
import concurrent.futures
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
pool.submit(
lambda: asyncio.run(_generate_edge_tts(text, file_str, tts_config))
).result(timeout=60)
except RuntimeError:
asyncio.run(_generate_edge_tts(text, file_str, tts_config))
# Check the file was actually created
if not os.path.exists(file_str) or os.path.getsize(file_str) == 0:
@ -370,9 +444,10 @@ def text_to_speech_tool(
"error": f"TTS generation produced no output (provider: {provider})"
}, ensure_ascii=False)
# Try Opus conversion for Telegram compatibility (Edge TTS only outputs MP3)
# Try Opus conversion for Telegram compatibility
# Edge TTS outputs MP3, NeuTTS outputs WAV — both need ffmpeg conversion
voice_compatible = False
if provider == "edge" and file_str.endswith(".mp3"):
if provider in ("edge", "neutts") and not file_str.endswith(".ogg"):
opus_path = _convert_to_opus(file_str)
if opus_path:
file_str = opus_path
@ -444,6 +519,8 @@ def check_tts_requirements() -> bool:
return True
except ImportError:
pass
if _check_neutts_available():
return True
return False