refactor(tts): replace NeuTTS optional skill with built-in provider + setup flow

Remove the optional skill (redundant now that NeuTTS is a built-in TTS
provider). Replace neutts_cli dependency with a standalone synthesis
helper (tools/neutts_synth.py) that calls the neutts Python API directly
in a subprocess.

Add TTS provider selection to hermes setup:
- 'hermes setup' now prompts for TTS provider after model selection
- 'hermes setup tts' available as standalone section
- Selecting NeuTTS checks for deps and offers to install:
  espeak-ng (system) + neutts[all] (pip)
- ElevenLabs/OpenAI selections prompt for API keys
- Tool status display shows NeuTTS install state

Changes:
- Remove optional-skills/mlops/models/neutts/ (skill + CLI scaffold)
- Add tools/neutts_synth.py (standalone synthesis subprocess helper)
- Move jo.wav/jo.txt to tools/neutts_samples/ (bundled default voice)
- Refactor _generate_neutts() — uses neutts API via subprocess, no
  neutts_cli dependency, config-driven ref_audio/ref_text/model/device
- Add TTS setup to hermes_cli/setup.py (SETUP_SECTIONS, tool status)
- Update config.py defaults (ref_audio, ref_text, model, device)
This commit is contained in:
Teknium 2026-03-17 02:33:12 -07:00 committed by GitHub
parent e2e53d497f
commit d50e0711c2
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
15 changed files with 310 additions and 1192 deletions

View file

@ -0,0 +1 @@
So I just tried Neuphonic and Im genuinely impressed. It's super responsive, it sounds clean, supports voice cloning, and the agent feature is fun to play with too. Highly recommend it for podcasts, conversations, or even just messing around with voiceovers.

BIN
tools/neutts_samples/jo.wav Normal file

Binary file not shown.

104
tools/neutts_synth.py Normal file
View file

@ -0,0 +1,104 @@
#!/usr/bin/env python3
"""Standalone NeuTTS synthesis helper.
Called by tts_tool.py via subprocess to keep the TTS model (~500MB)
in a separate process that exits after synthesis, leaving no lingering memory.
Usage:
python -m tools.neutts_synth --text "Hello" --out output.wav \
--ref-audio samples/jo.wav --ref-text samples/jo.txt
Requires: pip install neutts[all]
System: apt install espeak-ng (or brew install espeak-ng)
"""
import argparse
import struct
import sys
from pathlib import Path
def _write_wav(path: str, samples, sample_rate: int = 24000) -> None:
"""Write a WAV file from float32 samples (no soundfile dependency)."""
import numpy as np
if not isinstance(samples, np.ndarray):
samples = np.array(samples, dtype=np.float32)
samples = samples.flatten()
# Clamp and convert to int16
samples = np.clip(samples, -1.0, 1.0)
pcm = (samples * 32767).astype(np.int16)
num_channels = 1
bits_per_sample = 16
byte_rate = sample_rate * num_channels * (bits_per_sample // 8)
block_align = num_channels * (bits_per_sample // 8)
data_size = len(pcm) * (bits_per_sample // 8)
with open(path, "wb") as f:
f.write(b"RIFF")
f.write(struct.pack("<I", 36 + data_size))
f.write(b"WAVE")
f.write(b"fmt ")
f.write(struct.pack("<IHHIIHH", 16, 1, num_channels, sample_rate,
byte_rate, block_align, bits_per_sample))
f.write(b"data")
f.write(struct.pack("<I", data_size))
f.write(pcm.tobytes())
def main(argv=None):
    """Synthesize speech with NeuTTS and write it to a WAV file.

    Args:
        argv: Optional list of command-line arguments. When ``None`` (the
            default) the arguments are taken from ``sys.argv``.

    Exits with status 1, after printing a message to stderr, when a
    reference file is missing or the ``neutts`` package cannot be imported.
    On success prints ``OK: <path>`` to stderr.
    """
    parser = argparse.ArgumentParser(description="NeuTTS synthesis helper")
    parser.add_argument("--text", required=True, help="Text to synthesize")
    parser.add_argument("--out", required=True, help="Output WAV path")
    parser.add_argument("--ref-audio", required=True, help="Reference voice audio path")
    parser.add_argument("--ref-text", required=True, help="Reference voice transcript path")
    parser.add_argument("--model", default="neuphonic/neutts-air-q4-gguf",
                        help="HuggingFace backbone model repo")
    parser.add_argument("--device", default="cpu", help="Device (cpu/cuda/mps)")
    args = parser.parse_args(argv)

    # Validate inputs before paying the cost of loading the model.
    ref_audio = Path(args.ref_audio).expanduser()
    ref_text_path = Path(args.ref_text).expanduser()
    for label, candidate in (
        ("reference audio", ref_audio),
        ("reference text", ref_text_path),
    ):
        if not candidate.exists():
            print(f"Error: {label} not found: {candidate}", file=sys.stderr)
            sys.exit(1)

    ref_text = ref_text_path.read_text(encoding="utf-8").strip()

    # Import lazily so a missing install yields a readable error, not a traceback.
    try:
        from neutts import NeuTTS
    except ImportError:
        print("Error: neutts not installed. Run: pip install neutts[all]", file=sys.stderr)
        sys.exit(1)

    tts = NeuTTS(
        backbone_repo=args.model,
        backbone_device=args.device,
        codec_repo="neuphonic/neucodec",
        codec_device=args.device,
    )
    ref_codes = tts.encode_reference(str(ref_audio))
    wav = tts.infer(args.text, ref_codes, ref_text)

    # Expand "~" like the reference paths: when this script is launched with an
    # argv list (no shell), a leading "~" would otherwise create a literal
    # directory named "~".
    out_path = Path(args.out).expanduser()
    out_path.parent.mkdir(parents=True, exist_ok=True)

    sample_rate = 24000
    # Only the import is guarded; errors raised while writing must surface.
    try:
        import soundfile as sf
    except ImportError:
        sf = None

    if sf is not None:
        sf.write(str(out_path), wav, sample_rate)
    else:
        _write_wav(str(out_path), wav, sample_rate)

    print(f"OK: {out_path}", file=sys.stderr)


if __name__ == "__main__":
    main()

View file

@ -73,7 +73,6 @@ DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"
DEFAULT_ELEVENLABS_STREAMING_MODEL_ID = "eleven_flash_v2_5"
DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts"
DEFAULT_OPENAI_VOICE = "alloy"
DEFAULT_NEUTTS_VOICE = "" # empty = use neutts_cli default voice
DEFAULT_OUTPUT_DIR = str(Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")) / "audio_cache")
MAX_TEXT_LENGTH = 4000
@ -265,24 +264,38 @@ def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any]
# ===========================================================================
def _check_neutts_available() -> bool:
"""Check if neutts_cli is importable (installed locally)."""
"""Check if the neutts engine is importable (installed locally)."""
try:
import importlib.util
return importlib.util.find_spec("neutts_cli") is not None
return importlib.util.find_spec("neutts") is not None
except Exception:
return False
def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
"""Generate speech using the local NeuTTS CLI.
def _default_neutts_ref_audio() -> str:
"""Return path to the bundled default voice reference audio."""
return str(Path(__file__).parent / "neutts_samples" / "jo.wav")
Calls neutts_cli.cli synth via subprocess. Outputs WAV by default;
the caller handles conversion to .ogg for Telegram if needed.
def _default_neutts_ref_text() -> str:
"""Return path to the bundled default voice reference transcript."""
return str(Path(__file__).parent / "neutts_samples" / "jo.txt")
def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
"""Generate speech using the local NeuTTS engine.
Runs synthesis in a subprocess via tools/neutts_synth.py to keep the
~500MB model in a separate process that exits after synthesis.
Outputs WAV; the caller handles conversion for Telegram if needed.
"""
import sys
neutts_config = tts_config.get("neutts", {})
voice = neutts_config.get("voice", DEFAULT_NEUTTS_VOICE)
ref_audio = neutts_config.get("ref_audio", "") or _default_neutts_ref_audio()
ref_text = neutts_config.get("ref_text", "") or _default_neutts_ref_text()
model = neutts_config.get("model", "neuphonic/neutts-air-q4-gguf")
device = neutts_config.get("device", "cpu")
# NeuTTS outputs WAV natively — use a .wav path for generation,
# let the caller convert to the final format afterward.
@ -290,14 +303,23 @@ def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) ->
if not output_path.endswith(".wav"):
wav_path = output_path.rsplit(".", 1)[0] + ".wav"
cmd = [sys.executable, "-m", "neutts_cli.cli", "synth", "--text", text, "--out", wav_path]
if voice:
cmd.extend(["--voice", voice])
synth_script = str(Path(__file__).parent / "neutts_synth.py")
cmd = [
sys.executable, synth_script,
"--text", text,
"--out", wav_path,
"--ref-audio", ref_audio,
"--ref-text", ref_text,
"--model", model,
"--device", device,
]
result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
if result.returncode != 0:
stderr = result.stderr.strip()
raise RuntimeError(f"NeuTTS synthesis failed: {stderr or 'unknown error'}")
# Filter out the "OK:" line from stderr
error_lines = [l for l in stderr.splitlines() if not l.startswith("OK:")]
raise RuntimeError(f"NeuTTS synthesis failed: {chr(10).join(error_lines) or 'unknown error'}")
# If the caller wanted .mp3 or .ogg, convert from WAV
if wav_path != output_path: