refactor(tts): replace NeuTTS optional skill with built-in provider + setup flow

Remove the optional skill (redundant now that NeuTTS is a built-in TTS provider). Replace the neutts_cli dependency with a standalone synthesis helper (tools/neutts_synth.py) that calls the neutts Python API directly in a subprocess.

Add TTS provider selection to hermes setup:
- 'hermes setup' now prompts for TTS provider after model selection
- 'hermes setup tts' available as standalone section
- Selecting NeuTTS checks for deps and offers to install: espeak-ng (system) + neutts[all] (pip)
- ElevenLabs/OpenAI selections prompt for API keys
- Tool status display shows NeuTTS install state

Changes:
- Remove optional-skills/mlops/models/neutts/ (skill + CLI scaffold)
- Add tools/neutts_synth.py (standalone synthesis subprocess helper)
- Move jo.wav/jo.txt to tools/neutts_samples/ (bundled default voice)
- Refactor _generate_neutts() — uses neutts API via subprocess, no neutts_cli dependency, config-driven ref_audio/ref_text/model/device
- Add TTS setup to hermes_cli/setup.py (SETUP_SECTIONS, tool status)
- Update config.py defaults (ref_audio, ref_text, model, device)
2026-03-17 02:33:12 -07:00 · 2026-03-17 02:33:12 -07:00 · d50e0711c2
commit d50e0711c2
parent e2e53d497f
15 changed files with 310 additions and 1192 deletions
--- a/tools/neutts_samples/jo.txt
+++ b/tools/neutts_samples/jo.txt
@ -0,0 +1 @@
+So I just tried Neuphonic and I’m genuinely impressed. It's super responsive, it sounds clean, supports voice cloning, and the agent feature is fun to play with too. Highly recommend it for podcasts, conversations, or even just messing around with voiceovers.
--- a/tools/neutts_samples/jo.wav
+++ b/tools/neutts_samples/jo.wav
--- a/tools/neutts_synth.py
+++ b/tools/neutts_synth.py
@ -0,0 +1,104 @@
+#!/usr/bin/env python3
+"""Standalone NeuTTS synthesis helper.
+
+Called by tts_tool.py via subprocess to keep the TTS model (~500MB)
+in a separate process that exits after synthesis — no lingering memory.
+
+Usage:
+    python -m tools.neutts_synth --text "Hello" --out output.wav \
+        --ref-audio tools/neutts_samples/jo.wav --ref-text tools/neutts_samples/jo.txt
+
+Requires: pip install neutts[all]
+System:   apt install espeak-ng  (or brew install espeak-ng)
+"""
+
+import argparse
+import struct
+import sys
+from pathlib import Path
+
+
+def _write_wav(path: str, samples, sample_rate: int = 24000) -> None:
+    """Write mono 16-bit PCM WAV data from float samples (no soundfile dependency).
+
+    Args:
+        path: Destination file path; an existing file is overwritten.
+        samples: Float audio samples, nominally in [-1.0, 1.0]. May be a NumPy
+            array or any array-like; multi-dimensional input is flattened.
+        sample_rate: Sample rate in Hz recorded in the WAV header.
+    """
+    import wave
+
+    import numpy as np
+
+    if not isinstance(samples, np.ndarray):
+        samples = np.array(samples, dtype=np.float32)
+
+    # Clamp out-of-range values, then scale to signed 16-bit PCM.
+    pcm = (np.clip(samples.flatten(), -1.0, 1.0) * 32767).astype(np.int16)
+
+    # The stdlib wave module emits the RIFF/fmt/data headers (patching the
+    # sizes on close) and byte-swaps native-order frames on big-endian hosts,
+    # so the file is always valid little-endian PCM.
+    with wave.open(path, "wb") as wav_file:
+        wav_file.setnchannels(1)
+        wav_file.setsampwidth(2)  # bytes per sample -> 16-bit
+        wav_file.setframerate(sample_rate)
+        wav_file.writeframes(pcm.tobytes())
+
+
+def main(argv=None):
+    """Parse CLI arguments, synthesize speech with NeuTTS, and write a WAV file.
+
+    Args:
+        argv: Argument list to parse. ``None`` (the default) makes argparse
+            read ``sys.argv[1:]``, so the command-line behaviour is unchanged.
+
+    Exits with status 1 (message on stderr) when a reference file is missing
+    or the ``neutts`` package cannot be imported. On success the output path
+    is reported on stderr as ``OK: <path>``.
+    """
+    sample_rate = 24000  # rate used by both WAV writers below
+
+    parser = argparse.ArgumentParser(description="NeuTTS synthesis helper")
+    parser.add_argument("--text", required=True, help="Text to synthesize")
+    parser.add_argument("--out", required=True, help="Output WAV path")
+    parser.add_argument("--ref-audio", required=True, help="Reference voice audio path")
+    parser.add_argument("--ref-text", required=True, help="Reference voice transcript path")
+    parser.add_argument("--model", default="neuphonic/neutts-air-q4-gguf",
+                        help="HuggingFace backbone model repo")
+    parser.add_argument("--device", default="cpu", help="Device (cpu/cuda/mps)")
+    args = parser.parse_args(argv)
+
+    # Validate inputs. is_file() (rather than exists()) also rejects
+    # directories, which would otherwise fail later with a raw traceback.
+    ref_audio = Path(args.ref_audio).expanduser()
+    ref_text_path = Path(args.ref_text).expanduser()
+    if not ref_audio.is_file():
+        print(f"Error: reference audio not found: {ref_audio}", file=sys.stderr)
+        sys.exit(1)
+    if not ref_text_path.is_file():
+        print(f"Error: reference text not found: {ref_text_path}", file=sys.stderr)
+        sys.exit(1)
+
+    ref_text = ref_text_path.read_text(encoding="utf-8").strip()
+
+    # Import lazily so a missing install yields a clear message, not a traceback.
+    try:
+        from neutts import NeuTTS
+    except ImportError:
+        print("Error: neutts not installed. Run: pip install neutts[all]", file=sys.stderr)
+        sys.exit(1)
+
+    tts = NeuTTS(
+        backbone_repo=args.model,
+        backbone_device=args.device,
+        codec_repo="neuphonic/neucodec",
+        codec_device=args.device,
+    )
+    ref_codes = tts.encode_reference(str(ref_audio))
+    wav = tts.infer(args.text, ref_codes, ref_text)
+
+    # Write output. Expand "~" here too, consistent with the reference paths,
+    # so "--out ~/x.wav" does not create a literal "~" directory.
+    out_path = Path(args.out).expanduser()
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    # Prefer soundfile when available; otherwise use the built-in WAV writer.
+    try:
+        import soundfile as sf
+        sf.write(str(out_path), wav, sample_rate)
+    except ImportError:
+        _write_wav(str(out_path), wav, sample_rate)
+
+    # Status goes to stderr.
+    print(f"OK: {out_path}", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
--- a/tools/tts_tool.py
+++ b/tools/tts_tool.py
@ -73,7 +73,6 @@ DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"
 DEFAULT_ELEVENLABS_STREAMING_MODEL_ID = "eleven_flash_v2_5"
 DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts"
 DEFAULT_OPENAI_VOICE = "alloy"
-DEFAULT_NEUTTS_VOICE = ""  # empty = use neutts_cli default voice
 DEFAULT_OUTPUT_DIR = str(Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")) / "audio_cache")
 MAX_TEXT_LENGTH = 4000

@ -265,24 +264,38 @@ def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any]
 # ===========================================================================

 def _check_neutts_available() -> bool:
-    """Check if neutts_cli is importable (installed locally)."""
+    """Check if the neutts engine is importable (installed locally).
+
+    Only looks up the module spec via importlib; the package itself is not
+    imported. Returns False if the lookup raises for any reason.
+    """
    try:
        import importlib.util
-        return importlib.util.find_spec("neutts_cli") is not None
+        return importlib.util.find_spec("neutts") is not None
    except Exception:
        return False


-def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
-    """Generate speech using the local NeuTTS CLI.
+def _default_neutts_ref_audio() -> str:
+    """Return the path of the default voice clip, neutts_samples/jo.wav beside this module."""
+    samples_dir = Path(__file__).parent / "neutts_samples"
+    return str(samples_dir / "jo.wav")

-    Calls neutts_cli.cli synth via subprocess. Outputs WAV by default;
-    the caller handles conversion to .ogg for Telegram if needed.
+
+def _default_neutts_ref_text() -> str:
+    """Return the path of the default voice transcript, neutts_samples/jo.txt beside this module."""
+    samples_dir = Path(__file__).parent / "neutts_samples"
+    return str(samples_dir / "jo.txt")
+
+
+def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
+    """Generate speech using the local NeuTTS engine.
+
+    Runs synthesis in a subprocess via tools/neutts_synth.py to keep the
+    ~500MB model in a separate process that exits after synthesis.
+    Outputs WAV; the caller handles conversion for Telegram if needed.
    """
    import sys

    neutts_config = tts_config.get("neutts", {})
-    voice = neutts_config.get("voice", DEFAULT_NEUTTS_VOICE)
+    ref_audio = neutts_config.get("ref_audio", "") or _default_neutts_ref_audio()
+    ref_text = neutts_config.get("ref_text", "") or _default_neutts_ref_text()
+    model = neutts_config.get("model", "neuphonic/neutts-air-q4-gguf")
+    device = neutts_config.get("device", "cpu")

    # NeuTTS outputs WAV natively — use a .wav path for generation,
    # let the caller convert to the final format afterward.
@ -290,14 +303,23 @@ def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) ->
    if not output_path.endswith(".wav"):
        wav_path = output_path.rsplit(".", 1)[0] + ".wav"

-    cmd = [sys.executable, "-m", "neutts_cli.cli", "synth", "--text", text, "--out", wav_path]
-    if voice:
-        cmd.extend(["--voice", voice])
+    synth_script = str(Path(__file__).parent / "neutts_synth.py")
+    cmd = [
+        sys.executable, synth_script,
+        "--text", text,
+        "--out", wav_path,
+        "--ref-audio", ref_audio,
+        "--ref-text", ref_text,
+        "--model", model,
+        "--device", device,
+    ]

    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    if result.returncode != 0:
        stderr = result.stderr.strip()
-        raise RuntimeError(f"NeuTTS synthesis failed: {stderr or 'unknown error'}")
+        # Filter out the "OK:" line from stderr
+        error_lines = [l for l in stderr.splitlines() if not l.startswith("OK:")]
+        raise RuntimeError(f"NeuTTS synthesis failed: {chr(10).join(error_lines) or 'unknown error'}")

    # If the caller wanted .mp3 or .ogg, convert from WAV
    if wav_path != output_path:
				`@ -0,0 +1 @@`
				`So I just tried Neuphonic and I’m genuinely impressed. It's super responsive, it sounds clean, supports voice cloning, and the agent feature is fun to play with too. Highly recommend it for podcasts, conversations, or even just messing around with voiceovers.`