feat: add NeuTTS optional skill + local TTS provider backend
* feat(skills): add bundled neutts optional skill Add NeuTTS optional skill with CLI scaffold, bootstrap helper, and sample voice profile. Also fixes skills_hub.py to handle binary assets (WAV files) during skill installation. Changes: - optional-skills/mlops/models/neutts/ — skill + CLI scaffold - tools/skills_hub.py — binary asset support (read_bytes, write_bytes) - tests/tools/test_skills_hub.py — regression tests for binary assets * feat(tts): add NeuTTS as local TTS provider backend Add NeuTTS as a fourth TTS provider option alongside Edge, ElevenLabs, and OpenAI. NeuTTS runs fully on-device via neutts_cli — no API key needed. Provider behavior: - Explicit: set tts.provider to 'neutts' in config.yaml - Fallback: when Edge TTS is unavailable and neutts_cli is installed, automatically falls back to NeuTTS instead of failing - check_tts_requirements() now includes NeuTTS in availability checks NeuTTS outputs WAV natively. For Telegram voice bubbles, ffmpeg converts to Opus (same pattern as Edge TTS). Changes: - tools/tts_tool.py — _generate_neutts(), _check_neutts_available(), provider dispatch, fallback logic, Opus conversion - hermes_cli/config.py — tts.neutts config defaults --------- Co-authored-by: unmodeled-tyler <unmodeled.tyler@proton.me>
This commit is contained in:
parent
766f4aae2b
commit
cb0deb5f9d
15 changed files with 1359 additions and 24 deletions
|
|
@ -25,7 +25,7 @@ from abc import ABC, abstractmethod
|
|||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
from typing import Any, Dict, List, Optional, Tuple, Union
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
|
||||
import httpx
|
||||
|
|
@ -77,7 +77,7 @@ class SkillMeta:
|
|||
class SkillBundle:
|
||||
"""A downloaded skill ready for quarantine/scanning/installation."""
|
||||
name: str
|
||||
files: Dict[str, str] # relative_path -> text content
|
||||
files: Dict[str, Union[str, bytes]] # relative_path -> file content
|
||||
source: str
|
||||
identifier: str
|
||||
trust_level: str
|
||||
|
|
@ -1940,13 +1940,18 @@ class OptionalSkillSource(SkillSource):
|
|||
else:
|
||||
skill_dir = resolved
|
||||
|
||||
files: Dict[str, str] = {}
|
||||
files: Dict[str, Union[str, bytes]] = {}
|
||||
for f in skill_dir.rglob("*"):
|
||||
if f.is_file() and not f.name.startswith("."):
|
||||
if (
|
||||
f.is_file()
|
||||
and not f.name.startswith(".")
|
||||
and "__pycache__" not in f.parts
|
||||
and f.suffix != ".pyc"
|
||||
):
|
||||
rel_path = str(f.relative_to(skill_dir))
|
||||
try:
|
||||
files[rel_path] = f.read_text(encoding="utf-8")
|
||||
except (OSError, UnicodeDecodeError):
|
||||
files[rel_path] = f.read_bytes()
|
||||
except OSError:
|
||||
continue
|
||||
|
||||
if not files:
|
||||
|
|
@ -2257,7 +2262,10 @@ def quarantine_bundle(bundle: SkillBundle) -> Path:
|
|||
for rel_path, file_content in bundle.files.items():
|
||||
file_dest = dest / rel_path
|
||||
file_dest.parent.mkdir(parents=True, exist_ok=True)
|
||||
file_dest.write_text(file_content, encoding="utf-8")
|
||||
if isinstance(file_content, bytes):
|
||||
file_dest.write_bytes(file_content)
|
||||
else:
|
||||
file_dest.write_text(file_content, encoding="utf-8")
|
||||
|
||||
return dest
|
||||
|
||||
|
|
|
|||
|
|
@ -2,10 +2,11 @@
|
|||
"""
|
||||
Text-to-Speech Tool Module
|
||||
|
||||
Supports three TTS providers:
|
||||
Supports four TTS providers:
|
||||
- Edge TTS (default, free, no API key): Microsoft Edge neural voices
|
||||
- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
|
||||
- OpenAI TTS: Good quality, needs OPENAI_API_KEY
|
||||
- NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed
|
||||
|
||||
Output formats:
|
||||
- Opus (.ogg) for Telegram voice bubbles (requires ffmpeg for Edge TTS and NeuTTS)
|
||||
|
|
@ -72,6 +73,7 @@ DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"
|
|||
DEFAULT_ELEVENLABS_STREAMING_MODEL_ID = "eleven_flash_v2_5"
|
||||
DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts"
|
||||
DEFAULT_OPENAI_VOICE = "alloy"
|
||||
DEFAULT_NEUTTS_VOICE = "" # empty = use neutts_cli default voice
|
||||
DEFAULT_OUTPUT_DIR = str(Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")) / "audio_cache")
|
||||
MAX_TEXT_LENGTH = 4000
|
||||
|
||||
|
|
@ -258,6 +260,59 @@ def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any]
|
|||
return output_path
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# NeuTTS (local, on-device TTS via neutts_cli)
|
||||
# ===========================================================================
|
||||
|
||||
def _check_neutts_available() -> bool:
|
||||
"""Check if neutts_cli is importable (installed locally)."""
|
||||
try:
|
||||
import importlib.util
|
||||
return importlib.util.find_spec("neutts_cli") is not None
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """Generate speech using the local NeuTTS CLI.

    Runs ``neutts_cli.cli synth`` in a subprocess using the current Python
    interpreter. NeuTTS writes WAV, so when *output_path* does not end in
    ``.wav`` the audio is first synthesized to a sibling ``.wav`` file and
    then converted to *output_path* with ffmpeg. If ffmpeg is not on PATH,
    the WAV file is simply moved to *output_path* unchanged.

    Args:
        text: The text to synthesize.
        output_path: Destination path for the generated audio.
        tts_config: TTS configuration; the optional ``"neutts"`` mapping may
            provide a ``"voice"`` entry that is passed to the CLI via
            ``--voice`` when non-empty.

    Returns:
        *output_path*, for the caller's convenience.

    Raises:
        RuntimeError: If the synth subprocess exits with a non-zero status.
        subprocess.TimeoutExpired: If synthesis (120 s) or conversion (30 s)
            exceeds its timeout.
        subprocess.CalledProcessError: If the ffmpeg conversion fails.
    """
    import sys

    # ``or {}`` also covers a config where the "neutts" key is present but null.
    neutts_config = tts_config.get("neutts") or {}
    voice = neutts_config.get("voice", DEFAULT_NEUTTS_VOICE)

    # NeuTTS outputs WAV natively — use a .wav path for generation and
    # convert to the requested format afterward.
    wav_path = output_path
    if not output_path.endswith(".wav"):
        # splitext only strips an extension from the final path component, so
        # a dot in a parent directory (e.g. ~/.hermes) cannot truncate the path.
        wav_path = os.path.splitext(output_path)[0] + ".wav"

    cmd = [sys.executable, "-m", "neutts_cli.cli", "synth", "--text", text, "--out", wav_path]
    if voice:
        cmd.extend(["--voice", voice])

    result = subprocess.run(cmd, capture_output=True, text=True, timeout=120)
    if result.returncode != 0:
        stderr = result.stderr.strip()
        raise RuntimeError(f"NeuTTS synthesis failed: {stderr or 'unknown error'}")

    # If the caller wanted .mp3 or .ogg, convert from WAV
    if wav_path != output_path:
        ffmpeg = shutil.which("ffmpeg")
        if ffmpeg:
            conv_cmd = [ffmpeg, "-i", wav_path, "-y", "-loglevel", "error", output_path]
            try:
                subprocess.run(conv_cmd, check=True, timeout=30)
            finally:
                # Drop the intermediate WAV whether or not conversion
                # succeeded, so failures do not leak files into the cache.
                try:
                    os.remove(wav_path)
                except FileNotFoundError:
                    pass
        else:
            # No ffmpeg — just move the WAV to the expected path. os.replace
            # overwrites an existing destination on every platform, matching
            # ffmpeg's "-y" behavior above.
            os.replace(wav_path, output_path)

    return output_path
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Main tool function
|
||||
# ===========================================================================
|
||||
|
|
@ -342,26 +397,45 @@ def text_to_speech_tool(
|
|||
logger.info("Generating speech with OpenAI TTS...")
|
||||
_generate_openai_tts(text, file_str, tts_config)
|
||||
|
||||
elif provider == "neutts":
|
||||
if not _check_neutts_available():
|
||||
return json.dumps({
|
||||
"success": False,
|
||||
"error": "NeuTTS provider selected but neutts_cli is not installed. "
|
||||
"Install the NeuTTS skill and run the bootstrap helper first."
|
||||
}, ensure_ascii=False)
|
||||
logger.info("Generating speech with NeuTTS (local)...")
|
||||
_generate_neutts(text, file_str, tts_config)
|
||||
|
||||
else:
|
||||
# Default: Edge TTS (free)
|
||||
# Default: Edge TTS (free), with NeuTTS as local fallback
|
||||
edge_available = True
|
||||
try:
|
||||
_import_edge_tts()
|
||||
except ImportError:
|
||||
edge_available = False
|
||||
|
||||
if edge_available:
|
||||
logger.info("Generating speech with Edge TTS...")
|
||||
try:
|
||||
loop = asyncio.get_running_loop()
|
||||
import concurrent.futures
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
|
||||
pool.submit(
|
||||
lambda: asyncio.run(_generate_edge_tts(text, file_str, tts_config))
|
||||
).result(timeout=60)
|
||||
except RuntimeError:
|
||||
asyncio.run(_generate_edge_tts(text, file_str, tts_config))
|
||||
elif _check_neutts_available():
|
||||
logger.info("Edge TTS not available, falling back to NeuTTS (local)...")
|
||||
provider = "neutts"
|
||||
_generate_neutts(text, file_str, tts_config)
|
||||
else:
|
||||
return json.dumps({
|
||||
"success": False,
|
||||
"error": "Edge TTS not available. Run: pip install edge-tts"
|
||||
"error": "No TTS provider available. Install edge-tts (pip install edge-tts) "
|
||||
"or set up NeuTTS for local synthesis."
|
||||
}, ensure_ascii=False)
|
||||
logger.info("Generating speech with Edge TTS...")
|
||||
# Edge TTS is async, run it
|
||||
try:
|
||||
loop = asyncio.get_running_loop()
|
||||
import concurrent.futures
|
||||
with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
|
||||
pool.submit(
|
||||
lambda: asyncio.run(_generate_edge_tts(text, file_str, tts_config))
|
||||
).result(timeout=60)
|
||||
except RuntimeError:
|
||||
asyncio.run(_generate_edge_tts(text, file_str, tts_config))
|
||||
|
||||
# Check the file was actually created
|
||||
if not os.path.exists(file_str) or os.path.getsize(file_str) == 0:
|
||||
|
|
@ -370,9 +444,10 @@ def text_to_speech_tool(
|
|||
"error": f"TTS generation produced no output (provider: {provider})"
|
||||
}, ensure_ascii=False)
|
||||
|
||||
# Try Opus conversion for Telegram compatibility (Edge TTS only outputs MP3)
|
||||
# Try Opus conversion for Telegram compatibility
|
||||
# Edge TTS outputs MP3, NeuTTS outputs WAV — both need ffmpeg conversion
|
||||
voice_compatible = False
|
||||
if provider == "edge" and file_str.endswith(".mp3"):
|
||||
if provider in ("edge", "neutts") and not file_str.endswith(".ogg"):
|
||||
opus_path = _convert_to_opus(file_str)
|
||||
if opus_path:
|
||||
file_str = opus_path
|
||||
|
|
@ -444,6 +519,8 @@ def check_tts_requirements() -> bool:
|
|||
return True
|
||||
except ImportError:
|
||||
pass
|
||||
if _check_neutts_available():
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue