The architecture has been updated
This commit is contained in:
parent
805f7a017e
commit
a01257ead9
1119 changed files with 226 additions and 352 deletions
847
hermes_code/tools/tts_tool.py
Normal file
847
hermes_code/tools/tts_tool.py
Normal file
|
|
@ -0,0 +1,847 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Text-to-Speech Tool Module
|
||||
|
||||
Supports four TTS providers:
|
||||
- Edge TTS (default, free, no API key): Microsoft Edge neural voices
|
||||
- ElevenLabs (premium): High-quality voices, needs ELEVENLABS_API_KEY
|
||||
- OpenAI TTS: Good quality, needs VOICE_TOOLS_OPENAI_KEY
|
||||
- NeuTTS (local, free, no API key): On-device TTS via neutts_cli, needs neutts installed
|
||||
|
||||
Output formats:
|
||||
- Opus (.ogg) for Telegram voice bubbles (requires ffmpeg for Edge TTS)
|
||||
- MP3 (.mp3) for everything else (CLI, Discord, WhatsApp)
|
||||
|
||||
Configuration is loaded from ~/.hermes/config.yaml under the 'tts:' key.
|
||||
The user chooses the provider and voice; the model just sends text.
|
||||
|
||||
Usage:
|
||||
from tools.tts_tool import text_to_speech_tool, check_tts_requirements
|
||||
|
||||
result = text_to_speech_tool(text="Hello world")
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import queue
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import tempfile
|
||||
import threading
|
||||
from pathlib import Path
|
||||
from typing import Callable, Dict, Any, Optional
|
||||
|
||||
# Module-level logger, namespaced to this module's dotted path.
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Lazy imports -- providers are imported only when actually used to avoid
# crashing in headless environments (SSH, Docker, WSL, no PortAudio).
# ---------------------------------------------------------------------------
|
||||
|
||||
def _import_edge_tts():
    """Import and return the ``edge_tts`` module on first use.

    Raises:
        ImportError: if the edge-tts package is not installed.
    """
    import edge_tts as _edge_module
    return _edge_module
|
||||
|
||||
def _import_elevenlabs():
    """Import and return the ``ElevenLabs`` client class on first use.

    Raises:
        ImportError: if the elevenlabs package is not installed.
    """
    from elevenlabs.client import ElevenLabs as _ElevenLabs
    return _ElevenLabs
|
||||
|
||||
def _import_openai_client():
    """Import and return the ``OpenAI`` client class on first use.

    Raises:
        ImportError: if the openai package is not installed.
    """
    from openai import OpenAI as _Client
    return _Client
|
||||
|
||||
def _import_sounddevice():
    """Import and return the ``sounddevice`` module on first use.

    Raises:
        ImportError: if the sounddevice package is not installed.
        OSError: if PortAudio cannot be loaded (headless hosts).
    """
    import sounddevice
    return sounddevice
|
||||
|
||||
|
||||
# ===========================================================================
# Defaults
# ===========================================================================
# Fallback values used whenever ~/.hermes/config.yaml has no 'tts:' entry
# (or is missing a specific field). See _load_tts_config().
DEFAULT_PROVIDER = "edge"
DEFAULT_EDGE_VOICE = "en-US-AriaNeural"
DEFAULT_ELEVENLABS_VOICE_ID = "pNInz6obpgDQGcFmaJgB"  # Adam
DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"
# Lower-latency model used by the sentence-streaming pipeline below.
DEFAULT_ELEVENLABS_STREAMING_MODEL_ID = "eleven_flash_v2_5"
DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts"
DEFAULT_OPENAI_VOICE = "alloy"
# Generated audio is cached under $HERMES_HOME (defaults to ~/.hermes).
DEFAULT_OUTPUT_DIR = str(Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")) / "audio_cache")
# Text longer than this is truncated before synthesis (see text_to_speech_tool).
MAX_TEXT_LENGTH = 4000
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Config loader -- reads tts: section from ~/.hermes/config.yaml
|
||||
# ===========================================================================
|
||||
def _load_tts_config() -> Dict[str, Any]:
    """Read the ``tts:`` section of ~/.hermes/config.yaml.

    Returns:
        The provider-settings dict, or an empty dict when the config
        module is unavailable or loading fails; callers then fall back
        to the module-level defaults for every field.
    """
    try:
        from hermes_cli.config import load_config
        return load_config().get("tts", {})
    except ImportError:
        logger.debug("hermes_cli.config not available, using default TTS config")
        return {}
    except Exception as e:
        logger.warning("Failed to load TTS config: %s", e, exc_info=True)
        return {}
|
||||
|
||||
|
||||
def _get_provider(tts_config: Dict[str, Any]) -> str:
    """Return the configured TTS provider name, normalized (lowercase, trimmed)."""
    configured = tts_config.get("provider", DEFAULT_PROVIDER)
    return configured.strip().lower()
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# ffmpeg Opus conversion (Edge TTS MP3 -> OGG Opus for Telegram)
|
||||
# ===========================================================================
|
||||
def _has_ffmpeg() -> bool:
|
||||
"""Check if ffmpeg is available on the system."""
|
||||
return shutil.which("ffmpeg") is not None
|
||||
|
||||
|
||||
def _convert_to_opus(mp3_path: str) -> Optional[str]:
    """Transcode an MP3 file to OGG Opus for Telegram voice bubbles.

    Args:
        mp3_path: Path to the input MP3 file.

    Returns:
        Path to the resulting .ogg file, or None when ffmpeg is missing,
        the conversion errors out, or no non-empty output is produced.
    """
    if not _has_ffmpeg():
        return None

    ogg_path = mp3_path.rsplit(".", 1)[0] + ".ogg"
    # Mono 64 kbps constant-bitrate Opus, overwriting any stale output (-y).
    ffmpeg_cmd = [
        "ffmpeg", "-i", mp3_path, "-acodec", "libopus",
        "-ac", "1", "-b:a", "64k", "-vbr", "off", ogg_path, "-y",
    ]
    try:
        proc = subprocess.run(ffmpeg_cmd, capture_output=True, timeout=30)
        if proc.returncode != 0:
            logger.warning("ffmpeg conversion failed with return code %d: %s",
                           proc.returncode,
                           proc.stderr.decode('utf-8', errors='ignore')[:200])
            return None
        # Only trust the result if a non-empty file actually landed on disk.
        if os.path.exists(ogg_path) and os.path.getsize(ogg_path) > 0:
            return ogg_path
    except subprocess.TimeoutExpired:
        logger.warning("ffmpeg OGG conversion timed out after 30s")
    except FileNotFoundError:
        logger.warning("ffmpeg not found in PATH")
    except Exception as e:
        logger.warning("ffmpeg OGG conversion failed: %s", e, exc_info=True)
    return None
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Provider: Edge TTS (free)
|
||||
# ===========================================================================
|
||||
async def _generate_edge_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """Synthesize *text* with Microsoft Edge TTS and save it as MP3.

    Args:
        text: Text to convert.
        output_path: Destination file for the audio.
        tts_config: Full TTS config dict (the ``edge`` sub-section is read).

    Returns:
        The *output_path* that was written.
    """
    edge_tts_module = _import_edge_tts()
    voice_name = tts_config.get("edge", {}).get("voice", DEFAULT_EDGE_VOICE)

    synthesizer = edge_tts_module.Communicate(text, voice_name)
    await synthesizer.save(output_path)
    return output_path
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Provider: ElevenLabs (premium)
|
||||
# ===========================================================================
|
||||
def _generate_elevenlabs(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """Synthesize *text* with ElevenLabs and write it to *output_path*.

    The audio encoding is chosen from the file extension: native Opus
    for ``.ogg`` (Telegram voice bubbles), MP3 otherwise.

    Args:
        text: Text to convert.
        output_path: Destination file for the audio.
        tts_config: Full TTS config dict (the ``elevenlabs`` sub-section is read).

    Returns:
        The *output_path* that was written.

    Raises:
        ValueError: when ELEVENLABS_API_KEY is not set.
    """
    api_key = os.getenv("ELEVENLABS_API_KEY", "")
    if not api_key:
        raise ValueError("ELEVENLABS_API_KEY not set. Get one at https://elevenlabs.io/")

    el_config = tts_config.get("elevenlabs", {})
    wants_ogg = output_path.endswith(".ogg")
    output_format = "opus_48000_64" if wants_ogg else "mp3_44100_128"

    ElevenLabsClass = _import_elevenlabs()
    eleven_client = ElevenLabsClass(api_key=api_key)
    chunk_stream = eleven_client.text_to_speech.convert(
        text=text,
        voice_id=el_config.get("voice_id", DEFAULT_ELEVENLABS_VOICE_ID),
        model_id=el_config.get("model_id", DEFAULT_ELEVENLABS_MODEL_ID),
        output_format=output_format,
    )

    # The SDK returns a generator of byte chunks; stream them to disk.
    with open(output_path, "wb") as out_file:
        for piece in chunk_stream:
            out_file.write(piece)

    return output_path
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Provider: OpenAI TTS
|
||||
# ===========================================================================
|
||||
def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """Synthesize *text* with OpenAI TTS and write it to *output_path*.

    The response format is chosen from the file extension: ``opus`` for
    ``.ogg``, ``mp3`` otherwise.

    Args:
        text: Text to convert.
        output_path: Destination file for the audio.
        tts_config: Full TTS config dict (the ``openai`` sub-section is read).

    Returns:
        The *output_path* that was written.

    Raises:
        ValueError: when VOICE_TOOLS_OPENAI_KEY is not set.
    """
    api_key = os.getenv("VOICE_TOOLS_OPENAI_KEY", "")
    if not api_key:
        raise ValueError("VOICE_TOOLS_OPENAI_KEY not set. Get one at https://platform.openai.com/api-keys")

    oai_config = tts_config.get("openai", {})
    response_format = "opus" if output_path.endswith(".ogg") else "mp3"

    OpenAIClient = _import_openai_client()
    oai_client = OpenAIClient(
        api_key=api_key,
        base_url=oai_config.get("base_url", "https://api.openai.com/v1"),
    )
    speech = oai_client.audio.speech.create(
        model=oai_config.get("model", DEFAULT_OPENAI_MODEL),
        voice=oai_config.get("voice", DEFAULT_OPENAI_VOICE),
        input=text,
        response_format=response_format,
    )

    speech.stream_to_file(output_path)
    return output_path
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# NeuTTS (local, on-device TTS via neutts_cli)
|
||||
# ===========================================================================
|
||||
|
||||
def _check_neutts_available() -> bool:
|
||||
"""Check if the neutts engine is importable (installed locally)."""
|
||||
try:
|
||||
import importlib.util
|
||||
return importlib.util.find_spec("neutts") is not None
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
|
||||
def _default_neutts_ref_audio() -> str:
|
||||
"""Return path to the bundled default voice reference audio."""
|
||||
return str(Path(__file__).parent / "neutts_samples" / "jo.wav")
|
||||
|
||||
|
||||
def _default_neutts_ref_text() -> str:
|
||||
"""Return path to the bundled default voice reference transcript."""
|
||||
return str(Path(__file__).parent / "neutts_samples" / "jo.txt")
|
||||
|
||||
|
||||
def _generate_neutts(text: str, output_path: str, tts_config: Dict[str, Any]) -> str:
    """Generate speech with the local NeuTTS engine.

    Synthesis runs in a subprocess (tools/neutts_synth.py) so the ~500MB
    model lives in a separate process that exits after synthesis. NeuTTS
    emits WAV natively; when the caller asked for a different extension
    the WAV is converted with ffmpeg, or simply renamed as a last resort.

    Args:
        text: Text to convert.
        output_path: Destination file for the audio.
        tts_config: Full TTS config dict (the ``neutts`` sub-section is read).

    Returns:
        The *output_path* that was written.

    Raises:
        RuntimeError: when the synthesis subprocess exits non-zero.
    """
    import sys

    neutts_config = tts_config.get("neutts", {})
    ref_audio = neutts_config.get("ref_audio", "") or _default_neutts_ref_audio()
    ref_text = neutts_config.get("ref_text", "") or _default_neutts_ref_text()

    # Always synthesize to a .wav path; convert to the requested
    # container afterward if the caller asked for something else.
    wav_path = output_path
    if not output_path.endswith(".wav"):
        wav_path = output_path.rsplit(".", 1)[0] + ".wav"

    synth_cmd = [
        sys.executable,
        str(Path(__file__).parent / "neutts_synth.py"),
        "--text", text,
        "--out", wav_path,
        "--ref-audio", ref_audio,
        "--ref-text", ref_text,
        "--model", neutts_config.get("model", "neuphonic/neutts-air-q4-gguf"),
        "--device", neutts_config.get("device", "cpu"),
    ]

    proc = subprocess.run(synth_cmd, capture_output=True, text=True, timeout=120)
    if proc.returncode != 0:
        # The helper prints informational "OK:" lines to stderr; keep only
        # the genuine error output for the exception message.
        failure_lines = [line for line in proc.stderr.strip().splitlines()
                         if not line.startswith("OK:")]
        detail = "\n".join(failure_lines) or "unknown error"
        raise RuntimeError(f"NeuTTS synthesis failed: {detail}")

    if wav_path != output_path:
        ffmpeg = shutil.which("ffmpeg")
        if ffmpeg:
            subprocess.run(
                [ffmpeg, "-i", wav_path, "-y", "-loglevel", "error", output_path],
                check=True, timeout=30,
            )
            os.remove(wav_path)
        else:
            # No ffmpeg — rename the WAV so the expected path exists.
            os.rename(wav_path, output_path)

    return output_path
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Main tool function
|
||||
# ===========================================================================
|
||||
def text_to_speech_tool(
    text: str,
    output_path: Optional[str] = None,
) -> str:
    """
    Convert text to speech audio.

    Reads provider/voice config from ~/.hermes/config.yaml (tts: section).
    The model sends text; the user configures voice and provider.

    On messaging platforms, the returned MEDIA:<path> tag is intercepted
    by the send pipeline and delivered as a native voice message.

    Args:
        text: The text to convert to speech.
        output_path: Optional custom save path. Defaults to
            ~/.hermes/audio_cache/tts_<timestamp>.mp3 (or .ogg when the
            provider can emit Opus natively for Telegram); the base
            directory honors $HERMES_HOME via DEFAULT_OUTPUT_DIR.

    Returns:
        str: JSON result with success, file_path, and optionally MEDIA tag.
    """
    if not text or not text.strip():
        return json.dumps({"success": False, "error": "Text is required"}, ensure_ascii=False)

    # Truncate very long text with a warning
    if len(text) > MAX_TEXT_LENGTH:
        logger.warning("TTS text too long (%d chars), truncating to %d", len(text), MAX_TEXT_LENGTH)
        text = text[:MAX_TEXT_LENGTH]

    tts_config = _load_tts_config()
    provider = _get_provider(tts_config)

    # Detect platform from gateway env var to choose the best output format.
    # Telegram voice bubbles require Opus (.ogg); OpenAI and ElevenLabs can
    # produce Opus natively (no ffmpeg needed). Edge TTS always outputs MP3
    # and needs ffmpeg for conversion.
    platform = os.getenv("HERMES_SESSION_PLATFORM", "").lower()
    want_opus = (platform == "telegram")

    # Determine output path
    if output_path:
        file_path = Path(output_path).expanduser()
    else:
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        out_dir = Path(DEFAULT_OUTPUT_DIR)
        out_dir.mkdir(parents=True, exist_ok=True)
        # Use .ogg for Telegram with providers that support native Opus output,
        # otherwise fall back to .mp3 (Edge TTS will attempt ffmpeg conversion later).
        if want_opus and provider in ("openai", "elevenlabs"):
            file_path = out_dir / f"tts_{timestamp}.ogg"
        else:
            file_path = out_dir / f"tts_{timestamp}.mp3"

    # Ensure parent directory exists (covers caller-supplied output_path too)
    file_path.parent.mkdir(parents=True, exist_ok=True)
    file_str = str(file_path)

    try:
        # Generate audio with the configured provider. Each premium provider
        # is probed for its SDK first so a missing package yields a clear
        # error instead of an ImportError traceback.
        if provider == "elevenlabs":
            try:
                _import_elevenlabs()
            except ImportError:
                return json.dumps({
                    "success": False,
                    "error": "ElevenLabs provider selected but 'elevenlabs' package not installed. Run: pip install elevenlabs"
                }, ensure_ascii=False)
            logger.info("Generating speech with ElevenLabs...")
            _generate_elevenlabs(text, file_str, tts_config)

        elif provider == "openai":
            try:
                _import_openai_client()
            except ImportError:
                return json.dumps({
                    "success": False,
                    "error": "OpenAI provider selected but 'openai' package not installed."
                }, ensure_ascii=False)
            logger.info("Generating speech with OpenAI TTS...")
            _generate_openai_tts(text, file_str, tts_config)

        elif provider == "neutts":
            if not _check_neutts_available():
                return json.dumps({
                    "success": False,
                    "error": "NeuTTS provider selected but neutts is not installed. "
                             "Run hermes setup and choose NeuTTS, or install espeak-ng and run python -m pip install -U neutts[all]."
                }, ensure_ascii=False)
            logger.info("Generating speech with NeuTTS (local)...")
            _generate_neutts(text, file_str, tts_config)

        else:
            # Default branch handles "edge" and any unrecognized provider
            # value: Edge TTS (free), with NeuTTS as local fallback.
            edge_available = True
            try:
                _import_edge_tts()
            except ImportError:
                edge_available = False

            if edge_available:
                logger.info("Generating speech with Edge TTS...")
                try:
                    # Probe for a running event loop: if one exists we cannot
                    # call asyncio.run() on this thread, so run the coroutine
                    # on a fresh loop in a worker thread instead. `loop` is
                    # intentionally unused — get_running_loop() raising
                    # RuntimeError is the signal that no loop is active.
                    loop = asyncio.get_running_loop()
                    import concurrent.futures
                    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
                        pool.submit(
                            lambda: asyncio.run(_generate_edge_tts(text, file_str, tts_config))
                        ).result(timeout=60)
                except RuntimeError:
                    # No running loop on this thread — safe to run directly.
                    asyncio.run(_generate_edge_tts(text, file_str, tts_config))
            elif _check_neutts_available():
                logger.info("Edge TTS not available, falling back to NeuTTS (local)...")
                provider = "neutts"
                _generate_neutts(text, file_str, tts_config)
            else:
                return json.dumps({
                    "success": False,
                    "error": "No TTS provider available. Install edge-tts (pip install edge-tts) "
                             "or set up NeuTTS for local synthesis."
                }, ensure_ascii=False)

        # Check the file was actually created
        if not os.path.exists(file_str) or os.path.getsize(file_str) == 0:
            return json.dumps({
                "success": False,
                "error": f"TTS generation produced no output (provider: {provider})"
            }, ensure_ascii=False)

        # Try Opus conversion for Telegram compatibility
        # Edge TTS outputs MP3, NeuTTS outputs WAV — both need ffmpeg conversion
        voice_compatible = False
        if provider in ("edge", "neutts") and not file_str.endswith(".ogg"):
            opus_path = _convert_to_opus(file_str)
            if opus_path:
                file_str = opus_path
                voice_compatible = True
        elif provider in ("elevenlabs", "openai"):
            # These providers can output Opus natively if the path ends in .ogg
            voice_compatible = file_str.endswith(".ogg")

        file_size = os.path.getsize(file_str)
        logger.info("TTS audio saved: %s (%s bytes, provider: %s)", file_str, f"{file_size:,}", provider)

        # Build response with MEDIA tag for platform delivery; the
        # [[audio_as_voice]] prefix asks the gateway to send a voice bubble.
        media_tag = f"MEDIA:{file_str}"
        if voice_compatible:
            media_tag = f"[[audio_as_voice]]\n{media_tag}"

        return json.dumps({
            "success": True,
            "file_path": file_str,
            "media_tag": media_tag,
            "provider": provider,
            "voice_compatible": voice_compatible,
        }, ensure_ascii=False)

    except ValueError as e:
        # Configuration errors (missing API keys, etc.)
        error_msg = f"TTS configuration error ({provider}): {e}"
        logger.error("%s", error_msg)
        return json.dumps({"success": False, "error": error_msg}, ensure_ascii=False)
    except FileNotFoundError as e:
        # Missing dependencies or files
        error_msg = f"TTS dependency missing ({provider}): {e}"
        logger.error("%s", error_msg, exc_info=True)
        return json.dumps({"success": False, "error": error_msg}, ensure_ascii=False)
    except Exception as e:
        # Unexpected errors
        error_msg = f"TTS generation failed ({provider}): {e}"
        logger.error("%s", error_msg, exc_info=True)
        return json.dumps({"success": False, "error": error_msg}, ensure_ascii=False)
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Requirements check
|
||||
# ===========================================================================
|
||||
def check_tts_requirements() -> bool:
    """Report whether at least one TTS provider is usable.

    Edge TTS needs no API key and is the default, so a successful import
    alone makes TTS available. ElevenLabs and OpenAI additionally require
    their API-key environment variables; NeuTTS only needs to be installed.

    Returns:
        bool: True if at least one provider can work.
    """
    try:
        _import_edge_tts()
    except ImportError:
        pass
    else:
        return True

    try:
        _import_elevenlabs()
    except ImportError:
        pass
    else:
        if os.getenv("ELEVENLABS_API_KEY"):
            return True

    try:
        _import_openai_client()
    except ImportError:
        pass
    else:
        if os.getenv("VOICE_TOOLS_OPENAI_KEY"):
            return True

    return _check_neutts_available()
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Streaming TTS: sentence-by-sentence pipeline for ElevenLabs
|
||||
# ===========================================================================
|
||||
# Sentence boundary pattern: punctuation followed by space or newline
|
||||
_SENTENCE_BOUNDARY_RE = re.compile(r'(?<=[.!?])(?:\s|\n)|(?:\n\n)')
|
||||
|
||||
# Markdown stripping patterns (same as cli.py _voice_speak_response)
|
||||
_MD_CODE_BLOCK = re.compile(r'```[\s\S]*?```')
|
||||
_MD_LINK = re.compile(r'\[([^\]]+)\]\([^)]+\)')
|
||||
_MD_URL = re.compile(r'https?://\S+')
|
||||
_MD_BOLD = re.compile(r'\*\*(.+?)\*\*')
|
||||
_MD_ITALIC = re.compile(r'\*(.+?)\*')
|
||||
_MD_INLINE_CODE = re.compile(r'`(.+?)`')
|
||||
_MD_HEADER = re.compile(r'^#+\s*', flags=re.MULTILINE)
|
||||
_MD_LIST_ITEM = re.compile(r'^\s*[-*]\s+', flags=re.MULTILINE)
|
||||
_MD_HR = re.compile(r'---+')
|
||||
_MD_EXCESS_NL = re.compile(r'\n{3,}')
|
||||
|
||||
|
||||
def _strip_markdown_for_tts(text: str) -> str:
|
||||
"""Remove markdown formatting that shouldn't be spoken aloud."""
|
||||
text = _MD_CODE_BLOCK.sub(' ', text)
|
||||
text = _MD_LINK.sub(r'\1', text)
|
||||
text = _MD_URL.sub('', text)
|
||||
text = _MD_BOLD.sub(r'\1', text)
|
||||
text = _MD_ITALIC.sub(r'\1', text)
|
||||
text = _MD_INLINE_CODE.sub(r'\1', text)
|
||||
text = _MD_HEADER.sub('', text)
|
||||
text = _MD_LIST_ITEM.sub('', text)
|
||||
text = _MD_HR.sub('', text)
|
||||
text = _MD_EXCESS_NL.sub('\n\n', text)
|
||||
return text.strip()
|
||||
|
||||
|
||||
def stream_tts_to_speaker(
    text_queue: queue.Queue,
    stop_event: threading.Event,
    tts_done_event: threading.Event,
    display_callback: Optional[Callable[[str], None]] = None,
):
    """Consume text deltas from *text_queue*, buffer them into sentences,
    and stream each sentence through ElevenLabs TTS to the speaker in
    real-time.

    Protocol:
      * The producer puts ``str`` deltas onto *text_queue*.
      * A ``None`` sentinel signals end-of-text (flush remaining buffer).
      * *stop_event* can be set to abort early (e.g. user interrupt).
      * *tts_done_event* is **set** in the ``finally`` block so callers
        waiting on it (continuous voice mode) know playback is finished.

    Degrades gracefully: without ELEVENLABS_API_KEY or the elevenlabs
    package, sentences are still passed to *display_callback* but no audio
    is produced; without sounddevice, audio falls back to a temp-file player.
    """
    tts_done_event.clear()

    try:
        # --- TTS client setup (optional -- display_callback works without it) ---
        client = None
        output_stream = None
        voice_id = DEFAULT_ELEVENLABS_VOICE_ID
        model_id = DEFAULT_ELEVENLABS_STREAMING_MODEL_ID

        # Config overrides: streaming_model_id wins over model_id if present.
        tts_config = _load_tts_config()
        el_config = tts_config.get("elevenlabs", {})
        voice_id = el_config.get("voice_id", voice_id)
        model_id = el_config.get("streaming_model_id",
                                 el_config.get("model_id", model_id))

        api_key = os.getenv("ELEVENLABS_API_KEY", "")
        if not api_key:
            logger.warning("ELEVENLABS_API_KEY not set; streaming TTS audio disabled")
        else:
            try:
                ElevenLabs = _import_elevenlabs()
                client = ElevenLabs(api_key=api_key)
            except ImportError:
                logger.warning("elevenlabs package not installed; streaming TTS disabled")

        # Open a single sounddevice output stream for the lifetime of
        # this function. ElevenLabs pcm_24000 produces signed 16-bit
        # little-endian mono PCM at 24 kHz.
        if client is not None:
            try:
                sd = _import_sounddevice()
                # Preflight numpy here so a missing numpy lands in this
                # except block (stream disabled) rather than mid-playback.
                import numpy as _np
                output_stream = sd.OutputStream(
                    samplerate=24000, channels=1, dtype="int16",
                )
                output_stream.start()
            except (ImportError, OSError) as exc:
                logger.debug("sounddevice not available: %s", exc)
                output_stream = None
            except Exception as exc:
                logger.warning("sounddevice OutputStream failed: %s", exc)
                output_stream = None

        sentence_buf = ""          # accumulates deltas until a sentence boundary
        min_sentence_len = 20      # fragments shorter than this merge forward
        long_flush_len = 100       # flush threshold on queue-read timeout
        queue_timeout = 0.5        # seconds to wait for the next delta
        _spoken_sentences: list[str] = []  # track spoken sentences to skip duplicates
        # Regex to strip complete <think>...</think> blocks from buffer
        _think_block_re = re.compile(r'<think[\s>].*?</think>', flags=re.DOTALL)

        def _speak_sentence(sentence: str):
            """Display sentence and optionally generate + play audio."""
            if stop_event.is_set():
                return
            cleaned = _strip_markdown_for_tts(sentence).strip()
            if not cleaned:
                return
            # Skip duplicate/near-duplicate sentences (LLM repetition);
            # comparison ignores case and trailing punctuation.
            cleaned_lower = cleaned.lower().rstrip(".!,")
            for prev in _spoken_sentences:
                if prev.lower().rstrip(".!,") == cleaned_lower:
                    return
            _spoken_sentences.append(cleaned)
            # Display raw sentence on screen before TTS processing
            if display_callback is not None:
                display_callback(sentence)
            # Skip audio generation if no TTS client available
            if client is None:
                return
            # Truncate very long sentences
            if len(cleaned) > MAX_TEXT_LENGTH:
                cleaned = cleaned[:MAX_TEXT_LENGTH]
            try:
                audio_iter = client.text_to_speech.convert(
                    text=cleaned,
                    voice_id=voice_id,
                    model_id=model_id,
                    output_format="pcm_24000",
                )
                if output_stream is not None:
                    # Raw PCM path: write each chunk directly to the device.
                    for chunk in audio_iter:
                        if stop_event.is_set():
                            break
                        import numpy as _np
                        audio_array = _np.frombuffer(chunk, dtype=_np.int16)
                        # reshape(-1, 1): mono column vector as the stream expects
                        output_stream.write(audio_array.reshape(-1, 1))
                else:
                    # Fallback: write chunks to temp file and play via system player
                    _play_via_tempfile(audio_iter, stop_event)
            except Exception as exc:
                logger.warning("Streaming TTS sentence failed: %s", exc)

        def _play_via_tempfile(audio_iter, stop_evt):
            """Write PCM chunks to a temp WAV file and play it."""
            tmp_path = None
            try:
                import wave
                tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
                tmp_path = tmp.name
                with wave.open(tmp, "wb") as wf:
                    # Must match the ElevenLabs pcm_24000 format above.
                    wf.setnchannels(1)
                    wf.setsampwidth(2)  # 16-bit
                    wf.setframerate(24000)
                    for chunk in audio_iter:
                        if stop_evt.is_set():
                            break
                        wf.writeframes(chunk)
                from tools.voice_mode import play_audio_file
                play_audio_file(tmp_path)
            except Exception as exc:
                logger.warning("Temp-file TTS fallback failed: %s", exc)
            finally:
                # Best-effort cleanup of the temp file.
                if tmp_path:
                    try:
                        os.unlink(tmp_path)
                    except OSError:
                        pass

        while not stop_event.is_set():
            # Read next delta from queue
            try:
                delta = text_queue.get(timeout=queue_timeout)
            except queue.Empty:
                # Timeout: if we have accumulated a long buffer, flush it
                if len(sentence_buf) > long_flush_len:
                    _speak_sentence(sentence_buf)
                    sentence_buf = ""
                continue

            if delta is None:
                # End-of-text sentinel: strip any remaining think blocks, flush
                sentence_buf = _think_block_re.sub('', sentence_buf)
                if sentence_buf.strip():
                    _speak_sentence(sentence_buf)
                break

            sentence_buf += delta

            # --- Think block filtering ---
            # Strip complete <think>...</think> blocks from buffer.
            # Works correctly even when tags span multiple deltas.
            sentence_buf = _think_block_re.sub('', sentence_buf)

            # If an incomplete <think tag is at the end, wait for more data
            # before extracting sentences (the closing tag may arrive next).
            if '<think' in sentence_buf and '</think>' not in sentence_buf:
                continue

            # Check for sentence boundaries
            while True:
                m = _SENTENCE_BOUNDARY_RE.search(sentence_buf)
                if m is None:
                    break
                end_pos = m.end()
                sentence = sentence_buf[:end_pos]
                sentence_buf = sentence_buf[end_pos:]
                # Merge short fragments into the next sentence
                if len(sentence.strip()) < min_sentence_len:
                    sentence_buf = sentence + sentence_buf
                    break
                _speak_sentence(sentence)

        # Drain any remaining items from the queue so a blocked producer
        # (or a later reader) never sees stale deltas.
        while True:
            try:
                text_queue.get_nowait()
            except queue.Empty:
                break

        # output_stream is closed in the finally block below

    except Exception as exc:
        logger.warning("Streaming TTS pipeline error: %s", exc)
    finally:
        # Always close the audio output stream to avoid locking the device
        if output_stream is not None:
            try:
                output_stream.stop()
                output_stream.close()
            except Exception:
                pass
        tts_done_event.set()
|
||||
|
||||
|
||||
# ===========================================================================
|
||||
# Main -- quick diagnostics
|
||||
# ===========================================================================
|
||||
if __name__ == "__main__":
    # Diagnostics entry point: prints which providers/dependencies are
    # available on this machine. Nothing is synthesized.
    print("🔊 Text-to-Speech Tool Module")
    print("=" * 50)

    def _check(importer, label):
        # Probe a lazy importer; returns True when the package imports.
        # `label` is unused but keeps the call sites self-describing.
        try:
            importer()
            return True
        except ImportError:
            return False

    print(f"\nProvider availability:")
    print(f" Edge TTS: {'installed' if _check(_import_edge_tts, 'edge') else 'not installed (pip install edge-tts)'}")
    print(f" ElevenLabs: {'installed' if _check(_import_elevenlabs, 'el') else 'not installed (pip install elevenlabs)'}")
    print(f" API Key: {'set' if os.getenv('ELEVENLABS_API_KEY') else 'not set'}")
    print(f" OpenAI: {'installed' if _check(_import_openai_client, 'oai') else 'not installed'}")
    print(f" API Key: {'set' if os.getenv('VOICE_TOOLS_OPENAI_KEY') else 'not set (VOICE_TOOLS_OPENAI_KEY)'}")
    print(f" ffmpeg: {'✅ found' if _has_ffmpeg() else '❌ not found (needed for Telegram Opus)'}")
    print(f"\n Output dir: {DEFAULT_OUTPUT_DIR}")

    # Show what the config file actually selects.
    config = _load_tts_config()
    provider = _get_provider(config)
    print(f" Configured provider: {provider}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Registry
|
||||
# ---------------------------------------------------------------------------
|
||||
from tools.registry import registry

# JSON schema advertised to the model for the text_to_speech tool.
TTS_SCHEMA = {
    "name": "text_to_speech",
    "description": "Convert text to speech audio. Returns a MEDIA: path that the platform delivers as a voice message. On Telegram it plays as a voice bubble, on Discord/WhatsApp as an audio attachment. In CLI mode, saves to ~/voice-memos/. Voice and provider are user-configured, not model-selected.",
    "parameters": {
        "type": "object",
        "properties": {
            "text": {
                "type": "string",
                "description": "The text to convert to speech. Keep under 4000 characters."
            },
            "output_path": {
                "type": "string",
                "description": "Optional custom file path to save the audio. Defaults to ~/.hermes/audio_cache/<timestamp>.mp3"
            }
        },
        "required": ["text"]
    }
}

# Registered at import time so the tool is available as soon as this
# module is loaded; check_tts_requirements gates it on provider availability.
registry.register(
    name="text_to_speech",
    toolset="tts",
    schema=TTS_SCHEMA,
    handler=lambda args, **kw: text_to_speech_tool(
        text=args.get("text", ""),
        output_path=args.get("output_path")),
    check_fn=check_tts_requirements,
    emoji="🔊",
)
|
||||
Loading…
Add table
Add a link
Reference in a new issue