feat: add streaming sentence-by-sentence TTS via ElevenLabs

Stream audio to speaker as the agent generates tokens instead of
waiting for the full response. First sentence plays within ~1-2s
of agent starting to respond.

- run_agent: add stream_callback to run_conversation/chat, streaming
  path in _interruptible_api_call accumulates chunks into mock
  ChatCompletion while forwarding content deltas to callback
- tts_tool: add stream_tts_to_speaker() with sentence buffering,
  think block filtering, markdown stripping, ElevenLabs pcm_24000
  streaming to sounddevice OutputStream
- cli: wire up streaming TTS pipeline in chat(), detect elevenlabs
  provider + sounddevice availability, skip batch TTS when streaming
  is active, signal stop on interrupt

Falls back to batch TTS for Edge/OpenAI providers or when
elevenlabs/sounddevice are not available. Zero impact on non-voice
mode (callback defaults to None).
This commit is contained in:
0xbyt4 2026-03-03 23:03:42 +03:00
parent d7425343ee
commit 179d9e1a22
3 changed files with 410 additions and 18 deletions

View file

@ -25,9 +25,12 @@ import datetime
import json
import logging
import os
import queue
import re
import shutil
import subprocess
import tempfile
import threading
from pathlib import Path
from typing import Dict, Any, Optional
@ -55,6 +58,13 @@ try:
except ImportError:
_HAS_OPENAI = False
try:
import sounddevice as sd
_HAS_AUDIO = True
except ImportError:
sd = None # type: ignore[assignment]
_HAS_AUDIO = False
# ===========================================================================
# Defaults
@ -63,6 +73,7 @@ DEFAULT_PROVIDER = "edge"
DEFAULT_EDGE_VOICE = "en-US-AriaNeural"
# Fallback ElevenLabs voice when the "elevenlabs" config has no "voice_id".
DEFAULT_ELEVENLABS_VOICE_ID = "pNInz6obpgDQGcFmaJgB" # Adam
DEFAULT_ELEVENLABS_MODEL_ID = "eleven_multilingual_v2"
# Fallback model for stream_tts_to_speaker() when the "elevenlabs" config
# sets neither "streaming_model_id" nor "model_id".
DEFAULT_ELEVENLABS_STREAMING_MODEL_ID = "eleven_flash_v2_5"
DEFAULT_OPENAI_MODEL = "gpt-4o-mini-tts"
DEFAULT_OPENAI_VOICE = "alloy"
# "audio_cache" directory under $HERMES_HOME, or under ~/.hermes when that
# environment variable is unset.
DEFAULT_OUTPUT_DIR = str(Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")) / "audio_cache")
@ -420,6 +431,226 @@ def check_tts_requirements() -> bool:
return False
# ===========================================================================
# Streaming TTS: sentence-by-sentence pipeline for ElevenLabs
# ===========================================================================
# Sentence boundary pattern. Matches either
#   * one whitespace character (space, tab, newline, ...) that immediately
#     follows '.', '!' or '?', or
#   * a blank line ("\n\n"), even without closing punctuation.
# Only the separator is matched: the lookbehind leaves the punctuation on the
# sentence side, so slicing up to match.end() yields the sentence plus its
# trailing separator.
_SENTENCE_BOUNDARY_RE = re.compile(r'(?<=[.!?])(?:\s|\n)|(?:\n\n)')
# Markdown stripping patterns (same as cli.py _voice_speak_response)
_MD_CODE_BLOCK = re.compile(r'```[\s\S]*?```')
_MD_LINK = re.compile(r'\[([^\]]+)\]\([^)]+\)')
_MD_URL = re.compile(r'https?://\S+')
_MD_BOLD = re.compile(r'\*\*(.+?)\*\*')
_MD_ITALIC = re.compile(r'\*(.+?)\*')
_MD_INLINE_CODE = re.compile(r'`(.+?)`')
_MD_HEADER = re.compile(r'^#+\s*', flags=re.MULTILINE)
_MD_LIST_ITEM = re.compile(r'^\s*[-*]\s+', flags=re.MULTILINE)
_MD_HR = re.compile(r'---+')
_MD_EXCESS_NL = re.compile(r'\n{3,}')

# (pattern, replacement) pairs applied top to bottom.  Order matters: fenced
# code goes first so its contents are never treated as inline markup, links
# are unwrapped before bare URLs are dropped, and bold ("**") is handled
# before italic ("*").
_MD_TTS_SUBSTITUTIONS = (
    (_MD_CODE_BLOCK, ' '),
    (_MD_LINK, r'\1'),
    (_MD_URL, ''),
    (_MD_BOLD, r'\1'),
    (_MD_ITALIC, r'\1'),
    (_MD_INLINE_CODE, r'\1'),
    (_MD_HEADER, ''),
    (_MD_LIST_ITEM, ''),
    (_MD_HR, ''),
    (_MD_EXCESS_NL, '\n\n'),
)


def _strip_markdown_for_tts(text: str) -> str:
    """Return *text* with markdown syntax removed so it can be spoken.

    Fenced code blocks collapse to a single space, links keep only their
    label, bare URLs and horizontal rules disappear, emphasis / inline-code
    markers are dropped while their content is kept, header and list
    prefixes are removed, and runs of three or more newlines shrink to two.
    The result has no leading or trailing whitespace.
    """
    spoken = text
    for pattern, replacement in _MD_TTS_SUBSTITUTIONS:
        spoken = pattern.sub(replacement, spoken)
    return spoken.strip()
class _ThinkBlockFilter:
    """Remove ``<think ...>...</think>`` spans from a stream of text deltas.

    Deltas arrive in arbitrary pieces, so a tag can be split across two of
    them (``"</thi"`` followed by ``"nk>"``).  A trailing fragment that could
    still grow into a tag is held back until the next delta settles it.
    """

    _OPEN_TAG = "<think"      # prefix match, so "<think>" and "<think ...>" both count
    _CLOSE_TAG = "</think>"

    def __init__(self) -> None:
        self._in_think = False   # True while inside a think block
        self._pending = ""       # held-back tail that may be the start of a tag

    def feed(self, delta: str) -> str:
        """Return the speakable part of *delta* (think content removed)."""
        data = self._pending + delta
        self._pending = ""
        kept = []
        pos = 0
        size = len(data)
        while pos < size:
            lt = data.find("<", pos)
            if lt == -1:
                if not self._in_think:
                    kept.append(data[pos:])
                break
            if lt > pos and not self._in_think:
                kept.append(data[pos:lt])
            pos = lt

            if data.startswith(self._CLOSE_TAG, pos):
                self._in_think = False
                pos += len(self._CLOSE_TAG)
                continue
            if data.startswith(self._OPEN_TAG, pos):
                self._in_think = True
                closing = data.find(">", pos)
                # Without a ">" yet, the rest of the tag is swallowed as
                # think content once it arrives.
                pos = size if closing == -1 else closing + 1
                continue
            if size - pos < len(self._CLOSE_TAG):
                tail = data[pos:]
                if self._OPEN_TAG.startswith(tail) or self._CLOSE_TAG.startswith(tail):
                    # Possibly the first half of a tag: decide on the next delta.
                    self._pending = tail
                    break
            # A literal "<" that is not part of a think tag.
            if not self._in_think:
                kept.append("<")
            pos += 1
        return "".join(kept)

    def flush(self) -> str:
        """Return held-back text at end of stream (empty inside a think block)."""
        leftover, self._pending = self._pending, ""
        return "" if self._in_think else leftover


def _split_complete_sentences(buffer: str, min_len: int, boundary_re=None):
    """Split *buffer* into finished sentences and an unfinished remainder.

    A candidate whose stripped length is below *min_len* is not emitted on
    its own; it is extended to the next boundary instead, so a short opener
    such as ``"Sure! "`` is merged into the following sentence rather than
    blocking every later boundary.

    Args:
        buffer: Accumulated, already filtered text.
        min_len: Minimum stripped length for a sentence to be emitted.
        boundary_re: Compiled boundary pattern; defaults to the module's
            ``_SENTENCE_BOUNDARY_RE``.

    Returns:
        ``(sentences, remainder)`` where *sentences* is a list of strings
        (each including its trailing separator) and *remainder* is the text
        still waiting for a boundary.
    """
    if boundary_re is None:
        boundary_re = _SENTENCE_BOUNDARY_RE
    sentences = []
    start = 0
    for boundary in boundary_re.finditer(buffer):
        candidate = buffer[start:boundary.end()]
        if len(candidate.strip()) < min_len:
            continue  # too short: keep growing towards the next boundary
        sentences.append(candidate)
        start = boundary.end()
    return sentences, buffer[start:]


def _play_pcm_via_tempfile(audio_iter, stop_event) -> None:
    """Write 24 kHz mono 16-bit PCM chunks to a temp WAV file and play it.

    Fallback for when no sounddevice output stream is available.  The temp
    file is always removed, and nothing is played if *stop_event* was set
    while the audio was being downloaded.
    """
    import wave

    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
            tmp_path = tmp.name
            # wave does not close a file object it did not open, so the
            # enclosing "with" is what releases the handle.
            with wave.open(tmp, "wb") as wf:
                wf.setnchannels(1)
                wf.setsampwidth(2)  # 16-bit
                wf.setframerate(24000)
                for chunk in audio_iter:
                    if stop_event.is_set():
                        break
                    wf.writeframes(chunk)
        if stop_event.is_set():
            return
        from tools.voice_mode import play_audio_file
        play_audio_file(tmp_path)
    except Exception as exc:
        logger.warning("Temp-file TTS fallback failed: %s", exc)
    finally:
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass


def stream_tts_to_speaker(
    text_queue: queue.Queue,
    stop_event: threading.Event,
    tts_done_event: threading.Event,
):
    """Consume text deltas from *text_queue*, buffer them into sentences,
    and stream each sentence through ElevenLabs TTS to the speaker in
    real-time.

    Protocol:
        * The producer puts ``str`` deltas onto *text_queue*.
        * A ``None`` sentinel signals end-of-text (flush remaining buffer).
        * *stop_event* can be set to abort early (e.g. user interrupt).
        * *tts_done_event* is cleared on entry and **set** in the
          ``finally`` block so callers waiting on it (continuous voice
          mode) know playback is finished.

    Text inside ``<think>...</think>`` is never spoken, markdown is stripped
    per sentence, and sentences longer than ``MAX_TEXT_LENGTH`` are
    truncated.  Errors are logged, never raised.
    """
    tts_done_event.clear()
    output_stream = None

    try:
        tts_config = _load_tts_config()
        el_config = tts_config.get("elevenlabs", {})
        voice_id = el_config.get("voice_id", DEFAULT_ELEVENLABS_VOICE_ID)
        model_id = el_config.get("streaming_model_id",
                                 el_config.get("model_id", DEFAULT_ELEVENLABS_STREAMING_MODEL_ID))

        api_key = os.getenv("ELEVENLABS_API_KEY", "")
        if not api_key:
            logger.warning("ELEVENLABS_API_KEY not set; streaming TTS disabled")
            return

        client = ElevenLabs(api_key=api_key)

        # Open a single sounddevice output stream for the lifetime of
        # this function.  ElevenLabs pcm_24000 produces signed 16-bit
        # little-endian mono PCM at 24 kHz.
        np_mod = None
        if _HAS_AUDIO and sd is not None:
            try:
                import numpy as np_mod
                candidate = sd.OutputStream(
                    samplerate=24000, channels=1, dtype="int16",
                )
                try:
                    candidate.start()
                except Exception:
                    candidate.close()  # do not leak a stream that never started
                    raise
                output_stream = candidate
            except Exception as exc:
                logger.warning("sounddevice OutputStream failed: %s", exc)
                output_stream = None

        def _speak_sentence(sentence: str) -> None:
            """Generate and play audio for a single sentence."""
            if stop_event.is_set():
                return
            cleaned = _strip_markdown_for_tts(sentence).strip()
            if not cleaned:
                return
            # Truncate very long sentences
            cleaned = cleaned[:MAX_TEXT_LENGTH]
            try:
                audio_iter = client.text_to_speech.convert(
                    text=cleaned,
                    voice_id=voice_id,
                    model_id=model_id,
                    output_format="pcm_24000",
                )
                if output_stream is None:
                    # Fallback: write chunks to temp file and play via system player
                    _play_pcm_via_tempfile(audio_iter, stop_event)
                    return
                # Network chunks are not guaranteed to end on a sample
                # boundary; carry an odd trailing byte into the next chunk
                # so frombuffer never sees a partial int16.
                carry = b""
                for chunk in audio_iter:
                    if stop_event.is_set():
                        break
                    data = carry + chunk
                    usable = len(data) - (len(data) % 2)
                    carry = data[usable:]
                    if usable:
                        samples = np_mod.frombuffer(data[:usable], dtype=np_mod.int16)
                        output_stream.write(samples.reshape(-1, 1))
            except Exception as exc:
                logger.warning("Streaming TTS sentence failed: %s", exc)

        think_filter = _ThinkBlockFilter()
        sentence_buf = ""
        min_sentence_len = 20   # shorter fragments are merged into the next sentence
        long_flush_len = 100    # idle-flush threshold for text without a boundary
        queue_timeout = 0.5

        while not stop_event.is_set():
            # Read next delta from queue
            try:
                delta = text_queue.get(timeout=queue_timeout)
            except queue.Empty:
                # Timeout: if we have accumulated a long buffer, flush it
                if len(sentence_buf) > long_flush_len:
                    _speak_sentence(sentence_buf)
                    sentence_buf = ""
                continue

            if delta is None:
                # End-of-text sentinel: flush remaining buffer
                sentence_buf += think_filter.flush()
                if sentence_buf.strip():
                    _speak_sentence(sentence_buf)
                break

            text = think_filter.feed(delta)
            if not text:
                continue

            sentence_buf += text
            ready, sentence_buf = _split_complete_sentences(
                sentence_buf, min_sentence_len,
            )
            for sentence in ready:
                _speak_sentence(sentence)

        # Drain any remaining items from the queue
        while True:
            try:
                text_queue.get_nowait()
            except queue.Empty:
                break

    except Exception as exc:
        logger.warning("Streaming TTS pipeline error: %s", exc)
    finally:
        try:
            # Close the audio output stream on every exit path.
            if output_stream is not None:
                try:
                    if stop_event.is_set():
                        output_stream.abort()  # interrupted: drop buffered audio
                    else:
                        output_stream.stop()   # let buffered audio finish
                    output_stream.close()
                except Exception as exc:
                    logger.debug("Closing audio output stream failed: %s", exc)
        finally:
            tts_done_event.set()
# ===========================================================================
# Main -- quick diagnostics
# ===========================================================================