feat: add streaming sentence-by-sentence TTS via ElevenLabs
Stream audio to the speaker as the agent generates tokens instead of waiting for the full response. The first sentence plays within ~1–2 s of the agent starting to respond.

- run_agent: add `stream_callback` to `run_conversation`/`chat`; the streaming path in `_interruptible_api_call` accumulates chunks into a mock ChatCompletion while forwarding content deltas to the callback
- tts_tool: add `stream_tts_to_speaker()` with sentence buffering, think-block filtering, markdown stripping, and ElevenLabs `pcm_24000` streaming to a sounddevice OutputStream
- cli: wire up the streaming TTS pipeline in `chat()`, detect the elevenlabs provider + sounddevice availability, skip batch TTS when streaming is active, and signal stop on interrupt

Falls back to batch TTS for the Edge/OpenAI providers or when elevenlabs/sounddevice are not available. Zero impact on non-voice mode (the callback defaults to None).
This commit is contained in:
parent d7425343ee
commit 179d9e1a22
3 changed files with 410 additions and 18 deletions
70
cli.py
70
cli.py
|
|
@@ -4093,19 +4093,60 @@ class HermesCLI:
|
|||
try:
|
||||
# Run the conversation with interrupt monitoring
|
||||
result = None
|
||||
|
||||
|
||||
# --- Streaming TTS setup ---
|
||||
# When ElevenLabs is the TTS provider and sounddevice is available,
|
||||
# we stream audio sentence-by-sentence as the agent generates tokens
|
||||
# instead of waiting for the full response.
|
||||
use_streaming_tts = False
|
||||
text_queue = None
|
||||
tts_thread = None
|
||||
stream_callback = None
|
||||
stop_event = None
|
||||
|
||||
if self._voice_tts:
|
||||
try:
|
||||
from tools.tts_tool import (
|
||||
_load_tts_config as _load_tts_cfg,
|
||||
_get_provider as _get_prov,
|
||||
_HAS_ELEVENLABS as _el_ok,
|
||||
_HAS_AUDIO as _audio_ok,
|
||||
stream_tts_to_speaker,
|
||||
)
|
||||
_tts_cfg = _load_tts_cfg()
|
||||
if (_get_prov(_tts_cfg) == "elevenlabs" and _el_ok and _audio_ok):
|
||||
use_streaming_tts = True
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if use_streaming_tts:
|
||||
text_queue = queue.Queue()
|
||||
stop_event = threading.Event()
|
||||
|
||||
tts_thread = threading.Thread(
|
||||
target=stream_tts_to_speaker,
|
||||
args=(text_queue, stop_event, self._voice_tts_done),
|
||||
daemon=True,
|
||||
)
|
||||
tts_thread.start()
|
||||
|
||||
def stream_callback(delta: str):
|
||||
if text_queue is not None:
|
||||
text_queue.put(delta)
|
||||
|
||||
def run_agent():
|
||||
nonlocal result
|
||||
result = self.agent.run_conversation(
|
||||
user_message=message,
|
||||
conversation_history=self.conversation_history[:-1], # Exclude the message we just added
|
||||
stream_callback=stream_callback,
|
||||
task_id=self.session_id,
|
||||
)
|
||||
|
||||
|
||||
# Start agent in background thread
|
||||
agent_thread = threading.Thread(target=run_agent)
|
||||
agent_thread.start()
|
||||
|
||||
|
||||
# Monitor the dedicated interrupt queue while the agent runs.
|
||||
# _interrupt_queue is separate from _pending_input, so process_loop
|
||||
# and chat() never compete for the same queue.
|
||||
|
|
@@ -4124,6 +4165,9 @@ class HermesCLI:
|
|||
if self._clarify_state or self._clarify_freetext:
|
||||
continue
|
||||
print(f"\n⚡ New message detected, interrupting...")
|
||||
# Signal TTS to stop on interrupt
|
||||
if stop_event is not None:
|
||||
stop_event.set()
|
||||
self.agent.interrupt(interrupt_msg)
|
||||
# Debug: log to file (stdout may be devnull from redirect_stdout)
|
||||
try:
|
||||
|
|
@@ -4143,9 +4187,15 @@ class HermesCLI:
|
|||
else:
|
||||
# Fallback for non-interactive mode (e.g., single-query)
|
||||
agent_thread.join(0.1)
|
||||
|
||||
|
||||
agent_thread.join() # Ensure agent thread completes
|
||||
|
||||
# Signal end-of-text to TTS consumer and wait for it to finish
|
||||
if use_streaming_tts and text_queue is not None:
|
||||
text_queue.put(None) # sentinel
|
||||
if tts_thread is not None:
|
||||
tts_thread.join(timeout=120)
|
||||
|
||||
# Drain any remaining agent output still in the StdoutProxy
|
||||
# buffer so tool/status lines render ABOVE our response box.
|
||||
# The flush pushes data into the renderer queue; the short
|
||||
|
|
@@ -4156,15 +4206,15 @@ class HermesCLI:
|
|||
|
||||
# Update history with full conversation
|
||||
self.conversation_history = result.get("messages", self.conversation_history) if result else self.conversation_history
|
||||
|
||||
|
||||
# Get the final response
|
||||
response = result.get("final_response", "") if result else ""
|
||||
|
||||
|
||||
# Handle failed results (e.g., non-retryable errors like invalid model)
|
||||
if result and result.get("failed") and not response:
|
||||
error_detail = result.get("error", "Unknown error")
|
||||
response = f"Error: {error_detail}"
|
||||
|
||||
|
||||
# Handle interrupt - check if we were interrupted
|
||||
pending_message = None
|
||||
if result and result.get("interrupted"):
|
||||
|
|
@@ -4172,8 +4222,9 @@ class HermesCLI:
|
|||
# Add indicator that we were interrupted
|
||||
if response and pending_message:
|
||||
response = response + "\n\n---\n_[Interrupted - processing new message]_"
|
||||
|
||||
|
||||
response_previewed = result.get("response_previewed", False) if result else False
|
||||
|
||||
# Display reasoning (thinking) box if enabled and available
|
||||
if self.show_reasoning and result:
|
||||
reasoning = result.get("last_reasoning")
|
||||
|
|
@@ -4226,7 +4277,8 @@ class HermesCLI:
|
|||
sys.stdout.flush()
|
||||
|
||||
# Speak response aloud if voice TTS is enabled
|
||||
if self._voice_tts and response:
|
||||
# Skip batch TTS when streaming TTS already handled it
|
||||
if self._voice_tts and response and not use_streaming_tts:
|
||||
threading.Thread(
|
||||
target=self._voice_speak_response,
|
||||
args=(response,),
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue