feat: sync text display with TTS audio playback

Move screen output from stream_callback to a display_callback that is invoked
by the TTS consumer thread. Text now appears sentence by sentence, in sync with
the audio, instead of streaming ahead at LLM speed. Removes the quiet_mode hack.
This commit is contained in:
0xbyt4 2026-03-06 00:58:29 +03:00
parent a15fa85248
commit 7d4b4e95f1
2 changed files with 64 additions and 36 deletions

43
cli.py
View file

@ -4099,6 +4099,7 @@ class HermesCLI:
# we stream audio sentence-by-sentence as the agent generates tokens # we stream audio sentence-by-sentence as the agent generates tokens
# instead of waiting for the full response. # instead of waiting for the full response.
use_streaming_tts = False use_streaming_tts = False
_streaming_box_opened = False
text_queue = None text_queue = None
tts_thread = None tts_thread = None
stream_callback = None stream_callback = None
@ -4123,9 +4124,21 @@ class HermesCLI:
text_queue = queue.Queue() text_queue = queue.Queue()
stop_event = threading.Event() stop_event = threading.Event()
def display_callback(sentence: str):
"""Called by TTS consumer when a sentence is ready to display + speak."""
nonlocal _streaming_box_opened
if not _streaming_box_opened:
_streaming_box_opened = True
w = self.console.width
label = " ⚕ Hermes "
fill = w - 2 - len(label)
_cprint(f"\n{_GOLD}╭─{label}{'' * max(fill - 1, 0)}{_RST}")
_cprint(sentence.rstrip())
tts_thread = threading.Thread( tts_thread = threading.Thread(
target=stream_tts_to_speaker, target=stream_tts_to_speaker,
args=(text_queue, stop_event, self._voice_tts_done), args=(text_queue, stop_event, self._voice_tts_done),
kwargs={"display_callback": display_callback},
daemon=True, daemon=True,
) )
tts_thread.start() tts_thread.start()
@ -4244,8 +4257,7 @@ class HermesCLI:
_cprint(f"\n{r_top}\n{_DIM}{display_reasoning}{_RST}\n{r_bot}") _cprint(f"\n{r_top}\n{_DIM}{display_reasoning}{_RST}\n{r_bot}")
if response and not response_previewed: if response and not response_previewed:
# Use a Rich Panel for the response box — adapts to terminal # Use skin engine for label/color with fallback
# width at render time instead of hard-coding border length.
try: try:
from hermes_cli.skin_engine import get_active_skin from hermes_cli.skin_engine import get_active_skin
_skin = get_active_skin() _skin = get_active_skin()
@ -4257,17 +4269,22 @@ class HermesCLI:
_resp_color = "#CD7F32" _resp_color = "#CD7F32"
_resp_text = "#FFF8DC" _resp_text = "#FFF8DC"
_chat_console = ChatConsole() is_error_response = result and (result.get("failed") or result.get("partial"))
_chat_console.print(Panel( if use_streaming_tts and _streaming_box_opened and not is_error_response:
_rich_text_from_ansi(response), # Text was already printed sentence-by-sentence; just close the box
title=f"[{_resp_color} bold]{label}[/]", w = shutil.get_terminal_size().columns
title_align="left", _cprint(f"\n{_GOLD}{'' * (w - 2)}{_RST}")
border_style=_resp_color, else:
style=_resp_text, _chat_console = ChatConsole()
box=rich_box.HORIZONTALS, _chat_console.print(Panel(
padding=(1, 2), _rich_text_from_ansi(response),
)) title=f"[{_resp_color} bold]{label}[/]",
title_align="left",
border_style=_resp_color,
style=_resp_text,
box=rich_box.HORIZONTALS,
padding=(1, 2),
))
# Play terminal bell when agent finishes (if enabled). # Play terminal bell when agent finishes (if enabled).

View file

@ -32,7 +32,7 @@ import subprocess
import tempfile import tempfile
import threading import threading
from pathlib import Path from pathlib import Path
from typing import Dict, Any, Optional from typing import Callable, Dict, Any, Optional
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -469,6 +469,7 @@ def stream_tts_to_speaker(
text_queue: queue.Queue, text_queue: queue.Queue,
stop_event: threading.Event, stop_event: threading.Event,
tts_done_event: threading.Event, tts_done_event: threading.Event,
display_callback: Optional[Callable[[str], None]] = None,
): ):
"""Consume text deltas from *text_queue*, buffer them into sentences, """Consume text deltas from *text_queue*, buffer them into sentences,
and stream each sentence through ElevenLabs TTS to the speaker in and stream each sentence through ElevenLabs TTS to the speaker in
@ -484,34 +485,38 @@ def stream_tts_to_speaker(
tts_done_event.clear() tts_done_event.clear()
try: try:
# --- TTS client setup (optional -- display_callback works without it) ---
client = None
output_stream = None
voice_id = DEFAULT_ELEVENLABS_VOICE_ID
model_id = DEFAULT_ELEVENLABS_STREAMING_MODEL_ID
tts_config = _load_tts_config() tts_config = _load_tts_config()
el_config = tts_config.get("elevenlabs", {}) el_config = tts_config.get("elevenlabs", {})
voice_id = el_config.get("voice_id", DEFAULT_ELEVENLABS_VOICE_ID) voice_id = el_config.get("voice_id", voice_id)
model_id = el_config.get("streaming_model_id", model_id = el_config.get("streaming_model_id",
el_config.get("model_id", DEFAULT_ELEVENLABS_STREAMING_MODEL_ID)) el_config.get("model_id", model_id))
api_key = os.getenv("ELEVENLABS_API_KEY", "") api_key = os.getenv("ELEVENLABS_API_KEY", "")
if not api_key: if not api_key:
logger.warning("ELEVENLABS_API_KEY not set; streaming TTS disabled") logger.warning("ELEVENLABS_API_KEY not set; streaming TTS audio disabled")
return elif _HAS_ELEVENLABS:
client = ElevenLabs(api_key=api_key)
client = ElevenLabs(api_key=api_key) # Open a single sounddevice output stream for the lifetime of
# this function. ElevenLabs pcm_24000 produces signed 16-bit
# Open a single sounddevice output stream for the lifetime of # little-endian mono PCM at 24 kHz.
# this function. ElevenLabs pcm_24000 produces signed 16-bit use_sd = _HAS_AUDIO and sd is not None
# little-endian mono PCM at 24 kHz. if use_sd:
use_sd = _HAS_AUDIO and sd is not None try:
output_stream = None import numpy as _np
if use_sd: output_stream = sd.OutputStream(
try: samplerate=24000, channels=1, dtype="int16",
import numpy as _np )
output_stream = sd.OutputStream( output_stream.start()
samplerate=24000, channels=1, dtype="int16", except Exception as exc:
) logger.warning("sounddevice OutputStream failed: %s", exc)
output_stream.start() output_stream = None
except Exception as exc:
logger.warning("sounddevice OutputStream failed: %s", exc)
output_stream = None
sentence_buf = "" sentence_buf = ""
in_think = False # track <think>...</think> blocks in_think = False # track <think>...</think> blocks
@ -520,12 +525,18 @@ def stream_tts_to_speaker(
queue_timeout = 0.5 queue_timeout = 0.5
def _speak_sentence(sentence: str): def _speak_sentence(sentence: str):
"""Generate and play audio for a single sentence.""" """Display sentence and optionally generate + play audio."""
if stop_event.is_set(): if stop_event.is_set():
return return
cleaned = _strip_markdown_for_tts(sentence).strip() cleaned = _strip_markdown_for_tts(sentence).strip()
if not cleaned: if not cleaned:
return return
# Display raw sentence on screen before TTS processing
if display_callback is not None:
display_callback(sentence)
# Skip audio generation if no TTS client available
if client is None:
return
# Truncate very long sentences # Truncate very long sentences
if len(cleaned) > MAX_TEXT_LENGTH: if len(cleaned) > MAX_TEXT_LENGTH:
cleaned = cleaned[:MAX_TEXT_LENGTH] cleaned = cleaned[:MAX_TEXT_LENGTH]