feat: sync text display with TTS audio playback
Move screen output from stream_callback to a display_callback that is called by the TTS consumer thread. Text now appears sentence-by-sentence, in sync with the audio, instead of streaming ahead at LLM speed. Remove the quiet_mode hack.
This commit is contained in:
parent
a15fa85248
commit
7d4b4e95f1
2 changed files with 64 additions and 36 deletions
43
cli.py
43
cli.py
|
|
@ -4099,6 +4099,7 @@ class HermesCLI:
|
||||||
# we stream audio sentence-by-sentence as the agent generates tokens
|
# we stream audio sentence-by-sentence as the agent generates tokens
|
||||||
# instead of waiting for the full response.
|
# instead of waiting for the full response.
|
||||||
use_streaming_tts = False
|
use_streaming_tts = False
|
||||||
|
_streaming_box_opened = False
|
||||||
text_queue = None
|
text_queue = None
|
||||||
tts_thread = None
|
tts_thread = None
|
||||||
stream_callback = None
|
stream_callback = None
|
||||||
|
|
@ -4123,9 +4124,21 @@ class HermesCLI:
|
||||||
text_queue = queue.Queue()
|
text_queue = queue.Queue()
|
||||||
stop_event = threading.Event()
|
stop_event = threading.Event()
|
||||||
|
|
||||||
|
def display_callback(sentence: str):
|
||||||
|
"""Called by TTS consumer when a sentence is ready to display + speak."""
|
||||||
|
nonlocal _streaming_box_opened
|
||||||
|
if not _streaming_box_opened:
|
||||||
|
_streaming_box_opened = True
|
||||||
|
w = self.console.width
|
||||||
|
label = " ⚕ Hermes "
|
||||||
|
fill = w - 2 - len(label)
|
||||||
|
_cprint(f"\n{_GOLD}╭─{label}{'─' * max(fill - 1, 0)}╮{_RST}")
|
||||||
|
_cprint(sentence.rstrip())
|
||||||
|
|
||||||
tts_thread = threading.Thread(
|
tts_thread = threading.Thread(
|
||||||
target=stream_tts_to_speaker,
|
target=stream_tts_to_speaker,
|
||||||
args=(text_queue, stop_event, self._voice_tts_done),
|
args=(text_queue, stop_event, self._voice_tts_done),
|
||||||
|
kwargs={"display_callback": display_callback},
|
||||||
daemon=True,
|
daemon=True,
|
||||||
)
|
)
|
||||||
tts_thread.start()
|
tts_thread.start()
|
||||||
|
|
@ -4244,8 +4257,7 @@ class HermesCLI:
|
||||||
_cprint(f"\n{r_top}\n{_DIM}{display_reasoning}{_RST}\n{r_bot}")
|
_cprint(f"\n{r_top}\n{_DIM}{display_reasoning}{_RST}\n{r_bot}")
|
||||||
|
|
||||||
if response and not response_previewed:
|
if response and not response_previewed:
|
||||||
# Use a Rich Panel for the response box — adapts to terminal
|
# Use skin engine for label/color with fallback
|
||||||
# width at render time instead of hard-coding border length.
|
|
||||||
try:
|
try:
|
||||||
from hermes_cli.skin_engine import get_active_skin
|
from hermes_cli.skin_engine import get_active_skin
|
||||||
_skin = get_active_skin()
|
_skin = get_active_skin()
|
||||||
|
|
@ -4257,17 +4269,22 @@ class HermesCLI:
|
||||||
_resp_color = "#CD7F32"
|
_resp_color = "#CD7F32"
|
||||||
_resp_text = "#FFF8DC"
|
_resp_text = "#FFF8DC"
|
||||||
|
|
||||||
_chat_console = ChatConsole()
|
is_error_response = result and (result.get("failed") or result.get("partial"))
|
||||||
_chat_console.print(Panel(
|
if use_streaming_tts and _streaming_box_opened and not is_error_response:
|
||||||
_rich_text_from_ansi(response),
|
# Text was already printed sentence-by-sentence; just close the box
|
||||||
title=f"[{_resp_color} bold]{label}[/]",
|
w = shutil.get_terminal_size().columns
|
||||||
title_align="left",
|
_cprint(f"\n{_GOLD}╰{'─' * (w - 2)}╯{_RST}")
|
||||||
border_style=_resp_color,
|
else:
|
||||||
style=_resp_text,
|
_chat_console = ChatConsole()
|
||||||
box=rich_box.HORIZONTALS,
|
_chat_console.print(Panel(
|
||||||
padding=(1, 2),
|
_rich_text_from_ansi(response),
|
||||||
))
|
title=f"[{_resp_color} bold]{label}[/]",
|
||||||
|
title_align="left",
|
||||||
|
border_style=_resp_color,
|
||||||
|
style=_resp_text,
|
||||||
|
box=rich_box.HORIZONTALS,
|
||||||
|
padding=(1, 2),
|
||||||
|
))
|
||||||
|
|
||||||
|
|
||||||
# Play terminal bell when agent finishes (if enabled).
|
# Play terminal bell when agent finishes (if enabled).
|
||||||
|
|
|
||||||
|
|
@ -32,7 +32,7 @@ import subprocess
|
||||||
import tempfile
|
import tempfile
|
||||||
import threading
|
import threading
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Dict, Any, Optional
|
from typing import Callable, Dict, Any, Optional
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -469,6 +469,7 @@ def stream_tts_to_speaker(
|
||||||
text_queue: queue.Queue,
|
text_queue: queue.Queue,
|
||||||
stop_event: threading.Event,
|
stop_event: threading.Event,
|
||||||
tts_done_event: threading.Event,
|
tts_done_event: threading.Event,
|
||||||
|
display_callback: Optional[Callable[[str], None]] = None,
|
||||||
):
|
):
|
||||||
"""Consume text deltas from *text_queue*, buffer them into sentences,
|
"""Consume text deltas from *text_queue*, buffer them into sentences,
|
||||||
and stream each sentence through ElevenLabs TTS to the speaker in
|
and stream each sentence through ElevenLabs TTS to the speaker in
|
||||||
|
|
@ -484,34 +485,38 @@ def stream_tts_to_speaker(
|
||||||
tts_done_event.clear()
|
tts_done_event.clear()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
# --- TTS client setup (optional -- display_callback works without it) ---
|
||||||
|
client = None
|
||||||
|
output_stream = None
|
||||||
|
voice_id = DEFAULT_ELEVENLABS_VOICE_ID
|
||||||
|
model_id = DEFAULT_ELEVENLABS_STREAMING_MODEL_ID
|
||||||
|
|
||||||
tts_config = _load_tts_config()
|
tts_config = _load_tts_config()
|
||||||
el_config = tts_config.get("elevenlabs", {})
|
el_config = tts_config.get("elevenlabs", {})
|
||||||
voice_id = el_config.get("voice_id", DEFAULT_ELEVENLABS_VOICE_ID)
|
voice_id = el_config.get("voice_id", voice_id)
|
||||||
model_id = el_config.get("streaming_model_id",
|
model_id = el_config.get("streaming_model_id",
|
||||||
el_config.get("model_id", DEFAULT_ELEVENLABS_STREAMING_MODEL_ID))
|
el_config.get("model_id", model_id))
|
||||||
|
|
||||||
api_key = os.getenv("ELEVENLABS_API_KEY", "")
|
api_key = os.getenv("ELEVENLABS_API_KEY", "")
|
||||||
if not api_key:
|
if not api_key:
|
||||||
logger.warning("ELEVENLABS_API_KEY not set; streaming TTS disabled")
|
logger.warning("ELEVENLABS_API_KEY not set; streaming TTS audio disabled")
|
||||||
return
|
elif _HAS_ELEVENLABS:
|
||||||
|
client = ElevenLabs(api_key=api_key)
|
||||||
|
|
||||||
client = ElevenLabs(api_key=api_key)
|
# Open a single sounddevice output stream for the lifetime of
|
||||||
|
# this function. ElevenLabs pcm_24000 produces signed 16-bit
|
||||||
# Open a single sounddevice output stream for the lifetime of
|
# little-endian mono PCM at 24 kHz.
|
||||||
# this function. ElevenLabs pcm_24000 produces signed 16-bit
|
use_sd = _HAS_AUDIO and sd is not None
|
||||||
# little-endian mono PCM at 24 kHz.
|
if use_sd:
|
||||||
use_sd = _HAS_AUDIO and sd is not None
|
try:
|
||||||
output_stream = None
|
import numpy as _np
|
||||||
if use_sd:
|
output_stream = sd.OutputStream(
|
||||||
try:
|
samplerate=24000, channels=1, dtype="int16",
|
||||||
import numpy as _np
|
)
|
||||||
output_stream = sd.OutputStream(
|
output_stream.start()
|
||||||
samplerate=24000, channels=1, dtype="int16",
|
except Exception as exc:
|
||||||
)
|
logger.warning("sounddevice OutputStream failed: %s", exc)
|
||||||
output_stream.start()
|
output_stream = None
|
||||||
except Exception as exc:
|
|
||||||
logger.warning("sounddevice OutputStream failed: %s", exc)
|
|
||||||
output_stream = None
|
|
||||||
|
|
||||||
sentence_buf = ""
|
sentence_buf = ""
|
||||||
in_think = False # track <think>...</think> blocks
|
in_think = False # track <think>...</think> blocks
|
||||||
|
|
@ -520,12 +525,18 @@ def stream_tts_to_speaker(
|
||||||
queue_timeout = 0.5
|
queue_timeout = 0.5
|
||||||
|
|
||||||
def _speak_sentence(sentence: str):
|
def _speak_sentence(sentence: str):
|
||||||
"""Generate and play audio for a single sentence."""
|
"""Display sentence and optionally generate + play audio."""
|
||||||
if stop_event.is_set():
|
if stop_event.is_set():
|
||||||
return
|
return
|
||||||
cleaned = _strip_markdown_for_tts(sentence).strip()
|
cleaned = _strip_markdown_for_tts(sentence).strip()
|
||||||
if not cleaned:
|
if not cleaned:
|
||||||
return
|
return
|
||||||
|
# Display raw sentence on screen before TTS processing
|
||||||
|
if display_callback is not None:
|
||||||
|
display_callback(sentence)
|
||||||
|
# Skip audio generation if no TTS client available
|
||||||
|
if client is None:
|
||||||
|
return
|
||||||
# Truncate very long sentences
|
# Truncate very long sentences
|
||||||
if len(cleaned) > MAX_TEXT_LENGTH:
|
if len(cleaned) > MAX_TEXT_LENGTH:
|
||||||
cleaned = cleaned[:MAX_TEXT_LENGTH]
|
cleaned = cleaned[:MAX_TEXT_LENGTH]
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue