feat: add streaming sentence-by-sentence TTS via ElevenLabs

Stream audio to speaker as the agent generates tokens instead of
waiting for the full response. First sentence plays within ~1-2s
of agent starting to respond.

- run_agent: add stream_callback to run_conversation/chat, streaming
  path in _interruptible_api_call accumulates chunks into mock
  ChatCompletion while forwarding content deltas to callback
- tts_tool: add stream_tts_to_speaker() with sentence buffering,
  think block filtering, markdown stripping, ElevenLabs pcm_24000
  streaming to sounddevice OutputStream
- cli: wire up streaming TTS pipeline in chat(), detect elevenlabs
  provider + sounddevice availability, skip batch TTS when streaming
  is active, signal stop on interrupt

Falls back to batch TTS for Edge/OpenAI providers or when
elevenlabs/sounddevice are not available. Zero impact on non-voice
mode (callback defaults to None).
This commit is contained in:
0xbyt4 2026-03-03 23:03:42 +03:00
parent d7425343ee
commit 179d9e1a22
3 changed files with 410 additions and 18 deletions

View file

@ -2576,10 +2576,16 @@ class AIAgent:
"""
Run the API call in a background thread so the main conversation loop
can detect interrupts without waiting for the full HTTP round-trip.
On interrupt, closes the HTTP client to cancel the in-flight request
(stops token generation and avoids wasting money), then rebuilds the
client for future calls.
When ``self._stream_callback`` is set (streaming TTS mode), the call
uses ``stream=True`` and iterates over chunks inside the background
thread. Content deltas are forwarded to the callback in real-time
while the full response is accumulated and returned as a
``SimpleNamespace`` that mimics a normal ``ChatCompletion``.
"""
result = {"response": None, "error": None}
@ -2587,10 +2593,103 @@ class AIAgent:
try:
if self.api_mode == "codex_responses":
result["response"] = self._run_codex_stream(api_kwargs)
return
elif self.api_mode == "anthropic_messages":
result["response"] = self._anthropic_client.messages.create(**api_kwargs)
else:
return
cb = getattr(self, "_stream_callback", None)
if cb is None:
# Non-streaming path (default)
result["response"] = self.client.chat.completions.create(**api_kwargs)
return
# --- Streaming path for TTS pipeline ---
stream_kwargs = {**api_kwargs, "stream": True}
stream = self.client.chat.completions.create(**stream_kwargs)
content_parts: list[str] = []
tool_calls_acc: dict[int, dict] = {} # index -> {id, type, function:{name, arguments}}
finish_reason = None
model_name = None
role = "assistant"
for chunk in stream:
if not chunk.choices:
# Usage-only or empty chunk
if hasattr(chunk, "model") and chunk.model:
model_name = chunk.model
continue
delta = chunk.choices[0].delta
if hasattr(chunk, "model") and chunk.model:
model_name = chunk.model
# Content delta
if delta and delta.content:
content_parts.append(delta.content)
try:
cb(delta.content)
except Exception:
pass
# Tool call deltas
if delta and delta.tool_calls:
for tc_delta in delta.tool_calls:
idx = tc_delta.index
if idx not in tool_calls_acc:
tool_calls_acc[idx] = {
"id": tc_delta.id or "",
"type": "function",
"function": {"name": "", "arguments": ""},
}
entry = tool_calls_acc[idx]
if tc_delta.id:
entry["id"] = tc_delta.id
if tc_delta.function:
if tc_delta.function.name:
entry["function"]["name"] += tc_delta.function.name
if tc_delta.function.arguments:
entry["function"]["arguments"] += tc_delta.function.arguments
if chunk.choices[0].finish_reason:
finish_reason = chunk.choices[0].finish_reason
# Build a mock ChatCompletion matching the non-streaming interface
full_content = "".join(content_parts) or None
mock_tool_calls = None
if tool_calls_acc:
mock_tool_calls = []
for idx in sorted(tool_calls_acc):
tc = tool_calls_acc[idx]
mock_tool_calls.append(SimpleNamespace(
id=tc["id"],
type=tc["type"],
function=SimpleNamespace(
name=tc["function"]["name"],
arguments=tc["function"]["arguments"],
),
))
mock_message = SimpleNamespace(
role=role,
content=full_content,
tool_calls=mock_tool_calls,
reasoning_content=None,
)
mock_choice = SimpleNamespace(
index=0,
message=mock_message,
finish_reason=finish_reason or "stop",
)
mock_response = SimpleNamespace(
id="stream-" + str(uuid.uuid4()),
model=model_name,
choices=[mock_choice],
usage=None,
)
result["response"] = mock_response
except Exception as e:
result["error"] = e
@ -3915,7 +4014,8 @@ class AIAgent:
user_message: str,
system_message: str = None,
conversation_history: List[Dict[str, Any]] = None,
task_id: str = None
task_id: str = None,
stream_callback: Optional[callable] = None,
) -> Dict[str, Any]:
"""
Run a complete conversation with tool calling until completion.
@ -3925,6 +4025,9 @@ class AIAgent:
system_message (str): Custom system message (optional, overrides ephemeral_system_prompt if provided)
conversation_history (List[Dict]): Previous conversation messages (optional)
task_id (str): Unique identifier for this task to isolate VMs between concurrent tasks (optional, auto-generated if not provided)
stream_callback: Optional callback invoked with each text delta during streaming.
Used by the TTS pipeline to start audio generation before the full response.
When None (default), API calls use the standard non-streaming path.
Returns:
Dict: Complete conversation result with final response and message history
@ -3933,6 +4036,8 @@ class AIAgent:
# Installed once, transparent when streams are healthy, prevents crash on write.
_install_safe_stdio()
# Store stream callback for _interruptible_api_call to pick up
self._stream_callback = stream_callback
# Generate unique task_id if not provided to isolate VMs between concurrent tasks
effective_task_id = task_id or str(uuid.uuid4())
@ -5377,20 +5482,24 @@ class AIAgent:
# Clear interrupt state after handling
self.clear_interrupt()
# Clear stream callback so it doesn't leak into future calls
self._stream_callback = None
return result
def chat(self, message: str, stream_callback: Optional[callable] = None) -> str:
    """
    Simple chat interface that returns just the final response.

    Thin wrapper around ``run_conversation`` that discards the message
    history and returns only the assistant's final text.

    Args:
        message (str): User message.
        stream_callback: Optional callback invoked with each text delta during
            streaming. Forwarded to ``run_conversation`` so the TTS pipeline can
            start audio generation before the full response arrives. When None
            (default), API calls use the standard non-streaming path.

    Returns:
        str: Final assistant response.
    """
    result = self.run_conversation(message, stream_callback=stream_callback)
    return result["final_response"]