feat: add streaming sentence-by-sentence TTS via ElevenLabs

Stream audio to speaker as the agent generates tokens instead of
waiting for the full response. First sentence plays within ~1-2s
of agent starting to respond.

- run_agent: add stream_callback to run_conversation/chat, streaming
  path in _interruptible_api_call accumulates chunks into mock
  ChatCompletion while forwarding content deltas to callback
- tts_tool: add stream_tts_to_speaker() with sentence buffering,
  think block filtering, markdown stripping, ElevenLabs pcm_24000
  streaming to sounddevice OutputStream
- cli: wire up streaming TTS pipeline in chat(), detect elevenlabs
  provider + sounddevice availability, skip batch TTS when streaming
  is active, signal stop on interrupt

Falls back to batch TTS for Edge/OpenAI providers or when
elevenlabs/sounddevice are not available. Zero impact on non-voice
mode (callback defaults to None).
This commit is contained in:
0xbyt4 2026-03-03 23:03:42 +03:00
parent d7425343ee
commit 179d9e1a22
3 changed files with 410 additions and 18 deletions

View file

@ -2576,10 +2576,16 @@ class AIAgent:
"""
Run the API call in a background thread so the main conversation loop
can detect interrupts without waiting for the full HTTP round-trip.
On interrupt, closes the HTTP client to cancel the in-flight request
(stops token generation and avoids wasting money), then rebuilds the
client for future calls.
When ``self._stream_callback`` is set (streaming TTS mode), the call
uses ``stream=True`` and iterates over chunks inside the background
thread. Content deltas are forwarded to the callback in real-time
while the full response is accumulated and returned as a
``SimpleNamespace`` that mimics a normal ``ChatCompletion``.
"""
result = {"response": None, "error": None}
@ -2587,10 +2593,103 @@ class AIAgent:
try:
if self.api_mode == "codex_responses":
result["response"] = self._run_codex_stream(api_kwargs)
return
elif self.api_mode == "anthropic_messages":
result["response"] = self._anthropic_client.messages.create(**api_kwargs)
else:
return
cb = getattr(self, "_stream_callback", None)
if cb is None:
# Non-streaming path (default)
result["response"] = self.client.chat.completions.create(**api_kwargs)
return
# --- Streaming path for TTS pipeline ---
stream_kwargs = {**api_kwargs, "stream": True}
stream = self.client.chat.completions.create(**stream_kwargs)
content_parts: list[str] = []
tool_calls_acc: dict[int, dict] = {} # index -> {id, type, function:{name, arguments}}
finish_reason = None
model_name = None
role = "assistant"
for chunk in stream:
if not chunk.choices:
# Usage-only or empty chunk
if hasattr(chunk, "model") and chunk.model:
model_name = chunk.model
continue
delta = chunk.choices[0].delta
if hasattr(chunk, "model") and chunk.model:
model_name = chunk.model
# Content delta
if delta and delta.content:
content_parts.append(delta.content)
try:
cb(delta.content)
except Exception:
pass
# Tool call deltas
if delta and delta.tool_calls:
for tc_delta in delta.tool_calls:
idx = tc_delta.index
if idx not in tool_calls_acc:
tool_calls_acc[idx] = {
"id": tc_delta.id or "",
"type": "function",
"function": {"name": "", "arguments": ""},
}
entry = tool_calls_acc[idx]
if tc_delta.id:
entry["id"] = tc_delta.id
if tc_delta.function:
if tc_delta.function.name:
entry["function"]["name"] += tc_delta.function.name
if tc_delta.function.arguments:
entry["function"]["arguments"] += tc_delta.function.arguments
if chunk.choices[0].finish_reason:
finish_reason = chunk.choices[0].finish_reason
# Build a mock ChatCompletion matching the non-streaming interface
full_content = "".join(content_parts) or None
mock_tool_calls = None
if tool_calls_acc:
mock_tool_calls = []
for idx in sorted(tool_calls_acc):
tc = tool_calls_acc[idx]
mock_tool_calls.append(SimpleNamespace(
id=tc["id"],
type=tc["type"],
function=SimpleNamespace(
name=tc["function"]["name"],
arguments=tc["function"]["arguments"],
),
))
mock_message = SimpleNamespace(
role=role,
content=full_content,
tool_calls=mock_tool_calls,
reasoning_content=None,
)
mock_choice = SimpleNamespace(
index=0,
message=mock_message,
finish_reason=finish_reason or "stop",
)
mock_response = SimpleNamespace(
id="stream-" + str(uuid.uuid4()),
model=model_name,
choices=[mock_choice],
usage=None,
)
result["response"] = mock_response
except Exception as e:
result["error"] = e
@ -3915,7 +4014,8 @@ class AIAgent:
user_message: str,
system_message: str = None,
conversation_history: List[Dict[str, Any]] = None,
task_id: str = None
task_id: str = None,
stream_callback: Optional[callable] = None,
) -> Dict[str, Any]:
"""
Run a complete conversation with tool calling until completion.
@ -3925,6 +4025,9 @@ class AIAgent:
system_message (str): Custom system message (optional, overrides ephemeral_system_prompt if provided)
conversation_history (List[Dict]): Previous conversation messages (optional)
task_id (str): Unique identifier for this task to isolate VMs between concurrent tasks (optional, auto-generated if not provided)
stream_callback: Optional callback invoked with each text delta during streaming.
Used by the TTS pipeline to start audio generation before the full response.
When None (default), API calls use the standard non-streaming path.
Returns:
Dict: Complete conversation result with final response and message history
@ -3933,6 +4036,8 @@ class AIAgent:
# Installed once, transparent when streams are healthy, prevents crash on write.
_install_safe_stdio()
# Store stream callback for _interruptible_api_call to pick up
self._stream_callback = stream_callback
# Generate unique task_id if not provided to isolate VMs between concurrent tasks
effective_task_id = task_id or str(uuid.uuid4())
@ -5377,20 +5482,24 @@ class AIAgent:
# Clear interrupt state after handling
self.clear_interrupt()
# Clear stream callback so it doesn't leak into future calls
self._stream_callback = None
return result
def chat(self, message: str, stream_callback: Optional[callable] = None) -> str:
    """
    Simple chat interface that returns just the final response.

    Thin wrapper around ``run_conversation`` that discards the message
    history and returns only the assistant's final text.

    Args:
        message (str): User message.
        stream_callback: Optional callback invoked with each text delta during
            streaming. Forwarded to ``run_conversation`` so the TTS pipeline can
            start audio generation before the full response arrives. When None
            (default), API calls use the standard non-streaming path.

    Returns:
        str: Final assistant response.
    """
    result = self.run_conversation(message, stream_callback=stream_callback)
    return result["final_response"]