fix: address voice mode review feedback
1. Fully lazy imports: sounddevice, numpy, elevenlabs, edge_tts, and openai are never imported at module level. Each is imported only when the feature is explicitly activated, preventing crashes in headless environments (SSH, Docker, WSL, no PortAudio).
2. No core agent loop changes: streaming TTS path extracted from _interruptible_api_call() into separate _streaming_api_call() method. The original method is restored to its upstream form.
3. Configurable key binding: push-to-talk key changed from Ctrl+R (conflicts with readline reverse-search) to Ctrl+B by default. Configurable via voice.push_to_talk_key in config.yaml.
4. Environment detection: new detect_audio_environment() function checks for SSH, Docker, WSL, and missing audio devices before enabling voice mode. Auto-disables with clear warnings in incompatible environments.
5. Graceful degradation: every audio touchpoint (sd.play, sd.InputStream, sd.OutputStream) wrapped in try/except with ImportError/OSError handling. Failures produce warnings, not crashes.
This commit is contained in:
parent
143cc68946
commit
b859dfab16
5 changed files with 526 additions and 142 deletions
101
cli.py
101
cli.py
|
|
@ -3779,7 +3779,15 @@ class HermesCLI:
|
||||||
_cprint(f"{_DIM}Voice mode is already enabled.{_RST}")
|
_cprint(f"{_DIM}Voice mode is already enabled.{_RST}")
|
||||||
return
|
return
|
||||||
|
|
||||||
from tools.voice_mode import check_voice_requirements
|
from tools.voice_mode import check_voice_requirements, detect_audio_environment
|
||||||
|
|
||||||
|
# Environment detection -- warn and block in incompatible environments
|
||||||
|
env_check = detect_audio_environment()
|
||||||
|
if not env_check["available"]:
|
||||||
|
_cprint(f"\n{_GOLD}Voice mode unavailable in this environment:{_RST}")
|
||||||
|
for warning in env_check["warnings"]:
|
||||||
|
_cprint(f" {_DIM}{warning}{_RST}")
|
||||||
|
return
|
||||||
|
|
||||||
reqs = check_voice_requirements()
|
reqs = check_voice_requirements()
|
||||||
if not reqs["available"]:
|
if not reqs["available"]:
|
||||||
|
|
@ -3815,8 +3823,14 @@ class HermesCLI:
|
||||||
self.system_prompt = (self.system_prompt or "") + voice_instruction
|
self.system_prompt = (self.system_prompt or "") + voice_instruction
|
||||||
|
|
||||||
tts_status = " (TTS enabled)" if self._voice_tts else ""
|
tts_status = " (TTS enabled)" if self._voice_tts else ""
|
||||||
|
try:
|
||||||
|
from hermes_cli.config import load_config
|
||||||
|
_ptt_key = load_config().get("voice", {}).get("push_to_talk_key", "c-b")
|
||||||
|
except Exception:
|
||||||
|
_ptt_key = "c-b"
|
||||||
|
_ptt_display = _ptt_key.replace("c-", "Ctrl+").upper()
|
||||||
_cprint(f"\n{_GOLD}Voice mode enabled{tts_status}{_RST}")
|
_cprint(f"\n{_GOLD}Voice mode enabled{tts_status}{_RST}")
|
||||||
_cprint(f" {_DIM}Ctrl+R to start/stop recording{_RST}")
|
_cprint(f" {_DIM}{_ptt_display} to start/stop recording{_RST}")
|
||||||
_cprint(f" {_DIM}/voice tts to toggle speech output{_RST}")
|
_cprint(f" {_DIM}/voice tts to toggle speech output{_RST}")
|
||||||
_cprint(f" {_DIM}/voice off to disable voice mode{_RST}")
|
_cprint(f" {_DIM}/voice off to disable voice mode{_RST}")
|
||||||
|
|
||||||
|
|
@ -4804,6 +4818,51 @@ class HermesCLI:
|
||||||
self._should_exit = True
|
self._should_exit = True
|
||||||
event.app.exit()
|
event.app.exit()
|
||||||
|
|
||||||
|
# Voice push-to-talk key: configurable via config.yaml (voice.push_to_talk_key)
|
||||||
|
# Default: Ctrl+B (avoids conflict with Ctrl+R readline reverse-search)
|
||||||
|
try:
|
||||||
|
from hermes_cli.config import load_config
|
||||||
|
_voice_key = load_config().get("voice", {}).get("push_to_talk_key", "c-b")
|
||||||
|
except Exception:
|
||||||
|
_voice_key = "c-b"
|
||||||
|
|
||||||
|
@kb.add(_voice_key)
|
||||||
|
def handle_voice_record(event):
|
||||||
|
"""Toggle voice recording when voice mode is active."""
|
||||||
|
if not cli_ref._voice_mode:
|
||||||
|
return
|
||||||
|
# Always allow STOPPING a recording (even when agent is running)
|
||||||
|
if cli_ref._voice_recording:
|
||||||
|
# Manual stop via the push-to-talk key: stop continuous mode
|
||||||
|
with cli_ref._voice_lock:
|
||||||
|
cli_ref._voice_continuous = False
|
||||||
|
# Flag clearing is handled atomically inside _voice_stop_and_transcribe
|
||||||
|
event.app.invalidate()
|
||||||
|
threading.Thread(
|
||||||
|
target=cli_ref._voice_stop_and_transcribe,
|
||||||
|
daemon=True,
|
||||||
|
).start()
|
||||||
|
else:
|
||||||
|
# Guard: don't START recording during agent run or interactive prompts
|
||||||
|
if cli_ref._agent_running:
|
||||||
|
return
|
||||||
|
if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state:
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
# Interrupt TTS if playing, so user can start talking
|
||||||
|
if not cli_ref._voice_tts_done.is_set():
|
||||||
|
try:
|
||||||
|
from tools.voice_mode import stop_playback
|
||||||
|
stop_playback()
|
||||||
|
cli_ref._voice_tts_done.set()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
with cli_ref._voice_lock:
|
||||||
|
cli_ref._voice_continuous = True
|
||||||
|
cli_ref._voice_start_recording()
|
||||||
|
event.app.invalidate()
|
||||||
|
except Exception as e:
|
||||||
|
_cprint(f"\n{_DIM}Voice recording failed: {e}{_RST}")
|
||||||
from prompt_toolkit.keys import Keys
|
from prompt_toolkit.keys import Keys
|
||||||
|
|
||||||
@kb.add(Keys.BracketedPaste, eager=True)
|
@kb.add(Keys.BracketedPaste, eager=True)
|
||||||
|
|
@ -4850,44 +4909,6 @@ class HermesCLI:
|
||||||
# No image found — show a hint
|
# No image found — show a hint
|
||||||
pass # silent when no image (avoid noise on accidental press)
|
pass # silent when no image (avoid noise on accidental press)
|
||||||
|
|
||||||
@kb.add('c-space')
|
|
||||||
def handle_ctrl_space(event):
|
|
||||||
"""Toggle voice recording when voice mode is active."""
|
|
||||||
if not cli_ref._voice_mode:
|
|
||||||
return
|
|
||||||
# Always allow STOPPING a recording (even when agent is running)
|
|
||||||
if cli_ref._voice_recording:
|
|
||||||
# Manual stop via Ctrl+R: stop continuous mode
|
|
||||||
with cli_ref._voice_lock:
|
|
||||||
cli_ref._voice_continuous = False
|
|
||||||
# Flag clearing is handled atomically inside _voice_stop_and_transcribe
|
|
||||||
event.app.invalidate()
|
|
||||||
threading.Thread(
|
|
||||||
target=cli_ref._voice_stop_and_transcribe,
|
|
||||||
daemon=True,
|
|
||||||
).start()
|
|
||||||
else:
|
|
||||||
# Guard: don't START recording during agent run or interactive prompts
|
|
||||||
if cli_ref._agent_running:
|
|
||||||
return
|
|
||||||
if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state:
|
|
||||||
return
|
|
||||||
try:
|
|
||||||
# Interrupt TTS if playing, so user can start talking
|
|
||||||
if not cli_ref._voice_tts_done.is_set():
|
|
||||||
try:
|
|
||||||
from tools.voice_mode import stop_playback
|
|
||||||
stop_playback()
|
|
||||||
cli_ref._voice_tts_done.set()
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
with cli_ref._voice_lock:
|
|
||||||
cli_ref._voice_continuous = True
|
|
||||||
cli_ref._voice_start_recording()
|
|
||||||
event.app.invalidate()
|
|
||||||
except Exception as e:
|
|
||||||
_cprint(f"\n{_DIM}Voice recording failed: {e}{_RST}")
|
|
||||||
|
|
||||||
# Dynamic prompt: shows Hermes symbol when agent is working,
|
# Dynamic prompt: shows Hermes symbol when agent is working,
|
||||||
# or answer prompt when clarify freetext mode is active.
|
# or answer prompt when clarify freetext mode is active.
|
||||||
cli_ref = self
|
cli_ref = self
|
||||||
|
|
|
||||||
72
run_agent.py
72
run_agent.py
|
|
@ -2590,12 +2590,6 @@ class AIAgent:
|
||||||
On interrupt, closes the HTTP client to cancel the in-flight request
|
On interrupt, closes the HTTP client to cancel the in-flight request
|
||||||
(stops token generation and avoids wasting money), then rebuilds the
|
(stops token generation and avoids wasting money), then rebuilds the
|
||||||
client for future calls.
|
client for future calls.
|
||||||
|
|
||||||
When ``self._stream_callback`` is set (streaming TTS mode), the call
|
|
||||||
uses ``stream=True`` and iterates over chunks inside the background
|
|
||||||
thread. Content deltas are forwarded to the callback in real-time
|
|
||||||
while the full response is accumulated and returned as a
|
|
||||||
``SimpleNamespace`` that mimics a normal ``ChatCompletion``.
|
|
||||||
"""
|
"""
|
||||||
result = {"response": None, "error": None}
|
result = {"response": None, "error": None}
|
||||||
|
|
||||||
|
|
@ -2603,30 +2597,58 @@ class AIAgent:
|
||||||
try:
|
try:
|
||||||
if self.api_mode == "codex_responses":
|
if self.api_mode == "codex_responses":
|
||||||
result["response"] = self._run_codex_stream(api_kwargs)
|
result["response"] = self._run_codex_stream(api_kwargs)
|
||||||
return
|
|
||||||
elif self.api_mode == "anthropic_messages":
|
elif self.api_mode == "anthropic_messages":
|
||||||
result["response"] = self._anthropic_client.messages.create(**api_kwargs)
|
result["response"] = self._anthropic_client.messages.create(**api_kwargs)
|
||||||
return
|
else:
|
||||||
|
|
||||||
cb = getattr(self, "_stream_callback", None)
|
|
||||||
if cb is None:
|
|
||||||
# Non-streaming path (default)
|
|
||||||
result["response"] = self.client.chat.completions.create(**api_kwargs)
|
result["response"] = self.client.chat.completions.create(**api_kwargs)
|
||||||
return
|
except Exception as e:
|
||||||
|
result["error"] = e
|
||||||
|
|
||||||
# --- Streaming path for TTS pipeline ---
|
t = threading.Thread(target=_call, daemon=True)
|
||||||
|
t.start()
|
||||||
|
while t.is_alive():
|
||||||
|
t.join(timeout=0.3)
|
||||||
|
if self._interrupt_requested:
|
||||||
|
# Force-close the HTTP connection to stop token generation
|
||||||
|
try:
|
||||||
|
self.client.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
# Rebuild the client for future calls (cheap, no network)
|
||||||
|
try:
|
||||||
|
self.client = OpenAI(**self._client_kwargs)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
raise InterruptedError("Agent interrupted during API call")
|
||||||
|
if result["error"] is not None:
|
||||||
|
raise result["error"]
|
||||||
|
return result["response"]
|
||||||
|
|
||||||
|
def _streaming_api_call(self, api_kwargs: dict, stream_callback):
|
||||||
|
"""Streaming variant of _interruptible_api_call for voice TTS pipeline.
|
||||||
|
|
||||||
|
Uses ``stream=True`` and forwards content deltas to *stream_callback*
|
||||||
|
in real-time. Returns a ``SimpleNamespace`` that mimics a normal
|
||||||
|
``ChatCompletion`` so the rest of the agent loop works unchanged.
|
||||||
|
|
||||||
|
This method is separate from ``_interruptible_api_call`` to keep the
|
||||||
|
core agent loop untouched for non-voice users.
|
||||||
|
"""
|
||||||
|
result = {"response": None, "error": None}
|
||||||
|
|
||||||
|
def _call():
|
||||||
|
try:
|
||||||
stream_kwargs = {**api_kwargs, "stream": True}
|
stream_kwargs = {**api_kwargs, "stream": True}
|
||||||
stream = self.client.chat.completions.create(**stream_kwargs)
|
stream = self.client.chat.completions.create(**stream_kwargs)
|
||||||
|
|
||||||
content_parts: list[str] = []
|
content_parts: list[str] = []
|
||||||
tool_calls_acc: dict[int, dict] = {} # index -> {id, type, function:{name, arguments}}
|
tool_calls_acc: dict[int, dict] = {}
|
||||||
finish_reason = None
|
finish_reason = None
|
||||||
model_name = None
|
model_name = None
|
||||||
role = "assistant"
|
role = "assistant"
|
||||||
|
|
||||||
for chunk in stream:
|
for chunk in stream:
|
||||||
if not chunk.choices:
|
if not chunk.choices:
|
||||||
# Usage-only or empty chunk
|
|
||||||
if hasattr(chunk, "model") and chunk.model:
|
if hasattr(chunk, "model") and chunk.model:
|
||||||
model_name = chunk.model
|
model_name = chunk.model
|
||||||
continue
|
continue
|
||||||
|
|
@ -2635,24 +2657,17 @@ class AIAgent:
|
||||||
if hasattr(chunk, "model") and chunk.model:
|
if hasattr(chunk, "model") and chunk.model:
|
||||||
model_name = chunk.model
|
model_name = chunk.model
|
||||||
|
|
||||||
# Content delta
|
|
||||||
if delta and delta.content:
|
if delta and delta.content:
|
||||||
content_parts.append(delta.content)
|
content_parts.append(delta.content)
|
||||||
try:
|
try:
|
||||||
cb(delta.content)
|
stream_callback(delta.content)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Tool call deltas
|
|
||||||
if delta and delta.tool_calls:
|
if delta and delta.tool_calls:
|
||||||
for tc_delta in delta.tool_calls:
|
for tc_delta in delta.tool_calls:
|
||||||
idx = tc_delta.index if tc_delta.index is not None else 0
|
idx = tc_delta.index if tc_delta.index is not None else 0
|
||||||
# Gemini may reuse index 0 for multiple tool calls,
|
|
||||||
# sending a new id each time. Detect this and assign
|
|
||||||
# a fresh virtual index so calls don't merge.
|
|
||||||
if idx in tool_calls_acc and tc_delta.id and tc_delta.id != tool_calls_acc[idx]["id"]:
|
if idx in tool_calls_acc and tc_delta.id and tc_delta.id != tool_calls_acc[idx]["id"]:
|
||||||
# Look for existing entry with this id first
|
|
||||||
# (follow-up deltas for an already-created tool call)
|
|
||||||
matched = False
|
matched = False
|
||||||
for eidx, eentry in tool_calls_acc.items():
|
for eidx, eentry in tool_calls_acc.items():
|
||||||
if eentry["id"] == tc_delta.id:
|
if eentry["id"] == tc_delta.id:
|
||||||
|
|
@ -2679,7 +2694,6 @@ class AIAgent:
|
||||||
if chunk.choices[0].finish_reason:
|
if chunk.choices[0].finish_reason:
|
||||||
finish_reason = chunk.choices[0].finish_reason
|
finish_reason = chunk.choices[0].finish_reason
|
||||||
|
|
||||||
# Build a mock ChatCompletion matching the non-streaming interface
|
|
||||||
full_content = "".join(content_parts) or None
|
full_content = "".join(content_parts) or None
|
||||||
mock_tool_calls = None
|
mock_tool_calls = None
|
||||||
if tool_calls_acc:
|
if tool_calls_acc:
|
||||||
|
|
@ -2722,7 +2736,6 @@ class AIAgent:
|
||||||
while t.is_alive():
|
while t.is_alive():
|
||||||
t.join(timeout=0.3)
|
t.join(timeout=0.3)
|
||||||
if self._interrupt_requested:
|
if self._interrupt_requested:
|
||||||
# Force-close the HTTP connection to stop token generation
|
|
||||||
try:
|
try:
|
||||||
if self.api_mode == "anthropic_messages":
|
if self.api_mode == "anthropic_messages":
|
||||||
self._anthropic_client.close()
|
self._anthropic_client.close()
|
||||||
|
|
@ -2730,7 +2743,6 @@ class AIAgent:
|
||||||
self.client.close()
|
self.client.close()
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
# Rebuild the client for future calls (cheap, no network)
|
|
||||||
try:
|
try:
|
||||||
if self.api_mode == "anthropic_messages":
|
if self.api_mode == "anthropic_messages":
|
||||||
from agent.anthropic_adapter import build_anthropic_client
|
from agent.anthropic_adapter import build_anthropic_client
|
||||||
|
|
@ -4412,7 +4424,11 @@ class AIAgent:
|
||||||
if os.getenv("HERMES_DUMP_REQUESTS", "").strip().lower() in {"1", "true", "yes", "on"}:
|
if os.getenv("HERMES_DUMP_REQUESTS", "").strip().lower() in {"1", "true", "yes", "on"}:
|
||||||
self._dump_api_request_debug(api_kwargs, reason="preflight")
|
self._dump_api_request_debug(api_kwargs, reason="preflight")
|
||||||
|
|
||||||
response = self._interruptible_api_call(api_kwargs)
|
cb = getattr(self, "_stream_callback", None)
|
||||||
|
if cb is not None:
|
||||||
|
response = self._streaming_api_call(api_kwargs, cb)
|
||||||
|
else:
|
||||||
|
response = self._interruptible_api_call(api_kwargs)
|
||||||
|
|
||||||
api_duration = time.time() - api_start_time
|
api_duration = time.time() - api_start_time
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -41,16 +41,18 @@ def temp_voice_dir(tmp_path, monkeypatch):
|
||||||
|
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def mock_sd(monkeypatch):
|
def mock_sd(monkeypatch):
|
||||||
"""Replace tools.voice_mode.sd with a MagicMock (sounddevice may not be installed)."""
|
"""Mock _import_audio to return (mock_sd, real_np) so lazy imports work."""
|
||||||
mock = MagicMock()
|
mock = MagicMock()
|
||||||
monkeypatch.setattr("tools.voice_mode.sd", mock)
|
|
||||||
monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True)
|
|
||||||
# Also ensure numpy is available (use real numpy if installed, else mock)
|
|
||||||
try:
|
try:
|
||||||
import numpy as real_np
|
import numpy as real_np
|
||||||
monkeypatch.setattr("tools.voice_mode.np", real_np)
|
|
||||||
except ImportError:
|
except ImportError:
|
||||||
monkeypatch.setattr("tools.voice_mode.np", MagicMock())
|
real_np = MagicMock()
|
||||||
|
|
||||||
|
def _fake_import_audio():
|
||||||
|
return mock, real_np
|
||||||
|
|
||||||
|
monkeypatch.setattr("tools.voice_mode._import_audio", _fake_import_audio)
|
||||||
|
monkeypatch.setattr("tools.voice_mode._audio_available", lambda: True)
|
||||||
return mock
|
return mock
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -60,7 +62,9 @@ def mock_sd(monkeypatch):
|
||||||
|
|
||||||
class TestCheckVoiceRequirements:
|
class TestCheckVoiceRequirements:
|
||||||
def test_all_requirements_met(self, monkeypatch):
|
def test_all_requirements_met(self, monkeypatch):
|
||||||
monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True)
|
monkeypatch.setattr("tools.voice_mode._audio_available", lambda: True)
|
||||||
|
monkeypatch.setattr("tools.voice_mode.detect_audio_environment",
|
||||||
|
lambda: {"available": True, "warnings": []})
|
||||||
monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test-key")
|
monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test-key")
|
||||||
|
|
||||||
from tools.voice_mode import check_voice_requirements
|
from tools.voice_mode import check_voice_requirements
|
||||||
|
|
@ -72,7 +76,9 @@ class TestCheckVoiceRequirements:
|
||||||
assert result["missing_packages"] == []
|
assert result["missing_packages"] == []
|
||||||
|
|
||||||
def test_missing_audio_packages(self, monkeypatch):
|
def test_missing_audio_packages(self, monkeypatch):
|
||||||
monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False)
|
monkeypatch.setattr("tools.voice_mode._audio_available", lambda: False)
|
||||||
|
monkeypatch.setattr("tools.voice_mode.detect_audio_environment",
|
||||||
|
lambda: {"available": False, "warnings": ["Audio libraries not installed"]})
|
||||||
monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test-key")
|
monkeypatch.setenv("VOICE_TOOLS_OPENAI_KEY", "sk-test-key")
|
||||||
|
|
||||||
from tools.voice_mode import check_voice_requirements
|
from tools.voice_mode import check_voice_requirements
|
||||||
|
|
@ -84,7 +90,9 @@ class TestCheckVoiceRequirements:
|
||||||
assert "numpy" in result["missing_packages"]
|
assert "numpy" in result["missing_packages"]
|
||||||
|
|
||||||
def test_missing_stt_key(self, monkeypatch):
|
def test_missing_stt_key(self, monkeypatch):
|
||||||
monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True)
|
monkeypatch.setattr("tools.voice_mode._audio_available", lambda: True)
|
||||||
|
monkeypatch.setattr("tools.voice_mode.detect_audio_environment",
|
||||||
|
lambda: {"available": True, "warnings": []})
|
||||||
monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False)
|
monkeypatch.delenv("VOICE_TOOLS_OPENAI_KEY", raising=False)
|
||||||
monkeypatch.delenv("GROQ_API_KEY", raising=False)
|
monkeypatch.delenv("GROQ_API_KEY", raising=False)
|
||||||
|
|
||||||
|
|
@ -102,7 +110,9 @@ class TestCheckVoiceRequirements:
|
||||||
|
|
||||||
class TestAudioRecorderStart:
|
class TestAudioRecorderStart:
|
||||||
def test_start_raises_without_audio(self, monkeypatch):
|
def test_start_raises_without_audio(self, monkeypatch):
|
||||||
monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False)
|
def _fail_import():
|
||||||
|
raise ImportError("no sounddevice")
|
||||||
|
monkeypatch.setattr("tools.voice_mode._import_audio", _fail_import)
|
||||||
|
|
||||||
from tools.voice_mode import AudioRecorder
|
from tools.voice_mode import AudioRecorder
|
||||||
|
|
||||||
|
|
@ -334,21 +344,25 @@ class TestPlayAudioFile:
|
||||||
def test_play_wav_via_sounddevice(self, monkeypatch, sample_wav):
|
def test_play_wav_via_sounddevice(self, monkeypatch, sample_wav):
|
||||||
np = pytest.importorskip("numpy")
|
np = pytest.importorskip("numpy")
|
||||||
|
|
||||||
mock_sd = MagicMock()
|
mock_sd_obj = MagicMock()
|
||||||
monkeypatch.setattr("tools.voice_mode.sd", mock_sd)
|
|
||||||
monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", True)
|
def _fake_import():
|
||||||
monkeypatch.setattr("tools.voice_mode.np", np)
|
return mock_sd_obj, np
|
||||||
|
|
||||||
|
monkeypatch.setattr("tools.voice_mode._import_audio", _fake_import)
|
||||||
|
|
||||||
from tools.voice_mode import play_audio_file
|
from tools.voice_mode import play_audio_file
|
||||||
|
|
||||||
result = play_audio_file(sample_wav)
|
result = play_audio_file(sample_wav)
|
||||||
|
|
||||||
assert result is True
|
assert result is True
|
||||||
mock_sd.play.assert_called_once()
|
mock_sd_obj.play.assert_called_once()
|
||||||
mock_sd.wait.assert_called_once()
|
mock_sd_obj.wait.assert_called_once()
|
||||||
|
|
||||||
def test_returns_false_when_no_player(self, monkeypatch, sample_wav):
|
def test_returns_false_when_no_player(self, monkeypatch, sample_wav):
|
||||||
monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False)
|
def _fail_import():
|
||||||
|
raise ImportError("no sounddevice")
|
||||||
|
monkeypatch.setattr("tools.voice_mode._import_audio", _fail_import)
|
||||||
monkeypatch.setattr("shutil.which", lambda _: None)
|
monkeypatch.setattr("shutil.which", lambda _: None)
|
||||||
|
|
||||||
from tools.voice_mode import play_audio_file
|
from tools.voice_mode import play_audio_file
|
||||||
|
|
@ -446,7 +460,9 @@ class TestPlayBeep:
|
||||||
assert len(audio_arg) > single_beep_samples
|
assert len(audio_arg) > single_beep_samples
|
||||||
|
|
||||||
def test_beep_noop_without_audio(self, monkeypatch):
|
def test_beep_noop_without_audio(self, monkeypatch):
|
||||||
monkeypatch.setattr("tools.voice_mode._HAS_AUDIO", False)
|
def _fail_import():
|
||||||
|
raise ImportError("no sounddevice")
|
||||||
|
monkeypatch.setattr("tools.voice_mode._import_audio", _fail_import)
|
||||||
|
|
||||||
from tools.voice_mode import play_beep
|
from tools.voice_mode import play_beep
|
||||||
|
|
||||||
|
|
@ -607,3 +623,237 @@ class TestSilenceDetection:
|
||||||
# No crash, no callback
|
# No crash, no callback
|
||||||
assert recorder._on_silence_stop is None
|
assert recorder._on_silence_stop is None
|
||||||
recorder.cancel()
|
recorder.cancel()
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Playback interrupt
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
class TestPlaybackInterrupt:
|
||||||
|
"""Verify that TTS playback can be interrupted."""
|
||||||
|
|
||||||
|
def test_stop_playback_terminates_process(self):
|
||||||
|
from tools.voice_mode import stop_playback, _playback_lock
|
||||||
|
import tools.voice_mode as vm
|
||||||
|
|
||||||
|
mock_proc = MagicMock()
|
||||||
|
mock_proc.poll.return_value = None # process is running
|
||||||
|
|
||||||
|
with _playback_lock:
|
||||||
|
vm._active_playback = mock_proc
|
||||||
|
|
||||||
|
stop_playback()
|
||||||
|
|
||||||
|
mock_proc.terminate.assert_called_once()
|
||||||
|
|
||||||
|
with _playback_lock:
|
||||||
|
assert vm._active_playback is None
|
||||||
|
|
||||||
|
def test_stop_playback_noop_when_nothing_playing(self):
|
||||||
|
import tools.voice_mode as vm
|
||||||
|
|
||||||
|
with vm._playback_lock:
|
||||||
|
vm._active_playback = None
|
||||||
|
|
||||||
|
vm.stop_playback()
|
||||||
|
|
||||||
|
def test_play_audio_file_sets_active_playback(self, monkeypatch, sample_wav):
|
||||||
|
import tools.voice_mode as vm
|
||||||
|
|
||||||
|
def _fail_import():
|
||||||
|
raise ImportError("no sounddevice")
|
||||||
|
monkeypatch.setattr("tools.voice_mode._import_audio", _fail_import)
|
||||||
|
|
||||||
|
mock_proc = MagicMock()
|
||||||
|
mock_proc.wait.return_value = 0
|
||||||
|
|
||||||
|
mock_popen = MagicMock(return_value=mock_proc)
|
||||||
|
monkeypatch.setattr("subprocess.Popen", mock_popen)
|
||||||
|
monkeypatch.setattr("shutil.which", lambda cmd: "/usr/bin/" + cmd)
|
||||||
|
|
||||||
|
vm.play_audio_file(sample_wav)
|
||||||
|
|
||||||
|
assert mock_popen.called
|
||||||
|
with vm._playback_lock:
|
||||||
|
assert vm._active_playback is None
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Continuous mode flow
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
class TestContinuousModeFlow:
|
||||||
|
"""Verify continuous mode: auto-restart after transcription or silence."""
|
||||||
|
|
||||||
|
def test_continuous_restart_on_no_speech(self, mock_sd, temp_voice_dir):
|
||||||
|
np = pytest.importorskip("numpy")
|
||||||
|
|
||||||
|
mock_stream = MagicMock()
|
||||||
|
mock_sd.InputStream.return_value = mock_stream
|
||||||
|
|
||||||
|
from tools.voice_mode import AudioRecorder
|
||||||
|
|
||||||
|
recorder = AudioRecorder()
|
||||||
|
|
||||||
|
# First recording: only silence -> stop returns None
|
||||||
|
recorder.start()
|
||||||
|
callback = mock_sd.InputStream.call_args.kwargs.get("callback")
|
||||||
|
if callback is None:
|
||||||
|
callback = mock_sd.InputStream.call_args[1]["callback"]
|
||||||
|
|
||||||
|
for _ in range(10):
|
||||||
|
silence = np.full((1600, 1), 10, dtype="int16")
|
||||||
|
callback(silence, 1600, None, None)
|
||||||
|
|
||||||
|
wav_path = recorder.stop()
|
||||||
|
assert wav_path is None
|
||||||
|
|
||||||
|
# Simulate continuous mode restart
|
||||||
|
recorder.start()
|
||||||
|
assert recorder.is_recording is True
|
||||||
|
|
||||||
|
callback = mock_sd.InputStream.call_args.kwargs.get("callback")
|
||||||
|
if callback is None:
|
||||||
|
callback = mock_sd.InputStream.call_args[1]["callback"]
|
||||||
|
|
||||||
|
for _ in range(10):
|
||||||
|
speech = np.full((1600, 1), 5000, dtype="int16")
|
||||||
|
callback(speech, 1600, None, None)
|
||||||
|
|
||||||
|
wav_path = recorder.stop()
|
||||||
|
assert wav_path is not None
|
||||||
|
|
||||||
|
recorder.cancel()
|
||||||
|
|
||||||
|
def test_recorder_reusable_after_stop(self, mock_sd, temp_voice_dir):
|
||||||
|
np = pytest.importorskip("numpy")
|
||||||
|
|
||||||
|
mock_stream = MagicMock()
|
||||||
|
mock_sd.InputStream.return_value = mock_stream
|
||||||
|
|
||||||
|
from tools.voice_mode import AudioRecorder
|
||||||
|
|
||||||
|
recorder = AudioRecorder()
|
||||||
|
results = []
|
||||||
|
|
||||||
|
for i in range(3):
|
||||||
|
recorder.start()
|
||||||
|
callback = mock_sd.InputStream.call_args.kwargs.get("callback")
|
||||||
|
if callback is None:
|
||||||
|
callback = mock_sd.InputStream.call_args[1]["callback"]
|
||||||
|
loud = np.full((1600, 1), 5000, dtype="int16")
|
||||||
|
for _ in range(10):
|
||||||
|
callback(loud, 1600, None, None)
|
||||||
|
wav_path = recorder.stop()
|
||||||
|
results.append(wav_path)
|
||||||
|
|
||||||
|
assert all(r is not None for r in results)
|
||||||
|
assert os.path.isfile(results[-1])
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Audio level indicator
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
class TestAudioLevelIndicator:
|
||||||
|
"""Verify current_rms property updates in real-time for UI feedback."""
|
||||||
|
|
||||||
|
def test_rms_updates_with_audio_chunks(self, mock_sd):
|
||||||
|
np = pytest.importorskip("numpy")
|
||||||
|
|
||||||
|
mock_stream = MagicMock()
|
||||||
|
mock_sd.InputStream.return_value = mock_stream
|
||||||
|
|
||||||
|
from tools.voice_mode import AudioRecorder
|
||||||
|
|
||||||
|
recorder = AudioRecorder()
|
||||||
|
recorder.start()
|
||||||
|
callback = mock_sd.InputStream.call_args.kwargs.get("callback")
|
||||||
|
if callback is None:
|
||||||
|
callback = mock_sd.InputStream.call_args[1]["callback"]
|
||||||
|
|
||||||
|
assert recorder.current_rms == 0
|
||||||
|
|
||||||
|
loud = np.full((1600, 1), 5000, dtype="int16")
|
||||||
|
callback(loud, 1600, None, None)
|
||||||
|
assert recorder.current_rms == 5000
|
||||||
|
|
||||||
|
quiet = np.full((1600, 1), 100, dtype="int16")
|
||||||
|
callback(quiet, 1600, None, None)
|
||||||
|
assert recorder.current_rms == 100
|
||||||
|
|
||||||
|
recorder.cancel()
|
||||||
|
|
||||||
|
def test_peak_rms_tracks_maximum(self, mock_sd):
|
||||||
|
np = pytest.importorskip("numpy")
|
||||||
|
|
||||||
|
mock_stream = MagicMock()
|
||||||
|
mock_sd.InputStream.return_value = mock_stream
|
||||||
|
|
||||||
|
from tools.voice_mode import AudioRecorder
|
||||||
|
|
||||||
|
recorder = AudioRecorder()
|
||||||
|
recorder.start()
|
||||||
|
callback = mock_sd.InputStream.call_args.kwargs.get("callback")
|
||||||
|
if callback is None:
|
||||||
|
callback = mock_sd.InputStream.call_args[1]["callback"]
|
||||||
|
|
||||||
|
frames = [
|
||||||
|
np.full((1600, 1), 100, dtype="int16"),
|
||||||
|
np.full((1600, 1), 8000, dtype="int16"),
|
||||||
|
np.full((1600, 1), 500, dtype="int16"),
|
||||||
|
np.full((1600, 1), 3000, dtype="int16"),
|
||||||
|
]
|
||||||
|
for frame in frames:
|
||||||
|
callback(frame, 1600, None, None)
|
||||||
|
|
||||||
|
assert recorder._peak_rms == 8000
|
||||||
|
assert recorder.current_rms == 3000
|
||||||
|
|
||||||
|
recorder.cancel()
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Configurable silence parameters
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
class TestConfigurableSilenceParams:
|
||||||
|
"""Verify that silence detection params can be configured."""
|
||||||
|
|
||||||
|
def test_custom_threshold_and_duration(self, mock_sd):
|
||||||
|
np = pytest.importorskip("numpy")
|
||||||
|
|
||||||
|
mock_stream = MagicMock()
|
||||||
|
mock_sd.InputStream.return_value = mock_stream
|
||||||
|
|
||||||
|
from tools.voice_mode import AudioRecorder
|
||||||
|
import threading
|
||||||
|
|
||||||
|
recorder = AudioRecorder()
|
||||||
|
recorder._silence_threshold = 5000
|
||||||
|
recorder._silence_duration = 0.05
|
||||||
|
recorder._min_speech_duration = 0.05
|
||||||
|
|
||||||
|
fired = threading.Event()
|
||||||
|
recorder.start(on_silence_stop=lambda: fired.set())
|
||||||
|
callback = mock_sd.InputStream.call_args.kwargs.get("callback")
|
||||||
|
if callback is None:
|
||||||
|
callback = mock_sd.InputStream.call_args[1]["callback"]
|
||||||
|
|
||||||
|
# Audio at RMS 1000 -- below custom threshold (5000)
|
||||||
|
moderate = np.full((1600, 1), 1000, dtype="int16")
|
||||||
|
for _ in range(5):
|
||||||
|
callback(moderate, 1600, None, None)
|
||||||
|
time.sleep(0.02)
|
||||||
|
|
||||||
|
assert recorder._has_spoken is False
|
||||||
|
assert fired.wait(timeout=0.2) is False
|
||||||
|
|
||||||
|
# Now send really loud audio (above 5000 threshold)
|
||||||
|
very_loud = np.full((1600, 1), 8000, dtype="int16")
|
||||||
|
callback(very_loud, 1600, None, None)
|
||||||
|
time.sleep(0.06)
|
||||||
|
callback(very_loud, 1600, None, None)
|
||||||
|
assert recorder._has_spoken is True
|
||||||
|
|
||||||
|
recorder.cancel()
|
||||||
|
|
|
||||||
|
|
@ -37,33 +37,29 @@ from typing import Callable, Dict, Any, Optional
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Optional imports -- providers degrade gracefully if not installed
|
# Lazy imports -- providers are imported only when actually used to avoid
|
||||||
|
# crashing in headless environments (SSH, Docker, WSL, no PortAudio).
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
try:
|
|
||||||
|
def _import_edge_tts():
|
||||||
|
"""Lazy import edge_tts. Returns the module or raises ImportError."""
|
||||||
import edge_tts
|
import edge_tts
|
||||||
_HAS_EDGE_TTS = True
|
return edge_tts
|
||||||
except ImportError:
|
|
||||||
_HAS_EDGE_TTS = False
|
|
||||||
|
|
||||||
try:
|
def _import_elevenlabs():
|
||||||
|
"""Lazy import ElevenLabs client. Returns the class or raises ImportError."""
|
||||||
from elevenlabs.client import ElevenLabs
|
from elevenlabs.client import ElevenLabs
|
||||||
_HAS_ELEVENLABS = True
|
return ElevenLabs
|
||||||
except ImportError:
|
|
||||||
_HAS_ELEVENLABS = False
|
|
||||||
|
|
||||||
# openai is a core dependency, but guard anyway
|
def _import_openai_client():
|
||||||
try:
|
"""Lazy import OpenAI client. Returns the class or raises ImportError."""
|
||||||
from openai import OpenAI as OpenAIClient
|
from openai import OpenAI as OpenAIClient
|
||||||
_HAS_OPENAI = True
|
return OpenAIClient
|
||||||
except ImportError:
|
|
||||||
_HAS_OPENAI = False
|
|
||||||
|
|
||||||
try:
|
def _import_sounddevice():
|
||||||
|
"""Lazy import sounddevice. Returns the module or raises ImportError/OSError."""
|
||||||
import sounddevice as sd
|
import sounddevice as sd
|
||||||
_HAS_AUDIO = True
|
return sd
|
||||||
except (ImportError, OSError):
|
|
||||||
sd = None # type: ignore[assignment]
|
|
||||||
_HAS_AUDIO = False
|
|
||||||
|
|
||||||
|
|
||||||
# ===========================================================================
|
# ===========================================================================
|
||||||
|
|
@ -202,6 +198,7 @@ def _generate_elevenlabs(text: str, output_path: str, tts_config: Dict[str, Any]
|
||||||
else:
|
else:
|
||||||
output_format = "mp3_44100_128"
|
output_format = "mp3_44100_128"
|
||||||
|
|
||||||
|
ElevenLabs = _import_elevenlabs()
|
||||||
client = ElevenLabs(api_key=api_key)
|
client = ElevenLabs(api_key=api_key)
|
||||||
audio_generator = client.text_to_speech.convert(
|
audio_generator = client.text_to_speech.convert(
|
||||||
text=text,
|
text=text,
|
||||||
|
|
@ -247,6 +244,7 @@ def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any]
|
||||||
else:
|
else:
|
||||||
response_format = "mp3"
|
response_format = "mp3"
|
||||||
|
|
||||||
|
OpenAIClient = _import_openai_client()
|
||||||
client = OpenAIClient(api_key=api_key, base_url="https://api.openai.com/v1")
|
client = OpenAIClient(api_key=api_key, base_url="https://api.openai.com/v1")
|
||||||
response = client.audio.speech.create(
|
response = client.audio.speech.create(
|
||||||
model=model,
|
model=model,
|
||||||
|
|
@ -322,7 +320,9 @@ def text_to_speech_tool(
|
||||||
try:
|
try:
|
||||||
# Generate audio with the configured provider
|
# Generate audio with the configured provider
|
||||||
if provider == "elevenlabs":
|
if provider == "elevenlabs":
|
||||||
if not _HAS_ELEVENLABS:
|
try:
|
||||||
|
_import_elevenlabs()
|
||||||
|
except ImportError:
|
||||||
return json.dumps({
|
return json.dumps({
|
||||||
"success": False,
|
"success": False,
|
||||||
"error": "ElevenLabs provider selected but 'elevenlabs' package not installed. Run: pip install elevenlabs"
|
"error": "ElevenLabs provider selected but 'elevenlabs' package not installed. Run: pip install elevenlabs"
|
||||||
|
|
@ -331,7 +331,9 @@ def text_to_speech_tool(
|
||||||
_generate_elevenlabs(text, file_str, tts_config)
|
_generate_elevenlabs(text, file_str, tts_config)
|
||||||
|
|
||||||
elif provider == "openai":
|
elif provider == "openai":
|
||||||
if not _HAS_OPENAI:
|
try:
|
||||||
|
_import_openai_client()
|
||||||
|
except ImportError:
|
||||||
return json.dumps({
|
return json.dumps({
|
||||||
"success": False,
|
"success": False,
|
||||||
"error": "OpenAI provider selected but 'openai' package not installed."
|
"error": "OpenAI provider selected but 'openai' package not installed."
|
||||||
|
|
@ -341,7 +343,9 @@ def text_to_speech_tool(
|
||||||
|
|
||||||
else:
|
else:
|
||||||
# Default: Edge TTS (free)
|
# Default: Edge TTS (free)
|
||||||
if not _HAS_EDGE_TTS:
|
try:
|
||||||
|
_import_edge_tts()
|
||||||
|
except ImportError:
|
||||||
return json.dumps({
|
return json.dumps({
|
||||||
"success": False,
|
"success": False,
|
||||||
"error": "Edge TTS not available. Run: pip install edge-tts"
|
"error": "Edge TTS not available. Run: pip install edge-tts"
|
||||||
|
|
@ -422,12 +426,23 @@ def check_tts_requirements() -> bool:
|
||||||
Returns:
|
Returns:
|
||||||
bool: True if at least one provider can work.
|
bool: True if at least one provider can work.
|
||||||
"""
|
"""
|
||||||
if _HAS_EDGE_TTS:
|
try:
|
||||||
return True
|
_import_edge_tts()
|
||||||
if _HAS_ELEVENLABS and os.getenv("ELEVENLABS_API_KEY"):
|
|
||||||
return True
|
|
||||||
if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"):
|
|
||||||
return True
|
return True
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
_import_elevenlabs()
|
||||||
|
if os.getenv("ELEVENLABS_API_KEY"):
|
||||||
|
return True
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
|
try:
|
||||||
|
_import_openai_client()
|
||||||
|
if os.getenv("VOICE_TOOLS_OPENAI_KEY"):
|
||||||
|
return True
|
||||||
|
except ImportError:
|
||||||
|
pass
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -500,20 +515,27 @@ def stream_tts_to_speaker(
|
||||||
api_key = os.getenv("ELEVENLABS_API_KEY", "")
|
api_key = os.getenv("ELEVENLABS_API_KEY", "")
|
||||||
if not api_key:
|
if not api_key:
|
||||||
logger.warning("ELEVENLABS_API_KEY not set; streaming TTS audio disabled")
|
logger.warning("ELEVENLABS_API_KEY not set; streaming TTS audio disabled")
|
||||||
elif _HAS_ELEVENLABS:
|
else:
|
||||||
client = ElevenLabs(api_key=api_key)
|
try:
|
||||||
|
ElevenLabs = _import_elevenlabs()
|
||||||
|
client = ElevenLabs(api_key=api_key)
|
||||||
|
except ImportError:
|
||||||
|
logger.warning("elevenlabs package not installed; streaming TTS disabled")
|
||||||
|
|
||||||
# Open a single sounddevice output stream for the lifetime of
|
# Open a single sounddevice output stream for the lifetime of
|
||||||
# this function. ElevenLabs pcm_24000 produces signed 16-bit
|
# this function. ElevenLabs pcm_24000 produces signed 16-bit
|
||||||
# little-endian mono PCM at 24 kHz.
|
# little-endian mono PCM at 24 kHz.
|
||||||
use_sd = _HAS_AUDIO and sd is not None
|
if client is not None:
|
||||||
if use_sd:
|
|
||||||
try:
|
try:
|
||||||
|
sd = _import_sounddevice()
|
||||||
import numpy as _np
|
import numpy as _np
|
||||||
output_stream = sd.OutputStream(
|
output_stream = sd.OutputStream(
|
||||||
samplerate=24000, channels=1, dtype="int16",
|
samplerate=24000, channels=1, dtype="int16",
|
||||||
)
|
)
|
||||||
output_stream.start()
|
output_stream.start()
|
||||||
|
except (ImportError, OSError) as exc:
|
||||||
|
logger.debug("sounddevice not available: %s", exc)
|
||||||
|
output_stream = None
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.warning("sounddevice OutputStream failed: %s", exc)
|
logger.warning("sounddevice OutputStream failed: %s", exc)
|
||||||
output_stream = None
|
output_stream = None
|
||||||
|
|
@ -666,12 +688,19 @@ if __name__ == "__main__":
|
||||||
print("🔊 Text-to-Speech Tool Module")
|
print("🔊 Text-to-Speech Tool Module")
|
||||||
print("=" * 50)
|
print("=" * 50)
|
||||||
|
|
||||||
|
def _check(importer, label):
|
||||||
|
try:
|
||||||
|
importer()
|
||||||
|
return True
|
||||||
|
except ImportError:
|
||||||
|
return False
|
||||||
|
|
||||||
print(f"\nProvider availability:")
|
print(f"\nProvider availability:")
|
||||||
print(f" Edge TTS: {'✅ installed' if _HAS_EDGE_TTS else '❌ not installed (pip install edge-tts)'}")
|
print(f" Edge TTS: {'installed' if _check(_import_edge_tts, 'edge') else 'not installed (pip install edge-tts)'}")
|
||||||
print(f" ElevenLabs: {'✅ installed' if _HAS_ELEVENLABS else '❌ not installed (pip install elevenlabs)'}")
|
print(f" ElevenLabs: {'installed' if _check(_import_elevenlabs, 'el') else 'not installed (pip install elevenlabs)'}")
|
||||||
print(f" API Key: {'✅ set' if os.getenv('ELEVENLABS_API_KEY') else '❌ not set'}")
|
print(f" API Key: {'set' if os.getenv('ELEVENLABS_API_KEY') else 'not set'}")
|
||||||
print(f" OpenAI: {'✅ installed' if _HAS_OPENAI else '❌ not installed'}")
|
print(f" OpenAI: {'installed' if _check(_import_openai_client, 'oai') else 'not installed'}")
|
||||||
print(f" API Key: {'✅ set' if os.getenv('VOICE_TOOLS_OPENAI_KEY') else '❌ not set (VOICE_TOOLS_OPENAI_KEY)'}")
|
print(f" API Key: {'set' if os.getenv('VOICE_TOOLS_OPENAI_KEY') else 'not set (VOICE_TOOLS_OPENAI_KEY)'}")
|
||||||
print(f" ffmpeg: {'✅ found' if _has_ffmpeg() else '❌ not found (needed for Telegram Opus)'}")
|
print(f" ffmpeg: {'✅ found' if _has_ffmpeg() else '❌ not found (needed for Telegram Opus)'}")
|
||||||
print(f"\n Output dir: {DEFAULT_OUTPUT_DIR}")
|
print(f"\n Output dir: {DEFAULT_OUTPUT_DIR}")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -25,17 +25,69 @@ from typing import Any, Dict, List, Optional
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Optional imports with graceful degradation
|
# Lazy audio imports -- never imported at module level to avoid crashing
|
||||||
|
# in headless environments (SSH, Docker, WSL, no PortAudio).
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
try:
|
|
||||||
|
def _import_audio():
|
||||||
|
"""Lazy-import sounddevice and numpy. Returns (sd, np).
|
||||||
|
|
||||||
|
Raises ImportError or OSError if the libraries are not available
|
||||||
|
(e.g. PortAudio missing on headless servers).
|
||||||
|
"""
|
||||||
import sounddevice as sd
|
import sounddevice as sd
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
return sd, np
|
||||||
|
|
||||||
_HAS_AUDIO = True
|
|
||||||
except (ImportError, OSError):
|
def _audio_available() -> bool:
    """Tell whether the audio libraries import cleanly.

    Returns:
        True when ``_import_audio()`` succeeds, False when it raises
        ImportError or OSError.
    """
    try:
        _import_audio()
    except (ImportError, OSError):
        return False
    return True
|
||||||
|
|
||||||
|
|
||||||
|
def detect_audio_environment() -> dict:
    """Probe the runtime for conditions that rule out audio I/O.

    Checks, in order: SSH session variables, the Docker marker file,
    a WSL kernel banner, and whether the audio libraries load and report
    at least one device.

    Returns:
        A dict with ``'available'`` (True only when no warning was raised)
        and ``'warnings'`` (list of human-readable strings).
    """
    problems = []

    # SSH detection: any one of these variables being set is enough.
    for ssh_var in ('SSH_CLIENT', 'SSH_TTY', 'SSH_CONNECTION'):
        if os.environ.get(ssh_var):
            problems.append("Running over SSH -- no audio devices available")
            break

    # Docker detection
    in_docker = os.path.exists('/.dockerenv')
    if in_docker:
        problems.append("Running inside Docker container -- no audio devices")

    # WSL detection: an unreadable or missing /proc/version counts as "not WSL".
    try:
        with open('/proc/version', 'r') as version_file:
            kernel_banner = version_file.read()
    except OSError:
        kernel_banner = ''
    if 'microsoft' in kernel_banner.lower():
        problems.append("Running in WSL -- audio requires PulseAudio bridge to Windows")

    # Check audio libraries, then ask them for the device list.
    try:
        sd, _ = _import_audio()
    except (ImportError, OSError):
        problems.append("Audio libraries not installed (pip install sounddevice numpy)")
    else:
        try:
            devices = sd.query_devices()
            if not devices:
                problems.append("No audio input/output devices detected")
        except Exception:
            problems.append("Audio subsystem error (PortAudio cannot query devices)")

    return {
        "available": not problems,
        "warnings": problems,
    }
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Recording parameters
|
# Recording parameters
|
||||||
|
|
@ -65,7 +117,9 @@ def play_beep(frequency: int = 880, duration: float = 0.12, count: int = 1) -> N
|
||||||
duration: Duration of each beep in seconds.
|
duration: Duration of each beep in seconds.
|
||||||
count: Number of beeps to play (with short gap between).
|
count: Number of beeps to play (with short gap between).
|
||||||
"""
|
"""
|
||||||
if not _HAS_AUDIO:
|
try:
|
||||||
|
sd, np = _import_audio()
|
||||||
|
except (ImportError, OSError):
|
||||||
return
|
return
|
||||||
try:
|
try:
|
||||||
gap = 0.06 # seconds between beeps
|
gap = 0.06 # seconds between beeps
|
||||||
|
|
@ -161,12 +215,14 @@ class AudioRecorder:
|
||||||
Raises ``RuntimeError`` if sounddevice/numpy are not installed
|
Raises ``RuntimeError`` if sounddevice/numpy are not installed
|
||||||
or if a recording is already in progress.
|
or if a recording is already in progress.
|
||||||
"""
|
"""
|
||||||
if not _HAS_AUDIO:
|
try:
|
||||||
|
sd, np = _import_audio()
|
||||||
|
except (ImportError, OSError) as e:
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
"Voice mode requires sounddevice and numpy.\n"
|
"Voice mode requires sounddevice and numpy.\n"
|
||||||
"Install with: pip install sounddevice numpy\n"
|
"Install with: pip install sounddevice numpy\n"
|
||||||
"Or: pip install hermes-agent[voice]"
|
"Or: pip install hermes-agent[voice]"
|
||||||
)
|
) from e
|
||||||
|
|
||||||
with self._lock:
|
with self._lock:
|
||||||
if self._recording:
|
if self._recording:
|
||||||
|
|
@ -269,6 +325,7 @@ class AudioRecorder:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Concatenate frames and write WAV
|
# Concatenate frames and write WAV
|
||||||
|
_, np = _import_audio()
|
||||||
audio_data = np.concatenate(self._frames, axis=0)
|
audio_data = np.concatenate(self._frames, axis=0)
|
||||||
self._frames = []
|
self._frames = []
|
||||||
|
|
||||||
|
|
@ -434,11 +491,11 @@ def stop_playback() -> None:
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
# Also stop sounddevice playback if active
|
# Also stop sounddevice playback if active
|
||||||
if _HAS_AUDIO:
|
try:
|
||||||
try:
|
sd, _ = _import_audio()
|
||||||
sd.stop()
|
sd.stop()
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def play_audio_file(file_path: str) -> bool:
|
def play_audio_file(file_path: str) -> bool:
|
||||||
|
|
@ -461,8 +518,9 @@ def play_audio_file(file_path: str) -> bool:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Try sounddevice for WAV files
|
# Try sounddevice for WAV files
|
||||||
if _HAS_AUDIO and file_path.endswith(".wav"):
|
if file_path.endswith(".wav"):
|
||||||
try:
|
try:
|
||||||
|
sd, np = _import_audio()
|
||||||
with wave.open(file_path, "rb") as wf:
|
with wave.open(file_path, "rb") as wf:
|
||||||
frames = wf.readframes(wf.getnframes())
|
frames = wf.readframes(wf.getnframes())
|
||||||
audio_data = np.frombuffer(frames, dtype=np.int16)
|
audio_data = np.frombuffer(frames, dtype=np.int16)
|
||||||
|
|
@ -471,6 +529,8 @@ def play_audio_file(file_path: str) -> bool:
|
||||||
sd.play(audio_data, samplerate=sample_rate)
|
sd.play(audio_data, samplerate=sample_rate)
|
||||||
sd.wait()
|
sd.wait()
|
||||||
return True
|
return True
|
||||||
|
except (ImportError, OSError):
|
||||||
|
pass # audio libs not available, fall through to system players
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug("sounddevice playback failed: %s", e)
|
logger.debug("sounddevice playback failed: %s", e)
|
||||||
|
|
||||||
|
|
@ -518,14 +578,18 @@ def check_voice_requirements() -> Dict[str, Any]:
|
||||||
groq_key = bool(os.getenv("GROQ_API_KEY"))
|
groq_key = bool(os.getenv("GROQ_API_KEY"))
|
||||||
stt_key_set = openai_key or groq_key
|
stt_key_set = openai_key or groq_key
|
||||||
missing: List[str] = []
|
missing: List[str] = []
|
||||||
|
has_audio = _audio_available()
|
||||||
|
|
||||||
if not _HAS_AUDIO:
|
if not has_audio:
|
||||||
missing.extend(["sounddevice", "numpy"])
|
missing.extend(["sounddevice", "numpy"])
|
||||||
|
|
||||||
available = _HAS_AUDIO and stt_key_set
|
# Environment detection
|
||||||
|
env_check = detect_audio_environment()
|
||||||
|
|
||||||
|
available = has_audio and stt_key_set and env_check["available"]
|
||||||
details_parts = []
|
details_parts = []
|
||||||
|
|
||||||
if _HAS_AUDIO:
|
if has_audio:
|
||||||
details_parts.append("Audio capture: OK")
|
details_parts.append("Audio capture: OK")
|
||||||
else:
|
else:
|
||||||
details_parts.append("Audio capture: MISSING (pip install sounddevice numpy)")
|
details_parts.append("Audio capture: MISSING (pip install sounddevice numpy)")
|
||||||
|
|
@ -537,12 +601,16 @@ def check_voice_requirements() -> Dict[str, Any]:
|
||||||
else:
|
else:
|
||||||
details_parts.append("STT API key: MISSING (set GROQ_API_KEY or VOICE_TOOLS_OPENAI_KEY)")
|
details_parts.append("STT API key: MISSING (set GROQ_API_KEY or VOICE_TOOLS_OPENAI_KEY)")
|
||||||
|
|
||||||
|
for warning in env_check["warnings"]:
|
||||||
|
details_parts.append(f"Environment: {warning}")
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"available": available,
|
"available": available,
|
||||||
"audio_available": _HAS_AUDIO,
|
"audio_available": has_audio,
|
||||||
"stt_key_set": stt_key_set,
|
"stt_key_set": stt_key_set,
|
||||||
"missing_packages": missing,
|
"missing_packages": missing,
|
||||||
"details": "\n".join(details_parts),
|
"details": "\n".join(details_parts),
|
||||||
|
"environment": env_check,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue