fix: address voice mode review feedback

1. Fully lazy imports: sounddevice, numpy, elevenlabs, edge_tts, and openai are never imported at module level. Each is imported only when the feature is explicitly activated, preventing crashes in headless environments (SSH, Docker, WSL, no PortAudio).
2. No core agent loop changes: the streaming TTS path is extracted from _interruptible_api_call() into a separate _streaming_api_call() method. The original method is restored to its upstream form.
3. Configurable key binding: the push-to-talk key is changed from Ctrl+R (which conflicts with readline reverse-search) to Ctrl+B by default. It is configurable via voice.push_to_talk_key in config.yaml.
4. Environment detection: a new detect_audio_environment() function checks for SSH, Docker, WSL, and missing audio devices before enabling voice mode. Voice mode auto-disables with clear warnings in incompatible environments.
5. Graceful degradation: every audio touchpoint (sd.play, sd.InputStream, sd.OutputStream) is wrapped in try/except with ImportError/OSError handling. Failures produce warnings, not crashes.
2026-03-09 12:48:49 +03:00 · 2026-03-09 12:48:49 +03:00 · b859dfab16
commit b859dfab16
parent 143cc68946
5 changed files with 526 additions and 142 deletions
--- a/tools/tts_tool.py
+++ b/tools/tts_tool.py
@ -37,33 +37,29 @@ from typing import Callable, Dict, Any, Optional
 logger = logging.getLogger(__name__)

 # ---------------------------------------------------------------------------
-# Optional imports -- providers degrade gracefully if not installed
+# Lazy imports -- providers are imported only when actually used to avoid
+# crashing in headless environments (SSH, Docker, WSL, no PortAudio).
 # ---------------------------------------------------------------------------
-try:
+
+def _import_edge_tts():
+    """Lazy import edge_tts. Returns the module or raises ImportError."""
    import edge_tts
-    _HAS_EDGE_TTS = True
-except ImportError:
-    _HAS_EDGE_TTS = False
+    return edge_tts

-try:
+def _import_elevenlabs():
+    """Lazy import ElevenLabs client. Returns the class or raises ImportError."""
    from elevenlabs.client import ElevenLabs
-    _HAS_ELEVENLABS = True
-except ImportError:
-    _HAS_ELEVENLABS = False
+    return ElevenLabs

-# openai is a core dependency, but guard anyway
-try:
+def _import_openai_client():
+    """Lazy import OpenAI client. Returns the class or raises ImportError."""
    from openai import OpenAI as OpenAIClient
-    _HAS_OPENAI = True
-except ImportError:
-    _HAS_OPENAI = False
+    return OpenAIClient

-try:
+def _import_sounddevice():
+    """Lazy import sounddevice. Returns the module or raises ImportError/OSError."""
    import sounddevice as sd
-    _HAS_AUDIO = True
-except (ImportError, OSError):
-    sd = None  # type: ignore[assignment]
-    _HAS_AUDIO = False
+    return sd


 # ===========================================================================
@ -202,6 +198,7 @@ def _generate_elevenlabs(text: str, output_path: str, tts_config: Dict[str, Any]
    else:
        output_format = "mp3_44100_128"

+    ElevenLabs = _import_elevenlabs()
    client = ElevenLabs(api_key=api_key)
    audio_generator = client.text_to_speech.convert(
        text=text,
@ -247,6 +244,7 @@ def _generate_openai_tts(text: str, output_path: str, tts_config: Dict[str, Any]
    else:
        response_format = "mp3"

+    OpenAIClient = _import_openai_client()
    client = OpenAIClient(api_key=api_key, base_url="https://api.openai.com/v1")
    response = client.audio.speech.create(
        model=model,
@ -322,7 +320,9 @@ def text_to_speech_tool(
    try:
        # Generate audio with the configured provider
        if provider == "elevenlabs":
-            if not _HAS_ELEVENLABS:
+            try:
+                _import_elevenlabs()
+            except ImportError:
                return json.dumps({
                    "success": False,
                    "error": "ElevenLabs provider selected but 'elevenlabs' package not installed. Run: pip install elevenlabs"
@ -331,7 +331,9 @@ def text_to_speech_tool(
            _generate_elevenlabs(text, file_str, tts_config)

        elif provider == "openai":
-            if not _HAS_OPENAI:
+            try:
+                _import_openai_client()
+            except ImportError:
                return json.dumps({
                    "success": False,
                    "error": "OpenAI provider selected but 'openai' package not installed."
@ -341,7 +343,9 @@ def text_to_speech_tool(

        else:
            # Default: Edge TTS (free)
-            if not _HAS_EDGE_TTS:
+            try:
+                _import_edge_tts()
+            except ImportError:
                return json.dumps({
                    "success": False,
                    "error": "Edge TTS not available. Run: pip install edge-tts"
@ -422,12 +426,23 @@ def check_tts_requirements() -> bool:
    Returns:
        bool: True if at least one provider can work.
    """
-    if _HAS_EDGE_TTS:
-        return True
-    if _HAS_ELEVENLABS and os.getenv("ELEVENLABS_API_KEY"):
-        return True
-    if _HAS_OPENAI and os.getenv("VOICE_TOOLS_OPENAI_KEY"):
+    try:
+        _import_edge_tts()
        return True
+    except ImportError:
+        pass
+    try:
+        _import_elevenlabs()
+        if os.getenv("ELEVENLABS_API_KEY"):
+            return True
+    except ImportError:
+        pass
+    try:
+        _import_openai_client()
+        if os.getenv("VOICE_TOOLS_OPENAI_KEY"):
+            return True
+    except ImportError:
+        pass
    return False


@ -500,20 +515,27 @@ def stream_tts_to_speaker(
        api_key = os.getenv("ELEVENLABS_API_KEY", "")
        if not api_key:
            logger.warning("ELEVENLABS_API_KEY not set; streaming TTS audio disabled")
-        elif _HAS_ELEVENLABS:
-            client = ElevenLabs(api_key=api_key)
+        else:
+            try:
+                ElevenLabs = _import_elevenlabs()
+                client = ElevenLabs(api_key=api_key)
+            except ImportError:
+                logger.warning("elevenlabs package not installed; streaming TTS disabled")

            # Open a single sounddevice output stream for the lifetime of
            # this function.  ElevenLabs pcm_24000 produces signed 16-bit
            # little-endian mono PCM at 24 kHz.
-            use_sd = _HAS_AUDIO and sd is not None
-            if use_sd:
+            if client is not None:
                try:
+                    sd = _import_sounddevice()
                    import numpy as _np
                    output_stream = sd.OutputStream(
                        samplerate=24000, channels=1, dtype="int16",
                    )
                    output_stream.start()
+                except (ImportError, OSError) as exc:
+                    logger.debug("sounddevice not available: %s", exc)
+                    output_stream = None
                except Exception as exc:
                    logger.warning("sounddevice OutputStream failed: %s", exc)
                    output_stream = None
@ -666,12 +688,19 @@ if __name__ == "__main__":
    print("🔊 Text-to-Speech Tool Module")
    print("=" * 50)

+    def _check(importer, label):
+        try:
+            importer()
+            return True
+        except ImportError:
+            return False
+
    print(f"\nProvider availability:")
-    print(f"  Edge TTS:   {'✅ installed' if _HAS_EDGE_TTS else '❌ not installed (pip install edge-tts)'}")
-    print(f"  ElevenLabs: {'✅ installed' if _HAS_ELEVENLABS else '❌ not installed (pip install elevenlabs)'}")
-    print(f"    API Key:  {'✅ set' if os.getenv('ELEVENLABS_API_KEY') else '❌ not set'}")
-    print(f"  OpenAI:     {'✅ installed' if _HAS_OPENAI else '❌ not installed'}")
-    print(f"    API Key:  {'✅ set' if os.getenv('VOICE_TOOLS_OPENAI_KEY') else '❌ not set (VOICE_TOOLS_OPENAI_KEY)'}")
+    print(f"  Edge TTS:   {'installed' if _check(_import_edge_tts, 'edge') else 'not installed (pip install edge-tts)'}")
+    print(f"  ElevenLabs: {'installed' if _check(_import_elevenlabs, 'el') else 'not installed (pip install elevenlabs)'}")
+    print(f"    API Key:  {'set' if os.getenv('ELEVENLABS_API_KEY') else 'not set'}")
+    print(f"  OpenAI:     {'installed' if _check(_import_openai_client, 'oai') else 'not installed'}")
+    print(f"    API Key:  {'set' if os.getenv('VOICE_TOOLS_OPENAI_KEY') else 'not set (VOICE_TOOLS_OPENAI_KEY)'}")
    print(f"  ffmpeg:     {'✅ found' if _has_ffmpeg() else '❌ not found (needed for Telegram Opus)'}")
    print(f"\n  Output dir: {DEFAULT_OUTPUT_DIR}")

--- a/tools/voice_mode.py
+++ b/tools/voice_mode.py
@ -25,17 +25,69 @@ from typing import Any, Dict, List, Optional
 logger = logging.getLogger(__name__)

 # ---------------------------------------------------------------------------
-# Optional imports with graceful degradation
+# Lazy audio imports -- never imported at module level to avoid crashing
+# in headless environments (SSH, Docker, WSL, no PortAudio).
 # ---------------------------------------------------------------------------
-try:
+
+def _import_audio():
+    """Lazy-import sounddevice and numpy.  Returns (sd, np).
+
+    Raises ImportError or OSError if the libraries are not available
+    (e.g. PortAudio missing on headless servers).
+    """
    import sounddevice as sd
    import numpy as np
+    return sd, np

-    _HAS_AUDIO = True
-except (ImportError, OSError):
-    sd = None  # type: ignore[assignment]
-    np = None  # type: ignore[assignment]
-    _HAS_AUDIO = False
+
+def _audio_available() -> bool:
+    """Return True if audio libraries can be imported."""
+    try:
+        _import_audio()
+        return True
+    except (ImportError, OSError):
+        return False
+
+
+def detect_audio_environment() -> dict:
+    """Detect if the current environment supports audio I/O.
+
+    Returns dict with 'available' (bool) and 'warnings' (list of strings).
+    """
+    warnings = []
+
+    # SSH detection
+    if any(os.environ.get(v) for v in ('SSH_CLIENT', 'SSH_TTY', 'SSH_CONNECTION')):
+        warnings.append("Running over SSH -- no audio devices available")
+
+    # Docker detection
+    if os.path.exists('/.dockerenv'):
+        warnings.append("Running inside Docker container -- no audio devices")
+
+    # WSL detection
+    try:
+        with open('/proc/version', 'r') as f:
+            if 'microsoft' in f.read().lower():
+                warnings.append("Running in WSL -- audio requires PulseAudio bridge to Windows")
+    except (FileNotFoundError, PermissionError, OSError):
+        pass
+
+    # Check audio libraries
+    try:
+        sd, _ = _import_audio()
+        try:
+            devices = sd.query_devices()
+            if not devices:
+                warnings.append("No audio input/output devices detected")
+        except Exception:
+            warnings.append("Audio subsystem error (PortAudio cannot query devices)")
+    except (ImportError, OSError):
+        warnings.append("Audio libraries not installed (pip install sounddevice numpy)")
+
+    return {
+        "available": len(warnings) == 0,
+        "warnings": warnings,
+    }

 # ---------------------------------------------------------------------------
 # Recording parameters
@ -65,7 +117,9 @@ def play_beep(frequency: int = 880, duration: float = 0.12, count: int = 1) -> N
        duration: Duration of each beep in seconds.
        count: Number of beeps to play (with short gap between).
    """
-    if not _HAS_AUDIO:
+    try:
+        sd, np = _import_audio()
+    except (ImportError, OSError):
        return
    try:
        gap = 0.06  # seconds between beeps
@ -161,12 +215,14 @@ class AudioRecorder:
        Raises ``RuntimeError`` if sounddevice/numpy are not installed
        or if a recording is already in progress.
        """
-        if not _HAS_AUDIO:
+        try:
+            sd, np = _import_audio()
+        except (ImportError, OSError) as e:
            raise RuntimeError(
                "Voice mode requires sounddevice and numpy.\n"
                "Install with: pip install sounddevice numpy\n"
                "Or: pip install hermes-agent[voice]"
-            )
+            ) from e

        with self._lock:
            if self._recording:
@ -269,6 +325,7 @@ class AudioRecorder:
                return None

            # Concatenate frames and write WAV
+            _, np = _import_audio()
            audio_data = np.concatenate(self._frames, axis=0)
            self._frames = []

@ -434,11 +491,11 @@ def stop_playback() -> None:
        except Exception:
            pass
    # Also stop sounddevice playback if active
-    if _HAS_AUDIO:
-        try:
-            sd.stop()
-        except Exception:
-            pass
+    try:
+        sd, _ = _import_audio()
+        sd.stop()
+    except Exception:
+        pass


 def play_audio_file(file_path: str) -> bool:
@ -461,8 +518,9 @@ def play_audio_file(file_path: str) -> bool:
        return False

    # Try sounddevice for WAV files
-    if _HAS_AUDIO and file_path.endswith(".wav"):
+    if file_path.endswith(".wav"):
        try:
+            sd, np = _import_audio()
            with wave.open(file_path, "rb") as wf:
                frames = wf.readframes(wf.getnframes())
                audio_data = np.frombuffer(frames, dtype=np.int16)
@ -471,6 +529,8 @@ def play_audio_file(file_path: str) -> bool:
            sd.play(audio_data, samplerate=sample_rate)
            sd.wait()
            return True
+        except (ImportError, OSError):
+            pass  # audio libs not available, fall through to system players
        except Exception as e:
            logger.debug("sounddevice playback failed: %s", e)

@ -518,14 +578,18 @@ def check_voice_requirements() -> Dict[str, Any]:
    groq_key = bool(os.getenv("GROQ_API_KEY"))
    stt_key_set = openai_key or groq_key
    missing: List[str] = []
+    has_audio = _audio_available()

-    if not _HAS_AUDIO:
+    if not has_audio:
        missing.extend(["sounddevice", "numpy"])

-    available = _HAS_AUDIO and stt_key_set
+    # Environment detection
+    env_check = detect_audio_environment()
+
+    available = has_audio and stt_key_set and env_check["available"]
    details_parts = []

-    if _HAS_AUDIO:
+    if has_audio:
        details_parts.append("Audio capture: OK")
    else:
        details_parts.append("Audio capture: MISSING (pip install sounddevice numpy)")
@ -537,12 +601,16 @@ def check_voice_requirements() -> Dict[str, Any]:
    else:
        details_parts.append("STT API key: MISSING (set GROQ_API_KEY or VOICE_TOOLS_OPENAI_KEY)")

+    for warning in env_check["warnings"]:
+        details_parts.append(f"Environment: {warning}")
+
    return {
        "available": available,
-        "audio_available": _HAS_AUDIO,
+        "audio_available": has_audio,
        "stt_key_set": stt_key_set,
        "missing_packages": missing,
        "details": "\n".join(details_parts),
+        "environment": env_check,
    }