fix: persistent audio stream and silence detection improvements

- Keep InputStream alive across recordings to avoid CoreAudio hang on
  repeated open/close cycles on macOS.  New _ensure_stream() creates the
  stream once; start()/stop()/cancel() only toggle frame collection.
- Add _close_stream_with_timeout() with daemon thread to prevent
  stream.stop()/close() from blocking indefinitely.
- Add generation counter to detect stale stream-open completions after
  cancel or restart.
- Run recorder.cancel() in background thread from Ctrl+C handler to
  keep the event loop responsive.
- Add shutdown() method called on /voice off to release audio resources.
- Fix silence timer reset during active speech: use dip tolerance for
  _resume_start tracker so natural speech pauses (< 0.3s) don't prevent
  the silence timer from being reset.
- Update tests to match persistent stream behavior.
This commit is contained in:
0xbyt4 2026-03-10 20:37:17 +03:00
parent eec04d180a
commit eb79dda04b
4 changed files with 221 additions and 132 deletions

31
cli.py
View file

@ -3848,14 +3848,26 @@ class HermesCLI:
def _disable_voice_mode(self): def _disable_voice_mode(self):
"""Disable voice mode, cancel any active recording, and stop TTS.""" """Disable voice mode, cancel any active recording, and stop TTS."""
recorder = None
with self._voice_lock: with self._voice_lock:
if self._voice_recording and self._voice_recorder: if self._voice_recording and self._voice_recorder:
self._voice_recorder.cancel() self._voice_recorder.cancel()
self._voice_recording = False self._voice_recording = False
recorder = self._voice_recorder
self._voice_mode = False self._voice_mode = False
self._voice_tts = False self._voice_tts = False
self._voice_continuous = False self._voice_continuous = False
# Shut down the persistent audio stream in background
if recorder is not None:
def _bg_shutdown(rec=recorder):
try:
rec.shutdown()
except Exception:
pass
threading.Thread(target=_bg_shutdown, daemon=True).start()
self._voice_recorder = None
# Stop any active TTS playback # Stop any active TTS playback
try: try:
from tools.voice_mode import stop_playback from tools.voice_mode import stop_playback
@ -4799,15 +4811,24 @@ class HermesCLI:
import time as _time import time as _time
now = _time.time() now = _time.time()
# Cancel active voice recording # Cancel active voice recording.
# Run cancel() in a background thread to prevent blocking the
# event loop if AudioRecorder._lock or CoreAudio takes time.
_should_cancel_voice = False
_recorder_ref = None
with cli_ref._voice_lock: with cli_ref._voice_lock:
if cli_ref._voice_recording and cli_ref._voice_recorder: if cli_ref._voice_recording and cli_ref._voice_recorder:
cli_ref._voice_recorder.cancel() _recorder_ref = cli_ref._voice_recorder
cli_ref._voice_recording = False cli_ref._voice_recording = False
cli_ref._voice_continuous = False cli_ref._voice_continuous = False
_cprint(f"\n{_DIM}Recording cancelled.{_RST}") _should_cancel_voice = True
event.app.invalidate() if _should_cancel_voice:
return _cprint(f"\n{_DIM}Recording cancelled.{_RST}")
threading.Thread(
target=_recorder_ref.cancel, daemon=True
).start()
event.app.invalidate()
return
# Cancel sudo prompt # Cancel sudo prompt
if self._sudo_state: if self._sudo_state:

View file

@ -603,28 +603,14 @@ class TestDisableVoiceModeStopsTTS:
def test_disable_voice_mode_calls_stop_playback(self): def test_disable_voice_mode_calls_stop_playback(self):
"""Source check: _disable_voice_mode must call stop_playback().""" """Source check: _disable_voice_mode must call stop_playback()."""
with open("cli.py") as f: import inspect
source = f.read() from cli import HermesCLI
# Extract _disable_voice_mode method body source = inspect.getsource(HermesCLI._disable_voice_mode)
lines = source.split("\n") assert "stop_playback" in source, (
in_method = False
method_lines = []
for line in lines:
if "def _disable_voice_mode" in line:
in_method = True
elif in_method:
if line.strip() and not line.startswith(" ") and not line.startswith("\t"):
break
if line.strip().startswith("def "):
break
method_lines.append(line)
method_body = "\n".join(method_lines)
assert "stop_playback" in method_body, (
"_disable_voice_mode must call stop_playback()" "_disable_voice_mode must call stop_playback()"
) )
assert "_voice_tts_done.set()" in method_body, ( assert "_voice_tts_done.set()" in source, (
"_disable_voice_mode must set _voice_tts_done" "_disable_voice_mode must set _voice_tts_done"
) )

View file

@ -235,8 +235,9 @@ class TestAudioRecorderCancel:
assert recorder.is_recording is False assert recorder.is_recording is False
assert recorder._frames == [] assert recorder._frames == []
mock_stream.stop.assert_called_once() # Stream is kept alive (persistent) — cancel() does NOT close it.
mock_stream.close.assert_called_once() mock_stream.stop.assert_not_called()
mock_stream.close.assert_not_called()
def test_cancel_when_not_recording_is_safe(self): def test_cancel_when_not_recording_is_safe(self):
from tools.voice_mode import AudioRecorder from tools.voice_mode import AudioRecorder

View file

@ -175,6 +175,9 @@ class AudioRecorder:
self._frames: List[Any] = [] self._frames: List[Any] = []
self._recording = False self._recording = False
self._start_time: float = 0.0 self._start_time: float = 0.0
# Generation counter — incremented on each start/cancel/stop to
# detect stale stream-open completions after a cancel or restart.
self._generation: int = 0
# Silence detection state # Silence detection state
self._has_spoken = False self._has_spoken = False
self._speech_start: float = 0.0 # When speech attempt began self._speech_start: float = 0.0 # When speech attempt began
@ -182,6 +185,8 @@ class AudioRecorder:
self._min_speech_duration: float = 0.3 # Seconds of speech needed to confirm self._min_speech_duration: float = 0.3 # Seconds of speech needed to confirm
self._max_dip_tolerance: float = 0.3 # Max dip duration before resetting speech self._max_dip_tolerance: float = 0.3 # Max dip duration before resetting speech
self._silence_start: float = 0.0 self._silence_start: float = 0.0
self._resume_start: float = 0.0 # Tracks sustained speech after silence starts
self._resume_dip_start: float = 0.0 # Dip tolerance tracker for resume detection
self._on_silence_stop = None self._on_silence_stop = None
self._silence_threshold: int = SILENCE_RMS_THRESHOLD self._silence_threshold: int = SILENCE_RMS_THRESHOLD
self._silence_duration: float = SILENCE_DURATION_SECONDS self._silence_duration: float = SILENCE_DURATION_SECONDS
@ -210,9 +215,137 @@ class AudioRecorder:
# -- public methods ------------------------------------------------------ # -- public methods ------------------------------------------------------
def _ensure_stream(self) -> None:
"""Create the audio InputStream once and keep it alive.
The stream stays open for the lifetime of the recorder. Between
recordings the callback simply discards audio chunks (``_recording``
is ``False``). This avoids the CoreAudio bug where closing and
re-opening an ``InputStream`` hangs indefinitely on macOS.
"""
if self._stream is not None:
return # already alive
sd, np = _import_audio()
def _callback(indata, frames, time_info, status): # noqa: ARG001
if status:
logger.debug("sounddevice status: %s", status)
# When not recording the stream is idle — discard audio.
if not self._recording:
return
self._frames.append(indata.copy())
# Compute RMS for level display and silence detection
rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2)))
self._current_rms = rms
if rms > self._peak_rms:
self._peak_rms = rms
# Silence detection
if self._on_silence_stop is not None:
now = time.monotonic()
elapsed = now - self._start_time
if rms > self._silence_threshold:
# Audio is above threshold -- this is speech (or noise).
self._dip_start = 0.0 # Reset dip tracker
if self._speech_start == 0.0:
self._speech_start = now
elif not self._has_spoken and now - self._speech_start >= self._min_speech_duration:
self._has_spoken = True
logger.debug("Speech confirmed (%.2fs above threshold)",
now - self._speech_start)
# After speech is confirmed, only reset silence timer if
# speech is sustained (>0.3s above threshold). Brief
# spikes from ambient noise should NOT reset the timer.
if not self._has_spoken:
self._silence_start = 0.0
else:
# Track resumed speech with dip tolerance.
# Brief dips below threshold are normal during speech,
# so we mirror the initial speech detection pattern:
# start tracking, tolerate short dips, confirm after 0.3s.
self._resume_dip_start = 0.0 # Above threshold — no dip
if self._resume_start == 0.0:
self._resume_start = now
elif now - self._resume_start >= self._min_speech_duration:
self._silence_start = 0.0
self._resume_start = 0.0
elif self._has_spoken:
# Below threshold after speech confirmed.
# Use dip tolerance before resetting resume tracker —
# natural speech has brief dips below threshold.
if self._resume_start > 0:
if self._resume_dip_start == 0.0:
self._resume_dip_start = now
elif now - self._resume_dip_start >= self._max_dip_tolerance:
# Sustained dip — user actually stopped speaking
self._resume_start = 0.0
self._resume_dip_start = 0.0
elif self._speech_start > 0:
# We were in a speech attempt but RMS dipped.
# Tolerate brief dips (micro-pauses between syllables).
if self._dip_start == 0.0:
self._dip_start = now
elif now - self._dip_start >= self._max_dip_tolerance:
# Dip lasted too long -- genuine silence, reset
logger.debug("Speech attempt reset (dip lasted %.2fs)",
now - self._dip_start)
self._speech_start = 0.0
self._dip_start = 0.0
# Fire silence callback when:
# 1. User spoke then went silent for silence_duration, OR
# 2. No speech detected at all for max_wait seconds
should_fire = False
if self._has_spoken and rms <= self._silence_threshold:
# User was speaking and now is silent
if self._silence_start == 0.0:
self._silence_start = now
elif now - self._silence_start >= self._silence_duration:
logger.info("Silence detected (%.1fs), auto-stopping",
self._silence_duration)
should_fire = True
elif not self._has_spoken and elapsed >= self._max_wait:
logger.info("No speech within %.0fs, auto-stopping",
self._max_wait)
should_fire = True
if should_fire:
cb = self._on_silence_stop
self._on_silence_stop = None # fire only once
if cb:
def _safe_cb():
try:
cb()
except Exception as e:
logger.error("Silence callback failed: %s", e, exc_info=True)
threading.Thread(target=_safe_cb, daemon=True).start()
# Create stream — may block on CoreAudio (first call only).
try:
stream = sd.InputStream(
samplerate=SAMPLE_RATE,
channels=CHANNELS,
dtype=DTYPE,
callback=_callback,
)
stream.start()
except Exception as e:
raise RuntimeError(
f"Failed to open audio input stream: {e}. "
"Check that a microphone is connected and accessible."
) from e
self._stream = stream
def start(self, on_silence_stop=None) -> None: def start(self, on_silence_stop=None) -> None:
"""Start capturing audio from the default input device. """Start capturing audio from the default input device.
The underlying InputStream is created once and kept alive across
recordings. Subsequent calls simply reset detection state and
toggle frame collection via ``_recording``.
Args: Args:
on_silence_stop: Optional callback invoked (in a daemon thread) when on_silence_stop: Optional callback invoked (in a daemon thread) when
silence is detected after speech. The callback receives no arguments. silence is detected after speech. The callback receives no arguments.
@ -222,7 +355,7 @@ class AudioRecorder:
or if a recording is already in progress. or if a recording is already in progress.
""" """
try: try:
sd, np = _import_audio() _import_audio()
except (ImportError, OSError) as e: except (ImportError, OSError) as e:
raise RuntimeError( raise RuntimeError(
"Voice mode requires sounddevice and numpy.\n" "Voice mode requires sounddevice and numpy.\n"
@ -234,107 +367,54 @@ class AudioRecorder:
if self._recording: if self._recording:
return # already recording return # already recording
self._generation += 1
self._frames = [] self._frames = []
self._start_time = time.monotonic() self._start_time = time.monotonic()
self._has_spoken = False self._has_spoken = False
self._speech_start = 0.0 self._speech_start = 0.0
self._dip_start = 0.0 self._dip_start = 0.0
self._silence_start = 0.0 self._silence_start = 0.0
self._resume_start = 0.0
self._resume_dip_start = 0.0
self._peak_rms = 0 self._peak_rms = 0
self._current_rms = 0
self._on_silence_stop = on_silence_stop self._on_silence_stop = on_silence_stop
def _callback(indata, frames, time_info, status): # noqa: ARG001 # Ensure the persistent stream is alive (no-op after first call).
if status: self._ensure_stream()
logger.debug("sounddevice status: %s", status)
self._frames.append(indata.copy())
# Compute RMS for level display and silence detection with self._lock:
rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2)))
self._current_rms = rms
if rms > self._peak_rms:
self._peak_rms = rms
# Silence detection
if self._on_silence_stop is not None and self._recording:
now = time.monotonic()
if rms > self._silence_threshold:
# Audio is above threshold -- this is speech (or noise).
self._dip_start = 0.0 # Reset dip tracker
if self._speech_start == 0.0:
self._speech_start = now
elif not self._has_spoken and now - self._speech_start >= self._min_speech_duration:
self._has_spoken = True
logger.debug("Speech confirmed (%.2fs above threshold)",
now - self._speech_start)
self._silence_start = 0.0
elif self._has_spoken:
# Speech already confirmed, let silence timer run below
pass
elif self._speech_start > 0:
# We were in a speech attempt but RMS dipped.
# Tolerate brief dips (micro-pauses between syllables).
if self._dip_start == 0.0:
self._dip_start = now
elif now - self._dip_start >= self._max_dip_tolerance:
# Dip lasted too long -- genuine silence, reset
logger.debug("Speech attempt reset (dip lasted %.2fs)",
now - self._dip_start)
self._speech_start = 0.0
self._dip_start = 0.0
# else: brief dip, keep tolerating
# else: no speech attempt, just silence -- nothing to do
# Fire silence callback when:
# 1. User spoke then went silent for silence_duration, OR
# 2. No speech detected at all for max_wait seconds
should_fire = False
if self._has_spoken and rms <= self._silence_threshold:
# User was speaking and now is silent
if self._silence_start == 0.0:
self._silence_start = now
elif now - self._silence_start >= self._silence_duration:
logger.info("Silence detected (%.1fs), auto-stopping",
self._silence_duration)
should_fire = True
elif not self._has_spoken and now - self._start_time >= self._max_wait:
# No speech detected within max_wait — stop to avoid
# infinite recording in quiet environments.
logger.info("No speech within %.0fs, auto-stopping",
self._max_wait)
should_fire = True
if should_fire:
cb = self._on_silence_stop
self._on_silence_stop = None # fire only once
if cb:
def _safe_cb():
try:
cb()
except Exception as e:
logger.error("Silence callback failed: %s", e, exc_info=True)
threading.Thread(target=_safe_cb, daemon=True).start()
try:
self._stream = sd.InputStream(
samplerate=SAMPLE_RATE,
channels=CHANNELS,
dtype=DTYPE,
callback=_callback,
)
self._stream.start()
except Exception as e:
self._stream = None
raise RuntimeError(
f"Failed to open audio input stream: {e}. "
"Check that a microphone is connected and accessible."
) from e
self._recording = True self._recording = True
logger.info("Voice recording started (rate=%d, channels=%d)", SAMPLE_RATE, CHANNELS) logger.info("Voice recording started (rate=%d, channels=%d)", SAMPLE_RATE, CHANNELS)
def _close_stream_with_timeout(self, timeout: float = 3.0) -> None:
"""Close the audio stream with a timeout to prevent CoreAudio hangs."""
if self._stream is None:
return
stream = self._stream
self._stream = None
def _do_close():
try:
stream.stop()
stream.close()
except Exception:
pass
t = threading.Thread(target=_do_close, daemon=True)
t.start()
t.join(timeout=timeout)
if t.is_alive():
logger.warning("Audio stream close timed out after %.1fs — forcing ahead", timeout)
def stop(self) -> Optional[str]: def stop(self) -> Optional[str]:
"""Stop recording and write captured audio to a WAV file. """Stop recording and write captured audio to a WAV file.
The underlying stream is kept alive for reuse only frame
collection is stopped.
Returns: Returns:
Path to the WAV file, or ``None`` if no audio was captured. Path to the WAV file, or ``None`` if no audio was captured.
""" """
@ -343,14 +423,9 @@ class AudioRecorder:
return None return None
self._recording = False self._recording = False
self._generation += 1 # Invalidate any pending start()
if self._stream is not None: self._current_rms = 0
try: # Stream stays alive — no close needed.
self._stream.stop()
self._stream.close()
except Exception:
pass
self._stream = None
if not self._frames: if not self._frames:
return None return None
@ -379,20 +454,26 @@ class AudioRecorder:
return self._write_wav(audio_data) return self._write_wav(audio_data)
def cancel(self) -> None: def cancel(self) -> None:
"""Stop recording and discard all captured audio.""" """Stop recording and discard all captured audio.
The underlying stream is kept alive for reuse.
"""
with self._lock:
self._generation += 1 # Invalidate any pending start()
self._recording = False
self._frames = []
self._on_silence_stop = None
self._current_rms = 0
logger.info("Voice recording cancelled")
def shutdown(self) -> None:
"""Release the audio stream. Call when voice mode is disabled."""
with self._lock: with self._lock:
self._recording = False self._recording = False
self._frames = [] self._frames = []
self._on_silence_stop = None
if self._stream is not None: self._close_stream_with_timeout()
try: logger.info("AudioRecorder shut down")
self._stream.stop()
self._stream.close()
except Exception:
pass
self._stream = None
logger.info("Voice recording cancelled")
# -- private helpers ----------------------------------------------------- # -- private helpers -----------------------------------------------------