fix: persistent audio stream and silence detection improvements
- Keep the InputStream alive across recordings to avoid a CoreAudio hang on repeated open/close cycles on macOS. The new _ensure_stream() creates the stream once; start()/stop()/cancel() only toggle frame collection.
- Add _close_stream_with_timeout(), which uses a daemon thread to prevent stream.stop()/close() from blocking indefinitely.
- Add a generation counter to detect stale stream-open completions after a cancel or restart.
- Run recorder.cancel() in a background thread from the Ctrl+C handler to keep the event loop responsive.
- Add a shutdown() method, called on /voice off, to release audio resources.
- Fix the silence timer reset during active speech: use dip tolerance for the _resume_start tracker so that natural speech pauses (< 0.3s) don't prevent the silence timer from being reset.
- Update tests to match the persistent stream behavior.
This commit is contained in:
parent
eec04d180a
commit
eb79dda04b
4 changed files with 221 additions and 132 deletions
25
cli.py
25
cli.py
|
|
@ -3848,14 +3848,26 @@ class HermesCLI:
|
||||||
|
|
||||||
def _disable_voice_mode(self):
|
def _disable_voice_mode(self):
|
||||||
"""Disable voice mode, cancel any active recording, and stop TTS."""
|
"""Disable voice mode, cancel any active recording, and stop TTS."""
|
||||||
|
recorder = None
|
||||||
with self._voice_lock:
|
with self._voice_lock:
|
||||||
if self._voice_recording and self._voice_recorder:
|
if self._voice_recording and self._voice_recorder:
|
||||||
self._voice_recorder.cancel()
|
self._voice_recorder.cancel()
|
||||||
self._voice_recording = False
|
self._voice_recording = False
|
||||||
|
recorder = self._voice_recorder
|
||||||
self._voice_mode = False
|
self._voice_mode = False
|
||||||
self._voice_tts = False
|
self._voice_tts = False
|
||||||
self._voice_continuous = False
|
self._voice_continuous = False
|
||||||
|
|
||||||
|
# Shut down the persistent audio stream in background
|
||||||
|
if recorder is not None:
|
||||||
|
def _bg_shutdown(rec=recorder):
|
||||||
|
try:
|
||||||
|
rec.shutdown()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
threading.Thread(target=_bg_shutdown, daemon=True).start()
|
||||||
|
self._voice_recorder = None
|
||||||
|
|
||||||
# Stop any active TTS playback
|
# Stop any active TTS playback
|
||||||
try:
|
try:
|
||||||
from tools.voice_mode import stop_playback
|
from tools.voice_mode import stop_playback
|
||||||
|
|
@ -4799,13 +4811,22 @@ class HermesCLI:
|
||||||
import time as _time
|
import time as _time
|
||||||
now = _time.time()
|
now = _time.time()
|
||||||
|
|
||||||
# Cancel active voice recording
|
# Cancel active voice recording.
|
||||||
|
# Run cancel() in a background thread to prevent blocking the
|
||||||
|
# event loop if AudioRecorder._lock or CoreAudio takes time.
|
||||||
|
_should_cancel_voice = False
|
||||||
|
_recorder_ref = None
|
||||||
with cli_ref._voice_lock:
|
with cli_ref._voice_lock:
|
||||||
if cli_ref._voice_recording and cli_ref._voice_recorder:
|
if cli_ref._voice_recording and cli_ref._voice_recorder:
|
||||||
cli_ref._voice_recorder.cancel()
|
_recorder_ref = cli_ref._voice_recorder
|
||||||
cli_ref._voice_recording = False
|
cli_ref._voice_recording = False
|
||||||
cli_ref._voice_continuous = False
|
cli_ref._voice_continuous = False
|
||||||
|
_should_cancel_voice = True
|
||||||
|
if _should_cancel_voice:
|
||||||
_cprint(f"\n{_DIM}Recording cancelled.{_RST}")
|
_cprint(f"\n{_DIM}Recording cancelled.{_RST}")
|
||||||
|
threading.Thread(
|
||||||
|
target=_recorder_ref.cancel, daemon=True
|
||||||
|
).start()
|
||||||
event.app.invalidate()
|
event.app.invalidate()
|
||||||
return
|
return
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -603,28 +603,14 @@ class TestDisableVoiceModeStopsTTS:
|
||||||
|
|
||||||
def test_disable_voice_mode_calls_stop_playback(self):
|
def test_disable_voice_mode_calls_stop_playback(self):
|
||||||
"""Source check: _disable_voice_mode must call stop_playback()."""
|
"""Source check: _disable_voice_mode must call stop_playback()."""
|
||||||
with open("cli.py") as f:
|
import inspect
|
||||||
source = f.read()
|
from cli import HermesCLI
|
||||||
|
|
||||||
# Extract _disable_voice_mode method body
|
source = inspect.getsource(HermesCLI._disable_voice_mode)
|
||||||
lines = source.split("\n")
|
assert "stop_playback" in source, (
|
||||||
in_method = False
|
|
||||||
method_lines = []
|
|
||||||
for line in lines:
|
|
||||||
if "def _disable_voice_mode" in line:
|
|
||||||
in_method = True
|
|
||||||
elif in_method:
|
|
||||||
if line.strip() and not line.startswith(" ") and not line.startswith("\t"):
|
|
||||||
break
|
|
||||||
if line.strip().startswith("def "):
|
|
||||||
break
|
|
||||||
method_lines.append(line)
|
|
||||||
|
|
||||||
method_body = "\n".join(method_lines)
|
|
||||||
assert "stop_playback" in method_body, (
|
|
||||||
"_disable_voice_mode must call stop_playback()"
|
"_disable_voice_mode must call stop_playback()"
|
||||||
)
|
)
|
||||||
assert "_voice_tts_done.set()" in method_body, (
|
assert "_voice_tts_done.set()" in source, (
|
||||||
"_disable_voice_mode must set _voice_tts_done"
|
"_disable_voice_mode must set _voice_tts_done"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -235,8 +235,9 @@ class TestAudioRecorderCancel:
|
||||||
|
|
||||||
assert recorder.is_recording is False
|
assert recorder.is_recording is False
|
||||||
assert recorder._frames == []
|
assert recorder._frames == []
|
||||||
mock_stream.stop.assert_called_once()
|
# Stream is kept alive (persistent) — cancel() does NOT close it.
|
||||||
mock_stream.close.assert_called_once()
|
mock_stream.stop.assert_not_called()
|
||||||
|
mock_stream.close.assert_not_called()
|
||||||
|
|
||||||
def test_cancel_when_not_recording_is_safe(self):
|
def test_cancel_when_not_recording_is_safe(self):
|
||||||
from tools.voice_mode import AudioRecorder
|
from tools.voice_mode import AudioRecorder
|
||||||
|
|
|
||||||
|
|
@ -175,6 +175,9 @@ class AudioRecorder:
|
||||||
self._frames: List[Any] = []
|
self._frames: List[Any] = []
|
||||||
self._recording = False
|
self._recording = False
|
||||||
self._start_time: float = 0.0
|
self._start_time: float = 0.0
|
||||||
|
# Generation counter — incremented on each start/cancel/stop to
|
||||||
|
# detect stale stream-open completions after a cancel or restart.
|
||||||
|
self._generation: int = 0
|
||||||
# Silence detection state
|
# Silence detection state
|
||||||
self._has_spoken = False
|
self._has_spoken = False
|
||||||
self._speech_start: float = 0.0 # When speech attempt began
|
self._speech_start: float = 0.0 # When speech attempt began
|
||||||
|
|
@ -182,6 +185,8 @@ class AudioRecorder:
|
||||||
self._min_speech_duration: float = 0.3 # Seconds of speech needed to confirm
|
self._min_speech_duration: float = 0.3 # Seconds of speech needed to confirm
|
||||||
self._max_dip_tolerance: float = 0.3 # Max dip duration before resetting speech
|
self._max_dip_tolerance: float = 0.3 # Max dip duration before resetting speech
|
||||||
self._silence_start: float = 0.0
|
self._silence_start: float = 0.0
|
||||||
|
self._resume_start: float = 0.0 # Tracks sustained speech after silence starts
|
||||||
|
self._resume_dip_start: float = 0.0 # Dip tolerance tracker for resume detection
|
||||||
self._on_silence_stop = None
|
self._on_silence_stop = None
|
||||||
self._silence_threshold: int = SILENCE_RMS_THRESHOLD
|
self._silence_threshold: int = SILENCE_RMS_THRESHOLD
|
||||||
self._silence_duration: float = SILENCE_DURATION_SECONDS
|
self._silence_duration: float = SILENCE_DURATION_SECONDS
|
||||||
|
|
@ -210,42 +215,25 @@ class AudioRecorder:
|
||||||
|
|
||||||
# -- public methods ------------------------------------------------------
|
# -- public methods ------------------------------------------------------
|
||||||
|
|
||||||
def start(self, on_silence_stop=None) -> None:
|
def _ensure_stream(self) -> None:
|
||||||
"""Start capturing audio from the default input device.
|
"""Create the audio InputStream once and keep it alive.
|
||||||
|
|
||||||
Args:
|
The stream stays open for the lifetime of the recorder. Between
|
||||||
on_silence_stop: Optional callback invoked (in a daemon thread) when
|
recordings the callback simply discards audio chunks (``_recording``
|
||||||
silence is detected after speech. The callback receives no arguments.
|
is ``False``). This avoids the CoreAudio bug where closing and
|
||||||
Use this to auto-stop recording and trigger transcription.
|
re-opening an ``InputStream`` hangs indefinitely on macOS.
|
||||||
|
|
||||||
Raises ``RuntimeError`` if sounddevice/numpy are not installed
|
|
||||||
or if a recording is already in progress.
|
|
||||||
"""
|
"""
|
||||||
try:
|
if self._stream is not None:
|
||||||
|
return # already alive
|
||||||
|
|
||||||
sd, np = _import_audio()
|
sd, np = _import_audio()
|
||||||
except (ImportError, OSError) as e:
|
|
||||||
raise RuntimeError(
|
|
||||||
"Voice mode requires sounddevice and numpy.\n"
|
|
||||||
"Install with: pip install sounddevice numpy\n"
|
|
||||||
"Or: pip install hermes-agent[voice]"
|
|
||||||
) from e
|
|
||||||
|
|
||||||
with self._lock:
|
|
||||||
if self._recording:
|
|
||||||
return # already recording
|
|
||||||
|
|
||||||
self._frames = []
|
|
||||||
self._start_time = time.monotonic()
|
|
||||||
self._has_spoken = False
|
|
||||||
self._speech_start = 0.0
|
|
||||||
self._dip_start = 0.0
|
|
||||||
self._silence_start = 0.0
|
|
||||||
self._peak_rms = 0
|
|
||||||
self._on_silence_stop = on_silence_stop
|
|
||||||
|
|
||||||
def _callback(indata, frames, time_info, status): # noqa: ARG001
|
def _callback(indata, frames, time_info, status): # noqa: ARG001
|
||||||
if status:
|
if status:
|
||||||
logger.debug("sounddevice status: %s", status)
|
logger.debug("sounddevice status: %s", status)
|
||||||
|
# When not recording the stream is idle — discard audio.
|
||||||
|
if not self._recording:
|
||||||
|
return
|
||||||
self._frames.append(indata.copy())
|
self._frames.append(indata.copy())
|
||||||
|
|
||||||
# Compute RMS for level display and silence detection
|
# Compute RMS for level display and silence detection
|
||||||
|
|
@ -255,8 +243,9 @@ class AudioRecorder:
|
||||||
self._peak_rms = rms
|
self._peak_rms = rms
|
||||||
|
|
||||||
# Silence detection
|
# Silence detection
|
||||||
if self._on_silence_stop is not None and self._recording:
|
if self._on_silence_stop is not None:
|
||||||
now = time.monotonic()
|
now = time.monotonic()
|
||||||
|
elapsed = now - self._start_time
|
||||||
|
|
||||||
if rms > self._silence_threshold:
|
if rms > self._silence_threshold:
|
||||||
# Audio is above threshold -- this is speech (or noise).
|
# Audio is above threshold -- this is speech (or noise).
|
||||||
|
|
@ -267,10 +256,33 @@ class AudioRecorder:
|
||||||
self._has_spoken = True
|
self._has_spoken = True
|
||||||
logger.debug("Speech confirmed (%.2fs above threshold)",
|
logger.debug("Speech confirmed (%.2fs above threshold)",
|
||||||
now - self._speech_start)
|
now - self._speech_start)
|
||||||
|
# After speech is confirmed, only reset silence timer if
|
||||||
|
# speech is sustained (>0.3s above threshold). Brief
|
||||||
|
# spikes from ambient noise should NOT reset the timer.
|
||||||
|
if not self._has_spoken:
|
||||||
self._silence_start = 0.0
|
self._silence_start = 0.0
|
||||||
|
else:
|
||||||
|
# Track resumed speech with dip tolerance.
|
||||||
|
# Brief dips below threshold are normal during speech,
|
||||||
|
# so we mirror the initial speech detection pattern:
|
||||||
|
# start tracking, tolerate short dips, confirm after 0.3s.
|
||||||
|
self._resume_dip_start = 0.0 # Above threshold — no dip
|
||||||
|
if self._resume_start == 0.0:
|
||||||
|
self._resume_start = now
|
||||||
|
elif now - self._resume_start >= self._min_speech_duration:
|
||||||
|
self._silence_start = 0.0
|
||||||
|
self._resume_start = 0.0
|
||||||
elif self._has_spoken:
|
elif self._has_spoken:
|
||||||
# Speech already confirmed, let silence timer run below
|
# Below threshold after speech confirmed.
|
||||||
pass
|
# Use dip tolerance before resetting resume tracker —
|
||||||
|
# natural speech has brief dips below threshold.
|
||||||
|
if self._resume_start > 0:
|
||||||
|
if self._resume_dip_start == 0.0:
|
||||||
|
self._resume_dip_start = now
|
||||||
|
elif now - self._resume_dip_start >= self._max_dip_tolerance:
|
||||||
|
# Sustained dip — user actually stopped speaking
|
||||||
|
self._resume_start = 0.0
|
||||||
|
self._resume_dip_start = 0.0
|
||||||
elif self._speech_start > 0:
|
elif self._speech_start > 0:
|
||||||
# We were in a speech attempt but RMS dipped.
|
# We were in a speech attempt but RMS dipped.
|
||||||
# Tolerate brief dips (micro-pauses between syllables).
|
# Tolerate brief dips (micro-pauses between syllables).
|
||||||
|
|
@ -282,8 +294,6 @@ class AudioRecorder:
|
||||||
now - self._dip_start)
|
now - self._dip_start)
|
||||||
self._speech_start = 0.0
|
self._speech_start = 0.0
|
||||||
self._dip_start = 0.0
|
self._dip_start = 0.0
|
||||||
# else: brief dip, keep tolerating
|
|
||||||
# else: no speech attempt, just silence -- nothing to do
|
|
||||||
|
|
||||||
# Fire silence callback when:
|
# Fire silence callback when:
|
||||||
# 1. User spoke then went silent for silence_duration, OR
|
# 1. User spoke then went silent for silence_duration, OR
|
||||||
|
|
@ -297,9 +307,7 @@ class AudioRecorder:
|
||||||
logger.info("Silence detected (%.1fs), auto-stopping",
|
logger.info("Silence detected (%.1fs), auto-stopping",
|
||||||
self._silence_duration)
|
self._silence_duration)
|
||||||
should_fire = True
|
should_fire = True
|
||||||
elif not self._has_spoken and now - self._start_time >= self._max_wait:
|
elif not self._has_spoken and elapsed >= self._max_wait:
|
||||||
# No speech detected within max_wait — stop to avoid
|
|
||||||
# infinite recording in quiet environments.
|
|
||||||
logger.info("No speech within %.0fs, auto-stopping",
|
logger.info("No speech within %.0fs, auto-stopping",
|
||||||
self._max_wait)
|
self._max_wait)
|
||||||
should_fire = True
|
should_fire = True
|
||||||
|
|
@ -315,26 +323,98 @@ class AudioRecorder:
|
||||||
logger.error("Silence callback failed: %s", e, exc_info=True)
|
logger.error("Silence callback failed: %s", e, exc_info=True)
|
||||||
threading.Thread(target=_safe_cb, daemon=True).start()
|
threading.Thread(target=_safe_cb, daemon=True).start()
|
||||||
|
|
||||||
|
# Create stream — may block on CoreAudio (first call only).
|
||||||
try:
|
try:
|
||||||
self._stream = sd.InputStream(
|
stream = sd.InputStream(
|
||||||
samplerate=SAMPLE_RATE,
|
samplerate=SAMPLE_RATE,
|
||||||
channels=CHANNELS,
|
channels=CHANNELS,
|
||||||
dtype=DTYPE,
|
dtype=DTYPE,
|
||||||
callback=_callback,
|
callback=_callback,
|
||||||
)
|
)
|
||||||
self._stream.start()
|
stream.start()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self._stream = None
|
|
||||||
raise RuntimeError(
|
raise RuntimeError(
|
||||||
f"Failed to open audio input stream: {e}. "
|
f"Failed to open audio input stream: {e}. "
|
||||||
"Check that a microphone is connected and accessible."
|
"Check that a microphone is connected and accessible."
|
||||||
) from e
|
) from e
|
||||||
|
self._stream = stream
|
||||||
|
|
||||||
|
def start(self, on_silence_stop=None) -> None:
|
||||||
|
"""Start capturing audio from the default input device.
|
||||||
|
|
||||||
|
The underlying InputStream is created once and kept alive across
|
||||||
|
recordings. Subsequent calls simply reset detection state and
|
||||||
|
toggle frame collection via ``_recording``.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
on_silence_stop: Optional callback invoked (in a daemon thread) when
|
||||||
|
silence is detected after speech. The callback receives no arguments.
|
||||||
|
Use this to auto-stop recording and trigger transcription.
|
||||||
|
|
||||||
|
Raises ``RuntimeError`` if sounddevice/numpy are not installed
|
||||||
|
or if a recording is already in progress.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
_import_audio()
|
||||||
|
except (ImportError, OSError) as e:
|
||||||
|
raise RuntimeError(
|
||||||
|
"Voice mode requires sounddevice and numpy.\n"
|
||||||
|
"Install with: pip install sounddevice numpy\n"
|
||||||
|
"Or: pip install hermes-agent[voice]"
|
||||||
|
) from e
|
||||||
|
|
||||||
|
with self._lock:
|
||||||
|
if self._recording:
|
||||||
|
return # already recording
|
||||||
|
|
||||||
|
self._generation += 1
|
||||||
|
|
||||||
|
self._frames = []
|
||||||
|
self._start_time = time.monotonic()
|
||||||
|
self._has_spoken = False
|
||||||
|
self._speech_start = 0.0
|
||||||
|
self._dip_start = 0.0
|
||||||
|
self._silence_start = 0.0
|
||||||
|
self._resume_start = 0.0
|
||||||
|
self._resume_dip_start = 0.0
|
||||||
|
self._peak_rms = 0
|
||||||
|
self._current_rms = 0
|
||||||
|
self._on_silence_stop = on_silence_stop
|
||||||
|
|
||||||
|
# Ensure the persistent stream is alive (no-op after first call).
|
||||||
|
self._ensure_stream()
|
||||||
|
|
||||||
|
with self._lock:
|
||||||
self._recording = True
|
self._recording = True
|
||||||
logger.info("Voice recording started (rate=%d, channels=%d)", SAMPLE_RATE, CHANNELS)
|
logger.info("Voice recording started (rate=%d, channels=%d)", SAMPLE_RATE, CHANNELS)
|
||||||
|
|
||||||
|
def _close_stream_with_timeout(self, timeout: float = 3.0) -> None:
|
||||||
|
"""Close the audio stream with a timeout to prevent CoreAudio hangs."""
|
||||||
|
if self._stream is None:
|
||||||
|
return
|
||||||
|
|
||||||
|
stream = self._stream
|
||||||
|
self._stream = None
|
||||||
|
|
||||||
|
def _do_close():
|
||||||
|
try:
|
||||||
|
stream.stop()
|
||||||
|
stream.close()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
t = threading.Thread(target=_do_close, daemon=True)
|
||||||
|
t.start()
|
||||||
|
t.join(timeout=timeout)
|
||||||
|
if t.is_alive():
|
||||||
|
logger.warning("Audio stream close timed out after %.1fs — forcing ahead", timeout)
|
||||||
|
|
||||||
def stop(self) -> Optional[str]:
|
def stop(self) -> Optional[str]:
|
||||||
"""Stop recording and write captured audio to a WAV file.
|
"""Stop recording and write captured audio to a WAV file.
|
||||||
|
|
||||||
|
The underlying stream is kept alive for reuse — only frame
|
||||||
|
collection is stopped.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
Path to the WAV file, or ``None`` if no audio was captured.
|
Path to the WAV file, or ``None`` if no audio was captured.
|
||||||
"""
|
"""
|
||||||
|
|
@ -343,14 +423,9 @@ class AudioRecorder:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
self._recording = False
|
self._recording = False
|
||||||
|
self._generation += 1 # Invalidate any pending start()
|
||||||
if self._stream is not None:
|
self._current_rms = 0
|
||||||
try:
|
# Stream stays alive — no close needed.
|
||||||
self._stream.stop()
|
|
||||||
self._stream.close()
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
self._stream = None
|
|
||||||
|
|
||||||
if not self._frames:
|
if not self._frames:
|
||||||
return None
|
return None
|
||||||
|
|
@ -379,20 +454,26 @@ class AudioRecorder:
|
||||||
return self._write_wav(audio_data)
|
return self._write_wav(audio_data)
|
||||||
|
|
||||||
def cancel(self) -> None:
|
def cancel(self) -> None:
|
||||||
"""Stop recording and discard all captured audio."""
|
"""Stop recording and discard all captured audio.
|
||||||
|
|
||||||
|
The underlying stream is kept alive for reuse.
|
||||||
|
"""
|
||||||
|
with self._lock:
|
||||||
|
self._generation += 1 # Invalidate any pending start()
|
||||||
|
self._recording = False
|
||||||
|
self._frames = []
|
||||||
|
self._on_silence_stop = None
|
||||||
|
self._current_rms = 0
|
||||||
|
logger.info("Voice recording cancelled")
|
||||||
|
|
||||||
|
def shutdown(self) -> None:
|
||||||
|
"""Release the audio stream. Call when voice mode is disabled."""
|
||||||
with self._lock:
|
with self._lock:
|
||||||
self._recording = False
|
self._recording = False
|
||||||
self._frames = []
|
self._frames = []
|
||||||
|
self._on_silence_stop = None
|
||||||
if self._stream is not None:
|
self._close_stream_with_timeout()
|
||||||
try:
|
logger.info("AudioRecorder shut down")
|
||||||
self._stream.stop()
|
|
||||||
self._stream.close()
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
self._stream = None
|
|
||||||
|
|
||||||
logger.info("Voice recording cancelled")
|
|
||||||
|
|
||||||
# -- private helpers -----------------------------------------------------
|
# -- private helpers -----------------------------------------------------
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue