fix: fix silence detection bugs and add Phase 4 voice mode features
Fix 3 critical bugs in silence detection:
- Micro-pause tolerance now tracks dip duration (not time since speech start).
- Peak RMS check in stop() prevents discarding recordings that contain real speech.
- Reduced min_speech_duration from 0.5s to 0.3s for reliable speech confirmation.

Phase 4 features:
- Configurable silence parameters
- Visual audio level indicator
- Voice system prompt
- Tool call audio cues
- TTS interrupt
- Continuous mode auto-restart
- Interruptible playback via Popen tracking
This commit is contained in:
parent
32b033c11c
commit
dad865e920
4 changed files with 245 additions and 23 deletions
113
cli.py
113
cli.py
|
|
@ -1550,6 +1550,7 @@ class HermesCLI:
|
||||||
checkpoints_enabled=self.checkpoints_enabled,
|
checkpoints_enabled=self.checkpoints_enabled,
|
||||||
checkpoint_max_snapshots=self.checkpoint_max_snapshots,
|
checkpoint_max_snapshots=self.checkpoint_max_snapshots,
|
||||||
pass_session_id=self.pass_session_id,
|
pass_session_id=self.pass_session_id,
|
||||||
|
tool_progress_callback=self._on_tool_progress,
|
||||||
)
|
)
|
||||||
# Apply any pending title now that the session exists in the DB
|
# Apply any pending title now that the session exists in the DB
|
||||||
if self._pending_title and self._session_db:
|
if self._pending_title and self._session_db:
|
||||||
|
|
@ -3515,6 +3516,28 @@ class HermesCLI:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f" ❌ MCP reload failed: {e}")
|
print(f" ❌ MCP reload failed: {e}")
|
||||||
|
|
||||||
|
# ====================================================================
|
||||||
|
# Tool progress callback (audio cues for voice mode)
|
||||||
|
# ====================================================================
|
||||||
|
|
||||||
|
def _on_tool_progress(self, function_name: str, preview: str, function_args: dict):
|
||||||
|
"""Called when a tool starts executing. Plays audio cue in voice mode."""
|
||||||
|
if not self._voice_mode:
|
||||||
|
return
|
||||||
|
# Skip internal/thinking tools
|
||||||
|
if function_name.startswith("_"):
|
||||||
|
return
|
||||||
|
try:
|
||||||
|
from tools.voice_mode import play_beep
|
||||||
|
# Short, subtle tick sound (higher pitch, very brief)
|
||||||
|
threading.Thread(
|
||||||
|
target=play_beep,
|
||||||
|
kwargs={"frequency": 1200, "duration": 0.06, "count": 1},
|
||||||
|
daemon=True,
|
||||||
|
).start()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
# ====================================================================
|
# ====================================================================
|
||||||
# Voice mode methods
|
# Voice mode methods
|
||||||
# ====================================================================
|
# ====================================================================
|
||||||
|
|
@ -3536,9 +3559,21 @@ class HermesCLI:
|
||||||
"Get one at: https://platform.openai.com/api-keys"
|
"Get one at: https://platform.openai.com/api-keys"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Load silence detection params from config
|
||||||
|
voice_cfg = {}
|
||||||
|
try:
|
||||||
|
from hermes_cli.config import load_config
|
||||||
|
voice_cfg = load_config().get("voice", {})
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
if self._voice_recorder is None:
|
if self._voice_recorder is None:
|
||||||
self._voice_recorder = AudioRecorder()
|
self._voice_recorder = AudioRecorder()
|
||||||
|
|
||||||
|
# Apply config-driven silence params
|
||||||
|
self._voice_recorder._silence_threshold = voice_cfg.get("silence_threshold", 200)
|
||||||
|
self._voice_recorder._silence_duration = voice_cfg.get("silence_duration", 3.0)
|
||||||
|
|
||||||
def _on_silence():
|
def _on_silence():
|
||||||
"""Called by AudioRecorder when silence is detected after speech."""
|
"""Called by AudioRecorder when silence is detected after speech."""
|
||||||
with self._voice_lock:
|
with self._voice_lock:
|
||||||
|
|
@ -3549,18 +3584,26 @@ class HermesCLI:
|
||||||
self._app.invalidate()
|
self._app.invalidate()
|
||||||
self._voice_stop_and_transcribe()
|
self._voice_stop_and_transcribe()
|
||||||
|
|
||||||
|
# Audio cue: single beep BEFORE starting stream (avoid CoreAudio conflict)
|
||||||
|
try:
|
||||||
|
from tools.voice_mode import play_beep
|
||||||
|
play_beep(frequency=880, count=1)
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
self._voice_recorder.start(on_silence_stop=_on_silence)
|
self._voice_recorder.start(on_silence_stop=_on_silence)
|
||||||
with self._voice_lock:
|
with self._voice_lock:
|
||||||
self._voice_recording = True
|
self._voice_recording = True
|
||||||
|
|
||||||
# Audio cue: single beep on recording start
|
|
||||||
try:
|
|
||||||
from tools.voice_mode import play_beep
|
|
||||||
threading.Thread(target=play_beep, kwargs={"frequency": 880, "count": 1}, daemon=True).start()
|
|
||||||
except Exception:
|
|
||||||
pass
|
|
||||||
_cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(auto-stops on silence | Ctrl+R to stop & exit continuous){_RST}")
|
_cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(auto-stops on silence | Ctrl+R to stop & exit continuous){_RST}")
|
||||||
|
|
||||||
|
# Periodically refresh prompt to update audio level indicator
|
||||||
|
def _refresh_level():
|
||||||
|
while self._voice_recording:
|
||||||
|
if hasattr(self, '_app') and self._app:
|
||||||
|
self._app.invalidate()
|
||||||
|
time.sleep(0.15)
|
||||||
|
threading.Thread(target=_refresh_level, daemon=True).start()
|
||||||
|
|
||||||
def _voice_stop_and_transcribe(self):
|
def _voice_stop_and_transcribe(self):
|
||||||
"""Stop recording, transcribe via STT, and queue the transcript as input."""
|
"""Stop recording, transcribe via STT, and queue the transcript as input."""
|
||||||
try:
|
try:
|
||||||
|
|
@ -3571,15 +3614,15 @@ class HermesCLI:
|
||||||
with self._voice_lock:
|
with self._voice_lock:
|
||||||
self._voice_recording = False
|
self._voice_recording = False
|
||||||
|
|
||||||
# Audio cue: double beep on recording stop
|
# Audio cue: double beep after stream stopped (no CoreAudio conflict)
|
||||||
try:
|
try:
|
||||||
from tools.voice_mode import play_beep
|
from tools.voice_mode import play_beep
|
||||||
threading.Thread(target=play_beep, kwargs={"frequency": 660, "count": 2}, daemon=True).start()
|
play_beep(frequency=660, count=2)
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if wav_path is None:
|
if wav_path is None:
|
||||||
_cprint(f"{_DIM}No speech detected (recording too short).{_RST}")
|
_cprint(f"{_DIM}No speech detected.{_RST}")
|
||||||
return
|
return
|
||||||
|
|
||||||
with self._voice_lock:
|
with self._voice_lock:
|
||||||
|
|
@ -3614,6 +3657,7 @@ class HermesCLI:
|
||||||
finally:
|
finally:
|
||||||
with self._voice_lock:
|
with self._voice_lock:
|
||||||
self._voice_processing = False
|
self._voice_processing = False
|
||||||
|
submitted = self._pending_input.qsize() > 0
|
||||||
if hasattr(self, '_app') and self._app:
|
if hasattr(self, '_app') and self._app:
|
||||||
self._app.invalidate()
|
self._app.invalidate()
|
||||||
# Clean up temp file
|
# Clean up temp file
|
||||||
|
|
@ -3623,6 +3667,18 @@ class HermesCLI:
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# If no transcript was submitted but continuous mode is active,
|
||||||
|
# restart recording so the user can keep talking.
|
||||||
|
# (When transcript IS submitted, process_loop handles restart
|
||||||
|
# after chat() completes.)
|
||||||
|
if self._voice_continuous and not submitted and not self._voice_recording:
|
||||||
|
try:
|
||||||
|
self._voice_start_recording()
|
||||||
|
if hasattr(self, '_app') and self._app:
|
||||||
|
self._app.invalidate()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
def _voice_speak_response(self, text: str):
|
def _voice_speak_response(self, text: str):
|
||||||
"""Speak the agent's response aloud using TTS (runs in background thread)."""
|
"""Speak the agent's response aloud using TTS (runs in background thread)."""
|
||||||
if not self._voice_tts:
|
if not self._voice_tts:
|
||||||
|
|
@ -3727,6 +3783,16 @@ class HermesCLI:
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
# Append voice-mode system prompt for concise, conversational responses
|
||||||
|
self._voice_original_prompt = self.system_prompt
|
||||||
|
voice_instruction = (
|
||||||
|
"\n\n[Voice mode active] The user is speaking via voice input. "
|
||||||
|
"Keep responses concise and conversational — 2-3 sentences max unless "
|
||||||
|
"the user asks for detail. Avoid code blocks, markdown formatting, "
|
||||||
|
"and long lists. Respond naturally as in a spoken conversation."
|
||||||
|
)
|
||||||
|
self.system_prompt = (self.system_prompt or "") + voice_instruction
|
||||||
|
|
||||||
tts_status = " (TTS enabled)" if self._voice_tts else ""
|
tts_status = " (TTS enabled)" if self._voice_tts else ""
|
||||||
_cprint(f"\n{_GOLD}Voice mode enabled{tts_status}{_RST}")
|
_cprint(f"\n{_GOLD}Voice mode enabled{tts_status}{_RST}")
|
||||||
_cprint(f" {_DIM}Ctrl+R to start/stop recording{_RST}")
|
_cprint(f" {_DIM}Ctrl+R to start/stop recording{_RST}")
|
||||||
|
|
@ -3742,6 +3808,10 @@ class HermesCLI:
|
||||||
self._voice_mode = False
|
self._voice_mode = False
|
||||||
self._voice_tts = False
|
self._voice_tts = False
|
||||||
self._voice_continuous = False
|
self._voice_continuous = False
|
||||||
|
|
||||||
|
# Restore original system prompt
|
||||||
|
if hasattr(self, '_voice_original_prompt'):
|
||||||
|
self.system_prompt = self._voice_original_prompt
|
||||||
_cprint(f"\n{_DIM}Voice mode disabled.{_RST}")
|
_cprint(f"\n{_DIM}Voice mode disabled.{_RST}")
|
||||||
|
|
||||||
def _toggle_voice_tts(self):
|
def _toggle_voice_tts(self):
|
||||||
|
|
@ -4237,11 +4307,24 @@ class HermesCLI:
|
||||||
# Icon-only custom prompts should still remain visible in special states.
|
# Icon-only custom prompts should still remain visible in special states.
|
||||||
return symbol, symbol
|
return symbol, symbol
|
||||||
|
|
||||||
|
def _audio_level_bar(self) -> str:
|
||||||
|
"""Return a visual audio level indicator based on current RMS."""
|
||||||
|
_LEVEL_BARS = " ▁▂▃▄▅▆▇"
|
||||||
|
rec = getattr(self, "_voice_recorder", None)
|
||||||
|
if rec is None:
|
||||||
|
return ""
|
||||||
|
rms = rec.current_rms
|
||||||
|
# Normalize RMS (0-32767) to 0-7 index, with log-ish scaling
|
||||||
|
# Typical speech RMS is 500-5000, we cap display at ~8000
|
||||||
|
level = min(rms, 8000) * 7 // 8000
|
||||||
|
return _LEVEL_BARS[level]
|
||||||
|
|
||||||
def _get_tui_prompt_fragments(self):
|
def _get_tui_prompt_fragments(self):
|
||||||
"""Return the prompt_toolkit fragments for the current interactive state."""
|
"""Return the prompt_toolkit fragments for the current interactive state."""
|
||||||
symbol, state_suffix = self._get_tui_prompt_symbols()
|
symbol, state_suffix = self._get_tui_prompt_symbols()
|
||||||
if self._voice_recording:
|
if self._voice_recording:
|
||||||
return [("class:voice-recording", f"● {state_suffix}")]
|
bar = self._audio_level_bar()
|
||||||
|
return [("class:voice-recording", f"● {bar} {state_suffix}")]
|
||||||
if self._voice_processing:
|
if self._voice_processing:
|
||||||
return [("class:voice-processing", f"◉ {state_suffix}")]
|
return [("class:voice-processing", f"◉ {state_suffix}")]
|
||||||
if self._sudo_state:
|
if self._sudo_state:
|
||||||
|
|
@ -4692,6 +4775,14 @@ class HermesCLI:
|
||||||
).start()
|
).start()
|
||||||
else:
|
else:
|
||||||
try:
|
try:
|
||||||
|
# Interrupt TTS if playing, so user can start talking
|
||||||
|
if not cli_ref._voice_tts_done.is_set():
|
||||||
|
try:
|
||||||
|
from tools.voice_mode import stop_playback
|
||||||
|
stop_playback()
|
||||||
|
cli_ref._voice_tts_done.set()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
with cli_ref._voice_lock:
|
with cli_ref._voice_lock:
|
||||||
cli_ref._voice_continuous = True
|
cli_ref._voice_continuous = True
|
||||||
cli_ref._voice_start_recording()
|
cli_ref._voice_start_recording()
|
||||||
|
|
|
||||||
|
|
@ -207,6 +207,8 @@ DEFAULT_CONFIG = {
|
||||||
"record_key": "ctrl+r",
|
"record_key": "ctrl+r",
|
||||||
"max_recording_seconds": 120,
|
"max_recording_seconds": 120,
|
||||||
"auto_tts": False,
|
"auto_tts": False,
|
||||||
|
"silence_threshold": 200, # RMS below this = silence (0-32767)
|
||||||
|
"silence_duration": 3.0, # Seconds of silence before auto-stop
|
||||||
},
|
},
|
||||||
|
|
||||||
"human_delay": {
|
"human_delay": {
|
||||||
|
|
|
||||||
|
|
@ -157,6 +157,7 @@ class TestAudioRecorderStop:
|
||||||
# Simulate captured audio frames (1 second of loud audio above RMS threshold)
|
# Simulate captured audio frames (1 second of loud audio above RMS threshold)
|
||||||
frame = np.full((SAMPLE_RATE, 1), 1000, dtype="int16")
|
frame = np.full((SAMPLE_RATE, 1), 1000, dtype="int16")
|
||||||
recorder._frames = [frame]
|
recorder._frames = [frame]
|
||||||
|
recorder._peak_rms = 1000 # Peak RMS above threshold
|
||||||
|
|
||||||
wav_path = recorder.stop()
|
wav_path = recorder.stop()
|
||||||
|
|
||||||
|
|
@ -203,6 +204,7 @@ class TestAudioRecorderStop:
|
||||||
# 1 second of near-silence (RMS well below threshold)
|
# 1 second of near-silence (RMS well below threshold)
|
||||||
frame = np.full((SAMPLE_RATE, 1), 10, dtype="int16")
|
frame = np.full((SAMPLE_RATE, 1), 10, dtype="int16")
|
||||||
recorder._frames = [frame]
|
recorder._frames = [frame]
|
||||||
|
recorder._peak_rms = 10 # Peak RMS also below threshold
|
||||||
|
|
||||||
wav_path = recorder.stop()
|
wav_path = recorder.stop()
|
||||||
assert wav_path is None
|
assert wav_path is None
|
||||||
|
|
@ -475,8 +477,9 @@ class TestSilenceDetection:
|
||||||
from tools.voice_mode import AudioRecorder, SAMPLE_RATE
|
from tools.voice_mode import AudioRecorder, SAMPLE_RATE
|
||||||
|
|
||||||
recorder = AudioRecorder()
|
recorder = AudioRecorder()
|
||||||
# Use very short silence duration for testing
|
# Use very short durations for testing
|
||||||
recorder._silence_duration = 0.05
|
recorder._silence_duration = 0.05
|
||||||
|
recorder._min_speech_duration = 0.05
|
||||||
|
|
||||||
fired = threading.Event()
|
fired = threading.Event()
|
||||||
|
|
||||||
|
|
@ -490,9 +493,11 @@ class TestSilenceDetection:
|
||||||
if callback is None:
|
if callback is None:
|
||||||
callback = mock_sd.InputStream.call_args[1]["callback"]
|
callback = mock_sd.InputStream.call_args[1]["callback"]
|
||||||
|
|
||||||
# Simulate loud audio (speech) -- RMS well above threshold
|
# Simulate sustained speech (multiple loud chunks to exceed min_speech_duration)
|
||||||
loud_frame = np.full((1600, 1), 5000, dtype="int16")
|
loud_frame = np.full((1600, 1), 5000, dtype="int16")
|
||||||
callback(loud_frame, 1600, None, None)
|
callback(loud_frame, 1600, None, None)
|
||||||
|
time.sleep(0.06)
|
||||||
|
callback(loud_frame, 1600, None, None)
|
||||||
assert recorder._has_spoken is True
|
assert recorder._has_spoken is True
|
||||||
|
|
||||||
# Simulate silence
|
# Simulate silence
|
||||||
|
|
@ -537,6 +542,47 @@ class TestSilenceDetection:
|
||||||
|
|
||||||
recorder.cancel()
|
recorder.cancel()
|
||||||
|
|
||||||
|
def test_micro_pause_tolerance_during_speech(self, mock_sd):
|
||||||
|
"""Brief dips below threshold during speech should NOT reset speech tracking."""
|
||||||
|
np = pytest.importorskip("numpy")
|
||||||
|
import threading
|
||||||
|
|
||||||
|
mock_stream = MagicMock()
|
||||||
|
mock_sd.InputStream.return_value = mock_stream
|
||||||
|
|
||||||
|
from tools.voice_mode import AudioRecorder
|
||||||
|
|
||||||
|
recorder = AudioRecorder()
|
||||||
|
recorder._silence_duration = 0.05
|
||||||
|
recorder._min_speech_duration = 0.15
|
||||||
|
recorder._max_dip_tolerance = 0.1
|
||||||
|
|
||||||
|
fired = threading.Event()
|
||||||
|
recorder.start(on_silence_stop=lambda: fired.set())
|
||||||
|
|
||||||
|
callback = mock_sd.InputStream.call_args.kwargs.get("callback")
|
||||||
|
if callback is None:
|
||||||
|
callback = mock_sd.InputStream.call_args[1]["callback"]
|
||||||
|
|
||||||
|
loud_frame = np.full((1600, 1), 5000, dtype="int16")
|
||||||
|
quiet_frame = np.full((1600, 1), 50, dtype="int16")
|
||||||
|
|
||||||
|
# Speech chunk 1
|
||||||
|
callback(loud_frame, 1600, None, None)
|
||||||
|
time.sleep(0.05)
|
||||||
|
# Brief micro-pause (dip < max_dip_tolerance)
|
||||||
|
callback(quiet_frame, 1600, None, None)
|
||||||
|
time.sleep(0.05)
|
||||||
|
# Speech resumes -- speech_start should NOT have been reset
|
||||||
|
callback(loud_frame, 1600, None, None)
|
||||||
|
assert recorder._speech_start > 0, "Speech start should be preserved across brief dips"
|
||||||
|
time.sleep(0.06)
|
||||||
|
# Another speech chunk to exceed min_speech_duration
|
||||||
|
callback(loud_frame, 1600, None, None)
|
||||||
|
assert recorder._has_spoken is True, "Speech should be confirmed after tolerating micro-pause"
|
||||||
|
|
||||||
|
recorder.cancel()
|
||||||
|
|
||||||
def test_no_callback_means_no_silence_detection(self, mock_sd):
|
def test_no_callback_means_no_silence_detection(self, mock_sd):
|
||||||
np = pytest.importorskip("numpy")
|
np = pytest.importorskip("numpy")
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -117,10 +117,18 @@ class AudioRecorder:
|
||||||
self._start_time: float = 0.0
|
self._start_time: float = 0.0
|
||||||
# Silence detection state
|
# Silence detection state
|
||||||
self._has_spoken = False
|
self._has_spoken = False
|
||||||
|
self._speech_start: float = 0.0 # When speech attempt began
|
||||||
|
self._dip_start: float = 0.0 # When current below-threshold dip began
|
||||||
|
self._min_speech_duration: float = 0.3 # Seconds of speech needed to confirm
|
||||||
|
self._max_dip_tolerance: float = 0.3 # Max dip duration before resetting speech
|
||||||
self._silence_start: float = 0.0
|
self._silence_start: float = 0.0
|
||||||
self._on_silence_stop = None
|
self._on_silence_stop = None
|
||||||
self._silence_threshold: int = SILENCE_RMS_THRESHOLD
|
self._silence_threshold: int = SILENCE_RMS_THRESHOLD
|
||||||
self._silence_duration: float = SILENCE_DURATION_SECONDS
|
self._silence_duration: float = SILENCE_DURATION_SECONDS
|
||||||
|
# Peak RMS seen during recording (for speech presence check in stop())
|
||||||
|
self._peak_rms: int = 0
|
||||||
|
# Live audio level (read by UI for visual feedback)
|
||||||
|
self._current_rms: int = 0
|
||||||
|
|
||||||
# -- public properties ---------------------------------------------------
|
# -- public properties ---------------------------------------------------
|
||||||
|
|
||||||
|
|
@ -134,6 +142,11 @@ class AudioRecorder:
|
||||||
return 0.0
|
return 0.0
|
||||||
return time.monotonic() - self._start_time
|
return time.monotonic() - self._start_time
|
||||||
|
|
||||||
|
@property
|
||||||
|
def current_rms(self) -> int:
|
||||||
|
"""Current audio input RMS level (0-32767). Updated each audio chunk."""
|
||||||
|
return self._current_rms
|
||||||
|
|
||||||
# -- public methods ------------------------------------------------------
|
# -- public methods ------------------------------------------------------
|
||||||
|
|
||||||
def start(self, on_silence_stop=None) -> None:
|
def start(self, on_silence_stop=None) -> None:
|
||||||
|
|
@ -161,7 +174,10 @@ class AudioRecorder:
|
||||||
self._frames = []
|
self._frames = []
|
||||||
self._start_time = time.monotonic()
|
self._start_time = time.monotonic()
|
||||||
self._has_spoken = False
|
self._has_spoken = False
|
||||||
|
self._speech_start = 0.0
|
||||||
|
self._dip_start = 0.0
|
||||||
self._silence_start = 0.0
|
self._silence_start = 0.0
|
||||||
|
self._peak_rms = 0
|
||||||
self._on_silence_stop = on_silence_stop
|
self._on_silence_stop = on_silence_stop
|
||||||
|
|
||||||
def _callback(indata, frames, time_info, status): # noqa: ARG001
|
def _callback(indata, frames, time_info, status): # noqa: ARG001
|
||||||
|
|
@ -169,15 +185,44 @@ class AudioRecorder:
|
||||||
logger.debug("sounddevice status: %s", status)
|
logger.debug("sounddevice status: %s", status)
|
||||||
self._frames.append(indata.copy())
|
self._frames.append(indata.copy())
|
||||||
|
|
||||||
# Silence detection: compute RMS of this chunk
|
# Compute RMS for level display and silence detection
|
||||||
|
rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2)))
|
||||||
|
self._current_rms = rms
|
||||||
|
if rms > self._peak_rms:
|
||||||
|
self._peak_rms = rms
|
||||||
|
|
||||||
|
# Silence detection
|
||||||
if self._on_silence_stop is not None and self._recording:
|
if self._on_silence_stop is not None and self._recording:
|
||||||
rms = int(np.sqrt(np.mean(indata.astype(np.float64) ** 2)))
|
|
||||||
now = time.monotonic()
|
now = time.monotonic()
|
||||||
|
|
||||||
if rms > self._silence_threshold:
|
if rms > self._silence_threshold:
|
||||||
self._has_spoken = True
|
# Audio is above threshold -- this is speech (or noise).
|
||||||
|
self._dip_start = 0.0 # Reset dip tracker
|
||||||
|
if self._speech_start == 0.0:
|
||||||
|
self._speech_start = now
|
||||||
|
elif not self._has_spoken and now - self._speech_start >= self._min_speech_duration:
|
||||||
|
self._has_spoken = True
|
||||||
|
logger.debug("Speech confirmed (%.2fs above threshold)",
|
||||||
|
now - self._speech_start)
|
||||||
self._silence_start = 0.0
|
self._silence_start = 0.0
|
||||||
elif self._has_spoken:
|
elif self._has_spoken:
|
||||||
|
# Speech already confirmed, let silence timer run below
|
||||||
|
pass
|
||||||
|
elif self._speech_start > 0:
|
||||||
|
# We were in a speech attempt but RMS dipped.
|
||||||
|
# Tolerate brief dips (micro-pauses between syllables).
|
||||||
|
if self._dip_start == 0.0:
|
||||||
|
self._dip_start = now
|
||||||
|
elif now - self._dip_start >= self._max_dip_tolerance:
|
||||||
|
# Dip lasted too long -- genuine silence, reset
|
||||||
|
logger.debug("Speech attempt reset (dip lasted %.2fs)",
|
||||||
|
now - self._dip_start)
|
||||||
|
self._speech_start = 0.0
|
||||||
|
self._dip_start = 0.0
|
||||||
|
# else: brief dip, keep tolerating
|
||||||
|
# else: no speech attempt, just silence -- nothing to do
|
||||||
|
|
||||||
|
if self._has_spoken and rms <= self._silence_threshold:
|
||||||
# User was speaking and now is silent
|
# User was speaking and now is silent
|
||||||
if self._silence_start == 0.0:
|
if self._silence_start == 0.0:
|
||||||
self._silence_start = now
|
self._silence_start = now
|
||||||
|
|
@ -235,10 +280,11 @@ class AudioRecorder:
|
||||||
logger.debug("Recording too short (%d samples), discarding", len(audio_data))
|
logger.debug("Recording too short (%d samples), discarding", len(audio_data))
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Skip silent recordings (RMS below threshold = no real speech)
|
# Skip silent recordings using peak RMS (not overall average, which
|
||||||
rms = int(np.sqrt(np.mean(audio_data.astype(np.float64) ** 2)))
|
# gets diluted by silence at the end of the recording).
|
||||||
if rms < SILENCE_RMS_THRESHOLD:
|
if self._peak_rms < SILENCE_RMS_THRESHOLD:
|
||||||
logger.info("Recording too quiet (RMS=%d < %d), discarding", rms, SILENCE_RMS_THRESHOLD)
|
logger.info("Recording too quiet (peak RMS=%d < %d), discarding",
|
||||||
|
self._peak_rms, SILENCE_RMS_THRESHOLD)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return self._write_wav(audio_data)
|
return self._write_wav(audio_data)
|
||||||
|
|
@ -341,8 +387,34 @@ def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
# Audio playback
|
# Audio playback (interruptable)
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
|
||||||
|
# Global reference to the active playback process so it can be interrupted.
|
||||||
|
_active_playback: Optional[subprocess.Popen] = None
|
||||||
|
_playback_lock = threading.Lock()
|
||||||
|
|
||||||
|
|
||||||
|
def stop_playback() -> None:
|
||||||
|
"""Interrupt the currently playing audio (if any)."""
|
||||||
|
global _active_playback
|
||||||
|
with _playback_lock:
|
||||||
|
proc = _active_playback
|
||||||
|
_active_playback = None
|
||||||
|
if proc and proc.poll() is None:
|
||||||
|
try:
|
||||||
|
proc.terminate()
|
||||||
|
logger.info("Audio playback interrupted")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
# Also stop sounddevice playback if active
|
||||||
|
if _HAS_AUDIO:
|
||||||
|
try:
|
||||||
|
sd.stop()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
def play_audio_file(file_path: str) -> bool:
|
def play_audio_file(file_path: str) -> bool:
|
||||||
"""Play an audio file through the default output device.
|
"""Play an audio file through the default output device.
|
||||||
|
|
||||||
|
|
@ -351,9 +423,13 @@ def play_audio_file(file_path: str) -> bool:
|
||||||
2. System commands: ``afplay`` (macOS), ``ffplay`` (cross-platform),
|
2. System commands: ``afplay`` (macOS), ``ffplay`` (cross-platform),
|
||||||
``aplay`` (Linux ALSA).
|
``aplay`` (Linux ALSA).
|
||||||
|
|
||||||
|
Playback can be interrupted by calling ``stop_playback()``.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
``True`` if playback succeeded, ``False`` otherwise.
|
``True`` if playback succeeded, ``False`` otherwise.
|
||||||
"""
|
"""
|
||||||
|
global _active_playback
|
||||||
|
|
||||||
if not os.path.isfile(file_path):
|
if not os.path.isfile(file_path):
|
||||||
logger.warning("Audio file not found: %s", file_path)
|
logger.warning("Audio file not found: %s", file_path)
|
||||||
return False
|
return False
|
||||||
|
|
@ -372,7 +448,7 @@ def play_audio_file(file_path: str) -> bool:
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug("sounddevice playback failed: %s", e)
|
logger.debug("sounddevice playback failed: %s", e)
|
||||||
|
|
||||||
# Fall back to system audio players
|
# Fall back to system audio players (using Popen for interruptability)
|
||||||
system = platform.system()
|
system = platform.system()
|
||||||
players = []
|
players = []
|
||||||
|
|
||||||
|
|
@ -386,10 +462,17 @@ def play_audio_file(file_path: str) -> bool:
|
||||||
exe = shutil.which(cmd[0])
|
exe = shutil.which(cmd[0])
|
||||||
if exe:
|
if exe:
|
||||||
try:
|
try:
|
||||||
subprocess.run(cmd, capture_output=True, timeout=300)
|
proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||||||
|
with _playback_lock:
|
||||||
|
_active_playback = proc
|
||||||
|
proc.wait(timeout=300)
|
||||||
|
with _playback_lock:
|
||||||
|
_active_playback = None
|
||||||
return True
|
return True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.debug("System player %s failed: %s", cmd[0], e)
|
logger.debug("System player %s failed: %s", cmd[0], e)
|
||||||
|
with _playback_lock:
|
||||||
|
_active_playback = None
|
||||||
|
|
||||||
logger.warning("No audio player available for %s", file_path)
|
logger.warning("No audio player available for %s", file_path)
|
||||||
return False
|
return False
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue