feat: add silence filter, hallucination guard, and continuous mode control
- Skip silent recordings before STT call (RMS check in AudioRecorder.stop)
- Filter known Whisper hallucinations ("Thank you.", "Bye.", etc.)
- Continuous mode: Ctrl+R starts loop, Ctrl+R during recording exits it
- Wait for TTS to finish before auto-restart to avoid recording the speaker output
- Silence timeout increased to 3s for natural pauses
- Tests: hallucination filter, silent recording skip, real speech passthrough
This commit is contained in:
parent
bfd9c97705
commit
32b033c11c
2 changed files with 111 additions and 3 deletions
|
|
@ -154,8 +154,8 @@ class TestAudioRecorderStop:
|
||||||
recorder = AudioRecorder()
|
recorder = AudioRecorder()
|
||||||
recorder.start()
|
recorder.start()
|
||||||
|
|
||||||
# Simulate captured audio frames (1 second of silence)
|
# Simulate captured audio frames (1 second of loud audio above RMS threshold)
|
||||||
frame = np.zeros((SAMPLE_RATE, 1), dtype="int16")
|
frame = np.full((SAMPLE_RATE, 1), 1000, dtype="int16")
|
||||||
recorder._frames = [frame]
|
recorder._frames = [frame]
|
||||||
|
|
||||||
wav_path = recorder.stop()
|
wav_path = recorder.stop()
|
||||||
|
|
@ -189,6 +189,24 @@ class TestAudioRecorderStop:
|
||||||
wav_path = recorder.stop()
|
wav_path = recorder.stop()
|
||||||
assert wav_path is None
|
assert wav_path is None
|
||||||
|
|
||||||
|
def test_stop_returns_none_for_silent_recording(self, mock_sd, temp_voice_dir):
|
||||||
|
np = pytest.importorskip("numpy")
|
||||||
|
|
||||||
|
mock_stream = MagicMock()
|
||||||
|
mock_sd.InputStream.return_value = mock_stream
|
||||||
|
|
||||||
|
from tools.voice_mode import AudioRecorder, SAMPLE_RATE
|
||||||
|
|
||||||
|
recorder = AudioRecorder()
|
||||||
|
recorder.start()
|
||||||
|
|
||||||
|
# 1 second of near-silence (RMS well below threshold)
|
||||||
|
frame = np.full((SAMPLE_RATE, 1), 10, dtype="int16")
|
||||||
|
recorder._frames = [frame]
|
||||||
|
|
||||||
|
wav_path = recorder.stop()
|
||||||
|
assert wav_path is None
|
||||||
|
|
||||||
|
|
||||||
class TestAudioRecorderCancel:
|
class TestAudioRecorderCancel:
|
||||||
def test_cancel_discards_frames(self, mock_sd):
|
def test_cancel_discards_frames(self, mock_sd):
|
||||||
|
|
@ -259,6 +277,52 @@ class TestTranscribeRecording:
|
||||||
assert result["transcript"] == "hello world"
|
assert result["transcript"] == "hello world"
|
||||||
mock_transcribe.assert_called_once_with("/tmp/test.wav", model="whisper-1")
|
mock_transcribe.assert_called_once_with("/tmp/test.wav", model="whisper-1")
|
||||||
|
|
||||||
|
def test_filters_whisper_hallucination(self):
|
||||||
|
mock_transcribe = MagicMock(return_value={
|
||||||
|
"success": True,
|
||||||
|
"transcript": "Thank you.",
|
||||||
|
})
|
||||||
|
|
||||||
|
with patch("tools.transcription_tools.transcribe_audio", mock_transcribe):
|
||||||
|
from tools.voice_mode import transcribe_recording
|
||||||
|
result = transcribe_recording("/tmp/test.wav")
|
||||||
|
|
||||||
|
assert result["success"] is True
|
||||||
|
assert result["transcript"] == ""
|
||||||
|
assert result["filtered"] is True
|
||||||
|
|
||||||
|
def test_does_not_filter_real_speech(self):
|
||||||
|
mock_transcribe = MagicMock(return_value={
|
||||||
|
"success": True,
|
||||||
|
"transcript": "Thank you for helping me with this code.",
|
||||||
|
})
|
||||||
|
|
||||||
|
with patch("tools.transcription_tools.transcribe_audio", mock_transcribe):
|
||||||
|
from tools.voice_mode import transcribe_recording
|
||||||
|
result = transcribe_recording("/tmp/test.wav")
|
||||||
|
|
||||||
|
assert result["transcript"] == "Thank you for helping me with this code."
|
||||||
|
assert "filtered" not in result
|
||||||
|
|
||||||
|
|
||||||
|
class TestWhisperHallucinationFilter:
|
||||||
|
def test_known_hallucinations(self):
|
||||||
|
from tools.voice_mode import is_whisper_hallucination
|
||||||
|
|
||||||
|
assert is_whisper_hallucination("Thank you.") is True
|
||||||
|
assert is_whisper_hallucination("thank you") is True
|
||||||
|
assert is_whisper_hallucination("Thanks for watching.") is True
|
||||||
|
assert is_whisper_hallucination("Bye.") is True
|
||||||
|
assert is_whisper_hallucination(" Thank you. ") is True # with whitespace
|
||||||
|
assert is_whisper_hallucination("you") is True
|
||||||
|
|
||||||
|
def test_real_speech_not_filtered(self):
|
||||||
|
from tools.voice_mode import is_whisper_hallucination
|
||||||
|
|
||||||
|
assert is_whisper_hallucination("Hello, how are you?") is False
|
||||||
|
assert is_whisper_hallucination("Thank you for your help with the project.") is False
|
||||||
|
assert is_whisper_hallucination("Can you explain this code?") is False
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
# play_audio_file
|
# play_audio_file
|
||||||
|
|
|
||||||
|
|
@ -235,6 +235,12 @@ class AudioRecorder:
|
||||||
logger.debug("Recording too short (%d samples), discarding", len(audio_data))
|
logger.debug("Recording too short (%d samples), discarding", len(audio_data))
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
# Skip silent recordings (RMS below threshold = no real speech)
|
||||||
|
rms = int(np.sqrt(np.mean(audio_data.astype(np.float64) ** 2)))
|
||||||
|
if rms < SILENCE_RMS_THRESHOLD:
|
||||||
|
logger.info("Recording too quiet (RMS=%d < %d), discarding", rms, SILENCE_RMS_THRESHOLD)
|
||||||
|
return None
|
||||||
|
|
||||||
return self._write_wav(audio_data)
|
return self._write_wav(audio_data)
|
||||||
|
|
||||||
def cancel(self) -> None:
|
def cancel(self) -> None:
|
||||||
|
|
@ -276,6 +282,36 @@ class AudioRecorder:
|
||||||
return wav_path
|
return wav_path
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Whisper hallucination filter
|
||||||
|
# ============================================================================
|
||||||
|
# Whisper commonly hallucinates these phrases on silent/near-silent audio.
|
||||||
|
WHISPER_HALLUCINATIONS = {
    "thank you.",
    "thank you",
    "thanks for watching.",
    "thanks for watching",
    "subscribe to my channel.",
    "subscribe to my channel",
    "like and subscribe.",
    "like and subscribe",
    "please subscribe.",
    "please subscribe",
    "thank you for watching.",
    "thank you for watching",
    "bye.",
    "bye",
    "you",
    "the end.",
    "the end",
}

# Trailing characters ignored when matching a transcript against the set
# above, so "Thank you!" and "Bye..." are treated like "Thank you." / "Bye.".
_HALLUCINATION_TRAILING_CHARS = ".!?\u2026 \t"


def is_whisper_hallucination(transcript: str) -> bool:
    """Check if a transcript is a known Whisper hallucination on silence.

    The comparison ignores case, surrounding whitespace and trailing
    sentence punctuation. Only whole-transcript matches count: a longer
    sentence that merely starts with one of the phrases is not filtered.

    Args:
        transcript: Text returned by the STT backend. Non-string values
            (for example ``None``) are accepted and never match.

    Returns:
        True if the whole transcript is one of ``WHISPER_HALLUCINATIONS``,
        False otherwise (including for empty or non-string input).
    """
    # A null/absent transcript cannot be a hallucination; returning False
    # avoids an AttributeError on ``None.strip()``.
    if not isinstance(transcript, str):
        return False

    normalized = transcript.strip().lower()
    if normalized in WHISPER_HALLUCINATIONS:
        return True

    # Closing punctuation varies between runs, so retry with it removed.
    without_punctuation = normalized.rstrip(_HALLUCINATION_TRAILING_CHARS)
    return without_punctuation in WHISPER_HALLUCINATIONS
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
# STT dispatch
|
# STT dispatch
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
|
@ -283,6 +319,7 @@ def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str
|
||||||
"""Transcribe a WAV recording using the existing Whisper pipeline.
|
"""Transcribe a WAV recording using the existing Whisper pipeline.
|
||||||
|
|
||||||
Delegates to ``tools.transcription_tools.transcribe_audio()``.
|
Delegates to ``tools.transcription_tools.transcribe_audio()``.
|
||||||
|
Filters out known Whisper hallucinations on silent audio.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
wav_path: Path to the WAV file.
|
wav_path: Path to the WAV file.
|
||||||
|
|
@ -293,7 +330,14 @@ def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str
|
||||||
"""
|
"""
|
||||||
from tools.transcription_tools import transcribe_audio
|
from tools.transcription_tools import transcribe_audio
|
||||||
|
|
||||||
return transcribe_audio(wav_path, model=model)
|
result = transcribe_audio(wav_path, model=model)
|
||||||
|
|
||||||
|
# Filter out Whisper hallucinations (common on silent/near-silent audio)
|
||||||
|
if result.get("success") and is_whisper_hallucination(result.get("transcript", "")):
|
||||||
|
logger.info("Filtered Whisper hallucination: %r", result["transcript"])
|
||||||
|
return {"success": True, "transcript": "", "filtered": True}
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue