diff --git a/tests/tools/test_voice_mode.py b/tests/tools/test_voice_mode.py index ff1a99b2..0d40932e 100644 --- a/tests/tools/test_voice_mode.py +++ b/tests/tools/test_voice_mode.py @@ -154,8 +154,8 @@ class TestAudioRecorderStop: recorder = AudioRecorder() recorder.start() - # Simulate captured audio frames (1 second of silence) - frame = np.zeros((SAMPLE_RATE, 1), dtype="int16") + # Simulate captured audio frames (1 second of loud audio above RMS threshold) + frame = np.full((SAMPLE_RATE, 1), 1000, dtype="int16") recorder._frames = [frame] wav_path = recorder.stop() @@ -189,6 +189,24 @@ class TestAudioRecorderStop: wav_path = recorder.stop() assert wav_path is None + def test_stop_returns_none_for_silent_recording(self, mock_sd, temp_voice_dir): + np = pytest.importorskip("numpy") + + mock_stream = MagicMock() + mock_sd.InputStream.return_value = mock_stream + + from tools.voice_mode import AudioRecorder, SAMPLE_RATE + + recorder = AudioRecorder() + recorder.start() + + # 1 second of near-silence (RMS well below threshold) + frame = np.full((SAMPLE_RATE, 1), 10, dtype="int16") + recorder._frames = [frame] + + wav_path = recorder.stop() + assert wav_path is None + class TestAudioRecorderCancel: def test_cancel_discards_frames(self, mock_sd): @@ -259,6 +277,52 @@ class TestTranscribeRecording: assert result["transcript"] == "hello world" mock_transcribe.assert_called_once_with("/tmp/test.wav", model="whisper-1") + def test_filters_whisper_hallucination(self): + mock_transcribe = MagicMock(return_value={ + "success": True, + "transcript": "Thank you.", + }) + + with patch("tools.transcription_tools.transcribe_audio", mock_transcribe): + from tools.voice_mode import transcribe_recording + result = transcribe_recording("/tmp/test.wav") + + assert result["success"] is True + assert result["transcript"] == "" + assert result["filtered"] is True + + def test_does_not_filter_real_speech(self): + mock_transcribe = MagicMock(return_value={ + "success": True, + 
"transcript": "Thank you for helping me with this code.", + }) + + with patch("tools.transcription_tools.transcribe_audio", mock_transcribe): + from tools.voice_mode import transcribe_recording + result = transcribe_recording("/tmp/test.wav") + + assert result["transcript"] == "Thank you for helping me with this code." + assert "filtered" not in result + + +class TestWhisperHallucinationFilter: + def test_known_hallucinations(self): + from tools.voice_mode import is_whisper_hallucination + + assert is_whisper_hallucination("Thank you.") is True + assert is_whisper_hallucination("thank you") is True + assert is_whisper_hallucination("Thanks for watching.") is True + assert is_whisper_hallucination("Bye.") is True + assert is_whisper_hallucination(" Thank you. ") is True # with whitespace + assert is_whisper_hallucination("you") is True + + def test_real_speech_not_filtered(self): + from tools.voice_mode import is_whisper_hallucination + + assert is_whisper_hallucination("Hello, how are you?") is False + assert is_whisper_hallucination("Thank you for your help with the project.") is False + assert is_whisper_hallucination("Can you explain this code?") is False + # ============================================================================ # play_audio_file diff --git a/tools/voice_mode.py b/tools/voice_mode.py index 5abdc4d6..cdffa990 100644 --- a/tools/voice_mode.py +++ b/tools/voice_mode.py @@ -235,6 +235,12 @@ class AudioRecorder: logger.debug("Recording too short (%d samples), discarding", len(audio_data)) return None + # Skip silent recordings (RMS below threshold = no real speech) + rms = int(np.sqrt(np.mean(audio_data.astype(np.float64) ** 2))) + if rms < SILENCE_RMS_THRESHOLD: + logger.info("Recording too quiet (RMS=%d < %d), discarding", rms, SILENCE_RMS_THRESHOLD) + return None + return self._write_wav(audio_data) def cancel(self) -> None: @@ -276,6 +282,36 @@ class AudioRecorder: return wav_path +# 
# ============================================================================
# Whisper hallucination filter
# ============================================================================
# Whisper commonly hallucinates these phrases on silent/near-silent audio.
# Entries are lowercase; both the "with period" and "without period" spellings
# are kept so that code doing a direct membership test keeps working.
WHISPER_HALLUCINATIONS = {
    "thank you.",
    "thank you",
    "thanks for watching.",
    "thanks for watching",
    "subscribe to my channel.",
    "subscribe to my channel",
    "like and subscribe.",
    "like and subscribe",
    "please subscribe.",
    "please subscribe",
    "thank you for watching.",
    "thank you for watching",
    "bye.",
    "bye",
    "you",
    "the end.",
    "the end",
}


def is_whisper_hallucination(transcript: str) -> bool:
    """Check if a transcript is a known Whisper hallucination on silence.

    The transcript is compared case-insensitively and with surrounding
    whitespace removed.  Trailing sentence punctuation (``.``, ``!``, ``?``)
    is also ignored, so variants such as ``"Thank you!"`` or ``"Bye..."``
    are recognised in addition to the exact phrases listed in
    ``WHISPER_HALLUCINATIONS``.

    Args:
        transcript: Text returned by the speech-to-text backend.

    Returns:
        True if the whole transcript is one of the known hallucinated
        phrases; False otherwise, including for empty, ``None`` or
        non-string input.
    """
    # Backends can hand back a null transcript; that is "nothing to filter",
    # not an error.
    if not isinstance(transcript, str):
        return False

    normalized = transcript.strip().lower()
    if normalized in WHISPER_HALLUCINATIONS:
        return True

    # The closing punctuation of a hallucinated phrase varies between runs,
    # so retry the lookup without it.
    return normalized.rstrip(".!? ") in WHISPER_HALLUCINATIONS
@@ -293,7 +330,14 @@ def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str """ from tools.transcription_tools import transcribe_audio - return transcribe_audio(wav_path, model=model) + result = transcribe_audio(wav_path, model=model) + + # Filter out Whisper hallucinations (common on silent/near-silent audio) + if result.get("success") and is_whisper_hallucination(result.get("transcript", "")): + logger.info("Filtered Whisper hallucination: %r", result["transcript"]) + return {"success": True, "transcript": "", "filtered": True} + + return result # ============================================================================