feat: add silence filter, hallucination guard, and continuous mode control

- Skip silent recordings before STT call (RMS check in AudioRecorder.stop)
- Filter known Whisper hallucinations ("Thank you.", "Bye." etc.)
- Continuous mode: Ctrl+R starts loop, Ctrl+R during recording exits it
- Wait for TTS to finish before auto-restart to avoid recording speaker
- Silence timeout increased to 3s for natural pauses
- Tests: hallucination filter, silent recording skip, real speech passthrough
2026-03-03 19:58:38 +03:00 · 2026-03-03 19:58:38 +03:00 · 32b033c11c
commit 32b033c11c
parent bfd9c97705
2 changed files with 111 additions and 3 deletions
--- a/tests/tools/test_voice_mode.py
+++ b/tests/tools/test_voice_mode.py
@ -154,8 +154,8 @@ class TestAudioRecorderStop:
        recorder = AudioRecorder()
        recorder.start()
-        # Simulate captured audio frames (1 second of silence)
+        # Simulate captured audio frames (1 second of loud audio above RMS threshold)
-        frame = np.zeros((SAMPLE_RATE, 1), dtype="int16")
+        frame = np.full((SAMPLE_RATE, 1), 1000, dtype="int16")
        recorder._frames = [frame]
        wav_path = recorder.stop()
@ -189,6 +189,24 @@ class TestAudioRecorderStop:
        wav_path = recorder.stop()
        assert wav_path is None
    def test_stop_returns_none_for_silent_recording(self, mock_sd, temp_voice_dir):
        """A near-silent capture makes ``stop()`` return ``None``."""
        numpy = pytest.importorskip("numpy")
        mock_sd.InputStream.return_value = MagicMock()

        from tools.voice_mode import SAMPLE_RATE, AudioRecorder

        rec = AudioRecorder()
        rec.start()

        # One second of constant amplitude 10, so the RMS is 10 (near-silence).
        quiet_second = numpy.full((SAMPLE_RATE, 1), 10, dtype="int16")
        rec._frames = [quiet_second]

        assert rec.stop() is None
 class TestAudioRecorderCancel:
    def test_cancel_discards_frames(self, mock_sd):
@ -259,6 +277,52 @@ class TestTranscribeRecording:
        assert result["transcript"] == "hello world"
        mock_transcribe.assert_called_once_with("/tmp/test.wav", model="whisper-1")
    def test_filters_whisper_hallucination(self):
        """A known hallucination comes back as an empty, flagged transcript."""
        fake_stt = MagicMock(
            return_value={"success": True, "transcript": "Thank you."},
        )
        with patch("tools.transcription_tools.transcribe_audio", fake_stt):
            from tools.voice_mode import transcribe_recording

            outcome = transcribe_recording("/tmp/test.wav")

        assert outcome["success"] is True
        assert outcome["transcript"] == ""
        assert outcome["filtered"] is True
    def test_does_not_filter_real_speech(self):
        """A genuine sentence passes through verbatim, without a ``filtered`` key."""
        spoken = "Thank you for helping me with this code."
        fake_stt = MagicMock(return_value={"success": True, "transcript": spoken})
        with patch("tools.transcription_tools.transcribe_audio", fake_stt):
            from tools.voice_mode import transcribe_recording

            outcome = transcribe_recording("/tmp/test.wav")

        assert outcome["transcript"] == spoken
        assert "filtered" not in outcome
 class TestWhisperHallucinationFilter:
    """Unit tests for ``is_whisper_hallucination``."""

    def test_known_hallucinations(self):
        """Listed phrases match regardless of case or surrounding whitespace."""
        from tools.voice_mode import is_whisper_hallucination

        flagged_inputs = (
            "Thank you.",
            "thank you",
            "Thanks for watching.",
            "Bye.",
            "  Thank you.  ",  # with whitespace
            "you",
        )
        for phrase in flagged_inputs:
            assert is_whisper_hallucination(phrase) is True

    def test_real_speech_not_filtered(self):
        """Longer utterances are not flagged, even when they start like a listed phrase."""
        from tools.voice_mode import is_whisper_hallucination

        genuine_inputs = (
            "Hello, how are you?",
            "Thank you for your help with the project.",
            "Can you explain this code?",
        )
        for phrase in genuine_inputs:
            assert is_whisper_hallucination(phrase) is False
 # ============================================================================
 # play_audio_file
--- a/tools/voice_mode.py
+++ b/tools/voice_mode.py
@ -235,6 +235,12 @@ class AudioRecorder:
                logger.debug("Recording too short (%d samples), discarding", len(audio_data))
                return None
            # Skip silent recordings (RMS below threshold = no real speech)
            rms = int(np.sqrt(np.mean(audio_data.astype(np.float64) ** 2)))
            if rms < SILENCE_RMS_THRESHOLD:
                logger.info("Recording too quiet (RMS=%d < %d), discarding", rms, SILENCE_RMS_THRESHOLD)
                return None
            return self._write_wav(audio_data)
    def cancel(self) -> None:
@ -276,6 +282,36 @@ class AudioRecorder:
        return wav_path
 # ============================================================================
 # Whisper hallucination filter
 # ============================================================================
 # Whisper commonly hallucinates these phrases on silent/near-silent audio.
 WHISPER_HALLUCINATIONS = {
    "thank you.",
    "thank you",
    "thanks for watching.",
    "thanks for watching",
    "subscribe to my channel.",
    "subscribe to my channel",
    "like and subscribe.",
    "like and subscribe",
    "please subscribe.",
    "please subscribe",
    "thank you for watching.",
    "thank you for watching",
    "bye.",
    "bye",
    "you",
    "the end.",
    "the end",
 }


 def is_whisper_hallucination(transcript: Optional[str]) -> bool:
    """Return True if *transcript* is a known Whisper hallucination on silence.

    The comparison ignores case and leading/trailing whitespace; the
    remaining text must match an entry of ``WHISPER_HALLUCINATIONS`` exactly,
    so longer sentences that merely start with a listed phrase are not
    flagged.

    Args:
        transcript: Text returned by the STT backend. ``None`` (or any other
            non-string value) is accepted and treated as "not a
            hallucination" instead of raising.

    Returns:
        True when the normalized transcript is in ``WHISPER_HALLUCINATIONS``,
        False otherwise.
    """
    # A backend may report ``"transcript": None``; ``dict.get(key, "")`` only
    # substitutes the default for a *missing* key, so None can reach us here.
    if not isinstance(transcript, str):
        return False
    return transcript.strip().lower() in WHISPER_HALLUCINATIONS
 # ============================================================================
 # STT dispatch
 # ============================================================================
@ -283,6 +319,7 @@ def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str
    """Transcribe a WAV recording using the existing Whisper pipeline.
    Delegates to ``tools.transcription_tools.transcribe_audio()``.
    Filters out known Whisper hallucinations on silent audio.
    Args:
        wav_path: Path to the WAV file.
@ -293,7 +330,14 @@ def transcribe_recording(wav_path: str, model: Optional[str] = None) -> Dict[str
    """
    from tools.transcription_tools import transcribe_audio
-    return transcribe_audio(wav_path, model=model)
+    result = transcribe_audio(wav_path, model=model)
    # Filter out Whisper hallucinations (common on silent/near-silent audio)
    if result.get("success") and is_whisper_hallucination(result.get("transcript", "")):
        logger.info("Filtered Whisper hallucination: %r", result["transcript"])
        return {"success": True, "transcript": "", "filtered": True}
    return result
 # ============================================================================