fix: voice mode race conditions, temp file leak, think tag parsing

- Atomic check-and-set for _voice_recording flag with _voice_lock
- Guard _voice_stop_and_transcribe against concurrent invocation
- Remove premature flag clearing from Ctrl+R handler
- Clean up temp WAV files in finally block (_play_via_tempfile)
- Use buffer-level regex for <think> block filtering (handles chunked tags)
- Prevent "/voice on" prompt accumulation on repeated calls
- Include Groq in STT key error message
2026-03-06 01:32:37 +03:00 · 2026-03-06 01:32:37 +03:00 · 3a1b35ed92
commit 3a1b35ed92
parent 7d4b4e95f1
2 changed files with 46 additions and 41 deletions
--- a/cli.py
+++ b/cli.py
@ -3544,10 +3544,6 @@ class HermesCLI:

    def _voice_start_recording(self):
        """Start capturing audio from the microphone."""
-        # Prevent double-start from concurrent threads
-        if self._voice_recording:
-            return
-
        from tools.voice_mode import AudioRecorder, check_voice_requirements

        reqs = check_voice_requirements()
@ -3559,10 +3555,18 @@ class HermesCLI:
            )
        if not reqs["stt_key_set"]:
            raise RuntimeError(
-                "Voice mode requires VOICE_TOOLS_OPENAI_KEY for transcription.\n"
-                "Get one at: https://platform.openai.com/api-keys"
+                "Voice mode requires an STT API key for transcription.\n"
+                "Set GROQ_API_KEY (free) or VOICE_TOOLS_OPENAI_KEY.\n"
+                "Groq: https://console.groq.com/keys\n"
+                "OpenAI: https://platform.openai.com/api-keys"
            )

+        # Prevent double-start from concurrent threads (atomic check-and-set)
+        with self._voice_lock:
+            if self._voice_recording:
+                return
+            self._voice_recording = True
+
        # Load silence detection params from config
        voice_cfg = {}
        try:
@ -3595,9 +3599,12 @@ class HermesCLI:
        except Exception:
            pass

-        self._voice_recorder.start(on_silence_stop=_on_silence)
-        with self._voice_lock:
-            self._voice_recording = True
+        try:
+            self._voice_recorder.start(on_silence_stop=_on_silence)
+        except Exception:
+            with self._voice_lock:
+                self._voice_recording = False
+            raise
        _cprint(f"\n{_GOLD}● Recording...{_RST} {_DIM}(auto-stops on silence | Ctrl+R to stop & exit continuous){_RST}")

        # Periodically refresh prompt to update audio level indicator
@ -3610,6 +3617,12 @@ class HermesCLI:

    def _voice_stop_and_transcribe(self):
        """Stop recording, transcribe via STT, and queue the transcript as input."""
+        # Atomic guard: only one thread can enter stop-and-transcribe
+        with self._voice_lock:
+            if not self._voice_recording:
+                return
+            self._voice_recording = False
+
        submitted = False
        wav_path = None
        try:
@ -3617,8 +3630,6 @@ class HermesCLI:
                return

            wav_path = self._voice_recorder.stop()
-            with self._voice_lock:
-                self._voice_recording = False

            # Audio cue: double beep after stream stopped (no CoreAudio conflict)
            try:
@ -3764,6 +3775,10 @@ class HermesCLI:

    def _enable_voice_mode(self):
        """Enable voice mode after checking requirements."""
+        if self._voice_mode:
+            _cprint(f"{_DIM}Voice mode is already enabled.{_RST}")
+            return
+
        from tools.voice_mode import check_voice_requirements

        reqs = check_voice_requirements()
@ -4838,7 +4853,7 @@ class HermesCLI:
                # Manual stop via Ctrl+R: stop continuous mode
                with cli_ref._voice_lock:
                    cli_ref._voice_continuous = False
-                    cli_ref._voice_recording = False
+                # Flag clearing is handled atomically inside _voice_stop_and_transcribe
                event.app.invalidate()
                threading.Thread(
                    target=cli_ref._voice_stop_and_transcribe,
--- a/tools/tts_tool.py
+++ b/tools/tts_tool.py
@ -519,10 +519,11 @@ def stream_tts_to_speaker(
                    output_stream = None

        sentence_buf = ""
-        in_think = False  # track <think>...</think> blocks
        min_sentence_len = 20
        long_flush_len = 100
        queue_timeout = 0.5
+        # Regex to strip complete <think>...</think> blocks from buffer
+        _think_block_re = re.compile(r'<think[\s>].*?</think>', flags=re.DOTALL)

        def _speak_sentence(sentence: str):
            """Display sentence and optionally generate + play audio."""
@ -562,6 +563,7 @@ def stream_tts_to_speaker(

        def _play_via_tempfile(audio_iter, stop_evt):
            """Write PCM chunks to a temp WAV file and play it."""
+            tmp_path = None
            try:
                import wave
                tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
@ -576,9 +578,14 @@ def stream_tts_to_speaker(
                        wf.writeframes(chunk)
                from tools.voice_mode import play_audio_file
                play_audio_file(tmp_path)
-                os.unlink(tmp_path)
            except Exception as exc:
                logger.warning("Temp-file TTS fallback failed: %s", exc)
+            finally:
+                if tmp_path:
+                    try:
+                        os.unlink(tmp_path)
+                    except OSError:
+                        pass

        while not stop_event.is_set():
            # Read next delta from queue
@ -592,41 +599,24 @@ def stream_tts_to_speaker(
                continue

            if delta is None:
-                # End-of-text sentinel: flush remaining buffer
+                # End-of-text sentinel: strip any remaining think blocks, flush
+                sentence_buf = _think_block_re.sub('', sentence_buf)
                if sentence_buf.strip():
                    _speak_sentence(sentence_buf)
                break

+            sentence_buf += delta
+
            # --- Think block filtering ---
-            # Process delta character by character for think tags
-            i = 0
-            filtered_delta = []
-            while i < len(delta):
-                # Check for opening <think tag
-                if delta[i:].startswith("<think"):
-                    in_think = True
-                    # Skip past the tag
-                    end = delta.find(">", i)
-                    if end != -1:
-                        i = end + 1
-                    else:
-                        i = len(delta)
-                    continue
-                # Check for closing </think> tag
-                if delta[i:].startswith("</think>"):
-                    in_think = False
-                    i += len("</think>")
-                    continue
-                if not in_think:
-                    filtered_delta.append(delta[i])
-                i += 1
+            # Strip complete <think>...</think> blocks from buffer.
+            # Works correctly even when tags span multiple deltas.
+            sentence_buf = _think_block_re.sub('', sentence_buf)

-            text = "".join(filtered_delta)
-            if not text:
+            # If an incomplete <think tag is at the end, wait for more data
+            # before extracting sentences (the closing tag may arrive next).
+            if '<think' in sentence_buf and '</think>' not in sentence_buf:
                continue

-            sentence_buf += text
-
            # Check for sentence boundaries
            while True:
                m = _SENTENCE_BOUNDARY_RE.search(sentence_buf)