fix(cli): reasoning tag suppression during streaming + fix fallback detection

Fixes two issues found during live testing:

1. Reasoning tag suppression: close tags like </REASONING_SCRATCHPAD>
   that arrive split across stream tokens (e.g. '</REASONING_SCRATCH' +
   'PAD>\n\nHello') were being lost because the buffer was discarded.
   Fix: keep a sliding window of the tail (max close tag length) so
   partial tags survive across tokens.

2. Streaming fallback detection was too broad — 'stream' matched any
   error containing that word (including 'stream_options' rejections).
   Narrowed to specific phrases: 'streaming is not', 'streaming not
   support', 'does not support stream', 'not available'.

Verified with real API calls: streaming works end-to-end with
reasoning block suppression, response box framing, and proper
fallback to Rich Panel when streaming isn't active.
This commit is contained in:
teknium1 2026-03-16 05:28:10 -07:00
parent 2219695d92
commit ac739e485f
2 changed files with 82 additions and 4 deletions

cli.py — 79 lines changed (view file)

@ -1416,12 +1416,86 @@ class HermesCLI:
Receives text deltas from the agent as tokens arrive. Buffers
partial lines and emits complete lines via _cprint to work
reliably with prompt_toolkit's patch_stdout.
Reasoning/thinking blocks (<REASONING_SCRATCHPAD>, <think>, etc.)
are suppressed during streaming since they'd display raw XML tags.
The agent strips them from the final response anyway.
"""
if not text:
return
# Open the response box header on the very first delta
self._stream_started = True
# ── Tag-based reasoning suppression ──
# Track whether we're inside a reasoning/thinking block.
# These tags are model-generated (system prompt tells the model
# to use them) and get stripped from final_response. We must
# suppress them during streaming too.
_OPEN_TAGS = ("<REASONING_SCRATCHPAD>", "<think>", "<reasoning>")
_CLOSE_TAGS = ("</REASONING_SCRATCHPAD>", "</think>", "</reasoning>")
# Append to a pre-filter buffer first
self._stream_prefilt = getattr(self, "_stream_prefilt", "") + text
# Check if we're entering a reasoning block
if not getattr(self, "_in_reasoning_block", False):
for tag in _OPEN_TAGS:
idx = self._stream_prefilt.find(tag)
if idx != -1:
# Emit everything before the tag
before = self._stream_prefilt[:idx]
if before:
self._emit_stream_text(before)
self._in_reasoning_block = True
self._stream_prefilt = self._stream_prefilt[idx + len(tag):]
break
# Could also be a partial open tag at the end — hold it back
if not getattr(self, "_in_reasoning_block", False):
# Check for partial tag match at the end
safe = self._stream_prefilt
for tag in _OPEN_TAGS:
for i in range(1, len(tag)):
if self._stream_prefilt.endswith(tag[:i]):
safe = self._stream_prefilt[:-i]
break
if safe:
self._emit_stream_text(safe)
self._stream_prefilt = self._stream_prefilt[len(safe):]
return
# Inside a reasoning block — look for close tag.
# Keep accumulating _stream_prefilt because close tags can arrive
# split across multiple tokens (e.g. "</REASONING_SCRATCH" + "PAD>...").
if getattr(self, "_in_reasoning_block", False):
for tag in _CLOSE_TAGS:
idx = self._stream_prefilt.find(tag)
if idx != -1:
self._in_reasoning_block = False
after = self._stream_prefilt[idx + len(tag):]
self._stream_prefilt = ""
# Process remaining text after close tag
if after:
self._emit_stream_text(after)
return
# Still inside reasoning block — keep only the tail that could
# be a partial close tag prefix (save memory on long blocks).
max_tag_len = max(len(t) for t in _CLOSE_TAGS)
if len(self._stream_prefilt) > max_tag_len:
self._stream_prefilt = self._stream_prefilt[-max_tag_len:]
return
def _emit_stream_text(self, text: str) -> None:
"""Emit filtered text to the streaming display."""
if not text:
return
# Open the response box header on the very first visible text
if not self._stream_box_opened:
# Strip leading whitespace/newlines before first visible content
text = text.lstrip("\n")
if not text:
return
self._stream_box_opened = True
try:
from hermes_cli.skin_engine import get_active_skin
@ -1433,7 +1507,6 @@ class HermesCLI:
fill = w - 2 - len(label)
_cprint(f"\n{_GOLD}╭─{label}{'' * max(fill - 1, 0)}{_RST}")
self._stream_started = True
self._stream_buf += text
# Emit complete lines, keep partial remainder in buffer
@ -1457,6 +1530,8 @@ class HermesCLI:
self._stream_buf = ""
self._stream_started = False
self._stream_box_opened = False
self._stream_prefilt = ""
self._in_reasoning_block = False
def _slow_command_status(self, command: str) -> str:
"""Return a user-facing status message for slower slash commands."""

View file

@ -3209,10 +3209,13 @@ class AIAgent:
result["response"] = _call_chat_completions()
except Exception as e:
err_text = str(e).lower()
# Fall back to non-streaming if provider doesn't support it
# Fall back to non-streaming if provider doesn't support it.
# Be specific in matching — "stream" alone is too broad and
# catches unrelated errors like "stream_options" rejections.
stream_unsupported = any(
kw in err_text
for kw in ("stream", "not support", "unsupported", "not available")
for kw in ("streaming is not", "streaming not support",
"does not support stream", "not available")
)
if stream_unsupported:
logger.info("Streaming not supported by provider, falling back to non-streaming: %s", e)