fix: unify gateway session hygiene with agent compression config

The gateway had a SEPARATE compression system ('session hygiene') with hardcoded thresholds (100k tokens / 200 messages) that were completely disconnected from the model's context length and from the user's compression config in config.yaml. This caused premature auto-compression on Telegram/Discord — triggering at ~60k tokens (because of the 200-message threshold) or at inconsistent token counts.

Changes:
- Gateway hygiene now reads the model name from config.yaml and uses get_model_context_length() to derive the actual context limit
- Compression threshold comes from compression.threshold in config.yaml (default 0.85), same as the agent's ContextCompressor
- Removed the message-count-based trigger (it was redundant and caused false positives in tool-heavy sessions)
- Removed the undocumented session_hygiene config section — the standard compression.* config now controls everything
- Env var overrides (CONTEXT_COMPRESSION_THRESHOLD, CONTEXT_COMPRESSION_ENABLED) are respected
- Warn threshold is now 95% of the model's context length (was hardcoded at 200k)
- Updated tests to verify model-aware thresholds, scaling across models, and that message count alone no longer triggers compression

For claude-opus-4.6 (200k context) at the 85% threshold, gateway hygiene now triggers at 170k tokens instead of the old 100k.
2026-03-08 20:08:02 -07:00 · 2026-03-08 20:08:02 -07:00 · 67275641f8
commit 67275641f8
parent 3ffaac00dd
2 changed files with 253 additions and 180 deletions
--- a/gateway/run.py
+++ b/gateway/run.py
@ -900,159 +900,187 @@ class GatewayRunner:
        # every new message rehydrates an oversized transcript, causing
        # repeated truncation/context failures.  Detect this early and
        # compress proactively — before the agent even starts.  (#628)
        #
        # Thresholds are derived from the SAME compression config the
        # agent uses (compression.threshold × model context length) so
        # CLI and messaging platforms behave identically.
        # -----------------------------------------------------------------
        if history and len(history) >= 4:
-            from agent.model_metadata import estimate_messages_tokens_rough
+            from agent.model_metadata import (
                estimate_messages_tokens_rough,
                get_model_context_length,
            )
-            # Read thresholds from config.yaml → session_hygiene section
+            # Read model + compression config from config.yaml — same
-            _hygiene_cfg = {}
+            # source of truth the agent itself uses.
            _hyg_model = "anthropic/claude-sonnet-4.6"
            _hyg_threshold_pct = 0.85
            _hyg_compression_enabled = True
            try:
                _hyg_cfg_path = _hermes_home / "config.yaml"
                if _hyg_cfg_path.exists():
                    import yaml as _hyg_yaml
                    with open(_hyg_cfg_path) as _hyg_f:
                        _hyg_data = _hyg_yaml.safe_load(_hyg_f) or {}
-                    _hygiene_cfg = _hyg_data.get("session_hygiene", {})
+
-                    if not isinstance(_hygiene_cfg, dict):
+                    # Resolve model name (same logic as run_sync)
-                        _hygiene_cfg = {}
+                    _model_cfg = _hyg_data.get("model", {})
                    if isinstance(_model_cfg, str):
                        _hyg_model = _model_cfg
                    elif isinstance(_model_cfg, dict):
                        _hyg_model = _model_cfg.get("default", _hyg_model)
                    # Read compression settings
                    _comp_cfg = _hyg_data.get("compression", {})
                    if isinstance(_comp_cfg, dict):
                        _hyg_threshold_pct = float(
                            _comp_cfg.get("threshold", _hyg_threshold_pct)
                        )
                        _hyg_compression_enabled = str(
                            _comp_cfg.get("enabled", True)
                        ).lower() in ("true", "1", "yes")
            except Exception:
                pass
-            _compress_token_threshold = int(
+            # Also check env overrides (same as run_agent.py)
-                _hygiene_cfg.get("auto_compress_tokens", 100_000)
+            _hyg_threshold_pct = float(
-            )
+                os.getenv("CONTEXT_COMPRESSION_THRESHOLD", str(_hyg_threshold_pct))
            _compress_msg_threshold = int(
                _hygiene_cfg.get("auto_compress_messages", 200)
            )
            _warn_token_threshold = int(
                _hygiene_cfg.get("warn_tokens", 200_000)
            )
            if os.getenv("CONTEXT_COMPRESSION_ENABLED", "").lower() in ("false", "0", "no"):
                _hyg_compression_enabled = False
-            _msg_count = len(history)
+            if _hyg_compression_enabled:
-            _approx_tokens = estimate_messages_tokens_rough(history)
+                _hyg_context_length = get_model_context_length(_hyg_model)
-
+                _compress_token_threshold = int(
-            _needs_compress = (
+                    _hyg_context_length * _hyg_threshold_pct
                _approx_tokens >= _compress_token_threshold
                or _msg_count >= _compress_msg_threshold
            )
            if _needs_compress:
                logger.info(
                    "Session hygiene: %s messages, ~%s tokens — auto-compressing "
                    "(thresholds: %s msgs / %s tokens)",
                    _msg_count, f"{_approx_tokens:,}",
                    _compress_msg_threshold, f"{_compress_token_threshold:,}",
                )
                # Warn if still huge after compression (95% of context)
                _warn_token_threshold = int(_hyg_context_length * 0.95)
                _msg_count = len(history)
                _approx_tokens = estimate_messages_tokens_rough(history)
                _needs_compress = _approx_tokens >= _compress_token_threshold
                if _needs_compress:
                    logger.info(
                        "Session hygiene: %s messages, ~%s tokens — auto-compressing "
                        "(threshold: %s%% of %s = %s tokens)",
                        _msg_count, f"{_approx_tokens:,}",
                        int(_hyg_threshold_pct * 100),
                        f"{_hyg_context_length:,}",
                        f"{_compress_token_threshold:,}",
                    )
                    _hyg_adapter = self.adapters.get(source.platform)
                    if _hyg_adapter:
                        try:
                            await _hyg_adapter.send(
                                source.chat_id,
                                f"🗜️ Session is large ({_msg_count} messages, "
                                f"~{_approx_tokens:,} tokens). Auto-compressing..."
                            )
                        except Exception:
                            pass
                _hyg_adapter = self.adapters.get(source.platform)
                if _hyg_adapter:
                    try:
-                        await _hyg_adapter.send(
+                        from run_agent import AIAgent
                            source.chat_id,
                            f"🗜️ Session is large ({_msg_count} messages, "
                            f"~{_approx_tokens:,} tokens). Auto-compressing..."
                        )
                    except Exception:
                        pass
-                try:
+                        _hyg_runtime = _resolve_runtime_agent_kwargs()
-                    from run_agent import AIAgent
+                        if _hyg_runtime.get("api_key"):
                            _hyg_msgs = [
                                {"role": m.get("role"), "content": m.get("content")}
                                for m in history
                                if m.get("role") in ("user", "assistant")
                                and m.get("content")
                            ]
-                    _hyg_runtime = _resolve_runtime_agent_kwargs()
+                            if len(_hyg_msgs) >= 4:
-                    if _hyg_runtime.get("api_key"):
+                                _hyg_agent = AIAgent(
-                        _hyg_msgs = [
+                                    **_hyg_runtime,
-                            {"role": m.get("role"), "content": m.get("content")}
+                                    max_iterations=4,
-                            for m in history
+                                    quiet_mode=True,
-                            if m.get("role") in ("user", "assistant")
+                                    enabled_toolsets=["memory"],
-                            and m.get("content")
+                                    session_id=session_entry.session_id,
                        ]
                        if len(_hyg_msgs) >= 4:
                            _hyg_agent = AIAgent(
                                **_hyg_runtime,
                                max_iterations=4,
                                quiet_mode=True,
                                enabled_toolsets=["memory"],
                                session_id=session_entry.session_id,
                            )
                            loop = asyncio.get_event_loop()
                            _compressed, _ = await loop.run_in_executor(
                                None,
                                lambda: _hyg_agent._compress_context(
                                    _hyg_msgs, "",
                                    approx_tokens=_approx_tokens,
                                ),
                            )
                            self.session_store.rewrite_transcript(
                                session_entry.session_id, _compressed
                            )
                            history = _compressed
                            _new_count = len(_compressed)
                            _new_tokens = estimate_messages_tokens_rough(
                                _compressed
                            )
                            logger.info(
                                "Session hygiene: compressed %s → %s msgs, "
                                "~%s → ~%s tokens",
                                _msg_count, _new_count,
                                f"{_approx_tokens:,}", f"{_new_tokens:,}",
                            )
                            if _hyg_adapter:
                                try:
                                    await _hyg_adapter.send(
                                        source.chat_id,
                                        f"🗜️ Compressed: {_msg_count} → "
                                        f"{_new_count} messages, "
                                        f"~{_approx_tokens:,} → "
                                        f"~{_new_tokens:,} tokens"
                                    )
                                except Exception:
                                    pass
                            # Still too large after compression — warn user
                            if _new_tokens >= _warn_token_threshold:
                                logger.warning(
                                    "Session hygiene: still ~%s tokens after "
                                    "compression — suggesting /reset",
                                    f"{_new_tokens:,}",
                                )
                                loop = asyncio.get_event_loop()
                                _compressed, _ = await loop.run_in_executor(
                                    None,
                                    lambda: _hyg_agent._compress_context(
                                        _hyg_msgs, "",
                                        approx_tokens=_approx_tokens,
                                    ),
                                )
                                self.session_store.rewrite_transcript(
                                    session_entry.session_id, _compressed
                                )
                                history = _compressed
                                _new_count = len(_compressed)
                                _new_tokens = estimate_messages_tokens_rough(
                                    _compressed
                                )
                                logger.info(
                                    "Session hygiene: compressed %s → %s msgs, "
                                    "~%s → ~%s tokens",
                                    _msg_count, _new_count,
                                    f"{_approx_tokens:,}", f"{_new_tokens:,}",
                                )
                                if _hyg_adapter:
                                    try:
                                        await _hyg_adapter.send(
                                            source.chat_id,
-                                            "⚠️ Session is still very large "
+                                            f"🗜️ Compressed: {_msg_count} → "
-                                            "after compression "
+                                            f"{_new_count} messages, "
-                                            f"(~{_new_tokens:,} tokens). "
+                                            f"~{_approx_tokens:,} → "
-                                            "Consider using /reset to start "
+                                            f"~{_new_tokens:,} tokens"
                                            "fresh if you experience issues."
                                        )
                                    except Exception:
                                        pass
-                except Exception as e:
+                                # Still too large after compression — warn user
-                    logger.warning(
+                                if _new_tokens >= _warn_token_threshold:
-                        "Session hygiene auto-compress failed: %s", e
+                                    logger.warning(
-                    )
+                                        "Session hygiene: still ~%s tokens after "
-                    # Compression failed and session is dangerously large
+                                        "compression — suggesting /reset",
-                    if _approx_tokens >= _warn_token_threshold:
+                                        f"{_new_tokens:,}",
-                        _hyg_adapter = self.adapters.get(source.platform)
+                                    )
-                        if _hyg_adapter:
+                                    if _hyg_adapter:
-                            try:
+                                        try:
-                                await _hyg_adapter.send(
+                                            await _hyg_adapter.send(
-                                    source.chat_id,
+                                                source.chat_id,
-                                    f"⚠️ Session is very large "
+                                                "⚠️ Session is still very large "
-                                    f"({_msg_count} messages, "
+                                                "after compression "
-                                    f"~{_approx_tokens:,} tokens) and "
+                                                f"(~{_new_tokens:,} tokens). "
-                                    "auto-compression failed. Consider "
+                                                "Consider using /reset to start "
-                                    "using /compress or /reset to avoid "
+                                                "fresh if you experience issues."
-                                    "issues."
+                                            )
-                                )
+                                        except Exception:
-                            except Exception:
+                                            pass
-                                pass
+
                    except Exception as e:
                        logger.warning(
                            "Session hygiene auto-compress failed: %s", e
                        )
                        # Compression failed and session is dangerously large
                        if _approx_tokens >= _warn_token_threshold:
                            _hyg_adapter = self.adapters.get(source.platform)
                            if _hyg_adapter:
                                try:
                                    await _hyg_adapter.send(
                                        source.chat_id,
                                        f"⚠️ Session is very large "
                                        f"({_msg_count} messages, "
                                        f"~{_approx_tokens:,} tokens) and "
                                        "auto-compression failed. Consider "
                                        "using /compress or /reset to avoid "
                                        "issues."
                                    )
                                except Exception:
                                    pass
        # First-message onboarding -- only on the very first interaction ever
        if not history and not self.session_store.has_any_sessions():
--- a/tests/gateway/test_session_hygiene.py
+++ b/tests/gateway/test_session_hygiene.py
@ -2,6 +2,10 @@
 Verifies that the gateway detects pathologically large transcripts and
 triggers auto-compression before running the agent.  (#628)
 The hygiene system uses the SAME compression config as the agent:
  compression.threshold × model context length
 so CLI and messaging platforms behave identically.
 """
 import pytest
@ -38,75 +42,113 @@ def _make_large_history_tokens(target_tokens: int) -> list:
 # ---------------------------------------------------------------------------
-# Detection threshold tests
+# Detection threshold tests (model-aware, unified with compression config)
 # ---------------------------------------------------------------------------
 class TestSessionHygieneThresholds:
-    """Test that the threshold logic correctly identifies large sessions."""
+    """Test that the threshold logic correctly identifies large sessions.
    Thresholds are derived from model context length × compression threshold,
    matching what the agent's ContextCompressor uses.
    """
    def test_small_session_below_thresholds(self):
        """A 10-message session should not trigger compression."""
        history = _make_history(10)
        msg_count = len(history)
        approx_tokens = estimate_messages_tokens_rough(history)
-        compress_token_threshold = 100_000
+        # For a 200k-context model at 85% threshold = 170k
-        compress_msg_threshold = 200
+        context_length = 200_000
        threshold_pct = 0.85
        compress_token_threshold = int(context_length * threshold_pct)
-        needs_compress = (
+        needs_compress = approx_tokens >= compress_token_threshold
            approx_tokens >= compress_token_threshold
            or msg_count >= compress_msg_threshold
        )
        assert not needs_compress
    def test_large_message_count_triggers(self):
        """200+ messages should trigger compression even if tokens are low."""
        history = _make_history(250, content_size=10)
        msg_count = len(history)
        compress_msg_threshold = 200
        needs_compress = msg_count >= compress_msg_threshold
        assert needs_compress
    def test_large_token_count_triggers(self):
-        """High token count should trigger compression even if message count is low."""
+        """High token count should trigger compression when exceeding model threshold."""
-        # 50 messages with huge content to exceed 100K tokens
+        # Build a history that exceeds 85% of a 200k model (170k tokens)
-        history = _make_history(50, content_size=10_000)
+        history = _make_large_history_tokens(180_000)
        approx_tokens = estimate_messages_tokens_rough(history)
-        compress_token_threshold = 100_000
+        context_length = 200_000
        threshold_pct = 0.85
        compress_token_threshold = int(context_length * threshold_pct)
        needs_compress = approx_tokens >= compress_token_threshold
        assert needs_compress
-    def test_under_both_thresholds_no_trigger(self):
+    def test_under_threshold_no_trigger(self):
-        """Session under both thresholds should not trigger."""
+        """Session under threshold should not trigger, even with many messages."""
-        history = _make_history(100, content_size=100)
+        # 250 short messages — lots of messages but well under token threshold
-        msg_count = len(history)
+        history = _make_history(250, content_size=10)
        approx_tokens = estimate_messages_tokens_rough(history)
-        compress_token_threshold = 100_000
+        # 200k model at 85% = 170k token threshold
-        compress_msg_threshold = 200
+        context_length = 200_000
        threshold_pct = 0.85
        compress_token_threshold = int(context_length * threshold_pct)
-        needs_compress = (
+        needs_compress = approx_tokens >= compress_token_threshold
-            approx_tokens >= compress_token_threshold
+        assert not needs_compress, (
-            or msg_count >= compress_msg_threshold
+            f"250 short messages (~{approx_tokens} tokens) should NOT trigger "
            f"compression at {compress_token_threshold} token threshold"
        )
    def test_message_count_alone_does_not_trigger(self):
        """Message count alone should NOT trigger — only token count matters.
        The old system used an OR of token-count and message-count thresholds,
        which caused premature compression in tool-heavy sessions with 200+
        messages but low total tokens.
        """
        # 300 very short messages — old system would compress, new should not
        history = _make_history(300, content_size=10)
        approx_tokens = estimate_messages_tokens_rough(history)
        context_length = 200_000
        threshold_pct = 0.85
        compress_token_threshold = int(context_length * threshold_pct)
        # Token-based check only
        needs_compress = approx_tokens >= compress_token_threshold
        assert not needs_compress
-    def test_custom_thresholds(self):
+    def test_threshold_scales_with_model(self):
-        """Custom thresholds from config should be respected."""
+        """Different models should have different compression thresholds."""
-        history = _make_history(60, content_size=100)
+        # 128k model at 85% = 108,800 tokens
-        msg_count = len(history)
+        small_model_threshold = int(128_000 * 0.85)
        # 200k model at 85% = 170,000 tokens
        large_model_threshold = int(200_000 * 0.85)
        # 1M model at 85% = 850,000 tokens
        huge_model_threshold = int(1_000_000 * 0.85)
-        # Custom lower threshold
+        # A session at ~120k tokens:
-        compress_msg_threshold = 50
+        history = _make_large_history_tokens(120_000)
-        needs_compress = msg_count >= compress_msg_threshold
+        approx_tokens = estimate_messages_tokens_rough(history)
        assert needs_compress
-        # Custom higher threshold
+        # Should trigger for 128k model
-        compress_msg_threshold = 100
+        assert approx_tokens >= small_model_threshold
-        needs_compress = msg_count >= compress_msg_threshold
+        # Should NOT trigger for 200k model
-        assert not needs_compress
+        assert approx_tokens < large_model_threshold
        # Should NOT trigger for 1M model
        assert approx_tokens < huge_model_threshold
    def test_custom_threshold_percentage(self):
        """Custom threshold percentage from config should be respected."""
        context_length = 200_000
        # At 50% threshold = 100k
        low_threshold = int(context_length * 0.50)
        # At 90% threshold = 180k
        high_threshold = int(context_length * 0.90)
        history = _make_large_history_tokens(150_000)
        approx_tokens = estimate_messages_tokens_rough(history)
        # Should trigger at 50% but not at 90%
        assert approx_tokens >= low_threshold
        assert approx_tokens < high_threshold
    def test_minimum_message_guard(self):
        """Sessions with fewer than 4 messages should never trigger."""
@ -117,18 +159,19 @@ class TestSessionHygieneThresholds:
 class TestSessionHygieneWarnThreshold:
-    """Test the post-compression warning threshold."""
+    """Test the post-compression warning threshold (95% of context)."""
    def test_warn_when_still_large(self):
-        """If compressed result is still above warn_tokens, should warn."""
+        """If compressed result is still above 95% of context, should warn."""
-        # Simulate post-compression tokens
+        context_length = 200_000
-        warn_threshold = 200_000
+        warn_threshold = int(context_length * 0.95)  # 190k
-        post_compress_tokens = 250_000
+        post_compress_tokens = 195_000
        assert post_compress_tokens >= warn_threshold
    def test_no_warn_when_under(self):
-        """If compressed result is under warn_tokens, no warning."""
+        """If compressed result is under 95% of context, no warning."""
-        warn_threshold = 200_000
+        context_length = 200_000
        warn_threshold = int(context_length * 0.95)  # 190k
        post_compress_tokens = 150_000
        assert post_compress_tokens < warn_threshold
@ -150,10 +193,12 @@ class TestTokenEstimation:
        assert estimate_messages_tokens_rough(many) > estimate_messages_tokens_rough(few)
    def test_pathological_session_detected(self):
-        """The reported pathological case: 648 messages, ~299K tokens."""
+        """The reported pathological case: 648 messages, ~299K tokens.
-        # Simulate a 648-message session averaging ~460 tokens per message
+
        With a 200k model at 85% threshold (170k), this should trigger.
        """
        history = _make_history(648, content_size=1800)
        tokens = estimate_messages_tokens_rough(history)
-        # Should be well above the 100K default threshold
+        # Should be well above the 170K threshold for a 200k model
-        assert tokens > 100_000
+        threshold = int(200_000 * 0.85)
-        assert len(history) > 200
+        assert tokens > threshold