From 67b94702075acb586c8666ce6741a47c62f552eb Mon Sep 17 00:00:00 2001
From: teknium1
Date: Tue, 10 Mar 2026 23:16:49 -0700
Subject: [PATCH] fix: reduce premature gateway compression on tool-heavy
 sessions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The gateway's session hygiene pre-check uses a rough char-based token
estimate (total_chars / 4) to decide whether to compress before the
agent starts. This significantly overestimates for tool-heavy and
code-heavy conversations because:

1. str(msg) on dicts includes Python repr overhead (keys, brackets, etc.)
2. Code/JSON tokenizes at 5-7+ chars/token, not the assumed 4

This caused users with 200k context to see compression trigger at
~100-113k actual tokens instead of the expected 170k (85% threshold).
Reported by TigerHix on Twitter.

Fix: apply a 1.4x safety factor to the gateway pre-check threshold.
This pre-check is only meant to catch pathologically large transcripts —
the agent's own compression uses actual API-reported token counts for
precise threshold management.
---
 gateway/run.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/gateway/run.py b/gateway/run.py
index d1a639b8..151ffad1 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -950,9 +950,12 @@ class GatewayRunner:
         # repeated truncation/context failures. Detect this early and
         # compress proactively — before the agent even starts. (#628)
         #
-        # Thresholds are derived from the SAME compression config the
-        # agent uses (compression.threshold × model context length) so
-        # CLI and messaging platforms behave identically.
+        # IMPORTANT: This pre-check uses a rough char-based estimate
+        # (~4 chars/token) which significantly overestimates for
+        # tool-heavy conversations (code/JSON tokenizes at 5-7+
+        # chars/token). To avoid premature compression, we apply a
+        # 1.4x safety factor — the agent's own compression uses actual
+        # API-reported token counts and handles precise thresholds.
         # -----------------------------------------------------------------
         if history and len(history) >= 4:
             from agent.model_metadata import (
@@ -1000,11 +1003,14 @@ class GatewayRunner:
 
             if _hyg_compression_enabled:
                 _hyg_context_length = get_model_context_length(_hyg_model)
+                # Apply 1.4x safety factor to account for rough estimate
+                # overestimation on tool-heavy / code-heavy conversations.
+                _ROUGH_ESTIMATE_SAFETY = 1.4
                 _compress_token_threshold = int(
-                    _hyg_context_length * _hyg_threshold_pct
+                    _hyg_context_length * _hyg_threshold_pct * _ROUGH_ESTIMATE_SAFETY
                 )
-                # Warn if still huge after compression (95% of context)
-                _warn_token_threshold = int(_hyg_context_length * 0.95)
+                # Warn if still huge after compression (95% of context, with same safety factor)
+                _warn_token_threshold = int(_hyg_context_length * 0.95 * _ROUGH_ESTIMATE_SAFETY)
 
             _msg_count = len(history)
             _approx_tokens = estimate_messages_tokens_rough(history)