From 67b94702075acb586c8666ce6741a47c62f552eb Mon Sep 17 00:00:00 2001
From: teknium1
Date: Tue, 10 Mar 2026 23:16:49 -0700
Subject: [PATCH] fix: reduce premature gateway compression on tool-heavy
 sessions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The gateway's session hygiene pre-check uses a rough char-based token
estimate (total_chars / 4) to decide whether to compress before the
agent starts. This significantly overestimates for tool-heavy and
code-heavy conversations because:

1. str(msg) on dicts includes Python repr overhead (keys, brackets, etc.)
2. Code/JSON tokenizes at 5-7+ chars/token, not the assumed 4

This caused users with 200k context to see compression trigger at
~100-113k actual tokens instead of the expected 170k (85% threshold).
Reported by TigerHix on Twitter.

Fix: apply a 1.4x safety factor to the gateway pre-check threshold.
This pre-check is only meant to catch pathologically large transcripts —
the agent's own compression uses actual API-reported token counts for
precise threshold management.
---
 gateway/run.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/gateway/run.py b/gateway/run.py
index d1a639b8..151ffad1 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -950,9 +950,12 @@ class GatewayRunner:
         # repeated truncation/context failures. Detect this early and
         # compress proactively — before the agent even starts. (#628)
         #
-        # Thresholds are derived from the SAME compression config the
-        # agent uses (compression.threshold × model context length) so
-        # CLI and messaging platforms behave identically.
+        # IMPORTANT: This pre-check uses a rough char-based estimate
+        # (~4 chars/token) which significantly overestimates for
+        # tool-heavy conversations (code/JSON tokenizes at 5-7+
+        # chars/token). To avoid premature compression, we apply a
+        # 1.4x safety factor — the agent's own compression uses actual
+        # API-reported token counts and handles precise thresholds.
         # -----------------------------------------------------------------
         if history and len(history) >= 4:
             from agent.model_metadata import (
@@ -1000,11 +1003,14 @@ class GatewayRunner:
 
             if _hyg_compression_enabled:
                 _hyg_context_length = get_model_context_length(_hyg_model)
+                # Apply 1.4x safety factor to account for rough estimate
+                # overestimation on tool-heavy / code-heavy conversations.
+                _ROUGH_ESTIMATE_SAFETY = 1.4
                 _compress_token_threshold = int(
-                    _hyg_context_length * _hyg_threshold_pct
+                    _hyg_context_length * _hyg_threshold_pct * _ROUGH_ESTIMATE_SAFETY
                 )
-                # Warn if still huge after compression (95% of context)
-                _warn_token_threshold = int(_hyg_context_length * 0.95)
+                # Warn if still huge after compression (95% of context, with same safety factor)
+                _warn_token_threshold = int(_hyg_context_length * 0.95 * _ROUGH_ESTIMATE_SAFETY)
 
             _msg_count = len(history)
             _approx_tokens = estimate_messages_tokens_rough(history)