fix: unify gateway session hygiene with agent compression config

The gateway had a SEPARATE compression system ('session hygiene')
with hardcoded thresholds (100k tokens / 200 messages) that were
completely disconnected from the model's context length and the
user's compression config in config.yaml. This caused premature
auto-compression on Telegram/Discord — triggering at ~60k tokens
(from the 200-message threshold) or at inconsistent token counts.

Changes:
- Gateway hygiene now reads model name from config.yaml and uses
  get_model_context_length() to derive the actual context limit
- Compression threshold comes from compression.threshold in
  config.yaml (default 0.85), same as the agent's ContextCompressor
- Removed the message-count-based trigger (was redundant and caused
  false positives in tool-heavy sessions)
- Removed the undocumented session_hygiene config section — the
  standard compression.* config now controls everything
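- Env var overrides (CONTEXT_COMPRESSION_THRESHOLD,
  CONTEXT_COMPRESSION_ENABLED) are respected and applied after
  config.yaml: the threshold env var replaces the configured value,
  and CONTEXT_COMPRESSION_ENABLED=false/0/no disables hygiene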
- Warn threshold is now 95% of model context (was hardcoded 200k)
- Updated tests to verify model-aware thresholds, scaling across
  models, and that message count alone no longer triggers compression

For claude-opus-4.6 (200k context) at 85% threshold: gateway
hygiene now triggers at 170k tokens instead of the old 100k.
This commit is contained in:
teknium1 2026-03-08 20:08:02 -07:00
parent 3ffaac00dd
commit 67275641f8
2 changed files with 253 additions and 180 deletions

View file

@@ -900,159 +900,187 @@ class GatewayRunner:
# every new message rehydrates an oversized transcript, causing # every new message rehydrates an oversized transcript, causing
# repeated truncation/context failures. Detect this early and # repeated truncation/context failures. Detect this early and
# compress proactively — before the agent even starts. (#628) # compress proactively — before the agent even starts. (#628)
#
# Thresholds are derived from the SAME compression config the
# agent uses (compression.threshold × model context length) so
# CLI and messaging platforms behave identically.
# ----------------------------------------------------------------- # -----------------------------------------------------------------
if history and len(history) >= 4: if history and len(history) >= 4:
from agent.model_metadata import estimate_messages_tokens_rough from agent.model_metadata import (
estimate_messages_tokens_rough,
get_model_context_length,
)
# Read thresholds from config.yaml → session_hygiene section # Read model + compression config from config.yaml — same
_hygiene_cfg = {} # source of truth the agent itself uses.
_hyg_model = "anthropic/claude-sonnet-4.6"
_hyg_threshold_pct = 0.85
_hyg_compression_enabled = True
try: try:
_hyg_cfg_path = _hermes_home / "config.yaml" _hyg_cfg_path = _hermes_home / "config.yaml"
if _hyg_cfg_path.exists(): if _hyg_cfg_path.exists():
import yaml as _hyg_yaml import yaml as _hyg_yaml
with open(_hyg_cfg_path) as _hyg_f: with open(_hyg_cfg_path) as _hyg_f:
_hyg_data = _hyg_yaml.safe_load(_hyg_f) or {} _hyg_data = _hyg_yaml.safe_load(_hyg_f) or {}
_hygiene_cfg = _hyg_data.get("session_hygiene", {})
if not isinstance(_hygiene_cfg, dict): # Resolve model name (same logic as run_sync)
_hygiene_cfg = {} _model_cfg = _hyg_data.get("model", {})
if isinstance(_model_cfg, str):
_hyg_model = _model_cfg
elif isinstance(_model_cfg, dict):
_hyg_model = _model_cfg.get("default", _hyg_model)
# Read compression settings
_comp_cfg = _hyg_data.get("compression", {})
if isinstance(_comp_cfg, dict):
_hyg_threshold_pct = float(
_comp_cfg.get("threshold", _hyg_threshold_pct)
)
_hyg_compression_enabled = str(
_comp_cfg.get("enabled", True)
).lower() in ("true", "1", "yes")
except Exception: except Exception:
pass pass
_compress_token_threshold = int( # Also check env overrides (same as run_agent.py)
_hygiene_cfg.get("auto_compress_tokens", 100_000) _hyg_threshold_pct = float(
) os.getenv("CONTEXT_COMPRESSION_THRESHOLD", str(_hyg_threshold_pct))
_compress_msg_threshold = int(
_hygiene_cfg.get("auto_compress_messages", 200)
)
_warn_token_threshold = int(
_hygiene_cfg.get("warn_tokens", 200_000)
) )
if os.getenv("CONTEXT_COMPRESSION_ENABLED", "").lower() in ("false", "0", "no"):
_hyg_compression_enabled = False
_msg_count = len(history) if _hyg_compression_enabled:
_approx_tokens = estimate_messages_tokens_rough(history) _hyg_context_length = get_model_context_length(_hyg_model)
_compress_token_threshold = int(
_needs_compress = ( _hyg_context_length * _hyg_threshold_pct
_approx_tokens >= _compress_token_threshold
or _msg_count >= _compress_msg_threshold
)
if _needs_compress:
logger.info(
"Session hygiene: %s messages, ~%s tokens — auto-compressing "
"(thresholds: %s msgs / %s tokens)",
_msg_count, f"{_approx_tokens:,}",
_compress_msg_threshold, f"{_compress_token_threshold:,}",
) )
# Warn if still huge after compression (95% of context)
_warn_token_threshold = int(_hyg_context_length * 0.95)
_msg_count = len(history)
_approx_tokens = estimate_messages_tokens_rough(history)
_needs_compress = _approx_tokens >= _compress_token_threshold
if _needs_compress:
logger.info(
"Session hygiene: %s messages, ~%s tokens — auto-compressing "
"(threshold: %s%% of %s = %s tokens)",
_msg_count, f"{_approx_tokens:,}",
int(_hyg_threshold_pct * 100),
f"{_hyg_context_length:,}",
f"{_compress_token_threshold:,}",
)
_hyg_adapter = self.adapters.get(source.platform)
if _hyg_adapter:
try:
await _hyg_adapter.send(
source.chat_id,
f"🗜️ Session is large ({_msg_count} messages, "
f"~{_approx_tokens:,} tokens). Auto-compressing..."
)
except Exception:
pass
_hyg_adapter = self.adapters.get(source.platform)
if _hyg_adapter:
try: try:
await _hyg_adapter.send( from run_agent import AIAgent
source.chat_id,
f"🗜️ Session is large ({_msg_count} messages, "
f"~{_approx_tokens:,} tokens). Auto-compressing..."
)
except Exception:
pass
try: _hyg_runtime = _resolve_runtime_agent_kwargs()
from run_agent import AIAgent if _hyg_runtime.get("api_key"):
_hyg_msgs = [
{"role": m.get("role"), "content": m.get("content")}
for m in history
if m.get("role") in ("user", "assistant")
and m.get("content")
]
_hyg_runtime = _resolve_runtime_agent_kwargs() if len(_hyg_msgs) >= 4:
if _hyg_runtime.get("api_key"): _hyg_agent = AIAgent(
_hyg_msgs = [ **_hyg_runtime,
{"role": m.get("role"), "content": m.get("content")} max_iterations=4,
for m in history quiet_mode=True,
if m.get("role") in ("user", "assistant") enabled_toolsets=["memory"],
and m.get("content") session_id=session_entry.session_id,
]
if len(_hyg_msgs) >= 4:
_hyg_agent = AIAgent(
**_hyg_runtime,
max_iterations=4,
quiet_mode=True,
enabled_toolsets=["memory"],
session_id=session_entry.session_id,
)
loop = asyncio.get_event_loop()
_compressed, _ = await loop.run_in_executor(
None,
lambda: _hyg_agent._compress_context(
_hyg_msgs, "",
approx_tokens=_approx_tokens,
),
)
self.session_store.rewrite_transcript(
session_entry.session_id, _compressed
)
history = _compressed
_new_count = len(_compressed)
_new_tokens = estimate_messages_tokens_rough(
_compressed
)
logger.info(
"Session hygiene: compressed %s%s msgs, "
"~%s → ~%s tokens",
_msg_count, _new_count,
f"{_approx_tokens:,}", f"{_new_tokens:,}",
)
if _hyg_adapter:
try:
await _hyg_adapter.send(
source.chat_id,
f"🗜️ Compressed: {_msg_count}"
f"{_new_count} messages, "
f"~{_approx_tokens:,}"
f"~{_new_tokens:,} tokens"
)
except Exception:
pass
# Still too large after compression — warn user
if _new_tokens >= _warn_token_threshold:
logger.warning(
"Session hygiene: still ~%s tokens after "
"compression — suggesting /reset",
f"{_new_tokens:,}",
) )
loop = asyncio.get_event_loop()
_compressed, _ = await loop.run_in_executor(
None,
lambda: _hyg_agent._compress_context(
_hyg_msgs, "",
approx_tokens=_approx_tokens,
),
)
self.session_store.rewrite_transcript(
session_entry.session_id, _compressed
)
history = _compressed
_new_count = len(_compressed)
_new_tokens = estimate_messages_tokens_rough(
_compressed
)
logger.info(
"Session hygiene: compressed %s%s msgs, "
"~%s → ~%s tokens",
_msg_count, _new_count,
f"{_approx_tokens:,}", f"{_new_tokens:,}",
)
if _hyg_adapter: if _hyg_adapter:
try: try:
await _hyg_adapter.send( await _hyg_adapter.send(
source.chat_id, source.chat_id,
"⚠️ Session is still very large " f"🗜️ Compressed: {_msg_count}"
"after compression " f"{_new_count} messages, "
f"(~{_new_tokens:,} tokens). " f"~{_approx_tokens:,}"
"Consider using /reset to start " f"~{_new_tokens:,} tokens"
"fresh if you experience issues."
) )
except Exception: except Exception:
pass pass
except Exception as e: # Still too large after compression — warn user
logger.warning( if _new_tokens >= _warn_token_threshold:
"Session hygiene auto-compress failed: %s", e logger.warning(
) "Session hygiene: still ~%s tokens after "
# Compression failed and session is dangerously large "compression — suggesting /reset",
if _approx_tokens >= _warn_token_threshold: f"{_new_tokens:,}",
_hyg_adapter = self.adapters.get(source.platform) )
if _hyg_adapter: if _hyg_adapter:
try: try:
await _hyg_adapter.send( await _hyg_adapter.send(
source.chat_id, source.chat_id,
f"⚠️ Session is very large " "⚠️ Session is still very large "
f"({_msg_count} messages, " "after compression "
f"~{_approx_tokens:,} tokens) and " f"(~{_new_tokens:,} tokens). "
"auto-compression failed. Consider " "Consider using /reset to start "
"using /compress or /reset to avoid " "fresh if you experience issues."
"issues." )
) except Exception:
except Exception: pass
pass
except Exception as e:
logger.warning(
"Session hygiene auto-compress failed: %s", e
)
# Compression failed and session is dangerously large
if _approx_tokens >= _warn_token_threshold:
_hyg_adapter = self.adapters.get(source.platform)
if _hyg_adapter:
try:
await _hyg_adapter.send(
source.chat_id,
f"⚠️ Session is very large "
f"({_msg_count} messages, "
f"~{_approx_tokens:,} tokens) and "
"auto-compression failed. Consider "
"using /compress or /reset to avoid "
"issues."
)
except Exception:
pass
# First-message onboarding -- only on the very first interaction ever # First-message onboarding -- only on the very first interaction ever
if not history and not self.session_store.has_any_sessions(): if not history and not self.session_store.has_any_sessions():

View file

@@ -2,6 +2,10 @@
Verifies that the gateway detects pathologically large transcripts and Verifies that the gateway detects pathologically large transcripts and
triggers auto-compression before running the agent. (#628) triggers auto-compression before running the agent. (#628)
The hygiene system uses the SAME compression config as the agent:
compression.threshold × model context length
so CLI and messaging platforms behave identically.
""" """
import pytest import pytest
@@ -38,75 +42,113 @@ def _make_large_history_tokens(target_tokens: int) -> list:
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Detection threshold tests # Detection threshold tests (model-aware, unified with compression config)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
class TestSessionHygieneThresholds: class TestSessionHygieneThresholds:
"""Test that the threshold logic correctly identifies large sessions.""" """Test that the threshold logic correctly identifies large sessions.
Thresholds are derived from model context length × compression threshold,
matching what the agent's ContextCompressor uses.
"""
def test_small_session_below_thresholds(self): def test_small_session_below_thresholds(self):
"""A 10-message session should not trigger compression.""" """A 10-message session should not trigger compression."""
history = _make_history(10) history = _make_history(10)
msg_count = len(history)
approx_tokens = estimate_messages_tokens_rough(history) approx_tokens = estimate_messages_tokens_rough(history)
compress_token_threshold = 100_000 # For a 200k-context model at 85% threshold = 170k
compress_msg_threshold = 200 context_length = 200_000
threshold_pct = 0.85
compress_token_threshold = int(context_length * threshold_pct)
needs_compress = ( needs_compress = approx_tokens >= compress_token_threshold
approx_tokens >= compress_token_threshold
or msg_count >= compress_msg_threshold
)
assert not needs_compress assert not needs_compress
def test_large_message_count_triggers(self):
"""200+ messages should trigger compression even if tokens are low."""
history = _make_history(250, content_size=10)
msg_count = len(history)
compress_msg_threshold = 200
needs_compress = msg_count >= compress_msg_threshold
assert needs_compress
def test_large_token_count_triggers(self): def test_large_token_count_triggers(self):
"""High token count should trigger compression even if message count is low.""" """High token count should trigger compression when exceeding model threshold."""
# 50 messages with huge content to exceed 100K tokens # Build a history that exceeds 85% of a 200k model (170k tokens)
history = _make_history(50, content_size=10_000) history = _make_large_history_tokens(180_000)
approx_tokens = estimate_messages_tokens_rough(history) approx_tokens = estimate_messages_tokens_rough(history)
compress_token_threshold = 100_000 context_length = 200_000
threshold_pct = 0.85
compress_token_threshold = int(context_length * threshold_pct)
needs_compress = approx_tokens >= compress_token_threshold needs_compress = approx_tokens >= compress_token_threshold
assert needs_compress assert needs_compress
def test_under_both_thresholds_no_trigger(self): def test_under_threshold_no_trigger(self):
"""Session under both thresholds should not trigger.""" """Session under threshold should not trigger, even with many messages."""
history = _make_history(100, content_size=100) # 250 short messages — lots of messages but well under token threshold
msg_count = len(history) history = _make_history(250, content_size=10)
approx_tokens = estimate_messages_tokens_rough(history) approx_tokens = estimate_messages_tokens_rough(history)
compress_token_threshold = 100_000 # 200k model at 85% = 170k token threshold
compress_msg_threshold = 200 context_length = 200_000
threshold_pct = 0.85
compress_token_threshold = int(context_length * threshold_pct)
needs_compress = ( needs_compress = approx_tokens >= compress_token_threshold
approx_tokens >= compress_token_threshold assert not needs_compress, (
or msg_count >= compress_msg_threshold f"250 short messages (~{approx_tokens} tokens) should NOT trigger "
f"compression at {compress_token_threshold} token threshold"
) )
def test_message_count_alone_does_not_trigger(self):
"""Message count alone should NOT trigger — only token count matters.
The old system used an OR of token-count and message-count thresholds,
which caused premature compression in tool-heavy sessions with 200+
messages but low total tokens.
"""
# 300 very short messages — old system would compress, new should not
history = _make_history(300, content_size=10)
approx_tokens = estimate_messages_tokens_rough(history)
context_length = 200_000
threshold_pct = 0.85
compress_token_threshold = int(context_length * threshold_pct)
# Token-based check only
needs_compress = approx_tokens >= compress_token_threshold
assert not needs_compress assert not needs_compress
def test_custom_thresholds(self): def test_threshold_scales_with_model(self):
"""Custom thresholds from config should be respected.""" """Different models should have different compression thresholds."""
history = _make_history(60, content_size=100) # 128k model at 85% = 108,800 tokens
msg_count = len(history) small_model_threshold = int(128_000 * 0.85)
# 200k model at 85% = 170,000 tokens
large_model_threshold = int(200_000 * 0.85)
# 1M model at 85% = 850,000 tokens
huge_model_threshold = int(1_000_000 * 0.85)
# Custom lower threshold # A session at ~120k tokens:
compress_msg_threshold = 50 history = _make_large_history_tokens(120_000)
needs_compress = msg_count >= compress_msg_threshold approx_tokens = estimate_messages_tokens_rough(history)
assert needs_compress
# Custom higher threshold # Should trigger for 128k model
compress_msg_threshold = 100 assert approx_tokens >= small_model_threshold
needs_compress = msg_count >= compress_msg_threshold # Should NOT trigger for 200k model
assert not needs_compress assert approx_tokens < large_model_threshold
# Should NOT trigger for 1M model
assert approx_tokens < huge_model_threshold
def test_custom_threshold_percentage(self):
"""Custom threshold percentage from config should be respected."""
context_length = 200_000
# At 50% threshold = 100k
low_threshold = int(context_length * 0.50)
# At 90% threshold = 180k
high_threshold = int(context_length * 0.90)
history = _make_large_history_tokens(150_000)
approx_tokens = estimate_messages_tokens_rough(history)
# Should trigger at 50% but not at 90%
assert approx_tokens >= low_threshold
assert approx_tokens < high_threshold
def test_minimum_message_guard(self): def test_minimum_message_guard(self):
"""Sessions with fewer than 4 messages should never trigger.""" """Sessions with fewer than 4 messages should never trigger."""
@@ -117,18 +159,19 @@ class TestSessionHygieneThresholds:
class TestSessionHygieneWarnThreshold: class TestSessionHygieneWarnThreshold:
"""Test the post-compression warning threshold.""" """Test the post-compression warning threshold (95% of context)."""
def test_warn_when_still_large(self): def test_warn_when_still_large(self):
"""If compressed result is still above warn_tokens, should warn.""" """If compressed result is still above 95% of context, should warn."""
# Simulate post-compression tokens context_length = 200_000
warn_threshold = 200_000 warn_threshold = int(context_length * 0.95) # 190k
post_compress_tokens = 250_000 post_compress_tokens = 195_000
assert post_compress_tokens >= warn_threshold assert post_compress_tokens >= warn_threshold
def test_no_warn_when_under(self): def test_no_warn_when_under(self):
"""If compressed result is under warn_tokens, no warning.""" """If compressed result is under 95% of context, no warning."""
warn_threshold = 200_000 context_length = 200_000
warn_threshold = int(context_length * 0.95) # 190k
post_compress_tokens = 150_000 post_compress_tokens = 150_000
assert post_compress_tokens < warn_threshold assert post_compress_tokens < warn_threshold
@@ -150,10 +193,12 @@ class TestTokenEstimation:
assert estimate_messages_tokens_rough(many) > estimate_messages_tokens_rough(few) assert estimate_messages_tokens_rough(many) > estimate_messages_tokens_rough(few)
def test_pathological_session_detected(self): def test_pathological_session_detected(self):
"""The reported pathological case: 648 messages, ~299K tokens.""" """The reported pathological case: 648 messages, ~299K tokens.
# Simulate a 648-message session averaging ~460 tokens per message
With a 200k model at 85% threshold (170k), this should trigger.
"""
history = _make_history(648, content_size=1800) history = _make_history(648, content_size=1800)
tokens = estimate_messages_tokens_rough(history) tokens = estimate_messages_tokens_rough(history)
# Should be well above the 100K default threshold # Should be well above the 170K threshold for a 200k model
assert tokens > 100_000 threshold = int(200_000 * 0.85)
assert len(history) > 200 assert tokens > threshold