fix(gateway): hygiene compression ignores config context_length and 1.4x exceeds model limit
Three bugs in gateway session hygiene pre-compression caused 'Session too
large' errors for ~200K context models like GLM-5-turbo on z.ai:
1. Gateway hygiene called get_model_context_length(model) without passing
config_context_length, provider, or base_url — so user overrides like
model.context_length: 180000 were ignored, and provider-aware detection
(models.dev, z.ai endpoint) couldn't fire. The agent's own compressor
correctly passed all three (run_agent.py line 1038).
2. The 1.4x safety factor on rough token estimates pushed the compression
threshold above the model's actual context limit:
200K * 0.85 * 1.4 = 238K > 200K (model limit)
So hygiene never compressed, sessions grew past the limit, and the API
rejected the request.
3. Same issue for the warn threshold: 200K * 0.95 * 1.4 = 266K > 200K (model limit).
Fix:
- Read model.context_length, provider, and base_url from config.yaml
(same as run_agent.py does) and pass them to get_model_context_length()
- Resolve provider/base_url from runtime when not in config
- Cap the 1.4x-adjusted compress threshold at 95% of context_length
- Cap the 1.4x-adjusted warn threshold at context_length
Affects: z.ai GLM-5/GLM-5-turbo and any other ~200K or smaller context model.
Note that 0.85 * 1.4 = 1.19, so the uncapped estimated-token threshold was
above 100% of the context length regardless of model size.
Ref: Discord report from Ddox — glm-5-turbo on z.ai coding plan
This commit is contained in:
parent
ed805f57ff
commit
b2b4a9ee7d
2 changed files with 112 additions and 5 deletions
|
|
@ -1778,6 +1778,10 @@ class GatewayRunner:
|
||||||
_hyg_model = "anthropic/claude-sonnet-4.6"
|
_hyg_model = "anthropic/claude-sonnet-4.6"
|
||||||
_hyg_threshold_pct = 0.85
|
_hyg_threshold_pct = 0.85
|
||||||
_hyg_compression_enabled = True
|
_hyg_compression_enabled = True
|
||||||
|
_hyg_config_context_length = None
|
||||||
|
_hyg_provider = None
|
||||||
|
_hyg_base_url = None
|
||||||
|
_hyg_api_key = None
|
||||||
try:
|
try:
|
||||||
_hyg_cfg_path = _hermes_home / "config.yaml"
|
_hyg_cfg_path = _hermes_home / "config.yaml"
|
||||||
if _hyg_cfg_path.exists():
|
if _hyg_cfg_path.exists():
|
||||||
|
|
@ -1791,6 +1795,17 @@ class GatewayRunner:
|
||||||
_hyg_model = _model_cfg
|
_hyg_model = _model_cfg
|
||||||
elif isinstance(_model_cfg, dict):
|
elif isinstance(_model_cfg, dict):
|
||||||
_hyg_model = _model_cfg.get("default", _hyg_model)
|
_hyg_model = _model_cfg.get("default", _hyg_model)
|
||||||
|
# Read explicit context_length override from model config
|
||||||
|
# (same as run_agent.py lines 995-1005)
|
||||||
|
_raw_ctx = _model_cfg.get("context_length")
|
||||||
|
if _raw_ctx is not None:
|
||||||
|
try:
|
||||||
|
_hyg_config_context_length = int(_raw_ctx)
|
||||||
|
except (TypeError, ValueError):
|
||||||
|
pass
|
||||||
|
# Read provider for accurate context detection
|
||||||
|
_hyg_provider = _model_cfg.get("provider") or None
|
||||||
|
_hyg_base_url = _model_cfg.get("base_url") or None
|
||||||
|
|
||||||
# Read compression settings — only use enabled flag.
|
# Read compression settings — only use enabled flag.
|
||||||
# The threshold is intentionally separate from the agent's
|
# The threshold is intentionally separate from the agent's
|
||||||
|
|
@ -1800,11 +1815,27 @@ class GatewayRunner:
|
||||||
_hyg_compression_enabled = str(
|
_hyg_compression_enabled = str(
|
||||||
_comp_cfg.get("enabled", True)
|
_comp_cfg.get("enabled", True)
|
||||||
).lower() in ("true", "1", "yes")
|
).lower() in ("true", "1", "yes")
|
||||||
|
|
||||||
|
# Resolve provider/base_url from runtime if not in config
|
||||||
|
if not _hyg_provider or not _hyg_base_url:
|
||||||
|
try:
|
||||||
|
_hyg_runtime = _resolve_runtime_agent_kwargs()
|
||||||
|
_hyg_provider = _hyg_provider or _hyg_runtime.get("provider")
|
||||||
|
_hyg_base_url = _hyg_base_url or _hyg_runtime.get("base_url")
|
||||||
|
_hyg_api_key = _hyg_runtime.get("api_key")
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
except Exception:
|
except Exception:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
if _hyg_compression_enabled:
|
if _hyg_compression_enabled:
|
||||||
_hyg_context_length = get_model_context_length(_hyg_model)
|
_hyg_context_length = get_model_context_length(
|
||||||
|
_hyg_model,
|
||||||
|
base_url=_hyg_base_url or "",
|
||||||
|
api_key=_hyg_api_key or "",
|
||||||
|
config_context_length=_hyg_config_context_length,
|
||||||
|
provider=_hyg_provider or "",
|
||||||
|
)
|
||||||
_compress_token_threshold = int(
|
_compress_token_threshold = int(
|
||||||
_hyg_context_length * _hyg_threshold_pct
|
_hyg_context_length * _hyg_threshold_pct
|
||||||
)
|
)
|
||||||
|
|
@ -1822,11 +1853,20 @@ class GatewayRunner:
|
||||||
_token_source = "actual"
|
_token_source = "actual"
|
||||||
else:
|
else:
|
||||||
_approx_tokens = estimate_messages_tokens_rough(history)
|
_approx_tokens = estimate_messages_tokens_rough(history)
|
||||||
# Apply safety factor only for rough estimates
|
# Apply safety factor only for rough estimates.
|
||||||
_compress_token_threshold = int(
|
# Cap the adjusted threshold at 95% of context length
|
||||||
_compress_token_threshold * 1.4
|
# so it never exceeds what the model can actually handle
|
||||||
|
# (the 1.4x factor previously pushed the threshold above
|
||||||
|
# the model's context limit for ~200K models like GLM-5).
|
||||||
|
_max_safe_threshold = int(_hyg_context_length * 0.95)
|
||||||
|
_compress_token_threshold = min(
|
||||||
|
int(_compress_token_threshold * 1.4),
|
||||||
|
_max_safe_threshold,
|
||||||
|
)
|
||||||
|
_warn_token_threshold = min(
|
||||||
|
int(_warn_token_threshold * 1.4),
|
||||||
|
_hyg_context_length,
|
||||||
)
|
)
|
||||||
_warn_token_threshold = int(_warn_token_threshold * 1.4)
|
|
||||||
_token_source = "estimated"
|
_token_source = "estimated"
|
||||||
|
|
||||||
_needs_compress = _approx_tokens >= _compress_token_threshold
|
_needs_compress = _approx_tokens >= _compress_token_threshold
|
||||||
|
|
|
||||||
|
|
@ -212,6 +212,73 @@ class TestSessionHygieneWarnThreshold:
|
||||||
assert post_compress_tokens < warn_threshold
|
assert post_compress_tokens < warn_threshold
|
||||||
|
|
||||||
|
|
||||||
|
class TestEstimatedTokenSafetyCap:
|
||||||
|
"""Verify the 1.4x safety factor on rough estimates is capped at 95% of
|
||||||
|
context length, preventing the threshold from exceeding the model's
|
||||||
|
actual limit.
|
||||||
|
|
||||||
|
Bug: For ~200K models (GLM-5-turbo), the uncapped 1.4x pushed the
|
||||||
|
threshold to 238K — above the model's limit — so hygiene never fired.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def test_uncapped_14x_would_exceed_context(self):
|
||||||
|
"""Without the cap, 200K * 0.85 * 1.4 = 238K > 200K (broken)."""
|
||||||
|
context_length = 200_000
|
||||||
|
threshold_pct = 0.85
|
||||||
|
raw_threshold = int(context_length * threshold_pct) # 170K
|
||||||
|
uncapped = int(raw_threshold * 1.4) # 238K
|
||||||
|
assert uncapped > context_length, (
|
||||||
|
"Uncapped 1.4x should exceed model context (this is the bug)"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_capped_14x_stays_within_context(self):
|
||||||
|
"""With the cap, the threshold stays at 95% of context length."""
|
||||||
|
context_length = 200_000
|
||||||
|
threshold_pct = 0.85
|
||||||
|
raw_threshold = int(context_length * threshold_pct) # 170K
|
||||||
|
max_safe = int(context_length * 0.95) # 190K
|
||||||
|
capped = min(int(raw_threshold * 1.4), max_safe)
|
||||||
|
assert capped <= context_length, (
|
||||||
|
f"Capped threshold ({capped:,}) must not exceed context ({context_length:,})"
|
||||||
|
)
|
||||||
|
assert capped == max_safe, (
|
||||||
|
f"For 200K models, the cap should bind: expected {max_safe:,}, got {capped:,}"
|
||||||
|
)
|
||||||
|
|
||||||
|
def test_cap_does_not_affect_large_context_models(self):
|
||||||
|
"""For 1M models the cap still binds (0.85 * 1.4 = 1.19 > 0.95); the 950K result stays within context."""
|
||||||
|
context_length = 1_000_000
|
||||||
|
threshold_pct = 0.85
|
||||||
|
raw_threshold = int(context_length * threshold_pct) # 850K
|
||||||
|
max_safe = int(context_length * 0.95) # 950K
|
||||||
|
uncapped = int(raw_threshold * 1.4) # 1,190K — but that's > 950K
|
||||||
|
capped = min(uncapped, max_safe)
|
||||||
|
# For very large models the cap still applies but the resulting
|
||||||
|
# threshold (950K) is still large enough to prevent premature compression
|
||||||
|
assert capped <= context_length
|
||||||
|
|
||||||
|
def test_cap_for_128k_model(self):
|
||||||
|
"""128K model: 128K * 0.85 * 1.4 = 152K — exceeds 128K, cap binds."""
|
||||||
|
context_length = 128_000
|
||||||
|
threshold_pct = 0.85
|
||||||
|
raw_threshold = int(context_length * threshold_pct) # 108,800
|
||||||
|
max_safe = int(context_length * 0.95) # 121,600
|
||||||
|
uncapped = int(raw_threshold * 1.4) # 152,320
|
||||||
|
capped = min(uncapped, max_safe)
|
||||||
|
assert uncapped > context_length, "1.4x exceeds 128K context"
|
||||||
|
assert capped == max_safe, "Cap should bind for 128K models"
|
||||||
|
assert capped < context_length, "Capped value must be below context limit"
|
||||||
|
|
||||||
|
def test_warn_threshold_capped_at_context_length(self):
|
||||||
|
"""Warn threshold (0.95 * 1.4) must be capped at context_length."""
|
||||||
|
context_length = 200_000
|
||||||
|
raw_warn = int(context_length * 0.95) # 190K
|
||||||
|
uncapped_warn = int(raw_warn * 1.4) # 266K
|
||||||
|
capped_warn = min(uncapped_warn, context_length)
|
||||||
|
assert uncapped_warn > context_length
|
||||||
|
assert capped_warn == context_length
|
||||||
|
|
||||||
|
|
||||||
class TestTokenEstimation:
|
class TestTokenEstimation:
|
||||||
"""Verify rough token estimation works as expected for hygiene checks."""
|
"""Verify rough token estimation works as expected for hygiene checks."""
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue