fix: preflight context compression + error handler ordering for model switches

Two fixes for the case where a user switches to a model with a smaller
context window while having a large existing session:

1. Preflight compression in run_conversation(): Before the main loop,
   estimate tokens of loaded history + system prompt. If it exceeds the
   model's compression threshold (85% of context), compress proactively
   with up to 3 passes. This naturally handles model switches because
   the gateway creates a fresh AIAgent per message with the current
   model's context length.

2. Error handler reordering: Context-length errors (400 with 'maximum
   context length' etc.) are now checked BEFORE the generic 4xx handler.
   Previously, OpenRouter's 400-status context-length errors were caught
   as non-retryable client errors and aborted immediately, never reaching
   the compression+retry logic.

Reported by Sonicrida on Discord: 840-message session (2MB+) crashed
after switching from a large-context model to minimax via OpenRouter.
This commit is contained in:
teknium1 2026-03-04 14:42:41 -08:00
parent 093acd72dd
commit 8311e8984b
2 changed files with 245 additions and 34 deletions

View file

@ -2862,6 +2862,51 @@ class AIAgent:
active_system_prompt = self._cached_system_prompt
# ── Preflight context compression ──
# Before entering the main loop, check if the loaded conversation
# history already exceeds the model's context threshold. This handles
# cases where a user switches to a model with a smaller context window
# while having a large existing session — compress proactively rather
# than waiting for an API error (which might be caught as a non-retryable
# 4xx and abort the request entirely).
if (
self.compression_enabled
and len(messages) > self.context_compressor.protect_first_n
+ self.context_compressor.protect_last_n + 1
):
_sys_tok_est = estimate_tokens_rough(active_system_prompt or "")
_msg_tok_est = estimate_messages_tokens_rough(messages)
_preflight_tokens = _sys_tok_est + _msg_tok_est
if _preflight_tokens >= self.context_compressor.threshold_tokens:
logger.info(
"Preflight compression: ~%s tokens >= %s threshold (model %s, ctx %s)",
f"{_preflight_tokens:,}",
f"{self.context_compressor.threshold_tokens:,}",
self.model,
f"{self.context_compressor.context_length:,}",
)
if not self.quiet_mode:
print(
f"📦 Preflight compression: ~{_preflight_tokens:,} tokens "
f">= {self.context_compressor.threshold_tokens:,} threshold"
)
# May need multiple passes for very large sessions with small
# context windows (each pass summarises the middle N turns).
for _pass in range(3):
_orig_len = len(messages)
messages, active_system_prompt = self._compress_context(
messages, system_message, approx_tokens=_preflight_tokens
)
if len(messages) >= _orig_len:
break # Cannot compress further
# Re-estimate after compression
_sys_tok_est = estimate_tokens_rough(active_system_prompt or "")
_msg_tok_est = estimate_messages_tokens_rough(messages)
_preflight_tokens = _sys_tok_est + _msg_tok_est
if _preflight_tokens < self.context_compressor.threshold_tokens:
break # Under threshold
# Main conversation loop
api_call_count = 0
final_response = None
@ -3287,37 +3332,10 @@ class AIAgent:
"partial": True
}
# Check for non-retryable client errors (4xx HTTP status codes).
# These indicate a problem with the request itself (bad model ID,
# invalid API key, forbidden, etc.) and will never succeed on retry.
# Note: 413 is excluded — it's handled above via compression.
is_client_status_error = isinstance(status_code, int) and 400 <= status_code < 500 and status_code != 413
is_client_error = is_client_status_error or any(phrase in error_msg for phrase in [
'error code: 400', 'error code: 401', 'error code: 403',
'error code: 404', 'error code: 422',
'is not a valid model', 'invalid model', 'model not found',
'invalid api key', 'invalid_api_key', 'authentication',
'unauthorized', 'forbidden', 'not found',
])
if is_client_error:
self._dump_api_request_debug(
api_kwargs, reason="non_retryable_client_error", error=api_error,
)
print(f"{self.log_prefix}❌ Non-retryable client error detected. Aborting immediately.")
print(f"{self.log_prefix} 💡 This type of error won't be fixed by retrying.")
logging.error(f"{self.log_prefix}Non-retryable client error: {api_error}")
self._persist_session(messages, conversation_history)
return {
"final_response": None,
"messages": messages,
"api_calls": api_call_count,
"completed": False,
"failed": True,
"error": str(api_error),
}
# Check for non-retryable errors (context length exceeded)
# Check for context-length errors BEFORE generic 4xx handler.
# OpenRouter returns 400 (not 413) for "maximum context length"
# errors — if we let the generic 4xx handler catch those first,
# it aborts immediately instead of attempting compression+retry.
is_context_length_error = any(phrase in error_msg for phrase in [
'context length', 'maximum context', 'token limit',
'too many tokens', 'reduce the length', 'exceeds the limit',
@ -3348,6 +3366,37 @@ class AIAgent:
"error": f"Context length exceeded ({approx_tokens:,} tokens). Cannot compress further.",
"partial": True
}
# Check for non-retryable client errors (4xx HTTP status codes).
# These indicate a problem with the request itself (bad model ID,
# invalid API key, forbidden, etc.) and will never succeed on retry.
# Note: 413 and context-length errors are excluded — handled above
# via compression.
is_client_status_error = isinstance(status_code, int) and 400 <= status_code < 500 and status_code != 413
is_client_error = (is_client_status_error or any(phrase in error_msg for phrase in [
'error code: 400', 'error code: 401', 'error code: 403',
'error code: 404', 'error code: 422',
'is not a valid model', 'invalid model', 'model not found',
'invalid api key', 'invalid_api_key', 'authentication',
'unauthorized', 'forbidden', 'not found',
])) and not is_context_length_error
if is_client_error:
self._dump_api_request_debug(
api_kwargs, reason="non_retryable_client_error", error=api_error,
)
print(f"{self.log_prefix}❌ Non-retryable client error detected. Aborting immediately.")
print(f"{self.log_prefix} 💡 This type of error won't be fixed by retrying.")
logging.error(f"{self.log_prefix}Non-retryable client error: {api_error}")
self._persist_session(messages, conversation_history)
return {
"final_response": None,
"messages": messages,
"api_calls": api_call_count,
"completed": False,
"failed": True,
"error": str(api_error),
}
if retry_count >= max_retries:
print(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded. Giving up.")

View file

@ -1,7 +1,9 @@
"""Tests for 413 payload-too-large → compression retry logic in AIAgent.
"""Tests for payload/context-length → compression retry logic in AIAgent.
Verifies that HTTP 413 errors trigger history compression and retry,
rather than being treated as non-retryable generic 4xx errors.
Verifies that:
- HTTP 413 errors trigger history compression and retry
- HTTP 400 context-length errors trigger compression (not generic 4xx abort)
- Preflight compression proactively compresses oversized sessions before API calls
"""
import uuid
@ -164,6 +166,74 @@ class TestHTTP413Compression:
mock_compress.assert_called_once()
assert result["completed"] is True
def test_400_context_length_triggers_compression(self, agent):
    """A 400 with 'maximum context length' should trigger compression, not abort as generic 4xx.

    OpenRouter returns HTTP 400 (not 413) for context-length errors. Before
    the fix, this was caught by the generic 4xx handler which aborted
    immediately; now it correctly triggers compression+retry.
    """
    # Simulated OpenRouter failure: HTTP 400 whose message mentions
    # "maximum context length" instead of arriving as a 413 status.
    err_400 = Exception(
        "Error code: 400 - {'error': {'message': "
        "\"This endpoint's maximum context length is 204800 tokens. "
        "However, you requested about 270460 tokens.\", 'code': 400}}"
    )
    err_400.status_code = 400
    ok_resp = _mock_response(content="Recovered after compression", finish_reason="stop")
    # First API call fails with the 400; the post-compression retry succeeds.
    agent.client.chat.completions.create.side_effect = [err_400, ok_resp]
    prefill = [
        {"role": "user", "content": "previous question"},
        {"role": "assistant", "content": "previous answer"},
    ]
    with (
        patch.object(agent, "_compress_context") as mock_compress,
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        mock_compress.return_value = (
            [{"role": "user", "content": "hello"}],
            "compressed prompt",
        )
        result = agent.run_conversation("hello", conversation_history=prefill)
    mock_compress.assert_called_once()
    # Must NOT have "failed": True (which would mean the generic 4xx handler caught it)
    assert result.get("failed") is not True
    assert result["completed"] is True
    assert result["final_response"] == "Recovered after compression"
def test_400_reduce_length_triggers_compression(self, agent):
    """An HTTP 400 carrying a 'reduce the length' message must route to compression."""
    reduce_len_err = Exception(
        "Error code: 400 - Please reduce the length of the messages"
    )
    reduce_len_err.status_code = 400
    success_resp = _mock_response(content="OK", finish_reason="stop")
    # Fail once with the 400, then succeed on the compressed retry.
    agent.client.chat.completions.create.side_effect = [reduce_len_err, success_resp]
    seeded_history = [
        {"role": "user", "content": "previous question"},
        {"role": "assistant", "content": "previous answer"},
    ]
    with (
        patch.object(agent, "_compress_context") as compress_mock,
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        compress_mock.return_value = (
            [{"role": "user", "content": "hello"}],
            "compressed",
        )
        result = agent.run_conversation("hello", conversation_history=seeded_history)
    compress_mock.assert_called_once()
    assert result["completed"] is True
def test_413_cannot_compress_further(self, agent):
"""When compression can't reduce messages, return partial result."""
err_413 = _make_413_error()
@ -185,3 +255,95 @@ class TestHTTP413Compression:
assert result["completed"] is False
assert result.get("partial") is True
assert "413" in result["error"]
class TestPreflightCompression:
"""Preflight compression should compress history before the first API call."""
def test_preflight_compresses_oversized_history(self, agent):
    """When loaded history exceeds the model's context threshold, compress before API call."""
    agent.compression_enabled = True
    # Shrink the context window so the prefilled history is guaranteed oversized.
    agent.context_compressor.context_length = 100
    agent.context_compressor.threshold_tokens = 85  # 85% of 100
    # ~20 chars per message ≈ 5 tokens; 40 messages ≈ 100 tokens > 85 threshold.
    big_history = [
        msg
        for i in range(20)
        for msg in (
            {"role": "user", "content": f"Message number {i} with some extra text padding"},
            {"role": "assistant", "content": f"Response number {i} with extra padding here"},
        )
    ]
    reply = _mock_response(content="After preflight", finish_reason="stop")
    agent.client.chat.completions.create.side_effect = [reply]
    with (
        patch.object(agent, "_compress_context") as compress_mock,
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        # Simulate compression reducing messages
        compress_mock.return_value = (
            [
                {"role": "user", "content": "[CONTEXT SUMMARY]: Previous conversation"},
                {"role": "user", "content": "hello"},
            ],
            "new system prompt",
        )
        result = agent.run_conversation("hello", conversation_history=big_history)
    # Preflight compression should have been called BEFORE the API call
    compress_mock.assert_called_once()
    assert result["completed"] is True
    assert result["final_response"] == "After preflight"
def test_no_preflight_when_under_threshold(self, agent):
    """When history fits within context, no preflight compression needed."""
    agent.compression_enabled = True
    # A huge context window means the tiny history is nowhere near the threshold.
    agent.context_compressor.context_length = 1000000
    agent.context_compressor.threshold_tokens = 850000
    tiny_history = [
        {"role": "user", "content": "hi"},
        {"role": "assistant", "content": "hello"},
    ]
    reply = _mock_response(content="No compression needed", finish_reason="stop")
    agent.client.chat.completions.create.side_effect = [reply]
    with (
        patch.object(agent, "_compress_context") as compress_mock,
        patch.object(agent, "_persist_session"),
        patch.object(agent, "_save_trajectory"),
        patch.object(agent, "_cleanup_task_resources"),
    ):
        result = agent.run_conversation("hello", conversation_history=tiny_history)
    compress_mock.assert_not_called()
    assert result["completed"] is True
def test_no_preflight_when_compression_disabled(self, agent):
"""Preflight should not run when compression is disabled."""
# Deliberately oversized setup: tiny context window plus a large history.
# If the compression_enabled flag is respected, none of that matters.
agent.compression_enabled = False
agent.context_compressor.context_length = 100
agent.context_compressor.threshold_tokens = 85
big_history = [
{"role": "user", "content": "x" * 1000},
{"role": "assistant", "content": "y" * 1000},
] * 10
ok_resp = _mock_response(content="OK", finish_reason="stop")
agent.client.chat.completions.create.side_effect = [ok_resp]
with (
patch.object(agent, "_compress_context") as mock_compress,
patch.object(agent, "_persist_session"),
patch.object(agent, "_save_trajectory"),
patch.object(agent, "_cleanup_task_resources"),
):
result = agent.run_conversation("hello", conversation_history=big_history)
# The disabled flag must short-circuit preflight entirely.
mock_compress.assert_not_called()