fix: proactive compression after large tool results + Anthropic error detection

Two fixes for context overflow handling: 1. Proactive compression after tool execution: The compression check now estimates the next prompt size using real token counts from the last API response (prompt_tokens + completion_tokens) plus a conservative estimate of newly appended tool results (chars // 3 for JSON-heavy content). Previously, should_compress() only checked last_prompt_tokens which didn't account for tool results — so a 130k prompt + 100k chars of tool output would pass the 140k threshold check but fail the 200k API limit. 2. Safety net: Added 'prompt is too long' to context-length error detection phrases. Anthropic returns 'prompt is too long: N tokens > M maximum' on HTTP 400, which wasn't matched by existing phrases. This ensures compression fires even if the proactive check underestimates. Fixes #813
2026-03-11 08:04:52 -07:00 · 2026-03-11 08:04:52 -07:00 · a54405e339
commit a54405e339
parent efb780c754
2 changed files with 86 additions and 1 deletions
--- a/run_agent.py
+++ b/run_agent.py
@ -3872,6 +3872,7 @@ class AIAgent:
                        'token limit', 'too many tokens', 'reduce the length',
                        'exceeds the limit', 'context window',
                        'request entity too large',  # OpenRouter/Nous 413 safety net
+                        'prompt is too long',  # Anthropic: "prompt is too long: N tokens > M maximum"
                    ])
                    
                    if is_context_length_error:
@ -4256,6 +4257,7 @@ class AIAgent:
                    
                    messages.append(assistant_msg)
                    
+                    _msg_count_before_tools = len(messages)
                    self._execute_tool_calls(assistant_message, messages, effective_task_id, api_call_count)

                    # Refund the iteration if the ONLY tool(s) called were
@ -4265,7 +4267,20 @@ class AIAgent:
                    if _tc_names == {"execute_code"}:
                        self.iteration_budget.refund()
                    
-                    if self.compression_enabled and self.context_compressor.should_compress():
+                    # Estimate next prompt size using real token counts from the
+                    # last API response + rough estimate of newly appended tool
+                    # results.  This catches cases where tool results push the
+                    # context past the limit that last_prompt_tokens alone misses
+                    # (e.g. large file reads, web extractions).
+                    _compressor = self.context_compressor
+                    _new_tool_msgs = messages[_msg_count_before_tools:]
+                    _new_chars = sum(len(str(m.get("content", "") or "")) for m in _new_tool_msgs)
+                    _estimated_next_prompt = (
+                        _compressor.last_prompt_tokens
+                        + _compressor.last_completion_tokens
+                        + _new_chars // 3  # conservative: JSON-heavy tool results ≈ 3 chars/token
+                    )
+                    if self.compression_enabled and _compressor.should_compress(_estimated_next_prompt):
                        messages, active_system_prompt = self._compress_context(
                            messages, system_message,
                            approx_tokens=self.context_compressor.last_prompt_tokens,