Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.
This commit is contained in:
teknium 2026-01-10 05:56:26 +00:00
parent 21f9e2df40
commit 4071ba29da
8 changed files with 572 additions and 111 deletions

View file

@ -192,6 +192,7 @@ def _process_single_prompt(
"trajectory": trajectory, "trajectory": trajectory,
"tool_stats": tool_stats, "tool_stats": tool_stats,
"completed": result["completed"], "completed": result["completed"],
"partial": result.get("partial", False),
"api_calls": result["api_calls"], "api_calls": result["api_calls"],
"toolsets_used": selected_toolsets, "toolsets_used": selected_toolsets,
"metadata": { "metadata": {
@ -272,13 +273,23 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
# Save trajectory if successful # Save trajectory if successful
if result["success"] and result["trajectory"]: if result["success"] and result["trajectory"]:
# Create tool_error_counts mapping tool names to their failure counts
tool_stats = result.get("tool_stats", {})
tool_error_counts = {
tool_name: stats.get("failure", 0)
for tool_name, stats in tool_stats.items()
}
trajectory_entry = { trajectory_entry = {
"prompt_index": prompt_index, "prompt_index": prompt_index,
"conversations": result["trajectory"], "conversations": result["trajectory"],
"metadata": result["metadata"], "metadata": result["metadata"],
"completed": result["completed"], "completed": result["completed"],
"partial": result.get("partial", False), # True if stopped due to invalid tool calls
"api_calls": result["api_calls"], "api_calls": result["api_calls"],
"toolsets_used": result["toolsets_used"] "toolsets_used": result["toolsets_used"],
"tool_stats": tool_stats, # Full stats: {tool: {count, success, failure}}
"tool_error_counts": tool_error_counts # Simple: {tool: failure_count}
} }
# Append to batch output file # Append to batch output file
@ -601,18 +612,44 @@ class BatchRunner:
stats["failure_rate"] = 0.0 stats["failure_rate"] = 0.0
# Combine all batch files into a single trajectories.jsonl file # Combine all batch files into a single trajectories.jsonl file
# Also filter out corrupted entries (where model generated invalid tool names)
combined_file = self.output_dir / "trajectories.jsonl" combined_file = self.output_dir / "trajectories.jsonl"
print(f"\n📦 Combining batch files into {combined_file.name}...") print(f"\n📦 Combining batch files into {combined_file.name}...")
VALID_TOOLS = {'web_search', 'web_extract', 'web_crawl', 'terminal', 'vision_analyze',
'image_generate', 'mixture_of_agents'}
total_entries = 0
filtered_entries = 0
with open(combined_file, 'w', encoding='utf-8') as outfile: with open(combined_file, 'w', encoding='utf-8') as outfile:
for batch_num in range(len(self.batches)): for batch_num in range(len(self.batches)):
batch_file = self.output_dir / f"batch_{batch_num}.jsonl" batch_file = self.output_dir / f"batch_{batch_num}.jsonl"
if batch_file.exists(): if batch_file.exists():
with open(batch_file, 'r', encoding='utf-8') as infile: with open(batch_file, 'r', encoding='utf-8') as infile:
for line in infile: for line in infile:
outfile.write(line) total_entries += 1
try:
data = json.loads(line)
tool_stats = data.get('tool_stats', {})
print(f"✅ Combined {len(self.batches)} batch files into trajectories.jsonl") # Check for invalid tool names (model hallucinations)
invalid_tools = [k for k in tool_stats.keys() if k not in VALID_TOOLS]
if invalid_tools:
filtered_entries += 1
invalid_preview = invalid_tools[0][:50] + "..." if len(invalid_tools[0]) > 50 else invalid_tools[0]
print(f" ⚠️ Filtering corrupted entry (batch {batch_num}): invalid tool '{invalid_preview}'")
continue
outfile.write(line)
except json.JSONDecodeError:
filtered_entries += 1
print(f" ⚠️ Filtering invalid JSON entry (batch {batch_num})")
if filtered_entries > 0:
print(f"⚠️ Filtered {filtered_entries} corrupted entries out of {total_entries} total")
print(f"✅ Combined {len(self.batches)} batch files into trajectories.jsonl ({total_entries - filtered_entries} entries)")
# Save final statistics # Save final statistics
final_stats = { final_stats = {

View file

@ -8,7 +8,7 @@ for defining tools and executing function calls.
Currently supports: Currently supports:
- Web tools (search, extract, crawl) from web_tools.py - Web tools (search, extract, crawl) from web_tools.py
- Terminal tools (command execution with interactive sessions) from terminal_tool.py - Terminal tools (simple command execution, no session persistence) from simple_terminal_tool.py
- Vision tools (image analysis) from vision_tools.py - Vision tools (image analysis) from vision_tools.py
- Mixture of Agents tools (collaborative multi-model reasoning) from mixture_of_agents_tool.py - Mixture of Agents tools (collaborative multi-model reasoning) from mixture_of_agents_tool.py
- Image generation tools (text-to-image with upscaling) from image_generation_tool.py - Image generation tools (text-to-image with upscaling) from image_generation_tool.py

View file

@ -43,7 +43,7 @@ else:
# Import our tool system # Import our tool system
from model_tools import get_tool_definitions, handle_function_call, check_toolset_requirements from model_tools import get_tool_definitions, handle_function_call, check_toolset_requirements
from tools.terminal_tool import cleanup_vm from tools.simple_terminal_tool import cleanup_vm
class AIAgent: class AIAgent:
@ -177,9 +177,11 @@ class AIAgent:
disabled_toolsets=disabled_toolsets disabled_toolsets=disabled_toolsets
) )
# Show tool configuration # Show tool configuration and store valid tool names for validation
self.valid_tool_names = set()
if self.tools: if self.tools:
tool_names = [tool["function"]["name"] for tool in self.tools] self.valid_tool_names = {tool["function"]["name"] for tool in self.tools}
tool_names = sorted(self.valid_tool_names)
print(f"🛠️ Loaded {len(self.tools)} tools: {', '.join(tool_names)}") print(f"🛠️ Loaded {len(self.tools)} tools: {', '.join(tool_names)}")
# Show filtering info if applied # Show filtering info if applied
@ -495,6 +497,49 @@ class AIAgent:
if self.verbose_logging: if self.verbose_logging:
logging.debug(f"API Response received - Usage: {response.usage if hasattr(response, 'usage') else 'N/A'}") logging.debug(f"API Response received - Usage: {response.usage if hasattr(response, 'usage') else 'N/A'}")
# Validate response has valid choices before proceeding
if response is None or not hasattr(response, 'choices') or response.choices is None or len(response.choices) == 0:
# This is often rate limiting or provider returning malformed response
retry_count += 1
error_details = []
if response is None:
error_details.append("response is None")
elif not hasattr(response, 'choices'):
error_details.append("response has no 'choices' attribute")
elif response.choices is None:
error_details.append("response.choices is None")
else:
error_details.append("response.choices is empty")
# Check for error field in response (some providers include this)
error_msg = "Unknown"
if response and hasattr(response, 'error') and response.error:
error_msg = str(response.error)
elif response and hasattr(response, 'message') and response.message:
error_msg = str(response.message)
print(f"{self.log_prefix}⚠️ Invalid API response (attempt {retry_count}/{max_retries}): {', '.join(error_details)}")
print(f"{self.log_prefix} 📝 Provider message: {error_msg[:200]}")
print(f"{self.log_prefix} ⏱️ Response time: {api_duration:.2f}s (fast response often indicates rate limiting)")
if retry_count > max_retries:
print(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded for invalid responses. Giving up.")
logging.error(f"{self.log_prefix}Invalid API response after {max_retries} retries.")
return {
"messages": messages,
"completed": False,
"api_calls": api_call_count,
"error": f"Invalid API response (choices is None/empty). Likely rate limited by provider.",
"failed": True # Mark as failure for filtering
}
# Longer backoff for rate limiting (likely cause of None choices)
wait_time = min(5 * (2 ** (retry_count - 1)), 120) # 5s, 10s, 20s, 40s, 80s, 120s
print(f"{self.log_prefix}⏳ Retrying in {wait_time}s (extended backoff for possible rate limit)...")
logging.warning(f"Invalid API response (retry {retry_count}/{max_retries}): {', '.join(error_details)}")
time.sleep(wait_time)
continue # Retry the API call
break # Success, exit retry loop break # Success, exit retry loop
except Exception as api_error: except Exception as api_error:
@ -503,13 +548,32 @@ class AIAgent:
# Enhanced error logging # Enhanced error logging
error_type = type(api_error).__name__ error_type = type(api_error).__name__
error_msg = str(api_error) error_msg = str(api_error).lower()
print(f"{self.log_prefix}⚠️ API call failed (attempt {retry_count}/{max_retries}): {error_type}") print(f"{self.log_prefix}⚠️ API call failed (attempt {retry_count}/{max_retries}): {error_type}")
print(f"{self.log_prefix} ⏱️ Time elapsed before failure: {elapsed_time:.2f}s") print(f"{self.log_prefix} ⏱️ Time elapsed before failure: {elapsed_time:.2f}s")
print(f"{self.log_prefix} 📝 Error: {error_msg[:200]}") print(f"{self.log_prefix} 📝 Error: {str(api_error)[:200]}")
print(f"{self.log_prefix} 📊 Request context: {len(api_messages)} messages, ~{approx_tokens:,} tokens, {len(self.tools) if self.tools else 0} tools") print(f"{self.log_prefix} 📊 Request context: {len(api_messages)} messages, ~{approx_tokens:,} tokens, {len(self.tools) if self.tools else 0} tools")
# Check for non-retryable errors (context length exceeded)
is_context_length_error = any(phrase in error_msg for phrase in [
'context length', 'maximum context', 'token limit',
'too many tokens', 'reduce the length', 'exceeds the limit'
])
if is_context_length_error:
print(f"{self.log_prefix}❌ Context length exceeded - this error cannot be resolved by retrying.")
print(f"{self.log_prefix} 💡 The conversation has accumulated too much content from tool responses.")
logging.error(f"{self.log_prefix}Context length exceeded: {approx_tokens:,} tokens. Cannot continue.")
# Return a partial result instead of crashing
return {
"messages": messages,
"completed": False,
"api_calls": api_call_count,
"error": f"Context length exceeded ({approx_tokens:,} tokens). Conversation terminated early.",
"partial": True
}
if retry_count > max_retries: if retry_count > max_retries:
print(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded. Giving up.") print(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded. Giving up.")
logging.error(f"{self.log_prefix}API call failed after {max_retries} retries. Last error: {api_error}") logging.error(f"{self.log_prefix}API call failed after {max_retries} retries. Last error: {api_error}")
@ -537,6 +601,43 @@ class AIAgent:
for tc in assistant_message.tool_calls: for tc in assistant_message.tool_calls:
logging.debug(f"Tool call: {tc.function.name} with args: {tc.function.arguments[:200]}...") logging.debug(f"Tool call: {tc.function.name} with args: {tc.function.arguments[:200]}...")
# Validate tool call names - detect model hallucinations
invalid_tool_calls = [
tc.function.name for tc in assistant_message.tool_calls
if tc.function.name not in self.valid_tool_names
]
if invalid_tool_calls:
# Track retries for invalid tool calls
if not hasattr(self, '_invalid_tool_retries'):
self._invalid_tool_retries = 0
self._invalid_tool_retries += 1
invalid_preview = invalid_tool_calls[0][:80] + "..." if len(invalid_tool_calls[0]) > 80 else invalid_tool_calls[0]
print(f"{self.log_prefix}⚠️ Invalid tool call detected: '{invalid_preview}'")
print(f"{self.log_prefix} Valid tools: {sorted(self.valid_tool_names)}")
if self._invalid_tool_retries < 3:
print(f"{self.log_prefix}🔄 Retrying API call ({self._invalid_tool_retries}/3)...")
# Don't add anything to messages, just retry the API call
continue
else:
print(f"{self.log_prefix}❌ Max retries (3) for invalid tool calls exceeded. Stopping as partial.")
# Return partial result - don't include the bad tool call in messages
self._invalid_tool_retries = 0 # Reset for next conversation
return {
"final_response": None,
"messages": messages, # Messages up to last valid point
"api_calls": api_call_count,
"completed": False,
"partial": True,
"error": f"Model generated invalid tool call: {invalid_preview}"
}
# Reset retry counter on successful tool call validation
if hasattr(self, '_invalid_tool_retries'):
self._invalid_tool_retries = 0
# Extract reasoning from response if available (for reasoning models like minimax, kimi, etc.) # Extract reasoning from response if available (for reasoning models like minimax, kimi, etc.)
reasoning_content = None reasoning_content = None
if hasattr(assistant_message, 'reasoning') and assistant_message.reasoning: if hasattr(assistant_message, 'reasoning') and assistant_message.reasoning:
@ -669,7 +770,8 @@ class AIAgent:
"final_response": final_response, "final_response": final_response,
"messages": messages, "messages": messages,
"api_calls": api_call_count, "api_calls": api_call_count,
"completed": completed "completed": completed,
"partial": False # True only when stopped due to invalid tool calls
} }
def chat(self, message: str) -> str: def chat(self, message: str) -> str:

26
run_datagen_glm4.7.sh Executable file
View file

@ -0,0 +1,26 @@
#!/bin/bash
# Run SFT data generation with GLM-4.7 (thinking) over the science task
# distribution, logging all output to a timestamped file under logs/.

# Create logs directory if it doesn't exist
mkdir -p logs

# Generate log filename with timestamp
LOG_FILE="logs/glm4.7-thinking-sft1_$(date +%Y%m%d_%H%M%S).log"
echo "📝 Logging output to: $LOG_FILE"

# Run the batch generator, teeing output to the log file.
# NOTE: `| tee` would normally mask batch_runner.py's exit status (the
# pipeline's status is tee's, and the script would always exit 0). We
# capture PIPESTATUS[0] and propagate it so callers/CI see real failures.
python batch_runner.py \
    --dataset_file="source-data/hermes-agent-agent-tasks-1/agent_tasks_sft_1.jsonl" \
    --batch_size=25 \
    --run_name="megascience_glm4.7-thinking-sft1" \
    --distribution="science" \
    --model="z-ai/glm-4.7" \
    --base_url="https://openrouter.ai/api/v1" \
    --providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
    --num_workers=10 \
    --max_turns=60 \
    --ephemeral_system_prompt="You have access to a variety of tools to help you solve scientific, math, and technology problems presented to you. You can use them in sequence and build off of the results of prior tools you've used. Always use the terminal or search tool if it can provide additional context, verify formulas, double check concepts and recent studies and understanding, doing all calculations, etc. You should only be confident in your own reasoning, knowledge, or calculations if you've exhaustively used all tools available to you to that can help you verify or validate your work. Always pip install any packages you need to use the python scripts you want to run. If you need to use a tool that isn't available, you can use the terminal tool to install or create it in many cases as well. Do not use the terminal tool to communicate with the user, as they cannot see your commands, only your final response after completing the task. Search for at least 3 sources, but not more than 12." \
    2>&1 | tee "$LOG_FILE"
RUN_STATUS=${PIPESTATUS[0]}

echo "✅ Log saved to: $LOG_FILE"
exit "$RUN_STATUS"

# --verbose \

View file

@ -6,7 +6,7 @@ This package contains all the specific tool implementations for the Hermes Agent
Each module provides specialized functionality for different capabilities: Each module provides specialized functionality for different capabilities:
- web_tools: Web search, content extraction, and crawling - web_tools: Web search, content extraction, and crawling
- terminal_tool: Command execution on virtual machines - simple_terminal_tool: Simple command execution on virtual machines (no session persistence)
- vision_tools: Image analysis and understanding - vision_tools: Image analysis and understanding
- mixture_of_agents_tool: Multi-model collaborative reasoning - mixture_of_agents_tool: Multi-model collaborative reasoning
- image_generation_tool: Text-to-image generation with upscaling - image_generation_tool: Text-to-image generation with upscaling
@ -23,10 +23,11 @@ from .web_tools import (
check_firecrawl_api_key check_firecrawl_api_key
) )
from .terminal_tool import ( from .simple_terminal_tool import (
terminal_tool, simple_terminal_tool,
check_hecate_requirements, check_requirements as check_terminal_requirements,
TERMINAL_TOOL_DESCRIPTION cleanup_vm,
SIMPLE_TERMINAL_TOOL_DESCRIPTION
) )
from .vision_tools import ( from .vision_tools import (
@ -50,10 +51,11 @@ __all__ = [
'web_extract_tool', 'web_extract_tool',
'web_crawl_tool', 'web_crawl_tool',
'check_firecrawl_api_key', 'check_firecrawl_api_key',
# Terminal tools # Terminal tools (simple - no session persistence)
'terminal_tool', 'simple_terminal_tool',
'check_hecate_requirements', 'check_terminal_requirements',
'TERMINAL_TOOL_DESCRIPTION', 'cleanup_vm',
'SIMPLE_TERMINAL_TOOL_DESCRIPTION',
# Vision tools # Vision tools
'vision_analyze_tool', 'vision_analyze_tool',
'check_vision_requirements', 'check_vision_requirements',

View file

@ -101,6 +101,12 @@ def _cleanup_inactive_vms(vm_lifetime_seconds: int = 300):
else: else:
print(f"[VM Cleanup] Error cleaning up VM for task {task_id}: {e}") print(f"[VM Cleanup] Error cleaning up VM for task {task_id}: {e}")
# Always remove from tracking dicts to prevent infinite retry loops
if task_id in _active_instances:
del _active_instances[task_id]
if task_id in _last_activity:
del _last_activity[task_id]
def _cleanup_thread_worker(): def _cleanup_thread_worker():
"""Background thread worker that periodically cleans up inactive VMs.""" """Background thread worker that periodically cleans up inactive VMs."""
@ -171,48 +177,36 @@ def cleanup_vm(task_id: str):
atexit.register(_stop_cleanup_thread) atexit.register(_stop_cleanup_thread)
def _execute_ssh_command(instance, command: str, timeout: Optional[int] = None) -> Dict[str, Any]: def _execute_command(instance, command: str, timeout: Optional[int] = None) -> Dict[str, Any]:
""" """
Execute a command via SSH on the VM instance. Execute a command on the VM instance using instance.exec() for proper stderr capture.
Args: Args:
instance: MorphVM instance instance: MorphVM instance
command: Command to execute command: Command to execute
timeout: Optional timeout in seconds timeout: Optional timeout in seconds (Note: exec() may not support timeout directly)
Returns: Returns:
dict with stdout, stderr, returncode dict with stdout, stderr, returncode
""" """
ssh_context_manager = None
try: try:
# Use the instance's SSH context manager # Use instance.exec() which properly captures both stdout and stderr
ssh_context_manager = instance.ssh() # (unlike ssh.run() which doesn't capture stderr correctly)
ssh_context = ssh_context_manager.__enter__() result = instance.exec(command)
# Execute the command # Debug logging only for verbose mode or unusual cases
result = ssh_context.run(command, get_pty=False, timeout=timeout or 120) # Note: Non-zero exit codes are normal (model's command failed) - not a tool error
if result.exit_code != 0 and not result.stdout and not result.stderr:
# Close the SSH connection # Only log if we got absolutely no output - might indicate an issue
if ssh_context_manager: print(f"⚠️ Command returned exit={result.exit_code} with no output")
try:
ssh_context_manager.__exit__(None, None, None)
except:
pass
return { return {
"stdout": result.stdout or "", "stdout": result.stdout or "",
"stderr": result.stderr or "", "stderr": result.stderr or "",
"returncode": result.returncode "returncode": result.exit_code
} }
except Exception as e: except Exception as e:
# Close connection on error
if ssh_context_manager:
try:
ssh_context_manager.__exit__(None, None, None)
except:
pass
# Check if it's a timeout # Check if it's a timeout
error_str = str(e).lower() error_str = str(e).lower()
if "timeout" in error_str: if "timeout" in error_str:
@ -224,7 +218,7 @@ def _execute_ssh_command(instance, command: str, timeout: Optional[int] = None)
return { return {
"stdout": "", "stdout": "",
"stderr": f"SSH execution failed: {str(e)}", "stderr": f"Command execution failed: {str(e)}",
"returncode": -1 "returncode": -1
} }
@ -312,7 +306,7 @@ def simple_terminal_tool(
if background: if background:
# Run in background with nohup and redirect output # Run in background with nohup and redirect output
exec_command = f"nohup {command} > /tmp/bg_output.log 2>&1 &" exec_command = f"nohup {command} > /tmp/bg_output.log 2>&1 &"
result = _execute_ssh_command(instance, exec_command, timeout=10) result = _execute_command(instance, exec_command, timeout=10)
# For background tasks, return immediately with info # For background tasks, return immediately with info
if result["returncode"] == 0: if result["returncode"] == 0:
@ -322,24 +316,72 @@ def simple_terminal_tool(
"error": None "error": None
}, ensure_ascii=False) }, ensure_ascii=False)
else: else:
# Include stderr in output but don't set error (command failure, not tool failure)
bg_output = result["stdout"]
if result["stderr"]:
bg_output = f"{bg_output}\n{result['stderr']}" if bg_output else result["stderr"]
return json.dumps({ return json.dumps({
"output": result["stdout"], "output": bg_output,
"exit_code": result["returncode"], "exit_code": result["returncode"],
"error": result["stderr"] "error": None # Only set for actual tool failures
}, ensure_ascii=False) }, ensure_ascii=False)
else: else:
# Run foreground command # Run foreground command with retry logic for transient failures
result = _execute_ssh_command(instance, command, timeout=timeout) max_retries = 3
retry_count = 0
result = None
while retry_count <= max_retries:
result = _execute_command(instance, command, timeout=timeout)
# Check if we should retry (only for transient errors, not normal results)
stdout = result.get("stdout", "")
stderr = result.get("stderr", "")
returncode = result.get("returncode", 0)
should_retry = False
retry_reason = ""
# NOTE: Empty output with exit_code=0 is NORMAL for many commands:
# - File writes: cat > file, echo > file
# - Directory ops: mkdir, cd
# - Silent installs: pip install --quiet
# So we do NOT retry on exit_code=0, even with empty output.
# Only retry on special error codes that suggest transient/infra issues
if not stdout and not stderr and returncode in [-1, 124]:
should_retry = True
retry_reason = f"transient error (code {returncode})"
if should_retry and retry_count < max_retries:
retry_count += 1
wait_time = 2 ** retry_count # Exponential backoff: 2s, 4s, 8s
print(f"⚠️ Terminal: {retry_reason}, retrying in {wait_time}s (attempt {retry_count}/{max_retries})")
time.sleep(wait_time)
continue
# Got a result (success or normal command failure) - exit retry loop
break
# Combine stdout and stderr for output # Combine stdout and stderr for output
output = result["stdout"] output = result["stdout"]
if result["stderr"] and result["returncode"] != 0: if result["stderr"] and result["returncode"] != 0:
output = f"{output}\n{result['stderr']}" if output else result["stderr"] output = f"{output}\n{result['stderr']}" if output else result["stderr"]
# Truncate output if too long (max 50,000 chars to avoid context explosion)
MAX_OUTPUT_CHARS = 50000
if len(output) > MAX_OUTPUT_CHARS:
truncated_notice = f"\n\n... [OUTPUT TRUNCATED - showing last {MAX_OUTPUT_CHARS} chars of {len(output)} total] ..."
output = truncated_notice + output[-MAX_OUTPUT_CHARS:]
# NOTE: error is only set for FUNCTIONAL tool failures (VM issues, timeouts, etc.)
# Non-zero exit codes from the model's commands are NOT tool failures -
# the model can self-correct. The exit_code field tells the model if the command succeeded.
# Retries that eventually succeed also don't count as failures.
return json.dumps({ return json.dumps({
"output": output.strip(), "output": output.strip(),
"exit_code": result["returncode"], "exit_code": result["returncode"],
"error": result["stderr"] if result["returncode"] != 0 else None "error": None # Only set for actual tool failures, not command failures
}, ensure_ascii=False) }, ensure_ascii=False)
except Exception as e: except Exception as e:

View file

@ -270,6 +270,7 @@ def terminal_tool(
except ImportError as import_error: except ImportError as import_error:
return json.dumps({ return json.dumps({
"output": "", "output": "",
"stderr": "",
"screen": "", "screen": "",
"exit_code": -1, "exit_code": -1,
"error": f"Terminal tool is disabled due to import error: {import_error}", "error": f"Terminal tool is disabled due to import error: {import_error}",
@ -287,6 +288,7 @@ def terminal_tool(
if not morph_api_key: if not morph_api_key:
return json.dumps({ return json.dumps({
"output": "", "output": "",
"stderr": "",
"screen": "", "screen": "",
"exit_code": -1, "exit_code": -1,
"error": "MORPH_API_KEY environment variable not set", "error": "MORPH_API_KEY environment variable not set",
@ -349,29 +351,85 @@ def terminal_tool(
# Generate unique tool block ID # Generate unique tool block ID
tool_block_id = f"tool_{uuid.uuid4().hex[:8]}" tool_block_id = f"tool_{uuid.uuid4().hex[:8]}"
# Execute the tool with hecate # Retry configuration for handling transient empty responses
result = run_tool( max_retries = 3
tool_call=tool_call, retry_count = 0
instance=instance,
console=console,
tool_block_id=tool_block_id,
ctx=ctx
)
# Format the result with only essential fields for the LLM while retry_count <= max_retries:
# Map hecate's "stdout" to "output" for compatibility # Execute the tool with hecate
formatted_result = { result = run_tool(
"output": result.get("stdout", result.get("output", "")), tool_call=tool_call,
"screen": result.get("screen", ""), instance=instance,
"exit_code": result.get("returncode", result.get("exit_code", -1)), console=console,
"error": result.get("error") tool_block_id=tool_block_id,
} ctx=ctx
)
return json.dumps(formatted_result, ensure_ascii=False) # Format the result with only essential fields for the LLM
# Map hecate's "stdout" to "output" for compatibility
stdout = result.get("stdout", result.get("output", ""))
stderr = result.get("stderr", "")
exit_code = result.get("returncode", result.get("exit_code", -1))
error = result.get("error")
screen = result.get("screen", "")
# If there's no explicit error but there's stderr, include it in error field
# This helps capture why commands failed even without an explicit error message
if not error and stderr:
error = stderr
# If exit code is non-zero but no error info, note that
elif not error and exit_code and exit_code != 0 and not stdout:
error = f"Command exited with code {exit_code}"
# Check if we should retry:
# 1. Empty output with non-zero exit code (clear failure)
# 2. Completely empty response (may indicate timing/VM issue)
should_retry = False
retry_reason = ""
if not stdout and not stderr and not screen and not error and exit_code == 0:
# Completely empty response - might be a timing issue
should_retry = True
retry_reason = "completely empty response (possible timing issue)"
elif not stdout and not stderr and exit_code != 0 and exit_code != -1:
# Non-zero exit with no output at all - might be transient
should_retry = True
retry_reason = f"empty output with exit code {exit_code}"
if should_retry and retry_count < max_retries:
retry_count += 1
wait_time = 2 ** retry_count # Exponential backoff: 2s, 4s, 8s
print(f"⚠️ Terminal: {retry_reason}, retrying in {wait_time}s (attempt {retry_count}/{max_retries})")
time.sleep(wait_time)
continue
# Success or max retries reached - return the result
formatted_result = {
"output": stdout,
"stderr": stderr, # Now capturing stderr separately too
"screen": screen,
"exit_code": exit_code,
"error": error
}
if retry_count > 0:
formatted_result["retries"] = retry_count
return json.dumps(formatted_result, ensure_ascii=False)
# Should never reach here, but just in case
return json.dumps({
"output": "",
"stderr": "",
"screen": "",
"exit_code": -1,
"error": "Terminal tool: max retries exceeded"
}, ensure_ascii=False)
except Exception as e: except Exception as e:
return json.dumps({ return json.dumps({
"output": "", "output": "",
"stderr": "",
"screen": "", "screen": "",
"exit_code": -1, "exit_code": -1,
"error": f"Failed to execute terminal command: {str(e)}", "error": f"Failed to execute terminal command: {str(e)}",

View file

@ -139,6 +139,9 @@ async def process_content_with_llm(
to intelligently extract key information and create markdown summaries, to intelligently extract key information and create markdown summaries,
significantly reducing token usage while preserving all important information. significantly reducing token usage while preserving all important information.
For very large content (>500k chars), uses chunked processing with synthesis.
For extremely large content (>2M chars), refuses to process entirely.
Args: Args:
content (str): The raw content to process content (str): The raw content to process
url (str): The source URL (for context, optional) url (str): The source URL (for context, optional)
@ -149,13 +152,25 @@ async def process_content_with_llm(
Returns: Returns:
Optional[str]: Processed markdown content, or None if content too short or processing fails Optional[str]: Processed markdown content, or None if content too short or processing fails
""" """
try: # Size thresholds
# Skip processing if content is too short MAX_CONTENT_SIZE = 2_000_000 # 2M chars - refuse entirely above this
if len(content) < min_length: CHUNK_THRESHOLD = 500_000 # 500k chars - use chunked processing above this
print(f"📏 Content too short ({len(content)} < {min_length} chars), skipping LLM processing") CHUNK_SIZE = 100_000 # 100k chars per chunk
return None MAX_OUTPUT_SIZE = 5000 # Hard cap on final output size
print(f"🧠 Processing content with LLM ({len(content)} characters)") try:
content_len = len(content)
# Refuse if content is absurdly large
if content_len > MAX_CONTENT_SIZE:
size_mb = content_len / 1_000_000
print(f"🚫 Content too large ({size_mb:.1f}MB > 2MB limit). Refusing to process.")
return f"[Content too large to process: {size_mb:.1f}MB. Try using web_crawl with specific extraction instructions, or search for a more focused source.]"
# Skip processing if content is too short
if content_len < min_length:
print(f"📏 Content too short ({content_len} < {min_length} chars), skipping LLM processing")
return None
# Create context information # Create context information
context_info = [] context_info = []
@ -163,10 +178,83 @@ async def process_content_with_llm(
context_info.append(f"Title: {title}") context_info.append(f"Title: {title}")
if url: if url:
context_info.append(f"Source: {url}") context_info.append(f"Source: {url}")
context_str = "\n".join(context_info) + "\n\n" if context_info else "" context_str = "\n".join(context_info) + "\n\n" if context_info else ""
# Simplified prompt for better quality markdown output # Check if we need chunked processing
if content_len > CHUNK_THRESHOLD:
print(f"📦 Content large ({content_len:,} chars). Using chunked processing...")
return await _process_large_content_chunked(
content, context_str, model, CHUNK_SIZE, MAX_OUTPUT_SIZE
)
# Standard single-pass processing for normal content
print(f"🧠 Processing content with LLM ({content_len} characters)")
processed_content = await _call_summarizer_llm(content, context_str, model)
if processed_content:
# Enforce output cap
if len(processed_content) > MAX_OUTPUT_SIZE:
processed_content = processed_content[:MAX_OUTPUT_SIZE] + "\n\n[... summary truncated for context management ...]"
# Log compression metrics
processed_length = len(processed_content)
compression_ratio = processed_length / content_len if content_len > 0 else 1.0
print(f"✅ Content processed: {content_len}{processed_length} chars ({compression_ratio:.1%})")
return processed_content
except Exception as e:
print(f"❌ Error processing content with LLM: {str(e)}")
return f"[Failed to process content: {str(e)[:100]}. Content size: {len(content):,} chars]"
async def _call_summarizer_llm(
content: str,
context_str: str,
model: str,
max_tokens: int = 4000,
is_chunk: bool = False,
chunk_info: str = ""
) -> Optional[str]:
"""
Make a single LLM call to summarize content.
Args:
content: The content to summarize
context_str: Context information (title, URL)
model: Model to use
max_tokens: Maximum output tokens
is_chunk: Whether this is a chunk of a larger document
chunk_info: Information about chunk position (e.g., "Chunk 2/5")
Returns:
Summarized content or None on failure
"""
if is_chunk:
# Chunk-specific prompt - aware that this is partial content
system_prompt = """You are an expert content analyst processing a SECTION of a larger document. Your job is to extract and summarize the key information from THIS SECTION ONLY.
Important guidelines for chunk processing:
1. Do NOT write introductions or conclusions - this is a partial document
2. Focus on extracting ALL key facts, figures, data points, and insights from this section
3. Preserve important quotes, code snippets, and specific details verbatim
4. Use bullet points and structured formatting for easy synthesis later
5. Note any references to other sections (e.g., "as mentioned earlier", "see below") without trying to resolve them
Your output will be combined with summaries of other sections, so focus on thorough extraction rather than narrative flow."""
user_prompt = f"""Extract key information from this SECTION of a larger document:
{context_str}{chunk_info}
SECTION CONTENT:
{content}
Extract all important information from this section in a structured format. Focus on facts, data, insights, and key details. Do not add introductions or conclusions."""
else:
# Standard full-document prompt
system_prompt = """You are an expert content analyst. Your job is to process web content and create a comprehensive yet concise summary that preserves all important information while dramatically reducing bulk. system_prompt = """You are an expert content analyst. Your job is to process web content and create a comprehensive yet concise summary that preserves all important information while dramatically reducing bulk.
Create a well-structured markdown summary that includes: Create a well-structured markdown summary that includes:
@ -183,49 +271,155 @@ Your goal is to preserve ALL important information while reducing length. Never
Create a markdown summary that captures all key information in a well-organized, scannable format. Include important quotes and code snippets in their original formatting. Focus on actionable information, specific details, and unique insights.""" Create a markdown summary that captures all key information in a well-organized, scannable format. Include important quotes and code snippets in their original formatting. Focus on actionable information, specific details, and unique insights."""
# Call the LLM asynchronously with retry logic for flaky API # Call the LLM with retry logic
max_retries = 6 max_retries = 6
retry_delay = 2 # Start with 2 seconds retry_delay = 2
last_error = None last_error = None
for attempt in range(max_retries): for attempt in range(max_retries):
try: try:
response = await summarizer_client.chat.completions.create( response = await summarizer_client.chat.completions.create(
model=model, model=model,
messages=[ messages=[
{"role": "system", "content": system_prompt}, {"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt} {"role": "user", "content": user_prompt}
], ],
temperature=0.1, # Low temperature for consistent extraction temperature=0.1,
max_tokens=4000 # Generous limit for comprehensive processing max_tokens=max_tokens
) )
break # Success, exit retry loop return response.choices[0].message.content.strip()
except Exception as api_error: except Exception as api_error:
last_error = api_error last_error = api_error
if attempt < max_retries - 1: if attempt < max_retries - 1:
print(f"⚠️ LLM API call failed (attempt {attempt + 1}/{max_retries}): {str(api_error)[:100]}") print(f"⚠️ LLM API call failed (attempt {attempt + 1}/{max_retries}): {str(api_error)[:100]}")
print(f" Retrying in {retry_delay}s...") print(f" Retrying in {retry_delay}s...")
await asyncio.sleep(retry_delay) await asyncio.sleep(retry_delay)
retry_delay = min(retry_delay * 2, 60) # Exponential backoff: 2s, 4s, 8s, 16s, 32s, 60s retry_delay = min(retry_delay * 2, 60)
else: else:
# All retries exhausted raise last_error
raise last_error
# Get the markdown response directly return None
processed_content = response.choices[0].message.content.strip()
# Calculate compression metrics for logging
original_length = len(content)
processed_length = len(processed_content)
compression_ratio = processed_length / original_length if original_length > 0 else 1.0
print(f"✅ Content processed: {original_length}{processed_length} chars ({compression_ratio:.1%})") async def _process_large_content_chunked(
content: str,
context_str: str,
model: str,
chunk_size: int,
max_output_size: int
) -> Optional[str]:
"""
Process large content by chunking, summarizing each chunk in parallel,
then synthesizing the summaries.
return processed_content Args:
content: The large content to process
context_str: Context information
model: Model to use
chunk_size: Size of each chunk in characters
max_output_size: Maximum final output size
Returns:
Synthesized summary or None on failure
"""
# Split content into chunks
chunks = []
for i in range(0, len(content), chunk_size):
chunk = content[i:i + chunk_size]
chunks.append(chunk)
print(f" 📦 Split into {len(chunks)} chunks of ~{chunk_size:,} chars each")
# Summarize each chunk in parallel
async def summarize_chunk(chunk_idx: int, chunk_content: str) -> tuple[int, Optional[str]]:
"""Summarize a single chunk."""
try:
chunk_info = f"[Processing chunk {chunk_idx + 1} of {len(chunks)}]"
summary = await _call_summarizer_llm(
chunk_content,
context_str,
model,
max_tokens=2000,
is_chunk=True,
chunk_info=chunk_info
)
if summary:
print(f" ✅ Chunk {chunk_idx + 1}/{len(chunks)} summarized: {len(chunk_content):,}{len(summary):,} chars")
return chunk_idx, summary
except Exception as e:
print(f" ⚠️ Chunk {chunk_idx + 1}/{len(chunks)} failed: {str(e)[:50]}")
return chunk_idx, None
# Run all chunk summarizations in parallel
tasks = [summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)]
results = await asyncio.gather(*tasks)
# Collect successful summaries in order
summaries = []
for chunk_idx, summary in sorted(results, key=lambda x: x[0]):
if summary:
summaries.append(f"## Section {chunk_idx + 1}\n{summary}")
if not summaries:
print(f" ❌ All chunk summarizations failed")
return "[Failed to process large content: all chunk summarizations failed]"
print(f" 📊 Got {len(summaries)}/{len(chunks)} chunk summaries")
# If only one chunk succeeded, just return it (with cap)
if len(summaries) == 1:
result = summaries[0]
if len(result) > max_output_size:
result = result[:max_output_size] + "\n\n[... truncated ...]"
return result
# Synthesize the summaries into a final summary
print(f" 🔗 Synthesizing {len(summaries)} summaries...")
combined_summaries = "\n\n---\n\n".join(summaries)
synthesis_prompt = f"""You have been given summaries of different sections of a large document.
Synthesize these into ONE cohesive, comprehensive summary that:
1. Removes redundancy between sections
2. Preserves all key facts, figures, and actionable information
3. Is well-organized with clear structure
4. Is under {max_output_size} characters
{context_str}SECTION SUMMARIES:
{combined_summaries}
Create a single, unified markdown summary."""
try:
response = await summarizer_client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": "You synthesize multiple summaries into one cohesive, comprehensive summary. Be thorough but concise."},
{"role": "user", "content": synthesis_prompt}
],
temperature=0.1,
max_tokens=4000
)
final_summary = response.choices[0].message.content.strip()
# Enforce hard cap
if len(final_summary) > max_output_size:
final_summary = final_summary[:max_output_size] + "\n\n[... summary truncated for context management ...]"
original_len = len(content)
final_len = len(final_summary)
compression = final_len / original_len if original_len > 0 else 1.0
print(f" ✅ Synthesis complete: {original_len:,}{final_len:,} chars ({compression:.2%})")
return final_summary
except Exception as e: except Exception as e:
print(f"❌ Error processing content with LLM: {str(e)}") print(f" ⚠️ Synthesis failed: {str(e)[:100]}")
return None # Fall back to concatenated summaries with truncation
fallback = "\n\n".join(summaries)
if len(fallback) > max_output_size:
fallback = fallback[:max_output_size] + "\n\n[... truncated due to synthesis failure ...]"
return fallback
def clean_base64_images(text: str) -> str: def clean_base64_images(text: str) -> str: