Enhance batch processing and tool validation

- Added support for tracking partial results and tool error counts in batch processing.
- Implemented filtering of corrupted entries during batch file combination based on valid tool names.
- Updated terminal tool to improve command execution and error handling, including retry logic for transient failures.
- Refactored model tools to use a simple terminal tool with no session persistence.
- Improved logging and error messages for invalid API responses and tool calls.
- Introduced chunked processing for large content in web tools to manage size limitations effectively.
This commit is contained in:
teknium 2026-01-10 05:56:26 +00:00
parent 21f9e2df40
commit 4071ba29da
8 changed files with 572 additions and 111 deletions

View file

@ -192,6 +192,7 @@ def _process_single_prompt(
"trajectory": trajectory, "trajectory": trajectory,
"tool_stats": tool_stats, "tool_stats": tool_stats,
"completed": result["completed"], "completed": result["completed"],
"partial": result.get("partial", False),
"api_calls": result["api_calls"], "api_calls": result["api_calls"],
"toolsets_used": selected_toolsets, "toolsets_used": selected_toolsets,
"metadata": { "metadata": {
@ -272,13 +273,23 @@ def _process_batch_worker(args: Tuple) -> Dict[str, Any]:
# Save trajectory if successful # Save trajectory if successful
if result["success"] and result["trajectory"]: if result["success"] and result["trajectory"]:
# Create tool_error_counts mapping tool names to their failure counts
tool_stats = result.get("tool_stats", {})
tool_error_counts = {
tool_name: stats.get("failure", 0)
for tool_name, stats in tool_stats.items()
}
trajectory_entry = { trajectory_entry = {
"prompt_index": prompt_index, "prompt_index": prompt_index,
"conversations": result["trajectory"], "conversations": result["trajectory"],
"metadata": result["metadata"], "metadata": result["metadata"],
"completed": result["completed"], "completed": result["completed"],
"partial": result.get("partial", False), # True if stopped due to invalid tool calls
"api_calls": result["api_calls"], "api_calls": result["api_calls"],
"toolsets_used": result["toolsets_used"] "toolsets_used": result["toolsets_used"],
"tool_stats": tool_stats, # Full stats: {tool: {count, success, failure}}
"tool_error_counts": tool_error_counts # Simple: {tool: failure_count}
} }
# Append to batch output file # Append to batch output file
@ -601,18 +612,44 @@ class BatchRunner:
stats["failure_rate"] = 0.0 stats["failure_rate"] = 0.0
# Combine all batch files into a single trajectories.jsonl file # Combine all batch files into a single trajectories.jsonl file
# Also filter out corrupted entries (where model generated invalid tool names)
combined_file = self.output_dir / "trajectories.jsonl" combined_file = self.output_dir / "trajectories.jsonl"
print(f"\n📦 Combining batch files into {combined_file.name}...") print(f"\n📦 Combining batch files into {combined_file.name}...")
VALID_TOOLS = {'web_search', 'web_extract', 'web_crawl', 'terminal', 'vision_analyze',
'image_generate', 'mixture_of_agents'}
total_entries = 0
filtered_entries = 0
with open(combined_file, 'w', encoding='utf-8') as outfile: with open(combined_file, 'w', encoding='utf-8') as outfile:
for batch_num in range(len(self.batches)): for batch_num in range(len(self.batches)):
batch_file = self.output_dir / f"batch_{batch_num}.jsonl" batch_file = self.output_dir / f"batch_{batch_num}.jsonl"
if batch_file.exists(): if batch_file.exists():
with open(batch_file, 'r', encoding='utf-8') as infile: with open(batch_file, 'r', encoding='utf-8') as infile:
for line in infile: for line in infile:
outfile.write(line) total_entries += 1
try:
data = json.loads(line)
tool_stats = data.get('tool_stats', {})
print(f"✅ Combined {len(self.batches)} batch files into trajectories.jsonl") # Check for invalid tool names (model hallucinations)
invalid_tools = [k for k in tool_stats.keys() if k not in VALID_TOOLS]
if invalid_tools:
filtered_entries += 1
invalid_preview = invalid_tools[0][:50] + "..." if len(invalid_tools[0]) > 50 else invalid_tools[0]
print(f" ⚠️ Filtering corrupted entry (batch {batch_num}): invalid tool '{invalid_preview}'")
continue
outfile.write(line)
except json.JSONDecodeError:
filtered_entries += 1
print(f" ⚠️ Filtering invalid JSON entry (batch {batch_num})")
if filtered_entries > 0:
print(f"⚠️ Filtered {filtered_entries} corrupted entries out of {total_entries} total")
print(f"✅ Combined {len(self.batches)} batch files into trajectories.jsonl ({total_entries - filtered_entries} entries)")
# Save final statistics # Save final statistics
final_stats = { final_stats = {

View file

@ -8,7 +8,7 @@ for defining tools and executing function calls.
Currently supports: Currently supports:
- Web tools (search, extract, crawl) from web_tools.py - Web tools (search, extract, crawl) from web_tools.py
- Terminal tools (command execution with interactive sessions) from terminal_tool.py - Terminal tools (simple command execution, no session persistence) from simple_terminal_tool.py
- Vision tools (image analysis) from vision_tools.py - Vision tools (image analysis) from vision_tools.py
- Mixture of Agents tools (collaborative multi-model reasoning) from mixture_of_agents_tool.py - Mixture of Agents tools (collaborative multi-model reasoning) from mixture_of_agents_tool.py
- Image generation tools (text-to-image with upscaling) from image_generation_tool.py - Image generation tools (text-to-image with upscaling) from image_generation_tool.py

View file

@ -43,7 +43,7 @@ else:
# Import our tool system # Import our tool system
from model_tools import get_tool_definitions, handle_function_call, check_toolset_requirements from model_tools import get_tool_definitions, handle_function_call, check_toolset_requirements
from tools.terminal_tool import cleanup_vm from tools.simple_terminal_tool import cleanup_vm
class AIAgent: class AIAgent:
@ -177,9 +177,11 @@ class AIAgent:
disabled_toolsets=disabled_toolsets disabled_toolsets=disabled_toolsets
) )
# Show tool configuration # Show tool configuration and store valid tool names for validation
self.valid_tool_names = set()
if self.tools: if self.tools:
tool_names = [tool["function"]["name"] for tool in self.tools] self.valid_tool_names = {tool["function"]["name"] for tool in self.tools}
tool_names = sorted(self.valid_tool_names)
print(f"🛠️ Loaded {len(self.tools)} tools: {', '.join(tool_names)}") print(f"🛠️ Loaded {len(self.tools)} tools: {', '.join(tool_names)}")
# Show filtering info if applied # Show filtering info if applied
@ -495,6 +497,49 @@ class AIAgent:
if self.verbose_logging: if self.verbose_logging:
logging.debug(f"API Response received - Usage: {response.usage if hasattr(response, 'usage') else 'N/A'}") logging.debug(f"API Response received - Usage: {response.usage if hasattr(response, 'usage') else 'N/A'}")
# Validate response has valid choices before proceeding
if response is None or not hasattr(response, 'choices') or response.choices is None or len(response.choices) == 0:
# This is often rate limiting or provider returning malformed response
retry_count += 1
error_details = []
if response is None:
error_details.append("response is None")
elif not hasattr(response, 'choices'):
error_details.append("response has no 'choices' attribute")
elif response.choices is None:
error_details.append("response.choices is None")
else:
error_details.append("response.choices is empty")
# Check for error field in response (some providers include this)
error_msg = "Unknown"
if response and hasattr(response, 'error') and response.error:
error_msg = str(response.error)
elif response and hasattr(response, 'message') and response.message:
error_msg = str(response.message)
print(f"{self.log_prefix}⚠️ Invalid API response (attempt {retry_count}/{max_retries}): {', '.join(error_details)}")
print(f"{self.log_prefix} 📝 Provider message: {error_msg[:200]}")
print(f"{self.log_prefix} ⏱️ Response time: {api_duration:.2f}s (fast response often indicates rate limiting)")
if retry_count > max_retries:
print(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded for invalid responses. Giving up.")
logging.error(f"{self.log_prefix}Invalid API response after {max_retries} retries.")
return {
"messages": messages,
"completed": False,
"api_calls": api_call_count,
"error": f"Invalid API response (choices is None/empty). Likely rate limited by provider.",
"failed": True # Mark as failure for filtering
}
# Longer backoff for rate limiting (likely cause of None choices)
wait_time = min(5 * (2 ** (retry_count - 1)), 120) # 5s, 10s, 20s, 40s, 80s, 120s
print(f"{self.log_prefix}⏳ Retrying in {wait_time}s (extended backoff for possible rate limit)...")
logging.warning(f"Invalid API response (retry {retry_count}/{max_retries}): {', '.join(error_details)}")
time.sleep(wait_time)
continue # Retry the API call
break # Success, exit retry loop break # Success, exit retry loop
except Exception as api_error: except Exception as api_error:
@ -503,13 +548,32 @@ class AIAgent:
# Enhanced error logging # Enhanced error logging
error_type = type(api_error).__name__ error_type = type(api_error).__name__
error_msg = str(api_error) error_msg = str(api_error).lower()
print(f"{self.log_prefix}⚠️ API call failed (attempt {retry_count}/{max_retries}): {error_type}") print(f"{self.log_prefix}⚠️ API call failed (attempt {retry_count}/{max_retries}): {error_type}")
print(f"{self.log_prefix} ⏱️ Time elapsed before failure: {elapsed_time:.2f}s") print(f"{self.log_prefix} ⏱️ Time elapsed before failure: {elapsed_time:.2f}s")
print(f"{self.log_prefix} 📝 Error: {error_msg[:200]}") print(f"{self.log_prefix} 📝 Error: {str(api_error)[:200]}")
print(f"{self.log_prefix} 📊 Request context: {len(api_messages)} messages, ~{approx_tokens:,} tokens, {len(self.tools) if self.tools else 0} tools") print(f"{self.log_prefix} 📊 Request context: {len(api_messages)} messages, ~{approx_tokens:,} tokens, {len(self.tools) if self.tools else 0} tools")
# Check for non-retryable errors (context length exceeded)
is_context_length_error = any(phrase in error_msg for phrase in [
'context length', 'maximum context', 'token limit',
'too many tokens', 'reduce the length', 'exceeds the limit'
])
if is_context_length_error:
print(f"{self.log_prefix}❌ Context length exceeded - this error cannot be resolved by retrying.")
print(f"{self.log_prefix} 💡 The conversation has accumulated too much content from tool responses.")
logging.error(f"{self.log_prefix}Context length exceeded: {approx_tokens:,} tokens. Cannot continue.")
# Return a partial result instead of crashing
return {
"messages": messages,
"completed": False,
"api_calls": api_call_count,
"error": f"Context length exceeded ({approx_tokens:,} tokens). Conversation terminated early.",
"partial": True
}
if retry_count > max_retries: if retry_count > max_retries:
print(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded. Giving up.") print(f"{self.log_prefix}❌ Max retries ({max_retries}) exceeded. Giving up.")
logging.error(f"{self.log_prefix}API call failed after {max_retries} retries. Last error: {api_error}") logging.error(f"{self.log_prefix}API call failed after {max_retries} retries. Last error: {api_error}")
@ -537,6 +601,43 @@ class AIAgent:
for tc in assistant_message.tool_calls: for tc in assistant_message.tool_calls:
logging.debug(f"Tool call: {tc.function.name} with args: {tc.function.arguments[:200]}...") logging.debug(f"Tool call: {tc.function.name} with args: {tc.function.arguments[:200]}...")
# Validate tool call names - detect model hallucinations
invalid_tool_calls = [
tc.function.name for tc in assistant_message.tool_calls
if tc.function.name not in self.valid_tool_names
]
if invalid_tool_calls:
# Track retries for invalid tool calls
if not hasattr(self, '_invalid_tool_retries'):
self._invalid_tool_retries = 0
self._invalid_tool_retries += 1
invalid_preview = invalid_tool_calls[0][:80] + "..." if len(invalid_tool_calls[0]) > 80 else invalid_tool_calls[0]
print(f"{self.log_prefix}⚠️ Invalid tool call detected: '{invalid_preview}'")
print(f"{self.log_prefix} Valid tools: {sorted(self.valid_tool_names)}")
if self._invalid_tool_retries < 3:
print(f"{self.log_prefix}🔄 Retrying API call ({self._invalid_tool_retries}/3)...")
# Don't add anything to messages, just retry the API call
continue
else:
print(f"{self.log_prefix}❌ Max retries (3) for invalid tool calls exceeded. Stopping as partial.")
# Return partial result - don't include the bad tool call in messages
self._invalid_tool_retries = 0 # Reset for next conversation
return {
"final_response": None,
"messages": messages, # Messages up to last valid point
"api_calls": api_call_count,
"completed": False,
"partial": True,
"error": f"Model generated invalid tool call: {invalid_preview}"
}
# Reset retry counter on successful tool call validation
if hasattr(self, '_invalid_tool_retries'):
self._invalid_tool_retries = 0
# Extract reasoning from response if available (for reasoning models like minimax, kimi, etc.) # Extract reasoning from response if available (for reasoning models like minimax, kimi, etc.)
reasoning_content = None reasoning_content = None
if hasattr(assistant_message, 'reasoning') and assistant_message.reasoning: if hasattr(assistant_message, 'reasoning') and assistant_message.reasoning:
@ -669,7 +770,8 @@ class AIAgent:
"final_response": final_response, "final_response": final_response,
"messages": messages, "messages": messages,
"api_calls": api_call_count, "api_calls": api_call_count,
"completed": completed "completed": completed,
"partial": False # True only when stopped due to invalid tool calls
} }
def chat(self, message: str) -> str: def chat(self, message: str) -> str:

26
run_datagen_glm4.7.sh Executable file
View file

@ -0,0 +1,26 @@
#!/bin/bash
# Run SFT data generation with GLM-4.7 (thinking) over the science task
# distribution, logging all output to a timestamped file under logs/.

# Create logs directory if it doesn't exist
mkdir -p logs

# Generate log filename with timestamp
LOG_FILE="logs/glm4.7-thinking-sft1_$(date +%Y%m%d_%H%M%S).log"
echo "📝 Logging output to: $LOG_FILE"

# Run the batch generator, teeing output to the log file.
# NOTE: `| tee` would normally mask batch_runner.py's exit status (the
# pipeline's status is tee's, and the script would always exit 0). We
# capture PIPESTATUS[0] and propagate it so callers/CI see real failures.
python batch_runner.py \
    --dataset_file="source-data/hermes-agent-agent-tasks-1/agent_tasks_sft_1.jsonl" \
    --batch_size=25 \
    --run_name="megascience_glm4.7-thinking-sft1" \
    --distribution="science" \
    --model="z-ai/glm-4.7" \
    --base_url="https://openrouter.ai/api/v1" \
    --providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
    --num_workers=10 \
    --max_turns=60 \
    --ephemeral_system_prompt="You have access to a variety of tools to help you solve scientific, math, and technology problems presented to you. You can use them in sequence and build off of the results of prior tools you've used. Always use the terminal or search tool if it can provide additional context, verify formulas, double check concepts and recent studies and understanding, doing all calculations, etc. You should only be confident in your own reasoning, knowledge, or calculations if you've exhaustively used all tools available to you to that can help you verify or validate your work. Always pip install any packages you need to use the python scripts you want to run. If you need to use a tool that isn't available, you can use the terminal tool to install or create it in many cases as well. Do not use the terminal tool to communicate with the user, as they cannot see your commands, only your final response after completing the task. Search for at least 3 sources, but not more than 12." \
    2>&1 | tee "$LOG_FILE"
RUN_STATUS=${PIPESTATUS[0]}

echo "✅ Log saved to: $LOG_FILE"
exit "$RUN_STATUS"

# --verbose \

View file

@ -6,7 +6,7 @@ This package contains all the specific tool implementations for the Hermes Agent
Each module provides specialized functionality for different capabilities: Each module provides specialized functionality for different capabilities:
- web_tools: Web search, content extraction, and crawling - web_tools: Web search, content extraction, and crawling
- terminal_tool: Command execution on virtual machines - simple_terminal_tool: Simple command execution on virtual machines (no session persistence)
- vision_tools: Image analysis and understanding - vision_tools: Image analysis and understanding
- mixture_of_agents_tool: Multi-model collaborative reasoning - mixture_of_agents_tool: Multi-model collaborative reasoning
- image_generation_tool: Text-to-image generation with upscaling - image_generation_tool: Text-to-image generation with upscaling
@ -23,10 +23,11 @@ from .web_tools import (
check_firecrawl_api_key check_firecrawl_api_key
) )
from .terminal_tool import ( from .simple_terminal_tool import (
terminal_tool, simple_terminal_tool,
check_hecate_requirements, check_requirements as check_terminal_requirements,
TERMINAL_TOOL_DESCRIPTION cleanup_vm,
SIMPLE_TERMINAL_TOOL_DESCRIPTION
) )
from .vision_tools import ( from .vision_tools import (
@ -50,10 +51,11 @@ __all__ = [
'web_extract_tool', 'web_extract_tool',
'web_crawl_tool', 'web_crawl_tool',
'check_firecrawl_api_key', 'check_firecrawl_api_key',
# Terminal tools # Terminal tools (simple - no session persistence)
'terminal_tool', 'simple_terminal_tool',
'check_hecate_requirements', 'check_terminal_requirements',
'TERMINAL_TOOL_DESCRIPTION', 'cleanup_vm',
'SIMPLE_TERMINAL_TOOL_DESCRIPTION',
# Vision tools # Vision tools
'vision_analyze_tool', 'vision_analyze_tool',
'check_vision_requirements', 'check_vision_requirements',

View file

@ -101,6 +101,12 @@ def _cleanup_inactive_vms(vm_lifetime_seconds: int = 300):
else: else:
print(f"[VM Cleanup] Error cleaning up VM for task {task_id}: {e}") print(f"[VM Cleanup] Error cleaning up VM for task {task_id}: {e}")
# Always remove from tracking dicts to prevent infinite retry loops
if task_id in _active_instances:
del _active_instances[task_id]
if task_id in _last_activity:
del _last_activity[task_id]
def _cleanup_thread_worker(): def _cleanup_thread_worker():
"""Background thread worker that periodically cleans up inactive VMs.""" """Background thread worker that periodically cleans up inactive VMs."""
@ -171,48 +177,36 @@ def cleanup_vm(task_id: str):
atexit.register(_stop_cleanup_thread) atexit.register(_stop_cleanup_thread)
def _execute_ssh_command(instance, command: str, timeout: Optional[int] = None) -> Dict[str, Any]: def _execute_command(instance, command: str, timeout: Optional[int] = None) -> Dict[str, Any]:
""" """
Execute a command via SSH on the VM instance. Execute a command on the VM instance using instance.exec() for proper stderr capture.
Args: Args:
instance: MorphVM instance instance: MorphVM instance
command: Command to execute command: Command to execute
timeout: Optional timeout in seconds timeout: Optional timeout in seconds (Note: exec() may not support timeout directly)
Returns: Returns:
dict with stdout, stderr, returncode dict with stdout, stderr, returncode
""" """
ssh_context_manager = None
try: try:
# Use the instance's SSH context manager # Use instance.exec() which properly captures both stdout and stderr
ssh_context_manager = instance.ssh() # (unlike ssh.run() which doesn't capture stderr correctly)
ssh_context = ssh_context_manager.__enter__() result = instance.exec(command)
# Execute the command # Debug logging only for verbose mode or unusual cases
result = ssh_context.run(command, get_pty=False, timeout=timeout or 120) # Note: Non-zero exit codes are normal (model's command failed) - not a tool error
if result.exit_code != 0 and not result.stdout and not result.stderr:
# Close the SSH connection # Only log if we got absolutely no output - might indicate an issue
if ssh_context_manager: print(f"⚠️ Command returned exit={result.exit_code} with no output")
try:
ssh_context_manager.__exit__(None, None, None)
except:
pass
return { return {
"stdout": result.stdout or "", "stdout": result.stdout or "",
"stderr": result.stderr or "", "stderr": result.stderr or "",
"returncode": result.returncode "returncode": result.exit_code
} }
except Exception as e: except Exception as e:
# Close connection on error
if ssh_context_manager:
try:
ssh_context_manager.__exit__(None, None, None)
except:
pass
# Check if it's a timeout # Check if it's a timeout
error_str = str(e).lower() error_str = str(e).lower()
if "timeout" in error_str: if "timeout" in error_str:
@ -224,7 +218,7 @@ def _execute_ssh_command(instance, command: str, timeout: Optional[int] = None)
return { return {
"stdout": "", "stdout": "",
"stderr": f"SSH execution failed: {str(e)}", "stderr": f"Command execution failed: {str(e)}",
"returncode": -1 "returncode": -1
} }
@ -312,7 +306,7 @@ def simple_terminal_tool(
if background: if background:
# Run in background with nohup and redirect output # Run in background with nohup and redirect output
exec_command = f"nohup {command} > /tmp/bg_output.log 2>&1 &" exec_command = f"nohup {command} > /tmp/bg_output.log 2>&1 &"
result = _execute_ssh_command(instance, exec_command, timeout=10) result = _execute_command(instance, exec_command, timeout=10)
# For background tasks, return immediately with info # For background tasks, return immediately with info
if result["returncode"] == 0: if result["returncode"] == 0:
@ -322,24 +316,72 @@ def simple_terminal_tool(
"error": None "error": None
}, ensure_ascii=False) }, ensure_ascii=False)
else: else:
# Include stderr in output but don't set error (command failure, not tool failure)
bg_output = result["stdout"]
if result["stderr"]:
bg_output = f"{bg_output}\n{result['stderr']}" if bg_output else result["stderr"]
return json.dumps({ return json.dumps({
"output": result["stdout"], "output": bg_output,
"exit_code": result["returncode"], "exit_code": result["returncode"],
"error": result["stderr"] "error": None # Only set for actual tool failures
}, ensure_ascii=False) }, ensure_ascii=False)
else: else:
# Run foreground command # Run foreground command with retry logic for transient failures
result = _execute_ssh_command(instance, command, timeout=timeout) max_retries = 3
retry_count = 0
result = None
while retry_count <= max_retries:
result = _execute_command(instance, command, timeout=timeout)
# Check if we should retry (only for transient errors, not normal results)
stdout = result.get("stdout", "")
stderr = result.get("stderr", "")
returncode = result.get("returncode", 0)
should_retry = False
retry_reason = ""
# NOTE: Empty output with exit_code=0 is NORMAL for many commands:
# - File writes: cat > file, echo > file
# - Directory ops: mkdir, cd
# - Silent installs: pip install --quiet
# So we do NOT retry on exit_code=0, even with empty output.
# Only retry on special error codes that suggest transient/infra issues
if not stdout and not stderr and returncode in [-1, 124]:
should_retry = True
retry_reason = f"transient error (code {returncode})"
if should_retry and retry_count < max_retries:
retry_count += 1
wait_time = 2 ** retry_count # Exponential backoff: 2s, 4s, 8s
print(f"⚠️ Terminal: {retry_reason}, retrying in {wait_time}s (attempt {retry_count}/{max_retries})")
time.sleep(wait_time)
continue
# Got a result (success or normal command failure) - exit retry loop
break
# Combine stdout and stderr for output # Combine stdout and stderr for output
output = result["stdout"] output = result["stdout"]
if result["stderr"] and result["returncode"] != 0: if result["stderr"] and result["returncode"] != 0:
output = f"{output}\n{result['stderr']}" if output else result["stderr"] output = f"{output}\n{result['stderr']}" if output else result["stderr"]
# Truncate output if too long (max 50,000 chars to avoid context explosion)
MAX_OUTPUT_CHARS = 50000
if len(output) > MAX_OUTPUT_CHARS:
truncated_notice = f"\n\n... [OUTPUT TRUNCATED - showing last {MAX_OUTPUT_CHARS} chars of {len(output)} total] ..."
output = truncated_notice + output[-MAX_OUTPUT_CHARS:]
# NOTE: error is only set for FUNCTIONAL tool failures (VM issues, timeouts, etc.)
# Non-zero exit codes from the model's commands are NOT tool failures -
# the model can self-correct. The exit_code field tells the model if the command succeeded.
# Retries that eventually succeed also don't count as failures.
return json.dumps({ return json.dumps({
"output": output.strip(), "output": output.strip(),
"exit_code": result["returncode"], "exit_code": result["returncode"],
"error": result["stderr"] if result["returncode"] != 0 else None "error": None # Only set for actual tool failures, not command failures
}, ensure_ascii=False) }, ensure_ascii=False)
except Exception as e: except Exception as e:

View file

@ -270,6 +270,7 @@ def terminal_tool(
except ImportError as import_error: except ImportError as import_error:
return json.dumps({ return json.dumps({
"output": "", "output": "",
"stderr": "",
"screen": "", "screen": "",
"exit_code": -1, "exit_code": -1,
"error": f"Terminal tool is disabled due to import error: {import_error}", "error": f"Terminal tool is disabled due to import error: {import_error}",
@ -287,6 +288,7 @@ def terminal_tool(
if not morph_api_key: if not morph_api_key:
return json.dumps({ return json.dumps({
"output": "", "output": "",
"stderr": "",
"screen": "", "screen": "",
"exit_code": -1, "exit_code": -1,
"error": "MORPH_API_KEY environment variable not set", "error": "MORPH_API_KEY environment variable not set",
@ -349,29 +351,85 @@ def terminal_tool(
# Generate unique tool block ID # Generate unique tool block ID
tool_block_id = f"tool_{uuid.uuid4().hex[:8]}" tool_block_id = f"tool_{uuid.uuid4().hex[:8]}"
# Execute the tool with hecate # Retry configuration for handling transient empty responses
result = run_tool( max_retries = 3
tool_call=tool_call, retry_count = 0
instance=instance,
console=console,
tool_block_id=tool_block_id,
ctx=ctx
)
# Format the result with only essential fields for the LLM while retry_count <= max_retries:
# Map hecate's "stdout" to "output" for compatibility # Execute the tool with hecate
formatted_result = { result = run_tool(
"output": result.get("stdout", result.get("output", "")), tool_call=tool_call,
"screen": result.get("screen", ""), instance=instance,
"exit_code": result.get("returncode", result.get("exit_code", -1)), console=console,
"error": result.get("error") tool_block_id=tool_block_id,
} ctx=ctx
)
return json.dumps(formatted_result, ensure_ascii=False) # Format the result with only essential fields for the LLM
# Map hecate's "stdout" to "output" for compatibility
stdout = result.get("stdout", result.get("output", ""))
stderr = result.get("stderr", "")
exit_code = result.get("returncode", result.get("exit_code", -1))
error = result.get("error")
screen = result.get("screen", "")
# If there's no explicit error but there's stderr, include it in error field
# This helps capture why commands failed even without an explicit error message
if not error and stderr:
error = stderr
# If exit code is non-zero but no error info, note that
elif not error and exit_code and exit_code != 0 and not stdout:
error = f"Command exited with code {exit_code}"
# Check if we should retry:
# 1. Empty output with non-zero exit code (clear failure)
# 2. Completely empty response (may indicate timing/VM issue)
should_retry = False
retry_reason = ""
if not stdout and not stderr and not screen and not error and exit_code == 0:
# Completely empty response - might be a timing issue
should_retry = True
retry_reason = "completely empty response (possible timing issue)"
elif not stdout and not stderr and exit_code != 0 and exit_code != -1:
# Non-zero exit with no output at all - might be transient
should_retry = True
retry_reason = f"empty output with exit code {exit_code}"
if should_retry and retry_count < max_retries:
retry_count += 1
wait_time = 2 ** retry_count # Exponential backoff: 2s, 4s, 8s
print(f"⚠️ Terminal: {retry_reason}, retrying in {wait_time}s (attempt {retry_count}/{max_retries})")
time.sleep(wait_time)
continue
# Success or max retries reached - return the result
formatted_result = {
"output": stdout,
"stderr": stderr, # Now capturing stderr separately too
"screen": screen,
"exit_code": exit_code,
"error": error
}
if retry_count > 0:
formatted_result["retries"] = retry_count
return json.dumps(formatted_result, ensure_ascii=False)
# Should never reach here, but just in case
return json.dumps({
"output": "",
"stderr": "",
"screen": "",
"exit_code": -1,
"error": "Terminal tool: max retries exceeded"
}, ensure_ascii=False)
except Exception as e: except Exception as e:
return json.dumps({ return json.dumps({
"output": "", "output": "",
"stderr": "",
"screen": "", "screen": "",
"exit_code": -1, "exit_code": -1,
"error": f"Failed to execute terminal command: {str(e)}", "error": f"Failed to execute terminal command: {str(e)}",

View file

@ -139,6 +139,9 @@ async def process_content_with_llm(
to intelligently extract key information and create markdown summaries, to intelligently extract key information and create markdown summaries,
significantly reducing token usage while preserving all important information. significantly reducing token usage while preserving all important information.
For very large content (>500k chars), uses chunked processing with synthesis.
For extremely large content (>2M chars), refuses to process entirely.
Args: Args:
content (str): The raw content to process content (str): The raw content to process
url (str): The source URL (for context, optional) url (str): The source URL (for context, optional)
@ -149,13 +152,25 @@ async def process_content_with_llm(
Returns: Returns:
Optional[str]: Processed markdown content, or None if content too short or processing fails Optional[str]: Processed markdown content, or None if content too short or processing fails
""" """
try: # Size thresholds
# Skip processing if content is too short MAX_CONTENT_SIZE = 2_000_000 # 2M chars - refuse entirely above this
if len(content) < min_length: CHUNK_THRESHOLD = 500_000 # 500k chars - use chunked processing above this
print(f"📏 Content too short ({len(content)} < {min_length} chars), skipping LLM processing") CHUNK_SIZE = 100_000 # 100k chars per chunk
return None MAX_OUTPUT_SIZE = 5000 # Hard cap on final output size
print(f"🧠 Processing content with LLM ({len(content)} characters)") try:
content_len = len(content)
# Refuse if content is absurdly large
if content_len > MAX_CONTENT_SIZE:
size_mb = content_len / 1_000_000
print(f"🚫 Content too large ({size_mb:.1f}MB > 2MB limit). Refusing to process.")
return f"[Content too large to process: {size_mb:.1f}MB. Try using web_crawl with specific extraction instructions, or search for a more focused source.]"
# Skip processing if content is too short
if content_len < min_length:
print(f"📏 Content too short ({content_len} < {min_length} chars), skipping LLM processing")
return None
# Create context information # Create context information
context_info = [] context_info = []
@ -163,10 +178,83 @@ async def process_content_with_llm(
context_info.append(f"Title: {title}") context_info.append(f"Title: {title}")
if url: if url:
context_info.append(f"Source: {url}") context_info.append(f"Source: {url}")
context_str = "\n".join(context_info) + "\n\n" if context_info else "" context_str = "\n".join(context_info) + "\n\n" if context_info else ""
# Simplified prompt for better quality markdown output # Check if we need chunked processing
if content_len > CHUNK_THRESHOLD:
print(f"📦 Content large ({content_len:,} chars). Using chunked processing...")
return await _process_large_content_chunked(
content, context_str, model, CHUNK_SIZE, MAX_OUTPUT_SIZE
)
# Standard single-pass processing for normal content
print(f"🧠 Processing content with LLM ({content_len} characters)")
processed_content = await _call_summarizer_llm(content, context_str, model)
if processed_content:
# Enforce output cap
if len(processed_content) > MAX_OUTPUT_SIZE:
processed_content = processed_content[:MAX_OUTPUT_SIZE] + "\n\n[... summary truncated for context management ...]"
# Log compression metrics
processed_length = len(processed_content)
compression_ratio = processed_length / content_len if content_len > 0 else 1.0
print(f"✅ Content processed: {content_len}{processed_length} chars ({compression_ratio:.1%})")
return processed_content
except Exception as e:
print(f"❌ Error processing content with LLM: {str(e)}")
return f"[Failed to process content: {str(e)[:100]}. Content size: {len(content):,} chars]"
async def _call_summarizer_llm(
content: str,
context_str: str,
model: str,
max_tokens: int = 4000,
is_chunk: bool = False,
chunk_info: str = ""
) -> Optional[str]:
"""
Make a single LLM call to summarize content.
Args:
content: The content to summarize
context_str: Context information (title, URL)
model: Model to use
max_tokens: Maximum output tokens
is_chunk: Whether this is a chunk of a larger document
chunk_info: Information about chunk position (e.g., "Chunk 2/5")
Returns:
Summarized content or None on failure
"""
if is_chunk:
# Chunk-specific prompt - aware that this is partial content
system_prompt = """You are an expert content analyst processing a SECTION of a larger document. Your job is to extract and summarize the key information from THIS SECTION ONLY.
Important guidelines for chunk processing:
1. Do NOT write introductions or conclusions - this is a partial document
2. Focus on extracting ALL key facts, figures, data points, and insights from this section
3. Preserve important quotes, code snippets, and specific details verbatim
4. Use bullet points and structured formatting for easy synthesis later
5. Note any references to other sections (e.g., "as mentioned earlier", "see below") without trying to resolve them
Your output will be combined with summaries of other sections, so focus on thorough extraction rather than narrative flow."""
user_prompt = f"""Extract key information from this SECTION of a larger document:
{context_str}{chunk_info}
SECTION CONTENT:
{content}
Extract all important information from this section in a structured format. Focus on facts, data, insights, and key details. Do not add introductions or conclusions."""
else:
# Standard full-document prompt
system_prompt = """You are an expert content analyst. Your job is to process web content and create a comprehensive yet concise summary that preserves all important information while dramatically reducing bulk. system_prompt = """You are an expert content analyst. Your job is to process web content and create a comprehensive yet concise summary that preserves all important information while dramatically reducing bulk.
Create a well-structured markdown summary that includes: Create a well-structured markdown summary that includes:
@ -183,49 +271,155 @@ Your goal is to preserve ALL important information while reducing length. Never
Create a markdown summary that captures all key information in a well-organized, scannable format. Include important quotes and code snippets in their original formatting. Focus on actionable information, specific details, and unique insights.""" Create a markdown summary that captures all key information in a well-organized, scannable format. Include important quotes and code snippets in their original formatting. Focus on actionable information, specific details, and unique insights."""
# Call the LLM asynchronously with retry logic for flaky API # Call the LLM with retry logic
max_retries = 6 max_retries = 6
retry_delay = 2 # Start with 2 seconds retry_delay = 2
last_error = None last_error = None
for attempt in range(max_retries): for attempt in range(max_retries):
try: try:
response = await summarizer_client.chat.completions.create( response = await summarizer_client.chat.completions.create(
model=model, model=model,
messages=[ messages=[
{"role": "system", "content": system_prompt}, {"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt} {"role": "user", "content": user_prompt}
], ],
temperature=0.1, # Low temperature for consistent extraction temperature=0.1,
max_tokens=4000 # Generous limit for comprehensive processing max_tokens=max_tokens
) )
break # Success, exit retry loop return response.choices[0].message.content.strip()
except Exception as api_error: except Exception as api_error:
last_error = api_error last_error = api_error
if attempt < max_retries - 1: if attempt < max_retries - 1:
print(f"⚠️ LLM API call failed (attempt {attempt + 1}/{max_retries}): {str(api_error)[:100]}") print(f"⚠️ LLM API call failed (attempt {attempt + 1}/{max_retries}): {str(api_error)[:100]}")
print(f" Retrying in {retry_delay}s...") print(f" Retrying in {retry_delay}s...")
await asyncio.sleep(retry_delay) await asyncio.sleep(retry_delay)
retry_delay = min(retry_delay * 2, 60) # Exponential backoff: 2s, 4s, 8s, 16s, 32s, 60s retry_delay = min(retry_delay * 2, 60)
else: else:
# All retries exhausted raise last_error
raise last_error
# Get the markdown response directly return None
processed_content = response.choices[0].message.content.strip()
# Calculate compression metrics for logging
original_length = len(content)
processed_length = len(processed_content)
compression_ratio = processed_length / original_length if original_length > 0 else 1.0
print(f"✅ Content processed: {original_length}{processed_length} chars ({compression_ratio:.1%})") async def _process_large_content_chunked(
content: str,
context_str: str,
model: str,
chunk_size: int,
max_output_size: int
) -> Optional[str]:
"""
Process large content by chunking, summarizing each chunk in parallel,
then synthesizing the summaries.
return processed_content Args:
content: The large content to process
context_str: Context information
model: Model to use
chunk_size: Size of each chunk in characters
max_output_size: Maximum final output size
Returns:
Synthesized summary or None on failure
"""
# Split content into chunks
chunks = []
for i in range(0, len(content), chunk_size):
chunk = content[i:i + chunk_size]
chunks.append(chunk)
print(f" 📦 Split into {len(chunks)} chunks of ~{chunk_size:,} chars each")
# Summarize each chunk in parallel
async def summarize_chunk(chunk_idx: int, chunk_content: str) -> tuple[int, Optional[str]]:
"""Summarize a single chunk."""
try:
chunk_info = f"[Processing chunk {chunk_idx + 1} of {len(chunks)}]"
summary = await _call_summarizer_llm(
chunk_content,
context_str,
model,
max_tokens=2000,
is_chunk=True,
chunk_info=chunk_info
)
if summary:
print(f" ✅ Chunk {chunk_idx + 1}/{len(chunks)} summarized: {len(chunk_content):,}{len(summary):,} chars")
return chunk_idx, summary
except Exception as e:
print(f" ⚠️ Chunk {chunk_idx + 1}/{len(chunks)} failed: {str(e)[:50]}")
return chunk_idx, None
# Run all chunk summarizations in parallel
tasks = [summarize_chunk(i, chunk) for i, chunk in enumerate(chunks)]
results = await asyncio.gather(*tasks)
# Collect successful summaries in order
summaries = []
for chunk_idx, summary in sorted(results, key=lambda x: x[0]):
if summary:
summaries.append(f"## Section {chunk_idx + 1}\n{summary}")
if not summaries:
print(f" ❌ All chunk summarizations failed")
return "[Failed to process large content: all chunk summarizations failed]"
print(f" 📊 Got {len(summaries)}/{len(chunks)} chunk summaries")
# If only one chunk succeeded, just return it (with cap)
if len(summaries) == 1:
result = summaries[0]
if len(result) > max_output_size:
result = result[:max_output_size] + "\n\n[... truncated ...]"
return result
# Synthesize the summaries into a final summary
print(f" 🔗 Synthesizing {len(summaries)} summaries...")
combined_summaries = "\n\n---\n\n".join(summaries)
synthesis_prompt = f"""You have been given summaries of different sections of a large document.
Synthesize these into ONE cohesive, comprehensive summary that:
1. Removes redundancy between sections
2. Preserves all key facts, figures, and actionable information
3. Is well-organized with clear structure
4. Is under {max_output_size} characters
{context_str}SECTION SUMMARIES:
{combined_summaries}
Create a single, unified markdown summary."""
try:
response = await summarizer_client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": "You synthesize multiple summaries into one cohesive, comprehensive summary. Be thorough but concise."},
{"role": "user", "content": synthesis_prompt}
],
temperature=0.1,
max_tokens=4000
)
final_summary = response.choices[0].message.content.strip()
# Enforce hard cap
if len(final_summary) > max_output_size:
final_summary = final_summary[:max_output_size] + "\n\n[... summary truncated for context management ...]"
original_len = len(content)
final_len = len(final_summary)
compression = final_len / original_len if original_len > 0 else 1.0
print(f" ✅ Synthesis complete: {original_len:,}{final_len:,} chars ({compression:.2%})")
return final_summary
except Exception as e: except Exception as e:
print(f"❌ Error processing content with LLM: {str(e)}") print(f" ⚠️ Synthesis failed: {str(e)[:100]}")
return None # Fall back to concatenated summaries with truncation
fallback = "\n\n".join(summaries)
if len(fallback) > max_output_size:
fallback = fallback[:max_output_size] + "\n\n[... truncated due to synthesis failure ...]"
return fallback
def clean_base64_images(text: str) -> str: def clean_base64_images(text: str) -> str: