Refactor BatchRunner and AIAgent for enhanced reasoning and tool management; improve tool definitions for fileops

- Updated `ALL_POSSIBLE_TOOLS` to auto-derive from `TOOL_TO_TOOLSET_MAP` for consistent schema.
- Introduced `_extract_reasoning_stats` function to track reasoning coverage in assistant turns.
- Enhanced `_process_batch_worker` to discard prompts with no reasoning and aggregate reasoning statistics.
- Updated documentation and comments for clarity on new features and changes.
2026-02-08 20:19:14 +00:00 · 2026-02-08 20:19:14 +00:00 · dd70d57b9b
commit dd70d57b9b
parent f12ea1bc02
4 changed files with 277 additions and 90 deletions
--- a/run_agent.py
+++ b/run_agent.py
@@ -1120,6 +1120,24 @@ class AIAgent:
            return content
        return content.replace("<REASONING_SCRATCHPAD>", "<think>").replace("</REASONING_SCRATCHPAD>", "</think>")
    
+    @staticmethod
+    def _has_incomplete_scratchpad(content: str) -> bool:
+        """
+        Check if content ends inside an unclosed <REASONING_SCRATCHPAD> block.
+        
+        Compares the last opening tag with the last closing tag, so a turn with
+        an earlier closed scratchpad and a later truncated one is still caught.
+        
+        Args:
+            content: Assistant message content to check (may be empty or None)
+            
+        Returns:
+            True if the last opening tag has no closing tag after it
+        """
+        if not content:
+            return False
+        return content.rfind("<REASONING_SCRATCHPAD>") > content.rfind("</REASONING_SCRATCHPAD>")
+    
    def _convert_to_trajectory_format(self, messages: List[Dict[str, Any]], user_query: str, completed: bool) -> List[Dict[str, Any]]:
        """
        Convert internal message format to trajectory format for saving.
@@ -1204,6 +1222,11 @@ class AIAgent:
                        }
                        content += f"<tool_call>\n{json.dumps(tool_call_json, ensure_ascii=False)}\n</tool_call>\n"
                    
+                    # Ensure every gpt turn has a <think> block (empty if no reasoning)
+                    # so the format is consistent for training data
+                    if "<think>" not in content:
+                        content = "<think>\n</think>\n" + content
+                    
                    trajectory.append({
                        "from": "gpt",
                        "value": content.rstrip()
@@ -1256,6 +1279,10 @@ class AIAgent:
                    raw_content = msg["content"] or ""
                    content += self._convert_scratchpad_to_think(raw_content)
                    
+                    # Ensure every gpt turn has a <think> block (empty if no reasoning)
+                    if "<think>" not in content:
+                        content = "<think>\n</think>\n" + content
+                    
                    trajectory.append({
                        "from": "gpt",
                        "value": content.strip()
@@ -1903,6 +1930,48 @@ class AIAgent:
                if assistant_message.content and not self.quiet_mode:
                    print(f"{self.log_prefix}🤖 Assistant: {assistant_message.content[:100]}{'...' if len(assistant_message.content) > 100 else ''}")
                
+                # Check for incomplete <REASONING_SCRATCHPAD> (opened but never closed)
+                # This means the model ran out of output tokens mid-reasoning — retry up to 2 times
+                if self._has_incomplete_scratchpad(assistant_message.content or ""):
+                    if not hasattr(self, '_incomplete_scratchpad_retries'):
+                        self._incomplete_scratchpad_retries = 0
+                    self._incomplete_scratchpad_retries += 1
+                    
+                    print(f"{self.log_prefix}⚠️  Incomplete <REASONING_SCRATCHPAD> detected (opened but never closed)")
+                    
+                    if self._incomplete_scratchpad_retries <= 2:
+                        print(f"{self.log_prefix}🔄 Retrying API call ({self._incomplete_scratchpad_retries}/2)...")
+                        # Don't add the broken message, just retry
+                        continue
+                    else:
+                        # Max retries - discard this turn and save as partial
+                        print(f"{self.log_prefix}❌ Max retries (2) for incomplete scratchpad. Saving as partial.")
+                        self._incomplete_scratchpad_retries = 0
+                        
+                        rolled_back_messages = self._get_messages_up_to_last_assistant(messages)
+                        
+                        try:
+                            cleanup_vm(effective_task_id)
+                        except Exception:
+                            pass
+                        try:
+                            cleanup_browser(effective_task_id)
+                        except Exception:
+                            pass
+                        
+                        return {
+                            "final_response": None,
+                            "messages": rolled_back_messages,
+                            "api_calls": api_call_count,
+                            "completed": False,
+                            "partial": True,
+                            "error": "Incomplete REASONING_SCRATCHPAD after 2 retries"
+                        }
+                
+                # Reset incomplete scratchpad counter on clean response
+                if hasattr(self, '_incomplete_scratchpad_retries'):
+                    self._incomplete_scratchpad_retries = 0
+                
                # Check for tool calls
                if assistant_message.tool_calls:
                    if not self.quiet_mode: