Refactor BatchRunner and AIAgent for enhanced reasoning and tool management, improved tool definitions for fileops
- Updated `ALL_POSSIBLE_TOOLS` to auto-derive from `TOOL_TO_TOOLSET_MAP` for consistent schema. - Introduced `_extract_reasoning_stats` function to track reasoning coverage in assistant turns. - Enhanced `_process_batch_worker` to discard prompts with no reasoning and aggregate reasoning statistics. - Updated documentation and comments for clarity on new features and changes.
This commit is contained in:
parent
f12ea1bc02
commit
dd70d57b9b
4 changed files with 277 additions and 90 deletions
69
run_agent.py
69
run_agent.py
|
|
@ -1120,6 +1120,24 @@ class AIAgent:
|
|||
return content
|
||||
return content.replace("<REASONING_SCRATCHPAD>", "<think>").replace("</REASONING_SCRATCHPAD>", "</think>")
|
||||
|
||||
@staticmethod
|
||||
def _has_incomplete_scratchpad(content: str) -> bool:
|
||||
"""
|
||||
Check if content has an opening <REASONING_SCRATCHPAD> without a closing tag.
|
||||
|
||||
This indicates the model ran out of output tokens mid-reasoning, producing
|
||||
a broken turn that shouldn't be saved. The caller should retry or discard.
|
||||
|
||||
Args:
|
||||
content: Assistant message content to check
|
||||
|
||||
Returns:
|
||||
True if there's an unclosed scratchpad tag
|
||||
"""
|
||||
if not content:
|
||||
return False
|
||||
return "<REASONING_SCRATCHPAD>" in content and "</REASONING_SCRATCHPAD>" not in content
|
||||
|
||||
def _convert_to_trajectory_format(self, messages: List[Dict[str, Any]], user_query: str, completed: bool) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Convert internal message format to trajectory format for saving.
|
||||
|
|
@ -1204,6 +1222,11 @@ class AIAgent:
|
|||
}
|
||||
content += f"<tool_call>\n{json.dumps(tool_call_json, ensure_ascii=False)}\n</tool_call>\n"
|
||||
|
||||
# Ensure every gpt turn has a <think> block (empty if no reasoning)
|
||||
# so the format is consistent for training data
|
||||
if "<think>" not in content:
|
||||
content = "<think>\n</think>\n" + content
|
||||
|
||||
trajectory.append({
|
||||
"from": "gpt",
|
||||
"value": content.rstrip()
|
||||
|
|
@ -1256,6 +1279,10 @@ class AIAgent:
|
|||
raw_content = msg["content"] or ""
|
||||
content += self._convert_scratchpad_to_think(raw_content)
|
||||
|
||||
# Ensure every gpt turn has a <think> block (empty if no reasoning)
|
||||
if "<think>" not in content:
|
||||
content = "<think>\n</think>\n" + content
|
||||
|
||||
trajectory.append({
|
||||
"from": "gpt",
|
||||
"value": content.strip()
|
||||
|
|
@ -1903,6 +1930,48 @@ class AIAgent:
|
|||
if assistant_message.content and not self.quiet_mode:
|
||||
print(f"{self.log_prefix}🤖 Assistant: {assistant_message.content[:100]}{'...' if len(assistant_message.content) > 100 else ''}")
|
||||
|
||||
# Check for incomplete <REASONING_SCRATCHPAD> (opened but never closed)
|
||||
# This means the model ran out of output tokens mid-reasoning — retry up to 2 times
|
||||
if self._has_incomplete_scratchpad(assistant_message.content or ""):
|
||||
if not hasattr(self, '_incomplete_scratchpad_retries'):
|
||||
self._incomplete_scratchpad_retries = 0
|
||||
self._incomplete_scratchpad_retries += 1
|
||||
|
||||
print(f"{self.log_prefix}⚠️ Incomplete <REASONING_SCRATCHPAD> detected (opened but never closed)")
|
||||
|
||||
if self._incomplete_scratchpad_retries <= 2:
|
||||
print(f"{self.log_prefix}🔄 Retrying API call ({self._incomplete_scratchpad_retries}/2)...")
|
||||
# Don't add the broken message, just retry
|
||||
continue
|
||||
else:
|
||||
# Max retries - discard this turn and save as partial
|
||||
print(f"{self.log_prefix}❌ Max retries (2) for incomplete scratchpad. Saving as partial.")
|
||||
self._incomplete_scratchpad_retries = 0
|
||||
|
||||
rolled_back_messages = self._get_messages_up_to_last_assistant(messages)
|
||||
|
||||
try:
|
||||
cleanup_vm(effective_task_id)
|
||||
except Exception:
|
||||
pass
|
||||
try:
|
||||
cleanup_browser(effective_task_id)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
return {
|
||||
"final_response": None,
|
||||
"messages": rolled_back_messages,
|
||||
"api_calls": api_call_count,
|
||||
"completed": False,
|
||||
"partial": True,
|
||||
"error": "Incomplete REASONING_SCRATCHPAD after 2 retries"
|
||||
}
|
||||
|
||||
# Reset incomplete scratchpad counter on clean response
|
||||
if hasattr(self, '_incomplete_scratchpad_retries'):
|
||||
self._incomplete_scratchpad_retries = 0
|
||||
|
||||
# Check for tool calls
|
||||
if assistant_message.tool_calls:
|
||||
if not self.quiet_mode:
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue