Merge remote-tracking branch 'origin/main' into feat/honcho-async-memory

Made-with: Cursor # Conflicts: # cli.py # tests/test_run_agent.py
2026-03-11 12:22:56 -04:00 · 2026-03-11 12:22:56 -04:00 · a0b0dbe6b2
commit a0b0dbe6b2
parent 047b118299 91101065bb
138 changed files with 17829 additions and 1109 deletions
--- a/run_agent.py
+++ b/run_agent.py
@ -175,6 +175,7 @@ class AIAgent:
        session_id: str = None,
        tool_progress_callback: callable = None,
        thinking_callback: callable = None,
+        reasoning_callback: callable = None,
        clarify_callback: callable = None,
        step_callback: callable = None,
        max_tokens: int = None,
@ -266,6 +267,7 @@ class AIAgent:

        self.tool_progress_callback = tool_progress_callback
        self.thinking_callback = thinking_callback
+        self.reasoning_callback = reasoning_callback
        self.clarify_callback = clarify_callback
        self.step_callback = step_callback
        self._last_reported_tool = None  # Track for "new tool" mode
@ -303,6 +305,13 @@ class AIAgent:
        self._use_prompt_caching = is_openrouter and is_claude
        self._cache_ttl = "5m"  # Default 5-minute TTL (1.25x write cost)
        
+        # Iteration budget pressure: warn the LLM as it approaches max_iterations.
+        # Warnings are injected into the last tool result JSON (not as separate
+        # messages) so they don't break message structure or invalidate caching.
+        self._budget_caution_threshold = 0.7   # 70% — nudge to start wrapping up
+        self._budget_warning_threshold = 0.9   # 90% — urgent, respond now
+        self._budget_pressure_enabled = True
+
        # Persistent error log -- always writes WARNING+ to ~/.hermes/logs/errors.log
        # so tool failures, API errors, etc. are inspectable after the fact.
        from agent.redact import RedactingFormatter
@ -503,6 +512,7 @@ class AIAgent:
        
        # SQLite session store (optional -- provided by CLI or gateway)
        self._session_db = session_db
+        self._last_flushed_db_idx = 0  # tracks DB-write cursor to prevent duplicate writes
        if self._session_db:
            try:
                self._session_db.create_session(
@ -833,45 +843,19 @@ class AIAgent:
        self._save_session_log(messages)
        self._flush_messages_to_session_db(messages, conversation_history)

-    def _log_msg_to_db(self, msg: Dict):
-        """Log a single message to SQLite immediately. Called after each messages.append()."""
-        if not self._session_db:
-            return
-        try:
-            role = msg.get("role", "unknown")
-            content = msg.get("content")
-            tool_calls_data = None
-            if hasattr(msg, "tool_calls") and msg.tool_calls:
-                tool_calls_data = [
-                    {"name": tc.function.name, "arguments": tc.function.arguments}
-                    for tc in msg.tool_calls
-                ]
-            elif isinstance(msg.get("tool_calls"), list):
-                tool_calls_data = msg["tool_calls"]
-            self._session_db.append_message(
-                session_id=self.session_id,
-                role=role,
-                content=content,
-                tool_name=msg.get("tool_name"),
-                tool_calls=tool_calls_data,
-                tool_call_id=msg.get("tool_call_id"),
-                finish_reason=msg.get("finish_reason"),
-            )
-        except Exception as e:
-            logger.debug("Session DB log_msg failed: %s", e)
-
    def _flush_messages_to_session_db(self, messages: List[Dict], conversation_history: List[Dict] = None):
-        """Persist any un-logged messages to the SQLite session store.
+        """Persist any un-flushed messages to the SQLite session store.

-        Called both at the normal end of run_conversation and from every early-
-        return path so that tool calls, tool responses, and assistant messages
-        are never lost even when the conversation errors out.
+        Uses _last_flushed_db_idx to track which messages have already been
+        written, so repeated calls (from multiple exit paths) only write
+        truly new messages — preventing the duplicate-write bug (#860).
        """
        if not self._session_db:
            return
        try:
            start_idx = len(conversation_history) if conversation_history else 0
-            for msg in messages[start_idx:]:
+            flush_from = max(start_idx, self._last_flushed_db_idx)
+            for msg in messages[flush_from:]:
                role = msg.get("role", "unknown")
                content = msg.get("content")
                tool_calls_data = None
@ -891,6 +875,7 @@ class AIAgent:
                    tool_call_id=msg.get("tool_call_id"),
                    finish_reason=msg.get("finish_reason"),
                )
+            self._last_flushed_db_idx = len(messages)
        except Exception as e:
            logger.debug("Session DB append_message failed: %s", e)

@ -1668,7 +1653,14 @@ class AIAgent:
                    prompt_parts.append(user_block)

        has_skills_tools = any(name in self.valid_tool_names for name in ['skills_list', 'skill_view', 'skill_manage'])
-        skills_prompt = build_skills_system_prompt() if has_skills_tools else ""
+        if has_skills_tools:
+            avail_toolsets = {ts for ts, avail in check_toolset_requirements().items() if avail}
+            skills_prompt = build_skills_system_prompt(
+                available_tools=self.valid_tool_names,
+                available_toolsets=avail_toolsets,
+            )
+        else:
+            skills_prompt = ""
        if skills_prompt:
            prompt_parts.append(skills_prompt)

@ -2043,6 +2035,7 @@ class AIAgent:
        allowed_keys = {
            "model", "instructions", "input", "tools", "store",
            "reasoning", "include", "max_output_tokens", "temperature",
+            "tool_choice", "parallel_tool_calls", "prompt_cache_key",
        }
        normalized: Dict[str, Any] = {
            "model": model,
@ -2068,6 +2061,12 @@ class AIAgent:
        if isinstance(temperature, (int, float)):
            normalized["temperature"] = float(temperature)

+        # Pass through tool_choice, parallel_tool_calls, prompt_cache_key
+        for passthrough_key in ("tool_choice", "parallel_tool_calls", "prompt_cache_key"):
+            val = api_kwargs.get(passthrough_key)
+            if val is not None:
+                normalized[passthrough_key] = val
+
        if allow_stream:
            stream = api_kwargs.get("stream")
            if stream is not None and stream is not True:
@ -2604,7 +2603,10 @@ class AIAgent:
                "instructions": instructions,
                "input": self._chat_messages_to_responses_input(payload_messages),
                "tools": self._responses_tools(),
+                "tool_choice": "auto",
+                "parallel_tool_calls": True,
                "store": False,
+                "prompt_cache_key": self.session_id,
            }

            if reasoning_enabled:
@ -2681,6 +2683,12 @@ class AIAgent:
            preview = reasoning_text[:100] + "..." if len(reasoning_text) > 100 else reasoning_text
            logging.debug(f"Captured reasoning ({len(reasoning_text)} chars): {preview}")

+        if reasoning_text and self.reasoning_callback:
+            try:
+                self.reasoning_callback(reasoning_text)
+            except Exception:
+                pass
+
        msg = {
            "role": "assistant",
            "content": assistant_message.content or "",
@ -2900,7 +2908,7 @@ class AIAgent:
            if messages and messages[-1].get("_flush_sentinel") == _sentinel:
                messages.pop()

-    def _compress_context(self, messages: list, system_message: str, *, approx_tokens: int = None) -> tuple:
+    def _compress_context(self, messages: list, system_message: str, *, approx_tokens: int = None, task_id: str = "default") -> tuple:
        """Compress conversation context and split the session in SQLite.

        Returns:
@ -2915,6 +2923,25 @@ class AIAgent:
        if todo_snapshot:
            compressed.append({"role": "user", "content": todo_snapshot})

+        # Preserve file-read history so the model doesn't re-read files
+        # it already examined before compression.
+        try:
+            from tools.file_tools import get_read_files_summary
+            read_files = get_read_files_summary(task_id)
+            if read_files:
+                file_list = "\n".join(
+                    f"  - {f['path']} ({', '.join(f['regions'])})"
+                    for f in read_files
+                )
+                compressed.append({"role": "user", "content": (
+                    "[Files already read in this session — do NOT re-read these]\n"
+                    f"{file_list}\n"
+                    "Use the information from the context summary above. "
+                    "Proceed with writing, editing, or responding."
+                )})
+        except Exception:
+            pass  # Don't break compression if file tracking fails
+
        self._invalidate_system_prompt()
        new_system_prompt = self._build_system_prompt(system_message)
        self._cached_system_prompt = new_system_prompt
@ -2940,12 +2967,14 @@ class AIAgent:
                    except (ValueError, Exception) as e:
                        logger.debug("Could not propagate title on compression: %s", e)
                self._session_db.update_system_prompt(self.session_id, new_system_prompt)
+                # Reset flush cursor — new session starts with no messages written
+                self._last_flushed_db_idx = 0
            except Exception as e:
                logger.debug("Session DB compression split failed: %s", e)

        return compressed, new_system_prompt

-    def _execute_tool_calls(self, assistant_message, messages: list, effective_task_id: str) -> None:
+    def _execute_tool_calls(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
        """Execute tool calls from the assistant message and append results to messages."""
        for i, tool_call in enumerate(assistant_message.tool_calls, 1):
            # SAFETY: check interrupt BEFORE starting each tool.
@ -2963,7 +2992,6 @@ class AIAgent:
                        "tool_call_id": skipped_tc.id,
                    }
                    messages.append(skip_msg)
-                    self._log_msg_to_db(skip_msg)
                break

            function_name = tool_call.function.name
@ -3172,7 +3200,6 @@ class AIAgent:
                "tool_call_id": tool_call.id
            }
            messages.append(tool_msg)
-            self._log_msg_to_db(tool_msg)

            if not self.quiet_mode:
                response_preview = function_result[:self.log_prefix_chars] + "..." if len(function_result) > self.log_prefix_chars else function_result
@ -3189,12 +3216,56 @@ class AIAgent:
                        "tool_call_id": skipped_tc.id
                    }
                    messages.append(skip_msg)
-                    self._log_msg_to_db(skip_msg)
                break

            if self.tool_delay > 0 and i < len(assistant_message.tool_calls):
                time.sleep(self.tool_delay)

+        # ── Budget pressure injection ─────────────────────────────────
+        # After all tool calls in this turn are processed, check if we're
+        # approaching max_iterations. If so, inject a warning into the LAST
+        # tool result's JSON so the LLM sees it naturally when reading results.
+        budget_warning = self._get_budget_warning(api_call_count)
+        if budget_warning and messages and messages[-1].get("role") == "tool":
+            last_content = messages[-1]["content"]
+            try:
+                parsed = json.loads(last_content)
+                if isinstance(parsed, dict):
+                    parsed["_budget_warning"] = budget_warning
+                    messages[-1]["content"] = json.dumps(parsed, ensure_ascii=False)
+                else:
+                    messages[-1]["content"] = last_content + f"\n\n{budget_warning}"
+            except (json.JSONDecodeError, TypeError):
+                messages[-1]["content"] = last_content + f"\n\n{budget_warning}"
+            if not self.quiet_mode:
+                remaining = self.max_iterations - api_call_count
+                tier = "⚠️  WARNING" if remaining <= self.max_iterations * 0.1 else "💡 CAUTION"
+                print(f"{self.log_prefix}{tier}: {remaining} iterations remaining")
+
+    def _get_budget_warning(self, api_call_count: int) -> Optional[str]:
+        """Return a budget pressure string, or None if not yet needed.
+
+        Two-tier system:
+          - Caution (70%): nudge to consolidate work
+          - Warning (90%): urgent, must respond now
+        """
+        if not self._budget_pressure_enabled or self.max_iterations <= 0:
+            return None
+        progress = api_call_count / self.max_iterations
+        remaining = self.max_iterations - api_call_count
+        if progress >= self._budget_warning_threshold:
+            return (
+                f"[BUDGET WARNING: Iteration {api_call_count}/{self.max_iterations}. "
+                f"Only {remaining} iteration(s) left. "
+                "Provide your final response NOW. No more tool calls unless absolutely critical.]"
+            )
+        if progress >= self._budget_caution_threshold:
+            return (
+                f"[BUDGET: Iteration {api_call_count}/{self.max_iterations}. "
+                f"{remaining} iterations left. Start consolidating your work.]"
+            )
+        return None
+
    def _handle_max_iterations(self, messages: list, api_call_count: int) -> str:
        """Request a summary when max iterations are reached. Returns the final response text."""
        print(f"⚠️  Reached maximum iterations ({self.max_iterations}). Requesting summary...")
@ -3427,7 +3498,6 @@ class AIAgent:
        # Add user message
        user_msg = {"role": "user", "content": user_message}
        messages.append(user_msg)
-        self._log_msg_to_db(user_msg)
        
        if not self.quiet_mode:
            print(f"💬 Starting conversation: '{user_message[:60]}{'...' if len(user_message) > 60 else ''}'")
@ -3509,7 +3579,8 @@ class AIAgent:
                for _pass in range(3):
                    _orig_len = len(messages)
                    messages, active_system_prompt = self._compress_context(
-                        messages, system_message, approx_tokens=_preflight_tokens
+                        messages, system_message, approx_tokens=_preflight_tokens,
+                        task_id=effective_task_id,
                    )
                    if len(messages) >= _orig_len:
                        break  # Cannot compress further
@ -3660,7 +3731,7 @@ class AIAgent:
            
            api_start_time = time.time()
            retry_count = 0
-            max_retries = 6  # Increased to allow longer backoff periods
+            max_retries = 3
            compression_attempts = 0
            max_compression_attempts = 3
            codex_auth_retry_attempted = False
@ -3827,7 +3898,6 @@ class AIAgent:
                                length_continue_retries += 1
                                interim_msg = self._build_assistant_message(assistant_message, finish_reason)
                                messages.append(interim_msg)
-                                self._log_msg_to_db(interim_msg)
                                if assistant_message.content:
                                    truncated_response_prefix += assistant_message.content

@ -3845,7 +3915,6 @@ class AIAgent:
                                        ),
                                    }
                                    messages.append(continue_msg)
-                                    self._log_msg_to_db(continue_msg)
                                    self._session_messages = messages
                                    self._save_session_log(messages)
                                    restart_with_length_continuation = True
@ -4035,7 +4104,8 @@ class AIAgent:

                        original_len = len(messages)
                        messages, active_system_prompt = self._compress_context(
-                            messages, system_message, approx_tokens=approx_tokens
+                            messages, system_message, approx_tokens=approx_tokens,
+                            task_id=effective_task_id,
                        )

                        if len(messages) < original_len:
@ -4064,6 +4134,7 @@ class AIAgent:
                        'token limit', 'too many tokens', 'reduce the length',
                        'exceeds the limit', 'context window',
                        'request entity too large',  # OpenRouter/Nous 413 safety net
+                        'prompt is too long',  # Anthropic: "prompt is too long: N tokens > M maximum"
                    ])
                    
                    if is_context_length_error:
@ -4103,7 +4174,8 @@ class AIAgent:

                        original_len = len(messages)
                        messages, active_system_prompt = self._compress_context(
-                            messages, system_message, approx_tokens=approx_tokens
+                            messages, system_message, approx_tokens=approx_tokens,
+                            task_id=effective_task_id,
                        )

                        if len(messages) < original_len or new_ctx and new_ctx < old_ctx:
@ -4130,8 +4202,11 @@ class AIAgent:
                    # These indicate a problem with the request itself (bad model ID,
                    # invalid API key, forbidden, etc.) and will never succeed on retry.
                    # Note: 413 and context-length errors are excluded — handled above.
+                    # Also catch local validation errors (ValueError, TypeError) — these
+                    # are programming bugs, not transient failures.
+                    is_local_validation_error = isinstance(api_error, (ValueError, TypeError))
                    is_client_status_error = isinstance(status_code, int) and 400 <= status_code < 500 and status_code != 413
-                    is_client_error = (is_client_status_error or any(phrase in error_msg for phrase in [
+                    is_client_error = (is_local_validation_error or is_client_status_error or any(phrase in error_msg for phrase in [
                        'error code: 401', 'error code: 403',
                        'error code: 404', 'error code: 422',
                        'is not a valid model', 'invalid model', 'model not found',
@ -4318,7 +4393,6 @@ class AIAgent:
                        )
                        if not duplicate_interim:
                            messages.append(interim_msg)
-                            self._log_msg_to_db(interim_msg)

                    if self._codex_incomplete_retries < 3:
                        if not self.quiet_mode:
@ -4369,7 +4443,6 @@ class AIAgent:
                        print(f"{self.log_prefix}⚠️  Unknown tool '{invalid_preview}' — sending error to model for self-correction")
                        assistant_msg = self._build_assistant_message(assistant_message, finish_reason)
                        messages.append(assistant_msg)
-                        self._log_msg_to_db(assistant_msg)
                        for tc in assistant_message.tool_calls:
                            if tc.function.name not in self.valid_tool_names:
                                content = f"Tool '{tc.function.name}' does not exist. Available tools: {available}"
@ -4424,7 +4497,6 @@ class AIAgent:
                            )
                            recovery_dict = {"role": "user", "content": recovery_msg}
                            messages.append(recovery_dict)
-                            self._log_msg_to_db(recovery_dict)
                            continue
                    
                    # Reset retry counter on successful JSON validation
@ -4446,9 +4518,9 @@ class AIAgent:
                                print(f"  ┊ 💬 {clean}")
                    
                    messages.append(assistant_msg)
-                    self._log_msg_to_db(assistant_msg)
                    
-                    self._execute_tool_calls(assistant_message, messages, effective_task_id)
+                    _msg_count_before_tools = len(messages)
+                    self._execute_tool_calls(assistant_message, messages, effective_task_id, api_call_count)

                    # Refund the iteration if the ONLY tool(s) called were
                    # execute_code (programmatic tool calling).  These are
@ -4457,10 +4529,24 @@ class AIAgent:
                    if _tc_names == {"execute_code"}:
                        self.iteration_budget.refund()
                    
-                    if self.compression_enabled and self.context_compressor.should_compress():
+                    # Estimate next prompt size using real token counts from the
+                    # last API response + rough estimate of newly appended tool
+                    # results.  This catches cases where tool results push the
+                    # context past the limit that last_prompt_tokens alone misses
+                    # (e.g. large file reads, web extractions).
+                    _compressor = self.context_compressor
+                    _new_tool_msgs = messages[_msg_count_before_tools:]
+                    _new_chars = sum(len(str(m.get("content", "") or "")) for m in _new_tool_msgs)
+                    _estimated_next_prompt = (
+                        _compressor.last_prompt_tokens
+                        + _compressor.last_completion_tokens
+                        + _new_chars // 3  # conservative: JSON-heavy tool results ≈ 3 chars/token
+                    )
+                    if self.compression_enabled and _compressor.should_compress(_estimated_next_prompt):
                        messages, active_system_prompt = self._compress_context(
                            messages, system_message,
-                            approx_tokens=self.context_compressor.last_prompt_tokens
+                            approx_tokens=self.context_compressor.last_prompt_tokens,
+                            task_id=effective_task_id,
                        )
                    
                    # Save session log incrementally (so progress is visible even if interrupted)
@ -4549,7 +4635,6 @@ class AIAgent:
                                "finish_reason": finish_reason,
                            }
                            messages.append(empty_msg)
-                            self._log_msg_to_db(empty_msg)
                            
                            self._cleanup_task_resources(effective_task_id)
                            self._persist_session(messages, conversation_history)
@ -4580,7 +4665,6 @@ class AIAgent:
                        codex_ack_continuations += 1
                        interim_msg = self._build_assistant_message(assistant_message, "incomplete")
                        messages.append(interim_msg)
-                        self._log_msg_to_db(interim_msg)

                        continue_msg = {
                            "role": "user",
@ -4590,7 +4674,6 @@ class AIAgent:
                            ),
                        }
                        messages.append(continue_msg)
-                        self._log_msg_to_db(continue_msg)
                        self._session_messages = messages
                        self._save_session_log(messages)
                        continue
@ -4606,7 +4689,6 @@ class AIAgent:
                    final_msg = self._build_assistant_message(assistant_message, finish_reason)
                    
                    messages.append(final_msg)
-                    self._log_msg_to_db(final_msg)
                    
                    if not self.quiet_mode:
                        print(f"🎉 Conversation completed after {api_call_count} OpenAI-compatible API call(s)")
@ -4643,7 +4725,6 @@ class AIAgent:
                                    "content": f"Error executing tool: {error_msg}",
                                }
                                messages.append(err_msg)
-                                self._log_msg_to_db(err_msg)
                        pending_handled = True
                    break
                
@ -4656,7 +4737,6 @@ class AIAgent:
                        "content": f"[System error during processing: {error_msg}]",
                    }
                    messages.append(sys_err_msg)
-                    self._log_msg_to_db(sys_err_msg)
                
                # If we're near the limit, break to avoid infinite loops
                if api_call_count >= self.max_iterations - 1:
@ -4688,9 +4768,17 @@ class AIAgent:
            self._honcho_sync(original_user_message, final_response)
            self._queue_honcho_prefetch(original_user_message)

+        # Extract reasoning from the last assistant message (if any)
+        last_reasoning = None
+        for msg in reversed(messages):
+            if msg.get("role") == "assistant" and msg.get("reasoning"):
+                last_reasoning = msg["reasoning"]
+                break
+
        # Build result with interrupt info if applicable
        result = {
            "final_response": final_response,
+            "last_reasoning": last_reasoning,
            "messages": messages,
            "api_calls": api_call_count,
            "completed": completed,