Merge remote-tracking branch 'origin/main' into feat/honcho-async-memory

Made-with: Cursor

# Conflicts:
#	cli.py
#	tests/test_run_agent.py
This commit is contained in:
Erosika 2026-03-11 12:22:56 -04:00
commit a0b0dbe6b2
138 changed files with 17829 additions and 1109 deletions

View file

@ -175,6 +175,7 @@ class AIAgent:
session_id: str = None,
tool_progress_callback: callable = None,
thinking_callback: callable = None,
reasoning_callback: callable = None,
clarify_callback: callable = None,
step_callback: callable = None,
max_tokens: int = None,
@ -266,6 +267,7 @@ class AIAgent:
self.tool_progress_callback = tool_progress_callback
self.thinking_callback = thinking_callback
self.reasoning_callback = reasoning_callback
self.clarify_callback = clarify_callback
self.step_callback = step_callback
self._last_reported_tool = None # Track for "new tool" mode
@ -303,6 +305,13 @@ class AIAgent:
self._use_prompt_caching = is_openrouter and is_claude
self._cache_ttl = "5m" # Default 5-minute TTL (1.25x write cost)
# Iteration budget pressure: warn the LLM as it approaches max_iterations.
# Warnings are injected into the last tool result JSON (not as separate
# messages) so they don't break message structure or invalidate caching.
self._budget_caution_threshold = 0.7 # 70% — nudge to start wrapping up
self._budget_warning_threshold = 0.9 # 90% — urgent, respond now
self._budget_pressure_enabled = True
# Persistent error log -- always writes WARNING+ to ~/.hermes/logs/errors.log
# so tool failures, API errors, etc. are inspectable after the fact.
from agent.redact import RedactingFormatter
@ -503,6 +512,7 @@ class AIAgent:
# SQLite session store (optional -- provided by CLI or gateway)
self._session_db = session_db
self._last_flushed_db_idx = 0 # tracks DB-write cursor to prevent duplicate writes
if self._session_db:
try:
self._session_db.create_session(
@ -833,45 +843,19 @@ class AIAgent:
self._save_session_log(messages)
self._flush_messages_to_session_db(messages, conversation_history)
def _log_msg_to_db(self, msg: Dict):
"""Log a single message to SQLite immediately. Called after each messages.append()."""
if not self._session_db:
return
try:
role = msg.get("role", "unknown")
content = msg.get("content")
tool_calls_data = None
if hasattr(msg, "tool_calls") and msg.tool_calls:
tool_calls_data = [
{"name": tc.function.name, "arguments": tc.function.arguments}
for tc in msg.tool_calls
]
elif isinstance(msg.get("tool_calls"), list):
tool_calls_data = msg["tool_calls"]
self._session_db.append_message(
session_id=self.session_id,
role=role,
content=content,
tool_name=msg.get("tool_name"),
tool_calls=tool_calls_data,
tool_call_id=msg.get("tool_call_id"),
finish_reason=msg.get("finish_reason"),
)
except Exception as e:
logger.debug("Session DB log_msg failed: %s", e)
def _flush_messages_to_session_db(self, messages: List[Dict], conversation_history: List[Dict] = None):
"""Persist any un-logged messages to the SQLite session store.
"""Persist any un-flushed messages to the SQLite session store.
Called both at the normal end of run_conversation and from every early-
return path so that tool calls, tool responses, and assistant messages
are never lost even when the conversation errors out.
Uses _last_flushed_db_idx to track which messages have already been
written, so repeated calls (from multiple exit paths) only write
truly new messages preventing the duplicate-write bug (#860).
"""
if not self._session_db:
return
try:
start_idx = len(conversation_history) if conversation_history else 0
for msg in messages[start_idx:]:
flush_from = max(start_idx, self._last_flushed_db_idx)
for msg in messages[flush_from:]:
role = msg.get("role", "unknown")
content = msg.get("content")
tool_calls_data = None
@ -891,6 +875,7 @@ class AIAgent:
tool_call_id=msg.get("tool_call_id"),
finish_reason=msg.get("finish_reason"),
)
self._last_flushed_db_idx = len(messages)
except Exception as e:
logger.debug("Session DB append_message failed: %s", e)
@ -1668,7 +1653,14 @@ class AIAgent:
prompt_parts.append(user_block)
has_skills_tools = any(name in self.valid_tool_names for name in ['skills_list', 'skill_view', 'skill_manage'])
skills_prompt = build_skills_system_prompt() if has_skills_tools else ""
if has_skills_tools:
avail_toolsets = {ts for ts, avail in check_toolset_requirements().items() if avail}
skills_prompt = build_skills_system_prompt(
available_tools=self.valid_tool_names,
available_toolsets=avail_toolsets,
)
else:
skills_prompt = ""
if skills_prompt:
prompt_parts.append(skills_prompt)
@ -2043,6 +2035,7 @@ class AIAgent:
allowed_keys = {
"model", "instructions", "input", "tools", "store",
"reasoning", "include", "max_output_tokens", "temperature",
"tool_choice", "parallel_tool_calls", "prompt_cache_key",
}
normalized: Dict[str, Any] = {
"model": model,
@ -2068,6 +2061,12 @@ class AIAgent:
if isinstance(temperature, (int, float)):
normalized["temperature"] = float(temperature)
# Pass through tool_choice, parallel_tool_calls, prompt_cache_key
for passthrough_key in ("tool_choice", "parallel_tool_calls", "prompt_cache_key"):
val = api_kwargs.get(passthrough_key)
if val is not None:
normalized[passthrough_key] = val
if allow_stream:
stream = api_kwargs.get("stream")
if stream is not None and stream is not True:
@ -2604,7 +2603,10 @@ class AIAgent:
"instructions": instructions,
"input": self._chat_messages_to_responses_input(payload_messages),
"tools": self._responses_tools(),
"tool_choice": "auto",
"parallel_tool_calls": True,
"store": False,
"prompt_cache_key": self.session_id,
}
if reasoning_enabled:
@ -2681,6 +2683,12 @@ class AIAgent:
preview = reasoning_text[:100] + "..." if len(reasoning_text) > 100 else reasoning_text
logging.debug(f"Captured reasoning ({len(reasoning_text)} chars): {preview}")
if reasoning_text and self.reasoning_callback:
try:
self.reasoning_callback(reasoning_text)
except Exception:
pass
msg = {
"role": "assistant",
"content": assistant_message.content or "",
@ -2900,7 +2908,7 @@ class AIAgent:
if messages and messages[-1].get("_flush_sentinel") == _sentinel:
messages.pop()
def _compress_context(self, messages: list, system_message: str, *, approx_tokens: int = None) -> tuple:
def _compress_context(self, messages: list, system_message: str, *, approx_tokens: int = None, task_id: str = "default") -> tuple:
"""Compress conversation context and split the session in SQLite.
Returns:
@ -2915,6 +2923,25 @@ class AIAgent:
if todo_snapshot:
compressed.append({"role": "user", "content": todo_snapshot})
# Preserve file-read history so the model doesn't re-read files
# it already examined before compression.
try:
from tools.file_tools import get_read_files_summary
read_files = get_read_files_summary(task_id)
if read_files:
file_list = "\n".join(
f" - {f['path']} ({', '.join(f['regions'])})"
for f in read_files
)
compressed.append({"role": "user", "content": (
"[Files already read in this session — do NOT re-read these]\n"
f"{file_list}\n"
"Use the information from the context summary above. "
"Proceed with writing, editing, or responding."
)})
except Exception:
pass # Don't break compression if file tracking fails
self._invalidate_system_prompt()
new_system_prompt = self._build_system_prompt(system_message)
self._cached_system_prompt = new_system_prompt
@ -2940,12 +2967,14 @@ class AIAgent:
except (ValueError, Exception) as e:
logger.debug("Could not propagate title on compression: %s", e)
self._session_db.update_system_prompt(self.session_id, new_system_prompt)
# Reset flush cursor — new session starts with no messages written
self._last_flushed_db_idx = 0
except Exception as e:
logger.debug("Session DB compression split failed: %s", e)
return compressed, new_system_prompt
def _execute_tool_calls(self, assistant_message, messages: list, effective_task_id: str) -> None:
def _execute_tool_calls(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
"""Execute tool calls from the assistant message and append results to messages."""
for i, tool_call in enumerate(assistant_message.tool_calls, 1):
# SAFETY: check interrupt BEFORE starting each tool.
@ -2963,7 +2992,6 @@ class AIAgent:
"tool_call_id": skipped_tc.id,
}
messages.append(skip_msg)
self._log_msg_to_db(skip_msg)
break
function_name = tool_call.function.name
@ -3172,7 +3200,6 @@ class AIAgent:
"tool_call_id": tool_call.id
}
messages.append(tool_msg)
self._log_msg_to_db(tool_msg)
if not self.quiet_mode:
response_preview = function_result[:self.log_prefix_chars] + "..." if len(function_result) > self.log_prefix_chars else function_result
@ -3189,12 +3216,56 @@ class AIAgent:
"tool_call_id": skipped_tc.id
}
messages.append(skip_msg)
self._log_msg_to_db(skip_msg)
break
if self.tool_delay > 0 and i < len(assistant_message.tool_calls):
time.sleep(self.tool_delay)
# ── Budget pressure injection ─────────────────────────────────
# After all tool calls in this turn are processed, check if we're
# approaching max_iterations. If so, inject a warning into the LAST
# tool result's JSON so the LLM sees it naturally when reading results.
budget_warning = self._get_budget_warning(api_call_count)
if budget_warning and messages and messages[-1].get("role") == "tool":
last_content = messages[-1]["content"]
try:
parsed = json.loads(last_content)
if isinstance(parsed, dict):
parsed["_budget_warning"] = budget_warning
messages[-1]["content"] = json.dumps(parsed, ensure_ascii=False)
else:
messages[-1]["content"] = last_content + f"\n\n{budget_warning}"
except (json.JSONDecodeError, TypeError):
messages[-1]["content"] = last_content + f"\n\n{budget_warning}"
if not self.quiet_mode:
remaining = self.max_iterations - api_call_count
tier = "⚠️ WARNING" if remaining <= self.max_iterations * 0.1 else "💡 CAUTION"
print(f"{self.log_prefix}{tier}: {remaining} iterations remaining")
def _get_budget_warning(self, api_call_count: int) -> Optional[str]:
"""Return a budget pressure string, or None if not yet needed.
Two-tier system:
- Caution (70%): nudge to consolidate work
- Warning (90%): urgent, must respond now
"""
if not self._budget_pressure_enabled or self.max_iterations <= 0:
return None
progress = api_call_count / self.max_iterations
remaining = self.max_iterations - api_call_count
if progress >= self._budget_warning_threshold:
return (
f"[BUDGET WARNING: Iteration {api_call_count}/{self.max_iterations}. "
f"Only {remaining} iteration(s) left. "
"Provide your final response NOW. No more tool calls unless absolutely critical.]"
)
if progress >= self._budget_caution_threshold:
return (
f"[BUDGET: Iteration {api_call_count}/{self.max_iterations}. "
f"{remaining} iterations left. Start consolidating your work.]"
)
return None
def _handle_max_iterations(self, messages: list, api_call_count: int) -> str:
"""Request a summary when max iterations are reached. Returns the final response text."""
print(f"⚠️ Reached maximum iterations ({self.max_iterations}). Requesting summary...")
@ -3427,7 +3498,6 @@ class AIAgent:
# Add user message
user_msg = {"role": "user", "content": user_message}
messages.append(user_msg)
self._log_msg_to_db(user_msg)
if not self.quiet_mode:
print(f"💬 Starting conversation: '{user_message[:60]}{'...' if len(user_message) > 60 else ''}'")
@ -3509,7 +3579,8 @@ class AIAgent:
for _pass in range(3):
_orig_len = len(messages)
messages, active_system_prompt = self._compress_context(
messages, system_message, approx_tokens=_preflight_tokens
messages, system_message, approx_tokens=_preflight_tokens,
task_id=effective_task_id,
)
if len(messages) >= _orig_len:
break # Cannot compress further
@ -3660,7 +3731,7 @@ class AIAgent:
api_start_time = time.time()
retry_count = 0
max_retries = 6 # Increased to allow longer backoff periods
max_retries = 3
compression_attempts = 0
max_compression_attempts = 3
codex_auth_retry_attempted = False
@ -3827,7 +3898,6 @@ class AIAgent:
length_continue_retries += 1
interim_msg = self._build_assistant_message(assistant_message, finish_reason)
messages.append(interim_msg)
self._log_msg_to_db(interim_msg)
if assistant_message.content:
truncated_response_prefix += assistant_message.content
@ -3845,7 +3915,6 @@ class AIAgent:
),
}
messages.append(continue_msg)
self._log_msg_to_db(continue_msg)
self._session_messages = messages
self._save_session_log(messages)
restart_with_length_continuation = True
@ -4035,7 +4104,8 @@ class AIAgent:
original_len = len(messages)
messages, active_system_prompt = self._compress_context(
messages, system_message, approx_tokens=approx_tokens
messages, system_message, approx_tokens=approx_tokens,
task_id=effective_task_id,
)
if len(messages) < original_len:
@ -4064,6 +4134,7 @@ class AIAgent:
'token limit', 'too many tokens', 'reduce the length',
'exceeds the limit', 'context window',
'request entity too large', # OpenRouter/Nous 413 safety net
'prompt is too long', # Anthropic: "prompt is too long: N tokens > M maximum"
])
if is_context_length_error:
@ -4103,7 +4174,8 @@ class AIAgent:
original_len = len(messages)
messages, active_system_prompt = self._compress_context(
messages, system_message, approx_tokens=approx_tokens
messages, system_message, approx_tokens=approx_tokens,
task_id=effective_task_id,
)
if len(messages) < original_len or new_ctx and new_ctx < old_ctx:
@ -4130,8 +4202,11 @@ class AIAgent:
# These indicate a problem with the request itself (bad model ID,
# invalid API key, forbidden, etc.) and will never succeed on retry.
# Note: 413 and context-length errors are excluded — handled above.
# Also catch local validation errors (ValueError, TypeError) — these
# are programming bugs, not transient failures.
is_local_validation_error = isinstance(api_error, (ValueError, TypeError))
is_client_status_error = isinstance(status_code, int) and 400 <= status_code < 500 and status_code != 413
is_client_error = (is_client_status_error or any(phrase in error_msg for phrase in [
is_client_error = (is_local_validation_error or is_client_status_error or any(phrase in error_msg for phrase in [
'error code: 401', 'error code: 403',
'error code: 404', 'error code: 422',
'is not a valid model', 'invalid model', 'model not found',
@ -4318,7 +4393,6 @@ class AIAgent:
)
if not duplicate_interim:
messages.append(interim_msg)
self._log_msg_to_db(interim_msg)
if self._codex_incomplete_retries < 3:
if not self.quiet_mode:
@ -4369,7 +4443,6 @@ class AIAgent:
print(f"{self.log_prefix}⚠️ Unknown tool '{invalid_preview}' — sending error to model for self-correction")
assistant_msg = self._build_assistant_message(assistant_message, finish_reason)
messages.append(assistant_msg)
self._log_msg_to_db(assistant_msg)
for tc in assistant_message.tool_calls:
if tc.function.name not in self.valid_tool_names:
content = f"Tool '{tc.function.name}' does not exist. Available tools: {available}"
@ -4424,7 +4497,6 @@ class AIAgent:
)
recovery_dict = {"role": "user", "content": recovery_msg}
messages.append(recovery_dict)
self._log_msg_to_db(recovery_dict)
continue
# Reset retry counter on successful JSON validation
@ -4446,9 +4518,9 @@ class AIAgent:
print(f" ┊ 💬 {clean}")
messages.append(assistant_msg)
self._log_msg_to_db(assistant_msg)
self._execute_tool_calls(assistant_message, messages, effective_task_id)
_msg_count_before_tools = len(messages)
self._execute_tool_calls(assistant_message, messages, effective_task_id, api_call_count)
# Refund the iteration if the ONLY tool(s) called were
# execute_code (programmatic tool calling). These are
@ -4457,10 +4529,24 @@ class AIAgent:
if _tc_names == {"execute_code"}:
self.iteration_budget.refund()
if self.compression_enabled and self.context_compressor.should_compress():
# Estimate next prompt size using real token counts from the
# last API response + rough estimate of newly appended tool
# results. This catches cases where tool results push the
# context past the limit that last_prompt_tokens alone misses
# (e.g. large file reads, web extractions).
_compressor = self.context_compressor
_new_tool_msgs = messages[_msg_count_before_tools:]
_new_chars = sum(len(str(m.get("content", "") or "")) for m in _new_tool_msgs)
_estimated_next_prompt = (
_compressor.last_prompt_tokens
+ _compressor.last_completion_tokens
+ _new_chars // 3 # conservative: JSON-heavy tool results ≈ 3 chars/token
)
if self.compression_enabled and _compressor.should_compress(_estimated_next_prompt):
messages, active_system_prompt = self._compress_context(
messages, system_message,
approx_tokens=self.context_compressor.last_prompt_tokens
approx_tokens=self.context_compressor.last_prompt_tokens,
task_id=effective_task_id,
)
# Save session log incrementally (so progress is visible even if interrupted)
@ -4549,7 +4635,6 @@ class AIAgent:
"finish_reason": finish_reason,
}
messages.append(empty_msg)
self._log_msg_to_db(empty_msg)
self._cleanup_task_resources(effective_task_id)
self._persist_session(messages, conversation_history)
@ -4580,7 +4665,6 @@ class AIAgent:
codex_ack_continuations += 1
interim_msg = self._build_assistant_message(assistant_message, "incomplete")
messages.append(interim_msg)
self._log_msg_to_db(interim_msg)
continue_msg = {
"role": "user",
@ -4590,7 +4674,6 @@ class AIAgent:
),
}
messages.append(continue_msg)
self._log_msg_to_db(continue_msg)
self._session_messages = messages
self._save_session_log(messages)
continue
@ -4606,7 +4689,6 @@ class AIAgent:
final_msg = self._build_assistant_message(assistant_message, finish_reason)
messages.append(final_msg)
self._log_msg_to_db(final_msg)
if not self.quiet_mode:
print(f"🎉 Conversation completed after {api_call_count} OpenAI-compatible API call(s)")
@ -4643,7 +4725,6 @@ class AIAgent:
"content": f"Error executing tool: {error_msg}",
}
messages.append(err_msg)
self._log_msg_to_db(err_msg)
pending_handled = True
break
@ -4656,7 +4737,6 @@ class AIAgent:
"content": f"[System error during processing: {error_msg}]",
}
messages.append(sys_err_msg)
self._log_msg_to_db(sys_err_msg)
# If we're near the limit, break to avoid infinite loops
if api_call_count >= self.max_iterations - 1:
@ -4688,9 +4768,17 @@ class AIAgent:
self._honcho_sync(original_user_message, final_response)
self._queue_honcho_prefetch(original_user_message)
# Extract reasoning from the last assistant message (if any)
last_reasoning = None
for msg in reversed(messages):
if msg.get("role") == "assistant" and msg.get("reasoning"):
last_reasoning = msg["reasoning"]
break
# Build result with interrupt info if applicable
result = {
"final_response": final_response,
"last_reasoning": last_reasoning,
"messages": messages,
"api_calls": api_call_count,
"completed": completed,