feat: call_llm/async_call_llm + config slots + migrate all consumers
Add centralized call_llm() and async_call_llm() functions that own the full LLM request lifecycle:

1. Resolve provider + model from task config or explicit args
2. Get or create a cached client for that provider
3. Format request args (max_tokens handling, provider extra_body)
4. Make the API call with max_tokens/max_completion_tokens retry
5. Return the response

Config: expanded auxiliary section with provider:model slots for all tasks
(compression, vision, web_extract, session_search, skills_hub, mcp,
flush_memories). Config version bumped to 7.

Migrated all auxiliary consumers:

- context_compressor.py: uses call_llm(task='compression')
- vision_tools.py: uses async_call_llm(task='vision')
- web_tools.py: uses async_call_llm(task='web_extract')
- session_search_tool.py: uses async_call_llm(task='session_search')
- browser_tool.py: uses call_llm(task='vision'/'web_extract')
- mcp_tool.py: uses call_llm(task='mcp')
- skills_guard.py: uses call_llm(provider='openrouter')
- run_agent.py flush_memories: uses call_llm(task='flush_memories')

Tests updated for context_compressor and MCP tool. Some test mocks still need
updating (15 remaining failures from mock pattern changes, 2 pre-existing).
parent 013cc4d2fc
commit 0aa31cd3cb

13 changed files with 552 additions and 375 deletions
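For reference, a minimal sketch of the new consumer-side pattern described above (the prompt strings and the override model slug are illustrative, not from this commit; async_call_llm mirrors the same signature):

from agent.auxiliary_client import call_llm, async_call_llm

# Task-routed call: provider and model come from config/env for "compression".
response = call_llm(
    task="compression",
    messages=[{"role": "user", "content": "Summarize the following turns ..."}],
    temperature=0.3,
    max_tokens=512,
)
print(response.choices[0].message.content)

# Explicit override: bypasses task resolution entirely (explicit args win).
response = call_llm(
    provider="openrouter",
    model="google/gemini-2.5-flash",
    messages=[{"role": "user", "content": "ping"}],
)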
agent/auxiliary_client.py
@@ -784,3 +784,253 @@ def auxiliary_max_tokens_param(value: int) -> dict:
             and "api.openai.com" in custom_base.lower()):
         return {"max_completion_tokens": value}
     return {"max_tokens": value}
+
+
+# ── Centralized LLM Call API ────────────────────────────────────────────────
+#
+# call_llm() and async_call_llm() own the full request lifecycle:
+# 1. Resolve provider + model from task config (or explicit args)
+# 2. Get or create a cached client for that provider
+# 3. Format request args for the provider + model (max_tokens handling, etc.)
+# 4. Make the API call
+# 5. Return the response
+#
+# Every auxiliary LLM consumer should use these instead of manually
+# constructing clients and calling .chat.completions.create().
+
+# Client cache: (provider, async_mode) -> (client, default_model)
+_client_cache: Dict[tuple, tuple] = {}
+
+
+def _get_cached_client(
+    provider: str, model: str = None, async_mode: bool = False,
+) -> Tuple[Optional[Any], Optional[str]]:
+    """Get or create a cached client for the given provider."""
+    cache_key = (provider, async_mode)
+    if cache_key in _client_cache:
+        cached_client, cached_default = _client_cache[cache_key]
+        return cached_client, model or cached_default
+    client, default_model = resolve_provider_client(provider, model, async_mode)
+    if client is not None:
+        _client_cache[cache_key] = (client, default_model)
+    return client, model or default_model
+
+
+def _resolve_task_provider_model(
+    task: str = None,
+    provider: str = None,
+    model: str = None,
+) -> Tuple[str, Optional[str]]:
+    """Determine provider + model for a call.
+
+    Priority:
+    1. Explicit provider/model args (always win)
+    2. Env var overrides (AUXILIARY_{TASK}_PROVIDER, etc.)
+    3. Config file (auxiliary.{task}.provider/model or compression.*)
+    4. "auto" (full auto-detection chain)
+
+    Returns (provider, model) where model may be None (use provider default).
+    """
+    if provider:
+        return provider, model
+
+    if task:
+        # Check env var overrides first
+        env_provider = _get_auxiliary_provider(task)
+        if env_provider != "auto":
+            # Check for env var model override too
+            env_model = None
+            for prefix in ("AUXILIARY_", "CONTEXT_"):
+                val = os.getenv(f"{prefix}{task.upper()}_MODEL", "").strip()
+                if val:
+                    env_model = val
+                    break
+            return env_provider, model or env_model
+
+        # Read from config file
+        try:
+            from hermes_cli.config import load_config
+            config = load_config()
+        except ImportError:
+            return "auto", model
+
+        # Check auxiliary.{task} section
+        aux = config.get("auxiliary", {})
+        task_config = aux.get(task, {})
+        cfg_provider = task_config.get("provider", "").strip() or None
+        cfg_model = task_config.get("model", "").strip() or None
+
+        # Backwards compat: compression section has its own keys
+        if task == "compression" and not cfg_provider:
+            comp = config.get("compression", {})
+            cfg_provider = comp.get("summary_provider", "").strip() or None
+            cfg_model = cfg_model or comp.get("summary_model", "").strip() or None
+
+        if cfg_provider and cfg_provider != "auto":
+            return cfg_provider, model or cfg_model
+        return "auto", model or cfg_model
+
+    return "auto", model
+
+
+def _build_call_kwargs(
+    provider: str,
+    model: str,
+    messages: list,
+    temperature: Optional[float] = None,
+    max_tokens: Optional[int] = None,
+    tools: Optional[list] = None,
+    timeout: float = 30.0,
+    extra_body: Optional[dict] = None,
+) -> dict:
+    """Build kwargs for .chat.completions.create() with model/provider adjustments."""
+    kwargs: Dict[str, Any] = {
+        "model": model,
+        "messages": messages,
+        "timeout": timeout,
+    }
+
+    if temperature is not None:
+        kwargs["temperature"] = temperature
+
+    if max_tokens is not None:
+        # Codex adapter handles max_tokens internally; OpenRouter/Nous use max_tokens.
+        # Direct OpenAI api.openai.com with newer models needs max_completion_tokens.
+        if provider == "custom":
+            custom_base = os.getenv("OPENAI_BASE_URL", "")
+            if "api.openai.com" in custom_base.lower():
+                kwargs["max_completion_tokens"] = max_tokens
+            else:
+                kwargs["max_tokens"] = max_tokens
+        else:
+            kwargs["max_tokens"] = max_tokens
+
+    if tools:
+        kwargs["tools"] = tools
+
+    # Provider-specific extra_body
+    merged_extra = dict(extra_body or {})
+    if provider == "nous" or auxiliary_is_nous:
+        merged_extra.setdefault("tags", []).extend(["product=hermes-agent"])
+    if merged_extra:
+        kwargs["extra_body"] = merged_extra
+
+    return kwargs
+
+
+def call_llm(
+    task: str = None,
+    *,
+    provider: str = None,
+    model: str = None,
+    messages: list,
+    temperature: float = None,
+    max_tokens: int = None,
+    tools: list = None,
+    timeout: float = 30.0,
+    extra_body: dict = None,
+) -> Any:
+    """Centralized synchronous LLM call.
+
+    Resolves provider + model (from task config, explicit args, or auto-detect),
+    handles auth, request formatting, and model-specific arg adjustments.
+
+    Args:
+        task: Auxiliary task name ("compression", "vision", "web_extract",
+            "session_search", "skills_hub", "mcp", "flush_memories").
+            Reads provider:model from config/env. Ignored if provider is set.
+        provider: Explicit provider override.
+        model: Explicit model override.
+        messages: Chat messages list.
+        temperature: Sampling temperature (None = provider default).
+        max_tokens: Max output tokens (handles max_tokens vs max_completion_tokens).
+        tools: Tool definitions (for function calling).
+        timeout: Request timeout in seconds.
+        extra_body: Additional request body fields.
+
+    Returns:
+        Response object with .choices[0].message.content
+
+    Raises:
+        RuntimeError: If no provider is configured.
+    """
+    resolved_provider, resolved_model = _resolve_task_provider_model(
+        task, provider, model)
+
+    client, final_model = _get_cached_client(resolved_provider, resolved_model)
+    if client is None:
+        # Fallback: try openrouter
+        if resolved_provider != "openrouter":
+            logger.warning("Provider %s unavailable, falling back to openrouter",
+                           resolved_provider)
+            client, final_model = _get_cached_client(
+                "openrouter", resolved_model or _OPENROUTER_MODEL)
+        if client is None:
+            raise RuntimeError(
+                f"No LLM provider configured for task={task} provider={resolved_provider}. "
+                f"Run: hermes setup")
+
+    kwargs = _build_call_kwargs(
+        resolved_provider, final_model, messages,
+        temperature=temperature, max_tokens=max_tokens,
+        tools=tools, timeout=timeout, extra_body=extra_body)
+
+    # Handle max_tokens vs max_completion_tokens retry
+    try:
+        return client.chat.completions.create(**kwargs)
+    except Exception as first_err:
+        err_str = str(first_err)
+        if "max_tokens" in err_str or "unsupported_parameter" in err_str:
+            kwargs.pop("max_tokens", None)
+            kwargs["max_completion_tokens"] = max_tokens
+            return client.chat.completions.create(**kwargs)
+        raise
+
+
+async def async_call_llm(
+    task: str = None,
+    *,
+    provider: str = None,
+    model: str = None,
+    messages: list,
+    temperature: float = None,
+    max_tokens: int = None,
+    tools: list = None,
+    timeout: float = 30.0,
+    extra_body: dict = None,
+) -> Any:
+    """Centralized asynchronous LLM call.
+
+    Same as call_llm() but async. See call_llm() for full documentation.
+    """
+    resolved_provider, resolved_model = _resolve_task_provider_model(
+        task, provider, model)
+
+    client, final_model = _get_cached_client(
+        resolved_provider, resolved_model, async_mode=True)
+    if client is None:
+        if resolved_provider != "openrouter":
+            logger.warning("Provider %s unavailable, falling back to openrouter",
+                           resolved_provider)
+            client, final_model = _get_cached_client(
+                "openrouter", resolved_model or _OPENROUTER_MODEL,
+                async_mode=True)
+        if client is None:
+            raise RuntimeError(
+                f"No LLM provider configured for task={task} provider={resolved_provider}. "
+                f"Run: hermes setup")
+
+    kwargs = _build_call_kwargs(
+        resolved_provider, final_model, messages,
+        temperature=temperature, max_tokens=max_tokens,
+        tools=tools, timeout=timeout, extra_body=extra_body)
+
+    try:
+        return await client.chat.completions.create(**kwargs)
+    except Exception as first_err:
+        err_str = str(first_err)
+        if "max_tokens" in err_str or "unsupported_parameter" in err_str:
+            kwargs.pop("max_tokens", None)
+            kwargs["max_completion_tokens"] = max_tokens
+            return await client.chat.completions.create(**kwargs)
+        raise
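The env-var layer (priority 2 in the docstring above) keys off the task name via the AUXILIARY_{TASK}_PROVIDER / AUXILIARY_{TASK}_MODEL pattern used in _resolve_task_provider_model. A sketch with example values:

import os
from agent.auxiliary_client import call_llm

# Env override, checked before the config file; the values are examples.
os.environ["AUXILIARY_SESSION_SEARCH_PROVIDER"] = "openrouter"
os.environ["AUXILIARY_SESSION_SEARCH_MODEL"] = "google/gemini-2.5-flash"

# This call now routes to openrouter with the model above, without touching
# the auxiliary.session_search slot in the config file.
resp = call_llm(
    task="session_search",
    messages=[{"role": "user", "content": "find sessions about billing"}],
)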
agent/context_compressor.py
@@ -9,7 +9,7 @@ import logging
 import os
 from typing import Any, Dict, List, Optional
 
-from agent.auxiliary_client import get_text_auxiliary_client
+from agent.auxiliary_client import call_llm
 from agent.model_metadata import (
     get_model_context_length,
     estimate_messages_tokens_rough,

@@ -53,8 +53,7 @@ class ContextCompressor:
         self.last_completion_tokens = 0
         self.last_total_tokens = 0
 
-        self.client, default_model = get_text_auxiliary_client("compression")
-        self.summary_model = summary_model_override or default_model
+        self.summary_model = summary_model_override or ""
 
     def update_from_response(self, usage: Dict[str, Any]):
         """Update tracked token usage from API response."""
@@ -120,73 +119,30 @@ TURNS TO SUMMARIZE:
 
 Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
 
-        # 1. Try the auxiliary model (cheap/fast)
-        if self.client:
-            try:
-                return self._call_summary_model(self.client, self.summary_model, prompt)
-            except Exception as e:
-                logging.warning(f"Failed to generate context summary with auxiliary model: {e}")
-
-        # 2. Fallback: re-try via the centralized provider router.
-        # This covers all configured providers (Codex OAuth, API-key
-        # providers, etc.) without ad-hoc env var lookups.
-        from agent.auxiliary_client import resolve_provider_client
-        fallback_providers = ["custom", "openrouter", "nous", "codex"]
-        for fb_provider in fallback_providers:
-            try:
-                fb_client, fb_model = resolve_provider_client(
-                    fb_provider, model=self.model)
-                if fb_client is None:
-                    continue
-                # Don't retry the same client that just failed
-                if (self.client is not None
-                        and hasattr(fb_client, "base_url")
-                        and hasattr(self.client, "base_url")
-                        and str(fb_client.base_url) == str(self.client.base_url)):
-                    continue
-                logger.info("Retrying context summary with fallback provider "
-                            "%s (%s)", fb_provider, fb_model)
-                summary = self._call_summary_model(fb_client, fb_model, prompt)
-                # Promote successful fallback for future compressions
-                self.client = fb_client
-                self.summary_model = fb_model
-                return summary
-            except Exception as fallback_err:
-                logging.warning("Fallback provider %s failed: %s",
-                                fb_provider, fallback_err)
-
-        # 3. All providers failed — return None so the caller drops turns
-        # without a summary.
-        logging.warning("Context compression: no provider available for "
-                        "summary. Middle turns will be dropped without summary.")
-        return None
-
-    def _call_summary_model(self, client, model: str, prompt: str) -> str:
-        """Make the actual LLM call to generate a summary. Raises on failure."""
-        kwargs = {
-            "model": model,
-            "messages": [{"role": "user", "content": prompt}],
-            "temperature": 0.3,
-            "timeout": 30.0,
-        }
-        # Most providers (OpenRouter, local models) use max_tokens.
-        # Direct OpenAI with newer models (gpt-4o, o-series, gpt-5+)
-        # requires max_completion_tokens instead.
-        try:
-            kwargs["max_tokens"] = self.summary_target_tokens * 2
-            response = client.chat.completions.create(**kwargs)
-        except Exception as first_err:
-            if "max_tokens" in str(first_err) or "unsupported_parameter" in str(first_err):
-                kwargs.pop("max_tokens", None)
-                kwargs["max_completion_tokens"] = self.summary_target_tokens * 2
-                response = client.chat.completions.create(**kwargs)
-            else:
-                raise
-
-        summary = response.choices[0].message.content.strip()
-        if not summary.startswith("[CONTEXT SUMMARY]:"):
-            summary = "[CONTEXT SUMMARY]: " + summary
-        return summary
+        # Use the centralized LLM router — handles provider resolution,
+        # auth, and fallback internally.
+        try:
+            call_kwargs = {
+                "task": "compression",
+                "messages": [{"role": "user", "content": prompt}],
+                "temperature": 0.3,
+                "max_tokens": self.summary_target_tokens * 2,
+                "timeout": 30.0,
+            }
+            if self.summary_model:
+                call_kwargs["model"] = self.summary_model
+            response = call_llm(**call_kwargs)
+            summary = response.choices[0].message.content.strip()
+            if not summary.startswith("[CONTEXT SUMMARY]:"):
+                summary = "[CONTEXT SUMMARY]: " + summary
+            return summary
+        except RuntimeError:
+            logging.warning("Context compression: no provider available for "
+                            "summary. Middle turns will be dropped without summary.")
+            return None
+        except Exception as e:
+            logging.warning("Failed to generate context summary: %s", e)
+            return None
 
     # ------------------------------------------------------------------
     # Tool-call / tool-result pair integrity helpers
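The failure contract is worth spelling out. A sketch against the new code (constructor arguments mirror the tests further down; the messages are illustrative):

from unittest.mock import patch
from agent.context_compressor import ContextCompressor

with patch("agent.context_compressor.get_model_context_length", return_value=100000):
    c = ContextCompressor(model="test/model", quiet_mode=True)

messages = [{"role": "user", "content": "hello"},
            {"role": "assistant", "content": "hi there"}]

# call_llm raises RuntimeError when no provider is configured; the compressor
# catches it and returns None, so callers drop middle turns without a summary.
with patch("agent.context_compressor.call_llm",
           side_effect=RuntimeError("No LLM provider configured")):
    assert c._generate_summary(messages) is None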
hermes_cli/config.py
@@ -125,17 +125,41 @@ DEFAULT_CONFIG = {
         "summary_provider": "auto",
     },
 
-    # Auxiliary model overrides (advanced). By default Hermes auto-selects
-    # the provider and model for each side task. Set these to override.
+    # Auxiliary model config — provider:model for each side task.
+    # Format: provider is the provider name, model is the model slug.
+    # "auto" for provider = auto-detect best available provider.
+    # Empty model = use provider's default auxiliary model.
+    # All tasks fall back to openrouter:google/gemini-3-flash-preview if
+    # the configured provider is unavailable.
     "auxiliary": {
         "vision": {
-            "provider": "auto",  # auto | openrouter | nous | main
+            "provider": "auto",  # auto | openrouter | nous | codex | custom
             "model": "",  # e.g. "google/gemini-2.5-flash", "gpt-4o"
         },
         "web_extract": {
             "provider": "auto",
             "model": "",
         },
+        "compression": {
+            "provider": "auto",
+            "model": "",
+        },
+        "session_search": {
+            "provider": "auto",
+            "model": "",
+        },
+        "skills_hub": {
+            "provider": "auto",
+            "model": "",
+        },
+        "mcp": {
+            "provider": "auto",
+            "model": "",
+        },
+        "flush_memories": {
+            "provider": "auto",
+            "model": "",
+        },
     },
 
     "display": {
@@ -217,7 +241,7 @@ DEFAULT_CONFIG = {
     "personalities": {},
 
     # Config schema version - bump this when adding new required fields
-    "_config_version": 6,
+    "_config_version": 7,
 }
 
 # =============================================================================
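Pinning one slot, as a user-config fragment mirroring DEFAULT_CONFIG (the on-disk serialization format of the user config is not shown in this diff, so this is written as the Python dict):

"auxiliary": {
    "vision": {
        "provider": "openrouter",            # pin the provider for vision calls
        "model": "google/gemini-2.5-flash",  # example slug; empty = provider default
    },
    # Unlisted tasks keep provider "auto" and fall back to
    # openrouter:google/gemini-3-flash-preview when unavailable.
},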
run_agent.py (31 changes)
@@ -2623,19 +2623,22 @@ class AIAgent:
 
         # Use auxiliary client for the flush call when available --
         # it's cheaper and avoids Codex Responses API incompatibility.
-        from agent.auxiliary_client import get_text_auxiliary_client
-        aux_client, aux_model = get_text_auxiliary_client()
+        from agent.auxiliary_client import call_llm as _call_llm
+        _aux_available = True
+        try:
+            response = _call_llm(
+                task="flush_memories",
+                messages=api_messages,
+                tools=[memory_tool_def],
+                temperature=0.3,
+                max_tokens=5120,
+                timeout=30.0,
+            )
+        except RuntimeError:
+            _aux_available = False
+            response = None
 
-        if aux_client:
-            api_kwargs = {
-                "model": aux_model,
-                "messages": api_messages,
-                "tools": [memory_tool_def],
-                "temperature": 0.3,
-                "max_tokens": 5120,
-            }
-            response = aux_client.chat.completions.create(**api_kwargs, timeout=30.0)
-        elif self.api_mode == "codex_responses":
+        if not _aux_available and self.api_mode == "codex_responses":
             # No auxiliary client -- use the Codex Responses path directly
             codex_kwargs = self._build_api_kwargs(api_messages)
             codex_kwargs["tools"] = self._responses_tools([memory_tool_def])
@@ -2643,7 +2646,7 @@ class AIAgent:
             if "max_output_tokens" in codex_kwargs:
                 codex_kwargs["max_output_tokens"] = 5120
             response = self._run_codex_stream(codex_kwargs)
-        else:
+        elif not _aux_available:
             api_kwargs = {
                 "model": self.model,
                 "messages": api_messages,
@@ -2655,7 +2658,7 @@ class AIAgent:
 
         # Extract tool calls from the response, handling both API formats
         tool_calls = []
-        if self.api_mode == "codex_responses" and not aux_client:
+        if self.api_mode == "codex_responses" and not _aux_available:
             assistant_msg, _ = self._normalize_codex_response(response)
             if assistant_msg and assistant_msg.tool_calls:
                 tool_calls = assistant_msg.tool_calls
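Since tools= now flows through call_llm, the flush path reads tool calls back in the standard chat-completions shape. A hedged sketch; the tool definition below is a stand-in (the real memory_tool_def lives in run_agent.py and is not shown in this diff):

from agent.auxiliary_client import call_llm

memory_tool_def = {
    "type": "function",
    "function": {
        "name": "save_memory",  # stand-in name for this example
        "parameters": {"type": "object", "properties": {}},
    },
}
api_messages = [{"role": "user", "content": "Remember that I prefer dark mode."}]

response = call_llm(
    task="flush_memories",
    messages=api_messages,
    tools=[memory_tool_def],
    temperature=0.3,
    max_tokens=5120,
    timeout=30.0,
)
# Tool calls come back on the message object, as with any OpenAI-style client.
for tc in (response.choices[0].message.tool_calls or []):
    print(tc.function.name, tc.function.arguments)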
tests: context compressor
@@ -9,8 +9,7 @@ from agent.context_compressor import ContextCompressor
 @pytest.fixture()
 def compressor():
     """Create a ContextCompressor with mocked dependencies."""
-    with patch("agent.context_compressor.get_model_context_length", return_value=100000), \
-            patch("agent.context_compressor.get_text_auxiliary_client", return_value=(None, None)):
+    with patch("agent.context_compressor.get_model_context_length", return_value=100000):
         c = ContextCompressor(
             model="test/model",
             threshold_percent=0.85,

@@ -119,14 +118,11 @@ class TestGenerateSummaryNoneContent:
     """Regression: content=None (from tool-call-only assistant messages) must not crash."""
 
     def test_none_content_does_not_crash(self):
-        mock_client = MagicMock()
         mock_response = MagicMock()
         mock_response.choices = [MagicMock()]
         mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: tool calls happened"
-        mock_client.chat.completions.create.return_value = mock_response
 
-        with patch("agent.context_compressor.get_model_context_length", return_value=100000), \
-                patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
+        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
             c = ContextCompressor(model="test", quiet_mode=True)
 
             messages = [

@@ -139,14 +135,14 @@ class TestGenerateSummaryNoneContent:
                 {"role": "user", "content": "thanks"},
             ]
 
-            summary = c._generate_summary(messages)
+            with patch("agent.context_compressor.call_llm", return_value=mock_response):
+                summary = c._generate_summary(messages)
             assert isinstance(summary, str)
             assert "CONTEXT SUMMARY" in summary
 
     def test_none_content_in_system_message_compress(self):
         """System message with content=None should not crash during compress."""
-        with patch("agent.context_compressor.get_model_context_length", return_value=100000), \
-                patch("agent.context_compressor.get_text_auxiliary_client", return_value=(None, None)):
+        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
             c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2)
 
             msgs = [{"role": "system", "content": None}] + [

@@ -165,12 +161,12 @@ class TestCompressWithClient:
         mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened"
         mock_client.chat.completions.create.return_value = mock_response
 
-        with patch("agent.context_compressor.get_model_context_length", return_value=100000), \
-                patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
+        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
             c = ContextCompressor(model="test", quiet_mode=True)
 
             msgs = [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"} for i in range(10)]
-            result = c.compress(msgs)
+            with patch("agent.context_compressor.call_llm", return_value=mock_response):
+                result = c.compress(msgs)
 
             # Should have summary message in the middle
             contents = [m.get("content", "") for m in result]

@@ -184,8 +180,7 @@ class TestCompressWithClient:
         mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compressed middle"
         mock_client.chat.completions.create.return_value = mock_response
 
-        with patch("agent.context_compressor.get_model_context_length", return_value=100000), \
-                patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
+        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
             c = ContextCompressor(
                 model="test",
                 quiet_mode=True,

@@ -212,7 +207,8 @@ class TestCompressWithClient:
                 {"role": "user", "content": "later 4"},
             ]
 
-            result = c.compress(msgs)
+            with patch("agent.context_compressor.call_llm", return_value=mock_response):
+                result = c.compress(msgs)
 
             answered_ids = {
                 msg.get("tool_call_id")

@@ -232,8 +228,7 @@ class TestCompressWithClient:
         mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened"
         mock_client.chat.completions.create.return_value = mock_response
 
-        with patch("agent.context_compressor.get_model_context_length", return_value=100000), \
-                patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
+        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
             c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2)
 
             # Last head message (index 1) is "assistant" → summary should be "user"

@@ -245,7 +240,8 @@ class TestCompressWithClient:
                 {"role": "user", "content": "msg 4"},
                 {"role": "assistant", "content": "msg 5"},
             ]
-            result = c.compress(msgs)
+            with patch("agent.context_compressor.call_llm", return_value=mock_response):
+                result = c.compress(msgs)
             summary_msg = [m for m in result if "CONTEXT SUMMARY" in (m.get("content") or "")]
             assert len(summary_msg) == 1
             assert summary_msg[0]["role"] == "user"

@@ -258,8 +254,7 @@ class TestCompressWithClient:
         mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened"
         mock_client.chat.completions.create.return_value = mock_response
 
-        with patch("agent.context_compressor.get_model_context_length", return_value=100000), \
-                patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
+        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
             c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=3, protect_last_n=2)
 
             # Last head message (index 2) is "user" → summary should be "assistant"

@@ -273,20 +268,18 @@ class TestCompressWithClient:
                 {"role": "user", "content": "msg 6"},
                 {"role": "assistant", "content": "msg 7"},
             ]
-            result = c.compress(msgs)
+            with patch("agent.context_compressor.call_llm", return_value=mock_response):
+                result = c.compress(msgs)
             summary_msg = [m for m in result if "CONTEXT SUMMARY" in (m.get("content") or "")]
             assert len(summary_msg) == 1
             assert summary_msg[0]["role"] == "assistant"
 
     def test_summarization_does_not_start_tail_with_tool_outputs(self):
-        mock_client = MagicMock()
         mock_response = MagicMock()
         mock_response.choices = [MagicMock()]
         mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compressed middle"
-        mock_client.chat.completions.create.return_value = mock_response
 
-        with patch("agent.context_compressor.get_model_context_length", return_value=100000), \
-                patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
+        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
             c = ContextCompressor(
                 model="test",
                 quiet_mode=True,

@@ -309,7 +302,8 @@ class TestCompressWithClient:
                 {"role": "user", "content": "latest user"},
             ]
 
-            result = c.compress(msgs)
+            with patch("agent.context_compressor.call_llm", return_value=mock_response):
+                result = c.compress(msgs)
 
             called_ids = {
                 tc["id"]
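The recurring shape of these test changes: the client is no longer created in __init__, so only the call site needs the router stubbed. A minimal sketch mirroring the hunks above:

from unittest.mock import MagicMock, patch
from agent.context_compressor import ContextCompressor

mock_response = MagicMock()
mock_response.choices = [MagicMock()]
mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened"

# Construction no longer touches any provider, so no client patch is needed.
with patch("agent.context_compressor.get_model_context_length", return_value=100000):
    c = ContextCompressor(model="test", quiet_mode=True)

msgs = [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"}
        for i in range(10)]

# Only compress() reaches the router; patch call_llm where it was imported.
with patch("agent.context_compressor.call_llm", return_value=mock_response):
    result = c.compress(msgs)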
tests: MCP sampling handler
@@ -1828,8 +1828,8 @@ class TestSamplingCallbackText:
         )
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             params = _make_sampling_params()
             result = asyncio.run(self.handler(None, params))

@@ -1847,13 +1847,13 @@ class TestSamplingCallbackText:
         fake_client.chat.completions.create.return_value = _make_llm_response()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
-        ):
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
+        ) as mock_call:
             params = _make_sampling_params(system_prompt="Be helpful")
             asyncio.run(self.handler(None, params))
 
-            call_args = fake_client.chat.completions.create.call_args
+            call_args = mock_call.call_args
             messages = call_args.kwargs["messages"]
             assert messages[0] == {"role": "system", "content": "Be helpful"}

@@ -1865,8 +1865,8 @@ class TestSamplingCallbackText:
         )
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             params = _make_sampling_params()
             result = asyncio.run(self.handler(None, params))

@@ -1889,8 +1889,8 @@ class TestSamplingCallbackToolUse:
         fake_client.chat.completions.create.return_value = _make_llm_tool_response()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             params = _make_sampling_params()
             result = asyncio.run(self.handler(None, params))

@@ -1916,8 +1916,8 @@ class TestSamplingCallbackToolUse:
         )
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             result = asyncio.run(self.handler(None, _make_sampling_params()))
 

@@ -1939,8 +1939,8 @@ class TestToolLoopGovernance:
         fake_client.chat.completions.create.return_value = _make_llm_tool_response()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             params = _make_sampling_params()
             # Round 1, 2: allowed

@@ -1959,8 +1959,8 @@ class TestToolLoopGovernance:
         fake_client = MagicMock()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             # Tool response (round 1 of 1 allowed)
             fake_client.chat.completions.create.return_value = _make_llm_tool_response()

@@ -1984,8 +1984,8 @@ class TestToolLoopGovernance:
         fake_client.chat.completions.create.return_value = _make_llm_tool_response()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
             assert isinstance(result, ErrorData)

@@ -2003,8 +2003,8 @@ class TestSamplingErrors:
         fake_client.chat.completions.create.return_value = _make_llm_response()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             # First call succeeds
             r1 = asyncio.run(handler(None, _make_sampling_params()))

@@ -2017,20 +2017,16 @@ class TestSamplingErrors:
 
     def test_timeout_error(self):
         handler = SamplingHandler("to", {"timeout": 0.05})
-        fake_client = MagicMock()
 
         def slow_call(**kwargs):
             import threading
-            # Use an event to ensure the thread truly blocks long enough
             evt = threading.Event()
             evt.wait(5)  # blocks for up to 5 seconds (cancelled by timeout)
             return _make_llm_response()
 
-        fake_client.chat.completions.create.side_effect = slow_call
-
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            side_effect=slow_call,
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
             assert isinstance(result, ErrorData)

@@ -2041,12 +2037,11 @@ class TestSamplingErrors:
         handler = SamplingHandler("np", {})
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(None, None),
+            "agent.auxiliary_client.call_llm",
+            side_effect=RuntimeError("No LLM provider configured"),
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
             assert isinstance(result, ErrorData)
-            assert "No LLM provider" in result.message
             assert handler.metrics["errors"] == 1
 
     def test_empty_choices_returns_error(self):

@@ -2060,8 +2055,8 @@ class TestSamplingErrors:
         )
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
 

@@ -2080,8 +2075,8 @@ class TestSamplingErrors:
         )
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
 

@@ -2099,8 +2094,8 @@ class TestSamplingErrors:
         )
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
 

@@ -2120,8 +2115,8 @@ class TestModelWhitelist:
         fake_client.chat.completions.create.return_value = _make_llm_response()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "test-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
             assert isinstance(result, CreateMessageResult)

@@ -2131,8 +2126,8 @@ class TestModelWhitelist:
         fake_client = MagicMock()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "gpt-3.5-turbo"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
             assert isinstance(result, ErrorData)

@@ -2145,8 +2140,8 @@ class TestModelWhitelist:
         fake_client.chat.completions.create.return_value = _make_llm_response()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "any-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
             assert isinstance(result, CreateMessageResult)

@@ -2166,8 +2161,8 @@ class TestMalformedToolCallArgs:
         )
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
 

@@ -2194,8 +2189,8 @@ class TestMalformedToolCallArgs:
         fake_client.chat.completions.create.return_value = response
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
 

@@ -2214,8 +2209,8 @@ class TestMetricsTracking:
         fake_client.chat.completions.create.return_value = _make_llm_response()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             asyncio.run(handler(None, _make_sampling_params()))
 

@@ -2229,8 +2224,8 @@ class TestMetricsTracking:
         fake_client.chat.completions.create.return_value = _make_llm_tool_response()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             asyncio.run(handler(None, _make_sampling_params()))
 

@@ -2241,8 +2236,8 @@ class TestMetricsTracking:
         handler = SamplingHandler("met3", {})
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(None, None),
+            "agent.auxiliary_client.call_llm",
+            side_effect=RuntimeError("No LLM provider configured"),
        ):
             asyncio.run(handler(None, _make_sampling_params()))
 
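The MCP test migration distills to two patch targets, independent of any one test. A sketch of both doubles side by side (the canned content is illustrative):

from unittest.mock import MagicMock, patch

fake_client = MagicMock()
fake_client.chat.completions.create.return_value = MagicMock()

# Old target: the client factory, returning a (client, default_model) tuple.
old_style = patch(
    "agent.auxiliary_client.get_text_auxiliary_client",
    return_value=(fake_client, "default-model"),
)

# New target: the router itself; tests reuse the canned response directly,
# and "no provider" becomes a RuntimeError instead of a (None, None) tuple.
new_style = patch(
    "agent.auxiliary_client.call_llm",
    return_value=fake_client.chat.completions.create.return_value,
)
no_provider = patch(
    "agent.auxiliary_client.call_llm",
    side_effect=RuntimeError("No LLM provider configured"),
)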
browser_tool.py
@@ -63,7 +63,7 @@ import time
 import requests
 from typing import Dict, Any, Optional, List
 from pathlib import Path
-from agent.auxiliary_client import get_vision_auxiliary_client, get_text_auxiliary_client
+from agent.auxiliary_client import call_llm
 
 logger = logging.getLogger(__name__)
 

@@ -80,38 +80,15 @@ DEFAULT_SESSION_TIMEOUT = 300
 # Max tokens for snapshot content before summarization
 SNAPSHOT_SUMMARIZE_THRESHOLD = 8000
 
-# Vision client — for browser_vision (screenshot analysis)
-# Wrapped in try/except so a broken auxiliary config doesn't prevent the entire
-# browser_tool module from importing (which would disable all 10 browser tools).
-try:
-    _aux_vision_client, _DEFAULT_VISION_MODEL = get_vision_auxiliary_client()
-except Exception as _init_err:
-    logger.debug("Could not initialise vision auxiliary client: %s", _init_err)
-    _aux_vision_client, _DEFAULT_VISION_MODEL = None, None
-
-# Text client — for page snapshot summarization (same config as web_extract)
-try:
-    _aux_text_client, _DEFAULT_TEXT_MODEL = get_text_auxiliary_client("web_extract")
-except Exception as _init_err:
-    logger.debug("Could not initialise text auxiliary client: %s", _init_err)
-    _aux_text_client, _DEFAULT_TEXT_MODEL = None, None
-
-# Module-level alias for availability checks
-EXTRACTION_MODEL = _DEFAULT_TEXT_MODEL or _DEFAULT_VISION_MODEL
-
-
-def _get_vision_model() -> str:
+def _get_vision_model() -> Optional[str]:
     """Model for browser_vision (screenshot analysis — multimodal)."""
-    return (os.getenv("AUXILIARY_VISION_MODEL", "").strip()
-            or _DEFAULT_VISION_MODEL
-            or "google/gemini-3-flash-preview")
+    return os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None
 
 
-def _get_extraction_model() -> str:
+def _get_extraction_model() -> Optional[str]:
     """Model for page snapshot text summarization — same as web_extract."""
-    return (os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip()
-            or _DEFAULT_TEXT_MODEL
-            or "google/gemini-3-flash-preview")
+    return os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() or None
 
 
 def _is_local_mode() -> bool:

@@ -941,9 +918,6 @@ def _extract_relevant_content(
 
     Falls back to simple truncation when no auxiliary text model is configured.
     """
-    if _aux_text_client is None:
-        return _truncate_snapshot(snapshot_text)
-
     if user_task:
         extraction_prompt = (
             f"You are a content extractor for a browser automation agent.\n\n"

@@ -968,13 +942,16 @@ def _extract_relevant_content(
         )
 
     try:
-        from agent.auxiliary_client import auxiliary_max_tokens_param
-        response = _aux_text_client.chat.completions.create(
-            model=_get_extraction_model(),
-            messages=[{"role": "user", "content": extraction_prompt}],
-            **auxiliary_max_tokens_param(4000),
-            temperature=0.1,
-        )
+        call_kwargs = {
+            "task": "web_extract",
+            "messages": [{"role": "user", "content": extraction_prompt}],
+            "max_tokens": 4000,
+            "temperature": 0.1,
+        }
+        model = _get_extraction_model()
+        if model:
+            call_kwargs["model"] = model
+        response = call_llm(**call_kwargs)
         return response.choices[0].message.content
     except Exception:
         return _truncate_snapshot(snapshot_text)

@@ -1497,14 +1474,6 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
 
     effective_task_id = task_id or "default"
 
-    # Check auxiliary vision client
-    if _aux_vision_client is None or _DEFAULT_VISION_MODEL is None:
-        return json.dumps({
-            "success": False,
-            "error": "Browser vision unavailable: no auxiliary vision model configured. "
-                     "Set OPENROUTER_API_KEY or configure Nous Portal to enable browser vision."
-        }, ensure_ascii=False)
-
     # Save screenshot to persistent location so it can be shared with users
     hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
     screenshots_dir = hermes_home / "browser_screenshots"

@@ -1562,14 +1531,13 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
         f"Focus on answering the user's specific question."
     )
 
-    # Use the sync auxiliary vision client directly
-    from agent.auxiliary_client import auxiliary_max_tokens_param
+    # Use the centralized LLM router
     vision_model = _get_vision_model()
-    logger.debug("browser_vision: analysing screenshot (%d bytes) with model=%s",
-                 len(image_data), vision_model)
-    response = _aux_vision_client.chat.completions.create(
-        model=vision_model,
-        messages=[
+    logger.debug("browser_vision: analysing screenshot (%d bytes)",
+                 len(image_data))
+    call_kwargs = {
+        "task": "vision",
+        "messages": [
             {
                 "role": "user",
                 "content": [

@@ -1578,9 +1546,12 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
                 ],
             }
         ],
-        **auxiliary_max_tokens_param(2000),
-        temperature=0.1,
-    )
+        "max_tokens": 2000,
+        "temperature": 0.1,
+    }
+    if vision_model:
+        call_kwargs["model"] = vision_model
+    response = call_llm(**call_kwargs)
 
     analysis = response.choices[0].message.content
     response_data = {
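With the module-level clients gone, a model pin for browser vision or snapshot extraction is just an env var; unset or empty means "use the task's configured model". A sketch with an example value:

import os

os.environ["AUXILIARY_VISION_MODEL"] = "google/gemini-2.5-flash"  # example slug
os.environ["AUXILIARY_WEB_EXTRACT_MODEL"] = ""

# _get_vision_model()     -> "google/gemini-2.5-flash"
# _get_extraction_model() -> None; call_llm(task="web_extract") then resolves
# the model from config/env as usual.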
mcp_tool.py

@@ -456,17 +456,13 @@ class SamplingHandler:
         # Resolve model
         model = self._resolve_model(getattr(params, "modelPreferences", None))
 
-        # Get auxiliary LLM client
-        from agent.auxiliary_client import get_text_auxiliary_client
-        client, default_model = get_text_auxiliary_client()
-        if client is None:
-            self.metrics["errors"] += 1
-            return self._error("No LLM provider available for sampling")
+        # Get auxiliary LLM client via centralized router
+        from agent.auxiliary_client import call_llm
 
-        resolved_model = model or default_model
+        # Model whitelist check (we need to resolve model before calling)
+        resolved_model = model or self.model_override or ""
 
-        # Model whitelist check
-        if self.allowed_models and resolved_model not in self.allowed_models:
+        if self.allowed_models and resolved_model and resolved_model not in self.allowed_models:
             logger.warning(
                 "MCP server '%s' requested model '%s' not in allowed_models",
                 self.server_name, resolved_model,
@@ -484,20 +480,15 @@ class SamplingHandler:
 
         # Build LLM call kwargs
         max_tokens = min(params.maxTokens, self.max_tokens_cap)
-        call_kwargs: dict = {
-            "model": resolved_model,
-            "messages": messages,
-            "max_tokens": max_tokens,
-        }
+        call_temperature = None
         if hasattr(params, "temperature") and params.temperature is not None:
-            call_kwargs["temperature"] = params.temperature
-        if stop := getattr(params, "stopSequences", None):
-            call_kwargs["stop"] = stop
+            call_temperature = params.temperature
 
         # Forward server-provided tools
+        call_tools = None
         server_tools = getattr(params, "tools", None)
         if server_tools:
-            call_kwargs["tools"] = [
+            call_tools = [
                 {
                     "type": "function",
                     "function": {
@@ -508,9 +499,6 @@
                 }
                 for t in server_tools
             ]
-            if tool_choice := getattr(params, "toolChoice", None):
-                mode = getattr(tool_choice, "mode", "auto")
-                call_kwargs["tool_choice"] = {"auto": "auto", "required": "required", "none": "none"}.get(mode, "auto")
 
         logger.log(
             self.audit_level,
@@ -520,7 +508,15 @@
 
         # Offload sync LLM call to thread (non-blocking)
         def _sync_call():
-            return client.chat.completions.create(**call_kwargs)
+            return call_llm(
+                task="mcp",
+                model=resolved_model or None,
+                messages=messages,
+                temperature=call_temperature,
+                max_tokens=max_tokens,
+                tools=call_tools,
+                timeout=self.timeout,
+            )
 
         try:
             response = await asyncio.wait_for(
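Since call_llm is synchronous, the handler wraps it and pushes it onto a worker thread. A self-contained sketch of that offload-plus-timeout shape (helper name hypothetical, assuming call_llm blocks until the provider responds):

import asyncio

from agent.auxiliary_client import call_llm

async def sample_with_timeout(messages, max_tokens: int, timeout: float):
    # call_llm blocks, so run it on a worker thread to keep the event
    # loop free, and bound the whole call with wait_for.
    def _sync_call():
        return call_llm(task="mcp", messages=messages, max_tokens=max_tokens)

    try:
        return await asyncio.wait_for(asyncio.to_thread(_sync_call), timeout=timeout)
    except asyncio.TimeoutError:
        return None  # callers map this to a sampling error response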
session_search_tool.py

@@ -22,13 +22,7 @@ import os
 import logging
 from typing import Dict, Any, List, Optional, Union
 
-from openai import AsyncOpenAI, OpenAI
-
-from agent.auxiliary_client import get_async_text_auxiliary_client
-
-# Resolve the async auxiliary client at import time so we have the model slug.
-# Handles Codex Responses API adapter transparently.
-_async_aux_client, _SUMMARIZER_MODEL = get_async_text_auxiliary_client()
+from agent.auxiliary_client import async_call_llm
 
 MAX_SESSION_CHARS = 100_000
 MAX_SUMMARY_TOKENS = 10000
@@ -156,26 +150,22 @@ async def _summarize_session(
         f"Summarize this conversation with focus on: {query}"
     )
 
-    if _async_aux_client is None or _SUMMARIZER_MODEL is None:
-        logging.warning("No auxiliary model available for session summarization")
-        return None
-
     max_retries = 3
     for attempt in range(max_retries):
         try:
-            from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
-            _extra = get_auxiliary_extra_body()
-            response = await _async_aux_client.chat.completions.create(
-                model=_SUMMARIZER_MODEL,
+            response = await async_call_llm(
+                task="session_search",
                 messages=[
                     {"role": "system", "content": system_prompt},
                     {"role": "user", "content": user_prompt},
                 ],
-                **({} if not _extra else {"extra_body": _extra}),
                 temperature=0.1,
-                **auxiliary_max_tokens_param(MAX_SUMMARY_TOKENS),
+                max_tokens=MAX_SUMMARY_TOKENS,
             )
             return response.choices[0].message.content.strip()
+        except RuntimeError:
+            logging.warning("No auxiliary model available for session summarization")
+            return None
         except Exception as e:
             if attempt < max_retries - 1:
                 await asyncio.sleep(1 * (attempt + 1))
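The migrated loop now leans on an implicit contract: async_call_llm raises RuntimeError when no provider is configured (permanent, bail out immediately), while anything else is treated as transient and retried with linear backoff. A condensed sketch of that contract, with the error taxonomy as this hunk implies it:

import asyncio
import logging

from agent.auxiliary_client import async_call_llm

async def summarize_with_backoff(messages, max_retries: int = 3):
    for attempt in range(max_retries):
        try:
            response = await async_call_llm(
                task="session_search", messages=messages,
                temperature=0.1, max_tokens=10000)
            return response.choices[0].message.content.strip()
        except RuntimeError:
            # No provider configured: permanent failure, do not retry.
            logging.warning("No auxiliary model available for session summarization")
            return None
        except Exception:
            # Transient API failure: linear backoff, then give up.
            if attempt < max_retries - 1:
                await asyncio.sleep(1 * (attempt + 1))
    return None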
@@ -333,8 +323,6 @@ def session_search(
 
 def check_session_search_requirements() -> bool:
     """Requires SQLite state database and an auxiliary text model."""
-    if _async_aux_client is None:
-        return False
     try:
         from hermes_state import DEFAULT_DB_PATH
         return DEFAULT_DB_PATH.parent.exists()
skills_guard.py

@@ -936,13 +936,10 @@ def llm_audit_skill(skill_path: Path, static_result: ScanResult,
 
     # Call the LLM via the centralized provider router
    try:
-        from agent.auxiliary_client import resolve_provider_client
+        from agent.auxiliary_client import call_llm
 
-        client, _default_model = resolve_provider_client("openrouter")
-        if client is None:
-            return static_result
-
-        response = client.chat.completions.create(
+        response = call_llm(
+            provider="openrouter",
             model=model,
             messages=[{
                 "role": "user",
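Unlike the task-slot consumers, skills_guard pins the provider. A short sketch contrasting the two call shapes, under the same assumptions about call_llm's signature (the model slug is a placeholder):

from agent.auxiliary_client import call_llm

# Task-based: provider and model come from the config slot for the task.
resp_a = call_llm(task="mcp", messages=[{"role": "user", "content": "ping"}])

# Provider-pinned: route to a specific provider, optionally a specific
# model, bypassing task slots (what llm_audit_skill does above).
resp_b = call_llm(
    provider="openrouter",
    model="some/audit-model",  # hypothetical model slug
    messages=[{"role": "user", "content": "ping"}],
)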
vision_tools.py

@@ -37,16 +37,11 @@ from pathlib import Path
 from typing import Any, Awaitable, Dict, Optional
 from urllib.parse import urlparse
 import httpx
-from agent.auxiliary_client import get_async_vision_auxiliary_client
+from agent.auxiliary_client import async_call_llm
 from tools.debug_helpers import DebugSession
 
 logger = logging.getLogger(__name__)
 
-# Resolve vision auxiliary client at module level.
-# Uses get_async_vision_auxiliary_client() which properly handles Codex
-# routing (Responses API adapter) instead of raw AsyncOpenAI construction.
-_aux_async_client, DEFAULT_VISION_MODEL = get_async_vision_auxiliary_client()
-
 _debug = DebugSession("vision_tools", env_var="VISION_TOOLS_DEBUG")
 
 
@@ -185,7 +180,7 @@ def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None)
 async def vision_analyze_tool(
     image_url: str,
     user_prompt: str,
-    model: str = DEFAULT_VISION_MODEL,
+    model: str = None,
 ) -> str:
     """
     Analyze an image from a URL or local file path using vision AI.
@@ -245,15 +240,6 @@ async def vision_analyze_tool(
     logger.info("Analyzing image: %s", image_url[:60])
     logger.info("User prompt: %s", user_prompt[:100])
 
-    # Check auxiliary vision client availability
-    if _aux_async_client is None or DEFAULT_VISION_MODEL is None:
-        logger.error("Vision analysis unavailable: no auxiliary vision model configured")
-        return json.dumps({
-            "success": False,
-            "analysis": "Vision analysis unavailable: no auxiliary vision model configured. "
-                        "Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools."
-        }, indent=2, ensure_ascii=False)
-
     # Determine if this is a local file path or a remote URL
     local_path = Path(image_url)
     if local_path.is_file():
@@ -309,18 +295,18 @@
         }
     ]
 
-    logger.info("Processing image with %s...", model)
+    logger.info("Processing image with vision model...")
 
-    # Call the vision API
-    from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
-    _extra = get_auxiliary_extra_body()
-    response = await _aux_async_client.chat.completions.create(
-        model=model,
-        messages=messages,
-        temperature=0.1,
-        **auxiliary_max_tokens_param(2000),
-        **({} if not _extra else {"extra_body": _extra}),
-    )
+    # Call the vision API via centralized router
+    call_kwargs = {
+        "task": "vision",
+        "messages": messages,
+        "temperature": 0.1,
+        "max_tokens": 2000,
+    }
+    if model:
+        call_kwargs["model"] = model
+    response = await async_call_llm(**call_kwargs)
 
     # Extract the analysis
     analysis = response.choices[0].message.content.strip()
@@ -391,7 +377,18 @@ async def vision_analyze_tool(
 
 def check_vision_requirements() -> bool:
     """Check if an auxiliary vision model is available."""
-    return _aux_async_client is not None
+    try:
+        from agent.auxiliary_client import resolve_provider_client
+        client, _ = resolve_provider_client("openrouter")
+        if client is not None:
+            return True
+        client, _ = resolve_provider_client("nous")
+        if client is not None:
+            return True
+        client, _ = resolve_provider_client("custom")
+        return client is not None
+    except Exception:
+        return False
 
 
 def get_debug_session_info() -> Dict[str, Any]:
@@ -419,10 +416,9 @@ if __name__ == "__main__":
         print("Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools.")
         exit(1)
     else:
-        print(f"✅ Vision model available: {DEFAULT_VISION_MODEL}")
+        print("✅ Vision model available")
 
     print("🛠️ Vision tools ready for use!")
-    print(f"🧠 Using model: {DEFAULT_VISION_MODEL}")
 
     # Show debug mode status
     if _debug.active:
@@ -489,9 +485,7 @@ def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]:
         "Fully describe and explain everything about this image, then answer the "
         f"following question:\n\n{question}"
     )
-    model = (os.getenv("AUXILIARY_VISION_MODEL", "").strip()
-             or DEFAULT_VISION_MODEL
-             or "google/gemini-3-flash-preview")
+    model = os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None
     return vision_analyze_tool(image_url, full_prompt, model)
web_tools.py

@@ -47,8 +47,7 @@ import re
 import asyncio
 from typing import List, Dict, Any, Optional
 from firecrawl import Firecrawl
-from openai import AsyncOpenAI
-from agent.auxiliary_client import get_async_text_auxiliary_client
+from agent.auxiliary_client import async_call_llm
 from tools.debug_helpers import DebugSession
 
 logger = logging.getLogger(__name__)
@@ -83,15 +82,8 @@ def _get_firecrawl_client():
 
 DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000
 
-# Resolve async auxiliary client at module level.
-# Handles Codex Responses API adapter transparently.
-_aux_async_client, _DEFAULT_SUMMARIZER_MODEL = get_async_text_auxiliary_client("web_extract")
-
-# Allow per-task override via config.yaml auxiliary.web_extract_model
-DEFAULT_SUMMARIZER_MODEL = (
-    os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip()
-    or _DEFAULT_SUMMARIZER_MODEL
-)
+# Allow per-task override via env var
+DEFAULT_SUMMARIZER_MODEL = os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() or None
 
 _debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG")
 
@@ -249,22 +241,22 @@ Create a markdown summary that captures all key information in a well-organized,
 
     for attempt in range(max_retries):
         try:
-            if _aux_async_client is None:
-                logger.warning("No auxiliary model available for web content processing")
-                return None
-            from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
-            _extra = get_auxiliary_extra_body()
-            response = await _aux_async_client.chat.completions.create(
-                model=model,
-                messages=[
+            call_kwargs = {
+                "task": "web_extract",
+                "messages": [
                     {"role": "system", "content": system_prompt},
                     {"role": "user", "content": user_prompt}
                 ],
-                temperature=0.1,
-                **auxiliary_max_tokens_param(max_tokens),
-                **({} if not _extra else {"extra_body": _extra}),
-            )
+                "temperature": 0.1,
+                "max_tokens": max_tokens,
+            }
+            if model:
+                call_kwargs["model"] = model
+            response = await async_call_llm(**call_kwargs)
             return response.choices[0].message.content.strip()
+        except RuntimeError:
+            logger.warning("No auxiliary model available for web content processing")
+            return None
         except Exception as api_error:
             last_error = api_error
             if attempt < max_retries - 1:
@@ -368,25 +360,18 @@ Synthesize these into ONE cohesive, comprehensive summary that:
 Create a single, unified markdown summary."""
 
     try:
-        if _aux_async_client is None:
-            logger.warning("No auxiliary model for synthesis, concatenating summaries")
-            fallback = "\n\n".join(summaries)
-            if len(fallback) > max_output_size:
-                fallback = fallback[:max_output_size] + "\n\n[... truncated ...]"
-            return fallback
-
-        from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
-        _extra = get_auxiliary_extra_body()
-        response = await _aux_async_client.chat.completions.create(
-            model=model,
-            messages=[
+        call_kwargs = {
+            "task": "web_extract",
+            "messages": [
                 {"role": "system", "content": "You synthesize multiple summaries into one cohesive, comprehensive summary. Be thorough but concise."},
                 {"role": "user", "content": synthesis_prompt}
             ],
-            temperature=0.1,
-            **auxiliary_max_tokens_param(20000),
-            **({} if not _extra else {"extra_body": _extra}),
-        )
+            "temperature": 0.1,
+            "max_tokens": 20000,
+        }
+        if model:
+            call_kwargs["model"] = model
+        response = await async_call_llm(**call_kwargs)
         final_summary = response.choices[0].message.content.strip()
 
         # Enforce hard cap
@@ -713,8 +698,8 @@ async def web_extract_tool(
         debug_call_data["pages_extracted"] = pages_extracted
         debug_call_data["original_response_size"] = len(json.dumps(response))
 
-        # Process each result with LLM if enabled and auxiliary client is available
-        if use_llm_processing and _aux_async_client is not None:
+        # Process each result with LLM if enabled
+        if use_llm_processing:
             logger.info("Processing extracted content with LLM (parallel)...")
             debug_call_data["processing_applied"].append("llm_processing")
 
@@ -780,10 +765,6 @@ async def web_extract_tool(
                 else:
                     logger.warning("%s (no content to process)", url)
         else:
-            if use_llm_processing and _aux_async_client is None:
-                logger.warning("LLM processing requested but no auxiliary model available, returning raw content")
-                debug_call_data["processing_applied"].append("llm_processing_unavailable")
-
             # Print summary of extracted pages for debugging (original behavior)
             for result in response.get('results', []):
                 url = result.get('url', 'Unknown URL')
@@ -1013,8 +994,8 @@ async def web_crawl_tool(
         debug_call_data["pages_crawled"] = pages_crawled
         debug_call_data["original_response_size"] = len(json.dumps(response))
 
-        # Process each result with LLM if enabled and auxiliary client is available
-        if use_llm_processing and _aux_async_client is not None:
+        # Process each result with LLM if enabled
+        if use_llm_processing:
             logger.info("Processing crawled content with LLM (parallel)...")
             debug_call_data["processing_applied"].append("llm_processing")
 
@@ -1080,10 +1061,6 @@ async def web_crawl_tool(
                 else:
                     logger.warning("%s (no content to process)", page_url)
         else:
-            if use_llm_processing and _aux_async_client is None:
-                logger.warning("LLM processing requested but no auxiliary model available, returning raw content")
-                debug_call_data["processing_applied"].append("llm_processing_unavailable")
-
             # Print summary of crawled pages for debugging (original behavior)
             for result in response.get('results', []):
                 page_url = result.get('url', 'Unknown URL')
@@ -1138,7 +1115,15 @@ def check_firecrawl_api_key() -> bool:
 
 def check_auxiliary_model() -> bool:
     """Check if an auxiliary text model is available for LLM content processing."""
-    return _aux_async_client is not None
+    try:
+        from agent.auxiliary_client import resolve_provider_client
+        for p in ("openrouter", "nous", "custom", "codex"):
+            client, _ = resolve_provider_client(p)
+            if client is not None:
+                return True
+        return False
+    except Exception:
+        return False
 
 
 def get_debug_session_info() -> Dict[str, Any]:
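check_auxiliary_model here and check_vision_requirements in vision_tools.py now carry near-duplicate provider-probing loops; a possible shared helper is sketched below. Whether "codex" belongs in the vision probe is an open question, since the vision check above omits it.

from agent.auxiliary_client import resolve_provider_client

def any_provider_available(providers=("openrouter", "nous", "custom", "codex")) -> bool:
    # Returns True as soon as any provider resolves to a usable client.
    try:
        for name in providers:
            client, _ = resolve_provider_client(name)
            if client is not None:
                return True
    except Exception:
        pass
    return False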
context_compressor.py

@@ -344,28 +344,32 @@ class TrajectoryCompressor:
             raise RuntimeError(f"Failed to load tokenizer '{self.config.tokenizer_name}': {e}")
 
     def _init_summarizer(self):
-        """Initialize LLM client for summarization (sync and async).
+        """Initialize LLM routing for summarization (sync and async).
 
-        Routes through the centralized provider router for known providers
-        (OpenRouter, Nous, Codex, etc.) so auth and headers are handled
-        consistently. Falls back to raw construction for custom endpoints.
+        Uses call_llm/async_call_llm from the centralized provider router
+        which handles auth, headers, and provider detection internally.
+        For custom endpoints, falls back to raw client construction.
         """
-        from agent.auxiliary_client import resolve_provider_client
+        from agent.auxiliary_client import call_llm, async_call_llm
 
         provider = self._detect_provider()
         if provider:
-            # Use centralized router — handles auth, headers, Codex adapter
-            self.client, _ = resolve_provider_client(
+            # Store provider for use in _generate_summary calls
+            self._llm_provider = provider
+            self._use_call_llm = True
+            # Verify the provider is available
+            from agent.auxiliary_client import resolve_provider_client
+            client, _ = resolve_provider_client(
                 provider, model=self.config.summarization_model)
-            self.async_client, _ = resolve_provider_client(
-                provider, model=self.config.summarization_model,
-                async_mode=True)
-            if self.client is None:
+            if client is None:
                 raise RuntimeError(
                     f"Provider '{provider}' is not configured. "
                     f"Check your API key or run: hermes setup")
+            self.client = None  # Not used directly
+            self.async_client = None  # Not used directly
         else:
             # Custom endpoint — use config's raw base_url + api_key_env
+            self._use_call_llm = False
             api_key = os.getenv(self.config.api_key_env)
             if not api_key:
                 raise RuntimeError(
@@ -524,12 +528,22 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
         try:
             metrics.summarization_api_calls += 1
 
-            response = self.client.chat.completions.create(
-                model=self.config.summarization_model,
-                messages=[{"role": "user", "content": prompt}],
-                temperature=self.config.temperature,
-                max_tokens=self.config.summary_target_tokens * 2,
-            )
+            if getattr(self, '_use_call_llm', False):
+                from agent.auxiliary_client import call_llm
+                response = call_llm(
+                    provider=self._llm_provider,
+                    model=self.config.summarization_model,
+                    messages=[{"role": "user", "content": prompt}],
+                    temperature=self.config.temperature,
+                    max_tokens=self.config.summary_target_tokens * 2,
+                )
+            else:
+                response = self.client.chat.completions.create(
+                    model=self.config.summarization_model,
+                    messages=[{"role": "user", "content": prompt}],
+                    temperature=self.config.temperature,
+                    max_tokens=self.config.summary_target_tokens * 2,
+                )
 
             summary = response.choices[0].message.content.strip()
@@ -581,12 +595,22 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
         try:
             metrics.summarization_api_calls += 1
 
-            response = await self.async_client.chat.completions.create(
-                model=self.config.summarization_model,
-                messages=[{"role": "user", "content": prompt}],
-                temperature=self.config.temperature,
-                max_tokens=self.config.summary_target_tokens * 2,
-            )
+            if getattr(self, '_use_call_llm', False):
+                from agent.auxiliary_client import async_call_llm
+                response = await async_call_llm(
+                    provider=self._llm_provider,
+                    model=self.config.summarization_model,
+                    messages=[{"role": "user", "content": prompt}],
+                    temperature=self.config.temperature,
+                    max_tokens=self.config.summary_target_tokens * 2,
+                )
+            else:
+                response = await self.async_client.chat.completions.create(
+                    model=self.config.summarization_model,
+                    messages=[{"role": "user", "content": prompt}],
+                    temperature=self.config.temperature,
+                    max_tokens=self.config.summary_target_tokens * 2,
+                )
 
             summary = response.choices[0].message.content.strip()
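The sync and async summary paths now differ only in the awaited call. A condensed sketch of the branch both hunks add, assuming the config fields and the _use_call_llm/_llm_provider attributes set by _init_summarizer above:

from agent.auxiliary_client import call_llm

def _generate_summary_sketch(self, prompt: str) -> str:
    common = dict(
        model=self.config.summarization_model,
        messages=[{"role": "user", "content": prompt}],
        temperature=self.config.temperature,
        max_tokens=self.config.summary_target_tokens * 2,
    )
    if getattr(self, "_use_call_llm", False):
        # Known provider: route through the centralized API.
        response = call_llm(provider=self._llm_provider, **common)
    else:
        # Custom endpoint: raw client built in _init_summarizer.
        response = self.client.chat.completions.create(**common)
    return response.choices[0].message.content.strip()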