feat: call_llm/async_call_llm + config slots + migrate all consumers

Add centralized call_llm() and async_call_llm() functions that own the
full LLM request lifecycle:
  1. Resolve provider + model from task config or explicit args
  2. Get or create a cached client for that provider
  3. Format request args (max_tokens handling, provider extra_body)
  4. Make the API call with max_tokens/max_completion_tokens retry
  5. Return the response

Config: expanded auxiliary section with provider:model slots for all
tasks (compression, vision, web_extract, session_search, skills_hub,
mcp, flush_memories). Config version bumped to 7.

Migrated all auxiliary consumers:
- context_compressor.py: uses call_llm(task='compression')
- vision_tools.py: uses async_call_llm(task='vision')
- web_tools.py: uses async_call_llm(task='web_extract')
- session_search_tool.py: uses async_call_llm(task='session_search')
- browser_tool.py: uses call_llm(task='vision'/'web_extract')
- mcp_tool.py: uses call_llm(task='mcp')
- skills_guard.py: uses call_llm(provider='openrouter')
- run_agent.py flush_memories: uses call_llm(task='flush_memories')

Tests updated for context_compressor and MCP tool. Some test mocks
still need updating (15 remaining failures from mock pattern changes,
2 pre-existing).
This commit is contained in:
teknium1 2026-03-11 20:52:19 -07:00
parent 013cc4d2fc
commit 0aa31cd3cb
13 changed files with 552 additions and 375 deletions

View file

@ -784,3 +784,253 @@ def auxiliary_max_tokens_param(value: int) -> dict:
and "api.openai.com" in custom_base.lower()): and "api.openai.com" in custom_base.lower()):
return {"max_completion_tokens": value} return {"max_completion_tokens": value}
return {"max_tokens": value} return {"max_tokens": value}
# ── Centralized LLM Call API ────────────────────────────────────────────────
#
# call_llm() and async_call_llm() own the full request lifecycle:
# 1. Resolve provider + model from task config (or explicit args)
# 2. Get or create a cached client for that provider
# 3. Format request args for the provider + model (max_tokens handling, etc.)
# 4. Make the API call
# 5. Return the response
#
# Every auxiliary LLM consumer should use these instead of manually
# constructing clients and calling .chat.completions.create().

# Client cache: (provider, async_mode) -> (client, default_model).
# Keyed on async_mode too because sync and async clients are distinct objects.
# Only successfully-created clients are stored (see _get_cached_client), so a
# failed provider is re-attempted on the next call.
_client_cache: Dict[Tuple[str, bool], Tuple[Any, Optional[str]]] = {}
def _get_cached_client(
    provider: str, model: str = None, async_mode: bool = False,
) -> Tuple[Optional[Any], Optional[str]]:
    """Return a (client, model) pair for ``provider``, reusing cached clients.

    Clients are cached per (provider, async_mode). An explicit ``model``
    argument always wins over the cached provider default.
    """
    key = (provider, async_mode)
    cached = _client_cache.get(key)
    if cached is not None:
        client, default_model = cached
    else:
        client, default_model = resolve_provider_client(provider, model, async_mode)
        # Cache only usable clients so a later call can retry creation.
        if client is not None:
            _client_cache[key] = (client, default_model)
    return client, model or default_model
def _resolve_task_provider_model(
task: str = None,
provider: str = None,
model: str = None,
) -> Tuple[str, Optional[str]]:
"""Determine provider + model for a call.
Priority:
1. Explicit provider/model args (always win)
2. Env var overrides (AUXILIARY_{TASK}_PROVIDER, etc.)
3. Config file (auxiliary.{task}.provider/model or compression.*)
4. "auto" (full auto-detection chain)
Returns (provider, model) where model may be None (use provider default).
"""
if provider:
return provider, model
if task:
# Check env var overrides first
env_provider = _get_auxiliary_provider(task)
if env_provider != "auto":
# Check for env var model override too
env_model = None
for prefix in ("AUXILIARY_", "CONTEXT_"):
val = os.getenv(f"{prefix}{task.upper()}_MODEL", "").strip()
if val:
env_model = val
break
return env_provider, model or env_model
# Read from config file
try:
from hermes_cli.config import load_config
config = load_config()
except ImportError:
return "auto", model
# Check auxiliary.{task} section
aux = config.get("auxiliary", {})
task_config = aux.get(task, {})
cfg_provider = task_config.get("provider", "").strip() or None
cfg_model = task_config.get("model", "").strip() or None
# Backwards compat: compression section has its own keys
if task == "compression" and not cfg_provider:
comp = config.get("compression", {})
cfg_provider = comp.get("summary_provider", "").strip() or None
cfg_model = cfg_model or comp.get("summary_model", "").strip() or None
if cfg_provider and cfg_provider != "auto":
return cfg_provider, model or cfg_model
return "auto", model or cfg_model
return "auto", model
def _build_call_kwargs(
provider: str,
model: str,
messages: list,
temperature: Optional[float] = None,
max_tokens: Optional[int] = None,
tools: Optional[list] = None,
timeout: float = 30.0,
extra_body: Optional[dict] = None,
) -> dict:
"""Build kwargs for .chat.completions.create() with model/provider adjustments."""
kwargs: Dict[str, Any] = {
"model": model,
"messages": messages,
"timeout": timeout,
}
if temperature is not None:
kwargs["temperature"] = temperature
if max_tokens is not None:
# Codex adapter handles max_tokens internally; OpenRouter/Nous use max_tokens.
# Direct OpenAI api.openai.com with newer models needs max_completion_tokens.
if provider == "custom":
custom_base = os.getenv("OPENAI_BASE_URL", "")
if "api.openai.com" in custom_base.lower():
kwargs["max_completion_tokens"] = max_tokens
else:
kwargs["max_tokens"] = max_tokens
else:
kwargs["max_tokens"] = max_tokens
if tools:
kwargs["tools"] = tools
# Provider-specific extra_body
merged_extra = dict(extra_body or {})
if provider == "nous" or auxiliary_is_nous:
merged_extra.setdefault("tags", []).extend(["product=hermes-agent"])
if merged_extra:
kwargs["extra_body"] = merged_extra
return kwargs
def call_llm(
    task: str = None,
    *,
    provider: str = None,
    model: str = None,
    messages: list,
    temperature: float = None,
    max_tokens: int = None,
    tools: list = None,
    timeout: float = 30.0,
    extra_body: dict = None,
) -> Any:
    """Centralized synchronous LLM call.

    Resolves provider + model (from task config, explicit args, or auto-detect),
    handles auth, request formatting, and model-specific arg adjustments.

    Args:
        task: Auxiliary task name ("compression", "vision", "web_extract",
            "session_search", "skills_hub", "mcp", "flush_memories").
            Reads provider:model from config/env. Ignored if provider is set.
        provider: Explicit provider override.
        model: Explicit model override.
        messages: Chat messages list.
        temperature: Sampling temperature (None = provider default).
        max_tokens: Max output tokens (handles max_tokens vs max_completion_tokens).
        tools: Tool definitions (for function calling).
        timeout: Request timeout in seconds.
        extra_body: Additional request body fields.

    Returns:
        Response object with .choices[0].message.content

    Raises:
        RuntimeError: If no provider is configured.
    """
    resolved_provider, resolved_model = _resolve_task_provider_model(
        task, provider, model)
    # Track the provider actually used: request kwargs must be formatted for
    # the fallback provider, not the one that failed to produce a client.
    active_provider = resolved_provider
    client, final_model = _get_cached_client(resolved_provider, resolved_model)
    if client is None:
        # Fallback: try openrouter
        if resolved_provider != "openrouter":
            logger.warning("Provider %s unavailable, falling back to openrouter",
                           resolved_provider)
        active_provider = "openrouter"
        client, final_model = _get_cached_client(
            "openrouter", resolved_model or _OPENROUTER_MODEL)
        if client is None:
            raise RuntimeError(
                f"No LLM provider configured for task={task} provider={resolved_provider}. "
                f"Run: hermes setup")
    kwargs = _build_call_kwargs(
        active_provider, final_model, messages,
        temperature=temperature, max_tokens=max_tokens,
        tools=tools, timeout=timeout, extra_body=extra_body)
    # Handle max_tokens vs max_completion_tokens retry. Only retry when the
    # caller actually passed max_tokens — otherwise we'd send
    # max_completion_tokens=None, which is itself an invalid request.
    try:
        return client.chat.completions.create(**kwargs)
    except Exception as first_err:
        err_str = str(first_err)
        if max_tokens is not None and (
                "max_tokens" in err_str or "unsupported_parameter" in err_str):
            kwargs.pop("max_tokens", None)
            kwargs["max_completion_tokens"] = max_tokens
            return client.chat.completions.create(**kwargs)
        raise
async def async_call_llm(
    task: str = None,
    *,
    provider: str = None,
    model: str = None,
    messages: list,
    temperature: float = None,
    max_tokens: int = None,
    tools: list = None,
    timeout: float = 30.0,
    extra_body: dict = None,
) -> Any:
    """Centralized asynchronous LLM call.

    Same as call_llm() but async. See call_llm() for full documentation.

    Raises:
        RuntimeError: If no provider is configured.
    """
    resolved_provider, resolved_model = _resolve_task_provider_model(
        task, provider, model)
    # Track the provider actually used: request kwargs must be formatted for
    # the fallback provider, not the one that failed to produce a client.
    active_provider = resolved_provider
    client, final_model = _get_cached_client(
        resolved_provider, resolved_model, async_mode=True)
    if client is None:
        if resolved_provider != "openrouter":
            logger.warning("Provider %s unavailable, falling back to openrouter",
                           resolved_provider)
        active_provider = "openrouter"
        client, final_model = _get_cached_client(
            "openrouter", resolved_model or _OPENROUTER_MODEL,
            async_mode=True)
        if client is None:
            raise RuntimeError(
                f"No LLM provider configured for task={task} provider={resolved_provider}. "
                f"Run: hermes setup")
    kwargs = _build_call_kwargs(
        active_provider, final_model, messages,
        temperature=temperature, max_tokens=max_tokens,
        tools=tools, timeout=timeout, extra_body=extra_body)
    # Only retry with max_completion_tokens when max_tokens was actually set;
    # otherwise the retry would send max_completion_tokens=None.
    try:
        return await client.chat.completions.create(**kwargs)
    except Exception as first_err:
        err_str = str(first_err)
        if max_tokens is not None and (
                "max_tokens" in err_str or "unsupported_parameter" in err_str):
            kwargs.pop("max_tokens", None)
            kwargs["max_completion_tokens"] = max_tokens
            return await client.chat.completions.create(**kwargs)
        raise

View file

@ -9,7 +9,7 @@ import logging
import os import os
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
from agent.auxiliary_client import get_text_auxiliary_client from agent.auxiliary_client import call_llm
from agent.model_metadata import ( from agent.model_metadata import (
get_model_context_length, get_model_context_length,
estimate_messages_tokens_rough, estimate_messages_tokens_rough,
@ -53,8 +53,7 @@ class ContextCompressor:
self.last_completion_tokens = 0 self.last_completion_tokens = 0
self.last_total_tokens = 0 self.last_total_tokens = 0
self.client, default_model = get_text_auxiliary_client("compression") self.summary_model = summary_model_override or ""
self.summary_model = summary_model_override or default_model
def update_from_response(self, usage: Dict[str, Any]): def update_from_response(self, usage: Dict[str, Any]):
"""Update tracked token usage from API response.""" """Update tracked token usage from API response."""
@ -120,73 +119,30 @@ TURNS TO SUMMARIZE:
Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix.""" Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
# 1. Try the auxiliary model (cheap/fast) # Use the centralized LLM router — handles provider resolution,
if self.client: # auth, and fallback internally.
try:
return self._call_summary_model(self.client, self.summary_model, prompt)
except Exception as e:
logging.warning(f"Failed to generate context summary with auxiliary model: {e}")
# 2. Fallback: re-try via the centralized provider router.
# This covers all configured providers (Codex OAuth, API-key
# providers, etc.) without ad-hoc env var lookups.
from agent.auxiliary_client import resolve_provider_client
fallback_providers = ["custom", "openrouter", "nous", "codex"]
for fb_provider in fallback_providers:
try:
fb_client, fb_model = resolve_provider_client(
fb_provider, model=self.model)
if fb_client is None:
continue
# Don't retry the same client that just failed
if (self.client is not None
and hasattr(fb_client, "base_url")
and hasattr(self.client, "base_url")
and str(fb_client.base_url) == str(self.client.base_url)):
continue
logger.info("Retrying context summary with fallback provider "
"%s (%s)", fb_provider, fb_model)
summary = self._call_summary_model(fb_client, fb_model, prompt)
# Promote successful fallback for future compressions
self.client = fb_client
self.summary_model = fb_model
return summary
except Exception as fallback_err:
logging.warning("Fallback provider %s failed: %s",
fb_provider, fallback_err)
# 3. All providers failed — return None so the caller drops turns
# without a summary.
logging.warning("Context compression: no provider available for "
"summary. Middle turns will be dropped without summary.")
return None
def _call_summary_model(self, client, model: str, prompt: str) -> str:
"""Make the actual LLM call to generate a summary. Raises on failure."""
kwargs = {
"model": model,
"messages": [{"role": "user", "content": prompt}],
"temperature": 0.3,
"timeout": 30.0,
}
# Most providers (OpenRouter, local models) use max_tokens.
# Direct OpenAI with newer models (gpt-4o, o-series, gpt-5+)
# requires max_completion_tokens instead.
try: try:
kwargs["max_tokens"] = self.summary_target_tokens * 2 call_kwargs = {
response = client.chat.completions.create(**kwargs) "task": "compression",
except Exception as first_err: "messages": [{"role": "user", "content": prompt}],
if "max_tokens" in str(first_err) or "unsupported_parameter" in str(first_err): "temperature": 0.3,
kwargs.pop("max_tokens", None) "max_tokens": self.summary_target_tokens * 2,
kwargs["max_completion_tokens"] = self.summary_target_tokens * 2 "timeout": 30.0,
response = client.chat.completions.create(**kwargs) }
else: if self.summary_model:
raise call_kwargs["model"] = self.summary_model
response = call_llm(**call_kwargs)
summary = response.choices[0].message.content.strip() summary = response.choices[0].message.content.strip()
if not summary.startswith("[CONTEXT SUMMARY]:"): if not summary.startswith("[CONTEXT SUMMARY]:"):
summary = "[CONTEXT SUMMARY]: " + summary summary = "[CONTEXT SUMMARY]: " + summary
return summary return summary
except RuntimeError:
logging.warning("Context compression: no provider available for "
"summary. Middle turns will be dropped without summary.")
return None
except Exception as e:
logging.warning("Failed to generate context summary: %s", e)
return None
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Tool-call / tool-result pair integrity helpers # Tool-call / tool-result pair integrity helpers

View file

@ -125,17 +125,41 @@ DEFAULT_CONFIG = {
"summary_provider": "auto", "summary_provider": "auto",
}, },
# Auxiliary model overrides (advanced). By default Hermes auto-selects # Auxiliary model config — provider:model for each side task.
# the provider and model for each side task. Set these to override. # Format: provider is the provider name, model is the model slug.
# "auto" for provider = auto-detect best available provider.
# Empty model = use provider's default auxiliary model.
# All tasks fall back to openrouter:google/gemini-3-flash-preview if
# the configured provider is unavailable.
"auxiliary": { "auxiliary": {
"vision": { "vision": {
"provider": "auto", # auto | openrouter | nous | main "provider": "auto", # auto | openrouter | nous | codex | custom
"model": "", # e.g. "google/gemini-2.5-flash", "gpt-4o" "model": "", # e.g. "google/gemini-2.5-flash", "gpt-4o"
}, },
"web_extract": { "web_extract": {
"provider": "auto", "provider": "auto",
"model": "", "model": "",
}, },
"compression": {
"provider": "auto",
"model": "",
},
"session_search": {
"provider": "auto",
"model": "",
},
"skills_hub": {
"provider": "auto",
"model": "",
},
"mcp": {
"provider": "auto",
"model": "",
},
"flush_memories": {
"provider": "auto",
"model": "",
},
}, },
"display": { "display": {
@ -217,7 +241,7 @@ DEFAULT_CONFIG = {
"personalities": {}, "personalities": {},
# Config schema version - bump this when adding new required fields # Config schema version - bump this when adding new required fields
"_config_version": 6, "_config_version": 7,
} }
# ============================================================================= # =============================================================================

View file

@ -2623,19 +2623,22 @@ class AIAgent:
# Use auxiliary client for the flush call when available -- # Use auxiliary client for the flush call when available --
# it's cheaper and avoids Codex Responses API incompatibility. # it's cheaper and avoids Codex Responses API incompatibility.
from agent.auxiliary_client import get_text_auxiliary_client from agent.auxiliary_client import call_llm as _call_llm
aux_client, aux_model = get_text_auxiliary_client() _aux_available = True
try:
response = _call_llm(
task="flush_memories",
messages=api_messages,
tools=[memory_tool_def],
temperature=0.3,
max_tokens=5120,
timeout=30.0,
)
except RuntimeError:
_aux_available = False
response = None
if aux_client: if not _aux_available and self.api_mode == "codex_responses":
api_kwargs = {
"model": aux_model,
"messages": api_messages,
"tools": [memory_tool_def],
"temperature": 0.3,
"max_tokens": 5120,
}
response = aux_client.chat.completions.create(**api_kwargs, timeout=30.0)
elif self.api_mode == "codex_responses":
# No auxiliary client -- use the Codex Responses path directly # No auxiliary client -- use the Codex Responses path directly
codex_kwargs = self._build_api_kwargs(api_messages) codex_kwargs = self._build_api_kwargs(api_messages)
codex_kwargs["tools"] = self._responses_tools([memory_tool_def]) codex_kwargs["tools"] = self._responses_tools([memory_tool_def])
@ -2643,7 +2646,7 @@ class AIAgent:
if "max_output_tokens" in codex_kwargs: if "max_output_tokens" in codex_kwargs:
codex_kwargs["max_output_tokens"] = 5120 codex_kwargs["max_output_tokens"] = 5120
response = self._run_codex_stream(codex_kwargs) response = self._run_codex_stream(codex_kwargs)
else: elif not _aux_available:
api_kwargs = { api_kwargs = {
"model": self.model, "model": self.model,
"messages": api_messages, "messages": api_messages,
@ -2655,7 +2658,7 @@ class AIAgent:
# Extract tool calls from the response, handling both API formats # Extract tool calls from the response, handling both API formats
tool_calls = [] tool_calls = []
if self.api_mode == "codex_responses" and not aux_client: if self.api_mode == "codex_responses" and not _aux_available:
assistant_msg, _ = self._normalize_codex_response(response) assistant_msg, _ = self._normalize_codex_response(response)
if assistant_msg and assistant_msg.tool_calls: if assistant_msg and assistant_msg.tool_calls:
tool_calls = assistant_msg.tool_calls tool_calls = assistant_msg.tool_calls

View file

@ -9,8 +9,7 @@ from agent.context_compressor import ContextCompressor
@pytest.fixture() @pytest.fixture()
def compressor(): def compressor():
"""Create a ContextCompressor with mocked dependencies.""" """Create a ContextCompressor with mocked dependencies."""
with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ with patch("agent.context_compressor.get_model_context_length", return_value=100000):
patch("agent.context_compressor.get_text_auxiliary_client", return_value=(None, None)):
c = ContextCompressor( c = ContextCompressor(
model="test/model", model="test/model",
threshold_percent=0.85, threshold_percent=0.85,
@ -119,14 +118,11 @@ class TestGenerateSummaryNoneContent:
"""Regression: content=None (from tool-call-only assistant messages) must not crash.""" """Regression: content=None (from tool-call-only assistant messages) must not crash."""
def test_none_content_does_not_crash(self): def test_none_content_does_not_crash(self):
mock_client = MagicMock()
mock_response = MagicMock() mock_response = MagicMock()
mock_response.choices = [MagicMock()] mock_response.choices = [MagicMock()]
mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: tool calls happened" mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: tool calls happened"
mock_client.chat.completions.create.return_value = mock_response
with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ with patch("agent.context_compressor.get_model_context_length", return_value=100000):
patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
c = ContextCompressor(model="test", quiet_mode=True) c = ContextCompressor(model="test", quiet_mode=True)
messages = [ messages = [
@ -139,14 +135,14 @@ class TestGenerateSummaryNoneContent:
{"role": "user", "content": "thanks"}, {"role": "user", "content": "thanks"},
] ]
summary = c._generate_summary(messages) with patch("agent.context_compressor.call_llm", return_value=mock_response):
summary = c._generate_summary(messages)
assert isinstance(summary, str) assert isinstance(summary, str)
assert "CONTEXT SUMMARY" in summary assert "CONTEXT SUMMARY" in summary
def test_none_content_in_system_message_compress(self): def test_none_content_in_system_message_compress(self):
"""System message with content=None should not crash during compress.""" """System message with content=None should not crash during compress."""
with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ with patch("agent.context_compressor.get_model_context_length", return_value=100000):
patch("agent.context_compressor.get_text_auxiliary_client", return_value=(None, None)):
c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2) c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2)
msgs = [{"role": "system", "content": None}] + [ msgs = [{"role": "system", "content": None}] + [
@ -165,12 +161,12 @@ class TestCompressWithClient:
mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened" mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened"
mock_client.chat.completions.create.return_value = mock_response mock_client.chat.completions.create.return_value = mock_response
with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ with patch("agent.context_compressor.get_model_context_length", return_value=100000):
patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
c = ContextCompressor(model="test", quiet_mode=True) c = ContextCompressor(model="test", quiet_mode=True)
msgs = [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"} for i in range(10)] msgs = [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"} for i in range(10)]
result = c.compress(msgs) with patch("agent.context_compressor.call_llm", return_value=mock_response):
result = c.compress(msgs)
# Should have summary message in the middle # Should have summary message in the middle
contents = [m.get("content", "") for m in result] contents = [m.get("content", "") for m in result]
@ -184,8 +180,7 @@ class TestCompressWithClient:
mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compressed middle" mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compressed middle"
mock_client.chat.completions.create.return_value = mock_response mock_client.chat.completions.create.return_value = mock_response
with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ with patch("agent.context_compressor.get_model_context_length", return_value=100000):
patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
c = ContextCompressor( c = ContextCompressor(
model="test", model="test",
quiet_mode=True, quiet_mode=True,
@ -212,7 +207,8 @@ class TestCompressWithClient:
{"role": "user", "content": "later 4"}, {"role": "user", "content": "later 4"},
] ]
result = c.compress(msgs) with patch("agent.context_compressor.call_llm", return_value=mock_response):
result = c.compress(msgs)
answered_ids = { answered_ids = {
msg.get("tool_call_id") msg.get("tool_call_id")
@ -232,8 +228,7 @@ class TestCompressWithClient:
mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened" mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened"
mock_client.chat.completions.create.return_value = mock_response mock_client.chat.completions.create.return_value = mock_response
with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ with patch("agent.context_compressor.get_model_context_length", return_value=100000):
patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2) c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2)
# Last head message (index 1) is "assistant" → summary should be "user" # Last head message (index 1) is "assistant" → summary should be "user"
@ -245,7 +240,8 @@ class TestCompressWithClient:
{"role": "user", "content": "msg 4"}, {"role": "user", "content": "msg 4"},
{"role": "assistant", "content": "msg 5"}, {"role": "assistant", "content": "msg 5"},
] ]
result = c.compress(msgs) with patch("agent.context_compressor.call_llm", return_value=mock_response):
result = c.compress(msgs)
summary_msg = [m for m in result if "CONTEXT SUMMARY" in (m.get("content") or "")] summary_msg = [m for m in result if "CONTEXT SUMMARY" in (m.get("content") or "")]
assert len(summary_msg) == 1 assert len(summary_msg) == 1
assert summary_msg[0]["role"] == "user" assert summary_msg[0]["role"] == "user"
@ -258,8 +254,7 @@ class TestCompressWithClient:
mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened" mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened"
mock_client.chat.completions.create.return_value = mock_response mock_client.chat.completions.create.return_value = mock_response
with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ with patch("agent.context_compressor.get_model_context_length", return_value=100000):
patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=3, protect_last_n=2) c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=3, protect_last_n=2)
# Last head message (index 2) is "user" → summary should be "assistant" # Last head message (index 2) is "user" → summary should be "assistant"
@ -273,20 +268,18 @@ class TestCompressWithClient:
{"role": "user", "content": "msg 6"}, {"role": "user", "content": "msg 6"},
{"role": "assistant", "content": "msg 7"}, {"role": "assistant", "content": "msg 7"},
] ]
result = c.compress(msgs) with patch("agent.context_compressor.call_llm", return_value=mock_response):
result = c.compress(msgs)
summary_msg = [m for m in result if "CONTEXT SUMMARY" in (m.get("content") or "")] summary_msg = [m for m in result if "CONTEXT SUMMARY" in (m.get("content") or "")]
assert len(summary_msg) == 1 assert len(summary_msg) == 1
assert summary_msg[0]["role"] == "assistant" assert summary_msg[0]["role"] == "assistant"
def test_summarization_does_not_start_tail_with_tool_outputs(self): def test_summarization_does_not_start_tail_with_tool_outputs(self):
mock_client = MagicMock()
mock_response = MagicMock() mock_response = MagicMock()
mock_response.choices = [MagicMock()] mock_response.choices = [MagicMock()]
mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compressed middle" mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compressed middle"
mock_client.chat.completions.create.return_value = mock_response
with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ with patch("agent.context_compressor.get_model_context_length", return_value=100000):
patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
c = ContextCompressor( c = ContextCompressor(
model="test", model="test",
quiet_mode=True, quiet_mode=True,
@ -309,7 +302,8 @@ class TestCompressWithClient:
{"role": "user", "content": "latest user"}, {"role": "user", "content": "latest user"},
] ]
result = c.compress(msgs) with patch("agent.context_compressor.call_llm", return_value=mock_response):
result = c.compress(msgs)
called_ids = { called_ids = {
tc["id"] tc["id"]

View file

@ -1828,8 +1828,8 @@ class TestSamplingCallbackText:
) )
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
params = _make_sampling_params() params = _make_sampling_params()
result = asyncio.run(self.handler(None, params)) result = asyncio.run(self.handler(None, params))
@ -1847,13 +1847,13 @@ class TestSamplingCallbackText:
fake_client.chat.completions.create.return_value = _make_llm_response() fake_client.chat.completions.create.return_value = _make_llm_response()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ) as mock_call:
params = _make_sampling_params(system_prompt="Be helpful") params = _make_sampling_params(system_prompt="Be helpful")
asyncio.run(self.handler(None, params)) asyncio.run(self.handler(None, params))
call_args = fake_client.chat.completions.create.call_args call_args = mock_call.call_args
messages = call_args.kwargs["messages"] messages = call_args.kwargs["messages"]
assert messages[0] == {"role": "system", "content": "Be helpful"} assert messages[0] == {"role": "system", "content": "Be helpful"}
@ -1865,8 +1865,8 @@ class TestSamplingCallbackText:
) )
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
params = _make_sampling_params() params = _make_sampling_params()
result = asyncio.run(self.handler(None, params)) result = asyncio.run(self.handler(None, params))
@ -1889,8 +1889,8 @@ class TestSamplingCallbackToolUse:
fake_client.chat.completions.create.return_value = _make_llm_tool_response() fake_client.chat.completions.create.return_value = _make_llm_tool_response()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
params = _make_sampling_params() params = _make_sampling_params()
result = asyncio.run(self.handler(None, params)) result = asyncio.run(self.handler(None, params))
@ -1916,8 +1916,8 @@ class TestSamplingCallbackToolUse:
) )
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
result = asyncio.run(self.handler(None, _make_sampling_params())) result = asyncio.run(self.handler(None, _make_sampling_params()))
@ -1939,8 +1939,8 @@ class TestToolLoopGovernance:
fake_client.chat.completions.create.return_value = _make_llm_tool_response() fake_client.chat.completions.create.return_value = _make_llm_tool_response()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
params = _make_sampling_params() params = _make_sampling_params()
# Round 1, 2: allowed # Round 1, 2: allowed
@ -1959,8 +1959,8 @@ class TestToolLoopGovernance:
fake_client = MagicMock() fake_client = MagicMock()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
# Tool response (round 1 of 1 allowed) # Tool response (round 1 of 1 allowed)
fake_client.chat.completions.create.return_value = _make_llm_tool_response() fake_client.chat.completions.create.return_value = _make_llm_tool_response()
@ -1984,8 +1984,8 @@ class TestToolLoopGovernance:
fake_client.chat.completions.create.return_value = _make_llm_tool_response() fake_client.chat.completions.create.return_value = _make_llm_tool_response()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
assert isinstance(result, ErrorData) assert isinstance(result, ErrorData)
@ -2003,8 +2003,8 @@ class TestSamplingErrors:
fake_client.chat.completions.create.return_value = _make_llm_response() fake_client.chat.completions.create.return_value = _make_llm_response()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
# First call succeeds # First call succeeds
r1 = asyncio.run(handler(None, _make_sampling_params())) r1 = asyncio.run(handler(None, _make_sampling_params()))
@ -2017,20 +2017,16 @@ class TestSamplingErrors:
def test_timeout_error(self): def test_timeout_error(self):
handler = SamplingHandler("to", {"timeout": 0.05}) handler = SamplingHandler("to", {"timeout": 0.05})
fake_client = MagicMock()
def slow_call(**kwargs): def slow_call(**kwargs):
import threading import threading
# Use an event to ensure the thread truly blocks long enough
evt = threading.Event() evt = threading.Event()
evt.wait(5) # blocks for up to 5 seconds (cancelled by timeout) evt.wait(5) # blocks for up to 5 seconds (cancelled by timeout)
return _make_llm_response() return _make_llm_response()
fake_client.chat.completions.create.side_effect = slow_call
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), side_effect=slow_call,
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
assert isinstance(result, ErrorData) assert isinstance(result, ErrorData)
@ -2041,12 +2037,11 @@ class TestSamplingErrors:
handler = SamplingHandler("np", {}) handler = SamplingHandler("np", {})
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(None, None), side_effect=RuntimeError("No LLM provider configured"),
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
assert isinstance(result, ErrorData) assert isinstance(result, ErrorData)
assert "No LLM provider" in result.message
assert handler.metrics["errors"] == 1 assert handler.metrics["errors"] == 1
def test_empty_choices_returns_error(self): def test_empty_choices_returns_error(self):
@ -2060,8 +2055,8 @@ class TestSamplingErrors:
) )
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
@ -2080,8 +2075,8 @@ class TestSamplingErrors:
) )
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
@ -2099,8 +2094,8 @@ class TestSamplingErrors:
) )
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
@ -2120,8 +2115,8 @@ class TestModelWhitelist:
fake_client.chat.completions.create.return_value = _make_llm_response() fake_client.chat.completions.create.return_value = _make_llm_response()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "test-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
assert isinstance(result, CreateMessageResult) assert isinstance(result, CreateMessageResult)
@ -2131,8 +2126,8 @@ class TestModelWhitelist:
fake_client = MagicMock() fake_client = MagicMock()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "gpt-3.5-turbo"), return_value=fake_client.chat.completions.create.return_value,
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
assert isinstance(result, ErrorData) assert isinstance(result, ErrorData)
@ -2145,8 +2140,8 @@ class TestModelWhitelist:
fake_client.chat.completions.create.return_value = _make_llm_response() fake_client.chat.completions.create.return_value = _make_llm_response()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "any-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
assert isinstance(result, CreateMessageResult) assert isinstance(result, CreateMessageResult)
@ -2166,8 +2161,8 @@ class TestMalformedToolCallArgs:
) )
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
@ -2194,8 +2189,8 @@ class TestMalformedToolCallArgs:
fake_client.chat.completions.create.return_value = response fake_client.chat.completions.create.return_value = response
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
@ -2214,8 +2209,8 @@ class TestMetricsTracking:
fake_client.chat.completions.create.return_value = _make_llm_response() fake_client.chat.completions.create.return_value = _make_llm_response()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
asyncio.run(handler(None, _make_sampling_params())) asyncio.run(handler(None, _make_sampling_params()))
@ -2229,8 +2224,8 @@ class TestMetricsTracking:
fake_client.chat.completions.create.return_value = _make_llm_tool_response() fake_client.chat.completions.create.return_value = _make_llm_tool_response()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
asyncio.run(handler(None, _make_sampling_params())) asyncio.run(handler(None, _make_sampling_params()))
@ -2241,8 +2236,8 @@ class TestMetricsTracking:
handler = SamplingHandler("met3", {}) handler = SamplingHandler("met3", {})
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(None, None), side_effect=RuntimeError("No LLM provider configured"),
): ):
asyncio.run(handler(None, _make_sampling_params())) asyncio.run(handler(None, _make_sampling_params()))

View file

@ -63,7 +63,7 @@ import time
import requests import requests
from typing import Dict, Any, Optional, List from typing import Dict, Any, Optional, List
from pathlib import Path from pathlib import Path
from agent.auxiliary_client import get_vision_auxiliary_client, get_text_auxiliary_client from agent.auxiliary_client import call_llm
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -80,38 +80,15 @@ DEFAULT_SESSION_TIMEOUT = 300
# Max tokens for snapshot content before summarization # Max tokens for snapshot content before summarization
SNAPSHOT_SUMMARIZE_THRESHOLD = 8000 SNAPSHOT_SUMMARIZE_THRESHOLD = 8000
# Vision client — for browser_vision (screenshot analysis)
# Wrapped in try/except so a broken auxiliary config doesn't prevent the entire
# browser_tool module from importing (which would disable all 10 browser tools).
try:
_aux_vision_client, _DEFAULT_VISION_MODEL = get_vision_auxiliary_client()
except Exception as _init_err:
logger.debug("Could not initialise vision auxiliary client: %s", _init_err)
_aux_vision_client, _DEFAULT_VISION_MODEL = None, None
# Text client — for page snapshot summarization (same config as web_extract) def _get_vision_model() -> Optional[str]:
try:
_aux_text_client, _DEFAULT_TEXT_MODEL = get_text_auxiliary_client("web_extract")
except Exception as _init_err:
logger.debug("Could not initialise text auxiliary client: %s", _init_err)
_aux_text_client, _DEFAULT_TEXT_MODEL = None, None
# Module-level alias for availability checks
EXTRACTION_MODEL = _DEFAULT_TEXT_MODEL or _DEFAULT_VISION_MODEL
def _get_vision_model() -> str:
"""Model for browser_vision (screenshot analysis — multimodal).""" """Model for browser_vision (screenshot analysis — multimodal)."""
return (os.getenv("AUXILIARY_VISION_MODEL", "").strip() return os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None
or _DEFAULT_VISION_MODEL
or "google/gemini-3-flash-preview")
def _get_extraction_model() -> str: def _get_extraction_model() -> Optional[str]:
"""Model for page snapshot text summarization — same as web_extract.""" """Model for page snapshot text summarization — same as web_extract."""
return (os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() return os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() or None
or _DEFAULT_TEXT_MODEL
or "google/gemini-3-flash-preview")
def _is_local_mode() -> bool: def _is_local_mode() -> bool:
@ -941,9 +918,6 @@ def _extract_relevant_content(
Falls back to simple truncation when no auxiliary text model is configured. Falls back to simple truncation when no auxiliary text model is configured.
""" """
if _aux_text_client is None:
return _truncate_snapshot(snapshot_text)
if user_task: if user_task:
extraction_prompt = ( extraction_prompt = (
f"You are a content extractor for a browser automation agent.\n\n" f"You are a content extractor for a browser automation agent.\n\n"
@ -968,13 +942,16 @@ def _extract_relevant_content(
) )
try: try:
from agent.auxiliary_client import auxiliary_max_tokens_param call_kwargs = {
response = _aux_text_client.chat.completions.create( "task": "web_extract",
model=_get_extraction_model(), "messages": [{"role": "user", "content": extraction_prompt}],
messages=[{"role": "user", "content": extraction_prompt}], "max_tokens": 4000,
**auxiliary_max_tokens_param(4000), "temperature": 0.1,
temperature=0.1, }
) model = _get_extraction_model()
if model:
call_kwargs["model"] = model
response = call_llm(**call_kwargs)
return response.choices[0].message.content return response.choices[0].message.content
except Exception: except Exception:
return _truncate_snapshot(snapshot_text) return _truncate_snapshot(snapshot_text)
@ -1497,14 +1474,6 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
effective_task_id = task_id or "default" effective_task_id = task_id or "default"
# Check auxiliary vision client
if _aux_vision_client is None or _DEFAULT_VISION_MODEL is None:
return json.dumps({
"success": False,
"error": "Browser vision unavailable: no auxiliary vision model configured. "
"Set OPENROUTER_API_KEY or configure Nous Portal to enable browser vision."
}, ensure_ascii=False)
# Save screenshot to persistent location so it can be shared with users # Save screenshot to persistent location so it can be shared with users
hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes")) hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
screenshots_dir = hermes_home / "browser_screenshots" screenshots_dir = hermes_home / "browser_screenshots"
@ -1562,14 +1531,13 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
f"Focus on answering the user's specific question." f"Focus on answering the user's specific question."
) )
# Use the sync auxiliary vision client directly # Use the centralized LLM router
from agent.auxiliary_client import auxiliary_max_tokens_param
vision_model = _get_vision_model() vision_model = _get_vision_model()
logger.debug("browser_vision: analysing screenshot (%d bytes) with model=%s", logger.debug("browser_vision: analysing screenshot (%d bytes)",
len(image_data), vision_model) len(image_data))
response = _aux_vision_client.chat.completions.create( call_kwargs = {
model=vision_model, "task": "vision",
messages=[ "messages": [
{ {
"role": "user", "role": "user",
"content": [ "content": [
@ -1578,9 +1546,12 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
], ],
} }
], ],
**auxiliary_max_tokens_param(2000), "max_tokens": 2000,
temperature=0.1, "temperature": 0.1,
) }
if vision_model:
call_kwargs["model"] = vision_model
response = call_llm(**call_kwargs)
analysis = response.choices[0].message.content analysis = response.choices[0].message.content
response_data = { response_data = {

View file

@ -456,17 +456,13 @@ class SamplingHandler:
# Resolve model # Resolve model
model = self._resolve_model(getattr(params, "modelPreferences", None)) model = self._resolve_model(getattr(params, "modelPreferences", None))
# Get auxiliary LLM client # Get auxiliary LLM client via centralized router
from agent.auxiliary_client import get_text_auxiliary_client from agent.auxiliary_client import call_llm
client, default_model = get_text_auxiliary_client()
if client is None:
self.metrics["errors"] += 1
return self._error("No LLM provider available for sampling")
resolved_model = model or default_model # Model whitelist check (we need to resolve model before calling)
resolved_model = model or self.model_override or ""
# Model whitelist check if self.allowed_models and resolved_model and resolved_model not in self.allowed_models:
if self.allowed_models and resolved_model not in self.allowed_models:
logger.warning( logger.warning(
"MCP server '%s' requested model '%s' not in allowed_models", "MCP server '%s' requested model '%s' not in allowed_models",
self.server_name, resolved_model, self.server_name, resolved_model,
@ -484,20 +480,15 @@ class SamplingHandler:
# Build LLM call kwargs # Build LLM call kwargs
max_tokens = min(params.maxTokens, self.max_tokens_cap) max_tokens = min(params.maxTokens, self.max_tokens_cap)
call_kwargs: dict = { call_temperature = None
"model": resolved_model,
"messages": messages,
"max_tokens": max_tokens,
}
if hasattr(params, "temperature") and params.temperature is not None: if hasattr(params, "temperature") and params.temperature is not None:
call_kwargs["temperature"] = params.temperature call_temperature = params.temperature
if stop := getattr(params, "stopSequences", None):
call_kwargs["stop"] = stop
# Forward server-provided tools # Forward server-provided tools
call_tools = None
server_tools = getattr(params, "tools", None) server_tools = getattr(params, "tools", None)
if server_tools: if server_tools:
call_kwargs["tools"] = [ call_tools = [
{ {
"type": "function", "type": "function",
"function": { "function": {
@ -508,9 +499,6 @@ class SamplingHandler:
} }
for t in server_tools for t in server_tools
] ]
if tool_choice := getattr(params, "toolChoice", None):
mode = getattr(tool_choice, "mode", "auto")
call_kwargs["tool_choice"] = {"auto": "auto", "required": "required", "none": "none"}.get(mode, "auto")
logger.log( logger.log(
self.audit_level, self.audit_level,
@ -520,7 +508,15 @@ class SamplingHandler:
# Offload sync LLM call to thread (non-blocking) # Offload sync LLM call to thread (non-blocking)
def _sync_call(): def _sync_call():
return client.chat.completions.create(**call_kwargs) return call_llm(
task="mcp",
model=resolved_model or None,
messages=messages,
temperature=call_temperature,
max_tokens=max_tokens,
tools=call_tools,
timeout=self.timeout,
)
try: try:
response = await asyncio.wait_for( response = await asyncio.wait_for(

View file

@ -22,13 +22,7 @@ import os
import logging import logging
from typing import Dict, Any, List, Optional, Union from typing import Dict, Any, List, Optional, Union
from openai import AsyncOpenAI, OpenAI from agent.auxiliary_client import async_call_llm
from agent.auxiliary_client import get_async_text_auxiliary_client
# Resolve the async auxiliary client at import time so we have the model slug.
# Handles Codex Responses API adapter transparently.
_async_aux_client, _SUMMARIZER_MODEL = get_async_text_auxiliary_client()
MAX_SESSION_CHARS = 100_000 MAX_SESSION_CHARS = 100_000
MAX_SUMMARY_TOKENS = 10000 MAX_SUMMARY_TOKENS = 10000
@ -156,26 +150,22 @@ async def _summarize_session(
f"Summarize this conversation with focus on: {query}" f"Summarize this conversation with focus on: {query}"
) )
if _async_aux_client is None or _SUMMARIZER_MODEL is None:
logging.warning("No auxiliary model available for session summarization")
return None
max_retries = 3 max_retries = 3
for attempt in range(max_retries): for attempt in range(max_retries):
try: try:
from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param response = await async_call_llm(
_extra = get_auxiliary_extra_body() task="session_search",
response = await _async_aux_client.chat.completions.create(
model=_SUMMARIZER_MODEL,
messages=[ messages=[
{"role": "system", "content": system_prompt}, {"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt}, {"role": "user", "content": user_prompt},
], ],
**({} if not _extra else {"extra_body": _extra}),
temperature=0.1, temperature=0.1,
**auxiliary_max_tokens_param(MAX_SUMMARY_TOKENS), max_tokens=MAX_SUMMARY_TOKENS,
) )
return response.choices[0].message.content.strip() return response.choices[0].message.content.strip()
except RuntimeError:
logging.warning("No auxiliary model available for session summarization")
return None
except Exception as e: except Exception as e:
if attempt < max_retries - 1: if attempt < max_retries - 1:
await asyncio.sleep(1 * (attempt + 1)) await asyncio.sleep(1 * (attempt + 1))
@ -333,8 +323,6 @@ def session_search(
def check_session_search_requirements() -> bool: def check_session_search_requirements() -> bool:
"""Requires SQLite state database and an auxiliary text model.""" """Requires SQLite state database and an auxiliary text model."""
if _async_aux_client is None:
return False
try: try:
from hermes_state import DEFAULT_DB_PATH from hermes_state import DEFAULT_DB_PATH
return DEFAULT_DB_PATH.parent.exists() return DEFAULT_DB_PATH.parent.exists()

View file

@ -936,13 +936,10 @@ def llm_audit_skill(skill_path: Path, static_result: ScanResult,
# Call the LLM via the centralized provider router # Call the LLM via the centralized provider router
try: try:
from agent.auxiliary_client import resolve_provider_client from agent.auxiliary_client import call_llm
client, _default_model = resolve_provider_client("openrouter") response = call_llm(
if client is None: provider="openrouter",
return static_result
response = client.chat.completions.create(
model=model, model=model,
messages=[{ messages=[{
"role": "user", "role": "user",

View file

@ -37,16 +37,11 @@ from pathlib import Path
from typing import Any, Awaitable, Dict, Optional from typing import Any, Awaitable, Dict, Optional
from urllib.parse import urlparse from urllib.parse import urlparse
import httpx import httpx
from agent.auxiliary_client import get_async_vision_auxiliary_client from agent.auxiliary_client import async_call_llm
from tools.debug_helpers import DebugSession from tools.debug_helpers import DebugSession
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Resolve vision auxiliary client at module level.
# Uses get_async_vision_auxiliary_client() which properly handles Codex
# routing (Responses API adapter) instead of raw AsyncOpenAI construction.
_aux_async_client, DEFAULT_VISION_MODEL = get_async_vision_auxiliary_client()
_debug = DebugSession("vision_tools", env_var="VISION_TOOLS_DEBUG") _debug = DebugSession("vision_tools", env_var="VISION_TOOLS_DEBUG")
@ -185,7 +180,7 @@ def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None)
async def vision_analyze_tool( async def vision_analyze_tool(
image_url: str, image_url: str,
user_prompt: str, user_prompt: str,
model: str = DEFAULT_VISION_MODEL, model: str = None,
) -> str: ) -> str:
""" """
Analyze an image from a URL or local file path using vision AI. Analyze an image from a URL or local file path using vision AI.
@ -245,15 +240,6 @@ async def vision_analyze_tool(
logger.info("Analyzing image: %s", image_url[:60]) logger.info("Analyzing image: %s", image_url[:60])
logger.info("User prompt: %s", user_prompt[:100]) logger.info("User prompt: %s", user_prompt[:100])
# Check auxiliary vision client availability
if _aux_async_client is None or DEFAULT_VISION_MODEL is None:
logger.error("Vision analysis unavailable: no auxiliary vision model configured")
return json.dumps({
"success": False,
"analysis": "Vision analysis unavailable: no auxiliary vision model configured. "
"Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools."
}, indent=2, ensure_ascii=False)
# Determine if this is a local file path or a remote URL # Determine if this is a local file path or a remote URL
local_path = Path(image_url) local_path = Path(image_url)
if local_path.is_file(): if local_path.is_file():
@ -309,18 +295,18 @@ async def vision_analyze_tool(
} }
] ]
logger.info("Processing image with %s...", model) logger.info("Processing image with vision model...")
# Call the vision API # Call the vision API via centralized router
from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param call_kwargs = {
_extra = get_auxiliary_extra_body() "task": "vision",
response = await _aux_async_client.chat.completions.create( "messages": messages,
model=model, "temperature": 0.1,
messages=messages, "max_tokens": 2000,
temperature=0.1, }
**auxiliary_max_tokens_param(2000), if model:
**({} if not _extra else {"extra_body": _extra}), call_kwargs["model"] = model
) response = await async_call_llm(**call_kwargs)
# Extract the analysis # Extract the analysis
analysis = response.choices[0].message.content.strip() analysis = response.choices[0].message.content.strip()
@ -391,7 +377,18 @@ async def vision_analyze_tool(
def check_vision_requirements() -> bool: def check_vision_requirements() -> bool:
"""Check if an auxiliary vision model is available.""" """Check if an auxiliary vision model is available."""
return _aux_async_client is not None try:
from agent.auxiliary_client import resolve_provider_client
client, _ = resolve_provider_client("openrouter")
if client is not None:
return True
client, _ = resolve_provider_client("nous")
if client is not None:
return True
client, _ = resolve_provider_client("custom")
return client is not None
except Exception:
return False
def get_debug_session_info() -> Dict[str, Any]: def get_debug_session_info() -> Dict[str, Any]:
@ -419,10 +416,9 @@ if __name__ == "__main__":
print("Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools.") print("Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools.")
exit(1) exit(1)
else: else:
print(f"✅ Vision model available: {DEFAULT_VISION_MODEL}") print("✅ Vision model available")
print("🛠️ Vision tools ready for use!") print("🛠️ Vision tools ready for use!")
print(f"🧠 Using model: {DEFAULT_VISION_MODEL}")
# Show debug mode status # Show debug mode status
if _debug.active: if _debug.active:
@ -489,9 +485,7 @@ def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]:
"Fully describe and explain everything about this image, then answer the " "Fully describe and explain everything about this image, then answer the "
f"following question:\n\n{question}" f"following question:\n\n{question}"
) )
model = (os.getenv("AUXILIARY_VISION_MODEL", "").strip() model = os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None
or DEFAULT_VISION_MODEL
or "google/gemini-3-flash-preview")
return vision_analyze_tool(image_url, full_prompt, model) return vision_analyze_tool(image_url, full_prompt, model)

View file

@ -47,8 +47,7 @@ import re
import asyncio import asyncio
from typing import List, Dict, Any, Optional from typing import List, Dict, Any, Optional
from firecrawl import Firecrawl from firecrawl import Firecrawl
from openai import AsyncOpenAI from agent.auxiliary_client import async_call_llm
from agent.auxiliary_client import get_async_text_auxiliary_client
from tools.debug_helpers import DebugSession from tools.debug_helpers import DebugSession
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -83,15 +82,8 @@ def _get_firecrawl_client():
DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000 DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000
# Resolve async auxiliary client at module level. # Allow per-task override via env var
# Handles Codex Responses API adapter transparently. DEFAULT_SUMMARIZER_MODEL = os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() or None
_aux_async_client, _DEFAULT_SUMMARIZER_MODEL = get_async_text_auxiliary_client("web_extract")
# Allow per-task override via config.yaml auxiliary.web_extract_model
DEFAULT_SUMMARIZER_MODEL = (
os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip()
or _DEFAULT_SUMMARIZER_MODEL
)
_debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG") _debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG")
@ -249,22 +241,22 @@ Create a markdown summary that captures all key information in a well-organized,
for attempt in range(max_retries): for attempt in range(max_retries):
try: try:
if _aux_async_client is None: call_kwargs = {
logger.warning("No auxiliary model available for web content processing") "task": "web_extract",
return None "messages": [
from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
_extra = get_auxiliary_extra_body()
response = await _aux_async_client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": system_prompt}, {"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt} {"role": "user", "content": user_prompt}
], ],
temperature=0.1, "temperature": 0.1,
**auxiliary_max_tokens_param(max_tokens), "max_tokens": max_tokens,
**({} if not _extra else {"extra_body": _extra}), }
) if model:
call_kwargs["model"] = model
response = await async_call_llm(**call_kwargs)
return response.choices[0].message.content.strip() return response.choices[0].message.content.strip()
except RuntimeError:
logger.warning("No auxiliary model available for web content processing")
return None
except Exception as api_error: except Exception as api_error:
last_error = api_error last_error = api_error
if attempt < max_retries - 1: if attempt < max_retries - 1:
@ -368,25 +360,18 @@ Synthesize these into ONE cohesive, comprehensive summary that:
Create a single, unified markdown summary.""" Create a single, unified markdown summary."""
try: try:
if _aux_async_client is None: call_kwargs = {
logger.warning("No auxiliary model for synthesis, concatenating summaries") "task": "web_extract",
fallback = "\n\n".join(summaries) "messages": [
if len(fallback) > max_output_size:
fallback = fallback[:max_output_size] + "\n\n[... truncated ...]"
return fallback
from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
_extra = get_auxiliary_extra_body()
response = await _aux_async_client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": "You synthesize multiple summaries into one cohesive, comprehensive summary. Be thorough but concise."}, {"role": "system", "content": "You synthesize multiple summaries into one cohesive, comprehensive summary. Be thorough but concise."},
{"role": "user", "content": synthesis_prompt} {"role": "user", "content": synthesis_prompt}
], ],
temperature=0.1, "temperature": 0.1,
**auxiliary_max_tokens_param(20000), "max_tokens": 20000,
**({} if not _extra else {"extra_body": _extra}), }
) if model:
call_kwargs["model"] = model
response = await async_call_llm(**call_kwargs)
final_summary = response.choices[0].message.content.strip() final_summary = response.choices[0].message.content.strip()
# Enforce hard cap # Enforce hard cap
@ -713,8 +698,8 @@ async def web_extract_tool(
debug_call_data["pages_extracted"] = pages_extracted debug_call_data["pages_extracted"] = pages_extracted
debug_call_data["original_response_size"] = len(json.dumps(response)) debug_call_data["original_response_size"] = len(json.dumps(response))
# Process each result with LLM if enabled and auxiliary client is available # Process each result with LLM if enabled
if use_llm_processing and _aux_async_client is not None: if use_llm_processing:
logger.info("Processing extracted content with LLM (parallel)...") logger.info("Processing extracted content with LLM (parallel)...")
debug_call_data["processing_applied"].append("llm_processing") debug_call_data["processing_applied"].append("llm_processing")
@ -780,10 +765,6 @@ async def web_extract_tool(
else: else:
logger.warning("%s (no content to process)", url) logger.warning("%s (no content to process)", url)
else: else:
if use_llm_processing and _aux_async_client is None:
logger.warning("LLM processing requested but no auxiliary model available, returning raw content")
debug_call_data["processing_applied"].append("llm_processing_unavailable")
# Print summary of extracted pages for debugging (original behavior) # Print summary of extracted pages for debugging (original behavior)
for result in response.get('results', []): for result in response.get('results', []):
url = result.get('url', 'Unknown URL') url = result.get('url', 'Unknown URL')
@ -1013,8 +994,8 @@ async def web_crawl_tool(
debug_call_data["pages_crawled"] = pages_crawled debug_call_data["pages_crawled"] = pages_crawled
debug_call_data["original_response_size"] = len(json.dumps(response)) debug_call_data["original_response_size"] = len(json.dumps(response))
# Process each result with LLM if enabled and auxiliary client is available # Process each result with LLM if enabled
if use_llm_processing and _aux_async_client is not None: if use_llm_processing:
logger.info("Processing crawled content with LLM (parallel)...") logger.info("Processing crawled content with LLM (parallel)...")
debug_call_data["processing_applied"].append("llm_processing") debug_call_data["processing_applied"].append("llm_processing")
@ -1080,10 +1061,6 @@ async def web_crawl_tool(
else: else:
logger.warning("%s (no content to process)", page_url) logger.warning("%s (no content to process)", page_url)
else: else:
if use_llm_processing and _aux_async_client is None:
logger.warning("LLM processing requested but no auxiliary model available, returning raw content")
debug_call_data["processing_applied"].append("llm_processing_unavailable")
# Print summary of crawled pages for debugging (original behavior) # Print summary of crawled pages for debugging (original behavior)
for result in response.get('results', []): for result in response.get('results', []):
page_url = result.get('url', 'Unknown URL') page_url = result.get('url', 'Unknown URL')
@ -1138,7 +1115,15 @@ def check_firecrawl_api_key() -> bool:
def check_auxiliary_model() -> bool: def check_auxiliary_model() -> bool:
"""Check if an auxiliary text model is available for LLM content processing.""" """Check if an auxiliary text model is available for LLM content processing."""
return _aux_async_client is not None try:
from agent.auxiliary_client import resolve_provider_client
for p in ("openrouter", "nous", "custom", "codex"):
client, _ = resolve_provider_client(p)
if client is not None:
return True
return False
except Exception:
return False
def get_debug_session_info() -> Dict[str, Any]: def get_debug_session_info() -> Dict[str, Any]:

View file

@ -344,28 +344,32 @@ class TrajectoryCompressor:
raise RuntimeError(f"Failed to load tokenizer '{self.config.tokenizer_name}': {e}") raise RuntimeError(f"Failed to load tokenizer '{self.config.tokenizer_name}': {e}")
def _init_summarizer(self): def _init_summarizer(self):
"""Initialize LLM client for summarization (sync and async). """Initialize LLM routing for summarization (sync and async).
Routes through the centralized provider router for known providers Uses call_llm/async_call_llm from the centralized provider router
(OpenRouter, Nous, Codex, etc.) so auth and headers are handled which handles auth, headers, and provider detection internally.
consistently. Falls back to raw construction for custom endpoints. For custom endpoints, falls back to raw client construction.
""" """
from agent.auxiliary_client import resolve_provider_client from agent.auxiliary_client import call_llm, async_call_llm
provider = self._detect_provider() provider = self._detect_provider()
if provider: if provider:
# Use centralized router — handles auth, headers, Codex adapter # Store provider for use in _generate_summary calls
self.client, _ = resolve_provider_client( self._llm_provider = provider
self._use_call_llm = True
# Verify the provider is available
from agent.auxiliary_client import resolve_provider_client
client, _ = resolve_provider_client(
provider, model=self.config.summarization_model) provider, model=self.config.summarization_model)
self.async_client, _ = resolve_provider_client( if client is None:
provider, model=self.config.summarization_model,
async_mode=True)
if self.client is None:
raise RuntimeError( raise RuntimeError(
f"Provider '{provider}' is not configured. " f"Provider '{provider}' is not configured. "
f"Check your API key or run: hermes setup") f"Check your API key or run: hermes setup")
self.client = None # Not used directly
self.async_client = None # Not used directly
else: else:
# Custom endpoint — use config's raw base_url + api_key_env # Custom endpoint — use config's raw base_url + api_key_env
self._use_call_llm = False
api_key = os.getenv(self.config.api_key_env) api_key = os.getenv(self.config.api_key_env)
if not api_key: if not api_key:
raise RuntimeError( raise RuntimeError(
@ -524,12 +528,22 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
try: try:
metrics.summarization_api_calls += 1 metrics.summarization_api_calls += 1
response = self.client.chat.completions.create( if getattr(self, '_use_call_llm', False):
model=self.config.summarization_model, from agent.auxiliary_client import call_llm
messages=[{"role": "user", "content": prompt}], response = call_llm(
temperature=self.config.temperature, provider=self._llm_provider,
max_tokens=self.config.summary_target_tokens * 2, model=self.config.summarization_model,
) messages=[{"role": "user", "content": prompt}],
temperature=self.config.temperature,
max_tokens=self.config.summary_target_tokens * 2,
)
else:
response = self.client.chat.completions.create(
model=self.config.summarization_model,
messages=[{"role": "user", "content": prompt}],
temperature=self.config.temperature,
max_tokens=self.config.summary_target_tokens * 2,
)
summary = response.choices[0].message.content.strip() summary = response.choices[0].message.content.strip()
@ -581,12 +595,22 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
try: try:
metrics.summarization_api_calls += 1 metrics.summarization_api_calls += 1
response = await self.async_client.chat.completions.create( if getattr(self, '_use_call_llm', False):
model=self.config.summarization_model, from agent.auxiliary_client import async_call_llm
messages=[{"role": "user", "content": prompt}], response = await async_call_llm(
temperature=self.config.temperature, provider=self._llm_provider,
max_tokens=self.config.summary_target_tokens * 2, model=self.config.summarization_model,
) messages=[{"role": "user", "content": prompt}],
temperature=self.config.temperature,
max_tokens=self.config.summary_target_tokens * 2,
)
else:
response = await self.async_client.chat.completions.create(
model=self.config.summarization_model,
messages=[{"role": "user", "content": prompt}],
temperature=self.config.temperature,
max_tokens=self.config.summary_target_tokens * 2,
)
summary = response.choices[0].message.content.strip() summary = response.choices[0].message.content.strip()