feat: call_llm/async_call_llm + config slots + migrate all consumers

Add centralized call_llm() and async_call_llm() functions that own the
full LLM request lifecycle:
  1. Resolve provider + model from task config or explicit args
  2. Get or create a cached client for that provider
  3. Format request args (max_tokens handling, provider extra_body)
  4. Make the API call with max_tokens/max_completion_tokens retry
  5. Return the response

Config: expanded auxiliary section with provider:model slots for all
tasks (compression, vision, web_extract, session_search, skills_hub,
mcp, flush_memories). Config version bumped to 7.

Migrated all auxiliary consumers:
- context_compressor.py: uses call_llm(task='compression')
- vision_tools.py: uses async_call_llm(task='vision')
- web_tools.py: uses async_call_llm(task='web_extract')
- session_search_tool.py: uses async_call_llm(task='session_search')
- browser_tool.py: uses call_llm(task='vision'/'web_extract')
- mcp_tool.py: uses call_llm(task='mcp')
- skills_guard.py: uses call_llm(provider='openrouter')
- run_agent.py flush_memories: uses call_llm(task='flush_memories')

Tests updated for context_compressor and MCP tool. Some test mocks
still need updating (15 remaining failures from mock pattern changes,
2 pre-existing).
This commit is contained in:
teknium1 2026-03-11 20:52:19 -07:00
parent 013cc4d2fc
commit 0aa31cd3cb
13 changed files with 552 additions and 375 deletions

View file

@ -784,3 +784,253 @@ def auxiliary_max_tokens_param(value: int) -> dict:
and "api.openai.com" in custom_base.lower()): and "api.openai.com" in custom_base.lower()):
return {"max_completion_tokens": value} return {"max_completion_tokens": value}
return {"max_tokens": value} return {"max_tokens": value}
# ── Centralized LLM Call API ────────────────────────────────────────────────
#
# call_llm() and async_call_llm() own the full request lifecycle:
# 1. Resolve provider + model from task config (or explicit args)
# 2. Get or create a cached client for that provider
# 3. Format request args for the provider + model (max_tokens handling, etc.)
# 4. Make the API call
# 5. Return the response
#
# Every auxiliary LLM consumer should use these instead of manually
# constructing clients and calling .chat.completions.create().

# Client cache: (provider, async_mode) -> (client, default_model).
# Keyed on async_mode too because sync and async clients are distinct objects.
# Only successfully-created clients are stored (see _get_cached_client), so a
# failed provider is re-attempted on the next call.
_client_cache: Dict[Tuple[str, bool], Tuple[Any, Optional[str]]] = {}
def _get_cached_client(
    provider: str, model: str = None, async_mode: bool = False,
) -> Tuple[Optional[Any], Optional[str]]:
    """Return a (client, model) pair for ``provider``, reusing cached clients.

    Clients are cached per (provider, async_mode). An explicit ``model``
    argument always wins over the cached provider default.
    """
    key = (provider, async_mode)
    cached = _client_cache.get(key)
    if cached is not None:
        client, default_model = cached
    else:
        client, default_model = resolve_provider_client(provider, model, async_mode)
        # Cache only usable clients so a later call can retry creation.
        if client is not None:
            _client_cache[key] = (client, default_model)
    return client, model or default_model
def _resolve_task_provider_model(
task: str = None,
provider: str = None,
model: str = None,
) -> Tuple[str, Optional[str]]:
"""Determine provider + model for a call.
Priority:
1. Explicit provider/model args (always win)
2. Env var overrides (AUXILIARY_{TASK}_PROVIDER, etc.)
3. Config file (auxiliary.{task}.provider/model or compression.*)
4. "auto" (full auto-detection chain)
Returns (provider, model) where model may be None (use provider default).
"""
if provider:
return provider, model
if task:
# Check env var overrides first
env_provider = _get_auxiliary_provider(task)
if env_provider != "auto":
# Check for env var model override too
env_model = None
for prefix in ("AUXILIARY_", "CONTEXT_"):
val = os.getenv(f"{prefix}{task.upper()}_MODEL", "").strip()
if val:
env_model = val
break
return env_provider, model or env_model
# Read from config file
try:
from hermes_cli.config import load_config
config = load_config()
except ImportError:
return "auto", model
# Check auxiliary.{task} section
aux = config.get("auxiliary", {})
task_config = aux.get(task, {})
cfg_provider = task_config.get("provider", "").strip() or None
cfg_model = task_config.get("model", "").strip() or None
# Backwards compat: compression section has its own keys
if task == "compression" and not cfg_provider:
comp = config.get("compression", {})
cfg_provider = comp.get("summary_provider", "").strip() or None
cfg_model = cfg_model or comp.get("summary_model", "").strip() or None
if cfg_provider and cfg_provider != "auto":
return cfg_provider, model or cfg_model
return "auto", model or cfg_model
return "auto", model
def _build_call_kwargs(
provider: str,
model: str,
messages: list,
temperature: Optional[float] = None,
max_tokens: Optional[int] = None,
tools: Optional[list] = None,
timeout: float = 30.0,
extra_body: Optional[dict] = None,
) -> dict:
"""Build kwargs for .chat.completions.create() with model/provider adjustments."""
kwargs: Dict[str, Any] = {
"model": model,
"messages": messages,
"timeout": timeout,
}
if temperature is not None:
kwargs["temperature"] = temperature
if max_tokens is not None:
# Codex adapter handles max_tokens internally; OpenRouter/Nous use max_tokens.
# Direct OpenAI api.openai.com with newer models needs max_completion_tokens.
if provider == "custom":
custom_base = os.getenv("OPENAI_BASE_URL", "")
if "api.openai.com" in custom_base.lower():
kwargs["max_completion_tokens"] = max_tokens
else:
kwargs["max_tokens"] = max_tokens
else:
kwargs["max_tokens"] = max_tokens
if tools:
kwargs["tools"] = tools
# Provider-specific extra_body
merged_extra = dict(extra_body or {})
if provider == "nous" or auxiliary_is_nous:
merged_extra.setdefault("tags", []).extend(["product=hermes-agent"])
if merged_extra:
kwargs["extra_body"] = merged_extra
return kwargs
def call_llm(
    task: str = None,
    *,
    provider: str = None,
    model: str = None,
    messages: list,
    temperature: float = None,
    max_tokens: int = None,
    tools: list = None,
    timeout: float = 30.0,
    extra_body: dict = None,
) -> Any:
    """Centralized synchronous LLM call.

    Resolves provider + model (from task config, explicit args, or auto-detect),
    handles auth, request formatting, and model-specific arg adjustments.

    Args:
        task: Auxiliary task name ("compression", "vision", "web_extract",
            "session_search", "skills_hub", "mcp", "flush_memories").
            Reads provider:model from config/env. Ignored if provider is set.
        provider: Explicit provider override.
        model: Explicit model override.
        messages: Chat messages list.
        temperature: Sampling temperature (None = provider default).
        max_tokens: Max output tokens (handles max_tokens vs max_completion_tokens).
        tools: Tool definitions (for function calling).
        timeout: Request timeout in seconds.
        extra_body: Additional request body fields.

    Returns:
        Response object with .choices[0].message.content

    Raises:
        RuntimeError: If no provider is configured.
    """
    resolved_provider, resolved_model = _resolve_task_provider_model(
        task, provider, model)
    # Track the provider actually used: request kwargs must be formatted for
    # the fallback provider, not the one that failed to produce a client.
    active_provider = resolved_provider
    client, final_model = _get_cached_client(resolved_provider, resolved_model)
    if client is None:
        # Fallback: try openrouter
        if resolved_provider != "openrouter":
            logger.warning("Provider %s unavailable, falling back to openrouter",
                           resolved_provider)
        active_provider = "openrouter"
        client, final_model = _get_cached_client(
            "openrouter", resolved_model or _OPENROUTER_MODEL)
        if client is None:
            raise RuntimeError(
                f"No LLM provider configured for task={task} provider={resolved_provider}. "
                f"Run: hermes setup")
    kwargs = _build_call_kwargs(
        active_provider, final_model, messages,
        temperature=temperature, max_tokens=max_tokens,
        tools=tools, timeout=timeout, extra_body=extra_body)
    # Handle max_tokens vs max_completion_tokens retry. Only retry when the
    # caller actually passed max_tokens — otherwise we'd send
    # max_completion_tokens=None, which is itself an invalid request.
    try:
        return client.chat.completions.create(**kwargs)
    except Exception as first_err:
        err_str = str(first_err)
        if max_tokens is not None and (
                "max_tokens" in err_str or "unsupported_parameter" in err_str):
            kwargs.pop("max_tokens", None)
            kwargs["max_completion_tokens"] = max_tokens
            return client.chat.completions.create(**kwargs)
        raise
async def async_call_llm(
    task: str = None,
    *,
    provider: str = None,
    model: str = None,
    messages: list,
    temperature: float = None,
    max_tokens: int = None,
    tools: list = None,
    timeout: float = 30.0,
    extra_body: dict = None,
) -> Any:
    """Centralized asynchronous LLM call.

    Same as call_llm() but async. See call_llm() for full documentation.

    Raises:
        RuntimeError: If no provider is configured.
    """
    resolved_provider, resolved_model = _resolve_task_provider_model(
        task, provider, model)
    # Track the provider actually used: request kwargs must be formatted for
    # the fallback provider, not the one that failed to produce a client.
    active_provider = resolved_provider
    client, final_model = _get_cached_client(
        resolved_provider, resolved_model, async_mode=True)
    if client is None:
        if resolved_provider != "openrouter":
            logger.warning("Provider %s unavailable, falling back to openrouter",
                           resolved_provider)
        active_provider = "openrouter"
        client, final_model = _get_cached_client(
            "openrouter", resolved_model or _OPENROUTER_MODEL,
            async_mode=True)
        if client is None:
            raise RuntimeError(
                f"No LLM provider configured for task={task} provider={resolved_provider}. "
                f"Run: hermes setup")
    kwargs = _build_call_kwargs(
        active_provider, final_model, messages,
        temperature=temperature, max_tokens=max_tokens,
        tools=tools, timeout=timeout, extra_body=extra_body)
    # Only retry with max_completion_tokens when max_tokens was actually set;
    # otherwise the retry would send max_completion_tokens=None.
    try:
        return await client.chat.completions.create(**kwargs)
    except Exception as first_err:
        err_str = str(first_err)
        if max_tokens is not None and (
                "max_tokens" in err_str or "unsupported_parameter" in err_str):
            kwargs.pop("max_tokens", None)
            kwargs["max_completion_tokens"] = max_tokens
            return await client.chat.completions.create(**kwargs)
        raise

View file

@ -9,7 +9,7 @@ import logging
import os import os
from typing import Any, Dict, List, Optional from typing import Any, Dict, List, Optional
from agent.auxiliary_client import get_text_auxiliary_client from agent.auxiliary_client import call_llm
from agent.model_metadata import ( from agent.model_metadata import (
get_model_context_length, get_model_context_length,
estimate_messages_tokens_rough, estimate_messages_tokens_rough,
@ -53,8 +53,7 @@ class ContextCompressor:
self.last_completion_tokens = 0 self.last_completion_tokens = 0
self.last_total_tokens = 0 self.last_total_tokens = 0
self.client, default_model = get_text_auxiliary_client("compression") self.summary_model = summary_model_override or ""
self.summary_model = summary_model_override or default_model
def update_from_response(self, usage: Dict[str, Any]): def update_from_response(self, usage: Dict[str, Any]):
"""Update tracked token usage from API response.""" """Update tracked token usage from API response."""
@ -120,73 +119,30 @@ TURNS TO SUMMARIZE:
Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix.""" Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
# 1. Try the auxiliary model (cheap/fast) # Use the centralized LLM router — handles provider resolution,
if self.client: # auth, and fallback internally.
try:
return self._call_summary_model(self.client, self.summary_model, prompt)
except Exception as e:
logging.warning(f"Failed to generate context summary with auxiliary model: {e}")
# 2. Fallback: re-try via the centralized provider router.
# This covers all configured providers (Codex OAuth, API-key
# providers, etc.) without ad-hoc env var lookups.
from agent.auxiliary_client import resolve_provider_client
fallback_providers = ["custom", "openrouter", "nous", "codex"]
for fb_provider in fallback_providers:
try:
fb_client, fb_model = resolve_provider_client(
fb_provider, model=self.model)
if fb_client is None:
continue
# Don't retry the same client that just failed
if (self.client is not None
and hasattr(fb_client, "base_url")
and hasattr(self.client, "base_url")
and str(fb_client.base_url) == str(self.client.base_url)):
continue
logger.info("Retrying context summary with fallback provider "
"%s (%s)", fb_provider, fb_model)
summary = self._call_summary_model(fb_client, fb_model, prompt)
# Promote successful fallback for future compressions
self.client = fb_client
self.summary_model = fb_model
return summary
except Exception as fallback_err:
logging.warning("Fallback provider %s failed: %s",
fb_provider, fallback_err)
# 3. All providers failed — return None so the caller drops turns
# without a summary.
logging.warning("Context compression: no provider available for "
"summary. Middle turns will be dropped without summary.")
return None
def _call_summary_model(self, client, model: str, prompt: str) -> str:
"""Make the actual LLM call to generate a summary. Raises on failure."""
kwargs = {
"model": model,
"messages": [{"role": "user", "content": prompt}],
"temperature": 0.3,
"timeout": 30.0,
}
# Most providers (OpenRouter, local models) use max_tokens.
# Direct OpenAI with newer models (gpt-4o, o-series, gpt-5+)
# requires max_completion_tokens instead.
try: try:
kwargs["max_tokens"] = self.summary_target_tokens * 2 call_kwargs = {
response = client.chat.completions.create(**kwargs) "task": "compression",
except Exception as first_err: "messages": [{"role": "user", "content": prompt}],
if "max_tokens" in str(first_err) or "unsupported_parameter" in str(first_err): "temperature": 0.3,
kwargs.pop("max_tokens", None) "max_tokens": self.summary_target_tokens * 2,
kwargs["max_completion_tokens"] = self.summary_target_tokens * 2 "timeout": 30.0,
response = client.chat.completions.create(**kwargs) }
else: if self.summary_model:
raise call_kwargs["model"] = self.summary_model
response = call_llm(**call_kwargs)
summary = response.choices[0].message.content.strip() summary = response.choices[0].message.content.strip()
if not summary.startswith("[CONTEXT SUMMARY]:"): if not summary.startswith("[CONTEXT SUMMARY]:"):
summary = "[CONTEXT SUMMARY]: " + summary summary = "[CONTEXT SUMMARY]: " + summary
return summary return summary
except RuntimeError:
logging.warning("Context compression: no provider available for "
"summary. Middle turns will be dropped without summary.")
return None
except Exception as e:
logging.warning("Failed to generate context summary: %s", e)
return None
# ------------------------------------------------------------------ # ------------------------------------------------------------------
# Tool-call / tool-result pair integrity helpers # Tool-call / tool-result pair integrity helpers

View file

@ -125,17 +125,41 @@ DEFAULT_CONFIG = {
"summary_provider": "auto", "summary_provider": "auto",
}, },
# Auxiliary model overrides (advanced). By default Hermes auto-selects # Auxiliary model config — provider:model for each side task.
# the provider and model for each side task. Set these to override. # Format: provider is the provider name, model is the model slug.
# "auto" for provider = auto-detect best available provider.
# Empty model = use provider's default auxiliary model.
# All tasks fall back to openrouter:google/gemini-3-flash-preview if
# the configured provider is unavailable.
"auxiliary": { "auxiliary": {
"vision": { "vision": {
"provider": "auto", # auto | openrouter | nous | main "provider": "auto", # auto | openrouter | nous | codex | custom
"model": "", # e.g. "google/gemini-2.5-flash", "gpt-4o" "model": "", # e.g. "google/gemini-2.5-flash", "gpt-4o"
}, },
"web_extract": { "web_extract": {
"provider": "auto", "provider": "auto",
"model": "", "model": "",
}, },
"compression": {
"provider": "auto",
"model": "",
},
"session_search": {
"provider": "auto",
"model": "",
},
"skills_hub": {
"provider": "auto",
"model": "",
},
"mcp": {
"provider": "auto",
"model": "",
},
"flush_memories": {
"provider": "auto",
"model": "",
},
}, },
"display": { "display": {
@ -217,7 +241,7 @@ DEFAULT_CONFIG = {
"personalities": {}, "personalities": {},
# Config schema version - bump this when adding new required fields # Config schema version - bump this when adding new required fields
"_config_version": 6, "_config_version": 7,
} }
# ============================================================================= # =============================================================================

View file

@ -2623,19 +2623,22 @@ class AIAgent:
# Use auxiliary client for the flush call when available -- # Use auxiliary client for the flush call when available --
# it's cheaper and avoids Codex Responses API incompatibility. # it's cheaper and avoids Codex Responses API incompatibility.
from agent.auxiliary_client import get_text_auxiliary_client from agent.auxiliary_client import call_llm as _call_llm
aux_client, aux_model = get_text_auxiliary_client() _aux_available = True
try:
response = _call_llm(
task="flush_memories",
messages=api_messages,
tools=[memory_tool_def],
temperature=0.3,
max_tokens=5120,
timeout=30.0,
)
except RuntimeError:
_aux_available = False
response = None
if aux_client: if not _aux_available and self.api_mode == "codex_responses":
api_kwargs = {
"model": aux_model,
"messages": api_messages,
"tools": [memory_tool_def],
"temperature": 0.3,
"max_tokens": 5120,
}
response = aux_client.chat.completions.create(**api_kwargs, timeout=30.0)
elif self.api_mode == "codex_responses":
# No auxiliary client -- use the Codex Responses path directly # No auxiliary client -- use the Codex Responses path directly
codex_kwargs = self._build_api_kwargs(api_messages) codex_kwargs = self._build_api_kwargs(api_messages)
codex_kwargs["tools"] = self._responses_tools([memory_tool_def]) codex_kwargs["tools"] = self._responses_tools([memory_tool_def])
@ -2643,7 +2646,7 @@ class AIAgent:
if "max_output_tokens" in codex_kwargs: if "max_output_tokens" in codex_kwargs:
codex_kwargs["max_output_tokens"] = 5120 codex_kwargs["max_output_tokens"] = 5120
response = self._run_codex_stream(codex_kwargs) response = self._run_codex_stream(codex_kwargs)
else: elif not _aux_available:
api_kwargs = { api_kwargs = {
"model": self.model, "model": self.model,
"messages": api_messages, "messages": api_messages,
@ -2655,7 +2658,7 @@ class AIAgent:
# Extract tool calls from the response, handling both API formats # Extract tool calls from the response, handling both API formats
tool_calls = [] tool_calls = []
if self.api_mode == "codex_responses" and not aux_client: if self.api_mode == "codex_responses" and not _aux_available:
assistant_msg, _ = self._normalize_codex_response(response) assistant_msg, _ = self._normalize_codex_response(response)
if assistant_msg and assistant_msg.tool_calls: if assistant_msg and assistant_msg.tool_calls:
tool_calls = assistant_msg.tool_calls tool_calls = assistant_msg.tool_calls

View file

@ -9,8 +9,7 @@ from agent.context_compressor import ContextCompressor
@pytest.fixture() @pytest.fixture()
def compressor(): def compressor():
"""Create a ContextCompressor with mocked dependencies.""" """Create a ContextCompressor with mocked dependencies."""
with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ with patch("agent.context_compressor.get_model_context_length", return_value=100000):
patch("agent.context_compressor.get_text_auxiliary_client", return_value=(None, None)):
c = ContextCompressor( c = ContextCompressor(
model="test/model", model="test/model",
threshold_percent=0.85, threshold_percent=0.85,
@ -119,14 +118,11 @@ class TestGenerateSummaryNoneContent:
"""Regression: content=None (from tool-call-only assistant messages) must not crash.""" """Regression: content=None (from tool-call-only assistant messages) must not crash."""
def test_none_content_does_not_crash(self): def test_none_content_does_not_crash(self):
mock_client = MagicMock()
mock_response = MagicMock() mock_response = MagicMock()
mock_response.choices = [MagicMock()] mock_response.choices = [MagicMock()]
mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: tool calls happened" mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: tool calls happened"
mock_client.chat.completions.create.return_value = mock_response
with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ with patch("agent.context_compressor.get_model_context_length", return_value=100000):
patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
c = ContextCompressor(model="test", quiet_mode=True) c = ContextCompressor(model="test", quiet_mode=True)
messages = [ messages = [
@ -139,14 +135,14 @@ class TestGenerateSummaryNoneContent:
{"role": "user", "content": "thanks"}, {"role": "user", "content": "thanks"},
] ]
summary = c._generate_summary(messages) with patch("agent.context_compressor.call_llm", return_value=mock_response):
summary = c._generate_summary(messages)
assert isinstance(summary, str) assert isinstance(summary, str)
assert "CONTEXT SUMMARY" in summary assert "CONTEXT SUMMARY" in summary
def test_none_content_in_system_message_compress(self): def test_none_content_in_system_message_compress(self):
"""System message with content=None should not crash during compress.""" """System message with content=None should not crash during compress."""
with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ with patch("agent.context_compressor.get_model_context_length", return_value=100000):
patch("agent.context_compressor.get_text_auxiliary_client", return_value=(None, None)):
c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2) c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2)
msgs = [{"role": "system", "content": None}] + [ msgs = [{"role": "system", "content": None}] + [
@ -165,12 +161,12 @@ class TestCompressWithClient:
mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened" mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened"
mock_client.chat.completions.create.return_value = mock_response mock_client.chat.completions.create.return_value = mock_response
with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ with patch("agent.context_compressor.get_model_context_length", return_value=100000):
patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
c = ContextCompressor(model="test", quiet_mode=True) c = ContextCompressor(model="test", quiet_mode=True)
msgs = [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"} for i in range(10)] msgs = [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"} for i in range(10)]
result = c.compress(msgs) with patch("agent.context_compressor.call_llm", return_value=mock_response):
result = c.compress(msgs)
# Should have summary message in the middle # Should have summary message in the middle
contents = [m.get("content", "") for m in result] contents = [m.get("content", "") for m in result]
@ -184,8 +180,7 @@ class TestCompressWithClient:
mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compressed middle" mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compressed middle"
mock_client.chat.completions.create.return_value = mock_response mock_client.chat.completions.create.return_value = mock_response
with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ with patch("agent.context_compressor.get_model_context_length", return_value=100000):
patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
c = ContextCompressor( c = ContextCompressor(
model="test", model="test",
quiet_mode=True, quiet_mode=True,
@ -212,7 +207,8 @@ class TestCompressWithClient:
{"role": "user", "content": "later 4"}, {"role": "user", "content": "later 4"},
] ]
result = c.compress(msgs) with patch("agent.context_compressor.call_llm", return_value=mock_response):
result = c.compress(msgs)
answered_ids = { answered_ids = {
msg.get("tool_call_id") msg.get("tool_call_id")
@ -232,8 +228,7 @@ class TestCompressWithClient:
mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened" mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened"
mock_client.chat.completions.create.return_value = mock_response mock_client.chat.completions.create.return_value = mock_response
with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ with patch("agent.context_compressor.get_model_context_length", return_value=100000):
patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2) c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2)
# Last head message (index 1) is "assistant" → summary should be "user" # Last head message (index 1) is "assistant" → summary should be "user"
@ -245,7 +240,8 @@ class TestCompressWithClient:
{"role": "user", "content": "msg 4"}, {"role": "user", "content": "msg 4"},
{"role": "assistant", "content": "msg 5"}, {"role": "assistant", "content": "msg 5"},
] ]
result = c.compress(msgs) with patch("agent.context_compressor.call_llm", return_value=mock_response):
result = c.compress(msgs)
summary_msg = [m for m in result if "CONTEXT SUMMARY" in (m.get("content") or "")] summary_msg = [m for m in result if "CONTEXT SUMMARY" in (m.get("content") or "")]
assert len(summary_msg) == 1 assert len(summary_msg) == 1
assert summary_msg[0]["role"] == "user" assert summary_msg[0]["role"] == "user"
@ -258,8 +254,7 @@ class TestCompressWithClient:
mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened" mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened"
mock_client.chat.completions.create.return_value = mock_response mock_client.chat.completions.create.return_value = mock_response
with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ with patch("agent.context_compressor.get_model_context_length", return_value=100000):
patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=3, protect_last_n=2) c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=3, protect_last_n=2)
# Last head message (index 2) is "user" → summary should be "assistant" # Last head message (index 2) is "user" → summary should be "assistant"
@ -273,20 +268,18 @@ class TestCompressWithClient:
{"role": "user", "content": "msg 6"}, {"role": "user", "content": "msg 6"},
{"role": "assistant", "content": "msg 7"}, {"role": "assistant", "content": "msg 7"},
] ]
result = c.compress(msgs) with patch("agent.context_compressor.call_llm", return_value=mock_response):
result = c.compress(msgs)
summary_msg = [m for m in result if "CONTEXT SUMMARY" in (m.get("content") or "")] summary_msg = [m for m in result if "CONTEXT SUMMARY" in (m.get("content") or "")]
assert len(summary_msg) == 1 assert len(summary_msg) == 1
assert summary_msg[0]["role"] == "assistant" assert summary_msg[0]["role"] == "assistant"
def test_summarization_does_not_start_tail_with_tool_outputs(self): def test_summarization_does_not_start_tail_with_tool_outputs(self):
mock_client = MagicMock()
mock_response = MagicMock() mock_response = MagicMock()
mock_response.choices = [MagicMock()] mock_response.choices = [MagicMock()]
mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compressed middle" mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compressed middle"
mock_client.chat.completions.create.return_value = mock_response
with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ with patch("agent.context_compressor.get_model_context_length", return_value=100000):
patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
c = ContextCompressor( c = ContextCompressor(
model="test", model="test",
quiet_mode=True, quiet_mode=True,
@ -309,7 +302,8 @@ class TestCompressWithClient:
{"role": "user", "content": "latest user"}, {"role": "user", "content": "latest user"},
] ]
result = c.compress(msgs) with patch("agent.context_compressor.call_llm", return_value=mock_response):
result = c.compress(msgs)
called_ids = { called_ids = {
tc["id"] tc["id"]

View file

@ -1828,8 +1828,8 @@ class TestSamplingCallbackText:
) )
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
params = _make_sampling_params() params = _make_sampling_params()
result = asyncio.run(self.handler(None, params)) result = asyncio.run(self.handler(None, params))
@ -1847,13 +1847,13 @@ class TestSamplingCallbackText:
fake_client.chat.completions.create.return_value = _make_llm_response() fake_client.chat.completions.create.return_value = _make_llm_response()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ) as mock_call:
params = _make_sampling_params(system_prompt="Be helpful") params = _make_sampling_params(system_prompt="Be helpful")
asyncio.run(self.handler(None, params)) asyncio.run(self.handler(None, params))
call_args = fake_client.chat.completions.create.call_args call_args = mock_call.call_args
messages = call_args.kwargs["messages"] messages = call_args.kwargs["messages"]
assert messages[0] == {"role": "system", "content": "Be helpful"} assert messages[0] == {"role": "system", "content": "Be helpful"}
@ -1865,8 +1865,8 @@ class TestSamplingCallbackText:
) )
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
params = _make_sampling_params() params = _make_sampling_params()
result = asyncio.run(self.handler(None, params)) result = asyncio.run(self.handler(None, params))
@ -1889,8 +1889,8 @@ class TestSamplingCallbackToolUse:
fake_client.chat.completions.create.return_value = _make_llm_tool_response() fake_client.chat.completions.create.return_value = _make_llm_tool_response()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
params = _make_sampling_params() params = _make_sampling_params()
result = asyncio.run(self.handler(None, params)) result = asyncio.run(self.handler(None, params))
@ -1916,8 +1916,8 @@ class TestSamplingCallbackToolUse:
) )
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
result = asyncio.run(self.handler(None, _make_sampling_params())) result = asyncio.run(self.handler(None, _make_sampling_params()))
@ -1939,8 +1939,8 @@ class TestToolLoopGovernance:
fake_client.chat.completions.create.return_value = _make_llm_tool_response() fake_client.chat.completions.create.return_value = _make_llm_tool_response()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
params = _make_sampling_params() params = _make_sampling_params()
# Round 1, 2: allowed # Round 1, 2: allowed
@ -1959,8 +1959,8 @@ class TestToolLoopGovernance:
fake_client = MagicMock() fake_client = MagicMock()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
# Tool response (round 1 of 1 allowed) # Tool response (round 1 of 1 allowed)
fake_client.chat.completions.create.return_value = _make_llm_tool_response() fake_client.chat.completions.create.return_value = _make_llm_tool_response()
@ -1984,8 +1984,8 @@ class TestToolLoopGovernance:
fake_client.chat.completions.create.return_value = _make_llm_tool_response() fake_client.chat.completions.create.return_value = _make_llm_tool_response()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
assert isinstance(result, ErrorData) assert isinstance(result, ErrorData)
@ -2003,8 +2003,8 @@ class TestSamplingErrors:
fake_client.chat.completions.create.return_value = _make_llm_response() fake_client.chat.completions.create.return_value = _make_llm_response()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
# First call succeeds # First call succeeds
r1 = asyncio.run(handler(None, _make_sampling_params())) r1 = asyncio.run(handler(None, _make_sampling_params()))
@ -2017,20 +2017,16 @@ class TestSamplingErrors:
def test_timeout_error(self): def test_timeout_error(self):
handler = SamplingHandler("to", {"timeout": 0.05}) handler = SamplingHandler("to", {"timeout": 0.05})
fake_client = MagicMock()
def slow_call(**kwargs): def slow_call(**kwargs):
import threading import threading
# Use an event to ensure the thread truly blocks long enough
evt = threading.Event() evt = threading.Event()
evt.wait(5) # blocks for up to 5 seconds (cancelled by timeout) evt.wait(5) # blocks for up to 5 seconds (cancelled by timeout)
return _make_llm_response() return _make_llm_response()
fake_client.chat.completions.create.side_effect = slow_call
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), side_effect=slow_call,
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
assert isinstance(result, ErrorData) assert isinstance(result, ErrorData)
@ -2041,12 +2037,11 @@ class TestSamplingErrors:
handler = SamplingHandler("np", {}) handler = SamplingHandler("np", {})
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(None, None), side_effect=RuntimeError("No LLM provider configured"),
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
assert isinstance(result, ErrorData) assert isinstance(result, ErrorData)
assert "No LLM provider" in result.message
assert handler.metrics["errors"] == 1 assert handler.metrics["errors"] == 1
def test_empty_choices_returns_error(self): def test_empty_choices_returns_error(self):
@ -2060,8 +2055,8 @@ class TestSamplingErrors:
) )
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
@ -2080,8 +2075,8 @@ class TestSamplingErrors:
) )
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
@ -2099,8 +2094,8 @@ class TestSamplingErrors:
) )
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
@ -2120,8 +2115,8 @@ class TestModelWhitelist:
fake_client.chat.completions.create.return_value = _make_llm_response() fake_client.chat.completions.create.return_value = _make_llm_response()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "test-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
assert isinstance(result, CreateMessageResult) assert isinstance(result, CreateMessageResult)
@ -2131,8 +2126,8 @@ class TestModelWhitelist:
fake_client = MagicMock() fake_client = MagicMock()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "gpt-3.5-turbo"), return_value=fake_client.chat.completions.create.return_value,
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
assert isinstance(result, ErrorData) assert isinstance(result, ErrorData)
@ -2145,8 +2140,8 @@ class TestModelWhitelist:
fake_client.chat.completions.create.return_value = _make_llm_response() fake_client.chat.completions.create.return_value = _make_llm_response()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "any-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
assert isinstance(result, CreateMessageResult) assert isinstance(result, CreateMessageResult)
@ -2166,8 +2161,8 @@ class TestMalformedToolCallArgs:
) )
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
@ -2194,8 +2189,8 @@ class TestMalformedToolCallArgs:
fake_client.chat.completions.create.return_value = response fake_client.chat.completions.create.return_value = response
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
result = asyncio.run(handler(None, _make_sampling_params())) result = asyncio.run(handler(None, _make_sampling_params()))
@ -2214,8 +2209,8 @@ class TestMetricsTracking:
fake_client.chat.completions.create.return_value = _make_llm_response() fake_client.chat.completions.create.return_value = _make_llm_response()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
asyncio.run(handler(None, _make_sampling_params())) asyncio.run(handler(None, _make_sampling_params()))
@ -2229,8 +2224,8 @@ class TestMetricsTracking:
fake_client.chat.completions.create.return_value = _make_llm_tool_response() fake_client.chat.completions.create.return_value = _make_llm_tool_response()
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(fake_client, "default-model"), return_value=fake_client.chat.completions.create.return_value,
): ):
asyncio.run(handler(None, _make_sampling_params())) asyncio.run(handler(None, _make_sampling_params()))
@ -2241,8 +2236,8 @@ class TestMetricsTracking:
handler = SamplingHandler("met3", {}) handler = SamplingHandler("met3", {})
with patch( with patch(
"agent.auxiliary_client.get_text_auxiliary_client", "agent.auxiliary_client.call_llm",
return_value=(None, None), side_effect=RuntimeError("No LLM provider configured"),
): ):
asyncio.run(handler(None, _make_sampling_params())) asyncio.run(handler(None, _make_sampling_params()))

View file

@ -63,7 +63,7 @@ import time
import requests import requests
from typing import Dict, Any, Optional, List from typing import Dict, Any, Optional, List
from pathlib import Path from pathlib import Path
from agent.auxiliary_client import get_vision_auxiliary_client, get_text_auxiliary_client from agent.auxiliary_client import call_llm
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -80,38 +80,15 @@ DEFAULT_SESSION_TIMEOUT = 300
# Max tokens for snapshot content before summarization # Max tokens for snapshot content before summarization
SNAPSHOT_SUMMARIZE_THRESHOLD = 8000 SNAPSHOT_SUMMARIZE_THRESHOLD = 8000
# Vision client — for browser_vision (screenshot analysis)
# Wrapped in try/except so a broken auxiliary config doesn't prevent the entire
# browser_tool module from importing (which would disable all 10 browser tools).
try:
_aux_vision_client, _DEFAULT_VISION_MODEL = get_vision_auxiliary_client()
except Exception as _init_err:
logger.debug("Could not initialise vision auxiliary client: %s", _init_err)
_aux_vision_client, _DEFAULT_VISION_MODEL = None, None
# Text client — for page snapshot summarization (same config as web_extract) def _get_vision_model() -> Optional[str]:
try:
_aux_text_client, _DEFAULT_TEXT_MODEL = get_text_auxiliary_client("web_extract")
except Exception as _init_err:
logger.debug("Could not initialise text auxiliary client: %s", _init_err)
_aux_text_client, _DEFAULT_TEXT_MODEL = None, None
# Module-level alias for availability checks
EXTRACTION_MODEL = _DEFAULT_TEXT_MODEL or _DEFAULT_VISION_MODEL
def _get_vision_model() -> str:
"""Model for browser_vision (screenshot analysis — multimodal).""" """Model for browser_vision (screenshot analysis — multimodal)."""
return (os.getenv("AUXILIARY_VISION_MODEL", "").strip() return os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None
or _DEFAULT_VISION_MODEL
or "google/gemini-3-flash-preview")
def _get_extraction_model() -> str: def _get_extraction_model() -> Optional[str]:
"""Model for page snapshot text summarization — same as web_extract.""" """Model for page snapshot text summarization — same as web_extract."""
return (os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() return os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() or None
or _DEFAULT_TEXT_MODEL
or "google/gemini-3-flash-preview")
def _is_local_mode() -> bool: def _is_local_mode() -> bool:
@ -941,9 +918,6 @@ def _extract_relevant_content(
Falls back to simple truncation when no auxiliary text model is configured. Falls back to simple truncation when no auxiliary text model is configured.
""" """
if _aux_text_client is None:
return _truncate_snapshot(snapshot_text)
if user_task: if user_task:
extraction_prompt = ( extraction_prompt = (
f"You are a content extractor for a browser automation agent.\n\n" f"You are a content extractor for a browser automation agent.\n\n"
@ -968,13 +942,16 @@ def _extract_relevant_content(
) )
try: try:
from agent.auxiliary_client import auxiliary_max_tokens_param call_kwargs = {
response = _aux_text_client.chat.completions.create( "task": "web_extract",
model=_get_extraction_model(), "messages": [{"role": "user", "content": extraction_prompt}],
messages=[{"role": "user", "content": extraction_prompt}], "max_tokens": 4000,
**auxiliary_max_tokens_param(4000), "temperature": 0.1,
temperature=0.1, }
) model = _get_extraction_model()
if model:
call_kwargs["model"] = model
response = call_llm(**call_kwargs)
return response.choices[0].message.content return response.choices[0].message.content
except Exception: except Exception:
return _truncate_snapshot(snapshot_text) return _truncate_snapshot(snapshot_text)
@ -1497,14 +1474,6 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
effective_task_id = task_id or "default" effective_task_id = task_id or "default"
# Check auxiliary vision client
if _aux_vision_client is None or _DEFAULT_VISION_MODEL is None:
return json.dumps({
"success": False,
"error": "Browser vision unavailable: no auxiliary vision model configured. "
"Set OPENROUTER_API_KEY or configure Nous Portal to enable browser vision."
}, ensure_ascii=False)
# Save screenshot to persistent location so it can be shared with users # Save screenshot to persistent location so it can be shared with users
hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes")) hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
screenshots_dir = hermes_home / "browser_screenshots" screenshots_dir = hermes_home / "browser_screenshots"
@ -1562,14 +1531,13 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
f"Focus on answering the user's specific question." f"Focus on answering the user's specific question."
) )
# Use the sync auxiliary vision client directly # Use the centralized LLM router
from agent.auxiliary_client import auxiliary_max_tokens_param
vision_model = _get_vision_model() vision_model = _get_vision_model()
logger.debug("browser_vision: analysing screenshot (%d bytes) with model=%s", logger.debug("browser_vision: analysing screenshot (%d bytes)",
len(image_data), vision_model) len(image_data))
response = _aux_vision_client.chat.completions.create( call_kwargs = {
model=vision_model, "task": "vision",
messages=[ "messages": [
{ {
"role": "user", "role": "user",
"content": [ "content": [
@ -1578,9 +1546,12 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
], ],
} }
], ],
**auxiliary_max_tokens_param(2000), "max_tokens": 2000,
temperature=0.1, "temperature": 0.1,
) }
if vision_model:
call_kwargs["model"] = vision_model
response = call_llm(**call_kwargs)
analysis = response.choices[0].message.content analysis = response.choices[0].message.content
response_data = { response_data = {

View file

@ -456,17 +456,13 @@ class SamplingHandler:
# Resolve model # Resolve model
model = self._resolve_model(getattr(params, "modelPreferences", None)) model = self._resolve_model(getattr(params, "modelPreferences", None))
# Get auxiliary LLM client # Get auxiliary LLM client via centralized router
from agent.auxiliary_client import get_text_auxiliary_client from agent.auxiliary_client import call_llm
client, default_model = get_text_auxiliary_client()
if client is None:
self.metrics["errors"] += 1
return self._error("No LLM provider available for sampling")
resolved_model = model or default_model # Model whitelist check (we need to resolve model before calling)
resolved_model = model or self.model_override or ""
# Model whitelist check if self.allowed_models and resolved_model and resolved_model not in self.allowed_models:
if self.allowed_models and resolved_model not in self.allowed_models:
logger.warning( logger.warning(
"MCP server '%s' requested model '%s' not in allowed_models", "MCP server '%s' requested model '%s' not in allowed_models",
self.server_name, resolved_model, self.server_name, resolved_model,
@ -484,20 +480,15 @@ class SamplingHandler:
# Build LLM call kwargs # Build LLM call kwargs
max_tokens = min(params.maxTokens, self.max_tokens_cap) max_tokens = min(params.maxTokens, self.max_tokens_cap)
call_kwargs: dict = { call_temperature = None
"model": resolved_model,
"messages": messages,
"max_tokens": max_tokens,
}
if hasattr(params, "temperature") and params.temperature is not None: if hasattr(params, "temperature") and params.temperature is not None:
call_kwargs["temperature"] = params.temperature call_temperature = params.temperature
if stop := getattr(params, "stopSequences", None):
call_kwargs["stop"] = stop
# Forward server-provided tools # Forward server-provided tools
call_tools = None
server_tools = getattr(params, "tools", None) server_tools = getattr(params, "tools", None)
if server_tools: if server_tools:
call_kwargs["tools"] = [ call_tools = [
{ {
"type": "function", "type": "function",
"function": { "function": {
@ -508,9 +499,6 @@ class SamplingHandler:
} }
for t in server_tools for t in server_tools
] ]
if tool_choice := getattr(params, "toolChoice", None):
mode = getattr(tool_choice, "mode", "auto")
call_kwargs["tool_choice"] = {"auto": "auto", "required": "required", "none": "none"}.get(mode, "auto")
logger.log( logger.log(
self.audit_level, self.audit_level,
@ -520,7 +508,15 @@ class SamplingHandler:
# Offload sync LLM call to thread (non-blocking) # Offload sync LLM call to thread (non-blocking)
def _sync_call(): def _sync_call():
return client.chat.completions.create(**call_kwargs) return call_llm(
task="mcp",
model=resolved_model or None,
messages=messages,
temperature=call_temperature,
max_tokens=max_tokens,
tools=call_tools,
timeout=self.timeout,
)
try: try:
response = await asyncio.wait_for( response = await asyncio.wait_for(

View file

@ -22,13 +22,7 @@ import os
import logging import logging
from typing import Dict, Any, List, Optional, Union from typing import Dict, Any, List, Optional, Union
from openai import AsyncOpenAI, OpenAI from agent.auxiliary_client import async_call_llm
from agent.auxiliary_client import get_async_text_auxiliary_client
# Resolve the async auxiliary client at import time so we have the model slug.
# Handles Codex Responses API adapter transparently.
_async_aux_client, _SUMMARIZER_MODEL = get_async_text_auxiliary_client()
MAX_SESSION_CHARS = 100_000 MAX_SESSION_CHARS = 100_000
MAX_SUMMARY_TOKENS = 10000 MAX_SUMMARY_TOKENS = 10000
@ -156,26 +150,22 @@ async def _summarize_session(
f"Summarize this conversation with focus on: {query}" f"Summarize this conversation with focus on: {query}"
) )
if _async_aux_client is None or _SUMMARIZER_MODEL is None:
logging.warning("No auxiliary model available for session summarization")
return None
max_retries = 3 max_retries = 3
for attempt in range(max_retries): for attempt in range(max_retries):
try: try:
from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param response = await async_call_llm(
_extra = get_auxiliary_extra_body() task="session_search",
response = await _async_aux_client.chat.completions.create(
model=_SUMMARIZER_MODEL,
messages=[ messages=[
{"role": "system", "content": system_prompt}, {"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt}, {"role": "user", "content": user_prompt},
], ],
**({} if not _extra else {"extra_body": _extra}),
temperature=0.1, temperature=0.1,
**auxiliary_max_tokens_param(MAX_SUMMARY_TOKENS), max_tokens=MAX_SUMMARY_TOKENS,
) )
return response.choices[0].message.content.strip() return response.choices[0].message.content.strip()
except RuntimeError:
logging.warning("No auxiliary model available for session summarization")
return None
except Exception as e: except Exception as e:
if attempt < max_retries - 1: if attempt < max_retries - 1:
await asyncio.sleep(1 * (attempt + 1)) await asyncio.sleep(1 * (attempt + 1))
@ -333,8 +323,6 @@ def session_search(
def check_session_search_requirements() -> bool: def check_session_search_requirements() -> bool:
"""Requires SQLite state database and an auxiliary text model.""" """Requires SQLite state database and an auxiliary text model."""
if _async_aux_client is None:
return False
try: try:
from hermes_state import DEFAULT_DB_PATH from hermes_state import DEFAULT_DB_PATH
return DEFAULT_DB_PATH.parent.exists() return DEFAULT_DB_PATH.parent.exists()

View file

@ -936,13 +936,10 @@ def llm_audit_skill(skill_path: Path, static_result: ScanResult,
# Call the LLM via the centralized provider router # Call the LLM via the centralized provider router
try: try:
from agent.auxiliary_client import resolve_provider_client from agent.auxiliary_client import call_llm
client, _default_model = resolve_provider_client("openrouter") response = call_llm(
if client is None: provider="openrouter",
return static_result
response = client.chat.completions.create(
model=model, model=model,
messages=[{ messages=[{
"role": "user", "role": "user",

View file

@ -37,16 +37,11 @@ from pathlib import Path
from typing import Any, Awaitable, Dict, Optional from typing import Any, Awaitable, Dict, Optional
from urllib.parse import urlparse from urllib.parse import urlparse
import httpx import httpx
from agent.auxiliary_client import get_async_vision_auxiliary_client from agent.auxiliary_client import async_call_llm
from tools.debug_helpers import DebugSession from tools.debug_helpers import DebugSession
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Resolve vision auxiliary client at module level.
# Uses get_async_vision_auxiliary_client() which properly handles Codex
# routing (Responses API adapter) instead of raw AsyncOpenAI construction.
_aux_async_client, DEFAULT_VISION_MODEL = get_async_vision_auxiliary_client()
_debug = DebugSession("vision_tools", env_var="VISION_TOOLS_DEBUG") _debug = DebugSession("vision_tools", env_var="VISION_TOOLS_DEBUG")
@ -185,7 +180,7 @@ def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None)
async def vision_analyze_tool( async def vision_analyze_tool(
image_url: str, image_url: str,
user_prompt: str, user_prompt: str,
model: str = DEFAULT_VISION_MODEL, model: str = None,
) -> str: ) -> str:
""" """
Analyze an image from a URL or local file path using vision AI. Analyze an image from a URL or local file path using vision AI.
@ -245,15 +240,6 @@ async def vision_analyze_tool(
logger.info("Analyzing image: %s", image_url[:60]) logger.info("Analyzing image: %s", image_url[:60])
logger.info("User prompt: %s", user_prompt[:100]) logger.info("User prompt: %s", user_prompt[:100])
# Check auxiliary vision client availability
if _aux_async_client is None or DEFAULT_VISION_MODEL is None:
logger.error("Vision analysis unavailable: no auxiliary vision model configured")
return json.dumps({
"success": False,
"analysis": "Vision analysis unavailable: no auxiliary vision model configured. "
"Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools."
}, indent=2, ensure_ascii=False)
# Determine if this is a local file path or a remote URL # Determine if this is a local file path or a remote URL
local_path = Path(image_url) local_path = Path(image_url)
if local_path.is_file(): if local_path.is_file():
@ -309,18 +295,18 @@ async def vision_analyze_tool(
} }
] ]
logger.info("Processing image with %s...", model) logger.info("Processing image with vision model...")
# Call the vision API # Call the vision API via centralized router
from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param call_kwargs = {
_extra = get_auxiliary_extra_body() "task": "vision",
response = await _aux_async_client.chat.completions.create( "messages": messages,
model=model, "temperature": 0.1,
messages=messages, "max_tokens": 2000,
temperature=0.1, }
**auxiliary_max_tokens_param(2000), if model:
**({} if not _extra else {"extra_body": _extra}), call_kwargs["model"] = model
) response = await async_call_llm(**call_kwargs)
# Extract the analysis # Extract the analysis
analysis = response.choices[0].message.content.strip() analysis = response.choices[0].message.content.strip()
@ -391,7 +377,18 @@ async def vision_analyze_tool(
def check_vision_requirements() -> bool: def check_vision_requirements() -> bool:
"""Check if an auxiliary vision model is available.""" """Check if an auxiliary vision model is available."""
return _aux_async_client is not None try:
from agent.auxiliary_client import resolve_provider_client
client, _ = resolve_provider_client("openrouter")
if client is not None:
return True
client, _ = resolve_provider_client("nous")
if client is not None:
return True
client, _ = resolve_provider_client("custom")
return client is not None
except Exception:
return False
def get_debug_session_info() -> Dict[str, Any]: def get_debug_session_info() -> Dict[str, Any]:
@ -419,10 +416,9 @@ if __name__ == "__main__":
print("Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools.") print("Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools.")
exit(1) exit(1)
else: else:
print(f"✅ Vision model available: {DEFAULT_VISION_MODEL}") print("✅ Vision model available")
print("🛠️ Vision tools ready for use!") print("🛠️ Vision tools ready for use!")
print(f"🧠 Using model: {DEFAULT_VISION_MODEL}")
# Show debug mode status # Show debug mode status
if _debug.active: if _debug.active:
@ -489,9 +485,7 @@ def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]:
"Fully describe and explain everything about this image, then answer the " "Fully describe and explain everything about this image, then answer the "
f"following question:\n\n{question}" f"following question:\n\n{question}"
) )
model = (os.getenv("AUXILIARY_VISION_MODEL", "").strip() model = os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None
or DEFAULT_VISION_MODEL
or "google/gemini-3-flash-preview")
return vision_analyze_tool(image_url, full_prompt, model) return vision_analyze_tool(image_url, full_prompt, model)

View file

@ -47,8 +47,7 @@ import re
import asyncio import asyncio
from typing import List, Dict, Any, Optional from typing import List, Dict, Any, Optional
from firecrawl import Firecrawl from firecrawl import Firecrawl
from openai import AsyncOpenAI from agent.auxiliary_client import async_call_llm
from agent.auxiliary_client import get_async_text_auxiliary_client
from tools.debug_helpers import DebugSession from tools.debug_helpers import DebugSession
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -83,15 +82,8 @@ def _get_firecrawl_client():
DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000 DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000
# Resolve async auxiliary client at module level. # Allow per-task override via env var
# Handles Codex Responses API adapter transparently. DEFAULT_SUMMARIZER_MODEL = os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() or None
_aux_async_client, _DEFAULT_SUMMARIZER_MODEL = get_async_text_auxiliary_client("web_extract")
# Allow per-task override via config.yaml auxiliary.web_extract_model
DEFAULT_SUMMARIZER_MODEL = (
os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip()
or _DEFAULT_SUMMARIZER_MODEL
)
_debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG") _debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG")
@ -249,22 +241,22 @@ Create a markdown summary that captures all key information in a well-organized,
for attempt in range(max_retries): for attempt in range(max_retries):
try: try:
if _aux_async_client is None: call_kwargs = {
logger.warning("No auxiliary model available for web content processing") "task": "web_extract",
return None "messages": [
from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
_extra = get_auxiliary_extra_body()
response = await _aux_async_client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": system_prompt}, {"role": "system", "content": system_prompt},
{"role": "user", "content": user_prompt} {"role": "user", "content": user_prompt}
], ],
temperature=0.1, "temperature": 0.1,
**auxiliary_max_tokens_param(max_tokens), "max_tokens": max_tokens,
**({} if not _extra else {"extra_body": _extra}), }
) if model:
call_kwargs["model"] = model
response = await async_call_llm(**call_kwargs)
return response.choices[0].message.content.strip() return response.choices[0].message.content.strip()
except RuntimeError:
logger.warning("No auxiliary model available for web content processing")
return None
except Exception as api_error: except Exception as api_error:
last_error = api_error last_error = api_error
if attempt < max_retries - 1: if attempt < max_retries - 1:
@ -368,25 +360,18 @@ Synthesize these into ONE cohesive, comprehensive summary that:
Create a single, unified markdown summary.""" Create a single, unified markdown summary."""
try: try:
if _aux_async_client is None: call_kwargs = {
logger.warning("No auxiliary model for synthesis, concatenating summaries") "task": "web_extract",
fallback = "\n\n".join(summaries) "messages": [
if len(fallback) > max_output_size:
fallback = fallback[:max_output_size] + "\n\n[... truncated ...]"
return fallback
from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
_extra = get_auxiliary_extra_body()
response = await _aux_async_client.chat.completions.create(
model=model,
messages=[
{"role": "system", "content": "You synthesize multiple summaries into one cohesive, comprehensive summary. Be thorough but concise."}, {"role": "system", "content": "You synthesize multiple summaries into one cohesive, comprehensive summary. Be thorough but concise."},
{"role": "user", "content": synthesis_prompt} {"role": "user", "content": synthesis_prompt}
], ],
temperature=0.1, "temperature": 0.1,
**auxiliary_max_tokens_param(20000), "max_tokens": 20000,
**({} if not _extra else {"extra_body": _extra}), }
) if model:
call_kwargs["model"] = model
response = await async_call_llm(**call_kwargs)
final_summary = response.choices[0].message.content.strip() final_summary = response.choices[0].message.content.strip()
# Enforce hard cap # Enforce hard cap
@ -713,8 +698,8 @@ async def web_extract_tool(
debug_call_data["pages_extracted"] = pages_extracted debug_call_data["pages_extracted"] = pages_extracted
debug_call_data["original_response_size"] = len(json.dumps(response)) debug_call_data["original_response_size"] = len(json.dumps(response))
# Process each result with LLM if enabled and auxiliary client is available # Process each result with LLM if enabled
if use_llm_processing and _aux_async_client is not None: if use_llm_processing:
logger.info("Processing extracted content with LLM (parallel)...") logger.info("Processing extracted content with LLM (parallel)...")
debug_call_data["processing_applied"].append("llm_processing") debug_call_data["processing_applied"].append("llm_processing")
@ -780,10 +765,6 @@ async def web_extract_tool(
else: else:
logger.warning("%s (no content to process)", url) logger.warning("%s (no content to process)", url)
else: else:
if use_llm_processing and _aux_async_client is None:
logger.warning("LLM processing requested but no auxiliary model available, returning raw content")
debug_call_data["processing_applied"].append("llm_processing_unavailable")
# Print summary of extracted pages for debugging (original behavior) # Print summary of extracted pages for debugging (original behavior)
for result in response.get('results', []): for result in response.get('results', []):
url = result.get('url', 'Unknown URL') url = result.get('url', 'Unknown URL')
@ -1013,8 +994,8 @@ async def web_crawl_tool(
debug_call_data["pages_crawled"] = pages_crawled debug_call_data["pages_crawled"] = pages_crawled
debug_call_data["original_response_size"] = len(json.dumps(response)) debug_call_data["original_response_size"] = len(json.dumps(response))
# Process each result with LLM if enabled and auxiliary client is available # Process each result with LLM if enabled
if use_llm_processing and _aux_async_client is not None: if use_llm_processing:
logger.info("Processing crawled content with LLM (parallel)...") logger.info("Processing crawled content with LLM (parallel)...")
debug_call_data["processing_applied"].append("llm_processing") debug_call_data["processing_applied"].append("llm_processing")
@ -1080,10 +1061,6 @@ async def web_crawl_tool(
else: else:
logger.warning("%s (no content to process)", page_url) logger.warning("%s (no content to process)", page_url)
else: else:
if use_llm_processing and _aux_async_client is None:
logger.warning("LLM processing requested but no auxiliary model available, returning raw content")
debug_call_data["processing_applied"].append("llm_processing_unavailable")
# Print summary of crawled pages for debugging (original behavior) # Print summary of crawled pages for debugging (original behavior)
for result in response.get('results', []): for result in response.get('results', []):
page_url = result.get('url', 'Unknown URL') page_url = result.get('url', 'Unknown URL')
@ -1138,7 +1115,15 @@ def check_firecrawl_api_key() -> bool:
def check_auxiliary_model() -> bool: def check_auxiliary_model() -> bool:
"""Check if an auxiliary text model is available for LLM content processing.""" """Check if an auxiliary text model is available for LLM content processing."""
return _aux_async_client is not None try:
from agent.auxiliary_client import resolve_provider_client
for p in ("openrouter", "nous", "custom", "codex"):
client, _ = resolve_provider_client(p)
if client is not None:
return True
return False
except Exception:
return False
def get_debug_session_info() -> Dict[str, Any]: def get_debug_session_info() -> Dict[str, Any]:

View file

@ -344,28 +344,32 @@ class TrajectoryCompressor:
raise RuntimeError(f"Failed to load tokenizer '{self.config.tokenizer_name}': {e}") raise RuntimeError(f"Failed to load tokenizer '{self.config.tokenizer_name}': {e}")
def _init_summarizer(self): def _init_summarizer(self):
"""Initialize LLM client for summarization (sync and async). """Initialize LLM routing for summarization (sync and async).
Routes through the centralized provider router for known providers Uses call_llm/async_call_llm from the centralized provider router
(OpenRouter, Nous, Codex, etc.) so auth and headers are handled which handles auth, headers, and provider detection internally.
consistently. Falls back to raw construction for custom endpoints. For custom endpoints, falls back to raw client construction.
""" """
from agent.auxiliary_client import resolve_provider_client from agent.auxiliary_client import call_llm, async_call_llm
provider = self._detect_provider() provider = self._detect_provider()
if provider: if provider:
# Use centralized router — handles auth, headers, Codex adapter # Store provider for use in _generate_summary calls
self.client, _ = resolve_provider_client( self._llm_provider = provider
self._use_call_llm = True
# Verify the provider is available
from agent.auxiliary_client import resolve_provider_client
client, _ = resolve_provider_client(
provider, model=self.config.summarization_model) provider, model=self.config.summarization_model)
self.async_client, _ = resolve_provider_client( if client is None:
provider, model=self.config.summarization_model,
async_mode=True)
if self.client is None:
raise RuntimeError( raise RuntimeError(
f"Provider '{provider}' is not configured. " f"Provider '{provider}' is not configured. "
f"Check your API key or run: hermes setup") f"Check your API key or run: hermes setup")
self.client = None # Not used directly
self.async_client = None # Not used directly
else: else:
# Custom endpoint — use config's raw base_url + api_key_env # Custom endpoint — use config's raw base_url + api_key_env
self._use_call_llm = False
api_key = os.getenv(self.config.api_key_env) api_key = os.getenv(self.config.api_key_env)
if not api_key: if not api_key:
raise RuntimeError( raise RuntimeError(
@ -524,12 +528,22 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
try: try:
metrics.summarization_api_calls += 1 metrics.summarization_api_calls += 1
response = self.client.chat.completions.create( if getattr(self, '_use_call_llm', False):
model=self.config.summarization_model, from agent.auxiliary_client import call_llm
messages=[{"role": "user", "content": prompt}], response = call_llm(
temperature=self.config.temperature, provider=self._llm_provider,
max_tokens=self.config.summary_target_tokens * 2, model=self.config.summarization_model,
) messages=[{"role": "user", "content": prompt}],
temperature=self.config.temperature,
max_tokens=self.config.summary_target_tokens * 2,
)
else:
response = self.client.chat.completions.create(
model=self.config.summarization_model,
messages=[{"role": "user", "content": prompt}],
temperature=self.config.temperature,
max_tokens=self.config.summary_target_tokens * 2,
)
summary = response.choices[0].message.content.strip() summary = response.choices[0].message.content.strip()
@ -581,12 +595,22 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
try: try:
metrics.summarization_api_calls += 1 metrics.summarization_api_calls += 1
response = await self.async_client.chat.completions.create( if getattr(self, '_use_call_llm', False):
model=self.config.summarization_model, from agent.auxiliary_client import async_call_llm
messages=[{"role": "user", "content": prompt}], response = await async_call_llm(
temperature=self.config.temperature, provider=self._llm_provider,
max_tokens=self.config.summary_target_tokens * 2, model=self.config.summarization_model,
) messages=[{"role": "user", "content": prompt}],
temperature=self.config.temperature,
max_tokens=self.config.summary_target_tokens * 2,
)
else:
response = await self.async_client.chat.completions.create(
model=self.config.summarization_model,
messages=[{"role": "user", "content": prompt}],
temperature=self.config.temperature,
max_tokens=self.config.summary_target_tokens * 2,
)
summary = response.choices[0].message.content.strip() summary = response.choices[0].message.content.strip()