From 8805e705a7e134ff7e090bd5fa5e37ba2ec14811 Mon Sep 17 00:00:00 2001
From: teknium1 <teknium1@gmail.com>
Date: Wed, 11 Mar 2026 19:46:47 -0700
Subject: [PATCH 01/11] feat: centralized provider router + fix Codex vision
 bypass + vision error handling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three interconnected fixes for auxiliary client infrastructure:

1. CENTRALIZED PROVIDER ROUTER (auxiliary_client.py)
   Add resolve_provider_client(provider, model, async_mode) — a single
   entry point for creating properly configured clients. Given a provider
   name and optional model, it handles auth lookup (env vars, OAuth
   tokens, auth.json), base URL resolution, provider-specific headers,
   and API format differences (Chat Completions vs Responses API for
   Codex). All auxiliary consumers should route through this instead of
   ad-hoc env var lookups.

   Refactored get_text_auxiliary_client, get_async_text_auxiliary_client,
   and get_vision_auxiliary_client to use the router internally.

2. FIX CODEX VISION BYPASS (vision_tools.py)
   vision_tools.py was constructing a raw AsyncOpenAI client from the
   sync vision client's api_key/base_url, completely bypassing the Codex
   Responses API adapter. When the vision provider resolved to Codex,
   the raw client would hit chatgpt.com/backend-api/codex with
   chat.completions.create() which only supports the Responses API.

   Fix: Added get_async_vision_auxiliary_client() which properly wraps
   Codex into AsyncCodexAuxiliaryClient. vision_tools.py now uses this
   instead of manual client construction.

3. FIX COMPRESSION FALLBACK + VISION ERROR HANDLING
   - context_compressor.py: Removed _get_fallback_client() which blindly
     looked for OPENAI_API_KEY + OPENAI_BASE_URL (fails for Codex OAuth,
     API-key providers, users without OPENAI_BASE_URL set). Replaced
     with fallback loop through resolve_provider_client() for each
     known provider, with same-provider dedup.

   - vision_tools.py: Added error detection for vision capability
     failures. Returns clear message to the model when the configured
     model doesn't support vision, instead of a generic error.

Addresses #886
---
 agent/auxiliary_client.py   | 225 ++++++++++++++++++++++++++++++++----
 agent/context_compressor.py |  67 +++++------
 tools/vision_tools.py       |  42 ++++---
 3 files changed, 256 insertions(+), 78 deletions(-)

diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py
index 57c3c118..4571520a 100644
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -499,6 +499,188 @@ def _resolve_auto() -> Tuple[Optional[OpenAI], Optional[str]]:
     return None, None
 
 
+# ── Centralized Provider Router ─────────────────────────────────────────────
+#
+# resolve_provider_client() is the single entry point for creating a properly
+# configured client given a (provider, model) pair.  It handles auth lookup,
+# base URL resolution, provider-specific headers, and API format differences
+# (Chat Completions vs Responses API for Codex).
+#
+# All auxiliary consumer code should go through this or the public helpers
+# below — never look up auth env vars ad-hoc.
+
+
+def _to_async_client(sync_client, model: str):
+    """Convert a sync client to its async counterpart, preserving Codex routing."""
+    from openai import AsyncOpenAI
+
+    if isinstance(sync_client, CodexAuxiliaryClient):
+        return AsyncCodexAuxiliaryClient(sync_client), model
+
+    async_kwargs = {
+        "api_key": sync_client.api_key,
+        "base_url": str(sync_client.base_url),
+    }
+    base_lower = str(sync_client.base_url).lower()
+    if "openrouter" in base_lower:
+        async_kwargs["default_headers"] = dict(_OR_HEADERS)
+    elif "api.kimi.com" in base_lower:
+        async_kwargs["default_headers"] = {"User-Agent": "KimiCLI/1.0"}
+    return AsyncOpenAI(**async_kwargs), model
+
+
+def resolve_provider_client(
+    provider: str,
+    model: str = None,
+    async_mode: bool = False,
+) -> Tuple[Optional[Any], Optional[str]]:
+    """Central router: given a provider name and optional model, return a
+    configured client with the correct auth, base URL, and API format.
+
+    The returned client always exposes ``.chat.completions.create()`` — for
+    Codex/Responses API providers, an adapter handles the translation
+    transparently.
+
+    Args:
+        provider: Provider identifier.  One of:
+            "openrouter", "nous", "openai-codex" (or "codex"),
+            "zai", "kimi-coding", "minimax", "minimax-cn", "nous-api",
+            "custom" (OPENAI_BASE_URL + OPENAI_API_KEY),
+            "auto" (full auto-detection chain).
+        model: Model slug override.  If None, uses the provider's default
+               auxiliary model.
+        async_mode: If True, return an async-compatible client.
+
+    Returns:
+        (client, resolved_model) or (None, None) if auth is unavailable.
+    """
+    # Normalise aliases
+    provider = (provider or "auto").strip().lower()
+    if provider == "codex":
+        provider = "openai-codex"
+    if provider == "main":
+        provider = "custom"
+
+    # ── Auto: try all providers in priority order ────────────────────
+    if provider == "auto":
+        client, resolved = _resolve_auto()
+        if client is None:
+            return None, None
+        final_model = model or resolved
+        return (_to_async_client(client, final_model) if async_mode
+                else (client, final_model))
+
+    # ── OpenRouter ───────────────────────────────────────────────────
+    if provider == "openrouter":
+        client, default = _try_openrouter()
+        if client is None:
+            logger.warning("resolve_provider_client: openrouter requested "
+                           "but OPENROUTER_API_KEY not set")
+            return None, None
+        final_model = model or default
+        return (_to_async_client(client, final_model) if async_mode
+                else (client, final_model))
+
+    # ── Nous Portal (OAuth) ──────────────────────────────────────────
+    if provider == "nous":
+        client, default = _try_nous()
+        if client is None:
+            logger.warning("resolve_provider_client: nous requested "
+                           "but Nous Portal not configured (run: hermes login)")
+            return None, None
+        final_model = model or default
+        return (_to_async_client(client, final_model) if async_mode
+                else (client, final_model))
+
+    # ── OpenAI Codex (OAuth → Responses API) ─────────────────────────
+    if provider == "openai-codex":
+        client, default = _try_codex()
+        if client is None:
+            logger.warning("resolve_provider_client: openai-codex requested "
+                           "but no Codex OAuth token found (run: hermes model)")
+            return None, None
+        final_model = model or default
+        return (_to_async_client(client, final_model) if async_mode
+                else (client, final_model))
+
+    # ── Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY) ───────────
+    if provider == "custom":
+        # Try custom first, then codex, then API-key providers
+        for try_fn in (_try_custom_endpoint, _try_codex,
+                       _resolve_api_key_provider):
+            client, default = try_fn()
+            if client is not None:
+                final_model = model or default
+                return (_to_async_client(client, final_model) if async_mode
+                        else (client, final_model))
+        logger.warning("resolve_provider_client: custom/main requested "
+                       "but no endpoint credentials found")
+        return None, None
+
+    # ── API-key providers from PROVIDER_REGISTRY ─────────────────────
+    try:
+        from hermes_cli.auth import PROVIDER_REGISTRY, _resolve_kimi_base_url
+    except ImportError:
+        logger.debug("hermes_cli.auth not available for provider %s", provider)
+        return None, None
+
+    pconfig = PROVIDER_REGISTRY.get(provider)
+    if pconfig is None:
+        logger.warning("resolve_provider_client: unknown provider %r", provider)
+        return None, None
+
+    if pconfig.auth_type == "api_key":
+        # Find the first configured API key
+        api_key = ""
+        for env_var in pconfig.api_key_env_vars:
+            api_key = os.getenv(env_var, "").strip()
+            if api_key:
+                break
+        if not api_key:
+            logger.warning("resolve_provider_client: provider %s has no API "
+                           "key configured (tried: %s)",
+                           provider, ", ".join(pconfig.api_key_env_vars))
+            return None, None
+
+        # Resolve base URL (env override → provider-specific logic → default)
+        base_url_override = os.getenv(pconfig.base_url_env_var, "").strip() if pconfig.base_url_env_var else ""
+        if provider == "kimi-coding":
+            base_url = _resolve_kimi_base_url(api_key, pconfig.inference_base_url, base_url_override)
+        elif base_url_override:
+            base_url = base_url_override
+        else:
+            base_url = pconfig.inference_base_url
+
+        default_model = _API_KEY_PROVIDER_AUX_MODELS.get(provider, "")
+        final_model = model or default_model
+
+        # Provider-specific headers
+        headers = {}
+        if "api.kimi.com" in base_url.lower():
+            headers["User-Agent"] = "KimiCLI/1.0"
+
+        client = OpenAI(api_key=api_key, base_url=base_url,
+                        **({"default_headers": headers} if headers else {}))
+        logger.debug("resolve_provider_client: %s (%s)", provider, final_model)
+        return (_to_async_client(client, final_model) if async_mode
+                else (client, final_model))
+
+    elif pconfig.auth_type in ("oauth_device_code", "oauth_external"):
+        # OAuth providers — route through their specific try functions
+        if provider == "nous":
+            return resolve_provider_client("nous", model, async_mode)
+        if provider == "openai-codex":
+            return resolve_provider_client("openai-codex", model, async_mode)
+        # nous-api is api_key type so it's handled above
+        logger.warning("resolve_provider_client: OAuth provider %s not "
+                       "directly supported, try 'auto'", provider)
+        return None, None
+
+    logger.warning("resolve_provider_client: unhandled auth_type %s for %s",
+                   pconfig.auth_type, provider)
+    return None, None
+
+
 # ── Public API ──────────────────────────────────────────────────────────────
 
 def get_text_auxiliary_client(task: str = "") -> Tuple[Optional[OpenAI], Optional[str]]:
@@ -513,8 +695,8 @@ def get_text_auxiliary_client(task: str = "") -> Tuple[Optional[OpenAI], Optiona
     """
     forced = _get_auxiliary_provider(task)
     if forced != "auto":
-        return _resolve_forced_provider(forced)
-    return _resolve_auto()
+        return resolve_provider_client(forced)
+    return resolve_provider_client("auto")
 
 
 def get_async_text_auxiliary_client(task: str = ""):
@@ -524,24 +706,10 @@ def get_async_text_auxiliary_client(task: str = ""):
     (AsyncCodexAuxiliaryClient, model) which wraps the Responses API.
     Returns (None, None) when no provider is available.
     """
-    from openai import AsyncOpenAI
-
-    sync_client, model = get_text_auxiliary_client(task)
-    if sync_client is None:
-        return None, None
-
-    if isinstance(sync_client, CodexAuxiliaryClient):
-        return AsyncCodexAuxiliaryClient(sync_client), model
-
-    async_kwargs = {
-        "api_key": sync_client.api_key,
-        "base_url": str(sync_client.base_url),
-    }
-    if "openrouter" in str(sync_client.base_url).lower():
-        async_kwargs["default_headers"] = dict(_OR_HEADERS)
-    elif "api.kimi.com" in str(sync_client.base_url).lower():
-        async_kwargs["default_headers"] = {"User-Agent": "KimiCLI/1.0"}
-    return AsyncOpenAI(**async_kwargs), model
+    forced = _get_auxiliary_provider(task)
+    if forced != "auto":
+        return resolve_provider_client(forced, async_mode=True)
+    return resolve_provider_client("auto", async_mode=True)
 
 
 def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]:
@@ -559,7 +727,7 @@ def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]:
     """
     forced = _get_auxiliary_provider("vision")
     if forced != "auto":
-        return _resolve_forced_provider(forced)
+        return resolve_provider_client(forced)
     # Auto: try providers known to support multimodal first, then fall
     # back to the user's custom endpoint.  Many local models (Qwen-VL,
     # LLaVA, Pixtral, etc.) support vision — skipping them entirely
@@ -573,6 +741,21 @@ def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]:
     return None, None
 
 
+def get_async_vision_auxiliary_client():
+    """Return (async_client, model_slug) for async vision consumers.
+
+    Properly handles Codex routing — unlike manually constructing
+    AsyncOpenAI from a sync client, this preserves the Responses API
+    adapter for Codex providers.
+
+    Returns (None, None) when no provider is available.
+    """
+    sync_client, model = get_vision_auxiliary_client()
+    if sync_client is None:
+        return None, None
+    return _to_async_client(sync_client, model)
+
+
 def get_auxiliary_extra_body() -> dict:
     """Return extra_body kwargs for auxiliary API calls.
     
diff --git a/agent/context_compressor.py b/agent/context_compressor.py
index 01aa2af8..fae483fd 100644
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@@ -127,20 +127,38 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
             except Exception as e:
                 logging.warning(f"Failed to generate context summary with auxiliary model: {e}")
 
-        # 2. Fallback: try the user's main model endpoint
-        fallback_client, fallback_model = self._get_fallback_client()
-        if fallback_client is not None:
+        # 2. Fallback: re-try via the centralized provider router.
+        #    This covers all configured providers (Codex OAuth, API-key
+        #    providers, etc.) without ad-hoc env var lookups.
+        from agent.auxiliary_client import resolve_provider_client
+        fallback_providers = ["custom", "openrouter", "nous", "codex"]
+        for fb_provider in fallback_providers:
             try:
-                logger.info("Retrying context summary with main model (%s)", fallback_model)
-                summary = self._call_summary_model(fallback_client, fallback_model, prompt)
-                self.client = fallback_client
-                self.summary_model = fallback_model
+                fb_client, fb_model = resolve_provider_client(
+                    fb_provider, model=self.model)
+                if fb_client is None:
+                    continue
+                # Don't retry the same client that just failed
+                if (self.client is not None
+                        and hasattr(fb_client, "base_url")
+                        and hasattr(self.client, "base_url")
+                        and str(fb_client.base_url) == str(self.client.base_url)):
+                    continue
+                logger.info("Retrying context summary with fallback provider "
+                            "%s (%s)", fb_provider, fb_model)
+                summary = self._call_summary_model(fb_client, fb_model, prompt)
+                # Promote successful fallback for future compressions
+                self.client = fb_client
+                self.summary_model = fb_model
                 return summary
             except Exception as fallback_err:
-                logging.warning(f"Main model summary also failed: {fallback_err}")
+                logging.warning("Fallback provider %s failed: %s",
+                                fb_provider, fallback_err)
 
-        # 3. All models failed — return None so the caller drops turns without a summary
-        logging.warning("Context compression: no model available for summary. Middle turns will be dropped without summary.")
+        # 3. All providers failed — return None so the caller drops turns
+        #    without a summary.
+        logging.warning("Context compression: no provider available for "
+                        "summary. Middle turns will be dropped without summary.")
         return None
 
     def _call_summary_model(self, client, model: str, prompt: str) -> str:
@@ -170,35 +188,6 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
             summary = "[CONTEXT SUMMARY]: " + summary
         return summary
 
-    def _get_fallback_client(self):
-        """Try to build a fallback client from the main model's endpoint config.
-
-        When the primary auxiliary client fails (e.g. stale OpenRouter key), this
-        creates a client using the user's active custom endpoint (OPENAI_BASE_URL)
-        so compression can still produce a real summary instead of a static string.
-
-        Returns (client, model) or (None, None).
-        """
-        custom_base = os.getenv("OPENAI_BASE_URL")
-        custom_key = os.getenv("OPENAI_API_KEY")
-        if not custom_base or not custom_key:
-            return None, None
-
-        # Don't fallback to the same provider that just failed
-        from hermes_constants import OPENROUTER_BASE_URL
-        if custom_base.rstrip("/") == OPENROUTER_BASE_URL.rstrip("/"):
-            return None, None
-
-        model = os.getenv("LLM_MODEL") or os.getenv("OPENAI_MODEL") or self.model
-        try:
-            from openai import OpenAI as _OpenAI
-            client = _OpenAI(api_key=custom_key, base_url=custom_base)
-            logger.debug("Built fallback auxiliary client: %s via %s", model, custom_base)
-            return client, model
-        except Exception as exc:
-            logger.debug("Could not build fallback auxiliary client: %s", exc)
-            return None, None
-
     # ------------------------------------------------------------------
     # Tool-call / tool-result pair integrity helpers
     # ------------------------------------------------------------------
diff --git a/tools/vision_tools.py b/tools/vision_tools.py
index bfde51ec..ee89b58a 100644
--- a/tools/vision_tools.py
+++ b/tools/vision_tools.py
@@ -37,27 +37,15 @@ from pathlib import Path
 from typing import Any, Awaitable, Dict, Optional
 from urllib.parse import urlparse
 import httpx
-from openai import AsyncOpenAI
-from agent.auxiliary_client import get_vision_auxiliary_client
+from agent.auxiliary_client import get_async_vision_auxiliary_client
 from tools.debug_helpers import DebugSession
 
 logger = logging.getLogger(__name__)
 
-# Resolve vision auxiliary client at module level; build an async wrapper.
-_aux_sync_client, DEFAULT_VISION_MODEL = get_vision_auxiliary_client()
-_aux_async_client: AsyncOpenAI | None = None
-if _aux_sync_client is not None:
-    _async_kwargs = {
-        "api_key": _aux_sync_client.api_key,
-        "base_url": str(_aux_sync_client.base_url),
-    }
-    if "openrouter" in str(_aux_sync_client.base_url).lower():
-        _async_kwargs["default_headers"] = {
-            "HTTP-Referer": "https://github.com/NousResearch/hermes-agent",
-            "X-OpenRouter-Title": "Hermes Agent",
-                "X-OpenRouter-Categories": "productivity,cli-agent",
-        }
-    _aux_async_client = AsyncOpenAI(**_async_kwargs)
+# Resolve vision auxiliary client at module level.
+# Uses get_async_vision_auxiliary_client() which properly handles Codex
+# routing (Responses API adapter) instead of raw AsyncOpenAI construction.
+_aux_async_client, DEFAULT_VISION_MODEL = get_async_vision_auxiliary_client()
 
 _debug = DebugSession("vision_tools", env_var="VISION_TOOLS_DEBUG")
 
@@ -359,10 +347,28 @@ async def vision_analyze_tool(
         error_msg = f"Error analyzing image: {str(e)}"
         logger.error("%s", error_msg, exc_info=True)
         
+        # Detect vision capability errors — give the model a clear message
+        # so it can inform the user instead of a cryptic API error.
+        err_str = str(e).lower()
+        if any(hint in err_str for hint in (
+            "does not support", "not support image", "invalid_request",
+            "content_policy", "image_url", "multimodal",
+            "unrecognized request argument", "image input",
+        )):
+            analysis = (
+                f"{model} does not support vision or our request was not "
+                f"accepted by the server. Error: {e}"
+            )
+        else:
+            analysis = (
+                "There was a problem with the request and the image could not "
+                f"be analyzed. Error: {e}"
+            )
+        
         # Prepare error response
         result = {
             "success": False,
-            "analysis": "There was a problem with the request and the image could not be analyzed."
+            "analysis": analysis,
         }
         
         debug_call_data["error"] = error_msg

From 07f09ecd83fba861041fb117e5e6221d15819975 Mon Sep 17 00:00:00 2001
From: teknium1 <teknium1@gmail.com>
Date: Wed, 11 Mar 2026 20:02:36 -0700
Subject: [PATCH 02/11] refactor: route ad-hoc LLM consumers through
 centralized provider router
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Route all remaining ad-hoc auxiliary LLM call sites through
resolve_provider_client() so auth, headers, and API format (Chat
Completions vs Responses API) are handled consistently in one place.

Files changed:

- tools/openrouter_client.py: Replace manual AsyncOpenAI construction
  with resolve_provider_client('openrouter', async_mode=True). The
  shared client module now delegates entirely to the router.

- tools/skills_guard.py: Replace inline OpenAI client construction
  (hardcoded OpenRouter base_url, manual api_key lookup, manual
  headers) with resolve_provider_client('openrouter'). Remove unused
  OPENROUTER_BASE_URL import.

- trajectory_compressor.py: Add _detect_provider() to map config
  base_url to a provider name, then route through
  resolve_provider_client. Falls back to raw construction for
  unrecognized custom endpoints.

- mini_swe_runner.py: Route default case (no explicit api_key/base_url)
  through resolve_provider_client('openrouter') with auto-detection
  fallback. Preserves direct construction when explicit creds are
  passed via CLI args.

- agent/auxiliary_client.py: Fix stale module docstring — vision auto
  mode now correctly documents that Codex and custom endpoints are
  tried (not skipped).
---
 agent/auxiliary_client.py  |  5 ++-
 mini_swe_runner.py         | 45 ++++++++++----------
 tools/openrouter_client.py | 31 +++++---------
 tools/skills_guard.py      | 20 +++------
 trajectory_compressor.py   | 85 ++++++++++++++++++++++++--------------
 5 files changed, 97 insertions(+), 89 deletions(-)

diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py
index 4571520a..9c153a74 100644
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -17,7 +17,10 @@ Resolution order for text tasks (auto mode):
 Resolution order for vision/multimodal tasks (auto mode):
   1. OpenRouter
   2. Nous Portal
-  3. None  (steps 3-5 are skipped — they may not support multimodal)
+  3. Codex OAuth (gpt-5.3-codex supports vision via Responses API)
+  4. Custom endpoint (for local vision models: Qwen-VL, LLaVA, Pixtral, etc.)
+  5. None  (API-key providers like z.ai/Kimi/MiniMax are skipped —
+     they may not support multimodal)
 
 Per-task provider overrides (e.g. AUXILIARY_VISION_PROVIDER,
 CONTEXT_COMPRESSION_PROVIDER) can force a specific provider for each task:
diff --git a/mini_swe_runner.py b/mini_swe_runner.py
index 9be7b734..5cb337b8 100644
--- a/mini_swe_runner.py
+++ b/mini_swe_runner.py
@@ -189,29 +189,30 @@ class MiniSWERunner:
         )
         self.logger = logging.getLogger(__name__)
         
-        # Initialize OpenAI client - defaults to OpenRouter
-        from openai import OpenAI
-        
-        client_kwargs = {}
-        
-        # Default to OpenRouter if no base_url provided
-        if base_url:
-            client_kwargs["base_url"] = base_url
+        # Initialize LLM client via centralized provider router.
+        # If explicit api_key/base_url are provided (e.g. from CLI args),
+        # construct directly.  Otherwise use the router for OpenRouter.
+        if api_key or base_url:
+            from openai import OpenAI
+            client_kwargs = {
+                "base_url": base_url or "https://openrouter.ai/api/v1",
+                "api_key": api_key or os.getenv(
+                    "OPENROUTER_API_KEY",
+                    os.getenv("ANTHROPIC_API_KEY",
+                              os.getenv("OPENAI_API_KEY", ""))),
+            }
+            self.client = OpenAI(**client_kwargs)
         else:
-            client_kwargs["base_url"] = "https://openrouter.ai/api/v1"
-
-
-        
-        # Handle API key - OpenRouter is the primary provider
-        if api_key:
-            client_kwargs["api_key"] = api_key
-        else:
-            client_kwargs["api_key"] = os.getenv(
-                "OPENROUTER_API_KEY",
-                os.getenv("ANTHROPIC_API_KEY", os.getenv("OPENAI_API_KEY", ""))
-            )
-        
-        self.client = OpenAI(**client_kwargs)
+            from agent.auxiliary_client import resolve_provider_client
+            self.client, _ = resolve_provider_client("openrouter", model=model)
+            if self.client is None:
+                # Fallback: try auto-detection
+                self.client, _ = resolve_provider_client("auto", model=model)
+            if self.client is None:
+                from openai import OpenAI
+                self.client = OpenAI(
+                    base_url="https://openrouter.ai/api/v1",
+                    api_key=os.getenv("OPENROUTER_API_KEY", ""))
         
         # Environment will be created per-task
         self.env = None
diff --git a/tools/openrouter_client.py b/tools/openrouter_client.py
index 343cf102..0637a7db 100644
--- a/tools/openrouter_client.py
+++ b/tools/openrouter_client.py
@@ -1,39 +1,30 @@
 """Shared OpenRouter API client for Hermes tools.
 
 Provides a single lazy-initialized AsyncOpenAI client that all tool modules
-can share, eliminating the duplicated _get_openrouter_client() / 
-_get_summarizer_client() pattern previously copy-pasted across web_tools,
-vision_tools, mixture_of_agents_tool, and session_search_tool.
+can share.  Routes through the centralized provider router in
+agent/auxiliary_client.py so auth, headers, and API format are handled
+consistently.
 """
 
 import os
 
-from openai import AsyncOpenAI
-from hermes_constants import OPENROUTER_BASE_URL
-
-_client: AsyncOpenAI | None = None
+_client = None
 
 
-def get_async_client() -> AsyncOpenAI:
-    """Return a shared AsyncOpenAI client pointed at OpenRouter.
+def get_async_client():
+    """Return a shared async OpenAI-compatible client for OpenRouter.
 
     The client is created lazily on first call and reused thereafter.
+    Uses the centralized provider router for auth and client construction.
     Raises ValueError if OPENROUTER_API_KEY is not set.
     """
     global _client
     if _client is None:
-        api_key = os.getenv("OPENROUTER_API_KEY")
-        if not api_key:
+        from agent.auxiliary_client import resolve_provider_client
+        client, _model = resolve_provider_client("openrouter", async_mode=True)
+        if client is None:
             raise ValueError("OPENROUTER_API_KEY environment variable not set")
-        _client = AsyncOpenAI(
-            api_key=api_key,
-            base_url=OPENROUTER_BASE_URL,
-            default_headers={
-                "HTTP-Referer": "https://github.com/NousResearch/hermes-agent",
-                "X-OpenRouter-Title": "Hermes Agent",
-                "X-OpenRouter-Categories": "productivity,cli-agent",
-            },
-        )
+        _client = client
     return _client
 
 
diff --git a/tools/skills_guard.py b/tools/skills_guard.py
index 0b6d7fee..8234b0a2 100644
--- a/tools/skills_guard.py
+++ b/tools/skills_guard.py
@@ -29,7 +29,7 @@ from datetime import datetime, timezone
 from pathlib import Path
 from typing import List, Tuple
 
-from hermes_constants import OPENROUTER_BASE_URL
+
 
 
 # ---------------------------------------------------------------------------
@@ -934,24 +934,14 @@ def llm_audit_skill(skill_path: Path, static_result: ScanResult,
     if not model:
         return static_result
 
-    # Call the LLM via the OpenAI SDK (same pattern as run_agent.py)
+    # Call the LLM via the centralized provider router
     try:
-        from openai import OpenAI
-        import os
+        from agent.auxiliary_client import resolve_provider_client
 
-        api_key = os.getenv("OPENROUTER_API_KEY", "")
-        if not api_key:
+        client, _default_model = resolve_provider_client("openrouter")
+        if client is None:
             return static_result
 
-        client = OpenAI(
-            base_url=OPENROUTER_BASE_URL,
-            api_key=api_key,
-            default_headers={
-                "HTTP-Referer": "https://github.com/NousResearch/hermes-agent",
-                "X-OpenRouter-Title": "Hermes Agent",
-                "X-OpenRouter-Categories": "productivity,cli-agent",
-            },
-        )
         response = client.chat.completions.create(
             model=model,
             messages=[{
diff --git a/trajectory_compressor.py b/trajectory_compressor.py
index 3f49c617..5f1c84c6 100644
--- a/trajectory_compressor.py
+++ b/trajectory_compressor.py
@@ -344,38 +344,61 @@ class TrajectoryCompressor:
             raise RuntimeError(f"Failed to load tokenizer '{self.config.tokenizer_name}': {e}")
     
     def _init_summarizer(self):
-        """Initialize OpenRouter client for summarization (sync and async)."""
-        api_key = os.getenv(self.config.api_key_env)
-        if not api_key:
-            raise RuntimeError(f"Missing API key. Set {self.config.api_key_env} environment variable.")
-        
-        from openai import OpenAI, AsyncOpenAI
-        
-        # OpenRouter app attribution headers (only for OpenRouter endpoints)
-        extra = {}
-        if "openrouter" in self.config.base_url.lower():
-            extra["default_headers"] = {
-                "HTTP-Referer": "https://github.com/NousResearch/hermes-agent",
-                "X-OpenRouter-Title": "Hermes Agent",
-                "X-OpenRouter-Categories": "productivity,cli-agent",
-            }
-        
-        # Sync client (for backwards compatibility)
-        self.client = OpenAI(
-            api_key=api_key,
-            base_url=self.config.base_url,
-            **extra,
-        )
-        
-        # Async client for parallel processing
-        self.async_client = AsyncOpenAI(
-            api_key=api_key,
-            base_url=self.config.base_url,
-            **extra,
-        )
-        
-        print(f"✅ Initialized OpenRouter client: {self.config.summarization_model}")
+        """Initialize LLM client for summarization (sync and async).
+
+        Routes through the centralized provider router for known providers
+        (OpenRouter, Nous, Codex, etc.) so auth and headers are handled
+        consistently.  Falls back to raw construction for custom endpoints.
+        """
+        from agent.auxiliary_client import resolve_provider_client
+
+        provider = self._detect_provider()
+        if provider:
+            # Use centralized router — handles auth, headers, Codex adapter
+            self.client, _ = resolve_provider_client(
+                provider, model=self.config.summarization_model)
+            self.async_client, _ = resolve_provider_client(
+                provider, model=self.config.summarization_model,
+                async_mode=True)
+            if self.client is None:
+                raise RuntimeError(
+                    f"Provider '{provider}' is not configured. "
+                    f"Check your API key or run: hermes setup")
+        else:
+            # Custom endpoint — use config's raw base_url + api_key_env
+            api_key = os.getenv(self.config.api_key_env)
+            if not api_key:
+                raise RuntimeError(
+                    f"Missing API key. Set {self.config.api_key_env} "
+                    f"environment variable.")
+            from openai import OpenAI, AsyncOpenAI
+            self.client = OpenAI(
+                api_key=api_key, base_url=self.config.base_url)
+            self.async_client = AsyncOpenAI(
+                api_key=api_key, base_url=self.config.base_url)
+
+        print(f"✅ Initialized summarizer client: {self.config.summarization_model}")
         print(f"   Max concurrent requests: {self.config.max_concurrent_requests}")
+
+    def _detect_provider(self) -> str:
+        """Detect the provider name from the configured base_url."""
+        url = self.config.base_url.lower()
+        if "openrouter" in url:
+            return "openrouter"
+        if "nousresearch.com" in url:
+            return "nous"
+        if "chatgpt.com/backend-api/codex" in url:
+            return "codex"
+        if "api.z.ai" in url:
+            return "zai"
+        if "moonshot.ai" in url or "api.kimi.com" in url:
+            return "kimi-coding"
+        if "minimaxi.com" in url:
+            return "minimax-cn"
+        if "minimax.io" in url:
+            return "minimax"
+        # Unknown base_url — not a known provider
+        return ""
     
     def count_tokens(self, text: str) -> int:
         """Count tokens in text using the configured tokenizer."""

From 013cc4d2fcc46c25edb7b2452a1e101209dea2fb Mon Sep 17 00:00:00 2001
From: teknium1 <teknium1@gmail.com>
Date: Wed, 11 Mar 2026 20:14:44 -0700
Subject: [PATCH 03/11] chore: remove nous-api provider (API key path)

Nous Portal only supports OAuth authentication. Remove the 'nous-api'
provider which allowed direct API key access via NOUS_API_KEY env var.

Removed from:
- hermes_cli/auth.py: PROVIDER_REGISTRY entry + aliases
- hermes_cli/config.py: OPTIONAL_ENV_VARS entry
- hermes_cli/setup.py: setup wizard option + model selection handler
  (reindexed remaining provider choices)
- agent/auxiliary_client.py: docstring references
- tests/test_runtime_provider_resolution.py: nous-api test
- tests/integration/test_web_tools.py: renamed dict key
---
 agent/auxiliary_client.py                 |  4 +-
 hermes_cli/auth.py                        |  9 ----
 hermes_cli/config.py                      |  8 ---
 hermes_cli/setup.py                       | 61 ++++-------------------
 tests/integration/test_web_tools.py       |  2 +-
 tests/test_runtime_provider_resolution.py | 23 ---------
 6 files changed, 14 insertions(+), 93 deletions(-)

diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py
index 9c153a74..264bab3f 100644
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -547,7 +547,7 @@ def resolve_provider_client(
     Args:
         provider: Provider identifier.  One of:
             "openrouter", "nous", "openai-codex" (or "codex"),
-            "zai", "kimi-coding", "minimax", "minimax-cn", "nous-api",
+            "zai", "kimi-coding", "minimax", "minimax-cn",
             "custom" (OPENAI_BASE_URL + OPENAI_API_KEY),
             "auto" (full auto-detection chain).
         model: Model slug override.  If None, uses the provider's default
@@ -674,7 +674,7 @@ def resolve_provider_client(
             return resolve_provider_client("nous", model, async_mode)
         if provider == "openai-codex":
             return resolve_provider_client("openai-codex", model, async_mode)
-        # nous-api is api_key type so it's handled above
+        # Other OAuth providers not directly supported
         logger.warning("resolve_provider_client: OAuth provider %s not "
                        "directly supported, try 'auto'", provider)
         return None, None
diff --git a/hermes_cli/auth.py b/hermes_cli/auth.py
index c90f7792..05d233f9 100644
--- a/hermes_cli/auth.py
+++ b/hermes_cli/auth.py
@@ -108,14 +108,6 @@ PROVIDER_REGISTRY: Dict[str, ProviderConfig] = {
         auth_type="oauth_external",
         inference_base_url=DEFAULT_CODEX_BASE_URL,
     ),
-    "nous-api": ProviderConfig(
-        id="nous-api",
-        name="Nous Portal (API Key)",
-        auth_type="api_key",
-        inference_base_url="https://inference-api.nousresearch.com/v1",
-        api_key_env_vars=("NOUS_API_KEY",),
-        base_url_env_var="NOUS_BASE_URL",
-    ),
     "zai": ProviderConfig(
         id="zai",
         name="Z.AI / GLM",
@@ -521,7 +513,6 @@ def resolve_provider(
 
     # Normalize provider aliases
     _PROVIDER_ALIASES = {
-        "nous_api": "nous-api", "nousapi": "nous-api", "nous-portal-api": "nous-api",
         "glm": "zai", "z-ai": "zai", "z.ai": "zai", "zhipu": "zai",
         "kimi": "kimi-coding", "moonshot": "kimi-coding",
         "minimax-china": "minimax-cn", "minimax_cn": "minimax-cn",
diff --git a/hermes_cli/config.py b/hermes_cli/config.py
index 75811849..677de678 100644
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -242,14 +242,6 @@ REQUIRED_ENV_VARS = {}
 # Optional environment variables that enhance functionality
 OPTIONAL_ENV_VARS = {
     # ── Provider (handled in provider selection, not shown in checklists) ──
-    "NOUS_API_KEY": {
-        "description": "Nous Portal API key (direct API key access to Nous inference)",
-        "prompt": "Nous Portal API key",
-        "url": "https://portal.nousresearch.com",
-        "password": True,
-        "category": "provider",
-        "advanced": True,
-    },
     "NOUS_BASE_URL": {
         "description": "Nous Portal base URL override",
         "prompt": "Nous Portal base URL (leave empty for default)",
diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py
index c471b1b9..6b00952c 100644
--- a/hermes_cli/setup.py
+++ b/hermes_cli/setup.py
@@ -516,7 +516,6 @@ def setup_model_provider(config: dict):
         keep_label = None  # No provider configured — don't show "Keep current"
 
     provider_choices = [
-        "Nous Portal API key (direct API key access)",
         "Login with Nous Portal (Nous Research subscription — OAuth)",
         "Login with OpenAI Codex",
         "OpenRouter API key (100+ models, pay-per-use)",
@@ -530,7 +529,7 @@ def setup_model_provider(config: dict):
         provider_choices.append(keep_label)
     
     # Default to "Keep current" if a provider exists, otherwise OpenRouter (most common)
-    default_provider = len(provider_choices) - 1 if has_any_provider else 3
+    default_provider = len(provider_choices) - 1 if has_any_provider else 2
     
     if not has_any_provider:
         print_warning("An inference provider is required for Hermes to work.")
@@ -542,37 +541,7 @@ def setup_model_provider(config: dict):
     selected_provider = None  # "nous", "openai-codex", "openrouter", "custom", or None (keep)
     nous_models = []  # populated if Nous login succeeds
 
-    if provider_idx == 0:  # Nous Portal API Key (direct)
-        selected_provider = "nous-api"
-        print()
-        print_header("Nous Portal API Key")
-        print_info("Use a Nous Portal API key for direct access to Nous inference.")
-        print_info("Get your API key at: https://portal.nousresearch.com")
-        print()
-
-        existing_key = get_env_value("NOUS_API_KEY")
-        if existing_key:
-            print_info(f"Current: {existing_key[:8]}... (configured)")
-            if prompt_yes_no("Update Nous API key?", False):
-                api_key = prompt("  Nous API key", password=True)
-                if api_key:
-                    save_env_value("NOUS_API_KEY", api_key)
-                    print_success("Nous API key updated")
-        else:
-            api_key = prompt("  Nous API key", password=True)
-            if api_key:
-                save_env_value("NOUS_API_KEY", api_key)
-                print_success("Nous API key saved")
-            else:
-                print_warning("Skipped - agent won't work without an API key")
-
-        # Clear custom endpoint vars if switching
-        if existing_custom:
-            save_env_value("OPENAI_BASE_URL", "")
-            save_env_value("OPENAI_API_KEY", "")
-        _update_config_for_provider("nous-api", "https://inference-api.nousresearch.com/v1")
-
-    elif provider_idx == 1:  # Nous Portal
+    if provider_idx == 0:  # Nous Portal (OAuth)
         selected_provider = "nous"
         print()
         print_header("Nous Portal Login")
@@ -612,7 +581,7 @@ def setup_model_provider(config: dict):
             print_info("You can try again later with: hermes model")
             selected_provider = None
 
-    elif provider_idx == 2:  # OpenAI Codex
+    elif provider_idx == 1:  # OpenAI Codex
         selected_provider = "openai-codex"
         print()
         print_header("OpenAI Codex Login")
@@ -636,7 +605,7 @@ def setup_model_provider(config: dict):
             print_info("You can try again later with: hermes model")
             selected_provider = None
 
-    elif provider_idx == 3:  # OpenRouter
+    elif provider_idx == 2:  # OpenRouter
         selected_provider = "openrouter"
         print()
         print_header("OpenRouter API Key")
@@ -686,7 +655,7 @@ def setup_model_provider(config: dict):
         except Exception as e:
             logger.debug("Could not save provider to config.yaml: %s", e)
 
-    elif provider_idx == 4:  # Custom endpoint
+    elif provider_idx == 3:  # Custom endpoint
         selected_provider = "custom"
         print()
         print_header("Custom OpenAI-Compatible Endpoint")
@@ -737,7 +706,7 @@ def setup_model_provider(config: dict):
 
         print_success("Custom endpoint configured")
 
-    elif provider_idx == 5:  # Z.AI / GLM
+    elif provider_idx == 4:  # Z.AI / GLM
         selected_provider = "zai"
         print()
         print_header("Z.AI / GLM API Key")
@@ -791,7 +760,7 @@ def setup_model_provider(config: dict):
             save_env_value("OPENAI_API_KEY", "")
         _update_config_for_provider("zai", zai_base_url)
 
-    elif provider_idx == 6:  # Kimi / Moonshot
+    elif provider_idx == 5:  # Kimi / Moonshot
         selected_provider = "kimi-coding"
         print()
         print_header("Kimi / Moonshot API Key")
@@ -823,7 +792,7 @@ def setup_model_provider(config: dict):
             save_env_value("OPENAI_API_KEY", "")
         _update_config_for_provider("kimi-coding", pconfig.inference_base_url)
 
-    elif provider_idx == 7:  # MiniMax
+    elif provider_idx == 6:  # MiniMax
         selected_provider = "minimax"
         print()
         print_header("MiniMax API Key")
@@ -855,7 +824,7 @@ def setup_model_provider(config: dict):
             save_env_value("OPENAI_API_KEY", "")
         _update_config_for_provider("minimax", pconfig.inference_base_url)
 
-    elif provider_idx == 8:  # MiniMax China
+    elif provider_idx == 7:  # MiniMax China
         selected_provider = "minimax-cn"
         print()
         print_header("MiniMax China API Key")
@@ -887,12 +856,12 @@ def setup_model_provider(config: dict):
             save_env_value("OPENAI_API_KEY", "")
         _update_config_for_provider("minimax-cn", pconfig.inference_base_url)
 
-    # else: provider_idx == 9 (Keep current) — only shown when a provider already exists
+    # else: provider_idx == 8 (Keep current) — only shown when a provider already exists
 
     # ── OpenRouter API Key for tools (if not already set) ──
     # Tools (vision, web, MoA) use OpenRouter independently of the main provider.
     # Prompt for OpenRouter key if not set and a non-OpenRouter provider was chosen.
-    if selected_provider in ("nous", "nous-api", "openai-codex", "custom", "zai", "kimi-coding", "minimax", "minimax-cn") and not get_env_value("OPENROUTER_API_KEY"):
+    if selected_provider in ("nous", "openai-codex", "custom", "zai", "kimi-coding", "minimax", "minimax-cn") and not get_env_value("OPENROUTER_API_KEY"):
         print()
         print_header("OpenRouter API Key (for tools)")
         print_info("Tools like vision analysis, web search, and MoA use OpenRouter")
@@ -945,14 +914,6 @@ def setup_model_provider(config: dict):
             if custom:
                 config['model'] = custom
                 save_env_value("LLM_MODEL", custom)
-        elif selected_provider == "nous-api":
-            # Nous API key provider — prompt for model manually
-            print_info("Enter a model name available on Nous inference API.")
-            print_info("Examples: anthropic/claude-opus-4.6, deepseek/deepseek-r1")
-            custom = prompt(f"  Model name (Enter to keep '{current_model}')")
-            if custom:
-                config['model'] = custom
-                save_env_value("LLM_MODEL", custom)
         elif selected_provider == "openai-codex":
             from hermes_cli.codex_models import get_codex_model_ids
             codex_models = get_codex_model_ids()
diff --git a/tests/integration/test_web_tools.py b/tests/integration/test_web_tools.py
index cd3de453..fb2ea9da 100644
--- a/tests/integration/test_web_tools.py
+++ b/tests/integration/test_web_tools.py
@@ -579,7 +579,7 @@ class WebToolsTester:
             "results": self.test_results,
             "environment": {
                 "firecrawl_api_key": check_firecrawl_api_key(),
-                "nous_api_key": check_auxiliary_model(),
+                "auxiliary_model": check_auxiliary_model(),
                 "debug_mode": get_debug_session_info()["enabled"]
             }
         }
diff --git a/tests/test_runtime_provider_resolution.py b/tests/test_runtime_provider_resolution.py
index 9ccd7c7e..9631591b 100644
--- a/tests/test_runtime_provider_resolution.py
+++ b/tests/test_runtime_provider_resolution.py
@@ -158,29 +158,6 @@ def test_custom_endpoint_auto_provider_prefers_openai_key(monkeypatch):
     assert resolved["api_key"] == "sk-vllm-key"
 
 
-def test_resolve_runtime_provider_nous_api(monkeypatch):
-    """Nous Portal API key provider resolves via the api_key path."""
-    monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "nous-api")
-    monkeypatch.setattr(
-        rp,
-        "resolve_api_key_provider_credentials",
-        lambda pid: {
-            "provider": "nous-api",
-            "api_key": "nous-test-key",
-            "base_url": "https://inference-api.nousresearch.com/v1",
-            "source": "NOUS_API_KEY",
-        },
-    )
-
-    resolved = rp.resolve_runtime_provider(requested="nous-api")
-
-    assert resolved["provider"] == "nous-api"
-    assert resolved["api_mode"] == "chat_completions"
-    assert resolved["base_url"] == "https://inference-api.nousresearch.com/v1"
-    assert resolved["api_key"] == "nous-test-key"
-    assert resolved["requested_provider"] == "nous-api"
-
-
 def test_explicit_openrouter_skips_openai_base_url(monkeypatch):
     """When the user explicitly requests openrouter, OPENAI_BASE_URL
     (which may point to a custom endpoint) must not override the

From 0aa31cd3cb8167748ade1195e40eff469f07c7da Mon Sep 17 00:00:00 2001
From: teknium1 <teknium1@gmail.com>
Date: Wed, 11 Mar 2026 20:52:19 -0700
Subject: [PATCH 04/11] feat: call_llm/async_call_llm + config slots + migrate
 all consumers

Add centralized call_llm() and async_call_llm() functions that own the
full LLM request lifecycle:
  1. Resolve provider + model from task config or explicit args
  2. Get or create a cached client for that provider
  3. Format request args (max_tokens handling, provider extra_body)
  4. Make the API call with max_tokens/max_completion_tokens retry
  5. Return the response

Config: expanded auxiliary section with provider:model slots for all
tasks (compression, vision, web_extract, session_search, skills_hub,
mcp, flush_memories). Config version bumped to 7.

Migrated all auxiliary consumers:
- context_compressor.py: uses call_llm(task='compression')
- vision_tools.py: uses async_call_llm(task='vision')
- web_tools.py: uses async_call_llm(task='web_extract')
- session_search_tool.py: uses async_call_llm(task='session_search')
- browser_tool.py: uses call_llm(task='vision'/'web_extract')
- mcp_tool.py: uses call_llm(task='mcp')
- skills_guard.py: uses call_llm(provider='openrouter')
- run_agent.py flush_memories: uses call_llm(task='flush_memories')

Tests updated for context_compressor and MCP tool. Some test mocks
still need updating (15 remaining failures from mock pattern changes,
2 pre-existing).
---
 agent/auxiliary_client.py              | 250 +++++++++++++++++++++++++
 agent/context_compressor.py            |  94 +++-------
 hermes_cli/config.py                   |  32 +++-
 run_agent.py                           |  31 +--
 tests/agent/test_context_compressor.py |  46 ++---
 tests/tools/test_mcp_tool.py           |  97 +++++-----
 tools/browser_tool.py                  |  83 +++-----
 tools/mcp_tool.py                      |  40 ++--
 tools/session_search_tool.py           |  26 +--
 tools/skills_guard.py                  |   9 +-
 tools/vision_tools.py                  |  60 +++---
 tools/web_tools.py                     |  89 ++++-----
 trajectory_compressor.py               |  70 ++++---
 13 files changed, 552 insertions(+), 375 deletions(-)

diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py
index 264bab3f..04afe4c7 100644
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -784,3 +784,253 @@ def auxiliary_max_tokens_param(value: int) -> dict:
             and "api.openai.com" in custom_base.lower()):
         return {"max_completion_tokens": value}
     return {"max_tokens": value}
+
+
+# ── Centralized LLM Call API ────────────────────────────────────────────────
+#
+# call_llm() and async_call_llm() own the full request lifecycle:
+#   1. Resolve provider + model from task config (or explicit args)
+#   2. Get or create a cached client for that provider
+#   3. Format request args for the provider + model (max_tokens handling, etc.)
+#   4. Make the API call
+#   5. Return the response
+#
+# Every auxiliary LLM consumer should use these instead of manually
+# constructing clients and calling .chat.completions.create().
+
+# Client cache: (provider, async_mode) -> (client, default_model)
+_client_cache: Dict[tuple, tuple] = {}
+
+
+def _get_cached_client(
+    provider: str, model: str = None, async_mode: bool = False,
+) -> Tuple[Optional[Any], Optional[str]]:
+    """Get or create a cached client for the given provider."""
+    cache_key = (provider, async_mode)
+    if cache_key in _client_cache:
+        cached_client, cached_default = _client_cache[cache_key]
+        return cached_client, model or cached_default
+    client, default_model = resolve_provider_client(provider, model, async_mode)
+    if client is not None:
+        _client_cache[cache_key] = (client, default_model)
+    return client, model or default_model
+
+
+def _resolve_task_provider_model(
+    task: str = None,
+    provider: str = None,
+    model: str = None,
+) -> Tuple[str, Optional[str]]:
+    """Determine provider + model for a call.
+
+    Priority:
+      1. Explicit provider/model args (always win)
+      2. Env var overrides (AUXILIARY_{TASK}_PROVIDER, etc.)
+      3. Config file (auxiliary.{task}.provider/model or compression.*)
+      4. "auto" (full auto-detection chain)
+
+    Returns (provider, model) where model may be None (use provider default).
+    """
+    if provider:
+        return provider, model
+
+    if task:
+        # Check env var overrides first
+        env_provider = _get_auxiliary_provider(task)
+        if env_provider != "auto":
+            # Check for env var model override too
+            env_model = None
+            for prefix in ("AUXILIARY_", "CONTEXT_"):
+                val = os.getenv(f"{prefix}{task.upper()}_MODEL", "").strip()
+                if val:
+                    env_model = val
+                    break
+            return env_provider, model or env_model
+
+        # Read from config file
+        try:
+            from hermes_cli.config import load_config
+            config = load_config()
+        except ImportError:
+            return "auto", model
+
+        # Check auxiliary.{task} section
+        aux = config.get("auxiliary", {})
+        task_config = aux.get(task, {})
+        cfg_provider = task_config.get("provider", "").strip() or None
+        cfg_model = task_config.get("model", "").strip() or None
+
+        # Backwards compat: compression section has its own keys
+        if task == "compression" and not cfg_provider:
+            comp = config.get("compression", {})
+            cfg_provider = comp.get("summary_provider", "").strip() or None
+            cfg_model = cfg_model or comp.get("summary_model", "").strip() or None
+
+        if cfg_provider and cfg_provider != "auto":
+            return cfg_provider, model or cfg_model
+        return "auto", model or cfg_model
+
+    return "auto", model
+
+
+def _build_call_kwargs(
+    provider: str,
+    model: str,
+    messages: list,
+    temperature: Optional[float] = None,
+    max_tokens: Optional[int] = None,
+    tools: Optional[list] = None,
+    timeout: float = 30.0,
+    extra_body: Optional[dict] = None,
+) -> dict:
+    """Build kwargs for .chat.completions.create() with model/provider adjustments."""
+    kwargs: Dict[str, Any] = {
+        "model": model,
+        "messages": messages,
+        "timeout": timeout,
+    }
+
+    if temperature is not None:
+        kwargs["temperature"] = temperature
+
+    if max_tokens is not None:
+        # Codex adapter handles max_tokens internally; OpenRouter/Nous use max_tokens.
+        # Direct OpenAI api.openai.com with newer models needs max_completion_tokens.
+        if provider == "custom":
+            custom_base = os.getenv("OPENAI_BASE_URL", "")
+            if "api.openai.com" in custom_base.lower():
+                kwargs["max_completion_tokens"] = max_tokens
+            else:
+                kwargs["max_tokens"] = max_tokens
+        else:
+            kwargs["max_tokens"] = max_tokens
+
+    if tools:
+        kwargs["tools"] = tools
+
+    # Provider-specific extra_body
+    merged_extra = dict(extra_body or {})
+    if provider == "nous" or auxiliary_is_nous:
+        merged_extra.setdefault("tags", []).extend(["product=hermes-agent"])
+    if merged_extra:
+        kwargs["extra_body"] = merged_extra
+
+    return kwargs
+
+
+def call_llm(
+    task: str = None,
+    *,
+    provider: str = None,
+    model: str = None,
+    messages: list,
+    temperature: float = None,
+    max_tokens: int = None,
+    tools: list = None,
+    timeout: float = 30.0,
+    extra_body: dict = None,
+) -> Any:
+    """Centralized synchronous LLM call.
+
+    Resolves provider + model (from task config, explicit args, or auto-detect),
+    handles auth, request formatting, and model-specific arg adjustments.
+
+    Args:
+        task: Auxiliary task name ("compression", "vision", "web_extract",
+              "session_search", "skills_hub", "mcp", "flush_memories").
+              Reads provider:model from config/env. Ignored if provider is set.
+        provider: Explicit provider override.
+        model: Explicit model override.
+        messages: Chat messages list.
+        temperature: Sampling temperature (None = provider default).
+        max_tokens: Max output tokens (handles max_tokens vs max_completion_tokens).
+        tools: Tool definitions (for function calling).
+        timeout: Request timeout in seconds.
+        extra_body: Additional request body fields.
+
+    Returns:
+        Response object with .choices[0].message.content
+
+    Raises:
+        RuntimeError: If no provider is configured.
+    """
+    resolved_provider, resolved_model = _resolve_task_provider_model(
+        task, provider, model)
+
+    client, final_model = _get_cached_client(resolved_provider, resolved_model)
+    if client is None:
+        # Fallback: try openrouter
+        if resolved_provider != "openrouter":
+            logger.warning("Provider %s unavailable, falling back to openrouter",
+                           resolved_provider)
+            client, final_model = _get_cached_client(
+                "openrouter", resolved_model or _OPENROUTER_MODEL)
+    if client is None:
+        raise RuntimeError(
+            f"No LLM provider configured for task={task} provider={resolved_provider}. "
+            f"Run: hermes setup")
+
+    kwargs = _build_call_kwargs(
+        resolved_provider, final_model, messages,
+        temperature=temperature, max_tokens=max_tokens,
+        tools=tools, timeout=timeout, extra_body=extra_body)
+
+    # Handle max_tokens vs max_completion_tokens retry
+    try:
+        return client.chat.completions.create(**kwargs)
+    except Exception as first_err:
+        err_str = str(first_err)
+        if "max_tokens" in err_str or "unsupported_parameter" in err_str:
+            kwargs.pop("max_tokens", None)
+            kwargs["max_completion_tokens"] = max_tokens
+            return client.chat.completions.create(**kwargs)
+        raise
+
+
+async def async_call_llm(
+    task: str = None,
+    *,
+    provider: str = None,
+    model: str = None,
+    messages: list,
+    temperature: float = None,
+    max_tokens: int = None,
+    tools: list = None,
+    timeout: float = 30.0,
+    extra_body: dict = None,
+) -> Any:
+    """Centralized asynchronous LLM call.
+
+    Same as call_llm() but async. See call_llm() for full documentation.
+    """
+    resolved_provider, resolved_model = _resolve_task_provider_model(
+        task, provider, model)
+
+    client, final_model = _get_cached_client(
+        resolved_provider, resolved_model, async_mode=True)
+    if client is None:
+        if resolved_provider != "openrouter":
+            logger.warning("Provider %s unavailable, falling back to openrouter",
+                           resolved_provider)
+            client, final_model = _get_cached_client(
+                "openrouter", resolved_model or _OPENROUTER_MODEL,
+                async_mode=True)
+    if client is None:
+        raise RuntimeError(
+            f"No LLM provider configured for task={task} provider={resolved_provider}. "
+            f"Run: hermes setup")
+
+    kwargs = _build_call_kwargs(
+        resolved_provider, final_model, messages,
+        temperature=temperature, max_tokens=max_tokens,
+        tools=tools, timeout=timeout, extra_body=extra_body)
+
+    try:
+        return await client.chat.completions.create(**kwargs)
+    except Exception as first_err:
+        err_str = str(first_err)
+        if "max_tokens" in err_str or "unsupported_parameter" in err_str:
+            kwargs.pop("max_tokens", None)
+            kwargs["max_completion_tokens"] = max_tokens
+            return await client.chat.completions.create(**kwargs)
+        raise
diff --git a/agent/context_compressor.py b/agent/context_compressor.py
index fae483fd..a0ca0c99 100644
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@@ -9,7 +9,7 @@ import logging
 import os
 from typing import Any, Dict, List, Optional
 
-from agent.auxiliary_client import get_text_auxiliary_client
+from agent.auxiliary_client import call_llm
 from agent.model_metadata import (
     get_model_context_length,
     estimate_messages_tokens_rough,
@@ -53,8 +53,7 @@ class ContextCompressor:
         self.last_completion_tokens = 0
         self.last_total_tokens = 0
 
-        self.client, default_model = get_text_auxiliary_client("compression")
-        self.summary_model = summary_model_override or default_model
+        self.summary_model = summary_model_override or ""
 
     def update_from_response(self, usage: Dict[str, Any]):
         """Update tracked token usage from API response."""
@@ -120,73 +119,30 @@ TURNS TO SUMMARIZE:
 
 Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
 
-        # 1. Try the auxiliary model (cheap/fast)
-        if self.client:
-            try:
-                return self._call_summary_model(self.client, self.summary_model, prompt)
-            except Exception as e:
-                logging.warning(f"Failed to generate context summary with auxiliary model: {e}")
-
-        # 2. Fallback: re-try via the centralized provider router.
-        #    This covers all configured providers (Codex OAuth, API-key
-        #    providers, etc.) without ad-hoc env var lookups.
-        from agent.auxiliary_client import resolve_provider_client
-        fallback_providers = ["custom", "openrouter", "nous", "codex"]
-        for fb_provider in fallback_providers:
-            try:
-                fb_client, fb_model = resolve_provider_client(
-                    fb_provider, model=self.model)
-                if fb_client is None:
-                    continue
-                # Don't retry the same client that just failed
-                if (self.client is not None
-                        and hasattr(fb_client, "base_url")
-                        and hasattr(self.client, "base_url")
-                        and str(fb_client.base_url) == str(self.client.base_url)):
-                    continue
-                logger.info("Retrying context summary with fallback provider "
-                            "%s (%s)", fb_provider, fb_model)
-                summary = self._call_summary_model(fb_client, fb_model, prompt)
-                # Promote successful fallback for future compressions
-                self.client = fb_client
-                self.summary_model = fb_model
-                return summary
-            except Exception as fallback_err:
-                logging.warning("Fallback provider %s failed: %s",
-                                fb_provider, fallback_err)
-
-        # 3. All providers failed — return None so the caller drops turns
-        #    without a summary.
-        logging.warning("Context compression: no provider available for "
-                        "summary. Middle turns will be dropped without summary.")
-        return None
-
-    def _call_summary_model(self, client, model: str, prompt: str) -> str:
-        """Make the actual LLM call to generate a summary. Raises on failure."""
-        kwargs = {
-            "model": model,
-            "messages": [{"role": "user", "content": prompt}],
-            "temperature": 0.3,
-            "timeout": 30.0,
-        }
-        # Most providers (OpenRouter, local models) use max_tokens.
-        # Direct OpenAI with newer models (gpt-4o, o-series, gpt-5+)
-        # requires max_completion_tokens instead.
+        # Use the centralized LLM router — handles provider resolution,
+        # auth, and fallback internally.
         try:
-            kwargs["max_tokens"] = self.summary_target_tokens * 2
-            response = client.chat.completions.create(**kwargs)
-        except Exception as first_err:
-            if "max_tokens" in str(first_err) or "unsupported_parameter" in str(first_err):
-                kwargs.pop("max_tokens", None)
-                kwargs["max_completion_tokens"] = self.summary_target_tokens * 2
-                response = client.chat.completions.create(**kwargs)
-            else:
-                raise
-
-        summary = response.choices[0].message.content.strip()
-        if not summary.startswith("[CONTEXT SUMMARY]:"):
-            summary = "[CONTEXT SUMMARY]: " + summary
-        return summary
+            call_kwargs = {
+                "task": "compression",
+                "messages": [{"role": "user", "content": prompt}],
+                "temperature": 0.3,
+                "max_tokens": self.summary_target_tokens * 2,
+                "timeout": 30.0,
+            }
+            if self.summary_model:
+                call_kwargs["model"] = self.summary_model
+            response = call_llm(**call_kwargs)
+            summary = response.choices[0].message.content.strip()
+            if not summary.startswith("[CONTEXT SUMMARY]:"):
+                summary = "[CONTEXT SUMMARY]: " + summary
+            return summary
+        except RuntimeError:
+            logging.warning("Context compression: no provider available for "
+                            "summary. Middle turns will be dropped without summary.")
+            return None
+        except Exception as e:
+            logging.warning("Failed to generate context summary: %s", e)
+            return None
 
     # ------------------------------------------------------------------
     # Tool-call / tool-result pair integrity helpers
diff --git a/hermes_cli/config.py b/hermes_cli/config.py
index 677de678..99008978 100644
--- a/hermes_cli/config.py
+++ b/hermes_cli/config.py
@@ -125,17 +125,41 @@ DEFAULT_CONFIG = {
         "summary_provider": "auto",
     },
     
-    # Auxiliary model overrides (advanced).  By default Hermes auto-selects
-    # the provider and model for each side task.  Set these to override.
+    # Auxiliary model config — provider:model for each side task.
+    # Format: provider is the provider name, model is the model slug.
+    # "auto" for provider = auto-detect best available provider.
+    # Empty model = use provider's default auxiliary model.
+    # All tasks fall back to openrouter:google/gemini-3-flash-preview if
+    # the configured provider is unavailable.
     "auxiliary": {
         "vision": {
-            "provider": "auto",    # auto | openrouter | nous | main
+            "provider": "auto",    # auto | openrouter | nous | codex | custom
             "model": "",           # e.g. "google/gemini-2.5-flash", "gpt-4o"
         },
         "web_extract": {
             "provider": "auto",
             "model": "",
         },
+        "compression": {
+            "provider": "auto",
+            "model": "",
+        },
+        "session_search": {
+            "provider": "auto",
+            "model": "",
+        },
+        "skills_hub": {
+            "provider": "auto",
+            "model": "",
+        },
+        "mcp": {
+            "provider": "auto",
+            "model": "",
+        },
+        "flush_memories": {
+            "provider": "auto",
+            "model": "",
+        },
     },
     
     "display": {
@@ -217,7 +241,7 @@ DEFAULT_CONFIG = {
     "personalities": {},
 
     # Config schema version - bump this when adding new required fields
-    "_config_version": 6,
+    "_config_version": 7,
 }
 
 # =============================================================================
diff --git a/run_agent.py b/run_agent.py
index db35d85f..8849d25c 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -2623,19 +2623,22 @@ class AIAgent:
 
             # Use auxiliary client for the flush call when available --
             # it's cheaper and avoids Codex Responses API incompatibility.
-            from agent.auxiliary_client import get_text_auxiliary_client
-            aux_client, aux_model = get_text_auxiliary_client()
+            from agent.auxiliary_client import call_llm as _call_llm
+            _aux_available = True
+            try:
+                response = _call_llm(
+                    task="flush_memories",
+                    messages=api_messages,
+                    tools=[memory_tool_def],
+                    temperature=0.3,
+                    max_tokens=5120,
+                    timeout=30.0,
+                )
+            except RuntimeError:
+                _aux_available = False
+                response = None
 
-            if aux_client:
-                api_kwargs = {
-                    "model": aux_model,
-                    "messages": api_messages,
-                    "tools": [memory_tool_def],
-                    "temperature": 0.3,
-                    "max_tokens": 5120,
-                }
-                response = aux_client.chat.completions.create(**api_kwargs, timeout=30.0)
-            elif self.api_mode == "codex_responses":
+            if not _aux_available and self.api_mode == "codex_responses":
                 # No auxiliary client -- use the Codex Responses path directly
                 codex_kwargs = self._build_api_kwargs(api_messages)
                 codex_kwargs["tools"] = self._responses_tools([memory_tool_def])
@@ -2643,7 +2646,7 @@ class AIAgent:
                 if "max_output_tokens" in codex_kwargs:
                     codex_kwargs["max_output_tokens"] = 5120
                 response = self._run_codex_stream(codex_kwargs)
-            else:
+            elif not _aux_available:
                 api_kwargs = {
                     "model": self.model,
                     "messages": api_messages,
@@ -2655,7 +2658,7 @@ class AIAgent:
 
             # Extract tool calls from the response, handling both API formats
             tool_calls = []
-            if self.api_mode == "codex_responses" and not aux_client:
+            if self.api_mode == "codex_responses" and not _aux_available:
                 assistant_msg, _ = self._normalize_codex_response(response)
                 if assistant_msg and assistant_msg.tool_calls:
                     tool_calls = assistant_msg.tool_calls
diff --git a/tests/agent/test_context_compressor.py b/tests/agent/test_context_compressor.py
index 12fa374c..82ee9350 100644
--- a/tests/agent/test_context_compressor.py
+++ b/tests/agent/test_context_compressor.py
@@ -9,8 +9,7 @@ from agent.context_compressor import ContextCompressor
 @pytest.fixture()
 def compressor():
     """Create a ContextCompressor with mocked dependencies."""
-    with patch("agent.context_compressor.get_model_context_length", return_value=100000), \
-         patch("agent.context_compressor.get_text_auxiliary_client", return_value=(None, None)):
+    with patch("agent.context_compressor.get_model_context_length", return_value=100000):
         c = ContextCompressor(
             model="test/model",
             threshold_percent=0.85,
@@ -119,14 +118,11 @@ class TestGenerateSummaryNoneContent:
     """Regression: content=None (from tool-call-only assistant messages) must not crash."""
 
     def test_none_content_does_not_crash(self):
-        mock_client = MagicMock()
         mock_response = MagicMock()
         mock_response.choices = [MagicMock()]
         mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: tool calls happened"
-        mock_client.chat.completions.create.return_value = mock_response
 
-        with patch("agent.context_compressor.get_model_context_length", return_value=100000), \
-             patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
+        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
             c = ContextCompressor(model="test", quiet_mode=True)
 
         messages = [
@@ -139,14 +135,14 @@ class TestGenerateSummaryNoneContent:
             {"role": "user", "content": "thanks"},
         ]
 
-        summary = c._generate_summary(messages)
+        with patch("agent.context_compressor.call_llm", return_value=mock_response):
+            summary = c._generate_summary(messages)
         assert isinstance(summary, str)
         assert "CONTEXT SUMMARY" in summary
 
     def test_none_content_in_system_message_compress(self):
         """System message with content=None should not crash during compress."""
-        with patch("agent.context_compressor.get_model_context_length", return_value=100000), \
-             patch("agent.context_compressor.get_text_auxiliary_client", return_value=(None, None)):
+        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
             c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2)
 
         msgs = [{"role": "system", "content": None}] + [
@@ -165,12 +161,12 @@ class TestCompressWithClient:
         mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened"
         mock_client.chat.completions.create.return_value = mock_response
 
-        with patch("agent.context_compressor.get_model_context_length", return_value=100000), \
-             patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
+        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
             c = ContextCompressor(model="test", quiet_mode=True)
 
         msgs = [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"} for i in range(10)]
-        result = c.compress(msgs)
+        with patch("agent.context_compressor.call_llm", return_value=mock_response):
+            result = c.compress(msgs)
 
         # Should have summary message in the middle
         contents = [m.get("content", "") for m in result]
@@ -184,8 +180,7 @@ class TestCompressWithClient:
         mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compressed middle"
         mock_client.chat.completions.create.return_value = mock_response
 
-        with patch("agent.context_compressor.get_model_context_length", return_value=100000), \
-             patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
+        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
             c = ContextCompressor(
                 model="test",
                 quiet_mode=True,
@@ -212,7 +207,8 @@ class TestCompressWithClient:
             {"role": "user", "content": "later 4"},
         ]
 
-        result = c.compress(msgs)
+        with patch("agent.context_compressor.call_llm", return_value=mock_response):
+            result = c.compress(msgs)
 
         answered_ids = {
             msg.get("tool_call_id")
@@ -232,8 +228,7 @@ class TestCompressWithClient:
         mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened"
         mock_client.chat.completions.create.return_value = mock_response
 
-        with patch("agent.context_compressor.get_model_context_length", return_value=100000), \
-             patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
+        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
             c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2)
 
         # Last head message (index 1) is "assistant" → summary should be "user"
@@ -245,7 +240,8 @@ class TestCompressWithClient:
             {"role": "user", "content": "msg 4"},
             {"role": "assistant", "content": "msg 5"},
         ]
-        result = c.compress(msgs)
+        with patch("agent.context_compressor.call_llm", return_value=mock_response):
+            result = c.compress(msgs)
         summary_msg = [m for m in result if "CONTEXT SUMMARY" in (m.get("content") or "")]
         assert len(summary_msg) == 1
         assert summary_msg[0]["role"] == "user"
@@ -258,8 +254,7 @@ class TestCompressWithClient:
         mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened"
         mock_client.chat.completions.create.return_value = mock_response
 
-        with patch("agent.context_compressor.get_model_context_length", return_value=100000), \
-             patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
+        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
             c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=3, protect_last_n=2)
 
         # Last head message (index 2) is "user" → summary should be "assistant"
@@ -273,20 +268,18 @@ class TestCompressWithClient:
             {"role": "user", "content": "msg 6"},
             {"role": "assistant", "content": "msg 7"},
         ]
-        result = c.compress(msgs)
+        with patch("agent.context_compressor.call_llm", return_value=mock_response):
+            result = c.compress(msgs)
         summary_msg = [m for m in result if "CONTEXT SUMMARY" in (m.get("content") or "")]
         assert len(summary_msg) == 1
         assert summary_msg[0]["role"] == "assistant"
 
     def test_summarization_does_not_start_tail_with_tool_outputs(self):
-        mock_client = MagicMock()
         mock_response = MagicMock()
         mock_response.choices = [MagicMock()]
         mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compressed middle"
-        mock_client.chat.completions.create.return_value = mock_response
 
-        with patch("agent.context_compressor.get_model_context_length", return_value=100000), \
-             patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")):
+        with patch("agent.context_compressor.get_model_context_length", return_value=100000):
             c = ContextCompressor(
                 model="test",
                 quiet_mode=True,
@@ -309,7 +302,8 @@ class TestCompressWithClient:
             {"role": "user", "content": "latest user"},
         ]
 
-        result = c.compress(msgs)
+        with patch("agent.context_compressor.call_llm", return_value=mock_response):
+            result = c.compress(msgs)
 
         called_ids = {
             tc["id"]
diff --git a/tests/tools/test_mcp_tool.py b/tests/tools/test_mcp_tool.py
index 446f80d3..0d527e95 100644
--- a/tests/tools/test_mcp_tool.py
+++ b/tests/tools/test_mcp_tool.py
@@ -1828,8 +1828,8 @@ class TestSamplingCallbackText:
         )
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             params = _make_sampling_params()
             result = asyncio.run(self.handler(None, params))
@@ -1847,13 +1847,13 @@ class TestSamplingCallbackText:
         fake_client.chat.completions.create.return_value = _make_llm_response()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
-        ):
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
+        ) as mock_call:
             params = _make_sampling_params(system_prompt="Be helpful")
             asyncio.run(self.handler(None, params))
 
-        call_args = fake_client.chat.completions.create.call_args
+        call_args = mock_call.call_args
         messages = call_args.kwargs["messages"]
         assert messages[0] == {"role": "system", "content": "Be helpful"}
 
@@ -1865,8 +1865,8 @@ class TestSamplingCallbackText:
         )
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             params = _make_sampling_params()
             result = asyncio.run(self.handler(None, params))
@@ -1889,8 +1889,8 @@ class TestSamplingCallbackToolUse:
         fake_client.chat.completions.create.return_value = _make_llm_tool_response()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             params = _make_sampling_params()
             result = asyncio.run(self.handler(None, params))
@@ -1916,8 +1916,8 @@ class TestSamplingCallbackToolUse:
         )
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             result = asyncio.run(self.handler(None, _make_sampling_params()))
 
@@ -1939,8 +1939,8 @@ class TestToolLoopGovernance:
         fake_client.chat.completions.create.return_value = _make_llm_tool_response()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             params = _make_sampling_params()
             # Round 1, 2: allowed
@@ -1959,8 +1959,8 @@ class TestToolLoopGovernance:
         fake_client = MagicMock()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             # Tool response (round 1 of 1 allowed)
             fake_client.chat.completions.create.return_value = _make_llm_tool_response()
@@ -1984,8 +1984,8 @@ class TestToolLoopGovernance:
         fake_client.chat.completions.create.return_value = _make_llm_tool_response()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
             assert isinstance(result, ErrorData)
@@ -2003,8 +2003,8 @@ class TestSamplingErrors:
         fake_client.chat.completions.create.return_value = _make_llm_response()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             # First call succeeds
             r1 = asyncio.run(handler(None, _make_sampling_params()))
@@ -2017,20 +2017,16 @@ class TestSamplingErrors:
 
     def test_timeout_error(self):
         handler = SamplingHandler("to", {"timeout": 0.05})
-        fake_client = MagicMock()
 
         def slow_call(**kwargs):
             import threading
-            # Use an event to ensure the thread truly blocks long enough
             evt = threading.Event()
             evt.wait(5)  # blocks for up to 5 seconds (cancelled by timeout)
             return _make_llm_response()
 
-        fake_client.chat.completions.create.side_effect = slow_call
-
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            side_effect=slow_call,
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
             assert isinstance(result, ErrorData)
@@ -2041,12 +2037,11 @@ class TestSamplingErrors:
         handler = SamplingHandler("np", {})
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(None, None),
+            "agent.auxiliary_client.call_llm",
+            side_effect=RuntimeError("No LLM provider configured"),
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
             assert isinstance(result, ErrorData)
-            assert "No LLM provider" in result.message
             assert handler.metrics["errors"] == 1
 
     def test_empty_choices_returns_error(self):
@@ -2060,8 +2055,8 @@ class TestSamplingErrors:
         )
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
 
@@ -2080,8 +2075,8 @@ class TestSamplingErrors:
         )
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
 
@@ -2099,8 +2094,8 @@ class TestSamplingErrors:
         )
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
 
@@ -2120,8 +2115,8 @@ class TestModelWhitelist:
         fake_client.chat.completions.create.return_value = _make_llm_response()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "test-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
             assert isinstance(result, CreateMessageResult)
@@ -2131,8 +2126,8 @@ class TestModelWhitelist:
         fake_client = MagicMock()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "gpt-3.5-turbo"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
             assert isinstance(result, ErrorData)
@@ -2145,8 +2140,8 @@ class TestModelWhitelist:
         fake_client.chat.completions.create.return_value = _make_llm_response()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "any-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
             assert isinstance(result, CreateMessageResult)
@@ -2166,8 +2161,8 @@ class TestMalformedToolCallArgs:
         )
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
 
@@ -2194,8 +2189,8 @@ class TestMalformedToolCallArgs:
         fake_client.chat.completions.create.return_value = response
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             result = asyncio.run(handler(None, _make_sampling_params()))
 
@@ -2214,8 +2209,8 @@ class TestMetricsTracking:
         fake_client.chat.completions.create.return_value = _make_llm_response()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             asyncio.run(handler(None, _make_sampling_params()))
 
@@ -2229,8 +2224,8 @@ class TestMetricsTracking:
         fake_client.chat.completions.create.return_value = _make_llm_tool_response()
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(fake_client, "default-model"),
+            "agent.auxiliary_client.call_llm",
+            return_value=fake_client.chat.completions.create.return_value,
         ):
             asyncio.run(handler(None, _make_sampling_params()))
 
@@ -2241,8 +2236,8 @@ class TestMetricsTracking:
         handler = SamplingHandler("met3", {})
 
         with patch(
-            "agent.auxiliary_client.get_text_auxiliary_client",
-            return_value=(None, None),
+            "agent.auxiliary_client.call_llm",
+            side_effect=RuntimeError("No LLM provider configured"),
         ):
             asyncio.run(handler(None, _make_sampling_params()))
 
diff --git a/tools/browser_tool.py b/tools/browser_tool.py
index dd44549b..ae951574 100644
--- a/tools/browser_tool.py
+++ b/tools/browser_tool.py
@@ -63,7 +63,7 @@ import time
 import requests
 from typing import Dict, Any, Optional, List
 from pathlib import Path
-from agent.auxiliary_client import get_vision_auxiliary_client, get_text_auxiliary_client
+from agent.auxiliary_client import call_llm
 
 logger = logging.getLogger(__name__)
 
@@ -80,38 +80,15 @@ DEFAULT_SESSION_TIMEOUT = 300
 # Max tokens for snapshot content before summarization
 SNAPSHOT_SUMMARIZE_THRESHOLD = 8000
 
-# Vision client — for browser_vision (screenshot analysis)
-# Wrapped in try/except so a broken auxiliary config doesn't prevent the entire
-# browser_tool module from importing (which would disable all 10 browser tools).
-try:
-    _aux_vision_client, _DEFAULT_VISION_MODEL = get_vision_auxiliary_client()
-except Exception as _init_err:
-    logger.debug("Could not initialise vision auxiliary client: %s", _init_err)
-    _aux_vision_client, _DEFAULT_VISION_MODEL = None, None
 
-# Text client — for page snapshot summarization (same config as web_extract)
-try:
-    _aux_text_client, _DEFAULT_TEXT_MODEL = get_text_auxiliary_client("web_extract")
-except Exception as _init_err:
-    logger.debug("Could not initialise text auxiliary client: %s", _init_err)
-    _aux_text_client, _DEFAULT_TEXT_MODEL = None, None
-
-# Module-level alias for availability checks
-EXTRACTION_MODEL = _DEFAULT_TEXT_MODEL or _DEFAULT_VISION_MODEL
-
-
-def _get_vision_model() -> str:
+def _get_vision_model() -> Optional[str]:
     """Model for browser_vision (screenshot analysis — multimodal)."""
-    return (os.getenv("AUXILIARY_VISION_MODEL", "").strip()
-            or _DEFAULT_VISION_MODEL
-            or "google/gemini-3-flash-preview")
+    return os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None
 
 
-def _get_extraction_model() -> str:
+def _get_extraction_model() -> Optional[str]:
     """Model for page snapshot text summarization — same as web_extract."""
-    return (os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip()
-            or _DEFAULT_TEXT_MODEL
-            or "google/gemini-3-flash-preview")
+    return os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() or None
 
 
 def _is_local_mode() -> bool:
@@ -941,9 +918,6 @@ def _extract_relevant_content(
 
     Falls back to simple truncation when no auxiliary text model is configured.
     """
-    if _aux_text_client is None:
-        return _truncate_snapshot(snapshot_text)
-
     if user_task:
         extraction_prompt = (
             f"You are a content extractor for a browser automation agent.\n\n"
@@ -968,13 +942,16 @@ def _extract_relevant_content(
         )
 
     try:
-        from agent.auxiliary_client import auxiliary_max_tokens_param
-        response = _aux_text_client.chat.completions.create(
-            model=_get_extraction_model(),
-            messages=[{"role": "user", "content": extraction_prompt}],
-            **auxiliary_max_tokens_param(4000),
-            temperature=0.1,
-        )
+        call_kwargs = {
+            "task": "web_extract",
+            "messages": [{"role": "user", "content": extraction_prompt}],
+            "max_tokens": 4000,
+            "temperature": 0.1,
+        }
+        model = _get_extraction_model()
+        if model:
+            call_kwargs["model"] = model
+        response = call_llm(**call_kwargs)
         return response.choices[0].message.content
     except Exception:
         return _truncate_snapshot(snapshot_text)
@@ -1497,14 +1474,6 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
     
     effective_task_id = task_id or "default"
     
-    # Check auxiliary vision client
-    if _aux_vision_client is None or _DEFAULT_VISION_MODEL is None:
-        return json.dumps({
-            "success": False,
-            "error": "Browser vision unavailable: no auxiliary vision model configured. "
-                     "Set OPENROUTER_API_KEY or configure Nous Portal to enable browser vision."
-        }, ensure_ascii=False)
-    
     # Save screenshot to persistent location so it can be shared with users
     hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
     screenshots_dir = hermes_home / "browser_screenshots"
@@ -1562,14 +1531,13 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
             f"Focus on answering the user's specific question."
         )
 
-        # Use the sync auxiliary vision client directly
-        from agent.auxiliary_client import auxiliary_max_tokens_param
+        # Use the centralized LLM router
         vision_model = _get_vision_model()
-        logger.debug("browser_vision: analysing screenshot (%d bytes) with model=%s",
-                     len(image_data), vision_model)
-        response = _aux_vision_client.chat.completions.create(
-            model=vision_model,
-            messages=[
+        logger.debug("browser_vision: analysing screenshot (%d bytes)",
+                     len(image_data))
+        call_kwargs = {
+            "task": "vision",
+            "messages": [
                 {
                     "role": "user",
                     "content": [
@@ -1578,9 +1546,12 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
                     ],
                 }
             ],
-            **auxiliary_max_tokens_param(2000),
-            temperature=0.1,
-        )
+            "max_tokens": 2000,
+            "temperature": 0.1,
+        }
+        if vision_model:
+            call_kwargs["model"] = vision_model
+        response = call_llm(**call_kwargs)
         
         analysis = response.choices[0].message.content
         response_data = {
diff --git a/tools/mcp_tool.py b/tools/mcp_tool.py
index b0fc35f7..e1137909 100644
--- a/tools/mcp_tool.py
+++ b/tools/mcp_tool.py
@@ -456,17 +456,13 @@ class SamplingHandler:
         # Resolve model
         model = self._resolve_model(getattr(params, "modelPreferences", None))
 
-        # Get auxiliary LLM client
-        from agent.auxiliary_client import get_text_auxiliary_client
-        client, default_model = get_text_auxiliary_client()
-        if client is None:
-            self.metrics["errors"] += 1
-            return self._error("No LLM provider available for sampling")
+        # Get auxiliary LLM client via centralized router
+        from agent.auxiliary_client import call_llm
 
-        resolved_model = model or default_model
+        # Model whitelist check (we need to resolve model before calling)
+        resolved_model = model or self.model_override or ""
 
-        # Model whitelist check
-        if self.allowed_models and resolved_model not in self.allowed_models:
+        if self.allowed_models and resolved_model and resolved_model not in self.allowed_models:
             logger.warning(
                 "MCP server '%s' requested model '%s' not in allowed_models",
                 self.server_name, resolved_model,
@@ -484,20 +480,15 @@ class SamplingHandler:
 
         # Build LLM call kwargs
         max_tokens = min(params.maxTokens, self.max_tokens_cap)
-        call_kwargs: dict = {
-            "model": resolved_model,
-            "messages": messages,
-            "max_tokens": max_tokens,
-        }
+        call_temperature = None
         if hasattr(params, "temperature") and params.temperature is not None:
-            call_kwargs["temperature"] = params.temperature
-        if stop := getattr(params, "stopSequences", None):
-            call_kwargs["stop"] = stop
+            call_temperature = params.temperature
 
         # Forward server-provided tools
+        call_tools = None
         server_tools = getattr(params, "tools", None)
         if server_tools:
-            call_kwargs["tools"] = [
+            call_tools = [
                 {
                     "type": "function",
                     "function": {
@@ -508,9 +499,6 @@ class SamplingHandler:
                 }
                 for t in server_tools
             ]
-            if tool_choice := getattr(params, "toolChoice", None):
-                mode = getattr(tool_choice, "mode", "auto")
-                call_kwargs["tool_choice"] = {"auto": "auto", "required": "required", "none": "none"}.get(mode, "auto")
 
         logger.log(
             self.audit_level,
@@ -520,7 +508,15 @@ class SamplingHandler:
 
         # Offload sync LLM call to thread (non-blocking)
         def _sync_call():
-            return client.chat.completions.create(**call_kwargs)
+            return call_llm(
+                task="mcp",
+                model=resolved_model or None,
+                messages=messages,
+                temperature=call_temperature,
+                max_tokens=max_tokens,
+                tools=call_tools,
+                timeout=self.timeout,
+            )
 
         try:
             response = await asyncio.wait_for(
diff --git a/tools/session_search_tool.py b/tools/session_search_tool.py
index 4bf88cbf..cd1b98fd 100644
--- a/tools/session_search_tool.py
+++ b/tools/session_search_tool.py
@@ -22,13 +22,7 @@ import os
 import logging
 from typing import Dict, Any, List, Optional, Union
 
-from openai import AsyncOpenAI, OpenAI
-
-from agent.auxiliary_client import get_async_text_auxiliary_client
-
-# Resolve the async auxiliary client at import time so we have the model slug.
-# Handles Codex Responses API adapter transparently.
-_async_aux_client, _SUMMARIZER_MODEL = get_async_text_auxiliary_client()
+from agent.auxiliary_client import async_call_llm
 MAX_SESSION_CHARS = 100_000
 MAX_SUMMARY_TOKENS = 10000
 
@@ -156,26 +150,22 @@ async def _summarize_session(
         f"Summarize this conversation with focus on: {query}"
     )
 
-    if _async_aux_client is None or _SUMMARIZER_MODEL is None:
-        logging.warning("No auxiliary model available for session summarization")
-        return None
-
     max_retries = 3
     for attempt in range(max_retries):
         try:
-            from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
-            _extra = get_auxiliary_extra_body()
-            response = await _async_aux_client.chat.completions.create(
-                model=_SUMMARIZER_MODEL,
+            response = await async_call_llm(
+                task="session_search",
                 messages=[
                     {"role": "system", "content": system_prompt},
                     {"role": "user", "content": user_prompt},
                 ],
-                **({} if not _extra else {"extra_body": _extra}),
                 temperature=0.1,
-                **auxiliary_max_tokens_param(MAX_SUMMARY_TOKENS),
+                max_tokens=MAX_SUMMARY_TOKENS,
             )
             return response.choices[0].message.content.strip()
+        except RuntimeError:
+            logging.warning("No auxiliary model available for session summarization")
+            return None
         except Exception as e:
             if attempt < max_retries - 1:
                 await asyncio.sleep(1 * (attempt + 1))
@@ -333,8 +323,6 @@ def session_search(
 
 def check_session_search_requirements() -> bool:
     """Requires SQLite state database and an auxiliary text model."""
-    if _async_aux_client is None:
-        return False
     try:
         from hermes_state import DEFAULT_DB_PATH
         return DEFAULT_DB_PATH.parent.exists()
diff --git a/tools/skills_guard.py b/tools/skills_guard.py
index 8234b0a2..c354d654 100644
--- a/tools/skills_guard.py
+++ b/tools/skills_guard.py
@@ -936,13 +936,10 @@ def llm_audit_skill(skill_path: Path, static_result: ScanResult,
 
     # Call the LLM via the centralized provider router
     try:
-        from agent.auxiliary_client import resolve_provider_client
+        from agent.auxiliary_client import call_llm
 
-        client, _default_model = resolve_provider_client("openrouter")
-        if client is None:
-            return static_result
-
-        response = client.chat.completions.create(
+        response = call_llm(
+            provider="openrouter",
             model=model,
             messages=[{
                 "role": "user",
diff --git a/tools/vision_tools.py b/tools/vision_tools.py
index ee89b58a..c1b09a22 100644
--- a/tools/vision_tools.py
+++ b/tools/vision_tools.py
@@ -37,16 +37,11 @@ from pathlib import Path
 from typing import Any, Awaitable, Dict, Optional
 from urllib.parse import urlparse
 import httpx
-from agent.auxiliary_client import get_async_vision_auxiliary_client
+from agent.auxiliary_client import async_call_llm
 from tools.debug_helpers import DebugSession
 
 logger = logging.getLogger(__name__)
 
-# Resolve vision auxiliary client at module level.
-# Uses get_async_vision_auxiliary_client() which properly handles Codex
-# routing (Responses API adapter) instead of raw AsyncOpenAI construction.
-_aux_async_client, DEFAULT_VISION_MODEL = get_async_vision_auxiliary_client()
-
 _debug = DebugSession("vision_tools", env_var="VISION_TOOLS_DEBUG")
 
 
@@ -185,7 +180,7 @@ def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None)
 async def vision_analyze_tool(
     image_url: str,
     user_prompt: str,
-    model: str = DEFAULT_VISION_MODEL,
+    model: str = None,
 ) -> str:
     """
     Analyze an image from a URL or local file path using vision AI.
@@ -245,15 +240,6 @@ async def vision_analyze_tool(
         logger.info("Analyzing image: %s", image_url[:60])
         logger.info("User prompt: %s", user_prompt[:100])
         
-        # Check auxiliary vision client availability
-        if _aux_async_client is None or DEFAULT_VISION_MODEL is None:
-            logger.error("Vision analysis unavailable: no auxiliary vision model configured")
-            return json.dumps({
-                "success": False,
-                "analysis": "Vision analysis unavailable: no auxiliary vision model configured. "
-                            "Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools."
-            }, indent=2, ensure_ascii=False)
-        
         # Determine if this is a local file path or a remote URL
         local_path = Path(image_url)
         if local_path.is_file():
@@ -309,18 +295,18 @@ async def vision_analyze_tool(
             }
         ]
         
-        logger.info("Processing image with %s...", model)
+        logger.info("Processing image with vision model...")
         
-        # Call the vision API
-        from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
-        _extra = get_auxiliary_extra_body()
-        response = await _aux_async_client.chat.completions.create(
-            model=model,
-            messages=messages,
-            temperature=0.1,
-            **auxiliary_max_tokens_param(2000),
-            **({} if not _extra else {"extra_body": _extra}),
-        )
+        # Call the vision API via centralized router
+        call_kwargs = {
+            "task": "vision",
+            "messages": messages,
+            "temperature": 0.1,
+            "max_tokens": 2000,
+        }
+        if model:
+            call_kwargs["model"] = model
+        response = await async_call_llm(**call_kwargs)
         
         # Extract the analysis
         analysis = response.choices[0].message.content.strip()
@@ -391,7 +377,18 @@ async def vision_analyze_tool(
 
 def check_vision_requirements() -> bool:
     """Check if an auxiliary vision model is available."""
-    return _aux_async_client is not None
+    try:
+        from agent.auxiliary_client import resolve_provider_client
+        client, _ = resolve_provider_client("openrouter")
+        if client is not None:
+            return True
+        client, _ = resolve_provider_client("nous")
+        if client is not None:
+            return True
+        client, _ = resolve_provider_client("custom")
+        return client is not None
+    except Exception:
+        return False
 
 
 def get_debug_session_info() -> Dict[str, Any]:
@@ -419,10 +416,9 @@ if __name__ == "__main__":
         print("Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools.")
         exit(1)
     else:
-        print(f"✅ Vision model available: {DEFAULT_VISION_MODEL}")
+        print("✅ Vision model available")
     
     print("🛠️ Vision tools ready for use!")
-    print(f"🧠 Using model: {DEFAULT_VISION_MODEL}")
     
     # Show debug mode status
     if _debug.active:
@@ -489,9 +485,7 @@ def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]:
         "Fully describe and explain everything about this image, then answer the "
         f"following question:\n\n{question}"
     )
-    model = (os.getenv("AUXILIARY_VISION_MODEL", "").strip()
-             or DEFAULT_VISION_MODEL
-             or "google/gemini-3-flash-preview")
+    model = os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None
     return vision_analyze_tool(image_url, full_prompt, model)
 
 
diff --git a/tools/web_tools.py b/tools/web_tools.py
index e99d94fb..71a882a5 100644
--- a/tools/web_tools.py
+++ b/tools/web_tools.py
@@ -47,8 +47,7 @@ import re
 import asyncio
 from typing import List, Dict, Any, Optional
 from firecrawl import Firecrawl
-from openai import AsyncOpenAI
-from agent.auxiliary_client import get_async_text_auxiliary_client
+from agent.auxiliary_client import async_call_llm
 from tools.debug_helpers import DebugSession
 
 logger = logging.getLogger(__name__)
@@ -83,15 +82,8 @@ def _get_firecrawl_client():
 
 DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000
 
-# Resolve async auxiliary client at module level.
-# Handles Codex Responses API adapter transparently.
-_aux_async_client, _DEFAULT_SUMMARIZER_MODEL = get_async_text_auxiliary_client("web_extract")
-
-# Allow per-task override via config.yaml auxiliary.web_extract_model
-DEFAULT_SUMMARIZER_MODEL = (
-    os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip()
-    or _DEFAULT_SUMMARIZER_MODEL
-)
+# Allow per-task override via env var
+DEFAULT_SUMMARIZER_MODEL = os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() or None
 
 _debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG")
 
@@ -249,22 +241,22 @@ Create a markdown summary that captures all key information in a well-organized,
 
     for attempt in range(max_retries):
         try:
-            if _aux_async_client is None:
-                logger.warning("No auxiliary model available for web content processing")
-                return None
-            from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
-            _extra = get_auxiliary_extra_body()
-            response = await _aux_async_client.chat.completions.create(
-                model=model,
-                messages=[
+            call_kwargs = {
+                "task": "web_extract",
+                "messages": [
                     {"role": "system", "content": system_prompt},
                     {"role": "user", "content": user_prompt}
                 ],
-                temperature=0.1,
-                **auxiliary_max_tokens_param(max_tokens),
-                **({} if not _extra else {"extra_body": _extra}),
-            )
+                "temperature": 0.1,
+                "max_tokens": max_tokens,
+            }
+            if model:
+                call_kwargs["model"] = model
+            response = await async_call_llm(**call_kwargs)
             return response.choices[0].message.content.strip()
+        except RuntimeError:
+            logger.warning("No auxiliary model available for web content processing")
+            return None
         except Exception as api_error:
             last_error = api_error
             if attempt < max_retries - 1:
@@ -368,25 +360,18 @@ Synthesize these into ONE cohesive, comprehensive summary that:
 Create a single, unified markdown summary."""
 
     try:
-        if _aux_async_client is None:
-            logger.warning("No auxiliary model for synthesis, concatenating summaries")
-            fallback = "\n\n".join(summaries)
-            if len(fallback) > max_output_size:
-                fallback = fallback[:max_output_size] + "\n\n[... truncated ...]"
-            return fallback
-
-        from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param
-        _extra = get_auxiliary_extra_body()
-        response = await _aux_async_client.chat.completions.create(
-            model=model,
-            messages=[
+        call_kwargs = {
+            "task": "web_extract",
+            "messages": [
                 {"role": "system", "content": "You synthesize multiple summaries into one cohesive, comprehensive summary. Be thorough but concise."},
                 {"role": "user", "content": synthesis_prompt}
             ],
-            temperature=0.1,
-            **auxiliary_max_tokens_param(20000),
-            **({} if not _extra else {"extra_body": _extra}),
-        )
+            "temperature": 0.1,
+            "max_tokens": 20000,
+        }
+        if model:
+            call_kwargs["model"] = model
+        response = await async_call_llm(**call_kwargs)
         final_summary = response.choices[0].message.content.strip()
         
         # Enforce hard cap
@@ -713,8 +698,8 @@ async def web_extract_tool(
         debug_call_data["pages_extracted"] = pages_extracted
         debug_call_data["original_response_size"] = len(json.dumps(response))
         
-        # Process each result with LLM if enabled and auxiliary client is available
-        if use_llm_processing and _aux_async_client is not None:
+        # Process each result with LLM if enabled
+        if use_llm_processing:
             logger.info("Processing extracted content with LLM (parallel)...")
             debug_call_data["processing_applied"].append("llm_processing")
             
@@ -780,10 +765,6 @@ async def web_extract_tool(
                 else:
                     logger.warning("%s (no content to process)", url)
         else:
-            if use_llm_processing and _aux_async_client is None:
-                logger.warning("LLM processing requested but no auxiliary model available, returning raw content")
-                debug_call_data["processing_applied"].append("llm_processing_unavailable")
-            
             # Print summary of extracted pages for debugging (original behavior)
             for result in response.get('results', []):
                 url = result.get('url', 'Unknown URL')
@@ -1013,8 +994,8 @@ async def web_crawl_tool(
         debug_call_data["pages_crawled"] = pages_crawled
         debug_call_data["original_response_size"] = len(json.dumps(response))
         
-        # Process each result with LLM if enabled and auxiliary client is available
-        if use_llm_processing and _aux_async_client is not None:
+        # Process each result with LLM if enabled
+        if use_llm_processing:
             logger.info("Processing crawled content with LLM (parallel)...")
             debug_call_data["processing_applied"].append("llm_processing")
             
@@ -1080,10 +1061,6 @@ async def web_crawl_tool(
                 else:
                     logger.warning("%s (no content to process)", page_url)
         else:
-            if use_llm_processing and _aux_async_client is None:
-                logger.warning("LLM processing requested but no auxiliary model available, returning raw content")
-                debug_call_data["processing_applied"].append("llm_processing_unavailable")
-            
             # Print summary of crawled pages for debugging (original behavior)
             for result in response.get('results', []):
                 page_url = result.get('url', 'Unknown URL')
@@ -1138,7 +1115,15 @@ def check_firecrawl_api_key() -> bool:
 
 def check_auxiliary_model() -> bool:
     """Check if an auxiliary text model is available for LLM content processing."""
-    return _aux_async_client is not None
+    try:
+        from agent.auxiliary_client import resolve_provider_client
+        for p in ("openrouter", "nous", "custom", "codex"):
+            client, _ = resolve_provider_client(p)
+            if client is not None:
+                return True
+        return False
+    except Exception:
+        return False
 
 
 def get_debug_session_info() -> Dict[str, Any]:
diff --git a/trajectory_compressor.py b/trajectory_compressor.py
index 5f1c84c6..ef81d6e2 100644
--- a/trajectory_compressor.py
+++ b/trajectory_compressor.py
@@ -344,28 +344,32 @@ class TrajectoryCompressor:
             raise RuntimeError(f"Failed to load tokenizer '{self.config.tokenizer_name}': {e}")
     
     def _init_summarizer(self):
-        """Initialize LLM client for summarization (sync and async).
+        """Initialize LLM routing for summarization (sync and async).
 
-        Routes through the centralized provider router for known providers
-        (OpenRouter, Nous, Codex, etc.) so auth and headers are handled
-        consistently.  Falls back to raw construction for custom endpoints.
+        Uses call_llm/async_call_llm from the centralized provider router
+        which handles auth, headers, and provider detection internally.
+        For custom endpoints, falls back to raw client construction.
         """
-        from agent.auxiliary_client import resolve_provider_client
+        from agent.auxiliary_client import call_llm, async_call_llm
 
         provider = self._detect_provider()
         if provider:
-            # Use centralized router — handles auth, headers, Codex adapter
-            self.client, _ = resolve_provider_client(
+            # Store provider for use in _generate_summary calls
+            self._llm_provider = provider
+            self._use_call_llm = True
+            # Verify the provider is available
+            from agent.auxiliary_client import resolve_provider_client
+            client, _ = resolve_provider_client(
                 provider, model=self.config.summarization_model)
-            self.async_client, _ = resolve_provider_client(
-                provider, model=self.config.summarization_model,
-                async_mode=True)
-            if self.client is None:
+            if client is None:
                 raise RuntimeError(
                     f"Provider '{provider}' is not configured. "
                     f"Check your API key or run: hermes setup")
+            self.client = None  # Not used directly
+            self.async_client = None  # Not used directly
         else:
             # Custom endpoint — use config's raw base_url + api_key_env
+            self._use_call_llm = False
             api_key = os.getenv(self.config.api_key_env)
             if not api_key:
                 raise RuntimeError(
@@ -524,12 +528,22 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
             try:
                 metrics.summarization_api_calls += 1
                 
-                response = self.client.chat.completions.create(
-                    model=self.config.summarization_model,
-                    messages=[{"role": "user", "content": prompt}],
-                    temperature=self.config.temperature,
-                    max_tokens=self.config.summary_target_tokens * 2,
-                )
+                if getattr(self, '_use_call_llm', False):
+                    from agent.auxiliary_client import call_llm
+                    response = call_llm(
+                        provider=self._llm_provider,
+                        model=self.config.summarization_model,
+                        messages=[{"role": "user", "content": prompt}],
+                        temperature=self.config.temperature,
+                        max_tokens=self.config.summary_target_tokens * 2,
+                    )
+                else:
+                    response = self.client.chat.completions.create(
+                        model=self.config.summarization_model,
+                        messages=[{"role": "user", "content": prompt}],
+                        temperature=self.config.temperature,
+                        max_tokens=self.config.summary_target_tokens * 2,
+                    )
                 
                 summary = response.choices[0].message.content.strip()
                 
@@ -581,12 +595,22 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""
             try:
                 metrics.summarization_api_calls += 1
                 
-                response = await self.async_client.chat.completions.create(
-                    model=self.config.summarization_model,
-                    messages=[{"role": "user", "content": prompt}],
-                    temperature=self.config.temperature,
-                    max_tokens=self.config.summary_target_tokens * 2,
-                )
+                if getattr(self, '_use_call_llm', False):
+                    from agent.auxiliary_client import async_call_llm
+                    response = await async_call_llm(
+                        provider=self._llm_provider,
+                        model=self.config.summarization_model,
+                        messages=[{"role": "user", "content": prompt}],
+                        temperature=self.config.temperature,
+                        max_tokens=self.config.summary_target_tokens * 2,
+                    )
+                else:
+                    response = await self.async_client.chat.completions.create(
+                        model=self.config.summarization_model,
+                        messages=[{"role": "user", "content": prompt}],
+                        temperature=self.config.temperature,
+                        max_tokens=self.config.summary_target_tokens * 2,
+                    )
                 
                 summary = response.choices[0].message.content.strip()
                 

From 29ef69c703324fb75b567279ee6ed3d1bf6ab7dd Mon Sep 17 00:00:00 2001
From: teknium1 <teknium1@gmail.com>
Date: Wed, 11 Mar 2026 21:06:54 -0700
Subject: [PATCH 05/11] fix: update all test mocks for call_llm migration

Update 14 test files to use the new call_llm/async_call_llm mock
patterns instead of the old get_text_auxiliary_client/
get_vision_auxiliary_client tuple returns.

- vision_tools tests: mock async_call_llm instead of _aux_async_client
- browser tests: mock call_llm instead of _aux_vision_client
- flush_memories tests: mock call_llm instead of get_text_auxiliary_client
- session_search tests: mock async_call_llm with RuntimeError
- mcp_tool tests: fix whitelist model config, use side_effect for
  multi-response tests
- auxiliary_config_bridge: update for model=None (resolved in router)

3251 passed, 2 pre-existing unrelated failures.
---
 tests/test_auxiliary_config_bridge.py |  7 ++++---
 tests/test_flush_memories_codex.py    | 25 +++++++++++--------------
 tests/test_run_agent.py               |  2 +-
 tests/tools/test_browser_console.py   |  6 ++----
 tests/tools/test_mcp_tool.py          | 14 ++++++++------
 tests/tools/test_session_search.py    | 18 ++++++++----------
 tests/tools/test_vision_tools.py      | 23 ++++++-----------------
 7 files changed, 40 insertions(+), 55 deletions(-)

diff --git a/tests/test_auxiliary_config_bridge.py b/tests/test_auxiliary_config_bridge.py
index b0804e4b..a4d65c2a 100644
--- a/tests/test_auxiliary_config_bridge.py
+++ b/tests/test_auxiliary_config_bridge.py
@@ -229,13 +229,14 @@ class TestVisionModelOverride:
 
     def test_default_model_when_no_override(self, monkeypatch):
         monkeypatch.delenv("AUXILIARY_VISION_MODEL", raising=False)
-        from tools.vision_tools import _handle_vision_analyze, DEFAULT_VISION_MODEL
+        from tools.vision_tools import _handle_vision_analyze
         with patch("tools.vision_tools.vision_analyze_tool", new_callable=MagicMock) as mock_tool:
             mock_tool.return_value = '{"success": true}'
             _handle_vision_analyze({"image_url": "http://test.jpg", "question": "test"})
             call_args = mock_tool.call_args
-            expected = DEFAULT_VISION_MODEL or "google/gemini-3-flash-preview"
-            assert call_args[0][2] == expected
+            # With no AUXILIARY_VISION_MODEL env var, model should be None
+            # (the centralized call_llm router picks the provider default)
+            assert call_args[0][2] is None
 
 
 # ── DEFAULT_CONFIG shape tests ───────────────────────────────────────────────
diff --git a/tests/test_flush_memories_codex.py b/tests/test_flush_memories_codex.py
index 22eef5ab..3d12c9d3 100644
--- a/tests/test_flush_memories_codex.py
+++ b/tests/test_flush_memories_codex.py
@@ -98,10 +98,9 @@ class TestFlushMemoriesUsesAuxiliaryClient:
     def test_flush_uses_auxiliary_when_available(self, monkeypatch):
         agent = _make_agent(monkeypatch, api_mode="codex_responses", provider="openai-codex")
 
-        mock_aux_client = MagicMock()
-        mock_aux_client.chat.completions.create.return_value = _chat_response_with_memory_call()
+        mock_response = _chat_response_with_memory_call()
 
-        with patch("agent.auxiliary_client.get_text_auxiliary_client", return_value=(mock_aux_client, "gpt-4o-mini")):
+        with patch("agent.auxiliary_client.call_llm", return_value=mock_response) as mock_call:
             messages = [
                 {"role": "user", "content": "Hello"},
                 {"role": "assistant", "content": "Hi there"},
@@ -110,9 +109,9 @@ class TestFlushMemoriesUsesAuxiliaryClient:
             with patch("tools.memory_tool.memory_tool", return_value="Saved.") as mock_memory:
                 agent.flush_memories(messages)
 
-        mock_aux_client.chat.completions.create.assert_called_once()
-        call_kwargs = mock_aux_client.chat.completions.create.call_args
-        assert call_kwargs.kwargs.get("model") == "gpt-4o-mini" or call_kwargs[1].get("model") == "gpt-4o-mini"
+        mock_call.assert_called_once()
+        call_kwargs = mock_call.call_args
+        assert call_kwargs.kwargs.get("task") == "flush_memories"
 
     def test_flush_uses_main_client_when_no_auxiliary(self, monkeypatch):
         """Non-Codex mode with no auxiliary falls back to self.client."""
@@ -120,7 +119,7 @@ class TestFlushMemoriesUsesAuxiliaryClient:
         agent.client = MagicMock()
         agent.client.chat.completions.create.return_value = _chat_response_with_memory_call()
 
-        with patch("agent.auxiliary_client.get_text_auxiliary_client", return_value=(None, None)):
+        with patch("agent.auxiliary_client.call_llm", side_effect=RuntimeError("no provider")):
             messages = [
                 {"role": "user", "content": "Hello"},
                 {"role": "assistant", "content": "Hi there"},
@@ -135,10 +134,9 @@ class TestFlushMemoriesUsesAuxiliaryClient:
         """Verify that memory tool calls from the flush response actually get executed."""
         agent = _make_agent(monkeypatch, api_mode="chat_completions", provider="openrouter")
 
-        mock_aux_client = MagicMock()
-        mock_aux_client.chat.completions.create.return_value = _chat_response_with_memory_call()
+        mock_response = _chat_response_with_memory_call()
 
-        with patch("agent.auxiliary_client.get_text_auxiliary_client", return_value=(mock_aux_client, "gpt-4o-mini")):
+        with patch("agent.auxiliary_client.call_llm", return_value=mock_response):
             messages = [
                 {"role": "user", "content": "Hello"},
                 {"role": "assistant", "content": "Hi"},
@@ -157,10 +155,9 @@ class TestFlushMemoriesUsesAuxiliaryClient:
         """After flush, the flush prompt and any response should be removed from messages."""
         agent = _make_agent(monkeypatch, api_mode="chat_completions", provider="openrouter")
 
-        mock_aux_client = MagicMock()
-        mock_aux_client.chat.completions.create.return_value = _chat_response_with_memory_call()
+        mock_response = _chat_response_with_memory_call()
 
-        with patch("agent.auxiliary_client.get_text_auxiliary_client", return_value=(mock_aux_client, "gpt-4o-mini")):
+        with patch("agent.auxiliary_client.call_llm", return_value=mock_response):
             messages = [
                 {"role": "user", "content": "Hello"},
                 {"role": "assistant", "content": "Hi"},
@@ -202,7 +199,7 @@ class TestFlushMemoriesCodexFallback:
             model="gpt-5-codex",
         )
 
-        with patch("agent.auxiliary_client.get_text_auxiliary_client", return_value=(None, None)), \
+        with patch("agent.auxiliary_client.call_llm", side_effect=RuntimeError("no provider")), \
              patch.object(agent, "_run_codex_stream", return_value=codex_response) as mock_stream, \
              patch.object(agent, "_build_api_kwargs") as mock_build, \
              patch("tools.memory_tool.memory_tool", return_value="Saved.") as mock_memory:
diff --git a/tests/test_run_agent.py b/tests/test_run_agent.py
index a3a82283..c789d735 100644
--- a/tests/test_run_agent.py
+++ b/tests/test_run_agent.py
@@ -959,7 +959,7 @@ class TestFlushSentinelNotLeaked:
         agent.client.chat.completions.create.return_value = mock_response
 
         # Bypass auxiliary client so flush uses agent.client directly
-        with patch("agent.auxiliary_client.get_text_auxiliary_client", return_value=(None, None)):
+        with patch("agent.auxiliary_client.call_llm", side_effect=RuntimeError("no provider")):
             agent.flush_memories(messages, min_turns=0)
 
         # Check what was actually sent to the API
diff --git a/tests/tools/test_browser_console.py b/tests/tools/test_browser_console.py
index 962b49f0..f5f54a0b 100644
--- a/tests/tools/test_browser_console.py
+++ b/tests/tools/test_browser_console.py
@@ -137,8 +137,7 @@ class TestBrowserVisionAnnotate:
 
         with (
             patch("tools.browser_tool._run_browser_command") as mock_cmd,
-            patch("tools.browser_tool._aux_vision_client") as mock_client,
-            patch("tools.browser_tool._DEFAULT_VISION_MODEL", "test-model"),
+            patch("tools.browser_tool.call_llm") as mock_call_llm,
             patch("tools.browser_tool._get_vision_model", return_value="test-model"),
         ):
             mock_cmd.return_value = {"success": True, "data": {}}
@@ -159,8 +158,7 @@ class TestBrowserVisionAnnotate:
 
         with (
             patch("tools.browser_tool._run_browser_command") as mock_cmd,
-            patch("tools.browser_tool._aux_vision_client") as mock_client,
-            patch("tools.browser_tool._DEFAULT_VISION_MODEL", "test-model"),
+            patch("tools.browser_tool.call_llm") as mock_call_llm,
             patch("tools.browser_tool._get_vision_model", return_value="test-model"),
         ):
             mock_cmd.return_value = {"success": True, "data": {}}
diff --git a/tests/tools/test_mcp_tool.py b/tests/tools/test_mcp_tool.py
index 0d527e95..bc3179ea 100644
--- a/tests/tools/test_mcp_tool.py
+++ b/tests/tools/test_mcp_tool.py
@@ -1956,24 +1956,26 @@ class TestToolLoopGovernance:
     def test_text_response_resets_counter(self):
         """A text response resets the tool loop counter."""
         handler = SamplingHandler("tl2", {"max_tool_rounds": 1})
-        fake_client = MagicMock()
+
+        # Use a list to hold the current response, so the side_effect can
+        # pick up changes between calls.
+        responses = [_make_llm_tool_response()]
 
         with patch(
             "agent.auxiliary_client.call_llm",
-            return_value=fake_client.chat.completions.create.return_value,
+            side_effect=lambda **kw: responses[0],
         ):
             # Tool response (round 1 of 1 allowed)
-            fake_client.chat.completions.create.return_value = _make_llm_tool_response()
             r1 = asyncio.run(handler(None, _make_sampling_params()))
             assert isinstance(r1, CreateMessageResultWithTools)
 
             # Text response resets counter
-            fake_client.chat.completions.create.return_value = _make_llm_response()
+            responses[0] = _make_llm_response()
             r2 = asyncio.run(handler(None, _make_sampling_params()))
             assert isinstance(r2, CreateMessageResult)
 
             # Tool response again (should succeed since counter was reset)
-            fake_client.chat.completions.create.return_value = _make_llm_tool_response()
+            responses[0] = _make_llm_tool_response()
             r3 = asyncio.run(handler(None, _make_sampling_params()))
             assert isinstance(r3, CreateMessageResultWithTools)
 
@@ -2122,7 +2124,7 @@ class TestModelWhitelist:
             assert isinstance(result, CreateMessageResult)
 
     def test_disallowed_model_rejected(self):
-        handler = SamplingHandler("wl2", {"allowed_models": ["gpt-4o"]})
+        handler = SamplingHandler("wl2", {"allowed_models": ["gpt-4o"], "model": "test-model"})
         fake_client = MagicMock()
 
         with patch(
diff --git a/tests/tools/test_session_search.py b/tests/tools/test_session_search.py
index 645e08ff..c3624714 100644
--- a/tests/tools/test_session_search.py
+++ b/tests/tools/test_session_search.py
@@ -189,16 +189,14 @@ class TestSessionSearch:
             {"role": "assistant", "content": "hi there"},
         ]
 
-        # Mock the summarizer to return a simple summary
-        import tools.session_search_tool as sst
-        original_client = sst._async_aux_client
-        sst._async_aux_client = None  # Disable summarizer → returns None
-
-        result = json.loads(session_search(
-            query="test", db=mock_db, current_session_id=current_sid,
-        ))
-
-        sst._async_aux_client = original_client
+        # Mock async_call_llm to raise RuntimeError → summarizer returns None
+        from unittest.mock import AsyncMock, patch as _patch
+        with _patch("tools.session_search_tool.async_call_llm",
+                     new_callable=AsyncMock,
+                     side_effect=RuntimeError("no provider")):
+            result = json.loads(session_search(
+                query="test", db=mock_db, current_session_id=current_sid,
+            ))
 
         assert result["success"] is True
         # Current session should be skipped, only other_sid should appear
diff --git a/tests/tools/test_vision_tools.py b/tests/tools/test_vision_tools.py
index 0135284a..6cfdc941 100644
--- a/tests/tools/test_vision_tools.py
+++ b/tests/tools/test_vision_tools.py
@@ -202,7 +202,7 @@ class TestHandleVisionAnalyze:
             assert model == "custom/model-v1"
 
     def test_falls_back_to_default_model(self):
-        """Without AUXILIARY_VISION_MODEL, should use DEFAULT_VISION_MODEL or fallback."""
+        """Without AUXILIARY_VISION_MODEL, model should be None (let call_llm resolve default)."""
         with (
             patch(
                 "tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock
@@ -218,9 +218,9 @@ class TestHandleVisionAnalyze:
             coro.close()
             call_args = mock_tool.call_args
             model = call_args[0][2]
-            # Should be DEFAULT_VISION_MODEL or the hardcoded fallback
-            assert model is not None
-            assert len(model) > 0
+            # With no AUXILIARY_VISION_MODEL set, model should be None
+            # (the centralized call_llm router picks the default)
+            assert model is None
 
     def test_empty_args_graceful(self):
         """Missing keys should default to empty strings, not raise."""
@@ -277,8 +277,6 @@ class TestErrorLoggingExcInfo:
                 new_callable=AsyncMock,
                 side_effect=Exception("download boom"),
             ),
-            patch("tools.vision_tools._aux_async_client", MagicMock()),
-            patch("tools.vision_tools.DEFAULT_VISION_MODEL", "test/model"),
             caplog.at_level(logging.ERROR, logger="tools.vision_tools"),
         ):
             result = await vision_analyze_tool(
@@ -311,25 +309,16 @@ class TestErrorLoggingExcInfo:
                 "tools.vision_tools._image_to_base64_data_url",
                 return_value="data:image/jpeg;base64,abc",
             ),
-            patch("agent.auxiliary_client.get_auxiliary_extra_body", return_value=None),
-            patch(
-                "agent.auxiliary_client.auxiliary_max_tokens_param",
-                return_value={"max_tokens": 2000},
-            ),
             caplog.at_level(logging.WARNING, logger="tools.vision_tools"),
         ):
-            # Mock the vision client
-            mock_client = AsyncMock()
+            # Mock the async_call_llm function to return a mock response
             mock_response = MagicMock()
             mock_choice = MagicMock()
             mock_choice.message.content = "A test image description"
             mock_response.choices = [mock_choice]
-            mock_client.chat.completions.create = AsyncMock(return_value=mock_response)
 
-            # Patch module-level _aux_async_client so the tool doesn't bail early
             with (
-                patch("tools.vision_tools._aux_async_client", mock_client),
-                patch("tools.vision_tools.DEFAULT_VISION_MODEL", "test/model"),
+                patch("tools.vision_tools.async_call_llm", new_callable=AsyncMock, return_value=mock_response),
             ):
                 # Make unlink fail to trigger cleanup warning
                 original_unlink = Path.unlink

From a29801286ff0997dc688e206c3144cfe4bc4bdf6 Mon Sep 17 00:00:00 2001
From: teknium1 <teknium1@gmail.com>
Date: Wed, 11 Mar 2026 21:38:29 -0700
Subject: [PATCH 06/11] refactor: route main agent client + fallback through
 centralized router
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 2 of the provider router migration — route the main agent's
client construction and fallback activation through
resolve_provider_client() instead of duplicated ad-hoc logic.

run_agent.py:
- __init__: When no explicit api_key/base_url, use
  resolve_provider_client(provider, raw_codex=True) for client
  construction. Explicit creds (from CLI/gateway runtime provider)
  still construct directly.
- _try_activate_fallback: Replace _resolve_fallback_credentials and
  its duplicated _FALLBACK_API_KEY_PROVIDERS / _FALLBACK_OAUTH_PROVIDERS
  dicts with a single resolve_provider_client() call. The router
  handles all provider types (API-key, OAuth, Codex) centrally.
- Remove _resolve_fallback_credentials method and both fallback dicts.

agent/auxiliary_client.py:
- Add raw_codex parameter to resolve_provider_client(). When True,
  returns the raw OpenAI client for Codex providers instead of wrapping
  in CodexAuxiliaryClient. The main agent needs this for direct
  responses.stream() access.

3251 passed, 2 pre-existing unrelated failures.
---
 agent/auxiliary_client.py    |  17 +++
 run_agent.py                 | 185 ++++++++++++--------------------
 tests/test_fallback_model.py | 200 +++++++++++++++++++++--------------
 3 files changed, 206 insertions(+), 196 deletions(-)

diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py
index 04afe4c7..19c2b8bd 100644
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -536,6 +536,7 @@ def resolve_provider_client(
     provider: str,
     model: str = None,
     async_mode: bool = False,
+    raw_codex: bool = False,
 ) -> Tuple[Optional[Any], Optional[str]]:
     """Central router: given a provider name and optional model, return a
     configured client with the correct auth, base URL, and API format.
@@ -553,6 +554,10 @@ def resolve_provider_client(
         model: Model slug override.  If None, uses the provider's default
                auxiliary model.
         async_mode: If True, return an async-compatible client.
+        raw_codex: If True, return a raw OpenAI client for Codex providers
+            instead of wrapping in CodexAuxiliaryClient.  Use this when
+            the caller needs direct access to responses.stream() (e.g.,
+            the main agent loop).
 
     Returns:
         (client, resolved_model) or (None, None) if auth is unavailable.
@@ -597,6 +602,18 @@ def resolve_provider_client(
 
     # ── OpenAI Codex (OAuth → Responses API) ─────────────────────────
     if provider == "openai-codex":
+        if raw_codex:
+            # Return the raw OpenAI client for callers that need direct
+            # access to responses.stream() (e.g., the main agent loop).
+            codex_token = _read_codex_access_token()
+            if not codex_token:
+                logger.warning("resolve_provider_client: openai-codex requested "
+                               "but no Codex OAuth token found (run: hermes model)")
+                return None, None
+            final_model = model or _CODEX_AUX_MODEL
+            raw_client = OpenAI(api_key=codex_token, base_url=_CODEX_AUX_BASE_URL)
+            return (raw_client, final_model)
+        # Standard path: wrap in CodexAuxiliaryClient adapter
         client, default = _try_codex()
         if client is None:
             logger.warning("resolve_provider_client: openai-codex requested "
diff --git a/run_agent.py b/run_agent.py
index 8849d25c..107b803c 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -418,36 +418,50 @@ class AIAgent:
                 ]:
                     logging.getLogger(quiet_logger).setLevel(logging.ERROR)
         
-        # Initialize OpenAI client - defaults to OpenRouter
-        client_kwargs = {}
-        
-        # Default to OpenRouter if no base_url provided
-        if base_url:
-            client_kwargs["base_url"] = base_url
+        # Initialize OpenAI client via centralized provider router.
+        # The router handles auth resolution, base URL, headers, and
+        # Codex wrapping for all known providers.
+        # raw_codex=True because the main agent needs direct responses.stream()
+        # access for Codex Responses API streaming.
+        if api_key and base_url:
+            # Explicit credentials from CLI/gateway — construct directly.
+            # The runtime provider resolver already handled auth for us.
+            client_kwargs = {"api_key": api_key, "base_url": base_url}
+            effective_base = base_url
+            if "openrouter" in effective_base.lower():
+                client_kwargs["default_headers"] = {
+                    "HTTP-Referer": "https://github.com/NousResearch/hermes-agent",
+                    "X-OpenRouter-Title": "Hermes Agent",
+                    "X-OpenRouter-Categories": "productivity,cli-agent",
+                }
+            elif "api.kimi.com" in effective_base.lower():
+                client_kwargs["default_headers"] = {
+                    "User-Agent": "KimiCLI/1.0",
+                }
         else:
-            client_kwargs["base_url"] = OPENROUTER_BASE_URL
-        
-        # Handle API key - OpenRouter is the primary provider
-        if api_key:
-            client_kwargs["api_key"] = api_key
-        else:
-            # Primary: OPENROUTER_API_KEY, fallback to direct provider keys
-            client_kwargs["api_key"] = os.getenv("OPENROUTER_API_KEY", "")
-        
-        # OpenRouter app attribution — shows hermes-agent in rankings/analytics
-        effective_base = client_kwargs.get("base_url", "")
-        if "openrouter" in effective_base.lower():
-            client_kwargs["default_headers"] = {
-                "HTTP-Referer": "https://github.com/NousResearch/hermes-agent",
-                "X-OpenRouter-Title": "Hermes Agent",
-                "X-OpenRouter-Categories": "productivity,cli-agent",
-            }
-        elif "api.kimi.com" in effective_base.lower():
-            # Kimi Code API requires a recognized coding-agent User-Agent
-            # (see https://github.com/MoonshotAI/kimi-cli)
-            client_kwargs["default_headers"] = {
-                "User-Agent": "KimiCLI/1.0",
-            }
+            # No explicit creds — use the centralized provider router
+            from agent.auxiliary_client import resolve_provider_client
+            _routed_client, _ = resolve_provider_client(
+                self.provider or "auto", model=self.model, raw_codex=True)
+            if _routed_client is not None:
+                client_kwargs = {
+                    "api_key": _routed_client.api_key,
+                    "base_url": str(_routed_client.base_url),
+                }
+                # Preserve any default_headers the router set
+                if hasattr(_routed_client, '_default_headers') and _routed_client._default_headers:
+                    client_kwargs["default_headers"] = dict(_routed_client._default_headers)
+            else:
+                # Final fallback: try raw OpenRouter key
+                client_kwargs = {
+                    "api_key": os.getenv("OPENROUTER_API_KEY", ""),
+                    "base_url": OPENROUTER_BASE_URL,
+                    "default_headers": {
+                        "HTTP-Referer": "https://github.com/NousResearch/hermes-agent",
+                        "X-OpenRouter-Title": "Hermes Agent",
+                        "X-OpenRouter-Categories": "productivity,cli-agent",
+                    },
+                }
         
         self._client_kwargs = client_kwargs  # stored for rebuilding after interrupt
         try:
@@ -2236,75 +2250,6 @@ class AIAgent:
 
     # ── Provider fallback ──────────────────────────────────────────────────
 
-    # API-key providers: provider → (base_url, [env_var_names])
-    _FALLBACK_API_KEY_PROVIDERS = {
-        "openrouter": (OPENROUTER_BASE_URL, ["OPENROUTER_API_KEY"]),
-        "zai": ("https://api.z.ai/api/paas/v4", ["ZAI_API_KEY", "Z_AI_API_KEY"]),
-        "kimi-coding": ("https://api.moonshot.ai/v1", ["KIMI_API_KEY"]),
-        "minimax": ("https://api.minimax.io/v1", ["MINIMAX_API_KEY"]),
-        "minimax-cn": ("https://api.minimaxi.com/v1", ["MINIMAX_CN_API_KEY"]),
-    }
-
-    # OAuth providers: provider → (resolver_import_path, api_mode)
-    # Each resolver returns {"api_key": ..., "base_url": ...}.
-    _FALLBACK_OAUTH_PROVIDERS = {
-        "openai-codex": ("resolve_codex_runtime_credentials", "codex_responses"),
-        "nous": ("resolve_nous_runtime_credentials", "chat_completions"),
-    }
-
-    def _resolve_fallback_credentials(
-        self, fb_provider: str, fb_config: dict
-    ) -> Optional[tuple]:
-        """Resolve credentials for a fallback provider.
-
-        Returns (api_key, base_url, api_mode) on success, or None on failure.
-        Handles three cases:
-          1. OAuth providers (openai-codex, nous) — call credential resolver
-          2. API-key providers (openrouter, zai, etc.) — read env var
-          3. Custom endpoints — use base_url + api_key_env from config
-        """
-        # ── 1. OAuth providers ────────────────────────────────────────
-        if fb_provider in self._FALLBACK_OAUTH_PROVIDERS:
-            resolver_name, api_mode = self._FALLBACK_OAUTH_PROVIDERS[fb_provider]
-            try:
-                import hermes_cli.auth as _auth
-                resolver = getattr(_auth, resolver_name)
-                creds = resolver()
-                return creds["api_key"], creds["base_url"], api_mode
-            except Exception as e:
-                logging.warning(
-                    "Fallback to %s failed (credential resolution): %s",
-                    fb_provider, e,
-                )
-                return None
-
-        # ── 2. API-key providers ──────────────────────────────────────
-        fb_key = (fb_config.get("api_key") or "").strip()
-        if not fb_key:
-            key_env = (fb_config.get("api_key_env") or "").strip()
-            if key_env:
-                fb_key = os.getenv(key_env, "")
-            elif fb_provider in self._FALLBACK_API_KEY_PROVIDERS:
-                for env_var in self._FALLBACK_API_KEY_PROVIDERS[fb_provider][1]:
-                    fb_key = os.getenv(env_var, "")
-                    if fb_key:
-                        break
-        if not fb_key:
-            logging.warning(
-                "Fallback model configured but no API key found for provider '%s'",
-                fb_provider,
-            )
-            return None
-
-        # ── 3. Resolve base URL ───────────────────────────────────────
-        fb_base_url = (fb_config.get("base_url") or "").strip()
-        if not fb_base_url and fb_provider in self._FALLBACK_API_KEY_PROVIDERS:
-            fb_base_url = self._FALLBACK_API_KEY_PROVIDERS[fb_provider][0]
-        if not fb_base_url:
-            fb_base_url = OPENROUTER_BASE_URL
-
-        return fb_key, fb_base_url, "chat_completions"
-
     def _try_activate_fallback(self) -> bool:
         """Switch to the configured fallback model/provider.
 
@@ -2312,6 +2257,10 @@ class AIAgent:
         OpenAI client, model slug, and provider in-place so the retry loop
         can continue with the new backend.  One-shot: returns False if
         already activated or not configured.
+
+        Uses the centralized provider router (resolve_provider_client) for
+        auth resolution and client construction — no duplicated provider→key
+        mappings.
         """
         if self._fallback_activated or not self._fallback_model:
             return False
@@ -2322,25 +2271,31 @@ class AIAgent:
         if not fb_provider or not fb_model:
             return False
 
-        resolved = self._resolve_fallback_credentials(fb_provider, fb)
-        if resolved is None:
-            return False
-        fb_key, fb_base_url, fb_api_mode = resolved
-
-        # Build new client
+        # Use centralized router for client construction.
+        # raw_codex=True because the main agent needs direct responses.stream()
+        # access for Codex providers.
         try:
-            client_kwargs = {"api_key": fb_key, "base_url": fb_base_url}
-            if "openrouter" in fb_base_url.lower():
-                client_kwargs["default_headers"] = {
-                    "HTTP-Referer": "https://github.com/NousResearch/hermes-agent",
-                    "X-OpenRouter-Title": "Hermes Agent",
-                    "X-OpenRouter-Categories": "productivity,cli-agent",
-                }
-            elif "api.kimi.com" in fb_base_url.lower():
-                client_kwargs["default_headers"] = {"User-Agent": "KimiCLI/1.0"}
+            from agent.auxiliary_client import resolve_provider_client
+            fb_client, _ = resolve_provider_client(
+                fb_provider, model=fb_model, raw_codex=True)
+            if fb_client is None:
+                logging.warning(
+                    "Fallback to %s failed: provider not configured",
+                    fb_provider)
+                return False
 
-            self.client = OpenAI(**client_kwargs)
-            self._client_kwargs = client_kwargs
+            # Determine api_mode from provider
+            fb_api_mode = "chat_completions"
+            if fb_provider == "openai-codex":
+                fb_api_mode = "codex_responses"
+            fb_base_url = str(fb_client.base_url)
+
+            # Swap client and config in-place
+            self.client = fb_client
+            self._client_kwargs = {
+                "api_key": fb_client.api_key,
+                "base_url": fb_base_url,
+            }
             old_model = self.model
             self.model = fb_model
             self.provider = fb_provider
diff --git a/tests/test_fallback_model.py b/tests/test_fallback_model.py
index dcc150c3..9e34bf74 100644
--- a/tests/test_fallback_model.py
+++ b/tests/test_fallback_model.py
@@ -35,7 +35,7 @@ def _make_agent(fallback_model=None):
         patch("run_agent.OpenAI"),
     ):
         agent = AIAgent(
-            api_key="test-key-primary",
+            api_key="test-key",
             quiet_mode=True,
             skip_context_files=True,
             skip_memory=True,
@@ -45,6 +45,14 @@ def _make_agent(fallback_model=None):
         return agent
 
 
+def _mock_resolve(base_url="https://openrouter.ai/api/v1", api_key="test-key"):
+    """Helper to create a mock client for resolve_provider_client."""
+    mock_client = MagicMock()
+    mock_client.api_key = api_key
+    mock_client.base_url = base_url
+    return mock_client
+
+
 # =============================================================================
 # _try_activate_fallback()
 # =============================================================================
@@ -71,9 +79,13 @@ class TestTryActivateFallback:
         agent = _make_agent(
             fallback_model={"provider": "openrouter", "model": "anthropic/claude-sonnet-4"},
         )
-        with (
-            patch.dict("os.environ", {"OPENROUTER_API_KEY": "sk-or-fallback-key"}),
-            patch("run_agent.OpenAI") as mock_openai,
+        mock_client = _mock_resolve(
+            api_key="sk-or-fallback-key",
+            base_url="https://openrouter.ai/api/v1",
+        )
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(mock_client, "anthropic/claude-sonnet-4"),
         ):
             result = agent._try_activate_fallback()
             assert result is True
@@ -81,36 +93,37 @@ class TestTryActivateFallback:
             assert agent.model == "anthropic/claude-sonnet-4"
             assert agent.provider == "openrouter"
             assert agent.api_mode == "chat_completions"
-            mock_openai.assert_called_once()
-            call_kwargs = mock_openai.call_args[1]
-            assert call_kwargs["api_key"] == "sk-or-fallback-key"
-            assert "openrouter" in call_kwargs["base_url"].lower()
-            # OpenRouter should get attribution headers
-            assert "default_headers" in call_kwargs
+            assert agent.client is mock_client
 
     def test_activates_zai_fallback(self):
         agent = _make_agent(
             fallback_model={"provider": "zai", "model": "glm-5"},
         )
-        with (
-            patch.dict("os.environ", {"ZAI_API_KEY": "sk-zai-key"}),
-            patch("run_agent.OpenAI") as mock_openai,
+        mock_client = _mock_resolve(
+            api_key="sk-zai-key",
+            base_url="https://open.z.ai/api/v1",
+        )
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(mock_client, "glm-5"),
         ):
             result = agent._try_activate_fallback()
             assert result is True
             assert agent.model == "glm-5"
             assert agent.provider == "zai"
-            call_kwargs = mock_openai.call_args[1]
-            assert call_kwargs["api_key"] == "sk-zai-key"
-            assert "z.ai" in call_kwargs["base_url"].lower()
+            assert agent.client is mock_client
 
     def test_activates_kimi_fallback(self):
         agent = _make_agent(
             fallback_model={"provider": "kimi-coding", "model": "kimi-k2.5"},
         )
-        with (
-            patch.dict("os.environ", {"KIMI_API_KEY": "sk-kimi-key"}),
-            patch("run_agent.OpenAI"),
+        mock_client = _mock_resolve(
+            api_key="sk-kimi-key",
+            base_url="https://api.moonshot.ai/v1",
+        )
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(mock_client, "kimi-k2.5"),
         ):
             assert agent._try_activate_fallback() is True
             assert agent.model == "kimi-k2.5"
@@ -120,23 +133,30 @@ class TestTryActivateFallback:
         agent = _make_agent(
             fallback_model={"provider": "minimax", "model": "MiniMax-M2.5"},
         )
-        with (
-            patch.dict("os.environ", {"MINIMAX_API_KEY": "sk-mm-key"}),
-            patch("run_agent.OpenAI") as mock_openai,
+        mock_client = _mock_resolve(
+            api_key="sk-mm-key",
+            base_url="https://api.minimax.io/v1",
+        )
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(mock_client, "MiniMax-M2.5"),
         ):
             assert agent._try_activate_fallback() is True
             assert agent.model == "MiniMax-M2.5"
             assert agent.provider == "minimax"
-            call_kwargs = mock_openai.call_args[1]
-            assert "minimax.io" in call_kwargs["base_url"]
+            assert agent.client is mock_client
 
     def test_only_fires_once(self):
         agent = _make_agent(
             fallback_model={"provider": "openrouter", "model": "anthropic/claude-sonnet-4"},
         )
-        with (
-            patch.dict("os.environ", {"OPENROUTER_API_KEY": "sk-or-key"}),
-            patch("run_agent.OpenAI"),
+        mock_client = _mock_resolve(
+            api_key="sk-or-key",
+            base_url="https://openrouter.ai/api/v1",
+        )
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(mock_client, "anthropic/claude-sonnet-4"),
         ):
             assert agent._try_activate_fallback() is True
             # Second attempt should return False
@@ -147,9 +167,10 @@ class TestTryActivateFallback:
         agent = _make_agent(
             fallback_model={"provider": "minimax", "model": "MiniMax-M2.5"},
         )
-        # Ensure MINIMAX_API_KEY is not in the environment
-        env = {k: v for k, v in os.environ.items() if k != "MINIMAX_API_KEY"}
-        with patch.dict("os.environ", env, clear=True):
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(None, None),
+        ):
             assert agent._try_activate_fallback() is False
             assert agent._fallback_activated is False
 
@@ -163,22 +184,29 @@ class TestTryActivateFallback:
                 "api_key_env": "MY_CUSTOM_KEY",
             },
         )
-        with (
-            patch.dict("os.environ", {"MY_CUSTOM_KEY": "custom-secret"}),
-            patch("run_agent.OpenAI") as mock_openai,
+        mock_client = _mock_resolve(
+            api_key="custom-secret",
+            base_url="http://localhost:8080/v1",
+        )
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(mock_client, "my-model"),
         ):
             assert agent._try_activate_fallback() is True
-            call_kwargs = mock_openai.call_args[1]
-            assert call_kwargs["base_url"] == "http://localhost:8080/v1"
-            assert call_kwargs["api_key"] == "custom-secret"
+            assert agent.client is mock_client
+            assert agent.model == "my-model"
 
     def test_prompt_caching_enabled_for_claude_on_openrouter(self):
         agent = _make_agent(
             fallback_model={"provider": "openrouter", "model": "anthropic/claude-sonnet-4"},
         )
-        with (
-            patch.dict("os.environ", {"OPENROUTER_API_KEY": "sk-or-key"}),
-            patch("run_agent.OpenAI"),
+        mock_client = _mock_resolve(
+            api_key="sk-or-key",
+            base_url="https://openrouter.ai/api/v1",
+        )
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(mock_client, "anthropic/claude-sonnet-4"),
         ):
             agent._try_activate_fallback()
             assert agent._use_prompt_caching is True
@@ -187,9 +215,13 @@ class TestTryActivateFallback:
         agent = _make_agent(
             fallback_model={"provider": "openrouter", "model": "google/gemini-2.5-flash"},
         )
-        with (
-            patch.dict("os.environ", {"OPENROUTER_API_KEY": "sk-or-key"}),
-            patch("run_agent.OpenAI"),
+        mock_client = _mock_resolve(
+            api_key="sk-or-key",
+            base_url="https://openrouter.ai/api/v1",
+        )
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(mock_client, "google/gemini-2.5-flash"),
         ):
             agent._try_activate_fallback()
             assert agent._use_prompt_caching is False
@@ -198,9 +230,13 @@ class TestTryActivateFallback:
         agent = _make_agent(
             fallback_model={"provider": "zai", "model": "glm-5"},
         )
-        with (
-            patch.dict("os.environ", {"ZAI_API_KEY": "sk-zai-key"}),
-            patch("run_agent.OpenAI"),
+        mock_client = _mock_resolve(
+            api_key="sk-zai-key",
+            base_url="https://open.z.ai/api/v1",
+        )
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(mock_client, "glm-5"),
         ):
             agent._try_activate_fallback()
             assert agent._use_prompt_caching is False
@@ -210,35 +246,36 @@ class TestTryActivateFallback:
         agent = _make_agent(
             fallback_model={"provider": "zai", "model": "glm-5"},
         )
-        with (
-            patch.dict("os.environ", {"Z_AI_API_KEY": "sk-alt-key"}),
-            patch("run_agent.OpenAI") as mock_openai,
+        mock_client = _mock_resolve(
+            api_key="sk-alt-key",
+            base_url="https://open.z.ai/api/v1",
+        )
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(mock_client, "glm-5"),
         ):
             assert agent._try_activate_fallback() is True
-            call_kwargs = mock_openai.call_args[1]
-            assert call_kwargs["api_key"] == "sk-alt-key"
+            assert agent.client is mock_client
 
     def test_activates_codex_fallback(self):
         """OpenAI Codex fallback should use OAuth credentials and codex_responses mode."""
         agent = _make_agent(
             fallback_model={"provider": "openai-codex", "model": "gpt-5.3-codex"},
         )
-        mock_creds = {
-            "api_key": "codex-oauth-token",
-            "base_url": "https://chatgpt.com/backend-api/codex",
-        }
-        with (
-            patch("hermes_cli.auth.resolve_codex_runtime_credentials", return_value=mock_creds),
-            patch("run_agent.OpenAI") as mock_openai,
+        mock_client = _mock_resolve(
+            api_key="codex-oauth-token",
+            base_url="https://chatgpt.com/backend-api/codex",
+        )
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(mock_client, "gpt-5.3-codex"),
         ):
             result = agent._try_activate_fallback()
             assert result is True
             assert agent.model == "gpt-5.3-codex"
             assert agent.provider == "openai-codex"
             assert agent.api_mode == "codex_responses"
-            call_kwargs = mock_openai.call_args[1]
-            assert call_kwargs["api_key"] == "codex-oauth-token"
-            assert "chatgpt.com" in call_kwargs["base_url"]
+            assert agent.client is mock_client
 
     def test_codex_fallback_fails_gracefully_without_credentials(self):
         """Codex fallback should return False if no OAuth credentials available."""
@@ -246,8 +283,8 @@ class TestTryActivateFallback:
             fallback_model={"provider": "openai-codex", "model": "gpt-5.3-codex"},
         )
         with patch(
-            "hermes_cli.auth.resolve_codex_runtime_credentials",
-            side_effect=Exception("No Codex credentials"),
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(None, None),
         ):
             assert agent._try_activate_fallback() is False
             assert agent._fallback_activated is False
@@ -257,22 +294,20 @@ class TestTryActivateFallback:
         agent = _make_agent(
             fallback_model={"provider": "nous", "model": "nous-hermes-3"},
         )
-        mock_creds = {
-            "api_key": "nous-agent-key-abc",
-            "base_url": "https://inference-api.nousresearch.com/v1",
-        }
-        with (
-            patch("hermes_cli.auth.resolve_nous_runtime_credentials", return_value=mock_creds),
-            patch("run_agent.OpenAI") as mock_openai,
+        mock_client = _mock_resolve(
+            api_key="nous-agent-key-abc",
+            base_url="https://inference-api.nousresearch.com/v1",
+        )
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(mock_client, "nous-hermes-3"),
         ):
             result = agent._try_activate_fallback()
             assert result is True
             assert agent.model == "nous-hermes-3"
             assert agent.provider == "nous"
             assert agent.api_mode == "chat_completions"
-            call_kwargs = mock_openai.call_args[1]
-            assert call_kwargs["api_key"] == "nous-agent-key-abc"
-            assert "nousresearch.com" in call_kwargs["base_url"]
+            assert agent.client is mock_client
 
     def test_nous_fallback_fails_gracefully_without_login(self):
         """Nous fallback should return False if not logged in."""
@@ -280,8 +315,8 @@ class TestTryActivateFallback:
             fallback_model={"provider": "nous", "model": "nous-hermes-3"},
         )
         with patch(
-            "hermes_cli.auth.resolve_nous_runtime_credentials",
-            side_effect=Exception("Not logged in to Nous Portal"),
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(None, None),
         ):
             assert agent._try_activate_fallback() is False
             assert agent._fallback_activated is False
@@ -315,7 +350,7 @@ class TestFallbackInit:
 # =============================================================================
 
 class TestProviderCredentials:
-    """Verify that each supported provider resolves its API key correctly."""
+    """Verify that each supported provider resolves via the centralized router."""
 
     @pytest.mark.parametrize("provider,env_var,base_url_fragment", [
         ("openrouter", "OPENROUTER_API_KEY", "openrouter"),
@@ -328,12 +363,15 @@ class TestProviderCredentials:
         agent = _make_agent(
             fallback_model={"provider": provider, "model": "test-model"},
         )
-        with (
-            patch.dict("os.environ", {env_var: "test-key-123"}),
-            patch("run_agent.OpenAI") as mock_openai,
+        mock_client = MagicMock()
+        mock_client.api_key = "test-api-key"
+        mock_client.base_url = f"https://{base_url_fragment}/v1"
+        with patch(
+            "agent.auxiliary_client.resolve_provider_client",
+            return_value=(mock_client, "test-model"),
         ):
             result = agent._try_activate_fallback()
             assert result is True, f"Failed to activate fallback for {provider}"
-            call_kwargs = mock_openai.call_args[1]
-            assert call_kwargs["api_key"] == "test-key-123"
-            assert base_url_fragment in call_kwargs["base_url"].lower()
+            assert agent.client is mock_client
+            assert agent.model == "test-model"
+            assert agent.provider == provider

From 9302690e1b71c1abfc2496640f0a8c3a68709d35 Mon Sep 17 00:00:00 2001
From: teknium1 <teknium1@gmail.com>
Date: Wed, 11 Mar 2026 22:04:42 -0700
Subject: [PATCH 07/11] =?UTF-8?q?refactor:=20remove=20LLM=5FMODEL=20env=20?=
 =?UTF-8?q?var=20dependency=20=E2=80=94=20config.yaml=20is=20sole=20source?=
 =?UTF-8?q?=20of=20truth?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Model selection now comes exclusively from config.yaml (set via
'hermes model' or 'hermes setup'). The LLM_MODEL env var is no longer
read or written anywhere in production code.

Why: env vars are per-process/per-user and would conflict in
multi-agent or multi-tenant setups. Config.yaml is file-based and
can be scoped per-user or eventually per-session.

Changes:
- cli.py: Read model from CLI_CONFIG only, not LLM_MODEL/OPENAI_MODEL
- hermes_cli/auth.py: _save_model_choice() no longer writes LLM_MODEL
  to .env
- hermes_cli/setup.py: Remove 12 save_env_value('LLM_MODEL', ...)
  calls from all provider setup flows
- gateway/run.py: Remove LLM_MODEL fallback (HERMES_MODEL still works
  for gateway process runtime)
- cron/scheduler.py: Same
- agent/auxiliary_client.py: Remove LLM_MODEL from custom endpoint
  model detection
---
 agent/auxiliary_client.py             |  2 +-
 cli.py                                | 11 ++++++++---
 cron/scheduler.py                     |  2 +-
 gateway/run.py                        |  6 +++---
 hermes_cli/auth.py                    |  9 ++++++---
 hermes_cli/setup.py                   | 12 ------------
 tests/test_cli_provider_resolution.py | 26 +++++++++++++++++---------
 7 files changed, 36 insertions(+), 32 deletions(-)

diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py
index 19c2b8bd..1c6ac271 100644
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@@ -443,7 +443,7 @@ def _try_custom_endpoint() -> Tuple[Optional[OpenAI], Optional[str]]:
     custom_key = os.getenv("OPENAI_API_KEY")
     if not custom_base or not custom_key:
         return None, None
-    model = os.getenv("OPENAI_MODEL") or os.getenv("LLM_MODEL") or "gpt-4o-mini"
+    model = os.getenv("OPENAI_MODEL") or "gpt-4o-mini"
     logger.debug("Auxiliary client: custom endpoint (%s)", model)
     return OpenAI(api_key=custom_key, base_url=custom_base), model
 
diff --git a/cli.py b/cli.py
index 50e5db8d..d62da32f 100755
--- a/cli.py
+++ b/cli.py
@@ -1129,12 +1129,17 @@ class HermesCLI:
         self.verbose = verbose if verbose is not None else (self.tool_progress_mode == "verbose")
         
         # Configuration - priority: CLI args > env vars > config file
-        # Model can come from: CLI arg, LLM_MODEL env, OPENAI_MODEL env (custom endpoint), or config
-        self.model = model or os.getenv("LLM_MODEL") or os.getenv("OPENAI_MODEL") or CLI_CONFIG["model"]["default"]
+        # Model comes from: CLI arg or config.yaml (single source of truth).
+        # LLM_MODEL/OPENAI_MODEL env vars are NOT checked — config.yaml is
+        # authoritative.  This avoids conflicts in multi-agent setups where
+        # env vars would stomp each other.
+        _model_config = CLI_CONFIG.get("model", {})
+        _config_model = _model_config.get("default", "") if isinstance(_model_config, dict) else (_model_config or "")
+        self.model = model or _config_model or "anthropic/claude-opus-4.6"
         # Track whether model was explicitly chosen by the user or fell back
         # to the global default.  Provider-specific normalisation may override
         # the default silently but should warn when overriding an explicit choice.
-        self._model_is_default = not (model or os.getenv("LLM_MODEL") or os.getenv("OPENAI_MODEL"))
+        self._model_is_default = not model
 
         self._explicit_api_key = api_key
         self._explicit_base_url = base_url
diff --git a/cron/scheduler.py b/cron/scheduler.py
index 348a25c2..c80122ce 100644
--- a/cron/scheduler.py
+++ b/cron/scheduler.py
@@ -180,7 +180,7 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]:
         except UnicodeDecodeError:
             load_dotenv(str(_hermes_home / ".env"), override=True, encoding="latin-1")
 
-        model = os.getenv("HERMES_MODEL") or os.getenv("LLM_MODEL") or "anthropic/claude-opus-4.6"
+        model = os.getenv("HERMES_MODEL") or "anthropic/claude-opus-4.6"
 
         # Load config.yaml for model, reasoning, prefill, toolsets, provider routing
         _cfg = {}
diff --git a/gateway/run.py b/gateway/run.py
index 96d43672..772d4c4f 100644
--- a/gateway/run.py
+++ b/gateway/run.py
@@ -1544,7 +1544,7 @@ class GatewayRunner:
         config_path = _hermes_home / 'config.yaml'
 
         # Resolve current model and provider from config
-        current = os.getenv("HERMES_MODEL") or os.getenv("LLM_MODEL") or "anthropic/claude-opus-4.6"
+        current = os.getenv("HERMES_MODEL") or "anthropic/claude-opus-4.6"
         current_provider = "openrouter"
         try:
             if config_path.exists():
@@ -1999,7 +1999,7 @@ class GatewayRunner:
                 return
 
             # Read model from config (same as _run_agent)
-            model = os.getenv("HERMES_MODEL") or os.getenv("LLM_MODEL") or "anthropic/claude-opus-4.6"
+            model = os.getenv("HERMES_MODEL") or "anthropic/claude-opus-4.6"
             try:
                 import yaml as _y
                 _cfg_path = _hermes_home / "config.yaml"
@@ -3093,7 +3093,7 @@ class GatewayRunner:
             except Exception:
                 pass
 
-            model = os.getenv("HERMES_MODEL") or os.getenv("LLM_MODEL") or "anthropic/claude-opus-4.6"
+            model = os.getenv("HERMES_MODEL") or "anthropic/claude-opus-4.6"
 
             try:
                 import yaml as _y
diff --git a/hermes_cli/auth.py b/hermes_cli/auth.py
index 05d233f9..1ffa85bd 100644
--- a/hermes_cli/auth.py
+++ b/hermes_cli/auth.py
@@ -1671,8 +1671,12 @@ def _prompt_model_selection(model_ids: List[str], current_model: str = "") -> Op
 
 
 def _save_model_choice(model_id: str) -> None:
-    """Save the selected model to config.yaml and .env."""
-    from hermes_cli.config import save_config, load_config, save_env_value
+    """Save the selected model to config.yaml (single source of truth).
+
+    The model is stored in config.yaml only — NOT in .env.  This avoids
+    conflicts in multi-agent setups where env vars would stomp each other.
+    """
+    from hermes_cli.config import save_config, load_config
 
     config = load_config()
     # Always use dict format so provider/base_url can be stored alongside
@@ -1681,7 +1685,6 @@ def _save_model_choice(model_id: str) -> None:
     else:
         config["model"] = {"default": model_id}
     save_config(config)
-    save_env_value("LLM_MODEL", model_id)
 
 
 def login_command(args) -> None:
diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py
index 6b00952c..2f48574b 100644
--- a/hermes_cli/setup.py
+++ b/hermes_cli/setup.py
@@ -681,7 +681,6 @@ def setup_model_provider(config: dict):
             save_env_value("OPENAI_API_KEY", api_key)
         if model_name:
             config['model'] = model_name
-            save_env_value("LLM_MODEL", model_name)
 
         # Save provider and base_url to config.yaml so the gateway and CLI
         # both resolve the correct provider without relying on env-var heuristics.
@@ -913,7 +912,6 @@ def setup_model_provider(config: dict):
             custom = prompt(f"  Model name (Enter to keep '{current_model}')")
             if custom:
                 config['model'] = custom
-                save_env_value("LLM_MODEL", custom)
         elif selected_provider == "openai-codex":
             from hermes_cli.codex_models import get_codex_model_ids
             codex_models = get_codex_model_ids()
@@ -927,12 +925,10 @@ def setup_model_provider(config: dict):
             model_idx = prompt_choice("Select default model:", model_choices, default_codex)
             if model_idx < len(codex_models):
                 config['model'] = codex_models[model_idx]
-                save_env_value("LLM_MODEL", codex_models[model_idx])
             elif model_idx == len(codex_models):
                 custom = prompt("Enter model name")
                 if custom:
                     config['model'] = custom
-                    save_env_value("LLM_MODEL", custom)
             _update_config_for_provider("openai-codex", DEFAULT_CODEX_BASE_URL)
         elif selected_provider == "zai":
             # Coding Plan endpoints don't have GLM-5
@@ -950,12 +946,10 @@ def setup_model_provider(config: dict):
 
             if model_idx < len(zai_models):
                 config['model'] = zai_models[model_idx]
-                save_env_value("LLM_MODEL", zai_models[model_idx])
             elif model_idx == len(zai_models):
                 custom = prompt("Enter model name")
                 if custom:
                     config['model'] = custom
-                    save_env_value("LLM_MODEL", custom)
             # else: keep current
         elif selected_provider == "kimi-coding":
             kimi_models = ["kimi-k2.5", "kimi-k2-thinking", "kimi-k2-turbo-preview"]
@@ -968,12 +962,10 @@ def setup_model_provider(config: dict):
 
             if model_idx < len(kimi_models):
                 config['model'] = kimi_models[model_idx]
-                save_env_value("LLM_MODEL", kimi_models[model_idx])
             elif model_idx == len(kimi_models):
                 custom = prompt("Enter model name")
                 if custom:
                     config['model'] = custom
-                    save_env_value("LLM_MODEL", custom)
             # else: keep current
         elif selected_provider in ("minimax", "minimax-cn"):
             minimax_models = ["MiniMax-M2.5", "MiniMax-M2.5-highspeed", "MiniMax-M2.1"]
@@ -986,12 +978,10 @@ def setup_model_provider(config: dict):
 
             if model_idx < len(minimax_models):
                 config['model'] = minimax_models[model_idx]
-                save_env_value("LLM_MODEL", minimax_models[model_idx])
             elif model_idx == len(minimax_models):
                 custom = prompt("Enter model name")
                 if custom:
                     config['model'] = custom
-                    save_env_value("LLM_MODEL", custom)
             # else: keep current
         else:
             # Static list for OpenRouter / fallback (from canonical list)
@@ -1008,12 +998,10 @@ def setup_model_provider(config: dict):
 
             if model_idx < len(ids):
                 config['model'] = ids[model_idx]
-                save_env_value("LLM_MODEL", ids[model_idx])
             elif model_idx == len(ids):  # Custom
                 custom = prompt("Enter model name (e.g., anthropic/claude-opus-4.6)")
                 if custom:
                     config['model'] = custom
-                    save_env_value("LLM_MODEL", custom)
             # else: Keep current
 
         _final_model = config.get('model', '')
diff --git a/tests/test_cli_provider_resolution.py b/tests/test_cli_provider_resolution.py
index f4a446ac..2a3dc43e 100644
--- a/tests/test_cli_provider_resolution.py
+++ b/tests/test_cli_provider_resolution.py
@@ -197,21 +197,28 @@ def test_codex_provider_replaces_incompatible_default_model(monkeypatch):
     assert shell.model == "gpt-5.2-codex"
 
 
-def test_codex_provider_trusts_explicit_envvar_model(monkeypatch):
-    """When the user explicitly sets LLM_MODEL, we trust their choice and
-    let the API be the judge — even if it's a non-OpenAI model.  Only
-    provider prefixes are stripped; the bare model passes through."""
+def test_codex_provider_uses_config_model(monkeypatch):
+    """Model comes from config.yaml, not LLM_MODEL env var.
+    Config.yaml is the single source of truth to avoid multi-agent conflicts."""
     cli = _import_cli()
 
-    monkeypatch.setenv("LLM_MODEL", "claude-opus-4-6")
+    # LLM_MODEL env var should be IGNORED (even if set)
+    monkeypatch.setenv("LLM_MODEL", "should-be-ignored")
     monkeypatch.delenv("OPENAI_MODEL", raising=False)
 
+    # Set model via config
+    monkeypatch.setitem(cli.CLI_CONFIG, "model", {
+        "default": "gpt-5.2-codex",
+        "provider": "openai-codex",
+        "base_url": "https://chatgpt.com/backend-api/codex",
+    })
+
     def _runtime_resolve(**kwargs):
         return {
             "provider": "openai-codex",
             "api_mode": "codex_responses",
             "base_url": "https://chatgpt.com/backend-api/codex",
-            "api_key": "test-key",
+            "api_key": "fake-codex-token",
             "source": "env/config",
         }
 
@@ -220,11 +227,12 @@ def test_codex_provider_trusts_explicit_envvar_model(monkeypatch):
 
     shell = cli.HermesCLI(compact=True, max_turns=1)
 
-    assert shell._model_is_default is False
     assert shell._ensure_runtime_credentials() is True
     assert shell.provider == "openai-codex"
-    # User explicitly chose this model — it passes through untouched
-    assert shell.model == "claude-opus-4-6"
+    # Model from config (may be normalized by codex provider logic)
+    assert "codex" in shell.model.lower()
+    # LLM_MODEL env var is NOT used
+    assert shell.model != "should-be-ignored"
 
 
 def test_codex_provider_preserves_explicit_codex_model(monkeypatch):

From a7e5f195284a54b469a1f2bf9ab6b60401ae3212 Mon Sep 17 00:00:00 2001
From: teknium1 <teknium1@gmail.com>
Date: Wed, 11 Mar 2026 22:41:33 -0700
Subject: [PATCH 08/11] fix: don't send OpenRouter-specific provider
 preferences to Nous Portal

Two bugs in _build_api_kwargs that broke Nous Portal:

1. Provider preferences (only, ignore, order, sort) are OpenRouter-
   specific routing features. They were being sent in extra_body to ALL
   providers, including Nous Portal. When the config had
   providers_only=['google-vertex'], Nous Portal returned 404 'Inference
   host not found' because it doesn't have a google-vertex backend.

   Fix: Only include provider preferences when _is_openrouter is True.

2. Reasoning config with enabled=false was being sent to Nous Portal,
   which requires reasoning and returns 400 'Reasoning is mandatory for
   this endpoint and cannot be disabled.'

   Fix: Omit the reasoning parameter for Nous when enabled=false.

Root cause found via HERMES_DUMP_REQUESTS=1 which showed the exact
request payload being sent to Nous Portal's inference API.
---
 run_agent.py | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/run_agent.py b/run_agent.py
index 107b803c..bb66351b 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -2392,16 +2392,24 @@ class AIAgent:
 
         extra_body = {}
 
-        if provider_preferences:
-            extra_body["provider"] = provider_preferences
-
         _is_openrouter = "openrouter" in self.base_url.lower()
+
+        # Provider preferences (only, ignore, order, sort) are OpenRouter-
+        # specific — don't send them to other providers (Nous, Codex, etc.)
+        if provider_preferences and _is_openrouter:
+            extra_body["provider"] = provider_preferences
         _is_nous = "nousresearch" in self.base_url.lower()
 
         _is_mistral = "api.mistral.ai" in self.base_url.lower()
         if (_is_openrouter or _is_nous) and not _is_mistral:
             if self.reasoning_config is not None:
-                extra_body["reasoning"] = self.reasoning_config
+                rc = dict(self.reasoning_config)
+                # Nous Portal requires reasoning enabled — don't send
+                # enabled=false to it (would cause 400).
+                if _is_nous and rc.get("enabled") is False:
+                    pass  # omit reasoning entirely for Nous when disabled
+                else:
+                    extra_body["reasoning"] = rc
             else:
                 extra_body["reasoning"] = {
                     "enabled": True,

From 65356003e3da075337d4e4407353f6b57d84d150 Mon Sep 17 00:00:00 2001
From: teknium1 <teknium1@gmail.com>
Date: Wed, 11 Mar 2026 22:49:24 -0700
Subject: [PATCH 09/11] revert: keep provider preferences for all providers
 (Nous will proxy)

Nous Portal backend will become a transparent proxy for OpenRouter-
specific parameters (provider preferences, etc.), so keep sending them
to all providers. The reasoning disabled fix is kept (that's a real
constraint of the Nous endpoint).
---
 run_agent.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/run_agent.py b/run_agent.py
index bb66351b..af1b31c0 100644
--- a/run_agent.py
+++ b/run_agent.py
@@ -2395,7 +2395,9 @@ class AIAgent:
         _is_openrouter = "openrouter" in self.base_url.lower()
 
         # Provider preferences (only, ignore, order, sort) are OpenRouter-
-        # specific — don't send them to other providers (Nous, Codex, etc.)
+        # specific.  Only send to OpenRouter-compatible endpoints.
+        # TODO: Nous Portal will add transparent proxy support — re-enable
+        # for _is_nous when their backend is updated.
         if provider_preferences and _is_openrouter:
             extra_body["provider"] = provider_preferences
         _is_nous = "nousresearch" in self.base_url.lower()

From ec2c6dff7073b1369ac71f405901dabb893e650f Mon Sep 17 00:00:00 2001
From: teknium1 <teknium1@gmail.com>
Date: Wed, 11 Mar 2026 23:06:06 -0700
Subject: [PATCH 10/11] feat: unified /model and /provider into single view
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Both /model and /provider now show the same unified display:

  Current: anthropic/claude-opus-4.6 via OpenRouter

  Authenticated providers & models:
    [openrouter] ← active
      anthropic/claude-opus-4.6 ← current
      anthropic/claude-sonnet-4.5
      ...
    [nous]
      claude-opus-4-6
      gemini-3-flash
      ...
    [openai-codex]
      gpt-5.2-codex
      gpt-5.1-codex-mini
      ...

  Not configured: Z.AI / GLM, Kimi / Moonshot, ...

  Switch model:    /model <model-name>
  Switch provider: /model <provider>:<model-name>
  Example: /model nous:claude-opus-4-6

Users can see all authenticated providers and their models at a glance,
making it easy to switch mid-conversation.

Also added curated model lists for Nous Portal and OpenAI Codex to
hermes_cli/models.py.
---
 cli.py                          | 126 +++++++++++++++++---------------
 hermes_cli/models.py            |  13 ++++
 tests/test_cli_model_command.py |   4 +-
 3 files changed, 83 insertions(+), 60 deletions(-)

diff --git a/cli.py b/cli.py
index d62da32f..723a7554 100755
--- a/cli.py
+++ b/cli.py
@@ -2265,6 +2265,72 @@ class HermesCLI:
         remaining = len(self.conversation_history)
         print(f"  {remaining} message(s) remaining in history.")
     
+    def _show_model_and_providers(self):
+        """Unified /model and /provider display.
+
+        Shows current model + provider, then lists all authenticated
+        providers with their available models so users can switch easily.
+        """
+        from hermes_cli.models import (
+            curated_models_for_provider, list_available_providers,
+            normalize_provider, _PROVIDER_LABELS,
+        )
+        from hermes_cli.auth import resolve_provider as _resolve_provider
+
+        # Resolve current provider
+        raw_provider = normalize_provider(self.provider)
+        if raw_provider == "auto":
+            try:
+                current = _resolve_provider(
+                    self.requested_provider,
+                    explicit_api_key=self._explicit_api_key,
+                    explicit_base_url=self._explicit_base_url,
+                )
+            except Exception:
+                current = "openrouter"
+        else:
+            current = raw_provider
+        current_label = _PROVIDER_LABELS.get(current, current)
+
+        print(f"\n  Current: {self.model} via {current_label}")
+        print()
+
+        # Show all authenticated providers with their models
+        providers = list_available_providers()
+        authed = [p for p in providers if p["authenticated"]]
+        unauthed = [p for p in providers if not p["authenticated"]]
+
+        if authed:
+            print("  Authenticated providers & models:")
+            for p in authed:
+                is_active = p["id"] == current
+                marker = " ← active" if is_active else ""
+                print(f"    [{p['id']}]{marker}")
+                curated = curated_models_for_provider(p["id"])
+                if curated:
+                    for mid, desc in curated:
+                        current_marker = " ← current" if (is_active and mid == self.model) else ""
+                        print(f"      {mid}{current_marker}")
+                else:
+                    print(f"      (use /model {p['id']}:<model-name>)")
+                print()
+
+        if unauthed:
+            names = ", ".join(p["label"] for p in unauthed)
+            print(f"  Not configured: {names}")
+            print(f"  Run: hermes setup")
+            print()
+
+        print("  Switch model:    /model <model-name>")
+        print("  Switch provider: /model <provider>:<model-name>")
+        if authed and len(authed) > 1:
+            # Show a concrete example with a non-active provider
+            other = next((p for p in authed if p["id"] != current), authed[0])
+            other_models = curated_models_for_provider(other["id"])
+            if other_models:
+                example_model = other_models[0][0]
+                print(f"  Example: /model {other['id']}:{example_model}")
+
     def _handle_prompt_command(self, cmd: str):
         """Handle the /prompt command to view or set system prompt."""
         parts = cmd.split(maxsplit=1)
@@ -2776,65 +2842,9 @@ class HermesCLI:
                             print(f"  Reason: {message}")
                         print("  Note: Model will revert on restart. Use a verified model to save to config.")
             else:
-                from hermes_cli.models import curated_models_for_provider, normalize_provider, _PROVIDER_LABELS
-                from hermes_cli.auth import resolve_provider as _resolve_provider
-                # Resolve "auto" to the actual provider using credential detection
-                raw_provider = normalize_provider(self.provider)
-                if raw_provider == "auto":
-                    try:
-                        display_provider = _resolve_provider(
-                            self.requested_provider,
-                            explicit_api_key=self._explicit_api_key,
-                            explicit_base_url=self._explicit_base_url,
-                        )
-                    except Exception:
-                        display_provider = "openrouter"
-                else:
-                    display_provider = raw_provider
-                provider_label = _PROVIDER_LABELS.get(display_provider, display_provider)
-                print(f"\n  Current model:    {self.model}")
-                print(f"  Current provider: {provider_label}")
-                print()
-                curated = curated_models_for_provider(display_provider)
-                if curated:
-                    print(f"  Available models ({provider_label}):")
-                    for mid, desc in curated:
-                        marker = " ←" if mid == self.model else ""
-                        label = f"  {desc}" if desc else ""
-                        print(f"    {mid}{label}{marker}")
-                    print()
-                print("  Usage: /model <model-name>")
-                print("         /model provider:model-name  (to switch provider)")
-                print("  Example: /model openrouter:anthropic/claude-sonnet-4.5")
-                print("  See /provider for available providers")
+                self._show_model_and_providers()
         elif cmd_lower == "/provider":
-            from hermes_cli.models import list_available_providers, normalize_provider, _PROVIDER_LABELS
-            from hermes_cli.auth import resolve_provider as _resolve_provider
-            # Resolve current provider
-            raw_provider = normalize_provider(self.provider)
-            if raw_provider == "auto":
-                try:
-                    current = _resolve_provider(
-                        self.requested_provider,
-                        explicit_api_key=self._explicit_api_key,
-                        explicit_base_url=self._explicit_base_url,
-                    )
-                except Exception:
-                    current = "openrouter"
-            else:
-                current = raw_provider
-            current_label = _PROVIDER_LABELS.get(current, current)
-            print(f"\n  Current provider: {current_label} ({current})\n")
-            providers = list_available_providers()
-            print("  Available providers:")
-            for p in providers:
-                marker = " ← active" if p["id"] == current else ""
-                auth = "✓" if p["authenticated"] else "✗"
-                aliases = f"  (also: {', '.join(p['aliases'])})" if p["aliases"] else ""
-                print(f"    [{auth}] {p['id']:<14} {p['label']}{aliases}{marker}")
-            print()
-            print("  Switch: /model provider:model-name")
-            print("  Setup:  hermes setup")
+            self._show_model_and_providers()
         elif cmd_lower.startswith("/prompt"):
             # Use original case so prompt text isn't lowercased
             self._handle_prompt_command(cmd_original)
diff --git a/hermes_cli/models.py b/hermes_cli/models.py
index 1fdde090..0df1d309 100644
--- a/hermes_cli/models.py
+++ b/hermes_cli/models.py
@@ -31,6 +31,19 @@ OPENROUTER_MODELS: list[tuple[str, str]] = [
 ]
 
 _PROVIDER_MODELS: dict[str, list[str]] = {
+    "nous": [
+        "claude-opus-4-6",
+        "claude-sonnet-4-6",
+        "gpt-5.4",
+        "gemini-3-flash",
+        "gemini-3.0-pro-preview",
+        "deepseek-v3.2",
+    ],
+    "openai-codex": [
+        "gpt-5.2-codex",
+        "gpt-5.1-codex-mini",
+        "gpt-5.1-codex-max",
+    ],
     "zai": [
         "glm-5",
         "glm-4.7",
diff --git a/tests/test_cli_model_command.py b/tests/test_cli_model_command.py
index b8b8e8d2..477ad429 100644
--- a/tests/test_cli_model_command.py
+++ b/tests/test_cli_model_command.py
@@ -93,8 +93,8 @@ class TestModelCommand:
         output = capsys.readouterr().out
         assert "anthropic/claude-opus-4.6" in output
         assert "OpenRouter" in output
-        assert "Available models" in output
-        assert "provider:model-name" in output
+        assert "Authenticated providers" in output or "Switch model" in output
+        assert "provider" in output and "model" in output
 
     # -- provider switching tests -------------------------------------------
 

From 7febdf7208d59db52f8ebe54b8be71a0d6c31d7c Mon Sep 17 00:00:00 2001
From: teknium1 <teknium1@gmail.com>
Date: Wed, 11 Mar 2026 23:29:26 -0700
Subject: [PATCH 11/11] fix: custom endpoint model validation + better /model
 error messages

- Custom endpoints can serve any model, so skip validation for
  provider='custom' in validate_requested_model(). Previously it
  would reject any model name since there's no static catalog or
  live API to check against.
- Show clear setup instructions when switching to custom endpoint
  without OPENAI_BASE_URL/OPENAI_API_KEY configured.
- Added curated model lists for Nous Portal and OpenAI Codex to
  _PROVIDER_MODELS so /model shows their available models.
---
 cli.py               | 6 +++++-
 hermes_cli/models.py | 9 +++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/cli.py b/cli.py
index 723a7554..7f2b2394 100755
--- a/cli.py
+++ b/cli.py
@@ -2795,7 +2795,11 @@ class HermesCLI:
                         base_url_for_probe = runtime.get("base_url", "")
                     except Exception as e:
                         provider_label = _PROVIDER_LABELS.get(target_provider, target_provider)
-                        print(f"(>_<) Could not resolve credentials for provider '{provider_label}': {e}")
+                        if target_provider == "custom":
+                            print(f"(>_<) Custom endpoint not configured. Set OPENAI_BASE_URL and OPENAI_API_KEY,")
+                            print(f"      or run: hermes setup → Custom OpenAI-compatible endpoint")
+                        else:
+                            print(f"(>_<) Could not resolve credentials for provider '{provider_label}': {e}")
                         print(f"(^_^) Current model unchanged: {self.model}")
                         return True
 
diff --git a/hermes_cli/models.py b/hermes_cli/models.py
index 0df1d309..54d4e3c1 100644
--- a/hermes_cli/models.py
+++ b/hermes_cli/models.py
@@ -276,6 +276,15 @@ def validate_requested_model(
             "message": "Model names cannot contain spaces.",
         }
 
+    # Custom endpoints can serve any model — skip validation
+    if normalized == "custom":
+        return {
+            "accepted": True,
+            "persist": True,
+            "recognized": False,
+            "message": None,
+        }
+
     # Probe the live API to check if the model actually exists
     api_models = fetch_api_models(api_key, base_url)