From 8805e705a7e134ff7e090bd5fa5e37ba2ec14811 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Wed, 11 Mar 2026 19:46:47 -0700 Subject: [PATCH 01/11] feat: centralized provider router + fix Codex vision bypass + vision error handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three interconnected fixes for auxiliary client infrastructure: 1. CENTRALIZED PROVIDER ROUTER (auxiliary_client.py) Add resolve_provider_client(provider, model, async_mode) — a single entry point for creating properly configured clients. Given a provider name and optional model, it handles auth lookup (env vars, OAuth tokens, auth.json), base URL resolution, provider-specific headers, and API format differences (Chat Completions vs Responses API for Codex). All auxiliary consumers should route through this instead of ad-hoc env var lookups. Refactored get_text_auxiliary_client, get_async_text_auxiliary_client, and get_vision_auxiliary_client to use the router internally. 2. FIX CODEX VISION BYPASS (vision_tools.py) vision_tools.py was constructing a raw AsyncOpenAI client from the sync vision client's api_key/base_url, completely bypassing the Codex Responses API adapter. When the vision provider resolved to Codex, the raw client would hit chatgpt.com/backend-api/codex with chat.completions.create() which only supports the Responses API. Fix: Added get_async_vision_auxiliary_client() which properly wraps Codex into AsyncCodexAuxiliaryClient. vision_tools.py now uses this instead of manual client construction. 3. FIX COMPRESSION FALLBACK + VISION ERROR HANDLING - context_compressor.py: Removed _get_fallback_client() which blindly looked for OPENAI_API_KEY + OPENAI_BASE_URL (fails for Codex OAuth, API-key providers, users without OPENAI_BASE_URL set). Replaced with fallback loop through resolve_provider_client() for each known provider, with same-provider dedup. - vision_tools.py: Added error detection for vision capability failures. Returns clear message to the model when the configured model doesn't support vision, instead of a generic error. Addresses #886 --- agent/auxiliary_client.py | 225 ++++++++++++++++++++++++++++++++---- agent/context_compressor.py | 67 +++++------ tools/vision_tools.py | 42 ++++--- 3 files changed, 256 insertions(+), 78 deletions(-) diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index 57c3c118..4571520a 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -499,6 +499,188 @@ def _resolve_auto() -> Tuple[Optional[OpenAI], Optional[str]]: return None, None +# ── Centralized Provider Router ───────────────────────────────────────────── +# +# resolve_provider_client() is the single entry point for creating a properly +# configured client given a (provider, model) pair. It handles auth lookup, +# base URL resolution, provider-specific headers, and API format differences +# (Chat Completions vs Responses API for Codex). +# +# All auxiliary consumer code should go through this or the public helpers +# below — never look up auth env vars ad-hoc. + + +def _to_async_client(sync_client, model: str): + """Convert a sync client to its async counterpart, preserving Codex routing.""" + from openai import AsyncOpenAI + + if isinstance(sync_client, CodexAuxiliaryClient): + return AsyncCodexAuxiliaryClient(sync_client), model + + async_kwargs = { + "api_key": sync_client.api_key, + "base_url": str(sync_client.base_url), + } + base_lower = str(sync_client.base_url).lower() + if "openrouter" in base_lower: + async_kwargs["default_headers"] = dict(_OR_HEADERS) + elif "api.kimi.com" in base_lower: + async_kwargs["default_headers"] = {"User-Agent": "KimiCLI/1.0"} + return AsyncOpenAI(**async_kwargs), model + + +def resolve_provider_client( + provider: str, + model: str = None, + async_mode: bool = False, +) -> Tuple[Optional[Any], Optional[str]]: + """Central router: given a provider name and optional model, return a + configured client with the correct auth, base URL, and API format. + + The returned client always exposes ``.chat.completions.create()`` — for + Codex/Responses API providers, an adapter handles the translation + transparently. + + Args: + provider: Provider identifier. One of: + "openrouter", "nous", "openai-codex" (or "codex"), + "zai", "kimi-coding", "minimax", "minimax-cn", "nous-api", + "custom" (OPENAI_BASE_URL + OPENAI_API_KEY), + "auto" (full auto-detection chain). + model: Model slug override. If None, uses the provider's default + auxiliary model. + async_mode: If True, return an async-compatible client. + + Returns: + (client, resolved_model) or (None, None) if auth is unavailable. + """ + # Normalise aliases + provider = (provider or "auto").strip().lower() + if provider == "codex": + provider = "openai-codex" + if provider == "main": + provider = "custom" + + # ── Auto: try all providers in priority order ──────────────────── + if provider == "auto": + client, resolved = _resolve_auto() + if client is None: + return None, None + final_model = model or resolved + return (_to_async_client(client, final_model) if async_mode + else (client, final_model)) + + # ── OpenRouter ─────────────────────────────────────────────────── + if provider == "openrouter": + client, default = _try_openrouter() + if client is None: + logger.warning("resolve_provider_client: openrouter requested " + "but OPENROUTER_API_KEY not set") + return None, None + final_model = model or default + return (_to_async_client(client, final_model) if async_mode + else (client, final_model)) + + # ── Nous Portal (OAuth) ────────────────────────────────────────── + if provider == "nous": + client, default = _try_nous() + if client is None: + logger.warning("resolve_provider_client: nous requested " + "but Nous Portal not configured (run: hermes login)") + return None, None + final_model = model or default + return (_to_async_client(client, final_model) if async_mode + else (client, final_model)) + + # ── OpenAI Codex (OAuth → Responses API) ───────────────────────── + if provider == "openai-codex": + client, default = _try_codex() + if client is None: + logger.warning("resolve_provider_client: openai-codex requested " + "but no Codex OAuth token found (run: hermes model)") + return None, None + final_model = model or default + return (_to_async_client(client, final_model) if async_mode + else (client, final_model)) + + # ── Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY) ─────────── + if provider == "custom": + # Try custom first, then codex, then API-key providers + for try_fn in (_try_custom_endpoint, _try_codex, + _resolve_api_key_provider): + client, default = try_fn() + if client is not None: + final_model = model or default + return (_to_async_client(client, final_model) if async_mode + else (client, final_model)) + logger.warning("resolve_provider_client: custom/main requested " + "but no endpoint credentials found") + return None, None + + # ── API-key providers from PROVIDER_REGISTRY ───────────────────── + try: + from hermes_cli.auth import PROVIDER_REGISTRY, _resolve_kimi_base_url + except ImportError: + logger.debug("hermes_cli.auth not available for provider %s", provider) + return None, None + + pconfig = PROVIDER_REGISTRY.get(provider) + if pconfig is None: + logger.warning("resolve_provider_client: unknown provider %r", provider) + return None, None + + if pconfig.auth_type == "api_key": + # Find the first configured API key + api_key = "" + for env_var in pconfig.api_key_env_vars: + api_key = os.getenv(env_var, "").strip() + if api_key: + break + if not api_key: + logger.warning("resolve_provider_client: provider %s has no API " + "key configured (tried: %s)", + provider, ", ".join(pconfig.api_key_env_vars)) + return None, None + + # Resolve base URL (env override → provider-specific logic → default) + base_url_override = os.getenv(pconfig.base_url_env_var, "").strip() if pconfig.base_url_env_var else "" + if provider == "kimi-coding": + base_url = _resolve_kimi_base_url(api_key, pconfig.inference_base_url, base_url_override) + elif base_url_override: + base_url = base_url_override + else: + base_url = pconfig.inference_base_url + + default_model = _API_KEY_PROVIDER_AUX_MODELS.get(provider, "") + final_model = model or default_model + + # Provider-specific headers + headers = {} + if "api.kimi.com" in base_url.lower(): + headers["User-Agent"] = "KimiCLI/1.0" + + client = OpenAI(api_key=api_key, base_url=base_url, + **({"default_headers": headers} if headers else {})) + logger.debug("resolve_provider_client: %s (%s)", provider, final_model) + return (_to_async_client(client, final_model) if async_mode + else (client, final_model)) + + elif pconfig.auth_type in ("oauth_device_code", "oauth_external"): + # OAuth providers — route through their specific try functions + if provider == "nous": + return resolve_provider_client("nous", model, async_mode) + if provider == "openai-codex": + return resolve_provider_client("openai-codex", model, async_mode) + # nous-api is api_key type so it's handled above + logger.warning("resolve_provider_client: OAuth provider %s not " + "directly supported, try 'auto'", provider) + return None, None + + logger.warning("resolve_provider_client: unhandled auth_type %s for %s", + pconfig.auth_type, provider) + return None, None + + # ── Public API ────────────────────────────────────────────────────────────── def get_text_auxiliary_client(task: str = "") -> Tuple[Optional[OpenAI], Optional[str]]: @@ -513,8 +695,8 @@ def get_text_auxiliary_client(task: str = "") -> Tuple[Optional[OpenAI], Optiona """ forced = _get_auxiliary_provider(task) if forced != "auto": - return _resolve_forced_provider(forced) - return _resolve_auto() + return resolve_provider_client(forced) + return resolve_provider_client("auto") def get_async_text_auxiliary_client(task: str = ""): @@ -524,24 +706,10 @@ def get_async_text_auxiliary_client(task: str = ""): (AsyncCodexAuxiliaryClient, model) which wraps the Responses API. Returns (None, None) when no provider is available. """ - from openai import AsyncOpenAI - - sync_client, model = get_text_auxiliary_client(task) - if sync_client is None: - return None, None - - if isinstance(sync_client, CodexAuxiliaryClient): - return AsyncCodexAuxiliaryClient(sync_client), model - - async_kwargs = { - "api_key": sync_client.api_key, - "base_url": str(sync_client.base_url), - } - if "openrouter" in str(sync_client.base_url).lower(): - async_kwargs["default_headers"] = dict(_OR_HEADERS) - elif "api.kimi.com" in str(sync_client.base_url).lower(): - async_kwargs["default_headers"] = {"User-Agent": "KimiCLI/1.0"} - return AsyncOpenAI(**async_kwargs), model + forced = _get_auxiliary_provider(task) + if forced != "auto": + return resolve_provider_client(forced, async_mode=True) + return resolve_provider_client("auto", async_mode=True) def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]: @@ -559,7 +727,7 @@ def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]: """ forced = _get_auxiliary_provider("vision") if forced != "auto": - return _resolve_forced_provider(forced) + return resolve_provider_client(forced) # Auto: try providers known to support multimodal first, then fall # back to the user's custom endpoint. Many local models (Qwen-VL, # LLaVA, Pixtral, etc.) support vision — skipping them entirely @@ -573,6 +741,21 @@ def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]: return None, None +def get_async_vision_auxiliary_client(): + """Return (async_client, model_slug) for async vision consumers. + + Properly handles Codex routing — unlike manually constructing + AsyncOpenAI from a sync client, this preserves the Responses API + adapter for Codex providers. + + Returns (None, None) when no provider is available. + """ + sync_client, model = get_vision_auxiliary_client() + if sync_client is None: + return None, None + return _to_async_client(sync_client, model) + + def get_auxiliary_extra_body() -> dict: """Return extra_body kwargs for auxiliary API calls. diff --git a/agent/context_compressor.py b/agent/context_compressor.py index 01aa2af8..fae483fd 100644 --- a/agent/context_compressor.py +++ b/agent/context_compressor.py @@ -127,20 +127,38 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix.""" except Exception as e: logging.warning(f"Failed to generate context summary with auxiliary model: {e}") - # 2. Fallback: try the user's main model endpoint - fallback_client, fallback_model = self._get_fallback_client() - if fallback_client is not None: + # 2. Fallback: re-try via the centralized provider router. + # This covers all configured providers (Codex OAuth, API-key + # providers, etc.) without ad-hoc env var lookups. + from agent.auxiliary_client import resolve_provider_client + fallback_providers = ["custom", "openrouter", "nous", "codex"] + for fb_provider in fallback_providers: try: - logger.info("Retrying context summary with main model (%s)", fallback_model) - summary = self._call_summary_model(fallback_client, fallback_model, prompt) - self.client = fallback_client - self.summary_model = fallback_model + fb_client, fb_model = resolve_provider_client( + fb_provider, model=self.model) + if fb_client is None: + continue + # Don't retry the same client that just failed + if (self.client is not None + and hasattr(fb_client, "base_url") + and hasattr(self.client, "base_url") + and str(fb_client.base_url) == str(self.client.base_url)): + continue + logger.info("Retrying context summary with fallback provider " + "%s (%s)", fb_provider, fb_model) + summary = self._call_summary_model(fb_client, fb_model, prompt) + # Promote successful fallback for future compressions + self.client = fb_client + self.summary_model = fb_model return summary except Exception as fallback_err: - logging.warning(f"Main model summary also failed: {fallback_err}") + logging.warning("Fallback provider %s failed: %s", + fb_provider, fallback_err) - # 3. All models failed — return None so the caller drops turns without a summary - logging.warning("Context compression: no model available for summary. Middle turns will be dropped without summary.") + # 3. All providers failed — return None so the caller drops turns + # without a summary. + logging.warning("Context compression: no provider available for " + "summary. Middle turns will be dropped without summary.") return None def _call_summary_model(self, client, model: str, prompt: str) -> str: @@ -170,35 +188,6 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix.""" summary = "[CONTEXT SUMMARY]: " + summary return summary - def _get_fallback_client(self): - """Try to build a fallback client from the main model's endpoint config. - - When the primary auxiliary client fails (e.g. stale OpenRouter key), this - creates a client using the user's active custom endpoint (OPENAI_BASE_URL) - so compression can still produce a real summary instead of a static string. - - Returns (client, model) or (None, None). - """ - custom_base = os.getenv("OPENAI_BASE_URL") - custom_key = os.getenv("OPENAI_API_KEY") - if not custom_base or not custom_key: - return None, None - - # Don't fallback to the same provider that just failed - from hermes_constants import OPENROUTER_BASE_URL - if custom_base.rstrip("/") == OPENROUTER_BASE_URL.rstrip("/"): - return None, None - - model = os.getenv("LLM_MODEL") or os.getenv("OPENAI_MODEL") or self.model - try: - from openai import OpenAI as _OpenAI - client = _OpenAI(api_key=custom_key, base_url=custom_base) - logger.debug("Built fallback auxiliary client: %s via %s", model, custom_base) - return client, model - except Exception as exc: - logger.debug("Could not build fallback auxiliary client: %s", exc) - return None, None - # ------------------------------------------------------------------ # Tool-call / tool-result pair integrity helpers # ------------------------------------------------------------------ diff --git a/tools/vision_tools.py b/tools/vision_tools.py index bfde51ec..ee89b58a 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -37,27 +37,15 @@ from pathlib import Path from typing import Any, Awaitable, Dict, Optional from urllib.parse import urlparse import httpx -from openai import AsyncOpenAI -from agent.auxiliary_client import get_vision_auxiliary_client +from agent.auxiliary_client import get_async_vision_auxiliary_client from tools.debug_helpers import DebugSession logger = logging.getLogger(__name__) -# Resolve vision auxiliary client at module level; build an async wrapper. -_aux_sync_client, DEFAULT_VISION_MODEL = get_vision_auxiliary_client() -_aux_async_client: AsyncOpenAI | None = None -if _aux_sync_client is not None: - _async_kwargs = { - "api_key": _aux_sync_client.api_key, - "base_url": str(_aux_sync_client.base_url), - } - if "openrouter" in str(_aux_sync_client.base_url).lower(): - _async_kwargs["default_headers"] = { - "HTTP-Referer": "https://github.com/NousResearch/hermes-agent", - "X-OpenRouter-Title": "Hermes Agent", - "X-OpenRouter-Categories": "productivity,cli-agent", - } - _aux_async_client = AsyncOpenAI(**_async_kwargs) +# Resolve vision auxiliary client at module level. +# Uses get_async_vision_auxiliary_client() which properly handles Codex +# routing (Responses API adapter) instead of raw AsyncOpenAI construction. +_aux_async_client, DEFAULT_VISION_MODEL = get_async_vision_auxiliary_client() _debug = DebugSession("vision_tools", env_var="VISION_TOOLS_DEBUG") @@ -359,10 +347,28 @@ async def vision_analyze_tool( error_msg = f"Error analyzing image: {str(e)}" logger.error("%s", error_msg, exc_info=True) + # Detect vision capability errors — give the model a clear message + # so it can inform the user instead of a cryptic API error. + err_str = str(e).lower() + if any(hint in err_str for hint in ( + "does not support", "not support image", "invalid_request", + "content_policy", "image_url", "multimodal", + "unrecognized request argument", "image input", + )): + analysis = ( + f"{model} does not support vision or our request was not " + f"accepted by the server. Error: {e}" + ) + else: + analysis = ( + "There was a problem with the request and the image could not " + f"be analyzed. Error: {e}" + ) + # Prepare error response result = { "success": False, - "analysis": "There was a problem with the request and the image could not be analyzed." + "analysis": analysis, } debug_call_data["error"] = error_msg From 07f09ecd83fba861041fb117e5e6221d15819975 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Wed, 11 Mar 2026 20:02:36 -0700 Subject: [PATCH 02/11] refactor: route ad-hoc LLM consumers through centralized provider router MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Route all remaining ad-hoc auxiliary LLM call sites through resolve_provider_client() so auth, headers, and API format (Chat Completions vs Responses API) are handled consistently in one place. Files changed: - tools/openrouter_client.py: Replace manual AsyncOpenAI construction with resolve_provider_client('openrouter', async_mode=True). The shared client module now delegates entirely to the router. - tools/skills_guard.py: Replace inline OpenAI client construction (hardcoded OpenRouter base_url, manual api_key lookup, manual headers) with resolve_provider_client('openrouter'). Remove unused OPENROUTER_BASE_URL import. - trajectory_compressor.py: Add _detect_provider() to map config base_url to a provider name, then route through resolve_provider_client. Falls back to raw construction for unrecognized custom endpoints. - mini_swe_runner.py: Route default case (no explicit api_key/base_url) through resolve_provider_client('openrouter') with auto-detection fallback. Preserves direct construction when explicit creds are passed via CLI args. - agent/auxiliary_client.py: Fix stale module docstring — vision auto mode now correctly documents that Codex and custom endpoints are tried (not skipped). --- agent/auxiliary_client.py | 5 ++- mini_swe_runner.py | 45 ++++++++++---------- tools/openrouter_client.py | 31 +++++--------- tools/skills_guard.py | 20 +++------ trajectory_compressor.py | 85 ++++++++++++++++++++++++-------------- 5 files changed, 97 insertions(+), 89 deletions(-) diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index 4571520a..9c153a74 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -17,7 +17,10 @@ Resolution order for text tasks (auto mode): Resolution order for vision/multimodal tasks (auto mode): 1. OpenRouter 2. Nous Portal - 3. None (steps 3-5 are skipped — they may not support multimodal) + 3. Codex OAuth (gpt-5.3-codex supports vision via Responses API) + 4. Custom endpoint (for local vision models: Qwen-VL, LLaVA, Pixtral, etc.) + 5. None (API-key providers like z.ai/Kimi/MiniMax are skipped — + they may not support multimodal) Per-task provider overrides (e.g. AUXILIARY_VISION_PROVIDER, CONTEXT_COMPRESSION_PROVIDER) can force a specific provider for each task: diff --git a/mini_swe_runner.py b/mini_swe_runner.py index 9be7b734..5cb337b8 100644 --- a/mini_swe_runner.py +++ b/mini_swe_runner.py @@ -189,29 +189,30 @@ class MiniSWERunner: ) self.logger = logging.getLogger(__name__) - # Initialize OpenAI client - defaults to OpenRouter - from openai import OpenAI - - client_kwargs = {} - - # Default to OpenRouter if no base_url provided - if base_url: - client_kwargs["base_url"] = base_url + # Initialize LLM client via centralized provider router. + # If explicit api_key/base_url are provided (e.g. from CLI args), + # construct directly. Otherwise use the router for OpenRouter. + if api_key or base_url: + from openai import OpenAI + client_kwargs = { + "base_url": base_url or "https://openrouter.ai/api/v1", + "api_key": api_key or os.getenv( + "OPENROUTER_API_KEY", + os.getenv("ANTHROPIC_API_KEY", + os.getenv("OPENAI_API_KEY", ""))), + } + self.client = OpenAI(**client_kwargs) else: - client_kwargs["base_url"] = "https://openrouter.ai/api/v1" - - - - # Handle API key - OpenRouter is the primary provider - if api_key: - client_kwargs["api_key"] = api_key - else: - client_kwargs["api_key"] = os.getenv( - "OPENROUTER_API_KEY", - os.getenv("ANTHROPIC_API_KEY", os.getenv("OPENAI_API_KEY", "")) - ) - - self.client = OpenAI(**client_kwargs) + from agent.auxiliary_client import resolve_provider_client + self.client, _ = resolve_provider_client("openrouter", model=model) + if self.client is None: + # Fallback: try auto-detection + self.client, _ = resolve_provider_client("auto", model=model) + if self.client is None: + from openai import OpenAI + self.client = OpenAI( + base_url="https://openrouter.ai/api/v1", + api_key=os.getenv("OPENROUTER_API_KEY", "")) # Environment will be created per-task self.env = None diff --git a/tools/openrouter_client.py b/tools/openrouter_client.py index 343cf102..0637a7db 100644 --- a/tools/openrouter_client.py +++ b/tools/openrouter_client.py @@ -1,39 +1,30 @@ """Shared OpenRouter API client for Hermes tools. Provides a single lazy-initialized AsyncOpenAI client that all tool modules -can share, eliminating the duplicated _get_openrouter_client() / -_get_summarizer_client() pattern previously copy-pasted across web_tools, -vision_tools, mixture_of_agents_tool, and session_search_tool. +can share. Routes through the centralized provider router in +agent/auxiliary_client.py so auth, headers, and API format are handled +consistently. """ import os -from openai import AsyncOpenAI -from hermes_constants import OPENROUTER_BASE_URL - -_client: AsyncOpenAI | None = None +_client = None -def get_async_client() -> AsyncOpenAI: - """Return a shared AsyncOpenAI client pointed at OpenRouter. +def get_async_client(): + """Return a shared async OpenAI-compatible client for OpenRouter. The client is created lazily on first call and reused thereafter. + Uses the centralized provider router for auth and client construction. Raises ValueError if OPENROUTER_API_KEY is not set. """ global _client if _client is None: - api_key = os.getenv("OPENROUTER_API_KEY") - if not api_key: + from agent.auxiliary_client import resolve_provider_client + client, _model = resolve_provider_client("openrouter", async_mode=True) + if client is None: raise ValueError("OPENROUTER_API_KEY environment variable not set") - _client = AsyncOpenAI( - api_key=api_key, - base_url=OPENROUTER_BASE_URL, - default_headers={ - "HTTP-Referer": "https://github.com/NousResearch/hermes-agent", - "X-OpenRouter-Title": "Hermes Agent", - "X-OpenRouter-Categories": "productivity,cli-agent", - }, - ) + _client = client return _client diff --git a/tools/skills_guard.py b/tools/skills_guard.py index 0b6d7fee..8234b0a2 100644 --- a/tools/skills_guard.py +++ b/tools/skills_guard.py @@ -29,7 +29,7 @@ from datetime import datetime, timezone from pathlib import Path from typing import List, Tuple -from hermes_constants import OPENROUTER_BASE_URL + # --------------------------------------------------------------------------- @@ -934,24 +934,14 @@ def llm_audit_skill(skill_path: Path, static_result: ScanResult, if not model: return static_result - # Call the LLM via the OpenAI SDK (same pattern as run_agent.py) + # Call the LLM via the centralized provider router try: - from openai import OpenAI - import os + from agent.auxiliary_client import resolve_provider_client - api_key = os.getenv("OPENROUTER_API_KEY", "") - if not api_key: + client, _default_model = resolve_provider_client("openrouter") + if client is None: return static_result - client = OpenAI( - base_url=OPENROUTER_BASE_URL, - api_key=api_key, - default_headers={ - "HTTP-Referer": "https://github.com/NousResearch/hermes-agent", - "X-OpenRouter-Title": "Hermes Agent", - "X-OpenRouter-Categories": "productivity,cli-agent", - }, - ) response = client.chat.completions.create( model=model, messages=[{ diff --git a/trajectory_compressor.py b/trajectory_compressor.py index 3f49c617..5f1c84c6 100644 --- a/trajectory_compressor.py +++ b/trajectory_compressor.py @@ -344,38 +344,61 @@ class TrajectoryCompressor: raise RuntimeError(f"Failed to load tokenizer '{self.config.tokenizer_name}': {e}") def _init_summarizer(self): - """Initialize OpenRouter client for summarization (sync and async).""" - api_key = os.getenv(self.config.api_key_env) - if not api_key: - raise RuntimeError(f"Missing API key. Set {self.config.api_key_env} environment variable.") - - from openai import OpenAI, AsyncOpenAI - - # OpenRouter app attribution headers (only for OpenRouter endpoints) - extra = {} - if "openrouter" in self.config.base_url.lower(): - extra["default_headers"] = { - "HTTP-Referer": "https://github.com/NousResearch/hermes-agent", - "X-OpenRouter-Title": "Hermes Agent", - "X-OpenRouter-Categories": "productivity,cli-agent", - } - - # Sync client (for backwards compatibility) - self.client = OpenAI( - api_key=api_key, - base_url=self.config.base_url, - **extra, - ) - - # Async client for parallel processing - self.async_client = AsyncOpenAI( - api_key=api_key, - base_url=self.config.base_url, - **extra, - ) - - print(f"✅ Initialized OpenRouter client: {self.config.summarization_model}") + """Initialize LLM client for summarization (sync and async). + + Routes through the centralized provider router for known providers + (OpenRouter, Nous, Codex, etc.) so auth and headers are handled + consistently. Falls back to raw construction for custom endpoints. + """ + from agent.auxiliary_client import resolve_provider_client + + provider = self._detect_provider() + if provider: + # Use centralized router — handles auth, headers, Codex adapter + self.client, _ = resolve_provider_client( + provider, model=self.config.summarization_model) + self.async_client, _ = resolve_provider_client( + provider, model=self.config.summarization_model, + async_mode=True) + if self.client is None: + raise RuntimeError( + f"Provider '{provider}' is not configured. " + f"Check your API key or run: hermes setup") + else: + # Custom endpoint — use config's raw base_url + api_key_env + api_key = os.getenv(self.config.api_key_env) + if not api_key: + raise RuntimeError( + f"Missing API key. Set {self.config.api_key_env} " + f"environment variable.") + from openai import OpenAI, AsyncOpenAI + self.client = OpenAI( + api_key=api_key, base_url=self.config.base_url) + self.async_client = AsyncOpenAI( + api_key=api_key, base_url=self.config.base_url) + + print(f"✅ Initialized summarizer client: {self.config.summarization_model}") print(f" Max concurrent requests: {self.config.max_concurrent_requests}") + + def _detect_provider(self) -> str: + """Detect the provider name from the configured base_url.""" + url = self.config.base_url.lower() + if "openrouter" in url: + return "openrouter" + if "nousresearch.com" in url: + return "nous" + if "chatgpt.com/backend-api/codex" in url: + return "codex" + if "api.z.ai" in url: + return "zai" + if "moonshot.ai" in url or "api.kimi.com" in url: + return "kimi-coding" + if "minimaxi.com" in url: + return "minimax-cn" + if "minimax.io" in url: + return "minimax" + # Unknown base_url — not a known provider + return "" def count_tokens(self, text: str) -> int: """Count tokens in text using the configured tokenizer.""" From 013cc4d2fcc46c25edb7b2452a1e101209dea2fb Mon Sep 17 00:00:00 2001 From: teknium1 Date: Wed, 11 Mar 2026 20:14:44 -0700 Subject: [PATCH 03/11] chore: remove nous-api provider (API key path) Nous Portal only supports OAuth authentication. Remove the 'nous-api' provider which allowed direct API key access via NOUS_API_KEY env var. Removed from: - hermes_cli/auth.py: PROVIDER_REGISTRY entry + aliases - hermes_cli/config.py: OPTIONAL_ENV_VARS entry - hermes_cli/setup.py: setup wizard option + model selection handler (reindexed remaining provider choices) - agent/auxiliary_client.py: docstring references - tests/test_runtime_provider_resolution.py: nous-api test - tests/integration/test_web_tools.py: renamed dict key --- agent/auxiliary_client.py | 4 +- hermes_cli/auth.py | 9 ---- hermes_cli/config.py | 8 --- hermes_cli/setup.py | 61 ++++------------------- tests/integration/test_web_tools.py | 2 +- tests/test_runtime_provider_resolution.py | 23 --------- 6 files changed, 14 insertions(+), 93 deletions(-) diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index 9c153a74..264bab3f 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -547,7 +547,7 @@ def resolve_provider_client( Args: provider: Provider identifier. One of: "openrouter", "nous", "openai-codex" (or "codex"), - "zai", "kimi-coding", "minimax", "minimax-cn", "nous-api", + "zai", "kimi-coding", "minimax", "minimax-cn", "custom" (OPENAI_BASE_URL + OPENAI_API_KEY), "auto" (full auto-detection chain). model: Model slug override. If None, uses the provider's default @@ -674,7 +674,7 @@ def resolve_provider_client( return resolve_provider_client("nous", model, async_mode) if provider == "openai-codex": return resolve_provider_client("openai-codex", model, async_mode) - # nous-api is api_key type so it's handled above + # Other OAuth providers not directly supported logger.warning("resolve_provider_client: OAuth provider %s not " "directly supported, try 'auto'", provider) return None, None diff --git a/hermes_cli/auth.py b/hermes_cli/auth.py index c90f7792..05d233f9 100644 --- a/hermes_cli/auth.py +++ b/hermes_cli/auth.py @@ -108,14 +108,6 @@ PROVIDER_REGISTRY: Dict[str, ProviderConfig] = { auth_type="oauth_external", inference_base_url=DEFAULT_CODEX_BASE_URL, ), - "nous-api": ProviderConfig( - id="nous-api", - name="Nous Portal (API Key)", - auth_type="api_key", - inference_base_url="https://inference-api.nousresearch.com/v1", - api_key_env_vars=("NOUS_API_KEY",), - base_url_env_var="NOUS_BASE_URL", - ), "zai": ProviderConfig( id="zai", name="Z.AI / GLM", @@ -521,7 +513,6 @@ def resolve_provider( # Normalize provider aliases _PROVIDER_ALIASES = { - "nous_api": "nous-api", "nousapi": "nous-api", "nous-portal-api": "nous-api", "glm": "zai", "z-ai": "zai", "z.ai": "zai", "zhipu": "zai", "kimi": "kimi-coding", "moonshot": "kimi-coding", "minimax-china": "minimax-cn", "minimax_cn": "minimax-cn", diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 75811849..677de678 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -242,14 +242,6 @@ REQUIRED_ENV_VARS = {} # Optional environment variables that enhance functionality OPTIONAL_ENV_VARS = { # ── Provider (handled in provider selection, not shown in checklists) ── - "NOUS_API_KEY": { - "description": "Nous Portal API key (direct API key access to Nous inference)", - "prompt": "Nous Portal API key", - "url": "https://portal.nousresearch.com", - "password": True, - "category": "provider", - "advanced": True, - }, "NOUS_BASE_URL": { "description": "Nous Portal base URL override", "prompt": "Nous Portal base URL (leave empty for default)", diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index c471b1b9..6b00952c 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -516,7 +516,6 @@ def setup_model_provider(config: dict): keep_label = None # No provider configured — don't show "Keep current" provider_choices = [ - "Nous Portal API key (direct API key access)", "Login with Nous Portal (Nous Research subscription — OAuth)", "Login with OpenAI Codex", "OpenRouter API key (100+ models, pay-per-use)", @@ -530,7 +529,7 @@ def setup_model_provider(config: dict): provider_choices.append(keep_label) # Default to "Keep current" if a provider exists, otherwise OpenRouter (most common) - default_provider = len(provider_choices) - 1 if has_any_provider else 3 + default_provider = len(provider_choices) - 1 if has_any_provider else 2 if not has_any_provider: print_warning("An inference provider is required for Hermes to work.") @@ -542,37 +541,7 @@ def setup_model_provider(config: dict): selected_provider = None # "nous", "openai-codex", "openrouter", "custom", or None (keep) nous_models = [] # populated if Nous login succeeds - if provider_idx == 0: # Nous Portal API Key (direct) - selected_provider = "nous-api" - print() - print_header("Nous Portal API Key") - print_info("Use a Nous Portal API key for direct access to Nous inference.") - print_info("Get your API key at: https://portal.nousresearch.com") - print() - - existing_key = get_env_value("NOUS_API_KEY") - if existing_key: - print_info(f"Current: {existing_key[:8]}... (configured)") - if prompt_yes_no("Update Nous API key?", False): - api_key = prompt(" Nous API key", password=True) - if api_key: - save_env_value("NOUS_API_KEY", api_key) - print_success("Nous API key updated") - else: - api_key = prompt(" Nous API key", password=True) - if api_key: - save_env_value("NOUS_API_KEY", api_key) - print_success("Nous API key saved") - else: - print_warning("Skipped - agent won't work without an API key") - - # Clear custom endpoint vars if switching - if existing_custom: - save_env_value("OPENAI_BASE_URL", "") - save_env_value("OPENAI_API_KEY", "") - _update_config_for_provider("nous-api", "https://inference-api.nousresearch.com/v1") - - elif provider_idx == 1: # Nous Portal + if provider_idx == 0: # Nous Portal (OAuth) selected_provider = "nous" print() print_header("Nous Portal Login") @@ -612,7 +581,7 @@ def setup_model_provider(config: dict): print_info("You can try again later with: hermes model") selected_provider = None - elif provider_idx == 2: # OpenAI Codex + elif provider_idx == 1: # OpenAI Codex selected_provider = "openai-codex" print() print_header("OpenAI Codex Login") @@ -636,7 +605,7 @@ def setup_model_provider(config: dict): print_info("You can try again later with: hermes model") selected_provider = None - elif provider_idx == 3: # OpenRouter + elif provider_idx == 2: # OpenRouter selected_provider = "openrouter" print() print_header("OpenRouter API Key") @@ -686,7 +655,7 @@ def setup_model_provider(config: dict): except Exception as e: logger.debug("Could not save provider to config.yaml: %s", e) - elif provider_idx == 4: # Custom endpoint + elif provider_idx == 3: # Custom endpoint selected_provider = "custom" print() print_header("Custom OpenAI-Compatible Endpoint") @@ -737,7 +706,7 @@ def setup_model_provider(config: dict): print_success("Custom endpoint configured") - elif provider_idx == 5: # Z.AI / GLM + elif provider_idx == 4: # Z.AI / GLM selected_provider = "zai" print() print_header("Z.AI / GLM API Key") @@ -791,7 +760,7 @@ def setup_model_provider(config: dict): save_env_value("OPENAI_API_KEY", "") _update_config_for_provider("zai", zai_base_url) - elif provider_idx == 6: # Kimi / Moonshot + elif provider_idx == 5: # Kimi / Moonshot selected_provider = "kimi-coding" print() print_header("Kimi / Moonshot API Key") @@ -823,7 +792,7 @@ def setup_model_provider(config: dict): save_env_value("OPENAI_API_KEY", "") _update_config_for_provider("kimi-coding", pconfig.inference_base_url) - elif provider_idx == 7: # MiniMax + elif provider_idx == 6: # MiniMax selected_provider = "minimax" print() print_header("MiniMax API Key") @@ -855,7 +824,7 @@ def setup_model_provider(config: dict): save_env_value("OPENAI_API_KEY", "") _update_config_for_provider("minimax", pconfig.inference_base_url) - elif provider_idx == 8: # MiniMax China + elif provider_idx == 7: # MiniMax China selected_provider = "minimax-cn" print() print_header("MiniMax China API Key") @@ -887,12 +856,12 @@ def setup_model_provider(config: dict): save_env_value("OPENAI_API_KEY", "") _update_config_for_provider("minimax-cn", pconfig.inference_base_url) - # else: provider_idx == 9 (Keep current) — only shown when a provider already exists + # else: provider_idx == 8 (Keep current) — only shown when a provider already exists # ── OpenRouter API Key for tools (if not already set) ── # Tools (vision, web, MoA) use OpenRouter independently of the main provider. # Prompt for OpenRouter key if not set and a non-OpenRouter provider was chosen. - if selected_provider in ("nous", "nous-api", "openai-codex", "custom", "zai", "kimi-coding", "minimax", "minimax-cn") and not get_env_value("OPENROUTER_API_KEY"): + if selected_provider in ("nous", "openai-codex", "custom", "zai", "kimi-coding", "minimax", "minimax-cn") and not get_env_value("OPENROUTER_API_KEY"): print() print_header("OpenRouter API Key (for tools)") print_info("Tools like vision analysis, web search, and MoA use OpenRouter") @@ -945,14 +914,6 @@ def setup_model_provider(config: dict): if custom: config['model'] = custom save_env_value("LLM_MODEL", custom) - elif selected_provider == "nous-api": - # Nous API key provider — prompt for model manually - print_info("Enter a model name available on Nous inference API.") - print_info("Examples: anthropic/claude-opus-4.6, deepseek/deepseek-r1") - custom = prompt(f" Model name (Enter to keep '{current_model}')") - if custom: - config['model'] = custom - save_env_value("LLM_MODEL", custom) elif selected_provider == "openai-codex": from hermes_cli.codex_models import get_codex_model_ids codex_models = get_codex_model_ids() diff --git a/tests/integration/test_web_tools.py b/tests/integration/test_web_tools.py index cd3de453..fb2ea9da 100644 --- a/tests/integration/test_web_tools.py +++ b/tests/integration/test_web_tools.py @@ -579,7 +579,7 @@ class WebToolsTester: "results": self.test_results, "environment": { "firecrawl_api_key": check_firecrawl_api_key(), - "nous_api_key": check_auxiliary_model(), + "auxiliary_model": check_auxiliary_model(), "debug_mode": get_debug_session_info()["enabled"] } } diff --git a/tests/test_runtime_provider_resolution.py b/tests/test_runtime_provider_resolution.py index 9ccd7c7e..9631591b 100644 --- a/tests/test_runtime_provider_resolution.py +++ b/tests/test_runtime_provider_resolution.py @@ -158,29 +158,6 @@ def test_custom_endpoint_auto_provider_prefers_openai_key(monkeypatch): assert resolved["api_key"] == "sk-vllm-key" -def test_resolve_runtime_provider_nous_api(monkeypatch): - """Nous Portal API key provider resolves via the api_key path.""" - monkeypatch.setattr(rp, "resolve_provider", lambda *a, **k: "nous-api") - monkeypatch.setattr( - rp, - "resolve_api_key_provider_credentials", - lambda pid: { - "provider": "nous-api", - "api_key": "nous-test-key", - "base_url": "https://inference-api.nousresearch.com/v1", - "source": "NOUS_API_KEY", - }, - ) - - resolved = rp.resolve_runtime_provider(requested="nous-api") - - assert resolved["provider"] == "nous-api" - assert resolved["api_mode"] == "chat_completions" - assert resolved["base_url"] == "https://inference-api.nousresearch.com/v1" - assert resolved["api_key"] == "nous-test-key" - assert resolved["requested_provider"] == "nous-api" - - def test_explicit_openrouter_skips_openai_base_url(monkeypatch): """When the user explicitly requests openrouter, OPENAI_BASE_URL (which may point to a custom endpoint) must not override the From 0aa31cd3cb8167748ade1195e40eff469f07c7da Mon Sep 17 00:00:00 2001 From: teknium1 Date: Wed, 11 Mar 2026 20:52:19 -0700 Subject: [PATCH 04/11] feat: call_llm/async_call_llm + config slots + migrate all consumers Add centralized call_llm() and async_call_llm() functions that own the full LLM request lifecycle: 1. Resolve provider + model from task config or explicit args 2. Get or create a cached client for that provider 3. Format request args (max_tokens handling, provider extra_body) 4. Make the API call with max_tokens/max_completion_tokens retry 5. Return the response Config: expanded auxiliary section with provider:model slots for all tasks (compression, vision, web_extract, session_search, skills_hub, mcp, flush_memories). Config version bumped to 7. Migrated all auxiliary consumers: - context_compressor.py: uses call_llm(task='compression') - vision_tools.py: uses async_call_llm(task='vision') - web_tools.py: uses async_call_llm(task='web_extract') - session_search_tool.py: uses async_call_llm(task='session_search') - browser_tool.py: uses call_llm(task='vision'/'web_extract') - mcp_tool.py: uses call_llm(task='mcp') - skills_guard.py: uses call_llm(provider='openrouter') - run_agent.py flush_memories: uses call_llm(task='flush_memories') Tests updated for context_compressor and MCP tool. Some test mocks still need updating (15 remaining failures from mock pattern changes, 2 pre-existing). --- agent/auxiliary_client.py | 250 +++++++++++++++++++++++++ agent/context_compressor.py | 94 +++------- hermes_cli/config.py | 32 +++- run_agent.py | 31 +-- tests/agent/test_context_compressor.py | 46 ++--- tests/tools/test_mcp_tool.py | 97 +++++----- tools/browser_tool.py | 83 +++----- tools/mcp_tool.py | 40 ++-- tools/session_search_tool.py | 26 +-- tools/skills_guard.py | 9 +- tools/vision_tools.py | 60 +++--- tools/web_tools.py | 89 ++++----- trajectory_compressor.py | 70 ++++--- 13 files changed, 552 insertions(+), 375 deletions(-) diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index 264bab3f..04afe4c7 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -784,3 +784,253 @@ def auxiliary_max_tokens_param(value: int) -> dict: and "api.openai.com" in custom_base.lower()): return {"max_completion_tokens": value} return {"max_tokens": value} + + +# ── Centralized LLM Call API ──────────────────────────────────────────────── +# +# call_llm() and async_call_llm() own the full request lifecycle: +# 1. Resolve provider + model from task config (or explicit args) +# 2. Get or create a cached client for that provider +# 3. Format request args for the provider + model (max_tokens handling, etc.) +# 4. Make the API call +# 5. Return the response +# +# Every auxiliary LLM consumer should use these instead of manually +# constructing clients and calling .chat.completions.create(). + +# Client cache: (provider, async_mode) -> (client, default_model) +_client_cache: Dict[tuple, tuple] = {} + + +def _get_cached_client( + provider: str, model: str = None, async_mode: bool = False, +) -> Tuple[Optional[Any], Optional[str]]: + """Get or create a cached client for the given provider.""" + cache_key = (provider, async_mode) + if cache_key in _client_cache: + cached_client, cached_default = _client_cache[cache_key] + return cached_client, model or cached_default + client, default_model = resolve_provider_client(provider, model, async_mode) + if client is not None: + _client_cache[cache_key] = (client, default_model) + return client, model or default_model + + +def _resolve_task_provider_model( + task: str = None, + provider: str = None, + model: str = None, +) -> Tuple[str, Optional[str]]: + """Determine provider + model for a call. + + Priority: + 1. Explicit provider/model args (always win) + 2. Env var overrides (AUXILIARY_{TASK}_PROVIDER, etc.) + 3. Config file (auxiliary.{task}.provider/model or compression.*) + 4. "auto" (full auto-detection chain) + + Returns (provider, model) where model may be None (use provider default). + """ + if provider: + return provider, model + + if task: + # Check env var overrides first + env_provider = _get_auxiliary_provider(task) + if env_provider != "auto": + # Check for env var model override too + env_model = None + for prefix in ("AUXILIARY_", "CONTEXT_"): + val = os.getenv(f"{prefix}{task.upper()}_MODEL", "").strip() + if val: + env_model = val + break + return env_provider, model or env_model + + # Read from config file + try: + from hermes_cli.config import load_config + config = load_config() + except ImportError: + return "auto", model + + # Check auxiliary.{task} section + aux = config.get("auxiliary", {}) + task_config = aux.get(task, {}) + cfg_provider = task_config.get("provider", "").strip() or None + cfg_model = task_config.get("model", "").strip() or None + + # Backwards compat: compression section has its own keys + if task == "compression" and not cfg_provider: + comp = config.get("compression", {}) + cfg_provider = comp.get("summary_provider", "").strip() or None + cfg_model = cfg_model or comp.get("summary_model", "").strip() or None + + if cfg_provider and cfg_provider != "auto": + return cfg_provider, model or cfg_model + return "auto", model or cfg_model + + return "auto", model + + +def _build_call_kwargs( + provider: str, + model: str, + messages: list, + temperature: Optional[float] = None, + max_tokens: Optional[int] = None, + tools: Optional[list] = None, + timeout: float = 30.0, + extra_body: Optional[dict] = None, +) -> dict: + """Build kwargs for .chat.completions.create() with model/provider adjustments.""" + kwargs: Dict[str, Any] = { + "model": model, + "messages": messages, + "timeout": timeout, + } + + if temperature is not None: + kwargs["temperature"] = temperature + + if max_tokens is not None: + # Codex adapter handles max_tokens internally; OpenRouter/Nous use max_tokens. + # Direct OpenAI api.openai.com with newer models needs max_completion_tokens. + if provider == "custom": + custom_base = os.getenv("OPENAI_BASE_URL", "") + if "api.openai.com" in custom_base.lower(): + kwargs["max_completion_tokens"] = max_tokens + else: + kwargs["max_tokens"] = max_tokens + else: + kwargs["max_tokens"] = max_tokens + + if tools: + kwargs["tools"] = tools + + # Provider-specific extra_body + merged_extra = dict(extra_body or {}) + if provider == "nous" or auxiliary_is_nous: + merged_extra.setdefault("tags", []).extend(["product=hermes-agent"]) + if merged_extra: + kwargs["extra_body"] = merged_extra + + return kwargs + + +def call_llm( + task: str = None, + *, + provider: str = None, + model: str = None, + messages: list, + temperature: float = None, + max_tokens: int = None, + tools: list = None, + timeout: float = 30.0, + extra_body: dict = None, +) -> Any: + """Centralized synchronous LLM call. + + Resolves provider + model (from task config, explicit args, or auto-detect), + handles auth, request formatting, and model-specific arg adjustments. + + Args: + task: Auxiliary task name ("compression", "vision", "web_extract", + "session_search", "skills_hub", "mcp", "flush_memories"). + Reads provider:model from config/env. Ignored if provider is set. + provider: Explicit provider override. + model: Explicit model override. + messages: Chat messages list. + temperature: Sampling temperature (None = provider default). + max_tokens: Max output tokens (handles max_tokens vs max_completion_tokens). + tools: Tool definitions (for function calling). + timeout: Request timeout in seconds. + extra_body: Additional request body fields. + + Returns: + Response object with .choices[0].message.content + + Raises: + RuntimeError: If no provider is configured. + """ + resolved_provider, resolved_model = _resolve_task_provider_model( + task, provider, model) + + client, final_model = _get_cached_client(resolved_provider, resolved_model) + if client is None: + # Fallback: try openrouter + if resolved_provider != "openrouter": + logger.warning("Provider %s unavailable, falling back to openrouter", + resolved_provider) + client, final_model = _get_cached_client( + "openrouter", resolved_model or _OPENROUTER_MODEL) + if client is None: + raise RuntimeError( + f"No LLM provider configured for task={task} provider={resolved_provider}. " + f"Run: hermes setup") + + kwargs = _build_call_kwargs( + resolved_provider, final_model, messages, + temperature=temperature, max_tokens=max_tokens, + tools=tools, timeout=timeout, extra_body=extra_body) + + # Handle max_tokens vs max_completion_tokens retry + try: + return client.chat.completions.create(**kwargs) + except Exception as first_err: + err_str = str(first_err) + if "max_tokens" in err_str or "unsupported_parameter" in err_str: + kwargs.pop("max_tokens", None) + kwargs["max_completion_tokens"] = max_tokens + return client.chat.completions.create(**kwargs) + raise + + +async def async_call_llm( + task: str = None, + *, + provider: str = None, + model: str = None, + messages: list, + temperature: float = None, + max_tokens: int = None, + tools: list = None, + timeout: float = 30.0, + extra_body: dict = None, +) -> Any: + """Centralized asynchronous LLM call. + + Same as call_llm() but async. See call_llm() for full documentation. + """ + resolved_provider, resolved_model = _resolve_task_provider_model( + task, provider, model) + + client, final_model = _get_cached_client( + resolved_provider, resolved_model, async_mode=True) + if client is None: + if resolved_provider != "openrouter": + logger.warning("Provider %s unavailable, falling back to openrouter", + resolved_provider) + client, final_model = _get_cached_client( + "openrouter", resolved_model or _OPENROUTER_MODEL, + async_mode=True) + if client is None: + raise RuntimeError( + f"No LLM provider configured for task={task} provider={resolved_provider}. " + f"Run: hermes setup") + + kwargs = _build_call_kwargs( + resolved_provider, final_model, messages, + temperature=temperature, max_tokens=max_tokens, + tools=tools, timeout=timeout, extra_body=extra_body) + + try: + return await client.chat.completions.create(**kwargs) + except Exception as first_err: + err_str = str(first_err) + if "max_tokens" in err_str or "unsupported_parameter" in err_str: + kwargs.pop("max_tokens", None) + kwargs["max_completion_tokens"] = max_tokens + return await client.chat.completions.create(**kwargs) + raise diff --git a/agent/context_compressor.py b/agent/context_compressor.py index fae483fd..a0ca0c99 100644 --- a/agent/context_compressor.py +++ b/agent/context_compressor.py @@ -9,7 +9,7 @@ import logging import os from typing import Any, Dict, List, Optional -from agent.auxiliary_client import get_text_auxiliary_client +from agent.auxiliary_client import call_llm from agent.model_metadata import ( get_model_context_length, estimate_messages_tokens_rough, @@ -53,8 +53,7 @@ class ContextCompressor: self.last_completion_tokens = 0 self.last_total_tokens = 0 - self.client, default_model = get_text_auxiliary_client("compression") - self.summary_model = summary_model_override or default_model + self.summary_model = summary_model_override or "" def update_from_response(self, usage: Dict[str, Any]): """Update tracked token usage from API response.""" @@ -120,73 +119,30 @@ TURNS TO SUMMARIZE: Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix.""" - # 1. Try the auxiliary model (cheap/fast) - if self.client: - try: - return self._call_summary_model(self.client, self.summary_model, prompt) - except Exception as e: - logging.warning(f"Failed to generate context summary with auxiliary model: {e}") - - # 2. Fallback: re-try via the centralized provider router. - # This covers all configured providers (Codex OAuth, API-key - # providers, etc.) without ad-hoc env var lookups. - from agent.auxiliary_client import resolve_provider_client - fallback_providers = ["custom", "openrouter", "nous", "codex"] - for fb_provider in fallback_providers: - try: - fb_client, fb_model = resolve_provider_client( - fb_provider, model=self.model) - if fb_client is None: - continue - # Don't retry the same client that just failed - if (self.client is not None - and hasattr(fb_client, "base_url") - and hasattr(self.client, "base_url") - and str(fb_client.base_url) == str(self.client.base_url)): - continue - logger.info("Retrying context summary with fallback provider " - "%s (%s)", fb_provider, fb_model) - summary = self._call_summary_model(fb_client, fb_model, prompt) - # Promote successful fallback for future compressions - self.client = fb_client - self.summary_model = fb_model - return summary - except Exception as fallback_err: - logging.warning("Fallback provider %s failed: %s", - fb_provider, fallback_err) - - # 3. All providers failed — return None so the caller drops turns - # without a summary. - logging.warning("Context compression: no provider available for " - "summary. Middle turns will be dropped without summary.") - return None - - def _call_summary_model(self, client, model: str, prompt: str) -> str: - """Make the actual LLM call to generate a summary. Raises on failure.""" - kwargs = { - "model": model, - "messages": [{"role": "user", "content": prompt}], - "temperature": 0.3, - "timeout": 30.0, - } - # Most providers (OpenRouter, local models) use max_tokens. - # Direct OpenAI with newer models (gpt-4o, o-series, gpt-5+) - # requires max_completion_tokens instead. + # Use the centralized LLM router — handles provider resolution, + # auth, and fallback internally. try: - kwargs["max_tokens"] = self.summary_target_tokens * 2 - response = client.chat.completions.create(**kwargs) - except Exception as first_err: - if "max_tokens" in str(first_err) or "unsupported_parameter" in str(first_err): - kwargs.pop("max_tokens", None) - kwargs["max_completion_tokens"] = self.summary_target_tokens * 2 - response = client.chat.completions.create(**kwargs) - else: - raise - - summary = response.choices[0].message.content.strip() - if not summary.startswith("[CONTEXT SUMMARY]:"): - summary = "[CONTEXT SUMMARY]: " + summary - return summary + call_kwargs = { + "task": "compression", + "messages": [{"role": "user", "content": prompt}], + "temperature": 0.3, + "max_tokens": self.summary_target_tokens * 2, + "timeout": 30.0, + } + if self.summary_model: + call_kwargs["model"] = self.summary_model + response = call_llm(**call_kwargs) + summary = response.choices[0].message.content.strip() + if not summary.startswith("[CONTEXT SUMMARY]:"): + summary = "[CONTEXT SUMMARY]: " + summary + return summary + except RuntimeError: + logging.warning("Context compression: no provider available for " + "summary. Middle turns will be dropped without summary.") + return None + except Exception as e: + logging.warning("Failed to generate context summary: %s", e) + return None # ------------------------------------------------------------------ # Tool-call / tool-result pair integrity helpers diff --git a/hermes_cli/config.py b/hermes_cli/config.py index 677de678..99008978 100644 --- a/hermes_cli/config.py +++ b/hermes_cli/config.py @@ -125,17 +125,41 @@ DEFAULT_CONFIG = { "summary_provider": "auto", }, - # Auxiliary model overrides (advanced). By default Hermes auto-selects - # the provider and model for each side task. Set these to override. + # Auxiliary model config — provider:model for each side task. + # Format: provider is the provider name, model is the model slug. + # "auto" for provider = auto-detect best available provider. + # Empty model = use provider's default auxiliary model. + # All tasks fall back to openrouter:google/gemini-3-flash-preview if + # the configured provider is unavailable. "auxiliary": { "vision": { - "provider": "auto", # auto | openrouter | nous | main + "provider": "auto", # auto | openrouter | nous | codex | custom "model": "", # e.g. "google/gemini-2.5-flash", "gpt-4o" }, "web_extract": { "provider": "auto", "model": "", }, + "compression": { + "provider": "auto", + "model": "", + }, + "session_search": { + "provider": "auto", + "model": "", + }, + "skills_hub": { + "provider": "auto", + "model": "", + }, + "mcp": { + "provider": "auto", + "model": "", + }, + "flush_memories": { + "provider": "auto", + "model": "", + }, }, "display": { @@ -217,7 +241,7 @@ DEFAULT_CONFIG = { "personalities": {}, # Config schema version - bump this when adding new required fields - "_config_version": 6, + "_config_version": 7, } # ============================================================================= diff --git a/run_agent.py b/run_agent.py index db35d85f..8849d25c 100644 --- a/run_agent.py +++ b/run_agent.py @@ -2623,19 +2623,22 @@ class AIAgent: # Use auxiliary client for the flush call when available -- # it's cheaper and avoids Codex Responses API incompatibility. - from agent.auxiliary_client import get_text_auxiliary_client - aux_client, aux_model = get_text_auxiliary_client() + from agent.auxiliary_client import call_llm as _call_llm + _aux_available = True + try: + response = _call_llm( + task="flush_memories", + messages=api_messages, + tools=[memory_tool_def], + temperature=0.3, + max_tokens=5120, + timeout=30.0, + ) + except RuntimeError: + _aux_available = False + response = None - if aux_client: - api_kwargs = { - "model": aux_model, - "messages": api_messages, - "tools": [memory_tool_def], - "temperature": 0.3, - "max_tokens": 5120, - } - response = aux_client.chat.completions.create(**api_kwargs, timeout=30.0) - elif self.api_mode == "codex_responses": + if not _aux_available and self.api_mode == "codex_responses": # No auxiliary client -- use the Codex Responses path directly codex_kwargs = self._build_api_kwargs(api_messages) codex_kwargs["tools"] = self._responses_tools([memory_tool_def]) @@ -2643,7 +2646,7 @@ class AIAgent: if "max_output_tokens" in codex_kwargs: codex_kwargs["max_output_tokens"] = 5120 response = self._run_codex_stream(codex_kwargs) - else: + elif not _aux_available: api_kwargs = { "model": self.model, "messages": api_messages, @@ -2655,7 +2658,7 @@ class AIAgent: # Extract tool calls from the response, handling both API formats tool_calls = [] - if self.api_mode == "codex_responses" and not aux_client: + if self.api_mode == "codex_responses" and not _aux_available: assistant_msg, _ = self._normalize_codex_response(response) if assistant_msg and assistant_msg.tool_calls: tool_calls = assistant_msg.tool_calls diff --git a/tests/agent/test_context_compressor.py b/tests/agent/test_context_compressor.py index 12fa374c..82ee9350 100644 --- a/tests/agent/test_context_compressor.py +++ b/tests/agent/test_context_compressor.py @@ -9,8 +9,7 @@ from agent.context_compressor import ContextCompressor @pytest.fixture() def compressor(): """Create a ContextCompressor with mocked dependencies.""" - with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ - patch("agent.context_compressor.get_text_auxiliary_client", return_value=(None, None)): + with patch("agent.context_compressor.get_model_context_length", return_value=100000): c = ContextCompressor( model="test/model", threshold_percent=0.85, @@ -119,14 +118,11 @@ class TestGenerateSummaryNoneContent: """Regression: content=None (from tool-call-only assistant messages) must not crash.""" def test_none_content_does_not_crash(self): - mock_client = MagicMock() mock_response = MagicMock() mock_response.choices = [MagicMock()] mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: tool calls happened" - mock_client.chat.completions.create.return_value = mock_response - with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ - patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")): + with patch("agent.context_compressor.get_model_context_length", return_value=100000): c = ContextCompressor(model="test", quiet_mode=True) messages = [ @@ -139,14 +135,14 @@ class TestGenerateSummaryNoneContent: {"role": "user", "content": "thanks"}, ] - summary = c._generate_summary(messages) + with patch("agent.context_compressor.call_llm", return_value=mock_response): + summary = c._generate_summary(messages) assert isinstance(summary, str) assert "CONTEXT SUMMARY" in summary def test_none_content_in_system_message_compress(self): """System message with content=None should not crash during compress.""" - with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ - patch("agent.context_compressor.get_text_auxiliary_client", return_value=(None, None)): + with patch("agent.context_compressor.get_model_context_length", return_value=100000): c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2) msgs = [{"role": "system", "content": None}] + [ @@ -165,12 +161,12 @@ class TestCompressWithClient: mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened" mock_client.chat.completions.create.return_value = mock_response - with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ - patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")): + with patch("agent.context_compressor.get_model_context_length", return_value=100000): c = ContextCompressor(model="test", quiet_mode=True) msgs = [{"role": "user" if i % 2 == 0 else "assistant", "content": f"msg {i}"} for i in range(10)] - result = c.compress(msgs) + with patch("agent.context_compressor.call_llm", return_value=mock_response): + result = c.compress(msgs) # Should have summary message in the middle contents = [m.get("content", "") for m in result] @@ -184,8 +180,7 @@ class TestCompressWithClient: mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compressed middle" mock_client.chat.completions.create.return_value = mock_response - with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ - patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")): + with patch("agent.context_compressor.get_model_context_length", return_value=100000): c = ContextCompressor( model="test", quiet_mode=True, @@ -212,7 +207,8 @@ class TestCompressWithClient: {"role": "user", "content": "later 4"}, ] - result = c.compress(msgs) + with patch("agent.context_compressor.call_llm", return_value=mock_response): + result = c.compress(msgs) answered_ids = { msg.get("tool_call_id") @@ -232,8 +228,7 @@ class TestCompressWithClient: mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened" mock_client.chat.completions.create.return_value = mock_response - with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ - patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")): + with patch("agent.context_compressor.get_model_context_length", return_value=100000): c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=2, protect_last_n=2) # Last head message (index 1) is "assistant" → summary should be "user" @@ -245,7 +240,8 @@ class TestCompressWithClient: {"role": "user", "content": "msg 4"}, {"role": "assistant", "content": "msg 5"}, ] - result = c.compress(msgs) + with patch("agent.context_compressor.call_llm", return_value=mock_response): + result = c.compress(msgs) summary_msg = [m for m in result if "CONTEXT SUMMARY" in (m.get("content") or "")] assert len(summary_msg) == 1 assert summary_msg[0]["role"] == "user" @@ -258,8 +254,7 @@ class TestCompressWithClient: mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: stuff happened" mock_client.chat.completions.create.return_value = mock_response - with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ - patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")): + with patch("agent.context_compressor.get_model_context_length", return_value=100000): c = ContextCompressor(model="test", quiet_mode=True, protect_first_n=3, protect_last_n=2) # Last head message (index 2) is "user" → summary should be "assistant" @@ -273,20 +268,18 @@ class TestCompressWithClient: {"role": "user", "content": "msg 6"}, {"role": "assistant", "content": "msg 7"}, ] - result = c.compress(msgs) + with patch("agent.context_compressor.call_llm", return_value=mock_response): + result = c.compress(msgs) summary_msg = [m for m in result if "CONTEXT SUMMARY" in (m.get("content") or "")] assert len(summary_msg) == 1 assert summary_msg[0]["role"] == "assistant" def test_summarization_does_not_start_tail_with_tool_outputs(self): - mock_client = MagicMock() mock_response = MagicMock() mock_response.choices = [MagicMock()] mock_response.choices[0].message.content = "[CONTEXT SUMMARY]: compressed middle" - mock_client.chat.completions.create.return_value = mock_response - with patch("agent.context_compressor.get_model_context_length", return_value=100000), \ - patch("agent.context_compressor.get_text_auxiliary_client", return_value=(mock_client, "test-model")): + with patch("agent.context_compressor.get_model_context_length", return_value=100000): c = ContextCompressor( model="test", quiet_mode=True, @@ -309,7 +302,8 @@ class TestCompressWithClient: {"role": "user", "content": "latest user"}, ] - result = c.compress(msgs) + with patch("agent.context_compressor.call_llm", return_value=mock_response): + result = c.compress(msgs) called_ids = { tc["id"] diff --git a/tests/tools/test_mcp_tool.py b/tests/tools/test_mcp_tool.py index 446f80d3..0d527e95 100644 --- a/tests/tools/test_mcp_tool.py +++ b/tests/tools/test_mcp_tool.py @@ -1828,8 +1828,8 @@ class TestSamplingCallbackText: ) with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): params = _make_sampling_params() result = asyncio.run(self.handler(None, params)) @@ -1847,13 +1847,13 @@ class TestSamplingCallbackText: fake_client.chat.completions.create.return_value = _make_llm_response() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), - ): + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, + ) as mock_call: params = _make_sampling_params(system_prompt="Be helpful") asyncio.run(self.handler(None, params)) - call_args = fake_client.chat.completions.create.call_args + call_args = mock_call.call_args messages = call_args.kwargs["messages"] assert messages[0] == {"role": "system", "content": "Be helpful"} @@ -1865,8 +1865,8 @@ class TestSamplingCallbackText: ) with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): params = _make_sampling_params() result = asyncio.run(self.handler(None, params)) @@ -1889,8 +1889,8 @@ class TestSamplingCallbackToolUse: fake_client.chat.completions.create.return_value = _make_llm_tool_response() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): params = _make_sampling_params() result = asyncio.run(self.handler(None, params)) @@ -1916,8 +1916,8 @@ class TestSamplingCallbackToolUse: ) with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): result = asyncio.run(self.handler(None, _make_sampling_params())) @@ -1939,8 +1939,8 @@ class TestToolLoopGovernance: fake_client.chat.completions.create.return_value = _make_llm_tool_response() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): params = _make_sampling_params() # Round 1, 2: allowed @@ -1959,8 +1959,8 @@ class TestToolLoopGovernance: fake_client = MagicMock() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): # Tool response (round 1 of 1 allowed) fake_client.chat.completions.create.return_value = _make_llm_tool_response() @@ -1984,8 +1984,8 @@ class TestToolLoopGovernance: fake_client.chat.completions.create.return_value = _make_llm_tool_response() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): result = asyncio.run(handler(None, _make_sampling_params())) assert isinstance(result, ErrorData) @@ -2003,8 +2003,8 @@ class TestSamplingErrors: fake_client.chat.completions.create.return_value = _make_llm_response() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): # First call succeeds r1 = asyncio.run(handler(None, _make_sampling_params())) @@ -2017,20 +2017,16 @@ class TestSamplingErrors: def test_timeout_error(self): handler = SamplingHandler("to", {"timeout": 0.05}) - fake_client = MagicMock() def slow_call(**kwargs): import threading - # Use an event to ensure the thread truly blocks long enough evt = threading.Event() evt.wait(5) # blocks for up to 5 seconds (cancelled by timeout) return _make_llm_response() - fake_client.chat.completions.create.side_effect = slow_call - with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + side_effect=slow_call, ): result = asyncio.run(handler(None, _make_sampling_params())) assert isinstance(result, ErrorData) @@ -2041,12 +2037,11 @@ class TestSamplingErrors: handler = SamplingHandler("np", {}) with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(None, None), + "agent.auxiliary_client.call_llm", + side_effect=RuntimeError("No LLM provider configured"), ): result = asyncio.run(handler(None, _make_sampling_params())) assert isinstance(result, ErrorData) - assert "No LLM provider" in result.message assert handler.metrics["errors"] == 1 def test_empty_choices_returns_error(self): @@ -2060,8 +2055,8 @@ class TestSamplingErrors: ) with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): result = asyncio.run(handler(None, _make_sampling_params())) @@ -2080,8 +2075,8 @@ class TestSamplingErrors: ) with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): result = asyncio.run(handler(None, _make_sampling_params())) @@ -2099,8 +2094,8 @@ class TestSamplingErrors: ) with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): result = asyncio.run(handler(None, _make_sampling_params())) @@ -2120,8 +2115,8 @@ class TestModelWhitelist: fake_client.chat.completions.create.return_value = _make_llm_response() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "test-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): result = asyncio.run(handler(None, _make_sampling_params())) assert isinstance(result, CreateMessageResult) @@ -2131,8 +2126,8 @@ class TestModelWhitelist: fake_client = MagicMock() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "gpt-3.5-turbo"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): result = asyncio.run(handler(None, _make_sampling_params())) assert isinstance(result, ErrorData) @@ -2145,8 +2140,8 @@ class TestModelWhitelist: fake_client.chat.completions.create.return_value = _make_llm_response() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "any-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): result = asyncio.run(handler(None, _make_sampling_params())) assert isinstance(result, CreateMessageResult) @@ -2166,8 +2161,8 @@ class TestMalformedToolCallArgs: ) with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): result = asyncio.run(handler(None, _make_sampling_params())) @@ -2194,8 +2189,8 @@ class TestMalformedToolCallArgs: fake_client.chat.completions.create.return_value = response with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): result = asyncio.run(handler(None, _make_sampling_params())) @@ -2214,8 +2209,8 @@ class TestMetricsTracking: fake_client.chat.completions.create.return_value = _make_llm_response() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): asyncio.run(handler(None, _make_sampling_params())) @@ -2229,8 +2224,8 @@ class TestMetricsTracking: fake_client.chat.completions.create.return_value = _make_llm_tool_response() with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(fake_client, "default-model"), + "agent.auxiliary_client.call_llm", + return_value=fake_client.chat.completions.create.return_value, ): asyncio.run(handler(None, _make_sampling_params())) @@ -2241,8 +2236,8 @@ class TestMetricsTracking: handler = SamplingHandler("met3", {}) with patch( - "agent.auxiliary_client.get_text_auxiliary_client", - return_value=(None, None), + "agent.auxiliary_client.call_llm", + side_effect=RuntimeError("No LLM provider configured"), ): asyncio.run(handler(None, _make_sampling_params())) diff --git a/tools/browser_tool.py b/tools/browser_tool.py index dd44549b..ae951574 100644 --- a/tools/browser_tool.py +++ b/tools/browser_tool.py @@ -63,7 +63,7 @@ import time import requests from typing import Dict, Any, Optional, List from pathlib import Path -from agent.auxiliary_client import get_vision_auxiliary_client, get_text_auxiliary_client +from agent.auxiliary_client import call_llm logger = logging.getLogger(__name__) @@ -80,38 +80,15 @@ DEFAULT_SESSION_TIMEOUT = 300 # Max tokens for snapshot content before summarization SNAPSHOT_SUMMARIZE_THRESHOLD = 8000 -# Vision client — for browser_vision (screenshot analysis) -# Wrapped in try/except so a broken auxiliary config doesn't prevent the entire -# browser_tool module from importing (which would disable all 10 browser tools). -try: - _aux_vision_client, _DEFAULT_VISION_MODEL = get_vision_auxiliary_client() -except Exception as _init_err: - logger.debug("Could not initialise vision auxiliary client: %s", _init_err) - _aux_vision_client, _DEFAULT_VISION_MODEL = None, None -# Text client — for page snapshot summarization (same config as web_extract) -try: - _aux_text_client, _DEFAULT_TEXT_MODEL = get_text_auxiliary_client("web_extract") -except Exception as _init_err: - logger.debug("Could not initialise text auxiliary client: %s", _init_err) - _aux_text_client, _DEFAULT_TEXT_MODEL = None, None - -# Module-level alias for availability checks -EXTRACTION_MODEL = _DEFAULT_TEXT_MODEL or _DEFAULT_VISION_MODEL - - -def _get_vision_model() -> str: +def _get_vision_model() -> Optional[str]: """Model for browser_vision (screenshot analysis — multimodal).""" - return (os.getenv("AUXILIARY_VISION_MODEL", "").strip() - or _DEFAULT_VISION_MODEL - or "google/gemini-3-flash-preview") + return os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None -def _get_extraction_model() -> str: +def _get_extraction_model() -> Optional[str]: """Model for page snapshot text summarization — same as web_extract.""" - return (os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() - or _DEFAULT_TEXT_MODEL - or "google/gemini-3-flash-preview") + return os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() or None def _is_local_mode() -> bool: @@ -941,9 +918,6 @@ def _extract_relevant_content( Falls back to simple truncation when no auxiliary text model is configured. """ - if _aux_text_client is None: - return _truncate_snapshot(snapshot_text) - if user_task: extraction_prompt = ( f"You are a content extractor for a browser automation agent.\n\n" @@ -968,13 +942,16 @@ def _extract_relevant_content( ) try: - from agent.auxiliary_client import auxiliary_max_tokens_param - response = _aux_text_client.chat.completions.create( - model=_get_extraction_model(), - messages=[{"role": "user", "content": extraction_prompt}], - **auxiliary_max_tokens_param(4000), - temperature=0.1, - ) + call_kwargs = { + "task": "web_extract", + "messages": [{"role": "user", "content": extraction_prompt}], + "max_tokens": 4000, + "temperature": 0.1, + } + model = _get_extraction_model() + if model: + call_kwargs["model"] = model + response = call_llm(**call_kwargs) return response.choices[0].message.content except Exception: return _truncate_snapshot(snapshot_text) @@ -1497,14 +1474,6 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] effective_task_id = task_id or "default" - # Check auxiliary vision client - if _aux_vision_client is None or _DEFAULT_VISION_MODEL is None: - return json.dumps({ - "success": False, - "error": "Browser vision unavailable: no auxiliary vision model configured. " - "Set OPENROUTER_API_KEY or configure Nous Portal to enable browser vision." - }, ensure_ascii=False) - # Save screenshot to persistent location so it can be shared with users hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes")) screenshots_dir = hermes_home / "browser_screenshots" @@ -1562,14 +1531,13 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] f"Focus on answering the user's specific question." ) - # Use the sync auxiliary vision client directly - from agent.auxiliary_client import auxiliary_max_tokens_param + # Use the centralized LLM router vision_model = _get_vision_model() - logger.debug("browser_vision: analysing screenshot (%d bytes) with model=%s", - len(image_data), vision_model) - response = _aux_vision_client.chat.completions.create( - model=vision_model, - messages=[ + logger.debug("browser_vision: analysing screenshot (%d bytes)", + len(image_data)) + call_kwargs = { + "task": "vision", + "messages": [ { "role": "user", "content": [ @@ -1578,9 +1546,12 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str] ], } ], - **auxiliary_max_tokens_param(2000), - temperature=0.1, - ) + "max_tokens": 2000, + "temperature": 0.1, + } + if vision_model: + call_kwargs["model"] = vision_model + response = call_llm(**call_kwargs) analysis = response.choices[0].message.content response_data = { diff --git a/tools/mcp_tool.py b/tools/mcp_tool.py index b0fc35f7..e1137909 100644 --- a/tools/mcp_tool.py +++ b/tools/mcp_tool.py @@ -456,17 +456,13 @@ class SamplingHandler: # Resolve model model = self._resolve_model(getattr(params, "modelPreferences", None)) - # Get auxiliary LLM client - from agent.auxiliary_client import get_text_auxiliary_client - client, default_model = get_text_auxiliary_client() - if client is None: - self.metrics["errors"] += 1 - return self._error("No LLM provider available for sampling") + # Get auxiliary LLM client via centralized router + from agent.auxiliary_client import call_llm - resolved_model = model or default_model + # Model whitelist check (we need to resolve model before calling) + resolved_model = model or self.model_override or "" - # Model whitelist check - if self.allowed_models and resolved_model not in self.allowed_models: + if self.allowed_models and resolved_model and resolved_model not in self.allowed_models: logger.warning( "MCP server '%s' requested model '%s' not in allowed_models", self.server_name, resolved_model, @@ -484,20 +480,15 @@ class SamplingHandler: # Build LLM call kwargs max_tokens = min(params.maxTokens, self.max_tokens_cap) - call_kwargs: dict = { - "model": resolved_model, - "messages": messages, - "max_tokens": max_tokens, - } + call_temperature = None if hasattr(params, "temperature") and params.temperature is not None: - call_kwargs["temperature"] = params.temperature - if stop := getattr(params, "stopSequences", None): - call_kwargs["stop"] = stop + call_temperature = params.temperature # Forward server-provided tools + call_tools = None server_tools = getattr(params, "tools", None) if server_tools: - call_kwargs["tools"] = [ + call_tools = [ { "type": "function", "function": { @@ -508,9 +499,6 @@ class SamplingHandler: } for t in server_tools ] - if tool_choice := getattr(params, "toolChoice", None): - mode = getattr(tool_choice, "mode", "auto") - call_kwargs["tool_choice"] = {"auto": "auto", "required": "required", "none": "none"}.get(mode, "auto") logger.log( self.audit_level, @@ -520,7 +508,15 @@ class SamplingHandler: # Offload sync LLM call to thread (non-blocking) def _sync_call(): - return client.chat.completions.create(**call_kwargs) + return call_llm( + task="mcp", + model=resolved_model or None, + messages=messages, + temperature=call_temperature, + max_tokens=max_tokens, + tools=call_tools, + timeout=self.timeout, + ) try: response = await asyncio.wait_for( diff --git a/tools/session_search_tool.py b/tools/session_search_tool.py index 4bf88cbf..cd1b98fd 100644 --- a/tools/session_search_tool.py +++ b/tools/session_search_tool.py @@ -22,13 +22,7 @@ import os import logging from typing import Dict, Any, List, Optional, Union -from openai import AsyncOpenAI, OpenAI - -from agent.auxiliary_client import get_async_text_auxiliary_client - -# Resolve the async auxiliary client at import time so we have the model slug. -# Handles Codex Responses API adapter transparently. -_async_aux_client, _SUMMARIZER_MODEL = get_async_text_auxiliary_client() +from agent.auxiliary_client import async_call_llm MAX_SESSION_CHARS = 100_000 MAX_SUMMARY_TOKENS = 10000 @@ -156,26 +150,22 @@ async def _summarize_session( f"Summarize this conversation with focus on: {query}" ) - if _async_aux_client is None or _SUMMARIZER_MODEL is None: - logging.warning("No auxiliary model available for session summarization") - return None - max_retries = 3 for attempt in range(max_retries): try: - from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param - _extra = get_auxiliary_extra_body() - response = await _async_aux_client.chat.completions.create( - model=_SUMMARIZER_MODEL, + response = await async_call_llm( + task="session_search", messages=[ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt}, ], - **({} if not _extra else {"extra_body": _extra}), temperature=0.1, - **auxiliary_max_tokens_param(MAX_SUMMARY_TOKENS), + max_tokens=MAX_SUMMARY_TOKENS, ) return response.choices[0].message.content.strip() + except RuntimeError: + logging.warning("No auxiliary model available for session summarization") + return None except Exception as e: if attempt < max_retries - 1: await asyncio.sleep(1 * (attempt + 1)) @@ -333,8 +323,6 @@ def session_search( def check_session_search_requirements() -> bool: """Requires SQLite state database and an auxiliary text model.""" - if _async_aux_client is None: - return False try: from hermes_state import DEFAULT_DB_PATH return DEFAULT_DB_PATH.parent.exists() diff --git a/tools/skills_guard.py b/tools/skills_guard.py index 8234b0a2..c354d654 100644 --- a/tools/skills_guard.py +++ b/tools/skills_guard.py @@ -936,13 +936,10 @@ def llm_audit_skill(skill_path: Path, static_result: ScanResult, # Call the LLM via the centralized provider router try: - from agent.auxiliary_client import resolve_provider_client + from agent.auxiliary_client import call_llm - client, _default_model = resolve_provider_client("openrouter") - if client is None: - return static_result - - response = client.chat.completions.create( + response = call_llm( + provider="openrouter", model=model, messages=[{ "role": "user", diff --git a/tools/vision_tools.py b/tools/vision_tools.py index ee89b58a..c1b09a22 100644 --- a/tools/vision_tools.py +++ b/tools/vision_tools.py @@ -37,16 +37,11 @@ from pathlib import Path from typing import Any, Awaitable, Dict, Optional from urllib.parse import urlparse import httpx -from agent.auxiliary_client import get_async_vision_auxiliary_client +from agent.auxiliary_client import async_call_llm from tools.debug_helpers import DebugSession logger = logging.getLogger(__name__) -# Resolve vision auxiliary client at module level. -# Uses get_async_vision_auxiliary_client() which properly handles Codex -# routing (Responses API adapter) instead of raw AsyncOpenAI construction. -_aux_async_client, DEFAULT_VISION_MODEL = get_async_vision_auxiliary_client() - _debug = DebugSession("vision_tools", env_var="VISION_TOOLS_DEBUG") @@ -185,7 +180,7 @@ def _image_to_base64_data_url(image_path: Path, mime_type: Optional[str] = None) async def vision_analyze_tool( image_url: str, user_prompt: str, - model: str = DEFAULT_VISION_MODEL, + model: str = None, ) -> str: """ Analyze an image from a URL or local file path using vision AI. @@ -245,15 +240,6 @@ async def vision_analyze_tool( logger.info("Analyzing image: %s", image_url[:60]) logger.info("User prompt: %s", user_prompt[:100]) - # Check auxiliary vision client availability - if _aux_async_client is None or DEFAULT_VISION_MODEL is None: - logger.error("Vision analysis unavailable: no auxiliary vision model configured") - return json.dumps({ - "success": False, - "analysis": "Vision analysis unavailable: no auxiliary vision model configured. " - "Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools." - }, indent=2, ensure_ascii=False) - # Determine if this is a local file path or a remote URL local_path = Path(image_url) if local_path.is_file(): @@ -309,18 +295,18 @@ async def vision_analyze_tool( } ] - logger.info("Processing image with %s...", model) + logger.info("Processing image with vision model...") - # Call the vision API - from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param - _extra = get_auxiliary_extra_body() - response = await _aux_async_client.chat.completions.create( - model=model, - messages=messages, - temperature=0.1, - **auxiliary_max_tokens_param(2000), - **({} if not _extra else {"extra_body": _extra}), - ) + # Call the vision API via centralized router + call_kwargs = { + "task": "vision", + "messages": messages, + "temperature": 0.1, + "max_tokens": 2000, + } + if model: + call_kwargs["model"] = model + response = await async_call_llm(**call_kwargs) # Extract the analysis analysis = response.choices[0].message.content.strip() @@ -391,7 +377,18 @@ async def vision_analyze_tool( def check_vision_requirements() -> bool: """Check if an auxiliary vision model is available.""" - return _aux_async_client is not None + try: + from agent.auxiliary_client import resolve_provider_client + client, _ = resolve_provider_client("openrouter") + if client is not None: + return True + client, _ = resolve_provider_client("nous") + if client is not None: + return True + client, _ = resolve_provider_client("custom") + return client is not None + except Exception: + return False def get_debug_session_info() -> Dict[str, Any]: @@ -419,10 +416,9 @@ if __name__ == "__main__": print("Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools.") exit(1) else: - print(f"✅ Vision model available: {DEFAULT_VISION_MODEL}") + print("✅ Vision model available") print("🛠️ Vision tools ready for use!") - print(f"🧠 Using model: {DEFAULT_VISION_MODEL}") # Show debug mode status if _debug.active: @@ -489,9 +485,7 @@ def _handle_vision_analyze(args: Dict[str, Any], **kw: Any) -> Awaitable[str]: "Fully describe and explain everything about this image, then answer the " f"following question:\n\n{question}" ) - model = (os.getenv("AUXILIARY_VISION_MODEL", "").strip() - or DEFAULT_VISION_MODEL - or "google/gemini-3-flash-preview") + model = os.getenv("AUXILIARY_VISION_MODEL", "").strip() or None return vision_analyze_tool(image_url, full_prompt, model) diff --git a/tools/web_tools.py b/tools/web_tools.py index e99d94fb..71a882a5 100644 --- a/tools/web_tools.py +++ b/tools/web_tools.py @@ -47,8 +47,7 @@ import re import asyncio from typing import List, Dict, Any, Optional from firecrawl import Firecrawl -from openai import AsyncOpenAI -from agent.auxiliary_client import get_async_text_auxiliary_client +from agent.auxiliary_client import async_call_llm from tools.debug_helpers import DebugSession logger = logging.getLogger(__name__) @@ -83,15 +82,8 @@ def _get_firecrawl_client(): DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000 -# Resolve async auxiliary client at module level. -# Handles Codex Responses API adapter transparently. -_aux_async_client, _DEFAULT_SUMMARIZER_MODEL = get_async_text_auxiliary_client("web_extract") - -# Allow per-task override via config.yaml auxiliary.web_extract_model -DEFAULT_SUMMARIZER_MODEL = ( - os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() - or _DEFAULT_SUMMARIZER_MODEL -) +# Allow per-task override via env var +DEFAULT_SUMMARIZER_MODEL = os.getenv("AUXILIARY_WEB_EXTRACT_MODEL", "").strip() or None _debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG") @@ -249,22 +241,22 @@ Create a markdown summary that captures all key information in a well-organized, for attempt in range(max_retries): try: - if _aux_async_client is None: - logger.warning("No auxiliary model available for web content processing") - return None - from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param - _extra = get_auxiliary_extra_body() - response = await _aux_async_client.chat.completions.create( - model=model, - messages=[ + call_kwargs = { + "task": "web_extract", + "messages": [ {"role": "system", "content": system_prompt}, {"role": "user", "content": user_prompt} ], - temperature=0.1, - **auxiliary_max_tokens_param(max_tokens), - **({} if not _extra else {"extra_body": _extra}), - ) + "temperature": 0.1, + "max_tokens": max_tokens, + } + if model: + call_kwargs["model"] = model + response = await async_call_llm(**call_kwargs) return response.choices[0].message.content.strip() + except RuntimeError: + logger.warning("No auxiliary model available for web content processing") + return None except Exception as api_error: last_error = api_error if attempt < max_retries - 1: @@ -368,25 +360,18 @@ Synthesize these into ONE cohesive, comprehensive summary that: Create a single, unified markdown summary.""" try: - if _aux_async_client is None: - logger.warning("No auxiliary model for synthesis, concatenating summaries") - fallback = "\n\n".join(summaries) - if len(fallback) > max_output_size: - fallback = fallback[:max_output_size] + "\n\n[... truncated ...]" - return fallback - - from agent.auxiliary_client import get_auxiliary_extra_body, auxiliary_max_tokens_param - _extra = get_auxiliary_extra_body() - response = await _aux_async_client.chat.completions.create( - model=model, - messages=[ + call_kwargs = { + "task": "web_extract", + "messages": [ {"role": "system", "content": "You synthesize multiple summaries into one cohesive, comprehensive summary. Be thorough but concise."}, {"role": "user", "content": synthesis_prompt} ], - temperature=0.1, - **auxiliary_max_tokens_param(20000), - **({} if not _extra else {"extra_body": _extra}), - ) + "temperature": 0.1, + "max_tokens": 20000, + } + if model: + call_kwargs["model"] = model + response = await async_call_llm(**call_kwargs) final_summary = response.choices[0].message.content.strip() # Enforce hard cap @@ -713,8 +698,8 @@ async def web_extract_tool( debug_call_data["pages_extracted"] = pages_extracted debug_call_data["original_response_size"] = len(json.dumps(response)) - # Process each result with LLM if enabled and auxiliary client is available - if use_llm_processing and _aux_async_client is not None: + # Process each result with LLM if enabled + if use_llm_processing: logger.info("Processing extracted content with LLM (parallel)...") debug_call_data["processing_applied"].append("llm_processing") @@ -780,10 +765,6 @@ async def web_extract_tool( else: logger.warning("%s (no content to process)", url) else: - if use_llm_processing and _aux_async_client is None: - logger.warning("LLM processing requested but no auxiliary model available, returning raw content") - debug_call_data["processing_applied"].append("llm_processing_unavailable") - # Print summary of extracted pages for debugging (original behavior) for result in response.get('results', []): url = result.get('url', 'Unknown URL') @@ -1013,8 +994,8 @@ async def web_crawl_tool( debug_call_data["pages_crawled"] = pages_crawled debug_call_data["original_response_size"] = len(json.dumps(response)) - # Process each result with LLM if enabled and auxiliary client is available - if use_llm_processing and _aux_async_client is not None: + # Process each result with LLM if enabled + if use_llm_processing: logger.info("Processing crawled content with LLM (parallel)...") debug_call_data["processing_applied"].append("llm_processing") @@ -1080,10 +1061,6 @@ async def web_crawl_tool( else: logger.warning("%s (no content to process)", page_url) else: - if use_llm_processing and _aux_async_client is None: - logger.warning("LLM processing requested but no auxiliary model available, returning raw content") - debug_call_data["processing_applied"].append("llm_processing_unavailable") - # Print summary of crawled pages for debugging (original behavior) for result in response.get('results', []): page_url = result.get('url', 'Unknown URL') @@ -1138,7 +1115,15 @@ def check_firecrawl_api_key() -> bool: def check_auxiliary_model() -> bool: """Check if an auxiliary text model is available for LLM content processing.""" - return _aux_async_client is not None + try: + from agent.auxiliary_client import resolve_provider_client + for p in ("openrouter", "nous", "custom", "codex"): + client, _ = resolve_provider_client(p) + if client is not None: + return True + return False + except Exception: + return False def get_debug_session_info() -> Dict[str, Any]: diff --git a/trajectory_compressor.py b/trajectory_compressor.py index 5f1c84c6..ef81d6e2 100644 --- a/trajectory_compressor.py +++ b/trajectory_compressor.py @@ -344,28 +344,32 @@ class TrajectoryCompressor: raise RuntimeError(f"Failed to load tokenizer '{self.config.tokenizer_name}': {e}") def _init_summarizer(self): - """Initialize LLM client for summarization (sync and async). + """Initialize LLM routing for summarization (sync and async). - Routes through the centralized provider router for known providers - (OpenRouter, Nous, Codex, etc.) so auth and headers are handled - consistently. Falls back to raw construction for custom endpoints. + Uses call_llm/async_call_llm from the centralized provider router + which handles auth, headers, and provider detection internally. + For custom endpoints, falls back to raw client construction. """ - from agent.auxiliary_client import resolve_provider_client + from agent.auxiliary_client import call_llm, async_call_llm provider = self._detect_provider() if provider: - # Use centralized router — handles auth, headers, Codex adapter - self.client, _ = resolve_provider_client( + # Store provider for use in _generate_summary calls + self._llm_provider = provider + self._use_call_llm = True + # Verify the provider is available + from agent.auxiliary_client import resolve_provider_client + client, _ = resolve_provider_client( provider, model=self.config.summarization_model) - self.async_client, _ = resolve_provider_client( - provider, model=self.config.summarization_model, - async_mode=True) - if self.client is None: + if client is None: raise RuntimeError( f"Provider '{provider}' is not configured. " f"Check your API key or run: hermes setup") + self.client = None # Not used directly + self.async_client = None # Not used directly else: # Custom endpoint — use config's raw base_url + api_key_env + self._use_call_llm = False api_key = os.getenv(self.config.api_key_env) if not api_key: raise RuntimeError( @@ -524,12 +528,22 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix.""" try: metrics.summarization_api_calls += 1 - response = self.client.chat.completions.create( - model=self.config.summarization_model, - messages=[{"role": "user", "content": prompt}], - temperature=self.config.temperature, - max_tokens=self.config.summary_target_tokens * 2, - ) + if getattr(self, '_use_call_llm', False): + from agent.auxiliary_client import call_llm + response = call_llm( + provider=self._llm_provider, + model=self.config.summarization_model, + messages=[{"role": "user", "content": prompt}], + temperature=self.config.temperature, + max_tokens=self.config.summary_target_tokens * 2, + ) + else: + response = self.client.chat.completions.create( + model=self.config.summarization_model, + messages=[{"role": "user", "content": prompt}], + temperature=self.config.temperature, + max_tokens=self.config.summary_target_tokens * 2, + ) summary = response.choices[0].message.content.strip() @@ -581,12 +595,22 @@ Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix.""" try: metrics.summarization_api_calls += 1 - response = await self.async_client.chat.completions.create( - model=self.config.summarization_model, - messages=[{"role": "user", "content": prompt}], - temperature=self.config.temperature, - max_tokens=self.config.summary_target_tokens * 2, - ) + if getattr(self, '_use_call_llm', False): + from agent.auxiliary_client import async_call_llm + response = await async_call_llm( + provider=self._llm_provider, + model=self.config.summarization_model, + messages=[{"role": "user", "content": prompt}], + temperature=self.config.temperature, + max_tokens=self.config.summary_target_tokens * 2, + ) + else: + response = await self.async_client.chat.completions.create( + model=self.config.summarization_model, + messages=[{"role": "user", "content": prompt}], + temperature=self.config.temperature, + max_tokens=self.config.summary_target_tokens * 2, + ) summary = response.choices[0].message.content.strip() From 29ef69c703324fb75b567279ee6ed3d1bf6ab7dd Mon Sep 17 00:00:00 2001 From: teknium1 Date: Wed, 11 Mar 2026 21:06:54 -0700 Subject: [PATCH 05/11] fix: update all test mocks for call_llm migration Update 14 test files to use the new call_llm/async_call_llm mock patterns instead of the old get_text_auxiliary_client/ get_vision_auxiliary_client tuple returns. - vision_tools tests: mock async_call_llm instead of _aux_async_client - browser tests: mock call_llm instead of _aux_vision_client - flush_memories tests: mock call_llm instead of get_text_auxiliary_client - session_search tests: mock async_call_llm with RuntimeError - mcp_tool tests: fix whitelist model config, use side_effect for multi-response tests - auxiliary_config_bridge: update for model=None (resolved in router) 3251 passed, 2 pre-existing unrelated failures. --- tests/test_auxiliary_config_bridge.py | 7 ++++--- tests/test_flush_memories_codex.py | 25 +++++++++++-------------- tests/test_run_agent.py | 2 +- tests/tools/test_browser_console.py | 6 ++---- tests/tools/test_mcp_tool.py | 14 ++++++++------ tests/tools/test_session_search.py | 18 ++++++++---------- tests/tools/test_vision_tools.py | 23 ++++++----------------- 7 files changed, 40 insertions(+), 55 deletions(-) diff --git a/tests/test_auxiliary_config_bridge.py b/tests/test_auxiliary_config_bridge.py index b0804e4b..a4d65c2a 100644 --- a/tests/test_auxiliary_config_bridge.py +++ b/tests/test_auxiliary_config_bridge.py @@ -229,13 +229,14 @@ class TestVisionModelOverride: def test_default_model_when_no_override(self, monkeypatch): monkeypatch.delenv("AUXILIARY_VISION_MODEL", raising=False) - from tools.vision_tools import _handle_vision_analyze, DEFAULT_VISION_MODEL + from tools.vision_tools import _handle_vision_analyze with patch("tools.vision_tools.vision_analyze_tool", new_callable=MagicMock) as mock_tool: mock_tool.return_value = '{"success": true}' _handle_vision_analyze({"image_url": "http://test.jpg", "question": "test"}) call_args = mock_tool.call_args - expected = DEFAULT_VISION_MODEL or "google/gemini-3-flash-preview" - assert call_args[0][2] == expected + # With no AUXILIARY_VISION_MODEL env var, model should be None + # (the centralized call_llm router picks the provider default) + assert call_args[0][2] is None # ── DEFAULT_CONFIG shape tests ─────────────────────────────────────────────── diff --git a/tests/test_flush_memories_codex.py b/tests/test_flush_memories_codex.py index 22eef5ab..3d12c9d3 100644 --- a/tests/test_flush_memories_codex.py +++ b/tests/test_flush_memories_codex.py @@ -98,10 +98,9 @@ class TestFlushMemoriesUsesAuxiliaryClient: def test_flush_uses_auxiliary_when_available(self, monkeypatch): agent = _make_agent(monkeypatch, api_mode="codex_responses", provider="openai-codex") - mock_aux_client = MagicMock() - mock_aux_client.chat.completions.create.return_value = _chat_response_with_memory_call() + mock_response = _chat_response_with_memory_call() - with patch("agent.auxiliary_client.get_text_auxiliary_client", return_value=(mock_aux_client, "gpt-4o-mini")): + with patch("agent.auxiliary_client.call_llm", return_value=mock_response) as mock_call: messages = [ {"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi there"}, @@ -110,9 +109,9 @@ class TestFlushMemoriesUsesAuxiliaryClient: with patch("tools.memory_tool.memory_tool", return_value="Saved.") as mock_memory: agent.flush_memories(messages) - mock_aux_client.chat.completions.create.assert_called_once() - call_kwargs = mock_aux_client.chat.completions.create.call_args - assert call_kwargs.kwargs.get("model") == "gpt-4o-mini" or call_kwargs[1].get("model") == "gpt-4o-mini" + mock_call.assert_called_once() + call_kwargs = mock_call.call_args + assert call_kwargs.kwargs.get("task") == "flush_memories" def test_flush_uses_main_client_when_no_auxiliary(self, monkeypatch): """Non-Codex mode with no auxiliary falls back to self.client.""" @@ -120,7 +119,7 @@ class TestFlushMemoriesUsesAuxiliaryClient: agent.client = MagicMock() agent.client.chat.completions.create.return_value = _chat_response_with_memory_call() - with patch("agent.auxiliary_client.get_text_auxiliary_client", return_value=(None, None)): + with patch("agent.auxiliary_client.call_llm", side_effect=RuntimeError("no provider")): messages = [ {"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi there"}, @@ -135,10 +134,9 @@ class TestFlushMemoriesUsesAuxiliaryClient: """Verify that memory tool calls from the flush response actually get executed.""" agent = _make_agent(monkeypatch, api_mode="chat_completions", provider="openrouter") - mock_aux_client = MagicMock() - mock_aux_client.chat.completions.create.return_value = _chat_response_with_memory_call() + mock_response = _chat_response_with_memory_call() - with patch("agent.auxiliary_client.get_text_auxiliary_client", return_value=(mock_aux_client, "gpt-4o-mini")): + with patch("agent.auxiliary_client.call_llm", return_value=mock_response): messages = [ {"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi"}, @@ -157,10 +155,9 @@ class TestFlushMemoriesUsesAuxiliaryClient: """After flush, the flush prompt and any response should be removed from messages.""" agent = _make_agent(monkeypatch, api_mode="chat_completions", provider="openrouter") - mock_aux_client = MagicMock() - mock_aux_client.chat.completions.create.return_value = _chat_response_with_memory_call() + mock_response = _chat_response_with_memory_call() - with patch("agent.auxiliary_client.get_text_auxiliary_client", return_value=(mock_aux_client, "gpt-4o-mini")): + with patch("agent.auxiliary_client.call_llm", return_value=mock_response): messages = [ {"role": "user", "content": "Hello"}, {"role": "assistant", "content": "Hi"}, @@ -202,7 +199,7 @@ class TestFlushMemoriesCodexFallback: model="gpt-5-codex", ) - with patch("agent.auxiliary_client.get_text_auxiliary_client", return_value=(None, None)), \ + with patch("agent.auxiliary_client.call_llm", side_effect=RuntimeError("no provider")), \ patch.object(agent, "_run_codex_stream", return_value=codex_response) as mock_stream, \ patch.object(agent, "_build_api_kwargs") as mock_build, \ patch("tools.memory_tool.memory_tool", return_value="Saved.") as mock_memory: diff --git a/tests/test_run_agent.py b/tests/test_run_agent.py index a3a82283..c789d735 100644 --- a/tests/test_run_agent.py +++ b/tests/test_run_agent.py @@ -959,7 +959,7 @@ class TestFlushSentinelNotLeaked: agent.client.chat.completions.create.return_value = mock_response # Bypass auxiliary client so flush uses agent.client directly - with patch("agent.auxiliary_client.get_text_auxiliary_client", return_value=(None, None)): + with patch("agent.auxiliary_client.call_llm", side_effect=RuntimeError("no provider")): agent.flush_memories(messages, min_turns=0) # Check what was actually sent to the API diff --git a/tests/tools/test_browser_console.py b/tests/tools/test_browser_console.py index 962b49f0..f5f54a0b 100644 --- a/tests/tools/test_browser_console.py +++ b/tests/tools/test_browser_console.py @@ -137,8 +137,7 @@ class TestBrowserVisionAnnotate: with ( patch("tools.browser_tool._run_browser_command") as mock_cmd, - patch("tools.browser_tool._aux_vision_client") as mock_client, - patch("tools.browser_tool._DEFAULT_VISION_MODEL", "test-model"), + patch("tools.browser_tool.call_llm") as mock_call_llm, patch("tools.browser_tool._get_vision_model", return_value="test-model"), ): mock_cmd.return_value = {"success": True, "data": {}} @@ -159,8 +158,7 @@ class TestBrowserVisionAnnotate: with ( patch("tools.browser_tool._run_browser_command") as mock_cmd, - patch("tools.browser_tool._aux_vision_client") as mock_client, - patch("tools.browser_tool._DEFAULT_VISION_MODEL", "test-model"), + patch("tools.browser_tool.call_llm") as mock_call_llm, patch("tools.browser_tool._get_vision_model", return_value="test-model"), ): mock_cmd.return_value = {"success": True, "data": {}} diff --git a/tests/tools/test_mcp_tool.py b/tests/tools/test_mcp_tool.py index 0d527e95..bc3179ea 100644 --- a/tests/tools/test_mcp_tool.py +++ b/tests/tools/test_mcp_tool.py @@ -1956,24 +1956,26 @@ class TestToolLoopGovernance: def test_text_response_resets_counter(self): """A text response resets the tool loop counter.""" handler = SamplingHandler("tl2", {"max_tool_rounds": 1}) - fake_client = MagicMock() + + # Use a list to hold the current response, so the side_effect can + # pick up changes between calls. + responses = [_make_llm_tool_response()] with patch( "agent.auxiliary_client.call_llm", - return_value=fake_client.chat.completions.create.return_value, + side_effect=lambda **kw: responses[0], ): # Tool response (round 1 of 1 allowed) - fake_client.chat.completions.create.return_value = _make_llm_tool_response() r1 = asyncio.run(handler(None, _make_sampling_params())) assert isinstance(r1, CreateMessageResultWithTools) # Text response resets counter - fake_client.chat.completions.create.return_value = _make_llm_response() + responses[0] = _make_llm_response() r2 = asyncio.run(handler(None, _make_sampling_params())) assert isinstance(r2, CreateMessageResult) # Tool response again (should succeed since counter was reset) - fake_client.chat.completions.create.return_value = _make_llm_tool_response() + responses[0] = _make_llm_tool_response() r3 = asyncio.run(handler(None, _make_sampling_params())) assert isinstance(r3, CreateMessageResultWithTools) @@ -2122,7 +2124,7 @@ class TestModelWhitelist: assert isinstance(result, CreateMessageResult) def test_disallowed_model_rejected(self): - handler = SamplingHandler("wl2", {"allowed_models": ["gpt-4o"]}) + handler = SamplingHandler("wl2", {"allowed_models": ["gpt-4o"], "model": "test-model"}) fake_client = MagicMock() with patch( diff --git a/tests/tools/test_session_search.py b/tests/tools/test_session_search.py index 645e08ff..c3624714 100644 --- a/tests/tools/test_session_search.py +++ b/tests/tools/test_session_search.py @@ -189,16 +189,14 @@ class TestSessionSearch: {"role": "assistant", "content": "hi there"}, ] - # Mock the summarizer to return a simple summary - import tools.session_search_tool as sst - original_client = sst._async_aux_client - sst._async_aux_client = None # Disable summarizer → returns None - - result = json.loads(session_search( - query="test", db=mock_db, current_session_id=current_sid, - )) - - sst._async_aux_client = original_client + # Mock async_call_llm to raise RuntimeError → summarizer returns None + from unittest.mock import AsyncMock, patch as _patch + with _patch("tools.session_search_tool.async_call_llm", + new_callable=AsyncMock, + side_effect=RuntimeError("no provider")): + result = json.loads(session_search( + query="test", db=mock_db, current_session_id=current_sid, + )) assert result["success"] is True # Current session should be skipped, only other_sid should appear diff --git a/tests/tools/test_vision_tools.py b/tests/tools/test_vision_tools.py index 0135284a..6cfdc941 100644 --- a/tests/tools/test_vision_tools.py +++ b/tests/tools/test_vision_tools.py @@ -202,7 +202,7 @@ class TestHandleVisionAnalyze: assert model == "custom/model-v1" def test_falls_back_to_default_model(self): - """Without AUXILIARY_VISION_MODEL, should use DEFAULT_VISION_MODEL or fallback.""" + """Without AUXILIARY_VISION_MODEL, model should be None (let call_llm resolve default).""" with ( patch( "tools.vision_tools.vision_analyze_tool", new_callable=AsyncMock @@ -218,9 +218,9 @@ class TestHandleVisionAnalyze: coro.close() call_args = mock_tool.call_args model = call_args[0][2] - # Should be DEFAULT_VISION_MODEL or the hardcoded fallback - assert model is not None - assert len(model) > 0 + # With no AUXILIARY_VISION_MODEL set, model should be None + # (the centralized call_llm router picks the default) + assert model is None def test_empty_args_graceful(self): """Missing keys should default to empty strings, not raise.""" @@ -277,8 +277,6 @@ class TestErrorLoggingExcInfo: new_callable=AsyncMock, side_effect=Exception("download boom"), ), - patch("tools.vision_tools._aux_async_client", MagicMock()), - patch("tools.vision_tools.DEFAULT_VISION_MODEL", "test/model"), caplog.at_level(logging.ERROR, logger="tools.vision_tools"), ): result = await vision_analyze_tool( @@ -311,25 +309,16 @@ class TestErrorLoggingExcInfo: "tools.vision_tools._image_to_base64_data_url", return_value="data:image/jpeg;base64,abc", ), - patch("agent.auxiliary_client.get_auxiliary_extra_body", return_value=None), - patch( - "agent.auxiliary_client.auxiliary_max_tokens_param", - return_value={"max_tokens": 2000}, - ), caplog.at_level(logging.WARNING, logger="tools.vision_tools"), ): - # Mock the vision client - mock_client = AsyncMock() + # Mock the async_call_llm function to return a mock response mock_response = MagicMock() mock_choice = MagicMock() mock_choice.message.content = "A test image description" mock_response.choices = [mock_choice] - mock_client.chat.completions.create = AsyncMock(return_value=mock_response) - # Patch module-level _aux_async_client so the tool doesn't bail early with ( - patch("tools.vision_tools._aux_async_client", mock_client), - patch("tools.vision_tools.DEFAULT_VISION_MODEL", "test/model"), + patch("tools.vision_tools.async_call_llm", new_callable=AsyncMock, return_value=mock_response), ): # Make unlink fail to trigger cleanup warning original_unlink = Path.unlink From a29801286ff0997dc688e206c3144cfe4bc4bdf6 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Wed, 11 Mar 2026 21:38:29 -0700 Subject: [PATCH 06/11] refactor: route main agent client + fallback through centralized router MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 2 of the provider router migration — route the main agent's client construction and fallback activation through resolve_provider_client() instead of duplicated ad-hoc logic. run_agent.py: - __init__: When no explicit api_key/base_url, use resolve_provider_client(provider, raw_codex=True) for client construction. Explicit creds (from CLI/gateway runtime provider) still construct directly. - _try_activate_fallback: Replace _resolve_fallback_credentials and its duplicated _FALLBACK_API_KEY_PROVIDERS / _FALLBACK_OAUTH_PROVIDERS dicts with a single resolve_provider_client() call. The router handles all provider types (API-key, OAuth, Codex) centrally. - Remove _resolve_fallback_credentials method and both fallback dicts. agent/auxiliary_client.py: - Add raw_codex parameter to resolve_provider_client(). When True, returns the raw OpenAI client for Codex providers instead of wrapping in CodexAuxiliaryClient. The main agent needs this for direct responses.stream() access. 3251 passed, 2 pre-existing unrelated failures. --- agent/auxiliary_client.py | 17 +++ run_agent.py | 185 ++++++++++++-------------------- tests/test_fallback_model.py | 200 +++++++++++++++++++++-------------- 3 files changed, 206 insertions(+), 196 deletions(-) diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index 04afe4c7..19c2b8bd 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -536,6 +536,7 @@ def resolve_provider_client( provider: str, model: str = None, async_mode: bool = False, + raw_codex: bool = False, ) -> Tuple[Optional[Any], Optional[str]]: """Central router: given a provider name and optional model, return a configured client with the correct auth, base URL, and API format. @@ -553,6 +554,10 @@ def resolve_provider_client( model: Model slug override. If None, uses the provider's default auxiliary model. async_mode: If True, return an async-compatible client. + raw_codex: If True, return a raw OpenAI client for Codex providers + instead of wrapping in CodexAuxiliaryClient. Use this when + the caller needs direct access to responses.stream() (e.g., + the main agent loop). Returns: (client, resolved_model) or (None, None) if auth is unavailable. @@ -597,6 +602,18 @@ def resolve_provider_client( # ── OpenAI Codex (OAuth → Responses API) ───────────────────────── if provider == "openai-codex": + if raw_codex: + # Return the raw OpenAI client for callers that need direct + # access to responses.stream() (e.g., the main agent loop). + codex_token = _read_codex_access_token() + if not codex_token: + logger.warning("resolve_provider_client: openai-codex requested " + "but no Codex OAuth token found (run: hermes model)") + return None, None + final_model = model or _CODEX_AUX_MODEL + raw_client = OpenAI(api_key=codex_token, base_url=_CODEX_AUX_BASE_URL) + return (raw_client, final_model) + # Standard path: wrap in CodexAuxiliaryClient adapter client, default = _try_codex() if client is None: logger.warning("resolve_provider_client: openai-codex requested " diff --git a/run_agent.py b/run_agent.py index 8849d25c..107b803c 100644 --- a/run_agent.py +++ b/run_agent.py @@ -418,36 +418,50 @@ class AIAgent: ]: logging.getLogger(quiet_logger).setLevel(logging.ERROR) - # Initialize OpenAI client - defaults to OpenRouter - client_kwargs = {} - - # Default to OpenRouter if no base_url provided - if base_url: - client_kwargs["base_url"] = base_url + # Initialize OpenAI client via centralized provider router. + # The router handles auth resolution, base URL, headers, and + # Codex wrapping for all known providers. + # raw_codex=True because the main agent needs direct responses.stream() + # access for Codex Responses API streaming. + if api_key and base_url: + # Explicit credentials from CLI/gateway — construct directly. + # The runtime provider resolver already handled auth for us. + client_kwargs = {"api_key": api_key, "base_url": base_url} + effective_base = base_url + if "openrouter" in effective_base.lower(): + client_kwargs["default_headers"] = { + "HTTP-Referer": "https://github.com/NousResearch/hermes-agent", + "X-OpenRouter-Title": "Hermes Agent", + "X-OpenRouter-Categories": "productivity,cli-agent", + } + elif "api.kimi.com" in effective_base.lower(): + client_kwargs["default_headers"] = { + "User-Agent": "KimiCLI/1.0", + } else: - client_kwargs["base_url"] = OPENROUTER_BASE_URL - - # Handle API key - OpenRouter is the primary provider - if api_key: - client_kwargs["api_key"] = api_key - else: - # Primary: OPENROUTER_API_KEY, fallback to direct provider keys - client_kwargs["api_key"] = os.getenv("OPENROUTER_API_KEY", "") - - # OpenRouter app attribution — shows hermes-agent in rankings/analytics - effective_base = client_kwargs.get("base_url", "") - if "openrouter" in effective_base.lower(): - client_kwargs["default_headers"] = { - "HTTP-Referer": "https://github.com/NousResearch/hermes-agent", - "X-OpenRouter-Title": "Hermes Agent", - "X-OpenRouter-Categories": "productivity,cli-agent", - } - elif "api.kimi.com" in effective_base.lower(): - # Kimi Code API requires a recognized coding-agent User-Agent - # (see https://github.com/MoonshotAI/kimi-cli) - client_kwargs["default_headers"] = { - "User-Agent": "KimiCLI/1.0", - } + # No explicit creds — use the centralized provider router + from agent.auxiliary_client import resolve_provider_client + _routed_client, _ = resolve_provider_client( + self.provider or "auto", model=self.model, raw_codex=True) + if _routed_client is not None: + client_kwargs = { + "api_key": _routed_client.api_key, + "base_url": str(_routed_client.base_url), + } + # Preserve any default_headers the router set + if hasattr(_routed_client, '_default_headers') and _routed_client._default_headers: + client_kwargs["default_headers"] = dict(_routed_client._default_headers) + else: + # Final fallback: try raw OpenRouter key + client_kwargs = { + "api_key": os.getenv("OPENROUTER_API_KEY", ""), + "base_url": OPENROUTER_BASE_URL, + "default_headers": { + "HTTP-Referer": "https://github.com/NousResearch/hermes-agent", + "X-OpenRouter-Title": "Hermes Agent", + "X-OpenRouter-Categories": "productivity,cli-agent", + }, + } self._client_kwargs = client_kwargs # stored for rebuilding after interrupt try: @@ -2236,75 +2250,6 @@ class AIAgent: # ── Provider fallback ────────────────────────────────────────────────── - # API-key providers: provider → (base_url, [env_var_names]) - _FALLBACK_API_KEY_PROVIDERS = { - "openrouter": (OPENROUTER_BASE_URL, ["OPENROUTER_API_KEY"]), - "zai": ("https://api.z.ai/api/paas/v4", ["ZAI_API_KEY", "Z_AI_API_KEY"]), - "kimi-coding": ("https://api.moonshot.ai/v1", ["KIMI_API_KEY"]), - "minimax": ("https://api.minimax.io/v1", ["MINIMAX_API_KEY"]), - "minimax-cn": ("https://api.minimaxi.com/v1", ["MINIMAX_CN_API_KEY"]), - } - - # OAuth providers: provider → (resolver_import_path, api_mode) - # Each resolver returns {"api_key": ..., "base_url": ...}. - _FALLBACK_OAUTH_PROVIDERS = { - "openai-codex": ("resolve_codex_runtime_credentials", "codex_responses"), - "nous": ("resolve_nous_runtime_credentials", "chat_completions"), - } - - def _resolve_fallback_credentials( - self, fb_provider: str, fb_config: dict - ) -> Optional[tuple]: - """Resolve credentials for a fallback provider. - - Returns (api_key, base_url, api_mode) on success, or None on failure. - Handles three cases: - 1. OAuth providers (openai-codex, nous) — call credential resolver - 2. API-key providers (openrouter, zai, etc.) — read env var - 3. Custom endpoints — use base_url + api_key_env from config - """ - # ── 1. OAuth providers ──────────────────────────────────────── - if fb_provider in self._FALLBACK_OAUTH_PROVIDERS: - resolver_name, api_mode = self._FALLBACK_OAUTH_PROVIDERS[fb_provider] - try: - import hermes_cli.auth as _auth - resolver = getattr(_auth, resolver_name) - creds = resolver() - return creds["api_key"], creds["base_url"], api_mode - except Exception as e: - logging.warning( - "Fallback to %s failed (credential resolution): %s", - fb_provider, e, - ) - return None - - # ── 2. API-key providers ────────────────────────────────────── - fb_key = (fb_config.get("api_key") or "").strip() - if not fb_key: - key_env = (fb_config.get("api_key_env") or "").strip() - if key_env: - fb_key = os.getenv(key_env, "") - elif fb_provider in self._FALLBACK_API_KEY_PROVIDERS: - for env_var in self._FALLBACK_API_KEY_PROVIDERS[fb_provider][1]: - fb_key = os.getenv(env_var, "") - if fb_key: - break - if not fb_key: - logging.warning( - "Fallback model configured but no API key found for provider '%s'", - fb_provider, - ) - return None - - # ── 3. Resolve base URL ─────────────────────────────────────── - fb_base_url = (fb_config.get("base_url") or "").strip() - if not fb_base_url and fb_provider in self._FALLBACK_API_KEY_PROVIDERS: - fb_base_url = self._FALLBACK_API_KEY_PROVIDERS[fb_provider][0] - if not fb_base_url: - fb_base_url = OPENROUTER_BASE_URL - - return fb_key, fb_base_url, "chat_completions" - def _try_activate_fallback(self) -> bool: """Switch to the configured fallback model/provider. @@ -2312,6 +2257,10 @@ class AIAgent: OpenAI client, model slug, and provider in-place so the retry loop can continue with the new backend. One-shot: returns False if already activated or not configured. + + Uses the centralized provider router (resolve_provider_client) for + auth resolution and client construction — no duplicated provider→key + mappings. """ if self._fallback_activated or not self._fallback_model: return False @@ -2322,25 +2271,31 @@ class AIAgent: if not fb_provider or not fb_model: return False - resolved = self._resolve_fallback_credentials(fb_provider, fb) - if resolved is None: - return False - fb_key, fb_base_url, fb_api_mode = resolved - - # Build new client + # Use centralized router for client construction. + # raw_codex=True because the main agent needs direct responses.stream() + # access for Codex providers. try: - client_kwargs = {"api_key": fb_key, "base_url": fb_base_url} - if "openrouter" in fb_base_url.lower(): - client_kwargs["default_headers"] = { - "HTTP-Referer": "https://github.com/NousResearch/hermes-agent", - "X-OpenRouter-Title": "Hermes Agent", - "X-OpenRouter-Categories": "productivity,cli-agent", - } - elif "api.kimi.com" in fb_base_url.lower(): - client_kwargs["default_headers"] = {"User-Agent": "KimiCLI/1.0"} + from agent.auxiliary_client import resolve_provider_client + fb_client, _ = resolve_provider_client( + fb_provider, model=fb_model, raw_codex=True) + if fb_client is None: + logging.warning( + "Fallback to %s failed: provider not configured", + fb_provider) + return False - self.client = OpenAI(**client_kwargs) - self._client_kwargs = client_kwargs + # Determine api_mode from provider + fb_api_mode = "chat_completions" + if fb_provider == "openai-codex": + fb_api_mode = "codex_responses" + fb_base_url = str(fb_client.base_url) + + # Swap client and config in-place + self.client = fb_client + self._client_kwargs = { + "api_key": fb_client.api_key, + "base_url": fb_base_url, + } old_model = self.model self.model = fb_model self.provider = fb_provider diff --git a/tests/test_fallback_model.py b/tests/test_fallback_model.py index dcc150c3..9e34bf74 100644 --- a/tests/test_fallback_model.py +++ b/tests/test_fallback_model.py @@ -35,7 +35,7 @@ def _make_agent(fallback_model=None): patch("run_agent.OpenAI"), ): agent = AIAgent( - api_key="test-key-primary", + api_key="test-key", quiet_mode=True, skip_context_files=True, skip_memory=True, @@ -45,6 +45,14 @@ def _make_agent(fallback_model=None): return agent +def _mock_resolve(base_url="https://openrouter.ai/api/v1", api_key="test-key"): + """Helper to create a mock client for resolve_provider_client.""" + mock_client = MagicMock() + mock_client.api_key = api_key + mock_client.base_url = base_url + return mock_client + + # ============================================================================= # _try_activate_fallback() # ============================================================================= @@ -71,9 +79,13 @@ class TestTryActivateFallback: agent = _make_agent( fallback_model={"provider": "openrouter", "model": "anthropic/claude-sonnet-4"}, ) - with ( - patch.dict("os.environ", {"OPENROUTER_API_KEY": "sk-or-fallback-key"}), - patch("run_agent.OpenAI") as mock_openai, + mock_client = _mock_resolve( + api_key="sk-or-fallback-key", + base_url="https://openrouter.ai/api/v1", + ) + with patch( + "agent.auxiliary_client.resolve_provider_client", + return_value=(mock_client, "anthropic/claude-sonnet-4"), ): result = agent._try_activate_fallback() assert result is True @@ -81,36 +93,37 @@ class TestTryActivateFallback: assert agent.model == "anthropic/claude-sonnet-4" assert agent.provider == "openrouter" assert agent.api_mode == "chat_completions" - mock_openai.assert_called_once() - call_kwargs = mock_openai.call_args[1] - assert call_kwargs["api_key"] == "sk-or-fallback-key" - assert "openrouter" in call_kwargs["base_url"].lower() - # OpenRouter should get attribution headers - assert "default_headers" in call_kwargs + assert agent.client is mock_client def test_activates_zai_fallback(self): agent = _make_agent( fallback_model={"provider": "zai", "model": "glm-5"}, ) - with ( - patch.dict("os.environ", {"ZAI_API_KEY": "sk-zai-key"}), - patch("run_agent.OpenAI") as mock_openai, + mock_client = _mock_resolve( + api_key="sk-zai-key", + base_url="https://open.z.ai/api/v1", + ) + with patch( + "agent.auxiliary_client.resolve_provider_client", + return_value=(mock_client, "glm-5"), ): result = agent._try_activate_fallback() assert result is True assert agent.model == "glm-5" assert agent.provider == "zai" - call_kwargs = mock_openai.call_args[1] - assert call_kwargs["api_key"] == "sk-zai-key" - assert "z.ai" in call_kwargs["base_url"].lower() + assert agent.client is mock_client def test_activates_kimi_fallback(self): agent = _make_agent( fallback_model={"provider": "kimi-coding", "model": "kimi-k2.5"}, ) - with ( - patch.dict("os.environ", {"KIMI_API_KEY": "sk-kimi-key"}), - patch("run_agent.OpenAI"), + mock_client = _mock_resolve( + api_key="sk-kimi-key", + base_url="https://api.moonshot.ai/v1", + ) + with patch( + "agent.auxiliary_client.resolve_provider_client", + return_value=(mock_client, "kimi-k2.5"), ): assert agent._try_activate_fallback() is True assert agent.model == "kimi-k2.5" @@ -120,23 +133,30 @@ class TestTryActivateFallback: agent = _make_agent( fallback_model={"provider": "minimax", "model": "MiniMax-M2.5"}, ) - with ( - patch.dict("os.environ", {"MINIMAX_API_KEY": "sk-mm-key"}), - patch("run_agent.OpenAI") as mock_openai, + mock_client = _mock_resolve( + api_key="sk-mm-key", + base_url="https://api.minimax.io/v1", + ) + with patch( + "agent.auxiliary_client.resolve_provider_client", + return_value=(mock_client, "MiniMax-M2.5"), ): assert agent._try_activate_fallback() is True assert agent.model == "MiniMax-M2.5" assert agent.provider == "minimax" - call_kwargs = mock_openai.call_args[1] - assert "minimax.io" in call_kwargs["base_url"] + assert agent.client is mock_client def test_only_fires_once(self): agent = _make_agent( fallback_model={"provider": "openrouter", "model": "anthropic/claude-sonnet-4"}, ) - with ( - patch.dict("os.environ", {"OPENROUTER_API_KEY": "sk-or-key"}), - patch("run_agent.OpenAI"), + mock_client = _mock_resolve( + api_key="sk-or-key", + base_url="https://openrouter.ai/api/v1", + ) + with patch( + "agent.auxiliary_client.resolve_provider_client", + return_value=(mock_client, "anthropic/claude-sonnet-4"), ): assert agent._try_activate_fallback() is True # Second attempt should return False @@ -147,9 +167,10 @@ class TestTryActivateFallback: agent = _make_agent( fallback_model={"provider": "minimax", "model": "MiniMax-M2.5"}, ) - # Ensure MINIMAX_API_KEY is not in the environment - env = {k: v for k, v in os.environ.items() if k != "MINIMAX_API_KEY"} - with patch.dict("os.environ", env, clear=True): + with patch( + "agent.auxiliary_client.resolve_provider_client", + return_value=(None, None), + ): assert agent._try_activate_fallback() is False assert agent._fallback_activated is False @@ -163,22 +184,29 @@ class TestTryActivateFallback: "api_key_env": "MY_CUSTOM_KEY", }, ) - with ( - patch.dict("os.environ", {"MY_CUSTOM_KEY": "custom-secret"}), - patch("run_agent.OpenAI") as mock_openai, + mock_client = _mock_resolve( + api_key="custom-secret", + base_url="http://localhost:8080/v1", + ) + with patch( + "agent.auxiliary_client.resolve_provider_client", + return_value=(mock_client, "my-model"), ): assert agent._try_activate_fallback() is True - call_kwargs = mock_openai.call_args[1] - assert call_kwargs["base_url"] == "http://localhost:8080/v1" - assert call_kwargs["api_key"] == "custom-secret" + assert agent.client is mock_client + assert agent.model == "my-model" def test_prompt_caching_enabled_for_claude_on_openrouter(self): agent = _make_agent( fallback_model={"provider": "openrouter", "model": "anthropic/claude-sonnet-4"}, ) - with ( - patch.dict("os.environ", {"OPENROUTER_API_KEY": "sk-or-key"}), - patch("run_agent.OpenAI"), + mock_client = _mock_resolve( + api_key="sk-or-key", + base_url="https://openrouter.ai/api/v1", + ) + with patch( + "agent.auxiliary_client.resolve_provider_client", + return_value=(mock_client, "anthropic/claude-sonnet-4"), ): agent._try_activate_fallback() assert agent._use_prompt_caching is True @@ -187,9 +215,13 @@ class TestTryActivateFallback: agent = _make_agent( fallback_model={"provider": "openrouter", "model": "google/gemini-2.5-flash"}, ) - with ( - patch.dict("os.environ", {"OPENROUTER_API_KEY": "sk-or-key"}), - patch("run_agent.OpenAI"), + mock_client = _mock_resolve( + api_key="sk-or-key", + base_url="https://openrouter.ai/api/v1", + ) + with patch( + "agent.auxiliary_client.resolve_provider_client", + return_value=(mock_client, "google/gemini-2.5-flash"), ): agent._try_activate_fallback() assert agent._use_prompt_caching is False @@ -198,9 +230,13 @@ class TestTryActivateFallback: agent = _make_agent( fallback_model={"provider": "zai", "model": "glm-5"}, ) - with ( - patch.dict("os.environ", {"ZAI_API_KEY": "sk-zai-key"}), - patch("run_agent.OpenAI"), + mock_client = _mock_resolve( + api_key="sk-zai-key", + base_url="https://open.z.ai/api/v1", + ) + with patch( + "agent.auxiliary_client.resolve_provider_client", + return_value=(mock_client, "glm-5"), ): agent._try_activate_fallback() assert agent._use_prompt_caching is False @@ -210,35 +246,36 @@ class TestTryActivateFallback: agent = _make_agent( fallback_model={"provider": "zai", "model": "glm-5"}, ) - with ( - patch.dict("os.environ", {"Z_AI_API_KEY": "sk-alt-key"}), - patch("run_agent.OpenAI") as mock_openai, + mock_client = _mock_resolve( + api_key="sk-alt-key", + base_url="https://open.z.ai/api/v1", + ) + with patch( + "agent.auxiliary_client.resolve_provider_client", + return_value=(mock_client, "glm-5"), ): assert agent._try_activate_fallback() is True - call_kwargs = mock_openai.call_args[1] - assert call_kwargs["api_key"] == "sk-alt-key" + assert agent.client is mock_client def test_activates_codex_fallback(self): """OpenAI Codex fallback should use OAuth credentials and codex_responses mode.""" agent = _make_agent( fallback_model={"provider": "openai-codex", "model": "gpt-5.3-codex"}, ) - mock_creds = { - "api_key": "codex-oauth-token", - "base_url": "https://chatgpt.com/backend-api/codex", - } - with ( - patch("hermes_cli.auth.resolve_codex_runtime_credentials", return_value=mock_creds), - patch("run_agent.OpenAI") as mock_openai, + mock_client = _mock_resolve( + api_key="codex-oauth-token", + base_url="https://chatgpt.com/backend-api/codex", + ) + with patch( + "agent.auxiliary_client.resolve_provider_client", + return_value=(mock_client, "gpt-5.3-codex"), ): result = agent._try_activate_fallback() assert result is True assert agent.model == "gpt-5.3-codex" assert agent.provider == "openai-codex" assert agent.api_mode == "codex_responses" - call_kwargs = mock_openai.call_args[1] - assert call_kwargs["api_key"] == "codex-oauth-token" - assert "chatgpt.com" in call_kwargs["base_url"] + assert agent.client is mock_client def test_codex_fallback_fails_gracefully_without_credentials(self): """Codex fallback should return False if no OAuth credentials available.""" @@ -246,8 +283,8 @@ class TestTryActivateFallback: fallback_model={"provider": "openai-codex", "model": "gpt-5.3-codex"}, ) with patch( - "hermes_cli.auth.resolve_codex_runtime_credentials", - side_effect=Exception("No Codex credentials"), + "agent.auxiliary_client.resolve_provider_client", + return_value=(None, None), ): assert agent._try_activate_fallback() is False assert agent._fallback_activated is False @@ -257,22 +294,20 @@ class TestTryActivateFallback: agent = _make_agent( fallback_model={"provider": "nous", "model": "nous-hermes-3"}, ) - mock_creds = { - "api_key": "nous-agent-key-abc", - "base_url": "https://inference-api.nousresearch.com/v1", - } - with ( - patch("hermes_cli.auth.resolve_nous_runtime_credentials", return_value=mock_creds), - patch("run_agent.OpenAI") as mock_openai, + mock_client = _mock_resolve( + api_key="nous-agent-key-abc", + base_url="https://inference-api.nousresearch.com/v1", + ) + with patch( + "agent.auxiliary_client.resolve_provider_client", + return_value=(mock_client, "nous-hermes-3"), ): result = agent._try_activate_fallback() assert result is True assert agent.model == "nous-hermes-3" assert agent.provider == "nous" assert agent.api_mode == "chat_completions" - call_kwargs = mock_openai.call_args[1] - assert call_kwargs["api_key"] == "nous-agent-key-abc" - assert "nousresearch.com" in call_kwargs["base_url"] + assert agent.client is mock_client def test_nous_fallback_fails_gracefully_without_login(self): """Nous fallback should return False if not logged in.""" @@ -280,8 +315,8 @@ class TestTryActivateFallback: fallback_model={"provider": "nous", "model": "nous-hermes-3"}, ) with patch( - "hermes_cli.auth.resolve_nous_runtime_credentials", - side_effect=Exception("Not logged in to Nous Portal"), + "agent.auxiliary_client.resolve_provider_client", + return_value=(None, None), ): assert agent._try_activate_fallback() is False assert agent._fallback_activated is False @@ -315,7 +350,7 @@ class TestFallbackInit: # ============================================================================= class TestProviderCredentials: - """Verify that each supported provider resolves its API key correctly.""" + """Verify that each supported provider resolves via the centralized router.""" @pytest.mark.parametrize("provider,env_var,base_url_fragment", [ ("openrouter", "OPENROUTER_API_KEY", "openrouter"), @@ -328,12 +363,15 @@ class TestProviderCredentials: agent = _make_agent( fallback_model={"provider": provider, "model": "test-model"}, ) - with ( - patch.dict("os.environ", {env_var: "test-key-123"}), - patch("run_agent.OpenAI") as mock_openai, + mock_client = MagicMock() + mock_client.api_key = "test-api-key" + mock_client.base_url = f"https://{base_url_fragment}/v1" + with patch( + "agent.auxiliary_client.resolve_provider_client", + return_value=(mock_client, "test-model"), ): result = agent._try_activate_fallback() assert result is True, f"Failed to activate fallback for {provider}" - call_kwargs = mock_openai.call_args[1] - assert call_kwargs["api_key"] == "test-key-123" - assert base_url_fragment in call_kwargs["base_url"].lower() + assert agent.client is mock_client + assert agent.model == "test-model" + assert agent.provider == provider From 9302690e1b71c1abfc2496640f0a8c3a68709d35 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Wed, 11 Mar 2026 22:04:42 -0700 Subject: [PATCH 07/11] =?UTF-8?q?refactor:=20remove=20LLM=5FMODEL=20env=20?= =?UTF-8?q?var=20dependency=20=E2=80=94=20config.yaml=20is=20sole=20source?= =?UTF-8?q?=20of=20truth?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Model selection now comes exclusively from config.yaml (set via 'hermes model' or 'hermes setup'). The LLM_MODEL env var is no longer read or written anywhere in production code. Why: env vars are per-process/per-user and would conflict in multi-agent or multi-tenant setups. Config.yaml is file-based and can be scoped per-user or eventually per-session. Changes: - cli.py: Read model from CLI_CONFIG only, not LLM_MODEL/OPENAI_MODEL - hermes_cli/auth.py: _save_model_choice() no longer writes LLM_MODEL to .env - hermes_cli/setup.py: Remove 12 save_env_value('LLM_MODEL', ...) calls from all provider setup flows - gateway/run.py: Remove LLM_MODEL fallback (HERMES_MODEL still works for gateway process runtime) - cron/scheduler.py: Same - agent/auxiliary_client.py: Remove LLM_MODEL from custom endpoint model detection --- agent/auxiliary_client.py | 2 +- cli.py | 11 ++++++++--- cron/scheduler.py | 2 +- gateway/run.py | 6 +++--- hermes_cli/auth.py | 9 ++++++--- hermes_cli/setup.py | 12 ------------ tests/test_cli_provider_resolution.py | 26 +++++++++++++++++--------- 7 files changed, 36 insertions(+), 32 deletions(-) diff --git a/agent/auxiliary_client.py b/agent/auxiliary_client.py index 19c2b8bd..1c6ac271 100644 --- a/agent/auxiliary_client.py +++ b/agent/auxiliary_client.py @@ -443,7 +443,7 @@ def _try_custom_endpoint() -> Tuple[Optional[OpenAI], Optional[str]]: custom_key = os.getenv("OPENAI_API_KEY") if not custom_base or not custom_key: return None, None - model = os.getenv("OPENAI_MODEL") or os.getenv("LLM_MODEL") or "gpt-4o-mini" + model = os.getenv("OPENAI_MODEL") or "gpt-4o-mini" logger.debug("Auxiliary client: custom endpoint (%s)", model) return OpenAI(api_key=custom_key, base_url=custom_base), model diff --git a/cli.py b/cli.py index 50e5db8d..d62da32f 100755 --- a/cli.py +++ b/cli.py @@ -1129,12 +1129,17 @@ class HermesCLI: self.verbose = verbose if verbose is not None else (self.tool_progress_mode == "verbose") # Configuration - priority: CLI args > env vars > config file - # Model can come from: CLI arg, LLM_MODEL env, OPENAI_MODEL env (custom endpoint), or config - self.model = model or os.getenv("LLM_MODEL") or os.getenv("OPENAI_MODEL") or CLI_CONFIG["model"]["default"] + # Model comes from: CLI arg or config.yaml (single source of truth). + # LLM_MODEL/OPENAI_MODEL env vars are NOT checked — config.yaml is + # authoritative. This avoids conflicts in multi-agent setups where + # env vars would stomp each other. + _model_config = CLI_CONFIG.get("model", {}) + _config_model = _model_config.get("default", "") if isinstance(_model_config, dict) else (_model_config or "") + self.model = model or _config_model or "anthropic/claude-opus-4.6" # Track whether model was explicitly chosen by the user or fell back # to the global default. Provider-specific normalisation may override # the default silently but should warn when overriding an explicit choice. - self._model_is_default = not (model or os.getenv("LLM_MODEL") or os.getenv("OPENAI_MODEL")) + self._model_is_default = not model self._explicit_api_key = api_key self._explicit_base_url = base_url diff --git a/cron/scheduler.py b/cron/scheduler.py index 348a25c2..c80122ce 100644 --- a/cron/scheduler.py +++ b/cron/scheduler.py @@ -180,7 +180,7 @@ def run_job(job: dict) -> tuple[bool, str, str, Optional[str]]: except UnicodeDecodeError: load_dotenv(str(_hermes_home / ".env"), override=True, encoding="latin-1") - model = os.getenv("HERMES_MODEL") or os.getenv("LLM_MODEL") or "anthropic/claude-opus-4.6" + model = os.getenv("HERMES_MODEL") or "anthropic/claude-opus-4.6" # Load config.yaml for model, reasoning, prefill, toolsets, provider routing _cfg = {} diff --git a/gateway/run.py b/gateway/run.py index 96d43672..772d4c4f 100644 --- a/gateway/run.py +++ b/gateway/run.py @@ -1544,7 +1544,7 @@ class GatewayRunner: config_path = _hermes_home / 'config.yaml' # Resolve current model and provider from config - current = os.getenv("HERMES_MODEL") or os.getenv("LLM_MODEL") or "anthropic/claude-opus-4.6" + current = os.getenv("HERMES_MODEL") or "anthropic/claude-opus-4.6" current_provider = "openrouter" try: if config_path.exists(): @@ -1999,7 +1999,7 @@ class GatewayRunner: return # Read model from config (same as _run_agent) - model = os.getenv("HERMES_MODEL") or os.getenv("LLM_MODEL") or "anthropic/claude-opus-4.6" + model = os.getenv("HERMES_MODEL") or "anthropic/claude-opus-4.6" try: import yaml as _y _cfg_path = _hermes_home / "config.yaml" @@ -3093,7 +3093,7 @@ class GatewayRunner: except Exception: pass - model = os.getenv("HERMES_MODEL") or os.getenv("LLM_MODEL") or "anthropic/claude-opus-4.6" + model = os.getenv("HERMES_MODEL") or "anthropic/claude-opus-4.6" try: import yaml as _y diff --git a/hermes_cli/auth.py b/hermes_cli/auth.py index 05d233f9..1ffa85bd 100644 --- a/hermes_cli/auth.py +++ b/hermes_cli/auth.py @@ -1671,8 +1671,12 @@ def _prompt_model_selection(model_ids: List[str], current_model: str = "") -> Op def _save_model_choice(model_id: str) -> None: - """Save the selected model to config.yaml and .env.""" - from hermes_cli.config import save_config, load_config, save_env_value + """Save the selected model to config.yaml (single source of truth). + + The model is stored in config.yaml only — NOT in .env. This avoids + conflicts in multi-agent setups where env vars would stomp each other. + """ + from hermes_cli.config import save_config, load_config config = load_config() # Always use dict format so provider/base_url can be stored alongside @@ -1681,7 +1685,6 @@ def _save_model_choice(model_id: str) -> None: else: config["model"] = {"default": model_id} save_config(config) - save_env_value("LLM_MODEL", model_id) def login_command(args) -> None: diff --git a/hermes_cli/setup.py b/hermes_cli/setup.py index 6b00952c..2f48574b 100644 --- a/hermes_cli/setup.py +++ b/hermes_cli/setup.py @@ -681,7 +681,6 @@ def setup_model_provider(config: dict): save_env_value("OPENAI_API_KEY", api_key) if model_name: config['model'] = model_name - save_env_value("LLM_MODEL", model_name) # Save provider and base_url to config.yaml so the gateway and CLI # both resolve the correct provider without relying on env-var heuristics. @@ -913,7 +912,6 @@ def setup_model_provider(config: dict): custom = prompt(f" Model name (Enter to keep '{current_model}')") if custom: config['model'] = custom - save_env_value("LLM_MODEL", custom) elif selected_provider == "openai-codex": from hermes_cli.codex_models import get_codex_model_ids codex_models = get_codex_model_ids() @@ -927,12 +925,10 @@ def setup_model_provider(config: dict): model_idx = prompt_choice("Select default model:", model_choices, default_codex) if model_idx < len(codex_models): config['model'] = codex_models[model_idx] - save_env_value("LLM_MODEL", codex_models[model_idx]) elif model_idx == len(codex_models): custom = prompt("Enter model name") if custom: config['model'] = custom - save_env_value("LLM_MODEL", custom) _update_config_for_provider("openai-codex", DEFAULT_CODEX_BASE_URL) elif selected_provider == "zai": # Coding Plan endpoints don't have GLM-5 @@ -950,12 +946,10 @@ def setup_model_provider(config: dict): if model_idx < len(zai_models): config['model'] = zai_models[model_idx] - save_env_value("LLM_MODEL", zai_models[model_idx]) elif model_idx == len(zai_models): custom = prompt("Enter model name") if custom: config['model'] = custom - save_env_value("LLM_MODEL", custom) # else: keep current elif selected_provider == "kimi-coding": kimi_models = ["kimi-k2.5", "kimi-k2-thinking", "kimi-k2-turbo-preview"] @@ -968,12 +962,10 @@ def setup_model_provider(config: dict): if model_idx < len(kimi_models): config['model'] = kimi_models[model_idx] - save_env_value("LLM_MODEL", kimi_models[model_idx]) elif model_idx == len(kimi_models): custom = prompt("Enter model name") if custom: config['model'] = custom - save_env_value("LLM_MODEL", custom) # else: keep current elif selected_provider in ("minimax", "minimax-cn"): minimax_models = ["MiniMax-M2.5", "MiniMax-M2.5-highspeed", "MiniMax-M2.1"] @@ -986,12 +978,10 @@ def setup_model_provider(config: dict): if model_idx < len(minimax_models): config['model'] = minimax_models[model_idx] - save_env_value("LLM_MODEL", minimax_models[model_idx]) elif model_idx == len(minimax_models): custom = prompt("Enter model name") if custom: config['model'] = custom - save_env_value("LLM_MODEL", custom) # else: keep current else: # Static list for OpenRouter / fallback (from canonical list) @@ -1008,12 +998,10 @@ def setup_model_provider(config: dict): if model_idx < len(ids): config['model'] = ids[model_idx] - save_env_value("LLM_MODEL", ids[model_idx]) elif model_idx == len(ids): # Custom custom = prompt("Enter model name (e.g., anthropic/claude-opus-4.6)") if custom: config['model'] = custom - save_env_value("LLM_MODEL", custom) # else: Keep current _final_model = config.get('model', '') diff --git a/tests/test_cli_provider_resolution.py b/tests/test_cli_provider_resolution.py index f4a446ac..2a3dc43e 100644 --- a/tests/test_cli_provider_resolution.py +++ b/tests/test_cli_provider_resolution.py @@ -197,21 +197,28 @@ def test_codex_provider_replaces_incompatible_default_model(monkeypatch): assert shell.model == "gpt-5.2-codex" -def test_codex_provider_trusts_explicit_envvar_model(monkeypatch): - """When the user explicitly sets LLM_MODEL, we trust their choice and - let the API be the judge — even if it's a non-OpenAI model. Only - provider prefixes are stripped; the bare model passes through.""" +def test_codex_provider_uses_config_model(monkeypatch): + """Model comes from config.yaml, not LLM_MODEL env var. + Config.yaml is the single source of truth to avoid multi-agent conflicts.""" cli = _import_cli() - monkeypatch.setenv("LLM_MODEL", "claude-opus-4-6") + # LLM_MODEL env var should be IGNORED (even if set) + monkeypatch.setenv("LLM_MODEL", "should-be-ignored") monkeypatch.delenv("OPENAI_MODEL", raising=False) + # Set model via config + monkeypatch.setitem(cli.CLI_CONFIG, "model", { + "default": "gpt-5.2-codex", + "provider": "openai-codex", + "base_url": "https://chatgpt.com/backend-api/codex", + }) + def _runtime_resolve(**kwargs): return { "provider": "openai-codex", "api_mode": "codex_responses", "base_url": "https://chatgpt.com/backend-api/codex", - "api_key": "test-key", + "api_key": "fake-codex-token", "source": "env/config", } @@ -220,11 +227,12 @@ def test_codex_provider_trusts_explicit_envvar_model(monkeypatch): shell = cli.HermesCLI(compact=True, max_turns=1) - assert shell._model_is_default is False assert shell._ensure_runtime_credentials() is True assert shell.provider == "openai-codex" - # User explicitly chose this model — it passes through untouched - assert shell.model == "claude-opus-4-6" + # Model from config (may be normalized by codex provider logic) + assert "codex" in shell.model.lower() + # LLM_MODEL env var is NOT used + assert shell.model != "should-be-ignored" def test_codex_provider_preserves_explicit_codex_model(monkeypatch): From a7e5f195284a54b469a1f2bf9ab6b60401ae3212 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Wed, 11 Mar 2026 22:41:33 -0700 Subject: [PATCH 08/11] fix: don't send OpenRouter-specific provider preferences to Nous Portal Two bugs in _build_api_kwargs that broke Nous Portal: 1. Provider preferences (only, ignore, order, sort) are OpenRouter- specific routing features. They were being sent in extra_body to ALL providers, including Nous Portal. When the config had providers_only=['google-vertex'], Nous Portal returned 404 'Inference host not found' because it doesn't have a google-vertex backend. Fix: Only include provider preferences when _is_openrouter is True. 2. Reasoning config with enabled=false was being sent to Nous Portal, which requires reasoning and returns 400 'Reasoning is mandatory for this endpoint and cannot be disabled.' Fix: Omit the reasoning parameter for Nous when enabled=false. Root cause found via HERMES_DUMP_REQUESTS=1 which showed the exact request payload being sent to Nous Portal's inference API. --- run_agent.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/run_agent.py b/run_agent.py index 107b803c..bb66351b 100644 --- a/run_agent.py +++ b/run_agent.py @@ -2392,16 +2392,24 @@ class AIAgent: extra_body = {} - if provider_preferences: - extra_body["provider"] = provider_preferences - _is_openrouter = "openrouter" in self.base_url.lower() + + # Provider preferences (only, ignore, order, sort) are OpenRouter- + # specific — don't send them to other providers (Nous, Codex, etc.) + if provider_preferences and _is_openrouter: + extra_body["provider"] = provider_preferences _is_nous = "nousresearch" in self.base_url.lower() _is_mistral = "api.mistral.ai" in self.base_url.lower() if (_is_openrouter or _is_nous) and not _is_mistral: if self.reasoning_config is not None: - extra_body["reasoning"] = self.reasoning_config + rc = dict(self.reasoning_config) + # Nous Portal requires reasoning enabled — don't send + # enabled=false to it (would cause 400). + if _is_nous and rc.get("enabled") is False: + pass # omit reasoning entirely for Nous when disabled + else: + extra_body["reasoning"] = rc else: extra_body["reasoning"] = { "enabled": True, From 65356003e3da075337d4e4407353f6b57d84d150 Mon Sep 17 00:00:00 2001 From: teknium1 Date: Wed, 11 Mar 2026 22:49:24 -0700 Subject: [PATCH 09/11] revert: keep provider preferences for all providers (Nous will proxy) Nous Portal backend will become a transparent proxy for OpenRouter- specific parameters (provider preferences, etc.), so keep sending them to all providers. The reasoning disabled fix is kept (that's a real constraint of the Nous endpoint). --- run_agent.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/run_agent.py b/run_agent.py index bb66351b..af1b31c0 100644 --- a/run_agent.py +++ b/run_agent.py @@ -2395,7 +2395,9 @@ class AIAgent: _is_openrouter = "openrouter" in self.base_url.lower() # Provider preferences (only, ignore, order, sort) are OpenRouter- - # specific — don't send them to other providers (Nous, Codex, etc.) + # specific. Only send to OpenRouter-compatible endpoints. + # TODO: Nous Portal will add transparent proxy support — re-enable + # for _is_nous when their backend is updated. if provider_preferences and _is_openrouter: extra_body["provider"] = provider_preferences _is_nous = "nousresearch" in self.base_url.lower() From ec2c6dff7073b1369ac71f405901dabb893e650f Mon Sep 17 00:00:00 2001 From: teknium1 Date: Wed, 11 Mar 2026 23:06:06 -0700 Subject: [PATCH 10/11] feat: unified /model and /provider into single view MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both /model and /provider now show the same unified display: Current: anthropic/claude-opus-4.6 via OpenRouter Authenticated providers & models: [openrouter] ← active anthropic/claude-opus-4.6 ← current anthropic/claude-sonnet-4.5 ... [nous] claude-opus-4-6 gemini-3-flash ... [openai-codex] gpt-5.2-codex gpt-5.1-codex-mini ... Not configured: Z.AI / GLM, Kimi / Moonshot, ... Switch model: /model Switch provider: /model : Example: /model nous:claude-opus-4-6 Users can see all authenticated providers and their models at a glance, making it easy to switch mid-conversation. Also added curated model lists for Nous Portal and OpenAI Codex to hermes_cli/models.py. --- cli.py | 126 +++++++++++++++++--------------- hermes_cli/models.py | 13 ++++ tests/test_cli_model_command.py | 4 +- 3 files changed, 83 insertions(+), 60 deletions(-) diff --git a/cli.py b/cli.py index d62da32f..723a7554 100755 --- a/cli.py +++ b/cli.py @@ -2265,6 +2265,72 @@ class HermesCLI: remaining = len(self.conversation_history) print(f" {remaining} message(s) remaining in history.") + def _show_model_and_providers(self): + """Unified /model and /provider display. + + Shows current model + provider, then lists all authenticated + providers with their available models so users can switch easily. + """ + from hermes_cli.models import ( + curated_models_for_provider, list_available_providers, + normalize_provider, _PROVIDER_LABELS, + ) + from hermes_cli.auth import resolve_provider as _resolve_provider + + # Resolve current provider + raw_provider = normalize_provider(self.provider) + if raw_provider == "auto": + try: + current = _resolve_provider( + self.requested_provider, + explicit_api_key=self._explicit_api_key, + explicit_base_url=self._explicit_base_url, + ) + except Exception: + current = "openrouter" + else: + current = raw_provider + current_label = _PROVIDER_LABELS.get(current, current) + + print(f"\n Current: {self.model} via {current_label}") + print() + + # Show all authenticated providers with their models + providers = list_available_providers() + authed = [p for p in providers if p["authenticated"]] + unauthed = [p for p in providers if not p["authenticated"]] + + if authed: + print(" Authenticated providers & models:") + for p in authed: + is_active = p["id"] == current + marker = " ← active" if is_active else "" + print(f" [{p['id']}]{marker}") + curated = curated_models_for_provider(p["id"]) + if curated: + for mid, desc in curated: + current_marker = " ← current" if (is_active and mid == self.model) else "" + print(f" {mid}{current_marker}") + else: + print(f" (use /model {p['id']}:)") + print() + + if unauthed: + names = ", ".join(p["label"] for p in unauthed) + print(f" Not configured: {names}") + print(f" Run: hermes setup") + print() + + print(" Switch model: /model ") + print(" Switch provider: /model :") + if authed and len(authed) > 1: + # Show a concrete example with a non-active provider + other = next((p for p in authed if p["id"] != current), authed[0]) + other_models = curated_models_for_provider(other["id"]) + if other_models: + example_model = other_models[0][0] + print(f" Example: /model {other['id']}:{example_model}") + def _handle_prompt_command(self, cmd: str): """Handle the /prompt command to view or set system prompt.""" parts = cmd.split(maxsplit=1) @@ -2776,65 +2842,9 @@ class HermesCLI: print(f" Reason: {message}") print(" Note: Model will revert on restart. Use a verified model to save to config.") else: - from hermes_cli.models import curated_models_for_provider, normalize_provider, _PROVIDER_LABELS - from hermes_cli.auth import resolve_provider as _resolve_provider - # Resolve "auto" to the actual provider using credential detection - raw_provider = normalize_provider(self.provider) - if raw_provider == "auto": - try: - display_provider = _resolve_provider( - self.requested_provider, - explicit_api_key=self._explicit_api_key, - explicit_base_url=self._explicit_base_url, - ) - except Exception: - display_provider = "openrouter" - else: - display_provider = raw_provider - provider_label = _PROVIDER_LABELS.get(display_provider, display_provider) - print(f"\n Current model: {self.model}") - print(f" Current provider: {provider_label}") - print() - curated = curated_models_for_provider(display_provider) - if curated: - print(f" Available models ({provider_label}):") - for mid, desc in curated: - marker = " ←" if mid == self.model else "" - label = f" {desc}" if desc else "" - print(f" {mid}{label}{marker}") - print() - print(" Usage: /model ") - print(" /model provider:model-name (to switch provider)") - print(" Example: /model openrouter:anthropic/claude-sonnet-4.5") - print(" See /provider for available providers") + self._show_model_and_providers() elif cmd_lower == "/provider": - from hermes_cli.models import list_available_providers, normalize_provider, _PROVIDER_LABELS - from hermes_cli.auth import resolve_provider as _resolve_provider - # Resolve current provider - raw_provider = normalize_provider(self.provider) - if raw_provider == "auto": - try: - current = _resolve_provider( - self.requested_provider, - explicit_api_key=self._explicit_api_key, - explicit_base_url=self._explicit_base_url, - ) - except Exception: - current = "openrouter" - else: - current = raw_provider - current_label = _PROVIDER_LABELS.get(current, current) - print(f"\n Current provider: {current_label} ({current})\n") - providers = list_available_providers() - print(" Available providers:") - for p in providers: - marker = " ← active" if p["id"] == current else "" - auth = "✓" if p["authenticated"] else "✗" - aliases = f" (also: {', '.join(p['aliases'])})" if p["aliases"] else "" - print(f" [{auth}] {p['id']:<14} {p['label']}{aliases}{marker}") - print() - print(" Switch: /model provider:model-name") - print(" Setup: hermes setup") + self._show_model_and_providers() elif cmd_lower.startswith("/prompt"): # Use original case so prompt text isn't lowercased self._handle_prompt_command(cmd_original) diff --git a/hermes_cli/models.py b/hermes_cli/models.py index 1fdde090..0df1d309 100644 --- a/hermes_cli/models.py +++ b/hermes_cli/models.py @@ -31,6 +31,19 @@ OPENROUTER_MODELS: list[tuple[str, str]] = [ ] _PROVIDER_MODELS: dict[str, list[str]] = { + "nous": [ + "claude-opus-4-6", + "claude-sonnet-4-6", + "gpt-5.4", + "gemini-3-flash", + "gemini-3.0-pro-preview", + "deepseek-v3.2", + ], + "openai-codex": [ + "gpt-5.2-codex", + "gpt-5.1-codex-mini", + "gpt-5.1-codex-max", + ], "zai": [ "glm-5", "glm-4.7", diff --git a/tests/test_cli_model_command.py b/tests/test_cli_model_command.py index b8b8e8d2..477ad429 100644 --- a/tests/test_cli_model_command.py +++ b/tests/test_cli_model_command.py @@ -93,8 +93,8 @@ class TestModelCommand: output = capsys.readouterr().out assert "anthropic/claude-opus-4.6" in output assert "OpenRouter" in output - assert "Available models" in output - assert "provider:model-name" in output + assert "Authenticated providers" in output or "Switch model" in output + assert "provider" in output and "model" in output # -- provider switching tests ------------------------------------------- From 7febdf7208d59db52f8ebe54b8be71a0d6c31d7c Mon Sep 17 00:00:00 2001 From: teknium1 Date: Wed, 11 Mar 2026 23:29:26 -0700 Subject: [PATCH 11/11] fix: custom endpoint model validation + better /model error messages - Custom endpoints can serve any model, so skip validation for provider='custom' in validate_requested_model(). Previously it would reject any model name since there's no static catalog or live API to check against. - Show clear setup instructions when switching to custom endpoint without OPENAI_BASE_URL/OPENAI_API_KEY configured. - Added curated model lists for Nous Portal and OpenAI Codex to _PROVIDER_MODELS so /model shows their available models. --- cli.py | 6 +++++- hermes_cli/models.py | 9 +++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/cli.py b/cli.py index 723a7554..7f2b2394 100755 --- a/cli.py +++ b/cli.py @@ -2795,7 +2795,11 @@ class HermesCLI: base_url_for_probe = runtime.get("base_url", "") except Exception as e: provider_label = _PROVIDER_LABELS.get(target_provider, target_provider) - print(f"(>_<) Could not resolve credentials for provider '{provider_label}': {e}") + if target_provider == "custom": + print(f"(>_<) Custom endpoint not configured. Set OPENAI_BASE_URL and OPENAI_API_KEY,") + print(f" or run: hermes setup → Custom OpenAI-compatible endpoint") + else: + print(f"(>_<) Could not resolve credentials for provider '{provider_label}': {e}") print(f"(^_^) Current model unchanged: {self.model}") return True diff --git a/hermes_cli/models.py b/hermes_cli/models.py index 0df1d309..54d4e3c1 100644 --- a/hermes_cli/models.py +++ b/hermes_cli/models.py @@ -276,6 +276,15 @@ def validate_requested_model( "message": "Model names cannot contain spaces.", } + # Custom endpoints can serve any model — skip validation + if normalized == "custom": + return { + "accepted": True, + "persist": True, + "recognized": False, + "message": None, + } + # Probe the live API to check if the model actually exists api_models = fetch_api_models(api_key, base_url)