feat: add direct endpoint overrides for auxiliary and delegation

Add base_url/api_key overrides for auxiliary tasks and delegation so users can route those flows straight to a custom OpenAI-compatible endpoint without having to rely on provider=main or named custom providers. Also clear gateway session env vars in test isolation so the full suite stays deterministic when run from a messaging-backed agent session.
2026-03-14 20:48:29 -07:00 · 2026-03-14 20:48:29 -07:00 · 9f6bccd76a
commit 9f6bccd76a
parent 6c24d76533
12 changed files with 526 additions and 99 deletions
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@ -30,6 +30,10 @@ Default "auto" follows the chains above.
 Per-task model overrides (e.g. AUXILIARY_VISION_MODEL,
 AUXILIARY_WEB_EXTRACT_MODEL) let callers use a different model slug
 than the provider's default.
+
+Per-task direct endpoint overrides (e.g. AUXILIARY_VISION_BASE_URL,
+AUXILIARY_VISION_API_KEY) let callers route a specific auxiliary task to a
+custom OpenAI-compatible endpoint without touching the main model settings.
 """

 import json
@ -418,6 +422,17 @@ def _get_auxiliary_provider(task: str = "") -> str:
    return "auto"


+def _get_auxiliary_env_override(task: str, suffix: str) -> Optional[str]:
+    """Read an auxiliary env override from AUXILIARY_* or CONTEXT_* prefixes."""
+    if not task:
+        return None
+    for prefix in ("AUXILIARY_", "CONTEXT_"):
+        val = os.getenv(f"{prefix}{task.upper()}_{suffix}", "").strip()
+        if val:
+            return val
+    return None
+
+
 def _try_openrouter() -> Tuple[Optional[OpenAI], Optional[str]]:
    or_key = os.getenv("OPENROUTER_API_KEY")
    if not or_key:
@ -564,6 +579,8 @@ def resolve_provider_client(
    model: str = None,
    async_mode: bool = False,
    raw_codex: bool = False,
+    explicit_base_url: str = None,
+    explicit_api_key: str = None,
 ) -> Tuple[Optional[Any], Optional[str]]:
    """Central router: given a provider name and optional model, return a
    configured client with the correct auth, base URL, and API format.
@ -585,6 +602,8 @@ def resolve_provider_client(
            instead of wrapping in CodexAuxiliaryClient.  Use this when
            the caller needs direct access to responses.stream() (e.g.,
            the main agent loop).
+        explicit_base_url: Optional direct OpenAI-compatible endpoint.
+        explicit_api_key: Optional API key paired with explicit_base_url.

    Returns:
        (client, resolved_model) or (None, None) if auth is unavailable.
@ -661,6 +680,22 @@ def resolve_provider_client(

    # ── Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY) ───────────
    if provider == "custom":
+        if explicit_base_url:
+            custom_base = explicit_base_url.strip()
+            custom_key = (
+                (explicit_api_key or "").strip()
+                or os.getenv("OPENAI_API_KEY", "").strip()
+            )
+            if not custom_base or not custom_key:
+                logger.warning(
+                    "resolve_provider_client: explicit custom endpoint requested "
+                    "but no API key was found (set explicit_api_key or OPENAI_API_KEY)"
+                )
+                return None, None
+            final_model = model or _read_main_model() or "gpt-4o-mini"
+            client = OpenAI(api_key=custom_key, base_url=custom_base)
+            return (_to_async_client(client, final_model) if async_mode
+                    else (client, final_model))
        # Try custom first, then codex, then API-key providers
        for try_fn in (_try_custom_endpoint, _try_codex,
                       _resolve_api_key_provider):
@ -749,10 +784,13 @@ def get_text_auxiliary_client(task: str = "") -> Tuple[Optional[OpenAI], Optiona
    Callers may override the returned model with a per-task env var
    (e.g. CONTEXT_COMPRESSION_MODEL, AUXILIARY_WEB_EXTRACT_MODEL).
    """
-    forced = _get_auxiliary_provider(task)
-    if forced != "auto":
-        return resolve_provider_client(forced)
-    return resolve_provider_client("auto")
+    provider, model, base_url, api_key = _resolve_task_provider_model(task or None)
+    return resolve_provider_client(
+        provider,
+        model=model,
+        explicit_base_url=base_url,
+        explicit_api_key=api_key,
+    )


 def get_async_text_auxiliary_client(task: str = ""):
@ -762,10 +800,14 @@ def get_async_text_auxiliary_client(task: str = ""):
    (AsyncCodexAuxiliaryClient, model) which wraps the Responses API.
    Returns (None, None) when no provider is available.
    """
-    forced = _get_auxiliary_provider(task)
-    if forced != "auto":
-        return resolve_provider_client(forced, async_mode=True)
-    return resolve_provider_client("auto", async_mode=True)
+    provider, model, base_url, api_key = _resolve_task_provider_model(task or None)
+    return resolve_provider_client(
+        provider,
+        model=model,
+        async_mode=True,
+        explicit_base_url=base_url,
+        explicit_api_key=api_key,
+    )


 _VISION_AUTO_PROVIDER_ORDER = (
@ -821,26 +863,43 @@ def resolve_vision_provider_client(
    provider: Optional[str] = None,
    model: Optional[str] = None,
    *,
+    base_url: Optional[str] = None,
+    api_key: Optional[str] = None,
    async_mode: bool = False,
 ) -> Tuple[Optional[str], Optional[Any], Optional[str]]:
    """Resolve the client actually used for vision tasks.

-    Explicit provider overrides still use the generic provider router for
-    non-standard backends, so users can intentionally force experimental
-    providers. Auto mode stays conservative and only tries vision backends
-    known to work today.
+    Direct endpoint overrides take precedence over provider selection. Explicit
+    provider overrides still use the generic provider router for non-standard
+    backends, so users can intentionally force experimental providers. Auto mode
+    stays conservative and only tries vision backends known to work today.
    """
-    requested = _normalize_vision_provider(provider or _get_auxiliary_provider("vision"))
+    requested, resolved_model, resolved_base_url, resolved_api_key = _resolve_task_provider_model(
+        "vision", provider, model, base_url, api_key
+    )
+    requested = _normalize_vision_provider(requested)

    def _finalize(resolved_provider: str, sync_client: Any, default_model: Optional[str]):
        if sync_client is None:
            return resolved_provider, None, None
-        final_model = model or default_model
+        final_model = resolved_model or default_model
        if async_mode:
            async_client, async_model = _to_async_client(sync_client, final_model)
            return resolved_provider, async_client, async_model
        return resolved_provider, sync_client, final_model

+    if resolved_base_url:
+        client, final_model = resolve_provider_client(
+            "custom",
+            model=resolved_model,
+            async_mode=async_mode,
+            explicit_base_url=resolved_base_url,
+            explicit_api_key=resolved_api_key,
+        )
+        if client is None:
+            return "custom", None, None
+        return "custom", client, final_model
+
    if requested == "auto":
        for candidate in get_available_vision_backends():
            sync_client, default_model = _resolve_strict_vision_backend(candidate)
@ -853,7 +912,7 @@ def resolve_vision_provider_client(
        sync_client, default_model = _resolve_strict_vision_backend(requested)
        return _finalize(requested, sync_client, default_model)

-    client, final_model = _get_cached_client(requested, model, async_mode)
+    client, final_model = _get_cached_client(requested, resolved_model, async_mode)
    if client is None:
        return requested, None, None
    return requested, client, final_model
@ -910,19 +969,29 @@ def auxiliary_max_tokens_param(value: int) -> dict:
 # Every auxiliary LLM consumer should use these instead of manually
 # constructing clients and calling .chat.completions.create().

-# Client cache: (provider, async_mode) -> (client, default_model)
+# Client cache: (provider, async_mode, base_url, api_key) -> (client, default_model)
 _client_cache: Dict[tuple, tuple] = {}


 def _get_cached_client(
-    provider: str, model: str = None, async_mode: bool = False,
+    provider: str,
+    model: str = None,
+    async_mode: bool = False,
+    base_url: str = None,
+    api_key: str = None,
 ) -> Tuple[Optional[Any], Optional[str]]:
    """Get or create a cached client for the given provider."""
-    cache_key = (provider, async_mode)
+    cache_key = (provider, async_mode, base_url or "", api_key or "")
    if cache_key in _client_cache:
        cached_client, cached_default = _client_cache[cache_key]
        return cached_client, model or cached_default
-    client, default_model = resolve_provider_client(provider, model, async_mode)
+    client, default_model = resolve_provider_client(
+        provider,
+        model,
+        async_mode,
+        explicit_base_url=base_url,
+        explicit_api_key=api_key,
+    )
    if client is not None:
        _client_cache[cache_key] = (client, default_model)
    return client, model or default_model
@ -932,57 +1001,75 @@ def _resolve_task_provider_model(
    task: str = None,
    provider: str = None,
    model: str = None,
-) -> Tuple[str, Optional[str]]:
+    base_url: str = None,
+    api_key: str = None,
+) -> Tuple[str, Optional[str], Optional[str], Optional[str]]:
    """Determine provider + model for a call.

    Priority:
-      1. Explicit provider/model args (always win)
-      2. Env var overrides (AUXILIARY_{TASK}_PROVIDER, etc.)
-      3. Config file (auxiliary.{task}.provider/model or compression.*)
+      1. Explicit provider/model/base_url/api_key args (always win)
+      2. Env var overrides (AUXILIARY_{TASK}_*, CONTEXT_{TASK}_*)
+      3. Config file (auxiliary.{task}.* or compression.*)
      4. "auto" (full auto-detection chain)

-    Returns (provider, model) where model may be None (use provider default).
+    Returns (provider, model, base_url, api_key) where model may be None
+    (use provider default). When base_url is set, provider is forced to
+    "custom" and the task uses that direct endpoint.
    """
-    if provider:
-        return provider, model
+    config = {}
+    cfg_provider = None
+    cfg_model = None
+    cfg_base_url = None
+    cfg_api_key = None

    if task:
-        # Check env var overrides first
-        env_provider = _get_auxiliary_provider(task)
-        if env_provider != "auto":
-            # Check for env var model override too
-            env_model = None
-            for prefix in ("AUXILIARY_", "CONTEXT_"):
-                val = os.getenv(f"{prefix}{task.upper()}_MODEL", "").strip()
-                if val:
-                    env_model = val
-                    break
-            return env_provider, model or env_model
-
-        # Read from config file
        try:
            from hermes_cli.config import load_config
            config = load_config()
        except ImportError:
-            return "auto", model
+            config = {}

-        # Check auxiliary.{task} section
-        aux = config.get("auxiliary", {})
-        task_config = aux.get(task, {})
-        cfg_provider = task_config.get("provider", "").strip() or None
-        cfg_model = task_config.get("model", "").strip() or None
+        aux = config.get("auxiliary", {}) if isinstance(config, dict) else {}
+        task_config = aux.get(task, {}) if isinstance(aux, dict) else {}
+        if not isinstance(task_config, dict):
+            task_config = {}
+        cfg_provider = str(task_config.get("provider", "")).strip() or None
+        cfg_model = str(task_config.get("model", "")).strip() or None
+        cfg_base_url = str(task_config.get("base_url", "")).strip() or None
+        cfg_api_key = str(task_config.get("api_key", "")).strip() or None

        # Backwards compat: compression section has its own keys
        if task == "compression" and not cfg_provider:
-            comp = config.get("compression", {})
-            cfg_provider = comp.get("summary_provider", "").strip() or None
-            cfg_model = cfg_model or comp.get("summary_model", "").strip() or None
+            comp = config.get("compression", {}) if isinstance(config, dict) else {}
+            if isinstance(comp, dict):
+                cfg_provider = comp.get("summary_provider", "").strip() or None
+                cfg_model = cfg_model or comp.get("summary_model", "").strip() or None

+    env_model = _get_auxiliary_env_override(task, "MODEL") if task else None
+    resolved_model = model or env_model or cfg_model
+
+    if base_url:
+        return "custom", resolved_model, base_url, api_key
+    if provider:
+        return provider, resolved_model, base_url, api_key
+
+    if task:
+        env_base_url = _get_auxiliary_env_override(task, "BASE_URL")
+        env_api_key = _get_auxiliary_env_override(task, "API_KEY")
+        if env_base_url:
+            return "custom", resolved_model, env_base_url, env_api_key or cfg_api_key
+
+        env_provider = _get_auxiliary_provider(task)
+        if env_provider != "auto":
+            return env_provider, resolved_model, None, None
+
+        if cfg_base_url:
+            return "custom", resolved_model, cfg_base_url, cfg_api_key
        if cfg_provider and cfg_provider != "auto":
-            return cfg_provider, model or cfg_model
-        return "auto", model or cfg_model
+            return cfg_provider, resolved_model, None, None
+        return "auto", resolved_model, None, None

-    return "auto", model
+    return "auto", resolved_model, None, None


 def _build_call_kwargs(
@ -994,6 +1081,7 @@ def _build_call_kwargs(
    tools: Optional[list] = None,
    timeout: float = 30.0,
    extra_body: Optional[dict] = None,
+    base_url: Optional[str] = None,
 ) -> dict:
    """Build kwargs for .chat.completions.create() with model/provider adjustments."""
    kwargs: Dict[str, Any] = {
@ -1009,7 +1097,7 @@ def _build_call_kwargs(
        # Codex adapter handles max_tokens internally; OpenRouter/Nous use max_tokens.
        # Direct OpenAI api.openai.com with newer models needs max_completion_tokens.
        if provider == "custom":
-            custom_base = os.getenv("OPENAI_BASE_URL", "")
+            custom_base = base_url or os.getenv("OPENAI_BASE_URL", "")
            if "api.openai.com" in custom_base.lower():
                kwargs["max_completion_tokens"] = max_tokens
            else:
@ -1035,6 +1123,8 @@ def call_llm(
    *,
    provider: str = None,
    model: str = None,
+    base_url: str = None,
+    api_key: str = None,
    messages: list,
    temperature: float = None,
    max_tokens: int = None,
@ -1066,16 +1156,18 @@ def call_llm(
    Raises:
        RuntimeError: If no provider is configured.
    """
-    resolved_provider, resolved_model = _resolve_task_provider_model(
-        task, provider, model)
+    resolved_provider, resolved_model, resolved_base_url, resolved_api_key = _resolve_task_provider_model(
+        task, provider, model, base_url, api_key)

    if task == "vision":
        effective_provider, client, final_model = resolve_vision_provider_client(
-            provider=resolved_provider,
-            model=resolved_model,
+            provider=provider,
+            model=model,
+            base_url=base_url,
+            api_key=api_key,
            async_mode=False,
        )
-        if client is None and resolved_provider != "auto":
+        if client is None and resolved_provider != "auto" and not resolved_base_url:
            logger.warning(
                "Vision provider %s unavailable, falling back to auto vision backends",
                resolved_provider,
@ -1092,10 +1184,15 @@ def call_llm(
            )
        resolved_provider = effective_provider or resolved_provider
    else:
-        client, final_model = _get_cached_client(resolved_provider, resolved_model)
+        client, final_model = _get_cached_client(
+            resolved_provider,
+            resolved_model,
+            base_url=resolved_base_url,
+            api_key=resolved_api_key,
+        )
        if client is None:
            # Fallback: try openrouter
-            if resolved_provider != "openrouter":
+            if resolved_provider != "openrouter" and not resolved_base_url:
                logger.warning("Provider %s unavailable, falling back to openrouter",
                               resolved_provider)
                client, final_model = _get_cached_client(
@ -1108,7 +1205,8 @@ def call_llm(
    kwargs = _build_call_kwargs(
        resolved_provider, final_model, messages,
        temperature=temperature, max_tokens=max_tokens,
-        tools=tools, timeout=timeout, extra_body=extra_body)
+        tools=tools, timeout=timeout, extra_body=extra_body,
+        base_url=resolved_base_url)

    # Handle max_tokens vs max_completion_tokens retry
    try:
@ -1127,6 +1225,8 @@ async def async_call_llm(
    *,
    provider: str = None,
    model: str = None,
+    base_url: str = None,
+    api_key: str = None,
    messages: list,
    temperature: float = None,
    max_tokens: int = None,
@ -1138,16 +1238,18 @@ async def async_call_llm(

    Same as call_llm() but async. See call_llm() for full documentation.
    """
-    resolved_provider, resolved_model = _resolve_task_provider_model(
-        task, provider, model)
+    resolved_provider, resolved_model, resolved_base_url, resolved_api_key = _resolve_task_provider_model(
+        task, provider, model, base_url, api_key)

    if task == "vision":
        effective_provider, client, final_model = resolve_vision_provider_client(
-            provider=resolved_provider,
-            model=resolved_model,
+            provider=provider,
+            model=model,
+            base_url=base_url,
+            api_key=api_key,
            async_mode=True,
        )
-        if client is None and resolved_provider != "auto":
+        if client is None and resolved_provider != "auto" and not resolved_base_url:
            logger.warning(
                "Vision provider %s unavailable, falling back to auto vision backends",
                resolved_provider,
@ -1165,9 +1267,14 @@ async def async_call_llm(
        resolved_provider = effective_provider or resolved_provider
    else:
        client, final_model = _get_cached_client(
-            resolved_provider, resolved_model, async_mode=True)
+            resolved_provider,
+            resolved_model,
+            async_mode=True,
+            base_url=resolved_base_url,
+            api_key=resolved_api_key,
+        )
        if client is None:
-            if resolved_provider != "openrouter":
+            if resolved_provider != "openrouter" and not resolved_base_url:
                logger.warning("Provider %s unavailable, falling back to openrouter",
                               resolved_provider)
                client, final_model = _get_cached_client(
@ -1181,7 +1288,8 @@ async def async_call_llm(
    kwargs = _build_call_kwargs(
        resolved_provider, final_model, messages,
        temperature=temperature, max_tokens=max_tokens,
-        tools=tools, timeout=timeout, extra_body=extra_body)
+        tools=tools, timeout=timeout, extra_body=extra_body,
+        base_url=resolved_base_url)

    try:
        return await client.chat.completions.create(**kwargs)