merge: resolve conflict with main in subagent interrupt test

2026-03-12 16:28:57 -04:00 · 2026-03-12 16:28:57 -04:00 · fefc709b2c
commit fefc709b2c
parent 45d3e83ad1 e004c094ea
75 changed files with 8124 additions and 1376 deletions
--- a/agent/auxiliary_client.py
+++ b/agent/auxiliary_client.py
@ -17,7 +17,10 @@ Resolution order for text tasks (auto mode):
 Resolution order for vision/multimodal tasks (auto mode):
  1. OpenRouter
  2. Nous Portal
-  3. None  (steps 3-5 are skipped — they may not support multimodal)
+  3. Codex OAuth (gpt-5.3-codex supports vision via Responses API)
+  4. Custom endpoint (for local vision models: Qwen-VL, LLaVA, Pixtral, etc.)
+  5. None  (API-key providers like z.ai/Kimi/MiniMax are skipped —
+     they may not support multimodal)

 Per-task provider overrides (e.g. AUXILIARY_VISION_PROVIDER,
 CONTEXT_COMPRESSION_PROVIDER) can force a specific provider for each task:
@ -440,7 +443,7 @@ def _try_custom_endpoint() -> Tuple[Optional[OpenAI], Optional[str]]:
    custom_key = os.getenv("OPENAI_API_KEY")
    if not custom_base or not custom_key:
        return None, None
-    model = os.getenv("OPENAI_MODEL") or os.getenv("LLM_MODEL") or "gpt-4o-mini"
+    model = os.getenv("OPENAI_MODEL") or "gpt-4o-mini"
    logger.debug("Auxiliary client: custom endpoint (%s)", model)
    return OpenAI(api_key=custom_key, base_url=custom_base), model

@ -499,6 +502,205 @@ def _resolve_auto() -> Tuple[Optional[OpenAI], Optional[str]]:
    return None, None


+# ── Centralized Provider Router ─────────────────────────────────────────────
+#
+# resolve_provider_client() is the single entry point for creating a properly
+# configured client given a (provider, model) pair.  It handles auth lookup,
+# base URL resolution, provider-specific headers, and API format differences
+# (Chat Completions vs Responses API for Codex).
+#
+# All auxiliary consumer code should go through this or the public helpers
+# below — never look up auth env vars ad-hoc.
+
+
+def _to_async_client(sync_client, model: str):
+    """Return ``(async_client, model)`` mirroring *sync_client*.
+
+    A ``CodexAuxiliaryClient`` is re-wrapped in ``AsyncCodexAuxiliaryClient``
+    so Codex routing is kept.  Any other client is rebuilt as an
+    ``AsyncOpenAI`` with the same API key and base URL, re-attaching the
+    OpenRouter or Kimi default headers when the base URL matches.
+    ``model`` is passed through unchanged.
+    """
+    from openai import AsyncOpenAI
+
+    if isinstance(sync_client, CodexAuxiliaryClient):
+        return AsyncCodexAuxiliaryClient(sync_client), model
+
+    api_key = sync_client.api_key
+    base_url = str(sync_client.base_url)
+    url_lower = base_url.lower()
+
+    # Provider headers are re-derived from the URL rather than copied over.
+    if "openrouter" in url_lower:
+        header_kwargs = {"default_headers": dict(_OR_HEADERS)}
+    elif "api.kimi.com" in url_lower:
+        header_kwargs = {"default_headers": {"User-Agent": "KimiCLI/1.0"}}
+    else:
+        header_kwargs = {}
+
+    return AsyncOpenAI(api_key=api_key, base_url=base_url, **header_kwargs), model
+
+
+def resolve_provider_client(
+    provider: str,
+    model: str = None,
+    async_mode: bool = False,
+    raw_codex: bool = False,
+) -> Tuple[Optional[Any], Optional[str]]:
+    """Central router: given a provider name and optional model, return a
+    configured client with the correct auth, base URL, and API format.
+
+    The returned client always exposes ``.chat.completions.create()`` — for
+    Codex/Responses API providers, an adapter handles the translation
+    transparently.
+
+    Args:
+        provider: Provider identifier.  One of:
+            "openrouter", "nous", "openai-codex" (or "codex"),
+            "zai", "kimi-coding", "minimax", "minimax-cn",
+            "custom" (OPENAI_BASE_URL + OPENAI_API_KEY),
+            "auto" (full auto-detection chain).
+            "main" is accepted as an alias for "custom"; an empty/None
+            value is treated as "auto".  Matching is case-insensitive.
+        model: Model slug override.  If None, uses the provider's default
+               auxiliary model.
+        async_mode: If True, return an async-compatible client.
+        raw_codex: If True, return a raw OpenAI client for Codex providers
+            instead of wrapping in CodexAuxiliaryClient.  Use this when
+            the caller needs direct access to responses.stream() (e.g.,
+            the main agent loop).
+
+    Returns:
+        (client, resolved_model) or (None, None) if auth is unavailable.
+    """
+    # Normalise aliases
+    provider = (provider or "auto").strip().lower()
+    if provider == "codex":
+        provider = "openai-codex"
+    if provider == "main":
+        provider = "custom"
+
+    # ── Auto: try all providers in priority order ────────────────────
+    if provider == "auto":
+        client, resolved = _resolve_auto()
+        if client is None:
+            return None, None
+        final_model = model or resolved
+        return (_to_async_client(client, final_model) if async_mode
+                else (client, final_model))
+
+    # ── OpenRouter ───────────────────────────────────────────────────
+    if provider == "openrouter":
+        client, default = _try_openrouter()
+        if client is None:
+            logger.warning("resolve_provider_client: openrouter requested "
+                           "but OPENROUTER_API_KEY not set")
+            return None, None
+        final_model = model or default
+        return (_to_async_client(client, final_model) if async_mode
+                else (client, final_model))
+
+    # ── Nous Portal (OAuth) ──────────────────────────────────────────
+    if provider == "nous":
+        client, default = _try_nous()
+        if client is None:
+            logger.warning("resolve_provider_client: nous requested "
+                           "but Nous Portal not configured (run: hermes login)")
+            return None, None
+        final_model = model or default
+        return (_to_async_client(client, final_model) if async_mode
+                else (client, final_model))
+
+    # ── OpenAI Codex (OAuth → Responses API) ─────────────────────────
+    if provider == "openai-codex":
+        if raw_codex:
+            # Return the raw OpenAI client for callers that need direct
+            # access to responses.stream() (e.g., the main agent loop).
+            # NOTE: async_mode is not consulted on this path — a sync
+            # OpenAI client is returned even when async_mode=True.
+            codex_token = _read_codex_access_token()
+            if not codex_token:
+                logger.warning("resolve_provider_client: openai-codex requested "
+                               "but no Codex OAuth token found (run: hermes model)")
+                return None, None
+            final_model = model or _CODEX_AUX_MODEL
+            raw_client = OpenAI(api_key=codex_token, base_url=_CODEX_AUX_BASE_URL)
+            return (raw_client, final_model)
+        # Standard path: wrap in CodexAuxiliaryClient adapter
+        client, default = _try_codex()
+        if client is None:
+            logger.warning("resolve_provider_client: openai-codex requested "
+                           "but no Codex OAuth token found (run: hermes model)")
+            return None, None
+        final_model = model or default
+        return (_to_async_client(client, final_model) if async_mode
+                else (client, final_model))
+
+    # ── Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY) ───────────
+    if provider == "custom":
+        # Try custom first, then codex, then API-key providers
+        # The first helper that yields a client wins; its default model is
+        # used unless the caller supplied one.
+        for try_fn in (_try_custom_endpoint, _try_codex,
+                       _resolve_api_key_provider):
+            client, default = try_fn()
+            if client is not None:
+                final_model = model or default
+                return (_to_async_client(client, final_model) if async_mode
+                        else (client, final_model))
+        logger.warning("resolve_provider_client: custom/main requested "
+                       "but no endpoint credentials found")
+        return None, None
+
+    # ── API-key providers from PROVIDER_REGISTRY ─────────────────────
+    # Imported lazily; when hermes_cli.auth cannot be imported the provider
+    # is reported as unavailable rather than raising.
+    try:
+        from hermes_cli.auth import PROVIDER_REGISTRY, _resolve_kimi_base_url
+    except ImportError:
+        logger.debug("hermes_cli.auth not available for provider %s", provider)
+        return None, None
+
+    pconfig = PROVIDER_REGISTRY.get(provider)
+    if pconfig is None:
+        logger.warning("resolve_provider_client: unknown provider %r", provider)
+        return None, None
+
+    if pconfig.auth_type == "api_key":
+        # Find the first configured API key
+        api_key = ""
+        for env_var in pconfig.api_key_env_vars:
+            api_key = os.getenv(env_var, "").strip()
+            if api_key:
+                break
+        if not api_key:
+            logger.warning("resolve_provider_client: provider %s has no API "
+                           "key configured (tried: %s)",
+                           provider, ", ".join(pconfig.api_key_env_vars))
+            return None, None
+
+        # Resolve base URL (env override → provider-specific logic → default)
+        base_url_override = os.getenv(pconfig.base_url_env_var, "").strip() if pconfig.base_url_env_var else ""
+        if provider == "kimi-coding":
+            base_url = _resolve_kimi_base_url(api_key, pconfig.inference_base_url, base_url_override)
+        elif base_url_override:
+            base_url = base_url_override
+        else:
+            base_url = pconfig.inference_base_url
+
+        # Providers without an entry fall back to "" as the default model,
+        # so final_model may be an empty string when no model is passed.
+        default_model = _API_KEY_PROVIDER_AUX_MODELS.get(provider, "")
+        final_model = model or default_model
+
+        # Provider-specific headers
+        headers = {}
+        if "api.kimi.com" in base_url.lower():
+            headers["User-Agent"] = "KimiCLI/1.0"
+
+        client = OpenAI(api_key=api_key, base_url=base_url,
+                        **({"default_headers": headers} if headers else {}))
+        logger.debug("resolve_provider_client: %s (%s)", provider, final_model)
+        return (_to_async_client(client, final_model) if async_mode
+                else (client, final_model))
+
+    elif pconfig.auth_type in ("oauth_device_code", "oauth_external"):
+        # OAuth providers — route through their specific try functions
+        # NOTE(review): "nous" and "openai-codex" always return from their
+        # dedicated branches earlier in this function, so the two checks
+        # below look unreachable; they also do not forward raw_codex.
+        if provider == "nous":
+            return resolve_provider_client("nous", model, async_mode)
+        if provider == "openai-codex":
+            return resolve_provider_client("openai-codex", model, async_mode)
+        # Other OAuth providers not directly supported
+        logger.warning("resolve_provider_client: OAuth provider %s not "
+                       "directly supported, try 'auto'", provider)
+        return None, None
+
+    logger.warning("resolve_provider_client: unhandled auth_type %s for %s",
+                   pconfig.auth_type, provider)
+    return None, None
+
+
 # ── Public API ──────────────────────────────────────────────────────────────

 def get_text_auxiliary_client(task: str = "") -> Tuple[Optional[OpenAI], Optional[str]]:
@ -513,8 +715,8 @@ def get_text_auxiliary_client(task: str = "") -> Tuple[Optional[OpenAI], Optiona
    """
    forced = _get_auxiliary_provider(task)
    if forced != "auto":
-        return _resolve_forced_provider(forced)
-    return _resolve_auto()
+        return resolve_provider_client(forced)
+    return resolve_provider_client("auto")


 def get_async_text_auxiliary_client(task: str = ""):
@ -524,24 +726,10 @@ def get_async_text_auxiliary_client(task: str = ""):
    (AsyncCodexAuxiliaryClient, model) which wraps the Responses API.
    Returns (None, None) when no provider is available.
    """
-    from openai import AsyncOpenAI
-
-    sync_client, model = get_text_auxiliary_client(task)
-    if sync_client is None:
-        return None, None
-
-    if isinstance(sync_client, CodexAuxiliaryClient):
-        return AsyncCodexAuxiliaryClient(sync_client), model
-
-    async_kwargs = {
-        "api_key": sync_client.api_key,
-        "base_url": str(sync_client.base_url),
-    }
-    if "openrouter" in str(sync_client.base_url).lower():
-        async_kwargs["default_headers"] = dict(_OR_HEADERS)
-    elif "api.kimi.com" in str(sync_client.base_url).lower():
-        async_kwargs["default_headers"] = {"User-Agent": "KimiCLI/1.0"}
-    return AsyncOpenAI(**async_kwargs), model
+    forced = _get_auxiliary_provider(task)
+    if forced != "auto":
+        return resolve_provider_client(forced, async_mode=True)
+    return resolve_provider_client("auto", async_mode=True)


 def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]:
@ -559,7 +747,7 @@ def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]:
    """
    forced = _get_auxiliary_provider("vision")
    if forced != "auto":
-        return _resolve_forced_provider(forced)
+        return resolve_provider_client(forced)
    # Auto: try providers known to support multimodal first, then fall
    # back to the user's custom endpoint.  Many local models (Qwen-VL,
    # LLaVA, Pixtral, etc.) support vision — skipping them entirely
@ -573,6 +761,21 @@ def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]:
    return None, None


+def get_async_vision_auxiliary_client():
+    """Async counterpart of ``get_vision_auxiliary_client()``.
+
+    Resolves the sync vision client first and converts it with
+    ``_to_async_client``, so a Codex adapter stays wrapped in its async
+    Responses-API adapter instead of being flattened into a plain
+    AsyncOpenAI client.
+
+    Returns:
+        (async_client, model_slug), or (None, None) when no vision
+        provider is available.
+    """
+    resolved = get_vision_auxiliary_client()
+    if resolved[0] is None:
+        return None, None
+    return _to_async_client(resolved[0], resolved[1])
+
+
 def get_auxiliary_extra_body() -> dict:
    """Return extra_body kwargs for auxiliary API calls.
    
@ -598,3 +801,253 @@ def auxiliary_max_tokens_param(value: int) -> dict:
            and "api.openai.com" in custom_base.lower()):
        return {"max_completion_tokens": value}
    return {"max_tokens": value}
+
+
+# ── Centralized LLM Call API ────────────────────────────────────────────────
+#
+# call_llm() and async_call_llm() own the full request lifecycle:
+#   1. Resolve provider + model from task config (or explicit args)
+#   2. Get or create a cached client for that provider
+#   3. Format request args for the provider + model (max_tokens handling, etc.)
+#   4. Make the API call
+#   5. Return the response
+#
+# Every auxiliary LLM consumer should use these instead of manually
+# constructing clients and calling .chat.completions.create().
+
+# Client cache: (provider, async_mode) -> (client, provider_default_model)
+_client_cache: Dict[tuple, tuple] = {}
+
+
+def _get_cached_client(
+    provider: str, model: str = None, async_mode: bool = False,
+) -> Tuple[Optional[Any], Optional[str]]:
+    """Get or create a cached client for the given provider.
+
+    Clients are cached per ``(provider, async_mode)``; failed resolutions
+    are not cached, so a later call retries.
+
+    Args:
+        provider: Provider identifier understood by resolve_provider_client().
+        model: Per-call model override.  It never influences what is cached.
+        async_mode: Whether an async-compatible client is wanted.
+
+    Returns:
+        (client, model_to_use) where model_to_use is ``model`` if given,
+        otherwise the provider's default.  ``client`` is None when the
+        provider could not be resolved.
+    """
+    cache_key = (provider, async_mode)
+    cached = _client_cache.get(cache_key)
+    if cached is None:
+        # Resolve with model=None so the cached default is the provider's
+        # own default.  Passing the caller's model here would store that
+        # override as the "default" and leak it into later calls that pass
+        # model=None.
+        client, default_model = resolve_provider_client(
+            provider, None, async_mode)
+        if client is None:
+            return None, model or default_model
+        cached = (client, default_model)
+        _client_cache[cache_key] = cached
+    client, default_model = cached
+    return client, model or default_model
+
+
+def _resolve_task_provider_model(
+    task: str = None,
+    provider: str = None,
+    model: str = None,
+) -> Tuple[str, Optional[str]]:
+    """Determine provider + model for a call.
+
+    Priority:
+      1. Explicit provider/model args (always win)
+      2. Env var overrides (AUXILIARY_{TASK}_PROVIDER, etc.)
+      3. Config file (auxiliary.{task}.provider/model or compression.*)
+      4. "auto" (full auto-detection chain)
+
+    Config values that are missing, None, blank, or not strings are treated
+    as "not set" instead of raising.
+
+    Returns (provider, model) where model may be None (use provider default).
+    """
+    def _clean(value) -> Optional[str]:
+        # Config entries can be None or non-string (e.g. an empty YAML key).
+        if not value:
+            return None
+        return str(value).strip() or None
+
+    if provider:
+        return provider, model
+
+    if not task:
+        return "auto", model
+
+    # Check env var overrides first
+    env_provider = _get_auxiliary_provider(task)
+    if env_provider != "auto":
+        # Check for env var model override too
+        env_model = None
+        for prefix in ("AUXILIARY_", "CONTEXT_"):
+            val = os.getenv(f"{prefix}{task.upper()}_MODEL", "").strip()
+            if val:
+                env_model = val
+                break
+        return env_provider, model or env_model
+
+    # Read from config file
+    try:
+        from hermes_cli.config import load_config
+        config = load_config() or {}
+    except ImportError:
+        return "auto", model
+
+    # Check auxiliary.{task} section ("or {}" guards keys present but null)
+    aux = config.get("auxiliary") or {}
+    task_config = aux.get(task) or {}
+    cfg_provider = _clean(task_config.get("provider"))
+    cfg_model = _clean(task_config.get("model"))
+
+    # Backwards compat: compression section has its own keys
+    if task == "compression" and not cfg_provider:
+        comp = config.get("compression") or {}
+        cfg_provider = _clean(comp.get("summary_provider"))
+        cfg_model = cfg_model or _clean(comp.get("summary_model"))
+
+    if cfg_provider and cfg_provider != "auto":
+        return cfg_provider, model or cfg_model
+    return "auto", model or cfg_model
+
+
+def _build_call_kwargs(
+    provider: str,
+    model: str,
+    messages: list,
+    temperature: Optional[float] = None,
+    max_tokens: Optional[int] = None,
+    tools: Optional[list] = None,
+    timeout: float = 30.0,
+    extra_body: Optional[dict] = None,
+) -> dict:
+    """Build kwargs for .chat.completions.create() with model/provider adjustments.
+
+    Args:
+        provider: Provider the request is sent to; selects the token-limit
+            parameter name and whether Nous tags are attached.
+        model: Model slug placed in the request.
+        messages: Chat messages list, passed through as-is.
+        temperature: Included only when not None.
+        max_tokens: Included only when not None, as ``max_completion_tokens``
+            for a "custom" provider whose OPENAI_BASE_URL points at
+            api.openai.com and as ``max_tokens`` otherwise.
+        tools: Included only when non-empty.
+        timeout: Request timeout in seconds.
+        extra_body: Extra request-body fields.  Never mutated; a copy is
+            placed in the result.
+
+    Returns:
+        A new kwargs dict for ``client.chat.completions.create(**kwargs)``.
+    """
+    kwargs: Dict[str, Any] = {
+        "model": model,
+        "messages": messages,
+        "timeout": timeout,
+    }
+
+    if temperature is not None:
+        kwargs["temperature"] = temperature
+
+    if max_tokens is not None:
+        # Codex adapter handles max_tokens internally; OpenRouter/Nous use max_tokens.
+        # Direct OpenAI api.openai.com with newer models needs max_completion_tokens.
+        token_param = "max_tokens"
+        if provider == "custom":
+            custom_base = os.getenv("OPENAI_BASE_URL", "")
+            if "api.openai.com" in custom_base.lower():
+                token_param = "max_completion_tokens"
+        kwargs[token_param] = max_tokens
+
+    if tools:
+        kwargs["tools"] = tools
+
+    # Provider-specific extra_body
+    merged_extra = dict(extra_body or {})
+    # auxiliary_is_nous is a module-level flag defined outside this block —
+    # presumably set when the resolved auxiliary client is Nous; TODO confirm.
+    if provider == "nous" or auxiliary_is_nous:
+        # dict() above is a shallow copy, so build a fresh list rather than
+        # extending in place — otherwise the caller's own "tags" list would
+        # grow on every call.
+        merged_extra["tags"] = [
+            *(merged_extra.get("tags") or []),
+            "product=hermes-agent",
+        ]
+    if merged_extra:
+        kwargs["extra_body"] = merged_extra
+
+    return kwargs
+
+
+def call_llm(
+    task: str = None,
+    *,
+    provider: str = None,
+    model: str = None,
+    messages: list,
+    temperature: float = None,
+    max_tokens: int = None,
+    tools: list = None,
+    timeout: float = 30.0,
+    extra_body: dict = None,
+) -> Any:
+    """Centralized synchronous LLM call.
+
+    Resolves provider + model (from task config, explicit args, or auto-detect),
+    handles auth, request formatting, and model-specific arg adjustments.
+    If the resolved provider is unavailable, OpenRouter is tried as a
+    fallback and the request is formatted for OpenRouter.
+
+    Args:
+        task: Auxiliary task name ("compression", "vision", "web_extract",
+              "session_search", "skills_hub", "mcp", "flush_memories").
+              Reads provider:model from config/env. Ignored if provider is set.
+        provider: Explicit provider override.
+        model: Explicit model override.
+        messages: Chat messages list.
+        temperature: Sampling temperature (None = provider default).
+        max_tokens: Max output tokens (handles max_tokens vs max_completion_tokens).
+        tools: Tool definitions (for function calling).
+        timeout: Request timeout in seconds.
+        extra_body: Additional request body fields.
+
+    Returns:
+        Response object with .choices[0].message.content
+
+    Raises:
+        RuntimeError: If no provider is configured.
+        Exception: Any error from the API call is re-raised, except that a
+            rejected ``max_tokens`` parameter is retried once as
+            ``max_completion_tokens``.
+    """
+    resolved_provider, resolved_model = _resolve_task_provider_model(
+        task, provider, model)
+
+    # Provider the request is actually sent to; differs from
+    # resolved_provider only when the OpenRouter fallback is used.
+    effective_provider = resolved_provider
+    client, final_model = _get_cached_client(resolved_provider, resolved_model)
+    if client is None and resolved_provider != "openrouter":
+        # Fallback: try openrouter
+        logger.warning("Provider %s unavailable, falling back to openrouter",
+                       resolved_provider)
+        client, final_model = _get_cached_client(
+            "openrouter", resolved_model or _OPENROUTER_MODEL)
+        effective_provider = "openrouter"
+    if client is None:
+        raise RuntimeError(
+            f"No LLM provider configured for task={task} provider={resolved_provider}. "
+            f"Run: hermes setup")
+
+    # Format for the provider that will receive the request, so a fallback
+    # call does not carry the unavailable provider's parameter conventions.
+    kwargs = _build_call_kwargs(
+        effective_provider, final_model, messages,
+        temperature=temperature, max_tokens=max_tokens,
+        tools=tools, timeout=timeout, extra_body=extra_body)
+
+    # Handle max_tokens vs max_completion_tokens retry
+    try:
+        return client.chat.completions.create(**kwargs)
+    except Exception as first_err:
+        err_str = str(first_err)
+        param_rejected = ("max_tokens" in err_str
+                          or "unsupported_parameter" in err_str)
+        # Only retry when max_tokens was actually sent; otherwise the retry
+        # would repeat the same failing request (plus a null
+        # max_completion_tokens) and mask the original error.
+        if param_rejected and "max_tokens" in kwargs:
+            kwargs["max_completion_tokens"] = kwargs.pop("max_tokens")
+            return client.chat.completions.create(**kwargs)
+        raise
+
+
+async def async_call_llm(
+    task: str = None,
+    *,
+    provider: str = None,
+    model: str = None,
+    messages: list,
+    temperature: float = None,
+    max_tokens: int = None,
+    tools: list = None,
+    timeout: float = 30.0,
+    extra_body: dict = None,
+) -> Any:
+    """Centralized asynchronous LLM call.
+
+    Same as call_llm() but async. See call_llm() for full documentation.
+
+    Resolves provider + model, obtains a cached async client (falling back
+    to OpenRouter when the resolved provider is unavailable), builds the
+    request kwargs for the provider actually used, and awaits the call.
+
+    Raises:
+        RuntimeError: If no provider is configured.
+        Exception: Any error from the API call is re-raised, except that a
+            rejected ``max_tokens`` parameter is retried once as
+            ``max_completion_tokens``.
+    """
+    resolved_provider, resolved_model = _resolve_task_provider_model(
+        task, provider, model)
+
+    # Provider the request is actually sent to; differs from
+    # resolved_provider only when the OpenRouter fallback is used.
+    effective_provider = resolved_provider
+    client, final_model = _get_cached_client(
+        resolved_provider, resolved_model, async_mode=True)
+    if client is None and resolved_provider != "openrouter":
+        logger.warning("Provider %s unavailable, falling back to openrouter",
+                       resolved_provider)
+        client, final_model = _get_cached_client(
+            "openrouter", resolved_model or _OPENROUTER_MODEL,
+            async_mode=True)
+        effective_provider = "openrouter"
+    if client is None:
+        raise RuntimeError(
+            f"No LLM provider configured for task={task} provider={resolved_provider}. "
+            f"Run: hermes setup")
+
+    # Format for the provider that will receive the request, so a fallback
+    # call does not carry the unavailable provider's parameter conventions.
+    kwargs = _build_call_kwargs(
+        effective_provider, final_model, messages,
+        temperature=temperature, max_tokens=max_tokens,
+        tools=tools, timeout=timeout, extra_body=extra_body)
+
+    try:
+        return await client.chat.completions.create(**kwargs)
+    except Exception as first_err:
+        err_str = str(first_err)
+        param_rejected = ("max_tokens" in err_str
+                          or "unsupported_parameter" in err_str)
+        # Only retry when max_tokens was actually sent; otherwise the retry
+        # would repeat the same failing request (plus a null
+        # max_completion_tokens) and mask the original error.
+        if param_rejected and "max_tokens" in kwargs:
+            kwargs["max_completion_tokens"] = kwargs.pop("max_tokens")
+            return await client.chat.completions.create(**kwargs)
+        raise
--- a/agent/context_compressor.py
+++ b/agent/context_compressor.py
@ -9,7 +9,7 @@ import logging
 import os
 from typing import Any, Dict, List, Optional

-from agent.auxiliary_client import get_text_auxiliary_client
+from agent.auxiliary_client import call_llm
 from agent.model_metadata import (
    get_model_context_length,
    estimate_messages_tokens_rough,
@ -53,8 +53,7 @@ class ContextCompressor:
        self.last_completion_tokens = 0
        self.last_total_tokens = 0

-        self.client, default_model = get_text_auxiliary_client("compression")
-        self.summary_model = summary_model_override or default_model
+        self.summary_model = summary_model_override or ""

    def update_from_response(self, usage: Dict[str, Any]):
        """Update tracked token usage from API response."""
@ -120,84 +119,30 @@ TURNS TO SUMMARIZE:

 Write only the summary, starting with "[CONTEXT SUMMARY]:" prefix."""

-        # 1. Try the auxiliary model (cheap/fast)
-        if self.client:
-            try:
-                return self._call_summary_model(self.client, self.summary_model, prompt)
-            except Exception as e:
-                logging.warning(f"Failed to generate context summary with auxiliary model: {e}")
-
-        # 2. Fallback: try the user's main model endpoint
-        fallback_client, fallback_model = self._get_fallback_client()
-        if fallback_client is not None:
-            try:
-                logger.info("Retrying context summary with main model (%s)", fallback_model)
-                summary = self._call_summary_model(fallback_client, fallback_model, prompt)
-                self.client = fallback_client
-                self.summary_model = fallback_model
-                return summary
-            except Exception as fallback_err:
-                logging.warning(f"Main model summary also failed: {fallback_err}")
-
-        # 3. All models failed — return None so the caller drops turns without a summary
-        logging.warning("Context compression: no model available for summary. Middle turns will be dropped without summary.")
-        return None
-
-    def _call_summary_model(self, client, model: str, prompt: str) -> str:
-        """Make the actual LLM call to generate a summary. Raises on failure."""
-        kwargs = {
-            "model": model,
-            "messages": [{"role": "user", "content": prompt}],
-            "temperature": 0.3,
-            "timeout": 30.0,
-        }
-        # Most providers (OpenRouter, local models) use max_tokens.
-        # Direct OpenAI with newer models (gpt-4o, o-series, gpt-5+)
-        # requires max_completion_tokens instead.
+        # Use the centralized LLM router — handles provider resolution,
+        # auth, and fallback internally.
        try:
-            kwargs["max_tokens"] = self.summary_target_tokens * 2
-            response = client.chat.completions.create(**kwargs)
-        except Exception as first_err:
-            if "max_tokens" in str(first_err) or "unsupported_parameter" in str(first_err):
-                kwargs.pop("max_tokens", None)
-                kwargs["max_completion_tokens"] = self.summary_target_tokens * 2
-                response = client.chat.completions.create(**kwargs)
-            else:
-                raise
-
-        summary = response.choices[0].message.content.strip()
-        if not summary.startswith("[CONTEXT SUMMARY]:"):
-            summary = "[CONTEXT SUMMARY]: " + summary
-        return summary
-
-    def _get_fallback_client(self):
-        """Try to build a fallback client from the main model's endpoint config.
-
-        When the primary auxiliary client fails (e.g. stale OpenRouter key), this
-        creates a client using the user's active custom endpoint (OPENAI_BASE_URL)
-        so compression can still produce a real summary instead of a static string.
-
-        Returns (client, model) or (None, None).
-        """
-        custom_base = os.getenv("OPENAI_BASE_URL")
-        custom_key = os.getenv("OPENAI_API_KEY")
-        if not custom_base or not custom_key:
-            return None, None
-
-        # Don't fallback to the same provider that just failed
-        from hermes_constants import OPENROUTER_BASE_URL
-        if custom_base.rstrip("/") == OPENROUTER_BASE_URL.rstrip("/"):
-            return None, None
-
-        model = os.getenv("LLM_MODEL") or os.getenv("OPENAI_MODEL") or self.model
-        try:
-            from openai import OpenAI as _OpenAI
-            client = _OpenAI(api_key=custom_key, base_url=custom_base)
-            logger.debug("Built fallback auxiliary client: %s via %s", model, custom_base)
-            return client, model
-        except Exception as exc:
-            logger.debug("Could not build fallback auxiliary client: %s", exc)
-            return None, None
+            call_kwargs = {
+                "task": "compression",
+                "messages": [{"role": "user", "content": prompt}],
+                "temperature": 0.3,
+                "max_tokens": self.summary_target_tokens * 2,
+                "timeout": 30.0,
+            }
+            if self.summary_model:
+                call_kwargs["model"] = self.summary_model
+            response = call_llm(**call_kwargs)
+            summary = response.choices[0].message.content.strip()
+            if not summary.startswith("[CONTEXT SUMMARY]:"):
+                summary = "[CONTEXT SUMMARY]: " + summary
+            return summary
+        except RuntimeError:
+            logging.warning("Context compression: no provider available for "
+                            "summary. Middle turns will be dropped without summary.")
+            return None
+        except Exception as e:
+            logging.warning("Failed to generate context summary: %s", e)
+            return None

    # ------------------------------------------------------------------
    # Tool-call / tool-result pair integrity helpers
--- a/agent/model_metadata.py
+++ b/agent/model_metadata.py
@ -53,8 +53,10 @@ DEFAULT_CONTEXT_LENGTHS = {
    "glm-5": 202752,
    "glm-4.5": 131072,
    "glm-4.5-flash": 131072,
+    "kimi-for-coding": 262144,
    "kimi-k2.5": 262144,
    "kimi-k2-thinking": 262144,
+    "kimi-k2-thinking-turbo": 262144,
    "kimi-k2-turbo-preview": 262144,
    "kimi-k2-0905-preview": 131072,
    "MiniMax-M2.5": 204800,