fix(anthropic): revert inline vision, add hermes model flow, wire vision aux

Feedback fixes:

1. Revert _convert_vision_content — vision is handled by the vision_analyze
   tool, not by converting image blocks inline in conversation messages.
   Removed the function and its tests.

2. Add Anthropic to 'hermes model' (cmd_model in main.py):
   - Added to provider_labels dict
   - Added to providers selection list
   - Added _model_flow_anthropic() with Claude Code credential auto-detection,
     API key prompting, and model selection from catalog.

3. Wire up Anthropic as a vision-capable auxiliary provider:
   - Added _try_anthropic() to auxiliary_client.py using claude-sonnet-4-20250514
     as the vision model (Claude natively supports multimodal input)
   - Added to the get_vision_auxiliary_client() auto-detection chain
     (after OpenRouter/Nous, before Codex/custom)

Cache tracking note: the Anthropic cache metrics branch in run_agent.py
(cache_read_input_tokens / cache_creation_input_tokens) is in the correct
place — it's response-level parsing, same location as the existing
OpenRouter cache tracking. auxiliary_client.py has no cache tracking.
This commit is contained in:
teknium1 2026-03-12 16:09:04 -07:00
parent d7adfe8f61
commit 7086fde37e
4 changed files with 105 additions and 94 deletions

View file

@ -184,58 +184,6 @@ def convert_tools_to_anthropic(tools: List[Dict]) -> List[Dict]:
return result
def _convert_vision_content(content: Any) -> Any:
"""Convert OpenAI multimodal content blocks to Anthropic format.
OpenAI format: [{"type": "image_url", "image_url": {"url": "data:...;base64,..."}}]
Anthropic format: [{"type": "image", "source": {"type": "base64", ...}}]
"""
if not isinstance(content, list):
return content
result = []
for block in content:
if not isinstance(block, dict):
result.append(block)
continue
if block.get("type") == "image_url":
image_url = block.get("image_url", {})
url = image_url.get("url", "") if isinstance(image_url, dict) else ""
if url.startswith("data:"):
# data:image/png;base64,iVBOR...
try:
header, b64_data = url.split(",", 1)
media_type = header.split(":")[1].split(";")[0]
result.append({
"type": "image",
"source": {
"type": "base64",
"media_type": media_type,
"data": b64_data,
},
})
except (ValueError, IndexError):
logger.warning("Could not parse data URL for image, skipping")
else:
# Regular URL — Anthropic supports url source type
result.append({
"type": "image",
"source": {
"type": "url",
"url": url,
},
})
elif block.get("type") == "text":
result.append({"type": "text", "text": block.get("text", "")})
else:
# Pass through unknown block types
result.append(block)
return result
def convert_messages_to_anthropic(
messages: List[Dict],
) -> Tuple[Optional[Any], List[Dict]]:
@ -304,9 +252,8 @@ def convert_messages_to_anthropic(
result.append({"role": "user", "content": [tool_result]})
continue
# Regular user message — convert vision content if multimodal
converted = _convert_vision_content(content) if isinstance(content, list) else content
result.append({"role": "user", "content": converted})
# Regular user message
result.append({"role": "user", "content": content})
# Strip orphaned tool_use blocks (no matching tool_result follows)
tool_result_ids = set()

View file

@ -449,6 +449,21 @@ def _try_custom_endpoint() -> Tuple[Optional[OpenAI], Optional[str]]:
return OpenAI(api_key=custom_key, base_url=custom_base), model
_ANTHROPIC_VISION_MODEL = "claude-sonnet-4-20250514"


def _try_anthropic() -> Tuple[Optional[Any], Optional[str]]:
    """Try Anthropic credentials for auxiliary tasks (vision-capable)."""
    from agent.anthropic_adapter import resolve_anthropic_token

    # No resolvable token means Anthropic is unavailable; signal the caller
    # to fall through to the next provider in the auto-detection chain.
    if not resolve_anthropic_token():
        return None, None
    # Credentials exist — delegate client construction to the shared
    # provider resolver rather than building an SDK client here.
    logger.debug("Auxiliary client: Anthropic (%s)", _ANTHROPIC_VISION_MODEL)
    return resolve_provider_client("anthropic", model=_ANTHROPIC_VISION_MODEL)
def _try_codex() -> Tuple[Optional[Any], Optional[str]]:
codex_token = _read_codex_access_token()
if not codex_token:
@ -753,8 +768,8 @@ def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]:
# back to the user's custom endpoint. Many local models (Qwen-VL,
# LLaVA, Pixtral, etc.) support vision — skipping them entirely
# caused silent failures for local-only users.
for try_fn in (_try_openrouter, _try_nous, _try_codex,
_try_custom_endpoint):
for try_fn in (_try_openrouter, _try_nous, _try_anthropic,
_try_codex, _try_custom_endpoint):
client, model = try_fn()
if client is not None:
return client, model