fix(anthropic): revert inline vision, add hermes model flow, wire vision aux

Feedback fixes:

1. Revert _convert_vision_content — vision is handled by the vision_analyze
   tool, not by converting image blocks inline in conversation messages.
   Removed the function and its tests.

2. Add Anthropic to 'hermes model' (cmd_model in main.py):
   - Added to provider_labels dict
   - Added to providers selection list
   - Added _model_flow_anthropic() with Claude Code credential auto-detection,
     API key prompting, and model selection from catalog.

3. Wire up Anthropic as a vision-capable auxiliary provider:
   - Added _try_anthropic() to auxiliary_client.py using claude-sonnet-4-20250514
     as the vision model (Claude natively supports multimodal input)
   - Added to the get_vision_auxiliary_client() auto-detection chain
     (after OpenRouter/Nous, before Codex/custom)

Cache tracking note: the Anthropic cache metrics branch in run_agent.py
(cache_read_input_tokens / cache_creation_input_tokens) is in the correct
place — it's response-level parsing, same location as the existing
OpenRouter cache tracking. auxiliary_client.py has no cache tracking.
This commit is contained in:
teknium1 2026-03-12 16:09:04 -07:00
parent d7adfe8f61
commit 7086fde37e
4 changed files with 105 additions and 94 deletions

View file

@ -184,58 +184,6 @@ def convert_tools_to_anthropic(tools: List[Dict]) -> List[Dict]:
return result
def _convert_vision_content(content: Any) -> Any:
"""Convert OpenAI multimodal content blocks to Anthropic format.
OpenAI format: [{"type": "image_url", "image_url": {"url": "data:...;base64,..."}}]
Anthropic format: [{"type": "image", "source": {"type": "base64", ...}}]
"""
if not isinstance(content, list):
return content
result = []
for block in content:
if not isinstance(block, dict):
result.append(block)
continue
if block.get("type") == "image_url":
image_url = block.get("image_url", {})
url = image_url.get("url", "") if isinstance(image_url, dict) else ""
if url.startswith("data:"):
# data:image/png;base64,iVBOR...
try:
header, b64_data = url.split(",", 1)
media_type = header.split(":")[1].split(";")[0]
result.append({
"type": "image",
"source": {
"type": "base64",
"media_type": media_type,
"data": b64_data,
},
})
except (ValueError, IndexError):
logger.warning("Could not parse data URL for image, skipping")
else:
# Regular URL — Anthropic supports url source type
result.append({
"type": "image",
"source": {
"type": "url",
"url": url,
},
})
elif block.get("type") == "text":
result.append({"type": "text", "text": block.get("text", "")})
else:
# Pass through unknown block types
result.append(block)
return result
def convert_messages_to_anthropic(
messages: List[Dict],
) -> Tuple[Optional[Any], List[Dict]]:
@ -304,9 +252,8 @@ def convert_messages_to_anthropic(
result.append({"role": "user", "content": [tool_result]})
continue
# Regular user message — convert vision content if multimodal
converted = _convert_vision_content(content) if isinstance(content, list) else content
result.append({"role": "user", "content": converted})
# Regular user message
result.append({"role": "user", "content": content})
# Strip orphaned tool_use blocks (no matching tool_result follows)
tool_result_ids = set()

View file

@ -449,6 +449,21 @@ def _try_custom_endpoint() -> Tuple[Optional[OpenAI], Optional[str]]:
return OpenAI(api_key=custom_key, base_url=custom_base), model
_ANTHROPIC_VISION_MODEL = "claude-sonnet-4-20250514"


def _try_anthropic() -> Tuple[Optional[Any], Optional[str]]:
    """Try Anthropic credentials for auxiliary tasks (vision-capable)."""
    from agent.anthropic_adapter import resolve_anthropic_token

    # No resolvable token means Anthropic is unavailable; signal the caller
    # to fall through to the next provider in the auto-detection chain.
    if not resolve_anthropic_token():
        return None, None
    # Credentials exist — delegate client construction to the shared
    # provider resolver rather than building an SDK client here.
    logger.debug("Auxiliary client: Anthropic (%s)", _ANTHROPIC_VISION_MODEL)
    return resolve_provider_client("anthropic", model=_ANTHROPIC_VISION_MODEL)
def _try_codex() -> Tuple[Optional[Any], Optional[str]]:
codex_token = _read_codex_access_token()
if not codex_token:
@ -753,8 +768,8 @@ def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]:
# back to the user's custom endpoint. Many local models (Qwen-VL,
# LLaVA, Pixtral, etc.) support vision — skipping them entirely
# caused silent failures for local-only users.
for try_fn in (_try_openrouter, _try_nous, _try_codex,
_try_custom_endpoint):
for try_fn in (_try_openrouter, _try_nous, _try_anthropic,
_try_codex, _try_custom_endpoint):
client, model = try_fn()
if client is not None:
return client, model