fix(anthropic): revert inline vision, add hermes model flow, wire vision aux
Feedback fixes:
1. Revert _convert_vision_content: vision is handled by the vision_analyze
   tool, not by converting image blocks inline in conversation messages.
   Removed the function and its tests (see the sketches after this list).
2. Add Anthropic to 'hermes model' (cmd_model in main.py):
   - Added to the provider_labels dict
   - Added to the providers selection list
   - Added _model_flow_anthropic() with Claude Code credential auto-detection,
     API key prompting, and model selection from the catalog (first sketch
     after this list)
3. Wire up Anthropic as a vision-capable auxiliary provider:
   - Added _try_anthropic() to auxiliary_client.py, using claude-sonnet-4
     as the vision model (Claude natively supports multimodal input)
   - Added it to the get_vision_auxiliary_client() auto-detection chain
     (after OpenRouter/Nous, before Codex/custom; second sketch after
     this list)
Cache tracking note: the Anthropic cache-metrics branch in run_agent.py
(cache_read_input_tokens / cache_creation_input_tokens) is in the correct
place: it is response-level usage parsing, in the same location as the
existing OpenRouter cache tracking. auxiliary_client.py has no cache
tracking. A sketch of the branch follows.
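
The two usage attribute names below are Anthropic's actual API fields; the
accumulator dict and function wrapper are assumptions about run_agent.py's
internals:

    def _track_anthropic_cache(response, stats: dict) -> None:
        """Accumulate Anthropic prompt-cache metrics (sketch; names assumed)."""
        usage = getattr(response, "usage", None)
        if usage is None:
            return
        # Real Anthropic usage fields, reported per response.
        stats["cache_read_tokens"] += getattr(usage, "cache_read_input_tokens", 0) or 0
        stats["cache_write_tokens"] += getattr(usage, "cache_creation_input_tokens", 0) or 0
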
parent d7adfe8f61
commit 7086fde37e

4 changed files with 105 additions and 94 deletions
agent/anthropic_adapter.py
@@ -184,58 +184,6 @@ def convert_tools_to_anthropic(tools: List[Dict]) -> List[Dict]:
     return result
 
 
-def _convert_vision_content(content: Any) -> Any:
-    """Convert OpenAI multimodal content blocks to Anthropic format.
-
-    OpenAI format: [{"type": "image_url", "image_url": {"url": "data:...;base64,..."}}]
-    Anthropic format: [{"type": "image", "source": {"type": "base64", ...}}]
-    """
-    if not isinstance(content, list):
-        return content
-
-    result = []
-    for block in content:
-        if not isinstance(block, dict):
-            result.append(block)
-            continue
-
-        if block.get("type") == "image_url":
-            image_url = block.get("image_url", {})
-            url = image_url.get("url", "") if isinstance(image_url, dict) else ""
-
-            if url.startswith("data:"):
-                # data:image/png;base64,iVBOR...
-                try:
-                    header, b64_data = url.split(",", 1)
-                    media_type = header.split(":")[1].split(";")[0]
-                    result.append({
-                        "type": "image",
-                        "source": {
-                            "type": "base64",
-                            "media_type": media_type,
-                            "data": b64_data,
-                        },
-                    })
-                except (ValueError, IndexError):
-                    logger.warning("Could not parse data URL for image, skipping")
-            else:
-                # Regular URL — Anthropic supports url source type
-                result.append({
-                    "type": "image",
-                    "source": {
-                        "type": "url",
-                        "url": url,
-                    },
-                })
-        elif block.get("type") == "text":
-            result.append({"type": "text", "text": block.get("text", "")})
-        else:
-            # Pass through unknown block types
-            result.append(block)
-
-    return result
-
-
 def convert_messages_to_anthropic(
     messages: List[Dict],
 ) -> Tuple[Optional[Any], List[Dict]]:
@@ -304,9 +252,8 @@ def convert_messages_to_anthropic(
             result.append({"role": "user", "content": [tool_result]})
             continue
 
-        # Regular user message — convert vision content if multimodal
-        converted = _convert_vision_content(content) if isinstance(content, list) else content
-        result.append({"role": "user", "content": converted})
+        # Regular user message
+        result.append({"role": "user", "content": content})
 
     # Strip orphaned tool_use blocks (no matching tool_result follows)
     tool_result_ids = set()
auxiliary_client.py
@@ -449,6 +449,21 @@ def _try_custom_endpoint() -> Tuple[Optional[OpenAI], Optional[str]]:
     return OpenAI(api_key=custom_key, base_url=custom_base), model
 
 
+_ANTHROPIC_VISION_MODEL = "claude-sonnet-4-20250514"
+
+
+def _try_anthropic() -> Tuple[Optional[Any], Optional[str]]:
+    """Try Anthropic credentials for auxiliary tasks (vision-capable)."""
+    from agent.anthropic_adapter import resolve_anthropic_token
+    token = resolve_anthropic_token()
+    if not token:
+        return None, None
+    # Return a simple wrapper that indicates Anthropic is available.
+    # The actual client is created by resolve_provider_client("anthropic").
+    logger.debug("Auxiliary client: Anthropic (%s)", _ANTHROPIC_VISION_MODEL)
+    return resolve_provider_client("anthropic", model=_ANTHROPIC_VISION_MODEL)
+
+
 def _try_codex() -> Tuple[Optional[Any], Optional[str]]:
     codex_token = _read_codex_access_token()
     if not codex_token:
@@ -753,8 +768,8 @@ def get_vision_auxiliary_client() -> Tuple[Optional[OpenAI], Optional[str]]:
     # back to the user's custom endpoint. Many local models (Qwen-VL,
     # LLaVA, Pixtral, etc.) support vision — skipping them entirely
     # caused silent failures for local-only users.
-    for try_fn in (_try_openrouter, _try_nous, _try_codex,
-                   _try_custom_endpoint):
+    for try_fn in (_try_openrouter, _try_nous, _try_anthropic,
+                   _try_codex, _try_custom_endpoint):
         client, model = try_fn()
         if client is not None:
             return client, model