feat: add native Anthropic auxiliary vision
This commit is contained in:
parent
dc11b86e4b
commit
db362dbd4c
7 changed files with 386 additions and 30 deletions
|
|
@ -391,6 +391,68 @@ def _sanitize_tool_id(tool_id: str) -> str:
|
||||||
return sanitized or "tool_0"
|
return sanitized or "tool_0"
|
||||||
|
|
||||||
|
|
||||||
|
def _convert_openai_image_part_to_anthropic(part: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Convert an OpenAI-style image block to Anthropic's image source format.

    Args:
        part: An OpenAI content part whose ``image_url`` entry is either a
            ``{"url": ...}`` dict or a bare URL value.

    Returns:
        An Anthropic ``image`` block with a ``base64`` source for
        base64-encoded ``data:`` URLs, or a ``url`` source for http(s) URLs.
        ``None`` when the URL is missing or blank, when a base64 data URL has
        an empty payload, or when the URL uses any other scheme or encoding.
    """
    image_data = part.get("image_url", {})
    url = image_data.get("url", "") if isinstance(image_data, dict) else str(image_data)
    if not isinstance(url, str) or not url.strip():
        return None
    url = url.strip()

    # URI schemes are case-insensitive. Only the short prefix is lowered so a
    # multi-megabyte data URL is never copied just to sniff its scheme.
    scheme_probe = url[:8].lower()

    if scheme_probe.startswith("data:"):
        header, sep, data = url.partition(",")
        # Everything after "data:" and before the comma: "<media type>[;param]*".
        meta = header[5:].lower()
        if sep and ";base64" in meta:
            media_type = meta.split(";", 1)[0].strip() or "image/png"
            # "image/jpg" is a common misspelling; the registered type is image/jpeg.
            if media_type == "image/jpg":
                media_type = "image/jpeg"
            data = data.strip()
            if not data:
                # An image block without any payload cannot be a valid image.
                return None
            return {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": media_type,
                    "data": data,
                },
            }

    if scheme_probe.startswith(("http://", "https://")):
        return {
            "type": "image",
            "source": {
                "type": "url",
                "url": url,
            },
        }

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _convert_user_content_part_to_anthropic(part: Any) -> Optional[Dict[str, Any]]:
    """Translate one user content part into an Anthropic content block.

    Non-dict values other than ``None`` become a text block holding
    ``str(part)``. Dict parts are dispatched on their ``type``:

    * ``text``        -> text block; a dict ``cache_control`` is copied over.
    * ``image_url``   -> delegated to ``_convert_openai_image_part_to_anthropic``.
    * ``image``       -> shallow copy when it already carries a ``source``;
                         otherwise a base64 source built from ``data`` and
                         ``mimeType`` / ``media_type`` (default ``image/png``).
    * ``tool_result`` -> shallow copy.

    Returns ``None`` for ``None`` input and for any dict that matches none of
    the cases above.
    """
    if part is None:
        return None
    if not isinstance(part, dict):
        return {"type": "text", "text": str(part)}

    kind = part.get("type")

    if kind == "text":
        text_block = {"type": "text", "text": part.get("text", "")}
        cache_control = part.get("cache_control")
        if isinstance(cache_control, dict):
            # Copy so later mutation of the block does not touch the input.
            text_block["cache_control"] = dict(cache_control)
        return text_block

    if kind == "image_url":
        return _convert_openai_image_part_to_anthropic(part)

    if kind == "image":
        # An existing Anthropic-style source wins over raw data.
        if part.get("source"):
            return dict(part)
        payload = part.get("data")
        if payload:
            mime = part.get("mimeType") or part.get("media_type") or "image/png"
            return {
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": mime,
                    "data": payload,
                },
            }
        return None

    if kind == "tool_result":
        return dict(part)

    return None
|
||||||
|
|
||||||
|
|
||||||
def convert_tools_to_anthropic(tools: List[Dict]) -> List[Dict]:
|
def convert_tools_to_anthropic(tools: List[Dict]) -> List[Dict]:
|
||||||
"""Convert OpenAI tool definitions to Anthropic format."""
|
"""Convert OpenAI tool definitions to Anthropic format."""
|
||||||
if not tools:
|
if not tools:
|
||||||
|
|
@ -495,7 +557,15 @@ def convert_messages_to_anthropic(
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Regular user message
|
# Regular user message
|
||||||
result.append({"role": "user", "content": content})
|
if isinstance(content, list):
|
||||||
|
converted_blocks = []
|
||||||
|
for part in content:
|
||||||
|
converted = _convert_user_content_part_to_anthropic(part)
|
||||||
|
if converted is not None:
|
||||||
|
converted_blocks.append(converted)
|
||||||
|
result.append({"role": "user", "content": converted_blocks or [{"type": "text", "text": ""}]})
|
||||||
|
else:
|
||||||
|
result.append({"role": "user", "content": content})
|
||||||
|
|
||||||
# Strip orphaned tool_use blocks (no matching tool_result follows)
|
# Strip orphaned tool_use blocks (no matching tool_result follows)
|
||||||
tool_result_ids = set()
|
tool_result_ids = set()
|
||||||
|
|
|
||||||
|
|
@ -1,4 +1,4 @@
|
||||||
"""Shared auxiliary OpenAI client for cheap/fast side tasks.
|
"""Shared auxiliary client router for side tasks.
|
||||||
|
|
||||||
Provides a single resolution chain so every consumer (context compression,
|
Provides a single resolution chain so every consumer (context compression,
|
||||||
session search, web extraction, vision analysis, browser vision) picks up
|
session search, web extraction, vision analysis, browser vision) picks up
|
||||||
|
|
@ -10,21 +10,21 @@ Resolution order for text tasks (auto mode):
|
||||||
3. Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY)
|
3. Custom endpoint (OPENAI_BASE_URL + OPENAI_API_KEY)
|
||||||
4. Codex OAuth (Responses API via chatgpt.com with gpt-5.3-codex,
|
4. Codex OAuth (Responses API via chatgpt.com with gpt-5.3-codex,
|
||||||
wrapped to look like a chat.completions client)
|
wrapped to look like a chat.completions client)
|
||||||
5. Direct API-key providers (z.ai/GLM, Kimi/Moonshot, MiniMax, MiniMax-CN)
|
5. Native Anthropic
|
||||||
— checked via PROVIDER_REGISTRY entries with auth_type='api_key'
|
6. Direct API-key providers (z.ai/GLM, Kimi/Moonshot, MiniMax, MiniMax-CN)
|
||||||
6. None
|
7. None
|
||||||
|
|
||||||
Resolution order for vision/multimodal tasks (auto mode):
|
Resolution order for vision/multimodal tasks (auto mode):
|
||||||
1. OpenRouter
|
1. Selected main provider, if it is one of the supported vision backends below
|
||||||
2. Nous Portal
|
2. OpenRouter
|
||||||
3. Codex OAuth (gpt-5.3-codex supports vision via Responses API)
|
3. Nous Portal
|
||||||
4. Custom endpoint (for local vision models: Qwen-VL, LLaVA, Pixtral, etc.)
|
4. Codex OAuth (gpt-5.3-codex supports vision via Responses API)
|
||||||
5. None (API-key providers like z.ai/Kimi/MiniMax are skipped —
|
5. Native Anthropic
|
||||||
they may not support multimodal)
|
6. Custom endpoint (for local vision models: Qwen-VL, LLaVA, Pixtral, etc.)
|
||||||
|
7. None
|
||||||
|
|
||||||
Per-task provider overrides (e.g. AUXILIARY_VISION_PROVIDER,
|
Per-task provider overrides (e.g. AUXILIARY_VISION_PROVIDER,
|
||||||
CONTEXT_COMPRESSION_PROVIDER) can force a specific provider for each task:
|
CONTEXT_COMPRESSION_PROVIDER) can force a specific provider for each task.
|
||||||
"openrouter", "nous", "codex", or "main" (= steps 3-5).
|
|
||||||
Default "auto" follows the chains above.
|
Default "auto" follows the chains above.
|
||||||
|
|
||||||
Per-task model overrides (e.g. AUXILIARY_VISION_MODEL,
|
Per-task model overrides (e.g. AUXILIARY_VISION_MODEL,
|
||||||
|
|
@ -74,6 +74,7 @@ auxiliary_is_nous: bool = False
|
||||||
_OPENROUTER_MODEL = "google/gemini-3-flash-preview"
|
_OPENROUTER_MODEL = "google/gemini-3-flash-preview"
|
||||||
_NOUS_MODEL = "gemini-3-flash"
|
_NOUS_MODEL = "gemini-3-flash"
|
||||||
_NOUS_DEFAULT_BASE_URL = "https://inference-api.nousresearch.com/v1"
|
_NOUS_DEFAULT_BASE_URL = "https://inference-api.nousresearch.com/v1"
|
||||||
|
_ANTHROPIC_DEFAULT_BASE_URL = "https://api.anthropic.com"
|
||||||
_AUTH_JSON_PATH = get_hermes_home() / "auth.json"
|
_AUTH_JSON_PATH = get_hermes_home() / "auth.json"
|
||||||
|
|
||||||
# Codex fallback: uses the Responses API (the only endpoint the Codex
|
# Codex fallback: uses the Responses API (the only endpoint the Codex
|
||||||
|
|
@ -309,6 +310,114 @@ class AsyncCodexAuxiliaryClient:
|
||||||
self.base_url = sync_wrapper.base_url
|
self.base_url = sync_wrapper.base_url
|
||||||
|
|
||||||
|
|
||||||
|
class _AnthropicCompletionsAdapter:
    """OpenAI-client-compatible adapter for Anthropic Messages API.

    Plays the role of ``client.chat.completions``: ``create`` accepts OpenAI
    chat-completions style keyword arguments, forwards them to the wrapped
    client's ``messages.create``, and returns an object exposing ``choices``,
    ``model`` and ``usage``.
    """

    def __init__(self, real_client: Any, model: str):
        self._client = real_client
        # Used whenever a call does not pass its own ``model``.
        self._model = model

    def create(self, **kwargs) -> Any:
        """Run one completion and return it as an OpenAI-shaped namespace.

        Recognised keyword arguments: ``messages``, ``model``, ``tools``,
        ``tool_choice``, ``max_tokens`` / ``max_completion_tokens`` (2000 when
        neither is set) and ``temperature``. Other keywords are ignored.
        """
        from agent.anthropic_adapter import build_anthropic_kwargs, normalize_anthropic_response

        model = kwargs.get("model", self._model)
        max_tokens = kwargs.get("max_tokens") or kwargs.get("max_completion_tokens") or 2000

        # Collapse OpenAI's tool_choice forms into a single string (or None):
        # plain strings pass through, {"type": "function"} becomes the
        # function name, and {"type": "auto"|"required"|"none"} becomes the type.
        raw_choice = kwargs.get("tool_choice")
        normalized_tool_choice = None
        if isinstance(raw_choice, str):
            normalized_tool_choice = raw_choice
        elif isinstance(raw_choice, dict):
            choice_type = str(raw_choice.get("type", "")).lower()
            if choice_type == "function":
                normalized_tool_choice = raw_choice.get("function", {}).get("name")
            elif choice_type in {"auto", "required", "none"}:
                normalized_tool_choice = choice_type

        anthropic_kwargs = build_anthropic_kwargs(
            model=model,
            messages=kwargs.get("messages", []),
            tools=kwargs.get("tools"),
            max_tokens=max_tokens,
            reasoning_config=None,
            tool_choice=normalized_tool_choice,
        )
        temperature = kwargs.get("temperature")
        if temperature is not None:
            anthropic_kwargs["temperature"] = temperature

        response = self._client.messages.create(**anthropic_kwargs)
        assistant_message, finish_reason = normalize_anthropic_response(response)

        # Map Anthropic's input/output token counters onto OpenAI's names.
        usage = None
        if hasattr(response, "usage") and response.usage:
            raw_usage = response.usage
            prompt_tokens = getattr(raw_usage, "input_tokens", 0) or 0
            completion_tokens = getattr(raw_usage, "output_tokens", 0) or 0
            # Fall back to the sum when the response carries no total.
            total_tokens = getattr(raw_usage, "total_tokens", 0) or (prompt_tokens + completion_tokens)
            usage = SimpleNamespace(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=total_tokens,
            )

        only_choice = SimpleNamespace(
            index=0,
            message=assistant_message,
            finish_reason=finish_reason,
        )
        return SimpleNamespace(choices=[only_choice], model=model, usage=usage)
|
||||||
|
|
||||||
|
|
||||||
|
class _AnthropicChatShim:
    """Holder exposing the adapter as ``.completions``.

    Lets a wrapper client offer the ``client.chat.completions`` attribute path.
    """

    def __init__(self, adapter: _AnthropicCompletionsAdapter):
        self.completions = adapter
|
||||||
|
|
||||||
|
|
||||||
|
class AnthropicAuxiliaryClient:
    """OpenAI-client-compatible wrapper over a native Anthropic client.

    Provides ``chat.completions`` (backed by ``_AnthropicCompletionsAdapter``)
    together with ``api_key`` and ``base_url`` attributes, and a ``close``
    method that delegates to the wrapped client.
    """

    def __init__(self, real_client: Any, model: str, api_key: str, base_url: str):
        self.api_key = api_key
        self.base_url = base_url
        self._real_client = real_client
        self.chat = _AnthropicChatShim(_AnthropicCompletionsAdapter(real_client, model))

    def close(self):
        """Close the wrapped client if it exposes a callable ``close``."""
        closer = getattr(self._real_client, "close", None)
        if not callable(closer):
            return
        closer()
|
||||||
|
|
||||||
|
|
||||||
|
class _AsyncAnthropicCompletionsAdapter:
    """Awaitable front for the synchronous completions adapter."""

    def __init__(self, sync_adapter: _AnthropicCompletionsAdapter):
        self._sync = sync_adapter

    async def create(self, **kwargs) -> Any:
        """Await the sync adapter's ``create``, executed via ``asyncio.to_thread``."""
        import asyncio
        # The sync call is handed to a worker thread so it does not run on the
        # event loop thread.
        return await asyncio.to_thread(self._sync.create, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
class _AsyncAnthropicChatShim:
    """Holder exposing the async adapter as ``.completions``."""

    def __init__(self, adapter: _AsyncAnthropicCompletionsAdapter):
        self.completions = adapter
|
||||||
|
|
||||||
|
|
||||||
|
class AsyncAnthropicAuxiliaryClient:
    """Async counterpart of ``AnthropicAuxiliaryClient``.

    Reuses the sync wrapper's completions adapter behind an awaitable
    ``chat.completions.create`` and mirrors its ``api_key`` / ``base_url``.
    """

    def __init__(self, sync_wrapper: "AnthropicAuxiliaryClient"):
        self.api_key = sync_wrapper.api_key
        self.base_url = sync_wrapper.base_url
        self.chat = _AsyncAnthropicChatShim(
            _AsyncAnthropicCompletionsAdapter(sync_wrapper.chat.completions)
        )
|
||||||
|
|
||||||
|
|
||||||
def _read_nous_auth() -> Optional[dict]:
|
def _read_nous_auth() -> Optional[dict]:
|
||||||
"""Read and validate ~/.hermes/auth.json for an active Nous provider.
|
"""Read and validate ~/.hermes/auth.json for an active Nous provider.
|
||||||
|
|
||||||
|
|
@ -380,6 +489,9 @@ def _resolve_api_key_provider() -> Tuple[Optional[OpenAI], Optional[str]]:
|
||||||
break
|
break
|
||||||
if not api_key:
|
if not api_key:
|
||||||
continue
|
continue
|
||||||
|
if provider_id == "anthropic":
|
||||||
|
return _try_anthropic()
|
||||||
|
|
||||||
# Resolve base URL (with optional env-var override)
|
# Resolve base URL (with optional env-var override)
|
||||||
# Kimi Code keys (sk-kimi-) need api.kimi.com/coding/v1
|
# Kimi Code keys (sk-kimi-) need api.kimi.com/coding/v1
|
||||||
env_url = ""
|
env_url = ""
|
||||||
|
|
@ -484,6 +596,22 @@ def _try_codex() -> Tuple[Optional[Any], Optional[str]]:
|
||||||
return CodexAuxiliaryClient(real_client, _CODEX_AUX_MODEL), _CODEX_AUX_MODEL
|
return CodexAuxiliaryClient(real_client, _CODEX_AUX_MODEL), _CODEX_AUX_MODEL
|
||||||
|
|
||||||
|
|
||||||
|
def _try_anthropic() -> Tuple[Optional[Any], Optional[str]]:
    """Build a native Anthropic auxiliary client when a token is available.

    Returns:
        ``(AnthropicAuxiliaryClient, model)`` on success, or ``(None, None)``
        when ``agent.anthropic_adapter`` cannot be imported or no token
        resolves.
    """
    try:
        from agent.anthropic_adapter import build_anthropic_client, resolve_anthropic_token
    except ImportError:
        return None, None

    token = resolve_anthropic_token()
    if token:
        model = _API_KEY_PROVIDER_AUX_MODELS.get("anthropic", "claude-haiku-4-5-20251001")
        logger.debug("Auxiliary client: Anthropic native (%s)", model)
        native_client = build_anthropic_client(token, _ANTHROPIC_DEFAULT_BASE_URL)
        wrapper = AnthropicAuxiliaryClient(native_client, model, token, _ANTHROPIC_DEFAULT_BASE_URL)
        return wrapper, model

    return None, None
|
||||||
|
|
||||||
|
|
||||||
def _resolve_forced_provider(forced: str) -> Tuple[Optional[OpenAI], Optional[str]]:
|
def _resolve_forced_provider(forced: str) -> Tuple[Optional[OpenAI], Optional[str]]:
|
||||||
"""Resolve a specific forced provider. Returns (None, None) if creds missing."""
|
"""Resolve a specific forced provider. Returns (None, None) if creds missing."""
|
||||||
if forced == "openrouter":
|
if forced == "openrouter":
|
||||||
|
|
@ -546,6 +674,8 @@ def _to_async_client(sync_client, model: str):
|
||||||
|
|
||||||
if isinstance(sync_client, CodexAuxiliaryClient):
|
if isinstance(sync_client, CodexAuxiliaryClient):
|
||||||
return AsyncCodexAuxiliaryClient(sync_client), model
|
return AsyncCodexAuxiliaryClient(sync_client), model
|
||||||
|
if isinstance(sync_client, AnthropicAuxiliaryClient):
|
||||||
|
return AsyncAnthropicAuxiliaryClient(sync_client), model
|
||||||
|
|
||||||
async_kwargs = {
|
async_kwargs = {
|
||||||
"api_key": sync_client.api_key,
|
"api_key": sync_client.api_key,
|
||||||
|
|
@ -686,6 +816,14 @@ def resolve_provider_client(
|
||||||
return None, None
|
return None, None
|
||||||
|
|
||||||
if pconfig.auth_type == "api_key":
|
if pconfig.auth_type == "api_key":
|
||||||
|
if provider == "anthropic":
|
||||||
|
client, default_model = _try_anthropic()
|
||||||
|
if client is None:
|
||||||
|
logger.warning("resolve_provider_client: anthropic requested but no Anthropic credentials found")
|
||||||
|
return None, None
|
||||||
|
final_model = model or default_model
|
||||||
|
return (_to_async_client(client, final_model) if async_mode else (client, final_model))
|
||||||
|
|
||||||
# Find the first configured API key
|
# Find the first configured API key
|
||||||
api_key = ""
|
api_key = ""
|
||||||
for env_var in pconfig.api_key_env_vars:
|
for env_var in pconfig.api_key_env_vars:
|
||||||
|
|
@ -772,6 +910,7 @@ _VISION_AUTO_PROVIDER_ORDER = (
|
||||||
"openrouter",
|
"openrouter",
|
||||||
"nous",
|
"nous",
|
||||||
"openai-codex",
|
"openai-codex",
|
||||||
|
"anthropic",
|
||||||
"custom",
|
"custom",
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -793,6 +932,8 @@ def _resolve_strict_vision_backend(provider: str) -> Tuple[Optional[Any], Option
|
||||||
return _try_nous()
|
return _try_nous()
|
||||||
if provider == "openai-codex":
|
if provider == "openai-codex":
|
||||||
return _try_codex()
|
return _try_codex()
|
||||||
|
if provider == "anthropic":
|
||||||
|
return _try_anthropic()
|
||||||
if provider == "custom":
|
if provider == "custom":
|
||||||
return _try_custom_endpoint()
|
return _try_custom_endpoint()
|
||||||
return None, None
|
return None, None
|
||||||
|
|
@ -802,19 +943,36 @@ def _strict_vision_backend_available(provider: str) -> bool:
|
||||||
return _resolve_strict_vision_backend(provider)[0] is not None
|
return _resolve_strict_vision_backend(provider)[0] is not None
|
||||||
|
|
||||||
|
|
||||||
|
def _preferred_main_vision_provider() -> Optional[str]:
    """Return the selected main provider when it is also a supported vision backend.

    Reads ``model.provider`` from the loaded Hermes config, normalises it with
    ``_normalize_vision_provider`` and returns it only if it appears in
    ``_VISION_AUTO_PROVIDER_ORDER``.

    Returns:
        The normalised provider id, or ``None`` when the config has no usable
        ``model`` section, the provider is not an auto-order vision backend,
        or anything goes wrong while loading the config.
    """
    try:
        from hermes_cli.config import load_config

        config = load_config()
        model_cfg = config.get("model", {})
        if isinstance(model_cfg, dict):
            provider = _normalize_vision_provider(model_cfg.get("provider", ""))
            if provider in _VISION_AUTO_PROVIDER_ORDER:
                return provider
    except Exception:
        # Best-effort lookup: a broken or missing config must not break vision
        # routing, but the failure is still recorded for debugging.
        logger.debug("Could not determine preferred main vision provider", exc_info=True)
    return None
|
||||||
|
|
||||||
|
|
||||||
def get_available_vision_backends() -> List[str]:
    """Return the currently available vision backends in auto-selection order.

    This is the single source of truth for setup, tool gating, and runtime
    auto-routing of vision tasks. The selected main provider is preferred when
    it is also a known-good vision backend; otherwise Hermes falls back through
    the standard conservative order.
    """
    candidates = list(_VISION_AUTO_PROVIDER_ORDER)
    preferred = _preferred_main_vision_provider()
    if preferred in candidates:
        # Move the main provider to the front, keeping the rest in order.
        candidates.remove(preferred)
        candidates.insert(0, preferred)

    available = []
    for provider in candidates:
        if _strict_vision_backend_available(provider):
            available.append(provider)
    return available
|
||||||
|
|
||||||
|
|
||||||
def resolve_vision_provider_client(
|
def resolve_vision_provider_client(
|
||||||
|
|
|
||||||
|
|
@ -1268,11 +1268,9 @@ def setup_model_provider(config: dict):
|
||||||
|
|
||||||
_vision_needs_setup = not bool(_vision_backends)
|
_vision_needs_setup = not bool(_vision_backends)
|
||||||
|
|
||||||
if selected_provider in {"openrouter", "nous", "openai-codex"}:
|
if selected_provider in _vision_backends:
|
||||||
# If the user just selected one of our known-good vision backends during
|
# If the user just selected a backend Hermes can already use for
|
||||||
# setup, treat vision as covered. Auth/setup failure returns earlier.
|
# vision, treat it as covered. Auth/setup failure returns earlier.
|
||||||
_vision_needs_setup = False
|
|
||||||
elif selected_provider == "custom" and "custom" in _vision_backends:
|
|
||||||
_vision_needs_setup = False
|
_vision_needs_setup = False
|
||||||
|
|
||||||
if _vision_needs_setup:
|
if _vision_needs_setup:
|
||||||
|
|
|
||||||
|
|
@ -10,6 +10,8 @@ import pytest
|
||||||
from agent.auxiliary_client import (
|
from agent.auxiliary_client import (
|
||||||
get_text_auxiliary_client,
|
get_text_auxiliary_client,
|
||||||
get_vision_auxiliary_client,
|
get_vision_auxiliary_client,
|
||||||
|
get_available_vision_backends,
|
||||||
|
resolve_provider_client,
|
||||||
auxiliary_max_tokens_param,
|
auxiliary_max_tokens_param,
|
||||||
_read_codex_access_token,
|
_read_codex_access_token,
|
||||||
_get_auxiliary_provider,
|
_get_auxiliary_provider,
|
||||||
|
|
@ -24,6 +26,7 @@ def _clean_env(monkeypatch):
|
||||||
for key in (
|
for key in (
|
||||||
"OPENROUTER_API_KEY", "OPENAI_BASE_URL", "OPENAI_API_KEY",
|
"OPENROUTER_API_KEY", "OPENAI_BASE_URL", "OPENAI_API_KEY",
|
||||||
"OPENAI_MODEL", "LLM_MODEL", "NOUS_INFERENCE_BASE_URL",
|
"OPENAI_MODEL", "LLM_MODEL", "NOUS_INFERENCE_BASE_URL",
|
||||||
|
"ANTHROPIC_API_KEY", "ANTHROPIC_TOKEN", "CLAUDE_CODE_OAUTH_TOKEN",
|
||||||
# Per-task provider/model overrides
|
# Per-task provider/model overrides
|
||||||
"AUXILIARY_VISION_PROVIDER", "AUXILIARY_VISION_MODEL",
|
"AUXILIARY_VISION_PROVIDER", "AUXILIARY_VISION_MODEL",
|
||||||
"AUXILIARY_WEB_EXTRACT_PROVIDER", "AUXILIARY_WEB_EXTRACT_MODEL",
|
"AUXILIARY_WEB_EXTRACT_PROVIDER", "AUXILIARY_WEB_EXTRACT_MODEL",
|
||||||
|
|
@ -164,14 +167,74 @@ class TestGetTextAuxiliaryClient:
|
||||||
|
|
||||||
|
|
||||||
class TestVisionClientFallback:
|
class TestVisionClientFallback:
|
||||||
"""Vision client auto mode only tries OpenRouter + Nous (multimodal-capable)."""
|
"""Vision client auto mode resolves known-good multimodal backends."""
|
||||||
|
|
||||||
def test_vision_returns_none_without_any_credentials(self):
    """No vision client resolves when Nous auth and Anthropic are both unavailable."""
    with (
        patch("agent.auxiliary_client._read_nous_auth", return_value=None),
        # Anthropic is patched out explicitly so the test does not depend on
        # whatever credentials resolve_anthropic_token might find.
        patch("agent.auxiliary_client._try_anthropic", return_value=(None, None)),
    ):
        client, model = get_vision_auxiliary_client()
    assert client is None
    assert model is None
|
||||||
|
|
||||||
|
def test_vision_auto_includes_anthropic_when_configured(self, monkeypatch):
    """With an Anthropic token resolvable, "anthropic" is listed as an available vision backend."""
    monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-api03-key")
    with (
        patch("agent.auxiliary_client._read_nous_auth", return_value=None),
        # Patched on the adapter module because _try_anthropic imports these
        # names from it at call time.
        patch("agent.anthropic_adapter.build_anthropic_client", return_value=MagicMock()),
        patch("agent.anthropic_adapter.resolve_anthropic_token", return_value="sk-ant-api03-key"),
    ):
        backends = get_available_vision_backends()

    assert "anthropic" in backends
|
||||||
|
|
||||||
|
def test_resolve_provider_client_returns_native_anthropic_wrapper(self, monkeypatch):
    """Requesting "anthropic" yields the native wrapper and the default auxiliary model."""
    monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-api03-key")
    with (
        patch("agent.auxiliary_client._read_nous_auth", return_value=None),
        patch("agent.anthropic_adapter.build_anthropic_client", return_value=MagicMock()),
        patch("agent.anthropic_adapter.resolve_anthropic_token", return_value="sk-ant-api03-key"),
    ):
        client, model = resolve_provider_client("anthropic")

    assert client is not None
    # Compared by class name; the wrapper class itself is not imported by this test.
    assert client.__class__.__name__ == "AnthropicAuxiliaryClient"
    assert model == "claude-haiku-4-5-20251001"
|
||||||
|
|
||||||
|
def test_vision_auto_uses_anthropic_when_no_higher_priority_backend(self, monkeypatch):
    """Auto vision routing resolves to native Anthropic when only Anthropic credentials are set."""
    monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-api03-key")
    with (
        patch("agent.auxiliary_client._read_nous_auth", return_value=None),
        patch("agent.anthropic_adapter.build_anthropic_client", return_value=MagicMock()),
        patch("agent.anthropic_adapter.resolve_anthropic_token", return_value="sk-ant-api03-key"),
    ):
        client, model = get_vision_auxiliary_client()

    assert client is not None
    assert client.__class__.__name__ == "AnthropicAuxiliaryClient"
    assert model == "claude-haiku-4-5-20251001"
|
||||||
|
|
||||||
|
def test_selected_anthropic_provider_is_preferred_for_vision_auto(self, monkeypatch):
    """A configured main provider of "anthropic" wins over OpenRouter for auto vision."""
    # Both credentials are present, so without the preference OpenRouter
    # (earlier in the auto order) would be a candidate too.
    monkeypatch.setenv("OPENROUTER_API_KEY", "or-key")
    monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-api03-key")

    def fake_load_config():
        # Stand-in config selecting Anthropic as the main provider.
        return {"model": {"provider": "anthropic", "default": "claude-sonnet-4-6"}}

    with (
        patch("agent.auxiliary_client._read_nous_auth", return_value=None),
        patch("agent.anthropic_adapter.build_anthropic_client", return_value=MagicMock()),
        patch("agent.anthropic_adapter.resolve_anthropic_token", return_value="sk-ant-api03-key"),
        patch("agent.auxiliary_client.OpenAI") as mock_openai,
        patch("hermes_cli.config.load_config", fake_load_config),
    ):
        client, model = get_vision_auxiliary_client()

    assert client is not None
    assert client.__class__.__name__ == "AnthropicAuxiliaryClient"
    assert model == "claude-haiku-4-5-20251001"
|
||||||
|
|
||||||
def test_vision_auto_includes_codex(self, codex_auth_dir):
|
def test_vision_auto_includes_codex(self, codex_auth_dir):
|
||||||
"""Codex supports vision (gpt-5.3-codex), so auto mode should use it."""
|
"""Codex supports vision (gpt-5.3-codex), so auto mode should use it."""
|
||||||
with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \
|
with patch("agent.auxiliary_client._read_nous_auth", return_value=None), \
|
||||||
|
|
|
||||||
|
|
@ -111,6 +111,7 @@ def test_setup_keep_current_config_provider_uses_provider_specific_model_menu(tm
|
||||||
monkeypatch.setattr("hermes_cli.auth.get_active_provider", lambda: None)
|
monkeypatch.setattr("hermes_cli.auth.get_active_provider", lambda: None)
|
||||||
monkeypatch.setattr("hermes_cli.auth.detect_external_credentials", lambda: [])
|
monkeypatch.setattr("hermes_cli.auth.detect_external_credentials", lambda: [])
|
||||||
monkeypatch.setattr("hermes_cli.models.provider_model_ids", lambda provider: [])
|
monkeypatch.setattr("hermes_cli.models.provider_model_ids", lambda provider: [])
|
||||||
|
monkeypatch.setattr("agent.auxiliary_client.get_available_vision_backends", lambda: [])
|
||||||
|
|
||||||
setup_model_provider(config)
|
setup_model_provider(config)
|
||||||
save_config(config)
|
save_config(config)
|
||||||
|
|
@ -149,6 +150,7 @@ def test_setup_keep_current_anthropic_can_configure_openai_vision_default(tmp_pa
|
||||||
monkeypatch.setattr("hermes_cli.auth.get_active_provider", lambda: None)
|
monkeypatch.setattr("hermes_cli.auth.get_active_provider", lambda: None)
|
||||||
monkeypatch.setattr("hermes_cli.auth.detect_external_credentials", lambda: [])
|
monkeypatch.setattr("hermes_cli.auth.detect_external_credentials", lambda: [])
|
||||||
monkeypatch.setattr("hermes_cli.models.provider_model_ids", lambda provider: [])
|
monkeypatch.setattr("hermes_cli.models.provider_model_ids", lambda provider: [])
|
||||||
|
monkeypatch.setattr("agent.auxiliary_client.get_available_vision_backends", lambda: [])
|
||||||
|
|
||||||
setup_model_provider(config)
|
setup_model_provider(config)
|
||||||
env = _read_env(tmp_path)
|
env = _read_env(tmp_path)
|
||||||
|
|
@ -224,3 +226,17 @@ def test_setup_summary_marks_codex_auth_as_vision_available(tmp_path, monkeypatc
|
||||||
assert "missing run 'hermes setup' to configure" not in output
|
assert "missing run 'hermes setup' to configure" not in output
|
||||||
assert "Mixture of Agents" in output
|
assert "Mixture of Agents" in output
|
||||||
assert "missing OPENROUTER_API_KEY" in output
|
assert "missing OPENROUTER_API_KEY" in output
|
||||||
|
|
||||||
|
|
||||||
|
def test_setup_summary_marks_anthropic_auth_as_vision_available(tmp_path, monkeypatch, capsys):
    """The setup summary lists vision without a "missing" hint when Anthropic is an available backend."""
    monkeypatch.setenv("HERMES_HOME", str(tmp_path))
    _clear_provider_env(monkeypatch)
    monkeypatch.setenv("ANTHROPIC_API_KEY", "sk-ant-api03-key")
    monkeypatch.setattr("shutil.which", lambda _name: None)
    # Backend discovery is stubbed so the summary sees exactly one backend.
    monkeypatch.setattr("agent.auxiliary_client.get_available_vision_backends", lambda: ["anthropic"])

    _print_setup_summary(load_config(), tmp_path)
    output = capsys.readouterr().out

    assert "Vision (image analysis)" in output
    assert "missing run 'hermes setup' to configure" not in output
|
||||||
|
|
|
||||||
|
|
@ -567,6 +567,56 @@ class TestConvertMessages:
|
||||||
assert tool_block["content"] == "result"
|
assert tool_block["content"] == "result"
|
||||||
assert tool_block["cache_control"] == {"type": "ephemeral"}
|
assert tool_block["cache_control"] == {"type": "ephemeral"}
|
||||||
|
|
||||||
|
def test_converts_data_url_image_to_anthropic_image_block(self):
    """A base64 data-URL ``image_url`` part becomes an Anthropic base64 image block."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image"},
                {
                    "type": "image_url",
                    "image_url": {"url": "data:image/png;base64,ZmFrZQ=="},
                },
            ],
        }
    ]

    _, result = convert_messages_to_anthropic(messages)
    blocks = result[0]["content"]
    # The text part is carried over unchanged, ahead of the image.
    assert blocks[0] == {"type": "text", "text": "Describe this image"}
    assert blocks[1] == {
        "type": "image",
        "source": {
            "type": "base64",
            "media_type": "image/png",
            "data": "ZmFrZQ==",
        },
    }
|
||||||
|
|
||||||
|
def test_converts_remote_image_url_to_anthropic_image_block(self):
    """An https ``image_url`` part becomes an Anthropic image block with a ``url`` source."""
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Describe this image"},
                {
                    "type": "image_url",
                    "image_url": {"url": "https://example.com/cat.png"},
                },
            ],
        }
    ]

    _, result = convert_messages_to_anthropic(messages)
    blocks = result[0]["content"]
    assert blocks[1] == {
        "type": "image",
        "source": {
            "type": "url",
            "url": "https://example.com/cat.png",
        },
    }
|
||||||
|
|
||||||
def test_empty_cached_assistant_tool_turn_converts_without_empty_text_block(self):
|
def test_empty_cached_assistant_tool_turn_converts_without_empty_text_block(self):
|
||||||
messages = apply_anthropic_cache_control([
|
messages = apply_anthropic_cache_control([
|
||||||
{"role": "system", "content": "System prompt"},
|
{"role": "system", "content": "System prompt"},
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,8 @@
|
||||||
Vision Tools Module
|
Vision Tools Module
|
||||||
|
|
||||||
This module provides vision analysis tools that work with image URLs.
|
This module provides vision analysis tools that work with image URLs.
|
||||||
Uses Gemini 3 Flash Preview via OpenRouter API for intelligent image understanding.
|
Uses the centralized auxiliary vision router, which can select OpenRouter,
|
||||||
|
Nous, Codex, native Anthropic, or a custom OpenAI-compatible endpoint.
|
||||||
|
|
||||||
Available tools:
|
Available tools:
|
||||||
- vision_analyze_tool: Analyze images from URLs with custom prompts
|
- vision_analyze_tool: Analyze images from URLs with custom prompts
|
||||||
|
|
@ -409,7 +410,7 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
if not api_available:
|
if not api_available:
|
||||||
print("❌ No auxiliary vision model available")
|
print("❌ No auxiliary vision model available")
|
||||||
print("Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools.")
|
print("Configure a supported multimodal backend (OpenRouter, Nous, Codex, Anthropic, or a custom OpenAI-compatible endpoint).")
|
||||||
exit(1)
|
exit(1)
|
||||||
else:
|
else:
|
||||||
print("✅ Vision model available")
|
print("✅ Vision model available")
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue