fix: convert anthropic image content blocks

This commit is contained in:
teknium1 2026-03-14 23:21:09 -07:00
parent 4f4e2671ac
commit 735a6e7651
7 changed files with 343 additions and 15 deletions

View file

@ -21,6 +21,8 @@ Usage:
"""
import atexit
import asyncio
import base64
import concurrent.futures
import copy
import hashlib
@ -31,6 +33,7 @@ import os
import random
import re
import sys
import tempfile
import time
import threading
import weakref
@ -503,6 +506,11 @@ class AIAgent:
self._persist_user_message_idx = None
self._persist_user_message_override = None
# Cache anthropic image-to-text fallbacks per image payload/URL so a
# single tool loop does not repeatedly re-run auxiliary vision on the
# same image history.
self._anthropic_image_fallback_cache: Dict[str, str] = {}
# Initialize LLM client via centralized provider router.
# The router handles auth resolution, base URL, headers, and
# Codex/Anthropic wrapping for all known providers.
@ -2921,13 +2929,156 @@ class AIAgent:
# ── End provider fallback ──────────────────────────────────────────────
@staticmethod
def _content_has_image_parts(content: Any) -> bool:
if not isinstance(content, list):
return False
for part in content:
if isinstance(part, dict) and part.get("type") in {"image_url", "input_image"}:
return True
return False
@staticmethod
def _materialize_data_url_for_vision(image_url: str) -> tuple[str, Optional[Path]]:
header, _, data = str(image_url or "").partition(",")
mime = "image/jpeg"
if header.startswith("data:"):
mime_part = header[len("data:"):].split(";", 1)[0].strip()
if mime_part.startswith("image/"):
mime = mime_part
suffix = {
"image/png": ".png",
"image/gif": ".gif",
"image/webp": ".webp",
"image/jpeg": ".jpg",
"image/jpg": ".jpg",
}.get(mime, ".jpg")
tmp = tempfile.NamedTemporaryFile(prefix="anthropic_image_", suffix=suffix, delete=False)
with tmp:
tmp.write(base64.b64decode(data))
path = Path(tmp.name)
return str(path), path
def _describe_image_for_anthropic_fallback(self, image_url: str, role: str) -> str:
cache_key = hashlib.sha256(str(image_url or "").encode("utf-8")).hexdigest()
cached = self._anthropic_image_fallback_cache.get(cache_key)
if cached:
return cached
role_label = {
"assistant": "assistant",
"tool": "tool result",
}.get(role, "user")
analysis_prompt = (
"Describe everything visible in this image in thorough detail. "
"Include any text, code, UI, data, objects, people, layout, colors, "
"and any other notable visual information."
)
vision_source = str(image_url or "")
cleanup_path: Optional[Path] = None
if vision_source.startswith("data:"):
vision_source, cleanup_path = self._materialize_data_url_for_vision(vision_source)
description = ""
try:
from tools.vision_tools import vision_analyze_tool
result_json = asyncio.run(
vision_analyze_tool(image_url=vision_source, user_prompt=analysis_prompt)
)
result = json.loads(result_json) if isinstance(result_json, str) else {}
description = (result.get("analysis") or "").strip()
except Exception as e:
description = f"Image analysis failed: {e}"
finally:
if cleanup_path and cleanup_path.exists():
try:
cleanup_path.unlink()
except OSError:
pass
if not description:
description = "Image analysis failed."
note = f"[The {role_label} attached an image. Here's what it contains:\n{description}]"
if vision_source and not str(image_url or "").startswith("data:"):
note += (
f"\n[If you need a closer look, use vision_analyze with image_url: {vision_source}]"
)
self._anthropic_image_fallback_cache[cache_key] = note
return note
def _preprocess_anthropic_content(self, content: Any, role: str) -> Any:
if not self._content_has_image_parts(content):
return content
text_parts: List[str] = []
image_notes: List[str] = []
for part in content:
if isinstance(part, str):
if part.strip():
text_parts.append(part.strip())
continue
if not isinstance(part, dict):
continue
ptype = part.get("type")
if ptype in {"text", "input_text"}:
text = str(part.get("text", "") or "").strip()
if text:
text_parts.append(text)
continue
if ptype in {"image_url", "input_image"}:
image_data = part.get("image_url", {})
image_url = image_data.get("url", "") if isinstance(image_data, dict) else str(image_data or "")
if image_url:
image_notes.append(self._describe_image_for_anthropic_fallback(image_url, role))
else:
image_notes.append("[An image was attached but no image source was available.]")
continue
text = str(part.get("text", "") or "").strip()
if text:
text_parts.append(text)
prefix = "\n\n".join(note for note in image_notes if note).strip()
suffix = "\n".join(text for text in text_parts if text).strip()
if prefix and suffix:
return f"{prefix}\n\n{suffix}"
if prefix:
return prefix
if suffix:
return suffix
return "[A multimodal message was converted to text for Anthropic compatibility.]"
def _prepare_anthropic_messages_for_api(self, api_messages: list) -> list:
if not any(
isinstance(msg, dict) and self._content_has_image_parts(msg.get("content"))
for msg in api_messages
):
return api_messages
transformed = copy.deepcopy(api_messages)
for msg in transformed:
if not isinstance(msg, dict):
continue
msg["content"] = self._preprocess_anthropic_content(
msg.get("content"),
str(msg.get("role", "user") or "user"),
)
return transformed
def _build_api_kwargs(self, api_messages: list) -> dict:
"""Build the keyword arguments dict for the active API mode."""
if self.api_mode == "anthropic_messages":
from agent.anthropic_adapter import build_anthropic_kwargs
anthropic_messages = self._prepare_anthropic_messages_for_api(api_messages)
return build_anthropic_kwargs(
model=self.model,
messages=api_messages,
messages=anthropic_messages,
tools=self.tools,
max_tokens=self.max_tokens,
reasoning_config=self.reasoning_config,