Enhance image handling and analysis capabilities across platforms

- Updated the vision tool to accept both HTTP/HTTPS URLs and local file paths for image analysis.
- Implemented caching of user-uploaded images in local directories to ensure reliable access for the vision tool, addressing issues with ephemeral URLs.
- Enhanced platform adapters (Discord, Telegram, WhatsApp) to download and cache images, allowing for immediate analysis and enriched message context.
- Added a new method to auto-analyze images attached by users, enriching the conversation with detailed descriptions.
- Improved documentation of the image-handling flow and updated related functions for clarity and efficiency.
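
The adapter-side caching mentioned above is not shown in the excerpt below, so here is a minimal sketch of the idea, assuming an aiohttp download into a flat cache directory; the helper name, cache path, and suffix handling are illustrative assumptions, not identifiers from this commit:

import os
import uuid

import aiohttp

CACHE_DIR = "/tmp/gateway_image_cache"  # assumed location, not from this diff

async def cache_user_image(url: str, suffix: str = ".jpg") -> str:
    """Download an ephemeral platform media URL to a stable local path."""
    os.makedirs(CACHE_DIR, exist_ok=True)
    local_path = os.path.join(CACHE_DIR, f"{uuid.uuid4().hex}{suffix}")
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            resp.raise_for_status()
            data = await resp.read()
    with open(local_path, "wb") as f:
        f.write(data)
    return local_path  # adapters would place this path in MessageEvent.media_urls

The cached path is what later arrives in event.media_urls, which is why the gateway code below can hand it straight to the vision tool.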
teknium1 2026-02-15 16:10:50 -08:00
parent eb49936a60
commit 5404a8fcd8
7 changed files with 303 additions and 35 deletions
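
To make the enrichment format concrete before the diff: with one cached image attached, the agent receives a message shaped roughly like the following (the description text and file path are invented placeholders; the bracketed wrapper matches the f-strings in _enrich_message_with_vision below):

[User sent an image. Vision analysis:
A screenshot of a Python traceback ending in KeyError: 'session_id' ...]
[To examine this image further, use vision_analyze with image_url: /tmp/gateway_image_cache/3f9c...jpg]

what does this error mean?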


@@ -58,7 +58,7 @@ from gateway.session import (
     build_session_context_prompt,
 )
 from gateway.delivery import DeliveryRouter, DeliveryTarget
-from gateway.platforms.base import BasePlatformAdapter, MessageEvent
+from gateway.platforms.base import BasePlatformAdapter, MessageEvent, MessageType
 
 
 class GatewayRunner:
@@ -298,10 +298,39 @@ class GatewayRunner:
         # Load conversation history from transcript
         history = self.session_store.load_transcript(session_entry.session_id)
 
+        # -----------------------------------------------------------------
+        # Auto-analyze images sent by the user
+        #
+        # If the user attached image(s), we run the vision tool eagerly so
+        # the conversation model always receives a text description. The
+        # local file path is also included so the model can re-examine the
+        # image later with a more targeted question via vision_analyze.
+        #
+        # We filter to image paths only (by media_type) so that non-image
+        # attachments (documents, audio, etc.) are not sent to the vision
+        # tool even when they appear in the same message.
+        # -----------------------------------------------------------------
+        message_text = event.text or ""
+        if event.media_urls:
+            image_paths = []
+            for i, path in enumerate(event.media_urls):
+                # Check media_types if available; otherwise infer from message type
+                mtype = event.media_types[i] if i < len(event.media_types) else ""
+                is_image = (
+                    mtype.startswith("image/")
+                    or event.message_type == MessageType.PHOTO
+                )
+                if is_image:
+                    image_paths.append(path)
+            if image_paths:
+                message_text = await self._enrich_message_with_vision(
+                    message_text, image_paths
+                )
+
         try:
             # Run the agent
             response = await self._run_agent(
-                message=event.text,
+                message=message_text,
                 context_prompt=context_prompt,
                 history=history,
                 source=source,
@@ -320,10 +349,10 @@
             except Exception:
                 pass
 
-        # Append to transcript
+        # Append to transcript (use the enriched message so vision context is preserved)
         self.session_store.append_to_transcript(
             session_entry.session_id,
-            {"role": "user", "content": event.text, "timestamp": datetime.now().isoformat()}
+            {"role": "user", "content": message_text, "timestamp": datetime.now().isoformat()}
         )
         self.session_store.append_to_transcript(
             session_entry.session_id,
@@ -411,6 +440,75 @@
             if var in os.environ:
                 del os.environ[var]
 
+    async def _enrich_message_with_vision(
+        self,
+        user_text: str,
+        image_paths: List[str],
+    ) -> str:
+        """
+        Auto-analyze user-attached images with the vision tool and prepend
+        the descriptions to the message text.
+
+        Each image is analyzed with a general-purpose prompt. The resulting
+        description *and* the local cache path are injected so the model can:
+        1. Immediately understand what the user sent (no extra tool call).
+        2. Re-examine the image with vision_analyze if it needs more detail.
+
+        Args:
+            user_text: The user's original caption / message text.
+            image_paths: List of local file paths to cached images.
+
+        Returns:
+            The enriched message string with vision descriptions prepended.
+        """
+        from tools.vision_tools import vision_analyze_tool
+        import json as _json
+
+        analysis_prompt = (
+            "Describe everything visible in this image in thorough detail. "
+            "Include any text, code, data, objects, people, layout, colors, "
+            "and any other notable visual information."
+        )
+
+        enriched_parts = []
+        for path in image_paths:
+            try:
+                print(f"[gateway] Auto-analyzing user image: {path}", flush=True)
+                result_json = await vision_analyze_tool(
+                    image_url=path,
+                    user_prompt=analysis_prompt,
+                )
+                result = _json.loads(result_json)
+                if result.get("success"):
+                    description = result.get("analysis", "")
+                    enriched_parts.append(
+                        f"[User sent an image. Vision analysis:\n{description}]\n"
+                        f"[To examine this image further, use vision_analyze with "
+                        f"image_url: {path}]"
+                    )
+                else:
+                    # Analysis failed -- still tell the model the image exists
+                    enriched_parts.append(
+                        f"[User sent an image but automatic analysis failed. "
+                        f"You can try analyzing it with vision_analyze using "
+                        f"image_url: {path}]"
+                    )
+            except Exception as e:
+                print(f"[gateway] Vision auto-analysis error: {e}", flush=True)
+                enriched_parts.append(
+                    f"[User sent an image but automatic analysis encountered an error. "
+                    f"You can try analyzing it with vision_analyze using "
+                    f"image_url: {path}]"
+                )
+
+        # Combine: vision descriptions first, then the user's original text
+        if enriched_parts:
+            prefix = "\n\n".join(enriched_parts)
+            if user_text:
+                return f"{prefix}\n\n{user_text}"
+            return prefix
+        return user_text
+
     async def _run_agent(
         self,
         message: str,