Hermes Agent UX Improvements
This commit is contained in:
parent
b1f55e3ee5
commit
ededaaa874
23 changed files with 945 additions and 1545 deletions
|
|
@ -281,7 +281,12 @@ def check_dangerous_command(command: str, env_type: str,
|
|||
approval_callback=approval_callback)
|
||||
|
||||
if choice == "deny":
|
||||
return {"approved": False, "message": "BLOCKED: User denied this potentially dangerous command. Do NOT retry this command - the user has explicitly rejected it."}
|
||||
return {
|
||||
"approved": False,
|
||||
"message": f"BLOCKED: User denied this potentially dangerous command (matched '{description}' pattern). Do NOT retry this command - the user has explicitly rejected it.",
|
||||
"pattern_key": pattern_key,
|
||||
"description": description,
|
||||
}
|
||||
|
||||
if choice == "session":
|
||||
approve_session(session_key, pattern_key)
|
||||
|
|
|
|||
|
|
@ -51,25 +51,16 @@ import signal
|
|||
import subprocess
|
||||
import shutil
|
||||
import sys
|
||||
import asyncio
|
||||
import tempfile
|
||||
import threading
|
||||
import time
|
||||
import requests
|
||||
from typing import Dict, Any, Optional, List
|
||||
from pathlib import Path
|
||||
from hermes_constants import OPENROUTER_CHAT_URL
|
||||
from agent.auxiliary_client import get_vision_auxiliary_client
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Try to import httpx for async LLM calls
|
||||
try:
|
||||
import httpx
|
||||
HTTPX_AVAILABLE = True
|
||||
except ImportError:
|
||||
HTTPX_AVAILABLE = False
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Configuration
|
||||
# ============================================================================
|
||||
|
|
@ -83,8 +74,8 @@ DEFAULT_SESSION_TIMEOUT = 300
|
|||
# Max tokens for snapshot content before summarization
|
||||
SNAPSHOT_SUMMARIZE_THRESHOLD = 8000
|
||||
|
||||
# Model for task-aware extraction
|
||||
EXTRACTION_MODEL = "google/gemini-3-flash-preview"
|
||||
# Resolve vision auxiliary client for extraction/vision tasks
|
||||
_aux_vision_client, EXTRACTION_MODEL = get_vision_auxiliary_client()
|
||||
|
||||
# Track active sessions per task
|
||||
# Now stores tuple of (session_name, browserbase_session_id, cdp_url)
|
||||
|
|
@ -782,87 +773,49 @@ def _run_browser_command(
|
|||
return {"success": False, "error": str(e)}
|
||||
|
||||
|
||||
async def _extract_relevant_content(
|
||||
def _extract_relevant_content(
|
||||
snapshot_text: str,
|
||||
user_task: Optional[str] = None
|
||||
) -> str:
|
||||
"""Use LLM to extract relevant content from a snapshot based on the user's task.
|
||||
|
||||
Falls back to simple truncation when no auxiliary vision model is configured.
|
||||
"""
|
||||
Use LLM to extract relevant content from a snapshot based on the user's task.
|
||||
|
||||
This provides task-aware summarization that preserves meaningful text content
|
||||
(paragraphs, prices, descriptions) relevant to what the user is trying to accomplish.
|
||||
|
||||
Args:
|
||||
snapshot_text: The full snapshot text
|
||||
user_task: The user's current task/goal (optional)
|
||||
|
||||
Returns:
|
||||
Summarized/extracted content
|
||||
"""
|
||||
if not HTTPX_AVAILABLE:
|
||||
# Fall back to simple truncation
|
||||
if _aux_vision_client is None or EXTRACTION_MODEL is None:
|
||||
return _truncate_snapshot(snapshot_text)
|
||||
|
||||
# Get API key
|
||||
api_key = os.environ.get("OPENROUTER_API_KEY")
|
||||
if not api_key:
|
||||
return _truncate_snapshot(snapshot_text)
|
||||
|
||||
# Build extraction prompt
|
||||
|
||||
if user_task:
|
||||
extraction_prompt = f"""You are a content extractor for a browser automation agent.
|
||||
|
||||
The user's task is: {user_task}
|
||||
|
||||
Given the following page snapshot (accessibility tree representation), extract and summarize the most relevant information for completing this task. Focus on:
|
||||
1. Interactive elements (buttons, links, inputs) that might be needed
|
||||
2. Text content relevant to the task (prices, descriptions, headings, important info)
|
||||
3. Navigation structure if relevant
|
||||
|
||||
Keep ref IDs (like [ref=e5]) for interactive elements so the agent can use them.
|
||||
|
||||
Page Snapshot:
|
||||
{snapshot_text}
|
||||
|
||||
Provide a concise summary that preserves actionable information and relevant content."""
|
||||
extraction_prompt = (
|
||||
f"You are a content extractor for a browser automation agent.\n\n"
|
||||
f"The user's task is: {user_task}\n\n"
|
||||
f"Given the following page snapshot (accessibility tree representation), "
|
||||
f"extract and summarize the most relevant information for completing this task. Focus on:\n"
|
||||
f"1. Interactive elements (buttons, links, inputs) that might be needed\n"
|
||||
f"2. Text content relevant to the task (prices, descriptions, headings, important info)\n"
|
||||
f"3. Navigation structure if relevant\n\n"
|
||||
f"Keep ref IDs (like [ref=e5]) for interactive elements so the agent can use them.\n\n"
|
||||
f"Page Snapshot:\n{snapshot_text}\n\n"
|
||||
f"Provide a concise summary that preserves actionable information and relevant content."
|
||||
)
|
||||
else:
|
||||
extraction_prompt = f"""Summarize this page snapshot, preserving:
|
||||
1. All interactive elements with their ref IDs (like [ref=e5])
|
||||
2. Key text content and headings
|
||||
3. Important information visible on the page
|
||||
|
||||
Page Snapshot:
|
||||
{snapshot_text}
|
||||
|
||||
Provide a concise summary focused on interactive elements and key content."""
|
||||
extraction_prompt = (
|
||||
f"Summarize this page snapshot, preserving:\n"
|
||||
f"1. All interactive elements with their ref IDs (like [ref=e5])\n"
|
||||
f"2. Key text content and headings\n"
|
||||
f"3. Important information visible on the page\n\n"
|
||||
f"Page Snapshot:\n{snapshot_text}\n\n"
|
||||
f"Provide a concise summary focused on interactive elements and key content."
|
||||
)
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=30.0) as client:
|
||||
response = await client.post(
|
||||
OPENROUTER_CHAT_URL,
|
||||
headers={
|
||||
"Authorization": f"Bearer {api_key}",
|
||||
"Content-Type": "application/json"
|
||||
},
|
||||
json={
|
||||
"model": EXTRACTION_MODEL,
|
||||
"messages": [
|
||||
{"role": "user", "content": extraction_prompt}
|
||||
],
|
||||
"max_tokens": 4000,
|
||||
"temperature": 0.1
|
||||
}
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
result = response.json()
|
||||
return result["choices"][0]["message"]["content"]
|
||||
else:
|
||||
# Fall back to truncation on API error
|
||||
return _truncate_snapshot(snapshot_text)
|
||||
|
||||
response = _aux_vision_client.chat.completions.create(
|
||||
model=EXTRACTION_MODEL,
|
||||
messages=[{"role": "user", "content": extraction_prompt}],
|
||||
max_tokens=4000,
|
||||
temperature=0.1,
|
||||
)
|
||||
return response.choices[0].message.content
|
||||
except Exception:
|
||||
# Fall back to truncation on any error
|
||||
return _truncate_snapshot(snapshot_text)
|
||||
|
||||
|
||||
|
|
@ -991,16 +944,7 @@ def browser_snapshot(
|
|||
|
||||
# Check if snapshot needs summarization
|
||||
if len(snapshot_text) > SNAPSHOT_SUMMARIZE_THRESHOLD and user_task:
|
||||
# Run async extraction
|
||||
try:
|
||||
loop = asyncio.get_event_loop()
|
||||
except RuntimeError:
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
|
||||
snapshot_text = loop.run_until_complete(
|
||||
_extract_relevant_content(snapshot_text, user_task)
|
||||
)
|
||||
snapshot_text = _extract_relevant_content(snapshot_text, user_task)
|
||||
elif len(snapshot_text) > SNAPSHOT_SUMMARIZE_THRESHOLD:
|
||||
snapshot_text = _truncate_snapshot(snapshot_text)
|
||||
|
||||
|
|
@ -1286,12 +1230,12 @@ def browser_vision(question: str, task_id: Optional[str] = None) -> str:
|
|||
|
||||
effective_task_id = task_id or "default"
|
||||
|
||||
# Check for OpenRouter API key
|
||||
api_key = os.environ.get("OPENROUTER_API_KEY")
|
||||
if not api_key:
|
||||
# Check auxiliary vision client
|
||||
if _aux_vision_client is None or EXTRACTION_MODEL is None:
|
||||
return json.dumps({
|
||||
"success": False,
|
||||
"error": "OPENROUTER_API_KEY not set. Vision analysis requires this API key."
|
||||
"error": "Browser vision unavailable: no auxiliary vision model configured. "
|
||||
"Set OPENROUTER_API_KEY or configure Nous Portal to enable browser vision."
|
||||
}, ensure_ascii=False)
|
||||
|
||||
# Create a temporary file for the screenshot
|
||||
|
|
@ -1325,110 +1269,36 @@ def browser_vision(question: str, task_id: Optional[str] = None) -> str:
|
|||
image_base64 = base64.b64encode(image_data).decode("ascii")
|
||||
data_url = f"data:image/png;base64,{image_base64}"
|
||||
|
||||
# Prepare the vision prompt
|
||||
vision_prompt = f"""You are analyzing a screenshot of a web browser.
|
||||
vision_prompt = (
|
||||
f"You are analyzing a screenshot of a web browser.\n\n"
|
||||
f"User's question: {question}\n\n"
|
||||
f"Provide a detailed and helpful answer based on what you see in the screenshot. "
|
||||
f"If there are interactive elements, describe them. If there are verification challenges "
|
||||
f"or CAPTCHAs, describe what type they are and what action might be needed. "
|
||||
f"Focus on answering the user's specific question."
|
||||
)
|
||||
|
||||
User's question: {question}
|
||||
|
||||
Provide a detailed and helpful answer based on what you see in the screenshot.
|
||||
If there are interactive elements, describe them. If there are verification challenges
|
||||
or CAPTCHAs, describe what type they are and what action might be needed.
|
||||
Focus on answering the user's specific question."""
|
||||
|
||||
# Call OpenRouter/Gemini for vision analysis
|
||||
if HTTPX_AVAILABLE:
|
||||
import asyncio
|
||||
|
||||
async def analyze_screenshot():
|
||||
async with httpx.AsyncClient(timeout=60.0) as client:
|
||||
response = await client.post(
|
||||
OPENROUTER_CHAT_URL,
|
||||
headers={
|
||||
"Authorization": f"Bearer {api_key}",
|
||||
"Content-Type": "application/json"
|
||||
},
|
||||
json={
|
||||
"model": "google/gemini-3-flash-preview",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": vision_prompt},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": data_url}
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"max_tokens": 2000,
|
||||
"temperature": 0.1
|
||||
}
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
return {
|
||||
"success": False,
|
||||
"error": f"Vision API error: {response.status_code} - {response.text[:200]}"
|
||||
}
|
||||
|
||||
result_data = response.json()
|
||||
analysis = result_data["choices"][0]["message"]["content"]
|
||||
return {
|
||||
"success": True,
|
||||
"analysis": analysis
|
||||
}
|
||||
|
||||
# Run the async function
|
||||
try:
|
||||
loop = asyncio.get_event_loop()
|
||||
except RuntimeError:
|
||||
loop = asyncio.new_event_loop()
|
||||
asyncio.set_event_loop(loop)
|
||||
|
||||
vision_result = loop.run_until_complete(analyze_screenshot())
|
||||
return json.dumps(vision_result, ensure_ascii=False)
|
||||
|
||||
else:
|
||||
# Fallback: use synchronous requests
|
||||
response = requests.post(
|
||||
OPENROUTER_CHAT_URL,
|
||||
headers={
|
||||
"Authorization": f"Bearer {api_key}",
|
||||
"Content-Type": "application/json"
|
||||
},
|
||||
json={
|
||||
"model": "google/gemini-3-flash-preview",
|
||||
"messages": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": vision_prompt},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": data_url}
|
||||
}
|
||||
]
|
||||
}
|
||||
# Use the sync auxiliary vision client directly
|
||||
response = _aux_vision_client.chat.completions.create(
|
||||
model=EXTRACTION_MODEL,
|
||||
messages=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": vision_prompt},
|
||||
{"type": "image_url", "image_url": {"url": data_url}},
|
||||
],
|
||||
"max_tokens": 2000,
|
||||
"temperature": 0.1
|
||||
},
|
||||
timeout=60
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
return json.dumps({
|
||||
"success": False,
|
||||
"error": f"Vision API error: {response.status_code} - {response.text[:200]}"
|
||||
}, ensure_ascii=False)
|
||||
|
||||
result_data = response.json()
|
||||
analysis = result_data["choices"][0]["message"]["content"]
|
||||
return json.dumps({
|
||||
"success": True,
|
||||
"analysis": analysis
|
||||
}, ensure_ascii=False)
|
||||
}
|
||||
],
|
||||
max_tokens=2000,
|
||||
temperature=0.1,
|
||||
)
|
||||
|
||||
analysis = response.choices[0].message.content
|
||||
return json.dumps({
|
||||
"success": True,
|
||||
"analysis": analysis,
|
||||
}, ensure_ascii=False)
|
||||
|
||||
except Exception as e:
|
||||
return json.dumps({
|
||||
|
|
|
|||
|
|
@ -22,9 +22,19 @@ import os
|
|||
import logging
|
||||
from typing import Dict, Any, List, Optional
|
||||
|
||||
from tools.openrouter_client import get_async_client as _get_client
|
||||
from openai import AsyncOpenAI, OpenAI
|
||||
|
||||
SUMMARIZER_MODEL = "google/gemini-3-flash-preview"
|
||||
from agent.auxiliary_client import get_text_auxiliary_client
|
||||
|
||||
# Resolve the auxiliary client at import time so we have the model slug.
|
||||
# We build an AsyncOpenAI from the same credentials for async summarization.
|
||||
_aux_client, _SUMMARIZER_MODEL = get_text_auxiliary_client()
|
||||
_async_aux_client: AsyncOpenAI | None = None
|
||||
if _aux_client is not None:
|
||||
_async_aux_client = AsyncOpenAI(
|
||||
api_key=_aux_client.api_key,
|
||||
base_url=str(_aux_client.base_url),
|
||||
)
|
||||
MAX_SESSION_CHARS = 100_000
|
||||
MAX_SUMMARY_TOKENS = 2000
|
||||
|
||||
|
|
@ -126,11 +136,15 @@ async def _summarize_session(
|
|||
f"Summarize this conversation with focus on: {query}"
|
||||
)
|
||||
|
||||
if _async_aux_client is None or _SUMMARIZER_MODEL is None:
|
||||
logging.warning("No auxiliary model available for session summarization")
|
||||
return None
|
||||
|
||||
max_retries = 3
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
response = await _get_client().chat.completions.create(
|
||||
model=SUMMARIZER_MODEL,
|
||||
response = await _async_aux_client.chat.completions.create(
|
||||
model=_SUMMARIZER_MODEL,
|
||||
messages=[
|
||||
{"role": "system", "content": system_prompt},
|
||||
{"role": "user", "content": user_prompt},
|
||||
|
|
@ -252,8 +266,8 @@ def session_search(
|
|||
|
||||
|
||||
def check_session_search_requirements() -> bool:
|
||||
"""Requires SQLite state database and OpenRouter API key."""
|
||||
if not os.getenv("OPENROUTER_API_KEY"):
|
||||
"""Requires SQLite state database and an auxiliary text model."""
|
||||
if _async_aux_client is None:
|
||||
return False
|
||||
try:
|
||||
from hermes_state import DEFAULT_DB_PATH
|
||||
|
|
@ -316,5 +330,4 @@ registry.register(
|
|||
limit=args.get("limit", 3),
|
||||
db=kw.get("db")),
|
||||
check_fn=check_session_search_requirements,
|
||||
requires_env=["OPENROUTER_API_KEY"],
|
||||
)
|
||||
|
|
|
|||
|
|
@ -359,7 +359,6 @@ Do NOT use vim/nano/interactive tools without pty=true — they hang without a p
|
|||
|
||||
# Global state for environment lifecycle management
|
||||
_active_environments: Dict[str, Any] = {}
|
||||
_task_workdirs: Dict[str, str] = {} # Maps task_id to working directory
|
||||
_last_activity: Dict[str, float] = {}
|
||||
_env_lock = threading.Lock()
|
||||
_creation_locks: Dict[str, threading.Lock] = {} # Per-task locks for sandbox creation
|
||||
|
|
@ -530,7 +529,6 @@ def _cleanup_inactive_envs(lifetime_seconds: int = 300):
|
|||
if current_time - last_time > lifetime_seconds:
|
||||
env = _active_environments.pop(task_id, None)
|
||||
_last_activity.pop(task_id, None)
|
||||
_task_workdirs.pop(task_id, None)
|
||||
if env is not None:
|
||||
envs_to_stop.append((task_id, env))
|
||||
|
||||
|
|
@ -609,7 +607,7 @@ def get_active_environments_info() -> Dict[str, Any]:
|
|||
info = {
|
||||
"count": len(_active_environments),
|
||||
"task_ids": list(_active_environments.keys()),
|
||||
"workdirs": dict(_task_workdirs),
|
||||
"workdirs": {},
|
||||
}
|
||||
|
||||
# Calculate total disk usage
|
||||
|
|
@ -632,7 +630,7 @@ def get_active_environments_info() -> Dict[str, Any]:
|
|||
|
||||
def cleanup_all_environments():
|
||||
"""Clean up ALL active environments. Use with caution."""
|
||||
global _active_environments, _last_activity, _task_workdirs
|
||||
global _active_environments, _last_activity
|
||||
|
||||
task_ids = list(_active_environments.keys())
|
||||
cleaned = 0
|
||||
|
|
@ -661,7 +659,7 @@ def cleanup_all_environments():
|
|||
|
||||
def cleanup_vm(task_id: str):
|
||||
"""Manually clean up a specific environment by task_id."""
|
||||
global _active_environments, _last_activity, _task_workdirs
|
||||
global _active_environments, _last_activity
|
||||
|
||||
# Remove from tracking dicts while holding the lock, but defer the
|
||||
# actual (potentially slow) env.cleanup() call to outside the lock
|
||||
|
|
@ -669,7 +667,6 @@ def cleanup_vm(task_id: str):
|
|||
env = None
|
||||
with _env_lock:
|
||||
env = _active_environments.pop(task_id, None)
|
||||
_task_workdirs.pop(task_id, None)
|
||||
_last_activity.pop(task_id, None)
|
||||
|
||||
# Clean up per-task creation lock
|
||||
|
|
@ -782,17 +779,6 @@ def terminal_tool(
|
|||
default_timeout = config["timeout"]
|
||||
effective_timeout = timeout or default_timeout
|
||||
|
||||
# For local environment in batch mode, create a unique subdirectory per task
|
||||
# This prevents parallel tasks from overwriting each other's files
|
||||
# In CLI mode (HERMES_QUIET), use the cwd directly without subdirectories
|
||||
if env_type == "local" and not os.getenv("HERMES_QUIET"):
|
||||
with _env_lock:
|
||||
if effective_task_id not in _task_workdirs:
|
||||
task_workdir = Path(cwd) / f"hermes-{effective_task_id}-{uuid.uuid4().hex[:8]}"
|
||||
task_workdir.mkdir(parents=True, exist_ok=True)
|
||||
_task_workdirs[effective_task_id] = str(task_workdir)
|
||||
cwd = _task_workdirs[effective_task_id]
|
||||
|
||||
# Start cleanup thread
|
||||
_start_cleanup_thread()
|
||||
|
||||
|
|
@ -874,11 +860,16 @@ def terminal_tool(
|
|||
"description": approval.get("description", "dangerous command"),
|
||||
"pattern_key": approval.get("pattern_key", ""),
|
||||
}, ensure_ascii=False)
|
||||
# Command was blocked - return informative message
|
||||
# Command was blocked - include the pattern category so the caller knows why
|
||||
desc = approval.get("description", "potentially dangerous operation")
|
||||
fallback_msg = (
|
||||
f"Command denied: matches '{desc}' pattern. "
|
||||
"Use the approval prompt to allow it, or rephrase the command."
|
||||
)
|
||||
return json.dumps({
|
||||
"output": "",
|
||||
"exit_code": -1,
|
||||
"error": approval.get("message", "Command denied - potentially dangerous operation"),
|
||||
"error": approval.get("message", fallback_msg),
|
||||
"status": "blocked"
|
||||
}, ensure_ascii=False)
|
||||
|
||||
|
|
@ -996,11 +987,17 @@ def terminal_tool(
|
|||
# Add helpful message for sudo failures in messaging context
|
||||
output = _handle_sudo_failure(output, env_type)
|
||||
|
||||
# Truncate output if too long
|
||||
# Truncate output if too long, keeping both head and tail
|
||||
MAX_OUTPUT_CHARS = 50000
|
||||
if len(output) > MAX_OUTPUT_CHARS:
|
||||
truncated_notice = f"\n\n... [OUTPUT TRUNCATED - showing last {MAX_OUTPUT_CHARS} chars of {len(output)} total] ..."
|
||||
output = truncated_notice + output[-MAX_OUTPUT_CHARS:]
|
||||
head_chars = int(MAX_OUTPUT_CHARS * 0.4) # 40% head (error messages often appear early)
|
||||
tail_chars = MAX_OUTPUT_CHARS - head_chars # 60% tail (most recent/relevant output)
|
||||
omitted = len(output) - head_chars - tail_chars
|
||||
truncated_notice = (
|
||||
f"\n\n... [OUTPUT TRUNCATED - {omitted} chars omitted "
|
||||
f"out of {len(output)} total] ...\n\n"
|
||||
)
|
||||
output = output[:head_chars] + truncated_notice + output[-tail_chars:]
|
||||
|
||||
return json.dumps({
|
||||
"output": output.strip() if output else "",
|
||||
|
|
|
|||
|
|
@ -36,13 +36,20 @@ import base64
|
|||
from pathlib import Path
|
||||
from typing import Dict, Any, Optional
|
||||
import httpx
|
||||
from tools.openrouter_client import get_async_client as _get_openrouter_client, check_api_key as check_openrouter_api_key
|
||||
from openai import AsyncOpenAI
|
||||
from agent.auxiliary_client import get_vision_auxiliary_client
|
||||
from tools.debug_helpers import DebugSession
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Configuration for vision processing
|
||||
DEFAULT_VISION_MODEL = "google/gemini-3-flash-preview"
|
||||
# Resolve vision auxiliary client at module level; build an async wrapper.
|
||||
_aux_sync_client, DEFAULT_VISION_MODEL = get_vision_auxiliary_client()
|
||||
_aux_async_client: AsyncOpenAI | None = None
|
||||
if _aux_sync_client is not None:
|
||||
_aux_async_client = AsyncOpenAI(
|
||||
api_key=_aux_sync_client.api_key,
|
||||
base_url=str(_aux_sync_client.base_url),
|
||||
)
|
||||
|
||||
_debug = DebugSession("vision_tools", env_var="VISION_TOOLS_DEBUG")
|
||||
|
||||
|
|
@ -230,9 +237,13 @@ async def vision_analyze_tool(
|
|||
logger.info("Analyzing image: %s", image_url[:60])
|
||||
logger.info("User prompt: %s", user_prompt[:100])
|
||||
|
||||
# Check API key availability
|
||||
if not os.getenv("OPENROUTER_API_KEY"):
|
||||
raise ValueError("OPENROUTER_API_KEY environment variable not set")
|
||||
# Check auxiliary vision client availability
|
||||
if _aux_async_client is None or DEFAULT_VISION_MODEL is None:
|
||||
return json.dumps({
|
||||
"success": False,
|
||||
"analysis": "Vision analysis unavailable: no auxiliary vision model configured. "
|
||||
"Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools."
|
||||
}, indent=2, ensure_ascii=False)
|
||||
|
||||
# Determine if this is a local file path or a remote URL
|
||||
local_path = Path(image_url)
|
||||
|
|
@ -291,18 +302,12 @@ async def vision_analyze_tool(
|
|||
|
||||
logger.info("Processing image with %s...", model)
|
||||
|
||||
# Call the vision API with reasoning enabled
|
||||
response = await _get_openrouter_client().chat.completions.create(
|
||||
# Call the vision API
|
||||
response = await _aux_async_client.chat.completions.create(
|
||||
model=model,
|
||||
messages=messages,
|
||||
temperature=0.1, # Low temperature for consistent analysis
|
||||
max_tokens=2000, # Generous limit for detailed analysis
|
||||
extra_body={
|
||||
"reasoning": {
|
||||
"enabled": True,
|
||||
"effort": "xhigh"
|
||||
}
|
||||
}
|
||||
temperature=0.1,
|
||||
max_tokens=2000,
|
||||
)
|
||||
|
||||
# Extract the analysis
|
||||
|
|
@ -353,13 +358,8 @@ async def vision_analyze_tool(
|
|||
|
||||
|
||||
def check_vision_requirements() -> bool:
|
||||
"""
|
||||
Check if all requirements for vision tools are met.
|
||||
|
||||
Returns:
|
||||
bool: True if requirements are met, False otherwise
|
||||
"""
|
||||
return check_openrouter_api_key()
|
||||
"""Check if an auxiliary vision model is available."""
|
||||
return _aux_async_client is not None
|
||||
|
||||
|
||||
def get_debug_session_info() -> Dict[str, Any]:
|
||||
|
|
@ -379,16 +379,15 @@ if __name__ == "__main__":
|
|||
print("👁️ Vision Tools Module")
|
||||
print("=" * 40)
|
||||
|
||||
# Check if API key is available
|
||||
api_available = check_openrouter_api_key()
|
||||
# Check if vision model is available
|
||||
api_available = check_vision_requirements()
|
||||
|
||||
if not api_available:
|
||||
print("❌ OPENROUTER_API_KEY environment variable not set")
|
||||
print("Please set your API key: export OPENROUTER_API_KEY='your-key-here'")
|
||||
print("Get API key at: https://openrouter.ai/")
|
||||
print("❌ No auxiliary vision model available")
|
||||
print("Set OPENROUTER_API_KEY or configure Nous Portal to enable vision tools.")
|
||||
exit(1)
|
||||
else:
|
||||
print("✅ OpenRouter API key found")
|
||||
print(f"✅ Vision model available: {DEFAULT_VISION_MODEL}")
|
||||
|
||||
print("🛠️ Vision tools ready for use!")
|
||||
print(f"🧠 Using model: {DEFAULT_VISION_MODEL}")
|
||||
|
|
@ -455,7 +454,8 @@ def _handle_vision_analyze(args, **kw):
|
|||
image_url = args.get("image_url", "")
|
||||
question = args.get("question", "")
|
||||
full_prompt = f"Fully describe and explain everything about this image, then answer the following question:\n\n{question}"
|
||||
return vision_analyze_tool(image_url, full_prompt, "google/gemini-3-flash-preview")
|
||||
model = DEFAULT_VISION_MODEL or "google/gemini-3-flash-preview"
|
||||
return vision_analyze_tool(image_url, full_prompt, model)
|
||||
|
||||
|
||||
registry.register(
|
||||
|
|
@ -464,6 +464,5 @@ registry.register(
|
|||
schema=VISION_ANALYZE_SCHEMA,
|
||||
handler=_handle_vision_analyze,
|
||||
check_fn=check_vision_requirements,
|
||||
requires_env=["OPENROUTER_API_KEY"],
|
||||
is_async=True,
|
||||
)
|
||||
|
|
|
|||
|
|
@ -47,7 +47,8 @@ import re
|
|||
import asyncio
|
||||
from typing import List, Dict, Any, Optional
|
||||
from firecrawl import Firecrawl
|
||||
from tools.openrouter_client import get_async_client as _get_openrouter_client
|
||||
from openai import AsyncOpenAI
|
||||
from agent.auxiliary_client import get_text_auxiliary_client
|
||||
from tools.debug_helpers import DebugSession
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -64,9 +65,17 @@ def _get_firecrawl_client():
|
|||
_firecrawl_client = Firecrawl(api_key=api_key)
|
||||
return _firecrawl_client
|
||||
|
||||
DEFAULT_SUMMARIZER_MODEL = "google/gemini-3-flash-preview"
|
||||
DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION = 5000
|
||||
|
||||
# Resolve auxiliary text client at module level; build an async wrapper.
|
||||
_aux_sync_client, DEFAULT_SUMMARIZER_MODEL = get_text_auxiliary_client()
|
||||
_aux_async_client: AsyncOpenAI | None = None
|
||||
if _aux_sync_client is not None:
|
||||
_aux_async_client = AsyncOpenAI(
|
||||
api_key=_aux_sync_client.api_key,
|
||||
base_url=str(_aux_sync_client.base_url),
|
||||
)
|
||||
|
||||
_debug = DebugSession("web_tools", env_var="WEB_TOOLS_DEBUG")
|
||||
|
||||
|
||||
|
|
@ -223,7 +232,10 @@ Create a markdown summary that captures all key information in a well-organized,
|
|||
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
response = await _get_openrouter_client().chat.completions.create(
|
||||
if _aux_async_client is None:
|
||||
logger.warning("No auxiliary model available for web content processing")
|
||||
return None
|
||||
response = await _aux_async_client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[
|
||||
{"role": "system", "content": system_prompt},
|
||||
|
|
@ -231,12 +243,6 @@ Create a markdown summary that captures all key information in a well-organized,
|
|||
],
|
||||
temperature=0.1,
|
||||
max_tokens=max_tokens,
|
||||
extra_body={
|
||||
"reasoning": {
|
||||
"enabled": True,
|
||||
"effort": "xhigh"
|
||||
}
|
||||
}
|
||||
)
|
||||
return response.choices[0].message.content.strip()
|
||||
except Exception as api_error:
|
||||
|
|
@ -342,7 +348,14 @@ Synthesize these into ONE cohesive, comprehensive summary that:
|
|||
Create a single, unified markdown summary."""
|
||||
|
||||
try:
|
||||
response = await _get_openrouter_client().chat.completions.create(
|
||||
if _aux_async_client is None:
|
||||
logger.warning("No auxiliary model for synthesis, concatenating summaries")
|
||||
fallback = "\n\n".join(summaries)
|
||||
if len(fallback) > max_output_size:
|
||||
fallback = fallback[:max_output_size] + "\n\n[... truncated ...]"
|
||||
return fallback
|
||||
|
||||
response = await _aux_async_client.chat.completions.create(
|
||||
model=model,
|
||||
messages=[
|
||||
{"role": "system", "content": "You synthesize multiple summaries into one cohesive, comprehensive summary. Be thorough but concise."},
|
||||
|
|
@ -350,12 +363,6 @@ Create a single, unified markdown summary."""
|
|||
],
|
||||
temperature=0.1,
|
||||
max_tokens=4000,
|
||||
extra_body={
|
||||
"reasoning": {
|
||||
"enabled": True,
|
||||
"effort": "xhigh"
|
||||
}
|
||||
}
|
||||
)
|
||||
final_summary = response.choices[0].message.content.strip()
|
||||
|
||||
|
|
@ -677,8 +684,8 @@ async def web_extract_tool(
|
|||
debug_call_data["pages_extracted"] = pages_extracted
|
||||
debug_call_data["original_response_size"] = len(json.dumps(response))
|
||||
|
||||
# Process each result with LLM if enabled
|
||||
if use_llm_processing and os.getenv("OPENROUTER_API_KEY"):
|
||||
# Process each result with LLM if enabled and auxiliary client is available
|
||||
if use_llm_processing and _aux_async_client is not None:
|
||||
logger.info("Processing extracted content with LLM (parallel)...")
|
||||
debug_call_data["processing_applied"].append("llm_processing")
|
||||
|
||||
|
|
@ -744,8 +751,8 @@ async def web_extract_tool(
|
|||
else:
|
||||
logger.warning("%s (no content to process)", url)
|
||||
else:
|
||||
if use_llm_processing and not os.getenv("OPENROUTER_API_KEY"):
|
||||
logger.warning("LLM processing requested but OPENROUTER_API_KEY not set, returning raw content")
|
||||
if use_llm_processing and _aux_async_client is None:
|
||||
logger.warning("LLM processing requested but no auxiliary model available, returning raw content")
|
||||
debug_call_data["processing_applied"].append("llm_processing_unavailable")
|
||||
|
||||
# Print summary of extracted pages for debugging (original behavior)
|
||||
|
|
@ -973,8 +980,8 @@ async def web_crawl_tool(
|
|||
debug_call_data["pages_crawled"] = pages_crawled
|
||||
debug_call_data["original_response_size"] = len(json.dumps(response))
|
||||
|
||||
# Process each result with LLM if enabled
|
||||
if use_llm_processing and os.getenv("OPENROUTER_API_KEY"):
|
||||
# Process each result with LLM if enabled and auxiliary client is available
|
||||
if use_llm_processing and _aux_async_client is not None:
|
||||
logger.info("Processing crawled content with LLM (parallel)...")
|
||||
debug_call_data["processing_applied"].append("llm_processing")
|
||||
|
||||
|
|
@ -1040,8 +1047,8 @@ async def web_crawl_tool(
|
|||
else:
|
||||
logger.warning("%s (no content to process)", page_url)
|
||||
else:
|
||||
if use_llm_processing and not os.getenv("OPENROUTER_API_KEY"):
|
||||
logger.warning("LLM processing requested but OPENROUTER_API_KEY not set, returning raw content")
|
||||
if use_llm_processing and _aux_async_client is None:
|
||||
logger.warning("LLM processing requested but no auxiliary model available, returning raw content")
|
||||
debug_call_data["processing_applied"].append("llm_processing_unavailable")
|
||||
|
||||
# Print summary of crawled pages for debugging (original behavior)
|
||||
|
|
@ -1096,14 +1103,9 @@ def check_firecrawl_api_key() -> bool:
|
|||
return bool(os.getenv("FIRECRAWL_API_KEY"))
|
||||
|
||||
|
||||
def check_nous_api_key() -> bool:
|
||||
"""
|
||||
Check if the Nous Research API key is available in environment variables.
|
||||
|
||||
Returns:
|
||||
bool: True if API key is set, False otherwise
|
||||
"""
|
||||
return bool(os.getenv("OPENROUTER_API_KEY"))
|
||||
def check_auxiliary_model() -> bool:
|
||||
"""Check if an auxiliary text model is available for LLM content processing."""
|
||||
return _aux_async_client is not None
|
||||
|
||||
|
||||
def get_debug_session_info() -> Dict[str, Any]:
|
||||
|
|
@ -1120,7 +1122,7 @@ if __name__ == "__main__":
|
|||
|
||||
# Check if API keys are available
|
||||
firecrawl_available = check_firecrawl_api_key()
|
||||
nous_available = check_nous_api_key()
|
||||
nous_available = check_auxiliary_model()
|
||||
|
||||
if not firecrawl_available:
|
||||
print("❌ FIRECRAWL_API_KEY environment variable not set")
|
||||
|
|
@ -1130,12 +1132,11 @@ if __name__ == "__main__":
|
|||
print("✅ Firecrawl API key found")
|
||||
|
||||
if not nous_available:
|
||||
print("❌ OPENROUTER_API_KEY environment variable not set")
|
||||
print("Please set your API key: export OPENROUTER_API_KEY='your-key-here'")
|
||||
print("Get API key at: https://inference-api.nousresearch.com/")
|
||||
print("⚠️ Without Nous API key, LLM content processing will be disabled")
|
||||
print("❌ No auxiliary model available for LLM content processing")
|
||||
print("Set OPENROUTER_API_KEY, configure Nous Portal, or set OPENAI_BASE_URL + OPENAI_API_KEY")
|
||||
print("⚠️ Without an auxiliary model, LLM content processing will be disabled")
|
||||
else:
|
||||
print("✅ Nous Research API key found")
|
||||
print(f"✅ Auxiliary model available: {DEFAULT_SUMMARIZER_MODEL}")
|
||||
|
||||
if not firecrawl_available:
|
||||
exit(1)
|
||||
|
|
@ -1143,7 +1144,7 @@ if __name__ == "__main__":
|
|||
print("🛠️ Web tools ready for use!")
|
||||
|
||||
if nous_available:
|
||||
print("🧠 LLM content processing available with Gemini 3 Flash Preview via OpenRouter")
|
||||
print(f"🧠 LLM content processing available with {DEFAULT_SUMMARIZER_MODEL}")
|
||||
print(f" Default min length for processing: {DEFAULT_MIN_LENGTH_FOR_SUMMARIZATION} chars")
|
||||
|
||||
# Show debug mode status
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue