Merge origin/main into hermes/hermes-5d160594

2026-03-14 19:34:05 -07:00 · 2026-03-14 19:34:05 -07:00 · 3229e434b8
commit 3229e434b8
parent 2536ff328b 81cd367aec
78 changed files with 3762 additions and 395 deletions
--- a/tools/browser_tool.py
+++ b/tools/browser_tool.py
@ -53,6 +53,7 @@ import atexit
 import json
 import logging
 import os
+import re
 import signal
 import subprocess
 import shutil
@ -165,63 +166,18 @@ def _emergency_cleanup_all_sessions():
    if not _active_sessions:
        return
    
-    logger.info("Emergency cleanup: closing %s active session(s)...", len(_active_sessions))
-    
+    logger.info("Emergency cleanup: closing %s active session(s)...",
+                len(_active_sessions))
+
    try:
-        if _is_local_mode():
-            # Local mode: just close agent-browser sessions via CLI
-            for task_id, session_info in list(_active_sessions.items()):
-                session_name = session_info.get("session_name")
-                if session_name:
-                    try:
-                        browser_cmd = _find_agent_browser()
-                        task_socket_dir = os.path.join(
-                            _socket_safe_tmpdir(),
-                            f"agent-browser-{session_name}"
-                        )
-                        env = {**os.environ, "AGENT_BROWSER_SOCKET_DIR": task_socket_dir}
-                        subprocess.run(
-                            browser_cmd.split() + ["--session", session_name, "--json", "close"],
-                            capture_output=True, timeout=5, env=env,
-                        )
-                        logger.info("Closed local session %s", session_name)
-                    except Exception as e:
-                        logger.debug("Error closing local session %s: %s", session_name, e)
-        else:
-            # Cloud mode: release Browserbase sessions via API
-            api_key = os.environ.get("BROWSERBASE_API_KEY")
-            project_id = os.environ.get("BROWSERBASE_PROJECT_ID")
-
-            if not api_key or not project_id:
-                logger.warning("Cannot cleanup - missing BROWSERBASE credentials")
-                return
-
-            for task_id, session_info in list(_active_sessions.items()):
-                bb_session_id = session_info.get("bb_session_id")
-                if bb_session_id:
-                    try:
-                        response = requests.post(
-                            f"https://api.browserbase.com/v1/sessions/{bb_session_id}",
-                            headers={
-                                "X-BB-API-Key": api_key,
-                                "Content-Type": "application/json"
-                            },
-                            json={
-                                "projectId": project_id,
-                                "status": "REQUEST_RELEASE"
-                            },
-                            timeout=5  # Short timeout for cleanup
-                        )
-                        if response.status_code in (200, 201, 204):
-                            logger.info("Closed session %s", bb_session_id)
-                        else:
-                            logger.warning("Failed to close session %s: HTTP %s", bb_session_id, response.status_code)
-                    except Exception as e:
-                        logger.error("Error closing session %s: %s", bb_session_id, e)
-        
-        _active_sessions.clear()
+        cleanup_all_browsers()
    except Exception as e:
        logger.error("Emergency cleanup error: %s", e)
+    finally:
+        with _cleanup_lock:
+            _active_sessions.clear()
+            _session_last_activity.clear()
+        _recording_sessions.clear()


 # Register cleanup via atexit only.  Previous versions installed SIGINT/SIGTERM
@ -640,18 +596,14 @@ def _create_browserbase_session(task_id: str) -> Dict[str, str]:


 def _create_local_session(task_id: str) -> Dict[str, str]:
-    """Create a lightweight local browser session (no cloud API call).
-
-    Returns the same dict shape as ``_create_browserbase_session`` so the rest
-    of the code can treat both modes uniformly.
-    """
    import uuid
-    session_name = f"hermes_{task_id}_{uuid.uuid4().hex[:8]}"
-    logger.info("Created local browser session %s", session_name)
+    session_name = f"h_{uuid.uuid4().hex[:10]}"
+    logger.info("Created local browser session %s for task %s",
+                session_name, task_id)
    return {
        "session_name": session_name,
-        "bb_session_id": None,   # Not applicable in local mode
-        "cdp_url": None,         # Not applicable in local mode
+        "bb_session_id": None,
+        "cdp_url": None,
        "features": {"local": True},
    }

@ -772,6 +724,27 @@ def _find_agent_browser() -> str:
    )


+def _extract_screenshot_path_from_text(text: str) -> Optional[str]:
+    """Extract a screenshot file path from agent-browser human-readable output."""
+    if not text:
+        return None
+
+    patterns = [
+        r"Screenshot saved to ['\"](?P<path>/[^'\"]+?\.png)['\"]",
+        r"Screenshot saved to (?P<path>/\S+?\.png)(?:\s|$)",
+        r"(?P<path>/\S+?\.png)(?:\s|$)",
+    ]
+
+    for pattern in patterns:
+        match = re.search(pattern, text)
+        if match:
+            path = match.group("path").strip().strip("'\"")
+            if path:
+                return path
+
+    return None
+
+
 def _run_browser_command(
    task_id: str,
    command: str,
@ -841,9 +814,20 @@ def _run_browser_command(
                     command, task_id, task_socket_dir, len(task_socket_dir))
        
        browser_env = {**os.environ}
-        # Ensure PATH includes standard dirs (systemd services may have minimal PATH)
-        if "/usr/bin" not in browser_env.get("PATH", "").split(":"):
-            browser_env["PATH"] = f"{browser_env.get('PATH', '')}:{_SANE_PATH}"
+
+        # Ensure PATH includes Hermes-managed Node first, then standard system dirs.
+        hermes_home = Path(os.environ.get("HERMES_HOME", Path.home() / ".hermes"))
+        hermes_node_bin = str(hermes_home / "node" / "bin")
+
+        existing_path = browser_env.get("PATH", "")
+        path_parts = [p for p in existing_path.split(":") if p]
+        candidate_dirs = [hermes_node_bin] + [p for p in _SANE_PATH.split(":") if p]
+
+        for part in reversed(candidate_dirs):
+            if os.path.isdir(part) and part not in path_parts:
+                path_parts.insert(0, part)
+
+        browser_env["PATH"] = ":".join(path_parts)
        browser_env["AGENT_BROWSER_SOCKET_DIR"] = task_socket_dir
        
        result = subprocess.run(
@ -866,10 +850,11 @@ def _run_browser_command(
                           command, " ".join(cmd_parts[:4]) + "...",
                           (result.stderr or "")[:200])

-        # Parse JSON output
-        if result.stdout.strip():
+        stdout_text = result.stdout.strip()
+
+        if stdout_text:
            try:
-                parsed = json.loads(result.stdout.strip())
+                parsed = json.loads(stdout_text)
                # Warn if snapshot came back empty (common sign of daemon/CDP issues)
                if command == "snapshot" and parsed.get("success"):
                    snap_data = parsed.get("data", {})
@ -879,13 +864,33 @@ def _run_browser_command(
                                       "returncode=%s", result.returncode)
                return parsed
            except json.JSONDecodeError:
-                # Non-JSON output indicates agent-browser crash or version mismatch
-                raw = result.stdout.strip()[:500]
+                raw = stdout_text[:2000]
                logger.warning("browser '%s' returned non-JSON output (rc=%s): %s",
-                               command, result.returncode, raw[:200])
+                               command, result.returncode, raw[:500])
+
+                if command == "screenshot":
+                    stderr_text = (result.stderr or "").strip()
+                    combined_text = "\n".join(
+                        part for part in [stdout_text, stderr_text] if part
+                    )
+                    recovered_path = _extract_screenshot_path_from_text(combined_text)
+
+                    if recovered_path and Path(recovered_path).exists():
+                        logger.info(
+                            "browser 'screenshot' recovered file from non-JSON output: %s",
+                            recovered_path,
+                        )
+                        return {
+                            "success": True,
+                            "data": {
+                                "path": recovered_path,
+                                "raw": raw,
+                            },
+                        }
+
                return {
-                    "success": True,
-                    "data": {"raw": raw}
+                    "success": False,
+                    "error": f"Non-JSON output from agent-browser for '{command}': {raw}"
                }
        
        # Check for errors
@ -1250,46 +1255,26 @@ def browser_press(key: str, task_id: Optional[str] = None) -> str:
 def browser_close(task_id: Optional[str] = None) -> str:
    """
    Close the browser session.
-    
+
    Args:
        task_id: Task identifier for session isolation
-        
+
    Returns:
        JSON string with close result
    """
    effective_task_id = task_id or "default"
-    
-    # Stop auto-recording before closing
-    _maybe_stop_recording(effective_task_id)
-    
-    result = _run_browser_command(effective_task_id, "close", [])
-    
-    # Close the backend session (Browserbase API in cloud mode, nothing extra in local mode)
-    session_key = task_id if task_id and task_id in _active_sessions else "default"
-    if session_key in _active_sessions:
-        session_info = _active_sessions[session_key]
-        bb_session_id = session_info.get("bb_session_id")
-        if bb_session_id:
-            # Cloud mode: release the Browserbase session via API
-            try:
-                config = _get_browserbase_config()
-                _close_browserbase_session(bb_session_id, config["api_key"], config["project_id"])
-            except Exception as e:
-                logger.warning("Could not close BrowserBase session: %s", e)
-        del _active_sessions[session_key]
-    
-    if result.get("success"):
-        return json.dumps({
-            "success": True,
-            "closed": True
-        }, ensure_ascii=False)
-    else:
-        # Even if close fails, session was released
-        return json.dumps({
-            "success": True,
-            "closed": True,
-            "warning": result.get("error", "Session may not have been active")
-        }, ensure_ascii=False)
+    with _cleanup_lock:
+        had_session = effective_task_id in _active_sessions
+
+    cleanup_browser(effective_task_id)
+
+    response = {
+        "success": True,
+        "closed": True,
+    }
+    if not had_session:
+        response["warning"] = "Session may not have been active"
+    return json.dumps(response, ensure_ascii=False)


 def browser_console(clear: bool = False, task_id: Optional[str] = None) -> str:
@ -1481,9 +1466,11 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
        _cleanup_old_screenshots(screenshots_dir, max_age_hours=24)
        
        # Take screenshot using agent-browser
-        screenshot_args = [str(screenshot_path)]
+        screenshot_args = []
        if annotate:
-            screenshot_args.insert(0, "--annotate")
+            screenshot_args.append("--annotate")
+        screenshot_args.append("--full")
+        screenshot_args.append(str(screenshot_path))
        result = _run_browser_command(
            effective_task_id, 
            "screenshot", 
@ -1498,7 +1485,11 @@ def browser_vision(question: str, annotate: bool = False, task_id: Optional[str]
                "success": False,
                "error": f"Failed to take screenshot ({mode} mode): {error_detail}"
            }, ensure_ascii=False)
-        
+
+        actual_screenshot_path = result.get("data", {}).get("path")
+        if actual_screenshot_path:
+            screenshot_path = Path(actual_screenshot_path)
+
        # Check if screenshot file was created
        if not screenshot_path.exists():
            mode = "local" if _is_local_mode() else "cloud"
--- a/tools/cronjob_tools.py
+++ b/tools/cronjob_tools.py
@ -304,6 +304,12 @@ Jobs run in a fresh session with no current-chat context, so prompts must be sel
 If skill or skills are provided on create, the future cron run loads those skills in order, then follows the prompt as the task instruction.
 On update, passing skills=[] clears attached skills.

+NOTE: The agent's final response is auto-delivered to the target — do NOT use
+send_message in the prompt for that same destination. Same-target send_message
+calls are skipped to avoid duplicate cron deliveries. Put the primary
+user-facing content in the final response, and use send_message only for
+additional or different targets.
+
 Important safety rule: cron-run sessions should not recursively schedule more cron jobs.""",
    "parameters": {
        "type": "object",
--- a/tools/memory_tool.py
+++ b/tools/memory_tool.py
@ -435,24 +435,25 @@ def check_memory_requirements() -> bool:
 MEMORY_SCHEMA = {
    "name": "memory",
    "description": (
-        "Save important information to persistent memory that survives across sessions. "
-        "Your memory appears in your system prompt at session start -- it's how you "
-        "remember things about the user and your environment between conversations.\n\n"
+        "Save durable information to persistent memory that survives across sessions. "
+        "Memory is injected into future turns, so keep it compact and focused on facts "
+        "that will still matter later.\n\n"
        "WHEN TO SAVE (do this proactively, don't wait to be asked):\n"
        "- User shares a preference, habit, or personal detail (name, role, timezone, coding style)\n"
        "- You discover something about the environment (OS, installed tools, project structure)\n"
        "- User corrects you or says 'remember this' / 'don't do that again'\n"
        "- You learn a convention, API quirk, or workflow specific to this user's setup\n"
-        "- You completed something - log it like a diary entry\n"
-        "- After completing a complex task, save a brief note about what was done\n\n"
-        "- If you've discovered a new way to do something, solved a problem that could be necessary later, save it as a skill with the skill tool\n\n"
+        "- You identify a stable fact that will be useful again in future sessions\n\n"
+        "Do NOT save task progress, session outcomes, completed-work logs, or temporary TODO "
+        "state to memory; use session_search to recall those from past transcripts.\n"
+        "If you've discovered a new way to do something, solved a problem that could be "
+        "necessary later, save it as a skill with the skill tool.\n\n"
        "TWO TARGETS:\n"
        "- 'user': who the user is -- name, role, preferences, communication style, pet peeves\n"
        "- 'memory': your notes -- environment facts, project conventions, tool quirks, lessons learned\n\n"
        "ACTIONS: add (new entry), replace (update existing -- old_text identifies it), "
-        "remove (delete -- old_text identifies it).\n"
-        "Capacity shown in system prompt. When >80%, consolidate entries before adding new ones.\n\n"
-        "SKIP: trivial/obvious info, things easily re-discovered, raw data dumps."
+        "remove (delete -- old_text identifies it).\n\n"
+        "SKIP: trivial/obvious info, things easily re-discovered, raw data dumps, and temporary task state."
    ),
    "parameters": {
        "type": "object",
--- a/tools/send_message_tool.py
+++ b/tools/send_message_tool.py
@ -153,6 +153,10 @@ def _handle_send(args):
                f"or set a home channel via: hermes config set {platform_name.upper()}_HOME_CHANNEL <channel_id>"
            })

+    duplicate_skip = _maybe_skip_cron_duplicate_send(platform_name, chat_id, thread_id)
+    if duplicate_skip:
+        return json.dumps(duplicate_skip)
+
    try:
        from model_tools import _run_async
        result = _run_async(
@ -213,6 +217,51 @@ def _describe_media_for_mirror(media_files):
    return f"[Sent {len(media_files)} media attachments]"


+def _get_cron_auto_delivery_target():
+    """Return the cron scheduler's auto-delivery target for the current run, if any."""
+    platform = os.getenv("HERMES_CRON_AUTO_DELIVER_PLATFORM", "").strip().lower()
+    chat_id = os.getenv("HERMES_CRON_AUTO_DELIVER_CHAT_ID", "").strip()
+    if not platform or not chat_id:
+        return None
+    thread_id = os.getenv("HERMES_CRON_AUTO_DELIVER_THREAD_ID", "").strip() or None
+    return {
+        "platform": platform,
+        "chat_id": chat_id,
+        "thread_id": thread_id,
+    }
+
+
+def _maybe_skip_cron_duplicate_send(platform_name: str, chat_id: str, thread_id: str | None):
+    """Skip redundant cron send_message calls when the scheduler will auto-deliver there."""
+    auto_target = _get_cron_auto_delivery_target()
+    if not auto_target:
+        return None
+
+    same_target = (
+        auto_target["platform"] == platform_name
+        and str(auto_target["chat_id"]) == str(chat_id)
+        and auto_target.get("thread_id") == thread_id
+    )
+    if not same_target:
+        return None
+
+    target_label = f"{platform_name}:{chat_id}"
+    if thread_id is not None:
+        target_label += f":{thread_id}"
+
+    return {
+        "success": True,
+        "skipped": True,
+        "reason": "cron_auto_delivery_duplicate_target",
+        "target": target_label,
+        "note": (
+            f"Skipped send_message to {target_label}. This cron job will already auto-deliver "
+            "its final response to that same target. Put the intended user-facing content in "
+            "your final response instead, or use a different target if you want an additional message."
+        ),
+    }
+
+
 async def _send_to_platform(platform, pconfig, chat_id, message, thread_id=None, media_files=None):
    """Route a message to the appropriate platform sender."""
    from gateway.config import Platform
--- a/tools/session_search_tool.py
+++ b/tools/session_search_tool.py
@ -341,8 +341,8 @@ SESSION_SEARCH_SCHEMA = {
        "- The user references a project, person, or concept that seems familiar but isn't in memory\n"
        "- You want to check if you've solved a similar problem before\n"
        "- The user asks 'what did we do about X?' or 'how did we fix Y?'\n\n"
-        "Don't hesitate to search -- it's fast and cheap. Better to search and confirm "
-        "than to guess or ask the user to repeat themselves.\n\n"
+        "Don't hesitate to search when it is actually cross-session -- it's fast and cheap. "
+        "Better to search and confirm than to guess or ask the user to repeat themselves.\n\n"
        "Search syntax: keywords joined with OR for broad recall (elevenlabs OR baseten OR funding), "
        "phrases for exact match (\"docker networking\"), boolean (python NOT java), prefix (deploy*). "
        "IMPORTANT: Use OR between keywords for best results — FTS5 defaults to AND which misses "
--- a/tools/skills_guard.py
+++ b/tools/skills_guard.py
@ -645,14 +645,11 @@ def should_allow_install(result: ScanResult, force: bool = False) -> Tuple[bool,

    Args:
        result: Scan result from scan_skill()
-        force: If True, override blocks for caution verdicts (never overrides dangerous)
+        force: If True, override blocked policy decisions for this scan result

    Returns:
        (allowed, reason) tuple
    """
-    if result.verdict == "dangerous":
-        return False, f"Scan verdict is DANGEROUS ({len(result.findings)} findings). Blocked."
-
    policy = INSTALL_POLICY.get(result.trust_level, INSTALL_POLICY["community"])
    vi = VERDICT_INDEX.get(result.verdict, 2)
    decision = policy[vi]
@ -661,7 +658,10 @@ def should_allow_install(result: ScanResult, force: bool = False) -> Tuple[bool,
        return True, f"Allowed ({result.trust_level} source, {result.verdict} verdict)"

    if force:
-        return True, f"Force-installed despite {result.verdict} verdict ({len(result.findings)} findings)"
+        return True, (
+            f"Force-installed despite blocked {result.verdict} verdict "
+            f"({len(result.findings)} findings)"
+        )

    return False, (
        f"Blocked ({result.trust_level} source + {result.verdict} verdict, "
--- a/tools/vision_tools.py
+++ b/tools/vision_tools.py
@ -354,6 +354,7 @@ async def vision_analyze_tool(
        # Prepare error response
        result = {
            "success": False,
+            "error": error_msg,
            "analysis": analysis,
        }