From 74cb5455caf79a17218d9ab72568393b9fa9fc65 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=9A=D0=BE=D0=B1=D1=8B=D0=BB=D0=BA=D0=B5=D0=B2=D0=B8?= =?UTF-8?q?=D1=87=20=D0=A4=D1=91=D0=B4=D0=BE=D1=80?= Date: Thu, 26 Mar 2026 23:28:59 +0300 Subject: [PATCH] update skill --- GUI_BROWSER_FIX.md | 106 ++++++++++++++ browser-use/SKILL.md | 191 +++++++++++++++++++++++++ browser-use/assets/config.example.json | 16 +++ browser-use/assets/config.json | 16 +++ browser-use/scripts/requirements.txt | 2 + browser-use/scripts/run_browser_use.py | 181 +++++++++++++++++++++++ browser-use/scripts/setup.sh | 12 ++ 7 files changed, 524 insertions(+) create mode 100644 GUI_BROWSER_FIX.md create mode 100644 browser-use/SKILL.md create mode 100644 browser-use/assets/config.example.json create mode 100644 browser-use/assets/config.json create mode 100644 browser-use/scripts/requirements.txt create mode 100644 browser-use/scripts/run_browser_use.py create mode 100644 browser-use/scripts/setup.sh diff --git a/GUI_BROWSER_FIX.md b/GUI_BROWSER_FIX.md new file mode 100644 index 00000000..9d8b67f2 --- /dev/null +++ b/GUI_BROWSER_FIX.md @@ -0,0 +1,106 @@ +# 🎯 Решение: browser-use с GUI браузером + +## ✅ Проблема решена! + +Теперь все действия через hermes-agent **ТРАНСЛИРУЮТСЯ** на GUI браузер в реальном времени. 
+ +## 🚀 Как использовать + +### 1️⃣ Запустите стек + +```bash +docker compose --profile gui up -d +``` + +### 2️⃣ Откройте VNC в браузере + +```bash +open http://localhost:6080/vnc.html +``` + +### 3️⃣ Дайте задачу агенту + +Напишите что-нибудь типа: +``` +"Откройте example.com и найдите заголовок страницы" +``` + +**Или** запустите напрямую: +```bash +docker compose exec -T hermes-agent python \ + /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \ + --task "Open google.com and search for 'hello world'" \ + --max-steps 5 +``` + +### 4️⃣ Смотрите действия в VNC окне 🎬 + +Вы видите, как агент: +- 🔍 Навигирует по сайтам +- 🖱️ Кликает по кнопкам +- ⌨️ Вводит текст +- 📜 Скроллит страницу + +## 🔧 Что было исправлено + +### Проблема: "Host header is specified and is not an IP address" + +Chromium CDP API **проверяет Host заголовок в HTTP запросах** и отвергает имена хостов. + +**Решение:** Используем IP адрес контейнера вместо имени: +- ❌ ~~`BROWSER_USE_CDP_URL=http://chromium-gui:9223`~~ +- ✅ `BROWSER_USE_CDP_URL=http://172.25.0.3:9223` + +### Файлы, которые были обновлены: + +1. **`docker-compose.yml`** + - Изменена `BROWSER_USE_CDP_URL` на `http://172.25.0.3:9223` + - Добавлена зависимость от `chromium-gui` в hermes-agent + +2. 
**`docker/chromium-gui/start.sh`** + - Добавлена socat для проксирования TCP через IPv6 + - Chromium слушает на `::1:9223` (IPv6 localhost) + - socat пробрасывает `9223` на все интерфейсы + +## 📊 Архитектура + +``` +hermes-agent (контейнер) + │ + ├─ BROWSER_USE_CDP_URL=http://172.25.0.3:9223 + │ + └─→ chromium-gui (контейнер) + │ + ├─ Chromium слушает на ::1:9223 (IPv6) + │ + ├─ socat (TCP-LISTEN:9223 → TCP6:[::1]:9223) + │ + ├─ x11vnc (захватывает Xvfb) + │ + └─ websockify (VNC → WebSocket) + │ + └─→ http://localhost:6080/vnc.html (ваш браузер) +``` + +## 🎮 Протестировано + +```bash +# ✅ CDP доступен +docker compose exec -T hermes-agent bash -c 'curl -s http://172.25.0.3:9223/json/version' + +# ✅ VNC доступен +open http://localhost:6080/vnc.html + +# ✅ socat проксирует +docker compose exec chromium-gui netstat -tlnp | grep 9223 +``` + +## 📚 Дополнительно + +- Полная документация: [`GUI_BROWSER_SETUP.md`](./GUI_BROWSER_SETUP.md) +- Диагностика проблем: смотрите раздел "Диагностика" в [`GUI_BROWSER_SETUP.md`](./GUI_BROWSER_SETUP.md) + +--- + +**Теперь browser-use полностью интегрирован с GUI браузером! 🎉** + diff --git a/browser-use/SKILL.md b/browser-use/SKILL.md new file mode 100644 index 00000000..b38b71bc --- /dev/null +++ b/browser-use/SKILL.md @@ -0,0 +1,191 @@ +--- +name: browser-use +version: "1.1.0" +description: Run web automation tasks through browser-use and Chromium CDP (headless or GUI). +triggers: + - "browser-use" + - "open website and extract" + - "automate browser task" + - "run browser task" + - "открой сайт" + - "заполни форму" + - "найди на странице" + - "сделай в браузере" +allowed-tools: + - terminal + - file + - memory +--- + +# Browser Use (Chromium/CDP) + +Use this skill when a task requires real browser actions: open pages, click, type, submit forms, extract text/data, verify visible results. 
+ +## Decision: when to use this skill + +Use `browser-use` if user asks to: +- navigate websites step-by-step; +- interact with UI elements (buttons, inputs, dropdowns); +- extract structured content from rendered pages; +- complete multi-step flows (login/search/filter/checkout draft). + +Do **not** use `browser-use` if task is: +- pure static fetch/API call (use lighter tools); +- local file manipulation only; +- impossible due to CAPTCHA/2FA/region lock without user intervention. + +## What the agent can and cannot see + +Short answer to common question: **the agent sees the rendered page state, not all JavaScript source by default**. + +The agent typically sees/uses: +- rendered DOM and interactive elements; +- visible text/content after JS execution; +- current URL, titles, form states; +- action results/errors returned by browser-use. + +The agent does **not automatically** get: +- full source code of all loaded JS bundles; +- complete DevTools Network timeline; +- hidden backend logic not exposed in page content. + +If user asks about JS specifically, do explicit steps: +1. locate script URLs from page source/DOM; +2. open script URL(s) directly; +3. extract needed fragments (function names, endpoints, constants). + +## Runtime modes (CDP endpoints) + +This project supports two modes. + +1) Headless browserless Chromium: +- CDP: `ws://chromium:3000/chromium?token=hermes-local` + +2) GUI Chromium (visible in noVNC): +- CDP: `http://172.25.0.3:9223` +- Visual stream: `http://localhost:6080/vnc.html` + +Notes: +- `run_browser_use.py` accepts both `ws://` and `http://` CDP URLs. +- For `http://`, script resolves `/json/version` and converts to websocket URL automatically. 
+ +## Required environment + +Environment variables: +- required: `OPENAI_API_KEY` +- optional: `OPENAI_BASE_URL` +- optional: `OPENAI_MODEL` or `BROWSER_USE_MODEL` +- optional override: `BROWSER_USE_CDP_URL` + +Defaults in this repo: +- `BROWSER_USE_PYTHON=/opt/browser-use-venv/bin/python` +- `BROWSER_USE_CDP_URL=http://172.25.0.3:9223` (from `docker-compose.yml`) + +## Quick runbook (inside Docker) + +1. Ensure services are up: + +```bash +docker compose --profile gui up -d +docker compose ps +``` + +2. Check env in `hermes-agent`: + +```bash +docker compose exec -T hermes-agent python - <<'PY' +import os +print('OPENAI_API_KEY', 'set' if os.getenv('OPENAI_API_KEY') else 'MISSING') +print('BROWSER_USE_CDP_URL', os.getenv('BROWSER_USE_CDP_URL', '(unset)')) +print('OPENAI_MODEL', os.getenv('OPENAI_MODEL', '(unset)')) +PY +``` + +3. Run a task: + +```bash +python-browser-use /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \ + --task "Open example.com and return page title" \ + --max-steps 8 +``` + +4. For GUI visibility, open stream: + +```bash +open "http://localhost:6080/vnc.html" +``` + +## Runbook (outside Docker) + +Use one combined command so env vars are available in the same process: + +```bash +export OPENAI_API_KEY="$OPENAI_API_KEY" && \ +export BROWSER_USE_CDP_URL="$BROWSER_USE_CDP_URL" && \ +/opt/browser-use-venv/bin/python /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \ + --task "YOUR_TASK" \ + --max-steps 20 +``` + +## How Hermes should call this skill + +Standard pattern: + +```bash +python-browser-use /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \ + --task "YOUR_TASK" \ + --max-steps 20 +``` + +If user gave a starting page, add `--start-url`. 
+ +```bash +python-browser-use /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \ + --task "Find contact email" \ + --start-url "https://example.com" \ + --max-steps 20 +``` + +## Troubleshooting (symptom -> action) + +`{"success": false, "error": "OPENAI_API_KEY is not set"}` +- check `workspace/.env` and `hermes_data/.env`; +- recreate container: + +```bash +docker compose up -d --force-recreate hermes-agent +``` + +`401 key_model_access_denied` +- model is not allowed for API key; +- set `BROWSER_USE_MODEL` or `OPENAI_MODEL` to an allowed model. + +`Connection refused` or CDP errors +- verify browser container is running: + +```bash +docker compose ps +docker compose exec -T hermes-agent bash -lc 'curl -s http://172.25.0.3:9223/json/version | head' +``` + +Timeout / exit code `124` +- not necessarily script failure; +- increase `--max-steps` and/or task timeout envelope. + +## Site-specific limitations + +- Yandex Music: may be blocked by region. +- Wildberries: anti-bot/CAPTCHA may block automation. + +When blocked by anti-bot/2FA/CAPTCHA: +- ask user for manual intervention; +- continue automation after challenge is passed; +- or switch to non-browser strategy if acceptable. + +## Operational notes + +- Script file: `/root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py` +- Script output: JSON (`success`, `cdp_url`, `result.final_result`, `result.errors`) +- In current implementation `use_vision=False`, so decisions are based on browser-use structured state rather than visual screenshot reasoning. 
+
+
diff --git a/browser-use/assets/config.example.json b/browser-use/assets/config.example.json
new file mode 100644
index 00000000..e8e3e146
--- /dev/null
+++ b/browser-use/assets/config.example.json
@@ -0,0 +1,16 @@
+{
+  "browser": {
+    "cdp_url": "ws://chromium:3000/chromium?token=hermes-local",
+    "headless": true,
+    "timeout": 120000
+  },
+  "agent": {
+    "model_env": "BROWSER_USE_MODEL",
+    "max_steps": 20,
+    "use_vision": false
+  },
+  "logging": {
+    "level": "info"
+  }
+}
+
diff --git a/browser-use/assets/config.json b/browser-use/assets/config.json
new file mode 100644
index 00000000..8f355553
--- /dev/null
+++ b/browser-use/assets/config.json
@@ -0,0 +1,16 @@
+{
+  "browser": {
+    "cdp_url": "ws://chromium:3000/playwright?token=hermes-local",
+    "headless": true,
+    "timeout": 120000
+  },
+  "agent": {
+    "model_env": "BROWSER_USE_MODEL",
+    "max_steps": 20,
+    "use_vision": false
+  },
+  "logging": {
+    "level": "info"
+  }
+}
+
diff --git a/browser-use/scripts/requirements.txt b/browser-use/scripts/requirements.txt
new file mode 100644
index 00000000..33650044
--- /dev/null
+++ b/browser-use/scripts/requirements.txt
@@ -0,0 +1,2 @@
+browser-use==0.12.5
+
diff --git a/browser-use/scripts/run_browser_use.py b/browser-use/scripts/run_browser_use.py
new file mode 100644
index 00000000..401e52fc
--- /dev/null
+++ b/browser-use/scripts/run_browser_use.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+"""Run browser-use task against a Chromium CDP endpoint."""
+
+import argparse
+import asyncio
+import json
+import os
+import socket
+import traceback
+from pathlib import Path
+from typing import Any
+from urllib.parse import urlparse, urlunparse
+from urllib.request import urlopen
+
+from browser_use import Agent, BrowserSession
+from browser_use.llm import ChatOpenAI
+
+
+# Dotenv-style files consulted, in order, when a variable is absent from the
+# process environment.
+ENV_FALLBACK_PATHS = (
+    Path("/workspace/.env"),
+    Path("/workspace/workspace/.env"),
+    Path("/root/.hermes/.env"),
+)
+
+# Hosts queried as written; every other host is resolved to a numeric address.
+_LOCAL_HOSTS = frozenset({"localhost", "127.0.0.1", "0.0.0.0"})
+
+
+def _read_env_from_files(name: str) -> str | None:
+    """Return the first non-empty value of *name* found in the fallback files.
+
+    Files in ``ENV_FALLBACK_PATHS`` are scanned in order. Blank lines, ``#``
+    comments and lines without ``=`` are skipped, an optional leading
+    ``export`` keyword is accepted, and surrounding quotes are stripped from
+    the value. Missing, unreadable or non-UTF-8 files are ignored.
+    """
+    for env_path in ENV_FALLBACK_PATHS:
+        try:
+            text = env_path.read_text(encoding="utf-8")
+        except (OSError, UnicodeDecodeError):
+            # Best-effort fallback: one broken file must not abort the lookup.
+            continue
+        for raw_line in text.splitlines():
+            line = raw_line.strip()
+            if not line or line.startswith("#") or "=" not in line:
+                continue
+            key, value = line.split("=", 1)
+            if key.strip().removeprefix("export ").strip() != name:
+                continue
+            cleaned = value.strip().strip('"').strip("'")
+            if cleaned:
+                return cleaned
+            # Empty assignment: keep searching instead of giving up early.
+    return None
+
+
+def _get_env(name: str, default: str | None = None) -> str | None:
+    """Look up *name* in the environment, then the fallback files, then *default*.
+
+    Empty values count as unset at every level, so an empty *default* yields
+    ``None``.
+    """
+    return os.getenv(name) or _read_env_from_files(name) or default or None
+
+
+def _build_task(task: str, start_url: str | None) -> str:
+    """Prefix *task* with a start-URL instruction when *start_url* is given."""
+    if not start_url:
+        return task
+    return f"Start from {start_url}. Task: {task}"
+
+
+def _serialize_history(history: Any) -> dict[str, Any]:
+    """Reduce an agent history object to a JSON-serializable summary.
+
+    ``final_result()`` and ``errors()`` are called only when present, and any
+    failure while reading them falls back to an empty value so a report can
+    always be produced.
+    """
+    result = ""
+    errors: list[str] = []
+    if hasattr(history, "final_result"):
+        try:
+            result = history.final_result() or ""
+        except Exception:
+            result = ""
+    if hasattr(history, "errors"):
+        try:
+            errors = [str(e) for e in history.errors() if e]
+        except Exception:
+            errors = []
+    return {
+        "final_result": result,
+        "errors": errors,
+        "has_errors": bool(errors),
+    }
+
+
+def _resolve_host(host: str) -> str | None:
+    """Return the IPv4 address of *host*, or ``None`` to use *host* as written.
+
+    ``None`` is returned for an empty host, for the entries of ``_LOCAL_HOSTS``
+    and when name resolution fails.
+    """
+    if not host or host in _LOCAL_HOSTS:
+        return None
+    try:
+        return socket.gethostbyname(host)
+    except (OSError, UnicodeError):
+        return None
+
+
+def _resolve_cdp_url(cdp_url: str) -> str:
+    """Turn *cdp_url* into the websocket URL handed to browser-use.
+
+    ``ws://`` and ``wss://`` URLs are returned unchanged. For ``http(s)://``
+    URLs the ``/json/version`` document is fetched and its
+    ``webSocketDebuggerUrl`` is returned.
+
+    Raises:
+        RuntimeError: the scheme is unsupported or the endpoint response has
+            no ``webSocketDebuggerUrl``.
+        OSError: the endpoint cannot be reached (includes ``URLError``).
+        ValueError: the URL has an invalid port or the response is not JSON.
+    """
+    if cdp_url.startswith(("ws://", "wss://")):
+        return cdp_url
+    if not cdp_url.startswith(("http://", "https://")):
+        raise RuntimeError(f"Unsupported CDP URL scheme: {cdp_url}")
+
+    parsed = urlparse(cdp_url)
+    # Chrome DevTools rejects non-IP/non-localhost Host headers in some setups.
+    # For docker service names, resolve to IP once and query via numeric host.
+    resolved_host = _resolve_host(parsed.hostname or "")
+    if resolved_host:
+        port = parsed.port
+        netloc = resolved_host if not port else f"{resolved_host}:{port}"
+        parsed = parsed._replace(netloc=netloc)
+
+    # Extend the path component only, so a query string such as ``?token=...``
+    # stays after ``/json/version`` instead of ending up in the middle of it.
+    path = parsed.path.rstrip("/")
+    if not path.endswith("/json/version"):
+        path = f"{path}/json/version"
+    version_url = urlunparse(parsed._replace(path=path))
+
+    with urlopen(version_url, timeout=10) as response:  # nosec B310
+        payload = json.loads(response.read().decode("utf-8"))
+    ws_url = payload.get("webSocketDebuggerUrl")
+    if not ws_url:
+        raise RuntimeError(f"CDP endpoint did not return webSocketDebuggerUrl: {version_url}")
+
+    # Keep a reachable host for ws:// URL when input used docker DNS alias.
+    if resolved_host:
+        ws_parsed = urlparse(str(ws_url))
+        ws_port = ws_parsed.port
+        if ws_port is None:
+            ws_port = 443 if ws_parsed.scheme == "wss" else 80
+        ws_url = urlunparse(ws_parsed._replace(netloc=f"{resolved_host}:{ws_port}"))
+
+    return str(ws_url)
+
+
+async def _run(args: argparse.Namespace) -> int:
+    """Run the task and print one JSON report on stdout.
+
+    Returns:
+        ``0`` when the agent reported no errors, ``1`` when the CDP endpoint
+        could not be resolved or the agent failed or reported errors, ``2``
+        when ``OPENAI_API_KEY`` is not configured.
+    """
+    api_key = _get_env("OPENAI_API_KEY")
+    if not api_key:
+        print(json.dumps({"success": False, "error": "OPENAI_API_KEY is not set"}))
+        return 2
+
+    model = _get_env("BROWSER_USE_MODEL", _get_env("OPENAI_MODEL", "gpt-4o-mini"))
+    base_url = _get_env("OPENAI_BASE_URL")
+    raw_cdp_url = args.cdp_url or _get_env("BROWSER_USE_CDP_URL", "ws://chromium:3000/chromium?token=hermes-local")
+    try:
+        cdp_url = _resolve_cdp_url(raw_cdp_url)
+    except (OSError, RuntimeError, ValueError) as exc:
+        # Keep the JSON-on-stdout contract when the browser is unreachable.
+        print(
+            json.dumps(
+                {
+                    "success": False,
+                    "error": f"Cannot resolve CDP endpoint: {exc}",
+                    "cdp_url": raw_cdp_url,
+                }
+            )
+        )
+        return 1
+
+    llm = ChatOpenAI(
+        model=model,
+        api_key=api_key,
+        base_url=base_url,
+        temperature=0.0,
+    )
+
+    browser_session = BrowserSession(cdp_url=cdp_url)
+    agent = Agent(
+        task=_build_task(args.task, args.start_url),
+        llm=llm,
+        browser_session=browser_session,
+        use_vision=False,
+    )
+
+    try:
+        history = await agent.run(max_steps=args.max_steps)
+    except Exception as exc:  # top-level boundary: report as JSON, trace on stderr
+        traceback.print_exc()
+        print(
+            json.dumps(
+                {
+                    "success": False,
+                    "model": model,
+                    "cdp_url": cdp_url,
+                    "task": args.task,
+                    "error": f"{type(exc).__name__}: {exc}",
+                },
+                ensure_ascii=True,
+            )
+        )
+        return 1
+    payload = _serialize_history(history)
+
+    print(
+        json.dumps(
+            {
+                "success": not payload["has_errors"],
+                "model": model,
+                "cdp_url": cdp_url,
+                "task": args.task,
+                "result": payload,
+            },
+            ensure_ascii=True,
+        )
+    )
+    return 0 if not payload["has_errors"] else 1
+
+
+def main() -> int:
+    """Parse command-line arguments and run the task; return the exit code."""
+    parser = argparse.ArgumentParser(description="Run browser-use task")
+    parser.add_argument("--task", required=True, help="Natural language task for browser-use")
+    parser.add_argument("--start-url", default=None, help="Optional URL to open first")
+    parser.add_argument("--max-steps", type=int, default=20, help="Max agent steps")
+    parser.add_argument("--cdp-url", default=None, help="CDP URL (ws://... or http://.../json/version host)")
+    args = parser.parse_args()
+    return asyncio.run(_run(args))
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
+
diff --git a/browser-use/scripts/setup.sh b/browser-use/scripts/setup.sh
new file mode 100644
index 00000000..6be17146
--- /dev/null
+++ b/browser-use/scripts/setup.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+VENV_DIR="${SCRIPT_DIR}/.venv"
+
+python3 -m venv "${VENV_DIR}"
+"${VENV_DIR}/bin/pip" install --upgrade pip
+"${VENV_DIR}/bin/pip" install -r "${SCRIPT_DIR}/requirements.txt"
+
+echo "browser-use skill environment is ready: ${VENV_DIR}"
+