diff --git a/.env.example b/.env.example
new file mode 100644
index 00000000..6aa5d4cf
--- /dev/null
+++ b/.env.example
@@ -0,0 +1,5 @@
+OPENAI_BASE_URL=
+OPENAI_API_KEY=
+HERMES_MAX_ITERATIONS=
+TELEGRAM_BOT_TOKEN=
+TERMINAL_ENV=
diff --git a/.gitignore b/.gitignore
index bd71037d..acebf673 100644
--- a/.gitignore
+++ b/.gitignore
@@ -51,5 +51,17 @@ $RECYCLE.BIN/
 
 # Windows shortcuts
 *.lnk
+.env
+
+hermes_data/*
+workspace/*
+
+SOLUTION_SUMMARY.md
+BROWSER_USE_QUICKSTART.md
+BROWSER_USE_SETUP.md
+START_HERE.md
+GUI_BROWSER_SETUP.md
+
+*/config.yaml
 
 *.idea
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 00000000..bc788f96
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,34 @@
+FROM python:3.11
+
+# Toolchain and headers used when pip has to build wheels from source.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+        build-essential \
+        curl \
+        git \
+        libffi-dev \
+        python3-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+RUN git clone https://github.com/NousResearch/hermes-agent.git /opt/hermes-agent
+
+WORKDIR /opt/hermes-agent
+# No virtualenv is created under /opt/hermes-agent, so the `hermes` command must
+# come from this install; fail the build here if it is missing.
+RUN pip install --no-cache-dir -e . \
+    && command -v hermes
+RUN pip install --no-cache-dir python-telegram-bot
+
+# Isolated runtime for browser-use to avoid dependency conflicts with hermes-agent.
+# The version is pinned to match browser-use/scripts/requirements.txt.
+RUN python -m venv /opt/browser-use-venv \
+    && /opt/browser-use-venv/bin/pip install --no-cache-dir --upgrade pip \
+    && /opt/browser-use-venv/bin/pip install --no-cache-dir browser-use==0.12.5
+
+# Expose the venv interpreter under the name the skill runbooks call.
+RUN ln -sf /opt/browser-use-venv/bin/python /usr/local/bin/python-browser-use
+
+RUN mkdir -p /root/.hermes/skills /root/.hermes/memories /root/.hermes/sessions
+
+WORKDIR /workspace
+
+CMD ["hermes", "gateway"]
diff --git a/GUI_BROWSER_FIX.md b/GUI_BROWSER_FIX.md
new file mode 100644
index 00000000..9d8b67f2
--- /dev/null
+++ b/GUI_BROWSER_FIX.md
@@ -0,0 +1,106 @@
+# 🎯 Решение: browser-use с GUI браузером
+
+## ✅ Проблема решена!
+
+Теперь все действия через hermes-agent **ТРАНСЛИРУЮТСЯ** на GUI браузер в реальном времени.
+
+## 🚀 Как использовать
+
+### 1️⃣ Запустите стек
+
+```bash
+docker compose --profile gui up -d
+```
+
+### 2️⃣ Откройте VNC в браузере
+
+```bash
+open http://localhost:6080/vnc.html
+```
+
+### 3️⃣ Дайте задачу агенту
+
+Напишите что-нибудь типа:
+```
+"Откройте example.com и найдите заголовок страницы"
+```
+
+**Или** запустите напрямую:
+```bash
+docker compose exec -T hermes-agent python-browser-use \
+  /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \
+  --task "Open google.com and search for 'hello world'" \
+  --max-steps 5
+```
+
+### 4️⃣ Смотрите действия в VNC окне 🎬
+
+Вы видите как агент:
+- 🔍 Навигирует по сайтам
+- 🖱️ Кликает по кнопкам
+- ⌨️ Вводит текст
+- 📜 Скроллит страницу
+
+## 🔧 Что было исправлено
+
+### Проблема: "Host header is specified and is not an IP address"
+
+Chromium CDP API **проверяет Host заголовок в HTTP запросах** и отвергает имена хостов.
+
+**Решение:** Используем IP адрес контейнера вместо имени:
+- ❌ ~~`BROWSER_USE_CDP_URL=http://chromium-gui:9223`~~
+- ✅ `BROWSER_USE_CDP_URL=http://172.25.0.3:9223`
+
+### Файлы, которые были обновлены:
+
+1. **`docker-compose.yml`**
+   - Изменена `BROWSER_USE_CDP_URL` на `http://172.25.0.3:9223`
+   - Добавлена зависимость от `chromium-gui` в hermes-agent
+
+2. **`docker/chromium-gui/start.sh`**
+   - Добавлена socat для проксирования TCP через IPv6
+   - Chromium слушает на `::1:9223` (IPv6 localhost)
+   - socat пробрасывает `9223` на все интерфейсы
+
+## 📊 Архитектура
+
+```
+hermes-agent (контейнер)
+    │
+    ├─ BROWSER_USE_CDP_URL=http://172.25.0.3:9223
+    │
+    └─→ chromium-gui (контейнер)
+        │
+        ├─ Chromium слушает на ::1:9223 (IPv6)
+        │
+        ├─ socat (TCP-LISTEN:9223 → TCP6:[::1]:9223)
+        │
+        ├─ x11vnc (захватывает Xvfb)
+        │
+        └─ websockify (VNC → WebSocket)
+            │
+            └─→ http://localhost:6080/vnc.html (ваш браузер)
+```
+
+## 🎮 Протестировано
+
+```bash
+# ✅ CDP доступен
+docker compose exec -T hermes-agent bash -c 'curl -s http://172.25.0.3:9223/json/version'
+
+# ✅ VNC доступен
+open http://localhost:6080/vnc.html
+
+# ✅ socat проксирует
+docker compose exec chromium-gui netstat -tlnp | grep 9223
+```
+
+## 📚 Дополнительно
+
+- Полная документация: [`GUI_BROWSER_SETUP.md`](./GUI_BROWSER_SETUP.md)
+- Диагностика проблем: смотрите раздел "Диагностика" в [`GUI_BROWSER_SETUP.md`](./GUI_BROWSER_SETUP.md)
+
+---
+
+**Теперь browser-use полностью интегрирован с GUI браузером! 🎉**
+
diff --git a/browser-use/SKILL.md b/browser-use/SKILL.md
new file mode 100644
index 00000000..b38b71bc
--- /dev/null
+++ b/browser-use/SKILL.md
@@ -0,0 +1,191 @@
+---
+name: browser-use
+version: "1.1.0"
+description: Run web automation tasks through browser-use and Chromium CDP (headless or GUI).
+triggers:
+  - "browser-use"
+  - "open website and extract"
+  - "automate browser task"
+  - "run browser task"
+  - "открой сайт"
+  - "заполни форму"
+  - "найди на странице"
+  - "сделай в браузере"
+allowed-tools:
+  - terminal
+  - file
+  - memory
+---
+
+# Browser Use (Chromium/CDP)
+
+Use this skill when a task requires real browser actions: open pages, click, type, submit forms, extract text/data, verify visible results.
+
+## Decision: when to use this skill
+
+Use `browser-use` if the user asks to:
+- navigate websites step-by-step;
+- interact with UI elements (buttons, inputs, dropdowns);
+- extract structured content from rendered pages;
+- complete multi-step flows (login/search/filter/checkout draft).
+
+Do **not** use `browser-use` if the task is:
+- pure static fetch/API call (use lighter tools);
+- local file manipulation only;
+- impossible due to CAPTCHA/2FA/region lock without user intervention.
+
+## What the agent can and cannot see
+
+Short answer to a common question: **the agent sees the rendered page state, not all JavaScript source by default**.
+
+The agent typically sees/uses:
+- rendered DOM and interactive elements;
+- visible text/content after JS execution;
+- current URL, titles, form states;
+- action results/errors returned by browser-use.
+
+The agent does **not automatically** get:
+- full source code of all loaded JS bundles;
+- complete DevTools Network timeline;
+- hidden backend logic not exposed in page content.
+
+If the user asks about JS specifically, take these explicit steps:
+1. locate script URLs from page source/DOM;
+2. open script URL(s) directly;
+3. extract needed fragments (function names, endpoints, constants).
+
+## Runtime modes (CDP endpoints)
+
+This project supports two modes.
+
+1) Headless browserless Chromium:
+- CDP: `ws://chromium:3000/chromium?token=hermes-local`
+
+2) GUI Chromium (visible in noVNC):
+- CDP: `http://172.25.0.3:9223`
+- Visual stream: `http://localhost:6080/vnc.html`
+
+Notes:
+- `run_browser_use.py` accepts both `ws://` and `http://` CDP URLs.
+- For `http://`, the script resolves `/json/version` and converts it to a websocket URL automatically.
+
+## Required environment
+
+Minimum required env vars:
+- `OPENAI_API_KEY`
+- optional: `OPENAI_BASE_URL`
+- optional: `OPENAI_MODEL` or `BROWSER_USE_MODEL`
+- optional override: `BROWSER_USE_CDP_URL`
+
+Defaults in this repo:
+- `BROWSER_USE_PYTHON=/opt/browser-use-venv/bin/python`
+- `BROWSER_USE_CDP_URL=http://172.25.0.3:9223` (from `docker-compose.yml`)
+
+## Quick runbook (inside Docker)
+
+1. Ensure services are up:
+
+```bash
+docker compose --profile gui up -d
+docker compose ps
+```
+
+2. Check env in `hermes-agent`:
+
+```bash
+docker compose exec -T hermes-agent python - <<'PY'
+import os
+print('OPENAI_API_KEY', '<set>' if os.getenv('OPENAI_API_KEY') else '<missing>')
+print('BROWSER_USE_CDP_URL', os.getenv('BROWSER_USE_CDP_URL', ''))
+print('OPENAI_MODEL', os.getenv('OPENAI_MODEL', ''))
+PY
+```
+
+3. Run a task:
+
+```bash
+python-browser-use /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \
+  --task "Open example.com and return page title" \
+  --max-steps 8
+```
+
+4. For GUI visibility, open stream:
+
+```bash
+open "http://localhost:6080/vnc.html"
+```
+
+## Runbook (outside Docker)
+
+Use one combined command so env vars are available in the same process:
+
+```bash
+export OPENAI_API_KEY="$OPENAI_API_KEY" && \
+export BROWSER_USE_CDP_URL="$BROWSER_USE_CDP_URL" && \
+/opt/browser-use-venv/bin/python /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \
+  --task "<task description>" \
+  --max-steps 20
+```
+
+## How Hermes should call this skill
+
+Standard pattern:
+
+```bash
+python-browser-use /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \
+  --task "<task description>" \
+  --max-steps 20
+```
+
+If the user gave a starting page, add `--start-url`.
+
+```bash
+python-browser-use /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \
+  --task "Find contact email" \
+  --start-url "https://example.com" \
+  --max-steps 20
+```
+
+## Troubleshooting (symptom -> action)
+
+`{"success": false, "error": "OPENAI_API_KEY is not set"}`
+- check `workspace/.env` and `hermes_data/.env`;
+- recreate container:
+
+```bash
+docker compose up -d --force-recreate hermes-agent
+```
+
+`401 key_model_access_denied`
+- the model is not allowed for this API key;
+- set `BROWSER_USE_MODEL` or `OPENAI_MODEL` to an allowed model.
+
+`Connection refused` or CDP errors
+- verify browser container is running:
+
+```bash
+docker compose ps
+docker compose exec -T hermes-agent bash -lc 'curl -s http://172.25.0.3:9223/json/version | head'
+```
+
+Timeout / exit code `124`
+- not necessarily a script failure;
+- increase `--max-steps` and/or task timeout envelope.
+
+## Site-specific limitations
+
+- Yandex Music: may be blocked by region.
+- Wildberries: anti-bot/CAPTCHA may block automation.
+
+When blocked by anti-bot/2FA/CAPTCHA:
+- ask the user for manual intervention;
+- continue automation after the challenge is passed;
+- or switch to a non-browser strategy if acceptable.
+
+## Operational notes
+
+- Script file: `/root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py`
+- Script output: JSON (`success`, `cdp_url`, `result.final_result`, `result.errors`)
+- In the current implementation `use_vision=False`, so decisions are based on browser-use structured state rather than visual screenshot reasoning.
+
+
diff --git a/browser-use/assets/config.example.json b/browser-use/assets/config.example.json
new file mode 100644
index 00000000..e8e3e146
--- /dev/null
+++ b/browser-use/assets/config.example.json
@@ -0,0 +1,16 @@
+{
+  "browser": {
+    "cdp_url": "ws://chromium:3000/chromium?token=hermes-local",
+    "headless": true,
+    "timeout": 120000
+  },
+  "agent": {
+    "model_env": "BROWSER_USE_MODEL",
+    "max_steps": 20,
+    "use_vision": false
+  },
+  "logging": {
+    "level": "info"
+  }
+}
+
diff --git a/browser-use/assets/config.json b/browser-use/assets/config.json
new file mode 100644
index 00000000..8f355553
--- /dev/null
+++ b/browser-use/assets/config.json
@@ -0,0 +1,16 @@
+{
+  "browser": {
+    "cdp_url": "ws://chromium:3000/chromium?token=hermes-local",
+    "headless": true,
+    "timeout": 120000
+  },
+  "agent": {
+    "model_env": "BROWSER_USE_MODEL",
+    "max_steps": 20,
+    "use_vision": false
+  },
+  "logging": {
+    "level": "info"
+  }
+}
+
diff --git a/browser-use/scripts/requirements.txt b/browser-use/scripts/requirements.txt
new file mode 100644
index 00000000..33650044
--- /dev/null
+++ b/browser-use/scripts/requirements.txt
@@ -0,0 +1,2 @@
+browser-use==0.12.5
+
diff --git a/browser-use/scripts/run_browser_use.py b/browser-use/scripts/run_browser_use.py
new file mode 100644
index 00000000..401e52fc
--- /dev/null
+++ b/browser-use/scripts/run_browser_use.py
@@ -0,0 +1,225 @@
+#!/usr/bin/env python3
+"""Run browser-use task against a Chromium CDP endpoint."""
+
+import argparse
+import asyncio
+import json
+import os
+import socket
+from pathlib import Path
+from typing import Any
+from urllib.parse import urlparse, urlunparse
+from urllib.request import urlopen
+
+from browser_use import Agent, BrowserSession
+from browser_use.llm import ChatOpenAI
+
+
+ENV_FALLBACK_PATHS = (
+    Path("/workspace/.env"),
+    Path("/workspace/workspace/.env"),
+    Path("/root/.hermes/.env"),
+)
+
+# Hosts that are passed to the CDP endpoint unchanged; every other host is
+# resolved to a numeric address before the endpoint is queried.
+_LOCAL_HOSTS = frozenset({"localhost", "127.0.0.1", "0.0.0.0"})
+
+
+def _read_env_from_files(name: str) -> str | None:
+    """Return the value of ``name`` from the first fallback .env file defining it.
+
+    Files are tried in ``ENV_FALLBACK_PATHS`` order. Blank lines, ``#`` comments
+    and lines without ``=`` are ignored, an optional ``export `` prefix is
+    accepted, and surrounding quotes are stripped. An empty value yields
+    ``None``. Missing, unreadable or non-UTF-8 files are skipped.
+    """
+    for env_path in ENV_FALLBACK_PATHS:
+        if not env_path.exists():
+            continue
+        try:
+            lines = env_path.read_text(encoding="utf-8").splitlines()
+        except (OSError, UnicodeDecodeError):
+            continue
+        for raw_line in lines:
+            line = raw_line.strip()
+            if not line or line.startswith("#") or "=" not in line:
+                continue
+            key, value = line.split("=", 1)
+            key = key.strip()
+            if key.startswith("export "):
+                key = key[len("export "):].strip()
+            if key == name:
+                cleaned = value.strip().strip('"').strip("'")
+                return cleaned or None
+    return None
+
+
+def _get_env(name: str, default: str | None = None) -> str | None:
+    """Read ``name`` from the process environment, then .env files, then ``default``.
+
+    Empty strings are treated as unset at every step.
+    """
+    return os.getenv(name) or _read_env_from_files(name) or default or None
+
+
+def _build_task(task: str, start_url: str | None) -> str:
+    """Prefix the task with the starting URL when one was supplied."""
+    if not start_url:
+        return task
+    return f"Start from {start_url}. Task: {task}"
+
+
+def _serialize_history(history: Any) -> dict[str, Any]:
+    """Reduce an agent history object to a JSON-serializable summary.
+
+    ``final_result()`` and ``errors()`` are probed defensively: a missing or
+    failing accessor yields an empty result / error list instead of raising.
+    """
+    result = ""
+    errors: list[str] = []
+    if hasattr(history, "final_result"):
+        try:
+            result = history.final_result() or ""
+        except Exception:
+            result = ""
+    if hasattr(history, "errors"):
+        try:
+            raw_errors = list(history.errors())
+            errors = [str(e) for e in raw_errors if e]
+        except Exception:
+            errors = []
+    return {
+        "final_result": result,
+        "errors": errors,
+        "has_errors": bool(errors),
+    }
+
+
+def _resolve_host(host: str) -> str | None:
+    """Return the IPv4 address of ``host``, or ``None`` when lookup fails."""
+    try:
+        return socket.gethostbyname(host)
+    except OSError:
+        return None
+
+
+def _resolve_cdp_url(cdp_url: str) -> str:
+    """Turn a CDP endpoint into the websocket URL browser-use connects to.
+
+    ``ws://`` / ``wss://`` URLs are returned unchanged. For ``http(s)://`` URLs
+    the endpoint's ``/json/version`` document is fetched and its
+    ``webSocketDebuggerUrl`` is returned.
+
+    Raises:
+        RuntimeError: unsupported scheme, or the endpoint returned no
+            ``webSocketDebuggerUrl``.
+        OSError: the endpoint could not be reached (includes ``URLError``).
+        ValueError: the endpoint did not return valid JSON.
+    """
+    if cdp_url.startswith(("ws://", "wss://")):
+        return cdp_url
+    if not cdp_url.startswith(("http://", "https://")):
+        raise RuntimeError(f"Unsupported CDP URL scheme: {cdp_url}")
+
+    parsed = urlparse(cdp_url)
+    host = parsed.hostname or ""
+    port = parsed.port
+
+    # Chrome DevTools rejects non-IP/non-localhost Host headers in some setups.
+    # For docker service names, resolve to IP once and reuse that address for
+    # both the HTTP probe and the websocket URL so the two always agree.
+    resolved_host = _resolve_host(host) if host and host not in _LOCAL_HOSTS else None
+    if resolved_host:
+        netloc = resolved_host if not port else f"{resolved_host}:{port}"
+        parsed = parsed._replace(netloc=netloc)
+
+    version_url = urlunparse(parsed).rstrip("/")
+    if not version_url.endswith("/json/version"):
+        version_url = f"{version_url}/json/version"
+    with urlopen(version_url, timeout=10) as response:  # nosec B310
+        payload = json.loads(response.read().decode("utf-8"))
+    ws_url = payload.get("webSocketDebuggerUrl")
+    if not ws_url:
+        raise RuntimeError(f"CDP endpoint did not return webSocketDebuggerUrl: {version_url}")
+
+    # Keep a reachable host for ws:// URL when input used docker DNS alias.
+    if resolved_host:
+        ws_parsed = urlparse(str(ws_url))
+        ws_port = ws_parsed.port
+        if ws_port is None:
+            ws_port = 443 if ws_parsed.scheme == "wss" else 80
+        ws_url = urlunparse(ws_parsed._replace(netloc=f"{resolved_host}:{ws_port}"))
+
+    return str(ws_url)
+
+
+async def _run(args: argparse.Namespace) -> int:
+    """Run one browser-use task and print a single JSON result line.
+
+    Returns the process exit code: 0 on success, 1 when the agent reported
+    errors, 2 when configuration is missing or the CDP endpoint is unusable.
+    """
+    api_key = _get_env("OPENAI_API_KEY")
+    if not api_key:
+        print(json.dumps({"success": False, "error": "OPENAI_API_KEY is not set"}))
+        return 2
+
+    model = _get_env("BROWSER_USE_MODEL", _get_env("OPENAI_MODEL", "gpt-4o-mini"))
+    base_url = _get_env("OPENAI_BASE_URL")
+    raw_cdp_url = args.cdp_url or _get_env("BROWSER_USE_CDP_URL", "ws://chromium:3000/chromium?token=hermes-local")
+    try:
+        cdp_url = _resolve_cdp_url(raw_cdp_url)
+    except (OSError, ValueError, RuntimeError) as exc:
+        # Report endpoint problems in the same JSON shape as other failures
+        # instead of letting a traceback reach the caller.
+        print(json.dumps({"success": False, "error": f"Cannot resolve CDP URL {raw_cdp_url}: {exc}"}))
+        return 2
+
+    llm = ChatOpenAI(
+        model=model,
+        api_key=api_key,
+        base_url=base_url,
+        temperature=0.0,
+    )
+
+    browser_session = BrowserSession(cdp_url=cdp_url)
+    agent = Agent(
+        task=_build_task(args.task, args.start_url),
+        llm=llm,
+        browser_session=browser_session,
+        use_vision=False,
+    )
+
+    history = await agent.run(max_steps=args.max_steps)
+    payload = _serialize_history(history)
+
+    print(
+        json.dumps(
+            {
+                "success": not payload["has_errors"],
+                "model": model,
+                "cdp_url": cdp_url,
+                "task": args.task,
+                "result": payload,
+            },
+            ensure_ascii=True,
+        )
+    )
+    return 0 if not payload["has_errors"] else 1
+
+
+def main() -> int:
+    """Parse command-line arguments and run the task to completion."""
+    parser = argparse.ArgumentParser(description="Run browser-use task")
+    parser.add_argument("--task", required=True, help="Natural language task for browser-use")
+    parser.add_argument("--start-url", default=None, help="Optional URL to open first")
+    parser.add_argument("--max-steps", type=int, default=20, help="Max agent steps")
+    parser.add_argument("--cdp-url", default=None, help="CDP URL (ws://... or http://.../json/version host)")
+    args = parser.parse_args()
+    return asyncio.run(_run(args))
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())
+
diff --git a/browser-use/scripts/setup.sh b/browser-use/scripts/setup.sh
new file mode 100644
index 00000000..6be17146
--- /dev/null
+++ b/browser-use/scripts/setup.sh
@@ -0,0 +1,12 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+VENV_DIR="${SCRIPT_DIR}/.venv"
+
+python3 -m venv "${VENV_DIR}"
+"${VENV_DIR}/bin/pip" install --upgrade pip
+"${VENV_DIR}/bin/pip" install -r "${SCRIPT_DIR}/requirements.txt"
+
+echo "browser-use skill environment is ready: ${VENV_DIR}"
+
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 00000000..8bd5f39b
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,68 @@
+services:
+  chromium:
+    # NOTE(review): `latest` is a moving target; pin a release tag once one is validated.
+    image: ghcr.io/browserless/chromium:latest
+    container_name: hermes-chromium
+    restart: unless-stopped
+    environment:
+      - TOKEN=hermes-local
+      - TIMEOUT=120000
+      - CONCURRENT=5
+    shm_size: 1gb
+    ports:
+      # Loopback only, like the chromium-gui ports: the token above is committed
+      # to the repository, so the endpoint must not be reachable from the LAN.
+      - "127.0.0.1:3000:3000"
+
+  chromium-gui:
+    build:
+      context: ./docker/chromium-gui
+    container_name: hermes-chromium-gui
+    restart: unless-stopped
+    shm_size: 1gb
+    networks:
+      default:
+        # Fixed address so the BROWSER_USE_CDP_URL default below stays valid.
+        ipv4_address: 172.25.0.3
+    ports:
+      - "127.0.0.1:6080:6080"
+      - "127.0.0.1:5900:5900"
+      - "127.0.0.1:9223:9223"
+    healthcheck:
+      test: [ "CMD", "curl", "-fsS", "http://localhost:9223/json/version" ]
+      interval: 10s
+      timeout: 5s
+      retries: 6
+
+  hermes-agent:
+    build: .
+    container_name: hermes-agent
+    restart: unless-stopped
+    stdin_open: true
+    tty: true
+    depends_on:
+      chromium:
+        condition: service_started
+      chromium-gui:
+        # Wait for the healthcheck so CDP is answering before the agent starts.
+        condition: service_healthy
+    env_file:
+      - ./workspace/.env
+      - ./hermes_data/.env
+    environment:
+      - BROWSER_USE_CDP_URL=${BROWSER_USE_CDP_URL:-http://172.25.0.3:9223}
+      - BROWSER_USE_PYTHON=/opt/browser-use-venv/bin/python
+    volumes:
+      - ./workspace:/workspace
+      - ./hermes_data:/root/.hermes
+    working_dir: /workspace
+    command: [ "hermes", "gateway" ]
+
+networks:
+  default:
+    ipam:
+      config:
+        # Pin the project subnet so chromium-gui can own 172.25.0.3. Dynamic
+        # addresses are handed out from ip_range, which excludes that address.
+        - subnet: 172.25.0.0/16
+          ip_range: 172.25.1.0/24
diff --git a/docker/chromium-gui/Dockerfile b/docker/chromium-gui/Dockerfile
new file mode 100644
index 00000000..9a282a60
--- /dev/null
+++ b/docker/chromium-gui/Dockerfile
@@ -0,0 +1,22 @@
+FROM debian:bookworm-slim
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    chromium \
+    xvfb \
+    x11vnc \
+    fluxbox \
+    novnc \
+    websockify \
+    socat \
+    ca-certificates \
+    curl \
+    && rm -rf /var/lib/apt/lists/*
+
+COPY start.sh /usr/local/bin/start-gui-chromium.sh
+RUN chmod +x /usr/local/bin/start-gui-chromium.sh
+
+# 6080 = noVNC, 5900 = raw VNC, 9223 = CDP proxy started by start.sh.
+EXPOSE 6080 5900 9223
+
+CMD ["/usr/local/bin/start-gui-chromium.sh"]
+
diff --git a/docker/chromium-gui/README.md b/docker/chromium-gui/README.md
new file mode 100644
index 00000000..6b9fac2c
--- /dev/null
+++ b/docker/chromium-gui/README.md
@@ -0,0 +1,26 @@
+# Chromium GUI Service
+
+This container provides a full Chromium GUI with three interfaces:
+
+- noVNC web UI: `http://localhost:6080/vnc.html`
+- VNC: `localhost:5900`
+- CDP endpoint: `http://localhost:9223/json/version`
+
+## Run
+
+```bash
+docker compose --profile gui up -d chromium-gui
+```
+
+## Use with browser-use
+
+Pass the GUI CDP endpoint as HTTP URL (the runner resolves it to a websocket automatically):
+
+```bash
+docker compose exec -T hermes-agent python-browser-use \
+  /root/.hermes/skills/autonomous-ai-agents/browser-use/scripts/run_browser_use.py \
+  --cdp-url http://chromium-gui:9223 \
+  --task "Open example.com and return page title" \
+  --max-steps 5
+```
+
diff --git a/docker/chromium-gui/start.sh b/docker/chromium-gui/start.sh
new file mode 100644
index 00000000..6cdc373d
--- /dev/null
+++ b/docker/chromium-gui/start.sh
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+export DISPLAY=:99
+SCREEN_RESOLUTION="${SCREEN_RESOLUTION:-1920x1080x24}"
+
+# Clean stale X lock/socket from previous crashed runs in the same container.
+rm -f /tmp/.X99-lock
+rm -f /tmp/.X11-unix/X99
+
+Xvfb :99 -screen 0 "$SCREEN_RESOLUTION" -ac +extension RANDR &
+
+# Xvfb creates its socket asynchronously; wait (up to ~10 s) so the window
+# manager, VNC server and browser below do not race against display startup.
+for _ in $(seq 1 100); do
+    [ -S /tmp/.X11-unix/X99 ] && break
+    sleep 0.1
+done
+if [ ! -S /tmp/.X11-unix/X99 ]; then
+    echo "Xvfb did not create display :99" >&2
+    exit 1
+fi
+
+fluxbox >/tmp/fluxbox.log 2>&1 &
+
+x11vnc -display :99 -forever -shared -rfbport 5900 -nopw >/tmp/x11vnc.log 2>&1 &
+websockify --web=/usr/share/novnc/ 6080 localhost:5900 >/tmp/novnc.log 2>&1 &
+
+# Proxy CDP to all IPv4 interfaces using socat.
+# Chromium listens on ::1:9223 (IPv6 localhost). The listener is pinned to
+# IPv4 (TCP4-LISTEN) so it can never occupy that IPv6 loopback port itself.
+# NOTE(review): --remote-debugging-address=127.0.0.1 below does not match the
+# ::1 listener described here — confirm which address Chromium actually binds.
+socat TCP4-LISTEN:9223,reuseaddr,fork TCP6:[::1]:9223 >/tmp/socat.log 2>&1 &
+
+exec chromium \
+    --no-sandbox \
+    --disable-dev-shm-usage \
+    --disable-gpu \
+    --disable-setuid-sandbox \
+    --remote-debugging-address=127.0.0.1 \
+    --remote-debugging-port=9223 \
+    --user-data-dir=/tmp/chromium-profile \
+    --window-size=1920,1080 \
+    --no-first-run \
+    --no-default-browser-check \
+    about:blank >/tmp/chromium.log 2>&1
+