From f1f32d8366620d2cb0b2423b37df637498730448 Mon Sep 17 00:00:00 2001 From: VladislavIlin7 Date: Tue, 21 Apr 2026 23:32:09 +0300 Subject: [PATCH 1/2] add new tool: to_captcha --- api/clients/browser_rpc_client.py | 34 ++- api/clients/browser_rpc_contracts.py | 11 +- api/contracts/task_schemas.py | 63 ++-- api/core/settings.py | 1 + api/domain/task_status.py | 4 +- api/main.py | 1 + api/repositories/task_store.py | 77 ++++- api/routes/tasks.py | 59 +++- api/services/task_service.py | 140 ++++++++- browser_env/browser_use_runner.py | 412 ++++++++++++++++++++++++-- hermes_code/cli.py | 130 +++++++- hermes_code/model_tools.py | 3 +- hermes_code/run_agent.py | 80 +++++ hermes_code/tools/browser_use_tool.py | 123 ++++++-- 14 files changed, 1008 insertions(+), 130 deletions(-) diff --git a/api/clients/browser_rpc_client.py b/api/clients/browser_rpc_client.py index 1fcbe6f4..80d4ff9a 100644 --- a/api/clients/browser_rpc_client.py +++ b/api/clients/browser_rpc_client.py @@ -7,19 +7,21 @@ from api.clients.browser_rpc_contracts import BrowserRpcError class BrowserRpcClient: def __init__(self, rpc_url: str, session: aiohttp.ClientSession) -> None: - self._rpc_url = rpc_url + self._run_url = rpc_url.rstrip("/") + if self._run_url.endswith("/run"): + self._base_url = self._run_url[: -len("/run")] + else: + self._base_url = self._run_url + self._run_url = f"{self._base_url}/run" self._session = session - async def run(self, task: str, timeout_sec: float) -> dict[str, Any]: - payload = {"task": task} - timeout = aiohttp.ClientTimeout(total=timeout_sec) - + async def _post_json(self, url: str, payload: dict[str, Any], timeout_sec: float | None = None) -> dict[str, Any]: + timeout = aiohttp.ClientTimeout(total=timeout_sec) if timeout_sec is not None else None try: - async with self._session.post(self._rpc_url, json=payload, timeout=timeout) as response: + async with self._session.post(url, json=payload, timeout=timeout) as response: if response.status >= 400: body = await response.text() raise BrowserRpcError(f"RPC HTTP: {response.status}: {body}") - try: data = await response.json(content_type=None) except aiohttp.ContentTypeError as exc: @@ -29,10 +31,22 @@ class BrowserRpcClient: if not isinstance(data, dict): raise BrowserRpcError("RPC returned invalid payload type") - return data + async def run(self, task_id: str, task: str, timeout_sec: float) -> dict[str, Any]: + return await self._post_json(self._run_url, {"task_id": task_id, "task": task}, timeout_sec=timeout_sec) -async def run_browser_task(rpc_url: str, task: str, timeout_sec: float) -> dict[str, Any]: + async def verify_captcha(self, task_id: str) -> dict[str, Any]: + return await self._post_json(f"{self._base_url}/verify", {"task_id": task_id}, timeout_sec=15) + + async def resume(self, task_id: str, timeout_sec: float) -> dict[str, Any]: + return await self._post_json(f"{self._base_url}/resume", {"task_id": task_id}, timeout_sec=timeout_sec) + + async def abort(self, task_id: str, reason: str | None = None) -> dict[str, Any]: + return await self._post_json(f"{self._base_url}/abort", {"task_id": task_id, "reason": reason}, timeout_sec=15) + + +async def run_browser_task(rpc_url: str, task_id: str, task: str, timeout_sec: float) -> dict[str, Any]: async with aiohttp.ClientSession() as session: - return await BrowserRpcClient(rpc_url, session=session).run(task=task, timeout_sec=timeout_sec) + client = BrowserRpcClient(rpc_url, session=session) + return await client.run(task_id=task_id, task=task, timeout_sec=timeout_sec) diff --git a/api/clients/browser_rpc_contracts.py b/api/clients/browser_rpc_contracts.py index 77ff31fa..2da71855 100644 --- a/api/clients/browser_rpc_contracts.py +++ b/api/clients/browser_rpc_contracts.py @@ -1,8 +1,15 @@ from typing import Any, Protocol -class BrowserRpcError(RuntimeError): ... +class BrowserRpcError(RuntimeError): + pass class BrowserRpcRunner(Protocol): - async def run(self, task: str, timeout_sec: float) -> dict[str, Any]: ... + async def run(self, task_id: str, task: str, timeout_sec: float) -> dict[str, Any]: ... + + async def verify_captcha(self, task_id: str) -> dict[str, Any]: ... + + async def resume(self, task_id: str, timeout_sec: float) -> dict[str, Any]: ... + + async def abort(self, task_id: str, reason: str | None = None) -> dict[str, Any]: ... diff --git a/api/contracts/task_schemas.py b/api/contracts/task_schemas.py index bcad3cbe..2aea67b4 100644 --- a/api/contracts/task_schemas.py +++ b/api/contracts/task_schemas.py @@ -1,4 +1,4 @@ -from typing import Any +from typing import Any, Literal from pydantic import BaseModel, Field @@ -6,38 +6,67 @@ from api.domain.task_status import TaskStatus class BrowserTaskRequest(BaseModel): - """Запрос на запуск задачи в browser-use агенте.""" + """Request to start a browser task.""" - task: str = Field(..., description="Текстовая задача для browser-use агента") - timeout: int = Field(300, description="Максимальное время выполнения задачи в секундах") - metadata: dict[str, Any] | None = Field(default=None, description="Дополнительные метаданные клиента") + task: str = Field(..., description="Text task for the browser-use agent") + timeout: int = Field(300, description="Maximum task execution time in seconds") + metadata: dict[str, Any] | None = Field(default=None, description="Optional client metadata") + + +class BrowserCaptchaVerification(BaseModel): + mode: Literal["dom_url_title"] = "dom_url_title" + selectors_absent: list[str] = Field(default_factory=list) + challenge_signals_absent: list[str] = Field(default_factory=list) + max_wait_seconds: int = Field(..., description="How long the task may stay paused waiting for the user") + + +class BrowserHumanInterventionPayload(BaseModel): + status: Literal["awaiting_user_captcha"] = "awaiting_user_captcha" + task_id: str + session_id: str + resume_token: str + browser_view_url: str | None = None + captcha_type: Literal["cloudflare", "recaptcha", "hcaptcha", "unknown"] = "unknown" + instructions: str + detected_at: float + verification: BrowserCaptchaVerification + + +class BrowserCaptchaReadyRequest(BaseModel): + user_response: str | None = Field(default=None, description="Free-form confirmation from the user") + + +class BrowserCaptchaAbortRequest(BaseModel): + reason: str | None = Field(default=None, description="Optional reason for aborting the CAPTCHA flow") class BrowserTaskAcceptedResponse(BaseModel): - """Ответ о том, что задача принята в обработку.""" + """Response indicating that a task was accepted for processing.""" task_id: str status: TaskStatus class BrowserTaskStatusResponse(BaseModel): - """Текущий статус задачи и временные отметки ее выполнения.""" + """Current task status and timestamps.""" task_id: str status: TaskStatus - create_at: float = Field(..., description="Время создания задачи в Unix timestamp") - started_at: float | None = Field(default=None, description="Время начала выполнения в Unix timestamp") - finished_at: float | None = Field(default=None, description="Время завершения выполнения в Unix timestamp") - error: str | None = Field(default=None, description="Текст ошибки, если задача завершилась с ошибкой") + create_at: float = Field(..., description="Task creation time as a Unix timestamp") + started_at: float | None = Field(default=None, description="Task start time as a Unix timestamp") + finished_at: float | None = Field(default=None, description="Task completion time as a Unix timestamp") + error: str | None = Field(default=None, description="Task error when applicable") + human_intervention: BrowserHumanInterventionPayload | None = None class BrowserTaskResultResponse(BaseModel): - """Финальный результат выполнения задачи в browser-use.""" + """Terminal result or meaningful paused state for a browser task.""" task_id: str status: TaskStatus - success: bool = Field(..., description="Успешно ли выполнена задача") - execution_time: float = Field(..., description="Фактическое время выполнения в секундах") - result: str | None = Field(default=None, description="Итоговый текстовый результат") - error: str | None = Field(default=None, description="Текст ошибки, если выполнение не удалось") - raw_response: dict[str, Any] | None = Field(default=None, description="Сырой ответ от browser-use RPC") + success: bool = Field(..., description="Whether the task completed successfully") + execution_time: float = Field(..., description="Observed execution time in seconds") + result: str | None = Field(default=None, description="Final textual result when available") + error: str | None = Field(default=None, description="Error text when execution failed") + raw_response: dict[str, Any] | None = Field(default=None, description="Raw payload returned by the browser runtime") + human_intervention: BrowserHumanInterventionPayload | None = None diff --git a/api/core/settings.py b/api/core/settings.py index c0839f7c..551a378a 100644 --- a/api/core/settings.py +++ b/api/core/settings.py @@ -9,6 +9,7 @@ class Settings: browser_rpc_url: str = os.getenv("BROWSER_USE_RPC_URL", "http://browser:8787/run") browser_rpc_timeout: float = float(os.getenv("BROWSER_USE_RPC_TIMEOUT", "900")) + captcha_wait_timeout: int = int(os.getenv("BROWSER_CAPTCHA_MAX_WAIT_SECONDS", "900")) max_concurrency: int = int(os.getenv("BROWSER_API_MAX_CONCURRENCY", "2")) diff --git a/api/domain/task_status.py b/api/domain/task_status.py index 20ea10f0..381a9ab9 100644 --- a/api/domain/task_status.py +++ b/api/domain/task_status.py @@ -2,8 +2,10 @@ from enum import Enum class TaskStatus(str, Enum): - """Состояние задачи браузерного агента.""" + """Browser task lifecycle states.""" + queued = "queued" running = "running" + awaiting_user_captcha = "awaiting_user_captcha" succeeded = "succeeded" failed = "failed" diff --git a/api/main.py b/api/main.py index c1a0e537..bdebd2a2 100644 --- a/api/main.py +++ b/api/main.py @@ -18,6 +18,7 @@ async def lifespan(app: FastAPI): rpc_client=BrowserRpcClient(settings.browser_rpc_url, session=session), max_concurrency=settings.max_concurrency, rpc_timeout_cap=settings.browser_rpc_timeout, + captcha_wait_timeout=settings.captcha_wait_timeout, ) app.state.task_service = task_service try: diff --git a/api/repositories/task_store.py b/api/repositories/task_store.py index bc66cd18..63b5d7d7 100644 --- a/api/repositories/task_store.py +++ b/api/repositories/task_store.py @@ -20,13 +20,17 @@ class TaskRecord: result: str | None = None error: str | None = None raw_response: dict[str, Any] | None = None + human_intervention: dict[str, Any] | None = None + session_id: str | None = None + resume_token: str | None = None + awaiting_deadline: float | None = None @property def execution_time(self) -> float: if self.started_at is None: return 0 end = self.finished_at if self.finished_at is not None else time.time() - return max(0, end - self.started_at) + return max(0.0, end - self.started_at) class TaskStore: @@ -51,16 +55,42 @@ class TaskStore: if rec is None: return None rec.status = TaskStatus.running - rec.started_at = time.time() + rec.finished_at = None + rec.error = None + rec.human_intervention = None + rec.awaiting_deadline = None + if rec.started_at is None: + rec.started_at = time.time() + return rec + + async def set_awaiting_captcha( + self, + task_id: str, + raw_response: dict[str, Any], + max_wait_seconds: int, + ) -> TaskRecord | None: + async with self._lock: + rec = self._tasks.get(task_id) + if rec is None: + return None + payload = raw_response.get("human_intervention") or {} + rec.status = TaskStatus.awaiting_user_captcha + rec.raw_response = raw_response + rec.human_intervention = payload + rec.session_id = payload.get("session_id") or rec.session_id + rec.resume_token = payload.get("resume_token") or rec.resume_token + rec.awaiting_deadline = time.time() + max(1, int(max_wait_seconds)) + rec.error = None + rec.finished_at = None return rec async def set_done( - self, - task_id: str, - success: bool, - raw_response: dict[str, Any] | None, - error: str | None, - result: str | None = None, + self, + task_id: str, + success: bool, + raw_response: dict[str, Any] | None, + error: str | None, + result: str | None = None, ) -> TaskRecord | None: async with self._lock: rec = self._tasks.get(task_id) @@ -69,8 +99,35 @@ class TaskStore: rec.finished_at = time.time() rec.raw_response = raw_response rec.error = error if error is not None else ( - raw_response.get("error") if isinstance(raw_response, dict) else None) + raw_response.get("error") if isinstance(raw_response, dict) else None + ) rec.result = result if result is not None else ( - raw_response.get("result") if isinstance(raw_response, dict) else None) + raw_response.get("result") if isinstance(raw_response, dict) else None + ) + rec.human_intervention = None + rec.awaiting_deadline = None rec.status = TaskStatus.succeeded if success else TaskStatus.failed return rec + + async def expire_if_needed(self, task_id: str) -> TaskRecord | None: + async with self._lock: + rec = self._tasks.get(task_id) + if rec is None: + return None + if rec.status != TaskStatus.awaiting_user_captcha: + return rec + if rec.awaiting_deadline is None or rec.awaiting_deadline > time.time(): + return rec + + rec.status = TaskStatus.failed + rec.finished_at = time.time() + rec.error = "CAPTCHA wait expired before the user completed verification." + rec.raw_response = { + "success": False, + "status": TaskStatus.failed.value, + "error": rec.error, + "error_code": "captcha_wait_expired", + } + rec.human_intervention = None + rec.awaiting_deadline = None + return rec diff --git a/api/routes/tasks.py b/api/routes/tasks.py index 94ed6238..88103771 100644 --- a/api/routes/tasks.py +++ b/api/routes/tasks.py @@ -2,6 +2,8 @@ from fastapi import APIRouter, Depends, HTTPException, Request from fastapi.responses import JSONResponse from api.contracts.task_schemas import ( + BrowserCaptchaAbortRequest, + BrowserCaptchaReadyRequest, BrowserTaskAcceptedResponse, BrowserTaskRequest, BrowserTaskResultResponse, @@ -20,8 +22,8 @@ def get_task_service(request: Request) -> TaskService: @router.post("/tasks", response_model=BrowserTaskAcceptedResponse, status_code=202) async def create_task( - payload: BrowserTaskRequest, - service: TaskService = Depends(get_task_service), + payload: BrowserTaskRequest, + service: TaskService = Depends(get_task_service), ) -> BrowserTaskAcceptedResponse: rec = await service.submit_task(task=payload.task.strip(), timeout=payload.timeout, metadata=payload.metadata) return BrowserTaskAcceptedResponse(task_id=rec.task_id, status=rec.status) @@ -37,8 +39,8 @@ async def get_task_status(task_id: str, service: TaskService = Depends(get_task_ @router.get("/tasks/{task_id}/result", response_model=BrowserTaskResultResponse) async def get_task_result( - task_id: str, - service: TaskService = Depends(get_task_service), + task_id: str, + service: TaskService = Depends(get_task_service), ) -> JSONResponse | BrowserTaskResultResponse: rec = await service.get_task(task_id) if rec is None: @@ -55,18 +57,35 @@ async def get_task_result( "result": None, "error": None, "raw_response": None, + "human_intervention": None, }, ) - return BrowserTaskResultResponse( - task_id=rec.task_id, - status=rec.status, - success=(rec.status == TaskStatus.succeeded), - execution_time=rec.execution_time, - result=rec.result, - error=rec.error, - raw_response=rec.raw_response, - ) + return _to_result_response(rec) + + +@router.post("/tasks/{task_id}/captcha/ready", response_model=BrowserTaskResultResponse) +async def captcha_ready( + task_id: str, + payload: BrowserCaptchaReadyRequest, + service: TaskService = Depends(get_task_service), +) -> BrowserTaskResultResponse: + rec = await service.resume_captcha(task_id, user_response=payload.user_response) + if rec is None: + raise HTTPException(status_code=404, detail="Task not found") + return _to_result_response(rec) + + +@router.post("/tasks/{task_id}/captcha/abort", response_model=BrowserTaskResultResponse) +async def captcha_abort( + task_id: str, + payload: BrowserCaptchaAbortRequest, + service: TaskService = Depends(get_task_service), +) -> BrowserTaskResultResponse: + rec = await service.abort_captcha(task_id, reason=payload.reason) + if rec is None: + raise HTTPException(status_code=404, detail="Task not found") + return _to_result_response(rec) def _to_status_response(rec: TaskRecord) -> BrowserTaskStatusResponse: @@ -77,4 +96,18 @@ def _to_status_response(rec: TaskRecord) -> BrowserTaskStatusResponse: started_at=rec.started_at, finished_at=rec.finished_at, error=rec.error, + human_intervention=rec.human_intervention, + ) + + +def _to_result_response(rec: TaskRecord) -> BrowserTaskResultResponse: + return BrowserTaskResultResponse( + task_id=rec.task_id, + status=rec.status, + success=(rec.status == TaskStatus.succeeded), + execution_time=rec.execution_time, + result=rec.result, + error=rec.error, + raw_response=rec.raw_response, + human_intervention=rec.human_intervention, ) diff --git a/api/services/task_service.py b/api/services/task_service.py index 97fc6b39..16a93006 100644 --- a/api/services/task_service.py +++ b/api/services/task_service.py @@ -1,21 +1,24 @@ import asyncio from api.clients.browser_rpc_contracts import BrowserRpcError, BrowserRpcRunner +from api.domain.task_status import TaskStatus from api.repositories.task_store import TaskRecord, TaskStore class TaskService: def __init__( - self, - store: TaskStore, - rpc_client: BrowserRpcRunner, - max_concurrency: int, - rpc_timeout_cap: float | None = None, + self, + store: TaskStore, + rpc_client: BrowserRpcRunner, + max_concurrency: int, + rpc_timeout_cap: float | None = None, + captcha_wait_timeout: int = 900, ) -> None: self._store = store self._rpc_client = rpc_client self._semaphore = asyncio.Semaphore(max_concurrency) self._rpc_timeout_cap = rpc_timeout_cap + self._captcha_wait_timeout = captcha_wait_timeout self._background_tasks: set[asyncio.Task[None]] = set() async def submit_task(self, task: str, timeout: int, metadata: dict | None) -> TaskRecord: @@ -26,7 +29,22 @@ class TaskService: return record async def get_task(self, task_id: str) -> TaskRecord | None: - return await self._store.get(task_id) + before = await self._store.get(task_id) + was_awaiting = bool(before is not None and before.status == TaskStatus.awaiting_user_captcha) + await self._store.expire_if_needed(task_id) + after = await self._store.get(task_id) + if ( + was_awaiting + and after is not None + and after.status == TaskStatus.failed + and after.error + and "expired" in after.error.lower() + ): + try: + await self._rpc_client.abort(task_id, reason=after.error) + except BrowserRpcError: + pass + return after async def close(self) -> None: if not self._background_tasks: @@ -37,6 +55,86 @@ class TaskService: await asyncio.gather(*self._background_tasks, return_exceptions=True) self._background_tasks.clear() + async def resume_captcha(self, task_id: str, user_response: str | None = None) -> TaskRecord | None: + rec = await self.get_task(task_id) + if rec is None: + return None + if rec.status == TaskStatus.failed: + return rec + if rec.status != TaskStatus.awaiting_user_captcha: + return rec + + verify_raw = await self._rpc_client.verify_captcha(task_id) + if not verify_raw.get("verified"): + await self._store.set_awaiting_captcha( + task_id=task_id, + raw_response={ + "success": False, + "status": TaskStatus.awaiting_user_captcha.value, + "error": "CAPTCHA is still present.", + "human_intervention": rec.human_intervention, + "verification": verify_raw, + "user_response": user_response, + }, + max_wait_seconds=(rec.human_intervention or {}).get("verification", {}).get( + "max_wait_seconds", self._captcha_wait_timeout + ), + ) + return await self._store.get(task_id) + + await self._store.set_running(task_id) + + try: + rpc_timeout = float(rec.timeout) + if self._rpc_timeout_cap is not None: + rpc_timeout = min(rpc_timeout, self._rpc_timeout_cap) + + raw = await asyncio.wait_for( + self._rpc_client.resume(task_id=task_id, timeout_sec=rpc_timeout), + timeout=float(rec.timeout) + 5, + ) + except asyncio.TimeoutError: + await self._store.set_done( + task_id=task_id, + success=False, + raw_response=None, + error="Timeout exceeded after CAPTCHA resume.", + ) + return await self._store.get(task_id) + except BrowserRpcError as exc: + await self._store.set_done( + task_id=task_id, + success=False, + raw_response=None, + error=str(exc), + ) + return await self._store.get(task_id) + + await self._apply_rpc_result(task_id, raw) + return await self._store.get(task_id) + + async def abort_captcha(self, task_id: str, reason: str | None = None) -> TaskRecord | None: + rec = await self.get_task(task_id) + if rec is None: + return None + if rec.status == TaskStatus.awaiting_user_captcha: + try: + await self._rpc_client.abort(task_id, reason=reason) + except BrowserRpcError: + pass + await self._store.set_done( + task_id=task_id, + success=False, + raw_response={ + "success": False, + "status": TaskStatus.failed.value, + "error_code": "captcha_aborted", + "reason": reason, + }, + error=reason or "User aborted CAPTCHA flow.", + ) + return await self._store.get(task_id) + async def _worker(self, task_id: str) -> None: rec = await self._store.set_running(task_id) if rec is None: @@ -49,17 +147,10 @@ class TaskService: rpc_timeout = min(rpc_timeout, self._rpc_timeout_cap) raw = await asyncio.wait_for( - self._rpc_client.run(task=rec.task, timeout_sec=rpc_timeout), + self._rpc_client.run(task_id=task_id, task=rec.task, timeout_sec=rpc_timeout), timeout=float(rec.timeout) + 5, ) - success = bool(raw.get("success")) - await self._store.set_done( - task_id=task_id, - success=success, - raw_response=raw, - error=None, - result=raw.get("result") if isinstance(raw, dict) else None, - ) + await self._apply_rpc_result(task_id, raw) except asyncio.TimeoutError: await self._store.set_done( task_id=task_id, @@ -81,3 +172,22 @@ class TaskService: raw_response=None, error=f"Internal error: {exc}", ) + + async def _apply_rpc_result(self, task_id: str, raw: dict | None) -> None: + raw = raw or {} + status = raw.get("status") + if status == TaskStatus.awaiting_user_captcha.value: + human = raw.get("human_intervention") or {} + verification = human.get("verification") or {} + max_wait_seconds = verification.get("max_wait_seconds", self._captcha_wait_timeout) + await self._store.set_awaiting_captcha(task_id, raw_response=raw, max_wait_seconds=max_wait_seconds) + return + + success = bool(raw.get("success")) + await self._store.set_done( + task_id=task_id, + success=success, + raw_response=raw, + error=None, + result=raw.get("result") if isinstance(raw, dict) else None, + ) diff --git a/browser_env/browser_use_runner.py b/browser_env/browser_use_runner.py index 08ed6b42..08e2f82b 100644 --- a/browser_env/browser_use_runner.py +++ b/browser_env/browser_use_runner.py @@ -1,12 +1,52 @@ import asyncio import json import os +import threading +import time +import uuid +from dataclasses import dataclass, field from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer +from typing import Any from urllib import error, request from browser_use import Agent, Browser, ChatOpenAI +CAPTCHA_WAIT_TIMEOUT = int(os.getenv("BROWSER_CAPTCHA_MAX_WAIT_SECONDS", "900")) +_RUNNER_TASKS: dict[str, "RunnerTask"] = {} +_RUNNER_LOCK = threading.Lock() + +_CF_STRONG = ( + "just a moment", + "attention required", + "checking your browser", + "cf-challenge", + "cdn-cgi/challenge-platform", + "__cf_chl", + "turnstile", +) +_RECAPTCHA_STRONG = ( + "g-recaptcha", + "recaptcha/api2", + "www.google.com/recaptcha", + "grecaptcha", +) +_HCAPTCHA_STRONG = ( + "hcaptcha", + "newassets.hcaptcha.com", + "js.hcaptcha.com/1/api.js", +) +_GENERIC_CAPTCHA_STRONG = ( + "captcha", + "are you human", + "verify you are human", + "human verification", + "bot detection", + "security check", + "press and hold", +) + + def _json_response(handler, status_code, payload): data = json.dumps(payload, ensure_ascii=False).encode("utf-8") handler.send_response(status_code) @@ -16,12 +56,198 @@ def _json_response(handler, status_code, payload): handler.wfile.write(data) -async def run_browser_task(task): +@dataclass +class RunnerTask: + task_id: str + task: str + browser_view_url: str + resume_token: str = field(default_factory=lambda: uuid.uuid4().hex) + created_at: float = field(default_factory=time.time) + status: str = "starting" + payload: dict[str, Any] | None = None + error: str | None = None + agent: Any = None + browser: Any = None + loop: asyncio.AbstractEventLoop | None = None + thread: threading.Thread | None = None + settled_event: threading.Event = field(default_factory=threading.Event) + finished: bool = False + awaiting: bool = False + aborted: bool = False + transition_count: int = 0 + lock: threading.Lock = field(default_factory=threading.Lock) + + def set_payload(self, status: str, payload: dict[str, Any]) -> None: + with self.lock: + self.status = status + self.payload = payload + self.transition_count += 1 + self.awaiting = status == "awaiting_user_captcha" + self.finished = status in {"succeeded", "failed"} + self.settled_event.set() + + +def _get_task(task_id: str) -> RunnerTask | None: + with _RUNNER_LOCK: + return _RUNNER_TASKS.get(task_id) + + +def _put_task(task: RunnerTask) -> RunnerTask: + with _RUNNER_LOCK: + _RUNNER_TASKS[task.task_id] = task + return task + + +async def _get_page_html(agent: Agent) -> str: + try: + cdp_session = await agent.browser_session.get_or_create_cdp_session() + doc = await cdp_session.cdp_client.send.DOM.getDocument(session_id=cdp_session.session_id) + html_result = await cdp_session.cdp_client.send.DOM.getOuterHTML( + params={"nodeId": doc["root"]["nodeId"]}, + session_id=cdp_session.session_id, + ) + return str(html_result.get("outerHTML", "")) + except Exception: + return "" + + +async def _capture_page_state(agent: Agent) -> dict[str, Any]: + url = "" + title = "" + summary = None + try: + summary = await agent.browser_session.get_browser_state_summary() + except Exception: + summary = None + + if summary is not None: + if isinstance(summary, dict): + url = str(summary.get("url") or "") + title = str(summary.get("title") or "") + else: + url = str(getattr(summary, "url", "") or "") + title = str(getattr(summary, "title", "") or "") + + if not url: + try: + url = str(await agent.browser_session.get_current_page_url() or "") + except Exception: + url = "" + if not title: + try: + title = str(await agent.browser_session.get_current_page_title() or "") + except Exception: + title = "" + + html = await _get_page_html(agent) + return {"url": url, "title": title, "html": html} + + +def _classify_captcha(haystack: str) -> str: + if any(token in haystack for token in _CF_STRONG): + return "cloudflare" + if any(token in haystack for token in _RECAPTCHA_STRONG): + return "recaptcha" + if any(token in haystack for token in _HCAPTCHA_STRONG): + return "hcaptcha" + return "unknown" + + +def _detect_captcha_from_state(page_state: dict[str, Any]) -> tuple[bool, str, list[str]]: + url = str(page_state.get("url") or "").lower() + title = str(page_state.get("title") or "").lower() + html = str(page_state.get("html") or "").lower() + haystack = "\n".join([url, title, html[:150000]]) + + signals: list[str] = [] + if any(token in haystack for token in _CF_STRONG): + signals.append("cloudflare_challenge") + if any(token in haystack for token in _RECAPTCHA_STRONG): + signals.append("recaptcha") + if any(token in haystack for token in _HCAPTCHA_STRONG): + signals.append("hcaptcha") + + generic_hits = [token for token in _GENERIC_CAPTCHA_STRONG if token in haystack] + if generic_hits: + signals.extend(f"generic:{token}" for token in generic_hits[:3]) + + blocked = bool(signals) + captcha_type = _classify_captcha(haystack) + return blocked, captcha_type, signals + + +async def _build_captcha_payload(task: RunnerTask, agent: Agent) -> dict[str, Any]: + page_state = await _capture_page_state(agent) + blocked, captcha_type, signals = _detect_captcha_from_state(page_state) + if not blocked: + raise RuntimeError("Captcha payload requested without an active challenge") + + verification = { + "mode": "dom_url_title", + "selectors_absent": [ + "iframe[src*='recaptcha']", + "[class*='hcaptcha']", + "[id*='captcha']", + "form[action*='challenge']", + "input[name='cf-turnstile-response']", + ], + "challenge_signals_absent": signals, + "max_wait_seconds": CAPTCHA_WAIT_TIMEOUT, + } + browser_view_url = task.browser_view_url or None + instructions = ( + "Open the live browser view, complete the verification challenge manually, " + "then return and reply 'ready' or 'done'." + ) + return { + "success": False, + "status": "awaiting_user_captcha", + "task_id": task.task_id, + "session_id": task.task_id, + "resume_token": task.resume_token, + "browser_view_url": browser_view_url, + "captcha_type": captcha_type, + "instructions": instructions, + "detected_at": time.time(), + "page_url": page_state.get("url"), + "page_title": page_state.get("title"), + "verification": verification, + "human_intervention": { + "status": "awaiting_user_captcha", + "task_id": task.task_id, + "session_id": task.task_id, + "resume_token": task.resume_token, + "browser_view_url": browser_view_url, + "captcha_type": captcha_type, + "instructions": instructions, + "detected_at": time.time(), + "verification": verification, + }, + } + + +async def _verify_captcha_state(task: RunnerTask) -> dict[str, Any]: + if not task.agent: + return {"success": False, "verified": False, "error": "Task is not attached to an active agent"} + + page_state = await _capture_page_state(task.agent) + blocked, captcha_type, signals = _detect_captcha_from_state(page_state) + return { + "success": True, + "task_id": task.task_id, + "verified": not blocked, + "captcha_type": captcha_type if blocked else None, + "page_url": page_state.get("url"), + "page_title": page_state.get("title"), + "signals": signals, + "verification_mode": "dom_url_title", + } + + +async def _run_browser_task(task: RunnerTask): cdp_url = os.getenv("BROWSER_CDP_URL", "http://127.0.0.1:9222") - browser_view_url = os.getenv("BROWSER_VIEW_URL", "") browser = Browser(cdp_url=cdp_url) - llm = ChatOpenAI( model=os.getenv("MODEL_DEFAULT", "qwen3.5-122b"), api_key=os.getenv("OPENAI_API_KEY"), @@ -29,22 +255,131 @@ async def run_browser_task(task): temperature=0.0, ) - agent = Agent(task=task, llm=llm, browser=browser) + agent = Agent(task=task.task, llm=llm, browser=browser) + task.browser = browser + task.agent = agent + + async def on_step_end(current_agent: Agent): + if task.awaiting or task.finished or task.aborted: + return + page_state = await _capture_page_state(current_agent) + blocked, _, _ = _detect_captcha_from_state(page_state) + if not blocked: + return + payload = await _build_captcha_payload(task, current_agent) + task.set_payload("awaiting_user_captcha", payload) + current_agent.pause() try: - history = await agent.run() - return { - "success": True, - "result": history.final_result(), - "browser_view": browser_view_url, - } + history = await agent.run(on_step_end=on_step_end) + if task.aborted: + task.set_payload( + "failed", + {"success": False, "status": "failed", "error": task.error or "Task aborted during CAPTCHA flow."}, + ) + return + if task.awaiting: + return + task.set_payload( + "succeeded", + { + "success": True, + "status": "succeeded", + "result": history.final_result(), + "browser_view_url": task.browser_view_url or None, + }, + ) except Exception as err: - return {"success": False, "error": f"Browser automation failed: {err}"} + if not task.awaiting and not task.finished: + task.set_payload( + "failed", + {"success": False, "status": "failed", "error": f"Browser automation failed: {err}"}, + ) finally: - try: - await browser.close() - except Exception: - pass + if not task.awaiting: + try: + await browser.close() + except Exception: + pass + + +def _runner_thread_main(task: RunnerTask) -> None: + loop = asyncio.new_event_loop() + task.loop = loop + asyncio.set_event_loop(loop) + try: + loop.run_until_complete(_run_browser_task(task)) + finally: + pending = asyncio.all_tasks(loop=loop) + for pending_task in pending: + pending_task.cancel() + if pending: + loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True)) + loop.close() + + +def _start_task(task_id: str, task_text: str) -> dict[str, Any]: + existing = _get_task(task_id) + if existing: + return existing.payload or { + "success": False, + "status": existing.status, + "error": "Task already exists", + "task_id": task_id, + } + + state = _put_task( + RunnerTask( + task_id=task_id, + task=task_text, + browser_view_url=os.getenv("BROWSER_VIEW_URL", ""), + ) + ) + thread = threading.Thread(target=_runner_thread_main, args=(state,), daemon=True, name=f"browser-task-{task_id[:8]}") + state.thread = thread + thread.start() + + state.settled_event.wait() + return state.payload or {"success": False, "status": "failed", "error": "Task exited without payload", "task_id": task_id} + + +def _resume_task(task_id: str) -> dict[str, Any]: + state = _get_task(task_id) + if not state: + return {"success": False, "status": "failed", "error": "Task not found", "task_id": task_id} + if not state.loop or not state.agent: + return {"success": False, "status": "failed", "error": "Task cannot be resumed", "task_id": task_id} + + state.awaiting = False + state.settled_event.clear() + state.loop.call_soon_threadsafe(state.agent.resume) + state.settled_event.wait() + return state.payload or {"success": False, "status": "failed", "error": "Resume did not produce a payload", "task_id": task_id} + + +def _verify_task(task_id: str) -> dict[str, Any]: + state = _get_task(task_id) + if not state: + return {"success": False, "verified": False, "error": "Task not found", "task_id": task_id} + if not state.loop: + return {"success": False, "verified": False, "error": "Task has no active event loop", "task_id": task_id} + future = asyncio.run_coroutine_threadsafe(_verify_captcha_state(state), state.loop) + return future.result(timeout=20) + + +def _abort_task(task_id: str, reason: str | None = None) -> dict[str, Any]: + state = _get_task(task_id) + if not state: + return {"success": False, "status": "failed", "error": "Task not found", "task_id": task_id} + state.aborted = True + state.error = reason or "CAPTCHA flow aborted by user." + if state.loop and state.agent: + state.loop.call_soon_threadsafe(state.agent.resume) + state.set_payload( + "failed", + {"success": False, "status": "failed", "task_id": task_id, "error": state.error, "error_code": "captcha_aborted"}, + ) + return state.payload class BrowserUseRPCHandler(BaseHTTPRequestHandler): @@ -62,28 +397,49 @@ class BrowserUseRPCHandler(BaseHTTPRequestHandler): _json_response(self, 503, {"success": False, "error": f"Browser is not ready: {err}"}) def do_POST(self): - if self.path != "/run": - _json_response(self, 404, {"success": False, "error": "Not found"}) - return - try: content_length = int(self.headers.get("Content-Length", "0")) raw = self.rfile.read(content_length) payload = json.loads(raw.decode("utf-8") if raw else "{}") + except json.JSONDecodeError: + _json_response(self, 400, {"success": False, "error": "Invalid JSON payload"}) + return + + if self.path == "/run": task = payload.get("task", "") + task_id = str(payload.get("task_id") or uuid.uuid4().hex) if not isinstance(task, str) or not task.strip(): _json_response(self, 400, {"success": False, "error": "Field 'task' is required"}) return + result = _start_task(task_id=task_id, task_text=task.strip()) + _json_response(self, 200, result) + return - result = asyncio.run(run_browser_task(task.strip())) - code = 200 if result.get("success") else 500 - _json_response(self, code, result) - except json.JSONDecodeError: - _json_response(self, 400, {"success": False, "error": "Invalid JSON payload"}) - except error.URLError as err: - _json_response(self, 503, {"success": False, "error": f"Transport error: {err}"}) - except Exception as err: - _json_response(self, 500, {"success": False, "error": f"Internal error: {err}"}) + if self.path == "/verify": + task_id = str(payload.get("task_id") or "") + if not task_id: + _json_response(self, 400, {"success": False, "error": "Field 'task_id' is required"}) + return + _json_response(self, 200, _verify_task(task_id)) + return + + if self.path == "/resume": + task_id = str(payload.get("task_id") or "") + if not task_id: + _json_response(self, 400, {"success": False, "error": "Field 'task_id' is required"}) + return + _json_response(self, 200, _resume_task(task_id)) + return + + if self.path == "/abort": + task_id = str(payload.get("task_id") or "") + if not task_id: + _json_response(self, 400, {"success": False, "error": "Field 'task_id' is required"}) + return + _json_response(self, 200, _abort_task(task_id, reason=payload.get("reason"))) + return + + _json_response(self, 404, {"success": False, "error": "Not found"}) def log_message(self, format_str, *args): return diff --git a/hermes_code/cli.py b/hermes_code/cli.py index c15bd87b..f5908161 100644 --- a/hermes_code/cli.py +++ b/hermes_code/cli.py @@ -1922,6 +1922,7 @@ class HermesCLI: platform="cli", session_db=self._session_db, clarify_callback=self._clarify_callback, + captcha_callback=self._captcha_callback, reasoning_callback=( self._stream_reasoning_delta if (self.streaming_enabled and self.show_reasoning) else self._on_reasoning if (self.show_reasoning or self.verbose) @@ -5113,6 +5114,40 @@ class HermesCLI: "Use your best judgement to make the choice and proceed." ) + def _captcha_callback(self, payload): + """Prompt the user to complete a paused CAPTCHA flow in the live browser.""" + import time as _time + + timeout = int((payload.get("verification") or {}).get("max_wait_seconds", 900)) + response_queue = queue.Queue() + self._captcha_state = { + "payload": payload, + "response_queue": response_queue, + } + self._captcha_deadline = _time.monotonic() + timeout + self._invalidate() + + last_refresh = _time.monotonic() + while True: + try: + result = response_queue.get(timeout=1) + self._captcha_deadline = 0 + return result + except queue.Empty: + remaining = self._captcha_deadline - _time.monotonic() + if remaining <= 0: + break + now = _time.monotonic() + if now - last_refresh >= 5.0: + last_refresh = now + self._invalidate() + + self._captcha_state = None + self._captcha_deadline = 0 + self._invalidate() + _cprint(f"\n{_DIM}(captcha wait timed out after {timeout}s){_RST}") + return {"action": "timeout", "user_response": ""} + def _sudo_password_callback(self) -> str: """ Prompt for sudo password through the prompt_toolkit UI. @@ -5812,6 +5847,8 @@ class HermesCLI: return [("class:sudo-prompt", f"🔑 {state_suffix}")] if self._approval_state: return [("class:prompt-working", f"⚠ {state_suffix}")] + if self._captcha_state: + return [("class:prompt-working", f"🧩 {state_suffix}")] if self._clarify_freetext: return [("class:clarify-selected", f"✎ {state_suffix}")] if self._clarify_state: @@ -5878,6 +5915,7 @@ class HermesCLI: sudo_widget, secret_widget, approval_widget, + captcha_widget, clarify_widget, spinner_widget, spacer, @@ -5900,6 +5938,7 @@ class HermesCLI: sudo_widget, secret_widget, approval_widget, + captcha_widget, clarify_widget, spinner_widget, spacer, @@ -5983,6 +6022,10 @@ class HermesCLI: self._approval_deadline = 0 self._approval_lock = threading.Lock() # serialize concurrent approval prompts (delegation race fix) + # CAPTCHA / human verification prompt state + self._captcha_state = None # dict with payload + response_queue + self._captcha_deadline = 0 + # Slash command loading state self._command_running = False self._command_status = "" @@ -6058,6 +6101,23 @@ class HermesCLI: event.app.invalidate() return + # --- CAPTCHA prompt: accept ready/done/abort style input --- + if self._captcha_state: + text = event.app.current_buffer.text.strip() + normalized = text.lower() + if normalized in {"abort", "cancel", "stop"}: + action = "abort" + elif text: + action = "ready" + else: + return + self._captcha_state["response_queue"].put({"action": action, "user_response": text}) + self._captcha_state = None + self._captcha_deadline = 0 + event.app.current_buffer.reset() + event.app.invalidate() + return + # --- Clarify freetext mode: user typed their own answer --- if self._clarify_freetext and self._clarify_state: text = event.app.current_buffer.text.strip() @@ -6194,7 +6254,7 @@ class HermesCLI: # Buffer.auto_up/auto_down handle both: cursor movement when multi-line, # history browsing when on the first/last line (or single-line input). _normal_input = Condition( - lambda: not self._clarify_state and not self._approval_state and not self._sudo_state and not self._secret_state + lambda: not self._clarify_state and not self._approval_state and not self._captcha_state and not self._sudo_state and not self._secret_state ) @kb.add('up', filter=_normal_input) @@ -6261,6 +6321,14 @@ class HermesCLI: event.app.invalidate() return + if self._captcha_state: + self._captcha_state["response_queue"].put({"action": "timeout", "user_response": ""}) + self._captcha_state = None + self._captcha_deadline = 0 + event.app.current_buffer.reset() + event.app.invalidate() + return + # Cancel clarify prompt if self._clarify_state: self._clarify_state["response_queue"].put( @@ -6334,7 +6402,7 @@ class HermesCLI: # Guard: don't START recording during agent run or interactive prompts if cli_ref._agent_running: return - if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state: + if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state or cli_ref._captcha_state: return # Guard: don't start while a previous stop/transcribe cycle is # still running — recorder.stop() holds AudioRecorder._lock and @@ -6554,6 +6622,8 @@ class HermesCLI: return "type secret (hidden), Enter to skip" if cli_ref._approval_state: return "" + if cli_ref._captcha_state: + return "type ready/done after you solve the challenge, or abort to cancel" if cli_ref._clarify_freetext: return "type your answer here and press Enter" if cli_ref._clarify_state: @@ -6597,6 +6667,13 @@ class HermesCLI: ('class:clarify-countdown', f' ({remaining}s)'), ] + if cli_ref._captcha_state: + remaining = max(0, int(cli_ref._captcha_deadline - _time.monotonic())) + return [ + ('class:hint', " complete the challenge in the browser, then type 'ready'"), + ('class:clarify-countdown', f' ({remaining}s)'), + ] + if cli_ref._clarify_state: remaining = max(0, int(cli_ref._clarify_deadline - _time.monotonic())) countdown = f' ({remaining}s)' if cli_ref._clarify_deadline else '' @@ -6619,7 +6696,7 @@ class HermesCLI: return [] def get_hint_height(): - if cli_ref._sudo_state or cli_ref._secret_state or cli_ref._approval_state or cli_ref._clarify_state or cli_ref._command_running: + if cli_ref._sudo_state or cli_ref._secret_state or cli_ref._approval_state or cli_ref._captcha_state or cli_ref._clarify_state or cli_ref._command_running: return 1 # Keep a 1-line spacer while agent runs so output doesn't push # right up against the top rule of the input area @@ -6644,7 +6721,7 @@ class HermesCLI: height=get_hint_height, ) - # --- Clarify tool: dynamic display widget for questions + choices --- + # --- Interactive panels: CAPTCHA + clarify --- def _panel_box_width(title: str, content_lines: list[str], min_width: int = 46, max_width: int = 76) -> int: """Choose a stable panel width wide enough for the title and content.""" @@ -6672,6 +6749,45 @@ class HermesCLI: def _append_blank_panel_line(lines, border_style: str, box_width: int) -> None: lines.append((border_style, "│" + (" " * box_width) + "│\n")) + def _get_captcha_display(): + state = cli_ref._captcha_state + if not state: + return [] + + payload = state.get("payload") or {} + title = "Manual CAPTCHA Required" + browser_view_url = payload.get("browser_view_url") or "Browser view URL is not configured." + body_lines = [ + payload.get("instructions") or "Open the live browser and complete the verification challenge.", + f"Type: {payload.get('captcha_type', 'unknown')}", + f"Task ID: {payload.get('task_id', '')}", + f"Browser: {browser_view_url}", + "When the challenge disappears, type 'ready' or 'done' and press Enter.", + "Type 'abort' to stop this browser task.", + ] + box_width = _panel_box_width(title, body_lines, min_width=56, max_width=94) + inner_text_width = max(8, box_width - 2) + lines = [] + lines.append(('class:captcha-border', '╭─ ')) + lines.append(('class:captcha-title', title)) + lines.append(('class:captcha-border', ' ' + ('─' * max(0, box_width - len(title) - 3)) + '╮\n')) + _append_blank_panel_line(lines, 'class:captcha-border', box_width) + for text in body_lines: + style = 'class:captcha-link' if text.startswith("Browser: ") else 'class:captcha-text' + for wrapped in _wrap_panel_text(text, inner_text_width): + _append_panel_line(lines, 'class:captcha-border', style, wrapped, box_width) + _append_blank_panel_line(lines, 'class:captcha-border', box_width) + lines.append(('class:captcha-border', '╰' + ('─' * box_width) + '╯\n')) + return lines + + captcha_widget = ConditionalContainer( + Window( + FormattedTextControl(_get_captcha_display), + wrap_lines=True, + ), + filter=Condition(lambda: cli_ref._captcha_state is not None), + ) + def _get_clarify_display(): """Build styled text for the clarify question/choices panel.""" state = cli_ref._clarify_state @@ -6897,6 +7013,7 @@ class HermesCLI: sudo_widget=sudo_widget, secret_widget=secret_widget, approval_widget=approval_widget, + captcha_widget=captcha_widget, clarify_widget=clarify_widget, spinner_widget=spinner_widget, spacer=spacer, @@ -6954,6 +7071,11 @@ class HermesCLI: 'approval-cmd': '#AAAAAA italic', 'approval-choice': '#AAAAAA', 'approval-selected': '#FFD700 bold', + # CAPTCHA panel + 'captcha-border': '#CD7F32', + 'captcha-title': '#FFBF00 bold', + 'captcha-text': '#FFF8DC', + 'captcha-link': '#87CEEB underline', # Voice mode 'voice-prompt': '#87CEEB', 'voice-recording': '#FF4444 bold', diff --git a/hermes_code/model_tools.py b/hermes_code/model_tools.py index ceae2ceb..4774a223 100644 --- a/hermes_code/model_tools.py +++ b/hermes_code/model_tools.py @@ -152,6 +152,7 @@ def _discover_tools(): "tools.memory_tool", "tools.session_search_tool", "tools.clarify_tool", + "tools.to_captcha_tool", "tools.code_execution_tool", "tools.delegate_tool", "tools.process_registry", @@ -362,7 +363,7 @@ def get_tool_definitions( # because they need agent-level state (TodoStore, MemoryStore, etc.). # The registry still holds their schemas; dispatch just returns a stub error # so if something slips through, the LLM sees a sensible message. -_AGENT_LOOP_TOOLS = {"todo", "memory", "session_search", "delegate_task"} +_AGENT_LOOP_TOOLS = {"todo", "memory", "session_search", "delegate_task", "to_captcha"} _READ_SEARCH_TOOLS = {"read_file", "search_files"} diff --git a/hermes_code/run_agent.py b/hermes_code/run_agent.py index 08e2807b..3d680d25 100644 --- a/hermes_code/run_agent.py +++ b/hermes_code/run_agent.py @@ -400,6 +400,7 @@ class AIAgent: thinking_callback: callable = None, reasoning_callback: callable = None, clarify_callback: callable = None, + captcha_callback: callable = None, step_callback: callable = None, stream_delta_callback: callable = None, tool_gen_callback: callable = None, @@ -447,6 +448,7 @@ class AIAgent: tool_progress_callback (callable): Callback function(tool_name, args_preview) for progress notifications clarify_callback (callable): Callback function(question, choices) -> str for interactive user questions. Provided by the platform layer (CLI or gateway). If None, the clarify tool returns an error. + captcha_callback (callable): Callback function(payload_dict) -> dict for manual CAPTCHA completion flows. max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set) reasoning_config (Dict): OpenRouter reasoning configuration override (e.g. {"effort": "none"} to disable thinking). If None, defaults to {"enabled": True, "effort": "medium"} for OpenRouter. Set to disable/customize reasoning. @@ -529,6 +531,7 @@ class AIAgent: self.thinking_callback = thinking_callback self.reasoning_callback = reasoning_callback self.clarify_callback = clarify_callback + self.captcha_callback = captcha_callback self.step_callback = step_callback self.stream_delta_callback = stream_delta_callback self.status_callback = status_callback @@ -4693,6 +4696,18 @@ class AIAgent: choices=function_args.get("choices"), callback=self.clarify_callback, ) + elif function_name == "to_captcha": + from tools.to_captcha_tool import to_captcha_tool as _to_captcha_tool + return _to_captcha_tool( + task_id=function_args.get("task_id", ""), + browser_view_url=function_args.get("browser_view_url"), + captcha_type=function_args.get("captcha_type"), + instructions=function_args.get("instructions"), + detected_at=function_args.get("detected_at"), + verification=function_args.get("verification"), + resume_token=function_args.get("resume_token"), + callback=self.captcha_callback, + ) elif function_name == "delegate_task": from tools.delegate_tool import delegate_task as _delegate_task return _delegate_task( @@ -4711,6 +4726,53 @@ class AIAgent: honcho_session_key=self._honcho_session_key, ) + def _maybe_resolve_captcha(self, function_name: str, function_result: str, effective_task_id: str) -> str: + """Bridge paused browser tasks into the dedicated CAPTCHA orchestration flow.""" + if function_name != "internet_browser": + return function_result + + try: + payload = json.loads(function_result) + except (json.JSONDecodeError, TypeError): + return function_result + if not isinstance(payload, dict): + return function_result + if payload.get("status") != "awaiting_user_captcha": + return function_result + + human = payload.get("human_intervention") or {} + captcha_args = { + "task_id": payload.get("task_id") or human.get("task_id") or effective_task_id, + "browser_view_url": human.get("browser_view_url"), + "captcha_type": human.get("captcha_type"), + "instructions": human.get("instructions"), + "detected_at": human.get("detected_at"), + "verification": human.get("verification"), + "resume_token": human.get("resume_token"), + } + captcha_result = self._invoke_tool("to_captcha", captcha_args, effective_task_id) + try: + captcha_payload = json.loads(captcha_result) + except (json.JSONDecodeError, TypeError): + return captcha_result + if not isinstance(captcha_payload, dict): + return captcha_result + + if captcha_payload.get("status") == "resumed": + task_result = captcha_payload.get("task_result") + if isinstance(task_result, dict): + return json.dumps(task_result, ensure_ascii=False) + return json.dumps( + { + "success": False, + "status": captcha_payload.get("status", "still_blocked"), + "task_id": captcha_args["task_id"], + "human_intervention": human, + "captcha_flow": captcha_payload, + }, + ensure_ascii=False, + ) + def _execute_tool_calls_concurrent(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None: """Execute multiple tool calls concurrently using a thread pool. @@ -4843,6 +4905,8 @@ class AIAgent: tool_duration = 0.0 else: function_name, function_args, function_result, tool_duration, is_error = r + function_result = self._maybe_resolve_captcha(function_name, function_result, effective_task_id) + is_error, _ = _detect_tool_failure(function_name, function_result) if is_error: result_preview = function_result[:200] if len(function_result) > 200 else function_result @@ -5029,6 +5093,21 @@ class AIAgent: tool_duration = time.time() - tool_start_time if self.quiet_mode: self._vprint(f" {_get_cute_tool_message_impl('clarify', function_args, tool_duration, result=function_result)}") + elif function_name == "to_captcha": + from tools.to_captcha_tool import to_captcha_tool as _to_captcha_tool + function_result = _to_captcha_tool( + task_id=function_args.get("task_id", ""), + browser_view_url=function_args.get("browser_view_url"), + captcha_type=function_args.get("captcha_type"), + instructions=function_args.get("instructions"), + detected_at=function_args.get("detected_at"), + verification=function_args.get("verification"), + resume_token=function_args.get("resume_token"), + callback=self.captcha_callback, + ) + tool_duration = time.time() - tool_start_time + if self.quiet_mode: + self._vprint(f" {_get_cute_tool_message_impl('to_captcha', function_args, tool_duration, result=function_result)}") elif function_name == "delegate_task": from tools.delegate_tool import delegate_task as _delegate_task tasks_arg = function_args.get("tasks") @@ -5099,6 +5178,7 @@ class AIAgent: logger.error("handle_function_call raised for %s: %s", function_name, tool_error, exc_info=True) tool_duration = time.time() - tool_start_time + function_result = self._maybe_resolve_captcha(function_name, function_result, effective_task_id) result_preview = function_result if self.verbose_logging else ( function_result[:200] if len(function_result) > 200 else function_result ) diff --git a/hermes_code/tools/browser_use_tool.py b/hermes_code/tools/browser_use_tool.py index 167b61b3..d5b467fc 100644 --- a/hermes_code/tools/browser_use_tool.py +++ b/hermes_code/tools/browser_use_tool.py @@ -1,65 +1,130 @@ import json import os +import time from urllib import error, request + from tools.registry import registry -def run_browser_task(task): +def _browser_api_base_url() -> str: + return os.getenv("BROWSER_API_URL", "http://browser-api:8088/api/browser").rstrip("/") + + +def _http_json(url: str, method: str = "GET", payload: dict | None = None, timeout_sec: int = 30) -> dict: + body = None + headers = {"Content-Type": "application/json"} + if payload is not None: + body = json.dumps(payload).encode("utf-8") + req = request.Request(url, data=body, headers=headers, method=method) + try: + with request.urlopen(req, timeout=timeout_sec) as resp: + raw = resp.read().decode("utf-8") + return json.loads(raw) if raw else {} + except error.HTTPError as http_err: + raw = http_err.read().decode("utf-8", errors="replace") + try: + data = json.loads(raw) if raw else {} + except json.JSONDecodeError: + data = {"details": raw} + return { + "success": False, + "error": f"Browser API returned HTTP {http_err.code}", + "details": data, + } + except Exception as err: + return {"success": False, "error": f"Browser API request failed: {err}"} + + +def run_browser_task(task: str): if not task or not str(task).strip(): return json.dumps({"success": False, "error": "Task is required"}, ensure_ascii=False) - rpc_url = os.getenv("BROWSER_USE_RPC_URL", "http://browser:8787/run") timeout_sec = int(os.getenv("BROWSER_USE_RPC_TIMEOUT", "900")) - payload = json.dumps({"task": task}).encode("utf-8") - req = request.Request(rpc_url, data=payload, headers={"Content-Type": "application/json"}, method="POST") + poll_interval = float(os.getenv("BROWSER_API_POLL_INTERVAL", "1.5")) + api_base = _browser_api_base_url() - try: - with request.urlopen(req, timeout=timeout_sec) as resp: - body = resp.read().decode("utf-8") - return body - except error.HTTPError as http_err: - body = http_err.read().decode("utf-8", errors="replace") + accepted = _http_json( + f"{api_base}/tasks", + method="POST", + payload={"task": task, "timeout": timeout_sec, "metadata": {"source": "internet_browser"}}, + timeout_sec=30, + ) + task_id = accepted.get("task_id") + if not task_id: return json.dumps( { "success": False, - "error": f"browser-use RPC returned HTTP {http_err.code}", - "details": body, - }, - ensure_ascii=False, - ) - except Exception as err: - return json.dumps( - { - "success": False, - "error": f"browser-use RPC request failed: {err}", + "error": accepted.get("error", "Browser task was not accepted"), + "details": accepted, }, ensure_ascii=False, ) + deadline = time.time() + timeout_sec + 10 + status_url = f"{api_base}/tasks/{task_id}" + result_url = f"{api_base}/tasks/{task_id}/result" + + while time.time() < deadline: + status_payload = _http_json(status_url, timeout_sec=15) + status = status_payload.get("status") + if not status and status_payload.get("error"): + return json.dumps( + { + "success": False, + "status": "failed", + "task_id": task_id, + "error": status_payload.get("error"), + "details": status_payload.get("details"), + }, + ensure_ascii=False, + ) + if status == "awaiting_user_captcha": + return json.dumps( + { + "success": False, + "status": status, + "task_id": task_id, + "human_intervention": status_payload.get("human_intervention"), + }, + ensure_ascii=False, + ) + if status in {"succeeded", "failed"}: + result_payload = _http_json(result_url, timeout_sec=30) + result_payload.setdefault("task_id", task_id) + return json.dumps(result_payload, ensure_ascii=False) + time.sleep(poll_interval) + + return json.dumps( + { + "success": False, + "status": "failed", + "task_id": task_id, + "error": "Timed out while waiting for browser task result", + }, + ensure_ascii=False, + ) + registry.register( name="internet_browser", - toolset="browse_cmd", + toolset="browse_cmd", schema={ "name": "internet_browser", "description": ( - "ГЛАВНЫЙ ИНСТРУМЕНТ ДЛЯ ВЕБ-СЕРФИНГА. Вызывай этот инструмент НАПРЯМУЮ (через стандартный tool call/function call). " - "КАТЕГОРИЧЕСКИ ЗАПРЕЩЕНО использовать `execute_code` или `delegate_task` для работы с браузером. " - "Не пиши Python-скрипты! Просто передай в этот инструмент параметр `task`. " - "Используй для любых задач в интернете: поиск товаров (Wildberries, Ozon), чтение статей, клики, навигация." + "Main browser automation tool for internet tasks. Call it directly via a normal tool/function call. " + "Do not use execute_code or delegate_task for browser work. Pass the task in natural language." ), "parameters": { "type": "object", "properties": { "task": { - "type": "string", - "description": "Подробная задача на естественном языке. Например: 'Зайди на wildberries.ru, найди черную футболку и верни цену'." + "type": "string", + "description": "Detailed natural-language browser task." } }, "required": ["task"] } }, - handler=lambda args, **kw: run_browser_task(args.get("task")), emoji="🌐", -) \ No newline at end of file +) From 3b7d02ee8160c6f6db7618e94f205a43f7c0b9a7 Mon Sep 17 00:00:00 2001 From: VladislavIlin7 Date: Fri, 24 Apr 2026 15:55:07 +0300 Subject: [PATCH 2/2] fix to_capcha --- browser_env/entrypoint.sh | 2 +- hermes_code/agent/prompt_builder.py | 9 +++++++++ hermes_code/run_agent.py | 3 +++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/browser_env/entrypoint.sh b/browser_env/entrypoint.sh index 052ca6c5..bf426a96 100644 --- a/browser_env/entrypoint.sh +++ b/browser_env/entrypoint.sh @@ -98,7 +98,7 @@ if [ ! -f /var/lib/dbus/machine-id ]; then dbus-uuidgen > /var/lib/dbus/machine-id 2>/dev/null || true fi -# Удаляем stale lock/socket от прошлых падений Xvfb на том же DISPLAY. +# Удаляем stale lock/socket от прошлых падений Xvfb на том же DISPLAY. rm -f "/tmp/.X${DISPLAY_NUM}-lock" "/tmp/.X11-unix/X${DISPLAY_NUM}" || true log "starting X stack on DISPLAY=${DISPLAY}" diff --git a/hermes_code/agent/prompt_builder.py b/hermes_code/agent/prompt_builder.py index d6c4c6a6..f592c206 100644 --- a/hermes_code/agent/prompt_builder.py +++ b/hermes_code/agent/prompt_builder.py @@ -154,6 +154,15 @@ SKILLS_GUIDANCE = ( "Skills that aren't maintained become liabilities." ) +BROWSER_CAPTCHA_GUIDANCE = ( + "For browser tasks, do not pre-emptively refuse just because CAPTCHA may appear. " + "Start the task with internet_browser. " + "If the browser runtime reports status='awaiting_user_captcha', immediately use to_captcha " + "to hand control to the user for manual verification and then resume. " + "Important: you must never claim that you solved CAPTCHA yourself and you must not attempt bypass methods. " + "Your role is orchestration: run browser steps, request manual CAPTCHA completion from the user, verify, and continue." +) + PLATFORM_HINTS = { "whatsapp": ( "You are on a text messaging communication platform, WhatsApp. " diff --git a/hermes_code/run_agent.py b/hermes_code/run_agent.py index 3d680d25..21384dcd 100644 --- a/hermes_code/run_agent.py +++ b/hermes_code/run_agent.py @@ -73,6 +73,7 @@ from hermes_constants import OPENROUTER_BASE_URL from agent.prompt_builder import ( DEFAULT_AGENT_IDENTITY, PLATFORM_HINTS, MEMORY_GUIDANCE, SESSION_SEARCH_GUIDANCE, SKILLS_GUIDANCE, + BROWSER_CAPTCHA_GUIDANCE, ) from agent.model_metadata import ( fetch_model_metadata, @@ -2279,6 +2280,8 @@ class AIAgent: tool_guidance.append(SESSION_SEARCH_GUIDANCE) if "skill_manage" in self.valid_tool_names: tool_guidance.append(SKILLS_GUIDANCE) + if "internet_browser" in self.valid_tool_names and "to_captcha" in self.valid_tool_names: + tool_guidance.append(BROWSER_CAPTCHA_GUIDANCE) if tool_guidance: prompt_parts.append(" ".join(tool_guidance))