Compare commits

...
Sign in to create a new pull request.

2 commits

Author SHA1 Message Date
3b7d02ee81 fix to_capcha 2026-04-24 15:55:07 +03:00
f1f32d8366 add new tool: to_captcha 2026-04-21 23:32:09 +03:00
16 changed files with 1021 additions and 131 deletions

View file

@ -7,19 +7,21 @@ from api.clients.browser_rpc_contracts import BrowserRpcError
class BrowserRpcClient:
def __init__(self, rpc_url: str, session: aiohttp.ClientSession) -> None:
self._rpc_url = rpc_url
self._run_url = rpc_url.rstrip("/")
if self._run_url.endswith("/run"):
self._base_url = self._run_url[: -len("/run")]
else:
self._base_url = self._run_url
self._run_url = f"{self._base_url}/run"
self._session = session
async def run(self, task: str, timeout_sec: float) -> dict[str, Any]:
payload = {"task": task}
timeout = aiohttp.ClientTimeout(total=timeout_sec)
async def _post_json(self, url: str, payload: dict[str, Any], timeout_sec: float | None = None) -> dict[str, Any]:
timeout = aiohttp.ClientTimeout(total=timeout_sec) if timeout_sec is not None else None
try:
async with self._session.post(self._rpc_url, json=payload, timeout=timeout) as response:
async with self._session.post(url, json=payload, timeout=timeout) as response:
if response.status >= 400:
body = await response.text()
raise BrowserRpcError(f"RPC HTTP: {response.status}: {body}")
try:
data = await response.json(content_type=None)
except aiohttp.ContentTypeError as exc:
@ -29,10 +31,22 @@ class BrowserRpcClient:
if not isinstance(data, dict):
raise BrowserRpcError("RPC returned invalid payload type")
return data
async def run(self, task_id: str, task: str, timeout_sec: float) -> dict[str, Any]:
return await self._post_json(self._run_url, {"task_id": task_id, "task": task}, timeout_sec=timeout_sec)
async def run_browser_task(rpc_url: str, task: str, timeout_sec: float) -> dict[str, Any]:
async def verify_captcha(self, task_id: str) -> dict[str, Any]:
return await self._post_json(f"{self._base_url}/verify", {"task_id": task_id}, timeout_sec=15)
async def resume(self, task_id: str, timeout_sec: float) -> dict[str, Any]:
return await self._post_json(f"{self._base_url}/resume", {"task_id": task_id}, timeout_sec=timeout_sec)
async def abort(self, task_id: str, reason: str | None = None) -> dict[str, Any]:
return await self._post_json(f"{self._base_url}/abort", {"task_id": task_id, "reason": reason}, timeout_sec=15)
async def run_browser_task(rpc_url: str, task_id: str, task: str, timeout_sec: float) -> dict[str, Any]:
async with aiohttp.ClientSession() as session:
return await BrowserRpcClient(rpc_url, session=session).run(task=task, timeout_sec=timeout_sec)
client = BrowserRpcClient(rpc_url, session=session)
return await client.run(task_id=task_id, task=task, timeout_sec=timeout_sec)

View file

@ -1,8 +1,15 @@
from typing import Any, Protocol
class BrowserRpcError(RuntimeError): ...
class BrowserRpcError(RuntimeError):
pass
class BrowserRpcRunner(Protocol):
async def run(self, task: str, timeout_sec: float) -> dict[str, Any]: ...
async def run(self, task_id: str, task: str, timeout_sec: float) -> dict[str, Any]: ...
async def verify_captcha(self, task_id: str) -> dict[str, Any]: ...
async def resume(self, task_id: str, timeout_sec: float) -> dict[str, Any]: ...
async def abort(self, task_id: str, reason: str | None = None) -> dict[str, Any]: ...

View file

@ -1,4 +1,4 @@
from typing import Any
from typing import Any, Literal
from pydantic import BaseModel, Field
@ -6,38 +6,67 @@ from api.domain.task_status import TaskStatus
class BrowserTaskRequest(BaseModel):
"""Запрос на запуск задачи в browser-use агенте."""
"""Request to start a browser task."""
task: str = Field(..., description="Текстовая задача для browser-use агента")
timeout: int = Field(300, description="Максимальное время выполнения задачи в секундах")
metadata: dict[str, Any] | None = Field(default=None, description="Дополнительные метаданные клиента")
task: str = Field(..., description="Text task for the browser-use agent")
timeout: int = Field(300, description="Maximum task execution time in seconds")
metadata: dict[str, Any] | None = Field(default=None, description="Optional client metadata")
class BrowserCaptchaVerification(BaseModel):
mode: Literal["dom_url_title"] = "dom_url_title"
selectors_absent: list[str] = Field(default_factory=list)
challenge_signals_absent: list[str] = Field(default_factory=list)
max_wait_seconds: int = Field(..., description="How long the task may stay paused waiting for the user")
class BrowserHumanInterventionPayload(BaseModel):
status: Literal["awaiting_user_captcha"] = "awaiting_user_captcha"
task_id: str
session_id: str
resume_token: str
browser_view_url: str | None = None
captcha_type: Literal["cloudflare", "recaptcha", "hcaptcha", "unknown"] = "unknown"
instructions: str
detected_at: float
verification: BrowserCaptchaVerification
class BrowserCaptchaReadyRequest(BaseModel):
user_response: str | None = Field(default=None, description="Free-form confirmation from the user")
class BrowserCaptchaAbortRequest(BaseModel):
reason: str | None = Field(default=None, description="Optional reason for aborting the CAPTCHA flow")
class BrowserTaskAcceptedResponse(BaseModel):
"""Ответ о том, что задача принята в обработку."""
"""Response indicating that a task was accepted for processing."""
task_id: str
status: TaskStatus
class BrowserTaskStatusResponse(BaseModel):
"""Текущий статус задачи и временные отметки ее выполнения."""
"""Current task status and timestamps."""
task_id: str
status: TaskStatus
create_at: float = Field(..., description="Время создания задачи в Unix timestamp")
started_at: float | None = Field(default=None, description="Время начала выполнения в Unix timestamp")
finished_at: float | None = Field(default=None, description="Время завершения выполнения в Unix timestamp")
error: str | None = Field(default=None, description="Текст ошибки, если задача завершилась с ошибкой")
create_at: float = Field(..., description="Task creation time as a Unix timestamp")
started_at: float | None = Field(default=None, description="Task start time as a Unix timestamp")
finished_at: float | None = Field(default=None, description="Task completion time as a Unix timestamp")
error: str | None = Field(default=None, description="Task error when applicable")
human_intervention: BrowserHumanInterventionPayload | None = None
class BrowserTaskResultResponse(BaseModel):
"""Финальный результат выполнения задачи в browser-use."""
"""Terminal result or meaningful paused state for a browser task."""
task_id: str
status: TaskStatus
success: bool = Field(..., description="Успешно ли выполнена задача")
execution_time: float = Field(..., description="Фактическое время выполнения в секундах")
result: str | None = Field(default=None, description="Итоговый текстовый результат")
error: str | None = Field(default=None, description="Текст ошибки, если выполнение не удалось")
raw_response: dict[str, Any] | None = Field(default=None, description="Сырой ответ от browser-use RPC")
success: bool = Field(..., description="Whether the task completed successfully")
execution_time: float = Field(..., description="Observed execution time in seconds")
result: str | None = Field(default=None, description="Final textual result when available")
error: str | None = Field(default=None, description="Error text when execution failed")
raw_response: dict[str, Any] | None = Field(default=None, description="Raw payload returned by the browser runtime")
human_intervention: BrowserHumanInterventionPayload | None = None

View file

@ -9,6 +9,7 @@ class Settings:
browser_rpc_url: str = os.getenv("BROWSER_USE_RPC_URL", "http://browser:8787/run")
browser_rpc_timeout: float = float(os.getenv("BROWSER_USE_RPC_TIMEOUT", "900"))
captcha_wait_timeout: int = int(os.getenv("BROWSER_CAPTCHA_MAX_WAIT_SECONDS", "900"))
max_concurrency: int = int(os.getenv("BROWSER_API_MAX_CONCURRENCY", "2"))

View file

@ -2,8 +2,10 @@ from enum import Enum
class TaskStatus(str, Enum):
"""Состояние задачи браузерного агента."""
"""Browser task lifecycle states."""
queued = "queued"
running = "running"
awaiting_user_captcha = "awaiting_user_captcha"
succeeded = "succeeded"
failed = "failed"

View file

@ -18,6 +18,7 @@ async def lifespan(app: FastAPI):
rpc_client=BrowserRpcClient(settings.browser_rpc_url, session=session),
max_concurrency=settings.max_concurrency,
rpc_timeout_cap=settings.browser_rpc_timeout,
captcha_wait_timeout=settings.captcha_wait_timeout,
)
app.state.task_service = task_service
try:

View file

@ -20,13 +20,17 @@ class TaskRecord:
result: str | None = None
error: str | None = None
raw_response: dict[str, Any] | None = None
human_intervention: dict[str, Any] | None = None
session_id: str | None = None
resume_token: str | None = None
awaiting_deadline: float | None = None
@property
def execution_time(self) -> float:
if self.started_at is None:
return 0
end = self.finished_at if self.finished_at is not None else time.time()
return max(0, end - self.started_at)
return max(0.0, end - self.started_at)
class TaskStore:
@ -51,16 +55,42 @@ class TaskStore:
if rec is None:
return None
rec.status = TaskStatus.running
rec.started_at = time.time()
rec.finished_at = None
rec.error = None
rec.human_intervention = None
rec.awaiting_deadline = None
if rec.started_at is None:
rec.started_at = time.time()
return rec
async def set_awaiting_captcha(
self,
task_id: str,
raw_response: dict[str, Any],
max_wait_seconds: int,
) -> TaskRecord | None:
async with self._lock:
rec = self._tasks.get(task_id)
if rec is None:
return None
payload = raw_response.get("human_intervention") or {}
rec.status = TaskStatus.awaiting_user_captcha
rec.raw_response = raw_response
rec.human_intervention = payload
rec.session_id = payload.get("session_id") or rec.session_id
rec.resume_token = payload.get("resume_token") or rec.resume_token
rec.awaiting_deadline = time.time() + max(1, int(max_wait_seconds))
rec.error = None
rec.finished_at = None
return rec
async def set_done(
self,
task_id: str,
success: bool,
raw_response: dict[str, Any] | None,
error: str | None,
result: str | None = None,
self,
task_id: str,
success: bool,
raw_response: dict[str, Any] | None,
error: str | None,
result: str | None = None,
) -> TaskRecord | None:
async with self._lock:
rec = self._tasks.get(task_id)
@ -69,8 +99,35 @@ class TaskStore:
rec.finished_at = time.time()
rec.raw_response = raw_response
rec.error = error if error is not None else (
raw_response.get("error") if isinstance(raw_response, dict) else None)
raw_response.get("error") if isinstance(raw_response, dict) else None
)
rec.result = result if result is not None else (
raw_response.get("result") if isinstance(raw_response, dict) else None)
raw_response.get("result") if isinstance(raw_response, dict) else None
)
rec.human_intervention = None
rec.awaiting_deadline = None
rec.status = TaskStatus.succeeded if success else TaskStatus.failed
return rec
async def expire_if_needed(self, task_id: str) -> TaskRecord | None:
async with self._lock:
rec = self._tasks.get(task_id)
if rec is None:
return None
if rec.status != TaskStatus.awaiting_user_captcha:
return rec
if rec.awaiting_deadline is None or rec.awaiting_deadline > time.time():
return rec
rec.status = TaskStatus.failed
rec.finished_at = time.time()
rec.error = "CAPTCHA wait expired before the user completed verification."
rec.raw_response = {
"success": False,
"status": TaskStatus.failed.value,
"error": rec.error,
"error_code": "captcha_wait_expired",
}
rec.human_intervention = None
rec.awaiting_deadline = None
return rec

View file

@ -2,6 +2,8 @@ from fastapi import APIRouter, Depends, HTTPException, Request
from fastapi.responses import JSONResponse
from api.contracts.task_schemas import (
BrowserCaptchaAbortRequest,
BrowserCaptchaReadyRequest,
BrowserTaskAcceptedResponse,
BrowserTaskRequest,
BrowserTaskResultResponse,
@ -20,8 +22,8 @@ def get_task_service(request: Request) -> TaskService:
@router.post("/tasks", response_model=BrowserTaskAcceptedResponse, status_code=202)
async def create_task(
payload: BrowserTaskRequest,
service: TaskService = Depends(get_task_service),
payload: BrowserTaskRequest,
service: TaskService = Depends(get_task_service),
) -> BrowserTaskAcceptedResponse:
rec = await service.submit_task(task=payload.task.strip(), timeout=payload.timeout, metadata=payload.metadata)
return BrowserTaskAcceptedResponse(task_id=rec.task_id, status=rec.status)
@ -37,8 +39,8 @@ async def get_task_status(task_id: str, service: TaskService = Depends(get_task_
@router.get("/tasks/{task_id}/result", response_model=BrowserTaskResultResponse)
async def get_task_result(
task_id: str,
service: TaskService = Depends(get_task_service),
task_id: str,
service: TaskService = Depends(get_task_service),
) -> JSONResponse | BrowserTaskResultResponse:
rec = await service.get_task(task_id)
if rec is None:
@ -55,18 +57,35 @@ async def get_task_result(
"result": None,
"error": None,
"raw_response": None,
"human_intervention": None,
},
)
return BrowserTaskResultResponse(
task_id=rec.task_id,
status=rec.status,
success=(rec.status == TaskStatus.succeeded),
execution_time=rec.execution_time,
result=rec.result,
error=rec.error,
raw_response=rec.raw_response,
)
return _to_result_response(rec)
@router.post("/tasks/{task_id}/captcha/ready", response_model=BrowserTaskResultResponse)
async def captcha_ready(
task_id: str,
payload: BrowserCaptchaReadyRequest,
service: TaskService = Depends(get_task_service),
) -> BrowserTaskResultResponse:
rec = await service.resume_captcha(task_id, user_response=payload.user_response)
if rec is None:
raise HTTPException(status_code=404, detail="Task not found")
return _to_result_response(rec)
@router.post("/tasks/{task_id}/captcha/abort", response_model=BrowserTaskResultResponse)
async def captcha_abort(
task_id: str,
payload: BrowserCaptchaAbortRequest,
service: TaskService = Depends(get_task_service),
) -> BrowserTaskResultResponse:
rec = await service.abort_captcha(task_id, reason=payload.reason)
if rec is None:
raise HTTPException(status_code=404, detail="Task not found")
return _to_result_response(rec)
def _to_status_response(rec: TaskRecord) -> BrowserTaskStatusResponse:
@ -77,4 +96,18 @@ def _to_status_response(rec: TaskRecord) -> BrowserTaskStatusResponse:
started_at=rec.started_at,
finished_at=rec.finished_at,
error=rec.error,
human_intervention=rec.human_intervention,
)
def _to_result_response(rec: TaskRecord) -> BrowserTaskResultResponse:
return BrowserTaskResultResponse(
task_id=rec.task_id,
status=rec.status,
success=(rec.status == TaskStatus.succeeded),
execution_time=rec.execution_time,
result=rec.result,
error=rec.error,
raw_response=rec.raw_response,
human_intervention=rec.human_intervention,
)

View file

@ -1,21 +1,24 @@
import asyncio
from api.clients.browser_rpc_contracts import BrowserRpcError, BrowserRpcRunner
from api.domain.task_status import TaskStatus
from api.repositories.task_store import TaskRecord, TaskStore
class TaskService:
def __init__(
self,
store: TaskStore,
rpc_client: BrowserRpcRunner,
max_concurrency: int,
rpc_timeout_cap: float | None = None,
self,
store: TaskStore,
rpc_client: BrowserRpcRunner,
max_concurrency: int,
rpc_timeout_cap: float | None = None,
captcha_wait_timeout: int = 900,
) -> None:
self._store = store
self._rpc_client = rpc_client
self._semaphore = asyncio.Semaphore(max_concurrency)
self._rpc_timeout_cap = rpc_timeout_cap
self._captcha_wait_timeout = captcha_wait_timeout
self._background_tasks: set[asyncio.Task[None]] = set()
async def submit_task(self, task: str, timeout: int, metadata: dict | None) -> TaskRecord:
@ -26,7 +29,22 @@ class TaskService:
return record
async def get_task(self, task_id: str) -> TaskRecord | None:
return await self._store.get(task_id)
before = await self._store.get(task_id)
was_awaiting = bool(before is not None and before.status == TaskStatus.awaiting_user_captcha)
await self._store.expire_if_needed(task_id)
after = await self._store.get(task_id)
if (
was_awaiting
and after is not None
and after.status == TaskStatus.failed
and after.error
and "expired" in after.error.lower()
):
try:
await self._rpc_client.abort(task_id, reason=after.error)
except BrowserRpcError:
pass
return after
async def close(self) -> None:
if not self._background_tasks:
@ -37,6 +55,86 @@ class TaskService:
await asyncio.gather(*self._background_tasks, return_exceptions=True)
self._background_tasks.clear()
async def resume_captcha(self, task_id: str, user_response: str | None = None) -> TaskRecord | None:
rec = await self.get_task(task_id)
if rec is None:
return None
if rec.status == TaskStatus.failed:
return rec
if rec.status != TaskStatus.awaiting_user_captcha:
return rec
verify_raw = await self._rpc_client.verify_captcha(task_id)
if not verify_raw.get("verified"):
await self._store.set_awaiting_captcha(
task_id=task_id,
raw_response={
"success": False,
"status": TaskStatus.awaiting_user_captcha.value,
"error": "CAPTCHA is still present.",
"human_intervention": rec.human_intervention,
"verification": verify_raw,
"user_response": user_response,
},
max_wait_seconds=(rec.human_intervention or {}).get("verification", {}).get(
"max_wait_seconds", self._captcha_wait_timeout
),
)
return await self._store.get(task_id)
await self._store.set_running(task_id)
try:
rpc_timeout = float(rec.timeout)
if self._rpc_timeout_cap is not None:
rpc_timeout = min(rpc_timeout, self._rpc_timeout_cap)
raw = await asyncio.wait_for(
self._rpc_client.resume(task_id=task_id, timeout_sec=rpc_timeout),
timeout=float(rec.timeout) + 5,
)
except asyncio.TimeoutError:
await self._store.set_done(
task_id=task_id,
success=False,
raw_response=None,
error="Timeout exceeded after CAPTCHA resume.",
)
return await self._store.get(task_id)
except BrowserRpcError as exc:
await self._store.set_done(
task_id=task_id,
success=False,
raw_response=None,
error=str(exc),
)
return await self._store.get(task_id)
await self._apply_rpc_result(task_id, raw)
return await self._store.get(task_id)
async def abort_captcha(self, task_id: str, reason: str | None = None) -> TaskRecord | None:
rec = await self.get_task(task_id)
if rec is None:
return None
if rec.status == TaskStatus.awaiting_user_captcha:
try:
await self._rpc_client.abort(task_id, reason=reason)
except BrowserRpcError:
pass
await self._store.set_done(
task_id=task_id,
success=False,
raw_response={
"success": False,
"status": TaskStatus.failed.value,
"error_code": "captcha_aborted",
"reason": reason,
},
error=reason or "User aborted CAPTCHA flow.",
)
return await self._store.get(task_id)
async def _worker(self, task_id: str) -> None:
rec = await self._store.set_running(task_id)
if rec is None:
@ -49,17 +147,10 @@ class TaskService:
rpc_timeout = min(rpc_timeout, self._rpc_timeout_cap)
raw = await asyncio.wait_for(
self._rpc_client.run(task=rec.task, timeout_sec=rpc_timeout),
self._rpc_client.run(task_id=task_id, task=rec.task, timeout_sec=rpc_timeout),
timeout=float(rec.timeout) + 5,
)
success = bool(raw.get("success"))
await self._store.set_done(
task_id=task_id,
success=success,
raw_response=raw,
error=None,
result=raw.get("result") if isinstance(raw, dict) else None,
)
await self._apply_rpc_result(task_id, raw)
except asyncio.TimeoutError:
await self._store.set_done(
task_id=task_id,
@ -81,3 +172,22 @@ class TaskService:
raw_response=None,
error=f"Internal error: {exc}",
)
async def _apply_rpc_result(self, task_id: str, raw: dict | None) -> None:
raw = raw or {}
status = raw.get("status")
if status == TaskStatus.awaiting_user_captcha.value:
human = raw.get("human_intervention") or {}
verification = human.get("verification") or {}
max_wait_seconds = verification.get("max_wait_seconds", self._captcha_wait_timeout)
await self._store.set_awaiting_captcha(task_id, raw_response=raw, max_wait_seconds=max_wait_seconds)
return
success = bool(raw.get("success"))
await self._store.set_done(
task_id=task_id,
success=success,
raw_response=raw,
error=None,
result=raw.get("result") if isinstance(raw, dict) else None,
)

View file

@ -1,12 +1,52 @@
import asyncio
import json
import os
import threading
import time
import uuid
from dataclasses import dataclass, field
from http.server import BaseHTTPRequestHandler, ThreadingHTTPServer
from typing import Any
from urllib import error, request
from browser_use import Agent, Browser, ChatOpenAI
CAPTCHA_WAIT_TIMEOUT = int(os.getenv("BROWSER_CAPTCHA_MAX_WAIT_SECONDS", "900"))
_RUNNER_TASKS: dict[str, "RunnerTask"] = {}
_RUNNER_LOCK = threading.Lock()
_CF_STRONG = (
"just a moment",
"attention required",
"checking your browser",
"cf-challenge",
"cdn-cgi/challenge-platform",
"__cf_chl",
"turnstile",
)
_RECAPTCHA_STRONG = (
"g-recaptcha",
"recaptcha/api2",
"www.google.com/recaptcha",
"grecaptcha",
)
_HCAPTCHA_STRONG = (
"hcaptcha",
"newassets.hcaptcha.com",
"js.hcaptcha.com/1/api.js",
)
_GENERIC_CAPTCHA_STRONG = (
"captcha",
"are you human",
"verify you are human",
"human verification",
"bot detection",
"security check",
"press and hold",
)
def _json_response(handler, status_code, payload):
data = json.dumps(payload, ensure_ascii=False).encode("utf-8")
handler.send_response(status_code)
@ -16,12 +56,198 @@ def _json_response(handler, status_code, payload):
handler.wfile.write(data)
async def run_browser_task(task):
@dataclass
class RunnerTask:
task_id: str
task: str
browser_view_url: str
resume_token: str = field(default_factory=lambda: uuid.uuid4().hex)
created_at: float = field(default_factory=time.time)
status: str = "starting"
payload: dict[str, Any] | None = None
error: str | None = None
agent: Any = None
browser: Any = None
loop: asyncio.AbstractEventLoop | None = None
thread: threading.Thread | None = None
settled_event: threading.Event = field(default_factory=threading.Event)
finished: bool = False
awaiting: bool = False
aborted: bool = False
transition_count: int = 0
lock: threading.Lock = field(default_factory=threading.Lock)
def set_payload(self, status: str, payload: dict[str, Any]) -> None:
with self.lock:
self.status = status
self.payload = payload
self.transition_count += 1
self.awaiting = status == "awaiting_user_captcha"
self.finished = status in {"succeeded", "failed"}
self.settled_event.set()
def _get_task(task_id: str) -> RunnerTask | None:
with _RUNNER_LOCK:
return _RUNNER_TASKS.get(task_id)
def _put_task(task: RunnerTask) -> RunnerTask:
with _RUNNER_LOCK:
_RUNNER_TASKS[task.task_id] = task
return task
async def _get_page_html(agent: Agent) -> str:
try:
cdp_session = await agent.browser_session.get_or_create_cdp_session()
doc = await cdp_session.cdp_client.send.DOM.getDocument(session_id=cdp_session.session_id)
html_result = await cdp_session.cdp_client.send.DOM.getOuterHTML(
params={"nodeId": doc["root"]["nodeId"]},
session_id=cdp_session.session_id,
)
return str(html_result.get("outerHTML", ""))
except Exception:
return ""
async def _capture_page_state(agent: Agent) -> dict[str, Any]:
url = ""
title = ""
summary = None
try:
summary = await agent.browser_session.get_browser_state_summary()
except Exception:
summary = None
if summary is not None:
if isinstance(summary, dict):
url = str(summary.get("url") or "")
title = str(summary.get("title") or "")
else:
url = str(getattr(summary, "url", "") or "")
title = str(getattr(summary, "title", "") or "")
if not url:
try:
url = str(await agent.browser_session.get_current_page_url() or "")
except Exception:
url = ""
if not title:
try:
title = str(await agent.browser_session.get_current_page_title() or "")
except Exception:
title = ""
html = await _get_page_html(agent)
return {"url": url, "title": title, "html": html}
def _classify_captcha(haystack: str) -> str:
if any(token in haystack for token in _CF_STRONG):
return "cloudflare"
if any(token in haystack for token in _RECAPTCHA_STRONG):
return "recaptcha"
if any(token in haystack for token in _HCAPTCHA_STRONG):
return "hcaptcha"
return "unknown"
def _detect_captcha_from_state(page_state: dict[str, Any]) -> tuple[bool, str, list[str]]:
url = str(page_state.get("url") or "").lower()
title = str(page_state.get("title") or "").lower()
html = str(page_state.get("html") or "").lower()
haystack = "\n".join([url, title, html[:150000]])
signals: list[str] = []
if any(token in haystack for token in _CF_STRONG):
signals.append("cloudflare_challenge")
if any(token in haystack for token in _RECAPTCHA_STRONG):
signals.append("recaptcha")
if any(token in haystack for token in _HCAPTCHA_STRONG):
signals.append("hcaptcha")
generic_hits = [token for token in _GENERIC_CAPTCHA_STRONG if token in haystack]
if generic_hits:
signals.extend(f"generic:{token}" for token in generic_hits[:3])
blocked = bool(signals)
captcha_type = _classify_captcha(haystack)
return blocked, captcha_type, signals
async def _build_captcha_payload(task: RunnerTask, agent: Agent) -> dict[str, Any]:
page_state = await _capture_page_state(agent)
blocked, captcha_type, signals = _detect_captcha_from_state(page_state)
if not blocked:
raise RuntimeError("Captcha payload requested without an active challenge")
verification = {
"mode": "dom_url_title",
"selectors_absent": [
"iframe[src*='recaptcha']",
"[class*='hcaptcha']",
"[id*='captcha']",
"form[action*='challenge']",
"input[name='cf-turnstile-response']",
],
"challenge_signals_absent": signals,
"max_wait_seconds": CAPTCHA_WAIT_TIMEOUT,
}
browser_view_url = task.browser_view_url or None
instructions = (
"Open the live browser view, complete the verification challenge manually, "
"then return and reply 'ready' or 'done'."
)
return {
"success": False,
"status": "awaiting_user_captcha",
"task_id": task.task_id,
"session_id": task.task_id,
"resume_token": task.resume_token,
"browser_view_url": browser_view_url,
"captcha_type": captcha_type,
"instructions": instructions,
"detected_at": time.time(),
"page_url": page_state.get("url"),
"page_title": page_state.get("title"),
"verification": verification,
"human_intervention": {
"status": "awaiting_user_captcha",
"task_id": task.task_id,
"session_id": task.task_id,
"resume_token": task.resume_token,
"browser_view_url": browser_view_url,
"captcha_type": captcha_type,
"instructions": instructions,
"detected_at": time.time(),
"verification": verification,
},
}
async def _verify_captcha_state(task: RunnerTask) -> dict[str, Any]:
if not task.agent:
return {"success": False, "verified": False, "error": "Task is not attached to an active agent"}
page_state = await _capture_page_state(task.agent)
blocked, captcha_type, signals = _detect_captcha_from_state(page_state)
return {
"success": True,
"task_id": task.task_id,
"verified": not blocked,
"captcha_type": captcha_type if blocked else None,
"page_url": page_state.get("url"),
"page_title": page_state.get("title"),
"signals": signals,
"verification_mode": "dom_url_title",
}
async def _run_browser_task(task: RunnerTask):
cdp_url = os.getenv("BROWSER_CDP_URL", "http://127.0.0.1:9222")
browser_view_url = os.getenv("BROWSER_VIEW_URL", "")
browser = Browser(cdp_url=cdp_url)
llm = ChatOpenAI(
model=os.getenv("MODEL_DEFAULT", "qwen3.5-122b"),
api_key=os.getenv("OPENAI_API_KEY"),
@ -29,22 +255,131 @@ async def run_browser_task(task):
temperature=0.0,
)
agent = Agent(task=task, llm=llm, browser=browser)
agent = Agent(task=task.task, llm=llm, browser=browser)
task.browser = browser
task.agent = agent
async def on_step_end(current_agent: Agent):
if task.awaiting or task.finished or task.aborted:
return
page_state = await _capture_page_state(current_agent)
blocked, _, _ = _detect_captcha_from_state(page_state)
if not blocked:
return
payload = await _build_captcha_payload(task, current_agent)
task.set_payload("awaiting_user_captcha", payload)
current_agent.pause()
try:
history = await agent.run()
return {
"success": True,
"result": history.final_result(),
"browser_view": browser_view_url,
}
history = await agent.run(on_step_end=on_step_end)
if task.aborted:
task.set_payload(
"failed",
{"success": False, "status": "failed", "error": task.error or "Task aborted during CAPTCHA flow."},
)
return
if task.awaiting:
return
task.set_payload(
"succeeded",
{
"success": True,
"status": "succeeded",
"result": history.final_result(),
"browser_view_url": task.browser_view_url or None,
},
)
except Exception as err:
return {"success": False, "error": f"Browser automation failed: {err}"}
if not task.awaiting and not task.finished:
task.set_payload(
"failed",
{"success": False, "status": "failed", "error": f"Browser automation failed: {err}"},
)
finally:
try:
await browser.close()
except Exception:
pass
if not task.awaiting:
try:
await browser.close()
except Exception:
pass
def _runner_thread_main(task: RunnerTask) -> None:
loop = asyncio.new_event_loop()
task.loop = loop
asyncio.set_event_loop(loop)
try:
loop.run_until_complete(_run_browser_task(task))
finally:
pending = asyncio.all_tasks(loop=loop)
for pending_task in pending:
pending_task.cancel()
if pending:
loop.run_until_complete(asyncio.gather(*pending, return_exceptions=True))
loop.close()
def _start_task(task_id: str, task_text: str) -> dict[str, Any]:
existing = _get_task(task_id)
if existing:
return existing.payload or {
"success": False,
"status": existing.status,
"error": "Task already exists",
"task_id": task_id,
}
state = _put_task(
RunnerTask(
task_id=task_id,
task=task_text,
browser_view_url=os.getenv("BROWSER_VIEW_URL", ""),
)
)
thread = threading.Thread(target=_runner_thread_main, args=(state,), daemon=True, name=f"browser-task-{task_id[:8]}")
state.thread = thread
thread.start()
state.settled_event.wait()
return state.payload or {"success": False, "status": "failed", "error": "Task exited without payload", "task_id": task_id}
def _resume_task(task_id: str) -> dict[str, Any]:
state = _get_task(task_id)
if not state:
return {"success": False, "status": "failed", "error": "Task not found", "task_id": task_id}
if not state.loop or not state.agent:
return {"success": False, "status": "failed", "error": "Task cannot be resumed", "task_id": task_id}
state.awaiting = False
state.settled_event.clear()
state.loop.call_soon_threadsafe(state.agent.resume)
state.settled_event.wait()
return state.payload or {"success": False, "status": "failed", "error": "Resume did not produce a payload", "task_id": task_id}
def _verify_task(task_id: str) -> dict[str, Any]:
state = _get_task(task_id)
if not state:
return {"success": False, "verified": False, "error": "Task not found", "task_id": task_id}
if not state.loop:
return {"success": False, "verified": False, "error": "Task has no active event loop", "task_id": task_id}
future = asyncio.run_coroutine_threadsafe(_verify_captcha_state(state), state.loop)
return future.result(timeout=20)
def _abort_task(task_id: str, reason: str | None = None) -> dict[str, Any]:
state = _get_task(task_id)
if not state:
return {"success": False, "status": "failed", "error": "Task not found", "task_id": task_id}
state.aborted = True
state.error = reason or "CAPTCHA flow aborted by user."
if state.loop and state.agent:
state.loop.call_soon_threadsafe(state.agent.resume)
state.set_payload(
"failed",
{"success": False, "status": "failed", "task_id": task_id, "error": state.error, "error_code": "captcha_aborted"},
)
return state.payload
class BrowserUseRPCHandler(BaseHTTPRequestHandler):
@ -62,28 +397,49 @@ class BrowserUseRPCHandler(BaseHTTPRequestHandler):
_json_response(self, 503, {"success": False, "error": f"Browser is not ready: {err}"})
def do_POST(self):
if self.path != "/run":
_json_response(self, 404, {"success": False, "error": "Not found"})
return
try:
content_length = int(self.headers.get("Content-Length", "0"))
raw = self.rfile.read(content_length)
payload = json.loads(raw.decode("utf-8") if raw else "{}")
except json.JSONDecodeError:
_json_response(self, 400, {"success": False, "error": "Invalid JSON payload"})
return
if self.path == "/run":
task = payload.get("task", "")
task_id = str(payload.get("task_id") or uuid.uuid4().hex)
if not isinstance(task, str) or not task.strip():
_json_response(self, 400, {"success": False, "error": "Field 'task' is required"})
return
result = _start_task(task_id=task_id, task_text=task.strip())
_json_response(self, 200, result)
return
result = asyncio.run(run_browser_task(task.strip()))
code = 200 if result.get("success") else 500
_json_response(self, code, result)
except json.JSONDecodeError:
_json_response(self, 400, {"success": False, "error": "Invalid JSON payload"})
except error.URLError as err:
_json_response(self, 503, {"success": False, "error": f"Transport error: {err}"})
except Exception as err:
_json_response(self, 500, {"success": False, "error": f"Internal error: {err}"})
if self.path == "/verify":
task_id = str(payload.get("task_id") or "")
if not task_id:
_json_response(self, 400, {"success": False, "error": "Field 'task_id' is required"})
return
_json_response(self, 200, _verify_task(task_id))
return
if self.path == "/resume":
task_id = str(payload.get("task_id") or "")
if not task_id:
_json_response(self, 400, {"success": False, "error": "Field 'task_id' is required"})
return
_json_response(self, 200, _resume_task(task_id))
return
if self.path == "/abort":
task_id = str(payload.get("task_id") or "")
if not task_id:
_json_response(self, 400, {"success": False, "error": "Field 'task_id' is required"})
return
_json_response(self, 200, _abort_task(task_id, reason=payload.get("reason")))
return
_json_response(self, 404, {"success": False, "error": "Not found"})
def log_message(self, format_str, *args):
return

View file

@ -98,7 +98,7 @@ if [ ! -f /var/lib/dbus/machine-id ]; then
dbus-uuidgen > /var/lib/dbus/machine-id 2>/dev/null || true
fi
# Удаляем stale lock/socket от прошлых падений Xvfb на том же DISPLAY.
# Удаляем stale lock/socket от прошлых падений Xvfb на том же DISPLAY.
rm -f "/tmp/.X${DISPLAY_NUM}-lock" "/tmp/.X11-unix/X${DISPLAY_NUM}" || true
log "starting X stack on DISPLAY=${DISPLAY}"

View file

@ -154,6 +154,15 @@ SKILLS_GUIDANCE = (
"Skills that aren't maintained become liabilities."
)
BROWSER_CAPTCHA_GUIDANCE = (
"For browser tasks, do not pre-emptively refuse just because CAPTCHA may appear. "
"Start the task with internet_browser. "
"If the browser runtime reports status='awaiting_user_captcha', immediately use to_captcha "
"to hand control to the user for manual verification and then resume. "
"Important: you must never claim that you solved CAPTCHA yourself and you must not attempt bypass methods. "
"Your role is orchestration: run browser steps, request manual CAPTCHA completion from the user, verify, and continue."
)
PLATFORM_HINTS = {
"whatsapp": (
"You are on a text messaging communication platform, WhatsApp. "

View file

@ -1922,6 +1922,7 @@ class HermesCLI:
platform="cli",
session_db=self._session_db,
clarify_callback=self._clarify_callback,
captcha_callback=self._captcha_callback,
reasoning_callback=(
self._stream_reasoning_delta if (self.streaming_enabled and self.show_reasoning)
else self._on_reasoning if (self.show_reasoning or self.verbose)
@ -5113,6 +5114,40 @@ class HermesCLI:
"Use your best judgement to make the choice and proceed."
)
def _captcha_callback(self, payload):
"""Prompt the user to complete a paused CAPTCHA flow in the live browser."""
import time as _time
timeout = int((payload.get("verification") or {}).get("max_wait_seconds", 900))
response_queue = queue.Queue()
self._captcha_state = {
"payload": payload,
"response_queue": response_queue,
}
self._captcha_deadline = _time.monotonic() + timeout
self._invalidate()
last_refresh = _time.monotonic()
while True:
try:
result = response_queue.get(timeout=1)
self._captcha_deadline = 0
return result
except queue.Empty:
remaining = self._captcha_deadline - _time.monotonic()
if remaining <= 0:
break
now = _time.monotonic()
if now - last_refresh >= 5.0:
last_refresh = now
self._invalidate()
self._captcha_state = None
self._captcha_deadline = 0
self._invalidate()
_cprint(f"\n{_DIM}(captcha wait timed out after {timeout}s){_RST}")
return {"action": "timeout", "user_response": ""}
def _sudo_password_callback(self) -> str:
"""
Prompt for sudo password through the prompt_toolkit UI.
@ -5812,6 +5847,8 @@ class HermesCLI:
return [("class:sudo-prompt", f"🔑 {state_suffix}")]
if self._approval_state:
return [("class:prompt-working", f"{state_suffix}")]
if self._captcha_state:
return [("class:prompt-working", f"🧩 {state_suffix}")]
if self._clarify_freetext:
return [("class:clarify-selected", f"{state_suffix}")]
if self._clarify_state:
@ -5878,6 +5915,7 @@ class HermesCLI:
sudo_widget,
secret_widget,
approval_widget,
captcha_widget,
clarify_widget,
spinner_widget,
spacer,
@ -5900,6 +5938,7 @@ class HermesCLI:
sudo_widget,
secret_widget,
approval_widget,
captcha_widget,
clarify_widget,
spinner_widget,
spacer,
@ -5983,6 +6022,10 @@ class HermesCLI:
self._approval_deadline = 0
self._approval_lock = threading.Lock() # serialize concurrent approval prompts (delegation race fix)
# CAPTCHA / human verification prompt state
self._captcha_state = None # dict with payload + response_queue
self._captcha_deadline = 0
# Slash command loading state
self._command_running = False
self._command_status = ""
@ -6058,6 +6101,23 @@ class HermesCLI:
event.app.invalidate()
return
# --- CAPTCHA prompt: accept ready/done/abort style input ---
if self._captcha_state:
text = event.app.current_buffer.text.strip()
normalized = text.lower()
if normalized in {"abort", "cancel", "stop"}:
action = "abort"
elif text:
action = "ready"
else:
return
self._captcha_state["response_queue"].put({"action": action, "user_response": text})
self._captcha_state = None
self._captcha_deadline = 0
event.app.current_buffer.reset()
event.app.invalidate()
return
# --- Clarify freetext mode: user typed their own answer ---
if self._clarify_freetext and self._clarify_state:
text = event.app.current_buffer.text.strip()
@ -6194,7 +6254,7 @@ class HermesCLI:
# Buffer.auto_up/auto_down handle both: cursor movement when multi-line,
# history browsing when on the first/last line (or single-line input).
_normal_input = Condition(
lambda: not self._clarify_state and not self._approval_state and not self._sudo_state and not self._secret_state
lambda: not self._clarify_state and not self._approval_state and not self._captcha_state and not self._sudo_state and not self._secret_state
)
@kb.add('up', filter=_normal_input)
@ -6261,6 +6321,14 @@ class HermesCLI:
event.app.invalidate()
return
if self._captcha_state:
self._captcha_state["response_queue"].put({"action": "timeout", "user_response": ""})
self._captcha_state = None
self._captcha_deadline = 0
event.app.current_buffer.reset()
event.app.invalidate()
return
# Cancel clarify prompt
if self._clarify_state:
self._clarify_state["response_queue"].put(
@ -6334,7 +6402,7 @@ class HermesCLI:
# Guard: don't START recording during agent run or interactive prompts
if cli_ref._agent_running:
return
if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state:
if cli_ref._clarify_state or cli_ref._sudo_state or cli_ref._approval_state or cli_ref._captcha_state:
return
# Guard: don't start while a previous stop/transcribe cycle is
# still running — recorder.stop() holds AudioRecorder._lock and
@ -6554,6 +6622,8 @@ class HermesCLI:
return "type secret (hidden), Enter to skip"
if cli_ref._approval_state:
return ""
if cli_ref._captcha_state:
return "type ready/done after you solve the challenge, or abort to cancel"
if cli_ref._clarify_freetext:
return "type your answer here and press Enter"
if cli_ref._clarify_state:
@ -6597,6 +6667,13 @@ class HermesCLI:
('class:clarify-countdown', f' ({remaining}s)'),
]
if cli_ref._captcha_state:
remaining = max(0, int(cli_ref._captcha_deadline - _time.monotonic()))
return [
('class:hint', " complete the challenge in the browser, then type 'ready'"),
('class:clarify-countdown', f' ({remaining}s)'),
]
if cli_ref._clarify_state:
remaining = max(0, int(cli_ref._clarify_deadline - _time.monotonic()))
countdown = f' ({remaining}s)' if cli_ref._clarify_deadline else ''
@ -6619,7 +6696,7 @@ class HermesCLI:
return []
def get_hint_height():
if cli_ref._sudo_state or cli_ref._secret_state or cli_ref._approval_state or cli_ref._clarify_state or cli_ref._command_running:
if cli_ref._sudo_state or cli_ref._secret_state or cli_ref._approval_state or cli_ref._captcha_state or cli_ref._clarify_state or cli_ref._command_running:
return 1
# Keep a 1-line spacer while agent runs so output doesn't push
# right up against the top rule of the input area
@ -6644,7 +6721,7 @@ class HermesCLI:
height=get_hint_height,
)
# --- Clarify tool: dynamic display widget for questions + choices ---
# --- Interactive panels: CAPTCHA + clarify ---
def _panel_box_width(title: str, content_lines: list[str], min_width: int = 46, max_width: int = 76) -> int:
"""Choose a stable panel width wide enough for the title and content."""
@ -6672,6 +6749,45 @@ class HermesCLI:
def _append_blank_panel_line(lines, border_style: str, box_width: int) -> None:
lines.append((border_style, "" + (" " * box_width) + "\n"))
def _get_captcha_display():
state = cli_ref._captcha_state
if not state:
return []
payload = state.get("payload") or {}
title = "Manual CAPTCHA Required"
browser_view_url = payload.get("browser_view_url") or "Browser view URL is not configured."
body_lines = [
payload.get("instructions") or "Open the live browser and complete the verification challenge.",
f"Type: {payload.get('captcha_type', 'unknown')}",
f"Task ID: {payload.get('task_id', '')}",
f"Browser: {browser_view_url}",
"When the challenge disappears, type 'ready' or 'done' and press Enter.",
"Type 'abort' to stop this browser task.",
]
box_width = _panel_box_width(title, body_lines, min_width=56, max_width=94)
inner_text_width = max(8, box_width - 2)
lines = []
lines.append(('class:captcha-border', '╭─ '))
lines.append(('class:captcha-title', title))
lines.append(('class:captcha-border', ' ' + ('' * max(0, box_width - len(title) - 3)) + '\n'))
_append_blank_panel_line(lines, 'class:captcha-border', box_width)
for text in body_lines:
style = 'class:captcha-link' if text.startswith("Browser: ") else 'class:captcha-text'
for wrapped in _wrap_panel_text(text, inner_text_width):
_append_panel_line(lines, 'class:captcha-border', style, wrapped, box_width)
_append_blank_panel_line(lines, 'class:captcha-border', box_width)
lines.append(('class:captcha-border', '' + ('' * box_width) + '\n'))
return lines
captcha_widget = ConditionalContainer(
Window(
FormattedTextControl(_get_captcha_display),
wrap_lines=True,
),
filter=Condition(lambda: cli_ref._captcha_state is not None),
)
def _get_clarify_display():
"""Build styled text for the clarify question/choices panel."""
state = cli_ref._clarify_state
@ -6897,6 +7013,7 @@ class HermesCLI:
sudo_widget=sudo_widget,
secret_widget=secret_widget,
approval_widget=approval_widget,
captcha_widget=captcha_widget,
clarify_widget=clarify_widget,
spinner_widget=spinner_widget,
spacer=spacer,
@ -6954,6 +7071,11 @@ class HermesCLI:
'approval-cmd': '#AAAAAA italic',
'approval-choice': '#AAAAAA',
'approval-selected': '#FFD700 bold',
# CAPTCHA panel
'captcha-border': '#CD7F32',
'captcha-title': '#FFBF00 bold',
'captcha-text': '#FFF8DC',
'captcha-link': '#87CEEB underline',
# Voice mode
'voice-prompt': '#87CEEB',
'voice-recording': '#FF4444 bold',

View file

@ -152,6 +152,7 @@ def _discover_tools():
"tools.memory_tool",
"tools.session_search_tool",
"tools.clarify_tool",
"tools.to_captcha_tool",
"tools.code_execution_tool",
"tools.delegate_tool",
"tools.process_registry",
@ -362,7 +363,7 @@ def get_tool_definitions(
# because they need agent-level state (TodoStore, MemoryStore, etc.).
# The registry still holds their schemas; dispatch just returns a stub error
# so if something slips through, the LLM sees a sensible message.
_AGENT_LOOP_TOOLS = {"todo", "memory", "session_search", "delegate_task"}
_AGENT_LOOP_TOOLS = {"todo", "memory", "session_search", "delegate_task", "to_captcha"}
_READ_SEARCH_TOOLS = {"read_file", "search_files"}

View file

@ -73,6 +73,7 @@ from hermes_constants import OPENROUTER_BASE_URL
from agent.prompt_builder import (
DEFAULT_AGENT_IDENTITY, PLATFORM_HINTS,
MEMORY_GUIDANCE, SESSION_SEARCH_GUIDANCE, SKILLS_GUIDANCE,
BROWSER_CAPTCHA_GUIDANCE,
)
from agent.model_metadata import (
fetch_model_metadata,
@ -400,6 +401,7 @@ class AIAgent:
thinking_callback: callable = None,
reasoning_callback: callable = None,
clarify_callback: callable = None,
captcha_callback: callable = None,
step_callback: callable = None,
stream_delta_callback: callable = None,
tool_gen_callback: callable = None,
@ -447,6 +449,7 @@ class AIAgent:
tool_progress_callback (callable): Callback function(tool_name, args_preview) for progress notifications
clarify_callback (callable): Callback function(question, choices) -> str for interactive user questions.
Provided by the platform layer (CLI or gateway). If None, the clarify tool returns an error.
captcha_callback (callable): Callback function(payload_dict) -> dict for manual CAPTCHA completion flows.
max_tokens (int): Maximum tokens for model responses (optional, uses model default if not set)
reasoning_config (Dict): OpenRouter reasoning configuration override (e.g. {"effort": "none"} to disable thinking).
If None, defaults to {"enabled": True, "effort": "medium"} for OpenRouter. Set to disable/customize reasoning.
@ -529,6 +532,7 @@ class AIAgent:
self.thinking_callback = thinking_callback
self.reasoning_callback = reasoning_callback
self.clarify_callback = clarify_callback
self.captcha_callback = captcha_callback
self.step_callback = step_callback
self.stream_delta_callback = stream_delta_callback
self.status_callback = status_callback
@ -2276,6 +2280,8 @@ class AIAgent:
tool_guidance.append(SESSION_SEARCH_GUIDANCE)
if "skill_manage" in self.valid_tool_names:
tool_guidance.append(SKILLS_GUIDANCE)
if "internet_browser" in self.valid_tool_names and "to_captcha" in self.valid_tool_names:
tool_guidance.append(BROWSER_CAPTCHA_GUIDANCE)
if tool_guidance:
prompt_parts.append(" ".join(tool_guidance))
@ -4693,6 +4699,18 @@ class AIAgent:
choices=function_args.get("choices"),
callback=self.clarify_callback,
)
elif function_name == "to_captcha":
from tools.to_captcha_tool import to_captcha_tool as _to_captcha_tool
return _to_captcha_tool(
task_id=function_args.get("task_id", ""),
browser_view_url=function_args.get("browser_view_url"),
captcha_type=function_args.get("captcha_type"),
instructions=function_args.get("instructions"),
detected_at=function_args.get("detected_at"),
verification=function_args.get("verification"),
resume_token=function_args.get("resume_token"),
callback=self.captcha_callback,
)
elif function_name == "delegate_task":
from tools.delegate_tool import delegate_task as _delegate_task
return _delegate_task(
@ -4711,6 +4729,53 @@ class AIAgent:
honcho_session_key=self._honcho_session_key,
)
def _maybe_resolve_captcha(self, function_name: str, function_result: str, effective_task_id: str) -> str:
"""Bridge paused browser tasks into the dedicated CAPTCHA orchestration flow."""
if function_name != "internet_browser":
return function_result
try:
payload = json.loads(function_result)
except (json.JSONDecodeError, TypeError):
return function_result
if not isinstance(payload, dict):
return function_result
if payload.get("status") != "awaiting_user_captcha":
return function_result
human = payload.get("human_intervention") or {}
captcha_args = {
"task_id": payload.get("task_id") or human.get("task_id") or effective_task_id,
"browser_view_url": human.get("browser_view_url"),
"captcha_type": human.get("captcha_type"),
"instructions": human.get("instructions"),
"detected_at": human.get("detected_at"),
"verification": human.get("verification"),
"resume_token": human.get("resume_token"),
}
captcha_result = self._invoke_tool("to_captcha", captcha_args, effective_task_id)
try:
captcha_payload = json.loads(captcha_result)
except (json.JSONDecodeError, TypeError):
return captcha_result
if not isinstance(captcha_payload, dict):
return captcha_result
if captcha_payload.get("status") == "resumed":
task_result = captcha_payload.get("task_result")
if isinstance(task_result, dict):
return json.dumps(task_result, ensure_ascii=False)
return json.dumps(
{
"success": False,
"status": captcha_payload.get("status", "still_blocked"),
"task_id": captcha_args["task_id"],
"human_intervention": human,
"captcha_flow": captcha_payload,
},
ensure_ascii=False,
)
def _execute_tool_calls_concurrent(self, assistant_message, messages: list, effective_task_id: str, api_call_count: int = 0) -> None:
"""Execute multiple tool calls concurrently using a thread pool.
@ -4843,6 +4908,8 @@ class AIAgent:
tool_duration = 0.0
else:
function_name, function_args, function_result, tool_duration, is_error = r
function_result = self._maybe_resolve_captcha(function_name, function_result, effective_task_id)
is_error, _ = _detect_tool_failure(function_name, function_result)
if is_error:
result_preview = function_result[:200] if len(function_result) > 200 else function_result
@ -5029,6 +5096,21 @@ class AIAgent:
tool_duration = time.time() - tool_start_time
if self.quiet_mode:
self._vprint(f" {_get_cute_tool_message_impl('clarify', function_args, tool_duration, result=function_result)}")
elif function_name == "to_captcha":
from tools.to_captcha_tool import to_captcha_tool as _to_captcha_tool
function_result = _to_captcha_tool(
task_id=function_args.get("task_id", ""),
browser_view_url=function_args.get("browser_view_url"),
captcha_type=function_args.get("captcha_type"),
instructions=function_args.get("instructions"),
detected_at=function_args.get("detected_at"),
verification=function_args.get("verification"),
resume_token=function_args.get("resume_token"),
callback=self.captcha_callback,
)
tool_duration = time.time() - tool_start_time
if self.quiet_mode:
self._vprint(f" {_get_cute_tool_message_impl('to_captcha', function_args, tool_duration, result=function_result)}")
elif function_name == "delegate_task":
from tools.delegate_tool import delegate_task as _delegate_task
tasks_arg = function_args.get("tasks")
@ -5099,6 +5181,7 @@ class AIAgent:
logger.error("handle_function_call raised for %s: %s", function_name, tool_error, exc_info=True)
tool_duration = time.time() - tool_start_time
function_result = self._maybe_resolve_captcha(function_name, function_result, effective_task_id)
result_preview = function_result if self.verbose_logging else (
function_result[:200] if len(function_result) > 200 else function_result
)

View file

@ -1,41 +1,109 @@
import json
import os
import time
from urllib import error, request
from tools.registry import registry
def run_browser_task(task):
def _browser_api_base_url() -> str:
return os.getenv("BROWSER_API_URL", "http://browser-api:8088/api/browser").rstrip("/")
def _http_json(url: str, method: str = "GET", payload: dict | None = None, timeout_sec: int = 30) -> dict:
body = None
headers = {"Content-Type": "application/json"}
if payload is not None:
body = json.dumps(payload).encode("utf-8")
req = request.Request(url, data=body, headers=headers, method=method)
try:
with request.urlopen(req, timeout=timeout_sec) as resp:
raw = resp.read().decode("utf-8")
return json.loads(raw) if raw else {}
except error.HTTPError as http_err:
raw = http_err.read().decode("utf-8", errors="replace")
try:
data = json.loads(raw) if raw else {}
except json.JSONDecodeError:
data = {"details": raw}
return {
"success": False,
"error": f"Browser API returned HTTP {http_err.code}",
"details": data,
}
except Exception as err:
return {"success": False, "error": f"Browser API request failed: {err}"}
def run_browser_task(task: str):
if not task or not str(task).strip():
return json.dumps({"success": False, "error": "Task is required"}, ensure_ascii=False)
rpc_url = os.getenv("BROWSER_USE_RPC_URL", "http://browser:8787/run")
timeout_sec = int(os.getenv("BROWSER_USE_RPC_TIMEOUT", "900"))
payload = json.dumps({"task": task}).encode("utf-8")
req = request.Request(rpc_url, data=payload, headers={"Content-Type": "application/json"}, method="POST")
poll_interval = float(os.getenv("BROWSER_API_POLL_INTERVAL", "1.5"))
api_base = _browser_api_base_url()
try:
with request.urlopen(req, timeout=timeout_sec) as resp:
body = resp.read().decode("utf-8")
return body
except error.HTTPError as http_err:
body = http_err.read().decode("utf-8", errors="replace")
accepted = _http_json(
f"{api_base}/tasks",
method="POST",
payload={"task": task, "timeout": timeout_sec, "metadata": {"source": "internet_browser"}},
timeout_sec=30,
)
task_id = accepted.get("task_id")
if not task_id:
return json.dumps(
{
"success": False,
"error": f"browser-use RPC returned HTTP {http_err.code}",
"details": body,
},
ensure_ascii=False,
)
except Exception as err:
return json.dumps(
{
"success": False,
"error": f"browser-use RPC request failed: {err}",
"error": accepted.get("error", "Browser task was not accepted"),
"details": accepted,
},
ensure_ascii=False,
)
deadline = time.time() + timeout_sec + 10
status_url = f"{api_base}/tasks/{task_id}"
result_url = f"{api_base}/tasks/{task_id}/result"
while time.time() < deadline:
status_payload = _http_json(status_url, timeout_sec=15)
status = status_payload.get("status")
if not status and status_payload.get("error"):
return json.dumps(
{
"success": False,
"status": "failed",
"task_id": task_id,
"error": status_payload.get("error"),
"details": status_payload.get("details"),
},
ensure_ascii=False,
)
if status == "awaiting_user_captcha":
return json.dumps(
{
"success": False,
"status": status,
"task_id": task_id,
"human_intervention": status_payload.get("human_intervention"),
},
ensure_ascii=False,
)
if status in {"succeeded", "failed"}:
result_payload = _http_json(result_url, timeout_sec=30)
result_payload.setdefault("task_id", task_id)
return json.dumps(result_payload, ensure_ascii=False)
time.sleep(poll_interval)
return json.dumps(
{
"success": False,
"status": "failed",
"task_id": task_id,
"error": "Timed out while waiting for browser task result",
},
ensure_ascii=False,
)
registry.register(
name="internet_browser",
@ -43,23 +111,20 @@ registry.register(
schema={
"name": "internet_browser",
"description": (
"ГЛАВНЫЙ ИНСТРУМЕНТ ДЛЯ ВЕБ-СЕРФИНГА. Вызывай этот инструмент НАПРЯМУЮ (через стандартный tool call/function call). "
"КАТЕГОРИЧЕСКИ ЗАПРЕЩЕНО использовать `execute_code` или `delegate_task` для работы с браузером. "
"Не пиши Python-скрипты! Просто передай в этот инструмент параметр `task`. "
"Используй для любых задач в интернете: поиск товаров (Wildberries, Ozon), чтение статей, клики, навигация."
"Main browser automation tool for internet tasks. Call it directly via a normal tool/function call. "
"Do not use execute_code or delegate_task for browser work. Pass the task in natural language."
),
"parameters": {
"type": "object",
"properties": {
"task": {
"type": "string",
"description": "Подробная задача на естественном языке. Например: 'Зайди на wildberries.ru, найди черную футболку и верни цену'."
"description": "Detailed natural-language browser task."
}
},
"required": ["task"]
}
},
handler=lambda args, **kw: run_browser_task(args.get("task")),
emoji="🌐",
)