This commit is contained in:
Aleksandr Dubchak 2026-04-23 00:04:11 +03:00
parent 2b5d923f63
commit 98d5e90894
754 changed files with 1175740 additions and 142424 deletions

32
stuff/RunTrace.py Normal file
View file

@ -0,0 +1,32 @@
from dataclasses import dataclass, field
from typing import List, Optional
from StepTrace import StepTrace
@dataclass
class RunTrace:
    """Aggregated trace of one agent run on a single benchmark task.

    Filled in by run_task(): run-level metadata, the final outcome, and
    the per-action StepTrace records collected during execution.
    """

    run_id: str  # unique id for this run (uuid4, assigned in run_task)
    task_id: str  # id of the task being run (Task.id)
    dataset: str  # source dataset name, e.g. "mind2web"
    instruction: str  # natural-language goal given to the agent
    model_name: Optional[str]  # model used, when known
    started_at: float  # unix timestamp when the run started
    finished_at: Optional[float]  # unix timestamp when the run ended; None while running
    success: bool  # overall run outcome
    final_url: Optional[str]  # last known URL / browser view link
    final_answer: Optional[str]  # agent's final textual answer, if any
    error: Optional[str]  # exception text on runtime failure
    fail_reason: Optional[str]  # diagnostic tag, e.g. "not_completed", "tool_failed"
    total_steps: int  # number of recorded steps
    total_tokens: Optional[int]  # token usage, when the agent reports it
    total_latency_sec: Optional[float]  # wall-clock duration of the run
    screenshots_dir: Optional[str]  # directory with step screenshots, if any
    steps: List[StepTrace] = field(default_factory=list)  # per-step traces

23
stuff/StepTrace.py Normal file
View file

@ -0,0 +1,23 @@
from dataclasses import dataclass
from typing import Optional
@dataclass
class StepTrace:
    """Trace of a single agent action (step) within one run."""

    step_no: int  # 0-based index of the step within the run
    timestamp: float  # unix timestamp the step was recorded
    thought: Optional[str]  # model reasoning for this step, if captured
    action_type: str  # action kind, e.g. "TOOL_CALL" (see run_agent_task)
    action_target: Optional[str]  # what the action applied to (e.g. tool_call_id)
    action_value: Optional[str]  # value supplied with the action, if any
    url_before: Optional[str]  # page URL before the action, when known
    url_after: Optional[str]  # page URL after the action, when known
    screenshot_before: Optional[str]  # path/ref to the pre-action screenshot
    screenshot_after: Optional[str]  # path/ref to the post-action screenshot
    success: bool  # whether this step completed without error
    error: Optional[str] = None  # error text when success is False

View file

@ -0,0 +1,47 @@
import os
import sys
import asyncio
from dotenv import load_dotenv

# Resolve project/skills paths relative to this file so the script works
# from any working directory. NOTE(review): os.path.join(..., "") is a
# no-op here — PROJECT_ROOT is just the directory of this file.
PROJECT_ROOT = os.path.abspath(
    os.path.join(os.path.dirname(__file__), "")
)
SKILLS_DIR = os.path.join(PROJECT_ROOT, "../BrowserUse_and_ComputerUse_skills")
HERMES_CODE_DIR = os.path.join(SKILLS_DIR, "hermes_code")
ENV_PATH = os.path.join(SKILLS_DIR, ".env")

# Make the skills packages importable without installing them.
for path in [PROJECT_ROOT, SKILLS_DIR, HERMES_CODE_DIR]:
    if path not in sys.path:
        sys.path.append(path)

# Load .env, then force-override model/endpoint settings for this run.
load_dotenv(ENV_PATH, override=True)
os.environ["MODEL"] = "qwen3.5-122b"
os.environ["MODEL_DEFAULT"] = "qwen3.5-122b"
os.environ["BASE_URL"] = "https://llm.lambda.coredump.ru/v1"
os.environ["OPENAI_BASE_URL"] = "https://llm.lambda.coredump.ru/v1"
# SECURITY(review): a live-looking API key is hardcoded and committed to
# source. Move it into the .env file and rotate this key.
os.environ["API_KEY"] = "sk-4rzg5cB88S4MCSgNOLAzIw"
os.environ["OPENAI_API_KEY"] = "sk-4rzg5cB88S4MCSgNOLAzIw"
os.environ["PROVIDER"] = "custom"
os.environ["BROWSER_URL"] = "http://localhost:9222"
os.environ["BROWSER_VIEW_URL"] = "http://localhost:6080"

# Echo the effective config so misconfiguration is visible immediately.
print("MODEL:", os.getenv("MODEL"), flush=True)
print("BROWSER_URL:", os.getenv("BROWSER_URL"), flush=True)
from hermes_code.tools.browser_use_tool import run_browser_task
async def main():
    """Smoke-test: run one browser task with a 60-second hard timeout."""
    print("DEBUG: before run_browser_task", flush=True)
    pending = run_browser_task("Открой nba.com")
    result = await asyncio.wait_for(pending, timeout=60)
    print("DEBUG: after run_browser_task", flush=True)
    print(result, flush=True)


if __name__ == "__main__":
    asyncio.run(main())

14
stuff/eval_t.py Normal file
View file

@ -0,0 +1,14 @@
import mind2web_runner

# Log which module file got imported so stale-bytecode / wrong-path issues
# are visible. (The original referenced an undefined name `eval_agent`,
# which raised NameError before anything ran.)
print("IMPORTED mind2web_runner =", mind2web_runner.__file__)
from mind2web_runner import run_agent_task


def main():
    """Run a single smoke-test instruction through the agent and print the result."""
    result = run_agent_task("открой nba.com")
    print("==== RESULT ====")
    print(result)


if __name__ == "__main__":
    main()

31
stuff/extracting.py Normal file
View file

@ -0,0 +1,31 @@
import json

INPUT_PATH = "../Mind2Web/test_task/test_task_2.json"
OUTPUT_PATH = "../Mind2Web/test_1_task_2.json"

# encoding="utf-8" on both opens: the data contains non-ASCII text and the
# dump below uses ensure_ascii=False, so relying on the platform default
# encoding can raise UnicodeEncodeError (e.g. on Windows cp1251/cp1252).
with open(INPUT_PATH, "r", encoding="utf-8") as f:
    data = json.load(f)

# --- guard against malformed input ---
if isinstance(data, dict):
    print("⚠️ Файл уже обрезан (dict вместо list)")
    task = data
elif isinstance(data, list):
    print(f"✅ Найден список задач: {len(data)}")
    task = data[0]
else:
    raise ValueError("❌ Неизвестный формат JSON")

# --- structure sanity check ---
print("Ключи задачи:", task.keys())
if "confirmed_task" not in task:
    print("❌ ВНИМАНИЕ: нет confirmed_task — файл уже обрезан или не тот")
else:
    print("✅ Цель:", task["confirmed_task"])

# --- save the first task only ---
with open(OUTPUT_PATH, "w", encoding="utf-8") as f:
    json.dump(task, f, indent=2, ensure_ascii=False)

print(f"💾 Сохранено в {OUTPUT_PATH}")

42
stuff/loaders.py Normal file
View file

@ -0,0 +1,42 @@
import json
from typing import List
from one_Task_class import Task
def load_mind2web_tasks(path: str, limit: int | None = None) -> List[Task]:
    """Read a Mind2Web annotation file and convert its entries into Task objects.

    Entries that are not dicts, or that lack an annotation id or confirmed
    task text, are skipped. When *limit* is given, loading stops as soon as
    that many tasks have been collected.
    """
    with open(path, "r", encoding="utf-8") as f:
        data = json.load(f)

    if not isinstance(data, list):
        raise ValueError(f"Expected list of tasks in {path}, got {type(data).__name__}")

    loaded: List[Task] = []
    for record in data:
        if not isinstance(record, dict):
            continue

        ann_id = record.get("annotation_id")
        goal = record.get("confirmed_task")
        # Skip broken records missing the essentials.
        if not ann_id or not goal:
            continue

        loaded.append(
            Task(
                id=ann_id,
                dataset="mind2web",
                website=record.get("website"),
                instruction=goal,
                start_url=None,
                expected=None,
                raw=record,
            )
        )
        if limit is not None and len(loaded) >= limit:
            break

    return loaded

22
stuff/main.py Normal file
View file

@ -0,0 +1,22 @@
from loaders import load_mind2web_tasks
from runner import run_many_tasks
# твой агент (как у тебя уже есть)
from agent import agent
def main():
    """Load a small batch of Mind2Web tasks, run the agent, print a summary."""
    dataset_path = "../Mind2Web/test_task/test_task_2.json"
    tasks = load_mind2web_tasks(dataset_path, limit=2)

    results = run_many_tasks(tasks, agent)

    print("\n=== FINAL RESULTS ===")
    print(f"Total: {len(results)}")
    print(f"Success: {sum(r.success for r in results)}")


if __name__ == "__main__":
    main()

13
stuff/one_Task_class.py Normal file
View file

@ -0,0 +1,13 @@
from dataclasses import dataclass
from typing import Optional, Any
@dataclass
class Task:
    """A single benchmark task normalized from a dataset record."""

    id: str  # dataset annotation id
    dataset: str  # source dataset name, e.g. "mind2web"
    website: Optional[str]  # target website, when the dataset provides one
    instruction: str  # natural-language goal (Mind2Web "confirmed_task")
    start_url: Optional[str]  # starting URL, if known
    expected: Optional[Any]  # expected answer/result, if the dataset defines one
    raw: dict  # original unmodified dataset record

99
stuff/run_agent_task.py Normal file
View file

@ -0,0 +1,99 @@
import json
import time
from typing import Any, Dict, Optional
async def run_agent_task(
    agent,
    instruction: str,
    session_id: str,
    history: Optional[list] = None,
) -> Dict[str, Any]:
    """Run one instruction through *agent* and normalize its raw result dict.

    Returns a flat dict (success, final_url, final_answer, fail_reason,
    steps, token counts, timestamps, raw_result) consumed by run_task.

    NOTE(review): declared ``async`` but contains no ``await`` —
    agent.run_conversation is called synchronously; confirm whether it
    was meant to be awaited.
    """
    history = history or []
    started_at = time.time()
    result = agent.run_conversation(
        instruction,
        conversation_history=history,
        task_id=session_id,
    )
    finished_at = time.time()
    # Normalized shape; starts pessimistic (success=False) and is refined below.
    normalized: Dict[str, Any] = {
        "success": False,
        "final_url": None,
        "final_answer": result.get("final_response"),
        "fail_reason": None,
        "error": None,
        "total_tokens": result.get("total_tokens"),
        "input_tokens": result.get("input_tokens"),
        "output_tokens": result.get("output_tokens"),
        "model_name": result.get("model"),
        "screenshots_dir": None,
        "steps": [],
        "started_at": started_at,
        "finished_at": finished_at,
        "raw_result": result,
    }
    # Top-level diagnostics ("interrupted" takes precedence over "not_completed").
    if not result.get("completed", False):
        normalized["fail_reason"] = "not_completed"
    if result.get("interrupted", False):
        normalized["fail_reason"] = "interrupted"
    # Scan the conversation for tool output.
    messages = result.get("messages", [])
    for msg in messages:
        if msg.get("role") != "tool":
            continue
        content = msg.get("content")
        if not content:
            continue
        try:
            tool_payload = json.loads(content)
        except Exception:
            # Non-JSON tool content is ignored.
            continue
        # Success of the browser step itself.
        # NOTE(review): every tool message overwrites success/final_url, so
        # the LAST tool message wins — confirm this is intended.
        normalized["success"] = bool(tool_payload.get("success", False))
        # Public link to the browser view.
        normalized["final_url"] = tool_payload.get("browser_view")
        # If the tool itself returned a textual result, keep it as the answer.
        if not normalized["final_answer"]:
            normalized["final_answer"] = tool_payload.get("result")
        if not normalized["success"] and normalized["fail_reason"] is None:
            normalized["fail_reason"] = "tool_failed"
        # Record at least one step per tool call.
        normalized["steps"].append(
            {
                "timestamp": finished_at,
                "thought": None,
                "action_type": "TOOL_CALL",
                "action_target": msg.get("tool_call_id"),
                "action_value": None,
                "url_before": None,
                "url_after": tool_payload.get("browser_view"),
                "screenshot_before": None,
                "screenshot_after": None,
                "success": bool(tool_payload.get("success", False)),
                "error": None if tool_payload.get("success", False) else tool_payload.get("result"),
            }
        )
    # No tool output despite completed=True is its own failure class.
    if not normalized["steps"] and result.get("completed", False):
        normalized["fail_reason"] = normalized["fail_reason"] or "no_tool_output"
    # Top-level fallback when nothing else explained the failure.
    if not normalized["success"] and normalized["error"] is None and normalized["fail_reason"] is None:
        normalized["fail_reason"] = "unknown_failure"
    return normalized

78
stuff/run_task.py Normal file
View file

@ -0,0 +1,78 @@
import time
import uuid
from one_Task_class import Task
from RunTrace import RunTrace
from StepTrace import StepTrace
from run_agent_task import run_agent_task
async def run_task(task: Task, agent, model_name: str | None = None) -> RunTrace:
    """Execute one Task via run_agent_task and package the outcome as a RunTrace.

    Never raises: any exception from the agent is captured into
    trace.error with fail_reason="runtime_exception". Timing fields are
    always filled in via the finally block.
    """
    started_at = time.time()
    # Pre-populate the trace so it is valid even if the agent call fails.
    trace = RunTrace(
        run_id=str(uuid.uuid4()),
        task_id=task.id,
        dataset=task.dataset,
        instruction=task.instruction,
        model_name=model_name,
        started_at=started_at,
        finished_at=None,
        success=False,
        final_url=None,
        final_answer=None,
        error=None,
        fail_reason=None,
        total_steps=0,
        total_tokens=None,
        total_latency_sec=None,
        screenshots_dir=None,
        steps=[],
    )
    try:
        result = await run_agent_task(
            agent=agent,
            instruction=task.instruction,
            session_id=f"eval-{task.dataset}-{task.id}",
            history=[],
        )
        # Copy the normalized result fields onto the trace.
        trace.success = bool(result.get("success", False))
        trace.final_url = result.get("final_url")
        trace.final_answer = result.get("final_answer")
        trace.error = result.get("error")
        trace.fail_reason = result.get("fail_reason")
        trace.total_tokens = result.get("total_tokens")
        trace.screenshots_dir = result.get("screenshots_dir")
        raw_steps = result.get("steps", [])
        trace.total_steps = len(raw_steps)
        # Convert the plain step dicts into typed StepTrace records.
        for i, step in enumerate(raw_steps):
            trace.steps.append(
                StepTrace(
                    step_no=i,
                    timestamp=step.get("timestamp", time.time()),
                    thought=step.get("thought"),
                    action_type=step.get("action_type", "unknown"),
                    action_target=step.get("action_target"),
                    action_value=step.get("action_value"),
                    url_before=step.get("url_before"),
                    url_after=step.get("url_after"),
                    screenshot_before=step.get("screenshot_before"),
                    screenshot_after=step.get("screenshot_after"),
                    success=step.get("success", True),
                    error=step.get("error"),
                )
            )
    except Exception as e:
        # Keep the run alive on agent failure; record the cause instead.
        trace.error = str(e)
        trace.fail_reason = "runtime_exception"
    finally:
        # Always stamp timing, whether the run succeeded or not.
        finished_at = time.time()
        trace.finished_at = finished_at
        trace.total_latency_sec = finished_at - started_at
    return trace

23
stuff/runner.py Normal file
View file

@ -0,0 +1,23 @@
from typing import List
from one_Task_class import Task
from RunTrace import RunTrace
from run_task import run_task
def run_many_tasks(tasks: List[Task], agent) -> List[RunTrace]:
    """Run *agent* over each task sequentially and collect the RunTrace results.

    ``run_task`` is an async coroutine; the original called it without
    awaiting, so every "trace" was a coroutine object and the subsequent
    ``trace.success`` access raised AttributeError. Each task is now driven
    to completion with ``asyncio.run`` while keeping this function's
    synchronous interface for callers such as main.py.
    """
    import asyncio  # local import so this fix is self-contained

    results: List[RunTrace] = []
    for i, task in enumerate(tasks):
        print(f"\n=== Running task {i + 1}/{len(tasks)} ===")
        print(f"Instruction: {task.instruction}\n")
        # Drive the coroutine to completion in a fresh event loop per task.
        trace = asyncio.run(run_task(task, agent))
        results.append(trace)
        print("Success:", trace.success)
        if trace.error:
            print("Error:", trace.error)
    return results