Enhance TerminalBench 2 configuration and evaluation handling

- Added a task_timeout parameter that enforces a maximum wall-clock time per task; any task that exceeds it is automatically scored as FAIL.
- Introduced terminal_timeout and tool_pool_size parameters to improve command execution and concurrency management.
- Updated logging to report per-task execution times and timeout handling, making runs easier to monitor.
- Removed the outdated evaluate_config.yaml file to streamline configuration management.
2026-02-10 22:53:24 +00:00 · 2026-02-10 22:53:24 +00:00 · ba3fea24f1
commit ba3fea24f1
parent 6b4a8d0b17
3 changed files with 130 additions and 74 deletions
--- a/environments/benchmarks/terminalbench_2/default.yaml
+++ b/environments/benchmarks/terminalbench_2/default.yaml
@ -19,8 +19,11 @@ env:
  max_token_length: 32000
  agent_temperature: 0.8
  terminal_backend: "modal"
  terminal_timeout: 300        # 5 min per command (builds, pip install)
  tool_pool_size: 128          # thread pool for 89 parallel tasks
  dataset_name: "NousResearch/terminal-bench-2"
  test_timeout: 600
  task_timeout: 1800           # 30 min wall-clock per task, auto-FAIL if exceeded
  tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
  use_wandb: true
  wandb_name: "terminal-bench-2"
--- a/environments/benchmarks/terminalbench_2/terminalbench2_env.py
+++ b/environments/benchmarks/terminalbench_2/terminalbench2_env.py
@ -111,6 +111,12 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig):
        description="Comma-separated task names to skip (e.g., 'heavy-task,slow-task').",
    )
    # --- Per-task wall-clock timeout ---
    task_timeout: int = Field(
        default=1800,
        description="Maximum wall-clock seconds per task (agent loop + verification). "
        "Tasks exceeding this are scored as FAIL. Default 30 minutes.",
    )
 # =============================================================================
@ -190,10 +196,14 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
            # Modal backend for per-task cloud-isolated sandboxes
            terminal_backend="modal",
            terminal_timeout=300,   # 5 min per command (builds, pip install, etc.)
            # Test execution timeout (TB2 test scripts can install deps like pytest)
            test_timeout=180,
            # 89 tasks run in parallel, each needs a thread for tool calls
            tool_pool_size=128,
            # --- Eval-only Atropos settings ---
            # These settings make the env work as an eval-only environment:
            #   - STOP_TRAIN: pauses training during eval (standard for eval envs)
@ -231,6 +241,14 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
        """Load the Terminal-Bench 2.0 dataset from HuggingFace."""
        from datasets import load_dataset
        # Auto-set terminal_lifetime to task_timeout + 120s so sandboxes
        # never get killed during an active task, but still get cleaned up
        # promptly after the task times out.
        lifetime = self.config.task_timeout + 120
        self.config.terminal_lifetime = lifetime
        os.environ["TERMINAL_LIFETIME_SECONDS"] = str(lifetime)
        print(f"  Terminal lifetime auto-set to {lifetime}s (task_timeout + 120s)")
        print(f"Loading TB2 dataset from: {self.config.dataset_name}")
        ds = load_dataset(self.config.dataset_name, split="train")
@ -366,6 +384,10 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
        task_id = str(uuid.uuid4())
        task_dir = None  # Set if we extract a Dockerfile (needs cleanup)
        from tqdm import tqdm
        tqdm.write(f"  [START] {task_name} (task_id={task_id[:8]})")
        task_start = time.time()
        try:
            # --- 1. Resolve Docker image ---
            modal_image, task_dir = self._resolve_task_image(eval_item, task_name)
@ -416,9 +438,16 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
                )
                reward = 0.0
            else:
                # Run tests in a thread so the blocking ctx.terminal() calls
                # don't freeze the entire event loop (which would stall all
                # other tasks, tqdm updates, and timeout timers).
                ctx = ToolContext(task_id)
                try:
-                    reward = self._run_tests(eval_item, ctx, task_name)
+                    loop = asyncio.get_event_loop()
                    reward = await loop.run_in_executor(
                        None,  # default thread pool
                        self._run_tests, eval_item, ctx, task_name,
                    )
                except Exception as e:
                    logger.error("Task %s: test verification failed: %s", task_name, e)
                    reward = 0.0
@ -427,7 +456,8 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
            passed = reward == 1.0
            status = "PASS" if passed else "FAIL"
-            print(f"  [{status}] {task_name} (turns={result.turns_used})")
+            elapsed = time.time() - task_start
            tqdm.write(f"  [{status}] {task_name} (turns={result.turns_used}, {elapsed:.0f}s)")
            logger.info(
                "Task %s: reward=%.1f, turns=%d, finished=%s",
                task_name, reward, result.turns_used, result.finished_naturally,
@ -443,8 +473,9 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
            }
        except Exception as e:
            elapsed = time.time() - task_start
            logger.error("Task %s: rollout failed: %s", task_name, e, exc_info=True)
-            print(f"  [ERROR] {task_name}: {e}")
+            tqdm.write(f"  [ERROR] {task_name}: {e} ({elapsed:.0f}s)")
            return {
                "passed": False, "reward": 0.0,
                "task_name": task_name, "category": category,
@ -586,6 +617,31 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
    # Evaluate -- main entry point for the eval subcommand
    # =========================================================================
    async def _eval_with_timeout(self, item: Dict[str, Any]) -> Dict:
        """
        Wrap rollout_and_score_eval with a per-task wall-clock timeout.
        If the task exceeds task_timeout seconds, it's automatically scored
        as FAIL. This prevents any single task from hanging indefinitely.
        """
        task_name = item.get("task_name", "unknown")
        category = item.get("category", "unknown")
        try:
            return await asyncio.wait_for(
                self.rollout_and_score_eval(item),
                timeout=self.config.task_timeout,
            )
        except asyncio.TimeoutError:
            from tqdm import tqdm
            elapsed = self.config.task_timeout
            tqdm.write(f"  [TIMEOUT] {task_name} (exceeded {elapsed}s wall-clock limit)")
            logger.error("Task %s: wall-clock timeout after %ds", task_name, elapsed)
            return {
                "passed": False, "reward": 0.0,
                "task_name": task_name, "category": category,
                "error": f"timeout ({elapsed}s)",
            }
    async def evaluate(self, *args, **kwargs) -> None:
        """
        Run Terminal-Bench 2.0 evaluation over all tasks.
@ -594,27 +650,88 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
            python environments/terminalbench2_env.py evaluate
        Runs all tasks through rollout_and_score_eval() via asyncio.gather()
-        (same pattern as GPQA and other Atropos eval envs). Aggregates
+        (same pattern as GPQA and other Atropos eval envs). Each task is
-        per-task, per-category, and overall pass rates, then logs to wandb
+        wrapped with a wall-clock timeout so hung tasks auto-fail.
-        and evaluate_log().
+
        Suppresses noisy Modal/terminal output (HERMES_QUIET) so the tqdm
        bar stays visible.
        """
        start_time = time.time()
        # Route all logging through tqdm.write() so the progress bar stays
        # pinned at the bottom while log lines scroll above it.
        from tqdm import tqdm
        class _TqdmHandler(logging.Handler):
            def emit(self, record):
                try:
                    tqdm.write(self.format(record))
                except Exception:
                    self.handleError(record)
        handler = _TqdmHandler()
        handler.setFormatter(logging.Formatter(
            "%(asctime)s [%(name)s] %(levelname)s: %(message)s",
            datefmt="%H:%M:%S",
        ))
        root = logging.getLogger()
        root.handlers = [handler]  # Replace any existing handlers
        root.setLevel(logging.INFO)
        # Silence noisy third-party loggers that flood the output
        logging.getLogger("httpx").setLevel(logging.WARNING)      # Every HTTP request
        logging.getLogger("openai").setLevel(logging.WARNING)     # OpenAI client retries
        logging.getLogger("rex-deploy").setLevel(logging.WARNING) # Swerex deployment
        logging.getLogger("rex_image_builder").setLevel(logging.WARNING)  # Image builds
        print(f"\n{'='*60}")
        print("Starting Terminal-Bench 2.0 Evaluation")
        print(f"{'='*60}")
        print(f"  Dataset: {self.config.dataset_name}")
        print(f"  Total tasks: {len(self.all_eval_items)}")
        print(f"  Max agent turns: {self.config.max_agent_turns}")
        print(f"  Task timeout: {self.config.task_timeout}s")
        print(f"  Terminal backend: {self.config.terminal_backend}")
        print(f"  Tool thread pool: {self.config.tool_pool_size}")
        print(f"  Terminal timeout: {self.config.terminal_timeout}s/cmd")
        print(f"  Terminal lifetime: {self.config.terminal_lifetime}s (auto: task_timeout + 120)")
        print(f"{'='*60}\n")
-        # Fire all tasks -- Atropos / Modal handle scheduling
+        # Fire all tasks with wall-clock timeout, track live accuracy on the bar
-        from tqdm.asyncio import tqdm_asyncio
+        total_tasks = len(self.all_eval_items)
        eval_tasks = [
-            self.rollout_and_score_eval(item) for item in self.all_eval_items
+            asyncio.ensure_future(self._eval_with_timeout(item))
            for item in self.all_eval_items
        ]
-        results = await tqdm_asyncio.gather(*eval_tasks, desc="Evaluating TB2")
+
        results = []
        passed_count = 0
        pbar = tqdm(total=total_tasks, desc="Evaluating TB2", dynamic_ncols=True)
        try:
            for coro in asyncio.as_completed(eval_tasks):
                result = await coro
                results.append(result)
                if result and result.get("passed"):
                    passed_count += 1
                done = len(results)
                pct = (passed_count / done * 100) if done else 0
                pbar.set_postfix_str(f"pass={passed_count}/{done} ({pct:.1f}%)")
                pbar.update(1)
        except (KeyboardInterrupt, asyncio.CancelledError):
            pbar.close()
            print(f"\n\nInterrupted! Cleaning up {len(eval_tasks)} tasks...")
            # Cancel all pending tasks
            for task in eval_tasks:
                task.cancel()
            # Let cancellations propagate (finally blocks run cleanup_vm)
            await asyncio.gather(*eval_tasks, return_exceptions=True)
            # Belt-and-suspenders: clean up any remaining sandboxes
            from tools.terminal_tool import cleanup_all_environments
            cleanup_all_environments()
            print("All sandboxes cleaned up.")
            return
        finally:
            pbar.close()
        end_time = time.time()
--- a/evals/terminal-bench-2/evaluate_config.yaml
+++ b/evals/terminal-bench-2/evaluate_config.yaml
@ -1,64 +0,0 @@
 env:
  group_size: 1
  max_num_workers: -1
  max_eval_workers: 16
  max_num_workers_per_node: 8
  steps_per_eval: 1
  max_token_length: 32000
  eval_handling: STOP_TRAIN
  eval_limit_ratio: 0.5
  inference_weight: 1.0
  batch_size: -1
  max_batches_offpolicy: 3
  tokenizer_name: NousResearch/Hermes-3-Llama-3.1-8B
  use_wandb: false
  rollout_server_url: http://localhost:8000
  total_steps: 1
  wandb_name: terminal-bench-2
  num_rollouts_to_keep: 32
  num_rollouts_per_group_for_logging: 1
  ensure_scores_are_not_same: false
  data_path_to_save_groups: null
  data_dir_to_save_evals: evals/terminal-bench-2
  min_items_sent_before_logging: 2
  include_messages: false
  min_batch_allocation: null
  worker_timeout: 600.0
  thinking_mode: false
  reasoning_effort: null
  max_reasoning_tokens: null
  custom_thinking_prompt: null
  enabled_toolsets:
  - terminal
  - file
  disabled_toolsets: null
  distribution: null
  max_agent_turns: 60
  system_prompt: 'You are a skilled software engineer and system administrator with
    access to a terminal and file tools. You are working inside a Linux container
    environment. Complete the user''s task by using the available tools. Be methodical:
    explore the environment first, plan your approach, then execute step by step.
    Verify your work before finishing.'
  agent_temperature: 1.0
  terminal_backend: modal
  dataset_name: NousResearch/terminal-bench-2
  dataset_split: train
  prompt_field: prompt
  tool_call_parser: hermes
  test_timeout: 180
  force_build: false
  task_filter: fix-git
  skip_tasks: null
 openai:
 - timeout: 1200
  num_max_requests_at_once: 512
  num_requests_for_eval: 64
  model_name: anthropic/claude-sonnet-4
  rolling_buffer_length: 1000
  server_type: openai
  api_key: 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
  base_url: https://openrouter.ai/api/v1
  n_kwarg_is_ignored: false
  health_check: false
 slurm: false
 testing: false