From 1b7bc299f373771706698b813f38c2043bf6bcd7 Mon Sep 17 00:00:00 2001
From: teknium
Date: Thu, 12 Feb 2026 05:36:45 +0000
Subject: [PATCH] Enhance TerminalBench2 environment: filter Modal-incompatible tasks and improve logging

- Updated task filter descriptions for clarity and added a skip mechanism to exclude incompatible tasks.
- Introduced a set of Modal-incompatible tasks to prevent execution errors in cloud environments.
- Implemented streaming JSONL logging for task results, preserving data even on interruptions.
- Refactored task evaluation logic to report skipped tasks and improve error handling.
---
 .../terminalbench_2/terminalbench2_env.py | 80 +++++++++++++++----
 1 file changed, 65 insertions(+), 15 deletions(-)

diff --git a/environments/benchmarks/terminalbench_2/terminalbench2_env.py b/environments/benchmarks/terminalbench_2/terminalbench2_env.py
index 99ee7eb1..ccb65b32 100644
--- a/environments/benchmarks/terminalbench_2/terminalbench2_env.py
+++ b/environments/benchmarks/terminalbench_2/terminalbench2_env.py
@@ -103,12 +103,12 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig):
     # --- Task filtering (comma-separated from CLI) ---
     task_filter: Optional[str] = Field(
         default=None,
-        description="Comma-separated task names to run (e.g., 'fix-git,broken-pipe'). "
+        description="Comma-separated task names to run (e.g., 'fix-git,git-multibranch'). "
         "If not set, all tasks are run.",
     )
     skip_tasks: Optional[str] = Field(
         default=None,
-        description="Comma-separated task names to skip (e.g., 'heavy-task,slow-task').",
+        description="Comma-separated task names to skip on top of the default skip list.",
     )
 
     # --- Per-task wall-clock timeout ---
@@ -119,6 +119,14 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig):
     )
 
 
+# Tasks that cannot run properly on Modal and are excluded from scoring.
+MODAL_INCOMPATIBLE_TASKS = {
+    "qemu-startup",  # Needs KVM/hardware virtualization
+    "qemu-alpine-ssh",  # Needs KVM/hardware virtualization
+    "crack-7z-hash",  # Password brute-force -- too slow for cloud sandbox timeouts
+}
+
+
 # =============================================================================
 # Tar extraction helper
 # =============================================================================
@@ -186,13 +194,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
             max_agent_turns=60,
             max_token_length=16000,
             agent_temperature=0.6,
-            system_prompt=(
-                "You are a skilled software engineer and system administrator with "
-                "access to a terminal and file tools. You are working inside a Linux "
-                "container environment. Complete the user's task by using the available "
-                "tools. Be methodical: explore the environment first, plan your approach, "
-                "then execute step by step. Verify your work before finishing."
-            ),
+            system_prompt=None,
 
             # Modal backend for per-task cloud-isolated sandboxes
             terminal_backend="modal",
@@ -258,10 +260,18 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
             allowed = {name.strip() for name in self.config.task_filter.split(",")}
             tasks = [t for t in tasks if t["task_name"] in allowed]
             print(f"  Filtered to {len(tasks)} tasks: {sorted(allowed)}")
+
+        # Skip tasks incompatible with the current backend (e.g., QEMU on Modal)
+        # plus any user-specified skip_tasks
+        skip = set(MODAL_INCOMPATIBLE_TASKS) if self.config.terminal_backend == "modal" else set()
         if self.config.skip_tasks:
-            skip = {name.strip() for name in self.config.skip_tasks.split(",")}
+            skip |= {name.strip() for name in self.config.skip_tasks.split(",")}
+        if skip:
+            before = len(tasks)
             tasks = [t for t in tasks if t["task_name"] not in skip]
-            print(f"  After skip_tasks: {len(tasks)} tasks (skipped: {sorted(skip)})")
+            skipped = before - len(tasks)
+            if skipped > 0:
+                print(f"  Skipped {skipped} incompatible tasks: {sorted(skip & {t['task_name'] for t in ds})}")
 
         self.all_eval_items = tasks
         self.iter = 0
@@ -274,10 +284,30 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
         # Reward tracking for wandb logging
         self.eval_metrics: List[Tuple[str, float]] = []
 
+        # Streaming JSONL writer -- saves each task's full conversation
+        # immediately on completion so data is preserved even on Ctrl+C.
+        # Timestamped filename so each run produces a unique file.
+        import datetime
+        log_dir = os.path.join(os.path.dirname(__file__), "logs")
+        os.makedirs(log_dir, exist_ok=True)
+        run_ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        self._streaming_path = os.path.join(log_dir, f"samples_{run_ts}.jsonl")
+        self._streaming_file = open(self._streaming_path, "w")
+        self._streaming_lock = __import__("threading").Lock()
+        print(f"  Streaming results to: {self._streaming_path}")
+
         print(f"TB2 ready: {len(self.all_eval_items)} tasks across {len(self.category_index)} categories")
         for cat, indices in sorted(self.category_index.items()):
             print(f"  {cat}: {len(indices)} tasks")
 
+    def _save_result(self, result: Dict[str, Any]):
+        """Write a single task result to the streaming JSONL file immediately."""
+        if not hasattr(self, "_streaming_file") or self._streaming_file.closed:
+            return
+        with self._streaming_lock:
+            self._streaming_file.write(json.dumps(result, ensure_ascii=False, default=str) + "\n")
+            self._streaming_file.flush()
+
     # =========================================================================
     # Training pipeline stubs -- NOT used in eval-only mode
     # =========================================================================
@@ -423,6 +453,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
             task_id=task_id,
             temperature=self.config.agent_temperature,
             max_tokens=self.config.max_token_length,
+            extra_body=self.config.extra_body,
         )
 
         result = await agent.run(messages)
@@ -463,24 +494,29 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
                 task_name, reward, result.turns_used, result.finished_naturally,
             )
 
-            return {
+            out = {
                 "passed": passed,
                 "reward": reward,
                 "task_name": task_name,
                 "category": category,
                 "turns_used": result.turns_used,
                 "finished_naturally": result.finished_naturally,
+                "messages": result.messages,
             }
+            self._save_result(out)
+            return out
         except Exception as e:
            elapsed = time.time() - task_start
            logger.error("Task %s: rollout failed: %s", task_name, e, exc_info=True)
            tqdm.write(f"  [ERROR] {task_name}: {e} ({elapsed:.0f}s)")
-            return {
+            out = {
                 "passed": False,
                 "reward": 0.0,
                 "task_name": task_name,
                 "category": category,
category, "error": str(e), } + self._save_result(out) + return out finally: # --- Cleanup: clear overrides, sandbox, and temp files --- @@ -636,11 +672,13 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): elapsed = self.config.task_timeout tqdm.write(f" [TIMEOUT] {task_name} (exceeded {elapsed}s wall-clock limit)") logger.error("Task %s: wall-clock timeout after %ds", task_name, elapsed) - return { + out = { "passed": False, "reward": 0.0, "task_name": task_name, "category": category, "error": f"timeout ({elapsed}s)", } + self._save_result(out) + return out async def evaluate(self, *args, **kwargs) -> None: """ @@ -796,7 +834,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): print(f"{'='*60}\n") - # Build sample records for evaluate_log + # Build sample records for evaluate_log (includes full conversations) samples = [ { "task_name": r.get("task_name"), @@ -805,6 +843,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): "reward": r.get("reward"), "turns_used": r.get("turns_used"), "error": r.get("error"), + "messages": r.get("messages"), } for r in valid_results ] @@ -826,11 +865,22 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): except Exception as e: print(f"Error logging evaluation results: {e}") + # Close streaming file + if hasattr(self, "_streaming_file") and not self._streaming_file.closed: + self._streaming_file.close() + print(f" Live results saved to: {self._streaming_path}") + # Kill all remaining sandboxes. Timed-out tasks leave orphaned thread # pool workers still executing commands -- cleanup_all stops them. from tools.terminal_tool import cleanup_all_environments print("\nCleaning up all sandboxes...") cleanup_all_environments() + + # Shut down the tool thread pool so orphaned workers from timed-out + # tasks are killed immediately instead of retrying against dead + # sandboxes and spamming the console with TimeoutError warnings. + from environments.agent_loop import _tool_executor + _tool_executor.shutdown(wait=False, cancel_futures=True) print("Done.") # =========================================================================