From 1b7bc299f373771706698b813f38c2043bf6bcd7 Mon Sep 17 00:00:00 2001
From: teknium
Date: Thu, 12 Feb 2026 05:36:45 +0000
Subject: [PATCH] Enhance TerminalBench2 environment: filter Modal-incompatible tasks and improve logging

- Updated task filter descriptions for clarity and added a skip mechanism to exclude incompatible tasks.
- Introduced a set of Modal-incompatible tasks to prevent execution errors in cloud environments.
- Implemented streaming JSONL logging for task results, preserving data even on interruptions.
- Refactored task evaluation logic to report skipped tasks and improve error handling.
---
 .../terminalbench_2/terminalbench2_env.py | 80 +++++++++++++++----
 1 file changed, 65 insertions(+), 15 deletions(-)

diff --git a/environments/benchmarks/terminalbench_2/terminalbench2_env.py b/environments/benchmarks/terminalbench_2/terminalbench2_env.py
index 99ee7eb1..ccb65b32 100644
--- a/environments/benchmarks/terminalbench_2/terminalbench2_env.py
+++ b/environments/benchmarks/terminalbench_2/terminalbench2_env.py
@@ -103,12 +103,12 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig):
     # --- Task filtering (comma-separated from CLI) ---
     task_filter: Optional[str] = Field(
         default=None,
-        description="Comma-separated task names to run (e.g., 'fix-git,broken-pipe'). "
+        description="Comma-separated task names to run (e.g., 'fix-git,git-multibranch'). "
         "If not set, all tasks are run.",
     )
     skip_tasks: Optional[str] = Field(
         default=None,
-        description="Comma-separated task names to skip (e.g., 'heavy-task,slow-task').",
+        description="Comma-separated task names to skip on top of the default skip list.",
     )
 
     # --- Per-task wall-clock timeout ---
@@ -119,6 +119,14 @@ class TerminalBench2EvalConfig(HermesAgentEnvConfig):
     )
 
 
+# Tasks that cannot run properly on Modal and are excluded from scoring.
+MODAL_INCOMPATIBLE_TASKS = {
+    "qemu-startup",  # Needs KVM/hardware virtualization
+    "qemu-alpine-ssh",  # Needs KVM/hardware virtualization
+    "crack-7z-hash",  # Password brute-force -- too slow for cloud sandbox timeouts
+}
+
+
 # =============================================================================
 # Tar extraction helper
 # =============================================================================
@@ -186,13 +194,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
             max_agent_turns=60,
             max_token_length=16000,
             agent_temperature=0.6,
-            system_prompt=(
-                "You are a skilled software engineer and system administrator with "
-                "access to a terminal and file tools. You are working inside a Linux "
-                "container environment. Complete the user's task by using the available "
-                "tools. Be methodical: explore the environment first, plan your approach, "
-                "then execute step by step. Verify your work before finishing."
-            ),
+            system_prompt=None,
 
             # Modal backend for per-task cloud-isolated sandboxes
             terminal_backend="modal",
@@ -258,10 +260,18 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
             allowed = {name.strip() for name in self.config.task_filter.split(",")}
             tasks = [t for t in tasks if t["task_name"] in allowed]
             print(f"  Filtered to {len(tasks)} tasks: {sorted(allowed)}")
+
+        # Skip tasks incompatible with the current backend (e.g., QEMU on Modal)
+        # plus any user-specified skip_tasks
+        skip = set(MODAL_INCOMPATIBLE_TASKS) if self.config.terminal_backend == "modal" else set()
         if self.config.skip_tasks:
-            skip = {name.strip() for name in self.config.skip_tasks.split(",")}
+            skip |= {name.strip() for name in self.config.skip_tasks.split(",")}
+        if skip:
+            before = len(tasks)
             tasks = [t for t in tasks if t["task_name"] not in skip]
-            print(f"  After skip_tasks: {len(tasks)} tasks (skipped: {sorted(skip)})")
+            skipped = before - len(tasks)
+            if skipped > 0:
+                print(f"  Skipped {skipped} incompatible tasks: {sorted(skip & {t['task_name'] for t in ds})}")
 
         self.all_eval_items = tasks
         self.iter = 0
@@ -274,10 +284,30 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
         # Reward tracking for wandb logging
         self.eval_metrics: List[Tuple[str, float]] = []
 
+        # Streaming JSONL writer -- saves each task's full conversation
+        # immediately on completion so data is preserved even on Ctrl+C.
+        # Timestamped filename so each run produces a unique file.
+        import datetime
+        log_dir = os.path.join(os.path.dirname(__file__), "logs")
+        os.makedirs(log_dir, exist_ok=True)
+        run_ts = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        self._streaming_path = os.path.join(log_dir, f"samples_{run_ts}.jsonl")
+        self._streaming_file = open(self._streaming_path, "w")
+        self._streaming_lock = __import__("threading").Lock()
+        print(f"  Streaming results to: {self._streaming_path}")
+
         print(f"TB2 ready: {len(self.all_eval_items)} tasks across {len(self.category_index)} categories")
         for cat, indices in sorted(self.category_index.items()):
             print(f"  {cat}: {len(indices)} tasks")
 
+    def _save_result(self, result: Dict[str, Any]):
+        """Write a single task result to the streaming JSONL file immediately."""
+        if not hasattr(self, "_streaming_file") or self._streaming_file.closed:
+            return
+        with self._streaming_lock:
+            self._streaming_file.write(json.dumps(result, ensure_ascii=False, default=str) + "\n")
+            self._streaming_file.flush()
+
     # =========================================================================
     # Training pipeline stubs -- NOT used in eval-only mode
     # =========================================================================
@@ -423,6 +453,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
             task_id=task_id,
             temperature=self.config.agent_temperature,
             max_tokens=self.config.max_token_length,
+            extra_body=self.config.extra_body,
         )
 
         result = await agent.run(messages)
@@ -463,24 +494,29 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv):
                 task_name, reward, result.turns_used, result.finished_naturally,
             )
 
-            return {
+            out = {
                 "passed": passed,
                 "reward": reward,
                 "task_name": task_name,
                 "category": category,
                 "turns_used": result.turns_used,
                 "finished_naturally": result.finished_naturally,
+                "messages": result.messages,
             }
+            self._save_result(out)
+            return out
         except Exception as e:
            elapsed = time.time() - task_start
            logger.error("Task %s: rollout failed: %s", task_name, e, exc_info=True)
            tqdm.write(f"  [ERROR] {task_name}: {e} ({elapsed:.0f}s)")
-            return {
+            out = {
                 "passed": False,
                 "reward": 0.0,
                 "task_name": task_name,
                 "category": category,
category, "error": str(e), } + self._save_result(out) + return out finally: # --- Cleanup: clear overrides, sandbox, and temp files --- @@ -636,11 +672,13 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): elapsed = self.config.task_timeout tqdm.write(f" [TIMEOUT] {task_name} (exceeded {elapsed}s wall-clock limit)") logger.error("Task %s: wall-clock timeout after %ds", task_name, elapsed) - return { + out = { "passed": False, "reward": 0.0, "task_name": task_name, "category": category, "error": f"timeout ({elapsed}s)", } + self._save_result(out) + return out async def evaluate(self, *args, **kwargs) -> None: """ @@ -796,7 +834,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): print(f"{'='*60}\n") - # Build sample records for evaluate_log + # Build sample records for evaluate_log (includes full conversations) samples = [ { "task_name": r.get("task_name"), @@ -805,6 +843,7 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): "reward": r.get("reward"), "turns_used": r.get("turns_used"), "error": r.get("error"), + "messages": r.get("messages"), } for r in valid_results ] @@ -826,11 +865,22 @@ class TerminalBench2EvalEnv(HermesAgentBaseEnv): except Exception as e: print(f"Error logging evaluation results: {e}") + # Close streaming file + if hasattr(self, "_streaming_file") and not self._streaming_file.closed: + self._streaming_file.close() + print(f" Live results saved to: {self._streaming_path}") + # Kill all remaining sandboxes. Timed-out tasks leave orphaned thread # pool workers still executing commands -- cleanup_all stops them. from tools.terminal_tool import cleanup_all_environments print("\nCleaning up all sandboxes...") cleanup_all_environments() + + # Shut down the tool thread pool so orphaned workers from timed-out + # tasks are killed immediately instead of retrying against dead + # sandboxes and spamming the console with TimeoutError warnings. + from environments.agent_loop import _tool_executor + _tool_executor.shutdown(wait=False, cancel_futures=True) print("Done.") # =========================================================================