From ad042fdd68c0286023c08451096e111133482606 Mon Sep 17 00:00:00 2001
From: teknium <teknium@nousresearch.com>
Date: Tue, 10 Feb 2026 19:48:41 +0000
Subject: [PATCH] Update terminalbench_2 configuration for enhanced performance
 and evaluation

- Increased max_token_length from 16000 to 32000 to allow for longer inputs.
- Adjusted agent_temperature from 0.6 to 0.8 for more varied responses.
- Extended test_timeout from 180 to 600 seconds to accommodate longer evaluations.
- Changed data_dir_to_save_evals from "evals/terminal-bench-2" to
  "environments/benchmarks/evals/terminal-bench-2".
- Removed the system_prompt entry from the env section.
---
 .../benchmarks/terminalbench_2/default.yaml        | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

diff --git a/environments/benchmarks/terminalbench_2/default.yaml b/environments/benchmarks/terminalbench_2/default.yaml
index e6b3014c..62f66316 100644
--- a/environments/benchmarks/terminalbench_2/default.yaml
+++ b/environments/benchmarks/terminalbench_2/default.yaml
@@ -16,22 +16,16 @@
 env:
   enabled_toolsets: ["terminal", "file"]
   max_agent_turns: 60
-  max_token_length: 16000
-  agent_temperature: 0.6
+  max_token_length: 32000
+  agent_temperature: 0.8
   terminal_backend: "modal"
   dataset_name: "NousResearch/terminal-bench-2"
-  test_timeout: 180
+  test_timeout: 600
   tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B"
   use_wandb: true
   wandb_name: "terminal-bench-2"
   ensure_scores_are_not_same: false
-  data_dir_to_save_evals: "evals/terminal-bench-2"
-  system_prompt: >
-    You are a skilled software engineer and system administrator with
-    access to a terminal and file tools. You are working inside a Linux
-    container environment. Complete the user's task by using the available
-    tools. Be methodical: explore the environment first, plan your approach,
-    then execute step by step. Verify your work before finishing.
+  data_dir_to_save_evals: "environments/benchmarks/evals/terminal-bench-2"
 
 openai:
   base_url: "https://openrouter.ai/api/v1"