From ad042fdd68c0286023c08451096e111133482606 Mon Sep 17 00:00:00 2001 From: teknium Date: Tue, 10 Feb 2026 19:48:41 +0000 Subject: [PATCH] Update terminalbench_2 configuration for enhanced performance and evaluation - Increased max_token_length from 16000 to 32000 to allow for longer inputs. - Adjusted agent_temperature from 0.6 to 0.8 for more varied responses. - Extended test_timeout from 180 to 600 seconds to accommodate longer evaluations. - Changed data_dir_to_save_evals from evals/terminal-bench-2 to environments/benchmarks/evals/terminal-bench-2 so evaluations are saved under the benchmarks directory. - Removed the system_prompt setting (and its folded prompt text) from this config. --- .../benchmarks/terminalbench_2/default.yaml | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/environments/benchmarks/terminalbench_2/default.yaml b/environments/benchmarks/terminalbench_2/default.yaml index e6b3014c..62f66316 100644 --- a/environments/benchmarks/terminalbench_2/default.yaml +++ b/environments/benchmarks/terminalbench_2/default.yaml @@ -16,22 +16,16 @@ env: enabled_toolsets: ["terminal", "file"] max_agent_turns: 60 - max_token_length: 16000 - agent_temperature: 0.6 + max_token_length: 32000 + agent_temperature: 0.8 terminal_backend: "modal" dataset_name: "NousResearch/terminal-bench-2" - test_timeout: 180 + test_timeout: 600 tokenizer_name: "NousResearch/Hermes-3-Llama-3.1-8B" use_wandb: true wandb_name: "terminal-bench-2" ensure_scores_are_not_same: false - data_dir_to_save_evals: "evals/terminal-bench-2" - system_prompt: > - You are a skilled software engineer and system administrator with - access to a terminal and file tools. You are working inside a Linux - container environment. Complete the user's task by using the available - tools. Be methodical: explore the environment first, plan your approach, - then execute step by step. Verify your work before finishing. + data_dir_to_save_evals: "environments/benchmarks/evals/terminal-bench-2" openai: base_url: "https://openrouter.ai/api/v1"