Enhance batch processing and image generation tools

- Updated batch processing to include robust resume functionality by scanning completed prompts based on content rather than indices, improving recovery from failures.
- Implemented retry logic for image downloads with exponential backoff to handle transient failures effectively.
- Refined image generation tool to utilize the FLUX 2 Pro model, updating descriptions and parameters for clarity and consistency.
- Added new configuration scripts for GLM 4.7 and Imagen tasks, enhancing usability and logging capabilities.
- Removed outdated scripts and test files to streamline the codebase.
2026-01-18 10:11:59 +00:00 · 2026-01-18 10:11:59 +00:00 · 6eb76c7c1a
commit 6eb76c7c1a
parent b32cc4b09d
14 changed files with 293 additions and 233 deletions
--- a/configs/run_datagen_glm4.7-imagen.sh
+++ b/configs/run_datagen_glm4.7-imagen.sh
@ -0,0 +1,26 @@
+#!/bin/bash
+# Runs the GLM 4.7 image-generation batch job; all output is also copied to a timestamped log.
+
+# Single source of truth so the log file name cannot drift away from --run_name.
+RUN_NAME="imagen_train_sft_glm4.7"
+
+# Create the logs directory if needed and name the log after the run plus the launch time.
+mkdir -p logs
+LOG_FILE="logs/${RUN_NAME}_$(date +%Y%m%d_%H%M%S).log"
+echo "📝 Logging output to: $LOG_FILE"
+
+python batch_runner.py \
+  --dataset_file="source-data/hermes-agent-imagen-data/hermes_agent_imagen_train_sft.jsonl" \
+  --batch_size=20 \
+  --run_name="$RUN_NAME" \
+  --distribution="image_gen" \
+  --model="z-ai/glm-4.7" \
+  --base_url="https://openrouter.ai/api/v1" \
+  --providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
+  --num_workers=50 \
+  --max_turns=25 \
+  --ephemeral_system_prompt="When generating an image for the user view the image by using the vision_analyze tool to ensure it is what the user wanted. If it isn't feel free to retry a few times. If none are perfect, choose the best option that is the closest match, and explain its imperfections. If the image generation tool fails, try again a few times. If the vision analyze tool fails, provide the image to the user and explain it is your best effort attempt." \
+  2>&1 | tee "$LOG_FILE"
+
+echo "✅ Log saved to: $LOG_FILE"
+# Optional: add --verbose to the batch_runner.py invocation above.
--- a/configs/run_datagen_glm4.7.sh
+++ b/configs/run_datagen_glm4.7.sh
@ -0,0 +1,26 @@
+#!/bin/bash
+# Runs batch_runner.py on the agent-tasks SFT set with GLM 4.7, teeing all output to a timestamped log.
+# Create logs directory if it doesn't exist
+mkdir -p logs
+
+# Timestamped log name; the "sft2" suffix matches --run_name and the dataset file below.
+LOG_FILE="logs/glm4.7-thinking-sft2_$(date +%Y%m%d_%H%M%S).log"
+
+echo "📝 Logging output to: $LOG_FILE"
+
+python batch_runner.py \
+  --dataset_file="source-data/hermes-agent-agent-tasks-1/agent_tasks_sft_2.jsonl" \
+  --batch_size=20 \
+  --run_name="megascience_glm4.7-thinking-sft2" \
+  --distribution="science" \
+  --model="z-ai/glm-4.7" \
+  --base_url="https://openrouter.ai/api/v1" \
+  --providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
+  --num_workers=15 \
+  --max_turns=60 \
+  --ephemeral_system_prompt="You have access to a variety of tools to help you solve scientific, math, and technology problems presented to you. You can use them in sequence and build off of the results of prior tools you've used results. Always use the terminal or search tool if it can provide additional context, verify formulas, double check concepts and recent studies and understanding, doing all calculations, etc. You should only be confident in your own reasoning, knowledge, or calculations if you've exhaustively used all tools available to you to that can help you verify or validate your work. Always pip install any packages you need to use the python scripts you want to run. If you need to use a tool that isn't available, you can use the terminal tool to install or create it in many cases as well. Do not use the terminal tool to communicate with the user, as they cannot see your commands, only your final response after completing the task. Search for at least 3 sources, but not more than 12, so you can maintain focused context." \
+  2>&1 | tee "$LOG_FILE"
+
+echo "✅ Log saved to: $LOG_FILE"
+# NOTE(review): --run_name/--distribution say "megascience"/"science" but the dataset is agent tasks; confirm.
+# Optional: add --verbose to the batch_runner.py invocation above.
--- a/configs/run_datagen_glm4.7_megascience.sh
+++ b/configs/run_datagen_glm4.7_megascience.sh
@ -0,0 +1,27 @@
+#!/bin/bash
+# GLM 4.7 run over the megascience SFT dataset, launched with --resume; output is teed to a timestamped log.
+mkdir -p logs  # -p: no error when logs/ already exists
+
+# The log name embeds the launch time so repeated runs never overwrite each other.
+timestamp=$(date +%Y%m%d_%H%M%S)
+log_path="logs/glm4.7-thinking-sft1-10k_${timestamp}.log"
+
+printf '%s\n' "📝 Logging output to: ${log_path}"
+
+python batch_runner.py \
+  --dataset_file="source-data/hermes-agent-megascience-data/hermes_agent_megascience_sft_train_1_10k.jsonl" \
+  --batch_size=20 \
+  --run_name="megascience_glm4.7-thinking-sft1" \
+  --distribution="science" \
+  --model="z-ai/glm-4.7" \
+  --base_url="https://openrouter.ai/api/v1" \
+  --providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
+  --num_workers=50 \
+  --max_turns=60 \
+  --resume \
+  --ephemeral_system_prompt="You have access to a variety of tools to help you solve scientific, math, and technology problems presented to you. You can use them in sequence and build off of the results of prior tools you've used for furthering results. Always use the terminal or search tool if it can provide additional context, verify formulas, double check concepts and recent studies and understanding, doing all calculations, etc. You should only be confident in your own reasoning, knowledge, or calculations if you've exhaustively used all tools available to you to that can help you verify or validate your work. Always pip install any packages you need to use the python scripts you want to run. If you need to use a tool that isn't available, you can use the terminal tool to install or create it in many cases as well. Do not use the terminal tool to communicate with the user, as they cannot see your commands, only your final response after completing the task. Search for at least 3 sources, but not more than 12, so you can maintain a focused context." \
+  2>&1 | tee "${log_path}"
+
+printf '%s\n' "✅ Log saved to: ${log_path}"
+
+# Disabled flag; add it to the command above to enable:  --verbose \
--- a/configs/run_datagen_glm4.7_raw_tasks.sh
+++ b/configs/run_datagen_glm4.7_raw_tasks.sh
@ -0,0 +1,28 @@
+#!/bin/bash
+# GLM 4.7 run over the raw terminal-task prompts; stdout and stderr are teed to a timestamped log.
+mkdir -p logs  # -p: succeeds even if logs/ is already there
+
+# The log name embeds the launch time.
+started_at=$(date +%Y%m%d_%H%M%S)
+log_path="logs/glm4.7-terminal-tasks_${started_at}.log"
+
+printf '%s\n' "📝 Logging output to: ${log_path}"
+
+python batch_runner.py \
+  --dataset_file="source-data/raw_tasks_prompts.jsonl" \
+  --batch_size=20 \
+  --run_name="terminal-tasks-glm4.7-thinking" \
+  --distribution="default" \
+  --model="z-ai/glm-4.7" \
+  --base_url="https://openrouter.ai/api/v1" \
+  --providers_allowed="gmicloud,siliconflow,atlas-cloud,z-ai,novita" \
+  --num_workers=50 \
+  --max_turns=60 \
+  --ephemeral_system_prompt="You have access to a variety of tools to help you complete coding, system administration, and general computing tasks. You can use them in sequence and build off of the results of prior tools you've used. Always use the terminal tool to execute commands, write code, install packages, and verify your work. You should test and validate everything you create. Always pip install any packages you need (use --break-system-packages if needed). If you need a tool that isn't available, you can use the terminal to install or create it. Do not use the terminal tool to communicate with the user, as they cannot see your commands, only your final response after completing the task. Use web search when you need to look up documentation, APIs, or current best practices." \
+  2>&1 | tee "${log_path}"
+
+printf '%s\n' "✅ Log saved to: ${log_path}"
+
+# Flags left disabled for this run; add them to the command above to enable:
+#  --verbose \
+#  --resume \
--- a/configs/run_datagen_megascience.sh
+++ b/configs/run_datagen_megascience.sh
@ -0,0 +1,15 @
+#!/bin/bash
+# GPT-5 megascience eval run against the OpenAI API; aborts before launching if OPENAI_API_KEY is unset or empty.
+# NOTE(review): sibling configs read datasets from source-data/...; confirm this relative path is still valid.
+python batch_runner.py \
+  --dataset_file="hermes-agent-megascience-data/hermes_agent_megascience_eval.jsonl" \
+  --batch_size=10 \
+  --run_name="megascience_eval_gpt5_2" \
+  --distribution="science" \
+  --model="gpt-5" \
+  --base_url="https://api.openai.com/v1" \
+  --api_key="${OPENAI_API_KEY:?OPENAI_API_KEY must be set}" \
+  --num_workers=5 \
+  --max_turns=30 \
+  --verbose \
+  --ephemeral_system_prompt="You have access to a variety of tools to help you solve scientific, math, and technology problems presented to you. You can use them in sequence and build off of the results of prior tools you've used results. Always use a tool if it can provide additional context, verify formulas, double check concepts and recent studies and understanding, doing all calculations, etc. You should not be confident in your own reasoning, knowledge, or calculations without using a tool to verify or validate your work."
--- a/configs/run_datagen_minimax-3.1.sh
+++ b/configs/run_datagen_minimax-3.1.sh
@ -0,0 +1,14 @
+#!/bin/bash
+# MiniMax M2.1 run over the agent-tasks eval set via OpenRouter (verbose, single worker).
+python batch_runner.py \
+  --dataset_file="source-data/hermes-agent-agent-tasks-1/agent_tasks_eval.jsonl" \
+  --batch_size=50 \
+  --run_name="megascience_sft_minimax-m2.1-thinking-2-eval" \
+  --distribution="science" \
+  --model="minimax/minimax-m2.1" \
+  --base_url="https://openrouter.ai/api/v1" \
+  --providers_allowed="minimax" \
+  --num_workers=1 \
+  --max_turns=40 \
+  --verbose \
+  --ephemeral_system_prompt="You have access to a variety of tools to help you solve scientific, math, and technology problems presented to you. You can use them in sequence and build off of the results of prior tools you've used results. Always use the terminal or search tool if it can provide additional context, verify formulas, double check concepts and recent studies and understanding, doing all calculations, etc. You should only be confident in your own reasoning, knowledge, or calculations if you've exhaustively used all tools available to you to that can help you verify or validate your work. Always pip install any packages you need to use the python scripts you want to run. If you need to use a tool that isn't available, you can use the terminal tool to install or create it in many cases as well. Do not use the terminal tool to communicate with the user, as they cannot see your commands, only your final response after completing the task. Search for at least 3 sources, but not more than 12."
--- a/configs/test_run.sh
+++ b/configs/test_run.sh
@ -0,0 +1,23 @@
+#!/bin/bash
+# Runs run_agent.py once with the prompt given as $1 (web-tool debug mode on, trajectories saved).
+# Check if a prompt argument was provided; usage errors go to stderr
+if [[ $# -eq 0 ]]; then
+    echo "Error: Please provide a prompt as an argument" >&2
+    echo "Usage: $0 \"your prompt here\"" >&2
+    exit 1
+fi
+
+# Get the prompt from the first argument
+PROMPT="$1"
+
+# Set debug mode for web tools
+export WEB_TOOLS_DEBUG=true
+
+# Run the agent; the key is quoted and required so an unset variable cannot make --api_key swallow the next flag
+python run_agent.py \
+  --query "$PROMPT" \
+  --max_turns 30 \
+  --model claude-sonnet-4-5-20250929 \
+  --base_url https://api.anthropic.com/v1/ \
+  --api_key "${ANTHROPIC_API_KEY:?ANTHROPIC_API_KEY must be set}" \
+  --save_trajectories