Update RL tools and enhance configuration management

- Modified `model_tools.py` to update default model IDs and add the new RL function `rl_test_inference`.
- Enhanced `README.md` with installation instructions for submodules and updated API key usage.
- Improved `rl_cli.py` to load configuration from `~/.hermes/config.yaml` and set the terminal working directory for RL tools.
- Updated `run_agent.py` to handle empty-string tool-call arguments as empty objects for better JSON validation.
- Refined the installation scripts to ensure submodules are cloned and installed correctly, enhancing the setup experience.
2026-02-04 13:57:59 -08:00 · 2026-02-04 13:57:59 -08:00 · 3c0d0dba49
commit 3c0d0dba49
parent 12bbca95ec
7 changed files with 274 additions and 56 deletions
--- a/README.md
+++ b/README.md
@ -15,7 +15,7 @@ irm https://raw.githubusercontent.com/NousResearch/hermes-agent/main/scripts/ins
 ```
 The installer will:
- Clone to `~/.hermes-agent`
+- Clone to `~/.hermes-agent` (with submodules: mini-swe-agent, tinker-atropos)
 - Create a virtual environment
 - Install all dependencies
 - Run the interactive setup wizard
@ -281,18 +281,10 @@ Train language models with reinforcement learning using the Tinker API and Atrop
 ```bash
 TINKER_API_KEY=your-tinker-key      # Get from https://tinker-console.thinkingmachines.ai/keys
 WANDB_API_KEY=your-wandb-key        # Get from https://wandb.ai/authorize
 OPENROUTER_API_KEY=your-key         # Optional: for rl_test_inference
 ```
-2. **Install tinker-atropos:** (in a separate directory)
+2. **That's it!** tinker-atropos is included as a submodule - no separate installation needed.
 ```bash
 cd ~/tinker-atropos
 pip install -e .
 ```
 3. **Start the RL API server:**
 ```bash
 rl-server    # Runs on port 8080 by default
 ```
 #### Using RL Tools
@ -313,10 +305,12 @@ Agent: I'll set up an RL training run on the GSM8k environment...
 | `rl_select_environment` | Select an environment for training |
 | `rl_get_current_config` | View all configurable options |
 | `rl_edit_config` | Change a configuration value |
 | `rl_test_inference` | Test environment with OpenRouter (pre-training validation) |
 | `rl_start_training` | Start a training run |
 | `rl_check_status` | Check training progress |
 | `rl_stop_training` | Stop a running training |
 | `rl_get_results` | Fetch WandB metrics |
 | `rl_list_runs` | List active training runs |
 #### Dedicated RL CLI
@ -434,7 +428,7 @@ skills/
 If you prefer not to use the installer:
 ```bash
-# Clone the repository
+# Clone the repository (with submodules)
 git clone --recurse-submodules https://github.com/NousResearch/hermes-agent.git
 cd hermes-agent
@ -445,6 +439,11 @@ cd hermes-agent
 python3 -m venv venv
 source venv/bin/activate
 pip install -e ".[all]"
 # Install submodules (required for terminal and RL tools)
 pip install -e "./mini-swe-agent"    # Terminal tool backend
 pip install -e "./tinker-atropos"    # RL training backend
 hermes setup
 ```
--- a/model_tools.py
+++ b/model_tools.py
@ -665,7 +665,7 @@ def get_rl_tool_definitions() -> List[Dict[str, Any]]:
                        "models": {
                            "type": "array",
                            "items": {"type": "string"},
-                            "description": "Optional list of OpenRouter model IDs. Default: qwen/qwen3-8b, zhipu-ai/glm-4-flash, minimax/minimax-m1"
+                            "description": "Optional list of OpenRouter model IDs. Default: qwen/qwen3-8b, z-ai/glm-4.7-flash, minimax/minimax-m2.1"
                        }
                    },
                    "required": []
@ -730,7 +730,7 @@ def get_all_tool_names() -> List[str]:
            "rl_get_current_config", "rl_edit_config",
            "rl_start_training", "rl_check_status",
            "rl_stop_training", "rl_get_results",
-            "rl_list_runs"
+            "rl_list_runs", "rl_test_inference"
        ])
    return tool_names
@ -898,7 +898,7 @@ def get_tool_definitions(
                            "rl_get_current_config", "rl_edit_config",
                            "rl_start_training", "rl_check_status",
                            "rl_stop_training", "rl_get_results",
-                            "rl_list_runs"
+                            "rl_list_runs", "rl_test_inference"
                        ]
                    }
                    legacy_tools = legacy_map.get(toolset_name, [])
@ -950,7 +950,7 @@ def get_tool_definitions(
                            "rl_get_current_config", "rl_edit_config",
                            "rl_start_training", "rl_check_status",
                            "rl_stop_training", "rl_get_results",
-                            "rl_list_runs"
+                            "rl_list_runs", "rl_test_inference"
                        ]
                    }
                    legacy_tools = legacy_map.get(toolset_name, [])
@ -1407,7 +1407,7 @@ def handle_function_call(
            "rl_get_current_config", "rl_edit_config",
            "rl_start_training", "rl_check_status",
            "rl_stop_training", "rl_get_results",
-            "rl_list_runs"
+            "rl_list_runs", "rl_test_inference"
        ]:
            return handle_rl_function_call(function_name, function_args)
--- a/rl_cli.py
+++ b/rl_cli.py
@ -25,14 +25,34 @@ import sys
 from pathlib import Path
 import fire
 import yaml
 # Load environment variables from .env file
 from dotenv import load_dotenv
-env_path = Path(__file__).parent / '.env'
+# Load from ~/.hermes/.env first, then local .env
-if env_path.exists():
+hermes_env_path = Path.home() / '.hermes' / '.env'
-    load_dotenv(dotenv_path=env_path)
+local_env_path = Path(__file__).parent / '.env'
-    print(f"✅ Loaded environment variables from {env_path}")
+
 if hermes_env_path.exists():
    load_dotenv(dotenv_path=hermes_env_path)
    print(f"✅ Loaded environment variables from {hermes_env_path}")
 elif local_env_path.exists():
    load_dotenv(dotenv_path=local_env_path)
    print(f"✅ Loaded environment variables from {local_env_path}")
 # Set terminal working directory to tinker-atropos submodule
 # This ensures terminal commands run in the right context for RL work
 tinker_atropos_dir = Path(__file__).parent / 'tinker-atropos'
 if tinker_atropos_dir.exists():
    os.environ['TERMINAL_CWD'] = str(tinker_atropos_dir)
    os.environ['HERMES_QUIET'] = '1'  # Disable temp subdirectory creation
    print(f"📂 Terminal working directory: {tinker_atropos_dir}")
 else:
    # Fall back to hermes-agent directory if submodule not found
    os.environ['TERMINAL_CWD'] = str(Path(__file__).parent)
    os.environ['HERMES_QUIET'] = '1'
    print(f"⚠️  tinker-atropos submodule not found, using: {Path(__file__).parent}")
 # Import agent and tools
 from run_agent import AIAgent
@ -40,6 +60,50 @@ from model_tools import get_tool_definitions, check_toolset_requirements
 from tools.rl_training_tool import check_rl_api_keys, get_missing_keys
 # ============================================================================
 # Config Loading
 # ============================================================================
 DEFAULT_MODEL = "anthropic/claude-opus-4.5"
 DEFAULT_BASE_URL = "https://openrouter.ai/api/v1"
 def load_hermes_config() -> dict:
    """
    Load agent configuration from ~/.hermes/config.yaml.

    Starts from DEFAULT_MODEL / DEFAULT_BASE_URL and overrides them with
    values found in the file. ``model`` may be a plain string or a mapping
    with a ``default`` key. Missing, null, blank, or non-string values are
    ignored so the returned dict always carries usable strings.

    Returns:
        dict: Configuration with "model" and "base_url" keys.
    """
    config_path = Path.home() / '.hermes' / 'config.yaml'
    config = {
        "model": DEFAULT_MODEL,
        "base_url": DEFAULT_BASE_URL,
    }
    if not config_path.exists():
        return config
    # Loading is best-effort: a broken config file must not prevent startup.
    try:
        with open(config_path, "r", encoding="utf-8") as f:
            file_config = yaml.safe_load(f) or {}
    except Exception as e:
        print(f"⚠️  Warning: Failed to load config.yaml: {e}")
        return config
    # A top-level list or scalar is valid YAML but not a valid config.
    if not isinstance(file_config, dict):
        print("⚠️  Warning: Failed to load config.yaml: top-level value is not a mapping")
        return config
    # Model may be given as `model: name` or `model: {default: name}`.
    model_value = file_config.get("model")
    if isinstance(model_value, dict):
        model_value = model_value.get("default")
    if isinstance(model_value, str) and model_value.strip():
        config["model"] = model_value
    # An empty `base_url:` key parses as None; keep the default in that case.
    base_url_value = file_config.get("base_url")
    if isinstance(base_url_value, str) and base_url_value.strip():
        config["base_url"] = base_url_value
    return config
 # ============================================================================
 # RL-Specific Configuration
 # ============================================================================
@ -108,7 +172,7 @@ When asked to train a model, follow this workflow:
 """
 # Toolsets to enable for RL workflows
-RL_TOOLSETS = ["base", "terminal", "web", "rl"]
+RL_TOOLSETS = ["terminal", "web", "rl"]
 # ============================================================================
@ -172,9 +236,9 @@ def list_environments_sync():
 def main(
    task: str = None,
-    model: str = "anthropic/claude-sonnet-4-20250514",
+    model: str = None,
    api_key: str = None,
-    base_url: str = "https://openrouter.ai/api/v1",
+    base_url: str = None,
    max_iterations: int = RL_MAX_ITERATIONS,
    interactive: bool = False,
    list_environments: bool = False,
@ -187,9 +251,9 @@ def main(
    Args:
        task: The training task/goal (e.g., "Train a model on GSM8k for math")
-        model: Model to use for the agent (default: claude-sonnet-4)
+        model: Model to use for the agent (reads from ~/.hermes/config.yaml if not provided)
        api_key: OpenRouter API key (uses OPENROUTER_API_KEY env var if not provided)
-        base_url: API base URL (default: OpenRouter)
+        base_url: API base URL (reads from config or defaults to OpenRouter)
        max_iterations: Maximum agent iterations (default: 200 for long workflows)
        interactive: Run in interactive mode (multiple conversations)
        list_environments: Just list available RL environments and exit
@ -210,6 +274,15 @@ def main(
        # Check server status
        python rl_cli.py --check-server
    """
    # Load config from ~/.hermes/config.yaml
    config = load_hermes_config()
    # Use config values if not explicitly provided
    if model is None:
        model = config["model"]
    if base_url is None:
        base_url = config["base_url"]
    print("🎯 RL Training Agent")
    print("=" * 60)
--- a/run_agent.py
+++ b/run_agent.py
@ -1764,10 +1764,16 @@ class AIAgent:
                        self._invalid_tool_retries = 0
                    # Validate tool call arguments are valid JSON
                    # Handle empty strings as empty objects (common model quirk)
                    invalid_json_args = []
                    for tc in assistant_message.tool_calls:
                        args = tc.function.arguments
                        # Treat empty/whitespace strings as empty object
                        if not args or not args.strip():
                            tc.function.arguments = "{}"
                            continue
                        try:
-                            json.loads(tc.function.arguments)
+                            json.loads(args)
                        except json.JSONDecodeError as e:
                            invalid_json_args.append((tc.function.name, str(e)))
--- a/scripts/install.ps1
+++ b/scripts/install.ps1
@ -150,14 +150,15 @@ function Install-Repository {
        }
    } else {
        # Try SSH first (for private repo access), fall back to HTTPS
        # Use --recurse-submodules to also clone mini-swe-agent and tinker-atropos
        Write-Info "Trying SSH clone..."
-        $sshResult = git clone --branch $Branch $RepoUrlSsh $InstallDir 2>&1
+        $sshResult = git clone --branch $Branch --recurse-submodules $RepoUrlSsh $InstallDir 2>&1
        if ($LASTEXITCODE -eq 0) {
            Write-Success "Cloned via SSH"
        } else {
            Write-Info "SSH failed, trying HTTPS..."
-            $httpsResult = git clone --branch $Branch $RepoUrlHttps $InstallDir 2>&1
+            $httpsResult = git clone --branch $Branch --recurse-submodules $RepoUrlHttps $InstallDir 2>&1
            if ($LASTEXITCODE -eq 0) {
                Write-Success "Cloned via HTTPS"
@ -171,6 +172,13 @@ function Install-Repository {
        }
    }
    # Ensure submodules are initialized and updated (for existing installs or if --recurse failed)
    Write-Info "Initializing submodules (mini-swe-agent, tinker-atropos)..."
    Push-Location $InstallDir
    git submodule update --init --recursive
    Pop-Location
    Write-Success "Submodules ready"
    Write-Success "Repository ready"
 }
@ -208,15 +216,43 @@ function Install-Dependencies {
        & .\venv\Scripts\Activate.ps1
    }
    # Install main package
    try {
        pip install -e ".[all]" 2>&1 | Out-Null
        # Native commands do not throw on a non-zero exit code, so check it
        # explicitly; otherwise the fallback below would never run.
        if ($LASTEXITCODE -ne 0) { throw "pip exited with code $LASTEXITCODE" }
    } catch {
        pip install -e "." | Out-Null
    }
    Write-Success "Main package installed"
    # Install submodules
    Write-Info "Installing mini-swe-agent (terminal tool backend)..."
    if (Test-Path "mini-swe-agent\pyproject.toml") {
        try {
            pip install -e ".\mini-swe-agent" 2>&1 | Out-Null
            # pip is a native command: a failed install only sets $LASTEXITCODE,
            # so convert it into an exception to reach the warning branch.
            if ($LASTEXITCODE -ne 0) { throw "pip exited with code $LASTEXITCODE" }
            Write-Success "mini-swe-agent installed"
        } catch {
            Write-Warning "mini-swe-agent install failed (terminal tools may not work)"
        }
    } else {
        Write-Warning "mini-swe-agent not found (run: git submodule update --init)"
    }
    Write-Info "Installing tinker-atropos (RL training backend)..."
    if (Test-Path "tinker-atropos\pyproject.toml") {
        try {
            pip install -e ".\tinker-atropos" 2>&1 | Out-Null
            # pip is a native command: a failed install only sets $LASTEXITCODE,
            # so convert it into an exception to reach the warning branch.
            if ($LASTEXITCODE -ne 0) { throw "pip exited with code $LASTEXITCODE" }
            Write-Success "tinker-atropos installed"
        } catch {
            Write-Warning "tinker-atropos install failed (RL tools may not work)"
        }
    } else {
        Write-Warning "tinker-atropos not found (run: git submodule update --init)"
    }
    Pop-Location
-    Write-Success "Dependencies installed"
+    Write-Success "All dependencies installed"
 }
 function Set-PathVariable {
--- a/scripts/install.sh
+++ b/scripts/install.sh
@ -292,12 +292,13 @@ clone_repo() {
        fi
    else
        # Try SSH first (for private repo access), fall back to HTTPS
        # Use --recurse-submodules to also clone mini-swe-agent and tinker-atropos
        log_info "Trying SSH clone..."
-        if git clone --branch "$BRANCH" "$REPO_URL_SSH" "$INSTALL_DIR" 2>/dev/null; then
+        if git clone --branch "$BRANCH" --recurse-submodules "$REPO_URL_SSH" "$INSTALL_DIR" 2>/dev/null; then
            log_success "Cloned via SSH"
        else
            log_info "SSH failed, trying HTTPS..."
-            if git clone --branch "$BRANCH" "$REPO_URL_HTTPS" "$INSTALL_DIR"; then
+            if git clone --branch "$BRANCH" --recurse-submodules "$REPO_URL_HTTPS" "$INSTALL_DIR"; then
                log_success "Cloned via HTTPS"
            else
                log_error "Failed to clone repository"
@ -310,6 +311,12 @@ clone_repo() {
    fi
    cd "$INSTALL_DIR"
    # Ensure submodules are initialized and updated (for existing installs or if --recurse failed)
    log_info "Initializing submodules (mini-swe-agent, tinker-atropos)..."
    git submodule update --init --recursive
    log_success "Submodules ready"
    log_success "Repository ready"
 }
@ -343,10 +350,29 @@ install_deps() {
        source venv/bin/activate
    fi
-    # Install the package in editable mode with all extras
+    # Install the main package in editable mode with all extras
    pip install -e ".[all]" > /dev/null 2>&1 || pip install -e "." > /dev/null
-    log_success "Dependencies installed"
+    log_success "Main package installed"
    # Install submodules
    log_info "Installing mini-swe-agent (terminal tool backend)..."
    if [ -d "mini-swe-agent" ] && [ -f "mini-swe-agent/pyproject.toml" ]; then
        # Report success only when pip actually succeeded. A failed install is
        # non-fatal, but it must not be announced as "installed".
        if pip install -e "./mini-swe-agent" > /dev/null 2>&1; then
            log_success "mini-swe-agent installed"
        else
            log_warn "mini-swe-agent install failed (terminal tools may not work)"
        fi
    else
        log_warn "mini-swe-agent not found (run: git submodule update --init)"
    fi
    log_info "Installing tinker-atropos (RL training backend)..."
    if [ -d "tinker-atropos" ] && [ -f "tinker-atropos/pyproject.toml" ]; then
        # Report success only when pip actually succeeded. A failed install is
        # non-fatal, but it must not be announced as "installed".
        if pip install -e "./tinker-atropos" > /dev/null 2>&1; then
            log_success "tinker-atropos installed"
        else
            log_warn "tinker-atropos install failed (RL tools may not work)"
        fi
    else
        log_warn "tinker-atropos not found (run: git submodule update --init)"
    fi
    log_success "All dependencies installed"
 }
 setup_path() {
--- a/tools/rl_training_tool.py
+++ b/tools/rl_training_tool.py
@ -37,6 +37,7 @@ import subprocess
 import sys
 import time
 import uuid
 from datetime import datetime
 import yaml
 from dataclasses import dataclass, field
 from pathlib import Path
@ -84,6 +85,7 @@ LOCKED_FIELDS = {
            "weight": 1.0,
            "num_requests_for_eval": 256,
            "timeout": 3600,
            "server_type": "sglang",  # Tinker uses sglang for actual training
        }
    ],
    "tinker": {
@ -211,6 +213,9 @@ def _scan_environments() -> List[EnvironmentInfo]:
 def _get_env_config_fields(env_file_path: str) -> Dict[str, Dict[str, Any]]:
    """
    Dynamically import an environment and extract its config fields.
    Uses config_init() to get the actual config class, with fallback to
    directly importing BaseEnvConfig if config_init fails.
    """
    try:
        # Load the environment module
@ -230,15 +235,38 @@ def _get_env_config_fields(env_file_path: str) -> Dict[str, Dict[str, Any]]:
        if not env_class:
            return {}
-        # Call config_init to get the actual config
+        # Try calling config_init to get the actual config class
-        env_config, server_configs = env_class.config_init()
+        config_class = None
-        config_class = type(env_config)
+        try:
            env_config, server_configs = env_class.config_init()
            config_class = type(env_config)
        except Exception as config_error:
            # Fallback: try to import BaseEnvConfig directly from atroposlib
            print(f"Note: config_init failed ({config_error}), using BaseEnvConfig defaults")
            try:
                from atroposlib.envs.base import BaseEnvConfig
                config_class = BaseEnvConfig
            except ImportError:
                return {}
        if not config_class:
            return {}
        # Helper to make values JSON-serializable (handle enums, etc.)
        def make_serializable(val):
            if val is None:
                return None
            if hasattr(val, 'value'):  # Enum
                return val.value
            if hasattr(val, 'name') and hasattr(val, '__class__') and 'Enum' in str(type(val)):
                return val.name
            return val
        # Extract fields from the Pydantic model
        fields = {}
        for field_name, field_info in config_class.model_fields.items():
            field_type = field_info.annotation
-            default = field_info.default
+            default = make_serializable(field_info.default)
            description = field_info.description or ""
            is_locked = field_name in LOCKED_FIELD_NAMES
@ -248,12 +276,15 @@ def _get_env_config_fields(env_file_path: str) -> Dict[str, Dict[str, Any]]:
            if hasattr(field_type, "__origin__"):
                type_name = str(field_type)
            locked_value = LOCKED_FIELDS.get("env", {}).get(field_name, default)
            current_value = make_serializable(locked_value) if is_locked else default
            fields[field_name] = {
                "type": type_name,
-                "default": default if default is not None else None,
+                "default": default,
                "description": description,
                "locked": is_locked,
-                "current_value": LOCKED_FIELDS.get("env", {}).get(field_name, default) if is_locked else default,
+                "current_value": current_value,
            }
        return fields
@ -315,7 +346,7 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
        trainer_log_file = open(trainer_log, "w")
        run_state.trainer_process = subprocess.Popen(
-            ["python", "launch_training.py", "--config", str(config_path)],
+            [sys.executable, "launch_training.py", "--config", str(config_path)],
            stdout=trainer_log_file,
            stderr=subprocess.STDOUT,
            cwd=str(TINKER_ATROPOS_ROOT),
@ -355,7 +386,7 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
        env_log_file = open(env_log, "w")
        run_state.env_process = subprocess.Popen(
-            ["python", str(env_info.file_path), "serve", "--config", str(config_path)],
+            [sys.executable, str(env_info.file_path), "serve", "--config", str(config_path)],
            stdout=env_log_file,
            stderr=subprocess.STDOUT,
            cwd=str(TINKER_ATROPOS_ROOT),
@ -543,17 +574,14 @@ async def rl_select_environment(name: str) -> str:
        if not field_info.get("locked", False):
            _current_config[field_name] = field_info.get("default")
-    configurable_count = sum(1 for f in config_fields.values() if not f.get("locked", False))
+    # Auto-set wandb_name to "{env_name}-DATETIME" to avoid overlaps
-    locked_count = sum(1 for f in config_fields.values() if f.get("locked", False))
+    timestamp = datetime.now().strftime("%Y%m%d-%H%M%S")
    _current_config["wandb_name"] = f"{name}-{timestamp}"
    return json.dumps({
        "message": f"Selected environment: {name}",
        "environment": name,
        "file_path": env_info.file_path,
        "configurable_fields": configurable_count,
        "locked_fields": locked_count,
        "config": _current_config,
        "tip": f"Use rl_get_current_config() to see all {configurable_count} configurable fields.",
    }, indent=2)
@ -961,10 +989,11 @@ async def rl_list_runs() -> str:
 # ============================================================================
 # Test models at different scales for robustness testing
 # These are cheap, capable models on OpenRouter for testing parsing/scoring
 TEST_MODELS = [
    {"id": "qwen/qwen3-8b", "name": "Qwen3 8B", "scale": "small"},
-    {"id": "zhipu-ai/glm-4-flash", "name": "GLM-4 Flash", "scale": "medium"},
+    {"id": "z-ai/glm-4.7-flash", "name": "GLM-4.7 Flash", "scale": "medium"},
-    {"id": "minimax/minimax-m1", "name": "MiniMax M1", "scale": "large"},
+    {"id": "minimax/minimax-m2.1", "name": "MiniMax M2.1", "scale": "large"},
 ]
 # Default test parameters - quick but representative
@ -1066,18 +1095,35 @@ async def rl_test_inference(
        # Build the process command using Atropos's built-in CLI
        # This runs the environment's actual code with OpenRouter as the inference backend
        # We pass our locked settings + test-specific overrides via CLI args
        cmd = [
-            "python", env_info.file_path, "process",
+            sys.executable, env_info.file_path, "process",
            # Test-specific overrides
            "--env.total_steps", str(num_steps),
            "--env.group_size", str(group_size),
-            "--env.use_wandb", "false",
+            "--env.use_wandb", "false",  # No wandb for quick tests
            "--env.data_path_to_save_groups", str(output_file),
            # Use locked settings from our config
            "--env.tokenizer_name", LOCKED_FIELDS["env"]["tokenizer_name"],
            "--env.max_token_length", str(LOCKED_FIELDS["env"]["max_token_length"]),
            "--env.max_num_workers", str(LOCKED_FIELDS["env"]["max_num_workers"]),
            "--env.max_batches_offpolicy", str(LOCKED_FIELDS["env"]["max_batches_offpolicy"]),
            # OpenRouter config for inference testing
            # IMPORTANT: Use server_type=openai for OpenRouter (not sglang)
            # sglang is only for actual training with Tinker's inference server
            "--openai.base_url", "https://openrouter.ai/api/v1",
            "--openai.api_key", api_key,
            "--openai.model_name", model_id,
            "--openai.server_type", "openai",  # OpenRouter is OpenAI-compatible
            "--openai.health_check", "false",  # OpenRouter doesn't have health endpoint
        ]
-        print(f"Running: python {Path(env_info.file_path).name} process ...")
+        # Debug: Print the full command
        cmd_str = " ".join(str(c) for c in cmd)
        # Hide API key in printed output
        cmd_display = cmd_str.replace(api_key, "***API_KEY***")
        print(f"Command: {cmd_display}")
        print(f"Working dir: {TINKER_ATROPOS_ROOT}")
        print(f"  {num_steps} steps × {group_size} completions = {total_rollouts_per_model} rollouts")
        model_results = {
@ -1105,12 +1151,44 @@ async def rl_test_inference(
                timeout=600,  # 10 minute timeout per model
            )
            # Decode output
            stdout_text = stdout.decode() if stdout else ""
            stderr_text = stderr.decode() if stderr else ""
            # Write logs to files for inspection outside CLI
            log_file = test_output_dir / f"test_{_current_env}_{model_safe_name}.log"
            with open(log_file, "w") as f:
                f.write(f"Command: {cmd_display}\n")
                f.write(f"Working dir: {TINKER_ATROPOS_ROOT}\n")
                f.write(f"Return code: {process.returncode}\n")
                f.write(f"\n{'='*60}\n")
                f.write(f"STDOUT:\n{'='*60}\n")
                f.write(stdout_text or "(empty)\n")
                f.write(f"\n{'='*60}\n")
                f.write(f"STDERR:\n{'='*60}\n")
                f.write(stderr_text or "(empty)\n")
            print(f"  Log file: {log_file}")
            # Print to console for immediate debugging
            if stdout_text.strip():
                print(f"\n--- STDOUT ---")
                print(stdout_text[-2000:])  # Last 2000 chars
            if stderr_text.strip():
                print(f"\n--- STDERR ---")
                print(stderr_text[-2000:])  # Last 2000 chars
            if process.returncode != 0:
                model_results["error"] = f"Process exited with code {process.returncode}"
-                model_results["stderr"] = stderr.decode()[-1000:]
+                model_results["stderr"] = stderr_text[-1000:]
-                print(f"  Error: {model_results['error']}")
+                model_results["stdout"] = stdout_text[-1000:]
                model_results["log_file"] = str(log_file)
                print(f"\n  ❌ Error: {model_results['error']}")
            else:
-                print(f"  Process completed successfully")
+                print(f"\n  ✅ Process completed successfully")
                print(f"  Output file: {output_file}")
                print(f"  File exists: {output_file.exists()}")
                # Parse the output JSONL file
                if output_file.exists():