Add support for Atropos agentic RL environments (currently requires the `tool_call_support` branch of Atropos)

- Added new environments for reinforcement learning, including `HermesSweEnv` for software engineering tasks and `TerminalTestEnv` for inline testing.
- Introduced `ToolContext` for unrestricted access to tools during reward computation.
- Updated `.gitignore` to exclude the `wandb/` directory.
- Enhanced `README.md` with detailed architecture and usage instructions for Atropos environments.
- Added configuration files for the SWE and terminal test environments to streamline setup.
- Removed unnecessary compiled Python files from `__pycache__`.
2026-02-07 09:17:16 +00:00 · 2026-02-07 09:17:16 +00:00 · 07b615e96e
commit 07b615e96e
parent ac79725923
30 changed files with 2851 additions and 965 deletions
--- a/environments/__init__.py
+++ b/environments/__init__.py
@ -0,0 +1,28 @@
+"""
+Hermes-Agent Atropos Environments
+
+Provides a layered integration between hermes-agent's tool-calling capabilities
+and the Atropos RL training framework.
+
+Layers:
+    - agent_loop: Reusable multi-turn agent loop with standard OpenAI-spec tool calling
+    - tool_context: Per-rollout tool access handle for reward/verification functions
+    - hermes_base_env: Abstract base environment (BaseEnv subclass) for Atropos
+    - tool_call_parsers: Client-side tool call parser registry for Phase 2 (VLLM /generate)
+
+Concrete environments:
+    - terminal_test_env: Simple file-creation tasks for testing the stack
+    - hermes_swe_env: SWE-bench style tasks with Modal sandboxes
+"""
+
+from environments.agent_loop import AgentResult, HermesAgentLoop
+from environments.tool_context import ToolContext
+from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
+
+# Names re-exported at package level; each one is imported above from the
+# submodule that defines it.
+__all__ = [
+    "AgentResult",
+    "HermesAgentLoop",
+    "ToolContext",
+    "HermesAgentBaseEnv",
+    "HermesAgentEnvConfig",
+]
--- a/environments/agent_loop.py
+++ b/environments/agent_loop.py
@ -0,0 +1,306 @@
+"""
+HermesAgentLoop -- Reusable Multi-Turn Agent Engine
+
+Runs the hermes-agent tool-calling loop using standard OpenAI-spec tool calling.
+Works with any server that returns ChatCompletion objects with tool_calls:
+    - Phase 1: OpenAI server type (VLLM, SGLang, OpenRouter, OpenAI API)
+    - Phase 2: ManagedServer with client-side tool call parser
+
+The loop passes tools= and checks response.choices[0].message.tool_calls,
+identical to hermes-agent's run_agent.py. Tool execution is dispatched via
+handle_function_call() from model_tools.py.
+"""
+
+import json
+import logging
+import uuid
+from dataclasses import dataclass, field
+from typing import Any, Dict, List, Optional, Set
+
+from model_tools import handle_function_call
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class AgentResult:
+    """
+    Outcome of a single HermesAgentLoop.run() call.
+
+    Bundles the conversation with bookkeeping about how the loop ended.
+    """
+
+    # Full conversation history in OpenAI message format. This is the same
+    # list object that was handed to run(), which appends to it in place.
+    messages: List[Dict[str, Any]]
+    # Return value of server.get_state() when the server exposes that method
+    # (ManagedServer, Phase 2); None otherwise.
+    managed_state: Optional[Dict[str, Any]] = None
+    # How many LLM calls were attempted, including a final call that raised
+    # or came back without choices.
+    turns_used: int = 0
+    # True if the model stopped calling tools naturally; False when the loop
+    # ended on max_turns, an API error, or an empty response.
+    finished_naturally: bool = False
+    # Extracted reasoning content per turn (from PR #297 helpers); an entry is
+    # None when no reasoning was found for that turn.
+    reasoning_per_turn: List[Optional[str]] = field(default_factory=list)
+
+
+def _extract_reasoning_from_message(message) -> Optional[str]:
+    """
+    Pull the reasoning text out of a ChatCompletion assistant message.
+
+    Providers expose reasoning under different attributes, so they are probed
+    in a fixed priority order and the first non-empty value wins:
+    1. message.reasoning_content
+    2. message.reasoning
+    3. message.reasoning_details[].text (OpenRouter style; entries may be
+       objects with a .text attribute or plain dicts with a "text" key)
+
+    <think> blocks embedded in message.content are deliberately not parsed
+    here.
+
+    Args:
+        message: The assistant message from a ChatCompletion response
+
+    Returns:
+        The first non-empty reasoning text found, or None if there is none
+    """
+    # Flat string attributes, highest priority first.
+    for attr_name in ("reasoning_content", "reasoning"):
+        value = getattr(message, attr_name, None)
+        if value:
+            return value
+
+    # Structured details list (OpenRouter style).
+    details = getattr(message, "reasoning_details", None)
+    if not details:
+        return None
+
+    for entry in details:
+        text = getattr(entry, "text", None)
+        if text:
+            return text
+        if isinstance(entry, dict) and entry.get("text"):
+            return entry["text"]
+
+    return None
+
+
+class HermesAgentLoop:
+    """
+    Runs hermes-agent's tool-calling loop using standard OpenAI-spec tool calling.
+
+    Each turn:
+    - Pass tools= to server.chat_completion()
+    - Check response.choices[0].message.tool_calls
+    - Dispatch every call via handle_function_call()
+    - Append the tool results to the conversation and ask again
+
+    The loop ends when the model answers without tool calls, when max_turns
+    is reached, or when the server call raises / returns no choices.
+
+    Works with any server object exposing an async chat_completion() that
+    returns ChatCompletion-shaped responses; the server determines how
+    tool_calls get populated on the response.
+    """
+
+    def __init__(
+        self,
+        server,
+        tool_schemas: List[Dict[str, Any]],
+        valid_tool_names: Set[str],
+        max_turns: int = 30,
+        task_id: Optional[str] = None,
+        temperature: float = 1.0,
+        max_tokens: Optional[int] = None,
+    ):
+        """
+        Initialize the agent loop.
+
+        Args:
+            server: Server object with chat_completion() method (OpenAIServer,
+                    ManagedServer, ServerManager, etc.)
+            tool_schemas: OpenAI-format tool definitions from get_tool_definitions()
+            valid_tool_names: Set of tool names the model is allowed to call
+            max_turns: Maximum number of LLM calls before stopping
+            task_id: ID forwarded to handle_function_call() for session
+                     isolation; a random UUID is generated when omitted
+            temperature: Sampling temperature for generation
+            max_tokens: Max tokens per generation (None for server default)
+        """
+        self.server = server
+        self.tool_schemas = tool_schemas
+        self.valid_tool_names = valid_tool_names
+        self.max_turns = max_turns
+        self.task_id = task_id or str(uuid.uuid4())
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+
+    async def run(self, messages: List[Dict[str, Any]]) -> AgentResult:
+        """
+        Execute the full agent loop using standard OpenAI tool calling.
+
+        Args:
+            messages: Initial conversation messages (system + user).
+                      Modified in-place as the conversation progresses.
+
+        Returns:
+            AgentResult with full conversation history, managed state, and
+            metadata. API failures and empty responses do not raise; they end
+            the loop with finished_naturally=False.
+        """
+        reasoning_per_turn: List[Optional[str]] = []
+
+        for turn in range(self.max_turns):
+            turn_number = turn + 1
+
+            chat_kwargs: Dict[str, Any] = {
+                "messages": messages,
+                "n": 1,
+                "temperature": self.temperature,
+            }
+            # Only pass tools if we have them
+            if self.tool_schemas:
+                chat_kwargs["tools"] = self.tool_schemas
+            # Only pass max_tokens if explicitly set
+            if self.max_tokens is not None:
+                chat_kwargs["max_tokens"] = self.max_tokens
+
+            # Make the API call -- standard OpenAI spec
+            try:
+                response = await self.server.chat_completion(**chat_kwargs)
+            except Exception as e:
+                logger.error("API call failed on turn %d: %s", turn_number, e)
+                return self._result(messages, reasoning_per_turn, turn_number, False)
+
+            if not response or not response.choices:
+                logger.warning("Empty response on turn %d", turn_number)
+                return self._result(messages, reasoning_per_turn, turn_number, False)
+
+            assistant_msg = response.choices[0].message
+
+            # Extract reasoning content from the response (all provider formats)
+            reasoning = _extract_reasoning_from_message(assistant_msg)
+            reasoning_per_turn.append(reasoning)
+
+            # Build the assistant message dict for conversation history
+            tool_calls = assistant_msg.tool_calls
+            msg_dict: Dict[str, Any] = {
+                "role": "assistant",
+                "content": assistant_msg.content or "",
+            }
+            if tool_calls:
+                msg_dict["tool_calls"] = [
+                    {
+                        "id": tc.id,
+                        "type": "function",
+                        "function": {
+                            "name": tc.function.name,
+                            "arguments": tc.function.arguments,
+                        },
+                    }
+                    for tc in tool_calls
+                ]
+            # Preserve reasoning_content for multi-turn chat template handling
+            # (e.g., Kimi-K2's template renders <think> blocks differently
+            # for history vs. the latest turn based on this field)
+            if reasoning:
+                msg_dict["reasoning_content"] = reasoning
+            messages.append(msg_dict)
+
+            if not tool_calls:
+                # No tool calls -- model is done
+                logger.debug(
+                    "Turn %d: model finished naturally (no tool calls)", turn_number
+                )
+                return self._result(messages, reasoning_per_turn, turn_number, True)
+
+            # Execute each tool call and add its response to the conversation
+            for tc in tool_calls:
+                messages.append(
+                    {
+                        "role": "tool",
+                        "tool_call_id": tc.id,
+                        "content": self._execute_tool_call(tc, turn_number),
+                    }
+                )
+
+            logger.debug(
+                "Turn %d: %d tool calls executed", turn_number, len(tool_calls)
+            )
+
+        # Hit max turns without the model stopping
+        logger.info("Agent hit max_turns (%d) without finishing", self.max_turns)
+        return self._result(messages, reasoning_per_turn, self.max_turns, False)
+
+    def _result(
+        self,
+        messages: List[Dict[str, Any]],
+        reasoning_per_turn: List[Optional[str]],
+        turns_used: int,
+        finished_naturally: bool,
+    ) -> AgentResult:
+        """Assemble the AgentResult for the current loop state."""
+        return AgentResult(
+            messages=messages,
+            managed_state=self._get_managed_state(),
+            turns_used=turns_used,
+            finished_naturally=finished_naturally,
+            reasoning_per_turn=reasoning_per_turn,
+        )
+
+    def _execute_tool_call(self, tc, turn_number: int):
+        """
+        Run one tool call and return the content for its "tool" message.
+
+        Unknown tool names and exceptions raised by the tool are reported to
+        the model as a JSON {"error": ...} string instead of being raised.
+        """
+        tool_name = tc.function.name
+
+        # Validate tool name
+        if tool_name not in self.valid_tool_names:
+            logger.warning(
+                "Model called unknown tool '%s' on turn %d",
+                tool_name,
+                turn_number,
+            )
+            return json.dumps(
+                {
+                    "error": f"Unknown tool '{tool_name}'. "
+                    f"Available tools: {sorted(self.valid_tool_names)}"
+                }
+            )
+
+        args = self._parse_tool_arguments(tool_name, tc.function.arguments)
+
+        # NOTE(review): handle_function_call() is invoked synchronously, so a
+        # slow tool holds the event loop for its whole duration -- confirm
+        # whether it should be offloaded to a worker thread.
+        try:
+            return handle_function_call(tool_name, args, task_id=self.task_id)
+        except Exception as e:
+            logger.error("Tool '%s' execution failed: %s", tool_name, e)
+            return json.dumps({"error": f"Tool execution failed: {str(e)}"})
+
+    @staticmethod
+    def _parse_tool_arguments(tool_name: str, raw_arguments: Any) -> Dict[str, Any]:
+        """
+        Decode a tool call's arguments into a dict, never raising.
+
+        Falls back to {} (the call is still dispatched, best effort) when the
+        payload is missing (None), is not valid JSON, or decodes to something
+        other than a JSON object. An already-decoded dict is passed through.
+        """
+        if isinstance(raw_arguments, dict):
+            return raw_arguments
+
+        try:
+            parsed = json.loads(raw_arguments)
+        except (TypeError, ValueError):
+            # TypeError: arguments was None / not a string.
+            # ValueError: covers json.JSONDecodeError for malformed JSON.
+            logger.warning(
+                "Invalid JSON in tool call arguments for '%s': %s",
+                tool_name,
+                str(raw_arguments)[:200],
+            )
+            return {}
+
+        if not isinstance(parsed, dict):
+            logger.warning(
+                "Tool call arguments for '%s' are not a JSON object: %s",
+                tool_name,
+                str(raw_arguments)[:200],
+            )
+            return {}
+
+        return parsed
+
+    def _get_managed_state(self) -> Optional[Dict[str, Any]]:
+        """
+        Get ManagedServer state if the server supports it.
+
+        Returns whatever server.get_state() returns, or None if the server
+        has no get_state() method (e.g., regular OpenAI server).
+        """
+        if hasattr(self.server, "get_state"):
+            return self.server.get_state()
+        return None
--- a/environments/configs/swe_default.yaml
+++ b/environments/configs/swe_default.yaml
@ -0,0 +1,33 @@
+# SWE Environment -- Default Configuration
+#
+# SWE-bench style tasks with Modal sandboxes for cloud isolation.
+# Uses terminal + file + web toolsets.
+#
+# Usage:
+#   python environments/hermes_swe_env.py serve --config environments/configs/swe_default.yaml
+
+env:
+  enabled_toolsets: ["terminal", "file", "web"]
+  max_agent_turns: 30
+  max_token_length: 4096
+  group_size: 4
+  terminal_backend: "modal"
+  tool_call_parser: "hermes"
+  tokenizer_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
+  dataset_name: "bigcode/humanevalpack"
+  dataset_split: "test"
+  prompt_field: "prompt"
+  steps_per_eval: 50
+  total_steps: 500
+  use_wandb: true
+  wandb_name: "hermes-swe"
+  system_prompt: >
+    You are a skilled software engineer. You have access to a terminal,
+    file tools, and web search. Use these tools to complete the coding task.
+    Write clean, working code and verify it runs correctly before finishing.
+
+openai:
+  base_url: "http://localhost:8000/v1"
+  model_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
+  server_type: "openai"
+  api_key: ""
--- a/environments/configs/terminal_test_default.yaml
+++ b/environments/configs/terminal_test_default.yaml
@ -0,0 +1,35 @@
+# Terminal Test Environment -- Default Configuration
+#
+# Simple file-creation tasks for validating the full Atropos + hermes-agent stack.
+# Uses the Modal terminal backend and OpenRouter (Claude) for inference.
+# API keys are loaded from the .env file at the hermes-agent repo root.
+#
+# Usage:
+#   run-api
+#   python environments/terminal_test_env.py serve
+#   # Or with config file:
+#   python environments/terminal_test_env.py serve --config environments/configs/terminal_test_default.yaml
+
+env:
+  enabled_toolsets: ["terminal", "file"]
+  max_agent_turns: 10
+  max_token_length: 2048
+  group_size: 3
+  total_steps: 3
+  steps_per_eval: 3
+  terminal_backend: "modal"
+  tool_call_parser: "hermes"
+  tokenizer_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
+  ensure_scores_are_not_same: false
+  use_wandb: false
+  system_prompt: >
+    You are a helpful assistant with access to a terminal and file tools.
+    Complete the user's request by using the available tools.
+    Be precise and follow instructions exactly.
+
+openai:
+  base_url: "https://openrouter.ai/api/v1"
+  model_name: "anthropic/claude-opus-4.6"
+  server_type: "openai"
+  health_check: false
+  # api_key loaded from OPENROUTER_API_KEY in .env
--- a/environments/hermes_base_env.py
+++ b/environments/hermes_base_env.py
@ -0,0 +1,540 @@
+"""
+HermesAgentBaseEnv -- Abstract Base Environment for Hermes-Agent + Atropos
+
+Provides the Atropos integration plumbing that all hermes-agent environments share:
+- Two-mode operation (OpenAI server for Phase 1, VLLM ManagedServer for Phase 2)
+- Per-group toolset/distribution resolution
+- Agent loop orchestration via HermesAgentLoop
+- ToolContext creation for reward functions
+- ScoredDataGroup construction from ManagedServer state
+
+Subclasses only need to implement:
+    setup()           -- Load dataset, initialize state
+    get_next_item()   -- Return the next item from the dataset
+    format_prompt()   -- Convert a dataset item into the user message
+    compute_reward()  -- Score the rollout (has full ToolContext access)
+    evaluate()        -- Periodic evaluation
+"""
+
+import asyncio
+import json
+import logging
+import os
+import sys
+import uuid
+from abc import abstractmethod
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Set, Tuple, Union
+
+# Ensure the hermes-agent repo root is on sys.path so that imports like
+# `from model_tools import ...` and `from environments.X import ...` work
+# regardless of where the script is invoked from.
+_repo_root = Path(__file__).resolve().parent.parent
+if str(_repo_root) not in sys.path:
+    sys.path.insert(0, str(_repo_root))
+
+from dotenv import load_dotenv
+from pydantic import Field
+
+# Load API keys from hermes-agent/.env so all environments can access them
+_env_path = _repo_root / ".env"
+if _env_path.exists():
+    load_dotenv(dotenv_path=_env_path)
+
+from atroposlib.envs.base import (
+    BaseEnv,
+    BaseEnvConfig,
+    ScoredDataGroup,
+    ScoredDataItem,
+)
+from atroposlib.envs.server_handling.server_manager import (
+    APIServerConfig,
+    ServerBaseline,
+    ServerManager,
+)
+from atroposlib.type_definitions import Item
+
+from environments.agent_loop import AgentResult, HermesAgentLoop
+from environments.tool_context import ToolContext
+
+# Import hermes-agent toolset infrastructure
+from model_tools import get_tool_definitions
+from toolset_distributions import sample_toolsets_from_distribution
+
+logger = logging.getLogger(__name__)
+
+
+class HermesAgentEnvConfig(BaseEnvConfig):
+    """
+    Configuration for hermes-agent Atropos environments.
+
+    Extends BaseEnvConfig with agent-specific settings for toolsets,
+    terminal backend, dataset loading, and tool call parsing.
+    """
+
+    # --- Toolset configuration ---
+    # Mutually exclusive: use either enabled_toolsets OR distribution.
+    # NOTE: no validator in this class enforces that. When both are set,
+    # HermesAgentBaseEnv._resolve_tools_for_group() uses `distribution` and
+    # ignores `enabled_toolsets`.
+    enabled_toolsets: Optional[List[str]] = Field(
+        default=None,
+        description="Explicit list of hermes toolsets to enable (e.g., ['terminal', 'file', 'web']). "
+        "If None and distribution is also None, all available toolsets are enabled.",
+    )
+    # Passed to get_tool_definitions() alongside the enabled/sampled toolsets.
+    disabled_toolsets: Optional[List[str]] = Field(
+        default=None,
+        description="Toolsets to disable. Applied as a filter on top of enabled_toolsets or distribution.",
+    )
+    # Name handed to sample_toolsets_from_distribution().
+    distribution: Optional[str] = Field(
+        default=None,
+        description="Name of a toolset distribution from toolset_distributions.py "
+        "(e.g., 'development', 'terminal_tasks'). Sampled once per group. "
+        "Mutually exclusive with enabled_toolsets.",
+    )
+
+    # --- Agent loop configuration ---
+    # Forwarded to HermesAgentLoop as max_turns.
+    max_agent_turns: int = Field(
+        default=30,
+        description="Maximum number of LLM calls (tool-calling iterations) per rollout.",
+    )
+    # When set, becomes the first ("system") message of every rollout.
+    system_prompt: Optional[str] = Field(
+        default=None,
+        description="System prompt for the agent. Tools are handled via the tools= parameter, "
+        "not embedded in the prompt text.",
+    )
+    # Forwarded to HermesAgentLoop as temperature.
+    agent_temperature: float = Field(
+        default=1.0,
+        description="Sampling temperature for agent generation during rollouts.",
+    )
+
+    # --- Terminal backend ---
+    # Exported as the TERMINAL_ENV environment variable by
+    # HermesAgentBaseEnv.__init__ when non-empty.
+    terminal_backend: str = Field(
+        default="local",
+        description="Terminal backend: 'local', 'docker', 'modal', 'ssh', 'singularity'. "
+        "Modal recommended for production RL (cloud isolation per rollout).",
+    )
+
+    # --- Dataset ---
+    # NOTE(review): these three fields are not read by the base-environment
+    # code visible in this file; presumably subclasses consume them in
+    # setup() -- confirm.
+    dataset_name: Optional[str] = Field(
+        default=None,
+        description="HuggingFace dataset name. Optional if tasks are defined inline.",
+    )
+    dataset_split: str = Field(
+        default="train",
+        description="Dataset split to use.",
+    )
+    prompt_field: str = Field(
+        default="prompt",
+        description="Which field in the dataset contains the prompt.",
+    )
+
+    # --- Phase 2: Tool call parsing ---
+    # NOTE(review): where this name is looked up is not visible in this part
+    # of the file -- confirm against the tool_call_parsers registry.
+    tool_call_parser: str = Field(
+        default="hermes",
+        description="Tool call parser name for Phase 2 (VLLM server type). "
+        "Ignored in Phase 1 (OpenAI server type where VLLM parses natively). "
+        "Options: hermes, mistral, llama3_json, qwen, deepseek_v3, etc.",
+    )
+
+
+class HermesAgentBaseEnv(BaseEnv):
+    """
+    Abstract base environment for hermes-agent Atropos integration.
+
+    Handles two modes of operation:
+    - Phase 1 (OpenAI server type): Uses server.chat_completion() directly.
+      The server (VLLM, SGLang, OpenRouter, OpenAI) handles tool call parsing
+      and reasoning extraction natively. DummyManagedServer provides placeholder
+      tokens. Good for SFT data gen, verifier testing, evaluation.
+
+    - Phase 2 (VLLM server type): Uses ManagedServer for exact token IDs + logprobs
+      via /generate. Client-side tool call parser reconstructs structured tool_calls
+      from raw output. Full RL training capability.
+
+    Subclasses must implement:
+        setup()           -- Load dataset, initialize state
+        get_next_item()   -- Return the next item to roll out
+        format_prompt()   -- Convert a dataset item into the user message string
+        compute_reward()  -- Score the rollout using ToolContext
+        evaluate()        -- Periodic evaluation
+    """
+
+    name: Optional[str] = "hermes-agent"
+    env_config_cls = HermesAgentEnvConfig
+
+    def __init__(
+        self,
+        config: HermesAgentEnvConfig,
+        server_configs: Union[ServerBaseline, List[APIServerConfig]],
+        slurm=False,
+        testing=False,
+    ):
+        """
+        Initialize the base environment and publish the terminal backend.
+
+        Args:
+            config: Environment configuration; its terminal_backend is
+                    exported through the TERMINAL_ENV environment variable
+            server_configs: Forwarded unchanged to BaseEnv.__init__
+            slurm: Forwarded unchanged to BaseEnv.__init__
+            testing: Forwarded unchanged to BaseEnv.__init__
+        """
+        super().__init__(config, server_configs, slurm, testing)
+
+        # Set terminal backend environment variable so hermes tools pick it up.
+        # os.environ is process-wide, so the value is shared by everything
+        # running in this process, not just this environment instance.
+        if config.terminal_backend:
+            os.environ["TERMINAL_ENV"] = config.terminal_backend
+
+        # Current group's resolved tools as (tool_schemas, valid_tool_names);
+        # None until collect_trajectories() has run for the first time.
+        self._current_group_tools: Optional[Tuple[List[Dict], Set[str]]] = None
+
+    # =========================================================================
+    # Toolset resolution (per-group)
+    # =========================================================================
+
+    def _resolve_tools_for_group(self) -> Tuple[List[Dict[str, Any]], Set[str]]:
+        """
+        Work out which tools a rollout group may use.
+
+        Selection rules:
+        - config.distribution set: toolsets are sampled from that distribution
+        - otherwise: config.enabled_toolsets is used as-is (None is passed
+          through and means "all available")
+        - config.disabled_toolsets is always passed along as a filter
+
+        Returns:
+            (tool_schemas, valid_tool_names) tuple, where valid_tool_names is
+            the set of function names found in tool_schemas
+        """
+        cfg = self.config
+        distribution_name = cfg.distribution
+
+        if not distribution_name:
+            selected_toolsets = cfg.enabled_toolsets  # None means "all available"
+        else:
+            selected_toolsets = sample_toolsets_from_distribution(distribution_name)
+            logger.info(
+                "Sampled toolsets from '%s': %s", distribution_name, selected_toolsets
+            )
+
+        tool_schemas = get_tool_definitions(
+            enabled_toolsets=selected_toolsets,
+            disabled_toolsets=cfg.disabled_toolsets,
+            quiet_mode=True,
+        )
+
+        # An empty/None schema list yields an empty name set.
+        if tool_schemas:
+            tool_names = {schema["function"]["name"] for schema in tool_schemas}
+        else:
+            tool_names = set()
+
+        logger.info(
+            "Resolved %d tools for group: %s", len(tool_names), sorted(tool_names)
+        )
+        return tool_schemas, tool_names
+
+    # =========================================================================
+    # Server mode detection
+    # =========================================================================
+
+    def _use_managed_server(self) -> bool:
+        """
+        Decide between ManagedServer mode (Phase 2) and direct mode (Phase 1).
+
+        Only the first configured server is inspected:
+        - no servers configured        -> False (direct mode)
+        - first server is OpenAIServer -> False (direct mode, uses
+          /v1/chat/completions with native tool call parsing)
+        - any other server class       -> True (ManagedServer mode, e.g. the
+          'vllm' / 'sglang' server types that go through /generate)
+        """
+        configured_servers = self.server.servers
+        if not configured_servers:
+            return False
+
+        # Imported lazily, only once there is a server to classify.
+        from atroposlib.envs.server_handling.openai_server import OpenAIServer
+
+        first_server = configured_servers[0]
+        return not isinstance(first_server, OpenAIServer)
+
+    # =========================================================================
+    # Core Atropos integration
+    # =========================================================================
+
+    async def collect_trajectories(
+        self, item: Item
+    ) -> Tuple[
+        Union[Optional[ScoredDataGroup], List[Optional[ScoredDataGroup]]],
+        List[Item],
+    ]:
+        """
+        Override collect_trajectories to resolve toolsets once per group,
+        then delegate to the standard group-level collection.
+
+        The default BaseEnv.collect_trajectories() calls collect_trajectory()
+        group_size times in parallel. We resolve tools once here and store
+        them for all those calls to use.
+
+        Args:
+            item: The dataset item to roll out; passed through unchanged to
+                  BaseEnv.collect_trajectories()
+
+        Returns:
+            Whatever BaseEnv.collect_trajectories() returns for this item
+        """
+        # Resolve toolsets for this group (shared by all rollouts in the group).
+        # NOTE(review): the result lives on the instance, so if this coroutine
+        # is ever awaited for several groups concurrently, a later group would
+        # overwrite the tools an earlier group's rollouts are about to read --
+        # confirm how BaseEnv schedules groups.
+        self._current_group_tools = self._resolve_tools_for_group()
+
+        # Delegate to the default implementation which calls collect_trajectory()
+        # group_size times via asyncio.gather
+        return await super().collect_trajectories(item)
+
+    # =========================================================================
+    # Wandb rollout display -- format trajectories nicely
+    # =========================================================================
+
+    @staticmethod
+    def _format_trajectory_for_display(messages: List[Dict[str, Any]]) -> str:
+        """
+        Format a conversation's messages into a readable trajectory string
+        for wandb rollout tables. Shows tool calls, tool results, and reasoning
+        in a structured way instead of raw token decoding.
+
+        Null or non-string fields (e.g. "content": None on an assistant
+        message that only carries tool calls, "tool_calls": None, or missing
+        arguments) are tolerated rather than raising.
+
+        Args:
+            messages: Conversation in OpenAI message format
+
+        Returns:
+            One section per displayed element, separated by blank lines.
+            Reasoning is truncated to 300 characters, tool call arguments to
+            200, and tool results to 500. Messages with an unrecognized role
+            are omitted.
+        """
+
+        def _as_text(value: Any, default: str = "") -> str:
+            # Null fields become the default; anything else is stringified.
+            if value is None:
+                return default
+            return value if isinstance(value, str) else str(value)
+
+        def _clip(text: str, limit: int) -> str:
+            # Truncate long text for display.
+            return text[:limit] + "..." if len(text) > limit else text
+
+        parts = []
+        for msg in messages:
+            role = msg.get("role", "unknown")
+            content = _as_text(msg.get("content"))
+
+            if role == "system":
+                parts.append(f"[SYSTEM]\n{content}")
+
+            elif role == "user":
+                parts.append(f"[USER]\n{content}")
+
+            elif role == "assistant":
+                # Show reasoning if present
+                reasoning = _as_text(msg.get("reasoning_content"))
+                if reasoning:
+                    parts.append(f"[ASSISTANT thinking]\n{_clip(reasoning, 300)}")
+
+                # Show content
+                if content:
+                    parts.append(f"[ASSISTANT]\n{content}")
+
+                # Show tool calls ("tool_calls" may be present but null)
+                for tc in msg.get("tool_calls") or []:
+                    func = tc.get("function") or {}
+                    name = func.get("name", "?")
+                    args = _as_text(func.get("arguments", "{}"), default="{}")
+                    parts.append(f"[TOOL CALL] {name}({_clip(args, 200)})")
+
+            elif role == "tool":
+                parts.append(f"[TOOL RESULT] {_clip(content, 500)}")
+
+        return "\n\n".join(parts)
+
+    async def add_rollouts_for_wandb(
+        self,
+        scored_data,
+        item=None,
+    ):
+        """
+        Record one group of rollouts for the wandb rollout table.
+
+        Each kept rollout becomes a (text, score) pair. The text is the
+        formatted message trajectory when messages are available, otherwise
+        the decoded tokens, otherwise the placeholder "(no data)". The group
+        is appended to self.rollouts_for_wandb, and the oldest group is
+        dropped once more than config.num_rollouts_to_keep are stored.
+
+        Args:
+            scored_data: Mapping with "scores" and optionally "messages" /
+                         "tokens", indexed per rollout
+            item: Unused here
+        """
+        cfg = self.config
+        keep_count = cfg.num_rollouts_per_group_for_logging
+        if keep_count == -1:
+            # -1 means "keep the whole group"
+            keep_count = cfg.group_size
+
+        score_list = scored_data.get("scores", [])
+        rows = []
+        for idx in range(min(keep_count, len(score_list))):
+            # Prefer the structured messages for a rich display
+            all_messages = scored_data.get("messages")
+            rollout_messages = None
+            if all_messages and idx < len(all_messages):
+                rollout_messages = all_messages[idx]
+
+            all_tokens = scored_data.get("tokens")
+            if rollout_messages:
+                rendered = self._format_trajectory_for_display(rollout_messages)
+            elif all_tokens and idx < len(all_tokens):
+                rendered = self.tokenizer.decode(all_tokens[idx])
+            else:
+                rendered = "(no data)"
+
+            rows.append((rendered, scored_data["scores"][idx]))
+
+        self.rollouts_for_wandb.append(rows)
+        if len(self.rollouts_for_wandb) > cfg.num_rollouts_to_keep:
+            self.rollouts_for_wandb.pop(0)
+
+    async def collect_trajectory(
+        self, item: Item
+    ) -> Tuple[Optional[Union[ScoredDataItem, Any]], List[Item]]:
+        """
+        Run a single rollout: agent loop + reward computation.
+
+        This is called group_size times in parallel by collect_trajectories().
+        Each call gets its own task_id for terminal/browser session isolation.
+        """
+        task_id = str(uuid.uuid4())
+
+        # Get group-level tools (resolved once in collect_trajectories)
+        if self._current_group_tools is None:
+            # Fallback: resolve per-trajectory if called outside collect_trajectories
+            tools, valid_names = self._resolve_tools_for_group()
+        else:
+            tools, valid_names = self._current_group_tools
+
+        # Build initial messages
+        messages: List[Dict[str, Any]] = []
+        if self.config.system_prompt:
+            messages.append({"role": "system", "content": self.config.system_prompt})
+        messages.append({"role": "user", "content": self.format_prompt(item)})
+
+        # Run the agent loop
+        result: AgentResult
+        if self._use_managed_server():
+            # Phase 2: ManagedServer with parser -- exact tokens + logprobs
+            try:
+                async with self.server.managed_server(tokenizer=self.tokenizer) as managed:
+                    agent = HermesAgentLoop(
+                        server=managed,
+                        tool_schemas=tools,
+                        valid_tool_names=valid_names,
+                        max_turns=self.config.max_agent_turns,
+                        task_id=task_id,
+                        temperature=self.config.agent_temperature,
+                        max_tokens=self.config.max_token_length,
+                    )
+                    result = await agent.run(messages)
+            except NotImplementedError:
+                # DummyManagedServer not allowed -- fall back to Phase 1
+                logger.warning(
+                    "ManagedServer not available (OpenAI server?). "
+                    "Falling back to direct server mode."
+                )
+                agent = HermesAgentLoop(
+                    server=self.server,
+                    tool_schemas=tools,
+                    valid_tool_names=valid_names,
+                    max_turns=self.config.max_agent_turns,
+                    task_id=task_id,
+                    temperature=self.config.agent_temperature,
+                    max_tokens=self.config.max_token_length,
+                )
+                result = await agent.run(messages)
+        else:
+            # Phase 1: OpenAI server -- native tool_calls, placeholder tokens
+            agent = HermesAgentLoop(
+                server=self.server,
+                tool_schemas=tools,
+                valid_tool_names=valid_names,
+                max_turns=self.config.max_agent_turns,
+                task_id=task_id,
+                temperature=self.config.agent_temperature,
+                max_tokens=self.config.max_token_length,
+            )
+            result = await agent.run(messages)
+
+        # Compute reward using ToolContext (gives verifier full tool access)
+        ctx = ToolContext(task_id)
+        try:
+            reward = await self.compute_reward(item, result, ctx)
+        except Exception as e:
+            logger.error("compute_reward failed: %s", e)
+            reward = 0.0
+        finally:
+            ctx.cleanup()
+
+        # Build ScoredDataItem from ManagedServer state
+        # Phase 2: real tokens/masks/logprobs from SequenceNodes
+        # Phase 1: placeholder tokens (still need a valid ScoredDataItem for the pipeline)
+        nodes = (result.managed_state or {}).get("nodes", [])
+
+        if nodes:
+            # Phase 2 (or DummyManagedServer): use actual node data
+            node = nodes[-1]  # Final sequence node = full trajectory
+            scored_item: Dict[str, Any] = {
+                "tokens": node.tokens,
+                "masks": node.masked_tokens,
+                "scores": reward,
+            }
+
+            # Include logprobs if available (Phase 2)
+            if hasattr(node, "logprobs") and node.logprobs:
+                scored_item["advantages"] = None  # Computed by trainer
+                scored_item["ref_logprobs"] = None
+        else:
+            # Phase 1 with no managed state: create placeholder tokens
+            # so the data pipeline doesn't break. These are NOT suitable
+            # for training but allow process mode (SFT data gen) to work.
+            # Tokenize the full conversation to get approximate tokens.
+            full_text = "\n".join(
+                msg.get("content", "") for msg in result.messages if msg.get("content")
+            )
+            if self.tokenizer:
+                tokens = self.tokenizer.encode(full_text, add_special_tokens=True)
+            else:
+                tokens = list(range(min(len(full_text) // 4, 128)))
+
+            scored_item = {
+                "tokens": tokens,
+                "masks": [-100] + tokens[1:],  # Mask first token as prompt
+                "scores": reward,
+            }
+
+        # Always include messages for wandb rollout display and data logging
+        scored_item["messages"] = result.messages
+
+        return scored_item, []
+
+    # =========================================================================
+    # Abstract methods -- subclasses must implement
+    # =========================================================================
+
+    @abstractmethod
+    async def setup(self):
+        """
+        Load dataset, initialize state.
+
+        Called once when the environment starts. Typical implementation:
+            self.dataset = load_dataset(self.config.dataset_name, split=self.config.dataset_split)
+            self.iter = 0
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    async def get_next_item(self) -> Item:
+        """
+        Return the next item from the dataset for rollout.
+
+        Called by the base env's main loop to get items for workers.
+        Should cycle through the dataset.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def format_prompt(self, item: Item) -> str:
+        """
+        Convert a dataset item into the user message for the agent.
+
+        Args:
+            item: Dataset item (dict, tuple, etc.)
+
+        Returns:
+            The prompt string to send to the agent
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    async def compute_reward(
+        self, item: Item, result: AgentResult, ctx: ToolContext
+    ) -> float:
+        """
+        Score the rollout. Has full access to:
+        - item: the original dataset item (ground truth, test commands, etc.)
+        - result: AgentResult with full messages, turn count, reasoning, etc.
+        - ctx: ToolContext -- call ANY hermes-agent tool (terminal, file, web,
+               browser, vision...) scoped to this rollout's sandbox. Nothing
+               is off-limits.
+
+        Args:
+            item: The dataset item that was rolled out
+            result: The agent's rollout result
+            ctx: ToolContext with full tool access for verification
+
+        Returns:
+            Reward float (typically 0.0 to 1.0, but any float is valid)
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    async def evaluate(self, *args, **kwargs):
+        """
+        Periodic evaluation. Called every steps_per_eval steps.
+
+        Typical implementation runs the agent on a held-out eval set
+        and logs metrics via wandb/evaluate_log.
+        """
+        raise NotImplementedError
--- a/environments/hermes_swe_env.py
+++ b/environments/hermes_swe_env.py
@ -0,0 +1,229 @@
+"""
+HermesSweEnv -- SWE-Bench Style Environment with Modal Sandboxes
+
+A concrete environment for software engineering tasks where the model writes code
+and the reward function runs tests to verify correctness. Uses Modal terminal
+backend for cloud-isolated sandboxes per rollout.
+
+The reward function uses ToolContext.terminal() to run test commands in the same
+Modal sandbox the model used during its agentic loop. All filesystem state from
+the model's tool calls is preserved for verification.
+
+Usage:
+    # Phase 1: OpenAI server type
+    vllm serve YourModel --enable-auto-tool-choice --tool-call-parser hermes
+    run-api
+    python environments/hermes_swe_env.py serve \\
+        --openai.base_url http://localhost:8000/v1 \\
+        --openai.model_name YourModel \\
+        --openai.server_type openai \\
+        --env.dataset_name bigcode/humanevalpack \\
+        --env.terminal_backend modal
+
+    # Phase 2: VLLM server type (full RL training)
+    python environments/hermes_swe_env.py serve \\
+        --openai.base_url http://localhost:8000/v1 \\
+        --openai.model_name YourModel \\
+        --openai.server_type vllm \\
+        --env.tool_call_parser hermes \\
+        --env.terminal_backend modal
+"""
+
+import logging
+import sys
+import time
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+# Ensure repo root is on sys.path for imports
+_repo_root = Path(__file__).resolve().parent.parent
+if str(_repo_root) not in sys.path:
+    sys.path.insert(0, str(_repo_root))
+
+from datasets import load_dataset
+
+from atroposlib.envs.base import ScoredDataGroup
+from atroposlib.envs.server_handling.server_manager import APIServerConfig
+from atroposlib.type_definitions import Item
+
+from environments.agent_loop import AgentResult
+from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
+from environments.tool_context import ToolContext
+
+logger = logging.getLogger(__name__)
+
+
+class HermesSweEnvConfig(HermesAgentEnvConfig):
+    """Config with defaults for SWE-bench style tasks."""
+
+    pass  # Inherits all fields, overrides defaults in config_init
+
+
+class HermesSweEnv(HermesAgentBaseEnv):
+    """
+    SWE-bench style environment using Modal terminal backend.
+
+    The model gets a coding task, uses terminal + file + web tools to solve it,
+    and the reward function runs tests in the same Modal sandbox to verify.
+
+    Subclass this for specific SWE datasets (HumanEval, SWE-bench, etc.)
+    and customize format_prompt() and compute_reward() as needed.
+    """
+
+    name = "hermes-swe"
+    env_config_cls = HermesSweEnvConfig
+
+    @classmethod
+    def config_init(cls) -> Tuple[HermesSweEnvConfig, List[APIServerConfig]]:
+        """
+        Default configuration for the SWE environment.
+
+        Uses Modal terminal backend for cloud isolation and terminal + file + web toolsets.
+        """
+        env_config = HermesSweEnvConfig(
+            # Toolsets: terminal for running code, file for reading/writing, web for docs
+            enabled_toolsets=["terminal", "file", "web"],
+            disabled_toolsets=None,
+            distribution=None,
+            # Agent settings -- SWE tasks need more turns
+            max_agent_turns=30,
+            max_token_length=4096,
+            agent_temperature=1.0,
+            system_prompt=(
+                "You are a skilled software engineer. You have access to a terminal, "
+                "file tools, and web search. Use these tools to complete the coding task. "
+                "Write clean, working code and verify it runs correctly before finishing."
+            ),
+            # Modal backend for cloud-isolated sandboxes
+            terminal_backend="modal",
+            # Dataset -- override via CLI for your specific SWE dataset
+            dataset_name="bigcode/humanevalpack",
+            dataset_split="test",
+            prompt_field="prompt",
+            # Atropos settings
+            group_size=4,
+            tokenizer_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
+            tool_call_parser="hermes",
+            steps_per_eval=50,
+            total_steps=500,
+            use_wandb=True,
+            wandb_name="hermes-swe",
+        )
+
+        server_configs = [
+            APIServerConfig(
+                base_url="http://localhost:8000/v1",
+                model_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
+                server_type="openai",  # Phase 1; switch to "vllm" for Phase 2
+                api_key="",
+            )
+        ]
+
+        return env_config, server_configs
+
+    async def setup(self):
+        """Load the SWE dataset."""
+        if self.config.dataset_name:
+            self.dataset = load_dataset(
+                self.config.dataset_name, split=self.config.dataset_split
+            )
+        else:
+            # Placeholder if no dataset specified
+            self.dataset = []
+        self.iter = 0
+        self.reward_buffer: List[float] = []
+
+    async def get_next_item(self) -> Dict[str, Any]:
+        """Cycle through the SWE dataset."""
+        if not self.dataset:
+            raise ValueError("No dataset loaded. Set dataset_name in config.")
+        item = self.dataset[self.iter % len(self.dataset)]
+        self.iter += 1
+        return item
+
+    def format_prompt(self, item: Dict[str, Any]) -> str:
+        """
+        Format the SWE task prompt.
+
+        Override this in subclasses for different dataset formats.
+        Default assumes the dataset has a 'prompt' field and optionally a 'test' field.
+        """
+        prompt = item.get(self.config.prompt_field, "")
+
+        # If the dataset has test information, include it in the prompt
+        test_info = item.get("test", item.get("test_code", item.get("tests", "")))
+        if test_info:
+            prompt += f"\n\nTests to pass:\n{test_info}"
+
+        return prompt
+
+    async def compute_reward(
+        self, item: Dict[str, Any], result: AgentResult, ctx: ToolContext
+    ) -> float:
+        """
+        Score by running tests in the model's Modal sandbox.
+
+        Default implementation:
+        - If the dataset item has a 'test' or 'test_code' field, run it
+        - Check exit code: 0 = pass, non-zero = fail
+        - Partial credit for file creation
+
+        Override this in subclasses for more sophisticated reward logic.
+        """
+        # Find the test command from the dataset item
+        test_code = item.get("test", item.get("test_code", item.get("tests", "")))
+
+        if test_code:
+            # Run the test in the model's sandbox
+            test_result = ctx.terminal(
+                f'cd /workspace && python3 -c "{test_code}"', timeout=60
+            )
+
+            if test_result["exit_code"] == 0:
+                self.reward_buffer.append(1.0)
+                return 1.0
+
+        # Partial credit: check if the model created any Python files
+        file_check = ctx.terminal("find /workspace -name '*.py' -newer /tmp/.start_marker 2>/dev/null | head -5")
+        if file_check["exit_code"] == 0 and file_check.get("output", "").strip():
+            self.reward_buffer.append(0.1)
+            return 0.1
+
+        self.reward_buffer.append(0.0)
+        return 0.0
+
+    async def evaluate(self, *args, **kwargs):
+        """
+        Run evaluation on a held-out set.
+
+        Override for dataset-specific evaluation logic.
+        """
+        start_time = time.time()
+        end_time = time.time()
+
+        eval_metrics = {"eval/placeholder": 0.0}
+        await self.evaluate_log(
+            metrics=eval_metrics,
+            start_time=start_time,
+            end_time=end_time,
+        )
+
+    async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
+        """Log SWE-specific metrics."""
+        if wandb_metrics is None:
+            wandb_metrics = {}
+
+        if self.reward_buffer:
+            wandb_metrics["train/avg_reward"] = sum(self.reward_buffer) / len(
+                self.reward_buffer
+            )
+            wandb_metrics["train/pass_rate"] = sum(
+                1 for r in self.reward_buffer if r == 1.0
+            ) / len(self.reward_buffer)
+            self.reward_buffer = []
+
+        await super().wandb_log(wandb_metrics)
+
+
+if __name__ == "__main__":  # only when run as a script, not when imported
+    HermesSweEnv.cli()  # hand control to the environment's command-line interface
--- a/environments/terminal_test_env.py
+++ b/environments/terminal_test_env.py
@ -0,0 +1,292 @@
+"""
+TerminalTestEnv -- Simple Test Environment for Validating the Stack
+
+A self-contained environment with inline tasks (no external dataset needed).
+Each task asks the model to create a file at a known path with specific content.
+The reward verifier cats the file and checks if the content matches.
+
+Enables only terminal + file toolsets. Uses Modal terminal backend with
+OpenRouter (Claude) by default.
+
+Training tasks (3):
+    1. Create ~/greeting.txt with "Hello from Hermes Agent"
+    2. Create ~/count.txt with numbers 1-5, one per line
+    3. Create ~/answer.txt with the result of 123 + 456
+
+Eval task (1):
+    1. Create ~/result.txt with the result of 6 * 7
+
+Usage:
+    # Start Atropos API server
+    run-api
+
+    # Run environment (uses OpenRouter + Modal by default)
+    python environments/terminal_test_env.py serve
+
+    # Process mode (no run-api needed, saves to JSONL)
+    python environments/terminal_test_env.py process \\
+        --env.data_path_to_save_groups terminal_test_output.jsonl
+"""
+
+import logging
+import os
+import sys
+import time
+from pathlib import Path
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+# Ensure repo root is on sys.path for imports
+_repo_root = Path(__file__).resolve().parent.parent
+if str(_repo_root) not in sys.path:
+    sys.path.insert(0, str(_repo_root))
+
+from atroposlib.envs.base import ScoredDataGroup
+from atroposlib.envs.server_handling.server_manager import APIServerConfig
+from atroposlib.type_definitions import Item
+
+from environments.agent_loop import AgentResult
+from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
+from environments.tool_context import ToolContext
+
+logger = logging.getLogger(__name__)
+
+
+# =============================================================================
+# Inline task definitions -- no external dataset needed (prompt, path to check, expected text)
+# =============================================================================
+
+TRAIN_TASKS = [  # tasks served in rotation by get_next_item()
+    {
+        "prompt": "Create a file at ~/greeting.txt containing exactly the text: Hello from Hermes Agent",
+        "verify_path": "~/greeting.txt",
+        "expected_content": "Hello from Hermes Agent",
+    },
+    {
+        "prompt": "Create a file at ~/count.txt containing the numbers 1 through 5, one per line",
+        "verify_path": "~/count.txt",
+        "expected_content": "1\n2\n3\n4\n5",
+    },
+    {
+        "prompt": "Create a file at ~/answer.txt containing the result of 123 + 456",
+        "verify_path": "~/answer.txt",
+        "expected_content": "579",
+    },
+]
+
+EVAL_TASKS = [  # tasks iterated by evaluate()
+    {
+        "prompt": "Create a file at ~/result.txt containing the result of 6 * 7",
+        "verify_path": "~/result.txt",
+        "expected_content": "42",
+    },
+]
+
+
+class TerminalTestEnvConfig(HermesAgentEnvConfig):
+    """Config with defaults suitable for terminal testing."""
+
+    pass  # Inherits all fields, overrides defaults in config_init
+
+
+class TerminalTestEnv(HermesAgentBaseEnv):
+    """
+    Simple test environment with inline file-creation tasks.
+
+    All tasks follow the same pattern: "create a file at ~/X.txt with content Y".
+    The verifier runs `cat ~/X.txt` in the rollout's terminal and checks the output
+    against the expected string. Same verifier logic for all tasks.
+
+    This environment is designed to validate the full stack end-to-end:
+    - Agent loop executes tool calls (terminal/file)
+    - ToolContext provides terminal access to the reward function
+    - Reward function verifies file content via cat
+    - Scored data flows through the Atropos pipeline
+    """
+
+    name = "terminal-test"
+    env_config_cls = TerminalTestEnvConfig
+
+    @classmethod
+    def config_init(cls) -> Tuple[TerminalTestEnvConfig, List[APIServerConfig]]:
+        """
+        Default configuration for the terminal test environment.
+
+        Uses Modal terminal backend for cloud isolation and OpenRouter with
+        Claude for inference. API keys loaded from ~/hermes-agent/.env.
+        """
+        env_config = TerminalTestEnvConfig(
+            # Terminal + file tools only
+            enabled_toolsets=["terminal", "file"],
+            disabled_toolsets=None,
+            distribution=None,
+            # Agent settings
+            max_agent_turns=10,  # Simple tasks, don't need many turns
+            max_token_length=2048,
+            agent_temperature=1.0,
+            system_prompt=(
+                "You are a helpful assistant with access to a terminal and file tools. "
+                "Complete the user's request by using the available tools. "
+                "Be precise and follow instructions exactly."
+            ),
+            # Modal terminal backend for cloud-isolated sandboxes per rollout
+            terminal_backend="modal",
+            # Atropos settings
+            group_size=3,              # 3 rollouts per group
+            tokenizer_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
+            tool_call_parser="hermes",
+            steps_per_eval=3,          # Eval after all 3 steps
+            total_steps=3,             # 3 groups total (1 group per step)
+            use_wandb=True,
+            wandb_name="terminal-test",
+            ensure_scores_are_not_same=False,  # Allow all-same scores for simple tasks
+            # No external dataset
+            dataset_name=None,
+        )
+
+        # OpenRouter with Claude -- API key loaded from .env (OPENROUTER_API_KEY)
+        server_configs = [
+            APIServerConfig(
+                base_url="https://openrouter.ai/api/v1",
+                model_name="anthropic/claude-opus-4.6",
+                server_type="openai",
+                api_key=os.getenv("OPENROUTER_API_KEY", ""),
+                health_check=False,  # OpenRouter doesn't have a /health endpoint
+            )
+        ]
+
+        return env_config, server_configs
+
+    async def setup(self):
+        """Initialize inline task lists."""
+        self.train_tasks = list(TRAIN_TASKS)
+        self.eval_tasks = list(EVAL_TASKS)
+        self.iter = 0
+        # Track reward stats for wandb logging
+        self.reward_buffer: List[float] = []
+
+    async def get_next_item(self) -> Dict[str, str]:
+        """Cycle through training tasks."""
+        item = self.train_tasks[self.iter % len(self.train_tasks)]
+        self.iter += 1
+        return item
+
+    def format_prompt(self, item: Dict[str, str]) -> str:
+        """The prompt is directly in the task item."""
+        return item["prompt"]
+
+    async def compute_reward(
+        self, item: Dict[str, str], result: AgentResult, ctx: ToolContext
+    ) -> float:
+        """
+        Verify by cat-ing the expected file path and checking content matches.
+        Same verifier for all tasks -- they all write a file at a known path.
+
+        Scoring:
+            1.0 = exact match
+            0.5 = expected content is present but has extra stuff
+            0.0 = file doesn't exist or content doesn't match
+        """
+        verify_result = ctx.terminal(f"cat {item['verify_path']}")
+
+        # File doesn't exist or can't be read
+        if verify_result["exit_code"] != 0:
+            self.reward_buffer.append(0.0)
+            return 0.0
+
+        actual = verify_result.get("output", "").strip()
+        expected = item["expected_content"].strip()
+
+        # Exact match
+        if actual == expected:
+            self.reward_buffer.append(1.0)
+            return 1.0
+
+        # Partial credit: expected content is present but has extra stuff
+        if expected in actual:
+            self.reward_buffer.append(0.5)
+            return 0.5
+
+        self.reward_buffer.append(0.0)
+        return 0.0
+
+    async def evaluate(self, *args, **kwargs):
+        """
+        Run eval tasks using the agent loop and verify results.
+        Logs accuracy metrics.
+        """
+        start_time = time.time()
+        correct = 0
+        total = len(self.eval_tasks)
+        samples = []
+
+        for eval_item in self.eval_tasks:
+            try:
+                # For eval, we do a simple single-turn completion (not full agent loop)
+                # to keep eval fast. The agent loop is tested via training.
+                completion = await self.server.chat_completion(
+                    messages=[
+                        {"role": "system", "content": self.config.system_prompt or ""},
+                        {"role": "user", "content": eval_item["prompt"]},
+                    ],
+                    n=1,
+                    max_tokens=self.config.max_token_length,
+                    temperature=0.0,
+                    split="eval",
+                )
+
+                response_content = (
+                    completion.choices[0].message.content if completion.choices else ""
+                )
+
+                samples.append(
+                    {
+                        "prompt": eval_item["prompt"],
+                        "response": response_content,
+                        "expected": eval_item["expected_content"],
+                    }
+                )
+
+            except Exception as e:
+                logger.error("Eval failed for item: %s", e)
+                samples.append(
+                    {
+                        "prompt": eval_item["prompt"],
+                        "response": f"ERROR: {e}",
+                        "expected": eval_item["expected_content"],
+                    }
+                )
+
+        end_time = time.time()
+
+        eval_metrics = {
+            "eval/num_samples": total,
+        }
+
+        await self.evaluate_log(
+            metrics=eval_metrics,
+            samples=samples,
+            start_time=start_time,
+            end_time=end_time,
+        )
+
+    async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
+        """Log training metrics including reward stats and accuracy."""
+        if wandb_metrics is None:
+            wandb_metrics = {}
+
+        if self.reward_buffer:
+            total = len(self.reward_buffer)
+            correct = sum(1 for r in self.reward_buffer if r == 1.0)
+            partial = sum(1 for r in self.reward_buffer if r == 0.5)
+
+            wandb_metrics["train/avg_reward"] = sum(self.reward_buffer) / total
+            wandb_metrics["train/accuracy"] = correct / total
+            wandb_metrics["train/partial_match_rate"] = partial / total
+            wandb_metrics["train/total_rollouts"] = total
+            self.reward_buffer = []
+
+        await super().wandb_log(wandb_metrics)
+
+
+if __name__ == "__main__":  # only when run as a script, not when imported
+    TerminalTestEnv.cli()  # hand control to the environment's command-line interface
--- a/environments/tool_call_parsers/init.py
+++ b/environments/tool_call_parsers/init.py
@ -0,0 +1,120 @@
+"""
+Tool Call Parser Registry
+
+Client-side parsers that extract structured tool_calls from raw model output text.
+Used in Phase 2 (VLLM server type) where ManagedServer's /generate endpoint returns
+raw text without tool call parsing.
+
+Each parser is a standalone reimplementation of the corresponding VLLM parser's
+non-streaming extract_tool_calls() logic. No VLLM dependency -- only standard library
+(re, json, uuid) and openai types.
+
+Usage:
+    from environments.tool_call_parsers import get_parser
+
+    parser = get_parser("hermes")
+    content, tool_calls = parser.parse(raw_model_output)
+    # content = text with tool call markup stripped
+    # tool_calls = list of ChatCompletionMessageToolCall objects, or None
+"""
+
+import logging
+from abc import ABC, abstractmethod
+from typing import Dict, List, Optional, Tuple, Type
+
+from openai.types.chat.chat_completion_message_tool_call import (
+    ChatCompletionMessageToolCall,
+)
+
+logger = logging.getLogger(__name__)
+
+# Type alias for a parser's return value: (content or None, tool_calls or None)
+ParseResult = Tuple[Optional[str], Optional[List[ChatCompletionMessageToolCall]]]
+
+
+class ToolCallParser(ABC):
+    """
+    Base class for tool call parsers.
+
+    Each parser knows how to extract structured tool_calls from a specific
+    model family's raw output text format.
+    """
+
+    @abstractmethod
+    def parse(self, text: str) -> ParseResult:
+        """
+        Parse raw model output text for tool calls.
+
+        Args:
+            text: Raw decoded text from the model's completion
+
+        Returns:
+            Tuple of (content, tool_calls) where:
+            - content: text with tool call markup stripped (the message 'content' field),
+                       or None if the entire output was tool calls
+            - tool_calls: list of ChatCompletionMessageToolCall objects,
+                          or None if no tool calls were found
+        """
+        raise NotImplementedError
+
+
+# Global parser registry: name -> parser class (filled by @register_parser, read by get_parser/list_parsers)
+PARSER_REGISTRY: Dict[str, Type[ToolCallParser]] = {}
+
+
+def register_parser(name: str):
+    """
+    Decorator to register a parser class under a given name.
+
+    Usage:
+        @register_parser("hermes")
+        class HermesToolCallParser(ToolCallParser):
+            ...
+    """
+
+    def decorator(cls: Type[ToolCallParser]) -> Type[ToolCallParser]:
+        PARSER_REGISTRY[name] = cls
+        return cls
+
+    return decorator
+
+
+def get_parser(name: str) -> ToolCallParser:
+    """
+    Get a parser instance by name.
+
+    Args:
+        name: Parser name (e.g., "hermes", "mistral", "llama3_json")
+
+    Returns:
+        Instantiated parser
+
+    Raises:
+        KeyError: If parser name is not found in registry
+    """
+    if name not in PARSER_REGISTRY:
+        available = sorted(PARSER_REGISTRY.keys())
+        raise KeyError(
+            f"Tool call parser '{name}' not found. Available parsers: {available}"
+        )
+    return PARSER_REGISTRY[name]()
+
+
+def list_parsers() -> List[str]:
+    """Return sorted list of registered parser names."""
+    return sorted(PARSER_REGISTRY.keys())
+
+
+# Import all parser modules to trigger registration via @register_parser decorators
+# Each module registers itself when imported
+from environments.tool_call_parsers.hermes_parser import HermesToolCallParser  # noqa: E402, F401
+from environments.tool_call_parsers.longcat_parser import LongcatToolCallParser  # noqa: E402, F401
+from environments.tool_call_parsers.mistral_parser import MistralToolCallParser  # noqa: E402, F401
+from environments.tool_call_parsers.llama_parser import LlamaToolCallParser  # noqa: E402, F401
+from environments.tool_call_parsers.qwen_parser import QwenToolCallParser  # noqa: E402, F401
+from environments.tool_call_parsers.deepseek_v3_parser import DeepSeekV3ToolCallParser  # noqa: E402, F401
+from environments.tool_call_parsers.deepseek_v3_1_parser import DeepSeekV31ToolCallParser  # noqa: E402, F401
+from environments.tool_call_parsers.kimi_k2_parser import KimiK2ToolCallParser  # noqa: E402, F401
+from environments.tool_call_parsers.glm45_parser import Glm45ToolCallParser  # noqa: E402, F401
+from environments.tool_call_parsers.glm47_parser import Glm47ToolCallParser  # noqa: E402, F401
+from environments.tool_call_parsers.qwen3_coder_parser import Qwen3CoderToolCallParser  # noqa: E402, F401
--- a/environments/tool_call_parsers/deepseek_v3_1_parser.py
+++ b/environments/tool_call_parsers/deepseek_v3_1_parser.py
@ -0,0 +1,71 @@
+"""
+DeepSeek V3.1 tool call parser.
+
+Similar to V3 but with a slightly different format:
+    <｜tool▁call▁begin｜>function_name<｜tool▁sep｜>arguments<｜tool▁call▁end｜>
+
+Note: V3 has type+name before the separator, V3.1 has name before and args after.
+
+Based on VLLM's DeepSeekV31ToolParser.extract_tool_calls()
+"""
+
+import re
+import uuid
+from typing import List, Optional
+
+from openai.types.chat.chat_completion_message_tool_call import (
+    ChatCompletionMessageToolCall,
+    Function,
+)
+
+from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
+
+
+@register_parser("deepseek_v3_1")
+@register_parser("deepseek_v31")
+class DeepSeekV31ToolCallParser(ToolCallParser):
+    """
+    Parser for DeepSeek V3.1 tool calls.
+
+    Slightly different regex than V3: function_name comes before the separator,
+    arguments come after (no type field, no json code block wrapper).
+    """
+
+    START_TOKEN = "<｜tool▁calls▁begin｜>"
+
+    # Regex captures: function_name, function_arguments
+    PATTERN = re.compile(
+        r"<｜tool▁call▁begin｜>(?P<function_name>.*?)<｜tool▁sep｜>(?P<function_arguments>.*?)<｜tool▁call▁end｜>"
+    )
+
+    def parse(self, text: str) -> ParseResult:
+        if self.START_TOKEN not in text:
+            return text, None
+
+        try:
+            matches = self.PATTERN.findall(text)
+            if not matches:
+                return text, None
+
+            tool_calls: List[ChatCompletionMessageToolCall] = []
+            for match in matches:
+                func_name, func_args = match
+                tool_calls.append(
+                    ChatCompletionMessageToolCall(
+                        id=f"call_{uuid.uuid4().hex[:8]}",
+                        type="function",
+                        function=Function(
+                            name=func_name.strip(),
+                            arguments=func_args.strip(),
+                        ),
+                    )
+                )
+
+            if not tool_calls:
+                return text, None
+
+            content = text[: text.find(self.START_TOKEN)].strip()
+            return content if content else None, tool_calls
+
+        except Exception:
+            return text, None
--- a/environments/tool_call_parsers/deepseek_v3_parser.py
+++ b/environments/tool_call_parsers/deepseek_v3_parser.py
@ -0,0 +1,75 @@
+"""
+DeepSeek V3 tool call parser.
+
+Format uses special unicode tokens:
+    <｜tool▁calls▁begin｜>
+    <｜tool▁call▁begin｜>type<｜tool▁sep｜>function_name
+    ```json
+    {"arg": "value"}
+    ```
+    <｜tool▁call▁end｜>
+    <｜tool▁calls▁end｜>
+
+Based on VLLM's DeepSeekV3ToolParser.extract_tool_calls()
+"""
+
+import re
+import uuid
+from typing import List, Optional
+
+from openai.types.chat.chat_completion_message_tool_call import (
+    ChatCompletionMessageToolCall,
+    Function,
+)
+
+from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
+
+
+@register_parser("deepseek_v3")
+class DeepSeekV3ToolCallParser(ToolCallParser):
+    """
+    Parser for DeepSeek V3 tool calls.
+
+    Uses special unicode tokens with fullwidth angle brackets and block elements.
+    Each call carries a type, a function name, and JSON arguments wrapped in a
+    json code block; only the name and arguments are used for the result.
+    """
+
+    START_TOKEN = "<｜tool▁calls▁begin｜>"
+
+    # Regex captures: type, function_name, function_arguments.
+    # - type and function_name stay on one line, as in the documented format.
+    # - re.DOTALL lets the arguments span several lines, so multi-line JSON
+    #   inside the code fence is matched instead of dropping the call.
+    # - The negative lookahead stops the arguments from running into the next
+    #   call when a closing fence / end marker is missing.
+    PATTERN = re.compile(
+        r"<｜tool▁call▁begin｜>(?P<type>[^\n]*)<｜tool▁sep｜>(?P<function_name>[^\n]*)\n```json\n"
+        r"(?P<function_arguments>(?:(?!<｜tool▁call▁begin｜>).)*?)\n```<｜tool▁call▁end｜>",
+        re.DOTALL,
+    )
+
+    def parse(self, text: str) -> ParseResult:
+        """
+        Extract tool calls from a raw completion.
+
+        Args:
+            text: Raw model output.
+
+        Returns:
+            ``(content, tool_calls)`` where content is the stripped text that
+            precedes START_TOKEN (None when empty). If the start token is
+            absent, no call matches, or anything raises, ``(text, None)`` is
+            returned with the text untouched.
+        """
+        if self.START_TOKEN not in text:
+            return text, None
+
+        try:
+            tool_calls: List[ChatCompletionMessageToolCall] = [
+                ChatCompletionMessageToolCall(
+                    id=f"call_{uuid.uuid4().hex[:8]}",
+                    type="function",
+                    function=Function(
+                        name=func_name.strip(),
+                        arguments=func_args.strip(),
+                    ),
+                )
+                # The captured type is ignored: every call is emitted as "function".
+                for _tc_type, func_name, func_args in self.PATTERN.findall(text)
+            ]
+
+            if not tool_calls:
+                return text, None
+
+            # Content is everything before the tool calls section
+            content = text[: text.find(self.START_TOKEN)].strip()
+            return content if content else None, tool_calls
+
+        except Exception:
+            # Deliberate best-effort: a parse failure is reported as "no tool calls".
+            return text, None
--- a/environments/tool_call_parsers/glm45_parser.py
+++ b/environments/tool_call_parsers/glm45_parser.py
@ -0,0 +1,109 @@
+"""
+GLM 4.5 (GLM-4-MoE) tool call parser.
+
+Format uses custom arg_key/arg_value tags rather than standard JSON:
+    <tool_call>function_name
+    <arg_key>param1</arg_key><arg_value>value1</arg_value>
+    <arg_key>param2</arg_key><arg_value>value2</arg_value>
+    </tool_call>
+
+Values are deserialized using json.loads -> ast.literal_eval -> raw string fallback.
+
+Based on VLLM's Glm4MoeModelToolParser.extract_tool_calls()
+"""
+
+import ast
+import json
+import re
+import uuid
+from typing import Any, Dict, List, Optional
+
+from openai.types.chat.chat_completion_message_tool_call import (
+    ChatCompletionMessageToolCall,
+    Function,
+)
+
+from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
+
+
+def _deserialize_value(value: str) -> Any:
+    """
+    Convert a raw argument string into a native Python value when possible.
+
+    Decoders are tried in order: json.loads first, then ast.literal_eval.
+    The first one that succeeds wins; if both reject the input, the original
+    string is returned unchanged.
+    """
+    decoders = (
+        (json.loads, (json.JSONDecodeError, TypeError)),
+        (ast.literal_eval, (ValueError, SyntaxError, TypeError)),
+    )
+    for decode, expected_failures in decoders:
+        try:
+            return decode(value)
+        except expected_failures:
+            continue
+
+    return value
+
+
+@register_parser("glm45")
+class Glm45ToolCallParser(ToolCallParser):
+    """
+    Parser for GLM 4.5 (GLM-4-MoE) tool calls.
+
+    A call is a <tool_call>...</tool_call> block whose first line is the
+    function name, followed by <arg_key>/<arg_value> pairs. The pairs are
+    collected into a dict and serialized to a JSON arguments string.
+    """
+
+    START_TOKEN = "<tool_call>"
+
+    # One complete <tool_call>...</tool_call> block.
+    FUNC_CALL_REGEX = re.compile(r"<tool_call>.*?</tool_call>", re.DOTALL)
+    # Group 1: function name (rest of the first line); group 2: argument section.
+    FUNC_DETAIL_REGEX = re.compile(r"<tool_call>([^\n]*)\n(.*)</tool_call>", re.DOTALL)
+    # Group 1: argument key; group 2: raw argument value.
+    FUNC_ARG_REGEX = re.compile(
+        r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>", re.DOTALL
+    )
+
+    def parse(self, text: str) -> ParseResult:
+        """
+        Extract tool calls from a raw completion.
+
+        Returns ``(content, tool_calls)`` where content is the stripped text
+        before the first START_TOKEN (None when empty). Blocks that do not
+        match FUNC_DETAIL_REGEX are skipped. If nothing usable is found, or
+        anything raises, ``(text, None)`` is returned.
+        """
+        if self.START_TOKEN not in text:
+            return text, None
+
+        try:
+            call_blocks = self.FUNC_CALL_REGEX.findall(text)
+            if not call_blocks:
+                return text, None
+
+            tool_calls: List[ChatCompletionMessageToolCall] = []
+
+            for block in call_blocks:
+                detail = self.FUNC_DETAIL_REGEX.search(block)
+                if detail is None:
+                    continue
+
+                # The argument section may be empty or missing (subclasses can
+                # make group 2 optional), in which case there are no pairs.
+                raw_args = detail.group(2)
+                found_pairs = self.FUNC_ARG_REGEX.findall(raw_args) if raw_args else []
+                arguments: Dict[str, Any] = {
+                    key.strip(): _deserialize_value(value.strip())
+                    for key, value in found_pairs
+                }
+
+                tool_calls.append(
+                    ChatCompletionMessageToolCall(
+                        id=f"call_{uuid.uuid4().hex[:8]}",
+                        type="function",
+                        function=Function(
+                            name=detail.group(1).strip(),
+                            arguments=json.dumps(arguments, ensure_ascii=False),
+                        ),
+                    )
+                )
+
+            if not tool_calls:
+                return text, None
+
+            leading_text = text[: text.find(self.START_TOKEN)].strip()
+            return leading_text or None, tool_calls
+
+        except Exception:
+            return text, None
--- a/environments/tool_call_parsers/glm47_parser.py
+++ b/environments/tool_call_parsers/glm47_parser.py
@ -0,0 +1,35 @@
+"""
+GLM 4.7 tool call parser.
+
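+Reuses the GLM 4.5 parsing logic but overrides two regex patterns:
+the detail pattern does not require a newline after the function name and
+treats the argument section as optional, and the argument pattern tolerates
+whitespace or literal backslash-n sequences between the key and value tags.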
+
+Based on VLLM's Glm47MoeModelToolParser (extends Glm4MoeModelToolParser).
+"""
+
+import re
+
+from environments.tool_call_parsers import ParseResult, register_parser
+from environments.tool_call_parsers.glm45_parser import Glm45ToolCallParser
+
+
+@register_parser("glm47")
+class Glm47ToolCallParser(Glm45ToolCallParser):
+    """
+    Parser for GLM 4.7 tool calls.
+
+    Inherits parse() from the GLM 4.5 parser and only swaps in two
+    per-instance regex patterns.
+    """
+
+    def __init__(self):
+        super().__init__()
+
+        # Detail pattern: group 1 is the text after <tool_call> (the function
+        # name); group 2 is the optional section starting at the first <arg_key>.
+        detail_pattern = r"<tool_call>(.*?)(<arg_key>.*?)?</tool_call>"
+
+        # Argument pattern: between </arg_key> and <arg_value> any run of
+        # whitespace or literal backslash-n sequences is allowed.
+        arg_pattern = r"<arg_key>(.*?)</arg_key>(?:\\n|\s)*<arg_value>(.*?)</arg_value>"
+
+        self.FUNC_DETAIL_REGEX = re.compile(detail_pattern, re.DOTALL)
+        self.FUNC_ARG_REGEX = re.compile(arg_pattern, re.DOTALL)
--- a/environments/tool_call_parsers/hermes_parser.py
+++ b/environments/tool_call_parsers/hermes_parser.py
@ -0,0 +1,73 @@
+"""
+Hermes tool call parser.
+
+Format: <tool_call>{"name": "func", "arguments": {...}}</tool_call>
+Based on VLLM's Hermes2ProToolParser.extract_tool_calls()
+"""
+
+import json
+import re
+import uuid
+from typing import List, Optional, Tuple
+
+from openai.types.chat.chat_completion_message_tool_call import (
+    ChatCompletionMessageToolCall,
+    Function,
+)
+
+from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
+
+
+@register_parser("hermes")
+class HermesToolCallParser(ToolCallParser):
+    """
+    Parser for Hermes-format tool calls.
+
+    Each call is a JSON object with "name" and (optionally) "arguments",
+    wrapped in <tool_call>...</tool_call>. A <tool_call> that is never closed
+    is matched up to the end of the string (truncated generation).
+    """
+
+    # Group 1: body of a closed tag; group 2: body of an unclosed trailing tag
+    PATTERN = re.compile(
+        r"<tool_call>\s*(.*?)\s*</tool_call>|<tool_call>\s*(.*)", re.DOTALL
+    )
+
+    def parse(self, text: str) -> ParseResult:
+        """
+        Extract tool calls from a raw completion.
+
+        Returns ``(content, tool_calls)`` where content is the stripped text
+        before the first <tool_call> tag (None when empty). If no call is
+        found, or any body fails to decode, ``(text, None)`` is returned.
+        """
+        if "<tool_call>" not in text:
+            return text, None
+
+        try:
+            found = self.PATTERN.findall(text)
+            if not found:
+                return text, None
+
+            tool_calls: List[ChatCompletionMessageToolCall] = []
+            for closed_body, unclosed_body in found:
+                payload = closed_body or unclosed_body
+                if not payload.strip():
+                    continue
+
+                call = json.loads(payload)
+                tool_calls.append(
+                    ChatCompletionMessageToolCall(
+                        id=f"call_{uuid.uuid4().hex[:8]}",
+                        type="function",
+                        function=Function(
+                            name=call["name"],
+                            arguments=json.dumps(
+                                call.get("arguments", {}), ensure_ascii=False
+                            ),
+                        ),
+                    )
+                )
+
+            if not tool_calls:
+                return text, None
+
+            # Everything ahead of the first tag is ordinary assistant content
+            leading_text = text[: text.find("<tool_call>")].strip()
+            return leading_text or None, tool_calls
+
+        except Exception:
+            return text, None
--- a/environments/tool_call_parsers/kimi_k2_parser.py
+++ b/environments/tool_call_parsers/kimi_k2_parser.py
@ -0,0 +1,93 @@
+"""
+Kimi K2 tool call parser.
+
+Format:
+    <|tool_calls_section_begin|>
+    <|tool_call_begin|>function_id:0<|tool_call_argument_begin|>{"arg": "val"}<|tool_call_end|>
+    <|tool_calls_section_end|>
+
+The function_id format is typically "functions.func_name:index" or "func_name:index".
+
+Based on VLLM's KimiK2ToolParser.extract_tool_calls()
+"""
+
+import re
+import uuid
+from typing import List, Optional
+
+from openai.types.chat.chat_completion_message_tool_call import (
+    ChatCompletionMessageToolCall,
+    Function,
+)
+
+from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
+
+
+@register_parser("kimi_k2")
+class KimiK2ToolCallParser(ToolCallParser):
+    """
+    Parser for Kimi K2 tool calls.
+
+    Individual calls sit between tool-call begin/end tokens inside a section
+    opened by one of START_TOKENS. The function name is derived from the call
+    ID: the part before the first colon, after its last dot.
+    """
+
+    # The section opener is accepted in both its plural and singular spelling
+    START_TOKENS = [
+        "<|tool_calls_section_begin|>",
+        "<|tool_call_section_begin|>",
+    ]
+
+    # Groups: tool_call_id (e.g., "functions.get_weather:0"), function_arguments.
+    # The lookahead keeps the arguments from crossing into the next call.
+    PATTERN = re.compile(
+        r"<\|tool_call_begin\|>\s*(?P<tool_call_id>[^<]+:\d+)\s*"
+        r"<\|tool_call_argument_begin\|>\s*"
+        r"(?P<function_arguments>(?:(?!<\|tool_call_begin\|>).)*?)\s*"
+        r"<\|tool_call_end\|>",
+        re.DOTALL,
+    )
+
+    def parse(self, text: str) -> ParseResult:
+        """
+        Extract tool calls from a raw completion.
+
+        Returns ``(content, tool_calls)`` where content is the stripped text
+        before the earliest section opener (None when empty). If no opener or
+        no call is found, or anything raises, ``(text, None)`` is returned.
+        """
+        if not any(token in text for token in self.START_TOKENS):
+            return text, None
+
+        try:
+            tool_calls: List[ChatCompletionMessageToolCall] = [
+                ChatCompletionMessageToolCall(
+                    id=call_id,  # The original ID string is kept as the call ID
+                    type="function",
+                    function=Function(
+                        # "functions.get_weather:0" -> "get_weather"
+                        name=call_id.split(":")[0].split(".")[-1],
+                        arguments=raw_args.strip(),
+                    ),
+                )
+                for call_id, raw_args in self.PATTERN.findall(text)
+            ]
+
+            if not tool_calls:
+                return text, None
+
+            # Cut the content at whichever opener variant appears first
+            opener_offsets = (text.find(token) for token in self.START_TOKENS)
+            content_end = min(
+                (pos for pos in opener_offsets if pos >= 0), default=len(text)
+            )
+
+            leading_text = text[:content_end].strip()
+            return leading_text or None, tool_calls
+
+        except Exception:
+            return text, None
--- a/environments/tool_call_parsers/llama_parser.py
+++ b/environments/tool_call_parsers/llama_parser.py
@ -0,0 +1,96 @@
+"""
+Llama 3.x / 4 tool call parser.
+
+Format: The model outputs JSON objects with "name" and "arguments" (or "parameters") keys.
+May be preceded by <|python_tag|> token. Supports multiple JSON objects separated
+by content or semicolons.
+
+Based on VLLM's Llama3JsonToolParser.extract_tool_calls()
+"""
+
+import json
+import re
+import uuid
+from typing import List, Optional
+
+from openai.types.chat.chat_completion_message_tool_call import (
+    ChatCompletionMessageToolCall,
+    Function,
+)
+
+from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
+
+
+@register_parser("llama3_json")
+@register_parser("llama4_json")
+class LlamaToolCallParser(ToolCallParser):
+    """
+    Parser for Llama 3.x and 4 JSON-format tool calls.
+
+    Scans the text for JSON objects that contain "name" plus either
+    "arguments" or "parameters". Objects are decoded with
+    json.JSONDecoder.raw_decode, so they may be embedded in other text.
+    """
+
+    BOT_TOKEN = "<|python_tag|>"
+
+    # Regex to find the start of potential JSON objects
+    JSON_START = re.compile(r"\{")
+
+    def parse(self, text: str) -> ParseResult:
+        """
+        Extract tool calls from a raw completion.
+
+        Args:
+            text: Raw model output.
+
+        Returns:
+            ``(content, tool_calls)``. Content is the stripped text before
+            BOT_TOKEN when that token is present, otherwise the stripped text
+            before the first JSON object that was accepted as a tool call;
+            it is None when empty. If no tool call is found, or anything
+            unexpected raises, ``(text, None)`` is returned.
+        """
+        # Quick check: need either the bot token or a JSON brace
+        if self.BOT_TOKEN not in text and "{" not in text:
+            return text, None
+
+        try:
+            decoder = json.JSONDecoder()
+            tool_calls: List[ChatCompletionMessageToolCall] = []
+            consumed_until = 0  # Exclusive end offset of the last decoded object
+            first_call_start = -1  # Offset of the first accepted tool call
+
+            for match in self.JSON_START.finditer(text):
+                start = match.start()
+                # Skip braces nested inside an already decoded object. The
+                # comparison is strict because consumed_until is exclusive: a
+                # brace exactly at that offset opens a new, adjacent object.
+                if start < consumed_until:
+                    continue
+
+                try:
+                    obj, length = decoder.raw_decode(text[start:])
+                    consumed_until = start + length
+
+                    # Must have "name" and either "arguments" or "parameters"
+                    name = obj.get("name")
+                    args = obj.get("arguments", obj.get("parameters"))
+
+                    if not name or args is None:
+                        continue
+
+                    # Normalize arguments to a JSON string
+                    if not isinstance(args, str):
+                        args = json.dumps(args, ensure_ascii=False)
+
+                    tool_calls.append(
+                        ChatCompletionMessageToolCall(
+                            id=f"call_{uuid.uuid4().hex[:8]}",
+                            type="function",
+                            function=Function(name=name, arguments=args),
+                        )
+                    )
+                    if first_call_start < 0:
+                        first_call_start = start
+                except (json.JSONDecodeError, KeyError, ValueError):
+                    # Not a usable object at this brace; try the next one
+                    continue
+
+            if not tool_calls:
+                return text, None
+
+            # Content ends at the bot token if there is one, otherwise at the
+            # first object that actually was a tool call (not merely the first
+            # "{" in the text, which may belong to ordinary prose).
+            if self.BOT_TOKEN in text:
+                content_end = text.find(self.BOT_TOKEN)
+            else:
+                content_end = first_call_start
+            content = text[:content_end].strip()
+
+            return content if content else None, tool_calls
+
+        except Exception:
+            # Deliberate best-effort: a parse failure is reported as "no tool calls".
+            return text, None
--- a/environments/tool_call_parsers/longcat_parser.py
+++ b/environments/tool_call_parsers/longcat_parser.py
@ -0,0 +1,69 @@
+"""
+Longcat Flash Chat tool call parser.
+
+Same as Hermes but uses <longcat_tool_call> tags instead of <tool_call>.
+Based on VLLM's LongcatFlashToolParser (extends Hermes2ProToolParser).
+"""
+
+import json
+import re
+import uuid
+from typing import List, Optional
+
+from openai.types.chat.chat_completion_message_tool_call import (
+    ChatCompletionMessageToolCall,
+    Function,
+)
+
+from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
+
+
+@register_parser("longcat")
+class LongcatToolCallParser(ToolCallParser):
+    """
+    Parser for Longcat Flash Chat tool calls.
+
+    Each call is a JSON object with "name" and (optionally) "arguments" inside
+    <longcat_tool_call> tags; an unclosed trailing tag is matched up to the
+    end of the string.
+    """
+
+    # Group 1: body of a closed tag; group 2: body of an unclosed trailing tag
+    PATTERN = re.compile(
+        r"<longcat_tool_call>\s*(.*?)\s*</longcat_tool_call>|<longcat_tool_call>\s*(.*)",
+        re.DOTALL,
+    )
+
+    def parse(self, text: str) -> ParseResult:
+        """
+        Extract tool calls from a raw completion.
+
+        Returns ``(content, tool_calls)`` where content is the stripped text
+        before the first <longcat_tool_call> tag (None when empty). If no call
+        is found, or any body fails to decode, ``(text, None)`` is returned.
+        """
+        if "<longcat_tool_call>" not in text:
+            return text, None
+
+        try:
+            tag_bodies = self.PATTERN.findall(text)
+            if not tag_bodies:
+                return text, None
+
+            tool_calls: List[ChatCompletionMessageToolCall] = []
+            for closed_body, unclosed_body in tag_bodies:
+                payload = closed_body if closed_body else unclosed_body
+                if payload.strip():
+                    decoded = json.loads(payload)
+                    tool_calls.append(
+                        ChatCompletionMessageToolCall(
+                            id=f"call_{uuid.uuid4().hex[:8]}",
+                            type="function",
+                            function=Function(
+                                name=decoded["name"],
+                                arguments=json.dumps(
+                                    decoded.get("arguments", {}), ensure_ascii=False
+                                ),
+                            ),
+                        )
+                    )
+
+            if not tool_calls:
+                return text, None
+
+            tag_offset = text.find("<longcat_tool_call>")
+            leading_text = text[:tag_offset].strip()
+            return leading_text or None, tool_calls
+
+        except Exception:
+            return text, None
--- a/environments/tool_call_parsers/mistral_parser.py
+++ b/environments/tool_call_parsers/mistral_parser.py
@ -0,0 +1,130 @@
+"""
+Mistral tool call parser.
+
+Supports two formats depending on tokenizer version:
+- Pre-v11: content[TOOL_CALLS] [{"name": ..., "arguments": {...}}, ...]
+- v11+:    content[TOOL_CALLS]tool_name1{"arg": "val"}[TOOL_CALLS]tool_name2{"arg": "val"}
+
+Based on VLLM's MistralToolParser.extract_tool_calls()
+The [TOOL_CALLS] token is the bot_token used by Mistral models.
+"""
+
+import json
+import re
+import uuid
+from typing import List, Optional
+
+from openai.types.chat.chat_completion_message_tool_call import (
+    ChatCompletionMessageToolCall,
+    Function,
+)
+
+from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
+
+
+def _generate_mistral_id() -> str:
+    """Return a random 9-character ID drawn from ASCII letters and digits."""
+    import random
+    import string
+
+    alphabet = string.ascii_letters + string.digits
+    picked = random.choices(alphabet, k=9)
+    return "".join(picked)
+
+
+@register_parser("mistral")
+class MistralToolCallParser(ToolCallParser):
+    """
+    Parser for Mistral-format tool calls.
+
+    The format is chosen from the text following the first [TOOL_CALLS]
+    token: if it starts with '[' or '{' it is treated as the pre-v11 JSON
+    form, otherwise as the v11+ ``tool_name{args}`` form.
+    """
+
+    # The [TOOL_CALLS] token -- may appear as different strings depending on tokenizer
+    BOT_TOKEN = "[TOOL_CALLS]"
+
+    # Fallback regex for pre-v11 format when JSON parsing fails
+    TOOL_CALL_REGEX = re.compile(r"\[?\s*(\{.*?\})\s*\]?", re.DOTALL)
+
+    @staticmethod
+    def _call_from_dict(tc) -> ChatCompletionMessageToolCall:
+        """Build a tool call from a decoded {"name": ..., "arguments": ...} object."""
+        args = tc.get("arguments", {})
+        if isinstance(args, dict):
+            args = json.dumps(args, ensure_ascii=False)
+        return ChatCompletionMessageToolCall(
+            id=_generate_mistral_id(),
+            type="function",
+            function=Function(name=tc["name"], arguments=args),
+        )
+
+    def parse(self, text: str) -> ParseResult:
+        """
+        Extract tool calls from a raw completion.
+
+        Returns ``(content, tool_calls)`` where content is the stripped text
+        before the first BOT_TOKEN (None when empty). If the token is absent,
+        no call is produced, or anything unexpected raises, ``(text, None)``
+        is returned.
+        """
+        if self.BOT_TOKEN not in text:
+            return text, None
+
+        try:
+            leading, *raw_tool_calls = text.split(self.BOT_TOKEN)
+            content = leading.strip()
+
+            first_raw = raw_tool_calls[0].strip() if raw_tool_calls else ""
+            tool_calls: List[ChatCompletionMessageToolCall] = []
+
+            if first_raw.startswith(("[", "{")):
+                # Pre-v11 format: [TOOL_CALLS] [{"name": ..., "arguments": {...}}]
+                try:
+                    parsed = json.loads(first_raw)
+                    if isinstance(parsed, dict):
+                        parsed = [parsed]
+
+                    for tc in parsed:
+                        tool_calls.append(self._call_from_dict(tc))
+                except json.JSONDecodeError:
+                    # Whole-payload decode failed: salvage individual objects,
+                    # skipping any that still do not decode or lack a name.
+                    for raw_json in self.TOOL_CALL_REGEX.findall(first_raw):
+                        try:
+                            tool_calls.append(
+                                self._call_from_dict(json.loads(raw_json))
+                            )
+                        except (json.JSONDecodeError, KeyError):
+                            continue
+            else:
+                # v11+ format: [TOOL_CALLS]tool_name{args}[TOOL_CALLS]tool_name2{args2}
+                for segment in raw_tool_calls:
+                    segment = segment.strip()
+                    if "{" not in segment:
+                        continue
+
+                    # The name is everything before the first brace; the
+                    # arguments are the remainder, passed through verbatim.
+                    brace_at = segment.index("{")
+                    tool_calls.append(
+                        ChatCompletionMessageToolCall(
+                            id=_generate_mistral_id(),
+                            type="function",
+                            function=Function(
+                                name=segment[:brace_at].strip(),
+                                arguments=segment[brace_at:],
+                            ),
+                        )
+                    )
+
+            if not tool_calls:
+                return text, None
+
+            return content or None, tool_calls
+
+        except Exception:
+            return text, None
--- a/environments/tool_call_parsers/qwen3_coder_parser.py
+++ b/environments/tool_call_parsers/qwen3_coder_parser.py
@ -0,0 +1,163 @@
+"""
+Qwen3-Coder tool call parser.
+
+Format uses XML-style nested tags:
+    <tool_call>
+    <function=function_name>
+    <parameter=param_name>value</parameter>
+    <parameter=param_name2>value2</parameter>
+    </function>
+    </tool_call>
+
+Parameters are extracted from <parameter=name>value</parameter> tags and
+type-converted using the schema if available, otherwise treated as strings.
+
+Based on VLLM's Qwen3CoderToolParser.extract_tool_calls()
+"""
+
+import ast
+import json
+import re
+import uuid
+from typing import Any, Dict, List, Optional
+
+from openai.types.chat.chat_completion_message_tool_call import (
+    ChatCompletionMessageToolCall,
+    Function,
+)
+
+from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
+
+
+def _try_convert_value(value: str) -> Any:
+    """
+    Convert a parameter value string into a native Python value when possible.
+
+    The input is stripped first. A case-insensitive "null" becomes None;
+    otherwise json.loads and then ast.literal_eval are tried in that order.
+    If neither accepts the text, the stripped string itself is returned.
+    """
+    candidate = value.strip()
+
+    if candidate.lower() == "null":
+        return None
+
+    converters = (
+        # JSON covers objects, arrays, strings, numbers, booleans
+        (json.loads, (json.JSONDecodeError, TypeError)),
+        # Python literals cover tuples, etc.
+        (ast.literal_eval, (ValueError, SyntaxError, TypeError)),
+    )
+    for convert, expected_failures in converters:
+        try:
+            return convert(candidate)
+        except expected_failures:
+            continue
+
+    return candidate
+
+
+@register_parser("qwen3_coder")
+class Qwen3CoderToolCallParser(ToolCallParser):
+    """
+    Parser for Qwen3-Coder XML-format tool calls.
+
+    Uses nested XML tags: <tool_call><function=name><parameter=key>val</parameter></function></tool_call>
+    Parameter values are converted with _try_convert_value and the resulting
+    dict is serialized to a JSON arguments string.
+    """
+
+    START_TOKEN = "<tool_call>"
+    FUNCTION_PREFIX = "<function="
+
+    # Find complete tool_call blocks (or unclosed at end)
+    TOOL_CALL_REGEX = re.compile(
+        r"<tool_call>(.*?)</tool_call>|<tool_call>(.*?)$", re.DOTALL
+    )
+
+    # Find function blocks within a tool_call (closed, or unclosed at end)
+    FUNCTION_REGEX = re.compile(
+        r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL
+    )
+
+    # Find parameter blocks within a function. A parameter ends at its closing
+    # tag, at the next <parameter=, at </function>, or at the end of the text.
+    PARAMETER_REGEX = re.compile(
+        r"<parameter=(.*?)(?:</parameter>|(?=<parameter=)|(?=</function>)|$)",
+        re.DOTALL,
+    )
+
+    def _parse_function_call(self, function_str: str) -> Optional[ChatCompletionMessageToolCall]:
+        """
+        Parse the text captured after "<function=" into a tool call.
+
+        Args:
+            function_str: ``name>...parameters...`` as captured by FUNCTION_REGEX
+                (without the "<function=" prefix or the closing tag).
+
+        Returns:
+            The tool call, or None when the text has no '>' terminating the name.
+        """
+        try:
+            # Extract function name: everything before the first '>'
+            gt_idx = function_str.index(">")
+            func_name = function_str[:gt_idx].strip()
+            params_str = function_str[gt_idx + 1:]
+
+            # Extract parameters
+            param_dict: Dict[str, Any] = {}
+            for match_text in self.PARAMETER_REGEX.findall(params_str):
+                # match_text is "param_name>value"; skip anything malformed
+                if ">" not in match_text:
+                    continue
+                eq_idx = match_text.index(">")
+                param_name = match_text[:eq_idx].strip()
+                param_value = match_text[eq_idx + 1:]
+
+                # Drop at most one newline on each side of the value
+                if param_value.startswith("\n"):
+                    param_value = param_value[1:]
+                if param_value.endswith("\n"):
+                    param_value = param_value[:-1]
+
+                param_dict[param_name] = _try_convert_value(param_value)
+
+            return ChatCompletionMessageToolCall(
+                id=f"call_{uuid.uuid4().hex[:24]}",
+                type="function",
+                function=Function(
+                    name=func_name,
+                    arguments=json.dumps(param_dict, ensure_ascii=False),
+                ),
+            )
+        except (ValueError, IndexError):
+            return None
+
+    def parse(self, text: str) -> ParseResult:
+        """
+        Extract tool calls from a raw completion.
+
+        Args:
+            text: Raw model output.
+
+        Returns:
+            ``(content, tool_calls)`` where content is the stripped text before
+            the first <tool_call> tag (or before the first "<function=" when
+            no such tag exists); it is None when empty. If no function block
+            yields a tool call, or anything raises, ``(text, None)`` is returned.
+        """
+        if self.FUNCTION_PREFIX not in text:
+            return text, None
+
+        try:
+            # Find all tool_call blocks
+            tc_matches = self.TOOL_CALL_REGEX.findall(text)
+            raw_blocks = [m[0] if m[0] else m[1] for m in tc_matches]
+
+            # Fallback: if no tool_call tags, try the whole text
+            if not raw_blocks:
+                raw_blocks = [text]
+
+            # Find function blocks within each tool_call
+            function_strs: List[str] = []
+            for block in raw_blocks:
+                func_matches = self.FUNCTION_REGEX.findall(block)
+                function_strs.extend(m[0] if m[0] else m[1] for m in func_matches)
+
+            if not function_strs:
+                return text, None
+
+            # Parse each function call, dropping the ones that are malformed
+            parsed_calls = (self._parse_function_call(fs) for fs in function_strs)
+            tool_calls: List[ChatCompletionMessageToolCall] = [
+                tc for tc in parsed_calls if tc is not None
+            ]
+
+            if not tool_calls:
+                return text, None
+
+            # Content before tool calls. FUNCTION_PREFIX is known to be present,
+            # so one of the two lookups always succeeds.
+            first_tc = text.find(self.START_TOKEN)
+            if first_tc < 0:
+                first_tc = text.find(self.FUNCTION_PREFIX)
+
+            # Normalize empty or whitespace-only content to None
+            content = text[:first_tc].strip()
+            return content if content else None, tool_calls
+
+        except Exception:
+            # Deliberate best-effort: a parse failure is reported as "no tool calls".
+            return text, None
--- a/environments/tool_call_parsers/qwen_parser.py
+++ b/environments/tool_call_parsers/qwen_parser.py
@ -0,0 +1,19 @@
+"""
+Qwen 2.5 tool call parser.
+
+Uses the same <tool_call> format as Hermes.
+Registered as a separate parser name for clarity when using --tool-parser=qwen.
+"""
+
+from environments.tool_call_parsers import register_parser
+from environments.tool_call_parsers.hermes_parser import HermesToolCallParser
+
+
+@register_parser("qwen")
+class QwenToolCallParser(HermesToolCallParser):
+    """
+    Parser for Qwen 2.5 tool calls.
+
+    Adds no behavior of its own: the <tool_call>{"name": ..., "arguments": ...}</tool_call>
+    format is handled entirely by the inherited Hermes parser. The class exists
+    only so the parser is also registered under the name "qwen".
+    """
--- a/environments/tool_context.py
+++ b/environments/tool_context.py
@ -0,0 +1,246 @@
+"""
+ToolContext -- Unrestricted Tool Access for Reward Functions
+
+A per-rollout handle that gives reward/verification functions direct access to
+ALL hermes-agent tools, scoped to the rollout's task_id. The same task_id means
+the terminal/browser session is the SAME one the model used during its rollout --
+all state (files, processes, browser tabs) is preserved.
+
+The verifier author decides which tools to use. Nothing is hardcoded or gated.
+
+Example usage in a compute_reward():
+    async def compute_reward(self, item, result, ctx):
+        # Run tests in the model's terminal sandbox
+        test = ctx.terminal("pytest -v")
+        if test["exit_code"] == 0:
+            return 1.0
+
+        # Check if a file was created
+        content = ctx.read_file("/workspace/solution.py")
+        if content.get("content"):
+            return 0.5
+
+        return 0.0
+"""
+
+import json
+import logging
+from typing import Any, Dict, List, Optional
+
+from model_tools import handle_function_call
+from tools.terminal_tool import cleanup_vm
+from tools.browser_tool import cleanup_browser
+
+logger = logging.getLogger(__name__)
+
+
+class ToolContext:
+    """
+    Open-ended access to all hermes-agent tools for a specific rollout.
+
+    Passed to compute_reward() so verifiers can use any tool they need:
+    terminal commands, file reads/writes, web searches, browser automation, etc.
+    Terminal, file, browser and call_tool() requests carry the rollout's task_id
+    for session isolation; web_search() and web_extract() are sent without one.
+
+    Each convenience wrapper decodes the tool's JSON reply. A reply that cannot
+    be decoded is handed back inside a small fallback dict rather than raised,
+    so reward code can branch on the result instead of guarding every call.
+    """
+
+    def __init__(self, task_id: str):
+        """
+        Bind this handle to one rollout.
+
+        Args:
+            task_id: Identifier forwarded to the session-scoped tools
+        """
+        self.task_id = task_id
+
+    # -------------------------------------------------------------------------
+    # Internal helpers
+    # -------------------------------------------------------------------------
+
+    @staticmethod
+    def _decode(raw: Any, fallback: Dict[str, Any]) -> Dict[str, Any]:
+        """
+        Decode a tool's JSON reply, substituting `fallback` when that fails.
+
+        Args:
+            raw: Value returned by handle_function_call (expected: a JSON string)
+            fallback: Dict to hand back when `raw` cannot be decoded
+
+        Returns:
+            The decoded JSON value, or `fallback`
+        """
+        try:
+            return json.loads(raw)
+        except (TypeError, ValueError):
+            # ValueError covers json.JSONDecodeError (malformed text);
+            # TypeError covers a reply that is not str/bytes at all (e.g. None).
+            return fallback
+
+    # -------------------------------------------------------------------------
+    # Terminal tools
+    # -------------------------------------------------------------------------
+
+    def terminal(self, command: str, timeout: int = 180) -> Dict[str, Any]:
+        """
+        Run a command in the rollout's terminal session.
+
+        Args:
+            command: Shell command to execute
+            timeout: Command timeout in seconds
+
+        Returns:
+            Dict with 'exit_code' (int) and 'output' (str). If the tool's reply
+            cannot be decoded, 'exit_code' is -1 and 'output' is the raw reply.
+        """
+        raw = handle_function_call(
+            "terminal",
+            {"command": command, "timeout": timeout},
+            task_id=self.task_id,
+        )
+        return self._decode(raw, {"exit_code": -1, "output": raw})
+
+    # -------------------------------------------------------------------------
+    # File tools
+    # -------------------------------------------------------------------------
+
+    def read_file(self, path: str) -> Dict[str, Any]:
+        """
+        Read a file from the rollout's filesystem.
+
+        Args:
+            path: File path to read
+
+        Returns:
+            Dict with file content or error. If the tool's reply cannot be
+            decoded, the raw reply is returned under 'error'.
+        """
+        raw = handle_function_call(
+            "read_file", {"path": path}, task_id=self.task_id
+        )
+        return self._decode(raw, {"error": raw})
+
+    def write_file(self, path: str, content: str) -> Dict[str, Any]:
+        """
+        Write a file in the rollout's filesystem.
+
+        Args:
+            path: File path to write
+            content: Content to write
+
+        Returns:
+            Dict with success status or error. If the tool's reply cannot be
+            decoded, the raw reply is returned under 'error'.
+        """
+        raw = handle_function_call(
+            "write_file", {"path": path, "content": content}, task_id=self.task_id
+        )
+        return self._decode(raw, {"error": raw})
+
+    def search(self, query: str, path: str = ".") -> Dict[str, Any]:
+        """
+        Search for text in the rollout's filesystem.
+
+        Args:
+            query: Search query
+            path: Directory to search in
+
+        Returns:
+            Dict with search results. If the tool's reply cannot be decoded,
+            the raw reply is returned under 'error'.
+        """
+        # NOTE(review): the argument keys are forwarded exactly as named here;
+        # confirm they match the "search" tool's schema.
+        raw = handle_function_call(
+            "search", {"query": query, "path": path}, task_id=self.task_id
+        )
+        return self._decode(raw, {"error": raw})
+
+    # -------------------------------------------------------------------------
+    # Web tools
+    # -------------------------------------------------------------------------
+
+    def web_search(self, query: str) -> Dict[str, Any]:
+        """
+        Search the web.
+
+        Args:
+            query: Search query
+
+        Returns:
+            Dict with search results. If the tool's reply cannot be decoded,
+            the raw reply is returned under 'error'.
+        """
+        # Unlike the session-scoped tools, no task_id is forwarded here.
+        raw = handle_function_call("web_search", {"query": query})
+        return self._decode(raw, {"error": raw})
+
+    def web_extract(self, urls: List[str]) -> Dict[str, Any]:
+        """
+        Extract content from URLs.
+
+        Args:
+            urls: List of URLs to extract content from
+
+        Returns:
+            Dict with extracted content. If the tool's reply cannot be decoded,
+            the raw reply is returned under 'error'.
+        """
+        # Unlike the session-scoped tools, no task_id is forwarded here.
+        raw = handle_function_call("web_extract", {"urls": urls})
+        return self._decode(raw, {"error": raw})
+
+    # -------------------------------------------------------------------------
+    # Browser tools
+    # -------------------------------------------------------------------------
+
+    def browser_navigate(self, url: str) -> Dict[str, Any]:
+        """
+        Navigate the rollout's browser session to a URL.
+
+        Args:
+            url: URL to navigate to
+
+        Returns:
+            Dict with page snapshot or error. If the tool's reply cannot be
+            decoded, the raw reply is returned under 'error'.
+        """
+        raw = handle_function_call(
+            "browser_navigate", {"url": url}, task_id=self.task_id
+        )
+        return self._decode(raw, {"error": raw})
+
+    def browser_snapshot(self) -> Dict[str, Any]:
+        """
+        Take a snapshot of the current browser page.
+
+        Returns:
+            Dict with page content/accessibility snapshot. If the tool's reply
+            cannot be decoded, the raw reply is returned under 'error'.
+        """
+        raw = handle_function_call(
+            "browser_snapshot", {}, task_id=self.task_id
+        )
+        return self._decode(raw, {"error": raw})
+
+    # -------------------------------------------------------------------------
+    # Generic tool access
+    # -------------------------------------------------------------------------
+
+    def call_tool(self, tool_name: str, arguments: Dict[str, Any]) -> str:
+        """
+        Call any hermes-agent tool by name.
+
+        This is the generic escape hatch -- if a tool doesn't have a convenience
+        wrapper above, you can call it directly here. Unlike the wrappers, the
+        reply is returned untouched, with no JSON decoding or fallback.
+
+        Args:
+            tool_name: Name of the tool (e.g., "vision_analyze", "skills_list")
+            arguments: Dict of arguments for the tool
+
+        Returns:
+            Raw JSON string result from the tool
+        """
+        return handle_function_call(tool_name, arguments, task_id=self.task_id)
+
+    # -------------------------------------------------------------------------
+    # Cleanup
+    # -------------------------------------------------------------------------
+
+    def cleanup(self) -> None:
+        """
+        Release all resources (terminal VMs, browser sessions) for this rollout.
+
+        Called automatically by the base environment via try/finally after
+        compute_reward() completes. You generally don't need to call this yourself.
+
+        Best-effort: each release is attempted independently, and a failure is
+        logged at debug level instead of being raised, so a failing VM cleanup
+        never prevents the browser cleanup from running.
+        """
+        try:
+            cleanup_vm(self.task_id)
+        except Exception as e:
+            logger.debug("VM cleanup for task %s: %s", self.task_id, e)
+
+        try:
+            cleanup_browser(self.task_id)
+        except Exception as e:
+            logger.debug("Browser cleanup for task %s: %s", self.task_id, e)