Add support for Atropos Agentic RL environments (requires branch tool_call_support in Atropos atm)

- Added new environments for reinforcement learning, including `HermesSweEnv` for software engineering tasks and `TerminalTestEnv` for inline testing.
- Introduced `ToolContext` for unrestricted access to tools during reward computation.
- Updated `.gitignore` to exclude `wandb/` directory.
- Enhanced `README.md` with detailed architecture and usage instructions for Atropos environments.
- Added configuration files for SWE and terminal test environments to streamline setup.
- Removed unnecessary compiled Python files from `__pycache__`.
This commit is contained in:
teknium 2026-02-07 09:17:16 +00:00
parent ac79725923
commit 07b615e96e
30 changed files with 2851 additions and 965 deletions

28
environments/__init__.py Normal file
View file

@ -0,0 +1,28 @@
"""
Hermes-Agent Atropos Environments
Provides a layered integration between hermes-agent's tool-calling capabilities
and the Atropos RL training framework.
Layers:
- agent_loop: Reusable multi-turn agent loop with standard OpenAI-spec tool calling
- tool_context: Per-rollout tool access handle for reward/verification functions
- hermes_base_env: Abstract base environment (BaseEnv subclass) for Atropos
- tool_call_parsers: Client-side tool call parser registry for Phase 2 (VLLM /generate)
Concrete environments:
- terminal_test_env: Simple file-creation tasks for testing the stack
- hermes_swe_env: SWE-bench style tasks with Modal sandboxes
"""
from environments.agent_loop import AgentResult, HermesAgentLoop
from environments.tool_context import ToolContext
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
__all__ = [
"AgentResult",
"HermesAgentLoop",
"ToolContext",
"HermesAgentBaseEnv",
"HermesAgentEnvConfig",
]

306
environments/agent_loop.py Normal file
View file

@ -0,0 +1,306 @@
"""
HermesAgentLoop -- Reusable Multi-Turn Agent Engine
Runs the hermes-agent tool-calling loop using standard OpenAI-spec tool calling.
Works with any server that returns ChatCompletion objects with tool_calls:
- Phase 1: OpenAI server type (VLLM, SGLang, OpenRouter, OpenAI API)
- Phase 2: ManagedServer with client-side tool call parser
The loop passes tools= and checks response.choices[0].message.tool_calls,
identical to hermes-agent's run_agent.py. Tool execution is dispatched via
handle_function_call() from model_tools.py.
"""
import json
import logging
import uuid
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Set
from model_tools import handle_function_call
logger = logging.getLogger(__name__)
@dataclass
class AgentResult:
"""Result of running the agent loop."""
# Full conversation history in OpenAI message format
messages: List[Dict[str, Any]]
# ManagedServer.get_state() if available (Phase 2), None otherwise
managed_state: Optional[Dict[str, Any]] = None
# How many LLM calls were made
turns_used: int = 0
# True if model stopped calling tools naturally (vs hitting max_turns)
finished_naturally: bool = False
# Extracted reasoning content per turn (from PR #297 helpers)
reasoning_per_turn: List[Optional[str]] = field(default_factory=list)
def _extract_reasoning_from_message(message) -> Optional[str]:
"""
Extract reasoning content from a ChatCompletion message.
Handles multiple provider formats:
1. message.reasoning_content field (some providers)
2. message.reasoning field (some providers)
3. message.reasoning_details[].text (OpenRouter style)
Note: <think> block extraction from content is NOT done here -- that's
handled by the response already in Phase 1 (server does it) or by
ManagedServer's patch in Phase 2.
Args:
message: The assistant message from ChatCompletion response
Returns:
Extracted reasoning text, or None if not found
"""
# Check reasoning_content field (common across providers)
if hasattr(message, "reasoning_content") and message.reasoning_content:
return message.reasoning_content
# Check reasoning field
if hasattr(message, "reasoning") and message.reasoning:
return message.reasoning
# Check reasoning_details (OpenRouter style)
if hasattr(message, "reasoning_details") and message.reasoning_details:
for detail in message.reasoning_details:
if hasattr(detail, "text") and detail.text:
return detail.text
if isinstance(detail, dict) and detail.get("text"):
return detail["text"]
return None
class HermesAgentLoop:
"""
Runs hermes-agent's tool-calling loop using standard OpenAI-spec tool calling.
Same pattern as run_agent.py:
- Pass tools= to the API
- Check response.choices[0].message.tool_calls
- Dispatch via handle_function_call()
Works identically with any server type -- OpenAI, VLLM, SGLang, OpenRouter,
or ManagedServer with a parser. The server determines how tool_calls get
populated on the response.
"""
def __init__(
self,
server,
tool_schemas: List[Dict[str, Any]],
valid_tool_names: Set[str],
max_turns: int = 30,
task_id: Optional[str] = None,
temperature: float = 1.0,
max_tokens: Optional[int] = None,
):
"""
Initialize the agent loop.
Args:
server: Server object with chat_completion() method (OpenAIServer,
ManagedServer, ServerManager, etc.)
tool_schemas: OpenAI-format tool definitions from get_tool_definitions()
valid_tool_names: Set of tool names the model is allowed to call
max_turns: Maximum number of LLM calls before stopping
task_id: Unique ID for terminal/browser session isolation
temperature: Sampling temperature for generation
max_tokens: Max tokens per generation (None for server default)
"""
self.server = server
self.tool_schemas = tool_schemas
self.valid_tool_names = valid_tool_names
self.max_turns = max_turns
self.task_id = task_id or str(uuid.uuid4())
self.temperature = temperature
self.max_tokens = max_tokens
async def run(self, messages: List[Dict[str, Any]]) -> AgentResult:
"""
Execute the full agent loop using standard OpenAI tool calling.
Args:
messages: Initial conversation messages (system + user).
Modified in-place as the conversation progresses.
Returns:
AgentResult with full conversation history, managed state, and metadata
"""
reasoning_per_turn = []
for turn in range(self.max_turns):
# Build the chat_completion kwargs
chat_kwargs = {
"messages": messages,
"n": 1,
"temperature": self.temperature,
}
# Only pass tools if we have them
if self.tool_schemas:
chat_kwargs["tools"] = self.tool_schemas
# Only pass max_tokens if explicitly set
if self.max_tokens is not None:
chat_kwargs["max_tokens"] = self.max_tokens
# Make the API call -- standard OpenAI spec
try:
response = await self.server.chat_completion(**chat_kwargs)
except Exception as e:
logger.error("API call failed on turn %d: %s", turn + 1, e)
return AgentResult(
messages=messages,
managed_state=self._get_managed_state(),
turns_used=turn + 1,
finished_naturally=False,
reasoning_per_turn=reasoning_per_turn,
)
if not response or not response.choices:
logger.warning("Empty response on turn %d", turn + 1)
return AgentResult(
messages=messages,
managed_state=self._get_managed_state(),
turns_used=turn + 1,
finished_naturally=False,
reasoning_per_turn=reasoning_per_turn,
)
assistant_msg = response.choices[0].message
# Extract reasoning content from the response (all provider formats)
reasoning = _extract_reasoning_from_message(assistant_msg)
reasoning_per_turn.append(reasoning)
# Check for tool calls -- standard OpenAI spec
if assistant_msg.tool_calls:
# Build the assistant message dict for conversation history
msg_dict: Dict[str, Any] = {
"role": "assistant",
"content": assistant_msg.content or "",
"tool_calls": [
{
"id": tc.id,
"type": "function",
"function": {
"name": tc.function.name,
"arguments": tc.function.arguments,
},
}
for tc in assistant_msg.tool_calls
],
}
# Preserve reasoning_content for multi-turn chat template handling
# (e.g., Kimi-K2's template renders <think> blocks differently
# for history vs. the latest turn based on this field)
if reasoning:
msg_dict["reasoning_content"] = reasoning
messages.append(msg_dict)
# Execute each tool call via hermes-agent's dispatch
for tc in assistant_msg.tool_calls:
tool_name = tc.function.name
# Validate tool name
if tool_name not in self.valid_tool_names:
tool_result = json.dumps(
{
"error": f"Unknown tool '{tool_name}'. "
f"Available tools: {sorted(self.valid_tool_names)}"
}
)
logger.warning(
"Model called unknown tool '%s' on turn %d",
tool_name,
turn + 1,
)
else:
# Parse arguments and dispatch
try:
args = json.loads(tc.function.arguments)
except json.JSONDecodeError:
args = {}
logger.warning(
"Invalid JSON in tool call arguments for '%s': %s",
tool_name,
tc.function.arguments[:200],
)
try:
tool_result = handle_function_call(
tool_name, args, task_id=self.task_id
)
except Exception as e:
tool_result = json.dumps(
{"error": f"Tool execution failed: {str(e)}"}
)
logger.error(
"Tool '%s' execution failed: %s", tool_name, e
)
# Add tool response to conversation
messages.append(
{
"role": "tool",
"tool_call_id": tc.id,
"content": tool_result,
}
)
logger.debug(
"Turn %d: %d tool calls executed",
turn + 1,
len(assistant_msg.tool_calls),
)
else:
# No tool calls -- model is done
msg_dict = {
"role": "assistant",
"content": assistant_msg.content or "",
}
if reasoning:
msg_dict["reasoning_content"] = reasoning
messages.append(msg_dict)
logger.debug(
"Turn %d: model finished naturally (no tool calls)", turn + 1
)
return AgentResult(
messages=messages,
managed_state=self._get_managed_state(),
turns_used=turn + 1,
finished_naturally=True,
reasoning_per_turn=reasoning_per_turn,
)
# Hit max turns without the model stopping
logger.info("Agent hit max_turns (%d) without finishing", self.max_turns)
return AgentResult(
messages=messages,
managed_state=self._get_managed_state(),
turns_used=self.max_turns,
finished_naturally=False,
reasoning_per_turn=reasoning_per_turn,
)
def _get_managed_state(self) -> Optional[Dict[str, Any]]:
"""
Get ManagedServer state if the server supports it.
Returns state dict with SequenceNodes containing tokens/logprobs/masks,
or None if the server doesn't support get_state() (e.g., regular OpenAI server).
"""
if hasattr(self.server, "get_state"):
return self.server.get_state()
return None

View file

@ -0,0 +1,33 @@
# SWE Environment -- Default Configuration
#
# SWE-bench style tasks with Modal sandboxes for cloud isolation.
# Uses terminal + file + web toolsets.
#
# Usage:
# python environments/hermes_swe_env.py serve --config environments/configs/swe_default.yaml
env:
enabled_toolsets: ["terminal", "file", "web"]
max_agent_turns: 30
max_token_length: 4096
group_size: 4
terminal_backend: "modal"
tool_call_parser: "hermes"
tokenizer_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
dataset_name: "bigcode/humanevalpack"
dataset_split: "test"
prompt_field: "prompt"
steps_per_eval: 50
total_steps: 500
use_wandb: true
wandb_name: "hermes-swe"
system_prompt: >
You are a skilled software engineer. You have access to a terminal,
file tools, and web search. Use these tools to complete the coding task.
Write clean, working code and verify it runs correctly before finishing.
openai:
base_url: "http://localhost:8000/v1"
model_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
server_type: "openai"
api_key: ""

View file

@ -0,0 +1,35 @@
# Terminal Test Environment -- Default Configuration
#
# Simple file-creation tasks for validating the full Atropos + hermes-agent stack.
# Uses Modal terminal backend and OpenRouter (Claude) for inference.
# API keys loaded from ~/hermes-agent/.env
#
# Usage:
# run-api
# python environments/terminal_test_env.py serve
# # Or with config file:
# python environments/terminal_test_env.py serve --config environments/configs/terminal_test_default.yaml
env:
enabled_toolsets: ["terminal", "file"]
max_agent_turns: 10
max_token_length: 2048
group_size: 3
total_steps: 3
steps_per_eval: 3
terminal_backend: "modal"
tool_call_parser: "hermes"
tokenizer_name: "NousResearch/DeepHermes-3-Llama-3-3B-Preview"
ensure_scores_are_not_same: false
use_wandb: false
system_prompt: >
You are a helpful assistant with access to a terminal and file tools.
Complete the user's request by using the available tools.
Be precise and follow instructions exactly.
openai:
base_url: "https://openrouter.ai/api/v1"
model_name: "anthropic/claude-opus-4.6"
server_type: "openai"
health_check: false
# api_key loaded from OPENROUTER_API_KEY in .env

View file

@ -0,0 +1,540 @@
"""
HermesAgentBaseEnv -- Abstract Base Environment for Hermes-Agent + Atropos
Provides the Atropos integration plumbing that all hermes-agent environments share:
- Two-mode operation (OpenAI server for Phase 1, VLLM ManagedServer for Phase 2)
- Per-group toolset/distribution resolution
- Agent loop orchestration via HermesAgentLoop
- ToolContext creation for reward functions
- ScoredDataGroup construction from ManagedServer state
Subclasses only need to implement:
setup() -- Load dataset, initialize state
get_next_item() -- Return the next item from the dataset
format_prompt() -- Convert a dataset item into the user message
compute_reward() -- Score the rollout (has full ToolContext access)
evaluate() -- Periodic evaluation
"""
import asyncio
import json
import logging
import os
import sys
import uuid
from abc import abstractmethod
from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple, Union
# Ensure the hermes-agent repo root is on sys.path so that imports like
# `from model_tools import ...` and `from environments.X import ...` work
# regardless of where the script is invoked from.
_repo_root = Path(__file__).resolve().parent.parent
if str(_repo_root) not in sys.path:
sys.path.insert(0, str(_repo_root))
from dotenv import load_dotenv
from pydantic import Field
# Load API keys from hermes-agent/.env so all environments can access them
_env_path = _repo_root / ".env"
if _env_path.exists():
load_dotenv(dotenv_path=_env_path)
from atroposlib.envs.base import (
BaseEnv,
BaseEnvConfig,
ScoredDataGroup,
ScoredDataItem,
)
from atroposlib.envs.server_handling.server_manager import (
APIServerConfig,
ServerBaseline,
ServerManager,
)
from atroposlib.type_definitions import Item
from environments.agent_loop import AgentResult, HermesAgentLoop
from environments.tool_context import ToolContext
# Import hermes-agent toolset infrastructure
from model_tools import get_tool_definitions
from toolset_distributions import sample_toolsets_from_distribution
logger = logging.getLogger(__name__)
class HermesAgentEnvConfig(BaseEnvConfig):
"""
Configuration for hermes-agent Atropos environments.
Extends BaseEnvConfig with agent-specific settings for toolsets,
terminal backend, dataset loading, and tool call parsing.
"""
# --- Toolset configuration ---
# Mutually exclusive: use either enabled_toolsets OR distribution
enabled_toolsets: Optional[List[str]] = Field(
default=None,
description="Explicit list of hermes toolsets to enable (e.g., ['terminal', 'file', 'web']). "
"If None and distribution is also None, all available toolsets are enabled.",
)
disabled_toolsets: Optional[List[str]] = Field(
default=None,
description="Toolsets to disable. Applied as a filter on top of enabled_toolsets or distribution.",
)
distribution: Optional[str] = Field(
default=None,
description="Name of a toolset distribution from toolset_distributions.py "
"(e.g., 'development', 'terminal_tasks'). Sampled once per group. "
"Mutually exclusive with enabled_toolsets.",
)
# --- Agent loop configuration ---
max_agent_turns: int = Field(
default=30,
description="Maximum number of LLM calls (tool-calling iterations) per rollout.",
)
system_prompt: Optional[str] = Field(
default=None,
description="System prompt for the agent. Tools are handled via the tools= parameter, "
"not embedded in the prompt text.",
)
agent_temperature: float = Field(
default=1.0,
description="Sampling temperature for agent generation during rollouts.",
)
# --- Terminal backend ---
terminal_backend: str = Field(
default="local",
description="Terminal backend: 'local', 'docker', 'modal', 'ssh', 'singularity'. "
"Modal recommended for production RL (cloud isolation per rollout).",
)
# --- Dataset ---
dataset_name: Optional[str] = Field(
default=None,
description="HuggingFace dataset name. Optional if tasks are defined inline.",
)
dataset_split: str = Field(
default="train",
description="Dataset split to use.",
)
prompt_field: str = Field(
default="prompt",
description="Which field in the dataset contains the prompt.",
)
# --- Phase 2: Tool call parsing ---
tool_call_parser: str = Field(
default="hermes",
description="Tool call parser name for Phase 2 (VLLM server type). "
"Ignored in Phase 1 (OpenAI server type where VLLM parses natively). "
"Options: hermes, mistral, llama3_json, qwen, deepseek_v3, etc.",
)
class HermesAgentBaseEnv(BaseEnv):
"""
Abstract base environment for hermes-agent Atropos integration.
Handles two modes of operation:
- Phase 1 (OpenAI server type): Uses server.chat_completion() directly.
The server (VLLM, SGLang, OpenRouter, OpenAI) handles tool call parsing
and reasoning extraction natively. DummyManagedServer provides placeholder
tokens. Good for SFT data gen, verifier testing, evaluation.
- Phase 2 (VLLM server type): Uses ManagedServer for exact token IDs + logprobs
via /generate. Client-side tool call parser reconstructs structured tool_calls
from raw output. Full RL training capability.
Subclasses must implement:
setup() -- Load dataset, initialize state
get_next_item() -- Return the next item to roll out
format_prompt() -- Convert a dataset item into the user message string
compute_reward() -- Score the rollout using ToolContext
evaluate() -- Periodic evaluation
"""
name: Optional[str] = "hermes-agent"
env_config_cls = HermesAgentEnvConfig
def __init__(
self,
config: HermesAgentEnvConfig,
server_configs: Union[ServerBaseline, List[APIServerConfig]],
slurm=False,
testing=False,
):
super().__init__(config, server_configs, slurm, testing)
# Set terminal backend environment variable so hermes tools pick it up
if config.terminal_backend:
os.environ["TERMINAL_ENV"] = config.terminal_backend
# Current group's resolved tools (set in collect_trajectories)
self._current_group_tools: Optional[Tuple[List[Dict], Set[str]]] = None
# =========================================================================
# Toolset resolution (per-group)
# =========================================================================
def _resolve_tools_for_group(self) -> Tuple[List[Dict[str, Any]], Set[str]]:
"""
Resolve toolsets for a group. Called once in collect_trajectories(),
then shared by all collect_trajectory() calls in the group.
If distribution is set, samples probabilistically.
If enabled_toolsets is set, uses that explicit list.
disabled_toolsets is applied as a filter on top.
Returns:
(tool_schemas, valid_tool_names) tuple
"""
config = self.config
if config.distribution:
group_toolsets = sample_toolsets_from_distribution(config.distribution)
logger.info("Sampled toolsets from '%s': %s", config.distribution, group_toolsets)
else:
group_toolsets = config.enabled_toolsets # None means "all available"
tools = get_tool_definitions(
enabled_toolsets=group_toolsets,
disabled_toolsets=config.disabled_toolsets,
quiet_mode=True,
)
valid_names = {t["function"]["name"] for t in tools} if tools else set()
logger.info("Resolved %d tools for group: %s", len(valid_names), sorted(valid_names))
return tools, valid_names
# =========================================================================
# Server mode detection
# =========================================================================
def _use_managed_server(self) -> bool:
"""
Determine if we should use ManagedServer (Phase 2) or direct server (Phase 1).
Phase 2 (ManagedServer) is used when the server type is 'vllm' or 'sglang',
which go through the /generate endpoint for exact token tracking.
Phase 1 (direct server) is used for 'openai' server type, which uses
/v1/chat/completions with native tool call parsing.
"""
if not self.server.servers:
return False
server = self.server.servers[0]
# If the server is an OpenAI server (not VLLM/SGLang), use direct mode
from atroposlib.envs.server_handling.openai_server import OpenAIServer
return not isinstance(server, OpenAIServer)
# =========================================================================
# Core Atropos integration
# =========================================================================
async def collect_trajectories(
self, item: Item
) -> Tuple[
Union[Optional[ScoredDataGroup], List[Optional[ScoredDataGroup]]],
List[Item],
]:
"""
Override collect_trajectories to resolve toolsets once per group,
then delegate to the standard group-level collection.
The default BaseEnv.collect_trajectories() calls collect_trajectory()
group_size times in parallel. We resolve tools once here and store
them for all those calls to use.
"""
# Resolve toolsets for this group (shared by all rollouts in the group)
self._current_group_tools = self._resolve_tools_for_group()
# Delegate to the default implementation which calls collect_trajectory()
# group_size times via asyncio.gather
return await super().collect_trajectories(item)
# =========================================================================
# Wandb rollout display -- format trajectories nicely
# =========================================================================
@staticmethod
def _format_trajectory_for_display(messages: List[Dict[str, Any]]) -> str:
"""
Format a conversation's messages into a readable trajectory string
for wandb rollout tables. Shows tool calls, tool results, and reasoning
in a structured way instead of raw token decoding.
"""
parts = []
for msg in messages:
role = msg.get("role", "unknown")
content = msg.get("content", "")
if role == "system":
parts.append(f"[SYSTEM]\n{content}")
elif role == "user":
parts.append(f"[USER]\n{content}")
elif role == "assistant":
# Show reasoning if present
reasoning = msg.get("reasoning_content", "")
if reasoning:
# Truncate long reasoning for display
if len(reasoning) > 300:
reasoning = reasoning[:300] + "..."
parts.append(f"[ASSISTANT thinking]\n{reasoning}")
# Show content
if content:
parts.append(f"[ASSISTANT]\n{content}")
# Show tool calls
tool_calls = msg.get("tool_calls", [])
for tc in tool_calls:
func = tc.get("function", {})
name = func.get("name", "?")
args = func.get("arguments", "{}")
# Truncate long arguments for display
if len(args) > 200:
args = args[:200] + "..."
parts.append(f"[TOOL CALL] {name}({args})")
elif role == "tool":
tool_id = msg.get("tool_call_id", "")
result = content
# Truncate long tool results for display
if len(result) > 500:
result = result[:500] + "..."
parts.append(f"[TOOL RESULT] {result}")
return "\n\n".join(parts)
async def add_rollouts_for_wandb(
self,
scored_data,
item=None,
):
"""
Override to show formatted trajectories with tool calls visible,
instead of raw token decoding which loses all structure.
"""
num_keep = self.config.num_rollouts_per_group_for_logging
if num_keep == -1:
num_keep = self.config.group_size
group = []
for i in range(min(num_keep, len(scored_data.get("scores", [])))):
score = scored_data["scores"][i]
# Use messages if available for rich display
messages = None
if scored_data.get("messages") and i < len(scored_data["messages"]):
messages = scored_data["messages"][i]
if messages:
text = self._format_trajectory_for_display(messages)
elif scored_data.get("tokens") and i < len(scored_data["tokens"]):
text = self.tokenizer.decode(scored_data["tokens"][i])
else:
text = "(no data)"
group.append((text, score))
self.rollouts_for_wandb.append(group)
if len(self.rollouts_for_wandb) > self.config.num_rollouts_to_keep:
self.rollouts_for_wandb.pop(0)
async def collect_trajectory(
self, item: Item
) -> Tuple[Optional[Union[ScoredDataItem, Any]], List[Item]]:
"""
Run a single rollout: agent loop + reward computation.
This is called group_size times in parallel by collect_trajectories().
Each call gets its own task_id for terminal/browser session isolation.
"""
task_id = str(uuid.uuid4())
# Get group-level tools (resolved once in collect_trajectories)
if self._current_group_tools is None:
# Fallback: resolve per-trajectory if called outside collect_trajectories
tools, valid_names = self._resolve_tools_for_group()
else:
tools, valid_names = self._current_group_tools
# Build initial messages
messages: List[Dict[str, Any]] = []
if self.config.system_prompt:
messages.append({"role": "system", "content": self.config.system_prompt})
messages.append({"role": "user", "content": self.format_prompt(item)})
# Run the agent loop
result: AgentResult
if self._use_managed_server():
# Phase 2: ManagedServer with parser -- exact tokens + logprobs
try:
async with self.server.managed_server(tokenizer=self.tokenizer) as managed:
agent = HermesAgentLoop(
server=managed,
tool_schemas=tools,
valid_tool_names=valid_names,
max_turns=self.config.max_agent_turns,
task_id=task_id,
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
)
result = await agent.run(messages)
except NotImplementedError:
# DummyManagedServer not allowed -- fall back to Phase 1
logger.warning(
"ManagedServer not available (OpenAI server?). "
"Falling back to direct server mode."
)
agent = HermesAgentLoop(
server=self.server,
tool_schemas=tools,
valid_tool_names=valid_names,
max_turns=self.config.max_agent_turns,
task_id=task_id,
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
)
result = await agent.run(messages)
else:
# Phase 1: OpenAI server -- native tool_calls, placeholder tokens
agent = HermesAgentLoop(
server=self.server,
tool_schemas=tools,
valid_tool_names=valid_names,
max_turns=self.config.max_agent_turns,
task_id=task_id,
temperature=self.config.agent_temperature,
max_tokens=self.config.max_token_length,
)
result = await agent.run(messages)
# Compute reward using ToolContext (gives verifier full tool access)
ctx = ToolContext(task_id)
try:
reward = await self.compute_reward(item, result, ctx)
except Exception as e:
logger.error("compute_reward failed: %s", e)
reward = 0.0
finally:
ctx.cleanup()
# Build ScoredDataItem from ManagedServer state
# Phase 2: real tokens/masks/logprobs from SequenceNodes
# Phase 1: placeholder tokens (still need a valid ScoredDataItem for the pipeline)
nodes = (result.managed_state or {}).get("nodes", [])
if nodes:
# Phase 2 (or DummyManagedServer): use actual node data
node = nodes[-1] # Final sequence node = full trajectory
scored_item: Dict[str, Any] = {
"tokens": node.tokens,
"masks": node.masked_tokens,
"scores": reward,
}
# Include logprobs if available (Phase 2)
if hasattr(node, "logprobs") and node.logprobs:
scored_item["advantages"] = None # Computed by trainer
scored_item["ref_logprobs"] = None
else:
# Phase 1 with no managed state: create placeholder tokens
# so the data pipeline doesn't break. These are NOT suitable
# for training but allow process mode (SFT data gen) to work.
# Tokenize the full conversation to get approximate tokens.
full_text = "\n".join(
msg.get("content", "") for msg in result.messages if msg.get("content")
)
if self.tokenizer:
tokens = self.tokenizer.encode(full_text, add_special_tokens=True)
else:
tokens = list(range(min(len(full_text) // 4, 128)))
scored_item = {
"tokens": tokens,
"masks": [-100] + tokens[1:], # Mask first token as prompt
"scores": reward,
}
# Always include messages for wandb rollout display and data logging
scored_item["messages"] = result.messages
return scored_item, []
# =========================================================================
# Abstract methods -- subclasses must implement
# =========================================================================
@abstractmethod
async def setup(self):
"""
Load dataset, initialize state.
Called once when the environment starts. Typical implementation:
self.dataset = load_dataset(self.config.dataset_name, split=self.config.dataset_split)
self.iter = 0
"""
raise NotImplementedError
@abstractmethod
async def get_next_item(self) -> Item:
"""
Return the next item from the dataset for rollout.
Called by the base env's main loop to get items for workers.
Should cycle through the dataset.
"""
raise NotImplementedError
@abstractmethod
def format_prompt(self, item: Item) -> str:
"""
Convert a dataset item into the user message for the agent.
Args:
item: Dataset item (dict, tuple, etc.)
Returns:
The prompt string to send to the agent
"""
raise NotImplementedError
@abstractmethod
async def compute_reward(
self, item: Item, result: AgentResult, ctx: ToolContext
) -> float:
"""
Score the rollout. Has full access to:
- item: the original dataset item (ground truth, test commands, etc.)
- result: AgentResult with full messages, turn count, reasoning, etc.
- ctx: ToolContext -- call ANY hermes-agent tool (terminal, file, web,
browser, vision...) scoped to this rollout's sandbox. Nothing
is off-limits.
Args:
item: The dataset item that was rolled out
result: The agent's rollout result
ctx: ToolContext with full tool access for verification
Returns:
Reward float (typically 0.0 to 1.0, but any float is valid)
"""
raise NotImplementedError
@abstractmethod
async def evaluate(self, *args, **kwargs):
"""
Periodic evaluation. Called every steps_per_eval steps.
Typical implementation runs the agent on a held-out eval set
and logs metrics via wandb/evaluate_log.
"""
raise NotImplementedError

View file

@ -0,0 +1,229 @@
"""
HermesSweEnv -- SWE-Bench Style Environment with Modal Sandboxes
A concrete environment for software engineering tasks where the model writes code
and the reward function runs tests to verify correctness. Uses Modal terminal
backend for cloud-isolated sandboxes per rollout.
The reward function uses ToolContext.terminal() to run test commands in the same
Modal sandbox the model used during its agentic loop. All filesystem state from
the model's tool calls is preserved for verification.
Usage:
# Phase 1: OpenAI server type
vllm serve YourModel --tool-parser hermes
run-api
python environments/hermes_swe_env.py serve \\
--openai.base_url http://localhost:8000/v1 \\
--openai.model_name YourModel \\
--openai.server_type openai \\
--env.dataset_name bigcode/humanevalpack \\
--env.terminal_backend modal
# Phase 2: VLLM server type (full RL training)
python environments/hermes_swe_env.py serve \\
--openai.base_url http://localhost:8000/v1 \\
--openai.model_name YourModel \\
--openai.server_type vllm \\
--env.tool_call_parser hermes \\
--env.terminal_backend modal
"""
import logging
import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
# Ensure repo root is on sys.path for imports
_repo_root = Path(__file__).resolve().parent.parent
if str(_repo_root) not in sys.path:
sys.path.insert(0, str(_repo_root))
from datasets import load_dataset
from atroposlib.envs.base import ScoredDataGroup
from atroposlib.envs.server_handling.server_manager import APIServerConfig
from atroposlib.type_definitions import Item
from environments.agent_loop import AgentResult
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
from environments.tool_context import ToolContext
logger = logging.getLogger(__name__)
class HermesSweEnvConfig(HermesAgentEnvConfig):
"""Config with defaults for SWE-bench style tasks."""
pass # Inherits all fields, overrides defaults in config_init
class HermesSweEnv(HermesAgentBaseEnv):
"""
SWE-bench style environment using Modal terminal backend.
The model gets a coding task, uses terminal + file + web tools to solve it,
and the reward function runs tests in the same Modal sandbox to verify.
Subclass this for specific SWE datasets (HumanEval, SWE-bench, etc.)
and customize format_prompt() and compute_reward() as needed.
"""
name = "hermes-swe"
env_config_cls = HermesSweEnvConfig
@classmethod
def config_init(cls) -> Tuple[HermesSweEnvConfig, List[APIServerConfig]]:
"""
Default configuration for the SWE environment.
Uses Modal terminal backend for cloud isolation and terminal + file + web toolsets.
"""
env_config = HermesSweEnvConfig(
# Toolsets: terminal for running code, file for reading/writing, web for docs
enabled_toolsets=["terminal", "file", "web"],
disabled_toolsets=None,
distribution=None,
# Agent settings -- SWE tasks need more turns
max_agent_turns=30,
max_token_length=4096,
agent_temperature=1.0,
system_prompt=(
"You are a skilled software engineer. You have access to a terminal, "
"file tools, and web search. Use these tools to complete the coding task. "
"Write clean, working code and verify it runs correctly before finishing."
),
# Modal backend for cloud-isolated sandboxes
terminal_backend="modal",
# Dataset -- override via CLI for your specific SWE dataset
dataset_name="bigcode/humanevalpack",
dataset_split="test",
prompt_field="prompt",
# Atropos settings
group_size=4,
tokenizer_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
tool_call_parser="hermes",
steps_per_eval=50,
total_steps=500,
use_wandb=True,
wandb_name="hermes-swe",
)
server_configs = [
APIServerConfig(
base_url="http://localhost:8000/v1",
model_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
server_type="openai", # Phase 1; switch to "vllm" for Phase 2
api_key="",
)
]
return env_config, server_configs
async def setup(self):
"""Load the SWE dataset."""
if self.config.dataset_name:
self.dataset = load_dataset(
self.config.dataset_name, split=self.config.dataset_split
)
else:
# Placeholder if no dataset specified
self.dataset = []
self.iter = 0
self.reward_buffer: List[float] = []
async def get_next_item(self) -> Dict[str, Any]:
"""Cycle through the SWE dataset."""
if not self.dataset:
raise ValueError("No dataset loaded. Set dataset_name in config.")
item = self.dataset[self.iter % len(self.dataset)]
self.iter += 1
return item
def format_prompt(self, item: Dict[str, Any]) -> str:
"""
Format the SWE task prompt.
Override this in subclasses for different dataset formats.
Default assumes the dataset has a 'prompt' field and optionally a 'test' field.
"""
prompt = item.get(self.config.prompt_field, "")
# If the dataset has test information, include it in the prompt
test_info = item.get("test", item.get("test_code", item.get("tests", "")))
if test_info:
prompt += f"\n\nTests to pass:\n{test_info}"
return prompt
async def compute_reward(
self, item: Dict[str, Any], result: AgentResult, ctx: ToolContext
) -> float:
"""
Score by running tests in the model's Modal sandbox.
Default implementation:
- If the dataset item has a 'test' or 'test_code' field, run it
- Check exit code: 0 = pass, non-zero = fail
- Partial credit for file creation
Override this in subclasses for more sophisticated reward logic.
"""
# Find the test command from the dataset item
test_code = item.get("test", item.get("test_code", item.get("tests", "")))
if test_code:
# Run the test in the model's sandbox
test_result = ctx.terminal(
f'cd /workspace && python3 -c "{test_code}"', timeout=60
)
if test_result["exit_code"] == 0:
self.reward_buffer.append(1.0)
return 1.0
# Partial credit: check if the model created any Python files
file_check = ctx.terminal("find /workspace -name '*.py' -newer /tmp/.start_marker 2>/dev/null | head -5")
if file_check["exit_code"] == 0 and file_check.get("output", "").strip():
self.reward_buffer.append(0.1)
return 0.1
self.reward_buffer.append(0.0)
return 0.0
async def evaluate(self, *args, **kwargs):
"""
Run evaluation on a held-out set.
Override for dataset-specific evaluation logic.
"""
start_time = time.time()
end_time = time.time()
eval_metrics = {"eval/placeholder": 0.0}
await self.evaluate_log(
metrics=eval_metrics,
start_time=start_time,
end_time=end_time,
)
async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
"""Log SWE-specific metrics."""
if wandb_metrics is None:
wandb_metrics = {}
if self.reward_buffer:
wandb_metrics["train/avg_reward"] = sum(self.reward_buffer) / len(
self.reward_buffer
)
wandb_metrics["train/pass_rate"] = sum(
1 for r in self.reward_buffer if r == 1.0
) / len(self.reward_buffer)
self.reward_buffer = []
await super().wandb_log(wandb_metrics)
if __name__ == "__main__":
HermesSweEnv.cli()

View file

@ -0,0 +1,292 @@
"""
TerminalTestEnv -- Simple Test Environment for Validating the Stack
A self-contained environment with inline tasks (no external dataset needed).
Each task asks the model to create a file at a known path with specific content.
The reward verifier cats the file and checks if the content matches.
Enables only terminal + file toolsets. Uses Modal terminal backend with
OpenRouter (Claude) by default.
Training tasks (3):
1. Create ~/greeting.txt with "Hello from Hermes Agent"
2. Create ~/count.txt with numbers 1-5, one per line
3. Create ~/answer.txt with the result of 123 + 456
Eval task (1):
1. Create ~/result.txt with the result of 6 * 7
Usage:
# Start Atropos API server
run-api
# Run environment (uses OpenRouter + Modal by default)
python environments/terminal_test_env.py serve
# Process mode (no run-api needed, saves to JSONL)
python environments/terminal_test_env.py process \\
--env.data_path_to_save_groups terminal_test_output.jsonl
"""
import logging
import os
import sys
import time
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple, Union
# Ensure repo root is on sys.path for imports
_repo_root = Path(__file__).resolve().parent.parent
if str(_repo_root) not in sys.path:
sys.path.insert(0, str(_repo_root))
from atroposlib.envs.base import ScoredDataGroup
from atroposlib.envs.server_handling.server_manager import APIServerConfig
from atroposlib.type_definitions import Item
from environments.agent_loop import AgentResult
from environments.hermes_base_env import HermesAgentBaseEnv, HermesAgentEnvConfig
from environments.tool_context import ToolContext
logger = logging.getLogger(__name__)
# =============================================================================
# Inline task definitions -- no external dataset needed
# =============================================================================
TRAIN_TASKS = [
{
"prompt": "Create a file at ~/greeting.txt containing exactly the text: Hello from Hermes Agent",
"verify_path": "~/greeting.txt",
"expected_content": "Hello from Hermes Agent",
},
{
"prompt": "Create a file at ~/count.txt containing the numbers 1 through 5, one per line",
"verify_path": "~/count.txt",
"expected_content": "1\n2\n3\n4\n5",
},
{
"prompt": "Create a file at ~/answer.txt containing the result of 123 + 456",
"verify_path": "~/answer.txt",
"expected_content": "579",
},
]
EVAL_TASKS = [
{
"prompt": "Create a file at ~/result.txt containing the result of 6 * 7",
"verify_path": "~/result.txt",
"expected_content": "42",
},
]
class TerminalTestEnvConfig(HermesAgentEnvConfig):
"""Config with defaults suitable for terminal testing."""
pass # Inherits all fields, overrides defaults in config_init
class TerminalTestEnv(HermesAgentBaseEnv):
"""
Simple test environment with inline file-creation tasks.
All tasks follow the same pattern: "create a file at ~/X.txt with content Y".
The verifier runs `cat ~/X.txt` in the rollout's terminal and checks the output
against the expected string. Same verifier logic for all tasks.
This environment is designed to validate the full stack end-to-end:
- Agent loop executes tool calls (terminal/file)
- ToolContext provides terminal access to the reward function
- Reward function verifies file content via cat
- Scored data flows through the Atropos pipeline
"""
name = "terminal-test"
env_config_cls = TerminalTestEnvConfig
@classmethod
def config_init(cls) -> Tuple[TerminalTestEnvConfig, List[APIServerConfig]]:
"""
Default configuration for the terminal test environment.
Uses Modal terminal backend for cloud isolation and OpenRouter with
Claude for inference. API keys loaded from ~/hermes-agent/.env.
"""
env_config = TerminalTestEnvConfig(
# Terminal + file tools only
enabled_toolsets=["terminal", "file"],
disabled_toolsets=None,
distribution=None,
# Agent settings
max_agent_turns=10, # Simple tasks, don't need many turns
max_token_length=2048,
agent_temperature=1.0,
system_prompt=(
"You are a helpful assistant with access to a terminal and file tools. "
"Complete the user's request by using the available tools. "
"Be precise and follow instructions exactly."
),
# Modal terminal backend for cloud-isolated sandboxes per rollout
terminal_backend="modal",
# Atropos settings
group_size=3, # 3 rollouts per group
tokenizer_name="NousResearch/DeepHermes-3-Llama-3-3B-Preview",
tool_call_parser="hermes",
steps_per_eval=3, # Eval after all 3 steps
total_steps=3, # 3 groups total (1 group per step)
use_wandb=True,
wandb_name="terminal-test",
ensure_scores_are_not_same=False, # Allow all-same scores for simple tasks
# No external dataset
dataset_name=None,
)
# OpenRouter with Claude -- API key loaded from .env (OPENROUTER_API_KEY)
server_configs = [
APIServerConfig(
base_url="https://openrouter.ai/api/v1",
model_name="anthropic/claude-opus-4.6",
server_type="openai",
api_key=os.getenv("OPENROUTER_API_KEY", ""),
health_check=False, # OpenRouter doesn't have a /health endpoint
)
]
return env_config, server_configs
async def setup(self):
"""Initialize inline task lists."""
self.train_tasks = list(TRAIN_TASKS)
self.eval_tasks = list(EVAL_TASKS)
self.iter = 0
# Track reward stats for wandb logging
self.reward_buffer: List[float] = []
async def get_next_item(self) -> Dict[str, str]:
"""Cycle through training tasks."""
item = self.train_tasks[self.iter % len(self.train_tasks)]
self.iter += 1
return item
def format_prompt(self, item: Dict[str, str]) -> str:
"""The prompt is directly in the task item."""
return item["prompt"]
async def compute_reward(
self, item: Dict[str, str], result: AgentResult, ctx: ToolContext
) -> float:
"""
Verify by cat-ing the expected file path and checking content matches.
Same verifier for all tasks -- they all write a file at a known path.
Scoring:
1.0 = exact match
0.5 = expected content is present but has extra stuff
0.0 = file doesn't exist or content doesn't match
"""
verify_result = ctx.terminal(f"cat {item['verify_path']}")
# File doesn't exist or can't be read
if verify_result["exit_code"] != 0:
self.reward_buffer.append(0.0)
return 0.0
actual = verify_result.get("output", "").strip()
expected = item["expected_content"].strip()
# Exact match
if actual == expected:
self.reward_buffer.append(1.0)
return 1.0
# Partial credit: expected content is present but has extra stuff
if expected in actual:
self.reward_buffer.append(0.5)
return 0.5
self.reward_buffer.append(0.0)
return 0.0
async def evaluate(self, *args, **kwargs):
"""
Run eval tasks using the agent loop and verify results.
Logs accuracy metrics.
"""
start_time = time.time()
correct = 0
total = len(self.eval_tasks)
samples = []
for eval_item in self.eval_tasks:
try:
# For eval, we do a simple single-turn completion (not full agent loop)
# to keep eval fast. The agent loop is tested via training.
completion = await self.server.chat_completion(
messages=[
{"role": "system", "content": self.config.system_prompt or ""},
{"role": "user", "content": eval_item["prompt"]},
],
n=1,
max_tokens=self.config.max_token_length,
temperature=0.0,
split="eval",
)
response_content = (
completion.choices[0].message.content if completion.choices else ""
)
samples.append(
{
"prompt": eval_item["prompt"],
"response": response_content,
"expected": eval_item["expected_content"],
}
)
except Exception as e:
logger.error("Eval failed for item: %s", e)
samples.append(
{
"prompt": eval_item["prompt"],
"response": f"ERROR: {e}",
"expected": eval_item["expected_content"],
}
)
end_time = time.time()
eval_metrics = {
"eval/num_samples": total,
}
await self.evaluate_log(
metrics=eval_metrics,
samples=samples,
start_time=start_time,
end_time=end_time,
)
async def wandb_log(self, wandb_metrics: Optional[Dict] = None):
"""Log training metrics including reward stats and accuracy."""
if wandb_metrics is None:
wandb_metrics = {}
if self.reward_buffer:
total = len(self.reward_buffer)
correct = sum(1 for r in self.reward_buffer if r == 1.0)
partial = sum(1 for r in self.reward_buffer if r == 0.5)
wandb_metrics["train/avg_reward"] = sum(self.reward_buffer) / total
wandb_metrics["train/accuracy"] = correct / total
wandb_metrics["train/partial_match_rate"] = partial / total
wandb_metrics["train/total_rollouts"] = total
self.reward_buffer = []
await super().wandb_log(wandb_metrics)
if __name__ == "__main__":
TerminalTestEnv.cli()

View file

@ -0,0 +1,120 @@
"""
Tool Call Parser Registry
Client-side parsers that extract structured tool_calls from raw model output text.
Used in Phase 2 (VLLM server type) where ManagedServer's /generate endpoint returns
raw text without tool call parsing.
Each parser is a standalone reimplementation of the corresponding VLLM parser's
non-streaming extract_tool_calls() logic. No VLLM dependency -- only standard library
(re, json, uuid) and openai types.
Usage:
from environments.tool_call_parsers import get_parser
parser = get_parser("hermes")
content, tool_calls = parser.parse(raw_model_output)
# content = text with tool call markup stripped
# tool_calls = list of ChatCompletionMessageToolCall objects, or None
"""
import logging
from abc import ABC, abstractmethod
from typing import Dict, List, Optional, Tuple, Type
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall,
)
logger = logging.getLogger(__name__)
# Type alias for parser return value
ParseResult = Tuple[Optional[str], Optional[List[ChatCompletionMessageToolCall]]]
class ToolCallParser(ABC):
"""
Base class for tool call parsers.
Each parser knows how to extract structured tool_calls from a specific
model family's raw output text format.
"""
@abstractmethod
def parse(self, text: str) -> ParseResult:
"""
Parse raw model output text for tool calls.
Args:
text: Raw decoded text from the model's completion
Returns:
Tuple of (content, tool_calls) where:
- content: text with tool call markup stripped (the message 'content' field),
or None if the entire output was tool calls
- tool_calls: list of ChatCompletionMessageToolCall objects,
or None if no tool calls were found
"""
raise NotImplementedError
# Global parser registry: name -> parser class
PARSER_REGISTRY: Dict[str, Type[ToolCallParser]] = {}
def register_parser(name: str):
"""
Decorator to register a parser class under a given name.
Usage:
@register_parser("hermes")
class HermesToolCallParser(ToolCallParser):
...
"""
def decorator(cls: Type[ToolCallParser]) -> Type[ToolCallParser]:
PARSER_REGISTRY[name] = cls
return cls
return decorator
def get_parser(name: str) -> ToolCallParser:
"""
Get a parser instance by name.
Args:
name: Parser name (e.g., "hermes", "mistral", "llama3_json")
Returns:
Instantiated parser
Raises:
KeyError: If parser name is not found in registry
"""
if name not in PARSER_REGISTRY:
available = sorted(PARSER_REGISTRY.keys())
raise KeyError(
f"Tool call parser '{name}' not found. Available parsers: {available}"
)
return PARSER_REGISTRY[name]()
def list_parsers() -> List[str]:
"""Return sorted list of registered parser names."""
return sorted(PARSER_REGISTRY.keys())
# Import all parser modules to trigger registration via @register_parser decorators
# Each module registers itself when imported
from environments.tool_call_parsers.hermes_parser import HermesToolCallParser # noqa: E402, F401
from environments.tool_call_parsers.longcat_parser import LongcatToolCallParser # noqa: E402, F401
from environments.tool_call_parsers.mistral_parser import MistralToolCallParser # noqa: E402, F401
from environments.tool_call_parsers.llama_parser import LlamaToolCallParser # noqa: E402, F401
from environments.tool_call_parsers.qwen_parser import QwenToolCallParser # noqa: E402, F401
from environments.tool_call_parsers.deepseek_v3_parser import DeepSeekV3ToolCallParser # noqa: E402, F401
from environments.tool_call_parsers.deepseek_v3_1_parser import DeepSeekV31ToolCallParser # noqa: E402, F401
from environments.tool_call_parsers.kimi_k2_parser import KimiK2ToolCallParser # noqa: E402, F401
from environments.tool_call_parsers.glm45_parser import Glm45ToolCallParser # noqa: E402, F401
from environments.tool_call_parsers.glm47_parser import Glm47ToolCallParser # noqa: E402, F401
from environments.tool_call_parsers.qwen3_coder_parser import Qwen3CoderToolCallParser # noqa: E402, F401

View file

@ -0,0 +1,71 @@
"""
DeepSeek V3.1 tool call parser.
Similar to V3 but with a slightly different format:
<toolcallbegin>function_name<toolsep>arguments<toolcallend>
Note: V3 has type+name before the separator, V3.1 has name before and args after.
Based on VLLM's DeepSeekV31ToolParser.extract_tool_calls()
"""
import re
import uuid
from typing import List, Optional
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall,
Function,
)
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
@register_parser("deepseek_v3_1")
@register_parser("deepseek_v31")
class DeepSeekV31ToolCallParser(ToolCallParser):
"""
Parser for DeepSeek V3.1 tool calls.
Slightly different regex than V3: function_name comes before the separator,
arguments come after (no type field, no json code block wrapper).
"""
START_TOKEN = "<tool▁calls▁begin>"
# Regex captures: function_name, function_arguments
PATTERN = re.compile(
r"<tool▁call▁begin>(?P<function_name>.*?)<tool▁sep>(?P<function_arguments>.*?)<tool▁call▁end>"
)
def parse(self, text: str) -> ParseResult:
if self.START_TOKEN not in text:
return text, None
try:
matches = self.PATTERN.findall(text)
if not matches:
return text, None
tool_calls: List[ChatCompletionMessageToolCall] = []
for match in matches:
func_name, func_args = match
tool_calls.append(
ChatCompletionMessageToolCall(
id=f"call_{uuid.uuid4().hex[:8]}",
type="function",
function=Function(
name=func_name.strip(),
arguments=func_args.strip(),
),
)
)
if not tool_calls:
return text, None
content = text[: text.find(self.START_TOKEN)].strip()
return content if content else None, tool_calls
except Exception:
return text, None

View file

@ -0,0 +1,75 @@
"""
DeepSeek V3 tool call parser.
Format uses special unicode tokens:
<toolcallsbegin>
<toolcallbegin>type<toolsep>function_name
```json
{"arg": "value"}
```
<toolcallend>
<toolcallsend>
Based on VLLM's DeepSeekV3ToolParser.extract_tool_calls()
"""
import re
import uuid
from typing import List, Optional
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall,
Function,
)
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
@register_parser("deepseek_v3")
class DeepSeekV3ToolCallParser(ToolCallParser):
"""
Parser for DeepSeek V3 tool calls.
Uses special unicode tokens with fullwidth angle brackets and block elements.
Extracts type, function name, and JSON arguments from the structured format.
"""
START_TOKEN = "<tool▁calls▁begin>"
# Regex captures: type, function_name, function_arguments
PATTERN = re.compile(
r"<tool▁call▁begin>(?P<type>.*)<tool▁sep>(?P<function_name>.*)\n```json\n(?P<function_arguments>.*)\n```<tool▁call▁end>"
)
def parse(self, text: str) -> ParseResult:
if self.START_TOKEN not in text:
return text, None
try:
matches = self.PATTERN.findall(text)
if not matches:
return text, None
tool_calls: List[ChatCompletionMessageToolCall] = []
for match in matches:
tc_type, func_name, func_args = match
tool_calls.append(
ChatCompletionMessageToolCall(
id=f"call_{uuid.uuid4().hex[:8]}",
type="function",
function=Function(
name=func_name.strip(),
arguments=func_args.strip(),
),
)
)
if not tool_calls:
return text, None
# Content is everything before the tool calls section
content = text[: text.find(self.START_TOKEN)].strip()
return content if content else None, tool_calls
except Exception:
return text, None

View file

@ -0,0 +1,109 @@
"""
GLM 4.5 (GLM-4-MoE) tool call parser.
Format uses custom arg_key/arg_value tags rather than standard JSON:
<tool_call>function_name
<arg_key>param1</arg_key><arg_value>value1</arg_value>
<arg_key>param2</arg_key><arg_value>value2</arg_value>
</tool_call>
Values are deserialized using json.loads -> ast.literal_eval -> raw string fallback.
Based on VLLM's Glm4MoeModelToolParser.extract_tool_calls()
"""
import ast
import json
import re
import uuid
from typing import Any, Dict, List, Optional
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall,
Function,
)
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
def _deserialize_value(value: str) -> Any:
"""
Try to deserialize a string value to its native Python type.
Attempts json.loads, then ast.literal_eval, then returns raw string.
"""
try:
return json.loads(value)
except (json.JSONDecodeError, TypeError):
pass
try:
return ast.literal_eval(value)
except (ValueError, SyntaxError, TypeError):
pass
return value
@register_parser("glm45")
class Glm45ToolCallParser(ToolCallParser):
"""
Parser for GLM 4.5 (GLM-4-MoE) tool calls.
Uses <tool_call>...</tool_call> tags with <arg_key>/<arg_value> pairs
instead of standard JSON arguments.
"""
FUNC_CALL_REGEX = re.compile(r"<tool_call>.*?</tool_call>", re.DOTALL)
FUNC_DETAIL_REGEX = re.compile(r"<tool_call>([^\n]*)\n(.*)</tool_call>", re.DOTALL)
FUNC_ARG_REGEX = re.compile(
r"<arg_key>(.*?)</arg_key>\s*<arg_value>(.*?)</arg_value>", re.DOTALL
)
START_TOKEN = "<tool_call>"
def parse(self, text: str) -> ParseResult:
if self.START_TOKEN not in text:
return text, None
try:
matched_calls = self.FUNC_CALL_REGEX.findall(text)
if not matched_calls:
return text, None
tool_calls: List[ChatCompletionMessageToolCall] = []
for match in matched_calls:
detail = self.FUNC_DETAIL_REGEX.search(match)
if not detail:
continue
func_name = detail.group(1).strip()
func_args_raw = detail.group(2)
# Parse arg_key/arg_value pairs
pairs = self.FUNC_ARG_REGEX.findall(func_args_raw) if func_args_raw else []
arg_dict: Dict[str, Any] = {}
for key, value in pairs:
arg_key = key.strip()
arg_val = _deserialize_value(value.strip())
arg_dict[arg_key] = arg_val
tool_calls.append(
ChatCompletionMessageToolCall(
id=f"call_{uuid.uuid4().hex[:8]}",
type="function",
function=Function(
name=func_name,
arguments=json.dumps(arg_dict, ensure_ascii=False),
),
)
)
if not tool_calls:
return text, None
content = text[: text.find(self.START_TOKEN)].strip()
return content if content else None, tool_calls
except Exception:
return text, None

View file

@ -0,0 +1,35 @@
"""
GLM 4.7 tool call parser.
Same as GLM 4.5 but with slightly different regex patterns.
The tool_call tags may wrap differently and arg parsing handles
newlines between key/value pairs.
Based on VLLM's Glm47MoeModelToolParser (extends Glm4MoeModelToolParser).
"""
import re
from environments.tool_call_parsers import ParseResult, register_parser
from environments.tool_call_parsers.glm45_parser import Glm45ToolCallParser
@register_parser("glm47")
class Glm47ToolCallParser(Glm45ToolCallParser):
"""
Parser for GLM 4.7 tool calls.
Extends GLM 4.5 with updated regex patterns.
"""
def __init__(self):
super().__init__()
# GLM 4.7 uses a slightly different detail regex that includes
# the <tool_call> wrapper and optional arg_key content
self.FUNC_DETAIL_REGEX = re.compile(
r"<tool_call>(.*?)(<arg_key>.*?)?</tool_call>", re.DOTALL
)
# GLM 4.7 handles newlines between arg_key and arg_value tags
self.FUNC_ARG_REGEX = re.compile(
r"<arg_key>(.*?)</arg_key>(?:\\n|\s)*<arg_value>(.*?)</arg_value>",
re.DOTALL,
)

View file

@ -0,0 +1,73 @@
"""
Hermes tool call parser.
Format: <tool_call>{"name": "func", "arguments": {...}}</tool_call>
Based on VLLM's Hermes2ProToolParser.extract_tool_calls()
"""
import json
import re
import uuid
from typing import List, Optional, Tuple
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall,
Function,
)
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
@register_parser("hermes")
class HermesToolCallParser(ToolCallParser):
"""
Parser for Hermes-format tool calls.
Matches <tool_call>...</tool_call> tags containing JSON with "name" and "arguments".
Also handles unclosed <tool_call> at end-of-string (truncated generation).
"""
# Matches both closed and unclosed tool_call tags
PATTERN = re.compile(
r"<tool_call>\s*(.*?)\s*</tool_call>|<tool_call>\s*(.*)", re.DOTALL
)
def parse(self, text: str) -> ParseResult:
if "<tool_call>" not in text:
return text, None
try:
matches = self.PATTERN.findall(text)
if not matches:
return text, None
tool_calls: List[ChatCompletionMessageToolCall] = []
for match in matches:
# match is a tuple: (closed_content, unclosed_content)
raw_json = match[0] if match[0] else match[1]
if not raw_json.strip():
continue
tc_data = json.loads(raw_json)
tool_calls.append(
ChatCompletionMessageToolCall(
id=f"call_{uuid.uuid4().hex[:8]}",
type="function",
function=Function(
name=tc_data["name"],
arguments=json.dumps(
tc_data.get("arguments", {}), ensure_ascii=False
),
),
)
)
if not tool_calls:
return text, None
# Content is everything before the first <tool_call> tag
content = text[: text.find("<tool_call>")].strip()
return content if content else None, tool_calls
except Exception:
return text, None

View file

@ -0,0 +1,93 @@
"""
Kimi K2 tool call parser.
Format:
<|tool_calls_section_begin|>
<|tool_call_begin|>function_id:0<|tool_call_argument_begin|>{"arg": "val"}<|tool_call_end|>
<|tool_calls_section_end|>
The function_id format is typically "functions.func_name:index" or "func_name:index".
Based on VLLM's KimiK2ToolParser.extract_tool_calls()
"""
import re
import uuid
from typing import List, Optional
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall,
Function,
)
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
@register_parser("kimi_k2")
class KimiK2ToolCallParser(ToolCallParser):
"""
Parser for Kimi K2 tool calls.
Uses section begin/end tokens wrapping individual tool call begin/end tokens.
The tool_call_id contains the function name (after last dot, before colon).
"""
# Support both singular and plural variants
START_TOKENS = [
"<|tool_calls_section_begin|>",
"<|tool_call_section_begin|>",
]
# Regex captures: tool_call_id (e.g., "functions.get_weather:0"), function_arguments
PATTERN = re.compile(
r"<\|tool_call_begin\|>\s*(?P<tool_call_id>[^<]+:\d+)\s*"
r"<\|tool_call_argument_begin\|>\s*"
r"(?P<function_arguments>(?:(?!<\|tool_call_begin\|>).)*?)\s*"
r"<\|tool_call_end\|>",
re.DOTALL,
)
def parse(self, text: str) -> ParseResult:
# Check for any variant of the start token
has_start = any(token in text for token in self.START_TOKENS)
if not has_start:
return text, None
try:
matches = self.PATTERN.findall(text)
if not matches:
return text, None
tool_calls: List[ChatCompletionMessageToolCall] = []
for match in matches:
function_id, function_args = match
# Extract function name from ID format: "functions.get_weather:0" -> "get_weather"
function_name = function_id.split(":")[0].split(".")[-1]
tool_calls.append(
ChatCompletionMessageToolCall(
id=function_id, # Preserve the original ID format
type="function",
function=Function(
name=function_name,
arguments=function_args.strip(),
),
)
)
if not tool_calls:
return text, None
# Content is everything before the tool calls section
earliest_start = len(text)
for token in self.START_TOKENS:
idx = text.find(token)
if idx >= 0 and idx < earliest_start:
earliest_start = idx
content = text[:earliest_start].strip()
return content if content else None, tool_calls
except Exception:
return text, None

View file

@ -0,0 +1,96 @@
"""
Llama 3.x / 4 tool call parser.
Format: The model outputs JSON objects with "name" and "arguments" (or "parameters") keys.
May be preceded by <|python_tag|> token. Supports multiple JSON objects separated
by content or semicolons.
Based on VLLM's Llama3JsonToolParser.extract_tool_calls()
"""
import json
import re
import uuid
from typing import List, Optional
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall,
Function,
)
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
@register_parser("llama3_json")
@register_parser("llama4_json")
class LlamaToolCallParser(ToolCallParser):
"""
Parser for Llama 3.x and 4 JSON-format tool calls.
Finds JSON objects containing "name" + ("arguments" or "parameters") keys.
Uses Python's json.JSONDecoder.raw_decode for robust extraction of
JSON objects from mixed text.
"""
BOT_TOKEN = "<|python_tag|>"
# Regex to find the start of potential JSON objects
JSON_START = re.compile(r"\{")
def parse(self, text: str) -> ParseResult:
# Quick check: need either the bot token or a JSON brace
if self.BOT_TOKEN not in text and "{" not in text:
return text, None
try:
decoder = json.JSONDecoder()
tool_calls: List[ChatCompletionMessageToolCall] = []
end_index = -1 # Track where the last parsed JSON ended
for match in self.JSON_START.finditer(text):
start = match.start()
# Skip if this brace is inside a previously parsed JSON object
if start <= end_index:
continue
try:
obj, json_end = decoder.raw_decode(text[start:])
end_index = start + json_end
# Must have "name" and either "arguments" or "parameters"
name = obj.get("name")
args = obj.get("arguments", obj.get("parameters"))
if not name or args is None:
continue
# Normalize arguments to JSON string
if isinstance(args, dict):
args = json.dumps(args, ensure_ascii=False)
elif not isinstance(args, str):
args = json.dumps(args, ensure_ascii=False)
tool_calls.append(
ChatCompletionMessageToolCall(
id=f"call_{uuid.uuid4().hex[:8]}",
type="function",
function=Function(name=name, arguments=args),
)
)
except (json.JSONDecodeError, KeyError, ValueError):
continue
if not tool_calls:
return text, None
# Content is everything before the first tool call JSON
# Find where the first tool call starts in the text
first_tc_start = text.find("{")
if self.BOT_TOKEN in text:
first_tc_start = text.find(self.BOT_TOKEN)
content = text[:first_tc_start].strip() if first_tc_start > 0 else None
return content, tool_calls
except Exception:
return text, None

View file

@ -0,0 +1,69 @@
"""
Longcat Flash Chat tool call parser.
Same as Hermes but uses <longcat_tool_call> tags instead of <tool_call>.
Based on VLLM's LongcatFlashToolParser (extends Hermes2ProToolParser).
"""
import json
import re
import uuid
from typing import List, Optional
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall,
Function,
)
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
@register_parser("longcat")
class LongcatToolCallParser(ToolCallParser):
"""
Parser for Longcat Flash Chat tool calls.
Identical logic to Hermes, just different tag names.
"""
PATTERN = re.compile(
r"<longcat_tool_call>\s*(.*?)\s*</longcat_tool_call>|<longcat_tool_call>\s*(.*)",
re.DOTALL,
)
def parse(self, text: str) -> ParseResult:
if "<longcat_tool_call>" not in text:
return text, None
try:
matches = self.PATTERN.findall(text)
if not matches:
return text, None
tool_calls: List[ChatCompletionMessageToolCall] = []
for match in matches:
raw_json = match[0] if match[0] else match[1]
if not raw_json.strip():
continue
tc_data = json.loads(raw_json)
tool_calls.append(
ChatCompletionMessageToolCall(
id=f"call_{uuid.uuid4().hex[:8]}",
type="function",
function=Function(
name=tc_data["name"],
arguments=json.dumps(
tc_data.get("arguments", {}), ensure_ascii=False
),
),
)
)
if not tool_calls:
return text, None
content = text[: text.find("<longcat_tool_call>")].strip()
return content if content else None, tool_calls
except Exception:
return text, None

View file

@ -0,0 +1,130 @@
"""
Mistral tool call parser.
Supports two formats depending on tokenizer version:
- Pre-v11: content[TOOL_CALLS] [{"name": ..., "arguments": {...}}, ...]
- v11+: content[TOOL_CALLS]tool_name1{"arg": "val"}[TOOL_CALLS]tool_name2{"arg": "val"}
Based on VLLM's MistralToolParser.extract_tool_calls()
The [TOOL_CALLS] token is the bot_token used by Mistral models.
"""
import json
import re
import uuid
from typing import List, Optional
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall,
Function,
)
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
def _generate_mistral_id() -> str:
"""Mistral tool call IDs are 9-char alphanumeric strings."""
import random
import string
return "".join(random.choices(string.ascii_letters + string.digits, k=9))
@register_parser("mistral")
class MistralToolCallParser(ToolCallParser):
"""
Parser for Mistral-format tool calls.
Detects format by checking if the content after [TOOL_CALLS] starts with '['
(pre-v11 JSON array) or with a tool name (v11+ format).
"""
# The [TOOL_CALLS] token -- may appear as different strings depending on tokenizer
BOT_TOKEN = "[TOOL_CALLS]"
# Fallback regex for pre-v11 format when JSON parsing fails
TOOL_CALL_REGEX = re.compile(r"\[?\s*(\{.*?\})\s*\]?", re.DOTALL)
def parse(self, text: str) -> ParseResult:
if self.BOT_TOKEN not in text:
return text, None
try:
parts = text.split(self.BOT_TOKEN)
content = parts[0].strip()
raw_tool_calls = parts[1:]
# Detect format: if the first raw part starts with '[', it's pre-v11
first_raw = raw_tool_calls[0].strip() if raw_tool_calls else ""
is_pre_v11 = first_raw.startswith("[") or first_raw.startswith("{")
tool_calls: List[ChatCompletionMessageToolCall] = []
if not is_pre_v11:
# v11+ format: [TOOL_CALLS]tool_name{args}[TOOL_CALLS]tool_name2{args2}
for raw in raw_tool_calls:
raw = raw.strip()
if not raw or "{" not in raw:
continue
brace_idx = raw.find("{")
tool_name = raw[:brace_idx].strip()
args_str = raw[brace_idx:]
tool_calls.append(
ChatCompletionMessageToolCall(
id=_generate_mistral_id(),
type="function",
function=Function(name=tool_name, arguments=args_str),
)
)
else:
# Pre-v11 format: [TOOL_CALLS] [{"name": ..., "arguments": {...}}]
try:
parsed = json.loads(first_raw)
if isinstance(parsed, dict):
parsed = [parsed]
for tc in parsed:
args = tc.get("arguments", {})
if isinstance(args, dict):
args = json.dumps(args, ensure_ascii=False)
tool_calls.append(
ChatCompletionMessageToolCall(
id=_generate_mistral_id(),
type="function",
function=Function(
name=tc["name"], arguments=args
),
)
)
except json.JSONDecodeError:
# Fallback regex extraction
match = self.TOOL_CALL_REGEX.findall(first_raw)
if match:
for raw_json in match:
try:
tc = json.loads(raw_json)
args = tc.get("arguments", {})
if isinstance(args, dict):
args = json.dumps(args, ensure_ascii=False)
tool_calls.append(
ChatCompletionMessageToolCall(
id=_generate_mistral_id(),
type="function",
function=Function(
name=tc["name"], arguments=args
),
)
)
except (json.JSONDecodeError, KeyError):
continue
if not tool_calls:
return text, None
return content if content else None, tool_calls
except Exception:
return text, None

View file

@ -0,0 +1,163 @@
"""
Qwen3-Coder tool call parser.
Format uses XML-style nested tags:
<tool_call>
<function=function_name>
<parameter=param_name>value</parameter>
<parameter=param_name2>value2</parameter>
</function>
</tool_call>
Parameters are extracted from <parameter=name>value</parameter> tags and
type-converted using the schema if available, otherwise treated as strings.
Based on VLLM's Qwen3CoderToolParser.extract_tool_calls()
"""
import ast
import json
import re
import uuid
from typing import Any, Dict, List, Optional
from openai.types.chat.chat_completion_message_tool_call import (
ChatCompletionMessageToolCall,
Function,
)
from environments.tool_call_parsers import ParseResult, ToolCallParser, register_parser
def _try_convert_value(value: str) -> Any:
"""
Try to convert a parameter value string to a native Python type.
Handles null, numbers, booleans, JSON objects/arrays, and falls back to string.
"""
stripped = value.strip()
# Handle null
if stripped.lower() == "null":
return None
# Try JSON first (handles objects, arrays, strings, numbers, booleans)
try:
return json.loads(stripped)
except (json.JSONDecodeError, TypeError):
pass
# Try Python literal eval (handles tuples, etc.)
try:
return ast.literal_eval(stripped)
except (ValueError, SyntaxError, TypeError):
pass
# Return as string
return stripped
@register_parser("qwen3_coder")
class Qwen3CoderToolCallParser(ToolCallParser):
"""
Parser for Qwen3-Coder XML-format tool calls.
Uses nested XML tags: <tool_call><function=name><parameter=key>val</parameter></function></tool_call>
"""
START_TOKEN = "<tool_call>"
FUNCTION_PREFIX = "<function="
# Find complete tool_call blocks (or unclosed at end)
TOOL_CALL_REGEX = re.compile(
r"<tool_call>(.*?)</tool_call>|<tool_call>(.*?)$", re.DOTALL
)
# Find function blocks within a tool_call
FUNCTION_REGEX = re.compile(
r"<function=(.*?)</function>|<function=(.*)$", re.DOTALL
)
# Find parameter blocks within a function
PARAMETER_REGEX = re.compile(
r"<parameter=(.*?)(?:</parameter>|(?=<parameter=)|(?=</function>)|$)",
re.DOTALL,
)
def _parse_function_call(self, function_str: str) -> Optional[ChatCompletionMessageToolCall]:
"""Parse a single <function=name>...</function> block into a ToolCall."""
try:
# Extract function name: everything before the first '>'
gt_idx = function_str.index(">")
func_name = function_str[:gt_idx].strip()
params_str = function_str[gt_idx + 1:]
# Extract parameters
param_dict: Dict[str, Any] = {}
for match_text in self.PARAMETER_REGEX.findall(params_str):
if ">" not in match_text:
continue
eq_idx = match_text.index(">")
param_name = match_text[:eq_idx].strip()
param_value = match_text[eq_idx + 1:]
# Clean up whitespace
if param_value.startswith("\n"):
param_value = param_value[1:]
if param_value.endswith("\n"):
param_value = param_value[:-1]
param_dict[param_name] = _try_convert_value(param_value)
return ChatCompletionMessageToolCall(
id=f"call_{uuid.uuid4().hex[:24]}",
type="function",
function=Function(
name=func_name,
arguments=json.dumps(param_dict, ensure_ascii=False),
),
)
except (ValueError, IndexError):
return None
def parse(self, text: str) -> ParseResult:
if self.FUNCTION_PREFIX not in text:
return text, None
try:
# Find all tool_call blocks
tc_matches = self.TOOL_CALL_REGEX.findall(text)
raw_blocks = [m[0] if m[0] else m[1] for m in tc_matches]
# Fallback: if no tool_call tags, try the whole text
if not raw_blocks:
raw_blocks = [text]
# Find function blocks within each tool_call
function_strs: List[str] = []
for block in raw_blocks:
func_matches = self.FUNCTION_REGEX.findall(block)
function_strs.extend(m[0] if m[0] else m[1] for m in func_matches)
if not function_strs:
return text, None
# Parse each function call
tool_calls: List[ChatCompletionMessageToolCall] = []
for func_str in function_strs:
tc = self._parse_function_call(func_str)
if tc is not None:
tool_calls.append(tc)
if not tool_calls:
return text, None
# Content before tool calls
first_tc = text.find(self.START_TOKEN)
if first_tc < 0:
first_tc = text.find(self.FUNCTION_PREFIX)
content = text[:first_tc].strip() if first_tc > 0 else None
return content, tool_calls
except Exception:
return text, None

View file

@ -0,0 +1,19 @@
"""
Qwen 2.5 tool call parser.
Uses the same <tool_call> format as Hermes.
Registered as a separate parser name for clarity when using --tool-parser=qwen.
"""
from environments.tool_call_parsers import register_parser
from environments.tool_call_parsers.hermes_parser import HermesToolCallParser
@register_parser("qwen")
class QwenToolCallParser(HermesToolCallParser):
"""
Parser for Qwen 2.5 tool calls.
Same <tool_call>{"name": ..., "arguments": ...}</tool_call> format as Hermes.
"""
pass # Identical format -- inherits everything from Hermes

View file

@ -0,0 +1,246 @@
"""
ToolContext -- Unrestricted Tool Access for Reward Functions
A per-rollout handle that gives reward/verification functions direct access to
ALL hermes-agent tools, scoped to the rollout's task_id. The same task_id means
the terminal/browser session is the SAME one the model used during its rollout --
all state (files, processes, browser tabs) is preserved.
The verifier author decides which tools to use. Nothing is hardcoded or gated.
Example usage in a compute_reward():
async def compute_reward(self, item, result, ctx):
# Run tests in the model's terminal sandbox
test = ctx.terminal("pytest -v")
if test["exit_code"] == 0:
return 1.0
# Check if a file was created
content = ctx.read_file("/workspace/solution.py")
if content.get("content"):
return 0.5
return 0.0
"""
import json
import logging
from typing import Any, Dict, List, Optional
from model_tools import handle_function_call
from tools.terminal_tool import cleanup_vm
from tools.browser_tool import cleanup_browser
logger = logging.getLogger(__name__)
class ToolContext:
"""
Open-ended access to all hermes-agent tools for a specific rollout.
Passed to compute_reward() so verifiers can use any tool they need:
terminal commands, file reads/writes, web searches, browser automation, etc.
All calls share the rollout's task_id for session isolation.
"""
def __init__(self, task_id: str):
self.task_id = task_id
# -------------------------------------------------------------------------
# Terminal tools
# -------------------------------------------------------------------------
def terminal(self, command: str, timeout: int = 180) -> Dict[str, Any]:
"""
Run a command in the rollout's terminal session.
Args:
command: Shell command to execute
timeout: Command timeout in seconds
Returns:
Dict with 'exit_code' (int) and 'output' (str)
"""
result = handle_function_call(
"terminal",
{"command": command, "timeout": timeout},
task_id=self.task_id,
)
try:
return json.loads(result)
except json.JSONDecodeError:
return {"exit_code": -1, "output": result}
# -------------------------------------------------------------------------
# File tools
# -------------------------------------------------------------------------
def read_file(self, path: str) -> Dict[str, Any]:
"""
Read a file from the rollout's filesystem.
Args:
path: File path to read
Returns:
Dict with file content or error
"""
result = handle_function_call(
"read_file", {"path": path}, task_id=self.task_id
)
try:
return json.loads(result)
except json.JSONDecodeError:
return {"error": result}
def write_file(self, path: str, content: str) -> Dict[str, Any]:
"""
Write a file in the rollout's filesystem.
Args:
path: File path to write
content: Content to write
Returns:
Dict with success status or error
"""
result = handle_function_call(
"write_file", {"path": path, "content": content}, task_id=self.task_id
)
try:
return json.loads(result)
except json.JSONDecodeError:
return {"error": result}
def search(self, query: str, path: str = ".") -> Dict[str, Any]:
"""
Search for text in the rollout's filesystem.
Args:
query: Search query
path: Directory to search in
Returns:
Dict with search results
"""
result = handle_function_call(
"search", {"query": query, "path": path}, task_id=self.task_id
)
try:
return json.loads(result)
except json.JSONDecodeError:
return {"error": result}
# -------------------------------------------------------------------------
# Web tools
# -------------------------------------------------------------------------
def web_search(self, query: str) -> Dict[str, Any]:
"""
Search the web.
Args:
query: Search query
Returns:
Dict with search results
"""
result = handle_function_call("web_search", {"query": query})
try:
return json.loads(result)
except json.JSONDecodeError:
return {"error": result}
def web_extract(self, urls: List[str]) -> Dict[str, Any]:
"""
Extract content from URLs.
Args:
urls: List of URLs to extract content from
Returns:
Dict with extracted content
"""
result = handle_function_call("web_extract", {"urls": urls})
try:
return json.loads(result)
except json.JSONDecodeError:
return {"error": result}
# -------------------------------------------------------------------------
# Browser tools
# -------------------------------------------------------------------------
def browser_navigate(self, url: str) -> Dict[str, Any]:
"""
Navigate the rollout's browser session to a URL.
Args:
url: URL to navigate to
Returns:
Dict with page snapshot or error
"""
result = handle_function_call(
"browser_navigate", {"url": url}, task_id=self.task_id
)
try:
return json.loads(result)
except json.JSONDecodeError:
return {"error": result}
def browser_snapshot(self) -> Dict[str, Any]:
"""
Take a snapshot of the current browser page.
Returns:
Dict with page content/accessibility snapshot
"""
result = handle_function_call(
"browser_snapshot", {}, task_id=self.task_id
)
try:
return json.loads(result)
except json.JSONDecodeError:
return {"error": result}
# -------------------------------------------------------------------------
# Generic tool access
# -------------------------------------------------------------------------
def call_tool(self, tool_name: str, arguments: Dict[str, Any]) -> str:
"""
Call any hermes-agent tool by name.
This is the generic escape hatch -- if a tool doesn't have a convenience
wrapper above, you can call it directly here.
Args:
tool_name: Name of the tool (e.g., "vision_analyze", "skills_list")
arguments: Dict of arguments for the tool
Returns:
Raw JSON string result from the tool
"""
return handle_function_call(tool_name, arguments, task_id=self.task_id)
# -------------------------------------------------------------------------
# Cleanup
# -------------------------------------------------------------------------
def cleanup(self):
"""
Release all resources (terminal VMs, browser sessions) for this rollout.
Called automatically by the base environment via try/finally after
compute_reward() completes. You generally don't need to call this yourself.
"""
try:
cleanup_vm(self.task_id)
except Exception as e:
logger.debug("VM cleanup for task %s: %s", self.task_id, e)
try:
cleanup_browser(self.task_id)
except Exception as e:
logger.debug("Browser cleanup for task %s: %s", self.task_id, e)