Merge branch 'main' into fix/packaging-bugs

2026-03-13 03:15:45 -07:00 · 2026-03-13 03:15:45 -07:00 · 0a88b133c2
commit 0a88b133c2
parent 1d4a23fa6c 98b55360a9
289 changed files with 48243 additions and 3815 deletions
--- a/tools/rl_training_tool.py
+++ b/tools/rl_training_tool.py
@ -54,9 +54,10 @@ ENVIRONMENTS_DIR = TINKER_ATROPOS_ROOT / "tinker_atropos" / "environments"
 CONFIGS_DIR = TINKER_ATROPOS_ROOT / "configs"
 LOGS_DIR = Path(os.getenv("HERMES_HOME", Path.home() / ".hermes")) / "logs" / "rl_training"

-# Ensure logs directory exists
-LOGS_DIR.mkdir(parents=True, exist_ok=True)
-
+def _ensure_logs_dir():
+    """Lazily create LOGS_DIR and any missing parent directories on first use (avoid side effects at import time)."""
+    if TINKER_ATROPOS_ROOT.exists():
+        LOGS_DIR.mkdir(parents=True, exist_ok=True)

 # ============================================================================
 # Locked Configuration (Infrastructure Settings)
@ -314,6 +315,8 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
    """
    run_id = run_state.run_id
    
+    _ensure_logs_dir()
+
    # Log file paths
    api_log = LOGS_DIR / f"api_{run_id}.log"
    trainer_log = LOGS_DIR / f"trainer_{run_id}.log"
@ -323,7 +326,10 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
        # Step 1: Start the Atropos API server (run-api)
        print(f"[{run_id}] Starting Atropos API server (run-api)...")
        
-        api_log_file = open(api_log, "w")
+        # File must stay open while the subprocess runs; we store the handle
+        # on run_state so _stop_training_run() can close it when done.
+        api_log_file = open(api_log, "w")  # closed by _stop_training_run
+        run_state.api_log_file = api_log_file
        run_state.api_process = subprocess.Popen(
            ["run-api"],
            stdout=api_log_file,
@ -337,6 +343,7 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
        if run_state.api_process.poll() is not None:
            run_state.status = "failed"
            run_state.error_message = f"API server exited with code {run_state.api_process.returncode}. Check {api_log}"
+            _stop_training_run(run_state)
            return
        
        print(f"[{run_id}] Atropos API server started")
@ -344,7 +351,8 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
        # Step 2: Start the Tinker trainer
        print(f"[{run_id}] Starting Tinker trainer: launch_training.py --config {config_path}")
        
-        trainer_log_file = open(trainer_log, "w")
+        trainer_log_file = open(trainer_log, "w")  # closed by _stop_training_run
+        run_state.trainer_log_file = trainer_log_file
        run_state.trainer_process = subprocess.Popen(
            [sys.executable, "launch_training.py", "--config", str(config_path)],
            stdout=trainer_log_file,
@ -360,8 +368,7 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
        if run_state.trainer_process.poll() is not None:
            run_state.status = "failed"
            run_state.error_message = f"Trainer exited with code {run_state.trainer_process.returncode}. Check {trainer_log}"
-            if run_state.api_process:
-                run_state.api_process.terminate()
+            _stop_training_run(run_state)
            return
        
        print(f"[{run_id}] Trainer started, inference server on port 8001")
@ -380,11 +387,13 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
        if not env_info:
            run_state.status = "failed"
            run_state.error_message = f"Environment '{run_state.environment}' not found"
+            _stop_training_run(run_state)
            return
        
        print(f"[{run_id}] Starting environment: {env_info.file_path} serve")
        
-        env_log_file = open(env_log, "w")
+        env_log_file = open(env_log, "w")  # closed by _stop_training_run
+        run_state.env_log_file = env_log_file
        run_state.env_process = subprocess.Popen(
            [sys.executable, str(env_info.file_path), "serve", "--config", str(config_path)],
            stdout=env_log_file,
@ -398,10 +407,7 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
        if run_state.env_process.poll() is not None:
            run_state.status = "failed"
            run_state.error_message = f"Environment exited with code {run_state.env_process.returncode}. Check {env_log}"
-            if run_state.trainer_process:
-                run_state.trainer_process.terminate()
-            if run_state.api_process:
-                run_state.api_process.terminate()
+            _stop_training_run(run_state)
            return
        
        run_state.status = "running"
@ -480,6 +486,16 @@ def _stop_training_run(run_state: RunState):
    if run_state.status == "running":
        run_state.status = "stopped"

+    # Close log file handles that were opened for subprocess stdout.
+    for attr in ("env_log_file", "trainer_log_file", "api_log_file"):
+        fh = getattr(run_state, attr, None)
+        if fh is not None:
+            try:
+                fh.close()
+            except Exception:
+                pass
+            setattr(run_state, attr, None)
+

 # ============================================================================
 # Environment Discovery Tools
@ -1079,6 +1095,7 @@ async def rl_test_inference(
    }
    
    # Create output directory for test results
+    _ensure_logs_dir()
    test_output_dir = LOGS_DIR / "inference_tests"
    test_output_dir.mkdir(exist_ok=True)