fix: call _stop_training_run on early-return failure paths

The four early-return failure paths in _spawn_training_run (API server exit, trainer exit, environment not found, environment exit) either called process.terminate() manually or returned without any cleanup, leaking open log file handles. All four paths now call _stop_training_run(), which handles both process termination and file handle closure. Also adds 12 tests for _stop_training_run covering file handle cleanup, process termination, status transitions, and edge cases. Inspired by PR #715 (0xbyt4), which identified the early-return issue. The core file handle fix was already on main via e28dc13 (memosr.eth).
2026-03-10 17:09:51 -07:00 · 2026-03-10 17:09:51 -07:00 · 03a4f184e6
commit 03a4f184e6
parent be2e259596
2 changed files with 146 additions and 6 deletions
--- a/tools/rl_training_tool.py
+++ b/tools/rl_training_tool.py
@ -340,6 +340,7 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
        if run_state.api_process.poll() is not None:
            run_state.status = "failed"
            run_state.error_message = f"API server exited with code {run_state.api_process.returncode}. Check {api_log}"
+            _stop_training_run(run_state)
            return
        
        print(f"[{run_id}] Atropos API server started")
@ -364,8 +365,7 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
        if run_state.trainer_process.poll() is not None:
            run_state.status = "failed"
            run_state.error_message = f"Trainer exited with code {run_state.trainer_process.returncode}. Check {trainer_log}"
-            if run_state.api_process:
-                run_state.api_process.terminate()
+            _stop_training_run(run_state)
            return
        
        print(f"[{run_id}] Trainer started, inference server on port 8001")
@ -384,6 +384,7 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
        if not env_info:
            run_state.status = "failed"
            run_state.error_message = f"Environment '{run_state.environment}' not found"
+            _stop_training_run(run_state)
            return
        
        print(f"[{run_id}] Starting environment: {env_info.file_path} serve")
@ -403,10 +404,7 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
        if run_state.env_process.poll() is not None:
            run_state.status = "failed"
            run_state.error_message = f"Environment exited with code {run_state.env_process.returncode}. Check {env_log}"
-            if run_state.trainer_process:
-                run_state.trainer_process.terminate()
-            if run_state.api_process:
-                run_state.api_process.terminate()
+            _stop_training_run(run_state)
            return
        
        run_state.status = "running"