fix: call _stop_training_run on early-return failure paths

The four early-return paths in _spawn_training_run (API exit, trainer
exit, env not found, env exit) either called process.terminate()
manually or returned without any cleanup, leaking open log file
handles. All four now call _stop_training_run(), which handles both
process termination and file handle closure.

Also adds 12 tests for _stop_training_run covering file handle
cleanup, process termination, status transitions, and edge cases.

Inspired by PR #715 (0xbyt4) which identified the early-return issue.
Core file handle fix was already on main via e28dc13 (memosr.eth).
This commit is contained in:
teknium1 2026-03-10 17:09:51 -07:00
parent be2e259596
commit 03a4f184e6
2 changed files with 146 additions and 6 deletions

View file

@@ -340,6 +340,7 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
 if run_state.api_process.poll() is not None:
 run_state.status = "failed"
 run_state.error_message = f"API server exited with code {run_state.api_process.returncode}. Check {api_log}"
+_stop_training_run(run_state)
 return
 print(f"[{run_id}] Atropos API server started")
@@ -364,8 +365,7 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
 if run_state.trainer_process.poll() is not None:
 run_state.status = "failed"
 run_state.error_message = f"Trainer exited with code {run_state.trainer_process.returncode}. Check {trainer_log}"
-if run_state.api_process:
-run_state.api_process.terminate()
+_stop_training_run(run_state)
 return
 print(f"[{run_id}] Trainer started, inference server on port 8001")
@@ -384,6 +384,7 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
 if not env_info:
 run_state.status = "failed"
 run_state.error_message = f"Environment '{run_state.environment}' not found"
+_stop_training_run(run_state)
 return
 print(f"[{run_id}] Starting environment: {env_info.file_path} serve")
@@ -403,10 +404,7 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
 if run_state.env_process.poll() is not None:
 run_state.status = "failed"
 run_state.error_message = f"Environment exited with code {run_state.env_process.returncode}. Check {env_log}"
-if run_state.trainer_process:
-run_state.trainer_process.terminate()
-if run_state.api_process:
-run_state.api_process.terminate()
+_stop_training_run(run_state)
 return
 run_state.status = "running"