fix: call _stop_training_run on early-return failure paths
The four early-return paths in _spawn_training_run (API exit, trainer
exit, env not found, env exit) either called process.terminate()
manually or returned without any cleanup, leaking open log file
handles. All four paths now call _stop_training_run(), which handles
both process termination and file-handle closure.
Also adds 12 tests for _stop_training_run, covering file-handle
cleanup, process termination, status transitions, and edge cases.
Inspired by PR #715 (0xbyt4), which identified the early-return issue.
The core file-handle fix was already on main via e28dc13 (memosr.eth).
This commit is contained in:
parent
be2e259596
commit
03a4f184e6
2 changed files with 146 additions and 6 deletions
|
|
@ -340,6 +340,7 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
|
|||
if run_state.api_process.poll() is not None:
|
||||
run_state.status = "failed"
|
||||
run_state.error_message = f"API server exited with code {run_state.api_process.returncode}. Check {api_log}"
|
||||
_stop_training_run(run_state)
|
||||
return
|
||||
|
||||
print(f"[{run_id}] Atropos API server started")
|
||||
|
|
@ -364,8 +365,7 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
|
|||
if run_state.trainer_process.poll() is not None:
|
||||
run_state.status = "failed"
|
||||
run_state.error_message = f"Trainer exited with code {run_state.trainer_process.returncode}. Check {trainer_log}"
|
||||
if run_state.api_process:
|
||||
run_state.api_process.terminate()
|
||||
_stop_training_run(run_state)
|
||||
return
|
||||
|
||||
print(f"[{run_id}] Trainer started, inference server on port 8001")
|
||||
|
|
@ -384,6 +384,7 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
|
|||
if not env_info:
|
||||
run_state.status = "failed"
|
||||
run_state.error_message = f"Environment '{run_state.environment}' not found"
|
||||
_stop_training_run(run_state)
|
||||
return
|
||||
|
||||
print(f"[{run_id}] Starting environment: {env_info.file_path} serve")
|
||||
|
|
@ -403,10 +404,7 @@ async def _spawn_training_run(run_state: RunState, config_path: Path):
|
|||
if run_state.env_process.poll() is not None:
|
||||
run_state.status = "failed"
|
||||
run_state.error_message = f"Environment exited with code {run_state.env_process.returncode}. Check {env_log}"
|
||||
if run_state.trainer_process:
|
||||
run_state.trainer_process.terminate()
|
||||
if run_state.api_process:
|
||||
run_state.api_process.terminate()
|
||||
_stop_training_run(run_state)
|
||||
return
|
||||
|
||||
run_state.status = "running"
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue