Make process_registry checkpoint writes atomic

This commit is contained in:
aydnOktay 2026-03-03 02:44:01 +03:00
parent 669e4d0297
commit 5fa3e24b76

View file

@ -34,6 +34,7 @@ import logging
import os import os
import signal import signal
import subprocess import subprocess
import tempfile
import threading import threading
import time import time
import uuid import uuid
@ -650,7 +651,7 @@ class ProcessRegistry:
# ----- Checkpoint (crash recovery) ----- # ----- Checkpoint (crash recovery) -----
def _write_checkpoint(self): def _write_checkpoint(self):
"""Write running process metadata to checkpoint file.""" """Write running process metadata to checkpoint file atomically."""
try: try:
with self._lock: with self._lock:
entries = [] entries = []
@ -665,12 +666,28 @@ class ProcessRegistry:
"task_id": s.task_id, "task_id": s.task_id,
"session_key": s.session_key, "session_key": s.session_key,
}) })
# Atomic write: temp file + os.replace to avoid corruption on crash
CHECKPOINT_PATH.parent.mkdir(parents=True, exist_ok=True) CHECKPOINT_PATH.parent.mkdir(parents=True, exist_ok=True)
CHECKPOINT_PATH.write_text( fd, tmp_path = tempfile.mkstemp(
json.dumps(entries, indent=2), encoding="utf-8" dir=str(CHECKPOINT_PATH.parent),
prefix='.checkpoint_',
suffix='.tmp',
) )
except Exception: try:
pass # Best-effort with os.fdopen(fd, 'w', encoding='utf-8') as f:
json.dump(entries, f, indent=2, ensure_ascii=False)
f.flush()
os.fsync(f.fileno())
os.replace(tmp_path, CHECKPOINT_PATH)
except BaseException:
try:
os.unlink(tmp_path)
except OSError:
pass
raise
except Exception as e:
logger.debug("Failed to write checkpoint file: %s", e, exc_info=True)
def recover_from_checkpoint(self) -> int: def recover_from_checkpoint(self) -> int:
""" """
@ -717,10 +734,28 @@ class ProcessRegistry:
logger.info("Recovered detached process: %s (pid=%d)", session.command[:60], pid) logger.info("Recovered detached process: %s (pid=%d)", session.command[:60], pid)
# Clear the checkpoint (will be rewritten as processes finish) # Clear the checkpoint (will be rewritten as processes finish)
# Use atomic write to avoid corruption
try: try:
CHECKPOINT_PATH.write_text("[]", encoding="utf-8") CHECKPOINT_PATH.parent.mkdir(parents=True, exist_ok=True)
fd, tmp_path = tempfile.mkstemp(
dir=str(CHECKPOINT_PATH.parent),
prefix='.checkpoint_',
suffix='.tmp',
)
try:
with os.fdopen(fd, 'w', encoding='utf-8') as f:
f.write("[]")
f.flush()
os.fsync(f.fileno())
os.replace(tmp_path, CHECKPOINT_PATH)
except BaseException:
try:
os.unlink(tmp_path)
except OSError:
pass
raise
except Exception as e: except Exception as e:
logger.debug("Could not write checkpoint file: %s", e) logger.debug("Could not clear checkpoint file: %s", e, exc_info=True)
return recovered return recovered