fix: skip stale cron jobs on gateway restart instead of firing immediately

When the gateway restarts after being down past a scheduled run time,
recurring jobs (cron/interval) were firing immediately because their
next_run_at was in the past. Now jobs more than 2 minutes late are
fast-forwarded to the next future occurrence instead.

- get_due_jobs() checks staleness for cron/interval jobs
- Stale jobs get next_run_at recomputed and saved
- Jobs within 2 minutes of their schedule still fire normally
- One-shot (once) jobs are unaffected — they fire if missed

Fixes the 'cron jobs run on every gateway restart' issue.
This commit is contained in:
teknium1 2026-03-16 23:48:13 -07:00
parent e3f9894caf
commit 4768ea624d
3 changed files with 64 additions and 7 deletions

View file

@@ -304,17 +304,34 @@ class TestMarkJobRun:
class TestGetDueJobs:
def test_past_due_within_window_returned(self, tmp_cron_dir):
    """Jobs less than 2 minutes late are still considered due (not stale).

    Creates an hourly recurring job, rewrites its stored ``next_run_at`` to
    one minute in the past, and checks that ``get_due_jobs()`` returns
    exactly that job.

    ``tmp_cron_dir`` is not used directly in the body; presumably it is a
    pytest fixture that isolates the job store for this test — confirm
    against the fixture definition.
    """
    job = create_job(prompt="Due now", schedule="every 1h")
    # Force next_run_at to just 1 minute ago (within the 2-min window)
    jobs = load_jobs()
    jobs[0]["next_run_at"] = (datetime.now() - timedelta(seconds=60)).isoformat()
    save_jobs(jobs)
    due = get_due_jobs()
    # Only the job created above should be reported as due.
    assert len(due) == 1
    assert due[0]["id"] == job["id"]
def test_stale_past_due_skipped(self, tmp_cron_dir):
    """Recurring jobs more than 2 minutes late are fast-forwarded, not fired.

    Creates an hourly recurring job, rewrites its stored ``next_run_at`` to
    five minutes in the past, and checks two things: ``get_due_jobs()``
    returns nothing, and the persisted ``next_run_at`` now lies in the
    future.

    ``tmp_cron_dir`` is not used directly in the body; presumably it is a
    pytest fixture that isolates the job store for this test — confirm
    against the fixture definition.
    """
    from cron.jobs import _ensure_aware, _hermes_now

    created = create_job(prompt="Stale", schedule="every 1h")

    # Push the stored schedule 5 minutes into the past (beyond the 2-min window).
    stored = load_jobs()
    overdue_by = timedelta(minutes=5)
    stale_time = datetime.now() - overdue_by
    stored[0]["next_run_at"] = stale_time.isoformat()
    save_jobs(stored)

    # The stale job must not be handed back for execution.
    due_jobs = get_due_jobs()
    assert len(due_jobs) == 0

    # Re-read the job: its next_run_at should have been moved to the future.
    refreshed = get_job(created["id"])
    parsed_next_run = datetime.fromisoformat(refreshed["next_run_at"])
    next_run = _ensure_aware(parsed_next_run)
    assert next_run > _hermes_now()
def test_future_not_returned(self, tmp_cron_dir):
create_job(prompt="Not yet", schedule="every 1h")
due = get_due_jobs()