fix: skip stale cron jobs on gateway restart instead of firing immediately

When the gateway restarts after being down past a scheduled run time, recurring jobs (cron/interval) were firing immediately because their next_run_at was in the past. Now jobs more than 2 minutes late are fast-forwarded to the next future occurrence instead. - get_due_jobs() checks staleness for cron/interval jobs - Stale jobs get next_run_at recomputed and saved - Jobs within 2 minutes of their schedule still fire normally - One-shot (once) jobs are unaffected — they fire if missed Fixes the 'cron jobs run on every gateway restart' issue.
2026-03-16 23:48:13 -07:00 · 2026-03-16 23:48:13 -07:00 · 4768ea624d
commit 4768ea624d
parent e3f9894caf
3 changed files with 64 additions and 7 deletions
--- a/cron/jobs.py
+++ b/cron/jobs.py
@ -6,6 +6,7 @@ Output is saved to ~/.hermes/cron/output/{job_id}/{timestamp}.md
 """

 import json
+import logging
 import tempfile
 import os
 import re
@ -14,6 +15,8 @@ from datetime import datetime, timedelta
 from pathlib import Path
 from typing import Optional, Dict, List, Any

+logger = logging.getLogger(__name__)
+
 from hermes_time import now as _hermes_now

 try:
@ -528,10 +531,18 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None):


 def get_due_jobs() -> List[Dict[str, Any]]:
-    """Get all jobs that are due to run now."""
+    """Get all jobs that are due to run now.
+
+    For recurring jobs (cron/interval), if the scheduled time is stale
+    (more than one period in the past, e.g. because the gateway was down),
+    the job is fast-forwarded to the next future run instead of firing
+    immediately.  This prevents a burst of missed jobs on gateway restart.
+    """
    now = _hermes_now()
    jobs = [_apply_skill_fields(j) for j in load_jobs()]
+    raw_jobs = load_jobs()  # For saving updates
    due = []
+    needs_save = False

    for job in jobs:
        if not job.get("enabled", True):
@ -543,8 +554,37 @@ def get_due_jobs() -> List[Dict[str, Any]]:

        next_run_dt = _ensure_aware(datetime.fromisoformat(next_run))
        if next_run_dt <= now:
+            schedule = job.get("schedule", {})
+            kind = schedule.get("kind")
+
+            # For recurring jobs, check if the scheduled time is stale
+            # (gateway was down and missed the window). Fast-forward to
+            # the next future occurrence instead of firing a stale run.
+            if kind in ("cron", "interval") and (now - next_run_dt).total_seconds() > 120:
+                # More than 2 minutes late — this is a missed run, not a current one.
+                # Recompute next_run_at to the next future occurrence.
+                new_next = compute_next_run(schedule, now.isoformat())
+                if new_next:
+                    logger.info(
+                        "Job '%s' missed its scheduled time (%s). "
+                        "Fast-forwarding to next run: %s",
+                        job.get("name", job["id"]),
+                        next_run,
+                        new_next,
+                    )
+                    # Update the job in storage
+                    for rj in raw_jobs:
+                        if rj["id"] == job["id"]:
+                            rj["next_run_at"] = new_next
+                            needs_save = True
+                            break
+                    continue  # Skip this run
+
            due.append(job)

+    if needs_save:
+        save_jobs(raw_jobs)
+
    return due