fix: skip stale cron jobs on gateway restart instead of firing immediately
When the gateway restarts after being down past a scheduled run time, recurring jobs (cron/interval) were firing immediately because their next_run_at was in the past. Now jobs more than 2 minutes late are fast-forwarded to the next future occurrence instead. - get_due_jobs() checks staleness for cron/interval jobs - Stale jobs get next_run_at recomputed and saved - Jobs within 2 minutes of their schedule still fire normally - One-shot (once) jobs are unaffected — they fire if missed Fixes the 'cron jobs run on every gateway restart' issue.
This commit is contained in:
parent
e3f9894caf
commit
4768ea624d
3 changed files with 64 additions and 7 deletions
42
cron/jobs.py
42
cron/jobs.py
|
|
@ -6,6 +6,7 @@ Output is saved to ~/.hermes/cron/output/{job_id}/{timestamp}.md
|
|||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import tempfile
|
||||
import os
|
||||
import re
|
||||
|
|
@ -14,6 +15,8 @@ from datetime import datetime, timedelta
|
|||
from pathlib import Path
|
||||
from typing import Optional, Dict, List, Any
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
from hermes_time import now as _hermes_now
|
||||
|
||||
try:
|
||||
|
|
@ -528,10 +531,18 @@ def mark_job_run(job_id: str, success: bool, error: Optional[str] = None):
|
|||
|
||||
|
||||
def get_due_jobs() -> List[Dict[str, Any]]:
|
||||
"""Get all jobs that are due to run now."""
|
||||
"""Get all jobs that are due to run now.
|
||||
|
||||
For recurring jobs (cron/interval), if the scheduled time is stale
|
||||
(more than one period in the past, e.g. because the gateway was down),
|
||||
the job is fast-forwarded to the next future run instead of firing
|
||||
immediately. This prevents a burst of missed jobs on gateway restart.
|
||||
"""
|
||||
now = _hermes_now()
|
||||
jobs = [_apply_skill_fields(j) for j in load_jobs()]
|
||||
raw_jobs = load_jobs() # For saving updates
|
||||
due = []
|
||||
needs_save = False
|
||||
|
||||
for job in jobs:
|
||||
if not job.get("enabled", True):
|
||||
|
|
@ -543,8 +554,37 @@ def get_due_jobs() -> List[Dict[str, Any]]:
|
|||
|
||||
next_run_dt = _ensure_aware(datetime.fromisoformat(next_run))
|
||||
if next_run_dt <= now:
|
||||
schedule = job.get("schedule", {})
|
||||
kind = schedule.get("kind")
|
||||
|
||||
# For recurring jobs, check if the scheduled time is stale
|
||||
# (gateway was down and missed the window). Fast-forward to
|
||||
# the next future occurrence instead of firing a stale run.
|
||||
if kind in ("cron", "interval") and (now - next_run_dt).total_seconds() > 120:
|
||||
# More than 2 minutes late — this is a missed run, not a current one.
|
||||
# Recompute next_run_at to the next future occurrence.
|
||||
new_next = compute_next_run(schedule, now.isoformat())
|
||||
if new_next:
|
||||
logger.info(
|
||||
"Job '%s' missed its scheduled time (%s). "
|
||||
"Fast-forwarding to next run: %s",
|
||||
job.get("name", job["id"]),
|
||||
next_run,
|
||||
new_next,
|
||||
)
|
||||
# Update the job in storage
|
||||
for rj in raw_jobs:
|
||||
if rj["id"] == job["id"]:
|
||||
rj["next_run_at"] = new_next
|
||||
needs_save = True
|
||||
break
|
||||
continue # Skip this run
|
||||
|
||||
due.append(job)
|
||||
|
||||
if needs_save:
|
||||
save_jobs(raw_jobs)
|
||||
|
||||
return due
|
||||
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue