fix(gateway): prevent concurrent agent runs for the same session
Place a sentinel in _running_agents immediately after the "already running" guard check passes — before any await. Without this, the numerous await points between the guard (line 1324) and agent registration (track_agent at line 4790) create a window where a second message for the same session can bypass the guard and start a duplicate agent, corrupting the transcript. The await gap includes: hook emissions, vision enrichment (external API call), audio transcription (external API call), session hygiene compression, and the run_in_executor call itself. For messages with media attachments the window can be several seconds wide. The sentinel is wrapped in try/finally so it is always cleaned up — even if the handler raises or takes an early-return path. When the real AIAgent is created, track_agent() overwrites the sentinel with the actual instance (preserving interrupt support). Also handles the edge case where a message arrives while the sentinel is set but no real agent exists yet: the message is queued via the adapter's pending-message mechanism instead of attempting to call interrupt() on the sentinel object.
This commit is contained in:
parent
02954c1a10
commit
aaa96713d4
2 changed files with 235 additions and 3 deletions
|
|
@ -222,6 +222,12 @@ from gateway.platforms.base import BasePlatformAdapter, MessageEvent, MessageTyp
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Sentinel placed into _running_agents immediately when a session starts
|
||||
# processing, *before* any await. Prevents a second message for the same
|
||||
# session from bypassing the "already running" guard during the async gap
|
||||
# between the guard check and actual agent creation.
|
||||
_AGENT_PENDING_SENTINEL = object()
|
||||
|
||||
|
||||
def _resolve_runtime_agent_kwargs() -> dict:
|
||||
"""Resolve provider credentials for gateway-created AIAgent instances."""
|
||||
|
|
@ -1346,7 +1352,14 @@ class GatewayRunner:
|
|||
adapter._pending_messages[_quick_key] = event
|
||||
return None
|
||||
|
||||
running_agent = self._running_agents[_quick_key]
|
||||
running_agent = self._running_agents.get(_quick_key)
|
||||
if running_agent is _AGENT_PENDING_SENTINEL:
|
||||
# Agent is being set up but not ready yet — queue the message
|
||||
# so it will be picked up after the agent starts.
|
||||
adapter = self.adapters.get(source.platform)
|
||||
if adapter:
|
||||
adapter._pending_messages[_quick_key] = event
|
||||
return None
|
||||
logger.debug("PRIORITY interrupt for session %s", _quick_key[:20])
|
||||
running_agent.interrupt(event.text)
|
||||
if _quick_key in self._pending_messages:
|
||||
|
|
@ -1354,7 +1367,7 @@ class GatewayRunner:
|
|||
else:
|
||||
self._pending_messages[_quick_key] = event.text
|
||||
return None
|
||||
|
||||
|
||||
# Check for commands
|
||||
command = event.get_command()
|
||||
|
||||
|
|
@ -1527,7 +1540,29 @@ class GatewayRunner:
|
|||
# Pending exec approvals are handled by /approve and /deny commands above.
|
||||
# No bare text matching — "yes" in normal conversation must not trigger
|
||||
# execution of a dangerous command.
|
||||
|
||||
|
||||
# ── Claim this session before any await ───────────────────────
|
||||
# Between here and _run_agent registering the real AIAgent, there
|
||||
# are numerous await points (hooks, vision enrichment, STT,
|
||||
# session hygiene compression). Without this sentinel a second
|
||||
# message arriving during any of those yields would pass the
|
||||
# "already running" guard and spin up a duplicate agent for the
|
||||
# same session — corrupting the transcript.
|
||||
self._running_agents[_quick_key] = _AGENT_PENDING_SENTINEL
|
||||
|
||||
try:
|
||||
return await self._handle_message_with_agent(event, source, _quick_key)
|
||||
finally:
|
||||
# If _run_agent replaced the sentinel with a real agent and
|
||||
# then cleaned it up, this is a no-op. If we exited early
|
||||
# (exception, command fallthrough, etc.) the sentinel must
|
||||
# not linger or the session would be permanently locked out.
|
||||
if self._running_agents.get(_quick_key) is _AGENT_PENDING_SENTINEL:
|
||||
del self._running_agents[_quick_key]
|
||||
|
||||
async def _handle_message_with_agent(self, event, source, _quick_key: str):
|
||||
"""Inner handler that runs under the _running_agents sentinel guard."""
|
||||
|
||||
# Get or create session
|
||||
session_entry = self.session_store.get_or_create_session(source)
|
||||
session_key = session_entry.session_key
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue