fix(honcho): isolate session routing for multi-user gateway (#1500)

Salvaged from PR #1470 by adavyas.

Core fix: Honcho tool calls in a multi-session gateway could route to the wrong session because honcho_tools.py relied on process-global state. The fix now threads session context through the call chain:

  AIAgent._invoke_tool() → handle_function_call() → registry.dispatch() → handler **kw → _resolve_session_context()

Changes:
- Add _resolve_session_context() to prefer per-call context over globals
- Plumb honcho_manager + honcho_session_key through handle_function_call
- Add sync_honcho=False to run_conversation() for synthetic flush turns
- Pass honcho_session_key through gateway memory flush lifecycle
- Harden gateway PID detection when /proc cmdline is unreadable
- Make interrupt test scripts import-safe for pytest-xdist
- Wrap BibTeX examples in Jekyll raw blocks for docs build
- Fix thread-order-dependent assertion in client lifecycle test
- Expand Honcho docs: session isolation, lifecycle, routing internals

Dropped from original PR:
- Indentation change in _create_request_openai_client that would move client creation inside the lock (causes unnecessary contention)

Co-authored-by: adavyas <adavyas@users.noreply.github.com>
2026-03-16 00:23:47 -07:00 · 2026-03-16 00:23:47 -07:00 · dd7921d514
commit dd7921d514
parent eb4f0348e1
17 changed files with 522 additions and 252 deletions
--- a/tests/gateway/test_honcho_lifecycle.py
+++ b/tests/gateway/test_honcho_lifecycle.py
@ -90,6 +90,7 @@ class TestGatewayHonchoLifecycle:
        runner = _make_runner()
        event = _make_event()
        runner._shutdown_gateway_honcho = MagicMock()
+        runner._async_flush_memories = AsyncMock()
        runner.session_store = MagicMock()
        runner.session_store._generate_session_key.return_value = "gateway-key"
        runner.session_store._entries = {
@ -100,4 +101,31 @@ class TestGatewayHonchoLifecycle:
        result = await runner._handle_reset_command(event)

        runner._shutdown_gateway_honcho.assert_called_once_with("gateway-key")
+        runner._async_flush_memories.assert_called_once_with("old-session", "gateway-key")
        assert "Session reset" in result
+
+    def test_flush_memories_reuses_gateway_session_key_and_skips_honcho_sync(self):
+        runner = _make_runner()
+        runner.session_store = MagicMock()
+        runner.session_store.load_transcript.return_value = [
+            {"role": "user", "content": "a"},
+            {"role": "assistant", "content": "b"},
+            {"role": "user", "content": "c"},
+            {"role": "assistant", "content": "d"},
+        ]
+        tmp_agent = MagicMock()
+
+        with (
+            patch("gateway.run._resolve_runtime_agent_kwargs", return_value={"api_key": "test-key"}),
+            patch("gateway.run._resolve_gateway_model", return_value="model-name"),
+            patch("run_agent.AIAgent", return_value=tmp_agent) as mock_agent_cls,
+        ):
+            runner._flush_memories_for_session("old-session", "gateway-key")
+
+        mock_agent_cls.assert_called_once()
+        _, kwargs = mock_agent_cls.call_args
+        assert kwargs["session_id"] == "old-session"
+        assert kwargs["honcho_session_key"] == "gateway-key"
+        tmp_agent.run_conversation.assert_called_once()
+        _, run_kwargs = tmp_agent.run_conversation.call_args
+        assert run_kwargs["sync_honcho"] is False
--- a/tests/gateway/test_resume_command.py
+++ b/tests/gateway/test_resume_command.py
@ -199,3 +199,28 @@ class TestHandleResumeCommand:

        assert real_key not in runner._running_agents
        db.close()
+
+    @pytest.mark.asyncio
+    async def test_resume_flushes_memories_with_gateway_session_key(self, tmp_path):
+        """Resume should preserve the gateway session key for Honcho flushes."""
+        from hermes_state import SessionDB
+
+        db = SessionDB(db_path=tmp_path / "state.db")
+        db.create_session("old_session", "telegram")
+        db.set_session_title("old_session", "Old Work")
+        db.create_session("current_session_001", "telegram")
+
+        event = _make_event(text="/resume Old Work")
+        runner = _make_runner(
+            session_db=db,
+            current_session_id="current_session_001",
+            event=event,
+        )
+
+        await runner._handle_resume_command(event)
+
+        runner._async_flush_memories.assert_called_once_with(
+            "current_session_001",
+            _session_key_for_event(event),
+        )
+        db.close()
--- a/tests/gateway/test_status.py
+++ b/tests/gateway/test_status.py
@ -26,6 +26,22 @@ class TestGatewayPidState:
        assert status.get_running_pid() is None
        assert not pid_path.exists()

+    def test_get_running_pid_accepts_gateway_metadata_when_cmdline_unavailable(self, tmp_path, monkeypatch):
+        monkeypatch.setenv("HERMES_HOME", str(tmp_path))
+        pid_path = tmp_path / "gateway.pid"
+        pid_path.write_text(json.dumps({
+            "pid": os.getpid(),
+            "kind": "hermes-gateway",
+            "argv": ["python", "-m", "hermes_cli.main", "gateway"],
+            "start_time": 123,
+        }))
+
+        monkeypatch.setattr(status.os, "kill", lambda pid, sig: None)
+        monkeypatch.setattr(status, "_get_process_start_time", lambda pid: 123)
+        monkeypatch.setattr(status, "_read_process_cmdline", lambda pid: None)
+
+        assert status.get_running_pid() == os.getpid()
+

 class TestGatewayRuntimeStatus:
    def test_write_runtime_status_records_platform_failure(self, tmp_path, monkeypatch):
--- a/tests/run_interrupt_test.py
+++ b/tests/run_interrupt_test.py
@ -16,126 +16,131 @@ from run_agent import AIAgent, IterationBudget
 from tools.delegate_tool import _run_single_child
 from tools.interrupt import set_interrupt, is_interrupted

-set_interrupt(False)
+def main() -> int:
+    set_interrupt(False)

-# Create parent agent (minimal)
-parent = AIAgent.__new__(AIAgent)
-parent._interrupt_requested = False
-parent._interrupt_message = None
-parent._active_children = []
-parent.quiet_mode = True
-parent.model = "test/model"
-parent.base_url = "http://localhost:1"
-parent.api_key = "test"
-parent.provider = "test"
-parent.api_mode = "chat_completions"
-parent.platform = "cli"
-parent.enabled_toolsets = ["terminal", "file"]
-parent.providers_allowed = None
-parent.providers_ignored = None
-parent.providers_order = None
-parent.provider_sort = None
-parent.max_tokens = None
-parent.reasoning_config = None
-parent.prefill_messages = None
-parent._session_db = None
-parent._delegate_depth = 0
-parent._delegate_spinner = None
-parent.tool_progress_callback = None
-parent.iteration_budget = IterationBudget(max_total=100)
-parent._client_kwargs = {"api_key": "test", "base_url": "http://localhost:1"}
+    # Create parent agent (minimal)
+    parent = AIAgent.__new__(AIAgent)
+    parent._interrupt_requested = False
+    parent._interrupt_message = None
+    parent._active_children = []
+    parent.quiet_mode = True
+    parent.model = "test/model"
+    parent.base_url = "http://localhost:1"
+    parent.api_key = "test"
+    parent.provider = "test"
+    parent.api_mode = "chat_completions"
+    parent.platform = "cli"
+    parent.enabled_toolsets = ["terminal", "file"]
+    parent.providers_allowed = None
+    parent.providers_ignored = None
+    parent.providers_order = None
+    parent.provider_sort = None
+    parent.max_tokens = None
+    parent.reasoning_config = None
+    parent.prefill_messages = None
+    parent._session_db = None
+    parent._delegate_depth = 0
+    parent._delegate_spinner = None
+    parent.tool_progress_callback = None
+    parent.iteration_budget = IterationBudget(max_total=100)
+    parent._client_kwargs = {"api_key": "test", "base_url": "http://localhost:1"}

-child_started = threading.Event()
-result_holder = [None]
+    child_started = threading.Event()
+    result_holder = [None]

+    def run_delegate():
+        with patch("run_agent.OpenAI") as MockOpenAI:
+            mock_client = MagicMock()

-def run_delegate():
-    with patch("run_agent.OpenAI") as MockOpenAI:
-        mock_client = MagicMock()
+            def slow_create(**kwargs):
+                time.sleep(3)
+                resp = MagicMock()
+                resp.choices = [MagicMock()]
+                resp.choices[0].message.content = "Done"
+                resp.choices[0].message.tool_calls = None
+                resp.choices[0].message.refusal = None
+                resp.choices[0].finish_reason = "stop"
+                resp.usage.prompt_tokens = 100
+                resp.usage.completion_tokens = 10
+                resp.usage.total_tokens = 110
+                resp.usage.prompt_tokens_details = None
+                return resp

-        def slow_create(**kwargs):
-            time.sleep(3)
-            resp = MagicMock()
-            resp.choices = [MagicMock()]
-            resp.choices[0].message.content = "Done"
-            resp.choices[0].message.tool_calls = None
-            resp.choices[0].message.refusal = None
-            resp.choices[0].finish_reason = "stop"
-            resp.usage.prompt_tokens = 100
-            resp.usage.completion_tokens = 10
-            resp.usage.total_tokens = 110
-            resp.usage.prompt_tokens_details = None
-            return resp
+            mock_client.chat.completions.create = slow_create
+            mock_client.close = MagicMock()
+            MockOpenAI.return_value = mock_client

-        mock_client.chat.completions.create = slow_create
-        mock_client.close = MagicMock()
-        MockOpenAI.return_value = mock_client
+            original_init = AIAgent.__init__

-        original_init = AIAgent.__init__
+            def patched_init(self_agent, *a, **kw):
+                original_init(self_agent, *a, **kw)
+                child_started.set()

-        def patched_init(self_agent, *a, **kw):
-            original_init(self_agent, *a, **kw)
-            child_started.set()
+            with patch.object(AIAgent, "__init__", patched_init):
+                try:
+                    result = _run_single_child(
+                        task_index=0,
+                        goal="Test slow task",
+                        context=None,
+                        toolsets=["terminal"],
+                        model="test/model",
+                        max_iterations=5,
+                        parent_agent=parent,
+                        task_count=1,
+                        override_provider="test",
+                        override_base_url="http://localhost:1",
+                        override_api_key="test",
+                        override_api_mode="chat_completions",
+                    )
+                    result_holder[0] = result
+                except Exception as e:
+                    print(f"ERROR in delegate: {e}")
+                    import traceback
+                    traceback.print_exc()

-        with patch.object(AIAgent, "__init__", patched_init):
-            try:
-                result = _run_single_child(
-                    task_index=0,
-                    goal="Test slow task",
-                    context=None,
-                    toolsets=["terminal"],
-                    model="test/model",
-                    max_iterations=5,
-                    parent_agent=parent,
-                    task_count=1,
-                    override_provider="test",
-                    override_base_url="http://localhost:1",
-                    override_api_key="test",
-                    override_api_mode="chat_completions",
-                )
-                result_holder[0] = result
-            except Exception as e:
-                print(f"ERROR in delegate: {e}")
-                import traceback
-                traceback.print_exc()
+    print("Starting agent thread...")
+    agent_thread = threading.Thread(target=run_delegate, daemon=True)
+    agent_thread.start()

+    started = child_started.wait(timeout=10)
+    if not started:
+        print("ERROR: Child never started")
+        set_interrupt(False)
+        return 1

-print("Starting agent thread...")
-agent_thread = threading.Thread(target=run_delegate, daemon=True)
-agent_thread.start()
+    time.sleep(0.5)

-started = child_started.wait(timeout=10)
-if not started:
-    print("ERROR: Child never started")
-    sys.exit(1)
+    print(f"Active children: {len(parent._active_children)}")
+    for i, c in enumerate(parent._active_children):
+        print(f"  Child {i}: _interrupt_requested={c._interrupt_requested}")

-time.sleep(0.5)
+    t0 = time.monotonic()
+    parent.interrupt("User typed a new message")
+    print("Called parent.interrupt()")

-print(f"Active children: {len(parent._active_children)}")
-for i, c in enumerate(parent._active_children):
-    print(f"  Child {i}: _interrupt_requested={c._interrupt_requested}")
+    for i, c in enumerate(parent._active_children):
+        print(f"  Child {i} after interrupt: _interrupt_requested={c._interrupt_requested}")
+    print(f"Global is_interrupted: {is_interrupted()}")

-t0 = time.monotonic()
-parent.interrupt("User typed a new message")
-print(f"Called parent.interrupt()")
+    agent_thread.join(timeout=10)
+    elapsed = time.monotonic() - t0
+    print(f"Agent thread finished in {elapsed:.2f}s")

-for i, c in enumerate(parent._active_children):
-    print(f"  Child {i} after interrupt: _interrupt_requested={c._interrupt_requested}")
-print(f"Global is_interrupted: {is_interrupted()}")
-
-agent_thread.join(timeout=10)
-elapsed = time.monotonic() - t0
-print(f"Agent thread finished in {elapsed:.2f}s")
-
-result = result_holder[0]
-if result:
-    print(f"Status: {result['status']}")
-    print(f"Duration: {result['duration_seconds']}s")
-    if elapsed < 2.0:
-        print("✅ PASS: Interrupt detected quickly!")
+    result = result_holder[0]
+    if result:
+        print(f"Status: {result['status']}")
+        print(f"Duration: {result['duration_seconds']}s")
+        if elapsed < 2.0:
+            print("✅ PASS: Interrupt detected quickly!")
+        else:
+            print(f"❌ FAIL: Took {elapsed:.2f}s — interrupt was too slow or not detected")
    else:
-        print(f"❌ FAIL: Took {elapsed:.2f}s — interrupt was too slow or not detected")
-else:
-    print("❌ FAIL: No result!")
+        print("❌ FAIL: No result!")

-set_interrupt(False)
+    set_interrupt(False)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/tests/test_interactive_interrupt.py
+++ b/tests/test_interactive_interrupt.py
@ -29,51 +29,6 @@ from unittest.mock import MagicMock, patch
 from run_agent import AIAgent, IterationBudget
 from tools.interrupt import set_interrupt, is_interrupted

-set_interrupt(False)
-
-# ─── Create parent agent ───
-parent = AIAgent.__new__(AIAgent)
-parent._interrupt_requested = False
-parent._interrupt_message = None
-parent._active_children = []
-parent.quiet_mode = True
-parent.model = "test/model"
-parent.base_url = "http://localhost:1"
-parent.api_key = "test"
-parent.provider = "test"
-parent.api_mode = "chat_completions"
-parent.platform = "cli"
-parent.enabled_toolsets = ["terminal", "file"]
-parent.providers_allowed = None
-parent.providers_ignored = None
-parent.providers_order = None
-parent.provider_sort = None
-parent.max_tokens = None
-parent.reasoning_config = None
-parent.prefill_messages = None
-parent._session_db = None
-parent._delegate_depth = 0
-parent._delegate_spinner = None
-parent.tool_progress_callback = None
-parent.iteration_budget = IterationBudget(max_total=100)
-parent._client_kwargs = {"api_key": "test", "base_url": "http://localhost:1"}
-
-# Monkey-patch parent.interrupt to log
-_original_interrupt = AIAgent.interrupt
-def logged_interrupt(self, message=None):
-    log.info(f"🔴 parent.interrupt() called with: {message!r}")
-    log.info(f"   _active_children count: {len(self._active_children)}")
-    _original_interrupt(self, message)
-    log.info(f"   After interrupt: _interrupt_requested={self._interrupt_requested}")
-    for i, c in enumerate(self._active_children):
-        log.info(f"   Child {i}._interrupt_requested={c._interrupt_requested}")
-parent.interrupt = lambda msg=None: logged_interrupt(parent, msg)
-
-# ─── Simulate the exact CLI flow ───
-interrupt_queue = queue.Queue()
-child_running = threading.Event()
-agent_result = [None]
-
 def make_slow_response(delay=2.0):
    """API response that takes a while."""
    def create(**kwargs):
@ -94,96 +49,154 @@ def make_slow_response(delay=2.0):
    return create


-def agent_thread_func():
-    """Simulates the agent_thread in cli.py's chat() method."""
-    log.info("🟢 agent_thread starting")
+def main() -> int:
+    set_interrupt(False)

-    with patch("run_agent.OpenAI") as MockOpenAI:
-        mock_client = MagicMock()
-        mock_client.chat.completions.create = make_slow_response(delay=3.0)
-        mock_client.close = MagicMock()
-        MockOpenAI.return_value = mock_client
+    # ─── Create parent agent ───
+    parent = AIAgent.__new__(AIAgent)
+    parent._interrupt_requested = False
+    parent._interrupt_message = None
+    parent._active_children = []
+    parent.quiet_mode = True
+    parent.model = "test/model"
+    parent.base_url = "http://localhost:1"
+    parent.api_key = "test"
+    parent.provider = "test"
+    parent.api_mode = "chat_completions"
+    parent.platform = "cli"
+    parent.enabled_toolsets = ["terminal", "file"]
+    parent.providers_allowed = None
+    parent.providers_ignored = None
+    parent.providers_order = None
+    parent.provider_sort = None
+    parent.max_tokens = None
+    parent.reasoning_config = None
+    parent.prefill_messages = None
+    parent._session_db = None
+    parent._delegate_depth = 0
+    parent._delegate_spinner = None
+    parent.tool_progress_callback = None
+    parent.iteration_budget = IterationBudget(max_total=100)
+    parent._client_kwargs = {"api_key": "test", "base_url": "http://localhost:1"}

-        from tools.delegate_tool import _run_single_child
+    # Monkey-patch parent.interrupt to log
+    _original_interrupt = AIAgent.interrupt

-        # Signal that child is about to start
-        original_init = AIAgent.__init__
-        def patched_init(self_agent, *a, **kw):
-            log.info("🟡 Child AIAgent.__init__ called")
-            original_init(self_agent, *a, **kw)
-            child_running.set()
-            log.info(f"🟡 Child started, parent._active_children = {len(parent._active_children)}")
+    def logged_interrupt(self, message=None):
+        log.info(f"🔴 parent.interrupt() called with: {message!r}")
+        log.info(f"   _active_children count: {len(self._active_children)}")
+        _original_interrupt(self, message)
+        log.info(f"   After interrupt: _interrupt_requested={self._interrupt_requested}")
+        for i, child in enumerate(self._active_children):
+            log.info(f"   Child {i}._interrupt_requested={child._interrupt_requested}")

-        with patch.object(AIAgent, "__init__", patched_init):
-            result = _run_single_child(
-                task_index=0,
-                goal="Do a slow thing",
-                context=None,
-                toolsets=["terminal"],
-                model="test/model",
-                max_iterations=3,
-                parent_agent=parent,
-                task_count=1,
-                override_provider="test",
-                override_base_url="http://localhost:1",
-                override_api_key="test",
-                override_api_mode="chat_completions",
-            )
-            agent_result[0] = result
-            log.info(f"🟢 agent_thread finished. Result status: {result.get('status')}")
+    parent.interrupt = lambda msg=None: logged_interrupt(parent, msg)

+    # ─── Simulate the exact CLI flow ───
+    interrupt_queue = queue.Queue()
+    child_running = threading.Event()
+    agent_result = [None]

-# ─── Start agent thread (like chat() does) ───
-agent_thread = threading.Thread(target=agent_thread_func, name="agent_thread", daemon=True)
-agent_thread.start()
+    def agent_thread_func():
+        """Simulates the agent_thread in cli.py's chat() method."""
+        log.info("🟢 agent_thread starting")

-# ─── Wait for child to start ───
-if not child_running.wait(timeout=10):
-    print("FAIL: Child never started", file=sys.stderr)
-    sys.exit(1)
+        with patch("run_agent.OpenAI") as MockOpenAI:
+            mock_client = MagicMock()
+            mock_client.chat.completions.create = make_slow_response(delay=3.0)
+            mock_client.close = MagicMock()
+            MockOpenAI.return_value = mock_client

-# Give child time to enter its main loop and start API call
-time.sleep(1.0)
+            from tools.delegate_tool import _run_single_child

-# ─── Simulate user typing a message (like handle_enter does) ───
-log.info("📝 Simulating user typing 'Hey stop that'")
-interrupt_queue.put("Hey stop that")
+            # Signal that child is about to start
+            original_init = AIAgent.__init__

-# ─── Simulate chat() polling loop (like the real chat() method) ───
-log.info("📡 Starting interrupt queue polling (like chat())")
-interrupt_msg = None
-poll_count = 0
-while agent_thread.is_alive():
-    try:
-        interrupt_msg = interrupt_queue.get(timeout=0.1)
-        if interrupt_msg:
-            log.info(f"📨 Got interrupt message from queue: {interrupt_msg!r}")
-            log.info(f"   Calling parent.interrupt()...")
-            parent.interrupt(interrupt_msg)
-            log.info(f"   parent.interrupt() returned. Breaking poll loop.")
-            break
-    except queue.Empty:
-        poll_count += 1
-        if poll_count % 20 == 0:  # Log every 2s
-            log.info(f"   Still polling ({poll_count} iterations)...")
+            def patched_init(self_agent, *a, **kw):
+                log.info("🟡 Child AIAgent.__init__ called")
+                original_init(self_agent, *a, **kw)
+                child_running.set()
+                log.info(
+                    f"🟡 Child started, parent._active_children = {len(parent._active_children)}"
+                )

-# ─── Wait for agent to finish ───
-log.info("⏳ Waiting for agent_thread to join...")
-t0 = time.monotonic()
-agent_thread.join(timeout=10)
-elapsed = time.monotonic() - t0
-log.info(f"✅ agent_thread joined after {elapsed:.2f}s")
+            with patch.object(AIAgent, "__init__", patched_init):
+                result = _run_single_child(
+                    task_index=0,
+                    goal="Do a slow thing",
+                    context=None,
+                    toolsets=["terminal"],
+                    model="test/model",
+                    max_iterations=3,
+                    parent_agent=parent,
+                    task_count=1,
+                    override_provider="test",
+                    override_base_url="http://localhost:1",
+                    override_api_key="test",
+                    override_api_mode="chat_completions",
+                )
+                agent_result[0] = result
+                log.info(f"🟢 agent_thread finished. Result status: {result.get('status')}")

-# ─── Check results ───
-result = agent_result[0]
-if result:
-    log.info(f"Result status: {result['status']}")
-    log.info(f"Result duration: {result['duration_seconds']}s")
-    if result["status"] == "interrupted" and elapsed < 2.0:
-        print("✅ PASS: Interrupt worked correctly!", file=sys.stderr)
-    else:
+    # ─── Start agent thread (like chat() does) ───
+    agent_thread = threading.Thread(target=agent_thread_func, name="agent_thread", daemon=True)
+    agent_thread.start()
+
+    # ─── Wait for child to start ───
+    if not child_running.wait(timeout=10):
+        print("FAIL: Child never started", file=sys.stderr)
+        set_interrupt(False)
+        return 1
+
+    # Give child time to enter its main loop and start API call
+    time.sleep(1.0)
+
+    # ─── Simulate user typing a message (like handle_enter does) ───
+    log.info("📝 Simulating user typing 'Hey stop that'")
+    interrupt_queue.put("Hey stop that")
+
+    # ─── Simulate chat() polling loop (like the real chat() method) ───
+    log.info("📡 Starting interrupt queue polling (like chat())")
+    interrupt_msg = None
+    poll_count = 0
+    while agent_thread.is_alive():
+        try:
+            interrupt_msg = interrupt_queue.get(timeout=0.1)
+            if interrupt_msg:
+                log.info(f"📨 Got interrupt message from queue: {interrupt_msg!r}")
+                log.info("   Calling parent.interrupt()...")
+                parent.interrupt(interrupt_msg)
+                log.info("   parent.interrupt() returned. Breaking poll loop.")
+                break
+        except queue.Empty:
+            poll_count += 1
+            if poll_count % 20 == 0:  # Log every 2s
+                log.info(f"   Still polling ({poll_count} iterations)...")
+
+    # ─── Wait for agent to finish ───
+    log.info("⏳ Waiting for agent_thread to join...")
+    t0 = time.monotonic()
+    agent_thread.join(timeout=10)
+    elapsed = time.monotonic() - t0
+    log.info(f"✅ agent_thread joined after {elapsed:.2f}s")
+
+    # ─── Check results ───
+    result = agent_result[0]
+    if result:
+        log.info(f"Result status: {result['status']}")
+        log.info(f"Result duration: {result['duration_seconds']}s")
+        if result["status"] == "interrupted" and elapsed < 2.0:
+            print("✅ PASS: Interrupt worked correctly!", file=sys.stderr)
+            set_interrupt(False)
+            return 0
        print(f"❌ FAIL: status={result['status']}, elapsed={elapsed:.2f}s", file=sys.stderr)
-else:
-    print("❌ FAIL: No result returned", file=sys.stderr)
+        set_interrupt(False)
+        return 1

-set_interrupt(False)
+    print("❌ FAIL: No result returned", file=sys.stderr)
+    set_interrupt(False)
+    return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())
--- a/tests/test_openai_client_lifecycle.py
+++ b/tests/test_openai_client_lifecycle.py
@ -145,8 +145,9 @@ def test_concurrent_requests_do_not_break_each_other_when_one_client_closes(monk
    thread_one.join(timeout=5)
    thread_two.join(timeout=5)

-    assert isinstance(results["first"], APIConnectionError)
-    assert results["second"] == {"ok": "second"}
+    values = list(results.values())
+    assert sum(isinstance(value, APIConnectionError) for value in values) == 1
+    assert values.count({"ok": "second"}) == 1
    assert len(factory.calls) == 2


--- a/tests/test_run_agent.py
+++ b/tests/test_run_agent.py
@ -930,8 +930,10 @@ class TestConcurrentToolExecution:
            mock_hfc.assert_called_once_with(
                "web_search", {"q": "test"}, "task-1",
                enabled_tools=list(agent.valid_tool_names),
+                honcho_manager=None,
+                honcho_session_key=None,
            )
-        assert result == "result"
+            assert result == "result"

    def test_invoke_tool_handles_agent_level_tools(self, agent):
        """_invoke_tool should handle todo tool directly."""
@ -1584,6 +1586,38 @@ class TestSystemPromptStability:
        should_prefetch = not conversation_history
        assert should_prefetch is True

+    def test_run_conversation_can_skip_honcho_sync_for_synthetic_turns(self, agent):
+        captured = {}
+
+        def _fake_api_call(api_kwargs):
+            captured.update(api_kwargs)
+            return _mock_response(content="done", finish_reason="stop")
+
+        agent._honcho = MagicMock()
+        agent._honcho_session_key = "session-1"
+        agent._honcho_config = SimpleNamespace(
+            ai_peer="hermes",
+            memory_mode="hybrid",
+            write_frequency="async",
+            recall_mode="hybrid",
+        )
+        agent._use_prompt_caching = False
+
+        with (
+            patch.object(agent, "_honcho_sync") as mock_sync,
+            patch.object(agent, "_queue_honcho_prefetch") as mock_prefetch,
+            patch.object(agent, "_persist_session"),
+            patch.object(agent, "_save_trajectory"),
+            patch.object(agent, "_cleanup_task_resources"),
+            patch.object(agent, "_interruptible_api_call", side_effect=_fake_api_call),
+        ):
+            result = agent.run_conversation("synthetic flush turn", sync_honcho=False)
+
+        assert result["completed"] is True
+        assert captured["messages"][-1]["content"] == "synthetic flush turn"
+        mock_sync.assert_not_called()
+        mock_prefetch.assert_not_called()
+

 class TestHonchoActivation:
    def test_disabled_config_skips_honcho_init(self):
--- a/tests/tools/test_honcho_tools.py
+++ b/tests/tools/test_honcho_tools.py
@ -0,0 +1,36 @@
+"""Regression tests for per-call Honcho tool session routing."""
+
+import json
+from unittest.mock import MagicMock
+
+from tools import honcho_tools
+
+
+class TestHonchoToolSessionContext:
+    def setup_method(self):
+        self.orig_manager = honcho_tools._session_manager
+        self.orig_key = honcho_tools._session_key
+
+    def teardown_method(self):
+        honcho_tools._session_manager = self.orig_manager
+        honcho_tools._session_key = self.orig_key
+
+    def test_explicit_call_context_wins_over_module_global_state(self):
+        global_manager = MagicMock()
+        global_manager.get_peer_card.return_value = ["global"]
+        explicit_manager = MagicMock()
+        explicit_manager.get_peer_card.return_value = ["explicit"]
+
+        honcho_tools.set_session_context(global_manager, "global-session")
+
+        result = json.loads(
+            honcho_tools._handle_honcho_profile(
+                {},
+                honcho_manager=explicit_manager,
+                honcho_session_key="explicit-session",
+            )
+        )
+
+        assert result == {"result": ["explicit"]}
+        explicit_manager.get_peer_card.assert_called_once_with("explicit-session")
+        global_manager.get_peer_card.assert_not_called()