diff --git a/.planning/HANDOFF.json b/.planning/HANDOFF.json index e1e552c..8e89043 100644 --- a/.planning/HANDOFF.json +++ b/.planning/HANDOFF.json @@ -1,86 +1,114 @@ { "version": "1.0", - "timestamp": "2026-04-28T18:39:43.064Z", + "timestamp": "2026-04-30T15:03:14Z", "phase": "05", - "phase_name": "MVP Deployment", + "phase_name": "MVP deployment", "phase_dir": ".planning/phases/05-mvp-deployment", - "plan": 4, + "plan": 0, "task": 0, "total_tasks": 0, "status": "paused", "completed_tasks": [ { "id": 1, - "name": "Finalize multi-agent surface image handoff", + "name": "Fix path-based base_url normalization and add WS debug visibility", "status": "done", - "commit": "5b53788" + "commit": "7e5f9c2" }, { "id": 2, - "name": "Publish Docker image for the Matrix surface", + "name": "Add Matrix room recovery, reinvite flow, and default-agent warning behavior", "status": "done", - "artifact": "mput1/surfaces-bot:latest", - "digest": "sha256:26ba3a49290ab7c1cf0fa97f3de3fefdc70b59df7e6f1e0c2255728f8e2369be" + "commit": "7e5f9c2" }, { "id": 3, - "name": "Verify multi-agent file-volume routing contract", + "name": "Switch user file handling to workspace-root filenames with copy-style collision suffixes", "status": "done", - "evidence": "tests cover /agents/17/incoming and /agents/17/output routing" + "commit": "7e5f9c2" + }, + { + "id": 4, + "name": "Verify recent routing incident cause", + "status": "done", + "progress": "Confirmed that config lookup is exact-MXID based; mismatch in homeserver suffix caused fallback to the first agent." } ], "remaining_tasks": [ { - "id": 1, - "name": "Platform team integrates the published surface image into their 25-30 agent deployment", - "status": "external" + "id": 5, + "name": "Build and publish a fresh production image with the current workspace-root attachment contract", + "status": "not_started" }, { - "id": 2, - "name": "Run a real platform smoke test with production Matrix credentials, matrix-agents.yaml, and shared /agents volume", + "id": 6, + "name": "Send the new digest to platform and request Matrix bot redeploy", "status": "not_started" } ], "blockers": [ { - "description": "Full production verification depends on the platform team's real 25-30 agent orchestration and volume mounts.", + "description": "Platform redeploy is still required after the next image publish.", "type": "external", - "workaround": "Use docker-compose.fullstack.yml only as local E2E harness; production uses mput1/surfaces-bot:latest plus platform-managed agents." + "workaround": "None until a fresh digest is published." + }, + { + "description": "Old Phase 04 planning files still contain placeholder content.", + "type": "technical", + "workaround": "Ignore for the current deploy task; clean later as planning debt." } ], "human_actions_pending": [ { - "action": "Send platform the image tag, digest, deploy docs, and matrix-agents.yaml contract", - "context": "The bot is published as a single surface container; platform supplies agents, base_url values, and /agents/N volume mounts.", + "action": "Use exact Matrix MXIDs in user_agents, including the real homeserver suffix.", + "context": "Routing fallback to the first agent occurs whenever the config key does not exactly match the sender.", "blocking": true }, { - "action": "Platform prepares production config/matrix-agents.yaml", - "context": "Each external agent needs agent_id, base_url, and workspace_path such as /agents/17.", + "action": "Redeploy matrix-bot after the new image is published.", + "context": "Config edits alone need a container restart; the file-contract code change needs a new image first.", "blocking": true } ], "decisions": [ { - "decision": "Ship one generic Matrix surface image, not a compose stack with 25-30 agents.", - "rationale": "The platform owns agent lifecycle/orchestration; the surface only needs base_url and workspace_path per agent.", + "decision": "Keep fallback to the first agent for users missing from user_agents.", + "rationale": "Platform wanted that behavior to remain available, but with explicit user warning.", "phase": "05" }, { - "decision": "Make SURFACES_BOT_IMAGE explicit and document the published mput1/surfaces-bot image.", - "rationale": "Docker Hub push access is namespace-specific; hardcoding mrkan0 caused insufficient_scope.", + "decision": "Require exact Matrix MXID matching in user_agents.", + "rationale": "Current routing is deterministic and simple; no fuzzy matching or homeserver aliasing was introduced.", "phase": "05" }, { - "decision": "Keep docker-compose.fullstack.yml as internal E2E only.", - "rationale": "It validates the bot plus one local agent, but is not a model of production multi-agent orchestration.", + "decision": "Use workspace-root filenames for incoming user files and Windows-style copy suffixes on collision.", + "rationale": "Platform requested removal of incoming/outgoing directory split and timestamp-prefixed names.", "phase": "05" } ], "uncommitted_files": [ ".planning/HANDOFF.json", - ".planning/phases/05-mvp-deployment/.continue-here.md" + ".planning/STATE.md", + ".planning/phases/05-mvp-deployment/.continue-here.md", + "README.md", + "adapter/matrix/agent_registry.py", + "adapter/matrix/bot.py", + "adapter/matrix/files.py", + "adapter/matrix/handlers/auth.py", + "adapter/matrix/handlers/chat.py", + "adapter/matrix/reconciliation.py", + "adapter/matrix/routed_platform.py", + "config/matrix-agents.example.yaml", + "docs/deploy-architecture.md", + "sdk/real.py", + "tests/adapter/matrix/test_dispatcher.py", + "tests/adapter/matrix/test_files.py", + "tests/adapter/matrix/test_invite_space.py", + "tests/adapter/matrix/test_reconciliation.py", + "tests/platform/test_real.py", + "tests/test_deploy_handoff.py" ], - "next_action": "Resume by coordinating platform integration: confirm they use mput1/surfaces-bot:latest, mount /agents, provide config/matrix-agents.yaml, then run a real Matrix smoke test.", - "context_notes": "Phase 05 implementation and handoff commit 5b53788 are pushed. The Docker image was successfully built and pushed by the user as mput1/surfaces-bot:latest with digest sha256:26ba3a49290ab7c1cf0fa97f3de3fefdc70b59df7e6f1e0c2255728f8e2369be. Existing unrelated .planning dirt and a local jpg remain in the worktree and were intentionally not included in the handoff commit." + "next_action": "Build and publish a fresh production image from the current worktree, then send the digest to the platform for redeploy.", + "context_notes": "Current runtime logic appears correct. The last reported routing bug was traced to config mismatch between the real Matrix sender and the user_agents key. Do not reuse the previously published recovery image for deployment because it does not include the final workspace-root file contract." } diff --git a/.planning/phases/05-mvp-deployment/.continue-here.md b/.planning/phases/05-mvp-deployment/.continue-here.md index 5f0a722..25fefb4 100644 --- a/.planning/phases/05-mvp-deployment/.continue-here.md +++ b/.planning/phases/05-mvp-deployment/.continue-here.md @@ -3,37 +3,60 @@ phase: 05-mvp-deployment phase_name: MVP deployment task: 0 total_tasks: 0 -status: completed -last_updated: 2026-04-28T21:07:17Z +status: paused +last_updated: 2026-04-30T15:03:14Z --- -Phase 05 deployment handoff is complete. Image rebuilt for linux/amd64 and handoff text prepared for platform team. +Phase 05 code changes are in place, but the latest workspace-root attachment contract is not yet published in a new production image. Today's last debugging step confirmed that the user-to-agent config itself was fine except for one exact-MXID mismatch: the homeserver suffix in `user_agents` did not match the real Matrix sender, so fallback to the first agent was expected. -- Rebuilt image for linux/amd64 (was arm64 only): `mput1/surfaces-bot:latest` -- Updated deploy handoff digest in .continue-here.md -- Prepared deployment checklist text for platform +- Fixed the path-based `base_url` normalization bug that caused WS connects to drop route prefixes. +- Added WS lifecycle debug logging behind `SURFACES_DEBUG_WS=1`. +- Added Matrix routing/recovery behavior: +- warning users when they are not listed in `user_agents` +- preserving room bindings across config updates +- re-inviting users back into their Space and active rooms after leave +- `!new` from the entry/DM room to create a fresh working chat +- Reworked attachment handling so user files now go directly into the agent workspace root with Windows-style collision suffixes like `file (1).pdf`. +- Updated docs and tests to match the new root-workspace file contract. +- Verified that the recent “still goes to default agent” report was caused by exact MXID mismatch in config, not by YAML parsing or runtime routing logic. +- Published earlier images: +- `mput1/surfaces-bot:debug-ws-20260429` +- `mput1/surfaces-bot:matrix-recovery-20260429` -- Platform needs to pull image and deploy -- Awaiting smoke test confirmation from platform side +- Build and publish a new production image that includes the latest workspace-root attachment changes. +- Give the platform the new digest and ask them to redeploy the Matrix bot container. +- Optionally run local smoke/fullstack validation once more before publishing if extra confidence is needed. -- Rebuild for amd64 to match platform's production environment +- Keep the fallback to the first agent when a user is missing from `user_agents`. +- Require exact Matrix MXID match in `user_agents`; no fuzzy matching or homeserver normalization was added. +- Warn the user in-band when default-agent fallback is used. +- Keep room identity and `platform_chat_id` stable across config updates. +- Require container restart for config changes; no image rebuild is needed for `matrix-agents.yaml` edits alone. +- Remove `incoming/` and timestamp prefixes from the attachment contract. +- Save uploaded user files directly at the workspace root and resolve collisions with copy-style suffixes. -- None — implementation complete, awaiting platform deployment +- No code blocker. +- External dependency: platform redeploy after the next image publish. +- Historical debt: placeholder summary/plan artifacts still exist in old Phase 04 files and were not cleaned during this session. + +The current codebase should route correctly if the deployed config uses the exact real Matrix sender IDs, e.g. `@user:matrix.lambda.coredump.ru`. The next likely mistake during resume would be publishing the wrong image digest: the currently published recovery image predates the latest file-contract change. Resume by building a fresh image from the current worktree, not by reusing the old digest. + + -Await platform deployment confirmation. No further implementation work needed until platform reports issues or requests changes. - \ No newline at end of file +Rebuild the production image from the current worktree, publish it, and send the new digest to the platform for redeploy. +