wip: 05-mvp-deployment paused at task 0/0

This commit is contained in:
Mikhail Putilovskij 2026-04-30 18:04:24 +03:00
parent 7e5f9c20a0
commit 6369721876
2 changed files with 93 additions and 42 deletions

View file

@ -1,86 +1,114 @@
{ {
"version": "1.0", "version": "1.0",
"timestamp": "2026-04-28T18:39:43.064Z", "timestamp": "2026-04-30T15:03:14Z",
"phase": "05", "phase": "05",
"phase_name": "MVP Deployment", "phase_name": "MVP deployment",
"phase_dir": ".planning/phases/05-mvp-deployment", "phase_dir": ".planning/phases/05-mvp-deployment",
"plan": 4, "plan": 0,
"task": 0, "task": 0,
"total_tasks": 0, "total_tasks": 0,
"status": "paused", "status": "paused",
"completed_tasks": [ "completed_tasks": [
{ {
"id": 1, "id": 1,
"name": "Finalize multi-agent surface image handoff", "name": "Fix path-based base_url normalization and add WS debug visibility",
"status": "done", "status": "done",
"commit": "5b53788" "commit": "7e5f9c2"
}, },
{ {
"id": 2, "id": 2,
"name": "Publish Docker image for the Matrix surface", "name": "Add Matrix room recovery, reinvite flow, and default-agent warning behavior",
"status": "done", "status": "done",
"artifact": "mput1/surfaces-bot:latest", "commit": "7e5f9c2"
"digest": "sha256:26ba3a49290ab7c1cf0fa97f3de3fefdc70b59df7e6f1e0c2255728f8e2369be"
}, },
{ {
"id": 3, "id": 3,
"name": "Verify multi-agent file-volume routing contract", "name": "Switch user file handling to workspace-root filenames with copy-style collision suffixes",
"status": "done", "status": "done",
"evidence": "tests cover /agents/17/incoming and /agents/17/output routing" "commit": "7e5f9c2"
},
{
"id": 4,
"name": "Verify recent routing incident cause",
"status": "done",
"progress": "Confirmed that config lookup is exact-MXID based; mismatch in homeserver suffix caused fallback to the first agent."
} }
], ],
"remaining_tasks": [ "remaining_tasks": [
{ {
"id": 1, "id": 5,
"name": "Platform team integrates the published surface image into their 25-30 agent deployment", "name": "Build and publish a fresh production image with the current workspace-root attachment contract",
"status": "external" "status": "not_started"
}, },
{ {
"id": 2, "id": 6,
"name": "Run a real platform smoke test with production Matrix credentials, matrix-agents.yaml, and shared /agents volume", "name": "Send the new digest to platform and request Matrix bot redeploy",
"status": "not_started" "status": "not_started"
} }
], ],
"blockers": [ "blockers": [
{ {
"description": "Full production verification depends on the platform team's real 25-30 agent orchestration and volume mounts.", "description": "Platform redeploy is still required after the next image publish.",
"type": "external", "type": "external",
"workaround": "Use docker-compose.fullstack.yml only as local E2E harness; production uses mput1/surfaces-bot:latest plus platform-managed agents." "workaround": "None until a fresh digest is published."
},
{
"description": "Old Phase 04 planning files still contain placeholder content.",
"type": "technical",
"workaround": "Ignore for the current deploy task; clean later as planning debt."
} }
], ],
"human_actions_pending": [ "human_actions_pending": [
{ {
"action": "Send platform the image tag, digest, deploy docs, and matrix-agents.yaml contract", "action": "Use exact Matrix MXIDs in user_agents, including the real homeserver suffix.",
"context": "The bot is published as a single surface container; platform supplies agents, base_url values, and /agents/N volume mounts.", "context": "Routing fallback to the first agent occurs whenever the config key does not exactly match the sender.",
"blocking": true "blocking": true
}, },
{ {
"action": "Platform prepares production config/matrix-agents.yaml", "action": "Redeploy matrix-bot after the new image is published.",
"context": "Each external agent needs agent_id, base_url, and workspace_path such as /agents/17.", "context": "Config edits alone need a container restart; the file-contract code change needs a new image first.",
"blocking": true "blocking": true
} }
], ],
"decisions": [ "decisions": [
{ {
"decision": "Ship one generic Matrix surface image, not a compose stack with 25-30 agents.", "decision": "Keep fallback to the first agent for users missing from user_agents.",
"rationale": "The platform owns agent lifecycle/orchestration; the surface only needs base_url and workspace_path per agent.", "rationale": "Platform wanted that behavior to remain available, but with explicit user warning.",
"phase": "05" "phase": "05"
}, },
{ {
"decision": "Make SURFACES_BOT_IMAGE explicit and document the published mput1/surfaces-bot image.", "decision": "Require exact Matrix MXID matching in user_agents.",
"rationale": "Docker Hub push access is namespace-specific; hardcoding mrkan0 caused insufficient_scope.", "rationale": "Current routing is deterministic and simple; no fuzzy matching or homeserver aliasing was introduced.",
"phase": "05" "phase": "05"
}, },
{ {
"decision": "Keep docker-compose.fullstack.yml as internal E2E only.", "decision": "Use workspace-root filenames for incoming user files and Windows-style copy suffixes on collision.",
"rationale": "It validates the bot plus one local agent, but is not a model of production multi-agent orchestration.", "rationale": "Platform requested removal of incoming/outgoing directory split and timestamp-prefixed names.",
"phase": "05" "phase": "05"
} }
], ],
"uncommitted_files": [ "uncommitted_files": [
".planning/HANDOFF.json", ".planning/HANDOFF.json",
".planning/phases/05-mvp-deployment/.continue-here.md" ".planning/STATE.md",
".planning/phases/05-mvp-deployment/.continue-here.md",
"README.md",
"adapter/matrix/agent_registry.py",
"adapter/matrix/bot.py",
"adapter/matrix/files.py",
"adapter/matrix/handlers/auth.py",
"adapter/matrix/handlers/chat.py",
"adapter/matrix/reconciliation.py",
"adapter/matrix/routed_platform.py",
"config/matrix-agents.example.yaml",
"docs/deploy-architecture.md",
"sdk/real.py",
"tests/adapter/matrix/test_dispatcher.py",
"tests/adapter/matrix/test_files.py",
"tests/adapter/matrix/test_invite_space.py",
"tests/adapter/matrix/test_reconciliation.py",
"tests/platform/test_real.py",
"tests/test_deploy_handoff.py"
], ],
"next_action": "Resume by coordinating platform integration: confirm they use mput1/surfaces-bot:latest, mount /agents, provide config/matrix-agents.yaml, then run a real Matrix smoke test.", "next_action": "Build and publish a fresh production image from the current worktree, then send the digest to the platform for redeploy.",
"context_notes": "Phase 05 implementation and handoff commit 5b53788 are pushed. The Docker image was successfully built and pushed by the user as mput1/surfaces-bot:latest with digest sha256:26ba3a49290ab7c1cf0fa97f3de3fefdc70b59df7e6f1e0c2255728f8e2369be. Existing unrelated .planning dirt and a local jpg remain in the worktree and were intentionally not included in the handoff commit." "context_notes": "Current runtime logic appears correct. The last reported routing bug was traced to config mismatch between the real Matrix sender and the user_agents key. Do not reuse the previously published recovery image for deployment because it does not include the final workspace-root file contract."
} }

View file

@ -3,37 +3,60 @@ phase: 05-mvp-deployment
phase_name: MVP deployment phase_name: MVP deployment
task: 0 task: 0
total_tasks: 0 total_tasks: 0
status: completed status: paused
last_updated: 2026-04-28T21:07:17Z last_updated: 2026-04-30T15:03:14Z
--- ---
<current_state> <current_state>
Phase 05 deployment handoff is complete. Image rebuilt for linux/amd64 and handoff text prepared for platform team. Phase 05 code changes are in place, but the latest workspace-root attachment contract is not yet published in a new production image. Today's last debugging step confirmed that the user-to-agent config itself was fine except for one exact-MXID mismatch: the homeserver suffix in `user_agents` did not match the real Matrix sender, so fallback to the first agent was expected.
</current_state> </current_state>
<completed_work> <completed_work>
- Rebuilt image for linux/amd64 (was arm64 only): `mput1/surfaces-bot:latest` - Fixed the path-based `base_url` normalization bug that caused WS connects to drop route prefixes.
- Updated deploy handoff digest in .continue-here.md - Added WS lifecycle debug logging behind `SURFACES_DEBUG_WS=1`.
- Prepared deployment checklist text for platform - Added Matrix routing/recovery behavior:
- warning users when they are not listed in `user_agents`
- preserving room bindings across config updates
- re-inviting users back into their Space and active rooms after leave
- `!new` from the entry/DM room to create a fresh working chat
- Reworked attachment handling so user files now go directly into the agent workspace root with Windows-style collision suffixes like `file (1).pdf`.
- Updated docs and tests to match the new root-workspace file contract.
- Verified that the recent “still goes to default agent” report was caused by an exact-MXID mismatch in the config, not by YAML parsing or runtime routing logic.
- Published earlier images:
- `mput1/surfaces-bot:debug-ws-20260429`
- `mput1/surfaces-bot:matrix-recovery-20260429`
</completed_work> </completed_work>
<remaining_work> <remaining_work>
- Platform needs to pull image and deploy - Build and publish a new production image that includes the latest workspace-root attachment changes.
- Awaiting smoke test confirmation from platform side - Give the platform the new digest and ask them to redeploy the Matrix bot container.
- Optionally run local smoke/fullstack validation once more before publishing if extra confidence is needed.
</remaining_work> </remaining_work>
<decisions_made> <decisions_made>
- Rebuild for amd64 to match platform's production environment - Keep the fallback to the first agent when a user is missing from `user_agents`.
- Require exact Matrix MXID match in `user_agents`; no fuzzy matching or homeserver normalization was added.
- Warn the user in-band when default-agent fallback is used.
- Keep room identity and `platform_chat_id` stable across config updates.
- Require container restart for config changes; no image rebuild is needed for `matrix-agents.yaml` edits alone.
- Remove `incoming/` and timestamp prefixes from the attachment contract.
- Save uploaded user files directly at the workspace root and resolve collisions with copy-style suffixes.
</decisions_made> </decisions_made>
<blockers> <blockers>
- None — implementation complete, awaiting platform deployment - No code blocker.
- External dependency: platform redeploy after the next image publish.
- Historical debt: placeholder summary/plan artifacts still exist in old Phase 04 files and were not cleaned up during this session.
</blockers> </blockers>
<context>
The current codebase should route correctly if the deployed config uses the exact real Matrix sender IDs, e.g. `@user:matrix.lambda.coredump.ru`. The most likely mistake when resuming would be publishing the wrong image digest: the currently published recovery image predates the latest file-contract change. Resume by building a fresh image from the current worktree, not by reusing the old digest.
</context>
<next_action> <next_action>
Await platform deployment confirmation. No further implementation work needed until platform reports issues or requests changes. Rebuild the production image from the current worktree, publish it, and send the new digest to the platform for redeploy.
</next_action> </next_action>