From f0c4988b4439afd4e7aefef25f22e18a957d5498 Mon Sep 17 00:00:00 2001 From: Azamat Date: Thu, 2 Apr 2026 11:57:20 +0300 Subject: [PATCH 01/30] [feat] add context and tasks for master-service --- docs/master.md | 125 ++++++++++ meetings/meeting_1_document.md | 405 +++++++++++++++++++++++++++++++++ tasks.md | 170 ++++++-------- 3 files changed, 602 insertions(+), 98 deletions(-) create mode 100644 docs/master.md create mode 100644 meetings/meeting_1_document.md diff --git a/docs/master.md b/docs/master.md new file mode 100644 index 0000000..bdf21c4 --- /dev/null +++ b/docs/master.md @@ -0,0 +1,125 @@ +# Требования к master-service + +Источник: `meetings/meeting_1_document.md` + уточнение от тимлида + +## Назначение + +Master-service — это control plane платформы AI-агентов. Он управляет sandbox-контейнерами с AI-Agent, рабочими пространствами пользователей, чатами, пользовательскими файлами, проверкой доступа и выдачей временного p2p-доступа к контейнеру. + +После старта sandbox и успешной авторизации Master должен оставаться плоскостью управления, а не постоянным data-plane прокси: рабочий трафик к агенту должен идти напрямую по выданному p2p-каналу. + +## Границы ответственности + +### В scope + +- оркестрация sandbox +- управление workspace и chat volume +- хранение чатов? 
и файлов пользователя +- хранение и выдача артефактов +- проверка доступа +- выдача и отзыв p2p-доступа +- учет сессий, TTL и cleanup-процессов +- хранение служебных метаданных + +### Вне scope + +- Telegram/Discord/Slack интеграции и их transport-логика +- бизнес-логика AI-Agent внутри sandbox +- реализация LLM-level защиты от prompt injection +- реализация конкретных lambda-tools и browser tools + +## Основные сущности + +- **User** — владелец workspace +- **Workspace** — пользовательское хранилище с мягкой квотой 10 GB +- **Chat** — изолированная рабочая область внутри workspace +- **SandboxSession** — активный runtime AI-Agent для конкретного чата +- **AccessLease** — временное право на p2p-доступ к sandbox +- **ChatFile** — файл, прикрепленный к чату +- **Artifact** — результат, созданный агентом и подготовленный к выдаче наружу +- **DependencyBundle** — предзагруженный набор зависимостей tool/runtime +- **LambdaToolBundle** — платформенный read-only набор тулов + +## Функциональные требования + +### 1. Управление workspace и chat storage + +- **FR-001** Master должен создавать workspace пользователя при первом обращении +- **FR-002** Master должен вести мягкую квоту workspace в 10 GB на пользователя без физического резервирования места +- **FR-003** Master должен создавать, получать, перечислять и удалять чаты пользователя +- **FR-004** Каждый chat должен иметь собственную изолированную директорию с файлами и историей +- **FR-005** История сообщений чата в первой версии должна храниться рядом с чатом в файле `history.md` +- **FR-006** Master должен сохранять пользовательские вложения в директорию соответствующего чата до запуска или во время работы sandbox +- **FR-007** Master должен предоставлять операции списка, чтения метаданных, удаления и очистки файлов чата + +### 2. 
Оркестрация sandbox + +- **FR-008** Master должен поднимать sandbox-контейнер AI-Agent по первому сообщению или при отсутствии активной sandbox-сессии +- **FR-009** Master должен переиспользовать активную sandbox-сессию, пока не истек TTL бездействия +- **FR-010** Master должен завершать sandbox после периода неактивности; стартовое значение TTL — около 1 часа, но параметр должен быть конфигурируемым +- **FR-011** После завершения sandbox данные чата и workspace должны сохраняться и использоваться при следующем старте +- **FR-012** Master должен уметь пересоздать sandbox на другом узле при сохранности данных и доступности нужных volume + +### 3. Подключение volume и runtime-ресурсов + +- **FR-013** При старте sandbox Master должен подключать volume текущего чата +- **FR-014** При старте sandbox Master должен подключать volume с dependency cache +- **FR-015** При старте sandbox Master должен подключать volume с lambda-tools в режиме read-only +- **FR-016** Если используются user-tools, Master должен подключать их так, чтобы агент мог их только исполнять, но не менять +- **FR-017** Master должен проверять наличие нужных dependency bundle на текущем узле и при отсутствии инициировать загрузку из внутреннего хранилища + +### 4. Доступ и p2p-соединение + +- **FR-018** Master должен аутентифицировать внутренние запросы от интеграционного слоя и других доверенных сервисов +- **FR-019** Master должен авторизовывать доступ по связке user/workspace/chat/sandbox +- **FR-020** После успешной проверки доступа Master должен выдавать временные параметры p2p-подключения к sandbox +- **FR-021** Выданный p2p-доступ должен быть ограничен по времени и привязан к конкретной sandbox-сессии +- **FR-022** Master должен уметь отзывать p2p-доступ при остановке sandbox, смене прав или завершении сессии +- **FR-023** Master не должен проксировать постоянный трафик к агенту после выдачи p2p-доступа, кроме управляющих операций + +### 5. 
Хранение чатов, файлов и артефактов
+
+- **FR-024** Master должен хранить центральные метаданные о пользователях, чатах, sandbox-сессиях, lease, файлах и артефактах
+- **FR-025** Master должен отделять метаданные чата в центральной БД от фактической истории сообщений в `history.md`
+- **FR-026** Master должен поддерживать загрузку артефактов в S3-совместимое объектное хранилище
+- **FR-027** Master должен хранить связь между артефактом, пользователем, чатом и способом внешней доставки
+- **FR-028** После подтвержденной отправки артефакт должен быть помечен на удаление или удален по policy
+
+### 6. Lifecycle и cleanup
+
+- **FR-029** Master должен отслеживать `last_read` и/или `last_update` для файлов чата
+- **FR-030** Файлы чата должны автоматически удаляться после 3 дней неактивности
+- **FR-031** Master должен поддерживать удаление всего workspace после длительной неактивности аккаунта; срок должен быть конфигурируемым
+- **FR-032** Cleanup должен быть безопасным: нельзя удалять данные активного sandbox или чата с действующим lease
+
+### 7. 
Наблюдаемость и аудит + +- **FR-033** Master должен логировать создание и остановку sandbox, подключение volume, выдачу доступа и cleanup-операции +- **FR-034** Master должен собирать метрики по активным sandbox, времени запуска, ошибкам старта, cleanup и storage usage +- **FR-035** Master должен предоставлять аудитный след по операциям доступа к chat/file/artifact ресурсам + +## Нефункциональные требования + +- **NFR-001** Master должен быть control-plane сервисом и не содержать бизнес-логику AI-Agent +- **NFR-002** Все данные разных пользователей должны быть изолированы на уровне путей, mount policy и access policy +- **NFR-003** Повторный запрос на создание chat или запуск sandbox не должен приводить к дублированию ресурсов при одинаковом идемпотентном ключе +- **NFR-004** Политики TTL, cleanup, квоты и lease должны задаваться конфигом +- **NFR-005** Решение должно позволять горизонтально масштабировать Master и пересоздавать sandbox независимо от конкретного инстанса сервиса +- **NFR-006** Отказ sandbox не должен приводить к потере chat history и пользовательских файлов +- **NFR-007** Все security-чувствительные операции должны быть трассируемы и наблюдаемы + +## Явные ограничения первой версии + +- один master-service отвечает за orchestration и storage metadata +- история сообщений хранится в `history.md` внутри chat directory +- workspace quota считается по фактическому размеру файлов, без hard reservation +- heavy tools и browser automation не выносятся в отдельный сервис, пока это не потребуется +- интеграции мессенджеров считаются внешними клиентами master-service + +## Открытые вопросы + +- какой именно transport используется для p2p-доступа: raw TCP, WebSocket, gRPC или другой вариант = WebSocket +- какой срок inactivity cleanup для всего аккаунта считается продуктовым дефолтом +- нужен ли отдельный API для скачивания chat files наружу или достаточно metadata + object/file gateway +- подтверждается ли доставка артефакта внешней интеграцией явно или 
cleanup идет только по TTL +- одна sandbox-сессия должна соответствовать одному chat или допустим multiplex нескольких чатов на один runtime diff --git a/meetings/meeting_1_document.md b/meetings/meeting_1_document.md new file mode 100644 index 0000000..15b47c2 --- /dev/null +++ b/meetings/meeting_1_document.md @@ -0,0 +1,405 @@ +# Meeting 1 — Platform Architecture Design +**Date:** 2026-03-21 (estimated from transcript context) +**Duration:** ~1:30:18 (00:00 - 1:30:18) +**Participants:** Azamat N (lead architect, screen-sharing), Egor (Егор), Yaroslav (Ярослав), David Shvarts (Давид Шварц), Andrey Bakhtiozin (Бахтиозин Андрей) +**Platform:** DION video conferencing +**Collaborative tool:** Excalidraw (shared whiteboard) + +## Summary + +The Lambda Lab 3.0 team held their first architecture planning meeting to design the AI-agent platform. Azamat presented a draft architecture he had developed with Egor over the preceding two days, walking through the system on a shared Excalidraw whiteboard. The platform consists of two main applications: a **Master** service (orchestrator managing sessions, containers, and context) and an **AI-Agent** service (running inside isolated Docker containers with a per-user workspace). The team discussed the workspace structure (10GB per user with chat-scoped files and history), three categories of tools (built-in, lambda/platform, and user-generated), dependency management via mount points, artifact storage via S3, and message history storage options. Key side topics included security concerns around prompt injection and file leakage, the agent-browser tool for web interaction, infrastructure decisions (Forgejo for Git hosting, Matrix for team communication), and upcoming deadlines — specifically presenting this architecture to Kalinin the next day. + +The architecture is explicitly described as a first draft created in two days, meant to establish a direction rather than be final. 
The team assigned responsibilities: Azamat handles Forgejo repository setup, wiki documentation, and architecture presentation preparation. The meeting also touched on developer tooling — using Claude/OpenCode with agent profiles, CLAUDE.md-style configuration files, and CI/CD pipelines for documentation. + +## Discussion + +### Setup and Technical Issues +**[00:00 — 02:14]** + +The meeting opened with participants joining the DION video call and resolving recording/connection issues. Azamat briefly disconnected and reconnected. Five participants were present: Azamat N, Egor, Yaroslav, David Shvarts, and Andrey Bakhtiozin. + +**Visual context:** DION video conferencing interface showing participant tiles with initials and names. + +--- + +### Platform Architecture Overview +**[02:17 — 04:50]** + +Azamat shared his Excalidraw screen and began presenting the architecture he had drafted with Egor. He explained the overall platform goal: creating AI agents accessible through various integrations (Telegram, Discord, Slack), where the agent lives in an isolated environment. + +The architecture has two main applications: + +1. **Master** — orchestrator that manages agent sessions. When a user writes in Telegram, the message hits the Master, which creates a session and asks Docker to spin up a container with the AI-Agent. + +2. **AI-Agent** — the second application, living inside an isolated Docker container. Once the container is running, communication happens directly between the messaging integration and the agent container (peer-to-peer), bypassing the Master. + +> **Azamat** @ 02:37: "Вот наша задача сделать разными схемами платформу, которые подключаются к различной интеграции по типу Telegram, Discord и всего такого." + +> **Azamat** @ 02:51: "Наша задача именно сделать агента, который живет в изолированной среде." + +**Visual context:** Excalidraw diagram showing Stream (Telegram, Discord, Slack) -> Docker (multiple AI-Agent instances) -> Master (S3, DB). 
Arrows indicate data flow between components. + +--- + +### Workspace Structure and Projects +**[05:15 — 07:28]** + +Azamat described the workspace allocated per user — 10GB of storage. Within this workspace, users can create projects (later renamed to "chats"), each containing files the agent interacts with (PDFs, images, documents, videos). This is analogous to ChatGPT's project feature. + +> **Azamat** @ 05:15: "Мы ему выделяем 10 гигабайт памяти. Просто. И он в эти 10 гигабайт может делать, что хочет." + +> **Azamat** @ 07:14: "У юзера, вот как в chat.gpt можно создавать проекты, здесь будет то же самое." + +**Visual context:** Excalidraw diagram showing "Workspace 10GB" containing P1 (file1.pdf, file2.png), P2 (file3.docx), P3 (video.mp4). + +--- + +### Tool Categories +**[07:30 — 12:35]** + +The team established three categories of tools: + +**1. Built-in tools** (embedded in agent code): +- `web-search` — web search capability +- `fetch-url` — fetch a web page (e.g., `http://simple-file.ru/index.html`) +- `bash` — command-line interaction (cd, ls, git, python, etc.) + +**2. Lambda tools (platform-wide, read-only):** +- Written by other teams within the organization +- Example: `markitdown` — converts various formats (PDF, DOCX, HTML) to Markdown +- Must support `--help`, `--json`/`--agent` flags for agent-friendly interaction +- Must support `man-agent ` for machine-readable documentation +- Read-only, mounted into every agent container + +**3. User tools (per-user, execute-only):** +- Agent can write scripts to solve specific user tasks +- Example: a simple task-tracker with `add` and `close` commands, backed by sqlite.db +- Execute-only permissions (`+x only`) for security + +> **Azamat** @ 10:38: "Когда мы вызываем условно какой-нибудь такой тул, у нас должно быть типа help и желательно еще минус-минус JSON, чтобы вот это агент понимал." 
+ +**Visual context:** Excalidraw showing AI-Agent sandbox with built-in tools listed, "Сторонние тулы RO" (third-party tools read-only) section, and "User-tool +x only" section. Workspace with P1/P2/P3 projects below. + +--- + +### Security Discussion: User Tools +**[15:58 — 16:30]** + +Egor raised a security concern: if user tools are writable by the agent, a malicious prompt injection could cause the agent to modify user tools, inject code like `print(token)`, and exfiltrate credentials stored in environment variables. + +> **Egor** @ 15:58: "Потому что это поможет подцепить где-нибудь вирусный промпт, полезть в папку UserTools, ввести туда какой-нибудь принт токен и слить токены оттуда." + +The team agreed that user tools should have execute-only permissions and that this area needs further discussion. + +--- + +### Dependency Management and Mount Points +**[17:37 — 23:47]** + +The team discussed how dependencies for tools (Python libraries, etc.) would be managed. Since downloading gigabytes of dependencies on every container launch is impractical, they decided to: + +- **Pre-mount dependencies** on the host machine +- Check if dependencies exist on the current node at container launch +- If not present, download from their own S3/cloud storage (in the same data center) rather than external sources +- Store as mounted volumes, not inside containers + +Mount points defined: +- `Mount cur-chat` — current chat/project files +- `Mount dependencies` — pre-cached tool dependencies +- `Mount lambda-tools` — shared platform tools (RO) + +> **Azamat** @ 18:15: "Нам надо где-то сохранять, потому что на каждый запуск еще тянуть гигабайт со скоростью 100 килобит в секунду не очень хочется." + +**Visual context:** Updated Excalidraw diagram showing mount points connected to the AI-Agent workspace, with arrows from host machine storage to container mounts. 
+ +--- + +### S3 Storage for Artifacts +**[16:46 — 17:36]** + +S3 is used for artifact storage — files the agent generates (reports, presentations, images) that need to be sent back to users via Telegram or other messengers. The agent uploads artifacts to S3, and they are delivered to the user through the messaging integration. + +> **Azamat** @ 21:32: "Просто файл-помойка. Типа, если ты хочешь знать, как с ним работать, ты файлик просто по http туда загружаешь." + +> **David** @ 22:08: "Считай просто это как Google диск, только наш." + +--- + +### Projects vs. Chats Naming +**[36:07 — 37:50]** + +Confusion arose around the "project" naming because in ChatGPT, a project contains multiple chats. The team decided to rename "projects" to **"chats"** (C1, C2, C3) to avoid confusion. Each chat has its own files and history.db. The question of supporting multiple chats per project was deferred. + +> **Azamat** @ 36:29: "Знаешь, короче, как чаты просто в GPT-шке, только мы их называли проектами." + +> **Egor** @ 37:30: "Confusing naming тут." + +--- + +### Message History Storage +**[30:55 — 35:32]** + +The team debated where to store message history. Options discussed: + +1. **Central database** — rejected because querying by project_id across scattered chunks is inefficient +2. **Per-chat file** (history.db) inside the workspace — accepted as the initial approach + +Egor argued that per-chat SQLite files are more efficient since you always need the full history for a chat, not individual messages. The database fields discussed: `project_id`, `timestamp`, `msg`. + +> **Egor** @ 34:45: "Каждый запрос в BD, ему нужно пройтись по индексу, найти все записи вот с этим project id, а они раскиданы все по разным чанкам." + +> **Azamat** @ 36:04: "Я вспомнил про файлы типа memory.md, soul.md. Кто-то же так делает." + +**Visual context:** Database schema shown with fields: session_id/project_id, user_id, timestamp, msg. 
+ +--- + +### File Lifecycle and Cleanup +**[37:50 — 42:12]** + +The team discussed file retention policy within chats: + +- **Decision:** Files inside chats live for **3 days** by default +- Track `last_read`/`last_update` timestamps +- After 3 days without access, files are auto-deleted +- If a user needs a deleted file, the bot asks them to re-send it +- For inactive accounts: delete entire workspace after N months (like Telegram's 6-month inactivity deletion) +- Artifacts generated by the agent are deleted after being sent to the user via messenger + +> **Azamat** @ 40:20: "Условно сделаем хранение три дня файлов, то есть если три дня никто их не трогает." + +> **Azamat** @ 48:33: "Давай просто как у телеграма. Если профиль не активен 3 месяца, он удаляет аккаунт." + +--- + +### Responsibilities and Team Boundaries +**[43:00 — 43:43]** + +The team explicitly marked areas of responsibility on the Excalidraw diagram. The green-highlighted sections (Telegram/Discord/Slack integrations and Stream) are **not their responsibility** — those belong to "Путиловский-team" (Putilovsky's team). Lambda Lab 3.0 owns the Master, AI-Agent, workspace, and tools infrastructure. + +**Visual context:** Diagram annotated with "Путиловский-team" labels on the Stream/integration components. + +--- + +### Machine-Level Architecture +**[44:29 — 47:01]** + +Egor suggested visualizing the machine-level architecture: a Linux base, shared environment layer with lambda-tools, then multiple Agent-AI containers above, each with their own environment and code layer. This shows how lambda tools are shared across all agents on a machine, while each agent has its own isolated workspace. + +**Visual context:** New Excalidraw section labeled "Машина" (Machine) showing: Linux -> Env -> lambda-tools -> multiple Agent-AI instances. 
+ +--- + +### Storage Overcommit Discussion +**[49:16 — 51:32]** + +Azamat suggested that the 10GB per user wouldn't be physically reserved — similar to VPS overcommit, they'd simply check that each user's directory doesn't exceed 10GB. If a machine runs out of space, Kubernetes can migrate workloads to another node. The team agreed this is acceptable for the initial version and deferred detailed capacity planning. + +> **Azamat** @ 49:26: "Мы просто их будем создавать на диске и смотреть, что папка пользователей не больше 10 гигавайт." + +--- + +### Container Lifecycle and TTL +**[55:44 — 56:10]** + +When a user sends their first message, a container spins up. The container has a **TTL** — if there are no messages for about an hour, the container is killed. When the user returns, a new container is launched with the same mounted workspace data, so everything persists. + +> **Azamat** @ 55:55: "У него есть какой-то TTL. Ну, типа, если там сообщений нет час, ну, контейнер сдохнет." + +--- + +### Testing Strategy +**[52:40 — 53:25]** + +The question of testing without integrations was raised. Since the Telegram/Discord/Slack integration is another team's responsibility, testing the AI-Agent would require generating mock events. Azamat suggested using AI tools to quickly generate test client instruments. + +--- + +### Repository Structure +**[53:28 — 54:17]** + +The team decided on **two repositories**: one for Master, one for AI-Agent. Both will be hosted on Forgejo (their self-hosted Git platform). The architecture document and repository structure are due by Friday (per Kalinin's report). + +> **Azamat** @ 53:54: "Написано в отчете: команда к пятнице — предоставить структуру репозитория и архитектурный документ." 
+ +--- + +### Infrastructure: Forgejo, Matrix, Communication +**[57:06 — 1:03:34]** + +- **Forgejo** — self-hosted Git service at lambda.coredump.ru, used instead of GitHub for code privacy and Docker registry access +- **Matrix** — team communication platform, registration at element.lambda.coredump.ru (E2E encryption disabled on server). To be used as backup if Telegram gets blocked. +- The team discussed registering on both platforms + +> **Azamat** @ 1:01:01: "Наш личный гитаб." + +**Visual context:** Forgejo landing page showing "MAI Lambda Lab 3.0" with self-hosted Git repository features. Telegram chat showing Matrix registration instructions. + +--- + +### Agent-Browser Tool +**[1:12:11 — 1:15:32]** + +The team discussed the agent-browser tool — a CLI tool for browser automation. They reviewed the GitHub repository for an open-source agent-browser (23.6k stars, Rust-based, ~1MB binary). Key concern: Chromium dependency (~800MB + RAM usage). + +**Decision:** Start with running the browser tool locally inside the container. If it proves too heavy, move to a separate API-based service. + +> **Azamat** @ 1:14:51: "Давай по умолчанию, ну что, вот сейчас примем решение, что оно по API ходит." + +**Visual context:** GitHub repository page for agent-browser showing quick-start commands, installation instructions, and browser automation commands. + +--- + +### Scripting and Tool Implementation +**[1:16:00 — 1:19:07]** + +Discussion about how agents should implement tools at runtime. The team considered Lua scripting as a lightweight alternative to Python for agent-generated scripts, avoiding dependency issues. The idea is that the agent writes small scripts to process data without needing to call the LLM again for each step. + +> **Azamat** @ 1:18:43: "Проще питон, потому что у вас питон уже будет поднятый." 
+ +--- + +### Security: Prompt Injection and Data Leakage +**[1:19:27 — 1:23:58]** + +Extended discussion about security risks: + +- **Prompt injection** — malicious prompts embedded in web pages or files could instruct the agent to exfiltrate user data +- **File leakage** — sensitive files (e.g., NDA documents, movie scripts) uploaded by users could be leaked by a compromised agent +- **Defense options discussed:** + - Agent creates an execution plan before acting, which can be reviewed + - User tools should have execute-only permissions + - LLM-level defense is "useless" per Egor +- **Decision:** Security is not Lambda Lab 3.0's sole responsibility — there's another team doing "LLM research" that should add **security metrics** and test defense mechanisms + +> **Egor** @ 1:23:28: "И защищаться на уровне ЛЛМ это бесполезно." + +> **Azamat** @ 1:23:50: "Придумать метрику безопасности." + +--- + +### Tomorrow's Presentation and Next Steps +**[1:24:33 — 1:30:18]** + +The team confirmed that the Excalidraw architecture diagram is the main presentation material for the next day's meeting with Kalinin. Azamat will: +- Set up the Forgejo repository +- Create wiki documentation +- Set up the architecture document +- Share Forgejo registration links + +The meeting ended with Azamat demonstrating his OpenCode setup — showing agent profiles (code-refactorer, bug-hunter-fixer, worker, documentation-generator, feature-developer, code-explorer, test-engineer, code-reviewer) and system-design.md configuration files that define how AI coding agents interact with the project. + +> **Azamat** @ 1:25:09: "Касательно завтрашней презентации, мы же будем как-то вот показывать эту схему, не на пальцах же объяснять?" + +**Visual context:** Terminal showing OpenCode CLI with Claude Opus 4.6 model, agent profiles, system-design.md configuration file showing subagent delegation rules. 
+ +--- + +## Decisions Made + +| # | Decision | Context | Timestamp | +|---|----------|---------|-----------| +| 1 | Platform consists of two applications: Master (orchestrator) and AI-Agent (sandboxed) | Core architecture decision | 03:01 | +| 2 | 10GB workspace per user with overcommit (not physically reserved) | Storage allocation strategy | 05:15, 49:26 | +| 3 | Three tool categories: built-in (in agent code), lambda (platform-wide RO), user (per-user +x only) | Tool taxonomy | 07:30 | +| 4 | Tools must support `--help`, `--json`/`--agent` flags and `man-agent` command | Tool interface contract | 10:38 | +| 5 | User tools have execute-only permissions for security | Security measure | 16:19 | +| 6 | Dependencies pre-mounted from host, cached in S3 | Performance optimization | 18:15 | +| 7 | Rename "projects" to "chats" (C1, C2, C3) | Naming clarity | 37:40 | +| 8 | Message history stored as per-chat history.db files inside workspace | Storage simplicity | 33:14 | +| 9 | Files inside chats auto-delete after 3 days of inactivity | File lifecycle policy | 40:20 | +| 10 | Inactive accounts deleted after N months (Telegram-style) | Account lifecycle | 48:33 | +| 11 | Container TTL ~1 hour without messages | Resource management | 55:55 | +| 12 | Two repositories: one for Master, one for AI-Agent | Code organization | 54:00 | +| 13 | Use Forgejo for Git hosting | Infrastructure | 57:30 | +| 14 | Use Matrix as backup communication (Element client) | Team communication | 1:00:37 | +| 15 | Agent-browser starts as local tool, move to API if too heavy | Browser tool strategy | 1:14:51 | +| 16 | Security is a shared responsibility — LLM metrics team should add security benchmarks | Security ownership | 1:23:48 | + +## Action Items + +| # | Owner | Task | Deadline | Timestamp | +|---|-------|------|----------|-----------| +| 1 | Azamat | Create Forgejo repository structure for Master and AI-Agent | Today (meeting day) | 1:00:12 | +| 2 | Azamat | Set up wiki and upload 
architecture documentation | Today | 1:00:19 | +| 3 | Azamat | Share Forgejo registration link with the team | Today | 1:01:24 | +| 4 | Azamat | Create Matrix chat for the team | Today | 1:03:09 | +| 5 | Azamat | Prepare architecture presentation for Kalinin | Tomorrow | 1:24:33 | +| 6 | Team | Register on Forgejo (lambda.coredump.ru) | Today | 1:01:06 | +| 7 | Team | Register on Matrix (element.lambda.coredump.ru) | Today | 1:00:37 | +| 8 | Team | Define tool interface contract (--help, --json, man-agent) | Not specified | 11:15 | +| 9 | Team | Set up dev-environment configuration (CLAUDE.md-style) | Not specified | 1:26:22 | +| 10 | Other team | Define security metrics for LLM safety | Not specified | 1:23:50 | + +## Key Quotes + +> "Наша задача именно сделать агента, который живет в изолированной среде, при этом он может пользоваться какими-то тулами." — Azamat, 02:51 + +> "Мы ему выделяем 10 гигабайт памяти. Просто. И он в эти 10 гигабайт может делать, что хочет." — Azamat, 05:15 + +> "Потому что это поможет подцепить где-нибудь вирусный промпт, полезть в папку UserTools, ввести туда какой-нибудь принт токен и слить токены оттуда." — Egor, 15:58 + +> "Каждый запрос в BD, ему нужно пройтись по индексу, найти все записи вот с этим project id, а они раскиданы все по разным чанкам." — Egor, 34:45 + +> "Считай просто это как Google диск, только наш." — David, 22:08 + +> "И защищаться на уровне ЛЛМ это бесполезно." — Egor, 1:23:28 + +> "Мы же, понимаешь, у нас же, это типа за два дня накинутая просто первая архитектура. Чтобы мы начали думать в этом направлении хотя бы." 
— Azamat, 50:14 + +## Visual Artifacts + +### Architecture Diagram (Excalidraw — primary artifact) + +The main diagram evolved throughout the meeting on Excalidraw and shows the following system: + +**Top-level architecture:** +- **Stream layer** (Путиловский-team responsibility): Telegram, Discord, Slack integrations +- **Docker layer**: Multiple AI-Agent containers +- **Master**: Orchestrator connected to DB and S3 +- **S3**: Artifact/file storage +- **DB**: Session management, metadata + +**AI-Agent Sandbox (per-user):** +- **Workspace 10GB** containing: + - Chats: C1 (file1.pdf, file2.png, history.db), C2 (file3.docx, history.db), C3 (video.mp4, history.db) + - User-tool (+x only) +- **Mount points:** + - `Mount cur-chat` — current chat data + - `Mount dependencies` — cached tool dependencies + - `Mount lambda-tools` — shared platform tools +- **Built-in tools:** web-search, fetch-url, bash +- **Lambda tools (RO):** markitdown (with --help, --json/--agent, man-agent interface) + +**Machine-level view:** +- Linux base +- Shared Env layer +- Lambda-tools (shared) +- Multiple Agent-AI instances, each with own workspace + +### ChatGPT Projects Reference +**[36:30]** Azamat showed ChatGPT's project interface to illustrate the concept of projects containing chats with attached files. + +### Forgejo Instance +**[57:30]** The team's Forgejo instance at "MAI Lambda Lab 3.0" — self-hosted Git repository service. + +### Agent-Browser GitHub Repository +**[1:13:30]** agent-browser repository (23.6k stars, 58 releases, v0.21.2, Rust-based CLI tool for browser automation). + +### OpenCode Agent Setup +**[1:28:00 — 1:29:30]** Azamat's terminal showing OpenCode CLI with Claude Opus 4.6, multiple agent profiles (@code-refactorer, @bug-hunter-fixer, @worker, @documentation-generator, @feature-developer, @code-explorer, @test-engineer, @code-reviewer), and system-design.md configuration for agent behavior. + +## Open Questions + +1. 
**How exactly should the tool interface contract work?** The `--help`, `--json`/`--agent` flags and `man-agent` command were proposed but not finalized. +2. **Should user tools be writable by the agent or only loaded from external sources?** Security implications were raised but not fully resolved. +3. **What happens when a heavy tool (like agent-browser) requires more resources than the container allows?** The "start local, move to API" approach was agreed on but details remain. +4. **How to handle the scenario where an agent needs a library that isn't pre-installed?** The scripting discussion suggested Python is available, but library management wasn't resolved. +5. **Security architecture:** Who is responsible for defending against prompt injection attacks? The LLM metrics team was suggested, but no formal agreement was reached. +6. **Memory and context management:** SOUL.md/MEMORY.md-style files were mentioned but not designed. +7. **What should the Kubernetes setup look like?** Mentioned as needed "in the second half of spring" but not planned. +8. **Detailed flow for file lifecycle:** When exactly are S3 artifacts cleaned up? The "3-day rule" applies to workspace files, but S3 cleanup timing remains unclear. +9. **Browser Use vs. agent-browser:** The relationship with Durevich's team regarding browser integration needs clarification. 
+ +## Source +Original transcript: `meeting_1.txt` diff --git a/tasks.md b/tasks.md index b50acb2..c6ed917 100644 --- a/tasks.md +++ b/tasks.md @@ -1,142 +1,116 @@ -# План работ: web-python-skelet +# План работ: master-service MVP sandbox ## Контекст -- Источник требований: `AGENTS.md` и ADR `docs/001`-`docs/004` -- Текущее состояние: в `adapter/`, `domain/`, `usecase/`, `repository/`, `test/` пока только `__init__.py` -- Отсутствуют рабочие каталоги и файлы из целевой структуры: `adapter/config/`, `adapter/otel/`, `adapter/di/`, `adapter/http/fastapi/`, `config/`, `main.py` -- Ограничения: `docs/` и `tasks.md` не добавлять в git; коммиты не делать; работать по одной задаче -- ADR пока покрывают архитектуру, новые ADR нужны только если по ходу работ изменится решение +- Источники требований: `docs/master.md`, `meetings/meeting_1_document.md`, `README.md`, `docs/*` +- Базовый template уже готов: typed config, DI container, observability, FastAPI adapter и versioned API под `/api/v1` +- Текущая цель: минимальное управление Docker sandbox без auth +- MVP API: `POST /api/v1/create` +- Sandbox policy: TTL `300` секунд, одна активная sandbox на `chat_id`, повторный `create` переиспользует активную сессию +- Volume policy: chat volume `rw`, dependencies volume `ro`, lambda-tools volume `ro` +- Host paths вычисляются из typed config, а HTTP request передает только `chat_id` +- Cleanup выполняется периодическим in-process loop внутри master-service + +## Вне текущего scope + +- auth и access control +- p2p lease и WebSocket transport +- workspace/chat CRUD API +- chat files, artifacts, S3, quota и retention policy +- центральная БД и multi-node orchestration ## Правила выполнения -- Каждую задачу выполнять отдельным заходом, без параллельной реализации -- Каждый субагент отдает diff, список измененных файлов и проверку, но не делает commit -- Если в задаче всплывает архитектурное изменение, остановиться и вынести вопрос на согласование +- Выполняем по одной задаче +- 
Коммиты не делаем +- Если по ходу нужна смена архитектуры, останавливаемся и согласуем решение +- `domain/` и `usecase/` не импортируют Docker, FastAPI, OpenTelemetry, env или YAML +- Inner layers работают только через минимальные domain сущности и usecase порты ## Очередь задач -### T01. Базовый каркас домена и usecase +### M01. ADR и минимальный sandbox scaffolding -- Исполнитель: `primary-agent` (scaffolding) -- Статус: completed +- Исполнитель: `primary-agent` +- Статус: pending - Зависимости: нет - Commit required: no -- Scope: создать базовые файлы и контракты в `domain/`, `usecase/`, `repository/` -- Файлы: `domain/error.py`, `domain/user.py`, `usecase/interface.py`, `usecase/user.py`, `repository/user.py` -- Критерии приемки: зависимости направлены внутрь; в `domain/` и `usecase/` нет FastAPI/OTel; есть пример сущности, ошибок, портов и простого usecase +- Scope: зафиксировать MVP-решение в ADR и создать минимальные сущности, ошибки и usecase-контракты для sandbox orchestration +- Файлы: `docs/006-mvp-docker-sandbox-orchestration.md`, `domain/sandbox.py`, `domain/error.py`, `usecase/interface.py`, `usecase/sandbox.py` +- Критерии приемки: в `domain/` есть минимальная `SandboxSession` и sandbox-ошибки; в `usecase/` есть порты `SandboxSessionRepository`, `SandboxRuntime` и `Clock`; созданы скелеты `CreateSandbox` и `CleanupExpiredSandboxes`; ADR занимает 10-20 строк -### T02. Конфиг из YAML и env +### M02. 
Typed config для sandbox runtime - Субагент: `feature-developer` -- Статус: completed -- Зависимости: `T01` +- Статус: pending +- Зависимости: `M01` - Commit required: no -- Scope: собрать typed-config слой в `adapter/config/` и подготовить базовые yaml-файлы -- Файлы: `adapter/config/*`, `config/app.yaml` -- Критерии приемки: конфиг собирается в одну dataclass-структуру; секреты читаются из env; парсинг и валидация не протекают в inner layers +- Scope: расширить typed-config слоем `sandbox` с настройками image, TTL, cleanup interval, host paths и container mount paths +- Файлы: `adapter/config/model.py`, `adapter/config/loader.py`, `config/app.yaml` +- Решение: chat host path строится как путь под общим `sandbox.chats_root/`; request не передает host path напрямую +- Критерии приемки: конфиг собирается в typed dataclass-дерево; дефолтный TTL равен `300`; есть отдельные настройки для `chats_root`, `dependencies_host_path`, `lambda_tools_host_path`, `chat_mount_path`, `dependencies_mount_path`, `lambda_tools_mount_path`, `cleanup_interval_seconds`; inner layers не читают env -### T03. Observability порты и OTel adapter +### M03. 
Docker runtime adapter для sandbox lifecycle - Субагент: `feature-developer` -- Статус: completed -- Зависимости: `T01`, `T02` +- Статус: pending +- Зависимости: `M01`, `M02` - Commit required: no -- Scope: реализовать логгер, метрики, трейсинг и bootstrap OTel в `adapter/otel/` через интерфейсы из `usecase/interface.py` -- Файлы: `adapter/otel/*`, `config/otel-collector.yaml` -- Критерии приемки: inner layers знают только интерфейсы; OTLP exporter настраивается из конфига; нет кастомного trace middleware +- Scope: реализовать outer adapter над Docker для создания и остановки sandbox контейнера с нужными labels и mount policy +- Файлы: `adapter/docker/runtime.py`, `adapter/docker/__init__.py` +- Ограничения: все Docker-детали остаются в `adapter/`; runtime не должен протекать во внутренние слои +- Критерии приемки: runtime умеет создать sandbox container по входным параметрам usecase; chat volume монтируется как `rw`; dependency и lambda-tools volumes монтируются как `ro`; контейнер получает labels с `session_id`, `chat_id` и `expires_at`; runtime переводит ошибки Docker в понятные исключения адаптера -### T04. Composition root и lifetime singleton-объектов +### M04. 
In-memory session repository и usecase `CreateSandbox` - Субагент: `feature-developer` -- Статус: completed -- Зависимости: `T01`, `T02`, `T03` +- Статус: pending +- Зависимости: `M01`, `M02`, `M03` - Commit required: no -- Scope: собрать контейнер и startup wiring в `adapter/di/` -- Файлы: `adapter/di/container.py`, `adapter/di/__init__.py` -- Критерии приемки: repository/usecase создаются один раз на старте; контейнер хранит инстансы явно; нет пересоздания на HTTP-запрос +- Scope: реализовать in-memory registry активных sandbox-сессий и usecase создания sandbox с логикой reuse по `chat_id` +- Файлы: `repository/sandbox_session.py`, `usecase/sandbox.py`, `adapter/di/container.py` +- Решение: если по `chat_id` есть неистекшая сессия, usecase возвращает ее без нового container start; если сессия истекла, usecase инициирует stop старой sandbox и создает новую +- Критерии приемки: одна активная sandbox на `chat_id`; TTL-логика использует порт `Clock`; usecase не импортирует Docker; container wiring остается singleton-based -### T05. FastAPI adapter как заменяемый web layer +### M05. 
Cleanup expired sandboxes и lifecycle wiring - Субагент: `feature-developer` -- Статус: completed -- Зависимости: `T04` +- Статус: pending +- Зависимости: `M04` - Commit required: no -- Scope: поднять HTTP adapter в `adapter/http/fastapi/` с app factory, lifespan, dependencies, middleware и router ` /api/v1` -- Файлы: `adapter/http/fastapi/*`, `main.py` -- Критерии приемки: FastAPI изолирован в adapter-слое; handlers тонкие; request logging и metrics middleware подключены; usecase/repository берутся из контейнера +- Scope: реализовать usecase cleanup просроченных sandbox и подключить периодический cleanup loop в FastAPI lifecycle +- Файлы: `usecase/sandbox.py`, `adapter/di/container.py`, `adapter/http/fastapi/app.py`, при необходимости `adapter/http/fastapi/dependencies.py` +- Ограничения: не ломать ADR про раннее OTel instrumentation; lifecycle loop должен стартовать и останавливаться один раз +- Критерии приемки: cleanup находит истекшие сессии, останавливает sandbox через runtime и удаляет их из registry; интервал cleanup берется из конфига; shutdown корректно завершает фоновую задачу -### T06. Локальный runtime и compose-окружение +### M06. 
HTTP endpoint `POST /api/v1/create` - Субагент: `feature-developer` -- Статус: completed -- Зависимости: `T02`, `T03`, `T05` +- Статус: pending +- Зависимости: `M04` - Commit required: no -- Scope: добавить контейнерный runtime для сервиса и compose-окружение с OTel и UI для просмотра логов, метрик и трейсов -- Файлы: `Dockerfile`, `docker-compose.yml` -- Ограничения: не трогать репозиторный `config/app.yaml`; docker должен прокидывать свой runtime-config внутрь контейнера; Dockerfile только с двумя стадиями `build` и `run` -- Критерии приемки: `make compose-build` и `make compose-up` опираются на существующие файлы; сервис поднимается в контейнере; OTel telemetry уходит в dockerized stack; есть UI для просмотра логов, метрик и трейсов; для локального docker-окружения достаточно только `Dockerfile` и `docker-compose.yml` +- Scope: добавить минимальную HTTP ручку для создания или переиспользования sandbox без auth +- Файлы: `adapter/http/fastapi/schemas.py`, `adapter/http/fastapi/dependencies.py`, `adapter/http/fastapi/routers/v1/router.py`, при необходимости `adapter/di/container.py` +- Request: `{ "chat_id": "..." }` +- Response: `session_id`, `chat_id`, `container_id`, `status`, `expires_at` +- Критерии приемки: router остается тонким; handler только переводит HTTP input в команду usecase и маппит ошибки в HTTP; endpoint живет под `/api/v1/create`; auth не добавляется -### T07. Тесты на lifetimes, config и HTTP smoke +### M07. 
Тесты для create, reuse, TTL и mount policy - Субагент: `test-engineer` - Статус: pending -- Зависимости: `T01`, `T02`, `T03`, `T04`, `T05`, `T06` +- Зависимости: `M03`, `M04`, `M05`, `M06` - Commit required: no -- Scope: покрыть тестами ключевые архитектурные гарантии +- Scope: покрыть тестами ключевое поведение MVP без запуска реального production Docker stack - Файлы: `test/*` -- Критерии приемки: есть тест на singleton lifetime для repository/usecase; есть тест merge YAML+env; есть smoke-тест для ` /api/v1`; тесты не тянут FastAPI/OTel в inner layers +- Критерии приемки: есть unit-тесты для `CreateSandbox` и `CleanupExpiredSandboxes` с fake clock; есть HTTP smoke-тест для `POST /api/v1/create`; есть adapter-level тест с mock Docker client на mount policy `chat=rw`, `deps=ro`, `tools=ro`; тесты не тащат FastAPI или Docker в inner-layer тесты -### T08. Архитектурный и boundary review +### M08. Архитектурный и boundary review по MVP sandbox - Субагент: `code-reviewer` - Статус: pending -- Зависимости: `T07` +- Зависимости: `M07` - Commit required: no -- Scope: проверить импорты, соблюдение слоев, startup lifetimes и заменяемость web adapter +- Scope: проверить соблюдение clean architecture, dependency direction и соответствие MVP-ограничениям - Файлы: весь измененный код -- Критерии приемки: dependency direction не нарушен; FastAPI и OTel не протекают в `domain/` и `usecase/`; замечания сформулированы как точечные правки или подтверждение готовности к review - -### T09. 
Конфигурируемый runtime observability - -- Субагент: `feature-developer` -- Статус: completed -- Зависимости: `T03`, `T04`, `T05` -- Commit required: no -- Scope: сделать конфигурируемый runtime observability с настраиваемым sink и форматом логов, плюс отдельными флагами для метрик и трейсов -- Файлы: `adapter/config/*`, `adapter/observability/*`, `adapter/otel/*`, `adapter/di/*`, `adapter/http/fastapi/*`, `config/app.yaml`, при необходимости `main.py` -- Конфиг: `logging.output=stdout|file|otel`, `logging.format=text|json`, `logging.file_path=...`, `metrics.enabled=true|false`, `tracing.enabled=true|false` -- Решение: вынести выбор runtime в отдельный adapter-layer factory; `domain/` и `usecase/` не менять; для `stdout` и `file` поддержать оба формата `text` и `json`; `logging.file_path` использовать только при `logging.output=file`; при отключенных метриках и трейсах использовать `Noop`-реализации; OTel runtime поднимать только если нужен хотя бы для одного сигнала -- Критерии приемки: при `logging.output=stdout` логи идут в stdout в формате `text` или `json` по конфигу; при `logging.output=file` логи пишутся в файл по пути `logging.file_path` в формате `text` или `json`; при `logging.output=otel` логи уходят в collector; `metrics.enabled=false` отключает метрики и metrics middleware; `tracing.enabled=false` отключает FastAPI instrumentation и tracer runtime; DI продолжает отдавать единый runtime через контейнер; внутренние слои по-прежнему знают только порты - -### T10. ADR: раннее подключение OTel к FastAPI - -- Исполнитель: `primary-agent` (docs) -- Статус: completed -- Зависимости: нет -- Commit required: no -- Scope: зафиксировать правило, что FastAPI OTel instrumentation выполняется до первой сборки `middleware_stack` -- Файлы: `docs/005-fastapi-otel-early-instrumentation.md` -- Критерии приемки: ADR занимает 10-20 строк; описаны context, decision, consequences; решение не переписывает историю прошлых ADR - -### T11. 
Перенос FastAPI OTel bootstrap в app factory - -- Субагент: `feature-developer` -- Статус: completed -- Зависимости: `T10` -- Commit required: no -- Scope: перенести создание container, установку `FastAPIInstrumentor.instrument_app(...)` из `lifespan` в `create_app`, оставив в `lifespan` только shutdown -- Файлы: `adapter/http/fastapi/app.py`, `adapter/http/fastapi/lifespan.py`, при необходимости `adapter/http/fastapi/dependencies.py` -- Ограничения: не использовать ручной rebuild `app.middleware_stack`; не менять `domain/` и `usecase/`; не добавлять бизнес-логику; сохранить singleton-lifetime container -- Критерии приемки: instrumentation происходит до первой сборки middleware stack; `OpenTelemetryMiddleware` попадает в runtime stack без workaround; shutdown закрывает instrumentation и runtime один раз; compose-конфиг продолжает работать - -### T12. Регрессионная проверка HTTP telemetry wiring - -- Субагент: `test-engineer` -- Статус: pending -- Зависимости: `T11` -- Commit required: no -- Scope: добавить проверку, что раннее instrumentation wiring сохраняется и не требует ручного rebuild middleware stack -- Файлы: `test/*` -- Ограничения: без реального collector; проверять через ASGI/lifespan или локальные assertions по app runtime; не тянуть FastAPI и OTel в inner-layer тесты -- Критерии приемки: тест подтверждает, что при включенных metrics/tracing `OpenTelemetryMiddleware` присутствует в runtime stack; тест не зависит от внешнего OTel collector; существующие архитектурные границы не нарушены +- Критерии приемки: Docker остается только во внешнем adapter; FastAPI не протекает в `domain/` и `usecase/`; TTL и mount policy читаются как явные, тестируемые правила; замечания сформулированы как точечные правки или подтверждение готовности From 7b3f82e80596da5ac559516115f4c6135b54c420 Mon Sep 17 00:00:00 2001 From: Azamat Date: Thu, 2 Apr 2026 12:04:47 +0300 Subject: [PATCH 02/30] ref #3: [feat] add context and tasks for master-service --- Makefile | 2 +- 
adapter/http/fastapi/app.py | 2 +- adapter/http/fastapi/dependencies.py | 3 +- adapter/http/fastapi/middleware.py | 3 +- docs/006-mvp-docker-sandbox-orchestration.md | 19 ++++++++ domain/error.py | 16 +++++++ domain/sandbox.py | 21 +++++++++ tasks.md | 2 +- usecase/interface.py | 28 ++++++++++++ usecase/sandbox.py | 46 ++++++++++++++++++++ 10 files changed, 137 insertions(+), 5 deletions(-) create mode 100644 docs/006-mvp-docker-sandbox-orchestration.md create mode 100644 domain/sandbox.py create mode 100644 usecase/sandbox.py diff --git a/Makefile b/Makefile index 1c6eaf3..ed72fd7 100644 --- a/Makefile +++ b/Makefile @@ -31,7 +31,7 @@ test: uv run pytest -v lint: - uv run ruff check . + uv run ruff check . --fix typecheck: uv run mypy . diff --git a/adapter/http/fastapi/app.py b/adapter/http/fastapi/app.py index c23d0f9..06ac839 100644 --- a/adapter/http/fastapi/app.py +++ b/adapter/http/fastapi/app.py @@ -1,5 +1,6 @@ from collections.abc import Callable +from fastapi import FastAPI from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor from adapter.config.loader import load_config @@ -8,7 +9,6 @@ from adapter.di.container import AppContainer, build_container from adapter.http.fastapi.dependencies import APP_CONFIG_STATE, APP_CONTAINER_STATE from adapter.http.fastapi.middleware import register_middleware from adapter.http.fastapi.routers.v1.router import router as v1_router -from fastapi import FastAPI API_V1_PREFIX = '/api/v1' diff --git a/adapter/http/fastapi/dependencies.py b/adapter/http/fastapi/dependencies.py index 5afba58..4892459 100644 --- a/adapter/http/fastapi/dependencies.py +++ b/adapter/http/fastapi/dependencies.py @@ -1,7 +1,8 @@ from typing import cast -from adapter.di.container import AppContainer from fastapi import Depends, Request + +from adapter.di.container import AppContainer from usecase.user import GetUser APP_CONTAINER_STATE = 'container' diff --git a/adapter/http/fastapi/middleware.py b/adapter/http/fastapi/middleware.py 
index 83d277e..598e991 100644 --- a/adapter/http/fastapi/middleware.py +++ b/adapter/http/fastapi/middleware.py @@ -1,8 +1,9 @@ from time import perf_counter +from fastapi import FastAPI, Request, Response + from adapter.config.model import AppConfig from adapter.http.fastapi.dependencies import get_container -from fastapi import FastAPI, Request, Response def register_middleware(app: FastAPI, config: AppConfig) -> None: diff --git a/docs/006-mvp-docker-sandbox-orchestration.md b/docs/006-mvp-docker-sandbox-orchestration.md new file mode 100644 index 0000000..ef404c3 --- /dev/null +++ b/docs/006-mvp-docker-sandbox-orchestration.md @@ -0,0 +1,19 @@ +# 006 MVP Docker Sandbox Orchestration + +Context +- The service needs a first MVP for sandbox orchestration behind `/api/v1/create`. +- The first version must stay small, avoid auth, and preserve clean architecture boundaries. + +Decision +- Use Docker as the outer runtime adapter for sandbox start and stop operations. +- Keep sandbox entities and errors in `domain/` and orchestration ports in `usecase/`. +- Use an in-memory session repository for the MVP instead of a central database. +- Keep one active sandbox per `chat_id` and reuse it until TTL expiry. +- Set default sandbox TTL to 300 seconds. +- Mount chat storage as `rw`, dependencies as `ro`, and lambda-tools as `ro`. +- Run expired sandbox cleanup as an in-process background loop in the HTTP app lifecycle. + +Consequences +- Inner layers stay free from Docker and FastAPI details. +- The MVP is single-instance oriented and not yet suitable for multi-node coordination. +- Repository and runtime can be replaced later without changing usecase contracts. 
diff --git a/domain/error.py b/domain/error.py index 1179f43..f691113 100644 --- a/domain/error.py +++ b/domain/error.py @@ -16,3 +16,19 @@ class UserConflictError(UserError): def __init__(self, email: str) -> None: super().__init__('user_conflict') self.email = email + + +class SandboxError(DomainError): + pass + + +class SandboxStartError(SandboxError): + def __init__(self, chat_id: str) -> None: + super().__init__('sandbox_start_failed') + self.chat_id = chat_id + + +class SandboxAlreadyRunningError(SandboxError): + def __init__(self, chat_id: str) -> None: + super().__init__('sandbox_already_running') + self.chat_id = chat_id diff --git a/domain/sandbox.py b/domain/sandbox.py new file mode 100644 index 0000000..110b4e4 --- /dev/null +++ b/domain/sandbox.py @@ -0,0 +1,21 @@ +from dataclasses import dataclass +from datetime import datetime +from enum import Enum + + +class SandboxStatus(str, Enum): + STARTING = 'starting' + RUNNING = 'running' + STOPPING = 'stopping' + STOPPED = 'stopped' + FAILED = 'failed' + + +@dataclass(frozen=True, slots=True) +class SandboxSession: + session_id: str + chat_id: str + container_id: str + status: SandboxStatus + created_at: datetime + expires_at: datetime diff --git a/tasks.md b/tasks.md index c6ed917..98d8426 100644 --- a/tasks.md +++ b/tasks.md @@ -32,7 +32,7 @@ ### M01. 
ADR и минимальный sandbox scaffolding - Исполнитель: `primary-agent` -- Статус: pending +- Статус: completed - Зависимости: нет - Commit required: no - Scope: зафиксировать MVP-решение в ADR и создать минимальные сущности, ошибки и usecase-контракты для sandbox orchestration diff --git a/usecase/interface.py b/usecase/interface.py index 89811a8..fcc1fe6 100644 --- a/usecase/interface.py +++ b/usecase/interface.py @@ -1,7 +1,9 @@ from collections.abc import Mapping +from datetime import datetime from types import TracebackType from typing import Protocol, TypeAlias +from domain.sandbox import SandboxSession from domain.user import User AttrValue: TypeAlias = str | int | float | bool @@ -16,6 +18,32 @@ class UserRepository(Protocol): def save(self, user: User) -> None: ... +class SandboxSessionRepository(Protocol): + def get_active_by_chat_id(self, chat_id: str) -> SandboxSession | None: ... + + def list_expired(self, now: datetime) -> list[SandboxSession]: ... + + def save(self, session: SandboxSession) -> None: ... + + def delete(self, session_id: str) -> None: ... + + +class SandboxRuntime(Protocol): + def create( + self, + *, + session_id: str, + chat_id: str, + expires_at: datetime, + ) -> SandboxSession: ... + + def stop(self, container_id: str) -> None: ... + + +class Clock(Protocol): + def now(self) -> datetime: ... + + class Logger(Protocol): def debug(self, message: str, attrs: Attrs | None = None) -> None: ... 
diff --git a/usecase/sandbox.py b/usecase/sandbox.py new file mode 100644 index 0000000..0c34422 --- /dev/null +++ b/usecase/sandbox.py @@ -0,0 +1,46 @@ +from dataclasses import dataclass +from datetime import timedelta + +from domain.sandbox import SandboxSession +from usecase.interface import Clock, Logger, SandboxRuntime, SandboxSessionRepository + + +@dataclass(frozen=True, slots=True) +class CreateSandboxCommand: + chat_id: str + + +class CreateSandbox: + def __init__( + self, + repository: SandboxSessionRepository, + runtime: SandboxRuntime, + clock: Clock, + logger: Logger, + ttl: timedelta, + ) -> None: + self._repository = repository + self._runtime = runtime + self._clock = clock + self._logger = logger + self._ttl = ttl + + def execute(self, command: CreateSandboxCommand) -> SandboxSession: + raise NotImplementedError + + +class CleanupExpiredSandboxes: + def __init__( + self, + repository: SandboxSessionRepository, + runtime: SandboxRuntime, + clock: Clock, + logger: Logger, + ) -> None: + self._repository = repository + self._runtime = runtime + self._clock = clock + self._logger = logger + + def execute(self) -> list[SandboxSession]: + raise NotImplementedError From 3448266c1d20a43d14422fc48fa587e311e91fc5 Mon Sep 17 00:00:00 2001 From: Azamat Date: Thu, 2 Apr 2026 12:20:16 +0300 Subject: [PATCH 03/30] close #4: [feat] add config --- adapter/config/loader.py | 74 ++++++++++++++++++++++++++++++++++++++++ adapter/config/model.py | 14 ++++++++ config/app.yaml | 11 ++++++ tasks.md | 2 +- 4 files changed, 100 insertions(+), 1 deletion(-) diff --git a/adapter/config/loader.py b/adapter/config/loader.py index 3953923..0cb4b6b 100644 --- a/adapter/config/loader.py +++ b/adapter/config/loader.py @@ -12,6 +12,7 @@ from .model import ( LoggingConfig, MetricsConfig, OtelConfig, + SandboxConfig, SecurityConfig, TracingConfig, ) @@ -38,6 +39,7 @@ def load_config( logging_section = _section(yaml_data, 'logging') metrics_section = _section(yaml_data, 'metrics') 
tracing_section = _section(yaml_data, 'tracing') + sandbox_section = _section(yaml_data, 'sandbox') security_section = _section(yaml_data, 'security') logging_output = _yaml_or_env_choice( @@ -128,6 +130,7 @@ def load_config( enable_metrics=metrics_enabled, enable_tracing=tracing_enabled, ), + sandbox=_load_sandbox_config(sandbox_section, env_values), security=SecurityConfig( token_header=_yaml_or_env_str( security_section, @@ -221,6 +224,77 @@ def _optional_section(data: Mapping[str, object], name: str) -> dict[str, object return section +def _load_sandbox_config( + section: Mapping[str, object], + env: Mapping[str, str], +) -> SandboxConfig: + return SandboxConfig( + image=_yaml_or_env_str( + section, + 'image', + 'sandbox.image', + env, + 'APP_SANDBOX_IMAGE', + ), + ttl_seconds=_yaml_or_env_int( + section, + 'ttl_seconds', + 'sandbox.ttl_seconds', + env, + 'APP_SANDBOX_TTL_SECONDS', + ), + cleanup_interval_seconds=_yaml_or_env_int( + section, + 'cleanup_interval_seconds', + 'sandbox.cleanup_interval_seconds', + env, + 'APP_SANDBOX_CLEANUP_INTERVAL_SECONDS', + ), + chats_root=_yaml_or_env_str( + section, + 'chats_root', + 'sandbox.chats_root', + env, + 'APP_SANDBOX_CHATS_ROOT', + ), + dependencies_host_path=_yaml_or_env_str( + section, + 'dependencies_host_path', + 'sandbox.dependencies_host_path', + env, + 'APP_SANDBOX_DEPENDENCIES_HOST_PATH', + ), + lambda_tools_host_path=_yaml_or_env_str( + section, + 'lambda_tools_host_path', + 'sandbox.lambda_tools_host_path', + env, + 'APP_SANDBOX_LAMBDA_TOOLS_HOST_PATH', + ), + chat_mount_path=_yaml_or_env_str( + section, + 'chat_mount_path', + 'sandbox.chat_mount_path', + env, + 'APP_SANDBOX_CHAT_MOUNT_PATH', + ), + dependencies_mount_path=_yaml_or_env_str( + section, + 'dependencies_mount_path', + 'sandbox.dependencies_mount_path', + env, + 'APP_SANDBOX_DEPENDENCIES_MOUNT_PATH', + ), + lambda_tools_mount_path=_yaml_or_env_str( + section, + 'lambda_tools_mount_path', + 'sandbox.lambda_tools_mount_path', + env, + 
'APP_SANDBOX_LAMBDA_TOOLS_MOUNT_PATH', + ), + ) + + def _load_otel_config( data: Mapping[str, object], env: Mapping[str, str], diff --git a/adapter/config/model.py b/adapter/config/model.py index 2e7d74e..ca18347 100644 --- a/adapter/config/model.py +++ b/adapter/config/model.py @@ -40,6 +40,19 @@ class OtelConfig: metric_export_interval: int +@dataclass(frozen=True, slots=True) +class SandboxConfig: + image: str + ttl_seconds: int + cleanup_interval_seconds: int + chats_root: str + dependencies_host_path: str + lambda_tools_host_path: str + chat_mount_path: str + dependencies_mount_path: str + lambda_tools_mount_path: str + + @dataclass(frozen=True, slots=True) class SecurityConfig: token_header: str @@ -55,4 +68,5 @@ class AppConfig: metrics: MetricsConfig tracing: TracingConfig otel: OtelConfig + sandbox: SandboxConfig security: SecurityConfig diff --git a/config/app.yaml b/config/app.yaml index 7aa81f4..2de4c27 100644 --- a/config/app.yaml +++ b/config/app.yaml @@ -24,5 +24,16 @@ otel: traces_endpoint: http://localhost:4318/v1/traces metric_export_interval: 1000 +sandbox: + image: ai-agent:latest + ttl_seconds: 300 + cleanup_interval_seconds: 60 + chats_root: var/sandbox/chats + dependencies_host_path: var/sandbox/dependencies + lambda_tools_host_path: var/sandbox/lambda-tools + chat_mount_path: /workspace/chat + dependencies_mount_path: /opt/dependencies + lambda_tools_mount_path: /opt/lambda-tools + security: token_header: X-API-Token diff --git a/tasks.md b/tasks.md index 98d8426..1eb618d 100644 --- a/tasks.md +++ b/tasks.md @@ -42,7 +42,7 @@ ### M02. 
Typed config для sandbox runtime - Субагент: `feature-developer` -- Статус: pending +- Статус: completed - Зависимости: `M01` - Commit required: no - Scope: расширить typed-config слоем `sandbox` с настройками image, TTL, cleanup interval, host paths и container mount paths From 87c789b7fe217b0cce11e2376b24c1f959c408bd Mon Sep 17 00:00:00 2001 From: Azamat Date: Thu, 2 Apr 2026 12:55:38 +0300 Subject: [PATCH 04/30] ref #5: [feat] add docker impl --- AGENTS.md | 3 + adapter/docker/__init__.py | 0 adapter/docker/runtime.py | 134 ++++++++++++++++++++++++++++ pyproject.toml | 2 + tasks.md | 2 +- uv.lock | 176 +++++++++++++++++++++++++++++++++++++ 6 files changed, 316 insertions(+), 1 deletion(-) create mode 100644 adapter/docker/__init__.py create mode 100644 adapter/docker/runtime.py diff --git a/AGENTS.md b/AGENTS.md index 03787a2..ba5bc34 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -83,3 +83,6 @@ - Prefer explicit wiring over magic - Do not expand scope without user approval - Do not `from __future__ import annotations` +- Do not `from importlib import import_module` +- Do not `importlib` +- Do not `cast` diff --git a/adapter/docker/__init__.py b/adapter/docker/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/adapter/docker/runtime.py b/adapter/docker/runtime.py new file mode 100644 index 0000000..89df9ab --- /dev/null +++ b/adapter/docker/runtime.py @@ -0,0 +1,134 @@ +from collections.abc import Callable +from datetime import datetime +from pathlib import Path + +from docker import DockerClient +from docker.errors import DockerException, NotFound +from docker.types import Mount + +from adapter.config.model import SandboxConfig +from domain.error import SandboxError, SandboxStartError +from domain.sandbox import SandboxSession, SandboxStatus +from usecase.interface import SandboxRuntime + +type NowFactory = Callable[[datetime], datetime] + + +class DockerSandboxRuntime(SandboxRuntime): + def __init__( + self, + config: SandboxConfig, + client: 
DockerClient, + now: NowFactory | None = None, + ) -> None: + self._config = config + self._client = client + self._now = _current_time if now is None else now + + def create( + self, + *, + session_id: str, + chat_id: str, + expires_at: datetime, + ) -> SandboxSession: + try: + chat_path = self._chat_path(chat_id) + dependencies_path = self._readonly_host_path( + self._config.dependencies_host_path + ) + lambda_tools_path = self._readonly_host_path( + self._config.lambda_tools_host_path + ) + chat_path.mkdir(parents=True, exist_ok=True) + container = self._client.containers.run( + self._config.image, + detach=True, + labels=self._labels(session_id, chat_id, expires_at), + mounts=self._mounts(chat_path, dependencies_path, lambda_tools_path), + ) + except (DockerException, OSError, ValueError) as exc: + raise SandboxStartError(chat_id) from exc + + container_id = str(getattr(container, 'id', '')).strip() + if not container_id: + raise SandboxStartError(chat_id) + + return SandboxSession( + session_id=session_id, + chat_id=chat_id, + container_id=container_id, + status=SandboxStatus.RUNNING, + created_at=self._now(expires_at), + expires_at=expires_at, + ) + + def stop(self, container_id: str) -> None: + try: + container = self._client.containers.get(container_id) + container.stop() + except NotFound: + return + except DockerException as exc: + raise SandboxError('sandbox_stop_failed') from exc + + def _labels( + self, + session_id: str, + chat_id: str, + expires_at: datetime, + ) -> dict[str, str]: + return { + 'session_id': session_id, + 'chat_id': chat_id, + 'expires_at': expires_at.isoformat(), + } + + def _mounts( + self, + chat_path: Path, + dependencies_path: Path, + lambda_tools_path: Path, + ) -> list[Mount]: + return [ + Mount( + target=self._config.chat_mount_path, + source=str(chat_path), + type='bind', + ), + Mount( + target=self._config.dependencies_mount_path, + source=str(dependencies_path), + type='bind', + read_only=True, + ), + Mount( + 
target=self._config.lambda_tools_mount_path, + source=str(lambda_tools_path), + type='bind', + read_only=True, + ), + ] + + def _chat_path(self, chat_id: str) -> Path: + if not chat_id.strip(): + raise ValueError('invalid chat path') + + chats_root = self._host_path(self._config.chats_root) + chat_path = (chats_root / chat_id).resolve(strict=False) + if not chat_path.is_relative_to(chats_root): + raise ValueError('invalid chat path') + return chat_path + + def _readonly_host_path(self, path_value: str) -> Path: + host_path = self._host_path(path_value) + if not host_path.exists(): + raise ValueError('invalid host path') + return host_path + + def _host_path(self, path_value: str) -> Path: + return Path(path_value).expanduser().resolve(strict=False) + + +def _current_time(expires_at: datetime) -> datetime: + return datetime.now(tz=expires_at.tzinfo) diff --git a/pyproject.toml b/pyproject.toml index b72a3e7..2f15e30 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -5,6 +5,7 @@ description = "" readme = "README.md" requires-python = ">=3.13" dependencies = [ + "docker>=7.1.0", "fastapi>=0.116.1", "opentelemetry-api>=1.31.1", "opentelemetry-exporter-otlp-proto-http>=1.31.1", @@ -21,6 +22,7 @@ dev = [ "mypy>=1.18.2", "pytest>=8.4.2", "ruff>=0.13.1", + "types-docker>=7.1.0.20260402", "types-pyyaml>=6.0.12.20250915", ] diff --git a/tasks.md b/tasks.md index 1eb618d..73c9b2e 100644 --- a/tasks.md +++ b/tasks.md @@ -53,7 +53,7 @@ ### M03. 
Docker runtime adapter для sandbox lifecycle - Субагент: `feature-developer` -- Статус: pending +- Статус: completed - Зависимости: `M01`, `M02` - Commit required: no - Scope: реализовать outer adapter над Docker для создания и остановки sandbox контейнера с нужными labels и mount policy diff --git a/uv.lock b/uv.lock index 9ce78f6..9819d68 100644 --- a/uv.lock +++ b/uv.lock @@ -50,6 +50,51 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/9a/3c/c17fb3ca2d9c3acff52e30b309f538586f9f5b9c9cf454f3845fc9af4881/certifi-2026.2.25-py3-none-any.whl", hash = "sha256:027692e4402ad994f1c42e52a4997a9763c646b73e4096e4d5d6db8af1d6f0fa", size = 153684, upload-time = "2026-02-25T02:54:15.766Z" }, ] +[[package]] +name = "cffi" +version = "2.0.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pycparser", marker = "implementation_name != 'PyPy'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/eb/56/b1ba7935a17738ae8453301356628e8147c79dbb825bcbc73dc7401f9846/cffi-2.0.0.tar.gz", hash = "sha256:44d1b5909021139fe36001ae048dbdde8214afa20200eda0f64c068cac5d5529", size = 523588, upload-time = "2025-09-08T23:24:04.541Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/4b/8d/a0a47a0c9e413a658623d014e91e74a50cdd2c423f7ccfd44086ef767f90/cffi-2.0.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:00bdf7acc5f795150faa6957054fbbca2439db2f775ce831222b66f192f03beb", size = 185230, upload-time = "2025-09-08T23:23:00.879Z" }, + { url = "https://files.pythonhosted.org/packages/4a/d2/a6c0296814556c68ee32009d9c2ad4f85f2707cdecfd7727951ec228005d/cffi-2.0.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:45d5e886156860dc35862657e1494b9bae8dfa63bf56796f2fb56e1679fc0bca", size = 181043, upload-time = "2025-09-08T23:23:02.231Z" }, + { url = 
"https://files.pythonhosted.org/packages/b0/1e/d22cc63332bd59b06481ceaac49d6c507598642e2230f201649058a7e704/cffi-2.0.0-cp313-cp313-manylinux1_i686.manylinux2014_i686.manylinux_2_17_i686.manylinux_2_5_i686.whl", hash = "sha256:07b271772c100085dd28b74fa0cd81c8fb1a3ba18b21e03d7c27f3436a10606b", size = 212446, upload-time = "2025-09-08T23:23:03.472Z" }, + { url = "https://files.pythonhosted.org/packages/a9/f5/a2c23eb03b61a0b8747f211eb716446c826ad66818ddc7810cc2cc19b3f2/cffi-2.0.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:d48a880098c96020b02d5a1f7d9251308510ce8858940e6fa99ece33f610838b", size = 220101, upload-time = "2025-09-08T23:23:04.792Z" }, + { url = "https://files.pythonhosted.org/packages/f2/7f/e6647792fc5850d634695bc0e6ab4111ae88e89981d35ac269956605feba/cffi-2.0.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:f93fd8e5c8c0a4aa1f424d6173f14a892044054871c771f8566e4008eaa359d2", size = 207948, upload-time = "2025-09-08T23:23:06.127Z" }, + { url = "https://files.pythonhosted.org/packages/cb/1e/a5a1bd6f1fb30f22573f76533de12a00bf274abcdc55c8edab639078abb6/cffi-2.0.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:dd4f05f54a52fb558f1ba9f528228066954fee3ebe629fc1660d874d040ae5a3", size = 206422, upload-time = "2025-09-08T23:23:07.753Z" }, + { url = "https://files.pythonhosted.org/packages/98/df/0a1755e750013a2081e863e7cd37e0cdd02664372c754e5560099eb7aa44/cffi-2.0.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:c8d3b5532fc71b7a77c09192b4a5a200ea992702734a2e9279a37f2478236f26", size = 219499, upload-time = "2025-09-08T23:23:09.648Z" }, + { url = "https://files.pythonhosted.org/packages/50/e1/a969e687fcf9ea58e6e2a928ad5e2dd88cc12f6f0ab477e9971f2309b57c/cffi-2.0.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:d9b29c1f0ae438d5ee9acb31cadee00a58c46cc9c0b2f9038c6b0b3470877a8c", size = 222928, upload-time = "2025-09-08T23:23:10.928Z" }, + { url = 
"https://files.pythonhosted.org/packages/36/54/0362578dd2c9e557a28ac77698ed67323ed5b9775ca9d3fe73fe191bb5d8/cffi-2.0.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6d50360be4546678fc1b79ffe7a66265e28667840010348dd69a314145807a1b", size = 221302, upload-time = "2025-09-08T23:23:12.42Z" }, + { url = "https://files.pythonhosted.org/packages/eb/6d/bf9bda840d5f1dfdbf0feca87fbdb64a918a69bca42cfa0ba7b137c48cb8/cffi-2.0.0-cp313-cp313-win32.whl", hash = "sha256:74a03b9698e198d47562765773b4a8309919089150a0bb17d829ad7b44b60d27", size = 172909, upload-time = "2025-09-08T23:23:14.32Z" }, + { url = "https://files.pythonhosted.org/packages/37/18/6519e1ee6f5a1e579e04b9ddb6f1676c17368a7aba48299c3759bbc3c8b3/cffi-2.0.0-cp313-cp313-win_amd64.whl", hash = "sha256:19f705ada2530c1167abacb171925dd886168931e0a7b78f5bffcae5c6b5be75", size = 183402, upload-time = "2025-09-08T23:23:15.535Z" }, + { url = "https://files.pythonhosted.org/packages/cb/0e/02ceeec9a7d6ee63bb596121c2c8e9b3a9e150936f4fbef6ca1943e6137c/cffi-2.0.0-cp313-cp313-win_arm64.whl", hash = "sha256:256f80b80ca3853f90c21b23ee78cd008713787b1b1e93eae9f3d6a7134abd91", size = 177780, upload-time = "2025-09-08T23:23:16.761Z" }, + { url = "https://files.pythonhosted.org/packages/92/c4/3ce07396253a83250ee98564f8d7e9789fab8e58858f35d07a9a2c78de9f/cffi-2.0.0-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:fc33c5141b55ed366cfaad382df24fe7dcbc686de5be719b207bb248e3053dc5", size = 185320, upload-time = "2025-09-08T23:23:18.087Z" }, + { url = "https://files.pythonhosted.org/packages/59/dd/27e9fa567a23931c838c6b02d0764611c62290062a6d4e8ff7863daf9730/cffi-2.0.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:c654de545946e0db659b3400168c9ad31b5d29593291482c43e3564effbcee13", size = 181487, upload-time = "2025-09-08T23:23:19.622Z" }, + { url = "https://files.pythonhosted.org/packages/d6/43/0e822876f87ea8a4ef95442c3d766a06a51fc5298823f884ef87aaad168c/cffi-2.0.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = 
"sha256:24b6f81f1983e6df8db3adc38562c83f7d4a0c36162885ec7f7b77c7dcbec97b", size = 220049, upload-time = "2025-09-08T23:23:20.853Z" }, + { url = "https://files.pythonhosted.org/packages/b4/89/76799151d9c2d2d1ead63c2429da9ea9d7aac304603de0c6e8764e6e8e70/cffi-2.0.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:12873ca6cb9b0f0d3a0da705d6086fe911591737a59f28b7936bdfed27c0d47c", size = 207793, upload-time = "2025-09-08T23:23:22.08Z" }, + { url = "https://files.pythonhosted.org/packages/bb/dd/3465b14bb9e24ee24cb88c9e3730f6de63111fffe513492bf8c808a3547e/cffi-2.0.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:d9b97165e8aed9272a6bb17c01e3cc5871a594a446ebedc996e2397a1c1ea8ef", size = 206300, upload-time = "2025-09-08T23:23:23.314Z" }, + { url = "https://files.pythonhosted.org/packages/47/d9/d83e293854571c877a92da46fdec39158f8d7e68da75bf73581225d28e90/cffi-2.0.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:afb8db5439b81cf9c9d0c80404b60c3cc9c3add93e114dcae767f1477cb53775", size = 219244, upload-time = "2025-09-08T23:23:24.541Z" }, + { url = "https://files.pythonhosted.org/packages/2b/0f/1f177e3683aead2bb00f7679a16451d302c436b5cbf2505f0ea8146ef59e/cffi-2.0.0-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:737fe7d37e1a1bffe70bd5754ea763a62a066dc5913ca57e957824b72a85e205", size = 222828, upload-time = "2025-09-08T23:23:26.143Z" }, + { url = "https://files.pythonhosted.org/packages/c6/0f/cafacebd4b040e3119dcb32fed8bdef8dfe94da653155f9d0b9dc660166e/cffi-2.0.0-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:38100abb9d1b1435bc4cc340bb4489635dc2f0da7456590877030c9b3d40b0c1", size = 220926, upload-time = "2025-09-08T23:23:27.873Z" }, + { url = "https://files.pythonhosted.org/packages/3e/aa/df335faa45b395396fcbc03de2dfcab242cd61a9900e914fe682a59170b1/cffi-2.0.0-cp314-cp314-win32.whl", hash = "sha256:087067fa8953339c723661eda6b54bc98c5625757ea62e95eb4898ad5e776e9f", size = 175328, 
upload-time = "2025-09-08T23:23:44.61Z" }, + { url = "https://files.pythonhosted.org/packages/bb/92/882c2d30831744296ce713f0feb4c1cd30f346ef747b530b5318715cc367/cffi-2.0.0-cp314-cp314-win_amd64.whl", hash = "sha256:203a48d1fb583fc7d78a4c6655692963b860a417c0528492a6bc21f1aaefab25", size = 185650, upload-time = "2025-09-08T23:23:45.848Z" }, + { url = "https://files.pythonhosted.org/packages/9f/2c/98ece204b9d35a7366b5b2c6539c350313ca13932143e79dc133ba757104/cffi-2.0.0-cp314-cp314-win_arm64.whl", hash = "sha256:dbd5c7a25a7cb98f5ca55d258b103a2054f859a46ae11aaf23134f9cc0d356ad", size = 180687, upload-time = "2025-09-08T23:23:47.105Z" }, + { url = "https://files.pythonhosted.org/packages/3e/61/c768e4d548bfa607abcda77423448df8c471f25dbe64fb2ef6d555eae006/cffi-2.0.0-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:9a67fc9e8eb39039280526379fb3a70023d77caec1852002b4da7e8b270c4dd9", size = 188773, upload-time = "2025-09-08T23:23:29.347Z" }, + { url = "https://files.pythonhosted.org/packages/2c/ea/5f76bce7cf6fcd0ab1a1058b5af899bfbef198bea4d5686da88471ea0336/cffi-2.0.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:7a66c7204d8869299919db4d5069a82f1561581af12b11b3c9f48c584eb8743d", size = 185013, upload-time = "2025-09-08T23:23:30.63Z" }, + { url = "https://files.pythonhosted.org/packages/be/b4/c56878d0d1755cf9caa54ba71e5d049479c52f9e4afc230f06822162ab2f/cffi-2.0.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7cc09976e8b56f8cebd752f7113ad07752461f48a58cbba644139015ac24954c", size = 221593, upload-time = "2025-09-08T23:23:31.91Z" }, + { url = "https://files.pythonhosted.org/packages/e0/0d/eb704606dfe8033e7128df5e90fee946bbcb64a04fcdaa97321309004000/cffi-2.0.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.whl", hash = "sha256:92b68146a71df78564e4ef48af17551a5ddd142e5190cdf2c5624d0c3ff5b2e8", size = 209354, upload-time = "2025-09-08T23:23:33.214Z" }, + { url = 
"https://files.pythonhosted.org/packages/d8/19/3c435d727b368ca475fb8742ab97c9cb13a0de600ce86f62eab7fa3eea60/cffi-2.0.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b1e74d11748e7e98e2f426ab176d4ed720a64412b6a15054378afdb71e0f37dc", size = 208480, upload-time = "2025-09-08T23:23:34.495Z" }, + { url = "https://files.pythonhosted.org/packages/d0/44/681604464ed9541673e486521497406fadcc15b5217c3e326b061696899a/cffi-2.0.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:28a3a209b96630bca57cce802da70c266eb08c6e97e5afd61a75611ee6c64592", size = 221584, upload-time = "2025-09-08T23:23:36.096Z" }, + { url = "https://files.pythonhosted.org/packages/25/8e/342a504ff018a2825d395d44d63a767dd8ebc927ebda557fecdaca3ac33a/cffi-2.0.0-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:7553fb2090d71822f02c629afe6042c299edf91ba1bf94951165613553984512", size = 224443, upload-time = "2025-09-08T23:23:37.328Z" }, + { url = "https://files.pythonhosted.org/packages/e1/5e/b666bacbbc60fbf415ba9988324a132c9a7a0448a9a8f125074671c0f2c3/cffi-2.0.0-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:6c6c373cfc5c83a975506110d17457138c8c63016b563cc9ed6e056a82f13ce4", size = 223437, upload-time = "2025-09-08T23:23:38.945Z" }, + { url = "https://files.pythonhosted.org/packages/a0/1d/ec1a60bd1a10daa292d3cd6bb0b359a81607154fb8165f3ec95fe003b85c/cffi-2.0.0-cp314-cp314t-win32.whl", hash = "sha256:1fc9ea04857caf665289b7a75923f2c6ed559b8298a1b8c49e59f7dd95c8481e", size = 180487, upload-time = "2025-09-08T23:23:40.423Z" }, + { url = "https://files.pythonhosted.org/packages/bf/41/4c1168c74fac325c0c8156f04b6749c8b6a8f405bbf91413ba088359f60d/cffi-2.0.0-cp314-cp314t-win_amd64.whl", hash = "sha256:d68b6cef7827e8641e8ef16f4494edda8b36104d79773a334beaa1e3521430f6", size = 191726, upload-time = "2025-09-08T23:23:41.742Z" }, + { url = 
"https://files.pythonhosted.org/packages/ae/3a/dbeec9d1ee0844c679f6bb5d6ad4e9f198b1224f4e7a32825f47f6192b0c/cffi-2.0.0-cp314-cp314t-win_arm64.whl", hash = "sha256:0a1527a803f0a659de1af2e1fd700213caba79377e27e4693648c2923da066f9", size = 184195, upload-time = "2025-09-08T23:23:43.004Z" }, +] + [[package]] name = "charset-normalizer" version = "3.4.6" @@ -128,6 +173,73 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/d1/d6/3965ed04c63042e047cb6a3e6ed1a63a35087b6a609aa3a15ed8ac56c221/colorama-0.4.6-py2.py3-none-any.whl", hash = "sha256:4f1d9991f5acc0ca119f9d443620b77f9d6b33703e51011c16baf57afb285fc6", size = 25335, upload-time = "2022-10-25T02:36:20.889Z" }, ] +[[package]] +name = "cryptography" +version = "46.0.6" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cffi", marker = "platform_python_implementation != 'PyPy'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a4/ba/04b1bd4218cbc58dc90ce967106d51582371b898690f3ae0402876cc4f34/cryptography-46.0.6.tar.gz", hash = "sha256:27550628a518c5c6c903d84f637fbecf287f6cb9ced3804838a1295dc1fd0759", size = 750542, upload-time = "2026-03-25T23:34:53.396Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/47/23/9285e15e3bc57325b0a72e592921983a701efc1ee8f91c06c5f0235d86d9/cryptography-46.0.6-cp311-abi3-macosx_10_9_universal2.whl", hash = "sha256:64235194bad039a10bb6d2d930ab3323baaec67e2ce36215fd0952fad0930ca8", size = 7176401, upload-time = "2026-03-25T23:33:22.096Z" }, + { url = "https://files.pythonhosted.org/packages/60/f8/e61f8f13950ab6195b31913b42d39f0f9afc7d93f76710f299b5ec286ae6/cryptography-46.0.6-cp311-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:26031f1e5ca62fcb9d1fcb34b2b60b390d1aacaa15dc8b895a9ed00968b97b30", size = 4275275, upload-time = "2026-03-25T23:33:23.844Z" }, + { url = 
"https://files.pythonhosted.org/packages/19/69/732a736d12c2631e140be2348b4ad3d226302df63ef64d30dfdb8db7ad1c/cryptography-46.0.6-cp311-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:9a693028b9cbe51b5a1136232ee8f2bc242e4e19d456ded3fa7c86e43c713b4a", size = 4425320, upload-time = "2026-03-25T23:33:25.703Z" }, + { url = "https://files.pythonhosted.org/packages/d4/12/123be7292674abf76b21ac1fc0e1af50661f0e5b8f0ec8285faac18eb99e/cryptography-46.0.6-cp311-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:67177e8a9f421aa2d3a170c3e56eca4e0128883cf52a071a7cbf53297f18b175", size = 4278082, upload-time = "2026-03-25T23:33:27.423Z" }, + { url = "https://files.pythonhosted.org/packages/5b/ba/d5e27f8d68c24951b0a484924a84c7cdaed7502bac9f18601cd357f8b1d2/cryptography-46.0.6-cp311-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:d9528b535a6c4f8ff37847144b8986a9a143585f0540fbcb1a98115b543aa463", size = 4926514, upload-time = "2026-03-25T23:33:29.206Z" }, + { url = "https://files.pythonhosted.org/packages/34/71/1ea5a7352ae516d5512d17babe7e1b87d9db5150b21f794b1377eac1edc0/cryptography-46.0.6-cp311-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:22259338084d6ae497a19bae5d4c66b7ca1387d3264d1c2c0e72d9e9b6a77b97", size = 4457766, upload-time = "2026-03-25T23:33:30.834Z" }, + { url = "https://files.pythonhosted.org/packages/01/59/562be1e653accee4fdad92c7a2e88fced26b3fdfce144047519bbebc299e/cryptography-46.0.6-cp311-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:760997a4b950ff00d418398ad73fbc91aa2894b5c1db7ccb45b4f68b42a63b3c", size = 3986535, upload-time = "2026-03-25T23:33:33.02Z" }, + { url = "https://files.pythonhosted.org/packages/d6/8b/b1ebfeb788bf4624d36e45ed2662b8bd43a05ff62157093c1539c1288a18/cryptography-46.0.6-cp311-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:3dfa6567f2e9e4c5dceb8ccb5a708158a2a871052fa75c8b78cb0977063f1507", size = 4277618, upload-time = "2026-03-25T23:33:34.567Z" }, + { url = 
"https://files.pythonhosted.org/packages/dd/52/a005f8eabdb28df57c20f84c44d397a755782d6ff6d455f05baa2785bd91/cryptography-46.0.6-cp311-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:cdcd3edcbc5d55757e5f5f3d330dd00007ae463a7e7aa5bf132d1f22a4b62b19", size = 4890802, upload-time = "2026-03-25T23:33:37.034Z" }, + { url = "https://files.pythonhosted.org/packages/ec/4d/8e7d7245c79c617d08724e2efa397737715ca0ec830ecb3c91e547302555/cryptography-46.0.6-cp311-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:d4e4aadb7fc1f88687f47ca20bb7227981b03afaae69287029da08096853b738", size = 4457425, upload-time = "2026-03-25T23:33:38.904Z" }, + { url = "https://files.pythonhosted.org/packages/1d/5c/f6c3596a1430cec6f949085f0e1a970638d76f81c3ea56d93d564d04c340/cryptography-46.0.6-cp311-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:2b417edbe8877cda9022dde3a008e2deb50be9c407eef034aeeb3a8b11d9db3c", size = 4405530, upload-time = "2026-03-25T23:33:40.842Z" }, + { url = "https://files.pythonhosted.org/packages/7e/c9/9f9cea13ee2dbde070424e0c4f621c091a91ffcc504ffea5e74f0e1daeff/cryptography-46.0.6-cp311-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:380343e0653b1c9d7e1f55b52aaa2dbb2fdf2730088d48c43ca1c7c0abb7cc2f", size = 4667896, upload-time = "2026-03-25T23:33:42.781Z" }, + { url = "https://files.pythonhosted.org/packages/ad/b5/1895bc0821226f129bc74d00eccfc6a5969e2028f8617c09790bf89c185e/cryptography-46.0.6-cp311-abi3-win32.whl", hash = "sha256:bcb87663e1f7b075e48c3be3ecb5f0b46c8fc50b50a97cf264e7f60242dca3f2", size = 3026348, upload-time = "2026-03-25T23:33:45.021Z" }, + { url = "https://files.pythonhosted.org/packages/c3/f8/c9bcbf0d3e6ad288b9d9aa0b1dee04b063d19e8c4f871855a03ab3a297ab/cryptography-46.0.6-cp311-abi3-win_amd64.whl", hash = "sha256:6739d56300662c468fddb0e5e291f9b4d084bead381667b9e654c7dd81705124", size = 3483896, upload-time = "2026-03-25T23:33:46.649Z" }, + { url = 
"https://files.pythonhosted.org/packages/01/41/3a578f7fd5c70611c0aacba52cd13cb364a5dee895a5c1d467208a9380b0/cryptography-46.0.6-cp314-cp314t-macosx_10_9_universal2.whl", hash = "sha256:2ef9e69886cbb137c2aef9772c2e7138dc581fad4fcbcf13cc181eb5a3ab6275", size = 7117147, upload-time = "2026-03-25T23:33:48.249Z" }, + { url = "https://files.pythonhosted.org/packages/fa/87/887f35a6fca9dde90cad08e0de0c89263a8e59b2d2ff904fd9fcd8025b6f/cryptography-46.0.6-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:7f417f034f91dcec1cb6c5c35b07cdbb2ef262557f701b4ecd803ee8cefed4f4", size = 4266221, upload-time = "2026-03-25T23:33:49.874Z" }, + { url = "https://files.pythonhosted.org/packages/aa/a8/0a90c4f0b0871e0e3d1ed126aed101328a8a57fd9fd17f00fb67e82a51ca/cryptography-46.0.6-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:d24c13369e856b94892a89ddf70b332e0b70ad4a5c43cf3e9cb71d6d7ffa1f7b", size = 4408952, upload-time = "2026-03-25T23:33:52.128Z" }, + { url = "https://files.pythonhosted.org/packages/16/0b/b239701eb946523e4e9f329336e4ff32b1247e109cbab32d1a7b61da8ed7/cryptography-46.0.6-cp314-cp314t-manylinux_2_28_aarch64.whl", hash = "sha256:aad75154a7ac9039936d50cf431719a2f8d4ed3d3c277ac03f3339ded1a5e707", size = 4270141, upload-time = "2026-03-25T23:33:54.11Z" }, + { url = "https://files.pythonhosted.org/packages/0f/a8/976acdd4f0f30df7b25605f4b9d3d89295351665c2091d18224f7ad5cdbf/cryptography-46.0.6-cp314-cp314t-manylinux_2_28_ppc64le.whl", hash = "sha256:3c21d92ed15e9cfc6eb64c1f5a0326db22ca9c2566ca46d845119b45b4400361", size = 4904178, upload-time = "2026-03-25T23:33:55.725Z" }, + { url = "https://files.pythonhosted.org/packages/b1/1b/bf0e01a88efd0e59679b69f42d4afd5bced8700bb5e80617b2d63a3741af/cryptography-46.0.6-cp314-cp314t-manylinux_2_28_x86_64.whl", hash = "sha256:4668298aef7cddeaf5c6ecc244c2302a2b8e40f384255505c22875eebb47888b", size = 4441812, upload-time = "2026-03-25T23:33:57.364Z" }, + { url = 
"https://files.pythonhosted.org/packages/bb/8b/11df86de2ea389c65aa1806f331cae145f2ed18011f30234cc10ca253de8/cryptography-46.0.6-cp314-cp314t-manylinux_2_31_armv7l.whl", hash = "sha256:8ce35b77aaf02f3b59c90b2c8a05c73bac12cea5b4e8f3fbece1f5fddea5f0ca", size = 3963923, upload-time = "2026-03-25T23:33:59.361Z" }, + { url = "https://files.pythonhosted.org/packages/91/e0/207fb177c3a9ef6a8108f234208c3e9e76a6aa8cf20d51932916bd43bda0/cryptography-46.0.6-cp314-cp314t-manylinux_2_34_aarch64.whl", hash = "sha256:c89eb37fae9216985d8734c1afd172ba4927f5a05cfd9bf0e4863c6d5465b013", size = 4269695, upload-time = "2026-03-25T23:34:00.909Z" }, + { url = "https://files.pythonhosted.org/packages/21/5e/19f3260ed1e95bced52ace7501fabcd266df67077eeb382b79c81729d2d3/cryptography-46.0.6-cp314-cp314t-manylinux_2_34_ppc64le.whl", hash = "sha256:ed418c37d095aeddf5336898a132fba01091f0ac5844e3e8018506f014b6d2c4", size = 4869785, upload-time = "2026-03-25T23:34:02.796Z" }, + { url = "https://files.pythonhosted.org/packages/10/38/cd7864d79aa1d92ef6f1a584281433419b955ad5a5ba8d1eb6c872165bcb/cryptography-46.0.6-cp314-cp314t-manylinux_2_34_x86_64.whl", hash = "sha256:69cf0056d6947edc6e6760e5f17afe4bea06b56a9ac8a06de9d2bd6b532d4f3a", size = 4441404, upload-time = "2026-03-25T23:34:04.35Z" }, + { url = "https://files.pythonhosted.org/packages/09/0a/4fe7a8d25fed74419f91835cf5829ade6408fd1963c9eae9c4bce390ecbb/cryptography-46.0.6-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:8e7304c4f4e9490e11efe56af6713983460ee0780f16c63f219984dab3af9d2d", size = 4397549, upload-time = "2026-03-25T23:34:06.342Z" }, + { url = "https://files.pythonhosted.org/packages/5f/a0/7d738944eac6513cd60a8da98b65951f4a3b279b93479a7e8926d9cd730b/cryptography-46.0.6-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:b928a3ca837c77a10e81a814a693f2295200adb3352395fad024559b7be7a736", size = 4651874, upload-time = "2026-03-25T23:34:07.916Z" }, + { url = 
"https://files.pythonhosted.org/packages/cb/f1/c2326781ca05208845efca38bf714f76939ae446cd492d7613808badedf1/cryptography-46.0.6-cp314-cp314t-win32.whl", hash = "sha256:97c8115b27e19e592a05c45d0dd89c57f81f841cc9880e353e0d3bf25b2139ed", size = 3001511, upload-time = "2026-03-25T23:34:09.892Z" }, + { url = "https://files.pythonhosted.org/packages/c9/57/fe4a23eb549ac9d903bd4698ffda13383808ef0876cc912bcb2838799ece/cryptography-46.0.6-cp314-cp314t-win_amd64.whl", hash = "sha256:c797e2517cb7880f8297e2c0f43bb910e91381339336f75d2c1c2cbf811b70b4", size = 3471692, upload-time = "2026-03-25T23:34:11.613Z" }, + { url = "https://files.pythonhosted.org/packages/c4/cc/f330e982852403da79008552de9906804568ae9230da8432f7496ce02b71/cryptography-46.0.6-cp38-abi3-macosx_10_9_universal2.whl", hash = "sha256:12cae594e9473bca1a7aceb90536060643128bb274fcea0fc459ab90f7d1ae7a", size = 7162776, upload-time = "2026-03-25T23:34:13.308Z" }, + { url = "https://files.pythonhosted.org/packages/49/b3/dc27efd8dcc4bff583b3f01d4a3943cd8b5821777a58b3a6a5f054d61b79/cryptography-46.0.6-cp38-abi3-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:639301950939d844a9e1c4464d7e07f902fe9a7f6b215bb0d4f28584729935d8", size = 4270529, upload-time = "2026-03-25T23:34:15.019Z" }, + { url = "https://files.pythonhosted.org/packages/e6/05/e8d0e6eb4f0d83365b3cb0e00eb3c484f7348db0266652ccd84632a3d58d/cryptography-46.0.6-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:ed3775295fb91f70b4027aeba878d79b3e55c0b3e97eaa4de71f8f23a9f2eb77", size = 4414827, upload-time = "2026-03-25T23:34:16.604Z" }, + { url = "https://files.pythonhosted.org/packages/2f/97/daba0f5d2dc6d855e2dcb70733c812558a7977a55dd4a6722756628c44d1/cryptography-46.0.6-cp38-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:8927ccfbe967c7df312ade694f987e7e9e22b2425976ddbf28271d7e58845290", size = 4271265, upload-time = "2026-03-25T23:34:18.586Z" }, + { url = 
"https://files.pythonhosted.org/packages/89/06/fe1fce39a37ac452e58d04b43b0855261dac320a2ebf8f5260dd55b201a9/cryptography-46.0.6-cp38-abi3-manylinux_2_28_ppc64le.whl", hash = "sha256:b12c6b1e1651e42ab5de8b1e00dc3b6354fdfd778e7fa60541ddacc27cd21410", size = 4916800, upload-time = "2026-03-25T23:34:20.561Z" }, + { url = "https://files.pythonhosted.org/packages/ff/8a/b14f3101fe9c3592603339eb5d94046c3ce5f7fc76d6512a2d40efd9724e/cryptography-46.0.6-cp38-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:063b67749f338ca9c5a0b7fe438a52c25f9526b851e24e6c9310e7195aad3b4d", size = 4448771, upload-time = "2026-03-25T23:34:22.406Z" }, + { url = "https://files.pythonhosted.org/packages/01/b3/0796998056a66d1973fd52ee89dc1bb3b6581960a91ad4ac705f182d398f/cryptography-46.0.6-cp38-abi3-manylinux_2_31_armv7l.whl", hash = "sha256:02fad249cb0e090b574e30b276a3da6a149e04ee2f049725b1f69e7b8351ec70", size = 3978333, upload-time = "2026-03-25T23:34:24.281Z" }, + { url = "https://files.pythonhosted.org/packages/c5/3d/db200af5a4ffd08918cd55c08399dc6c9c50b0bc72c00a3246e099d3a849/cryptography-46.0.6-cp38-abi3-manylinux_2_34_aarch64.whl", hash = "sha256:7e6142674f2a9291463e5e150090b95a8519b2fb6e6aaec8917dd8d094ce750d", size = 4271069, upload-time = "2026-03-25T23:34:25.895Z" }, + { url = "https://files.pythonhosted.org/packages/d7/18/61acfd5b414309d74ee838be321c636fe71815436f53c9f0334bf19064fa/cryptography-46.0.6-cp38-abi3-manylinux_2_34_ppc64le.whl", hash = "sha256:456b3215172aeefb9284550b162801d62f5f264a081049a3e94307fe20792cfa", size = 4878358, upload-time = "2026-03-25T23:34:27.67Z" }, + { url = "https://files.pythonhosted.org/packages/8b/65/5bf43286d566f8171917cae23ac6add941654ccf085d739195a4eacf1674/cryptography-46.0.6-cp38-abi3-manylinux_2_34_x86_64.whl", hash = "sha256:341359d6c9e68834e204ceaf25936dffeafea3829ab80e9503860dcc4f4dac58", size = 4448061, upload-time = "2026-03-25T23:34:29.375Z" }, + { url = 
"https://files.pythonhosted.org/packages/e0/25/7e49c0fa7205cf3597e525d156a6bce5b5c9de1fd7e8cb01120e459f205a/cryptography-46.0.6-cp38-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9a9c42a2723999a710445bc0d974e345c32adfd8d2fac6d8a251fa829ad31cfb", size = 4399103, upload-time = "2026-03-25T23:34:32.036Z" }, + { url = "https://files.pythonhosted.org/packages/44/46/466269e833f1c4718d6cd496ffe20c56c9c8d013486ff66b4f69c302a68d/cryptography-46.0.6-cp38-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:6617f67b1606dfd9fe4dbfa354a9508d4a6d37afe30306fe6c101b7ce3274b72", size = 4659255, upload-time = "2026-03-25T23:34:33.679Z" }, + { url = "https://files.pythonhosted.org/packages/0a/09/ddc5f630cc32287d2c953fc5d32705e63ec73e37308e5120955316f53827/cryptography-46.0.6-cp38-abi3-win32.whl", hash = "sha256:7f6690b6c55e9c5332c0b59b9c8a3fb232ebf059094c17f9019a51e9827df91c", size = 3010660, upload-time = "2026-03-25T23:34:35.418Z" }, + { url = "https://files.pythonhosted.org/packages/1b/82/ca4893968aeb2709aacfb57a30dec6fa2ab25b10fa9f064b8882ce33f599/cryptography-46.0.6-cp38-abi3-win_amd64.whl", hash = "sha256:79e865c642cfc5c0b3eb12af83c35c5aeff4fa5c672dc28c43721c2c9fdd2f0f", size = 3471160, upload-time = "2026-03-25T23:34:37.191Z" }, +] + +[[package]] +name = "docker" +version = "7.1.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "pywin32", marker = "sys_platform == 'win32'" }, + { name = "requests" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/91/9b/4a2ea29aeba62471211598dac5d96825bb49348fa07e906ea930394a83ce/docker-7.1.0.tar.gz", hash = "sha256:ad8c70e6e3f8926cb8a92619b832b4ea5299e2831c14284663184e200546fa6c", size = 117834, upload-time = "2024-05-23T11:13:57.216Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/e3/26/57c6fb270950d476074c087527a558ccb6f4436657314bfb6cdf484114c4/docker-7.1.0-py3-none-any.whl", hash = "sha256:c96b93b7f0a746f9e77d325bcfb87422a3d8bd4f03136ae8a85b37f1898d5fc0", 
size = 147774, upload-time = "2024-05-23T11:13:55.01Z" }, +] + [[package]] name = "fastapi" version = "0.135.1" @@ -247,6 +359,7 @@ name = "master" version = "0.0.1" source = { virtual = "." } dependencies = [ + { name = "docker" }, { name = "fastapi" }, { name = "opentelemetry-api" }, { name = "opentelemetry-exporter-otlp-proto-http" }, @@ -262,11 +375,13 @@ dev = [ { name = "mypy" }, { name = "pytest" }, { name = "ruff" }, + { name = "types-docker" }, { name = "types-pyyaml" }, ] [package.metadata] requires-dist = [ + { name = "docker", specifier = ">=7.1.0" }, { name = "fastapi", specifier = ">=0.116.1" }, { name = "opentelemetry-api", specifier = ">=1.31.1" }, { name = "opentelemetry-exporter-otlp-proto-http", specifier = ">=1.31.1" }, @@ -282,6 +397,7 @@ dev = [ { name = "mypy", specifier = ">=1.18.2" }, { name = "pytest", specifier = ">=8.4.2" }, { name = "ruff", specifier = ">=0.13.1" }, + { name = "types-docker", specifier = ">=7.1.0.20260402" }, { name = "types-pyyaml", specifier = ">=6.0.12.20250915" }, ] @@ -501,6 +617,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c4/72/02445137af02769918a93807b2b7890047c32bfb9f90371cbc12688819eb/protobuf-6.33.6-py3-none-any.whl", hash = "sha256:77179e006c476e69bf8e8ce866640091ec42e1beb80b213c3900006ecfba6901", size = 170656, upload-time = "2026-03-18T19:04:59.826Z" }, ] +[[package]] +name = "pycparser" +version = "3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/1b/7d/92392ff7815c21062bea51aa7b87d45576f649f16458d78b7cf94b9ab2e6/pycparser-3.0.tar.gz", hash = "sha256:600f49d217304a5902ac3c37e1281c9fe94e4d0489de643a9504c5cdfdfc6b29", size = 103492, upload-time = "2026-01-21T14:26:51.89Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/0c/c3/44f3fbbfa403ea2a7c779186dc20772604442dde72947e7d01069cbe98e3/pycparser-3.0-py3-none-any.whl", hash = "sha256:b727414169a36b7d524c1c3e31839a521725078d7b2ff038656844266160a992", size = 
48172, upload-time = "2026-01-21T14:26:50.693Z" }, +] + [[package]] name = "pydantic" version = "2.12.5" @@ -603,6 +728,19 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0b/d7/1959b9648791274998a9c3526f6d0ec8fd2233e4d4acce81bbae76b44b2a/python_dotenv-1.2.2-py3-none-any.whl", hash = "sha256:1d8214789a24de455a8b8bd8ae6fe3c6b69a5e3d64aa8a8e5d68e694bbcb285a", size = 22101, upload-time = "2026-03-01T16:00:25.09Z" }, ] +[[package]] +name = "pywin32" +version = "311" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a5/be/3fd5de0979fcb3994bfee0d65ed8ca9506a8a1260651b86174f6a86f52b3/pywin32-311-cp313-cp313-win32.whl", hash = "sha256:f95ba5a847cba10dd8c4d8fefa9f2a6cf283b8b88ed6178fa8a6c1ab16054d0d", size = 8705700, upload-time = "2025-07-14T20:13:26.471Z" }, + { url = "https://files.pythonhosted.org/packages/e3/28/e0a1909523c6890208295a29e05c2adb2126364e289826c0a8bc7297bd5c/pywin32-311-cp313-cp313-win_amd64.whl", hash = "sha256:718a38f7e5b058e76aee1c56ddd06908116d35147e133427e59a3983f703a20d", size = 9494700, upload-time = "2025-07-14T20:13:28.243Z" }, + { url = "https://files.pythonhosted.org/packages/04/bf/90339ac0f55726dce7d794e6d79a18a91265bdf3aa70b6b9ca52f35e022a/pywin32-311-cp313-cp313-win_arm64.whl", hash = "sha256:7b4075d959648406202d92a2310cb990fea19b535c7f4a78d3f5e10b926eeb8a", size = 8709318, upload-time = "2025-07-14T20:13:30.348Z" }, + { url = "https://files.pythonhosted.org/packages/c9/31/097f2e132c4f16d99a22bfb777e0fd88bd8e1c634304e102f313af69ace5/pywin32-311-cp314-cp314-win32.whl", hash = "sha256:b7a2c10b93f8986666d0c803ee19b5990885872a7de910fc460f9b0c2fbf92ee", size = 8840714, upload-time = "2025-07-14T20:13:32.449Z" }, + { url = "https://files.pythonhosted.org/packages/90/4b/07c77d8ba0e01349358082713400435347df8426208171ce297da32c313d/pywin32-311-cp314-cp314-win_amd64.whl", hash = "sha256:3aca44c046bd2ed8c90de9cb8427f581c479e594e99b5c0bb19b29c10fd6cb87", size = 9656800, 
upload-time = "2025-07-14T20:13:34.312Z" }, + { url = "https://files.pythonhosted.org/packages/c0/d2/21af5c535501a7233e734b8af901574572da66fcc254cb35d0609c9080dd/pywin32-311-cp314-cp314-win_arm64.whl", hash = "sha256:a508e2d9025764a8270f93111a970e1d0fbfc33f4153b388bb649b7eec4f9b42", size = 8932540, upload-time = "2025-07-14T20:13:36.379Z" }, +] + [[package]] name = "pyyaml" version = "6.0.3" @@ -691,6 +829,32 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/81/0d/13d1d239a25cbfb19e740db83143e95c772a1fe10202dda4b76792b114dd/starlette-0.52.1-py3-none-any.whl", hash = "sha256:0029d43eb3d273bc4f83a08720b4912ea4b071087a3b48db01b7c839f7954d74", size = 74272, upload-time = "2026-01-18T13:34:09.188Z" }, ] +[[package]] +name = "types-docker" +version = "7.1.0.20260402" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "types-paramiko" }, + { name = "types-requests" }, + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/1f/b6/e9dd7b51f5db0df219aed496e5f1fe4e4e17828cd9c354fc6a98a4454ea1/types_docker-7.1.0.20260402.tar.gz", hash = "sha256:2df72c6268a815f4ba28fe2556072a99b8acffe46370fa4d27c485355af2e37e", size = 32974, upload-time = "2026-04-02T04:22:54.318Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/90/8c/5056b3f6ba818dc9f677f9e7aa9044113657a8b67871ed30921d3712b2f2/types_docker-7.1.0.20260402-py3-none-any.whl", hash = "sha256:88055c13ad43a3d13415db42d180193e812e6a25d0b50710d6559e86cb9345e5", size = 47455, upload-time = "2026-04-02T04:22:53.426Z" }, +] + +[[package]] +name = "types-paramiko" +version = "4.0.0.20260402" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "cryptography" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c6/91/63f4fa68a7d563ca54f62c013aeefb480a2e18a0ee0623a56119f1662bf2/types_paramiko-4.0.0.20260402.tar.gz", hash = "sha256:a9287bdb78cb67c8e79897dc98b761900968cd2de288f72cc298c1a25cde6a38", size = 29105, 
upload-time = "2026-04-02T04:20:26.809Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/8d/5e/d650556433e9b745fb85676f4382b7ec0f35156696d946b9bf5ac470358b/types_paramiko-4.0.0.20260402-py3-none-any.whl", hash = "sha256:38ef646f54d5410012d8607b9f023a355ecaec12b749d0325bce5f16706f6183", size = 38816, upload-time = "2026-04-02T04:20:25.788Z" }, +] + [[package]] name = "types-pyyaml" version = "6.0.12.20250915" @@ -700,6 +864,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/bd/e0/1eed384f02555dde685fff1a1ac805c1c7dcb6dd019c916fe659b1c1f9ec/types_pyyaml-6.0.12.20250915-py3-none-any.whl", hash = "sha256:e7d4d9e064e89a3b3cae120b4990cd370874d2bf12fa5f46c97018dd5d3c9ab6", size = 20338, upload-time = "2025-09-15T03:00:59.218Z" }, ] +[[package]] +name = "types-requests" +version = "2.33.0.20260402" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "urllib3" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/c1/7b/a06527d20af1441d813360b8e0ce152a75b7d8e4aab7c7d0a156f405d7ec/types_requests-2.33.0.20260402.tar.gz", hash = "sha256:1bdd3ada9b869741c5c4b887d2c8b4e38284a1449751823b5ebbccba3eefd9da", size = 23851, upload-time = "2026-04-02T04:19:55.942Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/51/65/3853bb6bac5ae789dc7e28781154705c27859eccc8e46282c3f36780f5f5/types_requests-2.33.0.20260402-py3-none-any.whl", hash = "sha256:c98372d7124dd5d10af815ee25c013897592ff92af27b27e22c98984102c3254", size = 20739, upload-time = "2026-04-02T04:19:54.955Z" }, +] + [[package]] name = "typing-extensions" version = "4.15.0" From 33ebcb1a82a4d19685ff19996c440e0ed1dda542 Mon Sep 17 00:00:00 2001 From: Azamat Date: Thu, 2 Apr 2026 13:12:34 +0300 Subject: [PATCH 05/30] ref #6: [feat] add impl in memory session repository --- adapter/di/container.py | 51 ++++++++++++++++++++++++++++++----- repository/sandbox_session.py | 28 +++++++++++++++++++ tasks.md | 2 +- usecase/sandbox.py | 47 
+++++++++++++++++++++++++++++++- 4 files changed, 120 insertions(+), 8 deletions(-) create mode 100644 repository/sandbox_session.py diff --git a/adapter/di/container.py b/adapter/di/container.py index 8c08e7f..f091913 100644 --- a/adapter/di/container.py +++ b/adapter/di/container.py @@ -1,24 +1,34 @@ from collections.abc import Mapping from dataclasses import dataclass, field +from datetime import UTC, datetime, timedelta from pathlib import Path +import docker +from docker import DockerClient + from adapter.config.loader import load_config from adapter.config.model import AppConfig +from adapter.docker.runtime import DockerSandboxRuntime from adapter.observability.factory import build_observability from adapter.observability.runtime import ObservabilityRuntime from domain.user import User +from repository.sandbox_session import InMemorySandboxSessionRepository from repository.user import InMemoryUserRepository +from usecase.interface import Clock +from usecase.sandbox import CreateSandbox from usecase.user import GetUser @dataclass(frozen=True, slots=True) class AppRepositories: user: InMemoryUserRepository + sandbox_session: InMemorySandboxSessionRepository @dataclass(frozen=True, slots=True) class AppUsecases: get_user: GetUser + create_sandbox: CreateSandbox @dataclass(slots=True) @@ -27,16 +37,29 @@ class AppContainer: observability: ObservabilityRuntime repositories: AppRepositories usecases: AppUsecases + _docker_client: DockerClient = field(repr=False) _is_shutdown: bool = field(default=False, init=False, repr=False) def shutdown(self) -> None: if self._is_shutdown: return - try: - self.observability.shutdown() - finally: - self._is_shutdown = True + self._is_shutdown = True + errors: list[Exception] = [] + + for action in (self._docker_client.close, self.observability.shutdown): + try: + action() + except Exception as exc: + errors.append(exc) + + if errors: + raise ExceptionGroup('shutdown failed', errors) + + +class SystemClock(Clock): + def now(self) 
-> datetime: + return datetime.now(tz=UTC) def build_container( @@ -54,17 +77,32 @@ def build_container( ) observability = build_observability(app_config) + docker_client: DockerClient = docker.from_env() + clock = SystemClock() user_repository = InMemoryUserRepository( observability.tracer, [User(id='123', email='aza@gglamer.ru', name='gglamer')] ) - repositories = AppRepositories(user=user_repository) + sandbox_repository = InMemorySandboxSessionRepository() + sandbox_runtime = DockerSandboxRuntime(app_config.sandbox, docker_client) + + repositories = AppRepositories( + user=user_repository, + sandbox_session=sandbox_repository, + ) usecases = AppUsecases( get_user=GetUser( repository=user_repository, logger=observability.logger, tracer=observability.tracer, - ) + ), + create_sandbox=CreateSandbox( + repository=sandbox_repository, + runtime=sandbox_runtime, + clock=clock, + logger=observability.logger, + ttl=timedelta(seconds=app_config.sandbox.ttl_seconds), + ), ) return AppContainer( @@ -72,4 +110,5 @@ def build_container( observability=observability, repositories=repositories, usecases=usecases, + _docker_client=docker_client, ) diff --git a/repository/sandbox_session.py b/repository/sandbox_session.py new file mode 100644 index 0000000..9b23cd7 --- /dev/null +++ b/repository/sandbox_session.py @@ -0,0 +1,28 @@ +from datetime import datetime + +from domain.sandbox import SandboxSession +from usecase.interface import SandboxSessionRepository + + +class InMemorySandboxSessionRepository(SandboxSessionRepository): + def __init__(self) -> None: + self._sessions_by_chat_id: dict[str, SandboxSession] = {} + + def get_active_by_chat_id(self, chat_id: str) -> SandboxSession | None: + return self._sessions_by_chat_id.get(chat_id) + + def list_expired(self, now: datetime) -> list[SandboxSession]: + return [ + session + for session in self._sessions_by_chat_id.values() + if session.expires_at <= now + ] + + def save(self, session: SandboxSession) -> None: + 
self._sessions_by_chat_id[session.chat_id] = session + + def delete(self, session_id: str) -> None: + for chat_id, session in tuple(self._sessions_by_chat_id.items()): + if session.session_id == session_id: + del self._sessions_by_chat_id[chat_id] + return diff --git a/tasks.md b/tasks.md index 73c9b2e..ca244cb 100644 --- a/tasks.md +++ b/tasks.md @@ -64,7 +64,7 @@ ### M04. In-memory session repository и usecase `CreateSandbox` - Субагент: `feature-developer` -- Статус: pending +- Статус: completed - Зависимости: `M01`, `M02`, `M03` - Commit required: no - Scope: реализовать in-memory registry активных sandbox-сессий и usecase создания sandbox с логикой reuse по `chat_id` diff --git a/usecase/sandbox.py b/usecase/sandbox.py index 0c34422..452a5ff 100644 --- a/usecase/sandbox.py +++ b/usecase/sandbox.py @@ -1,5 +1,6 @@ from dataclasses import dataclass from datetime import timedelta +from uuid import uuid4 from domain.sandbox import SandboxSession from usecase.interface import Clock, Logger, SandboxRuntime, SandboxSessionRepository @@ -26,7 +27,47 @@ class CreateSandbox: self._ttl = ttl def execute(self, command: CreateSandboxCommand) -> SandboxSession: - raise NotImplementedError + now = self._clock.now() + session = self._repository.get_active_by_chat_id(command.chat_id) + + if session is not None and session.expires_at > now: + self._logger.info( + 'sandbox_reused', + attrs={ + 'chat_id': command.chat_id, + 'session_id': session.session_id, + 'container_id': session.container_id, + }, + ) + return session + + if session is not None: + self._logger.info( + 'sandbox_replaced', + attrs={ + 'chat_id': command.chat_id, + 'session_id': session.session_id, + 'container_id': session.container_id, + }, + ) + self._runtime.stop(session.container_id) + self._repository.delete(session.session_id) + + new_session = self._runtime.create( + session_id=_new_session_id(), + chat_id=command.chat_id, + expires_at=now + self._ttl, + ) + self._repository.save(new_session) + 
self._logger.info( + 'sandbox_created', + attrs={ + 'chat_id': command.chat_id, + 'session_id': new_session.session_id, + 'container_id': new_session.container_id, + }, + ) + return new_session class CleanupExpiredSandboxes: @@ -44,3 +85,7 @@ class CleanupExpiredSandboxes: def execute(self) -> list[SandboxSession]: raise NotImplementedError + + +def _new_session_id() -> str: + return uuid4().hex From bae540427a280afe5efd28196eba8fba9fce666b Mon Sep 17 00:00:00 2001 From: Azamat Date: Thu, 2 Apr 2026 13:27:44 +0300 Subject: [PATCH 06/30] ref #7: [feat] add cleanup task --- adapter/di/container.py | 9 +++- adapter/http/fastapi/app.py | 91 +++++++++++++++++++++++++++++++++++-- tasks.md | 2 +- usecase/sandbox.py | 19 +++++++- 4 files changed, 113 insertions(+), 8 deletions(-) diff --git a/adapter/di/container.py b/adapter/di/container.py index f091913..c28fbfa 100644 --- a/adapter/di/container.py +++ b/adapter/di/container.py @@ -15,7 +15,7 @@ from domain.user import User from repository.sandbox_session import InMemorySandboxSessionRepository from repository.user import InMemoryUserRepository from usecase.interface import Clock -from usecase.sandbox import CreateSandbox +from usecase.sandbox import CleanupExpiredSandboxes, CreateSandbox from usecase.user import GetUser @@ -29,6 +29,7 @@ class AppRepositories: class AppUsecases: get_user: GetUser create_sandbox: CreateSandbox + cleanup_expired_sandboxes: CleanupExpiredSandboxes @dataclass(slots=True) @@ -103,6 +104,12 @@ def build_container( logger=observability.logger, ttl=timedelta(seconds=app_config.sandbox.ttl_seconds), ), + cleanup_expired_sandboxes=CleanupExpiredSandboxes( + repository=sandbox_repository, + runtime=sandbox_runtime, + clock=clock, + logger=observability.logger, + ), ) return AppContainer( diff --git a/adapter/http/fastapi/app.py b/adapter/http/fastapi/app.py index 06ac839..d0e60aa 100644 --- a/adapter/http/fastapi/app.py +++ b/adapter/http/fastapi/app.py @@ -1,4 +1,5 @@ -from collections.abc import 
Callable +import asyncio +from collections.abc import Awaitable, Callable from fastapi import FastAPI from opentelemetry.instrumentation.fastapi import FastAPIInstrumentor @@ -11,6 +12,8 @@ from adapter.http.fastapi.middleware import register_middleware from adapter.http.fastapi.routers.v1.router import router as v1_router API_V1_PREFIX = '/api/v1' +APP_CLEANUP_TASK_STATE = 'cleanup_task' +APP_CLEANUP_STOP_STATE = 'cleanup_stop' def create_app(config: AppConfig | None = None) -> FastAPI: @@ -22,6 +25,7 @@ def create_app(config: AppConfig | None = None) -> FastAPI: app = FastAPI(title=app_config.app.name) setattr(app.state, APP_CONFIG_STATE, app_config) setattr(app.state, APP_CONTAINER_STATE, container) + app.add_event_handler('startup', _build_startup_handler(app, container)) app.add_event_handler('shutdown', _build_shutdown_handler(app, container)) register_middleware(app, app_config) app.include_router(v1_router, prefix=API_V1_PREFIX) @@ -43,19 +47,96 @@ def create_app(config: AppConfig | None = None) -> FastAPI: raise +def _build_startup_handler( + app: FastAPI, + container: AppContainer, +) -> Callable[[], Awaitable[None]]: + async def startup() -> None: + task = _get_cleanup_task(app) + if task is not None and not task.done(): + return + + stop_event = asyncio.Event() + setattr(app.state, APP_CLEANUP_STOP_STATE, stop_event) + setattr( + app.state, + APP_CLEANUP_TASK_STATE, + asyncio.create_task( + _run_cleanup_loop(container, stop_event), + name='sandbox_cleanup', + ), + ) + + return startup + + def _build_shutdown_handler( app: FastAPI, container: AppContainer, -) -> Callable[[], None]: - def shutdown() -> None: +) -> Callable[[], Awaitable[None]]: + async def shutdown() -> None: try: - _uninstrument_app(app) + await _stop_cleanup_loop(app) finally: - container.shutdown() + try: + _uninstrument_app(app) + finally: + container.shutdown() return shutdown +async def _run_cleanup_loop( + container: AppContainer, + stop_event: asyncio.Event, +) -> None: + interval 
= container.config.sandbox.cleanup_interval_seconds + + while not stop_event.is_set(): + try: + container.usecases.cleanup_expired_sandboxes.execute() + except Exception as exc: + container.observability.logger.error( + 'sandbox_cleanup_failed', + attrs={ + 'error': type(exc).__name__, + }, + ) + + try: + await asyncio.wait_for(stop_event.wait(), timeout=interval) + except asyncio.TimeoutError: + continue + + +async def _stop_cleanup_loop(app: FastAPI) -> None: + stop_event = _get_cleanup_stop_event(app) + if stop_event is not None: + stop_event.set() + + task = _get_cleanup_task(app) + try: + if task is not None: + await task + finally: + setattr(app.state, APP_CLEANUP_TASK_STATE, None) + setattr(app.state, APP_CLEANUP_STOP_STATE, None) + + +def _get_cleanup_task(app: FastAPI) -> asyncio.Task[None] | None: + task = getattr(app.state, APP_CLEANUP_TASK_STATE, None) + if isinstance(task, asyncio.Task): + return task + return None + + +def _get_cleanup_stop_event(app: FastAPI) -> asyncio.Event | None: + stop_event = getattr(app.state, APP_CLEANUP_STOP_STATE, None) + if isinstance(stop_event, asyncio.Event): + return stop_event + return None + + def _uninstrument_app(app: FastAPI) -> None: if _is_instrumented(app): FastAPIInstrumentor.uninstrument_app(app) diff --git a/tasks.md b/tasks.md index ca244cb..e8c293c 100644 --- a/tasks.md +++ b/tasks.md @@ -75,7 +75,7 @@ ### M05. 
Cleanup expired sandboxes и lifecycle wiring - Субагент: `feature-developer` -- Статус: pending +- Статус: completed - Зависимости: `M04` - Commit required: no - Scope: реализовать usecase cleanup просроченных sandbox и подключить периодический cleanup loop в FastAPI lifecycle diff --git a/usecase/sandbox.py b/usecase/sandbox.py index 452a5ff..ae60946 100644 --- a/usecase/sandbox.py +++ b/usecase/sandbox.py @@ -84,7 +84,24 @@ class CleanupExpiredSandboxes: self._logger = logger def execute(self) -> list[SandboxSession]: - raise NotImplementedError + now = self._clock.now() + expired_sessions = self._repository.list_expired(now) + cleaned_sessions: list[SandboxSession] = [] + + for session in expired_sessions: + self._runtime.stop(session.container_id) + self._repository.delete(session.session_id) + cleaned_sessions.append(session) + self._logger.info( + 'sandbox_cleaned', + attrs={ + 'chat_id': session.chat_id, + 'session_id': session.session_id, + 'container_id': session.container_id, + }, + ) + + return cleaned_sessions def _new_session_id() -> str: From d2506e0c63206c1acc589303831084dca0227de2 Mon Sep 17 00:00:00 2001 From: Azamat Date: Thu, 2 Apr 2026 13:41:41 +0300 Subject: [PATCH 07/30] ref #8: [feat] add http endpoint --- adapter/docker/runtime.py | 12 +---- adapter/http/fastapi/dependencies.py | 13 ++++-- adapter/http/fastapi/routers/v1/router.py | 57 +++++++++++++++++++++-- adapter/http/fastapi/schemas.py | 18 ++++++- tasks.md | 2 +- usecase/interface.py | 1 + usecase/sandbox.py | 4 +- 7 files changed, 87 insertions(+), 20 deletions(-) diff --git a/adapter/docker/runtime.py b/adapter/docker/runtime.py index 89df9ab..61fcaf6 100644 --- a/adapter/docker/runtime.py +++ b/adapter/docker/runtime.py @@ -1,4 +1,3 @@ -from collections.abc import Callable from datetime import datetime from pathlib import Path @@ -11,25 +10,22 @@ from domain.error import SandboxError, SandboxStartError from domain.sandbox import SandboxSession, SandboxStatus from usecase.interface 
import SandboxRuntime -type NowFactory = Callable[[datetime], datetime] - class DockerSandboxRuntime(SandboxRuntime): def __init__( self, config: SandboxConfig, client: DockerClient, - now: NowFactory | None = None, ) -> None: self._config = config self._client = client - self._now = _current_time if now is None else now def create( self, *, session_id: str, chat_id: str, + created_at: datetime, expires_at: datetime, ) -> SandboxSession: try: @@ -59,7 +55,7 @@ class DockerSandboxRuntime(SandboxRuntime): chat_id=chat_id, container_id=container_id, status=SandboxStatus.RUNNING, - created_at=self._now(expires_at), + created_at=created_at, expires_at=expires_at, ) @@ -128,7 +124,3 @@ class DockerSandboxRuntime(SandboxRuntime): def _host_path(self, path_value: str) -> Path: return Path(path_value).expanduser().resolve(strict=False) - - -def _current_time(expires_at: datetime) -> datetime: - return datetime.now(tz=expires_at.tzinfo) diff --git a/adapter/http/fastapi/dependencies.py b/adapter/http/fastapi/dependencies.py index 4892459..87a9224 100644 --- a/adapter/http/fastapi/dependencies.py +++ b/adapter/http/fastapi/dependencies.py @@ -1,8 +1,7 @@ -from typing import cast - from fastapi import Depends, Request from adapter.di.container import AppContainer +from usecase.sandbox import CreateSandbox from usecase.user import GetUser APP_CONTAINER_STATE = 'container' @@ -11,10 +10,16 @@ APP_CONFIG_STATE = 'config' def get_container(request: Request) -> AppContainer: container = getattr(request.app.state, APP_CONTAINER_STATE, None) - if container is None: + if not isinstance(container, AppContainer): raise RuntimeError('container unavailable') - return cast(AppContainer, container) + return container def get_get_user(container: AppContainer = Depends(get_container)) -> GetUser: return container.usecases.get_user + + +def get_create_sandbox( + container: AppContainer = Depends(get_container), +) -> CreateSandbox: + return container.usecases.create_sandbox diff --git 
a/adapter/http/fastapi/routers/v1/router.py b/adapter/http/fastapi/routers/v1/router.py index df3d575..1f0aff4 100644 --- a/adapter/http/fastapi/routers/v1/router.py +++ b/adapter/http/fastapi/routers/v1/router.py @@ -1,9 +1,21 @@ from fastapi import APIRouter, Depends, HTTPException, status from adapter.di.container import AppContainer -from adapter.http.fastapi.dependencies import get_container, get_get_user -from adapter.http.fastapi.schemas import ErrorResponse, HealthResponse, UserResponse -from domain.error import UserNotFoundError +from adapter.http.fastapi.dependencies import ( + get_container, + get_create_sandbox, + get_get_user, +) +from adapter.http.fastapi.schemas import ( + CreateSandboxRequest, + ErrorResponse, + HealthResponse, + SandboxSessionResponse, + UserResponse, +) +from domain.error import SandboxError, SandboxStartError, UserNotFoundError +from domain.sandbox import SandboxSession +from usecase.sandbox import CreateSandbox, CreateSandboxCommand from usecase.user import GetUser, GetUserQuery router = APIRouter() @@ -38,3 +50,42 @@ def get_user(user_id: str, usecase: GetUser = Depends(get_get_user)) -> UserResp ) from exc return UserResponse(id=user.id, email=user.email, name=user.name) + + +@router.post( + '/create', + response_model=SandboxSessionResponse, + responses={ + status.HTTP_503_SERVICE_UNAVAILABLE: {'model': ErrorResponse}, + status.HTTP_500_INTERNAL_SERVER_ERROR: {'model': ErrorResponse}, + }, + status_code=status.HTTP_200_OK, +) +def create_sandbox( + request: CreateSandboxRequest, + usecase: CreateSandbox = Depends(get_create_sandbox), +) -> SandboxSessionResponse: + try: + session = usecase.execute(CreateSandboxCommand(chat_id=request.chat_id)) + except SandboxStartError as exc: + raise HTTPException( + status_code=status.HTTP_503_SERVICE_UNAVAILABLE, + detail=str(exc), + ) from exc + except SandboxError as exc: + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=str(exc), + ) from exc + + 
return _to_sandbox_session_response(session) + + +def _to_sandbox_session_response(session: SandboxSession) -> SandboxSessionResponse: + return SandboxSessionResponse( + session_id=session.session_id, + chat_id=session.chat_id, + container_id=session.container_id, + status=session.status.value, + expires_at=session.expires_at, + ) diff --git a/adapter/http/fastapi/schemas.py b/adapter/http/fastapi/schemas.py index e11c95b..08d9056 100644 --- a/adapter/http/fastapi/schemas.py +++ b/adapter/http/fastapi/schemas.py @@ -1,4 +1,6 @@ -from pydantic import BaseModel +from datetime import datetime + +from pydantic import BaseModel, ConfigDict, Field class HealthResponse(BaseModel): @@ -7,6 +9,20 @@ class HealthResponse(BaseModel): env: str +class CreateSandboxRequest(BaseModel): + model_config = ConfigDict(extra='forbid', str_strip_whitespace=True) + + chat_id: str = Field(min_length=1) + + +class SandboxSessionResponse(BaseModel): + session_id: str + chat_id: str + container_id: str + status: str + expires_at: datetime + + class UserResponse(BaseModel): id: str email: str diff --git a/tasks.md b/tasks.md index e8c293c..49c99a6 100644 --- a/tasks.md +++ b/tasks.md @@ -86,7 +86,7 @@ ### M06. HTTP endpoint `POST /api/v1/create` - Субагент: `feature-developer` -- Статус: pending +- Статус: completed - Зависимости: `M04` - Commit required: no - Scope: добавить минимальную HTTP ручку для создания или переиспользования sandbox без auth diff --git a/usecase/interface.py b/usecase/interface.py index fcc1fe6..0c8bcaa 100644 --- a/usecase/interface.py +++ b/usecase/interface.py @@ -34,6 +34,7 @@ class SandboxRuntime(Protocol): *, session_id: str, chat_id: str, + created_at: datetime, expires_at: datetime, ) -> SandboxSession: ... 
diff --git a/usecase/sandbox.py b/usecase/sandbox.py index ae60946..65740ef 100644 --- a/usecase/sandbox.py +++ b/usecase/sandbox.py @@ -53,10 +53,12 @@ class CreateSandbox: self._runtime.stop(session.container_id) self._repository.delete(session.session_id) + expires_at = now + self._ttl new_session = self._runtime.create( session_id=_new_session_id(), chat_id=command.chat_id, - expires_at=now + self._ttl, + created_at=now, + expires_at=expires_at, ) self._repository.save(new_session) self._logger.info( From 3a7973accdb0895f2a33794d57451e641e6d58e3 Mon Sep 17 00:00:00 2001 From: Azamat Date: Thu, 2 Apr 2026 14:09:41 +0300 Subject: [PATCH 08/30] ref #8: [feat] add config for docker daemon --- adapter/config/loader.py | 11 +++++++++++ adapter/config/model.py | 6 ++++++ adapter/di/container.py | 2 +- config/app.yaml | 3 +++ 4 files changed, 21 insertions(+), 1 deletion(-) diff --git a/adapter/config/loader.py b/adapter/config/loader.py index 0cb4b6b..f33b908 100644 --- a/adapter/config/loader.py +++ b/adapter/config/loader.py @@ -8,6 +8,7 @@ from dotenv import dotenv_values from .model import ( AppConfig, AppSectionConfig, + DockerConfig, HttpConfig, LoggingConfig, MetricsConfig, @@ -39,6 +40,7 @@ def load_config( logging_section = _section(yaml_data, 'logging') metrics_section = _section(yaml_data, 'metrics') tracing_section = _section(yaml_data, 'tracing') + docker_section = _section(yaml_data, 'docker') sandbox_section = _section(yaml_data, 'sandbox') security_section = _section(yaml_data, 'security') @@ -130,6 +132,15 @@ def load_config( enable_metrics=metrics_enabled, enable_tracing=tracing_enabled, ), + docker=DockerConfig( + base_url=_yaml_or_env_str( + docker_section, + 'base_url', + 'docker.base_url', + env_values, + 'APP_DOCKER_BASE_URL', + ) + ), sandbox=_load_sandbox_config(sandbox_section, env_values), security=SecurityConfig( token_header=_yaml_or_env_str( diff --git a/adapter/config/model.py b/adapter/config/model.py index ca18347..3a8e70d 100644 --- 
a/adapter/config/model.py +++ b/adapter/config/model.py @@ -40,6 +40,11 @@ class OtelConfig: metric_export_interval: int +@dataclass(frozen=True, slots=True) +class DockerConfig: + base_url: str + + @dataclass(frozen=True, slots=True) class SandboxConfig: image: str @@ -68,5 +73,6 @@ class AppConfig: metrics: MetricsConfig tracing: TracingConfig otel: OtelConfig + docker: DockerConfig sandbox: SandboxConfig security: SecurityConfig diff --git a/adapter/di/container.py b/adapter/di/container.py index c28fbfa..592cf6e 100644 --- a/adapter/di/container.py +++ b/adapter/di/container.py @@ -78,8 +78,8 @@ def build_container( ) observability = build_observability(app_config) - docker_client: DockerClient = docker.from_env() clock = SystemClock() + docker_client = docker.DockerClient(base_url=app_config.docker.base_url) user_repository = InMemoryUserRepository( observability.tracer, [User(id='123', email='aza@gglamer.ru', name='gglamer')] diff --git a/config/app.yaml b/config/app.yaml index 2de4c27..0e729db 100644 --- a/config/app.yaml +++ b/config/app.yaml @@ -24,6 +24,9 @@ otel: traces_endpoint: http://localhost:4318/v1/traces metric_export_interval: 1000 +docker: + base_url: unix:///var/run/docker.sock + sandbox: image: ai-agent:latest ttl_seconds: 300 From fb974fff1ec958f3743bcf2d687e9162958ba58c Mon Sep 17 00:00:00 2001 From: Azamat Date: Thu, 2 Apr 2026 20:28:14 +0300 Subject: [PATCH 09/30] ref #9: [feat] add tests --- .gitignore | 2 + tasks.md | 71 +++++++- test/test_create_http.py | 332 +++++++++++++++++++++++++++++++++++ test/test_docker_runtime.py | 212 ++++++++++++++++++++++ test/test_sandbox_usecase.py | 284 ++++++++++++++++++++++++++++++ 5 files changed, 899 insertions(+), 2 deletions(-) create mode 100644 test/test_create_http.py create mode 100644 test/test_docker_runtime.py create mode 100644 test/test_sandbox_usecase.py diff --git a/.gitignore b/.gitignore index b304a23..35df624 100644 --- a/.gitignore +++ b/.gitignore @@ -14,3 +14,5 @@ wheels/ !docs 
!AGENTS.md !tasks.md + +opencode.json diff --git a/tasks.md b/tasks.md index 49c99a6..f624b35 100644 --- a/tasks.md +++ b/tasks.md @@ -98,7 +98,7 @@ ### M07. Тесты для create, reuse, TTL и mount policy - Субагент: `test-engineer` -- Статус: pending +- Статус: completed - Зависимости: `M03`, `M04`, `M05`, `M06` - Commit required: no - Scope: покрыть тестами ключевое поведение MVP без запуска реального production Docker stack @@ -108,9 +108,76 @@ ### M08. Архитектурный и boundary review по MVP sandbox - Субагент: `code-reviewer` -- Статус: pending +- Статус: completed - Зависимости: `M07` - Commit required: no - Scope: проверить соблюдение clean architecture, dependency direction и соответствие MVP-ограничениям - Файлы: весь измененный код - Критерии приемки: Docker остается только во внешнем adapter; FastAPI не протекает в `domain/` и `usecase/`; TTL и mount policy читаются как явные, тестируемые правила; замечания сформулированы как точечные правки или подтверждение готовности + +## Follow-up после M08 review + +### M09. Сериализация lifecycle sandbox по `chat_id` + +- Субагент: `feature-developer` +- Статус: pending +- Зависимости: `M08` +- Commit required: no +- Scope: убрать гонки между параллельными `create` и cleanup для одного `chat_id` +- Файлы: `usecase/interface.py`, `usecase/sandbox.py`, `repository/sandbox_lock.py` или другой outer-layer lock implementation, `adapter/di/container.py` +- Решение: ввести явный usecase-port для process-local lock по `chat_id`; outer-layer реализация держит per-chat lock registry; `CreateSandbox` и `CleanupExpiredSandboxes` выполняют мутации сессии под этим lock +- Критерии приемки: для одного `chat_id` не поднимаются два sandbox при concurrent create; create-vs-cleanup не оставляет orphan container; locking не протекает в HTTP и Docker adapter как бизнес-логика + +### M10. 
Устойчивый cleanup и вынос blocking cleanup из event loop + +- Субагент: `feature-developer` +- Статус: pending +- Зависимости: `M09` +- Commit required: no +- Scope: сделать cleanup устойчивым к частичным ошибкам и не блокировать FastAPI event loop синхронным Docker stop +- Файлы: `usecase/sandbox.py`, `adapter/http/fastapi/app.py` +- Решение: `CleanupExpiredSandboxes` обрабатывает stop/delete по каждой сессии отдельно и продолжает batch; HTTP cleanup loop выносит blocking cleanup work в thread через adapter-layer orchestration +- Критерии приемки: ошибка на одной expired session не мешает чистить остальные; background cleanup loop не умирает после ошибки; blocking cleanup больше не выполняется прямо в event loop + +### M11. Удаление не-MVP user surface из приложения + +- Субагент: `feature-developer` +- Статус: pending +- Зависимости: `M08` +- Commit required: no +- Scope: убрать из runtime app неотносящиеся к MVP user endpoint и seed user wiring +- Файлы: `adapter/http/fastapi/routers/v1/router.py`, `adapter/http/fastapi/dependencies.py`, `adapter/http/fastapi/schemas.py`, `adapter/di/container.py` +- Решение: оставить в MVP только `health` и sandbox API; примерный user code может остаться в репозитории как template, но не должен быть подключен в runtime app +- Критерии приемки: `GET /api/v1/users/{user_id}` больше не опубликован; container не создает seeded user repository/usecase для runtime app; незапрошенная user-surface area исчезает + +### M12. 
Регрессионные тесты на race conditions и cleanup resilience + +- Субагент: `test-engineer` +- Статус: pending +- Зависимости: `M09`, `M10`, `M11` +- Commit required: no +- Scope: добавить тесты на новые гарантии после review fixes +- Файлы: `test/*` +- Критерии приемки: есть тест на duplicate create для одного `chat_id`; есть тест на create-vs-cleanup race или эквивалентную сериализацию; есть тест, что cleanup продолжает batch после stop failure; HTTP smoke/regression тесты обновлены под удаление user endpoint + +### M13. Повторный boundary review после fix-pass + +- Субагент: `code-reviewer` +- Статус: pending +- Зависимости: `M12` +- Commit required: no +- Scope: проверить, что must-fix и should-fix замечания из `M08` закрыты без нарушения clean architecture +- Файлы: весь измененный код после `M09`-`M12` +- Критерии приемки: нет гонки на one-sandbox-per-chat; cleanup не блокирует event loop и не валится на первом stop failure; runtime app не публикует лишний user API; замечания сведены к minor или отсутствуют + +### M14. 
Починка mypy-типизации тестов после sandbox MVP + +- Субагент: `test-engineer` +- Статус: completed +- Зависимости: `M07` +- Commit required: no +- Scope: устранить текущие ошибки `make pre-commit` в test-suite без изменения production behavior +- Файлы: `test/test_docker_runtime.py`, `test/test_create_http.py`, при необходимости общие test helpers в `test/*` +- Ошибки: несовместимый fake Docker client для `DockerSandboxRuntime`, неточная типизация `run_calls` и ASGI message payload, использование `object` вместо типизированных test doubles для `AppRepositories`, `AppUsecases`, `AppContainer` +- Решение: сделать test doubles типизированными через совместимые fake classes или локальные protocols; убрать `object` и неиндексируемые `dict[str, object]` там, где mypy не может вывести типы +- Критерии приемки: `uv run mypy .` проходит; `make pre-commit` доходит как минимум до pytest stage; production code не меняется или меняется только при явной необходимости для testability diff --git a/test/test_create_http.py b/test/test_create_http.py new file mode 100644 index 0000000..bf910e0 --- /dev/null +++ b/test/test_create_http.py @@ -0,0 +1,332 @@ +import asyncio +import json +from datetime import UTC, datetime, timedelta + +from docker import DockerClient +from fastapi import FastAPI +from starlette.types import Message, Scope + +from adapter.config.model import ( + AppConfig, + AppSectionConfig, + DockerConfig, + HttpConfig, + LoggingConfig, + MetricsConfig, + OtelConfig, + SandboxConfig, + SecurityConfig, + TracingConfig, +) +from adapter.di.container import AppContainer, AppRepositories, AppUsecases +from adapter.http.fastapi import app as app_module +from adapter.observability.noop import NoopMetrics, NoopTracer +from adapter.observability.runtime import ObservabilityRuntime +from domain.error import SandboxError, SandboxStartError +from domain.sandbox import SandboxSession, SandboxStatus +from repository.sandbox_session import InMemorySandboxSessionRepository +from 
repository.user import InMemoryUserRepository +from usecase.interface import Attrs +from usecase.sandbox import CleanupExpiredSandboxes, CreateSandbox, CreateSandboxCommand +from usecase.user import GetUser + + +class FakeLogger: + def __init__(self) -> None: + self.messages: list[tuple[str, str, Attrs | None]] = [] + + def debug(self, message: str, attrs: Attrs | None = None) -> None: + self.messages.append(('debug', message, attrs)) + + def info(self, message: str, attrs: Attrs | None = None) -> None: + self.messages.append(('info', message, attrs)) + + def warning(self, message: str, attrs: Attrs | None = None) -> None: + self.messages.append(('warning', message, attrs)) + + def error(self, message: str, attrs: Attrs | None = None) -> None: + self.messages.append(('error', message, attrs)) + + +class FakeCreateSandboxUsecase(CreateSandbox): + def __init__( + self, session: SandboxSession | None = None, error: Exception | None = None + ) -> None: + self._session = session + self._error = error + self.commands: list[CreateSandboxCommand] = [] + + def execute(self, command: CreateSandboxCommand) -> SandboxSession: + self.commands.append(command) + if self._error is not None: + raise self._error + if self._session is None: + raise AssertionError('missing session') + return self._session + + +class FakeCleanupExpiredSandboxes(CleanupExpiredSandboxes): + def __init__(self) -> None: + self.calls = 0 + + def execute(self) -> list[SandboxSession]: + self.calls += 1 + return [] + + +class FakeDockerClient(DockerClient): + def __init__(self) -> None: + self.close_calls = 0 + + def close(self) -> None: + self.close_calls += 1 + + +def build_config() -> AppConfig: + return AppConfig( + app=AppSectionConfig(name='master', env='test'), + http=HttpConfig(host='127.0.0.1', port=8000), + logging=LoggingConfig( + level='INFO', output='stdout', format='json', file_path=None + ), + metrics=MetricsConfig(enabled=False), + tracing=TracingConfig(enabled=False), + otel=OtelConfig( + 
service_name='master', + logs_endpoint='http://localhost:4318/v1/logs', + metrics_endpoint='http://localhost:4318/v1/metrics', + traces_endpoint='http://localhost:4318/v1/traces', + metric_export_interval=1000, + ), + docker=DockerConfig(base_url='unix:///var/run/docker.sock'), + sandbox=SandboxConfig( + image='sandbox:latest', + ttl_seconds=300, + cleanup_interval_seconds=60, + chats_root='/tmp/chats', + dependencies_host_path='/tmp/dependencies', + lambda_tools_host_path='/tmp/lambda-tools', + chat_mount_path='/workspace/chat', + dependencies_mount_path='/workspace/dependencies', + lambda_tools_mount_path='/workspace/lambda-tools', + ), + security=SecurityConfig( + token_header='Authorization', + api_token='token', + signing_key='signing-key', + ), + ) + + +def build_container( + config: AppConfig, + create_sandbox_usecase: FakeCreateSandboxUsecase, + cleanup_usecase: FakeCleanupExpiredSandboxes, + logger: FakeLogger, + docker_client: FakeDockerClient, +) -> AppContainer: + observability = ObservabilityRuntime( + logger=logger, + metrics=NoopMetrics(), + tracer=NoopTracer(), + ) + repositories = AppRepositories( + user=InMemoryUserRepository(NoopTracer()), + sandbox_session=InMemorySandboxSessionRepository(), + ) + usecases = AppUsecases( + get_user=GetUser( + repository=repositories.user, + logger=logger, + tracer=NoopTracer(), + ), + create_sandbox=create_sandbox_usecase, + cleanup_expired_sandboxes=cleanup_usecase, + ) + return AppContainer( + config=config, + observability=observability, + repositories=repositories, + usecases=usecases, + _docker_client=docker_client, + ) + + +async def post_json( + app: FastAPI, path: str, payload: dict[str, str] +) -> tuple[int, dict[str, object]]: + body = json.dumps(payload).encode() + messages: list[Message] = [] + request_sent = False + + async def receive() -> Message: + nonlocal request_sent + if request_sent: + await asyncio.sleep(0) + return {'type': 'http.disconnect'} + + request_sent = True + return { + 'type': 
'http.request', + 'body': body, + 'more_body': False, + } + + async def send(message: Message) -> None: + messages.append(message) + + scope: Scope = { + 'type': 'http', + 'asgi': {'version': '3.0'}, + 'http_version': '1.1', + 'method': 'POST', + 'scheme': 'http', + 'path': path, + 'raw_path': path.encode(), + 'query_string': b'', + 'root_path': '', + 'headers': [ + (b'host', b'testserver'), + (b'content-type', b'application/json'), + (b'content-length', str(len(body)).encode()), + ], + 'client': ('testclient', 50000), + 'server': ('testserver', 80), + 'state': {}, + } + + await app(scope, receive, send) + + status = 500 + response_body = b'' + for message in messages: + if message['type'] == 'http.response.start': + status = int(message['status']) + if message['type'] == 'http.response.body': + response_body += bytes(message.get('body', b'')) + + return status, json.loads(response_body.decode()) + + +async def exercise_create_request( + app: FastAPI, + payload: dict[str, str], +) -> tuple[int, dict[str, object]]: + await app.router.startup() + try: + status, response = await post_json(app, '/api/v1/create', payload) + await asyncio.sleep(0) + return status, response + finally: + await app.router.shutdown() + + +def test_post_create_returns_session(monkeypatch) -> None: + config = build_config() + expires_at = datetime(2026, 4, 2, 12, 5, tzinfo=UTC) + session = SandboxSession( + session_id='session-123', + chat_id='chat-123', + container_id='container-123', + status=SandboxStatus.RUNNING, + created_at=expires_at - timedelta(minutes=5), + expires_at=expires_at, + ) + logger = FakeLogger() + create_usecase = FakeCreateSandboxUsecase(session=session) + cleanup_usecase = FakeCleanupExpiredSandboxes() + docker_client = FakeDockerClient() + container = build_container( + config, + create_usecase, + cleanup_usecase, + logger, + docker_client, + ) + monkeypatch.setattr(app_module, 'build_container', lambda **kwargs: container) + monkeypatch.setattr( + 
app_module.FastAPIInstrumentor, 'instrument_app', lambda *args, **kwargs: None + ) + + app = app_module.create_app(config=config) + + status_code, response = asyncio.run( + exercise_create_request(app, {'chat_id': 'chat-123'}) + ) + + assert status_code == 200 + assert response == { + 'session_id': 'session-123', + 'chat_id': 'chat-123', + 'container_id': 'container-123', + 'status': 'running', + 'expires_at': '2026-04-02T12:05:00Z', + } + assert len(create_usecase.commands) == 1 + assert create_usecase.commands[0].chat_id == 'chat-123' + assert cleanup_usecase.calls >= 1 + assert any( + message == 'http_request' + and attrs is not None + and attrs['http.path'] == '/api/v1/create' + for _, message, attrs in logger.messages + ) + assert docker_client.close_calls == 1 + + +def test_post_create_maps_start_errors_to_service_unavailable(monkeypatch) -> None: + config = build_config() + logger = FakeLogger() + create_usecase = FakeCreateSandboxUsecase(error=SandboxStartError('chat-123')) + cleanup_usecase = FakeCleanupExpiredSandboxes() + docker_client = FakeDockerClient() + container = build_container( + config, + create_usecase, + cleanup_usecase, + logger, + docker_client, + ) + monkeypatch.setattr(app_module, 'build_container', lambda **kwargs: container) + monkeypatch.setattr( + app_module.FastAPIInstrumentor, 'instrument_app', lambda *args, **kwargs: None + ) + + app = app_module.create_app(config=config) + + status_code, response = asyncio.run( + exercise_create_request(app, {'chat_id': 'chat-123'}) + ) + + assert status_code == 503 + assert response == {'detail': 'sandbox_start_failed'} + assert docker_client.close_calls == 1 + + +def test_post_create_maps_generic_sandbox_errors_to_internal_error(monkeypatch) -> None: + config = build_config() + logger = FakeLogger() + create_usecase = FakeCreateSandboxUsecase(error=SandboxError('sandbox_broken')) + cleanup_usecase = FakeCleanupExpiredSandboxes() + docker_client = FakeDockerClient() + container = build_container( 
+ config, + create_usecase, + cleanup_usecase, + logger, + docker_client, + ) + monkeypatch.setattr(app_module, 'build_container', lambda **kwargs: container) + monkeypatch.setattr( + app_module.FastAPIInstrumentor, 'instrument_app', lambda *args, **kwargs: None + ) + + app = app_module.create_app(config=config) + + status_code, response = asyncio.run( + exercise_create_request(app, {'chat_id': 'chat-123'}) + ) + + assert status_code == 500 + assert response == {'detail': 'sandbox_broken'} + assert docker_client.close_calls == 1 diff --git a/test/test_docker_runtime.py b/test/test_docker_runtime.py new file mode 100644 index 0000000..338fe1f --- /dev/null +++ b/test/test_docker_runtime.py @@ -0,0 +1,212 @@ +from datetime import UTC, datetime, timedelta +from pathlib import Path +from typing import Any, TypedDict + +import pytest +from docker import DockerClient +from docker.errors import DockerException, NotFound +from docker.types import Mount + +from adapter.config.model import SandboxConfig +from adapter.docker.runtime import DockerSandboxRuntime +from domain.error import SandboxError, SandboxStartError +from domain.sandbox import SandboxStatus + + +class FakeContainer: + def __init__(self, container_id: str) -> None: + self.id = container_id + self.stop_calls = 0 + + def stop(self) -> None: + self.stop_calls += 1 + + +class RunKwargs(TypedDict): + detach: bool + labels: dict[str, str] + mounts: list[Mount] + + +class RunCall(TypedDict): + args: tuple[str] + kwargs: RunKwargs + + +class FakeContainers: + def __init__(self, run_result: FakeContainer | None = None) -> None: + self.run_calls: list[RunCall] = [] + self.get_calls: list[str] = [] + self.run_result = run_result or FakeContainer('container-123') + self.get_result: FakeContainer | Exception | None = None + + def run( + self, + image: str, + *, + detach: bool, + labels: dict[str, str], + mounts: list[Mount], + ) -> FakeContainer: + self.run_calls.append( + { + 'args': (image,), + 'kwargs': { + 'detach': 
detach, + 'labels': labels, + 'mounts': mounts, + }, + } + ) + return self.run_result + + def get(self, container_id: str) -> FakeContainer: + self.get_calls.append(container_id) + if isinstance(self.get_result, Exception): + raise self.get_result + if self.get_result is None: + raise AssertionError('missing get result') + return self.get_result + + +class FakeDockerClient(DockerClient): + def __init__(self, containers: FakeContainers) -> None: + self._containers = containers + + @property + def containers(self) -> Any: + return self._containers + + +def build_config(tmp_path: Path) -> SandboxConfig: + return SandboxConfig( + image='sandbox:latest', + ttl_seconds=300, + cleanup_interval_seconds=60, + chats_root=str(tmp_path / 'chats'), + dependencies_host_path=str(tmp_path / 'dependencies'), + lambda_tools_host_path=str(tmp_path / 'lambda-tools'), + chat_mount_path='/workspace/chat', + dependencies_mount_path='/workspace/dependencies', + lambda_tools_mount_path='/workspace/lambda-tools', + ) + + +def test_runtime_create_applies_mount_policy_and_labels(tmp_path: Path) -> None: + config = build_config(tmp_path) + (tmp_path / 'dependencies').mkdir() + (tmp_path / 'lambda-tools').mkdir() + containers = FakeContainers() + runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) + created_at = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + expires_at = created_at + timedelta(minutes=5) + + session = runtime.create( + session_id='session-123', + chat_id='chat-123', + created_at=created_at, + expires_at=expires_at, + ) + + assert session.session_id == 'session-123' + assert session.chat_id == 'chat-123' + assert session.container_id == 'container-123' + assert session.status is SandboxStatus.RUNNING + assert session.created_at == created_at + assert session.expires_at == expires_at + assert (tmp_path / 'chats' / 'chat-123').is_dir() + + call = containers.run_calls[0] + assert call['args'] == ('sandbox:latest',) + assert call['kwargs']['detach'] is True + assert 
call['kwargs']['labels'] == { + 'session_id': 'session-123', + 'chat_id': 'chat-123', + 'expires_at': expires_at.isoformat(), + } + + mounts = call['kwargs']['mounts'] + assert [dict(mount) for mount in mounts] == [ + { + 'Target': '/workspace/chat', + 'Source': str((tmp_path / 'chats' / 'chat-123').resolve(strict=False)), + 'Type': 'bind', + 'ReadOnly': False, + }, + { + 'Target': '/workspace/dependencies', + 'Source': str((tmp_path / 'dependencies').resolve(strict=False)), + 'Type': 'bind', + 'ReadOnly': True, + }, + { + 'Target': '/workspace/lambda-tools', + 'Source': str((tmp_path / 'lambda-tools').resolve(strict=False)), + 'Type': 'bind', + 'ReadOnly': True, + }, + ] + + +def test_runtime_create_raises_start_error_when_container_id_is_missing( + tmp_path: Path, +) -> None: + config = build_config(tmp_path) + (tmp_path / 'dependencies').mkdir() + (tmp_path / 'lambda-tools').mkdir() + containers = FakeContainers(run_result=FakeContainer('')) + runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) + + with pytest.raises(SandboxStartError) as excinfo: + runtime.create( + session_id='session-123', + chat_id='chat-123', + created_at=datetime(2026, 4, 2, 12, 0, tzinfo=UTC), + expires_at=datetime(2026, 4, 2, 12, 5, tzinfo=UTC), + ) + + assert str(excinfo.value) == 'sandbox_start_failed' + assert excinfo.value.chat_id == 'chat-123' + + +def test_runtime_stop_ignores_missing_container(tmp_path: Path) -> None: + config = build_config(tmp_path) + containers = FakeContainers() + containers.get_result = NotFound('missing') + runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) + + runtime.stop('container-123') + + assert containers.get_calls == ['container-123'] + + +def test_runtime_stop_wraps_docker_errors(tmp_path: Path) -> None: + config = build_config(tmp_path) + containers = FakeContainers() + containers.get_result = DockerException('boom') + runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) + + with 
pytest.raises(SandboxError) as excinfo: + runtime.stop('container-123') + + assert str(excinfo.value) == 'sandbox_stop_failed' + + +def test_runtime_create_rejects_chat_path_traversal(tmp_path: Path) -> None: + config = build_config(tmp_path) + (tmp_path / 'dependencies').mkdir() + (tmp_path / 'lambda-tools').mkdir() + containers = FakeContainers() + runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) + + with pytest.raises(SandboxStartError) as excinfo: + runtime.create( + session_id='session-123', + chat_id='../escape', + created_at=datetime(2026, 4, 2, 12, 0, tzinfo=UTC), + expires_at=datetime(2026, 4, 2, 12, 5, tzinfo=UTC), + ) + + assert str(excinfo.value) == 'sandbox_start_failed' + assert excinfo.value.chat_id == '../escape' + assert containers.run_calls == [] diff --git a/test/test_sandbox_usecase.py b/test/test_sandbox_usecase.py new file mode 100644 index 0000000..b050b69 --- /dev/null +++ b/test/test_sandbox_usecase.py @@ -0,0 +1,284 @@ +from datetime import UTC, datetime, timedelta + +from domain.sandbox import SandboxSession, SandboxStatus +from repository.sandbox_session import InMemorySandboxSessionRepository +from usecase.sandbox import CleanupExpiredSandboxes, CreateSandbox, CreateSandboxCommand + + +class FakeClock: + def __init__(self, now: datetime) -> None: + self._now = now + + def now(self) -> datetime: + return self._now + + +class FakeLogger: + def __init__(self) -> None: + self.messages: list[ + tuple[str, str, dict[str, str | int | float | bool] | None] + ] = [] + + def debug(self, message: str, attrs=None) -> None: + self.messages.append(('debug', message, attrs)) + + def info(self, message: str, attrs=None) -> None: + self.messages.append(('info', message, attrs)) + + def warning(self, message: str, attrs=None) -> None: + self.messages.append(('warning', message, attrs)) + + def error(self, message: str, attrs=None) -> None: + self.messages.append(('error', message, attrs)) + + +class FakeRuntime: + def __init__(self) 
-> None: + self.create_calls: list[dict[str, object]] = [] + self.stop_calls: list[str] = [] + + def create( + self, + *, + session_id: str, + chat_id: str, + created_at: datetime, + expires_at: datetime, + ) -> SandboxSession: + self.create_calls.append( + { + 'session_id': session_id, + 'chat_id': chat_id, + 'created_at': created_at, + 'expires_at': expires_at, + } + ) + return SandboxSession( + session_id=session_id, + chat_id=chat_id, + container_id=f'container-{session_id}', + status=SandboxStatus.RUNNING, + created_at=created_at, + expires_at=expires_at, + ) + + def stop(self, container_id: str) -> None: + self.stop_calls.append(container_id) + + +def test_create_sandbox_reuses_active_session_when_not_expired() -> None: + now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + session = SandboxSession( + session_id='session-1', + chat_id='chat-1', + container_id='container-1', + status=SandboxStatus.RUNNING, + created_at=now - timedelta(minutes=1), + expires_at=now + timedelta(minutes=4), + ) + repository = InMemorySandboxSessionRepository() + repository.save(session) + runtime = FakeRuntime() + logger = FakeLogger() + usecase = CreateSandbox( + repository=repository, + runtime=runtime, + clock=FakeClock(now), + logger=logger, + ttl=timedelta(minutes=5), + ) + + result = usecase.execute(CreateSandboxCommand(chat_id='chat-1')) + + assert result == session + assert runtime.create_calls == [] + assert runtime.stop_calls == [] + assert repository.get_active_by_chat_id('chat-1') == session + assert logger.messages == [ + ( + 'info', + 'sandbox_reused', + { + 'chat_id': 'chat-1', + 'session_id': 'session-1', + 'container_id': 'container-1', + }, + ) + ] + + +def test_create_sandbox_replaces_expired_session_and_creates_new_one( + monkeypatch, +) -> None: + now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + expired_session = SandboxSession( + session_id='session-old', + chat_id='chat-1', + container_id='container-old', + status=SandboxStatus.RUNNING, + created_at=now - 
timedelta(minutes=10), + expires_at=now, + ) + repository = InMemorySandboxSessionRepository() + repository.save(expired_session) + runtime = FakeRuntime() + logger = FakeLogger() + usecase = CreateSandbox( + repository=repository, + runtime=runtime, + clock=FakeClock(now), + logger=logger, + ttl=timedelta(minutes=5), + ) + monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: 'session-new') + + result = usecase.execute(CreateSandboxCommand(chat_id='chat-1')) + + assert runtime.stop_calls == ['container-old'] + assert runtime.create_calls == [ + { + 'session_id': 'session-new', + 'chat_id': 'chat-1', + 'created_at': now, + 'expires_at': now + timedelta(minutes=5), + } + ] + assert result == SandboxSession( + session_id='session-new', + chat_id='chat-1', + container_id='container-session-new', + status=SandboxStatus.RUNNING, + created_at=now, + expires_at=now + timedelta(minutes=5), + ) + assert repository.get_active_by_chat_id('chat-1') == result + assert logger.messages == [ + ( + 'info', + 'sandbox_replaced', + { + 'chat_id': 'chat-1', + 'session_id': 'session-old', + 'container_id': 'container-old', + }, + ), + ( + 'info', + 'sandbox_created', + { + 'chat_id': 'chat-1', + 'session_id': 'session-new', + 'container_id': 'container-session-new', + }, + ), + ] + + +def test_create_sandbox_creates_new_session_when_none_exists() -> None: + now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + repository = InMemorySandboxSessionRepository() + runtime = FakeRuntime() + logger = FakeLogger() + usecase = CreateSandbox( + repository=repository, + runtime=runtime, + clock=FakeClock(now), + logger=logger, + ttl=timedelta(minutes=5), + ) + + result = usecase.execute(CreateSandboxCommand(chat_id='chat-1')) + + assert result.chat_id == 'chat-1' + assert result.container_id == f'container-{result.session_id}' + assert result.status is SandboxStatus.RUNNING + assert result.created_at == now + assert result.expires_at == now + timedelta(minutes=5) + assert 
len(runtime.create_calls) == 1 + assert runtime.create_calls[0] == { + 'session_id': result.session_id, + 'chat_id': 'chat-1', + 'created_at': now, + 'expires_at': now + timedelta(minutes=5), + } + assert runtime.stop_calls == [] + assert repository.get_active_by_chat_id('chat-1') == result + assert logger.messages == [ + ( + 'info', + 'sandbox_created', + { + 'chat_id': 'chat-1', + 'session_id': result.session_id, + 'container_id': result.container_id, + }, + ) + ] + + +def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() -> None: + now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + expired_session = SandboxSession( + session_id='session-expired', + chat_id='chat-expired', + container_id='container-expired', + status=SandboxStatus.RUNNING, + created_at=now - timedelta(minutes=10), + expires_at=now - timedelta(seconds=1), + ) + boundary_session = SandboxSession( + session_id='session-boundary', + chat_id='chat-boundary', + container_id='container-boundary', + status=SandboxStatus.RUNNING, + created_at=now - timedelta(minutes=5), + expires_at=now, + ) + active_session = SandboxSession( + session_id='session-active', + chat_id='chat-active', + container_id='container-active', + status=SandboxStatus.RUNNING, + created_at=now - timedelta(minutes=1), + expires_at=now + timedelta(minutes=5), + ) + repository = InMemorySandboxSessionRepository() + repository.save(expired_session) + repository.save(boundary_session) + repository.save(active_session) + runtime = FakeRuntime() + logger = FakeLogger() + usecase = CleanupExpiredSandboxes( + repository=repository, + runtime=runtime, + clock=FakeClock(now), + logger=logger, + ) + + result = usecase.execute() + + assert result == [expired_session, boundary_session] + assert runtime.stop_calls == ['container-expired', 'container-boundary'] + assert repository.get_active_by_chat_id('chat-expired') is None + assert repository.get_active_by_chat_id('chat-boundary') is None + assert 
repository.get_active_by_chat_id('chat-active') == active_session + assert logger.messages == [ + ( + 'info', + 'sandbox_cleaned', + { + 'chat_id': 'chat-expired', + 'session_id': 'session-expired', + 'container_id': 'container-expired', + }, + ), + ( + 'info', + 'sandbox_cleaned', + { + 'chat_id': 'chat-boundary', + 'session_id': 'session-boundary', + 'container_id': 'container-boundary', + }, + ), + ] From f5d13feaf97bee5a44467cc8c593c4d03400ca9c Mon Sep 17 00:00:00 2001 From: Azamat Date: Thu, 2 Apr 2026 20:56:26 +0300 Subject: [PATCH 10/30] [fix] race condition --- adapter/di/container.py | 4 ++ repository/sandbox_lock.py | 43 ++++++++++++ repository/sandbox_session.py | 28 +++++--- tasks.md | 2 +- test/test_sandbox_usecase.py | 29 ++++++++ usecase/interface.py | 15 ++++ usecase/sandbox.py | 127 ++++++++++++++++++++-------------- 7 files changed, 185 insertions(+), 63 deletions(-) create mode 100644 repository/sandbox_lock.py diff --git a/adapter/di/container.py b/adapter/di/container.py index 592cf6e..55f95a0 100644 --- a/adapter/di/container.py +++ b/adapter/di/container.py @@ -12,6 +12,7 @@ from adapter.docker.runtime import DockerSandboxRuntime from adapter.observability.factory import build_observability from adapter.observability.runtime import ObservabilityRuntime from domain.user import User +from repository.sandbox_lock import ProcessLocalSandboxLifecycleLocker from repository.sandbox_session import InMemorySandboxSessionRepository from repository.user import InMemoryUserRepository from usecase.interface import Clock @@ -85,6 +86,7 @@ def build_container( observability.tracer, [User(id='123', email='aza@gglamer.ru', name='gglamer')] ) sandbox_repository = InMemorySandboxSessionRepository() + sandbox_locker = ProcessLocalSandboxLifecycleLocker() sandbox_runtime = DockerSandboxRuntime(app_config.sandbox, docker_client) repositories = AppRepositories( @@ -99,6 +101,7 @@ def build_container( ), create_sandbox=CreateSandbox( repository=sandbox_repository, + 
locker=sandbox_locker, runtime=sandbox_runtime, clock=clock, logger=observability.logger, @@ -106,6 +109,7 @@ def build_container( ), cleanup_expired_sandboxes=CleanupExpiredSandboxes( repository=sandbox_repository, + locker=sandbox_locker, runtime=sandbox_runtime, clock=clock, logger=observability.logger, diff --git a/repository/sandbox_lock.py b/repository/sandbox_lock.py new file mode 100644 index 0000000..704aeae --- /dev/null +++ b/repository/sandbox_lock.py @@ -0,0 +1,43 @@ +import threading +from types import TracebackType +from typing import Protocol + +from usecase.interface import LockContext, SandboxLifecycleLocker + + +class _SyncLock(Protocol): + def acquire(self, blocking: bool = True, timeout: float = -1) -> bool: ... + + def release(self) -> None: ... + + +class _ChatLock(LockContext): + def __init__(self, lock: _SyncLock) -> None: + self._lock = lock + + def __enter__(self) -> None: + self._lock.acquire() + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + traceback: TracebackType | None, + ) -> bool | None: + self._lock.release() + return None + + +class ProcessLocalSandboxLifecycleLocker(SandboxLifecycleLocker): + def __init__(self) -> None: + self._registry_lock = threading.Lock() + self._locks_by_chat_id: dict[str, _SyncLock] = {} + + def lock(self, chat_id: str) -> LockContext: + with self._registry_lock: + lock = self._locks_by_chat_id.get(chat_id) + if lock is None: + lock = threading.Lock() + self._locks_by_chat_id[chat_id] = lock + + return _ChatLock(lock) diff --git a/repository/sandbox_session.py b/repository/sandbox_session.py index 9b23cd7..6707d0c 100644 --- a/repository/sandbox_session.py +++ b/repository/sandbox_session.py @@ -1,3 +1,4 @@ +import threading from datetime import datetime from domain.sandbox import SandboxSession @@ -7,22 +8,27 @@ from usecase.interface import SandboxSessionRepository class InMemorySandboxSessionRepository(SandboxSessionRepository): def __init__(self) -> 
None: self._sessions_by_chat_id: dict[str, SandboxSession] = {} + self._lock = threading.Lock() def get_active_by_chat_id(self, chat_id: str) -> SandboxSession | None: - return self._sessions_by_chat_id.get(chat_id) + with self._lock: + return self._sessions_by_chat_id.get(chat_id) def list_expired(self, now: datetime) -> list[SandboxSession]: - return [ - session - for session in self._sessions_by_chat_id.values() - if session.expires_at <= now - ] + with self._lock: + return [ + session + for session in self._sessions_by_chat_id.values() + if session.expires_at <= now + ] def save(self, session: SandboxSession) -> None: - self._sessions_by_chat_id[session.chat_id] = session + with self._lock: + self._sessions_by_chat_id[session.chat_id] = session def delete(self, session_id: str) -> None: - for chat_id, session in tuple(self._sessions_by_chat_id.items()): - if session.session_id == session_id: - del self._sessions_by_chat_id[chat_id] - return + with self._lock: + for chat_id, session in tuple(self._sessions_by_chat_id.items()): + if session.session_id == session_id: + del self._sessions_by_chat_id[chat_id] + return diff --git a/tasks.md b/tasks.md index f624b35..75b4ce8 100644 --- a/tasks.md +++ b/tasks.md @@ -120,7 +120,7 @@ ### M09. 
Сериализация lifecycle sandbox по `chat_id` - Субагент: `feature-developer` -- Статус: pending +- Статус: completed - Зависимости: `M08` - Commit required: no - Scope: убрать гонки между параллельными `create` и cleanup для одного `chat_id` diff --git a/test/test_sandbox_usecase.py b/test/test_sandbox_usecase.py index b050b69..a58dd5c 100644 --- a/test/test_sandbox_usecase.py +++ b/test/test_sandbox_usecase.py @@ -32,6 +32,23 @@ class FakeLogger: self.messages.append(('error', message, attrs)) +class FakeLockContext: + def __enter__(self) -> None: + return None + + def __exit__(self, exc_type, exc, traceback) -> None: + return None + + +class FakeLocker: + def __init__(self) -> None: + self.chat_ids: list[str] = [] + + def lock(self, chat_id: str) -> FakeLockContext: + self.chat_ids.append(chat_id) + return FakeLockContext() + + class FakeRuntime: def __init__(self) -> None: self.create_calls: list[dict[str, object]] = [] @@ -80,8 +97,10 @@ def test_create_sandbox_reuses_active_session_when_not_expired() -> None: repository.save(session) runtime = FakeRuntime() logger = FakeLogger() + locker = FakeLocker() usecase = CreateSandbox( repository=repository, + locker=locker, runtime=runtime, clock=FakeClock(now), logger=logger, @@ -94,6 +113,7 @@ def test_create_sandbox_reuses_active_session_when_not_expired() -> None: assert runtime.create_calls == [] assert runtime.stop_calls == [] assert repository.get_active_by_chat_id('chat-1') == session + assert locker.chat_ids == ['chat-1'] assert logger.messages == [ ( 'info', @@ -123,8 +143,10 @@ def test_create_sandbox_replaces_expired_session_and_creates_new_one( repository.save(expired_session) runtime = FakeRuntime() logger = FakeLogger() + locker = FakeLocker() usecase = CreateSandbox( repository=repository, + locker=locker, runtime=runtime, clock=FakeClock(now), logger=logger, @@ -152,6 +174,7 @@ def test_create_sandbox_replaces_expired_session_and_creates_new_one( expires_at=now + timedelta(minutes=5), ) assert 
repository.get_active_by_chat_id('chat-1') == result + assert locker.chat_ids == ['chat-1'] assert logger.messages == [ ( 'info', @@ -179,8 +202,10 @@ def test_create_sandbox_creates_new_session_when_none_exists() -> None: repository = InMemorySandboxSessionRepository() runtime = FakeRuntime() logger = FakeLogger() + locker = FakeLocker() usecase = CreateSandbox( repository=repository, + locker=locker, runtime=runtime, clock=FakeClock(now), logger=logger, @@ -203,6 +228,7 @@ def test_create_sandbox_creates_new_session_when_none_exists() -> None: } assert runtime.stop_calls == [] assert repository.get_active_by_chat_id('chat-1') == result + assert locker.chat_ids == ['chat-1'] assert logger.messages == [ ( 'info', @@ -248,8 +274,10 @@ def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() -> repository.save(active_session) runtime = FakeRuntime() logger = FakeLogger() + locker = FakeLocker() usecase = CleanupExpiredSandboxes( repository=repository, + locker=locker, runtime=runtime, clock=FakeClock(now), logger=logger, @@ -262,6 +290,7 @@ def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() -> assert repository.get_active_by_chat_id('chat-expired') is None assert repository.get_active_by_chat_id('chat-boundary') is None assert repository.get_active_by_chat_id('chat-active') == active_session + assert locker.chat_ids == ['chat-expired', 'chat-boundary'] assert logger.messages == [ ( 'info', diff --git a/usecase/interface.py b/usecase/interface.py index 0c8bcaa..0c0e321 100644 --- a/usecase/interface.py +++ b/usecase/interface.py @@ -28,6 +28,21 @@ class SandboxSessionRepository(Protocol): def delete(self, session_id: str) -> None: ... +class LockContext(Protocol): + def __enter__(self) -> None: ... + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + traceback: TracebackType | None, + ) -> bool | None: ... 
+ + +class SandboxLifecycleLocker(Protocol): + def lock(self, chat_id: str) -> LockContext: ... + + class SandboxRuntime(Protocol): def create( self, diff --git a/usecase/sandbox.py b/usecase/sandbox.py index 65740ef..0cb39e8 100644 --- a/usecase/sandbox.py +++ b/usecase/sandbox.py @@ -3,7 +3,13 @@ from datetime import timedelta from uuid import uuid4 from domain.sandbox import SandboxSession -from usecase.interface import Clock, Logger, SandboxRuntime, SandboxSessionRepository +from usecase.interface import ( + Clock, + Logger, + SandboxLifecycleLocker, + SandboxRuntime, + SandboxSessionRepository, +) @dataclass(frozen=True, slots=True) @@ -15,93 +21,112 @@ class CreateSandbox: def __init__( self, repository: SandboxSessionRepository, + locker: SandboxLifecycleLocker, runtime: SandboxRuntime, clock: Clock, logger: Logger, ttl: timedelta, ) -> None: self._repository = repository + self._locker = locker self._runtime = runtime self._clock = clock self._logger = logger self._ttl = ttl def execute(self, command: CreateSandboxCommand) -> SandboxSession: - now = self._clock.now() - session = self._repository.get_active_by_chat_id(command.chat_id) + with self._locker.lock(command.chat_id): + session = self._repository.get_active_by_chat_id(command.chat_id) + now = self._clock.now() - if session is not None and session.expires_at > now: + if session is not None and session.expires_at > now: + self._logger.info( + 'sandbox_reused', + attrs={ + 'chat_id': command.chat_id, + 'session_id': session.session_id, + 'container_id': session.container_id, + }, + ) + return session + + if session is not None: + self._logger.info( + 'sandbox_replaced', + attrs={ + 'chat_id': command.chat_id, + 'session_id': session.session_id, + 'container_id': session.container_id, + }, + ) + self._runtime.stop(session.container_id) + self._repository.delete(session.session_id) + + created_at = self._clock.now() + expires_at = created_at + self._ttl + new_session = self._runtime.create( + 
session_id=_new_session_id(), + chat_id=command.chat_id, + created_at=created_at, + expires_at=expires_at, + ) + self._repository.save(new_session) self._logger.info( - 'sandbox_reused', + 'sandbox_created', attrs={ 'chat_id': command.chat_id, - 'session_id': session.session_id, - 'container_id': session.container_id, + 'session_id': new_session.session_id, + 'container_id': new_session.container_id, }, ) - return session - - if session is not None: - self._logger.info( - 'sandbox_replaced', - attrs={ - 'chat_id': command.chat_id, - 'session_id': session.session_id, - 'container_id': session.container_id, - }, - ) - self._runtime.stop(session.container_id) - self._repository.delete(session.session_id) - - expires_at = now + self._ttl - new_session = self._runtime.create( - session_id=_new_session_id(), - chat_id=command.chat_id, - created_at=now, - expires_at=expires_at, - ) - self._repository.save(new_session) - self._logger.info( - 'sandbox_created', - attrs={ - 'chat_id': command.chat_id, - 'session_id': new_session.session_id, - 'container_id': new_session.container_id, - }, - ) - return new_session + return new_session class CleanupExpiredSandboxes: def __init__( self, repository: SandboxSessionRepository, + locker: SandboxLifecycleLocker, runtime: SandboxRuntime, clock: Clock, logger: Logger, ) -> None: self._repository = repository + self._locker = locker self._runtime = runtime self._clock = clock self._logger = logger def execute(self) -> list[SandboxSession]: - now = self._clock.now() - expired_sessions = self._repository.list_expired(now) + expired_sessions = self._repository.list_expired(self._clock.now()) cleaned_sessions: list[SandboxSession] = [] for session in expired_sessions: - self._runtime.stop(session.container_id) - self._repository.delete(session.session_id) - cleaned_sessions.append(session) - self._logger.info( - 'sandbox_cleaned', - attrs={ - 'chat_id': session.chat_id, - 'session_id': session.session_id, - 'container_id': 
session.container_id, - }, - ) + with self._locker.lock(session.chat_id): + current_session = self._repository.get_active_by_chat_id( + session.chat_id + ) + now = self._clock.now() + if current_session is None: + continue + + if current_session.session_id != session.session_id: + continue + + if current_session.expires_at > now: + continue + + self._runtime.stop(current_session.container_id) + self._repository.delete(current_session.session_id) + cleaned_sessions.append(current_session) + self._logger.info( + 'sandbox_cleaned', + attrs={ + 'chat_id': current_session.chat_id, + 'session_id': current_session.session_id, + 'container_id': current_session.container_id, + }, + ) return cleaned_sessions From 776b5138583827144ae00103b5aae84d6c3eb477 Mon Sep 17 00:00:00 2001 From: Azamat Date: Thu, 2 Apr 2026 21:17:21 +0300 Subject: [PATCH 11/30] [fix] cleanup task to other thread --- adapter/http/fastapi/app.py | 26 ++++++++++++---- tasks.md | 2 +- usecase/sandbox.py | 62 ++++++++++++++++++++++++------------- 3 files changed, 61 insertions(+), 29 deletions(-) diff --git a/adapter/http/fastapi/app.py b/adapter/http/fastapi/app.py index d0e60aa..ffa4851 100644 --- a/adapter/http/fastapi/app.py +++ b/adapter/http/fastapi/app.py @@ -75,13 +75,25 @@ def _build_shutdown_handler( container: AppContainer, ) -> Callable[[], Awaitable[None]]: async def shutdown() -> None: + errors: list[Exception] = [] + try: await _stop_cleanup_loop(app) - finally: - try: - _uninstrument_app(app) - finally: - container.shutdown() + except Exception as exc: + errors.append(exc) + + try: + _uninstrument_app(app) + except Exception as exc: + errors.append(exc) + + try: + container.shutdown() + except Exception as exc: + errors.append(exc) + + if errors: + raise ExceptionGroup('app shutdown failed', errors) return shutdown @@ -94,7 +106,9 @@ async def _run_cleanup_loop( while not stop_event.is_set(): try: - container.usecases.cleanup_expired_sandboxes.execute() + await asyncio.to_thread( + 
container.usecases.cleanup_expired_sandboxes.execute + ) except Exception as exc: container.observability.logger.error( 'sandbox_cleanup_failed', diff --git a/tasks.md b/tasks.md index 75b4ce8..b2de0dd 100644 --- a/tasks.md +++ b/tasks.md @@ -131,7 +131,7 @@ ### M10. Устойчивый cleanup и вынос blocking cleanup из event loop - Субагент: `feature-developer` -- Статус: pending +- Статус: completed - Зависимости: `M09` - Commit required: no - Scope: сделать cleanup устойчивым к частичным ошибкам и не блокировать FastAPI event loop синхронным Docker stop diff --git a/usecase/sandbox.py b/usecase/sandbox.py index 0cb39e8..00946a8 100644 --- a/usecase/sandbox.py +++ b/usecase/sandbox.py @@ -102,34 +102,52 @@ class CleanupExpiredSandboxes: cleaned_sessions: list[SandboxSession] = [] for session in expired_sessions: - with self._locker.lock(session.chat_id): - current_session = self._repository.get_active_by_chat_id( - session.chat_id - ) - now = self._clock.now() - if current_session is None: - continue - - if current_session.session_id != session.session_id: - continue - - if current_session.expires_at > now: - continue - - self._runtime.stop(current_session.container_id) - self._repository.delete(current_session.session_id) - cleaned_sessions.append(current_session) - self._logger.info( - 'sandbox_cleaned', + try: + cleaned_session = self._cleanup_session(session) + except Exception as exc: + self._logger.error( + 'sandbox_clean_failed', attrs={ - 'chat_id': current_session.chat_id, - 'session_id': current_session.session_id, - 'container_id': current_session.container_id, + 'chat_id': session.chat_id, + 'session_id': session.session_id, + 'container_id': session.container_id, + 'error': type(exc).__name__, }, ) + continue + + if cleaned_session is None: + continue + + cleaned_sessions.append(cleaned_session) + self._logger.info( + 'sandbox_cleaned', + attrs={ + 'chat_id': cleaned_session.chat_id, + 'session_id': cleaned_session.session_id, + 'container_id': 
cleaned_session.container_id, + }, + ) return cleaned_sessions + def _cleanup_session(self, session: SandboxSession) -> SandboxSession | None: + with self._locker.lock(session.chat_id): + current_session = self._repository.get_active_by_chat_id(session.chat_id) + now = self._clock.now() + if current_session is None: + return None + + if current_session.session_id != session.session_id: + return None + + if current_session.expires_at > now: + return None + + self._runtime.stop(current_session.container_id) + self._repository.delete(current_session.session_id) + return current_session + def _new_session_id() -> str: return uuid4().hex From 539f019f399c9f7410d2fd51ea20b90e74336876 Mon Sep 17 00:00:00 2001 From: Azamat Date: Thu, 2 Apr 2026 21:26:23 +0300 Subject: [PATCH 12/30] [feat] remove example user --- adapter/di/container.py | 18 +----------------- adapter/http/fastapi/dependencies.py | 5 ----- adapter/http/fastapi/routers/v1/router.py | 23 +---------------------- adapter/http/fastapi/schemas.py | 6 ------ tasks.md | 2 +- test/test_create_http.py | 12 +----------- 6 files changed, 4 insertions(+), 62 deletions(-) diff --git a/adapter/di/container.py b/adapter/di/container.py index 55f95a0..c5c7f35 100644 --- a/adapter/di/container.py +++ b/adapter/di/container.py @@ -11,24 +11,19 @@ from adapter.config.model import AppConfig from adapter.docker.runtime import DockerSandboxRuntime from adapter.observability.factory import build_observability from adapter.observability.runtime import ObservabilityRuntime -from domain.user import User from repository.sandbox_lock import ProcessLocalSandboxLifecycleLocker from repository.sandbox_session import InMemorySandboxSessionRepository -from repository.user import InMemoryUserRepository from usecase.interface import Clock from usecase.sandbox import CleanupExpiredSandboxes, CreateSandbox -from usecase.user import GetUser @dataclass(frozen=True, slots=True) class AppRepositories: - user: InMemoryUserRepository sandbox_session: 
InMemorySandboxSessionRepository @dataclass(frozen=True, slots=True) class AppUsecases: - get_user: GetUser create_sandbox: CreateSandbox cleanup_expired_sandboxes: CleanupExpiredSandboxes @@ -82,23 +77,12 @@ def build_container( clock = SystemClock() docker_client = docker.DockerClient(base_url=app_config.docker.base_url) - user_repository = InMemoryUserRepository( - observability.tracer, [User(id='123', email='aza@gglamer.ru', name='gglamer')] - ) sandbox_repository = InMemorySandboxSessionRepository() sandbox_locker = ProcessLocalSandboxLifecycleLocker() sandbox_runtime = DockerSandboxRuntime(app_config.sandbox, docker_client) - repositories = AppRepositories( - user=user_repository, - sandbox_session=sandbox_repository, - ) + repositories = AppRepositories(sandbox_session=sandbox_repository) usecases = AppUsecases( - get_user=GetUser( - repository=user_repository, - logger=observability.logger, - tracer=observability.tracer, - ), create_sandbox=CreateSandbox( repository=sandbox_repository, locker=sandbox_locker, diff --git a/adapter/http/fastapi/dependencies.py b/adapter/http/fastapi/dependencies.py index 87a9224..57af579 100644 --- a/adapter/http/fastapi/dependencies.py +++ b/adapter/http/fastapi/dependencies.py @@ -2,7 +2,6 @@ from fastapi import Depends, Request from adapter.di.container import AppContainer from usecase.sandbox import CreateSandbox -from usecase.user import GetUser APP_CONTAINER_STATE = 'container' APP_CONFIG_STATE = 'config' @@ -15,10 +14,6 @@ def get_container(request: Request) -> AppContainer: return container -def get_get_user(container: AppContainer = Depends(get_container)) -> GetUser: - return container.usecases.get_user - - def get_create_sandbox( container: AppContainer = Depends(get_container), ) -> CreateSandbox: diff --git a/adapter/http/fastapi/routers/v1/router.py b/adapter/http/fastapi/routers/v1/router.py index 1f0aff4..df713b1 100644 --- a/adapter/http/fastapi/routers/v1/router.py +++ 
b/adapter/http/fastapi/routers/v1/router.py @@ -4,19 +4,16 @@ from adapter.di.container import AppContainer from adapter.http.fastapi.dependencies import ( get_container, get_create_sandbox, - get_get_user, ) from adapter.http.fastapi.schemas import ( CreateSandboxRequest, ErrorResponse, HealthResponse, SandboxSessionResponse, - UserResponse, ) -from domain.error import SandboxError, SandboxStartError, UserNotFoundError +from domain.error import SandboxError, SandboxStartError from domain.sandbox import SandboxSession from usecase.sandbox import CreateSandbox, CreateSandboxCommand -from usecase.user import GetUser, GetUserQuery router = APIRouter() @@ -34,24 +31,6 @@ def health(container: AppContainer = Depends(get_container)) -> HealthResponse: ) -@router.get( - '/users/{user_id}', - response_model=UserResponse, - responses={status.HTTP_404_NOT_FOUND: {'model': ErrorResponse}}, - status_code=status.HTTP_200_OK, -) -def get_user(user_id: str, usecase: GetUser = Depends(get_get_user)) -> UserResponse: - try: - user = usecase.execute(GetUserQuery(user_id=user_id)) - except UserNotFoundError as exc: - raise HTTPException( - status_code=status.HTTP_404_NOT_FOUND, - detail=str(exc), - ) from exc - - return UserResponse(id=user.id, email=user.email, name=user.name) - - @router.post( '/create', response_model=SandboxSessionResponse, diff --git a/adapter/http/fastapi/schemas.py b/adapter/http/fastapi/schemas.py index 08d9056..9ca2e5f 100644 --- a/adapter/http/fastapi/schemas.py +++ b/adapter/http/fastapi/schemas.py @@ -23,11 +23,5 @@ class SandboxSessionResponse(BaseModel): expires_at: datetime -class UserResponse(BaseModel): - id: str - email: str - name: str - - class ErrorResponse(BaseModel): detail: str diff --git a/tasks.md b/tasks.md index b2de0dd..ef33fc6 100644 --- a/tasks.md +++ b/tasks.md @@ -142,7 +142,7 @@ ### M11. 
Удаление не-MVP user surface из приложения - Субагент: `feature-developer` -- Статус: pending +- Статус: completed - Зависимости: `M08` - Commit required: no - Scope: убрать из runtime app неотносящиеся к MVP user endpoint и seed user wiring diff --git a/test/test_create_http.py b/test/test_create_http.py index bf910e0..0176f6c 100644 --- a/test/test_create_http.py +++ b/test/test_create_http.py @@ -25,10 +25,8 @@ from adapter.observability.runtime import ObservabilityRuntime from domain.error import SandboxError, SandboxStartError from domain.sandbox import SandboxSession, SandboxStatus from repository.sandbox_session import InMemorySandboxSessionRepository -from repository.user import InMemoryUserRepository from usecase.interface import Attrs from usecase.sandbox import CleanupExpiredSandboxes, CreateSandbox, CreateSandboxCommand -from usecase.user import GetUser class FakeLogger: @@ -130,16 +128,8 @@ def build_container( metrics=NoopMetrics(), tracer=NoopTracer(), ) - repositories = AppRepositories( - user=InMemoryUserRepository(NoopTracer()), - sandbox_session=InMemorySandboxSessionRepository(), - ) + repositories = AppRepositories(sandbox_session=InMemorySandboxSessionRepository()) usecases = AppUsecases( - get_user=GetUser( - repository=repositories.user, - logger=logger, - tracer=NoopTracer(), - ), create_sandbox=create_sandbox_usecase, cleanup_expired_sandboxes=cleanup_usecase, ) From 44f1549d8036a342a7f44c635e976af0c37c3b8b Mon Sep 17 00:00:00 2001 From: Azamat Date: Thu, 2 Apr 2026 21:47:55 +0300 Subject: [PATCH 13/30] [feat] add test race condition & cleanup resilience --- tasks.md | 2 +- test/test_create_http.py | 91 ++++++++++-- test/test_sandbox_usecase.py | 276 +++++++++++++++++++++++++++++++++++ 3 files changed, 359 insertions(+), 10 deletions(-) diff --git a/tasks.md b/tasks.md index ef33fc6..9650de1 100644 --- a/tasks.md +++ b/tasks.md @@ -153,7 +153,7 @@ ### M12. 
Регрессионные тесты на race conditions и cleanup resilience - Субагент: `test-engineer` -- Статус: pending +- Статус: completed - Зависимости: `M09`, `M10`, `M11` - Commit required: no - Scope: добавить тесты на новые гарантии после review fixes diff --git a/test/test_create_http.py b/test/test_create_http.py index 0176f6c..478b61a 100644 --- a/test/test_create_http.py +++ b/test/test_create_http.py @@ -142,10 +142,13 @@ def build_container( ) -async def post_json( - app: FastAPI, path: str, payload: dict[str, str] +async def request_json( + app: FastAPI, + method: str, + path: str, + payload: dict[str, str] | None = None, ) -> tuple[int, dict[str, object]]: - body = json.dumps(payload).encode() + body = b'' if payload is None else json.dumps(payload).encode() messages: list[Message] = [] request_sent = False @@ -169,17 +172,13 @@ async def post_json( 'type': 'http', 'asgi': {'version': '3.0'}, 'http_version': '1.1', - 'method': 'POST', + 'method': method, 'scheme': 'http', 'path': path, 'raw_path': path.encode(), 'query_string': b'', 'root_path': '', - 'headers': [ - (b'host', b'testserver'), - (b'content-type', b'application/json'), - (b'content-length', str(len(body)).encode()), - ], + 'headers': _build_headers(body, payload is not None), 'client': ('testclient', 50000), 'server': ('testserver', 80), 'state': {}, @@ -195,9 +194,32 @@ async def post_json( if message['type'] == 'http.response.body': response_body += bytes(message.get('body', b'')) + if not response_body: + return status, {} + return status, json.loads(response_body.decode()) +def _build_headers(body: bytes, has_json_body: bool) -> list[tuple[bytes, bytes]]: + headers = [ + (b'host', b'testserver'), + (b'content-length', str(len(body)).encode()), + ] + if has_json_body: + headers.append((b'content-type', b'application/json')) + return headers + + +async def post_json( + app: FastAPI, path: str, payload: dict[str, str] +) -> tuple[int, dict[str, object]]: + return await request_json(app, 'POST', 
path, payload) + + +async def get_json(app: FastAPI, path: str) -> tuple[int, dict[str, object]]: + return await request_json(app, 'GET', path) + + async def exercise_create_request( app: FastAPI, payload: dict[str, str], @@ -211,6 +233,19 @@ async def exercise_create_request( await app.router.shutdown() +async def exercise_get_request( + app: FastAPI, + path: str, +) -> tuple[int, dict[str, object]]: + await app.router.startup() + try: + status, response = await get_json(app, path) + await asyncio.sleep(0) + return status, response + finally: + await app.router.shutdown() + + def test_post_create_returns_session(monkeypatch) -> None: config = build_config() expires_at = datetime(2026, 4, 2, 12, 5, tzinfo=UTC) @@ -320,3 +355,41 @@ def test_post_create_maps_generic_sandbox_errors_to_internal_error(monkeypatch) assert status_code == 500 assert response == {'detail': 'sandbox_broken'} assert docker_client.close_calls == 1 + + +def test_removed_user_endpoint_returns_not_found(monkeypatch) -> None: + config = build_config() + expires_at = datetime(2026, 4, 2, 12, 5, tzinfo=UTC) + session = SandboxSession( + session_id='session-123', + chat_id='chat-123', + container_id='container-123', + status=SandboxStatus.RUNNING, + created_at=expires_at - timedelta(minutes=5), + expires_at=expires_at, + ) + logger = FakeLogger() + create_usecase = FakeCreateSandboxUsecase(session=session) + cleanup_usecase = FakeCleanupExpiredSandboxes() + docker_client = FakeDockerClient() + container = build_container( + config, + create_usecase, + cleanup_usecase, + logger, + docker_client, + ) + monkeypatch.setattr(app_module, 'build_container', lambda **kwargs: container) + monkeypatch.setattr( + app_module.FastAPIInstrumentor, 'instrument_app', lambda *args, **kwargs: None + ) + + app = app_module.create_app(config=config) + + status_code, response = asyncio.run( + exercise_get_request(app, '/api/v1/users/user-123') + ) + + assert status_code == 404 + assert response == {'detail': 'Not Found'} 
+ assert docker_client.close_calls == 1 diff --git a/test/test_sandbox_usecase.py b/test/test_sandbox_usecase.py index a58dd5c..1631d1b 100644 --- a/test/test_sandbox_usecase.py +++ b/test/test_sandbox_usecase.py @@ -1,6 +1,8 @@ +import threading from datetime import UTC, datetime, timedelta from domain.sandbox import SandboxSession, SandboxStatus +from repository.sandbox_lock import ProcessLocalSandboxLifecycleLocker from repository.sandbox_session import InMemorySandboxSessionRepository from usecase.sandbox import CleanupExpiredSandboxes, CreateSandbox, CreateSandboxCommand @@ -49,6 +51,89 @@ class FakeLocker: return FakeLockContext() +class TrackingLockContext: + def __init__( + self, + locker: 'TrackingLocker', + chat_id: str, + inner_context, + ) -> None: + self._locker = locker + self._chat_id = chat_id + self._inner_context = inner_context + + def __enter__(self) -> None: + with self._locker._state_lock: + self._locker.chat_ids.append(self._chat_id) + self._locker._attempts += 1 + if self._locker._attempts == 2: + self._locker.second_attempted.set() + + self._inner_context.__enter__() + + def __exit__(self, exc_type, exc, traceback) -> bool | None: + return self._inner_context.__exit__(exc_type, exc, traceback) + + +class TrackingLocker: + def __init__(self) -> None: + self._locker = ProcessLocalSandboxLifecycleLocker() + self._state_lock = threading.Lock() + self._attempts = 0 + self.second_attempted = threading.Event() + self.chat_ids: list[str] = [] + + def lock(self, chat_id: str) -> TrackingLockContext: + return TrackingLockContext(self, chat_id, self._locker.lock(chat_id)) + + +class BlockingCreateRuntime: + def __init__(self) -> None: + self.create_calls: list[dict[str, object]] = [] + self.stop_calls: list[str] = [] + self.create_started = threading.Event() + self.allow_create = threading.Event() + + def create( + self, + *, + session_id: str, + chat_id: str, + created_at: datetime, + expires_at: datetime, + ) -> SandboxSession: + 
self.create_calls.append( + { + 'session_id': session_id, + 'chat_id': chat_id, + 'created_at': created_at, + 'expires_at': expires_at, + } + ) + self.create_started.set() + assert self.allow_create.wait(timeout=1) + return SandboxSession( + session_id=session_id, + chat_id=chat_id, + container_id=f'container-{session_id}', + status=SandboxStatus.RUNNING, + created_at=created_at, + expires_at=expires_at, + ) + + def stop(self, container_id: str) -> None: + self.stop_calls.append(container_id) + + +class StaleSnapshotRepository(InMemorySandboxSessionRepository): + def __init__(self, snapshot: SandboxSession) -> None: + super().__init__() + self._snapshot = snapshot + + def list_expired(self, now: datetime) -> list[SandboxSession]: + return [self._snapshot] + + class FakeRuntime: def __init__(self) -> None: self.create_calls: list[dict[str, object]] = [] @@ -83,6 +168,17 @@ class FakeRuntime: self.stop_calls.append(container_id) +class FailingStopRuntime(FakeRuntime): + def __init__(self, failing_container_id: str) -> None: + super().__init__() + self._failing_container_id = failing_container_id + + def stop(self, container_id: str) -> None: + self.stop_calls.append(container_id) + if container_id == self._failing_container_id: + raise RuntimeError('stop_failed') + + def test_create_sandbox_reuses_active_session_when_not_expired() -> None: now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) session = SandboxSession( @@ -242,6 +338,84 @@ def test_create_sandbox_creates_new_session_when_none_exists() -> None: ] +def test_create_sandbox_serializes_duplicate_concurrent_create_for_chat_id( + monkeypatch, +) -> None: + now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + repository = InMemorySandboxSessionRepository() + runtime = BlockingCreateRuntime() + logger = FakeLogger() + locker = TrackingLocker() + usecase = CreateSandbox( + repository=repository, + locker=locker, + runtime=runtime, + clock=FakeClock(now), + logger=logger, + ttl=timedelta(minutes=5), + ) + 
monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: 'session-new') + + results: list[SandboxSession | None] = [None, None] + errors: list[Exception] = [] + + def run_create(index: int) -> None: + try: + results[index] = usecase.execute(CreateSandboxCommand(chat_id='chat-1')) + except Exception as exc: + errors.append(exc) + + first_thread = threading.Thread(target=run_create, args=(0,)) + second_thread = threading.Thread(target=run_create, args=(1,)) + + first_thread.start() + assert runtime.create_started.wait(timeout=1) + + second_thread.start() + assert locker.second_attempted.wait(timeout=1) + assert len(runtime.create_calls) == 1 + + runtime.allow_create.set() + + first_thread.join(timeout=1) + second_thread.join(timeout=1) + + assert errors == [] + assert results[0] == results[1] + assert results[0] == SandboxSession( + session_id='session-new', + chat_id='chat-1', + container_id='container-session-new', + status=SandboxStatus.RUNNING, + created_at=now, + expires_at=now + timedelta(minutes=5), + ) + assert len(runtime.create_calls) == 1 + assert runtime.stop_calls == [] + assert repository.get_active_by_chat_id('chat-1') == results[0] + assert locker.chat_ids == ['chat-1', 'chat-1'] + assert logger.messages == [ + ( + 'info', + 'sandbox_created', + { + 'chat_id': 'chat-1', + 'session_id': 'session-new', + 'container_id': 'container-session-new', + }, + ), + ( + 'info', + 'sandbox_reused', + { + 'chat_id': 'chat-1', + 'session_id': 'session-new', + 'container_id': 'container-session-new', + }, + ), + ] + + def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() -> None: now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) expired_session = SandboxSession( @@ -311,3 +485,105 @@ def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() -> }, ), ] + + +def test_cleanup_expired_sandboxes_skips_replaced_session_from_stale_snapshot() -> None: + now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + expired_snapshot = 
SandboxSession( + session_id='session-expired', + chat_id='chat-1', + container_id='container-expired', + status=SandboxStatus.RUNNING, + created_at=now - timedelta(minutes=10), + expires_at=now - timedelta(seconds=1), + ) + replacement_session = SandboxSession( + session_id='session-new', + chat_id='chat-1', + container_id='container-new', + status=SandboxStatus.RUNNING, + created_at=now - timedelta(seconds=30), + expires_at=now + timedelta(minutes=5), + ) + repository = StaleSnapshotRepository(expired_snapshot) + repository.save(replacement_session) + runtime = FakeRuntime() + logger = FakeLogger() + locker = FakeLocker() + usecase = CleanupExpiredSandboxes( + repository=repository, + locker=locker, + runtime=runtime, + clock=FakeClock(now), + logger=logger, + ) + + result = usecase.execute() + + assert result == [] + assert runtime.stop_calls == [] + assert repository.get_active_by_chat_id('chat-1') == replacement_session + assert locker.chat_ids == ['chat-1'] + assert logger.messages == [] + + +def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None: + now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + failing_session = SandboxSession( + session_id='session-fail', + chat_id='chat-fail', + container_id='container-fail', + status=SandboxStatus.RUNNING, + created_at=now - timedelta(minutes=10), + expires_at=now - timedelta(minutes=1), + ) + cleaned_session = SandboxSession( + session_id='session-clean', + chat_id='chat-clean', + container_id='container-clean', + status=SandboxStatus.RUNNING, + created_at=now - timedelta(minutes=9), + expires_at=now - timedelta(seconds=1), + ) + repository = InMemorySandboxSessionRepository() + repository.save(failing_session) + repository.save(cleaned_session) + runtime = FailingStopRuntime('container-fail') + logger = FakeLogger() + locker = FakeLocker() + usecase = CleanupExpiredSandboxes( + repository=repository, + locker=locker, + runtime=runtime, + clock=FakeClock(now), + logger=logger, + ) + + result = 
usecase.execute() + + assert result == [cleaned_session] + assert runtime.stop_calls == ['container-fail', 'container-clean'] + assert repository.get_active_by_chat_id('chat-fail') == failing_session + assert repository.get_active_by_chat_id('chat-clean') is None + assert locker.chat_ids == ['chat-fail', 'chat-clean'] + assert logger.messages == [ + ( + 'error', + 'sandbox_clean_failed', + { + 'chat_id': 'chat-fail', + 'session_id': 'session-fail', + 'container_id': 'container-fail', + 'error': 'RuntimeError', + }, + ), + ( + 'info', + 'sandbox_cleaned', + { + 'chat_id': 'chat-clean', + 'session_id': 'session-clean', + 'container_id': 'container-clean', + }, + ), + ] From e629e34c4d6ea170917287e289bdd71b08d5e070 Mon Sep 17 00:00:00 2001 From: Azamat Date: Thu, 2 Apr 2026 22:35:50 +0300 Subject: [PATCH 14/30] ref #10: [fix] enforce UUID chat ids Normalize chat ids to a single UUID form so locks, repository keys, and mount paths cannot diverge through path-like aliases. --- adapter/docker/runtime.py | 18 ++++-- adapter/http/fastapi/schemas.py | 8 ++- tasks.md | 37 +++++++++++- test/test_create_http.py | 62 +++++++++++++++++---- test/test_docker_runtime.py | 28 ++++++---- test/test_sandbox_usecase.py | 99 ++++++++++++++++++--------------- usecase/sandbox.py | 20 ++++--- 7 files changed, 192 insertions(+), 80 deletions(-) diff --git a/adapter/docker/runtime.py b/adapter/docker/runtime.py index 61fcaf6..d24d110 100644 --- a/adapter/docker/runtime.py +++ b/adapter/docker/runtime.py @@ -1,5 +1,6 @@ from datetime import datetime from pathlib import Path +from uuid import UUID from docker import DockerClient from docker.errors import DockerException, NotFound @@ -28,8 +29,11 @@ class DockerSandboxRuntime(SandboxRuntime): created_at: datetime, expires_at: datetime, ) -> SandboxSession: + normalized_chat_id = chat_id + try: - chat_path = self._chat_path(chat_id) + normalized_chat_id = _canonical_chat_id(chat_id) + chat_path = self._chat_path(normalized_chat_id) 
dependencies_path = self._readonly_host_path( self._config.dependencies_host_path ) @@ -40,19 +44,19 @@ class DockerSandboxRuntime(SandboxRuntime): container = self._client.containers.run( self._config.image, detach=True, - labels=self._labels(session_id, chat_id, expires_at), + labels=self._labels(session_id, normalized_chat_id, expires_at), mounts=self._mounts(chat_path, dependencies_path, lambda_tools_path), ) except (DockerException, OSError, ValueError) as exc: - raise SandboxStartError(chat_id) from exc + raise SandboxStartError(normalized_chat_id) from exc container_id = str(getattr(container, 'id', '')).strip() if not container_id: - raise SandboxStartError(chat_id) + raise SandboxStartError(normalized_chat_id) return SandboxSession( session_id=session_id, - chat_id=chat_id, + chat_id=normalized_chat_id, container_id=container_id, status=SandboxStatus.RUNNING, created_at=created_at, @@ -124,3 +128,7 @@ class DockerSandboxRuntime(SandboxRuntime): def _host_path(self, path_value: str) -> Path: return Path(path_value).expanduser().resolve(strict=False) + + +def _canonical_chat_id(chat_id: str) -> str: + return str(UUID(str(chat_id).strip())) diff --git a/adapter/http/fastapi/schemas.py b/adapter/http/fastapi/schemas.py index 9ca2e5f..a34f702 100644 --- a/adapter/http/fastapi/schemas.py +++ b/adapter/http/fastapi/schemas.py @@ -1,6 +1,7 @@ from datetime import datetime +from uuid import UUID -from pydantic import BaseModel, ConfigDict, Field +from pydantic import BaseModel, ConfigDict, Field, field_validator class HealthResponse(BaseModel): @@ -14,6 +15,11 @@ class CreateSandboxRequest(BaseModel): chat_id: str = Field(min_length=1) + @field_validator('chat_id') + @classmethod + def validate_chat_id(cls, value: str) -> str: + return str(UUID(value)) + class SandboxSessionResponse(BaseModel): session_id: str diff --git a/tasks.md b/tasks.md index 9650de1..cb1e38f 100644 --- a/tasks.md +++ b/tasks.md @@ -163,7 +163,7 @@ ### M13. 
Повторный boundary review после fix-pass - Субагент: `code-reviewer` -- Статус: pending +- Статус: completed - Зависимости: `M12` - Commit required: no - Scope: проверить, что must-fix и should-fix замечания из `M08` закрыты без нарушения clean architecture @@ -181,3 +181,38 @@ - Ошибки: несовместимый fake Docker client для `DockerSandboxRuntime`, неточная типизация `run_calls` и ASGI message payload, использование `object` вместо типизированных test doubles для `AppRepositories`, `AppUsecases`, `AppContainer` - Решение: сделать test doubles типизированными через совместимые fake classes или локальные protocols; убрать `object` и неиндексируемые `dict[str, object]` там, где mypy не может вывести типы - Критерии приемки: `uv run mypy .` проходит; `make pre-commit` доходит как минимум до pytest stage; production code не меняется или меняется только при явной необходимости для testability + +## Follow-up после M13 review + +### M15. Канонизация и валидация `chat_id` + +- Субагент: `feature-developer` +- Статус: completed +- Зависимости: `M13` +- Commit required: no +- Scope: сделать `chat_id` строго UUID и убрать path alias/whole-root mount риск через неканоничные значения +- Файлы: `adapter/http/fastapi/schemas.py`, `adapter/docker/runtime.py`, при необходимости `usecase/sandbox.py` и тесты в `test/*` +- Решение: принять `chat_id` как UUID на HTTP boundary, использовать его каноничную строковую форму дальше в usecase/repository/path construction и не принимать произвольные path-like строки +- Критерии приемки: не-UUID значения отклоняются на HTTP boundary с `400/422`; UUID используется как единое каноничное значение для lock key, repository key и filesystem path; появляются регрессионные тесты на invalid `chat_id` + +### M16. 
Lifecycle reconciliation на startup/shutdown + +- Субагент: `feature-developer` +- Статус: pending +- Зависимости: `M13` +- Commit required: no +- Scope: устранить restart-gap между in-memory registry и уже запущенными Docker containers +- Файлы: `adapter/docker/runtime.py`, `adapter/di/container.py`, `adapter/http/fastapi/app.py`, при необходимости новые outer-layer helper files и тесты в `test/*` +- Решение: основная стратегия — reconciliation по Docker labels на startup, чтобы после restart master-service продолжал видеть уже запущенные sandbox и не поднимал дубликаты; graceful shutdown cleanup остается опциональным дополнением +- Критерии приемки: после restart master-service может восстановить/синхронизировать state по Docker labels без потери работающих agent containers; one-sandbox-per-chat не нарушается из-за пустого in-memory registry; lifecycle policy явно зафиксирована и покрыта тестами + +### M17. Управление жизненным циклом per-chat locks + +- Субагент: `feature-developer` +- Статус: pending +- Зависимости: `M13` +- Commit required: no +- Scope: ограничить неограниченный рост registry locks по числу когда-либо увиденных `chat_id` +- Файлы: `repository/sandbox_lock.py`, при необходимости тесты в `test/*` +- Решение: добавить eviction/ref-count/weakref policy во внешнем lock registry без нарушения сериализации lifecycle для активного `chat_id` +- Критерии приемки: registry locks не растет бесконечно без причины; сериализация для активных чатов сохраняется; поведение покрыто тестами diff --git a/test/test_create_http.py b/test/test_create_http.py index 478b61a..bbdbc6d 100644 --- a/test/test_create_http.py +++ b/test/test_create_http.py @@ -28,6 +28,9 @@ from repository.sandbox_session import InMemorySandboxSessionRepository from usecase.interface import Attrs from usecase.sandbox import CleanupExpiredSandboxes, CreateSandbox, CreateSandboxCommand +CHAT_ID = '123e4567-e89b-12d3-a456-426614174000' +NON_CANONICAL_CHAT_ID = '123E4567E89B12D3A456426614174000' 
+ class FakeLogger: def __init__(self) -> None: @@ -246,12 +249,12 @@ async def exercise_get_request( await app.router.shutdown() -def test_post_create_returns_session(monkeypatch) -> None: +def test_post_create_returns_session_with_canonical_chat_id(monkeypatch) -> None: config = build_config() expires_at = datetime(2026, 4, 2, 12, 5, tzinfo=UTC) session = SandboxSession( session_id='session-123', - chat_id='chat-123', + chat_id=CHAT_ID, container_id='container-123', status=SandboxStatus.RUNNING, created_at=expires_at - timedelta(minutes=5), @@ -276,19 +279,19 @@ def test_post_create_returns_session(monkeypatch) -> None: app = app_module.create_app(config=config) status_code, response = asyncio.run( - exercise_create_request(app, {'chat_id': 'chat-123'}) + exercise_create_request(app, {'chat_id': NON_CANONICAL_CHAT_ID}) ) assert status_code == 200 assert response == { 'session_id': 'session-123', - 'chat_id': 'chat-123', + 'chat_id': CHAT_ID, 'container_id': 'container-123', 'status': 'running', 'expires_at': '2026-04-02T12:05:00Z', } assert len(create_usecase.commands) == 1 - assert create_usecase.commands[0].chat_id == 'chat-123' + assert create_usecase.commands[0].chat_id == CHAT_ID assert cleanup_usecase.calls >= 1 assert any( message == 'http_request' @@ -299,10 +302,19 @@ def test_post_create_returns_session(monkeypatch) -> None: assert docker_client.close_calls == 1 -def test_post_create_maps_start_errors_to_service_unavailable(monkeypatch) -> None: +def test_post_create_rejects_non_uuid_chat_id(monkeypatch) -> None: config = build_config() + expires_at = datetime(2026, 4, 2, 12, 5, tzinfo=UTC) + session = SandboxSession( + session_id='session-123', + chat_id=CHAT_ID, + container_id='container-123', + status=SandboxStatus.RUNNING, + created_at=expires_at - timedelta(minutes=5), + expires_at=expires_at, + ) logger = FakeLogger() - create_usecase = FakeCreateSandboxUsecase(error=SandboxStartError('chat-123')) + create_usecase = 
FakeCreateSandboxUsecase(session=session) cleanup_usecase = FakeCleanupExpiredSandboxes() docker_client = FakeDockerClient() container = build_container( @@ -320,7 +332,37 @@ def test_post_create_maps_start_errors_to_service_unavailable(monkeypatch) -> No app = app_module.create_app(config=config) status_code, response = asyncio.run( - exercise_create_request(app, {'chat_id': 'chat-123'}) + exercise_create_request(app, {'chat_id': 'x/../y'}) + ) + + assert status_code == 422 + assert 'detail' in response + assert create_usecase.commands == [] + assert docker_client.close_calls == 1 + + +def test_post_create_maps_start_errors_to_service_unavailable(monkeypatch) -> None: + config = build_config() + logger = FakeLogger() + create_usecase = FakeCreateSandboxUsecase(error=SandboxStartError(CHAT_ID)) + cleanup_usecase = FakeCleanupExpiredSandboxes() + docker_client = FakeDockerClient() + container = build_container( + config, + create_usecase, + cleanup_usecase, + logger, + docker_client, + ) + monkeypatch.setattr(app_module, 'build_container', lambda **kwargs: container) + monkeypatch.setattr( + app_module.FastAPIInstrumentor, 'instrument_app', lambda *args, **kwargs: None + ) + + app = app_module.create_app(config=config) + + status_code, response = asyncio.run( + exercise_create_request(app, {'chat_id': CHAT_ID}) ) assert status_code == 503 @@ -349,7 +391,7 @@ def test_post_create_maps_generic_sandbox_errors_to_internal_error(monkeypatch) app = app_module.create_app(config=config) status_code, response = asyncio.run( - exercise_create_request(app, {'chat_id': 'chat-123'}) + exercise_create_request(app, {'chat_id': CHAT_ID}) ) assert status_code == 500 @@ -362,7 +404,7 @@ def test_removed_user_endpoint_returns_not_found(monkeypatch) -> None: expires_at = datetime(2026, 4, 2, 12, 5, tzinfo=UTC) session = SandboxSession( session_id='session-123', - chat_id='chat-123', + chat_id=CHAT_ID, container_id='container-123', status=SandboxStatus.RUNNING, created_at=expires_at - 
timedelta(minutes=5), diff --git a/test/test_docker_runtime.py b/test/test_docker_runtime.py index 338fe1f..829024d 100644 --- a/test/test_docker_runtime.py +++ b/test/test_docker_runtime.py @@ -12,6 +12,9 @@ from adapter.docker.runtime import DockerSandboxRuntime from domain.error import SandboxError, SandboxStartError from domain.sandbox import SandboxStatus +CHAT_ID = '123e4567-e89b-12d3-a456-426614174000' +NON_CANONICAL_CHAT_ID = '123E4567E89B12D3A456426614174000' + class FakeContainer: def __init__(self, container_id: str) -> None: @@ -92,7 +95,9 @@ def build_config(tmp_path: Path) -> SandboxConfig: ) -def test_runtime_create_applies_mount_policy_and_labels(tmp_path: Path) -> None: +def test_runtime_create_applies_mount_policy_and_labels_with_canonical_chat_id( + tmp_path: Path, +) -> None: config = build_config(tmp_path) (tmp_path / 'dependencies').mkdir() (tmp_path / 'lambda-tools').mkdir() @@ -103,25 +108,25 @@ def test_runtime_create_applies_mount_policy_and_labels(tmp_path: Path) -> None: session = runtime.create( session_id='session-123', - chat_id='chat-123', + chat_id=NON_CANONICAL_CHAT_ID, created_at=created_at, expires_at=expires_at, ) assert session.session_id == 'session-123' - assert session.chat_id == 'chat-123' + assert session.chat_id == CHAT_ID assert session.container_id == 'container-123' assert session.status is SandboxStatus.RUNNING assert session.created_at == created_at assert session.expires_at == expires_at - assert (tmp_path / 'chats' / 'chat-123').is_dir() + assert (tmp_path / 'chats' / CHAT_ID).is_dir() call = containers.run_calls[0] assert call['args'] == ('sandbox:latest',) assert call['kwargs']['detach'] is True assert call['kwargs']['labels'] == { 'session_id': 'session-123', - 'chat_id': 'chat-123', + 'chat_id': CHAT_ID, 'expires_at': expires_at.isoformat(), } @@ -129,7 +134,7 @@ def test_runtime_create_applies_mount_policy_and_labels(tmp_path: Path) -> None: assert [dict(mount) for mount in mounts] == [ { 'Target': 
'/workspace/chat', - 'Source': str((tmp_path / 'chats' / 'chat-123').resolve(strict=False)), + 'Source': str((tmp_path / 'chats' / CHAT_ID).resolve(strict=False)), 'Type': 'bind', 'ReadOnly': False, }, @@ -160,13 +165,13 @@ def test_runtime_create_raises_start_error_when_container_id_is_missing( with pytest.raises(SandboxStartError) as excinfo: runtime.create( session_id='session-123', - chat_id='chat-123', + chat_id=CHAT_ID, created_at=datetime(2026, 4, 2, 12, 0, tzinfo=UTC), expires_at=datetime(2026, 4, 2, 12, 5, tzinfo=UTC), ) assert str(excinfo.value) == 'sandbox_start_failed' - assert excinfo.value.chat_id == 'chat-123' + assert excinfo.value.chat_id == CHAT_ID def test_runtime_stop_ignores_missing_container(tmp_path: Path) -> None: @@ -192,7 +197,8 @@ def test_runtime_stop_wraps_docker_errors(tmp_path: Path) -> None: assert str(excinfo.value) == 'sandbox_stop_failed' -def test_runtime_create_rejects_chat_path_traversal(tmp_path: Path) -> None: +@pytest.mark.parametrize('chat_id', ['.', 'a/..', 'x/../y']) +def test_runtime_create_rejects_non_uuid_chat_id(tmp_path: Path, chat_id: str) -> None: config = build_config(tmp_path) (tmp_path / 'dependencies').mkdir() (tmp_path / 'lambda-tools').mkdir() @@ -202,11 +208,11 @@ def test_runtime_create_rejects_chat_path_traversal(tmp_path: Path) -> None: with pytest.raises(SandboxStartError) as excinfo: runtime.create( session_id='session-123', - chat_id='../escape', + chat_id=chat_id, created_at=datetime(2026, 4, 2, 12, 0, tzinfo=UTC), expires_at=datetime(2026, 4, 2, 12, 5, tzinfo=UTC), ) assert str(excinfo.value) == 'sandbox_start_failed' - assert excinfo.value.chat_id == '../escape' + assert excinfo.value.chat_id == chat_id assert containers.run_calls == [] diff --git a/test/test_sandbox_usecase.py b/test/test_sandbox_usecase.py index 1631d1b..26e094f 100644 --- a/test/test_sandbox_usecase.py +++ b/test/test_sandbox_usecase.py @@ -6,6 +6,14 @@ from repository.sandbox_lock import ProcessLocalSandboxLifecycleLocker from 
repository.sandbox_session import InMemorySandboxSessionRepository from usecase.sandbox import CleanupExpiredSandboxes, CreateSandbox, CreateSandboxCommand +CHAT_ID = '11111111-1111-1111-1111-111111111111' +NON_CANONICAL_CHAT_ID = '11111111111111111111111111111111' +EXPIRED_CHAT_ID = '22222222-2222-2222-2222-222222222222' +BOUNDARY_CHAT_ID = '33333333-3333-3333-3333-333333333333' +ACTIVE_CHAT_ID = '44444444-4444-4444-4444-444444444444' +FAIL_CHAT_ID = '55555555-5555-5555-5555-555555555555' +CLEAN_CHAT_ID = '66666666-6666-6666-6666-666666666666' + class FakeClock: def __init__(self, now: datetime) -> None: @@ -183,7 +191,7 @@ def test_create_sandbox_reuses_active_session_when_not_expired() -> None: now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) session = SandboxSession( session_id='session-1', - chat_id='chat-1', + chat_id=CHAT_ID, container_id='container-1', status=SandboxStatus.RUNNING, created_at=now - timedelta(minutes=1), @@ -203,19 +211,19 @@ def test_create_sandbox_reuses_active_session_when_not_expired() -> None: ttl=timedelta(minutes=5), ) - result = usecase.execute(CreateSandboxCommand(chat_id='chat-1')) + result = usecase.execute(CreateSandboxCommand(chat_id=CHAT_ID)) assert result == session assert runtime.create_calls == [] assert runtime.stop_calls == [] - assert repository.get_active_by_chat_id('chat-1') == session - assert locker.chat_ids == ['chat-1'] + assert repository.get_active_by_chat_id(CHAT_ID) == session + assert locker.chat_ids == [CHAT_ID] assert logger.messages == [ ( 'info', 'sandbox_reused', { - 'chat_id': 'chat-1', + 'chat_id': CHAT_ID, 'session_id': 'session-1', 'container_id': 'container-1', }, @@ -229,7 +237,7 @@ def test_create_sandbox_replaces_expired_session_and_creates_new_one( now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) expired_session = SandboxSession( session_id='session-old', - chat_id='chat-1', + chat_id=CHAT_ID, container_id='container-old', status=SandboxStatus.RUNNING, created_at=now - timedelta(minutes=10), @@ -250,33 
+258,33 @@ def test_create_sandbox_replaces_expired_session_and_creates_new_one( ) monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: 'session-new') - result = usecase.execute(CreateSandboxCommand(chat_id='chat-1')) + result = usecase.execute(CreateSandboxCommand(chat_id=CHAT_ID)) assert runtime.stop_calls == ['container-old'] assert runtime.create_calls == [ { 'session_id': 'session-new', - 'chat_id': 'chat-1', + 'chat_id': CHAT_ID, 'created_at': now, 'expires_at': now + timedelta(minutes=5), } ] assert result == SandboxSession( session_id='session-new', - chat_id='chat-1', + chat_id=CHAT_ID, container_id='container-session-new', status=SandboxStatus.RUNNING, created_at=now, expires_at=now + timedelta(minutes=5), ) - assert repository.get_active_by_chat_id('chat-1') == result - assert locker.chat_ids == ['chat-1'] + assert repository.get_active_by_chat_id(CHAT_ID) == result + assert locker.chat_ids == [CHAT_ID] assert logger.messages == [ ( 'info', 'sandbox_replaced', { - 'chat_id': 'chat-1', + 'chat_id': CHAT_ID, 'session_id': 'session-old', 'container_id': 'container-old', }, @@ -285,7 +293,7 @@ def test_create_sandbox_replaces_expired_session_and_creates_new_one( 'info', 'sandbox_created', { - 'chat_id': 'chat-1', + 'chat_id': CHAT_ID, 'session_id': 'session-new', 'container_id': 'container-session-new', }, @@ -308,9 +316,9 @@ def test_create_sandbox_creates_new_session_when_none_exists() -> None: ttl=timedelta(minutes=5), ) - result = usecase.execute(CreateSandboxCommand(chat_id='chat-1')) + result = usecase.execute(CreateSandboxCommand(chat_id=NON_CANONICAL_CHAT_ID)) - assert result.chat_id == 'chat-1' + assert result.chat_id == CHAT_ID assert result.container_id == f'container-{result.session_id}' assert result.status is SandboxStatus.RUNNING assert result.created_at == now @@ -318,19 +326,20 @@ def test_create_sandbox_creates_new_session_when_none_exists() -> None: assert len(runtime.create_calls) == 1 assert runtime.create_calls[0] == { 
'session_id': result.session_id, - 'chat_id': 'chat-1', + 'chat_id': CHAT_ID, 'created_at': now, 'expires_at': now + timedelta(minutes=5), } assert runtime.stop_calls == [] - assert repository.get_active_by_chat_id('chat-1') == result - assert locker.chat_ids == ['chat-1'] + assert repository.get_active_by_chat_id(CHAT_ID) == result + assert repository.get_active_by_chat_id(NON_CANONICAL_CHAT_ID) is None + assert locker.chat_ids == [CHAT_ID] assert logger.messages == [ ( 'info', 'sandbox_created', { - 'chat_id': 'chat-1', + 'chat_id': CHAT_ID, 'session_id': result.session_id, 'container_id': result.container_id, }, @@ -361,7 +370,7 @@ def test_create_sandbox_serializes_duplicate_concurrent_create_for_chat_id( def run_create(index: int) -> None: try: - results[index] = usecase.execute(CreateSandboxCommand(chat_id='chat-1')) + results[index] = usecase.execute(CreateSandboxCommand(chat_id=CHAT_ID)) except Exception as exc: errors.append(exc) @@ -384,7 +393,7 @@ def test_create_sandbox_serializes_duplicate_concurrent_create_for_chat_id( assert results[0] == results[1] assert results[0] == SandboxSession( session_id='session-new', - chat_id='chat-1', + chat_id=CHAT_ID, container_id='container-session-new', status=SandboxStatus.RUNNING, created_at=now, @@ -392,14 +401,14 @@ def test_create_sandbox_serializes_duplicate_concurrent_create_for_chat_id( ) assert len(runtime.create_calls) == 1 assert runtime.stop_calls == [] - assert repository.get_active_by_chat_id('chat-1') == results[0] - assert locker.chat_ids == ['chat-1', 'chat-1'] + assert repository.get_active_by_chat_id(CHAT_ID) == results[0] + assert locker.chat_ids == [CHAT_ID, CHAT_ID] assert logger.messages == [ ( 'info', 'sandbox_created', { - 'chat_id': 'chat-1', + 'chat_id': CHAT_ID, 'session_id': 'session-new', 'container_id': 'container-session-new', }, @@ -408,7 +417,7 @@ def test_create_sandbox_serializes_duplicate_concurrent_create_for_chat_id( 'info', 'sandbox_reused', { - 'chat_id': 'chat-1', + 
'chat_id': CHAT_ID, 'session_id': 'session-new', 'container_id': 'container-session-new', }, @@ -420,7 +429,7 @@ def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() -> now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) expired_session = SandboxSession( session_id='session-expired', - chat_id='chat-expired', + chat_id=EXPIRED_CHAT_ID, container_id='container-expired', status=SandboxStatus.RUNNING, created_at=now - timedelta(minutes=10), @@ -428,7 +437,7 @@ def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() -> ) boundary_session = SandboxSession( session_id='session-boundary', - chat_id='chat-boundary', + chat_id=BOUNDARY_CHAT_ID, container_id='container-boundary', status=SandboxStatus.RUNNING, created_at=now - timedelta(minutes=5), @@ -436,7 +445,7 @@ def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() -> ) active_session = SandboxSession( session_id='session-active', - chat_id='chat-active', + chat_id=ACTIVE_CHAT_ID, container_id='container-active', status=SandboxStatus.RUNNING, created_at=now - timedelta(minutes=1), @@ -461,16 +470,16 @@ def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() -> assert result == [expired_session, boundary_session] assert runtime.stop_calls == ['container-expired', 'container-boundary'] - assert repository.get_active_by_chat_id('chat-expired') is None - assert repository.get_active_by_chat_id('chat-boundary') is None - assert repository.get_active_by_chat_id('chat-active') == active_session - assert locker.chat_ids == ['chat-expired', 'chat-boundary'] + assert repository.get_active_by_chat_id(EXPIRED_CHAT_ID) is None + assert repository.get_active_by_chat_id(BOUNDARY_CHAT_ID) is None + assert repository.get_active_by_chat_id(ACTIVE_CHAT_ID) == active_session + assert locker.chat_ids == [EXPIRED_CHAT_ID, BOUNDARY_CHAT_ID] assert logger.messages == [ ( 'info', 'sandbox_cleaned', { - 'chat_id': 'chat-expired', + 'chat_id': EXPIRED_CHAT_ID, 
'session_id': 'session-expired', 'container_id': 'container-expired', }, @@ -479,7 +488,7 @@ def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() -> 'info', 'sandbox_cleaned', { - 'chat_id': 'chat-boundary', + 'chat_id': BOUNDARY_CHAT_ID, 'session_id': 'session-boundary', 'container_id': 'container-boundary', }, @@ -491,7 +500,7 @@ def test_cleanup_expired_sandboxes_skips_replaced_session_from_stale_snapshot() now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) expired_snapshot = SandboxSession( session_id='session-expired', - chat_id='chat-1', + chat_id=CHAT_ID, container_id='container-expired', status=SandboxStatus.RUNNING, created_at=now - timedelta(minutes=10), @@ -499,7 +508,7 @@ def test_cleanup_expired_sandboxes_skips_replaced_session_from_stale_snapshot() ) replacement_session = SandboxSession( session_id='session-new', - chat_id='chat-1', + chat_id=CHAT_ID, container_id='container-new', status=SandboxStatus.RUNNING, created_at=now - timedelta(seconds=30), @@ -522,8 +531,8 @@ def test_cleanup_expired_sandboxes_skips_replaced_session_from_stale_snapshot() assert result == [] assert runtime.stop_calls == [] - assert repository.get_active_by_chat_id('chat-1') == replacement_session - assert locker.chat_ids == ['chat-1'] + assert repository.get_active_by_chat_id(CHAT_ID) == replacement_session + assert locker.chat_ids == [CHAT_ID] assert logger.messages == [] @@ -531,7 +540,7 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None: now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) failing_session = SandboxSession( session_id='session-fail', - chat_id='chat-fail', + chat_id=FAIL_CHAT_ID, container_id='container-fail', status=SandboxStatus.RUNNING, created_at=now - timedelta(minutes=10), @@ -539,7 +548,7 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None: ) cleaned_session = SandboxSession( session_id='session-clean', - chat_id='chat-clean', + chat_id=CLEAN_CHAT_ID, container_id='container-clean', 
status=SandboxStatus.RUNNING, created_at=now - timedelta(minutes=9), @@ -563,15 +572,15 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None: assert result == [cleaned_session] assert runtime.stop_calls == ['container-fail', 'container-clean'] - assert repository.get_active_by_chat_id('chat-fail') == failing_session - assert repository.get_active_by_chat_id('chat-clean') is None - assert locker.chat_ids == ['chat-fail', 'chat-clean'] + assert repository.get_active_by_chat_id(FAIL_CHAT_ID) == failing_session + assert repository.get_active_by_chat_id(CLEAN_CHAT_ID) is None + assert locker.chat_ids == [FAIL_CHAT_ID, CLEAN_CHAT_ID] assert logger.messages == [ ( 'error', 'sandbox_clean_failed', { - 'chat_id': 'chat-fail', + 'chat_id': FAIL_CHAT_ID, 'session_id': 'session-fail', 'container_id': 'container-fail', 'error': 'RuntimeError', @@ -581,7 +590,7 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None: 'info', 'sandbox_cleaned', { - 'chat_id': 'chat-clean', + 'chat_id': CLEAN_CHAT_ID, 'session_id': 'session-clean', 'container_id': 'container-clean', }, diff --git a/usecase/sandbox.py b/usecase/sandbox.py index 00946a8..d04dd82 100644 --- a/usecase/sandbox.py +++ b/usecase/sandbox.py @@ -1,6 +1,6 @@ from dataclasses import dataclass from datetime import timedelta -from uuid import uuid4 +from uuid import UUID, uuid4 from domain.sandbox import SandboxSession from usecase.interface import ( @@ -35,15 +35,17 @@ class CreateSandbox: self._ttl = ttl def execute(self, command: CreateSandboxCommand) -> SandboxSession: - with self._locker.lock(command.chat_id): - session = self._repository.get_active_by_chat_id(command.chat_id) + chat_id = _canonical_chat_id(command.chat_id) + + with self._locker.lock(chat_id): + session = self._repository.get_active_by_chat_id(chat_id) now = self._clock.now() if session is not None and session.expires_at > now: self._logger.info( 'sandbox_reused', attrs={ - 'chat_id': command.chat_id, + 
'chat_id': chat_id, 'session_id': session.session_id, 'container_id': session.container_id, }, @@ -54,7 +56,7 @@ class CreateSandbox: self._logger.info( 'sandbox_replaced', attrs={ - 'chat_id': command.chat_id, + 'chat_id': chat_id, 'session_id': session.session_id, 'container_id': session.container_id, }, @@ -66,7 +68,7 @@ class CreateSandbox: expires_at = created_at + self._ttl new_session = self._runtime.create( session_id=_new_session_id(), - chat_id=command.chat_id, + chat_id=chat_id, created_at=created_at, expires_at=expires_at, ) @@ -74,7 +76,7 @@ class CreateSandbox: self._logger.info( 'sandbox_created', attrs={ - 'chat_id': command.chat_id, + 'chat_id': chat_id, 'session_id': new_session.session_id, 'container_id': new_session.container_id, }, @@ -151,3 +153,7 @@ class CleanupExpiredSandboxes: def _new_session_id() -> str: return uuid4().hex + + +def _canonical_chat_id(chat_id: str) -> str: + return str(UUID(str(chat_id).strip())) From 770af1fe763fbe4baa56e4dca07d79d0ce735fc7 Mon Sep 17 00:00:00 2001 From: Azamat Date: Thu, 2 Apr 2026 23:09:04 +0300 Subject: [PATCH 15/30] [feat] change str id type to UUID --- adapter/docker/runtime.py | 36 ++++------ adapter/http/fastapi/schemas.py | 15 ++-- domain/sandbox.py | 5 +- repository/sandbox_lock.py | 5 +- repository/sandbox_session.py | 7 +- tasks.md | 11 +++ test/test_create_http.py | 20 +++--- test/test_docker_runtime.py | 43 ++++-------- test/test_sandbox_usecase.py | 121 +++++++++++++++++--------------- usecase/interface.py | 11 +-- usecase/sandbox.py | 49 +++++-------- 11 files changed, 150 insertions(+), 173 deletions(-) diff --git a/adapter/docker/runtime.py b/adapter/docker/runtime.py index d24d110..ad6e84a 100644 --- a/adapter/docker/runtime.py +++ b/adapter/docker/runtime.py @@ -24,16 +24,13 @@ class DockerSandboxRuntime(SandboxRuntime): def create( self, *, - session_id: str, - chat_id: str, + session_id: UUID, + chat_id: UUID, created_at: datetime, expires_at: datetime, ) -> SandboxSession: - 
normalized_chat_id = chat_id - try: - normalized_chat_id = _canonical_chat_id(chat_id) - chat_path = self._chat_path(normalized_chat_id) + chat_path = self._chat_path(chat_id) dependencies_path = self._readonly_host_path( self._config.dependencies_host_path ) @@ -44,19 +41,19 @@ class DockerSandboxRuntime(SandboxRuntime): container = self._client.containers.run( self._config.image, detach=True, - labels=self._labels(session_id, normalized_chat_id, expires_at), + labels=self._labels(session_id, chat_id, expires_at), mounts=self._mounts(chat_path, dependencies_path, lambda_tools_path), ) except (DockerException, OSError, ValueError) as exc: - raise SandboxStartError(normalized_chat_id) from exc + raise SandboxStartError(str(chat_id)) from exc container_id = str(getattr(container, 'id', '')).strip() if not container_id: - raise SandboxStartError(normalized_chat_id) + raise SandboxStartError(str(chat_id)) return SandboxSession( session_id=session_id, - chat_id=normalized_chat_id, + chat_id=chat_id, container_id=container_id, status=SandboxStatus.RUNNING, created_at=created_at, @@ -74,13 +71,13 @@ class DockerSandboxRuntime(SandboxRuntime): def _labels( self, - session_id: str, - chat_id: str, + session_id: UUID, + chat_id: UUID, expires_at: datetime, ) -> dict[str, str]: return { - 'session_id': session_id, - 'chat_id': chat_id, + 'session_id': str(session_id), + 'chat_id': str(chat_id), 'expires_at': expires_at.isoformat(), } @@ -110,12 +107,9 @@ class DockerSandboxRuntime(SandboxRuntime): ), ] - def _chat_path(self, chat_id: str) -> Path: - if not chat_id.strip(): - raise ValueError('invalid chat path') - + def _chat_path(self, chat_id: UUID) -> Path: chats_root = self._host_path(self._config.chats_root) - chat_path = (chats_root / chat_id).resolve(strict=False) + chat_path = (chats_root / str(chat_id)).resolve(strict=False) if not chat_path.is_relative_to(chats_root): raise ValueError('invalid chat path') return chat_path @@ -128,7 +122,3 @@ class 
DockerSandboxRuntime(SandboxRuntime): def _host_path(self, path_value: str) -> Path: return Path(path_value).expanduser().resolve(strict=False) - - -def _canonical_chat_id(chat_id: str) -> str: - return str(UUID(str(chat_id).strip())) diff --git a/adapter/http/fastapi/schemas.py b/adapter/http/fastapi/schemas.py index a34f702..35992ee 100644 --- a/adapter/http/fastapi/schemas.py +++ b/adapter/http/fastapi/schemas.py @@ -1,7 +1,7 @@ from datetime import datetime from uuid import UUID -from pydantic import BaseModel, ConfigDict, Field, field_validator +from pydantic import BaseModel, ConfigDict class HealthResponse(BaseModel): @@ -11,19 +11,14 @@ class HealthResponse(BaseModel): class CreateSandboxRequest(BaseModel): - model_config = ConfigDict(extra='forbid', str_strip_whitespace=True) + model_config = ConfigDict(extra='forbid') - chat_id: str = Field(min_length=1) - - @field_validator('chat_id') - @classmethod - def validate_chat_id(cls, value: str) -> str: - return str(UUID(value)) + chat_id: UUID class SandboxSessionResponse(BaseModel): - session_id: str - chat_id: str + session_id: UUID + chat_id: UUID container_id: str status: str expires_at: datetime diff --git a/domain/sandbox.py b/domain/sandbox.py index 110b4e4..a9b0f40 100644 --- a/domain/sandbox.py +++ b/domain/sandbox.py @@ -1,6 +1,7 @@ from dataclasses import dataclass from datetime import datetime from enum import Enum +from uuid import UUID class SandboxStatus(str, Enum): @@ -13,8 +14,8 @@ class SandboxStatus(str, Enum): @dataclass(frozen=True, slots=True) class SandboxSession: - session_id: str - chat_id: str + session_id: UUID + chat_id: UUID container_id: str status: SandboxStatus created_at: datetime diff --git a/repository/sandbox_lock.py b/repository/sandbox_lock.py index 704aeae..b13cd65 100644 --- a/repository/sandbox_lock.py +++ b/repository/sandbox_lock.py @@ -1,6 +1,7 @@ import threading from types import TracebackType from typing import Protocol +from uuid import UUID from 
usecase.interface import LockContext, SandboxLifecycleLocker @@ -31,9 +32,9 @@ class _ChatLock(LockContext): class ProcessLocalSandboxLifecycleLocker(SandboxLifecycleLocker): def __init__(self) -> None: self._registry_lock = threading.Lock() - self._locks_by_chat_id: dict[str, _SyncLock] = {} + self._locks_by_chat_id: dict[UUID, _SyncLock] = {} - def lock(self, chat_id: str) -> LockContext: + def lock(self, chat_id: UUID) -> LockContext: with self._registry_lock: lock = self._locks_by_chat_id.get(chat_id) if lock is None: diff --git a/repository/sandbox_session.py b/repository/sandbox_session.py index 6707d0c..3a8857f 100644 --- a/repository/sandbox_session.py +++ b/repository/sandbox_session.py @@ -1,5 +1,6 @@ import threading from datetime import datetime +from uuid import UUID from domain.sandbox import SandboxSession from usecase.interface import SandboxSessionRepository @@ -7,10 +8,10 @@ from usecase.interface import SandboxSessionRepository class InMemorySandboxSessionRepository(SandboxSessionRepository): def __init__(self) -> None: - self._sessions_by_chat_id: dict[str, SandboxSession] = {} + self._sessions_by_chat_id: dict[UUID, SandboxSession] = {} self._lock = threading.Lock() - def get_active_by_chat_id(self, chat_id: str) -> SandboxSession | None: + def get_active_by_chat_id(self, chat_id: UUID) -> SandboxSession | None: with self._lock: return self._sessions_by_chat_id.get(chat_id) @@ -26,7 +27,7 @@ class InMemorySandboxSessionRepository(SandboxSessionRepository): with self._lock: self._sessions_by_chat_id[session.chat_id] = session - def delete(self, session_id: str) -> None: + def delete(self, session_id: UUID) -> None: with self._lock: for chat_id, session in tuple(self._sessions_by_chat_id.items()): if session.session_id == session_id: diff --git a/tasks.md b/tasks.md index cb1e38f..058983e 100644 --- a/tasks.md +++ b/tasks.md @@ -216,3 +216,14 @@ - Файлы: `repository/sandbox_lock.py`, при необходимости тесты в `test/*` - Решение: добавить 
eviction/ref-count/weakref policy во внешнем lock registry без нарушения сериализации lifecycle для активного `chat_id` - Критерии приемки: registry locks не растет бесконечно без причины; сериализация для активных чатов сохраняется; поведение покрыто тестами + +### M18. Перевести sandbox ids на UUID types + +- Субагент: `feature-developer` +- Статус: completed +- Зависимости: `M15` +- Commit required: no +- Scope: сделать `chat_id` и `session_id` типом `UUID` внутри sandbox scope, оставив `container_id` строкой как внешний Docker identifier +- Файлы: `domain/sandbox.py`, `usecase/interface.py`, `usecase/sandbox.py`, `repository/sandbox_session.py`, `adapter/http/fastapi/*`, `adapter/docker/runtime.py`, `adapter/di/container.py`, `test/*` +- Решение: HTTP boundary принимает/возвращает UUID, usecase и repository работают с UUID objects, Docker labels продолжают сериализоваться в строки через `str(uuid)` +- Критерии приемки: внутри sandbox flow `chat_id` и `session_id` больше не строки; `container_id` остается `str`; pydantic корректно сериализует UUID в response; `make pre-commit` проходит diff --git a/test/test_create_http.py b/test/test_create_http.py index bbdbc6d..2a474ff 100644 --- a/test/test_create_http.py +++ b/test/test_create_http.py @@ -1,6 +1,7 @@ import asyncio import json from datetime import UTC, datetime, timedelta +from uuid import UUID from docker import DockerClient from fastapi import FastAPI @@ -28,8 +29,9 @@ from repository.sandbox_session import InMemorySandboxSessionRepository from usecase.interface import Attrs from usecase.sandbox import CleanupExpiredSandboxes, CreateSandbox, CreateSandboxCommand -CHAT_ID = '123e4567-e89b-12d3-a456-426614174000' +CHAT_ID = UUID('123e4567-e89b-12d3-a456-426614174000') NON_CANONICAL_CHAT_ID = '123E4567E89B12D3A456426614174000' +SESSION_ID = UUID('00000000-0000-0000-0000-000000000011') class FakeLogger: @@ -253,7 +255,7 @@ def test_post_create_returns_session_with_canonical_chat_id(monkeypatch) -> None config 
= build_config() expires_at = datetime(2026, 4, 2, 12, 5, tzinfo=UTC) session = SandboxSession( - session_id='session-123', + session_id=SESSION_ID, chat_id=CHAT_ID, container_id='container-123', status=SandboxStatus.RUNNING, @@ -284,8 +286,8 @@ def test_post_create_returns_session_with_canonical_chat_id(monkeypatch) -> None assert status_code == 200 assert response == { - 'session_id': 'session-123', - 'chat_id': CHAT_ID, + 'session_id': str(SESSION_ID), + 'chat_id': str(CHAT_ID), 'container_id': 'container-123', 'status': 'running', 'expires_at': '2026-04-02T12:05:00Z', @@ -306,7 +308,7 @@ def test_post_create_rejects_non_uuid_chat_id(monkeypatch) -> None: config = build_config() expires_at = datetime(2026, 4, 2, 12, 5, tzinfo=UTC) session = SandboxSession( - session_id='session-123', + session_id=SESSION_ID, chat_id=CHAT_ID, container_id='container-123', status=SandboxStatus.RUNNING, @@ -344,7 +346,7 @@ def test_post_create_rejects_non_uuid_chat_id(monkeypatch) -> None: def test_post_create_maps_start_errors_to_service_unavailable(monkeypatch) -> None: config = build_config() logger = FakeLogger() - create_usecase = FakeCreateSandboxUsecase(error=SandboxStartError(CHAT_ID)) + create_usecase = FakeCreateSandboxUsecase(error=SandboxStartError(str(CHAT_ID))) cleanup_usecase = FakeCleanupExpiredSandboxes() docker_client = FakeDockerClient() container = build_container( @@ -362,7 +364,7 @@ def test_post_create_maps_start_errors_to_service_unavailable(monkeypatch) -> No app = app_module.create_app(config=config) status_code, response = asyncio.run( - exercise_create_request(app, {'chat_id': CHAT_ID}) + exercise_create_request(app, {'chat_id': str(CHAT_ID)}) ) assert status_code == 503 @@ -391,7 +393,7 @@ def test_post_create_maps_generic_sandbox_errors_to_internal_error(monkeypatch) app = app_module.create_app(config=config) status_code, response = asyncio.run( - exercise_create_request(app, {'chat_id': CHAT_ID}) + exercise_create_request(app, {'chat_id': 
str(CHAT_ID)}) ) assert status_code == 500 @@ -403,7 +405,7 @@ def test_removed_user_endpoint_returns_not_found(monkeypatch) -> None: config = build_config() expires_at = datetime(2026, 4, 2, 12, 5, tzinfo=UTC) session = SandboxSession( - session_id='session-123', + session_id=SESSION_ID, chat_id=CHAT_ID, container_id='container-123', status=SandboxStatus.RUNNING, diff --git a/test/test_docker_runtime.py b/test/test_docker_runtime.py index 829024d..d266eff 100644 --- a/test/test_docker_runtime.py +++ b/test/test_docker_runtime.py @@ -1,6 +1,7 @@ from datetime import UTC, datetime, timedelta from pathlib import Path from typing import Any, TypedDict +from uuid import UUID import pytest from docker import DockerClient @@ -12,8 +13,9 @@ from adapter.docker.runtime import DockerSandboxRuntime from domain.error import SandboxError, SandboxStartError from domain.sandbox import SandboxStatus -CHAT_ID = '123e4567-e89b-12d3-a456-426614174000' +CHAT_ID = UUID('123e4567-e89b-12d3-a456-426614174000') NON_CANONICAL_CHAT_ID = '123E4567E89B12D3A456426614174000' +SESSION_ID = UUID('00000000-0000-0000-0000-000000000010') class FakeContainer: @@ -107,26 +109,26 @@ def test_runtime_create_applies_mount_policy_and_labels_with_canonical_chat_id( expires_at = created_at + timedelta(minutes=5) session = runtime.create( - session_id='session-123', - chat_id=NON_CANONICAL_CHAT_ID, + session_id=SESSION_ID, + chat_id=UUID(NON_CANONICAL_CHAT_ID), created_at=created_at, expires_at=expires_at, ) - assert session.session_id == 'session-123' + assert session.session_id == SESSION_ID assert session.chat_id == CHAT_ID assert session.container_id == 'container-123' assert session.status is SandboxStatus.RUNNING assert session.created_at == created_at assert session.expires_at == expires_at - assert (tmp_path / 'chats' / CHAT_ID).is_dir() + assert (tmp_path / 'chats' / str(CHAT_ID)).is_dir() call = containers.run_calls[0] assert call['args'] == ('sandbox:latest',) assert call['kwargs']['detach'] is 
True assert call['kwargs']['labels'] == { - 'session_id': 'session-123', - 'chat_id': CHAT_ID, + 'session_id': str(SESSION_ID), + 'chat_id': str(CHAT_ID), 'expires_at': expires_at.isoformat(), } @@ -134,7 +136,7 @@ def test_runtime_create_applies_mount_policy_and_labels_with_canonical_chat_id( assert [dict(mount) for mount in mounts] == [ { 'Target': '/workspace/chat', - 'Source': str((tmp_path / 'chats' / CHAT_ID).resolve(strict=False)), + 'Source': str((tmp_path / 'chats' / str(CHAT_ID)).resolve(strict=False)), 'Type': 'bind', 'ReadOnly': False, }, @@ -164,14 +166,14 @@ def test_runtime_create_raises_start_error_when_container_id_is_missing( with pytest.raises(SandboxStartError) as excinfo: runtime.create( - session_id='session-123', + session_id=SESSION_ID, chat_id=CHAT_ID, created_at=datetime(2026, 4, 2, 12, 0, tzinfo=UTC), expires_at=datetime(2026, 4, 2, 12, 5, tzinfo=UTC), ) assert str(excinfo.value) == 'sandbox_start_failed' - assert excinfo.value.chat_id == CHAT_ID + assert excinfo.value.chat_id == str(CHAT_ID) def test_runtime_stop_ignores_missing_container(tmp_path: Path) -> None: @@ -195,24 +197,3 @@ def test_runtime_stop_wraps_docker_errors(tmp_path: Path) -> None: runtime.stop('container-123') assert str(excinfo.value) == 'sandbox_stop_failed' - - -@pytest.mark.parametrize('chat_id', ['.', 'a/..', 'x/../y']) -def test_runtime_create_rejects_non_uuid_chat_id(tmp_path: Path, chat_id: str) -> None: - config = build_config(tmp_path) - (tmp_path / 'dependencies').mkdir() - (tmp_path / 'lambda-tools').mkdir() - containers = FakeContainers() - runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) - - with pytest.raises(SandboxStartError) as excinfo: - runtime.create( - session_id='session-123', - chat_id=chat_id, - created_at=datetime(2026, 4, 2, 12, 0, tzinfo=UTC), - expires_at=datetime(2026, 4, 2, 12, 5, tzinfo=UTC), - ) - - assert str(excinfo.value) == 'sandbox_start_failed' - assert excinfo.value.chat_id == chat_id - assert 
containers.run_calls == [] diff --git a/test/test_sandbox_usecase.py b/test/test_sandbox_usecase.py index 26e094f..4fedb21 100644 --- a/test/test_sandbox_usecase.py +++ b/test/test_sandbox_usecase.py @@ -1,18 +1,28 @@ import threading from datetime import UTC, datetime, timedelta +from uuid import UUID from domain.sandbox import SandboxSession, SandboxStatus from repository.sandbox_lock import ProcessLocalSandboxLifecycleLocker from repository.sandbox_session import InMemorySandboxSessionRepository from usecase.sandbox import CleanupExpiredSandboxes, CreateSandbox, CreateSandboxCommand -CHAT_ID = '11111111-1111-1111-1111-111111111111' +CHAT_ID = UUID('11111111-1111-1111-1111-111111111111') NON_CANONICAL_CHAT_ID = '11111111111111111111111111111111' -EXPIRED_CHAT_ID = '22222222-2222-2222-2222-222222222222' -BOUNDARY_CHAT_ID = '33333333-3333-3333-3333-333333333333' -ACTIVE_CHAT_ID = '44444444-4444-4444-4444-444444444444' -FAIL_CHAT_ID = '55555555-5555-5555-5555-555555555555' -CLEAN_CHAT_ID = '66666666-6666-6666-6666-666666666666' +EXPIRED_CHAT_ID = UUID('22222222-2222-2222-2222-222222222222') +BOUNDARY_CHAT_ID = UUID('33333333-3333-3333-3333-333333333333') +ACTIVE_CHAT_ID = UUID('44444444-4444-4444-4444-444444444444') +FAIL_CHAT_ID = UUID('55555555-5555-5555-5555-555555555555') +CLEAN_CHAT_ID = UUID('66666666-6666-6666-6666-666666666666') +SESSION_REUSED_ID = UUID('00000000-0000-0000-0000-000000000001') +SESSION_OLD_ID = UUID('00000000-0000-0000-0000-000000000002') +SESSION_NEW_ID = UUID('00000000-0000-0000-0000-000000000003') +SESSION_EXPIRED_ID = UUID('00000000-0000-0000-0000-000000000004') +SESSION_BOUNDARY_ID = UUID('00000000-0000-0000-0000-000000000005') +SESSION_ACTIVE_ID = UUID('00000000-0000-0000-0000-000000000006') +SESSION_FAIL_ID = UUID('00000000-0000-0000-0000-000000000007') +SESSION_CLEAN_ID = UUID('00000000-0000-0000-0000-000000000008') +SESSION_REPLACEMENT_ID = UUID('00000000-0000-0000-0000-000000000009') class FakeClock: @@ -52,9 +62,9 @@ class 
FakeLockContext: class FakeLocker: def __init__(self) -> None: - self.chat_ids: list[str] = [] + self.chat_ids: list[UUID] = [] - def lock(self, chat_id: str) -> FakeLockContext: + def lock(self, chat_id: UUID) -> FakeLockContext: self.chat_ids.append(chat_id) return FakeLockContext() @@ -63,7 +73,7 @@ class TrackingLockContext: def __init__( self, locker: 'TrackingLocker', - chat_id: str, + chat_id: UUID, inner_context, ) -> None: self._locker = locker @@ -89,9 +99,9 @@ class TrackingLocker: self._state_lock = threading.Lock() self._attempts = 0 self.second_attempted = threading.Event() - self.chat_ids: list[str] = [] + self.chat_ids: list[UUID] = [] - def lock(self, chat_id: str) -> TrackingLockContext: + def lock(self, chat_id: UUID) -> TrackingLockContext: return TrackingLockContext(self, chat_id, self._locker.lock(chat_id)) @@ -105,8 +115,8 @@ class BlockingCreateRuntime: def create( self, *, - session_id: str, - chat_id: str, + session_id: UUID, + chat_id: UUID, created_at: datetime, expires_at: datetime, ) -> SandboxSession: @@ -150,8 +160,8 @@ class FakeRuntime: def create( self, *, - session_id: str, - chat_id: str, + session_id: UUID, + chat_id: UUID, created_at: datetime, expires_at: datetime, ) -> SandboxSession: @@ -190,7 +200,7 @@ class FailingStopRuntime(FakeRuntime): def test_create_sandbox_reuses_active_session_when_not_expired() -> None: now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) session = SandboxSession( - session_id='session-1', + session_id=SESSION_REUSED_ID, chat_id=CHAT_ID, container_id='container-1', status=SandboxStatus.RUNNING, @@ -223,8 +233,8 @@ def test_create_sandbox_reuses_active_session_when_not_expired() -> None: 'info', 'sandbox_reused', { - 'chat_id': CHAT_ID, - 'session_id': 'session-1', + 'chat_id': str(CHAT_ID), + 'session_id': str(SESSION_REUSED_ID), 'container_id': 'container-1', }, ) @@ -236,7 +246,7 @@ def test_create_sandbox_replaces_expired_session_and_creates_new_one( ) -> None: now = datetime(2026, 4, 2, 12, 0, 
tzinfo=UTC) expired_session = SandboxSession( - session_id='session-old', + session_id=SESSION_OLD_ID, chat_id=CHAT_ID, container_id='container-old', status=SandboxStatus.RUNNING, @@ -256,23 +266,23 @@ def test_create_sandbox_replaces_expired_session_and_creates_new_one( logger=logger, ttl=timedelta(minutes=5), ) - monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: 'session-new') + monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID) result = usecase.execute(CreateSandboxCommand(chat_id=CHAT_ID)) assert runtime.stop_calls == ['container-old'] assert runtime.create_calls == [ { - 'session_id': 'session-new', + 'session_id': SESSION_NEW_ID, 'chat_id': CHAT_ID, 'created_at': now, 'expires_at': now + timedelta(minutes=5), } ] assert result == SandboxSession( - session_id='session-new', + session_id=SESSION_NEW_ID, chat_id=CHAT_ID, - container_id='container-session-new', + container_id=f'container-{SESSION_NEW_ID}', status=SandboxStatus.RUNNING, created_at=now, expires_at=now + timedelta(minutes=5), @@ -284,8 +294,8 @@ def test_create_sandbox_replaces_expired_session_and_creates_new_one( 'info', 'sandbox_replaced', { - 'chat_id': CHAT_ID, - 'session_id': 'session-old', + 'chat_id': str(CHAT_ID), + 'session_id': str(SESSION_OLD_ID), 'container_id': 'container-old', }, ), @@ -293,9 +303,9 @@ def test_create_sandbox_replaces_expired_session_and_creates_new_one( 'info', 'sandbox_created', { - 'chat_id': CHAT_ID, - 'session_id': 'session-new', - 'container_id': 'container-session-new', + 'chat_id': str(CHAT_ID), + 'session_id': str(SESSION_NEW_ID), + 'container_id': f'container-{SESSION_NEW_ID}', }, ), ] @@ -316,7 +326,7 @@ def test_create_sandbox_creates_new_session_when_none_exists() -> None: ttl=timedelta(minutes=5), ) - result = usecase.execute(CreateSandboxCommand(chat_id=NON_CANONICAL_CHAT_ID)) + result = usecase.execute(CreateSandboxCommand(chat_id=UUID(NON_CANONICAL_CHAT_ID))) assert result.chat_id == CHAT_ID assert 
result.container_id == f'container-{result.session_id}' @@ -332,15 +342,14 @@ def test_create_sandbox_creates_new_session_when_none_exists() -> None: } assert runtime.stop_calls == [] assert repository.get_active_by_chat_id(CHAT_ID) == result - assert repository.get_active_by_chat_id(NON_CANONICAL_CHAT_ID) is None assert locker.chat_ids == [CHAT_ID] assert logger.messages == [ ( 'info', 'sandbox_created', { - 'chat_id': CHAT_ID, - 'session_id': result.session_id, + 'chat_id': str(CHAT_ID), + 'session_id': str(result.session_id), 'container_id': result.container_id, }, ) @@ -363,7 +372,7 @@ def test_create_sandbox_serializes_duplicate_concurrent_create_for_chat_id( logger=logger, ttl=timedelta(minutes=5), ) - monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: 'session-new') + monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID) results: list[SandboxSession | None] = [None, None] errors: list[Exception] = [] @@ -392,9 +401,9 @@ def test_create_sandbox_serializes_duplicate_concurrent_create_for_chat_id( assert errors == [] assert results[0] == results[1] assert results[0] == SandboxSession( - session_id='session-new', + session_id=SESSION_NEW_ID, chat_id=CHAT_ID, - container_id='container-session-new', + container_id=f'container-{SESSION_NEW_ID}', status=SandboxStatus.RUNNING, created_at=now, expires_at=now + timedelta(minutes=5), @@ -408,18 +417,18 @@ def test_create_sandbox_serializes_duplicate_concurrent_create_for_chat_id( 'info', 'sandbox_created', { - 'chat_id': CHAT_ID, - 'session_id': 'session-new', - 'container_id': 'container-session-new', + 'chat_id': str(CHAT_ID), + 'session_id': str(SESSION_NEW_ID), + 'container_id': f'container-{SESSION_NEW_ID}', }, ), ( 'info', 'sandbox_reused', { - 'chat_id': CHAT_ID, - 'session_id': 'session-new', - 'container_id': 'container-session-new', + 'chat_id': str(CHAT_ID), + 'session_id': str(SESSION_NEW_ID), + 'container_id': f'container-{SESSION_NEW_ID}', }, ), ] @@ -428,7 +437,7 @@ 
def test_create_sandbox_serializes_duplicate_concurrent_create_for_chat_id( def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() -> None: now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) expired_session = SandboxSession( - session_id='session-expired', + session_id=SESSION_EXPIRED_ID, chat_id=EXPIRED_CHAT_ID, container_id='container-expired', status=SandboxStatus.RUNNING, @@ -436,7 +445,7 @@ def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() -> expires_at=now - timedelta(seconds=1), ) boundary_session = SandboxSession( - session_id='session-boundary', + session_id=SESSION_BOUNDARY_ID, chat_id=BOUNDARY_CHAT_ID, container_id='container-boundary', status=SandboxStatus.RUNNING, @@ -444,7 +453,7 @@ def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() -> expires_at=now, ) active_session = SandboxSession( - session_id='session-active', + session_id=SESSION_ACTIVE_ID, chat_id=ACTIVE_CHAT_ID, container_id='container-active', status=SandboxStatus.RUNNING, @@ -479,8 +488,8 @@ def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() -> 'info', 'sandbox_cleaned', { - 'chat_id': EXPIRED_CHAT_ID, - 'session_id': 'session-expired', + 'chat_id': str(EXPIRED_CHAT_ID), + 'session_id': str(SESSION_EXPIRED_ID), 'container_id': 'container-expired', }, ), @@ -488,8 +497,8 @@ def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() -> 'info', 'sandbox_cleaned', { - 'chat_id': BOUNDARY_CHAT_ID, - 'session_id': 'session-boundary', + 'chat_id': str(BOUNDARY_CHAT_ID), + 'session_id': str(SESSION_BOUNDARY_ID), 'container_id': 'container-boundary', }, ), @@ -499,7 +508,7 @@ def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() -> def test_cleanup_expired_sandboxes_skips_replaced_session_from_stale_snapshot() -> None: now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) expired_snapshot = SandboxSession( - session_id='session-expired', + session_id=SESSION_EXPIRED_ID, 
chat_id=CHAT_ID, container_id='container-expired', status=SandboxStatus.RUNNING, @@ -507,7 +516,7 @@ def test_cleanup_expired_sandboxes_skips_replaced_session_from_stale_snapshot() expires_at=now - timedelta(seconds=1), ) replacement_session = SandboxSession( - session_id='session-new', + session_id=SESSION_REPLACEMENT_ID, chat_id=CHAT_ID, container_id='container-new', status=SandboxStatus.RUNNING, @@ -539,7 +548,7 @@ def test_cleanup_expired_sandboxes_skips_replaced_session_from_stale_snapshot() def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None: now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) failing_session = SandboxSession( - session_id='session-fail', + session_id=SESSION_FAIL_ID, chat_id=FAIL_CHAT_ID, container_id='container-fail', status=SandboxStatus.RUNNING, @@ -547,7 +556,7 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None: expires_at=now - timedelta(minutes=1), ) cleaned_session = SandboxSession( - session_id='session-clean', + session_id=SESSION_CLEAN_ID, chat_id=CLEAN_CHAT_ID, container_id='container-clean', status=SandboxStatus.RUNNING, @@ -580,8 +589,8 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None: 'error', 'sandbox_clean_failed', { - 'chat_id': FAIL_CHAT_ID, - 'session_id': 'session-fail', + 'chat_id': str(FAIL_CHAT_ID), + 'session_id': str(SESSION_FAIL_ID), 'container_id': 'container-fail', 'error': 'RuntimeError', }, @@ -590,8 +599,8 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None: 'info', 'sandbox_cleaned', { - 'chat_id': CLEAN_CHAT_ID, - 'session_id': 'session-clean', + 'chat_id': str(CLEAN_CHAT_ID), + 'session_id': str(SESSION_CLEAN_ID), 'container_id': 'container-clean', }, ), diff --git a/usecase/interface.py b/usecase/interface.py index 0c0e321..15c581a 100644 --- a/usecase/interface.py +++ b/usecase/interface.py @@ -2,6 +2,7 @@ from collections.abc import Mapping from datetime import datetime from types import TracebackType from 
typing import Protocol, TypeAlias +from uuid import UUID from domain.sandbox import SandboxSession from domain.user import User @@ -19,13 +20,13 @@ class UserRepository(Protocol): class SandboxSessionRepository(Protocol): - def get_active_by_chat_id(self, chat_id: str) -> SandboxSession | None: ... + def get_active_by_chat_id(self, chat_id: UUID) -> SandboxSession | None: ... def list_expired(self, now: datetime) -> list[SandboxSession]: ... def save(self, session: SandboxSession) -> None: ... - def delete(self, session_id: str) -> None: ... + def delete(self, session_id: UUID) -> None: ... class LockContext(Protocol): @@ -40,15 +41,15 @@ class LockContext(Protocol): class SandboxLifecycleLocker(Protocol): - def lock(self, chat_id: str) -> LockContext: ... + def lock(self, chat_id: UUID) -> LockContext: ... class SandboxRuntime(Protocol): def create( self, *, - session_id: str, - chat_id: str, + session_id: UUID, + chat_id: UUID, created_at: datetime, expires_at: datetime, ) -> SandboxSession: ... 
diff --git a/usecase/sandbox.py b/usecase/sandbox.py index d04dd82..83ee39d 100644 --- a/usecase/sandbox.py +++ b/usecase/sandbox.py @@ -14,7 +14,7 @@ from usecase.interface import ( @dataclass(frozen=True, slots=True) class CreateSandboxCommand: - chat_id: str + chat_id: UUID class CreateSandbox: @@ -35,7 +35,7 @@ class CreateSandbox: self._ttl = ttl def execute(self, command: CreateSandboxCommand) -> SandboxSession: - chat_id = _canonical_chat_id(command.chat_id) + chat_id = command.chat_id with self._locker.lock(chat_id): session = self._repository.get_active_by_chat_id(chat_id) @@ -44,22 +44,14 @@ class CreateSandbox: if session is not None and session.expires_at > now: self._logger.info( 'sandbox_reused', - attrs={ - 'chat_id': chat_id, - 'session_id': session.session_id, - 'container_id': session.container_id, - }, + attrs=_sandbox_attrs(session), ) return session if session is not None: self._logger.info( 'sandbox_replaced', - attrs={ - 'chat_id': chat_id, - 'session_id': session.session_id, - 'container_id': session.container_id, - }, + attrs=_sandbox_attrs(session), ) self._runtime.stop(session.container_id) self._repository.delete(session.session_id) @@ -75,11 +67,7 @@ class CreateSandbox: self._repository.save(new_session) self._logger.info( 'sandbox_created', - attrs={ - 'chat_id': chat_id, - 'session_id': new_session.session_id, - 'container_id': new_session.container_id, - }, + attrs=_sandbox_attrs(new_session), ) return new_session @@ -107,14 +95,11 @@ class CleanupExpiredSandboxes: try: cleaned_session = self._cleanup_session(session) except Exception as exc: + attrs = _sandbox_attrs(session) + attrs['error'] = type(exc).__name__ self._logger.error( 'sandbox_clean_failed', - attrs={ - 'chat_id': session.chat_id, - 'session_id': session.session_id, - 'container_id': session.container_id, - 'error': type(exc).__name__, - }, + attrs=attrs, ) continue @@ -124,11 +109,7 @@ class CleanupExpiredSandboxes: cleaned_sessions.append(cleaned_session) 
self._logger.info( 'sandbox_cleaned', - attrs={ - 'chat_id': cleaned_session.chat_id, - 'session_id': cleaned_session.session_id, - 'container_id': cleaned_session.container_id, - }, + attrs=_sandbox_attrs(cleaned_session), ) return cleaned_sessions @@ -151,9 +132,13 @@ class CleanupExpiredSandboxes: return current_session -def _new_session_id() -> str: - return uuid4().hex +def _new_session_id() -> UUID: + return uuid4() -def _canonical_chat_id(chat_id: str) -> str: - return str(UUID(str(chat_id).strip())) +def _sandbox_attrs(session: SandboxSession) -> dict[str, str]: + return { + 'chat_id': str(session.chat_id), + 'session_id': str(session.session_id), + 'container_id': session.container_id, + } From 50af62b3fb90b686bacf0fce4fc0405f24d19d07 Mon Sep 17 00:00:00 2001 From: Azamat Date: Thu, 2 Apr 2026 23:39:25 +0300 Subject: [PATCH 16/30] [fix] restart gap --- adapter/di/container.py | 8 ++ adapter/docker/runtime.py | 64 +++++++++ adapter/http/fastapi/app.py | 2 + adapter/sandbox/__init__.py | 0 adapter/sandbox/reconciliation.py | 39 ++++++ docs/007-startup-sandbox-reconciliation.md | 17 +++ repository/sandbox_session.py | 6 + tasks.md | 2 +- test/test_create_http.py | 149 ++++++++++++++++++++- test/test_docker_runtime.py | 65 ++++++++- 10 files changed, 348 insertions(+), 4 deletions(-) create mode 100644 adapter/sandbox/__init__.py create mode 100644 adapter/sandbox/reconciliation.py create mode 100644 docs/007-startup-sandbox-reconciliation.md diff --git a/adapter/di/container.py b/adapter/di/container.py index c5c7f35..ace3a42 100644 --- a/adapter/di/container.py +++ b/adapter/di/container.py @@ -11,6 +11,7 @@ from adapter.config.model import AppConfig from adapter.docker.runtime import DockerSandboxRuntime from adapter.observability.factory import build_observability from adapter.observability.runtime import ObservabilityRuntime +from adapter.sandbox.reconciliation import SandboxSessionReconciler from repository.sandbox_lock import 
ProcessLocalSandboxLifecycleLocker from repository.sandbox_session import InMemorySandboxSessionRepository from usecase.interface import Clock @@ -34,6 +35,7 @@ class AppContainer: observability: ObservabilityRuntime repositories: AppRepositories usecases: AppUsecases + sandbox_reconciler: SandboxSessionReconciler = field(repr=False) _docker_client: DockerClient = field(repr=False) _is_shutdown: bool = field(default=False, init=False, repr=False) @@ -80,6 +82,11 @@ def build_container( sandbox_repository = InMemorySandboxSessionRepository() sandbox_locker = ProcessLocalSandboxLifecycleLocker() sandbox_runtime = DockerSandboxRuntime(app_config.sandbox, docker_client) + sandbox_reconciler = SandboxSessionReconciler( + state_source=sandbox_runtime, + registry=sandbox_repository, + logger=observability.logger, + ) repositories = AppRepositories(sandbox_session=sandbox_repository) usecases = AppUsecases( @@ -105,5 +112,6 @@ def build_container( observability=observability, repositories=repositories, usecases=usecases, + sandbox_reconciler=sandbox_reconciler, _docker_client=docker_client, ) diff --git a/adapter/docker/runtime.py b/adapter/docker/runtime.py index ad6e84a..3f33466 100644 --- a/adapter/docker/runtime.py +++ b/adapter/docker/runtime.py @@ -11,6 +11,8 @@ from domain.error import SandboxError, SandboxStartError from domain.sandbox import SandboxSession, SandboxStatus from usecase.interface import SandboxRuntime +SANDBOX_LABELS = ('session_id', 'chat_id', 'expires_at') + class DockerSandboxRuntime(SandboxRuntime): def __init__( @@ -69,6 +71,23 @@ class DockerSandboxRuntime(SandboxRuntime): except DockerException as exc: raise SandboxError('sandbox_stop_failed') from exc + def list_active_sessions(self) -> list[SandboxSession]: + try: + containers = self._client.containers.list( + filters={'label': list(SANDBOX_LABELS)} + ) + except DockerException as exc: + raise SandboxError('sandbox_list_failed') from exc + + sessions: list[SandboxSession] = [] + for 
container in containers: + session = self._session_from_container(container) + if session is None: + continue + sessions.append(session) + + return sessions + def _labels( self, session_id: UUID, @@ -120,5 +139,50 @@ class DockerSandboxRuntime(SandboxRuntime): raise ValueError('invalid host path') return host_path + def _session_from_container(self, container: object) -> SandboxSession | None: + container_id = str(getattr(container, 'id', '')).strip() + labels = getattr(container, 'labels', None) + if not container_id or not isinstance(labels, dict): + return None + + try: + session_id = UUID(labels['session_id']) + chat_id = UUID(labels['chat_id']) + created_at = self._container_created_at(container) + expires_at = _parse_datetime(labels['expires_at']) + except (KeyError, TypeError, ValueError): + return None + + return SandboxSession( + session_id=session_id, + chat_id=chat_id, + container_id=container_id, + status=SandboxStatus.RUNNING, + created_at=created_at, + expires_at=expires_at, + ) + + def _container_created_at(self, container: object) -> datetime: + attrs = getattr(container, 'attrs', None) + if not isinstance(attrs, dict): + reload_container = getattr(container, 'reload', None) + if callable(reload_container): + reload_container() + attrs = getattr(container, 'attrs', None) + + if not isinstance(attrs, dict): + raise ValueError('invalid container attrs') + + raw_created_at = attrs.get('Created') + if not isinstance(raw_created_at, str): + raise ValueError('invalid created_at') + + return _parse_datetime(raw_created_at) + def _host_path(self, path_value: str) -> Path: return Path(path_value).expanduser().resolve(strict=False) + + +def _parse_datetime(value: str) -> datetime: + normalized = f'{value[:-1]}+00:00' if value.endswith('Z') else value + return datetime.fromisoformat(normalized) diff --git a/adapter/http/fastapi/app.py b/adapter/http/fastapi/app.py index ffa4851..e9ba18f 100644 --- a/adapter/http/fastapi/app.py +++ b/adapter/http/fastapi/app.py 
@@ -56,6 +56,8 @@ def _build_startup_handler( if task is not None and not task.done(): return + await asyncio.to_thread(container.sandbox_reconciler.execute) + stop_event = asyncio.Event() setattr(app.state, APP_CLEANUP_STOP_STATE, stop_event) setattr( diff --git a/adapter/sandbox/__init__.py b/adapter/sandbox/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/adapter/sandbox/reconciliation.py b/adapter/sandbox/reconciliation.py new file mode 100644 index 0000000..2d04ca5 --- /dev/null +++ b/adapter/sandbox/reconciliation.py @@ -0,0 +1,39 @@ +from dataclasses import dataclass +from typing import Protocol +from uuid import UUID + +from domain.sandbox import SandboxSession +from usecase.interface import Logger + + +class SandboxSessionStateSource(Protocol): + def list_active_sessions(self) -> list[SandboxSession]: ... + + +class SandboxSessionRegistry(Protocol): + def replace_all(self, sessions: list[SandboxSession]) -> None: ... + + +@dataclass(frozen=True, slots=True) +class SandboxSessionReconciler: + state_source: SandboxSessionStateSource + registry: SandboxSessionRegistry + logger: Logger + + def execute(self) -> list[SandboxSession]: + sessions_by_chat_id: dict[UUID, SandboxSession] = {} + for session in sorted( + self.state_source.list_active_sessions(), + key=lambda item: item.created_at, + ): + sessions_by_chat_id[session.chat_id] = session + + sessions = list(sessions_by_chat_id.values()) + self.registry.replace_all(sessions) + self.logger.info( + 'sandbox_reconciled', + attrs={ + 'session_count': len(sessions), + }, + ) + return sessions diff --git a/docs/007-startup-sandbox-reconciliation.md b/docs/007-startup-sandbox-reconciliation.md new file mode 100644 index 0000000..1d4ec0f --- /dev/null +++ b/docs/007-startup-sandbox-reconciliation.md @@ -0,0 +1,17 @@ +# 007 Startup Sandbox Reconciliation + +Context +- Active sandboxes outlive the process because Docker keeps containers running across master-service restarts. 
+- The in-memory session repository is rebuilt on each start and otherwise loses running sandbox state. + +Decision +- Reconcile sandbox state during app startup before the cleanup loop starts serving requests. +- Read running Docker containers through sandbox labels `session_id`, `chat_id`, and `expires_at`. +- Rebuild the in-memory registry from the reconciled sessions and prefer the newest session per `chat_id`. +- Let the normal cleanup flow handle reconciled sessions that are already expired. +- Do not stop healthy sandbox containers during service shutdown; shutdown only stops background control-plane work and closes local resources. + +Consequences +- A restarted master-service reuses existing sandboxes instead of starting duplicates for the same chat. +- Startup now depends on Docker state access and should fail fast if runtime state cannot be listed. +- The reconciliation rule stays local to outer layers and does not leak Docker into usecases. diff --git a/repository/sandbox_session.py b/repository/sandbox_session.py index 3a8857f..893ec65 100644 --- a/repository/sandbox_session.py +++ b/repository/sandbox_session.py @@ -11,6 +11,12 @@ class InMemorySandboxSessionRepository(SandboxSessionRepository): self._sessions_by_chat_id: dict[UUID, SandboxSession] = {} self._lock = threading.Lock() + def replace_all(self, sessions: list[SandboxSession]) -> None: + with self._lock: + self._sessions_by_chat_id = { + session.chat_id: session for session in sessions + } + def get_active_by_chat_id(self, chat_id: UUID) -> SandboxSession | None: with self._lock: return self._sessions_by_chat_id.get(chat_id) diff --git a/tasks.md b/tasks.md index 058983e..1101e17 100644 --- a/tasks.md +++ b/tasks.md @@ -198,7 +198,7 @@ ### M16. 
Lifecycle reconciliation на startup/shutdown - Субагент: `feature-developer` -- Статус: pending +- Статус: completed - Зависимости: `M13` - Commit required: no - Scope: устранить restart-gap между in-memory registry и уже запущенными Docker containers diff --git a/test/test_create_http.py b/test/test_create_http.py index 2a474ff..652644b 100644 --- a/test/test_create_http.py +++ b/test/test_create_http.py @@ -23,8 +23,10 @@ from adapter.di.container import AppContainer, AppRepositories, AppUsecases from adapter.http.fastapi import app as app_module from adapter.observability.noop import NoopMetrics, NoopTracer from adapter.observability.runtime import ObservabilityRuntime +from adapter.sandbox.reconciliation import SandboxSessionReconciler from domain.error import SandboxError, SandboxStartError from domain.sandbox import SandboxSession, SandboxStatus +from repository.sandbox_lock import ProcessLocalSandboxLifecycleLocker from repository.sandbox_session import InMemorySandboxSessionRepository from usecase.interface import Attrs from usecase.sandbox import CleanupExpiredSandboxes, CreateSandbox, CreateSandboxCommand @@ -85,6 +87,61 @@ class FakeDockerClient(DockerClient): self.close_calls += 1 +class EmptySandboxState: + def __init__(self) -> None: + self.calls = 0 + + def list_active_sessions(self) -> list[SandboxSession]: + self.calls += 1 + return [] + + +class FakeClock: + def __init__(self, now: datetime) -> None: + self._now = now + + def now(self) -> datetime: + return self._now + + +class FakeLifecycleRuntime: + def __init__(self, sessions: list[SandboxSession]) -> None: + self._sessions = list(sessions) + self.list_calls = 0 + self.create_calls: list[CreateSandboxCommand] = [] + self.stop_calls: list[str] = [] + + def list_active_sessions(self) -> list[SandboxSession]: + self.list_calls += 1 + return list(self._sessions) + + def create( + self, + *, + session_id: UUID, + chat_id: UUID, + created_at: datetime, + expires_at: datetime, + ) -> SandboxSession: + 
self.create_calls.append(CreateSandboxCommand(chat_id=chat_id)) + session = SandboxSession( + session_id=session_id, + chat_id=chat_id, + container_id=f'container-{session_id}', + status=SandboxStatus.RUNNING, + created_at=created_at, + expires_at=expires_at, + ) + self._sessions = [ + existing for existing in self._sessions if existing.chat_id != chat_id + ] + self._sessions.append(session) + return session + + def stop(self, container_id: str) -> None: + self.stop_calls.append(container_id) + + def build_config() -> AppConfig: return AppConfig( app=AppSectionConfig(name='master', env='test'), @@ -123,10 +180,11 @@ def build_config() -> AppConfig: def build_container( config: AppConfig, - create_sandbox_usecase: FakeCreateSandboxUsecase, - cleanup_usecase: FakeCleanupExpiredSandboxes, + create_sandbox_usecase: CreateSandbox, + cleanup_usecase: CleanupExpiredSandboxes, logger: FakeLogger, docker_client: FakeDockerClient, + sandbox_reconciler: SandboxSessionReconciler | None = None, ) -> AppContainer: observability = ObservabilityRuntime( logger=logger, @@ -134,6 +192,13 @@ def build_container( tracer=NoopTracer(), ) repositories = AppRepositories(sandbox_session=InMemorySandboxSessionRepository()) + reconciler = sandbox_reconciler + if reconciler is None: + reconciler = SandboxSessionReconciler( + state_source=EmptySandboxState(), + registry=repositories.sandbox_session, + logger=logger, + ) usecases = AppUsecases( create_sandbox=create_sandbox_usecase, cleanup_expired_sandboxes=cleanup_usecase, @@ -143,6 +208,7 @@ def build_container( observability=observability, repositories=repositories, usecases=usecases, + sandbox_reconciler=reconciler, _docker_client=docker_client, ) @@ -401,6 +467,85 @@ def test_post_create_maps_generic_sandbox_errors_to_internal_error(monkeypatch) assert docker_client.close_calls == 1 +def test_startup_reconciliation_reuses_existing_container_after_restart( + monkeypatch, +) -> None: + config = build_config() + created_at = datetime(2026, 
4, 2, 12, 0, tzinfo=UTC) + restored_session = SandboxSession( + session_id=SESSION_ID, + chat_id=CHAT_ID, + container_id='container-123', + status=SandboxStatus.RUNNING, + created_at=created_at, + expires_at=created_at + timedelta(minutes=5), + ) + logger = FakeLogger() + docker_client = FakeDockerClient() + runtime = FakeLifecycleRuntime([restored_session]) + repository = InMemorySandboxSessionRepository() + observability = ObservabilityRuntime( + logger=logger, + metrics=NoopMetrics(), + tracer=NoopTracer(), + ) + repositories = AppRepositories(sandbox_session=repository) + reconciler = SandboxSessionReconciler( + state_source=runtime, + registry=repository, + logger=logger, + ) + usecases = AppUsecases( + create_sandbox=CreateSandbox( + repository=repository, + locker=ProcessLocalSandboxLifecycleLocker(), + runtime=runtime, + clock=FakeClock(created_at), + logger=logger, + ttl=timedelta(minutes=5), + ), + cleanup_expired_sandboxes=CleanupExpiredSandboxes( + repository=repository, + locker=ProcessLocalSandboxLifecycleLocker(), + runtime=runtime, + clock=FakeClock(created_at), + logger=logger, + ), + ) + container = AppContainer( + config=config, + observability=observability, + repositories=repositories, + usecases=usecases, + sandbox_reconciler=reconciler, + _docker_client=docker_client, + ) + monkeypatch.setattr(app_module, 'build_container', lambda **kwargs: container) + monkeypatch.setattr( + app_module.FastAPIInstrumentor, 'instrument_app', lambda *args, **kwargs: None + ) + + app = app_module.create_app(config=config) + + status_code, response = asyncio.run( + exercise_create_request(app, {'chat_id': str(CHAT_ID)}) + ) + + assert status_code == 200 + assert response == { + 'session_id': str(SESSION_ID), + 'chat_id': str(CHAT_ID), + 'container_id': 'container-123', + 'status': 'running', + 'expires_at': '2026-04-02T12:05:00Z', + } + assert runtime.list_calls == 1 + assert runtime.create_calls == [] + assert runtime.stop_calls == [] + assert 
repository.get_active_by_chat_id(CHAT_ID) == restored_session + assert docker_client.close_calls == 1 + + def test_removed_user_endpoint_returns_not_found(monkeypatch) -> None: config = build_config() expires_at = datetime(2026, 4, 2, 12, 5, tzinfo=UTC) diff --git a/test/test_docker_runtime.py b/test/test_docker_runtime.py index d266eff..1e207f3 100644 --- a/test/test_docker_runtime.py +++ b/test/test_docker_runtime.py @@ -11,7 +11,7 @@ from docker.types import Mount from adapter.config.model import SandboxConfig from adapter.docker.runtime import DockerSandboxRuntime from domain.error import SandboxError, SandboxStartError -from domain.sandbox import SandboxStatus +from domain.sandbox import SandboxSession, SandboxStatus CHAT_ID = UUID('123e4567-e89b-12d3-a456-426614174000') NON_CANONICAL_CHAT_ID = '123E4567E89B12D3A456426614174000' @@ -27,6 +27,19 @@ class FakeContainer: self.stop_calls += 1 +class FakeListedContainer(FakeContainer): + def __init__( + self, + container_id: str, + *, + labels: dict[str, str], + created_at: str, + ) -> None: + super().__init__(container_id) + self.labels = labels + self.attrs = {'Created': created_at} + + class RunKwargs(TypedDict): detach: bool labels: dict[str, str] @@ -42,8 +55,10 @@ class FakeContainers: def __init__(self, run_result: FakeContainer | None = None) -> None: self.run_calls: list[RunCall] = [] self.get_calls: list[str] = [] + self.list_calls: list[dict[str, object]] = [] self.run_result = run_result or FakeContainer('container-123') self.get_result: FakeContainer | Exception | None = None + self.list_result: list[object] = [] def run( self, @@ -73,6 +88,10 @@ class FakeContainers: raise AssertionError('missing get result') return self.get_result + def list(self, *, filters: dict[str, list[str]]) -> list[object]: + self.list_calls.append({'filters': filters}) + return self.list_result + class FakeDockerClient(DockerClient): def __init__(self, containers: FakeContainers) -> None: @@ -197,3 +216,47 @@ def 
test_runtime_stop_wraps_docker_errors(tmp_path: Path) -> None: runtime.stop('container-123') assert str(excinfo.value) == 'sandbox_stop_failed' + + +def test_runtime_list_active_sessions_reads_valid_labeled_containers( + tmp_path: Path, +) -> None: + config = build_config(tmp_path) + containers = FakeContainers() + expires_at = datetime(2026, 4, 2, 12, 5, tzinfo=UTC) + containers.list_result = [ + FakeListedContainer( + 'container-123', + labels={ + 'session_id': str(SESSION_ID), + 'chat_id': str(CHAT_ID), + 'expires_at': expires_at.isoformat(), + }, + created_at='2026-04-02T12:00:00Z', + ), + FakeListedContainer( + 'container-bad', + labels={ + 'chat_id': str(CHAT_ID), + 'expires_at': expires_at.isoformat(), + }, + created_at='2026-04-02T12:01:00Z', + ), + ] + runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) + + sessions = runtime.list_active_sessions() + + assert sessions == [ + SandboxSession( + session_id=SESSION_ID, + chat_id=CHAT_ID, + container_id='container-123', + status=SandboxStatus.RUNNING, + created_at=datetime(2026, 4, 2, 12, 0, tzinfo=UTC), + expires_at=expires_at, + ) + ] + assert containers.list_calls == [ + {'filters': {'label': ['session_id', 'chat_id', 'expires_at']}} + ] From 4cb3c5410c54bb3f044cec94559677c81fc2b4f5 Mon Sep 17 00:00:00 2001 From: Azamat Date: Thu, 2 Apr 2026 23:52:36 +0300 Subject: [PATCH 17/30] [fix] lock del idle --- repository/sandbox_lock.py | 46 +++++++++++++++---- tasks.md | 2 +- test/test_sandbox_lock.py | 93 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 130 insertions(+), 11 deletions(-) create mode 100644 test/test_sandbox_lock.py diff --git a/repository/sandbox_lock.py b/repository/sandbox_lock.py index b13cd65..ffa0e35 100644 --- a/repository/sandbox_lock.py +++ b/repository/sandbox_lock.py @@ -1,4 +1,5 @@ import threading +from dataclasses import dataclass from types import TracebackType from typing import Protocol from uuid import UUID @@ -12,12 +13,25 @@ class _SyncLock(Protocol): def 
release(self) -> None: ... +@dataclass(slots=True) +class _LockEntry: + lock: _SyncLock + users: int = 0 + + class _ChatLock(LockContext): - def __init__(self, lock: _SyncLock) -> None: - self._lock = lock + def __init__( + self, + locker: 'ProcessLocalSandboxLifecycleLocker', + chat_id: UUID, + entry: _LockEntry, + ) -> None: + self._locker = locker + self._chat_id = chat_id + self._entry = entry def __enter__(self) -> None: - self._lock.acquire() + self._entry.lock.acquire() def __exit__( self, @@ -25,20 +39,32 @@ class _ChatLock(LockContext): exc: BaseException | None, traceback: TracebackType | None, ) -> bool | None: - self._lock.release() + self._entry.lock.release() + self._locker._release(self._chat_id, self._entry) return None class ProcessLocalSandboxLifecycleLocker(SandboxLifecycleLocker): def __init__(self) -> None: self._registry_lock = threading.Lock() - self._locks_by_chat_id: dict[UUID, _SyncLock] = {} + self._locks_by_chat_id: dict[UUID, _LockEntry] = {} def lock(self, chat_id: UUID) -> LockContext: with self._registry_lock: - lock = self._locks_by_chat_id.get(chat_id) - if lock is None: - lock = threading.Lock() - self._locks_by_chat_id[chat_id] = lock + entry = self._locks_by_chat_id.get(chat_id) + if entry is None: + entry = _LockEntry(lock=threading.Lock()) + self._locks_by_chat_id[chat_id] = entry + entry.users += 1 - return _ChatLock(lock) + return _ChatLock(self, chat_id, entry) + + def _release(self, chat_id: UUID, entry: _LockEntry) -> None: + with self._registry_lock: + entry.users -= 1 + if entry.users != 0: + return + + current_entry = self._locks_by_chat_id.get(chat_id) + if current_entry is entry: + del self._locks_by_chat_id[chat_id] diff --git a/tasks.md b/tasks.md index 1101e17..494c655 100644 --- a/tasks.md +++ b/tasks.md @@ -209,7 +209,7 @@ ### M17. 
Управление жизненным циклом per-chat locks - Субагент: `feature-developer` -- Статус: pending +- Статус: completed - Зависимости: `M13` - Commit required: no - Scope: ограничить неограниченный рост registry locks по числу когда-либо увиденных `chat_id` diff --git a/test/test_sandbox_lock.py b/test/test_sandbox_lock.py new file mode 100644 index 0000000..1177cec --- /dev/null +++ b/test/test_sandbox_lock.py @@ -0,0 +1,93 @@ +import threading +from uuid import UUID + +from repository.sandbox_lock import ProcessLocalSandboxLifecycleLocker + +CHAT_ID = UUID('77777777-7777-7777-7777-777777777777') + + +class LockRace: + def __init__(self, locker: ProcessLocalSandboxLifecycleLocker) -> None: + self.locker = locker + self.entered_first = threading.Event() + self.second_requested = threading.Event() + self.second_entered = threading.Event() + self.release_first = threading.Event() + self.release_second = threading.Event() + self.errors: list[Exception] = [] + self.order: list[str] = [] + self.first_entry: object | None = None + + def run_first(self) -> None: + try: + with self.locker.lock(CHAT_ID): + self.first_entry = self.locker._locks_by_chat_id[CHAT_ID] + self.order.append('first_entered') + self.entered_first.set() + assert self.release_first.wait(timeout=1) + self.order.append('first_releasing') + except Exception as exc: + self.errors.append(exc) + + def run_second(self) -> None: + try: + assert self.entered_first.wait(timeout=1) + context = self.locker.lock(CHAT_ID) + self.second_requested.set() + + with context: + self.order.append('second_entered') + self.second_entered.set() + assert self.release_second.wait(timeout=1) + self.order.append('second_releasing') + except Exception as exc: + self.errors.append(exc) + + +def test_process_local_sandbox_lifecycle_locker_evicts_idle_lock() -> None: + locker = ProcessLocalSandboxLifecycleLocker() + + with locker.lock(CHAT_ID): + assert CHAT_ID in locker._locks_by_chat_id + assert len(locker._locks_by_chat_id) == 1 + + 
assert CHAT_ID not in locker._locks_by_chat_id + assert len(locker._locks_by_chat_id) == 0 + + +def test_process_local_sandbox_lifecycle_locker_keeps_shared_lock_for_waiters() -> None: + locker = ProcessLocalSandboxLifecycleLocker() + race = LockRace(locker) + first_thread = threading.Thread(target=race.run_first) + second_thread = threading.Thread(target=race.run_second) + + first_thread.start() + assert race.entered_first.wait(timeout=1) + + second_thread.start() + assert race.second_requested.wait(timeout=1) + assert len(locker._locks_by_chat_id) == 1 + assert locker._locks_by_chat_id[CHAT_ID] is race.first_entry + assert not race.second_entered.wait(timeout=0.1) + + race.release_first.set() + assert race.second_entered.wait(timeout=1) + assert len(locker._locks_by_chat_id) == 1 + assert locker._locks_by_chat_id[CHAT_ID] is race.first_entry + + race.release_second.set() + + first_thread.join(timeout=1) + second_thread.join(timeout=1) + + assert not first_thread.is_alive() + assert not second_thread.is_alive() + assert race.errors == [] + assert race.order == [ + 'first_entered', + 'first_releasing', + 'second_entered', + 'second_releasing', + ] + assert CHAT_ID not in locker._locks_by_chat_id + assert len(locker._locks_by_chat_id) == 0 From e9ef178b15fb5bb9ea07f7ec6f989242aace03f8 Mon Sep 17 00:00:00 2001 From: Azamat Date: Fri, 3 Apr 2026 00:16:19 +0300 Subject: [PATCH 18/30] [feat] add docker in docker support --- config/docker-compose.yml | 42 +++++++++++++++++++++++++++++++++ docker-compose.yml | 49 +++++++++++++++++++++++++++++---------- 2 files changed, 79 insertions(+), 12 deletions(-) create mode 100644 config/docker-compose.yml diff --git a/config/docker-compose.yml b/config/docker-compose.yml new file mode 100644 index 0000000..a601f99 --- /dev/null +++ b/config/docker-compose.yml @@ -0,0 +1,42 @@ +app: + name: master-service + env: docker-compose + +http: + host: 0.0.0.0 + port: 8123 + +logging: + level: INFO + output: otel + format: json + +metrics: 
+ enabled: true + +tracing: + enabled: true + +otel: + service_name: master-service + logs_endpoint: http://otel-collector:4318/v1/logs + metrics_endpoint: http://otel-collector:4318/v1/metrics + traces_endpoint: http://otel-collector:4318/v1/traces + metric_export_interval: 1000 + +docker: + base_url: tcp://docker-engine:2375 + +sandbox: + image: nginx:1.27-alpine + ttl_seconds: 30 + cleanup_interval_seconds: 5 + chats_root: /var/lib/master-sandbox/chats + dependencies_host_path: /var/lib/master-dependencies + lambda_tools_host_path: /var/lib/master-lambda-tools + chat_mount_path: /workspace/chat + dependencies_mount_path: /opt/dependencies + lambda_tools_mount_path: /opt/lambda-tools + +security: + token_header: X-API-Token diff --git a/docker-compose.yml b/docker-compose.yml index 86e1bbb..24d5bab 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -4,22 +4,43 @@ services: context: . dockerfile: Dockerfile target: run + user: root depends_on: - - otel-collector + docker-engine: + condition: service_healthy + otel-collector: + condition: service_started environment: - APP_API_TOKEN: ${APP_API_TOKEN:?APP_API_TOKEN is required} - APP_SIGNING_KEY: ${APP_SIGNING_KEY:?APP_SIGNING_KEY is required} - APP_ENV: docker - APP_HTTP_HOST: 0.0.0.0 - APP_HTTP_PORT: '8123' - APP_LOGGING_OUTPUT: otel - APP_METRICS_ENABLED: 'true' - APP_TRACING_ENABLED: 'true' - APP_OTEL_LOGS_ENDPOINT: http://otel-collector:4318/v1/logs - APP_OTEL_METRICS_ENDPOINT: http://otel-collector:4318/v1/metrics - APP_OTEL_TRACES_ENDPOINT: http://otel-collector:4318/v1/traces + APP_API_TOKEN: local-api-token + APP_SIGNING_KEY: local-signing-key ports: - '127.0.0.1:8123:8123' + volumes: + - ./config/docker-compose.yml:/app/config/app.yaml:ro + - sandbox-data:/var/lib/master-sandbox + - sandbox-dependencies:/var/lib/master-dependencies:ro + - sandbox-tools:/var/lib/master-lambda-tools:ro + + docker-engine: + image: docker:28-dind + privileged: true + environment: + DOCKER_TLS_CERTDIR: '' + command: + 
- --host=tcp://0.0.0.0:2375 + healthcheck: + test: + - CMD + - docker + - info + interval: 5s + timeout: 5s + retries: 12 + volumes: + - docker-data:/var/lib/docker + - sandbox-data:/var/lib/master-sandbox + - sandbox-dependencies:/var/lib/master-dependencies + - sandbox-tools:/var/lib/master-lambda-tools otel-collector: image: grafana/otel-lgtm:latest @@ -29,4 +50,8 @@ services: - lgtm-data:/data volumes: + docker-data: lgtm-data: + sandbox-data: + sandbox-dependencies: + sandbox-tools: From a86e1ee8c706f68c25c9f09ebdea600c4ed179d1 Mon Sep 17 00:00:00 2001 From: Azamat Date: Fri, 3 Apr 2026 00:37:35 +0300 Subject: [PATCH 19/30] add sandbox observability contracts --- adapter/observability/noop.py | 8 +++ adapter/otel/metrics.py | 32 +++++++++++ docs/008-sandbox-lifecycle-observability.md | 18 +++++++ repository/sandbox_session.py | 4 ++ tasks.md | 60 +++++++++++++++++++++ usecase/interface.py | 9 ++++ 6 files changed, 131 insertions(+) create mode 100644 docs/008-sandbox-lifecycle-observability.md diff --git a/adapter/observability/noop.py b/adapter/observability/noop.py index fe7d190..7027d41 100644 --- a/adapter/observability/noop.py +++ b/adapter/observability/noop.py @@ -20,6 +20,14 @@ class NoopMetrics: ) -> None: return None + def set( + self, + name: str, + value: int | float, + attrs: Attrs | None = None, + ) -> None: + return None + class NoopSpan: def set_attribute(self, name: str, value: AttrValue) -> None: diff --git a/adapter/otel/metrics.py b/adapter/otel/metrics.py index 48d1278..ed9abe6 100644 --- a/adapter/otel/metrics.py +++ b/adapter/otel/metrics.py @@ -5,12 +5,21 @@ from opentelemetry.metrics import Counter, Histogram, Meter from usecase.interface import Attrs +class _GaugeAdapter: + def __init__(self, gauge: object) -> None: + self._gauge = gauge + + def set(self, value: int | float, attributes: object = None) -> None: + getattr(self._gauge, 'set')(value, attributes=attributes) + + class OtelMetrics: def __init__(self, meter: Meter) -> None: 
self._meter = meter self._lock = Lock() self._counters: dict[str, Counter] = {} self._histograms: dict[str, Histogram] = {} + self._gauges: dict[str, _GaugeAdapter] = {} def increment( self, @@ -34,6 +43,17 @@ class OtelMetrics: attributes=None if attrs is None else dict(attrs), ) + def set( + self, + name: str, + value: int | float, + attrs: Attrs | None = None, + ) -> None: + self._gauge(name).set( + value, + attributes=None if attrs is None else dict(attrs), + ) + def _counter(self, name: str) -> Counter: counter = self._counters.get(name) if counter is not None: @@ -57,3 +77,15 @@ class OtelMetrics: histogram = self._meter.create_histogram(name) self._histograms[name] = histogram return histogram + + def _gauge(self, name: str) -> _GaugeAdapter: + gauge = self._gauges.get(name) + if gauge is not None: + return gauge + + with self._lock: + gauge = self._gauges.get(name) + if gauge is None: + gauge = _GaugeAdapter(self._meter.create_gauge(name)) + self._gauges[name] = gauge + return gauge diff --git a/docs/008-sandbox-lifecycle-observability.md b/docs/008-sandbox-lifecycle-observability.md new file mode 100644 index 0000000..f56dc10 --- /dev/null +++ b/docs/008-sandbox-lifecycle-observability.md @@ -0,0 +1,18 @@ +# 008 Sandbox lifecycle observability + +## Context +- FR-034 требует метрики по active sandbox, startup latency и cleanup +- Issue #11 требует трассировку sandbox usecase и Docker adapter steps +- Inner layers должны знать только observability ports + +## Decision +- Usecase sandbox lifecycle использует только `Logger`, `Metrics`, `Tracer` +- `Metrics` получает `set(...)` для current-state signals +- `sandbox.active.count` считается из session registry через `count_active()` +- M19 добавляет только contracts и adapter support для будущих lifecycle signals +- M20 и M21 отдельно добавят spans и runtime metrics в usecase и Docker adapter + +## Consequences +- OTel gauge остается в outer adapter, не протекает во внутренние слои +- Active sandbox count 
синхронизируется после create, cleanup и reconciliation +- Tests могут проверять observability через fake ports без реального OTel backend diff --git a/repository/sandbox_session.py b/repository/sandbox_session.py index 893ec65..bb680d2 100644 --- a/repository/sandbox_session.py +++ b/repository/sandbox_session.py @@ -29,6 +29,10 @@ class InMemorySandboxSessionRepository(SandboxSessionRepository): if session.expires_at <= now ] + def count_active(self) -> int: + with self._lock: + return len(self._sessions_by_chat_id) + def save(self, session: SandboxSession) -> None: with self._lock: self._sessions_by_chat_id[session.chat_id] = session diff --git a/tasks.md b/tasks.md index 494c655..d24657c 100644 --- a/tasks.md +++ b/tasks.md @@ -227,3 +227,63 @@ - Файлы: `domain/sandbox.py`, `usecase/interface.py`, `usecase/sandbox.py`, `repository/sandbox_session.py`, `adapter/http/fastapi/*`, `adapter/docker/runtime.py`, `adapter/di/container.py`, `test/*` - Решение: HTTP boundary принимает/возвращает UUID, usecase и repository работают с UUID objects, Docker labels продолжают сериализоваться в строки через `str(uuid)` - Критерии приемки: внутри sandbox flow `chat_id` и `session_id` больше не строки; `container_id` остается `str`; pydantic корректно сериализует UUID в response; `make pre-commit` проходит + +## Follow-up после issue #11 observability + +### M19. 
ADR и observability contracts для sandbox lifecycle + +- Исполнитель: `primary-agent` +- Статус: completed +- Зависимости: `M18` +- Commit required: yes +- Commit message: `add sandbox observability contracts` +- Scope: зафиксировать sandbox lifecycle observability policy в ADR-lite и подготовить минимальные контракты для traces и current-state metrics без нарушения clean architecture +- Файлы: `docs/008-sandbox-lifecycle-observability.md`, `usecase/interface.py`, `repository/sandbox_session.py`, `adapter/otel/metrics.py`, `adapter/observability/noop.py` +- Решение: добавить в `Metrics` порт операцию `set(...)` для gauge-like current-state сигналов; добавить в `SandboxSessionRepository` `count_active()` как источник truth для `sandbox.active.count` +- Критерии приемки: ADR занимает 10-20 строк; inner layers по-прежнему знают только порты `Logger`/`Metrics`/`Tracer`; current-state метрика активных sandbox выражается без OTel imports во внутреннем слое + +### M20. Трейсы и метрики в sandbox usecases + +- Субагент: `feature-developer` +- Статус: pending +- Зависимости: `M19` +- Commit required: yes +- Commit message: `instrument sandbox usecases` +- Scope: добавить spans и ключевые lifecycle metrics в `CreateSandbox` и `CleanupExpiredSandboxes` +- Файлы: `usecase/sandbox.py`, `adapter/di/container.py`, при необходимости тесты в `test/*` +- Решение: usecase получает `Metrics` и `Tracer` через конструктор; `CreateSandbox` и `CleanupExpiredSandboxes` публикуют `sandbox.create.total`, `sandbox.cleanup.total`, `sandbox.cleanup.error.total` и обновляют `sandbox.active.count` после мутаций registry +- Критерии приемки: есть spans `usecase.create_sandbox` и `usecase.cleanup_expired_sandboxes`; span attrs и metric attrs включают ключевые lifecycle identifiers/result fields; reuse/replace/cleanup paths наблюдаемы без OTel imports в usecase + +### M21. 
Трейсы и runtime metrics в Docker adapter и reconciliation + +- Субагент: `feature-developer` +- Статус: pending +- Зависимости: `M19` +- Commit required: yes +- Commit message: `instrument sandbox docker runtime` +- Scope: добавить observability в `DockerSandboxRuntime` и reconciliation path для Docker operations и current-state sync +- Файлы: `adapter/docker/runtime.py`, `adapter/sandbox/reconciliation.py`, `adapter/di/container.py`, при необходимости тесты в `test/*` +- Решение: `DockerSandboxRuntime` получает `Metrics` и `Tracer`; create/stop/list публикуют duration histograms `sandbox.runtime.create.duration_ms`, `sandbox.runtime.stop.duration_ms`, `sandbox.runtime.list_active.duration_ms`, error counter `sandbox.runtime.error.total` и span attrs по chat/session/container; reconciliation обновляет `sandbox.active.count` по registry snapshot +- Критерии приемки: Docker adapter остается во внешнем слое; ошибки Docker операций отражаются в spans и metrics; после startup reconciliation current-state метрика активных sandbox синхронизирована с registry + +### M22. Тесты на sandbox observability + +- Субагент: `test-engineer` +- Статус: pending +- Зависимости: `M20`, `M21` +- Commit required: yes +- Commit message: `add sandbox observability tests` +- Scope: покрыть regression tests новую observability policy без реального OTel backend +- Файлы: `test/test_sandbox_usecase.py`, `test/test_docker_runtime.py`, при необходимости новые focused tests в `test/*` +- Решение: использовать типизированные fake metrics/tracer implementations и проверить names/attrs ключевых spans и metrics на create/reuse/replace/cleanup/runtime paths +- Критерии приемки: тесты подтверждают spans и metrics на usecase и adapter paths; constructor wiring обновлен без mypy regressions; `make typecheck` и релевантный `pytest` проходят + +### M23. 
Boundary review для sandbox observability + +- Субагент: `code-reviewer` +- Статус: pending +- Зависимости: `M22` +- Commit required: no +- Scope: проверить, что observability изменения закрывают issue #11 и FR-034 без нарушения clean architecture +- Файлы: весь измененный код после `M19`-`M22` +- Критерии приемки: inner layers не импортируют OTel; Docker-specific tracing остается в `adapter/docker/`; current-state и duration metrics достаточно покрывают sandbox lifecycle; замечания сведены к minor или отсутствуют diff --git a/usecase/interface.py b/usecase/interface.py index 15c581a..69876e6 100644 --- a/usecase/interface.py +++ b/usecase/interface.py @@ -24,6 +24,8 @@ class SandboxSessionRepository(Protocol): def list_expired(self, now: datetime) -> list[SandboxSession]: ... + def count_active(self) -> int: ... + def save(self, session: SandboxSession) -> None: ... def delete(self, session_id: UUID) -> None: ... @@ -86,6 +88,13 @@ class Metrics(Protocol): attrs: Attrs | None = None, ) -> None: ... + def set( + self, + name: str, + value: int | float, + attrs: Attrs | None = None, + ) -> None: ... + class Span(Protocol): def set_attribute(self, name: str, value: AttrValue) -> None: ... 
From 4cdf6e45de0363a2db350ca8e78b4125f1103610 Mon Sep 17 00:00:00 2001 From: Azamat Date: Fri, 3 Apr 2026 00:56:37 +0300 Subject: [PATCH 20/30] instrument sandbox usecases --- adapter/di/container.py | 4 + tasks.md | 2 +- test/test_create_http.py | 4 + test/test_sandbox_usecase.py | 15 +++ usecase/sandbox.py | 215 +++++++++++++++++++++++++++-------- 5 files changed, 191 insertions(+), 49 deletions(-) diff --git a/adapter/di/container.py b/adapter/di/container.py index ace3a42..4b87b2f 100644 --- a/adapter/di/container.py +++ b/adapter/di/container.py @@ -96,6 +96,8 @@ def build_container( runtime=sandbox_runtime, clock=clock, logger=observability.logger, + metrics=observability.metrics, + tracer=observability.tracer, ttl=timedelta(seconds=app_config.sandbox.ttl_seconds), ), cleanup_expired_sandboxes=CleanupExpiredSandboxes( @@ -104,6 +106,8 @@ def build_container( runtime=sandbox_runtime, clock=clock, logger=observability.logger, + metrics=observability.metrics, + tracer=observability.tracer, ), ) diff --git a/tasks.md b/tasks.md index d24657c..a43d96a 100644 --- a/tasks.md +++ b/tasks.md @@ -245,7 +245,7 @@ ### M20. 
Трейсы и метрики в sandbox usecases - Субагент: `feature-developer` -- Статус: pending +- Статус: completed - Зависимости: `M19` - Commit required: yes - Commit message: `instrument sandbox usecases` diff --git a/test/test_create_http.py b/test/test_create_http.py index 652644b..a25eaba 100644 --- a/test/test_create_http.py +++ b/test/test_create_http.py @@ -502,6 +502,8 @@ def test_startup_reconciliation_reuses_existing_container_after_restart( runtime=runtime, clock=FakeClock(created_at), logger=logger, + metrics=NoopMetrics(), + tracer=NoopTracer(), ttl=timedelta(minutes=5), ), cleanup_expired_sandboxes=CleanupExpiredSandboxes( @@ -510,6 +512,8 @@ def test_startup_reconciliation_reuses_existing_container_after_restart( runtime=runtime, clock=FakeClock(created_at), logger=logger, + metrics=NoopMetrics(), + tracer=NoopTracer(), ), ) container = AppContainer( diff --git a/test/test_sandbox_usecase.py b/test/test_sandbox_usecase.py index 4fedb21..f744492 100644 --- a/test/test_sandbox_usecase.py +++ b/test/test_sandbox_usecase.py @@ -2,6 +2,7 @@ import threading from datetime import UTC, datetime, timedelta from uuid import UUID +from adapter.observability.noop import NoopMetrics, NoopTracer from domain.sandbox import SandboxSession, SandboxStatus from repository.sandbox_lock import ProcessLocalSandboxLifecycleLocker from repository.sandbox_session import InMemorySandboxSessionRepository @@ -218,6 +219,8 @@ def test_create_sandbox_reuses_active_session_when_not_expired() -> None: runtime=runtime, clock=FakeClock(now), logger=logger, + metrics=NoopMetrics(), + tracer=NoopTracer(), ttl=timedelta(minutes=5), ) @@ -264,6 +267,8 @@ def test_create_sandbox_replaces_expired_session_and_creates_new_one( runtime=runtime, clock=FakeClock(now), logger=logger, + metrics=NoopMetrics(), + tracer=NoopTracer(), ttl=timedelta(minutes=5), ) monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID) @@ -323,6 +328,8 @@ def 
test_create_sandbox_creates_new_session_when_none_exists() -> None: runtime=runtime, clock=FakeClock(now), logger=logger, + metrics=NoopMetrics(), + tracer=NoopTracer(), ttl=timedelta(minutes=5), ) @@ -370,6 +377,8 @@ def test_create_sandbox_serializes_duplicate_concurrent_create_for_chat_id( runtime=runtime, clock=FakeClock(now), logger=logger, + metrics=NoopMetrics(), + tracer=NoopTracer(), ttl=timedelta(minutes=5), ) monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID) @@ -473,6 +482,8 @@ def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() -> runtime=runtime, clock=FakeClock(now), logger=logger, + metrics=NoopMetrics(), + tracer=NoopTracer(), ) result = usecase.execute() @@ -534,6 +545,8 @@ def test_cleanup_expired_sandboxes_skips_replaced_session_from_stale_snapshot() runtime=runtime, clock=FakeClock(now), logger=logger, + metrics=NoopMetrics(), + tracer=NoopTracer(), ) result = usecase.execute() @@ -575,6 +588,8 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None: runtime=runtime, clock=FakeClock(now), logger=logger, + metrics=NoopMetrics(), + tracer=NoopTracer(), ) result = usecase.execute() diff --git a/usecase/sandbox.py b/usecase/sandbox.py index 83ee39d..2bdb369 100644 --- a/usecase/sandbox.py +++ b/usecase/sandbox.py @@ -6,9 +6,11 @@ from domain.sandbox import SandboxSession from usecase.interface import ( Clock, Logger, + Metrics, SandboxLifecycleLocker, SandboxRuntime, SandboxSessionRepository, + Tracer, ) @@ -25,6 +27,8 @@ class CreateSandbox: runtime: SandboxRuntime, clock: Clock, logger: Logger, + metrics: Metrics, + tracer: Tracer, ttl: timedelta, ) -> None: self._repository = repository @@ -32,44 +36,80 @@ class CreateSandbox: self._runtime = runtime self._clock = clock self._logger = logger + self._metrics = metrics + self._tracer = tracer self._ttl = ttl def execute(self, command: CreateSandboxCommand) -> SandboxSession: chat_id = command.chat_id - with 
self._locker.lock(chat_id): - session = self._repository.get_active_by_chat_id(chat_id) - now = self._clock.now() + with self._tracer.start_span( + 'usecase.create_sandbox', + attrs={'chat.id': str(chat_id)}, + ) as span: + try: + with self._locker.lock(chat_id): + session = self._repository.get_active_by_chat_id(chat_id) + now = self._clock.now() - if session is not None and session.expires_at > now: - self._logger.info( - 'sandbox_reused', - attrs=_sandbox_attrs(session), + if session is not None and session.expires_at > now: + span.set_attribute('session.id', str(session.session_id)) + span.set_attribute('container.id', session.container_id) + span.set_attribute('sandbox.result', 'reused') + self._metrics.increment( + 'sandbox.create.total', + attrs=_result_metric_attrs('reused'), + ) + self._logger.info( + 'sandbox_reused', + attrs=_sandbox_attrs(session), + ) + return session + + result = 'created' + if session is not None: + result = 'replaced' + span.set_attribute('session.id', str(session.session_id)) + span.set_attribute('container.id', session.container_id) + self._logger.info( + 'sandbox_replaced', + attrs=_sandbox_attrs(session), + ) + self._runtime.stop(session.container_id) + self._repository.delete(session.session_id) + _set_active_count(self._metrics, self._repository) + + created_at = self._clock.now() + expires_at = created_at + self._ttl + session_id = _new_session_id() + span.set_attribute('session.id', str(session_id)) + new_session = self._runtime.create( + session_id=session_id, + chat_id=chat_id, + created_at=created_at, + expires_at=expires_at, + ) + self._repository.save(new_session) + _set_active_count(self._metrics, self._repository) + span.set_attribute('container.id', new_session.container_id) + span.set_attribute('sandbox.result', result) + self._metrics.increment( + 'sandbox.create.total', + attrs=_result_metric_attrs(result), + ) + self._logger.info( + 'sandbox_created', + attrs=_sandbox_attrs(new_session), + ) + return new_session 
+ except Exception as exc: + span.set_attribute('sandbox.result', 'error') + self._metrics.increment( + 'sandbox.create.total', + attrs=_result_metric_attrs('error'), ) - return session - - if session is not None: - self._logger.info( - 'sandbox_replaced', - attrs=_sandbox_attrs(session), - ) - self._runtime.stop(session.container_id) - self._repository.delete(session.session_id) - - created_at = self._clock.now() - expires_at = created_at + self._ttl - new_session = self._runtime.create( - session_id=_new_session_id(), - chat_id=chat_id, - created_at=created_at, - expires_at=expires_at, - ) - self._repository.save(new_session) - self._logger.info( - 'sandbox_created', - attrs=_sandbox_attrs(new_session), - ) - return new_session + span.record_error(exc) + raise class CleanupExpiredSandboxes: @@ -80,39 +120,84 @@ class CleanupExpiredSandboxes: runtime: SandboxRuntime, clock: Clock, logger: Logger, + metrics: Metrics, + tracer: Tracer, ) -> None: self._repository = repository self._locker = locker self._runtime = runtime self._clock = clock self._logger = logger + self._metrics = metrics + self._tracer = tracer def execute(self) -> list[SandboxSession]: - expired_sessions = self._repository.list_expired(self._clock.now()) cleaned_sessions: list[SandboxSession] = [] + error_count = 0 - for session in expired_sessions: + with self._tracer.start_span( + 'usecase.cleanup_expired_sandboxes', + ) as span: try: - cleaned_session = self._cleanup_session(session) + expired_sessions = self._repository.list_expired(self._clock.now()) except Exception as exc: - attrs = _sandbox_attrs(session) - attrs['error'] = type(exc).__name__ - self._logger.error( - 'sandbox_clean_failed', - attrs=attrs, + span.set_attribute('sandbox.result', 'error') + self._metrics.increment( + 'sandbox.cleanup.error.total', + attrs=_cleanup_error_metric_attrs( + type(exc).__name__, + 'list_expired', + ), ) - continue + span.record_error(exc) + raise - if cleaned_session is None: - continue + 
span.set_attribute('sandbox.expired_count', len(expired_sessions)) + for session in expired_sessions: + with self._tracer.start_span( + 'usecase.cleanup_expired_sandbox', + attrs=_sandbox_span_attrs(session), + ) as cleanup_span: + try: + cleaned_session = self._cleanup_session(session) + except Exception as exc: + error_count += 1 + cleanup_span.set_attribute('sandbox.result', 'error') + cleanup_span.record_error(exc) + self._metrics.increment( + 'sandbox.cleanup.error.total', + attrs=_error_metric_attrs(type(exc).__name__), + ) + attrs = _sandbox_attrs(session) + attrs['error'] = type(exc).__name__ + self._logger.error( + 'sandbox_clean_failed', + attrs=attrs, + ) + continue - cleaned_sessions.append(cleaned_session) - self._logger.info( - 'sandbox_cleaned', - attrs=_sandbox_attrs(cleaned_session), + if cleaned_session is None: + cleanup_span.set_attribute('sandbox.result', 'skipped') + continue + + cleanup_span.set_attribute('sandbox.result', 'cleaned') + cleaned_sessions.append(cleaned_session) + self._metrics.increment( + 'sandbox.cleanup.total', + attrs=_result_metric_attrs('cleaned'), + ) + self._logger.info( + 'sandbox_cleaned', + attrs=_sandbox_attrs(cleaned_session), + ) + + span.set_attribute('sandbox.cleaned_count', len(cleaned_sessions)) + span.set_attribute('sandbox.error_count', error_count) + span.set_attribute( + 'sandbox.result', + 'completed' if error_count == 0 else 'completed_with_errors', ) - - return cleaned_sessions + return cleaned_sessions def _cleanup_session(self, session: SandboxSession) -> SandboxSession | None: with self._locker.lock(session.chat_id): @@ -129,6 +214,7 @@ class CleanupExpiredSandboxes: self._runtime.stop(current_session.container_id) self._repository.delete(current_session.session_id) + _set_active_count(self._metrics, self._repository) return current_session @@ -142,3 +228,36 @@ def _sandbox_attrs(session: SandboxSession) -> dict[str, str]: 'session_id': str(session.session_id), 'container_id': session.container_id, } 
+ + +def _sandbox_span_attrs(session: SandboxSession) -> dict[str, str]: + return { + 'chat.id': str(session.chat_id), + 'session.id': str(session.session_id), + 'container.id': session.container_id, + } + + +def _result_metric_attrs(result: str) -> dict[str, str]: + return {'result': result} + + +def _error_metric_attrs(error_type: str) -> dict[str, str]: + return {'error.type': error_type} + + +def _cleanup_error_metric_attrs( + error_type: str, + reason: str, +) -> dict[str, str]: + return { + 'error.type': error_type, + 'reason': reason, + } + + +def _set_active_count( + metrics: Metrics, + repository: SandboxSessionRepository, +) -> None: + metrics.set('sandbox.active.count', repository.count_active()) From 8d3a080d4575d43bb3878853977ce3613036859d Mon Sep 17 00:00:00 2001 From: Azamat Date: Fri, 3 Apr 2026 01:15:23 +0300 Subject: [PATCH 21/30] instrument sandbox docker runtime --- adapter/di/container.py | 9 +- adapter/docker/runtime.py | 225 +++++++++++++++++++++++------- adapter/sandbox/reconciliation.py | 50 ++++--- tasks.md | 2 +- test/test_create_http.py | 175 ++++++++++++++++++++++- test/test_docker_runtime.py | 23 ++- 6 files changed, 411 insertions(+), 73 deletions(-) diff --git a/adapter/di/container.py b/adapter/di/container.py index 4b87b2f..b18382c 100644 --- a/adapter/di/container.py +++ b/adapter/di/container.py @@ -81,11 +81,18 @@ def build_container( sandbox_repository = InMemorySandboxSessionRepository() sandbox_locker = ProcessLocalSandboxLifecycleLocker() - sandbox_runtime = DockerSandboxRuntime(app_config.sandbox, docker_client) + sandbox_runtime = DockerSandboxRuntime( + app_config.sandbox, + docker_client, + observability.metrics, + observability.tracer, + ) sandbox_reconciler = SandboxSessionReconciler( state_source=sandbox_runtime, registry=sandbox_repository, logger=observability.logger, + metrics=observability.metrics, + tracer=observability.tracer, ) repositories = AppRepositories(sandbox_session=sandbox_repository) diff --git 
a/adapter/docker/runtime.py b/adapter/docker/runtime.py index 3f33466..3c6e93c 100644 --- a/adapter/docker/runtime.py +++ b/adapter/docker/runtime.py @@ -1,3 +1,4 @@ +import time from datetime import datetime from pathlib import Path from uuid import UUID @@ -9,7 +10,7 @@ from docker.types import Mount from adapter.config.model import SandboxConfig from domain.error import SandboxError, SandboxStartError from domain.sandbox import SandboxSession, SandboxStatus -from usecase.interface import SandboxRuntime +from usecase.interface import Metrics, SandboxRuntime, Span, Tracer SANDBOX_LABELS = ('session_id', 'chat_id', 'expires_at') @@ -19,9 +20,13 @@ class DockerSandboxRuntime(SandboxRuntime): self, config: SandboxConfig, client: DockerClient, + metrics: Metrics, + tracer: Tracer, ) -> None: self._config = config self._client = client + self._metrics = metrics + self._tracer = tracer def create( self, @@ -31,62 +36,143 @@ class DockerSandboxRuntime(SandboxRuntime): created_at: datetime, expires_at: datetime, ) -> SandboxSession: - try: - chat_path = self._chat_path(chat_id) - dependencies_path = self._readonly_host_path( - self._config.dependencies_host_path - ) - lambda_tools_path = self._readonly_host_path( - self._config.lambda_tools_host_path - ) - chat_path.mkdir(parents=True, exist_ok=True) - container = self._client.containers.run( - self._config.image, - detach=True, - labels=self._labels(session_id, chat_id, expires_at), - mounts=self._mounts(chat_path, dependencies_path, lambda_tools_path), - ) - except (DockerException, OSError, ValueError) as exc: - raise SandboxStartError(str(chat_id)) from exc + started_at = time.perf_counter() + result = 'error' - container_id = str(getattr(container, 'id', '')).strip() - if not container_id: - raise SandboxStartError(str(chat_id)) + with self._tracer.start_span( + 'adapter.docker.create_sandbox', + attrs={ + 'chat.id': str(chat_id), + 'session.id': str(session_id), + }, + ) as span: + try: + try: + chat_path = 
self._chat_path(chat_id) + dependencies_path = self._readonly_host_path( + self._config.dependencies_host_path + ) + lambda_tools_path = self._readonly_host_path( + self._config.lambda_tools_host_path + ) + chat_path.mkdir(parents=True, exist_ok=True) + container = self._client.containers.run( + self._config.image, + detach=True, + labels=self._labels(session_id, chat_id, expires_at), + mounts=self._mounts( + chat_path, + dependencies_path, + lambda_tools_path, + ), + ) + except (DockerException, OSError, ValueError) as exc: + raise SandboxStartError(str(chat_id)) from exc - return SandboxSession( - session_id=session_id, - chat_id=chat_id, - container_id=container_id, - status=SandboxStatus.RUNNING, - created_at=created_at, - expires_at=expires_at, - ) + container_id = str(getattr(container, 'id', '')).strip() + if not container_id: + raise SandboxStartError(str(chat_id)) + + result = 'created' + span.set_attribute('container.id', container_id) + span.set_attribute('sandbox.result', result) + return SandboxSession( + session_id=session_id, + chat_id=chat_id, + container_id=container_id, + status=SandboxStatus.RUNNING, + created_at=created_at, + expires_at=expires_at, + ) + except Exception as exc: + span.set_attribute('sandbox.result', result) + span.record_error(exc) + self._metrics.increment( + 'sandbox.runtime.error.total', + attrs=_runtime_error_metric_attrs('create', _error_type(exc)), + ) + raise + finally: + self._metrics.record( + 'sandbox.runtime.create.duration_ms', + _duration_ms(started_at), + attrs=_runtime_metric_attrs('create', result), + ) def stop(self, container_id: str) -> None: - try: - container = self._client.containers.get(container_id) - container.stop() - except NotFound: - return - except DockerException as exc: - raise SandboxError('sandbox_stop_failed') from exc + started_at = time.perf_counter() + result = 'error' + + with self._tracer.start_span( + 'adapter.docker.stop_sandbox', + attrs={'container.id': container_id}, + ) as span: + 
try: + container = self._client.containers.get(container_id) + _set_span_container_attrs(span, container) + container.stop() + result = 'stopped' + span.set_attribute('sandbox.result', result) + except NotFound: + result = 'not_found' + span.set_attribute('sandbox.result', result) + return + except DockerException as exc: + span.set_attribute('sandbox.result', result) + span.record_error(exc) + self._metrics.increment( + 'sandbox.runtime.error.total', + attrs=_runtime_error_metric_attrs('stop', type(exc).__name__), + ) + raise SandboxError('sandbox_stop_failed') from exc + finally: + self._metrics.record( + 'sandbox.runtime.stop.duration_ms', + _duration_ms(started_at), + attrs=_runtime_metric_attrs('stop', result), + ) def list_active_sessions(self) -> list[SandboxSession]: - try: - containers = self._client.containers.list( - filters={'label': list(SANDBOX_LABELS)} - ) - except DockerException as exc: - raise SandboxError('sandbox_list_failed') from exc + started_at = time.perf_counter() + result = 'error' - sessions: list[SandboxSession] = [] - for container in containers: - session = self._session_from_container(container) - if session is None: - continue - sessions.append(session) + with self._tracer.start_span( + 'adapter.docker.list_active_sandboxes', + ) as span: + try: + try: + containers = self._client.containers.list( + filters={'label': list(SANDBOX_LABELS)} + ) + except DockerException as exc: + raise SandboxError('sandbox_list_failed') from exc - return sessions + sessions: list[SandboxSession] = [] + for container in containers: + session = self._session_from_container(container) + if session is None: + continue + sessions.append(session) + + result = 'listed' + span.set_attribute('sandbox.container_count', len(containers)) + span.set_attribute('sandbox.active_count', len(sessions)) + span.set_attribute('sandbox.result', result) + return sessions + except Exception as exc: + span.set_attribute('sandbox.result', result) + span.record_error(exc) + 
self._metrics.increment( + 'sandbox.runtime.error.total', + attrs=_runtime_error_metric_attrs('list_active', _error_type(exc)), + ) + raise + finally: + self._metrics.record( + 'sandbox.runtime.list_active.duration_ms', + _duration_ms(started_at), + attrs=_runtime_metric_attrs('list_active', result), + ) def _labels( self, @@ -186,3 +272,44 @@ class DockerSandboxRuntime(SandboxRuntime): def _parse_datetime(value: str) -> datetime: normalized = f'{value[:-1]}+00:00' if value.endswith('Z') else value return datetime.fromisoformat(normalized) + + +def _duration_ms(started_at: float) -> float: + return (time.perf_counter() - started_at) * 1000 + + +def _runtime_metric_attrs(operation: str, result: str) -> dict[str, str]: + return { + 'operation': operation, + 'result': result, + } + + +def _runtime_error_metric_attrs( + operation: str, + error_type: str, +) -> dict[str, str]: + return { + 'operation': operation, + 'error.type': error_type, + } + + +def _error_type(error: Exception) -> str: + if isinstance(error.__cause__, Exception): + return type(error.__cause__).__name__ + return type(error).__name__ + + +def _set_span_container_attrs(span: Span, container: object) -> None: + labels = getattr(container, 'labels', None) + if not isinstance(labels, dict): + return + + session_id = labels.get('session_id') + if isinstance(session_id, str) and session_id: + span.set_attribute('session.id', session_id) + + chat_id = labels.get('chat_id') + if isinstance(chat_id, str) and chat_id: + span.set_attribute('chat.id', chat_id) diff --git a/adapter/sandbox/reconciliation.py b/adapter/sandbox/reconciliation.py index 2d04ca5..81cdb75 100644 --- a/adapter/sandbox/reconciliation.py +++ b/adapter/sandbox/reconciliation.py @@ -3,7 +3,7 @@ from typing import Protocol from uuid import UUID from domain.sandbox import SandboxSession -from usecase.interface import Logger +from usecase.interface import Logger, Metrics, Tracer class SandboxSessionStateSource(Protocol): @@ -13,27 +13,45 @@ 
class SandboxSessionStateSource(Protocol): class SandboxSessionRegistry(Protocol): def replace_all(self, sessions: list[SandboxSession]) -> None: ... + def count_active(self) -> int: ... + @dataclass(frozen=True, slots=True) class SandboxSessionReconciler: state_source: SandboxSessionStateSource registry: SandboxSessionRegistry logger: Logger + metrics: Metrics + tracer: Tracer def execute(self) -> list[SandboxSession]: - sessions_by_chat_id: dict[UUID, SandboxSession] = {} - for session in sorted( - self.state_source.list_active_sessions(), - key=lambda item: item.created_at, - ): - sessions_by_chat_id[session.chat_id] = session + with self.tracer.start_span( + 'adapter.sandbox.reconcile_sessions', + ) as span: + try: + sessions_by_chat_id: dict[UUID, SandboxSession] = {} + discovered_sessions = self.state_source.list_active_sessions() + span.set_attribute('sandbox.discovered_count', len(discovered_sessions)) + for session in sorted( + discovered_sessions, + key=lambda item: item.created_at, + ): + sessions_by_chat_id[session.chat_id] = session - sessions = list(sessions_by_chat_id.values()) - self.registry.replace_all(sessions) - self.logger.info( - 'sandbox_reconciled', - attrs={ - 'session_count': len(sessions), - }, - ) - return sessions + sessions = list(sessions_by_chat_id.values()) + self.registry.replace_all(sessions) + active_count = self.registry.count_active() + self.metrics.set('sandbox.active.count', active_count) + span.set_attribute('sandbox.active_count', active_count) + span.set_attribute('sandbox.result', 'reconciled') + self.logger.info( + 'sandbox_reconciled', + attrs={ + 'session_count': active_count, + }, + ) + return sessions + except Exception as exc: + span.set_attribute('sandbox.result', 'error') + span.record_error(exc) + raise diff --git a/tasks.md b/tasks.md index a43d96a..012111d 100644 --- a/tasks.md +++ b/tasks.md @@ -257,7 +257,7 @@ ### M21. 
Трейсы и runtime metrics в Docker adapter и reconciliation - Субагент: `feature-developer` -- Статус: pending +- Статус: completed - Зависимости: `M19` - Commit required: yes - Commit message: `instrument sandbox docker runtime` diff --git a/test/test_create_http.py b/test/test_create_http.py index a25eaba..e8686c4 100644 --- a/test/test_create_http.py +++ b/test/test_create_http.py @@ -7,6 +7,7 @@ from docker import DockerClient from fastapi import FastAPI from starlette.types import Message, Scope +import adapter.di.container as container_module from adapter.config.model import ( AppConfig, AppSectionConfig, @@ -20,6 +21,7 @@ from adapter.config.model import ( TracingConfig, ) from adapter.di.container import AppContainer, AppRepositories, AppUsecases +from adapter.docker.runtime import DockerSandboxRuntime from adapter.http.fastapi import app as app_module from adapter.observability.noop import NoopMetrics, NoopTracer from adapter.observability.runtime import ObservabilityRuntime @@ -80,7 +82,8 @@ class FakeCleanupExpiredSandboxes(CleanupExpiredSandboxes): class FakeDockerClient(DockerClient): - def __init__(self) -> None: + def __init__(self, base_url: str | None = None) -> None: + self.base_url = base_url self.close_calls = 0 def close(self) -> None: @@ -104,6 +107,79 @@ class FakeClock: return self._now +class RecordingMetrics: + def __init__(self) -> None: + self.increment_calls: list[tuple[str, int, Attrs | None]] = [] + self.record_calls: list[tuple[str, float, Attrs | None]] = [] + self.set_calls: list[tuple[str, int | float, Attrs | None]] = [] + + def increment( + self, + name: str, + value: int = 1, + attrs: Attrs | None = None, + ) -> None: + self.increment_calls.append((name, value, attrs)) + + def record( + self, + name: str, + value: float, + attrs: Attrs | None = None, + ) -> None: + self.record_calls.append((name, value, attrs)) + + def set( + self, + name: str, + value: int | float, + attrs: Attrs | None = None, + ) -> None: + 
self.set_calls.append((name, value, attrs)) + + +class RecordingSpan: + def __init__(self) -> None: + self.attrs: dict[str, str | int | float | bool] = {} + self.errors: list[Exception] = [] + + def set_attribute(self, name: str, value: str | int | float | bool) -> None: + self.attrs[name] = value + + def record_error(self, error: Exception) -> None: + self.errors.append(error) + + +class RecordingSpanContext: + def __init__(self, span: RecordingSpan) -> None: + self._span = span + + def __enter__(self) -> RecordingSpan: + return self._span + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + traceback: object, + ) -> bool | None: + return None + + +class RecordingTracer: + def __init__(self) -> None: + self.spans: list[tuple[str, Attrs | None, RecordingSpan]] = [] + + def start_span( + self, + name: str, + attrs: Attrs | None = None, + ) -> RecordingSpanContext: + span = RecordingSpan() + self.spans.append((name, attrs, span)) + return RecordingSpanContext(span) + + class FakeLifecycleRuntime: def __init__(self, sessions: list[SandboxSession]) -> None: self._sessions = list(sessions) @@ -142,6 +218,26 @@ class FakeLifecycleRuntime: self.stop_calls.append(container_id) +class FixedSandboxState: + def __init__(self, sessions: list[SandboxSession]) -> None: + self._sessions = list(sessions) + + def list_active_sessions(self) -> list[SandboxSession]: + return list(self._sessions) + + +class CountingRegistry: + def __init__(self, count_active_result: int) -> None: + self._count_active_result = count_active_result + self.replaced_sessions: list[SandboxSession] = [] + + def replace_all(self, sessions: list[SandboxSession]) -> None: + self.replaced_sessions = list(sessions) + + def count_active(self) -> int: + return self._count_active_result + + def build_config() -> AppConfig: return AppConfig( app=AppSectionConfig(name='master', env='test'), @@ -198,6 +294,8 @@ def build_container( state_source=EmptySandboxState(), 
registry=repositories.sandbox_session, logger=logger, + metrics=observability.metrics, + tracer=observability.tracer, ) usecases = AppUsecases( create_sandbox=create_sandbox_usecase, @@ -494,6 +592,8 @@ def test_startup_reconciliation_reuses_existing_container_after_restart( state_source=runtime, registry=repository, logger=logger, + metrics=observability.metrics, + tracer=observability.tracer, ) usecases = AppUsecases( create_sandbox=CreateSandbox( @@ -586,3 +686,76 @@ def test_removed_user_endpoint_returns_not_found(monkeypatch) -> None: assert status_code == 404 assert response == {'detail': 'Not Found'} assert docker_client.close_calls == 1 + + +def test_reconciliation_uses_registry_backed_active_count_metric() -> None: + logger = FakeLogger() + metrics = RecordingMetrics() + tracer = RecordingTracer() + created_at = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + session = SandboxSession( + session_id=SESSION_ID, + chat_id=CHAT_ID, + container_id='container-123', + status=SandboxStatus.RUNNING, + created_at=created_at, + expires_at=created_at + timedelta(minutes=5), + ) + registry = CountingRegistry(count_active_result=7) + reconciler = SandboxSessionReconciler( + state_source=FixedSandboxState([session]), + registry=registry, + logger=logger, + metrics=metrics, + tracer=tracer, + ) + + sessions = reconciler.execute() + + assert sessions == [session] + assert registry.replaced_sessions == [session] + assert metrics.set_calls == [('sandbox.active.count', 7, None)] + assert tracer.spans[0][0] == 'adapter.sandbox.reconcile_sessions' + assert tracer.spans[0][2].attrs['sandbox.active_count'] == 7 + + +def test_build_container_wires_observability_into_runtime_and_reconciler( + monkeypatch, +) -> None: + logger = FakeLogger() + metrics = RecordingMetrics() + tracer = RecordingTracer() + observability = ObservabilityRuntime( + logger=logger, + metrics=metrics, + tracer=tracer, + ) + docker_client = FakeDockerClient() + monkeypatch.setattr( + container_module, 
'build_observability', lambda config: observability + ) + monkeypatch.setattr( + container_module.docker, + 'DockerClient', + lambda base_url: docker_client, + ) + + container = container_module.build_container(config=build_config()) + + runtime = container.sandbox_reconciler.state_source + assert isinstance(runtime, DockerSandboxRuntime) + assert runtime._metrics is metrics + assert runtime._tracer is tracer + assert container.sandbox_reconciler.metrics is metrics + assert container.sandbox_reconciler.tracer is tracer + assert container.usecases.create_sandbox._runtime is runtime + assert container.usecases.create_sandbox._metrics is metrics + assert container.usecases.create_sandbox._tracer is tracer + assert container.usecases.cleanup_expired_sandboxes._runtime is runtime + assert container.usecases.cleanup_expired_sandboxes._metrics is metrics + assert container.usecases.cleanup_expired_sandboxes._tracer is tracer + assert container._docker_client is docker_client + + container.shutdown() + + assert docker_client.close_calls == 1 diff --git a/test/test_docker_runtime.py b/test/test_docker_runtime.py index 1e207f3..ee6a2a4 100644 --- a/test/test_docker_runtime.py +++ b/test/test_docker_runtime.py @@ -10,6 +10,7 @@ from docker.types import Mount from adapter.config.model import SandboxConfig from adapter.docker.runtime import DockerSandboxRuntime +from adapter.observability.noop import NoopMetrics, NoopTracer from domain.error import SandboxError, SandboxStartError from domain.sandbox import SandboxSession, SandboxStatus @@ -116,6 +117,18 @@ def build_config(tmp_path: Path) -> SandboxConfig: ) +def build_runtime( + config: SandboxConfig, + containers: FakeContainers, +) -> DockerSandboxRuntime: + return DockerSandboxRuntime( + config, + FakeDockerClient(containers), + NoopMetrics(), + NoopTracer(), + ) + + def test_runtime_create_applies_mount_policy_and_labels_with_canonical_chat_id( tmp_path: Path, ) -> None: @@ -123,7 +136,7 @@ def 
test_runtime_create_applies_mount_policy_and_labels_with_canonical_chat_id( (tmp_path / 'dependencies').mkdir() (tmp_path / 'lambda-tools').mkdir() containers = FakeContainers() - runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) + runtime = build_runtime(config, containers) created_at = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) expires_at = created_at + timedelta(minutes=5) @@ -181,7 +194,7 @@ def test_runtime_create_raises_start_error_when_container_id_is_missing( (tmp_path / 'dependencies').mkdir() (tmp_path / 'lambda-tools').mkdir() containers = FakeContainers(run_result=FakeContainer('')) - runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) + runtime = build_runtime(config, containers) with pytest.raises(SandboxStartError) as excinfo: runtime.create( @@ -199,7 +212,7 @@ def test_runtime_stop_ignores_missing_container(tmp_path: Path) -> None: config = build_config(tmp_path) containers = FakeContainers() containers.get_result = NotFound('missing') - runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) + runtime = build_runtime(config, containers) runtime.stop('container-123') @@ -210,7 +223,7 @@ def test_runtime_stop_wraps_docker_errors(tmp_path: Path) -> None: config = build_config(tmp_path) containers = FakeContainers() containers.get_result = DockerException('boom') - runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) + runtime = build_runtime(config, containers) with pytest.raises(SandboxError) as excinfo: runtime.stop('container-123') @@ -243,7 +256,7 @@ def test_runtime_list_active_sessions_reads_valid_labeled_containers( created_at='2026-04-02T12:01:00Z', ), ] - runtime = DockerSandboxRuntime(config, FakeDockerClient(containers)) + runtime = build_runtime(config, containers) sessions = runtime.list_active_sessions() From dff28efecf6cd7e8c154a5ea3d5ed911bc54bd68 Mon Sep 17 00:00:00 2001 From: Azamat Date: Fri, 3 Apr 2026 01:34:10 +0300 Subject: [PATCH 22/30] add sandbox observability 
tests --- tasks.md | 2 +- test/test_docker_runtime.py | 276 ++++++++++++++++++++++++++++ test/test_sandbox_usecase.py | 344 +++++++++++++++++++++++++++++++++++ 3 files changed, 621 insertions(+), 1 deletion(-) diff --git a/tasks.md b/tasks.md index 012111d..e01dcde 100644 --- a/tasks.md +++ b/tasks.md @@ -269,7 +269,7 @@ ### M22. Тесты на sandbox observability - Субагент: `test-engineer` -- Статус: pending +- Статус: completed - Зависимости: `M20`, `M21` - Commit required: yes - Commit message: `add sandbox observability tests` diff --git a/test/test_docker_runtime.py b/test/test_docker_runtime.py index ee6a2a4..4db1095 100644 --- a/test/test_docker_runtime.py +++ b/test/test_docker_runtime.py @@ -1,5 +1,6 @@ from datetime import UTC, datetime, timedelta from pathlib import Path +from types import TracebackType from typing import Any, TypedDict from uuid import UUID @@ -13,6 +14,7 @@ from adapter.docker.runtime import DockerSandboxRuntime from adapter.observability.noop import NoopMetrics, NoopTracer from domain.error import SandboxError, SandboxStartError from domain.sandbox import SandboxSession, SandboxStatus +from usecase.interface import AttrValue, Attrs CHAT_ID = UUID('123e4567-e89b-12d3-a456-426614174000') NON_CANONICAL_CHAT_ID = '123E4567E89B12D3A456426614174000' @@ -103,6 +105,140 @@ class FakeDockerClient(DockerClient): return self._containers +class RecordingMetrics: + def __init__(self) -> None: + self.increment_calls: list[tuple[str, int, Attrs | None]] = [] + self.record_calls: list[tuple[str, float, Attrs | None]] = [] + self.set_calls: list[tuple[str, int | float, Attrs | None]] = [] + + def increment( + self, + name: str, + value: int = 1, + attrs: Attrs | None = None, + ) -> None: + self.increment_calls.append((name, value, attrs)) + + def record( + self, + name: str, + value: float, + attrs: Attrs | None = None, + ) -> None: + self.record_calls.append((name, value, attrs)) + + def set( + self, + name: str, + value: int | float, + attrs: Attrs | 
None = None, + ) -> None: + self.set_calls.append((name, value, attrs)) + + +class RecordingSpan: + def __init__(self) -> None: + self.attrs: dict[str, AttrValue] = {} + self.errors: list[Exception] = [] + + def set_attribute(self, name: str, value: AttrValue) -> None: + self.attrs[name] = value + + def record_error(self, error: Exception) -> None: + self.errors.append(error) + + +class RecordingSpanContext: + def __init__(self, span: RecordingSpan) -> None: + self._span = span + + def __enter__(self) -> RecordingSpan: + return self._span + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + traceback: TracebackType | None, + ) -> bool | None: + return None + + +class RecordingTracer: + def __init__(self) -> None: + self.spans: list[tuple[str, Attrs | None, RecordingSpan]] = [] + + def start_span( + self, + name: str, + attrs: Attrs | None = None, + ) -> RecordingSpanContext: + span = RecordingSpan() + self.spans.append((name, attrs, span)) + return RecordingSpanContext(span) + + +def _attrs_include( + actual: Attrs | dict[str, AttrValue] | None, + expected: dict[str, AttrValue], +) -> bool: + if actual is None: + return False + + return all(actual.get(name) == value for name, value in expected.items()) + + +def _find_span( + tracer: RecordingTracer, + name: str, + attrs: dict[str, AttrValue] | None = None, + span_attrs: dict[str, AttrValue] | None = None, +) -> RecordingSpan: + for recorded_name, recorded_attrs, span in tracer.spans: + if recorded_name != name: + continue + if attrs is not None and not _attrs_include(recorded_attrs, attrs): + continue + if span_attrs is not None and not _attrs_include(span.attrs, span_attrs): + continue + return span + + raise AssertionError(f'missing span {name}') + + +def _find_increment_call( + metrics: RecordingMetrics, + name: str, + *, + value: int = 1, + attrs: dict[str, AttrValue] | None = None, +) -> tuple[str, int, Attrs | None]: + for recorded_name, recorded_value, 
recorded_attrs in metrics.increment_calls: + if recorded_name != name or recorded_value != value: + continue + if attrs is not None and not _attrs_include(recorded_attrs, attrs): + continue + return recorded_name, recorded_value, recorded_attrs + + raise AssertionError(f'missing increment metric {name}') + + +def _find_record_call( + metrics: RecordingMetrics, + name: str, + *, + attrs: dict[str, AttrValue] | None = None, +) -> tuple[str, float, Attrs | None]: + for recorded_name, recorded_value, recorded_attrs in metrics.record_calls: + if recorded_name != name: + continue + if attrs is not None and not _attrs_include(recorded_attrs, attrs): + continue + return recorded_name, recorded_value, recorded_attrs + + raise AssertionError(f'missing record metric {name}') + + def build_config(tmp_path: Path) -> SandboxConfig: return SandboxConfig( image='sandbox:latest', @@ -187,6 +323,48 @@ def test_runtime_create_applies_mount_policy_and_labels_with_canonical_chat_id( ] +def test_runtime_create_records_observability(tmp_path: Path) -> None: + config = build_config(tmp_path) + (tmp_path / 'dependencies').mkdir() + (tmp_path / 'lambda-tools').mkdir() + containers = FakeContainers() + metrics = RecordingMetrics() + tracer = RecordingTracer() + runtime = DockerSandboxRuntime( + config, + FakeDockerClient(containers), + metrics, + tracer, + ) + created_at = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + expires_at = created_at + timedelta(minutes=5) + + session = runtime.create( + session_id=SESSION_ID, + chat_id=CHAT_ID, + created_at=created_at, + expires_at=expires_at, + ) + + assert session.container_id == 'container-123' + duration_call = _find_record_call( + metrics, + 'sandbox.runtime.create.duration_ms', + attrs={'operation': 'create', 'result': 'created'}, + ) + assert duration_call[1] >= 0 + span = _find_span( + tracer, + 'adapter.docker.create_sandbox', + {'chat.id': str(CHAT_ID), 'session.id': str(SESSION_ID)}, + { + 'container.id': 'container-123', + 'sandbox.result': 
'created', + }, + ) + assert not span.errors + + def test_runtime_create_raises_start_error_when_container_id_is_missing( tmp_path: Path, ) -> None: @@ -208,6 +386,51 @@ def test_runtime_create_raises_start_error_when_container_id_is_missing( assert excinfo.value.chat_id == str(CHAT_ID) +def test_runtime_create_error_records_observability_when_container_id_missing( + tmp_path: Path, +) -> None: + config = build_config(tmp_path) + (tmp_path / 'dependencies').mkdir() + (tmp_path / 'lambda-tools').mkdir() + containers = FakeContainers(run_result=FakeContainer('')) + metrics = RecordingMetrics() + tracer = RecordingTracer() + runtime = DockerSandboxRuntime( + config, + FakeDockerClient(containers), + metrics, + tracer, + ) + + with pytest.raises(SandboxStartError) as excinfo: + runtime.create( + session_id=SESSION_ID, + chat_id=CHAT_ID, + created_at=datetime(2026, 4, 2, 12, 0, tzinfo=UTC), + expires_at=datetime(2026, 4, 2, 12, 5, tzinfo=UTC), + ) + + assert str(excinfo.value) == 'sandbox_start_failed' + _find_increment_call( + metrics, + 'sandbox.runtime.error.total', + attrs={'operation': 'create', 'error.type': 'SandboxStartError'}, + ) + duration_call = _find_record_call( + metrics, + 'sandbox.runtime.create.duration_ms', + attrs={'operation': 'create', 'result': 'error'}, + ) + assert duration_call[1] >= 0 + span = _find_span( + tracer, + 'adapter.docker.create_sandbox', + {'chat.id': str(CHAT_ID), 'session.id': str(SESSION_ID)}, + {'sandbox.result': 'error'}, + ) + assert excinfo.value in span.errors + + def test_runtime_stop_ignores_missing_container(tmp_path: Path) -> None: config = build_config(tmp_path) containers = FakeContainers() @@ -273,3 +496,56 @@ def test_runtime_list_active_sessions_reads_valid_labeled_containers( assert containers.list_calls == [ {'filters': {'label': ['session_id', 'chat_id', 'expires_at']}} ] + + +def test_runtime_list_active_records_observability(tmp_path: Path) -> None: + config = build_config(tmp_path) + containers = 
FakeContainers() + expires_at = datetime(2026, 4, 2, 12, 5, tzinfo=UTC) + containers.list_result = [ + FakeListedContainer( + 'container-123', + labels={ + 'session_id': str(SESSION_ID), + 'chat_id': str(CHAT_ID), + 'expires_at': expires_at.isoformat(), + }, + created_at='2026-04-02T12:00:00Z', + ), + FakeListedContainer( + 'container-bad', + labels={ + 'chat_id': str(CHAT_ID), + 'expires_at': expires_at.isoformat(), + }, + created_at='2026-04-02T12:01:00Z', + ), + ] + metrics = RecordingMetrics() + tracer = RecordingTracer() + runtime = DockerSandboxRuntime( + config, + FakeDockerClient(containers), + metrics, + tracer, + ) + + sessions = runtime.list_active_sessions() + + assert len(sessions) == 1 + duration_call = _find_record_call( + metrics, + 'sandbox.runtime.list_active.duration_ms', + attrs={'operation': 'list_active', 'result': 'listed'}, + ) + assert duration_call[1] >= 0 + span = _find_span( + tracer, + 'adapter.docker.list_active_sandboxes', + span_attrs={ + 'sandbox.container_count': 2, + 'sandbox.active_count': 1, + 'sandbox.result': 'listed', + }, + ) + assert not span.errors diff --git a/test/test_sandbox_usecase.py b/test/test_sandbox_usecase.py index f744492..403eb0f 100644 --- a/test/test_sandbox_usecase.py +++ b/test/test_sandbox_usecase.py @@ -1,11 +1,15 @@ import threading from datetime import UTC, datetime, timedelta +from types import TracebackType from uuid import UUID +import pytest + from adapter.observability.noop import NoopMetrics, NoopTracer from domain.sandbox import SandboxSession, SandboxStatus from repository.sandbox_lock import ProcessLocalSandboxLifecycleLocker from repository.sandbox_session import InMemorySandboxSessionRepository +from usecase.interface import AttrValue, Attrs from usecase.sandbox import CleanupExpiredSandboxes, CreateSandbox, CreateSandboxCommand CHAT_ID = UUID('11111111-1111-1111-1111-111111111111') @@ -53,6 +57,130 @@ class FakeLogger: self.messages.append(('error', message, attrs)) +class RecordingMetrics: 
+ def __init__(self) -> None: + self.increment_calls: list[tuple[str, int, Attrs | None]] = [] + self.record_calls: list[tuple[str, float, Attrs | None]] = [] + self.set_calls: list[tuple[str, int | float, Attrs | None]] = [] + + def increment( + self, + name: str, + value: int = 1, + attrs: Attrs | None = None, + ) -> None: + self.increment_calls.append((name, value, attrs)) + + def record( + self, + name: str, + value: float, + attrs: Attrs | None = None, + ) -> None: + self.record_calls.append((name, value, attrs)) + + def set( + self, + name: str, + value: int | float, + attrs: Attrs | None = None, + ) -> None: + self.set_calls.append((name, value, attrs)) + + +class RecordingSpan: + def __init__(self) -> None: + self.attrs: dict[str, AttrValue] = {} + self.errors: list[Exception] = [] + + def set_attribute(self, name: str, value: AttrValue) -> None: + self.attrs[name] = value + + def record_error(self, error: Exception) -> None: + self.errors.append(error) + + +class RecordingSpanContext: + def __init__(self, span: RecordingSpan) -> None: + self._span = span + + def __enter__(self) -> RecordingSpan: + return self._span + + def __exit__( + self, + exc_type: type[BaseException] | None, + exc: BaseException | None, + traceback: TracebackType | None, + ) -> bool | None: + return None + + +class RecordingTracer: + def __init__(self) -> None: + self.spans: list[tuple[str, Attrs | None, RecordingSpan]] = [] + + def start_span( + self, + name: str, + attrs: Attrs | None = None, + ) -> RecordingSpanContext: + span = RecordingSpan() + self.spans.append((name, attrs, span)) + return RecordingSpanContext(span) + + +def _attrs_include( + actual: Attrs | dict[str, AttrValue] | None, + expected: dict[str, AttrValue], +) -> bool: + if actual is None: + return False + + return all(actual.get(name) == value for name, value in expected.items()) + + +def _find_span( + tracer: RecordingTracer, + name: str, + attrs: dict[str, AttrValue] | None = None, + span_attrs: dict[str, 
AttrValue] | None = None, +) -> RecordingSpan: + for recorded_name, recorded_attrs, span in tracer.spans: + if recorded_name != name: + continue + if attrs is not None and not _attrs_include(recorded_attrs, attrs): + continue + if span_attrs is not None and not _attrs_include(span.attrs, span_attrs): + continue + return span + + raise AssertionError(f'missing span {name}') + + +def _assert_increment_metric_present( + metrics: RecordingMetrics, + name: str, + *, + value: int = 1, + attrs: dict[str, AttrValue] | None = None, +) -> None: + for recorded_name, recorded_value, recorded_attrs in metrics.increment_calls: + if recorded_name != name or recorded_value != value: + continue + if attrs is not None and not _attrs_include(recorded_attrs, attrs): + continue + return + + raise AssertionError(f'missing increment metric {name}') + + +def _active_count_values(metrics: RecordingMetrics) -> list[int | float]: + return [ + value for name, value, _ in metrics.set_calls if name == 'sandbox.active.count' + ] + + class FakeLockContext: def __enter__(self) -> None: return None @@ -198,6 +326,30 @@ class FailingStopRuntime(FakeRuntime): raise RuntimeError('stop_failed') +class FailingCreateRuntime(FakeRuntime): + def __init__(self, error: Exception) -> None: + super().__init__() + self._error = error + + def create( + self, + *, + session_id: UUID, + chat_id: UUID, + created_at: datetime, + expires_at: datetime, + ) -> SandboxSession: + self.create_calls.append( + { + 'session_id': session_id, + 'chat_id': chat_id, + 'created_at': created_at, + 'expires_at': expires_at, + } + ) + raise self._error + + def test_create_sandbox_reuses_active_session_when_not_expired() -> None: now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) session = SandboxSession( @@ -244,6 +396,104 @@ def test_create_sandbox_reuses_active_session_when_not_expired() -> None: ] +def test_create_sandbox_reuse_records_observability() -> None: + now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + session = SandboxSession( 
+ session_id=SESSION_REUSED_ID, + chat_id=CHAT_ID, + container_id='container-1', + status=SandboxStatus.RUNNING, + created_at=now - timedelta(minutes=1), + expires_at=now + timedelta(minutes=4), + ) + repository = InMemorySandboxSessionRepository() + repository.save(session) + metrics = RecordingMetrics() + tracer = RecordingTracer() + usecase = CreateSandbox( + repository=repository, + locker=FakeLocker(), + runtime=FakeRuntime(), + clock=FakeClock(now), + logger=FakeLogger(), + metrics=metrics, + tracer=tracer, + ttl=timedelta(minutes=5), + ) + + result = usecase.execute(CreateSandboxCommand(chat_id=CHAT_ID)) + + assert result == session + _assert_increment_metric_present( + metrics, + 'sandbox.create.total', + attrs={'result': 'reused'}, + ) + span = _find_span( + tracer, + 'usecase.create_sandbox', + {'chat.id': str(CHAT_ID)}, + { + 'session.id': str(SESSION_REUSED_ID), + 'container.id': 'container-1', + 'sandbox.result': 'reused', + }, + ) + assert not span.errors + + +def test_create_sandbox_replace_records_observability_and_final_active_count( + monkeypatch, +) -> None: + now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + expired_session = SandboxSession( + session_id=SESSION_OLD_ID, + chat_id=CHAT_ID, + container_id='container-old', + status=SandboxStatus.RUNNING, + created_at=now - timedelta(minutes=10), + expires_at=now, + ) + repository = InMemorySandboxSessionRepository() + repository.save(expired_session) + metrics = RecordingMetrics() + tracer = RecordingTracer() + usecase = CreateSandbox( + repository=repository, + locker=FakeLocker(), + runtime=FakeRuntime(), + clock=FakeClock(now), + logger=FakeLogger(), + metrics=metrics, + tracer=tracer, + ttl=timedelta(minutes=5), + ) + monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID) + + result = usecase.execute(CreateSandboxCommand(chat_id=CHAT_ID)) + + assert result.session_id == SESSION_NEW_ID + assert repository.count_active() == 1 + _assert_increment_metric_present( + metrics, + 
'sandbox.create.total', + attrs={'result': 'replaced'}, + ) + assert _active_count_values(metrics) + assert _active_count_values(metrics)[-1] == 1 + span = _find_span( + tracer, + 'usecase.create_sandbox', + {'chat.id': str(CHAT_ID)}, + { + 'session.id': str(SESSION_NEW_ID), + 'container.id': f'container-{SESSION_NEW_ID}', + 'sandbox.result': 'replaced', + }, + ) + assert not span.errors + + def test_create_sandbox_replaces_expired_session_and_creates_new_one( monkeypatch, ) -> None: @@ -363,6 +613,42 @@ def test_create_sandbox_creates_new_session_when_none_exists() -> None: ] +def test_create_sandbox_error_records_observability(monkeypatch) -> None: + now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + metrics = RecordingMetrics() + tracer = RecordingTracer() + usecase = CreateSandbox( + repository=InMemorySandboxSessionRepository(), + locker=FakeLocker(), + runtime=FailingCreateRuntime(RuntimeError('create_failed')), + clock=FakeClock(now), + logger=FakeLogger(), + metrics=metrics, + tracer=tracer, + ttl=timedelta(minutes=5), + ) + monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID) + + with pytest.raises(RuntimeError, match='create_failed') as excinfo: + usecase.execute(CreateSandboxCommand(chat_id=CHAT_ID)) + + _assert_increment_metric_present( + metrics, + 'sandbox.create.total', + attrs={'result': 'error'}, + ) + span = _find_span( + tracer, + 'usecase.create_sandbox', + {'chat.id': str(CHAT_ID)}, + { + 'session.id': str(SESSION_NEW_ID), + 'sandbox.result': 'error', + }, + ) + assert excinfo.value in span.errors + + def test_create_sandbox_serializes_duplicate_concurrent_create_for_chat_id( monkeypatch, ) -> None: @@ -516,6 +802,64 @@ def test_cleanup_expired_sandboxes_stops_and_deletes_only_expired_sessions() -> ] +def test_cleanup_expired_sandboxes_records_observability_on_cleaned_session() -> None: + now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + expired_session = SandboxSession( + session_id=SESSION_EXPIRED_ID, + 
chat_id=EXPIRED_CHAT_ID, + container_id='container-expired', + status=SandboxStatus.RUNNING, + created_at=now - timedelta(minutes=10), + expires_at=now - timedelta(seconds=1), + ) + repository = InMemorySandboxSessionRepository() + repository.save(expired_session) + metrics = RecordingMetrics() + tracer = RecordingTracer() + usecase = CleanupExpiredSandboxes( + repository=repository, + locker=FakeLocker(), + runtime=FakeRuntime(), + clock=FakeClock(now), + logger=FakeLogger(), + metrics=metrics, + tracer=tracer, + ) + + result = usecase.execute() + + assert result == [expired_session] + _assert_increment_metric_present( + metrics, + 'sandbox.cleanup.total', + attrs={'result': 'cleaned'}, + ) + assert _active_count_values(metrics) + assert _active_count_values(metrics)[-1] == 0 + root_span = _find_span( + tracer, + 'usecase.cleanup_expired_sandboxes', + span_attrs={ + 'sandbox.expired_count': 1, + 'sandbox.cleaned_count': 1, + 'sandbox.error_count': 0, + 'sandbox.result': 'completed', + }, + ) + assert not root_span.errors + cleanup_span = _find_span( + tracer, + 'usecase.cleanup_expired_sandbox', + { + 'chat.id': str(EXPIRED_CHAT_ID), + 'session.id': str(SESSION_EXPIRED_ID), + 'container.id': 'container-expired', + }, + {'sandbox.result': 'cleaned'}, + ) + assert not cleanup_span.errors + + def test_cleanup_expired_sandboxes_skips_replaced_session_from_stale_snapshot() -> None: now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) expired_snapshot = SandboxSession( From 02770bce7d0bbfa8a32b6c1efb0095f964f18a32 Mon Sep 17 00:00:00 2001 From: Azamat Date: Fri, 3 Apr 2026 01:55:12 +0300 Subject: [PATCH 23/30] fix sandbox replace trace identity --- tasks.md | 38 ++++++++++++++++++++++- test/test_sandbox_usecase.py | 59 +++++++++++++++++++++++++++++++++++- usecase/sandbox.py | 30 +++++++++++++++--- 3 files changed, 120 insertions(+), 7 deletions(-) diff --git a/tasks.md b/tasks.md index e01dcde..4b9e7e5 100644 --- a/tasks.md +++ b/tasks.md @@ -281,9 +281,45 @@ ### M23. 
Boundary review для sandbox observability - Субагент: `code-reviewer` -- Статус: pending +- Статус: in_progress - Зависимости: `M22` - Commit required: no - Scope: проверить, что observability изменения закрывают issue #11 и FR-034 без нарушения clean architecture - Файлы: весь измененный код после `M19`-`M22` - Критерии приемки: inner layers не импортируют OTel; Docker-specific tracing остается в `adapter/docker/`; current-state и duration metrics достаточно покрывают sandbox lifecycle; замечания сведены к minor или отсутствуют + +## Follow-up после M23 boundary review + +### M24. Исправить replace trace identity в CreateSandbox + +- Субагент: `feature-developer` +- Статус: completed +- Зависимости: `M23` +- Commit required: yes +- Commit message: `fix sandbox replace trace identity` +- Scope: устранить смешение old/new sandbox identifiers в replace path usecase tracing +- Файлы: `usecase/sandbox.py`, при необходимости точечные тесты в `test/*` +- Решение: сохранять старые и новые sandbox identifiers в отдельных span attrs или child spans так, чтобы replace success и replace failure оставались однозначно трассируемыми +- Критерии приемки: replace path не перетирает previous/new identifiers; при replace failure span остается консистентным и отражает обе стороны lifecycle + +### M25. 
Добрать failure-path observability regression tests + +- Субагент: `test-engineer` +- Статус: pending +- Зависимости: `M24` +- Commit required: yes +- Commit message: `add sandbox observability failure tests` +- Scope: покрыть tests для replace-failure trace, cleanup error metrics/spans и Docker stop observability +- Файлы: `test/test_sandbox_usecase.py`, `test/test_docker_runtime.py`, при необходимости другие focused tests в `test/*` +- Решение: использовать presence-based assertions и проверять ключевые span/metric contracts без brittle exact-order checks +- Критерии приемки: есть тест на replace failure tracing; есть тест на `sandbox.cleanup.error.total`; есть тесты на Docker stop observability для success/error/not_found или эквивалентного набора outcome paths + +### M26. Повторный boundary review для sandbox observability + +- Субагент: `code-reviewer` +- Статус: pending +- Зависимости: `M25` +- Commit required: no +- Scope: подтвердить, что follow-up fixes закрыли M23 замечания без новых boundary нарушений +- Файлы: весь измененный код после `M24`-`M25` +- Критерии приемки: нет замечаний по replace tracing identity и missing failure-path observability coverage; clean architecture по-прежнему соблюдена diff --git a/test/test_sandbox_usecase.py b/test/test_sandbox_usecase.py index 403eb0f..92c7937 100644 --- a/test/test_sandbox_usecase.py +++ b/test/test_sandbox_usecase.py @@ -9,7 +9,7 @@ from adapter.observability.noop import NoopMetrics, NoopTracer from domain.sandbox import SandboxSession, SandboxStatus from repository.sandbox_lock import ProcessLocalSandboxLifecycleLocker from repository.sandbox_session import InMemorySandboxSessionRepository -from usecase.interface import AttrValue, Attrs +from usecase.interface import Attrs, AttrValue from usecase.sandbox import CleanupExpiredSandboxes, CreateSandbox, CreateSandboxCommand CHAT_ID = UUID('11111111-1111-1111-1111-111111111111') @@ -486,6 +486,10 @@ def 
test_create_sandbox_replace_records_observability_and_final_active_count( 'usecase.create_sandbox', {'chat.id': str(CHAT_ID)}, { + 'sandbox.previous_session.id': str(SESSION_OLD_ID), + 'sandbox.previous_container.id': 'container-old', + 'sandbox.new_session.id': str(SESSION_NEW_ID), + 'sandbox.new_container.id': f'container-{SESSION_NEW_ID}', 'session.id': str(SESSION_NEW_ID), 'container.id': f'container-{SESSION_NEW_ID}', 'sandbox.result': 'replaced', @@ -649,6 +653,59 @@ def test_create_sandbox_error_records_observability(monkeypatch) -> None: assert excinfo.value in span.errors +def test_create_sandbox_replace_stop_failure_preserves_separate_identities( + monkeypatch, +) -> None: + now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + expired_session = SandboxSession( + session_id=SESSION_OLD_ID, + chat_id=CHAT_ID, + container_id='container-old', + status=SandboxStatus.RUNNING, + created_at=now - timedelta(minutes=10), + expires_at=now, + ) + repository = InMemorySandboxSessionRepository() + repository.save(expired_session) + metrics = RecordingMetrics() + tracer = RecordingTracer() + usecase = CreateSandbox( + repository=repository, + locker=FakeLocker(), + runtime=FailingStopRuntime('container-old'), + clock=FakeClock(now), + logger=FakeLogger(), + metrics=metrics, + tracer=tracer, + ttl=timedelta(minutes=5), + ) + monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID) + + with pytest.raises(RuntimeError, match='stop_failed') as excinfo: + usecase.execute(CreateSandboxCommand(chat_id=CHAT_ID)) + + _assert_increment_metric_present( + metrics, + 'sandbox.create.total', + attrs={'result': 'error'}, + ) + span = _find_span( + tracer, + 'usecase.create_sandbox', + {'chat.id': str(CHAT_ID)}, + { + 'sandbox.previous_session.id': str(SESSION_OLD_ID), + 'sandbox.previous_container.id': 'container-old', + 'sandbox.new_session.id': str(SESSION_NEW_ID), + 'sandbox.result': 'error', + }, + ) + assert 'sandbox.new_container.id' not in span.attrs + assert 
'session.id' not in span.attrs + assert 'container.id' not in span.attrs + assert excinfo.value in span.errors + + def test_create_sandbox_serializes_duplicate_concurrent_create_for_chat_id( monkeypatch, ) -> None: diff --git a/usecase/sandbox.py b/usecase/sandbox.py index 2bdb369..0a3412f 100644 --- a/usecase/sandbox.py +++ b/usecase/sandbox.py @@ -67,10 +67,22 @@ class CreateSandbox: return session result = 'created' + new_session_id: UUID | None = None if session is not None: result = 'replaced' - span.set_attribute('session.id', str(session.session_id)) - span.set_attribute('container.id', session.container_id) + new_session_id = _new_session_id() + span.set_attribute( + 'sandbox.previous_session.id', + str(session.session_id), + ) + span.set_attribute( + 'sandbox.previous_container.id', + session.container_id, + ) + span.set_attribute( + 'sandbox.new_session.id', + str(new_session_id), + ) self._logger.info( 'sandbox_replaced', attrs=_sandbox_attrs(session), @@ -81,16 +93,24 @@ class CreateSandbox: created_at = self._clock.now() expires_at = created_at + self._ttl - session_id = _new_session_id() - span.set_attribute('session.id', str(session_id)) + if new_session_id is None: + new_session_id = _new_session_id() + span.set_attribute('session.id', str(new_session_id)) new_session = self._runtime.create( - session_id=session_id, + session_id=new_session_id, chat_id=chat_id, created_at=created_at, expires_at=expires_at, ) + if result == 'replaced': + span.set_attribute( + 'sandbox.new_container.id', + new_session.container_id, + ) self._repository.save(new_session) _set_active_count(self._metrics, self._repository) + if result == 'replaced': + span.set_attribute('session.id', str(new_session.session_id)) span.set_attribute('container.id', new_session.container_id) span.set_attribute('sandbox.result', result) self._metrics.increment( From b4a2a9ceea1e1fa32e24b93cf65e6d892b866587 Mon Sep 17 00:00:00 2001 From: Azamat Date: Fri, 3 Apr 2026 02:04:51 +0300 Subject: 
[PATCH 24/30] add sandbox observability failure tests --- tasks.md | 2 +- test/test_docker_runtime.py | 147 ++++++++++++++++++++++++++++++++++- test/test_sandbox_usecase.py | 125 ++++++++++++++++++++++++++++- 3 files changed, 268 insertions(+), 6 deletions(-) diff --git a/tasks.md b/tasks.md index 4b9e7e5..e713a66 100644 --- a/tasks.md +++ b/tasks.md @@ -305,7 +305,7 @@ ### M25. Добрать failure-path observability regression tests - Субагент: `test-engineer` -- Статус: pending +- Статус: completed - Зависимости: `M24` - Commit required: yes - Commit message: `add sandbox observability failure tests` diff --git a/test/test_docker_runtime.py b/test/test_docker_runtime.py index 4db1095..352adad 100644 --- a/test/test_docker_runtime.py +++ b/test/test_docker_runtime.py @@ -43,6 +43,27 @@ class FakeListedContainer(FakeContainer): self.attrs = {'Created': created_at} +class FailingStopContainer(FakeListedContainer): + def __init__( + self, + container_id: str, + *, + labels: dict[str, str], + created_at: str, + error: Exception, + ) -> None: + super().__init__( + container_id, + labels=labels, + created_at=created_at, + ) + self._error = error + + def stop(self) -> None: + self.stop_calls += 1 + raise self._error + + class RunKwargs(TypedDict): detach: bool labels: dict[str, str] @@ -435,23 +456,143 @@ def test_runtime_stop_ignores_missing_container(tmp_path: Path) -> None: config = build_config(tmp_path) containers = FakeContainers() containers.get_result = NotFound('missing') - runtime = build_runtime(config, containers) + metrics = RecordingMetrics() + tracer = RecordingTracer() + runtime = DockerSandboxRuntime( + config, + FakeDockerClient(containers), + metrics, + tracer, + ) runtime.stop('container-123') assert containers.get_calls == ['container-123'] + duration_call = _find_record_call( + metrics, + 'sandbox.runtime.stop.duration_ms', + attrs={'operation': 'stop', 'result': 'not_found'}, + ) + assert duration_call[1] >= 0 + span = _find_span( + tracer, + 
'adapter.docker.stop_sandbox', + {'container.id': 'container-123'}, + {'sandbox.result': 'not_found'}, + ) + assert not span.errors + stop_error_calls = [ + call + for call in metrics.increment_calls + if call[0] == 'sandbox.runtime.error.total' + and call[2] is not None + and call[2].get('operation') == 'stop' + ] + assert stop_error_calls == [] def test_runtime_stop_wraps_docker_errors(tmp_path: Path) -> None: config = build_config(tmp_path) containers = FakeContainers() - containers.get_result = DockerException('boom') - runtime = build_runtime(config, containers) + containers.get_result = FailingStopContainer( + 'container-123', + labels={ + 'session_id': str(SESSION_ID), + 'chat_id': str(CHAT_ID), + 'expires_at': '2026-04-02T12:05:00+00:00', + }, + created_at='2026-04-02T12:00:00Z', + error=DockerException('boom'), + ) + metrics = RecordingMetrics() + tracer = RecordingTracer() + runtime = DockerSandboxRuntime( + config, + FakeDockerClient(containers), + metrics, + tracer, + ) with pytest.raises(SandboxError) as excinfo: runtime.stop('container-123') assert str(excinfo.value) == 'sandbox_stop_failed' + _find_increment_call( + metrics, + 'sandbox.runtime.error.total', + attrs={'operation': 'stop', 'error.type': 'DockerException'}, + ) + duration_call = _find_record_call( + metrics, + 'sandbox.runtime.stop.duration_ms', + attrs={'operation': 'stop', 'result': 'error'}, + ) + assert duration_call[1] >= 0 + span = _find_span( + tracer, + 'adapter.docker.stop_sandbox', + {'container.id': 'container-123'}, + { + 'session.id': str(SESSION_ID), + 'chat.id': str(CHAT_ID), + 'sandbox.result': 'error', + }, + ) + cause = excinfo.value.__cause__ + assert isinstance(cause, DockerException) + assert cause in span.errors + + +def test_runtime_stop_records_observability_on_success(tmp_path: Path) -> None: + config = build_config(tmp_path) + containers = FakeContainers() + container = FakeListedContainer( + 'container-123', + labels={ + 'session_id': str(SESSION_ID), + 
'chat_id': str(CHAT_ID), + 'expires_at': '2026-04-02T12:05:00+00:00', + }, + created_at='2026-04-02T12:00:00Z', + ) + containers.get_result = container + metrics = RecordingMetrics() + tracer = RecordingTracer() + runtime = DockerSandboxRuntime( + config, + FakeDockerClient(containers), + metrics, + tracer, + ) + + runtime.stop('container-123') + + assert container.stop_calls == 1 + duration_call = _find_record_call( + metrics, + 'sandbox.runtime.stop.duration_ms', + attrs={'operation': 'stop', 'result': 'stopped'}, + ) + assert duration_call[1] >= 0 + span = _find_span( + tracer, + 'adapter.docker.stop_sandbox', + {'container.id': 'container-123'}, + { + 'session.id': str(SESSION_ID), + 'chat.id': str(CHAT_ID), + 'sandbox.result': 'stopped', + }, + ) + assert not span.errors + stop_error_calls = [ + call + for call in metrics.increment_calls + if call[0] == 'sandbox.runtime.error.total' + and call[2] is not None + and call[2].get('operation') == 'stop' + ] + assert stop_error_calls == [] def test_runtime_list_active_sessions_reads_valid_labeled_containers( diff --git a/test/test_sandbox_usecase.py b/test/test_sandbox_usecase.py index 92c7937..068204c 100644 --- a/test/test_sandbox_usecase.py +++ b/test/test_sandbox_usecase.py @@ -281,6 +281,22 @@ class StaleSnapshotRepository(InMemorySandboxSessionRepository): return [self._snapshot] +class FailingSaveRepository(InMemorySandboxSessionRepository): + def __init__(self, error: Exception) -> None: + super().__init__() + self._error = error + self._fail_next_save = False + + def fail_next_save(self) -> None: + self._fail_next_save = True + + def save(self, session: SandboxSession) -> None: + if self._fail_next_save: + self._fail_next_save = False + raise self._error + super().save(session) + + class FakeRuntime: def __init__(self) -> None: self.create_calls: list[dict[str, object]] = [] @@ -706,6 +722,64 @@ def test_create_sandbox_replace_stop_failure_preserves_separate_identities( assert excinfo.value in span.errors 
+def test_create_sandbox_replace_save_failure_records_stage_safe_trace_ids( + monkeypatch, +) -> None: + now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + expired_session = SandboxSession( + session_id=SESSION_OLD_ID, + chat_id=CHAT_ID, + container_id='container-old', + status=SandboxStatus.RUNNING, + created_at=now - timedelta(minutes=10), + expires_at=now, + ) + repository = FailingSaveRepository(RuntimeError('save_failed')) + repository.save(expired_session) + repository.fail_next_save() + metrics = RecordingMetrics() + tracer = RecordingTracer() + runtime = FakeRuntime() + usecase = CreateSandbox( + repository=repository, + locker=FakeLocker(), + runtime=runtime, + clock=FakeClock(now), + logger=FakeLogger(), + metrics=metrics, + tracer=tracer, + ttl=timedelta(minutes=5), + ) + monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID) + + with pytest.raises(RuntimeError, match='save_failed') as excinfo: + usecase.execute(CreateSandboxCommand(chat_id=CHAT_ID)) + + assert runtime.stop_calls == ['container-old'] + assert len(runtime.create_calls) == 1 + assert repository.get_active_by_chat_id(CHAT_ID) is None + _assert_increment_metric_present( + metrics, + 'sandbox.create.total', + attrs={'result': 'error'}, + ) + span = _find_span( + tracer, + 'usecase.create_sandbox', + {'chat.id': str(CHAT_ID)}, + { + 'sandbox.previous_session.id': str(SESSION_OLD_ID), + 'sandbox.previous_container.id': 'container-old', + 'sandbox.new_session.id': str(SESSION_NEW_ID), + 'sandbox.new_container.id': f'container-{SESSION_NEW_ID}', + 'sandbox.result': 'error', + }, + ) + assert 'session.id' not in span.attrs + assert 'container.id' not in span.attrs + assert excinfo.value in span.errors + + def test_create_sandbox_serializes_duplicate_concurrent_create_for_chat_id( monkeypatch, ) -> None: @@ -982,6 +1056,8 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None: repository.save(cleaned_session) runtime = FailingStopRuntime('container-fail') 
logger = FakeLogger() + metrics = RecordingMetrics() + tracer = RecordingTracer() locker = FakeLocker() usecase = CleanupExpiredSandboxes( repository=repository, @@ -989,8 +1065,8 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None: runtime=runtime, clock=FakeClock(now), logger=logger, - metrics=NoopMetrics(), - tracer=NoopTracer(), + metrics=metrics, + tracer=tracer, ) result = usecase.execute() @@ -1021,3 +1097,48 @@ def test_cleanup_expired_sandboxes_continues_after_stop_failure() -> None: }, ), ] + _assert_increment_metric_present( + metrics, + 'sandbox.cleanup.error.total', + attrs={'error.type': 'RuntimeError'}, + ) + _assert_increment_metric_present( + metrics, + 'sandbox.cleanup.total', + attrs={'result': 'cleaned'}, + ) + assert _active_count_values(metrics) + assert _active_count_values(metrics)[-1] == 1 + root_span = _find_span( + tracer, + 'usecase.cleanup_expired_sandboxes', + span_attrs={ + 'sandbox.expired_count': 2, + 'sandbox.cleaned_count': 1, + 'sandbox.error_count': 1, + 'sandbox.result': 'completed_with_errors', + }, + ) + assert not root_span.errors + failed_span = _find_span( + tracer, + 'usecase.cleanup_expired_sandbox', + { + 'chat.id': str(FAIL_CHAT_ID), + 'session.id': str(SESSION_FAIL_ID), + 'container.id': 'container-fail', + }, + {'sandbox.result': 'error'}, + ) + assert [str(error) for error in failed_span.errors] == ['stop_failed'] + cleaned_span = _find_span( + tracer, + 'usecase.cleanup_expired_sandbox', + { + 'chat.id': str(CLEAN_CHAT_ID), + 'session.id': str(SESSION_CLEAN_ID), + 'container.id': 'container-clean', + }, + {'sandbox.result': 'cleaned'}, + ) + assert not cleaned_span.errors From 9b6c7908adea4321a199dd2a932662120aedb0e4 Mon Sep 17 00:00:00 2001 From: Azamat Date: Fri, 3 Apr 2026 02:18:54 +0300 Subject: [PATCH 25/30] fix sandbox create rollback gap --- tasks.md | 38 +++++++++++++++++++++++++++++++++++- test/test_sandbox_usecase.py | 37 ++++++++++++++++++++++++++++++++++- usecase/sandbox.py | 22 
++++++++++++++++++++- 3 files changed, 94 insertions(+), 3 deletions(-) diff --git a/tasks.md b/tasks.md index e713a66..77dd415 100644 --- a/tasks.md +++ b/tasks.md @@ -317,9 +317,45 @@ ### M26. Повторный boundary review для sandbox observability - Субагент: `code-reviewer` -- Статус: pending +- Статус: in_progress - Зависимости: `M25` - Commit required: no - Scope: подтвердить, что follow-up fixes закрыли M23 замечания без новых boundary нарушений - Файлы: весь измененный код после `M24`-`M25` - Критерии приемки: нет замечаний по replace tracing identity и missing failure-path observability coverage; clean architecture по-прежнему соблюдена + +## Follow-up после M26 boundary review + +### M27. Компенсация save failure после runtime.create + +- Субагент: `feature-developer` +- Статус: completed +- Зависимости: `M26` +- Commit required: yes +- Commit message: `fix sandbox create rollback gap` +- Scope: не оставлять untracked running container и неконсистентный `sandbox.active.count` при падении `repository.save()` после успешного `runtime.create()` +- Файлы: `usecase/sandbox.py`, при необходимости точечные тесты в `test/*` +- Решение: сделать create/replace path registry-safe через rollback или другой явный compensation path без нарушения clean architecture +- Критерии приемки: save failure не оставляет новый container в runtime без registry state; `sandbox.active.count` отражает финальное committed state; replace и fresh-create failure paths консистентны + +### M28. 
Регрессии на rollback и startup failure observability + +- Субагент: `test-engineer` +- Статус: pending +- Зависимости: `M27` +- Commit required: yes +- Commit message: `add sandbox rollback regression tests` +- Scope: покрыть tests для save-failure rollback и startup observability failure paths +- Файлы: `test/test_sandbox_usecase.py`, `test/test_docker_runtime.py`, `test/test_create_http.py`, при необходимости другие focused tests в `test/*` +- Решение: добавить tests на fresh-create/replace save failure compensation, `list_active` failure observability и reconciliation failure span/metric expectations где применимо +- Критерии приемки: rollback path покрыт; list/reconciliation failure observability не регрессирует; tests остаются presence-based и стабильными + +### M29. Финальный boundary review для sandbox observability + +- Субагент: `code-reviewer` +- Статус: pending +- Зависимости: `M28` +- Commit required: no +- Scope: подтвердить, что M27-M28 закрыли remaining M26 замечания +- Файлы: весь измененный код после `M27`-`M28` +- Критерии приемки: нет замечаний по rollback gap и startup failure observability coverage; sandbox observability slice приемлем as-is diff --git a/test/test_sandbox_usecase.py b/test/test_sandbox_usecase.py index 068204c..b2e3dcb 100644 --- a/test/test_sandbox_usecase.py +++ b/test/test_sandbox_usecase.py @@ -669,6 +669,39 @@ def test_create_sandbox_error_records_observability(monkeypatch) -> None: assert excinfo.value in span.errors +def test_create_sandbox_save_failure_stops_untracked_container(monkeypatch) -> None: + now = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + repository = FailingSaveRepository(RuntimeError('save_failed')) + repository.fail_next_save() + metrics = RecordingMetrics() + runtime = FakeRuntime() + usecase = CreateSandbox( + repository=repository, + locker=FakeLocker(), + runtime=runtime, + clock=FakeClock(now), + logger=FakeLogger(), + metrics=metrics, + tracer=NoopTracer(), + ttl=timedelta(minutes=5), + ) + 
monkeypatch.setattr('usecase.sandbox._new_session_id', lambda: SESSION_NEW_ID) + + with pytest.raises(RuntimeError, match='save_failed'): + usecase.execute(CreateSandboxCommand(chat_id=CHAT_ID)) + + assert len(runtime.create_calls) == 1 + assert runtime.stop_calls == [f'container-{SESSION_NEW_ID}'] + assert repository.get_active_by_chat_id(CHAT_ID) is None + assert _active_count_values(metrics) + assert _active_count_values(metrics)[-1] == 0 + _assert_increment_metric_present( + metrics, + 'sandbox.create.total', + attrs={'result': 'error'}, + ) + + def test_create_sandbox_replace_stop_failure_preserves_separate_identities( monkeypatch, ) -> None: @@ -755,9 +788,11 @@ def test_create_sandbox_replace_save_failure_records_stage_safe_trace_ids( with pytest.raises(RuntimeError, match='save_failed') as excinfo: usecase.execute(CreateSandboxCommand(chat_id=CHAT_ID)) - assert runtime.stop_calls == ['container-old'] + assert runtime.stop_calls == ['container-old', f'container-{SESSION_NEW_ID}'] assert len(runtime.create_calls) == 1 assert repository.get_active_by_chat_id(CHAT_ID) is None + assert _active_count_values(metrics) + assert _active_count_values(metrics)[-1] == 0 _assert_increment_metric_present( metrics, 'sandbox.create.total', diff --git a/usecase/sandbox.py b/usecase/sandbox.py index 0a3412f..59f1584 100644 --- a/usecase/sandbox.py +++ b/usecase/sandbox.py @@ -107,7 +107,7 @@ class CreateSandbox: 'sandbox.new_container.id', new_session.container_id, ) - self._repository.save(new_session) + self._save_created_session(new_session) _set_active_count(self._metrics, self._repository) if result == 'replaced': span.set_attribute('session.id', str(new_session.session_id)) @@ -131,6 +131,26 @@ class CreateSandbox: span.record_error(exc) raise + def _save_created_session(self, session: SandboxSession) -> None: + try: + self._repository.save(session) + except Exception as exc: + self._compensate_save_failure(session, exc) + raise + + def _compensate_save_failure( + self, 
+ session: SandboxSession, + error: Exception, + ) -> None: + try: + self._runtime.stop(session.container_id) + except Exception as stop_error: + _set_active_count(self._metrics, self._repository) + raise error from stop_error + + _set_active_count(self._metrics, self._repository) + class CleanupExpiredSandboxes: def __init__( From c5b6a84a4b71ac5d21c431565d5eba14ebd30b72 Mon Sep 17 00:00:00 2001 From: Azamat Date: Fri, 3 Apr 2026 02:29:18 +0300 Subject: [PATCH 26/30] add sandbox rollback regression tests --- tasks.md | 2 +- test/test_create_http.py | 148 ++++++++++++++++++++++++++++++++++++ test/test_docker_runtime.py | 40 ++++++++++ 3 files changed, 189 insertions(+), 1 deletion(-) diff --git a/tasks.md b/tasks.md index 77dd415..861a726 100644 --- a/tasks.md +++ b/tasks.md @@ -341,7 +341,7 @@ ### M28. Регрессии на rollback и startup failure observability - Субагент: `test-engineer` -- Статус: pending +- Статус: completed - Зависимости: `M27` - Commit required: yes - Commit message: `add sandbox rollback regression tests` diff --git a/test/test_create_http.py b/test/test_create_http.py index e8686c4..ae302c2 100644 --- a/test/test_create_http.py +++ b/test/test_create_http.py @@ -3,6 +3,7 @@ import json from datetime import UTC, datetime, timedelta from uuid import UUID +import pytest from docker import DockerClient from fastapi import FastAPI from starlette.types import Message, Scope @@ -226,6 +227,16 @@ class FixedSandboxState: return list(self._sessions) +class FailingSandboxState: + def __init__(self, error: Exception) -> None: + self._error = error + self.calls = 0 + + def list_active_sessions(self) -> list[SandboxSession]: + self.calls += 1 + raise self._error + + class CountingRegistry: def __init__(self, count_active_result: int) -> None: self._count_active_result = count_active_result @@ -238,6 +249,25 @@ class CountingRegistry: return self._count_active_result +class FailingRegistry: + def __init__(self, error: Exception, *, fail_on: str = 
'replace_all') -> None: + self._error = error + self._fail_on = fail_on + self.replaced_sessions: list[SandboxSession] = [] + self.count_calls = 0 + + def replace_all(self, sessions: list[SandboxSession]) -> None: + self.replaced_sessions = list(sessions) + if self._fail_on == 'replace_all': + raise self._error + + def count_active(self) -> int: + self.count_calls += 1 + if self._fail_on == 'count_active': + raise self._error + return 0 + + def build_config() -> AppConfig: return AppConfig( app=AppSectionConfig(name='master', env='test'), @@ -719,6 +749,124 @@ def test_reconciliation_uses_registry_backed_active_count_metric() -> None: assert tracer.spans[0][2].attrs['sandbox.active_count'] == 7 +def test_reconciliation_records_error_when_state_source_fails() -> None: + logger = FakeLogger() + metrics = RecordingMetrics() + tracer = RecordingTracer() + state_error = RuntimeError('state_failed') + state_source = FailingSandboxState(state_error) + reconciler = SandboxSessionReconciler( + state_source=state_source, + registry=CountingRegistry(count_active_result=7), + logger=logger, + metrics=metrics, + tracer=tracer, + ) + + with pytest.raises(RuntimeError, match='state_failed') as excinfo: + reconciler.execute() + + assert state_source.calls == 1 + assert metrics.set_calls == [] + spans = [ + span + for name, _, span in tracer.spans + if name == 'adapter.sandbox.reconcile_sessions' + ] + assert spans + span = spans[0] + assert span.attrs['sandbox.result'] == 'error' + assert 'sandbox.discovered_count' not in span.attrs + assert 'sandbox.active_count' not in span.attrs + assert excinfo.value in span.errors + + +def test_reconciliation_records_error_without_active_count_metric_on_registry_failure() -> ( + None +): + logger = FakeLogger() + metrics = RecordingMetrics() + tracer = RecordingTracer() + created_at = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + session = SandboxSession( + session_id=SESSION_ID, + chat_id=CHAT_ID, + container_id='container-123', + 
status=SandboxStatus.RUNNING, + created_at=created_at, + expires_at=created_at + timedelta(minutes=5), + ) + registry_error = RuntimeError('replace_failed') + registry = FailingRegistry(registry_error) + reconciler = SandboxSessionReconciler( + state_source=FixedSandboxState([session]), + registry=registry, + logger=logger, + metrics=metrics, + tracer=tracer, + ) + + with pytest.raises(RuntimeError, match='replace_failed') as excinfo: + reconciler.execute() + + assert registry.replaced_sessions == [session] + assert registry.count_calls == 0 + assert metrics.set_calls == [] + spans = [ + span + for name, _, span in tracer.spans + if name == 'adapter.sandbox.reconcile_sessions' + ] + assert spans + span = spans[0] + assert span.attrs['sandbox.discovered_count'] == 1 + assert span.attrs['sandbox.result'] == 'error' + assert 'sandbox.active_count' not in span.attrs + assert excinfo.value in span.errors + + +def test_reconciliation_records_error_when_registry_count_active_fails() -> None: + logger = FakeLogger() + metrics = RecordingMetrics() + tracer = RecordingTracer() + created_at = datetime(2026, 4, 2, 12, 0, tzinfo=UTC) + session = SandboxSession( + session_id=SESSION_ID, + chat_id=CHAT_ID, + container_id='container-123', + status=SandboxStatus.RUNNING, + created_at=created_at, + expires_at=created_at + timedelta(minutes=5), + ) + registry_error = RuntimeError('count_failed') + registry = FailingRegistry(registry_error, fail_on='count_active') + reconciler = SandboxSessionReconciler( + state_source=FixedSandboxState([session]), + registry=registry, + logger=logger, + metrics=metrics, + tracer=tracer, + ) + + with pytest.raises(RuntimeError, match='count_failed') as excinfo: + reconciler.execute() + + assert registry.replaced_sessions == [session] + assert registry.count_calls == 1 + assert metrics.set_calls == [] + spans = [ + span + for name, _, span in tracer.spans + if name == 'adapter.sandbox.reconcile_sessions' + ] + assert spans + span = spans[0] + assert 
span.attrs['sandbox.discovered_count'] == 1 + assert 'sandbox.active_count' not in span.attrs + assert span.attrs['sandbox.result'] == 'error' + assert excinfo.value in span.errors + + def test_build_container_wires_observability_into_runtime_and_reconciler( monkeypatch, ) -> None: diff --git a/test/test_docker_runtime.py b/test/test_docker_runtime.py index 352adad..267d177 100644 --- a/test/test_docker_runtime.py +++ b/test/test_docker_runtime.py @@ -83,6 +83,7 @@ class FakeContainers: self.run_result = run_result or FakeContainer('container-123') self.get_result: FakeContainer | Exception | None = None self.list_result: list[object] = [] + self.list_error: Exception | None = None def run( self, @@ -114,6 +115,8 @@ class FakeContainers: def list(self, *, filters: dict[str, list[str]]) -> list[object]: self.list_calls.append({'filters': filters}) + if self.list_error is not None: + raise self.list_error return self.list_result @@ -690,3 +693,40 @@ def test_runtime_list_active_records_observability(tmp_path: Path) -> None: }, ) assert not span.errors + + +def test_runtime_list_active_error_records_observability(tmp_path: Path) -> None: + config = build_config(tmp_path) + containers = FakeContainers() + containers.list_error = DockerException('boom') + metrics = RecordingMetrics() + tracer = RecordingTracer() + runtime = DockerSandboxRuntime( + config, + FakeDockerClient(containers), + metrics, + tracer, + ) + + with pytest.raises(SandboxError) as excinfo: + runtime.list_active_sessions() + + assert str(excinfo.value) == 'sandbox_list_failed' + _find_increment_call( + metrics, + 'sandbox.runtime.error.total', + attrs={'operation': 'list_active', 'error.type': 'DockerException'}, + ) + duration_call = _find_record_call( + metrics, + 'sandbox.runtime.list_active.duration_ms', + attrs={'operation': 'list_active', 'result': 'error'}, + ) + assert duration_call[1] >= 0 + span = _find_span( + tracer, + 'adapter.docker.list_active_sandboxes', + span_attrs={'sandbox.result': 
'error'}, + ) + assert isinstance(excinfo.value.__cause__, DockerException) + assert excinfo.value in span.errors From 3293bccc5b24baa2b8222b46251fb9ae4c8ff9d4 Mon Sep 17 00:00:00 2001 From: Azamat Date: Fri, 3 Apr 2026 09:49:14 +0300 Subject: [PATCH 27/30] [feat] update readme.md --- AGENTS.md | 10 +- README.md | 315 ++++++++++++++++++++++++++++++------ config/docker-compose.yml | 4 +- tasks.md | 2 +- test/test_docker_runtime.py | 2 +- 5 files changed, 273 insertions(+), 60 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index ba5bc34..c9f89d9 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -45,12 +45,12 @@ - Do not use Beads - Do not use `bd` - Use `uv` for Python commands and dependency management -- Do not create commits on your own -- Work on one task at a time - Prefer delegation for implementation -- Delegate only one task at a time -- After one task return to the user with result verification and next options -- Wait for the user before the next task commit or fix +- After implementation, run `Code-Reviewer` agent +- Pass errors to `test-engineer` agent to capture +- Delegate `Feature-Developer` agent fix the errors +- Repeat the cycle until no errors remain +- Ensure all tests pass ## Makefile - `make install` install deps with `uv` diff --git a/README.md b/README.md index 2339337..3ff3dd6 100644 --- a/README.md +++ b/README.md @@ -1,36 +1,268 @@ -Это шаблон Python-сервиса на чистой архитектуре с заменяемым web-слоем, типизированным конфигом, явным dependency wiring и observability через порты. +# master-service -## Что это за проект +`master-service` — это control-plane сервис для sandbox-контейнеров с AI-агентом. +Он поднимает и переиспользует sandbox на чат, подключает рабочие volume, восстанавливает state после рестарта и отдает наружу минимальный HTTP API под `/api/v1`. 
-- Небольшой референсный сервис со слоями `domain/`, `usecase/`, `repository/` и `adapter/` -- Шаблон для сервисов на FastAPI, где FastAPI остается только во внешнем HTTP adapter -- Проект, где конфиг собирается из `config/app.yaml`, `.env` и env vars в одно дерево dataclass-конфигов -- Проект, где repository и usecase создаются один раз на старте приложения в composition root -- Проект, где логи, метрики и трейсы скрыты за интерфейсами и могут работать через `stdout`, файл или OpenTelemetry runtime +Важно: в локальном `config/app.yaml` исторически еще стоят template-имена `web-python-skelet`. +Если хочешь, чтобы `/health` и OTel service name локально тоже показывали `master-service`, переопредели: +- `APP_NAME=master-service` +- `APP_OTEL_SERVICE_NAME=master-service` -## Основные идеи +Сервис реализован на Python с Clean Architecture: +- `domain/` — сущности и доменные ошибки +- `usecase/` — сценарии приложения и порты +- `repository/` — реализации repository +- `adapter/` — HTTP, config, DI, Docker runtime и observability -- Clean Architecture и границы SOLID -- Направление зависимостей только внутрь -- Тонкие adapter-слои и явная сборка зависимостей -- Заменяемый HTTP-слой -- Observability без протекания OpenTelemetry во внутренние слои +## Что умеет сейчас + +Текущий sandbox MVP покрывает: +- `GET /api/v1/health` +- `POST /api/v1/create` с `chat_id: UUID` +- одну активную sandbox на чат +- reuse активной sandbox до истечения TTL +- cleanup просроченных sandbox в фоне +- startup reconciliation по Docker labels после рестарта сервиса +- chat mount `rw`, dependencies mount `ro`, lambda-tools mount `ro` +- логи, метрики и трейсы через порты `Logger`, `Metrics`, `Tracer` + +Пока вне scope: +- auth и access control +- p2p/WebSocket lease +- workspace/chat CRUD API +- central DB, artifacts, S3, quota и retention policy + +## Как устроен проект + +- FastAPI живет только во внешнем adapter слое +- Docker живет только во внешнем adapter слое +- конфиг собирается из 
`config/app.yaml`, `.env` и env vars в один dataclass tree +- repository и usecase создаются один раз на старте в `adapter/di/container.py` +- observability не протекает во внутренние слои через OpenTelemetry SDK + +## Структура + +- `domain/` — core model и domain errors +- `usecase/` — use cases и interfaces +- `repository/` — in-memory и другие repository implementations +- `adapter/config/` — typed config models и loader +- `adapter/docker/` — Docker sandbox runtime +- `adapter/observability/` — logger/metrics/tracer runtime factory +- `adapter/otel/` — OpenTelemetry adapters +- `adapter/di/` — composition root +- `adapter/http/fastapi/` — app, middleware, schemas, routers +- `adapter/sandbox/` — sandbox reconciliation logic +- `config/` — YAML config files +- `docs/` — ADR и проектные гайды ## Быстрый старт +### Требования + +- Python 3.13 +- `uv` +- локальный Docker daemon +- секреты `APP_API_TOKEN` и `APP_SIGNING_KEY` + +### Установка + ```bash make install +``` + +### Локальный запуск + +```bash APP_API_TOKEN=local-api-token APP_SIGNING_KEY=local-signing-key make run ``` -Приложение стартует на `http://0.0.0.0:8123` и публикует versioned API под `/api/v1`. +Это поднимет сам API, но для успешного `POST /api/v1/create` локально нужен еще рабочий sandbox runtime: + +- Docker daemon должен быть доступен по `docker.base_url` +- образ `sandbox.image` должен существовать локально +- директории `sandbox.dependencies_host_path` и `sandbox.lambda_tools_host_path` должны существовать + +В дефолтном `config/app.yaml` это значит: + +```bash +mkdir -p var/sandbox/dependencies var/sandbox/lambda-tools +docker image inspect ai-agent:latest >/dev/null +``` + +Если у тебя нет готового `ai-agent:latest`, проще начать с Docker Compose smoke path ниже. 
+ +После старта сервис доступен на: +- `http://127.0.0.1:8123/api/v1/health` + +Проверка health: + +```bash +curl http://127.0.0.1:8123/api/v1/health +``` + +Создание или reuse sandbox: + +```bash +curl -X POST http://127.0.0.1:8123/api/v1/create \ + -H 'Content-Type: application/json' \ + -d '{"chat_id":"11111111-1111-1111-1111-111111111111"}' +``` + +Пример ответа: + +```json +{ + "session_id": "3701cfe3-e05e-48af-8385-442dcd954ca2", + "chat_id": "11111111-1111-1111-1111-111111111111", + "container_id": "64d839c6007de9396ee08ad4af4a22a59a6410ec5f4892a9277a87eb49c3ff5d", + "status": "running", + "expires_at": "2026-04-02T21:11:38.292893Z" +} +``` + +## Запуск через Docker Compose + +Для локального smoke-run есть `docker-compose.yml`. +Он поднимает: +- `app` +- `docker-engine` в режиме Docker-in-Docker +- `otel-collector` + +При этом `app` получает compose-specific config из: +- `config/docker-compose.yml` + +Запуск: + +```bash +make compose-up +``` + +Проверка: + +```bash +make compose-ps +make compose-logs +``` + +Остановка: + +```bash +make compose-down +``` + +Важно: +- в `config/docker-compose.yml` сейчас для smoke-проверки стоит `sandbox.image: nginx:1.27-alpine` +- для реального agent runtime замени `sandbox.image` на образ своего sandbox/agent контейнера +- в compose auth env vars нужны для startup config, но текущий MVP API еще не проверяет request token + +## Как конфигурировать + +### Источники конфига + +Конфиг собирается в таком порядке: +1. базовый YAML из `config/app.yaml` +2. значения из `.env` +3. process env vars поверх `.env` + +То есть env vars имеют наивысший приоритет. + +### Обязательные секреты + +Нужны всегда: +- `APP_API_TOKEN` +- `APP_SIGNING_KEY` + +Сейчас это startup config, а не активная request auth для `/api/v1/create` и `/api/v1/health`. +То есть в текущем MVP токен не нужно передавать в HTTP headers для вызова этих endpoint. 
+ +### Основные секции YAML + +В `config/app.yaml` и `config/docker-compose.yml` есть секции: +- `app` +- `http` +- `logging` +- `metrics` +- `tracing` +- `otel` +- `docker` +- `sandbox` +- `security` + +### Полезные env overrides + +Чаще всего полезны: + +#### Общие +- `APP_NAME` +- `APP_ENV` +- `APP_HTTP_HOST` +- `APP_HTTP_PORT` + +#### Логирование и observability +- `APP_LOGGING_LEVEL` +- `APP_LOGGING_OUTPUT` +- `APP_LOGGING_FORMAT` +- `APP_LOGGING_FILE_PATH` +- `APP_METRICS_ENABLED` +- `APP_TRACING_ENABLED` +- `APP_OTEL_SERVICE_NAME` +- `APP_OTEL_LOGS_ENDPOINT` +- `APP_OTEL_METRICS_ENDPOINT` +- `APP_OTEL_TRACES_ENDPOINT` + +#### Docker runtime +- `APP_DOCKER_BASE_URL` + +#### Sandbox +- `APP_SANDBOX_IMAGE` +- `APP_SANDBOX_TTL_SECONDS` +- `APP_SANDBOX_CLEANUP_INTERVAL_SECONDS` +- `APP_SANDBOX_CHATS_ROOT` +- `APP_SANDBOX_DEPENDENCIES_HOST_PATH` +- `APP_SANDBOX_LAMBDA_TOOLS_HOST_PATH` +- `APP_SANDBOX_CHAT_MOUNT_PATH` +- `APP_SANDBOX_DEPENDENCIES_MOUNT_PATH` +- `APP_SANDBOX_LAMBDA_TOOLS_MOUNT_PATH` + +#### Security +- `APP_API_TOKEN_HEADER` +- `APP_API_TOKEN` +- `APP_SIGNING_KEY` + +### Что важно в sandbox config + +- `docker.base_url` — адрес Docker daemon +- `sandbox.image` — образ sandbox контейнера +- `sandbox.ttl_seconds` — TTL sandbox +- `sandbox.cleanup_interval_seconds` — частота cleanup loop +- `sandbox.chats_root` — корень chat directories +- `sandbox.dependencies_host_path` — host path для dependency cache +- `sandbox.lambda_tools_host_path` — host path для read-only lambda-tools +- `sandbox.chat_mount_path` — путь внутри sandbox для chat volume +- `sandbox.dependencies_mount_path` — путь внутри sandbox для dependency cache +- `sandbox.lambda_tools_mount_path` — путь внутри sandbox для lambda-tools + +## Основные команды + +- `make install` — установить зависимости через `uv` +- `make run` — локальный запуск +- `make run-otel` — запуск с OTel endpoints из env +- `make test` — `pytest` +- `make lint` — `ruff` +- `make typecheck` — `mypy` +- `make 
pre-commit` — lint + typecheck + test +- `make compose-build` — собрать compose images +- `make compose-up` — поднять локальный stack +- `make compose-down` — остановить stack +- `make compose-logs` — смотреть логи +- `make compose-ps` — смотреть статус сервисов ## Документация ### Гайды - [Правила проекта и ограничения для агента](AGENTS.md) -- [Кодстайл проекта для AI-агента](docs/CODESTYLE.md) +- [Кодстайл проекта](docs/CODESTYLE.md) - [Чистая архитектура, SOLID, DIP, Protocol и repository](docs/CLEAN_ARCHITECTURE_RU.md) - [Логи, метрики и трейсы в этом проекте](docs/OBSERVABILITY_RU.md) - [Как чистая архитектура реализована здесь](docs/PROJECT_GUIDE_RU.md) @@ -43,43 +275,24 @@ APP_API_TOKEN=local-api-token APP_SIGNING_KEY=local-signing-key make run - [003 Observability Via Interfaces](docs/003-observability-via-interfaces.md) - [004 Versioned HTTP API](docs/004-versioned-http-api.md) - [005 Early FastAPI OTel Instrumentation](docs/005-fastapi-otel-early-instrumentation.md) +- [006 MVP Docker Sandbox Orchestration](docs/006-mvp-docker-sandbox-orchestration.md) +- [007 Startup Sandbox Reconciliation](docs/007-startup-sandbox-reconciliation.md) +- [008 Sandbox Lifecycle Observability](docs/008-sandbox-lifecycle-observability.md) -## Структура проекта +## Для AI-агента -- `domain/` - core-сущности и доменные ошибки -- `usecase/` - прикладные сценарии и порты -- `repository/` - реализации repository -- `adapter/config/` - загрузка и модели типизированного конфига -- `adapter/observability/` - выбор runtime для logger, metrics и tracer -- `adapter/otel/` - OpenTelemetry adapters -- `adapter/di/` - composition root и singleton wiring -- `adapter/http/fastapi/` - HTTP-схемы, dependencies, middleware и routers -- `config/` - YAML-конфиг приложения и локального OTel collector +Если ты меняешь проект как AI-агент, сначала прочитай: -## Для ИИ +1. [AGENTS.md](AGENTS.md) +2. [docs/CODESTYLE.md](docs/CODESTYLE.md) +3. [docs/PROJECT_GUIDE_RU.md](docs/PROJECT_GUIDE_RU.md) +4. 
[docs/CLEAN_ARCHITECTURE_RU.md](docs/CLEAN_ARCHITECTURE_RU.md) +5. [docs/OBSERVABILITY_RU.md](docs/OBSERVABILITY_RU.md) +6. релевантные ADR в `docs/` +7. [tasks.md](tasks.md) -Если ты AI-агент и собираешься что-то менять в проекте, сначала прочитай документы в таком порядке: - -1. [Правила проекта и ограничения агента](AGENTS.md) - обязательные правила работы в этом репозитории -2. [Кодстайл проекта для AI-агента](docs/CODESTYLE.md) - границы слоев, стиль кода и правила зависимостей -3. [Как чистая архитектура реализована здесь](docs/PROJECT_GUIDE_RU.md) - практическая карта проекта и типовые сценарии изменений -4. [Чистая архитектура, SOLID, DIP, Protocol и repository](docs/CLEAN_ARCHITECTURE_RU.md) - базовые архитектурные принципы и примеры -5. [Логи, метрики и трейсы в этом проекте](docs/OBSERVABILITY_RU.md) - читать перед любыми изменениями в observability, middleware и runtime wiring -6. [ADR в `docs/`](docs/001-composition-root-and-lifetimes.md) - читать релевантные решения перед изменением архитектуры или startup wiring -7. 
[План задач и история работ](tasks.md) - понять, что уже сделано, что отложено и какие ограничения были зафиксированы - -Перед началом работы: - -- Определи, в каком слое будет изменение: `domain/`, `usecase/`, `repository/` или `adapter/` -- Убедись, что зависимости идут только внутрь -- Не тащи FastAPI и OpenTelemetry во внутренние слои -- Сначала изучи существующий код в нужной директории, потом вноси изменения -- Если задача затрагивает архитектурное решение, сначала сверяйся с ADR и проектными правилами - -## Запуск и команды - -- Для локального запуска нужны `APP_API_TOKEN` и `APP_SIGNING_KEY` -- `make run` запускает приложение локально -- `make run-otel` запускает приложение с локальными OTel endpoints из env vars -- `make pre-commit` запускает `ruff`, `mypy` и `pytest` -- `make compose-up` поднимает приложение и локальный LGTM stack через Docker Compose +Главные правила: +- сначала определи слой изменения +- зависимости только внутрь +- не тащи FastAPI и OpenTelemetry во внутренние слои +- архитектурные решения сверяй с ADR diff --git a/config/docker-compose.yml b/config/docker-compose.yml index a601f99..5ddb745 100644 --- a/config/docker-compose.yml +++ b/config/docker-compose.yml @@ -29,8 +29,8 @@ docker: sandbox: image: nginx:1.27-alpine - ttl_seconds: 30 - cleanup_interval_seconds: 5 + ttl_seconds: 300 + cleanup_interval_seconds: 60 chats_root: /var/lib/master-sandbox/chats dependencies_host_path: /var/lib/master-dependencies lambda_tools_host_path: /var/lib/master-lambda-tools diff --git a/tasks.md b/tasks.md index 861a726..d5009d7 100644 --- a/tasks.md +++ b/tasks.md @@ -353,7 +353,7 @@ ### M29. 
Финальный boundary review для sandbox observability - Субагент: `code-reviewer` -- Статус: pending +- Статус: completed - Зависимости: `M28` - Commit required: no - Scope: подтвердить, что M27-M28 закрыли remaining M26 замечания diff --git a/test/test_docker_runtime.py b/test/test_docker_runtime.py index 267d177..7f71275 100644 --- a/test/test_docker_runtime.py +++ b/test/test_docker_runtime.py @@ -14,7 +14,7 @@ from adapter.docker.runtime import DockerSandboxRuntime from adapter.observability.noop import NoopMetrics, NoopTracer from domain.error import SandboxError, SandboxStartError from domain.sandbox import SandboxSession, SandboxStatus -from usecase.interface import AttrValue, Attrs +from usecase.interface import Attrs, AttrValue CHAT_ID = UUID('123e4567-e89b-12d3-a456-426614174000') NON_CANONICAL_CHAT_ID = '123E4567E89B12D3A456426614174000' From 0ca0bac9bf12bdb9e18dadb74dc90bcc52a49751 Mon Sep 17 00:00:00 2001 From: Azamat Date: Tue, 7 Apr 2026 19:11:51 +0300 Subject: [PATCH 28/30] [feat] add tasks --- AGENTS.md | 2 +- tasks/roadmap.md | 279 ++++++++++++++++++++++++++ tasks/sprint-01-storage-foundation.md | 130 ++++++++++++ 3 files changed, 410 insertions(+), 1 deletion(-) create mode 100644 tasks/roadmap.md create mode 100644 tasks/sprint-01-storage-foundation.md diff --git a/AGENTS.md b/AGENTS.md index c9f89d9..59d88c2 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -41,7 +41,7 @@ - Keep HTTP models and middleware inside `adapter/http/fastapi/` ## Workflow -- Use `tasks.md` for planning +- Use dir `tasks/` for planning - Do not use Beads - Do not use `bd` - Use `uv` for Python commands and dependency management diff --git a/tasks/roadmap.md b/tasks/roadmap.md new file mode 100644 index 0000000..aae8e49 --- /dev/null +++ b/tasks/roadmap.md @@ -0,0 +1,279 @@ +# Roadmap: workspace/chat/files + artifacts + +## Цель + +Следующий продуктовый приоритет для `master-service`: +1. управление `workspace` / `chat` / `chat files` +2. 
хранение и выдача `artifacts` + +Идея этапа: превратить текущий sandbox MVP в storage-centric control plane, где sandbox работает поверх явных пользовательских данных, а не только поверх `chat_id`. + +## Что считаем в scope этого roadmap + +- workspace metadata +- chat metadata и chat directories +- `history.md` рядом с чатом +- upload/list/download/delete/clear для chat files +- quota / usage accounting +- artifact metadata +- object storage adapter для artifacts +- delivery / mark-for-delete flow для artifacts +- интеграция storage model с текущим sandbox lifecycle + +## Что пока вне scope + +- полноценный auth/access control +- access lease и WebSocket handoff +- multi-node orchestration как отдельный эпик +- full retention engine для workspace/account lifecycle +- внешние messenger integrations + +## Зафиксированные продуктовые решения + +- trusted caller передает `user_id: UUID` +- в v1 один `Workspace` на одного `User` +- metadata adapters пока делаем `in-memory` +- chat history в v1 хранится в `history.md` +- chat files отдаем через master: metadata API + download endpoint +- soft quota 10 GB блокирует quota-relevant write-path операции, но не делает hard reservation +- chat create сам по себе не блокируется quota check, пока не появляется значимый file/artifact payload +- delete в v1 — hard delete; chat delete запрещен при active sandbox +- artifact delivery подтверждается явным ack +- artifact blob отдается через presigned URL, metadata — через master +- object storage layout: один bucket на environment + prefixes +- artifact states: `created`, `stored`, `delivered`, `delivery_failed`, `marked_for_delete`, `deleted` +- artifact retention: после ack -> `marked_for_delete` -> async cleanup; без ack действует отдельный TTL + +## Принципы выполнения + +- Clean Architecture и dependency direction сохраняются +- filesystem и object storage живут только во внешних adapter/repository слоях +- `domain/` и `usecase/` не знают про FastAPI, Docker, OpenTelemetry, env 
+- один delivery slice = одна атомарная задача = один commit +- архитектурные развилки сначала фиксируются кратким ADR-lite в `docs/` + +## Легенда исполнителей + +- `primary-agent` — архитектура, ADR, domain/contracts, финальная сборка решений +- `junior` — простые in-memory adapters и базовые unit-level задачи +- `junior+opus` — usecase/adapter/API slices со средним уровнем связности +- `test-engineer` — regression и integration test packs +- `code-reviewer` — review-only этапы + +--- + +## Phase 0. Design checkpoints + +### R00. ADR: storage source of truth +- **Рекомендуемый исполнитель:** `primary-agent` +- **Зачем:** определить, что является source of truth для `Workspace`, `Chat`, `ChatFile`, `Artifact` +- **Нужно решить:** как устроен in-memory-first этап, как metadata связаны с filesystem paths, как sandbox получает path текущего chat +- **Выход:** ADR-lite + список сущностей и ownership boundaries + +### R01. ADR: artifact lifecycle +- **Рекомендуемый исполнитель:** `primary-agent` +- **Зачем:** зафиксировать states и delivery contract для artifact flow +- **Нужно решить:** `created -> stored -> delivered -> delivery_failed -> marked_for_delete -> deleted` +- **Выход:** ADR-lite + минимальный state machine + +### R02. ADR: chat history policy +- **Рекомендуемый исполнитель:** `primary-agent` +- **Зачем:** зафиксировать, как `history.md` соотносится с metadata чата +- **Нужно решить:** кто создает файл, кто пишет initial header, как читать/обновлять metadata без парсинга файла +- **Выход:** ADR-lite + правила sync между metadata и history file + +--- + +## Phase 1. Foundation for workspace/chat/file domain + +### R10. Domain model for user storage +- **Рекомендуемый исполнитель:** `primary-agent` +- **Scope:** `Workspace`, `Chat`, `ChatFile`, domain errors +- **Layer:** `domain/` +- **Выход:** минимальные сущности, value objects, ошибки конфликтов/не-найдено/квота +- **Depends on:** `R00`, `R02` + +### R11. 
Usecase ports for storage and metadata +- **Рекомендуемый исполнитель:** `primary-agent` +- **Scope:** repository/storage interfaces для workspace/chat/chat-file/history/quota +- **Layer:** `usecase/` +- **Выход:** порты для metadata repo, file storage, usage reader, id generator, clock и trusted caller identity input +- **Depends on:** `R10` + +### R12. In-memory metadata adapter foundation +- **Рекомендуемый исполнитель:** `junior` +- **Scope:** первая metadata implementation для workspace/chat/chat-file/artifact metadata +- **Layer:** `repository/` +- **Примечание:** это промежуточный этап; durable DB идет отдельным follow-up после storage/product slice +- **Выход:** in-memory CRUD для workspace/chat/chat-file/artifact metadata +- **Depends on:** `R11` + +### R13. Filesystem storage adapter foundation +- **Рекомендуемый исполнитель:** `junior+opus` +- **Scope:** storage adapter для chat directories, `history.md`, uploads и file metadata extraction +- **Layer:** `adapter/` или `repository/` в зависимости от финальной границы +- **Выход:** операции create/list/delete/read metadata для chat files, history path management и download path resolution +- **Depends on:** `R11` + +### R14. Sandbox integration with chat metadata +- **Рекомендуемый исполнитель:** `junior+opus` +- **Scope:** перестать считать `chat_id` единственным источником layout, использовать metadata-backed chat path +- **Layer:** `usecase/` + outer adapters +- **Выход:** sandbox mounts current chat directory, созданную через storage model +- **Depends on:** `R12`, `R13` + +--- + +## Phase 2. Workspace and chat API + +### R20. Create workspace on first touch +- **Рекомендуемый исполнитель:** `junior+opus` +- **Scope:** auto-create workspace при первом запросе на user storage +- **API:** внутренний usecase + при необходимости явная HTTP ручка +- **Выход:** гарантированный `Workspace` для пользователя по `user_id` +- **Depends on:** `R12` + +### R21. 
Chat CRUD +- **Рекомендуемый исполнитель:** `junior+opus` +- **Scope:** create/get/list/delete chat +- **API:** versioned HTTP under `/api/v1` +- **Выход:** first-class chat metadata вместо implicit-only `chat_id` +- **Depends on:** `R20`, `R12`, `R13` + +### R22. History file lifecycle +- **Рекомендуемый исполнитель:** `junior+opus` +- **Scope:** создавать `history.md` вместе с chat directory, читать и обновлять metadata +- **Выход:** первый стабильный contract для истории чата в filesystem +- **Depends on:** `R21`, `R13` + +### R23. Chat file upload and file metadata API +- **Рекомендуемый исполнитель:** `junior+opus` +- **Scope:** upload/list/get metadata/delete single file/clear chat files +- **API:** HTTP adapter + schemas +- **Выход:** базовый file management API на chat scope + master download endpoint +- **Depends on:** `R21`, `R13` + +### R24. Usage and quota accounting +- **Рекомендуемый исполнитель:** `junior+opus` +- **Scope:** считать storage usage по workspace/chat, мягкая quota 10 GB +- **Выход:** usecases и metrics для current usage; file/artifact write-path reject при превышении soft quota +- **Depends on:** `R23` + +### R25. Tests and review for storage slice +- **Рекомендуемый исполнитель:** `test-engineer` +- **Scope:** unit + adapter + HTTP tests, boundary review +- **Выход:** regression coverage для workspace/chat/files +- **Depends on:** `R20`-`R24` + +--- + +## Phase 3. Artifact pipeline + +### R30. Artifact domain and metadata model +- **Рекомендуемый исполнитель:** `primary-agent` +- **Scope:** `Artifact` entity, states, delivery metadata, linkage to `user/chat` +- **Layer:** `domain/`, `usecase/` +- **Выход:** базовая artifact model + errors + repository ports для in-memory metadata stage +- **Depends on:** `R01`, `R12` + +### R31. 
Object storage adapter +- **Рекомендуемый исполнитель:** `junior+opus` +- **Scope:** S3-compatible adapter for artifact blobs +- **Layer:** outer `adapter/` or `repository/` +- **Выход:** upload/get/delete primitives behind interface + presigned URL support +- **Depends on:** `R30` + +### R32. Artifact registration and upload flow +- **Рекомендуемый исполнитель:** `junior+opus` +- **Scope:** usecase, который связывает metadata и object storage +- **Выход:** artifact можно зарегистрировать, сохранить blob и получить external reference +- **Depends on:** `R30`, `R31` + +### R33. Artifact list/get metadata API +- **Рекомендуемый исполнитель:** `junior+opus` +- **Scope:** list artifacts by chat/user, get artifact metadata/status +- **Выход:** внешний API для управления artifact metadata и выдачи presigned download references +- **Depends on:** `R32` + +### R34. Delivery acknowledgement flow +- **Рекомендуемый исполнитель:** `junior+opus` +- **Scope:** mark artifact as delivered / delivery_failed +- **Выход:** подтвержденный delivery state через explicit ack и база для retention +- **Depends on:** `R32` + +### R35. Artifact delete / mark-for-delete policy +- **Рекомендуемый исполнитель:** `junior+opus` +- **Scope:** policy-driven cleanup hooks после delivery confirmation или TTL +- **Выход:** artifact lifecycle завершен, metadata и blob cleanup согласованы, `marked_for_delete` используется как переходное состояние +- **Depends on:** `R34` + +### R36. Tests and review for artifact slice +- **Рекомендуемый исполнитель:** `test-engineer` +- **Scope:** unit + adapter + HTTP tests, boundary review +- **Выход:** regression coverage для artifact lifecycle и object storage integration +- **Depends on:** `R30`-`R35` + +--- + +## Phase 4. Cross-slice hardening + +### R40. 
Observability for storage usage and artifacts +- **Рекомендуемый исполнитель:** `junior+opus` +- **Scope:** metrics по usage/quota, artifact upload latency, artifact errors +- **Выход:** dashboards and alerts for storage-centric flows +- **Depends on:** `R24`, `R32`, `R35` + +### R41. Cleanup safety rules +- **Рекомендуемый исполнитель:** `junior+opus` +- **Scope:** не удалять chat files/artifacts, если есть активный sandbox или незавершенный delivery flow +- **Выход:** safe cleanup invariants +- **Depends on:** `R24`, `R35` + +### R42. Docs refresh +- **Рекомендуемый исполнитель:** `primary-agent` +- **Scope:** README, ADRs, API docs, operator notes +- **Выход:** актуальная документация для storage and artifact flows +- **Depends on:** все предыдущие slices + +--- + +## Рекомендуемый порядок выполнения + +### Priority A — workspace/chat/files +1. `R00` `R02` +2. `R10` `R11` +3. `R12` `R13` +4. `R20` `R21` `R22` +5. `R23` `R24` +6. `R14` `R25` + +### Priority B — artifacts +1. `R01` +2. `R30` `R31` +3. `R32` `R33` +4. `R34` `R35` +5. `R36` + +### Future follow-up after Priority A+B +1. durable metadata repository instead of in-memory adapters +2. auth/access control and access lease +3. multi-node storage/session coordination + +### Priority C — hardening +1. `R40` +2. `R41` +3. `R42` + +--- + +## Что даст этот roadmap + +После выполнения Priority A + B сервис сможет: +- создавать и хранить workspace/chat metadata +- управлять файлами пользователя в chat scope +- хранить историю чата в предсказуемом layout +- поднимать sandbox поверх first-class chat storage +- хранить artifact metadata и blob отдельно +- отдавать artifact metadata наружу и подтверждать доставку + +То есть `master-service` станет не только sandbox orchestrator, но и полноценным control-plane для user storage и artifact lifecycle. 
diff --git a/tasks/sprint-01-storage-foundation.md b/tasks/sprint-01-storage-foundation.md new file mode 100644 index 0000000..72d44f1 --- /dev/null +++ b/tasks/sprint-01-storage-foundation.md @@ -0,0 +1,130 @@ +# Sprint 01 — storage foundation + +## Цель спринта + +Подготовить первый исполнимый storage slice для `workspace` / `chat` / `chat files` без durable DB и без auth, но с корректными clean-architecture границами. + +## Зафиксированные допущения + +- trusted caller передает `user_id: UUID` +- один `Workspace` на одного `User` +- metadata adapters пока `in-memory` +- история чата живет в `history.md` +- chat files отдаются через master download endpoint +- soft quota блокирует quota-relevant write-path операции +- chat create сам по себе quota не блокирует +- delete в v1 — hard delete; chat delete запрещен при active sandbox + +## Scope in + +- domain model для `Workspace`, `Chat`, `ChatFile` +- usecase ports для metadata, filesystem storage и usage +- in-memory metadata adapters +- filesystem adapter для chat directories, `history.md` и file operations +- usecases для auto-create workspace, chat CRUD и file CRUD +- HTTP API для chat/file operations +- базовый usage/quota check + +## Scope out + +- durable DB +- auth/access control +- artifacts +- retention engine +- p2p/access lease +- multi-node behavior + +## Легенда исполнителей + +- `primary-agent` — архитектура, ADR, domain/contracts +- `junior` — простые repositories и базовые unit tasks +- `junior+opus` — adapter/usecase/API задачи средней сложности +- `test-engineer` — test packs +- `code-reviewer` — review-only этап + +## Порядок задач + +### S01. 
ADR-lite и storage contracts +- **Рекомендуемый исполнитель:** `primary-agent` +- **Commit:** `add storage foundation contracts` +- **Scope:** зафиксировать storage source-of-truth и history policy; добавить domain entities и usecase ports +- **Files:** `docs/009-storage-foundation.md`, `docs/010-chat-history-policy.md`, `domain/*`, `usecase/interface.py` или новые storage usecase files +- **Acceptance:** есть минимальные сущности/ошибки/порты; ADR краткий и консистентный + +### S02. In-memory metadata adapters +- **Рекомендуемый исполнитель:** `junior` +- **Commit:** `add in-memory storage repositories` +- **Scope:** `WorkspaceRepository`, `ChatRepository`, `ChatFileRepository` in-memory implementations +- **Files:** `repository/*` +- **Acceptance:** CRUD и базовые query paths работают без HTTP + +### S03. Filesystem chat storage adapter +- **Рекомендуемый исполнитель:** `junior+opus` +- **Commit:** `add chat filesystem storage` +- **Scope:** create chat directory, create `history.md`, save/list/delete files, collect file metadata +- **Files:** outer adapter/repository storage files +- **Acceptance:** chat directory layout стабилен; adapter не протекает внутрь + +### S04. Workspace and chat usecases +- **Рекомендуемый исполнитель:** `junior+opus` +- **Commit:** `add workspace and chat usecases` +- **Scope:** auto-create workspace on first touch, create/get/list/delete chat +- **Files:** `usecase/*`, `adapter/di/container.py` +- **Acceptance:** delete chat конфликтует при active sandbox; history path создается через storage adapter + +### S05. 
Chat file usecases + quota +- **Рекомендуемый исполнитель:** `junior+opus` +- **Commit:** `add chat file usecases` +- **Scope:** upload/list/metadata/delete/clear files; current usage and soft quota reject on write +- **Files:** `usecase/*`, `adapter/di/container.py` +- **Acceptance:** file write-path reject при quota overflow; usage включает `history.md` и chat files; chat metadata create не блокируется quota check + +### S06. HTTP API for chat and files +- **Рекомендуемый исполнитель:** `junior+opus` +- **Commit:** `add chat and file http api` +- **Scope:** versioned routes, schemas, error mapping, file download endpoint +- **Files:** `adapter/http/fastapi/*` +- **Acceptance:** API тонкий; FastAPI не протекает во внутренние слои + +### S07. Tests for storage foundation +- **Рекомендуемый исполнитель:** `test-engineer` +- **Commit:** `add storage foundation tests` +- **Scope:** unit + adapter + HTTP tests для workspace/chat/files/quota +- **Files:** `test/*` +- **Acceptance:** есть regression coverage для CRUD, history creation, quota reject, delete conflict, file download + +### S08. 
Boundary review +- **Рекомендуемый исполнитель:** `code-reviewer` +- **Commit:** no +- **Scope:** final review storage foundation slice +- **Acceptance:** clean architecture соблюдена, нет must-fix замечаний + +## Definition of done + +Спринт считается завершенным, когда: +- можно создать chat для `user_id` +- для chat создается directory и `history.md` +- можно загрузить и удалить chat files +- можно получить metadata и скачать файл через master +- quota check блокирует quota-relevant file write operations +- все тесты проходят +- boundary review не содержит must-fix замечаний + +## Риски + +- in-memory metadata не переживает restart +- download endpoint может потребовать уточнения streaming contract +- quota semantics нужно держать простыми, чтобы не породить скрытый retention scope + +## Что идет следующим спринтом + +Сразу после этого спринта нужен bridge-спринт для sandbox/storage integration: +- metadata-backed chat path для sandbox flow +- wiring текущего `CreateSandbox` поверх first-class chat metadata +- review, что sandbox lifecycle не расходится с новым storage slice + +И только потом логично брать artifact slice: +- artifact domain + metadata +- object storage adapter +- artifact upload/list/status +- delivery ack flow From 5381c997e2fc1307c5e8de5ce3ee9b7f63b00b94 Mon Sep 17 00:00:00 2001 From: Azamat Date: Tue, 7 Apr 2026 19:31:50 +0300 Subject: [PATCH 29/30] add storage foundation contracts --- docs/009-storage-foundation.md | 17 ++++++++ docs/010-chat-history-policy.md | 17 ++++++++ domain/chat.py | 35 +++++++++++++++ domain/error.py | 45 +++++++++++++++++++ domain/workspace.py | 17 ++++++++ usecase/interface.py | 77 +++++++++++++++++++++++++++++++++ 6 files changed, 208 insertions(+) create mode 100644 docs/009-storage-foundation.md create mode 100644 docs/010-chat-history-policy.md create mode 100644 domain/chat.py create mode 100644 domain/workspace.py diff --git a/docs/009-storage-foundation.md b/docs/009-storage-foundation.md new file mode 
100644 index 0000000..4aadf58 --- /dev/null +++ b/docs/009-storage-foundation.md @@ -0,0 +1,17 @@ +# 009 Storage foundation + +## Context +- v1 storage slice needs workspace, chat and file flows before durable DB +- trusted caller passes `user_id`, and one workspace belongs to one user +- chat content must live outside sandbox lifecycle and survive sandbox restart + +## Decision +- metadata repositories are in-memory for the first storage slice +- `Workspace`, `Chat` and `ChatFile` are first-class domain entities +- filesystem access stays behind storage ports in outer layers +- sandbox later integrates through chat metadata and storage ports, not raw path math in usecases + +## Consequences +- metadata is lost on restart in this phase +- storage usecases and HTTP API can be built before durable persistence +- later durable metadata can replace in-memory adapters behind the same ports diff --git a/docs/010-chat-history-policy.md b/docs/010-chat-history-policy.md new file mode 100644 index 0000000..39c2125 --- /dev/null +++ b/docs/010-chat-history-policy.md @@ -0,0 +1,17 @@ +# 010 Chat history policy + +## Context +- v1 keeps chat history in filesystem, not in central DB +- chat metadata must not depend on parsing history content +- each chat already maps to an isolated working directory + +## Decision +- each chat owns one `history.md` inside its chat directory +- `history.md` is created with chat layout initialization +- chat metadata stores identity and lifecycle fields separately from history content +- history read and write stay behind storage ports in outer layers + +## Consequences +- history survives sandbox restart with chat storage +- metadata and content evolve independently +- later migration to another history backend can keep the same chat identity model diff --git a/domain/chat.py b/domain/chat.py new file mode 100644 index 0000000..3dbd7cb --- /dev/null +++ b/domain/chat.py @@ -0,0 +1,35 @@ +from dataclasses import dataclass +from datetime import 
datetime +from uuid import UUID + +HISTORY_FILE_NAME = 'history.md' + + +@dataclass(frozen=True, slots=True) +class ChatAttachmentName: + value: str + + def __post_init__(self) -> None: + if not self.value or self.value in {'.', '..'}: + raise ValueError('invalid attachment name') + if '/' in self.value or '\\' in self.value: + raise ValueError('invalid attachment name') + if self.value == HISTORY_FILE_NAME: + raise ValueError('reserved attachment name') + + +@dataclass(frozen=True, slots=True) +class Chat: + chat_id: UUID + workspace_id: UUID + created_at: datetime + + +@dataclass(frozen=True, slots=True) +class ChatFile: + file_id: UUID + chat_id: UUID + name: ChatAttachmentName + content_type: str | None + size_bytes: int + created_at: datetime diff --git a/domain/error.py b/domain/error.py index f691113..ff3486e 100644 --- a/domain/error.py +++ b/domain/error.py @@ -1,3 +1,6 @@ +from uuid import UUID + + class DomainError(Exception): pass @@ -18,6 +21,48 @@ class UserConflictError(UserError): self.email = email +class WorkspaceError(DomainError): + pass + + +class WorkspaceNotFoundError(WorkspaceError): + def __init__(self, workspace_id: UUID) -> None: + super().__init__('workspace_not_found') + self.workspace_id = workspace_id + + +class WorkspaceQuotaExceededError(WorkspaceError): + def __init__(self, workspace_id: UUID) -> None: + super().__init__('workspace_quota_exceeded') + self.workspace_id = workspace_id + + +class ChatError(DomainError): + pass + + +class ChatNotFoundError(ChatError): + def __init__(self, chat_id: UUID) -> None: + super().__init__('chat_not_found') + self.chat_id = chat_id + + +class ChatHasActiveSandboxError(ChatError): + def __init__(self, chat_id: UUID) -> None: + super().__init__('chat_has_active_sandbox') + self.chat_id = chat_id + + +class ChatFileError(DomainError): + pass + + +class ChatFileNotFoundError(ChatFileError): + def __init__(self, file_id: UUID) -> None: + super().__init__('chat_file_not_found') + self.file_id = 
file_id + + class SandboxError(DomainError): pass diff --git a/domain/workspace.py b/domain/workspace.py new file mode 100644 index 0000000..3526203 --- /dev/null +++ b/domain/workspace.py @@ -0,0 +1,17 @@ +from dataclasses import dataclass +from datetime import datetime +from uuid import UUID + + +@dataclass(frozen=True, slots=True) +class Workspace: + workspace_id: UUID + user_id: UUID + created_at: datetime + + +@dataclass(frozen=True, slots=True) +class WorkspaceUsage: + workspace_id: UUID + used_bytes: int + quota_bytes: int diff --git a/usecase/interface.py b/usecase/interface.py index 69876e6..de681d6 100644 --- a/usecase/interface.py +++ b/usecase/interface.py @@ -4,8 +4,10 @@ from types import TracebackType from typing import Protocol, TypeAlias from uuid import UUID +from domain.chat import Chat, ChatAttachmentName, ChatFile from domain.sandbox import SandboxSession from domain.user import User +from domain.workspace import Workspace, WorkspaceUsage AttrValue: TypeAlias = str | int | float | bool Attrs: TypeAlias = Mapping[str, AttrValue] @@ -19,6 +21,81 @@ class UserRepository(Protocol): def save(self, user: User) -> None: ... +class WorkspaceRepository(Protocol): + def get(self, workspace_id: UUID) -> Workspace | None: ... + + def get_by_user_id(self, user_id: UUID) -> Workspace | None: ... + + def save(self, workspace: Workspace) -> None: ... + + +class ChatRepository(Protocol): + def get(self, chat_id: UUID) -> Chat | None: ... + + def list_by_workspace_id(self, workspace_id: UUID) -> list[Chat]: ... + + def save(self, chat: Chat) -> None: ... + + def delete(self, chat_id: UUID) -> None: ... + + +class ChatFileRepository(Protocol): + def get(self, file_id: UUID) -> ChatFile | None: ... + + def get_by_chat_id_and_name( + self, + chat_id: UUID, + name: ChatAttachmentName, + ) -> ChatFile | None: ... + + def list_by_chat_id(self, chat_id: UUID) -> list[ChatFile]: ... + + def save(self, chat_file: ChatFile) -> None: ... 
+ + def delete(self, file_id: UUID) -> None: ... + + def delete_by_chat_id(self, chat_id: UUID) -> None: ... + + +class ChatStorage(Protocol): + def ensure_chat(self, chat: Chat) -> None: ... + + def read_history(self, chat: Chat) -> str: ... + + def write_history(self, chat: Chat, content: str) -> None: ... + + def delete_chat(self, chat: Chat) -> None: ... + + def write_attachment( + self, + chat: Chat, + file_name: ChatAttachmentName, + content: bytes, + ) -> int: ... + + def read_attachment(self, chat: Chat, file_name: ChatAttachmentName) -> bytes: ... + + def delete_attachment( + self, + chat: Chat, + file_name: ChatAttachmentName, + ) -> None: ... + + def clear_attachments(self, chat: Chat) -> None: ... + + +class StorageUsageReader(Protocol): + def get_workspace_usage( + self, + workspace: Workspace, + chats: list[Chat], + ) -> WorkspaceUsage: ... + + +class IdGenerator(Protocol): + def new(self) -> UUID: ... + + class SandboxSessionRepository(Protocol): def get_active_by_chat_id(self, chat_id: UUID) -> SandboxSession | None: ... 
From 6fe484c44c28c5d5ef050d32a47ca9fed33b2ab9 Mon Sep 17 00:00:00 2001 From: David Shvarts Date: Tue, 7 Apr 2026 20:37:00 +0300 Subject: [PATCH 30/30] ref #13: in-memory metadata repositories (S02) --- repository/chat.py | 24 ++++ repository/chat_file.py | 57 +++++++++ repository/workspace.py | 26 +++++ test/test_storage_metadata_repositories.py | 129 +++++++++++++++++++++ 4 files changed, 236 insertions(+) create mode 100644 repository/chat.py create mode 100644 repository/chat_file.py create mode 100644 repository/workspace.py create mode 100644 test/test_storage_metadata_repositories.py diff --git a/repository/chat.py b/repository/chat.py new file mode 100644 index 0000000..1d85cc7 --- /dev/null +++ b/repository/chat.py @@ -0,0 +1,24 @@ +from uuid import UUID + +from domain.chat import Chat +from usecase.interface import ChatRepository + + +class InMemoryChatRepository(ChatRepository): + def __init__(self) -> None: + self._by_id: dict[UUID, Chat] = {} + + def get(self, chat_id: UUID) -> Chat | None: + return self._by_id.get(chat_id) + + def list_by_workspace_id(self, workspace_id: UUID) -> list[Chat]: + return sorted( + (c for c in self._by_id.values() if c.workspace_id == workspace_id), + key=lambda c: (c.created_at, c.chat_id), + ) + + def save(self, chat: Chat) -> None: + self._by_id[chat.chat_id] = chat + + def delete(self, chat_id: UUID) -> None: + self._by_id.pop(chat_id, None) diff --git a/repository/chat_file.py b/repository/chat_file.py new file mode 100644 index 0000000..ca18ce9 --- /dev/null +++ b/repository/chat_file.py @@ -0,0 +1,57 @@ +from uuid import UUID + +from domain.chat import ChatAttachmentName, ChatFile +from usecase.interface import ChatFileRepository + + +class InMemoryChatFileRepository(ChatFileRepository): + def __init__(self) -> None: + self._by_id: dict[UUID, ChatFile] = {} + self._by_chat_and_name: dict[tuple[UUID, str], UUID] = {} + + def get(self, file_id: UUID) -> ChatFile | None: + return self._by_id.get(file_id) + + def 
get_by_chat_id_and_name( + self, + chat_id: UUID, + name: ChatAttachmentName, + ) -> ChatFile | None: + fid = self._by_chat_and_name.get((chat_id, name.value)) + if fid is None: + return None + return self._by_id.get(fid) + + def list_by_chat_id(self, chat_id: UUID) -> list[ChatFile]: + return sorted( + (f for f in self._by_id.values() if f.chat_id == chat_id), + key=lambda f: (f.created_at, f.file_id), + ) + + def save(self, chat_file: ChatFile) -> None: + key = (chat_file.chat_id, chat_file.name.value) + existing_at_key = self._by_chat_and_name.get(key) + if existing_at_key is not None and existing_at_key != chat_file.file_id: + self._by_id.pop(existing_at_key, None) + + previous = self._by_id.get(chat_file.file_id) + if previous is not None: + prev_key = (previous.chat_id, previous.name.value) + if self._by_chat_and_name.get(prev_key) == previous.file_id: + del self._by_chat_and_name[prev_key] + + self._by_id[chat_file.file_id] = chat_file + self._by_chat_and_name[key] = chat_file.file_id + + def delete(self, file_id: UUID) -> None: + chat_file = self._by_id.pop(file_id, None) + if chat_file is None: + return + key = (chat_file.chat_id, chat_file.name.value) + if self._by_chat_and_name.get(key) == file_id: + del self._by_chat_and_name[key] + + def delete_by_chat_id(self, chat_id: UUID) -> None: + file_ids = [f.file_id for f in self._by_id.values() if f.chat_id == chat_id] + for fid in file_ids: + self.delete(fid) diff --git a/repository/workspace.py b/repository/workspace.py new file mode 100644 index 0000000..4aa5546 --- /dev/null +++ b/repository/workspace.py @@ -0,0 +1,26 @@ +from uuid import UUID + +from domain.workspace import Workspace +from usecase.interface import WorkspaceRepository + + +class InMemoryWorkspaceRepository(WorkspaceRepository): + def __init__(self) -> None: + self._by_id: dict[UUID, Workspace] = {} + self._user_id_to_workspace_id: dict[UUID, UUID] = {} + + def get(self, workspace_id: UUID) -> Workspace | None: + return 
self._by_id.get(workspace_id) + + def get_by_user_id(self, user_id: UUID) -> Workspace | None: + wid = self._user_id_to_workspace_id.get(user_id) + if wid is None: + return None + return self._by_id.get(wid) + + def save(self, workspace: Workspace) -> None: + existing_wid = self._user_id_to_workspace_id.get(workspace.user_id) + if existing_wid is not None and existing_wid != workspace.workspace_id: + self._by_id.pop(existing_wid, None) + self._by_id[workspace.workspace_id] = workspace + self._user_id_to_workspace_id[workspace.user_id] = workspace.workspace_id diff --git a/test/test_storage_metadata_repositories.py b/test/test_storage_metadata_repositories.py new file mode 100644 index 0000000..b03d56e --- /dev/null +++ b/test/test_storage_metadata_repositories.py @@ -0,0 +1,129 @@ +from datetime import UTC, datetime +from uuid import UUID + +from domain.chat import Chat, ChatAttachmentName, ChatFile +from domain.workspace import Workspace +from repository.chat import InMemoryChatRepository +from repository.chat_file import InMemoryChatFileRepository +from repository.workspace import InMemoryWorkspaceRepository + +USER_A = UUID('aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa') +USER_B = UUID('bbbbbbbb-bbbb-bbbb-bbbb-bbbbbbbbbbbb') +WS_A = UUID('11111111-1111-1111-1111-111111111111') +WS_B = UUID('22222222-2222-2222-2222-222222222222') +CHAT_A = UUID('33333333-3333-3333-3333-333333333333') +CHAT_B = UUID('44444444-4444-4444-4444-444444444444') +FILE_A = UUID('55555555-5555-5555-5555-555555555555') +FILE_B = UUID('66666666-6666-6666-6666-666666666666') +TS = datetime(2026, 4, 1, 12, 0, 0, tzinfo=UTC) +TS_2 = datetime(2026, 4, 1, 13, 0, 0, tzinfo=UTC) + + +def test_workspace_get_by_user_id() -> None: + repo = InMemoryWorkspaceRepository() + ws = Workspace(workspace_id=WS_A, user_id=USER_A, created_at=TS) + repo.save(ws) + assert repo.get(WS_A) == ws + assert repo.get_by_user_id(USER_A) == ws + assert repo.get_by_user_id(USER_B) is None + + +def test_workspace_replace_for_user() 
-> None: + repo = InMemoryWorkspaceRepository() + ws = Workspace(workspace_id=WS_A, user_id=USER_A, created_at=TS) + repo.save(ws) + new_ws = Workspace(workspace_id=WS_B, user_id=USER_A, created_at=TS_2) + repo.save(new_ws) + assert repo.get_by_user_id(USER_A) == new_ws + assert repo.get(WS_A) is None + assert repo.get(WS_B) == new_ws + + +def test_chat_crud_workspace_scope() -> None: + chat_repo = InMemoryChatRepository() + chat_a = Chat(chat_id=CHAT_A, workspace_id=WS_A, created_at=TS) + chat_b = Chat(chat_id=CHAT_B, workspace_id=WS_A, created_at=TS_2) + chat_repo.save(chat_a) + chat_repo.save(chat_b) + + listed = chat_repo.list_by_workspace_id(WS_A) + assert listed == [chat_a, chat_b] + + assert chat_repo.get(CHAT_A) == chat_a + chat_repo.delete(CHAT_A) + assert chat_repo.get(CHAT_A) is None + assert chat_repo.list_by_workspace_id(WS_A) == [chat_b] + + +def test_chat_list_only_same_workspace() -> None: + chat_repo = InMemoryChatRepository() + chat_a = Chat(chat_id=CHAT_A, workspace_id=WS_A, created_at=TS) + chat_b = Chat(chat_id=CHAT_B, workspace_id=WS_B, created_at=TS_2) + chat_repo.save(chat_a) + chat_repo.save(chat_b) + assert chat_repo.list_by_workspace_id(WS_A) == [chat_a] + assert chat_repo.list_by_workspace_id(WS_B) == [chat_b] + + +def test_chat_file_metadata_save_get_list_delete_clear() -> None: + name_a = ChatAttachmentName('doc.pdf') + name_b = ChatAttachmentName('x.png') + + repo = InMemoryChatFileRepository() + f_a = ChatFile( + file_id=FILE_A, + chat_id=CHAT_A, + name=name_a, + content_type='application/pdf', + size_bytes=100, + created_at=TS, + ) + f_b = ChatFile( + file_id=FILE_B, + chat_id=CHAT_A, + name=name_b, + content_type='image/png', + size_bytes=200, + created_at=TS_2, + ) + repo.save(f_a) + repo.save(f_b) + + assert repo.get(FILE_A) == f_a + assert repo.get_by_chat_id_and_name(CHAT_A, name_a) == f_a + listed = repo.list_by_chat_id(CHAT_A) + assert listed == [f_a, f_b] + + repo.delete(FILE_A) + assert repo.get(FILE_A) is None + assert 
repo.get_by_chat_id_and_name(CHAT_A, name_a) is None + + repo.save(f_a) + repo.delete_by_chat_id(CHAT_A) + assert repo.list_by_chat_id(CHAT_A) == [] + + +def test_chat_file_same_name_replaced_by_new_id() -> None: + name = ChatAttachmentName('a.txt') + repo = InMemoryChatFileRepository() + first = ChatFile( + file_id=FILE_A, + chat_id=CHAT_A, + name=name, + content_type='text/plain', + size_bytes=1, + created_at=TS, + ) + second = ChatFile( + file_id=FILE_B, + chat_id=CHAT_A, + name=name, + content_type='text/plain', + size_bytes=2, + created_at=TS_2, + ) + repo.save(first) + repo.save(second) + assert repo.get(FILE_A) is None + assert repo.get(FILE_B) == second + assert repo.get_by_chat_id_and_name(CHAT_A, name) == second