Compare commits
114 Commits
main
...
22ecb5e989
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
22ecb5e989 | ||
|
|
2ab69f8561 | ||
|
|
95342cc94d | ||
|
|
f6194b3e19 | ||
|
|
6bb376a336 | ||
|
|
cd7a9ccbdf | ||
|
|
953af52d0e | ||
|
|
e6e98d4ed1 | ||
|
|
ca5f5e3383 | ||
|
|
d0fc7f37ff | ||
|
|
18d717e996 | ||
|
|
f482559e15 | ||
|
|
6e8b0b022a | ||
|
|
746fb7b181 | ||
|
|
caf283bed2 | ||
|
|
520c06175e | ||
|
|
065e43c5a9 | ||
|
|
c8b88dadc3 | ||
|
|
015f2de6c6 | ||
|
|
f36bfb3781 | ||
|
|
ef659cd72d | ||
|
|
728edd1453 | ||
|
|
498c0a0e94 | ||
|
|
e5975fa5d0 | ||
|
|
731a188a76 | ||
|
|
fe2104822e | ||
|
|
664415111a | ||
|
|
acd18ff694 | ||
|
|
da5affd613 | ||
|
|
a79d923dc1 | ||
|
|
c72f6aa2f9 | ||
|
|
4f24cebf11 | ||
|
|
e0739a786c | ||
|
|
64576da7dc | ||
|
|
4a55bd63a3 | ||
|
|
a78b903f5a | ||
|
|
c7b2c82700 | ||
|
|
50b865b23b | ||
|
|
6f5dd58b54 | ||
|
|
0ceee8545e | ||
|
|
62aea06e0d | ||
|
|
24f1cc637e | ||
|
|
8b6cca5d4d | ||
|
|
c9700f760e | ||
|
|
6f509e71ce | ||
|
|
f5a86953c6 | ||
|
|
246d2a6752 | ||
|
|
36ab7069cf | ||
|
|
a4c91cb8c3 | ||
|
|
a7ba0f9bd8 | ||
|
|
f3fb4ecbeb | ||
|
|
5c35702caf | ||
|
|
7280b182bd | ||
|
|
06b2491c1f | ||
|
|
b8265783f3 | ||
|
|
63066c50ba | ||
|
|
ddf9b5fe25 | ||
|
|
c3b66cccfc | ||
|
|
896f0d92e5 | ||
|
|
2ccaeb23f2 | ||
|
|
04c939d4c2 | ||
|
|
71c94c3b5a | ||
|
|
d71891ac4e | ||
|
|
3492941aec | ||
|
|
81e8d7e73d | ||
|
|
f0b04d53af | ||
|
|
35af7daf90 | ||
|
|
5fab15a11e | ||
|
|
ab913575e1 | ||
|
|
82cb6386a6 | ||
|
|
2d05035c1d | ||
|
|
15d747eb28 | ||
|
|
3d6fa6b791 | ||
|
|
3ea1874638 | ||
|
|
e1657d5ad8 | ||
|
|
83fa51fd4a | ||
|
|
db868c53c6 | ||
|
|
68f1865a1e | ||
|
|
5b1e2852ea | ||
|
|
d0a88d1fd1 | ||
|
|
e85788f79f | ||
|
|
25d42ee2a6 | ||
|
|
e41ceafaef | ||
|
|
43fa69db7d | ||
|
|
29309e5cfd | ||
|
|
cea97afe25 | ||
|
|
b43fa8ace2 | ||
|
|
742ce4c9c8 | ||
|
|
6ea9edf3d1 | ||
|
|
25b8f1723e | ||
|
|
73d10f364c | ||
|
|
2310c8cdfd | ||
|
|
2f7124959d | ||
|
|
2104ae38ec | ||
|
|
2055320058 | ||
|
|
11da0d57a8 | ||
|
|
acfda1e9a9 | ||
|
|
3c24a8c522 | ||
|
|
ec111f9ce6 | ||
|
|
520a4d60fb | ||
|
|
6e645835dc | ||
|
|
fcda8f0f96 | ||
|
|
d6db6af964 | ||
|
|
88cf4e0abc | ||
|
|
f138417486 | ||
|
|
de47d9ee43 | ||
|
|
406b25cda0 | ||
|
|
bd702734c2 | ||
|
|
5594655fba | ||
|
|
ebd307cab4 | ||
|
|
6e3cdebbfb | ||
|
|
a6a336b66e | ||
|
|
9901dc7f51 | ||
|
|
ac64d9505e |
@@ -1,15 +1,22 @@
|
|||||||
# Common settings
|
# Common settings
|
||||||
PROJECT_NAME=App
|
PROJECT_NAME=Syndarix
|
||||||
VERSION=1.0.0
|
VERSION=1.0.0
|
||||||
|
|
||||||
# Database settings
|
# Database settings
|
||||||
POSTGRES_USER=postgres
|
POSTGRES_USER=postgres
|
||||||
POSTGRES_PASSWORD=postgres
|
POSTGRES_PASSWORD=postgres
|
||||||
POSTGRES_DB=app
|
POSTGRES_DB=syndarix
|
||||||
POSTGRES_HOST=db
|
POSTGRES_HOST=db
|
||||||
POSTGRES_PORT=5432
|
POSTGRES_PORT=5432
|
||||||
DATABASE_URL=postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}
|
DATABASE_URL=postgresql://${POSTGRES_USER}:${POSTGRES_PASSWORD}@${POSTGRES_HOST}:${POSTGRES_PORT}/${POSTGRES_DB}
|
||||||
|
|
||||||
|
# Redis settings (cache, pub/sub, Celery broker)
|
||||||
|
REDIS_URL=redis://redis:6379/0
|
||||||
|
|
||||||
|
# Celery settings (optional - defaults to REDIS_URL if not set)
|
||||||
|
# CELERY_BROKER_URL=redis://redis:6379/0
|
||||||
|
# CELERY_RESULT_BACKEND=redis://redis:6379/0
|
||||||
|
|
||||||
# Backend settings
|
# Backend settings
|
||||||
BACKEND_PORT=8000
|
BACKEND_PORT=8000
|
||||||
# CRITICAL: Generate a secure SECRET_KEY for production!
|
# CRITICAL: Generate a secure SECRET_KEY for production!
|
||||||
|
|||||||
460
.gitea/workflows/ci.yaml
Normal file
460
.gitea/workflows/ci.yaml
Normal file
@@ -0,0 +1,460 @@
|
|||||||
|
# Syndarix CI/CD Pipeline
|
||||||
|
# Gitea Actions workflow for continuous integration and deployment
|
||||||
|
#
|
||||||
|
# Pipeline Structure:
|
||||||
|
# - lint: Fast feedback (linting and type checking)
|
||||||
|
# - test: Run test suites (depends on lint)
|
||||||
|
# - build: Build Docker images (depends on test)
|
||||||
|
# - deploy: Deploy to production (depends on build, only on main)
|
||||||
|
|
||||||
|
name: CI/CD Pipeline
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
- dev
|
||||||
|
- 'feature/**'
|
||||||
|
pull_request:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
- dev
|
||||||
|
|
||||||
|
env:
|
||||||
|
PYTHON_VERSION: "3.12"
|
||||||
|
NODE_VERSION: "20"
|
||||||
|
UV_VERSION: "0.4.x"
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
# ===========================================================================
|
||||||
|
# LINT JOB - Fast feedback first
|
||||||
|
# ===========================================================================
|
||||||
|
lint:
|
||||||
|
name: Lint & Type Check
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
component: [backend, frontend]
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
# ----- Backend Linting -----
|
||||||
|
- name: Set up Python
|
||||||
|
if: matrix.component == 'backend'
|
||||||
|
uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: ${{ env.PYTHON_VERSION }}
|
||||||
|
|
||||||
|
- name: Install uv
|
||||||
|
if: matrix.component == 'backend'
|
||||||
|
uses: astral-sh/setup-uv@v4
|
||||||
|
with:
|
||||||
|
version: ${{ env.UV_VERSION }}
|
||||||
|
|
||||||
|
- name: Cache uv dependencies
|
||||||
|
if: matrix.component == 'backend'
|
||||||
|
uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
~/.cache/uv
|
||||||
|
backend/.venv
|
||||||
|
key: uv-${{ runner.os }}-${{ hashFiles('backend/uv.lock') }}
|
||||||
|
restore-keys: |
|
||||||
|
uv-${{ runner.os }}-
|
||||||
|
|
||||||
|
- name: Install backend dependencies
|
||||||
|
if: matrix.component == 'backend'
|
||||||
|
working-directory: backend
|
||||||
|
run: uv sync --extra dev --frozen
|
||||||
|
|
||||||
|
- name: Run ruff linting
|
||||||
|
if: matrix.component == 'backend'
|
||||||
|
working-directory: backend
|
||||||
|
run: uv run ruff check app
|
||||||
|
|
||||||
|
- name: Run ruff format check
|
||||||
|
if: matrix.component == 'backend'
|
||||||
|
working-directory: backend
|
||||||
|
run: uv run ruff format --check app
|
||||||
|
|
||||||
|
- name: Run mypy type checking
|
||||||
|
if: matrix.component == 'backend'
|
||||||
|
working-directory: backend
|
||||||
|
run: uv run mypy app
|
||||||
|
|
||||||
|
# ----- Frontend Linting -----
|
||||||
|
- name: Set up Node.js
|
||||||
|
if: matrix.component == 'frontend'
|
||||||
|
uses: actions/setup-node@v4
|
||||||
|
with:
|
||||||
|
node-version: ${{ env.NODE_VERSION }}
|
||||||
|
|
||||||
|
- name: Cache npm dependencies
|
||||||
|
if: matrix.component == 'frontend'
|
||||||
|
uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
~/.npm
|
||||||
|
frontend/node_modules
|
||||||
|
key: npm-${{ runner.os }}-${{ hashFiles('frontend/package-lock.json') }}
|
||||||
|
restore-keys: |
|
||||||
|
npm-${{ runner.os }}-
|
||||||
|
|
||||||
|
- name: Install frontend dependencies
|
||||||
|
if: matrix.component == 'frontend'
|
||||||
|
working-directory: frontend
|
||||||
|
run: npm ci
|
||||||
|
|
||||||
|
- name: Run ESLint
|
||||||
|
if: matrix.component == 'frontend'
|
||||||
|
working-directory: frontend
|
||||||
|
run: npm run lint
|
||||||
|
|
||||||
|
- name: Run TypeScript type check
|
||||||
|
if: matrix.component == 'frontend'
|
||||||
|
working-directory: frontend
|
||||||
|
run: npm run type-check
|
||||||
|
|
||||||
|
- name: Run Prettier format check
|
||||||
|
if: matrix.component == 'frontend'
|
||||||
|
working-directory: frontend
|
||||||
|
run: npm run format:check
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# TEST JOB - Run test suites
|
||||||
|
# ===========================================================================
|
||||||
|
test:
|
||||||
|
name: Test
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
needs: lint
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
component: [backend, frontend]
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
# ----- Backend Tests -----
|
||||||
|
- name: Set up Python
|
||||||
|
if: matrix.component == 'backend'
|
||||||
|
uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: ${{ env.PYTHON_VERSION }}
|
||||||
|
|
||||||
|
- name: Install uv
|
||||||
|
if: matrix.component == 'backend'
|
||||||
|
uses: astral-sh/setup-uv@v4
|
||||||
|
with:
|
||||||
|
version: ${{ env.UV_VERSION }}
|
||||||
|
|
||||||
|
- name: Cache uv dependencies
|
||||||
|
if: matrix.component == 'backend'
|
||||||
|
uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
~/.cache/uv
|
||||||
|
backend/.venv
|
||||||
|
key: uv-${{ runner.os }}-${{ hashFiles('backend/uv.lock') }}
|
||||||
|
restore-keys: |
|
||||||
|
uv-${{ runner.os }}-
|
||||||
|
|
||||||
|
- name: Install backend dependencies
|
||||||
|
if: matrix.component == 'backend'
|
||||||
|
working-directory: backend
|
||||||
|
run: uv sync --extra dev --frozen
|
||||||
|
|
||||||
|
- name: Run pytest with coverage
|
||||||
|
if: matrix.component == 'backend'
|
||||||
|
working-directory: backend
|
||||||
|
env:
|
||||||
|
IS_TEST: "True"
|
||||||
|
run: |
|
||||||
|
uv run pytest --cov=app --cov-report=xml --cov-report=term-missing --cov-fail-under=90
|
||||||
|
|
||||||
|
- name: Upload backend coverage report
|
||||||
|
if: matrix.component == 'backend'
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: backend-coverage
|
||||||
|
path: backend/coverage.xml
|
||||||
|
retention-days: 7
|
||||||
|
|
||||||
|
# ----- Frontend Tests -----
|
||||||
|
- name: Set up Node.js
|
||||||
|
if: matrix.component == 'frontend'
|
||||||
|
uses: actions/setup-node@v4
|
||||||
|
with:
|
||||||
|
node-version: ${{ env.NODE_VERSION }}
|
||||||
|
|
||||||
|
- name: Cache npm dependencies
|
||||||
|
if: matrix.component == 'frontend'
|
||||||
|
uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: |
|
||||||
|
~/.npm
|
||||||
|
frontend/node_modules
|
||||||
|
key: npm-${{ runner.os }}-${{ hashFiles('frontend/package-lock.json') }}
|
||||||
|
restore-keys: |
|
||||||
|
npm-${{ runner.os }}-
|
||||||
|
|
||||||
|
- name: Install frontend dependencies
|
||||||
|
if: matrix.component == 'frontend'
|
||||||
|
working-directory: frontend
|
||||||
|
run: npm ci
|
||||||
|
|
||||||
|
- name: Run Jest unit tests
|
||||||
|
if: matrix.component == 'frontend'
|
||||||
|
working-directory: frontend
|
||||||
|
run: npm test -- --coverage --passWithNoTests
|
||||||
|
|
||||||
|
- name: Upload frontend coverage report
|
||||||
|
if: matrix.component == 'frontend'
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
with:
|
||||||
|
name: frontend-coverage
|
||||||
|
path: frontend/coverage/
|
||||||
|
retention-days: 7
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# BUILD JOB - Build Docker images
|
||||||
|
# ===========================================================================
|
||||||
|
build:
|
||||||
|
name: Build
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
needs: test
|
||||||
|
strategy:
|
||||||
|
matrix:
|
||||||
|
component: [backend, frontend]
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
|
- name: Cache Docker layers
|
||||||
|
uses: actions/cache@v4
|
||||||
|
with:
|
||||||
|
path: /tmp/.buildx-cache
|
||||||
|
key: docker-${{ matrix.component }}-${{ github.sha }}
|
||||||
|
restore-keys: |
|
||||||
|
docker-${{ matrix.component }}-
|
||||||
|
|
||||||
|
- name: Build backend Docker image
|
||||||
|
if: matrix.component == 'backend'
|
||||||
|
uses: docker/build-push-action@v5
|
||||||
|
with:
|
||||||
|
context: ./backend
|
||||||
|
file: ./backend/Dockerfile
|
||||||
|
target: production
|
||||||
|
push: false
|
||||||
|
tags: syndarix-backend:${{ github.sha }}
|
||||||
|
cache-from: type=local,src=/tmp/.buildx-cache
|
||||||
|
cache-to: type=local,dest=/tmp/.buildx-cache-new,mode=max
|
||||||
|
|
||||||
|
- name: Build frontend Docker image
|
||||||
|
if: matrix.component == 'frontend'
|
||||||
|
uses: docker/build-push-action@v5
|
||||||
|
with:
|
||||||
|
context: ./frontend
|
||||||
|
file: ./frontend/Dockerfile
|
||||||
|
target: runner
|
||||||
|
push: false
|
||||||
|
tags: syndarix-frontend:${{ github.sha }}
|
||||||
|
build-args: |
|
||||||
|
NEXT_PUBLIC_API_URL=http://localhost:8000
|
||||||
|
cache-from: type=local,src=/tmp/.buildx-cache
|
||||||
|
cache-to: type=local,dest=/tmp/.buildx-cache-new,mode=max
|
||||||
|
|
||||||
|
# Prevent cache from growing indefinitely
|
||||||
|
- name: Move cache
|
||||||
|
run: |
|
||||||
|
rm -rf /tmp/.buildx-cache
|
||||||
|
mv /tmp/.buildx-cache-new /tmp/.buildx-cache
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# DEPLOY JOB - Deploy to production (only on main branch)
|
||||||
|
# ===========================================================================
|
||||||
|
deploy:
|
||||||
|
name: Deploy
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
needs: build
|
||||||
|
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
||||||
|
environment: production
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Deploy notification
|
||||||
|
run: |
|
||||||
|
echo "Deployment to production would happen here"
|
||||||
|
echo "Branch: ${{ github.ref }}"
|
||||||
|
echo "Commit: ${{ github.sha }}"
|
||||||
|
echo "Actor: ${{ github.actor }}"
|
||||||
|
|
||||||
|
# TODO: Add actual deployment steps when infrastructure is ready
|
||||||
|
# Options:
|
||||||
|
# - SSH to production server and run docker-compose pull && docker-compose up -d
|
||||||
|
# - Use Kubernetes deployment
|
||||||
|
# - Use cloud provider deployment (AWS ECS, GCP Cloud Run, etc.)
|
||||||
|
# - Trigger webhook to deployment orchestrator
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# SECURITY SCAN JOB - Run on main and dev branches
|
||||||
|
# ===========================================================================
|
||||||
|
security:
|
||||||
|
name: Security Scan
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
needs: lint
|
||||||
|
if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev'
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: ${{ env.PYTHON_VERSION }}
|
||||||
|
|
||||||
|
- name: Install uv
|
||||||
|
uses: astral-sh/setup-uv@v4
|
||||||
|
with:
|
||||||
|
version: ${{ env.UV_VERSION }}
|
||||||
|
|
||||||
|
- name: Install backend dependencies
|
||||||
|
working-directory: backend
|
||||||
|
run: uv sync --extra dev --frozen
|
||||||
|
|
||||||
|
- name: Run Bandit security scan (via ruff)
|
||||||
|
working-directory: backend
|
||||||
|
run: |
|
||||||
|
# Ruff includes flake8-bandit (S rules) for security scanning
|
||||||
|
# Run with explicit security rules only
|
||||||
|
uv run ruff check app --select=S --ignore=S101,S104,S105,S106,S603,S607
|
||||||
|
|
||||||
|
- name: Run pip-audit for dependency vulnerabilities
|
||||||
|
working-directory: backend
|
||||||
|
run: |
|
||||||
|
# pip-audit checks for known vulnerabilities in Python dependencies
|
||||||
|
uv run pip-audit --require-hashes --disable-pip -r <(uv pip compile pyproject.toml) || true
|
||||||
|
# Note: Using || true temporarily while setting up proper remediation
|
||||||
|
|
||||||
|
- name: Check for secrets in code
|
||||||
|
run: |
|
||||||
|
# Basic check for common secret patterns
|
||||||
|
# In production, use tools like gitleaks or trufflehog
|
||||||
|
echo "Checking for potential hardcoded secrets..."
|
||||||
|
! grep -rn --include="*.py" --include="*.ts" --include="*.tsx" --include="*.js" \
|
||||||
|
-E "(api_key|apikey|secret_key|secretkey|password|passwd|token)\s*=\s*['\"][^'\"]{8,}['\"]" \
|
||||||
|
backend/app frontend/src || echo "No obvious secrets found"
|
||||||
|
|
||||||
|
- name: Set up Node.js
|
||||||
|
uses: actions/setup-node@v4
|
||||||
|
with:
|
||||||
|
node-version: ${{ env.NODE_VERSION }}
|
||||||
|
|
||||||
|
- name: Install frontend dependencies
|
||||||
|
working-directory: frontend
|
||||||
|
run: npm ci
|
||||||
|
|
||||||
|
- name: Run npm audit
|
||||||
|
working-directory: frontend
|
||||||
|
run: |
|
||||||
|
npm audit --audit-level=high || true
|
||||||
|
# Note: Using || true to not fail on moderate vulnerabilities
|
||||||
|
# In production, consider stricter settings
|
||||||
|
|
||||||
|
# ===========================================================================
|
||||||
|
# E2E TEST JOB - Run end-to-end tests with Playwright
|
||||||
|
# ===========================================================================
|
||||||
|
e2e-tests:
|
||||||
|
name: E2E Tests
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
needs: [lint, test]
|
||||||
|
if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev' || github.event_name == 'pull_request'
|
||||||
|
services:
|
||||||
|
postgres:
|
||||||
|
image: pgvector/pgvector:pg17
|
||||||
|
env:
|
||||||
|
POSTGRES_USER: postgres
|
||||||
|
POSTGRES_PASSWORD: postgres
|
||||||
|
POSTGRES_DB: syndarix_test
|
||||||
|
ports:
|
||||||
|
- 5432:5432
|
||||||
|
options: >-
|
||||||
|
--health-cmd "pg_isready -U postgres"
|
||||||
|
--health-interval 10s
|
||||||
|
--health-timeout 5s
|
||||||
|
--health-retries 5
|
||||||
|
redis:
|
||||||
|
image: redis:7-alpine
|
||||||
|
ports:
|
||||||
|
- 6379:6379
|
||||||
|
options: >-
|
||||||
|
--health-cmd "redis-cli ping"
|
||||||
|
--health-interval 10s
|
||||||
|
--health-timeout 5s
|
||||||
|
--health-retries 5
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Set up Python
|
||||||
|
uses: actions/setup-python@v5
|
||||||
|
with:
|
||||||
|
python-version: ${{ env.PYTHON_VERSION }}
|
||||||
|
|
||||||
|
- name: Install uv
|
||||||
|
uses: astral-sh/setup-uv@v4
|
||||||
|
with:
|
||||||
|
version: ${{ env.UV_VERSION }}
|
||||||
|
|
||||||
|
- name: Install backend dependencies
|
||||||
|
working-directory: backend
|
||||||
|
run: uv sync --extra dev --frozen
|
||||||
|
|
||||||
|
- name: Set up Node.js
|
||||||
|
uses: actions/setup-node@v4
|
||||||
|
with:
|
||||||
|
node-version: ${{ env.NODE_VERSION }}
|
||||||
|
|
||||||
|
- name: Install frontend dependencies
|
||||||
|
working-directory: frontend
|
||||||
|
run: npm ci
|
||||||
|
|
||||||
|
- name: Install Playwright browsers
|
||||||
|
working-directory: frontend
|
||||||
|
run: npx playwright install --with-deps chromium
|
||||||
|
|
||||||
|
- name: Start backend server
|
||||||
|
working-directory: backend
|
||||||
|
env:
|
||||||
|
DATABASE_URL: postgresql://postgres:postgres@localhost:5432/syndarix_test
|
||||||
|
REDIS_URL: redis://localhost:6379/0
|
||||||
|
SECRET_KEY: test-secret-key-for-e2e-tests-only
|
||||||
|
ENVIRONMENT: test
|
||||||
|
IS_TEST: "True"
|
||||||
|
run: |
|
||||||
|
# Run migrations
|
||||||
|
uv run python -c "from app.database import create_tables; import asyncio; asyncio.run(create_tables())" || true
|
||||||
|
# Start backend in background
|
||||||
|
uv run uvicorn app.main:app --host 0.0.0.0 --port 8000 &
|
||||||
|
# Wait for backend to be ready
|
||||||
|
sleep 10
|
||||||
|
|
||||||
|
- name: Run Playwright E2E tests
|
||||||
|
working-directory: frontend
|
||||||
|
env:
|
||||||
|
NEXT_PUBLIC_API_URL: http://localhost:8000
|
||||||
|
run: |
|
||||||
|
npm run build
|
||||||
|
npm run test:e2e -- --project=chromium
|
||||||
|
|
||||||
|
- name: Upload Playwright report
|
||||||
|
uses: actions/upload-artifact@v4
|
||||||
|
if: always()
|
||||||
|
with:
|
||||||
|
name: playwright-report
|
||||||
|
path: frontend/playwright-report/
|
||||||
|
retention-days: 7
|
||||||
347
CLAUDE.md
347
CLAUDE.md
@@ -1,243 +1,204 @@
|
|||||||
# CLAUDE.md
|
# CLAUDE.md
|
||||||
|
|
||||||
Claude Code context for FastAPI + Next.js Full-Stack Template.
|
Claude Code context for **Syndarix** - AI-Powered Software Consulting Agency.
|
||||||
|
|
||||||
**See [AGENTS.md](./AGENTS.md) for project context, architecture, and development commands.**
|
**Built on PragmaStack.** See [AGENTS.md](./AGENTS.md) for base template context.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Syndarix Project Context
|
||||||
|
|
||||||
|
### Vision
|
||||||
|
|
||||||
|
Syndarix is an autonomous platform that orchestrates specialized AI agents to deliver complete software solutions with minimal human intervention. It acts as a virtual consulting agency with AI agents playing roles like Product Owner, Architect, Engineers, QA, etc.
|
||||||
|
|
||||||
|
### Repository
|
||||||
|
|
||||||
|
- **URL:** https://gitea.pragmazest.com/cardosofelipe/syndarix
|
||||||
|
- **Issue Tracker:** Gitea Issues (primary)
|
||||||
|
- **CI/CD:** Gitea Actions
|
||||||
|
|
||||||
|
### Core Concepts
|
||||||
|
|
||||||
|
**Agent Types & Instances:**
|
||||||
|
- Agent Type = Template (base model, failover, expertise, personality)
|
||||||
|
- Agent Instance = Spawned from type, assigned to project
|
||||||
|
- Multiple instances of same type can work together
|
||||||
|
|
||||||
|
**Project Workflow:**
|
||||||
|
1. Requirements discovery with Product Owner agent
|
||||||
|
2. Architecture spike (PO + BA + Architect brainstorm)
|
||||||
|
3. Implementation planning and backlog creation
|
||||||
|
4. Autonomous sprint execution with checkpoints
|
||||||
|
5. Demo and client feedback
|
||||||
|
|
||||||
|
**Autonomy Levels:**
|
||||||
|
- `FULL_CONTROL`: Approve every action
|
||||||
|
- `MILESTONE`: Approve sprint boundaries
|
||||||
|
- `AUTONOMOUS`: Only major decisions
|
||||||
|
|
||||||
|
**MCP-First Architecture:**
|
||||||
|
All integrations via Model Context Protocol servers with explicit scoping:
|
||||||
|
```python
|
||||||
|
# All tools take project_id for scoping
|
||||||
|
search_knowledge(project_id="proj-123", query="auth flow")
|
||||||
|
create_issue(project_id="proj-123", title="Add login")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Directory Structure
|
||||||
|
|
||||||
|
```
|
||||||
|
docs/
|
||||||
|
├── development/ # Workflow and coding standards
|
||||||
|
├── requirements/ # Requirements documents
|
||||||
|
├── architecture/ # Architecture documentation
|
||||||
|
├── adrs/ # Architecture Decision Records
|
||||||
|
└── spikes/ # Spike research documents
|
||||||
|
```
|
||||||
|
|
||||||
|
### Current Phase
|
||||||
|
|
||||||
|
**Backlog Population** - Creating detailed issues for Phase 0-1 implementation.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Development Standards
|
||||||
|
|
||||||
|
**CRITICAL: These rules are mandatory. See linked docs for full details.**
|
||||||
|
|
||||||
|
### Quick Reference
|
||||||
|
|
||||||
|
| Topic | Documentation |
|
||||||
|
|-------|---------------|
|
||||||
|
| **Workflow & Branching** | [docs/development/WORKFLOW.md](./docs/development/WORKFLOW.md) |
|
||||||
|
| **Coding Standards** | [docs/development/CODING_STANDARDS.md](./docs/development/CODING_STANDARDS.md) |
|
||||||
|
| **Design System** | [frontend/docs/design-system/](./frontend/docs/design-system/) |
|
||||||
|
| **Backend E2E Testing** | [backend/docs/E2E_TESTING.md](./backend/docs/E2E_TESTING.md) |
|
||||||
|
| **Demo Mode** | [frontend/docs/DEMO_MODE.md](./frontend/docs/DEMO_MODE.md) |
|
||||||
|
|
||||||
|
### Essential Rules Summary
|
||||||
|
|
||||||
|
1. **Issue-Driven Development**: Every piece of work MUST have an issue first
|
||||||
|
2. **Branch per Feature**: `feature/<issue-number>-<description>`, single branch for design+implementation
|
||||||
|
3. **Testing Required**: All code must be tested, aim for >90% coverage
|
||||||
|
4. **Code Review**: Must pass multi-agent review before merge
|
||||||
|
5. **No Direct Commits**: Never commit directly to `main` or `dev`
|
||||||
|
6. **Stack Verification**: ALWAYS run the full stack before considering work done (see below)
|
||||||
|
|
||||||
|
### CRITICAL: Stack Verification Before Merge
|
||||||
|
|
||||||
|
**This is NON-NEGOTIABLE. A feature with 100% test coverage that crashes on startup is WORTHLESS.**
|
||||||
|
|
||||||
|
Before considering ANY issue complete:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# 1. Start the dev stack
|
||||||
|
make dev
|
||||||
|
|
||||||
|
# 2. Wait for backend to be healthy, check logs
|
||||||
|
docker compose -f docker-compose.dev.yml logs backend --tail=100
|
||||||
|
|
||||||
|
# 3. Start frontend
|
||||||
|
cd frontend && npm run dev
|
||||||
|
|
||||||
|
# 4. Verify both are running without errors
|
||||||
|
```
|
||||||
|
|
||||||
|
**The issue is NOT done if:**
|
||||||
|
- Backend crashes on startup (import errors, missing dependencies)
|
||||||
|
- Frontend fails to compile or render
|
||||||
|
- Health checks fail
|
||||||
|
- Any error appears in logs
|
||||||
|
|
||||||
|
**Why this matters:**
|
||||||
|
- Tests run in isolation and may pass despite broken imports
|
||||||
|
- Docker builds cache layers and may hide dependency issues
|
||||||
|
- A single `ModuleNotFoundError` renders all test coverage meaningless
|
||||||
|
|
||||||
|
### Common Commands
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Backend
|
||||||
|
IS_TEST=True uv run pytest # Run tests
|
||||||
|
uv run ruff check src/ # Lint
|
||||||
|
uv run mypy src/ # Type check
|
||||||
|
python migrate.py auto "message" # Database migration
|
||||||
|
|
||||||
|
# Frontend
|
||||||
|
npm test # Unit tests
|
||||||
|
npm run lint # Lint
|
||||||
|
npm run type-check # Type check
|
||||||
|
npm run generate:api # Regenerate API client
|
||||||
|
```
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## Claude Code-Specific Guidance
|
## Claude Code-Specific Guidance
|
||||||
|
|
||||||
### Critical User Preferences
|
### Critical User Preferences
|
||||||
|
|
||||||
#### File Operations - NEVER Use Heredoc/Cat Append
|
**File Operations:**
|
||||||
**ALWAYS use Read/Write/Edit tools instead of `cat >> file << EOF` commands.**
|
- ALWAYS use Read/Write/Edit tools instead of `cat >> file << EOF`
|
||||||
|
- Never use heredoc - it triggers manual approval dialogs
|
||||||
|
|
||||||
This triggers manual approval dialogs and disrupts workflow.
|
**Work Style:**
|
||||||
|
|
||||||
```bash
|
|
||||||
# WRONG ❌
|
|
||||||
cat >> file.txt << EOF
|
|
||||||
content
|
|
||||||
EOF
|
|
||||||
|
|
||||||
# CORRECT ✅ - Use Read, then Write tools
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Work Style
|
|
||||||
- User prefers autonomous operation without frequent interruptions
|
- User prefers autonomous operation without frequent interruptions
|
||||||
- Ask for batch permissions upfront for long work sessions
|
- Ask for batch permissions upfront for long work sessions
|
||||||
- Work independently, document decisions clearly
|
- Work independently, document decisions clearly
|
||||||
- Only use emojis if the user explicitly requests it
|
- Only use emojis if the user explicitly requests it
|
||||||
|
|
||||||
### When Working with This Stack
|
### Critical Pattern: Auth Store DI
|
||||||
|
|
||||||
**Dependency Management:**
|
|
||||||
- Backend uses **uv** (modern Python package manager), not pip
|
|
||||||
- Always use `uv run` prefix: `IS_TEST=True uv run pytest`
|
|
||||||
- Or use Makefile commands: `make test`, `make install-dev`
|
|
||||||
- Add dependencies: `uv add <package>` or `uv add --dev <package>`
|
|
||||||
|
|
||||||
**Database Migrations:**
|
|
||||||
- Use the `migrate.py` helper script, not Alembic directly
|
|
||||||
- Generate + apply: `python migrate.py auto "message"`
|
|
||||||
- Never commit migrations without testing them first
|
|
||||||
- Check current state: `python migrate.py current`
|
|
||||||
|
|
||||||
**Frontend API Client Generation:**
|
|
||||||
- Run `npm run generate:api` after backend schema changes
|
|
||||||
- Client is auto-generated from OpenAPI spec
|
|
||||||
- Located in `frontend/src/lib/api/generated/`
|
|
||||||
- NEVER manually edit generated files
|
|
||||||
|
|
||||||
**Testing Commands:**
|
|
||||||
- Backend unit/integration: `IS_TEST=True uv run pytest` (always prefix with `IS_TEST=True`)
|
|
||||||
- Backend E2E (requires Docker): `make test-e2e`
|
|
||||||
- Frontend unit: `npm test`
|
|
||||||
- Frontend E2E: `npm run test:e2e`
|
|
||||||
- Use `make test` or `make test-cov` in backend for convenience
|
|
||||||
|
|
||||||
**Backend E2E Testing (requires Docker):**
|
|
||||||
- Install deps: `make install-e2e`
|
|
||||||
- Run all E2E tests: `make test-e2e`
|
|
||||||
- Run schema tests only: `make test-e2e-schema`
|
|
||||||
- Run all tests: `make test-all` (unit + E2E)
|
|
||||||
- Uses Testcontainers (real PostgreSQL) + Schemathesis (OpenAPI contract testing)
|
|
||||||
- Markers: `@pytest.mark.e2e`, `@pytest.mark.postgres`, `@pytest.mark.schemathesis`
|
|
||||||
- See: `backend/docs/E2E_TESTING.md` for complete guide
|
|
||||||
|
|
||||||
### 🔴 CRITICAL: Auth Store Dependency Injection Pattern
|
|
||||||
|
|
||||||
**ALWAYS use `useAuth()` from `AuthContext`, NEVER import `useAuthStore` directly!**
|
**ALWAYS use `useAuth()` from `AuthContext`, NEVER import `useAuthStore` directly!**
|
||||||
|
|
||||||
```typescript
|
```typescript
|
||||||
// ❌ WRONG - Bypasses dependency injection
|
// ❌ WRONG
|
||||||
import { useAuthStore } from '@/lib/stores/authStore';
|
import { useAuthStore } from '@/lib/stores/authStore';
|
||||||
const { user, isAuthenticated } = useAuthStore();
|
|
||||||
|
|
||||||
// ✅ CORRECT - Uses dependency injection
|
// ✅ CORRECT
|
||||||
import { useAuth } from '@/lib/auth/AuthContext';
|
import { useAuth } from '@/lib/auth/AuthContext';
|
||||||
const { user, isAuthenticated } = useAuth();
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Why This Matters:**
|
See [CODING_STANDARDS.md](./docs/development/CODING_STANDARDS.md#auth-store-dependency-injection) for details.
|
||||||
- E2E tests inject mock stores via `window.__TEST_AUTH_STORE__`
|
|
||||||
- Unit tests inject via `<AuthProvider store={mockStore}>`
|
|
||||||
- Direct `useAuthStore` imports bypass this injection → **tests fail**
|
|
||||||
- ESLint will catch violations (added Nov 2025)
|
|
||||||
|
|
||||||
**Exceptions:**
|
|
||||||
1. `AuthContext.tsx` - DI boundary, legitimately needs real store
|
|
||||||
2. `client.ts` - Non-React context, uses dynamic import + `__TEST_AUTH_STORE__` check
|
|
||||||
|
|
||||||
### E2E Test Best Practices
|
|
||||||
|
|
||||||
When writing or fixing Playwright tests:
|
|
||||||
|
|
||||||
**Navigation Pattern:**
|
|
||||||
```typescript
|
|
||||||
// ✅ CORRECT - Use Promise.all for Next.js Link clicks
|
|
||||||
await Promise.all([
|
|
||||||
page.waitForURL('/target', { timeout: 10000 }),
|
|
||||||
link.click()
|
|
||||||
]);
|
|
||||||
```
|
|
||||||
|
|
||||||
**Selectors:**
|
|
||||||
- Use ID-based selectors for validation errors: `#email-error`
|
|
||||||
- Error IDs use dashes not underscores: `#new-password-error`
|
|
||||||
- Target `.border-destructive[role="alert"]` to avoid Next.js route announcer conflicts
|
|
||||||
- Avoid generic `[role="alert"]` which matches multiple elements
|
|
||||||
|
|
||||||
**URL Assertions:**
|
|
||||||
```typescript
|
|
||||||
// ✅ Use regex to handle query params
|
|
||||||
await expect(page).toHaveURL(/\/auth\/login/);
|
|
||||||
|
|
||||||
// ❌ Don't use exact strings (fails with query params)
|
|
||||||
await expect(page).toHaveURL('/auth/login');
|
|
||||||
```
|
|
||||||
|
|
||||||
**Configuration:**
|
|
||||||
- Uses 12 workers in non-CI mode (`playwright.config.ts`)
|
|
||||||
- Reduces to 2 workers in CI for stability
|
|
||||||
- Tests are designed to be non-flaky with proper waits
|
|
||||||
|
|
||||||
### Important Implementation Details
|
|
||||||
|
|
||||||
**Authentication Testing:**
|
|
||||||
- Backend fixtures in `tests/conftest.py`:
|
|
||||||
- `async_test_db`: Fresh SQLite per test
|
|
||||||
- `async_test_user` / `async_test_superuser`: Pre-created users
|
|
||||||
- `user_token` / `superuser_token`: Access tokens for API calls
|
|
||||||
- Always use `@pytest.mark.asyncio` for async tests
|
|
||||||
- Use `@pytest_asyncio.fixture` for async fixtures
|
|
||||||
|
|
||||||
**Database Testing:**
|
|
||||||
```python
|
|
||||||
# Mock database exceptions correctly
|
|
||||||
from unittest.mock import patch, AsyncMock
|
|
||||||
|
|
||||||
async def mock_commit():
|
|
||||||
raise OperationalError("Connection lost", {}, Exception())
|
|
||||||
|
|
||||||
with patch.object(session, 'commit', side_effect=mock_commit):
|
|
||||||
with patch.object(session, 'rollback', new_callable=AsyncMock) as mock_rollback:
|
|
||||||
with pytest.raises(OperationalError):
|
|
||||||
await crud_method(session, obj_in=data)
|
|
||||||
mock_rollback.assert_called_once()
|
|
||||||
```
|
|
||||||
|
|
||||||
**Frontend Component Development:**
|
|
||||||
- Follow design system docs in `frontend/docs/design-system/`
|
|
||||||
- Read `08-ai-guidelines.md` for AI code generation rules
|
|
||||||
- Use parent-controlled spacing (see `04-spacing-philosophy.md`)
|
|
||||||
- WCAG AA compliance required (see `07-accessibility.md`)
|
|
||||||
|
|
||||||
**Security Considerations:**
|
|
||||||
- Backend has comprehensive security tests (JWT attacks, session hijacking)
|
|
||||||
- Never skip security headers in production
|
|
||||||
- Rate limiting is configured in route decorators: `@limiter.limit("10/minute")`
|
|
||||||
- Session revocation is database-backed, not just JWT expiry
|
|
||||||
|
|
||||||
### Common Workflows Guidance
|
|
||||||
|
|
||||||
**When Adding a New Feature:**
|
|
||||||
1. Start with backend schema and CRUD
|
|
||||||
2. Implement API route with proper authorization
|
|
||||||
3. Write backend tests (aim for >90% coverage)
|
|
||||||
4. Generate frontend API client: `npm run generate:api`
|
|
||||||
5. Implement frontend components
|
|
||||||
6. Write frontend unit tests
|
|
||||||
7. Add E2E tests for critical flows
|
|
||||||
8. Update relevant documentation
|
|
||||||
|
|
||||||
**When Fixing Tests:**
|
|
||||||
- Backend: Check test database isolation and async fixture usage
|
|
||||||
- Frontend unit: Verify mocking of `useAuth()` not `useAuthStore`
|
|
||||||
- E2E: Use `Promise.all()` pattern and regex URL assertions
|
|
||||||
|
|
||||||
**When Debugging:**
|
|
||||||
- Backend: Check `IS_TEST=True` environment variable is set
|
|
||||||
- Frontend: Run `npm run type-check` first
|
|
||||||
- E2E: Use `npm run test:e2e:debug` for step-by-step debugging
|
|
||||||
- Check logs: Backend has detailed error logging
|
|
||||||
|
|
||||||
**Demo Mode (Frontend-Only Showcase):**
|
|
||||||
- Enable: `echo "NEXT_PUBLIC_DEMO_MODE=true" > frontend/.env.local`
|
|
||||||
- Uses MSW (Mock Service Worker) to intercept API calls in browser
|
|
||||||
- Zero backend required - perfect for Vercel deployments
|
|
||||||
- **Fully Automated**: MSW handlers auto-generated from OpenAPI spec
|
|
||||||
- Run `npm run generate:api` → updates both API client AND MSW handlers
|
|
||||||
- No manual synchronization needed!
|
|
||||||
- Demo credentials (any password ≥8 chars works):
|
|
||||||
- User: `demo@example.com` / `DemoPass123`
|
|
||||||
- Admin: `admin@example.com` / `AdminPass123`
|
|
||||||
- **Safe**: MSW never runs during tests (Jest or Playwright)
|
|
||||||
- **Coverage**: Mock files excluded from linting and coverage
|
|
||||||
- **Documentation**: `frontend/docs/DEMO_MODE.md` for complete guide
|
|
||||||
|
|
||||||
### Tool Usage Preferences
|
### Tool Usage Preferences
|
||||||
|
|
||||||
**Prefer specialized tools over bash:**
|
**Prefer specialized tools over bash:**
|
||||||
- Use Read/Write/Edit tools for file operations
|
- Use Read/Write/Edit tools for file operations
|
||||||
- Never use `cat`, `echo >`, or heredoc for file manipulation
|
|
||||||
- Use Task tool with `subagent_type=Explore` for codebase exploration
|
- Use Task tool with `subagent_type=Explore` for codebase exploration
|
||||||
- Use Grep tool for code search, not bash `grep`
|
- Use Grep tool for code search, not bash `grep`
|
||||||
|
|
||||||
**When to use parallel tool calls:**
|
**Parallel tool calls for:**
|
||||||
- Independent git commands: `git status`, `git diff`, `git log`
|
- Independent git commands
|
||||||
- Reading multiple unrelated files
|
- Reading multiple unrelated files
|
||||||
- Running multiple test suites simultaneously
|
- Running multiple test suites
|
||||||
- Independent validation steps
|
- Independent validation steps
|
||||||
|
|
||||||
## Custom Skills
|
---
|
||||||
|
|
||||||
No Claude Code Skills installed yet. To create one, invoke the built-in "skill-creator" skill.
|
## Key Extensions (from PragmaStack base)
|
||||||
|
|
||||||
**Potential skill ideas for this project:**
|
- Celery + Redis for agent job queue
|
||||||
- API endpoint generator workflow (schema → CRUD → route → tests → frontend client)
|
- WebSocket/SSE for real-time updates
|
||||||
- Component generator with design system compliance
|
- pgvector for RAG knowledge base
|
||||||
- Database migration troubleshooting helper
|
- MCP server integration layer
|
||||||
- Test coverage analyzer and improvement suggester
|
|
||||||
- E2E test generator for new features
|
---
|
||||||
|
|
||||||
## Additional Resources
|
## Additional Resources
|
||||||
|
|
||||||
**Comprehensive Documentation:**
|
**Documentation:**
|
||||||
- [AGENTS.md](./AGENTS.md) - Framework-agnostic AI assistant context
|
- [AGENTS.md](./AGENTS.md) - Framework-agnostic AI assistant context
|
||||||
- [README.md](./README.md) - User-facing project overview
|
- [README.md](./README.md) - User-facing project overview
|
||||||
- `backend/docs/` - Backend architecture, coding standards, common pitfalls
|
- [docs/development/](./docs/development/) - Development workflow and standards
|
||||||
- `frontend/docs/design-system/` - Complete design system guide
|
- [backend/docs/](./backend/docs/) - Backend architecture and guides
|
||||||
|
- [frontend/docs/design-system/](./frontend/docs/design-system/) - Complete design system
|
||||||
|
|
||||||
**API Documentation (when running):**
|
**API Documentation (when running):**
|
||||||
- Swagger UI: http://localhost:8000/docs
|
- Swagger UI: http://localhost:8000/docs
|
||||||
- ReDoc: http://localhost:8000/redoc
|
- ReDoc: http://localhost:8000/redoc
|
||||||
- OpenAPI JSON: http://localhost:8000/api/v1/openapi.json
|
- OpenAPI JSON: http://localhost:8000/api/v1/openapi.json
|
||||||
|
|
||||||
**Testing Documentation:**
|
|
||||||
- Backend tests: `backend/tests/` (97% coverage)
|
|
||||||
- Frontend E2E: `frontend/e2e/README.md`
|
|
||||||
- Design system: `frontend/docs/design-system/08-ai-guidelines.md`
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
**For project architecture, development commands, and general context, see [AGENTS.md](./AGENTS.md).**
|
**For project architecture, development commands, and general context, see [AGENTS.md](./AGENTS.md).**
|
||||||
|
|||||||
724
README.md
724
README.md
@@ -1,659 +1,175 @@
|
|||||||
# <img src="frontend/public/logo.svg" alt="PragmaStack" width="32" height="32" style="vertical-align: middle" /> PragmaStack
|
# Syndarix
|
||||||
|
|
||||||
> **The Pragmatic Full-Stack Template. Production-ready, security-first, and opinionated.**
|
> **Your AI-Powered Software Consulting Agency**
|
||||||
|
>
|
||||||
|
> An autonomous platform that orchestrates specialized AI agents to deliver complete software solutions with minimal human intervention.
|
||||||
|
|
||||||
[](./backend/tests)
|
[](https://gitea.pragmazest.com/cardosofelipe/fast-next-template)
|
||||||
[](./frontend/tests)
|
|
||||||
[](./frontend/e2e)
|
|
||||||
[](./LICENSE)
|
[](./LICENSE)
|
||||||
[](./CONTRIBUTING.md)
|
|
||||||
|
|
||||||

|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Why PragmaStack?
|
## Vision
|
||||||
|
|
||||||
Building a modern full-stack application often leads to "analysis paralysis" or "boilerplate fatigue". You spend weeks setting up authentication, testing, and linting before writing a single line of business logic.
|
Syndarix transforms the software development lifecycle by providing a **virtual consulting team** of AI agents that collaboratively plan, design, implement, test, and deliver complete software solutions.
|
||||||
|
|
||||||
**PragmaStack cuts through the noise.**
|
**The Problem:** Even with AI coding assistants, developers spend as much time managing AI as doing the work themselves. Context switching, babysitting, and knowledge fragmentation limit productivity.
|
||||||
|
|
||||||
We provide a **pragmatic**, opinionated foundation that prioritizes:
|
**The Solution:** A structured, autonomous agency where specialized AI agents handle different roles (Product Owner, Architect, Engineers, QA, etc.) with proper workflows, reviews, and quality gates.
|
||||||
- **Speed**: Ship features, not config files.
|
|
||||||
- **Robustness**: Security and testing are not optional.
|
|
||||||
- **Clarity**: Code that is easy to read and maintain.
|
|
||||||
|
|
||||||
Whether you're building a SaaS, an internal tool, or a side project, PragmaStack gives you a solid starting point without the bloat.
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## ✨ Features
|
## Key Features
|
||||||
|
|
||||||
### 🔐 **Authentication & Security**
|
### Multi-Agent Orchestration
|
||||||
- JWT-based authentication with access + refresh tokens
|
- Configurable agent **types** with base model, failover, expertise, and personality
|
||||||
- **OAuth/Social Login** (Google, GitHub) with PKCE support
|
- Spawn multiple **instances** from the same type (e.g., Dave, Ellis, Kate as Software Developers)
|
||||||
- **OAuth 2.0 Authorization Server** (MCP-ready) for third-party integrations
|
- Agent-to-agent communication and collaboration
|
||||||
- Session management with device tracking and revocation
|
- Per-instance customization with domain-specific knowledge
|
||||||
- Password reset flow (email integration ready)
|
|
||||||
- Secure password hashing (bcrypt)
|
|
||||||
- CSRF protection, rate limiting, and security headers
|
|
||||||
- Comprehensive security tests (JWT algorithm attacks, session hijacking, privilege escalation)
|
|
||||||
|
|
||||||
### 🔌 **OAuth Provider Mode (MCP Integration)**
|
### Complete SDLC Support
|
||||||
Full OAuth 2.0 Authorization Server for Model Context Protocol (MCP) and third-party clients:
|
- **Requirements Discovery** → **Architecture Spike** → **Implementation Planning**
|
||||||
- **RFC 7636**: Authorization Code Flow with PKCE (S256 only)
|
- **Sprint Management** with automated ceremonies
|
||||||
- **RFC 8414**: Server metadata discovery at `/.well-known/oauth-authorization-server`
|
- **Issue Tracking** with Epic/Story/Task hierarchy
|
||||||
- **RFC 7662**: Token introspection endpoint
|
- **Git Integration** with proper branch/PR workflows
|
||||||
- **RFC 7009**: Token revocation endpoint
|
- **CI/CD Pipelines** with automated testing
|
||||||
- **JWT access tokens**: Self-contained, configurable lifetime
|
|
||||||
- **Opaque refresh tokens**: Secure rotation, database-backed revocation
|
|
||||||
- **Consent management**: Users can review and revoke app permissions
|
|
||||||
- **Client management**: Admin endpoints for registering OAuth clients
|
|
||||||
- **Scopes**: `openid`, `profile`, `email`, `read:users`, `write:users`, `admin`
|
|
||||||
|
|
||||||
### 👥 **Multi-Tenancy & Organizations**
|
### Configurable Autonomy
|
||||||
- Full organization system with role-based access control (Owner, Admin, Member)
|
- From `FULL_CONTROL` (approve everything) to `AUTONOMOUS` (only major milestones)
|
||||||
- Invite/remove members, manage permissions
|
- Client can intervene at any point
|
||||||
- Organization-scoped data access
|
- Transparent progress visibility
|
||||||
- User can belong to multiple organizations
|
|
||||||
|
|
||||||
### 🛠️ **Admin Panel**
|
### MCP-First Architecture
|
||||||
- Complete user management (CRUD, activate/deactivate, bulk operations)
|
- All integrations via **Model Context Protocol (MCP)** servers
|
||||||
- Organization management (create, edit, delete, member management)
|
- Unified Knowledge Base with project/agent scoping
|
||||||
- Session monitoring across all users
|
- Git providers (Gitea, GitHub, GitLab) via MCP
|
||||||
- Real-time statistics dashboard
|
- Extensible through custom MCP tools
|
||||||
- Admin-only routes with proper authorization
|
|
||||||
|
|
||||||
### 🎨 **Modern Frontend**
|
### Project Complexity Wizard
|
||||||
- Next.js 16 with App Router and React 19
|
- **Script** → Minimal process, no repo needed
|
||||||
- **PragmaStack Design System** built on shadcn/ui + TailwindCSS
|
- **Simple** → Single sprint, basic backlog
|
||||||
- Pre-configured theme with dark mode support (coming soon)
|
- **Medium/Complex** → Full AGILE workflow with multiple sprints
|
||||||
- Responsive, accessible components (WCAG AA compliant)
|
|
||||||
- Rich marketing landing page with animated components
|
|
||||||
- Live component showcase and documentation at `/dev`
|
|
||||||
|
|
||||||
### 🌍 **Internationalization (i18n)**
|
|
||||||
- Built-in multi-language support with next-intl v4
|
|
||||||
- Locale-based routing (`/en/*`, `/it/*`)
|
|
||||||
- Seamless language switching with LocaleSwitcher component
|
|
||||||
- SEO-friendly URLs and metadata per locale
|
|
||||||
- Translation files for English and Italian (easily extensible)
|
|
||||||
- Type-safe translations throughout the app
|
|
||||||
|
|
||||||
### 🎯 **Content & UX Features**
|
|
||||||
- **Toast notifications** with Sonner for elegant user feedback
|
|
||||||
- **Smooth animations** powered by Framer Motion
|
|
||||||
- **Markdown rendering** with syntax highlighting (GitHub Flavored Markdown)
|
|
||||||
- **Charts and visualizations** ready with Recharts
|
|
||||||
- **SEO optimization** with dynamic sitemap and robots.txt generation
|
|
||||||
- **Session tracking UI** with device information and revocation controls
|
|
||||||
|
|
||||||
### 🧪 **Comprehensive Testing**
|
|
||||||
- **Backend Testing**: ~97% unit test coverage
|
|
||||||
- Unit, integration, and security tests
|
|
||||||
- Async database testing with SQLAlchemy
|
|
||||||
- API endpoint testing with fixtures
|
|
||||||
- Security vulnerability tests (JWT attacks, session hijacking, privilege escalation)
|
|
||||||
- **Frontend Unit Tests**: ~97% coverage with Jest
|
|
||||||
- Component testing
|
|
||||||
- Hook testing
|
|
||||||
- Utility function testing
|
|
||||||
- **End-to-End Tests**: Playwright with zero flaky tests
|
|
||||||
- Complete user flows (auth, navigation, settings)
|
|
||||||
- Parallel execution for speed
|
|
||||||
- Visual regression testing ready
|
|
||||||
|
|
||||||
### 📚 **Developer Experience**
|
|
||||||
- Auto-generated TypeScript API client from OpenAPI spec
|
|
||||||
- Interactive API documentation (Swagger + ReDoc)
|
|
||||||
- Database migrations with Alembic helper script
|
|
||||||
- Hot reload in development for both frontend and backend
|
|
||||||
- Comprehensive code documentation and design system docs
|
|
||||||
- Live component playground at `/dev` with code examples
|
|
||||||
- Docker support for easy deployment
|
|
||||||
- VSCode workspace settings included
|
|
||||||
|
|
||||||
### 📊 **Ready for Production**
|
|
||||||
- Docker + docker-compose setup
|
|
||||||
- Environment-based configuration
|
|
||||||
- Database connection pooling
|
|
||||||
- Error handling and logging
|
|
||||||
- Health check endpoints
|
|
||||||
- Production security headers
|
|
||||||
- Rate limiting on sensitive endpoints
|
|
||||||
- SEO optimization with dynamic sitemaps and robots.txt
|
|
||||||
- Multi-language SEO with locale-specific metadata
|
|
||||||
- Performance monitoring and bundle analysis
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 📸 Screenshots
|
## Technology Stack
|
||||||
|
|
||||||
<details>
|
Built on [PragmaStack](https://gitea.pragmazest.com/cardosofelipe/fast-next-template):
|
||||||
<summary>Click to view screenshots</summary>
|
|
||||||
|
|
||||||
### Landing Page
|
| Component | Technology |
|
||||||

|
|-----------|------------|
|
||||||
|
| Backend | FastAPI 0.115+ (Python 3.11+) |
|
||||||
|
| Frontend | Next.js 16 (React 19) |
|
||||||
|
| Database | PostgreSQL 15+ with pgvector |
|
||||||
|
| ORM | SQLAlchemy 2.0 |
|
||||||
|
| State Management | Zustand + TanStack Query |
|
||||||
|
| UI | shadcn/ui + Tailwind 4 |
|
||||||
|
| Auth | JWT dual-token + OAuth 2.0 |
|
||||||
|
| Testing | pytest + Jest + Playwright |
|
||||||
|
|
||||||
|
### Syndarix Extensions
|
||||||
|
| Component | Technology |
|
||||||
### Authentication
|
|-----------|------------|
|
||||||

|
| Task Queue | Celery + Redis |
|
||||||
|
| Real-time | FastAPI WebSocket / SSE |
|
||||||
|
| Vector DB | pgvector (PostgreSQL extension) |
|
||||||
|
| MCP SDK | Anthropic MCP SDK |
|
||||||
### Admin Dashboard
|
|
||||||

|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
### Design System
|
|
||||||

|
|
||||||
|
|
||||||
</details>
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 🎭 Demo Mode
|
## Project Status
|
||||||
|
|
||||||
**Try the frontend without a backend!** Perfect for:
|
**Phase:** Architecture & Planning
|
||||||
- **Free deployment** on Vercel (no backend costs)
|
|
||||||
- **Portfolio showcasing** with live demos
|
See [docs/requirements/](./docs/requirements/) for the comprehensive requirements document.
|
||||||
- **Client presentations** without infrastructure setup
|
|
||||||
|
### Current Milestones
|
||||||
|
- [x] Fork PragmaStack as foundation
|
||||||
|
- [x] Create requirements document
|
||||||
|
- [ ] Execute architecture spikes
|
||||||
|
- [ ] Create ADRs for key decisions
|
||||||
|
- [ ] Begin MVP implementation
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Documentation
|
||||||
|
|
||||||
|
- [Requirements Document](./docs/requirements/SYNDARIX_REQUIREMENTS.md)
|
||||||
|
- [Architecture Decisions](./docs/adrs/) (coming soon)
|
||||||
|
- [Spike Research](./docs/spikes/) (coming soon)
|
||||||
|
- [Architecture Overview](./docs/architecture/) (coming soon)
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## Getting Started
|
||||||
|
|
||||||
|
### Prerequisites
|
||||||
|
- Docker & Docker Compose
|
||||||
|
- Node.js 20+
|
||||||
|
- Python 3.11+
|
||||||
|
- PostgreSQL 15+ (or use Docker)
|
||||||
|
|
||||||
### Quick Start
|
### Quick Start
|
||||||
|
|
||||||
```bash
|
|
||||||
cd frontend
|
|
||||||
echo "NEXT_PUBLIC_DEMO_MODE=true" > .env.local
|
|
||||||
npm run dev
|
|
||||||
```
|
|
||||||
|
|
||||||
**Demo Credentials:**
|
|
||||||
- Regular user: `demo@example.com` / `DemoPass123`
|
|
||||||
- Admin user: `admin@example.com` / `AdminPass123`
|
|
||||||
|
|
||||||
Demo mode uses [Mock Service Worker (MSW)](https://mswjs.io/) to intercept API calls in the browser. Your code remains unchanged - the same components work with both real and mocked backends.
|
|
||||||
|
|
||||||
**Key Features:**
|
|
||||||
- ✅ Zero backend required
|
|
||||||
- ✅ All features functional (auth, admin, stats)
|
|
||||||
- ✅ Realistic network delays and errors
|
|
||||||
- ✅ Does NOT interfere with tests (97%+ coverage maintained)
|
|
||||||
- ✅ One-line toggle: `NEXT_PUBLIC_DEMO_MODE=true`
|
|
||||||
|
|
||||||
📖 **[Complete Demo Mode Documentation](./frontend/docs/DEMO_MODE.md)**
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🚀 Tech Stack
|
|
||||||
|
|
||||||
### Backend
|
|
||||||
- **[FastAPI](https://fastapi.tiangolo.com/)** - Modern async Python web framework
|
|
||||||
- **[SQLAlchemy 2.0](https://www.sqlalchemy.org/)** - Powerful ORM with async support
|
|
||||||
- **[PostgreSQL](https://www.postgresql.org/)** - Robust relational database
|
|
||||||
- **[Alembic](https://alembic.sqlalchemy.org/)** - Database migrations
|
|
||||||
- **[Pydantic v2](https://docs.pydantic.dev/)** - Data validation with type hints
|
|
||||||
- **[pytest](https://pytest.org/)** - Testing framework with async support
|
|
||||||
|
|
||||||
### Frontend
|
|
||||||
- **[Next.js 16](https://nextjs.org/)** - React framework with App Router
|
|
||||||
- **[React 19](https://react.dev/)** - UI library
|
|
||||||
- **[TypeScript](https://www.typescriptlang.org/)** - Type-safe JavaScript
|
|
||||||
- **[TailwindCSS](https://tailwindcss.com/)** - Utility-first CSS framework
|
|
||||||
- **[shadcn/ui](https://ui.shadcn.com/)** - Beautiful, accessible component library
|
|
||||||
- **[next-intl](https://next-intl.dev/)** - Internationalization (i18n) with type safety
|
|
||||||
- **[TanStack Query](https://tanstack.com/query)** - Powerful data fetching/caching
|
|
||||||
- **[Zustand](https://zustand-demo.pmnd.rs/)** - Lightweight state management
|
|
||||||
- **[Framer Motion](https://www.framer.com/motion/)** - Production-ready animation library
|
|
||||||
- **[Sonner](https://sonner.emilkowal.ski/)** - Beautiful toast notifications
|
|
||||||
- **[Recharts](https://recharts.org/)** - Composable charting library
|
|
||||||
- **[React Markdown](https://github.com/remarkjs/react-markdown)** - Markdown rendering with GFM support
|
|
||||||
- **[Playwright](https://playwright.dev/)** - End-to-end testing
|
|
||||||
|
|
||||||
### DevOps
|
|
||||||
- **[Docker](https://www.docker.com/)** - Containerization
|
|
||||||
- **[docker-compose](https://docs.docker.com/compose/)** - Multi-container orchestration
|
|
||||||
- **GitHub Actions** (coming soon) - CI/CD pipelines
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📋 Prerequisites
|
|
||||||
|
|
||||||
- **Docker & Docker Compose** (recommended) - [Install Docker](https://docs.docker.com/get-docker/)
|
|
||||||
- **OR manually:**
|
|
||||||
- Python 3.12+
|
|
||||||
- Node.js 18+ (Node 20+ recommended)
|
|
||||||
- PostgreSQL 15+
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🏃 Quick Start (Docker)
|
|
||||||
|
|
||||||
The fastest way to get started is with Docker:
|
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
# Clone the repository
|
# Clone the repository
|
||||||
git clone https://github.com/cardosofelipe/pragma-stack.git
|
git clone https://gitea.pragmazest.com/cardosofelipe/syndarix.git
|
||||||
cd fast-next-template
|
cd syndarix
|
||||||
|
|
||||||
# Copy environment file
|
# Copy environment template
|
||||||
cp .env.template .env
|
cp .env.template .env
|
||||||
|
|
||||||
# Start all services (backend, frontend, database)
|
# Start development environment
|
||||||
docker-compose up
|
docker-compose -f docker-compose.dev.yml up -d
|
||||||
|
|
||||||
# In another terminal, run database migrations
|
# Run database migrations
|
||||||
docker-compose exec backend alembic upgrade head
|
make migrate
|
||||||
|
|
||||||
# Create first superuser (optional)
|
# Start the development servers
|
||||||
docker-compose exec backend python -c "from app.init_db import init_db; import asyncio; asyncio.run(init_db())"
|
make dev
|
||||||
```
|
|
||||||
|
|
||||||
**That's it! 🎉**
|
|
||||||
|
|
||||||
- Frontend: http://localhost:3000
|
|
||||||
- Backend API: http://localhost:8000
|
|
||||||
- API Docs: http://localhost:8000/docs
|
|
||||||
|
|
||||||
Default superuser credentials:
|
|
||||||
- Email: `admin@example.com`
|
|
||||||
- Password: `admin123`
|
|
||||||
|
|
||||||
**⚠️ Change these immediately in production!**
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🛠️ Manual Setup (Development)
|
|
||||||
|
|
||||||
### Backend Setup
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd backend
|
|
||||||
|
|
||||||
# Create virtual environment
|
|
||||||
python -m venv .venv
|
|
||||||
source .venv/bin/activate # On Windows: .venv\Scripts\activate
|
|
||||||
|
|
||||||
# Install dependencies
|
|
||||||
pip install -r requirements.txt
|
|
||||||
|
|
||||||
# Setup environment
|
|
||||||
cp .env.example .env
|
|
||||||
# Edit .env with your database credentials
|
|
||||||
|
|
||||||
# Run migrations
|
|
||||||
alembic upgrade head
|
|
||||||
|
|
||||||
# Initialize database with first superuser
|
|
||||||
python -c "from app.init_db import init_db; import asyncio; asyncio.run(init_db())"
|
|
||||||
|
|
||||||
# Start development server
|
|
||||||
uvicorn app.main:app --reload --host 0.0.0.0 --port 8000
|
|
||||||
```
|
|
||||||
|
|
||||||
### Frontend Setup
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd frontend
|
|
||||||
|
|
||||||
# Install dependencies
|
|
||||||
npm install
|
|
||||||
|
|
||||||
# Setup environment
|
|
||||||
cp .env.local.example .env.local
|
|
||||||
# Edit .env.local with your backend URL
|
|
||||||
|
|
||||||
# Generate API client
|
|
||||||
npm run generate:api
|
|
||||||
|
|
||||||
# Start development server
|
|
||||||
npm run dev
|
|
||||||
```
|
|
||||||
|
|
||||||
Visit http://localhost:3000 to see your app!
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📂 Project Structure
|
|
||||||
|
|
||||||
```
|
|
||||||
├── backend/ # FastAPI backend
|
|
||||||
│ ├── app/
|
|
||||||
│ │ ├── api/ # API routes and dependencies
|
|
||||||
│ │ ├── core/ # Core functionality (auth, config, database)
|
|
||||||
│ │ ├── crud/ # Database operations
|
|
||||||
│ │ ├── models/ # SQLAlchemy models
|
|
||||||
│ │ ├── schemas/ # Pydantic schemas
|
|
||||||
│ │ ├── services/ # Business logic
|
|
||||||
│ │ └── utils/ # Utilities
|
|
||||||
│ ├── tests/ # Backend tests (97% coverage)
|
|
||||||
│ ├── alembic/ # Database migrations
|
|
||||||
│ └── docs/ # Backend documentation
|
|
||||||
│
|
|
||||||
├── frontend/ # Next.js frontend
|
|
||||||
│ ├── src/
|
|
||||||
│ │ ├── app/ # Next.js App Router pages
|
|
||||||
│ │ ├── components/ # React components
|
|
||||||
│ │ ├── lib/ # Libraries and utilities
|
|
||||||
│ │ │ ├── api/ # API client (auto-generated)
|
|
||||||
│ │ │ └── stores/ # Zustand stores
|
|
||||||
│ │ └── hooks/ # Custom React hooks
|
|
||||||
│ ├── e2e/ # Playwright E2E tests
|
|
||||||
│ ├── tests/ # Unit tests (Jest)
|
|
||||||
│ └── docs/ # Frontend documentation
|
|
||||||
│ └── design-system/ # Comprehensive design system docs
|
|
||||||
│
|
|
||||||
├── docker-compose.yml # Docker orchestration
|
|
||||||
├── docker-compose.dev.yml # Development with hot reload
|
|
||||||
└── README.md # You are here!
|
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 🧪 Testing
|
## Architecture Overview
|
||||||
|
|
||||||
This template takes testing seriously with comprehensive coverage across all layers:
|
|
||||||
|
|
||||||
### Backend Unit & Integration Tests
|
|
||||||
|
|
||||||
**High coverage (~97%)** across all critical paths including security-focused tests.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd backend
|
|
||||||
|
|
||||||
# Run all tests
|
|
||||||
IS_TEST=True pytest
|
|
||||||
|
|
||||||
# Run with coverage report
|
|
||||||
IS_TEST=True pytest --cov=app --cov-report=term-missing
|
|
||||||
|
|
||||||
# Run specific test file
|
|
||||||
IS_TEST=True pytest tests/api/test_auth.py -v
|
|
||||||
|
|
||||||
# Generate HTML coverage report
|
|
||||||
IS_TEST=True pytest --cov=app --cov-report=html
|
|
||||||
open htmlcov/index.html
|
|
||||||
```
|
```
|
||||||
|
+====================================================================+
|
||||||
**Test types:**
|
| SYNDARIX CORE |
|
||||||
- **Unit tests**: CRUD operations, utilities, business logic
|
+====================================================================+
|
||||||
- **Integration tests**: API endpoints with database
|
| +------------------+ +------------------+ +------------------+ |
|
||||||
- **Security tests**: JWT algorithm attacks, session hijacking, privilege escalation
|
| | Agent Orchestrator| | Project Manager | | Workflow Engine | |
|
||||||
- **Error handling tests**: Database failures, validation errors
|
| +------------------+ +------------------+ +------------------+ |
|
||||||
|
+====================================================================+
|
||||||
### Frontend Unit Tests
|
|
|
||||||
|
v
|
||||||
**High coverage (~97%)** with Jest and React Testing Library.
|
+====================================================================+
|
||||||
|
| MCP ORCHESTRATION LAYER |
|
||||||
```bash
|
| All integrations via unified MCP servers with project scoping |
|
||||||
cd frontend
|
+====================================================================+
|
||||||
|
|
|
||||||
# Run unit tests
|
+------------------------+------------------------+
|
||||||
npm test
|
| | |
|
||||||
|
+----v----+ +----v----+ +----v----+ +----v----+ +----v----+
|
||||||
# Run with coverage
|
| LLM | | Git | |Knowledge| | File | | Code |
|
||||||
npm run test:coverage
|
| Providers| | MCP | |Base MCP | |Sys. MCP | |Analysis |
|
||||||
|
+---------+ +---------+ +---------+ +---------+ +---------+
|
||||||
# Watch mode
|
|
||||||
npm run test:watch
|
|
||||||
```
|
|
||||||
|
|
||||||
**Test types:**
|
|
||||||
- Component rendering and interactions
|
|
||||||
- Custom hooks behavior
|
|
||||||
- State management
|
|
||||||
- Utility functions
|
|
||||||
- API integration mocks
|
|
||||||
|
|
||||||
### End-to-End Tests
|
|
||||||
|
|
||||||
**Zero flaky tests** with Playwright covering complete user journeys.
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd frontend
|
|
||||||
|
|
||||||
# Run E2E tests
|
|
||||||
npm run test:e2e
|
|
||||||
|
|
||||||
# Run E2E tests in UI mode (recommended for development)
|
|
||||||
npm run test:e2e:ui
|
|
||||||
|
|
||||||
# Run specific test file
|
|
||||||
npx playwright test auth-login.spec.ts
|
|
||||||
|
|
||||||
# Generate test report
|
|
||||||
npx playwright show-report
|
|
||||||
```
|
|
||||||
|
|
||||||
**Test coverage:**
|
|
||||||
- Complete authentication flows
|
|
||||||
- Navigation and routing
|
|
||||||
- Form submissions and validation
|
|
||||||
- Settings and profile management
|
|
||||||
- Session management
|
|
||||||
- Admin panel workflows (in progress)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🤖 AI-Friendly Documentation
|
|
||||||
|
|
||||||
This project includes comprehensive documentation designed for AI coding assistants:
|
|
||||||
|
|
||||||
- **[AGENTS.md](./AGENTS.md)** - Framework-agnostic AI assistant context for PragmaStack
|
|
||||||
- **[CLAUDE.md](./CLAUDE.md)** - Claude Code-specific guidance
|
|
||||||
|
|
||||||
These files provide AI assistants with the **PragmaStack** architecture, patterns, and best practices.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🗄️ Database Migrations
|
|
||||||
|
|
||||||
The template uses Alembic for database migrations:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
cd backend
|
|
||||||
|
|
||||||
# Generate migration from model changes
|
|
||||||
python migrate.py generate "description of changes"
|
|
||||||
|
|
||||||
# Apply migrations
|
|
||||||
python migrate.py apply
|
|
||||||
|
|
||||||
# Or do both in one command
|
|
||||||
python migrate.py auto "description"
|
|
||||||
|
|
||||||
# View migration history
|
|
||||||
python migrate.py list
|
|
||||||
|
|
||||||
# Check current revision
|
|
||||||
python migrate.py current
|
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 📖 Documentation
|
## Contributing
|
||||||
|
|
||||||
### AI Assistant Documentation
|
See [CONTRIBUTING.md](./CONTRIBUTING.md) for guidelines.
|
||||||
|
|
||||||
- **[AGENTS.md](./AGENTS.md)** - Framework-agnostic AI coding assistant context
|
|
||||||
- **[CLAUDE.md](./CLAUDE.md)** - Claude Code-specific guidance and preferences
|
|
||||||
|
|
||||||
### Backend Documentation
|
|
||||||
|
|
||||||
- **[ARCHITECTURE.md](./backend/docs/ARCHITECTURE.md)** - System architecture and design patterns
|
|
||||||
- **[CODING_STANDARDS.md](./backend/docs/CODING_STANDARDS.md)** - Code quality standards
|
|
||||||
- **[COMMON_PITFALLS.md](./backend/docs/COMMON_PITFALLS.md)** - Common mistakes to avoid
|
|
||||||
- **[FEATURE_EXAMPLE.md](./backend/docs/FEATURE_EXAMPLE.md)** - Step-by-step feature guide
|
|
||||||
|
|
||||||
### Frontend Documentation
|
|
||||||
|
|
||||||
- **[PragmaStack Design System](./frontend/docs/design-system/)** - Complete design system guide
|
|
||||||
- Quick start, foundations (colors, typography, spacing)
|
|
||||||
- Component library guide
|
|
||||||
- Layout patterns, spacing philosophy
|
|
||||||
- Forms, accessibility, AI guidelines
|
|
||||||
- **[E2E Testing Guide](./frontend/e2e/README.md)** - E2E testing setup and best practices
|
|
||||||
|
|
||||||
### API Documentation
|
|
||||||
|
|
||||||
When the backend is running:
|
|
||||||
- **Swagger UI**: http://localhost:8000/docs
|
|
||||||
- **ReDoc**: http://localhost:8000/redoc
|
|
||||||
- **OpenAPI JSON**: http://localhost:8000/api/v1/openapi.json
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 🚢 Deployment
|
## License
|
||||||
|
|
||||||
### Docker Production Deployment
|
MIT License - see [LICENSE](./LICENSE) for details.
|
||||||
|
|
||||||
```bash
|
|
||||||
# Build and start all services
|
|
||||||
docker-compose up -d
|
|
||||||
|
|
||||||
# Run migrations
|
|
||||||
docker-compose exec backend alembic upgrade head
|
|
||||||
|
|
||||||
# View logs
|
|
||||||
docker-compose logs -f
|
|
||||||
|
|
||||||
# Stop services
|
|
||||||
docker-compose down
|
|
||||||
```
|
|
||||||
|
|
||||||
### Production Checklist
|
|
||||||
|
|
||||||
- [ ] Change default superuser credentials
|
|
||||||
- [ ] Set strong `SECRET_KEY` in backend `.env`
|
|
||||||
- [ ] Configure production database (PostgreSQL)
|
|
||||||
- [ ] Set `ENVIRONMENT=production` in backend
|
|
||||||
- [ ] Configure CORS origins for your domain
|
|
||||||
- [ ] Setup SSL/TLS certificates
|
|
||||||
- [ ] Configure email service for password resets
|
|
||||||
- [ ] Setup monitoring and logging
|
|
||||||
- [ ] Configure backup strategy
|
|
||||||
- [ ] Review and adjust rate limits
|
|
||||||
- [ ] Test security headers
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 🛣️ Roadmap & Status
|
## Acknowledgments
|
||||||
|
|
||||||
### ✅ Completed
|
- Built on [PragmaStack](https://gitea.pragmazest.com/cardosofelipe/fast-next-template)
|
||||||
- [x] Authentication system (JWT, refresh tokens, session management, OAuth)
|
- Powered by Claude and the Anthropic API
|
||||||
- [x] User management (CRUD, profile, password change)
|
|
||||||
- [x] Organization system with RBAC (Owner, Admin, Member)
|
|
||||||
- [x] Admin panel (users, organizations, sessions, statistics)
|
|
||||||
- [x] **Internationalization (i18n)** with next-intl (English + Italian)
|
|
||||||
- [x] Backend testing infrastructure (~97% coverage)
|
|
||||||
- [x] Frontend unit testing infrastructure (~97% coverage)
|
|
||||||
- [x] Frontend E2E testing (Playwright, zero flaky tests)
|
|
||||||
- [x] Design system documentation
|
|
||||||
- [x] **Marketing landing page** with animated components
|
|
||||||
- [x] **`/dev` documentation portal** with live component examples
|
|
||||||
- [x] **Toast notifications** system (Sonner)
|
|
||||||
- [x] **Charts and visualizations** (Recharts)
|
|
||||||
- [x] **Animation system** (Framer Motion)
|
|
||||||
- [x] **Markdown rendering** with syntax highlighting
|
|
||||||
- [x] **SEO optimization** (sitemap, robots.txt, locale-aware metadata)
|
|
||||||
- [x] Database migrations with helper script
|
|
||||||
- [x] Docker deployment
|
|
||||||
- [x] API documentation (OpenAPI/Swagger)
|
|
||||||
|
|
||||||
### 🚧 In Progress
|
|
||||||
- [ ] Email integration (templates ready, SMTP pending)
|
|
||||||
|
|
||||||
### 🔮 Planned
|
|
||||||
- [ ] GitHub Actions CI/CD pipelines
|
|
||||||
- [ ] Dynamic test coverage badges from CI
|
|
||||||
- [ ] E2E test coverage reporting
|
|
||||||
- [ ] OAuth token encryption at rest (security hardening)
|
|
||||||
- [ ] Additional languages (Spanish, French, German, etc.)
|
|
||||||
- [ ] SSO/SAML authentication
|
|
||||||
- [ ] Real-time notifications with WebSockets
|
|
||||||
- [ ] Webhook system
|
|
||||||
- [ ] File upload/storage (S3-compatible)
|
|
||||||
- [ ] Audit logging system
|
|
||||||
- [ ] API versioning example
|
|
||||||
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🤝 Contributing
|
|
||||||
|
|
||||||
Contributions are welcome! Whether you're fixing bugs, improving documentation, or proposing new features, we'd love your help.
|
|
||||||
|
|
||||||
### How to Contribute
|
|
||||||
|
|
||||||
1. **Fork the repository**
|
|
||||||
2. **Create a feature branch** (`git checkout -b feature/amazing-feature`)
|
|
||||||
3. **Make your changes**
|
|
||||||
- Follow existing code style
|
|
||||||
- Add tests for new features
|
|
||||||
- Update documentation as needed
|
|
||||||
4. **Run tests** to ensure everything works
|
|
||||||
5. **Commit your changes** (`git commit -m 'Add amazing feature'`)
|
|
||||||
6. **Push to your branch** (`git push origin feature/amazing-feature`)
|
|
||||||
7. **Open a Pull Request**
|
|
||||||
|
|
||||||
### Development Guidelines
|
|
||||||
|
|
||||||
- Write tests for new features (aim for >90% coverage)
|
|
||||||
- Follow the existing architecture patterns
|
|
||||||
- Update documentation when adding features
|
|
||||||
- Keep commits atomic and well-described
|
|
||||||
- Be respectful and constructive in discussions
|
|
||||||
|
|
||||||
### Reporting Issues
|
|
||||||
|
|
||||||
Found a bug? Have a suggestion? [Open an issue](https://github.com/cardosofelipe/pragma-stack/issues)!
|
|
||||||
|
|
||||||
Please include:
|
|
||||||
- Clear description of the issue/suggestion
|
|
||||||
- Steps to reproduce (for bugs)
|
|
||||||
- Expected vs. actual behavior
|
|
||||||
- Environment details (OS, Python/Node version, etc.)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 📄 License
|
|
||||||
|
|
||||||
This project is licensed under the **MIT License** - see the [LICENSE](./LICENSE) file for details.
|
|
||||||
|
|
||||||
**TL;DR**: You can use this template for any purpose, commercial or non-commercial. Attribution is appreciated but not required!
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 🙏 Acknowledgments
|
|
||||||
|
|
||||||
This template is built on the shoulders of giants:
|
|
||||||
|
|
||||||
- [FastAPI](https://fastapi.tiangolo.com/) by Sebastián Ramírez
|
|
||||||
- [Next.js](https://nextjs.org/) by Vercel
|
|
||||||
- [shadcn/ui](https://ui.shadcn.com/) by shadcn
|
|
||||||
- [TanStack Query](https://tanstack.com/query) by Tanner Linsley
|
|
||||||
- [Playwright](https://playwright.dev/) by Microsoft
|
|
||||||
- And countless other open-source projects that make modern development possible
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## 💬 Questions?
|
|
||||||
|
|
||||||
- **Documentation**: Check the `/docs` folders in backend and frontend
|
|
||||||
- **Issues**: [GitHub Issues](https://github.com/cardosofelipe/pragma-stack/issues)
|
|
||||||
- **Discussions**: [GitHub Discussions](https://github.com/cardosofelipe/pragma-stack/discussions)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## ⭐ Star This Repo
|
|
||||||
|
|
||||||
If this template saves you time, consider giving it a star! It helps others discover the project and motivates continued development.
|
|
||||||
|
|
||||||
**Happy coding! 🚀**
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
<div align="center">
|
|
||||||
Made with ❤️ by a developer who got tired of rebuilding the same boilerplate
|
|
||||||
</div>
|
|
||||||
|
|||||||
@@ -7,7 +7,10 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
|
|||||||
PYTHONPATH=/app \
|
PYTHONPATH=/app \
|
||||||
UV_COMPILE_BYTECODE=1 \
|
UV_COMPILE_BYTECODE=1 \
|
||||||
UV_LINK_MODE=copy \
|
UV_LINK_MODE=copy \
|
||||||
UV_NO_CACHE=1
|
UV_NO_CACHE=1 \
|
||||||
|
UV_PROJECT_ENVIRONMENT=/opt/venv \
|
||||||
|
VIRTUAL_ENV=/opt/venv \
|
||||||
|
PATH="/opt/venv/bin:$PATH"
|
||||||
|
|
||||||
# Install system dependencies and uv
|
# Install system dependencies and uv
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
@@ -20,7 +23,7 @@ RUN apt-get update && \
|
|||||||
# Copy dependency files
|
# Copy dependency files
|
||||||
COPY pyproject.toml uv.lock ./
|
COPY pyproject.toml uv.lock ./
|
||||||
|
|
||||||
# Install dependencies using uv (development mode with dev dependencies)
|
# Install dependencies using uv into /opt/venv (outside /app to survive bind mounts)
|
||||||
RUN uv sync --extra dev --frozen
|
RUN uv sync --extra dev --frozen
|
||||||
|
|
||||||
# Copy application code
|
# Copy application code
|
||||||
@@ -45,7 +48,10 @@ ENV PYTHONDONTWRITEBYTECODE=1 \
|
|||||||
PYTHONPATH=/app \
|
PYTHONPATH=/app \
|
||||||
UV_COMPILE_BYTECODE=1 \
|
UV_COMPILE_BYTECODE=1 \
|
||||||
UV_LINK_MODE=copy \
|
UV_LINK_MODE=copy \
|
||||||
UV_NO_CACHE=1
|
UV_NO_CACHE=1 \
|
||||||
|
UV_PROJECT_ENVIRONMENT=/opt/venv \
|
||||||
|
VIRTUAL_ENV=/opt/venv \
|
||||||
|
PATH="/opt/venv/bin:$PATH"
|
||||||
|
|
||||||
# Install system dependencies and uv
|
# Install system dependencies and uv
|
||||||
RUN apt-get update && \
|
RUN apt-get update && \
|
||||||
@@ -58,7 +64,7 @@ RUN apt-get update && \
|
|||||||
# Copy dependency files
|
# Copy dependency files
|
||||||
COPY pyproject.toml uv.lock ./
|
COPY pyproject.toml uv.lock ./
|
||||||
|
|
||||||
# Install only production dependencies using uv (no dev dependencies)
|
# Install only production dependencies using uv into /opt/venv
|
||||||
RUN uv sync --frozen --no-dev
|
RUN uv sync --frozen --no-dev
|
||||||
|
|
||||||
# Copy application code
|
# Copy application code
|
||||||
@@ -67,7 +73,7 @@ COPY entrypoint.sh /usr/local/bin/
|
|||||||
RUN chmod +x /usr/local/bin/entrypoint.sh
|
RUN chmod +x /usr/local/bin/entrypoint.sh
|
||||||
|
|
||||||
# Set ownership to non-root user
|
# Set ownership to non-root user
|
||||||
RUN chown -R appuser:appuser /app
|
RUN chown -R appuser:appuser /app /opt/venv
|
||||||
|
|
||||||
# Switch to non-root user
|
# Switch to non-root user
|
||||||
USER appuser
|
USER appuser
|
||||||
@@ -77,4 +83,4 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=40s --retries=3 \
|
|||||||
CMD curl -f http://localhost:8000/health || exit 1
|
CMD curl -f http://localhost:8000/health || exit 1
|
||||||
|
|
||||||
ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
|
ENTRYPOINT ["/usr/local/bin/entrypoint.sh"]
|
||||||
CMD ["uv", "run", "uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000"]
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
# PragmaStack Backend API
|
# Syndarix Backend API
|
||||||
|
|
||||||
> The pragmatic, production-ready FastAPI backend for PragmaStack.
|
> The pragmatic, production-ready FastAPI backend for Syndarix.
|
||||||
|
|
||||||
## Overview
|
## Overview
|
||||||
|
|
||||||
|
|||||||
@@ -40,6 +40,7 @@ def include_object(object, name, type_, reflected, compare_to):
|
|||||||
return False
|
return False
|
||||||
return True
|
return True
|
||||||
|
|
||||||
|
|
||||||
# Interpret the config file for Python logging.
|
# Interpret the config file for Python logging.
|
||||||
# This line sets up loggers basically.
|
# This line sets up loggers basically.
|
||||||
if config.config_file_name is not None:
|
if config.config_file_name is not None:
|
||||||
|
|||||||
@@ -5,258 +5,442 @@ Revises:
|
|||||||
Create Date: 2025-11-27 09:08:09.464506
|
Create Date: 2025-11-27 09:08:09.464506
|
||||||
|
|
||||||
"""
|
"""
|
||||||
from typing import Sequence, Union
|
|
||||||
|
|
||||||
from alembic import op
|
from collections.abc import Sequence
|
||||||
|
|
||||||
import sqlalchemy as sa
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
from sqlalchemy.dialects import postgresql
|
from sqlalchemy.dialects import postgresql
|
||||||
|
|
||||||
# revision identifiers, used by Alembic.
|
# revision identifiers, used by Alembic.
|
||||||
revision: str = '0001'
|
revision: str = "0001"
|
||||||
down_revision: Union[str, None] = None
|
down_revision: str | None = None
|
||||||
branch_labels: Union[str, Sequence[str], None] = None
|
branch_labels: str | Sequence[str] | None = None
|
||||||
depends_on: Union[str, Sequence[str], None] = None
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
def upgrade() -> None:
|
def upgrade() -> None:
|
||||||
# ### commands auto generated by Alembic - please adjust! ###
|
# ### commands auto generated by Alembic - please adjust! ###
|
||||||
op.create_table('oauth_states',
|
op.create_table(
|
||||||
sa.Column('state', sa.String(length=255), nullable=False),
|
"oauth_states",
|
||||||
sa.Column('code_verifier', sa.String(length=128), nullable=True),
|
sa.Column("state", sa.String(length=255), nullable=False),
|
||||||
sa.Column('nonce', sa.String(length=255), nullable=True),
|
sa.Column("code_verifier", sa.String(length=128), nullable=True),
|
||||||
sa.Column('provider', sa.String(length=50), nullable=False),
|
sa.Column("nonce", sa.String(length=255), nullable=True),
|
||||||
sa.Column('redirect_uri', sa.String(length=500), nullable=True),
|
sa.Column("provider", sa.String(length=50), nullable=False),
|
||||||
sa.Column('user_id', sa.UUID(), nullable=True),
|
sa.Column("redirect_uri", sa.String(length=500), nullable=True),
|
||||||
sa.Column('expires_at', sa.DateTime(timezone=True), nullable=False),
|
sa.Column("user_id", sa.UUID(), nullable=True),
|
||||||
sa.Column('id', sa.UUID(), nullable=False),
|
sa.Column("expires_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
|
sa.Column("id", sa.UUID(), nullable=False),
|
||||||
sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
|
sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
sa.PrimaryKeyConstraint('id')
|
sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.PrimaryKeyConstraint("id"),
|
||||||
)
|
)
|
||||||
op.create_index(op.f('ix_oauth_states_state'), 'oauth_states', ['state'], unique=True)
|
op.create_index(
|
||||||
op.create_table('organizations',
|
op.f("ix_oauth_states_state"), "oauth_states", ["state"], unique=True
|
||||||
sa.Column('name', sa.String(length=255), nullable=False),
|
|
||||||
sa.Column('slug', sa.String(length=255), nullable=False),
|
|
||||||
sa.Column('description', sa.Text(), nullable=True),
|
|
||||||
sa.Column('is_active', sa.Boolean(), nullable=False),
|
|
||||||
sa.Column('settings', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
|
|
||||||
sa.Column('id', sa.UUID(), nullable=False),
|
|
||||||
sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
|
|
||||||
sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
|
|
||||||
sa.PrimaryKeyConstraint('id')
|
|
||||||
)
|
)
|
||||||
op.create_index(op.f('ix_organizations_is_active'), 'organizations', ['is_active'], unique=False)
|
op.create_table(
|
||||||
op.create_index(op.f('ix_organizations_name'), 'organizations', ['name'], unique=False)
|
"organizations",
|
||||||
op.create_index('ix_organizations_name_active', 'organizations', ['name', 'is_active'], unique=False)
|
sa.Column("name", sa.String(length=255), nullable=False),
|
||||||
op.create_index(op.f('ix_organizations_slug'), 'organizations', ['slug'], unique=True)
|
sa.Column("slug", sa.String(length=255), nullable=False),
|
||||||
op.create_index('ix_organizations_slug_active', 'organizations', ['slug', 'is_active'], unique=False)
|
sa.Column("description", sa.Text(), nullable=True),
|
||||||
op.create_table('users',
|
sa.Column("is_active", sa.Boolean(), nullable=False),
|
||||||
sa.Column('email', sa.String(length=255), nullable=False),
|
sa.Column("settings", postgresql.JSONB(astext_type=sa.Text()), nullable=True),
|
||||||
sa.Column('password_hash', sa.String(length=255), nullable=True),
|
sa.Column("id", sa.UUID(), nullable=False),
|
||||||
sa.Column('first_name', sa.String(length=100), nullable=False),
|
sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
sa.Column('last_name', sa.String(length=100), nullable=True),
|
sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
sa.Column('phone_number', sa.String(length=20), nullable=True),
|
sa.PrimaryKeyConstraint("id"),
|
||||||
sa.Column('is_active', sa.Boolean(), nullable=False),
|
|
||||||
sa.Column('is_superuser', sa.Boolean(), nullable=False),
|
|
||||||
sa.Column('preferences', postgresql.JSONB(astext_type=sa.Text()), nullable=True),
|
|
||||||
sa.Column('locale', sa.String(length=10), nullable=True),
|
|
||||||
sa.Column('deleted_at', sa.DateTime(timezone=True), nullable=True),
|
|
||||||
sa.Column('id', sa.UUID(), nullable=False),
|
|
||||||
sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
|
|
||||||
sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
|
|
||||||
sa.PrimaryKeyConstraint('id')
|
|
||||||
)
|
)
|
||||||
op.create_index(op.f('ix_users_deleted_at'), 'users', ['deleted_at'], unique=False)
|
op.create_index(
|
||||||
op.create_index(op.f('ix_users_email'), 'users', ['email'], unique=True)
|
op.f("ix_organizations_is_active"), "organizations", ["is_active"], unique=False
|
||||||
op.create_index(op.f('ix_users_is_active'), 'users', ['is_active'], unique=False)
|
|
||||||
op.create_index(op.f('ix_users_is_superuser'), 'users', ['is_superuser'], unique=False)
|
|
||||||
op.create_index(op.f('ix_users_locale'), 'users', ['locale'], unique=False)
|
|
||||||
op.create_table('oauth_accounts',
|
|
||||||
sa.Column('user_id', sa.UUID(), nullable=False),
|
|
||||||
sa.Column('provider', sa.String(length=50), nullable=False),
|
|
||||||
sa.Column('provider_user_id', sa.String(length=255), nullable=False),
|
|
||||||
sa.Column('provider_email', sa.String(length=255), nullable=True),
|
|
||||||
sa.Column('access_token_encrypted', sa.String(length=2048), nullable=True),
|
|
||||||
sa.Column('refresh_token_encrypted', sa.String(length=2048), nullable=True),
|
|
||||||
sa.Column('token_expires_at', sa.DateTime(timezone=True), nullable=True),
|
|
||||||
sa.Column('id', sa.UUID(), nullable=False),
|
|
||||||
sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
|
|
||||||
sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
|
|
||||||
sa.ForeignKeyConstraint(['user_id'], ['users.id'], ondelete='CASCADE'),
|
|
||||||
sa.PrimaryKeyConstraint('id'),
|
|
||||||
sa.UniqueConstraint('provider', 'provider_user_id', name='uq_oauth_provider_user')
|
|
||||||
)
|
)
|
||||||
op.create_index(op.f('ix_oauth_accounts_provider'), 'oauth_accounts', ['provider'], unique=False)
|
op.create_index(
|
||||||
op.create_index(op.f('ix_oauth_accounts_provider_email'), 'oauth_accounts', ['provider_email'], unique=False)
|
op.f("ix_organizations_name"), "organizations", ["name"], unique=False
|
||||||
op.create_index(op.f('ix_oauth_accounts_user_id'), 'oauth_accounts', ['user_id'], unique=False)
|
|
||||||
op.create_index('ix_oauth_accounts_user_provider', 'oauth_accounts', ['user_id', 'provider'], unique=False)
|
|
||||||
op.create_table('oauth_clients',
|
|
||||||
sa.Column('client_id', sa.String(length=64), nullable=False),
|
|
||||||
sa.Column('client_secret_hash', sa.String(length=255), nullable=True),
|
|
||||||
sa.Column('client_name', sa.String(length=255), nullable=False),
|
|
||||||
sa.Column('client_description', sa.String(length=1000), nullable=True),
|
|
||||||
sa.Column('client_type', sa.String(length=20), nullable=False),
|
|
||||||
sa.Column('redirect_uris', postgresql.JSONB(astext_type=sa.Text()), nullable=False),
|
|
||||||
sa.Column('allowed_scopes', postgresql.JSONB(astext_type=sa.Text()), nullable=False),
|
|
||||||
sa.Column('access_token_lifetime', sa.String(length=10), nullable=False),
|
|
||||||
sa.Column('refresh_token_lifetime', sa.String(length=10), nullable=False),
|
|
||||||
sa.Column('is_active', sa.Boolean(), nullable=False),
|
|
||||||
sa.Column('owner_user_id', sa.UUID(), nullable=True),
|
|
||||||
sa.Column('mcp_server_url', sa.String(length=2048), nullable=True),
|
|
||||||
sa.Column('id', sa.UUID(), nullable=False),
|
|
||||||
sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
|
|
||||||
sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
|
|
||||||
sa.ForeignKeyConstraint(['owner_user_id'], ['users.id'], ondelete='SET NULL'),
|
|
||||||
sa.PrimaryKeyConstraint('id')
|
|
||||||
)
|
)
|
||||||
op.create_index(op.f('ix_oauth_clients_client_id'), 'oauth_clients', ['client_id'], unique=True)
|
op.create_index(
|
||||||
op.create_index(op.f('ix_oauth_clients_is_active'), 'oauth_clients', ['is_active'], unique=False)
|
"ix_organizations_name_active",
|
||||||
op.create_table('user_organizations',
|
"organizations",
|
||||||
sa.Column('user_id', sa.UUID(), nullable=False),
|
["name", "is_active"],
|
||||||
sa.Column('organization_id', sa.UUID(), nullable=False),
|
unique=False,
|
||||||
sa.Column('role', sa.Enum('OWNER', 'ADMIN', 'MEMBER', 'GUEST', name='organizationrole'), nullable=False),
|
|
||||||
sa.Column('is_active', sa.Boolean(), nullable=False),
|
|
||||||
sa.Column('custom_permissions', sa.String(length=500), nullable=True),
|
|
||||||
sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
|
|
||||||
sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
|
|
||||||
sa.ForeignKeyConstraint(['organization_id'], ['organizations.id'], ondelete='CASCADE'),
|
|
||||||
sa.ForeignKeyConstraint(['user_id'], ['users.id'], ondelete='CASCADE'),
|
|
||||||
sa.PrimaryKeyConstraint('user_id', 'organization_id')
|
|
||||||
)
|
)
|
||||||
op.create_index('ix_user_org_org_active', 'user_organizations', ['organization_id', 'is_active'], unique=False)
|
op.create_index(
|
||||||
op.create_index('ix_user_org_role', 'user_organizations', ['role'], unique=False)
|
op.f("ix_organizations_slug"), "organizations", ["slug"], unique=True
|
||||||
op.create_index('ix_user_org_user_active', 'user_organizations', ['user_id', 'is_active'], unique=False)
|
|
||||||
op.create_index(op.f('ix_user_organizations_is_active'), 'user_organizations', ['is_active'], unique=False)
|
|
||||||
op.create_table('user_sessions',
|
|
||||||
sa.Column('user_id', sa.UUID(), nullable=False),
|
|
||||||
sa.Column('refresh_token_jti', sa.String(length=255), nullable=False),
|
|
||||||
sa.Column('device_name', sa.String(length=255), nullable=True),
|
|
||||||
sa.Column('device_id', sa.String(length=255), nullable=True),
|
|
||||||
sa.Column('ip_address', sa.String(length=45), nullable=True),
|
|
||||||
sa.Column('user_agent', sa.String(length=500), nullable=True),
|
|
||||||
sa.Column('last_used_at', sa.DateTime(timezone=True), nullable=False),
|
|
||||||
sa.Column('expires_at', sa.DateTime(timezone=True), nullable=False),
|
|
||||||
sa.Column('is_active', sa.Boolean(), nullable=False),
|
|
||||||
sa.Column('location_city', sa.String(length=100), nullable=True),
|
|
||||||
sa.Column('location_country', sa.String(length=100), nullable=True),
|
|
||||||
sa.Column('id', sa.UUID(), nullable=False),
|
|
||||||
sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
|
|
||||||
sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
|
|
||||||
sa.ForeignKeyConstraint(['user_id'], ['users.id'], ondelete='CASCADE'),
|
|
||||||
sa.PrimaryKeyConstraint('id')
|
|
||||||
)
|
)
|
||||||
op.create_index(op.f('ix_user_sessions_is_active'), 'user_sessions', ['is_active'], unique=False)
|
op.create_index(
|
||||||
op.create_index('ix_user_sessions_jti_active', 'user_sessions', ['refresh_token_jti', 'is_active'], unique=False)
|
"ix_organizations_slug_active",
|
||||||
op.create_index(op.f('ix_user_sessions_refresh_token_jti'), 'user_sessions', ['refresh_token_jti'], unique=True)
|
"organizations",
|
||||||
op.create_index('ix_user_sessions_user_active', 'user_sessions', ['user_id', 'is_active'], unique=False)
|
["slug", "is_active"],
|
||||||
op.create_index(op.f('ix_user_sessions_user_id'), 'user_sessions', ['user_id'], unique=False)
|
unique=False,
|
||||||
op.create_table('oauth_authorization_codes',
|
|
||||||
sa.Column('code', sa.String(length=128), nullable=False),
|
|
||||||
sa.Column('client_id', sa.String(length=64), nullable=False),
|
|
||||||
sa.Column('user_id', sa.UUID(), nullable=False),
|
|
||||||
sa.Column('redirect_uri', sa.String(length=2048), nullable=False),
|
|
||||||
sa.Column('scope', sa.String(length=1000), nullable=False),
|
|
||||||
sa.Column('code_challenge', sa.String(length=128), nullable=True),
|
|
||||||
sa.Column('code_challenge_method', sa.String(length=10), nullable=True),
|
|
||||||
sa.Column('state', sa.String(length=256), nullable=True),
|
|
||||||
sa.Column('nonce', sa.String(length=256), nullable=True),
|
|
||||||
sa.Column('expires_at', sa.DateTime(timezone=True), nullable=False),
|
|
||||||
sa.Column('used', sa.Boolean(), nullable=False),
|
|
||||||
sa.Column('id', sa.UUID(), nullable=False),
|
|
||||||
sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
|
|
||||||
sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
|
|
||||||
sa.ForeignKeyConstraint(['client_id'], ['oauth_clients.client_id'], ondelete='CASCADE'),
|
|
||||||
sa.ForeignKeyConstraint(['user_id'], ['users.id'], ondelete='CASCADE'),
|
|
||||||
sa.PrimaryKeyConstraint('id')
|
|
||||||
)
|
)
|
||||||
op.create_index('ix_oauth_authorization_codes_client_user', 'oauth_authorization_codes', ['client_id', 'user_id'], unique=False)
|
op.create_table(
|
||||||
op.create_index(op.f('ix_oauth_authorization_codes_code'), 'oauth_authorization_codes', ['code'], unique=True)
|
"users",
|
||||||
op.create_index('ix_oauth_authorization_codes_expires_at', 'oauth_authorization_codes', ['expires_at'], unique=False)
|
sa.Column("email", sa.String(length=255), nullable=False),
|
||||||
op.create_table('oauth_consents',
|
sa.Column("password_hash", sa.String(length=255), nullable=True),
|
||||||
sa.Column('user_id', sa.UUID(), nullable=False),
|
sa.Column("first_name", sa.String(length=100), nullable=False),
|
||||||
sa.Column('client_id', sa.String(length=64), nullable=False),
|
sa.Column("last_name", sa.String(length=100), nullable=True),
|
||||||
sa.Column('granted_scopes', sa.String(length=1000), nullable=False),
|
sa.Column("phone_number", sa.String(length=20), nullable=True),
|
||||||
sa.Column('id', sa.UUID(), nullable=False),
|
sa.Column("is_active", sa.Boolean(), nullable=False),
|
||||||
sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
|
sa.Column("is_superuser", sa.Boolean(), nullable=False),
|
||||||
sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
|
sa.Column(
|
||||||
sa.ForeignKeyConstraint(['client_id'], ['oauth_clients.client_id'], ondelete='CASCADE'),
|
"preferences", postgresql.JSONB(astext_type=sa.Text()), nullable=True
|
||||||
sa.ForeignKeyConstraint(['user_id'], ['users.id'], ondelete='CASCADE'),
|
),
|
||||||
sa.PrimaryKeyConstraint('id')
|
sa.Column("locale", sa.String(length=10), nullable=True),
|
||||||
|
sa.Column("deleted_at", sa.DateTime(timezone=True), nullable=True),
|
||||||
|
sa.Column("id", sa.UUID(), nullable=False),
|
||||||
|
sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.PrimaryKeyConstraint("id"),
|
||||||
)
|
)
|
||||||
op.create_index('ix_oauth_consents_user_client', 'oauth_consents', ['user_id', 'client_id'], unique=True)
|
op.create_index(op.f("ix_users_deleted_at"), "users", ["deleted_at"], unique=False)
|
||||||
op.create_table('oauth_provider_refresh_tokens',
|
op.create_index(op.f("ix_users_email"), "users", ["email"], unique=True)
|
||||||
sa.Column('token_hash', sa.String(length=64), nullable=False),
|
op.create_index(op.f("ix_users_is_active"), "users", ["is_active"], unique=False)
|
||||||
sa.Column('jti', sa.String(length=64), nullable=False),
|
op.create_index(
|
||||||
sa.Column('client_id', sa.String(length=64), nullable=False),
|
op.f("ix_users_is_superuser"), "users", ["is_superuser"], unique=False
|
||||||
sa.Column('user_id', sa.UUID(), nullable=False),
|
)
|
||||||
sa.Column('scope', sa.String(length=1000), nullable=False),
|
op.create_index(op.f("ix_users_locale"), "users", ["locale"], unique=False)
|
||||||
sa.Column('expires_at', sa.DateTime(timezone=True), nullable=False),
|
op.create_table(
|
||||||
sa.Column('revoked', sa.Boolean(), nullable=False),
|
"oauth_accounts",
|
||||||
sa.Column('last_used_at', sa.DateTime(timezone=True), nullable=True),
|
sa.Column("user_id", sa.UUID(), nullable=False),
|
||||||
sa.Column('device_info', sa.String(length=500), nullable=True),
|
sa.Column("provider", sa.String(length=50), nullable=False),
|
||||||
sa.Column('ip_address', sa.String(length=45), nullable=True),
|
sa.Column("provider_user_id", sa.String(length=255), nullable=False),
|
||||||
sa.Column('id', sa.UUID(), nullable=False),
|
sa.Column("provider_email", sa.String(length=255), nullable=True),
|
||||||
sa.Column('created_at', sa.DateTime(timezone=True), nullable=False),
|
sa.Column("access_token_encrypted", sa.String(length=2048), nullable=True),
|
||||||
sa.Column('updated_at', sa.DateTime(timezone=True), nullable=False),
|
sa.Column("refresh_token_encrypted", sa.String(length=2048), nullable=True),
|
||||||
sa.ForeignKeyConstraint(['client_id'], ['oauth_clients.client_id'], ondelete='CASCADE'),
|
sa.Column("token_expires_at", sa.DateTime(timezone=True), nullable=True),
|
||||||
sa.ForeignKeyConstraint(['user_id'], ['users.id'], ondelete='CASCADE'),
|
sa.Column("id", sa.UUID(), nullable=False),
|
||||||
sa.PrimaryKeyConstraint('id')
|
sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.ForeignKeyConstraint(["user_id"], ["users.id"], ondelete="CASCADE"),
|
||||||
|
sa.PrimaryKeyConstraint("id"),
|
||||||
|
sa.UniqueConstraint(
|
||||||
|
"provider", "provider_user_id", name="uq_oauth_provider_user"
|
||||||
|
),
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
op.f("ix_oauth_accounts_provider"), "oauth_accounts", ["provider"], unique=False
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
op.f("ix_oauth_accounts_provider_email"),
|
||||||
|
"oauth_accounts",
|
||||||
|
["provider_email"],
|
||||||
|
unique=False,
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
op.f("ix_oauth_accounts_user_id"), "oauth_accounts", ["user_id"], unique=False
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
"ix_oauth_accounts_user_provider",
|
||||||
|
"oauth_accounts",
|
||||||
|
["user_id", "provider"],
|
||||||
|
unique=False,
|
||||||
|
)
|
||||||
|
op.create_table(
|
||||||
|
"oauth_clients",
|
||||||
|
sa.Column("client_id", sa.String(length=64), nullable=False),
|
||||||
|
sa.Column("client_secret_hash", sa.String(length=255), nullable=True),
|
||||||
|
sa.Column("client_name", sa.String(length=255), nullable=False),
|
||||||
|
sa.Column("client_description", sa.String(length=1000), nullable=True),
|
||||||
|
sa.Column("client_type", sa.String(length=20), nullable=False),
|
||||||
|
sa.Column(
|
||||||
|
"redirect_uris", postgresql.JSONB(astext_type=sa.Text()), nullable=False
|
||||||
|
),
|
||||||
|
sa.Column(
|
||||||
|
"allowed_scopes", postgresql.JSONB(astext_type=sa.Text()), nullable=False
|
||||||
|
),
|
||||||
|
sa.Column("access_token_lifetime", sa.String(length=10), nullable=False),
|
||||||
|
sa.Column("refresh_token_lifetime", sa.String(length=10), nullable=False),
|
||||||
|
sa.Column("is_active", sa.Boolean(), nullable=False),
|
||||||
|
sa.Column("owner_user_id", sa.UUID(), nullable=True),
|
||||||
|
sa.Column("mcp_server_url", sa.String(length=2048), nullable=True),
|
||||||
|
sa.Column("id", sa.UUID(), nullable=False),
|
||||||
|
sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.ForeignKeyConstraint(["owner_user_id"], ["users.id"], ondelete="SET NULL"),
|
||||||
|
sa.PrimaryKeyConstraint("id"),
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
op.f("ix_oauth_clients_client_id"), "oauth_clients", ["client_id"], unique=True
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
op.f("ix_oauth_clients_is_active"), "oauth_clients", ["is_active"], unique=False
|
||||||
|
)
|
||||||
|
op.create_table(
|
||||||
|
"user_organizations",
|
||||||
|
sa.Column("user_id", sa.UUID(), nullable=False),
|
||||||
|
sa.Column("organization_id", sa.UUID(), nullable=False),
|
||||||
|
sa.Column(
|
||||||
|
"role",
|
||||||
|
sa.Enum("OWNER", "ADMIN", "MEMBER", "GUEST", name="organizationrole"),
|
||||||
|
nullable=False,
|
||||||
|
),
|
||||||
|
sa.Column("is_active", sa.Boolean(), nullable=False),
|
||||||
|
sa.Column("custom_permissions", sa.String(length=500), nullable=True),
|
||||||
|
sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.ForeignKeyConstraint(
|
||||||
|
["organization_id"], ["organizations.id"], ondelete="CASCADE"
|
||||||
|
),
|
||||||
|
sa.ForeignKeyConstraint(["user_id"], ["users.id"], ondelete="CASCADE"),
|
||||||
|
sa.PrimaryKeyConstraint("user_id", "organization_id"),
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
"ix_user_org_org_active",
|
||||||
|
"user_organizations",
|
||||||
|
["organization_id", "is_active"],
|
||||||
|
unique=False,
|
||||||
|
)
|
||||||
|
op.create_index("ix_user_org_role", "user_organizations", ["role"], unique=False)
|
||||||
|
op.create_index(
|
||||||
|
"ix_user_org_user_active",
|
||||||
|
"user_organizations",
|
||||||
|
["user_id", "is_active"],
|
||||||
|
unique=False,
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
op.f("ix_user_organizations_is_active"),
|
||||||
|
"user_organizations",
|
||||||
|
["is_active"],
|
||||||
|
unique=False,
|
||||||
|
)
|
||||||
|
op.create_table(
|
||||||
|
"user_sessions",
|
||||||
|
sa.Column("user_id", sa.UUID(), nullable=False),
|
||||||
|
sa.Column("refresh_token_jti", sa.String(length=255), nullable=False),
|
||||||
|
sa.Column("device_name", sa.String(length=255), nullable=True),
|
||||||
|
sa.Column("device_id", sa.String(length=255), nullable=True),
|
||||||
|
sa.Column("ip_address", sa.String(length=45), nullable=True),
|
||||||
|
sa.Column("user_agent", sa.String(length=500), nullable=True),
|
||||||
|
sa.Column("last_used_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.Column("expires_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.Column("is_active", sa.Boolean(), nullable=False),
|
||||||
|
sa.Column("location_city", sa.String(length=100), nullable=True),
|
||||||
|
sa.Column("location_country", sa.String(length=100), nullable=True),
|
||||||
|
sa.Column("id", sa.UUID(), nullable=False),
|
||||||
|
sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.ForeignKeyConstraint(["user_id"], ["users.id"], ondelete="CASCADE"),
|
||||||
|
sa.PrimaryKeyConstraint("id"),
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
op.f("ix_user_sessions_is_active"), "user_sessions", ["is_active"], unique=False
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
"ix_user_sessions_jti_active",
|
||||||
|
"user_sessions",
|
||||||
|
["refresh_token_jti", "is_active"],
|
||||||
|
unique=False,
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
op.f("ix_user_sessions_refresh_token_jti"),
|
||||||
|
"user_sessions",
|
||||||
|
["refresh_token_jti"],
|
||||||
|
unique=True,
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
"ix_user_sessions_user_active",
|
||||||
|
"user_sessions",
|
||||||
|
["user_id", "is_active"],
|
||||||
|
unique=False,
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
op.f("ix_user_sessions_user_id"), "user_sessions", ["user_id"], unique=False
|
||||||
|
)
|
||||||
|
op.create_table(
|
||||||
|
"oauth_authorization_codes",
|
||||||
|
sa.Column("code", sa.String(length=128), nullable=False),
|
||||||
|
sa.Column("client_id", sa.String(length=64), nullable=False),
|
||||||
|
sa.Column("user_id", sa.UUID(), nullable=False),
|
||||||
|
sa.Column("redirect_uri", sa.String(length=2048), nullable=False),
|
||||||
|
sa.Column("scope", sa.String(length=1000), nullable=False),
|
||||||
|
sa.Column("code_challenge", sa.String(length=128), nullable=True),
|
||||||
|
sa.Column("code_challenge_method", sa.String(length=10), nullable=True),
|
||||||
|
sa.Column("state", sa.String(length=256), nullable=True),
|
||||||
|
sa.Column("nonce", sa.String(length=256), nullable=True),
|
||||||
|
sa.Column("expires_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.Column("used", sa.Boolean(), nullable=False),
|
||||||
|
sa.Column("id", sa.UUID(), nullable=False),
|
||||||
|
sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.ForeignKeyConstraint(
|
||||||
|
["client_id"], ["oauth_clients.client_id"], ondelete="CASCADE"
|
||||||
|
),
|
||||||
|
sa.ForeignKeyConstraint(["user_id"], ["users.id"], ondelete="CASCADE"),
|
||||||
|
sa.PrimaryKeyConstraint("id"),
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
"ix_oauth_authorization_codes_client_user",
|
||||||
|
"oauth_authorization_codes",
|
||||||
|
["client_id", "user_id"],
|
||||||
|
unique=False,
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
op.f("ix_oauth_authorization_codes_code"),
|
||||||
|
"oauth_authorization_codes",
|
||||||
|
["code"],
|
||||||
|
unique=True,
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
"ix_oauth_authorization_codes_expires_at",
|
||||||
|
"oauth_authorization_codes",
|
||||||
|
["expires_at"],
|
||||||
|
unique=False,
|
||||||
|
)
|
||||||
|
op.create_table(
|
||||||
|
"oauth_consents",
|
||||||
|
sa.Column("user_id", sa.UUID(), nullable=False),
|
||||||
|
sa.Column("client_id", sa.String(length=64), nullable=False),
|
||||||
|
sa.Column("granted_scopes", sa.String(length=1000), nullable=False),
|
||||||
|
sa.Column("id", sa.UUID(), nullable=False),
|
||||||
|
sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.ForeignKeyConstraint(
|
||||||
|
["client_id"], ["oauth_clients.client_id"], ondelete="CASCADE"
|
||||||
|
),
|
||||||
|
sa.ForeignKeyConstraint(["user_id"], ["users.id"], ondelete="CASCADE"),
|
||||||
|
sa.PrimaryKeyConstraint("id"),
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
"ix_oauth_consents_user_client",
|
||||||
|
"oauth_consents",
|
||||||
|
["user_id", "client_id"],
|
||||||
|
unique=True,
|
||||||
|
)
|
||||||
|
op.create_table(
|
||||||
|
"oauth_provider_refresh_tokens",
|
||||||
|
sa.Column("token_hash", sa.String(length=64), nullable=False),
|
||||||
|
sa.Column("jti", sa.String(length=64), nullable=False),
|
||||||
|
sa.Column("client_id", sa.String(length=64), nullable=False),
|
||||||
|
sa.Column("user_id", sa.UUID(), nullable=False),
|
||||||
|
sa.Column("scope", sa.String(length=1000), nullable=False),
|
||||||
|
sa.Column("expires_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.Column("revoked", sa.Boolean(), nullable=False),
|
||||||
|
sa.Column("last_used_at", sa.DateTime(timezone=True), nullable=True),
|
||||||
|
sa.Column("device_info", sa.String(length=500), nullable=True),
|
||||||
|
sa.Column("ip_address", sa.String(length=45), nullable=True),
|
||||||
|
sa.Column("id", sa.UUID(), nullable=False),
|
||||||
|
sa.Column("created_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.Column("updated_at", sa.DateTime(timezone=True), nullable=False),
|
||||||
|
sa.ForeignKeyConstraint(
|
||||||
|
["client_id"], ["oauth_clients.client_id"], ondelete="CASCADE"
|
||||||
|
),
|
||||||
|
sa.ForeignKeyConstraint(["user_id"], ["users.id"], ondelete="CASCADE"),
|
||||||
|
sa.PrimaryKeyConstraint("id"),
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
"ix_oauth_provider_refresh_tokens_client_user",
|
||||||
|
"oauth_provider_refresh_tokens",
|
||||||
|
["client_id", "user_id"],
|
||||||
|
unique=False,
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
"ix_oauth_provider_refresh_tokens_expires_at",
|
||||||
|
"oauth_provider_refresh_tokens",
|
||||||
|
["expires_at"],
|
||||||
|
unique=False,
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
op.f("ix_oauth_provider_refresh_tokens_jti"),
|
||||||
|
"oauth_provider_refresh_tokens",
|
||||||
|
["jti"],
|
||||||
|
unique=True,
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
op.f("ix_oauth_provider_refresh_tokens_revoked"),
|
||||||
|
"oauth_provider_refresh_tokens",
|
||||||
|
["revoked"],
|
||||||
|
unique=False,
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
op.f("ix_oauth_provider_refresh_tokens_token_hash"),
|
||||||
|
"oauth_provider_refresh_tokens",
|
||||||
|
["token_hash"],
|
||||||
|
unique=True,
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
"ix_oauth_provider_refresh_tokens_user_revoked",
|
||||||
|
"oauth_provider_refresh_tokens",
|
||||||
|
["user_id", "revoked"],
|
||||||
|
unique=False,
|
||||||
)
|
)
|
||||||
op.create_index('ix_oauth_provider_refresh_tokens_client_user', 'oauth_provider_refresh_tokens', ['client_id', 'user_id'], unique=False)
|
|
||||||
op.create_index('ix_oauth_provider_refresh_tokens_expires_at', 'oauth_provider_refresh_tokens', ['expires_at'], unique=False)
|
|
||||||
op.create_index(op.f('ix_oauth_provider_refresh_tokens_jti'), 'oauth_provider_refresh_tokens', ['jti'], unique=True)
|
|
||||||
op.create_index(op.f('ix_oauth_provider_refresh_tokens_revoked'), 'oauth_provider_refresh_tokens', ['revoked'], unique=False)
|
|
||||||
op.create_index(op.f('ix_oauth_provider_refresh_tokens_token_hash'), 'oauth_provider_refresh_tokens', ['token_hash'], unique=True)
|
|
||||||
op.create_index('ix_oauth_provider_refresh_tokens_user_revoked', 'oauth_provider_refresh_tokens', ['user_id', 'revoked'], unique=False)
|
|
||||||
# ### end Alembic commands ###
|
# ### end Alembic commands ###
|
||||||
|
|
||||||
|
|
||||||
def downgrade() -> None:
|
def downgrade() -> None:
|
||||||
# ### commands auto generated by Alembic - please adjust! ###
|
# ### commands auto generated by Alembic - please adjust! ###
|
||||||
op.drop_index('ix_oauth_provider_refresh_tokens_user_revoked', table_name='oauth_provider_refresh_tokens')
|
op.drop_index(
|
||||||
op.drop_index(op.f('ix_oauth_provider_refresh_tokens_token_hash'), table_name='oauth_provider_refresh_tokens')
|
"ix_oauth_provider_refresh_tokens_user_revoked",
|
||||||
op.drop_index(op.f('ix_oauth_provider_refresh_tokens_revoked'), table_name='oauth_provider_refresh_tokens')
|
table_name="oauth_provider_refresh_tokens",
|
||||||
op.drop_index(op.f('ix_oauth_provider_refresh_tokens_jti'), table_name='oauth_provider_refresh_tokens')
|
)
|
||||||
op.drop_index('ix_oauth_provider_refresh_tokens_expires_at', table_name='oauth_provider_refresh_tokens')
|
op.drop_index(
|
||||||
op.drop_index('ix_oauth_provider_refresh_tokens_client_user', table_name='oauth_provider_refresh_tokens')
|
op.f("ix_oauth_provider_refresh_tokens_token_hash"),
|
||||||
op.drop_table('oauth_provider_refresh_tokens')
|
table_name="oauth_provider_refresh_tokens",
|
||||||
op.drop_index('ix_oauth_consents_user_client', table_name='oauth_consents')
|
)
|
||||||
op.drop_table('oauth_consents')
|
op.drop_index(
|
||||||
op.drop_index('ix_oauth_authorization_codes_expires_at', table_name='oauth_authorization_codes')
|
op.f("ix_oauth_provider_refresh_tokens_revoked"),
|
||||||
op.drop_index(op.f('ix_oauth_authorization_codes_code'), table_name='oauth_authorization_codes')
|
table_name="oauth_provider_refresh_tokens",
|
||||||
op.drop_index('ix_oauth_authorization_codes_client_user', table_name='oauth_authorization_codes')
|
)
|
||||||
op.drop_table('oauth_authorization_codes')
|
op.drop_index(
|
||||||
op.drop_index(op.f('ix_user_sessions_user_id'), table_name='user_sessions')
|
op.f("ix_oauth_provider_refresh_tokens_jti"),
|
||||||
op.drop_index('ix_user_sessions_user_active', table_name='user_sessions')
|
table_name="oauth_provider_refresh_tokens",
|
||||||
op.drop_index(op.f('ix_user_sessions_refresh_token_jti'), table_name='user_sessions')
|
)
|
||||||
op.drop_index('ix_user_sessions_jti_active', table_name='user_sessions')
|
op.drop_index(
|
||||||
op.drop_index(op.f('ix_user_sessions_is_active'), table_name='user_sessions')
|
"ix_oauth_provider_refresh_tokens_expires_at",
|
||||||
op.drop_table('user_sessions')
|
table_name="oauth_provider_refresh_tokens",
|
||||||
op.drop_index(op.f('ix_user_organizations_is_active'), table_name='user_organizations')
|
)
|
||||||
op.drop_index('ix_user_org_user_active', table_name='user_organizations')
|
op.drop_index(
|
||||||
op.drop_index('ix_user_org_role', table_name='user_organizations')
|
"ix_oauth_provider_refresh_tokens_client_user",
|
||||||
op.drop_index('ix_user_org_org_active', table_name='user_organizations')
|
table_name="oauth_provider_refresh_tokens",
|
||||||
op.drop_table('user_organizations')
|
)
|
||||||
op.drop_index(op.f('ix_oauth_clients_is_active'), table_name='oauth_clients')
|
op.drop_table("oauth_provider_refresh_tokens")
|
||||||
op.drop_index(op.f('ix_oauth_clients_client_id'), table_name='oauth_clients')
|
op.drop_index("ix_oauth_consents_user_client", table_name="oauth_consents")
|
||||||
op.drop_table('oauth_clients')
|
op.drop_table("oauth_consents")
|
||||||
op.drop_index('ix_oauth_accounts_user_provider', table_name='oauth_accounts')
|
op.drop_index(
|
||||||
op.drop_index(op.f('ix_oauth_accounts_user_id'), table_name='oauth_accounts')
|
"ix_oauth_authorization_codes_expires_at",
|
||||||
op.drop_index(op.f('ix_oauth_accounts_provider_email'), table_name='oauth_accounts')
|
table_name="oauth_authorization_codes",
|
||||||
op.drop_index(op.f('ix_oauth_accounts_provider'), table_name='oauth_accounts')
|
)
|
||||||
op.drop_table('oauth_accounts')
|
op.drop_index(
|
||||||
op.drop_index(op.f('ix_users_locale'), table_name='users')
|
op.f("ix_oauth_authorization_codes_code"),
|
||||||
op.drop_index(op.f('ix_users_is_superuser'), table_name='users')
|
table_name="oauth_authorization_codes",
|
||||||
op.drop_index(op.f('ix_users_is_active'), table_name='users')
|
)
|
||||||
op.drop_index(op.f('ix_users_email'), table_name='users')
|
op.drop_index(
|
||||||
op.drop_index(op.f('ix_users_deleted_at'), table_name='users')
|
"ix_oauth_authorization_codes_client_user",
|
||||||
op.drop_table('users')
|
table_name="oauth_authorization_codes",
|
||||||
op.drop_index('ix_organizations_slug_active', table_name='organizations')
|
)
|
||||||
op.drop_index(op.f('ix_organizations_slug'), table_name='organizations')
|
op.drop_table("oauth_authorization_codes")
|
||||||
op.drop_index('ix_organizations_name_active', table_name='organizations')
|
op.drop_index(op.f("ix_user_sessions_user_id"), table_name="user_sessions")
|
||||||
op.drop_index(op.f('ix_organizations_name'), table_name='organizations')
|
op.drop_index("ix_user_sessions_user_active", table_name="user_sessions")
|
||||||
op.drop_index(op.f('ix_organizations_is_active'), table_name='organizations')
|
op.drop_index(
|
||||||
op.drop_table('organizations')
|
op.f("ix_user_sessions_refresh_token_jti"), table_name="user_sessions"
|
||||||
op.drop_index(op.f('ix_oauth_states_state'), table_name='oauth_states')
|
)
|
||||||
op.drop_table('oauth_states')
|
op.drop_index("ix_user_sessions_jti_active", table_name="user_sessions")
|
||||||
|
op.drop_index(op.f("ix_user_sessions_is_active"), table_name="user_sessions")
|
||||||
|
op.drop_table("user_sessions")
|
||||||
|
op.drop_index(
|
||||||
|
op.f("ix_user_organizations_is_active"), table_name="user_organizations"
|
||||||
|
)
|
||||||
|
op.drop_index("ix_user_org_user_active", table_name="user_organizations")
|
||||||
|
op.drop_index("ix_user_org_role", table_name="user_organizations")
|
||||||
|
op.drop_index("ix_user_org_org_active", table_name="user_organizations")
|
||||||
|
op.drop_table("user_organizations")
|
||||||
|
op.drop_index(op.f("ix_oauth_clients_is_active"), table_name="oauth_clients")
|
||||||
|
op.drop_index(op.f("ix_oauth_clients_client_id"), table_name="oauth_clients")
|
||||||
|
op.drop_table("oauth_clients")
|
||||||
|
op.drop_index("ix_oauth_accounts_user_provider", table_name="oauth_accounts")
|
||||||
|
op.drop_index(op.f("ix_oauth_accounts_user_id"), table_name="oauth_accounts")
|
||||||
|
op.drop_index(op.f("ix_oauth_accounts_provider_email"), table_name="oauth_accounts")
|
||||||
|
op.drop_index(op.f("ix_oauth_accounts_provider"), table_name="oauth_accounts")
|
||||||
|
op.drop_table("oauth_accounts")
|
||||||
|
op.drop_index(op.f("ix_users_locale"), table_name="users")
|
||||||
|
op.drop_index(op.f("ix_users_is_superuser"), table_name="users")
|
||||||
|
op.drop_index(op.f("ix_users_is_active"), table_name="users")
|
||||||
|
op.drop_index(op.f("ix_users_email"), table_name="users")
|
||||||
|
op.drop_index(op.f("ix_users_deleted_at"), table_name="users")
|
||||||
|
op.drop_table("users")
|
||||||
|
op.drop_index("ix_organizations_slug_active", table_name="organizations")
|
||||||
|
op.drop_index(op.f("ix_organizations_slug"), table_name="organizations")
|
||||||
|
op.drop_index("ix_organizations_name_active", table_name="organizations")
|
||||||
|
op.drop_index(op.f("ix_organizations_name"), table_name="organizations")
|
||||||
|
op.drop_index(op.f("ix_organizations_is_active"), table_name="organizations")
|
||||||
|
op.drop_table("organizations")
|
||||||
|
op.drop_index(op.f("ix_oauth_states_state"), table_name="oauth_states")
|
||||||
|
op.drop_table("oauth_states")
|
||||||
# ### end Alembic commands ###
|
# ### end Alembic commands ###
|
||||||
|
|||||||
@@ -114,8 +114,13 @@ def upgrade() -> None:
|
|||||||
|
|
||||||
def downgrade() -> None:
|
def downgrade() -> None:
|
||||||
# Drop indexes in reverse order
|
# Drop indexes in reverse order
|
||||||
op.drop_index("ix_perf_oauth_auth_codes_expires", table_name="oauth_authorization_codes")
|
op.drop_index(
|
||||||
op.drop_index("ix_perf_oauth_refresh_tokens_expires", table_name="oauth_provider_refresh_tokens")
|
"ix_perf_oauth_auth_codes_expires", table_name="oauth_authorization_codes"
|
||||||
|
)
|
||||||
|
op.drop_index(
|
||||||
|
"ix_perf_oauth_refresh_tokens_expires",
|
||||||
|
table_name="oauth_provider_refresh_tokens",
|
||||||
|
)
|
||||||
op.drop_index("ix_perf_user_sessions_expires", table_name="user_sessions")
|
op.drop_index("ix_perf_user_sessions_expires", table_name="user_sessions")
|
||||||
op.drop_index("ix_perf_organizations_slug_lower", table_name="organizations")
|
op.drop_index("ix_perf_organizations_slug_lower", table_name="organizations")
|
||||||
op.drop_index("ix_perf_users_active", table_name="users")
|
op.drop_index("ix_perf_users_active", table_name="users")
|
||||||
|
|||||||
@@ -0,0 +1,66 @@
|
|||||||
|
"""Enable pgvector extension
|
||||||
|
|
||||||
|
Revision ID: 0003
|
||||||
|
Revises: 0002
|
||||||
|
Create Date: 2025-12-30
|
||||||
|
|
||||||
|
This migration enables the pgvector extension for PostgreSQL, which provides
|
||||||
|
vector similarity search capabilities required for the RAG (Retrieval-Augmented
|
||||||
|
Generation) knowledge base system.
|
||||||
|
|
||||||
|
Vector Dimension Reference (per ADR-008 and SPIKE-006):
|
||||||
|
---------------------------------------------------------
|
||||||
|
The dimension size depends on the embedding model used:
|
||||||
|
|
||||||
|
| Model | Dimensions | Use Case |
|
||||||
|
|----------------------------|------------|------------------------------|
|
||||||
|
| text-embedding-3-small | 1536 | General docs, conversations |
|
||||||
|
| text-embedding-3-large | 256-3072 | High accuracy (configurable) |
|
||||||
|
| voyage-code-3 | 1024 | Code files (Python, JS, etc) |
|
||||||
|
| voyage-3-large | 1024 | High quality general purpose |
|
||||||
|
| nomic-embed-text (Ollama) | 768 | Local/fallback embedding |
|
||||||
|
|
||||||
|
Recommended defaults for Syndarix:
|
||||||
|
- Documentation/conversations: 1536 (text-embedding-3-small)
|
||||||
|
- Code files: 1024 (voyage-code-3)
|
||||||
|
|
||||||
|
Prerequisites:
|
||||||
|
--------------
|
||||||
|
This migration requires PostgreSQL with the pgvector extension installed.
|
||||||
|
The Docker Compose configuration uses `pgvector/pgvector:pg17` which includes
|
||||||
|
the extension pre-installed.
|
||||||
|
|
||||||
|
References:
|
||||||
|
-----------
|
||||||
|
- ADR-008: Knowledge Base and RAG Architecture
|
||||||
|
- SPIKE-006: Knowledge Base with pgvector for RAG System
|
||||||
|
- https://github.com/pgvector/pgvector
|
||||||
|
"""
|
||||||
|
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
from alembic import op
|
||||||
|
|
||||||
|
# revision identifiers, used by Alembic.
|
||||||
|
revision: str = "0003"
|
||||||
|
down_revision: str | None = "0002"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
|
||||||
|
"""Enable the pgvector extension.
|
||||||
|
|
||||||
|
The CREATE EXTENSION IF NOT EXISTS statement is idempotent - it will
|
||||||
|
succeed whether the extension already exists or not.
|
||||||
|
"""
|
||||||
|
op.execute("CREATE EXTENSION IF NOT EXISTS vector")
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
|
||||||
|
"""Drop the pgvector extension.
|
||||||
|
|
||||||
|
Note: This will fail if any tables with vector columns exist.
|
||||||
|
Future migrations that create vector columns should be downgraded first.
|
||||||
|
"""
|
||||||
|
op.execute("DROP EXTENSION IF EXISTS vector")
|
||||||
507
backend/app/alembic/versions/0004_add_syndarix_models.py
Normal file
507
backend/app/alembic/versions/0004_add_syndarix_models.py
Normal file
@@ -0,0 +1,507 @@
|
|||||||
|
"""Add Syndarix models
|
||||||
|
|
||||||
|
Revision ID: 0004
|
||||||
|
Revises: 0003
|
||||||
|
Create Date: 2025-12-31
|
||||||
|
|
||||||
|
This migration creates the core Syndarix domain tables:
|
||||||
|
- projects: Client engagement projects
|
||||||
|
- agent_types: Agent template configurations
|
||||||
|
- agent_instances: Spawned agent instances assigned to projects
|
||||||
|
- sprints: Sprint containers for issues
|
||||||
|
- issues: Work items (epics, stories, tasks, bugs)
|
||||||
|
"""
|
||||||
|
|
||||||
|
from collections.abc import Sequence
|
||||||
|
|
||||||
|
import sqlalchemy as sa
|
||||||
|
from alembic import op
|
||||||
|
from sqlalchemy.dialects import postgresql
|
||||||
|
|
||||||
|
# revision identifiers, used by Alembic.
|
||||||
|
revision: str = "0004"
|
||||||
|
down_revision: str | None = "0003"
|
||||||
|
branch_labels: str | Sequence[str] | None = None
|
||||||
|
depends_on: str | Sequence[str] | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
|
||||||
|
"""Create Syndarix domain tables."""
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Create projects table
|
||||||
|
# Note: ENUM types are created automatically by sa.Enum() during table creation
|
||||||
|
# =========================================================================
|
||||||
|
op.create_table(
|
||||||
|
"projects",
|
||||||
|
sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False),
|
||||||
|
sa.Column("name", sa.String(255), nullable=False),
|
||||||
|
sa.Column("slug", sa.String(255), nullable=False),
|
||||||
|
sa.Column("description", sa.Text(), nullable=True),
|
||||||
|
sa.Column(
|
||||||
|
"autonomy_level",
|
||||||
|
sa.Enum(
|
||||||
|
"full_control",
|
||||||
|
"milestone",
|
||||||
|
"autonomous",
|
||||||
|
name="autonomy_level",
|
||||||
|
),
|
||||||
|
nullable=False,
|
||||||
|
server_default="milestone",
|
||||||
|
),
|
||||||
|
sa.Column(
|
||||||
|
"status",
|
||||||
|
sa.Enum(
|
||||||
|
"active",
|
||||||
|
"paused",
|
||||||
|
"completed",
|
||||||
|
"archived",
|
||||||
|
name="project_status",
|
||||||
|
),
|
||||||
|
nullable=False,
|
||||||
|
server_default="active",
|
||||||
|
),
|
||||||
|
sa.Column(
|
||||||
|
"complexity",
|
||||||
|
sa.Enum(
|
||||||
|
"script",
|
||||||
|
"simple",
|
||||||
|
"medium",
|
||||||
|
"complex",
|
||||||
|
name="project_complexity",
|
||||||
|
),
|
||||||
|
nullable=False,
|
||||||
|
server_default="medium",
|
||||||
|
),
|
||||||
|
sa.Column(
|
||||||
|
"client_mode",
|
||||||
|
sa.Enum("technical", "auto", name="client_mode"),
|
||||||
|
nullable=False,
|
||||||
|
server_default="auto",
|
||||||
|
),
|
||||||
|
sa.Column(
|
||||||
|
"settings",
|
||||||
|
postgresql.JSONB(astext_type=sa.Text()),
|
||||||
|
nullable=False,
|
||||||
|
server_default="{}",
|
||||||
|
),
|
||||||
|
sa.Column("owner_id", postgresql.UUID(as_uuid=True), nullable=True),
|
||||||
|
sa.Column(
|
||||||
|
"created_at",
|
||||||
|
sa.DateTime(timezone=True),
|
||||||
|
nullable=False,
|
||||||
|
server_default=sa.text("now()"),
|
||||||
|
),
|
||||||
|
sa.Column(
|
||||||
|
"updated_at",
|
||||||
|
sa.DateTime(timezone=True),
|
||||||
|
nullable=False,
|
||||||
|
server_default=sa.text("now()"),
|
||||||
|
),
|
||||||
|
sa.PrimaryKeyConstraint("id"),
|
||||||
|
sa.ForeignKeyConstraint(["owner_id"], ["users.id"], ondelete="SET NULL"),
|
||||||
|
sa.UniqueConstraint("slug"),
|
||||||
|
)
|
||||||
|
# Single column indexes
|
||||||
|
op.create_index("ix_projects_name", "projects", ["name"])
|
||||||
|
op.create_index("ix_projects_slug", "projects", ["slug"])
|
||||||
|
op.create_index("ix_projects_status", "projects", ["status"])
|
||||||
|
op.create_index("ix_projects_autonomy_level", "projects", ["autonomy_level"])
|
||||||
|
op.create_index("ix_projects_complexity", "projects", ["complexity"])
|
||||||
|
op.create_index("ix_projects_client_mode", "projects", ["client_mode"])
|
||||||
|
op.create_index("ix_projects_owner_id", "projects", ["owner_id"])
|
||||||
|
# Composite indexes
|
||||||
|
op.create_index("ix_projects_slug_status", "projects", ["slug", "status"])
|
||||||
|
op.create_index("ix_projects_owner_status", "projects", ["owner_id", "status"])
|
||||||
|
op.create_index(
|
||||||
|
"ix_projects_autonomy_status", "projects", ["autonomy_level", "status"]
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
"ix_projects_complexity_status", "projects", ["complexity", "status"]
|
||||||
|
)
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Create agent_types table
|
||||||
|
# =========================================================================
|
||||||
|
op.create_table(
|
||||||
|
"agent_types",
|
||||||
|
sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False),
|
||||||
|
sa.Column("name", sa.String(255), nullable=False),
|
||||||
|
sa.Column("slug", sa.String(255), nullable=False),
|
||||||
|
sa.Column("description", sa.Text(), nullable=True),
|
||||||
|
# Areas of expertise (e.g., ["python", "fastapi", "databases"])
|
||||||
|
sa.Column(
|
||||||
|
"expertise",
|
||||||
|
postgresql.JSONB(astext_type=sa.Text()),
|
||||||
|
nullable=False,
|
||||||
|
server_default="[]",
|
||||||
|
),
|
||||||
|
# System prompt defining personality and behavior (required)
|
||||||
|
sa.Column("personality_prompt", sa.Text(), nullable=False),
|
||||||
|
# LLM model configuration
|
||||||
|
sa.Column("primary_model", sa.String(100), nullable=False),
|
||||||
|
sa.Column(
|
||||||
|
"fallback_models",
|
||||||
|
postgresql.JSONB(astext_type=sa.Text()),
|
||||||
|
nullable=False,
|
||||||
|
server_default="[]",
|
||||||
|
),
|
||||||
|
# Model parameters (temperature, max_tokens, etc.)
|
||||||
|
sa.Column(
|
||||||
|
"model_params",
|
||||||
|
postgresql.JSONB(astext_type=sa.Text()),
|
||||||
|
nullable=False,
|
||||||
|
server_default="{}",
|
||||||
|
),
|
||||||
|
# MCP servers this agent can connect to
|
||||||
|
sa.Column(
|
||||||
|
"mcp_servers",
|
||||||
|
postgresql.JSONB(astext_type=sa.Text()),
|
||||||
|
nullable=False,
|
||||||
|
server_default="[]",
|
||||||
|
),
|
||||||
|
# Tool permissions configuration
|
||||||
|
sa.Column(
|
||||||
|
"tool_permissions",
|
||||||
|
postgresql.JSONB(astext_type=sa.Text()),
|
||||||
|
nullable=False,
|
||||||
|
server_default="{}",
|
||||||
|
),
|
||||||
|
sa.Column("is_active", sa.Boolean(), nullable=False, server_default="true"),
|
||||||
|
sa.Column(
|
||||||
|
"created_at",
|
||||||
|
sa.DateTime(timezone=True),
|
||||||
|
nullable=False,
|
||||||
|
server_default=sa.text("now()"),
|
||||||
|
),
|
||||||
|
sa.Column(
|
||||||
|
"updated_at",
|
||||||
|
sa.DateTime(timezone=True),
|
||||||
|
nullable=False,
|
||||||
|
server_default=sa.text("now()"),
|
||||||
|
),
|
||||||
|
sa.PrimaryKeyConstraint("id"),
|
||||||
|
sa.UniqueConstraint("slug"),
|
||||||
|
)
|
||||||
|
# Single column indexes
|
||||||
|
op.create_index("ix_agent_types_name", "agent_types", ["name"])
|
||||||
|
op.create_index("ix_agent_types_slug", "agent_types", ["slug"])
|
||||||
|
op.create_index("ix_agent_types_is_active", "agent_types", ["is_active"])
|
||||||
|
# Composite indexes
|
||||||
|
op.create_index("ix_agent_types_slug_active", "agent_types", ["slug", "is_active"])
|
||||||
|
op.create_index("ix_agent_types_name_active", "agent_types", ["name", "is_active"])
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Create agent_instances table
|
||||||
|
# =========================================================================
|
||||||
|
op.create_table(
|
||||||
|
"agent_instances",
|
||||||
|
sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False),
|
||||||
|
sa.Column("agent_type_id", postgresql.UUID(as_uuid=True), nullable=False),
|
||||||
|
sa.Column("project_id", postgresql.UUID(as_uuid=True), nullable=False),
|
||||||
|
sa.Column("name", sa.String(100), nullable=False),
|
||||||
|
sa.Column(
|
||||||
|
"status",
|
||||||
|
sa.Enum(
|
||||||
|
"idle",
|
||||||
|
"working",
|
||||||
|
"waiting",
|
||||||
|
"paused",
|
||||||
|
"terminated",
|
||||||
|
name="agent_status",
|
||||||
|
),
|
||||||
|
nullable=False,
|
||||||
|
server_default="idle",
|
||||||
|
),
|
||||||
|
sa.Column("current_task", sa.Text(), nullable=True),
|
||||||
|
# Short-term memory (conversation context, recent decisions)
|
||||||
|
sa.Column(
|
||||||
|
"short_term_memory",
|
||||||
|
postgresql.JSONB(astext_type=sa.Text()),
|
||||||
|
nullable=False,
|
||||||
|
server_default="{}",
|
||||||
|
),
|
||||||
|
# Reference to long-term memory in vector store
|
||||||
|
sa.Column("long_term_memory_ref", sa.String(500), nullable=True),
|
||||||
|
# Session ID for active MCP connections
|
||||||
|
sa.Column("session_id", sa.String(255), nullable=True),
|
||||||
|
# Activity tracking
|
||||||
|
sa.Column("last_activity_at", sa.DateTime(timezone=True), nullable=True),
|
||||||
|
sa.Column("terminated_at", sa.DateTime(timezone=True), nullable=True),
|
||||||
|
# Usage metrics
|
||||||
|
sa.Column("tasks_completed", sa.Integer(), nullable=False, server_default="0"),
|
||||||
|
sa.Column("tokens_used", sa.BigInteger(), nullable=False, server_default="0"),
|
||||||
|
sa.Column(
|
||||||
|
"cost_incurred",
|
||||||
|
sa.Numeric(precision=10, scale=4),
|
||||||
|
nullable=False,
|
||||||
|
server_default="0",
|
||||||
|
),
|
||||||
|
sa.Column(
|
||||||
|
"created_at",
|
||||||
|
sa.DateTime(timezone=True),
|
||||||
|
nullable=False,
|
||||||
|
server_default=sa.text("now()"),
|
||||||
|
),
|
||||||
|
sa.Column(
|
||||||
|
"updated_at",
|
||||||
|
sa.DateTime(timezone=True),
|
||||||
|
nullable=False,
|
||||||
|
server_default=sa.text("now()"),
|
||||||
|
),
|
||||||
|
sa.PrimaryKeyConstraint("id"),
|
||||||
|
sa.ForeignKeyConstraint(
|
||||||
|
["agent_type_id"], ["agent_types.id"], ondelete="RESTRICT"
|
||||||
|
),
|
||||||
|
sa.ForeignKeyConstraint(["project_id"], ["projects.id"], ondelete="CASCADE"),
|
||||||
|
)
|
||||||
|
# Single column indexes
|
||||||
|
op.create_index("ix_agent_instances_name", "agent_instances", ["name"])
|
||||||
|
op.create_index("ix_agent_instances_status", "agent_instances", ["status"])
|
||||||
|
op.create_index(
|
||||||
|
"ix_agent_instances_agent_type_id", "agent_instances", ["agent_type_id"]
|
||||||
|
)
|
||||||
|
op.create_index("ix_agent_instances_project_id", "agent_instances", ["project_id"])
|
||||||
|
op.create_index("ix_agent_instances_session_id", "agent_instances", ["session_id"])
|
||||||
|
op.create_index(
|
||||||
|
"ix_agent_instances_last_activity_at", "agent_instances", ["last_activity_at"]
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
"ix_agent_instances_terminated_at", "agent_instances", ["terminated_at"]
|
||||||
|
)
|
||||||
|
# Composite indexes
|
||||||
|
op.create_index(
|
||||||
|
"ix_agent_instances_project_status",
|
||||||
|
"agent_instances",
|
||||||
|
["project_id", "status"],
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
"ix_agent_instances_type_status",
|
||||||
|
"agent_instances",
|
||||||
|
["agent_type_id", "status"],
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
"ix_agent_instances_project_type",
|
||||||
|
"agent_instances",
|
||||||
|
["project_id", "agent_type_id"],
|
||||||
|
)
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Create sprints table (before issues for FK reference)
|
||||||
|
# =========================================================================
|
||||||
|
op.create_table(
|
||||||
|
"sprints",
|
||||||
|
sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False),
|
||||||
|
sa.Column("project_id", postgresql.UUID(as_uuid=True), nullable=False),
|
||||||
|
sa.Column("name", sa.String(255), nullable=False),
|
||||||
|
sa.Column("number", sa.Integer(), nullable=False),
|
||||||
|
sa.Column("goal", sa.Text(), nullable=True),
|
||||||
|
sa.Column("start_date", sa.Date(), nullable=False),
|
||||||
|
sa.Column("end_date", sa.Date(), nullable=False),
|
||||||
|
sa.Column(
|
||||||
|
"status",
|
||||||
|
sa.Enum(
|
||||||
|
"planned",
|
||||||
|
"active",
|
||||||
|
"in_review",
|
||||||
|
"completed",
|
||||||
|
"cancelled",
|
||||||
|
name="sprint_status",
|
||||||
|
),
|
||||||
|
nullable=False,
|
||||||
|
server_default="planned",
|
||||||
|
),
|
||||||
|
sa.Column("planned_points", sa.Integer(), nullable=True),
|
||||||
|
sa.Column("velocity", sa.Integer(), nullable=True),
|
||||||
|
sa.Column(
|
||||||
|
"created_at",
|
||||||
|
sa.DateTime(timezone=True),
|
||||||
|
nullable=False,
|
||||||
|
server_default=sa.text("now()"),
|
||||||
|
),
|
||||||
|
sa.Column(
|
||||||
|
"updated_at",
|
||||||
|
sa.DateTime(timezone=True),
|
||||||
|
nullable=False,
|
||||||
|
server_default=sa.text("now()"),
|
||||||
|
),
|
||||||
|
sa.PrimaryKeyConstraint("id"),
|
||||||
|
sa.ForeignKeyConstraint(["project_id"], ["projects.id"], ondelete="CASCADE"),
|
||||||
|
sa.UniqueConstraint("project_id", "number", name="uq_sprint_project_number"),
|
||||||
|
)
|
||||||
|
# Single column indexes
|
||||||
|
op.create_index("ix_sprints_project_id", "sprints", ["project_id"])
|
||||||
|
op.create_index("ix_sprints_status", "sprints", ["status"])
|
||||||
|
op.create_index("ix_sprints_start_date", "sprints", ["start_date"])
|
||||||
|
op.create_index("ix_sprints_end_date", "sprints", ["end_date"])
|
||||||
|
# Composite indexes
|
||||||
|
op.create_index("ix_sprints_project_status", "sprints", ["project_id", "status"])
|
||||||
|
op.create_index("ix_sprints_project_number", "sprints", ["project_id", "number"])
|
||||||
|
op.create_index("ix_sprints_date_range", "sprints", ["start_date", "end_date"])
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Create issues table
|
||||||
|
# =========================================================================
|
||||||
|
op.create_table(
|
||||||
|
"issues",
|
||||||
|
sa.Column("id", postgresql.UUID(as_uuid=True), nullable=False),
|
||||||
|
sa.Column("project_id", postgresql.UUID(as_uuid=True), nullable=False),
|
||||||
|
# Parent issue for hierarchy (Epic -> Story -> Task)
|
||||||
|
sa.Column("parent_id", postgresql.UUID(as_uuid=True), nullable=True),
|
||||||
|
# Issue type (epic, story, task, bug)
|
||||||
|
sa.Column(
|
||||||
|
"type",
|
||||||
|
sa.Enum(
|
||||||
|
"epic",
|
||||||
|
"story",
|
||||||
|
"task",
|
||||||
|
"bug",
|
||||||
|
name="issue_type",
|
||||||
|
),
|
||||||
|
nullable=False,
|
||||||
|
server_default="task",
|
||||||
|
),
|
||||||
|
# Reporter (who created this issue)
|
||||||
|
sa.Column("reporter_id", postgresql.UUID(as_uuid=True), nullable=True),
|
||||||
|
# Issue content
|
||||||
|
sa.Column("title", sa.String(500), nullable=False),
|
||||||
|
sa.Column("body", sa.Text(), nullable=False, server_default=""),
|
||||||
|
# Status and priority
|
||||||
|
sa.Column(
|
||||||
|
"status",
|
||||||
|
sa.Enum(
|
||||||
|
"open",
|
||||||
|
"in_progress",
|
||||||
|
"in_review",
|
||||||
|
"blocked",
|
||||||
|
"closed",
|
||||||
|
name="issue_status",
|
||||||
|
),
|
||||||
|
nullable=False,
|
||||||
|
server_default="open",
|
||||||
|
),
|
||||||
|
sa.Column(
|
||||||
|
"priority",
|
||||||
|
sa.Enum(
|
||||||
|
"low",
|
||||||
|
"medium",
|
||||||
|
"high",
|
||||||
|
"critical",
|
||||||
|
name="issue_priority",
|
||||||
|
),
|
||||||
|
nullable=False,
|
||||||
|
server_default="medium",
|
||||||
|
),
|
||||||
|
# Labels for categorization
|
||||||
|
sa.Column(
|
||||||
|
"labels",
|
||||||
|
postgresql.JSONB(astext_type=sa.Text()),
|
||||||
|
nullable=False,
|
||||||
|
server_default="[]",
|
||||||
|
),
|
||||||
|
# Assignment - agent or human (mutually exclusive)
|
||||||
|
sa.Column("assigned_agent_id", postgresql.UUID(as_uuid=True), nullable=True),
|
||||||
|
sa.Column("human_assignee", sa.String(255), nullable=True),
|
||||||
|
# Sprint association
|
||||||
|
sa.Column("sprint_id", postgresql.UUID(as_uuid=True), nullable=True),
|
||||||
|
# Estimation
|
||||||
|
sa.Column("story_points", sa.Integer(), nullable=True),
|
||||||
|
sa.Column("due_date", sa.Date(), nullable=True),
|
||||||
|
# External tracker integration (String for flexibility)
|
||||||
|
sa.Column("external_tracker_type", sa.String(50), nullable=True),
|
||||||
|
sa.Column("external_issue_id", sa.String(255), nullable=True),
|
||||||
|
sa.Column("remote_url", sa.String(1000), nullable=True),
|
||||||
|
sa.Column("external_issue_number", sa.Integer(), nullable=True),
|
||||||
|
# Sync status
|
||||||
|
sa.Column(
|
||||||
|
"sync_status",
|
||||||
|
sa.Enum(
|
||||||
|
"synced",
|
||||||
|
"pending",
|
||||||
|
"conflict",
|
||||||
|
"error",
|
||||||
|
name="sync_status",
|
||||||
|
),
|
||||||
|
nullable=False,
|
||||||
|
server_default="synced",
|
||||||
|
),
|
||||||
|
sa.Column("last_synced_at", sa.DateTime(timezone=True), nullable=True),
|
||||||
|
sa.Column("external_updated_at", sa.DateTime(timezone=True), nullable=True),
|
||||||
|
# Lifecycle
|
||||||
|
sa.Column("closed_at", sa.DateTime(timezone=True), nullable=True),
|
||||||
|
sa.Column(
|
||||||
|
"created_at",
|
||||||
|
sa.DateTime(timezone=True),
|
||||||
|
nullable=False,
|
||||||
|
server_default=sa.text("now()"),
|
||||||
|
),
|
||||||
|
sa.Column(
|
||||||
|
"updated_at",
|
||||||
|
sa.DateTime(timezone=True),
|
||||||
|
nullable=False,
|
||||||
|
server_default=sa.text("now()"),
|
||||||
|
),
|
||||||
|
sa.PrimaryKeyConstraint("id"),
|
||||||
|
sa.ForeignKeyConstraint(["project_id"], ["projects.id"], ondelete="CASCADE"),
|
||||||
|
sa.ForeignKeyConstraint(["parent_id"], ["issues.id"], ondelete="CASCADE"),
|
||||||
|
sa.ForeignKeyConstraint(["sprint_id"], ["sprints.id"], ondelete="SET NULL"),
|
||||||
|
sa.ForeignKeyConstraint(
|
||||||
|
["assigned_agent_id"], ["agent_instances.id"], ondelete="SET NULL"
|
||||||
|
),
|
||||||
|
)
|
||||||
|
# Single column indexes
|
||||||
|
op.create_index("ix_issues_project_id", "issues", ["project_id"])
|
||||||
|
op.create_index("ix_issues_parent_id", "issues", ["parent_id"])
|
||||||
|
op.create_index("ix_issues_type", "issues", ["type"])
|
||||||
|
op.create_index("ix_issues_reporter_id", "issues", ["reporter_id"])
|
||||||
|
op.create_index("ix_issues_status", "issues", ["status"])
|
||||||
|
op.create_index("ix_issues_priority", "issues", ["priority"])
|
||||||
|
op.create_index("ix_issues_assigned_agent_id", "issues", ["assigned_agent_id"])
|
||||||
|
op.create_index("ix_issues_human_assignee", "issues", ["human_assignee"])
|
||||||
|
op.create_index("ix_issues_sprint_id", "issues", ["sprint_id"])
|
||||||
|
op.create_index("ix_issues_due_date", "issues", ["due_date"])
|
||||||
|
op.create_index(
|
||||||
|
"ix_issues_external_tracker_type", "issues", ["external_tracker_type"]
|
||||||
|
)
|
||||||
|
op.create_index("ix_issues_sync_status", "issues", ["sync_status"])
|
||||||
|
op.create_index("ix_issues_closed_at", "issues", ["closed_at"])
|
||||||
|
# Composite indexes
|
||||||
|
op.create_index("ix_issues_project_status", "issues", ["project_id", "status"])
|
||||||
|
op.create_index("ix_issues_project_priority", "issues", ["project_id", "priority"])
|
||||||
|
op.create_index("ix_issues_project_sprint", "issues", ["project_id", "sprint_id"])
|
||||||
|
op.create_index("ix_issues_project_type", "issues", ["project_id", "type"])
|
||||||
|
op.create_index(
|
||||||
|
"ix_issues_project_agent", "issues", ["project_id", "assigned_agent_id"]
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
"ix_issues_project_status_priority",
|
||||||
|
"issues",
|
||||||
|
["project_id", "status", "priority"],
|
||||||
|
)
|
||||||
|
op.create_index(
|
||||||
|
"ix_issues_external_tracker_id",
|
||||||
|
"issues",
|
||||||
|
["external_tracker_type", "external_issue_id"],
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Drop Syndarix domain tables."""
    # Drop child tables before their parents so FK constraints never block
    # a drop: issues -> sprints -> agent_instances -> agent_types -> projects.
    for table_name in (
        "issues",
        "sprints",
        "agent_instances",
        "agent_types",
        "projects",
    ):
        op.drop_table(table_name)

    # ENUM types can only be removed once no column references them,
    # which is guaranteed after the table drops above.
    for enum_name in (
        "sprint_status",
        "sync_status",
        "issue_priority",
        "issue_status",
        "issue_type",
        "agent_status",
        "client_mode",
        "project_complexity",
        "project_status",
        "autonomy_level",
    ):
        op.execute(f"DROP TYPE IF EXISTS {enum_name}")
|
||||||
@@ -151,3 +151,83 @@ async def get_optional_current_user(
|
|||||||
return user
|
return user
|
||||||
except (TokenExpiredError, TokenInvalidError):
|
except (TokenExpiredError, TokenInvalidError):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
async def get_current_user_sse(
    db: AsyncSession = Depends(get_db),
    authorization: str | None = Header(None),
    token: str | None = None,  # Query parameter - passed directly from route
) -> User:
    """
    Get the current authenticated user for SSE endpoints.

    SSE (Server-Sent Events) via EventSource API doesn't support custom headers,
    so this dependency accepts tokens from either:
    1. Authorization header (preferred, for non-EventSource clients)
    2. Query parameter 'token' (fallback for EventSource compatibility)

    Security note: Query parameter tokens appear in server logs and browser history.
    Consider implementing short-lived SSE-specific tokens for production if this
    is a concern. The current approach is acceptable for internal/trusted networks.

    Args:
        db: Database session
        authorization: Authorization header (Bearer token)
        token: Query parameter token (fallback for EventSource)

    Returns:
        User: The authenticated user

    Raises:
        HTTPException: If authentication fails
    """
    # Try Authorization header first (preferred)
    auth_token = None
    if authorization:
        scheme, param = get_authorization_scheme_param(authorization)
        if scheme.lower() == "bearer" and param:
            auth_token = param

    # Fall back to query parameter if no header token
    if not auth_token and token:
        auth_token = token

    if not auth_token:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Not authenticated",
            headers={"WWW-Authenticate": "Bearer"},
        )

    try:
        # Decode token and get user ID
        token_data = get_token_data(auth_token)

        # Get user from database
        result = await db.execute(select(User).where(User.id == token_data.user_id))
        user = result.scalar_one_or_none()

        if not user:
            raise HTTPException(
                status_code=status.HTTP_404_NOT_FOUND, detail="User not found"
            )

        if not user.is_active:
            raise HTTPException(
                status_code=status.HTTP_403_FORBIDDEN, detail="Inactive user"
            )

        return user

    except TokenExpiredError as exc:
        # Chain the cause (PEP 3134) so tracebacks show why the 401 was raised.
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Token expired",
            headers={"WWW-Authenticate": "Bearer"},
        ) from exc
    except TokenInvalidError as exc:
        raise HTTPException(
            status_code=status.HTTP_401_UNAUTHORIZED,
            detail="Could not validate credentials",
            headers={"WWW-Authenticate": "Bearer"},
        ) from exc
|
||||||
|
|||||||
36
backend/app/api/dependencies/event_bus.py
Normal file
36
backend/app/api/dependencies/event_bus.py
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
"""
|
||||||
|
Event bus dependency for FastAPI routes.
|
||||||
|
|
||||||
|
This module provides the FastAPI dependency for injecting the EventBus
|
||||||
|
into route handlers. The event bus is a singleton that maintains
|
||||||
|
Redis pub/sub connections for real-time event streaming.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from app.services.event_bus import (
|
||||||
|
EventBus,
|
||||||
|
get_connected_event_bus as _get_connected_event_bus,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def get_event_bus() -> EventBus:
    """
    FastAPI dependency yielding the shared, already-connected EventBus.

    A single EventBus instance backs the whole application; it owns the
    Redis pub/sub connections used for real-time event streaming. It is
    lazily created and connected on first access and is expected to be
    torn down at application shutdown via close_event_bus().

    Usage:
        @router.get("/events/stream")
        async def stream_events(
            event_bus: EventBus = Depends(get_event_bus)
        ):
            ...

    Returns:
        EventBus: The global connected event bus instance

    Raises:
        EventBusConnectionError: If connection to Redis fails
    """
    return await _get_connected_event_bus()
|
||||||
@@ -2,11 +2,18 @@ from fastapi import APIRouter
|
|||||||
|
|
||||||
from app.api.routes import (
|
from app.api.routes import (
|
||||||
admin,
|
admin,
|
||||||
|
agent_types,
|
||||||
|
agents,
|
||||||
auth,
|
auth,
|
||||||
|
events,
|
||||||
|
issues,
|
||||||
|
mcp,
|
||||||
oauth,
|
oauth,
|
||||||
oauth_provider,
|
oauth_provider,
|
||||||
organizations,
|
organizations,
|
||||||
|
projects,
|
||||||
sessions,
|
sessions,
|
||||||
|
sprints,
|
||||||
users,
|
users,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -22,3 +29,22 @@ api_router.include_router(admin.router, prefix="/admin", tags=["Admin"])
|
|||||||
api_router.include_router(
|
api_router.include_router(
|
||||||
organizations.router, prefix="/organizations", tags=["Organizations"]
|
organizations.router, prefix="/organizations", tags=["Organizations"]
|
||||||
)
|
)
|
||||||
|
# SSE events router - no prefix, routes define full paths
|
||||||
|
api_router.include_router(events.router, tags=["Events"])
|
||||||
|
|
||||||
|
# MCP (Model Context Protocol) router
|
||||||
|
api_router.include_router(mcp.router, prefix="/mcp", tags=["MCP"])
|
||||||
|
|
||||||
|
# Syndarix domain routers
|
||||||
|
api_router.include_router(projects.router, prefix="/projects", tags=["Projects"])
|
||||||
|
api_router.include_router(
|
||||||
|
agent_types.router, prefix="/agent-types", tags=["Agent Types"]
|
||||||
|
)
|
||||||
|
# Issues router - routes include /projects/{project_id}/issues paths
|
||||||
|
api_router.include_router(issues.router, tags=["Issues"])
|
||||||
|
# Agents router - routes include /projects/{project_id}/agents paths
|
||||||
|
api_router.include_router(agents.router, tags=["Agents"])
|
||||||
|
# Sprints router - routes need prefix as they use /projects/{project_id}/sprints paths
|
||||||
|
api_router.include_router(
|
||||||
|
sprints.router, prefix="/projects/{project_id}/sprints", tags=["Sprints"]
|
||||||
|
)
|
||||||
|
|||||||
462
backend/app/api/routes/agent_types.py
Normal file
462
backend/app/api/routes/agent_types.py
Normal file
@@ -0,0 +1,462 @@
|
|||||||
|
# app/api/routes/agent_types.py
|
||||||
|
"""
|
||||||
|
AgentType configuration API endpoints.
|
||||||
|
|
||||||
|
Provides CRUD operations for managing AI agent type templates.
|
||||||
|
Agent types define the base configuration (model, personality, expertise)
|
||||||
|
from which agent instances are spawned for projects.
|
||||||
|
|
||||||
|
Authorization:
|
||||||
|
- Read endpoints: Any authenticated user
|
||||||
|
- Write endpoints (create, update, delete): Superusers only
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import Any
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, Query, Request, status
|
||||||
|
from slowapi import Limiter
|
||||||
|
from slowapi.util import get_remote_address
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
|
from app.api.dependencies.auth import get_current_user
|
||||||
|
from app.api.dependencies.permissions import require_superuser
|
||||||
|
from app.core.database import get_db
|
||||||
|
from app.core.exceptions import (
|
||||||
|
DuplicateError,
|
||||||
|
ErrorCode,
|
||||||
|
NotFoundError,
|
||||||
|
)
|
||||||
|
from app.crud.syndarix.agent_type import agent_type as agent_type_crud
|
||||||
|
from app.models.user import User
|
||||||
|
from app.schemas.common import (
|
||||||
|
MessageResponse,
|
||||||
|
PaginatedResponse,
|
||||||
|
PaginationParams,
|
||||||
|
create_pagination_meta,
|
||||||
|
)
|
||||||
|
from app.schemas.syndarix import (
|
||||||
|
AgentTypeCreate,
|
||||||
|
AgentTypeResponse,
|
||||||
|
AgentTypeUpdate,
|
||||||
|
)
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Initialize limiter for this router
|
||||||
|
limiter = Limiter(key_func=get_remote_address)
|
||||||
|
|
||||||
|
# Use higher rate limits in test environment
|
||||||
|
IS_TEST = os.getenv("IS_TEST", "False") == "True"
|
||||||
|
RATE_MULTIPLIER = 100 if IS_TEST else 1
|
||||||
|
|
||||||
|
|
||||||
|
def _build_agent_type_response(
    agent_type: Any,
    instance_count: int = 0,
) -> AgentTypeResponse:
    """
    Convert an AgentType ORM object into its API response schema.

    Args:
        agent_type: AgentType model instance
        instance_count: Number of agent instances for this type

    Returns:
        AgentTypeResponse schema
    """
    # Attributes copied one-to-one from the model onto the schema.
    mirrored_fields = (
        "id",
        "name",
        "slug",
        "description",
        "expertise",
        "personality_prompt",
        "primary_model",
        "fallback_models",
        "model_params",
        "mcp_servers",
        "tool_permissions",
        "is_active",
        "created_at",
        "updated_at",
    )
    payload = {field: getattr(agent_type, field) for field in mirrored_fields}
    # instance_count is computed by the caller, not stored on the model.
    payload["instance_count"] = instance_count
    return AgentTypeResponse(**payload)
|
||||||
|
|
||||||
|
|
||||||
|
# ===== Write Endpoints (Admin Only) =====
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "",
    response_model=AgentTypeResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Create Agent Type",
    description="Create a new agent type configuration (admin only)",
    operation_id="create_agent_type",
)
@limiter.limit(f"{20 * RATE_MULTIPLIER}/minute")
async def create_agent_type(
    request: Request,
    agent_type_in: AgentTypeCreate,
    admin: User = Depends(require_superuser),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Create a new agent type configuration.

    Agent types define templates for AI agents including:
    - Model configuration (primary model, fallback models, parameters)
    - Personality and expertise areas
    - MCP server integrations and tool permissions

    Requires superuser privileges.

    Args:
        request: FastAPI request object (required by the rate limiter)
        agent_type_in: Agent type creation data
        admin: Authenticated superuser
        db: Database session

    Returns:
        The created agent type configuration

    Raises:
        DuplicateError: If slug already exists
    """
    try:
        agent_type = await agent_type_crud.create(db, obj_in=agent_type_in)
        # Lazy %-style args: formatting is skipped when INFO is disabled.
        logger.info(
            "Admin %s created agent type: %s (slug: %s)",
            admin.email,
            agent_type.name,
            agent_type.slug,
        )
        # A freshly created type has no instances yet.
        return _build_agent_type_response(agent_type, instance_count=0)

    except ValueError as e:
        # ValueError from the CRUD layer — presumably a slug collision
        # (see Raises above); TODO confirm against agent_type_crud.create.
        logger.warning("Failed to create agent type: %s", e)
        # Chain the cause (PEP 3134) so the original error survives in tracebacks.
        raise DuplicateError(
            message=str(e),
            error_code=ErrorCode.ALREADY_EXISTS,
            field="slug",
        ) from e
    except Exception as e:
        logger.error("Error creating agent type: %s", e, exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.patch(
    "/{agent_type_id}",
    response_model=AgentTypeResponse,
    summary="Update Agent Type",
    description="Update an existing agent type configuration (admin only)",
    operation_id="update_agent_type",
)
@limiter.limit(f"{30 * RATE_MULTIPLIER}/minute")
async def update_agent_type(
    request: Request,
    agent_type_id: UUID,
    agent_type_in: AgentTypeUpdate,
    admin: User = Depends(require_superuser),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Update an existing agent type configuration.

    Partial updates are supported - only provided fields will be updated.

    Requires superuser privileges.

    Args:
        request: FastAPI request object (required by the rate limiter)
        agent_type_id: UUID of the agent type to update
        agent_type_in: Agent type update data
        admin: Authenticated superuser
        db: Database session

    Returns:
        The updated agent type configuration

    Raises:
        NotFoundError: If agent type not found
        DuplicateError: If new slug already exists
    """
    try:
        # Verify agent type exists (also fetches the current instance count
        # so the response can report it without a second query).
        result = await agent_type_crud.get_with_instance_count(
            db, agent_type_id=agent_type_id
        )
        if not result:
            raise NotFoundError(
                message=f"Agent type {agent_type_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )

        existing_type = result["agent_type"]
        instance_count = result["instance_count"]

        # Perform update
        updated_type = await agent_type_crud.update(
            db, db_obj=existing_type, obj_in=agent_type_in
        )

        # Lazy %-style args: formatting is skipped when INFO is disabled.
        logger.info(
            "Admin %s updated agent type: %s (id: %s)",
            admin.email,
            updated_type.name,
            agent_type_id,
        )

        return _build_agent_type_response(updated_type, instance_count=instance_count)

    except NotFoundError:
        raise
    except ValueError as e:
        # ValueError from the CRUD layer — presumably a slug collision
        # (see Raises above); TODO confirm against agent_type_crud.update.
        logger.warning("Failed to update agent type %s: %s", agent_type_id, e)
        # Chain the cause (PEP 3134) so the original error survives in tracebacks.
        raise DuplicateError(
            message=str(e),
            error_code=ErrorCode.ALREADY_EXISTS,
            field="slug",
        ) from e
    except Exception as e:
        logger.error("Error updating agent type %s: %s", agent_type_id, e, exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete(
    "/{agent_type_id}",
    response_model=MessageResponse,
    summary="Deactivate Agent Type",
    description="Deactivate an agent type (soft delete, admin only)",
    operation_id="deactivate_agent_type",
)
@limiter.limit(f"{10 * RATE_MULTIPLIER}/minute")
async def deactivate_agent_type(
    request: Request,
    agent_type_id: UUID,
    admin: User = Depends(require_superuser),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Deactivate an agent type (soft delete).

    Rather than deleting the row, this flips is_active to False so that
    existing agent instances keep a valid foreign-key target.

    Requires superuser privileges.

    Args:
        request: FastAPI request object (required by the rate limiter)
        agent_type_id: UUID of the agent type to deactivate
        admin: Authenticated superuser
        db: Database session

    Returns:
        Success message

    Raises:
        NotFoundError: If agent type not found
    """
    try:
        # The CRUD helper returns the row it flipped, or a falsy value
        # when no such agent type exists.
        deactivated_row = await agent_type_crud.deactivate(
            db, agent_type_id=agent_type_id
        )

        if not deactivated_row:
            raise NotFoundError(
                message=f"Agent type {agent_type_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )

        logger.info(
            f"Admin {admin.email} deactivated agent type: {deactivated_row.name} "
            f"(id: {agent_type_id})"
        )

        return MessageResponse(
            success=True,
            message=f"Agent type '{deactivated_row.name}' has been deactivated",
        )

    except NotFoundError:
        raise
    except Exception as e:
        logger.error(
            f"Error deactivating agent type {agent_type_id}: {e!s}", exc_info=True
        )
        raise
|
||||||
|
|
||||||
|
|
||||||
|
# ===== Read Endpoints (Authenticated Users) =====
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "",
    response_model=PaginatedResponse[AgentTypeResponse],
    summary="List Agent Types",
    description="Get paginated list of active agent types",
    operation_id="list_agent_types",
)
@limiter.limit(f"{60 * RATE_MULTIPLIER}/minute")
async def list_agent_types(
    request: Request,
    pagination: PaginationParams = Depends(),
    is_active: bool = Query(True, description="Filter by active status"),
    search: str | None = Query(None, description="Search by name, slug, description"),
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    List all agent types with pagination and filtering.

    Active types are returned by default; pass is_active=false to see
    deactivated types as well (useful for admin views).

    Args:
        request: FastAPI request object (required by the rate limiter)
        pagination: Pagination parameters (page, limit)
        is_active: Filter by active status (default: True)
        search: Optional search term for name, slug, description
        current_user: Authenticated user
        db: Database session

    Returns:
        Paginated list of agent types with instance counts
    """
    try:
        # One query yields both the page of rows and the overall total.
        rows, total = await agent_type_crud.get_multi_with_instance_counts(
            db,
            skip=pagination.offset,
            limit=pagination.limit,
            is_active=is_active,
            search=search,
        )

        # Each row bundles the model with its precomputed instance count.
        page_items = [
            _build_agent_type_response(
                row["agent_type"],
                instance_count=row["instance_count"],
            )
            for row in rows
        ]

        meta = create_pagination_meta(
            total=total,
            page=pagination.page,
            limit=pagination.limit,
            items_count=len(page_items),
        )

        return PaginatedResponse(data=page_items, pagination=meta)

    except Exception as e:
        logger.error(f"Error listing agent types: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/{agent_type_id}",
    response_model=AgentTypeResponse,
    summary="Get Agent Type",
    description="Get agent type details by ID",
    operation_id="get_agent_type",
)
@limiter.limit(f"{100 * RATE_MULTIPLIER}/minute")
async def get_agent_type(
    request: Request,
    agent_type_id: UUID,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Get detailed information about a specific agent type.

    Args:
        request: FastAPI request object (required by the rate limiter)
        agent_type_id: UUID of the agent type
        current_user: Authenticated user
        db: Database session

    Returns:
        Agent type details with instance count

    Raises:
        NotFoundError: If agent type not found
    """
    try:
        # Fetch the model together with its instance count in one call.
        lookup = await agent_type_crud.get_with_instance_count(
            db, agent_type_id=agent_type_id
        )

        if not lookup:
            raise NotFoundError(
                message=f"Agent type {agent_type_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )

        return _build_agent_type_response(
            lookup["agent_type"],
            instance_count=lookup["instance_count"],
        )

    except NotFoundError:
        raise
    except Exception as e:
        logger.error(f"Error getting agent type {agent_type_id}: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/slug/{slug}",
    response_model=AgentTypeResponse,
    summary="Get Agent Type by Slug",
    description="Get agent type details by slug",
    operation_id="get_agent_type_by_slug",
)
@limiter.limit(f"{100 * RATE_MULTIPLIER}/minute")
async def get_agent_type_by_slug(
    request: Request,
    slug: str,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Fetch a single agent type by its human-readable slug.

    Slugs (e.g. "product-owner", "backend-engineer") are stable identifiers
    that are convenient in configuration files and external integrations.

    Args:
        request: FastAPI request object (required by the rate limiter)
        slug: Slug identifier of the agent type
        current_user: Authenticated user
        db: Database session

    Returns:
        Agent type details, including the number of instances spawned from it

    Raises:
        NotFoundError: If no agent type exists with the given slug
    """
    try:
        agent_type = await agent_type_crud.get_by_slug(db, slug=slug)
        if not agent_type:
            raise NotFoundError(
                message=f"Agent type with slug '{slug}' not found",
                error_code=ErrorCode.NOT_FOUND,
            )

        # Second lookup: the slug query does not join the instance count.
        count_row = await agent_type_crud.get_with_instance_count(
            db, agent_type_id=agent_type.id
        )
        instance_count = count_row["instance_count"] if count_row else 0

        return _build_agent_type_response(agent_type, instance_count=instance_count)
    except NotFoundError:
        raise
    except Exception as e:
        logger.error(f"Error getting agent type by slug '{slug}': {e!s}", exc_info=True)
        raise
|
||||||
984
backend/app/api/routes/agents.py
Normal file
984
backend/app/api/routes/agents.py
Normal file
@@ -0,0 +1,984 @@
|
|||||||
|
# app/api/routes/agents.py
|
||||||
|
"""
|
||||||
|
Agent Instance management endpoints for Syndarix projects.
|
||||||
|
|
||||||
|
These endpoints allow project owners and superusers to manage AI agent instances
|
||||||
|
within their projects, including spawning, pausing, resuming, and terminating agents.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import Any
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, Query, Request, status
|
||||||
|
from slowapi import Limiter
|
||||||
|
from slowapi.util import get_remote_address
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
|
from app.api.dependencies.auth import get_current_user
|
||||||
|
from app.core.database import get_db
|
||||||
|
from app.core.exceptions import (
|
||||||
|
AuthorizationError,
|
||||||
|
NotFoundError,
|
||||||
|
ValidationException,
|
||||||
|
)
|
||||||
|
from app.crud.syndarix.agent_instance import agent_instance as agent_instance_crud
|
||||||
|
from app.crud.syndarix.agent_type import agent_type as agent_type_crud
|
||||||
|
from app.crud.syndarix.project import project as project_crud
|
||||||
|
from app.models.syndarix import AgentInstance, Project
|
||||||
|
from app.models.syndarix.enums import AgentStatus
|
||||||
|
from app.models.user import User
|
||||||
|
from app.schemas.common import (
|
||||||
|
MessageResponse,
|
||||||
|
PaginatedResponse,
|
||||||
|
PaginationParams,
|
||||||
|
create_pagination_meta,
|
||||||
|
)
|
||||||
|
from app.schemas.errors import ErrorCode
|
||||||
|
from app.schemas.syndarix.agent_instance import (
|
||||||
|
AgentInstanceCreate,
|
||||||
|
AgentInstanceMetrics,
|
||||||
|
AgentInstanceResponse,
|
||||||
|
AgentInstanceUpdate,
|
||||||
|
)
|
||||||
|
|
||||||
|
router = APIRouter()
logger = logging.getLogger(__name__)

# Initialize limiter for this router
# Keyed on the client's remote address, so limits apply per IP rather than
# per authenticated user.
limiter = Limiter(key_func=get_remote_address)

# Use higher rate limits in test environment
# Read once at import time; only the exact string "True" enables test mode.
IS_TEST = os.getenv("IS_TEST", "False") == "True"
RATE_MULTIPLIER = 100 if IS_TEST else 1


# Valid status transitions for agent lifecycle management
# Maps each current status to the set of statuses it may legally move to;
# validate_status_transition() consults this table before any status change.
VALID_STATUS_TRANSITIONS: dict[AgentStatus, set[AgentStatus]] = {
    AgentStatus.IDLE: {AgentStatus.WORKING, AgentStatus.PAUSED, AgentStatus.TERMINATED},
    AgentStatus.WORKING: {
        AgentStatus.IDLE,
        AgentStatus.WAITING,
        AgentStatus.PAUSED,
        AgentStatus.TERMINATED,
    },
    AgentStatus.WAITING: {
        AgentStatus.IDLE,
        AgentStatus.WORKING,
        AgentStatus.PAUSED,
        AgentStatus.TERMINATED,
    },
    AgentStatus.PAUSED: {AgentStatus.IDLE, AgentStatus.TERMINATED},
    AgentStatus.TERMINATED: set(),  # Terminal state, no transitions allowed
}
|
||||||
|
|
||||||
|
|
||||||
|
async def verify_project_access(
    db: AsyncSession,
    project_id: UUID,
    user: User,
) -> Project:
    """
    Load a project and ensure *user* may act on it.

    Superusers can access any project; every other user must be the owner.

    Args:
        db: Database session
        project_id: UUID of the project to check
        user: The authenticated user requesting access

    Returns:
        Project: The loaded project when access is permitted

    Raises:
        NotFoundError: If no project exists with the given UUID
        AuthorizationError: If the user is neither the owner nor a superuser
    """
    project = await project_crud.get(db, id=project_id)
    if not project:
        raise NotFoundError(
            message=f"Project {project_id} not found",
            error_code=ErrorCode.NOT_FOUND,
        )

    is_owner = project.owner_id == user.id
    if not (user.is_superuser or is_owner):
        raise AuthorizationError(
            message="You do not have access to this project",
            error_code=ErrorCode.INSUFFICIENT_PERMISSIONS,
        )

    return project
|
||||||
|
|
||||||
|
|
||||||
|
def validate_status_transition(
    current_status: AgentStatus,
    target_status: AgentStatus,
) -> None:
    """
    Check a proposed agent status change against the lifecycle state machine.

    Args:
        current_status: Status the agent is in right now
        target_status: Status the caller wants to move the agent to

    Raises:
        ValidationException: If VALID_STATUS_TRANSITIONS does not permit
            the move
    """
    allowed = VALID_STATUS_TRANSITIONS.get(current_status, set())
    if target_status in allowed:
        return

    raise ValidationException(
        message=f"Cannot transition from {current_status.value} to {target_status.value}",
        error_code=ErrorCode.VALIDATION_ERROR,
        field="status",
    )
|
||||||
|
|
||||||
|
|
||||||
|
def build_agent_response(
    agent: AgentInstance,
    agent_type_name: str | None = None,
    agent_type_slug: str | None = None,
    project_name: str | None = None,
    project_slug: str | None = None,
    assigned_issues_count: int = 0,
) -> AgentInstanceResponse:
    """
    Convert an AgentInstance ORM object into its API response schema.

    Fields copied straight off the model are combined with optional
    denormalized context (type/project names and slugs) supplied by the
    caller.

    Args:
        agent: The agent instance model to serialize
        agent_type_name: Display name of the agent's type, if known
        agent_type_slug: Slug of the agent's type, if known
        project_name: Display name of the owning project, if known
        project_slug: Slug of the owning project, if known
        assigned_issues_count: Number of issues assigned to this agent

    Returns:
        AgentInstanceResponse: Fully populated response schema
    """
    model_fields = {
        "id": agent.id,
        "agent_type_id": agent.agent_type_id,
        "project_id": agent.project_id,
        "name": agent.name,
        "status": agent.status,
        "current_task": agent.current_task,
        # Coerce a null memory blob to an empty mapping for the schema.
        "short_term_memory": agent.short_term_memory or {},
        "long_term_memory_ref": agent.long_term_memory_ref,
        "session_id": agent.session_id,
        "last_activity_at": agent.last_activity_at,
        "terminated_at": agent.terminated_at,
        "tasks_completed": agent.tasks_completed,
        "tokens_used": agent.tokens_used,
        "cost_incurred": agent.cost_incurred,
        "created_at": agent.created_at,
        "updated_at": agent.updated_at,
    }
    return AgentInstanceResponse(
        **model_fields,
        agent_type_name=agent_type_name,
        agent_type_slug=agent_type_slug,
        project_name=project_name,
        project_slug=project_slug,
        assigned_issues_count=assigned_issues_count,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ===== Agent Instance Management Endpoints =====
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/projects/{project_id}/agents",
    response_model=AgentInstanceResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Spawn Agent Instance",
    description="Spawn a new agent instance in a project. Requires project ownership or superuser.",
    operation_id="spawn_agent",
)
@limiter.limit(f"{20 * RATE_MULTIPLIER}/minute")
async def spawn_agent(
    request: Request,
    project_id: UUID,
    agent_in: AgentInstanceCreate,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Create a new agent instance inside a project.

    The instance is built from an existing, active agent type and must
    belong to the project named in the URL. New agents start in IDLE status.

    Args:
        request: FastAPI request object (required by the rate limiter)
        project_id: UUID of the project the agent is spawned in
        agent_in: Agent instance creation payload
        current_user: Current authenticated user
        db: Database session

    Returns:
        AgentInstanceResponse: The newly created agent instance

    Raises:
        NotFoundError: If the project or the referenced agent type is missing
        AuthorizationError: If the user lacks access to the project
        ValidationException: If the payload is invalid or the type is inactive
    """
    try:
        project = await verify_project_access(db, project_id, current_user)

        # The body's project_id must agree with the path parameter.
        if agent_in.project_id != project_id:
            raise ValidationException(
                message="Agent project_id must match the URL project_id",
                error_code=ErrorCode.VALIDATION_ERROR,
                field="project_id",
            )

        # The referenced agent type has to exist ...
        agent_type = await agent_type_crud.get(db, id=agent_in.agent_type_id)
        if not agent_type:
            raise NotFoundError(
                message=f"Agent type {agent_in.agent_type_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )
        # ... and still be active.
        if not agent_type.is_active:
            raise ValidationException(
                message=f"Agent type '{agent_type.name}' is inactive and cannot be used",
                error_code=ErrorCode.VALIDATION_ERROR,
                field="agent_type_id",
            )

        agent = await agent_instance_crud.create(db, obj_in=agent_in)

        logger.info(
            f"User {current_user.email} spawned agent '{agent.name}' "
            f"(id={agent.id}) in project {project.slug}"
        )

        # Re-read with joined context so the response carries type/project
        # names; fall back to the bare model when the lookup yields nothing.
        details = await agent_instance_crud.get_with_details(db, instance_id=agent.id)
        if not details:
            return build_agent_response(agent)
        return build_agent_response(
            agent=details["instance"],
            agent_type_name=details.get("agent_type_name"),
            agent_type_slug=details.get("agent_type_slug"),
            project_name=details.get("project_name"),
            project_slug=details.get("project_slug"),
            assigned_issues_count=details.get("assigned_issues_count", 0),
        )

    except (NotFoundError, AuthorizationError, ValidationException):
        raise
    except ValueError as e:
        logger.warning(f"Failed to spawn agent: {e!s}")
        raise ValidationException(
            message=str(e),
            error_code=ErrorCode.VALIDATION_ERROR,
        )
    except Exception as e:
        logger.error(f"Error spawning agent: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/projects/{project_id}/agents",
    response_model=PaginatedResponse[AgentInstanceResponse],
    summary="List Project Agents",
    description="List all agent instances in a project with optional filtering.",
    operation_id="list_project_agents",
)
@limiter.limit(f"{60 * RATE_MULTIPLIER}/minute")
async def list_project_agents(
    request: Request,
    project_id: UUID,
    pagination: PaginationParams = Depends(),
    status_filter: AgentStatus | None = Query(
        None, alias="status", description="Filter by agent status"
    ),
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Return a paginated list of agent instances belonging to a project.

    Supports optional filtering by agent status via the ``status`` query
    parameter.

    Args:
        request: FastAPI request object (required by the rate limiter)
        project_id: UUID of the project
        pagination: Pagination parameters (page, limit)
        status_filter: Optional agent status to filter on
        current_user: Current authenticated user
        db: Database session

    Returns:
        PaginatedResponse[AgentInstanceResponse]: Paginated list of agents

    Raises:
        NotFoundError: If the project is not found
        AuthorizationError: If the user lacks access to the project
    """
    try:
        project = await verify_project_access(db, project_id, current_user)

        agents, total = await agent_instance_crud.get_by_project(
            db,
            project_id=project_id,
            status=status_filter,
            skip=pagination.offset,
            limit=pagination.limit,
        )

        # One detail lookup per agent; a bulk query would be cheaper, but
        # this mirrors the single-agent detail path for identical responses.
        agent_responses = []
        for agent in agents:
            details = await agent_instance_crud.get_with_details(
                db, instance_id=agent.id
            )
            if not details:
                agent_responses.append(build_agent_response(agent))
                continue
            agent_responses.append(
                build_agent_response(
                    agent=details["instance"],
                    agent_type_name=details.get("agent_type_name"),
                    agent_type_slug=details.get("agent_type_slug"),
                    project_name=details.get("project_name"),
                    project_slug=details.get("project_slug"),
                    assigned_issues_count=details.get("assigned_issues_count", 0),
                )
            )

        pagination_meta = create_pagination_meta(
            total=total,
            page=pagination.page,
            limit=pagination.limit,
            items_count=len(agent_responses),
        )

        logger.debug(
            f"User {current_user.email} listed {len(agent_responses)} agents "
            f"in project {project.slug}"
        )

        return PaginatedResponse(data=agent_responses, pagination=pagination_meta)

    except (NotFoundError, AuthorizationError):
        raise
    except Exception as e:
        logger.error(f"Error listing project agents: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
# ===== Project Agent Metrics Endpoint =====
|
||||||
|
# NOTE: This endpoint MUST be defined before /{agent_id} routes
|
||||||
|
# to prevent FastAPI from trying to parse "metrics" as a UUID
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/projects/{project_id}/agents/metrics",
    response_model=AgentInstanceMetrics,
    summary="Get Project Agent Metrics",
    description="Get aggregated usage metrics for all agents in a project.",
    operation_id="get_project_agent_metrics",
)
@limiter.limit(f"{60 * RATE_MULTIPLIER}/minute")
async def get_project_agent_metrics(
    request: Request,
    project_id: UUID,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Aggregate usage metrics across every agent in a project.

    Covers instance counts plus totals for tasks completed, tokens used,
    and cost incurred.

    Args:
        request: FastAPI request object (required by the rate limiter)
        project_id: UUID of the project
        current_user: Current authenticated user
        db: Database session

    Returns:
        AgentInstanceMetrics: Aggregated project agent metrics

    Raises:
        NotFoundError: If the project is not found
        AuthorizationError: If the user lacks access to the project
    """
    try:
        project = await verify_project_access(db, project_id, current_user)

        metrics = await agent_instance_crud.get_project_metrics(
            db, project_id=project_id
        )

        logger.debug(
            f"User {current_user.email} retrieved project metrics for {project.slug}"
        )

        # The CRUD layer returns a dict keyed exactly like the schema fields.
        metric_fields = (
            "total_instances",
            "active_instances",
            "idle_instances",
            "total_tasks_completed",
            "total_tokens_used",
            "total_cost_incurred",
        )
        return AgentInstanceMetrics(**{name: metrics[name] for name in metric_fields})

    except (NotFoundError, AuthorizationError):
        raise
    except Exception as e:
        logger.error(f"Error getting project agent metrics: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/projects/{project_id}/agents/{agent_id}",
    response_model=AgentInstanceResponse,
    summary="Get Agent Details",
    description="Get detailed information about a specific agent instance.",
    operation_id="get_agent",
)
@limiter.limit(f"{60 * RATE_MULTIPLIER}/minute")
async def get_agent(
    request: Request,
    project_id: UUID,
    agent_id: UUID,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Return full details for one agent instance.

    Includes denormalized context (agent type name/slug, project name/slug)
    and the number of issues currently assigned to the agent.

    Args:
        request: FastAPI request object (required by the rate limiter)
        project_id: UUID of the project the agent must belong to
        agent_id: UUID of the agent instance
        current_user: Current authenticated user
        db: Database session

    Returns:
        AgentInstanceResponse: The agent instance details

    Raises:
        NotFoundError: If the project or agent is missing, or the agent
            belongs to a different project
        AuthorizationError: If the user lacks access to the project
    """
    try:
        await verify_project_access(db, project_id, current_user)

        details = await agent_instance_crud.get_with_details(db, instance_id=agent_id)
        if not details:
            raise NotFoundError(
                message=f"Agent {agent_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )

        agent = details["instance"]
        # An agent from another project is reported as not-found rather
        # than revealing its existence.
        if agent.project_id != project_id:
            raise NotFoundError(
                message=f"Agent {agent_id} not found in project {project_id}",
                error_code=ErrorCode.NOT_FOUND,
            )

        logger.debug(
            f"User {current_user.email} retrieved agent {agent.name} (id={agent_id})"
        )

        return build_agent_response(
            agent=agent,
            agent_type_name=details.get("agent_type_name"),
            agent_type_slug=details.get("agent_type_slug"),
            project_name=details.get("project_name"),
            project_slug=details.get("project_slug"),
            assigned_issues_count=details.get("assigned_issues_count", 0),
        )

    except (NotFoundError, AuthorizationError):
        raise
    except Exception as e:
        logger.error(f"Error getting agent details: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.patch(
    "/projects/{project_id}/agents/{agent_id}",
    response_model=AgentInstanceResponse,
    summary="Update Agent",
    description="Update an agent instance's configuration and state.",
    operation_id="update_agent",
)
@limiter.limit(f"{30 * RATE_MULTIPLIER}/minute")
async def update_agent(
    request: Request,
    project_id: UUID,
    agent_id: UUID,
    agent_in: AgentInstanceUpdate,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Apply a partial update to an agent instance.

    Status changes are checked against the lifecycle state machine before
    being persisted; other fields are updated as supplied.

    Args:
        request: FastAPI request object (required by the rate limiter)
        project_id: UUID of the project the agent must belong to
        agent_id: UUID of the agent instance
        agent_in: Partial update payload
        current_user: Current authenticated user
        db: Database session

    Returns:
        AgentInstanceResponse: The updated agent instance

    Raises:
        NotFoundError: If the project or agent is missing, or the agent
            belongs to a different project
        AuthorizationError: If the user lacks access to the project
        ValidationException: If the requested status transition is invalid
    """
    try:
        await verify_project_access(db, project_id, current_user)

        agent = await agent_instance_crud.get(db, id=agent_id)
        if not agent:
            raise NotFoundError(
                message=f"Agent {agent_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )
        if agent.project_id != project_id:
            raise NotFoundError(
                message=f"Agent {agent_id} not found in project {project_id}",
                error_code=ErrorCode.NOT_FOUND,
            )

        # Only vet the transition when the status would actually change.
        requested_status = agent_in.status
        if requested_status is not None and requested_status != agent.status:
            validate_status_transition(agent.status, requested_status)

        updated_agent = await agent_instance_crud.update(
            db, db_obj=agent, obj_in=agent_in
        )

        logger.info(
            f"User {current_user.email} updated agent {updated_agent.name} "
            f"(id={agent_id})"
        )

        # Re-read with joined context; fall back to the bare model.
        details = await agent_instance_crud.get_with_details(
            db, instance_id=updated_agent.id
        )
        if not details:
            return build_agent_response(updated_agent)
        return build_agent_response(
            agent=details["instance"],
            agent_type_name=details.get("agent_type_name"),
            agent_type_slug=details.get("agent_type_slug"),
            project_name=details.get("project_name"),
            project_slug=details.get("project_slug"),
            assigned_issues_count=details.get("assigned_issues_count", 0),
        )

    except (NotFoundError, AuthorizationError, ValidationException):
        raise
    except Exception as e:
        logger.error(f"Error updating agent: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/projects/{project_id}/agents/{agent_id}/pause",
    response_model=AgentInstanceResponse,
    summary="Pause Agent",
    description="Pause an agent instance, temporarily stopping its work.",
    operation_id="pause_agent",
)
@limiter.limit(f"{20 * RATE_MULTIPLIER}/minute")
async def pause_agent(
    request: Request,
    project_id: UUID,
    agent_id: UUID,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Move an agent instance into PAUSED status.

    A paused agent stops working until it is resumed via the resume
    endpoint; the transition is validated against the lifecycle table.

    Args:
        request: FastAPI request object (required by the rate limiter)
        project_id: UUID of the project the agent must belong to
        agent_id: UUID of the agent instance
        current_user: Current authenticated user
        db: Database session

    Returns:
        AgentInstanceResponse: The paused agent instance

    Raises:
        NotFoundError: If the project or agent is missing, or the agent
            belongs to a different project
        AuthorizationError: If the user lacks access to the project
        ValidationException: If pausing is not allowed from the current state
    """
    try:
        await verify_project_access(db, project_id, current_user)

        agent = await agent_instance_crud.get(db, id=agent_id)
        if not agent:
            raise NotFoundError(
                message=f"Agent {agent_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )
        if agent.project_id != project_id:
            raise NotFoundError(
                message=f"Agent {agent_id} not found in project {project_id}",
                error_code=ErrorCode.NOT_FOUND,
            )

        # Reject pausing from states the lifecycle table does not allow.
        validate_status_transition(agent.status, AgentStatus.PAUSED)

        paused_agent = await agent_instance_crud.update_status(
            db,
            instance_id=agent_id,
            status=AgentStatus.PAUSED,
        )
        # The agent could have been deleted between the read and the update.
        if not paused_agent:
            raise NotFoundError(
                message=f"Agent {agent_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )

        logger.info(
            f"User {current_user.email} paused agent {paused_agent.name} "
            f"(id={agent_id})"
        )

        # Re-read with joined context; fall back to the bare model.
        details = await agent_instance_crud.get_with_details(
            db, instance_id=paused_agent.id
        )
        if not details:
            return build_agent_response(paused_agent)
        return build_agent_response(
            agent=details["instance"],
            agent_type_name=details.get("agent_type_name"),
            agent_type_slug=details.get("agent_type_slug"),
            project_name=details.get("project_name"),
            project_slug=details.get("project_slug"),
            assigned_issues_count=details.get("assigned_issues_count", 0),
        )

    except (NotFoundError, AuthorizationError, ValidationException):
        raise
    except Exception as e:
        logger.error(f"Error pausing agent: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/projects/{project_id}/agents/{agent_id}/resume",
    response_model=AgentInstanceResponse,
    summary="Resume Agent",
    description="Resume a paused agent instance.",
    operation_id="resume_agent",
)
@limiter.limit(f"{20 * RATE_MULTIPLIER}/minute")
async def resume_agent(
    request: Request,
    project_id: UUID,
    agent_id: UUID,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Resume a paused agent instance.

    Moves the agent out of PAUSED and back into IDLE so it can accept
    new work again.

    Args:
        request: FastAPI request object (for rate limiting)
        project_id: UUID of the project
        agent_id: UUID of the agent instance
        current_user: Current authenticated user
        db: Database session

    Returns:
        AgentInstanceResponse: The resumed agent instance

    Raises:
        NotFoundError: If the project or agent is not found
        AuthorizationError: If the user lacks access to the project
        ValidationException: If the agent cannot be resumed from its current state
    """
    try:
        # Authorization: the caller must be allowed to act on this project.
        await verify_project_access(db, project_id, current_user)

        # Load the agent and confirm it belongs to the project named in the
        # URL; a cross-project id is reported as "not found".
        agent = await agent_instance_crud.get(db, id=agent_id)
        if agent is None:
            raise NotFoundError(
                message=f"Agent {agent_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )
        if agent.project_id != project_id:
            raise NotFoundError(
                message=f"Agent {agent_id} not found in project {project_id}",
                error_code=ErrorCode.NOT_FOUND,
            )

        # Reject resumes from states that cannot legally transition to IDLE.
        validate_status_transition(agent.status, AgentStatus.IDLE)

        resumed_agent = await agent_instance_crud.update_status(
            db,
            instance_id=agent_id,
            status=AgentStatus.IDLE,
        )
        if resumed_agent is None:
            # The row vanished between the read above and this update.
            raise NotFoundError(
                message=f"Agent {agent_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )

        logger.info(
            f"User {current_user.email} resumed agent {resumed_agent.name} "
            f"(id={agent_id})"
        )

        # Prefer the enriched payload (type/project names, issue counts);
        # fall back to the bare row when the detail lookup returns nothing.
        details = await agent_instance_crud.get_with_details(
            db, instance_id=resumed_agent.id
        )
        if not details:
            return build_agent_response(resumed_agent)
        return build_agent_response(
            agent=details["instance"],
            agent_type_name=details.get("agent_type_name"),
            agent_type_slug=details.get("agent_type_slug"),
            project_name=details.get("project_name"),
            project_slug=details.get("project_slug"),
            assigned_issues_count=details.get("assigned_issues_count", 0),
        )

    except (NotFoundError, AuthorizationError, ValidationException):
        # Domain errors are already shaped for the API layer — re-raise as-is.
        raise
    except Exception as e:
        logger.error(f"Error resuming agent: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete(
    "/projects/{project_id}/agents/{agent_id}",
    response_model=MessageResponse,
    summary="Terminate Agent",
    description="Terminate an agent instance, permanently stopping it.",
    operation_id="terminate_agent",
)
@limiter.limit(f"{10 * RATE_MULTIPLIER}/minute")
async def terminate_agent(
    request: Request,
    project_id: UUID,
    agent_id: UUID,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Permanently terminate an agent instance.

    Sets the agent's status to TERMINATED. This is irreversible — a new
    agent must be spawned if one is needed again. The agent's session and
    current task are cleared by the terminate operation.

    Args:
        request: FastAPI request object (for rate limiting)
        project_id: UUID of the project
        agent_id: UUID of the agent instance
        current_user: Current authenticated user
        db: Database session

    Returns:
        MessageResponse: Confirmation message

    Raises:
        NotFoundError: If the project or agent is not found
        AuthorizationError: If the user lacks access to the project
        ValidationException: If the agent is already terminated
    """
    try:
        # Authorization: the caller must be allowed to act on this project.
        await verify_project_access(db, project_id, current_user)

        # Load the agent and confirm it belongs to the project named in the
        # URL; a cross-project id is reported as "not found".
        agent = await agent_instance_crud.get(db, id=agent_id)
        if agent is None:
            raise NotFoundError(
                message=f"Agent {agent_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )
        if agent.project_id != project_id:
            raise NotFoundError(
                message=f"Agent {agent_id} not found in project {project_id}",
                error_code=ErrorCode.NOT_FOUND,
            )

        # A second termination is refused explicitly, with a clearer message
        # than the generic transition validator would give.
        if agent.status == AgentStatus.TERMINATED:
            raise ValidationException(
                message="Agent is already terminated",
                error_code=ErrorCode.VALIDATION_ERROR,
                field="status",
            )
        validate_status_transition(agent.status, AgentStatus.TERMINATED)

        # Capture the name now — the terminate call mutates the row.
        agent_name = agent.name

        terminated_agent = await agent_instance_crud.terminate(db, instance_id=agent_id)
        if terminated_agent is None:
            # The row vanished between the read above and the terminate call.
            raise NotFoundError(
                message=f"Agent {agent_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )

        logger.info(
            f"User {current_user.email} terminated agent {agent_name} (id={agent_id})"
        )

        return MessageResponse(
            success=True,
            message=f"Agent '{agent_name}' has been terminated",
        )

    except (NotFoundError, AuthorizationError, ValidationException):
        # Domain errors are already shaped for the API layer — re-raise as-is.
        raise
    except Exception as e:
        logger.error(f"Error terminating agent: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/projects/{project_id}/agents/{agent_id}/metrics",
    response_model=AgentInstanceMetrics,
    summary="Get Agent Metrics",
    description="Get usage metrics for a specific agent instance.",
    operation_id="get_agent_metrics",
)
@limiter.limit(f"{60 * RATE_MULTIPLIER}/minute")
async def get_agent_metrics(
    request: Request,
    project_id: UUID,
    agent_id: UUID,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Report usage metrics for a single agent instance.

    Reuses the aggregate AgentInstanceMetrics shape with a population of
    exactly one: tasks completed, tokens used and cost incurred are taken
    straight off the agent row, and the active/idle counters collapse to
    0 or 1 depending on the agent's current status.

    Args:
        request: FastAPI request object (for rate limiting)
        project_id: UUID of the project
        agent_id: UUID of the agent instance
        current_user: Current authenticated user
        db: Database session

    Returns:
        AgentInstanceMetrics: Agent usage metrics

    Raises:
        NotFoundError: If the project or agent is not found
        AuthorizationError: If the user lacks access to the project
    """
    try:
        # Authorization: the caller must be allowed to act on this project.
        await verify_project_access(db, project_id, current_user)

        # Load the agent and confirm it belongs to the project named in the
        # URL; a cross-project id is reported as "not found".
        agent = await agent_instance_crud.get(db, id=agent_id)
        if agent is None:
            raise NotFoundError(
                message=f"Agent {agent_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )
        if agent.project_id != project_id:
            raise NotFoundError(
                message=f"Agent {agent_id} not found in project {project_id}",
                error_code=ErrorCode.NOT_FOUND,
            )

        logger.debug(
            f"User {current_user.email} retrieved metrics for agent {agent.name} "
            f"(id={agent_id})"
        )

        # int(bool) yields the 0/1 counters for this one-agent population.
        return AgentInstanceMetrics(
            total_instances=1,
            active_instances=int(agent.status == AgentStatus.WORKING),
            idle_instances=int(agent.status == AgentStatus.IDLE),
            total_tasks_completed=agent.tasks_completed,
            total_tokens_used=agent.tokens_used,
            total_cost_incurred=agent.cost_incurred,
        )

    except (NotFoundError, AuthorizationError):
        # Domain errors are already shaped for the API layer — re-raise as-is.
        raise
    except Exception as e:
        logger.error(f"Error getting agent metrics: {e!s}", exc_info=True)
        raise
|
||||||
316
backend/app/api/routes/events.py
Normal file
316
backend/app/api/routes/events.py
Normal file
@@ -0,0 +1,316 @@
|
|||||||
|
"""
|
||||||
|
SSE endpoint for real-time project event streaming.
|
||||||
|
|
||||||
|
This module provides Server-Sent Events (SSE) endpoints for streaming
|
||||||
|
project events to connected clients. Events are scoped to projects,
|
||||||
|
with authorization checks to ensure clients only receive events
|
||||||
|
for projects they have access to.
|
||||||
|
|
||||||
|
Features:
|
||||||
|
- Real-time event streaming via SSE
|
||||||
|
- Project-scoped authorization
|
||||||
|
- Automatic reconnection support (Last-Event-ID)
|
||||||
|
- Keepalive messages every 30 seconds
|
||||||
|
- Graceful connection cleanup
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, Header, Query, Request
|
||||||
|
from slowapi import Limiter
|
||||||
|
from slowapi.util import get_remote_address
|
||||||
|
from sse_starlette.sse import EventSourceResponse
|
||||||
|
|
||||||
|
from app.api.dependencies.auth import get_current_user, get_current_user_sse
|
||||||
|
from app.api.dependencies.event_bus import get_event_bus
|
||||||
|
from app.core.database import get_db
|
||||||
|
from app.core.exceptions import AuthorizationError
|
||||||
|
from app.models.user import User
|
||||||
|
from app.schemas.errors import ErrorCode
|
||||||
|
from app.schemas.events import EventType
|
||||||
|
from app.services.event_bus import EventBus
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
|
# Logger scoped to this module's dotted path.
logger = logging.getLogger(__name__)

# Router is mounted by the parent API package; the limiter keys its
# rate-limit buckets by client IP address.
router = APIRouter()
limiter = Limiter(key_func=get_remote_address)

# Keepalive interval in seconds — how often the SSE stream emits a
# comment line so idle connections are not closed by intermediaries.
KEEPALIVE_INTERVAL = 30
|
||||||
|
|
||||||
|
|
||||||
|
async def check_project_access(
    project_id: UUID,
    user: User,
    db: "AsyncSession",
) -> bool:
    """
    Decide whether *user* may read events for *project_id*.

    Superusers can see every project; any other user only the projects
    they own.

    Args:
        project_id: The project to check access for
        user: The authenticated user
        db: Database session for project lookup

    Returns:
        bool: True if user has access, False otherwise
    """
    # Superusers short-circuit: no project lookup needed.
    if user.is_superuser:
        logger.debug(
            f"Project access granted for superuser {user.id} on project {project_id}"
        )
        return True

    # Local import — presumably deferred to avoid an import cycle at
    # module load time; confirm before hoisting.
    from app.crud.syndarix import project as project_crud

    project = await project_crud.get(db, id=project_id)
    if project is None:
        logger.debug(f"Project {project_id} not found for access check")
        return False

    allowed = bool(project.owner_id == user.id)
    verdict = "granted" if allowed else "denied"
    logger.debug(
        f"Project access {verdict} "
        f"for user {user.id} on project {project_id} (owner: {project.owner_id})"
    )
    return allowed
|
||||||
|
|
||||||
|
|
||||||
|
async def event_generator(
    project_id: UUID,
    event_bus: EventBus,
    last_event_id: str | None = None,
):
    """
    Yield SSE-formatted payloads for one project's event stream.

    Wraps EventBus.subscribe_sse(), translating its raw strings into the
    dictionaries the SSE response layer expects: keepalive ticks become
    comments, JSON events carry their type and id, and anything that
    fails to parse is forwarded verbatim as a generic "message" event.

    Args:
        project_id: The project to stream events for
        event_bus: The EventBus instance
        last_event_id: Optional last received event ID for reconnection

    Yields:
        dict: SSE event data with 'event', 'data', and optional 'id' fields
    """
    try:
        subscription = event_bus.subscribe_sse(
            project_id=project_id,
            last_event_id=last_event_id,
            keepalive_interval=KEEPALIVE_INTERVAL,
        )
        async for raw in subscription:
            # The bus signals a keepalive tick with an empty string.
            if raw == "":
                yield {"comment": "keepalive"}
                continue

            try:
                parsed = json.loads(raw)
            except json.JSONDecodeError:
                # Not JSON — pass through as a generic message event.
                yield {"event": "message", "data": raw}
                continue

            yield {
                "event": parsed.get("type", "message"),
                "data": raw,
                "id": parsed.get("id"),
            }

    except asyncio.CancelledError:
        # Normal client disconnect path — log and propagate the cancel.
        logger.info(f"Event stream cancelled for project {project_id}")
        raise
    except Exception as e:
        logger.error(f"Error in event stream for project {project_id}: {e}")
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/projects/{project_id}/events/stream",
    summary="Stream Project Events",
    description="""
Stream real-time events for a project via Server-Sent Events (SSE).

**Authentication**: Required (Bearer token OR query parameter)
**Authorization**: Must have access to the project

**Authentication Methods**:
- Bearer token in Authorization header (preferred)
- Query parameter `token` (for EventSource compatibility)

Note: EventSource API doesn't support custom headers, so the query parameter
option is provided for browser-based SSE clients.

**SSE Event Format**:
```
event: agent.status_changed
id: 550e8400-e29b-41d4-a716-446655440000
data: {"id": "...", "type": "agent.status_changed", "project_id": "...", ...}

: keepalive

event: issue.created
id: 550e8400-e29b-41d4-a716-446655440001
data: {...}
```

**Reconnection**: Include the `Last-Event-ID` header with the last received
event ID to resume from where you left off.

**Keepalive**: The server sends a comment (`: keepalive`) every 30 seconds
to keep the connection alive.

**Rate Limit**: 10 connections/minute per IP
""",
    response_class=EventSourceResponse,
    responses={
        200: {
            "description": "SSE stream established",
            "content": {"text/event-stream": {}},
        },
        401: {"description": "Not authenticated"},
        403: {"description": "Not authorized to access this project"},
        404: {"description": "Project not found"},
    },
    operation_id="stream_project_events",
)
@limiter.limit("10/minute")
async def stream_project_events(
    request: Request,
    project_id: UUID,
    db: "AsyncSession" = Depends(get_db),
    event_bus: EventBus = Depends(get_event_bus),
    token: str | None = Query(
        None, description="Auth token (for EventSource compatibility)"
    ),
    authorization: str | None = Header(None, alias="Authorization"),
    last_event_id: str | None = Header(None, alias="Last-Event-ID"),
):
    """
    Open a persistent SSE connection streaming this project's events.

    The stream carries all project events (agent updates, issues, etc.),
    emits a keepalive comment every 30 seconds, and honors the
    Last-Event-ID header so clients can resume after a disconnect. The
    connection is cleaned up automatically when the client goes away.
    """
    # Authenticate first — supports both Authorization header and the
    # `token` query parameter (EventSource cannot set custom headers).
    current_user = await get_current_user_sse(
        db=db, authorization=authorization, token=token
    )

    logger.info(
        f"SSE connection request for project {project_id} "
        f"by user {current_user.id} "
        f"(last_event_id={last_event_id})"
    )

    # Project-level authorization gate.
    if not await check_project_access(project_id, current_user, db):
        raise AuthorizationError(
            message=f"You don't have access to project {project_id}",
            error_code=ErrorCode.INSUFFICIENT_PERMISSIONS,
        )

    stream = event_generator(
        project_id=project_id,
        event_bus=event_bus,
        last_event_id=last_event_id,
    )
    return EventSourceResponse(
        stream,
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "X-Accel-Buffering": "no",  # Disable nginx buffering
        },
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/projects/{project_id}/events/test",
    summary="Send Test Event (Development Only)",
    description="""
Send a test event to a project's event stream. This endpoint is
intended for development and testing purposes.

**Authentication**: Required (Bearer token)
**Authorization**: Must have access to the project

**Note**: This endpoint should be disabled or restricted in production.
""",
    response_model=dict,
    responses={
        200: {"description": "Test event sent"},
        401: {"description": "Not authenticated"},
        403: {"description": "Not authorized to access this project"},
    },
    operation_id="send_test_event",
)
async def send_test_event(
    project_id: UUID,
    current_user: User = Depends(get_current_user),
    event_bus: EventBus = Depends(get_event_bus),
    db: "AsyncSession" = Depends(get_db),
):
    """
    Publish a synthetic event onto the project's stream.

    Handy for verifying an SSE client end-to-end during development.
    """
    # Project-level authorization gate.
    if not await check_project_access(project_id, current_user, db):
        raise AuthorizationError(
            message=f"You don't have access to project {project_id}",
            error_code=ErrorCode.INSUFFICIENT_PERMISSIONS,
        )

    # Build a well-formed event through the bus factory so it matches the
    # schema real producers emit.
    event = EventBus.create_event(
        event_type=EventType.AGENT_MESSAGE,
        project_id=project_id,
        actor_type="user",
        actor_id=current_user.id,
        payload={
            "message": "Test event from SSE endpoint",
            "message_type": "info",
        },
    )

    # Publish to the project's dedicated channel.
    channel = event_bus.get_project_channel(project_id)
    await event_bus.publish(channel, event)

    logger.info(f"Test event sent to project {project_id}: {event.id}")

    return {
        "success": True,
        "event_id": event.id,
        "event_type": event.type.value,
        "message": "Test event sent successfully",
    }
|
||||||
968
backend/app/api/routes/issues.py
Normal file
968
backend/app/api/routes/issues.py
Normal file
@@ -0,0 +1,968 @@
|
|||||||
|
# app/api/routes/issues.py
|
||||||
|
"""
|
||||||
|
Issue CRUD API endpoints for Syndarix projects.
|
||||||
|
|
||||||
|
Provides endpoints for managing issues within projects, including:
|
||||||
|
- Create, read, update, delete operations
|
||||||
|
- Filtering by status, priority, labels, sprint, assigned agent
|
||||||
|
- Search across title and body
|
||||||
|
- Assignment to agents
|
||||||
|
- External issue tracker sync triggers
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import Any
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, Query, Request, status
|
||||||
|
from slowapi import Limiter
|
||||||
|
from slowapi.util import get_remote_address
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
|
from app.api.dependencies.auth import get_current_user
|
||||||
|
from app.core.database import get_db
|
||||||
|
from app.core.exceptions import (
|
||||||
|
AuthorizationError,
|
||||||
|
NotFoundError,
|
||||||
|
ValidationException,
|
||||||
|
)
|
||||||
|
from app.crud.syndarix.agent_instance import agent_instance as agent_instance_crud
|
||||||
|
from app.crud.syndarix.issue import issue as issue_crud
|
||||||
|
from app.crud.syndarix.project import project as project_crud
|
||||||
|
from app.crud.syndarix.sprint import sprint as sprint_crud
|
||||||
|
from app.models.syndarix.enums import (
|
||||||
|
AgentStatus,
|
||||||
|
IssuePriority,
|
||||||
|
IssueStatus,
|
||||||
|
SprintStatus,
|
||||||
|
SyncStatus,
|
||||||
|
)
|
||||||
|
from app.models.user import User
|
||||||
|
from app.schemas.common import (
|
||||||
|
MessageResponse,
|
||||||
|
PaginatedResponse,
|
||||||
|
PaginationParams,
|
||||||
|
SortOrder,
|
||||||
|
create_pagination_meta,
|
||||||
|
)
|
||||||
|
from app.schemas.errors import ErrorCode
|
||||||
|
from app.schemas.syndarix.issue import (
|
||||||
|
IssueAssign,
|
||||||
|
IssueCreate,
|
||||||
|
IssueResponse,
|
||||||
|
IssueStats,
|
||||||
|
IssueUpdate,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Router is mounted by the parent API package; logger is scoped to this
# module's dotted path.
router = APIRouter()
logger = logging.getLogger(__name__)

# Initialize limiter for this router (buckets are keyed by client IP)
limiter = Limiter(key_func=get_remote_address)

# Use higher rate limits in test environment.
# NOTE(review): the comparison is case-sensitive — only the exact string
# "True" enables test mode; confirm that is intentional.
IS_TEST = os.getenv("IS_TEST", "False") == "True"
# Every endpoint multiplies its per-minute limit by this factor.
RATE_MULTIPLIER = 100 if IS_TEST else 1
|
||||||
|
|
||||||
|
|
||||||
|
async def verify_project_ownership(
    db: AsyncSession,
    project_id: UUID,
    user: User,
) -> None:
    """
    Ensure *user* may operate on *project_id*.

    Superusers always pass; any other user must own the project.

    Args:
        db: Database session
        project_id: Project UUID to verify
        user: Current authenticated user

    Raises:
        NotFoundError: If project does not exist
        AuthorizationError: If user does not own the project
    """
    project = await project_crud.get(db, id=project_id)
    if project is None:
        raise NotFoundError(
            message=f"Project {project_id} not found",
            error_code=ErrorCode.NOT_FOUND,
        )

    # Access is granted to superusers and to the project owner;
    # everyone else is rejected.
    if user.is_superuser or project.owner_id == user.id:
        return
    raise AuthorizationError(
        message="You do not have access to this project",
        error_code=ErrorCode.INSUFFICIENT_PERMISSIONS,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _build_issue_response(
    issue: Any,
    project_name: str | None = None,
    project_slug: str | None = None,
    sprint_name: str | None = None,
    assigned_agent_type_name: str | None = None,
) -> IssueResponse:
    """
    Convert an Issue model row into an IssueResponse schema.

    Args:
        issue: Issue model instance
        project_name: Optional project name from relationship
        project_slug: Optional project slug from relationship
        sprint_name: Optional sprint name from relationship
        assigned_agent_type_name: Optional agent type name from relationship

    Returns:
        IssueResponse schema instance
    """
    # Fields copied 1:1 from the model; labels gets special handling below
    # so a NULL column becomes an empty list.
    copied = (
        "id",
        "project_id",
        "title",
        "body",
        "status",
        "priority",
        "assigned_agent_id",
        "human_assignee",
        "sprint_id",
        "story_points",
        "external_tracker_type",
        "external_issue_id",
        "remote_url",
        "external_issue_number",
        "sync_status",
        "last_synced_at",
        "external_updated_at",
        "closed_at",
        "created_at",
        "updated_at",
    )
    payload = {name: getattr(issue, name) for name in copied}
    payload["labels"] = issue.labels or []

    return IssueResponse(
        **payload,
        project_name=project_name,
        project_slug=project_slug,
        sprint_name=sprint_name,
        assigned_agent_type_name=assigned_agent_type_name,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ===== Issue CRUD Endpoints =====
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/projects/{project_id}/issues",
    response_model=IssueResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Create Issue",
    description="Create a new issue in a project",
    operation_id="create_issue",
)
@limiter.limit(f"{60 * RATE_MULTIPLIER}/minute")
async def create_issue(
    request: Request,
    project_id: UUID,
    issue_in: IssueCreate,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Create a new issue inside a project.

    The caller must own the project or be a superuser. The project_id
    from the URL path always overrides any project_id in the body.

    Args:
        request: FastAPI request object (for rate limiting)
        project_id: UUID of the project to create the issue in
        issue_in: Issue creation data
        current_user: Authenticated user
        db: Database session

    Returns:
        Created issue with full details

    Raises:
        NotFoundError: If project not found
        AuthorizationError: If user lacks access
        ValidationException: If assigned agent not in project
    """
    await verify_project_ownership(db, project_id, current_user)

    # The path parameter is authoritative — ignore any body-supplied value.
    issue_in.project_id = project_id

    # Cross-check the assignee: it must exist, live in this project, and
    # not be terminated. The membership check prevents IDOR via a foreign
    # agent id.
    if issue_in.assigned_agent_id:
        assignee = await agent_instance_crud.get(db, id=issue_in.assigned_agent_id)
        if assignee is None:
            raise NotFoundError(
                message=f"Agent instance {issue_in.assigned_agent_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )
        if assignee.project_id != project_id:
            raise ValidationException(
                message="Agent instance does not belong to this project",
                error_code=ErrorCode.VALIDATION_ERROR,
                field="assigned_agent_id",
            )
        if assignee.status == AgentStatus.TERMINATED:
            raise ValidationException(
                message="Cannot assign issue to a terminated agent",
                error_code=ErrorCode.VALIDATION_ERROR,
                field="assigned_agent_id",
            )

    # Validate sprint if provided (IDOR prevention).
    if issue_in.sprint_id:
        target_sprint = await sprint_crud.get(db, id=issue_in.sprint_id)
        if target_sprint is None:
            raise NotFoundError(
                message=f"Sprint {issue_in.sprint_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )
        if target_sprint.project_id != project_id:
            raise ValidationException(
                message="Sprint does not belong to this project",
                error_code=ErrorCode.VALIDATION_ERROR,
                field="sprint_id",
            )

    try:
        issue = await issue_crud.create(db, obj_in=issue_in)
        logger.info(
            f"User {current_user.email} created issue '{issue.title}' "
            f"in project {project_id}"
        )

        # Re-read the project so the response can carry its name and slug.
        project = await project_crud.get(db, id=project_id)
        return _build_issue_response(
            issue,
            project_name=project.name if project else None,
            project_slug=project.slug if project else None,
        )

    except ValueError as e:
        # CRUD-level validation surfaces as ValueError — translate it into
        # the API's validation error shape.
        logger.warning(f"Failed to create issue: {e!s}")
        raise ValidationException(
            message=str(e),
            error_code=ErrorCode.VALIDATION_ERROR,
        )
    except Exception as e:
        logger.error(f"Error creating issue: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/projects/{project_id}/issues",
    response_model=PaginatedResponse[IssueResponse],
    summary="List Issues",
    description="Get paginated list of issues in a project with filtering",
    operation_id="list_issues",
)
@limiter.limit(f"{120 * RATE_MULTIPLIER}/minute")
async def list_issues(
    request: Request,
    project_id: UUID,
    pagination: PaginationParams = Depends(),
    status_filter: IssueStatus | None = Query(
        None, alias="status", description="Filter by issue status"
    ),
    priority: IssuePriority | None = Query(None, description="Filter by priority"),
    labels: list[str] | None = Query(
        None, description="Filter by labels (comma-separated)"
    ),
    sprint_id: UUID | None = Query(None, description="Filter by sprint ID"),
    assigned_agent_id: UUID | None = Query(
        None, description="Filter by assigned agent ID"
    ),
    sync_status: SyncStatus | None = Query(None, description="Filter by sync status"),
    search: str | None = Query(
        None, min_length=1, max_length=100, description="Search in title and body"
    ),
    sort_by: str = Query(
        "created_at",
        description="Field to sort by (created_at, updated_at, priority, status, title)",
    ),
    sort_order: SortOrder = Query(SortOrder.DESC, description="Sort order"),
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    List issues in a project with comprehensive filtering options.

    Supports filtering by:
    - status: Issue status (open, in_progress, in_review, blocked, closed)
    - priority: Issue priority (low, medium, high, critical)
    - labels: Match issues containing any of the provided labels
    - sprint_id: Issues in a specific sprint
    - assigned_agent_id: Issues assigned to a specific agent
    - sync_status: External tracker sync status
    - search: Full-text search in title and body

    Args:
        request: FastAPI request object
        project_id: Project UUID
        pagination: Pagination parameters
        status_filter: Optional status filter
        priority: Optional priority filter
        labels: Optional labels filter
        sprint_id: Optional sprint filter
        assigned_agent_id: Optional agent assignment filter
        sync_status: Optional sync status filter
        search: Optional search query
        sort_by: Field to sort by
        sort_order: Sort direction
        current_user: Authenticated user
        db: Database session

    Returns:
        Paginated list of issues matching filters
    """
    # Verify project access (raises NotFound/Authorization errors)
    await verify_project_ownership(db, project_id, current_user)

    try:
        # Get filtered issues.
        issues, total = await issue_crud.get_by_project(
            db,
            project_id=project_id,
            status=status_filter,
            priority=priority,
            sprint_id=sprint_id,
            assigned_agent_id=assigned_agent_id,
            labels=labels,
            search=search,
            # BUG FIX: sync_status was accepted and documented as a filter but
            # never forwarded to the CRUD layer, so it was silently ignored.
            # NOTE(review): assumes get_by_project accepts a sync_status kwarg
            # like the other filters -- confirm against the CRUD signature.
            sync_status=sync_status,
            skip=pagination.offset,
            limit=pagination.limit,
            sort_by=sort_by,
            sort_order=sort_order.value,
        )

        # Build response objects (no relationship expansion for list view).
        issue_responses = [_build_issue_response(issue) for issue in issues]

        pagination_meta = create_pagination_meta(
            total=total,
            page=pagination.page,
            limit=pagination.limit,
            items_count=len(issue_responses),
        )

        return PaginatedResponse(data=issue_responses, pagination=pagination_meta)

    except Exception as e:
        logger.error(
            f"Error listing issues for project {project_id}: {e!s}", exc_info=True
        )
        raise
|
||||||
|
|
||||||
|
|
||||||
|
# ===== Issue Statistics Endpoint =====
|
||||||
|
# NOTE: This endpoint MUST be defined before /{issue_id} routes
|
||||||
|
# to prevent FastAPI from trying to parse "stats" as a UUID
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/projects/{project_id}/issues/stats",
    response_model=IssueStats,
    summary="Get Issue Statistics",
    description="Get aggregated issue statistics for a project",
    operation_id="get_issue_stats",
)
@limiter.limit(f"{60 * RATE_MULTIPLIER}/minute")
async def get_issue_stats(
    request: Request,
    project_id: UUID,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Return aggregate issue metrics for a single project.

    The payload includes per-status and per-priority counts plus story
    point totals, as produced by the CRUD aggregation query.

    Args:
        request: FastAPI request object
        project_id: Project UUID
        current_user: Authenticated user
        db: Database session

    Returns:
        IssueStats built from the aggregation result

    Raises:
        NotFoundError: If project not found
        AuthorizationError: If user lacks access
    """
    # Caller must own (or be allowed to see) the project.
    await verify_project_ownership(db, project_id, current_user)

    try:
        raw_stats = await issue_crud.get_project_stats(db, project_id=project_id)
        return IssueStats(**raw_stats)
    except Exception as e:
        # Log with traceback, then let the global handler respond.
        logger.error(
            f"Error getting issue stats for project {project_id}: {e!s}",
            exc_info=True,
        )
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/projects/{project_id}/issues/{issue_id}",
    response_model=IssueResponse,
    summary="Get Issue",
    description="Get detailed information about a specific issue",
    operation_id="get_issue",
)
@limiter.limit(f"{120 * RATE_MULTIPLIER}/minute")
async def get_issue(
    request: Request,
    project_id: UUID,
    issue_id: UUID,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Fetch a single issue with expanded relationship data.

    The response includes the project name/slug, sprint name, and
    assigned agent type name when those relations exist.

    Args:
        request: FastAPI request object
        project_id: Project UUID
        issue_id: Issue UUID
        current_user: Authenticated user
        db: Database session

    Returns:
        Issue details with relationship data

    Raises:
        NotFoundError: If project or issue not found
        AuthorizationError: If user lacks access
    """
    # Caller must have access to the project before we touch the issue.
    await verify_project_ownership(db, project_id, current_user)

    details = await issue_crud.get_with_details(db, issue_id=issue_id)
    if not details:
        raise NotFoundError(
            message=f"Issue {issue_id} not found",
            error_code=ErrorCode.NOT_FOUND,
        )

    found = details["issue"]

    # Scope check: an issue from another project is reported as not found
    # rather than forbidden, to avoid leaking its existence (IDOR guard).
    if found.project_id != project_id:
        raise NotFoundError(
            message=f"Issue {issue_id} not found in project {project_id}",
            error_code=ErrorCode.NOT_FOUND,
        )

    return _build_issue_response(
        found,
        project_name=details.get("project_name"),
        project_slug=details.get("project_slug"),
        sprint_name=details.get("sprint_name"),
        assigned_agent_type_name=details.get("assigned_agent_type_name"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.patch(
    "/projects/{project_id}/issues/{issue_id}",
    response_model=IssueResponse,
    summary="Update Issue",
    description="Update an existing issue",
    operation_id="update_issue",
)
@limiter.limit(f"{60 * RATE_MULTIPLIER}/minute")
async def update_issue(
    request: Request,
    project_id: UUID,
    issue_id: UUID,
    issue_in: IssueUpdate,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Apply a partial update to an issue.

    Every field in the payload is optional; only supplied fields change.
    When an agent or sprint reference is supplied it is validated to
    belong to this project, and sprint status is checked so issues cannot
    be added to completed/cancelled sprints.

    Args:
        request: FastAPI request object
        project_id: Project UUID
        issue_id: Issue UUID
        issue_in: Fields to update
        current_user: Authenticated user
        db: Database session

    Returns:
        Updated issue details

    Raises:
        NotFoundError: If project or issue not found
        AuthorizationError: If user lacks access
        ValidationException: If validation fails
    """
    # Access check first; everything below assumes the caller may see the project.
    await verify_project_ownership(db, project_id, current_user)

    existing = await issue_crud.get(db, id=issue_id)
    if not existing:
        raise NotFoundError(
            message=f"Issue {issue_id} not found",
            error_code=ErrorCode.NOT_FOUND,
        )

    # Issue must live in the project named by the URL (IDOR guard).
    if existing.project_id != project_id:
        raise NotFoundError(
            message=f"Issue {issue_id} not found in project {project_id}",
            error_code=ErrorCode.NOT_FOUND,
        )

    # --- agent reference validation (only when the field is being set) ---
    if issue_in.assigned_agent_id is not None:
        agent = await agent_instance_crud.get(db, id=issue_in.assigned_agent_id)
        if not agent:
            raise NotFoundError(
                message=f"Agent instance {issue_in.assigned_agent_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )
        if agent.project_id != project_id:
            raise ValidationException(
                message="Agent instance does not belong to this project",
                error_code=ErrorCode.VALIDATION_ERROR,
                field="assigned_agent_id",
            )
        if agent.status == AgentStatus.TERMINATED:
            raise ValidationException(
                message="Cannot assign issue to a terminated agent",
                error_code=ErrorCode.VALIDATION_ERROR,
                field="assigned_agent_id",
            )

    # --- sprint reference validation (IDOR prevention and status check) ---
    if issue_in.sprint_id is not None:
        sprint = await sprint_crud.get(db, id=issue_in.sprint_id)
        if not sprint:
            raise NotFoundError(
                message=f"Sprint {issue_in.sprint_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )
        if sprint.project_id != project_id:
            raise ValidationException(
                message="Sprint does not belong to this project",
                error_code=ErrorCode.VALIDATION_ERROR,
                field="sprint_id",
            )
        # Closed sprints do not accept new issues.
        if sprint.status in (SprintStatus.COMPLETED, SprintStatus.CANCELLED):
            raise ValidationException(
                message=f"Cannot add issues to sprint with status '{sprint.status.value}'",
                error_code=ErrorCode.VALIDATION_ERROR,
                field="sprint_id",
            )

    try:
        patched = await issue_crud.update(db, db_obj=existing, obj_in=issue_in)
        logger.info(
            f"User {current_user.email} updated issue {issue_id} in project {project_id}"
        )

        # Re-read with relationships expanded for the response payload.
        details = await issue_crud.get_with_details(db, issue_id=issue_id)
        extras = details or {}

        return _build_issue_response(
            patched,
            project_name=extras.get("project_name"),
            project_slug=extras.get("project_slug"),
            sprint_name=extras.get("sprint_name"),
            assigned_agent_type_name=extras.get("assigned_agent_type_name"),
        )

    except ValueError as e:
        # CRUD-level validation surfaces as ValueError; map to 422-style error.
        logger.warning(f"Failed to update issue {issue_id}: {e!s}")
        raise ValidationException(
            message=str(e),
            error_code=ErrorCode.VALIDATION_ERROR,
        )
    except Exception as e:
        logger.error(f"Error updating issue {issue_id}: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete(
    "/projects/{project_id}/issues/{issue_id}",
    response_model=MessageResponse,
    summary="Delete Issue",
    description="Delete an issue permanently",
    operation_id="delete_issue",
)
@limiter.limit(f"{30 * RATE_MULTIPLIER}/minute")
async def delete_issue(
    request: Request,
    project_id: UUID,
    issue_id: UUID,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Permanently remove an issue from the database.

    This is a hard delete; there is no soft-delete or undo.

    Args:
        request: FastAPI request object
        project_id: Project UUID
        issue_id: Issue UUID
        current_user: Authenticated user
        db: Database session

    Returns:
        Success message

    Raises:
        NotFoundError: If project or issue not found
        AuthorizationError: If user lacks access
    """
    # Caller must be allowed to act on this project.
    await verify_project_ownership(db, project_id, current_user)

    doomed = await issue_crud.get(db, id=issue_id)
    if not doomed:
        raise NotFoundError(
            message=f"Issue {issue_id} not found",
            error_code=ErrorCode.NOT_FOUND,
        )

    # Cross-project deletion is reported as "not found" (IDOR guard).
    if doomed.project_id != project_id:
        raise NotFoundError(
            message=f"Issue {issue_id} not found in project {project_id}",
            error_code=ErrorCode.NOT_FOUND,
        )

    try:
        # Capture the title before the row disappears, for logging/response.
        issue_title = doomed.title
        await issue_crud.remove(db, id=issue_id)
        logger.info(
            f"User {current_user.email} deleted issue {issue_id} "
            f"('{issue_title}') from project {project_id}"
        )
        return MessageResponse(
            success=True,
            message=f"Issue '{issue_title}' has been deleted",
        )
    except Exception as e:
        logger.error(f"Error deleting issue {issue_id}: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
# ===== Issue Assignment Endpoint =====
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/projects/{project_id}/issues/{issue_id}/assign",
    response_model=IssueResponse,
    summary="Assign Issue",
    description="Assign an issue to an agent or human",
    operation_id="assign_issue",
)
@limiter.limit(f"{60 * RATE_MULTIPLIER}/minute")
async def assign_issue(
    request: Request,
    project_id: UUID,
    issue_id: UUID,
    assignment: IssueAssign,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Assign an issue to an agent or human.

    Only one type of assignment is allowed at a time:
    - assigned_agent_id: Assign to an AI agent instance
    - human_assignee: Assign to a human (name/email string)

    To unassign, pass both as null/None.

    Args:
        request: FastAPI request object
        project_id: Project UUID
        issue_id: Issue UUID
        assignment: Assignment data
        current_user: Authenticated user
        db: Database session

    Returns:
        Updated issue with assignment

    Raises:
        NotFoundError: If project, issue, or agent not found
        AuthorizationError: If user lacks access
        ValidationException: If agent not in project
    """
    # Verify project access
    await verify_project_ownership(db, project_id, current_user)

    # Get existing issue
    issue = await issue_crud.get(db, id=issue_id)
    if not issue:
        raise NotFoundError(
            message=f"Issue {issue_id} not found",
            error_code=ErrorCode.NOT_FOUND,
        )

    # Verify issue belongs to the project (IDOR prevention)
    if issue.project_id != project_id:
        raise NotFoundError(
            message=f"Issue {issue_id} not found in project {project_id}",
            error_code=ErrorCode.NOT_FOUND,
        )

    # Process assignment based on type
    if assignment.assigned_agent_id:
        # Agent must exist, belong to this project, and not be terminated.
        agent = await agent_instance_crud.get(db, id=assignment.assigned_agent_id)
        if not agent:
            raise NotFoundError(
                message=f"Agent instance {assignment.assigned_agent_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )
        if agent.project_id != project_id:
            raise ValidationException(
                message="Agent instance does not belong to this project",
                error_code=ErrorCode.VALIDATION_ERROR,
                field="assigned_agent_id",
            )
        if agent.status == AgentStatus.TERMINATED:
            raise ValidationException(
                message="Cannot assign issue to a terminated agent",
                error_code=ErrorCode.VALIDATION_ERROR,
                field="assigned_agent_id",
            )

        updated_issue = await issue_crud.assign_to_agent(
            db, issue_id=issue_id, agent_id=assignment.assigned_agent_id
        )
        logger.info(
            f"User {current_user.email} assigned issue {issue_id} to agent {agent.name}"
        )

    elif assignment.human_assignee:
        updated_issue = await issue_crud.assign_to_human(
            db, issue_id=issue_id, human_assignee=assignment.human_assignee
        )
        logger.info(
            f"User {current_user.email} assigned issue {issue_id} "
            f"to human '{assignment.human_assignee}'"
        )

    else:
        # Unassign - clear both agent and human.
        # BUG FIX: this previously called assign_to_agent(..., agent_id=None),
        # which only cleared the agent field and left a stale human_assignee,
        # contradicting the documented contract above. Use the dedicated
        # unassign() helper (the same one the DELETE .../assignment endpoint
        # uses) so both assignment fields are cleared.
        updated_issue = await issue_crud.unassign(db, issue_id=issue_id)
        logger.info(f"User {current_user.email} unassigned issue {issue_id}")

    if not updated_issue:
        raise NotFoundError(
            message=f"Issue {issue_id} not found",
            error_code=ErrorCode.NOT_FOUND,
        )

    # Get full details for response
    issue_data = await issue_crud.get_with_details(db, issue_id=issue_id)

    return _build_issue_response(
        updated_issue,
        project_name=issue_data.get("project_name") if issue_data else None,
        project_slug=issue_data.get("project_slug") if issue_data else None,
        sprint_name=issue_data.get("sprint_name") if issue_data else None,
        assigned_agent_type_name=issue_data.get("assigned_agent_type_name")
        if issue_data
        else None,
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete(
    "/projects/{project_id}/issues/{issue_id}/assignment",
    response_model=IssueResponse,
    summary="Unassign Issue",
    description="""
Remove agent/human assignment from an issue.

**Authentication**: Required (Bearer token)
**Authorization**: Project owner or superuser

This clears both agent and human assignee fields.

**Rate Limit**: 60 requests/minute
""",
    operation_id="unassign_issue",
)
@limiter.limit(f"{60 * RATE_MULTIPLIER}/minute")
async def unassign_issue(
    request: Request,
    project_id: UUID,
    issue_id: UUID,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Clear any assignment from an issue.

    Resets both the assigned_agent_id and human_assignee fields.
    """
    # Caller must be allowed to act on this project.
    await verify_project_ownership(db, project_id, current_user)

    target = await issue_crud.get(db, id=issue_id)
    if not target:
        raise NotFoundError(
            message=f"Issue {issue_id} not found",
            error_code=ErrorCode.NOT_FOUND,
        )

    # Issue must belong to the URL's project (IDOR prevention).
    if target.project_id != project_id:
        raise NotFoundError(
            message=f"Issue {issue_id} not found in project {project_id}",
            error_code=ErrorCode.NOT_FOUND,
        )

    # Clear both assignment fields in one CRUD call.
    cleared = await issue_crud.unassign(db, issue_id=issue_id)
    if not cleared:
        raise NotFoundError(
            message=f"Issue {issue_id} not found",
            error_code=ErrorCode.NOT_FOUND,
        )

    logger.info(f"User {current_user.email} unassigned issue {issue_id}")

    # Expand relationships for the response payload.
    details = await issue_crud.get_with_details(db, issue_id=issue_id)
    extras = details or {}

    return _build_issue_response(
        cleared,
        project_name=extras.get("project_name"),
        project_slug=extras.get("project_slug"),
        sprint_name=extras.get("sprint_name"),
        assigned_agent_type_name=extras.get("assigned_agent_type_name"),
    )
|
||||||
|
|
||||||
|
|
||||||
|
# ===== Issue Sync Endpoint =====
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/projects/{project_id}/issues/{issue_id}/sync",
    response_model=MessageResponse,
    summary="Trigger Issue Sync",
    description="Trigger synchronization with external issue tracker",
    operation_id="sync_issue",
)
@limiter.limit(f"{30 * RATE_MULTIPLIER}/minute")
async def sync_issue(
    request: Request,
    project_id: UUID,
    issue_id: UUID,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Request synchronization of an issue with its external tracker.

    Marks the issue's sync status as pending; the real synchronization is
    expected to run asynchronously (Celery task wiring is still a TODO).

    Prerequisites:
        - Issue must have external_tracker_type configured
        - Project must have integration settings for the tracker

    Args:
        request: FastAPI request object
        project_id: Project UUID
        issue_id: Issue UUID
        current_user: Authenticated user
        db: Database session

    Returns:
        Message indicating sync has been triggered

    Raises:
        NotFoundError: If project or issue not found
        AuthorizationError: If user lacks access
        ValidationException: If issue has no external tracker
    """
    # Caller must be allowed to act on this project.
    await verify_project_ownership(db, project_id, current_user)

    target = await issue_crud.get(db, id=issue_id)
    if not target:
        raise NotFoundError(
            message=f"Issue {issue_id} not found",
            error_code=ErrorCode.NOT_FOUND,
        )

    # Issue must belong to the URL's project (IDOR prevention).
    if target.project_id != project_id:
        raise NotFoundError(
            message=f"Issue {issue_id} not found in project {project_id}",
            error_code=ErrorCode.NOT_FOUND,
        )

    # Sync is meaningless without a configured external tracker.
    if not target.external_tracker_type:
        raise ValidationException(
            message="Issue does not have an external tracker configured",
            error_code=ErrorCode.VALIDATION_ERROR,
            field="external_tracker_type",
        )

    # Mark the issue as awaiting synchronization.
    await issue_crud.update_sync_status(
        db,
        issue_id=issue_id,
        sync_status=SyncStatus.PENDING,
    )

    # TODO: Queue Celery task for actual sync
    # When Celery is set up, this will be:
    # from app.tasks.sync import sync_issue_task
    # sync_issue_task.delay(str(issue_id))

    logger.info(
        f"User {current_user.email} triggered sync for issue {issue_id} "
        f"(tracker: {target.external_tracker_type})"
    )

    return MessageResponse(
        success=True,
        message=f"Sync triggered for issue '{target.title}'. Status will update when complete.",
    )
|
||||||
446
backend/app/api/routes/mcp.py
Normal file
446
backend/app/api/routes/mcp.py
Normal file
@@ -0,0 +1,446 @@
|
|||||||
|
"""
|
||||||
|
MCP (Model Context Protocol) API Endpoints
|
||||||
|
|
||||||
|
Provides REST endpoints for managing MCP server connections
|
||||||
|
and executing tool calls.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Annotated, Any
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, HTTPException, Path, status
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
from app.api.dependencies.permissions import require_superuser
|
||||||
|
from app.models.user import User
|
||||||
|
from app.services.mcp import (
|
||||||
|
MCPCircuitOpenError,
|
||||||
|
MCPClientManager,
|
||||||
|
MCPConnectionError,
|
||||||
|
MCPError,
|
||||||
|
MCPServerNotFoundError,
|
||||||
|
MCPTimeoutError,
|
||||||
|
MCPToolError,
|
||||||
|
MCPToolNotFoundError,
|
||||||
|
get_mcp_client,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
router = APIRouter()
|
||||||
|
|
||||||
|
# Server name validation pattern: alphanumeric, hyphens, underscores, 1-64 chars
# NOTE(review): SERVER_NAME_PATTERN is not referenced in the visible portion of
# this module -- confirm it is used elsewhere (e.g. programmatic validation) or
# consider removing it, since ServerNamePath below enforces the same rule.
SERVER_NAME_PATTERN = re.compile(r"^[a-zA-Z0-9_-]{1,64}$")

# Type alias for validated server name path parameter.
# FastAPI applies these constraints to {server_name} path segments; the
# pattern restricts the alphabet while min/max_length bound the size
# (together equivalent to SERVER_NAME_PATTERN above).
ServerNamePath = Annotated[
    str,
    Path(
        description="MCP server name",
        min_length=1,
        max_length=64,
        pattern=r"^[a-zA-Z0-9_-]+$",
    ),
]
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Request/Response Schemas
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class ServerInfo(BaseModel):
    """Information about an MCP server.

    Populated from the server's registered configuration
    (see list_servers, which copies fields from get_server_config).
    """

    name: str = Field(..., description="Server name")
    url: str = Field(..., description="Server URL")
    enabled: bool = Field(..., description="Whether server is enabled")
    timeout: int = Field(..., description="Request timeout in seconds")
    transport: str = Field(..., description="Transport type (http, stdio, sse)")
    description: str | None = Field(None, description="Server description")
|
||||||
|
|
||||||
|
|
||||||
|
class ServerListResponse(BaseModel):
    """Response containing list of MCP servers."""

    # All registered servers; total is len(servers).
    servers: list[ServerInfo]
    total: int
|
||||||
|
|
||||||
|
|
||||||
|
class ToolInfoResponse(BaseModel):
    """Information about an MCP tool, as advertised by its server."""

    name: str = Field(..., description="Tool name")
    description: str | None = Field(None, description="Tool description")
    server_name: str | None = Field(None, description="Server providing the tool")
    # JSON Schema describing the tool's expected arguments, when provided.
    input_schema: dict[str, Any] | None = Field(
        None, description="JSON schema for input"
    )
|
||||||
|
|
||||||
|
|
||||||
|
class ToolListResponse(BaseModel):
    """Response containing list of tools."""

    # Tools available on one server; total is len(tools).
    tools: list[ToolInfoResponse]
    total: int
|
||||||
|
|
||||||
|
|
||||||
|
class ServerHealthStatus(BaseModel):
    """Health status for a single MCP server."""

    name: str
    healthy: bool
    # Connection/circuit state label -- presumably mirrors the circuit
    # breaker state strings; confirm against the health-check producer.
    state: str
    url: str
    # Populated with the failure reason when the server is unhealthy.
    error: str | None = None
    tools_count: int = 0
|
||||||
|
|
||||||
|
|
||||||
|
class HealthCheckResponse(BaseModel):
    """Response containing health status of all servers."""

    # Keyed by server name.
    servers: dict[str, ServerHealthStatus]
    healthy_count: int
    unhealthy_count: int
    total: int
|
||||||
|
|
||||||
|
|
||||||
|
class ToolCallRequest(BaseModel):
    """Request to execute a tool on a named MCP server."""

    server: str = Field(..., description="MCP server name")
    tool: str = Field(..., description="Tool name to execute")
    # Defaults to {} so argument-less tools need no explicit payload.
    arguments: dict[str, Any] = Field(
        default_factory=dict,
        description="Tool arguments",
    )
    # When None, the server's configured timeout applies.
    timeout: float | None = Field(
        None,
        description="Optional timeout override in seconds",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class ToolCallResponse(BaseModel):
    """Response from tool execution."""

    success: bool
    # Tool output on success; error/error_code describe the failure otherwise.
    data: Any | None = None
    error: str | None = None
    error_code: str | None = None
    tool_name: str | None = None
    server_name: str | None = None
    execution_time_ms: float = 0.0
    # Correlation id for tracing a call end-to-end -- presumably set by the
    # MCP client layer; confirm against the call-execution endpoint.
    request_id: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class CircuitBreakerStatus(BaseModel):
    """Status of a circuit breaker for one server."""

    server_name: str
    # Breaker state label (e.g. open/closed) as reported by the client manager.
    state: str
    failure_count: int
|
||||||
|
|
||||||
|
|
||||||
|
class CircuitBreakerListResponse(BaseModel):
    """Response containing circuit breaker statuses."""

    circuit_breakers: list[CircuitBreakerStatus]
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Endpoints
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/servers",
    response_model=ServerListResponse,
    summary="List MCP Servers",
    description="Get list of all registered MCP servers with their configurations.",
)
async def list_servers(
    mcp: MCPClientManager = Depends(get_mcp_client),
) -> ServerListResponse:
    """Return every registered MCP server together with its configuration."""
    infos: list[ServerInfo] = []

    for server_name in mcp.list_servers():
        try:
            cfg = mcp.get_server_config(server_name)
        except MCPServerNotFoundError:
            # Server disappeared between listing and lookup; skip it.
            continue
        infos.append(
            ServerInfo(
                name=server_name,
                url=cfg.url,
                enabled=cfg.enabled,
                timeout=cfg.timeout,
                transport=cfg.transport.value,
                description=cfg.description,
            )
        )

    return ServerListResponse(servers=infos, total=len(infos))
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/servers/{server_name}/tools",
    response_model=ToolListResponse,
    summary="List Server Tools",
    description="Get list of tools available on a specific MCP server.",
)
async def list_server_tools(
    server_name: ServerNamePath,
    mcp: MCPClientManager = Depends(get_mcp_client),
) -> ToolListResponse:
    """Return the tools exposed by one MCP server, or 404 if it is unknown."""
    try:
        discovered = await mcp.list_tools(server_name)
    except MCPServerNotFoundError as e:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Server not found: {server_name}",
        ) from e

    tool_infos = [
        ToolInfoResponse(
            name=tool.name,
            description=tool.description,
            server_name=tool.server_name,
            input_schema=tool.input_schema,
        )
        for tool in discovered
    ]
    return ToolListResponse(tools=tool_infos, total=len(tool_infos))
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/tools",
    response_model=ToolListResponse,
    summary="List All Tools",
    description="Get list of all tools from all MCP servers.",
)
async def list_all_tools(
    mcp: MCPClientManager = Depends(get_mcp_client),
) -> ToolListResponse:
    """Aggregate the tools exposed by every registered MCP server."""
    discovered = await mcp.list_all_tools()

    tool_infos = [
        ToolInfoResponse(
            name=tool.name,
            description=tool.description,
            server_name=tool.server_name,
            input_schema=tool.input_schema,
        )
        for tool in discovered
    ]
    return ToolListResponse(tools=tool_infos, total=len(tool_infos))
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/health",
    response_model=HealthCheckResponse,
    summary="Health Check",
    description="Check health status of all MCP servers.",
)
async def health_check(
    mcp: MCPClientManager = Depends(get_mcp_client),
) -> HealthCheckResponse:
    """Perform health check on all MCP servers.

    Returns per-server health details plus aggregate healthy/unhealthy counts.
    """
    health_results = await mcp.health_check()

    # FIX: the loop variable was previously named ``status``, shadowing the
    # ``status`` module (HTTP status constants) inside the comprehension
    # scope — renamed to ``result`` to prevent accidental misuse.
    servers = {
        name: ServerHealthStatus(
            name=result.name,
            healthy=result.healthy,
            state=result.state,
            url=result.url,
            error=result.error,
            tools_count=result.tools_count,
        )
        for name, result in health_results.items()
    }

    healthy_count = sum(1 for s in servers.values() if s.healthy)
    unhealthy_count = len(servers) - healthy_count

    return HealthCheckResponse(
        servers=servers,
        healthy_count=healthy_count,
        unhealthy_count=unhealthy_count,
        total=len(servers),
    )
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/call",
    response_model=ToolCallResponse,
    summary="Execute Tool (Admin Only)",
    description="Execute a tool on an MCP server. Requires superuser privileges.",
)
async def call_tool(
    request: ToolCallRequest,
    current_user: User = Depends(require_superuser),
    mcp: MCPClientManager = Depends(get_mcp_client),
) -> ToolCallResponse:
    """
    Execute a tool on an MCP server.

    This endpoint is restricted to superusers for direct tool execution.
    Normal tool execution should go through agent workflows.

    Infrastructure failures (circuit open, unknown server/tool, timeout,
    connection loss) become HTTP errors; a tool-level failure is reported
    in the 200 response body with ``success=False``.
    """
    # Audit log: record which user invoked which tool on which server.
    logger.info(
        "Tool call by user %s: %s.%s",
        current_user.id,
        request.server,
        request.tool,
    )

    try:
        result = await mcp.call_tool(
            server=request.server,
            tool=request.tool,
            args=request.arguments,
            timeout=request.timeout,
        )

        # Pass the client result through field-by-field.
        return ToolCallResponse(
            success=result.success,
            data=result.data,
            error=result.error,
            error_code=result.error_code,
            tool_name=result.tool_name,
            server_name=result.server_name,
            execution_time_ms=result.execution_time_ms,
            request_id=result.request_id,
        )

    # Handler order matters: specific MCP exceptions are matched before the
    # generic MCPError fallback at the bottom.
    except MCPCircuitOpenError as e:
        # Breaker is open — tell the client to retry later.
        raise HTTPException(
            status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
            detail=f"Server temporarily unavailable: {e.server_name}",
        ) from e
    except MCPToolNotFoundError as e:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Tool not found: {e.tool_name}",
        ) from e
    except MCPServerNotFoundError as e:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Server not found: {e.server_name}",
        ) from e
    except MCPTimeoutError as e:
        raise HTTPException(
            status_code=status.HTTP_504_GATEWAY_TIMEOUT,
            detail=str(e),
        ) from e
    except MCPConnectionError as e:
        raise HTTPException(
            status_code=status.HTTP_502_BAD_GATEWAY,
            detail=str(e),
        ) from e
    except MCPToolError as e:
        # Tool errors are returned in the response, not as HTTP errors
        return ToolCallResponse(
            success=False,
            error=str(e),
            error_code=e.error_code,
            tool_name=e.tool_name,
            server_name=e.server_name,
        )
    except MCPError as e:
        # Catch-all for any other MCP-layer failure.
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=str(e),
        ) from e
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/circuit-breakers",
    response_model=CircuitBreakerListResponse,
    summary="List Circuit Breakers",
    description="Get status of all circuit breakers.",
)
async def list_circuit_breakers(
    mcp: MCPClientManager = Depends(get_mcp_client),
) -> CircuitBreakerListResponse:
    """Report the current state of every circuit breaker."""
    breakers: list[CircuitBreakerStatus] = []
    for server, details in mcp.get_circuit_breaker_status().items():
        breakers.append(
            CircuitBreakerStatus(
                server_name=server,
                # Fall back to sentinel values when the manager omits a field.
                state=details.get("state", "unknown"),
                failure_count=details.get("failure_count", 0),
            )
        )
    return CircuitBreakerListResponse(circuit_breakers=breakers)
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/circuit-breakers/{server_name}/reset",
    status_code=status.HTTP_204_NO_CONTENT,
    summary="Reset Circuit Breaker (Admin Only)",
    description="Manually reset a circuit breaker for a server.",
)
async def reset_circuit_breaker(
    server_name: ServerNamePath,
    current_user: User = Depends(require_superuser),
    mcp: MCPClientManager = Depends(get_mcp_client),
) -> None:
    """Force a server's circuit breaker back to its initial state."""
    # Audit log: record who triggered the reset.
    logger.info(
        "Circuit breaker reset by user %s for server %s",
        current_user.id,
        server_name,
    )

    if not await mcp.reset_circuit_breaker(server_name):
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"No circuit breaker found for server: {server_name}",
        )
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/servers/{server_name}/reconnect",
    status_code=status.HTTP_204_NO_CONTENT,
    summary="Reconnect to Server (Admin Only)",
    description="Force reconnection to an MCP server.",
)
async def reconnect_server(
    server_name: ServerNamePath,
    current_user: User = Depends(require_superuser),
    mcp: MCPClientManager = Depends(get_mcp_client),
) -> None:
    """Tear down and re-establish the connection to an MCP server."""
    # Audit log: record who requested the reconnect.
    logger.info(
        "Reconnect requested by user %s for server %s",
        current_user.id,
        server_name,
    )

    try:
        # Drop the existing session first, then dial again.
        await mcp.disconnect(server_name)
        await mcp.connect(server_name)
    except (MCPServerNotFoundError, MCPConnectionError) as e:
        # Map the failure class onto the matching HTTP status.
        if isinstance(e, MCPServerNotFoundError):
            code = status.HTTP_404_NOT_FOUND
            detail = f"Server not found: {server_name}"
        else:
            code = status.HTTP_502_BAD_GATEWAY
            detail = f"Failed to reconnect: {e}"
        raise HTTPException(status_code=code, detail=detail) from e
|
||||||
659
backend/app/api/routes/projects.py
Normal file
659
backend/app/api/routes/projects.py
Normal file
@@ -0,0 +1,659 @@
|
|||||||
|
# app/api/routes/projects.py
|
||||||
|
"""
|
||||||
|
Project management API endpoints for Syndarix.
|
||||||
|
|
||||||
|
These endpoints allow users to manage their AI-powered software consulting projects.
|
||||||
|
Users can create, read, update, and manage the lifecycle of their projects.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from typing import Any
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, Query, Request, status
|
||||||
|
from slowapi import Limiter
|
||||||
|
from slowapi.util import get_remote_address
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
|
from app.api.dependencies.auth import get_current_user
|
||||||
|
from app.core.database import get_db
|
||||||
|
from app.core.exceptions import (
|
||||||
|
AuthorizationError,
|
||||||
|
DuplicateError,
|
||||||
|
ErrorCode,
|
||||||
|
NotFoundError,
|
||||||
|
ValidationException,
|
||||||
|
)
|
||||||
|
from app.crud.syndarix.project import project as project_crud
|
||||||
|
from app.models.syndarix.enums import ProjectStatus
|
||||||
|
from app.models.user import User
|
||||||
|
from app.schemas.common import (
|
||||||
|
MessageResponse,
|
||||||
|
PaginatedResponse,
|
||||||
|
PaginationParams,
|
||||||
|
create_pagination_meta,
|
||||||
|
)
|
||||||
|
from app.schemas.syndarix.project import (
|
||||||
|
ProjectCreate,
|
||||||
|
ProjectResponse,
|
||||||
|
ProjectUpdate,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Router and module-level logger for the projects API.
router = APIRouter()
logger = logging.getLogger(__name__)

# Initialize rate limiter, keyed on the caller's remote address.
limiter = Limiter(key_func=get_remote_address)

# Use higher rate limits in test environment.
# NOTE(review): this is an exact, case-sensitive string comparison — only
# IS_TEST="True" (not "true"/"1") enables the relaxed limits.
IS_TEST = os.getenv("IS_TEST", "False") == "True"
RATE_MULTIPLIER = 100 if IS_TEST else 1
|
||||||
|
|
||||||
|
|
||||||
|
def _build_project_response(project_data: dict[str, Any]) -> ProjectResponse:
    """
    Convert a project-with-counts dictionary into a ProjectResponse.

    Args:
        project_data: Mapping holding the project under ``"project"`` plus
            optional ``agent_count``, ``issue_count`` and
            ``active_sprint_name`` entries.

    Returns:
        ProjectResponse with all fields populated.
    """
    proj = project_data["project"]
    # Related counts default to "nothing yet" when absent from the mapping.
    extras = {
        "agent_count": project_data.get("agent_count", 0),
        "issue_count": project_data.get("issue_count", 0),
        "active_sprint_name": project_data.get("active_sprint_name"),
    }
    return ProjectResponse(
        id=proj.id,
        name=proj.name,
        slug=proj.slug,
        description=proj.description,
        autonomy_level=proj.autonomy_level,
        status=proj.status,
        settings=proj.settings,
        owner_id=proj.owner_id,
        created_at=proj.created_at,
        updated_at=proj.updated_at,
        **extras,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def _check_project_ownership(project: Any, current_user: User) -> None:
    """
    Ensure ``current_user`` is allowed to access ``project``.

    Superusers may access any project; other users only projects they own.

    Args:
        project: The project whose ownership is being checked
        current_user: The authenticated user

    Raises:
        AuthorizationError: If the user is neither the owner nor a superuser
    """
    # Guard clause: superusers and owners pass straight through.
    if current_user.is_superuser or project.owner_id == current_user.id:
        return
    raise AuthorizationError(
        message="You do not have permission to access this project",
        error_code=ErrorCode.INSUFFICIENT_PERMISSIONS,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Project CRUD Endpoints
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "",
    response_model=ProjectResponse,
    status_code=status.HTTP_201_CREATED,
    summary="Create Project",
    description="""
Create a new project for the current user.

The project will be owned by the authenticated user.
A unique slug is required for URL-friendly project identification.

**Rate Limit**: 10 requests/minute
""",
    operation_id="create_project",
)
@limiter.limit(f"{10 * RATE_MULTIPLIER}/minute")
async def create_project(
    request: Request,
    project_in: ProjectCreate,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Create a new project.

    The authenticated user becomes the owner of the project.

    Raises:
        DuplicateError: If a project with the same slug already exists.
    """
    try:
        # Rebuild the payload so owner_id always comes from the authenticated
        # user, never from client-supplied input.
        project_data = ProjectCreate(
            name=project_in.name,
            slug=project_in.slug,
            description=project_in.description,
            autonomy_level=project_in.autonomy_level,
            status=project_in.status,
            settings=project_in.settings,
            owner_id=current_user.id,
        )

        project = await project_crud.create(db, obj_in=project_data)
        logger.info(f"User {current_user.email} created project {project.slug}")

        # A freshly created project has no agents, issues, or active sprint.
        return ProjectResponse(
            id=project.id,
            name=project.name,
            slug=project.slug,
            description=project.description,
            autonomy_level=project.autonomy_level,
            status=project.status,
            settings=project.settings,
            owner_id=project.owner_id,
            created_at=project.created_at,
            updated_at=project.updated_at,
            agent_count=0,
            issue_count=0,
            active_sprint_name=None,
        )

    except ValueError as e:
        error_msg = str(e)
        if "already exists" in error_msg.lower():
            logger.warning(f"Duplicate project slug attempted: {project_in.slug}")
            # FIX: chain the originating ValueError (``from e``) so the root
            # cause is preserved in tracebacks instead of being discarded.
            raise DuplicateError(
                message=error_msg,
                error_code=ErrorCode.DUPLICATE_ENTRY,
                field="slug",
            ) from e
        logger.error(f"Error creating project: {error_msg}", exc_info=True)
        raise
    except Exception as e:
        logger.error(f"Unexpected error creating project: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "",
    response_model=PaginatedResponse[ProjectResponse],
    summary="List Projects",
    description="""
List projects for the current user with filtering and pagination.

Regular users see only their own projects.
Superusers can see all projects by setting `all_projects=true`.

**Rate Limit**: 30 requests/minute
""",
    operation_id="list_projects",
)
@limiter.limit(f"{30 * RATE_MULTIPLIER}/minute")
async def list_projects(
    request: Request,
    pagination: PaginationParams = Depends(),
    status_filter: ProjectStatus | None = Query(
        None, alias="status", description="Filter by project status"
    ),
    search: str | None = Query(
        None, description="Search by name, slug, or description"
    ),
    all_projects: bool = Query(False, description="Show all projects (superuser only)"),
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    List projects with filtering, search, and pagination.

    Regular users only see their own projects.
    Superusers can view all projects if all_projects is true.
    """
    try:
        # Determine owner filter based on user role and request:
        # None disables owner filtering (superuser viewing everything);
        # otherwise scope the query to the caller's own projects.
        owner_id = (
            None if (current_user.is_superuser and all_projects) else current_user.id
        )

        projects_data, total = await project_crud.get_multi_with_counts(
            db,
            skip=pagination.offset,
            limit=pagination.limit,
            status=status_filter,
            owner_id=owner_id,
            search=search,
        )

        # Build response objects
        project_responses = [_build_project_response(data) for data in projects_data]

        pagination_meta = create_pagination_meta(
            total=total,
            page=pagination.page,
            limit=pagination.limit,
            items_count=len(project_responses),
        )

        return PaginatedResponse(data=project_responses, pagination=pagination_meta)

    except Exception as e:
        # Boundary handler: log with traceback, then let the framework's
        # error handling produce the HTTP response.
        logger.error(f"Error listing projects: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/{project_id}",
    response_model=ProjectResponse,
    summary="Get Project",
    description="""
Get detailed information about a specific project.

Users can only access their own projects unless they are superusers.

**Rate Limit**: 60 requests/minute
""",
    operation_id="get_project",
)
@limiter.limit(f"{60 * RATE_MULTIPLIER}/minute")
async def get_project(
    request: Request,
    project_id: UUID,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Fetch a single project by UUID, including agent count, issue count,
    and active sprint name. Access is limited to the owner or superusers.
    """
    try:
        data = await project_crud.get_with_counts(db, project_id=project_id)
        if not data:
            raise NotFoundError(
                message=f"Project {project_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )

        _check_project_ownership(data["project"], current_user)
        return _build_project_response(data)

    except (NotFoundError, AuthorizationError):
        # Domain errors pass through untouched for the framework handlers.
        raise
    except Exception as e:
        logger.error(f"Error getting project {project_id}: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.get(
    "/slug/{slug}",
    response_model=ProjectResponse,
    summary="Get Project by Slug",
    description="""
Get detailed information about a project by its slug.

Users can only access their own projects unless they are superusers.

**Rate Limit**: 60 requests/minute
""",
    operation_id="get_project_by_slug",
)
@limiter.limit(f"{60 * RATE_MULTIPLIER}/minute")
async def get_project_by_slug(
    request: Request,
    slug: str,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Get detailed information about a project by slug.

    Includes agent count, issue count, and active sprint name.
    """
    try:
        # First resolve the slug to a project so ownership can be checked
        # before the heavier counts query runs.
        project = await project_crud.get_by_slug(db, slug=slug)

        if not project:
            raise NotFoundError(
                message=f"Project with slug '{slug}' not found",
                error_code=ErrorCode.NOT_FOUND,
            )

        _check_project_ownership(project, current_user)

        # Get project with counts
        project_data = await project_crud.get_with_counts(db, project_id=project.id)

        # Defensive: the project could be deleted between the two queries.
        if not project_data:
            raise NotFoundError(
                message=f"Project with slug '{slug}' not found",
                error_code=ErrorCode.NOT_FOUND,
            )

        return _build_project_response(project_data)

    except (NotFoundError, AuthorizationError):
        raise
    except Exception as e:
        # Boundary handler: log with traceback and re-raise.
        logger.error(f"Error getting project by slug {slug}: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.patch(
    "/{project_id}",
    response_model=ProjectResponse,
    summary="Update Project",
    description="""
Update an existing project.

Only the project owner or a superuser can update a project.
Only provided fields will be updated.

**Rate Limit**: 20 requests/minute
""",
    operation_id="update_project",
)
@limiter.limit(f"{20 * RATE_MULTIPLIER}/minute")
async def update_project(
    request: Request,
    project_id: UUID,
    project_in: ProjectUpdate,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Update a project's information.

    Only the project owner or superusers can perform updates.

    Raises:
        NotFoundError: If the project does not exist.
        AuthorizationError: If the caller may not modify the project.
        DuplicateError: If the new slug collides with an existing project.
    """
    try:
        project = await project_crud.get(db, id=project_id)

        if not project:
            raise NotFoundError(
                message=f"Project {project_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )

        _check_project_ownership(project, current_user)

        # Update the project
        updated_project = await project_crud.update(
            db, db_obj=project, obj_in=project_in
        )
        logger.info(f"User {current_user.email} updated project {updated_project.slug}")

        # Re-read with counts so the response includes fresh aggregates.
        project_data = await project_crud.get_with_counts(
            db, project_id=updated_project.id
        )

        if not project_data:
            # This shouldn't happen, but handle gracefully
            raise NotFoundError(
                message=f"Project {project_id} not found after update",
                error_code=ErrorCode.NOT_FOUND,
            )

        return _build_project_response(project_data)

    except (NotFoundError, AuthorizationError):
        raise
    except ValueError as e:
        error_msg = str(e)
        if "already exists" in error_msg.lower():
            logger.warning(f"Duplicate project slug attempted: {project_in.slug}")
            # FIX: chain the originating ValueError (``from e``) so the root
            # cause is preserved in tracebacks instead of being discarded.
            raise DuplicateError(
                message=error_msg,
                error_code=ErrorCode.DUPLICATE_ENTRY,
                field="slug",
            ) from e
        logger.error(f"Error updating project: {error_msg}", exc_info=True)
        raise
    except Exception as e:
        logger.error(f"Error updating project {project_id}: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.delete(
    "/{project_id}",
    response_model=MessageResponse,
    summary="Archive Project",
    description="""
Archive a project (soft delete).

Only the project owner or a superuser can archive a project.
Archived projects are not deleted but are no longer accessible for active work.

**Rate Limit**: 10 requests/minute
""",
    operation_id="archive_project",
)
@limiter.limit(f"{10 * RATE_MULTIPLIER}/minute")
async def archive_project(
    request: Request,
    project_id: UUID,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Archive a project by setting its status to ARCHIVED.

    This is a soft delete operation. The project data is preserved.
    """
    try:
        project = await project_crud.get(db, id=project_id)

        if not project:
            raise NotFoundError(
                message=f"Project {project_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )

        _check_project_ownership(project, current_user)

        # Check if project is already archived — archiving is idempotent,
        # so respond with success rather than an error.
        if project.status == ProjectStatus.ARCHIVED:
            return MessageResponse(
                success=True,
                message=f"Project '{project.name}' is already archived",
            )

        archived_project = await project_crud.archive_project(db, project_id=project_id)

        # Defensive: the project could vanish between the get and the archive.
        if not archived_project:
            raise NotFoundError(
                message=f"Failed to archive project {project_id}",
                error_code=ErrorCode.NOT_FOUND,
            )

        logger.info(f"User {current_user.email} archived project {project.slug}")

        return MessageResponse(
            success=True,
            message=f"Project '{archived_project.name}' has been archived",
        )

    except (NotFoundError, AuthorizationError):
        raise
    except Exception as e:
        # Boundary handler: log with traceback and re-raise.
        logger.error(f"Error archiving project {project_id}: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Project Lifecycle Endpoints
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/{project_id}/pause",
    response_model=ProjectResponse,
    summary="Pause Project",
    description="""
Pause an active project.

Only ACTIVE projects can be paused.
Only the project owner or a superuser can pause a project.

**Rate Limit**: 10 requests/minute
""",
    operation_id="pause_project",
)
@limiter.limit(f"{10 * RATE_MULTIPLIER}/minute")
async def pause_project(
    request: Request,
    project_id: UUID,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Pause an active project.

    Sets the project status to PAUSED. Only ACTIVE projects can be paused.
    """
    try:
        project = await project_crud.get(db, id=project_id)
        if not project:
            raise NotFoundError(
                message=f"Project {project_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )

        _check_project_ownership(project, current_user)

        # Business-rule validation (not authorization): map each
        # non-pausable status onto its rejection message.
        blocked = {
            ProjectStatus.PAUSED: "Project is already paused",
            ProjectStatus.ARCHIVED: "Cannot pause an archived project",
            ProjectStatus.COMPLETED: "Cannot pause a completed project",
        }
        if project.status in blocked:
            raise ValidationException(
                message=blocked[project.status],
                error_code=ErrorCode.VALIDATION_ERROR,
                field="status",
            )

        # Flip the status to PAUSED.
        updated = await project_crud.update(
            db, db_obj=project, obj_in=ProjectUpdate(status=ProjectStatus.PAUSED)
        )
        logger.info(f"User {current_user.email} paused project {project.slug}")

        # Re-read with counts so the response carries fresh aggregates.
        data = await project_crud.get_with_counts(db, project_id=updated.id)
        if not data:
            raise NotFoundError(
                message=f"Project {project_id} not found after update",
                error_code=ErrorCode.NOT_FOUND,
            )

        return _build_project_response(data)

    except (NotFoundError, AuthorizationError, ValidationException):
        raise
    except Exception as e:
        logger.error(f"Error pausing project {project_id}: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
|
||||||
|
@router.post(
    "/{project_id}/resume",
    response_model=ProjectResponse,
    summary="Resume Project",
    description="""
Resume a paused project.

Only PAUSED projects can be resumed.
Only the project owner or a superuser can resume a project.

**Rate Limit**: 10 requests/minute
""",
    operation_id="resume_project",
)
@limiter.limit(f"{10 * RATE_MULTIPLIER}/minute")
async def resume_project(
    request: Request,
    project_id: UUID,
    current_user: User = Depends(get_current_user),
    db: AsyncSession = Depends(get_db),
) -> Any:
    """
    Resume a paused project.

    Sets the project status back to ACTIVE. Only PAUSED projects can be resumed.
    """
    try:
        project = await project_crud.get(db, id=project_id)
        if not project:
            raise NotFoundError(
                message=f"Project {project_id} not found",
                error_code=ErrorCode.NOT_FOUND,
            )

        _check_project_ownership(project, current_user)

        # Validate current status (business logic validation, not authorization).
        # Each non-resumable status maps to its specific rejection message.
        rejection_by_status = {
            ProjectStatus.ACTIVE: "Project is already active",
            ProjectStatus.ARCHIVED: "Cannot resume an archived project",
            ProjectStatus.COMPLETED: "Cannot resume a completed project",
        }
        rejection = rejection_by_status.get(project.status)
        if rejection is not None:
            raise ValidationException(
                message=rejection,
                error_code=ErrorCode.VALIDATION_ERROR,
                field="status",
            )

        # Flip the status back to ACTIVE
        updated_project = await project_crud.update(
            db, db_obj=project, obj_in=ProjectUpdate(status=ProjectStatus.ACTIVE)
        )
        logger.info(f"User {current_user.email} resumed project {project.slug}")

        # Re-read the project with its aggregate counts for the response payload
        project_data = await project_crud.get_with_counts(
            db, project_id=updated_project.id
        )

        if not project_data:
            raise NotFoundError(
                message=f"Project {project_id} not found after update",
                error_code=ErrorCode.NOT_FOUND,
            )

        return _build_project_response(project_data)

    except (NotFoundError, AuthorizationError, ValidationException):
        # Domain exceptions carry their own HTTP semantics; propagate untouched.
        raise
    except Exception as e:
        logger.error(f"Error resuming project {project_id}: {e!s}", exc_info=True)
        raise
|
||||||
1186
backend/app/api/routes/sprints.py
Normal file
1186
backend/app/api/routes/sprints.py
Normal file
File diff suppressed because it is too large
Load Diff
116
backend/app/celery_app.py
Normal file
116
backend/app/celery_app.py
Normal file
@@ -0,0 +1,116 @@
|
|||||||
|
# app/celery_app.py
"""
Celery application configuration for Syndarix.

This module configures the Celery app for background task processing:
- Agent execution tasks (LLM calls, tool execution)
- Git operations (clone, commit, push, PR creation)
- Issue synchronization with external trackers
- Workflow state management
- Cost tracking and budget monitoring

Architecture:
- Redis as message broker and result backend
- Queue routing for task isolation
- JSON serialization for cross-language compatibility
- Beat scheduler for periodic tasks
"""

from celery import Celery

from app.core.config import settings

# Create Celery application instance.
# Broker/backend default to REDIS_URL via properties on the settings object.
celery_app = Celery(
    "syndarix",
    broker=settings.celery_broker_url,
    backend=settings.celery_result_backend,
)

# Define task queues with their own exchanges and routing keys
TASK_QUEUES = {
    "agent": {"exchange": "agent", "routing_key": "agent"},
    "git": {"exchange": "git", "routing_key": "git"},
    "sync": {"exchange": "sync", "routing_key": "sync"},
    "default": {"exchange": "default", "routing_key": "default"},
}

# Configure Celery
celery_app.conf.update(
    # Serialization
    task_serializer="json",
    accept_content=["json"],
    result_serializer="json",
    # Timezone
    timezone="UTC",
    enable_utc=True,
    # Task imports for auto-discovery
    imports=("app.tasks",),
    # Default queue
    task_default_queue="default",
    # Task queues configuration
    # NOTE(review): Celery's `task_queues` setting is documented as a list of
    # kombu.Queue instances; confirm this dict-of-dicts form is actually honored.
    task_queues=TASK_QUEUES,
    # Task routing - route tasks to appropriate queues.
    # Routes are matched in order, so the catch-all "app.tasks.*" must stay last.
    task_routes={
        "app.tasks.agent.*": {"queue": "agent"},
        "app.tasks.git.*": {"queue": "git"},
        "app.tasks.sync.*": {"queue": "sync"},
        "app.tasks.*": {"queue": "default"},
    },
    # Time limits per ADR-003
    task_soft_time_limit=300,  # 5 minutes soft limit
    task_time_limit=600,  # 10 minutes hard limit
    # Result expiration - 24 hours
    result_expires=86400,
    # Broker connection retry
    broker_connection_retry_on_startup=True,
    # Retry configuration per ADR-003 (built-in retry with backoff)
    # NOTE(review): `task_autoretry_for`, `task_retry_kwargs`, and the
    # `task_retry_backoff*` / `task_retry_jitter` keys are not documented
    # app-level Celery settings; autoretry is normally configured per task via
    # the @task decorator. Verify these take effect in the Celery version used.
    task_autoretry_for=(Exception,),  # Retry on all exceptions
    task_retry_kwargs={"max_retries": 3, "countdown": 5},  # Initial 5s delay
    task_retry_backoff=True,  # Enable exponential backoff
    task_retry_backoff_max=600,  # Max 10 minutes between retries
    task_retry_jitter=True,  # Add jitter to prevent thundering herd
    # Beat schedule for periodic tasks
    beat_schedule={
        # Cost aggregation every hour per ADR-012
        "aggregate-daily-costs": {
            "task": "app.tasks.cost.aggregate_daily_costs",
            "schedule": 3600.0,  # 1 hour in seconds
        },
        # Reset daily budget counters at midnight UTC
        # NOTE(review): a plain 86400s interval fires 24h after beat startup,
        # not at midnight UTC — a crontab schedule would pin the time of day.
        "reset-daily-budget-counters": {
            "task": "app.tasks.cost.reset_daily_budget_counters",
            "schedule": 86400.0,  # 24 hours in seconds
        },
        # Check for stale workflows every 5 minutes
        "recover-stale-workflows": {
            "task": "app.tasks.workflow.recover_stale_workflows",
            "schedule": 300.0,  # 5 minutes in seconds
        },
        # Incremental issue sync every minute per ADR-011
        "sync-issues-incremental": {
            "task": "app.tasks.sync.sync_issues_incremental",
            "schedule": 60.0,  # 1 minute in seconds
        },
        # Full issue reconciliation every 15 minutes per ADR-011
        "sync-issues-full": {
            "task": "app.tasks.sync.sync_issues_full",
            "schedule": 900.0,  # 15 minutes in seconds
        },
    },
    # Task execution settings
    task_acks_late=True,  # Acknowledge tasks after execution
    task_reject_on_worker_lost=True,  # Reject tasks if worker dies
    worker_prefetch_multiplier=1,  # Fair task distribution
)

# Auto-discover tasks from task modules
celery_app.autodiscover_tasks(
    [
        "app.tasks.agent",
        "app.tasks.git",
        "app.tasks.sync",
        "app.tasks.workflow",
        "app.tasks.cost",
    ]
)
|
||||||
@@ -5,7 +5,7 @@ from pydantic_settings import BaseSettings
|
|||||||
|
|
||||||
|
|
||||||
class Settings(BaseSettings):
|
class Settings(BaseSettings):
|
||||||
PROJECT_NAME: str = "PragmaStack"
|
PROJECT_NAME: str = "Syndarix"
|
||||||
VERSION: str = "1.0.0"
|
VERSION: str = "1.0.0"
|
||||||
API_V1_STR: str = "/api/v1"
|
API_V1_STR: str = "/api/v1"
|
||||||
|
|
||||||
@@ -39,6 +39,32 @@ class Settings(BaseSettings):
|
|||||||
db_pool_timeout: int = 30 # Seconds to wait for a connection
|
db_pool_timeout: int = 30 # Seconds to wait for a connection
|
||||||
db_pool_recycle: int = 3600 # Recycle connections after 1 hour
|
db_pool_recycle: int = 3600 # Recycle connections after 1 hour
|
||||||
|
|
||||||
|
# Redis configuration (Syndarix: cache, pub/sub, Celery broker)
|
||||||
|
REDIS_URL: str = Field(
|
||||||
|
default="redis://localhost:6379/0",
|
||||||
|
description="Redis URL for cache, pub/sub, and Celery broker",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Celery configuration (Syndarix: background task processing)
|
||||||
|
CELERY_BROKER_URL: str | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="Celery broker URL (defaults to REDIS_URL if not set)",
|
||||||
|
)
|
||||||
|
CELERY_RESULT_BACKEND: str | None = Field(
|
||||||
|
default=None,
|
||||||
|
description="Celery result backend URL (defaults to REDIS_URL if not set)",
|
||||||
|
)
|
||||||
|
|
||||||
|
@property
|
||||||
|
def celery_broker_url(self) -> str:
|
||||||
|
"""Get Celery broker URL, defaulting to Redis."""
|
||||||
|
return self.CELERY_BROKER_URL or self.REDIS_URL
|
||||||
|
|
||||||
|
@property
|
||||||
|
def celery_result_backend(self) -> str:
|
||||||
|
"""Get Celery result backend URL, defaulting to Redis."""
|
||||||
|
return self.CELERY_RESULT_BACKEND or self.REDIS_URL
|
||||||
|
|
||||||
# SQL debugging (disable in production)
|
# SQL debugging (disable in production)
|
||||||
sql_echo: bool = False # Log SQL statements
|
sql_echo: bool = False # Log SQL statements
|
||||||
sql_echo_pool: bool = False # Log connection pool events
|
sql_echo_pool: bool = False # Log connection pool events
|
||||||
|
|||||||
474
backend/app/core/redis.py
Normal file
474
backend/app/core/redis.py
Normal file
@@ -0,0 +1,474 @@
|
|||||||
|
# app/core/redis.py
|
||||||
|
"""
|
||||||
|
Redis client configuration for caching and pub/sub.
|
||||||
|
|
||||||
|
This module provides async Redis connectivity with connection pooling
|
||||||
|
for FastAPI endpoints and background tasks.
|
||||||
|
|
||||||
|
Features:
|
||||||
|
- Connection pooling for efficient resource usage
|
||||||
|
- Cache operations (get, set, delete, expire)
|
||||||
|
- Pub/sub operations (publish, subscribe)
|
||||||
|
- Health check for monitoring
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from collections.abc import AsyncGenerator
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from redis.asyncio import ConnectionPool, Redis
|
||||||
|
from redis.asyncio.client import PubSub
|
||||||
|
from redis.exceptions import ConnectionError, RedisError, TimeoutError
|
||||||
|
|
||||||
|
from app.core.config import settings
|
||||||
|
|
||||||
|
# Configure logging
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Default TTL for cache entries (1 hour)
|
||||||
|
DEFAULT_CACHE_TTL = 3600
|
||||||
|
|
||||||
|
# Connection pool settings
|
||||||
|
POOL_MAX_CONNECTIONS = 50
|
||||||
|
POOL_TIMEOUT = 10 # seconds
|
||||||
|
|
||||||
|
|
||||||
|
class RedisClient:
    """
    Async Redis client with connection pooling.

    Provides high-level operations for caching and pub/sub
    with proper error handling and connection management.

    Cache and pub/sub methods swallow Redis connection/protocol errors and
    return a failure value (None/False/0) after logging, so callers degrade
    gracefully when Redis is unavailable.
    """

    def __init__(self, url: str | None = None) -> None:
        """
        Initialize Redis client.

        Args:
            url: Redis connection URL. Defaults to settings.REDIS_URL.
        """
        self._url = url or settings.REDIS_URL
        # Pool and client are created lazily on first use (see _ensure_pool).
        self._pool: ConnectionPool | None = None
        self._client: Redis | None = None
        self._lock = asyncio.Lock()

    async def _ensure_pool(self) -> ConnectionPool:
        """Ensure connection pool is initialized (thread-safe)."""
        if self._pool is None:
            async with self._lock:
                # Double-check after acquiring lock
                if self._pool is None:
                    self._pool = ConnectionPool.from_url(
                        self._url,
                        max_connections=POOL_MAX_CONNECTIONS,
                        socket_timeout=POOL_TIMEOUT,
                        socket_connect_timeout=POOL_TIMEOUT,
                        decode_responses=True,
                        health_check_interval=30,
                    )
                    logger.info("Redis connection pool initialized")
        return self._pool

    async def _get_client(self) -> Redis:
        """Get Redis client instance from pool."""
        pool = await self._ensure_pool()
        # NOTE(review): client creation is not guarded by self._lock; two
        # concurrent first calls could each wrap the same pool — confirm benign.
        if self._client is None:
            self._client = Redis(connection_pool=pool)
        return self._client

    # =========================================================================
    # Cache Operations
    # =========================================================================

    async def cache_get(self, key: str) -> str | None:
        """
        Get a value from cache.

        Args:
            key: Cache key.

        Returns:
            Cached value or None if not found (or on Redis error).
        """
        try:
            client = await self._get_client()
            value = await client.get(key)
            if value is not None:
                logger.debug(f"Cache hit for key: {key}")
            else:
                logger.debug(f"Cache miss for key: {key}")
            return value
        except (ConnectionError, TimeoutError) as e:
            logger.error(f"Redis cache_get failed for key '{key}': {e}")
            return None
        except RedisError as e:
            logger.error(f"Redis error in cache_get for key '{key}': {e}")
            return None

    async def cache_get_json(self, key: str) -> Any | None:
        """
        Get a JSON-serialized value from cache.

        Args:
            key: Cache key.

        Returns:
            Deserialized value or None if not found or not valid JSON.
        """
        value = await self.cache_get(key)
        if value is not None:
            try:
                return json.loads(value)
            except json.JSONDecodeError as e:
                logger.error(f"Failed to decode JSON for key '{key}': {e}")
                return None
        return None

    async def cache_set(
        self,
        key: str,
        value: str,
        ttl: int | None = None,
    ) -> bool:
        """
        Set a value in cache.

        Args:
            key: Cache key.
            value: Value to cache.
            ttl: Time-to-live in seconds. Defaults to DEFAULT_CACHE_TTL.

        Returns:
            True if successful, False otherwise.
        """
        try:
            client = await self._get_client()
            # ttl=0 is respected as given; only None falls back to the default.
            ttl = ttl if ttl is not None else DEFAULT_CACHE_TTL
            await client.set(key, value, ex=ttl)
            logger.debug(f"Cache set for key: {key} (TTL: {ttl}s)")
            return True
        except (ConnectionError, TimeoutError) as e:
            logger.error(f"Redis cache_set failed for key '{key}': {e}")
            return False
        except RedisError as e:
            logger.error(f"Redis error in cache_set for key '{key}': {e}")
            return False

    async def cache_set_json(
        self,
        key: str,
        value: Any,
        ttl: int | None = None,
    ) -> bool:
        """
        Set a JSON-serialized value in cache.

        Args:
            key: Cache key.
            value: Value to serialize and cache.
            ttl: Time-to-live in seconds.

        Returns:
            True if successful, False otherwise (including serialization failure).
        """
        try:
            serialized = json.dumps(value)
            return await self.cache_set(key, serialized, ttl)
        except (TypeError, ValueError) as e:
            logger.error(f"Failed to serialize value for key '{key}': {e}")
            return False

    async def cache_delete(self, key: str) -> bool:
        """
        Delete a key from cache.

        Args:
            key: Cache key to delete.

        Returns:
            True if key was deleted, False otherwise.
        """
        try:
            client = await self._get_client()
            result = await client.delete(key)
            logger.debug(f"Cache delete for key: {key} (deleted: {result > 0})")
            return result > 0
        except (ConnectionError, TimeoutError) as e:
            logger.error(f"Redis cache_delete failed for key '{key}': {e}")
            return False
        except RedisError as e:
            logger.error(f"Redis error in cache_delete for key '{key}': {e}")
            return False

    async def cache_delete_pattern(self, pattern: str) -> int:
        """
        Delete all keys matching a pattern.

        Uses SCAN (not KEYS), so it does not block Redis on large keyspaces;
        keys are deleted one at a time as they are found.

        Args:
            pattern: Glob-style pattern (e.g., "user:*").

        Returns:
            Number of keys deleted.
        """
        try:
            client = await self._get_client()
            deleted = 0
            async for key in client.scan_iter(pattern):
                await client.delete(key)
                deleted += 1
            logger.debug(f"Cache delete pattern '{pattern}': {deleted} keys deleted")
            return deleted
        except (ConnectionError, TimeoutError) as e:
            logger.error(f"Redis cache_delete_pattern failed for '{pattern}': {e}")
            return 0
        except RedisError as e:
            logger.error(f"Redis error in cache_delete_pattern for '{pattern}': {e}")
            return 0

    async def cache_expire(self, key: str, ttl: int) -> bool:
        """
        Set or update TTL for a key.

        Args:
            key: Cache key.
            ttl: New TTL in seconds.

        Returns:
            True if TTL was set, False if key doesn't exist.
        """
        try:
            client = await self._get_client()
            result = await client.expire(key, ttl)
            logger.debug(
                f"Cache expire for key: {key} (TTL: {ttl}s, success: {result})"
            )
            return result
        except (ConnectionError, TimeoutError) as e:
            logger.error(f"Redis cache_expire failed for key '{key}': {e}")
            return False
        except RedisError as e:
            logger.error(f"Redis error in cache_expire for key '{key}': {e}")
            return False

    async def cache_exists(self, key: str) -> bool:
        """
        Check if a key exists in cache.

        Args:
            key: Cache key.

        Returns:
            True if key exists, False otherwise.
        """
        try:
            client = await self._get_client()
            result = await client.exists(key)
            return result > 0
        except (ConnectionError, TimeoutError) as e:
            logger.error(f"Redis cache_exists failed for key '{key}': {e}")
            return False
        except RedisError as e:
            logger.error(f"Redis error in cache_exists for key '{key}': {e}")
            return False

    async def cache_ttl(self, key: str) -> int:
        """
        Get remaining TTL for a key.

        Args:
            key: Cache key.

        Returns:
            TTL in seconds, -1 if no TTL, -2 if key doesn't exist.
            Note: -2 is also returned on Redis errors, indistinguishable
            from a missing key.
        """
        try:
            client = await self._get_client()
            return await client.ttl(key)
        except (ConnectionError, TimeoutError) as e:
            logger.error(f"Redis cache_ttl failed for key '{key}': {e}")
            return -2
        except RedisError as e:
            logger.error(f"Redis error in cache_ttl for key '{key}': {e}")
            return -2

    # =========================================================================
    # Pub/Sub Operations
    # =========================================================================

    async def publish(self, channel: str, message: str | dict) -> int:
        """
        Publish a message to a channel.

        Args:
            channel: Channel name.
            message: Message to publish (string or dict for JSON serialization).

        Returns:
            Number of subscribers that received the message (0 on error).
        """
        try:
            client = await self._get_client()
            if isinstance(message, dict):
                message = json.dumps(message)
            result = await client.publish(channel, message)
            logger.debug(f"Published to channel '{channel}': {result} subscribers")
            return result
        except (ConnectionError, TimeoutError) as e:
            logger.error(f"Redis publish failed for channel '{channel}': {e}")
            return 0
        except RedisError as e:
            logger.error(f"Redis error in publish for channel '{channel}': {e}")
            return 0

    @asynccontextmanager
    async def subscribe(self, *channels: str) -> AsyncGenerator[PubSub, None]:
        """
        Subscribe to one or more channels.

        Usage:
            async with redis_client.subscribe("channel1", "channel2") as pubsub:
                async for message in pubsub.listen():
                    if message["type"] == "message":
                        print(message["data"])

        Args:
            channels: Channel names to subscribe to.

        Yields:
            PubSub instance for receiving messages.
        """
        client = await self._get_client()
        pubsub = client.pubsub()
        try:
            await pubsub.subscribe(*channels)
            logger.debug(f"Subscribed to channels: {channels}")
            yield pubsub
        finally:
            # Always unsubscribe and release the pubsub connection,
            # even if the consumer raised while listening.
            await pubsub.unsubscribe(*channels)
            await pubsub.close()
            logger.debug(f"Unsubscribed from channels: {channels}")

    @asynccontextmanager
    async def psubscribe(self, *patterns: str) -> AsyncGenerator[PubSub, None]:
        """
        Subscribe to channels matching patterns.

        Usage:
            async with redis_client.psubscribe("user:*") as pubsub:
                async for message in pubsub.listen():
                    if message["type"] == "pmessage":
                        print(message["pattern"], message["channel"], message["data"])

        Args:
            patterns: Glob-style patterns to subscribe to.

        Yields:
            PubSub instance for receiving messages.
        """
        client = await self._get_client()
        pubsub = client.pubsub()
        try:
            await pubsub.psubscribe(*patterns)
            logger.debug(f"Pattern subscribed: {patterns}")
            yield pubsub
        finally:
            await pubsub.punsubscribe(*patterns)
            await pubsub.close()
            logger.debug(f"Pattern unsubscribed: {patterns}")

    # =========================================================================
    # Health & Connection Management
    # =========================================================================

    async def health_check(self) -> bool:
        """
        Check if Redis connection is healthy.

        Returns:
            True if connection is successful, False otherwise.
        """
        try:
            client = await self._get_client()
            result = await client.ping()
            return result is True
        except (ConnectionError, TimeoutError) as e:
            logger.error(f"Redis health check failed: {e}")
            return False
        except RedisError as e:
            logger.error(f"Redis health check error: {e}")
            return False

    async def close(self) -> None:
        """
        Close Redis connections and cleanup resources.

        Should be called during application shutdown. Resets the lazily
        created client and pool so a later call would re-initialize them.
        """
        if self._client:
            await self._client.close()
            self._client = None
            logger.debug("Redis client closed")

        if self._pool:
            await self._pool.disconnect()
            self._pool = None
            logger.info("Redis connection pool closed")

    async def get_pool_info(self) -> dict[str, Any]:
        """
        Get connection pool statistics.

        Returns:
            Dictionary with pool information. The URL is stripped of any
            credentials (everything before '@') before being reported.
        """
        if self._pool is None:
            return {"status": "not_initialized"}

        return {
            "status": "active",
            "max_connections": POOL_MAX_CONNECTIONS,
            "url": self._url.split("@")[-1] if "@" in self._url else self._url,
        }
|
||||||
|
|
||||||
|
|
||||||
|
# Global Redis client instance.
# Module-level singleton shared by the FastAPI dependency (get_redis) and the
# helper functions below; connections are created lazily on first use.
redis_client = RedisClient()
|
||||||
|
|
||||||
|
|
||||||
|
# FastAPI dependency for Redis client
|
||||||
|
async def get_redis() -> AsyncGenerator[RedisClient, None]:
    """
    FastAPI dependency yielding the shared :class:`RedisClient` singleton.

    Usage:
        @router.get("/cached-data")
        async def get_data(redis: RedisClient = Depends(get_redis)):
            cached = await redis.cache_get("my-key")
            ...
    """
    # The client manages its own pooled connections, so no per-request
    # teardown is needed after the yield.
    yield redis_client
|
||||||
|
|
||||||
|
|
||||||
|
# Health check function for use in /health endpoint
|
||||||
|
async def check_redis_health() -> bool:
    """
    Report whether the shared Redis connection is currently healthy.

    Intended for the application's /health endpoint.

    Returns:
        True if a PING round-trip succeeds, False otherwise.
    """
    return await redis_client.health_check()
|
||||||
|
|
||||||
|
|
||||||
|
# Cleanup function for application shutdown
|
||||||
|
async def close_redis() -> None:
    """
    Shut down the shared Redis client and its connection pool.

    Call once during application shutdown.
    """
    await redis_client.close()
|
||||||
20
backend/app/crud/syndarix/__init__.py
Normal file
20
backend/app/crud/syndarix/__init__.py
Normal file
@@ -0,0 +1,20 @@
|
|||||||
|
# app/crud/syndarix/__init__.py
|
||||||
|
"""
|
||||||
|
Syndarix CRUD operations.
|
||||||
|
|
||||||
|
This package contains CRUD operations for all Syndarix domain entities.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .agent_instance import agent_instance
|
||||||
|
from .agent_type import agent_type
|
||||||
|
from .issue import issue
|
||||||
|
from .project import project
|
||||||
|
from .sprint import sprint
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"agent_instance",
|
||||||
|
"agent_type",
|
||||||
|
"issue",
|
||||||
|
"project",
|
||||||
|
"sprint",
|
||||||
|
]
|
||||||
394
backend/app/crud/syndarix/agent_instance.py
Normal file
394
backend/app/crud/syndarix/agent_instance.py
Normal file
@@ -0,0 +1,394 @@
|
|||||||
|
# app/crud/syndarix/agent_instance.py
|
||||||
|
"""Async CRUD operations for AgentInstance model using SQLAlchemy 2.0 patterns."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
from decimal import Decimal
|
||||||
|
from typing import Any
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from sqlalchemy import func, select, update
|
||||||
|
from sqlalchemy.exc import IntegrityError
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
from sqlalchemy.orm import joinedload
|
||||||
|
|
||||||
|
from app.crud.base import CRUDBase
|
||||||
|
from app.models.syndarix import AgentInstance, Issue
|
||||||
|
from app.models.syndarix.enums import AgentStatus
|
||||||
|
from app.schemas.syndarix import AgentInstanceCreate, AgentInstanceUpdate
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class CRUDAgentInstance(
|
||||||
|
CRUDBase[AgentInstance, AgentInstanceCreate, AgentInstanceUpdate]
|
||||||
|
):
|
||||||
|
"""Async CRUD operations for AgentInstance model."""
|
||||||
|
|
||||||
|
async def create(
|
||||||
|
self, db: AsyncSession, *, obj_in: AgentInstanceCreate
|
||||||
|
) -> AgentInstance:
|
||||||
|
"""Create a new agent instance with error handling."""
|
||||||
|
try:
|
||||||
|
db_obj = AgentInstance(
|
||||||
|
agent_type_id=obj_in.agent_type_id,
|
||||||
|
project_id=obj_in.project_id,
|
||||||
|
name=obj_in.name,
|
||||||
|
status=obj_in.status,
|
||||||
|
current_task=obj_in.current_task,
|
||||||
|
short_term_memory=obj_in.short_term_memory,
|
||||||
|
long_term_memory_ref=obj_in.long_term_memory_ref,
|
||||||
|
session_id=obj_in.session_id,
|
||||||
|
)
|
||||||
|
db.add(db_obj)
|
||||||
|
await db.commit()
|
||||||
|
await db.refresh(db_obj)
|
||||||
|
return db_obj
|
||||||
|
except IntegrityError as e:
|
||||||
|
await db.rollback()
|
||||||
|
error_msg = str(e.orig) if hasattr(e, "orig") else str(e)
|
||||||
|
logger.error(f"Integrity error creating agent instance: {error_msg}")
|
||||||
|
raise ValueError(f"Database integrity error: {error_msg}")
|
||||||
|
except Exception as e:
|
||||||
|
await db.rollback()
|
||||||
|
logger.error(
|
||||||
|
f"Unexpected error creating agent instance: {e!s}", exc_info=True
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
|
||||||
|
    async def get_with_details(
        self,
        db: AsyncSession,
        *,
        instance_id: UUID,
    ) -> dict[str, Any] | None:
        """
        Get an agent instance with full details including related entities.

        Args:
            db: Active async database session.
            instance_id: Primary key of the agent instance.

        Returns:
            Dictionary with instance and related entity details, or None if
            no instance exists with the given id.

        Raises:
            Exception: Database errors are logged with traceback and re-raised.
        """
        try:
            # Get instance with joined relationships
            # (joinedload fetches agent_type and project in the same query).
            result = await db.execute(
                select(AgentInstance)
                .options(
                    joinedload(AgentInstance.agent_type),
                    joinedload(AgentInstance.project),
                )
                .where(AgentInstance.id == instance_id)
            )
            instance = result.scalar_one_or_none()

            if not instance:
                return None

            # Get assigned issues count (separate aggregate query)
            issues_count_result = await db.execute(
                select(func.count(Issue.id)).where(
                    Issue.assigned_agent_id == instance_id
                )
            )
            assigned_issues_count = issues_count_result.scalar_one()

            # Flatten related-entity names/slugs; None when the relationship
            # is unset so callers don't have to null-check the ORM objects.
            return {
                "instance": instance,
                "agent_type_name": instance.agent_type.name
                if instance.agent_type
                else None,
                "agent_type_slug": instance.agent_type.slug
                if instance.agent_type
                else None,
                "project_name": instance.project.name if instance.project else None,
                "project_slug": instance.project.slug if instance.project else None,
                "assigned_issues_count": assigned_issues_count,
            }
        except Exception as e:
            logger.error(
                f"Error getting agent instance with details {instance_id}: {e!s}",
                exc_info=True,
            )
            raise
|
||||||
|
|
||||||
|
async def get_by_project(
    self,
    db: AsyncSession,
    *,
    project_id: UUID,
    status: AgentStatus | None = None,
    skip: int = 0,
    limit: int = 100,
) -> tuple[list[AgentInstance], int]:
    """Return a page of a project's agent instances plus the total count.

    Args:
        db: Active async database session.
        project_id: Project whose instances are listed.
        status: Optional status filter.
        skip: Offset into the result set.
        limit: Maximum number of rows returned.
    """
    try:
        base = select(AgentInstance).where(AgentInstance.project_id == project_id)
        if status is not None:
            base = base.where(AgentInstance.status == status)

        # Total matching rows, computed before pagination is applied.
        total = (
            await db.execute(select(func.count()).select_from(base.alias()))
        ).scalar_one()

        page_stmt = (
            base.order_by(AgentInstance.created_at.desc())
            .offset(skip)
            .limit(limit)
        )
        rows = (await db.execute(page_stmt)).scalars().all()
        return list(rows), total
    except Exception as e:
        logger.error(
            f"Error getting instances by project {project_id}: {e!s}",
            exc_info=True,
        )
        raise
|
||||||
|
|
||||||
|
async def get_by_agent_type(
    self,
    db: AsyncSession,
    *,
    agent_type_id: UUID,
    status: AgentStatus | None = None,
) -> list[AgentInstance]:
    """Return every instance of the given agent type, newest first.

    Args:
        db: Active async database session.
        agent_type_id: Agent type whose instances are fetched.
        status: Optional status filter.
    """
    try:
        stmt = select(AgentInstance).where(
            AgentInstance.agent_type_id == agent_type_id
        )
        if status is not None:
            stmt = stmt.where(AgentInstance.status == status)
        stmt = stmt.order_by(AgentInstance.created_at.desc())
        return list((await db.execute(stmt)).scalars().all())
    except Exception as e:
        logger.error(
            f"Error getting instances by agent type {agent_type_id}: {e!s}",
            exc_info=True,
        )
        raise
|
||||||
|
|
||||||
|
async def update_status(
    self,
    db: AsyncSession,
    *,
    instance_id: UUID,
    status: AgentStatus,
    current_task: str | None = None,
) -> AgentInstance | None:
    """Set a new status (and optionally the current task) on an instance.

    Also bumps ``last_activity_at`` to now (UTC). Returns the refreshed
    instance, or ``None`` when the id does not exist. Rolls the session
    back and re-raises on failure.
    """
    try:
        lookup = select(AgentInstance).where(AgentInstance.id == instance_id)
        instance = (await db.execute(lookup)).scalar_one_or_none()
        if instance is None:
            return None

        instance.status = status
        instance.last_activity_at = datetime.now(UTC)
        # Only overwrite the task when one was explicitly supplied.
        if current_task is not None:
            instance.current_task = current_task

        await db.commit()
        await db.refresh(instance)
        return instance
    except Exception as e:
        await db.rollback()
        logger.error(
            f"Error updating instance status {instance_id}: {e!s}", exc_info=True
        )
        raise
|
||||||
|
|
||||||
|
async def terminate(
    self,
    db: AsyncSession,
    *,
    instance_id: UUID,
) -> AgentInstance | None:
    """Terminate a single agent instance.

    Any issues assigned to the agent are unassigned first so no issue is
    left pointing at a terminated agent. Returns the refreshed instance,
    or ``None`` when the id does not exist.
    """
    try:
        lookup = select(AgentInstance).where(AgentInstance.id == instance_id)
        instance = (await db.execute(lookup)).scalar_one_or_none()
        if instance is None:
            return None

        # Detach every issue still assigned to this agent.
        await db.execute(
            update(Issue)
            .where(Issue.assigned_agent_id == instance_id)
            .values(assigned_agent_id=None)
        )

        instance.status = AgentStatus.TERMINATED
        instance.terminated_at = datetime.now(UTC)
        instance.current_task = None
        instance.session_id = None

        await db.commit()
        await db.refresh(instance)
        return instance
    except Exception as e:
        await db.rollback()
        logger.error(
            f"Error terminating instance {instance_id}: {e!s}", exc_info=True
        )
        raise
|
||||||
|
|
||||||
|
async def record_task_completion(
    self,
    db: AsyncSession,
    *,
    instance_id: UUID,
    tokens_used: int,
    cost_incurred: Decimal,
) -> AgentInstance | None:
    """Atomically record one finished task plus its token/cost usage.

    The counters are incremented in a single SQL ``UPDATE`` so concurrent
    completions never lose updates (no read-modify-write round trip).
    Returns the updated instance, or ``None`` when the id is unknown.
    """
    try:
        now = datetime.now(UTC)
        stmt = (
            update(AgentInstance)
            .where(AgentInstance.id == instance_id)
            .values(
                tasks_completed=AgentInstance.tasks_completed + 1,
                tokens_used=AgentInstance.tokens_used + tokens_used,
                cost_incurred=AgentInstance.cost_incurred + cost_incurred,
                last_activity_at=now,
                updated_at=now,
            )
            .returning(AgentInstance)
        )
        updated = (await db.execute(stmt)).scalar_one_or_none()
        if updated is None:
            return None

        await db.commit()
        return updated
    except Exception as e:
        await db.rollback()
        logger.error(
            f"Error recording task completion {instance_id}: {e!s}", exc_info=True
        )
        raise
|
||||||
|
|
||||||
|
async def get_project_metrics(
    self,
    db: AsyncSession,
    *,
    project_id: UUID,
) -> dict[str, Any]:
    """Aggregate instance counts, task totals, tokens and cost for a project."""
    try:
        stmt = select(
            func.count(AgentInstance.id).label("total_instances"),
            func.count(AgentInstance.id)
            .filter(AgentInstance.status == AgentStatus.WORKING)
            .label("active_instances"),
            func.count(AgentInstance.id)
            .filter(AgentInstance.status == AgentStatus.IDLE)
            .label("idle_instances"),
            func.sum(AgentInstance.tasks_completed).label("total_tasks"),
            func.sum(AgentInstance.tokens_used).label("total_tokens"),
            func.sum(AgentInstance.cost_incurred).label("total_cost"),
        ).where(AgentInstance.project_id == project_id)
        row = (await db.execute(stmt)).one()

        # SUM over an empty set yields NULL; normalise to zero values.
        return {
            "total_instances": row.total_instances or 0,
            "active_instances": row.active_instances or 0,
            "idle_instances": row.idle_instances or 0,
            "total_tasks_completed": row.total_tasks or 0,
            "total_tokens_used": row.total_tokens or 0,
            "total_cost_incurred": row.total_cost or Decimal("0.0000"),
        }
    except Exception as e:
        logger.error(
            f"Error getting project metrics {project_id}: {e!s}", exc_info=True
        )
        raise
|
||||||
|
|
||||||
|
async def bulk_terminate_by_project(
    self,
    db: AsyncSession,
    *,
    project_id: UUID,
) -> int:
    """Terminate every non-terminated instance in a project.

    Issues assigned to the affected agents are unassigned first so none
    are left pointing at terminated agents. Returns the number of
    instances that were terminated.
    """
    try:
        # Collect the ids of the agents that are about to be terminated.
        id_rows = await db.execute(
            select(AgentInstance.id).where(
                AgentInstance.project_id == project_id,
                AgentInstance.status != AgentStatus.TERMINATED,
            )
        )
        doomed_ids = [row[0] for row in id_rows.fetchall()]

        # Drop any issue assignments that reference those agents.
        if doomed_ids:
            await db.execute(
                update(Issue)
                .where(Issue.assigned_agent_id.in_(doomed_ids))
                .values(assigned_agent_id=None)
            )

        now = datetime.now(UTC)
        terminate_stmt = (
            update(AgentInstance)
            .where(
                AgentInstance.project_id == project_id,
                AgentInstance.status != AgentStatus.TERMINATED,
            )
            .values(
                status=AgentStatus.TERMINATED,
                terminated_at=now,
                current_task=None,
                session_id=None,
                updated_at=now,
            )
        )
        outcome = await db.execute(terminate_stmt)
        await db.commit()

        terminated_count = outcome.rowcount
        logger.info(
            f"Bulk terminated {terminated_count} instances in project {project_id}"
        )
        return terminated_count
    except Exception as e:
        await db.rollback()
        logger.error(
            f"Error bulk terminating instances for project {project_id}: {e!s}",
            exc_info=True,
        )
        raise
|
||||||
|
|
||||||
|
|
||||||
|
# Create a singleton instance for use across the application
agent_instance = CRUDAgentInstance(AgentInstance)
|
||||||
265
backend/app/crud/syndarix/agent_type.py
Normal file
265
backend/app/crud/syndarix/agent_type.py
Normal file
@@ -0,0 +1,265 @@
|
|||||||
|
# app/crud/syndarix/agent_type.py
|
||||||
|
"""Async CRUD operations for AgentType model using SQLAlchemy 2.0 patterns."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import Any
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from sqlalchemy import func, or_, select
|
||||||
|
from sqlalchemy.exc import IntegrityError
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
|
from app.crud.base import CRUDBase
|
||||||
|
from app.models.syndarix import AgentInstance, AgentType
|
||||||
|
from app.schemas.syndarix import AgentTypeCreate, AgentTypeUpdate
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class CRUDAgentType(CRUDBase[AgentType, AgentTypeCreate, AgentTypeUpdate]):
    """Async CRUD operations for AgentType model."""

    async def get_by_slug(self, db: AsyncSession, *, slug: str) -> AgentType | None:
        """Get an agent type by its unique slug, or None when absent."""
        try:
            result = await db.execute(select(AgentType).where(AgentType.slug == slug))
            return result.scalar_one_or_none()
        except Exception as e:
            logger.error(f"Error getting agent type by slug {slug}: {e!s}")
            raise

    async def create(self, db: AsyncSession, *, obj_in: AgentTypeCreate) -> AgentType:
        """Create a new agent type with error handling.

        Raises:
            ValueError: When the slug already exists or another integrity
                constraint is violated (chained from the IntegrityError).
        """
        try:
            db_obj = AgentType(
                name=obj_in.name,
                slug=obj_in.slug,
                description=obj_in.description,
                expertise=obj_in.expertise,
                personality_prompt=obj_in.personality_prompt,
                primary_model=obj_in.primary_model,
                fallback_models=obj_in.fallback_models,
                model_params=obj_in.model_params,
                mcp_servers=obj_in.mcp_servers,
                tool_permissions=obj_in.tool_permissions,
                is_active=obj_in.is_active,
            )
            db.add(db_obj)
            await db.commit()
            await db.refresh(db_obj)
            return db_obj
        except IntegrityError as e:
            await db.rollback()
            error_msg = str(e.orig) if hasattr(e, "orig") else str(e)
            if "slug" in error_msg.lower():
                logger.warning(f"Duplicate slug attempted: {obj_in.slug}")
                # Chain from the IntegrityError so the root cause survives.
                raise ValueError(
                    f"Agent type with slug '{obj_in.slug}' already exists"
                ) from e
            logger.error(f"Integrity error creating agent type: {error_msg}")
            raise ValueError(f"Database integrity error: {error_msg}") from e
        except Exception as e:
            await db.rollback()
            logger.error(f"Unexpected error creating agent type: {e!s}", exc_info=True)
            raise

    async def get_multi_with_filters(
        self,
        db: AsyncSession,
        *,
        skip: int = 0,
        limit: int = 100,
        is_active: bool | None = None,
        search: str | None = None,
        sort_by: str = "created_at",
        sort_order: str = "desc",
    ) -> tuple[list[AgentType], int]:
        """
        Get multiple agent types with filtering, searching, and sorting.

        Returns:
            Tuple of (agent types list, total count)
        """
        try:
            query = select(AgentType)

            # Apply filters
            if is_active is not None:
                query = query.where(AgentType.is_active == is_active)

            if search:
                search_filter = or_(
                    AgentType.name.ilike(f"%{search}%"),
                    AgentType.slug.ilike(f"%{search}%"),
                    AgentType.description.ilike(f"%{search}%"),
                )
                query = query.where(search_filter)

            # Get total count before pagination
            count_query = select(func.count()).select_from(query.alias())
            count_result = await db.execute(count_query)
            total = count_result.scalar_one()

            # Apply sorting; unknown sort_by values fall back to created_at.
            sort_column = getattr(AgentType, sort_by, AgentType.created_at)
            if sort_order == "desc":
                query = query.order_by(sort_column.desc())
            else:
                query = query.order_by(sort_column.asc())

            # Apply pagination
            query = query.offset(skip).limit(limit)
            result = await db.execute(query)
            agent_types = list(result.scalars().all())

            return agent_types, total
        except Exception as e:
            logger.error(f"Error getting agent types with filters: {e!s}")
            raise

    async def get_with_instance_count(
        self,
        db: AsyncSession,
        *,
        agent_type_id: UUID,
    ) -> dict[str, Any] | None:
        """
        Get a single agent type with its instance count.

        Returns:
            Dictionary with agent_type and instance_count, or None when the
            id does not exist.
        """
        try:
            result = await db.execute(
                select(AgentType).where(AgentType.id == agent_type_id)
            )
            agent_type = result.scalar_one_or_none()

            if not agent_type:
                return None

            # Count instances referencing this type.
            count_result = await db.execute(
                select(func.count(AgentInstance.id)).where(
                    AgentInstance.agent_type_id == agent_type_id
                )
            )
            instance_count = count_result.scalar_one()

            return {
                "agent_type": agent_type,
                "instance_count": instance_count,
            }
        except Exception as e:
            logger.error(
                f"Error getting agent type with count {agent_type_id}: {e!s}",
                exc_info=True,
            )
            raise

    async def get_multi_with_instance_counts(
        self,
        db: AsyncSession,
        *,
        skip: int = 0,
        limit: int = 100,
        is_active: bool | None = None,
        search: str | None = None,
    ) -> tuple[list[dict[str, Any]], int]:
        """
        Get agent types with instance counts in optimized queries.

        Returns:
            Tuple of (list of dicts with agent_type and instance_count, total count)
        """
        try:
            # Get filtered agent types
            agent_types, total = await self.get_multi_with_filters(
                db,
                skip=skip,
                limit=limit,
                is_active=is_active,
                search=search,
            )

            if not agent_types:
                # The page may be empty while matches still exist (e.g. skip
                # beyond the last page) — report the true total, not zero.
                return [], total

            agent_type_ids = [at.id for at in agent_types]

            # Get instance counts in bulk (one grouped query, avoids N+1).
            counts_result = await db.execute(
                select(
                    AgentInstance.agent_type_id,
                    func.count(AgentInstance.id).label("count"),
                )
                .where(AgentInstance.agent_type_id.in_(agent_type_ids))
                .group_by(AgentInstance.agent_type_id)
            )
            counts = {row.agent_type_id: row.count for row in counts_result}

            # Combine results; types with no instances default to 0.
            results = [
                {
                    "agent_type": agent_type,
                    "instance_count": counts.get(agent_type.id, 0),
                }
                for agent_type in agent_types
            ]

            return results, total
        except Exception as e:
            logger.error(f"Error getting agent types with counts: {e!s}", exc_info=True)
            raise

    async def get_by_expertise(
        self,
        db: AsyncSession,
        *,
        expertise: str,
        is_active: bool = True,
    ) -> list[AgentType]:
        """Get agent types that have a specific expertise."""
        try:
            # Use PostgreSQL JSONB contains operator; the lookup value is
            # lowercased before matching.
            query = select(AgentType).where(
                AgentType.expertise.contains([expertise.lower()]),
                AgentType.is_active == is_active,
            )
            result = await db.execute(query)
            return list(result.scalars().all())
        except Exception as e:
            logger.error(
                f"Error getting agent types by expertise {expertise}: {e!s}",
                exc_info=True,
            )
            raise

    async def deactivate(
        self,
        db: AsyncSession,
        *,
        agent_type_id: UUID,
    ) -> AgentType | None:
        """Deactivate an agent type (soft delete)."""
        try:
            result = await db.execute(
                select(AgentType).where(AgentType.id == agent_type_id)
            )
            agent_type = result.scalar_one_or_none()

            if not agent_type:
                return None

            agent_type.is_active = False
            await db.commit()
            await db.refresh(agent_type)
            return agent_type
        except Exception as e:
            await db.rollback()
            logger.error(
                f"Error deactivating agent type {agent_type_id}: {e!s}", exc_info=True
            )
            raise
|
||||||
|
|
||||||
|
|
||||||
|
# Create a singleton instance for use across the application
agent_type = CRUDAgentType(AgentType)
|
||||||
525
backend/app/crud/syndarix/issue.py
Normal file
525
backend/app/crud/syndarix/issue.py
Normal file
@@ -0,0 +1,525 @@
|
|||||||
|
# app/crud/syndarix/issue.py
|
||||||
|
"""Async CRUD operations for Issue model using SQLAlchemy 2.0 patterns."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
from typing import Any
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from sqlalchemy import func, or_, select
|
||||||
|
from sqlalchemy.exc import IntegrityError
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
from sqlalchemy.orm import joinedload
|
||||||
|
|
||||||
|
from app.crud.base import CRUDBase
|
||||||
|
from app.models.syndarix import AgentInstance, Issue
|
||||||
|
from app.models.syndarix.enums import IssuePriority, IssueStatus, SyncStatus
|
||||||
|
from app.schemas.syndarix import IssueCreate, IssueUpdate
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class CRUDIssue(CRUDBase[Issue, IssueCreate, IssueUpdate]):
|
||||||
|
"""Async CRUD operations for Issue model."""
|
||||||
|
|
||||||
|
async def create(self, db: AsyncSession, *, obj_in: IssueCreate) -> Issue:
    """Create a new issue with error handling.

    New issues start with ``sync_status=SyncStatus.SYNCED``.

    Raises:
        ValueError: On database integrity violations (chained from the
            underlying IntegrityError so the root cause is preserved).
    """
    try:
        db_obj = Issue(
            project_id=obj_in.project_id,
            title=obj_in.title,
            body=obj_in.body,
            status=obj_in.status,
            priority=obj_in.priority,
            labels=obj_in.labels,
            assigned_agent_id=obj_in.assigned_agent_id,
            human_assignee=obj_in.human_assignee,
            sprint_id=obj_in.sprint_id,
            story_points=obj_in.story_points,
            external_tracker_type=obj_in.external_tracker_type,
            external_issue_id=obj_in.external_issue_id,
            remote_url=obj_in.remote_url,
            external_issue_number=obj_in.external_issue_number,
            sync_status=SyncStatus.SYNCED,
        )
        db.add(db_obj)
        await db.commit()
        await db.refresh(db_obj)
        return db_obj
    except IntegrityError as e:
        await db.rollback()
        error_msg = str(e.orig) if hasattr(e, "orig") else str(e)
        logger.error(f"Integrity error creating issue: {error_msg}")
        # Chain from the IntegrityError so the root cause survives.
        raise ValueError(f"Database integrity error: {error_msg}") from e
    except Exception as e:
        await db.rollback()
        logger.error(f"Unexpected error creating issue: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
async def get_with_details(
    self,
    db: AsyncSession,
    *,
    issue_id: UUID,
) -> dict[str, Any] | None:
    """Fetch an issue plus the names of its related entities.

    Eager-loads the project, sprint, and assigned agent (with its type).

    Returns:
        Dictionary with the issue and related names, or ``None`` when the
        id does not exist.
    """
    try:
        stmt = (
            select(Issue)
            .options(
                joinedload(Issue.project),
                joinedload(Issue.sprint),
                joinedload(Issue.assigned_agent).joinedload(
                    AgentInstance.agent_type
                ),
            )
            .where(Issue.id == issue_id)
        )
        issue = (await db.execute(stmt)).scalar_one_or_none()
        if issue is None:
            return None

        project = issue.project
        sprint = issue.sprint
        agent = issue.assigned_agent
        agent_type_name = (
            agent.agent_type.name if agent and agent.agent_type else None
        )
        return {
            "issue": issue,
            "project_name": project.name if project else None,
            "project_slug": project.slug if project else None,
            "sprint_name": sprint.name if sprint else None,
            "assigned_agent_type_name": agent_type_name,
        }
    except Exception as e:
        logger.error(
            f"Error getting issue with details {issue_id}: {e!s}", exc_info=True
        )
        raise
|
||||||
|
|
||||||
|
async def get_by_project(
    self,
    db: AsyncSession,
    *,
    project_id: UUID,
    status: IssueStatus | None = None,
    priority: IssuePriority | None = None,
    sprint_id: UUID | None = None,
    assigned_agent_id: UUID | None = None,
    labels: list[str] | None = None,
    search: str | None = None,
    skip: int = 0,
    limit: int = 100,
    sort_by: str = "created_at",
    sort_order: str = "desc",
) -> tuple[list[Issue], int]:
    """Return a filtered, sorted page of a project's issues plus the total.

    Note: when ``labels`` is given, a matching issue must carry *all* of
    the listed labels — each label adds another ``contains`` predicate.
    """
    try:
        stmt = select(Issue).where(Issue.project_id == project_id)

        if status is not None:
            stmt = stmt.where(Issue.status == status)
        if priority is not None:
            stmt = stmt.where(Issue.priority == priority)
        if sprint_id is not None:
            stmt = stmt.where(Issue.sprint_id == sprint_id)
        if assigned_agent_id is not None:
            stmt = stmt.where(Issue.assigned_agent_id == assigned_agent_id)

        # Each label contributes its own contains() predicate, so every
        # provided label must be present on a matching issue (AND).
        for label in labels or []:
            stmt = stmt.where(Issue.labels.contains([label.lower()]))

        if search:
            stmt = stmt.where(
                or_(
                    Issue.title.ilike(f"%{search}%"),
                    Issue.body.ilike(f"%{search}%"),
                )
            )

        # Count matches before pagination is applied.
        total = (
            await db.execute(select(func.count()).select_from(stmt.alias()))
        ).scalar_one()

        # Unknown sort_by values fall back to created_at.
        order_col = getattr(Issue, sort_by, Issue.created_at)
        ordering = order_col.desc() if sort_order == "desc" else order_col.asc()
        stmt = stmt.order_by(ordering).offset(skip).limit(limit)

        issues = list((await db.execute(stmt)).scalars().all())
        return issues, total
    except Exception as e:
        logger.error(
            f"Error getting issues by project {project_id}: {e!s}", exc_info=True
        )
        raise
|
||||||
|
|
||||||
|
async def get_by_sprint(
    self,
    db: AsyncSession,
    *,
    sprint_id: UUID,
    status: IssueStatus | None = None,
) -> list[Issue]:
    """List a sprint's issues — highest priority first, oldest first within."""
    try:
        stmt = select(Issue).where(Issue.sprint_id == sprint_id)
        if status is not None:
            stmt = stmt.where(Issue.status == status)
        stmt = stmt.order_by(Issue.priority.desc(), Issue.created_at.asc())
        return list((await db.execute(stmt)).scalars().all())
    except Exception as e:
        logger.error(
            f"Error getting issues by sprint {sprint_id}: {e!s}", exc_info=True
        )
        raise
|
||||||
|
|
||||||
|
async def assign_to_agent(
    self,
    db: AsyncSession,
    *,
    issue_id: UUID,
    agent_id: UUID | None,
) -> Issue | None:
    """Assign an issue to an agent (or unassign when ``agent_id`` is None).

    The human assignee is always cleared — agent and human assignment are
    mutually exclusive. NOTE(review): it is cleared even when ``agent_id``
    is ``None`` (a pure unassign) — confirm that is intended.
    """
    try:
        issue = (
            await db.execute(select(Issue).where(Issue.id == issue_id))
        ).scalar_one_or_none()
        if issue is None:
            return None

        issue.assigned_agent_id = agent_id
        issue.human_assignee = None  # exclusive with agent assignment
        await db.commit()
        await db.refresh(issue)
        return issue
    except Exception as e:
        await db.rollback()
        logger.error(
            f"Error assigning issue {issue_id} to agent {agent_id}: {e!s}",
            exc_info=True,
        )
        raise
|
||||||
|
|
||||||
|
async def assign_to_human(
    self,
    db: AsyncSession,
    *,
    issue_id: UUID,
    human_assignee: str | None,
) -> Issue | None:
    """Assign an issue to a human (or unassign when ``human_assignee`` is None).

    The agent assignment is always cleared — agent and human assignment
    are mutually exclusive.
    """
    try:
        issue = (
            await db.execute(select(Issue).where(Issue.id == issue_id))
        ).scalar_one_or_none()
        if issue is None:
            return None

        issue.human_assignee = human_assignee
        issue.assigned_agent_id = None  # exclusive with human assignment
        await db.commit()
        await db.refresh(issue)
        return issue
    except Exception as e:
        await db.rollback()
        logger.error(
            f"Error assigning issue {issue_id} to human {human_assignee}: {e!s}",
            exc_info=True,
        )
        raise
|
||||||
|
|
||||||
|
async def close_issue(
    self,
    db: AsyncSession,
    *,
    issue_id: UUID,
) -> Issue | None:
    """Mark an issue CLOSED and stamp ``closed_at`` with the current UTC time."""
    try:
        issue = (
            await db.execute(select(Issue).where(Issue.id == issue_id))
        ).scalar_one_or_none()
        if issue is None:
            return None

        issue.status = IssueStatus.CLOSED
        issue.closed_at = datetime.now(UTC)
        await db.commit()
        await db.refresh(issue)
        return issue
    except Exception as e:
        await db.rollback()
        logger.error(f"Error closing issue {issue_id}: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
async def reopen_issue(
    self,
    db: AsyncSession,
    *,
    issue_id: UUID,
) -> Issue | None:
    """Reopen a closed issue: status back to OPEN and ``closed_at`` cleared."""
    try:
        issue = (
            await db.execute(select(Issue).where(Issue.id == issue_id))
        ).scalar_one_or_none()
        if issue is None:
            return None

        issue.status = IssueStatus.OPEN
        issue.closed_at = None
        await db.commit()
        await db.refresh(issue)
        return issue
    except Exception as e:
        await db.rollback()
        logger.error(f"Error reopening issue {issue_id}: {e!s}", exc_info=True)
        raise
|
||||||
|
|
||||||
|
async def update_sync_status(
    self,
    db: AsyncSession,
    *,
    issue_id: UUID,
    sync_status: SyncStatus,
    last_synced_at: datetime | None = None,
    external_updated_at: datetime | None = None,
) -> Issue | None:
    """Update an issue's external-tracker sync bookkeeping fields.

    Returns the refreshed issue, or ``None`` when the id does not exist.
    """
    try:
        issue = (
            await db.execute(select(Issue).where(Issue.id == issue_id))
        ).scalar_one_or_none()
        if issue is None:
            return None

        issue.sync_status = sync_status
        # Timestamps are only overwritten when values are supplied.
        if last_synced_at:
            issue.last_synced_at = last_synced_at
        if external_updated_at:
            issue.external_updated_at = external_updated_at

        await db.commit()
        await db.refresh(issue)
        return issue
    except Exception as e:
        await db.rollback()
        logger.error(
            f"Error updating sync status for issue {issue_id}: {e!s}", exc_info=True
        )
        raise
|
||||||
|
|
||||||
|
async def get_project_stats(
|
||||||
|
self,
|
||||||
|
db: AsyncSession,
|
||||||
|
*,
|
||||||
|
project_id: UUID,
|
||||||
|
) -> dict[str, Any]:
|
||||||
|
"""Get issue statistics for a project."""
|
||||||
|
try:
|
||||||
|
# Get counts by status
|
||||||
|
status_counts = await db.execute(
|
||||||
|
select(Issue.status, func.count(Issue.id).label("count"))
|
||||||
|
.where(Issue.project_id == project_id)
|
||||||
|
.group_by(Issue.status)
|
||||||
|
)
|
||||||
|
by_status = {row.status.value: row.count for row in status_counts}
|
||||||
|
|
||||||
|
# Get counts by priority
|
||||||
|
priority_counts = await db.execute(
|
||||||
|
select(Issue.priority, func.count(Issue.id).label("count"))
|
||||||
|
.where(Issue.project_id == project_id)
|
||||||
|
.group_by(Issue.priority)
|
||||||
|
)
|
||||||
|
by_priority = {row.priority.value: row.count for row in priority_counts}
|
||||||
|
|
||||||
|
# Get story points
|
||||||
|
points_result = await db.execute(
|
||||||
|
select(
|
||||||
|
func.sum(Issue.story_points).label("total"),
|
||||||
|
func.sum(Issue.story_points)
|
||||||
|
.filter(Issue.status == IssueStatus.CLOSED)
|
||||||
|
.label("completed"),
|
||||||
|
).where(Issue.project_id == project_id)
|
||||||
|
)
|
||||||
|
points_row = points_result.one()
|
||||||
|
|
||||||
|
total_issues = sum(by_status.values())
|
||||||
|
|
||||||
|
return {
|
||||||
|
"total": total_issues,
|
||||||
|
"open": by_status.get("open", 0),
|
||||||
|
"in_progress": by_status.get("in_progress", 0),
|
||||||
|
"in_review": by_status.get("in_review", 0),
|
||||||
|
"blocked": by_status.get("blocked", 0),
|
||||||
|
"closed": by_status.get("closed", 0),
|
||||||
|
"by_priority": by_priority,
|
||||||
|
"total_story_points": points_row.total,
|
||||||
|
"completed_story_points": points_row.completed,
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"Error getting issue stats for project {project_id}: {e!s}",
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
|
||||||
|
async def get_by_external_id(
|
||||||
|
self,
|
||||||
|
db: AsyncSession,
|
||||||
|
*,
|
||||||
|
external_tracker_type: str,
|
||||||
|
external_issue_id: str,
|
||||||
|
) -> Issue | None:
|
||||||
|
"""Get an issue by its external tracker ID."""
|
||||||
|
try:
|
||||||
|
result = await db.execute(
|
||||||
|
select(Issue).where(
|
||||||
|
Issue.external_tracker_type == external_tracker_type,
|
||||||
|
Issue.external_issue_id == external_issue_id,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return result.scalar_one_or_none()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"Error getting issue by external ID {external_tracker_type}:{external_issue_id}: {e!s}",
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
|
||||||
|
async def get_pending_sync(
|
||||||
|
self,
|
||||||
|
db: AsyncSession,
|
||||||
|
*,
|
||||||
|
project_id: UUID | None = None,
|
||||||
|
limit: int = 100,
|
||||||
|
) -> list[Issue]:
|
||||||
|
"""Get issues that need to be synced with external tracker."""
|
||||||
|
try:
|
||||||
|
query = select(Issue).where(
|
||||||
|
Issue.external_tracker_type.isnot(None),
|
||||||
|
Issue.sync_status.in_([SyncStatus.PENDING, SyncStatus.ERROR]),
|
||||||
|
)
|
||||||
|
|
||||||
|
if project_id:
|
||||||
|
query = query.where(Issue.project_id == project_id)
|
||||||
|
|
||||||
|
query = query.order_by(Issue.updated_at.asc()).limit(limit)
|
||||||
|
result = await db.execute(query)
|
||||||
|
return list(result.scalars().all())
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error getting pending sync issues: {e!s}", exc_info=True)
|
||||||
|
raise
|
||||||
|
|
||||||
|
async def remove_sprint_from_issues(
|
||||||
|
self,
|
||||||
|
db: AsyncSession,
|
||||||
|
*,
|
||||||
|
sprint_id: UUID,
|
||||||
|
) -> int:
|
||||||
|
"""Remove sprint assignment from all issues in a sprint.
|
||||||
|
|
||||||
|
Used when deleting a sprint to clean up references.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Number of issues updated
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
from sqlalchemy import update
|
||||||
|
|
||||||
|
result = await db.execute(
|
||||||
|
update(Issue).where(Issue.sprint_id == sprint_id).values(sprint_id=None)
|
||||||
|
)
|
||||||
|
await db.commit()
|
||||||
|
return result.rowcount
|
||||||
|
except Exception as e:
|
||||||
|
await db.rollback()
|
||||||
|
logger.error(
|
||||||
|
f"Error removing sprint {sprint_id} from issues: {e!s}",
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
|
||||||
|
async def unassign(
|
||||||
|
self,
|
||||||
|
db: AsyncSession,
|
||||||
|
*,
|
||||||
|
issue_id: UUID,
|
||||||
|
) -> Issue | None:
|
||||||
|
"""Remove agent assignment from an issue.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Updated issue or None if not found
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
result = await db.execute(select(Issue).where(Issue.id == issue_id))
|
||||||
|
issue = result.scalar_one_or_none()
|
||||||
|
|
||||||
|
if not issue:
|
||||||
|
return None
|
||||||
|
|
||||||
|
issue.assigned_agent_id = None
|
||||||
|
await db.commit()
|
||||||
|
await db.refresh(issue)
|
||||||
|
return issue
|
||||||
|
except Exception as e:
|
||||||
|
await db.rollback()
|
||||||
|
logger.error(f"Error unassigning issue {issue_id}: {e!s}", exc_info=True)
|
||||||
|
raise
|
||||||
|
|
||||||
|
async def remove_from_sprint(
|
||||||
|
self,
|
||||||
|
db: AsyncSession,
|
||||||
|
*,
|
||||||
|
issue_id: UUID,
|
||||||
|
) -> Issue | None:
|
||||||
|
"""Remove an issue from its current sprint.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Updated issue or None if not found
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
result = await db.execute(select(Issue).where(Issue.id == issue_id))
|
||||||
|
issue = result.scalar_one_or_none()
|
||||||
|
|
||||||
|
if not issue:
|
||||||
|
return None
|
||||||
|
|
||||||
|
issue.sprint_id = None
|
||||||
|
await db.commit()
|
||||||
|
await db.refresh(issue)
|
||||||
|
return issue
|
||||||
|
except Exception as e:
|
||||||
|
await db.rollback()
|
||||||
|
logger.error(
|
||||||
|
f"Error removing issue {issue_id} from sprint: {e!s}",
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
|
# Create a singleton instance for use across the application
# (module-level, shared by all importers of this module).
issue = CRUDIssue(Issue)
|
||||||
362
backend/app/crud/syndarix/project.py
Normal file
362
backend/app/crud/syndarix/project.py
Normal file
@@ -0,0 +1,362 @@
|
|||||||
|
# app/crud/syndarix/project.py
|
||||||
|
"""Async CRUD operations for Project model using SQLAlchemy 2.0 patterns."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
from typing import Any
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from sqlalchemy import func, or_, select, update
|
||||||
|
from sqlalchemy.exc import IntegrityError
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
|
||||||
|
from app.crud.base import CRUDBase
|
||||||
|
from app.models.syndarix import AgentInstance, Issue, Project, Sprint
|
||||||
|
from app.models.syndarix.enums import AgentStatus, ProjectStatus, SprintStatus
|
||||||
|
from app.schemas.syndarix import ProjectCreate, ProjectUpdate
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class CRUDProject(CRUDBase[Project, ProjectCreate, ProjectUpdate]):
|
||||||
|
"""Async CRUD operations for Project model."""
|
||||||
|
|
||||||
|
async def get_by_slug(self, db: AsyncSession, *, slug: str) -> Project | None:
|
||||||
|
"""Get project by slug."""
|
||||||
|
try:
|
||||||
|
result = await db.execute(select(Project).where(Project.slug == slug))
|
||||||
|
return result.scalar_one_or_none()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error getting project by slug {slug}: {e!s}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
async def create(self, db: AsyncSession, *, obj_in: ProjectCreate) -> Project:
|
||||||
|
"""Create a new project with error handling."""
|
||||||
|
try:
|
||||||
|
db_obj = Project(
|
||||||
|
name=obj_in.name,
|
||||||
|
slug=obj_in.slug,
|
||||||
|
description=obj_in.description,
|
||||||
|
autonomy_level=obj_in.autonomy_level,
|
||||||
|
status=obj_in.status,
|
||||||
|
settings=obj_in.settings or {},
|
||||||
|
owner_id=obj_in.owner_id,
|
||||||
|
)
|
||||||
|
db.add(db_obj)
|
||||||
|
await db.commit()
|
||||||
|
await db.refresh(db_obj)
|
||||||
|
return db_obj
|
||||||
|
except IntegrityError as e:
|
||||||
|
await db.rollback()
|
||||||
|
error_msg = str(e.orig) if hasattr(e, "orig") else str(e)
|
||||||
|
if "slug" in error_msg.lower():
|
||||||
|
logger.warning(f"Duplicate slug attempted: {obj_in.slug}")
|
||||||
|
raise ValueError(f"Project with slug '{obj_in.slug}' already exists")
|
||||||
|
logger.error(f"Integrity error creating project: {error_msg}")
|
||||||
|
raise ValueError(f"Database integrity error: {error_msg}")
|
||||||
|
except Exception as e:
|
||||||
|
await db.rollback()
|
||||||
|
logger.error(f"Unexpected error creating project: {e!s}", exc_info=True)
|
||||||
|
raise
|
||||||
|
|
||||||
|
async def get_multi_with_filters(
|
||||||
|
self,
|
||||||
|
db: AsyncSession,
|
||||||
|
*,
|
||||||
|
skip: int = 0,
|
||||||
|
limit: int = 100,
|
||||||
|
status: ProjectStatus | None = None,
|
||||||
|
owner_id: UUID | None = None,
|
||||||
|
search: str | None = None,
|
||||||
|
sort_by: str = "created_at",
|
||||||
|
sort_order: str = "desc",
|
||||||
|
) -> tuple[list[Project], int]:
|
||||||
|
"""
|
||||||
|
Get multiple projects with filtering, searching, and sorting.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (projects list, total count)
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
query = select(Project)
|
||||||
|
|
||||||
|
# Apply filters
|
||||||
|
if status is not None:
|
||||||
|
query = query.where(Project.status == status)
|
||||||
|
|
||||||
|
if owner_id is not None:
|
||||||
|
query = query.where(Project.owner_id == owner_id)
|
||||||
|
|
||||||
|
if search:
|
||||||
|
search_filter = or_(
|
||||||
|
Project.name.ilike(f"%{search}%"),
|
||||||
|
Project.slug.ilike(f"%{search}%"),
|
||||||
|
Project.description.ilike(f"%{search}%"),
|
||||||
|
)
|
||||||
|
query = query.where(search_filter)
|
||||||
|
|
||||||
|
# Get total count before pagination
|
||||||
|
count_query = select(func.count()).select_from(query.alias())
|
||||||
|
count_result = await db.execute(count_query)
|
||||||
|
total = count_result.scalar_one()
|
||||||
|
|
||||||
|
# Apply sorting
|
||||||
|
sort_column = getattr(Project, sort_by, Project.created_at)
|
||||||
|
if sort_order == "desc":
|
||||||
|
query = query.order_by(sort_column.desc())
|
||||||
|
else:
|
||||||
|
query = query.order_by(sort_column.asc())
|
||||||
|
|
||||||
|
# Apply pagination
|
||||||
|
query = query.offset(skip).limit(limit)
|
||||||
|
result = await db.execute(query)
|
||||||
|
projects = list(result.scalars().all())
|
||||||
|
|
||||||
|
return projects, total
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error getting projects with filters: {e!s}")
|
||||||
|
raise
|
||||||
|
|
||||||
|
async def get_with_counts(
|
||||||
|
self,
|
||||||
|
db: AsyncSession,
|
||||||
|
*,
|
||||||
|
project_id: UUID,
|
||||||
|
) -> dict[str, Any] | None:
|
||||||
|
"""
|
||||||
|
Get a single project with agent and issue counts.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary with project, agent_count, issue_count, active_sprint_name
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Get project
|
||||||
|
result = await db.execute(select(Project).where(Project.id == project_id))
|
||||||
|
project = result.scalar_one_or_none()
|
||||||
|
|
||||||
|
if not project:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Get agent count
|
||||||
|
agent_count_result = await db.execute(
|
||||||
|
select(func.count(AgentInstance.id)).where(
|
||||||
|
AgentInstance.project_id == project_id
|
||||||
|
)
|
||||||
|
)
|
||||||
|
agent_count = agent_count_result.scalar_one()
|
||||||
|
|
||||||
|
# Get issue count
|
||||||
|
issue_count_result = await db.execute(
|
||||||
|
select(func.count(Issue.id)).where(Issue.project_id == project_id)
|
||||||
|
)
|
||||||
|
issue_count = issue_count_result.scalar_one()
|
||||||
|
|
||||||
|
# Get active sprint name
|
||||||
|
active_sprint_result = await db.execute(
|
||||||
|
select(Sprint.name).where(
|
||||||
|
Sprint.project_id == project_id,
|
||||||
|
Sprint.status == SprintStatus.ACTIVE,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
active_sprint_name = active_sprint_result.scalar_one_or_none()
|
||||||
|
|
||||||
|
return {
|
||||||
|
"project": project,
|
||||||
|
"agent_count": agent_count,
|
||||||
|
"issue_count": issue_count,
|
||||||
|
"active_sprint_name": active_sprint_name,
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"Error getting project with counts {project_id}: {e!s}", exc_info=True
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
|
||||||
|
async def get_multi_with_counts(
|
||||||
|
self,
|
||||||
|
db: AsyncSession,
|
||||||
|
*,
|
||||||
|
skip: int = 0,
|
||||||
|
limit: int = 100,
|
||||||
|
status: ProjectStatus | None = None,
|
||||||
|
owner_id: UUID | None = None,
|
||||||
|
search: str | None = None,
|
||||||
|
) -> tuple[list[dict[str, Any]], int]:
|
||||||
|
"""
|
||||||
|
Get projects with agent/issue counts in optimized queries.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (list of dicts with project and counts, total count)
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Get filtered projects
|
||||||
|
projects, total = await self.get_multi_with_filters(
|
||||||
|
db,
|
||||||
|
skip=skip,
|
||||||
|
limit=limit,
|
||||||
|
status=status,
|
||||||
|
owner_id=owner_id,
|
||||||
|
search=search,
|
||||||
|
)
|
||||||
|
|
||||||
|
if not projects:
|
||||||
|
return [], 0
|
||||||
|
|
||||||
|
project_ids = [p.id for p in projects]
|
||||||
|
|
||||||
|
# Get agent counts in bulk
|
||||||
|
agent_counts_result = await db.execute(
|
||||||
|
select(
|
||||||
|
AgentInstance.project_id,
|
||||||
|
func.count(AgentInstance.id).label("count"),
|
||||||
|
)
|
||||||
|
.where(AgentInstance.project_id.in_(project_ids))
|
||||||
|
.group_by(AgentInstance.project_id)
|
||||||
|
)
|
||||||
|
agent_counts = {row.project_id: row.count for row in agent_counts_result}
|
||||||
|
|
||||||
|
# Get issue counts in bulk
|
||||||
|
issue_counts_result = await db.execute(
|
||||||
|
select(
|
||||||
|
Issue.project_id,
|
||||||
|
func.count(Issue.id).label("count"),
|
||||||
|
)
|
||||||
|
.where(Issue.project_id.in_(project_ids))
|
||||||
|
.group_by(Issue.project_id)
|
||||||
|
)
|
||||||
|
issue_counts = {row.project_id: row.count for row in issue_counts_result}
|
||||||
|
|
||||||
|
# Get active sprint names
|
||||||
|
active_sprints_result = await db.execute(
|
||||||
|
select(Sprint.project_id, Sprint.name).where(
|
||||||
|
Sprint.project_id.in_(project_ids),
|
||||||
|
Sprint.status == SprintStatus.ACTIVE,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
active_sprints = {row.project_id: row.name for row in active_sprints_result}
|
||||||
|
|
||||||
|
# Combine results
|
||||||
|
results = [
|
||||||
|
{
|
||||||
|
"project": project,
|
||||||
|
"agent_count": agent_counts.get(project.id, 0),
|
||||||
|
"issue_count": issue_counts.get(project.id, 0),
|
||||||
|
"active_sprint_name": active_sprints.get(project.id),
|
||||||
|
}
|
||||||
|
for project in projects
|
||||||
|
]
|
||||||
|
|
||||||
|
return results, total
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error getting projects with counts: {e!s}", exc_info=True)
|
||||||
|
raise
|
||||||
|
|
||||||
|
async def get_projects_by_owner(
|
||||||
|
self,
|
||||||
|
db: AsyncSession,
|
||||||
|
*,
|
||||||
|
owner_id: UUID,
|
||||||
|
status: ProjectStatus | None = None,
|
||||||
|
) -> list[Project]:
|
||||||
|
"""Get all projects owned by a specific user."""
|
||||||
|
try:
|
||||||
|
query = select(Project).where(Project.owner_id == owner_id)
|
||||||
|
|
||||||
|
if status is not None:
|
||||||
|
query = query.where(Project.status == status)
|
||||||
|
|
||||||
|
query = query.order_by(Project.created_at.desc())
|
||||||
|
result = await db.execute(query)
|
||||||
|
return list(result.scalars().all())
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"Error getting projects by owner {owner_id}: {e!s}", exc_info=True
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
|
||||||
|
    async def archive_project(
        self,
        db: AsyncSession,
        *,
        project_id: UUID,
    ) -> Project | None:
        """Archive a project by setting status to ARCHIVED.

        This also performs cascading cleanup:
        - Terminates all active agent instances
        - Cancels all planned/active sprints
        - Unassigns issues from terminated agents

        All steps run in one transaction: a single commit at the end,
        with a rollback on any failure.

        Args:
            db: Active async database session.
            project_id: Project to archive.

        Returns:
            The refreshed, archived project, or None when not found.
        """
        try:
            result = await db.execute(select(Project).where(Project.id == project_id))
            project = result.scalar_one_or_none()

            if not project:
                return None

            # Single timestamp reused for every touched row so the
            # cascade is traceable to one moment.
            now = datetime.now(UTC)

            # 1. Get all agent IDs that will be terminated
            agents_to_terminate = await db.execute(
                select(AgentInstance.id).where(
                    AgentInstance.project_id == project_id,
                    AgentInstance.status != AgentStatus.TERMINATED,
                )
            )
            agent_ids = [row[0] for row in agents_to_terminate.fetchall()]

            # 2. Unassign issues from these agents to prevent orphaned assignments
            #    (must happen BEFORE step 3 marks the agents terminated).
            if agent_ids:
                await db.execute(
                    update(Issue)
                    .where(Issue.assigned_agent_id.in_(agent_ids))
                    .values(assigned_agent_id=None)
                )

            # 3. Terminate all active agents
            await db.execute(
                update(AgentInstance)
                .where(
                    AgentInstance.project_id == project_id,
                    AgentInstance.status != AgentStatus.TERMINATED,
                )
                .values(
                    status=AgentStatus.TERMINATED,
                    terminated_at=now,
                    current_task=None,
                    session_id=None,
                    updated_at=now,
                )
            )

            # 4. Cancel all planned/active sprints
            await db.execute(
                update(Sprint)
                .where(
                    Sprint.project_id == project_id,
                    Sprint.status.in_([SprintStatus.PLANNED, SprintStatus.ACTIVE]),
                )
                .values(
                    status=SprintStatus.CANCELLED,
                    updated_at=now,
                )
            )

            # 5. Archive the project
            project.status = ProjectStatus.ARCHIVED
            await db.commit()
            await db.refresh(project)

            logger.info(
                f"Archived project {project_id}: terminated agents={len(agent_ids)}"
            )

            return project
        except Exception as e:
            await db.rollback()
            logger.error(f"Error archiving project {project_id}: {e!s}", exc_info=True)
            raise
|
||||||
|
|
||||||
|
|
||||||
|
# Create a singleton instance for use across the application
# (module-level, shared by all importers of this module).
project = CRUDProject(Project)
|
||||||
439
backend/app/crud/syndarix/sprint.py
Normal file
439
backend/app/crud/syndarix/sprint.py
Normal file
@@ -0,0 +1,439 @@
|
|||||||
|
# app/crud/syndarix/sprint.py
|
||||||
|
"""Async CRUD operations for Sprint model using SQLAlchemy 2.0 patterns."""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from datetime import date
|
||||||
|
from typing import Any
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from sqlalchemy import func, select
|
||||||
|
from sqlalchemy.exc import IntegrityError
|
||||||
|
from sqlalchemy.ext.asyncio import AsyncSession
|
||||||
|
from sqlalchemy.orm import joinedload
|
||||||
|
|
||||||
|
from app.crud.base import CRUDBase
|
||||||
|
from app.models.syndarix import Issue, Sprint
|
||||||
|
from app.models.syndarix.enums import IssueStatus, SprintStatus
|
||||||
|
from app.schemas.syndarix import SprintCreate, SprintUpdate
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class CRUDSprint(CRUDBase[Sprint, SprintCreate, SprintUpdate]):
|
||||||
|
"""Async CRUD operations for Sprint model."""
|
||||||
|
|
||||||
|
async def create(self, db: AsyncSession, *, obj_in: SprintCreate) -> Sprint:
|
||||||
|
"""Create a new sprint with error handling."""
|
||||||
|
try:
|
||||||
|
db_obj = Sprint(
|
||||||
|
project_id=obj_in.project_id,
|
||||||
|
name=obj_in.name,
|
||||||
|
number=obj_in.number,
|
||||||
|
goal=obj_in.goal,
|
||||||
|
start_date=obj_in.start_date,
|
||||||
|
end_date=obj_in.end_date,
|
||||||
|
status=obj_in.status,
|
||||||
|
planned_points=obj_in.planned_points,
|
||||||
|
velocity=obj_in.velocity,
|
||||||
|
)
|
||||||
|
db.add(db_obj)
|
||||||
|
await db.commit()
|
||||||
|
await db.refresh(db_obj)
|
||||||
|
return db_obj
|
||||||
|
except IntegrityError as e:
|
||||||
|
await db.rollback()
|
||||||
|
error_msg = str(e.orig) if hasattr(e, "orig") else str(e)
|
||||||
|
logger.error(f"Integrity error creating sprint: {error_msg}")
|
||||||
|
raise ValueError(f"Database integrity error: {error_msg}")
|
||||||
|
except Exception as e:
|
||||||
|
await db.rollback()
|
||||||
|
logger.error(f"Unexpected error creating sprint: {e!s}", exc_info=True)
|
||||||
|
raise
|
||||||
|
|
||||||
|
async def get_with_details(
|
||||||
|
self,
|
||||||
|
db: AsyncSession,
|
||||||
|
*,
|
||||||
|
sprint_id: UUID,
|
||||||
|
) -> dict[str, Any] | None:
|
||||||
|
"""
|
||||||
|
Get a sprint with full details including issue counts.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary with sprint and related details
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Get sprint with joined project
|
||||||
|
result = await db.execute(
|
||||||
|
select(Sprint)
|
||||||
|
.options(joinedload(Sprint.project))
|
||||||
|
.where(Sprint.id == sprint_id)
|
||||||
|
)
|
||||||
|
sprint = result.scalar_one_or_none()
|
||||||
|
|
||||||
|
if not sprint:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Get issue counts
|
||||||
|
issue_counts = await db.execute(
|
||||||
|
select(
|
||||||
|
func.count(Issue.id).label("total"),
|
||||||
|
func.count(Issue.id)
|
||||||
|
.filter(Issue.status == IssueStatus.OPEN)
|
||||||
|
.label("open"),
|
||||||
|
func.count(Issue.id)
|
||||||
|
.filter(Issue.status == IssueStatus.CLOSED)
|
||||||
|
.label("completed"),
|
||||||
|
).where(Issue.sprint_id == sprint_id)
|
||||||
|
)
|
||||||
|
counts = issue_counts.one()
|
||||||
|
|
||||||
|
return {
|
||||||
|
"sprint": sprint,
|
||||||
|
"project_name": sprint.project.name if sprint.project else None,
|
||||||
|
"project_slug": sprint.project.slug if sprint.project else None,
|
||||||
|
"issue_count": counts.total,
|
||||||
|
"open_issues": counts.open,
|
||||||
|
"completed_issues": counts.completed,
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"Error getting sprint with details {sprint_id}: {e!s}", exc_info=True
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
|
||||||
|
async def get_by_project(
|
||||||
|
self,
|
||||||
|
db: AsyncSession,
|
||||||
|
*,
|
||||||
|
project_id: UUID,
|
||||||
|
status: SprintStatus | None = None,
|
||||||
|
skip: int = 0,
|
||||||
|
limit: int = 100,
|
||||||
|
) -> tuple[list[Sprint], int]:
|
||||||
|
"""Get sprints for a specific project."""
|
||||||
|
try:
|
||||||
|
query = select(Sprint).where(Sprint.project_id == project_id)
|
||||||
|
|
||||||
|
if status is not None:
|
||||||
|
query = query.where(Sprint.status == status)
|
||||||
|
|
||||||
|
# Get total count
|
||||||
|
count_query = select(func.count()).select_from(query.alias())
|
||||||
|
count_result = await db.execute(count_query)
|
||||||
|
total = count_result.scalar_one()
|
||||||
|
|
||||||
|
# Apply sorting (by number descending - newest first)
|
||||||
|
query = query.order_by(Sprint.number.desc())
|
||||||
|
query = query.offset(skip).limit(limit)
|
||||||
|
result = await db.execute(query)
|
||||||
|
sprints = list(result.scalars().all())
|
||||||
|
|
||||||
|
return sprints, total
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"Error getting sprints by project {project_id}: {e!s}", exc_info=True
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
|
||||||
|
async def get_active_sprint(
|
||||||
|
self,
|
||||||
|
db: AsyncSession,
|
||||||
|
*,
|
||||||
|
project_id: UUID,
|
||||||
|
) -> Sprint | None:
|
||||||
|
"""Get the currently active sprint for a project."""
|
||||||
|
try:
|
||||||
|
result = await db.execute(
|
||||||
|
select(Sprint).where(
|
||||||
|
Sprint.project_id == project_id,
|
||||||
|
Sprint.status == SprintStatus.ACTIVE,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
return result.scalar_one_or_none()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"Error getting active sprint for project {project_id}: {e!s}",
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
|
||||||
|
async def get_next_sprint_number(
|
||||||
|
self,
|
||||||
|
db: AsyncSession,
|
||||||
|
*,
|
||||||
|
project_id: UUID,
|
||||||
|
) -> int:
|
||||||
|
"""Get the next sprint number for a project."""
|
||||||
|
try:
|
||||||
|
result = await db.execute(
|
||||||
|
select(func.max(Sprint.number)).where(Sprint.project_id == project_id)
|
||||||
|
)
|
||||||
|
max_number = result.scalar_one_or_none()
|
||||||
|
return (max_number or 0) + 1
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"Error getting next sprint number for project {project_id}: {e!s}",
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
|
||||||
|
    async def start_sprint(
        self,
        db: AsyncSession,
        *,
        sprint_id: UUID,
        start_date: date | None = None,
    ) -> Sprint | None:
        """Start a planned sprint.

        Uses row-level locking (SELECT FOR UPDATE) to prevent race conditions
        when multiple requests try to start sprints concurrently.

        Args:
            db: Active async database session.
            sprint_id: Sprint to start (must be in PLANNED state).
            start_date: Optional override for the sprint's start date.

        Returns:
            The started, refreshed sprint, or None when not found.

        Raises:
            ValueError: If the sprint is not PLANNED, or the project
                already has an ACTIVE sprint.
        """
        try:
            # Lock the sprint row to prevent concurrent modifications
            result = await db.execute(
                select(Sprint).where(Sprint.id == sprint_id).with_for_update()
            )
            sprint = result.scalar_one_or_none()

            if not sprint:
                return None

            if sprint.status != SprintStatus.PLANNED:
                raise ValueError(
                    f"Cannot start sprint with status {sprint.status.value}"
                )

            # Check for existing active sprint with lock to prevent race condition
            # Lock all sprints for this project to ensure atomic check-and-update
            active_check = await db.execute(
                select(Sprint)
                .where(
                    Sprint.project_id == sprint.project_id,
                    Sprint.status == SprintStatus.ACTIVE,
                )
                .with_for_update()
            )
            active_sprint = active_check.scalar_one_or_none()
            if active_sprint:
                raise ValueError(
                    f"Project already has an active sprint: {active_sprint.name}"
                )

            sprint.status = SprintStatus.ACTIVE
            if start_date:
                sprint.start_date = start_date

            # Calculate planned points from issues
            # (SUM over zero rows yields NULL -> None, hence the `or 0`).
            points_result = await db.execute(
                select(func.sum(Issue.story_points)).where(Issue.sprint_id == sprint_id)
            )
            sprint.planned_points = points_result.scalar_one_or_none() or 0

            await db.commit()
            await db.refresh(sprint)
            return sprint
        except ValueError:
            # Business-rule violations propagate untouched (no rollback here;
            # the caller decides how to handle the open transaction).
            raise
        except Exception as e:
            await db.rollback()
            logger.error(f"Error starting sprint {sprint_id}: {e!s}", exc_info=True)
            raise
|
||||||
|
|
||||||
|
async def complete_sprint(
|
||||||
|
self,
|
||||||
|
db: AsyncSession,
|
||||||
|
*,
|
||||||
|
sprint_id: UUID,
|
||||||
|
) -> Sprint | None:
|
||||||
|
"""Complete an active sprint and calculate completed points.
|
||||||
|
|
||||||
|
Uses row-level locking (SELECT FOR UPDATE) to prevent race conditions
|
||||||
|
when velocity is being calculated and other operations might modify issues.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Lock the sprint row to prevent concurrent modifications
|
||||||
|
result = await db.execute(
|
||||||
|
select(Sprint).where(Sprint.id == sprint_id).with_for_update()
|
||||||
|
)
|
||||||
|
sprint = result.scalar_one_or_none()
|
||||||
|
|
||||||
|
if not sprint:
|
||||||
|
return None
|
||||||
|
|
||||||
|
if sprint.status != SprintStatus.ACTIVE:
|
||||||
|
raise ValueError(
|
||||||
|
f"Cannot complete sprint with status {sprint.status.value}"
|
||||||
|
)
|
||||||
|
|
||||||
|
sprint.status = SprintStatus.COMPLETED
|
||||||
|
|
||||||
|
# Calculate velocity (completed points) from closed issues
|
||||||
|
# Note: Issues are not locked, but sprint lock ensures this sprint's
|
||||||
|
# completion is atomic and prevents concurrent completion attempts
|
||||||
|
points_result = await db.execute(
|
||||||
|
select(func.sum(Issue.story_points)).where(
|
||||||
|
Issue.sprint_id == sprint_id,
|
||||||
|
Issue.status == IssueStatus.CLOSED,
|
||||||
|
)
|
||||||
|
)
|
||||||
|
sprint.velocity = points_result.scalar_one_or_none() or 0
|
||||||
|
|
||||||
|
await db.commit()
|
||||||
|
await db.refresh(sprint)
|
||||||
|
return sprint
|
||||||
|
except ValueError:
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
await db.rollback()
|
||||||
|
logger.error(f"Error completing sprint {sprint_id}: {e!s}", exc_info=True)
|
||||||
|
raise
|
||||||
|
|
||||||
|
async def cancel_sprint(
|
||||||
|
self,
|
||||||
|
db: AsyncSession,
|
||||||
|
*,
|
||||||
|
sprint_id: UUID,
|
||||||
|
) -> Sprint | None:
|
||||||
|
"""Cancel a sprint (only PLANNED or ACTIVE sprints can be cancelled).
|
||||||
|
|
||||||
|
Uses row-level locking to prevent race conditions with concurrent
|
||||||
|
sprint status modifications.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Lock the sprint row to prevent concurrent modifications
|
||||||
|
result = await db.execute(
|
||||||
|
select(Sprint).where(Sprint.id == sprint_id).with_for_update()
|
||||||
|
)
|
||||||
|
sprint = result.scalar_one_or_none()
|
||||||
|
|
||||||
|
if not sprint:
|
||||||
|
return None
|
||||||
|
|
||||||
|
if sprint.status not in [SprintStatus.PLANNED, SprintStatus.ACTIVE]:
|
||||||
|
raise ValueError(
|
||||||
|
f"Cannot cancel sprint with status {sprint.status.value}"
|
||||||
|
)
|
||||||
|
|
||||||
|
sprint.status = SprintStatus.CANCELLED
|
||||||
|
await db.commit()
|
||||||
|
await db.refresh(sprint)
|
||||||
|
return sprint
|
||||||
|
except ValueError:
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
await db.rollback()
|
||||||
|
logger.error(f"Error cancelling sprint {sprint_id}: {e!s}", exc_info=True)
|
||||||
|
raise
|
||||||
|
|
||||||
|
async def get_velocity(
|
||||||
|
self,
|
||||||
|
db: AsyncSession,
|
||||||
|
*,
|
||||||
|
project_id: UUID,
|
||||||
|
limit: int = 5,
|
||||||
|
) -> list[dict[str, Any]]:
|
||||||
|
"""Get velocity data for completed sprints."""
|
||||||
|
try:
|
||||||
|
result = await db.execute(
|
||||||
|
select(Sprint)
|
||||||
|
.where(
|
||||||
|
Sprint.project_id == project_id,
|
||||||
|
Sprint.status == SprintStatus.COMPLETED,
|
||||||
|
)
|
||||||
|
.order_by(Sprint.number.desc())
|
||||||
|
.limit(limit)
|
||||||
|
)
|
||||||
|
sprints = list(result.scalars().all())
|
||||||
|
|
||||||
|
velocity_data = []
|
||||||
|
for sprint in reversed(sprints): # Return in chronological order
|
||||||
|
velocity_ratio = None
|
||||||
|
if sprint.planned_points and sprint.planned_points > 0:
|
||||||
|
velocity_ratio = (sprint.velocity or 0) / sprint.planned_points
|
||||||
|
velocity_data.append(
|
||||||
|
{
|
||||||
|
"sprint_number": sprint.number,
|
||||||
|
"sprint_name": sprint.name,
|
||||||
|
"planned_points": sprint.planned_points,
|
||||||
|
"velocity": sprint.velocity,
|
||||||
|
"velocity_ratio": velocity_ratio,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
return velocity_data
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(
|
||||||
|
f"Error getting velocity for project {project_id}: {e!s}",
|
||||||
|
exc_info=True,
|
||||||
|
)
|
||||||
|
raise
|
||||||
|
|
||||||
|
    async def get_sprints_with_issue_counts(
        self,
        db: AsyncSession,
        *,
        project_id: UUID,
        skip: int = 0,
        limit: int = 100,
    ) -> tuple[list[dict[str, Any]], int]:
        """Get sprints with issue counts in optimized queries.

        Returns:
            A ``(results, total)`` tuple. Each result dict holds the Sprint
            ORM object under ``"sprint"`` plus ``"issue_count"``,
            ``"open_issues"`` and ``"completed_issues"`` integers; ``total``
            is the project's overall sprint count (for pagination).

        Raises:
            Exception: re-raised after logging if either query fails.
        """
        try:
            # Get sprints (one paginated page) and the project-wide total.
            sprints, total = await self.get_by_project(
                db, project_id=project_id, skip=skip, limit=limit
            )

            if not sprints:
                return [], 0

            sprint_ids = [s.id for s in sprints]

            # Get issue counts in bulk: a single grouped query instead of
            # one COUNT query per sprint.
            issue_counts = await db.execute(
                select(
                    Issue.sprint_id,
                    func.count(Issue.id).label("total"),
                    # Aggregate FILTER clauses count only the rows matching
                    # each status within the same scan.
                    # NOTE(review): "open" counts only status == OPEN, so
                    # in-progress/blocked/in-review issues appear in neither
                    # the "open" nor the "completed" bucket — confirm intended.
                    func.count(Issue.id)
                    .filter(Issue.status == IssueStatus.OPEN)
                    .label("open"),
                    func.count(Issue.id)
                    .filter(Issue.status == IssueStatus.CLOSED)
                    .label("completed"),
                )
                .where(Issue.sprint_id.in_(sprint_ids))
                .group_by(Issue.sprint_id)
            )
            counts_map = {
                row.sprint_id: {
                    "issue_count": row.total,
                    "open_issues": row.open,
                    "completed_issues": row.completed,
                }
                for row in issue_counts
            }

            # Combine results; sprints with no issues produce no GROUP BY
            # row, so they fall back to zeroed counts here.
            results = [
                {
                    "sprint": sprint,
                    **counts_map.get(
                        sprint.id,
                        {"issue_count": 0, "open_issues": 0, "completed_issues": 0},
                    ),
                }
                for sprint in sprints
            ]

            return results, total
        except Exception as e:
            logger.error(
                f"Error getting sprints with counts for project {project_id}: {e!s}",
                exc_info=True,
            )
            raise
|
||||||
|
|
||||||
|
|
||||||
|
# Create a singleton instance for use across the application.
# NOTE(review): appears safe to share — the methods above take ``db``
# explicitly and keep no per-request state; confirm the CRUD base class
# is equally stateless.
sprint = CRUDSprint(Sprint)
|
||||||
@@ -18,13 +18,26 @@ from .oauth_provider_token import OAuthConsent, OAuthProviderRefreshToken
|
|||||||
from .oauth_state import OAuthState
|
from .oauth_state import OAuthState
|
||||||
from .organization import Organization
|
from .organization import Organization
|
||||||
|
|
||||||
|
# Syndarix domain models
|
||||||
|
from .syndarix import (
|
||||||
|
AgentInstance,
|
||||||
|
AgentType,
|
||||||
|
Issue,
|
||||||
|
Project,
|
||||||
|
Sprint,
|
||||||
|
)
|
||||||
|
|
||||||
# Import models
|
# Import models
|
||||||
from .user import User
|
from .user import User
|
||||||
from .user_organization import OrganizationRole, UserOrganization
|
from .user_organization import OrganizationRole, UserOrganization
|
||||||
from .user_session import UserSession
|
from .user_session import UserSession
|
||||||
|
|
||||||
__all__ = [
|
__all__ = [
|
||||||
|
# Syndarix models
|
||||||
|
"AgentInstance",
|
||||||
|
"AgentType",
|
||||||
"Base",
|
"Base",
|
||||||
|
"Issue",
|
||||||
"OAuthAccount",
|
"OAuthAccount",
|
||||||
"OAuthAuthorizationCode",
|
"OAuthAuthorizationCode",
|
||||||
"OAuthClient",
|
"OAuthClient",
|
||||||
@@ -33,6 +46,8 @@ __all__ = [
|
|||||||
"OAuthState",
|
"OAuthState",
|
||||||
"Organization",
|
"Organization",
|
||||||
"OrganizationRole",
|
"OrganizationRole",
|
||||||
|
"Project",
|
||||||
|
"Sprint",
|
||||||
"TimestampMixin",
|
"TimestampMixin",
|
||||||
"UUIDMixin",
|
"UUIDMixin",
|
||||||
"User",
|
"User",
|
||||||
|
|||||||
47
backend/app/models/syndarix/__init__.py
Normal file
47
backend/app/models/syndarix/__init__.py
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
# app/models/syndarix/__init__.py
|
||||||
|
"""
|
||||||
|
Syndarix domain models.
|
||||||
|
|
||||||
|
This package contains all the core entities for the Syndarix AI consulting platform:
|
||||||
|
- Project: Client engagements with autonomy settings
|
||||||
|
- AgentType: Templates for AI agent capabilities
|
||||||
|
- AgentInstance: Spawned agents working on projects
|
||||||
|
- Issue: Units of work with external tracker sync
|
||||||
|
- Sprint: Time-boxed iterations for organizing work
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .agent_instance import AgentInstance
|
||||||
|
from .agent_type import AgentType
|
||||||
|
from .enums import (
|
||||||
|
AgentStatus,
|
||||||
|
AutonomyLevel,
|
||||||
|
ClientMode,
|
||||||
|
IssuePriority,
|
||||||
|
IssueStatus,
|
||||||
|
IssueType,
|
||||||
|
ProjectComplexity,
|
||||||
|
ProjectStatus,
|
||||||
|
SprintStatus,
|
||||||
|
SyncStatus,
|
||||||
|
)
|
||||||
|
from .issue import Issue
|
||||||
|
from .project import Project
|
||||||
|
from .sprint import Sprint
|
||||||
|
|
||||||
|
# Public API of the syndarix models package, kept alphabetically sorted:
# the ORM models plus every domain enum re-exported from .enums.
__all__ = [
    "AgentInstance",
    "AgentStatus",
    "AgentType",
    "AutonomyLevel",
    "ClientMode",
    "Issue",
    "IssuePriority",
    "IssueStatus",
    "IssueType",
    "Project",
    "ProjectComplexity",
    "ProjectStatus",
    "Sprint",
    "SprintStatus",
    "SyncStatus",
]
|
||||||
111
backend/app/models/syndarix/agent_instance.py
Normal file
111
backend/app/models/syndarix/agent_instance.py
Normal file
@@ -0,0 +1,111 @@
|
|||||||
|
# app/models/syndarix/agent_instance.py
|
||||||
|
"""
|
||||||
|
AgentInstance model for Syndarix AI consulting platform.
|
||||||
|
|
||||||
|
An AgentInstance is a spawned instance of an AgentType, assigned to a
|
||||||
|
specific project to perform work.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from sqlalchemy import (
|
||||||
|
BigInteger,
|
||||||
|
Column,
|
||||||
|
DateTime,
|
||||||
|
Enum,
|
||||||
|
ForeignKey,
|
||||||
|
Index,
|
||||||
|
Integer,
|
||||||
|
Numeric,
|
||||||
|
String,
|
||||||
|
Text,
|
||||||
|
)
|
||||||
|
from sqlalchemy.dialects.postgresql import (
|
||||||
|
JSONB,
|
||||||
|
UUID as PGUUID,
|
||||||
|
)
|
||||||
|
from sqlalchemy.orm import relationship
|
||||||
|
|
||||||
|
from app.models.base import Base, TimestampMixin, UUIDMixin
|
||||||
|
|
||||||
|
from .enums import AgentStatus
|
||||||
|
|
||||||
|
|
||||||
|
class AgentInstance(Base, UUIDMixin, TimestampMixin):
    """
    AgentInstance model representing a spawned agent working on a project.

    Tracks:
    - Current status and task
    - Memory (short-term in DB, long-term reference to vector store)
    - Session information for MCP connections
    - Usage metrics (tasks completed, tokens, cost)
    """

    __tablename__ = "agent_instances"

    # Foreign keys
    # RESTRICT: an AgentType row cannot be deleted at the DB level while
    # instances of it still exist.
    agent_type_id = Column(
        PGUUID(as_uuid=True),
        ForeignKey("agent_types.id", ondelete="RESTRICT"),
        nullable=False,
        index=True,
    )

    # CASCADE: deleting a project removes its agent instances.
    project_id = Column(
        PGUUID(as_uuid=True),
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Agent instance name (e.g., "Dave", "Eve") for personality
    name = Column(String(100), nullable=False, index=True)

    # Status tracking
    status: Column[AgentStatus] = Column(
        Enum(AgentStatus),
        default=AgentStatus.IDLE,
        nullable=False,
        index=True,
    )

    # Current task description (brief summary of what agent is doing)
    current_task = Column(Text, nullable=True)

    # Short-term memory stored in database (conversation context, recent decisions)
    short_term_memory = Column(JSONB, default=dict, nullable=False)

    # Reference to long-term memory in vector store (e.g., "project-123/agent-456")
    long_term_memory_ref = Column(String(500), nullable=True)

    # Session ID for active MCP connections
    session_id = Column(String(255), nullable=True, index=True)

    # Activity tracking
    last_activity_at = Column(DateTime(timezone=True), nullable=True, index=True)
    terminated_at = Column(DateTime(timezone=True), nullable=True, index=True)

    # Usage metrics
    tasks_completed = Column(Integer, default=0, nullable=False)
    tokens_used = Column(BigInteger, default=0, nullable=False)
    # Numeric(10, 4): values up to 999,999.9999 with 4 decimal places.
    cost_incurred = Column(Numeric(precision=10, scale=4), default=0, nullable=False)

    # Relationships
    agent_type = relationship("AgentType", back_populates="instances")
    project = relationship("Project", back_populates="agent_instances")
    # foreign_keys is explicit because Issue also references this table.
    assigned_issues = relationship(
        "Issue",
        back_populates="assigned_agent",
        foreign_keys="Issue.assigned_agent_id",
    )

    # Composite indexes for the common "instances of X in state Y" lookups.
    __table_args__ = (
        Index("ix_agent_instances_project_status", "project_id", "status"),
        Index("ix_agent_instances_type_status", "agent_type_id", "status"),
        Index("ix_agent_instances_project_type", "project_id", "agent_type_id"),
    )

    def __repr__(self) -> str:
        return (
            f"<AgentInstance {self.name} ({self.id}) type={self.agent_type_id} "
            f"project={self.project_id} status={self.status.value}>"
        )
|
||||||
72
backend/app/models/syndarix/agent_type.py
Normal file
72
backend/app/models/syndarix/agent_type.py
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
# app/models/syndarix/agent_type.py
|
||||||
|
"""
|
||||||
|
AgentType model for Syndarix AI consulting platform.
|
||||||
|
|
||||||
|
An AgentType is a template that defines the capabilities, personality,
|
||||||
|
and model configuration for agent instances.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from sqlalchemy import Boolean, Column, Index, String, Text
|
||||||
|
from sqlalchemy.dialects.postgresql import JSONB
|
||||||
|
from sqlalchemy.orm import relationship
|
||||||
|
|
||||||
|
from app.models.base import Base, TimestampMixin, UUIDMixin
|
||||||
|
|
||||||
|
|
||||||
|
class AgentType(Base, UUIDMixin, TimestampMixin):
    """
    AgentType model representing a template for agent instances.

    Each agent type defines:
    - Expertise areas and personality prompt
    - Model configuration (primary, fallback, parameters)
    - MCP server access and tool permissions

    Examples: ProductOwner, Architect, BackendEngineer, QAEngineer
    """

    __tablename__ = "agent_types"

    # Display name; slug is the unique, URL-safe identifier.
    name = Column(String(255), nullable=False, index=True)
    slug = Column(String(255), unique=True, nullable=False, index=True)
    description = Column(Text, nullable=True)

    # Areas of expertise for this agent type (e.g., ["python", "fastapi", "databases"])
    expertise = Column(JSONB, default=list, nullable=False)

    # System prompt defining the agent's personality and behavior
    personality_prompt = Column(Text, nullable=False)

    # Primary LLM model to use (e.g., "claude-opus-4-5-20251101")
    primary_model = Column(String(100), nullable=False)

    # Fallback models in order of preference
    fallback_models = Column(JSONB, default=list, nullable=False)

    # Model parameters (temperature, max_tokens, etc.)
    model_params = Column(JSONB, default=dict, nullable=False)

    # List of MCP servers this agent can connect to
    mcp_servers = Column(JSONB, default=list, nullable=False)

    # Tool permissions configuration
    # Structure: {"allowed": ["*"], "denied": [], "require_approval": ["gitea:create_pr"]}
    tool_permissions = Column(JSONB, default=dict, nullable=False)

    # Whether this agent type is available for new instances
    is_active = Column(Boolean, default=True, nullable=False, index=True)

    # Relationships
    # NOTE(review): ORM-level delete-orphan cascade deletes instances when a
    # type is deleted through the session, while AgentInstance's FK uses
    # ondelete="RESTRICT" at the DB level — confirm this combination is intended.
    instances = relationship(
        "AgentInstance",
        back_populates="agent_type",
        cascade="all, delete-orphan",
    )

    # Composite indexes for "active types by slug/name" lookups.
    __table_args__ = (
        Index("ix_agent_types_slug_active", "slug", "is_active"),
        Index("ix_agent_types_name_active", "name", "is_active"),
    )

    def __repr__(self) -> str:
        return f"<AgentType {self.name} ({self.slug}) active={self.is_active}>"
|
||||||
169
backend/app/models/syndarix/enums.py
Normal file
169
backend/app/models/syndarix/enums.py
Normal file
@@ -0,0 +1,169 @@
|
|||||||
|
# app/models/syndarix/enums.py
|
||||||
|
"""
|
||||||
|
Enums for Syndarix domain models.
|
||||||
|
|
||||||
|
These enums represent the core state machines and categorizations
|
||||||
|
used throughout the Syndarix AI consulting platform.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from enum import Enum as PyEnum
|
||||||
|
|
||||||
|
|
||||||
|
class AutonomyLevel(str, PyEnum):
    """How much control the human retains over agent actions."""

    FULL_CONTROL = "full_control"  # human approves every agent action
    MILESTONE = "milestone"  # approval at sprint boundaries and major decisions
    AUTONOMOUS = "autonomous"  # agents work independently, escalating only critical issues


class ProjectComplexity(str, PyEnum):
    """Project complexity level used for estimation and planning."""

    SCRIPT = "script"  # simple automation or script-level work
    SIMPLE = "simple"  # straightforward feature or fix
    MEDIUM = "medium"  # standard complexity with some architectural considerations
    COMPLEX = "complex"  # large-scale feature requiring significant design work


class ClientMode(str, PyEnum):
    """How the client prefers to interact with agents."""

    TECHNICAL = "technical"  # client is technical and prefers detailed updates
    AUTO = "auto"  # agents determine the communication level automatically


class ProjectStatus(str, PyEnum):
    """Project lifecycle status."""

    ACTIVE = "active"  # actively being worked on
    PAUSED = "paused"  # temporarily on hold
    COMPLETED = "completed"  # delivered successfully
    ARCHIVED = "archived"  # no longer accessible for work


class AgentStatus(str, PyEnum):
    """Current operational status of an agent instance."""

    IDLE = "idle"  # available but not currently working
    WORKING = "working"  # actively processing a task
    WAITING = "waiting"  # waiting for external input or approval
    PAUSED = "paused"  # manually paused
    TERMINATED = "terminated"  # instance has been shut down


class IssueType(str, PyEnum):
    """Issue type for categorization and hierarchy."""

    EPIC = "epic"  # large body of work containing stories
    STORY = "story"  # user-facing feature or requirement
    TASK = "task"  # technical work item
    BUG = "bug"  # defect or issue to be fixed


class IssueStatus(str, PyEnum):
    """Issue workflow status."""

    OPEN = "open"  # ready to be worked on
    IN_PROGRESS = "in_progress"  # actively being worked on
    IN_REVIEW = "in_review"  # work complete, awaiting review
    BLOCKED = "blocked"  # cannot proceed due to dependencies or blockers
    CLOSED = "closed"  # completed or cancelled


class IssuePriority(str, PyEnum):
    """Issue priority levels."""

    LOW = "low"  # nice to have, can be deferred
    MEDIUM = "medium"  # standard priority, should be done
    HIGH = "high"  # important, should be prioritized
    CRITICAL = "critical"  # must be done immediately, blocking other work


class SyncStatus(str, PyEnum):
    """External issue tracker synchronization status."""

    SYNCED = "synced"  # local and remote are in sync
    PENDING = "pending"  # local changes waiting to be pushed
    CONFLICT = "conflict"  # merge conflict between local and remote
    ERROR = "error"  # synchronization failed due to an error


class SprintStatus(str, PyEnum):
    """Sprint lifecycle status."""

    PLANNED = "planned"  # created but not started
    ACTIVE = "active"  # currently in progress
    IN_REVIEW = "in_review"  # work done, demo/review pending
    COMPLETED = "completed"  # finished successfully
    CANCELLED = "cancelled"  # cancelled before completion
|
||||||
176
backend/app/models/syndarix/issue.py
Normal file
176
backend/app/models/syndarix/issue.py
Normal file
@@ -0,0 +1,176 @@
|
|||||||
|
# app/models/syndarix/issue.py
|
||||||
|
"""
|
||||||
|
Issue model for Syndarix AI consulting platform.
|
||||||
|
|
||||||
|
An Issue represents a unit of work that can be assigned to agents or humans,
|
||||||
|
with optional synchronization to external issue trackers (Gitea, GitHub, GitLab).
|
||||||
|
"""
|
||||||
|
|
||||||
|
from sqlalchemy import (
|
||||||
|
Column,
|
||||||
|
Date,
|
||||||
|
DateTime,
|
||||||
|
Enum,
|
||||||
|
ForeignKey,
|
||||||
|
Index,
|
||||||
|
Integer,
|
||||||
|
String,
|
||||||
|
Text,
|
||||||
|
)
|
||||||
|
from sqlalchemy.dialects.postgresql import (
|
||||||
|
JSONB,
|
||||||
|
UUID as PGUUID,
|
||||||
|
)
|
||||||
|
from sqlalchemy.orm import relationship
|
||||||
|
|
||||||
|
from app.models.base import Base, TimestampMixin, UUIDMixin
|
||||||
|
|
||||||
|
from .enums import IssuePriority, IssueStatus, IssueType, SyncStatus
|
||||||
|
|
||||||
|
|
||||||
|
class Issue(Base, UUIDMixin, TimestampMixin):
    """
    Issue model representing a unit of work in a project.

    Features:
    - Standard issue fields (title, body, status, priority)
    - Assignment to agent instances or human assignees
    - Sprint association for backlog management
    - External tracker synchronization (Gitea, GitHub, GitLab)
    """

    __tablename__ = "issues"

    # Foreign key to project
    # CASCADE: deleting a project removes its issues.
    project_id = Column(
        PGUUID(as_uuid=True),
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Parent issue for hierarchy (Epic -> Story -> Task)
    # CASCADE: deleting a parent removes its whole subtree.
    parent_id = Column(
        PGUUID(as_uuid=True),
        ForeignKey("issues.id", ondelete="CASCADE"),
        nullable=True,
        index=True,
    )

    # Issue type (Epic, Story, Task, Bug)
    type: Column[IssueType] = Column(
        Enum(IssueType),
        default=IssueType.TASK,
        nullable=False,
        index=True,
    )

    # Reporter (who created this issue - can be user or agent)
    # Deliberately not a ForeignKey: the id may refer to a user or an agent.
    reporter_id = Column(
        PGUUID(as_uuid=True),
        nullable=True,  # System-generated issues may have no reporter
        index=True,
    )

    # Issue content
    title = Column(String(500), nullable=False)
    body = Column(Text, nullable=False, default="")

    # Status and priority
    status: Column[IssueStatus] = Column(
        Enum(IssueStatus),
        default=IssueStatus.OPEN,
        nullable=False,
        index=True,
    )

    priority: Column[IssuePriority] = Column(
        Enum(IssuePriority),
        default=IssuePriority.MEDIUM,
        nullable=False,
        index=True,
    )

    # Labels for categorization (e.g., ["bug", "frontend", "urgent"])
    labels = Column(JSONB, default=list, nullable=False)

    # Assignment - either to an agent or a human (mutually exclusive)
    # NOTE(review): no CHECK constraint enforces the exclusivity at the DB
    # level — presumably validated in the service layer; confirm.
    assigned_agent_id = Column(
        PGUUID(as_uuid=True),
        ForeignKey("agent_instances.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )

    # Human assignee (username or email, not a FK to allow external users)
    human_assignee = Column(String(255), nullable=True, index=True)

    # Sprint association
    # SET NULL: issues survive sprint deletion, returning to the backlog.
    sprint_id = Column(
        PGUUID(as_uuid=True),
        ForeignKey("sprints.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )

    # Story points for estimation
    story_points = Column(Integer, nullable=True)

    # Due date for the issue
    due_date = Column(Date, nullable=True, index=True)

    # External tracker integration
    external_tracker_type = Column(
        String(50),
        nullable=True,
        index=True,
    )  # 'gitea', 'github', 'gitlab'

    external_issue_id = Column(String(255), nullable=True)  # External system's ID
    remote_url = Column(String(1000), nullable=True)  # Link to external issue
    external_issue_number = Column(Integer, nullable=True)  # Issue number (e.g., #123)

    # Sync status with external tracker
    sync_status: Column[SyncStatus] = Column(
        Enum(SyncStatus),
        default=SyncStatus.SYNCED,
        nullable=False,
        # Note: Index defined in __table_args__ as ix_issues_sync_status
    )

    last_synced_at = Column(DateTime(timezone=True), nullable=True)
    external_updated_at = Column(DateTime(timezone=True), nullable=True)

    # Lifecycle timestamp
    closed_at = Column(DateTime(timezone=True), nullable=True, index=True)

    # Relationships
    project = relationship("Project", back_populates="issues")
    # foreign_keys is explicit because multiple UUID columns exist here.
    assigned_agent = relationship(
        "AgentInstance",
        back_populates="assigned_issues",
        foreign_keys=[assigned_agent_id],
    )
    sprint = relationship("Sprint", back_populates="issues")
    # Self-referential hierarchy; children accessible via the backref.
    parent = relationship("Issue", remote_side="Issue.id", backref="children")

    # Composite indexes matching the common per-project filter patterns.
    __table_args__ = (
        Index("ix_issues_project_status", "project_id", "status"),
        Index("ix_issues_project_priority", "project_id", "priority"),
        Index("ix_issues_project_sprint", "project_id", "sprint_id"),
        Index(
            "ix_issues_external_tracker_id",
            "external_tracker_type",
            "external_issue_id",
        ),
        Index("ix_issues_sync_status", "sync_status"),
        Index("ix_issues_project_agent", "project_id", "assigned_agent_id"),
        Index("ix_issues_project_type", "project_id", "type"),
        Index("ix_issues_project_status_priority", "project_id", "status", "priority"),
    )

    def __repr__(self) -> str:
        return (
            f"<Issue {self.id} title='{self.title[:30]}...' "
            f"status={self.status.value} priority={self.priority.value}>"
        )
|
||||||
103
backend/app/models/syndarix/project.py
Normal file
103
backend/app/models/syndarix/project.py
Normal file
@@ -0,0 +1,103 @@
|
|||||||
|
# app/models/syndarix/project.py
|
||||||
|
"""
|
||||||
|
Project model for Syndarix AI consulting platform.
|
||||||
|
|
||||||
|
A Project represents a client engagement where AI agents collaborate
|
||||||
|
to deliver software solutions.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from sqlalchemy import Column, Enum, ForeignKey, Index, String, Text
|
||||||
|
from sqlalchemy.dialects.postgresql import (
|
||||||
|
JSONB,
|
||||||
|
UUID as PGUUID,
|
||||||
|
)
|
||||||
|
from sqlalchemy.orm import relationship
|
||||||
|
|
||||||
|
from app.models.base import Base, TimestampMixin, UUIDMixin
|
||||||
|
|
||||||
|
from .enums import AutonomyLevel, ClientMode, ProjectComplexity, ProjectStatus
|
||||||
|
|
||||||
|
|
||||||
|
class Project(Base, UUIDMixin, TimestampMixin):
    """
    Project model representing a client engagement.

    A project contains:
    - Configuration for how autonomous agents should operate
    - Settings for MCP server integrations
    - Relationship to assigned agents, issues, and sprints
    """

    __tablename__ = "projects"

    # Display name; slug is the unique, URL-safe identifier.
    name = Column(String(255), nullable=False, index=True)
    slug = Column(String(255), unique=True, nullable=False, index=True)
    description = Column(Text, nullable=True)

    # How much human oversight agents require on this project.
    autonomy_level: Column[AutonomyLevel] = Column(
        Enum(AutonomyLevel),
        default=AutonomyLevel.MILESTONE,
        nullable=False,
        index=True,
    )

    status: Column[ProjectStatus] = Column(
        Enum(ProjectStatus),
        default=ProjectStatus.ACTIVE,
        nullable=False,
        index=True,
    )

    complexity: Column[ProjectComplexity] = Column(
        Enum(ProjectComplexity),
        default=ProjectComplexity.MEDIUM,
        nullable=False,
        index=True,
    )

    client_mode: Column[ClientMode] = Column(
        Enum(ClientMode),
        default=ClientMode.AUTO,
        nullable=False,
        index=True,
    )

    # JSON field for flexible project configuration
    # Can include: mcp_servers, webhook_urls, notification_settings, etc.
    settings = Column(JSONB, default=dict, nullable=False)

    # Foreign key to the User who owns this project
    # SET NULL: the project survives deletion of its owner.
    owner_id = Column(
        PGUUID(as_uuid=True),
        ForeignKey("users.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )

    # Relationships
    owner = relationship("User", foreign_keys=[owner_id])
    # ORM-level delete-orphan cascades mirror the children's DB-level
    # ondelete="CASCADE" foreign keys back to projects.
    agent_instances = relationship(
        "AgentInstance",
        back_populates="project",
        cascade="all, delete-orphan",
    )
    issues = relationship(
        "Issue",
        back_populates="project",
        cascade="all, delete-orphan",
    )
    sprints = relationship(
        "Sprint",
        back_populates="project",
        cascade="all, delete-orphan",
    )

    # Composite indexes pairing each major filter column with status.
    __table_args__ = (
        Index("ix_projects_slug_status", "slug", "status"),
        Index("ix_projects_owner_status", "owner_id", "status"),
        Index("ix_projects_autonomy_status", "autonomy_level", "status"),
        Index("ix_projects_complexity_status", "complexity", "status"),
    )

    def __repr__(self) -> str:
        return f"<Project {self.name} ({self.slug}) status={self.status.value}>"
|
||||||
86
backend/app/models/syndarix/sprint.py
Normal file
86
backend/app/models/syndarix/sprint.py
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
# app/models/syndarix/sprint.py
|
||||||
|
"""
|
||||||
|
Sprint model for Syndarix AI consulting platform.
|
||||||
|
|
||||||
|
A Sprint represents a time-boxed iteration for organizing and delivering work.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from sqlalchemy import (
|
||||||
|
Column,
|
||||||
|
Date,
|
||||||
|
Enum,
|
||||||
|
ForeignKey,
|
||||||
|
Index,
|
||||||
|
Integer,
|
||||||
|
String,
|
||||||
|
Text,
|
||||||
|
UniqueConstraint,
|
||||||
|
)
|
||||||
|
from sqlalchemy.dialects.postgresql import UUID as PGUUID
|
||||||
|
from sqlalchemy.orm import relationship
|
||||||
|
|
||||||
|
from app.models.base import Base, TimestampMixin, UUIDMixin
|
||||||
|
|
||||||
|
from .enums import SprintStatus
|
||||||
|
|
||||||
|
|
||||||
|
class Sprint(Base, UUIDMixin, TimestampMixin):
    """
    Sprint model representing a time-boxed iteration.

    Tracks:
    - Sprint metadata (name, number, goal)
    - Date range (start/end)
    - Progress metrics (planned vs completed points)
    """

    __tablename__ = "sprints"

    # Foreign key to project
    # CASCADE: deleting a project removes its sprints.
    project_id = Column(
        PGUUID(as_uuid=True),
        ForeignKey("projects.id", ondelete="CASCADE"),
        nullable=False,
        index=True,
    )

    # Sprint identification
    name = Column(String(255), nullable=False)
    number = Column(Integer, nullable=False)  # Sprint number within project

    # Sprint goal (what we aim to achieve)
    goal = Column(Text, nullable=True)

    # Date range
    # NOTE(review): no CHECK constraint enforces start_date <= end_date —
    # presumably validated at the schema/service layer; confirm.
    start_date = Column(Date, nullable=False, index=True)
    end_date = Column(Date, nullable=False, index=True)

    # Status
    status: Column[SprintStatus] = Column(
        Enum(SprintStatus),
        default=SprintStatus.PLANNED,
        nullable=False,
        index=True,
    )

    # Progress metrics
    planned_points = Column(Integer, nullable=True)  # Sum of story points at start
    velocity = Column(Integer, nullable=True)  # Sum of completed story points

    # Relationships
    project = relationship("Project", back_populates="sprints")
    # No cascade: issues outlive their sprint (Issue.sprint_id is SET NULL).
    issues = relationship("Issue", back_populates="sprint")

    __table_args__ = (
        Index("ix_sprints_project_status", "project_id", "status"),
        Index("ix_sprints_project_number", "project_id", "number"),
        Index("ix_sprints_date_range", "start_date", "end_date"),
        # Ensure sprint numbers are unique within a project
        UniqueConstraint("project_id", "number", name="uq_sprint_project_number"),
    )

    def __repr__(self) -> str:
        return (
            f"<Sprint {self.name} (#{self.number}) "
            f"project={self.project_id} status={self.status.value}>"
        )
|
||||||
273
backend/app/schemas/events.py
Normal file
273
backend/app/schemas/events.py
Normal file
@@ -0,0 +1,273 @@
|
|||||||
|
"""
|
||||||
|
Event schemas for the Syndarix EventBus (Redis Pub/Sub).
|
||||||
|
|
||||||
|
This module defines event types and payload schemas for real-time communication
|
||||||
|
between services, agents, and the frontend.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Literal
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
||||||
|
class EventType(str, Enum):
    """
    Event types for the EventBus.

    Naming convention: {domain}.{action}

    The ``str`` mixin makes each member compare equal to (and serialize as)
    its plain string value, so events round-trip through JSON unchanged.
    """

    # Agent Events
    AGENT_SPAWNED = "agent.spawned"
    AGENT_STATUS_CHANGED = "agent.status_changed"
    AGENT_MESSAGE = "agent.message"
    AGENT_TERMINATED = "agent.terminated"

    # Issue Events
    ISSUE_CREATED = "issue.created"
    ISSUE_UPDATED = "issue.updated"
    ISSUE_ASSIGNED = "issue.assigned"
    ISSUE_CLOSED = "issue.closed"

    # Sprint Events
    SPRINT_STARTED = "sprint.started"
    SPRINT_COMPLETED = "sprint.completed"

    # Approval Events
    APPROVAL_REQUESTED = "approval.requested"
    APPROVAL_GRANTED = "approval.granted"
    APPROVAL_DENIED = "approval.denied"

    # Project Events
    PROJECT_CREATED = "project.created"
    PROJECT_UPDATED = "project.updated"
    PROJECT_ARCHIVED = "project.archived"

    # Workflow Events
    WORKFLOW_STARTED = "workflow.started"
    WORKFLOW_STEP_COMPLETED = "workflow.step_completed"
    WORKFLOW_COMPLETED = "workflow.completed"
    WORKFLOW_FAILED = "workflow.failed"
|
||||||
|
|
||||||
|
|
||||||
|
# Discriminator for who triggered an event; used by Event.actor_type below.
ActorType = Literal["agent", "user", "system"]
|
||||||
|
|
||||||
|
|
||||||
|
class Event(BaseModel):
    """
    Base event schema for the EventBus.

    All events published to the EventBus must conform to this schema.
    """

    # NOTE: ``id`` is a plain string (UUID text), while ``project_id``/``actor_id``
    # are parsed UUIDs — presumably deliberate for pass-through of broker message
    # ids; confirm with the publisher side.
    id: str = Field(
        ...,
        description="Unique event identifier (UUID string)",
        examples=["550e8400-e29b-41d4-a716-446655440000"],
    )
    type: EventType = Field(
        ...,
        description="Event type enum value",
        examples=[EventType.AGENT_MESSAGE],
    )
    timestamp: datetime = Field(
        ...,
        description="When the event occurred (UTC)",
        examples=["2024-01-15T10:30:00Z"],
    )
    project_id: UUID = Field(
        ...,
        description="Project this event belongs to",
        examples=["550e8400-e29b-41d4-a716-446655440001"],
    )
    actor_id: UUID | None = Field(
        default=None,
        description="ID of the agent or user who triggered the event",
        examples=["550e8400-e29b-41d4-a716-446655440002"],
    )
    actor_type: ActorType = Field(
        ...,
        description="Type of actor: 'agent', 'user', or 'system'",
        examples=["agent"],
    )
    # Free-form dict; typed shapes for each event type are defined below
    # (AgentSpawnedPayload, IssueCreatedPayload, ...).
    payload: dict = Field(
        default_factory=dict,
        description="Event-specific payload data",
    )

    # Plain-dict model_config (valid in pydantic v2); other schema modules in
    # this package use ConfigDict for the same purpose.
    model_config = {
        "json_schema_extra": {
            "example": {
                "id": "550e8400-e29b-41d4-a716-446655440000",
                "type": "agent.message",
                "timestamp": "2024-01-15T10:30:00Z",
                "project_id": "550e8400-e29b-41d4-a716-446655440001",
                "actor_id": "550e8400-e29b-41d4-a716-446655440002",
                "actor_type": "agent",
                "payload": {"message": "Processing task...", "progress": 50},
            }
        }
    }
|
||||||
|
|
||||||
|
|
||||||
|
# Specific payload schemas for type safety
|
||||||
|
|
||||||
|
|
||||||
|
class AgentSpawnedPayload(BaseModel):
    """Payload for AGENT_SPAWNED events.

    Intended shape of ``Event.payload`` for ``EventType.AGENT_SPAWNED``.
    """

    agent_instance_id: UUID = Field(..., description="ID of the spawned agent instance")
    agent_type_id: UUID = Field(..., description="ID of the agent type")
    agent_name: str = Field(..., description="Human-readable name of the agent")
    role: str = Field(..., description="Agent role (e.g., 'product_owner', 'engineer')")
|
||||||
|
|
||||||
|
|
||||||
|
class AgentStatusChangedPayload(BaseModel):
    """Payload for AGENT_STATUS_CHANGED events.

    Statuses are carried as plain strings here (not the AgentStatus enum),
    so any value the publisher sends is accepted.
    """

    agent_instance_id: UUID = Field(..., description="ID of the agent instance")
    previous_status: str = Field(..., description="Previous status")
    new_status: str = Field(..., description="New status")
    reason: str | None = Field(default=None, description="Reason for status change")
|
||||||
|
|
||||||
|
|
||||||
|
class AgentMessagePayload(BaseModel):
    """Payload for AGENT_MESSAGE events."""

    agent_instance_id: UUID = Field(..., description="ID of the agent instance")
    message: str = Field(..., description="Message content")
    # Free-form str, not an Enum — values outside the documented four are
    # not rejected by validation.
    message_type: str = Field(
        default="info",
        description="Message type: 'info', 'warning', 'error', 'debug'",
    )
    metadata: dict = Field(
        default_factory=dict,
        description="Additional metadata (e.g., token usage, model info)",
    )
|
||||||
|
|
||||||
|
|
||||||
|
class AgentTerminatedPayload(BaseModel):
    """Payload for AGENT_TERMINATED events.

    Intended shape of ``Event.payload`` for ``EventType.AGENT_TERMINATED``.
    """

    agent_instance_id: UUID = Field(..., description="ID of the agent instance")
    termination_reason: str = Field(..., description="Reason for termination")
    final_status: str = Field(..., description="Final status at termination")
|
||||||
|
|
||||||
|
|
||||||
|
class IssueCreatedPayload(BaseModel):
    """Payload for ISSUE_CREATED events.

    ``issue_id`` is a string because it originates from an external tracker,
    not from our UUID-keyed database.
    """

    issue_id: str = Field(..., description="Issue ID (from external tracker)")
    title: str = Field(..., description="Issue title")
    priority: str | None = Field(default=None, description="Issue priority")
    labels: list[str] = Field(default_factory=list, description="Issue labels")
|
||||||
|
|
||||||
|
|
||||||
|
class IssueUpdatedPayload(BaseModel):
    """Payload for ISSUE_UPDATED events."""

    issue_id: str = Field(..., description="Issue ID (from external tracker)")
    # Shape of the change entries is not constrained here — presumably
    # {field: new_value} or {field: (old, new)}; confirm with the publisher.
    changes: dict = Field(..., description="Dictionary of field changes")
|
||||||
|
|
||||||
|
|
||||||
|
class IssueAssignedPayload(BaseModel):
    """Payload for ISSUE_ASSIGNED events.

    ``assignee_id`` of None can represent an unassignment.
    """

    issue_id: str = Field(..., description="Issue ID (from external tracker)")
    assignee_id: UUID | None = Field(
        default=None, description="Agent or user assigned to"
    )
    assignee_name: str | None = Field(default=None, description="Assignee name")
|
||||||
|
|
||||||
|
|
||||||
|
class IssueClosedPayload(BaseModel):
    """Payload for ISSUE_CLOSED events.

    Intended shape of ``Event.payload`` for ``EventType.ISSUE_CLOSED``.
    """

    issue_id: str = Field(..., description="Issue ID (from external tracker)")
    resolution: str = Field(..., description="Resolution status")
|
||||||
|
|
||||||
|
|
||||||
|
class SprintStartedPayload(BaseModel):
    """Payload for SPRINT_STARTED events.

    Intended shape of ``Event.payload`` for ``EventType.SPRINT_STARTED``.
    """

    sprint_id: UUID = Field(..., description="Sprint ID")
    sprint_name: str = Field(..., description="Sprint name")
    goal: str | None = Field(default=None, description="Sprint goal")
    issue_count: int = Field(default=0, description="Number of issues in sprint")
|
||||||
|
|
||||||
|
|
||||||
|
class SprintCompletedPayload(BaseModel):
    """Payload for SPRINT_COMPLETED events.

    Intended shape of ``Event.payload`` for ``EventType.SPRINT_COMPLETED``.
    """

    sprint_id: UUID = Field(..., description="Sprint ID")
    sprint_name: str = Field(..., description="Sprint name")
    completed_issues: int = Field(default=0, description="Number of completed issues")
    incomplete_issues: int = Field(default=0, description="Number of incomplete issues")
|
||||||
|
|
||||||
|
|
||||||
|
class ApprovalRequestedPayload(BaseModel):
    """Payload for APPROVAL_REQUESTED events."""

    approval_id: UUID = Field(..., description="Approval request ID")
    approval_type: str = Field(..., description="Type of approval needed")
    description: str = Field(..., description="Description of what needs approval")
    requested_by: UUID | None = Field(
        default=None, description="Agent/user requesting approval"
    )
    # None means no auto-escalation deadline.
    timeout_minutes: int | None = Field(
        default=None, description="Minutes before auto-escalation"
    )
|
||||||
|
|
||||||
|
|
||||||
|
class ApprovalGrantedPayload(BaseModel):
    """Payload for APPROVAL_GRANTED events.

    Intended shape of ``Event.payload`` for ``EventType.APPROVAL_GRANTED``.
    """

    approval_id: UUID = Field(..., description="Approval request ID")
    approved_by: UUID = Field(..., description="User who granted approval")
    comments: str | None = Field(default=None, description="Approval comments")
|
||||||
|
|
||||||
|
|
||||||
|
class ApprovalDeniedPayload(BaseModel):
    """Payload for APPROVAL_DENIED events.

    Unlike ApprovalGrantedPayload's optional comments, a denial requires a reason.
    """

    approval_id: UUID = Field(..., description="Approval request ID")
    denied_by: UUID = Field(..., description="User who denied approval")
    reason: str = Field(..., description="Reason for denial")
|
||||||
|
|
||||||
|
|
||||||
|
class WorkflowStartedPayload(BaseModel):
    """Payload for WORKFLOW_STARTED events.

    Intended shape of ``Event.payload`` for ``EventType.WORKFLOW_STARTED``.
    """

    workflow_id: UUID = Field(..., description="Workflow execution ID")
    workflow_type: str = Field(..., description="Type of workflow")
    total_steps: int = Field(default=0, description="Total number of steps")
|
||||||
|
|
||||||
|
|
||||||
|
class WorkflowStepCompletedPayload(BaseModel):
    """Payload for WORKFLOW_STEP_COMPLETED events."""

    workflow_id: UUID = Field(..., description="Workflow execution ID")
    step_name: str = Field(..., description="Name of completed step")
    step_number: int = Field(..., description="Step number (1-indexed)")
    total_steps: int = Field(..., description="Total number of steps")
    result: dict = Field(default_factory=dict, description="Step result data")
|
||||||
|
|
||||||
|
|
||||||
|
class WorkflowCompletedPayload(BaseModel):
    """Payload for WORKFLOW_COMPLETED events.

    Intended shape of ``Event.payload`` for ``EventType.WORKFLOW_COMPLETED``.
    """

    workflow_id: UUID = Field(..., description="Workflow execution ID")
    duration_seconds: float = Field(..., description="Total execution duration")
    result: dict = Field(default_factory=dict, description="Workflow result data")
|
||||||
|
|
||||||
|
|
||||||
|
class WorkflowFailedPayload(BaseModel):
    """Payload for WORKFLOW_FAILED events."""

    workflow_id: UUID = Field(..., description="Workflow execution ID")
    error_message: str = Field(..., description="Error message")
    failed_step: str | None = Field(default=None, description="Step that failed")
    # Defaults to False: failures are treated as fatal unless explicitly
    # marked recoverable by the publisher.
    recoverable: bool = Field(default=False, description="Whether error is recoverable")
|
||||||
113
backend/app/schemas/syndarix/__init__.py
Normal file
113
backend/app/schemas/syndarix/__init__.py
Normal file
@@ -0,0 +1,113 @@
|
|||||||
|
# app/schemas/syndarix/__init__.py
|
||||||
|
"""
|
||||||
|
Syndarix domain schemas.
|
||||||
|
|
||||||
|
This package contains Pydantic schemas for validating and serializing
|
||||||
|
Syndarix domain entities.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .agent_instance import (
|
||||||
|
AgentInstanceCreate,
|
||||||
|
AgentInstanceInDB,
|
||||||
|
AgentInstanceListResponse,
|
||||||
|
AgentInstanceMetrics,
|
||||||
|
AgentInstanceResponse,
|
||||||
|
AgentInstanceTerminate,
|
||||||
|
AgentInstanceUpdate,
|
||||||
|
)
|
||||||
|
from .agent_type import (
|
||||||
|
AgentTypeCreate,
|
||||||
|
AgentTypeInDB,
|
||||||
|
AgentTypeListResponse,
|
||||||
|
AgentTypeResponse,
|
||||||
|
AgentTypeUpdate,
|
||||||
|
)
|
||||||
|
from .enums import (
|
||||||
|
AgentStatus,
|
||||||
|
AutonomyLevel,
|
||||||
|
IssuePriority,
|
||||||
|
IssueStatus,
|
||||||
|
ProjectStatus,
|
||||||
|
SprintStatus,
|
||||||
|
SyncStatus,
|
||||||
|
)
|
||||||
|
from .issue import (
|
||||||
|
IssueAssign,
|
||||||
|
IssueClose,
|
||||||
|
IssueCreate,
|
||||||
|
IssueInDB,
|
||||||
|
IssueListResponse,
|
||||||
|
IssueResponse,
|
||||||
|
IssueStats,
|
||||||
|
IssueSyncUpdate,
|
||||||
|
IssueUpdate,
|
||||||
|
)
|
||||||
|
from .project import (
|
||||||
|
ProjectCreate,
|
||||||
|
ProjectInDB,
|
||||||
|
ProjectListResponse,
|
||||||
|
ProjectResponse,
|
||||||
|
ProjectUpdate,
|
||||||
|
)
|
||||||
|
from .sprint import (
|
||||||
|
SprintBurndown,
|
||||||
|
SprintComplete,
|
||||||
|
SprintCreate,
|
||||||
|
SprintInDB,
|
||||||
|
SprintListResponse,
|
||||||
|
SprintResponse,
|
||||||
|
SprintStart,
|
||||||
|
SprintUpdate,
|
||||||
|
SprintVelocity,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Explicit public API of the syndarix schemas package.
# Kept globally alphabetical, with comments marking each entity group.
__all__ = [
    # AgentInstance schemas
    "AgentInstanceCreate",
    "AgentInstanceInDB",
    "AgentInstanceListResponse",
    "AgentInstanceMetrics",
    "AgentInstanceResponse",
    "AgentInstanceTerminate",
    "AgentInstanceUpdate",
    # Enums
    "AgentStatus",
    # AgentType schemas
    "AgentTypeCreate",
    "AgentTypeInDB",
    "AgentTypeListResponse",
    "AgentTypeResponse",
    "AgentTypeUpdate",
    "AutonomyLevel",
    # Issue schemas
    "IssueAssign",
    "IssueClose",
    "IssueCreate",
    "IssueInDB",
    "IssueListResponse",
    "IssuePriority",
    "IssueResponse",
    "IssueStats",
    "IssueStatus",
    "IssueSyncUpdate",
    "IssueUpdate",
    # Project schemas
    "ProjectCreate",
    "ProjectInDB",
    "ProjectListResponse",
    "ProjectResponse",
    "ProjectStatus",
    "ProjectUpdate",
    # Sprint schemas
    "SprintBurndown",
    "SprintComplete",
    "SprintCreate",
    "SprintInDB",
    "SprintListResponse",
    "SprintResponse",
    "SprintStart",
    "SprintStatus",
    "SprintUpdate",
    "SprintVelocity",
    "SyncStatus",
]
|
||||||
124
backend/app/schemas/syndarix/agent_instance.py
Normal file
124
backend/app/schemas/syndarix/agent_instance.py
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
# app/schemas/syndarix/agent_instance.py
|
||||||
|
"""
|
||||||
|
Pydantic schemas for AgentInstance entity.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
from decimal import Decimal
|
||||||
|
from typing import Any
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from pydantic import BaseModel, ConfigDict, Field
|
||||||
|
|
||||||
|
from .enums import AgentStatus
|
||||||
|
|
||||||
|
|
||||||
|
class AgentInstanceBase(BaseModel):
    """Base agent instance schema with common fields.

    NOTE(review): does not include ``name``, although AgentInstanceCreate and
    AgentInstanceResponse both declare it — confirm whether it was meant to
    live here (it is also absent from AgentInstanceInDB, which inherits this).
    """

    agent_type_id: UUID
    project_id: UUID
    status: AgentStatus = AgentStatus.IDLE
    current_task: str | None = None
    # Ephemeral working-memory blob kept inline on the instance.
    short_term_memory: dict[str, Any] = Field(default_factory=dict)
    # Pointer (string ref) into the external long-term memory store.
    long_term_memory_ref: str | None = Field(None, max_length=500)
    session_id: str | None = Field(None, max_length=255)
|
||||||
|
|
||||||
|
|
||||||
|
class AgentInstanceCreate(BaseModel):
    """Schema for creating a new agent instance.

    Deliberately re-declares the AgentInstanceBase fields instead of
    inheriting, so that ``name`` (required here) sits among them.
    """

    agent_type_id: UUID
    project_id: UUID
    name: str = Field(..., min_length=1, max_length=100)
    status: AgentStatus = AgentStatus.IDLE
    current_task: str | None = None
    short_term_memory: dict[str, Any] = Field(default_factory=dict)
    long_term_memory_ref: str | None = Field(None, max_length=500)
    session_id: str | None = Field(None, max_length=255)
|
||||||
|
|
||||||
|
|
||||||
|
class AgentInstanceUpdate(BaseModel):
    """Schema for updating an agent instance.

    All fields optional; None means "leave unchanged" (partial update).
    """

    status: AgentStatus | None = None
    current_task: str | None = None
    short_term_memory: dict[str, Any] | None = None
    long_term_memory_ref: str | None = None
    session_id: str | None = None
    last_activity_at: datetime | None = None
    # Counters/cost must be non-negative when supplied.
    tasks_completed: int | None = Field(None, ge=0)
    tokens_used: int | None = Field(None, ge=0)
    cost_incurred: Decimal | None = Field(None, ge=0)
|
||||||
|
|
||||||
|
|
||||||
|
class AgentInstanceTerminate(BaseModel):
    """Schema for terminating an agent instance.

    Carries only an optional free-text reason for the termination.
    """

    reason: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class AgentInstanceInDB(AgentInstanceBase):
    """Schema for agent instance in database.

    NOTE(review): inherits AgentInstanceBase, which lacks ``name`` — so this
    DB-facing schema has no name field even though the model presumably
    stores one (AgentInstanceResponse exposes it). Confirm intent.
    """

    id: UUID
    last_activity_at: datetime | None = None
    terminated_at: datetime | None = None
    tasks_completed: int = 0
    tokens_used: int = 0
    # Four decimal places to track fractional API costs.
    cost_incurred: Decimal = Decimal("0.0000")
    created_at: datetime
    updated_at: datetime

    # Allow construction directly from ORM objects.
    model_config = ConfigDict(from_attributes=True)
|
||||||
|
|
||||||
|
|
||||||
|
class AgentInstanceResponse(BaseModel):
    """Schema for agent instance API responses.

    Flattened (does not inherit AgentInstanceBase) so the expanded
    relationship fields can be appended after the core columns.
    """

    id: UUID
    agent_type_id: UUID
    project_id: UUID
    name: str
    status: AgentStatus
    current_task: str | None = None
    short_term_memory: dict[str, Any] = Field(default_factory=dict)
    long_term_memory_ref: str | None = None
    session_id: str | None = None
    last_activity_at: datetime | None = None
    terminated_at: datetime | None = None
    tasks_completed: int = 0
    tokens_used: int = 0
    cost_incurred: Decimal = Decimal("0.0000")
    created_at: datetime
    updated_at: datetime

    # Expanded fields from relationships
    agent_type_name: str | None = None
    agent_type_slug: str | None = None
    project_name: str | None = None
    project_slug: str | None = None
    assigned_issues_count: int | None = 0

    model_config = ConfigDict(from_attributes=True)
|
||||||
|
|
||||||
|
|
||||||
|
class AgentInstanceListResponse(BaseModel):
    """Schema for paginated agent instance list responses."""

    agent_instances: list[AgentInstanceResponse]
    # Pagination metadata.
    total: int
    page: int
    page_size: int
    pages: int
|
||||||
|
|
||||||
|
|
||||||
|
class AgentInstanceMetrics(BaseModel):
    """Schema for agent instance metrics summary.

    Aggregated counts/totals across instances (presumably per project or
    fleet-wide — determined by the querying endpoint).
    """

    total_instances: int
    active_instances: int
    idle_instances: int
    total_tasks_completed: int
    total_tokens_used: int
    total_cost_incurred: Decimal
|
||||||
151
backend/app/schemas/syndarix/agent_type.py
Normal file
151
backend/app/schemas/syndarix/agent_type.py
Normal file
@@ -0,0 +1,151 @@
|
|||||||
|
# app/schemas/syndarix/agent_type.py
|
||||||
|
"""
|
||||||
|
Pydantic schemas for AgentType entity.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Any
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
||||||
|
|
||||||
|
|
||||||
|
class AgentTypeBase(BaseModel):
    """Base agent type schema with common fields.

    NOTE(review): the slug/name/expertise validators here are duplicated in
    AgentTypeUpdate — keep both copies in sync.
    NOTE(review): ``model_params`` starts with ``model_``, which falls in
    pydantic v2's default protected namespace; confirm no warning at import.
    """

    name: str = Field(..., min_length=1, max_length=255)
    slug: str | None = Field(None, min_length=1, max_length=255)
    description: str | None = None
    expertise: list[str] = Field(default_factory=list)
    personality_prompt: str = Field(..., min_length=1)
    primary_model: str = Field(..., min_length=1, max_length=100)
    fallback_models: list[str] = Field(default_factory=list)
    model_params: dict[str, Any] = Field(default_factory=dict)
    mcp_servers: list[str] = Field(default_factory=list)
    tool_permissions: dict[str, Any] = Field(default_factory=dict)
    is_active: bool = True

    @field_validator("slug")
    @classmethod
    def validate_slug(cls, v: str | None) -> str | None:
        """Validate slug format: lowercase, alphanumeric, hyphens only."""
        if v is None:
            return v
        if not re.match(r"^[a-z0-9-]+$", v):
            raise ValueError(
                "Slug must contain only lowercase letters, numbers, and hyphens"
            )
        if v.startswith("-") or v.endswith("-"):
            raise ValueError("Slug cannot start or end with a hyphen")
        if "--" in v:
            raise ValueError("Slug cannot contain consecutive hyphens")
        return v

    @field_validator("name")
    @classmethod
    def validate_name(cls, v: str) -> str:
        """Validate agent type name."""
        # Reject whitespace-only names; store the trimmed form.
        if not v or v.strip() == "":
            raise ValueError("Agent type name cannot be empty")
        return v.strip()

    @field_validator("expertise")
    @classmethod
    def validate_expertise(cls, v: list[str]) -> list[str]:
        """Validate and normalize expertise list."""
        # Lowercase + trim each tag; drop blank entries.
        return [e.strip().lower() for e in v if e.strip()]

    @field_validator("mcp_servers")
    @classmethod
    def validate_mcp_servers(cls, v: list[str]) -> list[str]:
        """Validate MCP server list."""
        # Trim each entry; drop blanks (case is preserved, unlike expertise).
        return [s.strip() for s in v if s.strip()]
|
||||||
|
|
||||||
|
|
||||||
|
class AgentTypeCreate(AgentTypeBase):
    """Schema for creating a new agent type.

    Overrides ``slug`` to be required (it is optional on AgentTypeBase);
    the other re-declarations mirror the base constraints unchanged.
    """

    name: str = Field(..., min_length=1, max_length=255)
    slug: str = Field(..., min_length=1, max_length=255)
    personality_prompt: str = Field(..., min_length=1)
    primary_model: str = Field(..., min_length=1, max_length=100)
|
||||||
|
|
||||||
|
|
||||||
|
class AgentTypeUpdate(BaseModel):
    """Schema for updating an agent type.

    All fields optional; None means "leave unchanged". The validators are
    None-tolerant copies of those on AgentTypeBase — keep them in sync.
    """

    name: str | None = Field(None, min_length=1, max_length=255)
    slug: str | None = Field(None, min_length=1, max_length=255)
    description: str | None = None
    expertise: list[str] | None = None
    personality_prompt: str | None = None
    primary_model: str | None = Field(None, min_length=1, max_length=100)
    fallback_models: list[str] | None = None
    model_params: dict[str, Any] | None = None
    mcp_servers: list[str] | None = None
    tool_permissions: dict[str, Any] | None = None
    is_active: bool | None = None

    @field_validator("slug")
    @classmethod
    def validate_slug(cls, v: str | None) -> str | None:
        """Validate slug format."""
        if v is None:
            return v
        if not re.match(r"^[a-z0-9-]+$", v):
            raise ValueError(
                "Slug must contain only lowercase letters, numbers, and hyphens"
            )
        if v.startswith("-") or v.endswith("-"):
            raise ValueError("Slug cannot start or end with a hyphen")
        if "--" in v:
            raise ValueError("Slug cannot contain consecutive hyphens")
        return v

    @field_validator("name")
    @classmethod
    def validate_name(cls, v: str | None) -> str | None:
        """Validate agent type name."""
        # None passes through (no update); whitespace-only is rejected.
        if v is not None and (not v or v.strip() == ""):
            raise ValueError("Agent type name cannot be empty")
        return v.strip() if v else v

    @field_validator("expertise")
    @classmethod
    def validate_expertise(cls, v: list[str] | None) -> list[str] | None:
        """Validate and normalize expertise list."""
        if v is None:
            return v
        return [e.strip().lower() for e in v if e.strip()]
|
||||||
|
|
||||||
|
|
||||||
|
class AgentTypeInDB(AgentTypeBase):
    """Schema for agent type in database.

    Base fields plus the DB-managed identity and timestamp columns.
    """

    id: UUID
    created_at: datetime
    updated_at: datetime

    # Allow construction directly from ORM objects.
    model_config = ConfigDict(from_attributes=True)
|
||||||
|
|
||||||
|
|
||||||
|
class AgentTypeResponse(AgentTypeBase):
    """Schema for agent type API responses.

    Adds the computed ``instance_count`` on top of the DB fields.
    """

    id: UUID
    created_at: datetime
    updated_at: datetime
    # Number of instances of this type; filled in by the endpoint.
    instance_count: int | None = 0

    model_config = ConfigDict(from_attributes=True)
|
||||||
|
|
||||||
|
|
||||||
|
class AgentTypeListResponse(BaseModel):
    """Schema for paginated agent type list responses."""

    agent_types: list[AgentTypeResponse]
    # Pagination metadata.
    total: int
    page: int
    page_size: int
    pages: int
|
||||||
26
backend/app/schemas/syndarix/enums.py
Normal file
26
backend/app/schemas/syndarix/enums.py
Normal file
@@ -0,0 +1,26 @@
|
|||||||
|
# app/schemas/syndarix/enums.py
|
||||||
|
"""
|
||||||
|
Re-export enums from models for use in schemas.
|
||||||
|
|
||||||
|
This allows schemas to import enums without depending on SQLAlchemy models directly.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from app.models.syndarix.enums import (
|
||||||
|
AgentStatus,
|
||||||
|
AutonomyLevel,
|
||||||
|
IssuePriority,
|
||||||
|
IssueStatus,
|
||||||
|
ProjectStatus,
|
||||||
|
SprintStatus,
|
||||||
|
SyncStatus,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Re-exported names; this module adds nothing beyond the model enums.
__all__ = [
    "AgentStatus",
    "AutonomyLevel",
    "IssuePriority",
    "IssueStatus",
    "ProjectStatus",
    "SprintStatus",
    "SyncStatus",
]
|
||||||
191
backend/app/schemas/syndarix/issue.py
Normal file
191
backend/app/schemas/syndarix/issue.py
Normal file
@@ -0,0 +1,191 @@
|
|||||||
|
# app/schemas/syndarix/issue.py
|
||||||
|
"""
|
||||||
|
Pydantic schemas for Issue entity.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Literal
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
|
||||||
|
|
||||||
|
from .enums import IssuePriority, IssueStatus, SyncStatus
|
||||||
|
|
||||||
|
|
||||||
|
class IssueBase(BaseModel):
    """Base issue schema with common fields."""

    title: str = Field(..., min_length=1, max_length=500)
    body: str = ""
    status: IssueStatus = IssueStatus.OPEN
    priority: IssuePriority = IssuePriority.MEDIUM
    labels: list[str] = Field(default_factory=list)
    story_points: int | None = Field(None, ge=0, le=100)

    @field_validator("title")
    @classmethod
    def validate_title(cls, v: str) -> str:
        """Validate issue title."""
        # Reject whitespace-only titles; store the trimmed form.
        if not v or v.strip() == "":
            raise ValueError("Issue title cannot be empty")
        return v.strip()

    @field_validator("labels")
    @classmethod
    def validate_labels(cls, v: list[str]) -> list[str]:
        """Validate and normalize labels."""
        # Lowercase + trim each label; drop blank entries.
        return [label.strip().lower() for label in v if label.strip()]
|
||||||
|
|
||||||
|
|
||||||
|
class IssueCreate(IssueBase):
    """Schema for creating a new issue.

    The external tracker fields are only populated when importing an issue
    from Gitea/GitHub/GitLab; locally created issues leave them None.
    """

    project_id: UUID
    assigned_agent_id: UUID | None = None
    human_assignee: str | None = Field(None, max_length=255)
    sprint_id: UUID | None = None

    # External tracker fields (optional, for importing from external systems)
    external_tracker_type: Literal["gitea", "github", "gitlab"] | None = None
    external_issue_id: str | None = Field(None, max_length=255)
    remote_url: str | None = Field(None, max_length=1000)
    external_issue_number: int | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class IssueUpdate(BaseModel):
    """Schema for updating an issue.

    All fields optional; None means "leave unchanged". Validators are
    None-tolerant copies of those on IssueBase — keep them in sync.
    """

    title: str | None = Field(None, min_length=1, max_length=500)
    body: str | None = None
    status: IssueStatus | None = None
    priority: IssuePriority | None = None
    labels: list[str] | None = None
    assigned_agent_id: UUID | None = None
    human_assignee: str | None = Field(None, max_length=255)
    sprint_id: UUID | None = None
    story_points: int | None = Field(None, ge=0, le=100)
    sync_status: SyncStatus | None = None

    @field_validator("title")
    @classmethod
    def validate_title(cls, v: str | None) -> str | None:
        """Validate issue title."""
        # None passes through (no update); whitespace-only is rejected.
        if v is not None and (not v or v.strip() == ""):
            raise ValueError("Issue title cannot be empty")
        return v.strip() if v else v

    @field_validator("labels")
    @classmethod
    def validate_labels(cls, v: list[str] | None) -> list[str] | None:
        """Validate and normalize labels."""
        if v is None:
            return v
        return [label.strip().lower() for label in v if label.strip()]
|
||||||
|
|
||||||
|
|
||||||
|
class IssueClose(BaseModel):
    """Schema for closing an issue."""

    resolution: str | None = None  # Optional resolution note
|
||||||
|
|
||||||
|
|
||||||
|
class IssueAssign(BaseModel):
    """Schema for assigning an issue.

    Both fields None is allowed (interpreted as unassignment); setting both
    at once is rejected by the validator below.
    """

    assigned_agent_id: UUID | None = None
    human_assignee: str | None = Field(None, max_length=255)

    @model_validator(mode="after")
    def validate_assignment(self) -> "IssueAssign":
        """Ensure only one type of assignee is set."""
        if self.assigned_agent_id and self.human_assignee:
            raise ValueError("Cannot assign to both an agent and a human. Choose one.")
        return self
|
||||||
|
|
||||||
|
|
||||||
|
class IssueSyncUpdate(BaseModel):
    """Schema for updating sync-related fields.

    Used when reconciling an issue with its external tracker copy.
    """

    sync_status: SyncStatus
    last_synced_at: datetime | None = None
    external_updated_at: datetime | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class IssueInDB(IssueBase):
    """Schema for issue in database.

    Base fields plus assignment, external-tracker, sync, and timestamp columns.
    """

    id: UUID
    project_id: UUID
    assigned_agent_id: UUID | None = None
    human_assignee: str | None = None
    sprint_id: UUID | None = None
    # External tracker linkage (plain str here, vs the Literal on IssueCreate).
    external_tracker_type: str | None = None
    external_issue_id: str | None = None
    remote_url: str | None = None
    external_issue_number: int | None = None
    sync_status: SyncStatus = SyncStatus.SYNCED
    last_synced_at: datetime | None = None
    external_updated_at: datetime | None = None
    closed_at: datetime | None = None
    created_at: datetime
    updated_at: datetime

    # Allow construction directly from ORM objects.
    model_config = ConfigDict(from_attributes=True)
|
||||||
|
|
||||||
|
|
||||||
|
class IssueResponse(BaseModel):
    """Schema for issue API responses.

    Flattened (does not inherit IssueBase) so the response shape is
    explicit, plus denormalized fields resolved from relationships.
    """

    id: UUID
    project_id: UUID
    title: str
    body: str
    status: IssueStatus
    priority: IssuePriority
    labels: list[str] = Field(default_factory=list)
    # Assignment.
    assigned_agent_id: UUID | None = None
    human_assignee: str | None = None
    sprint_id: UUID | None = None
    story_points: int | None = None
    # External tracker linkage.
    external_tracker_type: str | None = None
    external_issue_id: str | None = None
    remote_url: str | None = None
    external_issue_number: int | None = None
    # Sync bookkeeping.
    sync_status: SyncStatus = SyncStatus.SYNCED
    last_synced_at: datetime | None = None
    external_updated_at: datetime | None = None
    # Lifecycle timestamps.
    closed_at: datetime | None = None
    created_at: datetime
    updated_at: datetime

    # Expanded fields from relationships
    project_name: str | None = None
    project_slug: str | None = None
    sprint_name: str | None = None
    assigned_agent_type_name: str | None = None

    # Allow construction directly from ORM objects.
    model_config = ConfigDict(from_attributes=True)
|
||||||
|
|
||||||
|
|
||||||
|
class IssueListResponse(BaseModel):
    """Schema for paginated issue list responses."""

    issues: list[IssueResponse]
    # Pagination metadata: total item count, 1-based page index,
    # items per page, and total number of pages.
    total: int
    page: int
    page_size: int
    pages: int
|
||||||
|
|
||||||
|
|
||||||
|
class IssueStats(BaseModel):
    """Schema for issue statistics."""

    total: int
    # Per-status counts.
    open: int
    in_progress: int
    in_review: int
    blocked: int
    closed: int
    # Counts keyed by priority name.
    by_priority: dict[str, int]
    # Story-point aggregates (None when points are not tracked).
    total_story_points: int | None = None
    completed_story_points: int | None = None
|
||||||
131
backend/app/schemas/syndarix/project.py
Normal file
131
backend/app/schemas/syndarix/project.py
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
# app/schemas/syndarix/project.py
|
||||||
|
"""
|
||||||
|
Pydantic schemas for Project entity.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Any
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
||||||
|
|
||||||
|
from .enums import AutonomyLevel, ProjectStatus
|
||||||
|
|
||||||
|
|
||||||
|
class ProjectBase(BaseModel):
    """Base project schema with common fields."""

    name: str = Field(..., min_length=1, max_length=255)
    slug: str | None = Field(None, min_length=1, max_length=255)
    description: str | None = None
    autonomy_level: AutonomyLevel = AutonomyLevel.MILESTONE
    status: ProjectStatus = ProjectStatus.ACTIVE
    settings: dict[str, Any] = Field(default_factory=dict)

    @field_validator("slug")
    @classmethod
    def validate_slug(cls, v: str | None) -> str | None:
        """Validate slug format: lowercase, alphanumeric, hyphens only.

        Raises:
            ValueError: If the slug contains invalid characters, starts or
                ends with a hyphen, or contains consecutive hyphens.
        """
        if v is None:
            return v
        # Use fullmatch, not match with ^...$: `$` also matches before a
        # trailing newline, so re.match(r"^[a-z0-9-]+$", "slug\n") would
        # incorrectly accept the value.
        if not re.fullmatch(r"[a-z0-9-]+", v):
            raise ValueError(
                "Slug must contain only lowercase letters, numbers, and hyphens"
            )
        if v.startswith("-") or v.endswith("-"):
            raise ValueError("Slug cannot start or end with a hyphen")
        if "--" in v:
            raise ValueError("Slug cannot contain consecutive hyphens")
        return v

    @field_validator("name")
    @classmethod
    def validate_name(cls, v: str) -> str:
        """Strip surrounding whitespace and reject empty/blank names."""
        if not v.strip():
            raise ValueError("Project name cannot be empty")
        return v.strip()
|
||||||
|
|
||||||
|
|
||||||
|
class ProjectCreate(ProjectBase):
    """Schema for creating a new project.

    Unlike ProjectBase, slug is required here; owner_id may be supplied
    explicitly or left to the server.
    """

    name: str = Field(..., min_length=1, max_length=255)
    slug: str = Field(..., min_length=1, max_length=255)
    owner_id: UUID | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class ProjectUpdate(BaseModel):
    """Schema for updating a project.

    Note: owner_id is intentionally excluded to prevent IDOR vulnerabilities.
    Project ownership transfer should be done via a dedicated endpoint with
    proper authorization checks.
    """

    name: str | None = Field(None, min_length=1, max_length=255)
    slug: str | None = Field(None, min_length=1, max_length=255)
    description: str | None = None
    autonomy_level: AutonomyLevel | None = None
    status: ProjectStatus | None = None
    settings: dict[str, Any] | None = None

    @field_validator("slug")
    @classmethod
    def validate_slug(cls, v: str | None) -> str | None:
        """Validate slug format.

        Raises:
            ValueError: If the slug contains invalid characters, starts or
                ends with a hyphen, or contains consecutive hyphens.
        """
        if v is None:
            return v
        # fullmatch instead of match(r"^...$"): `$` matches before a
        # trailing newline, so "slug\n" would otherwise slip through.
        if not re.fullmatch(r"[a-z0-9-]+", v):
            raise ValueError(
                "Slug must contain only lowercase letters, numbers, and hyphens"
            )
        if v.startswith("-") or v.endswith("-"):
            raise ValueError("Slug cannot start or end with a hyphen")
        if "--" in v:
            raise ValueError("Slug cannot contain consecutive hyphens")
        return v

    @field_validator("name")
    @classmethod
    def validate_name(cls, v: str | None) -> str | None:
        """Strip surrounding whitespace; None (field not updated) passes through."""
        if v is None:
            return None
        if not v.strip():
            raise ValueError("Project name cannot be empty")
        return v.strip()
|
||||||
|
|
||||||
|
|
||||||
|
class ProjectInDB(ProjectBase):
    """Schema for project in database."""

    id: UUID
    owner_id: UUID | None = None
    created_at: datetime
    updated_at: datetime

    # Allow construction directly from ORM objects.
    model_config = ConfigDict(from_attributes=True)
|
||||||
|
|
||||||
|
|
||||||
|
class ProjectResponse(ProjectBase):
    """Schema for project API responses."""

    id: UUID
    owner_id: UUID | None = None
    created_at: datetime
    updated_at: datetime
    # Denormalized summary fields resolved from relationships.
    agent_count: int | None = 0
    issue_count: int | None = 0
    active_sprint_name: str | None = None

    # Allow construction directly from ORM objects.
    model_config = ConfigDict(from_attributes=True)
|
||||||
|
|
||||||
|
|
||||||
|
class ProjectListResponse(BaseModel):
    """Schema for paginated project list responses."""

    projects: list[ProjectResponse]
    # Pagination metadata: total item count, 1-based page index,
    # items per page, and total number of pages.
    total: int
    page: int
    page_size: int
    pages: int
|
||||||
135
backend/app/schemas/syndarix/sprint.py
Normal file
135
backend/app/schemas/syndarix/sprint.py
Normal file
@@ -0,0 +1,135 @@
|
|||||||
|
# app/schemas/syndarix/sprint.py
|
||||||
|
"""
|
||||||
|
Pydantic schemas for Sprint entity.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from datetime import date, datetime
|
||||||
|
from uuid import UUID
|
||||||
|
|
||||||
|
from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
|
||||||
|
|
||||||
|
from .enums import SprintStatus
|
||||||
|
|
||||||
|
|
||||||
|
class SprintBase(BaseModel):
    """Base sprint schema with common fields.

    Carries the sprint's identity (name, number), timeframe, lifecycle
    status, and planning metrics shared by create/read variants.
    """

    name: str = Field(..., min_length=1, max_length=255)
    number: int = Field(..., ge=1)
    goal: str | None = None
    start_date: date
    end_date: date
    status: SprintStatus = SprintStatus.PLANNED
    planned_points: int | None = Field(None, ge=0)
    velocity: int | None = Field(None, ge=0)

    @field_validator("name")
    @classmethod
    def validate_name(cls, v: str) -> str:
        """Strip surrounding whitespace and reject blank sprint names."""
        trimmed = v.strip()
        if not trimmed:
            raise ValueError("Sprint name cannot be empty")
        return trimmed

    @model_validator(mode="after")
    def validate_dates(self) -> "SprintBase":
        """Reject sprints whose end date precedes their start date."""
        if self.start_date > self.end_date:
            raise ValueError("End date must be after or equal to start date")
        return self
|
||||||
|
|
||||||
|
|
||||||
|
class SprintCreate(SprintBase):
    """Schema for creating a new sprint."""

    # Parent project the sprint belongs to.
    project_id: UUID
|
||||||
|
|
||||||
|
|
||||||
|
class SprintUpdate(BaseModel):
    """Schema for updating a sprint.

    All fields are optional; None means "leave unchanged". The sprint
    number is intentionally not updatable here.
    """

    name: str | None = Field(None, min_length=1, max_length=255)
    goal: str | None = None
    start_date: date | None = None
    end_date: date | None = None
    status: SprintStatus | None = None
    planned_points: int | None = Field(None, ge=0)
    velocity: int | None = Field(None, ge=0)

    @field_validator("name")
    @classmethod
    def validate_name(cls, v: str | None) -> str | None:
        """Strip surrounding whitespace; None (not updated) passes through."""
        if v is None:
            return None
        trimmed = v.strip()
        if not trimmed:
            raise ValueError("Sprint name cannot be empty")
        return trimmed
|
||||||
|
|
||||||
|
|
||||||
|
class SprintStart(BaseModel):
    """Schema for starting a sprint."""

    start_date: date | None = None  # Optionally override start date
|
||||||
|
|
||||||
|
|
||||||
|
class SprintComplete(BaseModel):
    """Schema for completing a sprint."""

    # Final velocity (completed story points); None if not recorded.
    velocity: int | None = Field(None, ge=0)
    # Optional retrospective / completion notes.
    notes: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class SprintInDB(SprintBase):
    """Schema for sprint in database."""

    id: UUID
    project_id: UUID
    created_at: datetime
    updated_at: datetime

    # Allow construction directly from ORM objects.
    model_config = ConfigDict(from_attributes=True)
|
||||||
|
|
||||||
|
|
||||||
|
class SprintResponse(SprintBase):
    """Schema for sprint API responses."""

    id: UUID
    project_id: UUID
    created_at: datetime
    updated_at: datetime

    # Expanded fields from relationships
    project_name: str | None = None
    project_slug: str | None = None
    issue_count: int | None = 0
    open_issues: int | None = 0
    completed_issues: int | None = 0

    # Allow construction directly from ORM objects.
    model_config = ConfigDict(from_attributes=True)
|
||||||
|
|
||||||
|
|
||||||
|
class SprintListResponse(BaseModel):
    """Schema for paginated sprint list responses."""

    sprints: list[SprintResponse]
    # Pagination metadata: total item count, 1-based page index,
    # items per page, and total number of pages.
    total: int
    page: int
    page_size: int
    pages: int
|
||||||
|
|
||||||
|
|
||||||
|
class SprintVelocity(BaseModel):
    """Schema for sprint velocity metrics."""

    sprint_number: int
    sprint_name: str
    planned_points: int | None
    velocity: int | None  # Sum of completed story points
    velocity_ratio: float | None  # velocity/planned ratio
|
||||||
|
|
||||||
|
|
||||||
|
class SprintBurndown(BaseModel):
    """Schema for sprint burndown data point."""

    # Day this data point refers to.
    date: date
    # Actual story points still open on that day.
    remaining_points: int
    # Linearly interpolated "ideal" remaining points for comparison.
    ideal_remaining: float
|
||||||
105
backend/app/services/context/__init__.py
Normal file
105
backend/app/services/context/__init__.py
Normal file
@@ -0,0 +1,105 @@
|
|||||||
|
"""
|
||||||
|
Context Management Engine
|
||||||
|
|
||||||
|
Sophisticated context assembly and optimization for LLM requests.
|
||||||
|
Provides intelligent context selection, token budget management,
|
||||||
|
and model-specific formatting.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from app.services.context import (
|
||||||
|
ContextSettings,
|
||||||
|
get_context_settings,
|
||||||
|
SystemContext,
|
||||||
|
KnowledgeContext,
|
||||||
|
ConversationContext,
|
||||||
|
TaskContext,
|
||||||
|
ToolContext,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Get settings
|
||||||
|
settings = get_context_settings()
|
||||||
|
|
||||||
|
# Create context instances
|
||||||
|
system_ctx = SystemContext.create_persona(
|
||||||
|
name="Code Assistant",
|
||||||
|
description="You are a helpful code assistant.",
|
||||||
|
capabilities=["Write code", "Debug issues"],
|
||||||
|
)
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
from .config import (
|
||||||
|
ContextSettings,
|
||||||
|
get_context_settings,
|
||||||
|
get_default_settings,
|
||||||
|
reset_context_settings,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Exceptions
|
||||||
|
from .exceptions import (
|
||||||
|
AssemblyTimeoutError,
|
||||||
|
BudgetExceededError,
|
||||||
|
CacheError,
|
||||||
|
CompressionError,
|
||||||
|
ContextError,
|
||||||
|
ContextNotFoundError,
|
||||||
|
FormattingError,
|
||||||
|
InvalidContextError,
|
||||||
|
ScoringError,
|
||||||
|
TokenCountError,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Types
|
||||||
|
from .types import (
|
||||||
|
AssembledContext,
|
||||||
|
BaseContext,
|
||||||
|
ContextPriority,
|
||||||
|
ContextType,
|
||||||
|
ConversationContext,
|
||||||
|
KnowledgeContext,
|
||||||
|
MessageRole,
|
||||||
|
SystemContext,
|
||||||
|
TaskComplexity,
|
||||||
|
TaskContext,
|
||||||
|
TaskStatus,
|
||||||
|
ToolContext,
|
||||||
|
ToolResultStatus,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
# Configuration
|
||||||
|
"ContextSettings",
|
||||||
|
"get_context_settings",
|
||||||
|
"get_default_settings",
|
||||||
|
"reset_context_settings",
|
||||||
|
# Exceptions
|
||||||
|
"AssemblyTimeoutError",
|
||||||
|
"BudgetExceededError",
|
||||||
|
"CacheError",
|
||||||
|
"CompressionError",
|
||||||
|
"ContextError",
|
||||||
|
"ContextNotFoundError",
|
||||||
|
"FormattingError",
|
||||||
|
"InvalidContextError",
|
||||||
|
"ScoringError",
|
||||||
|
"TokenCountError",
|
||||||
|
# Types - Base
|
||||||
|
"AssembledContext",
|
||||||
|
"BaseContext",
|
||||||
|
"ContextPriority",
|
||||||
|
"ContextType",
|
||||||
|
# Types - Conversation
|
||||||
|
"ConversationContext",
|
||||||
|
"MessageRole",
|
||||||
|
# Types - Knowledge
|
||||||
|
"KnowledgeContext",
|
||||||
|
# Types - System
|
||||||
|
"SystemContext",
|
||||||
|
# Types - Task
|
||||||
|
"TaskComplexity",
|
||||||
|
"TaskContext",
|
||||||
|
"TaskStatus",
|
||||||
|
# Types - Tool
|
||||||
|
"ToolContext",
|
||||||
|
"ToolResultStatus",
|
||||||
|
]
|
||||||
5
backend/app/services/context/adapters/__init__.py
Normal file
5
backend/app/services/context/adapters/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
"""
|
||||||
|
Model Adapters Module.
|
||||||
|
|
||||||
|
Provides model-specific context formatting.
|
||||||
|
"""
|
||||||
5
backend/app/services/context/assembly/__init__.py
Normal file
5
backend/app/services/context/assembly/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
"""
|
||||||
|
Context Assembly Module.
|
||||||
|
|
||||||
|
Provides the assembly pipeline and formatting.
|
||||||
|
"""
|
||||||
5
backend/app/services/context/budget/__init__.py
Normal file
5
backend/app/services/context/budget/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
"""
|
||||||
|
Token Budget Management Module.
|
||||||
|
|
||||||
|
Provides token counting and budget allocation.
|
||||||
|
"""
|
||||||
5
backend/app/services/context/cache/__init__.py
vendored
Normal file
5
backend/app/services/context/cache/__init__.py
vendored
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
"""
|
||||||
|
Context Cache Module.
|
||||||
|
|
||||||
|
Provides Redis-based caching for assembled contexts.
|
||||||
|
"""
|
||||||
5
backend/app/services/context/compression/__init__.py
Normal file
5
backend/app/services/context/compression/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
"""
|
||||||
|
Context Compression Module.
|
||||||
|
|
||||||
|
Provides truncation and compression strategies.
|
||||||
|
"""
|
||||||
328
backend/app/services/context/config.py
Normal file
328
backend/app/services/context/config.py
Normal file
@@ -0,0 +1,328 @@
|
|||||||
|
"""
|
||||||
|
Context Management Engine Configuration.
|
||||||
|
|
||||||
|
Provides Pydantic settings for context assembly,
|
||||||
|
token budget allocation, and caching.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import threading
|
||||||
|
from functools import lru_cache
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from pydantic import Field, field_validator, model_validator
|
||||||
|
from pydantic_settings import BaseSettings
|
||||||
|
|
||||||
|
|
||||||
|
class ContextSettings(BaseSettings):
    """
    Configuration for the Context Management Engine.

    All settings can be overridden via environment variables
    with the CTX_ prefix.
    """

    # Budget allocation percentages (must sum to 1.0)
    budget_system: float = Field(
        default=0.05,
        ge=0.0,
        le=1.0,
        description="Percentage of budget for system prompts (5%)",
    )
    budget_task: float = Field(
        default=0.10,
        ge=0.0,
        le=1.0,
        description="Percentage of budget for task context (10%)",
    )
    budget_knowledge: float = Field(
        default=0.40,
        ge=0.0,
        le=1.0,
        description="Percentage of budget for RAG/knowledge (40%)",
    )
    budget_conversation: float = Field(
        default=0.20,
        ge=0.0,
        le=1.0,
        description="Percentage of budget for conversation history (20%)",
    )
    budget_tools: float = Field(
        default=0.05,
        ge=0.0,
        le=1.0,
        description="Percentage of budget for tool descriptions (5%)",
    )
    budget_response: float = Field(
        default=0.15,
        ge=0.0,
        le=1.0,
        description="Percentage reserved for response (15%)",
    )
    budget_buffer: float = Field(
        default=0.05,
        ge=0.0,
        le=1.0,
        description="Percentage buffer for safety margin (5%)",
    )

    # Scoring weights (must sum to 1.0; enforced by validate_scoring_weights)
    scoring_relevance_weight: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Weight for relevance scoring",
    )
    scoring_recency_weight: float = Field(
        default=0.3,
        ge=0.0,
        le=1.0,
        description="Weight for recency scoring",
    )
    scoring_priority_weight: float = Field(
        default=0.2,
        ge=0.0,
        le=1.0,
        description="Weight for priority scoring",
    )

    # Recency decay settings
    recency_decay_hours: float = Field(
        default=24.0,
        gt=0.0,
        description="Hours until recency score decays to 50%",
    )
    recency_max_age_hours: float = Field(
        default=168.0,
        gt=0.0,
        description="Hours until context is considered stale (7 days)",
    )

    # Compression settings
    compression_threshold: float = Field(
        default=0.8,
        ge=0.0,
        le=1.0,
        description="Compress when budget usage exceeds this percentage",
    )
    truncation_suffix: str = Field(
        default="... [truncated]",
        description="Suffix to add when truncating content",
    )
    summary_model_group: str = Field(
        default="fast",
        description="Model group to use for summarization",
    )

    # Caching settings
    cache_enabled: bool = Field(
        default=True,
        description="Enable Redis caching for assembled contexts",
    )
    cache_ttl_seconds: int = Field(
        default=3600,
        ge=60,
        le=86400,
        description="Cache TTL in seconds (1 hour default, max 24 hours)",
    )
    cache_prefix: str = Field(
        default="ctx",
        description="Redis key prefix for context cache",
    )

    # Performance settings
    max_assembly_time_ms: int = Field(
        default=100,
        ge=10,
        le=5000,
        description="Maximum time for context assembly in milliseconds",
    )
    parallel_scoring: bool = Field(
        default=True,
        description="Score contexts in parallel for better performance",
    )
    max_parallel_scores: int = Field(
        default=10,
        ge=1,
        le=50,
        description="Maximum number of contexts to score in parallel",
    )

    # Knowledge retrieval settings
    knowledge_search_type: str = Field(
        default="hybrid",
        description="Default search type for knowledge retrieval",
    )
    knowledge_max_results: int = Field(
        default=10,
        ge=1,
        le=50,
        description="Maximum knowledge chunks to retrieve",
    )
    knowledge_min_score: float = Field(
        default=0.5,
        ge=0.0,
        le=1.0,
        description="Minimum relevance score for knowledge",
    )

    # Conversation history settings
    conversation_max_turns: int = Field(
        default=20,
        ge=1,
        le=100,
        description="Maximum conversation turns to include",
    )
    conversation_recent_priority: bool = Field(
        default=True,
        description="Prioritize recent conversation turns",
    )

    @field_validator("knowledge_search_type")
    @classmethod
    def validate_search_type(cls, v: str) -> str:
        """Validate search type is valid."""
        valid_types = {"semantic", "keyword", "hybrid"}
        if v not in valid_types:
            raise ValueError(f"search_type must be one of: {valid_types}")
        return v

    @model_validator(mode="after")
    def validate_budget_allocation(self) -> "ContextSettings":
        """Validate that budget percentages sum to 1.0."""
        total = (
            self.budget_system
            + self.budget_task
            + self.budget_knowledge
            + self.budget_conversation
            + self.budget_tools
            + self.budget_response
            + self.budget_buffer
        )
        # Allow small floating point error
        if abs(total - 1.0) > 0.001:
            raise ValueError(
                f"Budget percentages must sum to 1.0, got {total:.3f}. "
                f"Current allocation: system={self.budget_system}, task={self.budget_task}, "
                f"knowledge={self.budget_knowledge}, conversation={self.budget_conversation}, "
                f"tools={self.budget_tools}, response={self.budget_response}, buffer={self.budget_buffer}"
            )
        return self

    @model_validator(mode="after")
    def validate_scoring_weights(self) -> "ContextSettings":
        """Validate that scoring weights sum to 1.0."""
        total = (
            self.scoring_relevance_weight
            + self.scoring_recency_weight
            + self.scoring_priority_weight
        )
        # Allow small floating point error
        if abs(total - 1.0) > 0.001:
            raise ValueError(
                f"Scoring weights must sum to 1.0, got {total:.3f}. "
                f"Current weights: relevance={self.scoring_relevance_weight}, "
                f"recency={self.scoring_recency_weight}, priority={self.scoring_priority_weight}"
            )
        return self

    def get_budget_allocation(self) -> dict[str, float]:
        """Get budget allocation as a dictionary."""
        return {
            "system": self.budget_system,
            "task": self.budget_task,
            "knowledge": self.budget_knowledge,
            "conversation": self.budget_conversation,
            "tools": self.budget_tools,
            "response": self.budget_response,
            "buffer": self.budget_buffer,
        }

    def get_scoring_weights(self) -> dict[str, float]:
        """Get scoring weights as a dictionary."""
        return {
            "relevance": self.scoring_relevance_weight,
            "recency": self.scoring_recency_weight,
            "priority": self.scoring_priority_weight,
        }

    def to_dict(self) -> dict[str, Any]:
        """Convert settings to dictionary for logging/debugging."""
        return {
            "budget": self.get_budget_allocation(),
            "scoring": self.get_scoring_weights(),
            "compression": {
                "threshold": self.compression_threshold,
                "summary_model_group": self.summary_model_group,
            },
            "cache": {
                "enabled": self.cache_enabled,
                "ttl_seconds": self.cache_ttl_seconds,
                "prefix": self.cache_prefix,
            },
            "performance": {
                "max_assembly_time_ms": self.max_assembly_time_ms,
                "parallel_scoring": self.parallel_scoring,
                "max_parallel_scores": self.max_parallel_scores,
            },
            "knowledge": {
                "search_type": self.knowledge_search_type,
                "max_results": self.knowledge_max_results,
                "min_score": self.knowledge_min_score,
            },
            "conversation": {
                "max_turns": self.conversation_max_turns,
                "recent_priority": self.conversation_recent_priority,
            },
        }

    # pydantic-settings config: env vars like CTX_CACHE_ENABLED override
    # defaults; unknown keys in the env file are ignored.
    model_config = {
        "env_prefix": "CTX_",
        "env_file": "../.env",
        "env_file_encoding": "utf-8",
        "case_sensitive": False,
        "extra": "ignore",
    }
|
||||||
|
|
||||||
|
|
||||||
|
# Thread-safe singleton pattern
# Module-level cache for the settings instance; guarded by _settings_lock
# in get_context_settings()/reset_context_settings().
_settings: ContextSettings | None = None
_settings_lock = threading.Lock()
|
||||||
|
|
||||||
|
|
||||||
|
def get_context_settings() -> ContextSettings:
    """
    Get the global ContextSettings instance.

    Thread-safe with double-checked locking pattern: the unlocked fast
    path avoids contention once the singleton exists, and the re-check
    under the lock guarantees only one thread constructs it.

    Returns:
        ContextSettings instance
    """
    global _settings
    if _settings is not None:
        return _settings
    with _settings_lock:
        if _settings is None:
            _settings = ContextSettings()
    return _settings
|
||||||
|
|
||||||
|
|
||||||
|
def reset_context_settings() -> None:
    """
    Reset the global settings instance.

    Primarily used for testing. The next call to get_context_settings()
    will construct a fresh ContextSettings.
    """
    global _settings
    with _settings_lock:
        _settings = None
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def get_default_settings() -> ContextSettings:
    """
    Get default settings (cached).

    Use this for read-only access to defaults.
    For mutable access, use get_context_settings().

    Note: unlike get_context_settings(), this instance is never reset by
    reset_context_settings(); lru_cache keeps the first result forever.
    """
    return ContextSettings()
|
||||||
354
backend/app/services/context/exceptions.py
Normal file
354
backend/app/services/context/exceptions.py
Normal file
@@ -0,0 +1,354 @@
|
|||||||
|
"""
|
||||||
|
Context Management Engine Exceptions.
|
||||||
|
|
||||||
|
Provides a hierarchy of exceptions for context assembly,
|
||||||
|
token budget management, and related operations.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
class ContextError(Exception):
|
||||||
|
"""
|
||||||
|
Base exception for all context management errors.
|
||||||
|
|
||||||
|
All context-related exceptions should inherit from this class
|
||||||
|
to allow for catch-all handling when needed.
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self, message: str, details: dict[str, Any] | None = None) -> None:
|
||||||
|
"""
|
||||||
|
Initialize context error.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
message: Human-readable error message
|
||||||
|
details: Optional dict with additional error context
|
||||||
|
"""
|
||||||
|
self.message = message
|
||||||
|
self.details = details or {}
|
||||||
|
super().__init__(message)
|
||||||
|
|
||||||
|
def to_dict(self) -> dict[str, Any]:
|
||||||
|
"""Convert exception to dictionary for logging/serialization."""
|
||||||
|
return {
|
||||||
|
"error_type": self.__class__.__name__,
|
||||||
|
"message": self.message,
|
||||||
|
"details": self.details,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class BudgetExceededError(ContextError):
    """
    Raised when token budget is exceeded.

    This occurs when the assembled context would exceed the
    allocated token budget for a specific context type or total.
    """

    def __init__(
        self,
        message: str = "Token budget exceeded",
        allocated: int = 0,
        requested: int = 0,
        context_type: str | None = None,
    ) -> None:
        """
        Initialize budget exceeded error.

        Args:
            message: Error message
            allocated: Tokens allocated for this context type
            requested: Tokens requested
            context_type: Type of context that exceeded budget
        """
        # Record structured details for logging; overage = how far past
        # the allocation the request went.
        extra = {
            "allocated": allocated,
            "requested": requested,
            "overage": requested - allocated,
        }
        if context_type:
            extra["context_type"] = context_type

        super().__init__(message, extra)
        self.allocated = allocated
        self.requested = requested
        self.context_type = context_type
|
||||||
|
|
||||||
|
|
||||||
|
class TokenCountError(ContextError):
    """
    Raised when token counting fails.

    This typically occurs when the LLM Gateway token counting
    service is unavailable or returns an error.
    """

    def __init__(
        self,
        message: str = "Failed to count tokens",
        model: str | None = None,
        text_length: int | None = None,
    ) -> None:
        """
        Initialize token count error.

        Args:
            message: Error message
            model: Model for which counting was attempted
            text_length: Length of text that failed to count
        """
        # Only include the fields that were actually provided.
        extra: dict[str, Any] = {}
        if model:
            extra["model"] = model
        if text_length is not None:
            extra["text_length"] = text_length

        super().__init__(message, extra)
        self.model = model
        self.text_length = text_length
|
||||||
|
|
||||||
|
|
||||||
|
class CompressionError(ContextError):
    """
    Raised when context compression fails.

    This can occur when summarization or truncation cannot
    reduce content to fit within the budget.
    """

    def __init__(
        self,
        message: str = "Failed to compress context",
        original_tokens: int | None = None,
        target_tokens: int | None = None,
        achieved_tokens: int | None = None,
    ) -> None:
        """
        Initialize compression error.

        Args:
            message: Error message
            original_tokens: Tokens before compression
            target_tokens: Target token count
            achieved_tokens: Tokens achieved after compression attempt
        """
        # Build the details dict from whichever token counts are known.
        extra: dict[str, Any] = {}
        for key, value in (
            ("original_tokens", original_tokens),
            ("target_tokens", target_tokens),
            ("achieved_tokens", achieved_tokens),
        ):
            if value is not None:
                extra[key] = value

        super().__init__(message, extra)
        self.original_tokens = original_tokens
        self.target_tokens = target_tokens
        self.achieved_tokens = achieved_tokens
|
||||||
|
|
||||||
|
|
||||||
|
class AssemblyTimeoutError(ContextError):
    """
    Raised when context assembly exceeds its time limit.

    Assembly must finish within a configurable deadline to keep
    the system responsive.
    """

    def __init__(
        self,
        message: str = "Context assembly timed out",
        timeout_ms: int = 0,
        elapsed_ms: float = 0.0,
        stage: str | None = None,
    ) -> None:
        """
        Initialize assembly timeout error.

        Args:
            message: Error message
            timeout_ms: Configured timeout in milliseconds
            elapsed_ms: Actual elapsed time in milliseconds
            stage: Pipeline stage where the timeout occurred

        """
        self.timeout_ms = timeout_ms
        self.elapsed_ms = elapsed_ms
        self.stage = stage

        # elapsed_ms is rounded only for the structured payload; the
        # attribute above keeps full precision.
        payload: dict[str, Any] = {
            "timeout_ms": timeout_ms,
            "elapsed_ms": round(elapsed_ms, 2),
        }
        if stage:
            payload["stage"] = stage
        super().__init__(message, payload)
|
||||||
|
|
||||||
|
|
||||||
|
class ScoringError(ContextError):
    """
    Raised when context scoring fails.

    Covers failures in relevance, recency, or priority scoring.
    """

    def __init__(
        self,
        message: str = "Failed to score context",
        scorer_type: str | None = None,
        context_id: str | None = None,
    ) -> None:
        """
        Initialize scoring error.

        Args:
            message: Error message
            scorer_type: Type of scorer that failed
            context_id: ID of the context being scored

        """
        self.scorer_type = scorer_type
        self.context_id = context_id

        payload: dict[str, Any] = {}
        if scorer_type:
            payload["scorer_type"] = scorer_type
        if context_id:
            payload["context_id"] = context_id
        super().__init__(message, payload)
|
||||||
|
|
||||||
|
|
||||||
|
class FormattingError(ContextError):
    """
    Raised when context formatting fails.

    Signals that converting the assembled context into a
    model-specific format did not succeed.
    """

    def __init__(
        self,
        message: str = "Failed to format context",
        model: str | None = None,
        adapter: str | None = None,
    ) -> None:
        """
        Initialize formatting error.

        Args:
            message: Error message
            model: Target model
            adapter: Adapter that failed

        """
        self.model = model
        self.adapter = adapter

        payload: dict[str, Any] = {}
        if model:
            payload["model"] = model
        if adapter:
            payload["adapter"] = adapter
        super().__init__(message, payload)
|
||||||
|
|
||||||
|
|
||||||
|
class CacheError(ContextError):
    """
    Raised when cache operations fail.

    Usually non-fatal: callers should handle it gracefully by
    falling back to recomputation.
    """

    def __init__(
        self,
        message: str = "Cache operation failed",
        operation: str | None = None,
        cache_key: str | None = None,
    ) -> None:
        """
        Initialize cache error.

        Args:
            message: Error message
            operation: Cache operation that failed (get, set, delete)
            cache_key: Key involved in the failed operation

        """
        self.operation = operation
        self.cache_key = cache_key

        payload: dict[str, Any] = {}
        if operation:
            payload["operation"] = operation
        if cache_key:
            payload["cache_key"] = cache_key
        super().__init__(message, payload)
|
||||||
|
|
||||||
|
|
||||||
|
class ContextNotFoundError(ContextError):
    """
    Raised when expected context is not found.

    Signals that a required context source returned no results
    or was unavailable.
    """

    def __init__(
        self,
        message: str = "Required context not found",
        source: str | None = None,
        query: str | None = None,
    ) -> None:
        """
        Initialize context not found error.

        Args:
            message: Error message
            source: Source that returned no results
            query: Query used to search

        """
        self.source = source
        self.query = query

        payload: dict[str, Any] = {}
        if source:
            payload["source"] = source
        if query:
            payload["query"] = query
        super().__init__(message, payload)
|
||||||
|
|
||||||
|
|
||||||
|
class InvalidContextError(ContextError):
    """
    Raised when context data is invalid.

    Signals that context content or metadata failed validation.
    """

    def __init__(
        self,
        message: str = "Invalid context data",
        field: str | None = None,
        value: Any | None = None,
        reason: str | None = None,
    ) -> None:
        """
        Initialize invalid context error.

        Args:
            message: Error message
            field: Field that is invalid
            value: Invalid value (may be redacted for security)
            reason: Reason for invalidity

        """
        self.field = field
        self.value = value
        self.reason = reason

        payload: dict[str, Any] = {}
        if field:
            payload["field"] = field
        if value is not None:
            # Report only the type, never the raw value, so potentially
            # sensitive data does not leak into logs.
            payload["value_type"] = type(value).__name__
        if reason:
            payload["reason"] = reason
        super().__init__(message, payload)
|
||||||
5
backend/app/services/context/prioritization/__init__.py
Normal file
5
backend/app/services/context/prioritization/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
"""
|
||||||
|
Context Prioritization Module.
|
||||||
|
|
||||||
|
Provides context ranking and selection.
|
||||||
|
"""
|
||||||
5
backend/app/services/context/scoring/__init__.py
Normal file
5
backend/app/services/context/scoring/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
"""
|
||||||
|
Context Scoring Module.
|
||||||
|
|
||||||
|
Provides relevance, recency, and composite scoring.
|
||||||
|
"""
|
||||||
49
backend/app/services/context/types/__init__.py
Normal file
49
backend/app/services/context/types/__init__.py
Normal file
@@ -0,0 +1,49 @@
|
|||||||
|
"""
|
||||||
|
Context Types Module.
|
||||||
|
|
||||||
|
Provides all context types used in the Context Management Engine.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .base import (
|
||||||
|
AssembledContext,
|
||||||
|
BaseContext,
|
||||||
|
ContextPriority,
|
||||||
|
ContextType,
|
||||||
|
)
|
||||||
|
from .conversation import (
|
||||||
|
ConversationContext,
|
||||||
|
MessageRole,
|
||||||
|
)
|
||||||
|
from .knowledge import KnowledgeContext
|
||||||
|
from .system import SystemContext
|
||||||
|
from .task import (
|
||||||
|
TaskComplexity,
|
||||||
|
TaskContext,
|
||||||
|
TaskStatus,
|
||||||
|
)
|
||||||
|
from .tool import (
|
||||||
|
ToolContext,
|
||||||
|
ToolResultStatus,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
# Base types
|
||||||
|
"AssembledContext",
|
||||||
|
"BaseContext",
|
||||||
|
"ContextPriority",
|
||||||
|
"ContextType",
|
||||||
|
# Conversation
|
||||||
|
"ConversationContext",
|
||||||
|
"MessageRole",
|
||||||
|
# Knowledge
|
||||||
|
"KnowledgeContext",
|
||||||
|
# System
|
||||||
|
"SystemContext",
|
||||||
|
# Task
|
||||||
|
"TaskComplexity",
|
||||||
|
"TaskContext",
|
||||||
|
"TaskStatus",
|
||||||
|
# Tool
|
||||||
|
"ToolContext",
|
||||||
|
"ToolResultStatus",
|
||||||
|
]
|
||||||
320
backend/app/services/context/types/base.py
Normal file
320
backend/app/services/context/types/base.py
Normal file
@@ -0,0 +1,320 @@
|
|||||||
|
"""
|
||||||
|
Base Context Types and Enums.
|
||||||
|
|
||||||
|
Provides the foundation for all context types used in
|
||||||
|
the Context Management Engine.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from abc import ABC, abstractmethod
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any
|
||||||
|
from uuid import uuid4
|
||||||
|
|
||||||
|
|
||||||
|
class ContextType(str, Enum):
    """
    Types of context that can be assembled.

    Each type has specific handling, formatting, and
    budget allocation rules.
    """

    SYSTEM = "system"
    TASK = "task"
    KNOWLEDGE = "knowledge"
    CONVERSATION = "conversation"
    TOOL = "tool"

    @classmethod
    def from_string(cls, value: str) -> "ContextType":
        """
        Convert string to ContextType (case-insensitive).

        Args:
            value: String value

        Returns:
            ContextType enum value

        Raises:
            ValueError: If value is not a valid context type

        """
        try:
            return cls(value.lower())
        except ValueError as err:
            valid = ", ".join(t.value for t in cls)
            # Chain explicitly (PEP 3134, ruff B904): the friendly message
            # becomes the primary error instead of an implicit "during
            # handling of the above exception" traceback.
            raise ValueError(
                f"Invalid context type '{value}'. Valid types: {valid}"
            ) from err
|
||||||
|
|
||||||
|
|
||||||
|
class ContextPriority(int, Enum):
    """
    Priority levels for context ordering.

    Higher values indicate higher priority.
    """

    LOWEST = 0
    LOW = 25
    NORMAL = 50
    HIGH = 75
    HIGHEST = 100
    CRITICAL = 150  # Never omit

    @classmethod
    def from_int(cls, value: int) -> "ContextPriority":
        """
        Map an integer onto a priority level.

        Returns the highest level whose value does not exceed the given
        integer (a floor lookup); anything below LOWEST maps to LOWEST.

        Args:
            value: Integer priority value

        Returns:
            Matching ContextPriority enum value

        """
        descending = sorted(cls, key=lambda level: level.value, reverse=True)
        return next(
            (level for level in descending if value >= level.value),
            cls.LOWEST,
        )
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(eq=False)  # eq=False: the id-based __eq__/__hash__ below are used
class BaseContext(ABC):
    """
    Abstract base class for all context types.

    Provides common fields and methods for context handling,
    scoring, and serialization. Equality and hashing are based on
    ``id``, so contexts behave consistently in sets and dict keys.
    """

    # Required fields
    content: str
    source: str

    # Optional fields with defaults
    id: str = field(default_factory=lambda: str(uuid4()))
    timestamp: datetime = field(default_factory=lambda: datetime.now(UTC))
    priority: int = field(default=ContextPriority.NORMAL.value)
    metadata: dict[str, Any] = field(default_factory=dict)

    # Computed/cached fields — filled in lazily by token counting/scoring,
    # hidden from repr, and exposed through the properties below. Note they
    # are still parameters of the generated __init__.
    _token_count: int | None = field(default=None, repr=False)
    _score: float | None = field(default=None, repr=False)

    @property
    def token_count(self) -> int | None:
        """Get cached token count (None if not counted yet)."""
        return self._token_count

    @token_count.setter
    def token_count(self, value: int) -> None:
        """Set token count."""
        self._token_count = value

    @property
    def score(self) -> float | None:
        """Get cached score (None if not scored yet)."""
        return self._score

    @score.setter
    def score(self, value: float) -> None:
        """Set score (clamped to 0.0-1.0)."""
        # Clamp rather than raise so out-of-range scorer output degrades
        # gracefully instead of failing the pipeline.
        self._score = max(0.0, min(1.0, value))

    @abstractmethod
    def get_type(self) -> ContextType:
        """
        Get the type of this context.

        Returns:
            ContextType enum value
        """
        ...

    def get_age_seconds(self) -> float:
        """
        Get age of context in seconds.

        Returns:
            Age in seconds since creation
        """
        # timestamp defaults to an aware UTC datetime; the subtraction
        # below requires aware datetimes throughout.
        now = datetime.now(UTC)
        delta = now - self.timestamp
        return delta.total_seconds()

    def get_age_hours(self) -> float:
        """
        Get age of context in hours.

        Returns:
            Age in hours since creation
        """
        return self.get_age_seconds() / 3600

    def is_stale(self, max_age_hours: float = 168.0) -> bool:
        """
        Check if context is stale.

        Args:
            max_age_hours: Maximum age before considered stale (default 7 days)

        Returns:
            True if context is older than max_age_hours
        """
        return self.get_age_hours() > max_age_hours

    def to_dict(self) -> dict[str, Any]:
        """
        Convert context to dictionary for serialization.

        Returns:
            Dictionary representation (includes cached token_count and
            score, which may be None when not yet computed)
        """
        return {
            "id": self.id,
            "type": self.get_type().value,
            "content": self.content,
            "source": self.source,
            "timestamp": self.timestamp.isoformat(),
            "priority": self.priority,
            "metadata": self.metadata,
            "token_count": self._token_count,
            "score": self._score,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "BaseContext":
        """
        Create context from dictionary.

        Note: Subclasses should override this to return correct type.

        Args:
            data: Dictionary with context data

        Returns:
            Context instance

        Raises:
            NotImplementedError: Always, on this base class.
        """
        raise NotImplementedError("Subclasses must implement from_dict")

    def truncate(self, max_tokens: int, suffix: str = "... [truncated]") -> str:
        """
        Truncate content to fit within token limit.

        This is a rough estimation based on characters.
        For accurate truncation, use the TokenCalculator.

        Args:
            max_tokens: Maximum tokens allowed
            suffix: Suffix to append when truncated

        Returns:
            Truncated content
        """
        # An uncounted context (_token_count is None) is returned as-is:
        # without a cached count we cannot tell it exceeds the budget.
        if self._token_count is None or self._token_count <= max_tokens:
            return self.content

        # Rough estimation: 4 chars per token on average
        estimated_chars = max_tokens * 4
        suffix_chars = len(suffix)

        if len(self.content) <= estimated_chars:
            return self.content

        # NOTE(review): assumes the suffix is shorter than the character
        # budget; a very small max_tokens would make this slice bound
        # negative — confirm callers never pass such small budgets.
        truncated = self.content[: estimated_chars - suffix_chars]
        # Try to break at word boundary
        last_space = truncated.rfind(" ")
        if last_space > estimated_chars * 0.8:
            truncated = truncated[:last_space]

        return truncated + suffix

    def __hash__(self) -> int:
        """Hash based on ID for set/dict usage."""
        return hash(self.id)

    def __eq__(self, other: object) -> bool:
        """Equality based on ID (pairs with __hash__ above)."""
        if not isinstance(other, BaseContext):
            return False
        return self.id == other.id
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class AssembledContext:
|
||||||
|
"""
|
||||||
|
Result of context assembly.
|
||||||
|
|
||||||
|
Contains the final formatted context ready for LLM consumption,
|
||||||
|
along with metadata about the assembly process.
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Main content
|
||||||
|
content: str
|
||||||
|
token_count: int
|
||||||
|
|
||||||
|
# Assembly metadata
|
||||||
|
contexts_included: int
|
||||||
|
contexts_excluded: int = 0
|
||||||
|
assembly_time_ms: float = 0.0
|
||||||
|
|
||||||
|
# Budget tracking
|
||||||
|
budget_total: int = 0
|
||||||
|
budget_used: int = 0
|
||||||
|
|
||||||
|
# Context breakdown
|
||||||
|
by_type: dict[str, int] = field(default_factory=dict)
|
||||||
|
|
||||||
|
# Cache info
|
||||||
|
cache_hit: bool = False
|
||||||
|
cache_key: str | None = None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def budget_utilization(self) -> float:
|
||||||
|
"""Get budget utilization percentage."""
|
||||||
|
if self.budget_total == 0:
|
||||||
|
return 0.0
|
||||||
|
return self.budget_used / self.budget_total
|
||||||
|
|
||||||
|
def to_dict(self) -> dict[str, Any]:
|
||||||
|
"""Convert to dictionary."""
|
||||||
|
return {
|
||||||
|
"content": self.content,
|
||||||
|
"token_count": self.token_count,
|
||||||
|
"contexts_included": self.contexts_included,
|
||||||
|
"contexts_excluded": self.contexts_excluded,
|
||||||
|
"assembly_time_ms": round(self.assembly_time_ms, 2),
|
||||||
|
"budget_total": self.budget_total,
|
||||||
|
"budget_used": self.budget_used,
|
||||||
|
"budget_utilization": round(self.budget_utilization, 3),
|
||||||
|
"by_type": self.by_type,
|
||||||
|
"cache_hit": self.cache_hit,
|
||||||
|
"cache_key": self.cache_key,
|
||||||
|
}
|
||||||
|
|
||||||
|
def to_json(self) -> str:
|
||||||
|
"""Convert to JSON string."""
|
||||||
|
import json
|
||||||
|
|
||||||
|
return json.dumps(self.to_dict())
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def from_json(cls, json_str: str) -> "AssembledContext":
|
||||||
|
"""Create from JSON string."""
|
||||||
|
import json
|
||||||
|
|
||||||
|
data = json.loads(json_str)
|
||||||
|
return cls(
|
||||||
|
content=data["content"],
|
||||||
|
token_count=data["token_count"],
|
||||||
|
contexts_included=data["contexts_included"],
|
||||||
|
contexts_excluded=data.get("contexts_excluded", 0),
|
||||||
|
assembly_time_ms=data.get("assembly_time_ms", 0.0),
|
||||||
|
budget_total=data.get("budget_total", 0),
|
||||||
|
budget_used=data.get("budget_used", 0),
|
||||||
|
by_type=data.get("by_type", {}),
|
||||||
|
cache_hit=data.get("cache_hit", False),
|
||||||
|
cache_key=data.get("cache_key"),
|
||||||
|
)
|
||||||
182
backend/app/services/context/types/conversation.py
Normal file
182
backend/app/services/context/types/conversation.py
Normal file
@@ -0,0 +1,182 @@
|
|||||||
|
"""
|
||||||
|
Conversation Context Type.
|
||||||
|
|
||||||
|
Represents conversation history for context continuity.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from .base import BaseContext, ContextPriority, ContextType
|
||||||
|
|
||||||
|
|
||||||
|
class MessageRole(str, Enum):
    """Roles for conversation messages."""

    USER = "user"
    ASSISTANT = "assistant"
    SYSTEM = "system"
    TOOL = "tool"

    @classmethod
    def from_string(cls, value: str) -> "MessageRole":
        """Map a string onto a MessageRole, defaulting to USER for unknowns."""
        lowered = value.lower()
        for member in cls:
            if member.value == lowered:
                return member
        # Unrecognized roles fall back to USER rather than raising.
        return cls.USER
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(eq=False)
class ConversationContext(BaseContext):
    """
    Context from conversation history.

    Represents a single turn in the conversation, including user
    messages, assistant responses, and tool results.
    """

    # Conversation-specific fields
    role: MessageRole = field(default=MessageRole.USER)
    turn_index: int = field(default=0)
    session_id: str | None = field(default=None)
    parent_message_id: str | None = field(default=None)

    def get_type(self) -> ContextType:
        """Return CONVERSATION context type."""
        return ContextType.CONVERSATION

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary with conversation-specific fields."""
        base = super().to_dict()
        base.update(
            {
                "role": self.role.value,
                "turn_index": self.turn_index,
                "session_id": self.session_id,
                "parent_message_id": self.parent_message_id,
            }
        )
        return base

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "ConversationContext":
        """
        Create ConversationContext from dictionary.

        A missing or empty "id" no longer becomes "" — the default UUID
        factory runs instead (equality/hashing are id-based, so empty ids
        made distinct contexts compare equal). A "timestamp" that is
        neither an ISO string nor a datetime (e.g. an explicit None)
        falls back to now(UTC) instead of propagating and breaking age
        calculations later.

        Args:
            data: Dictionary with context data ("content" is required)

        Returns:
            ConversationContext instance
        """
        role = data.get("role", "user")
        if isinstance(role, str):
            role = MessageRole.from_string(role)

        raw_ts = data.get("timestamp")
        if isinstance(raw_ts, str):
            timestamp = datetime.fromisoformat(raw_ts)
        elif isinstance(raw_ts, datetime):
            timestamp = raw_ts
        else:
            timestamp = datetime.now(UTC)

        kwargs: dict[str, Any] = {
            "content": data["content"],
            "source": data.get("source", "conversation"),
            "timestamp": timestamp,
            "priority": data.get("priority", ContextPriority.NORMAL.value),
            "metadata": data.get("metadata", {}),
            "role": role,
            "turn_index": data.get("turn_index", 0),
            "session_id": data.get("session_id"),
            "parent_message_id": data.get("parent_message_id"),
        }
        # Only override the generated UUID when the payload carries a real id.
        if data.get("id"):
            kwargs["id"] = data["id"]
        # NOTE(review): "token_count"/"score" emitted by to_dict are not
        # restored here — round-trips drop cached values. Confirm intended.
        return cls(**kwargs)

    @classmethod
    def from_message(
        cls,
        content: str,
        role: str | MessageRole,
        turn_index: int = 0,
        session_id: str | None = None,
        timestamp: datetime | None = None,
    ) -> "ConversationContext":
        """
        Create ConversationContext from a message.

        Args:
            content: Message content
            role: Message role (user, assistant, system, tool)
            turn_index: Position in conversation
            session_id: Session identifier
            timestamp: Message timestamp (defaults to now, UTC)

        Returns:
            ConversationContext instance
        """
        if isinstance(role, str):
            role = MessageRole.from_string(role)

        return cls(
            content=content,
            source="conversation",
            role=role,
            turn_index=turn_index,
            session_id=session_id,
            timestamp=timestamp or datetime.now(UTC),
            # All turns start at NORMAL priority; recency is handled by
            # scoring, not by the stored priority value.
            priority=ContextPriority.NORMAL.value,
        )

    @classmethod
    def from_history(
        cls,
        messages: list[dict[str, Any]],
        session_id: str | None = None,
    ) -> list["ConversationContext"]:
        """
        Create multiple ConversationContexts from message history.

        Args:
            messages: List of message dicts with 'role' and 'content'
            session_id: Session identifier

        Returns:
            List of ConversationContext instances, in message order
        """
        return [
            cls.from_message(
                content=msg.get("content", ""),
                role=msg.get("role", "user"),
                turn_index=index,
                session_id=session_id,
                timestamp=(
                    datetime.fromisoformat(msg["timestamp"])
                    if "timestamp" in msg
                    else None
                ),
            )
            for index, msg in enumerate(messages)
        ]

    def is_user_message(self) -> bool:
        """Check if this is a user message."""
        return self.role == MessageRole.USER

    def is_assistant_message(self) -> bool:
        """Check if this is an assistant message."""
        return self.role == MessageRole.ASSISTANT

    def is_tool_result(self) -> bool:
        """Check if this is a tool result."""
        return self.role == MessageRole.TOOL

    def format_for_prompt(self) -> str:
        """
        Format message for inclusion in a prompt.

        Returns:
            Formatted message string, e.g. "User: <content>"
        """
        role_labels = {
            MessageRole.USER: "User",
            MessageRole.ASSISTANT: "Assistant",
            MessageRole.SYSTEM: "System",
            MessageRole.TOOL: "Tool Result",
        }
        label = role_labels.get(self.role, "Unknown")
        return f"{label}: {self.content}"
|
||||||
143
backend/app/services/context/types/knowledge.py
Normal file
143
backend/app/services/context/types/knowledge.py
Normal file
@@ -0,0 +1,143 @@
|
|||||||
|
"""
|
||||||
|
Knowledge Context Type.
|
||||||
|
|
||||||
|
Represents RAG results from the Knowledge Base MCP server.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from .base import BaseContext, ContextPriority, ContextType
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(eq=False)
class KnowledgeContext(BaseContext):
    """
    Context from knowledge base / RAG retrieval.

    Represents chunks retrieved from the Knowledge Base MCP server,
    including:
    - Code snippets
    - Documentation
    - Previous conversations
    - External knowledge

    Each chunk carries the relevance score reported by the search.
    """

    # Knowledge-specific fields
    collection: str = field(default="default")
    file_type: str | None = field(default=None)
    chunk_index: int = field(default=0)
    relevance_score: float = field(default=0.0)
    search_query: str = field(default="")

    def get_type(self) -> ContextType:
        """Return KNOWLEDGE context type."""
        return ContextType.KNOWLEDGE

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary with knowledge-specific fields."""
        base = super().to_dict()
        base.update(
            {
                "collection": self.collection,
                "file_type": self.file_type,
                "chunk_index": self.chunk_index,
                "relevance_score": self.relevance_score,
                "search_query": self.search_query,
            }
        )
        return base

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "KnowledgeContext":
        """
        Create KnowledgeContext from dictionary.

        A missing or empty "id" no longer becomes "" — the default UUID
        factory runs instead (equality/hashing are id-based, so empty ids
        made distinct contexts compare equal). A "timestamp" that is
        neither an ISO string nor a datetime falls back to now(UTC).

        Args:
            data: Dictionary with context data ("content" and "source"
                are required)

        Returns:
            KnowledgeContext instance
        """
        raw_ts = data.get("timestamp")
        if isinstance(raw_ts, str):
            timestamp = datetime.fromisoformat(raw_ts)
        elif isinstance(raw_ts, datetime):
            timestamp = raw_ts
        else:
            timestamp = datetime.now(UTC)

        kwargs: dict[str, Any] = {
            "content": data["content"],
            "source": data["source"],
            "timestamp": timestamp,
            "priority": data.get("priority", ContextPriority.NORMAL.value),
            "metadata": data.get("metadata", {}),
            "collection": data.get("collection", "default"),
            "file_type": data.get("file_type"),
            "chunk_index": data.get("chunk_index", 0),
            "relevance_score": data.get("relevance_score", 0.0),
            "search_query": data.get("search_query", ""),
        }
        # Only override the generated UUID when the payload carries a real id.
        if data.get("id"):
            kwargs["id"] = data["id"]
        return cls(**kwargs)

    @classmethod
    def from_search_result(
        cls,
        result: dict[str, Any],
        query: str,
    ) -> "KnowledgeContext":
        """
        Create KnowledgeContext from a Knowledge Base search result.

        Args:
            result: Search result from Knowledge Base MCP
            query: Search query used

        Returns:
            KnowledgeContext instance
        """
        return cls(
            content=result.get("content", ""),
            source=result.get("source_path", "unknown"),
            collection=result.get("collection", "default"),
            file_type=result.get("file_type"),
            chunk_index=result.get("chunk_index", 0),
            relevance_score=result.get("score", 0.0),
            search_query=query,
            metadata={
                "chunk_id": result.get("id"),
                "content_hash": result.get("content_hash"),
            },
        )

    @classmethod
    def from_search_results(
        cls,
        results: list[dict[str, Any]],
        query: str,
    ) -> list["KnowledgeContext"]:
        """
        Create multiple KnowledgeContexts from search results.

        Args:
            results: List of search results
            query: Search query used

        Returns:
            List of KnowledgeContext instances
        """
        return [cls.from_search_result(r, query) for r in results]

    def is_code(self) -> bool:
        """Check if this chunk is code content (by file_type)."""
        code_types = {"python", "javascript", "typescript", "go", "rust", "java", "c", "cpp"}
        return self.file_type is not None and self.file_type.lower() in code_types

    def is_documentation(self) -> bool:
        """Check if this chunk is documentation content (by file_type)."""
        doc_types = {"markdown", "rst", "txt", "md"}
        return self.file_type is not None and self.file_type.lower() in doc_types

    def get_formatted_source(self) -> str:
        """
        Get a formatted source string for display.

        Format: "[collection] source (file_type)" — the collection prefix
        is omitted for "default", the suffix when file_type is unset.

        Returns:
            Formatted source string
        """
        parts = [self.source]
        if self.file_type:
            parts.append(f"({self.file_type})")
        if self.collection != "default":
            parts.insert(0, f"[{self.collection}]")
        return " ".join(parts)
|
||||||
138
backend/app/services/context/types/system.py
Normal file
138
backend/app/services/context/types/system.py
Normal file
@@ -0,0 +1,138 @@
|
|||||||
|
"""
|
||||||
|
System Context Type.
|
||||||
|
|
||||||
|
Represents system prompts, instructions, and agent personas.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from .base import BaseContext, ContextPriority, ContextType
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(eq=False)
class SystemContext(BaseContext):
    """
    Context for system prompts and instructions.

    System context typically includes:
    - Agent persona and role definitions
    - Behavioral instructions
    - Safety guidelines
    - Output format requirements

    System context is usually high priority and should
    rarely be truncated or omitted.
    """

    # System context specific fields
    role: str = field(default="assistant")  # agent role identifier
    instructions_type: str = field(default="general")  # e.g. "general", "persona", "instructions"

    def __post_init__(self) -> None:
        """Escalate default-priority system context to HIGH."""
        # NOTE(review): does not chain to a BaseContext.__post_init__ (if the
        # base defines one) — confirm the base class needs no init of its own.
        if self.priority == ContextPriority.NORMAL.value:
            self.priority = ContextPriority.HIGH.value

    def get_type(self) -> ContextType:
        """Return SYSTEM context type."""
        return ContextType.SYSTEM

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary, extending base fields with role/instructions_type."""
        base = super().to_dict()
        base.update(
            {
                "role": self.role,
                "instructions_type": self.instructions_type,
            }
        )
        return base

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "SystemContext":
        """
        Create SystemContext from dictionary.

        A string timestamp is parsed as ISO-8601; a missing timestamp
        defaults to now (UTC).
        """
        raw_ts = data.get("timestamp")
        timestamp = (
            datetime.fromisoformat(raw_ts)
            if isinstance(raw_ts, str)
            else data.get("timestamp", datetime.now(UTC))
        )
        return cls(
            id=data.get("id", ""),
            content=data["content"],
            source=data["source"],
            timestamp=timestamp,
            priority=data.get("priority", ContextPriority.HIGH.value),
            metadata=data.get("metadata", {}),
            role=data.get("role", "assistant"),
            instructions_type=data.get("instructions_type", "general"),
        )

    @classmethod
    def create_persona(
        cls,
        name: str,
        description: str,
        capabilities: list[str] | None = None,
        constraints: list[str] | None = None,
    ) -> "SystemContext":
        """
        Create a persona system context.

        Args:
            name: Agent name/role
            description: Role description
            capabilities: List of things the agent can do
            constraints: List of limitations

        Returns:
            SystemContext with formatted persona text, at HIGHEST priority
        """
        parts = [f"You are {name}.", "", description]

        if capabilities:
            parts.extend(["", "You can:"])
            parts.extend(f"- {cap}" for cap in capabilities)

        if constraints:
            parts.extend(["", "You must not:"])
            parts.extend(f"- {constraint}" for constraint in constraints)

        return cls(
            content="\n".join(parts),
            source="persona_builder",
            role=name.lower().replace(" ", "_"),
            instructions_type="persona",
            priority=ContextPriority.HIGHEST.value,
        )

    @classmethod
    def create_instructions(
        cls,
        instructions: str | list[str],
        source: str = "instructions",
    ) -> "SystemContext":
        """
        Create an instructions system context.

        Args:
            instructions: Instructions string, or a list of instruction
                strings rendered as "- item" bullet lines.
            source: Source identifier

        Returns:
            SystemContext with instructions at HIGH priority
        """
        if isinstance(instructions, list):
            content = "\n".join(f"- {inst}" for inst in instructions)
        else:
            content = instructions

        return cls(
            content=content,
            source=source,
            instructions_type="instructions",
            priority=ContextPriority.HIGH.value,
        )
|
||||||
195
backend/app/services/context/types/task.py
Normal file
195
backend/app/services/context/types/task.py
Normal file
@@ -0,0 +1,195 @@
|
|||||||
|
"""
|
||||||
|
Task Context Type.
|
||||||
|
|
||||||
|
Represents the current task or objective for the agent.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from .base import BaseContext, ContextPriority, ContextType
|
||||||
|
|
||||||
|
|
||||||
|
class TaskStatus(str, Enum):
    """Lifecycle states a task can be in."""

    PENDING = "pending"          # created, not started yet
    IN_PROGRESS = "in_progress"  # actively being worked on
    BLOCKED = "blocked"          # waiting on something external
    COMPLETED = "completed"      # finished successfully
    FAILED = "failed"            # finished unsuccessfully
|
||||||
|
|
||||||
|
|
||||||
|
class TaskComplexity(str, Enum):
    """Estimated effort/complexity bucket for a task, from least to most."""

    TRIVIAL = "trivial"
    SIMPLE = "simple"
    MODERATE = "moderate"
    COMPLEX = "complex"
    VERY_COMPLEX = "very_complex"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(eq=False)
class TaskContext(BaseContext):
    """
    Context for the current task or objective.

    Task context provides information about what the agent
    should accomplish, including:
    - Task description and goals
    - Acceptance criteria
    - Constraints and requirements
    - Related issue/ticket information
    """

    # Task-specific fields
    title: str = field(default="")
    status: TaskStatus = field(default=TaskStatus.PENDING)
    complexity: TaskComplexity = field(default=TaskComplexity.MODERATE)
    issue_id: str | None = field(default=None)
    project_id: str | None = field(default=None)
    acceptance_criteria: list[str] = field(default_factory=list)
    constraints: list[str] = field(default_factory=list)
    parent_task_id: str | None = field(default=None)

    def __post_init__(self) -> None:
        """Escalate default-priority task context to HIGH."""
        # NOTE(review): does not chain to a BaseContext.__post_init__ (if the
        # base defines one) — confirm the base class needs no init of its own.
        if self.priority == ContextPriority.NORMAL.value:
            self.priority = ContextPriority.HIGH.value

    def get_type(self) -> ContextType:
        """Return TASK context type."""
        return ContextType.TASK

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary, extending base fields with task metadata."""
        base = super().to_dict()
        base.update(
            {
                "title": self.title,
                "status": self.status.value,
                "complexity": self.complexity.value,
                "issue_id": self.issue_id,
                "project_id": self.project_id,
                "acceptance_criteria": self.acceptance_criteria,
                "constraints": self.constraints,
                "parent_task_id": self.parent_task_id,
            }
        )
        return base

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "TaskContext":
        """
        Create TaskContext from dictionary.

        String values for status/complexity are coerced to their enums;
        a string timestamp is parsed as ISO-8601.
        """
        status = data.get("status", "pending")
        if isinstance(status, str):
            status = TaskStatus(status)

        complexity = data.get("complexity", "moderate")
        if isinstance(complexity, str):
            complexity = TaskComplexity(complexity)

        raw_ts = data.get("timestamp")
        timestamp = (
            datetime.fromisoformat(raw_ts)
            if isinstance(raw_ts, str)
            else data.get("timestamp", datetime.now(UTC))
        )

        return cls(
            id=data.get("id", ""),
            content=data["content"],
            source=data.get("source", "task"),
            timestamp=timestamp,
            priority=data.get("priority", ContextPriority.HIGH.value),
            metadata=data.get("metadata", {}),
            title=data.get("title", ""),
            status=status,
            complexity=complexity,
            issue_id=data.get("issue_id"),
            project_id=data.get("project_id"),
            acceptance_criteria=data.get("acceptance_criteria", []),
            constraints=data.get("constraints", []),
            parent_task_id=data.get("parent_task_id"),
        )

    @classmethod
    def create(
        cls,
        title: str,
        description: str,
        acceptance_criteria: list[str] | None = None,
        constraints: list[str] | None = None,
        issue_id: str | None = None,
        project_id: str | None = None,
        complexity: TaskComplexity | str = TaskComplexity.MODERATE,
    ) -> "TaskContext":
        """
        Create a task context in IN_PROGRESS state.

        Args:
            title: Task title
            description: Task description (becomes the context content)
            acceptance_criteria: List of acceptance criteria
            constraints: List of constraints
            issue_id: Related issue ID (used in the source tag when given)
            project_id: Project ID
            complexity: Task complexity (string values are coerced)

        Returns:
            TaskContext instance
        """
        if isinstance(complexity, str):
            complexity = TaskComplexity(complexity)

        return cls(
            content=description,
            source=f"task:{issue_id}" if issue_id else "task",
            title=title,
            status=TaskStatus.IN_PROGRESS,
            complexity=complexity,
            issue_id=issue_id,
            project_id=project_id,
            acceptance_criteria=acceptance_criteria or [],
            constraints=constraints or [],
        )

    def format_for_prompt(self) -> str:
        """
        Format the task as a prompt section: title line, description, then
        optional "Acceptance Criteria:" and "Constraints:" bullet lists.

        Returns:
            Formatted task string
        """
        parts: list[str] = []

        if self.title:
            parts.extend([f"Task: {self.title}", ""])

        parts.append(self.content)

        if self.acceptance_criteria:
            parts.extend(["", "Acceptance Criteria:"])
            parts.extend(f"- {criterion}" for criterion in self.acceptance_criteria)

        if self.constraints:
            parts.extend(["", "Constraints:"])
            parts.extend(f"- {constraint}" for constraint in self.constraints)

        return "\n".join(parts)

    def is_active(self) -> bool:
        """Check if task is currently active (pending or in progress)."""
        return self.status in (TaskStatus.PENDING, TaskStatus.IN_PROGRESS)

    def is_complete(self) -> bool:
        """Check if task finished successfully."""
        return self.status == TaskStatus.COMPLETED

    def is_blocked(self) -> bool:
        """Check if task is blocked."""
        return self.status == TaskStatus.BLOCKED
|
||||||
207
backend/app/services/context/types/tool.py
Normal file
207
backend/app/services/context/types/tool.py
Normal file
@@ -0,0 +1,207 @@
|
|||||||
|
"""
|
||||||
|
Tool Context Type.
|
||||||
|
|
||||||
|
Represents available tools and recent tool execution results.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from .base import BaseContext, ContextPriority, ContextType
|
||||||
|
|
||||||
|
|
||||||
|
class ToolResultStatus(str, Enum):
    """Outcome of a tool execution."""

    SUCCESS = "success"      # tool ran and returned a result
    ERROR = "error"          # tool raised or reported an error
    TIMEOUT = "timeout"      # tool exceeded its time budget
    CANCELLED = "cancelled"  # execution was cancelled before completion
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(eq=False)
class ToolContext(BaseContext):
    """
    Context for tools and tool execution results.

    Tool context includes:
    - Tool descriptions and parameters
    - Recent tool execution results
    - Tool availability information

    This helps the LLM understand what tools are available
    and what results previous tool calls produced.
    """

    # Tool-specific fields
    tool_name: str = field(default="")
    tool_description: str = field(default="")
    is_result: bool = field(default=False)  # True when this holds an execution result
    result_status: ToolResultStatus | None = field(default=None)
    execution_time_ms: float | None = field(default=None)
    parameters: dict[str, Any] = field(default_factory=dict)
    server_name: str | None = field(default=None)  # originating MCP server, if any

    def get_type(self) -> ContextType:
        """Return TOOL context type."""
        return ContextType.TOOL

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary, extending base fields with tool metadata."""
        base = super().to_dict()
        base.update(
            {
                "tool_name": self.tool_name,
                "tool_description": self.tool_description,
                "is_result": self.is_result,
                "result_status": self.result_status.value if self.result_status else None,
                "execution_time_ms": self.execution_time_ms,
                "parameters": self.parameters,
                "server_name": self.server_name,
            }
        )
        return base

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "ToolContext":
        """
        Create ToolContext from dictionary.

        A string result_status is coerced to ToolResultStatus; a string
        timestamp is parsed as ISO-8601.
        """
        result_status = data.get("result_status")
        if isinstance(result_status, str):
            result_status = ToolResultStatus(result_status)

        raw_ts = data.get("timestamp")
        timestamp = (
            datetime.fromisoformat(raw_ts)
            if isinstance(raw_ts, str)
            else data.get("timestamp", datetime.now(UTC))
        )

        return cls(
            id=data.get("id", ""),
            content=data["content"],
            source=data.get("source", "tool"),
            timestamp=timestamp,
            priority=data.get("priority", ContextPriority.NORMAL.value),
            metadata=data.get("metadata", {}),
            tool_name=data.get("tool_name", ""),
            tool_description=data.get("tool_description", ""),
            is_result=data.get("is_result", False),
            result_status=result_status,
            execution_time_ms=data.get("execution_time_ms"),
            parameters=data.get("parameters", {}),
            server_name=data.get("server_name"),
        )

    @classmethod
    def from_tool_definition(
        cls,
        name: str,
        description: str,
        parameters: dict[str, Any] | None = None,
        server_name: str | None = None,
    ) -> "ToolContext":
        """
        Create a ToolContext from a tool definition.

        Args:
            name: Tool name
            description: Tool description
            parameters: Tool parameter schema; each value is expected to be a
                dict possibly holding "type", "description", "required"
            server_name: MCP server name

        Returns:
            ToolContext at LOW priority (definitions are background info)
        """
        # Format content as tool documentation
        content_parts = [f"Tool: {name}", "", description]

        if parameters:
            content_parts.extend(["", "Parameters:"])
            for param_name, param_info in parameters.items():
                param_type = param_info.get("type", "any")
                req_marker = " (required)" if param_info.get("required", False) else ""
                content_parts.append(f"  - {param_name}: {param_type}{req_marker}")
                param_desc = param_info.get("description", "")
                if param_desc:
                    content_parts.append(f"    {param_desc}")

        return cls(
            content="\n".join(content_parts),
            source=f"tool:{server_name}:{name}" if server_name else f"tool:{name}",
            tool_name=name,
            tool_description=description,
            is_result=False,
            parameters=parameters or {},
            server_name=server_name,
            priority=ContextPriority.LOW.value,
        )

    @classmethod
    def from_tool_result(
        cls,
        tool_name: str,
        result: Any,
        status: ToolResultStatus = ToolResultStatus.SUCCESS,
        execution_time_ms: float | None = None,
        parameters: dict[str, Any] | None = None,
        server_name: str | None = None,
    ) -> "ToolContext":
        """
        Create a ToolContext from a tool execution result.

        Args:
            tool_name: Name of the tool that was executed
            result: Result content (dicts are JSON-pretty-printed, everything
                else converted with str())
            status: Execution status
            execution_time_ms: Execution time in milliseconds
            parameters: Parameters that were passed to the tool
            server_name: MCP server name

        Returns:
            ToolContext instance
        """
        return cls(
            content=cls._render_result(result),
            source=f"tool_result:{server_name}:{tool_name}" if server_name else f"tool_result:{tool_name}",
            tool_name=tool_name,
            is_result=True,
            result_status=status,
            execution_time_ms=execution_time_ms,
            parameters=parameters or {},
            server_name=server_name,
            priority=ContextPriority.HIGH.value,  # Recent results are high priority
        )

    @staticmethod
    def _render_result(result: Any) -> str:
        """
        Render an arbitrary tool result as display text.

        Dicts become pretty-printed JSON (falling back to str() for payloads
        json cannot serialize); everything else is str()'d; strings pass through.
        """
        if isinstance(result, str):
            return result
        if isinstance(result, dict):
            import json

            try:
                return json.dumps(result, indent=2)
            except (TypeError, ValueError):
                return str(result)
        return str(result)

    def is_successful(self) -> bool:
        """Check if this is a successful tool result."""
        return self.is_result and self.result_status == ToolResultStatus.SUCCESS

    def is_error(self) -> bool:
        """Check if this is an error result."""
        return self.is_result and self.result_status == ToolResultStatus.ERROR

    def format_for_prompt(self) -> str:
        """
        Format tool context for inclusion in a prompt.

        Returns:
            The raw content for definitions; for results, a
            "Tool Result (name, status):" header followed by the content.
        """
        if not self.is_result:
            return self.content
        status_str = self.result_status.value if self.result_status else "unknown"
        return f"Tool Result ({self.tool_name}, {status_str}):\n{self.content}"
|
||||||
611
backend/app/services/event_bus.py
Normal file
611
backend/app/services/event_bus.py
Normal file
@@ -0,0 +1,611 @@
|
|||||||
|
"""
|
||||||
|
EventBus service for Redis Pub/Sub communication.
|
||||||
|
|
||||||
|
This module provides a centralized event bus for publishing and subscribing to
|
||||||
|
events across the Syndarix platform. It uses Redis Pub/Sub for real-time
|
||||||
|
message delivery between services, agents, and the frontend.
|
||||||
|
|
||||||
|
Architecture:
|
||||||
|
- Publishers emit events to project/agent-specific Redis channels
|
||||||
|
- SSE endpoints subscribe to channels and stream events to clients
|
||||||
|
- Events include metadata for reconnection support (Last-Event-ID)
|
||||||
|
- Events are typed with the EventType enum for consistency
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
# Publishing events
|
||||||
|
event_bus = EventBus()
|
||||||
|
await event_bus.connect()
|
||||||
|
|
||||||
|
event = event_bus.create_event(
|
||||||
|
event_type=EventType.AGENT_MESSAGE,
|
||||||
|
project_id=project_id,
|
||||||
|
actor_type="agent",
|
||||||
|
payload={"message": "Processing..."}
|
||||||
|
)
|
||||||
|
await event_bus.publish(event_bus.get_project_channel(project_id), event)
|
||||||
|
|
||||||
|
# Subscribing to events
|
||||||
|
async for event in event_bus.subscribe(["project:123", "agent:456"]):
|
||||||
|
handle_event(event)
|
||||||
|
|
||||||
|
# Cleanup
|
||||||
|
await event_bus.disconnect()
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from collections.abc import AsyncGenerator, AsyncIterator
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
from datetime import UTC, datetime
|
||||||
|
from typing import Any
|
||||||
|
from uuid import UUID, uuid4
|
||||||
|
|
||||||
|
import redis.asyncio as redis
|
||||||
|
from pydantic import ValidationError
|
||||||
|
|
||||||
|
from app.core.config import settings
|
||||||
|
from app.schemas.events import ActorType, Event, EventType
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class EventBusError(Exception):
    """Root of the EventBus exception hierarchy."""
|
||||||
|
|
||||||
|
|
||||||
|
class EventBusConnectionError(EventBusError):
    """Raised when the Redis connection is missing or cannot be established."""
|
||||||
|
|
||||||
|
|
||||||
|
class EventBusPublishError(EventBusError):
    """Raised when delivering an event to a channel fails."""
|
||||||
|
|
||||||
|
|
||||||
|
class EventBusSubscriptionError(EventBusError):
    """Raised when a channel subscription cannot be set up."""
|
||||||
|
|
||||||
|
|
||||||
|
class EventBus:
|
||||||
|
"""
|
||||||
|
EventBus for Redis Pub/Sub communication.
|
||||||
|
|
||||||
|
Provides methods to publish events to channels and subscribe to events
|
||||||
|
from multiple channels. Handles connection management, serialization,
|
||||||
|
and error recovery.
|
||||||
|
|
||||||
|
This class provides:
|
||||||
|
- Event publishing to project/agent-specific channels
|
||||||
|
- Subscription management for SSE endpoints
|
||||||
|
- Reconnection support via event IDs (Last-Event-ID)
|
||||||
|
- Keepalive messages for connection health
|
||||||
|
- Type-safe event creation with the Event schema
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
redis_url: Redis connection URL
|
||||||
|
redis_client: Async Redis client instance
|
||||||
|
pubsub: Redis PubSub instance for subscriptions
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Channel prefixes for different entity types
|
||||||
|
PROJECT_CHANNEL_PREFIX = "project"
|
||||||
|
AGENT_CHANNEL_PREFIX = "agent"
|
||||||
|
USER_CHANNEL_PREFIX = "user"
|
||||||
|
GLOBAL_CHANNEL = "syndarix:global"
|
||||||
|
|
||||||
|
def __init__(self, redis_url: str | None = None) -> None:
    """
    Initialize the EventBus in a disconnected state.

    Args:
        redis_url: Redis connection URL; falls back to settings.REDIS_URL
            when not provided.
    """
    # Connection state is populated by connect().
    self._redis_client: redis.Redis | None = None
    self._pubsub: redis.client.PubSub | None = None
    self._connected = False
    self.redis_url = redis_url or settings.REDIS_URL
|
||||||
|
|
||||||
|
@property
def redis_client(self) -> redis.Redis:
    """The live Redis client.

    Raises:
        EventBusConnectionError: If connect() has not been called yet.
    """
    client = self._redis_client
    if client is None:
        raise EventBusConnectionError(
            "EventBus not connected. Call connect() first."
        )
    return client
|
||||||
|
|
||||||
|
@property
def pubsub(self) -> redis.client.PubSub:
    """The PubSub instance created at connect time.

    Raises:
        EventBusConnectionError: If connect() has not been called yet.
    """
    subscription = self._pubsub
    if subscription is None:
        raise EventBusConnectionError(
            "EventBus not connected. Call connect() first."
        )
    return subscription
|
||||||
|
|
||||||
|
@property
def is_connected(self) -> bool:
    """True once connect() succeeded and a Redis client is still held."""
    return self._redis_client is not None and self._connected
|
||||||
|
|
||||||
|
async def connect(self) -> None:
    """
    Connect to Redis and initialize the PubSub client.

    Idempotent: returns immediately when already connected.

    Raises:
        EventBusConnectionError: If connection to Redis fails.
    """
    if self._connected:
        logger.debug("EventBus already connected")
        return

    try:
        self._redis_client = redis.from_url(
            self.redis_url,
            encoding="utf-8",
            decode_responses=True,
        )
        # Test connection - ping() returns a coroutine for async Redis;
        # the hasattr guard also tolerates sync test doubles.
        ping_result = self._redis_client.ping()
        if hasattr(ping_result, "__await__"):
            await ping_result
        self._pubsub = self._redis_client.pubsub()
        self._connected = True
        logger.info("EventBus connected to Redis")
    except redis.ConnectionError as e:
        # Lazy %-style args: formatting is skipped when the level is disabled.
        logger.error("Failed to connect to Redis: %s", e, exc_info=True)
        raise EventBusConnectionError(f"Failed to connect to Redis: {e}") from e
    except redis.RedisError as e:
        logger.error("Redis error during connection: %s", e, exc_info=True)
        raise EventBusConnectionError(f"Redis error: {e}") from e
|
||||||
|
|
||||||
|
async def disconnect(self) -> None:
    """
    Disconnect from Redis and release the PubSub and client resources.

    Safe to call multiple times; teardown errors are logged, not raised,
    and the instance always ends up in the disconnected state.
    """
    if self._pubsub:
        try:
            await self._pubsub.unsubscribe()
            await self._pubsub.close()
        except redis.RedisError as e:
            # Lazy %-style args: formatting deferred to the logging framework.
            logger.warning("Error closing PubSub: %s", e)
        finally:
            self._pubsub = None

    if self._redis_client:
        try:
            await self._redis_client.aclose()
        except redis.RedisError as e:
            logger.warning("Error closing Redis client: %s", e)
        finally:
            self._redis_client = None

    self._connected = False
    logger.info("EventBus disconnected from Redis")
|
||||||
|
|
||||||
|
@asynccontextmanager
async def connection(self) -> AsyncIterator["EventBus"]:
    """
    Async context manager that connects on entry and disconnects on exit.

    Usage:
        async with event_bus.connection() as bus:
            await bus.publish(channel, event)
    """
    await self.connect()
    try:
        yield self
    finally:
        # Always release the connection, even if the body raised.
        await self.disconnect()
|
||||||
|
|
||||||
|
def get_project_channel(self, project_id: UUID | str) -> str:
|
||||||
|
"""
|
||||||
|
Get the channel name for a project.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
project_id: The project UUID or string
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Channel name string in format "project:{uuid}"
|
||||||
|
"""
|
||||||
|
return f"{self.PROJECT_CHANNEL_PREFIX}:{project_id}"
|
||||||
|
|
||||||
|
def get_agent_channel(self, agent_id: UUID | str) -> str:
|
||||||
|
"""
|
||||||
|
Get the channel name for an agent instance.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
agent_id: The agent instance UUID or string
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Channel name string in format "agent:{uuid}"
|
||||||
|
"""
|
||||||
|
return f"{self.AGENT_CHANNEL_PREFIX}:{agent_id}"
|
||||||
|
|
||||||
|
def get_user_channel(self, user_id: UUID | str) -> str:
|
||||||
|
"""
|
||||||
|
Get the channel name for a user (personal notifications).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
user_id: The user UUID or string
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Channel name string in format "user:{uuid}"
|
||||||
|
"""
|
||||||
|
return f"{self.USER_CHANNEL_PREFIX}:{user_id}"
|
||||||
|
|
||||||
|
@staticmethod
def create_event(
    event_type: EventType,
    project_id: UUID,
    actor_type: ActorType,
    payload: dict | None = None,
    actor_id: UUID | None = None,
    event_id: str | None = None,
    timestamp: datetime | None = None,
) -> Event:
    """
    Factory for a new Event with sensible defaults.

    Args:
        event_type: The type of event
        project_id: The project this event belongs to
        actor_type: Type of actor ('agent', 'user', or 'system')
        payload: Event-specific payload data (defaults to an empty dict)
        actor_id: ID of the agent or user who triggered the event
        event_id: Optional custom event ID (UUID string); generated if omitted
        timestamp: Optional custom timestamp (defaults to now UTC)

    Returns:
        A new Event instance
    """
    resolved_id = event_id or str(uuid4())
    resolved_timestamp = timestamp or datetime.now(UTC)
    resolved_payload = payload or {}
    return Event(
        id=resolved_id,
        type=event_type,
        timestamp=resolved_timestamp,
        project_id=project_id,
        actor_id=actor_id,
        actor_type=actor_type,
        payload=resolved_payload,
    )
|
||||||
|
|
||||||
|
def _serialize_event(self, event: Event) -> str:
    """
    Serialize *event* into its JSON wire representation.

    Args:
        event: The Event to serialize

    Returns:
        JSON string produced by the event's Pydantic model
    """
    return event.model_dump_json()
|
||||||
|
|
||||||
|
def _deserialize_event(self, data: str) -> Event:
    """
    Parse a JSON string back into an Event.

    Args:
        data: JSON string to deserialize

    Returns:
        Deserialized Event instance

    Raises:
        ValidationError: If the data doesn't match the Event schema
    """
    return Event.model_validate_json(data)
|
||||||
|
|
||||||
|
async def publish(self, channel: str, event: Event) -> int:
    """
    Publish an event to a channel.

    Args:
        channel: The channel name to publish to
        event: The Event to publish

    Returns:
        Number of subscribers that received the message

    Raises:
        EventBusConnectionError: If not connected to Redis
        EventBusPublishError: If publishing fails
    """
    if not self.is_connected:
        raise EventBusConnectionError("EventBus not connected")

    try:
        message = self._serialize_event(event)
        subscriber_count = await self.redis_client.publish(channel, message)
        # Lazy %-style args: formatting deferred until the record is emitted.
        logger.debug(
            "Published event %s to %s (received by %s subscribers)",
            event.type,
            channel,
            subscriber_count,
        )
        return subscriber_count
    except redis.RedisError as e:
        logger.error("Failed to publish event to %s: %s", channel, e, exc_info=True)
        raise EventBusPublishError(f"Failed to publish event: {e}") from e
|
||||||
|
|
||||||
|
async def publish_to_project(self, event: Event) -> int:
    """
    Publish an event to the channel of its own project.

    Convenience wrapper around publish() that derives the channel from
    the event's project_id.

    Args:
        event: The Event to publish (must have project_id set)

    Returns:
        Number of subscribers that received the message
    """
    return await self.publish(self.get_project_channel(event.project_id), event)
|
||||||
|
|
||||||
|
async def publish_multi(self, channels: list[str], event: Event) -> dict[str, int]:
|
||||||
|
"""
|
||||||
|
Publish an event to multiple channels.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
channels: List of channel names to publish to
|
||||||
|
event: The Event to publish
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dictionary mapping channel names to subscriber counts
|
||||||
|
"""
|
||||||
|
results = {}
|
||||||
|
for channel in channels:
|
||||||
|
try:
|
||||||
|
results[channel] = await self.publish(channel, event)
|
||||||
|
except EventBusPublishError as e:
|
||||||
|
logger.warning(f"Failed to publish to {channel}: {e}")
|
||||||
|
results[channel] = 0
|
||||||
|
return results
|
||||||
|
|
||||||
|
async def subscribe(
|
||||||
|
self, channels: list[str], *, max_wait: float | None = None
|
||||||
|
) -> AsyncIterator[Event]:
|
||||||
|
"""
|
||||||
|
Subscribe to one or more channels and yield events.
|
||||||
|
|
||||||
|
This is an async generator that yields Event objects as they arrive.
|
||||||
|
Use max_wait to limit how long to wait for messages.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
channels: List of channel names to subscribe to
|
||||||
|
max_wait: Optional maximum wait time in seconds for each message.
|
||||||
|
If None, waits indefinitely.
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
Event objects received from subscribed channels
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
EventBusConnectionError: If not connected to Redis
|
||||||
|
EventBusSubscriptionError: If subscription fails
|
||||||
|
|
||||||
|
Example:
|
||||||
|
async for event in event_bus.subscribe(["project:123"], max_wait=30):
|
||||||
|
print(f"Received: {event.type}")
|
||||||
|
"""
|
||||||
|
if not self.is_connected:
|
||||||
|
raise EventBusConnectionError("EventBus not connected")
|
||||||
|
|
||||||
|
# Create a new pubsub for this subscription
|
||||||
|
subscription_pubsub = self.redis_client.pubsub()
|
||||||
|
|
||||||
|
try:
|
||||||
|
await subscription_pubsub.subscribe(*channels)
|
||||||
|
logger.info(f"Subscribed to channels: {channels}")
|
||||||
|
except redis.RedisError as e:
|
||||||
|
logger.error(f"Failed to subscribe to channels: {e}", exc_info=True)
|
||||||
|
await subscription_pubsub.close()
|
||||||
|
raise EventBusSubscriptionError(f"Failed to subscribe: {e}") from e
|
||||||
|
|
||||||
|
try:
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
if max_wait is not None:
|
||||||
|
async with asyncio.timeout(max_wait):
|
||||||
|
message = await subscription_pubsub.get_message(
|
||||||
|
ignore_subscribe_messages=True, timeout=1.0
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
message = await subscription_pubsub.get_message(
|
||||||
|
ignore_subscribe_messages=True, timeout=1.0
|
||||||
|
)
|
||||||
|
except TimeoutError:
|
||||||
|
# Timeout reached, stop iteration
|
||||||
|
return
|
||||||
|
|
||||||
|
if message is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if message["type"] == "message":
|
||||||
|
try:
|
||||||
|
event = self._deserialize_event(message["data"])
|
||||||
|
yield event
|
||||||
|
except ValidationError as e:
|
||||||
|
logger.warning(
|
||||||
|
f"Invalid event data received: {e}",
|
||||||
|
extra={"channel": message.get("channel")},
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
except json.JSONDecodeError as e:
|
||||||
|
logger.warning(
|
||||||
|
f"Failed to decode event JSON: {e}",
|
||||||
|
extra={"channel": message.get("channel")},
|
||||||
|
)
|
||||||
|
continue
|
||||||
|
finally:
|
||||||
|
try:
|
||||||
|
await subscription_pubsub.unsubscribe(*channels)
|
||||||
|
await subscription_pubsub.close()
|
||||||
|
logger.debug(f"Unsubscribed from channels: {channels}")
|
||||||
|
except redis.RedisError as e:
|
||||||
|
logger.warning(f"Error unsubscribing from channels: {e}")
|
||||||
|
|
||||||
|
async def subscribe_sse(
|
||||||
|
self,
|
||||||
|
project_id: str | UUID,
|
||||||
|
last_event_id: str | None = None,
|
||||||
|
keepalive_interval: int = 30,
|
||||||
|
) -> AsyncGenerator[str, None]:
|
||||||
|
"""
|
||||||
|
Subscribe to events for a project in SSE format.
|
||||||
|
|
||||||
|
This is an async generator that yields SSE-formatted event strings.
|
||||||
|
It includes keepalive messages at the specified interval.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
project_id: The project to subscribe to
|
||||||
|
last_event_id: Optional last received event ID for reconnection
|
||||||
|
keepalive_interval: Seconds between keepalive messages (default 30)
|
||||||
|
|
||||||
|
Yields:
|
||||||
|
SSE-formatted event strings (ready to send to client)
|
||||||
|
"""
|
||||||
|
if not self.is_connected:
|
||||||
|
raise EventBusConnectionError("EventBus not connected")
|
||||||
|
|
||||||
|
project_id_str = str(project_id)
|
||||||
|
channel = self.get_project_channel(project_id_str)
|
||||||
|
|
||||||
|
subscription_pubsub = self.redis_client.pubsub()
|
||||||
|
await subscription_pubsub.subscribe(channel)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
f"Subscribed to SSE events for project {project_id_str} "
|
||||||
|
f"(last_event_id={last_event_id})"
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
while True:
|
||||||
|
try:
|
||||||
|
# Wait for messages with a timeout for keepalive
|
||||||
|
message = await asyncio.wait_for(
|
||||||
|
subscription_pubsub.get_message(ignore_subscribe_messages=True),
|
||||||
|
timeout=keepalive_interval,
|
||||||
|
)
|
||||||
|
|
||||||
|
if message is not None and message["type"] == "message":
|
||||||
|
event_data = message["data"]
|
||||||
|
|
||||||
|
# If reconnecting, check if we should skip this event
|
||||||
|
if last_event_id:
|
||||||
|
try:
|
||||||
|
event_dict = json.loads(event_data)
|
||||||
|
if event_dict.get("id") == last_event_id:
|
||||||
|
# Found the last event, start yielding from next
|
||||||
|
last_event_id = None
|
||||||
|
continue
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
yield event_data
|
||||||
|
|
||||||
|
except TimeoutError:
|
||||||
|
# Send keepalive comment
|
||||||
|
yield "" # Empty string signals keepalive
|
||||||
|
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
logger.info(f"SSE subscription cancelled for project {project_id_str}")
|
||||||
|
raise
|
||||||
|
finally:
|
||||||
|
await subscription_pubsub.unsubscribe(channel)
|
||||||
|
await subscription_pubsub.close()
|
||||||
|
logger.info(f"Unsubscribed SSE from project {project_id_str}")
|
||||||
|
|
||||||
|
async def subscribe_with_callback(
|
||||||
|
self,
|
||||||
|
channels: list[str],
|
||||||
|
callback: Any, # Callable[[Event], Awaitable[None]]
|
||||||
|
stop_event: asyncio.Event | None = None,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Subscribe to channels and process events with a callback.
|
||||||
|
|
||||||
|
This method runs until stop_event is set or an unrecoverable error occurs.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
channels: List of channel names to subscribe to
|
||||||
|
callback: Async function to call for each event
|
||||||
|
stop_event: Optional asyncio.Event to signal stop
|
||||||
|
|
||||||
|
Example:
|
||||||
|
async def handle_event(event: Event):
|
||||||
|
print(f"Handling: {event.type}")
|
||||||
|
|
||||||
|
stop = asyncio.Event()
|
||||||
|
asyncio.create_task(
|
||||||
|
event_bus.subscribe_with_callback(["project:123"], handle_event, stop)
|
||||||
|
)
|
||||||
|
# Later...
|
||||||
|
stop.set()
|
||||||
|
"""
|
||||||
|
if stop_event is None:
|
||||||
|
stop_event = asyncio.Event()
|
||||||
|
|
||||||
|
try:
|
||||||
|
async for event in self.subscribe(channels):
|
||||||
|
if stop_event.is_set():
|
||||||
|
break
|
||||||
|
try:
|
||||||
|
await callback(event)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error in event callback: {e}", exc_info=True)
|
||||||
|
except EventBusSubscriptionError:
|
||||||
|
raise
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Unexpected error in subscription loop: {e}", exc_info=True)
|
||||||
|
raise
|
||||||
|
|
||||||
|
|
||||||
|
# Singleton instance for application-wide use
|
||||||
|
_event_bus: EventBus | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def get_event_bus() -> EventBus:
    """
    Return the process-wide singleton EventBus.

    Lazily constructs the instance on first use. The returned bus is not
    automatically connected; call connect() before using it.

    Returns:
        The singleton EventBus instance
    """
    global _event_bus
    if _event_bus is None:
        _event_bus = EventBus()
    return _event_bus
|
||||||
|
|
||||||
|
|
||||||
|
async def get_connected_event_bus() -> EventBus:
    """
    Return the singleton EventBus, connecting it first if necessary.

    Intended for FastAPI dependency injection.

    Returns:
        A connected EventBus instance

    Raises:
        EventBusConnectionError: If connection fails
    """
    bus = get_event_bus()
    if not bus.is_connected:
        await bus.connect()
    return bus
|
||||||
|
|
||||||
|
|
||||||
|
async def close_event_bus() -> None:
    """
    Disconnect and drop the global EventBus instance.

    Should be called during application shutdown. A no-op when no instance
    was ever created.
    """
    global _event_bus
    if _event_bus is None:
        return
    await _event_bus.disconnect()
    _event_bus = None
|
||||||
85
backend/app/services/mcp/__init__.py
Normal file
85
backend/app/services/mcp/__init__.py
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
"""
|
||||||
|
MCP Client Service Package
|
||||||
|
|
||||||
|
Provides infrastructure for communicating with MCP (Model Context Protocol)
|
||||||
|
servers. This is the foundation for AI agent tool integration.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from app.services.mcp import get_mcp_client, MCPClientManager
|
||||||
|
|
||||||
|
# In FastAPI route
|
||||||
|
async def my_route(mcp: MCPClientManager = Depends(get_mcp_client)):
|
||||||
|
result = await mcp.call_tool("llm-gateway", "chat", {"prompt": "Hello"})
|
||||||
|
|
||||||
|
# Direct usage
|
||||||
|
manager = MCPClientManager()
|
||||||
|
await manager.initialize()
|
||||||
|
result = await manager.call_tool("issues", "create_issue", {...})
|
||||||
|
await manager.shutdown()
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .client_manager import (
|
||||||
|
MCPClientManager,
|
||||||
|
ServerHealth,
|
||||||
|
get_mcp_client,
|
||||||
|
reset_mcp_client,
|
||||||
|
shutdown_mcp_client,
|
||||||
|
)
|
||||||
|
from .config import (
|
||||||
|
MCPConfig,
|
||||||
|
MCPServerConfig,
|
||||||
|
TransportType,
|
||||||
|
create_default_config,
|
||||||
|
load_mcp_config,
|
||||||
|
)
|
||||||
|
from .connection import ConnectionPool, ConnectionState, MCPConnection
|
||||||
|
from .exceptions import (
|
||||||
|
MCPCircuitOpenError,
|
||||||
|
MCPConnectionError,
|
||||||
|
MCPError,
|
||||||
|
MCPServerNotFoundError,
|
||||||
|
MCPTimeoutError,
|
||||||
|
MCPToolError,
|
||||||
|
MCPToolNotFoundError,
|
||||||
|
MCPValidationError,
|
||||||
|
)
|
||||||
|
from .registry import MCPServerRegistry, ServerCapabilities, get_registry
|
||||||
|
from .routing import AsyncCircuitBreaker, CircuitState, ToolInfo, ToolResult, ToolRouter
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
# Main facade
|
||||||
|
"MCPClientManager",
|
||||||
|
"get_mcp_client",
|
||||||
|
"shutdown_mcp_client",
|
||||||
|
"reset_mcp_client",
|
||||||
|
"ServerHealth",
|
||||||
|
# Configuration
|
||||||
|
"MCPConfig",
|
||||||
|
"MCPServerConfig",
|
||||||
|
"TransportType",
|
||||||
|
"load_mcp_config",
|
||||||
|
"create_default_config",
|
||||||
|
# Registry
|
||||||
|
"MCPServerRegistry",
|
||||||
|
"ServerCapabilities",
|
||||||
|
"get_registry",
|
||||||
|
# Connection
|
||||||
|
"ConnectionPool",
|
||||||
|
"ConnectionState",
|
||||||
|
"MCPConnection",
|
||||||
|
# Routing
|
||||||
|
"ToolRouter",
|
||||||
|
"ToolInfo",
|
||||||
|
"ToolResult",
|
||||||
|
"AsyncCircuitBreaker",
|
||||||
|
"CircuitState",
|
||||||
|
# Exceptions
|
||||||
|
"MCPError",
|
||||||
|
"MCPConnectionError",
|
||||||
|
"MCPTimeoutError",
|
||||||
|
"MCPToolError",
|
||||||
|
"MCPServerNotFoundError",
|
||||||
|
"MCPToolNotFoundError",
|
||||||
|
"MCPCircuitOpenError",
|
||||||
|
"MCPValidationError",
|
||||||
|
]
|
||||||
430
backend/app/services/mcp/client_manager.py
Normal file
430
backend/app/services/mcp/client_manager.py
Normal file
@@ -0,0 +1,430 @@
|
|||||||
|
"""
|
||||||
|
MCP Client Manager
|
||||||
|
|
||||||
|
Main facade for all MCP operations. Manages server connections,
|
||||||
|
tool discovery, and provides a unified interface for tool calls.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from .config import MCPConfig, MCPServerConfig, load_mcp_config
|
||||||
|
from .connection import ConnectionPool, ConnectionState
|
||||||
|
from .exceptions import MCPServerNotFoundError
|
||||||
|
from .registry import MCPServerRegistry, get_registry
|
||||||
|
from .routing import ToolInfo, ToolResult, ToolRouter
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ServerHealth:
|
||||||
|
"""Health status for an MCP server."""
|
||||||
|
|
||||||
|
name: str
|
||||||
|
healthy: bool
|
||||||
|
state: str
|
||||||
|
url: str
|
||||||
|
error: str | None = None
|
||||||
|
tools_count: int = 0
|
||||||
|
|
||||||
|
def to_dict(self) -> dict[str, Any]:
|
||||||
|
"""Convert to dictionary."""
|
||||||
|
return {
|
||||||
|
"name": self.name,
|
||||||
|
"healthy": self.healthy,
|
||||||
|
"state": self.state,
|
||||||
|
"url": self.url,
|
||||||
|
"error": self.error,
|
||||||
|
"tools_count": self.tools_count,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class MCPClientManager:
    """
    Central manager for all MCP client operations.

    Provides a unified interface for:
    - Connecting to MCP servers
    - Discovering and calling tools
    - Health monitoring
    - Connection lifecycle management

    This is the main entry point for MCP operations in the application.
    """

    def __init__(
        self,
        config: MCPConfig | None = None,
        registry: MCPServerRegistry | None = None,
    ) -> None:
        """
        Initialize the MCP client manager.

        Args:
            config: Optional MCP configuration. If None, loads from default.
            registry: Optional registry instance. If None, uses singleton.
        """
        self._registry = registry or get_registry()
        self._pool = ConnectionPool()
        # Router is created lazily in initialize(); stays None until then.
        self._router: ToolRouter | None = None
        self._initialized = False
        # Serializes initialize()/shutdown() against concurrent callers.
        self._lock = asyncio.Lock()

        # Load configuration if provided
        if config is not None:
            self._registry.load_config(config)

    @property
    def is_initialized(self) -> bool:
        """Check if the manager is initialized."""
        return self._initialized

    async def initialize(self, config: MCPConfig | None = None) -> None:
        """
        Initialize the MCP client manager.

        Loads configuration, creates connections, and discovers tools.
        Idempotent: a second call logs a warning and returns immediately.

        Args:
            config: Optional configuration to load
        """
        async with self._lock:
            if self._initialized:
                logger.warning("MCPClientManager already initialized")
                return

            logger.info("Initializing MCP Client Manager")

            # Load configuration: explicit argument wins; otherwise fall back
            # to the default config file only when the registry is empty.
            if config is not None:
                self._registry.load_config(config)
            elif len(self._registry.list_servers()) == 0:
                # Try to load from default location
                self._registry.load_config(load_mcp_config())

            # Create router
            self._router = ToolRouter(self._registry, self._pool)

            # Connect to all enabled servers
            await self._connect_all_servers()

            # Discover tools from all servers
            if self._router:
                await self._router.discover_tools()

            self._initialized = True
            logger.info(
                "MCP Client Manager initialized with %d servers",
                len(self._registry.list_enabled_servers()),
            )

    async def _connect_all_servers(self) -> None:
        """Connect to all enabled MCP servers (best-effort)."""
        enabled_servers = self._registry.get_enabled_configs()

        for name, config in enabled_servers.items():
            try:
                await self._pool.get_connection(name, config)
                logger.info("Connected to MCP server: %s", name)
            except Exception as e:
                # One unreachable server must not prevent the manager from
                # starting with the remaining servers; log and continue.
                logger.error("Failed to connect to MCP server %s: %s", name, e)

    async def shutdown(self) -> None:
        """
        Shutdown the MCP client manager.

        Closes all connections and cleans up resources. No-op when the
        manager was never initialized.
        """
        async with self._lock:
            if not self._initialized:
                return

            logger.info("Shutting down MCP Client Manager")

            await self._pool.close_all()
            self._initialized = False

            logger.info("MCP Client Manager shutdown complete")

    async def connect(self, server_name: str) -> None:
        """
        Connect to a specific MCP server.

        Args:
            server_name: Name of the server to connect to

        Raises:
            MCPServerNotFoundError: If server is not registered
        """
        config = self._registry.get(server_name)
        await self._pool.get_connection(server_name, config)
        logger.info("Connected to MCP server: %s", server_name)

    async def disconnect(self, server_name: str) -> None:
        """
        Disconnect from a specific MCP server.

        Args:
            server_name: Name of the server to disconnect from
        """
        await self._pool.close_connection(server_name)
        logger.info("Disconnected from MCP server: %s", server_name)

    async def disconnect_all(self) -> None:
        """Disconnect from all MCP servers."""
        await self._pool.close_all()

    async def call_tool(
        self,
        server: str,
        tool: str,
        args: dict[str, Any] | None = None,
        timeout: float | None = None,
    ) -> ToolResult:
        """
        Call a tool on a specific MCP server.

        Lazily initializes the manager on first use.

        Args:
            server: Name of the MCP server
            tool: Name of the tool to call
            args: Tool arguments
            timeout: Optional timeout override

        Returns:
            Tool execution result
        """
        if not self._initialized or self._router is None:
            await self.initialize()

        assert self._router is not None  # Guaranteed after initialize()
        return await self._router.call_tool(
            server_name=server,
            tool_name=tool,
            arguments=args,
            timeout=timeout,
        )

    async def route_tool(
        self,
        tool: str,
        args: dict[str, Any] | None = None,
        timeout: float | None = None,
    ) -> ToolResult:
        """
        Route a tool call to the appropriate server automatically.

        Lazily initializes the manager on first use.

        Args:
            tool: Name of the tool to call
            args: Tool arguments
            timeout: Optional timeout override

        Returns:
            Tool execution result
        """
        if not self._initialized or self._router is None:
            await self.initialize()

        assert self._router is not None  # Guaranteed after initialize()
        return await self._router.route_tool(
            tool_name=tool,
            arguments=args,
            timeout=timeout,
        )

    async def list_tools(self, server: str) -> list[ToolInfo]:
        """
        List all tools available on a specific server.

        Args:
            server: Name of the MCP server

        Returns:
            List of tool information
        """
        capabilities = await self._registry.get_capabilities(server)
        return [
            ToolInfo(
                name=t.get("name", ""),
                description=t.get("description"),
                server_name=server,
                input_schema=t.get("input_schema"),
            )
            for t in capabilities.tools
        ]

    async def list_all_tools(self) -> list[ToolInfo]:
        """
        List all tools from all servers.

        Lazily initializes the manager on first use.

        Returns:
            List of tool information
        """
        if not self._initialized or self._router is None:
            await self.initialize()

        assert self._router is not None  # Guaranteed after initialize()
        return await self._router.list_all_tools()

    async def health_check(self) -> dict[str, ServerHealth]:
        """
        Perform health check on all MCP servers.

        Returns:
            Dict mapping server names to health status
        """
        results: dict[str, ServerHealth] = {}
        pool_status = self._pool.get_status()
        pool_health = await self._pool.health_check_all()

        for server_name in self._registry.list_servers():
            try:
                config = self._registry.get(server_name)
                status = pool_status.get(server_name, {})
                healthy = pool_health.get(server_name, False)

                capabilities = self._registry.get_cached_capabilities(server_name)

                results[server_name] = ServerHealth(
                    name=server_name,
                    healthy=healthy,
                    state=status.get("state", ConnectionState.DISCONNECTED.value),
                    url=config.url,
                    tools_count=len(capabilities.tools),
                )
            except MCPServerNotFoundError:
                # Server was unregistered between list and get; omit it.
                pass
            except Exception as e:
                results[server_name] = ServerHealth(
                    name=server_name,
                    healthy=False,
                    state=ConnectionState.ERROR.value,
                    url="unknown",
                    error=str(e),
                )

        return results

    def list_servers(self) -> list[str]:
        """Get list of all registered server names."""
        return self._registry.list_servers()

    def list_enabled_servers(self) -> list[str]:
        """Get list of enabled server names."""
        return self._registry.list_enabled_servers()

    def get_server_config(self, server_name: str) -> MCPServerConfig:
        """
        Get configuration for a specific server.

        Args:
            server_name: Name of the server

        Returns:
            Server configuration

        Raises:
            MCPServerNotFoundError: If server is not registered
        """
        return self._registry.get(server_name)

    def register_server(
        self,
        name: str,
        config: MCPServerConfig,
    ) -> None:
        """
        Register a new MCP server at runtime.

        Args:
            name: Unique server name
            config: Server configuration
        """
        self._registry.register(name, config)

    def unregister_server(self, name: str) -> bool:
        """
        Unregister an MCP server.

        Args:
            name: Server name to unregister

        Returns:
            True if server was found and removed
        """
        return self._registry.unregister(name)

    def get_circuit_breaker_status(self) -> dict[str, dict[str, Any]]:
        """Get status of all circuit breakers (empty before initialize())."""
        if self._router is None:
            return {}
        return self._router.get_circuit_breaker_status()

    async def reset_circuit_breaker(self, server_name: str) -> bool:
        """
        Reset a circuit breaker for a server.

        Args:
            server_name: Name of the server

        Returns:
            True if circuit breaker was reset
        """
        if self._router is None:
            return False
        return await self._router.reset_circuit_breaker(server_name)
|
||||||
|
|
||||||
|
|
||||||
|
# Singleton instance
|
||||||
|
_manager_instance: MCPClientManager | None = None
|
||||||
|
_manager_lock = asyncio.Lock()
|
||||||
|
|
||||||
|
|
||||||
|
async def get_mcp_client() -> MCPClientManager:
    """
    Get the global MCP client manager instance.

    This is the main dependency injection point for FastAPI. The whole
    check-and-create sequence runs under the module lock so concurrent
    callers cannot race to create two managers.

    Returns:
        The initialized singleton MCPClientManager.

    Raises:
        Exception: Whatever initialize() raises; in that case no instance
            is cached, so a later call can retry cleanly.
    """
    global _manager_instance

    # Use lock for the entire check-and-create operation to avoid race conditions
    async with _manager_lock:
        if _manager_instance is None:
            # BUGFIX: only publish the instance after initialize() succeeds.
            # Previously a failed initialize() left a half-initialized manager
            # cached forever, poisoning every subsequent call.
            manager = MCPClientManager()
            await manager.initialize()
            _manager_instance = manager

    return _manager_instance
|
||||||
|
|
||||||
|
|
||||||
|
async def shutdown_mcp_client() -> None:
    """Shutdown the global MCP client manager, if one exists."""
    global _manager_instance

    # Hold the lock so we cannot race a concurrent get_mcp_client()
    async with _manager_lock:
        if _manager_instance is None:
            return
        await _manager_instance.shutdown()
        _manager_instance = None
|
||||||
|
|
||||||
|
|
||||||
|
async def reset_mcp_client() -> None:
    """
    Reset the global MCP client manager (for testing).

    Async so the manager lock can be acquired, which avoids race
    conditions with get_mcp_client().
    """
    global _manager_instance

    async with _manager_lock:
        if _manager_instance is None:
            return
        try:
            # Shutdown gracefully before resetting
            await _manager_instance.shutdown()
        except Exception:  # noqa: S110
            pass  # Ignore errors during test cleanup
        _manager_instance = None
|
||||||
232
backend/app/services/mcp/config.py
Normal file
232
backend/app/services/mcp/config.py
Normal file
@@ -0,0 +1,232 @@
|
|||||||
|
"""
|
||||||
|
MCP Configuration System
|
||||||
|
|
||||||
|
Pydantic models for MCP server configuration with YAML file loading
|
||||||
|
and environment variable overrides.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from enum import Enum
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
from pydantic import BaseModel, Field, field_validator
|
||||||
|
|
||||||
|
|
||||||
|
class TransportType(str, Enum):
    """Supported MCP transport types.

    Inherits from ``str`` so members serialize/compare as their plain
    string values (convenient for YAML/JSON configuration files).
    """

    HTTP = "http"  # HTTP transport
    STDIO = "stdio"  # standard input/output transport
    SSE = "sse"  # Server-Sent Events transport
|
||||||
|
|
||||||
|
|
||||||
|
class MCPServerConfig(BaseModel):
    """Configuration for a single MCP server.

    Retry and circuit-breaker knobs are constrained to sane ranges via
    Field validators; the URL supports ``${ENV_VAR}`` and
    ``${ENV_VAR:-default}`` expansion at validation time.
    """

    url: str = Field(..., description="Server URL (supports ${ENV_VAR} syntax)")
    transport: TransportType = Field(
        default=TransportType.HTTP,
        description="Transport protocol to use",
    )
    timeout: int = Field(
        default=30,
        ge=1,
        le=600,
        description="Request timeout in seconds",
    )
    retry_attempts: int = Field(
        default=3,
        ge=0,
        le=10,
        description="Number of retry attempts on failure",
    )
    retry_delay: float = Field(
        default=1.0,
        ge=0.1,
        le=60.0,
        description="Initial delay between retries in seconds",
    )
    retry_max_delay: float = Field(
        default=30.0,
        ge=1.0,
        le=300.0,
        description="Maximum delay between retries in seconds",
    )
    circuit_breaker_threshold: int = Field(
        default=5,
        ge=1,
        le=50,
        description="Number of failures before opening circuit",
    )
    circuit_breaker_timeout: float = Field(
        default=30.0,
        ge=5.0,
        le=300.0,
        description="Seconds to wait before attempting to close circuit",
    )
    enabled: bool = Field(
        default=True,
        description="Whether this server is enabled",
    )
    description: str | None = Field(
        default=None,
        description="Human-readable description of the server",
    )

    @field_validator("url", mode="before")
    @classmethod
    def expand_env_vars(cls, v: str) -> str:
        """Expand environment variables in URL using ${VAR:-default} syntax.

        BUGFIX: uses a single-pass re.sub so text inside an expanded value
        that merely *looks* like another placeholder is left untouched. The
        previous findall + sequential str.replace loop could expand such
        text a second time (e.g. A="${B}" with both A and B referenced).
        """
        if not isinstance(v, str):
            return v

        import re

        def _resolve(match: "re.Match") -> str:
            # ${VAR:-default} falls back to default when VAR is unset;
            # plain ${VAR} falls back to the empty string.
            expr = match.group(1)
            if ":-" in expr:
                var_name, default = expr.split(":-", 1)
            else:
                var_name, default = expr, ""
            return os.environ.get(var_name.strip(), default)

        return re.sub(r"\$\{([^}]+)\}", _resolve, v)
|
||||||
|
|
||||||
|
|
||||||
|
class MCPConfig(BaseModel):
    """Root configuration for all MCP servers plus global defaults."""

    mcp_servers: dict[str, MCPServerConfig] = Field(
        default_factory=dict,
        description="Map of server names to their configurations",
    )

    # Global defaults
    default_timeout: int = Field(
        default=30,
        description="Default timeout for all servers",
    )
    default_retry_attempts: int = Field(
        default=3,
        description="Default retry attempts for all servers",
    )
    connection_pool_size: int = Field(
        default=10,
        ge=1,
        le=100,
        description="Maximum connections per server",
    )
    health_check_interval: int = Field(
        default=30,
        ge=5,
        le=300,
        description="Seconds between health checks",
    )

    @classmethod
    def from_yaml(cls, path: str | Path) -> "MCPConfig":
        """Parse the YAML file at *path* into an MCPConfig."""
        config_path = Path(path)
        if not config_path.exists():
            raise FileNotFoundError(f"MCP config file not found: {config_path}")

        with config_path.open("r") as fh:
            raw = yaml.safe_load(fh)

        # An empty YAML file parses to None; treat it as an empty mapping.
        if raw is None:
            raw = {}

        return cls.model_validate(raw)

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> "MCPConfig":
        """Build an MCPConfig from an already-parsed dictionary."""
        return cls.model_validate(data)

    def get_server(self, name: str) -> MCPServerConfig | None:
        """Look up a server configuration by name (None when absent)."""
        return self.mcp_servers.get(name)

    def get_enabled_servers(self) -> dict[str, MCPServerConfig]:
        """Return only the server entries whose ``enabled`` flag is set."""
        enabled: dict[str, MCPServerConfig] = {}
        for server_name, server_config in self.mcp_servers.items():
            if server_config.enabled:
                enabled[server_name] = server_config
        return enabled

    def list_server_names(self) -> list[str]:
        """Return the names of every configured server."""
        return list(self.mcp_servers)
|
||||||
|
|
||||||
|
|
||||||
|
# Default configuration path: "mcp_servers.yaml" located four directory
# levels above this module (i.e. the backend project root).
DEFAULT_CONFIG_PATH = Path(__file__).parent.parent.parent.parent / "mcp_servers.yaml"
|
||||||
|
|
||||||
|
|
||||||
|
def load_mcp_config(path: str | Path | None = None) -> MCPConfig:
    """
    Load MCP configuration from file or environment.

    Priority:
    1. Explicit path parameter
    2. MCP_CONFIG_PATH environment variable
    3. Default path (backend/mcp_servers.yaml)
    4. Empty config if no file exists
    """
    resolved = Path(
        path
        if path is not None
        else os.environ.get("MCP_CONFIG_PATH", str(DEFAULT_CONFIG_PATH))
    )

    # A missing file is not an error: an empty config still allows
    # servers to be registered at runtime.
    if not resolved.exists():
        return MCPConfig()

    return MCPConfig.from_yaml(resolved)
|
||||||
|
|
||||||
|
|
||||||
|
def create_default_config() -> MCPConfig:
    """
    Create a default MCP configuration with standard servers.

    This is useful for development and as a template.
    """
    # Per standard server: (env-substituted URL, timeout, description).
    standard_servers: dict[str, tuple[str, int, str]] = {
        "llm-gateway": (
            "${LLM_GATEWAY_URL:-http://localhost:8001}",
            60,
            "LLM Gateway for multi-provider AI interactions",
        ),
        "knowledge-base": (
            "${KNOWLEDGE_BASE_URL:-http://localhost:8002}",
            30,
            "Knowledge Base for RAG and document retrieval",
        ),
        "git-ops": (
            "${GIT_OPS_URL:-http://localhost:8003}",
            120,
            "Git Operations for repository management",
        ),
        "issues": (
            "${ISSUES_URL:-http://localhost:8004}",
            30,
            "Issue Tracker for Gitea/GitHub/GitLab",
        ),
    }

    return MCPConfig(
        mcp_servers={
            name: MCPServerConfig(
                url=url,
                transport=TransportType.HTTP,
                timeout=timeout,
                description=description,
            )
            for name, (url, timeout, description) in standard_servers.items()
        },
        default_timeout=30,
        default_retry_attempts=3,
        connection_pool_size=10,
        health_check_interval=30,
    )
|
||||||
473
backend/app/services/mcp/connection.py
Normal file
473
backend/app/services/mcp/connection.py
Normal file
@@ -0,0 +1,473 @@
|
|||||||
|
"""
|
||||||
|
MCP Connection Management
|
||||||
|
|
||||||
|
Handles connection lifecycle, pooling, and automatic reconnection
|
||||||
|
for MCP servers.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from collections.abc import AsyncGenerator
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
from .config import MCPServerConfig, TransportType
|
||||||
|
from .exceptions import MCPConnectionError, MCPTimeoutError
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class ConnectionState(str, Enum):
    """Connection state enumeration.

    Inherits from ``str`` so states serialize naturally (e.g. in status
    payloads) and compare equal to their plain-string values.
    """

    DISCONNECTED = "disconnected"  # no client; initial and post-disconnect state
    CONNECTING = "connecting"  # first connection attempt sequence in progress
    CONNECTED = "connected"  # client established and verified
    RECONNECTING = "reconnecting"  # tearing down and re-establishing the client
    ERROR = "error"  # all connection attempts exhausted
|
||||||
|
|
||||||
|
|
||||||
|
class MCPConnection:
    """
    Manages a single connection to an MCP server.

    Handles connection lifecycle, health checking, and automatic reconnection
    with exponential backoff. All state transitions happen under an internal
    (non-reentrant) ``asyncio.Lock``.
    """

    def __init__(
        self,
        server_name: str,
        config: MCPServerConfig,
    ) -> None:
        """
        Initialize connection.

        Args:
            server_name: Name of the MCP server
            config: Server configuration
        """
        self.server_name = server_name
        self.config = config
        self._state = ConnectionState.DISCONNECTED
        self._client: httpx.AsyncClient | None = None
        self._lock = asyncio.Lock()
        self._last_activity: float | None = None
        self._connection_attempts = 0
        self._last_error: Exception | None = None

        # Reconnection settings (taken from the server config).
        self._base_delay = config.retry_delay
        self._max_delay = config.retry_max_delay
        self._max_attempts = config.retry_attempts

    @property
    def state(self) -> ConnectionState:
        """Get current connection state."""
        return self._state

    @property
    def is_connected(self) -> bool:
        """Check if connection is established."""
        return self._state == ConnectionState.CONNECTED

    @property
    def last_error(self) -> Exception | None:
        """Get the last error that occurred."""
        return self._last_error

    async def connect(self) -> None:
        """
        Establish connection to the MCP server.

        Retries with exponential backoff (plus jitter) up to the configured
        number of attempts. The lock is held for the whole attempt sequence
        so concurrent callers cannot race the state machine.

        Raises:
            MCPConnectionError: If connection fails after all retries
        """
        async with self._lock:
            if self._state == ConnectionState.CONNECTED:
                return

            self._state = ConnectionState.CONNECTING
            self._connection_attempts = 0
            self._last_error = None

            while self._connection_attempts < self._max_attempts:
                try:
                    await self._do_connect()
                    self._state = ConnectionState.CONNECTED
                    self._last_activity = time.time()
                    logger.info(
                        "Connected to MCP server: %s at %s",
                        self.server_name,
                        self.config.url,
                    )
                    return
                except Exception as e:
                    self._connection_attempts += 1
                    self._last_error = e
                    logger.warning(
                        "Connection attempt %d/%d failed for %s: %s",
                        self._connection_attempts,
                        self._max_attempts,
                        self.server_name,
                        e,
                    )

                    if self._connection_attempts < self._max_attempts:
                        delay = self._calculate_backoff_delay()
                        logger.debug(
                            "Retrying connection to %s in %.1fs",
                            self.server_name,
                            delay,
                        )
                        await asyncio.sleep(delay)

            # All attempts failed
            self._state = ConnectionState.ERROR
            raise MCPConnectionError(
                f"Failed to connect after {self._max_attempts} attempts",
                server_name=self.server_name,
                url=self.config.url,
                cause=self._last_error,
            )

    async def _do_connect(self) -> None:
        """Perform the actual connection (transport-specific)."""
        if self.config.transport == TransportType.HTTP:
            self._client = httpx.AsyncClient(
                base_url=self.config.url,
                timeout=httpx.Timeout(self.config.timeout),
                headers={
                    "User-Agent": "Syndarix-MCP-Client/1.0",
                    "Accept": "application/json",
                },
            )
            # Verify connectivity with a simple request.
            try:
                # Try to hit the MCP capabilities endpoint.
                response = await self._client.get("/mcp/capabilities")
                if response.status_code not in (200, 404):
                    # 404 is acceptable - server might not have capabilities endpoint
                    response.raise_for_status()
            except httpx.HTTPStatusError as e:
                if e.response.status_code != 404:
                    raise
            except httpx.ConnectError as e:
                raise MCPConnectionError(
                    "Failed to connect to server",
                    server_name=self.server_name,
                    url=self.config.url,
                    cause=e,
                ) from e
        else:
            # For STDIO and SSE transports, we'll implement later
            raise NotImplementedError(
                f"Transport {self.config.transport} not yet implemented"
            )

    def _calculate_backoff_delay(self) -> float:
        """Calculate exponential backoff delay with jitter.

        Doubles the base delay per attempt, caps it at the configured
        maximum, then applies ±25% random jitter to avoid thundering herds.
        """
        import random

        delay = self._base_delay * (2 ** (self._connection_attempts - 1))
        delay = min(delay, self._max_delay)
        # Add jitter (±25%)
        jitter = delay * 0.25 * (random.random() * 2 - 1)
        return delay + jitter

    async def disconnect(self) -> None:
        """Disconnect from the MCP server.

        Safe to call when already disconnected; close errors are logged,
        never raised.
        """
        async with self._lock:
            if self._client is not None:
                try:
                    await self._client.aclose()
                except Exception as e:
                    logger.warning(
                        "Error closing connection to %s: %s",
                        self.server_name,
                        e,
                    )
                finally:
                    self._client = None

            self._state = ConnectionState.DISCONNECTED
            logger.info("Disconnected from MCP server: %s", self.server_name)

    async def reconnect(self) -> None:
        """Reconnect to the MCP server.

        Only the state flip happens under the lock. ``disconnect()`` and
        ``connect()`` each acquire ``self._lock`` themselves and
        ``asyncio.Lock`` is NOT reentrant, so awaiting them while holding
        the lock here would deadlock.
        """
        async with self._lock:
            self._state = ConnectionState.RECONNECTING
        await self.disconnect()
        await self.connect()

    async def health_check(self) -> bool:
        """
        Perform a health check on the connection.

        For HTTP transport, probes ``/health`` with a short 5s timeout;
        any exception is treated as unhealthy (logged, not raised).

        Returns:
            True if connection is healthy
        """
        if not self.is_connected or self._client is None:
            return False

        try:
            if self.config.transport == TransportType.HTTP:
                response = await self._client.get(
                    "/health",
                    timeout=5.0,
                )
                return response.status_code == 200
            # Non-HTTP transports have no probe yet; assume healthy.
            return True
        except Exception as e:
            logger.warning(
                "Health check failed for %s: %s",
                self.server_name,
                e,
            )
            return False

    async def execute_request(
        self,
        method: str,
        path: str,
        data: dict[str, Any] | None = None,
        timeout: float | None = None,
    ) -> dict[str, Any]:
        """
        Execute an HTTP request to the MCP server.

        Args:
            method: HTTP method (GET, POST, etc.)
            path: Request path
            data: Optional request body
            timeout: Optional timeout override

        Returns:
            Response data

        Raises:
            MCPConnectionError: If not connected
            MCPTimeoutError: If request times out
        """
        if not self.is_connected or self._client is None:
            raise MCPConnectionError(
                "Not connected to server",
                server_name=self.server_name,
            )

        effective_timeout = timeout or self.config.timeout

        try:
            if method.upper() == "GET":
                response = await self._client.get(
                    path,
                    timeout=effective_timeout,
                )
            elif method.upper() == "POST":
                response = await self._client.post(
                    path,
                    json=data,
                    timeout=effective_timeout,
                )
            else:
                response = await self._client.request(
                    method.upper(),
                    path,
                    json=data,
                    timeout=effective_timeout,
                )

            self._last_activity = time.time()
            response.raise_for_status()
            return response.json()

        except httpx.TimeoutException as e:
            raise MCPTimeoutError(
                "Request timed out",
                server_name=self.server_name,
                timeout_seconds=effective_timeout,
                operation=f"{method} {path}",
            ) from e
        except httpx.HTTPStatusError as e:
            raise MCPConnectionError(
                f"HTTP error: {e.response.status_code}",
                server_name=self.server_name,
                url=f"{self.config.url}{path}",
                cause=e,
            ) from e
        except Exception as e:
            # Catch-all for transport/JSON errors; wrapped for callers.
            raise MCPConnectionError(
                f"Request failed: {e}",
                server_name=self.server_name,
                cause=e,
            ) from e
|
||||||
|
|
||||||
|
|
||||||
|
class ConnectionPool:
    """
    Pool of connections to MCP servers.

    Manages connection lifecycle and provides connection reuse. One
    MCPConnection is kept per server name; a global lock guards the
    connection map while per-server locks serialize connection setup.
    """

    def __init__(self, max_connections_per_server: int = 10) -> None:
        """
        Initialize connection pool.

        Args:
            max_connections_per_server: Maximum connections per server
        """
        # server name -> single shared MCPConnection
        self._connections: dict[str, MCPConnection] = {}
        # Guards _connections and _per_server_locks.
        self._lock = asyncio.Lock()
        # Per-server locks so establishing one server's connection does not
        # block callers of other servers.
        self._per_server_locks: dict[str, asyncio.Lock] = {}
        # NOTE(review): not enforced anywhere in this class yet — the pool
        # currently holds one connection per server regardless of this value.
        self._max_per_server = max_connections_per_server

    def _get_server_lock(self, server_name: str) -> asyncio.Lock:
        """Get or create a lock for a specific server.

        Uses setdefault for atomic dict access to prevent race conditions
        where two coroutines could create different locks for the same server.
        """
        # setdefault is atomic - if key exists, returns existing value
        # if key doesn't exist, inserts new value and returns it
        return self._per_server_locks.setdefault(server_name, asyncio.Lock())

    async def get_connection(
        self,
        server_name: str,
        config: MCPServerConfig,
    ) -> MCPConnection:
        """
        Get or create a connection to a server.

        Uses per-server locking to avoid blocking all connections
        when establishing a new connection.

        Args:
            server_name: Name of the server
            config: Server configuration

        Returns:
            Active connection
        """
        # Quick check without lock - if connection exists and is connected, return it
        if server_name in self._connections:
            connection = self._connections[server_name]
            if connection.is_connected:
                return connection

        # Need to create or reconnect - use per-server lock to avoid blocking others
        async with self._lock:
            server_lock = self._get_server_lock(server_name)

        async with server_lock:
            # Double-check after acquiring per-server lock
            if server_name in self._connections:
                connection = self._connections[server_name]
                if connection.is_connected:
                    return connection
                # Connection exists but not connected - reconnect
                await connection.connect()
                return connection

            # Create new connection (outside global lock, under per-server lock)
            connection = MCPConnection(server_name, config)
            await connection.connect()

            # Store connection under global lock
            async with self._lock:
                self._connections[server_name] = connection

            return connection

    async def release_connection(self, server_name: str) -> None:
        """
        Release a connection (currently just tracks usage).

        Args:
            server_name: Name of the server
        """
        # For now, we keep connections alive
        # Future: implement connection reaping for idle connections

    async def close_connection(self, server_name: str) -> None:
        """
        Close and remove a connection.

        Args:
            server_name: Name of the server
        """
        async with self._lock:
            if server_name in self._connections:
                await self._connections[server_name].disconnect()
                del self._connections[server_name]
            # Clean up per-server lock
            # NOTE(review): a coroutine still waiting on this lock would keep
            # the old object while a newcomer gets a fresh one — confirm this
            # window is acceptable for current usage.
            if server_name in self._per_server_locks:
                del self._per_server_locks[server_name]

    async def close_all(self) -> None:
        """Close all connections in the pool.

        Individual close failures are logged and do not stop the sweep.
        """
        async with self._lock:
            for connection in self._connections.values():
                try:
                    await connection.disconnect()
                except Exception as e:
                    logger.warning("Error closing connection: %s", e)

            self._connections.clear()
            self._per_server_locks.clear()
            logger.info("Closed all MCP connections")

    async def health_check_all(self) -> dict[str, bool]:
        """
        Perform health check on all connections.

        Checks run sequentially over a snapshot of the pool, so the map
        can be mutated concurrently without breaking iteration.

        Returns:
            Dict mapping server names to health status
        """
        # Copy connections under lock to prevent modification during iteration
        async with self._lock:
            connections_snapshot = dict(self._connections)

        results = {}
        for name, connection in connections_snapshot.items():
            results[name] = await connection.health_check()
        return results

    def get_status(self) -> dict[str, dict[str, Any]]:
        """
        Get status of all connections.

        Unsynchronized read: the returned snapshot may be slightly stale
        relative to concurrent pool mutations.

        Returns:
            Dict mapping server names to status info
        """
        return {
            name: {
                "state": conn.state.value,
                "is_connected": conn.is_connected,
                "url": conn.config.url,
            }
            for name, conn in self._connections.items()
        }

    @asynccontextmanager
    async def connection(
        self,
        server_name: str,
        config: MCPServerConfig,
    ) -> AsyncGenerator[MCPConnection, None]:
        """
        Context manager for getting a connection.

        Usage:
            async with pool.connection("server", config) as conn:
                result = await conn.execute_request("POST", "/tool", data)
        """
        conn = await self.get_connection(server_name, config)
        try:
            yield conn
        finally:
            await self.release_connection(server_name)
|
||||||
201
backend/app/services/mcp/exceptions.py
Normal file
201
backend/app/services/mcp/exceptions.py
Normal file
@@ -0,0 +1,201 @@
|
|||||||
|
"""
|
||||||
|
MCP Exception Classes
|
||||||
|
|
||||||
|
Custom exceptions for MCP client operations with detailed error context.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
class MCPError(Exception):
|
||||||
|
"""Base exception for all MCP-related errors."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
message: str,
|
||||||
|
*,
|
||||||
|
server_name: str | None = None,
|
||||||
|
details: dict[str, Any] | None = None,
|
||||||
|
) -> None:
|
||||||
|
super().__init__(message)
|
||||||
|
self.message = message
|
||||||
|
self.server_name = server_name
|
||||||
|
self.details = details or {}
|
||||||
|
|
||||||
|
def __str__(self) -> str:
|
||||||
|
parts = [self.message]
|
||||||
|
if self.server_name:
|
||||||
|
parts.append(f"server={self.server_name}")
|
||||||
|
if self.details:
|
||||||
|
parts.append(f"details={self.details}")
|
||||||
|
return " | ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
class MCPConnectionError(MCPError):
    """Raised when connection to an MCP server fails.

    Adds the target URL and the underlying exception (``cause``) to the
    base error context.
    """

    def __init__(
        self,
        message: str,
        *,
        server_name: str | None = None,
        url: str | None = None,
        cause: Exception | None = None,
        details: dict[str, Any] | None = None,
    ) -> None:
        super().__init__(message, server_name=server_name, details=details)
        self.url = url
        self.cause = cause

    def __str__(self) -> str:
        parts = [super().__str__()]
        if self.url:
            parts.append(f"url={self.url}")
        if self.cause:
            parts.append(f"cause={type(self.cause).__name__}: {self.cause}")
        return " | ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
class MCPTimeoutError(MCPError):
    """Raised when an MCP operation times out.

    Adds the timeout budget and a short description of the operation
    (e.g. ``"POST /tool"``) to the base error context.
    """

    def __init__(
        self,
        message: str,
        *,
        server_name: str | None = None,
        timeout_seconds: float | None = None,
        operation: str | None = None,
        details: dict[str, Any] | None = None,
    ) -> None:
        super().__init__(message, server_name=server_name, details=details)
        self.timeout_seconds = timeout_seconds
        self.operation = operation

    def __str__(self) -> str:
        parts = [super().__str__()]
        if self.timeout_seconds is not None:
            parts.append(f"timeout={self.timeout_seconds}s")
        if self.operation:
            parts.append(f"operation={self.operation}")
        return " | ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
class MCPToolError(MCPError):
    """Raised when a tool execution fails.

    Records which tool was invoked, the arguments it was given, and an
    optional machine-readable error code from the server.
    """

    def __init__(
        self,
        message: str,
        *,
        server_name: str | None = None,
        tool_name: str | None = None,
        tool_args: dict[str, Any] | None = None,
        error_code: str | None = None,
        details: dict[str, Any] | None = None,
    ) -> None:
        super().__init__(message, server_name=server_name, details=details)
        self.tool_name = tool_name
        self.tool_args = tool_args
        self.error_code = error_code

    def __str__(self) -> str:
        parts = [super().__str__()]
        if self.tool_name:
            parts.append(f"tool={self.tool_name}")
        if self.error_code:
            parts.append(f"error_code={self.error_code}")
        return " | ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
class MCPServerNotFoundError(MCPError):
    """Raised when a requested MCP server is not registered.

    The message is derived from the server name; the list of servers that
    ARE registered is kept for diagnostics.
    """

    def __init__(
        self,
        server_name: str,
        *,
        available_servers: list[str] | None = None,
        details: dict[str, Any] | None = None,
    ) -> None:
        super().__init__(
            f"MCP server not found: {server_name}",
            server_name=server_name,
            details=details,
        )
        self.available_servers = available_servers or []

    def __str__(self) -> str:
        parts = [super().__str__()]
        if self.available_servers:
            parts.append(f"available={self.available_servers}")
        return " | ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
class MCPToolNotFoundError(MCPError):
    """Raised when a requested tool is not found on any server.

    Keeps the available tool list for diagnostics; the string form shows
    at most the first five names.
    """

    def __init__(
        self,
        tool_name: str,
        *,
        server_name: str | None = None,
        available_tools: list[str] | None = None,
        details: dict[str, Any] | None = None,
    ) -> None:
        super().__init__(
            f"Tool not found: {tool_name}",
            server_name=server_name,
            details=details,
        )
        self.tool_name = tool_name
        self.available_tools = available_tools or []

    def __str__(self) -> str:
        parts = [super().__str__()]
        if self.available_tools:
            # Truncate to the first five tools to keep messages short.
            parts.append(f"available_tools={self.available_tools[:5]}...")
        return " | ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
class MCPCircuitOpenError(MCPError):
    """Raised when a circuit breaker is open (server temporarily unavailable).

    Records how many failures tripped the breaker and how long until it
    resets.
    """

    def __init__(
        self,
        server_name: str,
        *,
        failure_count: int | None = None,
        reset_timeout: float | None = None,
        details: dict[str, Any] | None = None,
    ) -> None:
        super().__init__(
            f"Circuit breaker open for server: {server_name}",
            server_name=server_name,
            details=details,
        )
        self.failure_count = failure_count
        self.reset_timeout = reset_timeout

    def __str__(self) -> str:
        parts = [super().__str__()]
        if self.failure_count is not None:
            parts.append(f"failures={self.failure_count}")
        if self.reset_timeout is not None:
            parts.append(f"reset_in={self.reset_timeout}s")
        return " | ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
class MCPValidationError(MCPError):
    """Raised when tool arguments fail validation.

    Carries the offending tool name and a field-name -> error-message map.
    Note: unlike the other subclasses, no server_name is passed upward.
    """

    def __init__(
        self,
        message: str,
        *,
        tool_name: str | None = None,
        field_errors: dict[str, str] | None = None,
        details: dict[str, Any] | None = None,
    ) -> None:
        super().__init__(message, details=details)
        self.tool_name = tool_name
        self.field_errors = field_errors or {}

    def __str__(self) -> str:
        parts = [super().__str__()]
        if self.tool_name:
            parts.append(f"tool={self.tool_name}")
        if self.field_errors:
            parts.append(f"fields={list(self.field_errors.keys())}")
        return " | ".join(parts)
|
||||||
305
backend/app/services/mcp/registry.py
Normal file
305
backend/app/services/mcp/registry.py
Normal file
@@ -0,0 +1,305 @@
|
|||||||
|
"""
|
||||||
|
MCP Server Registry
|
||||||
|
|
||||||
|
Thread-safe singleton registry for managing MCP server configurations
|
||||||
|
and their capabilities.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
from threading import Lock
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from .config import MCPConfig, MCPServerConfig, load_mcp_config
|
||||||
|
from .exceptions import MCPServerNotFoundError
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class ServerCapabilities:
|
||||||
|
"""Cached capabilities for an MCP server."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
tools: list[dict[str, Any]] | None = None,
|
||||||
|
resources: list[dict[str, Any]] | None = None,
|
||||||
|
prompts: list[dict[str, Any]] | None = None,
|
||||||
|
) -> None:
|
||||||
|
self.tools = tools or []
|
||||||
|
self.resources = resources or []
|
||||||
|
self.prompts = prompts or []
|
||||||
|
self._loaded = False
|
||||||
|
self._load_time: float | None = None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def is_loaded(self) -> bool:
|
||||||
|
"""Check if capabilities have been loaded."""
|
||||||
|
return self._loaded
|
||||||
|
|
||||||
|
@property
|
||||||
|
def tool_names(self) -> list[str]:
|
||||||
|
"""Get list of tool names."""
|
||||||
|
return [t.get("name", "") for t in self.tools if t.get("name")]
|
||||||
|
|
||||||
|
def mark_loaded(self) -> None:
|
||||||
|
"""Mark capabilities as loaded."""
|
||||||
|
import time
|
||||||
|
|
||||||
|
self._loaded = True
|
||||||
|
self._load_time = time.time()
|
||||||
|
|
||||||
|
|
||||||
|
class MCPServerRegistry:
|
||||||
|
"""
|
||||||
|
Thread-safe singleton registry for MCP servers.
|
||||||
|
|
||||||
|
Manages server configurations and caches their capabilities.
|
||||||
|
"""
|
||||||
|
|
||||||
|
_instance: "MCPServerRegistry | None" = None
|
||||||
|
_lock = Lock()
|
||||||
|
|
||||||
|
def __new__(cls) -> "MCPServerRegistry":
|
||||||
|
"""Ensure singleton pattern."""
|
||||||
|
with cls._lock:
|
||||||
|
if cls._instance is None:
|
||||||
|
cls._instance = super().__new__(cls)
|
||||||
|
cls._instance._initialized = False
|
||||||
|
return cls._instance
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
"""Initialize registry (only runs once due to singleton)."""
|
||||||
|
if getattr(self, "_initialized", False):
|
||||||
|
return
|
||||||
|
|
||||||
|
self._config: MCPConfig = MCPConfig()
|
||||||
|
self._capabilities: dict[str, ServerCapabilities] = {}
|
||||||
|
self._capabilities_lock = asyncio.Lock()
|
||||||
|
self._initialized = True
|
||||||
|
|
||||||
|
logger.info("MCP Server Registry initialized")
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def get_instance(cls) -> "MCPServerRegistry":
|
||||||
|
"""Get the singleton registry instance."""
|
||||||
|
return cls()
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def reset_instance(cls) -> None:
|
||||||
|
"""Reset the singleton (for testing)."""
|
||||||
|
with cls._lock:
|
||||||
|
cls._instance = None
|
||||||
|
|
||||||
|
def load_config(self, config: MCPConfig | None = None) -> None:
|
||||||
|
"""
|
||||||
|
Load configuration into the registry.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config: Optional config to load. If None, loads from default path.
|
||||||
|
"""
|
||||||
|
if config is None:
|
||||||
|
config = load_mcp_config()
|
||||||
|
|
||||||
|
self._config = config
|
||||||
|
self._capabilities.clear()
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"Loaded MCP configuration with %d servers",
|
||||||
|
len(config.mcp_servers),
|
||||||
|
)
|
||||||
|
for name in config.list_server_names():
|
||||||
|
logger.debug("Registered MCP server: %s", name)
|
||||||
|
|
||||||
|
def register(self, name: str, config: MCPServerConfig) -> None:
|
||||||
|
"""
|
||||||
|
Register a new MCP server.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
name: Unique server name
|
||||||
|
config: Server configuration
|
||||||
|
"""
|
||||||
|
self._config.mcp_servers[name] = config
|
||||||
|
self._capabilities.pop(name, None) # Clear any cached capabilities
|
||||||
|
|
||||||
|
logger.info("Registered MCP server: %s at %s", name, config.url)
|
||||||
|
|
||||||
|
def unregister(self, name: str) -> bool:
|
||||||
|
"""
|
||||||
|
Unregister an MCP server.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
name: Server name to unregister
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if server was found and removed
|
||||||
|
"""
|
||||||
|
if name in self._config.mcp_servers:
|
||||||
|
del self._config.mcp_servers[name]
|
||||||
|
self._capabilities.pop(name, None)
|
||||||
|
logger.info("Unregistered MCP server: %s", name)
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
def get(self, name: str) -> MCPServerConfig:
|
||||||
|
"""
|
||||||
|
Get a server configuration by name.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
name: Server name
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Server configuration
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
MCPServerNotFoundError: If server is not registered
|
||||||
|
"""
|
||||||
|
config = self._config.get_server(name)
|
||||||
|
if config is None:
|
||||||
|
raise MCPServerNotFoundError(
|
||||||
|
server_name=name,
|
||||||
|
available_servers=self.list_servers(),
|
||||||
|
)
|
||||||
|
return config
|
||||||
|
|
||||||
|
def get_or_none(self, name: str) -> MCPServerConfig | None:
    """
    Non-raising variant of :meth:`get`.

    Args:
        name: Server name

    Returns:
        The server configuration, or None when no such server exists.
    """
    return self._config.get_server(name)
|
||||||
|
|
||||||
|
def list_servers(self) -> list[str]:
    """Return the names of every registered server."""
    return self._config.list_server_names()
|
||||||
|
|
||||||
|
def list_enabled_servers(self) -> list[str]:
    """Return the names of servers that are currently enabled."""
    # Iterating the mapping yields its keys.
    return [*self._config.get_enabled_servers()]
|
||||||
|
|
||||||
|
def get_all_configs(self) -> dict[str, MCPServerConfig]:
    """Return a shallow copy of every server configuration, keyed by name."""
    return self._config.mcp_servers.copy()
|
||||||
|
|
||||||
|
def get_enabled_configs(self) -> dict[str, MCPServerConfig]:
    """Return the configurations of all enabled servers, keyed by name."""
    return self._config.get_enabled_servers()
|
||||||
|
|
||||||
|
async def get_capabilities(
    self,
    name: str,
    force_refresh: bool = False,
) -> ServerCapabilities:
    """
    Return a server's capabilities, lazily creating a cache entry.

    The entry starts empty; the connection manager fills it in when it
    actually connects to the server.

    Args:
        name: Server name
        force_refresh: When True, replace any cached entry with a fresh one

    Returns:
        Server capabilities

    Raises:
        MCPServerNotFoundError: If server is not registered
    """
    # Raises for unknown servers; the returned config is not needed here.
    self.get(name)

    async with self._capabilities_lock:
        if force_refresh or name not in self._capabilities:
            # Placeholder; populated by the connection manager on connect.
            self._capabilities[name] = ServerCapabilities()

    return self._capabilities[name]
|
||||||
|
|
||||||
|
def set_capabilities(
    self,
    name: str,
    tools: list[dict[str, Any]] | None = None,
    resources: list[dict[str, Any]] | None = None,
    prompts: list[dict[str, Any]] | None = None,
) -> None:
    """
    Record the capabilities discovered for a server.

    Intended to be invoked by the connection manager after it has
    talked to the server.

    Args:
        name: Server name
        tools: List of tool definitions
        resources: List of resource definitions
        prompts: List of prompt definitions
    """
    # NOTE(review): this replaces the cache entry without holding
    # self._capabilities_lock (sync method, async lock) — presumably safe
    # when called from the event loop thread; confirm with callers.
    caps = ServerCapabilities(
        tools=tools,
        resources=resources,
        prompts=prompts,
    )
    caps.mark_loaded()
    self._capabilities[name] = caps

    logger.debug(
        "Updated capabilities for %s: %d tools, %d resources, %d prompts",
        name,
        len(caps.tools),
        len(caps.resources),
        len(caps.prompts),
    )
|
||||||
|
|
||||||
|
def get_cached_capabilities(self, name: str) -> ServerCapabilities:
    """
    Return cached capabilities without any async loading.

    Suitable for synchronous call sites that only need whatever is
    already cached (e.g. health check responses).

    Args:
        name: Server name

    Returns:
        Cached capabilities, or a fresh empty ServerCapabilities when
        nothing is cached for *name*.
    """
    cached = self._capabilities.get(name)
    return ServerCapabilities() if cached is None else cached
|
||||||
|
|
||||||
|
def find_server_for_tool(self, tool_name: str) -> str | None:
    """
    Locate the server whose cached capabilities include a tool.

    Args:
        tool_name: Name of the tool to find

    Returns:
        Server name, or None when no cached entry lists the tool.
    """
    return next(
        (
            server
            for server, caps in self._capabilities.items()
            if tool_name in caps.tool_names
        ),
        None,
    )
|
||||||
|
|
||||||
|
def get_all_tools(self) -> dict[str, list[dict[str, Any]]]:
    """
    Collect the tool lists of every server whose capabilities are loaded.

    Returns:
        Mapping of server name to that server's tool definitions.
    """
    loaded: dict[str, list[dict[str, Any]]] = {}
    for server, caps in self._capabilities.items():
        # Skip placeholder entries that were never populated.
        if caps.is_loaded:
            loaded[server] = caps.tools
    return loaded
|
||||||
|
|
||||||
|
@property
def global_config(self) -> MCPConfig:
    """The global MCP configuration backing this registry."""
    return self._config
|
||||||
|
|
||||||
|
|
||||||
|
# Module-level convenience function
|
||||||
|
def get_registry() -> MCPServerRegistry:
    """Module-level convenience: return the global registry singleton."""
    return MCPServerRegistry.get_instance()
|
||||||
619
backend/app/services/mcp/routing.py
Normal file
619
backend/app/services/mcp/routing.py
Normal file
@@ -0,0 +1,619 @@
|
|||||||
|
"""
|
||||||
|
MCP Tool Call Routing
|
||||||
|
|
||||||
|
Routes tool calls to appropriate servers with retry logic,
|
||||||
|
circuit breakers, and request/response serialization.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
import uuid
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from .config import MCPServerConfig
|
||||||
|
from .connection import ConnectionPool, MCPConnection
|
||||||
|
from .exceptions import (
|
||||||
|
MCPCircuitOpenError,
|
||||||
|
MCPError,
|
||||||
|
MCPTimeoutError,
|
||||||
|
MCPToolError,
|
||||||
|
MCPToolNotFoundError,
|
||||||
|
)
|
||||||
|
from .registry import MCPServerRegistry
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class CircuitState(Enum):
    """Lifecycle states of a circuit breaker."""

    # Normal operation: calls flow through.
    CLOSED = "closed"
    # Tripped: calls are rejected until the reset timeout elapses.
    OPEN = "open"
    # Probation: a trial call is permitted to probe recovery.
    HALF_OPEN = "half-open"
|
||||||
|
|
||||||
|
|
||||||
|
class AsyncCircuitBreaker:
    """
    Circuit breaker designed for async call sites.

    Rather than wrapping a sync callable (as pybreaker does), callers
    explicitly report success() / failure() around their awaited work
    and consult is_open() before attempting a call.
    """

    def __init__(
        self,
        fail_max: int = 5,
        reset_timeout: float = 30.0,
        name: str = "",
    ) -> None:
        """
        Initialize circuit breaker.

        Args:
            fail_max: Consecutive failures that trip the circuit open
            reset_timeout: Seconds the circuit stays open before a trial
                call is permitted
            name: Identifier used in log messages
        """
        self.fail_max = fail_max
        self.reset_timeout = reset_timeout
        self.name = name
        self._state = CircuitState.CLOSED
        self._fail_counter = 0
        self._last_failure_time: float | None = None
        self._lock = asyncio.Lock()

    @property
    def current_state(self) -> str:
        """Current state name; an open circuit whose timeout has elapsed
        reports itself as "half-open"."""
        if self._state is CircuitState.OPEN and self._should_try_reset():
            return CircuitState.HALF_OPEN.value
        return self._state.value

    @property
    def fail_counter(self) -> int:
        """Number of consecutively recorded failures."""
        return self._fail_counter

    def _should_try_reset(self) -> bool:
        """True once the reset timeout has elapsed since the last failure."""
        last = self._last_failure_time
        return last is None or (time.time() - last) >= self.reset_timeout

    async def success(self) -> None:
        """Record a successful call: close the circuit and clear counters."""
        async with self._lock:
            self._state = CircuitState.CLOSED
            self._fail_counter = 0
            self._last_failure_time = None

    async def failure(self) -> None:
        """Record a failed call, opening the circuit at the threshold."""
        async with self._lock:
            self._fail_counter += 1
            self._last_failure_time = time.time()

            if self._fail_counter >= self.fail_max:
                self._state = CircuitState.OPEN
                logger.warning(
                    "Circuit breaker %s opened after %d failures",
                    self.name,
                    self._fail_counter,
                )

    def is_open(self) -> bool:
        """Whether calls should currently be rejected.

        An open circuit past its reset timeout admits a trial call and
        therefore reports False here.
        """
        return self._state is CircuitState.OPEN and not self._should_try_reset()

    async def reset(self) -> None:
        """Force the circuit back to the closed state."""
        async with self._lock:
            self._state = CircuitState.CLOSED
            self._fail_counter = 0
            self._last_failure_time = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ToolInfo:
|
||||||
|
"""Information about an available tool."""
|
||||||
|
|
||||||
|
name: str
|
||||||
|
description: str | None = None
|
||||||
|
server_name: str | None = None
|
||||||
|
input_schema: dict[str, Any] | None = None
|
||||||
|
|
||||||
|
def to_dict(self) -> dict[str, Any]:
|
||||||
|
"""Convert to dictionary."""
|
||||||
|
return {
|
||||||
|
"name": self.name,
|
||||||
|
"description": self.description,
|
||||||
|
"server_name": self.server_name,
|
||||||
|
"input_schema": self.input_schema,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class ToolResult:
|
||||||
|
"""Result of a tool execution."""
|
||||||
|
|
||||||
|
success: bool
|
||||||
|
data: Any = None
|
||||||
|
error: str | None = None
|
||||||
|
error_code: str | None = None
|
||||||
|
tool_name: str | None = None
|
||||||
|
server_name: str | None = None
|
||||||
|
execution_time_ms: float = 0.0
|
||||||
|
request_id: str = field(default_factory=lambda: str(uuid.uuid4()))
|
||||||
|
|
||||||
|
def to_dict(self) -> dict[str, Any]:
|
||||||
|
"""Convert to dictionary."""
|
||||||
|
return {
|
||||||
|
"success": self.success,
|
||||||
|
"data": self.data,
|
||||||
|
"error": self.error,
|
||||||
|
"error_code": self.error_code,
|
||||||
|
"tool_name": self.tool_name,
|
||||||
|
"server_name": self.server_name,
|
||||||
|
"execution_time_ms": self.execution_time_ms,
|
||||||
|
"request_id": self.request_id,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class ToolRouter:
    """
    Dispatches tool calls to the MCP server that provides each tool.

    Responsibilities:
    - Maintains a tool-name -> server-name mapping
    - Retries transient failures with exponential backoff and jitter
    - Guards each server behind a circuit breaker
    - Serializes requests/responses as JSON-RPC
    - Records per-call timing and request IDs
    """

    def __init__(
        self,
        registry: MCPServerRegistry,
        connection_pool: ConnectionPool,
    ) -> None:
        """
        Initialize the tool router.

        Args:
            registry: Registry of known MCP servers
            connection_pool: Pool providing connections to those servers
        """
        self._registry = registry
        self._pool = connection_pool
        self._circuit_breakers: dict[str, AsyncCircuitBreaker] = {}
        self._tool_to_server: dict[str, str] = {}
        self._lock = asyncio.Lock()

    def _get_circuit_breaker(
        self,
        server_name: str,
        config: MCPServerConfig,
    ) -> AsyncCircuitBreaker:
        """Return the server's circuit breaker, creating it on first use."""
        breaker = self._circuit_breakers.get(server_name)
        if breaker is None:
            breaker = AsyncCircuitBreaker(
                fail_max=config.circuit_breaker_threshold,
                reset_timeout=config.circuit_breaker_timeout,
                name=f"mcp-{server_name}",
            )
            self._circuit_breakers[server_name] = breaker
        return breaker

    async def register_tool_mapping(
        self,
        tool_name: str,
        server_name: str,
    ) -> None:
        """
        Remember which server provides a tool.

        Args:
            tool_name: Name of the tool
            server_name: Name of the server providing the tool
        """
        async with self._lock:
            self._tool_to_server[tool_name] = server_name
            logger.debug("Registered tool %s -> server %s", tool_name, server_name)

    async def discover_tools(self) -> None:
        """Query every enabled server for its tools and record the mappings."""
        for server_name in self._registry.list_enabled_servers():
            try:
                config = self._registry.get(server_name)
                connection = await self._pool.get_connection(server_name, config)

                tools = await self._fetch_tools_from_server(connection)

                # Cache the discovered capabilities on the registry.
                self._registry.set_capabilities(
                    server_name,
                    tools=[tool.to_dict() for tool in tools],
                )

                for tool in tools:
                    await self.register_tool_mapping(tool.name, server_name)

                logger.info(
                    "Discovered %d tools from server %s",
                    len(tools),
                    server_name,
                )
            except Exception as exc:
                # Best-effort: one unreachable server must not stop discovery.
                logger.warning(
                    "Failed to discover tools from %s: %s",
                    server_name,
                    exc,
                )

    async def _fetch_tools_from_server(
        self,
        connection: MCPConnection,
    ) -> list[ToolInfo]:
        """List the tools a server advertises; empty list on any error."""
        try:
            payload = await connection.execute_request(
                "GET",
                "/mcp/tools",
            )
            return [
                ToolInfo(
                    name=entry.get("name", ""),
                    description=entry.get("description"),
                    server_name=connection.server_name,
                    input_schema=entry.get("inputSchema"),
                )
                for entry in payload.get("tools", [])
            ]
        except Exception as exc:
            logger.warning(
                "Error fetching tools from %s: %s",
                connection.server_name,
                exc,
            )
            return []

    def find_server_for_tool(self, tool_name: str) -> str | None:
        """
        Look up the server providing *tool_name* in the local mapping.

        Args:
            tool_name: Name of the tool

        Returns:
            Server name, or None when no mapping is known.
        """
        return self._tool_to_server.get(tool_name)

    async def call_tool(
        self,
        server_name: str,
        tool_name: str,
        arguments: dict[str, Any] | None = None,
        timeout: float | None = None,
    ) -> ToolResult:
        """
        Invoke a tool on a named server.

        Args:
            server_name: Name of the MCP server
            tool_name: Name of the tool to call
            arguments: Tool arguments
            timeout: Optional timeout override

        Returns:
            ToolResult describing success or failure; most errors are
            captured in the result rather than raised.

        Raises:
            MCPCircuitOpenError: When the server's circuit breaker is open.
        """
        started = time.time()
        request_id = str(uuid.uuid4())

        logger.debug(
            "Tool call [%s]: %s.%s with args %s",
            request_id,
            server_name,
            tool_name,
            arguments,
        )

        try:
            config = self._registry.get(server_name)
            breaker = self._get_circuit_breaker(server_name, config)

            # Refuse immediately while the breaker is open.
            if breaker.is_open():
                raise MCPCircuitOpenError(
                    server_name=server_name,
                    failure_count=breaker.fail_counter,
                    reset_timeout=config.circuit_breaker_timeout,
                )

            payload = await self._execute_with_retry(
                server_name=server_name,
                config=config,
                tool_name=tool_name,
                arguments=arguments or {},
                timeout=timeout,
                circuit_breaker=breaker,
            )

            return ToolResult(
                success=True,
                data=payload,
                tool_name=tool_name,
                server_name=server_name,
                execution_time_ms=(time.time() - started) * 1000,
                request_id=request_id,
            )

        except MCPCircuitOpenError:
            raise
        except MCPError as exc:
            elapsed_ms = (time.time() - started) * 1000
            logger.error(
                "Tool call failed [%s]: %s.%s - %s",
                request_id,
                server_name,
                tool_name,
                exc,
            )
            return ToolResult(
                success=False,
                error=str(exc),
                error_code=type(exc).__name__,
                tool_name=tool_name,
                server_name=server_name,
                execution_time_ms=elapsed_ms,
                request_id=request_id,
            )
        except Exception as exc:
            elapsed_ms = (time.time() - started) * 1000
            logger.exception(
                "Unexpected error in tool call [%s]: %s.%s",
                request_id,
                server_name,
                tool_name,
            )
            return ToolResult(
                success=False,
                error=str(exc),
                error_code="UnexpectedError",
                tool_name=tool_name,
                server_name=server_name,
                execution_time_ms=elapsed_ms,
                request_id=request_id,
            )

    async def _execute_with_retry(
        self,
        server_name: str,
        config: MCPServerConfig,
        tool_name: str,
        arguments: dict[str, Any],
        timeout: float | None,
        circuit_breaker: AsyncCircuitBreaker,
    ) -> Any:
        """Run the tool call, retrying transient failures with backoff."""
        last_error: Exception | None = None
        max_attempts = config.retry_attempts + 1  # +1 for the initial attempt

        for attempt in range(1, max_attempts + 1):
            try:
                outcome = await self._execute_tool_call(
                    server_name=server_name,
                    config=config,
                    tool_name=tool_name,
                    arguments=arguments,
                    timeout=timeout,
                )
            except MCPCircuitOpenError:
                raise
            except MCPTimeoutError:
                # Timeouts are terminal: count the failure but do not retry.
                await circuit_breaker.failure()
                raise
            except MCPToolError:
                # Tool-level (user) errors are not transient; don't retry.
                raise
            except Exception as exc:
                last_error = exc
                await circuit_breaker.failure()

                if attempt < max_attempts:
                    delay = self._calculate_retry_delay(attempt, config)
                    logger.warning(
                        "Tool call attempt %d/%d failed for %s.%s: %s. "
                        "Retrying in %.1fs",
                        attempt,
                        max_attempts,
                        server_name,
                        tool_name,
                        exc,
                        delay,
                    )
                    await asyncio.sleep(delay)
            else:
                # Success: feed the breaker and hand the result back.
                await circuit_breaker.success()
                return outcome

        # Every attempt failed with a retryable error.
        raise MCPToolError(
            f"Tool call failed after {max_attempts} attempts",
            server_name=server_name,
            tool_name=tool_name,
            tool_args=arguments,
            details={"last_error": str(last_error)},
        )

    def _calculate_retry_delay(
        self,
        attempt: int,
        config: MCPServerConfig,
    ) -> float:
        """Exponential backoff capped at retry_max_delay, with ±25% jitter."""
        import random

        base = min(config.retry_delay * (2 ** (attempt - 1)), config.retry_max_delay)
        jitter = base * 0.25 * (random.random() * 2 - 1)
        # Never sleep for less than 100 ms.
        return max(0.1, base + jitter)

    async def _execute_tool_call(
        self,
        server_name: str,
        config: MCPServerConfig,
        tool_name: str,
        arguments: dict[str, Any],
        timeout: float | None,
    ) -> Any:
        """Perform one JSON-RPC tools/call round trip."""
        connection = await self._pool.get_connection(server_name, config)

        request_body = {
            "jsonrpc": "2.0",
            "method": "tools/call",
            "params": {
                "name": tool_name,
                "arguments": arguments,
            },
            "id": str(uuid.uuid4()),
        }

        response = await connection.execute_request(
            method="POST",
            path="/mcp",
            data=request_body,
            timeout=timeout,
        )

        # A JSON-RPC "error" member means the tool itself failed.
        if "error" in response:
            rpc_error = response["error"]
            raise MCPToolError(
                rpc_error.get("message", "Tool execution failed"),
                server_name=server_name,
                tool_name=tool_name,
                tool_args=arguments,
                error_code=str(rpc_error.get("code", "UNKNOWN")),
            )

        return response.get("result")

    async def route_tool(
        self,
        tool_name: str,
        arguments: dict[str, Any] | None = None,
        timeout: float | None = None,
    ) -> ToolResult:
        """
        Dispatch a tool call to whichever server provides the tool.

        Args:
            tool_name: Name of the tool to call
            arguments: Tool arguments
            timeout: Optional timeout override

        Returns:
            Tool execution result

        Raises:
            MCPToolNotFoundError: If no server provides the tool
        """
        server_name = self.find_server_for_tool(tool_name)

        # Fall back to the registry's capability cache.
        if server_name is None:
            server_name = self._registry.find_server_for_tool(tool_name)

        if server_name is None:
            raise MCPToolNotFoundError(
                tool_name=tool_name,
                available_tools=list(self._tool_to_server),
            )

        return await self.call_tool(
            server_name=server_name,
            tool_name=tool_name,
            arguments=arguments,
            timeout=timeout,
        )

    async def list_all_tools(self) -> list[ToolInfo]:
        """
        Collect every known tool across all servers.

        Returns:
            ToolInfo objects built from the registry's cached capabilities.
        """
        collected: list[ToolInfo] = []
        for server_name, server_tools in self._registry.get_all_tools().items():
            collected.extend(
                ToolInfo(
                    name=entry.get("name", ""),
                    description=entry.get("description"),
                    server_name=server_name,
                    input_schema=entry.get("input_schema"),
                )
                for entry in server_tools
            )
        return collected

    def get_circuit_breaker_status(self) -> dict[str, dict[str, Any]]:
        """Snapshot of every circuit breaker's state and failure count."""
        status: dict[str, dict[str, Any]] = {}
        for name, breaker in self._circuit_breakers.items():
            status[name] = {
                "state": breaker.current_state,
                "failure_count": breaker.fail_counter,
            }
        return status

    async def reset_circuit_breaker(self, server_name: str) -> bool:
        """
        Manually clear a server's circuit breaker.

        Args:
            server_name: Name of the server

        Returns:
            True if a breaker existed and was cleared.
        """
        async with self._lock:
            if server_name not in self._circuit_breakers:
                return False
            # Dropping the breaker lets it be recreated fresh on demand.
            del self._circuit_breakers[server_name]
            logger.info("Reset circuit breaker for %s", server_name)
            return True
|
||||||
@@ -343,7 +343,9 @@ class OAuthService:
|
|||||||
await oauth_account.update_tokens(
|
await oauth_account.update_tokens(
|
||||||
db,
|
db,
|
||||||
account=existing_oauth,
|
account=existing_oauth,
|
||||||
access_token_encrypted=token.get("access_token"), refresh_token_encrypted=token.get("refresh_token"), token_expires_at=datetime.now(UTC)
|
access_token_encrypted=token.get("access_token"),
|
||||||
|
refresh_token_encrypted=token.get("refresh_token"),
|
||||||
|
token_expires_at=datetime.now(UTC)
|
||||||
+ timedelta(seconds=token.get("expires_in", 3600)),
|
+ timedelta(seconds=token.get("expires_in", 3600)),
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -375,7 +377,9 @@ class OAuthService:
|
|||||||
provider=provider,
|
provider=provider,
|
||||||
provider_user_id=provider_user_id,
|
provider_user_id=provider_user_id,
|
||||||
provider_email=provider_email,
|
provider_email=provider_email,
|
||||||
access_token_encrypted=token.get("access_token"), refresh_token_encrypted=token.get("refresh_token"), token_expires_at=datetime.now(UTC)
|
access_token_encrypted=token.get("access_token"),
|
||||||
|
refresh_token_encrypted=token.get("refresh_token"),
|
||||||
|
token_expires_at=datetime.now(UTC)
|
||||||
+ timedelta(seconds=token.get("expires_in", 3600))
|
+ timedelta(seconds=token.get("expires_in", 3600))
|
||||||
if token.get("expires_in")
|
if token.get("expires_in")
|
||||||
else None,
|
else None,
|
||||||
@@ -644,7 +648,9 @@ class OAuthService:
|
|||||||
provider=provider,
|
provider=provider,
|
||||||
provider_user_id=provider_user_id,
|
provider_user_id=provider_user_id,
|
||||||
provider_email=email,
|
provider_email=email,
|
||||||
access_token_encrypted=token.get("access_token"), refresh_token_encrypted=token.get("refresh_token"), token_expires_at=datetime.now(UTC)
|
access_token_encrypted=token.get("access_token"),
|
||||||
|
refresh_token_encrypted=token.get("refresh_token"),
|
||||||
|
token_expires_at=datetime.now(UTC)
|
||||||
+ timedelta(seconds=token.get("expires_in", 3600))
|
+ timedelta(seconds=token.get("expires_in", 3600))
|
||||||
if token.get("expires_in")
|
if token.get("expires_in")
|
||||||
else None,
|
else None,
|
||||||
|
|||||||
170
backend/app/services/safety/__init__.py
Normal file
170
backend/app/services/safety/__init__.py
Normal file
@@ -0,0 +1,170 @@
|
|||||||
|
"""
|
||||||
|
Safety and Guardrails Framework
|
||||||
|
|
||||||
|
Comprehensive safety framework for autonomous agent operation.
|
||||||
|
Provides multi-layered protection including:
|
||||||
|
- Pre-execution validation
|
||||||
|
- Cost and budget controls
|
||||||
|
- Rate limiting
|
||||||
|
- Loop detection and prevention
|
||||||
|
- Human-in-the-loop approval
|
||||||
|
- Rollback and checkpointing
|
||||||
|
- Content filtering
|
||||||
|
- Sandboxed execution
|
||||||
|
- Emergency controls
|
||||||
|
- Complete audit trail
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
from app.services.safety import get_safety_guardian, SafetyGuardian
|
||||||
|
|
||||||
|
guardian = await get_safety_guardian()
|
||||||
|
result = await guardian.validate(action_request)
|
||||||
|
|
||||||
|
if result.allowed:
|
||||||
|
# Execute action
|
||||||
|
pass
|
||||||
|
else:
|
||||||
|
# Handle denial
|
||||||
|
print(f"Action denied: {result.reasons}")
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Exceptions
|
||||||
|
# Audit
|
||||||
|
from .audit import (
|
||||||
|
AuditLogger,
|
||||||
|
get_audit_logger,
|
||||||
|
reset_audit_logger,
|
||||||
|
shutdown_audit_logger,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Configuration
|
||||||
|
from .config import (
|
||||||
|
AutonomyConfig,
|
||||||
|
SafetyConfig,
|
||||||
|
get_autonomy_config,
|
||||||
|
get_default_policy,
|
||||||
|
get_policy_for_autonomy_level,
|
||||||
|
get_safety_config,
|
||||||
|
load_policies_from_directory,
|
||||||
|
load_policy_from_file,
|
||||||
|
reset_config_cache,
|
||||||
|
)
|
||||||
|
from .exceptions import (
|
||||||
|
ApprovalDeniedError,
|
||||||
|
ApprovalRequiredError,
|
||||||
|
ApprovalTimeoutError,
|
||||||
|
BudgetExceededError,
|
||||||
|
CheckpointError,
|
||||||
|
ContentFilterError,
|
||||||
|
EmergencyStopError,
|
||||||
|
LoopDetectedError,
|
||||||
|
PermissionDeniedError,
|
||||||
|
PolicyViolationError,
|
||||||
|
RateLimitExceededError,
|
||||||
|
RollbackError,
|
||||||
|
SafetyError,
|
||||||
|
SandboxError,
|
||||||
|
SandboxTimeoutError,
|
||||||
|
ValidationError,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Guardian
|
||||||
|
from .guardian import (
|
||||||
|
SafetyGuardian,
|
||||||
|
get_safety_guardian,
|
||||||
|
reset_safety_guardian,
|
||||||
|
shutdown_safety_guardian,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Models
|
||||||
|
from .models import (
|
||||||
|
ActionMetadata,
|
||||||
|
ActionRequest,
|
||||||
|
ActionResult,
|
||||||
|
ActionType,
|
||||||
|
ApprovalRequest,
|
||||||
|
ApprovalResponse,
|
||||||
|
ApprovalStatus,
|
||||||
|
AuditEvent,
|
||||||
|
AuditEventType,
|
||||||
|
AutonomyLevel,
|
||||||
|
BudgetScope,
|
||||||
|
BudgetStatus,
|
||||||
|
Checkpoint,
|
||||||
|
CheckpointType,
|
||||||
|
GuardianResult,
|
||||||
|
PermissionLevel,
|
||||||
|
RateLimitConfig,
|
||||||
|
RateLimitStatus,
|
||||||
|
ResourceType,
|
||||||
|
RollbackResult,
|
||||||
|
SafetyDecision,
|
||||||
|
SafetyPolicy,
|
||||||
|
ValidationResult,
|
||||||
|
ValidationRule,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"ActionMetadata",
|
||||||
|
"ActionRequest",
|
||||||
|
"ActionResult",
|
||||||
|
# Models
|
||||||
|
"ActionType",
|
||||||
|
"ApprovalDeniedError",
|
||||||
|
"ApprovalRequest",
|
||||||
|
"ApprovalRequiredError",
|
||||||
|
"ApprovalResponse",
|
||||||
|
"ApprovalStatus",
|
||||||
|
"ApprovalTimeoutError",
|
||||||
|
"AuditEvent",
|
||||||
|
"AuditEventType",
|
||||||
|
# Audit
|
||||||
|
"AuditLogger",
|
||||||
|
"AutonomyConfig",
|
||||||
|
"AutonomyLevel",
|
||||||
|
"BudgetExceededError",
|
||||||
|
"BudgetScope",
|
||||||
|
"BudgetStatus",
|
||||||
|
"Checkpoint",
|
||||||
|
"CheckpointError",
|
||||||
|
"CheckpointType",
|
||||||
|
"ContentFilterError",
|
||||||
|
"EmergencyStopError",
|
||||||
|
"GuardianResult",
|
||||||
|
"LoopDetectedError",
|
||||||
|
"PermissionDeniedError",
|
||||||
|
"PermissionLevel",
|
||||||
|
"PolicyViolationError",
|
||||||
|
"RateLimitConfig",
|
||||||
|
"RateLimitExceededError",
|
||||||
|
"RateLimitStatus",
|
||||||
|
"ResourceType",
|
||||||
|
"RollbackError",
|
||||||
|
"RollbackResult",
|
||||||
|
# Configuration
|
||||||
|
"SafetyConfig",
|
||||||
|
"SafetyDecision",
|
||||||
|
# Exceptions
|
||||||
|
"SafetyError",
|
||||||
|
# Guardian
|
||||||
|
"SafetyGuardian",
|
||||||
|
"SafetyPolicy",
|
||||||
|
"SandboxError",
|
||||||
|
"SandboxTimeoutError",
|
||||||
|
"ValidationError",
|
||||||
|
"ValidationResult",
|
||||||
|
"ValidationRule",
|
||||||
|
"get_audit_logger",
|
||||||
|
"get_autonomy_config",
|
||||||
|
"get_default_policy",
|
||||||
|
"get_policy_for_autonomy_level",
|
||||||
|
"get_safety_config",
|
||||||
|
"get_safety_guardian",
|
||||||
|
"load_policies_from_directory",
|
||||||
|
"load_policy_from_file",
|
||||||
|
"reset_audit_logger",
|
||||||
|
"reset_config_cache",
|
||||||
|
"reset_safety_guardian",
|
||||||
|
"shutdown_audit_logger",
|
||||||
|
"shutdown_safety_guardian",
|
||||||
|
]
|
||||||
19
backend/app/services/safety/audit/__init__.py
Normal file
19
backend/app/services/safety/audit/__init__.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
"""
Audit System

Comprehensive audit logging for all safety-related events.
"""

# Public re-exports: the package surface is the logger implementation's
# class plus its singleton lifecycle helpers.
from .logger import (
    AuditLogger,
    get_audit_logger,
    reset_audit_logger,
    shutdown_audit_logger,
)

# Explicit public API of the package.
__all__ = [
    "AuditLogger",
    "get_audit_logger",
    "reset_audit_logger",
    "shutdown_audit_logger",
]
|
||||||
581
backend/app/services/safety/audit/logger.py
Normal file
581
backend/app/services/safety/audit/logger.py
Normal file
@@ -0,0 +1,581 @@
|
|||||||
|
"""
|
||||||
|
Audit Logger
|
||||||
|
|
||||||
|
Comprehensive audit logging for all safety-related events.
|
||||||
|
Provides tamper detection, structured logging, and compliance support.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from collections import deque
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from typing import Any
|
||||||
|
from uuid import uuid4
|
||||||
|
|
||||||
|
from ..config import get_safety_config
|
||||||
|
from ..models import (
|
||||||
|
ActionRequest,
|
||||||
|
AuditEvent,
|
||||||
|
AuditEventType,
|
||||||
|
SafetyDecision,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class AuditLogger:
    """
    Audit logger for safety events.

    Features:
    - Structured event logging
    - In-memory buffer with async flush
    - Tamper detection via hash chains
    - Query/search capability
    - Retention policy enforcement
    """

    def __init__(
        self,
        max_buffer_size: int = 1000,
        flush_interval_seconds: float = 10.0,
        enable_hash_chain: bool = True,
    ) -> None:
        """
        Initialize the audit logger.

        Args:
            max_buffer_size: Maximum events to buffer before auto-flush
            flush_interval_seconds: Interval for periodic flush
            enable_hash_chain: Enable tamper detection via hash chain
        """
        # Bounded buffer: if it fills before a flush, deque(maxlen=...) drops
        # the oldest entries silently.
        self._buffer: deque[AuditEvent] = deque(maxlen=max_buffer_size)
        # Flushed events.  In production this would be database/storage-backed.
        self._persisted: list[AuditEvent] = []
        self._flush_interval = flush_interval_seconds
        self._enable_hash_chain = enable_hash_chain
        # Hash of the most recently logged event; the next event chains to it.
        self._last_hash: str | None = None
        self._lock = asyncio.Lock()
        self._flush_task: asyncio.Task[None] | None = None
        self._running = False

        # Event handlers for real-time processing (sync or async callables).
        self._handlers: list[Any] = []

        config = get_safety_config()
        self._retention_days = config.audit_retention_days
        self._include_sensitive = config.audit_include_sensitive

    async def start(self) -> None:
        """Start the audit logger background tasks (idempotent)."""
        if self._running:
            return

        self._running = True
        self._flush_task = asyncio.create_task(self._periodic_flush())
        logger.info("Audit logger started")

    async def stop(self) -> None:
        """Stop the audit logger and flush remaining events."""
        self._running = False

        if self._flush_task:
            self._flush_task.cancel()
            try:
                await self._flush_task
            except asyncio.CancelledError:
                pass

        # Final flush
        await self.flush()
        logger.info("Audit logger stopped")

    async def log(
        self,
        event_type: AuditEventType,
        *,
        agent_id: str | None = None,
        action_id: str | None = None,
        project_id: str | None = None,
        session_id: str | None = None,
        user_id: str | None = None,
        decision: SafetyDecision | None = None,
        details: dict[str, Any] | None = None,
        correlation_id: str | None = None,
    ) -> AuditEvent:
        """
        Log an audit event.

        Args:
            event_type: Type of audit event
            agent_id: Agent ID if applicable
            action_id: Action ID if applicable
            project_id: Project ID if applicable
            session_id: Session ID if applicable
            user_id: User ID if applicable
            decision: Safety decision if applicable
            details: Additional event details
            correlation_id: Correlation ID for tracing

        Returns:
            The created audit event
        """
        # Sanitize sensitive data if needed
        sanitized_details = self._sanitize_details(details) if details else {}

        # NOTE: datetime.utcnow() is naive and deprecated since Python 3.12.
        # Switching to aware timestamps would change hash inputs and break
        # comparisons with callers' naive datetimes, so it is kept for now.
        event = AuditEvent(
            id=str(uuid4()),
            event_type=event_type,
            timestamp=datetime.utcnow(),
            agent_id=agent_id,
            action_id=action_id,
            project_id=project_id,
            session_id=session_id,
            user_id=user_id,
            decision=decision,
            details=sanitized_details,
            correlation_id=correlation_id,
        )

        async with self._lock:
            # Add hash chain for tamper detection.  The hash covers the
            # event's stable fields plus the predecessor hash; both values
            # are recorded in the details dict so verify_integrity() can
            # replay the chain later.  The predecessor hash is passed
            # explicitly so verification can recompute it deterministically.
            # NOTE(review): this relies on AuditEvent keeping a reference to
            # sanitized_details (not a copy) -- confirm against the model.
            if self._enable_hash_chain:
                event_hash = self._compute_hash(event, self._last_hash)
                sanitized_details["_hash"] = event_hash
                sanitized_details["_prev_hash"] = self._last_hash
                self._last_hash = event_hash

            self._buffer.append(event)

        # Notify handlers
        await self._notify_handlers(event)

        # Log to standard logger as well
        self._log_to_logger(event)

        return event

    async def log_action_request(
        self,
        action: ActionRequest,
        decision: SafetyDecision,
        reasons: list[str] | None = None,
    ) -> AuditEvent:
        """Log an action request with its validation decision."""
        # DENY maps to its own event type; every other decision is recorded
        # as a (possibly conditional) validation.
        event_type = (
            AuditEventType.ACTION_DENIED
            if decision == SafetyDecision.DENY
            else AuditEventType.ACTION_VALIDATED
        )

        return await self.log(
            event_type,
            agent_id=action.metadata.agent_id,
            action_id=action.id,
            project_id=action.metadata.project_id,
            session_id=action.metadata.session_id,
            user_id=action.metadata.user_id,
            decision=decision,
            details={
                "action_type": action.action_type.value,
                "tool_name": action.tool_name,
                "resource": action.resource,
                "is_destructive": action.is_destructive,
                "reasons": reasons or [],
            },
            correlation_id=action.metadata.correlation_id,
        )

    async def log_action_executed(
        self,
        action: ActionRequest,
        success: bool,
        execution_time_ms: float,
        error: str | None = None,
    ) -> AuditEvent:
        """Log an action execution result."""
        event_type = (
            AuditEventType.ACTION_EXECUTED if success else AuditEventType.ACTION_FAILED
        )

        return await self.log(
            event_type,
            agent_id=action.metadata.agent_id,
            action_id=action.id,
            project_id=action.metadata.project_id,
            session_id=action.metadata.session_id,
            decision=SafetyDecision.ALLOW if success else SafetyDecision.DENY,
            details={
                "action_type": action.action_type.value,
                "tool_name": action.tool_name,
                "success": success,
                "execution_time_ms": execution_time_ms,
                "error": error,
            },
            correlation_id=action.metadata.correlation_id,
        )

    async def log_approval_event(
        self,
        event_type: AuditEventType,
        approval_id: str,
        action: ActionRequest,
        decided_by: str | None = None,
        reason: str | None = None,
    ) -> AuditEvent:
        """Log an approval-related event (requested/granted/denied/expired)."""
        return await self.log(
            event_type,
            agent_id=action.metadata.agent_id,
            action_id=action.id,
            project_id=action.metadata.project_id,
            session_id=action.metadata.session_id,
            user_id=decided_by,
            details={
                "approval_id": approval_id,
                "action_type": action.action_type.value,
                "tool_name": action.tool_name,
                "decided_by": decided_by,
                "reason": reason,
            },
            correlation_id=action.metadata.correlation_id,
        )

    async def log_budget_event(
        self,
        event_type: AuditEventType,
        agent_id: str,
        scope: str,
        current_usage: float,
        limit: float,
        unit: str = "tokens",
    ) -> AuditEvent:
        """Log a budget-related event (usage snapshot included in details)."""
        return await self.log(
            event_type,
            agent_id=agent_id,
            details={
                "scope": scope,
                "current_usage": current_usage,
                "limit": limit,
                "unit": unit,
                # Guard against division by zero for unlimited (0) budgets.
                "usage_percent": (current_usage / limit * 100) if limit > 0 else 0,
            },
        )

    async def log_emergency_stop(
        self,
        stop_type: str,
        triggered_by: str,
        reason: str,
        affected_agents: list[str] | None = None,
    ) -> AuditEvent:
        """Log an emergency stop event."""
        return await self.log(
            AuditEventType.EMERGENCY_STOP,
            user_id=triggered_by,
            details={
                "stop_type": stop_type,
                "triggered_by": triggered_by,
                "reason": reason,
                "affected_agents": affected_agents or [],
            },
        )

    async def flush(self) -> int:
        """
        Flush buffered events to persistent storage.

        Returns:
            Number of events flushed
        """
        async with self._lock:
            if not self._buffer:
                return 0

            events = list(self._buffer)
            self._buffer.clear()

            # Persist events (in production, this would go to database/storage)
            self._persisted.extend(events)

            # Enforce retention
            self._enforce_retention()

            logger.debug("Flushed %d audit events", len(events))
            return len(events)

    async def query(
        self,
        *,
        event_types: list[AuditEventType] | None = None,
        agent_id: str | None = None,
        action_id: str | None = None,
        project_id: str | None = None,
        session_id: str | None = None,
        user_id: str | None = None,
        start_time: datetime | None = None,
        end_time: datetime | None = None,
        correlation_id: str | None = None,
        limit: int = 100,
        offset: int = 0,
    ) -> list[AuditEvent]:
        """
        Query audit events with filters.

        Args:
            event_types: Filter by event types
            agent_id: Filter by agent ID
            action_id: Filter by action ID
            project_id: Filter by project ID
            session_id: Filter by session ID
            user_id: Filter by user ID
            start_time: Filter events after this time
            end_time: Filter events before this time
            correlation_id: Filter by correlation ID
            limit: Maximum results to return
            offset: Result offset for pagination

        Returns:
            List of matching audit events
        """
        # Combine buffer and persisted for query.  Snapshots are taken
        # without the lock: there is no await between the two list() calls,
        # so under a single event loop no concurrent mutation can interleave.
        all_events = list(self._persisted) + list(self._buffer)

        results = []
        for event in all_events:
            # Each filter is applied only when the caller supplied it.
            if event_types and event.event_type not in event_types:
                continue
            if agent_id and event.agent_id != agent_id:
                continue
            if action_id and event.action_id != action_id:
                continue
            if project_id and event.project_id != project_id:
                continue
            if session_id and event.session_id != session_id:
                continue
            if user_id and event.user_id != user_id:
                continue
            if start_time and event.timestamp < start_time:
                continue
            if end_time and event.timestamp > end_time:
                continue
            if correlation_id and event.correlation_id != correlation_id:
                continue

            results.append(event)

        # Sort by timestamp descending
        results.sort(key=lambda e: e.timestamp, reverse=True)

        # Apply pagination
        return results[offset : offset + limit]

    async def get_action_history(
        self,
        agent_id: str,
        limit: int = 100,
    ) -> list[AuditEvent]:
        """Get action history for an agent."""
        return await self.query(
            agent_id=agent_id,
            event_types=[
                AuditEventType.ACTION_REQUESTED,
                AuditEventType.ACTION_VALIDATED,
                AuditEventType.ACTION_DENIED,
                AuditEventType.ACTION_EXECUTED,
                AuditEventType.ACTION_FAILED,
            ],
            limit=limit,
        )

    async def verify_integrity(self) -> tuple[bool, list[str]]:
        """
        Verify audit log integrity using hash chain.

        Returns:
            Tuple of (is_valid, list of issues found)
        """
        if not self._enable_hash_chain:
            return True, []

        issues: list[str] = []
        all_events = list(self._persisted) + list(self._buffer)

        prev_hash: str | None = None
        for event in sorted(all_events, key=lambda e: e.timestamp):
            stored_prev = event.details.get("_prev_hash")
            stored_hash = event.details.get("_hash")

            if stored_prev != prev_hash:
                issues.append(
                    f"Hash chain broken at event {event.id}: "
                    f"expected prev_hash={prev_hash}, got {stored_prev}"
                )

            if stored_hash:
                # BUG FIX: recompute with the event's *stored* predecessor
                # hash so the result matches what was computed at log time.
                # Previously _compute_hash mixed in the logger's current
                # _last_hash, which flagged every older event as tampered.
                computed = self._compute_hash(event, stored_prev)
                if computed != stored_hash:
                    issues.append(
                        f"Hash mismatch at event {event.id}: "
                        f"expected {computed}, got {stored_hash}"
                    )

            prev_hash = stored_hash

        return len(issues) == 0, issues

    def add_handler(self, handler: Any) -> None:
        """Add a real-time event handler."""
        self._handlers.append(handler)

    def remove_handler(self, handler: Any) -> None:
        """Remove an event handler."""
        if handler in self._handlers:
            self._handlers.remove(handler)

    def _sanitize_details(self, details: dict[str, Any]) -> dict[str, Any]:
        """Sanitize sensitive data from details (recurses into nested dicts)."""
        if self._include_sensitive:
            return details

        sanitized: dict[str, Any] = {}
        # Substring match against lowercased keys, so e.g. "api_key" and
        # "API_KEY_V2" are both redacted.
        sensitive_keys = {
            "password",
            "secret",
            "token",
            "api_key",
            "apikey",
            "auth",
            "credential",
        }

        for key, value in details.items():
            lower_key = key.lower()
            if any(s in lower_key for s in sensitive_keys):
                sanitized[key] = "[REDACTED]"
            elif isinstance(value, dict):
                sanitized[key] = self._sanitize_details(value)
            else:
                sanitized[key] = value

        return sanitized

    def _compute_hash(self, event: AuditEvent, prev_hash: str | None = None) -> str:
        """
        Compute the SHA-256 hash for an event (excluding hash fields).

        Args:
            event: Event whose stable fields are hashed.
            prev_hash: Hash of the preceding chain event, if any.  Passed
                explicitly (instead of reading self._last_hash) so that
                verify_integrity() can deterministically recompute the hash
                of historical events.
        """
        data = {
            "id": event.id,
            "event_type": event.event_type.value,
            "timestamp": event.timestamp.isoformat(),
            "agent_id": event.agent_id,
            "action_id": event.action_id,
            "project_id": event.project_id,
            "session_id": event.session_id,
            "user_id": event.user_id,
            "decision": event.decision.value if event.decision else None,
            # Underscore-prefixed keys (_hash/_prev_hash) are bookkeeping
            # and must not feed back into the hash.
            "details": {
                k: v for k, v in event.details.items() if not k.startswith("_")
            },
            "correlation_id": event.correlation_id,
        }

        if prev_hash:
            data["_prev_hash"] = prev_hash

        # sort_keys makes serialization canonical; default=str covers
        # non-JSON-native values deterministically.
        serialized = json.dumps(data, sort_keys=True, default=str)
        return hashlib.sha256(serialized.encode()).hexdigest()

    def _log_to_logger(self, event: AuditEvent) -> None:
        """Log event to standard Python logger."""
        log_data = {
            "audit_event": event.event_type.value,
            "event_id": event.id,
            "agent_id": event.agent_id,
            "action_id": event.action_id,
            "decision": event.decision.value if event.decision else None,
        }

        # Use appropriate log level based on event type
        if event.event_type in {
            AuditEventType.ACTION_DENIED,
            AuditEventType.POLICY_VIOLATION,
            AuditEventType.EMERGENCY_STOP,
        }:
            logger.warning("Audit: %s", log_data)
        elif event.event_type in {
            AuditEventType.ACTION_FAILED,
            AuditEventType.ROLLBACK_FAILED,
        }:
            logger.error("Audit: %s", log_data)
        else:
            logger.info("Audit: %s", log_data)

    def _enforce_retention(self) -> None:
        """Enforce retention policy on persisted events."""
        if not self._retention_days:
            return

        cutoff = datetime.utcnow() - timedelta(days=self._retention_days)
        before_count = len(self._persisted)

        self._persisted = [e for e in self._persisted if e.timestamp >= cutoff]

        removed = before_count - len(self._persisted)
        if removed > 0:
            logger.info("Removed %d expired audit events", removed)

    async def _periodic_flush(self) -> None:
        """Background task for periodic flushing."""
        while self._running:
            try:
                await asyncio.sleep(self._flush_interval)
                await self.flush()
            except asyncio.CancelledError:
                break
            except Exception as e:
                # Keep the loop alive on transient errors; only log them.
                logger.error("Error in periodic audit flush: %s", e)

    async def _notify_handlers(self, event: AuditEvent) -> None:
        """Notify all registered handlers of a new event (sync or async)."""
        for handler in self._handlers:
            try:
                if asyncio.iscoroutinefunction(handler):
                    await handler(event)
                else:
                    handler(event)
            except Exception as e:
                # A failing handler must not block logging or other handlers.
                logger.error("Error in audit event handler: %s", e)
|
||||||
|
|
||||||
|
|
||||||
|
# Singleton instance
|
||||||
|
_audit_logger: AuditLogger | None = None
|
||||||
|
_audit_lock = asyncio.Lock()
|
||||||
|
|
||||||
|
|
||||||
|
async def get_audit_logger() -> AuditLogger:
    """Get the global audit logger instance.

    Lazily creates and starts the singleton on first use; the async lock
    serialises concurrent first calls so only one instance is ever started.

    Returns:
        The started, process-wide AuditLogger.
    """
    global _audit_logger

    async with _audit_lock:
        if _audit_logger is None:
            # Create first, then start the background flush task.
            _audit_logger = AuditLogger()
            await _audit_logger.start()

    return _audit_logger
|
||||||
|
|
||||||
|
|
||||||
|
async def shutdown_audit_logger() -> None:
    """Shutdown the global audit logger.

    Stops the background flush task (which performs a final flush) and
    clears the singleton so a later get_audit_logger() call creates a
    fresh instance.  No-op if the logger was never created.
    """
    global _audit_logger

    async with _audit_lock:
        if _audit_logger is not None:
            await _audit_logger.stop()
            _audit_logger = None
|
||||||
|
|
||||||
|
|
||||||
|
def reset_audit_logger() -> None:
    """Reset the audit logger (for testing).

    Unlike shutdown_audit_logger(), this is synchronous and performs no
    final flush; it cancels the periodic flush task (if any) before
    clearing the singleton.
    """
    global _audit_logger
    if _audit_logger is not None:
        # BUG FIX: previously the reference was simply dropped, leaving the
        # periodic flush task running forever against the orphaned instance.
        _audit_logger._running = False
        if _audit_logger._flush_task is not None:
            _audit_logger._flush_task.cancel()
    _audit_logger = None
|
||||||
304
backend/app/services/safety/config.py
Normal file
304
backend/app/services/safety/config.py
Normal file
@@ -0,0 +1,304 @@
|
|||||||
|
"""
|
||||||
|
Safety Framework Configuration
|
||||||
|
|
||||||
|
Pydantic settings for the safety and guardrails framework.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
from functools import lru_cache
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import yaml
|
||||||
|
from pydantic import Field
|
||||||
|
from pydantic_settings import BaseSettings, SettingsConfigDict
|
||||||
|
|
||||||
|
from .models import AutonomyLevel, SafetyPolicy
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class SafetyConfig(BaseSettings):
    """Configuration for the safety framework.

    Values are read from the environment (prefix ``SAFETY_``) and/or a
    ``.env`` file; unknown keys are ignored.  Obtain the cached singleton
    via ``get_safety_config()`` rather than instantiating directly.
    """

    model_config = SettingsConfigDict(
        env_prefix="SAFETY_",
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # General settings
    enabled: bool = Field(True, description="Enable safety framework")
    strict_mode: bool = Field(True, description="Strict mode (fail closed on errors)")
    log_level: str = Field("INFO", description="Logging level")

    # Default autonomy level
    default_autonomy_level: AutonomyLevel = Field(
        AutonomyLevel.MILESTONE,
        description="Default autonomy level for new agents",
    )

    # Default budget limits (tokens and USD; consumed by get_default_policy)
    default_session_token_budget: int = Field(
        100_000, description="Default tokens per session"
    )
    default_daily_token_budget: int = Field(
        1_000_000, description="Default tokens per day"
    )
    default_session_cost_limit: float = Field(
        10.0, description="Default USD per session"
    )
    default_daily_cost_limit: float = Field(100.0, description="Default USD per day")

    # Default rate limits
    default_actions_per_minute: int = Field(60, description="Default actions per min")
    default_llm_calls_per_minute: int = Field(20, description="Default LLM calls/min")
    default_file_ops_per_minute: int = Field(100, description="Default file ops/min")

    # Loop detection
    loop_detection_enabled: bool = Field(True, description="Enable loop detection")
    max_repeated_actions: int = Field(5, description="Max exact repetitions")
    max_similar_actions: int = Field(10, description="Max similar actions")
    loop_history_size: int = Field(100, description="Action history size for loops")

    # HITL (human-in-the-loop) settings
    hitl_enabled: bool = Field(True, description="Enable human-in-the-loop")
    hitl_default_timeout: int = Field(300, description="Default approval timeout (s)")
    hitl_notification_channels: list[str] = Field(
        default_factory=list, description="Notification channels"
    )

    # Rollback settings
    rollback_enabled: bool = Field(True, description="Enable rollback capability")
    checkpoint_dir: str = Field(
        "/tmp/syndarix_checkpoints",  # noqa: S108
        description="Directory for checkpoint storage",
    )
    checkpoint_retention_hours: int = Field(24, description="Checkpoint retention")
    auto_checkpoint_destructive: bool = Field(
        True, description="Auto-checkpoint destructive actions"
    )

    # Sandbox settings
    sandbox_enabled: bool = Field(False, description="Enable sandbox execution")
    sandbox_timeout: int = Field(300, description="Sandbox timeout (s)")
    sandbox_memory_mb: int = Field(1024, description="Sandbox memory limit (MB)")
    sandbox_cpu_limit: float = Field(1.0, description="Sandbox CPU limit")
    sandbox_network_enabled: bool = Field(False, description="Allow sandbox network")

    # Audit settings (consumed by AuditLogger at construction time)
    audit_enabled: bool = Field(True, description="Enable audit logging")
    audit_retention_days: int = Field(90, description="Audit log retention (days)")
    audit_include_sensitive: bool = Field(
        False, description="Include sensitive data in audit"
    )

    # Content filtering
    content_filter_enabled: bool = Field(True, description="Enable content filtering")
    filter_pii: bool = Field(True, description="Filter PII")
    filter_secrets: bool = Field(True, description="Filter secrets")

    # Emergency controls
    emergency_stop_enabled: bool = Field(True, description="Enable emergency stop")
    emergency_webhook_url: str | None = Field(None, description="Emergency webhook")

    # Policy file path
    policy_file: str | None = Field(None, description="Path to policy YAML file")

    # Validation cache
    validation_cache_ttl: int = Field(60, description="Validation cache TTL (s)")
    validation_cache_size: int = Field(1000, description="Validation cache size")
|
||||||
|
|
||||||
|
|
||||||
|
class AutonomyConfig(BaseSettings):
    """Configuration for autonomy levels.

    Per-level knobs consumed by ``get_policy_for_autonomy_level()``.
    Values are read from the environment (prefix ``AUTONOMY_``) and/or a
    ``.env`` file; obtain the cached singleton via ``get_autonomy_config()``.
    """

    model_config = SettingsConfigDict(
        env_prefix="AUTONOMY_",
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # FULL_CONTROL settings (most restrictive level)
    full_control_cost_limit: float = Field(1.0, description="USD limit per session")
    full_control_require_all_approval: bool = Field(
        True, description="Require approval for all"
    )
    full_control_block_destructive: bool = Field(
        True, description="Block destructive actions"
    )

    # MILESTONE settings (middle ground: approval at milestones only)
    milestone_cost_limit: float = Field(10.0, description="USD limit per session")
    milestone_require_critical_approval: bool = Field(
        True, description="Require approval for critical"
    )
    milestone_auto_checkpoint: bool = Field(
        True, description="Auto-checkpoint destructive"
    )

    # AUTONOMOUS settings (least restrictive level)
    autonomous_cost_limit: float = Field(100.0, description="USD limit per session")
    autonomous_auto_approve_normal: bool = Field(
        True, description="Auto-approve normal actions"
    )
    autonomous_auto_checkpoint: bool = Field(True, description="Auto-checkpoint all")
|
||||||
|
|
||||||
|
|
||||||
|
def _expand_env_vars(value: Any) -> Any:
|
||||||
|
"""Recursively expand environment variables in values."""
|
||||||
|
if isinstance(value, str):
|
||||||
|
return os.path.expandvars(value)
|
||||||
|
elif isinstance(value, dict):
|
||||||
|
return {k: _expand_env_vars(v) for k, v in value.items()}
|
||||||
|
elif isinstance(value, list):
|
||||||
|
return [_expand_env_vars(v) for v in value]
|
||||||
|
return value
|
||||||
|
|
||||||
|
|
||||||
|
def load_policy_from_file(file_path: str | Path) -> SafetyPolicy | None:
    """Load a safety policy from a YAML file.

    Returns None (after logging) when the file is missing, empty, or
    fails to parse/validate -- a bad policy file never raises.
    """
    path = Path(file_path)
    if not path.exists():
        logger.warning("Policy file not found: %s", path)
        return None

    try:
        with open(path) as f:
            raw = yaml.safe_load(f)

        if raw is None:
            logger.warning("Empty policy file: %s", path)
            return None

        # Expand environment variables, then validate through the model.
        expanded = _expand_env_vars(raw)

        return SafetyPolicy(**expanded)

    except Exception as e:
        # Fail soft: a malformed policy file must not take the service down.
        logger.error("Failed to load policy file %s: %s", path, e)
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def load_policies_from_directory(directory: str | Path) -> dict[str, SafetyPolicy]:
    """Load all safety policies from a directory, keyed by policy name."""
    loaded: dict[str, SafetyPolicy] = {}
    root = Path(directory)

    # A missing or non-directory path is not fatal: warn and return nothing.
    if not (root.exists() and root.is_dir()):
        logger.warning("Policy directory not found: %s", root)
        return loaded

    # NOTE(review): only the .yaml suffix is scanned; .yml files are
    # ignored -- confirm this is intended.
    for candidate in root.glob("*.yaml"):
        parsed = load_policy_from_file(candidate)
        if parsed:
            loaded[parsed.name] = parsed
            logger.info("Loaded policy: %s from %s", parsed.name, candidate.name)

    return loaded
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def get_safety_config() -> SafetyConfig:
    """Return the safety configuration (lazily built, cached singleton).

    The lru_cache makes this a one-slot singleton; reset_config_cache()
    clears it for tests.
    """
    settings = SafetyConfig()
    return settings
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def get_autonomy_config() -> AutonomyConfig:
    """Return the autonomy configuration (lazily built, cached singleton)."""
    settings = AutonomyConfig()
    return settings
|
||||||
|
|
||||||
|
|
||||||
|
def get_default_policy() -> SafetyPolicy:
    """Build the baseline safety policy from the global safety settings.

    Budget, rate-limit, loop-detection and sandbox knobs all come straight
    from SafetyConfig, so environment overrides flow through automatically.
    """
    cfg = get_safety_config()

    return SafetyPolicy(
        name="default",
        description="Default safety policy",
        max_tokens_per_session=cfg.default_session_token_budget,
        max_tokens_per_day=cfg.default_daily_token_budget,
        max_cost_per_session_usd=cfg.default_session_cost_limit,
        max_cost_per_day_usd=cfg.default_daily_cost_limit,
        max_actions_per_minute=cfg.default_actions_per_minute,
        max_llm_calls_per_minute=cfg.default_llm_calls_per_minute,
        max_file_operations_per_minute=cfg.default_file_ops_per_minute,
        max_repeated_actions=cfg.max_repeated_actions,
        max_similar_actions=cfg.max_similar_actions,
        require_sandbox=cfg.sandbox_enabled,
        sandbox_timeout_seconds=cfg.sandbox_timeout,
        sandbox_memory_mb=cfg.sandbox_memory_mb,
    )
|
||||||
|
|
||||||
|
|
||||||
|
def get_policy_for_autonomy_level(level: AutonomyLevel) -> SafetyPolicy:
    """Get the safety policy for a given autonomy level.

    FULL_CONTROL and MILESTONE each get a dedicated profile; any other
    level falls through to the AUTONOMOUS profile.  All limits are derived
    from the default policy and the AutonomyConfig knobs.
    """
    levels_cfg = get_autonomy_config()

    baseline = get_default_policy()

    if level == AutonomyLevel.FULL_CONTROL:
        # Tightest profile: tiny budgets, halved rate limits, everything
        # gated behind approval, destructive tooling denied outright.
        return SafetyPolicy(
            name="full_control",
            description="Full control mode - all actions require approval",
            max_cost_per_session_usd=levels_cfg.full_control_cost_limit,
            max_cost_per_day_usd=levels_cfg.full_control_cost_limit * 10,
            require_approval_for=["*"],  # All actions
            max_tokens_per_session=baseline.max_tokens_per_session // 10,
            max_tokens_per_day=baseline.max_tokens_per_day // 10,
            max_actions_per_minute=baseline.max_actions_per_minute // 2,
            max_llm_calls_per_minute=baseline.max_llm_calls_per_minute // 2,
            max_file_operations_per_minute=(
                baseline.max_file_operations_per_minute // 2
            ),
            denied_tools=["delete_*", "destroy_*", "drop_*"],
        )

    if level == AutonomyLevel.MILESTONE:
        # Middle ground: baseline limits, approval only for high-impact
        # operations.
        return SafetyPolicy(
            name="milestone",
            description="Milestone mode - approval at milestones only",
            max_cost_per_session_usd=levels_cfg.milestone_cost_limit,
            max_cost_per_day_usd=levels_cfg.milestone_cost_limit * 10,
            require_approval_for=[
                "delete_file",
                "push_to_remote",
                "deploy_*",
                "modify_critical_*",
                "create_pull_request",
            ],
            max_tokens_per_session=baseline.max_tokens_per_session,
            max_tokens_per_day=baseline.max_tokens_per_day,
            max_actions_per_minute=baseline.max_actions_per_minute,
            max_llm_calls_per_minute=baseline.max_llm_calls_per_minute,
            max_file_operations_per_minute=baseline.max_file_operations_per_minute,
        )

    # AUTONOMOUS: widest budgets and doubled rate limits; approval only for
    # the few genuinely irreversible operations.
    return SafetyPolicy(
        name="autonomous",
        description="Autonomous mode - minimal intervention",
        max_cost_per_session_usd=levels_cfg.autonomous_cost_limit,
        max_cost_per_day_usd=levels_cfg.autonomous_cost_limit * 10,
        require_approval_for=[
            "deploy_to_production",
            "delete_repository",
            "modify_production_config",
        ],
        max_tokens_per_session=baseline.max_tokens_per_session * 5,
        max_tokens_per_day=baseline.max_tokens_per_day * 5,
        max_actions_per_minute=baseline.max_actions_per_minute * 2,
        max_llm_calls_per_minute=baseline.max_llm_calls_per_minute * 2,
        max_file_operations_per_minute=(
            baseline.max_file_operations_per_minute * 2
        ),
    )
|
||||||
|
|
||||||
|
|
||||||
|
def reset_config_cache() -> None:
    """Clear both cached configuration loaders (intended for test isolation)."""
    for cached_loader in (get_safety_config, get_autonomy_config):
        cached_loader.cache_clear()
|
||||||
23
backend/app/services/safety/content/__init__.py
Normal file
23
backend/app/services/safety/content/__init__.py
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
"""Content filtering for safety."""
|
||||||
|
|
||||||
|
from .filter import (
|
||||||
|
ContentCategory,
|
||||||
|
ContentFilter,
|
||||||
|
FilterAction,
|
||||||
|
FilterMatch,
|
||||||
|
FilterPattern,
|
||||||
|
FilterResult,
|
||||||
|
filter_content,
|
||||||
|
scan_for_secrets,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"ContentCategory",
|
||||||
|
"ContentFilter",
|
||||||
|
"FilterAction",
|
||||||
|
"FilterMatch",
|
||||||
|
"FilterPattern",
|
||||||
|
"FilterResult",
|
||||||
|
"filter_content",
|
||||||
|
"scan_for_secrets",
|
||||||
|
]
|
||||||
550
backend/app/services/safety/content/filter.py
Normal file
550
backend/app/services/safety/content/filter.py
Normal file
@@ -0,0 +1,550 @@
|
|||||||
|
"""
|
||||||
|
Content Filter
|
||||||
|
|
||||||
|
Filters and sanitizes content for safety, including PII detection and secret scanning.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from dataclasses import dataclass, field, replace
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any, ClassVar
|
||||||
|
|
||||||
|
from ..exceptions import ContentFilterError
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class ContentCategory(str, Enum):
    """Categories of sensitive content.

    Subclasses ``str`` so members compare equal to their string values and
    serialize cleanly (e.g. in JSON payloads and log output).
    """

    PII = "pii"  # Personally identifiable information (emails, phones, SSNs)
    SECRETS = "secrets"  # API keys, tokens, private keys
    CREDENTIALS = "credentials"  # Passwords and embedded login credentials
    FINANCIAL = "financial"  # Credit cards and other financial identifiers
    HEALTH = "health"  # Health-related data (no built-in default patterns)
    PROFANITY = "profanity"  # Offensive language (no built-in default patterns)
    INJECTION = "injection"  # SQL / shell command injection payloads
    CUSTOM = "custom"  # Category for caller-supplied custom patterns
|
||||||
|
|
||||||
|
|
||||||
|
class FilterAction(str, Enum):
    """Actions to take on detected content."""

    ALLOW = "allow"  # Record nothing; let the content through untouched
    REDACT = "redact"  # Replace the matched span with the pattern's replacement
    BLOCK = "block"  # Reject the content entirely (filtered output is emptied)
    WARN = "warn"  # Let the content through but attach a warning message
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class FilterMatch:
    """A match found by a filter.

    Records where a pattern matched within the scanned content and what
    replacement text (if any) should stand in for it.
    """

    category: ContentCategory  # Which kind of sensitive content matched
    pattern_name: str  # Name of the FilterPattern that produced this match
    matched_text: str  # The exact substring that matched
    start_pos: int  # Start offset within the original content
    end_pos: int  # End offset (exclusive) within the original content
    confidence: float = 1.0  # Pattern confidence; < 1.0 flags possible false positives
    redacted_text: str | None = None  # Replacement text spliced in when redacting
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class FilterResult:
    """Outcome of running content through a ContentFilter.

    Carries the input exactly as given, the (possibly redacted) output,
    every pattern match that fired, and the block/warning bookkeeping.
    """

    original_content: str
    filtered_content: str
    matches: list[FilterMatch] = field(default_factory=list)
    blocked: bool = False
    block_reason: str | None = None
    warnings: list[str] = field(default_factory=list)

    @property
    def has_sensitive_content(self) -> bool:
        """Return True when at least one pattern matched the content."""
        return bool(self.matches)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class FilterPattern:
    """A named regex rule for detecting one kind of sensitive content."""

    name: str
    category: ContentCategory
    pattern: str  # Regex pattern (compiled once in __post_init__)
    action: FilterAction = FilterAction.REDACT
    replacement: str = "[REDACTED]"
    confidence: float = 1.0
    enabled: bool = True

    def __post_init__(self) -> None:
        """Pre-compile the regex once at construction time.

        NOTE(review): IGNORECASE is applied to every pattern, including
        case-sensitive token formats (e.g. AWS key prefixes) — confirm
        that is intentional.
        """
        self._compiled = re.compile(self.pattern, re.IGNORECASE | re.MULTILINE)

    def find_matches(self, content: str) -> list[FilterMatch]:
        """Return a FilterMatch for every occurrence of this pattern in *content*."""
        return [
            FilterMatch(
                category=self.category,
                pattern_name=self.name,
                matched_text=hit.group(),
                start_pos=hit.start(),
                end_pos=hit.end(),
                confidence=self.confidence,
                redacted_text=self.replacement,
            )
            for hit in self._compiled.finditer(content)
        ]
|
||||||
|
|
||||||
|
|
||||||
|
class ContentFilter:
    """
    Filters content for sensitive information.

    Features:
    - PII detection (emails, phones, SSN, etc.)
    - Secret scanning (API keys, tokens, passwords)
    - Credential detection
    - Injection attack prevention
    - Custom pattern support
    - Configurable actions (allow, redact, block, warn)
    """

    # Default patterns for common sensitive data
    DEFAULT_PATTERNS: ClassVar[list[FilterPattern]] = [
        # PII Patterns
        FilterPattern(
            name="email",
            category=ContentCategory.PII,
            pattern=r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b",
            action=FilterAction.REDACT,
            replacement="[EMAIL]",
        ),
        FilterPattern(
            name="phone_us",
            category=ContentCategory.PII,
            pattern=r"\b(?:\+1[-.\s]?)?(?:\(?\d{3}\)?[-.\s]?)?\d{3}[-.\s]?\d{4}\b",
            action=FilterAction.REDACT,
            replacement="[PHONE]",
        ),
        FilterPattern(
            name="ssn",
            category=ContentCategory.PII,
            pattern=r"\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b",
            action=FilterAction.REDACT,
            replacement="[SSN]",
        ),
        FilterPattern(
            name="credit_card",
            category=ContentCategory.FINANCIAL,
            pattern=r"\b(?:\d{4}[-\s]?){3}\d{4}\b",
            action=FilterAction.REDACT,
            replacement="[CREDIT_CARD]",
        ),
        FilterPattern(
            name="ip_address",
            category=ContentCategory.PII,
            pattern=r"\b(?:\d{1,3}\.){3}\d{1,3}\b",
            action=FilterAction.WARN,
            replacement="[IP]",
            confidence=0.8,
        ),
        # Secret Patterns
        FilterPattern(
            name="api_key_generic",
            category=ContentCategory.SECRETS,
            pattern=r"\b(?:api[_-]?key|apikey)\s*[:=]\s*['\"]?([A-Za-z0-9_-]{20,})['\"]?",
            action=FilterAction.BLOCK,
            replacement="[API_KEY]",
        ),
        FilterPattern(
            name="aws_access_key",
            category=ContentCategory.SECRETS,
            pattern=r"\bAKIA[0-9A-Z]{16}\b",
            action=FilterAction.BLOCK,
            replacement="[AWS_KEY]",
        ),
        FilterPattern(
            name="aws_secret_key",
            category=ContentCategory.SECRETS,
            pattern=r"\b[A-Za-z0-9/+=]{40}\b",
            action=FilterAction.WARN,
            replacement="[AWS_SECRET]",
            confidence=0.6,  # Lower confidence - might be false positive
        ),
        FilterPattern(
            name="github_token",
            category=ContentCategory.SECRETS,
            pattern=r"\b(ghp|gho|ghu|ghs|ghr)_[A-Za-z0-9]{36,}\b",
            action=FilterAction.BLOCK,
            replacement="[GITHUB_TOKEN]",
        ),
        FilterPattern(
            name="jwt_token",
            category=ContentCategory.SECRETS,
            pattern=r"\beyJ[A-Za-z0-9_-]*\.eyJ[A-Za-z0-9_-]*\.[A-Za-z0-9_-]*\b",
            action=FilterAction.BLOCK,
            replacement="[JWT]",
        ),
        # Credential Patterns
        FilterPattern(
            name="password_in_url",
            category=ContentCategory.CREDENTIALS,
            pattern=r"://[^:]+:([^@]+)@",
            action=FilterAction.BLOCK,
            replacement="://[REDACTED]@",
        ),
        FilterPattern(
            name="password_assignment",
            category=ContentCategory.CREDENTIALS,
            pattern=r"\b(?:password|passwd|pwd)\s*[:=]\s*['\"]?([^\s'\"]+)['\"]?",
            action=FilterAction.REDACT,
            replacement="[PASSWORD]",
        ),
        FilterPattern(
            name="private_key",
            category=ContentCategory.SECRETS,
            pattern=r"-----BEGIN (?:RSA |DSA |EC |OPENSSH )?PRIVATE KEY-----",
            action=FilterAction.BLOCK,
            replacement="[PRIVATE_KEY]",
        ),
        # Injection Patterns
        FilterPattern(
            name="sql_injection",
            category=ContentCategory.INJECTION,
            pattern=r"(?:'\s*(?:OR|AND)\s*')|(?:--\s*$)|(?:;\s*(?:DROP|DELETE|UPDATE|INSERT))",
            action=FilterAction.BLOCK,
            replacement="[BLOCKED]",
        ),
        FilterPattern(
            name="command_injection",
            category=ContentCategory.INJECTION,
            pattern=r"[;&|`$]|\$\(|\$\{",
            action=FilterAction.WARN,
            replacement="[CMD]",
            confidence=0.5,  # Low confidence - common in code
        ),
    ]

    def __init__(
        self,
        enable_pii_filter: bool = True,
        enable_secret_filter: bool = True,
        enable_injection_filter: bool = True,
        custom_patterns: list[FilterPattern] | None = None,
        default_action: FilterAction = FilterAction.REDACT,
    ) -> None:
        """
        Initialize the ContentFilter.

        Args:
            enable_pii_filter: Enable PII detection
            enable_secret_filter: Enable secret scanning
            enable_injection_filter: Enable injection detection
            custom_patterns: Additional custom patterns
            default_action: Default action for matches
        """
        self._patterns: list[FilterPattern] = []
        self._default_action = default_action
        # NOTE(review): this lock is created but never acquired by any
        # method in this class — confirm whether mutation methods
        # (add/remove/enable_pattern) were meant to use it.
        self._lock = asyncio.Lock()

        # Load default patterns based on configuration
        # Use replace() to create a copy of each pattern to avoid mutating shared defaults
        # (enable_secret_filter gates both SECRETS and CREDENTIALS patterns).
        for pattern in self.DEFAULT_PATTERNS:
            if pattern.category == ContentCategory.PII and not enable_pii_filter:
                continue
            if pattern.category == ContentCategory.SECRETS and not enable_secret_filter:
                continue
            if (
                pattern.category == ContentCategory.CREDENTIALS
                and not enable_secret_filter
            ):
                continue
            if (
                pattern.category == ContentCategory.INJECTION
                and not enable_injection_filter
            ):
                continue
            self._patterns.append(replace(pattern))

        # Add custom patterns
        # (custom patterns are added as-is, regardless of the enable_* flags).
        if custom_patterns:
            self._patterns.extend(custom_patterns)

        logger.info("ContentFilter initialized with %d patterns", len(self._patterns))

    def add_pattern(self, pattern: FilterPattern) -> None:
        """Add a custom pattern."""
        self._patterns.append(pattern)
        logger.debug("Added pattern: %s", pattern.name)

    def remove_pattern(self, pattern_name: str) -> bool:
        """Remove a pattern by name.

        Returns True if a pattern was removed, False if no pattern with
        that name exists. Only the first pattern with the name is removed.
        """
        for i, pattern in enumerate(self._patterns):
            if pattern.name == pattern_name:
                del self._patterns[i]
                logger.debug("Removed pattern: %s", pattern_name)
                return True
        return False

    def enable_pattern(self, pattern_name: str, enabled: bool = True) -> bool:
        """Enable or disable a pattern.

        Returns True if the pattern was found, False otherwise.
        """
        for pattern in self._patterns:
            if pattern.name == pattern_name:
                pattern.enabled = enabled
                return True
        return False

    async def filter(
        self,
        content: str,
        context: dict[str, Any] | None = None,  # currently unused by this method
        raise_on_block: bool = False,
    ) -> FilterResult:
        """
        Filter content for sensitive information.

        Args:
            content: Content to filter
            context: Optional context for filtering decisions
            raise_on_block: Raise exception if content is blocked

        Returns:
            FilterResult with filtered content and match details

        Raises:
            ContentFilterError: If content is blocked and raise_on_block=True
        """
        all_matches: list[FilterMatch] = []
        blocked = False
        block_reason: str | None = None
        warnings: list[str] = []

        # Find all matches
        for pattern in self._patterns:
            if not pattern.enabled:
                continue

            matches = pattern.find_matches(content)
            for match in matches:
                all_matches.append(match)

                if pattern.action == FilterAction.BLOCK:
                    blocked = True
                    block_reason = f"Blocked by pattern: {pattern.name}"
                elif pattern.action == FilterAction.WARN:
                    warnings.append(
                        f"Warning: {pattern.name} detected at position {match.start_pos}"
                    )

        # Sort matches by position (reverse for replacement)
        # Replacing right-to-left keeps earlier offsets valid as text shifts.
        # NOTE(review): overlapping matches from different patterns can
        # still double-splice the same span — confirm acceptable.
        all_matches.sort(key=lambda m: m.start_pos, reverse=True)

        # Apply redactions
        filtered_content = content
        for match in all_matches:
            matched_pattern = self._get_pattern(match.pattern_name)
            if matched_pattern and matched_pattern.action in (
                FilterAction.REDACT,
                FilterAction.BLOCK,
            ):
                filtered_content = (
                    filtered_content[: match.start_pos]
                    + (match.redacted_text or "[REDACTED]")
                    + filtered_content[match.end_pos :]
                )

        # Re-sort for result
        all_matches.sort(key=lambda m: m.start_pos)

        result = FilterResult(
            original_content=content,
            # Blocked content yields an empty filtered output.
            filtered_content=filtered_content if not blocked else "",
            matches=all_matches,
            blocked=blocked,
            block_reason=block_reason,
            warnings=warnings,
        )

        if blocked:
            logger.warning(
                "Content blocked: %s (%d matches)",
                block_reason,
                len(all_matches),
            )
            if raise_on_block:
                raise ContentFilterError(
                    block_reason or "Content blocked",
                    filter_type=all_matches[0].category.value
                    if all_matches
                    else "unknown",
                    detected_patterns=[m.pattern_name for m in all_matches]
                    if all_matches
                    else [],
                )
        elif all_matches:
            logger.debug(
                "Content filtered: %d matches, %d warnings",
                len(all_matches),
                len(warnings),
            )

        return result

    async def filter_dict(
        self,
        data: dict[str, Any],
        keys_to_filter: list[str] | None = None,
        recursive: bool = True,
    ) -> dict[str, Any]:
        """
        Filter string values in a dictionary.

        Builds and returns a new dict; the input is not mutated. String
        items inside lists are always filtered (keys_to_filter does not
        apply within lists).

        Args:
            data: Dictionary to filter
            keys_to_filter: Specific keys to filter (None = all)
            recursive: Filter nested dictionaries

        Returns:
            Filtered dictionary
        """
        result: dict[str, Any] = {}

        for key, value in data.items():
            if isinstance(value, str):
                if keys_to_filter is None or key in keys_to_filter:
                    filter_result = await self.filter(value)
                    result[key] = filter_result.filtered_content
                else:
                    result[key] = value
            elif isinstance(value, dict) and recursive:
                result[key] = await self.filter_dict(value, keys_to_filter, recursive)
            elif isinstance(value, list):
                result[key] = [
                    (await self.filter(item)).filtered_content
                    if isinstance(item, str)
                    else item
                    for item in value
                ]
            else:
                result[key] = value

        return result

    async def scan(
        self,
        content: str,
        categories: list[ContentCategory] | None = None,
    ) -> list[FilterMatch]:
        """
        Scan content without filtering (detection only).

        Args:
            content: Content to scan
            categories: Limit to specific categories

        Returns:
            List of matches found, sorted by start position
        """
        all_matches: list[FilterMatch] = []

        for pattern in self._patterns:
            if not pattern.enabled:
                continue
            if categories and pattern.category not in categories:
                continue

            matches = pattern.find_matches(content)
            all_matches.extend(matches)

        all_matches.sort(key=lambda m: m.start_pos)
        return all_matches

    async def validate_safe(
        self,
        content: str,
        categories: list[ContentCategory] | None = None,
        allow_warnings: bool = True,
    ) -> tuple[bool, list[str]]:
        """
        Validate that content is safe (no blocked patterns).

        Args:
            content: Content to validate
            categories: Limit to specific categories
            allow_warnings: Allow content with warnings

        Returns:
            Tuple of (is_safe, list of issues)
        """
        issues: list[str] = []

        for pattern in self._patterns:
            if not pattern.enabled:
                continue
            if categories and pattern.category not in categories:
                continue

            matches = pattern.find_matches(content)
            for match in matches:
                if pattern.action == FilterAction.BLOCK:
                    issues.append(
                        f"Blocked: {pattern.name} at position {match.start_pos}"
                    )
                elif pattern.action == FilterAction.WARN and not allow_warnings:
                    issues.append(
                        f"Warning: {pattern.name} at position {match.start_pos}"
                    )

        return len(issues) == 0, issues

    def _get_pattern(self, name: str) -> FilterPattern | None:
        """Get a pattern by name (first match wins), or None if absent."""
        for pattern in self._patterns:
            if pattern.name == name:
                return pattern
        return None

    def get_pattern_stats(self) -> dict[str, Any]:
        """Get statistics about configured patterns.

        Returns total/enabled counts plus per-category and per-action
        breakdowns (all patterns counted, enabled or not).
        """
        by_category: dict[str, int] = {}
        by_action: dict[str, int] = {}

        for pattern in self._patterns:
            cat = pattern.category.value
            by_category[cat] = by_category.get(cat, 0) + 1

            act = pattern.action.value
            by_action[act] = by_action.get(act, 0) + 1

        return {
            "total_patterns": len(self._patterns),
            "enabled_patterns": sum(1 for p in self._patterns if p.enabled),
            "by_category": by_category,
            "by_action": by_action,
        }
|
||||||
|
|
||||||
|
|
||||||
|
# Convenience function for quick filtering
|
||||||
|
async def filter_content(content: str) -> str:
    """Run *content* through a default-configured ContentFilter.

    Returns the sanitized text (empty if the content was blocked).
    """
    result = await ContentFilter().filter(content)
    return result.filtered_content
|
||||||
|
|
||||||
|
|
||||||
|
async def scan_for_secrets(content: str) -> list[FilterMatch]:
    """Detection-only scan restricted to secret and credential patterns."""
    secrets_only = ContentFilter(
        enable_pii_filter=False,
        enable_injection_filter=False,
    )
    wanted = [ContentCategory.SECRETS, ContentCategory.CREDENTIALS]
    return await secrets_only.scan(content, categories=wanted)
|
||||||
15
backend/app/services/safety/costs/__init__.py
Normal file
15
backend/app/services/safety/costs/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
"""
|
||||||
|
Cost Control Module
|
||||||
|
|
||||||
|
Budget management and cost tracking.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .controller import (
|
||||||
|
BudgetTracker,
|
||||||
|
CostController,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"BudgetTracker",
|
||||||
|
"CostController",
|
||||||
|
]
|
||||||
498
backend/app/services/safety/costs/controller.py
Normal file
498
backend/app/services/safety/costs/controller.py
Normal file
@@ -0,0 +1,498 @@
|
|||||||
|
"""
|
||||||
|
Cost Controller
|
||||||
|
|
||||||
|
Budget management and cost tracking for agent operations.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from ..config import get_safety_config
|
||||||
|
from ..exceptions import BudgetExceededError
|
||||||
|
from ..models import (
|
||||||
|
ActionRequest,
|
||||||
|
BudgetScope,
|
||||||
|
BudgetStatus,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class BudgetTracker:
    """Tracks usage against a budget limit.

    Accumulates token and USD usage for one scope (e.g. a session or an
    agent's daily budget) and optionally auto-resets on a fixed interval.
    All mutating/reading methods serialize on an asyncio lock.
    """

    def __init__(
        self,
        scope: BudgetScope,
        scope_id: str,
        tokens_limit: int,
        cost_limit_usd: float,
        reset_interval: timedelta | None = None,
        warning_threshold: float = 0.8,
    ) -> None:
        """Create a tracker.

        Args:
            scope: Budget scope kind (session, daily, ...).
            scope_id: Identifier of the entity this budget belongs to.
            tokens_limit: Maximum tokens allowed in the window.
            cost_limit_usd: Maximum USD spend allowed in the window.
            reset_interval: If set, usage auto-resets after this interval.
            warning_threshold: Usage ratio at which is_warning trips.
        """
        self.scope = scope
        self.scope_id = scope_id
        self.tokens_limit = tokens_limit
        self.cost_limit_usd = cost_limit_usd
        self.warning_threshold = warning_threshold
        self._reset_interval = reset_interval

        self._tokens_used = 0
        self._cost_used_usd = 0.0
        # NOTE(review): datetime.utcnow() is naive (and deprecated in
        # 3.12+); reset_at inherits that naivety — confirm callers never
        # compare it against timezone-aware datetimes.
        self._created_at = datetime.utcnow()
        self._last_reset = datetime.utcnow()
        self._lock = asyncio.Lock()

    async def add_usage(self, tokens: int, cost_usd: float) -> None:
        """Add usage to the tracker."""
        async with self._lock:
            self._check_reset()
            self._tokens_used += tokens
            self._cost_used_usd += cost_usd

    async def get_status(self) -> BudgetStatus:
        """Get current budget status."""
        async with self._lock:
            self._check_reset()

            tokens_remaining = max(0, self.tokens_limit - self._tokens_used)
            cost_remaining = max(0, self.cost_limit_usd - self._cost_used_usd)

            # Ratios guard against zero limits to avoid ZeroDivisionError.
            token_usage_ratio = (
                self._tokens_used / self.tokens_limit if self.tokens_limit > 0 else 0
            )
            cost_usage_ratio = (
                self._cost_used_usd / self.cost_limit_usd
                if self.cost_limit_usd > 0
                else 0
            )

            # Warning trips on whichever dimension is closest to its limit.
            is_warning = (
                max(token_usage_ratio, cost_usage_ratio) >= self.warning_threshold
            )
            is_exceeded = (
                self._tokens_used >= self.tokens_limit
                or self._cost_used_usd >= self.cost_limit_usd
            )

            reset_at = None
            if self._reset_interval:
                reset_at = self._last_reset + self._reset_interval

            return BudgetStatus(
                scope=self.scope,
                scope_id=self.scope_id,
                tokens_used=self._tokens_used,
                tokens_limit=self.tokens_limit,
                cost_used_usd=self._cost_used_usd,
                cost_limit_usd=self.cost_limit_usd,
                tokens_remaining=tokens_remaining,
                cost_remaining_usd=cost_remaining,
                warning_threshold=self.warning_threshold,
                is_warning=is_warning,
                is_exceeded=is_exceeded,
                reset_at=reset_at,
            )

    async def check_budget(
        self, estimated_tokens: int, estimated_cost_usd: float
    ) -> bool:
        """Check if there's enough budget for an operation.

        NOTE(review): check_budget and add_usage are separate lock
        acquisitions, so a check-then-add sequence is not atomic across
        concurrent callers — confirm this race is acceptable.
        """
        async with self._lock:
            self._check_reset()

            would_exceed_tokens = (
                self._tokens_used + estimated_tokens
            ) > self.tokens_limit
            would_exceed_cost = (
                self._cost_used_usd + estimated_cost_usd
            ) > self.cost_limit_usd

            return not (would_exceed_tokens or would_exceed_cost)

    def _check_reset(self) -> None:
        """Check if budget should reset.

        Must be called with self._lock held (all callers here do so).
        """
        if self._reset_interval is None:
            return

        now = datetime.utcnow()
        if now >= self._last_reset + self._reset_interval:
            logger.info(
                "Resetting budget for %s:%s",
                self.scope.value,
                self.scope_id,
            )
            self._tokens_used = 0
            self._cost_used_usd = 0.0
            self._last_reset = now

    async def reset(self) -> None:
        """Manually reset the budget."""
        async with self._lock:
            self._tokens_used = 0
            self._cost_used_usd = 0.0
            self._last_reset = datetime.utcnow()
|
||||||
|
|
||||||
|
|
||||||
|
class CostController:
|
||||||
|
"""
|
||||||
|
Controls costs and budgets for agent operations.
|
||||||
|
|
||||||
|
Features:
|
||||||
|
- Per-agent, per-project, per-session budgets
|
||||||
|
- Real-time cost tracking
|
||||||
|
- Budget alerts at configurable thresholds
|
||||||
|
- Cost prediction for planned actions
|
||||||
|
- Budget rollover policies
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
default_session_tokens: int | None = None,
|
||||||
|
default_session_cost_usd: float | None = None,
|
||||||
|
default_daily_tokens: int | None = None,
|
||||||
|
default_daily_cost_usd: float | None = None,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Initialize the CostController.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
default_session_tokens: Default token budget per session
|
||||||
|
default_session_cost_usd: Default USD budget per session
|
||||||
|
default_daily_tokens: Default token budget per day
|
||||||
|
default_daily_cost_usd: Default USD budget per day
|
||||||
|
"""
|
||||||
|
config = get_safety_config()
|
||||||
|
|
||||||
|
self._default_session_tokens = (
|
||||||
|
default_session_tokens or config.default_session_token_budget
|
||||||
|
)
|
||||||
|
self._default_session_cost = (
|
||||||
|
default_session_cost_usd or config.default_session_cost_limit
|
||||||
|
)
|
||||||
|
self._default_daily_tokens = (
|
||||||
|
default_daily_tokens or config.default_daily_token_budget
|
||||||
|
)
|
||||||
|
self._default_daily_cost = (
|
||||||
|
default_daily_cost_usd or config.default_daily_cost_limit
|
||||||
|
)
|
||||||
|
|
||||||
|
self._trackers: dict[str, BudgetTracker] = {}
|
||||||
|
self._lock = asyncio.Lock()
|
||||||
|
|
||||||
|
# Alert handlers
|
||||||
|
self._alert_handlers: list[Any] = []
|
||||||
|
|
||||||
|
# Track which budgets have had warning alerts sent (to avoid spam)
|
||||||
|
self._warned_budgets: set[str] = set()
|
||||||
|
|
||||||
|
async def get_or_create_tracker(
|
||||||
|
self,
|
||||||
|
scope: BudgetScope,
|
||||||
|
scope_id: str,
|
||||||
|
) -> BudgetTracker:
|
||||||
|
"""Get or create a budget tracker."""
|
||||||
|
key = f"{scope.value}:{scope_id}"
|
||||||
|
|
||||||
|
async with self._lock:
|
||||||
|
if key not in self._trackers:
|
||||||
|
if scope == BudgetScope.SESSION:
|
||||||
|
tracker = BudgetTracker(
|
||||||
|
scope=scope,
|
||||||
|
scope_id=scope_id,
|
||||||
|
tokens_limit=self._default_session_tokens,
|
||||||
|
cost_limit_usd=self._default_session_cost,
|
||||||
|
)
|
||||||
|
elif scope == BudgetScope.DAILY:
|
||||||
|
tracker = BudgetTracker(
|
||||||
|
scope=scope,
|
||||||
|
scope_id=scope_id,
|
||||||
|
tokens_limit=self._default_daily_tokens,
|
||||||
|
cost_limit_usd=self._default_daily_cost,
|
||||||
|
reset_interval=timedelta(days=1),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
# Default
|
||||||
|
tracker = BudgetTracker(
|
||||||
|
scope=scope,
|
||||||
|
scope_id=scope_id,
|
||||||
|
tokens_limit=self._default_session_tokens,
|
||||||
|
cost_limit_usd=self._default_session_cost,
|
||||||
|
)
|
||||||
|
|
||||||
|
self._trackers[key] = tracker
|
||||||
|
|
||||||
|
return self._trackers[key]
|
||||||
|
|
||||||
|
async def check_budget(
|
||||||
|
self,
|
||||||
|
agent_id: str,
|
||||||
|
session_id: str | None,
|
||||||
|
estimated_tokens: int,
|
||||||
|
estimated_cost_usd: float,
|
||||||
|
) -> bool:
|
||||||
|
"""
|
||||||
|
Check if there's enough budget for an operation.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
agent_id: ID of the agent
|
||||||
|
session_id: Optional session ID
|
||||||
|
estimated_tokens: Estimated token usage
|
||||||
|
estimated_cost_usd: Estimated USD cost
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if budget is available
|
||||||
|
"""
|
||||||
|
# Check session budget
|
||||||
|
if session_id:
|
||||||
|
session_tracker = await self.get_or_create_tracker(
|
||||||
|
BudgetScope.SESSION, session_id
|
||||||
|
)
|
||||||
|
if not await session_tracker.check_budget(
|
||||||
|
estimated_tokens, estimated_cost_usd
|
||||||
|
):
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Check agent daily budget
|
||||||
|
agent_tracker = await self.get_or_create_tracker(BudgetScope.DAILY, agent_id)
|
||||||
|
if not await agent_tracker.check_budget(estimated_tokens, estimated_cost_usd):
|
||||||
|
return False
|
||||||
|
|
||||||
|
return True
|
||||||
|
|
||||||
|
async def check_action(self, action: "ActionRequest") -> bool:
    """
    Check if an action is within budget.

    Args:
        action: The action to check; its metadata supplies the agent and
            session, and its estimated_cost_* fields supply the usage

    Returns:
        True if within budget
    """
    meta = action.metadata
    return await self.check_budget(
        agent_id=meta.agent_id,
        session_id=meta.session_id,
        estimated_tokens=action.estimated_cost_tokens,
        estimated_cost_usd=action.estimated_cost_usd,
    )
|
||||||
|
|
||||||
|
async def require_budget(
    self,
    agent_id: str,
    session_id: str | None,
    estimated_tokens: int,
    estimated_cost_usd: float,
) -> None:
    """
    Require budget or raise exception.

    Args:
        agent_id: ID of the agent
        session_id: Optional session ID
        estimated_tokens: Estimated token usage
        estimated_cost_usd: Estimated USD cost

    Raises:
        BudgetExceededError: If budget is exceeded
    """
    if await self.check_budget(
        agent_id, session_id, estimated_tokens, estimated_cost_usd
    ):
        return

    # Work out which limit tripped so the error can say so; the session
    # budget is inspected first because it is the narrower scope.
    if session_id:
        session_tracker = await self.get_or_create_tracker(
            BudgetScope.SESSION, session_id
        )
        session_status = await session_tracker.get_status()
        if session_status.is_exceeded:
            raise BudgetExceededError(
                "Session budget exceeded",
                budget_type="session",
                current_usage=session_status.tokens_used,
                budget_limit=session_status.tokens_limit,
                agent_id=agent_id,
            )

    # Otherwise attribute the failure to the agent's daily budget.
    daily_tracker = await self.get_or_create_tracker(BudgetScope.DAILY, agent_id)
    daily_status = await daily_tracker.get_status()
    raise BudgetExceededError(
        "Daily budget exceeded",
        budget_type="daily",
        current_usage=daily_status.tokens_used,
        budget_limit=daily_status.tokens_limit,
        agent_id=agent_id,
    )
|
||||||
|
|
||||||
|
async def record_usage(
    self,
    agent_id: str,
    session_id: str | None,
    tokens: int,
    cost_usd: float,
) -> None:
    """
    Record actual usage.

    Adds the usage to the session budget (when a session is known) and to
    the agent's daily budget, emitting a one-shot warning alert for each
    budget that enters its warning band.

    Args:
        agent_id: ID of the agent
        session_id: Optional session ID
        tokens: Actual token usage
        cost_usd: Actual USD cost
    """
    # Update session budget.
    if session_id:
        session_tracker = await self.get_or_create_tracker(
            BudgetScope.SESSION, session_id
        )
        await session_tracker.add_usage(tokens, cost_usd)
        await self._alert_if_warning(
            f"session:{session_id}",
            session_tracker,
            lambda s: f"Session {session_id} at {s.tokens_used}/{s.tokens_limit} tokens",
        )

    # Update agent daily budget.
    agent_tracker = await self.get_or_create_tracker(BudgetScope.DAILY, agent_id)
    await agent_tracker.add_usage(tokens, cost_usd)
    await self._alert_if_warning(
        f"daily:{agent_id}",
        agent_tracker,
        lambda s: f"Agent {agent_id} at {s.tokens_used}/{s.tokens_limit} daily tokens",
    )

async def _alert_if_warning(self, key: str, tracker: Any, format_message: Any) -> None:
    """
    Send at most one warning alert per budget while it sits in the warning
    band (avoids alert spam), and clear the sent-flag once usage drops back
    below the threshold (e.g., after a reset).

    Args:
        key: Dedup key ("<scope>:<id>") for the warned-budgets set
        tracker: Tracker whose status to inspect
        format_message: Callable mapping a BudgetStatus to the alert text
    """
    status = await tracker.get_status()
    if status.is_warning and not status.is_exceeded:
        if key not in self._warned_budgets:
            self._warned_budgets.add(key)
            await self._send_alert("warning", format_message(status), status)
    elif not status.is_warning:
        self._warned_budgets.discard(key)
|
||||||
|
|
||||||
|
async def get_status(
    self,
    scope: "BudgetScope",
    scope_id: str,
) -> "BudgetStatus | None":
    """
    Get budget status.

    Args:
        scope: Budget scope
        scope_id: ID within scope

    Returns:
        Budget status or None if not tracked
    """
    key = f"{scope.value}:{scope_id}"
    async with self._lock:
        # Look up and read under the same lock so the tracker cannot be
        # swapped out in between (TOCTOU).
        tracker = self._trackers.get(key)
        return await tracker.get_status() if tracker else None
|
||||||
|
|
||||||
|
async def get_all_statuses(self) -> list[BudgetStatus]:
|
||||||
|
"""Get status of all tracked budgets."""
|
||||||
|
statuses = []
|
||||||
|
async with self._lock:
|
||||||
|
# Get all statuses while holding lock to prevent TOCTOU race
|
||||||
|
for tracker in self._trackers.values():
|
||||||
|
statuses.append(await tracker.get_status())
|
||||||
|
return statuses
|
||||||
|
|
||||||
|
async def set_budget(
    self,
    scope: BudgetScope,
    scope_id: str,
    tokens_limit: int,
    cost_limit_usd: float,
) -> None:
    """
    Set a custom budget limit.

    Replaces any existing tracker for the same scope/id.

    Args:
        scope: Budget scope
        scope_id: ID within scope
        tokens_limit: Token limit
        cost_limit_usd: USD limit
    """
    # Periodic scopes reset automatically; MONTHLY is approximated as
    # 30 days.  Every other scope gets no auto-reset interval.
    interval_for_scope = {
        BudgetScope.DAILY: timedelta(days=1),
        BudgetScope.WEEKLY: timedelta(weeks=1),
        BudgetScope.MONTHLY: timedelta(days=30),
    }
    reset_interval = interval_for_scope.get(scope)

    key = f"{scope.value}:{scope_id}"
    async with self._lock:
        self._trackers[key] = BudgetTracker(
            scope=scope,
            scope_id=scope_id,
            tokens_limit=tokens_limit,
            cost_limit_usd=cost_limit_usd,
            reset_interval=reset_interval,
        )
|
||||||
|
|
||||||
|
async def reset_budget(self, scope: "BudgetScope", scope_id: str) -> bool:
    """
    Reset a budget tracker.

    Args:
        scope: Budget scope
        scope_id: ID within scope

    Returns:
        True if tracker was found and reset
    """
    key = f"{scope.value}:{scope_id}"
    async with self._lock:
        # Look up and reset under the same lock so the tracker cannot be
        # replaced in between (TOCTOU).
        tracker = self._trackers.get(key)
        if not tracker:
            return False
        await tracker.reset()
        return True
|
||||||
|
|
||||||
|
def add_alert_handler(self, handler: Any) -> None:
    """Register *handler* to receive budget alerts from _send_alert."""
    self._alert_handlers.append(handler)
|
||||||
|
|
||||||
|
def remove_alert_handler(self, handler: Any) -> None:
    """Deregister a previously added alert handler; no-op if absent."""
    try:
        self._alert_handlers.remove(handler)
    except ValueError:
        # Handler was never registered (or already removed) -- ignore,
        # matching the original membership-test behavior.
        pass
|
||||||
|
|
||||||
|
async def _send_alert(
    self,
    alert_type: str,
    message: str,
    status: BudgetStatus,
) -> None:
    """
    Fan an alert out to every registered handler.

    Handler failures are logged and swallowed so one misbehaving handler
    cannot block the remaining handlers or the caller.
    """
    for handler in self._alert_handlers:
        try:
            if asyncio.iscoroutinefunction(handler):
                await handler(alert_type, message, status)
                continue
            handler(alert_type, message, status)
        except Exception as exc:
            logger.error("Error in alert handler: %s", exc)
|
||||||
23
backend/app/services/safety/emergency/__init__.py
Normal file
23
backend/app/services/safety/emergency/__init__.py
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
"""Emergency controls for agent safety."""
|
||||||
|
|
||||||
|
from .controls import (
|
||||||
|
EmergencyControls,
|
||||||
|
EmergencyEvent,
|
||||||
|
EmergencyReason,
|
||||||
|
EmergencyState,
|
||||||
|
EmergencyTrigger,
|
||||||
|
check_emergency_allowed,
|
||||||
|
emergency_stop_global,
|
||||||
|
get_emergency_controls,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"EmergencyControls",
|
||||||
|
"EmergencyEvent",
|
||||||
|
"EmergencyReason",
|
||||||
|
"EmergencyState",
|
||||||
|
"EmergencyTrigger",
|
||||||
|
"check_emergency_allowed",
|
||||||
|
"emergency_stop_global",
|
||||||
|
"get_emergency_controls",
|
||||||
|
]
|
||||||
596
backend/app/services/safety/emergency/controls.py
Normal file
596
backend/app/services/safety/emergency/controls.py
Normal file
@@ -0,0 +1,596 @@
|
|||||||
|
"""
|
||||||
|
Emergency Controls
|
||||||
|
|
||||||
|
Emergency stop and pause functionality for agent safety.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
from collections.abc import Callable
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from ..exceptions import EmergencyStopError
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class EmergencyState(str, Enum):
    """Emergency control states."""

    NORMAL = "normal"    # operations allowed
    PAUSED = "paused"    # halted, but recoverable via resume()
    STOPPED = "stopped"  # hard stop; requires an explicit reset()
|
||||||
|
|
||||||
|
|
||||||
|
class EmergencyReason(str, Enum):
    """Reasons for emergency actions (recorded on each EmergencyEvent)."""

    MANUAL = "manual"
    SAFETY_VIOLATION = "safety_violation"
    BUDGET_EXCEEDED = "budget_exceeded"
    LOOP_DETECTED = "loop_detected"
    RATE_LIMIT = "rate_limit"
    CONTENT_VIOLATION = "content_violation"
    SYSTEM_ERROR = "system_error"
    EXTERNAL_TRIGGER = "external_trigger"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class EmergencyEvent:
    """Record of an emergency action (stop or pause) and its resolution."""

    id: str
    state: EmergencyState
    reason: EmergencyReason
    triggered_by: str
    message: str
    scope: str  # "global", "project:<id>", "agent:<id>"
    # NOTE(review): datetime.utcnow() produces a naive datetime and is
    # deprecated since Python 3.12 -- consider datetime.now(timezone.utc),
    # but confirm nothing compares these against naive timestamps first.
    timestamp: datetime = field(default_factory=datetime.utcnow)
    metadata: dict[str, Any] = field(default_factory=dict)
    # Set by resume()/reset() when the event is resolved; None while active.
    resolved_at: datetime | None = None
    resolved_by: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
class EmergencyControls:
    """
    Emergency stop and pause controls for agent safety.

    Features:
    - Global emergency stop
    - Per-project/agent emergency controls
    - Graceful pause with state preservation
    - Automatic triggers from safety violations
    - Manual override capabilities
    - Event history and audit trail
    """

    def __init__(
        self,
        notification_handlers: list[Callable[..., Any]] | None = None,
    ) -> None:
        """
        Initialize EmergencyControls.

        Args:
            notification_handlers: Handlers to call on emergency events
        """
        self._global_state = EmergencyState.NORMAL
        # Non-global scopes ("project:<id>", "agent:<id>"); a missing key
        # means that scope is NORMAL.
        self._scoped_states: dict[str, EmergencyState] = {}
        self._events: list[EmergencyEvent] = []
        self._notification_handlers = notification_handlers or []
        self._lock = asyncio.Lock()
        self._event_id_counter = 0

        # Callbacks for state changes
        self._on_stop_callbacks: list[Callable[..., Any]] = []
        self._on_pause_callbacks: list[Callable[..., Any]] = []
        self._on_resume_callbacks: list[Callable[..., Any]] = []

    def _generate_event_id(self) -> str:
        """Generate a unique event ID (callers must hold self._lock)."""
        self._event_id_counter += 1
        return f"emerg-{self._event_id_counter:06d}"

    async def emergency_stop(
        self,
        reason: EmergencyReason,
        triggered_by: str,
        message: str,
        scope: str = "global",
        metadata: dict[str, Any] | None = None,
    ) -> EmergencyEvent:
        """
        Trigger emergency stop.

        A stopped scope cannot be resume()d; it requires an explicit reset().

        Args:
            reason: Reason for the stop
            triggered_by: Who/what triggered the stop
            message: Human-readable message
            scope: Scope of the stop (global, project:<id>, agent:<id>)
            metadata: Additional context

        Returns:
            The emergency event record
        """
        async with self._lock:
            event = EmergencyEvent(
                id=self._generate_event_id(),
                state=EmergencyState.STOPPED,
                reason=reason,
                triggered_by=triggered_by,
                message=message,
                scope=scope,
                metadata=metadata or {},
            )

            if scope == "global":
                self._global_state = EmergencyState.STOPPED
            else:
                self._scoped_states[scope] = EmergencyState.STOPPED

            self._events.append(event)

            logger.critical(
                "EMERGENCY STOP: scope=%s, reason=%s, by=%s - %s",
                scope,
                reason.value,
                triggered_by,
                message,
            )

            # NOTE(review): callbacks/handlers run while self._lock is held;
            # a handler that calls back into this object will deadlock.
            await self._execute_callbacks(self._on_stop_callbacks, event)
            await self._notify_handlers("emergency_stop", event)

            return event

    async def pause(
        self,
        reason: EmergencyReason,
        triggered_by: str,
        message: str,
        scope: str = "global",
        metadata: dict[str, Any] | None = None,
    ) -> EmergencyEvent:
        """
        Pause operations (can be resumed).

        Args:
            reason: Reason for the pause
            triggered_by: Who/what triggered the pause
            message: Human-readable message
            scope: Scope of the pause
            metadata: Additional context

        Returns:
            The emergency event record
        """
        async with self._lock:
            event = EmergencyEvent(
                id=self._generate_event_id(),
                state=EmergencyState.PAUSED,
                reason=reason,
                triggered_by=triggered_by,
                message=message,
                scope=scope,
                metadata=metadata or {},
            )

            if scope == "global":
                self._global_state = EmergencyState.PAUSED
            else:
                self._scoped_states[scope] = EmergencyState.PAUSED

            self._events.append(event)

            logger.warning(
                "PAUSE: scope=%s, reason=%s, by=%s - %s",
                scope,
                reason.value,
                triggered_by,
                message,
            )

            # NOTE(review): callbacks/handlers run while self._lock is held;
            # a handler that calls back into this object will deadlock.
            await self._execute_callbacks(self._on_pause_callbacks, event)
            await self._notify_handlers("pause", event)

            return event

    async def resume(
        self,
        scope: str = "global",
        resumed_by: str = "system",
        message: str | None = None,
    ) -> bool:
        """
        Resume operations from paused state.

        Args:
            scope: Scope to resume
            resumed_by: Who/what is resuming
            message: Optional message

        Returns:
            True if resumed, False if not in paused state
        """
        async with self._lock:
            current_state = self._get_state(scope)

            # A hard stop cannot be resumed -- only reset() clears it.
            if current_state == EmergencyState.STOPPED:
                logger.warning(
                    "Cannot resume from STOPPED state: %s (requires reset)",
                    scope,
                )
                return False

            if current_state == EmergencyState.NORMAL:
                return True  # Already normal

            # Mark the most recent pause event for this scope as resolved.
            for event in reversed(self._events):
                if event.scope == scope and event.state == EmergencyState.PAUSED:
                    if event.resolved_at is None:
                        event.resolved_at = datetime.utcnow()
                        event.resolved_by = resumed_by
                    break

            if scope == "global":
                self._global_state = EmergencyState.NORMAL
            else:
                self._scoped_states[scope] = EmergencyState.NORMAL

            logger.info(
                "RESUMED: scope=%s, by=%s%s",
                scope,
                resumed_by,
                f" - {message}" if message else "",
            )

            await self._execute_callbacks(
                self._on_resume_callbacks,
                {"scope": scope, "resumed_by": resumed_by},
            )
            await self._notify_handlers(
                "resume", {"scope": scope, "resumed_by": resumed_by}
            )

            return True

    async def reset(
        self,
        scope: str = "global",
        reset_by: str = "admin",
        message: str | None = None,
    ) -> bool:
        """
        Reset from stopped state (requires explicit action).

        Args:
            scope: Scope to reset
            reset_by: Who is resetting (should be admin)
            message: Optional message

        Returns:
            True if reset successful
        """
        async with self._lock:
            current_state = self._get_state(scope)

            if current_state == EmergencyState.NORMAL:
                return True

            # Mark the most recent stop event for this scope as resolved.
            for event in reversed(self._events):
                if event.scope == scope and event.state == EmergencyState.STOPPED:
                    if event.resolved_at is None:
                        event.resolved_at = datetime.utcnow()
                        event.resolved_by = reset_by
                    break

            if scope == "global":
                self._global_state = EmergencyState.NORMAL
            else:
                self._scoped_states[scope] = EmergencyState.NORMAL

            logger.warning(
                "EMERGENCY RESET: scope=%s, by=%s%s",
                scope,
                reset_by,
                f" - {message}" if message else "",
            )

            await self._notify_handlers("reset", {"scope": scope, "reset_by": reset_by})

            return True

    async def check_allowed(
        self,
        scope: str | None = None,
        raise_if_blocked: bool = True,
    ) -> bool:
        """
        Check if operations are allowed.

        Args:
            scope: Specific scope to check (also checks global)
            raise_if_blocked: Raise exception if blocked

        Returns:
            True if operations are allowed

        Raises:
            EmergencyStopError: If blocked and raise_if_blocked=True
        """
        async with self._lock:
            # The global state always applies, regardless of scope.
            if self._global_state != EmergencyState.NORMAL:
                if raise_if_blocked:
                    raise EmergencyStopError(
                        f"Global emergency state: {self._global_state.value}",
                        stop_type=self._get_last_reason("global") or "emergency",
                        triggered_by=self._get_last_triggered_by("global"),
                    )
                return False

            # Then the specific scope, if one was given and is tracked.
            if scope and scope in self._scoped_states:
                state = self._scoped_states[scope]
                if state != EmergencyState.NORMAL:
                    if raise_if_blocked:
                        raise EmergencyStopError(
                            f"Emergency state for {scope}: {state.value}",
                            stop_type=self._get_last_reason(scope) or "emergency",
                            triggered_by=self._get_last_triggered_by(scope),
                            details={"scope": scope},
                        )
                    return False

            return True

    def _get_state(self, scope: str) -> EmergencyState:
        """Get state for a scope (untracked scopes are NORMAL)."""
        if scope == "global":
            return self._global_state
        return self._scoped_states.get(scope, EmergencyState.NORMAL)

    def _get_last_reason(self, scope: str) -> str:
        """Get reason from the most recent unresolved event for scope."""
        for event in reversed(self._events):
            if event.scope == scope and event.resolved_at is None:
                return event.reason.value
        return "unknown"

    def _get_last_triggered_by(self, scope: str) -> str:
        """Get triggered_by from the most recent unresolved event for scope."""
        for event in reversed(self._events):
            if event.scope == scope and event.resolved_at is None:
                return event.triggered_by
        return "unknown"

    async def get_state(self, scope: str = "global") -> EmergencyState:
        """Get current state for a scope."""
        async with self._lock:
            return self._get_state(scope)

    async def get_all_states(self) -> dict[str, EmergencyState]:
        """Get all current states, keyed by scope (including "global")."""
        async with self._lock:
            states = {"global": self._global_state}
            states.update(self._scoped_states)
            return states

    async def get_active_events(self) -> list[EmergencyEvent]:
        """Get all unresolved emergency events."""
        async with self._lock:
            return [e for e in self._events if e.resolved_at is None]

    async def get_event_history(
        self,
        scope: str | None = None,
        limit: int = 100,
    ) -> list[EmergencyEvent]:
        """
        Get emergency event history (the most recent events).

        Args:
            scope: Optional scope filter
            limit: Maximum number of events to return; non-positive yields []

        Returns:
            Up to `limit` most recent (optionally scope-filtered) events
        """
        async with self._lock:
            events = list(self._events)

            if scope:
                events = [e for e in events if e.scope == scope]

            # Guard limit <= 0 explicitly: events[-0:] is the WHOLE list,
            # which would silently ignore the limit.
            return events[-limit:] if limit > 0 else []

    def on_stop(self, callback: Callable[..., Any]) -> None:
        """Register callback for stop events."""
        self._on_stop_callbacks.append(callback)

    def on_pause(self, callback: Callable[..., Any]) -> None:
        """Register callback for pause events."""
        self._on_pause_callbacks.append(callback)

    def on_resume(self, callback: Callable[..., Any]) -> None:
        """Register callback for resume events."""
        self._on_resume_callbacks.append(callback)

    def add_notification_handler(self, handler: Callable[..., Any]) -> None:
        """Add a notification handler."""
        self._notification_handlers.append(handler)

    async def _execute_callbacks(
        self,
        callbacks: list[Callable[..., Any]],
        data: Any,
    ) -> None:
        """Execute callbacks safely; failures are logged, never propagated."""
        for callback in callbacks:
            try:
                if asyncio.iscoroutinefunction(callback):
                    await callback(data)
                else:
                    callback(data)
            except Exception as e:
                logger.error("Error in callback: %s", e)

    async def _notify_handlers(self, event_type: str, data: Any) -> None:
        """Notify all handlers of an event; failures are logged, never propagated."""
        for handler in self._notification_handlers:
            try:
                if asyncio.iscoroutinefunction(handler):
                    await handler(event_type, data)
                else:
                    handler(event_type, data)
            except Exception as e:
                logger.error("Error in notification handler: %s", e)
|
||||||
|
|
||||||
|
|
||||||
|
class EmergencyTrigger:
    """
    Automatic emergency triggers based on conditions.

    Each trigger_* method maps a safety-subsystem finding onto the
    appropriate emergency action: hard stops for safety/content
    violations, recoverable pauses for budget and loop findings.
    """

    def __init__(self, controls: EmergencyControls) -> None:
        """
        Initialize EmergencyTrigger.

        Args:
            controls: EmergencyControls instance to trigger
        """
        self._controls = controls

    async def trigger_on_safety_violation(
        self,
        violation_type: str,
        details: dict[str, Any],
        scope: str = "global",
    ) -> EmergencyEvent:
        """
        Trigger emergency (hard stop) from a safety violation.

        Args:
            violation_type: Type of violation
            details: Violation details (merged into the event metadata)
            scope: Scope for the emergency

        Returns:
            Emergency event
        """
        event_metadata = {"violation_type": violation_type, **details}
        return await self._controls.emergency_stop(
            reason=EmergencyReason.SAFETY_VIOLATION,
            triggered_by="safety_system",
            message=f"Safety violation: {violation_type}",
            scope=scope,
            metadata=event_metadata,
        )

    async def trigger_on_budget_exceeded(
        self,
        budget_type: str,
        current: float,
        limit: float,
        scope: str = "global",
    ) -> EmergencyEvent:
        """
        Trigger a recoverable pause from an exhausted budget.

        Args:
            budget_type: Type of budget
            current: Current usage
            limit: Budget limit
            scope: Scope for the emergency

        Returns:
            Emergency event
        """
        description = f"Budget exceeded: {budget_type} ({current:.2f}/{limit:.2f})"
        return await self._controls.pause(
            reason=EmergencyReason.BUDGET_EXCEEDED,
            triggered_by="budget_controller",
            message=description,
            scope=scope,
            metadata={"budget_type": budget_type, "current": current, "limit": limit},
        )

    async def trigger_on_loop_detected(
        self,
        loop_type: str,
        agent_id: str,
        details: dict[str, Any],
    ) -> EmergencyEvent:
        """
        Trigger a recoverable pause, scoped to the looping agent only.

        Args:
            loop_type: Type of loop
            agent_id: Agent that's looping
            details: Loop details (merged into the event metadata)

        Returns:
            Emergency event
        """
        event_metadata = {"loop_type": loop_type, "agent_id": agent_id, **details}
        return await self._controls.pause(
            reason=EmergencyReason.LOOP_DETECTED,
            triggered_by="loop_detector",
            message=f"Loop detected: {loop_type} in agent {agent_id}",
            scope=f"agent:{agent_id}",
            metadata=event_metadata,
        )

    async def trigger_on_content_violation(
        self,
        category: str,
        pattern: str,
        scope: str = "global",
    ) -> EmergencyEvent:
        """
        Trigger emergency (hard stop) from a content-filter match.

        Args:
            category: Content category
            pattern: Pattern that matched
            scope: Scope for the emergency

        Returns:
            Emergency event
        """
        return await self._controls.emergency_stop(
            reason=EmergencyReason.CONTENT_VIOLATION,
            triggered_by="content_filter",
            message=f"Content violation: {category} ({pattern})",
            scope=scope,
            metadata={"category": category, "pattern": pattern},
        )
|
||||||
|
|
||||||
|
|
||||||
|
# Singleton instance, created lazily by get_emergency_controls().
_emergency_controls: EmergencyControls | None = None
# Guards singleton construction against concurrent first callers.
_lock = asyncio.Lock()
|
||||||
|
|
||||||
|
|
||||||
|
async def get_emergency_controls() -> EmergencyControls:
    """Get the singleton EmergencyControls instance.

    Lazily constructs the instance on first use; the module-level lock
    ensures only one instance is created even when several coroutines
    race here concurrently.
    """
    global _emergency_controls

    async with _lock:
        if _emergency_controls is None:
            _emergency_controls = EmergencyControls()
        return _emergency_controls
|
||||||
|
|
||||||
|
|
||||||
|
async def emergency_stop_global(
    reason: str,
    triggered_by: str = "system",
) -> EmergencyEvent:
    """Quick global emergency stop (MANUAL reason) via the singleton controls."""
    return await (await get_emergency_controls()).emergency_stop(
        reason=EmergencyReason.MANUAL,
        triggered_by=triggered_by,
        message=reason,
        scope="global",
    )
|
||||||
|
|
||||||
|
|
||||||
|
async def check_emergency_allowed(scope: str | None = None) -> bool:
    """Quick, non-raising check of the singleton controls; True if allowed."""
    return await (await get_emergency_controls()).check_allowed(
        scope=scope, raise_if_blocked=False
    )
|
||||||
277
backend/app/services/safety/exceptions.py
Normal file
277
backend/app/services/safety/exceptions.py
Normal file
@@ -0,0 +1,277 @@
|
|||||||
|
"""
|
||||||
|
Safety Framework Exceptions
|
||||||
|
|
||||||
|
Custom exception classes for the safety and guardrails framework.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
|
||||||
|
class SafetyError(Exception):
|
||||||
|
"""Base exception for all safety-related errors."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
message: str,
|
||||||
|
*,
|
||||||
|
action_id: str | None = None,
|
||||||
|
agent_id: str | None = None,
|
||||||
|
details: dict[str, Any] | None = None,
|
||||||
|
) -> None:
|
||||||
|
super().__init__(message)
|
||||||
|
self.message = message
|
||||||
|
self.action_id = action_id
|
||||||
|
self.agent_id = agent_id
|
||||||
|
self.details = details or {}
|
||||||
|
|
||||||
|
|
||||||
|
class PermissionDeniedError(SafetyError):
    """Raised when an action is not permitted."""

    def __init__(
        self,
        message: str = "Permission denied",
        *,
        action_type: str | None = None,
        resource: str | None = None,
        required_permission: str | None = None,
        **kwargs: Any,
    ) -> None:
        """
        Args:
            message: Human-readable description.
            action_type: Kind of action that was denied.
            resource: Resource the action targeted.
            required_permission: Permission that would have allowed it.
            **kwargs: Forwarded to SafetyError (action_id, agent_id, details).
        """
        super().__init__(message, **kwargs)
        self.action_type = action_type
        self.resource = resource
        self.required_permission = required_permission
|
||||||
|
|
||||||
|
|
||||||
|
class BudgetExceededError(SafetyError):
    """Raised when cost budget is exceeded."""

    def __init__(
        self,
        message: str = "Budget exceeded",
        *,
        budget_type: str = "session",
        current_usage: float = 0.0,
        budget_limit: float = 0.0,
        unit: str = "tokens",
        **kwargs: Any,
    ) -> None:
        """
        Args:
            message: Human-readable description.
            budget_type: Which budget tripped (e.g. "session", "daily").
            current_usage: Usage level at the time of failure.
            budget_limit: The configured limit that was hit.
            unit: Unit the numbers are expressed in (default "tokens").
            **kwargs: Forwarded to SafetyError (action_id, agent_id, details).
        """
        super().__init__(message, **kwargs)
        self.budget_type = budget_type
        self.current_usage = current_usage
        self.budget_limit = budget_limit
        self.unit = unit
|
||||||
|
|
||||||
|
|
||||||
|
class RateLimitExceededError(SafetyError):
    """Raised when rate limit is exceeded."""

    def __init__(
        self,
        message: str = "Rate limit exceeded",
        *,
        limit_type: str = "actions",
        limit_value: int = 0,
        window_seconds: int = 60,
        retry_after_seconds: float = 0.0,
        **kwargs: Any,
    ) -> None:
        """
        Args:
            message: Human-readable description.
            limit_type: What is rate-limited (default "actions").
            limit_value: Maximum allowed within the window.
            window_seconds: Size of the rate window in seconds.
            retry_after_seconds: Suggested wait before retrying.
            **kwargs: Forwarded to SafetyError (action_id, agent_id, details).
        """
        super().__init__(message, **kwargs)
        self.limit_type = limit_type
        self.limit_value = limit_value
        self.window_seconds = window_seconds
        self.retry_after_seconds = retry_after_seconds
|
||||||
|
|
||||||
|
|
||||||
|
class LoopDetectedError(SafetyError):
    """Raised when an action loop is detected.

    Carries the kind of loop, how many repetitions were seen, and the
    repeating action pattern (empty list when unknown).
    """

    def __init__(
        self,
        message: str = "Loop detected",
        *,
        loop_type: str = "exact",
        repetition_count: int = 0,
        action_pattern: list[str] | None = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(message, **kwargs)
        self.loop_type = loop_type
        self.repetition_count = repetition_count
        # Normalize None to an empty list so callers can iterate safely.
        self.action_pattern = action_pattern or []
|
||||||
|
|
||||||
|
|
||||||
|
class ApprovalRequiredError(SafetyError):
    """Raised when human approval is required.

    Identifies the pending approval request and how long it may wait
    before timing out.
    """

    def __init__(
        self,
        message: str = "Human approval required",
        *,
        approval_id: str | None = None,
        reason: str | None = None,
        timeout_seconds: int = 300,
        **kwargs: Any,
    ) -> None:
        super().__init__(message, **kwargs)
        self.timeout_seconds = timeout_seconds
        self.reason = reason
        self.approval_id = approval_id
|
||||||
|
|
||||||
|
|
||||||
|
class ApprovalDeniedError(SafetyError):
    """Raised when human explicitly denies an action.

    Records the approval request, who denied it, and the stated reason.
    """

    def __init__(
        self,
        message: str = "Approval denied by human",
        *,
        approval_id: str | None = None,
        denied_by: str | None = None,
        denial_reason: str | None = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(message, **kwargs)
        self.denial_reason = denial_reason
        self.denied_by = denied_by
        self.approval_id = approval_id
|
||||||
|
|
||||||
|
|
||||||
|
class ApprovalTimeoutError(SafetyError):
    """Raised when approval request times out.

    Carries the approval request id and the timeout that elapsed.
    """

    def __init__(
        self,
        message: str = "Approval request timed out",
        *,
        approval_id: str | None = None,
        timeout_seconds: int = 300,
        **kwargs: Any,
    ) -> None:
        super().__init__(message, **kwargs)
        self.timeout_seconds = timeout_seconds
        self.approval_id = approval_id
|
||||||
|
|
||||||
|
|
||||||
|
class RollbackError(SafetyError):
    """Raised when rollback fails.

    Identifies the checkpoint involved and the rollback actions that
    could not be completed (empty list when unknown).
    """

    def __init__(
        self,
        message: str = "Rollback failed",
        *,
        checkpoint_id: str | None = None,
        failed_actions: list[str] | None = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(message, **kwargs)
        self.checkpoint_id = checkpoint_id
        # Normalize None to an empty list so callers can iterate safely.
        self.failed_actions = failed_actions or []
|
||||||
|
|
||||||
|
|
||||||
|
class CheckpointError(SafetyError):
    """Raised when checkpoint creation fails.

    ``checkpoint_type`` names the kind of checkpoint that was being
    created, when known.
    """

    def __init__(
        self,
        message: str = "Checkpoint creation failed",
        *,
        checkpoint_type: str | None = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(message, **kwargs)
        self.checkpoint_type = checkpoint_type
|
||||||
|
|
||||||
|
|
||||||
|
class ValidationError(SafetyError):
    """Raised when action validation fails.

    Carries the full rule set that was evaluated and the subset that
    failed (both normalized to lists).
    """

    def __init__(
        self,
        message: str = "Validation failed",
        *,
        validation_rules: list[str] | None = None,
        failed_rules: list[str] | None = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(message, **kwargs)
        # Normalize None to empty lists so callers can iterate safely.
        self.failed_rules = failed_rules or []
        self.validation_rules = validation_rules or []
|
||||||
|
|
||||||
|
|
||||||
|
class ContentFilterError(SafetyError):
    """Raised when content filtering detects prohibited content.

    Records which filter fired and the patterns it matched.
    """

    def __init__(
        self,
        message: str = "Prohibited content detected",
        *,
        filter_type: str | None = None,
        detected_patterns: list[str] | None = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(message, **kwargs)
        self.filter_type = filter_type
        # Normalize None to an empty list so callers can iterate safely.
        self.detected_patterns = detected_patterns or []
|
||||||
|
|
||||||
|
|
||||||
|
class SandboxError(SafetyError):
    """Raised when sandbox execution fails.

    Carries the process exit code and captured stderr, when available.
    """

    def __init__(
        self,
        message: str = "Sandbox execution failed",
        *,
        exit_code: int | None = None,
        stderr: str | None = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(message, **kwargs)
        self.stderr = stderr
        self.exit_code = exit_code
|
||||||
|
|
||||||
|
|
||||||
|
class SandboxTimeoutError(SandboxError):
    """Raised when sandbox execution times out.

    Specialization of SandboxError that records the timeout that elapsed.
    """

    def __init__(
        self,
        message: str = "Sandbox execution timed out",
        *,
        timeout_seconds: int = 300,
        **kwargs: Any,
    ) -> None:
        super().__init__(message, **kwargs)
        self.timeout_seconds = timeout_seconds
|
||||||
|
|
||||||
|
|
||||||
|
class EmergencyStopError(SafetyError):
    """Raised when emergency stop is triggered.

    Records the kind of stop and who (or what) triggered it.
    """

    def __init__(
        self,
        message: str = "Emergency stop triggered",
        *,
        stop_type: str = "kill",
        triggered_by: str | None = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(message, **kwargs)
        self.triggered_by = triggered_by
        self.stop_type = stop_type
|
||||||
|
|
||||||
|
|
||||||
|
class PolicyViolationError(SafetyError):
    """Raised when an action violates a safety policy.

    Identifies the policy and the specific rules that were violated.
    """

    def __init__(
        self,
        message: str = "Policy violation",
        *,
        policy_name: str | None = None,
        violated_rules: list[str] | None = None,
        **kwargs: Any,
    ) -> None:
        super().__init__(message, **kwargs)
        self.policy_name = policy_name
        # Normalize None to an empty list so callers can iterate safely.
        self.violated_rules = violated_rules or []
|
||||||
864
backend/app/services/safety/guardian.py
Normal file
864
backend/app/services/safety/guardian.py
Normal file
@@ -0,0 +1,864 @@
|
|||||||
|
"""
|
||||||
|
Safety Guardian
|
||||||
|
|
||||||
|
Main facade for the safety framework. Orchestrates all safety checks
|
||||||
|
before, during, and after action execution.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from .audit import AuditLogger, get_audit_logger
|
||||||
|
from .config import (
|
||||||
|
SafetyConfig,
|
||||||
|
get_policy_for_autonomy_level,
|
||||||
|
get_safety_config,
|
||||||
|
)
|
||||||
|
from .costs.controller import CostController
|
||||||
|
from .exceptions import (
|
||||||
|
BudgetExceededError,
|
||||||
|
LoopDetectedError,
|
||||||
|
RateLimitExceededError,
|
||||||
|
SafetyError,
|
||||||
|
)
|
||||||
|
from .limits.limiter import RateLimiter
|
||||||
|
from .loops.detector import LoopDetector
|
||||||
|
from .models import (
|
||||||
|
ActionRequest,
|
||||||
|
ActionResult,
|
||||||
|
AuditEventType,
|
||||||
|
BudgetScope,
|
||||||
|
GuardianResult,
|
||||||
|
SafetyDecision,
|
||||||
|
SafetyPolicy,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class SafetyGuardian:
|
||||||
|
"""
|
||||||
|
Central orchestrator for all safety checks.
|
||||||
|
|
||||||
|
The SafetyGuardian is the main entry point for validating agent actions.
|
||||||
|
It coordinates multiple safety subsystems:
|
||||||
|
- Permission checking
|
||||||
|
- Cost/budget control
|
||||||
|
- Rate limiting
|
||||||
|
- Loop detection
|
||||||
|
- Human-in-the-loop approval
|
||||||
|
- Rollback/checkpoint management
|
||||||
|
- Content filtering
|
||||||
|
- Sandbox execution
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
guardian = SafetyGuardian()
|
||||||
|
await guardian.initialize()
|
||||||
|
|
||||||
|
# Before executing an action
|
||||||
|
result = await guardian.validate(action_request)
|
||||||
|
if not result.allowed:
|
||||||
|
# Handle denial
|
||||||
|
|
||||||
|
# After action execution
|
||||||
|
await guardian.record_execution(action_request, action_result)
|
||||||
|
"""
|
||||||
|
|
||||||
|
    def __init__(
        self,
        config: SafetyConfig | None = None,
        audit_logger: AuditLogger | None = None,
        cost_controller: CostController | None = None,
        rate_limiter: RateLimiter | None = None,
        loop_detector: LoopDetector | None = None,
    ) -> None:
        """
        Initialize the SafetyGuardian.

        Construction is cheap and side-effect free; subsystem instances
        not supplied here are created lazily by initialize().

        Args:
            config: Optional safety configuration. If None, loads from environment.
            audit_logger: Optional audit logger. If None, uses global instance.
            cost_controller: Optional cost controller. If None, creates default.
            rate_limiter: Optional rate limiter. If None, creates default.
            loop_detector: Optional loop detector. If None, creates default.
        """
        self._config = config or get_safety_config()
        self._audit_logger = audit_logger
        self._initialized = False
        # Serializes initialize()/shutdown() so concurrent callers cannot
        # double-construct or tear down mid-setup.
        self._lock = asyncio.Lock()

        # Core safety subsystems (always initialized)
        self._cost_controller: CostController | None = cost_controller
        self._rate_limiter: RateLimiter | None = rate_limiter
        self._loop_detector: LoopDetector | None = loop_detector

        # Optional subsystems (will be initialized when available)
        self._permission_manager: Any = None
        self._hitl_manager: Any = None
        self._rollback_manager: Any = None
        self._content_filter: Any = None
        self._sandbox_executor: Any = None
        self._emergency_controls: Any = None

        # Policy cache
        # Keyed by autonomy-level value; filled lazily by _get_policy().
        self._policies: dict[str, SafetyPolicy] = {}
        self._default_policy: SafetyPolicy | None = None
|
||||||
|
|
||||||
|
@property
|
||||||
|
def is_initialized(self) -> bool:
|
||||||
|
"""Check if the guardian is initialized."""
|
||||||
|
return self._initialized
|
||||||
|
|
||||||
|
@property
|
||||||
|
def cost_controller(self) -> CostController | None:
|
||||||
|
"""Get the cost controller instance."""
|
||||||
|
return self._cost_controller
|
||||||
|
|
||||||
|
@property
|
||||||
|
def rate_limiter(self) -> RateLimiter | None:
|
||||||
|
"""Get the rate limiter instance."""
|
||||||
|
return self._rate_limiter
|
||||||
|
|
||||||
|
@property
|
||||||
|
def loop_detector(self) -> LoopDetector | None:
|
||||||
|
"""Get the loop detector instance."""
|
||||||
|
return self._loop_detector
|
||||||
|
|
||||||
|
async def initialize(self) -> None:
|
||||||
|
"""Initialize the SafetyGuardian and all subsystems."""
|
||||||
|
async with self._lock:
|
||||||
|
if self._initialized:
|
||||||
|
logger.warning("SafetyGuardian already initialized")
|
||||||
|
return
|
||||||
|
|
||||||
|
logger.info("Initializing SafetyGuardian")
|
||||||
|
|
||||||
|
# Get audit logger
|
||||||
|
if self._audit_logger is None:
|
||||||
|
self._audit_logger = await get_audit_logger()
|
||||||
|
|
||||||
|
# Initialize core safety subsystems
|
||||||
|
if self._cost_controller is None:
|
||||||
|
self._cost_controller = CostController()
|
||||||
|
logger.debug("Initialized CostController")
|
||||||
|
|
||||||
|
if self._rate_limiter is None:
|
||||||
|
self._rate_limiter = RateLimiter()
|
||||||
|
logger.debug("Initialized RateLimiter")
|
||||||
|
|
||||||
|
if self._loop_detector is None:
|
||||||
|
self._loop_detector = LoopDetector()
|
||||||
|
logger.debug("Initialized LoopDetector")
|
||||||
|
|
||||||
|
self._initialized = True
|
||||||
|
logger.info(
|
||||||
|
"SafetyGuardian initialized with CostController, RateLimiter, LoopDetector"
|
||||||
|
)
|
||||||
|
|
||||||
|
async def shutdown(self) -> None:
|
||||||
|
"""Shutdown the SafetyGuardian and all subsystems."""
|
||||||
|
async with self._lock:
|
||||||
|
if not self._initialized:
|
||||||
|
return
|
||||||
|
|
||||||
|
logger.info("Shutting down SafetyGuardian")
|
||||||
|
|
||||||
|
# Shutdown subsystems
|
||||||
|
# (Will be implemented as subsystems are added)
|
||||||
|
|
||||||
|
self._initialized = False
|
||||||
|
logger.info("SafetyGuardian shutdown complete")
|
||||||
|
|
||||||
|
    async def validate(
        self,
        action: ActionRequest,
        policy: SafetyPolicy | None = None,
    ) -> GuardianResult:
        """
        Validate an action before execution.

        Runs all safety checks in order, short-circuiting on the first
        check that denies or defers the action:
        1. Permission check
        2. Cost/budget check
        3. Rate limit check
        4. Loop detection
        5. HITL check (if required)
        6. Checkpoint creation (if destructive)

        In strict mode, unexpected errors fail closed (action denied);
        otherwise they are logged and the action is allowed.

        Args:
            action: The action to validate
            policy: Optional policy override. If None, uses autonomy-level policy.

        Returns:
            GuardianResult with decision and details
        """
        # Lazily initialize so callers are not required to call initialize().
        if not self._initialized:
            await self.initialize()

        if not self._config.enabled:
            # Safety disabled - allow everything (NOT RECOMMENDED)
            logger.warning("Safety framework disabled - allowing action %s", action.id)
            return GuardianResult(
                action_id=action.id,
                allowed=True,
                decision=SafetyDecision.ALLOW,
                reasons=["Safety framework disabled"],
            )

        # Get policy for this action
        effective_policy = policy or self._get_policy(action)

        reasons: list[str] = []
        # Collected audit events are attached to every result we return.
        audit_events = []

        try:
            # Log action request
            if self._audit_logger:
                event = await self._audit_logger.log(
                    AuditEventType.ACTION_REQUESTED,
                    agent_id=action.metadata.agent_id,
                    action_id=action.id,
                    project_id=action.metadata.project_id,
                    session_id=action.metadata.session_id,
                    details={
                        "action_type": action.action_type.value,
                        "tool_name": action.tool_name,
                        "resource": action.resource,
                    },
                    correlation_id=action.metadata.correlation_id,
                )
                audit_events.append(event)

            # 1. Permission check
            permission_result = await self._check_permissions(action, effective_policy)
            if permission_result.decision == SafetyDecision.DENY:
                return await self._create_denial_result(
                    action, permission_result.reasons, audit_events
                )

            # 2. Cost/budget check
            budget_result = await self._check_budget(action, effective_policy)
            if budget_result.decision == SafetyDecision.DENY:
                return await self._create_denial_result(
                    action, budget_result.reasons, audit_events
                )

            # 3. Rate limit check
            rate_result = await self._check_rate_limit(action, effective_policy)
            if rate_result.decision == SafetyDecision.DENY:
                return await self._create_denial_result(
                    action,
                    rate_result.reasons,
                    audit_events,
                    retry_after=rate_result.retry_after_seconds,
                )
            if rate_result.decision == SafetyDecision.DELAY:
                # Return delay decision: not denied, caller should retry later.
                return GuardianResult(
                    action_id=action.id,
                    allowed=False,
                    decision=SafetyDecision.DELAY,
                    reasons=rate_result.reasons,
                    retry_after_seconds=rate_result.retry_after_seconds,
                    audit_events=audit_events,
                )

            # 4. Loop detection
            loop_result = await self._check_loops(action, effective_policy)
            if loop_result.decision == SafetyDecision.DENY:
                return await self._create_denial_result(
                    action, loop_result.reasons, audit_events
                )

            # 5. HITL check
            hitl_result = await self._check_hitl(action, effective_policy)
            if hitl_result.decision == SafetyDecision.REQUIRE_APPROVAL:
                return GuardianResult(
                    action_id=action.id,
                    allowed=False,
                    decision=SafetyDecision.REQUIRE_APPROVAL,
                    reasons=hitl_result.reasons,
                    approval_id=hitl_result.approval_id,
                    audit_events=audit_events,
                )

            # 6. Create checkpoint if destructive
            checkpoint_id = None
            if action.is_destructive and self._config.auto_checkpoint_destructive:
                checkpoint_id = await self._create_checkpoint(action)

            # All checks passed
            reasons.append("All safety checks passed")

            if self._audit_logger:
                event = await self._audit_logger.log_action_request(
                    action, SafetyDecision.ALLOW, reasons
                )
                audit_events.append(event)

            return GuardianResult(
                action_id=action.id,
                allowed=True,
                decision=SafetyDecision.ALLOW,
                reasons=reasons,
                checkpoint_id=checkpoint_id,
                audit_events=audit_events,
            )

        except SafetyError as e:
            # Known safety error
            return await self._create_denial_result(action, [str(e)], audit_events)
        except Exception as e:
            # Unknown error - fail closed in strict mode
            logger.error("Unexpected error in safety validation: %s", e)
            if self._config.strict_mode:
                return await self._create_denial_result(
                    action,
                    [f"Safety validation error: {e}"],
                    audit_events,
                )
            else:
                # Non-strict mode - allow with warning
                logger.warning("Non-strict mode: allowing action despite error")
                return GuardianResult(
                    action_id=action.id,
                    allowed=True,
                    decision=SafetyDecision.ALLOW,
                    reasons=["Allowed despite validation error (non-strict mode)"],
                    audit_events=audit_events,
                )
|
||||||
|
|
||||||
|
    async def record_execution(
        self,
        action: ActionRequest,
        result: ActionResult,
    ) -> None:
        """
        Record action execution result for auditing and tracking.

        Updates the audit log, cost tracking, rate-limit windows, and
        loop-detection history. Each tracker update is best-effort:
        failures are logged as warnings and never propagate, so recording
        problems cannot break the caller's execution flow.

        Args:
            action: The executed action
            result: The execution result
        """
        if self._audit_logger:
            await self._audit_logger.log_action_executed(
                action,
                success=result.success,
                execution_time_ms=result.execution_time_ms,
                error=result.error,
            )

        # Update cost tracking
        if self._cost_controller:
            try:
                # Use explicit None check - 0 is a valid cost value
                # (falls back to the action's estimate only when the actual
                # cost was not reported).
                tokens = (
                    result.actual_cost_tokens
                    if result.actual_cost_tokens is not None
                    else action.estimated_cost_tokens
                )
                cost_usd = (
                    result.actual_cost_usd
                    if result.actual_cost_usd is not None
                    else action.estimated_cost_usd
                )
                await self._cost_controller.record_usage(
                    agent_id=action.metadata.agent_id,
                    session_id=action.metadata.session_id,
                    tokens=tokens,
                    cost_usd=cost_usd,
                )
            except Exception as e:
                logger.warning("Failed to record cost: %s", e)

        # Update rate limiter - consume slots for executed actions
        if self._rate_limiter:
            try:
                await self._rate_limiter.record_action(action)
            except Exception as e:
                logger.warning("Failed to record action in rate limiter: %s", e)

        # Update loop detection history
        if self._loop_detector:
            try:
                await self._loop_detector.record(action)
            except Exception as e:
                logger.warning("Failed to record action in loop detector: %s", e)
|
||||||
|
|
||||||
|
async def rollback(self, checkpoint_id: str) -> bool:
|
||||||
|
"""
|
||||||
|
Rollback to a checkpoint.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
checkpoint_id: ID of the checkpoint to rollback to
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if rollback succeeded
|
||||||
|
"""
|
||||||
|
if self._rollback_manager is None:
|
||||||
|
logger.warning("Rollback manager not available")
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Delegate to rollback manager
|
||||||
|
return await self._rollback_manager.rollback(checkpoint_id)
|
||||||
|
|
||||||
|
async def emergency_stop(
|
||||||
|
self,
|
||||||
|
stop_type: str = "kill",
|
||||||
|
reason: str = "Manual emergency stop",
|
||||||
|
triggered_by: str = "system",
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Trigger emergency stop.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
stop_type: Type of stop (kill, pause, lockdown)
|
||||||
|
reason: Reason for the stop
|
||||||
|
triggered_by: Who triggered the stop
|
||||||
|
"""
|
||||||
|
logger.critical(
|
||||||
|
"Emergency stop triggered: type=%s, reason=%s, by=%s",
|
||||||
|
stop_type,
|
||||||
|
reason,
|
||||||
|
triggered_by,
|
||||||
|
)
|
||||||
|
|
||||||
|
if self._audit_logger:
|
||||||
|
await self._audit_logger.log_emergency_stop(
|
||||||
|
stop_type=stop_type,
|
||||||
|
triggered_by=triggered_by,
|
||||||
|
reason=reason,
|
||||||
|
)
|
||||||
|
|
||||||
|
if self._emergency_controls:
|
||||||
|
await self._emergency_controls.execute_stop(stop_type)
|
||||||
|
|
||||||
|
def _get_policy(self, action: ActionRequest) -> SafetyPolicy:
|
||||||
|
"""Get the effective policy for an action."""
|
||||||
|
# Check cached policies
|
||||||
|
autonomy_level = action.metadata.autonomy_level
|
||||||
|
|
||||||
|
if autonomy_level.value not in self._policies:
|
||||||
|
self._policies[autonomy_level.value] = get_policy_for_autonomy_level(
|
||||||
|
autonomy_level
|
||||||
|
)
|
||||||
|
|
||||||
|
return self._policies[autonomy_level.value]
|
||||||
|
|
||||||
|
async def _check_permissions(
|
||||||
|
self,
|
||||||
|
action: ActionRequest,
|
||||||
|
policy: SafetyPolicy,
|
||||||
|
) -> GuardianResult:
|
||||||
|
"""Check if action is permitted."""
|
||||||
|
reasons: list[str] = []
|
||||||
|
|
||||||
|
# Check denied tools
|
||||||
|
if action.tool_name:
|
||||||
|
for pattern in policy.denied_tools:
|
||||||
|
if self._matches_pattern(action.tool_name, pattern):
|
||||||
|
reasons.append(
|
||||||
|
f"Tool '{action.tool_name}' denied by pattern '{pattern}'"
|
||||||
|
)
|
||||||
|
return GuardianResult(
|
||||||
|
action_id=action.id,
|
||||||
|
allowed=False,
|
||||||
|
decision=SafetyDecision.DENY,
|
||||||
|
reasons=reasons,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check allowed tools (if not "*")
|
||||||
|
if action.tool_name and "*" not in policy.allowed_tools:
|
||||||
|
allowed = False
|
||||||
|
for pattern in policy.allowed_tools:
|
||||||
|
if self._matches_pattern(action.tool_name, pattern):
|
||||||
|
allowed = True
|
||||||
|
break
|
||||||
|
if not allowed:
|
||||||
|
reasons.append(f"Tool '{action.tool_name}' not in allowed list")
|
||||||
|
return GuardianResult(
|
||||||
|
action_id=action.id,
|
||||||
|
allowed=False,
|
||||||
|
decision=SafetyDecision.DENY,
|
||||||
|
reasons=reasons,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check file patterns
|
||||||
|
if action.resource:
|
||||||
|
for pattern in policy.denied_file_patterns:
|
||||||
|
if self._matches_pattern(action.resource, pattern):
|
||||||
|
reasons.append(
|
||||||
|
f"Resource '{action.resource}' denied by pattern '{pattern}'"
|
||||||
|
)
|
||||||
|
return GuardianResult(
|
||||||
|
action_id=action.id,
|
||||||
|
allowed=False,
|
||||||
|
decision=SafetyDecision.DENY,
|
||||||
|
reasons=reasons,
|
||||||
|
)
|
||||||
|
|
||||||
|
return GuardianResult(
|
||||||
|
action_id=action.id,
|
||||||
|
allowed=True,
|
||||||
|
decision=SafetyDecision.ALLOW,
|
||||||
|
reasons=["Permission check passed"],
|
||||||
|
)
|
||||||
|
|
||||||
|
    async def _check_budget(
        self,
        action: ActionRequest,
        policy: SafetyPolicy,
    ) -> GuardianResult:
        """Check if action is within budget.

        Asks the cost controller whether the action's estimated cost fits;
        on failure, queries session and daily budget status (in that
        order) to produce a specific denial message, falling back to a
        generic one. Allows with a warning if the controller is missing.
        """
        if self._cost_controller is None:
            logger.warning("CostController not initialized - skipping budget check")
            return GuardianResult(
                action_id=action.id,
                allowed=True,
                decision=SafetyDecision.ALLOW,
                reasons=["Budget check skipped (controller not initialized)"],
            )

        agent_id = action.metadata.agent_id
        session_id = action.metadata.session_id

        try:
            # Check if we have budget for this action
            has_budget = await self._cost_controller.check_budget(
                agent_id=agent_id,
                session_id=session_id,
                estimated_tokens=action.estimated_cost_tokens,
                estimated_cost_usd=action.estimated_cost_usd,
            )

            if not has_budget:
                # Get current status for better error message
                if session_id:
                    session_status = await self._cost_controller.get_status(
                        BudgetScope.SESSION, session_id
                    )
                    if session_status and session_status.is_exceeded:
                        return GuardianResult(
                            action_id=action.id,
                            allowed=False,
                            decision=SafetyDecision.DENY,
                            reasons=[
                                f"Session budget exceeded: {session_status.tokens_used}"
                                f"/{session_status.tokens_limit} tokens"
                            ],
                        )

                agent_status = await self._cost_controller.get_status(
                    BudgetScope.DAILY, agent_id
                )
                if agent_status and agent_status.is_exceeded:
                    return GuardianResult(
                        action_id=action.id,
                        allowed=False,
                        decision=SafetyDecision.DENY,
                        reasons=[
                            f"Daily budget exceeded: {agent_status.tokens_used}"
                            f"/{agent_status.tokens_limit} tokens"
                        ],
                    )

                # Generic budget exceeded
                # (no scope reported itself as exceeded, but check_budget
                # still said no).
                return GuardianResult(
                    action_id=action.id,
                    allowed=False,
                    decision=SafetyDecision.DENY,
                    reasons=["Budget exceeded"],
                )

            return GuardianResult(
                action_id=action.id,
                allowed=True,
                decision=SafetyDecision.ALLOW,
                reasons=["Budget check passed"],
            )

        except BudgetExceededError as e:
            # Controller may raise instead of returning False; map to DENY.
            return GuardianResult(
                action_id=action.id,
                allowed=False,
                decision=SafetyDecision.DENY,
                reasons=[str(e)],
            )
|
||||||
|
|
||||||
|
    async def _check_rate_limit(
        self,
        action: ActionRequest,
        policy: SafetyPolicy,
    ) -> GuardianResult:
        """Check if action is within rate limits.

        A breached limit with a short retry window (0 < retry <= 5s) maps
        to DELAY (caller should wait and retry); anything longer maps to
        DENY. Allows with a warning if the limiter is missing.
        """
        if self._rate_limiter is None:
            logger.warning("RateLimiter not initialized - skipping rate limit check")
            return GuardianResult(
                action_id=action.id,
                allowed=True,
                decision=SafetyDecision.ALLOW,
                reasons=["Rate limit check skipped (limiter not initialized)"],
            )

        try:
            # Check all applicable rate limits for this action
            allowed, statuses = await self._rate_limiter.check_action(action)

            if not allowed:
                # Find the first exceeded limit for the error message
                # (fall back to the first status, or None when empty).
                exceeded_status = next(
                    (s for s in statuses if s.is_limited),
                    statuses[0] if statuses else None,
                )

                if exceeded_status:
                    retry_after = exceeded_status.retry_after_seconds

                    # Determine if this is a soft limit (delay) or hard limit (deny)
                    if retry_after > 0 and retry_after <= 5.0:
                        # Short wait - suggest delay
                        return GuardianResult(
                            action_id=action.id,
                            allowed=False,
                            decision=SafetyDecision.DELAY,
                            reasons=[
                                f"Rate limit '{exceeded_status.name}' exceeded. "
                                f"Current: {exceeded_status.current_count}/{exceeded_status.limit}"
                            ],
                            retry_after_seconds=retry_after,
                        )
                    else:
                        # Hard deny
                        return GuardianResult(
                            action_id=action.id,
                            allowed=False,
                            decision=SafetyDecision.DENY,
                            reasons=[
                                f"Rate limit '{exceeded_status.name}' exceeded. "
                                f"Current: {exceeded_status.current_count}/{exceeded_status.limit}. "
                                f"Retry after {retry_after:.1f}s"
                            ],
                            retry_after_seconds=retry_after,
                        )

                # No status detail available - generic deny.
                return GuardianResult(
                    action_id=action.id,
                    allowed=False,
                    decision=SafetyDecision.DENY,
                    reasons=["Rate limit exceeded"],
                )

            return GuardianResult(
                action_id=action.id,
                allowed=True,
                decision=SafetyDecision.ALLOW,
                reasons=["Rate limit check passed"],
            )

        except RateLimitExceededError as e:
            # Limiter may raise instead of returning False; map to DENY.
            return GuardianResult(
                action_id=action.id,
                allowed=False,
                decision=SafetyDecision.DENY,
                reasons=[str(e)],
                retry_after_seconds=e.retry_after_seconds,
            )
|
||||||
|
|
||||||
|
async def _check_loops(
|
||||||
|
self,
|
||||||
|
action: ActionRequest,
|
||||||
|
policy: SafetyPolicy,
|
||||||
|
) -> GuardianResult:
|
||||||
|
"""Check for action loops."""
|
||||||
|
if self._loop_detector is None:
|
||||||
|
logger.warning("LoopDetector not initialized - skipping loop check")
|
||||||
|
return GuardianResult(
|
||||||
|
action_id=action.id,
|
||||||
|
allowed=True,
|
||||||
|
decision=SafetyDecision.ALLOW,
|
||||||
|
reasons=["Loop check skipped (detector not initialized)"],
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
# Check if this action would create a loop
|
||||||
|
is_loop, loop_type = await self._loop_detector.check(action)
|
||||||
|
|
||||||
|
if is_loop:
|
||||||
|
# Get suggestions for breaking the loop
|
||||||
|
from .loops.detector import LoopBreaker
|
||||||
|
|
||||||
|
suggestions = await LoopBreaker.suggest_alternatives(
|
||||||
|
action, loop_type or "unknown"
|
||||||
|
)
|
||||||
|
|
||||||
|
return GuardianResult(
|
||||||
|
action_id=action.id,
|
||||||
|
allowed=False,
|
||||||
|
decision=SafetyDecision.DENY,
|
||||||
|
reasons=[
|
||||||
|
f"Loop detected: {loop_type}",
|
||||||
|
*suggestions,
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
return GuardianResult(
|
||||||
|
action_id=action.id,
|
||||||
|
allowed=True,
|
||||||
|
decision=SafetyDecision.ALLOW,
|
||||||
|
reasons=["Loop check passed"],
|
||||||
|
)
|
||||||
|
|
||||||
|
except LoopDetectedError as e:
|
||||||
|
return GuardianResult(
|
||||||
|
action_id=action.id,
|
||||||
|
allowed=False,
|
||||||
|
decision=SafetyDecision.DENY,
|
||||||
|
reasons=[str(e)],
|
||||||
|
)
|
||||||
|
|
||||||
|
async def _check_hitl(
|
||||||
|
self,
|
||||||
|
action: ActionRequest,
|
||||||
|
policy: SafetyPolicy,
|
||||||
|
) -> GuardianResult:
|
||||||
|
"""Check if human approval is required."""
|
||||||
|
if not self._config.hitl_enabled:
|
||||||
|
return GuardianResult(
|
||||||
|
action_id=action.id,
|
||||||
|
allowed=True,
|
||||||
|
decision=SafetyDecision.ALLOW,
|
||||||
|
reasons=["HITL disabled"],
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check if action requires approval
|
||||||
|
requires_approval = False
|
||||||
|
for pattern in policy.require_approval_for:
|
||||||
|
if pattern == "*":
|
||||||
|
requires_approval = True
|
||||||
|
break
|
||||||
|
if action.tool_name and self._matches_pattern(action.tool_name, pattern):
|
||||||
|
requires_approval = True
|
||||||
|
break
|
||||||
|
if action.action_type.value and self._matches_pattern(
|
||||||
|
action.action_type.value, pattern
|
||||||
|
):
|
||||||
|
requires_approval = True
|
||||||
|
break
|
||||||
|
|
||||||
|
if requires_approval:
|
||||||
|
# TODO: Create approval request with HITLManager
|
||||||
|
return GuardianResult(
|
||||||
|
action_id=action.id,
|
||||||
|
allowed=False,
|
||||||
|
decision=SafetyDecision.REQUIRE_APPROVAL,
|
||||||
|
reasons=["Action requires human approval"],
|
||||||
|
approval_id=None, # Will be set by HITLManager
|
||||||
|
)
|
||||||
|
|
||||||
|
return GuardianResult(
|
||||||
|
action_id=action.id,
|
||||||
|
allowed=True,
|
||||||
|
decision=SafetyDecision.ALLOW,
|
||||||
|
reasons=["No approval required"],
|
||||||
|
)
|
||||||
|
|
||||||
|
async def _create_checkpoint(self, action: ActionRequest) -> str | None:
|
||||||
|
"""Create a checkpoint before destructive action."""
|
||||||
|
if self._rollback_manager is None:
|
||||||
|
logger.warning("Rollback manager not available - skipping checkpoint")
|
||||||
|
return None
|
||||||
|
|
||||||
|
# TODO: Implement with RollbackManager
|
||||||
|
return None
|
||||||
|
|
||||||
|
async def _create_denial_result(
|
||||||
|
self,
|
||||||
|
action: ActionRequest,
|
||||||
|
reasons: list[str],
|
||||||
|
audit_events: list[Any],
|
||||||
|
retry_after: float | None = None,
|
||||||
|
) -> GuardianResult:
|
||||||
|
"""Create a denial result with audit logging."""
|
||||||
|
if self._audit_logger:
|
||||||
|
event = await self._audit_logger.log_action_request(
|
||||||
|
action, SafetyDecision.DENY, reasons
|
||||||
|
)
|
||||||
|
audit_events.append(event)
|
||||||
|
|
||||||
|
return GuardianResult(
|
||||||
|
action_id=action.id,
|
||||||
|
allowed=False,
|
||||||
|
decision=SafetyDecision.DENY,
|
||||||
|
reasons=reasons,
|
||||||
|
retry_after_seconds=retry_after,
|
||||||
|
audit_events=audit_events,
|
||||||
|
)
|
||||||
|
|
||||||
|
def _matches_pattern(self, value: str, pattern: str) -> bool:
|
||||||
|
"""Check if value matches a pattern (supports * wildcard)."""
|
||||||
|
if pattern == "*":
|
||||||
|
return True
|
||||||
|
|
||||||
|
if "*" not in pattern:
|
||||||
|
return value == pattern
|
||||||
|
|
||||||
|
# Simple wildcard matching
|
||||||
|
if pattern.startswith("*") and pattern.endswith("*"):
|
||||||
|
return pattern[1:-1] in value
|
||||||
|
elif pattern.startswith("*"):
|
||||||
|
return value.endswith(pattern[1:])
|
||||||
|
elif pattern.endswith("*"):
|
||||||
|
return value.startswith(pattern[:-1])
|
||||||
|
else:
|
||||||
|
# Pattern like "foo*bar"
|
||||||
|
parts = pattern.split("*")
|
||||||
|
if len(parts) == 2:
|
||||||
|
return value.startswith(parts[0]) and value.endswith(parts[1])
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
# Singleton instance
# Module-level singleton plus an asyncio.Lock so concurrent first callers of
# get_safety_guardian() cannot race on initialization.
_guardian_instance: SafetyGuardian | None = None
_guardian_lock = asyncio.Lock()
|
||||||
|
|
||||||
|
|
||||||
|
async def get_safety_guardian() -> SafetyGuardian:
    """Get the global SafetyGuardian instance.

    Creates and initializes the singleton on first call; the guardian lock
    serializes creation so concurrent callers never double-initialize.

    Raises:
        Whatever SafetyGuardian.initialize() raises.  On failure the
        singleton is NOT cached, so a later call can retry initialization.
    """
    global _guardian_instance

    async with _guardian_lock:
        if _guardian_instance is None:
            # Fix: publish the instance only after initialize() succeeds, so
            # a failed initialization is not cached as a broken singleton.
            guardian = SafetyGuardian()
            await guardian.initialize()
            _guardian_instance = guardian

    return _guardian_instance
|
||||||
|
|
||||||
|
|
||||||
|
async def shutdown_safety_guardian() -> None:
    """Shutdown the global SafetyGuardian and clear the singleton."""
    global _guardian_instance

    async with _guardian_lock:
        guardian = _guardian_instance
        if guardian is not None:
            await guardian.shutdown()
            _guardian_instance = None
|
||||||
|
|
||||||
|
|
||||||
|
async def reset_safety_guardian() -> None:
    """
    Reset the SafetyGuardian (for testing).

    This is an async function to properly acquire the guardian lock
    and avoid race conditions with get_safety_guardian().
    """
    global _guardian_instance

    async with _guardian_lock:
        guardian = _guardian_instance
        if guardian is not None:
            try:
                await guardian.shutdown()
            except Exception:  # noqa: S110
                pass  # Ignore errors during test cleanup
            _guardian_instance = None
|
||||||
5
backend/app/services/safety/hitl/__init__.py
Normal file
5
backend/app/services/safety/hitl/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
"""Human-in-the-Loop approval workflows."""
|
||||||
|
|
||||||
|
from .manager import ApprovalQueue, HITLManager
|
||||||
|
|
||||||
|
__all__ = ["ApprovalQueue", "HITLManager"]
|
||||||
449
backend/app/services/safety/hitl/manager.py
Normal file
449
backend/app/services/safety/hitl/manager.py
Normal file
@@ -0,0 +1,449 @@
|
|||||||
|
"""
|
||||||
|
Human-in-the-Loop (HITL) Manager
|
||||||
|
|
||||||
|
Manages approval workflows for actions requiring human oversight.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
from collections.abc import Callable
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from typing import Any
|
||||||
|
from uuid import uuid4
|
||||||
|
|
||||||
|
from ..config import get_safety_config
|
||||||
|
from ..exceptions import (
|
||||||
|
ApprovalDeniedError,
|
||||||
|
ApprovalRequiredError,
|
||||||
|
ApprovalTimeoutError,
|
||||||
|
)
|
||||||
|
from ..models import (
|
||||||
|
ActionRequest,
|
||||||
|
ApprovalRequest,
|
||||||
|
ApprovalResponse,
|
||||||
|
ApprovalStatus,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class ApprovalQueue:
    """Queue for pending approval requests.

    Requests wait in ``_pending`` until a decision moves them to
    ``_completed``.  Each pending request owns an ``asyncio.Event`` in
    ``_waiters`` that wakes coroutines blocked in ``wait_for_response``.
    """

    def __init__(self) -> None:
        self._pending: dict[str, ApprovalRequest] = {}
        # Completed responses are kept so late callers can still read the
        # outcome.  NOTE(review): this dict is never pruned - consider a TTL
        # if request volume is high.
        self._completed: dict[str, ApprovalResponse] = {}
        self._waiters: dict[str, asyncio.Event] = {}
        self._lock = asyncio.Lock()

    def _release_waiter(self, request_id: str) -> None:
        """Wake waiters for *request_id* and drop the event (lock held).

        Fix: the event is popped instead of being left behind, so decided
        requests no longer leak one Event each.  Coroutines already blocked
        on the event keep their own reference and still wake; late callers
        find no waiter and read the response from ``_completed`` instead.
        """
        event = self._waiters.pop(request_id, None)
        if event is not None:
            event.set()

    async def add(self, request: ApprovalRequest) -> None:
        """Add an approval request to the queue."""
        async with self._lock:
            self._pending[request.id] = request
            self._waiters[request.id] = asyncio.Event()

    async def get_pending(self, request_id: str) -> ApprovalRequest | None:
        """Get a pending request by ID."""
        async with self._lock:
            return self._pending.get(request_id)

    async def complete(self, response: ApprovalResponse) -> bool:
        """Complete an approval request.

        Returns:
            True if the request was pending and has now been decided.
        """
        async with self._lock:
            if response.request_id not in self._pending:
                return False

            del self._pending[response.request_id]
            self._completed[response.request_id] = response
            self._release_waiter(response.request_id)
            return True

    async def wait_for_response(
        self,
        request_id: str,
        timeout_seconds: float,
    ) -> ApprovalResponse | None:
        """Wait for a response to an approval request.

        Returns None on timeout; otherwise the recorded response (which may
        already exist if the request was decided before this call).
        """
        async with self._lock:
            waiter = self._waiters.get(request_id)
            if not waiter:
                # Already decided (or unknown id): return whatever we have.
                return self._completed.get(request_id)

        # Wait outside the lock so approvers can make progress.
        try:
            await asyncio.wait_for(waiter.wait(), timeout=timeout_seconds)
        except TimeoutError:
            return None

        async with self._lock:
            return self._completed.get(request_id)

    async def list_pending(self) -> list[ApprovalRequest]:
        """List all pending requests."""
        async with self._lock:
            return list(self._pending.values())

    async def cancel(self, request_id: str) -> bool:
        """Cancel a pending request."""
        async with self._lock:
            if request_id not in self._pending:
                return False

            del self._pending[request_id]
            self._completed[request_id] = ApprovalResponse(
                request_id=request_id,
                status=ApprovalStatus.CANCELLED,
                reason="Cancelled",
            )
            self._release_waiter(request_id)
            return True

    async def cleanup_expired(self) -> int:
        """Time out pending requests whose ``expires_at`` has passed.

        Returns:
            Number of requests that were expired.
        """
        # NOTE(review): expires_at is compared against naive UTC
        # (datetime.utcnow()), matching how HITLManager creates it.
        now = datetime.utcnow()
        to_timeout: list[str] = []

        async with self._lock:
            for request_id, request in self._pending.items():
                if request.expires_at and request.expires_at < now:
                    to_timeout.append(request_id)

        count = 0
        for request_id in to_timeout:
            # Re-acquire per id: a request may have been decided meanwhile.
            async with self._lock:
                if request_id in self._pending:
                    del self._pending[request_id]
                    self._completed[request_id] = ApprovalResponse(
                        request_id=request_id,
                        status=ApprovalStatus.TIMEOUT,
                        reason="Request timed out",
                    )
                    self._release_waiter(request_id)
                    count += 1

        return count
|
||||||
|
|
||||||
|
|
||||||
|
class HITLManager:
    """
    Manages Human-in-the-Loop approval workflows.

    Features:
    - Approval request queue
    - Configurable timeout handling (default deny)
    - Approval delegation
    - Batch approval for similar actions
    - Approval with modifications
    - Notification channels
    """

    def __init__(
        self,
        default_timeout: int | None = None,
        *,
        cleanup_interval_seconds: float = 30.0,
    ) -> None:
        """
        Initialize the HITLManager.

        Args:
            default_timeout: Default timeout for approval requests in seconds
            cleanup_interval_seconds: How often the background task scans for
                expired requests (was previously hard-coded to 30 seconds)
        """
        config = get_safety_config()

        self._default_timeout = default_timeout or config.hitl_default_timeout
        self._cleanup_interval = cleanup_interval_seconds
        self._queue = ApprovalQueue()
        self._notification_handlers: list[Callable[..., Any]] = []
        self._running = False
        self._cleanup_task: asyncio.Task[None] | None = None

    async def start(self) -> None:
        """Start the HITL manager background tasks."""
        if self._running:
            return

        self._running = True
        self._cleanup_task = asyncio.create_task(self._periodic_cleanup())
        logger.info("HITL Manager started")

    async def stop(self) -> None:
        """Stop the HITL manager."""
        self._running = False

        if self._cleanup_task:
            self._cleanup_task.cancel()
            try:
                await self._cleanup_task
            except asyncio.CancelledError:
                pass
            # Fix: drop the finished task reference so start()/stop() can be
            # cycled without holding a stale, cancelled task.
            self._cleanup_task = None

        logger.info("HITL Manager stopped")

    async def request_approval(
        self,
        action: ActionRequest,
        reason: str,
        timeout_seconds: int | None = None,
        urgency: str = "normal",
        context: dict[str, Any] | None = None,
    ) -> ApprovalRequest:
        """
        Create an approval request for an action.

        Args:
            action: The action requiring approval
            reason: Why approval is required
            timeout_seconds: Timeout for this request
            urgency: Urgency level (low, normal, high, critical)
            context: Additional context for the approver

        Returns:
            The created approval request
        """
        timeout = timeout_seconds or self._default_timeout
        # NOTE(review): naive UTC; ApprovalQueue.cleanup_expired() compares
        # against datetime.utcnow() too, so both sides stay consistent.
        expires_at = datetime.utcnow() + timedelta(seconds=timeout)

        request = ApprovalRequest(
            id=str(uuid4()),
            action=action,
            reason=reason,
            urgency=urgency,
            timeout_seconds=timeout,
            expires_at=expires_at,
            context=context or {},
        )

        await self._queue.add(request)

        # Notify handlers
        await self._notify_handlers("approval_requested", request)

        logger.info(
            "Approval requested: %s for action %s (timeout: %ds)",
            request.id,
            action.id,
            timeout,
        )

        return request

    async def wait_for_approval(
        self,
        request_id: str,
        timeout_seconds: int | None = None,
    ) -> ApprovalResponse:
        """
        Wait for an approval decision.

        Args:
            request_id: ID of the approval request
            timeout_seconds: Override timeout

        Returns:
            The approval response

        Raises:
            ApprovalTimeoutError: If timeout expires
            ApprovalDeniedError: If approval is denied
        """
        request = await self._queue.get_pending(request_id)
        if not request:
            raise ApprovalRequiredError(
                f"Approval request not found: {request_id}",
                approval_id=request_id,
            )

        timeout = timeout_seconds or request.timeout_seconds or self._default_timeout
        response = await self._queue.wait_for_response(request_id, timeout)

        if response is None:
            # Timeout - default deny.  Record the outcome so later readers
            # of the request see a TIMEOUT response rather than nothing.
            response = ApprovalResponse(
                request_id=request_id,
                status=ApprovalStatus.TIMEOUT,
                reason="Request timed out (default deny)",
            )
            await self._queue.complete(response)

            raise ApprovalTimeoutError(
                "Approval request timed out",
                approval_id=request_id,
                timeout_seconds=timeout,
            )

        if response.status == ApprovalStatus.DENIED:
            raise ApprovalDeniedError(
                response.reason or "Approval denied",
                approval_id=request_id,
                denied_by=response.decided_by,
                denial_reason=response.reason,
            )

        if response.status == ApprovalStatus.TIMEOUT:
            raise ApprovalTimeoutError(
                "Approval request timed out",
                approval_id=request_id,
                timeout_seconds=timeout,
            )

        if response.status == ApprovalStatus.CANCELLED:
            raise ApprovalDeniedError(
                "Approval request was cancelled",
                approval_id=request_id,
                denial_reason="Cancelled",
            )

        return response

    async def approve(
        self,
        request_id: str,
        decided_by: str,
        reason: str | None = None,
        modifications: dict[str, Any] | None = None,
    ) -> bool:
        """
        Approve a pending request.

        Args:
            request_id: ID of the approval request
            decided_by: Who approved
            reason: Optional approval reason
            modifications: Optional modifications to the action

        Returns:
            True if approval was recorded
        """
        response = ApprovalResponse(
            request_id=request_id,
            status=ApprovalStatus.APPROVED,
            decided_by=decided_by,
            reason=reason,
            modifications=modifications,
        )

        success = await self._queue.complete(response)

        if success:
            logger.info(
                "Approval granted: %s by %s",
                request_id,
                decided_by,
            )
            await self._notify_handlers("approval_granted", response)

        return success

    async def deny(
        self,
        request_id: str,
        decided_by: str,
        reason: str | None = None,
    ) -> bool:
        """
        Deny a pending request.

        Args:
            request_id: ID of the approval request
            decided_by: Who denied
            reason: Denial reason

        Returns:
            True if denial was recorded
        """
        response = ApprovalResponse(
            request_id=request_id,
            status=ApprovalStatus.DENIED,
            decided_by=decided_by,
            reason=reason,
        )

        success = await self._queue.complete(response)

        if success:
            logger.info(
                "Approval denied: %s by %s - %s",
                request_id,
                decided_by,
                reason,
            )
            await self._notify_handlers("approval_denied", response)

        return success

    async def cancel(self, request_id: str) -> bool:
        """
        Cancel a pending request.

        Args:
            request_id: ID of the approval request

        Returns:
            True if request was cancelled
        """
        success = await self._queue.cancel(request_id)

        if success:
            logger.info("Approval request cancelled: %s", request_id)

        return success

    async def list_pending(self) -> list[ApprovalRequest]:
        """List all pending approval requests."""
        return await self._queue.list_pending()

    async def get_request(self, request_id: str) -> ApprovalRequest | None:
        """Get an approval request by ID."""
        return await self._queue.get_pending(request_id)

    def add_notification_handler(
        self,
        handler: Callable[..., Any],
    ) -> None:
        """Add a notification handler (sync or async callable)."""
        self._notification_handlers.append(handler)

    def remove_notification_handler(
        self,
        handler: Callable[..., Any],
    ) -> None:
        """Remove a notification handler."""
        if handler in self._notification_handlers:
            self._notification_handlers.remove(handler)

    async def _notify_handlers(
        self,
        event_type: str,
        data: Any,
    ) -> None:
        """Notify all handlers of an event; handler errors are logged, not raised."""
        for handler in self._notification_handlers:
            try:
                if asyncio.iscoroutinefunction(handler):
                    await handler(event_type, data)
                else:
                    handler(event_type, data)
            except Exception as e:
                logger.error("Error in notification handler: %s", e)

    async def _periodic_cleanup(self) -> None:
        """Background task for cleaning up expired requests."""
        while self._running:
            try:
                await asyncio.sleep(self._cleanup_interval)
                count = await self._queue.cleanup_expired()
                if count:
                    logger.debug("Cleaned up %d expired approval requests", count)
            except asyncio.CancelledError:
                break
            except Exception as e:
                logger.error("Error in approval cleanup: %s", e)
|
||||||
15
backend/app/services/safety/limits/__init__.py
Normal file
15
backend/app/services/safety/limits/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
"""
|
||||||
|
Rate Limiting Module
|
||||||
|
|
||||||
|
Sliding window rate limiting for agent operations.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .limiter import (
|
||||||
|
RateLimiter,
|
||||||
|
SlidingWindowCounter,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"RateLimiter",
|
||||||
|
"SlidingWindowCounter",
|
||||||
|
]
|
||||||
396
backend/app/services/safety/limits/limiter.py
Normal file
396
backend/app/services/safety/limits/limiter.py
Normal file
@@ -0,0 +1,396 @@
|
|||||||
|
"""
|
||||||
|
Rate Limiter
|
||||||
|
|
||||||
|
Sliding window rate limiting for agent operations.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
from collections import deque
|
||||||
|
|
||||||
|
from ..config import get_safety_config
|
||||||
|
from ..exceptions import RateLimitExceededError
|
||||||
|
from ..models import (
|
||||||
|
ActionRequest,
|
||||||
|
RateLimitConfig,
|
||||||
|
RateLimitStatus,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class SlidingWindowCounter:
|
||||||
|
"""Sliding window counter for rate limiting."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
limit: int,
|
||||||
|
window_seconds: int,
|
||||||
|
burst_limit: int | None = None,
|
||||||
|
) -> None:
|
||||||
|
self.limit = limit
|
||||||
|
self.window_seconds = window_seconds
|
||||||
|
self.burst_limit = burst_limit or limit
|
||||||
|
self._timestamps: deque[float] = deque()
|
||||||
|
self._lock = asyncio.Lock()
|
||||||
|
|
||||||
|
async def try_acquire(self) -> tuple[bool, float]:
|
||||||
|
"""
|
||||||
|
Try to acquire a slot.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (allowed, retry_after_seconds)
|
||||||
|
"""
|
||||||
|
now = time.time()
|
||||||
|
window_start = now - self.window_seconds
|
||||||
|
|
||||||
|
async with self._lock:
|
||||||
|
# Remove expired entries
|
||||||
|
while self._timestamps and self._timestamps[0] < window_start:
|
||||||
|
self._timestamps.popleft()
|
||||||
|
|
||||||
|
current_count = len(self._timestamps)
|
||||||
|
|
||||||
|
# Check burst limit (instant check)
|
||||||
|
if current_count >= self.burst_limit:
|
||||||
|
# Calculate retry time
|
||||||
|
oldest = self._timestamps[0] if self._timestamps else now
|
||||||
|
retry_after = oldest + self.window_seconds - now
|
||||||
|
return False, max(0, retry_after)
|
||||||
|
|
||||||
|
# Check window limit
|
||||||
|
if current_count >= self.limit:
|
||||||
|
oldest = self._timestamps[0] if self._timestamps else now
|
||||||
|
retry_after = oldest + self.window_seconds - now
|
||||||
|
return False, max(0, retry_after)
|
||||||
|
|
||||||
|
# Allow and record
|
||||||
|
self._timestamps.append(now)
|
||||||
|
return True, 0.0
|
||||||
|
|
||||||
|
async def get_status(self) -> tuple[int, int, float]:
|
||||||
|
"""
|
||||||
|
Get current status.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (current_count, remaining, reset_in_seconds)
|
||||||
|
"""
|
||||||
|
now = time.time()
|
||||||
|
window_start = now - self.window_seconds
|
||||||
|
|
||||||
|
async with self._lock:
|
||||||
|
# Remove expired entries
|
||||||
|
while self._timestamps and self._timestamps[0] < window_start:
|
||||||
|
self._timestamps.popleft()
|
||||||
|
|
||||||
|
current_count = len(self._timestamps)
|
||||||
|
remaining = max(0, self.limit - current_count)
|
||||||
|
|
||||||
|
if self._timestamps:
|
||||||
|
reset_in = self._timestamps[0] + self.window_seconds - now
|
||||||
|
else:
|
||||||
|
reset_in = 0.0
|
||||||
|
|
||||||
|
return current_count, remaining, max(0, reset_in)
|
||||||
|
|
||||||
|
|
||||||
|
class RateLimiter:
|
||||||
|
"""
|
||||||
|
Rate limiter for agent operations.
|
||||||
|
|
||||||
|
Features:
|
||||||
|
- Per-tool rate limits
|
||||||
|
- Per-agent rate limits
|
||||||
|
- Per-resource rate limits
|
||||||
|
- Sliding window implementation
|
||||||
|
- Burst allowance with recovery
|
||||||
|
- Slowdown before hard block
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
"""Initialize the RateLimiter."""
|
||||||
|
config = get_safety_config()
|
||||||
|
|
||||||
|
self._configs: dict[str, RateLimitConfig] = {}
|
||||||
|
self._counters: dict[str, SlidingWindowCounter] = {}
|
||||||
|
self._lock = asyncio.Lock()
|
||||||
|
|
||||||
|
# Default rate limits
|
||||||
|
self._default_limits = {
|
||||||
|
"actions": RateLimitConfig(
|
||||||
|
name="actions",
|
||||||
|
limit=config.default_actions_per_minute,
|
||||||
|
window_seconds=60,
|
||||||
|
),
|
||||||
|
"llm_calls": RateLimitConfig(
|
||||||
|
name="llm_calls",
|
||||||
|
limit=config.default_llm_calls_per_minute,
|
||||||
|
window_seconds=60,
|
||||||
|
),
|
||||||
|
"file_ops": RateLimitConfig(
|
||||||
|
name="file_ops",
|
||||||
|
limit=config.default_file_ops_per_minute,
|
||||||
|
window_seconds=60,
|
||||||
|
),
|
||||||
|
}
|
||||||
|
|
||||||
|
def configure(self, config: RateLimitConfig) -> None:
|
||||||
|
"""
|
||||||
|
Configure a rate limit.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
config: Rate limit configuration
|
||||||
|
"""
|
||||||
|
self._configs[config.name] = config
|
||||||
|
logger.debug(
|
||||||
|
"Configured rate limit: %s = %d/%ds",
|
||||||
|
config.name,
|
||||||
|
config.limit,
|
||||||
|
config.window_seconds,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def check(
|
||||||
|
self,
|
||||||
|
limit_name: str,
|
||||||
|
key: str,
|
||||||
|
) -> RateLimitStatus:
|
||||||
|
"""
|
||||||
|
Check rate limit without consuming a slot.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
limit_name: Name of the rate limit
|
||||||
|
key: Key for tracking (e.g., agent_id)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Rate limit status
|
||||||
|
"""
|
||||||
|
counter = await self._get_counter(limit_name, key)
|
||||||
|
config = self._get_config(limit_name)
|
||||||
|
|
||||||
|
current, remaining, reset_in = await counter.get_status()
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
return RateLimitStatus(
|
||||||
|
name=limit_name,
|
||||||
|
current_count=current,
|
||||||
|
limit=config.limit,
|
||||||
|
window_seconds=config.window_seconds,
|
||||||
|
remaining=remaining,
|
||||||
|
reset_at=datetime.utcnow() + timedelta(seconds=reset_in),
|
||||||
|
is_limited=remaining <= 0,
|
||||||
|
retry_after_seconds=reset_in if remaining <= 0 else 0.0,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def acquire(
|
||||||
|
self,
|
||||||
|
limit_name: str,
|
||||||
|
key: str,
|
||||||
|
) -> tuple[bool, RateLimitStatus]:
|
||||||
|
"""
|
||||||
|
Try to acquire a rate limit slot.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
limit_name: Name of the rate limit
|
||||||
|
key: Key for tracking (e.g., agent_id)
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (allowed, status)
|
||||||
|
"""
|
||||||
|
counter = await self._get_counter(limit_name, key)
|
||||||
|
config = self._get_config(limit_name)
|
||||||
|
|
||||||
|
allowed, retry_after = await counter.try_acquire()
|
||||||
|
current, remaining, reset_in = await counter.get_status()
|
||||||
|
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
status = RateLimitStatus(
|
||||||
|
name=limit_name,
|
||||||
|
current_count=current,
|
||||||
|
limit=config.limit,
|
||||||
|
window_seconds=config.window_seconds,
|
||||||
|
remaining=remaining,
|
||||||
|
reset_at=datetime.utcnow() + timedelta(seconds=reset_in),
|
||||||
|
is_limited=not allowed,
|
||||||
|
retry_after_seconds=retry_after,
|
||||||
|
)
|
||||||
|
|
||||||
|
return allowed, status
|
||||||
|
|
||||||
|
async def check_action(
|
||||||
|
self,
|
||||||
|
action: ActionRequest,
|
||||||
|
) -> tuple[bool, list[RateLimitStatus]]:
|
||||||
|
"""
|
||||||
|
Check all applicable rate limits for an action WITHOUT consuming slots.
|
||||||
|
|
||||||
|
Use this during validation to check if action would be allowed.
|
||||||
|
Call record_action() after successful execution to consume slots.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
action: The action to check
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (allowed, list of statuses)
|
||||||
|
"""
|
||||||
|
agent_id = action.metadata.agent_id
|
||||||
|
statuses: list[RateLimitStatus] = []
|
||||||
|
allowed = True
|
||||||
|
|
||||||
|
# Check general actions limit (read-only)
|
||||||
|
actions_status = await self.check("actions", agent_id)
|
||||||
|
statuses.append(actions_status)
|
||||||
|
if actions_status.is_limited:
|
||||||
|
allowed = False
|
||||||
|
|
||||||
|
# Check LLM-specific limit for LLM calls
|
||||||
|
if action.action_type.value == "llm_call":
|
||||||
|
llm_status = await self.check("llm_calls", agent_id)
|
||||||
|
statuses.append(llm_status)
|
||||||
|
if llm_status.is_limited:
|
||||||
|
allowed = False
|
||||||
|
|
||||||
|
# Check file ops limit for file operations
|
||||||
|
if action.action_type.value in {"file_read", "file_write", "file_delete"}:
|
||||||
|
file_status = await self.check("file_ops", agent_id)
|
||||||
|
statuses.append(file_status)
|
||||||
|
if file_status.is_limited:
|
||||||
|
allowed = False
|
||||||
|
|
||||||
|
return allowed, statuses
|
||||||
|
|
||||||
|
async def record_action(
|
||||||
|
self,
|
||||||
|
action: ActionRequest,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Record an action by consuming rate limit slots.
|
||||||
|
|
||||||
|
Call this AFTER successful execution to properly count the action.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
action: The executed action
|
||||||
|
"""
|
||||||
|
agent_id = action.metadata.agent_id
|
||||||
|
|
||||||
|
# Consume general actions slot
|
||||||
|
await self.acquire("actions", agent_id)
|
||||||
|
|
||||||
|
# Consume LLM-specific slot for LLM calls
|
||||||
|
if action.action_type.value == "llm_call":
|
||||||
|
await self.acquire("llm_calls", agent_id)
|
||||||
|
|
||||||
|
# Consume file ops slot for file operations
|
||||||
|
if action.action_type.value in {"file_read", "file_write", "file_delete"}:
|
||||||
|
await self.acquire("file_ops", agent_id)
|
||||||
|
|
||||||
|
async def require(
|
||||||
|
self,
|
||||||
|
limit_name: str,
|
||||||
|
key: str,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Require rate limit slot or raise exception.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
limit_name: Name of the rate limit
|
||||||
|
key: Key for tracking
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
RateLimitExceededError: If rate limit exceeded
|
||||||
|
"""
|
||||||
|
allowed, status = await self.acquire(limit_name, key)
|
||||||
|
if not allowed:
|
||||||
|
raise RateLimitExceededError(
|
||||||
|
f"Rate limit exceeded: {limit_name}",
|
||||||
|
limit_type=limit_name,
|
||||||
|
limit_value=status.limit,
|
||||||
|
window_seconds=status.window_seconds,
|
||||||
|
retry_after_seconds=status.retry_after_seconds,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def get_all_statuses(self, key: str) -> dict[str, RateLimitStatus]:
|
||||||
|
"""
|
||||||
|
Get status of all rate limits for a key.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
key: Key for tracking
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict of limit name to status
|
||||||
|
"""
|
||||||
|
statuses = {}
|
||||||
|
for name in self._default_limits:
|
||||||
|
statuses[name] = await self.check(name, key)
|
||||||
|
for name in self._configs:
|
||||||
|
if name not in statuses:
|
||||||
|
statuses[name] = await self.check(name, key)
|
||||||
|
return statuses
|
||||||
|
|
||||||
|
async def reset(self, limit_name: str, key: str) -> bool:
|
||||||
|
"""
|
||||||
|
Reset a rate limit counter.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
limit_name: Name of the rate limit
|
||||||
|
key: Key for tracking
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if counter was found and reset
|
||||||
|
"""
|
||||||
|
counter_key = f"{limit_name}:{key}"
|
||||||
|
async with self._lock:
|
||||||
|
if counter_key in self._counters:
|
||||||
|
del self._counters[counter_key]
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def reset_all(self, key: str) -> int:
|
||||||
|
"""
|
||||||
|
Reset all rate limit counters for a key.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
key: Key for tracking
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Number of counters reset
|
||||||
|
"""
|
||||||
|
count = 0
|
||||||
|
async with self._lock:
|
||||||
|
to_remove = [k for k in self._counters if k.endswith(f":{key}")]
|
||||||
|
for k in to_remove:
|
||||||
|
del self._counters[k]
|
||||||
|
count += 1
|
||||||
|
return count
|
||||||
|
|
||||||
|
def _get_config(self, limit_name: str) -> RateLimitConfig:
|
||||||
|
"""Get configuration for a rate limit."""
|
||||||
|
if limit_name in self._configs:
|
||||||
|
return self._configs[limit_name]
|
||||||
|
if limit_name in self._default_limits:
|
||||||
|
return self._default_limits[limit_name]
|
||||||
|
# Return default
|
||||||
|
return RateLimitConfig(
|
||||||
|
name=limit_name,
|
||||||
|
limit=60,
|
||||||
|
window_seconds=60,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def _get_counter(
|
||||||
|
self,
|
||||||
|
limit_name: str,
|
||||||
|
key: str,
|
||||||
|
) -> SlidingWindowCounter:
|
||||||
|
"""Get or create a counter."""
|
||||||
|
counter_key = f"{limit_name}:{key}"
|
||||||
|
config = self._get_config(limit_name)
|
||||||
|
|
||||||
|
async with self._lock:
|
||||||
|
if counter_key not in self._counters:
|
||||||
|
self._counters[counter_key] = SlidingWindowCounter(
|
||||||
|
limit=config.limit,
|
||||||
|
window_seconds=config.window_seconds,
|
||||||
|
burst_limit=config.burst_limit,
|
||||||
|
)
|
||||||
|
return self._counters[counter_key]
|
||||||
17
backend/app/services/safety/loops/__init__.py
Normal file
17
backend/app/services/safety/loops/__init__.py
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
"""
|
||||||
|
Loop Detection Module
|
||||||
|
|
||||||
|
Detects and prevents action loops in agent behavior.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .detector import (
|
||||||
|
ActionSignature,
|
||||||
|
LoopBreaker,
|
||||||
|
LoopDetector,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"ActionSignature",
|
||||||
|
"LoopBreaker",
|
||||||
|
"LoopDetector",
|
||||||
|
]
|
||||||
269
backend/app/services/safety/loops/detector.py
Normal file
269
backend/app/services/safety/loops/detector.py
Normal file
@@ -0,0 +1,269 @@
|
|||||||
|
"""
|
||||||
|
Loop Detector
|
||||||
|
|
||||||
|
Detects and prevents action loops in agent behavior.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
from collections import Counter, deque
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
from ..config import get_safety_config
|
||||||
|
from ..exceptions import LoopDetectedError
|
||||||
|
from ..models import ActionRequest
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class ActionSignature:
    """Compact fingerprint of an action, used for loop comparisons."""

    def __init__(self, action: ActionRequest) -> None:
        self.action_type = action.action_type.value
        self.tool_name = action.tool_name
        self.resource = action.resource
        self.args_hash = self._hash_args(action.arguments)

    def _hash_args(self, args: dict[str, Any]) -> str:
        """Create a hash of the arguments."""
        # Canonical JSON (sorted keys, str() fallback) keeps the digest stable
        # across dict orderings; unserializable input degrades to "".
        try:
            canonical = json.dumps(args, sort_keys=True, default=str)
        except Exception:
            return ""
        return hashlib.sha256(canonical.encode()).hexdigest()[:8]

    def exact_key(self) -> str:
        """Key for exact match detection (includes the argument hash)."""
        return f"{self.action_type}:{self.tool_name}:{self.resource}:{self.args_hash}"

    def semantic_key(self) -> str:
        """Key for semantic (similar) match detection (ignores arguments)."""
        return f"{self.action_type}:{self.tool_name}:{self.resource}"

    def type_key(self) -> str:
        """Key for action type only."""
        return f"{self.action_type}"
|
||||||
|
|
||||||
|
|
||||||
|
class LoopDetector:
|
||||||
|
"""
|
||||||
|
Detects action loops and repetitive behavior.
|
||||||
|
|
||||||
|
Loop Types:
|
||||||
|
- Exact: Same action with same arguments
|
||||||
|
- Semantic: Similar actions (same type/tool/resource, different args)
|
||||||
|
- Oscillation: A→B→A→B patterns
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
history_size: int | None = None,
|
||||||
|
max_exact_repetitions: int | None = None,
|
||||||
|
max_semantic_repetitions: int | None = None,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Initialize the LoopDetector.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
history_size: Size of action history to track
|
||||||
|
max_exact_repetitions: Max allowed exact repetitions
|
||||||
|
max_semantic_repetitions: Max allowed semantic repetitions
|
||||||
|
"""
|
||||||
|
config = get_safety_config()
|
||||||
|
|
||||||
|
self._history_size = history_size or config.loop_history_size
|
||||||
|
self._max_exact = max_exact_repetitions or config.max_repeated_actions
|
||||||
|
self._max_semantic = max_semantic_repetitions or config.max_similar_actions
|
||||||
|
|
||||||
|
# Per-agent history
|
||||||
|
self._histories: dict[str, deque[ActionSignature]] = {}
|
||||||
|
self._lock = asyncio.Lock()
|
||||||
|
|
||||||
|
async def check(self, action: ActionRequest) -> tuple[bool, str | None]:
|
||||||
|
"""
|
||||||
|
Check if an action would create a loop.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
action: The action to check
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple of (is_loop, loop_type)
|
||||||
|
"""
|
||||||
|
agent_id = action.metadata.agent_id
|
||||||
|
signature = ActionSignature(action)
|
||||||
|
|
||||||
|
async with self._lock:
|
||||||
|
history = self._get_history(agent_id)
|
||||||
|
|
||||||
|
# Check exact repetition
|
||||||
|
exact_key = signature.exact_key()
|
||||||
|
exact_count = sum(1 for h in history if h.exact_key() == exact_key)
|
||||||
|
if exact_count >= self._max_exact:
|
||||||
|
return True, "exact"
|
||||||
|
|
||||||
|
# Check semantic repetition
|
||||||
|
semantic_key = signature.semantic_key()
|
||||||
|
semantic_count = sum(1 for h in history if h.semantic_key() == semantic_key)
|
||||||
|
if semantic_count >= self._max_semantic:
|
||||||
|
return True, "semantic"
|
||||||
|
|
||||||
|
# Check oscillation (A→B→A→B pattern)
|
||||||
|
if len(history) >= 3:
|
||||||
|
pattern = self._detect_oscillation(history, signature)
|
||||||
|
if pattern:
|
||||||
|
return True, "oscillation"
|
||||||
|
|
||||||
|
return False, None
|
||||||
|
|
||||||
|
async def check_and_raise(self, action: ActionRequest) -> None:
|
||||||
|
"""
|
||||||
|
Check for loops and raise if detected.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
action: The action to check
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
LoopDetectedError: If loop is detected
|
||||||
|
"""
|
||||||
|
is_loop, loop_type = await self.check(action)
|
||||||
|
if is_loop:
|
||||||
|
signature = ActionSignature(action)
|
||||||
|
raise LoopDetectedError(
|
||||||
|
f"Loop detected: {loop_type}",
|
||||||
|
loop_type=loop_type or "unknown",
|
||||||
|
repetition_count=self._max_exact
|
||||||
|
if loop_type == "exact"
|
||||||
|
else self._max_semantic,
|
||||||
|
action_pattern=[signature.semantic_key()],
|
||||||
|
agent_id=action.metadata.agent_id,
|
||||||
|
action_id=action.id,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def record(self, action: ActionRequest) -> None:
|
||||||
|
"""
|
||||||
|
Record an action in history.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
action: The action to record
|
||||||
|
"""
|
||||||
|
agent_id = action.metadata.agent_id
|
||||||
|
signature = ActionSignature(action)
|
||||||
|
|
||||||
|
async with self._lock:
|
||||||
|
history = self._get_history(agent_id)
|
||||||
|
history.append(signature)
|
||||||
|
|
||||||
|
async def clear_history(self, agent_id: str) -> None:
|
||||||
|
"""
|
||||||
|
Clear history for an agent.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
agent_id: ID of the agent
|
||||||
|
"""
|
||||||
|
async with self._lock:
|
||||||
|
if agent_id in self._histories:
|
||||||
|
self._histories[agent_id].clear()
|
||||||
|
|
||||||
|
async def get_stats(self, agent_id: str) -> dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Get loop detection stats for an agent.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
agent_id: ID of the agent
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Stats dictionary
|
||||||
|
"""
|
||||||
|
async with self._lock:
|
||||||
|
history = self._get_history(agent_id)
|
||||||
|
|
||||||
|
# Count action types
|
||||||
|
type_counts = Counter(h.type_key() for h in history)
|
||||||
|
semantic_counts = Counter(h.semantic_key() for h in history)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"history_size": len(history),
|
||||||
|
"max_history": self._history_size,
|
||||||
|
"action_type_counts": dict(type_counts),
|
||||||
|
"top_semantic_patterns": semantic_counts.most_common(5),
|
||||||
|
}
|
||||||
|
|
||||||
|
def _get_history(self, agent_id: str) -> deque[ActionSignature]:
|
||||||
|
"""Get or create history for an agent."""
|
||||||
|
if agent_id not in self._histories:
|
||||||
|
self._histories[agent_id] = deque(maxlen=self._history_size)
|
||||||
|
return self._histories[agent_id]
|
||||||
|
|
||||||
|
def _detect_oscillation(
|
||||||
|
self,
|
||||||
|
history: deque[ActionSignature],
|
||||||
|
current: ActionSignature,
|
||||||
|
) -> bool:
|
||||||
|
"""
|
||||||
|
Detect A→B→A→B oscillation pattern.
|
||||||
|
|
||||||
|
Looks at last 4+ actions including current.
|
||||||
|
"""
|
||||||
|
if len(history) < 3:
|
||||||
|
return False
|
||||||
|
|
||||||
|
# Get last 3 actions + current
|
||||||
|
recent = [*list(history)[-3:], current]
|
||||||
|
|
||||||
|
# Check for A→B→A→B pattern
|
||||||
|
if len(recent) >= 4:
|
||||||
|
# Get semantic keys
|
||||||
|
keys = [a.semantic_key() for a in recent[-4:]]
|
||||||
|
|
||||||
|
# Pattern: k[0]==k[2] and k[1]==k[3] and k[0]!=k[1]
|
||||||
|
if keys[0] == keys[2] and keys[1] == keys[3] and keys[0] != keys[1]:
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
class LoopBreaker:
    """
    Strategies for breaking detected loops.
    """

    # Canned guidance per detected loop type; unknown types produce no
    # suggestions.
    _GUIDANCE = {
        "exact": (
            "The same action with identical arguments has been repeated too many times. "
            "Consider: (1) Verify the action succeeded, (2) Try a different approach, "
            "(3) Escalate for human review"
        ),
        "semantic": (
            "Similar actions have been repeated too many times. "
            "Consider: (1) Review if the approach is working, (2) Try an alternative method, "
            "(3) Request clarification on the goal"
        ),
        "oscillation": (
            "An oscillating pattern was detected (A→B→A→B). "
            "This usually indicates conflicting goals or a stuck state. "
            "Consider: (1) Step back and reassess, (2) Request human guidance"
        ),
    }

    @staticmethod
    async def suggest_alternatives(
        action: ActionRequest,
        loop_type: str,
    ) -> list[str]:
        """
        Suggest alternative actions when loop is detected.

        Args:
            action: The looping action
            loop_type: Type of loop detected

        Returns:
            List of suggestions (empty for unrecognized loop types)
        """
        message = LoopBreaker._GUIDANCE.get(loop_type)
        return [] if message is None else [message]
|
||||||
17
backend/app/services/safety/mcp/__init__.py
Normal file
17
backend/app/services/safety/mcp/__init__.py
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
"""MCP safety integration."""
|
||||||
|
|
||||||
|
from .integration import (
|
||||||
|
MCPSafetyWrapper,
|
||||||
|
MCPToolCall,
|
||||||
|
MCPToolResult,
|
||||||
|
SafeToolExecutor,
|
||||||
|
create_mcp_wrapper,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"MCPSafetyWrapper",
|
||||||
|
"MCPToolCall",
|
||||||
|
"MCPToolResult",
|
||||||
|
"SafeToolExecutor",
|
||||||
|
"create_mcp_wrapper",
|
||||||
|
]
|
||||||
409
backend/app/services/safety/mcp/integration.py
Normal file
409
backend/app/services/safety/mcp/integration.py
Normal file
@@ -0,0 +1,409 @@
|
|||||||
|
"""
|
||||||
|
MCP Safety Integration
|
||||||
|
|
||||||
|
Provides safety-aware wrappers for MCP tool execution.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
from collections.abc import Callable
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Any, ClassVar, TypeVar
|
||||||
|
|
||||||
|
from ..audit import AuditLogger
|
||||||
|
from ..emergency import EmergencyControls, get_emergency_controls
|
||||||
|
from ..exceptions import (
|
||||||
|
EmergencyStopError,
|
||||||
|
SafetyError,
|
||||||
|
)
|
||||||
|
from ..guardian import SafetyGuardian, get_safety_guardian
|
||||||
|
from ..models import (
|
||||||
|
ActionMetadata,
|
||||||
|
ActionRequest,
|
||||||
|
ActionType,
|
||||||
|
AutonomyLevel,
|
||||||
|
SafetyDecision,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
T = TypeVar("T")
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class MCPToolCall:
    """Represents an MCP tool call."""

    # Name of the tool to invoke; also used to look up the registered handler.
    tool_name: str
    # Keyword arguments forwarded verbatim to the tool handler.
    arguments: dict[str, Any]
    # Name of the originating MCP server, if known.
    server_name: str | None = None
    # Project the call is scoped to; when set, emergency checks use the
    # "project:<id>" scope instead of "agent:<id>".
    project_id: str | None = None
    # Free-form extra context; "session_id" is read when building action metadata.
    context: dict[str, Any] = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class MCPToolResult:
    """Result of an MCP tool execution."""

    # Whether the tool ran and returned without error.
    success: bool
    # Raw value returned by the tool handler (None on failure).
    result: Any = None
    # Human-readable failure reason when success is False.
    error: str | None = None
    # Safety verdict that applied to this call.
    safety_decision: SafetyDecision = SafetyDecision.ALLOW
    # Wall-clock duration from request start, in milliseconds.
    execution_time_ms: float = 0.0
    # Approval request id when the call is pending human approval.
    approval_id: str | None = None
    # Identifier of a checkpoint associated with this call, if any.
    checkpoint_id: str | None = None
    # Extra details (e.g. {"emergency_stop": True}).
    metadata: dict[str, Any] = field(default_factory=dict)
|
||||||
|
|
||||||
|
|
||||||
|
class MCPSafetyWrapper:
    """
    Wraps MCP tool execution with safety checks.

    Features:
    - Pre-execution validation via SafetyGuardian
    - Permission checking per tool/resource
    - Budget and rate limit enforcement
    - Audit logging of all MCP calls
    - Emergency stop integration
    - Checkpoint creation for destructive operations
    """

    # Tool categories for automatic classification
    # NOTE(review): neither set is referenced by the methods in this class;
    # presumably consumed elsewhere (or by `_classify_tool` in a later
    # revision) — confirm before removing.
    DESTRUCTIVE_TOOLS: ClassVar[set[str]] = {
        "file_write",
        "file_delete",
        "database_mutate",
        "shell_execute",
        "git_push",
        "git_commit",
        "deploy",
    }

    READ_ONLY_TOOLS: ClassVar[set[str]] = {
        "file_read",
        "database_query",
        "git_status",
        "git_log",
        "list_files",
        "search",
    }

    def __init__(
        self,
        guardian: SafetyGuardian | None = None,
        audit_logger: AuditLogger | None = None,
        emergency_controls: EmergencyControls | None = None,
    ) -> None:
        """
        Initialize MCPSafetyWrapper.

        Args:
            guardian: SafetyGuardian instance (uses singleton if not provided)
            audit_logger: AuditLogger instance
            emergency_controls: EmergencyControls instance
        """
        # Guardian/emergency controls are resolved lazily (see _get_guardian /
        # _get_emergency_controls) so construction stays synchronous.
        self._guardian = guardian
        # NOTE(review): _audit_logger and _lock are stored but not used by any
        # method visible in this class — confirm intended use.
        self._audit_logger = audit_logger
        self._emergency_controls = emergency_controls
        self._tool_handlers: dict[str, Callable[..., Any]] = {}
        self._lock = asyncio.Lock()

    async def _get_guardian(self) -> SafetyGuardian:
        """Get or create SafetyGuardian (falls back to the process singleton)."""
        if self._guardian is None:
            self._guardian = await get_safety_guardian()
        return self._guardian

    async def _get_emergency_controls(self) -> EmergencyControls:
        """Get or create EmergencyControls (falls back to the process singleton)."""
        if self._emergency_controls is None:
            self._emergency_controls = await get_emergency_controls()
        return self._emergency_controls

    def register_tool_handler(
        self,
        tool_name: str,
        handler: Callable[..., Any],
    ) -> None:
        """
        Register a handler for a tool.

        Args:
            tool_name: Name of the tool
            handler: Async function to handle the tool call
        """
        # Sync handlers are also accepted; _execute_tool dispatches on
        # asyncio.iscoroutinefunction.
        self._tool_handlers[tool_name] = handler
        logger.debug("Registered handler for tool: %s", tool_name)

    async def execute(
        self,
        tool_call: MCPToolCall,
        agent_id: str,
        autonomy_level: AutonomyLevel = AutonomyLevel.MILESTONE,
        bypass_safety: bool = False,
    ) -> MCPToolResult:
        """
        Execute an MCP tool call with safety checks.

        Pipeline: emergency-stop check → action build → (optional bypass) →
        guardian validation → decision handling → tool execution.

        Args:
            tool_call: The tool call to execute
            agent_id: ID of the calling agent
            autonomy_level: Agent's autonomy level
            bypass_safety: Bypass safety checks (emergency only)

        Returns:
            MCPToolResult with execution outcome
        """
        # NOTE(review): naive UTC timestamp; datetime.utcnow() is deprecated
        # in Python 3.12+ — consistent within this class, but confirm.
        start_time = datetime.utcnow()

        # Check emergency controls first
        emergency = await self._get_emergency_controls()
        # Project scope takes precedence over agent scope when available.
        scope = f"agent:{agent_id}"
        if tool_call.project_id:
            scope = f"project:{tool_call.project_id}"

        try:
            await emergency.check_allowed(scope=scope, raise_if_blocked=True)
        except EmergencyStopError as e:
            # Hard stop: deny without ever building/validating the action.
            return MCPToolResult(
                success=False,
                error=str(e),
                safety_decision=SafetyDecision.DENY,
                metadata={"emergency_stop": True},
            )

        # Build action request
        action = self._build_action_request(
            tool_call=tool_call,
            agent_id=agent_id,
            autonomy_level=autonomy_level,
        )

        # Skip safety checks if bypass is enabled
        if bypass_safety:
            # Bypass is loud on purpose — it skips guardian validation entirely.
            logger.warning(
                "Safety bypass enabled for tool: %s (agent: %s)",
                tool_call.tool_name,
                agent_id,
            )
            return await self._execute_tool(tool_call, action, start_time)

        # Run safety validation
        guardian = await self._get_guardian()
        try:
            guardian_result = await guardian.validate(action)
        except SafetyError as e:
            # Validation itself failed — treat as a denial.
            return MCPToolResult(
                success=False,
                error=str(e),
                safety_decision=SafetyDecision.DENY,
                execution_time_ms=self._elapsed_ms(start_time),
            )

        # Handle safety decision
        if guardian_result.decision == SafetyDecision.DENY:
            return MCPToolResult(
                success=False,
                error="; ".join(guardian_result.reasons),
                safety_decision=SafetyDecision.DENY,
                execution_time_ms=self._elapsed_ms(start_time),
            )

        if guardian_result.decision == SafetyDecision.REQUIRE_APPROVAL:
            # For now, just return that approval is required
            # The caller should handle the approval flow
            return MCPToolResult(
                success=False,
                error="Action requires human approval",
                safety_decision=SafetyDecision.REQUIRE_APPROVAL,
                approval_id=guardian_result.approval_id,
                execution_time_ms=self._elapsed_ms(start_time),
            )

        # Execute the tool
        result = await self._execute_tool(
            tool_call,
            action,
            start_time,
            checkpoint_id=guardian_result.checkpoint_id,
        )

        return result

    async def _execute_tool(
        self,
        tool_call: MCPToolCall,
        action: ActionRequest,
        start_time: datetime,
        checkpoint_id: str | None = None,
    ) -> MCPToolResult:
        """Execute the actual tool call.

        ``action`` is accepted for interface symmetry but not read here.
        Handler exceptions are caught and converted into a failed result;
        safety_decision stays ALLOW because the call was already approved.
        """
        handler = self._tool_handlers.get(tool_call.tool_name)

        if handler is None:
            return MCPToolResult(
                success=False,
                error=f"No handler registered for tool: {tool_call.tool_name}",
                safety_decision=SafetyDecision.ALLOW,
                execution_time_ms=self._elapsed_ms(start_time),
            )

        try:
            # Support both async and sync handlers.
            if asyncio.iscoroutinefunction(handler):
                result = await handler(**tool_call.arguments)
            else:
                result = handler(**tool_call.arguments)

            return MCPToolResult(
                success=True,
                result=result,
                safety_decision=SafetyDecision.ALLOW,
                execution_time_ms=self._elapsed_ms(start_time),
                checkpoint_id=checkpoint_id,
            )

        except Exception as e:
            logger.error("Tool execution failed: %s - %s", tool_call.tool_name, e)
            return MCPToolResult(
                success=False,
                error=str(e),
                safety_decision=SafetyDecision.ALLOW,
                execution_time_ms=self._elapsed_ms(start_time),
                checkpoint_id=checkpoint_id,
            )

    def _build_action_request(
        self,
        tool_call: MCPToolCall,
        agent_id: str,
        autonomy_level: AutonomyLevel,
    ) -> ActionRequest:
        """Build an ActionRequest from an MCP tool call."""
        action_type = self._classify_tool(tool_call.tool_name)

        metadata = ActionMetadata(
            agent_id=agent_id,
            # Missing session/project ids degrade to "" rather than None.
            session_id=tool_call.context.get("session_id", ""),
            project_id=tool_call.project_id or "",
            autonomy_level=autonomy_level,
        )

        return ActionRequest(
            action_type=action_type,
            tool_name=tool_call.tool_name,
            arguments=tool_call.arguments,
            # Prefer an explicit "path" argument, then "resource"; may be None.
            resource=tool_call.arguments.get(
                "path", tool_call.arguments.get("resource")
            ),
            metadata=metadata,
        )

    def _classify_tool(self, tool_name: str) -> ActionType:
        """Classify a tool into an action type via substring heuristics.

        Order matters: destructive patterns are checked before read patterns,
        and specific tool families (shell/git/http/llm) after both.
        """
        tool_lower = tool_name.lower()

        # Check destructive patterns
        if any(
            d in tool_lower for d in ["write", "create", "delete", "remove", "update"]
        ):
            if "file" in tool_lower:
                if "delete" in tool_lower or "remove" in tool_lower:
                    return ActionType.FILE_DELETE
                return ActionType.FILE_WRITE
            if "database" in tool_lower or "db" in tool_lower:
                return ActionType.DATABASE_MUTATE

        # Check read patterns
        if any(r in tool_lower for r in ["read", "get", "list", "search", "query"]):
            if "file" in tool_lower:
                return ActionType.FILE_READ
            if "database" in tool_lower or "db" in tool_lower:
                return ActionType.DATABASE_QUERY

        # Check specific types
        if "shell" in tool_lower or "exec" in tool_lower or "bash" in tool_lower:
            return ActionType.SHELL_COMMAND

        if "git" in tool_lower:
            return ActionType.GIT_OPERATION

        if "http" in tool_lower or "fetch" in tool_lower or "request" in tool_lower:
            return ActionType.NETWORK_REQUEST

        if "llm" in tool_lower or "ai" in tool_lower or "claude" in tool_lower:
            return ActionType.LLM_CALL

        # Default to tool call
        return ActionType.TOOL_CALL

    def _elapsed_ms(self, start_time: datetime) -> float:
        """Calculate elapsed time since *start_time* in milliseconds."""
        return (datetime.utcnow() - start_time).total_seconds() * 1000
|
||||||
|
|
||||||
|
|
||||||
|
class SafeToolExecutor:
    """
    Async context manager that runs one tool call through an MCPSafetyWrapper.

    Usage:
        async with SafeToolExecutor(wrapper, tool_call, agent_id) as executor:
            result = await executor.execute()
            if result.success:
                ...  # use result
            else:
                ...  # handle error or approval required
    """

    def __init__(
        self,
        wrapper: MCPSafetyWrapper,
        tool_call: MCPToolCall,
        agent_id: str,
        autonomy_level: AutonomyLevel = AutonomyLevel.MILESTONE,
    ) -> None:
        self._wrapper = wrapper
        self._tool_call = tool_call
        self._agent_id = agent_id
        self._autonomy_level = autonomy_level
        self._result: MCPToolResult | None = None

    async def __aenter__(self) -> "SafeToolExecutor":
        return self

    async def __aexit__(
        self,
        exc_type: type[Exception] | None,
        exc_val: Exception | None,
        exc_tb: Any,
    ) -> bool:
        # Rollback hooks could be wired in here later; for now exceptions
        # simply propagate (False = do not suppress).
        return False

    async def execute(self) -> MCPToolResult:
        """Run the tool call via the wrapper and cache its outcome."""
        outcome = await self._wrapper.execute(
            self._tool_call,
            self._agent_id,
            self._autonomy_level,
        )
        self._result = outcome
        return outcome

    @property
    def result(self) -> MCPToolResult | None:
        """Outcome of the last execute() call, or None if not run yet."""
        return self._result
|
||||||
|
|
||||||
|
|
||||||
|
# Factory function
|
||||||
|
async def create_mcp_wrapper(
    guardian: SafetyGuardian | None = None,
) -> MCPSafetyWrapper:
    """Create an MCPSafetyWrapper with default configuration."""
    # Resolve missing collaborators from the process-wide singletons.
    resolved_guardian = guardian if guardian is not None else await get_safety_guardian()
    controls = await get_emergency_controls()
    return MCPSafetyWrapper(
        guardian=resolved_guardian,
        emergency_controls=controls,
    )
|
||||||
19
backend/app/services/safety/metrics/__init__.py
Normal file
19
backend/app/services/safety/metrics/__init__.py
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
"""Safety metrics collection and export."""
|
||||||
|
|
||||||
|
from .collector import (
|
||||||
|
MetricType,
|
||||||
|
MetricValue,
|
||||||
|
SafetyMetrics,
|
||||||
|
get_safety_metrics,
|
||||||
|
record_mcp_call,
|
||||||
|
record_validation,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"MetricType",
|
||||||
|
"MetricValue",
|
||||||
|
"SafetyMetrics",
|
||||||
|
"get_safety_metrics",
|
||||||
|
"record_mcp_call",
|
||||||
|
"record_validation",
|
||||||
|
]
|
||||||
430
backend/app/services/safety/metrics/collector.py
Normal file
430
backend/app/services/safety/metrics/collector.py
Normal file
@@ -0,0 +1,430 @@
|
|||||||
|
"""
|
||||||
|
Safety Metrics Collector
|
||||||
|
|
||||||
|
Collects and exposes metrics for the safety framework.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
from collections import Counter, defaultdict
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from datetime import datetime
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class MetricType(str, Enum):
    """Types of metrics."""

    # Monotonically increasing count (e.g. totals).
    COUNTER = "counter"
    # Point-in-time value that can move up or down.
    GAUGE = "gauge"
    # Distribution of observations, bucketed by value.
    HISTOGRAM = "histogram"
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class MetricValue:
    """A single metric value."""

    # Metric name (e.g. "safety_validations_total").
    name: str
    # Kind of metric this value belongs to (counter/gauge/histogram).
    metric_type: MetricType
    # The observed/accumulated value.
    value: float
    # Label key/value pairs qualifying this sample.
    labels: dict[str, str] = field(default_factory=dict)
    # When the sample was taken; naive UTC via datetime.utcnow
    # (deprecated in Python 3.12+ — consider a tz-aware default).
    timestamp: datetime = field(default_factory=datetime.utcnow)
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class HistogramBucket:
    """Histogram bucket for distribution metrics."""

    # Upper bound of the bucket (observations with value <= le belong here).
    le: float  # Less than or equal
    # Number of observations recorded in this bucket.
    count: int = 0
|
||||||
|
|
||||||
|
|
||||||
|
class SafetyMetrics:
|
||||||
|
"""
|
||||||
|
Collects safety framework metrics.
|
||||||
|
|
||||||
|
Metrics tracked:
|
||||||
|
- Action validation counts (by decision type)
|
||||||
|
- Approval request counts and latencies
|
||||||
|
- Budget usage and remaining
|
||||||
|
- Rate limit hits
|
||||||
|
- Loop detections
|
||||||
|
- Emergency events
|
||||||
|
- Content filter matches
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(self) -> None:
|
||||||
|
"""Initialize SafetyMetrics."""
|
||||||
|
self._counters: dict[str, Counter[str]] = defaultdict(Counter)
|
||||||
|
self._gauges: dict[str, dict[str, float]] = defaultdict(dict)
|
||||||
|
self._histograms: dict[str, list[float]] = defaultdict(list)
|
||||||
|
self._histogram_buckets: dict[str, list[HistogramBucket]] = {}
|
||||||
|
self._lock = asyncio.Lock()
|
||||||
|
|
||||||
|
# Initialize histogram buckets
|
||||||
|
self._init_histogram_buckets()
|
||||||
|
|
||||||
|
def _init_histogram_buckets(self) -> None:
|
||||||
|
"""Initialize histogram buckets for latency metrics."""
|
||||||
|
latency_buckets = [
|
||||||
|
0.01,
|
||||||
|
0.05,
|
||||||
|
0.1,
|
||||||
|
0.25,
|
||||||
|
0.5,
|
||||||
|
1.0,
|
||||||
|
2.5,
|
||||||
|
5.0,
|
||||||
|
10.0,
|
||||||
|
float("inf"),
|
||||||
|
]
|
||||||
|
|
||||||
|
for name in [
|
||||||
|
"validation_latency_seconds",
|
||||||
|
"approval_latency_seconds",
|
||||||
|
"mcp_execution_latency_seconds",
|
||||||
|
]:
|
||||||
|
self._histogram_buckets[name] = [
|
||||||
|
HistogramBucket(le=b) for b in latency_buckets
|
||||||
|
]
|
||||||
|
|
||||||
|
# Counter methods
|
||||||
|
|
||||||
|
async def inc_validations(
|
||||||
|
self,
|
||||||
|
decision: str,
|
||||||
|
agent_id: str | None = None,
|
||||||
|
) -> None:
|
||||||
|
"""Increment validation counter."""
|
||||||
|
async with self._lock:
|
||||||
|
labels = f"decision={decision}"
|
||||||
|
if agent_id:
|
||||||
|
labels += f",agent_id={agent_id}"
|
||||||
|
self._counters["safety_validations_total"][labels] += 1
|
||||||
|
|
||||||
|
async def inc_approvals_requested(self, urgency: str = "normal") -> None:
|
||||||
|
"""Increment approval requests counter."""
|
||||||
|
async with self._lock:
|
||||||
|
labels = f"urgency={urgency}"
|
||||||
|
self._counters["safety_approvals_requested_total"][labels] += 1
|
||||||
|
|
||||||
|
async def inc_approvals_granted(self) -> None:
|
||||||
|
"""Increment approvals granted counter."""
|
||||||
|
async with self._lock:
|
||||||
|
self._counters["safety_approvals_granted_total"][""] += 1
|
||||||
|
|
||||||
|
async def inc_approvals_denied(self, reason: str = "manual") -> None:
|
||||||
|
"""Increment approvals denied counter."""
|
||||||
|
async with self._lock:
|
||||||
|
labels = f"reason={reason}"
|
||||||
|
self._counters["safety_approvals_denied_total"][labels] += 1
|
||||||
|
|
||||||
|
async def inc_rate_limit_exceeded(self, limit_type: str) -> None:
|
||||||
|
"""Increment rate limit exceeded counter."""
|
||||||
|
async with self._lock:
|
||||||
|
labels = f"limit_type={limit_type}"
|
||||||
|
self._counters["safety_rate_limit_exceeded_total"][labels] += 1
|
||||||
|
|
||||||
|
async def inc_budget_exceeded(self, budget_type: str) -> None:
|
||||||
|
"""Increment budget exceeded counter."""
|
||||||
|
async with self._lock:
|
||||||
|
labels = f"budget_type={budget_type}"
|
||||||
|
self._counters["safety_budget_exceeded_total"][labels] += 1
|
||||||
|
|
||||||
|
async def inc_loops_detected(self, loop_type: str) -> None:
|
||||||
|
"""Increment loop detection counter."""
|
||||||
|
async with self._lock:
|
||||||
|
labels = f"loop_type={loop_type}"
|
||||||
|
self._counters["safety_loops_detected_total"][labels] += 1
|
||||||
|
|
||||||
|
async def inc_emergency_events(self, event_type: str, scope: str) -> None:
|
||||||
|
"""Increment emergency events counter."""
|
||||||
|
async with self._lock:
|
||||||
|
labels = f"event_type={event_type},scope={scope}"
|
||||||
|
self._counters["safety_emergency_events_total"][labels] += 1
|
||||||
|
|
||||||
|
async def inc_content_filtered(self, category: str, action: str) -> None:
|
||||||
|
"""Increment content filter counter."""
|
||||||
|
async with self._lock:
|
||||||
|
labels = f"category={category},action={action}"
|
||||||
|
self._counters["safety_content_filtered_total"][labels] += 1
|
||||||
|
|
||||||
|
async def inc_checkpoints_created(self) -> None:
|
||||||
|
"""Increment checkpoints created counter."""
|
||||||
|
async with self._lock:
|
||||||
|
self._counters["safety_checkpoints_created_total"][""] += 1
|
||||||
|
|
||||||
|
async def inc_rollbacks_executed(self, success: bool) -> None:
|
||||||
|
"""Increment rollbacks counter."""
|
||||||
|
async with self._lock:
|
||||||
|
labels = f"success={str(success).lower()}"
|
||||||
|
self._counters["safety_rollbacks_total"][labels] += 1
|
||||||
|
|
||||||
|
async def inc_mcp_calls(self, tool_name: str, success: bool) -> None:
|
||||||
|
"""Increment MCP tool calls counter."""
|
||||||
|
async with self._lock:
|
||||||
|
labels = f"tool_name={tool_name},success={str(success).lower()}"
|
||||||
|
self._counters["safety_mcp_calls_total"][labels] += 1
|
||||||
|
|
||||||
|
# Gauge methods
|
||||||
|
|
||||||
|
async def set_budget_remaining(
    self,
    scope: str,
    budget_type: str,
    remaining: float,
) -> None:
    """Publish how much budget is left for a scope/budget-type pair."""
    key = f"scope={scope},budget_type={budget_type}"
    async with self._lock:
        self._gauges["safety_budget_remaining"][key] = remaining

async def set_rate_limit_remaining(
    self,
    scope: str,
    limit_type: str,
    remaining: int,
) -> None:
    """Publish how many calls remain before a rate limit trips."""
    key = f"scope={scope},limit_type={limit_type}"
    async with self._lock:
        self._gauges["safety_rate_limit_remaining"][key] = float(remaining)

async def set_pending_approvals(self, count: int) -> None:
    """Publish the number of approval requests awaiting a human."""
    async with self._lock:
        self._gauges["safety_pending_approvals"][""] = float(count)

async def set_active_checkpoints(self, count: int) -> None:
    """Publish the number of live rollback checkpoints."""
    async with self._lock:
        self._gauges["safety_active_checkpoints"][""] = float(count)

async def set_emergency_state(self, scope: str, state: str) -> None:
    """Publish the emergency state for a scope.

    Encoded as 0=normal, 1=paused, 2=stopped; any other state maps to -1.
    """
    codes = {"normal": 0, "paused": 1, "stopped": 2}
    async with self._lock:
        self._gauges["safety_emergency_state"][f"scope={scope}"] = float(
            codes.get(state, -1)
        )
|
||||||
|
|
||||||
|
# Histogram methods
|
||||||
|
|
||||||
|
async def observe_validation_latency(self, latency_seconds: float) -> None:
    """Record one validation latency sample (seconds)."""
    async with self._lock:
        self._observe_histogram("validation_latency_seconds", latency_seconds)

async def observe_approval_latency(self, latency_seconds: float) -> None:
    """Record one approval round-trip latency sample (seconds)."""
    async with self._lock:
        self._observe_histogram("approval_latency_seconds", latency_seconds)

async def observe_mcp_execution_latency(self, latency_seconds: float) -> None:
    """Record one MCP tool execution latency sample (seconds)."""
    async with self._lock:
        self._observe_histogram("mcp_execution_latency_seconds", latency_seconds)

def _observe_histogram(self, name: str, value: float) -> None:
    """Append a raw sample and bump every bucket whose bound admits it.

    NOTE(review): raw samples accumulate without bound until reset() —
    worth capping for long-running processes.
    """
    self._histograms[name].append(value)
    for bucket in self._histogram_buckets.get(name, ()):
        # Buckets are cumulative: a sample counts in every bucket with le >= value.
        if value <= bucket.le:
            bucket.count += 1
|
||||||
|
|
||||||
|
# Export methods
|
||||||
|
|
||||||
|
async def get_all_metrics(self) -> list[MetricValue]:
    """Snapshot every counter, gauge, and histogram summary as MetricValue rows.

    Ordering is: all counters, then all gauges, then per-histogram
    ``_count``/``_sum`` pairs (only for histograms with samples).
    """
    snapshot: list[MetricValue] = []

    async with self._lock:
        # Counters: one row per distinct label set.
        for counter_name, counter in self._counters.items():
            snapshot.extend(
                MetricValue(
                    name=counter_name,
                    metric_type=MetricType.COUNTER,
                    value=float(count),
                    labels=self._parse_labels(raw_labels),
                )
                for raw_labels, count in counter.items()
            )

        # Gauges: one row per distinct label set.
        for gauge_name, per_label in self._gauges.items():
            snapshot.extend(
                MetricValue(
                    name=gauge_name,
                    metric_type=MetricType.GAUGE,
                    value=level,
                    labels=self._parse_labels(raw_labels),
                )
                for raw_labels, level in per_label.items()
            )

        # Histograms: summarized as a sample count and a running sum.
        for hist_name, samples in self._histograms.items():
            if not samples:
                continue
            snapshot.append(
                MetricValue(
                    name=f"{hist_name}_count",
                    metric_type=MetricType.COUNTER,
                    value=float(len(samples)),
                )
            )
            snapshot.append(
                MetricValue(
                    name=f"{hist_name}_sum",
                    metric_type=MetricType.COUNTER,
                    value=sum(samples),
                )
            )

    return snapshot
|
||||||
|
|
||||||
|
async def get_prometheus_format(self) -> str:
    """Export all metrics in the Prometheus text exposition format.

    Returns the exposition body as a single newline-joined string:
    a ``# TYPE`` line per metric, then one sample line per label set.

    Fix: label values are now double-quoted (``name{key="value"}``) as the
    Prometheus text format requires — the previous unquoted output
    (``name{key=value}``) is rejected by Prometheus scrapers.
    """

    def format_labels(labels_str: str) -> str:
        # Internal storage is "k1=v1,k2=v2"; Prometheus needs 'k1="v1",k2="v2"'.
        pairs = []
        for pair in labels_str.split(","):
            key, _, value = pair.partition("=")
            pairs.append(f'{key}="{value}"')
        return ",".join(pairs)

    lines: list[str] = []

    async with self._lock:
        # Export counters
        for name, counter in self._counters.items():
            lines.append(f"# TYPE {name} counter")
            for labels_str, value in counter.items():
                if labels_str:
                    lines.append(f"{name}{{{format_labels(labels_str)}}} {value}")
                else:
                    lines.append(f"{name} {value}")

        # Export gauges
        for name, gauge_dict in self._gauges.items():
            lines.append(f"# TYPE {name} gauge")
            for labels_str, gauge_value in gauge_dict.items():
                if labels_str:
                    lines.append(f"{name}{{{format_labels(labels_str)}}} {gauge_value}")
                else:
                    lines.append(f"{name} {gauge_value}")

        # Export histograms: cumulative buckets plus _count/_sum series.
        for name, buckets in self._histogram_buckets.items():
            lines.append(f"# TYPE {name} histogram")
            for bucket in buckets:
                le_str = "+Inf" if bucket.le == float("inf") else str(bucket.le)
                lines.append(f'{name}_bucket{{le="{le_str}"}} {bucket.count}')

            if name in self._histograms:
                values = self._histograms[name]
                lines.append(f"{name}_count {len(values)}")
                lines.append(f"{name}_sum {sum(values)}")

    return "\n".join(lines)
|
||||||
|
|
||||||
|
async def get_summary(self) -> dict[str, Any]:
    """Aggregate headline safety metrics into a plain dictionary."""
    async with self._lock:

        def total(counter_name: str) -> int:
            # Sum a counter over every label combination it was recorded with.
            return sum(self._counters[counter_name].values())

        validations = self._counters["safety_validations_total"]
        denied = sum(
            count for key, count in validations.items() if "decision=deny" in key
        )

        return {
            "total_validations": sum(validations.values()),
            "denied_validations": denied,
            "approval_requests": total("safety_approvals_requested_total"),
            "approvals_granted": total("safety_approvals_granted_total"),
            "approvals_denied": total("safety_approvals_denied_total"),
            "rate_limit_hits": total("safety_rate_limit_exceeded_total"),
            "budget_exceeded": total("safety_budget_exceeded_total"),
            "loops_detected": total("safety_loops_detected_total"),
            "emergency_events": total("safety_emergency_events_total"),
            "content_filtered": total("safety_content_filtered_total"),
            "checkpoints_created": total("safety_checkpoints_created_total"),
            "rollbacks_executed": total("safety_rollbacks_total"),
            "mcp_calls": total("safety_mcp_calls_total"),
            "pending_approvals": self._gauges.get(
                "safety_pending_approvals", {}
            ).get("", 0),
            "active_checkpoints": self._gauges.get(
                "safety_active_checkpoints", {}
            ).get("", 0),
        }
|
||||||
|
|
||||||
|
async def reset(self) -> None:
    """Drop every recorded metric, then rebuild the (empty) latency buckets."""
    async with self._lock:
        for store in (self._counters, self._gauges, self._histograms):
            store.clear()
        self._init_histogram_buckets()
|
||||||
|
|
||||||
|
def _parse_labels(self, labels_str: str) -> dict[str, str]:
    """Parse "k1=v1,k2=v2" back into a dict; empty input yields an empty dict.

    Chunks without an "=" are silently ignored; keys and values are stripped.
    """
    if not labels_str:
        return {}

    parsed: dict[str, str] = {}
    for chunk in labels_str.split(","):
        key, sep, value = chunk.partition("=")
        if sep:
            parsed[key.strip()] = value.strip()

    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
# Singleton instance
_metrics: SafetyMetrics | None = None
_lock = asyncio.Lock()


async def get_safety_metrics() -> SafetyMetrics:
    """Return the process-wide SafetyMetrics, creating it lazily on first use."""
    global _metrics

    async with _lock:
        if _metrics is None:
            # First caller constructs the instance; the lock makes this race-free.
            _metrics = SafetyMetrics()
        return _metrics
|
||||||
|
|
||||||
|
|
||||||
|
# Convenience functions
async def record_validation(decision: str, agent_id: str | None = None) -> None:
    """Shortcut: bump the validation counter on the shared metrics instance."""
    registry = await get_safety_metrics()
    await registry.inc_validations(decision, agent_id)
|
||||||
|
|
||||||
|
|
||||||
|
async def record_mcp_call(tool_name: str, success: bool, latency_ms: float) -> None:
    """Shortcut: count an MCP call and record its latency (converted ms -> s)."""
    registry = await get_safety_metrics()
    await registry.inc_mcp_calls(tool_name, success)
    await registry.observe_mcp_execution_latency(latency_ms / 1000)
|
||||||
470
backend/app/services/safety/models.py
Normal file
470
backend/app/services/safety/models.py
Normal file
@@ -0,0 +1,470 @@
|
|||||||
|
"""
|
||||||
|
Safety Framework Models
|
||||||
|
|
||||||
|
Core Pydantic models for actions, events, policies, and safety decisions.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from datetime import datetime
|
||||||
|
from enum import Enum
|
||||||
|
from typing import Any
|
||||||
|
from uuid import uuid4
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Enums
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class ActionType(str, Enum):
    """Types of actions that can be performed.

    Inherits ``str`` so members serialize as their plain string values.
    """

    TOOL_CALL = "tool_call"
    FILE_READ = "file_read"
    FILE_WRITE = "file_write"
    FILE_DELETE = "file_delete"
    API_CALL = "api_call"
    DATABASE_QUERY = "database_query"
    DATABASE_MUTATE = "database_mutate"
    GIT_OPERATION = "git_operation"
    SHELL_COMMAND = "shell_command"
    LLM_CALL = "llm_call"
    NETWORK_REQUEST = "network_request"
    CUSTOM = "custom"  # escape hatch for action kinds not modeled above


class ResourceType(str, Enum):
    """Types of resources that can be accessed."""

    FILE = "file"
    DATABASE = "database"
    API = "api"
    NETWORK = "network"
    GIT = "git"
    SHELL = "shell"
    LLM = "llm"
    MEMORY = "memory"
    CUSTOM = "custom"


class PermissionLevel(str, Enum):
    """Permission levels for resource access."""

    NONE = "none"
    READ = "read"
    WRITE = "write"
    EXECUTE = "execute"
    DELETE = "delete"
    ADMIN = "admin"


class AutonomyLevel(str, Enum):
    """Autonomy levels for agent operation."""

    FULL_CONTROL = "full_control"  # Approve every action
    MILESTONE = "milestone"  # Approve at milestones
    AUTONOMOUS = "autonomous"  # Only major decisions


class SafetyDecision(str, Enum):
    """Result of safety validation."""

    ALLOW = "allow"
    DENY = "deny"
    REQUIRE_APPROVAL = "require_approval"
    DELAY = "delay"
    SANDBOX = "sandbox"


class ApprovalStatus(str, Enum):
    """Status of approval request."""

    PENDING = "pending"
    APPROVED = "approved"
    DENIED = "denied"
    TIMEOUT = "timeout"
    CANCELLED = "cancelled"


class AuditEventType(str, Enum):
    """Types of audit events."""

    # Action lifecycle
    ACTION_REQUESTED = "action_requested"
    ACTION_VALIDATED = "action_validated"
    ACTION_DENIED = "action_denied"
    ACTION_EXECUTED = "action_executed"
    ACTION_FAILED = "action_failed"
    # Human-in-the-loop approvals
    APPROVAL_REQUESTED = "approval_requested"
    APPROVAL_GRANTED = "approval_granted"
    APPROVAL_DENIED = "approval_denied"
    APPROVAL_TIMEOUT = "approval_timeout"
    # Checkpoint / rollback
    CHECKPOINT_CREATED = "checkpoint_created"
    ROLLBACK_STARTED = "rollback_started"
    ROLLBACK_COMPLETED = "rollback_completed"
    ROLLBACK_FAILED = "rollback_failed"
    # Guardrail triggers
    BUDGET_WARNING = "budget_warning"
    BUDGET_EXCEEDED = "budget_exceeded"
    RATE_LIMITED = "rate_limited"
    LOOP_DETECTED = "loop_detected"
    EMERGENCY_STOP = "emergency_stop"
    POLICY_VIOLATION = "policy_violation"
    CONTENT_FILTERED = "content_filtered"
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Action Models
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class ActionMetadata(BaseModel):
    """Metadata associated with an action.

    Identifies who initiated the action and the surrounding
    project/session/task context carried along for tracing.
    """

    agent_id: str = Field(..., description="ID of the agent performing the action")
    project_id: str | None = Field(None, description="ID of the project context")
    session_id: str | None = Field(None, description="ID of the current session")
    task_id: str | None = Field(None, description="ID of the current task")
    parent_action_id: str | None = Field(None, description="ID of the parent action")
    correlation_id: str | None = Field(None, description="Correlation ID for tracing")
    user_id: str | None = Field(None, description="ID of the user who initiated")
    autonomy_level: AutonomyLevel = Field(
        default=AutonomyLevel.MILESTONE,
        description="Current autonomy level",
    )
    context: dict[str, Any] = Field(
        default_factory=dict,
        description="Additional context",
    )


class ActionRequest(BaseModel):
    """Request to perform an action.

    Carries what the safety layer needs to validate the action: its type,
    target resource, arguments, cost estimates, and risk flags.
    """

    id: str = Field(default_factory=lambda: str(uuid4()))
    action_type: ActionType = Field(..., description="Type of action to perform")
    tool_name: str | None = Field(None, description="Name of the tool to call")
    resource: str | None = Field(None, description="Resource being accessed")
    resource_type: ResourceType | None = Field(None, description="Type of resource")
    arguments: dict[str, Any] = Field(
        default_factory=dict,
        description="Action arguments",
    )
    metadata: ActionMetadata = Field(..., description="Action metadata")
    estimated_cost_tokens: int = Field(0, description="Estimated token cost")
    estimated_cost_usd: float = Field(0.0, description="Estimated USD cost")
    is_destructive: bool = Field(False, description="Whether action is destructive")
    is_reversible: bool = Field(True, description="Whether action can be rolled back")
    # NOTE(review): datetime.utcnow is deprecated since Python 3.12 and produces
    # naive datetimes; consider datetime.now(timezone.utc) — verify callers first.
    timestamp: datetime = Field(default_factory=datetime.utcnow)


class ActionResult(BaseModel):
    """Result of an executed action."""

    action_id: str = Field(..., description="ID of the action")
    success: bool = Field(..., description="Whether action succeeded")
    data: Any = Field(None, description="Action result data")
    error: str | None = Field(None, description="Error message if failed")
    error_code: str | None = Field(None, description="Error code if failed")
    execution_time_ms: float = Field(0.0, description="Execution time in ms")
    actual_cost_tokens: int = Field(0, description="Actual token cost")
    actual_cost_usd: float = Field(0.0, description="Actual USD cost")
    checkpoint_id: str | None = Field(None, description="Checkpoint ID if created")
    timestamp: datetime = Field(default_factory=datetime.utcnow)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Validation Models
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class ValidationRule(BaseModel):
    """A single validation rule.

    Matches an action by type/tool/resource/agent patterns and, when it
    matches, yields a SafetyDecision.
    """

    id: str = Field(default_factory=lambda: str(uuid4()))
    name: str = Field(..., description="Rule name")
    description: str | None = Field(None, description="Rule description")
    priority: int = Field(0, description="Rule priority (higher = evaluated first)")
    enabled: bool = Field(True, description="Whether rule is enabled")

    # Rule conditions
    # NOTE(review): presumably None means "no constraint" for that dimension —
    # confirm against the validator that evaluates these rules.
    action_types: list[ActionType] | None = Field(
        None, description="Action types this rule applies to"
    )
    tool_patterns: list[str] | None = Field(
        None, description="Tool name patterns (supports wildcards)"
    )
    resource_patterns: list[str] | None = Field(
        None, description="Resource patterns (supports wildcards)"
    )
    agent_ids: list[str] | None = Field(
        None, description="Agent IDs this rule applies to"
    )

    # Rule decision
    decision: SafetyDecision = Field(..., description="Decision when rule matches")
    reason: str | None = Field(None, description="Reason for decision")


class ValidationResult(BaseModel):
    """Result of action validation."""

    action_id: str = Field(..., description="ID of the validated action")
    decision: SafetyDecision = Field(..., description="Validation decision")
    applied_rules: list[str] = Field(
        default_factory=list, description="IDs of applied rules"
    )
    reasons: list[str] = Field(default_factory=list, description="Reasons for decision")
    approval_id: str | None = Field(None, description="Approval request ID if needed")
    retry_after_seconds: float | None = Field(
        None, description="Retry delay if rate limited"
    )
    timestamp: datetime = Field(default_factory=datetime.utcnow)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Budget Models
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class BudgetScope(str, Enum):
    """Scope of a budget limit."""

    SESSION = "session"
    DAILY = "daily"
    WEEKLY = "weekly"
    MONTHLY = "monthly"
    PROJECT = "project"
    AGENT = "agent"


class BudgetStatus(BaseModel):
    """Current budget status.

    NOTE(review): the ``*_remaining`` fields default to 0 — presumably the
    budget service recomputes them; confirm they are never read as defaults.
    """

    scope: BudgetScope = Field(..., description="Budget scope")
    scope_id: str = Field(..., description="ID within scope (session/agent/project)")
    tokens_used: int = Field(0, description="Tokens used in this scope")
    tokens_limit: int = Field(100000, description="Token limit for this scope")
    cost_used_usd: float = Field(0.0, description="USD spent in this scope")
    cost_limit_usd: float = Field(10.0, description="USD limit for this scope")
    tokens_remaining: int = Field(0, description="Remaining tokens")
    cost_remaining_usd: float = Field(0.0, description="Remaining USD budget")
    warning_threshold: float = Field(0.8, description="Warn at this usage fraction")
    is_warning: bool = Field(False, description="Whether at warning level")
    is_exceeded: bool = Field(False, description="Whether budget exceeded")
    reset_at: datetime | None = Field(None, description="When budget resets")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Rate Limit Models
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class RateLimitConfig(BaseModel):
    """Configuration for a rate limit."""

    name: str = Field(..., description="Rate limit name")
    limit: int = Field(..., description="Maximum allowed in window")
    window_seconds: int = Field(60, description="Time window in seconds")
    burst_limit: int | None = Field(None, description="Burst allowance")
    # NOTE(review): presumably the limiter throttles (rather than rejects) once
    # usage passes this fraction of `limit` — confirm against the limiter impl.
    slowdown_threshold: float = Field(0.8, description="Start slowing at this fraction")


class RateLimitStatus(BaseModel):
    """Current rate limit status."""

    name: str = Field(..., description="Rate limit name")
    current_count: int = Field(0, description="Current count in window")
    limit: int = Field(..., description="Maximum allowed")
    window_seconds: int = Field(..., description="Time window")
    remaining: int = Field(..., description="Remaining in window")
    reset_at: datetime = Field(..., description="When window resets")
    is_limited: bool = Field(False, description="Whether currently limited")
    retry_after_seconds: float = Field(0.0, description="Seconds until retry")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Approval Models
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class ApprovalRequest(BaseModel):
    """Request for human approval.

    Wraps the pending ActionRequest together with the reason it was
    escalated and how long a human has to respond.
    """

    id: str = Field(default_factory=lambda: str(uuid4()))
    action: ActionRequest = Field(..., description="Action requiring approval")
    reason: str = Field(..., description="Why approval is required")
    urgency: str = Field("normal", description="Urgency level")
    timeout_seconds: int = Field(300, description="Timeout for approval")
    created_at: datetime = Field(default_factory=datetime.utcnow)
    expires_at: datetime | None = Field(None, description="When request expires")
    suggested_action: str | None = Field(None, description="Suggested response")
    context: dict[str, Any] = Field(default_factory=dict, description="Extra context")


class ApprovalResponse(BaseModel):
    """Response to an approval request."""

    request_id: str = Field(..., description="ID of the approval request")
    status: ApprovalStatus = Field(..., description="Approval status")
    decided_by: str | None = Field(None, description="Who made the decision")
    reason: str | None = Field(None, description="Reason for decision")
    modifications: dict[str, Any] | None = Field(
        None, description="Modifications to action"
    )
    decided_at: datetime = Field(default_factory=datetime.utcnow)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Checkpoint/Rollback Models
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class CheckpointType(str, Enum):
    """Types of checkpoints."""

    FILE = "file"
    DATABASE = "database"
    GIT = "git"
    COMPOSITE = "composite"


class Checkpoint(BaseModel):
    """A rollback checkpoint.

    Captures enough state (in ``data``) to undo the associated action.
    """

    id: str = Field(default_factory=lambda: str(uuid4()))
    checkpoint_type: CheckpointType = Field(..., description="Type of checkpoint")
    action_id: str = Field(..., description="Action this checkpoint is for")
    created_at: datetime = Field(default_factory=datetime.utcnow)
    expires_at: datetime | None = Field(None, description="When checkpoint expires")
    data: dict[str, Any] = Field(default_factory=dict, description="Checkpoint data")
    description: str | None = Field(None, description="Description of checkpoint")
    is_valid: bool = Field(True, description="Whether checkpoint is still valid")


class RollbackResult(BaseModel):
    """Result of a rollback operation."""

    checkpoint_id: str = Field(..., description="ID of checkpoint rolled back to")
    success: bool = Field(..., description="Whether rollback succeeded")
    actions_rolled_back: list[str] = Field(
        default_factory=list, description="IDs of rolled back actions"
    )
    failed_actions: list[str] = Field(
        default_factory=list, description="IDs of actions that failed to rollback"
    )
    error: str | None = Field(None, description="Error message if failed")
    timestamp: datetime = Field(default_factory=datetime.utcnow)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Audit Models
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class AuditEvent(BaseModel):
    """An audit log event.

    All identifier fields are optional so the same model can describe every
    AuditEventType; ``details`` carries the event-specific payload.
    """

    id: str = Field(default_factory=lambda: str(uuid4()))
    event_type: AuditEventType = Field(..., description="Type of audit event")
    timestamp: datetime = Field(default_factory=datetime.utcnow)
    agent_id: str | None = Field(None, description="Agent ID if applicable")
    action_id: str | None = Field(None, description="Action ID if applicable")
    project_id: str | None = Field(None, description="Project ID if applicable")
    session_id: str | None = Field(None, description="Session ID if applicable")
    user_id: str | None = Field(None, description="User ID if applicable")
    decision: SafetyDecision | None = Field(None, description="Safety decision")
    details: dict[str, Any] = Field(default_factory=dict, description="Event details")
    correlation_id: str | None = Field(None, description="Correlation ID for tracing")
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Policy Models
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class SafetyPolicy(BaseModel):
|
||||||
|
"""A complete safety policy configuration."""
|
||||||
|
|
||||||
|
name: str = Field(..., description="Policy name")
|
||||||
|
description: str | None = Field(None, description="Policy description")
|
||||||
|
version: str = Field("1.0.0", description="Policy version")
|
||||||
|
enabled: bool = Field(True, description="Whether policy is enabled")
|
||||||
|
|
||||||
|
# Cost controls
|
||||||
|
max_tokens_per_session: int = Field(100_000, description="Max tokens per session")
|
||||||
|
max_tokens_per_day: int = Field(1_000_000, description="Max tokens per day")
|
||||||
|
max_cost_per_session_usd: float = Field(10.0, description="Max USD per session")
|
||||||
|
max_cost_per_day_usd: float = Field(100.0, description="Max USD per day")
|
||||||
|
|
||||||
|
# Rate limits
|
||||||
|
max_actions_per_minute: int = Field(60, description="Max actions per minute")
|
||||||
|
max_llm_calls_per_minute: int = Field(20, description="Max LLM calls per minute")
|
||||||
|
max_file_operations_per_minute: int = Field(
|
||||||
|
100, description="Max file ops per minute"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Permissions
|
||||||
|
allowed_tools: list[str] = Field(
|
||||||
|
default_factory=lambda: ["*"],
|
||||||
|
description="Allowed tool patterns",
|
||||||
|
)
|
||||||
|
denied_tools: list[str] = Field(
|
||||||
|
default_factory=list,
|
||||||
|
description="Denied tool patterns",
|
||||||
|
)
|
||||||
|
allowed_file_patterns: list[str] = Field(
|
||||||
|
default_factory=lambda: ["**/*"],
|
||||||
|
description="Allowed file patterns",
|
||||||
|
)
|
||||||
|
denied_file_patterns: list[str] = Field(
|
||||||
|
default_factory=lambda: ["**/.env", "**/secrets/**"],
|
||||||
|
description="Denied file patterns",
|
||||||
|
)
|
||||||
|
|
||||||
|
# HITL
|
||||||
|
require_approval_for: list[str] = Field(
|
||||||
|
default_factory=lambda: [
|
||||||
|
"delete_file",
|
||||||
|
"push_to_remote",
|
||||||
|
"deploy_to_production",
|
||||||
|
"modify_critical_config",
|
||||||
|
],
|
||||||
|
description="Actions requiring approval",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Loop detection
|
||||||
|
max_repeated_actions: int = Field(5, description="Max exact repetitions")
|
||||||
|
max_similar_actions: int = Field(10, description="Max similar actions")
|
||||||
|
|
||||||
|
# Sandbox
|
||||||
|
require_sandbox: bool = Field(False, description="Require sandbox execution")
|
||||||
|
sandbox_timeout_seconds: int = Field(300, description="Sandbox timeout")
|
||||||
|
sandbox_memory_mb: int = Field(1024, description="Sandbox memory limit")
|
||||||
|
|
||||||
|
# Validation rules
|
||||||
|
validation_rules: list[ValidationRule] = Field(
|
||||||
|
default_factory=list,
|
||||||
|
description="Custom validation rules",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ============================================================================
|
||||||
|
# Guardian Result Models
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
|
||||||
|
class GuardianResult(BaseModel):
|
||||||
|
"""Result of SafetyGuardian evaluation."""
|
||||||
|
|
||||||
|
action_id: str = Field(..., description="ID of the action")
|
||||||
|
allowed: bool = Field(..., description="Whether action is allowed")
|
||||||
|
decision: SafetyDecision = Field(..., description="Safety decision")
|
||||||
|
reasons: list[str] = Field(default_factory=list, description="Decision reasons")
|
||||||
|
approval_id: str | None = Field(None, description="Approval ID if needed")
|
||||||
|
checkpoint_id: str | None = Field(None, description="Checkpoint ID if created")
|
||||||
|
retry_after_seconds: float | None = Field(None, description="Retry delay")
|
||||||
|
modified_action: ActionRequest | None = Field(
|
||||||
|
None, description="Modified action if changed"
|
||||||
|
)
|
||||||
|
audit_events: list[AuditEvent] = Field(
|
||||||
|
default_factory=list, description="Generated audit events"
|
||||||
|
)
|
||||||
15
backend/app/services/safety/permissions/__init__.py
Normal file
15
backend/app/services/safety/permissions/__init__.py
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
"""
|
||||||
|
Permission Management Module
|
||||||
|
|
||||||
|
Agent permissions for resource access.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from .manager import (
|
||||||
|
PermissionGrant,
|
||||||
|
PermissionManager,
|
||||||
|
)
|
||||||
|
|
||||||
|
__all__ = [
|
||||||
|
"PermissionGrant",
|
||||||
|
"PermissionManager",
|
||||||
|
]
|
||||||
384
backend/app/services/safety/permissions/manager.py
Normal file
384
backend/app/services/safety/permissions/manager.py
Normal file
@@ -0,0 +1,384 @@
|
|||||||
|
"""
|
||||||
|
Permission Manager
|
||||||
|
|
||||||
|
Manages permissions for agent actions on resources.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import fnmatch
|
||||||
|
import logging
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from uuid import uuid4
|
||||||
|
|
||||||
|
from ..exceptions import PermissionDeniedError
|
||||||
|
from ..models import (
|
||||||
|
ActionRequest,
|
||||||
|
ActionType,
|
||||||
|
PermissionLevel,
|
||||||
|
ResourceType,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class PermissionGrant:
|
||||||
|
"""A permission grant for an agent on a resource."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
agent_id: str,
|
||||||
|
resource_pattern: str,
|
||||||
|
resource_type: ResourceType,
|
||||||
|
level: PermissionLevel,
|
||||||
|
*,
|
||||||
|
expires_at: datetime | None = None,
|
||||||
|
granted_by: str | None = None,
|
||||||
|
reason: str | None = None,
|
||||||
|
) -> None:
|
||||||
|
self.id = str(uuid4())
|
||||||
|
self.agent_id = agent_id
|
||||||
|
self.resource_pattern = resource_pattern
|
||||||
|
self.resource_type = resource_type
|
||||||
|
self.level = level
|
||||||
|
self.expires_at = expires_at
|
||||||
|
self.granted_by = granted_by
|
||||||
|
self.reason = reason
|
||||||
|
self.created_at = datetime.utcnow()
|
||||||
|
|
||||||
|
def is_expired(self) -> bool:
|
||||||
|
"""Check if the grant has expired."""
|
||||||
|
if self.expires_at is None:
|
||||||
|
return False
|
||||||
|
return datetime.utcnow() > self.expires_at
|
||||||
|
|
||||||
|
def matches(self, resource: str, resource_type: ResourceType) -> bool:
|
||||||
|
"""Check if this grant applies to a resource."""
|
||||||
|
if self.resource_type != resource_type:
|
||||||
|
return False
|
||||||
|
return fnmatch.fnmatch(resource, self.resource_pattern)
|
||||||
|
|
||||||
|
def allows(self, required_level: PermissionLevel) -> bool:
|
||||||
|
"""Check if this grant allows the required permission level."""
|
||||||
|
# Permission level hierarchy
|
||||||
|
hierarchy = {
|
||||||
|
PermissionLevel.NONE: 0,
|
||||||
|
PermissionLevel.READ: 1,
|
||||||
|
PermissionLevel.WRITE: 2,
|
||||||
|
PermissionLevel.EXECUTE: 3,
|
||||||
|
PermissionLevel.DELETE: 4,
|
||||||
|
PermissionLevel.ADMIN: 5,
|
||||||
|
}
|
||||||
|
|
||||||
|
return hierarchy[self.level] >= hierarchy[required_level]
|
||||||
|
|
||||||
|
|
||||||
|
class PermissionManager:
|
||||||
|
"""
|
||||||
|
Manages permissions for agent access to resources.
|
||||||
|
|
||||||
|
Features:
|
||||||
|
- Permission grants by agent/resource pattern
|
||||||
|
- Permission inheritance (project → agent → action)
|
||||||
|
- Temporary permissions with expiration
|
||||||
|
- Least-privilege defaults
|
||||||
|
- Permission escalation logging
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
default_deny: bool = True,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Initialize the PermissionManager.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
default_deny: If True, deny access unless explicitly granted
|
||||||
|
"""
|
||||||
|
self._grants: list[PermissionGrant] = []
|
||||||
|
self._default_deny = default_deny
|
||||||
|
self._lock = asyncio.Lock()
|
||||||
|
|
||||||
|
# Default permissions for common resources
|
||||||
|
self._default_permissions: dict[ResourceType, PermissionLevel] = {
|
||||||
|
ResourceType.FILE: PermissionLevel.READ,
|
||||||
|
ResourceType.DATABASE: PermissionLevel.READ,
|
||||||
|
ResourceType.API: PermissionLevel.READ,
|
||||||
|
ResourceType.GIT: PermissionLevel.READ,
|
||||||
|
ResourceType.LLM: PermissionLevel.EXECUTE,
|
||||||
|
ResourceType.SHELL: PermissionLevel.NONE,
|
||||||
|
ResourceType.NETWORK: PermissionLevel.READ,
|
||||||
|
}
|
||||||
|
|
||||||
|
async def grant(
|
||||||
|
self,
|
||||||
|
agent_id: str,
|
||||||
|
resource_pattern: str,
|
||||||
|
resource_type: ResourceType,
|
||||||
|
level: PermissionLevel,
|
||||||
|
*,
|
||||||
|
duration_seconds: int | None = None,
|
||||||
|
granted_by: str | None = None,
|
||||||
|
reason: str | None = None,
|
||||||
|
) -> PermissionGrant:
|
||||||
|
"""
|
||||||
|
Grant a permission to an agent.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
agent_id: ID of the agent
|
||||||
|
resource_pattern: Pattern for matching resources (supports wildcards)
|
||||||
|
resource_type: Type of resource
|
||||||
|
level: Permission level to grant
|
||||||
|
duration_seconds: Optional duration for temporary permission
|
||||||
|
granted_by: Who granted the permission
|
||||||
|
reason: Reason for granting
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The created permission grant
|
||||||
|
"""
|
||||||
|
expires_at = None
|
||||||
|
if duration_seconds:
|
||||||
|
expires_at = datetime.utcnow() + timedelta(seconds=duration_seconds)
|
||||||
|
|
||||||
|
grant = PermissionGrant(
|
||||||
|
agent_id=agent_id,
|
||||||
|
resource_pattern=resource_pattern,
|
||||||
|
resource_type=resource_type,
|
||||||
|
level=level,
|
||||||
|
expires_at=expires_at,
|
||||||
|
granted_by=granted_by,
|
||||||
|
reason=reason,
|
||||||
|
)
|
||||||
|
|
||||||
|
async with self._lock:
|
||||||
|
self._grants.append(grant)
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"Permission granted: agent=%s, resource=%s, type=%s, level=%s",
|
||||||
|
agent_id,
|
||||||
|
resource_pattern,
|
||||||
|
resource_type.value,
|
||||||
|
level.value,
|
||||||
|
)
|
||||||
|
|
||||||
|
return grant
|
||||||
|
|
||||||
|
async def revoke(self, grant_id: str) -> bool:
|
||||||
|
"""
|
||||||
|
Revoke a permission grant.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
grant_id: ID of the grant to revoke
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if grant was found and revoked
|
||||||
|
"""
|
||||||
|
async with self._lock:
|
||||||
|
for i, grant in enumerate(self._grants):
|
||||||
|
if grant.id == grant_id:
|
||||||
|
del self._grants[i]
|
||||||
|
logger.info("Permission revoked: %s", grant_id)
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def revoke_all(self, agent_id: str) -> int:
|
||||||
|
"""
|
||||||
|
Revoke all permissions for an agent.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
agent_id: ID of the agent
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Number of grants revoked
|
||||||
|
"""
|
||||||
|
async with self._lock:
|
||||||
|
original_count = len(self._grants)
|
||||||
|
self._grants = [g for g in self._grants if g.agent_id != agent_id]
|
||||||
|
revoked = original_count - len(self._grants)
|
||||||
|
|
||||||
|
if revoked:
|
||||||
|
logger.info("Revoked %d permissions for agent %s", revoked, agent_id)
|
||||||
|
|
||||||
|
return revoked
|
||||||
|
|
||||||
|
async def check(
|
||||||
|
self,
|
||||||
|
agent_id: str,
|
||||||
|
resource: str,
|
||||||
|
resource_type: ResourceType,
|
||||||
|
required_level: PermissionLevel,
|
||||||
|
) -> bool:
|
||||||
|
"""
|
||||||
|
Check if an agent has permission to access a resource.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
agent_id: ID of the agent
|
||||||
|
resource: Resource to access
|
||||||
|
resource_type: Type of resource
|
||||||
|
required_level: Required permission level
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if access is allowed
|
||||||
|
"""
|
||||||
|
# Clean up expired grants
|
||||||
|
await self._cleanup_expired()
|
||||||
|
|
||||||
|
async with self._lock:
|
||||||
|
for grant in self._grants:
|
||||||
|
if grant.agent_id != agent_id:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if grant.is_expired():
|
||||||
|
continue
|
||||||
|
|
||||||
|
if grant.matches(resource, resource_type):
|
||||||
|
if grant.allows(required_level):
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Check default permissions
|
||||||
|
if not self._default_deny:
|
||||||
|
default_level = self._default_permissions.get(
|
||||||
|
resource_type, PermissionLevel.NONE
|
||||||
|
)
|
||||||
|
hierarchy = {
|
||||||
|
PermissionLevel.NONE: 0,
|
||||||
|
PermissionLevel.READ: 1,
|
||||||
|
PermissionLevel.WRITE: 2,
|
||||||
|
PermissionLevel.EXECUTE: 3,
|
||||||
|
PermissionLevel.DELETE: 4,
|
||||||
|
PermissionLevel.ADMIN: 5,
|
||||||
|
}
|
||||||
|
if hierarchy[default_level] >= hierarchy[required_level]:
|
||||||
|
return True
|
||||||
|
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def check_action(self, action: ActionRequest) -> bool:
|
||||||
|
"""
|
||||||
|
Check if an action is permitted.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
action: The action to check
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if action is allowed
|
||||||
|
"""
|
||||||
|
# Determine required permission level from action type
|
||||||
|
level_map = {
|
||||||
|
ActionType.FILE_READ: PermissionLevel.READ,
|
||||||
|
ActionType.FILE_WRITE: PermissionLevel.WRITE,
|
||||||
|
ActionType.FILE_DELETE: PermissionLevel.DELETE,
|
||||||
|
ActionType.DATABASE_QUERY: PermissionLevel.READ,
|
||||||
|
ActionType.DATABASE_MUTATE: PermissionLevel.WRITE,
|
||||||
|
ActionType.SHELL_COMMAND: PermissionLevel.EXECUTE,
|
||||||
|
ActionType.API_CALL: PermissionLevel.EXECUTE,
|
||||||
|
ActionType.GIT_OPERATION: PermissionLevel.WRITE,
|
||||||
|
ActionType.LLM_CALL: PermissionLevel.EXECUTE,
|
||||||
|
ActionType.NETWORK_REQUEST: PermissionLevel.READ,
|
||||||
|
ActionType.TOOL_CALL: PermissionLevel.EXECUTE,
|
||||||
|
}
|
||||||
|
|
||||||
|
required_level = level_map.get(action.action_type, PermissionLevel.EXECUTE)
|
||||||
|
|
||||||
|
# Determine resource type from action
|
||||||
|
resource_type_map = {
|
||||||
|
ActionType.FILE_READ: ResourceType.FILE,
|
||||||
|
ActionType.FILE_WRITE: ResourceType.FILE,
|
||||||
|
ActionType.FILE_DELETE: ResourceType.FILE,
|
||||||
|
ActionType.DATABASE_QUERY: ResourceType.DATABASE,
|
||||||
|
ActionType.DATABASE_MUTATE: ResourceType.DATABASE,
|
||||||
|
ActionType.SHELL_COMMAND: ResourceType.SHELL,
|
||||||
|
ActionType.API_CALL: ResourceType.API,
|
||||||
|
ActionType.GIT_OPERATION: ResourceType.GIT,
|
||||||
|
ActionType.LLM_CALL: ResourceType.LLM,
|
||||||
|
ActionType.NETWORK_REQUEST: ResourceType.NETWORK,
|
||||||
|
}
|
||||||
|
|
||||||
|
resource_type = resource_type_map.get(action.action_type, ResourceType.CUSTOM)
|
||||||
|
resource = action.resource or action.tool_name or "*"
|
||||||
|
|
||||||
|
return await self.check(
|
||||||
|
agent_id=action.metadata.agent_id,
|
||||||
|
resource=resource,
|
||||||
|
resource_type=resource_type,
|
||||||
|
required_level=required_level,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def require_permission(
|
||||||
|
self,
|
||||||
|
agent_id: str,
|
||||||
|
resource: str,
|
||||||
|
resource_type: ResourceType,
|
||||||
|
required_level: PermissionLevel,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Require permission or raise exception.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
agent_id: ID of the agent
|
||||||
|
resource: Resource to access
|
||||||
|
resource_type: Type of resource
|
||||||
|
required_level: Required permission level
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
PermissionDeniedError: If permission is denied
|
||||||
|
"""
|
||||||
|
if not await self.check(agent_id, resource, resource_type, required_level):
|
||||||
|
raise PermissionDeniedError(
|
||||||
|
f"Permission denied: {resource}",
|
||||||
|
action_type=None,
|
||||||
|
resource=resource,
|
||||||
|
required_permission=required_level.value,
|
||||||
|
agent_id=agent_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def list_grants(
|
||||||
|
self,
|
||||||
|
agent_id: str | None = None,
|
||||||
|
resource_type: ResourceType | None = None,
|
||||||
|
) -> list[PermissionGrant]:
|
||||||
|
"""
|
||||||
|
List permission grants.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
agent_id: Optional filter by agent
|
||||||
|
resource_type: Optional filter by resource type
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of matching grants
|
||||||
|
"""
|
||||||
|
await self._cleanup_expired()
|
||||||
|
|
||||||
|
async with self._lock:
|
||||||
|
grants = list(self._grants)
|
||||||
|
|
||||||
|
if agent_id:
|
||||||
|
grants = [g for g in grants if g.agent_id == agent_id]
|
||||||
|
|
||||||
|
if resource_type:
|
||||||
|
grants = [g for g in grants if g.resource_type == resource_type]
|
||||||
|
|
||||||
|
return grants
|
||||||
|
|
||||||
|
def set_default_permission(
|
||||||
|
self,
|
||||||
|
resource_type: ResourceType,
|
||||||
|
level: PermissionLevel,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Set the default permission level for a resource type.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
resource_type: Type of resource
|
||||||
|
level: Default permission level
|
||||||
|
"""
|
||||||
|
self._default_permissions[resource_type] = level
|
||||||
|
|
||||||
|
async def _cleanup_expired(self) -> None:
|
||||||
|
"""Remove expired grants."""
|
||||||
|
async with self._lock:
|
||||||
|
original_count = len(self._grants)
|
||||||
|
self._grants = [g for g in self._grants if not g.is_expired()]
|
||||||
|
removed = original_count - len(self._grants)
|
||||||
|
|
||||||
|
if removed:
|
||||||
|
logger.debug("Cleaned up %d expired permission grants", removed)
|
||||||
1
backend/app/services/safety/policies/__init__.py
Normal file
1
backend/app/services/safety/policies/__init__.py
Normal file
@@ -0,0 +1 @@
|
|||||||
|
"""${dir} module."""
|
||||||
5
backend/app/services/safety/rollback/__init__.py
Normal file
5
backend/app/services/safety/rollback/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
"""Rollback management for agent actions."""
|
||||||
|
|
||||||
|
from .manager import RollbackManager, TransactionContext
|
||||||
|
|
||||||
|
__all__ = ["RollbackManager", "TransactionContext"]
|
||||||
417
backend/app/services/safety/rollback/manager.py
Normal file
417
backend/app/services/safety/rollback/manager.py
Normal file
@@ -0,0 +1,417 @@
|
|||||||
|
"""
|
||||||
|
Rollback Manager
|
||||||
|
|
||||||
|
Manages checkpoints and rollback operations for agent actions.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import logging
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
from uuid import uuid4
|
||||||
|
|
||||||
|
from ..config import get_safety_config
|
||||||
|
from ..exceptions import RollbackError
|
||||||
|
from ..models import (
|
||||||
|
ActionRequest,
|
||||||
|
Checkpoint,
|
||||||
|
CheckpointType,
|
||||||
|
RollbackResult,
|
||||||
|
)
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
class FileCheckpoint:
|
||||||
|
"""Stores file state for rollback."""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
checkpoint_id: str,
|
||||||
|
file_path: str,
|
||||||
|
original_content: bytes | None,
|
||||||
|
existed: bool,
|
||||||
|
) -> None:
|
||||||
|
self.checkpoint_id = checkpoint_id
|
||||||
|
self.file_path = file_path
|
||||||
|
self.original_content = original_content
|
||||||
|
self.existed = existed
|
||||||
|
self.created_at = datetime.utcnow()
|
||||||
|
|
||||||
|
|
||||||
|
class RollbackManager:
|
||||||
|
"""
|
||||||
|
Manages checkpoints and rollback operations.
|
||||||
|
|
||||||
|
Features:
|
||||||
|
- File system checkpoints
|
||||||
|
- Transaction wrapping for actions
|
||||||
|
- Automatic checkpoint for destructive actions
|
||||||
|
- Rollback triggers on failure
|
||||||
|
- Checkpoint expiration and cleanup
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
checkpoint_dir: str | None = None,
|
||||||
|
retention_hours: int | None = None,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Initialize the RollbackManager.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
checkpoint_dir: Directory for storing checkpoint data
|
||||||
|
retention_hours: Hours to retain checkpoints
|
||||||
|
"""
|
||||||
|
config = get_safety_config()
|
||||||
|
|
||||||
|
self._checkpoint_dir = Path(checkpoint_dir or config.checkpoint_dir)
|
||||||
|
self._retention_hours = retention_hours or config.checkpoint_retention_hours
|
||||||
|
|
||||||
|
self._checkpoints: dict[str, Checkpoint] = {}
|
||||||
|
self._file_checkpoints: dict[str, list[FileCheckpoint]] = {}
|
||||||
|
self._lock = asyncio.Lock()
|
||||||
|
|
||||||
|
# Ensure checkpoint directory exists
|
||||||
|
self._checkpoint_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
async def create_checkpoint(
|
||||||
|
self,
|
||||||
|
action: ActionRequest,
|
||||||
|
checkpoint_type: CheckpointType = CheckpointType.COMPOSITE,
|
||||||
|
description: str | None = None,
|
||||||
|
) -> Checkpoint:
|
||||||
|
"""
|
||||||
|
Create a checkpoint before an action.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
action: The action to checkpoint for
|
||||||
|
checkpoint_type: Type of checkpoint
|
||||||
|
description: Optional description
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The created checkpoint
|
||||||
|
"""
|
||||||
|
checkpoint_id = str(uuid4())
|
||||||
|
|
||||||
|
checkpoint = Checkpoint(
|
||||||
|
id=checkpoint_id,
|
||||||
|
checkpoint_type=checkpoint_type,
|
||||||
|
action_id=action.id,
|
||||||
|
created_at=datetime.utcnow(),
|
||||||
|
expires_at=datetime.utcnow() + timedelta(hours=self._retention_hours),
|
||||||
|
data={
|
||||||
|
"action_type": action.action_type.value,
|
||||||
|
"tool_name": action.tool_name,
|
||||||
|
"resource": action.resource,
|
||||||
|
},
|
||||||
|
description=description or f"Checkpoint for {action.tool_name}",
|
||||||
|
)
|
||||||
|
|
||||||
|
async with self._lock:
|
||||||
|
self._checkpoints[checkpoint_id] = checkpoint
|
||||||
|
self._file_checkpoints[checkpoint_id] = []
|
||||||
|
|
||||||
|
logger.info(
|
||||||
|
"Created checkpoint %s for action %s",
|
||||||
|
checkpoint_id,
|
||||||
|
action.id,
|
||||||
|
)
|
||||||
|
|
||||||
|
return checkpoint
|
||||||
|
|
||||||
|
async def checkpoint_file(
|
||||||
|
self,
|
||||||
|
checkpoint_id: str,
|
||||||
|
file_path: str,
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Store current state of a file for checkpoint.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
checkpoint_id: ID of the checkpoint
|
||||||
|
file_path: Path to the file
|
||||||
|
"""
|
||||||
|
path = Path(file_path)
|
||||||
|
|
||||||
|
if path.exists():
|
||||||
|
content = path.read_bytes()
|
||||||
|
existed = True
|
||||||
|
else:
|
||||||
|
content = None
|
||||||
|
existed = False
|
||||||
|
|
||||||
|
file_checkpoint = FileCheckpoint(
|
||||||
|
checkpoint_id=checkpoint_id,
|
||||||
|
file_path=file_path,
|
||||||
|
original_content=content,
|
||||||
|
existed=existed,
|
||||||
|
)
|
||||||
|
|
||||||
|
async with self._lock:
|
||||||
|
if checkpoint_id not in self._file_checkpoints:
|
||||||
|
self._file_checkpoints[checkpoint_id] = []
|
||||||
|
self._file_checkpoints[checkpoint_id].append(file_checkpoint)
|
||||||
|
|
||||||
|
logger.debug(
|
||||||
|
"Stored file state for checkpoint %s: %s (existed=%s)",
|
||||||
|
checkpoint_id,
|
||||||
|
file_path,
|
||||||
|
existed,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def checkpoint_files(
|
||||||
|
self,
|
||||||
|
checkpoint_id: str,
|
||||||
|
file_paths: list[str],
|
||||||
|
) -> None:
|
||||||
|
"""
|
||||||
|
Store current state of multiple files.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
checkpoint_id: ID of the checkpoint
|
||||||
|
file_paths: Paths to the files
|
||||||
|
"""
|
||||||
|
for path in file_paths:
|
||||||
|
await self.checkpoint_file(checkpoint_id, path)
|
||||||
|
|
||||||
|
async def rollback(
|
||||||
|
self,
|
||||||
|
checkpoint_id: str,
|
||||||
|
) -> RollbackResult:
|
||||||
|
"""
|
||||||
|
Rollback to a checkpoint.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
checkpoint_id: ID of the checkpoint
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Result of the rollback operation
|
||||||
|
"""
|
||||||
|
async with self._lock:
|
||||||
|
checkpoint = self._checkpoints.get(checkpoint_id)
|
||||||
|
if not checkpoint:
|
||||||
|
raise RollbackError(
|
||||||
|
f"Checkpoint not found: {checkpoint_id}",
|
||||||
|
checkpoint_id=checkpoint_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
if not checkpoint.is_valid:
|
||||||
|
raise RollbackError(
|
||||||
|
f"Checkpoint is no longer valid: {checkpoint_id}",
|
||||||
|
checkpoint_id=checkpoint_id,
|
||||||
|
)
|
||||||
|
|
||||||
|
file_checkpoints = self._file_checkpoints.get(checkpoint_id, [])
|
||||||
|
|
||||||
|
actions_rolled_back: list[str] = []
|
||||||
|
failed_actions: list[str] = []
|
||||||
|
|
||||||
|
# Rollback file changes
|
||||||
|
for fc in file_checkpoints:
|
||||||
|
try:
|
||||||
|
await self._rollback_file(fc)
|
||||||
|
actions_rolled_back.append(f"file:{fc.file_path}")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("Failed to rollback file %s: %s", fc.file_path, e)
|
||||||
|
failed_actions.append(f"file:{fc.file_path}")
|
||||||
|
|
||||||
|
success = len(failed_actions) == 0
|
||||||
|
|
||||||
|
# Mark checkpoint as used
|
||||||
|
async with self._lock:
|
||||||
|
if checkpoint_id in self._checkpoints:
|
||||||
|
self._checkpoints[checkpoint_id].is_valid = False
|
||||||
|
|
||||||
|
result = RollbackResult(
|
||||||
|
checkpoint_id=checkpoint_id,
|
||||||
|
success=success,
|
||||||
|
actions_rolled_back=actions_rolled_back,
|
||||||
|
failed_actions=failed_actions,
|
||||||
|
error=None
|
||||||
|
if success
|
||||||
|
else f"Failed to rollback {len(failed_actions)} items",
|
||||||
|
)
|
||||||
|
|
||||||
|
if success:
|
||||||
|
logger.info("Rollback successful for checkpoint %s", checkpoint_id)
|
||||||
|
else:
|
||||||
|
logger.error(
|
||||||
|
"Rollback partially failed for checkpoint %s: %d failures",
|
||||||
|
checkpoint_id,
|
||||||
|
len(failed_actions),
|
||||||
|
)
|
||||||
|
|
||||||
|
return result
|
||||||
|
|
||||||
|
async def discard_checkpoint(self, checkpoint_id: str) -> bool:
|
||||||
|
"""
|
||||||
|
Discard a checkpoint without rolling back.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
checkpoint_id: ID of the checkpoint
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
True if checkpoint was found and discarded
|
||||||
|
"""
|
||||||
|
async with self._lock:
|
||||||
|
if checkpoint_id in self._checkpoints:
|
||||||
|
del self._checkpoints[checkpoint_id]
|
||||||
|
if checkpoint_id in self._file_checkpoints:
|
||||||
|
del self._file_checkpoints[checkpoint_id]
|
||||||
|
logger.debug("Discarded checkpoint %s", checkpoint_id)
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
async def get_checkpoint(self, checkpoint_id: str) -> Checkpoint | None:
|
||||||
|
"""Get a checkpoint by ID."""
|
||||||
|
async with self._lock:
|
||||||
|
return self._checkpoints.get(checkpoint_id)
|
||||||
|
|
||||||
|
async def list_checkpoints(
|
||||||
|
self,
|
||||||
|
action_id: str | None = None,
|
||||||
|
include_expired: bool = False,
|
||||||
|
) -> list[Checkpoint]:
|
||||||
|
"""
|
||||||
|
List checkpoints.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
action_id: Optional filter by action ID
|
||||||
|
include_expired: Include expired checkpoints
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of checkpoints
|
||||||
|
"""
|
||||||
|
now = datetime.utcnow()
|
||||||
|
|
||||||
|
async with self._lock:
|
||||||
|
checkpoints = list(self._checkpoints.values())
|
||||||
|
|
||||||
|
if action_id:
|
||||||
|
checkpoints = [c for c in checkpoints if c.action_id == action_id]
|
||||||
|
|
||||||
|
if not include_expired:
|
||||||
|
checkpoints = [
|
||||||
|
c for c in checkpoints if c.expires_at is None or c.expires_at > now
|
||||||
|
]
|
||||||
|
|
||||||
|
return checkpoints
|
||||||
|
|
||||||
|
async def cleanup_expired(self) -> int:
|
||||||
|
"""
|
||||||
|
Clean up expired checkpoints.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Number of checkpoints cleaned up
|
||||||
|
"""
|
||||||
|
now = datetime.utcnow()
|
||||||
|
to_remove: list[str] = []
|
||||||
|
|
||||||
|
async with self._lock:
|
||||||
|
for checkpoint_id, checkpoint in self._checkpoints.items():
|
||||||
|
if checkpoint.expires_at and checkpoint.expires_at < now:
|
||||||
|
to_remove.append(checkpoint_id)
|
||||||
|
|
||||||
|
for checkpoint_id in to_remove:
|
||||||
|
del self._checkpoints[checkpoint_id]
|
||||||
|
if checkpoint_id in self._file_checkpoints:
|
||||||
|
del self._file_checkpoints[checkpoint_id]
|
||||||
|
|
||||||
|
if to_remove:
|
||||||
|
logger.info("Cleaned up %d expired checkpoints", len(to_remove))
|
||||||
|
|
||||||
|
return len(to_remove)
|
||||||
|
|
||||||
|
async def _rollback_file(self, fc: FileCheckpoint) -> None:
|
||||||
|
"""Rollback a single file to its checkpoint state."""
|
||||||
|
path = Path(fc.file_path)
|
||||||
|
|
||||||
|
if fc.existed:
|
||||||
|
# Restore original content
|
||||||
|
if fc.original_content is not None:
|
||||||
|
path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
path.write_bytes(fc.original_content)
|
||||||
|
logger.debug("Restored file: %s", fc.file_path)
|
||||||
|
else:
|
||||||
|
# File didn't exist before - delete it
|
||||||
|
if path.exists():
|
||||||
|
path.unlink()
|
||||||
|
logger.debug("Deleted file (didn't exist before): %s", fc.file_path)
|
||||||
|
|
||||||
|
|
||||||
|
class TransactionContext:
|
||||||
|
"""
|
||||||
|
Context manager for transactional action execution.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
async with TransactionContext(rollback_manager, action) as tx:
|
||||||
|
tx.checkpoint_file("/path/to/file")
|
||||||
|
# Do work...
|
||||||
|
# If exception occurs, automatic rollback
|
||||||
|
"""
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
manager: RollbackManager,
|
||||||
|
action: ActionRequest,
|
||||||
|
auto_rollback: bool = True,
|
||||||
|
) -> None:
|
||||||
|
self._manager = manager
|
||||||
|
self._action = action
|
||||||
|
self._auto_rollback = auto_rollback
|
||||||
|
self._checkpoint: Checkpoint | None = None
|
||||||
|
self._committed = False
|
||||||
|
|
||||||
|
async def __aenter__(self) -> "TransactionContext":
|
||||||
|
self._checkpoint = await self._manager.create_checkpoint(self._action)
|
||||||
|
return self
|
||||||
|
|
||||||
|
async def __aexit__(
|
||||||
|
self,
|
||||||
|
exc_type: type | None,
|
||||||
|
exc_val: Exception | None,
|
||||||
|
exc_tb: Any,
|
||||||
|
) -> bool:
|
||||||
|
if exc_val is not None and self._auto_rollback and not self._committed:
|
||||||
|
# Exception occurred - rollback
|
||||||
|
if self._checkpoint:
|
||||||
|
try:
|
||||||
|
await self._manager.rollback(self._checkpoint.id)
|
||||||
|
logger.info(
|
||||||
|
"Auto-rollback completed for action %s",
|
||||||
|
self._action.id,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("Auto-rollback failed: %s", e)
|
||||||
|
elif self._committed and self._checkpoint:
|
||||||
|
# Committed - discard checkpoint
|
||||||
|
await self._manager.discard_checkpoint(self._checkpoint.id)
|
||||||
|
|
||||||
|
return False # Don't suppress the exception
|
||||||
|
|
||||||
|
@property
|
||||||
|
def checkpoint_id(self) -> str | None:
|
||||||
|
"""Get the checkpoint ID."""
|
||||||
|
return self._checkpoint.id if self._checkpoint else None
|
||||||
|
|
||||||
|
async def checkpoint_file(self, file_path: str) -> None:
|
||||||
|
"""Checkpoint a file for this transaction."""
|
||||||
|
if self._checkpoint:
|
||||||
|
await self._manager.checkpoint_file(self._checkpoint.id, file_path)
|
||||||
|
|
||||||
|
async def checkpoint_files(self, file_paths: list[str]) -> None:
|
||||||
|
"""Checkpoint multiple files for this transaction."""
|
||||||
|
if self._checkpoint:
|
||||||
|
await self._manager.checkpoint_files(self._checkpoint.id, file_paths)
|
||||||
|
|
||||||
|
def commit(self) -> None:
|
||||||
|
"""Mark transaction as committed (no rollback on exit)."""
|
||||||
|
self._committed = True
|
||||||
|
|
||||||
|
async def rollback(self) -> RollbackResult | None:
|
||||||
|
"""Manually trigger rollback."""
|
||||||
|
if self._checkpoint:
|
||||||
|
return await self._manager.rollback(self._checkpoint.id)
|
||||||
|
return None
|
||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user