syndarix/mcp-servers/llm-gateway/server.py
Felipe Cardoso 2310c8cdfd feat: Add MCP server stubs, development docs, and Docker updates
- Add MCP server skeleton implementations for all 7 planned servers
  (llm-gateway, knowledge-base, git, issues, filesystem, code-analysis, cicd)
- Add comprehensive DEVELOPMENT.md with setup and usage instructions
- Add BACKLOG.md with detailed phase planning
- Update docker-compose.dev.yml with Redis and Celery workers
- Update CLAUDE.md with Syndarix-specific context

Addresses issues #16, #20, #21

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-30 02:13:16 +01:00


"""
Syndarix LLM Gateway MCP Server.
Provides unified LLM access with:
- Multi-provider support (Claude, GPT, Gemini, Qwen, DeepSeek)
- Automatic failover chains
- Cost tracking via LiteLLM callbacks
- Model group routing (high-reasoning, code-generation, fast-response, cost-optimized)
Per ADR-004: LLM Provider Abstraction.
"""
import os
from fastmcp import FastMCP
# Create MCP server
mcp = FastMCP(
"syndarix-llm-gateway",
description="Unified LLM access with failover and cost tracking",
)
# Configuration
REDIS_URL = os.getenv("REDIS_URL", "redis://localhost:6379/0")
DATABASE_URL = os.getenv("DATABASE_URL")
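
# A minimal sketch of the "cost tracking via LiteLLM callbacks" mentioned in the
# module docstring, kept out of the import path until the tools below are
# implemented. Assumptions: the litellm and redis packages are installed, and
# per-project spend accumulates under an illustrative "llm_cost:{project_id}"
# Redis key (not a confirmed schema).
def _register_cost_tracking() -> None:
    import litellm
    import redis

    def _track_cost(kwargs, completion_response, start_time, end_time):
        # LiteLLM hands success callbacks the original call kwargs; the
        # project_id is expected to travel in the request metadata.
        metadata = kwargs.get("litellm_params", {}).get("metadata") or {}
        project_id = metadata.get("project_id", "unknown")
        cost = litellm.completion_cost(completion_response=completion_response)
        redis.from_url(REDIS_URL).incrbyfloat(f"llm_cost:{project_id}", cost)

    litellm.success_callback = [_track_cost]
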
@mcp.tool()
async def chat_completion(
    project_id: str,
    agent_id: str,
    messages: list[dict],
    model_group: str = "high-reasoning",
    max_tokens: int = 4096,
    temperature: float = 0.7,
) -> dict:
    """
    Generate a chat completion using the specified model group.

    Args:
        project_id: UUID of the project (required for cost attribution)
        agent_id: UUID of the agent instance making the request
        messages: List of message dicts with 'role' and 'content'
        model_group: Model routing group (high-reasoning, code-generation, fast-response, cost-optimized, self-hosted)
        max_tokens: Maximum tokens to generate
        temperature: Sampling temperature (0.0-2.0)

    Returns:
        Completion response with content and usage statistics
    """
    # TODO: Implement with LiteLLM
    # 1. Map model_group to primary model + fallbacks
    # 2. Check project budget before making request
    # 3. Make completion request with failover
    # 4. Log usage via callback
    # 5. Return response
    return {
        "status": "not_implemented",
        "project_id": project_id,
        "agent_id": agent_id,
        "model_group": model_group,
    }
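
# A minimal sketch of the failover path (steps 1 and 3 above) using LiteLLM's
# Router. Assumptions: litellm is installed, provider API keys are set in the
# environment, and the model names (mirroring list_available_models()) plus the
# fallback mapping are illustrative; the real mapping would come from
# configuration, and the Router would be built once at startup, not per call.
async def _completion_with_failover(
    model_group: str,
    messages: list[dict],
    max_tokens: int,
    temperature: float,
) -> dict:
    from litellm import Router

    # One Router model group per entry; a request addressed to a group name is
    # routed to its deployment, and the fallbacks map is tried on failure.
    router = Router(
        model_list=[
            {"model_name": "high-reasoning", "litellm_params": {"model": "claude-opus-4-5"}},
            {"model_name": "code-generation", "litellm_params": {"model": "gpt-5.1-codex-max"}},
        ],
        fallbacks=[{"high-reasoning": ["code-generation"]}],
    )
    response = await router.acompletion(
        model=model_group,
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
    )
    # Return only the fields the tool contract above cares about.
    return {
        "content": response.choices[0].message.content,
        "model": response.model,
        "usage": {
            "prompt_tokens": response.usage.prompt_tokens,
            "completion_tokens": response.usage.completion_tokens,
        },
    }
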
@mcp.tool()
async def get_embeddings(
    project_id: str,
    texts: list[str],
    model: str = "text-embedding-3-small",
) -> dict:
    """
    Generate embeddings for the given texts.

    Args:
        project_id: UUID of the project (required for cost attribution)
        texts: List of texts to embed
        model: Embedding model to use

    Returns:
        List of embedding vectors
    """
    # TODO: Implement with LiteLLM embeddings
    return {
        "status": "not_implemented",
        "project_id": project_id,
        "text_count": len(texts),
    }
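
# A minimal sketch of how get_embeddings could be backed by LiteLLM once
# implemented (assumption: litellm is installed). aembedding() returns an
# OpenAI-style response whose data entries carry the vectors.
async def _embed_texts(texts: list[str], model: str) -> list[list[float]]:
    from litellm import aembedding

    response = await aembedding(model=model, input=texts)
    return [item["embedding"] for item in response.data]
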
@mcp.tool()
async def get_budget_status(project_id: str) -> dict:
    """
    Get current budget status for a project.

    Args:
        project_id: UUID of the project

    Returns:
        Budget status with usage, limits, and percentage
    """
    # TODO: Implement budget check from Redis
    return {
        "status": "not_implemented",
        "project_id": project_id,
    }
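
# A minimal sketch of the Redis-backed budget check referenced in the TODO
# above, using redis.asyncio. The "budget:{project_id}" and
# "llm_cost:{project_id}" keys are illustrative, not a confirmed schema.
async def _budget_status_from_redis(project_id: str) -> dict:
    import redis.asyncio as aioredis

    r = aioredis.from_url(REDIS_URL, decode_responses=True)
    try:
        limit = float(await r.get(f"budget:{project_id}") or 0.0)
        spent = float(await r.get(f"llm_cost:{project_id}") or 0.0)
    finally:
        await r.aclose()
    return {
        "project_id": project_id,
        "limit_usd": limit,
        "spent_usd": spent,
        "percent_used": (spent / limit * 100) if limit else None,
    }
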
@mcp.tool()
async def list_available_models() -> dict:
    """
    List all available models and their capabilities.

    Returns:
        Dictionary of model groups and available models
    """
    return {
        "model_groups": {
            "high-reasoning": {
                "primary": "claude-opus-4-5",
                "fallbacks": ["gpt-5.1-codex-max", "gemini-3-pro"],
                "description": "Complex analysis, architecture decisions",
            },
            "code-generation": {
                "primary": "gpt-5.1-codex-max",
                "fallbacks": ["claude-opus-4-5", "deepseek-v3.2"],
                "description": "Code writing and refactoring",
            },
            "fast-response": {
                "primary": "gemini-3-flash",
                "fallbacks": ["qwen3-235b", "deepseek-v3.2"],
                "description": "Quick tasks, simple queries",
            },
            "cost-optimized": {
                "primary": "qwen3-235b",
                "fallbacks": ["deepseek-v3.2"],
                "description": "High-volume, non-critical tasks",
            },
            "self-hosted": {
                "primary": "deepseek-v3.2",
                "fallbacks": ["qwen3-235b"],
                "description": "Privacy-sensitive, air-gapped",
            },
        }
    }

if __name__ == "__main__":
    mcp.run()