feat(knowledge-base): implement Knowledge Base MCP Server (#57)
Implements RAG capabilities with pgvector for semantic search:

- Intelligent chunking strategies (code-aware, markdown-aware, text)
- Semantic search with vector similarity (HNSW index)
- Keyword search with PostgreSQL full-text search
- Hybrid search using Reciprocal Rank Fusion (RRF)
- Redis caching for embeddings
- Collection management (ingest, search, delete, stats)
- FastMCP tools: search_knowledge, ingest_content, delete_content, list_collections, get_collection_stats, update_document

Testing:

- 128 comprehensive tests covering all components
- 58% code coverage (database integration tests use mocks)
- Passes ruff linting and mypy type checking

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
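The hybrid search fuses the semantic and keyword rankings with Reciprocal Rank Fusion. A rough sketch of RRF scoring (hypothetical function name and signature; the server's actual implementation lives in its search module, not in this diff):

def rrf_merge(result_lists: list[list[str]], k: int = 60) -> list[tuple[str, float]]:
    """Fuse several ranked ID lists; score(d) = sum over lists of 1 / (k + rank)."""
    scores: dict[str, float] = {}
    for results in result_lists:  # e.g. [semantic_ids, keyword_ids]
        for rank, doc_id in enumerate(results, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)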
483
mcp-servers/knowledge-base/chunking/markdown.py
Normal file
@@ -0,0 +1,483 @@
"""
Markdown-aware chunking implementation.

Provides intelligent chunking for markdown content that respects
heading hierarchy and preserves document structure.
"""

import logging
import re
from typing import Any

from chunking.base import BaseChunker
from config import Settings
from models import Chunk, ChunkType, FileType

logger = logging.getLogger(__name__)

# Patterns for markdown elements
HEADING_PATTERN = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
CODE_BLOCK_PATTERN = re.compile(r"^```", re.MULTILINE)
HR_PATTERN = re.compile(r"^(-{3,}|_{3,}|\*{3,})$", re.MULTILINE)

class MarkdownChunker(BaseChunker):
    """
    Markdown-aware chunker that respects document structure.

    Features:
    - Respects heading hierarchy
    - Preserves heading context in chunks
    - Handles code blocks as units
    - Maintains list continuity where possible
    """

    def __init__(
        self,
        chunk_size: int,
        chunk_overlap: int,
        settings: Settings | None = None,
    ) -> None:
        """Initialize markdown chunker."""
        super().__init__(chunk_size, chunk_overlap, settings)

    @property
    def chunk_type(self) -> ChunkType:
        """Get chunk type."""
        return ChunkType.MARKDOWN

    def chunk(
        self,
        content: str,
        source_path: str | None = None,
        file_type: FileType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """
        Chunk markdown content.

        Splits on heading boundaries and preserves heading context.
        """
        if not content.strip():
            return []

        metadata = metadata or {}
        file_type = file_type or FileType.MARKDOWN

        # Split content into sections by headings
        sections = self._split_by_headings(content)

        if not sections:
            # No headings, chunk as plain text
            return self._chunk_text_block(
                content, source_path, file_type, metadata, []
            )

        chunks: list[Chunk] = []
        heading_stack: list[tuple[int, str]] = []  # (level, text)

        for section in sections:
            heading_level = section.get("level", 0)
            heading_text = section.get("heading", "")
            section_content = section.get("content", "")
            start_line = section.get("start_line", 1)
            end_line = section.get("end_line", 1)

            # Update heading stack
            if heading_level > 0:
                # Pop headings of equal or higher level
                while heading_stack and heading_stack[-1][0] >= heading_level:
                    heading_stack.pop()
                heading_stack.append((heading_level, heading_text))

            # Build heading context prefix
            heading_context = " > ".join(h[1] for h in heading_stack)

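            # e.g. for "# Guide" followed by "## Install" and "## Usage", chunks
            # from the "Usage" section carry heading_context "Guide > Usage"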
            section_chunks = self._chunk_section(
                content=section_content,
                heading_context=heading_context,
                heading_level=heading_level,
                heading_text=heading_text,
                start_line=start_line,
                end_line=end_line,
                source_path=source_path,
                file_type=file_type,
                metadata=metadata,
            )
            chunks.extend(section_chunks)

        return chunks

    def _split_by_headings(self, content: str) -> list[dict[str, Any]]:
        """Split content into sections by headings."""
        sections: list[dict[str, Any]] = []
        lines = content.split("\n")

        current_section: dict[str, Any] = {
            "level": 0,
            "heading": "",
            "content": "",
            "start_line": 1,
            "end_line": 1,
        }
        current_lines: list[str] = []
        in_code_block = False

        for i, line in enumerate(lines):
            # Track code blocks
            if line.strip().startswith("```"):
                in_code_block = not in_code_block
                current_lines.append(line)
                continue

            # Skip heading detection in code blocks
            if in_code_block:
                current_lines.append(line)
                continue

            # Check for heading
            heading_match = HEADING_PATTERN.match(line)
            if heading_match:
                # Save previous section
                if current_lines:
                    current_section["content"] = "\n".join(current_lines)
                    current_section["end_line"] = i
                    if current_section["content"].strip():
                        sections.append(current_section)

                # Start new section
                level = len(heading_match.group(1))
                heading_text = heading_match.group(2).strip()
                current_section = {
                    "level": level,
                    "heading": heading_text,
                    "content": "",
                    "start_line": i + 1,
                    "end_line": i + 1,
                }
                current_lines = [line]
            else:
                current_lines.append(line)

        # Save final section
        if current_lines:
            current_section["content"] = "\n".join(current_lines)
            current_section["end_line"] = len(lines)
            if current_section["content"].strip():
                sections.append(current_section)

        return sections

    def _chunk_section(
        self,
        content: str,
        heading_context: str,
        heading_level: int,
        heading_text: str,
        start_line: int,
        end_line: int,
        source_path: str | None,
        file_type: FileType,
        metadata: dict[str, Any],
    ) -> list[Chunk]:
        """Chunk a single section of markdown."""
        if not content.strip():
            return []

        token_count = self.count_tokens(content)

        # If section fits in one chunk, return as-is
        if token_count <= self.chunk_size:
            section_metadata = {
                **metadata,
                "heading_context": heading_context,
                "heading_level": heading_level,
                "heading_text": heading_text,
            }
            return [
                self._create_chunk(
                    content=content.strip(),
                    source_path=source_path,
                    start_line=start_line,
                    end_line=end_line,
                    file_type=file_type,
                    metadata=section_metadata,
                )
            ]

        # Need to split - try to split on paragraphs first
        return self._chunk_text_block(
            content,
            source_path,
            file_type,
            {
                **metadata,
                "heading_context": heading_context,
                "heading_level": heading_level,
                "heading_text": heading_text,
            },
            _heading_stack=[(heading_level, heading_text)] if heading_text else [],
            base_line=start_line,
        )

    def _chunk_text_block(
        self,
        content: str,
        source_path: str | None,
        file_type: FileType,
        metadata: dict[str, Any],
        _heading_stack: list[tuple[int, str]],
        base_line: int = 1,
    ) -> list[Chunk]:
        """Chunk a block of text by paragraphs."""
        # Split into paragraphs (separated by blank lines)
        paragraphs = self._split_into_paragraphs(content)

        if not paragraphs:
            return []

        chunks: list[Chunk] = []
        current_content: list[str] = []
        current_tokens = 0
        chunk_start_line = base_line

        for para_info in paragraphs:
            para_content = para_info["content"]
            para_tokens = para_info["tokens"]
            para_start = para_info["start_line"]

            # Handle very large paragraphs
            if para_tokens > self.chunk_size:
                # Flush current content
                if current_content:
                    chunk_text = "\n\n".join(current_content)
                    chunks.append(
                        self._create_chunk(
                            content=chunk_text.strip(),
                            source_path=source_path,
                            start_line=chunk_start_line,
                            end_line=base_line + para_start - 1,
                            file_type=file_type,
                            metadata=metadata,
                        )
                    )
                    current_content = []
                    current_tokens = 0

                # Split large paragraph by sentences/lines
                sub_chunks = self._split_large_paragraph(
                    para_content,
                    source_path,
                    file_type,
                    metadata,
                    base_line + para_start,
                )
                chunks.extend(sub_chunks)
                chunk_start_line = base_line + para_info["end_line"] + 1
                continue

            # Check if adding this paragraph exceeds limit
            if current_tokens + para_tokens > self.chunk_size and current_content:
                # Create chunk
                chunk_text = "\n\n".join(current_content)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text.strip(),
                        source_path=source_path,
                        start_line=chunk_start_line,
                        end_line=base_line + para_start - 1,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Overlap: include last paragraph if it fits
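                # (e.g. with chunk_overlap=50 tokens, a 40-token closing paragraph
                # is repeated at the start of the next chunk to preserve continuity)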
                if current_content and self.count_tokens(current_content[-1]) <= self.chunk_overlap:
                    current_content = [current_content[-1]]
                    current_tokens = self.count_tokens(current_content[-1])
                else:
                    current_content = []
                    current_tokens = 0

                chunk_start_line = base_line + para_start

            current_content.append(para_content)
            current_tokens += para_tokens

        # Final chunk
        if current_content:
            chunk_text = "\n\n".join(current_content)
            end_line_num = base_line + (paragraphs[-1]["end_line"] if paragraphs else 0)
            chunks.append(
                self._create_chunk(
                    content=chunk_text.strip(),
                    source_path=source_path,
                    start_line=chunk_start_line,
                    end_line=end_line_num,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks

    def _split_into_paragraphs(self, content: str) -> list[dict[str, Any]]:
        """Split content into paragraphs with metadata."""
        paragraphs: list[dict[str, Any]] = []
        lines = content.split("\n")

        current_para: list[str] = []
        para_start = 0
        in_code_block = False

        for i, line in enumerate(lines):
            # Track code blocks (keep them as single units)
            if line.strip().startswith("```"):
                if in_code_block:
                    # End of code block
                    current_para.append(line)
                    in_code_block = False
                else:
                    # Start of code block - save previous paragraph
                    if current_para and any(p.strip() for p in current_para):
                        para_content = "\n".join(current_para)
                        paragraphs.append({
                            "content": para_content,
                            "tokens": self.count_tokens(para_content),
                            "start_line": para_start,
                            "end_line": i - 1,
                        })
                    current_para = [line]
                    para_start = i
                    in_code_block = True
                continue

            if in_code_block:
                current_para.append(line)
                continue

            # Empty line indicates paragraph break
            if not line.strip():
                if current_para and any(p.strip() for p in current_para):
                    para_content = "\n".join(current_para)
                    paragraphs.append({
                        "content": para_content,
                        "tokens": self.count_tokens(para_content),
                        "start_line": para_start,
                        "end_line": i - 1,
                    })
                current_para = []
                para_start = i + 1
            else:
                if not current_para:
                    para_start = i
                current_para.append(line)

        # Final paragraph
        if current_para and any(p.strip() for p in current_para):
            para_content = "\n".join(current_para)
            paragraphs.append({
                "content": para_content,
                "tokens": self.count_tokens(para_content),
                "start_line": para_start,
                "end_line": len(lines) - 1,
            })

        return paragraphs

    def _split_large_paragraph(
        self,
        content: str,
        source_path: str | None,
        file_type: FileType,
        metadata: dict[str, Any],
        base_line: int,
    ) -> list[Chunk]:
        """Split a large paragraph into smaller chunks."""
        # Try splitting by sentences
        sentences = self._split_into_sentences(content)

        chunks: list[Chunk] = []
        current_content: list[str] = []
        current_tokens = 0

        for sentence in sentences:
            sentence_tokens = self.count_tokens(sentence)

            # If single sentence is too large, truncate
            if sentence_tokens > self.chunk_size:
                if current_content:
                    chunk_text = " ".join(current_content)
                    chunks.append(
                        self._create_chunk(
                            content=chunk_text.strip(),
                            source_path=source_path,
                            start_line=base_line,
                            end_line=base_line,
                            file_type=file_type,
                            metadata=metadata,
                        )
                    )
                    current_content = []
                    current_tokens = 0

                truncated = self.truncate_to_tokens(sentence, self.chunk_size)
                chunks.append(
                    self._create_chunk(
                        content=truncated.strip(),
                        source_path=source_path,
                        start_line=base_line,
                        end_line=base_line,
                        file_type=file_type,
                        metadata={**metadata, "truncated": True},
                    )
                )
                continue

            if current_tokens + sentence_tokens > self.chunk_size and current_content:
                chunk_text = " ".join(current_content)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text.strip(),
                        source_path=source_path,
                        start_line=base_line,
                        end_line=base_line,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Overlap with last sentence
                if current_content and self.count_tokens(current_content[-1]) <= self.chunk_overlap:
                    current_content = [current_content[-1]]
                    current_tokens = self.count_tokens(current_content[-1])
                else:
                    current_content = []
                    current_tokens = 0

            current_content.append(sentence)
            current_tokens += sentence_tokens

        # Final chunk
        if current_content:
            chunk_text = " ".join(current_content)
            chunks.append(
                self._create_chunk(
                    content=chunk_text.strip(),
                    source_path=source_path,
                    start_line=base_line,
                    end_line=base_line,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks

    def _split_into_sentences(self, text: str) -> list[str]:
        """Split text into sentences."""
        # Simple sentence splitting on common terminators
        # More sophisticated splitting could use nltk or spacy
        sentence_endings = re.compile(r"(?<=[.!?])\s+")
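        # e.g. "First point. Second point! Third?" splits into
        # ["First point.", "Second point!", "Third?"]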
        sentences = sentence_endings.split(text)
        return [s.strip() for s in sentences if s.strip()]
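A minimal usage sketch for the chunker above (assuming BaseChunker takes token-based chunk_size/chunk_overlap and Chunk exposes content and metadata attributes, as the code suggests):

from chunking.markdown import MarkdownChunker
from models import FileType

chunker = MarkdownChunker(chunk_size=512, chunk_overlap=64)
chunks = chunker.chunk(
    "# Guide\n\nIntro paragraph.\n\n## Install\n\nRun the installer.\n",
    source_path="docs/guide.md",
    file_type=FileType.MARKDOWN,
)
for chunk in chunks:
    print(chunk.metadata.get("heading_context"), chunk.content[:40])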