forked from cardosofelipe/fast-next-template
feat(knowledge-base): implement Knowledge Base MCP Server (#57)
Implements RAG capabilities with pgvector for semantic search: - Intelligent chunking strategies (code-aware, markdown-aware, text) - Semantic search with vector similarity (HNSW index) - Keyword search with PostgreSQL full-text search - Hybrid search using Reciprocal Rank Fusion (RRF) - Redis caching for embeddings - Collection management (ingest, search, delete, stats) - FastMCP tools: search_knowledge, ingest_content, delete_content, list_collections, get_collection_stats, update_document Testing: - 128 comprehensive tests covering all components - 58% code coverage (database integration tests use mocks) - Passes ruff linting and mypy type checking 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
281
mcp-servers/knowledge-base/chunking/base.py
Normal file
281
mcp-servers/knowledge-base/chunking/base.py
Normal file
@@ -0,0 +1,281 @@
|
||||
"""
|
||||
Base chunker implementation.
|
||||
|
||||
Provides abstract interface and common utilities for content chunking.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import Any
|
||||
|
||||
import tiktoken
|
||||
|
||||
from config import Settings, get_settings
|
||||
from exceptions import ChunkingError
|
||||
from models import FILE_EXTENSION_MAP, Chunk, ChunkType, FileType
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class BaseChunker(ABC):
    """Abstract base class for content chunkers.

    Concrete subclasses implement one chunking strategy for a particular
    kind of content (code, markdown, plain text). This base provides the
    shared tokenizer plumbing they all rely on.
    """

    def __init__(
        self,
        chunk_size: int,
        chunk_overlap: int,
        settings: Settings | None = None,
    ) -> None:
        """Initialize chunker.

        Args:
            chunk_size: Target tokens per chunk.
            chunk_overlap: Token overlap between consecutive chunks.
            settings: Application settings; the global settings are used
                when omitted.
        """
        self._settings = settings if settings is not None else get_settings()
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

        # cl100k_base is the encoding used by GPT-4 / text-embedding-3,
        # so token counts line up with the embedding model's view of text.
        self._tokenizer = tiktoken.get_encoding("cl100k_base")

    def count_tokens(self, text: str) -> int:
        """Return the number of tokens in *text*."""
        encoded = self._tokenizer.encode(text)
        return len(encoded)

    def truncate_to_tokens(self, text: str, max_tokens: int) -> str:
        """Return *text* cut down to at most *max_tokens* tokens.

        The original string is returned unchanged when it already fits.
        """
        token_ids = self._tokenizer.encode(text)
        if len(token_ids) > max_tokens:
            return self._tokenizer.decode(token_ids[:max_tokens])
        return text

    @abstractmethod
    def chunk(
        self,
        content: str,
        source_path: str | None = None,
        file_type: FileType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """Split content into chunks.

        Args:
            content: Content to chunk.
            source_path: Source file path for reference.
            file_type: File type for specialized handling.
            metadata: Additional metadata to include on each chunk.

        Returns:
            List of Chunk objects.
        """
        ...

    @property
    @abstractmethod
    def chunk_type(self) -> ChunkType:
        """Get the chunk type this chunker produces."""
        ...

    def _create_chunk(
        self,
        content: str,
        source_path: str | None = None,
        start_line: int | None = None,
        end_line: int | None = None,
        file_type: FileType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> Chunk:
        """Build a Chunk for *content*, filling in its token count."""
        return Chunk(
            content=content,
            chunk_type=self.chunk_type,
            file_type=file_type,
            source_path=source_path,
            start_line=start_line,
            end_line=end_line,
            metadata={} if metadata is None else metadata,
            token_count=self.count_tokens(content),
        )
|
||||
|
||||
|
||||
class ChunkerFactory:
    """Factory for creating appropriate chunkers.

    Selects the best chunker based on an explicit chunk type, a known file
    type, or a source-path extension. Chunker instances are created lazily
    (their imports are deferred to avoid circular imports) and cached for
    reuse across calls.
    """

    def __init__(self, settings: Settings | None = None) -> None:
        """Initialize factory.

        Args:
            settings: Application settings; the global settings are used
                when omitted.
        """
        self._settings = settings or get_settings()
        # Cache of chunker instances keyed by strategy name
        # ("code" / "markdown" / "text").
        self._chunkers: dict[str, BaseChunker] = {}

    def _get_code_chunker(self) -> "BaseChunker":
        """Get or create the cached code chunker."""
        # Deferred import: chunking.code depends on this module.
        from chunking.code import CodeChunker

        if "code" not in self._chunkers:
            self._chunkers["code"] = CodeChunker(
                chunk_size=self._settings.code_chunk_size,
                chunk_overlap=self._settings.code_chunk_overlap,
                settings=self._settings,
            )
        return self._chunkers["code"]

    def _get_markdown_chunker(self) -> "BaseChunker":
        """Get or create the cached markdown chunker."""
        # Deferred import: chunking.markdown depends on this module.
        from chunking.markdown import MarkdownChunker

        if "markdown" not in self._chunkers:
            self._chunkers["markdown"] = MarkdownChunker(
                chunk_size=self._settings.markdown_chunk_size,
                chunk_overlap=self._settings.markdown_chunk_overlap,
                settings=self._settings,
            )
        return self._chunkers["markdown"]

    def _get_text_chunker(self) -> "BaseChunker":
        """Get or create the cached text chunker."""
        # Deferred import: chunking.text depends on this module.
        from chunking.text import TextChunker

        if "text" not in self._chunkers:
            self._chunkers["text"] = TextChunker(
                chunk_size=self._settings.text_chunk_size,
                chunk_overlap=self._settings.text_chunk_overlap,
                settings=self._settings,
            )
        return self._chunkers["text"]

    def get_chunker(
        self,
        file_type: FileType | None = None,
        chunk_type: ChunkType | None = None,
    ) -> BaseChunker:
        """
        Get appropriate chunker for content type.

        An explicit ``chunk_type`` takes precedence over ``file_type``;
        when neither is given, the text chunker is used.

        Args:
            file_type: File type to chunk
            chunk_type: Explicit chunk type to use

        Returns:
            Appropriate chunker instance
        """
        # If explicit chunk type specified, use it
        if chunk_type:
            if chunk_type == ChunkType.CODE:
                return self._get_code_chunker()
            elif chunk_type == ChunkType.MARKDOWN:
                return self._get_markdown_chunker()
            else:
                return self._get_text_chunker()

        # Otherwise, infer from file type
        if file_type:
            if file_type == FileType.MARKDOWN:
                return self._get_markdown_chunker()
            elif file_type in (FileType.TEXT, FileType.JSON, FileType.YAML, FileType.TOML):
                return self._get_text_chunker()
            else:
                # Any remaining file type is treated as code
                return self._get_code_chunker()

        # Default to text chunker
        return self._get_text_chunker()

    def get_chunker_for_path(self, source_path: str) -> tuple[BaseChunker, FileType | None]:
        """
        Get chunker based on file path extension.

        Args:
            source_path: File path to chunk

        Returns:
            Tuple of (chunker, file_type); file_type is None when the
            extension is unknown (the default text chunker is returned).
        """
        # Extract the last extension, lowercased (e.g. "a.b.PY" -> ".py")
        ext = ""
        if "." in source_path:
            ext = "." + source_path.rsplit(".", 1)[-1].lower()

        file_type = FILE_EXTENSION_MAP.get(ext)
        chunker = self.get_chunker(file_type=file_type)

        return chunker, file_type

    def chunk_content(
        self,
        content: str,
        source_path: str | None = None,
        file_type: FileType | None = None,
        chunk_type: ChunkType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """
        Chunk content using appropriate strategy.

        Args:
            content: Content to chunk
            source_path: Source file path
            file_type: File type
            chunk_type: Explicit chunk type; always takes precedence over
                a file type inferred from ``source_path``
            metadata: Additional metadata

        Returns:
            List of chunks

        Raises:
            ChunkingError: If the selected chunker fails on the content.
        """
        # An explicit chunk_type always wins (matching get_chunker's
        # precedence); only fall back to path-based inference when neither
        # a file type nor a chunk type was supplied.
        if source_path and not file_type and not chunk_type:
            chunker, file_type = self.get_chunker_for_path(source_path)
        else:
            chunker = self.get_chunker(file_type=file_type, chunk_type=chunk_type)

        try:
            chunks = chunker.chunk(
                content=content,
                source_path=source_path,
                file_type=file_type,
                metadata=metadata,
            )

            logger.debug(
                f"Chunked content into {len(chunks)} chunks "
                f"(type={chunker.chunk_type.value})"
            )

            return chunks

        except Exception as e:
            logger.error(f"Chunking error: {e}")
            # Chain explicitly so the original traceback is preserved.
            raise ChunkingError(
                message=f"Failed to chunk content: {e}",
                cause=e,
            ) from e
|
||||
|
||||
|
||||
# Process-wide singleton factory, created lazily by get_chunker_factory()
# and cleared by reset_chunker_factory().
_chunker_factory: ChunkerFactory | None = None
|
||||
|
||||
|
||||
def get_chunker_factory() -> ChunkerFactory:
    """Return the process-wide ChunkerFactory, creating it on first use."""
    global _chunker_factory
    factory = _chunker_factory
    if factory is None:
        factory = ChunkerFactory()
        _chunker_factory = factory
    return factory
|
||||
|
||||
|
||||
def reset_chunker_factory() -> None:
    """Discard the cached global factory so the next access builds a fresh one.

    Intended for test isolation.
    """
    global _chunker_factory
    _chunker_factory = None
|
||||
Reference in New Issue
Block a user