# syndarix/mcp-servers/knowledge-base/chunking/base.py
"""
Base chunker implementation.
Provides abstract interface and common utilities for content chunking.
"""
import logging
from abc import ABC, abstractmethod
from typing import Any
import tiktoken
from config import Settings, get_settings
from exceptions import ChunkingError
from models import FILE_EXTENSION_MAP, Chunk, ChunkType, FileType
logger = logging.getLogger(__name__)
class BaseChunker(ABC):
    """
    Abstract base class for content chunkers.

    Subclasses implement specific chunking strategies for
    different content types (code, markdown, text).
    """

    def __init__(
        self,
        chunk_size: int,
        chunk_overlap: int,
        settings: Settings | None = None,
    ) -> None:
        """
        Initialize the chunker.

        Args:
            chunk_size: Target number of tokens per chunk
            chunk_overlap: Token overlap between consecutive chunks
            settings: Application settings
        """
        self._settings = settings or get_settings()
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        # Use the cl100k_base encoding (GPT-4 / text-embedding-3 models)
        self._tokenizer = tiktoken.get_encoding("cl100k_base")

    def count_tokens(self, text: str) -> int:
        """Count tokens in text."""
        return len(self._tokenizer.encode(text))

    def truncate_to_tokens(self, text: str, max_tokens: int) -> str:
        """Truncate text to at most max_tokens tokens."""
        tokens = self._tokenizer.encode(text)
        if len(tokens) <= max_tokens:
            return text
        return self._tokenizer.decode(tokens[:max_tokens])

    @abstractmethod
    def chunk(
        self,
        content: str,
        source_path: str | None = None,
        file_type: FileType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """
        Split content into chunks.

        Args:
            content: Content to chunk
            source_path: Source file path for reference
            file_type: File type for specialized handling
            metadata: Additional metadata to include

        Returns:
            List of Chunk objects
        """

    @property
    @abstractmethod
    def chunk_type(self) -> ChunkType:
        """Get the chunk type this chunker produces."""

    def _create_chunk(
        self,
        content: str,
        source_path: str | None = None,
        start_line: int | None = None,
        end_line: int | None = None,
        file_type: FileType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> Chunk:
        """Create a chunk with its token count precomputed."""
        token_count = self.count_tokens(content)
        return Chunk(
            content=content,
            chunk_type=self.chunk_type,
            file_type=file_type,
            source_path=source_path,
            start_line=start_line,
            end_line=end_line,
            metadata=metadata or {},
            token_count=token_count,
        )
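

# --- Illustrative sketch, not part of the server ---------------------------
# A hypothetical minimal subclass showing the contract BaseChunker expects:
# implement chunk() and the chunk_type property, and build results through
# _create_chunk(). It slides a fixed token window with chunk_overlap tokens
# of overlap; the real TextChunker presumably also respects natural
# boundaries such as sentences and paragraphs.
class SimpleWindowChunker(BaseChunker):
    """Example chunker: fixed-size token windows with overlap."""

    @property
    def chunk_type(self) -> ChunkType:
        return ChunkType.TEXT

    def chunk(
        self,
        content: str,
        source_path: str | None = None,
        file_type: FileType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        tokens = self._tokenizer.encode(content)
        # Advance by chunk_size minus the overlap, never by less than one token.
        step = max(self.chunk_size - self.chunk_overlap, 1)
        chunks: list[Chunk] = []
        for start in range(0, len(tokens), step):
            window = tokens[start : start + self.chunk_size]
            chunks.append(
                self._create_chunk(
                    content=self._tokenizer.decode(window),
                    source_path=source_path,
                    file_type=file_type,
                    metadata=metadata,
                )
            )
            # Stop once the window has reached the end of the token stream.
            if start + self.chunk_size >= len(tokens):
                break
        return chunks
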
class ChunkerFactory:
    """
    Factory for creating appropriate chunkers.

    Selects the best chunker based on file type or content and
    caches one instance of each chunker kind.
    """

    def __init__(self, settings: Settings | None = None) -> None:
        """Initialize the factory."""
        self._settings = settings or get_settings()
        self._chunkers: dict[str, BaseChunker] = {}

    def _get_code_chunker(self) -> BaseChunker:
        """Get or lazily create the code chunker."""
        # Imported locally so chunker modules are only loaded on first use.
        from chunking.code import CodeChunker

        if "code" not in self._chunkers:
            self._chunkers["code"] = CodeChunker(
                chunk_size=self._settings.code_chunk_size,
                chunk_overlap=self._settings.code_chunk_overlap,
                settings=self._settings,
            )
        return self._chunkers["code"]

    def _get_markdown_chunker(self) -> BaseChunker:
        """Get or lazily create the markdown chunker."""
        from chunking.markdown import MarkdownChunker

        if "markdown" not in self._chunkers:
            self._chunkers["markdown"] = MarkdownChunker(
                chunk_size=self._settings.markdown_chunk_size,
                chunk_overlap=self._settings.markdown_chunk_overlap,
                settings=self._settings,
            )
        return self._chunkers["markdown"]

    def _get_text_chunker(self) -> BaseChunker:
        """Get or lazily create the text chunker."""
        from chunking.text import TextChunker

        if "text" not in self._chunkers:
            self._chunkers["text"] = TextChunker(
                chunk_size=self._settings.text_chunk_size,
                chunk_overlap=self._settings.text_chunk_overlap,
                settings=self._settings,
            )
        return self._chunkers["text"]
    def get_chunker(
        self,
        file_type: FileType | None = None,
        chunk_type: ChunkType | None = None,
    ) -> BaseChunker:
        """
        Get the appropriate chunker for a content type.

        Args:
            file_type: File type to chunk
            chunk_type: Explicit chunk type to use

        Returns:
            Appropriate chunker instance
        """
        # If an explicit chunk type is specified, use it
        if chunk_type:
            if chunk_type == ChunkType.CODE:
                return self._get_code_chunker()
            elif chunk_type == ChunkType.MARKDOWN:
                return self._get_markdown_chunker()
            else:
                return self._get_text_chunker()

        # Otherwise, infer from the file type
        if file_type:
            if file_type == FileType.MARKDOWN:
                return self._get_markdown_chunker()
            elif file_type in (FileType.TEXT, FileType.JSON, FileType.YAML, FileType.TOML):
                return self._get_text_chunker()
            else:
                # Code files
                return self._get_code_chunker()

        # Default to the text chunker
        return self._get_text_chunker()
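
    # Selection at a glance (illustrative, using the ChunkType/FileType
    # members referenced above):
    #   get_chunker(chunk_type=ChunkType.CODE)    -> CodeChunker
    #   get_chunker(file_type=FileType.MARKDOWN)  -> MarkdownChunker
    #   get_chunker(file_type=FileType.JSON)      -> TextChunker
    #   get_chunker()                             -> TextChunker (default)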
    def get_chunker_for_path(self, source_path: str) -> tuple[BaseChunker, FileType | None]:
        """
        Get a chunker based on the file path extension.

        Args:
            source_path: File path to chunk

        Returns:
            Tuple of (chunker, file_type)
        """
        # Extract the extension
        ext = ""
        if "." in source_path:
            ext = "." + source_path.rsplit(".", 1)[-1].lower()

        file_type = FILE_EXTENSION_MAP.get(ext)
        chunker = self.get_chunker(file_type=file_type)
        return chunker, file_type
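
    # For example (assuming ".py" maps to a code file type in
    # FILE_EXTENSION_MAP): get_chunker_for_path("src/app.py") returns
    # (CodeChunker, <code FileType>), while an unknown extension yields
    # (TextChunker, None) through the default branch of get_chunker().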
    def chunk_content(
        self,
        content: str,
        source_path: str | None = None,
        file_type: FileType | None = None,
        chunk_type: ChunkType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """
        Chunk content using the appropriate strategy.

        Args:
            content: Content to chunk
            source_path: Source file path
            file_type: File type
            chunk_type: Explicit chunk type (takes precedence over inference)
            metadata: Additional metadata

        Returns:
            List of chunks
        """
        # Infer the file type from the path only when neither an explicit
        # chunk type nor a file type was given.
        if source_path and not file_type and not chunk_type:
            chunker, file_type = self.get_chunker_for_path(source_path)
        else:
            chunker = self.get_chunker(file_type=file_type, chunk_type=chunk_type)

        try:
            chunks = chunker.chunk(
                content=content,
                source_path=source_path,
                file_type=file_type,
                metadata=metadata,
            )
            logger.debug(
                f"Chunked content into {len(chunks)} chunks "
                f"(type={chunker.chunk_type.value})"
            )
            return chunks
        except Exception as e:
            logger.error(f"Chunking error: {e}")
            raise ChunkingError(
                message=f"Failed to chunk content: {e}",
                cause=e,
            ) from e
# Global chunker factory instance
_chunker_factory: ChunkerFactory | None = None


def get_chunker_factory() -> ChunkerFactory:
    """Get the global chunker factory instance."""
    global _chunker_factory
    if _chunker_factory is None:
        _chunker_factory = ChunkerFactory()
    return _chunker_factory


def reset_chunker_factory() -> None:
    """Reset the global chunker factory (for testing)."""
    global _chunker_factory
    _chunker_factory = None
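

# --- Usage sketch (assumes Settings can be constructed and the concrete
# chunker modules are importable, and that ".md" is in FILE_EXTENSION_MAP) --
if __name__ == "__main__":
    factory = get_chunker_factory()
    sample = "# Title\n\nSome markdown body text.\n"
    # The ".md" extension should route this to the markdown chunker.
    for chunk in factory.chunk_content(sample, source_path="docs/readme.md"):
        print(chunk.chunk_type, chunk.token_count, repr(chunk.content[:40]))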