feat(knowledge-base): implement Knowledge Base MCP Server (#57)
Implements RAG capabilities with pgvector for semantic search:
- Intelligent chunking strategies (code-aware, markdown-aware, text)
- Semantic search with vector similarity (HNSW index)
- Keyword search with PostgreSQL full-text search
- Hybrid search using Reciprocal Rank Fusion (RRF)
- Redis caching for embeddings
- Collection management (ingest, search, delete, stats)
- FastMCP tools: search_knowledge, ingest_content, delete_content, list_collections, get_collection_stats, update_document

Testing:
- 128 comprehensive tests covering all components
- 58% code coverage (database integration tests use mocks)
- Passes ruff linting and mypy type checking

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
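For context on the hybrid search listed above, here is a minimal sketch of how Reciprocal Rank Fusion can combine a semantic result list with a keyword result list. The function name, the k constant (60 is a commonly used default), and the input shape (document ids ordered best-first) are illustrative and are not taken from this commit.

# Minimal RRF sketch (illustrative names and defaults, not from this commit)
def rrf_merge(semantic_ids: list[str], keyword_ids: list[str], k: int = 60) -> list[str]:
    scores: dict[str, float] = {}
    for ranking in (semantic_ids, keyword_ids):
        for rank, doc_id in enumerate(ranking, start=1):
            # Each list contributes 1 / (k + rank) for every document it returns
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    # Highest fused score first
    return sorted(scores, key=scores.__getitem__, reverse=True)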
mcp-servers/knowledge-base/chunking/text.py (new file, 389 lines)
@@ -0,0 +1,389 @@
"""
Plain text chunking implementation.

Provides simple text chunking with paragraph and sentence
boundary detection.
"""

import logging
import re
from typing import Any

from chunking.base import BaseChunker
from config import Settings
from models import Chunk, ChunkType, FileType

logger = logging.getLogger(__name__)


class TextChunker(BaseChunker):
    """
    Plain text chunker with paragraph awareness.

    Features:
    - Splits on paragraph boundaries
    - Falls back to sentence/word boundaries
    - Configurable overlap for context preservation
    """

    def __init__(
        self,
        chunk_size: int,
        chunk_overlap: int,
        settings: Settings | None = None,
    ) -> None:
        """Initialize text chunker."""
        super().__init__(chunk_size, chunk_overlap, settings)

    @property
    def chunk_type(self) -> ChunkType:
        """Get chunk type."""
        return ChunkType.TEXT

    def chunk(
        self,
        content: str,
        source_path: str | None = None,
        file_type: FileType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """
        Chunk plain text content.

        Tries paragraph boundaries first, then sentences.
        """
        if not content.strip():
            return []

        metadata = metadata or {}

        # Check if content fits in a single chunk
        total_tokens = self.count_tokens(content)
        if total_tokens <= self.chunk_size:
            return [
                self._create_chunk(
                    content=content.strip(),
                    source_path=source_path,
                    start_line=1,
                    end_line=content.count("\n") + 1,
                    file_type=file_type,
                    metadata=metadata,
                )
            ]

        # Try paragraph-based chunking
        paragraphs = self._split_paragraphs(content)
        if len(paragraphs) > 1:
            return self._chunk_by_paragraphs(
                paragraphs, source_path, file_type, metadata
            )

        # Fall back to sentence-based chunking
        return self._chunk_by_sentences(
            content, source_path, file_type, metadata
        )
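
    # Dispatch sketch (token counts are illustrative): with chunk_size=512, a
    # 300-token note comes back as a single chunk, a multi-paragraph document
    # is packed paragraph-by-paragraph, and one huge unbroken paragraph falls
    # through to sentence-based chunking.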

    def _split_paragraphs(self, content: str) -> list[dict[str, Any]]:
        """Split content into paragraphs."""
        paragraphs: list[dict[str, Any]] = []

        # Split on double newlines (paragraph boundaries)
        raw_paras = re.split(r"\n\s*\n", content)

        line_num = 1
        for para in raw_paras:
            para = para.strip()
            if not para:
                continue

            para_lines = para.count("\n") + 1
            paragraphs.append({
                "content": para,
                "tokens": self.count_tokens(para),
                "start_line": line_num,
                "end_line": line_num + para_lines - 1,
            })
            line_num += para_lines + 1  # +1 for blank line between paragraphs

        return paragraphs
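
    # Worked example (token counts depend on the tokenizer and are omitted):
    # "Intro line.\n\nBody one.\nBody two." splits into
    #     {"content": "Intro line.", "start_line": 1, "end_line": 1, ...}
    #     {"content": "Body one.\nBody two.", "start_line": 3, "end_line": 4, ...}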

    def _chunk_by_paragraphs(
        self,
        paragraphs: list[dict[str, Any]],
        source_path: str | None,
        file_type: FileType | None,
        metadata: dict[str, Any],
    ) -> list[Chunk]:
        """Chunk by combining paragraphs up to size limit."""
        chunks: list[Chunk] = []
        current_paras: list[str] = []
        current_tokens = 0
        chunk_start = paragraphs[0]["start_line"] if paragraphs else 1
        chunk_end = chunk_start

        for para in paragraphs:
            para_content = para["content"]
            para_tokens = para["tokens"]

            # Handle paragraphs larger than chunk size
            if para_tokens > self.chunk_size:
                # Flush current content
                if current_paras:
                    chunk_text = "\n\n".join(current_paras)
                    chunks.append(
                        self._create_chunk(
                            content=chunk_text,
                            source_path=source_path,
                            start_line=chunk_start,
                            end_line=chunk_end,
                            file_type=file_type,
                            metadata=metadata,
                        )
                    )
                    current_paras = []
                    current_tokens = 0

                # Split large paragraph
                sub_chunks = self._split_large_text(
                    para_content,
                    source_path,
                    file_type,
                    metadata,
                    para["start_line"],
                )
                chunks.extend(sub_chunks)
                chunk_start = para["end_line"] + 1
                chunk_end = chunk_start
                continue

            # Check if adding paragraph exceeds limit
            if current_tokens + para_tokens > self.chunk_size and current_paras:
                chunk_text = "\n\n".join(current_paras)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text,
                        source_path=source_path,
                        start_line=chunk_start,
                        end_line=chunk_end,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Overlap: keep last paragraph if small enough
                overlap_para = None
                if current_paras and self.count_tokens(current_paras[-1]) <= self.chunk_overlap:
                    overlap_para = current_paras[-1]

                current_paras = [overlap_para] if overlap_para else []
                current_tokens = self.count_tokens(overlap_para) if overlap_para else 0
                chunk_start = para["start_line"]

            current_paras.append(para_content)
            current_tokens += para_tokens
            chunk_end = para["end_line"]

        # Final chunk
        if current_paras:
            chunk_text = "\n\n".join(current_paras)
            chunks.append(
                self._create_chunk(
                    content=chunk_text,
                    source_path=source_path,
                    start_line=chunk_start,
                    end_line=chunk_end,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks
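
    # Overlap sketch (sizes are illustrative): with chunk_size=100 and
    # chunk_overlap=20, paragraphs of 60, 60 and 30 tokens yield two chunks;
    # the 60-token paragraph exceeds the overlap budget, so nothing is carried
    # into the second chunk.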

    def _chunk_by_sentences(
        self,
        content: str,
        source_path: str | None,
        file_type: FileType | None,
        metadata: dict[str, Any],
    ) -> list[Chunk]:
        """Chunk by sentences."""
        sentences = self._split_sentences(content)

        if not sentences:
            return []

        chunks: list[Chunk] = []
        current_sentences: list[str] = []
        current_tokens = 0

        for sentence in sentences:
            sentence_tokens = self.count_tokens(sentence)

            # Handle sentences larger than chunk size
            if sentence_tokens > self.chunk_size:
                if current_sentences:
                    chunk_text = " ".join(current_sentences)
                    chunks.append(
                        self._create_chunk(
                            content=chunk_text,
                            source_path=source_path,
                            start_line=1,
                            end_line=1,
                            file_type=file_type,
                            metadata=metadata,
                        )
                    )
                    current_sentences = []
                    current_tokens = 0

                # Truncate large sentence
                truncated = self.truncate_to_tokens(sentence, self.chunk_size)
                chunks.append(
                    self._create_chunk(
                        content=truncated,
                        source_path=source_path,
                        start_line=1,
                        end_line=1,
                        file_type=file_type,
                        metadata={**metadata, "truncated": True},
                    )
                )
                continue

            # Check if adding sentence exceeds limit
            if current_tokens + sentence_tokens > self.chunk_size and current_sentences:
                chunk_text = " ".join(current_sentences)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text,
                        source_path=source_path,
                        start_line=1,
                        end_line=1,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Overlap: keep last sentence if small enough
                overlap = None
                if current_sentences and self.count_tokens(current_sentences[-1]) <= self.chunk_overlap:
                    overlap = current_sentences[-1]

                current_sentences = [overlap] if overlap else []
                current_tokens = self.count_tokens(overlap) if overlap else 0

            current_sentences.append(sentence)
            current_tokens += sentence_tokens

        # Final chunk
        if current_sentences:
            chunk_text = " ".join(current_sentences)
            chunks.append(
                self._create_chunk(
                    content=chunk_text,
                    source_path=source_path,
                    start_line=1,
                    end_line=content.count("\n") + 1,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks

    def _split_sentences(self, text: str) -> list[str]:
        """Split text into sentences."""
        # Handle common sentence endings
        # This is a simple approach - production might use nltk or spacy
        sentence_pattern = re.compile(
            r"(?<=[.!?])\s+(?=[A-Z])|"  # Standard sentence ending
            r"(?<=[.!?])\s*$|"  # End of text
            r"(?<=\n)\s*(?=\S)"  # Newlines as boundaries
        )

        sentences = sentence_pattern.split(text)
        return [s.strip() for s in sentences if s.strip()]
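
    # Example split: "Hello world. How are you?\nFine."
    # -> ["Hello world.", "How are you?", "Fine."]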

    def _split_large_text(
        self,
        text: str,
        source_path: str | None,
        file_type: FileType | None,
        metadata: dict[str, Any],
        base_line: int,
    ) -> list[Chunk]:
        """Split text that exceeds chunk size."""
        # First try sentences
        sentences = self._split_sentences(text)

        if len(sentences) > 1:
            return self._chunk_by_sentences(
                text, source_path, file_type, metadata
            )

        # Fall back to word-based splitting
        return self._chunk_by_words(
            text, source_path, file_type, metadata, base_line
        )

    def _chunk_by_words(
        self,
        text: str,
        source_path: str | None,
        file_type: FileType | None,
        metadata: dict[str, Any],
        base_line: int,
    ) -> list[Chunk]:
        """Last resort: chunk by words."""
        words = text.split()
        chunks: list[Chunk] = []
        current_words: list[str] = []
        current_tokens = 0

        for word in words:
            word_tokens = self.count_tokens(word + " ")

            if current_tokens + word_tokens > self.chunk_size and current_words:
                chunk_text = " ".join(current_words)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text,
                        source_path=source_path,
                        start_line=base_line,
                        end_line=base_line,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Word overlap
                overlap_count = 0
                overlap_words: list[str] = []
                for w in reversed(current_words):
                    w_tokens = self.count_tokens(w + " ")
                    if overlap_count + w_tokens > self.chunk_overlap:
                        break
                    overlap_words.insert(0, w)
                    overlap_count += w_tokens

                current_words = overlap_words
                current_tokens = overlap_count

            current_words.append(word)
            current_tokens += word_tokens

        # Final chunk
        if current_words:
            chunk_text = " ".join(current_words)
            chunks.append(
                self._create_chunk(
                    content=chunk_text,
                    source_path=source_path,
                    start_line=base_line,
                    end_line=base_line,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks