feat(knowledge-base): implement Knowledge Base MCP Server (#57)
Implements RAG capabilities with pgvector for semantic search:

- Intelligent chunking strategies (code-aware, markdown-aware, text)
- Semantic search with vector similarity (HNSW index)
- Keyword search with PostgreSQL full-text search
- Hybrid search using Reciprocal Rank Fusion (RRF)
- Redis caching for embeddings
- Collection management (ingest, search, delete, stats)
- FastMCP tools: search_knowledge, ingest_content, delete_content, list_collections, get_collection_stats, update_document

Testing:

- 128 comprehensive tests covering all components
- 58% code coverage (database integration tests use mocks)
- Passes ruff linting and mypy type checking

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
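The hybrid search fuses the semantic and keyword rankings with Reciprocal Rank Fusion. A rough sketch of RRF scoring (hypothetical function name and signature; the server's actual implementation lives in its search module, not in this diff):

def rrf_merge(result_lists: list[list[str]], k: int = 60) -> list[tuple[str, float]]:
    """Fuse several ranked ID lists; score(d) = sum over lists of 1 / (k + rank)."""
    scores: dict[str, float] = {}
    for results in result_lists:  # e.g. [semantic_ids, keyword_ids]
        for rank, doc_id in enumerate(results, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)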
483
mcp-servers/knowledge-base/chunking/markdown.py
Normal file
@@ -0,0 +1,483 @@
"""
Markdown-aware chunking implementation.

Provides intelligent chunking for markdown content that respects
heading hierarchy and preserves document structure.
"""

import logging
import re
from typing import Any

from chunking.base import BaseChunker
from config import Settings
from models import Chunk, ChunkType, FileType

logger = logging.getLogger(__name__)

# Patterns for markdown elements
HEADING_PATTERN = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
CODE_BLOCK_PATTERN = re.compile(r"^```", re.MULTILINE)
HR_PATTERN = re.compile(r"^(-{3,}|_{3,}|\*{3,})$", re.MULTILINE)

class MarkdownChunker(BaseChunker):
    """
    Markdown-aware chunker that respects document structure.

    Features:
    - Respects heading hierarchy
    - Preserves heading context in chunks
    - Handles code blocks as units
    - Maintains list continuity where possible
    """

    def __init__(
        self,
        chunk_size: int,
        chunk_overlap: int,
        settings: Settings | None = None,
    ) -> None:
        """Initialize markdown chunker."""
        super().__init__(chunk_size, chunk_overlap, settings)

    @property
    def chunk_type(self) -> ChunkType:
        """Get chunk type."""
        return ChunkType.MARKDOWN

    def chunk(
        self,
        content: str,
        source_path: str | None = None,
        file_type: FileType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """
        Chunk markdown content.

        Splits on heading boundaries and preserves heading context.
        """
        if not content.strip():
            return []

        metadata = metadata or {}
        file_type = file_type or FileType.MARKDOWN

        # Split content into sections by headings
        sections = self._split_by_headings(content)

        if not sections:
            # No headings, chunk as plain text
            return self._chunk_text_block(
                content, source_path, file_type, metadata, []
            )

        chunks: list[Chunk] = []
        heading_stack: list[tuple[int, str]] = []  # (level, text)

        for section in sections:
            heading_level = section.get("level", 0)
            heading_text = section.get("heading", "")
            section_content = section.get("content", "")
            start_line = section.get("start_line", 1)
            end_line = section.get("end_line", 1)

            # Update heading stack
            if heading_level > 0:
                # Pop headings of equal or higher level
                while heading_stack and heading_stack[-1][0] >= heading_level:
                    heading_stack.pop()
                heading_stack.append((heading_level, heading_text))

            # Build heading context prefix
            heading_context = " > ".join(h[1] for h in heading_stack)

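            # e.g. for "# Guide" followed by "## Install" and "## Usage", chunks
            # from the "Usage" section carry heading_context "Guide > Usage"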
            section_chunks = self._chunk_section(
                content=section_content,
                heading_context=heading_context,
                heading_level=heading_level,
                heading_text=heading_text,
                start_line=start_line,
                end_line=end_line,
                source_path=source_path,
                file_type=file_type,
                metadata=metadata,
            )
            chunks.extend(section_chunks)

        return chunks

    def _split_by_headings(self, content: str) -> list[dict[str, Any]]:
        """Split content into sections by headings."""
        sections: list[dict[str, Any]] = []
        lines = content.split("\n")

        current_section: dict[str, Any] = {
            "level": 0,
            "heading": "",
            "content": "",
            "start_line": 1,
            "end_line": 1,
        }
        current_lines: list[str] = []
        in_code_block = False

        for i, line in enumerate(lines):
            # Track code blocks
            if line.strip().startswith("```"):
                in_code_block = not in_code_block
                current_lines.append(line)
                continue

            # Skip heading detection in code blocks
            if in_code_block:
                current_lines.append(line)
                continue

            # Check for heading
            heading_match = HEADING_PATTERN.match(line)
            if heading_match:
                # Save previous section
                if current_lines:
                    current_section["content"] = "\n".join(current_lines)
                    current_section["end_line"] = i
                    if current_section["content"].strip():
                        sections.append(current_section)

                # Start new section
                level = len(heading_match.group(1))
                heading_text = heading_match.group(2).strip()
                current_section = {
                    "level": level,
                    "heading": heading_text,
                    "content": "",
                    "start_line": i + 1,
                    "end_line": i + 1,
                }
                current_lines = [line]
            else:
                current_lines.append(line)

        # Save final section
        if current_lines:
            current_section["content"] = "\n".join(current_lines)
            current_section["end_line"] = len(lines)
            if current_section["content"].strip():
                sections.append(current_section)

        return sections

    def _chunk_section(
        self,
        content: str,
        heading_context: str,
        heading_level: int,
        heading_text: str,
        start_line: int,
        end_line: int,
        source_path: str | None,
        file_type: FileType,
        metadata: dict[str, Any],
    ) -> list[Chunk]:
        """Chunk a single section of markdown."""
        if not content.strip():
            return []

        token_count = self.count_tokens(content)

        # If section fits in one chunk, return as-is
        if token_count <= self.chunk_size:
            section_metadata = {
                **metadata,
                "heading_context": heading_context,
                "heading_level": heading_level,
                "heading_text": heading_text,
            }
            return [
                self._create_chunk(
                    content=content.strip(),
                    source_path=source_path,
                    start_line=start_line,
                    end_line=end_line,
                    file_type=file_type,
                    metadata=section_metadata,
                )
            ]

        # Need to split - try to split on paragraphs first
        return self._chunk_text_block(
            content,
            source_path,
            file_type,
            {
                **metadata,
                "heading_context": heading_context,
                "heading_level": heading_level,
                "heading_text": heading_text,
            },
            _heading_stack=[(heading_level, heading_text)] if heading_text else [],
            base_line=start_line,
        )

    def _chunk_text_block(
        self,
        content: str,
        source_path: str | None,
        file_type: FileType,
        metadata: dict[str, Any],
        _heading_stack: list[tuple[int, str]],
        base_line: int = 1,
    ) -> list[Chunk]:
        """Chunk a block of text by paragraphs."""
        # Split into paragraphs (separated by blank lines)
        paragraphs = self._split_into_paragraphs(content)

        if not paragraphs:
            return []

        chunks: list[Chunk] = []
        current_content: list[str] = []
        current_tokens = 0
        chunk_start_line = base_line

        for para_info in paragraphs:
            para_content = para_info["content"]
            para_tokens = para_info["tokens"]
            para_start = para_info["start_line"]

            # Handle very large paragraphs
            if para_tokens > self.chunk_size:
                # Flush current content
                if current_content:
                    chunk_text = "\n\n".join(current_content)
                    chunks.append(
                        self._create_chunk(
                            content=chunk_text.strip(),
                            source_path=source_path,
                            start_line=chunk_start_line,
                            end_line=base_line + para_start - 1,
                            file_type=file_type,
                            metadata=metadata,
                        )
                    )
                    current_content = []
                    current_tokens = 0

                # Split large paragraph by sentences/lines
                sub_chunks = self._split_large_paragraph(
                    para_content,
                    source_path,
                    file_type,
                    metadata,
                    base_line + para_start,
                )
                chunks.extend(sub_chunks)
                chunk_start_line = base_line + para_info["end_line"] + 1
                continue

            # Check if adding this paragraph exceeds limit
            if current_tokens + para_tokens > self.chunk_size and current_content:
                # Create chunk
                chunk_text = "\n\n".join(current_content)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text.strip(),
                        source_path=source_path,
                        start_line=chunk_start_line,
                        end_line=base_line + para_start - 1,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Overlap: include last paragraph if it fits
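                # (e.g. with chunk_overlap=50 tokens, a 40-token closing paragraph
                # is repeated at the start of the next chunk to preserve continuity)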
                if current_content and self.count_tokens(current_content[-1]) <= self.chunk_overlap:
                    current_content = [current_content[-1]]
                    current_tokens = self.count_tokens(current_content[-1])
                else:
                    current_content = []
                    current_tokens = 0

                chunk_start_line = base_line + para_start

            current_content.append(para_content)
            current_tokens += para_tokens

        # Final chunk
        if current_content:
            chunk_text = "\n\n".join(current_content)
            end_line_num = base_line + (paragraphs[-1]["end_line"] if paragraphs else 0)
            chunks.append(
                self._create_chunk(
                    content=chunk_text.strip(),
                    source_path=source_path,
                    start_line=chunk_start_line,
                    end_line=end_line_num,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks

    def _split_into_paragraphs(self, content: str) -> list[dict[str, Any]]:
        """Split content into paragraphs with metadata."""
        paragraphs: list[dict[str, Any]] = []
        lines = content.split("\n")

        current_para: list[str] = []
        para_start = 0
        in_code_block = False

        for i, line in enumerate(lines):
            # Track code blocks (keep them as single units)
            if line.strip().startswith("```"):
                if in_code_block:
                    # End of code block
                    current_para.append(line)
                    in_code_block = False
                else:
                    # Start of code block - save previous paragraph
                    if current_para and any(p.strip() for p in current_para):
                        para_content = "\n".join(current_para)
                        paragraphs.append({
                            "content": para_content,
                            "tokens": self.count_tokens(para_content),
                            "start_line": para_start,
                            "end_line": i - 1,
                        })
                    current_para = [line]
                    para_start = i
                    in_code_block = True
                continue

            if in_code_block:
                current_para.append(line)
                continue

            # Empty line indicates paragraph break
            if not line.strip():
                if current_para and any(p.strip() for p in current_para):
                    para_content = "\n".join(current_para)
                    paragraphs.append({
                        "content": para_content,
                        "tokens": self.count_tokens(para_content),
                        "start_line": para_start,
                        "end_line": i - 1,
                    })
                current_para = []
                para_start = i + 1
            else:
                if not current_para:
                    para_start = i
                current_para.append(line)

        # Final paragraph
        if current_para and any(p.strip() for p in current_para):
            para_content = "\n".join(current_para)
            paragraphs.append({
                "content": para_content,
                "tokens": self.count_tokens(para_content),
                "start_line": para_start,
                "end_line": len(lines) - 1,
            })

        return paragraphs

    def _split_large_paragraph(
        self,
        content: str,
        source_path: str | None,
        file_type: FileType,
        metadata: dict[str, Any],
        base_line: int,
    ) -> list[Chunk]:
        """Split a large paragraph into smaller chunks."""
        # Try splitting by sentences
        sentences = self._split_into_sentences(content)

        chunks: list[Chunk] = []
        current_content: list[str] = []
        current_tokens = 0

        for sentence in sentences:
            sentence_tokens = self.count_tokens(sentence)

            # If single sentence is too large, truncate
            if sentence_tokens > self.chunk_size:
                if current_content:
                    chunk_text = " ".join(current_content)
                    chunks.append(
                        self._create_chunk(
                            content=chunk_text.strip(),
                            source_path=source_path,
                            start_line=base_line,
                            end_line=base_line,
                            file_type=file_type,
                            metadata=metadata,
                        )
                    )
                    current_content = []
                    current_tokens = 0

                truncated = self.truncate_to_tokens(sentence, self.chunk_size)
                chunks.append(
                    self._create_chunk(
                        content=truncated.strip(),
                        source_path=source_path,
                        start_line=base_line,
                        end_line=base_line,
                        file_type=file_type,
                        metadata={**metadata, "truncated": True},
                    )
                )
                continue

            if current_tokens + sentence_tokens > self.chunk_size and current_content:
                chunk_text = " ".join(current_content)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text.strip(),
                        source_path=source_path,
                        start_line=base_line,
                        end_line=base_line,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Overlap with last sentence
                if current_content and self.count_tokens(current_content[-1]) <= self.chunk_overlap:
                    current_content = [current_content[-1]]
                    current_tokens = self.count_tokens(current_content[-1])
                else:
                    current_content = []
                    current_tokens = 0

            current_content.append(sentence)
            current_tokens += sentence_tokens

        # Final chunk
        if current_content:
            chunk_text = " ".join(current_content)
            chunks.append(
                self._create_chunk(
                    content=chunk_text.strip(),
                    source_path=source_path,
                    start_line=base_line,
                    end_line=base_line,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks

    def _split_into_sentences(self, text: str) -> list[str]:
        """Split text into sentences."""
        # Simple sentence splitting on common terminators
        # More sophisticated splitting could use nltk or spacy
        sentence_endings = re.compile(r"(?<=[.!?])\s+")
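        # e.g. "First point. Second point! Third?" splits into
        # ["First point.", "Second point!", "Third?"]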
        sentences = sentence_endings.split(text)
        return [s.strip() for s in sentences if s.strip()]
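A minimal usage sketch for the chunker above (assuming BaseChunker takes token-based chunk_size/chunk_overlap and Chunk exposes content and metadata attributes, as the code suggests):

from chunking.markdown import MarkdownChunker
from models import FileType

chunker = MarkdownChunker(chunk_size=512, chunk_overlap=64)
chunks = chunker.chunk(
    "# Guide\n\nIntro paragraph.\n\n## Install\n\nRun the installer.\n",
    source_path="docs/guide.md",
    file_type=FileType.MARKDOWN,
)
for chunk in chunks:
    print(chunk.metadata.get("heading_context"), chunk.content[:40])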