syndarix/mcp-servers/knowledge-base/chunking/markdown.py
Felipe Cardoso 51404216ae refactor(knowledge-base mcp server): adjust formatting for consistency and readability
Improved code formatting, line breaks, and indentation across chunking logic and multiple test modules to enhance code clarity and maintain consistent style. No functional changes made.
2026-01-06 17:20:31 +01:00

"""
Markdown-aware chunking implementation.
Provides intelligent chunking for markdown content that respects
heading hierarchy and preserves document structure.
"""
import logging
import re
from typing import Any
from chunking.base import BaseChunker
from config import Settings
from models import Chunk, ChunkType, FileType
logger = logging.getLogger(__name__)
# Patterns for markdown elements
HEADING_PATTERN = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
CODE_BLOCK_PATTERN = re.compile(r"^```", re.MULTILINE)
HR_PATTERN = re.compile(r"^(-{3,}|_{3,}|\*{3,})$", re.MULTILINE)
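
# Example (illustrative): HEADING_PATTERN.match("## Usage") captures
# group(1) == "##" (heading level 2) and group(2) == "Usage". Note that
# CODE_BLOCK_PATTERN and HR_PATTERN are not referenced elsewhere in this
# module; fences are detected below via line.strip().startswith("```").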


class MarkdownChunker(BaseChunker):
    """
    Markdown-aware chunker that respects document structure.

    Features:
    - Respects heading hierarchy
    - Preserves heading context in chunks
    - Handles code blocks as units
    - Maintains list continuity where possible
    """

    def __init__(
        self,
        chunk_size: int,
        chunk_overlap: int,
        settings: Settings | None = None,
    ) -> None:
        """Initialize markdown chunker."""
        super().__init__(chunk_size, chunk_overlap, settings)

    @property
    def chunk_type(self) -> ChunkType:
        """Get chunk type."""
        return ChunkType.MARKDOWN

    def chunk(
        self,
        content: str,
        source_path: str | None = None,
        file_type: FileType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """
        Chunk markdown content.

        Splits on heading boundaries and preserves heading context.
        """
        if not content.strip():
            return []

        metadata = metadata or {}
        file_type = file_type or FileType.MARKDOWN

        # Split content into sections by headings
        sections = self._split_by_headings(content)
        if not sections:
            # No headings, chunk as plain text
            return self._chunk_text_block(
                content, source_path, file_type, metadata, []
            )

        chunks: list[Chunk] = []
        heading_stack: list[tuple[int, str]] = []  # (level, text)

        for section in sections:
            heading_level = section.get("level", 0)
            heading_text = section.get("heading", "")
            section_content = section.get("content", "")
            start_line = section.get("start_line", 1)
            end_line = section.get("end_line", 1)

            # Update heading stack
            if heading_level > 0:
                # Pop headings of equal or higher level
                while heading_stack and heading_stack[-1][0] >= heading_level:
                    heading_stack.pop()
                heading_stack.append((heading_level, heading_text))

            # Build heading context prefix
            heading_context = " > ".join(h[1] for h in heading_stack)

            section_chunks = self._chunk_section(
                content=section_content,
                heading_context=heading_context,
                heading_level=heading_level,
                heading_text=heading_text,
                start_line=start_line,
                end_line=end_line,
                source_path=source_path,
                file_type=file_type,
                metadata=metadata,
            )
            chunks.extend(section_chunks)

        return chunks

    def _split_by_headings(self, content: str) -> list[dict[str, Any]]:
        """Split content into sections by headings."""
        sections: list[dict[str, Any]] = []
        lines = content.split("\n")

        current_section: dict[str, Any] = {
            "level": 0,
            "heading": "",
            "content": "",
            "start_line": 1,
            "end_line": 1,
        }
        current_lines: list[str] = []
        in_code_block = False

        for i, line in enumerate(lines):
            # Track code blocks
            if line.strip().startswith("```"):
                in_code_block = not in_code_block
                current_lines.append(line)
                continue

            # Skip heading detection in code blocks
            if in_code_block:
                current_lines.append(line)
                continue

            # Check for heading
            heading_match = HEADING_PATTERN.match(line)
            if heading_match:
                # Save previous section
                if current_lines:
                    current_section["content"] = "\n".join(current_lines)
                    current_section["end_line"] = i
                    if current_section["content"].strip():
                        sections.append(current_section)

                # Start new section
                level = len(heading_match.group(1))
                heading_text = heading_match.group(2).strip()
                current_section = {
                    "level": level,
                    "heading": heading_text,
                    "content": "",
                    "start_line": i + 1,
                    "end_line": i + 1,
                }
                current_lines = [line]
            else:
                current_lines.append(line)

        # Save final section
        if current_lines:
            current_section["content"] = "\n".join(current_lines)
            current_section["end_line"] = len(lines)
            if current_section["content"].strip():
                sections.append(current_section)

        return sections

    def _chunk_section(
        self,
        content: str,
        heading_context: str,
        heading_level: int,
        heading_text: str,
        start_line: int,
        end_line: int,
        source_path: str | None,
        file_type: FileType,
        metadata: dict[str, Any],
    ) -> list[Chunk]:
        """Chunk a single section of markdown."""
        if not content.strip():
            return []

        token_count = self.count_tokens(content)

        # If section fits in one chunk, return as-is
        if token_count <= self.chunk_size:
            section_metadata = {
                **metadata,
                "heading_context": heading_context,
                "heading_level": heading_level,
                "heading_text": heading_text,
            }
            return [
                self._create_chunk(
                    content=content.strip(),
                    source_path=source_path,
                    start_line=start_line,
                    end_line=end_line,
                    file_type=file_type,
                    metadata=section_metadata,
                )
            ]

        # Need to split - try to split on paragraphs first
        return self._chunk_text_block(
            content,
            source_path,
            file_type,
            {
                **metadata,
                "heading_context": heading_context,
                "heading_level": heading_level,
                "heading_text": heading_text,
            },
            _heading_stack=[(heading_level, heading_text)] if heading_text else [],
            base_line=start_line,
        )

    def _chunk_text_block(
        self,
        content: str,
        source_path: str | None,
        file_type: FileType,
        metadata: dict[str, Any],
        _heading_stack: list[tuple[int, str]],
        base_line: int = 1,
    ) -> list[Chunk]:
        """Chunk a block of text by paragraphs."""
        # Split into paragraphs (separated by blank lines)
        paragraphs = self._split_into_paragraphs(content)
        if not paragraphs:
            return []

        chunks: list[Chunk] = []
        current_content: list[str] = []
        current_tokens = 0
        chunk_start_line = base_line

        for para_info in paragraphs:
            para_content = para_info["content"]
            para_tokens = para_info["tokens"]
            para_start = para_info["start_line"]

            # Handle very large paragraphs
            if para_tokens > self.chunk_size:
                # Flush current content
                if current_content:
                    chunk_text = "\n\n".join(current_content)
                    chunks.append(
                        self._create_chunk(
                            content=chunk_text.strip(),
                            source_path=source_path,
                            start_line=chunk_start_line,
                            end_line=base_line + para_start - 1,
                            file_type=file_type,
                            metadata=metadata,
                        )
                    )
                    current_content = []
                    current_tokens = 0

                # Split large paragraph by sentences/lines
                sub_chunks = self._split_large_paragraph(
                    para_content,
                    source_path,
                    file_type,
                    metadata,
                    base_line + para_start,
                )
                chunks.extend(sub_chunks)
                chunk_start_line = base_line + para_info["end_line"] + 1
                continue

            # Check if adding this paragraph exceeds limit
            if current_tokens + para_tokens > self.chunk_size and current_content:
                # Create chunk
                chunk_text = "\n\n".join(current_content)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text.strip(),
                        source_path=source_path,
                        start_line=chunk_start_line,
                        end_line=base_line + para_start - 1,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Overlap: include last paragraph if it fits
                if (
                    current_content
                    and self.count_tokens(current_content[-1]) <= self.chunk_overlap
                ):
                    current_content = [current_content[-1]]
                    current_tokens = self.count_tokens(current_content[-1])
                else:
                    current_content = []
                    current_tokens = 0
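                # Example: with chunk_overlap=64, a 40-token trailing
                # paragraph is repeated at the start of the next chunk,
                # while a 100-token one is not (counts illustrative).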
                chunk_start_line = base_line + para_start

            current_content.append(para_content)
            current_tokens += para_tokens

        # Final chunk
        if current_content:
            chunk_text = "\n\n".join(current_content)
            end_line_num = base_line + (paragraphs[-1]["end_line"] if paragraphs else 0)
            chunks.append(
                self._create_chunk(
                    content=chunk_text.strip(),
                    source_path=source_path,
                    start_line=chunk_start_line,
                    end_line=end_line_num,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks

    def _split_into_paragraphs(self, content: str) -> list[dict[str, Any]]:
        """Split content into paragraphs with metadata."""
        paragraphs: list[dict[str, Any]] = []
        lines = content.split("\n")

        current_para: list[str] = []
        para_start = 0
        in_code_block = False

        for i, line in enumerate(lines):
            # Track code blocks (keep them as single units)
            if line.strip().startswith("```"):
                if in_code_block:
                    # End of code block
                    current_para.append(line)
                    in_code_block = False
                else:
                    # Start of code block - save previous paragraph
                    if current_para and any(p.strip() for p in current_para):
                        para_content = "\n".join(current_para)
                        paragraphs.append(
                            {
                                "content": para_content,
                                "tokens": self.count_tokens(para_content),
                                "start_line": para_start,
                                "end_line": i - 1,
                            }
                        )
                    current_para = [line]
                    para_start = i
                    in_code_block = True
                continue

            if in_code_block:
                current_para.append(line)
                continue

            # Empty line indicates paragraph break
            if not line.strip():
                if current_para and any(p.strip() for p in current_para):
                    para_content = "\n".join(current_para)
                    paragraphs.append(
                        {
                            "content": para_content,
                            "tokens": self.count_tokens(para_content),
                            "start_line": para_start,
                            "end_line": i - 1,
                        }
                    )
                current_para = []
                para_start = i + 1
            else:
                if not current_para:
                    para_start = i
                current_para.append(line)

        # Final paragraph
        if current_para and any(p.strip() for p in current_para):
            para_content = "\n".join(current_para)
            paragraphs.append(
                {
                    "content": para_content,
                    "tokens": self.count_tokens(para_content),
                    "start_line": para_start,
                    "end_line": len(lines) - 1,
                }
            )

        return paragraphs

    def _split_large_paragraph(
        self,
        content: str,
        source_path: str | None,
        file_type: FileType,
        metadata: dict[str, Any],
        base_line: int,
    ) -> list[Chunk]:
        """Split a large paragraph into smaller chunks."""
        # Try splitting by sentences
        sentences = self._split_into_sentences(content)

        chunks: list[Chunk] = []
        current_content: list[str] = []
        current_tokens = 0

        for sentence in sentences:
            sentence_tokens = self.count_tokens(sentence)

            # If single sentence is too large, truncate
            if sentence_tokens > self.chunk_size:
                if current_content:
                    chunk_text = " ".join(current_content)
                    chunks.append(
                        self._create_chunk(
                            content=chunk_text.strip(),
                            source_path=source_path,
                            start_line=base_line,
                            end_line=base_line,
                            file_type=file_type,
                            metadata=metadata,
                        )
                    )
                    current_content = []
                    current_tokens = 0

                truncated = self.truncate_to_tokens(sentence, self.chunk_size)
                chunks.append(
                    self._create_chunk(
                        content=truncated.strip(),
                        source_path=source_path,
                        start_line=base_line,
                        end_line=base_line,
                        file_type=file_type,
                        metadata={**metadata, "truncated": True},
                    )
                )
                continue

            if current_tokens + sentence_tokens > self.chunk_size and current_content:
                chunk_text = " ".join(current_content)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text.strip(),
                        source_path=source_path,
                        start_line=base_line,
                        end_line=base_line,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Overlap with last sentence
                if (
                    current_content
                    and self.count_tokens(current_content[-1]) <= self.chunk_overlap
                ):
                    current_content = [current_content[-1]]
                    current_tokens = self.count_tokens(current_content[-1])
                else:
                    current_content = []
                    current_tokens = 0

            current_content.append(sentence)
            current_tokens += sentence_tokens

        # Final chunk
        if current_content:
            chunk_text = " ".join(current_content)
            chunks.append(
                self._create_chunk(
                    content=chunk_text.strip(),
                    source_path=source_path,
                    start_line=base_line,
                    end_line=base_line,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks

    def _split_into_sentences(self, text: str) -> list[str]:
        """Split text into sentences."""
        # Simple sentence splitting on common terminators
        # More sophisticated splitting could use nltk or spacy
        sentence_endings = re.compile(r"(?<=[.!?])\s+")
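        # Example: "One. Two! Three?" -> ["One.", "Two!", "Three?"]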
        sentences = sentence_endings.split(text)
        return [s.strip() for s in sentences if s.strip()]