"""
|
|
Markdown-aware chunking implementation.
|
|
|
|
Provides intelligent chunking for markdown content that respects
|
|
heading hierarchy and preserves document structure.
|
|
"""
|
|
|
|
import logging
|
|
import re
|
|
from typing import Any
|
|
|
|
from chunking.base import BaseChunker
|
|
from config import Settings
|
|
from models import Chunk, ChunkType, FileType
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Patterns for markdown elements
|
|
HEADING_PATTERN = re.compile(r"^(#{1,6})\s+(.+)$", re.MULTILINE)
|
|
CODE_BLOCK_PATTERN = re.compile(r"^```", re.MULTILINE)
|
|
HR_PATTERN = re.compile(r"^(-{3,}|_{3,}|\*{3,})$", re.MULTILINE)
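# e.g. HEADING_PATTERN matches "## Usage" with group(1) == "##" (level 2)
# and group(2) == "Usage"; lines inside fenced code blocks are excluded
# during line scanning in _split_by_headings rather than by the regex itself.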


class MarkdownChunker(BaseChunker):
    """
    Markdown-aware chunker that respects document structure.

    Features:
    - Respects heading hierarchy
    - Preserves heading context in chunks
    - Handles code blocks as units
    - Maintains list continuity where possible
    """

    def __init__(
        self,
        chunk_size: int,
        chunk_overlap: int,
        settings: Settings | None = None,
    ) -> None:
        """Initialize markdown chunker."""
        super().__init__(chunk_size, chunk_overlap, settings)

    @property
    def chunk_type(self) -> ChunkType:
        """Get chunk type."""
        return ChunkType.MARKDOWN

    def chunk(
        self,
        content: str,
        source_path: str | None = None,
        file_type: FileType | None = None,
        metadata: dict[str, Any] | None = None,
    ) -> list[Chunk]:
        """
        Chunk markdown content.

        Splits on heading boundaries and preserves heading context.
        """
        if not content.strip():
            return []

        metadata = metadata or {}
        file_type = file_type or FileType.MARKDOWN

        # Split content into sections by headings
        sections = self._split_by_headings(content)

        if not sections:
            # No headings, chunk as plain text
            return self._chunk_text_block(content, source_path, file_type, metadata, [])

        chunks: list[Chunk] = []
        heading_stack: list[tuple[int, str]] = []  # (level, text)

        for section in sections:
            heading_level = section.get("level", 0)
            heading_text = section.get("heading", "")
            section_content = section.get("content", "")
            start_line = section.get("start_line", 1)
            end_line = section.get("end_line", 1)

            # Update heading stack
            if heading_level > 0:
                # Pop headings of equal or higher level
                while heading_stack and heading_stack[-1][0] >= heading_level:
                    heading_stack.pop()
                heading_stack.append((heading_level, heading_text))

            # Build heading context prefix
            heading_context = " > ".join(h[1] for h in heading_stack)

            section_chunks = self._chunk_section(
                content=section_content,
                heading_context=heading_context,
                heading_level=heading_level,
                heading_text=heading_text,
                start_line=start_line,
                end_line=end_line,
                source_path=source_path,
                file_type=file_type,
                metadata=metadata,
            )
            chunks.extend(section_chunks)

        return chunks

    def _split_by_headings(self, content: str) -> list[dict[str, Any]]:
        """Split content into sections by headings."""
        sections: list[dict[str, Any]] = []
        lines = content.split("\n")

        current_section: dict[str, Any] = {
            "level": 0,
            "heading": "",
            "content": "",
            "start_line": 1,
            "end_line": 1,
        }
        current_lines: list[str] = []
        in_code_block = False

        for i, line in enumerate(lines):
            # Track code blocks
            if line.strip().startswith("```"):
                in_code_block = not in_code_block
                current_lines.append(line)
                continue

            # Skip heading detection in code blocks
            if in_code_block:
                current_lines.append(line)
                continue

            # Check for heading
            heading_match = HEADING_PATTERN.match(line)
            if heading_match:
                # Save previous section
                if current_lines:
                    current_section["content"] = "\n".join(current_lines)
                    current_section["end_line"] = i
                    if current_section["content"].strip():
                        sections.append(current_section)

                # Start new section
                level = len(heading_match.group(1))
                heading_text = heading_match.group(2).strip()
                current_section = {
                    "level": level,
                    "heading": heading_text,
                    "content": "",
                    "start_line": i + 1,
                    "end_line": i + 1,
                }
                current_lines = [line]
            else:
                current_lines.append(line)

        # Save final section
        if current_lines:
            current_section["content"] = "\n".join(current_lines)
            current_section["end_line"] = len(lines)
            if current_section["content"].strip():
                sections.append(current_section)

        return sections

    def _chunk_section(
        self,
        content: str,
        heading_context: str,
        heading_level: int,
        heading_text: str,
        start_line: int,
        end_line: int,
        source_path: str | None,
        file_type: FileType,
        metadata: dict[str, Any],
    ) -> list[Chunk]:
        """Chunk a single section of markdown."""
        if not content.strip():
            return []

        token_count = self.count_tokens(content)

        # If section fits in one chunk, return as-is
        if token_count <= self.chunk_size:
            section_metadata = {
                **metadata,
                "heading_context": heading_context,
                "heading_level": heading_level,
                "heading_text": heading_text,
            }
            return [
                self._create_chunk(
                    content=content.strip(),
                    source_path=source_path,
                    start_line=start_line,
                    end_line=end_line,
                    file_type=file_type,
                    metadata=section_metadata,
                )
            ]

        # Need to split - try to split on paragraphs first
        return self._chunk_text_block(
            content,
            source_path,
            file_type,
            {
                **metadata,
                "heading_context": heading_context,
                "heading_level": heading_level,
                "heading_text": heading_text,
            },
            _heading_stack=[(heading_level, heading_text)] if heading_text else [],
            base_line=start_line,
        )

    def _chunk_text_block(
        self,
        content: str,
        source_path: str | None,
        file_type: FileType,
        metadata: dict[str, Any],
        _heading_stack: list[tuple[int, str]],
        base_line: int = 1,
    ) -> list[Chunk]:
        """Chunk a block of text by paragraphs."""
        # Split into paragraphs (separated by blank lines)
        paragraphs = self._split_into_paragraphs(content)

        if not paragraphs:
            return []

        chunks: list[Chunk] = []
        current_content: list[str] = []
        current_tokens = 0
        chunk_start_line = base_line

        for para_info in paragraphs:
            para_content = para_info["content"]
            para_tokens = para_info["tokens"]
            para_start = para_info["start_line"]

            # Handle very large paragraphs
            if para_tokens > self.chunk_size:
                # Flush current content
                if current_content:
                    chunk_text = "\n\n".join(current_content)
                    chunks.append(
                        self._create_chunk(
                            content=chunk_text.strip(),
                            source_path=source_path,
                            start_line=chunk_start_line,
                            end_line=base_line + para_start - 1,
                            file_type=file_type,
                            metadata=metadata,
                        )
                    )
                    current_content = []
                    current_tokens = 0

                # Split large paragraph by sentences/lines
                sub_chunks = self._split_large_paragraph(
                    para_content,
                    source_path,
                    file_type,
                    metadata,
                    base_line + para_start,
                )
                chunks.extend(sub_chunks)
                chunk_start_line = base_line + para_info["end_line"] + 1
                continue

            # Check if adding this paragraph exceeds limit
            if current_tokens + para_tokens > self.chunk_size and current_content:
                # Create chunk
                chunk_text = "\n\n".join(current_content)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text.strip(),
                        source_path=source_path,
                        start_line=chunk_start_line,
                        end_line=base_line + para_start - 1,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Overlap: include last paragraph if it fits
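                # (e.g. with chunk_overlap=64, a 40-token closing paragraph
                # is carried over to start the next chunk, while a 200-token
                # one is not)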
                if (
                    current_content
                    and self.count_tokens(current_content[-1]) <= self.chunk_overlap
                ):
                    current_content = [current_content[-1]]
                    current_tokens = self.count_tokens(current_content[-1])
                else:
                    current_content = []
                    current_tokens = 0

                chunk_start_line = base_line + para_start

            current_content.append(para_content)
            current_tokens += para_tokens

        # Final chunk
        if current_content:
            chunk_text = "\n\n".join(current_content)
            end_line_num = base_line + (paragraphs[-1]["end_line"] if paragraphs else 0)
            chunks.append(
                self._create_chunk(
                    content=chunk_text.strip(),
                    source_path=source_path,
                    start_line=chunk_start_line,
                    end_line=end_line_num,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks

    def _split_into_paragraphs(self, content: str) -> list[dict[str, Any]]:
        """Split content into paragraphs with metadata."""
        paragraphs: list[dict[str, Any]] = []
        lines = content.split("\n")

        current_para: list[str] = []
        para_start = 0
        in_code_block = False

        for i, line in enumerate(lines):
            # Track code blocks (keep them as single units)
            if line.strip().startswith("```"):
                if in_code_block:
                    # End of code block
                    current_para.append(line)
                    in_code_block = False
                else:
                    # Start of code block - save previous paragraph
                    if current_para and any(p.strip() for p in current_para):
                        para_content = "\n".join(current_para)
                        paragraphs.append(
                            {
                                "content": para_content,
                                "tokens": self.count_tokens(para_content),
                                "start_line": para_start,
                                "end_line": i - 1,
                            }
                        )
                    current_para = [line]
                    para_start = i
                    in_code_block = True
                continue

            if in_code_block:
                current_para.append(line)
                continue

            # Empty line indicates paragraph break
            if not line.strip():
                if current_para and any(p.strip() for p in current_para):
                    para_content = "\n".join(current_para)
                    paragraphs.append(
                        {
                            "content": para_content,
                            "tokens": self.count_tokens(para_content),
                            "start_line": para_start,
                            "end_line": i - 1,
                        }
                    )
                current_para = []
                para_start = i + 1
            else:
                if not current_para:
                    para_start = i
                current_para.append(line)

        # Final paragraph
        if current_para and any(p.strip() for p in current_para):
            para_content = "\n".join(current_para)
            paragraphs.append(
                {
                    "content": para_content,
                    "tokens": self.count_tokens(para_content),
                    "start_line": para_start,
                    "end_line": len(lines) - 1,
                }
            )

        return paragraphs

    def _split_large_paragraph(
        self,
        content: str,
        source_path: str | None,
        file_type: FileType,
        metadata: dict[str, Any],
        base_line: int,
    ) -> list[Chunk]:
        """Split a large paragraph into smaller chunks."""
        # Try splitting by sentences
        sentences = self._split_into_sentences(content)

        chunks: list[Chunk] = []
        current_content: list[str] = []
        current_tokens = 0

        for sentence in sentences:
            sentence_tokens = self.count_tokens(sentence)

            # If single sentence is too large, truncate
            if sentence_tokens > self.chunk_size:
                if current_content:
                    chunk_text = " ".join(current_content)
                    chunks.append(
                        self._create_chunk(
                            content=chunk_text.strip(),
                            source_path=source_path,
                            start_line=base_line,
                            end_line=base_line,
                            file_type=file_type,
                            metadata=metadata,
                        )
                    )
                    current_content = []
                    current_tokens = 0

                truncated = self.truncate_to_tokens(sentence, self.chunk_size)
                chunks.append(
                    self._create_chunk(
                        content=truncated.strip(),
                        source_path=source_path,
                        start_line=base_line,
                        end_line=base_line,
                        file_type=file_type,
                        metadata={**metadata, "truncated": True},
                    )
                )
                continue

            if current_tokens + sentence_tokens > self.chunk_size and current_content:
                chunk_text = " ".join(current_content)
                chunks.append(
                    self._create_chunk(
                        content=chunk_text.strip(),
                        source_path=source_path,
                        start_line=base_line,
                        end_line=base_line,
                        file_type=file_type,
                        metadata=metadata,
                    )
                )

                # Overlap with last sentence
                if (
                    current_content
                    and self.count_tokens(current_content[-1]) <= self.chunk_overlap
                ):
                    current_content = [current_content[-1]]
                    current_tokens = self.count_tokens(current_content[-1])
                else:
                    current_content = []
                    current_tokens = 0

            current_content.append(sentence)
            current_tokens += sentence_tokens

        # Final chunk
        if current_content:
            chunk_text = " ".join(current_content)
            chunks.append(
                self._create_chunk(
                    content=chunk_text.strip(),
                    source_path=source_path,
                    start_line=base_line,
                    end_line=base_line,
                    file_type=file_type,
                    metadata=metadata,
                )
            )

        return chunks

    def _split_into_sentences(self, text: str) -> list[str]:
        """Split text into sentences."""
        # Simple sentence splitting on common terminators
        # More sophisticated splitting could use nltk or spacy
        sentence_endings = re.compile(r"(?<=[.!?])\s+")
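        # e.g. "First. Second! Third?" -> ["First.", "Second!", "Third?"]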
        sentences = sentence_endings.split(text)
        return [s.strip() for s in sentences if s.strip()]