refactor(knowledge-base mcp server): adjust formatting for consistency and readability
Improve formatting, line breaks, and indentation across the chunking logic and multiple test modules to enhance clarity and maintain a consistent style. No functional changes.
This commit is contained in:
@@ -184,7 +184,12 @@ class ChunkerFactory:
|
||||
if file_type:
|
||||
if file_type == FileType.MARKDOWN:
|
||||
return self._get_markdown_chunker()
|
||||
elif file_type in (FileType.TEXT, FileType.JSON, FileType.YAML, FileType.TOML):
|
||||
elif file_type in (
|
||||
FileType.TEXT,
|
||||
FileType.JSON,
|
||||
FileType.YAML,
|
||||
FileType.TOML,
|
||||
):
|
||||
return self._get_text_chunker()
|
||||
else:
|
||||
# Code files
|
||||
@@ -193,7 +198,9 @@ class ChunkerFactory:
|
||||
# Default to text chunker
|
||||
return self._get_text_chunker()
|
||||
|
||||
def get_chunker_for_path(self, source_path: str) -> tuple[BaseChunker, FileType | None]:
|
||||
def get_chunker_for_path(
|
||||
self, source_path: str
|
||||
) -> tuple[BaseChunker, FileType | None]:
|
||||
"""
|
||||
Get chunker based on file path extension.
|
||||
|
||||
|
||||
@@ -151,7 +151,7 @@ class CodeChunker(BaseChunker):
|
||||
for struct_type, pattern in patterns.items():
|
||||
for match in pattern.finditer(content):
|
||||
# Convert character position to line number
|
||||
line_num = content[:match.start()].count("\n")
|
||||
line_num = content[: match.start()].count("\n")
|
||||
boundaries.append((line_num, struct_type))
|
||||
|
||||
if not boundaries:
|
||||
|
||||
@@ -69,9 +69,7 @@ class MarkdownChunker(BaseChunker):
|
||||
|
||||
if not sections:
|
||||
# No headings, chunk as plain text
|
||||
return self._chunk_text_block(
|
||||
content, source_path, file_type, metadata, []
|
||||
)
|
||||
return self._chunk_text_block(content, source_path, file_type, metadata, [])
|
||||
|
||||
chunks: list[Chunk] = []
|
||||
heading_stack: list[tuple[int, str]] = [] # (level, text)
|
||||
@@ -292,7 +290,10 @@ class MarkdownChunker(BaseChunker):
|
||||
)
|
||||
|
||||
# Overlap: include last paragraph if it fits
|
||||
if current_content and self.count_tokens(current_content[-1]) <= self.chunk_overlap:
|
||||
if (
|
||||
current_content
|
||||
and self.count_tokens(current_content[-1]) <= self.chunk_overlap
|
||||
):
|
||||
current_content = [current_content[-1]]
|
||||
current_tokens = self.count_tokens(current_content[-1])
|
||||
else:
|
||||
@@ -341,12 +342,14 @@ class MarkdownChunker(BaseChunker):
|
||||
# Start of code block - save previous paragraph
|
||||
if current_para and any(p.strip() for p in current_para):
|
||||
para_content = "\n".join(current_para)
|
||||
paragraphs.append({
|
||||
"content": para_content,
|
||||
"tokens": self.count_tokens(para_content),
|
||||
"start_line": para_start,
|
||||
"end_line": i - 1,
|
||||
})
|
||||
paragraphs.append(
|
||||
{
|
||||
"content": para_content,
|
||||
"tokens": self.count_tokens(para_content),
|
||||
"start_line": para_start,
|
||||
"end_line": i - 1,
|
||||
}
|
||||
)
|
||||
current_para = [line]
|
||||
para_start = i
|
||||
in_code_block = True
|
||||
@@ -360,12 +363,14 @@ class MarkdownChunker(BaseChunker):
|
||||
if not line.strip():
|
||||
if current_para and any(p.strip() for p in current_para):
|
||||
para_content = "\n".join(current_para)
|
||||
paragraphs.append({
|
||||
"content": para_content,
|
||||
"tokens": self.count_tokens(para_content),
|
||||
"start_line": para_start,
|
||||
"end_line": i - 1,
|
||||
})
|
||||
paragraphs.append(
|
||||
{
|
||||
"content": para_content,
|
||||
"tokens": self.count_tokens(para_content),
|
||||
"start_line": para_start,
|
||||
"end_line": i - 1,
|
||||
}
|
||||
)
|
||||
current_para = []
|
||||
para_start = i + 1
|
||||
else:
|
||||
@@ -376,12 +381,14 @@ class MarkdownChunker(BaseChunker):
|
||||
# Final paragraph
|
||||
if current_para and any(p.strip() for p in current_para):
|
||||
para_content = "\n".join(current_para)
|
||||
paragraphs.append({
|
||||
"content": para_content,
|
||||
"tokens": self.count_tokens(para_content),
|
||||
"start_line": para_start,
|
||||
"end_line": len(lines) - 1,
|
||||
})
|
||||
paragraphs.append(
|
||||
{
|
||||
"content": para_content,
|
||||
"tokens": self.count_tokens(para_content),
|
||||
"start_line": para_start,
|
||||
"end_line": len(lines) - 1,
|
||||
}
|
||||
)
|
||||
|
||||
return paragraphs
|
||||
|
||||
@@ -448,7 +455,10 @@ class MarkdownChunker(BaseChunker):
|
||||
)
|
||||
|
||||
# Overlap with last sentence
|
||||
if current_content and self.count_tokens(current_content[-1]) <= self.chunk_overlap:
|
||||
if (
|
||||
current_content
|
||||
and self.count_tokens(current_content[-1]) <= self.chunk_overlap
|
||||
):
|
||||
current_content = [current_content[-1]]
|
||||
current_tokens = self.count_tokens(current_content[-1])
|
||||
else:
|
||||
|
||||
@@ -79,9 +79,7 @@ class TextChunker(BaseChunker):
|
||||
)
|
||||
|
||||
# Fall back to sentence-based chunking
|
||||
return self._chunk_by_sentences(
|
||||
content, source_path, file_type, metadata
|
||||
)
|
||||
return self._chunk_by_sentences(content, source_path, file_type, metadata)
|
||||
|
||||
def _split_paragraphs(self, content: str) -> list[dict[str, Any]]:
|
||||
"""Split content into paragraphs."""
|
||||
@@ -97,12 +95,14 @@ class TextChunker(BaseChunker):
|
||||
continue
|
||||
|
||||
para_lines = para.count("\n") + 1
|
||||
paragraphs.append({
|
||||
"content": para,
|
||||
"tokens": self.count_tokens(para),
|
||||
"start_line": line_num,
|
||||
"end_line": line_num + para_lines - 1,
|
||||
})
|
||||
paragraphs.append(
|
||||
{
|
||||
"content": para,
|
||||
"tokens": self.count_tokens(para),
|
||||
"start_line": line_num,
|
||||
"end_line": line_num + para_lines - 1,
|
||||
}
|
||||
)
|
||||
line_num += para_lines + 1 # +1 for blank line between paragraphs
|
||||
|
||||
return paragraphs
|
||||
@@ -172,7 +172,10 @@ class TextChunker(BaseChunker):
|
||||
|
||||
# Overlap: keep last paragraph if small enough
|
||||
overlap_para = None
|
||||
if current_paras and self.count_tokens(current_paras[-1]) <= self.chunk_overlap:
|
||||
if (
|
||||
current_paras
|
||||
and self.count_tokens(current_paras[-1]) <= self.chunk_overlap
|
||||
):
|
||||
overlap_para = current_paras[-1]
|
||||
|
||||
current_paras = [overlap_para] if overlap_para else []
|
||||
@@ -266,7 +269,10 @@ class TextChunker(BaseChunker):
|
||||
|
||||
# Overlap: keep last sentence if small enough
|
||||
overlap = None
|
||||
if current_sentences and self.count_tokens(current_sentences[-1]) <= self.chunk_overlap:
|
||||
if (
|
||||
current_sentences
|
||||
and self.count_tokens(current_sentences[-1]) <= self.chunk_overlap
|
||||
):
|
||||
overlap = current_sentences[-1]
|
||||
|
||||
current_sentences = [overlap] if overlap else []
|
||||
@@ -317,14 +323,10 @@ class TextChunker(BaseChunker):
|
||||
sentences = self._split_sentences(text)
|
||||
|
||||
if len(sentences) > 1:
|
||||
return self._chunk_by_sentences(
|
||||
text, source_path, file_type, metadata
|
||||
)
|
||||
return self._chunk_by_sentences(text, source_path, file_type, metadata)
|
||||
|
||||
# Fall back to word-based splitting
|
||||
return self._chunk_by_words(
|
||||
text, source_path, file_type, metadata, base_line
|
||||
)
|
||||
return self._chunk_by_words(text, source_path, file_type, metadata, base_line)
|
||||
|
||||
def _chunk_by_words(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user