chunking: add structured chunker
app/chunking/structured_chunker.py (new file, 136 lines)
@@ -0,0 +1,136 @@
import hashlib
import re
from typing import List, Dict, Optional


class StructuredChunker:
    """Splits plain text, source code, and markdown into overlapping chunks with metadata."""

    def __init__(self, chunk_size: int = 1000, overlap: int = 200):
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_text(self, text: str, metadata: Optional[Dict] = None) -> List[Dict]:
        """Chunk plain text on paragraph boundaries, carrying overlap between chunks."""
        if not text:
            return []
        if len(text) < self.chunk_size:
            return [self._create_chunk(text, 0, metadata)]

        paragraphs = text.split('\n\n')
        chunks = []
        current_chunk = []
        current_size = 0
        chunk_idx = 0

        for para_idx, para in enumerate(paragraphs):
            para_size = len(para)

            # Flush the current chunk once adding this paragraph would exceed chunk_size.
            if current_size + para_size > self.chunk_size and current_chunk:
                joined = '\n\n'.join(current_chunk)
                chunks.append(self._create_chunk(joined, chunk_idx, metadata, para_idx=para_idx))
                chunk_idx += 1

                # Seed the next chunk with the tail of the previous one for context overlap.
                overlap_text = self._get_overlap_text(current_chunk)
                current_chunk = [overlap_text] if overlap_text else []
                current_size = len(overlap_text)

            current_chunk.append(para)
            current_size += para_size

        if current_chunk:
            joined = '\n\n'.join(current_chunk)
            chunks.append(self._create_chunk(joined, chunk_idx, metadata))

        return chunks

    def chunk_code(self, text: str, metadata: Optional[Dict] = None) -> List[Dict]:
        """Chunk source code one function per chunk; fall back to plain-text chunking."""
        functions = self._extract_functions(text)

        if not functions:
            return self.chunk_text(text, {**(metadata or {}), 'type': 'code'})

        chunks = []
        for idx, func in enumerate(functions):
            chunks.append(self._create_chunk(
                func['code'],
                idx,
                {**(metadata or {}), 'type': 'function', 'name': func['name']},
                section=func['name']
            ))
        return chunks

    def chunk_markdown(self, text: str, metadata: Optional[Dict] = None) -> List[Dict]:
        """Chunk markdown one heading-delimited section per chunk; fall back to plain text."""
        sections = self._extract_md_sections(text)

        if not sections:
            return self.chunk_text(text, {**(metadata or {}), 'type': 'markdown'})

        chunks = []
        for idx, section in enumerate(sections):
            chunks.append(self._create_chunk(
                section['content'],
                idx,
                {**(metadata or {}), 'type': 'section', 'heading': section['heading']},
                section=section['heading']
            ))
        return chunks

    def _create_chunk(self, text: str, index: int, metadata: Optional[Dict],
                      para_idx: Optional[int] = None, section: Optional[str] = None) -> Dict:
        # Content-derived id: stable for identical text at the same index.
        chunk_id = hashlib.sha256(f'{text}{index}'.encode()).hexdigest()[:16]

        # Offsets are approximate: the paragraph index when available, otherwise an
        # estimate from the nominal chunk stride (chunk_size - overlap).
        offset_start = para_idx if para_idx is not None else index * (self.chunk_size - self.overlap)

        return {
            'chunk_id': chunk_id,
            'index': index,
            'text': text,
            'size': len(text),
            'offset_start': offset_start,
            'offset_end': offset_start + len(text),
            'section': section,
            'metadata': metadata or {}
        }

    def _get_overlap_text(self, chunks: List[str]) -> str:
        # Take the last `overlap` characters of the chunk that was just flushed.
        combined = '\n\n'.join(chunks)
        if len(combined) <= self.overlap:
            return combined
        return combined[-self.overlap:]

    def _extract_functions(self, text: str) -> List[Dict]:
        # Heuristic patterns for Python, JavaScript, and Java-style function definitions.
        patterns = [
            r'(def\s+(\w+)\s*\([^)]*\):.*?)(?=\ndef\s+\w+|\nclass\s+\w+|\Z)',
            r'(function\s+(\w+)\s*\([^)]*\)\s*\{.*?\n\})',
            r'(public\s+(?:static\s+)?[\w<>]+\s+(\w+)\s*\([^)]*\)\s*\{.*?\n\})'
        ]

        functions = []
        for pattern in patterns:
            for match in re.finditer(pattern, text, re.DOTALL | re.MULTILINE):
                functions.append({
                    'name': match.group(2),
                    'code': match.group(1)
                })

        return functions

    def _extract_md_sections(self, text: str) -> List[Dict]:
        lines = text.split('\n')
        sections = []
        current_section = {'heading': 'Introduction', 'content': []}

        for line in lines:
            if line.startswith('#'):
                # Close out the previous section before starting a new one.
                if current_section['content']:
                    sections.append({
                        'heading': current_section['heading'],
                        'content': '\n'.join(current_section['content'])
                    })
                heading = re.sub(r'^#+\s*', '', line)
                current_section = {'heading': heading, 'content': []}
            else:
                current_section['content'].append(line)

        if current_section['content']:
            sections.append({
                'heading': current_section['heading'],
                'content': '\n'.join(current_section['content'])
            })

        # Only treat the document as "structured" if it split into more than one section.
        return sections if len(sections) > 1 else []
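For reference, a minimal usage sketch of the new chunker (not part of the commit; the sample markdown, the metadata values, and the printed fields are illustrative, and the import assumes app/ is importable as a package):

from app.chunking.structured_chunker import StructuredChunker

chunker = StructuredChunker(chunk_size=500, overlap=100)

# A small markdown document with two headings; chunk_markdown yields one chunk per section.
doc = "# Setup\nInstall the package.\n\n# Usage\nCall the chunker on your text."
for chunk in chunker.chunk_markdown(doc, metadata={'source': 'README.md'}):
    print(chunk['chunk_id'], chunk['section'], chunk['size'])

# chunk_text handles plain prose; inputs shorter than chunk_size come back as a single chunk.
plain_chunks = chunker.chunk_text("one paragraph\n\nanother paragraph")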