chunking: add structured
152
app/batch_process.py
Executable file
@@ -0,0 +1,152 @@
#!/usr/bin/env python3
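"""Batch-process unextracted files from the disk_reorganizer_db catalogue.

Selects files of a given parser type that have no extracted_text yet,
resolves their paths through DISK_MOUNT_MAP, and runs them through
IncrementalExtractor.batch_extract.
"""
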
import sys
import argparse
from pathlib import Path
import psycopg2
import logging
from typing import Dict, List

# Make the sibling packages (extraction/, parsers/) importable when the
# script is run directly from the app/ directory.
sys.path.insert(0, str(Path(__file__).parent))

from extraction.incremental import IncrementalExtractor
from parsers.text_parser import TextParser
from parsers.code_parser import CodeParser
from parsers.pdf_parser import PDFParser
from parsers.image_parser import ImageParser
from parsers.audio_parser import AudioParser
from parsers.document_parser import DocumentParser

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

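# Where each catalogued disk label is mounted on this machine.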
DISK_MOUNT_MAP = {
    'SMT': '/media/mike/SMT',
    'DISK1': '/media/mike/DISK1',
    'LLM': '/media/mike/LLM',
    'WINDOWS': '/media/mike/WINDOWS',
    'Apps': '/media/mike/Apps',
    'Space': '/media/mike/Space',
    'LVM': '/media/mike/LVM'
}

DB_CONFIG = {
    'host': '192.168.1.159',
    'database': 'disk_reorganizer_db',
    'user': 'disk_reorg_user',
    'password': 'heel-goed-wachtwoord'
}

def get_files_to_parse(parser_type: str, limit: int, max_size: int = 10 * 1024 * 1024) -> List[Dict]:
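    """Return up to `limit` files that still need extraction for this parser
    type: no extracted_text yet, between 0 and `max_size` bytes, with a
    matching extension, and currently reachable under a known mount."""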
    conn = psycopg2.connect(**DB_CONFIG)
    cursor = conn.cursor()

    ext_map = {
        'text': "'.txt', '.md', '.log', '.json', '.yaml', '.yml', '.xml', '.csv'",
        'code': "'.py', '.js', '.java', '.go', '.rs', '.ts', '.tsx', '.jsx', '.cpp', '.h', '.c', '.php'",
        'pdf': "'.pdf'",
        'image': "'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'",
        'audio': "'.mp3', '.wav', '.flac', '.m4a', '.ogg'",
        'document': "'.docx', '.doc', '.odt', '.rtf'"
    }

    extensions = ext_map.get(parser_type, '')
    if not extensions:
        logger.error(f'Unknown parser type: {parser_type}')
        return []

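    # Build one suffix LIKE clause per extension by stripping the quotes from
    # the ext_map string and splitting it back into individual suffixes.
    # Interpolating values straight into the SQL is only acceptable here
    # because every value comes from the hard-coded ext_map above.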
    query = f'''
        SELECT path, disk_label, size, checksum
        FROM files
        WHERE extracted_text IS NULL
          AND size < {max_size}
          AND size > 0
          AND (
              {' OR '.join([f"path LIKE '%{ext}'" for ext in extensions.replace("'", "").split(', ')])}
          )
        LIMIT {limit}
    '''

    cursor.execute(query)
    files = []

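    # Resolve each stored (relative) path against its disk's mount point and
    # keep only files that are actually present right now.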
    for row in cursor.fetchall():
        path, disk, size, checksum = row
        mount = DISK_MOUNT_MAP.get(disk, '')
        if not mount:
            continue

        full_path = Path(mount) / path
        if full_path.exists():
            files.append({
                'path': path,
                'full_path': full_path,
                'disk_label': disk,
                'size': size,
                'checksum': checksum
            })

    cursor.close()
    conn.close()

    logger.info(f'Found {len(files)} {parser_type} files to parse')
    return files

def batch_parse(parser_type: str, limit: int, max_size: int):
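    """Run incremental extraction over all pending files of one parser type
    and log summary statistics."""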
    # NOTE: every parser is constructed eagerly, so e.g. the Whisper model is
    # loaded even when 'audio' is not the requested type.
    parsers = {
        'text': ('text_parser', TextParser()),
        'code': ('code_parser', CodeParser()),
        'pdf': ('pdf_parser', PDFParser()),
        'image': ('image_parser', ImageParser()),
        'audio': ('audio_parser', AudioParser(whisper_model='base')),
        'document': ('document_parser', DocumentParser())
    }

    if parser_type not in parsers:
        logger.error(f'Unknown parser type: {parser_type}')
        return

    parser_name, parser = parsers[parser_type]
    files = get_files_to_parse(parser_type, limit, max_size)

    if not files:
        logger.info(f'No files to parse for {parser_type}')
        return

    extractor = IncrementalExtractor(DB_CONFIG)

    logger.info(f'Starting batch parse of {len(files)} files with {parser_name}')

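    # The extractor calls back with whatever path it stored; map it back to
    # the resolved full_path so the parser reads from the mounted disk. The
    # lookup is a linear scan per call, which is fine at these batch sizes.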
    def parse_func(path):
        for f in files:
            if str(f['full_path']) == str(path) or f['path'] == str(path):
                return parser.parse(f['full_path'])
        return parser.parse(path)

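    # batch_extract is expected to return a dict of counters, at least:
    # 'processed', 'extracted', 'skipped', 'errors' and 'total_time_ms'.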
    stats = extractor.batch_extract(
        files,
        parse_func,
        parser_name=parser_name,
        batch_size=100,
        skip_existing=True
    )

    logger.info('\n=== BATCH PARSE COMPLETE ===')
    logger.info(f'Processed: {stats["processed"]}')
    logger.info(f'Extracted: {stats["extracted"]}')
    logger.info(f'Skipped: {stats["skipped"]}')
    logger.info(f'Errors: {stats["errors"]}')
    if stats['extracted'] > 0:
        logger.info(f'Avg time: {stats["total_time_ms"] / stats["extracted"]:.1f}ms per file')

def main():
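    """CLI entry point.

    Examples (illustrative invocations):
        ./batch_process.py text --limit 500
        ./batch_process.py audio --limit 50 --max-size 52428800
    """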
    parser = argparse.ArgumentParser(description='Batch process files with incremental extraction')
    parser.add_argument('parser_type', choices=['text', 'code', 'pdf', 'image', 'audio', 'document'],
                        help='Type of parser to use')
    parser.add_argument('--limit', type=int, default=1000, help='Maximum files to process')
    parser.add_argument('--max-size', type=int, default=10 * 1024 * 1024, help='Maximum file size in bytes')

    args = parser.parse_args()
    batch_parse(args.parser_type, args.limit, args.max_size)

if __name__ == '__main__':
    main()