chunking: add structured
152
app/batch_process.py
Executable file
@@ -0,0 +1,152 @@
#!/usr/bin/env python3
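"""Batch-process unextracted files from the disk_reorganizer_db catalogue.

Selects files of a given parser type that have no extracted_text yet,
resolves their paths through DISK_MOUNT_MAP, and runs them through
IncrementalExtractor.batch_extract.
"""
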
import sys
import argparse
from pathlib import Path
import psycopg2
import logging
from typing import Dict, List

# Make the sibling packages (extraction/, parsers/) importable when the
# script is run directly from the app/ directory.
sys.path.insert(0, str(Path(__file__).parent))

from extraction.incremental import IncrementalExtractor
from parsers.text_parser import TextParser
from parsers.code_parser import CodeParser
from parsers.pdf_parser import PDFParser
from parsers.image_parser import ImageParser
from parsers.audio_parser import AudioParser
from parsers.document_parser import DocumentParser

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

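# Where each catalogued disk label is mounted on this machine.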
DISK_MOUNT_MAP = {
    'SMT': '/media/mike/SMT',
    'DISK1': '/media/mike/DISK1',
    'LLM': '/media/mike/LLM',
    'WINDOWS': '/media/mike/WINDOWS',
    'Apps': '/media/mike/Apps',
    'Space': '/media/mike/Space',
    'LVM': '/media/mike/LVM'
}

DB_CONFIG = {
    'host': '192.168.1.159',
    'database': 'disk_reorganizer_db',
    'user': 'disk_reorg_user',
    'password': 'heel-goed-wachtwoord'
}

def get_files_to_parse(parser_type: str, limit: int, max_size: int = 10 * 1024 * 1024) -> List[Dict]:
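    """Return up to `limit` files that still need extraction for this parser
    type: no extracted_text yet, between 0 and `max_size` bytes, with a
    matching extension, and currently reachable under a known mount."""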
    conn = psycopg2.connect(**DB_CONFIG)
    cursor = conn.cursor()

    ext_map = {
        'text': "'.txt', '.md', '.log', '.json', '.yaml', '.yml', '.xml', '.csv'",
        'code': "'.py', '.js', '.java', '.go', '.rs', '.ts', '.tsx', '.jsx', '.cpp', '.h', '.c', '.php'",
        'pdf': "'.pdf'",
        'image': "'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'",
        'audio': "'.mp3', '.wav', '.flac', '.m4a', '.ogg'",
        'document': "'.docx', '.doc', '.odt', '.rtf'"
    }

    extensions = ext_map.get(parser_type, '')
    if not extensions:
        logger.error(f'Unknown parser type: {parser_type}')
        return []

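    # Build one suffix LIKE clause per extension by stripping the quotes from
    # the ext_map string and splitting it back into individual suffixes.
    # Interpolating values straight into the SQL is only acceptable here
    # because every value comes from the hard-coded ext_map above.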
    query = f'''
        SELECT path, disk_label, size, checksum
        FROM files
        WHERE extracted_text IS NULL
          AND size < {max_size}
          AND size > 0
          AND (
              {' OR '.join([f"path LIKE '%{ext}'" for ext in extensions.replace("'", "").split(', ')])}
          )
        LIMIT {limit}
    '''

    cursor.execute(query)
    files = []

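    # Resolve each stored (relative) path against its disk's mount point and
    # keep only files that are actually present right now.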
    for row in cursor.fetchall():
        path, disk, size, checksum = row
        mount = DISK_MOUNT_MAP.get(disk, '')
        if not mount:
            continue

        full_path = Path(mount) / path
        if full_path.exists():
            files.append({
                'path': path,
                'full_path': full_path,
                'disk_label': disk,
                'size': size,
                'checksum': checksum
            })

    cursor.close()
    conn.close()

    logger.info(f'Found {len(files)} {parser_type} files to parse')
    return files

def batch_parse(parser_type: str, limit: int, max_size: int):
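    """Run incremental extraction over all pending files of one parser type
    and log summary statistics."""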
    # NOTE: every parser is constructed eagerly, so e.g. the Whisper model is
    # loaded even when 'audio' is not the requested type.
    parsers = {
        'text': ('text_parser', TextParser()),
        'code': ('code_parser', CodeParser()),
        'pdf': ('pdf_parser', PDFParser()),
        'image': ('image_parser', ImageParser()),
        'audio': ('audio_parser', AudioParser(whisper_model='base')),
        'document': ('document_parser', DocumentParser())
    }

    if parser_type not in parsers:
        logger.error(f'Unknown parser type: {parser_type}')
        return

    parser_name, parser = parsers[parser_type]
    files = get_files_to_parse(parser_type, limit, max_size)

    if not files:
        logger.info(f'No files to parse for {parser_type}')
        return

    extractor = IncrementalExtractor(DB_CONFIG)

    logger.info(f'Starting batch parse of {len(files)} files with {parser_name}')

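    # The extractor calls back with whatever path it stored; map it back to
    # the resolved full_path so the parser reads from the mounted disk. The
    # lookup is a linear scan per call, which is fine at these batch sizes.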
    def parse_func(path):
        for f in files:
            if str(f['full_path']) == str(path) or f['path'] == str(path):
                return parser.parse(f['full_path'])
        return parser.parse(path)

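    # batch_extract is expected to return a dict of counters, at least:
    # 'processed', 'extracted', 'skipped', 'errors' and 'total_time_ms'.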
    stats = extractor.batch_extract(
        files,
        parse_func,
        parser_name=parser_name,
        batch_size=100,
        skip_existing=True
    )

    logger.info('\n=== BATCH PARSE COMPLETE ===')
    logger.info(f'Processed: {stats["processed"]}')
    logger.info(f'Extracted: {stats["extracted"]}')
    logger.info(f'Skipped: {stats["skipped"]}')
    logger.info(f'Errors: {stats["errors"]}')
    if stats['extracted'] > 0:
        logger.info(f'Avg time: {stats["total_time_ms"] / stats["extracted"]:.1f}ms per file')

def main():
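    """CLI entry point.

    Examples (illustrative invocations):
        ./batch_process.py text --limit 500
        ./batch_process.py audio --limit 50 --max-size 52428800
    """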
    parser = argparse.ArgumentParser(description='Batch process files with incremental extraction')
    parser.add_argument('parser_type', choices=['text', 'code', 'pdf', 'image', 'audio', 'document'],
                        help='Type of parser to use')
    parser.add_argument('--limit', type=int, default=1000, help='Maximum files to process')
    parser.add_argument('--max-size', type=int, default=10 * 1024 * 1024, help='Maximum file size in bytes')

    args = parser.parse_args()
    batch_parse(args.parser_type, args.limit, args.max_size)

if __name__ == '__main__':
    main()