#!/usr/bin/env python3
"""Batch process files with incremental extraction.

Selects not-yet-extracted files of one category (text, code, pdf, image,
audio or document) from the ``files`` table, resolves them against the
locally mounted disks and feeds them to the matching parser through
``IncrementalExtractor.batch_extract``.
"""
import sys
import argparse
from pathlib import Path
import psycopg2
import logging
from typing import Dict, List

# Make the sibling ``extraction`` and ``parsers`` packages importable when
# this file is run directly as a script.
sys.path.insert(0, str(Path(__file__).parent))

from extraction.incremental import IncrementalExtractor
from parsers.text_parser import TextParser
from parsers.code_parser import CodeParser
from parsers.pdf_parser import PDFParser
from parsers.image_parser import ImageParser
from parsers.audio_parser import AudioParser
from parsers.document_parser import DocumentParser

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger(__name__)

# Disk label (as stored in the ``disk_label`` column) -> local mount point.
DISK_MOUNT_MAP = {
    'SMT': '/media/mike/SMT',
    'DISK1': '/media/mike/DISK1',
    'LLM': '/media/mike/LLM',
    'WINDOWS': '/media/mike/WINDOWS',
    'Apps': '/media/mike/Apps',
    'Space': '/media/mike/Space',
    'LVM': '/media/mike/LVM',
}

# NOTE(review): credentials are hard-coded in source. Consider loading them
# from the environment or a secrets store instead of committing them.
DB_CONFIG = {
    'host': '192.168.1.159',
    'database': 'disk_reorganizer_db',
    'user': 'disk_reorg_user',
    'password': 'heel-goed-wachtwoord',
}

# Parser type -> file-name suffixes selected for that parser.
_EXTENSIONS_BY_PARSER = {
    'text': ('.txt', '.md', '.log', '.json', '.yaml', '.yml', '.xml', '.csv'),
    'code': ('.py', '.js', '.java', '.go', '.rs', '.ts', '.tsx', '.jsx',
             '.cpp', '.h', '.c', '.php'),
    'pdf': ('.pdf',),
    'image': ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'),
    'audio': ('.mp3', '.wav', '.flac', '.m4a', '.ogg'),
    'document': ('.docx', '.doc', '.odt', '.rtf'),
}


def get_files_to_parse(parser_type: str, limit: int,
                       max_size: int = 10 * 1024 * 1024) -> List[Dict]:
    """Return files of ``parser_type`` that still need text extraction.

    Queries the ``files`` table for rows with ``extracted_text IS NULL``,
    ``0 < size < max_size`` and a path ending in one of the extensions
    registered for ``parser_type``. Rows whose disk label is not in
    ``DISK_MOUNT_MAP`` or whose file does not exist locally are dropped.

    Args:
        parser_type: Key of ``_EXTENSIONS_BY_PARSER`` ('text', 'code', ...).
        limit: Maximum number of rows fetched from the database. The returned
            list may be shorter because missing files are filtered out
            after the query.
        max_size: Exclusive upper bound on file size in bytes.

    Returns:
        A list of dicts with keys ``path``, ``full_path`` (a ``Path``),
        ``disk_label``, ``size`` and ``checksum``. Empty for an unknown
        ``parser_type``.
    """
    extensions = _EXTENSIONS_BY_PARSER.get(parser_type)
    # Validate before connecting so an unknown type cannot leak a connection.
    if not extensions:
        logger.error(f'Unknown parser type: {parser_type}')
        return []

    # All values are passed as bound parameters; only the number of
    # placeholders is interpolated into the SQL text.
    like_clause = ' OR '.join(['path LIKE %s'] * len(extensions))
    query = f'''
        SELECT path, disk_label, size, checksum
        FROM files
        WHERE extracted_text IS NULL
          AND size < %s
          AND size > 0
          AND ({like_clause})
        LIMIT %s
    '''
    # NOTE(review): LIKE is case-sensitive in PostgreSQL, so upper-case
    # suffixes such as '.JPG' are not matched. Kept as in the original.
    params = [max_size, *(f'%{ext}' for ext in extensions), limit]

    conn = psycopg2.connect(**DB_CONFIG)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(query, params)
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # psycopg2's ``with conn`` only ends the transaction; close explicitly.
        conn.close()

    files = []
    for path, disk, size, checksum in rows:
        mount = DISK_MOUNT_MAP.get(disk, '')
        if not mount:
            continue
        full_path = Path(mount) / path
        if full_path.exists():
            files.append({
                'path': path,
                'full_path': full_path,
                'disk_label': disk,
                'size': size,
                'checksum': checksum,
            })

    logger.info(f'Found {len(files)} {parser_type} files to parse')
    return files


def batch_parse(parser_type: str, limit: int, max_size: int) -> None:
    """Parse up to ``limit`` pending files of ``parser_type`` and log stats.

    Args:
        parser_type: One of 'text', 'code', 'pdf', 'image', 'audio',
            'document'. Any other value is logged as an error and ignored.
        limit: Maximum number of database rows to consider.
        max_size: Exclusive upper bound on file size in bytes.
    """
    # Factories rather than instances: only the selected parser is built,
    # and only once there is work for it.
    parser_factories = {
        'text': ('text_parser', TextParser),
        'code': ('code_parser', CodeParser),
        'pdf': ('pdf_parser', PDFParser),
        'image': ('image_parser', ImageParser),
        'audio': ('audio_parser', lambda: AudioParser(whisper_model='base')),
        'document': ('document_parser', DocumentParser),
    }

    if parser_type not in parser_factories:
        logger.error(f'Unknown parser type: {parser_type}')
        return

    parser_name, make_parser = parser_factories[parser_type]

    files = get_files_to_parse(parser_type, limit, max_size)
    if not files:
        logger.info(f'No files to parse for {parser_type}')
        return

    parser = make_parser()
    extractor = IncrementalExtractor(DB_CONFIG)
    logger.info(
        f'Starting batch parse of {len(files)} files with {parser_name}'
    )

    # The extractor may hand back either the absolute or the DB-relative
    # path; index both once so each lookup is O(1). ``setdefault`` keeps the
    # first matching file, like a linear scan would.
    full_path_by_key = {}
    for entry in files:
        full_path_by_key.setdefault(str(entry['full_path']), entry['full_path'])
        full_path_by_key.setdefault(entry['path'], entry['full_path'])

    def parse_func(path):
        """Parse ``path``, resolved to its mounted location when known."""
        return parser.parse(full_path_by_key.get(str(path), path))

    stats = extractor.batch_extract(
        files,
        parse_func,
        parser_name=parser_name,
        batch_size=100,
        skip_existing=True,
    )

    logger.info(f'\n=== BATCH PARSE COMPLETE ===')
    logger.info(f'Processed: {stats["processed"]}')
    logger.info(f'Extracted: {stats["extracted"]}')
    logger.info(f'Skipped: {stats["skipped"]}')
    logger.info(f'Errors: {stats["errors"]}')
    # Guard against division by zero when nothing was extracted.
    if stats['extracted'] > 0:
        logger.info(
            f'Avg time: {stats["total_time_ms"]/stats["extracted"]:.1f}ms per file'
        )


def main() -> None:
    """Parse command-line arguments and run the batch parse."""
    parser = argparse.ArgumentParser(
        description='Batch process files with incremental extraction'
    )
    parser.add_argument(
        'parser_type',
        choices=['text', 'code', 'pdf', 'image', 'audio', 'document'],
        help='Type of parser to use',
    )
    parser.add_argument('--limit', type=int, default=1000,
                        help='Maximum files to process')
    parser.add_argument('--max-size', type=int, default=10*1024*1024,
                        help='Maximum file size in bytes')

    args = parser.parse_args()
    batch_parse(args.parser_type, args.limit, args.max_size)


if __name__ == '__main__':
    main()