Files
defrag/app/batch_process.py
2025-12-13 14:32:43 +01:00

153 lines
4.8 KiB
Python
Executable File

#!/usr/bin/env python3
import sys
import argparse
from pathlib import Path
import psycopg2
import logging
from typing import Dict, List
sys.path.insert(0, str(Path(__file__).parent))
from extraction.incremental import IncrementalExtractor
from parsers.text_parser import TextParser
from parsers.code_parser import CodeParser
from parsers.pdf_parser import PDFParser
from parsers.image_parser import ImageParser
from parsers.audio_parser import AudioParser
from parsers.document_parser import DocumentParser
# Root logger config for the whole script; child modules inherit this format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Maps the `disk_label` column in the `files` table to the local mount point
# where that disk's contents are accessible. Files on unmounted/unknown
# labels are skipped by get_files_to_parse().
DISK_MOUNT_MAP = {
    'SMT': '/media/mike/SMT',
    'DISK1': '/media/mike/DISK1',
    'LLM': '/media/mike/LLM',
    'WINDOWS': '/media/mike/WINDOWS',
    'Apps': '/media/mike/Apps',
    'Space': '/media/mike/Space',
    'LVM': '/media/mike/LVM'
}

# Connection parameters passed to psycopg2.connect(**DB_CONFIG).
# SECURITY NOTE(review): credentials are hard-coded in source; consider
# loading them from environment variables or a ~/.pgpass file instead.
DB_CONFIG = {
    'host': '192.168.1.159',
    'database': 'disk_reorganizer_db',
    'user': 'disk_reorg_user',
    'password': 'heel-goed-wachtwoord'
}
def get_files_to_parse(parser_type: str, limit: int, max_size: int = 10 * 1024 * 1024) -> List[Dict]:
    """Query the database for files of *parser_type* that still need text extraction.

    Selects rows from ``files`` where ``extracted_text`` is NULL, the size is
    within ``(0, max_size)``, and the path ends with one of the parser type's
    extensions, then keeps only rows whose disk is mounted (DISK_MOUNT_MAP)
    and whose file actually exists on disk.

    Args:
        parser_type: One of 'text', 'code', 'pdf', 'image', 'audio', 'document'.
        limit: Maximum number of rows fetched from the database.
        max_size: Upper bound on file size in bytes (exclusive).

    Returns:
        List of dicts with keys 'path', 'full_path' (Path), 'disk_label',
        'size', 'checksum'. Empty list for an unknown parser type.
    """
    # File extensions handled by each parser type.
    ext_map: Dict[str, List[str]] = {
        'text': ['.txt', '.md', '.log', '.json', '.yaml', '.yml', '.xml', '.csv'],
        'code': ['.py', '.js', '.java', '.go', '.rs', '.ts', '.tsx', '.jsx', '.cpp', '.h', '.c', '.php'],
        'pdf': ['.pdf'],
        'image': ['.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'],
        'audio': ['.mp3', '.wav', '.flac', '.m4a', '.ogg'],
        'document': ['.docx', '.doc', '.odt', '.rtf'],
    }
    extensions = ext_map.get(parser_type)
    if not extensions:
        logger.error('Unknown parser type: %s', parser_type)
        return []

    # Fully parameterized query: size bound, LIKE patterns, and LIMIT are all
    # bound via %s placeholders — nothing is interpolated into the SQL text.
    # (The original f-string query rebuilt the extension list by string
    # surgery and spliced values directly into the SQL.)
    like_clause = ' OR '.join(['path LIKE %s'] * len(extensions))
    query = f'''
        SELECT path, disk_label, size, checksum
        FROM files
        WHERE extracted_text IS NULL
          AND size < %s
          AND size > 0
          AND ({like_clause})
        LIMIT %s
    '''
    params = [max_size] + [f'%{ext}' for ext in extensions] + [limit]

    files: List[Dict] = []
    conn = psycopg2.connect(**DB_CONFIG)
    try:
        # Cursor context manager guarantees cursor cleanup even if execute raises.
        with conn.cursor() as cursor:
            cursor.execute(query, params)
            for path, disk, size, checksum in cursor.fetchall():
                mount = DISK_MOUNT_MAP.get(disk, '')
                if not mount:
                    continue  # disk label has no known mount point
                full_path = Path(mount) / path
                # Keep only files that are actually reachable right now.
                if full_path.exists():
                    files.append({
                        'path': path,
                        'full_path': full_path,
                        'disk_label': disk,
                        'size': size,
                        'checksum': checksum
                    })
    finally:
        conn.close()  # always release the connection, even on query errors
    logger.info('Found %d %s files to parse', len(files), parser_type)
    return files
def batch_parse(parser_type: str, limit: int, max_size: int):
    """Run one incremental-extraction batch with the parser for *parser_type*.

    Fetches candidate files from the database, instantiates the matching
    parser, and hands everything to IncrementalExtractor.batch_extract,
    logging summary statistics at the end.

    Args:
        parser_type: One of 'text', 'code', 'pdf', 'image', 'audio', 'document'.
        limit: Maximum number of files to fetch from the database.
        max_size: Maximum file size in bytes to consider.
    """
    # Map type -> (db parser name, zero-arg factory). Factories keep parser
    # construction lazy: the original built every parser up front, including
    # AudioParser(whisper_model='base'), which is expensive to construct even
    # when a different parser type was requested.
    parser_factories = {
        'text': ('text_parser', TextParser),
        'code': ('code_parser', CodeParser),
        'pdf': ('pdf_parser', PDFParser),
        'image': ('image_parser', ImageParser),
        'audio': ('audio_parser', lambda: AudioParser(whisper_model='base')),
        'document': ('document_parser', DocumentParser),
    }
    if parser_type not in parser_factories:
        logger.error('Unknown parser type: %s', parser_type)
        return
    parser_name, factory = parser_factories[parser_type]

    files = get_files_to_parse(parser_type, limit, max_size)
    if not files:
        logger.info('No files to parse for %s', parser_type)
        return

    parser = factory()  # build only the parser we actually need
    extractor = IncrementalExtractor(DB_CONFIG)
    logger.info('Starting batch parse of %d files with %s', len(files), parser_name)

    # Index records by both the mounted absolute path and the stored relative
    # path, so parse_func resolves in O(1) instead of scanning the whole list
    # on every call (the original did a linear search per file).
    by_path: Dict[str, Dict] = {}
    for record in files:
        by_path[str(record['full_path'])] = record
        by_path[record['path']] = record

    def parse_func(path):
        # Prefer the mounted absolute path when we know it; fall back to
        # parsing whatever path the extractor handed us.
        record = by_path.get(str(path))
        if record is not None:
            return parser.parse(record['full_path'])
        return parser.parse(path)

    stats = extractor.batch_extract(
        files,
        parse_func,
        parser_name=parser_name,
        batch_size=100,
        skip_existing=True
    )
    logger.info('\n=== BATCH PARSE COMPLETE ===')
    logger.info('Processed: %s', stats["processed"])
    logger.info('Extracted: %s', stats["extracted"])
    logger.info('Skipped: %s', stats["skipped"])
    logger.info('Errors: %s', stats["errors"])
    if stats['extracted'] > 0:
        # Average extraction time over successfully extracted files only.
        logger.info('Avg time: %.1fms per file', stats["total_time_ms"] / stats["extracted"])
def main():
    """CLI entry point: read arguments and kick off one parse batch."""
    arg_parser = argparse.ArgumentParser(description='Batch process files with incremental extraction')
    arg_parser.add_argument(
        'parser_type',
        choices=['text', 'code', 'pdf', 'image', 'audio', 'document'],
        help='Type of parser to use',
    )
    arg_parser.add_argument('--limit', type=int, default=1000, help='Maximum files to process')
    arg_parser.add_argument('--max-size', type=int, default=10 * 1024 * 1024, help='Maximum file size in bytes')
    opts = arg_parser.parse_args()
    batch_parse(opts.parser_type, opts.limit, opts.max_size)


if __name__ == '__main__':
    main()