#!/usr/bin/env python3
"""Batch process files with incremental extraction."""
|
|
import argparse
import logging
import os
import sys
from pathlib import Path
from typing import Dict, List

import psycopg2
|
|
|
|
# Put this file's directory first on sys.path before the project-local
# ``extraction`` and ``parsers`` imports that follow.
sys.path.insert(0, str(Path(__file__).parent))
|
|
|
|
from extraction.incremental import IncrementalExtractor
|
|
from parsers.text_parser import TextParser
|
|
from parsers.code_parser import CodeParser
|
|
from parsers.pdf_parser import PDFParser
|
|
from parsers.image_parser import ImageParser
|
|
from parsers.audio_parser import AudioParser
|
|
from parsers.document_parser import DocumentParser
|
|
|
|
# Configure the root logger once at import time: INFO and above, with a
# timestamp and level name in front of every message.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger used by every function in this script.
logger = logging.getLogger(__name__)
|
|
|
|
# Disk label -> directory where that disk is mounted.  Every disk lives under
# /media/mike/<label>; labels missing from this map are treated as unmounted
# by get_files_to_parse().
DISK_MOUNT_MAP = {
    label: f'/media/mike/{label}'
    for label in ('SMT', 'DISK1', 'LLM', 'WINDOWS', 'Apps', 'Space', 'LVM')
}
|
|
|
|
# Connection settings handed to psycopg2.connect() and IncrementalExtractor.
#
# Each value can be overridden with an environment variable so credentials do
# not have to be edited into this script; the literals are fallbacks that keep
# the previous behaviour when nothing is set.
# NOTE(review): the password fallback is still a secret in source control --
# consider removing it once DISK_REORG_DB_PASSWORD is set wherever this runs.
DB_CONFIG = {
    'host': os.environ.get('DISK_REORG_DB_HOST', '192.168.1.159'),
    'database': os.environ.get('DISK_REORG_DB_NAME', 'disk_reorganizer_db'),
    'user': os.environ.get('DISK_REORG_DB_USER', 'disk_reorg_user'),
    'password': os.environ.get('DISK_REORG_DB_PASSWORD', 'heel-goed-wachtwoord'),
}
|
|
|
|
def get_files_to_parse(parser_type: str, limit: int, max_size: int = 10 * 1024 * 1024) -> List[Dict]:
    """Return not-yet-extracted files of one parser type that exist on disk.

    Selects rows from the ``files`` table where ``extracted_text IS NULL``,
    ``0 < size < max_size`` and ``path`` ends in one of the extensions
    associated with ``parser_type``.  A row is kept only if its disk label
    has an entry in ``DISK_MOUNT_MAP`` and the resulting path exists on the
    local filesystem.

    Args:
        parser_type: One of ``text``, ``code``, ``pdf``, ``image``,
            ``audio`` or ``document``.
        limit: Maximum number of rows fetched from the database.  It is
            applied before the mount/existence filtering, so fewer files
            may be returned.
        max_size: Exclusive upper bound on the file size, in bytes.

    Returns:
        A list of dicts with the keys ``path``, ``full_path`` (a ``Path``),
        ``disk_label``, ``size`` and ``checksum``.  The list is empty when
        ``parser_type`` is unknown; in that case no database connection is
        opened.
    """
    ext_map = {
        'text': ('.txt', '.md', '.log', '.json', '.yaml', '.yml', '.xml', '.csv'),
        'code': ('.py', '.js', '.java', '.go', '.rs', '.ts', '.tsx', '.jsx', '.cpp', '.h', '.c', '.php'),
        'pdf': ('.pdf',),
        'image': ('.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.webp'),
        'audio': ('.mp3', '.wav', '.flac', '.m4a', '.ogg'),
        'document': ('.docx', '.doc', '.odt', '.rtf'),
    }

    # Validate before connecting so an unknown type cannot leak a connection.
    extensions = ext_map.get(parser_type)
    if not extensions:
        logger.error(f'Unknown parser type: {parser_type}')
        return []

    # Only the number of placeholders is formatted into the SQL text; every
    # value is passed separately and quoted by the driver.
    # NOTE(review): LIKE is case-sensitive in PostgreSQL, so e.g. '.JPG' is not
    # matched -- confirm whether that is intended.
    like_clauses = ' OR '.join(['path LIKE %s'] * len(extensions))
    query = f'''
        SELECT path, disk_label, size, checksum
        FROM files
        WHERE extracted_text IS NULL
          AND size < %s
          AND size > 0
          AND ({like_clauses})
        LIMIT %s
    '''
    params = [max_size, *(f'%{ext}' for ext in extensions), limit]

    conn = psycopg2.connect(**DB_CONFIG)
    try:
        with conn.cursor() as cursor:
            cursor.execute(query, params)
            rows = cursor.fetchall()
    finally:
        # Close the connection even if the query raises.
        conn.close()

    files = []
    for path, disk, size, checksum in rows:
        mount = DISK_MOUNT_MAP.get(disk, '')
        if not mount:
            # Disk label has no known mount point.
            continue

        full_path = Path(mount) / path
        if full_path.exists():
            files.append({
                'path': path,
                'full_path': full_path,
                'disk_label': disk,
                'size': size,
                'checksum': checksum,
            })

    logger.info(f'Found {len(files)} {parser_type} files to parse')
    return files
|
|
|
|
def batch_parse(parser_type: str, limit: int, max_size: int):
    """Run one parser over a batch of unextracted files and log a summary.

    Looks up the candidate files with ``get_files_to_parse`` and hands them
    to ``IncrementalExtractor.batch_extract`` together with a callback that
    parses a single file.  The returned stats are then logged.

    Args:
        parser_type: One of ``text``, ``code``, ``pdf``, ``image``,
            ``audio`` or ``document``; anything else is logged as an error
            and nothing is processed.
        limit: Forwarded to ``get_files_to_parse``.
        max_size: Forwarded to ``get_files_to_parse``.

    Returns:
        None.
    """
    # Factories instead of instances: only the selected parser is built, so
    # a text run does not construct the audio parser as well.
    parser_factories = {
        'text': ('text_parser', TextParser),
        'code': ('code_parser', CodeParser),
        'pdf': ('pdf_parser', PDFParser),
        'image': ('image_parser', ImageParser),
        'audio': ('audio_parser', lambda: AudioParser(whisper_model='base')),
        'document': ('document_parser', DocumentParser),
    }

    if parser_type not in parser_factories:
        logger.error(f'Unknown parser type: {parser_type}')
        return

    parser_name, make_parser = parser_factories[parser_type]
    files = get_files_to_parse(parser_type, limit, max_size)

    if not files:
        logger.info(f'No files to parse for {parser_type}')
        return

    parser = make_parser()
    extractor = IncrementalExtractor(DB_CONFIG)

    logger.info(f'Starting batch parse of {len(files)} files with {parser_name}')

    # Index every file under both spellings of its path (absolute and as
    # stored in the database) so the callback is a dict lookup, not a scan of
    # the whole list.  setdefault keeps the first file when keys collide,
    # matching the previous first-match behaviour.
    full_path_by_key = {}
    for f in files:
        full_path_by_key.setdefault(str(f['full_path']), f['full_path'])
        full_path_by_key.setdefault(f['path'], f['full_path'])

    def parse_func(path):
        # Unknown paths are passed to the parser unchanged.
        return parser.parse(full_path_by_key.get(str(path), path))

    stats = extractor.batch_extract(
        files,
        parse_func,
        parser_name=parser_name,
        batch_size=100,
        skip_existing=True,
    )

    logger.info('\n=== BATCH PARSE COMPLETE ===')
    logger.info(f'Processed: {stats["processed"]}')
    logger.info(f'Extracted: {stats["extracted"]}')
    logger.info(f'Skipped: {stats["skipped"]}')
    logger.info(f'Errors: {stats["errors"]}')
    if stats['extracted'] > 0:
        # Guarded so a run that extracted nothing does not divide by zero.
        logger.info(f'Avg time: {stats["total_time_ms"]/stats["extracted"]:.1f}ms per file')
|
|
|
|
def main():
    """Parse the command line and run ``batch_parse`` with the chosen options.

    Takes a required positional ``parser_type`` plus the optional ``--limit``
    (default 1000) and ``--max-size`` (default 10 MiB, given in bytes).
    """
    cli = argparse.ArgumentParser(
        description='Batch process files with incremental extraction',
    )
    cli.add_argument(
        'parser_type',
        choices=['text', 'code', 'pdf', 'image', 'audio', 'document'],
        help='Type of parser to use',
    )
    cli.add_argument(
        '--limit',
        type=int,
        default=1000,
        help='Maximum files to process',
    )
    cli.add_argument(
        '--max-size',
        type=int,
        default=10 * 1024 * 1024,
        help='Maximum file size in bytes',
    )

    options = cli.parse_args()
    batch_parse(
        parser_type=options.parser_type,
        limit=options.limit,
        max_size=options.max_size,
    )
|
|
|
|
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported.
if __name__ == '__main__':
    main()
|