"""Discovery engine coordinating scanner and system APIs""" from pathlib import Path from typing import Optional, Callable from datetime import datetime import psycopg2 from psycopg2.extras import execute_batch from .scanner import FileScanner from .system import SystemAPI from ._protocols import FileMeta from ..shared.models import FileRecord, DiskInfo, ProcessingStats from ..shared.config import DatabaseConfig from ..shared.logger import ProgressLogger class DiscoveryEngine: """Discovery engine for scanning and cataloging files""" def __init__( self, db_config: DatabaseConfig, logger: ProgressLogger, batch_size: int = 1000 ): """Initialize discovery engine Args: db_config: Database configuration logger: Progress logger batch_size: Number of records to batch before database commit """ self.db_config = db_config self.logger = logger self.batch_size = batch_size self.system_api = SystemAPI() self._connection = None def _get_connection(self): """Get or create database connection""" if self._connection is None or self._connection.closed: self._connection = psycopg2.connect( host=self.db_config.host, port=self.db_config.port, database=self.db_config.database, user=self.db_config.user, password=self.db_config.password ) return self._connection def _ensure_tables(self): """Ensure database tables exist""" conn = self._get_connection() cursor = conn.cursor() # Create files table cursor.execute(""" CREATE TABLE IF NOT EXISTS files ( id SERIAL PRIMARY KEY, path TEXT NOT NULL UNIQUE, size BIGINT NOT NULL, modified_time DOUBLE PRECISION NOT NULL, created_time DOUBLE PRECISION NOT NULL, disk_label TEXT NOT NULL, checksum TEXT, status TEXT DEFAULT 'indexed', category TEXT, duplicate_of TEXT, discovered_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ) """) # Create index on path cursor.execute(""" CREATE INDEX IF NOT EXISTS idx_files_path ON files(path) """) # Create index on disk cursor.execute(""" CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label) """) # Create index on checksum cursor.execute(""" CREATE INDEX IF NOT EXISTS idx_files_checksum ON files(checksum) """) conn.commit() cursor.close() def discover_path( self, root: Path, scanner: Optional[FileScanner] = None, progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None ) -> ProcessingStats: """Discover and catalog files in a path Args: root: Root path to discover scanner: Optional custom scanner (default: FileScanner()) progress_callback: Optional callback for progress updates Returns: ProcessingStats with discovery statistics """ self.logger.section(f"Discovering: {root}") # Ensure tables exist self._ensure_tables() # Create scanner if not provided if scanner is None: scanner = FileScanner( error_handler=lambda e, p: self.logger.warning(f"Error scanning {p}: {e}") ) # Get disk info for the root path disk = self.system_api.get_disk_for_path(root) if disk is None: disk = str(root) # Initialize statistics stats = ProcessingStats() batch = [] conn = self._get_connection() cursor = conn.cursor() try: # Scan files for file_meta in scanner.scan(root): # Create file record record = FileRecord( path=file_meta.path, size=file_meta.size, modified_time=file_meta.modified_time, created_time=file_meta.created_time, disk_label=disk ) batch.append(record) stats.files_processed += 1 stats.bytes_processed += record.size # Batch insert if len(batch) >= self.batch_size: self._insert_batch(cursor, batch) conn.commit() batch.clear() # Progress callback if progress_callback: progress_callback(stats.files_processed, 0, stats) # Log progress if stats.files_processed % (self.batch_size * 10) == 0: self.logger.progress( stats.files_processed, stats.files_processed, # We don't know total prefix="Files discovered", bytes_processed=stats.bytes_processed, elapsed_seconds=stats.elapsed_seconds ) # Insert remaining batch if batch: self._insert_batch(cursor, batch) conn.commit() stats.files_succeeded = stats.files_processed except Exception as e: conn.rollback() self.logger.error(f"Discovery failed: {e}") raise finally: cursor.close() self.logger.info( f"Discovery complete: {stats.files_processed} files, " f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s" ) return stats def _insert_batch(self, cursor, batch: list[FileRecord]): """Insert batch of file records Args: cursor: Database cursor batch: List of FileRecord objects """ query = """ INSERT INTO files (path, size, modified_time, created_time, disk_label, checksum, status, category, duplicate_of) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s) ON CONFLICT (path) DO UPDATE SET size = EXCLUDED.size, modified_time = EXCLUDED.modified_time, updated_at = CURRENT_TIMESTAMP """ data = [ ( str(record.path), record.size, record.modified_time, record.created_time, record.disk_label, record.checksum, record.status, record.category, record.duplicate_of ) for record in batch ] execute_batch(cursor, query, data, page_size=self.batch_size) def get_disk_info(self) -> list[DiskInfo]: """Get information about all disks Returns: List of DiskInfo objects """ self.logger.subsection("Querying disk information") disks = [] for disk_info in self.system_api.query_nvmes(): # Get mount point if available mount_point = None fs_type = "unknown" for mount in self.system_api.query_mounts(): if mount.device == disk_info.device: mount_point = Path(mount.mount_point) fs_type = mount.fs_type break if mount_point: total, used, free = self.system_api.get_disk_usage(mount_point) else: total = disk_info.size used = 0 free = disk_info.size disk = DiskInfo( name=disk_info.device, device=disk_info.device, mount_point=mount_point or Path("/"), total_size=total, used_size=used, free_size=free, fs_type=fs_type ) disks.append(disk) self.logger.info( f" {disk.name}: {disk.usage_percent:.1f}% used " f"({disk.used_size:,} / {disk.total_size:,} bytes)" ) return disks def get_file_count(self, disk: Optional[str] = None) -> int: """Get count of discovered files Args: disk: Optional disk filter Returns: Count of files """ conn = self._get_connection() cursor = conn.cursor() if disk: cursor.execute("SELECT COUNT(*) FROM files WHERE disk_label = %s", (disk,)) else: cursor.execute("SELECT COUNT(*) FROM files") count = cursor.fetchone()[0] cursor.close() return count def get_total_size(self, disk: Optional[str] = None) -> int: """Get total size of discovered files Args: disk: Optional disk filter Returns: Total size in bytes """ conn = self._get_connection() cursor = conn.cursor() if disk: cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files WHERE disk_label = %s", (disk,)) else: cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files") total = cursor.fetchone()[0] cursor.close() return total def close(self): """Close database connection""" if self._connection and not self._connection.closed: self._connection.close() def __enter__(self): """Context manager entry""" return self def __exit__(self, exc_type, exc_val, exc_tb): """Context manager exit""" self.close()