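"""Discovery engine: scans filesystem trees and indexes file metadata in PostgreSQL.

DiscoveryEngine streams files from a FileScanner, batches them, and upserts
them into a `files` table keyed on path. It also reports per-disk usage via
SystemAPI.
"""
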
from pathlib import Path
from typing import Callable, Optional

import psycopg2
from psycopg2.extras import execute_batch

from .scanner import FileScanner
from .system import SystemAPI
from ..shared.config import DatabaseConfig
from ..shared.logger import ProgressLogger
from ..shared.models import DiskInfo, FileRecord, ProcessingStats


class DiscoveryEngine:
    """Walks filesystem trees and indexes file metadata into PostgreSQL."""

    def __init__(self, db_config: DatabaseConfig, logger: ProgressLogger, batch_size: int = 1000):
        self.db_config = db_config
        self.logger = logger
        self.batch_size = batch_size
        self.system_api = SystemAPI()
        self._connection = None  # lazily opened, cached across calls

    def _get_connection(self):
        """Return the cached connection, reconnecting if it was closed."""
        if self._connection is None or self._connection.closed:
            self._connection = psycopg2.connect(
                host=self.db_config.host,
                port=self.db_config.port,
                database=self.db_config.database,
                user=self.db_config.user,
                password=self.db_config.password,
            )
        return self._connection

    def _ensure_tables(self):
        """Create the files table and its indexes if they do not exist."""
        conn = self._get_connection()
        cursor = conn.cursor()
        cursor.execute("""
            CREATE TABLE IF NOT EXISTS files (
                id SERIAL PRIMARY KEY,
                path TEXT NOT NULL UNIQUE,
                size BIGINT NOT NULL,
                modified_time DOUBLE PRECISION NOT NULL,
                created_time DOUBLE PRECISION NOT NULL,
                disk_label TEXT NOT NULL,
                checksum TEXT,
                status TEXT DEFAULT 'indexed',
                category TEXT,
                duplicate_of TEXT,
                discovered_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(path)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label)")
        cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_checksum ON files(checksum)")
        conn.commit()
        cursor.close()

    def discover_path(
        self,
        root: Path,
        scanner: Optional[FileScanner] = None,
        progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None,
    ) -> ProcessingStats:
        """Scan `root` recursively and upsert every file found into the database.

        Records are flushed in batches of `batch_size`; each flush is committed,
        so a crash mid-scan loses at most one batch.
        """
        self.logger.section(f'Discovering: {root}')
        self._ensure_tables()

        if scanner is None:
            scanner = FileScanner(
                error_handler=lambda e, p: self.logger.warning(f'Error scanning {p}: {e}')
            )

        # Label every record with the disk backing `root`; fall back to the
        # path itself when the disk cannot be resolved.
        disk = self.system_api.get_disk_for_path(root)
        if disk is None:
            disk = str(root)

        stats = ProcessingStats()
        batch = []
        conn = self._get_connection()
        cursor = conn.cursor()
        try:
            for file_meta in scanner.scan(root):
                record = FileRecord(
                    path=file_meta.path,
                    size=file_meta.size,
                    modified_time=file_meta.modified_time,
                    created_time=file_meta.created_time,
                    disk_label=disk,
                )
                batch.append(record)
                stats.files_processed += 1
                stats.bytes_processed += record.size

                if len(batch) >= self.batch_size:
                    self._insert_batch(cursor, batch)
                    conn.commit()
                    batch.clear()
                    if progress_callback:
                        # The total is unknown while streaming, hence the 0.
                        progress_callback(stats.files_processed, 0, stats)
                    if stats.files_processed % (self.batch_size * 10) == 0:
                        self.logger.progress(
                            stats.files_processed,
                            stats.files_processed,
                            prefix='Files discovered',
                            bytes_processed=stats.bytes_processed,
                            elapsed_seconds=stats.elapsed_seconds,
                        )

            # Flush whatever remains from the last partial batch.
            if batch:
                self._insert_batch(cursor, batch)
                conn.commit()
            stats.files_succeeded = stats.files_processed
        except Exception as e:
            conn.rollback()
            self.logger.error(f'Discovery failed: {e}')
            raise
        finally:
            cursor.close()

        self.logger.info(
            f'Discovery complete: {stats.files_processed} files, '
            f'{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s'
        )
        return stats

    def _insert_batch(self, cursor, batch: list[FileRecord]):
        """Upsert a batch of records; existing paths get their metadata refreshed."""
        query = """
            INSERT INTO files (path, size, modified_time, created_time, disk_label,
                               checksum, status, category, duplicate_of)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT (path) DO UPDATE SET
                size = EXCLUDED.size,
                modified_time = EXCLUDED.modified_time,
                updated_at = CURRENT_TIMESTAMP
        """
        data = [
            (str(record.path), record.size, record.modified_time, record.created_time,
             record.disk_label, record.checksum, record.status, record.category,
             record.duplicate_of)
            for record in batch
        ]
        execute_batch(cursor, query, data, page_size=self.batch_size)

    def get_disk_info(self) -> list[DiskInfo]:
        """Enumerate NVMe devices, matching each to its mount point and usage."""
        self.logger.subsection('Querying disk information')
        disks = []
        for disk_info in self.system_api.query_nvmes():
            # Find where (and whether) this device is mounted.
            mount_point = None
            fs_type = 'unknown'
            for mount in self.system_api.query_mounts():
                if mount.device == disk_info.device:
                    mount_point = Path(mount.mount_point)
                    fs_type = mount.fs_type
                    break

            if mount_point:
                total, used, free = self.system_api.get_disk_usage(mount_point)
            else:
                # Unmounted: report the raw device size with no usage data.
                total = disk_info.size
                used = 0
                free = disk_info.size

            disk = DiskInfo(
                name=disk_info.device,
                device=disk_info.device,
                mount_point=mount_point or Path('/'),
                total_size=total,
                used_size=used,
                free_size=free,
                fs_type=fs_type,
            )
            disks.append(disk)
            self.logger.info(
                f'  {disk.name}: {disk.usage_percent:.1f}% used '
                f'({disk.used_size:,} / {disk.total_size:,} bytes)'
            )
        return disks

    def get_file_count(self, disk: Optional[str] = None) -> int:
        """Count indexed files, optionally restricted to one disk label."""
        conn = self._get_connection()
        cursor = conn.cursor()
        if disk:
            cursor.execute('SELECT COUNT(*) FROM files WHERE disk_label = %s', (disk,))
        else:
            cursor.execute('SELECT COUNT(*) FROM files')
        count = cursor.fetchone()[0]
        cursor.close()
        return count

    def get_total_size(self, disk: Optional[str] = None) -> int:
        """Sum the sizes of indexed files, optionally restricted to one disk label."""
        conn = self._get_connection()
        cursor = conn.cursor()
        if disk:
            cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files WHERE disk_label = %s', (disk,))
        else:
            cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files')
        total = cursor.fetchone()[0]
        cursor.close()
        return total

    def close(self):
        """Close the cached database connection if it is open."""
        if self._connection and not self._connection.closed:
            self._connection.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.close()
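
# Example usage (a minimal sketch, not part of the module's API; the
# DatabaseConfig and ProgressLogger constructor calls below are assumptions --
# check ..shared.config and ..shared.logger for their real signatures):
#
#     config = DatabaseConfig(host='localhost', port=5432, database='catalog',
#                             user='indexer', password='...')
#     with DiscoveryEngine(config, ProgressLogger(), batch_size=5000) as engine:
#         stats = engine.discover_path(Path('/mnt/archive'))
#         print(f'{stats.files_processed} files, {stats.bytes_processed:,} bytes')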