base
This commit is contained in:
321
app/discovery/engine.py
Normal file
321
app/discovery/engine.py
Normal file
@@ -0,0 +1,321 @@
|
||||
"""Discovery engine coordinating scanner and system APIs"""
|
||||
from pathlib import Path
|
||||
from typing import Optional, Callable
|
||||
from datetime import datetime
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
|
||||
from .scanner import FileScanner
|
||||
from .system import SystemAPI
|
||||
from ._protocols import FileMeta
|
||||
from ..shared.models import FileRecord, DiskInfo, ProcessingStats
|
||||
from ..shared.config import DatabaseConfig
|
||||
from ..shared.logger import ProgressLogger
|
||||
|
||||
|
||||
class DiscoveryEngine:
    """Discovery engine for scanning and cataloging files"""

    def __init__(
        self,
        db_config: DatabaseConfig,
        logger: ProgressLogger,
        batch_size: int = 1000
    ):
        """Initialize discovery engine

        Args:
            db_config: Database configuration
            logger: Progress logger
            batch_size: Number of records to batch before database commit
        """
        self.db_config = db_config
        self.logger = logger
        self.batch_size = batch_size
        self.system_api = SystemAPI()
        # Lazily created psycopg2 connection; see _get_connection().
        self._connection = None

    def _get_connection(self):
        """Get or create database connection.

        Reconnects transparently when the cached connection is absent
        or has been closed.
        """
        if self._connection is None or self._connection.closed:
            self._connection = psycopg2.connect(
                host=self.db_config.host,
                port=self.db_config.port,
                database=self.db_config.database,
                user=self.db_config.user,
                password=self.db_config.password
            )
        return self._connection

    def _ensure_tables(self):
        """Ensure database tables and indexes exist.

        BUG FIX: the CREATE INDEX statements previously targeted a
        ``files_bak`` table that this method never creates, so index
        creation failed on a fresh database. All DDL now consistently
        targets ``files`` — the table created below, whose UNIQUE(path)
        constraint is also required by the ``ON CONFLICT (path)`` upsert
        in _insert_batch.
        """
        conn = self._get_connection()
        cursor = conn.cursor()
        try:
            # Create files table
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS files (
                    id SERIAL PRIMARY KEY,
                    path TEXT NOT NULL UNIQUE,
                    size BIGINT NOT NULL,
                    modified_time DOUBLE PRECISION NOT NULL,
                    created_time DOUBLE PRECISION NOT NULL,
                    disk TEXT NOT NULL,
                    checksum TEXT,
                    status TEXT DEFAULT 'indexed',
                    category TEXT,
                    duplicate_of TEXT,
                    discovered_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)

            # Index on path. (The UNIQUE constraint above already implies
            # an index, so this is redundant but harmless and idempotent.)
            cursor.execute("""
                CREATE INDEX IF NOT EXISTS idx_files_path ON files(path)
            """)

            # Create index on disk
            cursor.execute("""
                CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk)
            """)

            # Create index on checksum
            cursor.execute("""
                CREATE INDEX IF NOT EXISTS idx_files_checksum ON files(checksum)
            """)

            conn.commit()
        finally:
            # Always release the cursor, even if DDL fails.
            cursor.close()

    def discover_path(
        self,
        root: Path,
        scanner: Optional[FileScanner] = None,
        progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
    ) -> ProcessingStats:
        """Discover and catalog files in a path

        Args:
            root: Root path to discover
            scanner: Optional custom scanner (default: FileScanner())
            progress_callback: Optional callback for progress updates

        Returns:
            ProcessingStats with discovery statistics

        Raises:
            Exception: re-raises any scan/database error after rolling
                back the in-flight transaction.
        """
        self.logger.section(f"Discovering: {root}")

        # Ensure tables exist
        self._ensure_tables()

        # Create scanner if not provided
        if scanner is None:
            scanner = FileScanner(
                error_handler=lambda e, p: self.logger.warning(f"Error scanning {p}: {e}")
            )

        # Get disk info for the root path; fall back to the path string
        # when the system API cannot resolve a disk.
        disk = self.system_api.get_disk_for_path(root)
        if disk is None:
            disk = str(root)

        # Initialize statistics
        stats = ProcessingStats()
        batch = []

        conn = self._get_connection()
        cursor = conn.cursor()

        try:
            # Scan files
            for file_meta in scanner.scan(root):
                # Create file record
                record = FileRecord(
                    path=file_meta.path,
                    size=file_meta.size,
                    modified_time=file_meta.modified_time,
                    created_time=file_meta.created_time,
                    disk=disk
                )

                batch.append(record)
                stats.files_processed += 1
                stats.bytes_processed += record.size

                # Flush a full batch to the database.
                if len(batch) >= self.batch_size:
                    self._insert_batch(cursor, batch)
                    conn.commit()
                    batch.clear()

                    # Progress callback (total is unknown, so pass 0)
                    if progress_callback:
                        progress_callback(stats.files_processed, 0, stats)

                    # Log progress roughly every 10 batches
                    if stats.files_processed % (self.batch_size * 10) == 0:
                        self.logger.progress(
                            stats.files_processed,
                            stats.files_processed,  # We don't know total
                            prefix="Files discovered",
                            bytes_processed=stats.bytes_processed,
                            elapsed_seconds=stats.elapsed_seconds
                        )

            # Insert remaining partial batch
            if batch:
                self._insert_batch(cursor, batch)
                conn.commit()

            stats.files_succeeded = stats.files_processed

        except Exception as e:
            conn.rollback()
            self.logger.error(f"Discovery failed: {e}")
            raise

        finally:
            cursor.close()

        self.logger.info(
            f"Discovery complete: {stats.files_processed} files, "
            f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s"
        )

        return stats

    def _insert_batch(self, cursor, batch: list[FileRecord]):
        """Insert a batch of file records (upsert keyed on path).

        BUG FIX: the INSERT previously targeted ``files_bak``, while the
        schema created by _ensure_tables — including the UNIQUE(path)
        constraint required by ON CONFLICT — lives on ``files``.

        Args:
            cursor: Database cursor
            batch: List of FileRecord objects
        """
        query = """
            INSERT INTO files (path, size, modified_time, created_time, disk, checksum, status, category, duplicate_of)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT (path) DO UPDATE SET
                size = EXCLUDED.size,
                modified_time = EXCLUDED.modified_time,
                updated_at = CURRENT_TIMESTAMP
        """

        data = [
            (
                str(record.path),
                record.size,
                record.modified_time,
                record.created_time,
                record.disk,
                record.checksum,
                record.status,
                record.category,
                record.duplicate_of
            )
            for record in batch
        ]

        execute_batch(cursor, query, data, page_size=self.batch_size)

    def get_disk_info(self) -> list[DiskInfo]:
        """Get information about all disks

        Returns:
            List of DiskInfo objects
        """
        self.logger.subsection("Querying disk information")

        disks = []
        for disk_info in self.system_api.query_nvmes():
            # Get mount point if available
            mount_point = None
            fs_type = "unknown"

            for mount in self.system_api.query_mounts():
                if mount.device == disk_info.device:
                    mount_point = Path(mount.mount_point)
                    fs_type = mount.fs_type
                    break

            if mount_point:
                total, used, free = self.system_api.get_disk_usage(mount_point)
            else:
                # Unmounted device: report raw size with nothing used.
                total = disk_info.size
                used = 0
                free = disk_info.size

            disk = DiskInfo(
                name=disk_info.device,
                device=disk_info.device,
                mount_point=mount_point or Path("/"),
                total_size=total,
                used_size=used,
                free_size=free,
                fs_type=fs_type
            )
            disks.append(disk)

            self.logger.info(
                f" {disk.name}: {disk.usage_percent:.1f}% used "
                f"({disk.used_size:,} / {disk.total_size:,} bytes)"
            )

        return disks

    def get_file_count(self, disk: Optional[str] = None) -> int:
        """Get count of discovered files

        BUG FIX: queries ``files`` (the table populated by
        _insert_batch and created by _ensure_tables), not ``files_bak``.

        Args:
            disk: Optional disk filter

        Returns:
            Count of files
        """
        conn = self._get_connection()
        cursor = conn.cursor()
        try:
            if disk:
                cursor.execute("SELECT COUNT(*) FROM files WHERE disk = %s", (disk,))
            else:
                cursor.execute("SELECT COUNT(*) FROM files")
            count = cursor.fetchone()[0]
        finally:
            # Release the cursor even if the query fails.
            cursor.close()

        return count

    def get_total_size(self, disk: Optional[str] = None) -> int:
        """Get total size of discovered files

        BUG FIX: queries ``files`` instead of ``files_bak`` for the
        same reason as get_file_count.

        Args:
            disk: Optional disk filter

        Returns:
            Total size in bytes
        """
        conn = self._get_connection()
        cursor = conn.cursor()
        try:
            if disk:
                cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files WHERE disk = %s", (disk,))
            else:
                cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files")
            total = cursor.fetchone()[0]
        finally:
            # Release the cursor even if the query fails.
            cursor.close()

        return total

    def close(self):
        """Close database connection"""
        if self._connection and not self._connection.closed:
            self._connection.close()

    def __enter__(self):
        """Context manager entry"""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit"""
        self.close()
|
||||
Reference in New Issue
Block a user