remove_doc

This commit is contained in:
mike
2025-12-13 04:23:04 +01:00
parent 75034d5e51
commit 9759001f4c
9 changed files with 741 additions and 1641 deletions

View File

@@ -1,10 +1,8 @@
"""Discovery engine coordinating scanner and system APIs"""
from pathlib import Path
from typing import Optional, Callable
from datetime import datetime
import psycopg2
from psycopg2.extras import execute_batch
from .scanner import FileScanner
from .system import SystemAPI
from ._protocols import FileMeta
@@ -12,23 +10,9 @@ from ..shared.models import FileRecord, DiskInfo, ProcessingStats
from ..shared.config import DatabaseConfig
from ..shared.logger import ProgressLogger
class DiscoveryEngine:
"""Discovery engine for scanning and cataloging files"""
def __init__(
    self,
    db_config: DatabaseConfig,
    logger: ProgressLogger,
    batch_size: int = 1000,
):
    """Initialize the discovery engine.

    Args:
        db_config: Database connection configuration.
        logger: Progress logger used for status output.
        batch_size: Number of records to accumulate before each
            database commit.
    """
    self.db_config = db_config
    self.logger = logger
    self.batch_size = batch_size
    # Connection is created lazily by _get_connection().
    self._connection = None
    # NOTE(review): other methods read self.system_api, but its
    # assignment is not visible in this view — presumably initialized
    # here in lines elided by the diff; confirm against the full file.
def _get_connection(self):
    """Return the cached database connection, reconnecting if needed.

    A new psycopg2 connection is created on first use, or whenever the
    previous connection has been closed.
    """
    if self._connection is None or self._connection.closed:
        self._connection = psycopg2.connect(
            host=self.db_config.host,
            port=self.db_config.port,
            database=self.db_config.database,
            user=self.db_config.user,
            password=self.db_config.password,
        )
    return self._connection
def _ensure_tables(self):
    """Create the files table and its indexes if they do not exist."""
    conn = self._get_connection()
    cursor = conn.cursor()
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS files (
            id SERIAL PRIMARY KEY,
            path TEXT NOT NULL UNIQUE,
            size BIGINT NOT NULL,
            modified_time DOUBLE PRECISION NOT NULL,
            created_time DOUBLE PRECISION NOT NULL,
            disk_label TEXT NOT NULL,
            checksum TEXT,
            status TEXT DEFAULT 'indexed',
            category TEXT,
            duplicate_of TEXT,
            discovered_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
    """)
    # Lookup indexes for path-based upserts, per-disk queries, and
    # checksum-based duplicate detection.
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_path ON files(path)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label)")
    cursor.execute("CREATE INDEX IF NOT EXISTS idx_files_checksum ON files(checksum)")
    conn.commit()
    cursor.close()
def discover_path(
    self,
    root: Path,
    scanner: Optional[FileScanner] = None,
    progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None,
) -> ProcessingStats:
    """Discover and catalog files under a path.

    Args:
        root: Root path to discover.
        scanner: Optional custom scanner; defaults to a FileScanner
            that logs scan errors as warnings.
        progress_callback: Optional callback receiving
            (files_processed, 0, stats) during the scan; the second
            argument is 0 because the total is unknown while scanning.

    Returns:
        ProcessingStats with discovery statistics.

    Raises:
        Exception: Any scan/database error is re-raised after rolling
            back the open transaction.
    """
    self.logger.section(f'Discovering: {root}')
    self._ensure_tables()
    if scanner is None:
        scanner = FileScanner(
            error_handler=lambda e, p: self.logger.warning(f'Error scanning {p}: {e}')
        )
    # Resolve the disk label for the root; fall back to the raw path
    # string when no disk can be determined.
    disk = self.system_api.get_disk_for_path(root)
    if disk is None:
        disk = str(root)
    stats = ProcessingStats()
    batch = []
    conn = self._get_connection()
    cursor = conn.cursor()
    try:
        for file_meta in scanner.scan(root):
            record = FileRecord(
                path=file_meta.path,
                size=file_meta.size,
                modified_time=file_meta.modified_time,
                created_time=file_meta.created_time,
                disk_label=disk,
            )
            batch.append(record)
            stats.files_processed += 1
            stats.bytes_processed += record.size
            # Flush to the database once a full batch has accumulated.
            if len(batch) >= self.batch_size:
                self._insert_batch(cursor, batch)
                conn.commit()
                batch.clear()
            if progress_callback:
                progress_callback(stats.files_processed, 0, stats)
            # Periodic log line every batch_size * 10 files; total is
            # unknown mid-scan, so the processed count stands in for it.
            if stats.files_processed % (self.batch_size * 10) == 0:
                self.logger.progress(
                    stats.files_processed,
                    stats.files_processed,
                    prefix='Files discovered',
                    bytes_processed=stats.bytes_processed,
                    elapsed_seconds=stats.elapsed_seconds,
                )
        # Insert any remaining partial batch.
        if batch:
            self._insert_batch(cursor, batch)
            conn.commit()
        stats.files_succeeded = stats.files_processed
    except Exception as e:
        conn.rollback()
        self.logger.error(f'Discovery failed: {e}')
        raise
    finally:
        cursor.close()
    self.logger.info(
        f'Discovery complete: {stats.files_processed} files, '
        f'{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s'
    )
    return stats
def _insert_batch(self, cursor, batch: list[FileRecord]):
    """Upsert a batch of file records, keyed on path.

    Rows that already exist (same path) have their size, modified_time
    and updated_at refreshed; all other stored columns are kept.

    Args:
        cursor: Open database cursor.
        batch: FileRecord objects to insert.
    """
    query = """
        INSERT INTO files (path, size, modified_time, created_time, disk_label, checksum, status, category, duplicate_of)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
        ON CONFLICT (path) DO UPDATE SET
            size = EXCLUDED.size,
            modified_time = EXCLUDED.modified_time,
            updated_at = CURRENT_TIMESTAMP
    """
    data = [
        (
            str(record.path),
            record.size,
            record.modified_time,
            record.created_time,
            record.disk_label,
            record.checksum,
            record.status,
            record.category,
            record.duplicate_of,
        )
        for record in batch
    ]
    # execute_batch groups rows into pages to cut round-trips.
    execute_batch(cursor, query, data, page_size=self.batch_size)
def get_disk_info(self) -> list[DiskInfo]:
    """Return DiskInfo for every NVMe device reported by the system API.

    Usage figures come from the mounted filesystem when the device is
    mounted; unmounted devices report the raw device size with zero
    usage and a mount point of '/'.

    Returns:
        List of DiskInfo objects.
    """
    self.logger.subsection('Querying disk information')
    disks = []
    for disk_info in self.system_api.query_nvmes():
        # Find the mount entry for this device, if any.
        mount_point = None
        fs_type = 'unknown'
        for mount in self.system_api.query_mounts():
            if mount.device == disk_info.device:
                mount_point = Path(mount.mount_point)
                fs_type = mount.fs_type
                break
        if mount_point:
            total, used, free = self.system_api.get_disk_usage(mount_point)
        else:
            # Not mounted: fall back to the raw device size.
            total = disk_info.size
            used = 0
            free = disk_info.size
        disk = DiskInfo(
            name=disk_info.device,
            device=disk_info.device,
            mount_point=mount_point or Path('/'),
            total_size=total,
            used_size=used,
            free_size=free,
            fs_type=fs_type,
        )
        disks.append(disk)
        self.logger.info(
            f" {disk.name}: {disk.usage_percent:.1f}% used "
            f"({disk.used_size:,} / {disk.total_size:,} bytes)"
        )
    return disks
def get_file_count(self, disk: Optional[str] = None) -> int:
    """Return the number of discovered files.

    Args:
        disk: Optional disk label filter; when None, count all files.

    Returns:
        Count of files.
    """
    conn = self._get_connection()
    cursor = conn.cursor()
    if disk:
        cursor.execute('SELECT COUNT(*) FROM files WHERE disk_label = %s', (disk,))
    else:
        cursor.execute('SELECT COUNT(*) FROM files')
    count = cursor.fetchone()[0]
    cursor.close()
    return count
def get_total_size(self, disk: Optional[str] = None) -> int:
    """Return the total size of discovered files in bytes.

    Args:
        disk: Optional disk label filter; when None, sum over all files.

    Returns:
        Total size in bytes (0 when no rows match, via COALESCE).
    """
    conn = self._get_connection()
    cursor = conn.cursor()
    if disk:
        cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files WHERE disk_label = %s', (disk,))
    else:
        cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files')
    total = cursor.fetchone()[0]
    cursor.close()
    return total
def close(self):
    """Close the database connection if one is open.

    Safe to call multiple times; a missing or already-closed
    connection is ignored.
    """
    if self._connection and not self._connection.closed:
        self._connection.close()
def __enter__(self):
    """Context manager entry: return the engine itself."""
    return self
def __exit__(self, exc_type, exc_val, exc_tb):
    """Context manager exit: close the database connection."""
    self.close()