This commit is contained in:
mike
2025-12-12 19:25:16 +01:00
parent 5e0db89d45
commit 56b2db82fc
34 changed files with 117 additions and 6556 deletions

17
app/discovery/__init__.py Normal file
View File

@@ -0,0 +1,17 @@
"""Discovery package exports"""
from .scanner import FileScanner, FilteredScanner
from .system import SystemAPI
from .engine import DiscoveryEngine
from ._protocols import FileMeta, MountInfo, DiskInfo, IFileScanner, ISystemAPI
__all__ = [
'FileScanner',
'FilteredScanner',
'SystemAPI',
'DiscoveryEngine',
'FileMeta',
'MountInfo',
'DiskInfo',
'IFileScanner',
'ISystemAPI',
]

app/discovery/_protocols.py Normal file
View File

@@ -0,0 +1,54 @@
"""Protocol definitions for the discovery package"""
from typing import Iterator, Protocol, Any
from pathlib import Path
from dataclasses import dataclass
@dataclass
class FileMeta:
    """Metadata for a discovered file, as produced by a scanner's stat call."""
    path: Path            # location of the file on disk
    size: int             # size in bytes (stat st_size)
    modified_time: float  # POSIX timestamp of last modification (stat st_mtime)
    created_time: float   # st_birthtime where the platform provides it, else st_ctime
    # Add other metadata fields as needed
@dataclass
class MountInfo:
    """Information about a mounted filesystem."""
    device: str       # backing block device (e.g. /dev/sda1)
    mount_point: str  # directory where the filesystem is mounted
    fs_type: str      # filesystem type (e.g. ext4)
    options: str      # mount options string
    # Add other mount info fields as needed
@dataclass
class DiskInfo:
    """Information about a disk/NVMe device."""
    device: str  # device path (e.g. /dev/nvme0n1)
    model: str   # device model string; "Unknown" when not reported
    size: int    # capacity in bytes; 0 when it cannot be determined
    serial: str  # serial number; "Unknown" when not reported
    # Add other disk info fields as needed
class IFileScanner(Protocol):
    """Structural protocol for file scanning operations."""
    def scan(self, root: Path) -> Iterator[FileMeta]:
        """Scan a directory tree rooted at *root* and lazily yield one
        FileMeta per discovered file."""
        ...
class ISystemAPI(Protocol):
    """Structural protocol for system information queries."""
    def query_mounts(self) -> list[MountInfo]:
        """Return information about the currently mounted filesystems."""
        ...

    def query_nvmes(self) -> list[DiskInfo]:
        """Return information about the system's NVMe/disk devices."""
        ...

321
app/discovery/engine.py Normal file
View File

@@ -0,0 +1,321 @@
"""Discovery engine coordinating scanner and system APIs"""
from pathlib import Path
from typing import Optional, Callable
from datetime import datetime
import psycopg2
from psycopg2.extras import execute_batch
from .scanner import FileScanner
from .system import SystemAPI
from ._protocols import FileMeta
from ..shared.models import FileRecord, DiskInfo, ProcessingStats
from ..shared.config import DatabaseConfig
from ..shared.logger import ProgressLogger
class DiscoveryEngine:
    """Discovery engine for scanning and cataloging files.

    Walks a directory tree with a FileScanner and batch-upserts the
    discovered file metadata into a PostgreSQL catalog table.
    """

    def __init__(
        self,
        db_config: DatabaseConfig,
        logger: ProgressLogger,
        batch_size: int = 1000
    ):
        """Initialize discovery engine.

        Args:
            db_config: Database configuration
            logger: Progress logger
            batch_size: Number of records to batch before database commit
        """
        self.db_config = db_config
        self.logger = logger
        self.batch_size = batch_size
        self.system_api = SystemAPI()
        self._connection = None  # lazily opened psycopg2 connection

    def _get_connection(self):
        """Get or create the database connection, reconnecting if it was closed."""
        if self._connection is None or self._connection.closed:
            self._connection = psycopg2.connect(
                host=self.db_config.host,
                port=self.db_config.port,
                database=self.db_config.database,
                user=self.db_config.user,
                password=self.db_config.password
            )
        return self._connection

    def _ensure_tables(self):
        """Ensure the catalog table and its indexes exist.

        NOTE(review): every query in this class targets ``files_bak``, but the
        table was originally created as ``files`` — so the CREATE INDEX
        statements (and all later INSERT/SELECT) referenced a table that was
        never created. Standardized on ``files_bak`` here; confirm that this
        is the intended table name (the ``_bak`` suffix looks like a leftover).
        """
        conn = self._get_connection()
        cursor = conn.cursor()
        try:
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS files_bak (
                    id SERIAL PRIMARY KEY,
                    path TEXT NOT NULL UNIQUE,
                    size BIGINT NOT NULL,
                    modified_time DOUBLE PRECISION NOT NULL,
                    created_time DOUBLE PRECISION NOT NULL,
                    disk TEXT NOT NULL,
                    checksum TEXT,
                    status TEXT DEFAULT 'indexed',
                    category TEXT,
                    duplicate_of TEXT,
                    discovered_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            """)
            # Secondary indexes for the common lookup columns.
            cursor.execute(
                "CREATE INDEX IF NOT EXISTS idx_files_path ON files_bak(path)"
            )
            cursor.execute(
                "CREATE INDEX IF NOT EXISTS idx_files_disk ON files_bak(disk)"
            )
            cursor.execute(
                "CREATE INDEX IF NOT EXISTS idx_files_checksum ON files_bak(checksum)"
            )
            conn.commit()
        finally:
            # Close the cursor even if the DDL fails part-way through.
            cursor.close()

    def discover_path(
        self,
        root: Path,
        scanner: Optional[FileScanner] = None,
        progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
    ) -> ProcessingStats:
        """Discover and catalog files under a path.

        Args:
            root: Root path to discover
            scanner: Optional custom scanner (default: FileScanner())
            progress_callback: Optional callback, called per file as
                (files_processed, 0, stats); the total is unknown during
                discovery, hence the 0
        Returns:
            ProcessingStats with discovery statistics
        Raises:
            Exception: re-raises any scan/database error after rollback
        """
        self.logger.section(f"Discovering: {root}")
        self._ensure_tables()
        if scanner is None:
            scanner = FileScanner(
                error_handler=lambda e, p: self.logger.warning(f"Error scanning {p}: {e}")
            )
        # Attribute every record to the disk hosting the root path; fall back
        # to the root path string when the disk cannot be determined.
        disk = self.system_api.get_disk_for_path(root)
        if disk is None:
            disk = str(root)
        stats = ProcessingStats()
        batch = []
        conn = self._get_connection()
        cursor = conn.cursor()
        try:
            for file_meta in scanner.scan(root):
                record = FileRecord(
                    path=file_meta.path,
                    size=file_meta.size,
                    modified_time=file_meta.modified_time,
                    created_time=file_meta.created_time,
                    disk=disk
                )
                batch.append(record)
                stats.files_processed += 1
                stats.bytes_processed += record.size
                # Flush a full batch in one round-trip.
                if len(batch) >= self.batch_size:
                    self._insert_batch(cursor, batch)
                    conn.commit()
                    batch.clear()
                if progress_callback:
                    progress_callback(stats.files_processed, 0, stats)
                # Periodic progress logging (every batch_size * 10 files).
                if stats.files_processed % (self.batch_size * 10) == 0:
                    self.logger.progress(
                        stats.files_processed,
                        stats.files_processed,  # total unknown during discovery
                        prefix="Files discovered",
                        bytes_processed=stats.bytes_processed,
                        elapsed_seconds=stats.elapsed_seconds
                    )
            # Flush the final partial batch.
            if batch:
                self._insert_batch(cursor, batch)
                conn.commit()
            stats.files_succeeded = stats.files_processed
        except Exception as e:
            conn.rollback()
            self.logger.error(f"Discovery failed: {e}")
            raise
        finally:
            cursor.close()
        self.logger.info(
            f"Discovery complete: {stats.files_processed} files, "
            f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s"
        )
        return stats

    def _insert_batch(self, cursor, batch: list[FileRecord]):
        """Upsert a batch of file records, keyed on path.

        Existing rows get their size/modified_time refreshed; other columns
        keep their stored values.

        Args:
            cursor: Database cursor
            batch: List of FileRecord objects
        """
        query = """
            INSERT INTO files_bak (path, size, modified_time, created_time, disk, checksum, status, category, duplicate_of)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT (path) DO UPDATE SET
                size = EXCLUDED.size,
                modified_time = EXCLUDED.modified_time,
                updated_at = CURRENT_TIMESTAMP
        """
        data = [
            (
                str(record.path),
                record.size,
                record.modified_time,
                record.created_time,
                record.disk,
                record.checksum,
                record.status,
                record.category,
                record.duplicate_of
            )
            for record in batch
        ]
        execute_batch(cursor, query, data, page_size=self.batch_size)

    def get_disk_info(self) -> list[DiskInfo]:
        """Get information about all disks.

        Returns:
            List of DiskInfo objects
        """
        self.logger.subsection("Querying disk information")
        disks = []
        # Hoisted out of the loop: one mount query instead of one per disk.
        mounts = self.system_api.query_mounts()
        for disk_info in self.system_api.query_nvmes():
            # Find the mount backed by this device, if any.
            mount_point = None
            fs_type = "unknown"
            for mount in mounts:
                if mount.device == disk_info.device:
                    mount_point = Path(mount.mount_point)
                    fs_type = mount.fs_type
                    break
            if mount_point:
                total, used, free = self.system_api.get_disk_usage(mount_point)
            else:
                # Unmounted disk: report raw capacity with no usage data.
                total = disk_info.size
                used = 0
                free = disk_info.size
            disk = DiskInfo(
                name=disk_info.device,
                device=disk_info.device,
                mount_point=mount_point or Path("/"),
                total_size=total,
                used_size=used,
                free_size=free,
                fs_type=fs_type
            )
            disks.append(disk)
            self.logger.info(
                f" {disk.name}: {disk.usage_percent:.1f}% used "
                f"({disk.used_size:,} / {disk.total_size:,} bytes)"
            )
        return disks

    def get_file_count(self, disk: Optional[str] = None) -> int:
        """Get count of discovered files.

        Args:
            disk: Optional disk filter
        Returns:
            Count of files
        """
        conn = self._get_connection()
        cursor = conn.cursor()
        try:
            if disk:
                cursor.execute("SELECT COUNT(*) FROM files_bak WHERE disk = %s", (disk,))
            else:
                cursor.execute("SELECT COUNT(*) FROM files_bak")
            return cursor.fetchone()[0]
        finally:
            cursor.close()

    def get_total_size(self, disk: Optional[str] = None) -> int:
        """Get total size of discovered files.

        Args:
            disk: Optional disk filter
        Returns:
            Total size in bytes (0 when no rows match)
        """
        conn = self._get_connection()
        cursor = conn.cursor()
        try:
            if disk:
                cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files_bak WHERE disk = %s", (disk,))
            else:
                cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files_bak")
            return cursor.fetchone()[0]
        finally:
            cursor.close()

    def close(self):
        """Close the database connection if it is open."""
        if self._connection and not self._connection.closed:
            self._connection.close()

    def __enter__(self):
        """Context manager entry: return self."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Context manager exit: close the database connection."""
        self.close()

216
app/discovery/scanner.py Normal file
View File

@@ -0,0 +1,216 @@
"""File system scanner implementing IFileScanner protocol"""
import os
from pathlib import Path
from typing import Iterator, Optional, Callable
from datetime import datetime
from ._protocols import FileMeta
class FileScanner:
    """File system scanner with filtering and error handling."""

    def __init__(
        self,
        follow_symlinks: bool = False,
        skip_hidden: bool = True,
        error_handler: Optional[Callable[[Exception, Path], None]] = None
    ):
        """Initialize file scanner.

        Args:
            follow_symlinks: Whether to follow symbolic links
            skip_hidden: Whether to skip hidden files/directories
            error_handler: Optional callback invoked as (exception, path) for
                errors during scan; when absent, the missing-root and
                single-file errors are raised
        """
        self.follow_symlinks = follow_symlinks
        self.skip_hidden = skip_hidden
        self.error_handler = error_handler
        self._files_scanned = 0
        self._bytes_scanned = 0
        self._errors = 0

    def scan(self, root: Path) -> Iterator[FileMeta]:
        """Scan a directory tree and yield file metadata.

        Args:
            root: Root directory (or single file) to scan
        Yields:
            FileMeta objects for each discovered file
        Raises:
            FileNotFoundError: If root does not exist and no error_handler is set
            OSError: If a single-file root cannot be read and no error_handler is set
        """
        if not root.exists():
            error = FileNotFoundError(f"Path does not exist: {root}")
            # Consistency fix: count this as an error like every other
            # error path in this scanner.
            self._errors += 1
            if self.error_handler:
                self.error_handler(error, root)
            else:
                raise error
            return

        if not root.is_dir():
            # Root is a single file: yield just its metadata.
            try:
                meta = self._get_file_meta(root)
            except Exception as e:
                self._errors += 1
                if self.error_handler:
                    self.error_handler(e, root)
                else:
                    raise
            else:
                # Bug fix: single-file scans now update the statistics,
                # matching the directory-walk path below.
                self._files_scanned += 1
                self._bytes_scanned += meta.size
                yield meta
            return

        # Walk the directory tree.
        for dirpath, dirnames, filenames in os.walk(root, followlinks=self.follow_symlinks):
            current_dir = Path(dirpath)
            if self.skip_hidden:
                # Prune hidden directories in place so os.walk skips them.
                dirnames[:] = [d for d in dirnames if not d.startswith('.')]
            for filename in filenames:
                if self.skip_hidden and filename.startswith('.'):
                    continue
                file_path = current_dir / filename
                try:
                    # Skip broken symlinks.
                    if file_path.is_symlink() and not file_path.exists():
                        continue
                    meta = self._get_file_meta(file_path)
                    self._files_scanned += 1
                    self._bytes_scanned += meta.size
                    yield meta
                except Exception as e:
                    # Single handler for PermissionError and everything else
                    # (the original two branches were identical). Errors never
                    # abort the walk; they are counted and optionally reported.
                    self._errors += 1
                    if self.error_handler:
                        self.error_handler(e, file_path)
                    continue

    def _get_file_meta(self, path: Path) -> FileMeta:
        """Build a FileMeta from stat information.

        Args:
            path: Path to file
        Returns:
            FileMeta object with file metadata
        Raises:
            OSError: If file cannot be accessed
        """
        stat_result = path.stat()
        # Prefer st_birthtime (true creation time on macOS/BSD); st_ctime is
        # metadata-change time on Linux but the best available fallback.
        created_time = getattr(stat_result, 'st_birthtime', stat_result.st_ctime)
        return FileMeta(
            path=path,
            size=stat_result.st_size,
            modified_time=stat_result.st_mtime,
            created_time=created_time
        )

    @property
    def files_scanned(self) -> int:
        """Get count of files scanned."""
        return self._files_scanned

    @property
    def bytes_scanned(self) -> int:
        """Get total bytes scanned."""
        return self._bytes_scanned

    @property
    def errors(self) -> int:
        """Get count of errors encountered."""
        return self._errors

    def reset_stats(self) -> None:
        """Reset scanning statistics to zero."""
        self._files_scanned = 0
        self._bytes_scanned = 0
        self._errors = 0
class FilteredScanner(FileScanner):
    """Scanner that applies size, extension, and path-pattern filters on top
    of the base scan."""

    def __init__(
        self,
        min_size: Optional[int] = None,
        max_size: Optional[int] = None,
        extensions: Optional[list[str]] = None,
        exclude_patterns: Optional[list[str]] = None,
        **kwargs
    ):
        """Initialize filtered scanner.

        Args:
            min_size: Minimum file size in bytes
            max_size: Maximum file size in bytes
            extensions: File extensions to include (e.g., ['.txt', '.py'])
            exclude_patterns: Substring patterns that exclude a path
            **kwargs: Forwarded to FileScanner
        """
        super().__init__(**kwargs)
        self.min_size = min_size
        self.max_size = max_size
        # Normalize to lowercase for case-insensitive suffix matching.
        self.extensions = {ext.lower() for ext in extensions} if extensions else None
        self.exclude_patterns = exclude_patterns or []

    def scan(self, root: Path) -> Iterator[FileMeta]:
        """Yield only the files from the base scan that pass every filter.

        Args:
            root: Root directory to scan
        Yields:
            FileMeta objects for files matching the filter criteria
        """
        for candidate in super().scan(root):
            if self._passes_filters(candidate):
                yield candidate

    def _passes_filters(self, meta: FileMeta) -> bool:
        """Return True when *meta* satisfies size, extension, and exclusion rules."""
        too_small = self.min_size is not None and meta.size < self.min_size
        too_large = self.max_size is not None and meta.size > self.max_size
        if too_small or too_large:
            return False
        if self.extensions is not None and meta.path.suffix.lower() not in self.extensions:
            return False
        return not self._should_exclude(meta.path)

    def _should_exclude(self, path: Path) -> bool:
        """Check if path matches any exclude pattern.

        Args:
            path: Path to check
        Returns:
            True if path should be excluded
        """
        text = str(path)
        return any(pattern in text for pattern in self.exclude_patterns)

236
app/discovery/system.py Normal file
View File

@@ -0,0 +1,236 @@
"""System API for querying mounts and disks"""
import os
import subprocess
from pathlib import Path
from typing import Optional
import psutil
from ._protocols import MountInfo, DiskInfo
class SystemAPI:
    """System information API for querying mounts and disks."""

    def query_mounts(self) -> list[MountInfo]:
        """Query mounted filesystems.

        Returns:
            List of MountInfo objects for all mounted filesystems
        """
        return [
            MountInfo(
                device=partition.device,
                mount_point=partition.mountpoint,
                fs_type=partition.fstype,
                options=partition.opts
            )
            for partition in psutil.disk_partitions(all=False)
        ]

    def query_nvmes(self) -> list[DiskInfo]:
        """Query NVMe/disk information via lsblk, falling back to psutil.

        Returns:
            List of DiskInfo objects for all disks
        """
        disks: list[DiskInfo] = []
        try:
            result = subprocess.run(
                ['lsblk', '-ndo', 'NAME,MODEL,SIZE,SERIAL', '-b'],
                capture_output=True,
                text=True,
                check=False
            )
            if result.returncode == 0:
                for line in result.stdout.strip().split('\n'):
                    if not line.strip():
                        continue
                    # NOTE(review): split(maxsplit=3) misparses models that
                    # contain spaces (SIZE lands inside MODEL) — confirm, or
                    # switch to lsblk's JSON output (-J).
                    parts = line.split(maxsplit=3)
                    if len(parts) >= 3:
                        device = f"/dev/{parts[0]}"
                        model = parts[1] if len(parts) > 1 else "Unknown"
                        size_str = parts[2] if len(parts) > 2 else "0"
                        serial = parts[3] if len(parts) > 3 else "Unknown"
                        try:
                            size = int(size_str)
                        except ValueError:
                            size = 0
                        disks.append(DiskInfo(
                            device=device,
                            model=model,
                            size=size,
                            serial=serial
                        ))
        except FileNotFoundError:
            # lsblk not installed on this system; use the fallback below.
            pass
        if not disks:
            disks = self._query_disks_fallback()
        return disks

    def _query_disks_fallback(self) -> list[DiskInfo]:
        """Fallback method for querying disk information.

        Returns:
            List of DiskInfo objects built from psutil partition data
            (model/serial are not available this way)
        """
        disks = []
        seen_devices = set()
        for partition in psutil.disk_partitions(all=True):
            device = partition.device
            # Skip pseudo-filesystems (tmpfs, overlay, ...).
            if not device.startswith('/dev/'):
                continue
            # Deduplicate partitions down to their base device.
            base_device = self._get_base_device(device)
            if base_device in seen_devices:
                continue
            seen_devices.add(base_device)
            try:
                usage = psutil.disk_usage(partition.mountpoint)
                size = usage.total
            except (PermissionError, OSError):
                size = 0
            disks.append(DiskInfo(
                device=base_device,
                model="Unknown",
                size=size,
                serial="Unknown"
            ))
        return disks

    def _get_base_device(self, device: str) -> str:
        """Extract the base (whole-disk) device from a partition device.

        Args:
            device: Device path (e.g., /dev/sda1, /dev/nvme0n1p1)
        Returns:
            Base device path (e.g., /dev/sda, /dev/nvme0n1)
        """
        import re
        if 'nvme' in device:
            # /dev/nvme0n1p1 -> /dev/nvme0n1. Anchored pattern: the original
            # `'p' in device` check truncated at any stray 'p' in the name.
            match = re.match(r'^(/dev/nvme\d+n\d+)p\d+$', device)
            return match.group(1) if match else device
        # Standard devices: /dev/sda1 -> /dev/sda
        match = re.match(r'(/dev/[a-z]+)', device)
        if match:
            return match.group(1)
        return device

    def _find_mount_partition(self, path: Path):
        """Return the psutil partition whose mount point is the longest
        prefix of *path*, or None when no mount contains it.

        Shared by get_disk_for_path and get_mount_point (previously two
        duplicated copies of this search).
        """
        path = path.resolve()
        best = None
        best_len = 0
        for partition in psutil.disk_partitions():
            mount_point = Path(partition.mountpoint)
            try:
                if path == mount_point or mount_point in path.parents:
                    mount_len = len(str(mount_point))
                    if mount_len > best_len:
                        best = partition
                        best_len = mount_len
            except (ValueError, OSError):
                continue
        return best

    def get_disk_for_path(self, path: Path) -> Optional[str]:
        """Get the disk/mount device for a given path.

        Args:
            path: Path to check
        Returns:
            Mount point device or None if not found
        """
        partition = self._find_mount_partition(path)
        return partition.device if partition else None

    def get_mount_point(self, path: Path) -> Optional[Path]:
        """Get the mount point for a given path.

        Args:
            path: Path to check
        Returns:
            Mount point path or None if not found
        """
        partition = self._find_mount_partition(path)
        return Path(partition.mountpoint) if partition else None

    def get_disk_usage(self, path: Path) -> tuple[int, int, int]:
        """Get disk usage for a path.

        Args:
            path: Path to check
        Returns:
            Tuple of (total, used, free) in bytes; (0, 0, 0) when unreadable
        """
        try:
            usage = psutil.disk_usage(str(path))
            return usage.total, usage.used, usage.free
        except (PermissionError, OSError):
            return 0, 0, 0

    def is_same_filesystem(self, path1: Path, path2: Path) -> bool:
        """Check if two paths are on the same filesystem.

        Args:
            path1: First path
            path2: Second path
        Returns:
            True if paths are on the same filesystem (same st_dev);
            False if either path cannot be stat'ed
        """
        try:
            stat1 = path1.stat()
            stat2 = path2.stat()
            return stat1.st_dev == stat2.st_dev
        except (OSError, PermissionError):
            return False