initial
This commit is contained in:
5
app/discovery/__init__.py
Normal file
5
app/discovery/__init__.py
Normal file
@@ -0,0 +1,5 @@
|
||||
from .scanner import FileScanner, FilteredScanner
|
||||
from .system import SystemAPI
|
||||
from .engine import DiscoveryEngine
|
||||
from ._protocols import FileMeta, MountInfo, DiskInfo, IFileScanner, ISystemAPI
|
||||
__all__ = ['FileScanner', 'FilteredScanner', 'SystemAPI', 'DiscoveryEngine', 'FileMeta', 'MountInfo', 'DiskInfo', 'IFileScanner', 'ISystemAPI']
|
||||
37
app/discovery/_protocols.py
Normal file
37
app/discovery/_protocols.py
Normal file
@@ -0,0 +1,37 @@
|
||||
from typing import Iterator, Protocol, Any
|
||||
from pathlib import Path
|
||||
from dataclasses import dataclass
|
||||
|
||||
@dataclass
class FileMeta:
    """Metadata record describing one scanned file.

    Instances are produced by ``FileScanner._get_file_meta`` from the
    file's ``stat`` result.
    """

    # Location of the file that was stat'ed.
    path: Path
    # ``st_size`` of the file.
    size: int
    # ``st_mtime`` of the file.
    modified_time: float
    # ``st_birthtime`` when the platform exposes it, otherwise ``st_ctime``.
    created_time: float
|
||||
|
||||
@dataclass
class MountInfo:
    """Description of one mounted partition.

    ``SystemAPI.query_mounts`` fills this from a psutil partition entry.
    """

    # Device backing the mount (psutil ``device``).
    device: str
    # Directory where the device is mounted (psutil ``mountpoint``).
    mount_point: str
    # Filesystem type string (psutil ``fstype``).
    fs_type: str
    # Mount options string (psutil ``opts``).
    options: str
|
||||
|
||||
@dataclass
class DiskInfo:
    """Description of one block device as reported by ``SystemAPI``."""

    # Device node path, e.g. the ``/dev/<name>`` string built by SystemAPI.
    device: str
    # Model string, or 'Unknown' when it could not be determined.
    model: str
    # Size reported for the device; SystemAPI stores 0 when it cannot be read.
    size: int
    # Serial number string, or 'Unknown' when it could not be determined.
    serial: str
|
||||
|
||||
class IFileScanner(Protocol):
    """Structural interface for objects that enumerate files below a path."""

    def scan(self, root: Path) -> Iterator[FileMeta]:
        """Return an iterator of ``FileMeta`` records for ``root``."""
        ...
|
||||
|
||||
class ISystemAPI(Protocol):
    """Structural interface for querying mounts and disks of the host."""

    def query_mounts(self) -> list[MountInfo]:
        """Return one ``MountInfo`` per mounted partition."""
        ...

    def query_nvmes(self) -> list[DiskInfo]:
        """Return one ``DiskInfo`` per detected disk."""
        ...
|
||||
133
app/discovery/engine.py
Normal file
133
app/discovery/engine.py
Normal file
@@ -0,0 +1,133 @@
|
||||
from pathlib import Path
|
||||
from typing import Optional, Callable
|
||||
from datetime import datetime
|
||||
import psycopg2
|
||||
from psycopg2.extras import execute_batch
|
||||
from .scanner import FileScanner
|
||||
from .system import SystemAPI
|
||||
from ._protocols import FileMeta
|
||||
from ..shared.models import FileRecord, DiskInfo, ProcessingStats
|
||||
from ..shared.config import DatabaseConfig
|
||||
from ..shared.logger import ProgressLogger
|
||||
|
||||
class DiscoveryEngine:
    """Scan directory trees and index file metadata into a PostgreSQL table.

    The engine owns a lazily created psycopg2 connection. Use it as a
    context manager (or call :meth:`close`) to release the connection.
    """

    def __init__(self, db_config: DatabaseConfig, logger: ProgressLogger, batch_size: int = 1000):
        """Store configuration; no database connection is opened yet.

        Args:
            db_config: Connection settings (host, port, database, user, password).
            logger: Logger used for section headers, progress and errors.
            batch_size: Number of records written per INSERT batch/commit.

        Raises:
            ValueError: If ``batch_size`` is smaller than 1. Such values would
                otherwise fail later (modulo by zero in progress reporting,
                empty pages in ``execute_batch``).
        """
        if batch_size < 1:
            raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')
        self.db_config = db_config
        self.logger = logger
        self.batch_size = batch_size
        self.system_api = SystemAPI()
        self._connection = None

    def _get_connection(self):
        """Return the cached connection, (re)connecting if it is missing or closed."""
        if self._connection is None or self._connection.closed:
            self._connection = psycopg2.connect(
                host=self.db_config.host,
                port=self.db_config.port,
                database=self.db_config.database,
                user=self.db_config.user,
                password=self.db_config.password,
            )
        return self._connection

    def _ensure_tables(self):
        """Create the ``files`` table and its indexes if they do not exist.

        All statements run in one transaction; on any failure the transaction
        is rolled back so the connection stays usable, and the error is
        re-raised.
        """
        statements = (
            """
            CREATE TABLE IF NOT EXISTS files (
                id SERIAL PRIMARY KEY,
                path TEXT NOT NULL UNIQUE,
                size BIGINT NOT NULL,
                modified_time DOUBLE PRECISION NOT NULL,
                created_time DOUBLE PRECISION NOT NULL,
                disk_label TEXT NOT NULL,
                checksum TEXT,
                status TEXT DEFAULT 'indexed',
                category TEXT,
                duplicate_of TEXT,
                discovered_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
            """,
            # NOTE(review): the UNIQUE constraint on path already creates an
            # index in PostgreSQL, so idx_files_path looks redundant; kept to
            # avoid changing the schema.
            """
            CREATE INDEX IF NOT EXISTS idx_files_path ON files(path)
            """,
            """
            CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label)
            """,
            """
            CREATE INDEX IF NOT EXISTS idx_files_checksum ON files(checksum)
            """,
        )
        conn = self._get_connection()
        try:
            # The cursor context manager closes the cursor even on error.
            with conn.cursor() as cursor:
                for statement in statements:
                    cursor.execute(statement)
            conn.commit()
        except Exception:
            conn.rollback()
            raise

    def discover_path(
        self,
        root: Path,
        scanner: Optional[FileScanner] = None,
        progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None,
    ) -> ProcessingStats:
        """Scan ``root`` and upsert every discovered file into the database.

        Args:
            root: File or directory to scan.
            scanner: Scanner to use; defaults to a ``FileScanner`` whose errors
                are logged as warnings.
            progress_callback: Called after each committed batch with
                ``(files_processed, 0, stats)``; the total is passed as 0
                because it is not known while scanning.

        Returns:
            Statistics accumulated during the scan.

        Raises:
            Exception: Any error raised while scanning or writing is logged,
                the open transaction is rolled back, and the error re-raised.
                Batches committed before the failure remain in the database.
        """
        self.logger.section(f'Discovering: {root}')
        self._ensure_tables()

        if scanner is None:
            scanner = FileScanner(
                error_handler=lambda e, p: self.logger.warning(f'Error scanning {p}: {e}')
            )

        # Fall back to the root path itself as the label when no mounted
        # partition contains it.
        disk = self.system_api.get_disk_for_path(root)
        if disk is None:
            disk = str(root)

        stats = ProcessingStats()
        batch = []
        conn = self._get_connection()

        try:
            with conn.cursor() as cursor:
                for file_meta in scanner.scan(root):
                    record = FileRecord(
                        path=file_meta.path,
                        size=file_meta.size,
                        modified_time=file_meta.modified_time,
                        created_time=file_meta.created_time,
                        disk_label=disk,
                    )
                    batch.append(record)
                    stats.files_processed += 1
                    stats.bytes_processed += record.size

                    if len(batch) >= self.batch_size:
                        self._insert_batch(cursor, batch)
                        conn.commit()
                        batch.clear()

                        if progress_callback:
                            progress_callback(stats.files_processed, 0, stats)

                        # Log only every tenth batch to keep output readable.
                        if stats.files_processed % (self.batch_size * 10) == 0:
                            self.logger.progress(
                                stats.files_processed,
                                stats.files_processed,
                                prefix='Files discovered',
                                bytes_processed=stats.bytes_processed,
                                elapsed_seconds=stats.elapsed_seconds,
                            )

                # Flush the final, partially filled batch.
                if batch:
                    self._insert_batch(cursor, batch)
                    conn.commit()

            stats.files_succeeded = stats.files_processed
        except Exception as e:
            conn.rollback()
            self.logger.error(f'Discovery failed: {e}')
            raise

        self.logger.info(
            f'Discovery complete: {stats.files_processed} files, '
            f'{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s'
        )
        return stats

    def _insert_batch(self, cursor, batch: list[FileRecord]):
        """Upsert ``batch`` into ``files``; the caller is responsible for committing.

        On a path conflict only ``size``, ``modified_time`` and ``updated_at``
        are refreshed; the other stored columns are left as they are.
        """
        query = """
            INSERT INTO files (path, size, modified_time, created_time, disk_label, checksum, status, category, duplicate_of)
            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
            ON CONFLICT (path) DO UPDATE SET
                size = EXCLUDED.size,
                modified_time = EXCLUDED.modified_time,
                updated_at = CURRENT_TIMESTAMP
        """
        data = [
            (
                str(record.path),
                record.size,
                record.modified_time,
                record.created_time,
                record.disk_label,
                record.checksum,
                record.status,
                record.category,
                record.duplicate_of,
            )
            for record in batch
        ]
        execute_batch(cursor, query, data, page_size=self.batch_size)

    def get_disk_info(self) -> list[DiskInfo]:
        """Return usage information for every disk reported by the system API.

        A disk is matched to a mount by exact device name. Disks without a
        matching mount are reported with mount point ``/``, filesystem type
        ``'unknown'``, zero used bytes and their full size as free.
        """
        self.logger.subsection('Querying disk information')

        # Query the mount table once instead of once per disk; setdefault
        # keeps the first mount listed for a device.
        mounts_by_device = {}
        for mount in self.system_api.query_mounts():
            mounts_by_device.setdefault(mount.device, mount)

        disks = []
        for disk_info in self.system_api.query_nvmes():
            mount = mounts_by_device.get(disk_info.device)
            if mount is not None:
                mount_point = Path(mount.mount_point)
                fs_type = mount.fs_type
                total, used, free = self.system_api.get_disk_usage(mount_point)
            else:
                mount_point = None
                fs_type = 'unknown'
                total = disk_info.size
                used = 0
                free = disk_info.size

            disk = DiskInfo(
                name=disk_info.device,
                device=disk_info.device,
                mount_point=mount_point or Path('/'),
                total_size=total,
                used_size=used,
                free_size=free,
                fs_type=fs_type,
            )
            disks.append(disk)
            self.logger.info(f' {disk.name}: {disk.usage_percent:.1f}% used ({disk.used_size:,} / {disk.total_size:,} bytes)')
        return disks

    def get_file_count(self, disk: Optional[str] = None) -> int:
        """Return the number of indexed files, optionally limited to one disk label."""
        conn = self._get_connection()
        with conn.cursor() as cursor:
            if disk:
                cursor.execute('SELECT COUNT(*) FROM files WHERE disk_label = %s', (disk,))
            else:
                cursor.execute('SELECT COUNT(*) FROM files')
            return cursor.fetchone()[0]

    def get_total_size(self, disk: Optional[str] = None) -> int:
        """Return the summed size of indexed files, optionally for one disk label."""
        conn = self._get_connection()
        with conn.cursor() as cursor:
            if disk:
                cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files WHERE disk_label = %s', (disk,))
            else:
                cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files')
            total = cursor.fetchone()[0]
        # SUM over a BIGINT column is NUMERIC in PostgreSQL, which psycopg2
        # returns as Decimal; convert to honour the declared int return type.
        return int(total)

    def close(self):
        """Close the database connection if one is open."""
        if self._connection and (not self._connection.closed):
            self._connection.close()

    def __enter__(self):
        """Return the engine itself for use in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Close the connection on leaving the ``with`` block; exceptions propagate."""
        self.close()
|
||||
112
app/discovery/scanner.py
Normal file
112
app/discovery/scanner.py
Normal file
@@ -0,0 +1,112 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Iterator, Optional, Callable
|
||||
from datetime import datetime
|
||||
from ._protocols import FileMeta
|
||||
|
||||
class FileScanner:
    """Walk a path and yield a ``FileMeta`` for every regular entry found.

    The scanner keeps running counters (files, bytes, errors) that accumulate
    across calls to :meth:`scan` until :meth:`reset_stats` is called.
    """

    def __init__(
        self,
        follow_symlinks: bool = False,
        skip_hidden: bool = True,
        error_handler: Optional[Callable[[Exception, Path], None]] = None,
    ):
        """Configure the scanner.

        Args:
            follow_symlinks: Passed to ``os.walk`` as ``followlinks``.
            skip_hidden: Skip files and directories whose name starts with '.'.
            error_handler: Called with ``(exception, path)`` for every error
                encountered while scanning.
        """
        self.follow_symlinks = follow_symlinks
        self.skip_hidden = skip_hidden
        self.error_handler = error_handler
        self._files_scanned = 0
        self._bytes_scanned = 0
        self._errors = 0

    def _record_error(self, error: Exception, path: Path) -> bool:
        """Count ``error`` and forward it to the handler.

        Returns:
            True when an error handler was configured and has been called.
        """
        self._errors += 1
        if self.error_handler:
            self.error_handler(error, path)
            return True
        return False

    def scan(self, root: Path) -> Iterator[FileMeta]:
        """Yield metadata for ``root`` itself (if a file) or for files below it.

        Args:
            root: File or directory to scan.

        Yields:
            One ``FileMeta`` per file that could be stat'ed.

        Raises:
            FileNotFoundError: If ``root`` does not exist and no error handler
                is configured.
            Exception: If ``root`` is a single file that cannot be stat'ed and
                no error handler is configured.

        Errors on individual entries inside a directory tree never abort the
        scan: they are counted, passed to the error handler when one is set,
        and the entry is skipped.
        """
        if not root.exists():
            error = FileNotFoundError(f'Path does not exist: {root}')
            if not self._record_error(error, root):
                raise error
            return

        if not root.is_dir():
            # Keep the try body to the stat call only, so exceptions raised at
            # the yield point are not mistaken for scan errors.
            try:
                meta = self._get_file_meta(root)
            except Exception as e:
                if not self._record_error(e, root):
                    raise
                return
            self._files_scanned += 1
            self._bytes_scanned += meta.size
            yield meta
            return

        def on_walk_error(err: OSError) -> None:
            # os.walk ignores unreadable directories unless onerror is given;
            # report them like any other scan error.
            self._record_error(err, Path(err.filename) if err.filename else root)

        for dirpath, dirnames, filenames in os.walk(
            root, onerror=on_walk_error, followlinks=self.follow_symlinks
        ):
            current_dir = Path(dirpath)

            if self.skip_hidden:
                # In-place assignment prunes hidden directories from the walk.
                dirnames[:] = [d for d in dirnames if not d.startswith('.')]

            for filename in filenames:
                if self.skip_hidden and filename.startswith('.'):
                    continue

                file_path = current_dir / filename
                try:
                    # Broken symlinks cannot be stat'ed; skip them silently.
                    if file_path.is_symlink() and (not file_path.exists()):
                        continue
                    meta = self._get_file_meta(file_path)
                except Exception as e:
                    self._record_error(e, file_path)
                    continue

                self._files_scanned += 1
                self._bytes_scanned += meta.size
                yield meta

    def _get_file_meta(self, path: Path) -> FileMeta:
        """Stat ``path`` and build its ``FileMeta``.

        ``created_time`` uses ``st_birthtime`` when the platform provides it
        and falls back to ``st_ctime`` otherwise.
        """
        stat = path.stat()
        created_time = getattr(stat, 'st_birthtime', stat.st_ctime)
        return FileMeta(
            path=path,
            size=stat.st_size,
            modified_time=stat.st_mtime,
            created_time=created_time,
        )

    @property
    def files_scanned(self) -> int:
        """Number of files yielded since the last reset."""
        return self._files_scanned

    @property
    def bytes_scanned(self) -> int:
        """Sum of the sizes of all files yielded since the last reset."""
        return self._bytes_scanned

    @property
    def errors(self) -> int:
        """Number of errors encountered since the last reset."""
        return self._errors

    def reset_stats(self) -> None:
        """Reset the file, byte and error counters to zero."""
        self._files_scanned = 0
        self._bytes_scanned = 0
        self._errors = 0
|
||||
|
||||
class FilteredScanner(FileScanner):
    """``FileScanner`` that drops files by size, extension or path substring.

    Filtering is applied to the records yielded by the base scanner, so the
    base class counters also include files rejected here.
    """

    def __init__(
        self,
        min_size: Optional[int] = None,
        max_size: Optional[int] = None,
        extensions: Optional[list[str]] = None,
        exclude_patterns: Optional[list[str]] = None,
        **kwargs,
    ):
        """Configure the filters.

        Args:
            min_size: Reject files smaller than this size; ``None`` disables.
            max_size: Reject files larger than this size; ``None`` disables.
            extensions: Accepted file suffixes, compared case-insensitively.
                Entries may be given with or without the leading dot
                (``'.jpg'`` or ``'jpg'``); an empty string matches files
                without a suffix. ``None`` or an empty list disables.
            exclude_patterns: Plain substrings; a file is rejected when any of
                them occurs in its path string.
            **kwargs: Forwarded to ``FileScanner.__init__``.
        """
        super().__init__(**kwargs)
        self.min_size = min_size
        self.max_size = max_size
        self.extensions = (
            {self._normalize_extension(ext) for ext in extensions} if extensions else None
        )
        self.exclude_patterns = exclude_patterns or []

    @staticmethod
    def _normalize_extension(ext: str) -> str:
        """Lower-case ``ext`` and add the leading dot that ``Path.suffix`` carries."""
        ext = ext.lower()
        # Path.suffix is '' or starts with '.', so 'jpg' could never match
        # as given; '' is kept as-is so it still matches suffix-less files.
        if ext and not ext.startswith('.'):
            ext = '.' + ext
        return ext

    def scan(self, root: Path) -> Iterator[FileMeta]:
        """Yield the base scanner's records that pass every configured filter."""
        for meta in super().scan(root):
            if self.min_size is not None and meta.size < self.min_size:
                continue
            if self.max_size is not None and meta.size > self.max_size:
                continue
            if self.extensions is not None and meta.path.suffix.lower() not in self.extensions:
                continue
            if self._should_exclude(meta.path):
                continue
            yield meta

    def _should_exclude(self, path: Path) -> bool:
        """Return True when any exclude pattern is a substring of ``path``."""
        path_str = str(path)
        return any(pattern in path_str for pattern in self.exclude_patterns)
|
||||
119
app/discovery/system.py
Normal file
119
app/discovery/system.py
Normal file
@@ -0,0 +1,119 @@
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
import psutil
|
||||
from ._protocols import MountInfo, DiskInfo
|
||||
|
||||
class SystemAPI:
    """Query mounts, block devices and disk usage via psutil and ``lsblk``."""

    def query_mounts(self) -> list[MountInfo]:
        """Return one ``MountInfo`` per partition from ``psutil.disk_partitions(all=False)``."""
        return [
            MountInfo(
                device=partition.device,
                mount_point=partition.mountpoint,
                fs_type=partition.fstype,
                options=partition.opts,
            )
            for partition in psutil.disk_partitions(all=False)
        ]

    def query_nvmes(self) -> list[DiskInfo]:
        """Return the block devices reported by ``lsblk``.

        Despite the name, no filtering by device type is applied: every
        device listed by ``lsblk`` is returned. If ``lsblk`` cannot be run,
        fails, or yields nothing, the psutil based fallback is used instead.
        """
        # Local import, matching the function-scope import style already used
        # in this module.
        import shlex

        disks = []
        try:
            # --pairs emits KEY="value" tokens, so a MODEL containing spaces
            # (or an empty MODEL) cannot shift the SIZE/SERIAL columns the way
            # whitespace-separated output does.
            result = subprocess.run(
                ['lsblk', '-d', '-b', '-P', '-o', 'NAME,MODEL,SIZE,SERIAL'],
                capture_output=True,
                text=True,
                check=False,
            )
        except OSError:
            # lsblk missing or not executable: deliberately fall through to
            # the psutil fallback below.
            result = None

        if result is not None and result.returncode == 0:
            for line in result.stdout.splitlines():
                if not line.strip():
                    continue
                try:
                    tokens = shlex.split(line)
                except ValueError:
                    # Unbalanced quoting; skip the line rather than guess.
                    continue
                fields = dict(token.partition('=')[::2] for token in tokens)

                name = fields.get('NAME')
                if not name:
                    continue
                try:
                    size = int(fields.get('SIZE', ''))
                except ValueError:
                    size = 0

                disks.append(
                    DiskInfo(
                        device=f'/dev/{name}',
                        model=fields.get('MODEL', '').strip() or 'Unknown',
                        size=size,
                        serial=fields.get('SERIAL', '').strip() or 'Unknown',
                    )
                )

        if not disks:
            disks = self._query_disks_fallback()
        return disks

    def _query_disks_fallback(self) -> list[DiskInfo]:
        """Derive a disk list from psutil partitions when ``lsblk`` is unusable.

        Only ``/dev/`` devices are considered, one entry per base device. The
        reported size is the filesystem total of the first partition seen for
        that device (0 if it cannot be read), not the raw disk capacity.
        """
        disks = []
        seen_devices = set()
        for partition in psutil.disk_partitions(all=True):
            device = partition.device
            if not device.startswith('/dev/'):
                continue

            base_device = self._get_base_device(device)
            if base_device in seen_devices:
                continue
            seen_devices.add(base_device)

            try:
                size = psutil.disk_usage(partition.mountpoint).total
            except OSError:
                size = 0

            disks.append(
                DiskInfo(device=base_device, model='Unknown', size=size, serial='Unknown')
            )
        return disks

    def _get_base_device(self, device: str) -> str:
        """Map a partition device node to its parent disk device node.

        Examples: ``/dev/nvme0n1p2`` -> ``/dev/nvme0n1``,
        ``/dev/mmcblk0p1`` -> ``/dev/mmcblk0``, ``/dev/sda1`` -> ``/dev/sda``.
        Names that do not look like a partition (whole disks, ``/dev/loop0``,
        ``/dev/mapper/...``) are returned unchanged.
        """
        import re

        # Disks whose own name ends in a digit mark partitions with "p<N>".
        match = re.fullmatch(r'(/dev/[a-z]+\d+(?:n\d+)?)p\d+', device)
        if match:
            return match.group(1)

        # Letter-named disks append the partition number directly.
        match = re.fullmatch(r'(/dev/(?:sd|hd|vd|xvd)[a-z]+)\d+', device)
        if match:
            return match.group(1)

        return device

    def _partition_for_path(self, path: Path):
        """Return the psutil partition with the longest mount point containing ``path``.

        Returns ``None`` when no partition's mount point equals the resolved
        path or is one of its parents. On equal length the first listed wins.
        """
        resolved = path.resolve()
        best = None
        best_len = 0
        for partition in psutil.disk_partitions():
            mount_point = Path(partition.mountpoint)
            if resolved == mount_point or mount_point in resolved.parents:
                mount_len = len(str(mount_point))
                if mount_len > best_len:
                    best = partition
                    best_len = mount_len
        return best

    def get_disk_for_path(self, path: Path) -> Optional[str]:
        """Return the device of the partition that contains ``path``, or ``None``."""
        partition = self._partition_for_path(path)
        return None if partition is None else partition.device

    def get_disk_usage(self, path: Path) -> tuple[int, int, int]:
        """Return ``(total, used, free)`` for ``path``; ``(0, 0, 0)`` if it cannot be read."""
        try:
            usage = psutil.disk_usage(str(path))
        except OSError:
            return (0, 0, 0)
        return (usage.total, usage.used, usage.free)

    def get_mount_point(self, path: Path) -> Optional[Path]:
        """Return the mount point of the partition that contains ``path``, or ``None``."""
        partition = self._partition_for_path(path)
        return None if partition is None else Path(partition.mountpoint)

    def is_same_filesystem(self, path1: Path, path2: Path) -> bool:
        """Return True when both paths report the same ``st_dev``.

        Returns False when either path cannot be stat'ed.
        """
        try:
            return path1.stat().st_dev == path2.stat().st_dev
        except OSError:
            return False
|
||||
Reference in New Issue
Block a user