remove_doc

2025-12-13 04:23:04 +01:00
parent 75034d5e51
commit 9759001f4c
9 changed files with 741 additions and 1641 deletions
--- a/app/content/__init__.py
+++ b/app/content/__init__.py
@@ -0,0 +1,3 @@
+from .profiler import ContentProfiler
+from .extractors import ContentExtractor
+__all__ = ['ContentProfiler', 'ContentExtractor']
--- a/app/content/extractors.py
+++ b/app/content/extractors.py
@@ -3,22 +3,14 @@ from typing import Dict, Optional
 import json

 class ContentExtractor:
+
    def __init__(self):
-        self.extractors = {
-            'pdf_text': self._extract_pdf,
-            'ocr+caption': self._extract_image,
-            'transcribe': self._extract_audio,
-            'transcribe+scenes': self._extract_video,
-            'office_text': self._extract_document,
-            'read': self._extract_text,
-            'read+syntax': self._extract_code
-        }
+        self.extractors = {'pdf_text': self._extract_pdf, 'ocr+caption': self._extract_image, 'transcribe': self._extract_audio, 'transcribe+scenes': self._extract_video, 'office_text': self._extract_document, 'read': self._extract_text, 'read+syntax': self._extract_code}

    def extract(self, file_path: Path, extractor_type: str) -> Dict:
        extractor = self.extractors.get(extractor_type)
        if not extractor:
            return {'error': f'Unknown extractor: {extractor_type}'}
-
        try:
            return extractor(file_path)
        except Exception as e:
@@ -28,11 +20,7 @@ class ContentExtractor:
        try:
            with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                content = f.read(1024 * 1024)
-            return {
-                'text': content,
-                'char_count': len(content),
-                'needs_llm': False
-            }
+            return {'text': content, 'char_count': len(content), 'needs_llm': False}
        except Exception as e:
            return {'error': str(e)}

@@ -51,54 +39,24 @@ class ContentExtractor:
                pdf = PyPDF2.PdfReader(f)
                for page in pdf.pages[:10]:
                    text_parts.append(page.extract_text())
-
            text = '\n'.join(text_parts)
-            return {
-                'text': text,
-                'pages_extracted': len(text_parts),
-                'needs_llm': len(text.strip()) > 100,
-                'type': 'document'
-            }
+            return {'text': text, 'pages_extracted': len(text_parts), 'needs_llm': len(text.strip()) > 100, 'type': 'document'}
        except Exception as e:
            return {'error': str(e), 'needs_ocr': True}

    def _extract_image(self, file_path: Path) -> Dict:
-        return {
-            'type': 'image',
-            'needs_ocr': True,
-            'needs_caption': True,
-            'needs_llm': True,
-            'pipeline': ['ocr', 'caption', 'embedding'],
-            'status': 'pending'
-        }
+        return {'type': 'image', 'needs_ocr': True, 'needs_caption': True, 'needs_llm': True, 'pipeline': ['ocr', 'caption', 'embedding'], 'status': 'pending'}

    def _extract_audio(self, file_path: Path) -> Dict:
-        return {
-            'type': 'audio',
-            'needs_transcription': True,
-            'needs_llm': True,
-            'pipeline': ['transcribe', 'summarize'],
-            'status': 'pending'
-        }
+        return {'type': 'audio', 'needs_transcription': True, 'needs_llm': True, 'pipeline': ['transcribe', 'summarize'], 'status': 'pending'}

    def _extract_video(self, file_path: Path) -> Dict:
-        return {
-            'type': 'video',
-            'needs_transcription': True,
-            'needs_scene_detection': True,
-            'needs_llm': True,
-            'pipeline': ['transcribe', 'scenes', 'summarize'],
-            'status': 'pending'
-        }
+        return {'type': 'video', 'needs_transcription': True, 'needs_scene_detection': True, 'needs_llm': True, 'pipeline': ['transcribe', 'scenes', 'summarize'], 'status': 'pending'}

    def _extract_document(self, file_path: Path) -> Dict:
        try:
            import textract
            text = textract.process(str(file_path)).decode('utf-8')
-            return {
-                'text': text,
-                'type': 'document',
-                'needs_llm': len(text.strip()) > 100
-            }
+            return {'text': text, 'type': 'document', 'needs_llm': len(text.strip()) > 100}
        except:
            return {'error': 'textract failed', 'needs_llm': True}
--- a/app/content/profiler.py
+++ b/app/content/profiler.py
@@ -6,21 +6,10 @@ import json
 from datetime import datetime

 class ContentProfiler:
+
    def __init__(self):
        self.mime_detector = magic.Magic(mime=True)
-
-        self.kind_mapping = {
-            'text': ['text/plain', 'text/html', 'text/css', 'text/javascript', 'text/markdown'],
-            'code': ['application/x-python', 'application/javascript', 'text/x-java', 'text/x-c'],
-            'pdf': ['application/pdf'],
-            'image': ['image/jpeg', 'image/png', 'image/gif', 'image/webp', 'image/svg+xml'],
-            'audio': ['audio/mpeg', 'audio/wav', 'audio/ogg', 'audio/flac'],
-            'video': ['video/mp4', 'video/x-matroska', 'video/avi', 'video/webm'],
-            'archive': ['application/zip', 'application/x-tar', 'application/gzip', 'application/x-7z-compressed'],
-            'document': ['application/msword', 'application/vnd.openxmlformats-officedocument'],
-            'spreadsheet': ['application/vnd.ms-excel', 'text/csv']
-        }
-
+        self.kind_mapping = {'text': ['text/plain', 'text/html', 'text/css', 'text/javascript', 'text/markdown'], 'code': ['application/x-python', 'application/javascript', 'text/x-java', 'text/x-c'], 'pdf': ['application/pdf'], 'image': ['image/jpeg', 'image/png', 'image/gif', 'image/webp', 'image/svg+xml'], 'audio': ['audio/mpeg', 'audio/wav', 'audio/ogg', 'audio/flac'], 'video': ['video/mp4', 'video/x-matroska', 'video/avi', 'video/webm'], 'archive': ['application/zip', 'application/x-tar', 'application/gzip', 'application/x-7z-compressed'], 'document': ['application/msword', 'application/vnd.openxmlformats-officedocument'], 'spreadsheet': ['application/vnd.ms-excel', 'text/csv']}
        self.text_exts = {'.txt', '.md', '.rst', '.log', '.json', '.xml', '.yaml', '.yml', '.toml', '.ini', '.cfg'}
        self.code_exts = {'.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h', '.cs', '.rb', '.php'}
        self.processable_kinds = {'text', 'code', 'pdf', 'image', 'audio', 'video', 'document'}
@@ -30,29 +19,12 @@ class ContentProfiler:
            stat = file_path.stat()
            size = stat.st_size
            mtime = datetime.fromtimestamp(stat.st_mtime)
-
            mime_type = self._detect_mime(file_path)
            kind = self._determine_kind(file_path, mime_type)
-
-            profile = {
-                'path': str(file_path),
-                'size': size,
-                'mtime': mtime.isoformat(),
-                'mime': mime_type,
-                'kind': kind,
-                'processable': kind in self.processable_kinds,
-                'extractor': self._suggest_extractor(kind, mime_type),
-                'hints': self._extract_hints(file_path, kind, mime_type, size)
-            }
-
+            profile = {'path': str(file_path), 'size': size, 'mtime': mtime.isoformat(), 'mime': mime_type, 'kind': kind, 'processable': kind in self.processable_kinds, 'extractor': self._suggest_extractor(kind, mime_type), 'hints': self._extract_hints(file_path, kind, mime_type, size)}
            return profile
-
        except Exception as e:
-            return {
-                'path': str(file_path),
-                'error': str(e),
-                'processable': False
-            }
+            return {'path': str(file_path), 'error': str(e), 'processable': False}

    def _detect_mime(self, file_path: Path) -> str:
        try:
@@ -63,61 +35,42 @@ class ContentProfiler:

    def _determine_kind(self, file_path: Path, mime_type: str) -> str:
        for kind, mimes in self.kind_mapping.items():
-            if any(mime in mime_type for mime in mimes):
+            if any((mime in mime_type for mime in mimes)):
                return kind
-
        suffix = file_path.suffix.lower()
        if suffix in self.text_exts:
            return 'text'
        if suffix in self.code_exts:
            return 'code'
-
        return 'unknown'

    def _suggest_extractor(self, kind: str, mime_type: str) -> Optional[str]:
-        extractors = {
-            'pdf': 'pdf_text',
-            'image': 'ocr+caption',
-            'audio': 'transcribe',
-            'video': 'transcribe+scenes',
-            'document': 'office_text',
-            'text': 'read',
-            'code': 'read+syntax'
-        }
+        extractors = {'pdf': 'pdf_text', 'image': 'ocr+caption', 'audio': 'transcribe', 'video': 'transcribe+scenes', 'document': 'office_text', 'text': 'read', 'code': 'read+syntax'}
        return extractors.get(kind)

    def _extract_hints(self, file_path: Path, kind: str, mime_type: str, size: int) -> Dict:
        hints = {}
-
        if kind == 'text' or kind == 'code':
            hints['language'] = self._guess_language(file_path)
            if size < 1024 * 1024:
                hints['lines'] = self._count_lines(file_path)
-
        if kind == 'pdf':
            hints['page_count'] = self._get_pdf_pages(file_path)
-
        if kind in ['audio', 'video']:
            hints['duration'] = self._get_media_duration(file_path)
-
        if kind == 'image':
            hints['has_exif'] = self._has_exif(file_path)
            hints['dimensions'] = self._get_image_dimensions(file_path)
-
        return hints

    def _guess_language(self, file_path: Path) -> Optional[str]:
-        lang_map = {
-            '.py': 'python', '.js': 'javascript', '.ts': 'typescript',
-            '.java': 'java', '.go': 'go', '.rs': 'rust', '.c': 'c',
-            '.cpp': 'cpp', '.cs': 'csharp', '.rb': 'ruby', '.php': 'php'
-        }
+        lang_map = {'.py': 'python', '.js': 'javascript', '.ts': 'typescript', '.java': 'java', '.go': 'go', '.rs': 'rust', '.c': 'c', '.cpp': 'cpp', '.cs': 'csharp', '.rb': 'ruby', '.php': 'php'}
        return lang_map.get(file_path.suffix.lower())

    def _count_lines(self, file_path: Path) -> Optional[int]:
        try:
            with open(file_path, 'rb') as f:
-                return sum(1 for _ in f)
+                return sum((1 for _ in f))
        except:
            return None

--- a/app/discovery/__init__.py
+++ b/app/discovery/__init__.py
@@ -1,17 +1,5 @@
-"""Discovery package exports"""
 from .scanner import FileScanner, FilteredScanner
 from .system import SystemAPI
 from .engine import DiscoveryEngine
 from ._protocols import FileMeta, MountInfo, DiskInfo, IFileScanner, ISystemAPI
-
-__all__ = [
-    'FileScanner',
-    'FilteredScanner',
-    'SystemAPI',
-    'DiscoveryEngine',
-    'FileMeta',
-    'MountInfo',
-    'DiskInfo',
-    'IFileScanner',
-    'ISystemAPI',
-]
+__all__ = ['FileScanner', 'FilteredScanner', 'SystemAPI', 'DiscoveryEngine', 'FileMeta', 'MountInfo', 'DiskInfo', 'IFileScanner', 'ISystemAPI']
--- a/app/discovery/_protocols.py
+++ b/app/discovery/_protocols.py
@@ -1,54 +1,37 @@
-"""Protocol definitions for the discovery package"""
 from typing import Iterator, Protocol, Any
 from pathlib import Path
 from dataclasses import dataclass

-
@dataclass
 class FileMeta:
-    """Metadata for a discovered file"""
    path: Path
    size: int
    modified_time: float
    created_time: float
-    # Add other metadata fields as needed
-

@dataclass
 class MountInfo:
-    """Information about a mounted filesystem"""
    device: str
    mount_point: str
    fs_type: str
    options: str
-    # Add other mount info fields as needed
-

@dataclass
 class DiskInfo:
-    """Information about a disk/NVMe device"""
    device: str
    model: str
    size: int
    serial: str
-    # Add other disk info fields as needed
-

 class IFileScanner(Protocol):
-    """Protocol for file scanning operations"""

    def scan(self, root: Path) -> Iterator[FileMeta]:
-        """Scan a directory tree and yield file metadata"""
        ...

-
 class ISystemAPI(Protocol):
-    """Protocol for system information queries"""

    def query_mounts(self) -> list[MountInfo]:
-        """Query mounted filesystems"""
        ...

    def query_nvmes(self) -> list[DiskInfo]:
-        """Query NVMe/disk information"""
        ...
--- a/app/discovery/engine.py
+++ b/app/discovery/engine.py
@@ -1,10 +1,8 @@
-"""Discovery engine coordinating scanner and system APIs"""
 from pathlib import Path
 from typing import Optional, Callable
 from datetime import datetime
 import psycopg2
 from psycopg2.extras import execute_batch
-
 from .scanner import FileScanner
 from .system import SystemAPI
 from ._protocols import FileMeta
@@ -12,23 +10,9 @@ from ..shared.models import FileRecord, DiskInfo, ProcessingStats
 from ..shared.config import DatabaseConfig
 from ..shared.logger import ProgressLogger

-
 class DiscoveryEngine:
-    """Discovery engine for scanning and cataloging files"""

-    def __init__(
-        self,
-        db_config: DatabaseConfig,
-        logger: ProgressLogger,
-        batch_size: int = 1000
-    ):
-        """Initialize discovery engine
-
-        Args:
-            db_config: Database configuration
-            logger: Progress logger
-            batch_size: Number of records to batch before database commit
-        """
+    def __init__(self, db_config: DatabaseConfig, logger: ProgressLogger, batch_size: int=1000):
        self.db_config = db_config
        self.logger = logger
        self.batch_size = batch_size
@@ -36,286 +20,114 @@ class DiscoveryEngine:
        self._connection = None

    def _get_connection(self):
-        """Get or create database connection"""
        if self._connection is None or self._connection.closed:
-            self._connection = psycopg2.connect(
-                host=self.db_config.host,
-                port=self.db_config.port,
-                database=self.db_config.database,
-                user=self.db_config.user,
-                password=self.db_config.password
-            )
+            self._connection = psycopg2.connect(host=self.db_config.host, port=self.db_config.port, database=self.db_config.database, user=self.db_config.user, password=self.db_config.password)
        return self._connection

    def _ensure_tables(self):
-        """Ensure database tables exist"""
        conn = self._get_connection()
        cursor = conn.cursor()
-
-        # Create files table
-        cursor.execute("""
-            CREATE TABLE IF NOT EXISTS files (
-                id SERIAL PRIMARY KEY,
-                path TEXT NOT NULL UNIQUE,
-                size BIGINT NOT NULL,
-                modified_time DOUBLE PRECISION NOT NULL,
-                created_time DOUBLE PRECISION NOT NULL,
-                disk_label TEXT NOT NULL,
-                checksum TEXT,
-                status TEXT DEFAULT 'indexed',
-                category TEXT,
-                duplicate_of TEXT,
-                discovered_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
-            )
-        """)
-
-        # Create index on path
-        cursor.execute("""
-            CREATE INDEX IF NOT EXISTS idx_files_path ON files(path)
-                       """)
-
-        # Create index on disk
-        cursor.execute("""
-            CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label)
-                       """)
-
-        # Create index on checksum
-        cursor.execute("""
-            CREATE INDEX IF NOT EXISTS idx_files_checksum ON files(checksum)
-                       """)
-
+        cursor.execute("\n            CREATE TABLE IF NOT EXISTS files (\n                id SERIAL PRIMARY KEY,\n                path TEXT NOT NULL UNIQUE,\n                size BIGINT NOT NULL,\n                modified_time DOUBLE PRECISION NOT NULL,\n                created_time DOUBLE PRECISION NOT NULL,\n                disk_label TEXT NOT NULL,\n                checksum TEXT,\n                status TEXT DEFAULT 'indexed',\n                category TEXT,\n                duplicate_of TEXT,\n                discovered_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,\n                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n            )\n        ")
+        cursor.execute('\n            CREATE INDEX IF NOT EXISTS idx_files_path ON files(path)\n                       ')
+        cursor.execute('\n            CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label)\n                       ')
+        cursor.execute('\n            CREATE INDEX IF NOT EXISTS idx_files_checksum ON files(checksum)\n                       ')
        conn.commit()
        cursor.close()

-    def discover_path(
-        self,
-        root: Path,
-        scanner: Optional[FileScanner] = None,
-        progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
-    ) -> ProcessingStats:
-        """Discover and catalog files in a path
-
-        Args:
-            root: Root path to discover
-            scanner: Optional custom scanner (default: FileScanner())
-            progress_callback: Optional callback for progress updates
-
-        Returns:
-            ProcessingStats with discovery statistics
-        """
-        self.logger.section(f"Discovering: {root}")
-
-        # Ensure tables exist
+    def discover_path(self, root: Path, scanner: Optional[FileScanner]=None, progress_callback: Optional[Callable[[int, int, ProcessingStats], None]]=None) -> ProcessingStats:
+        self.logger.section(f'Discovering: {root}')
        self._ensure_tables()
-
-        # Create scanner if not provided
        if scanner is None:
-            scanner = FileScanner(
-                error_handler=lambda e, p: self.logger.warning(f"Error scanning {p}: {e}")
-            )
-
-        # Get disk info for the root path
+            scanner = FileScanner(error_handler=lambda e, p: self.logger.warning(f'Error scanning {p}: {e}'))
        disk = self.system_api.get_disk_for_path(root)
        if disk is None:
            disk = str(root)
-
-        # Initialize statistics
        stats = ProcessingStats()
        batch = []
-
        conn = self._get_connection()
        cursor = conn.cursor()
-
        try:
-            # Scan files
            for file_meta in scanner.scan(root):
-                # Create file record
-                record = FileRecord(
-                    path=file_meta.path,
-                    size=file_meta.size,
-                    modified_time=file_meta.modified_time,
-                    created_time=file_meta.created_time,
-                    disk_label=disk
-                )
-
+                record = FileRecord(path=file_meta.path, size=file_meta.size, modified_time=file_meta.modified_time, created_time=file_meta.created_time, disk_label=disk)
                batch.append(record)
                stats.files_processed += 1
                stats.bytes_processed += record.size
-
-                # Batch insert
                if len(batch) >= self.batch_size:
                    self._insert_batch(cursor, batch)
                    conn.commit()
                    batch.clear()
-
-                    # Progress callback
                    if progress_callback:
                        progress_callback(stats.files_processed, 0, stats)
-
-                    # Log progress
                    if stats.files_processed % (self.batch_size * 10) == 0:
-                        self.logger.progress(
-                            stats.files_processed,
-                            stats.files_processed,  # We don't know total
-                            prefix="Files discovered",
-                            bytes_processed=stats.bytes_processed,
-                            elapsed_seconds=stats.elapsed_seconds
-                        )
-
-            # Insert remaining batch
+                        self.logger.progress(stats.files_processed, stats.files_processed, prefix='Files discovered', bytes_processed=stats.bytes_processed, elapsed_seconds=stats.elapsed_seconds)
            if batch:
                self._insert_batch(cursor, batch)
                conn.commit()
-
            stats.files_succeeded = stats.files_processed
-
        except Exception as e:
            conn.rollback()
-            self.logger.error(f"Discovery failed: {e}")
+            self.logger.error(f'Discovery failed: {e}')
            raise
-
        finally:
            cursor.close()
-
-        self.logger.info(
-            f"Discovery complete: {stats.files_processed} files, "
-            f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s"
-        )
-
+        self.logger.info(f'Discovery complete: {stats.files_processed} files, {stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s')
        return stats

    def _insert_batch(self, cursor, batch: list[FileRecord]):
-        """Insert batch of file records
-
-        Args:
-            cursor: Database cursor
-            batch: List of FileRecord objects
-        """
-        query = """
-            INSERT INTO files (path, size, modified_time, created_time, disk_label, checksum, status, category, duplicate_of)
-            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
-            ON CONFLICT (path) DO UPDATE SET
-                size = EXCLUDED.size,
-                modified_time = EXCLUDED.modified_time,
-                updated_at = CURRENT_TIMESTAMP
-        """
-
-        data = [
-            (
-                str(record.path),
-                record.size,
-                record.modified_time,
-                record.created_time,
-                record.disk_label,
-                record.checksum,
-                record.status,
-                record.category,
-                record.duplicate_of
-            )
-            for record in batch
-        ]
-
+        query = '\n            INSERT INTO files (path, size, modified_time, created_time, disk_label, checksum, status, category, duplicate_of)\n            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)\n            ON CONFLICT (path) DO UPDATE SET\n                size = EXCLUDED.size,\n                modified_time = EXCLUDED.modified_time,\n                updated_at = CURRENT_TIMESTAMP\n        '
+        data = [(str(record.path), record.size, record.modified_time, record.created_time, record.disk_label, record.checksum, record.status, record.category, record.duplicate_of) for record in batch]
        execute_batch(cursor, query, data, page_size=self.batch_size)

    def get_disk_info(self) -> list[DiskInfo]:
-        """Get information about all disks
-
-        Returns:
-            List of DiskInfo objects
-        """
-        self.logger.subsection("Querying disk information")
-
+        self.logger.subsection('Querying disk information')
        disks = []
        for disk_info in self.system_api.query_nvmes():
-            # Get mount point if available
            mount_point = None
-            fs_type = "unknown"
-
+            fs_type = 'unknown'
            for mount in self.system_api.query_mounts():
                if mount.device == disk_info.device:
                    mount_point = Path(mount.mount_point)
                    fs_type = mount.fs_type
                    break
-
            if mount_point:
                total, used, free = self.system_api.get_disk_usage(mount_point)
            else:
                total = disk_info.size
                used = 0
                free = disk_info.size
-
-            disk = DiskInfo(
-                name=disk_info.device,
-                device=disk_info.device,
-                mount_point=mount_point or Path("/"),
-                total_size=total,
-                used_size=used,
-                free_size=free,
-                fs_type=fs_type
-            )
+            disk = DiskInfo(name=disk_info.device, device=disk_info.device, mount_point=mount_point or Path('/'), total_size=total, used_size=used, free_size=free, fs_type=fs_type)
            disks.append(disk)
-
-            self.logger.info(
-                f"  {disk.name}: {disk.usage_percent:.1f}% used "
-                f"({disk.used_size:,} / {disk.total_size:,} bytes)"
-            )
-
+            self.logger.info(f'  {disk.name}: {disk.usage_percent:.1f}% used ({disk.used_size:,} / {disk.total_size:,} bytes)')
        return disks

-    def get_file_count(self, disk: Optional[str] = None) -> int:
-        """Get count of discovered files
-
-        Args:
-            disk: Optional disk filter
-
-        Returns:
-            Count of files
-        """
+    def get_file_count(self, disk: Optional[str]=None) -> int:
        conn = self._get_connection()
        cursor = conn.cursor()
-
        if disk:
-            cursor.execute("SELECT COUNT(*) FROM files WHERE disk_label = %s", (disk,))
+            cursor.execute('SELECT COUNT(*) FROM files WHERE disk_label = %s', (disk,))
        else:
-            cursor.execute("SELECT COUNT(*) FROM files")
-
+            cursor.execute('SELECT COUNT(*) FROM files')
        count = cursor.fetchone()[0]
        cursor.close()
-
        return count

-    def get_total_size(self, disk: Optional[str] = None) -> int:
-        """Get total size of discovered files
-
-        Args:
-            disk: Optional disk filter
-
-        Returns:
-            Total size in bytes
-        """
+    def get_total_size(self, disk: Optional[str]=None) -> int:
        conn = self._get_connection()
        cursor = conn.cursor()
-
        if disk:
-            cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files WHERE disk_label = %s", (disk,))
+            cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files WHERE disk_label = %s', (disk,))
        else:
-            cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files")
-
+            cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files')
        total = cursor.fetchone()[0]
        cursor.close()
-
        return total

    def close(self):
-        """Close database connection"""
-        if self._connection and not self._connection.closed:
+        if self._connection and (not self._connection.closed):
            self._connection.close()

    def __enter__(self):
-        """Context manager entry"""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
-        """Context manager exit"""
        self.close()
--- a/app/discovery/scanner.py
+++ b/app/discovery/scanner.py
@@ -1,28 +1,12 @@
-"""File system scanner implementing IFileScanner protocol"""
 import os
 from pathlib import Path
 from typing import Iterator, Optional, Callable
 from datetime import datetime
-
 from ._protocols import FileMeta

-
 class FileScanner:
-    """File system scanner with filtering and error handling"""

-    def __init__(
-        self,
-        follow_symlinks: bool = False,
-        skip_hidden: bool = True,
-        error_handler: Optional[Callable[[Exception, Path], None]] = None
-    ):
-        """Initialize file scanner
-
-        Args:
-            follow_symlinks: Whether to follow symbolic links
-            skip_hidden: Whether to skip hidden files/directories
-            error_handler: Optional callback for handling errors during scan
-        """
+    def __init__(self, follow_symlinks: bool=False, skip_hidden: bool=True, error_handler: Optional[Callable[[Exception, Path], None]]=None):
        self.follow_symlinks = follow_symlinks
        self.skip_hidden = skip_hidden
        self.error_handler = error_handler
@@ -31,24 +15,14 @@ class FileScanner:
        self._errors = 0

    def scan(self, root: Path) -> Iterator[FileMeta]:
-        """Scan a directory tree and yield file metadata
-
-        Args:
-            root: Root directory to scan
-
-        Yields:
-            FileMeta objects for each discovered file
-        """
        if not root.exists():
-            error = FileNotFoundError(f"Path does not exist: {root}")
+            error = FileNotFoundError(f'Path does not exist: {root}')
            if self.error_handler:
                self.error_handler(error, root)
            else:
                raise error
            return
-
        if not root.is_dir():
-            # If root is a file, just return its metadata
            try:
                yield self._get_file_meta(root)
            except Exception as e:
@@ -58,115 +32,59 @@ class FileScanner:
                else:
                    raise
            return
-
-        # Walk directory tree
        for dirpath, dirnames, filenames in os.walk(root, followlinks=self.follow_symlinks):
            current_dir = Path(dirpath)
-
-            # Filter directories if needed
            if self.skip_hidden:
                dirnames[:] = [d for d in dirnames if not d.startswith('.')]
-
-            # Process files
            for filename in filenames:
                if self.skip_hidden and filename.startswith('.'):
                    continue
-
                file_path = current_dir / filename
-
                try:
-                    # Skip broken symlinks
-                    if file_path.is_symlink() and not file_path.exists():
+                    if file_path.is_symlink() and (not file_path.exists()):
                        continue
-
                    meta = self._get_file_meta(file_path)
                    self._files_scanned += 1
                    self._bytes_scanned += meta.size
-
                    yield meta
-
                except PermissionError as e:
                    self._errors += 1
                    if self.error_handler:
                        self.error_handler(e, file_path)
-                    # Continue scanning
                    continue
-
                except Exception as e:
                    self._errors += 1
                    if self.error_handler:
                        self.error_handler(e, file_path)
-                    # Continue scanning
                    continue

    def _get_file_meta(self, path: Path) -> FileMeta:
-        """Get file metadata
-
-        Args:
-            path: Path to file
-
-        Returns:
-            FileMeta object with file metadata
-
-        Raises:
-            OSError: If file cannot be accessed
-        """
        stat = path.stat()
-
-        # Get creation time (platform dependent)
        created_time = stat.st_ctime
        if hasattr(stat, 'st_birthtime'):
            created_time = stat.st_birthtime
-
-        return FileMeta(
-            path=path,
-            size=stat.st_size,
-            modified_time=stat.st_mtime,
-            created_time=created_time
-        )
+        return FileMeta(path=path, size=stat.st_size, modified_time=stat.st_mtime, created_time=created_time)

    @property
    def files_scanned(self) -> int:
-        """Get count of files scanned"""
        return self._files_scanned

    @property
    def bytes_scanned(self) -> int:
-        """Get total bytes scanned"""
        return self._bytes_scanned

    @property
    def errors(self) -> int:
-        """Get count of errors encountered"""
        return self._errors

    def reset_stats(self) -> None:
-        """Reset scanning statistics"""
        self._files_scanned = 0
        self._bytes_scanned = 0
        self._errors = 0

-
 class FilteredScanner(FileScanner):
-    """Scanner with additional filtering capabilities"""

-    def __init__(
-        self,
-        min_size: Optional[int] = None,
-        max_size: Optional[int] = None,
-        extensions: Optional[list[str]] = None,
-        exclude_patterns: Optional[list[str]] = None,
-        **kwargs
-    ):
-        """Initialize filtered scanner
-
-        Args:
-            min_size: Minimum file size in bytes
-            max_size: Maximum file size in bytes
-            extensions: List of file extensions to include (e.g., ['.txt', '.py'])
-            exclude_patterns: List of path patterns to exclude
-            **kwargs: Additional arguments passed to FileScanner
-        """
+    def __init__(self, min_size: Optional[int]=None, max_size: Optional[int]=None, extensions: Optional[list[str]]=None, exclude_patterns: Optional[list[str]]=None, **kwargs):
        super().__init__(**kwargs)
        self.min_size = min_size
        self.max_size = max_size
@@ -174,41 +92,19 @@ class FilteredScanner(FileScanner):
        self.exclude_patterns = exclude_patterns or []

    def scan(self, root: Path) -> Iterator[FileMeta]:
-        """Scan with additional filtering
-
-        Args:
-            root: Root directory to scan
-
-        Yields:
-            FileMeta objects for files matching filter criteria
-        """
        for meta in super().scan(root):
-            # Size filtering
            if self.min_size is not None and meta.size < self.min_size:
                continue
            if self.max_size is not None and meta.size > self.max_size:
                continue
-
-            # Extension filtering
            if self.extensions is not None:
                if meta.path.suffix.lower() not in self.extensions:
                    continue
-
-            # Exclude pattern filtering
            if self._should_exclude(meta.path):
                continue
-
            yield meta

    def _should_exclude(self, path: Path) -> bool:
-        """Check if path matches any exclude pattern
-
-        Args:
-            path: Path to check
-
-        Returns:
-            True if path should be excluded
-        """
        path_str = str(path)
        for pattern in self.exclude_patterns:
            if pattern in path_str:
--- a/app/discovery/system.py
+++ b/app/discovery/system.py
@@ -1,167 +1,80 @@
-"""System API for querying mounts and disks"""
 import os
 import subprocess
 from pathlib import Path
 from typing import Optional
 import psutil
-
 from ._protocols import MountInfo, DiskInfo

-
 class SystemAPI:
-    """System information API for querying mounts and disks"""

    def query_mounts(self) -> list[MountInfo]:
-        """Query mounted filesystems
-
-        Returns:
-            List of MountInfo objects for all mounted filesystems
-        """
        mounts = []
-
        for partition in psutil.disk_partitions(all=False):
-            mount_info = MountInfo(
-                device=partition.device,
-                mount_point=partition.mountpoint,
-                fs_type=partition.fstype,
-                options=partition.opts
-            )
+            mount_info = MountInfo(device=partition.device, mount_point=partition.mountpoint, fs_type=partition.fstype, options=partition.opts)
            mounts.append(mount_info)
-
        return mounts

    def query_nvmes(self) -> list[DiskInfo]:
-        """Query NVMe/disk information
-
-        Returns:
-            List of DiskInfo objects for all disks
-        """
        disks = []
-
-        # Try to get disk information using lsblk
        try:
-            result = subprocess.run(
-                ['lsblk', '-ndo', 'NAME,MODEL,SIZE,SERIAL', '-b'],
-                capture_output=True,
-                text=True,
-                check=False
-            )
-
+            result = subprocess.run(['lsblk', '-ndo', 'NAME,MODEL,SIZE,SERIAL', '-b'], capture_output=True, text=True, check=False)
            if result.returncode == 0:
                for line in result.stdout.strip().split('\n'):
                    if not line.strip():
                        continue
-
                    parts = line.split(maxsplit=3)
                    if len(parts) >= 3:
-                        device = f"/dev/{parts[0]}"
-                        model = parts[1] if len(parts) > 1 else "Unknown"
-                        size_str = parts[2] if len(parts) > 2 else "0"
-                        serial = parts[3] if len(parts) > 3 else "Unknown"
-
+                        device = f'/dev/{parts[0]}'
+                        model = parts[1] if len(parts) > 1 else 'Unknown'
+                        size_str = parts[2] if len(parts) > 2 else '0'
+                        serial = parts[3] if len(parts) > 3 else 'Unknown'
                        try:
                            size = int(size_str)
                        except ValueError:
                            size = 0
-
-                        disk_info = DiskInfo(
-                            device=device,
-                            model=model,
-                            size=size,
-                            serial=serial
-                        )
+                        disk_info = DiskInfo(device=device, model=model, size=size, serial=serial)
                        disks.append(disk_info)
-
        except FileNotFoundError:
-            # lsblk not available, fall back to basic info
            pass
-
-        # If lsblk failed or unavailable, try alternative method
        if not disks:
            disks = self._query_disks_fallback()
-
        return disks

    def _query_disks_fallback(self) -> list[DiskInfo]:
-        """Fallback method for querying disk information
-
-        Returns:
-            List of DiskInfo objects using psutil
-        """
        disks = []
        seen_devices = set()
-
        for partition in psutil.disk_partitions(all=True):
            device = partition.device
-
-            # Skip non-disk devices
            if not device.startswith('/dev/'):
                continue
-
-            # Get base device (e.g., /dev/sda from /dev/sda1)
            base_device = self._get_base_device(device)
-
            if base_device in seen_devices:
                continue
-
            seen_devices.add(base_device)
-
            try:
                usage = psutil.disk_usage(partition.mountpoint)
                size = usage.total
            except (PermissionError, OSError):
                size = 0
-
-            disk_info = DiskInfo(
-                device=base_device,
-                model="Unknown",
-                size=size,
-                serial="Unknown"
-            )
+            disk_info = DiskInfo(device=base_device, model='Unknown', size=size, serial='Unknown')
            disks.append(disk_info)
-
        return disks

    def _get_base_device(self, device: str) -> str:
-        """Extract base device name from partition device
-
-        Args:
-            device: Device path (e.g., /dev/sda1, /dev/nvme0n1p1)
-
-        Returns:
-            Base device path (e.g., /dev/sda, /dev/nvme0n1)
-        """
-        # Handle NVMe devices
        if 'nvme' in device:
-            # /dev/nvme0n1p1 -> /dev/nvme0n1
            if 'p' in device:
                return device.rsplit('p', 1)[0]
            return device
-
-        # Handle standard devices (sda, sdb, etc.)
-        # /dev/sda1 -> /dev/sda
        import re
-        match = re.match(r'(/dev/[a-z]+)', device)
+        match = re.match('(/dev/[a-z]+)', device)
        if match:
            return match.group(1)
-
        return device

    def get_disk_for_path(self, path: Path) -> Optional[str]:
-        """Get the disk/mount point for a given path
-
-        Args:
-            path: Path to check
-
-        Returns:
-            Mount point device or None if not found
-        """
        path = path.resolve()
-
-        # Find the mount point that contains this path
        best_match = None
        best_match_len = 0
-
        for partition in psutil.disk_partitions():
            mount_point = Path(partition.mountpoint)
            try:
@@ -172,39 +85,19 @@ class SystemAPI:
                        best_match_len = mount_len
            except (ValueError, OSError):
                continue
-
        return best_match

    def get_disk_usage(self, path: Path) -> tuple[int, int, int]:
-        """Get disk usage for a path
-
-        Args:
-            path: Path to check
-
-        Returns:
-            Tuple of (total, used, free) in bytes
-        """
        try:
            usage = psutil.disk_usage(str(path))
-            return usage.total, usage.used, usage.free
+            return (usage.total, usage.used, usage.free)
        except (PermissionError, OSError):
-            return 0, 0, 0
+            return (0, 0, 0)

    def get_mount_point(self, path: Path) -> Optional[Path]:
-        """Get the mount point for a given path
-
-        Args:
-            path: Path to check
-
-        Returns:
-            Mount point path or None if not found
-        """
        path = path.resolve()
-
-        # Find the mount point that contains this path
        best_match = None
        best_match_len = 0
-
        for partition in psutil.disk_partitions():
            mount_point = Path(partition.mountpoint)
            try:
@@ -215,19 +108,9 @@ class SystemAPI:
                        best_match_len = mount_len
            except (ValueError, OSError):
                continue
-
        return best_match

    def is_same_filesystem(self, path1: Path, path2: Path) -> bool:
-        """Check if two paths are on the same filesystem
-
-        Args:
-            path1: First path
-            path2: Second path
-
-        Returns:
-            True if paths are on the same filesystem
-        """
        try:
            stat1 = path1.stat()
            stat2 = path2.stat()
--- a/app/main.py
+++ b/app/main.py