remove_doc

mike
2025-12-13 04:23:04 +01:00
parent 75034d5e51
commit 9759001f4c
9 changed files with 741 additions and 1641 deletions

app/content/__init__.py (new file, 3 additions)

@@ -0,0 +1,3 @@
+from .profiler import ContentProfiler
+from .extractors import ContentExtractor
+__all__ = ['ContentProfiler', 'ContentExtractor']

app/content/extractors.py

@@ -3,22 +3,14 @@ from typing import Dict, Optional
 import json
 class ContentExtractor:
     def __init__(self):
-        self.extractors = {
-            'pdf_text': self._extract_pdf,
-            'ocr+caption': self._extract_image,
-            'transcribe': self._extract_audio,
-            'transcribe+scenes': self._extract_video,
-            'office_text': self._extract_document,
-            'read': self._extract_text,
-            'read+syntax': self._extract_code
-        }
+        self.extractors = {'pdf_text': self._extract_pdf, 'ocr+caption': self._extract_image, 'transcribe': self._extract_audio, 'transcribe+scenes': self._extract_video, 'office_text': self._extract_document, 'read': self._extract_text, 'read+syntax': self._extract_code}
     def extract(self, file_path: Path, extractor_type: str) -> Dict:
         extractor = self.extractors.get(extractor_type)
         if not extractor:
             return {'error': f'Unknown extractor: {extractor_type}'}
         try:
             return extractor(file_path)
         except Exception as e:
@@ -28,11 +20,7 @@ class ContentExtractor:
         try:
             with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                 content = f.read(1024 * 1024)
-            return {
-                'text': content,
-                'char_count': len(content),
-                'needs_llm': False
-            }
+            return {'text': content, 'char_count': len(content), 'needs_llm': False}
         except Exception as e:
             return {'error': str(e)}
@@ -51,54 +39,24 @@ class ContentExtractor:
                 pdf = PyPDF2.PdfReader(f)
                 for page in pdf.pages[:10]:
                     text_parts.append(page.extract_text())
            text = '\n'.join(text_parts)
-            return {
-                'text': text,
-                'pages_extracted': len(text_parts),
-                'needs_llm': len(text.strip()) > 100,
-                'type': 'document'
-            }
+            return {'text': text, 'pages_extracted': len(text_parts), 'needs_llm': len(text.strip()) > 100, 'type': 'document'}
         except Exception as e:
             return {'error': str(e), 'needs_ocr': True}
     def _extract_image(self, file_path: Path) -> Dict:
-        return {
-            'type': 'image',
-            'needs_ocr': True,
-            'needs_caption': True,
-            'needs_llm': True,
-            'pipeline': ['ocr', 'caption', 'embedding'],
-            'status': 'pending'
-        }
+        return {'type': 'image', 'needs_ocr': True, 'needs_caption': True, 'needs_llm': True, 'pipeline': ['ocr', 'caption', 'embedding'], 'status': 'pending'}
     def _extract_audio(self, file_path: Path) -> Dict:
-        return {
-            'type': 'audio',
-            'needs_transcription': True,
-            'needs_llm': True,
-            'pipeline': ['transcribe', 'summarize'],
-            'status': 'pending'
-        }
+        return {'type': 'audio', 'needs_transcription': True, 'needs_llm': True, 'pipeline': ['transcribe', 'summarize'], 'status': 'pending'}
     def _extract_video(self, file_path: Path) -> Dict:
-        return {
-            'type': 'video',
-            'needs_transcription': True,
-            'needs_scene_detection': True,
-            'needs_llm': True,
-            'pipeline': ['transcribe', 'scenes', 'summarize'],
-            'status': 'pending'
-        }
+        return {'type': 'video', 'needs_transcription': True, 'needs_scene_detection': True, 'needs_llm': True, 'pipeline': ['transcribe', 'scenes', 'summarize'], 'status': 'pending'}
     def _extract_document(self, file_path: Path) -> Dict:
         try:
             import textract
             text = textract.process(str(file_path)).decode('utf-8')
-            return {
-                'text': text,
-                'type': 'document',
-                'needs_llm': len(text.strip()) > 100
-            }
+            return {'text': text, 'type': 'document', 'needs_llm': len(text.strip()) > 100}
         except:
             return {'error': 'textract failed', 'needs_llm': True}

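For review context, ContentExtractor is table-driven: extract() looks the key up in self.extractors and returns an error dict rather than raising on an unknown key. A minimal usage sketch, assuming a hypothetical file name; the import path follows the new app/content/__init__.py:

    from pathlib import Path
    from app.content import ContentExtractor

    extractor = ContentExtractor()

    # 'read' dispatches to _extract_text, which reads at most 1 MiB of UTF-8 text.
    result = extractor.extract(Path('notes.txt'), 'read')

    if 'error' in result:
        print(f"extraction failed: {result['error']}")
    else:
        print(f"{result['char_count']} chars, needs_llm={result['needs_llm']}")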
app/content/profiler.py

@@ -6,21 +6,10 @@ import json
 from datetime import datetime
 class ContentProfiler:
     def __init__(self):
         self.mime_detector = magic.Magic(mime=True)
-        self.kind_mapping = {
-            'text': ['text/plain', 'text/html', 'text/css', 'text/javascript', 'text/markdown'],
-            'code': ['application/x-python', 'application/javascript', 'text/x-java', 'text/x-c'],
-            'pdf': ['application/pdf'],
-            'image': ['image/jpeg', 'image/png', 'image/gif', 'image/webp', 'image/svg+xml'],
-            'audio': ['audio/mpeg', 'audio/wav', 'audio/ogg', 'audio/flac'],
-            'video': ['video/mp4', 'video/x-matroska', 'video/avi', 'video/webm'],
-            'archive': ['application/zip', 'application/x-tar', 'application/gzip', 'application/x-7z-compressed'],
-            'document': ['application/msword', 'application/vnd.openxmlformats-officedocument'],
-            'spreadsheet': ['application/vnd.ms-excel', 'text/csv']
-        }
+        self.kind_mapping = {'text': ['text/plain', 'text/html', 'text/css', 'text/javascript', 'text/markdown'], 'code': ['application/x-python', 'application/javascript', 'text/x-java', 'text/x-c'], 'pdf': ['application/pdf'], 'image': ['image/jpeg', 'image/png', 'image/gif', 'image/webp', 'image/svg+xml'], 'audio': ['audio/mpeg', 'audio/wav', 'audio/ogg', 'audio/flac'], 'video': ['video/mp4', 'video/x-matroska', 'video/avi', 'video/webm'], 'archive': ['application/zip', 'application/x-tar', 'application/gzip', 'application/x-7z-compressed'], 'document': ['application/msword', 'application/vnd.openxmlformats-officedocument'], 'spreadsheet': ['application/vnd.ms-excel', 'text/csv']}
         self.text_exts = {'.txt', '.md', '.rst', '.log', '.json', '.xml', '.yaml', '.yml', '.toml', '.ini', '.cfg'}
         self.code_exts = {'.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h', '.cs', '.rb', '.php'}
         self.processable_kinds = {'text', 'code', 'pdf', 'image', 'audio', 'video', 'document'}
@@ -30,29 +19,12 @@ class ContentProfiler:
            stat = file_path.stat()
            size = stat.st_size
            mtime = datetime.fromtimestamp(stat.st_mtime)
            mime_type = self._detect_mime(file_path)
            kind = self._determine_kind(file_path, mime_type)
-            profile = {
-                'path': str(file_path),
-                'size': size,
-                'mtime': mtime.isoformat(),
-                'mime': mime_type,
-                'kind': kind,
-                'processable': kind in self.processable_kinds,
-                'extractor': self._suggest_extractor(kind, mime_type),
-                'hints': self._extract_hints(file_path, kind, mime_type, size)
-            }
+            profile = {'path': str(file_path), 'size': size, 'mtime': mtime.isoformat(), 'mime': mime_type, 'kind': kind, 'processable': kind in self.processable_kinds, 'extractor': self._suggest_extractor(kind, mime_type), 'hints': self._extract_hints(file_path, kind, mime_type, size)}
            return profile
        except Exception as e:
-            return {
-                'path': str(file_path),
-                'error': str(e),
-                'processable': False
-            }
+            return {'path': str(file_path), 'error': str(e), 'processable': False}
     def _detect_mime(self, file_path: Path) -> str:
         try:
@@ -63,61 +35,42 @@ class ContentProfiler:
     def _determine_kind(self, file_path: Path, mime_type: str) -> str:
         for kind, mimes in self.kind_mapping.items():
-            if any(mime in mime_type for mime in mimes):
+            if any((mime in mime_type for mime in mimes)):
                 return kind
         suffix = file_path.suffix.lower()
         if suffix in self.text_exts:
             return 'text'
         if suffix in self.code_exts:
             return 'code'
         return 'unknown'
     def _suggest_extractor(self, kind: str, mime_type: str) -> Optional[str]:
-        extractors = {
-            'pdf': 'pdf_text',
-            'image': 'ocr+caption',
-            'audio': 'transcribe',
-            'video': 'transcribe+scenes',
-            'document': 'office_text',
-            'text': 'read',
-            'code': 'read+syntax'
-        }
+        extractors = {'pdf': 'pdf_text', 'image': 'ocr+caption', 'audio': 'transcribe', 'video': 'transcribe+scenes', 'document': 'office_text', 'text': 'read', 'code': 'read+syntax'}
         return extractors.get(kind)
     def _extract_hints(self, file_path: Path, kind: str, mime_type: str, size: int) -> Dict:
         hints = {}
         if kind == 'text' or kind == 'code':
             hints['language'] = self._guess_language(file_path)
             if size < 1024 * 1024:
                 hints['lines'] = self._count_lines(file_path)
         if kind == 'pdf':
             hints['page_count'] = self._get_pdf_pages(file_path)
         if kind in ['audio', 'video']:
             hints['duration'] = self._get_media_duration(file_path)
         if kind == 'image':
             hints['has_exif'] = self._has_exif(file_path)
             hints['dimensions'] = self._get_image_dimensions(file_path)
         return hints
     def _guess_language(self, file_path: Path) -> Optional[str]:
-        lang_map = {
-            '.py': 'python', '.js': 'javascript', '.ts': 'typescript',
-            '.java': 'java', '.go': 'go', '.rs': 'rust', '.c': 'c',
-            '.cpp': 'cpp', '.cs': 'csharp', '.rb': 'ruby', '.php': 'php'
-        }
+        lang_map = {'.py': 'python', '.js': 'javascript', '.ts': 'typescript', '.java': 'java', '.go': 'go', '.rs': 'rust', '.c': 'c', '.cpp': 'cpp', '.cs': 'csharp', '.rb': 'ruby', '.php': 'php'}
         return lang_map.get(file_path.suffix.lower())
     def _count_lines(self, file_path: Path) -> Optional[int]:
         try:
             with open(file_path, 'rb') as f:
-                return sum(1 for _ in f)
+                return sum((1 for _ in f))
         except:
             return None

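ContentProfiler chains MIME detection, kind classification, and extractor suggestion, so its output feeds ContentExtractor directly. A sketch of that handoff; the entry-point name profile() and the sample file are assumptions, since the hunk headers above do not show the method signature:

    from pathlib import Path
    from app.content import ContentProfiler, ContentExtractor

    profiler = ContentProfiler()
    extractor = ContentExtractor()

    # profile() is the assumed entry point; on failure it returns a dict
    # with 'error' and 'processable': False instead of raising.
    profile = profiler.profile(Path('report.pdf'))

    if profile.get('processable') and profile.get('extractor'):
        # e.g. kind 'pdf' maps to extractor key 'pdf_text'
        content = extractor.extract(Path(profile['path']), profile['extractor'])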
app/discovery/__init__.py

@@ -1,17 +1,5 @@
-"""Discovery package exports"""
 from .scanner import FileScanner, FilteredScanner
 from .system import SystemAPI
 from .engine import DiscoveryEngine
 from ._protocols import FileMeta, MountInfo, DiskInfo, IFileScanner, ISystemAPI
-__all__ = [
-    'FileScanner',
-    'FilteredScanner',
-    'SystemAPI',
-    'DiscoveryEngine',
-    'FileMeta',
-    'MountInfo',
-    'DiskInfo',
-    'IFileScanner',
-    'ISystemAPI',
-]
+__all__ = ['FileScanner', 'FilteredScanner', 'SystemAPI', 'DiscoveryEngine', 'FileMeta', 'MountInfo', 'DiskInfo', 'IFileScanner', 'ISystemAPI']

app/discovery/_protocols.py

@@ -1,54 +1,37 @@
-"""Protocol definitions for the discovery package"""
 from typing import Iterator, Protocol, Any
 from pathlib import Path
 from dataclasses import dataclass
 @dataclass
 class FileMeta:
-    """Metadata for a discovered file"""
     path: Path
     size: int
     modified_time: float
     created_time: float
-    # Add other metadata fields as needed
 @dataclass
 class MountInfo:
-    """Information about a mounted filesystem"""
     device: str
     mount_point: str
     fs_type: str
     options: str
-    # Add other mount info fields as needed
 @dataclass
 class DiskInfo:
-    """Information about a disk/NVMe device"""
     device: str
     model: str
     size: int
     serial: str
-    # Add other disk info fields as needed
 class IFileScanner(Protocol):
-    """Protocol for file scanning operations"""
     def scan(self, root: Path) -> Iterator[FileMeta]:
-        """Scan a directory tree and yield file metadata"""
         ...
 class ISystemAPI(Protocol):
-    """Protocol for system information queries"""
     def query_mounts(self) -> list[MountInfo]:
-        """Query mounted filesystems"""
         ...
     def query_nvmes(self) -> list[DiskInfo]:
-        """Query NVMe/disk information"""
         ...

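Since IFileScanner and ISystemAPI are typing.Protocol classes, implementations conform structurally and never need to inherit from them. A sketch with a hypothetical in-memory test double:

    from pathlib import Path
    from typing import Iterator
    from app.discovery import FileMeta, IFileScanner

    class InMemoryScanner:
        # Hypothetical test double; it satisfies IFileScanner without inheriting it.
        def __init__(self, entries: list[FileMeta]):
            self.entries = entries

        def scan(self, root: Path) -> Iterator[FileMeta]:
            yield from self.entries

    # Structural typing: any object with a matching scan() type-checks here.
    scanner: IFileScanner = InMemoryScanner([FileMeta(Path('/tmp/a.txt'), 42, 0.0, 0.0)])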
app/discovery/engine.py

@@ -1,10 +1,8 @@
-"""Discovery engine coordinating scanner and system APIs"""
 from pathlib import Path
 from typing import Optional, Callable
 from datetime import datetime
 import psycopg2
 from psycopg2.extras import execute_batch
 from .scanner import FileScanner
 from .system import SystemAPI
 from ._protocols import FileMeta
@@ -12,23 +10,9 @@ from ..shared.models import FileRecord, DiskInfo, ProcessingStats
 from ..shared.config import DatabaseConfig
 from ..shared.logger import ProgressLogger
 class DiscoveryEngine:
-    """Discovery engine for scanning and cataloging files"""
-    def __init__(
-        self,
-        db_config: DatabaseConfig,
-        logger: ProgressLogger,
-        batch_size: int = 1000
-    ):
-        """Initialize discovery engine
-        Args:
-            db_config: Database configuration
-            logger: Progress logger
-            batch_size: Number of records to batch before database commit
-        """
+    def __init__(self, db_config: DatabaseConfig, logger: ProgressLogger, batch_size: int=1000):
         self.db_config = db_config
         self.logger = logger
         self.batch_size = batch_size
@@ -36,286 +20,114 @@ class DiscoveryEngine:
         self._connection = None
     def _get_connection(self):
-        """Get or create database connection"""
         if self._connection is None or self._connection.closed:
-            self._connection = psycopg2.connect(
-                host=self.db_config.host,
-                port=self.db_config.port,
-                database=self.db_config.database,
-                user=self.db_config.user,
-                password=self.db_config.password
-            )
+            self._connection = psycopg2.connect(host=self.db_config.host, port=self.db_config.port, database=self.db_config.database, user=self.db_config.user, password=self.db_config.password)
         return self._connection
     def _ensure_tables(self):
-        """Ensure database tables exist"""
         conn = self._get_connection()
         cursor = conn.cursor()
-        # Create files table
-        cursor.execute("""
-            CREATE TABLE IF NOT EXISTS files (
-                id SERIAL PRIMARY KEY,
-                path TEXT NOT NULL UNIQUE,
-                size BIGINT NOT NULL,
-                modified_time DOUBLE PRECISION NOT NULL,
-                created_time DOUBLE PRECISION NOT NULL,
-                disk_label TEXT NOT NULL,
-                checksum TEXT,
-                status TEXT DEFAULT 'indexed',
-                category TEXT,
-                duplicate_of TEXT,
-                discovered_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
-            )
-        """)
-        # Create index on path
-        cursor.execute("""
-            CREATE INDEX IF NOT EXISTS idx_files_path ON files(path)
-        """)
-        # Create index on disk
-        cursor.execute("""
-            CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label)
-        """)
-        # Create index on checksum
-        cursor.execute("""
-            CREATE INDEX IF NOT EXISTS idx_files_checksum ON files(checksum)
-        """)
+        cursor.execute("\n            CREATE TABLE IF NOT EXISTS files (\n                id SERIAL PRIMARY KEY,\n                path TEXT NOT NULL UNIQUE,\n                size BIGINT NOT NULL,\n                modified_time DOUBLE PRECISION NOT NULL,\n                created_time DOUBLE PRECISION NOT NULL,\n                disk_label TEXT NOT NULL,\n                checksum TEXT,\n                status TEXT DEFAULT 'indexed',\n                category TEXT,\n                duplicate_of TEXT,\n                discovered_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,\n                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n            )\n        ")
+        cursor.execute('\n            CREATE INDEX IF NOT EXISTS idx_files_path ON files(path)\n        ')
+        cursor.execute('\n            CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label)\n        ')
+        cursor.execute('\n            CREATE INDEX IF NOT EXISTS idx_files_checksum ON files(checksum)\n        ')
         conn.commit()
         cursor.close()
-    def discover_path(
-        self,
-        root: Path,
-        scanner: Optional[FileScanner] = None,
-        progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
-    ) -> ProcessingStats:
-        """Discover and catalog files in a path
-        Args:
-            root: Root path to discover
-            scanner: Optional custom scanner (default: FileScanner())
-            progress_callback: Optional callback for progress updates
-        Returns:
-            ProcessingStats with discovery statistics
-        """
-        self.logger.section(f"Discovering: {root}")
-        # Ensure tables exist
+    def discover_path(self, root: Path, scanner: Optional[FileScanner]=None, progress_callback: Optional[Callable[[int, int, ProcessingStats], None]]=None) -> ProcessingStats:
+        self.logger.section(f'Discovering: {root}')
         self._ensure_tables()
-        # Create scanner if not provided
         if scanner is None:
-            scanner = FileScanner(
-                error_handler=lambda e, p: self.logger.warning(f"Error scanning {p}: {e}")
-            )
-        # Get disk info for the root path
+            scanner = FileScanner(error_handler=lambda e, p: self.logger.warning(f'Error scanning {p}: {e}'))
         disk = self.system_api.get_disk_for_path(root)
         if disk is None:
             disk = str(root)
-        # Initialize statistics
         stats = ProcessingStats()
         batch = []
         conn = self._get_connection()
         cursor = conn.cursor()
         try:
-            # Scan files
             for file_meta in scanner.scan(root):
-                # Create file record
-                record = FileRecord(
-                    path=file_meta.path,
-                    size=file_meta.size,
-                    modified_time=file_meta.modified_time,
-                    created_time=file_meta.created_time,
-                    disk_label=disk
-                )
+                record = FileRecord(path=file_meta.path, size=file_meta.size, modified_time=file_meta.modified_time, created_time=file_meta.created_time, disk_label=disk)
                 batch.append(record)
                 stats.files_processed += 1
                 stats.bytes_processed += record.size
-                # Batch insert
                 if len(batch) >= self.batch_size:
                     self._insert_batch(cursor, batch)
                     conn.commit()
                     batch.clear()
-                    # Progress callback
                     if progress_callback:
                         progress_callback(stats.files_processed, 0, stats)
-                    # Log progress
                     if stats.files_processed % (self.batch_size * 10) == 0:
-                        self.logger.progress(
-                            stats.files_processed,
-                            stats.files_processed,  # We don't know total
-                            prefix="Files discovered",
-                            bytes_processed=stats.bytes_processed,
-                            elapsed_seconds=stats.elapsed_seconds
-                        )
+                        self.logger.progress(stats.files_processed, stats.files_processed, prefix='Files discovered', bytes_processed=stats.bytes_processed, elapsed_seconds=stats.elapsed_seconds)
-            # Insert remaining batch
             if batch:
                 self._insert_batch(cursor, batch)
                 conn.commit()
             stats.files_succeeded = stats.files_processed
         except Exception as e:
             conn.rollback()
-            self.logger.error(f"Discovery failed: {e}")
+            self.logger.error(f'Discovery failed: {e}')
             raise
         finally:
             cursor.close()
-        self.logger.info(
-            f"Discovery complete: {stats.files_processed} files, "
-            f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s"
-        )
+        self.logger.info(f'Discovery complete: {stats.files_processed} files, {stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s')
         return stats
     def _insert_batch(self, cursor, batch: list[FileRecord]):
-        """Insert batch of file records
-        Args:
-            cursor: Database cursor
-            batch: List of FileRecord objects
-        """
-        query = """
-            INSERT INTO files (path, size, modified_time, created_time, disk_label, checksum, status, category, duplicate_of)
-            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
-            ON CONFLICT (path) DO UPDATE SET
-                size = EXCLUDED.size,
-                modified_time = EXCLUDED.modified_time,
-                updated_at = CURRENT_TIMESTAMP
-        """
-        data = [
-            (
-                str(record.path),
-                record.size,
-                record.modified_time,
-                record.created_time,
-                record.disk_label,
-                record.checksum,
-                record.status,
-                record.category,
-                record.duplicate_of
-            )
-            for record in batch
-        ]
+        query = '\n            INSERT INTO files (path, size, modified_time, created_time, disk_label, checksum, status, category, duplicate_of)\n            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)\n            ON CONFLICT (path) DO UPDATE SET\n                size = EXCLUDED.size,\n                modified_time = EXCLUDED.modified_time,\n                updated_at = CURRENT_TIMESTAMP\n        '
+        data = [(str(record.path), record.size, record.modified_time, record.created_time, record.disk_label, record.checksum, record.status, record.category, record.duplicate_of) for record in batch]
         execute_batch(cursor, query, data, page_size=self.batch_size)
     def get_disk_info(self) -> list[DiskInfo]:
-        """Get information about all disks
-        Returns:
-            List of DiskInfo objects
-        """
-        self.logger.subsection("Querying disk information")
+        self.logger.subsection('Querying disk information')
         disks = []
         for disk_info in self.system_api.query_nvmes():
-            # Get mount point if available
             mount_point = None
-            fs_type = "unknown"
+            fs_type = 'unknown'
             for mount in self.system_api.query_mounts():
                 if mount.device == disk_info.device:
                     mount_point = Path(mount.mount_point)
                     fs_type = mount.fs_type
                     break
             if mount_point:
                 total, used, free = self.system_api.get_disk_usage(mount_point)
             else:
                 total = disk_info.size
                 used = 0
                 free = disk_info.size
-            disk = DiskInfo(
-                name=disk_info.device,
-                device=disk_info.device,
-                mount_point=mount_point or Path("/"),
-                total_size=total,
-                used_size=used,
-                free_size=free,
-                fs_type=fs_type
-            )
+            disk = DiskInfo(name=disk_info.device, device=disk_info.device, mount_point=mount_point or Path('/'), total_size=total, used_size=used, free_size=free, fs_type=fs_type)
             disks.append(disk)
-            self.logger.info(
-                f"  {disk.name}: {disk.usage_percent:.1f}% used "
-                f"({disk.used_size:,} / {disk.total_size:,} bytes)"
-            )
+            self.logger.info(f'  {disk.name}: {disk.usage_percent:.1f}% used ({disk.used_size:,} / {disk.total_size:,} bytes)')
         return disks
-    def get_file_count(self, disk: Optional[str] = None) -> int:
-        """Get count of discovered files
-        Args:
-            disk: Optional disk filter
-        Returns:
-            Count of files
-        """
+    def get_file_count(self, disk: Optional[str]=None) -> int:
         conn = self._get_connection()
         cursor = conn.cursor()
         if disk:
-            cursor.execute("SELECT COUNT(*) FROM files WHERE disk_label = %s", (disk,))
+            cursor.execute('SELECT COUNT(*) FROM files WHERE disk_label = %s', (disk,))
         else:
-            cursor.execute("SELECT COUNT(*) FROM files")
+            cursor.execute('SELECT COUNT(*) FROM files')
         count = cursor.fetchone()[0]
         cursor.close()
         return count
-    def get_total_size(self, disk: Optional[str] = None) -> int:
-        """Get total size of discovered files
-        Args:
-            disk: Optional disk filter
-        Returns:
-            Total size in bytes
-        """
+    def get_total_size(self, disk: Optional[str]=None) -> int:
         conn = self._get_connection()
         cursor = conn.cursor()
         if disk:
-            cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files WHERE disk_label = %s", (disk,))
+            cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files WHERE disk_label = %s', (disk,))
         else:
-            cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files")
+            cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files')
         total = cursor.fetchone()[0]
         cursor.close()
         return total
     def close(self):
-        """Close database connection"""
-        if self._connection and not self._connection.closed:
+        if self._connection and (not self._connection.closed):
             self._connection.close()
     def __enter__(self):
-        """Context manager entry"""
         return self
     def __exit__(self, exc_type, exc_val, exc_tb):
-        """Context manager exit"""
        self.close()

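DiscoveryEngine upserts on path (ON CONFLICT (path) DO UPDATE) and commits every batch_size records, so re-running discovery refreshes size and mtime instead of duplicating rows. A usage sketch; the DatabaseConfig and ProgressLogger constructors are assumptions, since app/shared is not part of this diff:

    from pathlib import Path
    from app.discovery import DiscoveryEngine
    from app.shared.config import DatabaseConfig
    from app.shared.logger import ProgressLogger

    # Constructor arguments are assumed; only the attribute names
    # (host, port, database, user, password) are visible in this diff.
    config = DatabaseConfig(host='localhost', port=5432, database='catalog',
                            user='catalog', password='secret')

    # The engine is a context manager; __exit__ closes the connection.
    with DiscoveryEngine(config, ProgressLogger(), batch_size=1000) as engine:
        engine.discover_path(Path('/data'))
        print(engine.get_file_count(), engine.get_total_size())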
app/discovery/scanner.py

@@ -1,28 +1,12 @@
-"""File system scanner implementing IFileScanner protocol"""
 import os
 from pathlib import Path
 from typing import Iterator, Optional, Callable
 from datetime import datetime
 from ._protocols import FileMeta
 class FileScanner:
-    """File system scanner with filtering and error handling"""
-    def __init__(
-        self,
-        follow_symlinks: bool = False,
-        skip_hidden: bool = True,
-        error_handler: Optional[Callable[[Exception, Path], None]] = None
-    ):
-        """Initialize file scanner
-        Args:
-            follow_symlinks: Whether to follow symbolic links
-            skip_hidden: Whether to skip hidden files/directories
-            error_handler: Optional callback for handling errors during scan
-        """
+    def __init__(self, follow_symlinks: bool=False, skip_hidden: bool=True, error_handler: Optional[Callable[[Exception, Path], None]]=None):
         self.follow_symlinks = follow_symlinks
         self.skip_hidden = skip_hidden
         self.error_handler = error_handler
@@ -31,24 +15,14 @@ class FileScanner:
         self._errors = 0
     def scan(self, root: Path) -> Iterator[FileMeta]:
-        """Scan a directory tree and yield file metadata
-        Args:
-            root: Root directory to scan
-        Yields:
-            FileMeta objects for each discovered file
-        """
         if not root.exists():
-            error = FileNotFoundError(f"Path does not exist: {root}")
+            error = FileNotFoundError(f'Path does not exist: {root}')
             if self.error_handler:
                 self.error_handler(error, root)
             else:
                 raise error
             return
         if not root.is_dir():
-            # If root is a file, just return its metadata
             try:
                 yield self._get_file_meta(root)
             except Exception as e:
@@ -58,115 +32,59 @@ class FileScanner:
                 else:
                     raise
             return
-        # Walk directory tree
         for dirpath, dirnames, filenames in os.walk(root, followlinks=self.follow_symlinks):
             current_dir = Path(dirpath)
-            # Filter directories if needed
             if self.skip_hidden:
                 dirnames[:] = [d for d in dirnames if not d.startswith('.')]
-            # Process files
             for filename in filenames:
                 if self.skip_hidden and filename.startswith('.'):
                     continue
                 file_path = current_dir / filename
                 try:
-                    # Skip broken symlinks
-                    if file_path.is_symlink() and not file_path.exists():
+                    if file_path.is_symlink() and (not file_path.exists()):
                         continue
                     meta = self._get_file_meta(file_path)
                     self._files_scanned += 1
                     self._bytes_scanned += meta.size
                     yield meta
                 except PermissionError as e:
                     self._errors += 1
                     if self.error_handler:
                         self.error_handler(e, file_path)
-                    # Continue scanning
                     continue
                 except Exception as e:
                     self._errors += 1
                     if self.error_handler:
                         self.error_handler(e, file_path)
-                    # Continue scanning
                     continue
     def _get_file_meta(self, path: Path) -> FileMeta:
-        """Get file metadata
-        Args:
-            path: Path to file
-        Returns:
-            FileMeta object with file metadata
-        Raises:
-            OSError: If file cannot be accessed
-        """
         stat = path.stat()
-        # Get creation time (platform dependent)
         created_time = stat.st_ctime
         if hasattr(stat, 'st_birthtime'):
             created_time = stat.st_birthtime
-        return FileMeta(
-            path=path,
-            size=stat.st_size,
-            modified_time=stat.st_mtime,
-            created_time=created_time
-        )
+        return FileMeta(path=path, size=stat.st_size, modified_time=stat.st_mtime, created_time=created_time)
     @property
     def files_scanned(self) -> int:
-        """Get count of files scanned"""
         return self._files_scanned
     @property
     def bytes_scanned(self) -> int:
-        """Get total bytes scanned"""
         return self._bytes_scanned
     @property
     def errors(self) -> int:
-        """Get count of errors encountered"""
         return self._errors
     def reset_stats(self) -> None:
-        """Reset scanning statistics"""
         self._files_scanned = 0
         self._bytes_scanned = 0
         self._errors = 0
 class FilteredScanner(FileScanner):
-    """Scanner with additional filtering capabilities"""
-    def __init__(
-        self,
-        min_size: Optional[int] = None,
-        max_size: Optional[int] = None,
-        extensions: Optional[list[str]] = None,
-        exclude_patterns: Optional[list[str]] = None,
-        **kwargs
-    ):
-        """Initialize filtered scanner
-        Args:
-            min_size: Minimum file size in bytes
-            max_size: Maximum file size in bytes
-            extensions: List of file extensions to include (e.g., ['.txt', '.py'])
-            exclude_patterns: List of path patterns to exclude
-            **kwargs: Additional arguments passed to FileScanner
-        """
+    def __init__(self, min_size: Optional[int]=None, max_size: Optional[int]=None, extensions: Optional[list[str]]=None, exclude_patterns: Optional[list[str]]=None, **kwargs):
         super().__init__(**kwargs)
         self.min_size = min_size
         self.max_size = max_size
@@ -174,41 +92,19 @@ class FilteredScanner(FileScanner):
         self.exclude_patterns = exclude_patterns or []
     def scan(self, root: Path) -> Iterator[FileMeta]:
-        """Scan with additional filtering
-        Args:
-            root: Root directory to scan
-        Yields:
-            FileMeta objects for files matching filter criteria
-        """
         for meta in super().scan(root):
-            # Size filtering
             if self.min_size is not None and meta.size < self.min_size:
                 continue
             if self.max_size is not None and meta.size > self.max_size:
                 continue
-            # Extension filtering
             if self.extensions is not None:
                 if meta.path.suffix.lower() not in self.extensions:
                     continue
-            # Exclude pattern filtering
             if self._should_exclude(meta.path):
                 continue
             yield meta
     def _should_exclude(self, path: Path) -> bool:
-        """Check if path matches any exclude pattern
-        Args:
-            path: Path to check
-        Returns:
-            True if path should be excluded
-        """
         path_str = str(path)
         for pattern in self.exclude_patterns:
             if pattern in path_str:

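FilteredScanner layers size, extension, and substring-pattern filters on top of FileScanner.scan(). A sketch with hypothetical filter values; note that extensions are compared against path.suffix.lower(), so they should be lowercase and include the dot:

    from pathlib import Path
    from app.discovery import FilteredScanner

    scanner = FilteredScanner(
        min_size=1024,                              # skip files under 1 KiB
        extensions=['.py', '.md'],
        exclude_patterns=['node_modules', '.git'],  # plain substring matches
        skip_hidden=True,                           # forwarded to FileScanner via **kwargs
    )

    for meta in scanner.scan(Path('/data/projects')):
        print(meta.path, meta.size)

One caveat worth noting: files_scanned and bytes_scanned are incremented in the base class before filtering, so they count files seen, not files yielded.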
app/discovery/system.py

@@ -1,167 +1,80 @@
-"""System API for querying mounts and disks"""
 import os
 import subprocess
 from pathlib import Path
 from typing import Optional
 import psutil
 from ._protocols import MountInfo, DiskInfo
 class SystemAPI:
-    """System information API for querying mounts and disks"""
     def query_mounts(self) -> list[MountInfo]:
-        """Query mounted filesystems
-        Returns:
-            List of MountInfo objects for all mounted filesystems
-        """
         mounts = []
         for partition in psutil.disk_partitions(all=False):
-            mount_info = MountInfo(
-                device=partition.device,
-                mount_point=partition.mountpoint,
-                fs_type=partition.fstype,
-                options=partition.opts
-            )
+            mount_info = MountInfo(device=partition.device, mount_point=partition.mountpoint, fs_type=partition.fstype, options=partition.opts)
             mounts.append(mount_info)
         return mounts
     def query_nvmes(self) -> list[DiskInfo]:
-        """Query NVMe/disk information
-        Returns:
-            List of DiskInfo objects for all disks
-        """
         disks = []
-        # Try to get disk information using lsblk
         try:
-            result = subprocess.run(
-                ['lsblk', '-ndo', 'NAME,MODEL,SIZE,SERIAL', '-b'],
-                capture_output=True,
-                text=True,
-                check=False
-            )
+            result = subprocess.run(['lsblk', '-ndo', 'NAME,MODEL,SIZE,SERIAL', '-b'], capture_output=True, text=True, check=False)
             if result.returncode == 0:
                 for line in result.stdout.strip().split('\n'):
                     if not line.strip():
                         continue
                     parts = line.split(maxsplit=3)
                     if len(parts) >= 3:
-                        device = f"/dev/{parts[0]}"
-                        model = parts[1] if len(parts) > 1 else "Unknown"
-                        size_str = parts[2] if len(parts) > 2 else "0"
-                        serial = parts[3] if len(parts) > 3 else "Unknown"
+                        device = f'/dev/{parts[0]}'
+                        model = parts[1] if len(parts) > 1 else 'Unknown'
+                        size_str = parts[2] if len(parts) > 2 else '0'
+                        serial = parts[3] if len(parts) > 3 else 'Unknown'
                         try:
                             size = int(size_str)
                         except ValueError:
                             size = 0
-                        disk_info = DiskInfo(
-                            device=device,
-                            model=model,
-                            size=size,
-                            serial=serial
-                        )
+                        disk_info = DiskInfo(device=device, model=model, size=size, serial=serial)
                         disks.append(disk_info)
         except FileNotFoundError:
-            # lsblk not available, fall back to basic info
             pass
-        # If lsblk failed or unavailable, try alternative method
         if not disks:
             disks = self._query_disks_fallback()
         return disks
     def _query_disks_fallback(self) -> list[DiskInfo]:
-        """Fallback method for querying disk information
-        Returns:
-            List of DiskInfo objects using psutil
-        """
         disks = []
         seen_devices = set()
         for partition in psutil.disk_partitions(all=True):
             device = partition.device
-            # Skip non-disk devices
             if not device.startswith('/dev/'):
                 continue
-            # Get base device (e.g., /dev/sda from /dev/sda1)
             base_device = self._get_base_device(device)
             if base_device in seen_devices:
                 continue
             seen_devices.add(base_device)
             try:
                 usage = psutil.disk_usage(partition.mountpoint)
                 size = usage.total
             except (PermissionError, OSError):
                 size = 0
-            disk_info = DiskInfo(
-                device=base_device,
-                model="Unknown",
-                size=size,
-                serial="Unknown"
-            )
+            disk_info = DiskInfo(device=base_device, model='Unknown', size=size, serial='Unknown')
             disks.append(disk_info)
         return disks
     def _get_base_device(self, device: str) -> str:
-        """Extract base device name from partition device
-        Args:
-            device: Device path (e.g., /dev/sda1, /dev/nvme0n1p1)
-        Returns:
-            Base device path (e.g., /dev/sda, /dev/nvme0n1)
-        """
-        # Handle NVMe devices
         if 'nvme' in device:
-            # /dev/nvme0n1p1 -> /dev/nvme0n1
             if 'p' in device:
                 return device.rsplit('p', 1)[0]
             return device
-        # Handle standard devices (sda, sdb, etc.)
-        # /dev/sda1 -> /dev/sda
         import re
-        match = re.match(r'(/dev/[a-z]+)', device)
+        match = re.match('(/dev/[a-z]+)', device)
         if match:
             return match.group(1)
         return device
     def get_disk_for_path(self, path: Path) -> Optional[str]:
-        """Get the disk/mount point for a given path
-        Args:
-            path: Path to check
-        Returns:
-            Mount point device or None if not found
-        """
         path = path.resolve()
-        # Find the mount point that contains this path
         best_match = None
         best_match_len = 0
         for partition in psutil.disk_partitions():
             mount_point = Path(partition.mountpoint)
             try:
@@ -172,39 +85,19 @@ class SystemAPI:
                     best_match_len = mount_len
             except (ValueError, OSError):
                 continue
         return best_match
     def get_disk_usage(self, path: Path) -> tuple[int, int, int]:
-        """Get disk usage for a path
-        Args:
-            path: Path to check
-        Returns:
-            Tuple of (total, used, free) in bytes
-        """
         try:
             usage = psutil.disk_usage(str(path))
-            return usage.total, usage.used, usage.free
+            return (usage.total, usage.used, usage.free)
         except (PermissionError, OSError):
-            return 0, 0, 0
+            return (0, 0, 0)
     def get_mount_point(self, path: Path) -> Optional[Path]:
-        """Get the mount point for a given path
-        Args:
-            path: Path to check
-        Returns:
-            Mount point path or None if not found
-        """
         path = path.resolve()
-        # Find the mount point that contains this path
         best_match = None
         best_match_len = 0
         for partition in psutil.disk_partitions():
             mount_point = Path(partition.mountpoint)
             try:
@@ -215,19 +108,9 @@ class SystemAPI:
                     best_match_len = mount_len
             except (ValueError, OSError):
                 continue
         return best_match
     def is_same_filesystem(self, path1: Path, path2: Path) -> bool:
-        """Check if two paths are on the same filesystem
-        Args:
-            path1: First path
-            path2: Second path
-        Returns:
-            True if paths are on the same filesystem
-        """
         try:
             stat1 = path1.stat()
             stat2 = path2.stat()

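SystemAPI prefers lsblk for disk inventory and falls back to psutil when lsblk is unavailable; mounts and usage always come from psutil. A minimal sketch (the queried path is arbitrary):

    from pathlib import Path
    from app.discovery import SystemAPI

    api = SystemAPI()

    for mount in api.query_mounts():
        print(mount.device, mount.mount_point, mount.fs_type)

    # get_disk_usage() returns (0, 0, 0) rather than raising when the
    # path is unreadable.
    total, used, free = api.get_disk_usage(Path('/'))
    if total:
        print(f'{used / total:.1%} in use')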
(One file diff suppressed because it is too large.)