remove_doc

mike
2025-12-13 04:23:04 +01:00
parent 75034d5e51
commit 9759001f4c
9 changed files with 741 additions and 1641 deletions

app/content/__init__.py (new file, 3 additions)

@@ -0,0 +1,3 @@
+from .profiler import ContentProfiler
+from .extractors import ContentExtractor
+__all__ = ['ContentProfiler', 'ContentExtractor']

app/content/extractors.py

@@ -3,22 +3,14 @@ from typing import Dict, Optional
 import json
 class ContentExtractor:
     def __init__(self):
-        self.extractors = {
-            'pdf_text': self._extract_pdf,
-            'ocr+caption': self._extract_image,
-            'transcribe': self._extract_audio,
-            'transcribe+scenes': self._extract_video,
-            'office_text': self._extract_document,
-            'read': self._extract_text,
-            'read+syntax': self._extract_code
-        }
+        self.extractors = {'pdf_text': self._extract_pdf, 'ocr+caption': self._extract_image, 'transcribe': self._extract_audio, 'transcribe+scenes': self._extract_video, 'office_text': self._extract_document, 'read': self._extract_text, 'read+syntax': self._extract_code}
     def extract(self, file_path: Path, extractor_type: str) -> Dict:
         extractor = self.extractors.get(extractor_type)
         if not extractor:
             return {'error': f'Unknown extractor: {extractor_type}'}
         try:
             return extractor(file_path)
         except Exception as e:
@@ -28,11 +20,7 @@ class ContentExtractor:
         try:
             with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                 content = f.read(1024 * 1024)
-            return {
-                'text': content,
-                'char_count': len(content),
-                'needs_llm': False
-            }
+            return {'text': content, 'char_count': len(content), 'needs_llm': False}
         except Exception as e:
             return {'error': str(e)}
@@ -51,54 +39,24 @@ class ContentExtractor:
                 pdf = PyPDF2.PdfReader(f)
                 for page in pdf.pages[:10]:
                     text_parts.append(page.extract_text())
            text = '\n'.join(text_parts)
-            return {
-                'text': text,
-                'pages_extracted': len(text_parts),
-                'needs_llm': len(text.strip()) > 100,
-                'type': 'document'
-            }
+            return {'text': text, 'pages_extracted': len(text_parts), 'needs_llm': len(text.strip()) > 100, 'type': 'document'}
         except Exception as e:
             return {'error': str(e), 'needs_ocr': True}
     def _extract_image(self, file_path: Path) -> Dict:
-        return {
-            'type': 'image',
-            'needs_ocr': True,
-            'needs_caption': True,
-            'needs_llm': True,
-            'pipeline': ['ocr', 'caption', 'embedding'],
-            'status': 'pending'
-        }
+        return {'type': 'image', 'needs_ocr': True, 'needs_caption': True, 'needs_llm': True, 'pipeline': ['ocr', 'caption', 'embedding'], 'status': 'pending'}
     def _extract_audio(self, file_path: Path) -> Dict:
-        return {
-            'type': 'audio',
-            'needs_transcription': True,
-            'needs_llm': True,
-            'pipeline': ['transcribe', 'summarize'],
-            'status': 'pending'
-        }
+        return {'type': 'audio', 'needs_transcription': True, 'needs_llm': True, 'pipeline': ['transcribe', 'summarize'], 'status': 'pending'}
     def _extract_video(self, file_path: Path) -> Dict:
-        return {
-            'type': 'video',
-            'needs_transcription': True,
-            'needs_scene_detection': True,
-            'needs_llm': True,
-            'pipeline': ['transcribe', 'scenes', 'summarize'],
-            'status': 'pending'
-        }
+        return {'type': 'video', 'needs_transcription': True, 'needs_scene_detection': True, 'needs_llm': True, 'pipeline': ['transcribe', 'scenes', 'summarize'], 'status': 'pending'}
     def _extract_document(self, file_path: Path) -> Dict:
         try:
             import textract
             text = textract.process(str(file_path)).decode('utf-8')
-            return {
-                'text': text,
-                'type': 'document',
-                'needs_llm': len(text.strip()) > 100
-            }
+            return {'text': text, 'type': 'document', 'needs_llm': len(text.strip()) > 100}
         except:
             return {'error': 'textract failed', 'needs_llm': True}

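For review context, ContentExtractor is table-driven: extract() looks the key up in self.extractors and returns an error dict rather than raising on an unknown key. A minimal usage sketch, assuming a hypothetical file name; the import path follows the new app/content/__init__.py:

    from pathlib import Path
    from app.content import ContentExtractor

    extractor = ContentExtractor()

    # 'read' dispatches to _extract_text, which reads at most 1 MiB of UTF-8 text.
    result = extractor.extract(Path('notes.txt'), 'read')

    if 'error' in result:
        print(f"extraction failed: {result['error']}")
    else:
        print(f"{result['char_count']} chars, needs_llm={result['needs_llm']}")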
app/content/profiler.py

@@ -6,21 +6,10 @@ import json
 from datetime import datetime
 class ContentProfiler:
     def __init__(self):
         self.mime_detector = magic.Magic(mime=True)
-        self.kind_mapping = {
-            'text': ['text/plain', 'text/html', 'text/css', 'text/javascript', 'text/markdown'],
-            'code': ['application/x-python', 'application/javascript', 'text/x-java', 'text/x-c'],
-            'pdf': ['application/pdf'],
-            'image': ['image/jpeg', 'image/png', 'image/gif', 'image/webp', 'image/svg+xml'],
-            'audio': ['audio/mpeg', 'audio/wav', 'audio/ogg', 'audio/flac'],
-            'video': ['video/mp4', 'video/x-matroska', 'video/avi', 'video/webm'],
-            'archive': ['application/zip', 'application/x-tar', 'application/gzip', 'application/x-7z-compressed'],
-            'document': ['application/msword', 'application/vnd.openxmlformats-officedocument'],
-            'spreadsheet': ['application/vnd.ms-excel', 'text/csv']
-        }
+        self.kind_mapping = {'text': ['text/plain', 'text/html', 'text/css', 'text/javascript', 'text/markdown'], 'code': ['application/x-python', 'application/javascript', 'text/x-java', 'text/x-c'], 'pdf': ['application/pdf'], 'image': ['image/jpeg', 'image/png', 'image/gif', 'image/webp', 'image/svg+xml'], 'audio': ['audio/mpeg', 'audio/wav', 'audio/ogg', 'audio/flac'], 'video': ['video/mp4', 'video/x-matroska', 'video/avi', 'video/webm'], 'archive': ['application/zip', 'application/x-tar', 'application/gzip', 'application/x-7z-compressed'], 'document': ['application/msword', 'application/vnd.openxmlformats-officedocument'], 'spreadsheet': ['application/vnd.ms-excel', 'text/csv']}
         self.text_exts = {'.txt', '.md', '.rst', '.log', '.json', '.xml', '.yaml', '.yml', '.toml', '.ini', '.cfg'}
         self.code_exts = {'.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h', '.cs', '.rb', '.php'}
         self.processable_kinds = {'text', 'code', 'pdf', 'image', 'audio', 'video', 'document'}
@@ -30,29 +19,12 @@ class ContentProfiler:
            stat = file_path.stat()
            size = stat.st_size
            mtime = datetime.fromtimestamp(stat.st_mtime)
            mime_type = self._detect_mime(file_path)
            kind = self._determine_kind(file_path, mime_type)
-            profile = {
-                'path': str(file_path),
-                'size': size,
-                'mtime': mtime.isoformat(),
-                'mime': mime_type,
-                'kind': kind,
-                'processable': kind in self.processable_kinds,
-                'extractor': self._suggest_extractor(kind, mime_type),
-                'hints': self._extract_hints(file_path, kind, mime_type, size)
-            }
+            profile = {'path': str(file_path), 'size': size, 'mtime': mtime.isoformat(), 'mime': mime_type, 'kind': kind, 'processable': kind in self.processable_kinds, 'extractor': self._suggest_extractor(kind, mime_type), 'hints': self._extract_hints(file_path, kind, mime_type, size)}
            return profile
        except Exception as e:
-            return {
-                'path': str(file_path),
-                'error': str(e),
-                'processable': False
-            }
+            return {'path': str(file_path), 'error': str(e), 'processable': False}
     def _detect_mime(self, file_path: Path) -> str:
         try:
@@ -63,61 +35,42 @@ class ContentProfiler:
     def _determine_kind(self, file_path: Path, mime_type: str) -> str:
         for kind, mimes in self.kind_mapping.items():
-            if any(mime in mime_type for mime in mimes):
+            if any((mime in mime_type for mime in mimes)):
                 return kind
         suffix = file_path.suffix.lower()
         if suffix in self.text_exts:
             return 'text'
         if suffix in self.code_exts:
             return 'code'
         return 'unknown'
     def _suggest_extractor(self, kind: str, mime_type: str) -> Optional[str]:
-        extractors = {
-            'pdf': 'pdf_text',
-            'image': 'ocr+caption',
-            'audio': 'transcribe',
-            'video': 'transcribe+scenes',
-            'document': 'office_text',
-            'text': 'read',
-            'code': 'read+syntax'
-        }
+        extractors = {'pdf': 'pdf_text', 'image': 'ocr+caption', 'audio': 'transcribe', 'video': 'transcribe+scenes', 'document': 'office_text', 'text': 'read', 'code': 'read+syntax'}
         return extractors.get(kind)
     def _extract_hints(self, file_path: Path, kind: str, mime_type: str, size: int) -> Dict:
         hints = {}
         if kind == 'text' or kind == 'code':
             hints['language'] = self._guess_language(file_path)
             if size < 1024 * 1024:
                 hints['lines'] = self._count_lines(file_path)
         if kind == 'pdf':
             hints['page_count'] = self._get_pdf_pages(file_path)
         if kind in ['audio', 'video']:
             hints['duration'] = self._get_media_duration(file_path)
         if kind == 'image':
             hints['has_exif'] = self._has_exif(file_path)
             hints['dimensions'] = self._get_image_dimensions(file_path)
         return hints
     def _guess_language(self, file_path: Path) -> Optional[str]:
-        lang_map = {
-            '.py': 'python', '.js': 'javascript', '.ts': 'typescript',
-            '.java': 'java', '.go': 'go', '.rs': 'rust', '.c': 'c',
-            '.cpp': 'cpp', '.cs': 'csharp', '.rb': 'ruby', '.php': 'php'
-        }
+        lang_map = {'.py': 'python', '.js': 'javascript', '.ts': 'typescript', '.java': 'java', '.go': 'go', '.rs': 'rust', '.c': 'c', '.cpp': 'cpp', '.cs': 'csharp', '.rb': 'ruby', '.php': 'php'}
         return lang_map.get(file_path.suffix.lower())
     def _count_lines(self, file_path: Path) -> Optional[int]:
         try:
             with open(file_path, 'rb') as f:
-                return sum(1 for _ in f)
+                return sum((1 for _ in f))
         except:
             return None

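ContentProfiler chains MIME detection, kind classification, and extractor suggestion, so its output feeds ContentExtractor directly. A sketch of that handoff; the entry-point name profile() and the sample file are assumptions, since the hunk headers above do not show the method signature:

    from pathlib import Path
    from app.content import ContentProfiler, ContentExtractor

    profiler = ContentProfiler()
    extractor = ContentExtractor()

    # profile() is the assumed entry point; on failure it returns a dict
    # with 'error' and 'processable': False instead of raising.
    profile = profiler.profile(Path('report.pdf'))

    if profile.get('processable') and profile.get('extractor'):
        # e.g. kind 'pdf' maps to extractor key 'pdf_text'
        content = extractor.extract(Path(profile['path']), profile['extractor'])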
app/discovery/__init__.py

@@ -1,17 +1,5 @@
-"""Discovery package exports"""
 from .scanner import FileScanner, FilteredScanner
 from .system import SystemAPI
 from .engine import DiscoveryEngine
 from ._protocols import FileMeta, MountInfo, DiskInfo, IFileScanner, ISystemAPI
-__all__ = [
-    'FileScanner',
-    'FilteredScanner',
-    'SystemAPI',
-    'DiscoveryEngine',
-    'FileMeta',
-    'MountInfo',
-    'DiskInfo',
-    'IFileScanner',
-    'ISystemAPI',
-]
+__all__ = ['FileScanner', 'FilteredScanner', 'SystemAPI', 'DiscoveryEngine', 'FileMeta', 'MountInfo', 'DiskInfo', 'IFileScanner', 'ISystemAPI']

app/discovery/_protocols.py

@@ -1,54 +1,37 @@
-"""Protocol definitions for the discovery package"""
 from typing import Iterator, Protocol, Any
 from pathlib import Path
 from dataclasses import dataclass
 @dataclass
 class FileMeta:
-    """Metadata for a discovered file"""
     path: Path
     size: int
     modified_time: float
     created_time: float
-    # Add other metadata fields as needed
 @dataclass
 class MountInfo:
-    """Information about a mounted filesystem"""
     device: str
     mount_point: str
     fs_type: str
     options: str
-    # Add other mount info fields as needed
 @dataclass
 class DiskInfo:
-    """Information about a disk/NVMe device"""
     device: str
     model: str
     size: int
     serial: str
-    # Add other disk info fields as needed
 class IFileScanner(Protocol):
-    """Protocol for file scanning operations"""
     def scan(self, root: Path) -> Iterator[FileMeta]:
-        """Scan a directory tree and yield file metadata"""
         ...
 class ISystemAPI(Protocol):
-    """Protocol for system information queries"""
     def query_mounts(self) -> list[MountInfo]:
-        """Query mounted filesystems"""
         ...
     def query_nvmes(self) -> list[DiskInfo]:
-        """Query NVMe/disk information"""
         ...

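Since IFileScanner and ISystemAPI are typing.Protocol classes, implementations conform structurally and never need to inherit from them. A sketch with a hypothetical in-memory test double:

    from pathlib import Path
    from typing import Iterator
    from app.discovery import FileMeta, IFileScanner

    class InMemoryScanner:
        # Hypothetical test double; it satisfies IFileScanner without inheriting it.
        def __init__(self, entries: list[FileMeta]):
            self.entries = entries

        def scan(self, root: Path) -> Iterator[FileMeta]:
            yield from self.entries

    # Structural typing: any object with a matching scan() type-checks here.
    scanner: IFileScanner = InMemoryScanner([FileMeta(Path('/tmp/a.txt'), 42, 0.0, 0.0)])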
app/discovery/engine.py

@@ -1,10 +1,8 @@
-"""Discovery engine coordinating scanner and system APIs"""
 from pathlib import Path
 from typing import Optional, Callable
 from datetime import datetime
 import psycopg2
 from psycopg2.extras import execute_batch
 from .scanner import FileScanner
 from .system import SystemAPI
 from ._protocols import FileMeta
@@ -12,23 +10,9 @@ from ..shared.models import FileRecord, DiskInfo, ProcessingStats
 from ..shared.config import DatabaseConfig
 from ..shared.logger import ProgressLogger
 class DiscoveryEngine:
-    """Discovery engine for scanning and cataloging files"""
-    def __init__(
-        self,
-        db_config: DatabaseConfig,
-        logger: ProgressLogger,
-        batch_size: int = 1000
-    ):
-        """Initialize discovery engine
-        Args:
-            db_config: Database configuration
-            logger: Progress logger
-            batch_size: Number of records to batch before database commit
-        """
+    def __init__(self, db_config: DatabaseConfig, logger: ProgressLogger, batch_size: int=1000):
         self.db_config = db_config
         self.logger = logger
         self.batch_size = batch_size
@@ -36,286 +20,114 @@ class DiscoveryEngine:
         self._connection = None
     def _get_connection(self):
-        """Get or create database connection"""
         if self._connection is None or self._connection.closed:
-            self._connection = psycopg2.connect(
-                host=self.db_config.host,
-                port=self.db_config.port,
-                database=self.db_config.database,
-                user=self.db_config.user,
-                password=self.db_config.password
-            )
+            self._connection = psycopg2.connect(host=self.db_config.host, port=self.db_config.port, database=self.db_config.database, user=self.db_config.user, password=self.db_config.password)
         return self._connection
     def _ensure_tables(self):
-        """Ensure database tables exist"""
         conn = self._get_connection()
         cursor = conn.cursor()
-        # Create files table
-        cursor.execute("""
-            CREATE TABLE IF NOT EXISTS files (
-                id SERIAL PRIMARY KEY,
-                path TEXT NOT NULL UNIQUE,
-                size BIGINT NOT NULL,
-                modified_time DOUBLE PRECISION NOT NULL,
-                created_time DOUBLE PRECISION NOT NULL,
-                disk_label TEXT NOT NULL,
-                checksum TEXT,
-                status TEXT DEFAULT 'indexed',
-                category TEXT,
-                duplicate_of TEXT,
-                discovered_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
-                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
-            )
-        """)
-        # Create index on path
-        cursor.execute("""
-            CREATE INDEX IF NOT EXISTS idx_files_path ON files(path)
-        """)
-        # Create index on disk
-        cursor.execute("""
-            CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label)
-        """)
-        # Create index on checksum
-        cursor.execute("""
-            CREATE INDEX IF NOT EXISTS idx_files_checksum ON files(checksum)
-        """)
+        cursor.execute("\n            CREATE TABLE IF NOT EXISTS files (\n                id SERIAL PRIMARY KEY,\n                path TEXT NOT NULL UNIQUE,\n                size BIGINT NOT NULL,\n                modified_time DOUBLE PRECISION NOT NULL,\n                created_time DOUBLE PRECISION NOT NULL,\n                disk_label TEXT NOT NULL,\n                checksum TEXT,\n                status TEXT DEFAULT 'indexed',\n                category TEXT,\n                duplicate_of TEXT,\n                discovered_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,\n                updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n            )\n        ")
+        cursor.execute('\n            CREATE INDEX IF NOT EXISTS idx_files_path ON files(path)\n        ')
+        cursor.execute('\n            CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label)\n        ')
+        cursor.execute('\n            CREATE INDEX IF NOT EXISTS idx_files_checksum ON files(checksum)\n        ')
         conn.commit()
         cursor.close()
-    def discover_path(
-        self,
-        root: Path,
-        scanner: Optional[FileScanner] = None,
-        progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
-    ) -> ProcessingStats:
-        """Discover and catalog files in a path
-        Args:
-            root: Root path to discover
-            scanner: Optional custom scanner (default: FileScanner())
-            progress_callback: Optional callback for progress updates
-        Returns:
-            ProcessingStats with discovery statistics
-        """
-        self.logger.section(f"Discovering: {root}")
-        # Ensure tables exist
+    def discover_path(self, root: Path, scanner: Optional[FileScanner]=None, progress_callback: Optional[Callable[[int, int, ProcessingStats], None]]=None) -> ProcessingStats:
+        self.logger.section(f'Discovering: {root}')
         self._ensure_tables()
-        # Create scanner if not provided
         if scanner is None:
-            scanner = FileScanner(
-                error_handler=lambda e, p: self.logger.warning(f"Error scanning {p}: {e}")
-            )
-        # Get disk info for the root path
+            scanner = FileScanner(error_handler=lambda e, p: self.logger.warning(f'Error scanning {p}: {e}'))
         disk = self.system_api.get_disk_for_path(root)
         if disk is None:
             disk = str(root)
-        # Initialize statistics
         stats = ProcessingStats()
         batch = []
         conn = self._get_connection()
         cursor = conn.cursor()
         try:
-            # Scan files
             for file_meta in scanner.scan(root):
-                # Create file record
-                record = FileRecord(
-                    path=file_meta.path,
-                    size=file_meta.size,
-                    modified_time=file_meta.modified_time,
-                    created_time=file_meta.created_time,
-                    disk_label=disk
-                )
+                record = FileRecord(path=file_meta.path, size=file_meta.size, modified_time=file_meta.modified_time, created_time=file_meta.created_time, disk_label=disk)
                 batch.append(record)
                 stats.files_processed += 1
                 stats.bytes_processed += record.size
-                # Batch insert
                 if len(batch) >= self.batch_size:
                     self._insert_batch(cursor, batch)
                     conn.commit()
                     batch.clear()
-                    # Progress callback
                     if progress_callback:
                         progress_callback(stats.files_processed, 0, stats)
-                    # Log progress
                     if stats.files_processed % (self.batch_size * 10) == 0:
-                        self.logger.progress(
-                            stats.files_processed,
-                            stats.files_processed,  # We don't know total
-                            prefix="Files discovered",
-                            bytes_processed=stats.bytes_processed,
-                            elapsed_seconds=stats.elapsed_seconds
-                        )
+                        self.logger.progress(stats.files_processed, stats.files_processed, prefix='Files discovered', bytes_processed=stats.bytes_processed, elapsed_seconds=stats.elapsed_seconds)
-            # Insert remaining batch
             if batch:
                 self._insert_batch(cursor, batch)
                 conn.commit()
             stats.files_succeeded = stats.files_processed
         except Exception as e:
             conn.rollback()
-            self.logger.error(f"Discovery failed: {e}")
+            self.logger.error(f'Discovery failed: {e}')
             raise
         finally:
             cursor.close()
-        self.logger.info(
-            f"Discovery complete: {stats.files_processed} files, "
-            f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s"
-        )
+        self.logger.info(f'Discovery complete: {stats.files_processed} files, {stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s')
         return stats
     def _insert_batch(self, cursor, batch: list[FileRecord]):
-        """Insert batch of file records
-        Args:
-            cursor: Database cursor
-            batch: List of FileRecord objects
-        """
-        query = """
-            INSERT INTO files (path, size, modified_time, created_time, disk_label, checksum, status, category, duplicate_of)
-            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
-            ON CONFLICT (path) DO UPDATE SET
-                size = EXCLUDED.size,
-                modified_time = EXCLUDED.modified_time,
-                updated_at = CURRENT_TIMESTAMP
-        """
-        data = [
-            (
-                str(record.path),
-                record.size,
-                record.modified_time,
-                record.created_time,
-                record.disk_label,
-                record.checksum,
-                record.status,
-                record.category,
-                record.duplicate_of
-            )
-            for record in batch
-        ]
+        query = '\n            INSERT INTO files (path, size, modified_time, created_time, disk_label, checksum, status, category, duplicate_of)\n            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)\n            ON CONFLICT (path) DO UPDATE SET\n                size = EXCLUDED.size,\n                modified_time = EXCLUDED.modified_time,\n                updated_at = CURRENT_TIMESTAMP\n        '
+        data = [(str(record.path), record.size, record.modified_time, record.created_time, record.disk_label, record.checksum, record.status, record.category, record.duplicate_of) for record in batch]
         execute_batch(cursor, query, data, page_size=self.batch_size)
     def get_disk_info(self) -> list[DiskInfo]:
-        """Get information about all disks
-        Returns:
-            List of DiskInfo objects
-        """
-        self.logger.subsection("Querying disk information")
+        self.logger.subsection('Querying disk information')
         disks = []
         for disk_info in self.system_api.query_nvmes():
-            # Get mount point if available
             mount_point = None
-            fs_type = "unknown"
+            fs_type = 'unknown'
             for mount in self.system_api.query_mounts():
                 if mount.device == disk_info.device:
                     mount_point = Path(mount.mount_point)
                     fs_type = mount.fs_type
                     break
             if mount_point:
                 total, used, free = self.system_api.get_disk_usage(mount_point)
             else:
                 total = disk_info.size
                 used = 0
                 free = disk_info.size
-            disk = DiskInfo(
-                name=disk_info.device,
-                device=disk_info.device,
-                mount_point=mount_point or Path("/"),
-                total_size=total,
-                used_size=used,
-                free_size=free,
-                fs_type=fs_type
-            )
+            disk = DiskInfo(name=disk_info.device, device=disk_info.device, mount_point=mount_point or Path('/'), total_size=total, used_size=used, free_size=free, fs_type=fs_type)
             disks.append(disk)
-            self.logger.info(
-                f"  {disk.name}: {disk.usage_percent:.1f}% used "
-                f"({disk.used_size:,} / {disk.total_size:,} bytes)"
-            )
+            self.logger.info(f'  {disk.name}: {disk.usage_percent:.1f}% used ({disk.used_size:,} / {disk.total_size:,} bytes)')
         return disks
-    def get_file_count(self, disk: Optional[str] = None) -> int:
-        """Get count of discovered files
-        Args:
-            disk: Optional disk filter
-        Returns:
-            Count of files
-        """
+    def get_file_count(self, disk: Optional[str]=None) -> int:
         conn = self._get_connection()
         cursor = conn.cursor()
         if disk:
-            cursor.execute("SELECT COUNT(*) FROM files WHERE disk_label = %s", (disk,))
+            cursor.execute('SELECT COUNT(*) FROM files WHERE disk_label = %s', (disk,))
         else:
-            cursor.execute("SELECT COUNT(*) FROM files")
+            cursor.execute('SELECT COUNT(*) FROM files')
         count = cursor.fetchone()[0]
         cursor.close()
         return count
-    def get_total_size(self, disk: Optional[str] = None) -> int:
-        """Get total size of discovered files
-        Args:
-            disk: Optional disk filter
-        Returns:
-            Total size in bytes
-        """
+    def get_total_size(self, disk: Optional[str]=None) -> int:
         conn = self._get_connection()
         cursor = conn.cursor()
         if disk:
-            cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files WHERE disk_label = %s", (disk,))
+            cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files WHERE disk_label = %s', (disk,))
         else:
-            cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files")
+            cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files')
         total = cursor.fetchone()[0]
         cursor.close()
         return total
     def close(self):
-        """Close database connection"""
-        if self._connection and not self._connection.closed:
+        if self._connection and (not self._connection.closed):
             self._connection.close()
     def __enter__(self):
-        """Context manager entry"""
         return self
     def __exit__(self, exc_type, exc_val, exc_tb):
-        """Context manager exit"""
        self.close()

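DiscoveryEngine upserts on path (ON CONFLICT (path) DO UPDATE) and commits every batch_size records, so re-running discovery refreshes size and mtime instead of duplicating rows. A usage sketch; the DatabaseConfig and ProgressLogger constructors are assumptions, since app/shared is not part of this diff:

    from pathlib import Path
    from app.discovery import DiscoveryEngine
    from app.shared.config import DatabaseConfig
    from app.shared.logger import ProgressLogger

    # Constructor arguments are assumed; only the attribute names
    # (host, port, database, user, password) are visible in this diff.
    config = DatabaseConfig(host='localhost', port=5432, database='catalog',
                            user='catalog', password='secret')

    # The engine is a context manager; __exit__ closes the connection.
    with DiscoveryEngine(config, ProgressLogger(), batch_size=1000) as engine:
        engine.discover_path(Path('/data'))
        print(engine.get_file_count(), engine.get_total_size())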
app/discovery/scanner.py

@@ -1,28 +1,12 @@
-"""File system scanner implementing IFileScanner protocol"""
 import os
 from pathlib import Path
 from typing import Iterator, Optional, Callable
 from datetime import datetime
 from ._protocols import FileMeta
 class FileScanner:
-    """File system scanner with filtering and error handling"""
-    def __init__(
-        self,
-        follow_symlinks: bool = False,
-        skip_hidden: bool = True,
-        error_handler: Optional[Callable[[Exception, Path], None]] = None
-    ):
-        """Initialize file scanner
-        Args:
-            follow_symlinks: Whether to follow symbolic links
-            skip_hidden: Whether to skip hidden files/directories
-            error_handler: Optional callback for handling errors during scan
-        """
+    def __init__(self, follow_symlinks: bool=False, skip_hidden: bool=True, error_handler: Optional[Callable[[Exception, Path], None]]=None):
         self.follow_symlinks = follow_symlinks
         self.skip_hidden = skip_hidden
         self.error_handler = error_handler
@@ -31,24 +15,14 @@ class FileScanner:
         self._errors = 0
     def scan(self, root: Path) -> Iterator[FileMeta]:
-        """Scan a directory tree and yield file metadata
-        Args:
-            root: Root directory to scan
-        Yields:
-            FileMeta objects for each discovered file
-        """
         if not root.exists():
-            error = FileNotFoundError(f"Path does not exist: {root}")
+            error = FileNotFoundError(f'Path does not exist: {root}')
             if self.error_handler:
                 self.error_handler(error, root)
             else:
                 raise error
             return
         if not root.is_dir():
-            # If root is a file, just return its metadata
             try:
                 yield self._get_file_meta(root)
             except Exception as e:
@@ -58,115 +32,59 @@ class FileScanner:
                 else:
                     raise
             return
-        # Walk directory tree
         for dirpath, dirnames, filenames in os.walk(root, followlinks=self.follow_symlinks):
             current_dir = Path(dirpath)
-            # Filter directories if needed
             if self.skip_hidden:
                 dirnames[:] = [d for d in dirnames if not d.startswith('.')]
-            # Process files
             for filename in filenames:
                 if self.skip_hidden and filename.startswith('.'):
                     continue
                 file_path = current_dir / filename
                 try:
-                    # Skip broken symlinks
-                    if file_path.is_symlink() and not file_path.exists():
+                    if file_path.is_symlink() and (not file_path.exists()):
                         continue
                     meta = self._get_file_meta(file_path)
                     self._files_scanned += 1
                     self._bytes_scanned += meta.size
                     yield meta
                 except PermissionError as e:
                     self._errors += 1
                     if self.error_handler:
                         self.error_handler(e, file_path)
-                    # Continue scanning
                     continue
                 except Exception as e:
                     self._errors += 1
                     if self.error_handler:
                         self.error_handler(e, file_path)
-                    # Continue scanning
                     continue
     def _get_file_meta(self, path: Path) -> FileMeta:
-        """Get file metadata
-        Args:
-            path: Path to file
-        Returns:
-            FileMeta object with file metadata
-        Raises:
-            OSError: If file cannot be accessed
-        """
         stat = path.stat()
-        # Get creation time (platform dependent)
         created_time = stat.st_ctime
         if hasattr(stat, 'st_birthtime'):
             created_time = stat.st_birthtime
-        return FileMeta(
-            path=path,
-            size=stat.st_size,
-            modified_time=stat.st_mtime,
-            created_time=created_time
-        )
+        return FileMeta(path=path, size=stat.st_size, modified_time=stat.st_mtime, created_time=created_time)
     @property
     def files_scanned(self) -> int:
-        """Get count of files scanned"""
         return self._files_scanned
     @property
     def bytes_scanned(self) -> int:
-        """Get total bytes scanned"""
         return self._bytes_scanned
     @property
     def errors(self) -> int:
-        """Get count of errors encountered"""
         return self._errors
     def reset_stats(self) -> None:
-        """Reset scanning statistics"""
         self._files_scanned = 0
         self._bytes_scanned = 0
         self._errors = 0
 class FilteredScanner(FileScanner):
-    """Scanner with additional filtering capabilities"""
-    def __init__(
-        self,
-        min_size: Optional[int] = None,
-        max_size: Optional[int] = None,
-        extensions: Optional[list[str]] = None,
-        exclude_patterns: Optional[list[str]] = None,
-        **kwargs
-    ):
-        """Initialize filtered scanner
-        Args:
-            min_size: Minimum file size in bytes
-            max_size: Maximum file size in bytes
-            extensions: List of file extensions to include (e.g., ['.txt', '.py'])
-            exclude_patterns: List of path patterns to exclude
-            **kwargs: Additional arguments passed to FileScanner
-        """
+    def __init__(self, min_size: Optional[int]=None, max_size: Optional[int]=None, extensions: Optional[list[str]]=None, exclude_patterns: Optional[list[str]]=None, **kwargs):
         super().__init__(**kwargs)
         self.min_size = min_size
         self.max_size = max_size
@@ -174,41 +92,19 @@ class FilteredScanner(FileScanner):
         self.exclude_patterns = exclude_patterns or []
     def scan(self, root: Path) -> Iterator[FileMeta]:
-        """Scan with additional filtering
-        Args:
-            root: Root directory to scan
-        Yields:
-            FileMeta objects for files matching filter criteria
-        """
         for meta in super().scan(root):
-            # Size filtering
             if self.min_size is not None and meta.size < self.min_size:
                 continue
             if self.max_size is not None and meta.size > self.max_size:
                 continue
-            # Extension filtering
             if self.extensions is not None:
                 if meta.path.suffix.lower() not in self.extensions:
                     continue
-            # Exclude pattern filtering
             if self._should_exclude(meta.path):
                 continue
             yield meta
     def _should_exclude(self, path: Path) -> bool:
-        """Check if path matches any exclude pattern
-        Args:
-            path: Path to check
-        Returns:
-            True if path should be excluded
-        """
         path_str = str(path)
         for pattern in self.exclude_patterns:
             if pattern in path_str:

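FilteredScanner layers size, extension, and substring-pattern filters on top of FileScanner.scan(). A sketch with hypothetical filter values; note that extensions are compared against path.suffix.lower(), so they should be lowercase and include the dot:

    from pathlib import Path
    from app.discovery import FilteredScanner

    scanner = FilteredScanner(
        min_size=1024,                              # skip files under 1 KiB
        extensions=['.py', '.md'],
        exclude_patterns=['node_modules', '.git'],  # plain substring matches
        skip_hidden=True,                           # forwarded to FileScanner via **kwargs
    )

    for meta in scanner.scan(Path('/data/projects')):
        print(meta.path, meta.size)

One caveat worth noting: files_scanned and bytes_scanned are incremented in the base class before filtering, so they count files seen, not files yielded.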
app/discovery/system.py

@@ -1,167 +1,80 @@
-"""System API for querying mounts and disks"""
 import os
 import subprocess
 from pathlib import Path
 from typing import Optional
 import psutil
 from ._protocols import MountInfo, DiskInfo
 class SystemAPI:
-    """System information API for querying mounts and disks"""
     def query_mounts(self) -> list[MountInfo]:
-        """Query mounted filesystems
-        Returns:
-            List of MountInfo objects for all mounted filesystems
-        """
         mounts = []
         for partition in psutil.disk_partitions(all=False):
-            mount_info = MountInfo(
-                device=partition.device,
-                mount_point=partition.mountpoint,
-                fs_type=partition.fstype,
-                options=partition.opts
-            )
+            mount_info = MountInfo(device=partition.device, mount_point=partition.mountpoint, fs_type=partition.fstype, options=partition.opts)
             mounts.append(mount_info)
         return mounts
     def query_nvmes(self) -> list[DiskInfo]:
-        """Query NVMe/disk information
-        Returns:
-            List of DiskInfo objects for all disks
-        """
         disks = []
-        # Try to get disk information using lsblk
         try:
-            result = subprocess.run(
-                ['lsblk', '-ndo', 'NAME,MODEL,SIZE,SERIAL', '-b'],
-                capture_output=True,
-                text=True,
-                check=False
-            )
+            result = subprocess.run(['lsblk', '-ndo', 'NAME,MODEL,SIZE,SERIAL', '-b'], capture_output=True, text=True, check=False)
             if result.returncode == 0:
                 for line in result.stdout.strip().split('\n'):
                     if not line.strip():
                         continue
                     parts = line.split(maxsplit=3)
                     if len(parts) >= 3:
-                        device = f"/dev/{parts[0]}"
-                        model = parts[1] if len(parts) > 1 else "Unknown"
-                        size_str = parts[2] if len(parts) > 2 else "0"
-                        serial = parts[3] if len(parts) > 3 else "Unknown"
+                        device = f'/dev/{parts[0]}'
+                        model = parts[1] if len(parts) > 1 else 'Unknown'
+                        size_str = parts[2] if len(parts) > 2 else '0'
+                        serial = parts[3] if len(parts) > 3 else 'Unknown'
                         try:
                             size = int(size_str)
                         except ValueError:
                             size = 0
-                        disk_info = DiskInfo(
-                            device=device,
-                            model=model,
-                            size=size,
-                            serial=serial
-                        )
+                        disk_info = DiskInfo(device=device, model=model, size=size, serial=serial)
                         disks.append(disk_info)
         except FileNotFoundError:
-            # lsblk not available, fall back to basic info
             pass
-        # If lsblk failed or unavailable, try alternative method
         if not disks:
             disks = self._query_disks_fallback()
         return disks
     def _query_disks_fallback(self) -> list[DiskInfo]:
-        """Fallback method for querying disk information
-        Returns:
-            List of DiskInfo objects using psutil
-        """
         disks = []
         seen_devices = set()
         for partition in psutil.disk_partitions(all=True):
             device = partition.device
-            # Skip non-disk devices
             if not device.startswith('/dev/'):
                 continue
-            # Get base device (e.g., /dev/sda from /dev/sda1)
             base_device = self._get_base_device(device)
             if base_device in seen_devices:
                 continue
             seen_devices.add(base_device)
             try:
                 usage = psutil.disk_usage(partition.mountpoint)
                 size = usage.total
             except (PermissionError, OSError):
                 size = 0
-            disk_info = DiskInfo(
-                device=base_device,
-                model="Unknown",
-                size=size,
-                serial="Unknown"
-            )
+            disk_info = DiskInfo(device=base_device, model='Unknown', size=size, serial='Unknown')
             disks.append(disk_info)
         return disks
     def _get_base_device(self, device: str) -> str:
-        """Extract base device name from partition device
-        Args:
-            device: Device path (e.g., /dev/sda1, /dev/nvme0n1p1)
-        Returns:
-            Base device path (e.g., /dev/sda, /dev/nvme0n1)
-        """
-        # Handle NVMe devices
         if 'nvme' in device:
-            # /dev/nvme0n1p1 -> /dev/nvme0n1
             if 'p' in device:
                 return device.rsplit('p', 1)[0]
             return device
-        # Handle standard devices (sda, sdb, etc.)
-        # /dev/sda1 -> /dev/sda
         import re
-        match = re.match(r'(/dev/[a-z]+)', device)
+        match = re.match('(/dev/[a-z]+)', device)
         if match:
             return match.group(1)
         return device
     def get_disk_for_path(self, path: Path) -> Optional[str]:
-        """Get the disk/mount point for a given path
-        Args:
-            path: Path to check
-        Returns:
-            Mount point device or None if not found
-        """
         path = path.resolve()
-        # Find the mount point that contains this path
         best_match = None
         best_match_len = 0
         for partition in psutil.disk_partitions():
             mount_point = Path(partition.mountpoint)
             try:
@@ -172,39 +85,19 @@ class SystemAPI:
                     best_match_len = mount_len
             except (ValueError, OSError):
                 continue
         return best_match
     def get_disk_usage(self, path: Path) -> tuple[int, int, int]:
-        """Get disk usage for a path
-        Args:
-            path: Path to check
-        Returns:
-            Tuple of (total, used, free) in bytes
-        """
         try:
             usage = psutil.disk_usage(str(path))
-            return usage.total, usage.used, usage.free
+            return (usage.total, usage.used, usage.free)
         except (PermissionError, OSError):
-            return 0, 0, 0
+            return (0, 0, 0)
     def get_mount_point(self, path: Path) -> Optional[Path]:
-        """Get the mount point for a given path
-        Args:
-            path: Path to check
-        Returns:
-            Mount point path or None if not found
-        """
         path = path.resolve()
-        # Find the mount point that contains this path
         best_match = None
         best_match_len = 0
         for partition in psutil.disk_partitions():
             mount_point = Path(partition.mountpoint)
             try:
@@ -215,19 +108,9 @@ class SystemAPI:
                     best_match_len = mount_len
             except (ValueError, OSError):
                 continue
         return best_match
     def is_same_filesystem(self, path1: Path, path2: Path) -> bool:
-        """Check if two paths are on the same filesystem
-        Args:
-            path1: First path
-            path2: Second path
-        Returns:
-            True if paths are on the same filesystem
-        """
         try:
             stat1 = path1.stat()
             stat2 = path2.stat()

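SystemAPI prefers lsblk for disk inventory and falls back to psutil when lsblk is unavailable; mounts and usage always come from psutil. A minimal sketch (the queried path is arbitrary):

    from pathlib import Path
    from app.discovery import SystemAPI

    api = SystemAPI()

    for mount in api.query_mounts():
        print(mount.device, mount.mount_point, mount.fs_type)

    # get_disk_usage() returns (0, 0, 0) rather than raising when the
    # path is unreadable.
    total, used, free = api.get_disk_usage(Path('/'))
    if total:
        print(f'{used / total:.1%} in use')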
(One file diff suppressed because it is too large.)