remove_doc
This commit is contained in:
3
app/content/__init__.py
Normal file
3
app/content/__init__.py
Normal file
@@ -0,0 +1,3 @@
|
|||||||
|
from .profiler import ContentProfiler
|
||||||
|
from .extractors import ContentExtractor
|
||||||
|
__all__ = ['ContentProfiler', 'ContentExtractor']
|
||||||
@@ -3,22 +3,14 @@ from typing import Dict, Optional
|
|||||||
import json
|
import json
|
||||||
|
|
||||||
class ContentExtractor:
|
class ContentExtractor:
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.extractors = {
|
self.extractors = {'pdf_text': self._extract_pdf, 'ocr+caption': self._extract_image, 'transcribe': self._extract_audio, 'transcribe+scenes': self._extract_video, 'office_text': self._extract_document, 'read': self._extract_text, 'read+syntax': self._extract_code}
|
||||||
'pdf_text': self._extract_pdf,
|
|
||||||
'ocr+caption': self._extract_image,
|
|
||||||
'transcribe': self._extract_audio,
|
|
||||||
'transcribe+scenes': self._extract_video,
|
|
||||||
'office_text': self._extract_document,
|
|
||||||
'read': self._extract_text,
|
|
||||||
'read+syntax': self._extract_code
|
|
||||||
}
|
|
||||||
|
|
||||||
def extract(self, file_path: Path, extractor_type: str) -> Dict:
|
def extract(self, file_path: Path, extractor_type: str) -> Dict:
|
||||||
extractor = self.extractors.get(extractor_type)
|
extractor = self.extractors.get(extractor_type)
|
||||||
if not extractor:
|
if not extractor:
|
||||||
return {'error': f'Unknown extractor: {extractor_type}'}
|
return {'error': f'Unknown extractor: {extractor_type}'}
|
||||||
|
|
||||||
try:
|
try:
|
||||||
return extractor(file_path)
|
return extractor(file_path)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -28,11 +20,7 @@ class ContentExtractor:
|
|||||||
try:
|
try:
|
||||||
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
|
||||||
content = f.read(1024 * 1024)
|
content = f.read(1024 * 1024)
|
||||||
return {
|
return {'text': content, 'char_count': len(content), 'needs_llm': False}
|
||||||
'text': content,
|
|
||||||
'char_count': len(content),
|
|
||||||
'needs_llm': False
|
|
||||||
}
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {'error': str(e)}
|
return {'error': str(e)}
|
||||||
|
|
||||||
@@ -51,54 +39,24 @@ class ContentExtractor:
|
|||||||
pdf = PyPDF2.PdfReader(f)
|
pdf = PyPDF2.PdfReader(f)
|
||||||
for page in pdf.pages[:10]:
|
for page in pdf.pages[:10]:
|
||||||
text_parts.append(page.extract_text())
|
text_parts.append(page.extract_text())
|
||||||
|
|
||||||
text = '\n'.join(text_parts)
|
text = '\n'.join(text_parts)
|
||||||
return {
|
return {'text': text, 'pages_extracted': len(text_parts), 'needs_llm': len(text.strip()) > 100, 'type': 'document'}
|
||||||
'text': text,
|
|
||||||
'pages_extracted': len(text_parts),
|
|
||||||
'needs_llm': len(text.strip()) > 100,
|
|
||||||
'type': 'document'
|
|
||||||
}
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {'error': str(e), 'needs_ocr': True}
|
return {'error': str(e), 'needs_ocr': True}
|
||||||
|
|
||||||
def _extract_image(self, file_path: Path) -> Dict:
|
def _extract_image(self, file_path: Path) -> Dict:
|
||||||
return {
|
return {'type': 'image', 'needs_ocr': True, 'needs_caption': True, 'needs_llm': True, 'pipeline': ['ocr', 'caption', 'embedding'], 'status': 'pending'}
|
||||||
'type': 'image',
|
|
||||||
'needs_ocr': True,
|
|
||||||
'needs_caption': True,
|
|
||||||
'needs_llm': True,
|
|
||||||
'pipeline': ['ocr', 'caption', 'embedding'],
|
|
||||||
'status': 'pending'
|
|
||||||
}
|
|
||||||
|
|
||||||
def _extract_audio(self, file_path: Path) -> Dict:
|
def _extract_audio(self, file_path: Path) -> Dict:
|
||||||
return {
|
return {'type': 'audio', 'needs_transcription': True, 'needs_llm': True, 'pipeline': ['transcribe', 'summarize'], 'status': 'pending'}
|
||||||
'type': 'audio',
|
|
||||||
'needs_transcription': True,
|
|
||||||
'needs_llm': True,
|
|
||||||
'pipeline': ['transcribe', 'summarize'],
|
|
||||||
'status': 'pending'
|
|
||||||
}
|
|
||||||
|
|
||||||
def _extract_video(self, file_path: Path) -> Dict:
|
def _extract_video(self, file_path: Path) -> Dict:
|
||||||
return {
|
return {'type': 'video', 'needs_transcription': True, 'needs_scene_detection': True, 'needs_llm': True, 'pipeline': ['transcribe', 'scenes', 'summarize'], 'status': 'pending'}
|
||||||
'type': 'video',
|
|
||||||
'needs_transcription': True,
|
|
||||||
'needs_scene_detection': True,
|
|
||||||
'needs_llm': True,
|
|
||||||
'pipeline': ['transcribe', 'scenes', 'summarize'],
|
|
||||||
'status': 'pending'
|
|
||||||
}
|
|
||||||
|
|
||||||
def _extract_document(self, file_path: Path) -> Dict:
|
def _extract_document(self, file_path: Path) -> Dict:
|
||||||
try:
|
try:
|
||||||
import textract
|
import textract
|
||||||
text = textract.process(str(file_path)).decode('utf-8')
|
text = textract.process(str(file_path)).decode('utf-8')
|
||||||
return {
|
return {'text': text, 'type': 'document', 'needs_llm': len(text.strip()) > 100}
|
||||||
'text': text,
|
|
||||||
'type': 'document',
|
|
||||||
'needs_llm': len(text.strip()) > 100
|
|
||||||
}
|
|
||||||
except:
|
except:
|
||||||
return {'error': 'textract failed', 'needs_llm': True}
|
return {'error': 'textract failed', 'needs_llm': True}
|
||||||
|
|||||||
@@ -6,21 +6,10 @@ import json
|
|||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
class ContentProfiler:
|
class ContentProfiler:
|
||||||
|
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.mime_detector = magic.Magic(mime=True)
|
self.mime_detector = magic.Magic(mime=True)
|
||||||
|
self.kind_mapping = {'text': ['text/plain', 'text/html', 'text/css', 'text/javascript', 'text/markdown'], 'code': ['application/x-python', 'application/javascript', 'text/x-java', 'text/x-c'], 'pdf': ['application/pdf'], 'image': ['image/jpeg', 'image/png', 'image/gif', 'image/webp', 'image/svg+xml'], 'audio': ['audio/mpeg', 'audio/wav', 'audio/ogg', 'audio/flac'], 'video': ['video/mp4', 'video/x-matroska', 'video/avi', 'video/webm'], 'archive': ['application/zip', 'application/x-tar', 'application/gzip', 'application/x-7z-compressed'], 'document': ['application/msword', 'application/vnd.openxmlformats-officedocument'], 'spreadsheet': ['application/vnd.ms-excel', 'text/csv']}
|
||||||
self.kind_mapping = {
|
|
||||||
'text': ['text/plain', 'text/html', 'text/css', 'text/javascript', 'text/markdown'],
|
|
||||||
'code': ['application/x-python', 'application/javascript', 'text/x-java', 'text/x-c'],
|
|
||||||
'pdf': ['application/pdf'],
|
|
||||||
'image': ['image/jpeg', 'image/png', 'image/gif', 'image/webp', 'image/svg+xml'],
|
|
||||||
'audio': ['audio/mpeg', 'audio/wav', 'audio/ogg', 'audio/flac'],
|
|
||||||
'video': ['video/mp4', 'video/x-matroska', 'video/avi', 'video/webm'],
|
|
||||||
'archive': ['application/zip', 'application/x-tar', 'application/gzip', 'application/x-7z-compressed'],
|
|
||||||
'document': ['application/msword', 'application/vnd.openxmlformats-officedocument'],
|
|
||||||
'spreadsheet': ['application/vnd.ms-excel', 'text/csv']
|
|
||||||
}
|
|
||||||
|
|
||||||
self.text_exts = {'.txt', '.md', '.rst', '.log', '.json', '.xml', '.yaml', '.yml', '.toml', '.ini', '.cfg'}
|
self.text_exts = {'.txt', '.md', '.rst', '.log', '.json', '.xml', '.yaml', '.yml', '.toml', '.ini', '.cfg'}
|
||||||
self.code_exts = {'.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h', '.cs', '.rb', '.php'}
|
self.code_exts = {'.py', '.js', '.ts', '.java', '.go', '.rs', '.c', '.cpp', '.h', '.cs', '.rb', '.php'}
|
||||||
self.processable_kinds = {'text', 'code', 'pdf', 'image', 'audio', 'video', 'document'}
|
self.processable_kinds = {'text', 'code', 'pdf', 'image', 'audio', 'video', 'document'}
|
||||||
@@ -30,29 +19,12 @@ class ContentProfiler:
|
|||||||
stat = file_path.stat()
|
stat = file_path.stat()
|
||||||
size = stat.st_size
|
size = stat.st_size
|
||||||
mtime = datetime.fromtimestamp(stat.st_mtime)
|
mtime = datetime.fromtimestamp(stat.st_mtime)
|
||||||
|
|
||||||
mime_type = self._detect_mime(file_path)
|
mime_type = self._detect_mime(file_path)
|
||||||
kind = self._determine_kind(file_path, mime_type)
|
kind = self._determine_kind(file_path, mime_type)
|
||||||
|
profile = {'path': str(file_path), 'size': size, 'mtime': mtime.isoformat(), 'mime': mime_type, 'kind': kind, 'processable': kind in self.processable_kinds, 'extractor': self._suggest_extractor(kind, mime_type), 'hints': self._extract_hints(file_path, kind, mime_type, size)}
|
||||||
profile = {
|
|
||||||
'path': str(file_path),
|
|
||||||
'size': size,
|
|
||||||
'mtime': mtime.isoformat(),
|
|
||||||
'mime': mime_type,
|
|
||||||
'kind': kind,
|
|
||||||
'processable': kind in self.processable_kinds,
|
|
||||||
'extractor': self._suggest_extractor(kind, mime_type),
|
|
||||||
'hints': self._extract_hints(file_path, kind, mime_type, size)
|
|
||||||
}
|
|
||||||
|
|
||||||
return profile
|
return profile
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
return {
|
return {'path': str(file_path), 'error': str(e), 'processable': False}
|
||||||
'path': str(file_path),
|
|
||||||
'error': str(e),
|
|
||||||
'processable': False
|
|
||||||
}
|
|
||||||
|
|
||||||
def _detect_mime(self, file_path: Path) -> str:
|
def _detect_mime(self, file_path: Path) -> str:
|
||||||
try:
|
try:
|
||||||
@@ -63,61 +35,42 @@ class ContentProfiler:
|
|||||||
|
|
||||||
def _determine_kind(self, file_path: Path, mime_type: str) -> str:
|
def _determine_kind(self, file_path: Path, mime_type: str) -> str:
|
||||||
for kind, mimes in self.kind_mapping.items():
|
for kind, mimes in self.kind_mapping.items():
|
||||||
if any(mime in mime_type for mime in mimes):
|
if any((mime in mime_type for mime in mimes)):
|
||||||
return kind
|
return kind
|
||||||
|
|
||||||
suffix = file_path.suffix.lower()
|
suffix = file_path.suffix.lower()
|
||||||
if suffix in self.text_exts:
|
if suffix in self.text_exts:
|
||||||
return 'text'
|
return 'text'
|
||||||
if suffix in self.code_exts:
|
if suffix in self.code_exts:
|
||||||
return 'code'
|
return 'code'
|
||||||
|
|
||||||
return 'unknown'
|
return 'unknown'
|
||||||
|
|
||||||
def _suggest_extractor(self, kind: str, mime_type: str) -> Optional[str]:
|
def _suggest_extractor(self, kind: str, mime_type: str) -> Optional[str]:
|
||||||
extractors = {
|
extractors = {'pdf': 'pdf_text', 'image': 'ocr+caption', 'audio': 'transcribe', 'video': 'transcribe+scenes', 'document': 'office_text', 'text': 'read', 'code': 'read+syntax'}
|
||||||
'pdf': 'pdf_text',
|
|
||||||
'image': 'ocr+caption',
|
|
||||||
'audio': 'transcribe',
|
|
||||||
'video': 'transcribe+scenes',
|
|
||||||
'document': 'office_text',
|
|
||||||
'text': 'read',
|
|
||||||
'code': 'read+syntax'
|
|
||||||
}
|
|
||||||
return extractors.get(kind)
|
return extractors.get(kind)
|
||||||
|
|
||||||
def _extract_hints(self, file_path: Path, kind: str, mime_type: str, size: int) -> Dict:
|
def _extract_hints(self, file_path: Path, kind: str, mime_type: str, size: int) -> Dict:
|
||||||
hints = {}
|
hints = {}
|
||||||
|
|
||||||
if kind == 'text' or kind == 'code':
|
if kind == 'text' or kind == 'code':
|
||||||
hints['language'] = self._guess_language(file_path)
|
hints['language'] = self._guess_language(file_path)
|
||||||
if size < 1024 * 1024:
|
if size < 1024 * 1024:
|
||||||
hints['lines'] = self._count_lines(file_path)
|
hints['lines'] = self._count_lines(file_path)
|
||||||
|
|
||||||
if kind == 'pdf':
|
if kind == 'pdf':
|
||||||
hints['page_count'] = self._get_pdf_pages(file_path)
|
hints['page_count'] = self._get_pdf_pages(file_path)
|
||||||
|
|
||||||
if kind in ['audio', 'video']:
|
if kind in ['audio', 'video']:
|
||||||
hints['duration'] = self._get_media_duration(file_path)
|
hints['duration'] = self._get_media_duration(file_path)
|
||||||
|
|
||||||
if kind == 'image':
|
if kind == 'image':
|
||||||
hints['has_exif'] = self._has_exif(file_path)
|
hints['has_exif'] = self._has_exif(file_path)
|
||||||
hints['dimensions'] = self._get_image_dimensions(file_path)
|
hints['dimensions'] = self._get_image_dimensions(file_path)
|
||||||
|
|
||||||
return hints
|
return hints
|
||||||
|
|
||||||
def _guess_language(self, file_path: Path) -> Optional[str]:
|
def _guess_language(self, file_path: Path) -> Optional[str]:
|
||||||
lang_map = {
|
lang_map = {'.py': 'python', '.js': 'javascript', '.ts': 'typescript', '.java': 'java', '.go': 'go', '.rs': 'rust', '.c': 'c', '.cpp': 'cpp', '.cs': 'csharp', '.rb': 'ruby', '.php': 'php'}
|
||||||
'.py': 'python', '.js': 'javascript', '.ts': 'typescript',
|
|
||||||
'.java': 'java', '.go': 'go', '.rs': 'rust', '.c': 'c',
|
|
||||||
'.cpp': 'cpp', '.cs': 'csharp', '.rb': 'ruby', '.php': 'php'
|
|
||||||
}
|
|
||||||
return lang_map.get(file_path.suffix.lower())
|
return lang_map.get(file_path.suffix.lower())
|
||||||
|
|
||||||
def _count_lines(self, file_path: Path) -> Optional[int]:
|
def _count_lines(self, file_path: Path) -> Optional[int]:
|
||||||
try:
|
try:
|
||||||
with open(file_path, 'rb') as f:
|
with open(file_path, 'rb') as f:
|
||||||
return sum(1 for _ in f)
|
return sum((1 for _ in f))
|
||||||
except:
|
except:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
|||||||
@@ -1,17 +1,5 @@
|
|||||||
"""Discovery package exports"""
|
|
||||||
from .scanner import FileScanner, FilteredScanner
|
from .scanner import FileScanner, FilteredScanner
|
||||||
from .system import SystemAPI
|
from .system import SystemAPI
|
||||||
from .engine import DiscoveryEngine
|
from .engine import DiscoveryEngine
|
||||||
from ._protocols import FileMeta, MountInfo, DiskInfo, IFileScanner, ISystemAPI
|
from ._protocols import FileMeta, MountInfo, DiskInfo, IFileScanner, ISystemAPI
|
||||||
|
__all__ = ['FileScanner', 'FilteredScanner', 'SystemAPI', 'DiscoveryEngine', 'FileMeta', 'MountInfo', 'DiskInfo', 'IFileScanner', 'ISystemAPI']
|
||||||
__all__ = [
|
|
||||||
'FileScanner',
|
|
||||||
'FilteredScanner',
|
|
||||||
'SystemAPI',
|
|
||||||
'DiscoveryEngine',
|
|
||||||
'FileMeta',
|
|
||||||
'MountInfo',
|
|
||||||
'DiskInfo',
|
|
||||||
'IFileScanner',
|
|
||||||
'ISystemAPI',
|
|
||||||
]
|
|
||||||
|
|||||||
@@ -1,54 +1,37 @@
|
|||||||
"""Protocol definitions for the discovery package"""
|
|
||||||
from typing import Iterator, Protocol, Any
|
from typing import Iterator, Protocol, Any
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from dataclasses import dataclass
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class FileMeta:
|
class FileMeta:
|
||||||
"""Metadata for a discovered file"""
|
|
||||||
path: Path
|
path: Path
|
||||||
size: int
|
size: int
|
||||||
modified_time: float
|
modified_time: float
|
||||||
created_time: float
|
created_time: float
|
||||||
# Add other metadata fields as needed
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class MountInfo:
|
class MountInfo:
|
||||||
"""Information about a mounted filesystem"""
|
|
||||||
device: str
|
device: str
|
||||||
mount_point: str
|
mount_point: str
|
||||||
fs_type: str
|
fs_type: str
|
||||||
options: str
|
options: str
|
||||||
# Add other mount info fields as needed
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
|
@dataclass
|
||||||
class DiskInfo:
|
class DiskInfo:
|
||||||
"""Information about a disk/NVMe device"""
|
|
||||||
device: str
|
device: str
|
||||||
model: str
|
model: str
|
||||||
size: int
|
size: int
|
||||||
serial: str
|
serial: str
|
||||||
# Add other disk info fields as needed
|
|
||||||
|
|
||||||
|
|
||||||
class IFileScanner(Protocol):
|
class IFileScanner(Protocol):
|
||||||
"""Protocol for file scanning operations"""
|
|
||||||
|
|
||||||
def scan(self, root: Path) -> Iterator[FileMeta]:
|
def scan(self, root: Path) -> Iterator[FileMeta]:
|
||||||
"""Scan a directory tree and yield file metadata"""
|
|
||||||
...
|
...
|
||||||
|
|
||||||
|
|
||||||
class ISystemAPI(Protocol):
|
class ISystemAPI(Protocol):
|
||||||
"""Protocol for system information queries"""
|
|
||||||
|
|
||||||
def query_mounts(self) -> list[MountInfo]:
|
def query_mounts(self) -> list[MountInfo]:
|
||||||
"""Query mounted filesystems"""
|
|
||||||
...
|
...
|
||||||
|
|
||||||
def query_nvmes(self) -> list[DiskInfo]:
|
def query_nvmes(self) -> list[DiskInfo]:
|
||||||
"""Query NVMe/disk information"""
|
|
||||||
...
|
...
|
||||||
|
|||||||
@@ -1,10 +1,8 @@
|
|||||||
"""Discovery engine coordinating scanner and system APIs"""
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional, Callable
|
from typing import Optional, Callable
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
import psycopg2
|
import psycopg2
|
||||||
from psycopg2.extras import execute_batch
|
from psycopg2.extras import execute_batch
|
||||||
|
|
||||||
from .scanner import FileScanner
|
from .scanner import FileScanner
|
||||||
from .system import SystemAPI
|
from .system import SystemAPI
|
||||||
from ._protocols import FileMeta
|
from ._protocols import FileMeta
|
||||||
@@ -12,23 +10,9 @@ from ..shared.models import FileRecord, DiskInfo, ProcessingStats
|
|||||||
from ..shared.config import DatabaseConfig
|
from ..shared.config import DatabaseConfig
|
||||||
from ..shared.logger import ProgressLogger
|
from ..shared.logger import ProgressLogger
|
||||||
|
|
||||||
|
|
||||||
class DiscoveryEngine:
|
class DiscoveryEngine:
|
||||||
"""Discovery engine for scanning and cataloging files"""
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(self, db_config: DatabaseConfig, logger: ProgressLogger, batch_size: int=1000):
|
||||||
self,
|
|
||||||
db_config: DatabaseConfig,
|
|
||||||
logger: ProgressLogger,
|
|
||||||
batch_size: int = 1000
|
|
||||||
):
|
|
||||||
"""Initialize discovery engine
|
|
||||||
|
|
||||||
Args:
|
|
||||||
db_config: Database configuration
|
|
||||||
logger: Progress logger
|
|
||||||
batch_size: Number of records to batch before database commit
|
|
||||||
"""
|
|
||||||
self.db_config = db_config
|
self.db_config = db_config
|
||||||
self.logger = logger
|
self.logger = logger
|
||||||
self.batch_size = batch_size
|
self.batch_size = batch_size
|
||||||
@@ -36,286 +20,114 @@ class DiscoveryEngine:
|
|||||||
self._connection = None
|
self._connection = None
|
||||||
|
|
||||||
def _get_connection(self):
|
def _get_connection(self):
|
||||||
"""Get or create database connection"""
|
|
||||||
if self._connection is None or self._connection.closed:
|
if self._connection is None or self._connection.closed:
|
||||||
self._connection = psycopg2.connect(
|
self._connection = psycopg2.connect(host=self.db_config.host, port=self.db_config.port, database=self.db_config.database, user=self.db_config.user, password=self.db_config.password)
|
||||||
host=self.db_config.host,
|
|
||||||
port=self.db_config.port,
|
|
||||||
database=self.db_config.database,
|
|
||||||
user=self.db_config.user,
|
|
||||||
password=self.db_config.password
|
|
||||||
)
|
|
||||||
return self._connection
|
return self._connection
|
||||||
|
|
||||||
def _ensure_tables(self):
|
def _ensure_tables(self):
|
||||||
"""Ensure database tables exist"""
|
|
||||||
conn = self._get_connection()
|
conn = self._get_connection()
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
cursor.execute("\n CREATE TABLE IF NOT EXISTS files (\n id SERIAL PRIMARY KEY,\n path TEXT NOT NULL UNIQUE,\n size BIGINT NOT NULL,\n modified_time DOUBLE PRECISION NOT NULL,\n created_time DOUBLE PRECISION NOT NULL,\n disk_label TEXT NOT NULL,\n checksum TEXT,\n status TEXT DEFAULT 'indexed',\n category TEXT,\n duplicate_of TEXT,\n discovered_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,\n updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP\n )\n ")
|
||||||
# Create files table
|
cursor.execute('\n CREATE INDEX IF NOT EXISTS idx_files_path ON files(path)\n ')
|
||||||
cursor.execute("""
|
cursor.execute('\n CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label)\n ')
|
||||||
CREATE TABLE IF NOT EXISTS files (
|
cursor.execute('\n CREATE INDEX IF NOT EXISTS idx_files_checksum ON files(checksum)\n ')
|
||||||
id SERIAL PRIMARY KEY,
|
|
||||||
path TEXT NOT NULL UNIQUE,
|
|
||||||
size BIGINT NOT NULL,
|
|
||||||
modified_time DOUBLE PRECISION NOT NULL,
|
|
||||||
created_time DOUBLE PRECISION NOT NULL,
|
|
||||||
disk_label TEXT NOT NULL,
|
|
||||||
checksum TEXT,
|
|
||||||
status TEXT DEFAULT 'indexed',
|
|
||||||
category TEXT,
|
|
||||||
duplicate_of TEXT,
|
|
||||||
discovered_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
|
|
||||||
updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
|
|
||||||
)
|
|
||||||
""")
|
|
||||||
|
|
||||||
# Create index on path
|
|
||||||
cursor.execute("""
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_files_path ON files(path)
|
|
||||||
""")
|
|
||||||
|
|
||||||
# Create index on disk
|
|
||||||
cursor.execute("""
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_files_disk ON files(disk_label)
|
|
||||||
""")
|
|
||||||
|
|
||||||
# Create index on checksum
|
|
||||||
cursor.execute("""
|
|
||||||
CREATE INDEX IF NOT EXISTS idx_files_checksum ON files(checksum)
|
|
||||||
""")
|
|
||||||
|
|
||||||
conn.commit()
|
conn.commit()
|
||||||
cursor.close()
|
cursor.close()
|
||||||
|
|
||||||
def discover_path(
|
def discover_path(self, root: Path, scanner: Optional[FileScanner]=None, progress_callback: Optional[Callable[[int, int, ProcessingStats], None]]=None) -> ProcessingStats:
|
||||||
self,
|
self.logger.section(f'Discovering: {root}')
|
||||||
root: Path,
|
|
||||||
scanner: Optional[FileScanner] = None,
|
|
||||||
progress_callback: Optional[Callable[[int, int, ProcessingStats], None]] = None
|
|
||||||
) -> ProcessingStats:
|
|
||||||
"""Discover and catalog files in a path
|
|
||||||
|
|
||||||
Args:
|
|
||||||
root: Root path to discover
|
|
||||||
scanner: Optional custom scanner (default: FileScanner())
|
|
||||||
progress_callback: Optional callback for progress updates
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
ProcessingStats with discovery statistics
|
|
||||||
"""
|
|
||||||
self.logger.section(f"Discovering: {root}")
|
|
||||||
|
|
||||||
# Ensure tables exist
|
|
||||||
self._ensure_tables()
|
self._ensure_tables()
|
||||||
|
|
||||||
# Create scanner if not provided
|
|
||||||
if scanner is None:
|
if scanner is None:
|
||||||
scanner = FileScanner(
|
scanner = FileScanner(error_handler=lambda e, p: self.logger.warning(f'Error scanning {p}: {e}'))
|
||||||
error_handler=lambda e, p: self.logger.warning(f"Error scanning {p}: {e}")
|
|
||||||
)
|
|
||||||
|
|
||||||
# Get disk info for the root path
|
|
||||||
disk = self.system_api.get_disk_for_path(root)
|
disk = self.system_api.get_disk_for_path(root)
|
||||||
if disk is None:
|
if disk is None:
|
||||||
disk = str(root)
|
disk = str(root)
|
||||||
|
|
||||||
# Initialize statistics
|
|
||||||
stats = ProcessingStats()
|
stats = ProcessingStats()
|
||||||
batch = []
|
batch = []
|
||||||
|
|
||||||
conn = self._get_connection()
|
conn = self._get_connection()
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Scan files
|
|
||||||
for file_meta in scanner.scan(root):
|
for file_meta in scanner.scan(root):
|
||||||
# Create file record
|
record = FileRecord(path=file_meta.path, size=file_meta.size, modified_time=file_meta.modified_time, created_time=file_meta.created_time, disk_label=disk)
|
||||||
record = FileRecord(
|
|
||||||
path=file_meta.path,
|
|
||||||
size=file_meta.size,
|
|
||||||
modified_time=file_meta.modified_time,
|
|
||||||
created_time=file_meta.created_time,
|
|
||||||
disk_label=disk
|
|
||||||
)
|
|
||||||
|
|
||||||
batch.append(record)
|
batch.append(record)
|
||||||
stats.files_processed += 1
|
stats.files_processed += 1
|
||||||
stats.bytes_processed += record.size
|
stats.bytes_processed += record.size
|
||||||
|
|
||||||
# Batch insert
|
|
||||||
if len(batch) >= self.batch_size:
|
if len(batch) >= self.batch_size:
|
||||||
self._insert_batch(cursor, batch)
|
self._insert_batch(cursor, batch)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
batch.clear()
|
batch.clear()
|
||||||
|
|
||||||
# Progress callback
|
|
||||||
if progress_callback:
|
if progress_callback:
|
||||||
progress_callback(stats.files_processed, 0, stats)
|
progress_callback(stats.files_processed, 0, stats)
|
||||||
|
|
||||||
# Log progress
|
|
||||||
if stats.files_processed % (self.batch_size * 10) == 0:
|
if stats.files_processed % (self.batch_size * 10) == 0:
|
||||||
self.logger.progress(
|
self.logger.progress(stats.files_processed, stats.files_processed, prefix='Files discovered', bytes_processed=stats.bytes_processed, elapsed_seconds=stats.elapsed_seconds)
|
||||||
stats.files_processed,
|
|
||||||
stats.files_processed, # We don't know total
|
|
||||||
prefix="Files discovered",
|
|
||||||
bytes_processed=stats.bytes_processed,
|
|
||||||
elapsed_seconds=stats.elapsed_seconds
|
|
||||||
)
|
|
||||||
|
|
||||||
# Insert remaining batch
|
|
||||||
if batch:
|
if batch:
|
||||||
self._insert_batch(cursor, batch)
|
self._insert_batch(cursor, batch)
|
||||||
conn.commit()
|
conn.commit()
|
||||||
|
|
||||||
stats.files_succeeded = stats.files_processed
|
stats.files_succeeded = stats.files_processed
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
conn.rollback()
|
conn.rollback()
|
||||||
self.logger.error(f"Discovery failed: {e}")
|
self.logger.error(f'Discovery failed: {e}')
|
||||||
raise
|
raise
|
||||||
|
|
||||||
finally:
|
finally:
|
||||||
cursor.close()
|
cursor.close()
|
||||||
|
self.logger.info(f'Discovery complete: {stats.files_processed} files, {stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s')
|
||||||
self.logger.info(
|
|
||||||
f"Discovery complete: {stats.files_processed} files, "
|
|
||||||
f"{stats.bytes_processed:,} bytes in {stats.elapsed_seconds:.1f}s"
|
|
||||||
)
|
|
||||||
|
|
||||||
return stats
|
return stats
|
||||||
|
|
||||||
def _insert_batch(self, cursor, batch: list[FileRecord]):
|
def _insert_batch(self, cursor, batch: list[FileRecord]):
|
||||||
"""Insert batch of file records
|
query = '\n INSERT INTO files (path, size, modified_time, created_time, disk_label, checksum, status, category, duplicate_of)\n VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)\n ON CONFLICT (path) DO UPDATE SET\n size = EXCLUDED.size,\n modified_time = EXCLUDED.modified_time,\n updated_at = CURRENT_TIMESTAMP\n '
|
||||||
|
data = [(str(record.path), record.size, record.modified_time, record.created_time, record.disk_label, record.checksum, record.status, record.category, record.duplicate_of) for record in batch]
|
||||||
Args:
|
|
||||||
cursor: Database cursor
|
|
||||||
batch: List of FileRecord objects
|
|
||||||
"""
|
|
||||||
query = """
|
|
||||||
INSERT INTO files (path, size, modified_time, created_time, disk_label, checksum, status, category, duplicate_of)
|
|
||||||
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s)
|
|
||||||
ON CONFLICT (path) DO UPDATE SET
|
|
||||||
size = EXCLUDED.size,
|
|
||||||
modified_time = EXCLUDED.modified_time,
|
|
||||||
updated_at = CURRENT_TIMESTAMP
|
|
||||||
"""
|
|
||||||
|
|
||||||
data = [
|
|
||||||
(
|
|
||||||
str(record.path),
|
|
||||||
record.size,
|
|
||||||
record.modified_time,
|
|
||||||
record.created_time,
|
|
||||||
record.disk_label,
|
|
||||||
record.checksum,
|
|
||||||
record.status,
|
|
||||||
record.category,
|
|
||||||
record.duplicate_of
|
|
||||||
)
|
|
||||||
for record in batch
|
|
||||||
]
|
|
||||||
|
|
||||||
execute_batch(cursor, query, data, page_size=self.batch_size)
|
execute_batch(cursor, query, data, page_size=self.batch_size)
|
||||||
|
|
||||||
def get_disk_info(self) -> list[DiskInfo]:
|
def get_disk_info(self) -> list[DiskInfo]:
|
||||||
"""Get information about all disks
|
self.logger.subsection('Querying disk information')
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of DiskInfo objects
|
|
||||||
"""
|
|
||||||
self.logger.subsection("Querying disk information")
|
|
||||||
|
|
||||||
disks = []
|
disks = []
|
||||||
for disk_info in self.system_api.query_nvmes():
|
for disk_info in self.system_api.query_nvmes():
|
||||||
# Get mount point if available
|
|
||||||
mount_point = None
|
mount_point = None
|
||||||
fs_type = "unknown"
|
fs_type = 'unknown'
|
||||||
|
|
||||||
for mount in self.system_api.query_mounts():
|
for mount in self.system_api.query_mounts():
|
||||||
if mount.device == disk_info.device:
|
if mount.device == disk_info.device:
|
||||||
mount_point = Path(mount.mount_point)
|
mount_point = Path(mount.mount_point)
|
||||||
fs_type = mount.fs_type
|
fs_type = mount.fs_type
|
||||||
break
|
break
|
||||||
|
|
||||||
if mount_point:
|
if mount_point:
|
||||||
total, used, free = self.system_api.get_disk_usage(mount_point)
|
total, used, free = self.system_api.get_disk_usage(mount_point)
|
||||||
else:
|
else:
|
||||||
total = disk_info.size
|
total = disk_info.size
|
||||||
used = 0
|
used = 0
|
||||||
free = disk_info.size
|
free = disk_info.size
|
||||||
|
disk = DiskInfo(name=disk_info.device, device=disk_info.device, mount_point=mount_point or Path('/'), total_size=total, used_size=used, free_size=free, fs_type=fs_type)
|
||||||
disk = DiskInfo(
|
|
||||||
name=disk_info.device,
|
|
||||||
device=disk_info.device,
|
|
||||||
mount_point=mount_point or Path("/"),
|
|
||||||
total_size=total,
|
|
||||||
used_size=used,
|
|
||||||
free_size=free,
|
|
||||||
fs_type=fs_type
|
|
||||||
)
|
|
||||||
disks.append(disk)
|
disks.append(disk)
|
||||||
|
self.logger.info(f' {disk.name}: {disk.usage_percent:.1f}% used ({disk.used_size:,} / {disk.total_size:,} bytes)')
|
||||||
self.logger.info(
|
|
||||||
f" {disk.name}: {disk.usage_percent:.1f}% used "
|
|
||||||
f"({disk.used_size:,} / {disk.total_size:,} bytes)"
|
|
||||||
)
|
|
||||||
|
|
||||||
return disks
|
return disks
|
||||||
|
|
||||||
def get_file_count(self, disk: Optional[str]=None) -> int:
|
def get_file_count(self, disk: Optional[str]=None) -> int:
|
||||||
"""Get count of discovered files
|
|
||||||
|
|
||||||
Args:
|
|
||||||
disk: Optional disk filter
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Count of files
|
|
||||||
"""
|
|
||||||
conn = self._get_connection()
|
conn = self._get_connection()
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
if disk:
|
if disk:
|
||||||
cursor.execute("SELECT COUNT(*) FROM files WHERE disk_label = %s", (disk,))
|
cursor.execute('SELECT COUNT(*) FROM files WHERE disk_label = %s', (disk,))
|
||||||
else:
|
else:
|
||||||
cursor.execute("SELECT COUNT(*) FROM files")
|
cursor.execute('SELECT COUNT(*) FROM files')
|
||||||
|
|
||||||
count = cursor.fetchone()[0]
|
count = cursor.fetchone()[0]
|
||||||
cursor.close()
|
cursor.close()
|
||||||
|
|
||||||
return count
|
return count
|
||||||
|
|
||||||
def get_total_size(self, disk: Optional[str]=None) -> int:
|
def get_total_size(self, disk: Optional[str]=None) -> int:
|
||||||
"""Get total size of discovered files
|
|
||||||
|
|
||||||
Args:
|
|
||||||
disk: Optional disk filter
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Total size in bytes
|
|
||||||
"""
|
|
||||||
conn = self._get_connection()
|
conn = self._get_connection()
|
||||||
cursor = conn.cursor()
|
cursor = conn.cursor()
|
||||||
|
|
||||||
if disk:
|
if disk:
|
||||||
cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files WHERE disk_label = %s", (disk,))
|
cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files WHERE disk_label = %s', (disk,))
|
||||||
else:
|
else:
|
||||||
cursor.execute("SELECT COALESCE(SUM(size), 0) FROM files")
|
cursor.execute('SELECT COALESCE(SUM(size), 0) FROM files')
|
||||||
|
|
||||||
total = cursor.fetchone()[0]
|
total = cursor.fetchone()[0]
|
||||||
cursor.close()
|
cursor.close()
|
||||||
|
|
||||||
return total
|
return total
|
||||||
|
|
||||||
def close(self):
    """Close database connection.

    Does nothing when no connection is held or when the held connection
    already reports itself as closed.
    """
    conn = self._connection
    if conn and not conn.closed:
        conn.close()
|
||||||
|
|
||||||
def __enter__(self):
    """Context manager entry: return this object for use in ``with ... as``."""
    return self
|
||||||
|
|
||||||
def __exit__(self, exc_type, exc_val, exc_tb):
    """Context manager exit: close the connection.

    Returns None, so an exception raised inside the ``with`` body is not
    suppressed.
    """
    self.close()
|
||||||
|
|||||||
@@ -1,28 +1,12 @@
|
|||||||
"""File system scanner implementing IFileScanner protocol"""
|
|
||||||
import os
|
import os
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Iterator, Optional, Callable
|
from typing import Iterator, Optional, Callable
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
from ._protocols import FileMeta
|
from ._protocols import FileMeta
|
||||||
|
|
||||||
|
|
||||||
class FileScanner:
|
class FileScanner:
|
||||||
"""File system scanner with filtering and error handling"""
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(self, follow_symlinks: bool=False, skip_hidden: bool=True, error_handler: Optional[Callable[[Exception, Path], None]]=None):
|
||||||
self,
|
|
||||||
follow_symlinks: bool = False,
|
|
||||||
skip_hidden: bool = True,
|
|
||||||
error_handler: Optional[Callable[[Exception, Path], None]] = None
|
|
||||||
):
|
|
||||||
"""Initialize file scanner
|
|
||||||
|
|
||||||
Args:
|
|
||||||
follow_symlinks: Whether to follow symbolic links
|
|
||||||
skip_hidden: Whether to skip hidden files/directories
|
|
||||||
error_handler: Optional callback for handling errors during scan
|
|
||||||
"""
|
|
||||||
self.follow_symlinks = follow_symlinks
|
self.follow_symlinks = follow_symlinks
|
||||||
self.skip_hidden = skip_hidden
|
self.skip_hidden = skip_hidden
|
||||||
self.error_handler = error_handler
|
self.error_handler = error_handler
|
||||||
@@ -31,24 +15,14 @@ class FileScanner:
|
|||||||
self._errors = 0
|
self._errors = 0
|
||||||
|
|
||||||
def scan(self, root: Path) -> Iterator[FileMeta]:
|
def scan(self, root: Path) -> Iterator[FileMeta]:
|
||||||
"""Scan a directory tree and yield file metadata
|
|
||||||
|
|
||||||
Args:
|
|
||||||
root: Root directory to scan
|
|
||||||
|
|
||||||
Yields:
|
|
||||||
FileMeta objects for each discovered file
|
|
||||||
"""
|
|
||||||
if not root.exists():
|
if not root.exists():
|
||||||
error = FileNotFoundError(f"Path does not exist: {root}")
|
error = FileNotFoundError(f'Path does not exist: {root}')
|
||||||
if self.error_handler:
|
if self.error_handler:
|
||||||
self.error_handler(error, root)
|
self.error_handler(error, root)
|
||||||
else:
|
else:
|
||||||
raise error
|
raise error
|
||||||
return
|
return
|
||||||
|
|
||||||
if not root.is_dir():
|
if not root.is_dir():
|
||||||
# If root is a file, just return its metadata
|
|
||||||
try:
|
try:
|
||||||
yield self._get_file_meta(root)
|
yield self._get_file_meta(root)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -58,115 +32,59 @@ class FileScanner:
|
|||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
return
|
return
|
||||||
|
|
||||||
# Walk directory tree
|
|
||||||
for dirpath, dirnames, filenames in os.walk(root, followlinks=self.follow_symlinks):
|
for dirpath, dirnames, filenames in os.walk(root, followlinks=self.follow_symlinks):
|
||||||
current_dir = Path(dirpath)
|
current_dir = Path(dirpath)
|
||||||
|
|
||||||
# Filter directories if needed
|
|
||||||
if self.skip_hidden:
|
if self.skip_hidden:
|
||||||
dirnames[:] = [d for d in dirnames if not d.startswith('.')]
|
dirnames[:] = [d for d in dirnames if not d.startswith('.')]
|
||||||
|
|
||||||
# Process files
|
|
||||||
for filename in filenames:
|
for filename in filenames:
|
||||||
if self.skip_hidden and filename.startswith('.'):
|
if self.skip_hidden and filename.startswith('.'):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
file_path = current_dir / filename
|
file_path = current_dir / filename
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Skip broken symlinks
|
if file_path.is_symlink() and (not file_path.exists()):
|
||||||
if file_path.is_symlink() and not file_path.exists():
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
meta = self._get_file_meta(file_path)
|
meta = self._get_file_meta(file_path)
|
||||||
self._files_scanned += 1
|
self._files_scanned += 1
|
||||||
self._bytes_scanned += meta.size
|
self._bytes_scanned += meta.size
|
||||||
|
|
||||||
yield meta
|
yield meta
|
||||||
|
|
||||||
except PermissionError as e:
|
except PermissionError as e:
|
||||||
self._errors += 1
|
self._errors += 1
|
||||||
if self.error_handler:
|
if self.error_handler:
|
||||||
self.error_handler(e, file_path)
|
self.error_handler(e, file_path)
|
||||||
# Continue scanning
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
self._errors += 1
|
self._errors += 1
|
||||||
if self.error_handler:
|
if self.error_handler:
|
||||||
self.error_handler(e, file_path)
|
self.error_handler(e, file_path)
|
||||||
# Continue scanning
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
def _get_file_meta(self, path: Path) -> FileMeta:
    """Get file metadata.

    Args:
        path: Path to file

    Returns:
        FileMeta object with the path, size and modification/creation
        timestamps taken from ``path.stat()``

    Raises:
        OSError: If file cannot be accessed
    """
    stat = path.stat()

    # Creation time is platform dependent: prefer st_birthtime where the
    # stat result exposes it, otherwise fall back to st_ctime.
    created_time = getattr(stat, 'st_birthtime', stat.st_ctime)

    return FileMeta(
        path=path,
        size=stat.st_size,
        modified_time=stat.st_mtime,
        created_time=created_time,
    )
|
|
||||||
|
|
||||||
@property
def files_scanned(self) -> int:
    """Get count of files scanned"""
    return self._files_scanned
|
||||||
|
|
||||||
@property
def bytes_scanned(self) -> int:
    """Get total bytes scanned"""
    return self._bytes_scanned
|
||||||
|
|
||||||
@property
def errors(self) -> int:
    """Get count of errors encountered"""
    return self._errors
|
||||||
|
|
||||||
def reset_stats(self) -> None:
    """Reset scanning statistics (file, byte and error counters) to zero."""
    self._files_scanned = 0
    self._bytes_scanned = 0
    self._errors = 0
|
||||||
|
|
||||||
|
|
||||||
class FilteredScanner(FileScanner):
|
class FilteredScanner(FileScanner):
|
||||||
"""Scanner with additional filtering capabilities"""
|
|
||||||
|
|
||||||
def __init__(
|
def __init__(self, min_size: Optional[int]=None, max_size: Optional[int]=None, extensions: Optional[list[str]]=None, exclude_patterns: Optional[list[str]]=None, **kwargs):
|
||||||
self,
|
|
||||||
min_size: Optional[int] = None,
|
|
||||||
max_size: Optional[int] = None,
|
|
||||||
extensions: Optional[list[str]] = None,
|
|
||||||
exclude_patterns: Optional[list[str]] = None,
|
|
||||||
**kwargs
|
|
||||||
):
|
|
||||||
"""Initialize filtered scanner
|
|
||||||
|
|
||||||
Args:
|
|
||||||
min_size: Minimum file size in bytes
|
|
||||||
max_size: Maximum file size in bytes
|
|
||||||
extensions: List of file extensions to include (e.g., ['.txt', '.py'])
|
|
||||||
exclude_patterns: List of path patterns to exclude
|
|
||||||
**kwargs: Additional arguments passed to FileScanner
|
|
||||||
"""
|
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
self.min_size = min_size
|
self.min_size = min_size
|
||||||
self.max_size = max_size
|
self.max_size = max_size
|
||||||
@@ -174,41 +92,19 @@ class FilteredScanner(FileScanner):
|
|||||||
self.exclude_patterns = exclude_patterns or []
|
self.exclude_patterns = exclude_patterns or []
|
||||||
|
|
||||||
def scan(self, root: Path) -> Iterator[FileMeta]:
    """Scan with additional filtering.

    Wraps the parent scanner's ``scan`` and drops entries that fail the
    size, extension or exclude-pattern filters.

    Args:
        root: Root directory to scan

    Yields:
        FileMeta objects for files matching filter criteria
    """
    for meta in super().scan(root):
        # Size filtering (bounds are inclusive; None disables a bound)
        if self.min_size is not None and meta.size < self.min_size:
            continue
        if self.max_size is not None and meta.size > self.max_size:
            continue

        # Extension filtering: the file suffix is lower-cased before lookup
        if self.extensions is not None and meta.path.suffix.lower() not in self.extensions:
            continue

        # Exclude pattern filtering
        if self._should_exclude(meta.path):
            continue

        yield meta
|
||||||
|
|
||||||
def _should_exclude(self, path: Path) -> bool:
|
def _should_exclude(self, path: Path) -> bool:
|
||||||
"""Check if path matches any exclude pattern
|
|
||||||
|
|
||||||
Args:
|
|
||||||
path: Path to check
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if path should be excluded
|
|
||||||
"""
|
|
||||||
path_str = str(path)
|
path_str = str(path)
|
||||||
for pattern in self.exclude_patterns:
|
for pattern in self.exclude_patterns:
|
||||||
if pattern in path_str:
|
if pattern in path_str:
|
||||||
|
|||||||
@@ -1,167 +1,80 @@
|
|||||||
"""System API for querying mounts and disks"""
|
|
||||||
import os
|
import os
|
||||||
import subprocess
|
import subprocess
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
import psutil
|
import psutil
|
||||||
|
|
||||||
from ._protocols import MountInfo, DiskInfo
|
from ._protocols import MountInfo, DiskInfo
|
||||||
|
|
||||||
|
|
||||||
class SystemAPI:
|
class SystemAPI:
|
||||||
"""System information API for querying mounts and disks"""
|
|
||||||
|
|
||||||
def query_mounts(self) -> list[MountInfo]:
    """Query mounted filesystems.

    Returns:
        List of MountInfo objects, one per partition reported by
        ``psutil.disk_partitions(all=False)``
    """
    return [
        MountInfo(
            device=partition.device,
            mount_point=partition.mountpoint,
            fs_type=partition.fstype,
            options=partition.opts,
        )
        for partition in psutil.disk_partitions(all=False)
    ]
|
||||||
|
|
||||||
def query_nvmes(self) -> list[DiskInfo]:
    """Query NVMe/disk information.

    Runs ``lsblk`` and builds one DiskInfo per reported device. When
    ``lsblk`` cannot be run, fails, or yields no devices, falls back to
    ``_query_disks_fallback``.

    Returns:
        List of DiskInfo objects for all disks
    """
    import shlex  # local import: only needed to parse lsblk's pairs output

    disks = []

    try:
        # -P prints KEY="value" pairs. Plain column output cannot be split
        # on whitespace reliably: models contain spaces and empty columns
        # shift the remaining fields.
        result = subprocess.run(
            ['lsblk', '-ndo', 'NAME,MODEL,SIZE,SERIAL', '-b', '-P'],
            capture_output=True,
            text=True,
            check=False,
        )
    except OSError:
        # lsblk not available or not executable; use the fallback below
        result = None

    if result is not None and result.returncode == 0:
        for line in result.stdout.splitlines():
            if not line.strip():
                continue

            try:
                tokens = shlex.split(line)
            except ValueError:
                # Unbalanced quoting; skip the malformed line
                continue

            fields = dict(tok.split('=', 1) for tok in tokens if '=' in tok)
            name = fields.get('NAME', '').strip()
            if not name:
                continue

            try:
                size = int(fields.get('SIZE', '').strip())
            except ValueError:
                size = 0

            disks.append(
                DiskInfo(
                    device=f'/dev/{name}',
                    model=fields.get('MODEL', '').strip() or 'Unknown',
                    size=size,
                    serial=fields.get('SERIAL', '').strip() or 'Unknown',
                )
            )

    # If lsblk failed or is unavailable, try the alternative method
    if not disks:
        disks = self._query_disks_fallback()

    return disks
|
||||||
|
|
||||||
def _query_disks_fallback(self) -> list[DiskInfo]:
    """Fallback method for querying disk information.

    Walks ``psutil.disk_partitions(all=True)``, keeps partitions whose
    device starts with ``/dev/`` and emits one DiskInfo per distinct base
    device. Model and serial are reported as 'Unknown'.

    Returns:
        List of DiskInfo objects using psutil
    """
    disks = []
    seen_devices = set()

    for partition in psutil.disk_partitions(all=True):
        device = partition.device

        # Skip non-disk devices
        if not device.startswith('/dev/'):
            continue

        # Get base device (e.g., /dev/sda from /dev/sda1)
        base_device = self._get_base_device(device)
        if base_device in seen_devices:
            continue
        seen_devices.add(base_device)

        # NOTE: the size comes from the first partition seen for this base
        # device (its mount point usage total), not from the whole disk.
        try:
            size = psutil.disk_usage(partition.mountpoint).total
        except OSError:  # includes PermissionError
            size = 0

        disks.append(
            DiskInfo(
                device=base_device,
                model='Unknown',
                size=size,
                serial='Unknown',
            )
        )

    return disks
|
||||||
|
|
||||||
def _get_base_device(self, device: str) -> str:
    """Extract base device name from partition device.

    Args:
        device: Device path (e.g., /dev/sda1, /dev/nvme0n1p1)

    Returns:
        Base device path (e.g., /dev/sda, /dev/nvme0n1). Paths that match
        neither naming scheme below are returned unchanged.
    """
    import re

    # Devices whose own name ends in a digit mark partitions with a
    # "p<N>" suffix: /dev/nvme0n1p1 -> /dev/nvme0n1, /dev/mmcblk0p2 -> /dev/mmcblk0
    numbered = re.fullmatch(
        r'(/dev/(?:nvme\d+n\d+|mmcblk\d+|loop\d+|md\d+|nbd\d+))(?:p\d+)?',
        device,
    )
    if numbered:
        return numbered.group(1)

    # Letter-named disks append the partition number directly:
    # /dev/sda1 -> /dev/sda, /dev/xvdb2 -> /dev/xvdb
    lettered = re.fullmatch(r'(/dev/(?:sd|hd|vd|xvd)[a-z]+)\d*', device)
    if lettered:
        return lettered.group(1)

    # Anything else (e.g. /dev/mapper/..., /dev/sr0) is left untouched
    # rather than truncated to a path that does not name a device.
    return device
|
||||||
|
|
||||||
def get_disk_for_path(self, path: Path) -> Optional[str]:
|
def get_disk_for_path(self, path: Path) -> Optional[str]:
|
||||||
"""Get the disk/mount point for a given path
|
|
||||||
|
|
||||||
Args:
|
|
||||||
path: Path to check
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Mount point device or None if not found
|
|
||||||
"""
|
|
||||||
path = path.resolve()
|
path = path.resolve()
|
||||||
|
|
||||||
# Find the mount point that contains this path
|
|
||||||
best_match = None
|
best_match = None
|
||||||
best_match_len = 0
|
best_match_len = 0
|
||||||
|
|
||||||
for partition in psutil.disk_partitions():
|
for partition in psutil.disk_partitions():
|
||||||
mount_point = Path(partition.mountpoint)
|
mount_point = Path(partition.mountpoint)
|
||||||
try:
|
try:
|
||||||
@@ -172,39 +85,19 @@ class SystemAPI:
|
|||||||
best_match_len = mount_len
|
best_match_len = mount_len
|
||||||
except (ValueError, OSError):
|
except (ValueError, OSError):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
return best_match
|
return best_match
|
||||||
|
|
||||||
def get_disk_usage(self, path: Path) -> tuple[int, int, int]:
    """Get disk usage for a path.

    Args:
        path: Path to check

    Returns:
        Tuple of (total, used, free) in bytes; (0, 0, 0) when the usage
        query raises an OSError (including PermissionError)
    """
    try:
        usage = psutil.disk_usage(str(path))
    except OSError:  # PermissionError is a subclass of OSError
        return (0, 0, 0)
    return (usage.total, usage.used, usage.free)
|
||||||
|
|
||||||
def get_mount_point(self, path: Path) -> Optional[Path]:
|
def get_mount_point(self, path: Path) -> Optional[Path]:
|
||||||
"""Get the mount point for a given path
|
|
||||||
|
|
||||||
Args:
|
|
||||||
path: Path to check
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Mount point path or None if not found
|
|
||||||
"""
|
|
||||||
path = path.resolve()
|
path = path.resolve()
|
||||||
|
|
||||||
# Find the mount point that contains this path
|
|
||||||
best_match = None
|
best_match = None
|
||||||
best_match_len = 0
|
best_match_len = 0
|
||||||
|
|
||||||
for partition in psutil.disk_partitions():
|
for partition in psutil.disk_partitions():
|
||||||
mount_point = Path(partition.mountpoint)
|
mount_point = Path(partition.mountpoint)
|
||||||
try:
|
try:
|
||||||
@@ -215,19 +108,9 @@ class SystemAPI:
|
|||||||
best_match_len = mount_len
|
best_match_len = mount_len
|
||||||
except (ValueError, OSError):
|
except (ValueError, OSError):
|
||||||
continue
|
continue
|
||||||
|
|
||||||
return best_match
|
return best_match
|
||||||
|
|
||||||
def is_same_filesystem(self, path1: Path, path2: Path) -> bool:
|
def is_same_filesystem(self, path1: Path, path2: Path) -> bool:
|
||||||
"""Check if two paths are on the same filesystem
|
|
||||||
|
|
||||||
Args:
|
|
||||||
path1: First path
|
|
||||||
path2: Second path
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if paths are on the same filesystem
|
|
||||||
"""
|
|
||||||
try:
|
try:
|
||||||
stat1 = path1.stat()
|
stat1 = path1.stat()
|
||||||
stat2 = path2.stat()
|
stat2 = path2.stat()
|
||||||
|
|||||||
648
app/main.py
648
app/main.py
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user