import logging
from contextlib import closing
from typing import Any, Dict, List, Optional, Sequence, Tuple

try:
    import psycopg2
except ImportError:
    # Without the driver the pure-Python reporting helpers remain usable;
    # get_connection() raises a clear error instead.
    psycopg2 = None

logger = logging.getLogger(__name__)


class FileTypeInventory:
    """Summarise the ``files`` table by file extension and parser coverage.

    ``parseable_extensions`` maps a parser type to the set of lower-case
    extensions it covers, and ``implemented_parsers`` records which parser
    types are flagged as implemented. Both are plain instance attributes.
    """

    def __init__(self, db_config: Dict):
        """Store the connection settings and build the extension tables.

        Args:
            db_config: Keyword arguments forwarded to ``psycopg2.connect``.
        """
        self.db_config = db_config
        self.parseable_extensions = {
            'text': {'txt', 'md', 'log', 'json', 'yaml', 'yml', 'xml', 'csv',
                     'tsv', 'ini', 'cfg', 'conf'},
            'code': {'py', 'js', 'java', 'go', 'rs', 'ts', 'tsx', 'jsx',
                     'cpp', 'h', 'c', 'cs', 'rb', 'php', 'sh', 'bat', 'ps1',
                     'sql', 'r', 'scala', 'kt'},
            'pdf': {'pdf'},
            'document': {'doc', 'docx', 'odt', 'rtf', 'pages'},
            'spreadsheet': {'xls', 'xlsx', 'ods', 'numbers'},
            'presentation': {'ppt', 'pptx', 'odp', 'key'},
            'image': {'jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff', 'webp',
                      'svg', 'ico'},
            'audio': {'mp3', 'wav', 'flac', 'm4a', 'ogg', 'wma', 'aac',
                      'opus'},
            'video': {'mp4', 'avi', 'mkv', 'mov', 'wmv', 'flv', 'webm',
                      'mpg', 'mpeg'},
            'archive': {'zip', 'tar', 'gz', 'bz2', '7z', 'rar', 'xz'},
            'executable': {'exe', 'dll', 'so', 'dylib', 'bin', 'app'},
            'data': {'db', 'sqlite', 'mdb', 'accdb', 'pkl', 'parquet',
                     'feather', 'arrow'},
        }
        self.implemented_parsers = {
            'text': True,
            'code': True,
            'pdf': True,
            'document': False,
            'spreadsheet': False,
            'presentation': False,
            'image': False,
            'audio': False,
            'video': False,
            'archive': False,
            'executable': False,
            'data': False,
        }

    def get_connection(self):
        """Open a new database connection from ``self.db_config``.

        Raises:
            ImportError: If psycopg2 could not be imported.
        """
        if psycopg2 is None:
            raise ImportError('psycopg2 is required to connect to the database')
        return psycopg2.connect(**self.db_config)

    def _build_query(self, disk: Optional[str],
                     limit: Optional[int]) -> Tuple[str, List[Any]]:
        """Return the per-extension aggregate query and its bound parameters.

        Raises:
            ValueError: If ``limit`` is truthy but not convertible to ``int``.
        """
        query = '''
            SELECT
                CASE
                    WHEN path ~ '\\.([a-zA-Z0-9]+)$'
                    THEN LOWER(SUBSTRING(path FROM '\\.([a-zA-Z0-9]+)$'))
                    ELSE 'no_extension'
                END as extension,
                COUNT(*) as count,
                SUM(size)::bigint as total_size,
                ROUND(AVG(size)::numeric, 0) as avg_size,
                MAX(size) as max_size,
                COUNT(CASE WHEN extracted_text IS NOT NULL THEN 1 END) as parsed_count
            FROM files
        '''
        params: List[Any] = []
        if disk:
            query += ' WHERE disk_label = %s'
            params.append(disk)
        query += ' GROUP BY extension ORDER BY count DESC'
        if limit:
            # Bind the limit instead of formatting it into the SQL text, and
            # coerce it so a non-numeric value fails here rather than in SQL.
            query += ' LIMIT %s'
            params.append(int(limit))
        return query, params

    def analyze(self, disk: Optional[str] = None,
                limit: int = 100) -> Dict[str, Any]:
        """Query per-extension statistics and return the formatted report.

        Args:
            disk: When truthy, only rows whose ``disk_label`` equals this
                value are counted.
            limit: Maximum number of extension rows, ordered by file count
                descending. A falsy value (``0`` or ``None``) disables it.

        Returns:
            The dictionary built by ``_format_results``.

        Raises:
            ValueError: If ``limit`` is truthy but not convertible to ``int``.
        """
        query, params = self._build_query(disk, limit)
        # closing() releases the cursor and then the connection on every
        # exit path, including a failure while creating the cursor.
        with closing(self.get_connection()) as conn, \
                closing(conn.cursor()) as cursor:
            cursor.execute(query, params)
            results = cursor.fetchall()
        return self._format_results(results)

    def _format_results(self, results: Sequence[Tuple]) -> Dict[str, Any]:
        """Fold query rows into per-extension details and overall totals.

        Args:
            results: Rows of ``(extension, count, total_size, avg_size,
                max_size, parsed_count)``. ``None`` in the size and parsed
                columns is treated as 0.

        Returns:
            A dict with keys ``extensions`` (one entry per row),
            ``summary``, ``unparsed_by_type`` and ``parser_status``.
        """
        total_files = 0
        total_size = 0
        parseable_files = 0
        parsed_files = 0
        unparsed_by_type: Dict[str, Dict[str, Any]] = {}
        extension_details: List[Dict[str, Any]] = []

        for ext, count, size, avg, max_sz, parsed in results:
            file_count = int(count)
            size_sum = int(size or 0)
            parsed_count = int(parsed or 0)

            total_files += file_count
            total_size += size_sum
            parsed_files += parsed_count

            parser_type = self._get_parser_type(ext)
            is_parseable = (parser_type != 'none'
                            and self.implemented_parsers.get(parser_type, False))

            if is_parseable:
                parseable_files += file_count
                unparsed_count = file_count - parsed_count
                if unparsed_count > 0:
                    bucket = unparsed_by_type.setdefault(
                        parser_type, {'count': 0, 'extensions': set()})
                    bucket['count'] += unparsed_count
                    bucket['extensions'].add(ext)

            extension_details.append({
                'extension': ext,
                'count': file_count,
                'total_size': size_sum,
                'avg_size': int(avg or 0),
                'max_size': int(max_sz or 0),
                'parsed': parsed_count,
                'parser_type': parser_type,
                'is_parseable': is_parseable,
            })

        # NOTE(review): parsed_files counts every row while parseable_files
        # only counts implemented parser types, so this can exceed 100 when
        # other rows have extracted text - confirm that is intended.
        coverage = (parsed_files / parseable_files * 100) if parseable_files > 0 else 0

        return {
            'extensions': extension_details,
            'summary': {
                'total_files': total_files,
                'total_size': total_size,
                'parseable_files': parseable_files,
                'parsed_files': parsed_files,
                'coverage': coverage,
            },
            'unparsed_by_type': unparsed_by_type,
            'parser_status': self._get_parser_status(),
        }

    def _get_parser_type(self, ext: str) -> str:
        """Return the parser type whose extension set contains ``ext``.

        Returns ``'none'`` when no parser type lists the extension.
        """
        for ptype, extensions in self.parseable_extensions.items():
            if ext in extensions:
                return ptype
        return 'none'

    def _get_parser_status(self) -> Dict[str, Dict[str, Any]]:
        """Return the implemented flag and extension list per parser type."""
        return {
            ptype: {
                'implemented': self.implemented_parsers.get(ptype, False),
                # Sorted so the report is stable across runs; iterating a set
                # of strings has no guaranteed order.
                'extensions': sorted(exts),
            }
            for ptype, exts in self.parseable_extensions.items()
        }

    def format_size(self, size_bytes: int) -> str:
        """Format a byte count with one decimal and a 1024-based unit.

        Units run from ``B`` to ``TB``; anything larger is reported in ``PB``.
        """
        value = size_bytes
        for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
            if value < 1024:
                return f'{value:.1f}{unit}'
            value /= 1024
        return f'{value:.1f}PB'