defrag/app/analysis/inventory.py

import logging
from typing import Dict, Optional

import psycopg2

logger = logging.getLogger(__name__)


class FileTypeInventory:
    def __init__(self, db_config: Dict):
        self.db_config = db_config
        # Extensions grouped by the parser category that handles them.
        self.parseable_extensions = {
            'text': {'txt', 'md', 'log', 'json', 'yaml', 'yml', 'xml', 'csv', 'tsv', 'ini', 'cfg', 'conf'},
            'code': {'py', 'js', 'java', 'go', 'rs', 'ts', 'tsx', 'jsx', 'cpp', 'h', 'c', 'cs', 'rb', 'php', 'sh', 'bat', 'ps1', 'sql', 'r', 'scala', 'kt'},
            'pdf': {'pdf'},
            'document': {'doc', 'docx', 'odt', 'rtf', 'pages'},
            'spreadsheet': {'xls', 'xlsx', 'ods', 'numbers'},
            'presentation': {'ppt', 'pptx', 'odp', 'key'},
            'image': {'jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff', 'webp', 'svg', 'ico'},
            'audio': {'mp3', 'wav', 'flac', 'm4a', 'ogg', 'wma', 'aac', 'opus'},
            'video': {'mp4', 'avi', 'mkv', 'mov', 'wmv', 'flv', 'webm', 'mpg', 'mpeg'},
            'archive': {'zip', 'tar', 'gz', 'bz2', '7z', 'rar', 'xz'},
            'executable': {'exe', 'dll', 'so', 'dylib', 'bin', 'app'},
            'data': {'db', 'sqlite', 'mdb', 'accdb', 'pkl', 'parquet', 'feather', 'arrow'}
        }
        # Parser categories with a working parser; False means not yet implemented.
        self.implemented_parsers = {
            'text': True,
            'code': True,
            'pdf': True,
            'document': False,
            'spreadsheet': False,
            'presentation': False,
            'image': False,
            'audio': False,
            'video': False,
            'archive': False,
            'executable': False,
            'data': False
        }

    def get_connection(self):
        """Open a new PostgreSQL connection from the configured parameters."""
        return psycopg2.connect(**self.db_config)
    def analyze(self, disk: Optional[str] = None, limit: int = 100):
        """Aggregate file counts, sizes and parse coverage per extension."""
        conn = self.get_connection()
        cursor = conn.cursor()
        try:
            # Extract the lowercased extension from the path; paths without one
            # are grouped under 'no_extension'.
            query = '''
                SELECT
                    CASE
                        WHEN path ~ '\\.([a-zA-Z0-9]+)$' THEN
                            LOWER(SUBSTRING(path FROM '\\.([a-zA-Z0-9]+)$'))
                        ELSE 'no_extension'
                    END as extension,
                    COUNT(*) as count,
                    SUM(size)::bigint as total_size,
                    ROUND(AVG(size)::numeric, 0) as avg_size,
                    MAX(size) as max_size,
                    COUNT(CASE WHEN extracted_text IS NOT NULL THEN 1 END) as parsed_count
                FROM files
            '''
            params = []
            if disk:
                query += ' WHERE disk_label = %s'
                params.append(disk)
            query += ' GROUP BY extension ORDER BY count DESC'
            if limit:
                # limit is an int parameter, so the interpolated value is not user-controlled SQL.
                query += f' LIMIT {int(limit)}'
            cursor.execute(query, params)
            results = cursor.fetchall()
            return self._format_results(results)
        finally:
            cursor.close()
            conn.close()
    def _format_results(self, results):
        """Build per-extension details plus an overall parse-coverage summary."""
        total_files = 0
        total_size = 0
        parseable_files = 0
        parsed_files = 0
        unparsed_by_type = {}
        extension_details = []

        for row in results:
            ext, count, size, avg, max_sz, parsed = row
            total_files += int(count)
            total_size += int(size or 0)
            parsed_files += int(parsed or 0)

            parser_type = self._get_parser_type(ext)
            is_parseable = parser_type != 'none' and self.implemented_parsers.get(parser_type, False)
            if is_parseable:
                parseable_files += int(count)
                # Track how many parseable files still lack extracted text.
                unparsed_count = int(count) - int(parsed or 0)
                if unparsed_count > 0:
                    if parser_type not in unparsed_by_type:
                        unparsed_by_type[parser_type] = {'count': 0, 'extensions': set()}
                    unparsed_by_type[parser_type]['count'] += unparsed_count
                    unparsed_by_type[parser_type]['extensions'].add(ext)

            extension_details.append({
                'extension': ext,
                'count': int(count),
                'total_size': int(size or 0),
                'avg_size': int(avg or 0),
                'max_size': int(max_sz or 0),
                'parsed': int(parsed or 0),
                'parser_type': parser_type,
                'is_parseable': is_parseable
            })

        return {
            'extensions': extension_details,
            'summary': {
                'total_files': total_files,
                'total_size': total_size,
                'parseable_files': parseable_files,
                'parsed_files': parsed_files,
                'coverage': (parsed_files / parseable_files * 100) if parseable_files > 0 else 0
            },
            'unparsed_by_type': unparsed_by_type,
            'parser_status': self._get_parser_status()
        }
    def _get_parser_type(self, ext: str) -> str:
        for ptype, extensions in self.parseable_extensions.items():
            if ext in extensions:
                return ptype
        return 'none'

    def _get_parser_status(self):
        return {
            ptype: {
                'implemented': self.implemented_parsers.get(ptype, False),
                'extensions': list(exts)
            }
            for ptype, exts in self.parseable_extensions.items()
        }
    def format_size(self, size_bytes: int) -> str:
        # Human-readable size using binary (1024-based) units.
        for unit in ['B', 'KB', 'MB', 'GB', 'TB']:
            if size_bytes < 1024:
                return f'{size_bytes:.1f}{unit}'
            size_bytes /= 1024
        return f'{size_bytes:.1f}PB'
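

# Usage sketch: a minimal, illustrative way to drive FileTypeInventory.
# The connection parameters below are placeholder assumptions, not values
# taken from this repository's configuration.
if __name__ == '__main__':
    example_config = {
        'host': 'localhost',   # hypothetical settings for illustration
        'dbname': 'defrag',
        'user': 'defrag',
        'password': 'changeme',
    }
    inventory = FileTypeInventory(example_config)
    report = inventory.analyze(limit=20)
    summary = report['summary']
    print(
        f"files: {summary['total_files']}, "
        f"size: {inventory.format_size(summary['total_size'])}, "
        f"coverage: {summary['coverage']:.1f}%"
    )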