151 lines
5.5 KiB
Python
151 lines
5.5 KiB
Python
import psycopg2
|
|
from typing import Dict, Optional
|
|
import logging
|
|
|
|
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
|
|
|
|
class FileTypeInventory:
    """Summarise the ``files`` table by file extension and parser coverage.

    Extensions are grouped into parser categories (``parseable_extensions``);
    ``implemented_parsers`` records which categories currently have a parser.
    An extension counts as "parseable" only when its category is implemented.
    """

    def __init__(self, db_config: Dict):
        """Store connection settings and build the extension/parser tables.

        Args:
            db_config: Keyword arguments forwarded verbatim to
                ``psycopg2.connect`` by :meth:`get_connection`.
        """
        self.db_config = db_config

        # Parser category -> lower-case extensions (without the leading dot).
        self.parseable_extensions = {
            'text': {'txt', 'md', 'log', 'json', 'yaml', 'yml', 'xml', 'csv', 'tsv', 'ini', 'cfg', 'conf'},
            'code': {'py', 'js', 'java', 'go', 'rs', 'ts', 'tsx', 'jsx', 'cpp', 'h', 'c', 'cs', 'rb', 'php', 'sh', 'bat', 'ps1', 'sql', 'r', 'scala', 'kt'},
            'pdf': {'pdf'},
            'document': {'doc', 'docx', 'odt', 'rtf', 'pages'},
            'spreadsheet': {'xls', 'xlsx', 'ods', 'numbers'},
            'presentation': {'ppt', 'pptx', 'odp', 'key'},
            'image': {'jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff', 'webp', 'svg', 'ico'},
            'audio': {'mp3', 'wav', 'flac', 'm4a', 'ogg', 'wma', 'aac', 'opus'},
            'video': {'mp4', 'avi', 'mkv', 'mov', 'wmv', 'flv', 'webm', 'mpg', 'mpeg'},
            'archive': {'zip', 'tar', 'gz', 'bz2', '7z', 'rar', 'xz'},
            'executable': {'exe', 'dll', 'so', 'dylib', 'bin', 'app'},
            'data': {'db', 'sqlite', 'mdb', 'accdb', 'pkl', 'parquet', 'feather', 'arrow'},
        }

        # Parser category -> whether a parser for it exists yet.
        self.implemented_parsers = {
            'text': True,
            'code': True,
            'pdf': True,
            'document': False,
            'spreadsheet': False,
            'presentation': False,
            'image': False,
            'audio': False,
            'video': False,
            'archive': False,
            'executable': False,
            'data': False,
        }

    def get_connection(self):
        """Open a new psycopg2 connection using ``self.db_config``."""
        return psycopg2.connect(**self.db_config)

    def analyze(self, disk: Optional[str] = None, limit: int = 100):
        """Aggregate the ``files`` table per extension and format the result.

        Args:
            disk: When truthy, restrict the query to rows whose
                ``disk_label`` equals this value.
            limit: Maximum number of extension rows to return, most frequent
                first. A falsy value (``0`` / ``None``) disables the limit.

        Returns:
            The dictionary produced by :meth:`_format_results`. Note that the
            summary totals cover only the rows returned by the query, so they
            are partial whenever ``limit`` truncates the extension list.

        Raises:
            ValueError, TypeError: If ``limit`` is truthy but not convertible
                to ``int``.
        """
        query = '''
            SELECT
                CASE
                    WHEN path ~ '\\.([a-zA-Z0-9]+)$' THEN
                        LOWER(SUBSTRING(path FROM '\\.([a-zA-Z0-9]+)$'))
                    ELSE 'no_extension'
                END as extension,
                COUNT(*) as count,
                SUM(size)::bigint as total_size,
                ROUND(AVG(size)::numeric, 0) as avg_size,
                MAX(size) as max_size,
                COUNT(CASE WHEN extracted_text IS NOT NULL THEN 1 END) as parsed_count
            FROM files
        '''
        params = []
        if disk:
            query += ' WHERE disk_label = %s'
            params.append(disk)

        # Secondary sort key keeps the LIMIT cut-off deterministic on ties.
        query += ' GROUP BY extension ORDER BY count DESC, extension'
        if limit:
            # Bind the limit as a parameter instead of formatting it into the
            # SQL text; int() rejects anything that is not a plain number.
            query += ' LIMIT %s'
            params.append(int(limit))

        conn = self.get_connection()
        try:
            # The cursor is created inside the outer try so that the
            # connection is still closed if cursor creation fails.
            cursor = conn.cursor()
            try:
                cursor.execute(query, params)
                results = cursor.fetchall()
            finally:
                cursor.close()
        finally:
            conn.close()

        return self._format_results(results)

    def _format_results(self, results):
        """Turn aggregated query rows into the report dictionary.

        Args:
            results: Iterable of 6-tuples
                ``(extension, count, total_size, avg_size, max_size, parsed)``
                as selected by :meth:`analyze`. Numeric fields may be ``None``
                and are then treated as ``0``.

        Returns:
            A dict with keys ``extensions`` (per-extension detail dicts),
            ``summary`` (totals and ``coverage`` percentage),
            ``unparsed_by_type`` (category -> ``{'count', 'extensions'}``
            where ``extensions`` is a ``set``) and ``parser_status``.
        """
        total_files = 0
        total_size = 0
        parseable_files = 0
        parsed_files = 0
        # Parsed files among parseable extensions only; used for coverage so
        # the percentage cannot exceed 100 when files of unimplemented types
        # happen to have extracted text.
        parsed_parseable_files = 0
        unparsed_by_type = {}
        extension_details = []

        for ext, count, size, avg, max_sz, parsed in results:
            file_count = int(count)
            size_total = int(size or 0)
            parsed_count = int(parsed or 0)

            total_files += file_count
            total_size += size_total
            parsed_files += parsed_count

            parser_type = self._get_parser_type(ext)
            is_parseable = parser_type != 'none' and self.implemented_parsers.get(parser_type, False)

            if is_parseable:
                parseable_files += file_count
                parsed_parseable_files += parsed_count
                unparsed_count = file_count - parsed_count
                if unparsed_count > 0:
                    bucket = unparsed_by_type.setdefault(
                        parser_type, {'count': 0, 'extensions': set()}
                    )
                    bucket['count'] += unparsed_count
                    bucket['extensions'].add(ext)

            extension_details.append({
                'extension': ext,
                'count': file_count,
                'total_size': size_total,
                'avg_size': int(avg or 0),
                'max_size': int(max_sz or 0),
                'parsed': parsed_count,
                'parser_type': parser_type,
                'is_parseable': is_parseable,
            })

        if parseable_files > 0:
            coverage = parsed_parseable_files / parseable_files * 100
        else:
            coverage = 0

        return {
            'extensions': extension_details,
            'summary': {
                'total_files': total_files,
                'total_size': total_size,
                'parseable_files': parseable_files,
                'parsed_files': parsed_files,
                'coverage': coverage,
            },
            'unparsed_by_type': unparsed_by_type,
            'parser_status': self._get_parser_status(),
        }

    def _get_parser_type(self, ext: str) -> str:
        """Return the parser category containing ``ext``, or ``'none'``."""
        for ptype, extensions in self.parseable_extensions.items():
            if ext in extensions:
                return ptype
        return 'none'

    def _get_parser_status(self):
        """Return, per category, its implemented flag and its extensions.

        The extension lists are sorted so the output is stable between runs
        (iteration order of a set of strings is not).
        """
        return {
            ptype: {
                'implemented': self.implemented_parsers.get(ptype, False),
                'extensions': sorted(exts),
            }
            for ptype, exts in self.parseable_extensions.items()
        }

    def format_size(self, size_bytes: int) -> str:
        """Format a byte count with one decimal and a binary-scaled unit.

        Divides by 1024 per step through B, KB, MB, GB, TB and falls back to
        PB. The comparison uses the magnitude so negative values are scaled
        the same way as positive ones.
        """
        size = size_bytes
        for unit in ('B', 'KB', 'MB', 'GB', 'TB'):
            if abs(size) < 1024:
                return f'{size:.1f}{unit}'
            size /= 1024
        return f'{size:.1f}PB'
|