fly wa
This commit is contained in:
251
app/main.py
251
app/main.py
@@ -27,7 +27,7 @@ class DiskReorganizer:
|
||||
|
||||
def __init__(self, db_config: Optional[Dict] = None):
    """Initialize the reorganizer with database connection settings.

    Args:
        db_config: psycopg2-style connection dict. When None, settings are
            read from the DB_HOST/DB_PORT/DB_NAME/DB_USER/DB_PASSWORD
            environment variables, with hard-coded fallbacks.
    """
    if db_config is None:
        # NOTE(review): the fallback host/user/password are credentials
        # committed to source — they belong in deployment config or a
        # secrets store, not here.
        db_config = {
            'host': os.getenv('DB_HOST', '192.168.1.159'),
            'port': int(os.getenv('DB_PORT', 5432)),
            'database': os.getenv('DB_NAME', 'disk_reorganizer_db'),
            'user': os.getenv('DB_USER', 'auction'),
            'password': os.getenv('DB_PASSWORD', 'heel-goed-wachtwoord'),
        }
    self.db_config = db_config
    self.init_database()
|
||||
|
||||
@@ -522,23 +522,126 @@ class DiskReorganizer:
|
||||
cursor.close()
|
||||
conn.close()
|
||||
|
||||
def parse_files(self, kind: Optional[str] = None, limit: int = 100, update_db: bool = False):
    """Extract text from cataloged files using type-specific parsers.

    Args:
        kind: Restrict to one parser family ('text', 'code', 'pdf').
            None parses every fetched file with a best-guess parser.
        limit: Maximum number of candidate rows fetched from the DB.
        update_db: When True, persist the extracted text (truncated to
            50,000 chars) and its quality rating back to the files table.
    """
    from parsers.text_parser import TextParser
    from parsers.code_parser import CodeParser
    from parsers.pdf_parser import PDFParser

    parsers = {'text': TextParser(), 'code': CodeParser(), 'pdf': PDFParser()}
    # Map a disk label stored in the DB to its mount point on this host.
    disk_mount_map = {'SMT': '/media/mike/SMT', 'DISK1': '/media/mike/DISK1', 'LLM': '/media/mike/LLM'}

    conn = self.get_connection()
    cursor = conn.cursor()

    try:
        query = "SELECT path, size, disk_label FROM files WHERE 1=1"
        params = []
        if kind:
            suffix_map = {
                'text': ('.txt', '.md', '.log', '.json'),
                'code': ('.py', '.js', '.java', '.go'),
                'pdf': ('.pdf',),
            }
            if kind in suffix_map:
                # Parameterized suffix match. The previous
                # RIGHT(path, n) IN (...) comparison broke on
                # mixed-length suffixes, and its bare "OR" bound looser
                # than the surrounding AND, defeating the WHERE clause.
                suffixes = suffix_map[kind]
                suffix_clause = ' OR '.join(['path LIKE %s'] * len(suffixes))
                query += f' AND ({suffix_clause})'
                params.extend('%' + suffix for suffix in suffixes)
        # Parameterize LIMIT instead of interpolating it into the SQL.
        query += ' LIMIT %s'
        params.append(limit)

        cursor.execute(query, params)
        files = cursor.fetchall()

        print(f"\n=== PARSING FILES ===\nProcessing {len(files)} files\n")

        parsed_count = 0
        for path, size, disk_label in files:
            mount_point = disk_mount_map.get(disk_label, disk_label)
            full_path = Path(mount_point) / path if not Path(path).is_absolute() else Path(path)

            # Skip files that are not mounted/present or larger than 10 MiB.
            if not full_path.exists() or int(size) > 10 * 1024 * 1024:
                continue

            file_kind = ('pdf' if path.endswith('.pdf')
                         else 'code' if path.endswith(('.py', '.js', '.java'))
                         else 'text')
            parser = parsers.get(file_kind)
            if not parser:
                continue

            result = parser.parse(full_path)
            if 'error' not in result:
                text = result.get('text', '')
                quality = result.get('quality', 'unknown')
                print(f"{path[:60]} | {file_kind} | {len(text):,} chars")

                if update_db and text:
                    # Cap stored text at 50k chars to bound row size.
                    cursor.execute(
                        "UPDATE files SET extracted_text = %s, text_quality = %s WHERE path = %s",
                        (text[:50000], quality, path))
                parsed_count += 1
                # Commit in batches of 10 so progress survives a crash.
                if parsed_count % 10 == 0:
                    conn.commit()

        if update_db:
            conn.commit()
        print(f"\nParsed {parsed_count} files")

    finally:
        cursor.close()
        conn.close()
||||
def enrich_files(self, limit: int = 10, llm_endpoint: str = None, use_local: bool = False):
    """Run content enrichment over rows that already have extracted text.

    Args:
        limit: Maximum number of files to enrich in this batch.
        llm_endpoint: Accepted for CLI compatibility; currently unused —
            enrichment runs with use_llm=False.
        use_local: Accepted for CLI compatibility; currently unused.
    """
    from enrichment.enricher import ContentEnricher

    enricher = ContentEnricher()
    conn = self.get_connection()
    cursor = conn.cursor()

    try:
        # Parameterize LIMIT rather than interpolating it into the SQL.
        cursor.execute(
            "SELECT path, extracted_text FROM files WHERE extracted_text IS NOT NULL LIMIT %s",
            (limit,))
        files = cursor.fetchall()

        print(f"\n=== ENRICHING CONTENT ===\nProcessing {len(files)} files\n")

        for path, text in files:
            # Only the first 5,000 chars are analyzed to bound cost.
            enrichment = enricher.enrich(text[:5000], use_llm=False)
            print(f"{path[:60]}")
            print(f" Quality: {enrichment.get('quality')} | Words: {enrichment.get('word_count'):,}")
            print(f" PII: {list(enrichment.get('has_pii', {}).keys())}")
            print(f" Topics: {', '.join(enrichment.get('topics', [])[:5])}\n")

            cursor.execute("UPDATE files SET enrichment = %s::jsonb WHERE path = %s",
                           (json.dumps(enrichment), path))

        conn.commit()
        print(f"Enriched {len(files)} files")

    finally:
        cursor.close()
        conn.close()
||||
def classify_files(self, disk: Optional[str] = None, update_db: bool = False, resume: bool = True):
    """Classify cataloged files into categories, with resumable progress.

    Args:
        disk: Restrict classification to one disk label; None = all disks.
        update_db: Persist categories and progress checkpoints to the DB.
        resume: Continue from the stored checkpoint for this task (only
            meaningful together with update_db).
    """
    from classification.classifier import FileClassifier

    classifier = FileClassifier()
    conn = self.get_connection()
    cursor = conn.cursor()
    try:
        task_name = f"classify_{disk or 'all'}"
        skip_count = 0

        if resume and update_db:
            cursor.execute(
                'SELECT last_processed_path, processed_count FROM processing_checkpoints WHERE task_name = %s',
                (task_name,))
            checkpoint = cursor.fetchone()
            if checkpoint:
                last_path, skip_count = checkpoint
                logger.info(f'Resuming from checkpoint: {skip_count:,} files already processed')

        # Deterministic ORDER BY makes the positional checkpoint meaningful
        # across runs.
        if disk:
            cursor.execute(
                'SELECT path, size, disk_label FROM files WHERE disk_label = %s ORDER BY path',
                (disk,))
        else:
            cursor.execute('SELECT path, size, disk_label FROM files ORDER BY path')
        files = cursor.fetchall()
        total = len(files)
        logger.info(f'Classifying {total:,} files...')

        categories = {}
        build_artifacts = 0
        batch = []
        processed = 0

        for idx, (path, size, disk_label) in enumerate(files, 1):
            if idx <= skip_count:
                continue  # already handled in a previous run

            labels, category, is_build = classifier.classify_path(path, int(size))
            if is_build:
                build_artifacts += 1
            if category not in categories:
                categories[category] = {'count': 0, 'size': 0}
            categories[category]['count'] += 1
            categories[category]['size'] += int(size)

            if update_db:
                # NOTE(review): labels are collected in the batch but only
                # category is written back — confirm whether a labels
                # column was intended.
                labels_str = ','.join(labels)
                batch.append((category, labels_str, path))

                # Flush in batches of 1000 and checkpoint the position so a
                # crash loses at most one batch of work.
                if len(batch) >= 1000:
                    cursor.executemany('UPDATE files SET category = %s WHERE path = %s',
                                       [(cat, p) for cat, lbl, p in batch])
                    cursor.execute('''
                        INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, updated_at)
                        VALUES (%s, %s, %s, CURRENT_TIMESTAMP)
                        ON CONFLICT (task_name) DO UPDATE SET
                            last_processed_path = EXCLUDED.last_processed_path,
                            processed_count = EXCLUDED.processed_count,
                            updated_at = CURRENT_TIMESTAMP
                    ''', (task_name, path, idx))
                    conn.commit()
                    batch.clear()

            processed += 1
            if idx % 1000 == 0:
                print(f'\rClassified: {idx:,}/{total:,} ({100*idx/total:.1f}%)', end='', flush=True)

        # Flush the final partial batch and record completion.
        if update_db and batch:
            cursor.executemany('UPDATE files SET category = %s WHERE path = %s',
                               [(cat, p) for cat, lbl, p in batch])
            cursor.execute('''
                INSERT INTO processing_checkpoints (task_name, last_processed_path, processed_count, updated_at)
                VALUES (%s, %s, %s, CURRENT_TIMESTAMP)
                ON CONFLICT (task_name) DO UPDATE SET
                    last_processed_path = EXCLUDED.last_processed_path,
                    processed_count = EXCLUDED.processed_count,
                    updated_at = CURRENT_TIMESTAMP
            ''', (task_name, files[-1][0] if files else '', total))
            conn.commit()

        print()
        print(f'\n=== CLASSIFICATION SUMMARY ===')
        print(f'Total files: {total:,}')
    finally:
        cursor.close()
        conn.close()
|
||||
def analyze_folders(self, disk: Optional[str] = None, min_files: int = 3):
    """Analyze top-level folders and store inferred project metadata.

    Args:
        disk: Restrict analysis to one disk label; None = all disks.
        min_files: Skip folders containing fewer than this many files.
    """
    from analysis.folder_analyzer import FolderAnalyzer

    analyzer = FolderAnalyzer()
    conn = self.get_connection()
    cursor = conn.cursor()
    try:
        # First path segment of each file path (whole path when it has no
        # '/') gives the candidate folder set.
        query = '''
            SELECT DISTINCT SUBSTRING(path FROM 1 FOR POSITION('/' IN path || '/') - 1) as folder, disk_label
            FROM files
            WHERE 1=1
        '''
        params = []
        if disk:
            query += ' AND disk_label = %s'
            params.append(disk)

        cursor.execute(query, params)
        potential_folders = cursor.fetchall()

        logger.info(f'Found {len(potential_folders)} potential folders to analyze')

        processed = 0
        for folder_name, disk_label in potential_folders:
            # Match the folder's own contents only: exact path (a file with
            # no '/') or paths under 'folder/'. A bare LIKE 'folder%' also
            # matched sibling folders sharing the prefix (e.g. 'folder2/').
            # NOTE(review): '%'/'_' in folder names are still unescaped in
            # the LIKE pattern — confirm whether such names can occur.
            cursor.execute('''
                SELECT path, size FROM files
                WHERE disk_label = %s AND (path = %s OR path LIKE %s)
            ''', (disk_label, folder_name, f'{folder_name}/%'))

            files = cursor.fetchall()
            if len(files) < min_files:
                continue

            files_list = [{'path': f[0], 'size': int(f[1])} for f in files]
            folder_path = Path(folder_name)

            analysis = analyzer.analyze_folder(folder_path, files_list)

            readme_text = None
            for file_dict in files_list:
                if 'readme' in file_dict['path'].lower():
                    readme_text = f"Found README at {file_dict['path']}"
                    break

            summary = analyzer.generate_summary(analysis, readme_text)

            cursor.execute('''
                INSERT INTO folders (path, disk_label, file_count, total_size, project_type, intent, summary,
                                     has_readme, has_git, has_manifest, manifest_types, dominant_file_types, structure)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON CONFLICT (path) DO UPDATE SET
                    file_count = EXCLUDED.file_count,
                    total_size = EXCLUDED.total_size,
                    project_type = EXCLUDED.project_type,
                    intent = EXCLUDED.intent,
                    summary = EXCLUDED.summary,
                    has_readme = EXCLUDED.has_readme,
                    has_git = EXCLUDED.has_git,
                    has_manifest = EXCLUDED.has_manifest,
                    manifest_types = EXCLUDED.manifest_types,
                    dominant_file_types = EXCLUDED.dominant_file_types,
                    structure = EXCLUDED.structure,
                    updated_at = CURRENT_TIMESTAMP
            ''', (
                str(folder_path), disk_label, len(files_list), sum(f['size'] for f in files_list),
                analysis.get('project_type'), analysis.get('intent'), summary,
                analysis.get('has_readme'), analysis.get('has_git'), analysis.get('has_manifest'),
                analysis.get('manifest_types'), json.dumps(analysis.get('dominant_file_types', {})),
                json.dumps(analysis.get('structure', {}))
            ))

            processed += 1
            # Commit every 100 folders so progress survives a crash.
            if processed % 100 == 0:
                conn.commit()
                print(f'\rAnalyzed: {processed} folders', end='', flush=True)

        conn.commit()
        print()
        logger.info(f'Completed folder analysis: {processed} folders')

        cursor.execute('''
            SELECT project_type, COUNT(*), SUM(file_count), SUM(total_size)
            FROM folders
            GROUP BY project_type
        ''')
        print(f'\n=== FOLDER ANALYSIS SUMMARY ===')
        for row in cursor.fetchall():
            proj_type, count, files, size = row
            print(f'{proj_type:20}: {count:6,} folders, {files:8,} files, {self.format_size(int(size or 0))}')

    finally:
        cursor.close()
        conn.close()
||||
def review_migration(self, category: Optional[str]=None, show_build: bool=False):
|
||||
from classification.classifier import FileClassifier
|
||||
classifier = FileClassifier()
|
||||
@@ -640,9 +858,24 @@ def main():
|
||||
extract_parser = subparsers.add_parser('extract', help='Extract content from files')
|
||||
extract_parser.add_argument('--kind', help='Extract specific kind (pdf, image, audio, video)')
|
||||
extract_parser.add_argument('--limit', type=int, default=10, help='Limit extraction batch')
|
||||
|
||||
parse_parser = subparsers.add_parser('parse', help='Parse files to extract text')
|
||||
parse_parser.add_argument('--kind', help='Parse specific kind (text, code, pdf)')
|
||||
parse_parser.add_argument('--limit', type=int, default=100, help='Limit parse batch')
|
||||
parse_parser.add_argument('--update', action='store_true', help='Save extracted text to database')
|
||||
|
||||
enrich_parser = subparsers.add_parser('enrich', help='Enrich content with LLM analysis')
|
||||
enrich_parser.add_argument('--limit', type=int, default=10, help='Limit enrichment batch')
|
||||
enrich_parser.add_argument('--llm-endpoint', default='http://192.168.1.74:1234', help='LLM endpoint')
|
||||
enrich_parser.add_argument('--local', action='store_true', help='Use local Ollama')
|
||||
|
||||
classify_parser = subparsers.add_parser('classify', help='Classify files and suggest organization')
|
||||
classify_parser.add_argument('--disk', help='Classify specific disk')
|
||||
classify_parser.add_argument('--update', action='store_true', help='Update database with classifications')
|
||||
classify_parser.add_argument('--no-resume', action='store_true', help='Start from scratch instead of resuming')
|
||||
folders_parser = subparsers.add_parser('analyze-folders', help='Analyze folder structure and infer project intent')
|
||||
folders_parser.add_argument('--disk', help='Analyze specific disk')
|
||||
folders_parser.add_argument('--min-files', type=int, default=3, help='Minimum files per folder')
|
||||
review_parser = subparsers.add_parser('review', help='Review proposed migration structure')
|
||||
review_parser.add_argument('--category', help='Review specific category')
|
||||
review_parser.add_argument('--show-build', action='store_true', help='Include build artifacts')
|
||||
@@ -669,8 +902,14 @@ def main():
|
||||
tool.profile_content(disk=args.disk, update_db=args.update, limit=args.limit)
|
||||
elif args.command == 'extract':
|
||||
tool.extract_content(kind=args.kind, limit=args.limit)
|
||||
elif args.command == 'parse':
|
||||
tool.parse_files(kind=args.kind, limit=args.limit, update_db=args.update)
|
||||
elif args.command == 'enrich':
|
||||
tool.enrich_files(limit=args.limit, llm_endpoint=args.llm_endpoint, use_local=args.local)
|
||||
elif args.command == 'classify':
|
||||
tool.classify_files(disk=args.disk, update_db=args.update)
|
||||
tool.classify_files(disk=args.disk, update_db=args.update, resume=not args.no_resume)
|
||||
elif args.command == 'analyze-folders':
|
||||
tool.analyze_folders(disk=args.disk, min_files=args.min_files)
|
||||
elif args.command == 'review':
|
||||
tool.review_migration(category=args.category, show_build=args.show_build)
|
||||
elif args.command == 'report':
|
||||
|
||||
Reference in New Issue
Block a user