Files
defrag/app/parsers/text_parser.py
2025-12-13 11:56:06 +01:00

27 lines
844 B
Python

from pathlib import Path
from typing import Dict, Optional
import chardet
class TextParser:
    """Parse a plain-text file into its decoded text plus basic statistics.

    Only the first ``MAX_BYTES`` bytes of a file are read, so the returned
    text and every count describe that leading portion, which may be less
    than the whole file.
    """

    # Upper bound on how many bytes of the file are read and analysed.
    MAX_BYTES = 1024 * 1024

    # Used when detection returns no encoding or names a codec Python lacks.
    DEFAULT_ENCODING = 'utf-8'

    # Detected labels whose byte streams are valid UTF-8: plain ASCII is a
    # strict subset and 'utf-8-sig' is UTF-8 with a leading BOM.
    _UTF8_COMPATIBLE = frozenset({'utf-8', 'utf-8-sig', 'ascii'})

    def parse(self, file_path: Path) -> Dict:
        """Read, decode and summarise the leading part of a text file.

        Args:
            file_path: Location of the file to read.

        Returns:
            On success, a dict with the keys ``text``, ``encoding``,
            ``line_count``, ``char_count``, ``word_count``, ``structure``
            and ``quality``. On any failure, a dict with the single key
            ``error`` holding the exception message; no exception
            propagates to the caller.
        """
        try:
            with open(file_path, 'rb') as f:
                raw_data = f.read(self.MAX_BYTES)

            detected = chardet.detect(raw_data)['encoding']
            encoding = detected or self.DEFAULT_ENCODING
            try:
                # errors='ignore' also drops a multi-byte character that the
                # read limit may have cut in half at the end of the buffer.
                text = raw_data.decode(encoding, errors='ignore')
            except LookupError:
                # The detector named a codec Python does not ship; fall back
                # instead of failing the whole parse.
                encoding = self.DEFAULT_ENCODING
                text = raw_data.decode(encoding, errors='ignore')

            # Count newline-terminated lines, plus a final unterminated one.
            # Empty text therefore has 0 lines and a trailing newline does
            # not add a phantom empty line.
            line_count = text.count('\n')
            if text and not text.endswith('\n'):
                line_count += 1

            # Compare case-insensitively: detector labels are not guaranteed
            # to be lower-case.
            utf8_compatible = encoding.lower() in self._UTF8_COMPATIBLE

            return {
                'text': text,
                'encoding': encoding,
                'line_count': line_count,
                'char_count': len(text),
                'word_count': len(text.split()),
                'structure': {'type': 'plain_text'},
                'quality': 'high' if utf8_compatible else 'medium',
            }
        except Exception as e:
            # Deliberate catch-all boundary: callers receive an error dict
            # rather than an exception.
            return {'error': str(e)}