27 lines
844 B
Python
27 lines
844 B
Python
from pathlib import Path
|
|
from typing import Dict, Optional
|
|
import chardet
|
|
|
|
class TextParser:
    """Parse a plain-text file into decoded text plus simple statistics.

    At most the first 1 MiB of the file is read, so the returned text and
    every count describe that prefix rather than necessarily the whole file.
    """

    def parse(self, file_path: Path) -> Dict:
        """Read, decode and summarise the text file at *file_path*.

        Args:
            file_path: Location of the file to read; it is opened in
                binary mode and decoded with the detected encoding.

        Returns:
            On success, a dict with the keys ``text``, ``encoding``,
            ``line_count``, ``char_count``, ``word_count``, ``structure``
            and ``quality``. On any failure (unreadable file, unknown
            codec, ...), ``{'error': <message>}`` instead; this method
            does not raise.
        """
        try:
            with open(file_path, 'rb') as f:
                # Cap the read at 1 MiB to bound memory and detection time.
                raw_data = f.read(1024 * 1024)

            if raw_data:
                encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
            else:
                # Nothing to detect in an empty file; use the same UTF-8
                # fallback that applies when detection yields no encoding.
                encoding = 'utf-8'

            # errors='ignore' silently drops undecodable bytes (including a
            # multi-byte character cut in half by the read cap).
            text = raw_data.decode(encoding, errors='ignore')

            # Count newline-terminated lines, plus a final line that lacks a
            # trailing newline. Unlike len(text.split('\n')), this reports 0
            # for empty text and does not add a phantom line after a
            # trailing newline.
            line_count = text.count('\n')
            if text and not text.endswith('\n'):
                line_count += 1

            return {
                'text': text,
                'encoding': encoding,
                'line_count': line_count,
                'char_count': len(text),
                'word_count': len(text.split()),
                'structure': {'type': 'plain_text'},
                # NOTE(review): only the exact label 'utf-8' is graded
                # 'high'; any other detector label (e.g. 'ascii' or a
                # differently-cased name) falls to 'medium' -- confirm
                # this is intended.
                'quality': 'high' if encoding == 'utf-8' else 'medium',
            }
        except Exception as e:
            # Boundary of the parser: callers receive an error dict rather
            # than an exception, whatever went wrong.
            return {'error': str(e)}
|