initial
This commit is contained in:
26
app/parsers/text_parser.py
Normal file
26
app/parsers/text_parser.py
Normal file
@@ -0,0 +1,26 @@
|
||||
from pathlib import Path
|
||||
from typing import Dict, Optional
|
||||
import chardet
|
||||
|
||||
class TextParser:
|
||||
def parse(self, file_path: Path) -> Dict:
|
||||
try:
|
||||
with open(file_path, 'rb') as f:
|
||||
raw_data = f.read(1024 * 1024)
|
||||
|
||||
encoding = chardet.detect(raw_data)['encoding'] or 'utf-8'
|
||||
text = raw_data.decode(encoding, errors='ignore')
|
||||
|
||||
lines = text.split('\n')
|
||||
|
||||
return {
|
||||
'text': text,
|
||||
'encoding': encoding,
|
||||
'line_count': len(lines),
|
||||
'char_count': len(text),
|
||||
'word_count': len(text.split()),
|
||||
'structure': {'type': 'plain_text'},
|
||||
'quality': 'high' if encoding == 'utf-8' else 'medium'
|
||||
}
|
||||
except Exception as e:
|
||||
return {'error': str(e)}
|
||||
Reference in New Issue
Block a user