#!/usr/bin/env python3
"""
Parser module for extracting data from HTML/JSON content
"""
import json
import re
import html
from datetime import datetime
from urllib.parse import urljoin, urlparse
from typing import Dict, List, Optional
from config import BASE_URL
class DataParser:
    """Handles all data extraction from HTML/JSON content."""

    @staticmethod
    def extract_lot_id(url: str) -> str:
        """Extract a lot ID from a URL.

        Tries a numeric ``/lots/<id>`` path first, then an auction-style
        ID like ``A12-34`` under ``/a/``, and finally falls back to the
        last path segment. Returns "" for an empty path.
        """
        path = urlparse(url).path
        match = re.search(r'/lots/(\d+)', path)
        if match:
            return match.group(1)
        match = re.search(r'/a/.*?([A-Z]\d+-\d+)', path)
        if match:
            return match.group(1)
        return path.split('/')[-1] if path else ""

    @staticmethod
    def clean_text(text: str) -> str:
        """Clean extracted text: unescape HTML entities and collapse whitespace."""
        text = html.unescape(text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    @staticmethod
    def format_timestamp(timestamp) -> str:
        """Convert a Unix timestamp (int/float/str) to 'YYYY-MM-DD HH:MM:SS'.

        Accepts seconds or milliseconds (detected by magnitude), numeric
        strings, or already-formatted date strings (returned as-is).
        Known placeholder values and unparseable input yield ''.
        NOTE: uses the local timezone via datetime.fromtimestamp.
        """
        try:
            # Handle numeric timestamps
            if isinstance(timestamp, (int, float)) and timestamp > 0:
                # Unix timestamps are typically 10 digits (seconds) or 13 digits (milliseconds)
                if timestamp > 1e12:  # Milliseconds
                    timestamp = timestamp / 1000
                return datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')

            # Handle string timestamps that might be numeric
            if isinstance(timestamp, str):
                # Try to parse as number
                try:
                    ts_num = float(timestamp)
                    if ts_num > 1e12:
                        ts_num = ts_num / 1000
                    if ts_num > 0:
                        return datetime.fromtimestamp(ts_num).strftime('%Y-%m-%d %H:%M:%S')
                except ValueError:
                    # Not a numeric string - check if it's an invalid value
                    invalid_values = ['gap', 'materieel wegens vereffening', 'tbd', 'n/a', 'unknown']
                    if timestamp.lower().strip() in invalid_values:
                        return ''
                    # Return as-is if it looks like a formatted date
                    return timestamp if len(timestamp) > 0 else ''

            return str(timestamp) if timestamp else ''
        except Exception:
            # Log parsing errors for debugging
            if timestamp and str(timestamp).strip():
                print(f"  ⚠️ Could not parse timestamp: {timestamp}")
            return ''

    @staticmethod
    def format_currency(amount) -> str:
        """Format a numeric amount as a euro string; non-numeric values pass through."""
        if isinstance(amount, (int, float)):
            return f"€{amount:,.2f}" if amount > 0 else "€0"
        return str(amount) if amount else "€0"

    def parse_page(self, content: str, url: str) -> Optional[Dict]:
        """Parse a page and determine if it's an auction or lot.

        Prefers the structured Next.js __NEXT_DATA__ payload; falls back
        to regex scraping of the raw HTML when it is absent.
        """
        next_data = self._extract_nextjs_data(content, url)
        if next_data:
            return next_data

        # Collapse whitespace so the fallback regexes can span "lines".
        content = re.sub(r'\s+', ' ', content)
        return {
            'type': 'lot',
            'url': url,
            'lot_id': self.extract_lot_id(url),
            'title': self._extract_meta_content(content, 'og:title'),
            'current_bid': self._extract_current_bid(content),
            'bid_count': self._extract_bid_count(content),
            'closing_time': self._extract_end_date(content),
            'location': self._extract_location(content),
            'description': self._extract_description(content),
            'category': self._extract_category(content),
            'images': self._extract_images(content),
            'scraped_at': datetime.now().isoformat()
        }

    def _extract_nextjs_data(self, content: str, url: str) -> Optional[Dict]:
        """Extract data from the Next.js __NEXT_DATA__ JSON script tag."""
        try:
            match = re.search(r'<script[^>]*id="__NEXT_DATA__"[^>]*>(.+?)</script>', content, re.DOTALL)
            if not match:
                return None

            data = json.loads(match.group(1))
            page_props = data.get('props', {}).get('pageProps', {})

            if 'lot' in page_props:
                # Pass both lot and auction data (auction is included in lot pages)
                return self._parse_lot_json(page_props.get('lot', {}), url, page_props.get('auction'))
            if 'auction' in page_props:
                return self._parse_auction_json(page_props.get('auction', {}), url)
            return None

        except Exception as e:
            print(f"  → Error parsing __NEXT_DATA__: {e}")
            return None

    def _parse_lot_json(self, lot_data: Dict, url: str, auction_data: Optional[Dict] = None) -> Dict:
        """Parse lot data from JSON

        Args:
            lot_data: Lot object from __NEXT_DATA__
            url: Page URL
            auction_data: Optional auction object (included in lot pages)
        """
        location_data = lot_data.get('location', {})
        city = location_data.get('city', '')
        country = location_data.get('countryCode', '').upper()
        location = f"{city}, {country}" if city and country else (city or country)

        # Bid amount may live at the top level or under a nested 'bidding' object.
        current_bid = lot_data.get('currentBid') or lot_data.get('highestBid') or lot_data.get('startingBid')
        if current_bid is None or current_bid == 0:
            bidding = lot_data.get('bidding', {})
            current_bid = bidding.get('currentBid') or bidding.get('amount')

        # Guard the numeric comparison: feeds sometimes carry a pre-formatted
        # string bid, and `str > 0` would raise TypeError.
        if isinstance(current_bid, (int, float)):
            current_bid_str = self.format_currency(current_bid) if current_bid > 0 else "No bids"
        elif current_bid:
            current_bid_str = self.format_currency(current_bid)
        else:
            current_bid_str = "No bids"

        bid_count = lot_data.get('bidCount', 0)
        if bid_count == 0:
            bid_count = lot_data.get('bidding', {}).get('bidCount', 0)

        description = lot_data.get('description', {})
        if isinstance(description, dict):
            description = description.get('description', '')
        else:
            description = str(description)

        category = lot_data.get('category', {})
        category_name = category.get('name', '') if isinstance(category, dict) else ''

        # Get auction displayId from auction data if available (lot pages include auction)
        # Otherwise fall back to the UUID auctionId
        auction_id = lot_data.get('auctionId', '')
        if auction_data and auction_data.get('displayId'):
            auction_id = auction_data.get('displayId')

        return {
            'type': 'lot',
            'lot_id': lot_data.get('displayId', ''),
            'auction_id': auction_id,
            'url': url,
            'title': lot_data.get('title', ''),
            'current_bid': current_bid_str,
            'bid_count': bid_count,
            'closing_time': self.format_timestamp(lot_data.get('endDate', '')),
            'viewing_time': self._extract_viewing_time(lot_data),
            'pickup_date': self._extract_pickup_date(lot_data),
            'location': location,
            'description': description,
            'category': category_name,
            'images': self._extract_images_from_json(lot_data),
            'scraped_at': datetime.now().isoformat()
        }

    def _parse_auction_json(self, auction_data: Dict, url: str) -> Optional[Dict]:
        """Parse auction data from JSON.

        Returns an auction dict, a lot dict (some payloads put a lot under
        the 'auction' key), or None if the shape is unrecognized.
        """
        is_auction = 'lots' in auction_data and isinstance(auction_data['lots'], list)
        is_lot = 'lotNumber' in auction_data or 'currentBid' in auction_data

        if is_auction:
            lots = auction_data.get('lots', [])
            first_lot_closing = None
            if lots:
                first_lot_closing = self.format_timestamp(lots[0].get('endDate', ''))

            return {
                'type': 'auction',
                'auction_id': auction_data.get('displayId', ''),
                'url': url,
                'title': auction_data.get('name', ''),
                'location': self._extract_location_from_json(auction_data),
                'lots_count': len(lots),
                'first_lot_closing_time': first_lot_closing or self.format_timestamp(auction_data.get('minEndDate', '')),
                'scraped_at': datetime.now().isoformat(),
                'lots': lots
            }
        elif is_lot:
            return self._parse_lot_json(auction_data, url)
        return None

    def _extract_viewing_time(self, auction_data: Dict) -> str:
        """Extract viewing time (first viewing day's start/end) from auction data."""
        viewing_days = auction_data.get('viewingDays', [])
        if viewing_days:
            first = viewing_days[0]
            start = self.format_timestamp(first.get('startDate', ''))
            end = self.format_timestamp(first.get('endDate', ''))
            if start and end:
                return f"{start} - {end}"
            return start or end
        return ''

    def _extract_pickup_date(self, auction_data: Dict) -> str:
        """Extract pickup date (first collection day's start/end) from auction data."""
        collection_days = auction_data.get('collectionDays', [])
        if collection_days:
            first = collection_days[0]
            start = self.format_timestamp(first.get('startDate', ''))
            end = self.format_timestamp(first.get('endDate', ''))
            if start and end:
                return f"{start} - {end}"
            return start or end
        return ''

    def _extract_images_from_json(self, auction_data: Dict) -> List[str]:
        """Extract all image URLs from auction data ('image' object plus 'images' list)."""
        images = []
        if auction_data.get('image', {}).get('url'):
            images.append(auction_data['image']['url'])
        if isinstance(auction_data.get('images'), list):
            for img in auction_data['images']:
                if isinstance(img, dict) and img.get('url'):
                    images.append(img['url'])
                elif isinstance(img, str):
                    images.append(img)
        return images

    def _extract_location_from_json(self, auction_data: Dict) -> str:
        """Extract a 'City, CC' location string from viewing or collection days."""
        for days in [auction_data.get('viewingDays', []), auction_data.get('collectionDays', [])]:
            if days:
                first_location = days[0]
                city = first_location.get('city', '')
                country = first_location.get('countryCode', '').upper()
                if city:
                    return f"{city}, {country}" if country else city
        return ''

    def _extract_meta_content(self, content: str, property_name: str) -> str:
        """Extract content from a <meta property=...> tag; '' if absent."""
        pattern = rf'<meta[^>]*property=["\']{property_name}["\'][^>]*content=["\']([^"\']+)["\']'
        match = re.search(pattern, content, re.IGNORECASE)
        return self.clean_text(match.group(1)) if match else ""

    def _extract_current_bid(self, content: str) -> str:
        """Extract current bid amount from raw HTML; 'No bids' if none found."""
        patterns = [
            r'"currentBid"\s*:\s*"([^"]+)"',
            r'"currentBid"\s*:\s*(\d+(?:\.\d+)?)',
            r'(?:Current bid|Huidig bod)[:\s]*</?\w*>\s*(€[\d,.\s]+)',
            r'(?:Current bid|Huidig bod)[:\s]+(€[\d,.\s]+)',
        ]

        # Invalid bid texts that should be treated as "no bids"
        invalid_bid_texts = [
            'huidig bod',
            'current bid',
            '€huidig bod',
            '€huidig bod',  # With zero-width spaces
            'huidig bod',
        ]

        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                bid = match.group(1).strip()
                # Remove zero-width spaces and other unicode whitespace
                bid = re.sub(r'[\u200b\u200c\u200d\u00a0]+', ' ', bid).strip()

                # Check if it's a valid bid
                if bid:
                    # Reject invalid bid texts (compare with spaces/€ stripped)
                    bid_lower = bid.lower().replace(' ', '').replace('€', '')
                    if bid_lower not in [t.lower().replace(' ', '').replace('€', '') for t in invalid_bid_texts]:
                        if not bid.startswith('€'):
                            bid = f"€{bid}"
                        return bid

        return "No bids"

    def _extract_bid_count(self, content: str) -> int:
        """Extract the number of bids from raw HTML; 0 if not found."""
        match = re.search(r'(\d+)\s*bids?', content, re.IGNORECASE)
        if match:
            try:
                return int(match.group(1))
            except ValueError:
                pass
        return 0

    def _extract_end_date(self, content: str) -> str:
        """Extract the auction end date from raw HTML; '' if not found."""
        patterns = [
            r'Ends?[:\s]+([A-Za-z0-9,:\s]+)',
            r'endTime["\']:\s*["\']([^"\']+)["\']',
        ]
        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                return match.group(1).strip()
        return ""

    def _extract_location(self, content: str) -> str:
        """Extract a location string from raw HTML; '' if not found."""
        patterns = [
            r'(?:Location|Locatie)[:\s]*</?\w*>\s*([A-Za-zÀ-ÿ0-9\s,.-]+?)(?:<|$)',
            r'(?:Location|Locatie)[:\s]+([A-Za-zÀ-ÿ0-9\s,.-]+?)(?:<br|</|$)',
        ]
        for pattern in patterns:
            match = re.search(pattern, content, re.IGNORECASE)
            if match:
                location = self.clean_text(match.group(1))
                # Reject label-only matches and other boilerplate captures.
                if location.lower() not in ['locatie', 'location', 'huidig bod', 'current bid']:
                    location = re.sub(r'[,.\s]+$', '', location)
                    if len(location) > 2:
                        return location
        return ""

    def _extract_description(self, content: str) -> str:
        """Extract the meta description (truncated to 500 chars); '' if absent."""
        pattern = r'<meta[^>]*name=["\']description["\'][^>]*content=["\']([^"\']+)["\']'
        match = re.search(pattern, content, re.IGNORECASE | re.DOTALL)
        return self.clean_text(match.group(1))[:500] if match else ""

    def _extract_category(self, content: str) -> str:
        """Extract category from breadcrumb or meta tags."""
        pattern = r'class="breadcrumb[^"]*".*?>([A-Za-z\s]+)</a>'
        match = re.search(pattern, content, re.IGNORECASE)
        if match:
            return self.clean_text(match.group(1))
        return self._extract_meta_content(content, 'category')

    def _extract_images(self, content: str) -> List[str]:
        """Extract up to 5 content image URLs from raw HTML, skipping chrome images."""
        pattern = r'<img[^>]*src=["\']([^"\']+\.jpe?g|[^"\']+\.png)["\'][^>]*>'
        matches = re.findall(pattern, content, re.IGNORECASE)

        images = []
        for match in matches:
            if any(skip in match.lower() for skip in ['logo', 'icon', 'placeholder', 'banner']):
                continue
            full_url = urljoin(BASE_URL, match)
            images.append(full_url)

        return images[:5]  # Limit to 5 images