enrich data

This commit is contained in:
Tour
2025-12-07 16:26:30 +01:00
parent fd69faebcc
commit b1905164bd
40 changed files with 76 additions and 3605 deletions

View File

@@ -19,8 +19,9 @@ class CacheManager:
self._init_db()
def _init_db(self):
"""Initialize cache and data storage database"""
"""Initialize cache and data storage database with consolidated schema"""
with sqlite3.connect(self.db_path) as conn:
# Cache table
conn.execute("""
CREATE TABLE IF NOT EXISTS cache (
url TEXT PRIMARY KEY,
@@ -32,6 +33,8 @@ class CacheManager:
conn.execute("""
CREATE INDEX IF NOT EXISTS idx_timestamp ON cache(timestamp)
""")
# Auctions table - consolidated schema
conn.execute("""
CREATE TABLE IF NOT EXISTS auctions (
auction_id TEXT PRIMARY KEY,
@@ -40,9 +43,18 @@ class CacheManager:
location TEXT,
lots_count INTEGER,
first_lot_closing_time TEXT,
scraped_at TEXT
scraped_at TEXT,
city TEXT,
country TEXT,
type TEXT,
lot_count INTEGER DEFAULT 0,
closing_time TEXT,
discovered_at INTEGER
)
""")
conn.execute("CREATE INDEX IF NOT EXISTS idx_auctions_country ON auctions(country)")
# Lots table - consolidated schema with all fields from working database
conn.execute("""
CREATE TABLE IF NOT EXISTS lots (
lot_id TEXT PRIMARY KEY,
@@ -50,8 +62,6 @@ class CacheManager:
url TEXT UNIQUE,
title TEXT,
current_bid TEXT,
starting_bid TEXT,
minimum_bid TEXT,
bid_count INTEGER,
closing_time TEXT,
viewing_time TEXT,
@@ -60,9 +70,54 @@ class CacheManager:
description TEXT,
category TEXT,
scraped_at TEXT,
sale_id INTEGER,
manufacturer TEXT,
type TEXT,
year INTEGER,
currency TEXT DEFAULT 'EUR',
closing_notified INTEGER DEFAULT 0,
starting_bid TEXT,
minimum_bid TEXT,
status TEXT,
brand TEXT,
model TEXT,
attributes_json TEXT,
first_bid_time TEXT,
last_bid_time TEXT,
bid_velocity REAL,
bid_increment REAL,
year_manufactured INTEGER,
condition_score REAL,
condition_description TEXT,
serial_number TEXT,
damage_description TEXT,
followers_count INTEGER DEFAULT 0,
estimated_min_price REAL,
estimated_max_price REAL,
lot_condition TEXT,
appearance TEXT,
estimated_min REAL,
estimated_max REAL,
next_bid_step_cents INTEGER,
condition TEXT,
category_path TEXT,
city_location TEXT,
country_code TEXT,
bidding_status TEXT,
packaging TEXT,
quantity INTEGER,
vat REAL,
buyer_premium_percentage REAL,
remarks TEXT,
reserve_price REAL,
reserve_met INTEGER,
view_count INTEGER,
FOREIGN KEY (auction_id) REFERENCES auctions(auction_id)
)
""")
conn.execute("CREATE INDEX IF NOT EXISTS idx_lots_sale_id ON lots(sale_id)")
# Images table
conn.execute("""
CREATE TABLE IF NOT EXISTS images (
id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -70,86 +125,28 @@ class CacheManager:
url TEXT,
local_path TEXT,
downloaded INTEGER DEFAULT 0,
labels TEXT,
processed_at INTEGER,
FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
)
""")
conn.execute("CREATE INDEX IF NOT EXISTS idx_images_lot_id ON images(lot_id)")
# Add new columns to auctions table if they don't exist
cursor = conn.execute("PRAGMA table_info(auctions)")
auction_columns = {row[1] for row in cursor.fetchall()}
# Remove duplicates before creating unique index
conn.execute("""
DELETE FROM images
WHERE id NOT IN (
SELECT MIN(id)
FROM images
GROUP BY lot_id, url
)
""")
conn.execute("""
CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_lot_url
ON images(lot_id, url)
""")
if 'city' not in auction_columns:
conn.execute("ALTER TABLE auctions ADD COLUMN city TEXT")
if 'country' not in auction_columns:
conn.execute("ALTER TABLE auctions ADD COLUMN country TEXT")
if 'type' not in auction_columns:
conn.execute("ALTER TABLE auctions ADD COLUMN type TEXT")
if 'lot_count' not in auction_columns:
conn.execute("ALTER TABLE auctions ADD COLUMN lot_count INTEGER DEFAULT 0")
if 'closing_time' not in auction_columns:
conn.execute("ALTER TABLE auctions ADD COLUMN closing_time TEXT")
if 'discovered_at' not in auction_columns:
conn.execute("ALTER TABLE auctions ADD COLUMN discovered_at INTEGER")
# Add index for country filtering
conn.execute("CREATE INDEX IF NOT EXISTS idx_auctions_country ON auctions(country)")
# Add new columns to lots table if they don't exist
cursor = conn.execute("PRAGMA table_info(lots)")
columns = {row[1] for row in cursor.fetchall()}
if 'starting_bid' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN starting_bid TEXT")
if 'minimum_bid' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN minimum_bid TEXT")
if 'status' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN status TEXT")
if 'brand' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN brand TEXT")
if 'model' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN model TEXT")
if 'attributes_json' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN attributes_json TEXT")
# Bidding intelligence fields
if 'first_bid_time' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN first_bid_time TEXT")
if 'last_bid_time' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN last_bid_time TEXT")
if 'bid_velocity' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN bid_velocity REAL")
if 'bid_increment' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN bid_increment REAL")
# Valuation intelligence fields
if 'year_manufactured' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN year_manufactured INTEGER")
if 'condition_score' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN condition_score REAL")
if 'condition_description' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN condition_description TEXT")
if 'serial_number' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN serial_number TEXT")
if 'manufacturer' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN manufacturer TEXT")
if 'damage_description' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN damage_description TEXT")
# NEW: High-value API fields
if 'followers_count' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN followers_count INTEGER DEFAULT 0")
if 'estimated_min_price' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN estimated_min_price REAL")
if 'estimated_max_price' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN estimated_max_price REAL")
if 'lot_condition' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN lot_condition TEXT")
if 'appearance' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN appearance TEXT")
if 'scraped_at_timestamp' not in columns:
conn.execute("ALTER TABLE lots ADD COLUMN scraped_at_timestamp INTEGER")
# Create bid_history table
# Bid history table
conn.execute("""
CREATE TABLE IF NOT EXISTS bid_history (
id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -163,33 +160,15 @@ class CacheManager:
FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
)
""")
conn.execute("""
CREATE INDEX IF NOT EXISTS idx_bid_history_lot_time
ON bid_history(lot_id, bid_time)
""")
conn.execute("""
CREATE INDEX IF NOT EXISTS idx_bid_history_bidder
ON bid_history(bidder_id)
""")
# Remove duplicates before creating unique index
# Keep the row with the smallest id (first occurrence) for each (lot_id, url) pair
conn.execute("""
DELETE FROM images
WHERE id NOT IN (
SELECT MIN(id)
FROM images
GROUP BY lot_id, url
)
""")
# Now create the unique index
conn.execute("""
CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_lot_url
ON images(lot_id, url)
""")
conn.commit()
def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]: