diff --git a/docker-compose.yml b/docker-compose.yml
index 65e4581..24d1445 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -9,8 +9,8 @@ services:
       - traefik_net
     environment:
       RATE_LIMIT_SECONDS: "0.5"
-      MAX_PAGES: "50"
-      DOWNLOAD_IMAGES: "False"
+      MAX_PAGES: "500"
+      DOWNLOAD_IMAGES: "True"
     volumes:
       - shared-auction-data:/mnt/okcomputer/output
     labels:
diff --git a/src/cache.py b/src/cache.py
index 948311d..340f725 100644
--- a/src/cache.py
+++ b/src/cache.py
@@ -71,6 +71,23 @@ class CacheManager:
                     FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
                 )
             """)
+
+            # Remove duplicates before creating unique index
+            # Keep the row with the smallest id (first occurrence) for each (lot_id, url) pair
+            conn.execute("""
+                DELETE FROM images
+                WHERE id NOT IN (
+                    SELECT MIN(id)
+                    FROM images
+                    GROUP BY lot_id, url
+                )
+            """)
+
+            # Now create the unique index
+            conn.execute("""
+                CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_lot_url
+                ON images(lot_id, url)
+            """)
             conn.commit()
 
     def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]:
@@ -169,10 +186,11 @@ class CacheManager:
             conn.commit()
 
     def save_images(self, lot_id: str, image_urls: List[str]):
-        """Save image URLs for a lot"""
+        """Save image URLs for a lot (prevents duplicates via unique constraint)"""
         with sqlite3.connect(self.db_path) as conn:
             for url in image_urls:
                 conn.execute("""
-                    INSERT OR IGNORE INTO images (lot_id, url) VALUES (?, ?)
+                    INSERT OR IGNORE INTO images (lot_id, url, downloaded)
+                    VALUES (?, ?, 0)
                 """, (lot_id, url))
             conn.commit()
\ No newline at end of file
diff --git a/src/config.py b/src/config.py
index 07c4786..4c3bb7e 100644
--- a/src/config.py
+++ b/src/config.py
@@ -19,7 +19,7 @@ OUTPUT_DIR = "/mnt/okcomputer/output"
 IMAGES_DIR = "/mnt/okcomputer/output/images"
 RATE_LIMIT_SECONDS = 0.5  # EXACTLY 0.5 seconds between requests
 MAX_PAGES = 50  # Number of listing pages to crawl
-DOWNLOAD_IMAGES = False  # Set to True to download images
+DOWNLOAD_IMAGES = True  # Set to True to download images
 
 # Setup directories
 Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
diff --git a/verify_images.py b/verify_images.py
new file mode 100644
index 0000000..c93064b
--- /dev/null
+++ b/verify_images.py
@@ -0,0 +1,92 @@
+#!/usr/bin/env python3
+"""
+Verification script to check image download status and duplicates
+Run this after deployment to verify the scraper is working correctly
+"""
+import sqlite3
+import sys
+from pathlib import Path
+
+DB_PATH = "/mnt/okcomputer/output/cache.db"
+
+def verify_database():
+    """Run verification queries on the database"""
+
+    if not Path(DB_PATH).exists():
+        print(f"❌ Database not found: {DB_PATH}")
+        sys.exit(1)
+
+    conn = sqlite3.connect(DB_PATH)
+
+    print("=" * 60)
+    print("IMAGE DOWNLOAD VERIFICATION")
+    print("=" * 60)
+
+    # Check download success rate
+    print("\n[*] Download Success Rate:")
+    cursor = conn.execute("""
+        SELECT
+            COUNT(*) as total_images,
+            SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded,
+            SUM(CASE WHEN downloaded = 0 THEN 1 ELSE 0 END) as failed,
+            ROUND(100.0 * SUM(downloaded) / COUNT(*), 2) as success_rate
+        FROM images
+    """)
+    row = cursor.fetchone()
+    print(f"    Total images:   {row[0]:,}")
+    print(f"    Downloaded:     {row[1]:,}")
+    print(f"    Not downloaded: {row[2]:,}")
+    print(f"    Success rate:   {row[3]}%")
+
+    # Check for duplicates
+    print("\n[*] Duplicate Check:")
+    cursor = conn.execute("""
+        SELECT lot_id, url, COUNT(*) as dup_count
+        FROM images
+        GROUP BY lot_id, url
+        HAVING COUNT(*) > 1
+        LIMIT 5
+    """)
+    duplicates = cursor.fetchall()
+
+    if duplicates:
+        print(f"    [!] Found {len(duplicates)} duplicate entries!")
+        for lot_id, url, count in duplicates:
+            print(f"        {lot_id}: {url[:50]}... (x{count})")
+    else:
+        print("    [+] No duplicates found!")
+
+    # Verify file system
+    print("\n[*] File System Verification:")
+    cursor = conn.execute("""
+        SELECT COUNT(*)
+        FROM images
+        WHERE downloaded = 1
+          AND local_path IS NOT NULL
+          AND local_path != ''
+    """)
+    files_with_path = cursor.fetchone()[0]
+    print(f"    Images with local_path: {files_with_path:,}")
+
+    # Sample some downloaded images
+    print("\n[*] Sample Downloaded Images:")
+    cursor = conn.execute("""
+        SELECT lot_id, local_path
+        FROM images
+        WHERE downloaded = 1
+          AND local_path IS NOT NULL
+        LIMIT 5
+    """)
+    samples = cursor.fetchall()
+    for lot_id, path in samples:
+        exists = "[+]" if Path(path).exists() else "[!]"
+        print(f"    {exists} {lot_id}: {path}")
+
+    conn.close()
+
+    print("\n" + "=" * 60)
+    print("VERIFICATION COMPLETE")
+    print("=" * 60)
+
+if __name__ == "__main__":
+    verify_database()
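
The checks in verify_images.py assume that the download step flags each row in the images table once a file has been saved, i.e. it sets downloaded = 1 and a non-empty local_path. That code is not part of this patch; the sketch below is a minimal illustration of the expected update, where the mark_downloaded helper name and its call site are assumptions and only the column names come from the queries above.

    import sqlite3

    def mark_downloaded(db_path: str, lot_id: str, url: str, local_path: str) -> None:
        # Hypothetical helper: record a successful download so that
        # verify_images.py sees downloaded = 1 and a usable local_path.
        with sqlite3.connect(db_path) as conn:
            conn.execute(
                "UPDATE images SET downloaded = 1, local_path = ? "
                "WHERE lot_id = ? AND url = ?",
                (local_path, lot_id, url),
            )
            conn.commit()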