This commit is contained in:
Tour
2025-12-06 21:27:11 +01:00
parent 21e97ada0d
commit 8c5f6016ec
4 changed files with 115 additions and 5 deletions

View File

@@ -9,8 +9,8 @@ services:
- traefik_net
environment:
RATE_LIMIT_SECONDS: "0.5"
MAX_PAGES: "50"
DOWNLOAD_IMAGES: "False"
MAX_PAGES: "500"
DOWNLOAD_IMAGES: "True"
volumes:
- shared-auction-data:/mnt/okcomputer/output
labels:

View File

@@ -71,6 +71,23 @@ class CacheManager:
FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
)
""")
# Remove duplicates before creating unique index
# Keep the row with the smallest id (first occurrence) for each (lot_id, url) pair
conn.execute("""
DELETE FROM images
WHERE id NOT IN (
SELECT MIN(id)
FROM images
GROUP BY lot_id, url
)
""")
# Now create the unique index
conn.execute("""
CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_lot_url
ON images(lot_id, url)
""")
conn.commit()
def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]:
@@ -169,10 +186,11 @@ class CacheManager:
conn.commit()
def save_images(self, lot_id: str, image_urls: List[str]):
"""Save image URLs for a lot"""
"""Save image URLs for a lot (prevents duplicates via unique constraint)"""
with sqlite3.connect(self.db_path) as conn:
for url in image_urls:
conn.execute("""
INSERT OR IGNORE INTO images (lot_id, url) VALUES (?, ?)
INSERT OR IGNORE INTO images (lot_id, url, downloaded)
VALUES (?, ?, 0)
""", (lot_id, url))
conn.commit()

View File

@@ -19,7 +19,7 @@ OUTPUT_DIR = "/mnt/okcomputer/output"
IMAGES_DIR = "/mnt/okcomputer/output/images"
RATE_LIMIT_SECONDS = 0.5 # EXACTLY 0.5 seconds between requests
MAX_PAGES = 50 # Number of listing pages to crawl
DOWNLOAD_IMAGES = False # Set to True to download images
DOWNLOAD_IMAGES = True # Set to True to download images
# Setup directories
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

92
verify_images.py Normal file
View File

@@ -0,0 +1,92 @@
#!/usr/bin/env python3
"""
Verification script to check image download status and duplicates
Run this after deployment to verify the scraper is working correctly
"""
import sqlite3
import sys
from pathlib import Path
# Location of the SQLite database that verify_database() opens and inspects.
DB_PATH = "/mnt/okcomputer/output/cache.db"
def verify_database(db_path=None):
    """Print a verification report about the ``images`` table of the cache DB.

    The report covers four things: download success rate, duplicate
    ``(lot_id, url)`` rows, how many downloaded rows carry a ``local_path``,
    and a small sample of downloaded rows with an on-disk existence check.

    Args:
        db_path: Path of the SQLite database to inspect. Defaults to the
            module-level ``DB_PATH`` when omitted, which keeps the original
            zero-argument call working.

    Raises:
        SystemExit: With exit status 1 when the database file does not exist.
    """
    target = DB_PATH if db_path is None else db_path
    if not Path(target).exists():
        print(f"❌ Database not found: {target}")
        sys.exit(1)

    conn = sqlite3.connect(target)
    # try/finally so the connection is released even if a query fails
    # (for example when the images table is missing).
    try:
        print("=" * 60)
        print("IMAGE DOWNLOAD VERIFICATION")
        print("=" * 60)

        # Check download success rate
        print("\n[*] Download Success Rate:")
        stats = conn.execute("""
            SELECT
                COUNT(*) as total_images,
                SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded,
                SUM(CASE WHEN downloaded = 0 THEN 1 ELSE 0 END) as failed,
                ROUND(100.0 * SUM(downloaded) / COUNT(*), 2) as success_rate
            FROM images
        """).fetchone()
        # On an empty table SUM() (and therefore the rate) is NULL; formatting
        # None with "{:,}" raises TypeError, so treat NULL as zero.
        total, downloaded, failed, success_rate = (
            0 if value is None else value for value in stats
        )
        print(f" Total images: {total:,}")
        print(f" Downloaded: {downloaded:,}")
        print(f" Not downloaded: {failed:,}")
        print(f" Success rate: {success_rate}%")

        # Check for duplicates
        print("\n[*] Duplicate Check:")
        # Count every duplicated (lot_id, url) pair separately: the listing
        # below is limited to 5 rows, so its length is not the real total.
        dup_total = conn.execute("""
            SELECT COUNT(*) FROM (
                SELECT 1
                FROM images
                GROUP BY lot_id, url
                HAVING COUNT(*) > 1
            )
        """).fetchone()[0]
        if dup_total:
            duplicates = conn.execute("""
                SELECT lot_id, url, COUNT(*) as dup_count
                FROM images
                GROUP BY lot_id, url
                HAVING COUNT(*) > 1
                LIMIT 5
            """).fetchall()
            print(f" [!] Found {dup_total} duplicate entries!")
            for lot_id, url, count in duplicates:
                # url may be NULL in the table; slicing None would crash.
                print(f" {lot_id}: {(url or '')[:50]}... (x{count})")
            if dup_total > len(duplicates):
                print(f" ... showing first {len(duplicates)} only")
        else:
            print(" [+] No duplicates found!")

        # Verify file system
        print("\n[*] File System Verification:")
        files_with_path = conn.execute("""
            SELECT COUNT(*)
            FROM images
            WHERE downloaded = 1
            AND local_path IS NOT NULL
            AND local_path != ''
        """).fetchone()[0]
        print(f" Images with local_path: {files_with_path:,}")

        # Sample some downloaded images
        print("\n[*] Sample Downloaded Images:")
        # Empty paths are excluded (matching the count above): Path('') is
        # the current directory and would be reported as an existing file.
        samples = conn.execute("""
            SELECT lot_id, local_path
            FROM images
            WHERE downloaded = 1
            AND local_path IS NOT NULL
            AND local_path != ''
            LIMIT 5
        """).fetchall()
        for lot_id, path in samples:
            exists = "[+]" if Path(path).exists() else "[!]"
            print(f" {exists} {lot_id}: {path}")
    finally:
        conn.close()

    print("\n" + "=" * 60)
    print("VERIFICATION COMPLETE")
    print("=" * 60)
# Script entry point: run the verification report only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    verify_database()