go
This commit is contained in:
@@ -9,8 +9,8 @@ services:
|
||||
- traefik_net
|
||||
environment:
|
||||
RATE_LIMIT_SECONDS: "0.5"
|
||||
MAX_PAGES: "50"
|
||||
DOWNLOAD_IMAGES: "False"
|
||||
MAX_PAGES: "500"
|
||||
DOWNLOAD_IMAGES: "True"
|
||||
volumes:
|
||||
- shared-auction-data:/mnt/okcomputer/output
|
||||
labels:
|
||||
|
||||
22
src/cache.py
22
src/cache.py
@@ -71,6 +71,23 @@ class CacheManager:
|
||||
FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
|
||||
)
|
||||
""")
|
||||
|
||||
# Remove duplicates before creating unique index
|
||||
# Keep the row with the smallest id (first occurrence) for each (lot_id, url) pair
|
||||
conn.execute("""
|
||||
DELETE FROM images
|
||||
WHERE id NOT IN (
|
||||
SELECT MIN(id)
|
||||
FROM images
|
||||
GROUP BY lot_id, url
|
||||
)
|
||||
""")
|
||||
|
||||
# Now create the unique index
|
||||
conn.execute("""
|
||||
CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_lot_url
|
||||
ON images(lot_id, url)
|
||||
""")
|
||||
conn.commit()
|
||||
|
||||
def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]:
|
||||
@@ -169,10 +186,11 @@ class CacheManager:
|
||||
conn.commit()
|
||||
|
||||
def save_images(self, lot_id: str, image_urls: List[str]):
|
||||
"""Save image URLs for a lot"""
|
||||
"""Save image URLs for a lot (prevents duplicates via unique constraint)"""
|
||||
with sqlite3.connect(self.db_path) as conn:
|
||||
for url in image_urls:
|
||||
conn.execute("""
|
||||
INSERT OR IGNORE INTO images (lot_id, url) VALUES (?, ?)
|
||||
INSERT OR IGNORE INTO images (lot_id, url, downloaded)
|
||||
VALUES (?, ?, 0)
|
||||
""", (lot_id, url))
|
||||
conn.commit()
|
||||
@@ -19,7 +19,7 @@ OUTPUT_DIR = "/mnt/okcomputer/output"
|
||||
IMAGES_DIR = "/mnt/okcomputer/output/images"
|
||||
RATE_LIMIT_SECONDS = 0.5 # EXACTLY 0.5 seconds between requests
|
||||
MAX_PAGES = 50 # Number of listing pages to crawl
|
||||
DOWNLOAD_IMAGES = False # Set to True to download images
|
||||
DOWNLOAD_IMAGES = True # Set to True to download images
|
||||
|
||||
# Setup directories
|
||||
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
|
||||
|
||||
92
verify_images.py
Normal file
92
verify_images.py
Normal file
@@ -0,0 +1,92 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Verification script to check image download status and duplicates
|
||||
Run this after deployment to verify the scraper is working correctly
|
||||
"""
|
||||
import sqlite3
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
DB_PATH = "/mnt/okcomputer/output/cache.db"
|
||||
|
||||
def verify_database(db_path=None):
    """Print a verification report for the ``images`` table of the cache DB.

    The report is written to stdout in four sections, in this order:
    download success rate, duplicate ``(lot_id, url)`` rows, the number of
    downloaded rows that carry a non-empty ``local_path``, and a sample of up
    to five downloaded rows with a marker showing whether each file exists
    on disk.

    Args:
        db_path: Path of the SQLite database to inspect. When omitted or
            ``None`` the module-level ``DB_PATH`` is used, so existing
            zero-argument callers keep working.

    Raises:
        SystemExit: With exit code 1 if the database file does not exist.
    """
    # Local import so this function does not depend on a change to the
    # file-level import block.
    import contextlib

    if db_path is None:
        db_path = DB_PATH

    if not Path(db_path).exists():
        print(f"❌ Database not found: {db_path}")
        sys.exit(1)

    # closing() guarantees the connection is released even if a query raises;
    # a bare connect()/close() pair leaks the handle on any exception.
    with contextlib.closing(sqlite3.connect(db_path)) as conn:
        print("=" * 60)
        print("IMAGE DOWNLOAD VERIFICATION")
        print("=" * 60)

        # Check download success rate.
        # SUM() over zero rows (and x / 0) yields NULL in SQLite, which would
        # arrive as None and break the ":," number formatting below, so every
        # aggregate is coalesced to a numeric zero.
        print("\n[*] Download Success Rate:")
        cursor = conn.execute("""
            SELECT
                COUNT(*) AS total_images,
                COALESCE(SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END), 0) AS downloaded,
                COALESCE(SUM(CASE WHEN downloaded = 0 THEN 1 ELSE 0 END), 0) AS failed,
                COALESCE(ROUND(100.0 * SUM(downloaded) / COUNT(*), 2), 0.0) AS success_rate
            FROM images
        """)
        total, downloaded, not_downloaded, success_rate = cursor.fetchone()
        print(f" Total images: {total:,}")
        print(f" Downloaded: {downloaded:,}")
        print(f" Not downloaded: {not_downloaded:,}")
        print(f" Success rate: {success_rate}%")

        # Check for duplicates.
        # NOTE: the query is capped at 5 groups, so the number reported below
        # is the number of groups shown, not necessarily the total.
        print("\n[*] Duplicate Check:")
        cursor = conn.execute("""
            SELECT lot_id, url, COUNT(*) AS dup_count
            FROM images
            GROUP BY lot_id, url
            HAVING COUNT(*) > 1
            LIMIT 5
        """)
        duplicates = cursor.fetchall()

        if duplicates:
            print(f" [!] Found {len(duplicates)} duplicate entries!")
            for lot_id, url, count in duplicates:
                # url may be NULL in a malformed row; slicing None would raise.
                print(f" {lot_id}: {(url or '')[:50]}... (x{count})")
        else:
            print(" [+] No duplicates found!")

        # Verify file system bookkeeping: downloaded rows that record a path.
        print("\n[*] File System Verification:")
        cursor = conn.execute("""
            SELECT COUNT(*)
            FROM images
            WHERE downloaded = 1
              AND local_path IS NOT NULL
              AND local_path != ''
        """)
        files_with_path = cursor.fetchone()[0]
        print(f" Images with local_path: {files_with_path:,}")

        # Sample some downloaded images and check that the files exist.
        # Empty paths are excluded (matching the count above) because
        # Path('') resolves to the current directory and would always be
        # reported as existing.
        print("\n[*] Sample Downloaded Images:")
        cursor = conn.execute("""
            SELECT lot_id, local_path
            FROM images
            WHERE downloaded = 1
              AND local_path IS NOT NULL
              AND local_path != ''
            LIMIT 5
        """)
        for lot_id, path in cursor.fetchall():
            marker = "[+]" if Path(path).exists() else "[!]"
            print(f" {marker} {lot_id}: {path}")

    print("\n" + "=" * 60)
    print("VERIFICATION COMPLETE")
    print("=" * 60)
|
||||
|
||||
if __name__ == "__main__":
|
||||
verify_database()
|
||||
Reference in New Issue
Block a user