Files
scaev/verify_images.py
Tour 8c5f6016ec go
2025-12-06 21:27:11 +01:00

93 lines
2.6 KiB
Python

#!/usr/bin/env python3
"""
Verification script to check image download status and duplicates
Run this after deployment to verify the scraper is working correctly
"""
import sqlite3
import sys
from pathlib import Path
# Location of the SQLite database that verify_database() opens and queries.
DB_PATH = "/mnt/okcomputer/output/cache.db"
def _report_success_rate(conn):
    """Print total, downloaded and not-downloaded image counts plus the success rate."""
    print("\n[*] Download Success Rate:")
    total, downloaded, failed, rate = conn.execute("""
        SELECT
            COUNT(*) as total_images,
            SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded,
            SUM(CASE WHEN downloaded = 0 THEN 1 ELSE 0 END) as failed,
            ROUND(100.0 * SUM(downloaded) / COUNT(*), 2) as success_rate
        FROM images
    """).fetchone()
    # On an empty table SUM() (and therefore the rate) comes back as NULL/None;
    # formatting None with ":," raises TypeError, so normalise before printing.
    downloaded = downloaded or 0
    failed = failed or 0
    rate_text = "n/a" if rate is None else f"{rate}%"
    print(f" Total images: {total:,}")
    print(f" Downloaded: {downloaded:,}")
    print(f" Not downloaded: {failed:,}")
    print(f" Success rate: {rate_text}")


def _report_duplicates(conn, limit=5):
    """Print how many (lot_id, url) pairs occur more than once and list up to *limit* of them."""
    print("\n[*] Duplicate Check:")
    # Count every duplicated pair separately: the listing query below is
    # capped by LIMIT, so its length is not the real number of duplicates.
    dup_total = conn.execute("""
        SELECT COUNT(*) FROM (
            SELECT 1
            FROM images
            GROUP BY lot_id, url
            HAVING COUNT(*) > 1
        )
    """).fetchone()[0]
    if not dup_total:
        print(" [+] No duplicates found!")
        return

    shown = conn.execute("""
        SELECT lot_id, url, COUNT(*) as dup_count
        FROM images
        GROUP BY lot_id, url
        HAVING COUNT(*) > 1
        LIMIT ?
    """, (limit,)).fetchall()
    print(f" [!] Found {dup_total} duplicate entries!")
    for lot_id, url, count in shown:
        # url may be NULL in the database; slicing None would raise TypeError.
        print(f" {lot_id}: {(url or '')[:50]}... (x{count})")
    if dup_total > len(shown):
        print(f" ... and {dup_total - len(shown)} more")


def _report_local_paths(conn):
    """Print how many downloaded images have a non-empty local_path recorded."""
    print("\n[*] File System Verification:")
    files_with_path = conn.execute("""
        SELECT COUNT(*)
        FROM images
        WHERE downloaded = 1
        AND local_path IS NOT NULL
        AND local_path != ''
    """).fetchone()[0]
    print(f" Images with local_path: {files_with_path:,}")


def _report_samples(conn, limit=5):
    """Print up to *limit* downloaded images and whether each local_path exists on disk."""
    print("\n[*] Sample Downloaded Images:")
    # Empty paths are excluded: Path('') resolves to the current directory,
    # whose existence would wrongly mark the image as present.
    samples = conn.execute("""
        SELECT lot_id, local_path
        FROM images
        WHERE downloaded = 1
        AND local_path IS NOT NULL
        AND local_path != ''
        LIMIT ?
    """, (limit,)).fetchall()
    for lot_id, path in samples:
        marker = "[+]" if Path(path).exists() else "[!]"
        print(f" {marker} {lot_id}: {path}")


def verify_database(db_path=None):
    """Run verification queries on the database and print a report.

    The report covers the download success rate, duplicate (lot_id, url)
    rows, the number of downloaded images with a local_path, and a small
    sample of downloaded images checked against the file system.

    Args:
        db_path: Path of the SQLite database to inspect. Defaults to the
            module-level DB_PATH when omitted.

    Raises:
        SystemExit: With exit code 1 when the database file does not exist.
        sqlite3.Error: If a query fails (e.g. the images table is missing).
    """
    if db_path is None:
        db_path = DB_PATH

    if not Path(db_path).exists():
        print(f"❌ Database not found: {db_path}")
        sys.exit(1)

    conn = sqlite3.connect(db_path)
    try:
        print("=" * 60)
        print("IMAGE DOWNLOAD VERIFICATION")
        print("=" * 60)
        _report_success_rate(conn)
        _report_duplicates(conn)
        _report_local_paths(conn)
        _report_samples(conn)
    finally:
        # Release the connection even when a query or print raises.
        conn.close()

    print("\n" + "=" * 60)
    print("VERIFICATION COMPLETE")
    print("=" * 60)
# Script entry point: produce the verification report only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    verify_database()