93 lines
2.6 KiB
Python
93 lines
2.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Verification script to check image download status and duplicates.
Run this after deployment to verify the scraper is working correctly.
|
|
"""
|
|
import sqlite3
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Path of the SQLite database file that verify_database() opens and queries.
DB_PATH = "/mnt/okcomputer/output/cache.db"
|
|
|
|
def verify_database(db_path=None):
    """Print a verification report for the scraper's image database.

    Queries the ``images`` table and prints four sections: the download
    success rate, duplicate ``(lot_id, url)`` rows, the number of downloaded
    rows that carry a ``local_path``, and a small sample of downloaded rows
    with an on-disk existence check for each path.

    Args:
        db_path: Path of the SQLite database to inspect. When omitted (or
            ``None``) the module-level ``DB_PATH`` is used.

    Raises:
        SystemExit: With exit code 1 when the database file does not exist.
    """
    if db_path is None:
        db_path = DB_PATH

    if not Path(db_path).exists():
        print(f"❌ Database not found: {db_path}")
        sys.exit(1)

    conn = sqlite3.connect(db_path)
    # sqlite3's connection context manager only commits/rolls back; it does
    # not close, so guarantee the close with try/finally instead.
    try:
        print("=" * 60)
        print("IMAGE DOWNLOAD VERIFICATION")
        print("=" * 60)

        # Check download success rate
        print("\n[*] Download Success Rate:")
        # SUM() over zero rows yields NULL, so COALESCE keeps the counts
        # numeric (and formattable with ",") even when the table is empty.
        total, downloaded, failed, success_rate = conn.execute("""
            SELECT
                COUNT(*) as total_images,
                COALESCE(SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END), 0) as downloaded,
                COALESCE(SUM(CASE WHEN downloaded = 0 THEN 1 ELSE 0 END), 0) as failed,
                ROUND(100.0 * SUM(downloaded) / COUNT(*), 2) as success_rate
            FROM images
        """).fetchone()
        print(f" Total images: {total:,}")
        print(f" Downloaded: {downloaded:,}")
        print(f" Not downloaded: {failed:,}")
        if success_rate is None:
            # Empty table (division by zero -> NULL): no meaningful rate.
            print(" Success rate: n/a")
        else:
            print(f" Success rate: {success_rate}%")

        # Check for duplicates
        print("\n[*] Duplicate Check:")
        duplicates = conn.execute("""
            SELECT lot_id, url, COUNT(*) as dup_count
            FROM images
            GROUP BY lot_id, url
            HAVING COUNT(*) > 1
            LIMIT 5
        """).fetchall()

        if duplicates:
            # The sample above is capped at 5 rows, so count the duplicated
            # groups separately to report the real total.
            dup_total = conn.execute("""
                SELECT COUNT(*)
                FROM (
                    SELECT 1
                    FROM images
                    GROUP BY lot_id, url
                    HAVING COUNT(*) > 1
                )
            """).fetchone()[0]
            print(f" [!] Found {dup_total:,} duplicate entries!")
            if dup_total > len(duplicates):
                print(f" (showing first {len(duplicates)})")
            for lot_id, url, count in duplicates:
                # GROUP BY puts NULL urls in one group, so url may be None.
                url_preview = (url or "")[:50]
                print(f" {lot_id}: {url_preview}... (x{count})")
        else:
            print(" [+] No duplicates found!")

        # Verify file system
        print("\n[*] File System Verification:")
        files_with_path = conn.execute("""
            SELECT COUNT(*)
            FROM images
            WHERE downloaded = 1
            AND local_path IS NOT NULL
            AND local_path != ''
        """).fetchone()[0]
        print(f" Images with local_path: {files_with_path:,}")

        # Sample some downloaded images
        print("\n[*] Sample Downloaded Images:")
        # Same filter as the count above: an empty local_path would resolve
        # to the current directory and be reported as an existing file.
        samples = conn.execute("""
            SELECT lot_id, local_path
            FROM images
            WHERE downloaded = 1
            AND local_path IS NOT NULL
            AND local_path != ''
            LIMIT 5
        """).fetchall()
        for lot_id, path in samples:
            exists = "[+]" if Path(path).exists() else "[!]"
            print(f" {exists} {lot_id}: {path}")
    finally:
        conn.close()

    print("\n" + "=" * 60)
    print("VERIFICATION COMPLETE")
    print("=" * 60)
|
|
|
|
# Run the verification report only when executed as a script, not on import.
if __name__ == "__main__":
    verify_database()
|