This commit is contained in:
Tour
2025-12-06 21:27:11 +01:00
parent 21e97ada0d
commit 8c5f6016ec
4 changed files with 115 additions and 5 deletions

92
verify_images.py Normal file
View File

@@ -0,0 +1,92 @@
#!/usr/bin/env python3
"""
Verification script that checks image download status and duplicate rows.
Run this after deployment to verify that the scraper is working correctly.
"""
import sqlite3
import sys
from pathlib import Path
# Filesystem path of the SQLite database file that verify_database() opens.
DB_PATH = "/mnt/okcomputer/output/cache.db"
def verify_database(db_path=None) -> None:
    """Print a verification report for the ``images`` table of the database.

    Four checks are printed to stdout:

    1. Download success rate (total / downloaded / not downloaded / percent).
    2. Up to five ``(lot_id, url)`` pairs that occur more than once.
    3. Number of downloaded rows that have a non-empty ``local_path``.
    4. Up to five downloaded rows, each flagged ``[+]`` or ``[!]`` depending
       on whether its ``local_path`` exists on disk.

    Args:
        db_path: Path of the SQLite database to inspect. When omitted or
            ``None`` the module-level ``DB_PATH`` is used.

    Raises:
        SystemExit: With exit code 1 when the database file does not exist.
        sqlite3.Error: Propagated unchanged when a query fails (for example
            when the ``images`` table is missing); the connection is closed
            before the exception leaves this function.
    """
    if db_path is None:
        db_path = DB_PATH

    # sqlite3.connect() would silently create a missing file, so check first.
    if not Path(db_path).exists():
        print(f"❌ Database not found: {db_path}")
        sys.exit(1)

    conn = sqlite3.connect(db_path)
    try:
        print("=" * 60)
        print("IMAGE DOWNLOAD VERIFICATION")
        print("=" * 60)

        # Check download success rate
        print("\n[*] Download Success Rate:")
        cursor = conn.execute("""
            SELECT
                COUNT(*) as total_images,
                SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded,
                SUM(CASE WHEN downloaded = 0 THEN 1 ELSE 0 END) as failed,
                ROUND(100.0 * SUM(downloaded) / COUNT(*), 2) as success_rate
            FROM images
        """)
        total, downloaded, failed, success_rate = cursor.fetchone()
        # SUM() over zero rows yields NULL (None); report zeros instead of
        # crashing in the ``:,`` format spec.
        downloaded = downloaded or 0
        failed = failed or 0
        if success_rate is None:
            success_rate = 0.0
        print(f" Total images: {total:,}")
        print(f" Downloaded: {downloaded:,}")
        print(f" Not downloaded: {failed:,}")
        print(f" Success rate: {success_rate}%")

        # Check for duplicates (the query returns at most five groups, so
        # the count printed below is capped at five as well).
        print("\n[*] Duplicate Check:")
        cursor = conn.execute("""
            SELECT lot_id, url, COUNT(*) as dup_count
            FROM images
            GROUP BY lot_id, url
            HAVING COUNT(*) > 1
            LIMIT 5
        """)
        duplicates = cursor.fetchall()
        if duplicates:
            print(f" [!] Found {len(duplicates)} duplicate entries!")
            for lot_id, url, count in duplicates:
                # GROUP BY puts NULL urls into one group; None is not
                # sliceable, so substitute a placeholder.
                url_preview = url[:50] if url else "<no url>"
                print(f" {lot_id}: {url_preview}... (x{count})")
        else:
            print(" [+] No duplicates found!")

        # Verify file system
        print("\n[*] File System Verification:")
        cursor = conn.execute("""
            SELECT COUNT(*)
            FROM images
            WHERE downloaded = 1
            AND local_path IS NOT NULL
            AND local_path != ''
        """)
        files_with_path = cursor.fetchone()[0]
        print(f" Images with local_path: {files_with_path:,}")

        # Sample some downloaded images. Empty paths are excluded, matching
        # the count above: Path('') is the current directory and would
        # always be reported as existing.
        print("\n[*] Sample Downloaded Images:")
        cursor = conn.execute("""
            SELECT lot_id, local_path
            FROM images
            WHERE downloaded = 1
            AND local_path IS NOT NULL
            AND local_path != ''
            LIMIT 5
        """)
        for lot_id, path in cursor.fetchall():
            exists = "[+]" if Path(path).exists() else "[!]"
            print(f" {exists} {lot_id}: {path}")
    finally:
        # Release the database handle even when a query or print fails.
        conn.close()

    print("\n" + "=" * 60)
    print("VERIFICATION COMPLETE")
    print("=" * 60)
# Script entry point: run the verification only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    verify_database()