go
This commit is contained in:
92
verify_images.py
Normal file
92
verify_images.py
Normal file
@@ -0,0 +1,92 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Verification script to check image download status and duplicates
|
||||
Run this after deployment to verify the scraper is working correctly
|
||||
"""
|
||||
import sqlite3
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Filesystem path of the SQLite database file that verify_database() opens.
DB_PATH = "/mnt/okcomputer/output/cache.db"
|
||||
|
||||
def verify_database():
    """Print a verification report for the ``images`` table in ``DB_PATH``.

    The report is written to stdout and has four sections:

    1. Download success rate (total / downloaded / not downloaded / percent).
    2. Duplicate check on ``(lot_id, url)`` pairs (at most 5 groups listed).
    3. Count of downloaded rows that have a non-empty ``local_path``.
    4. Up to 5 sample downloaded rows, each flagged ``[+]`` if its
       ``local_path`` exists on disk and ``[!]`` otherwise.

    If the database file does not exist, an error line is printed and the
    process exits with status 1 via ``sys.exit``.

    Returns:
        None

    Raises:
        sqlite3.Error: if a query fails (for example, the ``images`` table
            is missing). The connection is closed before the error
            propagates.
    """
    if not Path(DB_PATH).exists():
        print(f"❌ Database not found: {DB_PATH}")
        sys.exit(1)

    conn = sqlite3.connect(DB_PATH)
    # try/finally guarantees the connection is released even when one of
    # the queries below raises.
    try:
        print("=" * 60)
        print("IMAGE DOWNLOAD VERIFICATION")
        print("=" * 60)

        # Check download success rate
        print("\n[*] Download Success Rate:")
        cursor = conn.execute("""
            SELECT
                COUNT(*) as total_images,
                SUM(CASE WHEN downloaded = 1 THEN 1 ELSE 0 END) as downloaded,
                SUM(CASE WHEN downloaded = 0 THEN 1 ELSE 0 END) as failed,
                ROUND(100.0 * SUM(downloaded) / COUNT(*), 2) as success_rate
            FROM images
        """)
        total, downloaded, failed, success_rate = cursor.fetchone()
        # SUM() yields NULL (None) over an empty table, and None cannot be
        # formatted with ",": fall back to 0 for the counters.
        downloaded = downloaded or 0
        failed = failed or 0
        # The rate is NULL when there are no rows (or no non-NULL flags);
        # report that explicitly instead of printing "None%".
        rate_text = "n/a" if success_rate is None else f"{success_rate}%"
        print(f" Total images: {total:,}")
        print(f" Downloaded: {downloaded:,}")
        print(f" Not downloaded: {failed:,}")
        print(f" Success rate: {rate_text}")

        # Check for duplicates
        print("\n[*] Duplicate Check:")
        cursor = conn.execute("""
            SELECT lot_id, url, COUNT(*) as dup_count
            FROM images
            GROUP BY lot_id, url
            HAVING COUNT(*) > 1
            LIMIT 5
        """)
        duplicates = cursor.fetchall()

        if duplicates:
            # NOTE: the query is capped by LIMIT 5, so this count is the
            # number of groups listed, not necessarily every duplicate.
            print(f" [!] Found {len(duplicates)} duplicate entries!")
            for lot_id, url, count in duplicates:
                # GROUP BY puts NULL urls in one group; guard the slice.
                url_text = url if url is not None else ""
                print(f" {lot_id}: {url_text[:50]}... (x{count})")
        else:
            print(" [+] No duplicates found!")

        # Verify file system
        print("\n[*] File System Verification:")
        cursor = conn.execute("""
            SELECT COUNT(*)
            FROM images
            WHERE downloaded = 1
              AND local_path IS NOT NULL
              AND local_path != ''
        """)
        files_with_path = cursor.fetchone()[0]
        print(f" Images with local_path: {files_with_path:,}")

        # Sample some downloaded images. Empty paths are excluded, matching
        # the count query above: Path('') resolves to the current directory
        # and would otherwise be reported as an existing file.
        print("\n[*] Sample Downloaded Images:")
        cursor = conn.execute("""
            SELECT lot_id, local_path
            FROM images
            WHERE downloaded = 1
              AND local_path IS NOT NULL
              AND local_path != ''
            LIMIT 5
        """)
        for lot_id, path in cursor.fetchall():
            exists = "[+]" if Path(path).exists() else "[!]"
            print(f" {exists} {lot_id}: {path}")
    finally:
        conn.close()

    print("\n" + "=" * 60)
    print("VERIFICATION COMPLETE")
    print("=" * 60)
|
||||
|
||||
# Script entry point: produce the verification report only when this file
# is executed directly, not when it is imported as a module.
if __name__ == "__main__":
    verify_database()
|
||||
Reference in New Issue
Block a user