commit 8c5f6016ec (parent 21e97ada0d)
Author: Tour
Date:   2025-12-06 21:27:11 +01:00

4 changed files with 115 additions and 5 deletions


@@ -71,6 +71,23 @@ class CacheManager:
                     FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
                 )
             """)
+
+            # Remove duplicates before creating unique index
+            # Keep the row with the smallest id (first occurrence) for each (lot_id, url) pair
+            conn.execute("""
+                DELETE FROM images
+                WHERE id NOT IN (
+                    SELECT MIN(id)
+                    FROM images
+                    GROUP BY lot_id, url
+                )
+            """)
+
+            # Now create the unique index
+            conn.execute("""
+                CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_lot_url
+                ON images(lot_id, url)
+            """)
             conn.commit()

     def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]:
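The order of the two new statements matters: `CREATE UNIQUE INDEX` raises an integrity error if duplicate `(lot_id, url)` pairs already exist, so the `DELETE` must run first. A minimal standalone sketch of the same two statements against a throwaway in-memory table (schema simplified; the real table's other columns are omitted):

```python
import sqlite3

# Throwaway table with one duplicate (lot_id, url) pair.
conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE images (id INTEGER PRIMARY KEY, lot_id TEXT, url TEXT)")
conn.executemany(
    "INSERT INTO images (lot_id, url) VALUES (?, ?)",
    [("lot-1", "a.jpg"), ("lot-1", "a.jpg"), ("lot-1", "b.jpg"), ("lot-2", "a.jpg")],
)

# Same dedup query as the migration: keep the smallest id per (lot_id, url).
conn.execute("""
    DELETE FROM images
    WHERE id NOT IN (SELECT MIN(id) FROM images GROUP BY lot_id, url)
""")
# Only now is the unique index guaranteed to build without an IntegrityError.
conn.execute("CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_lot_url ON images(lot_id, url)")

print(conn.execute("SELECT COUNT(*) FROM images").fetchone()[0])  # 3 (duplicate removed)
```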
@@ -169,10 +186,11 @@ class CacheManager:
             conn.commit()

     def save_images(self, lot_id: str, image_urls: List[str]):
-        """Save image URLs for a lot"""
+        """Save image URLs for a lot (prevents duplicates via unique constraint)"""
         with sqlite3.connect(self.db_path) as conn:
             for url in image_urls:
                 conn.execute("""
-                    INSERT OR IGNORE INTO images (lot_id, url) VALUES (?, ?)
+                    INSERT OR IGNORE INTO images (lot_id, url, downloaded)
+                    VALUES (?, ?, 0)
                 """, (lot_id, url))
             conn.commit()
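With the unique index in place, `INSERT OR IGNORE` makes `save_images` idempotent: re-saving the same URLs is a silent no-op rather than an `sqlite3.IntegrityError`. A small sketch of that behavior against a simplified version of the schema:

```python
import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("""
    CREATE TABLE images (
        id INTEGER PRIMARY KEY,
        lot_id TEXT,
        url TEXT,
        downloaded INTEGER DEFAULT 0
    )
""")
conn.execute("CREATE UNIQUE INDEX idx_unique_lot_url ON images(lot_id, url)")

# Saving the same (lot_id, url) twice: the second insert is silently skipped.
for _ in range(2):
    conn.execute(
        "INSERT OR IGNORE INTO images (lot_id, url, downloaded) VALUES (?, ?, 0)",
        ("lot-1", "a.jpg"),
    )

print(conn.execute("SELECT COUNT(*) FROM images").fetchone()[0])  # 1
```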


@@ -19,7 +19,7 @@ OUTPUT_DIR = "/mnt/okcomputer/output"
 IMAGES_DIR = "/mnt/okcomputer/output/images"
 RATE_LIMIT_SECONDS = 0.5  # EXACTLY 0.5 seconds between requests
 MAX_PAGES = 50  # Number of listing pages to crawl
-DOWNLOAD_IMAGES = False  # Set to True to download images
+DOWNLOAD_IMAGES = True  # Set to True to download images

 # Setup directories
 Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
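Flipping `DOWNLOAD_IMAGES` to `True` only takes effect wherever the crawler checks it, and that code is not part of this diff. A hedged sketch of what such a consumer might look like, using the constants above (`download_image` is a hypothetical helper, not from the source):

```python
import time
from pathlib import Path
from urllib.parse import urlparse

import requests

def download_image(url: str, dest_dir: str = IMAGES_DIR) -> None:
    """Hypothetical downloader gated by DOWNLOAD_IMAGES; assumed, not from the diff."""
    if not DOWNLOAD_IMAGES:  # flipped to True by this commit
        return
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    # Derive the filename from the URL path (naive; ignores query strings).
    name = Path(urlparse(url).path).name
    (Path(dest_dir) / name).write_bytes(resp.content)
    time.sleep(RATE_LIMIT_SECONDS)  # keep EXACTLY 0.5 s between requests
```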