src/cache.py | 22
@@ -71,6 +71,23 @@ class CacheManager:
                     FOREIGN KEY (lot_id) REFERENCES lots(lot_id)
                 )
             """)
+
+            # Remove duplicates before creating unique index
+            # Keep the row with the smallest id (first occurrence) for each (lot_id, url) pair
+            conn.execute("""
+                DELETE FROM images
+                WHERE id NOT IN (
+                    SELECT MIN(id)
+                    FROM images
+                    GROUP BY lot_id, url
+                )
+            """)
+
+            # Now create the unique index
+            conn.execute("""
+                CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_lot_url
+                ON images(lot_id, url)
+            """)
             conn.commit()
 
     def get(self, url: str, max_age_hours: int = 24) -> Optional[Dict]:
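The migration above deduplicates before indexing: the DELETE keeps only the row with the smallest id for each (lot_id, url) pair, so the CREATE UNIQUE INDEX that follows cannot fail on pre-existing duplicates. A minimal standalone sketch of the same pattern against an in-memory SQLite database (the id/lot_id/url/downloaded schema here is an illustrative stand-in, not the project's real CREATE TABLE):

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute(
    "CREATE TABLE images (id INTEGER PRIMARY KEY, lot_id TEXT, url TEXT, downloaded INTEGER DEFAULT 0)"
)
# Seed a duplicate (lot_id, url) pair plus one unique row
conn.executemany(
    "INSERT INTO images (lot_id, url) VALUES (?, ?)",
    [("lot-1", "a.jpg"), ("lot-1", "a.jpg"), ("lot-2", "b.jpg")],
)

# Same two steps as the diff: drop later duplicates, then enforce uniqueness going forward
conn.execute("""
    DELETE FROM images
    WHERE id NOT IN (
        SELECT MIN(id)
        FROM images
        GROUP BY lot_id, url
    )
""")
conn.execute("CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_lot_url ON images(lot_id, url)")
conn.commit()

print(conn.execute("SELECT COUNT(*) FROM images").fetchone()[0])  # prints 2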
@@ -169,10 +186,11 @@ class CacheManager:
             conn.commit()
 
     def save_images(self, lot_id: str, image_urls: List[str]):
-        """Save image URLs for a lot"""
+        """Save image URLs for a lot (prevents duplicates via unique constraint)"""
         with sqlite3.connect(self.db_path) as conn:
             for url in image_urls:
                 conn.execute("""
-                    INSERT OR IGNORE INTO images (lot_id, url) VALUES (?, ?)
+                    INSERT OR IGNORE INTO images (lot_id, url, downloaded)
+                    VALUES (?, ?, 0)
                 """, (lot_id, url))
             conn.commit()
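With idx_unique_lot_url in place, INSERT OR IGNORE makes save_images idempotent: re-saving the same URL list inserts nothing new, and the downloaded column now starts at 0 for every new row. A hedged, self-contained sketch of that behavior (a plain function against an in-memory database, not the actual CacheManager class):

import sqlite3
from typing import List

def save_images(conn: sqlite3.Connection, lot_id: str, image_urls: List[str]) -> None:
    # Duplicate (lot_id, url) pairs are silently skipped thanks to the unique index
    for url in image_urls:
        conn.execute("""
            INSERT OR IGNORE INTO images (lot_id, url, downloaded)
            VALUES (?, ?, 0)
        """, (lot_id, url))
    conn.commit()

conn = sqlite3.connect(":memory:")
conn.execute("CREATE TABLE images (id INTEGER PRIMARY KEY, lot_id TEXT, url TEXT, downloaded INTEGER)")
conn.execute("CREATE UNIQUE INDEX idx_unique_lot_url ON images(lot_id, url)")

save_images(conn, "lot-1", ["a.jpg", "b.jpg"])
save_images(conn, "lot-1", ["a.jpg", "b.jpg"])  # second call is a no-op
print(conn.execute("SELECT COUNT(*) FROM images").fetchone()[0])  # prints 2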
@@ -19,7 +19,7 @@ OUTPUT_DIR = "/mnt/okcomputer/output"
 IMAGES_DIR = "/mnt/okcomputer/output/images"
 RATE_LIMIT_SECONDS = 0.5  # EXACTLY 0.5 seconds between requests
 MAX_PAGES = 50  # Number of listing pages to crawl
-DOWNLOAD_IMAGES = False  # Set to True to download images
+DOWNLOAD_IMAGES = True  # Set to True to download images
 
 # Setup directories
 Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
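The config hunk flips DOWNLOAD_IMAGES on; the downloading code itself is not part of this diff, so the following is only a hypothetical sketch of how such a flag is typically consumed (maybe_download, the filename scheme, and the requests dependency are assumptions, not code from this repo):

from pathlib import Path

import requests  # assumed dependency; not shown in this diff

IMAGES_DIR = "/mnt/okcomputer/output/images"
DOWNLOAD_IMAGES = True

def maybe_download(lot_id: str, url: str) -> None:
    # Skip all network work when the flag is off
    if not DOWNLOAD_IMAGES:
        return
    Path(IMAGES_DIR).mkdir(parents=True, exist_ok=True)
    target = Path(IMAGES_DIR) / f"{lot_id}_{Path(url).name}"
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    target.write_bytes(resp.content)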