GraphQL integration, data correctness
This commit is contained in:
@@ -32,15 +32,14 @@ class TroostwijkScraper:
|
||||
self.last_request_time = 0
|
||||
self.download_images = DOWNLOAD_IMAGES
|
||||
|
||||
async def _download_image(self, url: str, lot_id: str, index: int) -> Optional[str]:
|
||||
"""Download an image and save it locally"""
|
||||
async def _download_image(self, session: 'aiohttp.ClientSession', url: str, lot_id: str, index: int) -> Optional[str]:
|
||||
"""Download an image and save it locally (without rate limiting - concurrent within lot)"""
|
||||
if not self.download_images:
|
||||
return None
|
||||
|
||||
try:
|
||||
import aiohttp
|
||||
lot_dir = Path(IMAGES_DIR) / lot_id
|
||||
lot_dir.mkdir(exist_ok=True)
|
||||
lot_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
ext = url.split('.')[-1].split('?')[0]
|
||||
if ext not in ['jpg', 'jpeg', 'png', 'gif', 'webp']:
|
||||
@@ -50,22 +49,19 @@ class TroostwijkScraper:
|
||||
if filepath.exists():
|
||||
return str(filepath)
|
||||
|
||||
await self._rate_limit()
|
||||
async with session.get(url, timeout=30) as response:
|
||||
if response.status == 200:
|
||||
content = await response.read()
|
||||
with open(filepath, 'wb') as f:
|
||||
f.write(content)
|
||||
|
||||
async with aiohttp.ClientSession() as session:
|
||||
async with session.get(url, timeout=30) as response:
|
||||
if response.status == 200:
|
||||
content = await response.read()
|
||||
with open(filepath, 'wb') as f:
|
||||
f.write(content)
|
||||
|
||||
with sqlite3.connect(self.cache.db_path) as conn:
|
||||
conn.execute("UPDATE images\n"
|
||||
"SET local_path = ?, downloaded = 1\n"
|
||||
"WHERE lot_id = ? AND url = ?\n"
|
||||
"", (str(filepath), lot_id, url))
|
||||
conn.commit()
|
||||
return str(filepath)
|
||||
with sqlite3.connect(self.cache.db_path) as conn:
|
||||
conn.execute("UPDATE images\n"
|
||||
"SET local_path = ?, downloaded = 1\n"
|
||||
"WHERE lot_id = ? AND url = ?\n"
|
||||
"", (str(filepath), lot_id, url))
|
||||
conn.commit()
|
||||
return str(filepath)
|
||||
|
||||
except Exception as e:
|
||||
print(f" ERROR downloading image: {e}")
|
||||
@@ -211,10 +207,17 @@ class TroostwijkScraper:
|
||||
print(f" Images: {len(images)}")
|
||||
|
||||
if self.download_images:
|
||||
for i, img_url in enumerate(images):
|
||||
local_path = await self._download_image(img_url, page_data['lot_id'], i)
|
||||
if local_path:
|
||||
print(f" Downloaded: {Path(local_path).name}")
|
||||
# Download all images concurrently for this lot
|
||||
import aiohttp
|
||||
async with aiohttp.ClientSession() as session:
|
||||
download_tasks = [
|
||||
self._download_image(session, img_url, page_data['lot_id'], i)
|
||||
for i, img_url in enumerate(images)
|
||||
]
|
||||
results = await asyncio.gather(*download_tasks, return_exceptions=True)
|
||||
|
||||
downloaded_count = sum(1 for r in results if r and not isinstance(r, Exception))
|
||||
print(f" Downloaded: {downloaded_count}/{len(images)} images")
|
||||
|
||||
return page_data
|
||||
|
||||
|
||||
Reference in New Issue
Block a user