# Clone repository
git clone git@git.appmodel.nl:Tour/troost-scraper.git
cd troost-scraper
# Create virtual environment
python -m venv .venv
source .venv/bin/activate  # On Windows: .venv\Scripts\activate
# Install dependencies
pip install -r requirements.txt
playwright install chromium
playwright install-deps  # Install system dependencies
Create a configuration file or set environment variables:
# main.py configuration
BASE_URL = "https://www.troostwijkauctions.com"
CACHE_DB = "/var/troost-scraper/cache.db"
OUTPUT_DIR = "/var/troost-scraper/output"
RATE_LIMIT_SECONDS = 0.5
MAX_PAGES = 50
sudo mkdir -p /var/troost-scraper/output
sudo chown $USER:$USER /var/troost-scraper
Add to crontab (crontab -e):
crontab -e
# Run scraper daily at 2 AM
0 2 * * * cd /path/to/troost-scraper && /path/to/.venv/bin/python main.py >> /var/log/troost-scraper.log 2>&1
Create Dockerfile:
Dockerfile
FROM python:3.10-slim
WORKDIR /app
# Install system dependencies for Playwright
RUN apt-get update && apt-get install -y \
    wget \
    gnupg \
    && rm -rf /var/lib/apt/lists/*
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
RUN playwright install chromium
RUN playwright install-deps
COPY main.py .
CMD ["python", "main.py"]
Build and run:
docker build -t troost-scraper .
docker run -v /path/to/output:/output troost-scraper
tail -f /var/log/troost-scraper.log
ls -lh /var/troost-scraper/output/
# Reinstall browsers
playwright install --force chromium
# Fix permissions
sudo chown -R $USER:$USER /var/troost-scraper
MAX_PAGES
Deleting the wiki page "Deployment" cannot be undone. Continue?