Build a Web Scraping CLI Tool with Python Click

A command-line scraping tool lets you extract data from web pages without leaving your terminal. With Python's Click library, you can build a polished CLI that supports proxies, multiple output formats, basic pagination, and configurable CSS selectors.
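
The script below depends on click, requests, beautifulsoup4, and lxml (the parser handed to BeautifulSoup). A typical install, assuming a virtual environment is already active:

pip install click requests beautifulsoup4 lxml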

Implementation

#!/usr/bin/env python3
# scrape-cli.py
import click
import requests
from bs4 import BeautifulSoup
import json
import csv
import sys
import time

@click.group()
@click.version_option(version="1.0.0")
def cli():
    """Web Scraping CLI Tool - Extract data from any website."""
    pass

@cli.command()
@click.argument("url")
@click.option("--proxy", "-p", help="Proxy URL (http://user:pass@host:port)")
@click.option("--selector", "-s", multiple=True, help="CSS selector to extract (name:selector)")
@click.option("--output", "-o", default="-", help="Output file (- for stdout)")
@click.option("--format", "-f", "fmt", type=click.Choice(["json", "csv", "text"]), default="json")
@click.option("--pages", type=int, default=1, help="Number of pages to scrape")
@click.option("--delay", type=float, default=1.0, help="Delay between requests in seconds")
@click.option("--header", "-H", multiple=True, help="Custom header (Name:Value)")
@click.option("--timeout", type=int, default=15, help="Request timeout in seconds")
@click.option("--user-agent", default="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", help="User-Agent string")
def scrape(url, proxy, selector, output, fmt, pages, delay, header, timeout, user_agent):
    """Scrape a URL and extract data using CSS selectors."""
    headers = {"User-Agent": user_agent}
    for h in header:
        name, value = h.split(":", 1)
        headers[name.strip()] = value.strip()

    proxies = {}
    if proxy:
        proxies = {"http": proxy, "https": proxy}

    all_data = []

    for page in range(1, pages + 1):
        page_url = url if pages == 1 else f"{url}?page={page}"
        click.echo(f"Scraping page {page}/{pages}: {page_url}", err=True)

        try:
            resp = requests.get(page_url, proxies=proxies, headers=headers, timeout=timeout)
            resp.raise_for_status()
        except Exception as e:
            click.echo(f"Error: {e}", err=True)
            continue

        soup = BeautifulSoup(resp.text, "lxml")

        if selector:
            row = {}
            for sel in selector:
                if ":" in sel:
                    name, css = sel.split(":", 1)
                else:
                    name, css = sel, sel
                elements = soup.select(css)
                row[name] = [el.text.strip() for el in elements]
            all_data.append(row)
        else:
            all_data.append({
                "url": page_url,
                "title": soup.title.string if soup.title else "",
                "text": soup.get_text(strip=True)[:2000],
            })

        if page < pages:
            time.sleep(delay)

    # Output
    if output == "-":
        out = sys.stdout
    else:
        out = open(output, "w", encoding="utf-8", newline="")  # newline="" avoids blank rows in CSV output on Windows

    if fmt == "json":
        json.dump(all_data, out, indent=2, ensure_ascii=False)
    elif fmt == "csv":
        if all_data:
            writer = csv.DictWriter(out, fieldnames=all_data[0].keys())
            writer.writeheader()
            for row in all_data:
                flat_row = {k: "; ".join(v) if isinstance(v, list) else v for k, v in row.items()}
                writer.writerow(flat_row)
    elif fmt == "text":
        for row in all_data:
            for key, value in row.items():
                if isinstance(value, list):
                    for v in value:
                        out.write(f"{key}: {v}\n")
                else:
                    out.write(f"{key}: {value}\n")
            out.write("---\n")

    if output != "-":
        out.close()
        click.echo(f"Saved to {output}", err=True)

@cli.command()
@click.argument("proxy_url")
def test_proxy(proxy_url):
    """Test a proxy connection."""
    click.echo(f"Testing proxy: {proxy_url}")
    start = time.time()
    try:
        resp = requests.get("https://httpbin.org/ip", proxies={"http": proxy_url, "https": proxy_url}, timeout=10)
        latency = (time.time() - start) * 1000
        data = resp.json()
        click.echo(f"  Status: OK")
        click.echo(f"  IP: {data['origin']}")
        click.echo(f"  Latency: {latency:.0f}ms")
    except Exception as e:
        click.echo(f"  Status: FAILED")
        click.echo(f"  Error: {e}")

if __name__ == "__main__":
    cli()

Usage Examples

# Basic scraping
python scrape-cli.py scrape https://example.com

# With proxy and CSS selectors
python scrape-cli.py scrape https://news.ycombinator.com \
    -p http://user:pass@proxy:8080 \
    -s "titles:.titleline > a" \
    -s "scores:.score" \
    -f json -o results.json

# Multi-page scraping
python scrape-cli.py scrape https://example.com/products \
    --pages 5 --delay 2 \
    -s "names:.product-name" \
    -s "prices:.price" \
    -f csv -o products.csv

# Test proxy
python scrape-cli.py test-proxy http://user:pass@proxy:8080

FAQ

How do I install this as a system command?

Add a setup.py or pyproject.toml with a console_scripts entry point. After pip install -e ., the command is available globally as scrape-cli.
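
As a rough sketch, a pyproject.toml along these lines would work, assuming the script is renamed to an importable module such as scrape_cli.py (the hyphenated scrape-cli.py cannot be imported directly); the metadata values are illustrative:

[project]
name = "scrape-cli"
version = "1.0.0"
dependencies = ["click", "requests", "beautifulsoup4", "lxml"]

[project.scripts]
scrape-cli = "scrape_cli:cli"

[build-system]
requires = ["setuptools>=61"]
build-backend = "setuptools.build_meta"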

Can I scrape JavaScript-rendered pages with this CLI?

This basic version uses requests (no JS rendering). Add a --render flag that switches to Playwright for JS-heavy sites. See our web scraping proxy guides for browser-based scraping approaches.
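
A minimal sketch of what the rendered-page path could look like, assuming Playwright is installed (pip install playwright, then playwright install chromium); the helper name is illustrative:

from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup

def fetch_rendered_html(url, timeout_ms=15000):
    # Launch a headless browser, load the page, and return the rendered HTML.
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.goto(url, timeout=timeout_ms)
        html = page.content()
        browser.close()
    return html

# soup = BeautifulSoup(fetch_rendered_html("https://example.com"), "lxml")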

How do I handle pagination that uses infinite scroll?

Infinite scroll requires browser automation. For URL-based pagination (page=1, page=2), the --pages option works. For API-based pagination, modify the tool to follow cursor tokens or next-page links.
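
For link-based pagination, a hedged alternative to building ?page=N URLs is to follow the page's own "next" link until it disappears; the selectors below are placeholders that depend on the target site:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def follow_next_links(start_url, max_pages=5):
    # Follow rel="next" (or a site-specific "next" anchor) up to max_pages.
    url, pages = start_url, []
    for _ in range(max_pages):
        resp = requests.get(url, timeout=15)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "lxml")
        pages.append(soup)
        next_link = soup.select_one('a[rel="next"], a.next')
        if not next_link or not next_link.get("href"):
            break
        url = urljoin(url, next_link["href"])
    return pages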

Implementation Best Practices

Error Handling and Retry Logic

Production scraping tools must handle failures gracefully. Implement exponential backoff with jitter:

import random
import time

def retry_with_backoff(func, max_retries=3, base_delay=1):
    for attempt in range(max_retries):
        try:
            return func()
        except Exception as e:
            if attempt == max_retries - 1:
                raise
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            print(f"Attempt {attempt + 1} failed: {e}. Retrying in {delay:.1f}s")
            time.sleep(delay)
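
To use the helper inside the scrape command, the requests call could be wrapped like this (the variable names assume the scope of the scrape function above):

def fetch():
    resp = requests.get(page_url, headers=headers, proxies=proxies, timeout=timeout)
    resp.raise_for_status()  # raise so HTTP error statuses are retried too
    return resp

resp = retry_with_backoff(fetch, max_retries=3, base_delay=1)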

Logging Configuration

Set up structured logging for debugging and monitoring:

import logging
import json
from datetime import datetime, timezone

class JSONFormatter(logging.Formatter):
    def format(self, record):
        log_entry = {
            "timestamp": datetime.utcnow().isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "module": record.module,
            "function": record.funcName,
        }
        if record.exc_info:
            log_entry["exception"] = self.formatException(record.exc_info)
        return json.dumps(log_entry)

# Setup
handler = logging.StreamHandler()
handler.setFormatter(JSONFormatter())
logger = logging.getLogger("scraper")
logger.addHandler(handler)
logger.setLevel(logging.INFO)
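
Each log call then emits a single JSON line, which is easy to ship to a log aggregator. For example:

logger.info("starting scrape of %s", "https://example.com")
try:
    raise ValueError("demo failure")
except ValueError:
    logger.exception("scrape failed")  # traceback lands in the "exception" field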

Configuration Management

Use environment variables and config files for flexibility:

import os
from dataclasses import dataclass

@dataclass
class ScraperConfig:
    proxy_url: str = os.getenv("PROXY_URL", "")
    concurrent_workers: int = int(os.getenv("CONCURRENT_WORKERS", "10"))
    request_timeout: int = int(os.getenv("REQUEST_TIMEOUT", "15"))
    max_retries: int = int(os.getenv("MAX_RETRIES", "3"))
    rate_limit_per_second: float = float(os.getenv("RATE_LIMIT", "5"))
    output_format: str = os.getenv("OUTPUT_FORMAT", "json")
    database_url: str = os.getenv("DATABASE_URL", "sqlite:///results.db")
    log_level: str = os.getenv("LOG_LEVEL", "INFO")

    @classmethod
    def from_yaml(cls, filepath: str):
        import yaml
        with open(filepath) as f:
            config = yaml.safe_load(f)
        return cls(**{k: v for k, v in config.items() if hasattr(cls, k)})
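
A matching config.yaml might look like the following (values are placeholders, and from_yaml requires PyYAML, i.e. pip install pyyaml):

proxy_url: "http://user:pass@proxy:8080"
concurrent_workers: 5
rate_limit_per_second: 2.0
output_format: "csv"

Loading it is a one-liner:

config = ScraperConfig.from_yaml("config.yaml")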

Rate Limiting

Implement token bucket rate limiting to respect target sites:

import asyncio
import time

class RateLimiter:
    def __init__(self, rate: float, burst: int = 1):
        self.rate = rate  # requests per second
        self.burst = burst
        self.tokens = burst
        self.last_refill = time.time()
        self._lock = asyncio.Lock()

    async def acquire(self):
        async with self._lock:
            now = time.time()
            elapsed = now - self.last_refill
            self.tokens = min(self.burst, self.tokens + elapsed * self.rate)
            self.last_refill = now

            if self.tokens >= 1:
                self.tokens -= 1
                return
            else:
                # Not enough tokens: wait for the deficit, consume the token that
                # accrues during the sleep, and restart the refill clock so the
                # slept-through interval is not credited again.
                wait_time = (1 - self.tokens) / self.rate
                await asyncio.sleep(wait_time)
                self.last_refill = time.time()
                self.tokens = 0.0
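
A short usage sketch, assuming the requests library from earlier and Python 3.9+ for asyncio.to_thread (the URLs are placeholders):

import asyncio
import requests

async def fetch_all(urls):
    limiter = RateLimiter(rate=2.0, burst=2)  # roughly two requests per second
    results = []
    for url in urls:
        await limiter.acquire()  # blocks until a token is available
        resp = await asyncio.to_thread(requests.get, url, timeout=15)
        results.append((url, resp.status_code))
    return results

# asyncio.run(fetch_all(["https://httpbin.org/get"] * 5))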

Data Validation

Validate scraped data before storage:

from typing import Optional
import re

class DataValidator:
    @staticmethod
    def validate_url(url: str) -> bool:
        pattern = re.compile(
            r'^https?://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'
            r'localhost|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
        return bool(pattern.match(url))

    @staticmethod
    def validate_price(price: Optional[float]) -> bool:
        if price is None:
            return True
        return 0 < price < 1_000_000

    @staticmethod
    def validate_text(text: str, min_length: int = 1, max_length: int = 10000) -> bool:
        return min_length <= len(text.strip()) <= max_length

    def validate_record(self, record: dict) -> tuple:
        errors = []
        if "url" in record and not self.validate_url(record["url"]):
            errors.append("invalid URL")
        if "price" in record and not self.validate_price(record.get("price")):
            errors.append("invalid price")
        if "title" in record and not self.validate_text(record.get("title", ""), 1, 500):
            errors.append("invalid title length")
        return len(errors) == 0, errors

Deployment

Running as a Service

# Using systemd
sudo tee /etc/systemd/system/scraper.service > /dev/null << EOF
[Unit]
Description=Web Scraping Service
After=network.target

[Service]
Type=simple
User=scraper
WorkingDirectory=/opt/scraper
ExecStart=/opt/scraper/venv/bin/python main.py
Restart=always
RestartSec=10
Environment=PROXY_URL=http://user:pass@proxy:8080
Environment=LOG_LEVEL=INFO

[Install]
WantedBy=multi-user.target
EOF

sudo systemctl enable scraper
sudo systemctl start scraper

Docker Deployment

FROM python:3.12-slim

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

# Assumes main.py exposes a /health endpoint on port 8000; drop this line for a pure CLI container
HEALTHCHECK --interval=30s --timeout=10s CMD python -c "import requests; requests.get('http://localhost:8000/health')"

CMD ["python", "main.py"]

Testing

Write tests for your scraping tools:

import pytest
import requests

class TestProxyIntegration:
    def test_proxy_connectivity(self):
        proxy = {"http": "http://user:pass@proxy:8080", "https": "http://user:pass@proxy:8080"}
        response = requests.get("https://httpbin.org/ip", proxies=proxy, timeout=10)
        assert response.status_code == 200
        assert "origin" in response.json()

    def test_proxy_rotation(self):
        ips = set()
        for _ in range(5):
            proxy = {"http": "http://user:pass@rotating-proxy:8080"}
            response = requests.get("https://httpbin.org/ip", proxies=proxy, timeout=10)
            ips.add(response.json()["origin"])
        assert len(ips) > 1, "Proxy should rotate IPs"

    def test_data_validation(self):
        validator = DataValidator()
        valid, errors = validator.validate_record({
            "url": "https://example.com",
            "title": "Test Product",
            "price": 29.99,
        })
        assert valid
        assert len(errors) == 0
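
These tests hit live endpoints through placeholder proxy URLs, so substitute real credentials before running them, then execute the suite from the project directory:

pytest -v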

For proxy infrastructure guidance, see our proxy pool management guide and web scraping proxy overview.

