Build a Web Scraping CLI Tool with Python Click
A command-line scraping tool lets you extract data from any website directly from your terminal. Using Python Click, you can build a polished CLI with proxy support, multiple output formats, pagination handling, and configurable selectors.
Implementation
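Before running the script, install the third-party packages it imports (the lxml parser is required by the BeautifulSoup call below):
pip install click requests beautifulsoup4 lxml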
#!/usr/bin/env python3
# scrape-cli.py
import click
import requests
from bs4 import BeautifulSoup
import json
import csv
import sys
import time
@click.group()
@click.version_option(version="1.0.0")
def cli():
    """Web Scraping CLI Tool - Extract data from any website."""
    pass
@cli.command()
@click.argument("url")
@click.option("--proxy", "-p", help="Proxy URL (http://user:pass@host:port)")
@click.option("--selector", "-s", multiple=True, help="CSS selector to extract (name:selector)")
@click.option("--output", "-o", default="-", help="Output file (- for stdout)")
@click.option("--format", "-f", "fmt", type=click.Choice(["json", "csv", "text"]), default="json")
@click.option("--pages", type=int, default=1, help="Number of pages to scrape")
@click.option("--delay", type=float, default=1.0, help="Delay between requests in seconds")
@click.option("--header", "-H", multiple=True, help="Custom header (Name:Value)")
@click.option("--timeout", type=int, default=15, help="Request timeout in seconds")
@click.option("--user-agent", default="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", help="User-Agent string")
def scrape(url, proxy, selector, output, fmt, pages, delay, header, timeout, user_agent):
    """Scrape a URL and extract data using CSS selectors."""
    headers = {"User-Agent": user_agent}
    for h in header:
        name, value = h.split(":", 1)
        headers[name.strip()] = value.strip()
    proxies = {}
    if proxy:
        proxies = {"http": proxy, "https": proxy}
    all_data = []
    for page in range(1, pages + 1):
        page_url = url if pages == 1 else f"{url}?page={page}"
        click.echo(f"Scraping page {page}/{pages}: {page_url}", err=True)
        try:
            resp = requests.get(page_url, proxies=proxies, headers=headers, timeout=timeout)
            resp.raise_for_status()
        except requests.RequestException as e:
            click.echo(f"Error: {e}", err=True)
            continue
        soup = BeautifulSoup(resp.text, "lxml")
        if selector:
            row = {}
            for sel in selector:
                if ":" in sel:
                    name, css = sel.split(":", 1)
                else:
                    name, css = sel, sel
                elements = soup.select(css)
                row[name] = [el.text.strip() for el in elements]
            all_data.append(row)
        else:
            all_data.append({
                "url": page_url,
                "title": soup.title.string if soup.title else "",
                "text": soup.get_text(strip=True)[:2000],
            })
        if page < pages:
            time.sleep(delay)
    # Output
    if output == "-":
        out = sys.stdout
    else:
        out = open(output, "w", newline="", encoding="utf-8")
    if fmt == "json":
        json.dump(all_data, out, indent=2, ensure_ascii=False)
    elif fmt == "csv":
        if all_data:
            writer = csv.DictWriter(out, fieldnames=all_data[0].keys())
            writer.writeheader()
            for row in all_data:
                flat_row = {k: "; ".join(v) if isinstance(v, list) else v for k, v in row.items()}
                writer.writerow(flat_row)
    elif fmt == "text":
        for row in all_data:
            for key, value in row.items():
                if isinstance(value, list):
                    for v in value:
                        out.write(f"{key}: {v}\n")
                else:
                    out.write(f"{key}: {value}\n")
            out.write("---\n")
    if output != "-":
        out.close()
        click.echo(f"Saved to {output}", err=True)
@cli.command()
@click.argument("proxy_url")
def test_proxy(proxy_url):
    """Test a proxy connection."""
    click.echo(f"Testing proxy: {proxy_url}")
    start = time.time()
    try:
        resp = requests.get("https://httpbin.org/ip", proxies={"http": proxy_url, "https": proxy_url}, timeout=10)
        latency = (time.time() - start) * 1000
        data = resp.json()
        click.echo(" Status: OK")
        click.echo(f" IP: {data['origin']}")
        click.echo(f" Latency: {latency:.0f}ms")
    except Exception as e:
        click.echo(" Status: FAILED")
        click.echo(f" Error: {e}")
if __name__ == "__main__":
    cli()
Usage Examples
# Basic scraping
python scrape-cli.py scrape https://example.com
# With proxy and CSS selectors
python scrape-cli.py scrape https://news.ycombinator.com \
-p http://user:pass@proxy:8080 \
-s "titles:.titleline > a" \
-s "scores:.score" \
-f json -o results.json
# Multi-page scraping
python scrape-cli.py scrape https://example.com/products \
--pages 5 --delay 2 \
-s "names:.product-name" \
-s "prices:.price" \
-f csv -o products.csv
# Test proxy
python scrape-cli.py test-proxy http://user:pass@proxy:8080
FAQ
How do I install this as a system command?
Add a setup.py or pyproject.toml with a console_scripts entry point. After pip install -e ., the command is available globally as scrape-cli.
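As a minimal sketch, the setup.py below wires the Click group to a console script. The package name, version, and module rename (scrape-cli.py to scrape_cli.py, since a module name cannot contain a hyphen) are illustrative assumptions, not part of the tool above:
# setup.py - minimal packaging sketch; metadata values are illustrative
from setuptools import setup

setup(
    name="scrape-cli",
    version="1.0.0",
    py_modules=["scrape_cli"],  # assumes scrape-cli.py was renamed to scrape_cli.py
    install_requires=["click", "requests", "beautifulsoup4", "lxml"],
    entry_points={
        "console_scripts": [
            # exposes the Click group `cli` as the `scrape-cli` command
            "scrape-cli = scrape_cli:cli",
        ],
    },
)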
Can I scrape JavaScript-rendered pages with this CLI?
This basic version uses requests (no JS rendering). Add a --render flag that switches to Playwright for JS-heavy sites. See our web scraping proxy guides for browser-based scraping approaches.
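A minimal sketch of what the rendering path could look like, assuming Playwright and its Chromium build are installed (pip install playwright, then playwright install chromium). The fetch_rendered helper is illustrative and not part of the tool above:
from playwright.sync_api import sync_playwright

def fetch_rendered(url, proxy=None, timeout=15):
    """Return fully rendered HTML for a JS-heavy page (illustrative helper)."""
    with sync_playwright() as p:
        launch_args = {"proxy": {"server": proxy}} if proxy else {}
        browser = p.chromium.launch(headless=True, **launch_args)
        page = browser.new_page()
        page.goto(url, timeout=timeout * 1000)  # Playwright timeouts are in milliseconds
        html = page.content()
        browser.close()
        return html
The returned HTML can then be fed to BeautifulSoup exactly as resp.text is in the scrape command.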
How do I handle pagination that uses infinite scroll?
Infinite scroll requires browser automation. For URL-based pagination (page=1, page=2), the --pages option works. For API-based pagination, modify the tool to follow cursor tokens or next-page links.
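As a sketch of the link-following approach, the loop below walks rel="next" links instead of building ?page=N URLs. The follow_next_links helper, the rel="next" selector, and the 50-page safety cap are assumptions rather than part of the tool above:
from urllib.parse import urljoin
import requests
from bs4 import BeautifulSoup

def follow_next_links(start_url, headers=None, max_pages=50):
    """Yield parsed pages by following rel="next" links (illustrative sketch)."""
    url = start_url
    for _ in range(max_pages):
        resp = requests.get(url, headers=headers, timeout=15)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "lxml")
        yield soup
        next_link = soup.select_one("a[rel~=next]")  # stop when no next-page link exists
        if not next_link or not next_link.get("href"):
            break
        url = urljoin(url, next_link["href"])
For cursor-based APIs the same loop applies, except the next URL or token comes from the JSON response rather than an anchor tag.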
Implementation Best Practices
Error Handling and Retry Logic
Production scraping tools must handle failures gracefully. Implement exponential backoff with jitter:
import random
import time
def retry_with_backoff(func, max_retries=3, base_delay=1):
    for attempt in range(max_retries):
        try:
            return func()
        except Exception as e:
            if attempt == max_retries - 1:
                raise
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            print(f"Attempt {attempt + 1} failed: {e}. Retrying in {delay:.1f}s")
            time.sleep(delay)
Logging Configuration
Set up structured logging for debugging and monitoring:
import logging
import json
from datetime import datetime, timezone
class JSONFormatter(logging.Formatter):
    def format(self, record):
        log_entry = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "module": record.module,
            "function": record.funcName,
        }
        if record.exc_info:
            log_entry["exception"] = self.formatException(record.exc_info)
        return json.dumps(log_entry)
# Setup
handler = logging.StreamHandler()
handler.setFormatter(JSONFormatter())
logger = logging.getLogger("scraper")
logger.addHandler(handler)
logger.setLevel(logging.INFO)
Configuration Management
Use environment variables and config files for flexibility:
import os
from dataclasses import dataclass
@dataclass
class ScraperConfig:
    proxy_url: str = os.getenv("PROXY_URL", "")
    concurrent_workers: int = int(os.getenv("CONCURRENT_WORKERS", "10"))
    request_timeout: int = int(os.getenv("REQUEST_TIMEOUT", "15"))
    max_retries: int = int(os.getenv("MAX_RETRIES", "3"))
    rate_limit_per_second: float = float(os.getenv("RATE_LIMIT", "5"))
    output_format: str = os.getenv("OUTPUT_FORMAT", "json")
    database_url: str = os.getenv("DATABASE_URL", "sqlite:///results.db")
    log_level: str = os.getenv("LOG_LEVEL", "INFO")
    @classmethod
    def from_yaml(cls, filepath: str):
        import yaml
        with open(filepath) as f:
            config = yaml.safe_load(f)
        return cls(**{k: v for k, v in config.items() if hasattr(cls, k)})
Rate Limiting
Implement token bucket rate limiting to respect target sites:
import asyncio
import time
class RateLimiter:
    def __init__(self, rate: float, burst: int = 1):
        self.rate = rate  # requests per second
        self.burst = burst
        self.tokens = burst
        self.last_refill = time.time()
        self._lock = asyncio.Lock()
    async def acquire(self):
        async with self._lock:
            now = time.time()
            elapsed = now - self.last_refill
            self.tokens = min(self.burst, self.tokens + elapsed * self.rate)
            self.last_refill = now
            if self.tokens >= 1:
                self.tokens -= 1
                return
            else:
                wait_time = (1 - self.tokens) / self.rate
                await asyncio.sleep(wait_time)
                self.tokens = 0
Data Validation
Validate scraped data before storage:
from typing import Optional
import re
class DataValidator:
    @staticmethod
    def validate_url(url: str) -> bool:
        pattern = re.compile(
            r'^https?://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'
            r'localhost|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
        return bool(pattern.match(url))
    @staticmethod
    def validate_price(price: Optional[float]) -> bool:
        if price is None:
            return True
        return 0 < price < 1_000_000
    @staticmethod
    def validate_text(text: str, min_length: int = 1, max_length: int = 10000) -> bool:
        return min_length <= len(text.strip()) <= max_length
    def validate_record(self, record: dict) -> tuple:
        errors = []
        if "url" in record and not self.validate_url(record["url"]):
            errors.append("invalid URL")
        if "price" in record and not self.validate_price(record.get("price")):
            errors.append("invalid price")
        if "title" in record and not self.validate_text(record.get("title", ""), 1, 500):
            errors.append("invalid title length")
        return len(errors) == 0, errors
Deployment
Running as a Service
# Using systemd
sudo tee /etc/systemd/system/scraper.service > /dev/null << EOF
[Unit]
Description=Web Scraping Service
After=network.target
[Service]
Type=simple
User=scraper
WorkingDirectory=/opt/scraper
ExecStart=/opt/scraper/venv/bin/python main.py
Restart=always
RestartSec=10
Environment=PROXY_URL=http://user:pass@proxy:8080
Environment=LOG_LEVEL=INFO
[Install]
WantedBy=multi-user.target
EOF
sudo systemctl enable scraper
sudo systemctl start scraper
Docker Deployment
FROM python:3.12-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
HEALTHCHECK --interval=30s --timeout=10s CMD python -c "import requests; requests.get('http://localhost:8000/health')"
CMD ["python", "main.py"]Testing
Write tests for your scraping tools. The last test assumes the DataValidator class from the Data Validation section is importable:
import pytest
import requests
class TestProxyIntegration:
    def test_proxy_connectivity(self):
        proxy = {"http": "http://user:pass@proxy:8080", "https": "http://user:pass@proxy:8080"}
        response = requests.get("https://httpbin.org/ip", proxies=proxy, timeout=10)
        assert response.status_code == 200
        assert "origin" in response.json()
    def test_proxy_rotation(self):
        ips = set()
        for _ in range(5):
            proxy = {"http": "http://user:pass@rotating-proxy:8080", "https": "http://user:pass@rotating-proxy:8080"}
            response = requests.get("https://httpbin.org/ip", proxies=proxy, timeout=10)
            ips.add(response.json()["origin"])
        assert len(ips) > 1, "Proxy should rotate IPs"
    def test_data_validation(self):
        validator = DataValidator()
        valid, errors = validator.validate_record({
            "url": "https://example.com",
            "title": "Test Product",
            "price": 29.99,
        })
        assert valid
        assert len(errors) == 0
For proxy infrastructure guidance, see our proxy pool management guide and web scraping proxy overview.
Related Reading
- Build an Anti-Detection Test Suite: Verify Browser Stealth
- Build a News Crawler in Python: Step-by-Step Tutorial
- AJAX Request Interception: Scraping API Calls Directly
- Azure Functions for Serverless Web Scraping: the Complete Guide
- How to Configure Proxies on iPhone and Android
- How to Use Proxies in Node.js (Axios, Fetch, Puppeteer)