Build a Web Scraping API: REST Endpoints for Data
Wrapping your scrapers in a REST API transforms them from scripts into services: clients send a URL, your API fetches it through rotating proxies, and structured data comes back, with caching, rate limiting, and error handling built in.
FastAPI Implementation
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import requests
from bs4 import BeautifulSoup
import hashlib
import time
from typing import Optional

app = FastAPI(title="Web Scraping API", version="1.0.0")
# Simple in-memory cache (use Redis in production)
cache = {}
PROXY_URL = "http://user:pass@proxy.example.com:8080"
class ScrapeRequest(BaseModel):
    url: str
    selectors: Optional[dict] = None
    render_js: bool = False
    use_cache: bool = True
    cache_ttl: int = 3600

class ScrapeResponse(BaseModel):
    url: str
    status_code: int
    title: str
    data: dict
    cached: bool
    scrape_time_ms: float
@app.post("/scrape", response_model=ScrapeResponse)
def scrape_url(request: ScrapeRequest):
    # Plain def (not async): requests is blocking, so FastAPI runs this
    # handler in its thread pool instead of stalling the event loop.
    # Check cache
    cache_key = hashlib.md5(request.url.encode()).hexdigest()
    if request.use_cache and cache_key in cache:
        cached = cache[cache_key]
        if time.time() - cached["timestamp"] < request.cache_ttl:
            # Override the stored cached=False flag instead of passing it twice
            return ScrapeResponse(**{**cached["data"], "cached": True})
    # Scrape
    start = time.time()
    try:
        proxies = {"http": PROXY_URL, "https": PROXY_URL}
        resp = requests.get(
            request.url,
            proxies=proxies,
            timeout=30,
            headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"},
        )
    except requests.RequestException as e:
        raise HTTPException(status_code=502, detail=f"Scraping failed: {e}")
    if resp.status_code != 200:
        raise HTTPException(status_code=resp.status_code, detail="Target returned error")
    soup = BeautifulSoup(resp.text, "lxml")
    # Extract data based on selectors
    data = {}
    if request.selectors:
        for key, selector in request.selectors.items():
            elements = soup.select(selector)
            data[key] = [el.text.strip() for el in elements]
    else:
        data["text"] = soup.get_text(strip=True)[:5000]
    result = ScrapeResponse(
        url=request.url,
        status_code=resp.status_code,
        title=(soup.title.string or "") if soup.title else "",
        data=data,
        cached=False,
        scrape_time_ms=(time.time() - start) * 1000,
    )
    # Cache result
    cache[cache_key] = {"data": result.dict(), "timestamp": time.time()}
    return result
@app.get("/health")
async def health_check():
    return {"status": "healthy", "cache_size": len(cache)}

# Run with: uvicorn main:app --host 0.0.0.0 --port 8000

Client Usage
import requests

# Simple scrape
response = requests.post("http://localhost:8000/scrape", json={
    "url": "https://example.com",
})
print(response.json())

# With CSS selectors
response = requests.post("http://localhost:8000/scrape", json={
    "url": "https://news.ycombinator.com",
    "selectors": {
        "titles": ".titleline > a",
        "scores": ".score",
    },
})
data = response.json()
for title in data["data"]["titles"][:5]:
    print(f" - {title}")

FAQ
How do I handle rate limiting on my scraping API?
Use FastAPI middleware or a library like slowapi to implement per-client rate limits. Size the limit to your proxy pool capacity: if you have 100 proxies handling 5 requests each per minute, your API can safely serve 500 requests/minute. A minimal slowapi setup is sketched below.
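A sketch of a per-IP limit with slowapi, assuming pip install slowapi; the 10/minute budget is an illustrative value, not a recommendation:

from fastapi import FastAPI, Request
from slowapi import Limiter, _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded
from slowapi.util import get_remote_address

limiter = Limiter(key_func=get_remote_address)  # key rate limits by client IP
app = FastAPI()
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

@app.get("/limited")
@limiter.limit("10/minute")  # requests beyond the budget get HTTP 429
async def limited(request: Request):  # slowapi requires the Request parameter
    return {"ok": True}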
Should I add authentication to my scraping API?
Yes, always. Use API keys or OAuth2 to control access. Without authentication, your API becomes an open proxy that anyone can abuse, potentially getting your proxy IPs banned. A minimal API-key check is sketched below.
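A minimal API-key guard using FastAPI's built-in APIKeyHeader; the header name and the hard-coded key set are illustrative assumptions:

from fastapi import Depends, FastAPI, HTTPException
from fastapi.security import APIKeyHeader

app = FastAPI()
api_key_header = APIKeyHeader(name="X-API-Key")
VALID_KEYS = {"change-me"}  # load from a secrets store or database in production

def require_api_key(api_key: str = Depends(api_key_header)) -> str:
    if api_key not in VALID_KEYS:
        raise HTTPException(status_code=403, detail="Invalid API key")
    return api_key

@app.get("/health", dependencies=[Depends(require_api_key)])
async def health_check():
    return {"status": "healthy"}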
Can I use this API for JavaScript-rendered pages?
Add a Playwright or Puppeteer integration for pages that require JS rendering: when a client sets render_js: true, fetch the page with a headless browser instead of the requests library. A sketch of such a helper follows.
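A sketch with Playwright, assuming pip install playwright and playwright install chromium; fetch_rendered is a hypothetical helper name, not part of the API above:

from typing import Optional
from playwright.async_api import async_playwright

async def fetch_rendered(url: str, proxy_url: Optional[str] = None) -> str:
    async with async_playwright() as p:
        kwargs = {"headless": True}
        if proxy_url:
            kwargs["proxy"] = {"server": proxy_url}
        browser = await p.chromium.launch(**kwargs)
        page = await browser.new_page()
        await page.goto(url, wait_until="networkidle")  # let XHR/fetch calls settle
        html = await page.content()  # fully rendered DOM as HTML
        await browser.close()
        return html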
Implementation Best Practices
Error Handling and Retry Logic
Production scraping tools must handle failures gracefully. Implement exponential backoff with jitter:
import random
import time

def retry_with_backoff(func, max_retries=3, base_delay=1):
    for attempt in range(max_retries):
        try:
            return func()
        except Exception as e:
            if attempt == max_retries - 1:
                raise  # out of retries: surface the last error
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            print(f"Attempt {attempt + 1} failed: {e}. Retrying in {delay:.1f}s")
            time.sleep(delay)
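For example, wrapping a proxied request so transient failures are retried (a sketch; the proxy URL and target are placeholders):

import requests

PROXIES = {"http": "http://user:pass@proxy:8080", "https": "http://user:pass@proxy:8080"}

def scrape_once():
    resp = requests.get("https://example.com", proxies=PROXIES, timeout=15)
    resp.raise_for_status()  # turn HTTP errors into exceptions so they trigger a retry
    return resp

resp = retry_with_backoff(scrape_once, max_retries=3, base_delay=1)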
Logging Configuration
Set up structured logging for debugging and monitoring:
import logging
import json
from datetime import datetime

class JSONFormatter(logging.Formatter):
    def format(self, record):
        log_entry = {
            "timestamp": datetime.utcnow().isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "module": record.module,
            "function": record.funcName,
        }
        if record.exc_info:
            log_entry["exception"] = self.formatException(record.exc_info)
        return json.dumps(log_entry)

# Setup
handler = logging.StreamHandler()
handler.setFormatter(JSONFormatter())
logger = logging.getLogger("scraper")
logger.addHandler(handler)
logger.setLevel(logging.INFO)
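Every log call now emits one JSON object per line, which log aggregators can ingest directly. A quick check (output abbreviated; the URL is illustrative):

logger.info("scraped %d items from %s", 30, "https://example.com")
# {"timestamp": "...", "level": "INFO", "message": "scraped 30 items from https://example.com", ...}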
Configuration Management
Use environment variables and config files for flexibility:
import os
from dataclasses import dataclass

@dataclass
class ScraperConfig:
    proxy_url: str = os.getenv("PROXY_URL", "")
    concurrent_workers: int = int(os.getenv("CONCURRENT_WORKERS", "10"))
    request_timeout: int = int(os.getenv("REQUEST_TIMEOUT", "15"))
    max_retries: int = int(os.getenv("MAX_RETRIES", "3"))
    rate_limit_per_second: float = float(os.getenv("RATE_LIMIT", "5"))
    output_format: str = os.getenv("OUTPUT_FORMAT", "json")
    database_url: str = os.getenv("DATABASE_URL", "sqlite:///results.db")
    log_level: str = os.getenv("LOG_LEVEL", "INFO")

    @classmethod
    def from_yaml(cls, filepath: str):
        import yaml
        with open(filepath) as f:
            config = yaml.safe_load(f)
        return cls(**{k: v for k, v in config.items() if hasattr(cls, k)})
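Each field falls back from its environment variable to the hard-coded default, and from_yaml overrides both for any keys present in the file. Usage might look like this (config.yaml and its keys are hypothetical):

config = ScraperConfig()                         # env vars and defaults only
config = ScraperConfig.from_yaml("config.yaml")  # e.g. concurrent_workers: 20
print(config.concurrent_workers, config.request_timeout)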
Rate Limiting
Implement token bucket rate limiting to respect target sites:
import asyncio
import time

class RateLimiter:
    def __init__(self, rate: float, burst: int = 1):
        self.rate = rate  # requests per second
        self.burst = burst
        self.tokens = burst
        self.last_refill = time.time()
        self._lock = asyncio.Lock()

    async def acquire(self):
        async with self._lock:
            now = time.time()
            elapsed = now - self.last_refill
            self.tokens = min(self.burst, self.tokens + elapsed * self.rate)
            self.last_refill = now
            if self.tokens >= 1:
                self.tokens -= 1
                return
            else:
                wait_time = (1 - self.tokens) / self.rate
                await asyncio.sleep(wait_time)
                self.tokens = 0
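A sketch of the limiter in an async fetch loop (the URL list and rate are illustrative; swap the print for a real request):

import asyncio

limiter = RateLimiter(rate=5, burst=5)  # roughly 5 requests per second

async def fetch(url: str):
    await limiter.acquire()  # waits until a token is available
    print(f"fetching {url}")

async def main():
    urls = [f"https://example.com/page/{i}" for i in range(20)]
    await asyncio.gather(*(fetch(u) for u in urls))

asyncio.run(main())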
Data Validation
Validate scraped data before storage:
from typing import Optional
import re

class DataValidator:
    @staticmethod
    def validate_url(url: str) -> bool:
        pattern = re.compile(
            r'^https?://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'
            r'localhost|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
        return bool(pattern.match(url))

    @staticmethod
    def validate_price(price: Optional[float]) -> bool:
        if price is None:
            return True
        return 0 < price < 1_000_000

    @staticmethod
    def validate_text(text: str, min_length: int = 1, max_length: int = 10000) -> bool:
        return min_length <= len(text.strip()) <= max_length

    def validate_record(self, record: dict) -> tuple:
        errors = []
        if "url" in record and not self.validate_url(record["url"]):
            errors.append("invalid URL")
        if "price" in record and not self.validate_price(record.get("price")):
            errors.append("invalid price")
        if "title" in record and not self.validate_text(record.get("title", ""), 1, 500):
            errors.append("invalid title length")
        return len(errors) == 0, errors
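Filtering a scraped batch before storage then becomes a one-pass check (the records below are illustrative; the first passes, the second fails all three checks):

validator = DataValidator()
records = [
    {"url": "https://example.com/p/1", "title": "Widget", "price": 19.99},
    {"url": "not-a-url", "title": "", "price": -5},
]

clean = []
for record in records:
    ok, errors = validator.validate_record(record)
    if ok:
        clean.append(record)
    else:
        print(f"dropping record: {', '.join(errors)}")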
Deployment
Running as a Service
# Using systemd (tee runs under sudo, so the redirect works unprivileged)
sudo tee /etc/systemd/system/scraper.service > /dev/null << EOF
[Unit]
Description=Web Scraping Service
After=network.target

[Service]
Type=simple
User=scraper
WorkingDirectory=/opt/scraper
ExecStart=/opt/scraper/venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000
Restart=always
RestartSec=10
Environment=PROXY_URL=http://user:pass@proxy:8080
Environment=LOG_LEVEL=INFO

[Install]
WantedBy=multi-user.target
EOF

sudo systemctl enable scraper
sudo systemctl start scraper
Docker Deployment
FROM python:3.12-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
EXPOSE 8000
HEALTHCHECK --interval=30s --timeout=10s \
  CMD python -c "import requests; requests.get('http://localhost:8000/health', timeout=5).raise_for_status()"
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
Testing
Write tests for your scraping tools:
import requests

# DataValidator is the class from the Data Validation section above

class TestProxyIntegration:
    def test_proxy_connectivity(self):
        proxy = {"http": "http://user:pass@proxy:8080", "https": "http://user:pass@proxy:8080"}
        response = requests.get("https://httpbin.org/ip", proxies=proxy, timeout=10)
        assert response.status_code == 200
        assert "origin" in response.json()

    def test_proxy_rotation(self):
        ips = set()
        for _ in range(5):
            # Both schemes must be mapped, or HTTPS requests bypass the proxy
            proxy = {"http": "http://user:pass@rotating-proxy:8080",
                     "https": "http://user:pass@rotating-proxy:8080"}
            response = requests.get("https://httpbin.org/ip", proxies=proxy, timeout=10)
            ips.add(response.json()["origin"])
        assert len(ips) > 1, "Proxy should rotate IPs"

    def test_data_validation(self):
        validator = DataValidator()
        valid, errors = validator.validate_record({
            "url": "https://example.com",
            "title": "Test Product",
            "price": 29.99,
        })
        assert valid
        assert len(errors) == 0

For proxy infrastructure guidance, see our proxy pool management guide and web scraping proxy overview.
Related Reading
- Build an Anti-Detection Test Suite: Verify Browser Stealth
- Build a News Crawler in Python: Step-by-Step Tutorial
- AJAX Request Interception: Scraping API Calls Directly
- Azure Functions for Serverless Web Scraping: the Complete Guide
- How to Configure Proxies on iPhone and Android
- How to Use Proxies in Node.js (Axios, Fetch, Puppeteer)