Build a Web Scraping API: REST Endpoints for Data

Wrapping your scrapers in a REST API turns them from scripts into services: a client sends a URL, your API fetches it through rotating proxies, and it returns structured data, with caching, rate limiting, and error handling built in.

FastAPI Implementation

from fastapi import FastAPI, HTTPException, Query, BackgroundTasks
from fastapi.responses import JSONResponse
from pydantic import BaseModel
import requests
from bs4 import BeautifulSoup
import hashlib
import json
import time
from typing import Optional, List

app = FastAPI(title="Web Scraping API", version="1.0.0")

# Simple in-memory cache (use Redis in production)
cache = {}

PROXY_URL = "http://user:pass@proxy.example.com:8080"

class ScrapeRequest(BaseModel):
    url: str
    selectors: Optional[dict] = None
    render_js: bool = False
    use_cache: bool = True
    cache_ttl: int = 3600

class ScrapeResponse(BaseModel):
    url: str
    status_code: int
    title: str
    data: dict
    cached: bool
    scrape_time_ms: float

@app.post("/scrape", response_model=ScrapeResponse)
async def scrape_url(request: ScrapeRequest):
    # Check cache
    cache_key = hashlib.md5(request.url.encode()).hexdigest()
    if request.use_cache and cache_key in cache:
        cached = cache[cache_key]
        if time.time() - cached["timestamp"] < request.cache_ttl:
            # "data" already contains cached=False; override it instead of passing cached= twice
            return ScrapeResponse(**{**cached["data"], "cached": True})

    # Scrape
    start = time.time()
    try:
        proxies = {"http": PROXY_URL, "https": PROXY_URL}
        resp = requests.get(request.url, proxies=proxies, timeout=30,
                          headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"})
    except Exception as e:
        raise HTTPException(status_code=502, detail=f"Scraping failed: {str(e)}")

    if resp.status_code != 200:
        raise HTTPException(status_code=resp.status_code, detail="Target returned error")

    soup = BeautifulSoup(resp.text, "lxml")

    # Extract data based on selectors
    data = {}
    if request.selectors:
        for key, selector in request.selectors.items():
            elements = soup.select(selector)
            data[key] = [el.text.strip() for el in elements]
    else:
        data["text"] = soup.get_text(strip=True)[:5000]

    result = ScrapeResponse(
        url=request.url,
        status_code=resp.status_code,
        title=soup.title.string if soup.title and soup.title.string else "",
        data=data,
        cached=False,
        scrape_time_ms=(time.time() - start) * 1000,
    )

    # Cache result
    cache[cache_key] = {"data": result.dict(), "timestamp": time.time()}

    return result

@app.get("/health")
async def health_check():
    return {"status": "healthy", "cache_size": len(cache)}

# Run with: uvicorn main:app --host 0.0.0.0 --port 8000

Client Usage

import requests

# Simple scrape
response = requests.post("http://localhost:8000/scrape", json={
    "url": "https://example.com",
})
print(response.json())

# With CSS selectors
response = requests.post("http://localhost:8000/scrape", json={
    "url": "https://news.ycombinator.com",
    "selectors": {
        "titles": ".titleline > a",
        "scores": ".score",
    },
})
data = response.json()
for title in data["data"]["titles"][:5]:
    print(f"  - {title}")

FAQ

How do I handle rate limiting on my scraping API?

Use FastAPI’s middleware or a library like slowapi to implement per-client rate limits. Combine with your proxy pool capacity — if you have 100 proxies handling 5 requests each per minute, your API can safely handle 500 requests/minute.
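
A minimal sketch with slowapi, keyed on client IP and wired into the app from the FastAPI Implementation section above (the 500/minute figure is just the example capacity from this answer; slowapi also needs a starlette Request parameter on the endpoint, so the request body model moves to a second argument):

from fastapi import Request
from slowapi import Limiter, _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded
from slowapi.util import get_remote_address

limiter = Limiter(key_func=get_remote_address)
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

@app.post("/scrape", response_model=ScrapeResponse)
@limiter.limit("500/minute")  # align with your proxy pool capacity
async def scrape_url(request: Request, payload: ScrapeRequest):
    ...  # same scraping logic as above, reading from payload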

Should I add authentication to my scraping API?

Yes, always. Use API keys or OAuth2 to control access. Without authentication, your API becomes an open proxy that anyone can abuse, potentially getting your proxy IPs banned.
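
A minimal API-key check as a FastAPI dependency (the header name and the hard-coded key set are placeholders; load real keys from your environment or a secrets manager):

from fastapi import Depends, HTTPException, Security
from fastapi.security import APIKeyHeader

api_key_header = APIKeyHeader(name="X-API-Key")
VALID_KEYS = {"change-me"}  # placeholder; do not ship hard-coded keys

async def require_api_key(api_key: str = Security(api_key_header)):
    if api_key not in VALID_KEYS:
        raise HTTPException(status_code=403, detail="Invalid API key")
    return api_key

# Protect the endpoint:
# @app.post("/scrape", dependencies=[Depends(require_api_key)])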

Can I use this API for JavaScript-rendered pages?

Add a Playwright or Puppeteer integration for pages requiring JS rendering. Set render_js: true in the request and use a headless browser instead of the requests library.
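
A rough sketch of that branch using Playwright's async API (the helper name and proxy handling are assumptions, not part of the API above):

from typing import Optional
from playwright.async_api import async_playwright

async def fetch_rendered_html(url: str, proxy_url: Optional[str] = None) -> str:
    async with async_playwright() as p:
        launch_args = {"proxy": {"server": proxy_url}} if proxy_url else {}
        browser = await p.chromium.launch(**launch_args)
        page = await browser.new_page()
        await page.goto(url, wait_until="networkidle", timeout=30_000)
        html = await page.content()
        await browser.close()
        return html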

Implementation Best Practices

Error Handling and Retry Logic

Production scraping tools must handle failures gracefully. Implement exponential backoff with jitter:

import random
import time

def retry_with_backoff(func, max_retries=3, base_delay=1):
    for attempt in range(max_retries):
        try:
            return func()
        except Exception as e:
            if attempt == max_retries - 1:
                raise
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            print(f"Attempt {attempt + 1} failed: {e}. Retrying in {delay:.1f}s")
            time.sleep(delay)
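
For example, wrapping a proxied request (the URL and PROXY_URL are placeholders):

import requests

def fetch():
    resp = requests.get("https://example.com", proxies={"https": PROXY_URL}, timeout=15)
    resp.raise_for_status()  # treat 4xx/5xx as failures so they trigger a retry
    return resp

response = retry_with_backoff(fetch, max_retries=3)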

Logging Configuration

Set up structured logging for debugging and monitoring:

import logging
import json
from datetime import datetime

class JSONFormatter(logging.Formatter):
    def format(self, record):
        log_entry = {
            "timestamp": datetime.utcnow().isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "module": record.module,
            "function": record.funcName,
        }
        if record.exc_info:
            log_entry["exception"] = self.formatException(record.exc_info)
        return json.dumps(log_entry)

# Setup
handler = logging.StreamHandler()
handler.setFormatter(JSONFormatter())
logger = logging.getLogger("scraper")
logger.addHandler(handler)
logger.setLevel(logging.INFO)
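
Each call then emits one JSON object per line, which log aggregators can ingest directly:

logger.info("Scrape finished")
# {"timestamp": "...", "level": "INFO", "message": "Scrape finished", "module": "...", "function": "..."}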

Configuration Management

Use environment variables and config files for flexibility:

import os
from dataclasses import dataclass

@dataclass
class ScraperConfig:
    proxy_url: str = os.getenv("PROXY_URL", "")
    concurrent_workers: int = int(os.getenv("CONCURRENT_WORKERS", "10"))
    request_timeout: int = int(os.getenv("REQUEST_TIMEOUT", "15"))
    max_retries: int = int(os.getenv("MAX_RETRIES", "3"))
    rate_limit_per_second: float = float(os.getenv("RATE_LIMIT", "5"))
    output_format: str = os.getenv("OUTPUT_FORMAT", "json")
    database_url: str = os.getenv("DATABASE_URL", "sqlite:///results.db")
    log_level: str = os.getenv("LOG_LEVEL", "INFO")

    @classmethod
    def from_yaml(cls, filepath: str):
        import yaml
        with open(filepath) as f:
            config = yaml.safe_load(f)
        return cls(**{k: v for k, v in config.items() if hasattr(cls, k)})
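
Loading then works either from the environment or from a file (config.yaml is an assumed path):

config = ScraperConfig()                           # defaults plus environment variables
# config = ScraperConfig.from_yaml("config.yaml")  # or override from a YAML file
print(config.concurrent_workers, config.request_timeout)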

Rate Limiting

Implement token bucket rate limiting to respect target sites:

import asyncio
import time

class RateLimiter:
    def __init__(self, rate: float, burst: int = 1):
        self.rate = rate  # requests per second
        self.burst = burst
        self.tokens = burst
        self.last_refill = time.time()
        self._lock = asyncio.Lock()

    async def acquire(self):
        async with self._lock:
            now = time.time()
            elapsed = now - self.last_refill
            self.tokens = min(self.burst, self.tokens + elapsed * self.rate)
            self.last_refill = now

            if self.tokens >= 1:
                self.tokens -= 1
                return
            else:
                wait_time = (1 - self.tokens) / self.rate
                await asyncio.sleep(wait_time)
                self.tokens = 0
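
A short usage sketch at five requests per second with a burst of ten (the fetch coroutine is a stand-in for your real request code):

import asyncio

limiter = RateLimiter(rate=5, burst=10)

async def fetch(url: str):
    await limiter.acquire()       # waits until a token is available
    print(f"fetching {url}")      # placeholder for the actual HTTP call

async def main():
    await asyncio.gather(*(fetch(f"https://example.com/page/{i}") for i in range(20)))

asyncio.run(main())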

Data Validation

Validate scraped data before storage:

from typing import Optional, List
import re

class DataValidator:
    @staticmethod
    def validate_url(url: str) -> bool:
        pattern = re.compile(
            r'^https?://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'
            r'localhost|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
        return bool(pattern.match(url))

    @staticmethod
    def validate_price(price: Optional[float]) -> bool:
        if price is None:
            return True
        return 0 < price < 1_000_000

    @staticmethod
    def validate_text(text: str, min_length: int = 1, max_length: int = 10000) -> bool:
        return min_length <= len(text.strip()) <= max_length

    def validate_record(self, record: dict) -> tuple:
        errors = []
        if "url" in record and not self.validate_url(record["url"]):
            errors.append("invalid URL")
        if "price" in record and not self.validate_price(record.get("price")):
            errors.append("invalid price")
        if "title" in record and not self.validate_text(record.get("title", ""), 1, 500):
            errors.append("invalid title length")
        return len(errors) == 0, errors
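
For example, checking a record before it is written to storage (field names match the checks above):

validator = DataValidator()
ok, errors = validator.validate_record({
    "url": "https://example.com/item",
    "title": "Widget",
    "price": 19.99,
})
if not ok:
    print("skipping record:", errors)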

Deployment

Running as a Service

# Using systemd
sudo tee /etc/systemd/system/scraper.service > /dev/null << EOF
[Unit]
Description=Web Scraping Service
After=network.target

[Service]
Type=simple
User=scraper
WorkingDirectory=/opt/scraper
ExecStart=/opt/scraper/venv/bin/uvicorn main:app --host 0.0.0.0 --port 8000
Restart=always
RestartSec=10
Environment=PROXY_URL=http://user:pass@proxy:8080
Environment=LOG_LEVEL=INFO

[Install]
WantedBy=multi-user.target
EOF

sudo systemctl daemon-reload
sudo systemctl enable scraper
sudo systemctl start scraper

Docker Deployment

FROM python:3.12-slim

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

HEALTHCHECK --interval=30s --timeout=10s CMD python -c "import requests; requests.get('http://localhost:8000/health', timeout=5).raise_for_status()"

CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

Testing

Write tests for your scraping tools:

import pytest
import requests

# DataValidator is the class from the Data Validation section above

class TestProxyIntegration:
    def test_proxy_connectivity(self):
        proxy = {"http": "http://user:pass@proxy:8080", "https": "http://user:pass@proxy:8080"}
        response = requests.get("https://httpbin.org/ip", proxies=proxy, timeout=10)
        assert response.status_code == 200
        assert "origin" in response.json()

    def test_proxy_rotation(self):
        ips = set()
        for _ in range(5):
            proxy = {"http": "http://user:pass@rotating-proxy:8080",
                     "https": "http://user:pass@rotating-proxy:8080"}
            response = requests.get("https://httpbin.org/ip", proxies=proxy, timeout=10)
            ips.add(response.json()["origin"])
        assert len(ips) > 1, "Proxy should rotate IPs"

    def test_data_validation(self):
        validator = DataValidator()
        valid, errors = validator.validate_record({
            "url": "https://example.com",
            "title": "Test Product",
            "price": 29.99,
        })
        assert valid
        assert len(errors) == 0

For proxy infrastructure guidance, see our proxy pool management guide and web scraping proxy overview.

