Build a Proxy Benchmarking Tool: Compare Providers

Comparing proxy providers objectively requires standardized testing. A benchmarking tool measures latency, throughput, success rates, geographic accuracy, and anonymity across multiple providers under identical conditions.

Benchmarking Framework

import asyncio
import aiohttp
import time
import json
import statistics
from dataclasses import dataclass, field
from typing import List, Dict

@dataclass
class BenchmarkResult:
    provider: str
    proxy_type: str
    total_requests: int = 0
    successful: int = 0
    failed: int = 0
    latencies: List[float] = field(default_factory=list)
    unique_ips: set = field(default_factory=set)
    geo_accuracy: float = 0
    start_time: float = 0
    end_time: float = 0

    @property
    def success_rate(self):
        return self.successful / self.total_requests if self.total_requests else 0

    @property
    def avg_latency(self):
        return statistics.mean(self.latencies) if self.latencies else 0

    @property
    def p95_latency(self):
        if not self.latencies:
            return 0
        sorted_l = sorted(self.latencies)
        return sorted_l[int(len(sorted_l) * 0.95)]

    @property
    def throughput(self):
        duration = self.end_time - self.start_time
        return self.successful / duration if duration > 0 else 0

class ProxyBenchmark:
    def __init__(self):
        self.results: Dict[str, BenchmarkResult] = {}

    async def benchmark_provider(
        self,
        name: str,
        proxy_url: str,
        proxy_type: str = "residential",
        num_requests: int = 100,
        concurrency: int = 10,
        target_country: str = "US",
    ) -> BenchmarkResult:
        result = BenchmarkResult(provider=name, proxy_type=proxy_type)
        result.start_time = time.time()
        sem = asyncio.Semaphore(concurrency)

        async with aiohttp.ClientSession() as session:
            async def single_request():
                async with sem:
                    result.total_requests += 1
                    start = time.time()
                    try:
                        async with session.get(
                            "https://httpbin.org/ip",
                            proxy=proxy_url,
                            timeout=aiohttp.ClientTimeout(total=15),
                        ) as resp:
                            if resp.status == 200:
                                data = await resp.json()
                                latency = (time.time() - start) * 1000
                                result.successful += 1
                                result.latencies.append(latency)
                                result.unique_ips.add(data.get("origin", ""))
                            else:
                                result.failed += 1
                    except Exception:
                        result.failed += 1

            tasks = [single_request() for _ in range(num_requests)]
            await asyncio.gather(*tasks)

        result.end_time = time.time()
        self.results[name] = result
        return result

    def print_comparison(self):
        print(f"{'Provider':<20} {'Success%':<10} {'Avg(ms)':<10} {'P95(ms)':<10} {'RPS':<8} {'IPs':<6}")
        print("-" * 64)
        for name, r in sorted(self.results.items(), key=lambda x: x[1].success_rate, reverse=True):
            print(f"{name:<20} {r.success_rate*100:>6.1f}%   {r.avg_latency:>7.0f}   {r.p95_latency:>7.0f}   {r.throughput:>5.1f}   {len(r.unique_ips):>4}")

# Usage
async def main():
    bench = ProxyBenchmark()

    providers = {
        "Provider A": "http://userA:passA@gate-a.com:7777",
        "Provider B": "http://userB:passB@gate-b.com:8080",
        "Provider C": "http://userC:passC@gate-c.com:5000",
    }

    for name, url in providers.items():
        print(f"Benchmarking {name}...")
        await bench.benchmark_provider(name, url, num_requests=200, concurrency=20)

    print("\n=== Results ===")
    bench.print_comparison()

asyncio.run(main())
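
The geo_accuracy field and target_country parameter above are left unused. Below is a minimal sketch of how you might fill them in after a benchmark run, assuming the free ip-api.com geolocation endpoint; swap in whichever geo-IP service or local database you already use.

import aiohttp

async def measure_geo_accuracy(result: BenchmarkResult, target_country: str) -> float:
    """Check what fraction of the observed exit IPs resolve to the requested country.

    Uses the public ip-api.com JSON endpoint as an example; any geo-IP service
    or a local MaxMind database works the same way.
    """
    matches = 0
    checked = 0
    async with aiohttp.ClientSession() as session:
        for ip in result.unique_ips:
            if not ip:
                continue
            try:
                async with session.get(f"http://ip-api.com/json/{ip}") as resp:
                    data = await resp.json()
                    checked += 1
                    if data.get("countryCode") == target_country:
                        matches += 1
            except Exception:
                continue
    result.geo_accuracy = matches / checked if checked else 0
    return result.geo_accuracy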

FAQ

How many requests should I use for accurate benchmarks?

Minimum 100 requests for basic comparison, 500+ for statistically significant results. Run benchmarks at different times of day since proxy performance varies with network congestion.
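
One way to account for that variance is to repeat the full benchmark on a schedule and keep the per-run results. A rough sketch; the run count and three-hour spacing are arbitrary choices:

import asyncio

async def repeated_benchmark(bench: ProxyBenchmark, providers: dict, runs: int = 4, interval_hours: float = 3):
    """Run the benchmark several times, spaced out, and record success rates per run."""
    history: dict[str, list[float]] = {name: [] for name in providers}
    for run in range(runs):
        for name, url in providers.items():
            result = await bench.benchmark_provider(name, url, num_requests=200, concurrency=20)
            history[name].append(result.success_rate)
        if run < runs - 1:
            await asyncio.sleep(interval_hours * 3600)
    return history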

What metrics matter most when comparing proxy providers?

Success rate is the primary metric — a fast proxy that fails 50% of the time is worthless. After success rate, prioritize P95 latency (not average), geographic accuracy, and IP diversity.
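
If you want a single number to rank providers by, a weighted score over these metrics is one option. The weights below are illustrative only, not a recommendation:

def score_provider(r: BenchmarkResult, max_latency_ms: float = 2000) -> float:
    """Combine the key metrics into a rough 0-1 score. Weights are illustrative only."""
    latency_score = max(0, 1 - r.p95_latency / max_latency_ms)
    ip_diversity = min(1, len(r.unique_ips) / max(r.total_requests, 1))
    # geo_accuracy stays 0 unless you populate it separately (see the sketch above)
    return 0.5 * r.success_rate + 0.25 * latency_score + 0.15 * r.geo_accuracy + 0.1 * ip_diversity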

Should I benchmark against real target sites?

Test against httpbin.org or similar neutral endpoints for fair comparison. Then test against your actual target sites to see real-world performance. Different providers may perform differently against different anti-bot systems.
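
The benchmark_provider method above hardcodes httpbin.org. A small sketch of a follow-up check against a real target, reusing the same proxy URLs; target_url is whatever page you actually scrape:

import aiohttp

async def benchmark_target(proxy_url: str, target_url: str, num_requests: int = 50) -> float:
    """Measure success rate for one proxy against an arbitrary target URL."""
    ok = 0
    async with aiohttp.ClientSession() as session:
        for _ in range(num_requests):
            try:
                async with session.get(
                    target_url,
                    proxy=proxy_url,
                    timeout=aiohttp.ClientTimeout(total=15),
                ) as resp:
                    if resp.status == 200:
                        ok += 1
            except Exception:
                pass
    return ok / num_requests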

Implementation Best Practices

Error Handling and Retry Logic

Production scraping tools must handle failures gracefully. Implement exponential backoff with jitter:

import random
import time

def retry_with_backoff(func, max_retries=3, base_delay=1):
    for attempt in range(max_retries):
        try:
            return func()
        except Exception as e:
            if attempt == max_retries - 1:
                raise
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            print(f"Attempt {attempt + 1} failed: {e}. Retrying in {delay:.1f}s")
            time.sleep(delay)
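
Typical usage wraps the request in a lambda or closure; the proxy URL below is a placeholder:

import requests

proxies = {"http": "http://user:pass@proxy:8080", "https": "http://user:pass@proxy:8080"}

response = retry_with_backoff(
    lambda: requests.get("https://httpbin.org/ip", proxies=proxies, timeout=10),
    max_retries=3,
)
print(response.json())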

Logging Configuration

Set up structured logging for debugging and monitoring:

import logging
import json
from datetime import datetime, timezone

class JSONFormatter(logging.Formatter):
    def format(self, record):
        log_entry = {
            "timestamp": datetime.utcnow().isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "module": record.module,
            "function": record.funcName,
        }
        if record.exc_info:
            log_entry["exception"] = self.formatException(record.exc_info)
        return json.dumps(log_entry)

# Setup
handler = logging.StreamHandler()
handler.setFormatter(JSONFormatter())
logger = logging.getLogger("scraper")
logger.addHandler(handler)
logger.setLevel(logging.INFO)
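
With that in place, every log line is a single JSON object that log aggregators can ingest directly. A quick example of what it looks like in use:

logger.info("Fetched page")
try:
    1 / 0
except ZeroDivisionError:
    logger.exception("Parsing failed")  # exc_info is captured into the "exception" field
# Output: {"timestamp": "...", "level": "INFO", "message": "Fetched page", "module": "...", "function": "..."}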

Configuration Management

Use environment variables and config files for flexibility:

import os
from dataclasses import dataclass

@dataclass
class ScraperConfig:
    proxy_url: str = os.getenv("PROXY_URL", "")
    concurrent_workers: int = int(os.getenv("CONCURRENT_WORKERS", "10"))
    request_timeout: int = int(os.getenv("REQUEST_TIMEOUT", "15"))
    max_retries: int = int(os.getenv("MAX_RETRIES", "3"))
    rate_limit_per_second: float = float(os.getenv("RATE_LIMIT", "5"))
    output_format: str = os.getenv("OUTPUT_FORMAT", "json")
    database_url: str = os.getenv("DATABASE_URL", "sqlite:///results.db")
    log_level: str = os.getenv("LOG_LEVEL", "INFO")

    @classmethod
    def from_yaml(cls, filepath: str):
        import yaml
        with open(filepath) as f:
            config = yaml.safe_load(f)
        # Only keep keys that correspond to actual dataclass fields
        return cls(**{k: v for k, v in config.items() if k in cls.__dataclass_fields__})
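
Loading then looks like this; config.yaml is a hypothetical file kept alongside the service:

# From environment variables (defaults apply when a variable is unset)
config = ScraperConfig()

# Or from a YAML file, e.g. config.yaml containing:
#   concurrent_workers: 25
#   rate_limit_per_second: 2.5
config = ScraperConfig.from_yaml("config.yaml")
print(config.concurrent_workers, config.request_timeout)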

Rate Limiting

Implement token bucket rate limiting to respect target sites:

import asyncio
import time

class RateLimiter:
    def __init__(self, rate: float, burst: int = 1):
        self.rate = rate  # requests per second
        self.burst = burst
        self.tokens = burst
        self.last_refill = time.time()
        self._lock = asyncio.Lock()

    async def acquire(self):
        async with self._lock:
            now = time.time()
            elapsed = now - self.last_refill
            self.tokens = min(self.burst, self.tokens + elapsed * self.rate)
            self.last_refill = now

            if self.tokens >= 1:
                self.tokens -= 1
                return

            # Not enough tokens: wait until one accrues, then consume it
            wait_time = (1 - self.tokens) / self.rate
            await asyncio.sleep(wait_time)
            self.tokens = 0
            self.last_refill = time.time()  # don't count the sleep as refill time on the next call
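
Call acquire() before each request. A sketch combining it with aiohttp; the URLs are whatever pages you are fetching:

import aiohttp

async def polite_fetch(urls: list[str], rate: float = 5):
    limiter = RateLimiter(rate=rate, burst=5)
    results = []
    async with aiohttp.ClientSession() as session:
        for url in urls:
            await limiter.acquire()  # blocks until a token is available
            async with session.get(url) as resp:
                results.append((url, resp.status))
    return results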

Data Validation

Validate scraped data before storage:

from typing import Optional
import re

class DataValidator:
    @staticmethod
    def validate_url(url: str) -> bool:
        pattern = re.compile(
            r'^https?://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'
            r'localhost|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
        return bool(pattern.match(url))

    @staticmethod
    def validate_price(price: Optional[float]) -> bool:
        if price is None:
            return True
        return 0 < price < 1_000_000

    @staticmethod
    def validate_text(text: str, min_length: int = 1, max_length: int = 10000) -> bool:
        return min_length <= len(text.strip()) <= max_length

    def validate_record(self, record: dict) -> tuple:
        errors = []
        if "url" in record and not self.validate_url(record["url"]):
            errors.append("invalid URL")
        if "price" in record and not self.validate_price(record.get("price")):
            errors.append("invalid price")
        if "title" in record and not self.validate_text(record.get("title", ""), 1, 500):
            errors.append("invalid title length")
        return len(errors) == 0, errors
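
In a pipeline you would typically filter out bad records and log the reasons rather than raise. A minimal sketch:

validator = DataValidator()

def filter_valid(records: list[dict]) -> list[dict]:
    """Keep only records that pass validation; report why the rest were dropped."""
    kept = []
    for record in records:
        ok, errors = validator.validate_record(record)
        if ok:
            kept.append(record)
        else:
            print(f"Dropping record {record.get('url', '<no url>')}: {', '.join(errors)}")
    return kept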

Deployment

Running as a Service

# Using systemd
sudo tee /etc/systemd/system/scraper.service > /dev/null << EOF
[Unit]
Description=Web Scraping Service
After=network.target

[Service]
Type=simple
User=scraper
WorkingDirectory=/opt/scraper
ExecStart=/opt/scraper/venv/bin/python main.py
Restart=always
RestartSec=10
Environment=PROXY_URL=http://user:pass@proxy:8080
Environment=LOG_LEVEL=INFO

[Install]
WantedBy=multi-user.target
EOF

sudo systemctl daemon-reload
sudo systemctl enable scraper
sudo systemctl start scraper

Docker Deployment

FROM python:3.12-slim

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

HEALTHCHECK --interval=30s --timeout=10s CMD python -c "import requests; requests.get('http://localhost:8000/health')"

CMD ["python", "main.py"]

Testing

Write tests for your scraping tools:

import requests

# DataValidator is the class from the Data Validation section above; adjust the
# import path to match where it lives in your project.
from validators import DataValidator

class TestProxyIntegration:
    def test_proxy_connectivity(self):
        proxy = {"http": "http://user:pass@proxy:8080", "https": "http://user:pass@proxy:8080"}
        response = requests.get("https://httpbin.org/ip", proxies=proxy, timeout=10)
        assert response.status_code == 200
        assert "origin" in response.json()

    def test_proxy_rotation(self):
        ips = set()
        proxy = {"http": "http://user:pass@rotating-proxy:8080"}
        for _ in range(5):
            response = requests.get("https://httpbin.org/ip", proxies=proxy, timeout=10)
            ips.add(response.json()["origin"])
        assert len(ips) > 1, "Proxy should rotate IPs"

    def test_data_validation(self):
        validator = DataValidator()
        valid, errors = validator.validate_record({
            "url": "https://example.com",
            "title": "Test Product",
            "price": 29.99,
        })
        assert valid
        assert len(errors) == 0

For proxy infrastructure guidance, see our proxy pool management guide and web scraping proxy overview.

