Build a Proxy Benchmarking Tool: Compare Providers
Comparing proxy providers objectively requires standardized testing. A benchmarking tool measures latency, throughput, success rate, geographic accuracy, and IP diversity across multiple providers under identical conditions.
Benchmarking Framework
import asyncio
import aiohttp
import time
import json
import statistics
from dataclasses import dataclass, field
from typing import List, Dict
@dataclass
class BenchmarkResult:
    provider: str
    proxy_type: str
    total_requests: int = 0
    successful: int = 0
    failed: int = 0
    latencies: List[float] = field(default_factory=list)
    unique_ips: set = field(default_factory=set)
    geo_accuracy: float = 0
    start_time: float = 0
    end_time: float = 0

    @property
    def success_rate(self):
        return self.successful / self.total_requests if self.total_requests else 0

    @property
    def avg_latency(self):
        return statistics.mean(self.latencies) if self.latencies else 0

    @property
    def p95_latency(self):
        if not self.latencies:
            return 0
        sorted_l = sorted(self.latencies)
        return sorted_l[int(len(sorted_l) * 0.95)]

    @property
    def throughput(self):
        duration = self.end_time - self.start_time
        return self.successful / duration if duration > 0 else 0
class ProxyBenchmark:
    def __init__(self):
        self.results: Dict[str, BenchmarkResult] = {}
    async def benchmark_provider(
        self,
        name: str,
        proxy_url: str,
        proxy_type: str = "residential",
        num_requests: int = 100,
        concurrency: int = 10,
        target_country: str = "US",  # reserved for geo-accuracy checks (see FAQ below)
    ) -> BenchmarkResult:
        result = BenchmarkResult(provider=name, proxy_type=proxy_type)
        result.start_time = time.time()
        sem = asyncio.Semaphore(concurrency)  # cap in-flight requests

        async with aiohttp.ClientSession() as session:

            async def single_request():
                async with sem:
                    result.total_requests += 1
                    start = time.time()
                    try:
                        async with session.get(
                            "https://httpbin.org/ip",  # neutral echo endpoint
                            proxy=proxy_url,
                            timeout=aiohttp.ClientTimeout(total=15),
                        ) as resp:
                            if resp.status == 200:
                                data = await resp.json()
                                latency = (time.time() - start) * 1000  # ms
                                result.successful += 1
                                result.latencies.append(latency)
                                result.unique_ips.add(data.get("origin", ""))
                            else:
                                result.failed += 1
                    except Exception:
                        result.failed += 1

            tasks = [single_request() for _ in range(num_requests)]
            await asyncio.gather(*tasks)

        result.end_time = time.time()
        self.results[name] = result
        return result
    def print_comparison(self):
        print(f"{'Provider':<20} {'Success%':<10} {'Avg(ms)':<10} {'P95(ms)':<10} {'RPS':<8} {'IPs':<6}")
        print("-" * 64)
        for name, r in sorted(self.results.items(), key=lambda x: x[1].success_rate, reverse=True):
            print(f"{name:<20} {r.success_rate*100:>6.1f}% {r.avg_latency:>7.0f} {r.p95_latency:>7.0f} {r.throughput:>5.1f} {len(r.unique_ips):>4}")
# Usage
async def main():
    bench = ProxyBenchmark()
    providers = {
        "Provider A": "http://userA:passA@gate-a.com:7777",
        "Provider B": "http://userB:passB@gate-b.com:8080",
        "Provider C": "http://userC:passC@gate-c.com:5000",
    }
    for name, url in providers.items():
        print(f"Benchmarking {name}...")
        await bench.benchmark_provider(name, url, num_requests=200, concurrency=20)
    print("\n=== Results ===")
    bench.print_comparison()

asyncio.run(main())

FAQ
How many requests should I use for accurate benchmarks?
Minimum 100 requests for basic comparison, 500+ for statistically significant results. Run benchmarks at different times of day since proxy performance varies with network congestion.
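If you want repeat runs built in, here is a minimal sketch that wraps the ProxyBenchmark class above; the helper name, round count, and one-hour pause are illustrative choices, not fixed requirements:

# Repeat the benchmark at intervals and record each run separately,
# so time-of-day variation shows up in the comparison table.
async def repeated_benchmark(bench: ProxyBenchmark, name: str, url: str,
                             rounds: int = 3, pause_s: int = 3600):
    for i in range(rounds):
        await bench.benchmark_provider(f"{name} (run {i + 1})", url, num_requests=500)
        if i < rounds - 1:
            await asyncio.sleep(pause_s)  # wait before the next round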
What metrics matter most when comparing proxy providers?
Success rate is the primary metric — a fast proxy that fails 50% of the time is worthless. After success rate, prioritize P95 latency (not average), geographic accuracy, and IP diversity.
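The geo_accuracy field in BenchmarkResult is a natural place to record geographic accuracy. A hedged sketch of how you might compute it, assuming the free ip-api.com lookup endpoint and its countryCode response field:

import aiohttp

async def measure_geo_accuracy(ips: set, target_country: str = "US") -> float:
    # Resolve each exit IP to a country and report the share that matched.
    hits = 0
    async with aiohttp.ClientSession() as session:
        for ip in ips:
            try:
                async with session.get(f"http://ip-api.com/json/{ip}") as resp:
                    data = await resp.json()
                    hits += data.get("countryCode") == target_country
            except Exception:
                pass
    return hits / len(ips) if ips else 0.0

Call it with result.unique_ips after a run and store the ratio in result.geo_accuracy.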
Should I benchmark against real target sites?
Test against httpbin.org or similar neutral endpoints for fair comparison. Then test against your actual target sites to see real-world performance. Different providers may perform differently against different anti-bot systems.
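As a sketch of that second step, the helper below (its name and defaults are illustrative) reports a raw success rate against any URL routed through a proxy:

import aiohttp

async def target_success_rate(proxy_url: str, target_url: str, n: int = 50) -> float:
    # Fire n sequential requests at the real target through the proxy.
    ok = 0
    async with aiohttp.ClientSession() as session:
        for _ in range(n):
            try:
                async with session.get(target_url, proxy=proxy_url,
                                       timeout=aiohttp.ClientTimeout(total=15)) as resp:
                    ok += resp.status == 200
            except Exception:
                pass
    return ok / n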
Implementation Best Practices
Error Handling and Retry Logic
Production scraping tools must handle failures gracefully. Implement exponential backoff with jitter:
import random
import time
def retry_with_backoff(func, max_retries=3, base_delay=1):
    for attempt in range(max_retries):
        try:
            return func()
        except Exception as e:
            if attempt == max_retries - 1:
                raise
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            print(f"Attempt {attempt + 1} failed: {e}. Retrying in {delay:.1f}s")
            time.sleep(delay)

Logging Configuration
Set up structured logging for debugging and monitoring:
import logging
import json
from datetime import datetime
class JSONFormatter(logging.Formatter):
    def format(self, record):
        log_entry = {
            "timestamp": datetime.utcnow().isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "module": record.module,
            "function": record.funcName,
        }
        if record.exc_info:
            log_entry["exception"] = self.formatException(record.exc_info)
        return json.dumps(log_entry)

# Setup
handler = logging.StreamHandler()
handler.setFormatter(JSONFormatter())
logger = logging.getLogger("scraper")
logger.addHandler(handler)
logger.setLevel(logging.INFO)

Configuration Management
Use environment variables and config files for flexibility:
import os
from dataclasses import dataclass
@dataclass
class ScraperConfig:
    proxy_url: str = os.getenv("PROXY_URL", "")
    concurrent_workers: int = int(os.getenv("CONCURRENT_WORKERS", "10"))
    request_timeout: int = int(os.getenv("REQUEST_TIMEOUT", "15"))
    max_retries: int = int(os.getenv("MAX_RETRIES", "3"))
    rate_limit_per_second: float = float(os.getenv("RATE_LIMIT", "5"))
    output_format: str = os.getenv("OUTPUT_FORMAT", "json")
    database_url: str = os.getenv("DATABASE_URL", "sqlite:///results.db")
    log_level: str = os.getenv("LOG_LEVEL", "INFO")

    @classmethod
    def from_yaml(cls, filepath: str):
        import yaml
        with open(filepath) as f:
            config = yaml.safe_load(f)
        return cls(**{k: v for k, v in config.items() if hasattr(cls, k)})
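For illustration, a hypothetical config.yaml could look like the snippet below; only keys that match the dataclass field names are picked up:

# config.yaml (hypothetical values)
proxy_url: "http://user:pass@proxy:8080"
concurrent_workers: 20
request_timeout: 30
rate_limit_per_second: 2.5

Loading it is then a one-liner: config = ScraperConfig.from_yaml("config.yaml").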
Rate Limiting
Implement token bucket rate limiting to respect target sites:
import asyncio
import time
class RateLimiter:
    def __init__(self, rate: float, burst: int = 1):
        self.rate = rate  # requests per second
        self.burst = burst
        self.tokens = burst
        self.last_refill = time.time()
        self._lock = asyncio.Lock()

    async def acquire(self):
        async with self._lock:
            now = time.time()
            elapsed = now - self.last_refill
            self.tokens = min(self.burst, self.tokens + elapsed * self.rate)
            self.last_refill = now
            if self.tokens >= 1:
                self.tokens -= 1
                return
            # Not enough tokens: wait for one to accrue, then consume it.
            wait_time = (1 - self.tokens) / self.rate
            await asyncio.sleep(wait_time)
            self.last_refill = time.time()  # avoid double-counting the sleep on the next refill
            self.tokens = 0
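A minimal usage sketch; the fetch helper and the five-requests-per-second budget are illustrative:

import aiohttp

limiter = RateLimiter(rate=5, burst=5)  # at most ~5 requests per second

async def fetch(session: aiohttp.ClientSession, url: str) -> str:
    await limiter.acquire()  # blocks until a token is available
    async with session.get(url) as resp:
        return await resp.text()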
Data Validation
Validate scraped data before storage:
from typing import Optional
import re
class DataValidator:
    @staticmethod
    def validate_url(url: str) -> bool:
        pattern = re.compile(
            r'^https?://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'
            r'localhost|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
        return bool(pattern.match(url))

    @staticmethod
    def validate_price(price: Optional[float]) -> bool:
        if price is None:
            return True
        return 0 < price < 1_000_000

    @staticmethod
    def validate_text(text: str, min_length: int = 1, max_length: int = 10000) -> bool:
        return min_length <= len(text.strip()) <= max_length

    def validate_record(self, record: dict) -> tuple:
        errors = []
        if "url" in record and not self.validate_url(record["url"]):
            errors.append("invalid URL")
        if "price" in record and not self.validate_price(record.get("price")):
            errors.append("invalid price")
        if "title" in record and not self.validate_text(record.get("title", ""), 1, 500):
            errors.append("invalid title length")
        return len(errors) == 0, errors

Deployment
Running as a Service
# Using systemd (sudo tee is used because "sudo cat >" would redirect without privileges)
sudo tee /etc/systemd/system/scraper.service > /dev/null << 'EOF'
[Unit]
Description=Web Scraping Service
After=network.target

[Service]
Type=simple
User=scraper
WorkingDirectory=/opt/scraper
ExecStart=/opt/scraper/venv/bin/python main.py
Restart=always
RestartSec=10
Environment=PROXY_URL=http://user:pass@proxy:8080
Environment=LOG_LEVEL=INFO

[Install]
WantedBy=multi-user.target
EOF

sudo systemctl daemon-reload
sudo systemctl enable scraper
sudo systemctl start scraper
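To confirm the unit is healthy, standard systemd tooling applies:

sudo systemctl status scraper
journalctl -u scraper -f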
Docker Deployment
FROM python:3.12-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
HEALTHCHECK --interval=30s --timeout=10s CMD python -c "import requests; requests.get('http://localhost:8000/health').raise_for_status()"
CMD ["python", "main.py"]
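A typical build-and-run sequence; the image tag and credentials are placeholders:

docker build -t scraper .
docker run --rm -e PROXY_URL=http://user:pass@proxy:8080 scraper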
Testing
Write tests for your scraping tools:
import pytest
import requests

class TestProxyIntegration:
    def test_proxy_connectivity(self):
        proxy = {"http": "http://user:pass@proxy:8080", "https": "http://user:pass@proxy:8080"}
        response = requests.get("https://httpbin.org/ip", proxies=proxy, timeout=10)
        assert response.status_code == 200
        assert "origin" in response.json()

    def test_proxy_rotation(self):
        # Both keys are needed so HTTPS requests are also routed through the proxy
        proxy = {"http": "http://user:pass@rotating-proxy:8080",
                 "https": "http://user:pass@rotating-proxy:8080"}
        ips = set()
        for _ in range(5):
            response = requests.get("https://httpbin.org/ip", proxies=proxy, timeout=10)
            ips.add(response.json()["origin"])
        assert len(ips) > 1, "Proxy should rotate IPs"

    def test_data_validation(self):
        validator = DataValidator()
        valid, errors = validator.validate_record({
            "url": "https://example.com",
            "title": "Test Product",
            "price": 29.99,
        })
        assert valid
        assert len(errors) == 0

For proxy infrastructure guidance, see our proxy pool management guide and web scraping proxy overview.
Related Reading
- Build an Anti-Detection Test Suite: Verify Browser Stealth
- Build a News Crawler in Python: Step-by-Step Tutorial
- Build a Proxy Rotator in Python: Complete Tutorial
- AJAX Request Interception: Scraping API Calls Directly
- Azure Functions for Serverless Web Scraping: the Complete Guide
- Bandwidth Optimization for Proxies: Reduce Costs & Increase Speed
- How to Configure Proxies on iPhone and Android
- How to Use Proxies in Node.js (Axios, Fetch, Puppeteer)