Building Your Own Rotating Proxy Pool

A rotating proxy pool automatically cycles through multiple proxy IPs, distributing your requests across different addresses to avoid detection and rate limits. While commercial proxy services handle rotation for you, building your own pool gives you full control over rotation logic, health checking, and source management.

Pool Architecture

┌─────────────────────────────────────┐
│           Proxy Pool Manager         │
│                                      │
│  ┌──────────┐  ┌──────────────────┐ │
│  │  Proxy   │  │  Health Checker  │ │
│  │  Sources │  │  (background)    │ │
│  └─────┬────┘  └────────┬─────────┘ │
│        │                │            │
│  ┌─────┴────────────────┴─────────┐ │
│  │         Active Pool            │ │
│  │  [proxy1, proxy2, ..., proxyN] │ │
│  └─────────────┬──────────────────┘ │
│                │                     │
│  ┌─────────────┴──────────────────┐ │
│  │      Rotation Strategy         │ │
│  │  round-robin / weighted /      │ │
│  │  random / least-used           │ │
│  └────────────────────────────────┘ │
└─────────────────────────────────────┘

Implementation

import asyncio
import httpx
import time
import random
from dataclasses import dataclass, field
from typing import List, Optional
from collections import defaultdict

@dataclass
class ProxyInfo:
    """Metadata and runtime statistics for a single proxy endpoint."""

    url: str
    protocol: str = 'http'
    country: str = ''
    provider: str = 'unknown'
    healthy: bool = True
    latency_ms: float = 0
    success_count: int = 0
    fail_count: int = 0
    last_used: float = 0
    last_checked: float = 0
    cooldown_until: float = 0

    @property
    def success_rate(self) -> float:
        """Fraction of successful requests; optimistically 1.0 when untried."""
        attempts = self.success_count + self.fail_count
        if not attempts:
            return 1.0
        return self.success_count / attempts

    @property
    def score(self) -> float:
        """Quality metric: success rate scaled by inverse latency (capped at 1 ms)."""
        return (1000 / max(self.latency_ms, 1)) * self.success_rate

class RotatingProxyPool:
    def __init__(self):
        self.proxies: List[ProxyInfo] = []
        self.domain_proxy_map = defaultdict(list)  # Track per-domain usage
        self._lock = asyncio.Lock()
    
    def add_proxy(self, url, **kwargs):
        proxy = ProxyInfo(url=url, **kwargs)
        self.proxies.append(proxy)
    
    def add_proxies(self, proxy_urls, **kwargs):
        for url in proxy_urls:
            self.add_proxy(url, **kwargs)
    
    async def get_proxy(self, strategy='weighted', domain=None, country=None) -> Optional[ProxyInfo]:
        async with self._lock:
            candidates = [
                p for p in self.proxies
                if p.healthy
                and time.time() > p.cooldown_until
                and (country is None or p.country == country)
            ]
            
            if not candidates:
                return None
            
            if strategy == 'round_robin':
                # Least recently used
                proxy = min(candidates, key=lambda p: p.last_used)
            elif strategy == 'random':
                proxy = random.choice(candidates)
            elif strategy == 'weighted':
                # Weight by success rate and speed
                weights = [max(p.score, 0.1) for p in candidates]
                proxy = random.choices(candidates, weights=weights, k=1)[0]
            elif strategy == 'sticky':
                # Same proxy per domain
                if domain and domain in self.domain_proxy_map:
                    prev = self.domain_proxy_map[domain]
                    for p in prev:
                        if p in candidates:
                            proxy = p
                            break
                    else:
                        proxy = random.choice(candidates)
                else:
                    proxy = random.choice(candidates)
                if domain:
                    self.domain_proxy_map[domain] = [proxy]
            else:
                proxy = random.choice(candidates)
            
            proxy.last_used = time.time()
            return proxy
    
    def report_success(self, proxy: ProxyInfo, latency_ms: float):
        proxy.success_count += 1
        proxy.latency_ms = (proxy.latency_ms * 0.7) + (latency_ms * 0.3)
        proxy.healthy = True
    
    def report_failure(self, proxy: ProxyInfo, cooldown=30):
        proxy.fail_count += 1
        if proxy.success_rate < 0.3:
            proxy.healthy = False
        else:
            proxy.cooldown_until = time.time() + cooldown
    
    async def health_check_all(self, test_url='https://httpbin.org/ip'):
        async with httpx.AsyncClient(timeout=10) as client:
            for proxy in self.proxies:
                try:
                    start = time.time()
                    response = await client.get(
                        test_url,
                        proxy=proxy.url,
                    )
                    latency = (time.time() - start) * 1000
                    
                    if response.status_code == 200:
                        self.report_success(proxy, latency)
                    else:
                        self.report_failure(proxy)
                except Exception:
                    self.report_failure(proxy)
                
                proxy.last_checked = time.time()
        
        healthy = sum(1 for p in self.proxies if p.healthy)
        print(f"Health check: {healthy}/{len(self.proxies)} healthy")
    
    def stats(self):
        healthy = [p for p in self.proxies if p.healthy]
        return {
            'total': len(self.proxies),
            'healthy': len(healthy),
            'avg_latency_ms': sum(p.latency_ms for p in healthy) / len(healthy) if healthy else 0,
            'avg_success_rate': sum(p.success_rate for p in healthy) / len(healthy) if healthy else 0,
        }

# Usage
async def main():
    pool = RotatingProxyPool()
    
    # Add proxies from multiple sources
    pool.add_proxies([
        'http://user:pass@proxy1.com:8080',
        'http://user:pass@proxy2.com:8080',
        'http://user:pass@proxy3.com:8080',
    ], provider='provider_a')
    
    pool.add_proxies([
        'socks5://user:pass@socks1.com:1080',
        'socks5://user:pass@socks2.com:1080',
    ], provider='provider_b', protocol='socks5')
    
    # Health check
    await pool.health_check_all()
    
    # Use in scraping
    async with httpx.AsyncClient(timeout=30) as client:
        for i in range(100):
            proxy = await pool.get_proxy(strategy='weighted')
            if not proxy:
                print("No healthy proxies!")
                break
            
            try:
                start = time.time()
                response = await client.get(
                    f'https://example.com/page/{i}',
                    proxy=proxy.url,
                )
                latency = (time.time() - start) * 1000
                pool.report_success(proxy, latency)
            except Exception:
                pool.report_failure(proxy)
    
    print(pool.stats())

asyncio.run(main())

Internal Links

FAQ

How many proxies do I need in a rotation pool?

For basic scraping, 10-50 proxies provide adequate rotation. For aggressive scraping of protected sites, 100-1,000+ proxies reduce per-IP request rates enough to avoid detection. The optimal number depends on your target request rate and the target site’s rate limits.

Should I mix proxy types in one pool?

Yes, for resilience. Combine datacenter proxies (fast, cheap) with residential proxies (better anti-detection) and use smart routing: datacenter for easy targets, residential for protected sites.

How often should I health-check proxies?

Check every 30-60 seconds for active pools. For larger pools (1,000+ proxies), stagger checks to avoid overwhelming the test endpoint. Combine active health checks with passive monitoring (tracking success/failure of real requests).

What is the best rotation strategy?

Weighted rotation based on success rate and latency works best for most use cases. It automatically favors reliable, fast proxies while still using slower ones occasionally. Pure round-robin treats all proxies equally, which wastes time on slow proxies.

How do I handle proxy cooldowns?

When a proxy gets rate-limited (429 response), put it on cooldown for 30-300 seconds depending on the target site. Track cooldowns per domain, not globally — a proxy blocked on site A may work fine for site B.


Related Reading

Scroll to Top