IP Rotation Script: Automatic Proxy Switching

IP rotation is the foundation of undetectable web scraping. This guide builds a practical rotation script that switches proxy IPs based on configurable triggers — request count, time intervals, or error detection.

Basic Rotation Script

import requests
import time
import random
from itertools import cycle
from typing import List, Optional

class IPRotator:
    def __init__(self, proxies: List[str], strategy: str = "round_robin"):
        self.proxies = proxies
        self.strategy = strategy
        self.current_index = 0
        self.request_count = 0
        self.last_rotation = time.time()
        self._proxy_cycle = cycle(proxies)

    def get_proxy(self) -> dict:
        if self.strategy == "round_robin":
            proxy = next(self._proxy_cycle)
        elif self.strategy == "random":
            proxy = random.choice(self.proxies)
        elif self.strategy == "sequential":
            proxy = self.proxies[self.current_index % len(self.proxies)]
            self.current_index += 1
        else:
            proxy = self.proxies[0]

        return {"http": proxy, "https": proxy}

    def should_rotate(self, max_requests: int = 10, max_seconds: int = 60) -> bool:
        # Reset both counters whenever either trigger fires so one trigger
        # does not immediately re-fire the other on the next call.
        if (self.request_count >= max_requests
                or time.time() - self.last_rotation >= max_seconds):
            self.request_count = 0
            self.last_rotation = time.time()
            return True
        return False

    def make_request(self, url: str, retries: int = 3, **kwargs) -> Optional[requests.Response]:
        proxy = self.get_proxy()
        self.request_count += 1

        try:
            response = requests.get(url, proxies=proxy, timeout=15, **kwargs)
            if response.status_code in (429, 403):
                # Rate limited or blocked: force rotation and retry once
                proxy = self.get_proxy()
                time.sleep(random.uniform(2, 5))
                response = requests.get(url, proxies=proxy, timeout=15, **kwargs)
            return response
        except requests.exceptions.ProxyError:
            # Bad proxy: rotate and retry, but cap the recursion depth
            if retries > 0:
                return self.make_request(url, retries=retries - 1, **kwargs)
            return None
        except requests.exceptions.RequestException as e:
            print(f"Request failed: {e}")
            return None

# Usage
proxies = [
    "http://user:pass@proxy1.example.com:8080",
    "http://user:pass@proxy2.example.com:8080",
    "http://user:pass@proxy3.example.com:8080",
]

rotator = IPRotator(proxies, strategy="round_robin")

urls = [f"https://example.com/page/{i}" for i in range(100)]
for url in urls:
    response = rotator.make_request(url)
    if response:
        print(f"OK: {url} ({response.status_code})")
    time.sleep(random.uniform(0.5, 2))
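
The should_rotate trigger is not wired into make_request; it is there for workflows where you hold one proxy for a batch of requests and only switch when a threshold fires. A minimal sketch, continuing the script above (the thresholds of 5 requests / 30 seconds are illustrative, not recommendations):

# Proactive rotation: keep one proxy until a request-count or time
# threshold fires, then move to the next IP in the pool.
rotator = IPRotator(proxies, strategy="round_robin")
proxy = rotator.get_proxy()

for url in urls:
    if rotator.should_rotate(max_requests=5, max_seconds=30):
        proxy = rotator.get_proxy()
    response = requests.get(url, proxies=proxy, timeout=15)
    rotator.request_count += 1  # count requests made outside make_request
    print(f"{url}: {response.status_code}")
    time.sleep(random.uniform(0.5, 2))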

Bash IP Rotation Script

#!/bin/bash
# rotate-proxies.sh - Simple proxy rotation for curl

PROXIES=(
    "http://user:pass@proxy1.example.com:8080"
    "http://user:pass@proxy2.example.com:8080"
    "http://user:pass@proxy3.example.com:8080"
)

PROXY_COUNT=${#PROXIES[@]}
INDEX=0

rotate_proxy() {
    CURRENT_PROXY=${PROXIES[$INDEX]}
    PROXY_NUM=$((INDEX + 1))
    INDEX=$(( (INDEX + 1) % PROXY_COUNT ))
}

mkdir -p output

# Scrape with rotation
while IFS= read -r url; do
    rotate_proxy
    echo "Scraping: $url via proxy $PROXY_NUM"
    curl -s -x "$CURRENT_PROXY" -o "output/$(echo "$url" | md5sum | cut -d' ' -f1).html" "$url"
    sleep $((RANDOM % 3 + 1))
done < urls.txt

FAQ

What is the optimal rotation frequency?

Rotate every 5-20 requests for general scraping, or after every request for aggressive anti-bot sites. For residential proxies, backconnect gateways handle rotation automatically — see our proxy rotation guide.

Should I rotate on errors or on a schedule?

Both. Use time or request-based rotation as the baseline, and force immediate rotation on 403/429 errors. This provides both proactive and reactive IP management.

How many IPs do I need for effective rotation?

At minimum 10 for light scraping, 50-100 for medium, and 1,000+ for heavy operations. The more IPs in your pool, the longer the gap between reusing the same IP, reducing detection risk.

Implementation Best Practices

Error Handling and Retry Logic

Production scraping tools must handle failures gracefully. Implement exponential backoff with jitter:

import random
import time

def retry_with_backoff(func, max_retries=3, base_delay=1):
    for attempt in range(max_retries):
        try:
            return func()
        except Exception as e:
            if attempt == max_retries - 1:
                raise
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            print(f"Attempt {attempt + 1} failed: {e}. Retrying in {delay:.1f}s")
            time.sleep(delay)
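
A quick usage sketch (the proxy URL and endpoint are placeholders): wrap the proxied request in a zero-argument callable and hand it to the helper; raise_for_status() turns HTTP errors into exceptions the backoff loop can retry.

import requests

def fetch():
    proxy = {
        "http": "http://user:pass@proxy1.example.com:8080",
        "https": "http://user:pass@proxy1.example.com:8080",
    }
    response = requests.get("https://example.com/api/items", proxies=proxy, timeout=15)
    response.raise_for_status()  # raise on 4xx/5xx so the retry loop catches it
    return response

response = retry_with_backoff(fetch, max_retries=4, base_delay=2)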

Logging Configuration

Set up structured logging for debugging and monitoring:

import logging
import json
from datetime import datetime, timezone

class JSONFormatter(logging.Formatter):
    def format(self, record):
        log_entry = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "module": record.module,
            "function": record.funcName,
        }
        if record.exc_info:
            log_entry["exception"] = self.formatException(record.exc_info)
        return json.dumps(log_entry)

# Setup
handler = logging.StreamHandler()
handler.setFormatter(JSONFormatter())
logger = logging.getLogger("scraper")
logger.addHandler(handler)
logger.setLevel(logging.INFO)
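
Once configured, every log call is emitted as a single JSON object per line, which drops straight into log aggregators. For example:

logger.info("Fetched https://example.com/page/1 via proxy 2")
logger.warning("Received 429, rotating proxy")
# Output (one JSON object per line), roughly:
# {"timestamp": "2025-01-01T12:00:00+00:00", "level": "INFO", "message": "Fetched ...", "module": "...", "function": "..."}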

Configuration Management

Use environment variables and config files for flexibility:

import os
from dataclasses import dataclass

@dataclass
class ScraperConfig:
    proxy_url: str = os.getenv("PROXY_URL", "")
    concurrent_workers: int = int(os.getenv("CONCURRENT_WORKERS", "10"))
    request_timeout: int = int(os.getenv("REQUEST_TIMEOUT", "15"))
    max_retries: int = int(os.getenv("MAX_RETRIES", "3"))
    rate_limit_per_second: float = float(os.getenv("RATE_LIMIT", "5"))
    output_format: str = os.getenv("OUTPUT_FORMAT", "json")
    database_url: str = os.getenv("DATABASE_URL", "sqlite:///results.db")
    log_level: str = os.getenv("LOG_LEVEL", "INFO")

    @classmethod
    def from_yaml(cls, filepath: str):
        import yaml
        with open(filepath) as f:
            config = yaml.safe_load(f)
        return cls(**{k: v for k, v in config.items() if hasattr(cls, k)})
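
A brief usage sketch; the scraper.yaml shown is hypothetical, and any field it omits keeps the dataclass default (which reads the environment at import time):

# Purely from environment variables / defaults
config = ScraperConfig()

# Or override selected fields from a YAML file, e.g. a scraper.yaml containing:
#   concurrent_workers: 25
#   rate_limit_per_second: 2.5
#   output_format: csv
config = ScraperConfig.from_yaml("scraper.yaml")
print(config.concurrent_workers, config.request_timeout)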

Rate Limiting

Implement token bucket rate limiting to respect target sites:

import asyncio
import time

class RateLimiter:
    def __init__(self, rate: float, burst: int = 1):
        self.rate = rate  # requests per second
        self.burst = burst
        self.tokens = burst
        self.last_refill = time.time()
        self._lock = asyncio.Lock()

    async def acquire(self):
        async with self._lock:
            now = time.time()
            elapsed = now - self.last_refill
            self.tokens = min(self.burst, self.tokens + elapsed * self.rate)
            self.last_refill = now

            if self.tokens >= 1:
                self.tokens -= 1
                return
            # Not enough tokens: wait until one accrues, then consume it.
            # Resetting last_refill after the sleep prevents the wait time
            # from being credited again on the next call.
            wait_time = (1 - self.tokens) / self.rate
            await asyncio.sleep(wait_time)
            self.last_refill = time.time()
            self.tokens = 0
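
A minimal usage sketch (the URLs and rates are placeholders): each worker awaits a token before its request, and the blocking requests.get call runs in a thread via asyncio.to_thread so the event loop stays responsive.

import asyncio
import requests

limiter = RateLimiter(rate=5, burst=5)  # at most ~5 requests per second

async def fetch(url: str):
    await limiter.acquire()  # wait for a token before each request
    # Run the blocking HTTP call in a worker thread
    return await asyncio.to_thread(requests.get, url, timeout=15)

async def main():
    urls = [f"https://example.com/page/{i}" for i in range(20)]
    responses = await asyncio.gather(*(fetch(u) for u in urls))
    print(sum(1 for r in responses if r.status_code == 200), "succeeded")

asyncio.run(main())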

Data Validation

Validate scraped data before storage:

from typing import Optional, List
import re

class DataValidator:
    @staticmethod
    def validate_url(url: str) -> bool:
        pattern = re.compile(
            r'^https?://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'
            r'localhost|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
        return bool(pattern.match(url))

    @staticmethod
    def validate_price(price: Optional[float]) -> bool:
        if price is None:
            return True
        return 0 < price < 1_000_000

    @staticmethod
    def validate_text(text: str, min_length: int = 1, max_length: int = 10000) -> bool:
        return min_length <= len(text.strip()) <= max_length

    def validate_record(self, record: dict) -> tuple:
        errors = []
        if "url" in record and not self.validate_url(record["url"]):
            errors.append("invalid URL")
        if "price" in record and not self.validate_price(record.get("price")):
            errors.append("invalid price")
        if "title" in record and not self.validate_text(record.get("title", ""), 1, 500):
            errors.append("invalid title length")
        return len(errors) == 0, errors
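
A short example of gating records before storage (the sample records are made up):

validator = DataValidator()
scraped = [
    {"url": "https://example.com/item/1", "title": "Blue Widget", "price": 19.99},
    {"url": "not-a-url", "title": "", "price": -5},
]

clean = []
for record in scraped:
    ok, errors = validator.validate_record(record)
    if ok:
        clean.append(record)
    else:
        print(f"Dropping record: {', '.join(errors)}")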

Deployment

Running as a Service

# Using systemd
sudo tee /etc/systemd/system/scraper.service > /dev/null << EOF
[Unit]
Description=Web Scraping Service
After=network.target

[Service]
Type=simple
User=scraper
WorkingDirectory=/opt/scraper
ExecStart=/opt/scraper/venv/bin/python main.py
Restart=always
RestartSec=10
Environment=PROXY_URL=http://user:pass@proxy:8080
Environment=LOG_LEVEL=INFO

[Install]
WantedBy=multi-user.target
EOF

sudo systemctl daemon-reload
sudo systemctl enable --now scraper

Docker Deployment

FROM python:3.12-slim

WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

HEALTHCHECK --interval=30s --timeout=10s CMD python -c "import requests; requests.get('http://localhost:8000/health', timeout=5).raise_for_status()"

CMD ["python", "main.py"]

Testing

Write tests for your scraping tools:

import requests

# The integration tests below need working proxy credentials; DataValidator
# is the class from the Data Validation section above.

class TestProxyIntegration:
    def test_proxy_connectivity(self):
        proxy = {
            "http": "http://user:pass@proxy:8080",
            "https": "http://user:pass@proxy:8080",
        }
        response = requests.get("https://httpbin.org/ip", proxies=proxy, timeout=10)
        assert response.status_code == 200
        assert "origin" in response.json()

    def test_proxy_rotation(self):
        ips = set()
        for _ in range(5):
            # The gateway assigns a new exit IP per connection
            proxy = {
                "http": "http://user:pass@rotating-proxy:8080",
                "https": "http://user:pass@rotating-proxy:8080",
            }
            response = requests.get("https://httpbin.org/ip", proxies=proxy, timeout=10)
            ips.add(response.json()["origin"])
        assert len(ips) > 1, "Proxy should rotate IPs"

    def test_data_validation(self):
        validator = DataValidator()
        valid, errors = validator.validate_record({
            "url": "https://example.com",
            "title": "Test Product",
            "price": 29.99,
        })
        assert valid
        assert len(errors) == 0
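
Unit tests that avoid the network are also worth having. As one sketch, assuming the RateLimiter class from the rate-limiting section is importable, this timing check asserts that bursts are actually spaced out:

import asyncio
import time

def test_rate_limiter_throttles():
    async def run():
        limiter = RateLimiter(rate=10, burst=1)  # ~10 requests/second
        start = time.time()
        for _ in range(5):
            await limiter.acquire()
        return time.time() - start

    elapsed = asyncio.run(run())
    # One token up front, then four waits of roughly 0.1s each
    assert elapsed >= 0.3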

For proxy infrastructure guidance, see our proxy pool management guide and web scraping proxy overview.

