Build an Anti-Detection Test Suite: Verify Browser Stealth
Before you deploy a scraping setup against real targets, an anti-detection test suite checks whether your browser configuration, proxy setup, and request patterns are likely to be flagged by bot detection systems. Automated testing catches these issues before they cost you IP bans or account suspensions.
Test Suite
import asyncio
import json

from playwright.async_api import async_playwright

class AntiDetectionTestSuite:
    def __init__(self):
        self.results = []

    async def run_all(self, browser_ws: str | None = None):
        async with async_playwright() as p:
            if browser_ws:
                browser = await p.chromium.connect_over_cdp(browser_ws)
            else:
                browser = await p.chromium.launch(headless=True)
            context = await browser.new_context()
            page = await context.new_page()
            tests = [
                ("WebDriver Detection", self._test_webdriver),
                ("Navigator Properties", self._test_navigator),
                ("Chrome Runtime", self._test_chrome_runtime),
                ("Permissions API", self._test_permissions),
                ("WebGL Vendor", self._test_webgl),
                ("Canvas Fingerprint", self._test_canvas),
                ("Audio Fingerprint", self._test_audio_context),
                ("Timezone Consistency", self._test_timezone),
                ("Language Consistency", self._test_language),
                ("Screen Dimensions", self._test_screen),
                ("Plugin Count", self._test_plugins),
                ("WebRTC Leak", self._test_webrtc),
            ]
            for name, test_fn in tests:
                try:
                    passed, details = await test_fn(page)
                    self.results.append({"test": name, "passed": passed, "details": details})
                    status = "PASS" if passed else "FAIL"
                    print(f"  [{status}] {name}: {details}")
                except Exception as e:
                    self.results.append({"test": name, "passed": False, "details": str(e)})
                    print(f"  [ERROR] {name}: {e}")
            await browser.close()
        passed = sum(1 for r in self.results if r["passed"])
        print(f"\nResults: {passed}/{len(self.results)} tests passed")
        return self.results
    async def _test_webdriver(self, page):
        result = await page.evaluate("navigator.webdriver")
        return result is not True, f"navigator.webdriver = {result}"

    async def _test_navigator(self, page):
        props = await page.evaluate("""() => ({
            platform: navigator.platform,
            hardwareConcurrency: navigator.hardwareConcurrency,
            deviceMemory: navigator.deviceMemory,
            maxTouchPoints: navigator.maxTouchPoints,
        })""")
        has_all = all(v is not None for v in props.values())
        return has_all, json.dumps(props)

    async def _test_chrome_runtime(self, page):
        has_chrome = await page.evaluate("typeof window.chrome !== 'undefined'")
        has_runtime = await page.evaluate("typeof window.chrome?.runtime !== 'undefined'")
        return has_chrome, f"chrome={has_chrome}, runtime={has_runtime}"
    async def _test_permissions(self, page):
        # Headless Chrome has historically reported Notification.permission
        # as 'denied' while permissions.query() returned 'prompt'; that
        # mismatch is a classic headless signal.
        result = await page.evaluate("""async () => {
            try {
                const perm = await navigator.permissions.query({name: 'notifications'});
                return {query: perm.state, notification: Notification.permission};
            } catch(e) { return {error: e.message}; }
        }""")
        inconsistent = (
            result.get("query") == "prompt"
            and result.get("notification") == "denied"
        )
        return not inconsistent, json.dumps(result)
    async def _test_webgl(self, page):
        vendor = await page.evaluate("""() => {
            const canvas = document.createElement('canvas');
            const gl = canvas.getContext('webgl');
            if (!gl) return 'no webgl';
            const ext = gl.getExtension('WEBGL_debug_renderer_info');
            return ext ? gl.getParameter(ext.UNMASKED_VENDOR_WEBGL) : 'no extension';
        }""")
        # 'Brian Paul' is the Mesa software-renderer vendor string, a
        # common headless giveaway.
        return vendor != "no webgl" and vendor != "Brian Paul", f"vendor={vendor}"
    async def _test_canvas(self, page):
        fp = await page.evaluate("""() => {
            const c = document.createElement('canvas');
            const ctx = c.getContext('2d');
            ctx.fillText('test', 10, 10);
            return c.toDataURL().length;
        }""")
        return fp > 100, f"canvas_data_length={fp}"

    async def _test_audio_context(self, page):
        result = await page.evaluate("""() => {
            try {
                const ctx = new (window.AudioContext || window.webkitAudioContext)();
                return ctx.sampleRate > 0;
            } catch(e) { return false; }
        }""")
        return result, f"audio_context={result}"

    async def _test_timezone(self, page):
        tz = await page.evaluate("Intl.DateTimeFormat().resolvedOptions().timeZone")
        return bool(tz), f"timezone={tz}"

    async def _test_language(self, page):
        lang = await page.evaluate("navigator.language")
        return bool(lang), f"language={lang}"

    async def _test_screen(self, page):
        screen = await page.evaluate("({w: screen.width, h: screen.height, d: window.devicePixelRatio})")
        valid = screen["w"] > 0 and screen["h"] > 0
        return valid, f"{screen['w']}x{screen['h']} @{screen['d']}x"

    async def _test_plugins(self, page):
        count = await page.evaluate("navigator.plugins.length")
        return count > 0, f"plugins={count}"

    async def _test_webrtc(self, page):
        # Informational only (always passes): records whether WebRTC is
        # exposed so you can decide whether to disable it or mask the IP.
        has_rtc = await page.evaluate("typeof RTCPeerConnection !== 'undefined'")
        return True, f"WebRTC available={has_rtc}"
# Usage
suite = AntiDetectionTestSuite()
asyncio.run(suite.run_all())
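To audit a remote or anti-detect browser instead of a locally launched Chromium, pass its CDP WebSocket endpoint; the URL below is a placeholder for your own endpoint:

suite = AntiDetectionTestSuite()
asyncio.run(suite.run_all(browser_ws="ws://127.0.0.1:9222/devtools/browser/<id>"))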
FAQ
Which tests are most important?
The navigator.webdriver check is the most commonly used detection method. Canvas and WebGL fingerprint consistency are also critical. Focus on these three before fine-tuning other parameters.
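If you need a fast signal before a full run, a minimal sketch like this (assuming the same local headless Chromium as above) covers just those three checks:

import asyncio
from playwright.async_api import async_playwright

async def smoke_test():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        webdriver = await page.evaluate("navigator.webdriver")
        vendor = await page.evaluate("""() => {
            const gl = document.createElement('canvas').getContext('webgl');
            if (!gl) return 'no webgl';
            const ext = gl.getExtension('WEBGL_debug_renderer_info');
            return ext ? gl.getParameter(ext.UNMASKED_VENDOR_WEBGL) : 'no extension';
        }""")
        canvas_len = await page.evaluate("""() => {
            const c = document.createElement('canvas');
            c.getContext('2d').fillText('smoke', 10, 10);
            return c.toDataURL().length;
        }""")
        await browser.close()
        # webdriver should be falsy; vendor should look like real hardware.
        print(f"webdriver={webdriver} vendor={vendor} canvas_len={canvas_len}")

asyncio.run(smoke_test())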
Should I test against real anti-bot services?
Yes, but carefully. Test against detection check pages (CreepJS, BrowserLeaks, PixelScan) before testing against your actual targets. This avoids burning proxy IPs on detection tests.
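A sketch for automating that first pass: load each check page and save a full-page screenshot for manual review. The URL list here is illustrative; confirm the pages before relying on them.

import asyncio
from playwright.async_api import async_playwright

# Example check pages; swap in whichever services you trust.
CHECK_PAGES = {
    "browserleaks_canvas": "https://browserleaks.com/canvas",
    "pixelscan": "https://pixelscan.net",
}

async def capture_check_pages():
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        for name, url in CHECK_PAGES.items():
            await page.goto(url, wait_until="networkidle")
            await page.screenshot(path=f"{name}.png", full_page=True)
        await browser.close()

asyncio.run(capture_check_pages())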
How often should I run anti-detection tests?
Run tests after every browser or anti-detect tool update, when adding new targets, and weekly as a baseline check. Anti-bot systems update constantly, and what passed last month may fail today.
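To make the weekly baseline automatic, a small wrapper (reusing the AntiDetectionTestSuite class above) can exit non-zero on any failure so cron or CI alerts you:

import asyncio
import sys

async def baseline():
    suite = AntiDetectionTestSuite()  # defined in the Test Suite section
    results = await suite.run_all()
    failed = [r["test"] for r in results if not r["passed"]]
    if failed:
        print(f"Regressions: {', '.join(failed)}")
        sys.exit(1)  # non-zero exit lets the scheduler raise an alert

asyncio.run(baseline())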
Implementation Best Practices
Error Handling and Retry Logic
Production scraping tools must handle failures gracefully. Implement exponential backoff with jitter:
import random
import time

def retry_with_backoff(func, max_retries=3, base_delay=1):
    for attempt in range(max_retries):
        try:
            return func()
        except Exception as e:
            if attempt == max_retries - 1:
                raise
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            print(f"Attempt {attempt + 1} failed: {e}. Retrying in {delay:.1f}s")
            time.sleep(delay)
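Since the scraping code in this guide is async, an awaitable variant with the same backoff-plus-jitter policy is often more convenient. A sketch, where coro_fn is any zero-argument callable returning a coroutine:

import asyncio
import random

async def async_retry_with_backoff(coro_fn, max_retries=3, base_delay=1):
    # coro_fn example: lambda: fetch_page(url)
    for attempt in range(max_retries):
        try:
            return await coro_fn()
        except Exception as e:
            if attempt == max_retries - 1:
                raise
            delay = base_delay * (2 ** attempt) + random.uniform(0, 1)
            print(f"Attempt {attempt + 1} failed: {e}. Retrying in {delay:.1f}s")
            await asyncio.sleep(delay)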
Logging Configuration
Set up structured logging for debugging and monitoring:
import logging
import json
from datetime import datetime, timezone

class JSONFormatter(logging.Formatter):
    def format(self, record):
        log_entry = {
            "timestamp": datetime.now(timezone.utc).isoformat(),
            "level": record.levelname,
            "message": record.getMessage(),
            "module": record.module,
            "function": record.funcName,
        }
        if record.exc_info:
            log_entry["exception"] = self.formatException(record.exc_info)
        return json.dumps(log_entry)

# Setup
handler = logging.StreamHandler()
handler.setFormatter(JSONFormatter())
logger = logging.getLogger("scraper")
logger.addHandler(handler)
logger.setLevel(logging.INFO)
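With the handler attached, each call emits one JSON object per line, and logger.exception exercises the exc_info branch; continuing from the setup above:

logger.info("scrape started")
try:
    raise ValueError("example failure")
except ValueError:
    logger.exception("scrape failed")  # JSON line includes the traceback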
Configuration Management
Use environment variables and config files for flexibility:
import os
from dataclasses import dataclass

@dataclass
class ScraperConfig:
    proxy_url: str = os.getenv("PROXY_URL", "")
    concurrent_workers: int = int(os.getenv("CONCURRENT_WORKERS", "10"))
    request_timeout: int = int(os.getenv("REQUEST_TIMEOUT", "15"))
    max_retries: int = int(os.getenv("MAX_RETRIES", "3"))
    rate_limit_per_second: float = float(os.getenv("RATE_LIMIT", "5"))
    output_format: str = os.getenv("OUTPUT_FORMAT", "json")
    database_url: str = os.getenv("DATABASE_URL", "sqlite:///results.db")
    log_level: str = os.getenv("LOG_LEVEL", "INFO")

    @classmethod
    def from_yaml(cls, filepath: str):
        import yaml
        with open(filepath) as f:
            config = yaml.safe_load(f)
        return cls(**{k: v for k, v in config.items() if hasattr(cls, k)})
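A sketch of a typical loading order: prefer a config file when present and fall back to the environment-variable defaults otherwise. The file name and values here are hypothetical; the YAML keys mirror the dataclass fields.

import os

# Hypothetical config.yaml:
#   concurrent_workers: 20
#   rate_limit_per_second: 2.5
if os.path.exists("config.yaml"):
    config = ScraperConfig.from_yaml("config.yaml")
else:
    config = ScraperConfig()  # env vars / defaults
print(config.concurrent_workers, config.rate_limit_per_second)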
Rate Limiting
Implement token bucket rate limiting to respect target sites:
import asyncio
import time

class RateLimiter:
    def __init__(self, rate: float, burst: int = 1):
        self.rate = rate  # requests per second
        self.burst = burst
        self.tokens = burst
        self.last_refill = time.time()
        self._lock = asyncio.Lock()

    async def acquire(self):
        async with self._lock:
            now = time.time()
            elapsed = now - self.last_refill
            self.tokens = min(self.burst, self.tokens + elapsed * self.rate)
            self.last_refill = now
            if self.tokens >= 1:
                self.tokens -= 1
                return
            # Not enough tokens: wait for one to accrue, then consume it.
            wait_time = (1 - self.tokens) / self.rate
            await asyncio.sleep(wait_time)
            self.last_refill = time.time()  # avoid double-crediting the sleep
            self.tokens = 0
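A sketch of the limiter shared across concurrent workers; fetch_one is a hypothetical stand-in for your actual request coroutine:

import asyncio

async def fetch_one(limiter: RateLimiter, url: str):
    await limiter.acquire()   # waits until a token is available
    print(f"fetching {url}")  # stand-in for the real request

async def main():
    limiter = RateLimiter(rate=5, burst=2)  # ~5 requests/second overall
    urls = [f"https://example.com/page/{i}" for i in range(20)]
    await asyncio.gather(*(fetch_one(limiter, u) for u in urls))

asyncio.run(main())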
Data Validation
Validate scraped data before storage:
from typing import Optional
import re

class DataValidator:
    @staticmethod
    def validate_url(url: str) -> bool:
        pattern = re.compile(
            r'^https?://'
            r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|'
            r'localhost|'
            r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'
            r'(?::\d+)?'
            r'(?:/?|[/?]\S+)$', re.IGNORECASE)
        return bool(pattern.match(url))

    @staticmethod
    def validate_price(price: Optional[float]) -> bool:
        if price is None:
            return True
        return 0 < price < 1_000_000

    @staticmethod
    def validate_text(text: str, min_length: int = 1, max_length: int = 10000) -> bool:
        return min_length <= len(text.strip()) <= max_length

    def validate_record(self, record: dict) -> tuple:
        errors = []
        if "url" in record and not self.validate_url(record["url"]):
            errors.append("invalid URL")
        if "price" in record and not self.validate_price(record.get("price")):
            errors.append("invalid price")
        if "title" in record and not self.validate_text(record.get("title", ""), 1, 500):
            errors.append("invalid title length")
        return len(errors) == 0, errors
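In a pipeline, the validator typically gates storage; a minimal sketch with an illustrative record:

validator = DataValidator()
record = {"url": "https://example.com/item/42", "title": "Sample Item", "price": 19.99}
ok, errors = validator.validate_record(record)
if ok:
    print("store record")  # hand off to your storage layer
else:
    print(f"dropping record: {errors}")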
Deployment
Running as a Service
# Using systemd (tee is needed because "sudo cat >" redirects as the
# unprivileged user and fails)
sudo tee /etc/systemd/system/scraper.service > /dev/null << EOF
[Unit]
Description=Web Scraping Service
After=network.target

[Service]
Type=simple
User=scraper
WorkingDirectory=/opt/scraper
ExecStart=/opt/scraper/venv/bin/python main.py
Restart=always
RestartSec=10
Environment=PROXY_URL=http://user:pass@proxy:8080
Environment=LOG_LEVEL=INFO

[Install]
WantedBy=multi-user.target
EOF

sudo systemctl enable scraper
sudo systemctl start scraper
Docker Deployment
FROM python:3.12-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
COPY . .
HEALTHCHECK --interval=30s --timeout=10s CMD python -c "import requests; requests.get('http://localhost:8000/health', timeout=5).raise_for_status()"
CMD ["python", "main.py"]
Testing
Write tests for your scraping tools:
# Run with pytest. DataValidator is defined in the Data Validation section.
import requests

class TestProxyIntegration:
    def test_proxy_connectivity(self):
        proxy = {"http": "http://user:pass@proxy:8080", "https": "http://user:pass@proxy:8080"}
        response = requests.get("https://httpbin.org/ip", proxies=proxy, timeout=10)
        assert response.status_code == 200
        assert "origin" in response.json()

    def test_proxy_rotation(self):
        # Both schemes must be mapped, or the https request bypasses the proxy.
        proxy = {
            "http": "http://user:pass@rotating-proxy:8080",
            "https": "http://user:pass@rotating-proxy:8080",
        }
        ips = set()
        for _ in range(5):
            response = requests.get("https://httpbin.org/ip", proxies=proxy, timeout=10)
            ips.add(response.json()["origin"])
        assert len(ips) > 1, "Proxy should rotate IPs"

    def test_data_validation(self):
        validator = DataValidator()
        valid, errors = validator.validate_record({
            "url": "https://example.com",
            "title": "Test Product",
            "price": 29.99,
        })
        assert valid
        assert len(errors) == 0

For proxy infrastructure guidance, see our proxy pool management guide and web scraping proxy overview.
Related Reading
- Build a News Crawler in Python: Step-by-Step Tutorial
- Build a Proxy Rotator in Python: Complete Tutorial
- AJAX Request Interception: Scraping API Calls Directly
- Azure Functions for Serverless Web Scraping: the Complete Guide
- How to Configure Proxies on iPhone and Android
- How to Use Proxies in Node.js (Axios, Fetch, Puppeteer)