AJAX Request Interception: Scraping API Calls Directly
Most modern websites load data through AJAX calls to internal APIs. Instead of parsing HTML rendered by JavaScript, you can intercept these API calls and replicate them directly — getting structured JSON data at a fraction of the bandwidth and processing cost.
This guide shows you how to discover, intercept, and replicate AJAX requests for efficient scraping.
Why Intercept AJAX Instead of Parsing HTML?
Traditional HTML Scraping:
Browser → Load page (2.5MB) → Execute JS → Render DOM → Parse HTML → Extract data
Time: 3-5 seconds | Bandwidth: 2.5MB | Complexity: High
AJAX API Scraping:
HTTP client → Call API endpoint (5-50KB) → Parse JSON → Extract data
Time: 0.1-0.5 seconds | Bandwidth: 5-50KB | Complexity: Low

| Metric | HTML Scraping | AJAX API Scraping |
|---|---|---|
| Speed | 3-5 sec/page | 0.1-0.5 sec/call |
| Bandwidth | 2-5 MB/page | 5-50 KB/call |
| Data format | Unstructured HTML | Structured JSON |
| Browser needed | Often yes | Usually no |
| Pagination | Complex | Simple (page params) |
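To make the comparison concrete, here is a minimal sketch of the API-first approach. The endpoint, parameters, and the `results` field are hypothetical placeholders; the point is that one small request replaces an entire browser session:

```python
import httpx

# Minimal API-first fetch: no browser, no JS execution, no HTML parsing.
# URL, params, and the 'results' field are hypothetical placeholders.
response = httpx.get(
    'https://api.example.com/v2/products',
    params={'page': 1, 'limit': 50},
    timeout=30,
)
products = response.json()['results']
print(f"Fetched {len(products)} products in one small request")
```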
Discovering AJAX Endpoints
Method 1: Browser DevTools
1. Open Chrome DevTools (F12)
2. Go to Network tab
3. Filter by "XHR" or "Fetch"
4. Browse the website
5. Watch API calls appear
6. Click each call to see:
- Request URL and method
- Request headers
- Request body (for POST)
- Response body (JSON data)
7. Right-click → Copy as cURL (the sketch after this list shows how to translate the command into Python)
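A copied cURL command translates almost mechanically into an HTTP client call. A minimal sketch, where the URL and headers are hypothetical stand-ins for whatever DevTools showed you:

```python
import httpx

# Hand-translated from a "Copy as cURL" export; values are placeholders.
response = httpx.get(
    'https://www.example.com/api/products?page=1',
    headers={
        'Accept': 'application/json',
        'Referer': 'https://www.example.com/products',
        'X-Requested-With': 'XMLHttpRequest',
    },
)
print(response.json())
```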
Method 2: Playwright Interception
```python
import asyncio
from playwright.async_api import async_playwright

async def discover_apis(url, proxy=None):
    """Discover all AJAX/API calls made by a page."""
    api_calls = []
    async with async_playwright() as p:
        browser_args = {}
        if proxy:
            browser_args['proxy'] = {'server': proxy}
        browser = await p.chromium.launch(**browser_args)
        page = await browser.new_page()

        # Intercept all requests
        async def on_request(request):
            if request.resource_type in ('xhr', 'fetch'):
                api_calls.append({
                    'url': request.url,
                    'method': request.method,
                    'headers': dict(request.headers),
                    'post_data': request.post_data,
                })

        page.on('request', on_request)

        # Also capture responses
        async def on_response(response):
            request = response.request
            if request.resource_type in ('xhr', 'fetch'):
                try:
                    body = await response.json()
                    # Find the matching call and attach the response
                    for call in api_calls:
                        if call['url'] == request.url:
                            call['response_status'] = response.status
                            call['response_sample'] = str(body)[:500]
                            break
                except Exception:
                    pass  # response was not JSON

        page.on('response', on_response)

        await page.goto(url, wait_until='networkidle')
        await browser.close()
    return api_calls

# Usage
apis = asyncio.run(discover_apis(
    'https://www.example.com/products',
    proxy='http://user:pass@proxy.example.com:8080'
))
for api in apis:
    print(f"\n{api['method']} {api['url']}")
    if api.get('response_sample'):
        print(f"  Response: {api['response_sample'][:200]}")
```
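It can help to persist discovered calls so you can replay them later without relaunching the browser. A small sketch building on discover_apis above (the filename is arbitrary):

```python
import asyncio
import json

# Keep only calls that returned a JSON body, then save them for replay.
apis = asyncio.run(discover_apis('https://www.example.com/products'))
json_apis = [a for a in apis if a.get('response_sample')]
with open('discovered_apis.json', 'w') as f:
    json.dump(json_apis, f, indent=2)
```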
Method 3: Request Interception and Modification
```python
async def intercept_and_modify(url, proxy=None):
    """Intercept API calls and modify them in flight."""
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            proxy={'server': proxy} if proxy else None
        )
        page = await browser.new_page()

        # Intercept and modify specific requests
        async def handle_route(route):
            req_url = route.request.url
            if '/api/products' in req_url and 'limit=20' in req_url:
                # Modify pagination to get more results per call
                modified_url = req_url.replace('limit=20', 'limit=100')
                await route.continue_(url=modified_url)
            else:
                await route.continue_()

        await page.route('**/*', handle_route)
        await page.goto(url)
        await page.wait_for_timeout(5000)

        data = await page.evaluate('() => window.__DATA__')
        await browser.close()
        return data
```
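Routing is also a convenient place to cut bandwidth while you explore: abort heavy resource types so only documents and API calls go over the wire. A short sketch using the same Playwright routing API:

```python
# Abort images, media, and fonts during discovery to save bandwidth.
async def block_heavy_resources(route):
    if route.request.resource_type in ('image', 'media', 'font'):
        await route.abort()
    else:
        await route.continue_()

# Register it the same way: await page.route('**/*', block_heavy_resources)
```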
Replicating AJAX Calls in Python
Once you discover the API endpoint, replicate it directly:
```python
import asyncio
import httpx

class AJAXScraper:
    """Replicate discovered AJAX calls without a browser."""

    def __init__(self, proxy_url=None):
        self.client = httpx.AsyncClient(
            proxy=proxy_url,
            http2=True,  # requires the h2 extra: pip install 'httpx[http2]'
            timeout=30,
        )

    async def replicate_call(self, api_info):
        """Replicate a discovered API call."""
        headers = api_info['headers'].copy()
        # Remove browser-specific headers
        for h in ['sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform']:
            headers.pop(h, None)

        if api_info['method'] == 'GET':
            response = await self.client.get(
                api_info['url'],
                headers=headers,
            )
        elif api_info['method'] == 'POST':
            response = await self.client.post(
                api_info['url'],
                headers=headers,
                content=api_info.get('post_data') or '',
            )
        else:
            raise ValueError(f"Unsupported method: {api_info['method']}")
        return response.json()

    async def scrape_paginated_api(self, base_url, params_template, max_pages=100):
        """Scrape a paginated API endpoint."""
        all_data = []
        for page in range(1, max_pages + 1):
            params = params_template.copy()
            params['page'] = page
            response = await self.client.get(base_url, params=params)
            data = response.json()
            items = data.get('results', data.get('data', data.get('items', [])))
            if not items:
                break
            all_data.extend(items)
            print(f"Page {page}: {len(items)} items (total: {len(all_data)})")
            # Stop once we reach the total the API reports, if any
            total = data.get('total', data.get('totalCount', float('inf')))
            if len(all_data) >= total:
                break
        return all_data

# Usage
async def main():
    scraper = AJAXScraper(proxy_url='http://user:pass@proxy.example.com:8080')
    # Discovered API endpoint
    products = await scraper.scrape_paginated_api(
        base_url='https://api.example.com/v2/products',
        params_template={
            'category': 'electronics',
            'sort': 'price_asc',
            'limit': 100,
        },
    )
    print(f"Total products scraped: {len(products)}")
    await scraper.client.aclose()

asyncio.run(main())
```
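Not every API paginates with page numbers; many return a cursor or a next-page URL instead. A sketch of the cursor variant, where the field names (items, next_cursor, cursor) are hypothetical:

```python
async def scrape_cursor_api(client, base_url, max_pages=100):
    """Follow the cursor the API returns until it runs out."""
    all_data, cursor = [], None
    for _ in range(max_pages):
        params = {'cursor': cursor} if cursor else {}
        data = (await client.get(base_url, params=params)).json()
        all_data.extend(data.get('items', []))  # hypothetical field name
        cursor = data.get('next_cursor')        # hypothetical field name
        if not cursor:
            break
    return all_data
```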
Handling Common Challenges
CSRF Tokens
```python
import re

async def handle_csrf(session, base_url):
    """Extract a CSRF token before making API calls."""
    # Method 1: From a meta tag in the page HTML
    page_response = await session.get(base_url)
    csrf_match = re.search(r'csrf-token.*?content="([^"]+)"', page_response.text)
    if csrf_match:
        return csrf_match.group(1)

    # Method 2: From a cookie
    csrf_cookie = session.cookies.get('csrf_token')
    if csrf_cookie:
        return csrf_cookie

    # Method 3: From a dedicated endpoint
    token_response = await session.get(f'{base_url}/api/csrf')
    return token_response.json().get('token')
```
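Once extracted, the token usually travels in a request header on state-changing calls. A sketch assuming the common X-CSRF-Token convention and a hypothetical endpoint (the exact header name varies by framework, so check DevTools):

```python
async def post_with_csrf(session, base_url, payload):
    # Header name and endpoint are illustrative; copy what DevTools shows.
    token = await handle_csrf(session, base_url)
    return await session.post(
        f'{base_url}/api/items',
        json=payload,
        headers={'X-CSRF-Token': token},
    )
```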
Authentication Headers
```python
async def authenticate_and_scrape(proxy):
    async with httpx.AsyncClient(proxy=proxy) as client:
        # Step 1: Log in to get a token
        login_response = await client.post(
            'https://api.example.com/auth/login',
            json={'email': 'user@example.com', 'password': 'pass'},
        )
        token = login_response.json()['access_token']

        # Step 2: Use the token in subsequent API calls
        headers = {'Authorization': f'Bearer {token}'}
        data_response = await client.get(
            'https://api.example.com/v2/data',
            headers=headers,
        )
        return data_response.json()
```
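Access tokens expire, so long runs benefit from a re-authenticate-and-retry wrapper. A minimal sketch, where get_token is a placeholder for whatever login routine the site needs:

```python
async def get_with_reauth(client, url, get_token):
    # Retry once on 401: a common pattern for expiring bearer tokens.
    token = await get_token()
    resp = await client.get(url, headers={'Authorization': f'Bearer {token}'})
    if resp.status_code == 401:
        token = await get_token()  # token expired: refresh and retry once
        resp = await client.get(url, headers={'Authorization': f'Bearer {token}'})
    return resp.json()
```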
Internal Links
- GraphQL API Scraping — intercept and replicate GraphQL queries
- mitmproxy Tutorial — automate API discovery with Python scripts
- Charles Proxy Guide — visual API endpoint discovery
- Bandwidth Optimization — API-first scraping saves 90%+ bandwidth
- Web Scraping Architecture — design patterns for API-based scrapers
FAQ
How do I find AJAX endpoints if the site uses obfuscated URLs?
Use browser DevTools Network tab filtered to XHR/Fetch. Even obfuscated URLs appear in network traffic. Tools like mitmproxy can capture all requests automatically. Look for JSON responses — the content type application/json is the giveaway.
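For reference, a minimal mitmproxy addon that logs JSON responses looks roughly like this; run it with mitmdump -s log_apis.py (the filename is arbitrary):

```python
from mitmproxy import http

class LogAPIs:
    """Print every response that looks like a JSON API call."""
    def response(self, flow: http.HTTPFlow):
        ctype = flow.response.headers.get('content-type', '')
        if 'application/json' in ctype:
            print(flow.request.method, flow.request.pretty_url)

addons = [LogAPIs()]
```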
Can websites detect that I am calling their API directly?
Yes. Websites can check for missing headers (Referer, Origin, Sec-Fetch-*), invalid cookies, missing CSRF tokens, or suspicious User-Agent strings. Replicate all required headers from the browser request to avoid detection.
What if the API requires a session cookie?
First visit the website’s login page or homepage to get session cookies, then include those cookies in your API calls. Use a session object (httpx.AsyncClient or requests.Session) that automatically handles cookie persistence.
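A minimal sketch of the cookie-first pattern with httpx, where the URLs are hypothetical; the client's cookie jar carries the session cookie into the API call automatically:

```python
import asyncio
import httpx

async def scrape_with_session_cookies():
    async with httpx.AsyncClient() as client:
        # First request sets the session cookies in the client's jar
        await client.get('https://www.example.com/')
        # Subsequent API calls reuse those cookies automatically
        resp = await client.get('https://www.example.com/api/account/items')
        return resp.json()

print(asyncio.run(scrape_with_session_cookies()))
```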
Is it legal to call a website’s internal API?
The legality depends on the website’s Terms of Service and applicable laws. Public APIs are generally fair game. Internal/undocumented APIs exist in a gray area. The data you collect matters more than how you collect it — public data is generally safer. Consult legal counsel for commercial use.
How do I handle APIs that require JavaScript-generated tokens?
Some APIs use tokens generated by client-side JavaScript (e.g., Akamai sensor data, Cloudflare challenge tokens). For these, you either need to reverse-engineer the token generation algorithm or use a headless browser to generate tokens and then use them in direct API calls.
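A sketch of the hybrid approach: let a real browser generate the token once, then reuse it in fast direct calls. The token location (window.__TOKEN__) is a hypothetical placeholder; real protections expose it far less conveniently:

```python
from playwright.async_api import async_playwright

async def get_js_token(url):
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto(url, wait_until='networkidle')
        # Read the token the page's JavaScript produced (placeholder location)
        token = await page.evaluate('() => window.__TOKEN__')
        await browser.close()
    return token
```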
Related Reading
- Azure Functions for Serverless Web Scraping: the Complete Guide
- Bandwidth Optimization for Proxies: Reduce Costs & Increase Speed
- Build an Anti-Detection Test Suite: Verify Browser Stealth
- Build a News Crawler in Python: Step-by-Step Tutorial
- How to Configure Proxies on iPhone and Android
- How to Use Proxies in Node.js (Axios, Fetch, Puppeteer)