Handling Pagination in Web Scraping
Most websites split large datasets across multiple pages. Product listings, search results, blog archives, and API responses all use pagination. Missing even one page means incomplete data. This guide covers the most common pagination patterns and how to handle each one programmatically.
Types of Pagination
1. URL-Based Pagination (Page Numbers)
The simplest pattern — page number in the URL:
https://example.com/products?page=1
https://example.com/products?page=2
https://example.com/products/page/3
import requests
from bs4 import BeautifulSoup

def scrape_numbered_pages(base_url, max_pages=100, proxy=None):
    all_items = []
    proxies = {"http": proxy, "https": proxy} if proxy else None
    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"
        response = requests.get(url, proxies=proxies, headers={
            "User-Agent": "Mozilla/5.0 Chrome/120.0.0.0"
        })
        if response.status_code != 200:
            break
        soup = BeautifulSoup(response.text, "lxml")
        items = soup.select(".product-item")
        if not items:
            break  # No more items, stop
        for item in items:
            all_items.append({
                "title": item.select_one(".title").text.strip(),
                "price": item.select_one(".price").text.strip(),
            })
        print(f"Page {page}: {len(items)} items")
    return all_items
2. Offset-Based Pagination
Common in APIs — specify starting position and count:
/api/products?offset=0&limit=20
/api/products?offset=20&limit=20
/api/products?offset=40&limit=20
def scrape_offset_pagination(api_url, limit=20, proxy=None):
    all_data = []
    offset = 0
    proxies = {"http": proxy, "https": proxy} if proxy else None
    while True:
        response = requests.get(
            api_url,
            params={"offset": offset, "limit": limit},
            proxies=proxies
        )
        data = response.json()
        items = data.get("results", [])
        if not items:
            break
        all_data.extend(items)
        offset += limit
        # Check if we've reached the total
        total = data.get("total", float("inf"))
        if offset >= total:
            break
        print(f"Fetched {len(all_data)}/{total} items")
    return all_data
3. Cursor-Based Pagination
Used by modern APIs (Twitter, Facebook, Shopify). Each response includes a cursor pointing to the next page:
{
    "data": [...],
    "next_cursor": "eyJsYXN0X2lkIjogMTIzNH0="
}
def scrape_cursor_pagination(api_url, proxy=None):
    all_data = []
    cursor = None
    proxies = {"http": proxy, "https": proxy} if proxy else None
    while True:
        params = {}
        if cursor:
            params["cursor"] = cursor
        response = requests.get(api_url, params=params, proxies=proxies)
        data = response.json()
        items = data.get("data", [])
        all_data.extend(items)
        cursor = data.get("next_cursor")
        if not cursor or not items:
            break
        print(f"Fetched {len(all_data)} items, cursor: {cursor[:20]}...")
    return all_data
4. “Load More” Button Pagination
Content loads when clicking a button, usually via AJAX:
from playwright.sync_api import sync_playwright

def scrape_load_more(url, proxy=None):
    with sync_playwright() as p:
        browser_args = {}
        if proxy:
            browser_args["proxy"] = {"server": proxy}
        browser = p.chromium.launch(headless=True, **browser_args)
        page = browser.new_page()
        page.goto(url, wait_until="networkidle")
        while True:
            # Try clicking the "Load More" button; use .first so a click on a
            # multi-element match doesn't trip Playwright's strict mode
            load_more = page.locator("button.load-more, a.load-more")
            if load_more.count() == 0 or not load_more.first.is_visible():
                break
            load_more.first.click()
            page.wait_for_timeout(2000)  # Wait for content to load
        # Extract all items
        items = page.query_selector_all(".item")
        data = []
        for item in items:
            data.append({
                "title": item.query_selector(".title").inner_text(),
                "price": item.query_selector(".price").inner_text(),
            })
        browser.close()
        return data
5. Infinite Scroll Pagination
Content loads as you scroll down. See our dedicated infinite scroll guide for advanced techniques.
def scrape_infinite_scroll(url, max_scrolls=50, proxy=None):
    with sync_playwright() as p:
        browser_args = {"proxy": {"server": proxy}} if proxy else {}
        browser = p.chromium.launch(headless=True, **browser_args)
        page = browser.new_page()
        page.goto(url, wait_until="networkidle")
        prev_height = 0
        scroll_count = 0
        while scroll_count < max_scrolls:
            # Scroll to bottom
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            page.wait_for_timeout(2000)
            # Check if new content loaded
            new_height = page.evaluate("document.body.scrollHeight")
            if new_height == prev_height:
                break  # No new content
            prev_height = new_height
            scroll_count += 1
        # Extract all loaded content
        content = page.content()
        browser.close()
        return content
6. Next Page Link Pagination
Follow “Next” links until there are no more:
from urllib.parse import urljoin

def scrape_next_link(start_url, proxy=None):
    all_items = []
    url = start_url
    proxies = {"http": proxy, "https": proxy} if proxy else None
    while url:
        response = requests.get(url, proxies=proxies, headers={
            "User-Agent": "Mozilla/5.0 Chrome/120.0.0.0"
        })
        soup = BeautifulSoup(response.text, "lxml")
        # Extract items from current page
        items = soup.select(".item")
        for item in items:
            all_items.append(item.text.strip())
        # Find next page link
        next_link = soup.select_one("a.next, a[rel='next'], li.next a")
        if next_link and next_link.get("href"):
            url = next_link["href"]
            if not url.startswith("http"):
                url = urljoin(start_url, url)
        else:
            url = None
    print(f"Scraped {len(all_items)} items total")
    return all_items
Detecting Total Pages
From HTML Elements
import re

def detect_total_pages(soup):
    """Try multiple methods to find total page count."""
    # Method 1: Last page number in pagination
    pagination = soup.select(".pagination a, .pager a")
    if pagination:
        numbers = []
        for link in pagination:
            text = link.text.strip()
            if text.isdigit():
                numbers.append(int(text))
        if numbers:
            return max(numbers)
    # Method 2: "Showing X of Y results" text
    result_text = soup.find(string=re.compile(r'of\s+\d+'))
    if result_text:
        match = re.search(r'of\s+(\d+)', result_text)
        if match:
            total_items = int(match.group(1))
            items_per_page = len(soup.select(".item"))
            if items_per_page > 0:
                return (total_items + items_per_page - 1) // items_per_page
    # Method 3: Data attribute
    pager = soup.find(attrs={"data-total-pages": True})
    if pager:
        return int(pager["data-total-pages"])
    return None
From API Responses
def get_total_from_api(response_data):
    """Extract total count from common API response formats."""
    # Format: {"total": 500, "results": [...]}
    if "total" in response_data:
        return response_data["total"]
    # Format: {"meta": {"total_count": 500}, "data": [...]}
    meta = response_data.get("meta", {})
    for key in ["total_count", "totalCount", "total", "count"]:
        if key in meta:
            return meta[key]
    # Format: {"pagination": {"total_pages": 25}}
    pagination = response_data.get("pagination", {})
    if "total_pages" in pagination:
        return pagination["total_pages"]
    return None
Parallel Pagination Scraping
Once you know the total number of pages, you can scrape them in parallel:
import asyncio
import aiohttp
from bs4 import BeautifulSoup

async def scrape_pages_parallel(base_url, total_pages, max_concurrent=10, proxy=None):
    semaphore = asyncio.Semaphore(max_concurrent)
    results = {}

    async def fetch_page(session, page_num):
        async with semaphore:
            url = f"{base_url}?page={page_num}"
            async with session.get(url, proxy=proxy) as response:
                html = await response.text()
                soup = BeautifulSoup(html, "lxml")
                items = [item.text.strip() for item in soup.select(".item")]
                results[page_num] = items

    async with aiohttp.ClientSession() as session:
        tasks = [fetch_page(session, i) for i in range(1, total_pages + 1)]
        await asyncio.gather(*tasks)
    # Return items in page order
    all_items = []
    for page_num in sorted(results.keys()):
        all_items.extend(results[page_num])
    return all_items
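Wiring it together might look like the sketch below: fetch page 1, detect the page count with the detect_total_pages helper above, then fan out. The URL and the .item selector are placeholders, not a real endpoint.
# Sketch: detect the page count from page 1, then fetch all pages concurrently
first = requests.get("https://example.com/products?page=1")
total = detect_total_pages(BeautifulSoup(first.text, "lxml")) or 1
items = asyncio.run(scrape_pages_parallel("https://example.com/products", total))
print(f"Collected {len(items)} items from {total} pages")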
Handling Common Edge Cases
Duplicate Items Across Pages
Some sites show overlapping items between pages:
class DeduplicatedScraper:
    def __init__(self):
        self.seen_ids = set()
        self.items = []

    def add_item(self, item):
        item_id = item.get("id") or item.get("url") or hash(frozenset(item.items()))
        if item_id not in self.seen_ids:
            self.seen_ids.add(item_id)
            self.items.append(item)
            return True
        return False
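As a quick usage sketch, you can feed it the output of any of the page loops above (here scrape_numbered_pages with a placeholder URL); items without an "id" or "url" key fall back to a hash of their contents.
# Sketch: deduplicate items collected by an earlier pagination scraper
dedup = DeduplicatedScraper()
for item in scrape_numbered_pages("https://example.com/products"):
    dedup.add_item(item)
print(f"{len(dedup.items)} unique items kept")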
Pages That Return Empty Results Mid-Sequence
Some sites have gaps in pagination:
def scrape_with_gap_tolerance(base_url, max_empty=3):
    empty_count = 0
    page = 1
    all_items = []
    while empty_count < max_empty:
        url = f"{base_url}?page={page}"
        response = requests.get(url)
        if response.status_code == 404:
            break
        soup = BeautifulSoup(response.text, "lxml")
        items = soup.select(".item")
        if not items:
            empty_count += 1
        else:
            empty_count = 0
            all_items.extend(items)
        page += 1
    return all_items
Rate Limiting Between Pages
import time
import random

def scrape_with_rate_limit(base_url, total_pages, min_delay=1, max_delay=3):
    all_items = []
    for page in range(1, total_pages + 1):
        url = f"{base_url}?page={page}"
        response = requests.get(url)
        # ... process response ...
        # Random delay between requests
        delay = random.uniform(min_delay, max_delay)
        time.sleep(delay)
    return all_items
Resumable Pagination
Save progress so you can resume after interruption:
import json
import os

class ResumableScraper:
    def __init__(self, state_file="scraper_state.json"):
        self.state_file = state_file
        self.state = self._load_state()

    def _load_state(self):
        if os.path.exists(self.state_file):
            with open(self.state_file) as f:
                return json.load(f)
        return {"last_page": 0, "items_count": 0}

    def _save_state(self):
        with open(self.state_file, "w") as f:
            json.dump(self.state, f)

    def scrape(self, base_url, total_pages):
        start_page = self.state["last_page"] + 1
        print(f"Resuming from page {start_page}")
        for page in range(start_page, total_pages + 1):
            try:
                url = f"{base_url}?page={page}"
                # ... scrape the page here, collecting results in `items` ...
                self.state["last_page"] = page
                self.state["items_count"] += len(items)
                self._save_state()
            except Exception as e:
                print(f"Failed on page {page}: {e}")
                self._save_state()
                raise
FAQ
How do I know which pagination type a site uses?
Inspect the page: look at URL changes when navigating pages, check Network tab for AJAX requests, and examine the HTML for pagination elements. If the URL changes, it’s URL-based. If the URL stays the same but content changes, it’s AJAX-based.
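A rough heuristic can also check the fetched HTML for the usual markers. This is only a sketch; the selectors below are common conventions, not a universal detector.
def guess_pagination_type(html):
    """Rough guess based on common pagination markers in the HTML."""
    soup = BeautifulSoup(html, "lxml")
    if soup.select_one("a[rel='next'], .pagination a, .pager a"):
        return "url-based (numbered pages or next links)"
    if soup.select_one("button.load-more, a.load-more"):
        return "load-more button (AJAX)"
    return "unknown - check the Network tab for XHR/fetch requests"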
What’s the best pagination strategy for speed?
If you know the total pages upfront, parallel scraping with async is fastest. For cursor-based pagination, you must scrape sequentially since each page depends on the previous cursor.
How do I handle sites that change content between page loads?
Use cursor-based pagination when available — it provides a consistent snapshot. For offset-based pagination, items can shift as new content is added. Deduplicate on item IDs and accept that you might miss some items.
Should I scrape all pages at once or in batches?
Scrape in batches with checkpointing. This lets you resume after failures, respects rate limits, and prevents memory issues with very large datasets.
How do I deal with pagination that requires JavaScript?
Use Playwright or Selenium to handle JavaScript-based pagination. Intercept the underlying API calls with network monitoring and call them directly for better performance. See our JavaScript scraping guide.
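A minimal sketch of that interception approach with Playwright is shown below; the "/api/" substring filter and the scroll trigger are assumptions you would adapt to the site you're scraping.
from playwright.sync_api import sync_playwright

def find_pagination_endpoints(url, url_filter="/api/"):
    """Collect URLs of XHR/fetch responses fired while the page paginates,
    so they can be replayed directly with requests."""
    captured = []

    def on_response(response):
        if url_filter in response.url:
            captured.append(response.url)

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=True)
        page = browser.new_page()
        page.on("response", on_response)
        page.goto(url, wait_until="networkidle")
        # Trigger one round of loading, e.g. by scrolling to the bottom
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        page.wait_for_timeout(2000)
        browser.close()
    return captured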
Conclusion
Pagination is a fundamental challenge in web scraping, but each pattern has a proven solution. Identify the pagination type, implement the appropriate handler, add deduplication and error recovery, and use parallel fetching when possible. Combined with proxy rotation and rate limiting, you can reliably scrape datasets of any size across hundreds or thousands of pages.