Handling Pagination in Web Scraping

Most websites split large datasets across multiple pages. Product listings, search results, blog archives, and API responses all use pagination. Missing even one page means incomplete data. This guide covers every pagination pattern you’ll encounter and how to handle each one programmatically.

Types of Pagination

1. URL-Based Pagination (Page Numbers)

The simplest pattern — the page number appears directly in the URL, either as a query parameter or as a path segment:

https://example.com/products?page=1

https://example.com/products?page=2

https://example.com/products/page/3

import requests

from bs4 import BeautifulSoup

def scrape_numbered_pages(base_url, max_pages=100, proxy=None):
    """Scrape every numbered page under base_url (?page=1, ?page=2, ...).

    Args:
        base_url: Listing URL without the page query parameter.
        max_pages: Hard upper bound on pages fetched.
        proxy: Optional proxy URL, applied to both http and https traffic.

    Returns:
        List of dicts with "title" and "price" for every product found.
    """
    all_items = []
    proxies = {"http": proxy, "https": proxy} if proxy else None
    for page in range(1, max_pages + 1):
        url = f"{base_url}?page={page}"
        # timeout= prevents one dead server from hanging the whole crawl
        response = requests.get(
            url,
            proxies=proxies,
            headers={"User-Agent": "Mozilla/5.0 Chrome/120.0.0.0"},
            timeout=30,
        )
        if response.status_code != 200:
            break
        soup = BeautifulSoup(response.text, "lxml")
        items = soup.select(".product-item")
        if not items:
            break  # No more items, stop
        for item in items:
            title = item.select_one(".title")
            price = item.select_one(".price")
            # Skip malformed cards instead of crashing on `.text` of None
            if title is None or price is None:
                continue
            all_items.append({
                "title": title.text.strip(),
                "price": price.text.strip(),
            })
        print(f"Page {page}: {len(items)} items")
    return all_items

2. Offset-Based Pagination

Common in APIs — specify starting position and count:

/api/products?offset=0&limit=20

/api/products?offset=20&limit=20

/api/products?offset=40&limit=20

def scrape_offset_pagination(api_url, limit=20, proxy=None):
    """Page through an offset/limit API until it runs out of results.

    Args:
        api_url: Endpoint accepting `offset` and `limit` query params.
        limit: Page size requested per call.
        proxy: Optional proxy URL for both http and https.

    Returns:
        All items accumulated from the "results" arrays.
    """
    all_data = []
    offset = 0
    proxies = {"http": proxy, "https": proxy} if proxy else None
    while True:
        response = requests.get(
            api_url,
            params={"offset": offset, "limit": limit},
            proxies=proxies,
            timeout=30,  # never hang on an unresponsive API
        )
        # Fail loudly on 4xx/5xx instead of trying to JSON-parse an error page
        response.raise_for_status()
        data = response.json()
        items = data.get("results", [])
        if not items:
            break
        all_data.extend(items)
        offset += limit
        # Check if we've reached the total; missing "total" means "keep going
        # until an empty page" (inf never triggers the early stop)
        total = data.get("total", float("inf"))
        if offset >= total:
            break
        print(f"Fetched {len(all_data)}/{total} items")
    return all_data

3. Cursor-Based Pagination

Used by modern APIs (Twitter, Facebook, Shopify). Each response includes a cursor pointing to the next page:

{

"data": [...],

"next_cursor": "eyJsYXN0X2lkIjogMTIzNH0="

}

def scrape_cursor_pagination(api_url, proxy=None):
    """Follow `next_cursor` tokens until the API stops returning one.

    Args:
        api_url: Endpoint that returns {"data": [...], "next_cursor": "..."}.
        proxy: Optional proxy URL for both http and https.

    Returns:
        All items accumulated from the "data" arrays.
    """
    all_data = []
    cursor = None
    seen_cursors = set()  # guard: some APIs echo the final cursor forever
    proxies = {"http": proxy, "https": proxy} if proxy else None
    while True:
        params = {"cursor": cursor} if cursor else {}
        response = requests.get(api_url, params=params, proxies=proxies, timeout=30)
        data = response.json()
        items = data.get("data", [])
        all_data.extend(items)
        cursor = data.get("next_cursor")
        # Stop on: no cursor, an empty page, or a cursor we've already used
        # (the last case would otherwise loop forever)
        if not cursor or not items or cursor in seen_cursors:
            break
        seen_cursors.add(cursor)
        print(f"Fetched {len(all_data)} items, cursor: {cursor[:20]}...")
    return all_data

4. “Load More” Button Pagination

Content loads when clicking a button, usually via AJAX:

from playwright.sync_api import sync_playwright

def scrape_load_more(url, proxy=None):
    """Click a "Load More" button until it disappears, then scrape items.

    Args:
        url: Page URL to open in headless Chromium.
        proxy: Optional proxy server URL passed to the browser.

    Returns:
        List of dicts with "title" and "price" for every loaded item.
    """
    with sync_playwright() as p:
        browser_args = {}
        if proxy:
            browser_args["proxy"] = {"server": proxy}
        browser = p.chromium.launch(headless=True, **browser_args)
        page = browser.new_page()
        page.goto(url, wait_until="networkidle")
        while True:
            # Try clicking "Load More". `.first` avoids a strict-mode
            # violation when the selector list matches more than one node.
            load_more = page.locator("button.load-more, a.load-more").first
            if load_more.count() == 0 or not load_more.is_visible():
                break
            load_more.click()
            page.wait_for_timeout(2000)  # Wait for content to load
        # Extract all items
        data = []
        for item in page.query_selector_all(".item"):
            title = item.query_selector(".title")
            price = item.query_selector(".price")
            # Skip incomplete cards rather than crash on None.inner_text()
            if title is None or price is None:
                continue
            data.append({
                "title": title.inner_text(),
                "price": price.inner_text(),
            })
        browser.close()
        return data

5. Infinite Scroll Pagination

Content loads as you scroll down. See our dedicated infinite scroll guide for advanced techniques.

def scrape_infinite_scroll(url, max_scrolls=50, proxy=None):
    """Scroll to the bottom repeatedly until the page height stops growing.

    Args:
        url: Page URL to open in headless Chromium.
        max_scrolls: Safety cap on scroll iterations.
        proxy: Optional proxy server URL passed to the browser.

    Returns:
        The final rendered HTML of the fully-loaded page.
    """
    with sync_playwright() as p:
        # BUG FIX: the proxy parameter was accepted but never wired into
        # the browser launch in the original version.
        launch_args = {"headless": True}
        if proxy:
            launch_args["proxy"] = {"server": proxy}
        browser = p.chromium.launch(**launch_args)
        page = browser.new_page()
        page.goto(url, wait_until="networkidle")
        prev_height = 0
        scroll_count = 0
        while scroll_count < max_scrolls:
            # Scroll to bottom
            page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
            page.wait_for_timeout(2000)
            # Check if new content loaded
            new_height = page.evaluate("document.body.scrollHeight")
            if new_height == prev_height:
                break  # No new content
            prev_height = new_height
            scroll_count += 1
        # Extract all loaded content
        content = page.content()
        browser.close()
        return content

6. Next Page Link Pagination

Follow “Next” links until there are no more:

def scrape_next_link(start_url, proxy=None):
    """Follow "Next" links (a.next, a[rel=next], li.next a) until none remain.

    Args:
        start_url: First page to fetch.
        proxy: Optional proxy URL for both http and https.

    Returns:
        List of stripped text contents of every ".item" across all pages.
    """
    from urllib.parse import urljoin  # hoisted out of the loop

    all_items = []
    url = start_url
    proxies = {"http": proxy, "https": proxy} if proxy else None
    while url:
        response = requests.get(url, proxies=proxies, headers={
            "User-Agent": "Mozilla/5.0 Chrome/120.0.0.0"
        }, timeout=30)
        soup = BeautifulSoup(response.text, "lxml")
        # Extract items from current page
        for item in soup.select(".item"):
            all_items.append(item.text.strip())
        # Find next page link
        next_link = soup.select_one("a.next, a[rel='next'], li.next a")
        if next_link and next_link.get("href"):
            # BUG FIX: relative hrefs must be resolved against the CURRENT
            # page URL, not start_url — pagination can move to a different
            # path. urljoin is a no-op for absolute URLs, so the old
            # startswith("http") check is unnecessary.
            url = urljoin(url, next_link["href"])
        else:
            url = None
    print(f"Scraped {len(all_items)} items total")
    return all_items

Detecting Total Pages

From HTML Elements

def detect_total_pages(soup):
    """Try multiple methods to find total page count."""
    import re

    # Method 1: the largest numeric label inside the pagination widget
    numeric_labels = [
        int(link.text.strip())
        for link in soup.select(".pagination a, .pager a")
        if link.text.strip().isdigit()
    ]
    if numeric_labels:
        return max(numeric_labels)

    # Method 2: "Showing X of Y results" text
    result_text = soup.find(string=re.compile(r'of\s+\d+'))
    if result_text:
        match = re.search(r'of\s+(\d+)', result_text)
        if match:
            total_items = int(match.group(1))
            items_per_page = len(soup.select(".item"))
            if items_per_page:
                # ceiling division: pages needed to hold total_items
                return -(-total_items // items_per_page)

    # Method 3: data-total-pages attribute on any element
    pager = soup.find(attrs={"data-total-pages": True})
    return int(pager["data-total-pages"]) if pager else None

From API Responses

def get_total_from_api(response_data):
    """Extract total count from common API response formats."""
    # Format: {"total": 500, "results": [...]}
    try:
        return response_data["total"]
    except KeyError:
        pass

    # Format: {"meta": {"total_count": 500}, "data": [...]}
    meta = response_data.get("meta", {})
    for candidate in ("total_count", "totalCount", "total", "count"):
        if candidate in meta:
            return meta[candidate]

    # Format: {"pagination": {"total_pages": 25}}
    # NOTE(review): this branch yields a PAGE count while the others yield
    # item counts — callers must know which format their API uses.
    return response_data.get("pagination", {}).get("total_pages")

Parallel Pagination Scraping

Once you know the total pages, scrape them in parallel:

import asyncio

import aiohttp

async def scrape_pages_parallel(base_url, total_pages, max_concurrent=10, proxy=None):
    """Fetch all numbered pages concurrently, capped by a semaphore.

    Results are collected per page and stitched back together in page
    order, regardless of completion order.
    """
    gate = asyncio.Semaphore(max_concurrent)
    results = {}

    async def fetch_page(session, page_num):
        # At most max_concurrent coroutines are inside this block at once
        async with gate:
            page_url = f"{base_url}?page={page_num}"
            async with session.get(page_url, proxy=proxy) as response:
                html = await response.text()
                soup = BeautifulSoup(html, "lxml")
                results[page_num] = [
                    node.text.strip() for node in soup.select(".item")
                ]

    async with aiohttp.ClientSession() as session:
        await asyncio.gather(
            *(fetch_page(session, i) for i in range(1, total_pages + 1))
        )

    # Return in order
    all_items = []
    for page_num in sorted(results):
        all_items.extend(results[page_num])
    return all_items

Handling Common Edge Cases

Duplicate Items Across Pages

Some sites show overlapping items between pages:

class DeduplicatedScraper:
    """Collects scraped items while skipping duplicates seen across pages."""

    def __init__(self):
        self.seen_ids = set()  # dedup keys of items already accepted
        self.items = []        # accepted items, in arrival order

    def _item_key(self, item):
        """Derive a stable dedup key: "id", then "url", then content hash."""
        # BUG FIX: the original used `item.get("id") or item.get("url")`,
        # which treated falsy-but-valid ids (0, "") as missing.
        for field in ("id", "url"):
            if item.get(field) is not None:
                return ("field", field, item[field])
        # BUG FIX: the original hashed frozenset(item.items()), which raises
        # TypeError when any value is unhashable (list/dict). repr() makes
        # every value hashable; sorting makes the key order-independent.
        return ("content", tuple(sorted((k, repr(v)) for k, v in item.items())))

    def add_item(self, item):
        """Add item if unseen. Returns True when kept, False for a duplicate."""
        key = self._item_key(item)
        if key in self.seen_ids:
            return False
        self.seen_ids.add(key)
        self.items.append(item)
        return True

Pages That Return Empty Results Mid-Sequence

Some sites have gaps in pagination:

def scrape_with_gap_tolerance(base_url, max_empty=3):
    """Walk numbered pages, tolerating up to max_empty consecutive empty pages.

    Some sites have gaps in their pagination; a single empty page does not
    necessarily mean the listing has ended.

    Args:
        base_url: Listing URL without the page query parameter.
        max_empty: Consecutive empty pages allowed before giving up.

    Returns:
        List of matched ".item" elements across all non-empty pages.
    """
    empty_count = 0
    page = 1
    all_items = []
    while empty_count < max_empty:
        url = f"{base_url}?page={page}"
        response = requests.get(url, timeout=30)  # bound each request
        if response.status_code == 404:
            break  # hard stop: the page genuinely does not exist
        soup = BeautifulSoup(response.text, "lxml")
        items = soup.select(".item")
        if not items:
            empty_count += 1  # might be a gap — probe a few more pages
        else:
            empty_count = 0   # content found: reset the gap counter
            all_items.extend(items)
        page += 1
    return all_items

Rate Limiting Between Pages

import time

import random

def scrape_with_rate_limit(base_url, total_pages, min_delay=1, max_delay=3):
    """Fetch pages sequentially with a randomized polite delay between requests.

    Args:
        base_url: Listing URL without the page query parameter.
        total_pages: Number of pages to fetch.
        min_delay: Minimum seconds to wait between requests.
        max_delay: Maximum seconds to wait between requests.

    Returns:
        Accumulated items (placeholder — fill in response processing).
    """
    all_items = []
    for page in range(1, total_pages + 1):
        url = f"{base_url}?page={page}"
        response = requests.get(url, timeout=30)  # never hang on one page
        # ... process response ...
        if page < total_pages:
            # Random jitter looks less bot-like than a fixed interval;
            # no point sleeping after the final page.
            time.sleep(random.uniform(min_delay, max_delay))
    return all_items

Resumable Pagination

Save progress so you can resume after interruption:

import json

import os

class ResumableScraper:
    """Scrapes numbered pages with progress persisted to a JSON state file,
    so an interrupted run can resume where it left off."""

    def __init__(self, state_file="scraper_state.json"):
        self.state_file = state_file        # path of the checkpoint file
        self.state = self._load_state()     # {"last_page": int, "items_count": int}

    def _load_state(self):
        """Return previously saved progress, or a fresh zeroed state."""
        if os.path.exists(self.state_file):
            with open(self.state_file) as f:
                return json.load(f)
        return {"last_page": 0, "items_count": 0}

    def _save_state(self):
        """Persist current progress; called after every page."""
        with open(self.state_file, "w") as f:
            json.dump(self.state, f)

    def scrape(self, base_url, total_pages, fetch_page=None):
        """Scrape pages from the last checkpoint to total_pages, saving state
        after each page so a crash loses at most one page of progress.

        Args:
            base_url: Listing URL without the page query parameter.
            total_pages: Last page number to scrape (inclusive).
            fetch_page: Optional callable(url) -> list of items. Defaults to a
                no-op returning [] so the skeleton runs as-is; plug in your
                real page scraper. (BUG FIX: the original referenced an
                undefined name `items`, raising NameError on every page.)
        """
        start_page = self.state["last_page"] + 1
        print(f"Resuming from page {start_page}")
        for page in range(start_page, total_pages + 1):
            try:
                url = f"{base_url}?page={page}"
                items = fetch_page(url) if fetch_page else []
                self.state["last_page"] = page
                self.state["items_count"] += len(items)
                self._save_state()
            except Exception as e:
                print(f"Failed on page {page}: {e}")
                # Checkpoint before propagating so the next run resumes here
                self._save_state()
                raise

FAQ

How do I know which pagination type a site uses?

Inspect the page: look at URL changes when navigating pages, check Network tab for AJAX requests, and examine the HTML for pagination elements. If the URL changes, it’s URL-based. If the URL stays the same but content changes, it’s AJAX-based.

What’s the best pagination strategy for speed?

If you know the total pages upfront, parallel scraping with async is fastest. For cursor-based pagination, you must scrape sequentially since each page depends on the previous cursor.

How do I handle sites that change content between page loads?

Use cursor-based pagination when available — it provides a consistent snapshot. For offset-based pagination, items can shift as new content is added. Deduplicate on item IDs and accept that you might miss some items.

Should I scrape all pages at once or in batches?

Scrape in batches with checkpointing. This lets you resume after failures, respects rate limits, and prevents memory issues with very large datasets.

How do I deal with pagination that requires JavaScript?

Use Playwright or Selenium to handle JavaScript-based pagination. Intercept the underlying API calls with network monitoring and call them directly for better performance. See our JavaScript scraping guide.

Conclusion

Pagination is a fundamental challenge in web scraping, but each pattern has a proven solution. Identify the pagination type, implement the appropriate handler, add deduplication and error recovery, and use parallel fetching when possible. Combined with proxy rotation and rate limiting, you can reliably scrape datasets of any size across hundreds or thousands of pages.

Internal Links

Scroll to Top