Scraping Infinite Scroll Pages: Techniques
Infinite scroll pages load new content as you scroll down — no page numbers, no “Next” buttons. Social media feeds, image galleries, and modern e-commerce sites all use this pattern. Traditional HTTP scrapers see only the first batch of content. This guide covers the main techniques for capturing all of the data from infinite scroll pages.
How Infinite Scroll Works
When you scroll near the bottom, JavaScript fires an AJAX request to load more content:
- User scrolls to the bottom of the visible content
- JavaScript detects the scroll position via an IntersectionObserver or a scroll event
- An AJAX request fires to fetch the next batch (e.g., /api/items?page=2)
- The new content is injected into the DOM
- Repeat until no more content
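Each of those batches is just an HTTP response, which is why Method 2 below can skip the browser entirely. As a sketch, here is what replaying one of those requests looks like with requests; the /api/items endpoint and page parameter are hypothetical stand-ins for whatever DevTools shows on your target site:

import requests

# Hypothetical endpoint; find the real one in DevTools (Network tab, XHR/Fetch filter)
resp = requests.get(
    "https://example.com/api/items",
    params={"page": 2},
    headers={"User-Agent": "Mozilla/5.0"},
    timeout=30,
)
resp.raise_for_status()
print(resp.json())  # often a shape like {"items": [...], "hasMore": true}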
Method 1: Browser Automation (Playwright)
The most reliable approach — simulate scrolling in a real browser:
from playwright.sync_api import sync_playwright
from bs4 import BeautifulSoup
import time
def scrape_infinite_scroll(url, max_scrolls=100, scroll_delay=2, proxy=None):
with sync_playwright() as p:
launch_args = {"headless": True}
if proxy:
launch_args["proxy"] = {"server": proxy}
browser = p.chromium.launch(**launch_args)
page = browser.new_page()
# Block images for faster loading
page.route("*/.{png,jpg,jpeg,gif,svg,webp}", lambda route: route.abort())
page.goto(url, wait_until="networkidle")
prev_height = 0
scroll_count = 0
no_change_count = 0
while scroll_count < max_scrolls:
# Scroll to bottom
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
# Wait for new content
page.wait_for_timeout(int(scroll_delay * 1000))
# Check if new content loaded
new_height = page.evaluate("document.body.scrollHeight")
if new_height == prev_height:
no_change_count += 1
if no_change_count >= 3:
print(f"No new content after {no_change_count} attempts. Stopping.")
break
else:
no_change_count = 0
prev_height = new_height
scroll_count += 1
# Count items
item_count = page.evaluate(
"document.querySelectorAll('.item').length"
)
print(f"Scroll {scroll_count}: {item_count} items loaded")
# Extract all content
html = page.content()
browser.close()
# Parse with BeautifulSoup
soup = BeautifulSoup(html, "lxml")
items = soup.select(".item")
return [parse_item(item) for item in items]
def parse_item(item):
    def text_of(selector):
        node = item.select_one(selector)
        return node.text.strip() if node else None
    return {
        "title": text_of(".title"),
        "description": text_of(".desc"),
    }
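A minimal usage sketch for the function above; the feed URL and proxy address are placeholders:

# Hypothetical URL and proxy; substitute your own target and credentials
items = scrape_infinite_scroll(
    "https://example.com/feed",
    max_scrolls=50,
    scroll_delay=2,
    proxy="http://user:pass@proxy.example.com:8080",
)
print(f"Scraped {len(items)} items")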
Smarter Scrolling with Element Detection
Instead of just scrolling to the bottom, wait for specific elements:
def smart_scroll(page, item_selector, target_count=None, max_scrolls=200):
"""Scroll until target count reached or no more items load."""
    prev_count = 0
stale_count = 0
for i in range(max_scrolls):
# Get current items
current_items = page.query_selector_all(item_selector)
current_count = len(current_items)
if target_count and current_count >= target_count:
print(f"Reached target: {current_count} items")
break
        if current_count == prev_count:
stale_count += 1
if stale_count >= 5:
break
else:
stale_count = 0
            prev_count = current_count
# Scroll the last item into view
if current_items:
current_items[-1].scroll_into_view_if_needed()
page.wait_for_timeout(1500)
# Also try scrolling to absolute bottom
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
page.wait_for_timeout(1000)
return page.query_selector_all(item_selector)
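Here is a sketch of how smart_scroll plugs into a Playwright session; the URL and the .item selector are assumptions about the target page:

from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    page = browser.new_page()
    page.goto("https://example.com/gallery", wait_until="networkidle")
    # Stop once 500 items are in the DOM, or earlier if loading stalls
    handles = smart_scroll(page, ".item", target_count=500)
    print(f"Loaded {len(handles)} items")
    browser.close()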
Method 2: Intercept API Calls
The fastest approach — capture the underlying AJAX requests and call them directly:
from playwright.sync_api import sync_playwright
import requests
import json
def discover_scroll_api(url):
"""Discover the API endpoint used for infinite scroll."""
api_calls = []
def capture_request(request):
if request.resource_type in ("xhr", "fetch"):
api_calls.append({
"url": request.url,
"method": request.method,
"headers": dict(request.headers),
})
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
page = browser.new_page()
page.on("request", capture_request)
page.goto(url, wait_until="networkidle")
# Trigger scroll
for _ in range(3):
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
page.wait_for_timeout(2000)
browser.close()
# Filter for pagination-related API calls
scroll_apis = [
call for call in api_calls
if any(param in call["url"].lower()
for param in ["page", "offset", "cursor", "after", "skip"])
]
return scroll_apis
def scrape_via_api(api_url_template, total_pages, headers=None, proxy=None):
"""Once you've found the API, call it directly."""
all_data = []
proxies = {"http": proxy, "https": proxy} if proxy else None
for page in range(1, total_pages + 1):
url = api_url_template.format(page=page)
        response = requests.get(url, headers=headers, proxies=proxies, timeout=30)
if response.status_code != 200:
break
data = response.json()
items = data.get("items", data.get("results", data.get("data", [])))
if not items:
break
all_data.extend(items)
print(f"Page {page}: {len(items)} items (total: {len(all_data)})")
return all_data
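Putting the two together: run discover_scroll_api once, inspect the candidates it returns, then drive the endpoint directly. The URL template below is a hypothetical example of what that inspection might yield:

candidates = discover_scroll_api("https://example.com/products")
for call in candidates:
    print(call["method"], call["url"])  # look for the pagination parameter

# Suppose the calls look like .../api/products?page=N (an assumption):
data = scrape_via_api(
    "https://example.com/api/products?page={page}",
    total_pages=50,
    headers={"User-Agent": "Mozilla/5.0"},
)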
Method 3: Selenium with Scroll
If Selenium is already your stack, the same scroll-and-compare-height loop works there too:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
import time
def selenium_infinite_scroll(url, max_scrolls=100, proxy=None):
options = Options()
options.add_argument("--headless=new")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
if proxy:
options.add_argument(f"--proxy-server={proxy}")
driver = webdriver.Chrome(options=options)
driver.get(url)
time.sleep(3)
last_height = driver.execute_script("return document.body.scrollHeight")
scroll_count = 0
while scroll_count < max_scrolls:
# Scroll down
driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
time.sleep(2)
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
# Try one more time with a longer wait
time.sleep(3)
new_height = driver.execute_script("return document.body.scrollHeight")
if new_height == last_height:
break
last_height = new_height
scroll_count += 1
# Extract data
items = driver.find_elements(By.CSS_SELECTOR, ".item")
data = [item.text for item in items]
driver.quit()
return data
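The fixed time.sleep(2) wastes time on fast pages and under-waits on slow ones. One alternative, sketched here under the same .item selector assumption, is an explicit wait on the item count:

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

def wait_for_more_items(driver, prev_count, timeout=10):
    """Block until more .item elements appear, or give up after the timeout."""
    try:
        WebDriverWait(driver, timeout).until(
            lambda d: len(d.find_elements(By.CSS_SELECTOR, ".item")) > prev_count
        )
        return True
    except TimeoutException:
        return False

Call it after each scroll in place of the sleep; a False return doubles as an end-of-content signal.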
Handling Different Scroll Triggers
IntersectionObserver-Based
Some sites use IntersectionObserver to detect when a sentinel element enters the viewport:
def trigger_intersection_observer(page):
"""Force trigger IntersectionObserver by scrolling sentinel into view."""
sentinel_selectors = [
".load-more-sentinel",
"#infinite-scroll-trigger",
"[data-infinite-scroll]",
".scroll-sentinel",
]
for selector in sentinel_selectors:
        sentinel = page.locator(selector).first
if sentinel.count() > 0:
sentinel.scroll_into_view_if_needed()
page.wait_for_timeout(2000)
return True
# Fallback: scroll to bottom
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
return False
Button-Triggered Lazy Load
Some sites show “Show More” buttons after scrolling:
def handle_scroll_with_button(page, max_iterations=100):
for i in range(max_iterations):
# Scroll to bottom
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
page.wait_for_timeout(1000)
# Check for "Load More" button
        button = page.locator("button:has-text('Load More'), button:has-text('Show More')").first
if button.count() > 0 and button.is_visible():
button.click()
page.wait_for_timeout(2000)
else:
# Check if we've reached the end
end_indicator = page.locator(".no-more-results, .end-of-list")
if end_indicator.count() > 0:
break
Memory Management for Long Scrolls
Scrolling hundreds of times accumulates DOM elements and memory. Here’s how to manage it:
Extract and Clear As You Go
def scrape_with_memory_management(page, item_selector, batch_size=50):
"""Extract items in batches to prevent memory overflow."""
all_data = []
processed_count = 0
while True:
# Scroll to load more
page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
page.wait_for_timeout(2000)
# Count total items
total_items = page.evaluate(
f"document.querySelectorAll('{item_selector}').length"
)
if total_items <= processed_count:
break
# Extract new items
new_items = page.evaluate(f"""
() => {{
const items = document.querySelectorAll('{item_selector}');
const newItems = [];
for (let i = {processed_count}; i < items.length; i++) {{
newItems.push({{
text: items[i].textContent.trim(),
html: items[i].innerHTML
}});
}}
return newItems;
}}
""")
all_data.extend(new_items)
processed_count = total_items
# Remove processed items from DOM to free memory
if processed_count > batch_size * 2:
page.evaluate(f"""
() => {{
const items = document.querySelectorAll('{item_selector}');
for (let i = 0; i < {batch_size}; i++) {{
if (items[i]) items[i].remove();
}}
}}
""")
processed_count -= batch_size
print(f"Collected {len(all_data)} items")
return all_data
Detecting End of Content
def is_end_of_content(page):
"""Check multiple signals that infinite scroll has ended."""
# Check for "end of results" messages
end_indicators = [
".no-more-results",
".end-of-list",
"#no-results",
"[data-end-of-scroll]",
":text('No more results')",
":text('End of list')",
":text('That\\'s all')",
]
for selector in end_indicators:
try:
if page.locator(selector).count() > 0:
return True
except Exception:
continue
# Check if loading spinner disappeared without new content
    spinner = page.locator(".loading, .spinner, [data-loading]").first
if spinner.count() > 0 and not spinner.is_visible():
return True
return False
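In practice, call it inside the scroll loop alongside the height check; a sketch, with page and max_scrolls as in Method 1:

scroll_count = 0
while scroll_count < max_scrolls:
    page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    page.wait_for_timeout(2000)
    if is_end_of_content(page):
        print("End-of-content signal detected. Stopping.")
        break
    scroll_count += 1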
Async Infinite Scroll Scraper
For high-performance scraping of multiple infinite scroll pages:
import asyncio
from playwright.async_api import async_playwright
async def scrape_multiple_scroll_pages(urls, max_scrolls=50, max_concurrent=5):
semaphore = asyncio.Semaphore(max_concurrent)
async with async_playwright() as p:
browser = await p.chromium.launch(headless=True)
async def scrape_single(url):
async with semaphore:
context = await browser.new_context()
page = await context.new_page()
                await page.route("**/*.{png,jpg,jpeg,gif,svg}", lambda r: r.abort())
await page.goto(url, wait_until="domcontentloaded")
prev_height = 0
for _ in range(max_scrolls):
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await page.wait_for_timeout(1500)
new_height = await page.evaluate("document.body.scrollHeight")
if new_height == prev_height:
break
prev_height = new_height
content = await page.content()
await context.close()
return {"url": url, "html": content}
results = await asyncio.gather(*[scrape_single(url) for url in urls])
await browser.close()
return results
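Driving it from a script looks like this; the category URLs are placeholders:

urls = [f"https://example.com/category/{i}" for i in range(20)]  # hypothetical URLs
results = asyncio.run(scrape_multiple_scroll_pages(urls, max_scrolls=30))
for result in results:
    print(result["url"], len(result["html"]))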
FAQ
How many scrolls are too many?
Most infinite scroll pages have 500-5000 items. If you’re scrolling more than 200 times without reaching the end, the content is likely dynamically generated (like a social media feed). Set reasonable limits and use the API approach instead.
Why does my scraper get stuck on infinite scroll?
Common causes: the page loads via a trigger your script doesn't fire (an IntersectionObserver sentinel, custom events), the loading takes longer than your wait time, or the content is behind authentication. Increase wait times and check for alternative triggers.
Can I scrape infinite scroll without a browser?
Yes, if you can find the underlying API endpoint. Use browser DevTools Network tab to discover the AJAX calls, then replicate them with requests. This is 10-50x faster than browser-based scrolling.
How do I handle infinite scroll that changes URL hash?
Some sites update the URL hash (#page=2) as you scroll. Track these changes and use them for resumability:
current_hash = page.evaluate("window.location.hash")
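A sketch of hash-based checkpointing; the #page= format is an assumption about the target site:

def scroll_and_track_hash(page, max_scrolls=50):
    """Scroll while recording URL-hash checkpoints for later resumption."""
    seen_hashes = []
    for _ in range(max_scrolls):
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        page.wait_for_timeout(2000)
        current_hash = page.evaluate("window.location.hash")
        if current_hash and current_hash not in seen_hashes:
            seen_hashes.append(current_hash)
    return seen_hashes

# On a later run, resume from the deepest checkpoint, e.g. page.goto(url + "#page=37")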
What about virtual/windowed scrolling?
Libraries like react-virtualized only render visible items. The DOM doesn’t accumulate all items. For these, you must use the API approach since the items literally don’t exist in the DOM.
Conclusion
Infinite scroll scraping requires browser automation or API interception — there’s no shortcut around the JavaScript rendering. Start by checking for API endpoints (fastest, most reliable), fall back to Playwright scrolling for complex pages, and always implement memory management for large datasets. Pair with residential proxies to avoid detection during extended scrolling sessions.