Image Scraping: Download & Process Images at Scale

Image scraping powers machine learning datasets, competitive intelligence (product photos), real estate listings, fashion catalogs, and content aggregation. Unlike text, images consume significant bandwidth and storage, so image scraping calls for its own optimization strategies.

Basic Image Scraper

import httpx
import asyncio
import hashlib
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from pathlib import Path

class ImageScraper:
    """Download images from web pages through proxies."""
    
    def __init__(self, proxy=None, output_dir='./images'):
        self.proxy = proxy
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self.downloaded_hashes = set()  # Deduplication
        self.stats = {'downloaded': 0, 'skipped': 0, 'errors': 0, 'bytes': 0}
    
    async def scrape_page_images(self, page_url, min_size=10000):
        """Extract and download all images from a page."""
        async with httpx.AsyncClient(proxy=self.proxy, timeout=30) as client:
            response = await client.get(page_url)
            soup = BeautifulSoup(response.text, 'html.parser')
            
            image_urls = set()
            
            # Standard img tags
            for img in soup.find_all('img'):
                # Prefer lazy-load attributes; the src often holds a placeholder
                src = img.get('data-src') or img.get('data-lazy-src') or img.get('src')
                if src and not src.startswith('data:'):
                    image_urls.add(urljoin(page_url, src))
                
                # Also check srcset for higher resolution
                srcset = img.get('srcset', '')
                for entry in srcset.split(','):
                    parts = entry.strip().split()
                    if parts:
                        image_urls.add(urljoin(page_url, parts[0]))
            
            # Background images in style attributes (CSS allows unquoted URLs)
            for tag in soup.find_all(style=True):
                urls = re.findall(r'url\(["\']?(.*?)["\']?\)', tag['style'])
                for url in urls:
                    image_urls.add(urljoin(page_url, url))
            
            # Download images
            tasks = [
                self._download_image(client, url, min_size)
                for url in image_urls
            ]
            await asyncio.gather(*tasks)
            
            return self.stats.copy()
    
    async def _download_image(self, client, url, min_size):
        """Download a single image with deduplication."""
        try:
            response = await client.get(url, timeout=30)
            
            if response.status_code != 200:
                self.stats['errors'] += 1
                return
            
            content = response.content
            
            # Skip small images (likely icons/spacers)
            if len(content) < min_size:
                self.stats['skipped'] += 1
                return
            
            # Deduplicate by content hash
            content_hash = hashlib.md5(content).hexdigest()
            if content_hash in self.downloaded_hashes:
                self.stats['skipped'] += 1
                return
            self.downloaded_hashes.add(content_hash)
            
            # Determine filename and extension
            parsed = urlparse(url)
            ext = Path(parsed.path).suffix or '.jpg'
            filename = f"{content_hash}{ext}"
            filepath = self.output_dir / filename
            
            filepath.write_bytes(content)
            self.stats['downloaded'] += 1
            self.stats['bytes'] += len(content)
            
        except Exception:
            self.stats['errors'] += 1

# Usage
async def main():
    scraper = ImageScraper(
        proxy='http://user:pass@proxy.example.com:8080',
        output_dir='./scraped_images'
    )
    
    urls = [
        'https://example.com/products/page/1',
        'https://example.com/products/page/2',
    ]
    
    for url in urls:
        stats = await scraper.scrape_page_images(url, min_size=5000)
        print(f"{url}: {stats}")

asyncio.run(main())
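
The gather call above fires every download on a page at once, which can open dozens of connections through the proxy simultaneously. A sketch of a bounded variant, written as a hypothetical drop-in method for ImageScraper, caps concurrency with a semaphore:

async def _download_bounded(self, client, urls, min_size, max_concurrency=10):
    """Download images with a cap on concurrent connections."""
    semaphore = asyncio.Semaphore(max_concurrency)

    async def bounded(url):
        async with semaphore:
            await self._download_image(client, url, min_size)

    await asyncio.gather(*(bounded(url) for url in urls))

Swapping the tasks list in scrape_page_images for await self._download_bounded(client, image_urls, min_size) keeps per-page concurrency predictable.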

Bandwidth Optimization for Images

async def download_with_size_check(client, url, max_size_mb=5):
    """Check image size before downloading."""
    # HEAD request to check size (note: some servers omit Content-Length)
    head = await client.head(url)
    content_length = int(head.headers.get('content-length', 0))
    
    if content_length > max_size_mb * 1024 * 1024:
        print(f"Skipping {url}: {content_length/1024/1024:.1f}MB > {max_size_mb}MB")
        return None
    
    # Download if within size limit
    response = await client.get(url)
    return response.content
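
A HEAD request is not always reliable: some servers omit Content-Length, and others reject HEAD outright. As a fallback, the sketch below streams the body and aborts once the running byte count exceeds the limit, so at most max_size_mb plus one chunk is ever transferred:

async def download_streaming_with_limit(client, url, max_size_mb=5):
    """Stream the download and abort early once it exceeds the size limit."""
    max_bytes = max_size_mb * 1024 * 1024
    chunks, total = [], 0
    async with client.stream('GET', url) as response:
        if response.status_code != 200:
            return None
        async for chunk in response.aiter_bytes():
            total += len(chunk)
            if total > max_bytes:
                # Leaving the context manager closes the connection mid-transfer
                print(f"Aborting {url}: exceeded {max_size_mb}MB")
                return None
            chunks.append(chunk)
    return b''.join(chunks)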

Image Processing Pipeline

from PIL import Image
import io

class ImageProcessor:
    """Process downloaded images — resize, convert, extract metadata."""
    
    @staticmethod
    def resize_image(image_bytes, max_width=1200, max_height=1200, quality=85):
        """Resize image to reduce storage."""
        img = Image.open(io.BytesIO(image_bytes))
        img.thumbnail((max_width, max_height), Image.Resampling.LANCZOS)
        
        # JPEG has no alpha channel, so flatten RGBA/palette images first
        if img.mode in ('RGBA', 'P', 'LA'):
            img = img.convert('RGB')
        
        output = io.BytesIO()
        img.save(output, format='JPEG', quality=quality, optimize=True)
        return output.getvalue()
    
    @staticmethod
    def extract_metadata(image_bytes):
        """Extract EXIF and basic metadata."""
        img = Image.open(io.BytesIO(image_bytes))
        return {
            'format': img.format,
            'size': img.size,
            'mode': img.mode,
            'exif': dict(img.getexif()) if hasattr(img, 'getexif') else {},
        }
    
    @staticmethod
    def convert_to_webp(image_bytes, quality=80):
        """Convert to WebP for smaller file size."""
        img = Image.open(io.BytesIO(image_bytes))
        output = io.BytesIO()
        img.save(output, format='WEBP', quality=quality)
        return output.getvalue()
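
As a usage sketch, the loop below runs the processor over the scraper's output directory (the ./scraped_images path from the earlier usage example), writing a WebP copy next to each JPEG or PNG:

# Convert every scraped JPEG/PNG to WebP and report the size change
from pathlib import Path

for path in Path('./scraped_images').glob('*'):
    if path.suffix.lower() in ('.jpg', '.jpeg', '.png'):
        webp_bytes = ImageProcessor.convert_to_webp(path.read_bytes())
        path.with_suffix('.webp').write_bytes(webp_bytes)
        print(f"{path.name}: {path.stat().st_size} -> {len(webp_bytes)} bytes")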

FAQ

How much bandwidth does image scraping use?

Image scraping is bandwidth-intensive. Average product images are 100-500KB each. Scraping 10,000 product pages with 5 images each could use 5-25 GB of bandwidth. Use HEAD requests to check sizes first and skip unnecessarily large images.

Can I use scraped images commercially?

This depends on the image copyright. Product photos from e-commerce sites are copyrighted. Stock photos have licenses. User-uploaded content has varying rights. For ML training data, fair use may apply in some jurisdictions. Always check the source terms and applicable copyright law.

What is the best format for storing scraped images?

WebP offers the best compression-to-quality ratio. JPEG is universal and good for photos. PNG is best for screenshots and graphics with text. For ML datasets, the original format usually works best to preserve quality.

How do I handle lazy-loaded images?

Look for data-src, data-lazy-src, or data-original attributes instead of src. If images load via JavaScript intersection observers, use a headless browser with scrolling to trigger loading, or intercept the image URLs from API calls.
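
A minimal sketch of the headless-browser approach, assuming Playwright is installed (pip install playwright, then playwright install chromium); it scrolls in viewport-sized steps so intersection observers fire, then collects the resolved image URLs:

from playwright.async_api import async_playwright

async def collect_lazy_image_urls(page_url, scroll_steps=10):
    """Scroll a page to trigger lazy loading, then collect resolved img URLs."""
    async with async_playwright() as p:
        browser = await p.chromium.launch()
        page = await browser.new_page()
        await page.goto(page_url)
        for _ in range(scroll_steps):
            await page.evaluate("window.scrollBy(0, window.innerHeight)")
            await page.wait_for_timeout(500)  # give lazy images time to load
        urls = await page.eval_on_selector_all(
            'img', "els => els.map(e => e.currentSrc || e.src).filter(Boolean)"
        )
        await browser.close()
        return urls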

How do I deduplicate scraped images?

Hash image contents (MD5 or SHA256) and skip duplicates. For near-duplicate detection (different resolutions of the same image), use perceptual hashing (pHash) which generates similar hashes for visually similar images.
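
A sketch using the third-party imagehash package (assumed installed via pip install imagehash): pHash values of visually similar images differ in only a few bits, so a small Hamming distance flags a near-duplicate even across resolutions:

from PIL import Image
import imagehash

def is_near_duplicate(path_a, path_b, max_distance=5):
    """Compare perceptual hashes; a small Hamming distance means visually similar."""
    hash_a = imagehash.phash(Image.open(path_a))
    hash_b = imagehash.phash(Image.open(path_b))
    # imagehash overloads subtraction to return the Hamming distance
    return (hash_a - hash_b) <= max_distance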

