PDF Scraping: Extract Data from PDF Documents at Scale

PDFs are everywhere — financial reports, government filings, academic papers, invoices, and product catalogs. Extracting structured data from PDFs is one of the most challenging scraping tasks because PDFs are designed for visual presentation, not data exchange. This guide covers every approach from simple text extraction to AI-powered document understanding.

PDF Extraction Methods

Method         | Best For                   | Accuracy  | Speed
---------------|----------------------------|-----------|------------
PyPDF2/pypdf   | Simple text PDFs           | Medium    | Fast
pdfplumber     | Tables and structured data | High      | Medium
Camelot        | Table extraction           | Very High | Medium
Tabula         | Table extraction (Java)    | High      | Medium
Tesseract OCR  | Scanned/image PDFs         | Medium    | Slow
Adobe API      | Complex layouts            | High      | Slow (API)
LLM extraction | Unstructured data          | High      | Slow
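
Camelot, listed above, is worth a closer look for table-heavy PDFs: it reads tables into pandas DataFrames and reports an accuracy score for each one. A minimal sketch (the file path and page range are placeholders):

import camelot

# 'lattice' suits tables with ruled borders; use 'stream' for
# whitespace-aligned tables without lines
tables = camelot.read_pdf('report.pdf', pages='1-3', flavor='lattice')

for table in tables:
    print(table.parsing_report)  # per-table accuracy and whitespace metrics
    df = table.df                # the table as a pandas DataFrame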

Basic Text Extraction

import pypdf
import httpx
import io

async def extract_text_from_url(pdf_url, proxy=None):
    """Download PDF through proxy and extract text."""
    async with httpx.AsyncClient(proxy=proxy, timeout=60) as client:
        response = await client.get(pdf_url)
        response.raise_for_status()  # fail early on 4xx/5xx responses

    pdf_reader = pypdf.PdfReader(io.BytesIO(response.content))
    
    text = ""
    for page in pdf_reader.pages:
        text += page.extract_text() + "\n"
    
    return text

# Usage
text = await extract_text_from_url(
    'https://example.com/report.pdf',
    proxy='http://user:pass@proxy.example.com:8080'
)
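
If extract_text() returns empty strings, the pages are almost certainly scanned images rather than embedded text; see the OCR section below.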

Table Extraction with pdfplumber

import pdfplumber
import pandas as pd
import io

def extract_tables(pdf_bytes):
    """Extract all tables from a PDF."""
    tables = []
    
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        for i, page in enumerate(pdf.pages):
            page_tables = page.extract_tables()
            
            for j, table in enumerate(page_tables):
                if table and len(table) > 1:
                    # First row as headers
                    df = pd.DataFrame(table[1:], columns=table[0])
                    df['_page'] = i + 1
                    df['_table'] = j + 1
                    tables.append(df)
                    print(f"Page {i+1}, Table {j+1}: {len(df)} rows x {len(df.columns)} cols")
    
    return tables

# Extract specific regions
def extract_region(pdf_bytes, page_num, bbox):
    """Extract text from a specific region of a page.
    bbox = (x0, top, x1, bottom) in points."""
    with pdfplumber.open(io.BytesIO(pdf_bytes)) as pdf:
        page = pdf.pages[page_num]
        cropped = page.within_bbox(bbox)
        return cropped.extract_text()
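
For example, pulling a fixed-position field such as an invoice total might look like this (the coordinates are illustrative; pdfplumber's page.to_image() debugging view helps find real ones):

# Coordinates below are illustrative -- measure them for your layout
total_text = extract_region(pdf_bytes, page_num=0, bbox=(400, 700, 580, 740))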

OCR for Scanned PDFs

import pytesseract
from pdf2image import convert_from_bytes

def ocr_pdf(pdf_bytes, language='eng', dpi=300):
    """OCR a scanned PDF to extract text."""
    # Convert PDF pages to images
    images = convert_from_bytes(pdf_bytes, dpi=dpi)
    
    full_text = ""
    for i, image in enumerate(images):
        # Preprocess for better OCR
        image = image.convert('L')  # Grayscale
        
        # OCR with Tesseract
        text = pytesseract.image_to_string(image, lang=language)
        full_text += f"--- Page {i+1} ---\n{text}\n"
    
    return full_text

def ocr_pdf_with_layout(pdf_bytes, dpi=300):
    """OCR preserving layout information."""
    images = convert_from_bytes(pdf_bytes, dpi=dpi)
    results = []
    
    for i, image in enumerate(images):
        # Get bounding boxes for each word
        data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
        
        words = []
        for j in range(len(data['text'])):
            if data['text'][j].strip():
                words.append({
                    'text': data['text'][j],
                    'x': data['left'][j],
                    'y': data['top'][j],
                    'width': data['width'][j],
                    'height': data['height'][j],
                    'confidence': data['conf'][j],
                })
        
        results.append({'page': i + 1, 'words': words})
    
    return results
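
The word boxes can then be regrouped into reading order. A rough sketch that buckets words into lines by their top coordinate (the 10-pixel tolerance is an assumption to tune per document and DPI):

def words_to_lines(words, y_tolerance=10):
    """Group OCR word boxes into text lines by vertical position."""
    lines = {}
    for word in words:
        # Words whose tops fall in the same tolerance band share a line
        band = round(word['y'] / y_tolerance)
        lines.setdefault(band, []).append(word)
    return [
        ' '.join(w['text'] for w in sorted(line, key=lambda w: w['x']))
        for _, line in sorted(lines.items())
    ]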

Batch PDF Processing Pipeline

import asyncio
import io
import os

import httpx
import pypdf

class PDFScrapingPipeline:
    """Download and process PDFs at scale."""
    
    def __init__(self, proxy=None, output_dir='./pdf_data'):
        self.proxy = proxy
        self.output_dir = output_dir
        os.makedirs(output_dir, exist_ok=True)
    
    async def process_urls(self, pdf_urls, concurrency=5):
        """Process multiple PDF URLs concurrently."""
        semaphore = asyncio.Semaphore(concurrency)
        
        async def process_one(url):
            async with semaphore:
                try:
                    async with httpx.AsyncClient(proxy=self.proxy, timeout=120) as client:
                        response = await client.get(url)
                    
                    if response.headers.get('content-type', '').startswith('application/pdf'):
                        tables = extract_tables(response.content)
                        text = extract_text_simple(response.content)
                        
                        return {
                            'url': url,
                            'status': 'success',
                            'tables': len(tables),
                            'text_length': len(text),
                        }
                    else:
                        return {'url': url, 'status': 'not_pdf'}
                except Exception as e:
                    return {'url': url, 'status': 'error', 'error': str(e)}
        
        results = await asyncio.gather(*[process_one(url) for url in pdf_urls])
        return results

def extract_text_simple(pdf_bytes):
    reader = pypdf.PdfReader(io.BytesIO(pdf_bytes))
    return "\n".join(page.extract_text() for page in reader.pages)

FAQ

What is the best Python library for PDF extraction?

For text-based PDFs with tables, use pdfplumber — it offers the best balance of accuracy and ease of use. For scanned PDFs, use Tesseract OCR via pytesseract. For simple text extraction, pypdf is fastest.

How do I handle password-protected PDFs?

pypdf supports password-protected PDFs: reader = pypdf.PdfReader(file, password='secret'). For encrypted PDFs where you do not have the password, tools like qpdf can sometimes remove restrictions.
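
A short sketch of both approaches (the filename and password are placeholders):

import pypdf

# Pass the password at open time...
reader = pypdf.PdfReader('protected.pdf', password='secret')

# ...or check the encryption flag and decrypt explicitly
reader = pypdf.PdfReader('protected.pdf')
if reader.is_encrypted:
    reader.decrypt('secret')
print(reader.pages[0].extract_text())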

Can AI/LLMs extract data from PDFs?

Yes. Send PDF text to GPT-4, Claude, or other LLMs with extraction instructions. This works well for unstructured documents where traditional parsing fails. Libraries like LlamaIndex and LangChain have PDF loaders built in.
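
A minimal sketch using the OpenAI Python client (the model name, prompt, and truncation limit are assumptions; any capable model and provider will do):

from openai import OpenAI

client = OpenAI()  # reads OPENAI_API_KEY from the environment

def llm_extract(pdf_text, schema_hint):
    """Ask an LLM to pull structured fields out of raw PDF text."""
    response = client.chat.completions.create(
        model='gpt-4o',  # assumption: substitute your preferred model
        messages=[{
            'role': 'user',
            'content': f"Extract {schema_hint} as JSON from this document:\n\n{pdf_text[:8000]}",
        }],
    )
    return response.choices[0].message.content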

How do I scrape PDFs behind authentication?

Use a session with cookies/tokens to download PDFs. First authenticate via the website, then use the same session to download PDF URLs. Some PDFs require specific Referer headers.
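
A sketch of the session approach (the login URL and form field names are hypothetical; adapt them to the target site):

import httpx

async def download_authenticated_pdf(pdf_url):
    async with httpx.AsyncClient(timeout=60, follow_redirects=True) as client:
        # Authenticate first; the client keeps the session cookies
        await client.post('https://example.com/login',
                          data={'username': 'user', 'password': 'pass'})
        # Some servers also require a plausible Referer header
        response = await client.get(
            pdf_url, headers={'Referer': 'https://example.com/reports'})
        response.raise_for_status()
        return response.content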

What about PDF forms and fillable fields?

Use pypdf to extract form field data: reader.get_form_text_fields() returns a dictionary of field names and values. This is useful for scraping filled government forms and applications.
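
For example (the filename is a placeholder; the result may be empty if the PDF has no form):

import pypdf

reader = pypdf.PdfReader('application_form.pdf')
fields = reader.get_form_text_fields()  # {field_name: value}
for name, value in (fields or {}).items():
    print(f"{name}: {value}")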

