Axios + Cheerio: Lightweight Node.js Scraping

Axios + Cheerio is the Requests + BeautifulSoup of the Node.js world. Axios handles HTTP requests with a clean promise-based API, and Cheerio provides jQuery-style HTML parsing without a browser. Together they form the fastest and lightest scraping stack in Node.js, processing pages 10-50x faster than Puppeteer or Playwright because there is no browser to launch.

This tutorial covers setup, common patterns, error handling, concurrent scraping, and proxy integration.

Why Axios + Cheerio

Feature         Axios + Cheerio     Puppeteer
Speed           50-200 pages/s      1-10 pages/s
Memory          ~30MB               ~200MB
JS rendering    No                  Yes
Dependencies    2 packages          Chrome binary
Setup           Instant             Downloads browser

Use Axios + Cheerio for static pages. Switch to Puppeteer or Playwright only when JavaScript rendering is required.
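A quick way to decide: fetch the raw HTML and check whether the selector you care about is already present. The sketch below (using the demo site's real selector; substitute your own) returns true when a browser is likely needed.

const axios = require('axios');
const cheerio = require('cheerio');

async function needsBrowser(url, selector) {
    const { data } = await axios.get(url, { timeout: 30000 });
    const $ = cheerio.load(data);
    // Selector absent from the raw HTML => content is probably rendered by JavaScript
    return $(selector).length === 0;
}

needsBrowser('https://books.toscrape.com/', 'article.product_pod')
    .then(needed => console.log(needed ? 'Use a browser' : 'Axios + Cheerio is enough'));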

Installation

npm init -y
npm install axios cheerio

Basic Scraping

const axios = require('axios');
const cheerio = require('cheerio');

async function scrape(url) {
    const { data } = await axios.get(url, {
        headers: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        },
        timeout: 30000,
    });

    const $ = cheerio.load(data);

    const books = [];
    $('article.product_pod').each((i, el) => {
        books.push({
            title: $(el).find('h3 a').attr('title'),
            price: $(el).find('.price_color').text(),
            rating: $(el).find('p.star-rating').attr('class').replace('star-rating ', ''),
        });
    });

    return books;
}

scrape('https://books.toscrape.com/')
    .then(books => {
        console.log(`Found ${books.length} books:`);
        books.forEach(b => console.log(`  ${b.title}: ${b.price}`));
    })
    .catch(console.error);
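One gotcha: Axios rejects the promise for any non-2xx status, so a 404 lands in catch rather than in your parsing code. If some misses are expected (probing catalogue pages, for instance), the validateStatus option lets you treat them as ordinary responses. A minimal sketch:

const response = await axios.get(url, {
    timeout: 30000,
    // Accept 404 as a normal response instead of a thrown error
    validateStatus: status => status === 200 || status === 404,
});

if (response.status === 404) {
    console.log('Page missing, skipping');
} else {
    const $ = cheerio.load(response.data);
}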

Handling Responses

JSON API

const { data } = await axios.get('https://api.example.com/products', {
    headers: { Accept: 'application/json' },
    params: { page: 1, limit: 50 },
});

// Axios automatically parses JSON
data.results.forEach(product => {
    console.log(`${product.name}: $${product.price}`);
});

POST Requests

// Form data
const { data } = await axios.post('https://example.com/search', 
    'query=web+scraping&page=1',
    { headers: { 'Content-Type': 'application/x-www-form-urlencoded' } }
);

// JSON body
const { data: jsonData } = await axios.post('https://api.example.com/search',
    { query: 'web scraping', page: 1 },
    { headers: { 'Content-Type': 'application/json' } }
);
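For form posts, URLSearchParams is worth knowing: it handles the encoding for you, and Axios serializes it as application/x-www-form-urlencoded automatically, so nothing needs escaping by hand.

// URLSearchParams encodes values safely; no manual escaping needed
const params = new URLSearchParams({ query: 'web scraping', page: '1' });
const { data: formResult } = await axios.post('https://example.com/search', params);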

Response Interceptors

const client = axios.create({
    baseURL: 'https://books.toscrape.com',
    timeout: 30000,
    headers: { 'User-Agent': 'Mozilla/5.0' },
});

// Log all requests
client.interceptors.request.use(config => {
    console.log(`Fetching: ${config.url}`);
    return config;
});

// Handle response errors
client.interceptors.response.use(
    response => response,
    error => {
        if (error.response) {
            console.error(`HTTP ${error.response.status}: ${error.config.url}`);
        } else {
            console.error(`Network error: ${error.message}`);
        }
        return Promise.reject(error);
    }
);
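Request interceptors are also a convenient place to rotate headers per request. A sketch extending the client above; the User-Agent strings are illustrative, so substitute current ones in practice:

const userAgents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
];

client.interceptors.request.use(config => {
    // Pick a random User-Agent for each outgoing request
    config.headers['User-Agent'] =
        userAgents[Math.floor(Math.random() * userAgents.length)];
    return config;
});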

Pagination

Sequential

const axios = require('axios');
const cheerio = require('cheerio');

async function scrapeAllPages() {
    const client = axios.create({
        timeout: 30000,
        headers: { 'User-Agent': 'Mozilla/5.0' },
    });

    const allBooks = [];

    for (let page = 1; page <= 50; page++) {
        try {
            const { data } = await client.get(
                `https://books.toscrape.com/catalogue/page-${page}.html`
            );
            const $ = cheerio.load(data);

            const books = $('article.product_pod').map((i, el) => ({
                title: $(el).find('h3 a').attr('title'),
                price: $(el).find('.price_color').text(),
            })).get();

            if (books.length === 0) break;
            allBooks.push(...books);

            console.log(`Page ${page}: ${books.length} books`);

            // Polite delay
            await new Promise(r => setTimeout(r, 1000));
        } catch (err) {
            console.error(`Error on page ${page}: ${err.message}`);
            break;
        }
    }

    return allBooks;
}

Following Next Links

async function scrapeFollowingLinks(startUrl) {
    const client = axios.create({ timeout: 30000 });
    const allBooks = [];
    let url = startUrl;

    while (url) {
        const { data } = await client.get(url);
        const $ = cheerio.load(data);

        $('article.product_pod').each((i, el) => {
            allBooks.push({
                title: $(el).find('h3 a').attr('title'),
                price: $(el).find('.price_color').text(),
            });
        });

        const nextHref = $('li.next a').attr('href');
        url = nextHref ? new URL(nextHref, url).href : null;

        await new Promise(r => setTimeout(r, 1000));
    }

    return allBooks;
}

Concurrent Scraping

Promise.all with Batching

async function scrapeConcurrently(urls, concurrency = 5) {
    const client = axios.create({
        timeout: 30000,
        headers: { 'User-Agent': 'Mozilla/5.0' },
    });

    const allResults = [];

    for (let i = 0; i < urls.length; i += concurrency) {
        const batch = urls.slice(i, i + concurrency);

        const results = await Promise.allSettled(
            batch.map(async (url) => {
                const { data } = await client.get(url);
                const $ = cheerio.load(data);

                return $('article.product_pod').map((_, el) => ({
                    title: $(el).find('h3 a').attr('title'),
                    price: $(el).find('.price_color').text(),
                    source: url,
                })).get();
            })
        );

        results.forEach(result => {
            if (result.status === 'fulfilled') {
                allResults.push(...result.value);
            }
        });

        console.log(`Batch ${Math.floor(i / concurrency) + 1}: ${allResults.length} total items`);
        await new Promise(r => setTimeout(r, 500));
    }

    return allResults;
}

// Usage
const urls = Array.from({ length: 50 }, (_, i) =>
    `https://books.toscrape.com/catalogue/page-${i + 1}.html`
);

scrapeConcurrently(urls, 5).then(results => {
    console.log(`Total: ${results.length} books`);
});

p-limit for Concurrency Control

npm install p-limit

const pLimit = require('p-limit'); // v3 works with require(); v4+ is ESM-only

const limit = pLimit(5); // Max 5 concurrent requests

const tasks = urls.map(url =>
    limit(async () => {
        const { data } = await axios.get(url);
        const $ = cheerio.load(data);
        return $('h1').text();
    })
);

const results = await Promise.all(tasks);

Error Handling and Retries

axios-retry

npm install axios-retry

const axios = require('axios');
const axiosRetry = require('axios-retry').default;

const client = axios.create({ timeout: 30000 });

axiosRetry(client, {
    retries: 3,
    retryDelay: axiosRetry.exponentialDelay,
    retryCondition: (error) => {
        return axiosRetry.isNetworkOrIdempotentRequestError(error) ||
               error.response?.status === 429 ||
               error.response?.status >= 500;
    },
    onRetry: (retryCount, error) => {
        console.log(`Retry ${retryCount}: ${error.config.url}`);
    },
});

Manual Retry

async function fetchWithRetry(url, maxRetries = 3) {
    for (let attempt = 1; attempt <= maxRetries; attempt++) {
        try {
            const response = await axios.get(url, { timeout: 30000 });
            return response.data;
        } catch (err) {
            if (attempt === maxRetries) throw err;

            const delay = Math.pow(2, attempt) * 1000;
            console.log(`Attempt ${attempt} failed, retrying in ${delay}ms...`);
            await new Promise(r => setTimeout(r, delay));
        }
    }
}
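When a server answers 429, it often includes a Retry-After header saying how long to wait. Here is a variant of the function above that honors that header (assuming it carries seconds, its most common form) and otherwise falls back to exponential backoff:

async function fetchRespectingRetryAfter(url, maxRetries = 3) {
    for (let attempt = 1; attempt <= maxRetries; attempt++) {
        try {
            const response = await axios.get(url, { timeout: 30000 });
            return response.data;
        } catch (err) {
            if (attempt === maxRetries) throw err;

            let delay = Math.pow(2, attempt) * 1000; // default: exponential backoff
            const retryAfter = err.response?.headers['retry-after'];
            const seconds = Number(retryAfter);
            if (err.response?.status === 429 && !Number.isNaN(seconds)) {
                delay = seconds * 1000; // the server told us exactly how long to wait
            }
            console.log(`Attempt ${attempt} failed, retrying in ${delay}ms...`);
            await new Promise(r => setTimeout(r, delay));
        }
    }
}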

Proxy Integration

Basic Proxy

const { data } = await axios.get('https://httpbin.org/ip', {
    proxy: {
        host: 'proxy.example.com',
        port: 8080,
        auth: {
            username: 'user',
            password: 'pass',
        },
    },
});

console.log(data);

Rotating Proxies

const proxies = [
    { host: 'proxy1.example.com', port: 8080 },
    { host: 'proxy2.example.com', port: 8080 },
    { host: 'proxy3.example.com', port: 8080 },
];

function getRandomProxy() {
    const proxy = proxies[Math.floor(Math.random() * proxies.length)];
    return {
        ...proxy,
        auth: { username: 'user', password: 'pass' },
    };
}

async function scrapeWithProxy(url) {
    const { data } = await axios.get(url, {
        proxy: getRandomProxy(),
        timeout: 30000,
    });
    return cheerio.load(data);
}

Using https-proxy-agent

npm install http-proxy-agent https-proxy-agent

const { HttpProxyAgent } = require('http-proxy-agent');
const { HttpsProxyAgent } = require('https-proxy-agent');

const proxyUrl = 'http://user:pass@proxy.example.com:8080';

const { data } = await axios.get('https://httpbin.org/ip', {
    httpAgent: new HttpProxyAgent(proxyUrl),   // for plain-HTTP targets
    httpsAgent: new HttpsProxyAgent(proxyUrl), // for HTTPS targets (CONNECT tunnel)
    proxy: false, // disable Axios's built-in proxy handling
});

For proxy types, see our web scraping proxy guide and proxy glossary.

Storing Data

JSON

const fs = require('fs');

fs.writeFileSync('books.json', JSON.stringify(books, null, 2));
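For long crawls, JSON Lines (one object per line, appended as you go) avoids holding every record in memory and preserves partial results if the run is interrupted:

// Append each record as a single line; books.jsonl grows incrementally
for (const book of books) {
    fs.appendFileSync('books.jsonl', JSON.stringify(book) + '\n');
}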

CSV

npm install csv-stringify

const { stringify } = require('csv-stringify/sync');

const csv = stringify(books, {
    header: true,
    columns: ['title', 'price', 'rating'],
});

fs.writeFileSync('books.csv', csv);

SQLite

npm install better-sqlite3

const Database = require('better-sqlite3');

const db = new Database('books.db');
db.exec('CREATE TABLE IF NOT EXISTS books (title TEXT, price TEXT, rating TEXT)');

const insert = db.prepare('INSERT INTO books VALUES (?, ?, ?)');
const insertMany = db.transaction((books) => {
    for (const book of books) {
        insert.run(book.title, book.price, book.rating);
    }
});

insertMany(books);
db.close();

Session Management

npm install tough-cookie axios-cookiejar-support

const axios = require('axios');
const cheerio = require('cheerio');
const tough = require('tough-cookie');
const { wrapper } = require('axios-cookiejar-support');

// Create cookie jar
const jar = new tough.CookieJar();
const client = wrapper(axios.create({
    jar,
    withCredentials: true,
    timeout: 30000,
}));

// Login
await client.post('https://example.com/login', {
    username: 'user',
    password: 'pass',
});

// Subsequent requests maintain cookies
const { data } = await client.get('https://example.com/dashboard');
const $ = cheerio.load(data);
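Many login forms also embed a CSRF token in a hidden input that the server expects back in the POST. The usual pattern is to GET the form, parse the token out with Cheerio, and include it in the login request; the field name csrf_token below is an assumption, so inspect the actual form for the real one.

// Fetch the login page through the same cookie-jar client
const loginPage = await client.get('https://example.com/login');
const $login = cheerio.load(loginPage.data);
const token = $login('input[name="csrf_token"]').val(); // hypothetical field name

await client.post('https://example.com/login', new URLSearchParams({
    username: 'user',
    password: 'pass',
    csrf_token: token,
}));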

Complete Example

const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs');

class BookScraper {
    constructor(options = {}) {
        this.concurrency = options.concurrency || 5;
        this.delay = options.delay || 1000;
        this.client = axios.create({
            timeout: 30000,
            headers: {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                'Accept': 'text/html',
            },
        });
        this.books = [];
    }

    async scrapePage(url) {
        const { data } = await this.client.get(url);
        const $ = cheerio.load(data);

        const books = $('article.product_pod').map((i, el) => ({
            title: $(el).find('h3 a').attr('title'),
            price: parseFloat($(el).find('.price_color').text().replace('£', '')),
            rating: $(el).find('p.star-rating').attr('class').replace('star-rating ', ''),
            available: $(el).find('.instock').length > 0,
        })).get();

        return books;
    }

    async scrapeAll(totalPages = 50) {
        const urls = Array.from({ length: totalPages }, (_, i) =>
            `https://books.toscrape.com/catalogue/page-${i + 1}.html`
        );

        for (let i = 0; i < urls.length; i += this.concurrency) {
            const batch = urls.slice(i, i + this.concurrency);

            const results = await Promise.allSettled(
                batch.map(url => this.scrapePage(url))
            );

            results.forEach(r => {
                if (r.status === 'fulfilled') this.books.push(...r.value);
            });

            console.log(`Progress: ${this.books.length} books`);
            await new Promise(r => setTimeout(r, this.delay));
        }

        return this.books;
    }

    save(filename) {
        fs.writeFileSync(filename, JSON.stringify(this.books, null, 2));
        console.log(`Saved ${this.books.length} books to ${filename}`);
    }

    stats() {
        const avgPrice = this.books.reduce((sum, b) => sum + b.price, 0) / this.books.length;
        console.log(`Total: ${this.books.length} books, Avg price: £${avgPrice.toFixed(2)}`);
    }
}

const scraper = new BookScraper({ concurrency: 5, delay: 500 });
scraper.scrapeAll().then(() => {
    scraper.stats();
    scraper.save('books.json');
});

FAQ

When should I use Axios + Cheerio instead of Puppeteer?

Use Axios + Cheerio for static HTML pages that do not require JavaScript rendering. It is 10-50x faster and uses 80% less memory. Only switch to Puppeteer or Playwright when the page loads content dynamically via JavaScript.

Can I use fetch() instead of Axios?

Yes. Node.js 18+ includes native fetch(). However, Axios provides better error handling, request/response interceptors, automatic JSON parsing, timeout support, and proxy configuration. For simple requests, fetch works fine.
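For comparison, here is the basic scrape with native fetch. Note that unlike Axios, fetch does not reject on 4xx/5xx statuses, so you check res.ok yourself:

const cheerio = require('cheerio');

const res = await fetch('https://books.toscrape.com/');
if (!res.ok) throw new Error(`HTTP ${res.status}`); // fetch only throws on network errors
const html = await res.text();

const $ = cheerio.load(html);
console.log(`${$('article.product_pod').length} books on the first page`);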

How do I handle rate limiting with Axios?

Use axios-retry for automatic retries with exponential backoff. Add delays between requests with setTimeout. Use the p-limit package to control concurrent request count. Monitor for 429 status codes and respect Retry-After headers.

Is Axios + Cheerio the same as Requests + BeautifulSoup?

Conceptually, yes. Axios is Node.js’s Requests (HTTP client), and Cheerio is Node.js’s BeautifulSoup (HTML parser). The APIs are different but the approach is identical: fetch HTML, parse it, extract data.


Explore more Node.js scraping: Cheerio tutorial, Puppeteer tutorial. For proxy setup, see our web scraping proxy guide.
