Axios + Cheerio: Lightweight Node.js Scraping
Axios + Cheerio is the Requests + BeautifulSoup of the Node.js world. Axios handles HTTP requests with a clean promise-based API. Cheerio provides jQuery-style HTML parsing without a browser. Together, they form the fastest and lightest scraping stack in Node.js — processing pages 10-50x faster than Puppeteer or Playwright because there is no browser to launch.
This tutorial covers setup, common patterns, error handling, concurrent scraping, and proxy integration.
Table of Contents
- Why Axios + Cheerio
- Installation
- Basic Scraping
- Handling Responses
- Pagination
- Concurrent Scraping
- Error Handling and Retries
- Proxy Integration
- Storing Data
- Session Management
- Complete Example
- FAQ
Why Axios + Cheerio
| Feature | Axios + Cheerio | Puppeteer |
|---|---|---|
| Speed | 50-200 pages/s | 1-10 pages/s |
| Memory | ~30MB | ~200MB |
| JS rendering | No | Yes |
| Dependencies | 2 packages | Chrome binary |
| Setup | Instant | Downloads browser |
Use Axios + Cheerio for static pages. Switch to Puppeteer or Playwright only when JavaScript rendering is required.
Installation
npm init -y
npm install axios cheerio
Basic Scraping
const axios = require('axios');
const cheerio = require('cheerio');
// Download one catalogue page and return an array of book records
// ({ title, price, rating }) extracted with jQuery-style selectors.
async function scrape(url) {
  const response = await axios.get(url, {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    },
    timeout: 30000,
  });
  const $ = cheerio.load(response.data);
  // One record per product card; .map().get() converts the Cheerio
  // collection into a plain JavaScript array.
  return $('article.product_pod')
    .map((_, card) => ({
      title: $(card).find('h3 a').attr('title'),
      price: $(card).find('.price_color').text(),
      rating: $(card).find('p.star-rating').attr('class').replace('star-rating ', ''),
    }))
    .get();
}
scrape('https://books.toscrape.com/')
.then(books => {
console.log(`Found ${books.length} books:`);
books.forEach(b => console.log(` ${b.title}: ${b.price}`));
})
.catch(console.error);
Handling Responses
JSON API
const { data } = await axios.get('https://api.example.com/products', {
headers: { Accept: 'application/json' },
params: { page: 1, limit: 50 },
});
// Axios automatically parses JSON
data.results.forEach(product => {
console.log(`${product.name}: $${product.price}`);
});
POST Requests
// Form data
const { data } = await axios.post('https://example.com/search',
'query=web+scraping&page=1',
{ headers: { 'Content-Type': 'application/x-www-form-urlencoded' } }
);
// JSON body
const { data: jsonData } = await axios.post('https://api.example.com/search',
{ query: 'web scraping', page: 1 },
{ headers: { 'Content-Type': 'application/json' } }
);
Response Interceptors
const client = axios.create({
baseURL: 'https://books.toscrape.com',
timeout: 30000,
headers: { 'User-Agent': 'Mozilla/5.0' },
});
// Log all requests
client.interceptors.request.use(config => {
console.log(`Fetching: ${config.url}`);
return config;
});
// Handle response errors
client.interceptors.response.use(
response => response,
error => {
if (error.response) {
console.error(`HTTP ${error.response.status}: ${error.config.url}`);
} else {
console.error(`Network error: ${error.message}`);
}
return Promise.reject(error);
}
);
Pagination
Sequential
const axios = require('axios');
const cheerio = require('cheerio');
async function scrapeAllPages() {
const client = axios.create({
timeout: 30000,
headers: { 'User-Agent': 'Mozilla/5.0' },
});
const allBooks = [];
for (let page = 1; page <= 50; page++) {
try {
const { data } = await client.get(
`https://books.toscrape.com/catalogue/page-${page}.html`
);
const $ = cheerio.load(data);
const books = $('article.product_pod').map((i, el) => ({
title: $(el).find('h3 a').attr('title'),
price: $(el).find('.price_color').text(),
})).get();
if (books.length === 0) break;
allBooks.push(...books);
console.log(`Page ${page}: ${books.length} books`);
// Polite delay
await new Promise(r => setTimeout(r, 1000));
} catch (err) {
console.error(`Error on page ${page}: ${err.message}`);
break;
}
}
return allBooks;
}Following Next Links
async function scrapeFollowingLinks(startUrl) {
const client = axios.create({ timeout: 30000 });
const allBooks = [];
let url = startUrl;
while (url) {
const { data } = await client.get(url);
const $ = cheerio.load(data);
$('article.product_pod').each((i, el) => {
allBooks.push({
title: $(el).find('h3 a').attr('title'),
price: $(el).find('.price_color').text(),
});
});
const nextHref = $('li.next a').attr('href');
url = nextHref ? new URL(nextHref, url).href : null;
await new Promise(r => setTimeout(r, 1000));
}
return allBooks;
}Concurrent Scraping
Promise.all with Batching
// Scrape a list of URLs in fixed-size batches of `concurrency` requests.
// Individual failures are tolerated (allSettled) rather than aborting.
async function scrapeConcurrently(urls, concurrency = 5) {
  const http = axios.create({
    timeout: 30000,
    headers: { 'User-Agent': 'Mozilla/5.0' },
  });
  const collected = [];
  let batchNumber = 0;
  for (let offset = 0; offset < urls.length; offset += concurrency) {
    batchNumber += 1;
    const batchUrls = urls.slice(offset, offset + concurrency);
    // allSettled: one failed URL must not sink the whole batch.
    const outcomes = await Promise.allSettled(
      batchUrls.map(async (pageUrl) => {
        const response = await http.get(pageUrl);
        const $ = cheerio.load(response.data);
        return $('article.product_pod').map((_, node) => ({
          title: $(node).find('h3 a').attr('title'),
          price: $(node).find('.price_color').text(),
          source: pageUrl,
        })).get();
      })
    );
    for (const outcome of outcomes) {
      if (outcome.status === 'fulfilled') collected.push(...outcome.value);
    }
    console.log(`Batch ${batchNumber}: ${collected.length} total items`);
    await new Promise(resolve => setTimeout(resolve, 500));
  }
  return collected;
}
// Usage
const urls = Array.from({ length: 50 }, (_, i) =>
`https://books.toscrape.com/catalogue/page-${i + 1}.html`
);
scrapeConcurrently(urls, 5).then(results => {
console.log(`Total: ${results.length} books`);
});
p-limit for Concurrency Control
npm install p-limit
const pLimit = require('p-limit');
const limit = pLimit(5); // Max 5 concurrent requests
const tasks = urls.map(url =>
limit(async () => {
const { data } = await axios.get(url);
const $ = cheerio.load(data);
return $('h1').text();
})
);
const results = await Promise.all(tasks);
Error Handling and Retries
axios-retry
npm install axios-retry
const axios = require('axios');
const axiosRetry = require('axios-retry').default;
const client = axios.create({ timeout: 30000 });
axiosRetry(client, {
retries: 3,
retryDelay: axiosRetry.exponentialDelay,
retryCondition: (error) => {
return axiosRetry.isNetworkOrIdempotentRequestError(error) ||
error.response?.status === 429 ||
error.response?.status >= 500;
},
onRetry: (retryCount, error) => {
console.log(`Retry ${retryCount}: ${error.config.url}`);
},
});
Manual Retry
async function fetchWithRetry(url, maxRetries = 3) {
for (let attempt = 1; attempt <= maxRetries; attempt++) {
try {
const response = await axios.get(url, { timeout: 30000 });
return response.data;
} catch (err) {
if (attempt === maxRetries) throw err;
const delay = Math.pow(2, attempt) * 1000;
console.log(`Attempt ${attempt} failed, retrying in ${delay}ms...`);
await new Promise(r => setTimeout(r, delay));
}
}
}Proxy Integration
Basic Proxy
const { data } = await axios.get('https://httpbin.org/ip', {
proxy: {
host: 'proxy.example.com',
port: 8080,
auth: {
username: 'user',
password: 'pass',
},
},
});
console.log(data);
Rotating Proxies
// Pool of proxy endpoints to rotate through.
const proxies = [
  { host: 'proxy1.example.com', port: 8080 },
  { host: 'proxy2.example.com', port: 8080 },
  { host: 'proxy3.example.com', port: 8080 },
];
// Pick a random pool entry and attach the shared credentials,
// returning a fresh object so the pool itself is never mutated.
function getRandomProxy() {
  const index = Math.floor(Math.random() * proxies.length);
  const chosen = proxies[index];
  return {
    ...chosen,
    auth: { username: 'user', password: 'pass' },
  };
}
async function scrapeWithProxy(url) {
const { data } = await axios.get(url, {
proxy: getRandomProxy(),
timeout: 30000,
});
return cheerio.load(data);
}Using https-proxy-agent
npm install https-proxy-agent
const { HttpsProxyAgent } = require('https-proxy-agent');
const agent = new HttpsProxyAgent('http://user:pass@proxy.example.com:8080');
const { data } = await axios.get('https://httpbin.org/ip', {
httpAgent: agent,
httpsAgent: agent,
proxy: false, // Disable Axios's built-in proxy handling
});
For proxy types, see our web scraping proxy guide and proxy glossary.
Storing Data
JSON
const fs = require('fs');
fs.writeFileSync('books.json', JSON.stringify(books, null, 2));
CSV
npm install csv-stringify
const { stringify } = require('csv-stringify/sync');
const csv = stringify(books, {
header: true,
columns: ['title', 'price', 'rating'],
});
fs.writeFileSync('books.csv', csv);
SQLite
npm install better-sqlite3
const Database = require('better-sqlite3');
const db = new Database('books.db');
db.exec('CREATE TABLE IF NOT EXISTS books (title TEXT, price TEXT, rating TEXT)');
const insert = db.prepare('INSERT INTO books VALUES (?, ?, ?)');
const insertMany = db.transaction((books) => {
for (const book of books) {
insert.run(book.title, book.price, book.rating);
}
});
insertMany(books);
db.close();
Session Management
const axios = require('axios');
const tough = require('tough-cookie');
const { wrapper } = require('axios-cookiejar-support');
// Create cookie jar
const jar = new tough.CookieJar();
const client = wrapper(axios.create({
jar,
withCredentials: true,
timeout: 30000,
}));
// Login
await client.post('https://example.com/login', {
username: 'user',
password: 'pass',
});
// Subsequent requests maintain cookies
const { data } = await client.get('https://example.com/dashboard');
const $ = cheerio.load(data);
Complete Example
const axios = require('axios');
const cheerio = require('cheerio');
const fs = require('fs');
// Scrapes books.toscrape.com in concurrent batches and stores the results.
class BookScraper {
  /**
   * @param {object} [options]
   * @param {number} [options.concurrency=5] - Pages fetched per batch.
   * @param {number} [options.delay=1000] - Pause between batches, in ms.
   */
  constructor(options = {}) {
    this.concurrency = options.concurrency || 5;
    this.delay = options.delay || 1000;
    this.client = axios.create({
      timeout: 30000,
      headers: {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
        'Accept': 'text/html',
      },
    });
    this.books = [];
  }

  /**
   * Fetch one catalogue page and parse its book entries.
   * @param {string} url
   * @returns {Promise<Array<{title: string, price: number, rating: string, available: boolean}>>}
   */
  async scrapePage(url) {
    const { data } = await this.client.get(url);
    const $ = cheerio.load(data);
    const books = $('article.product_pod').map((i, el) => ({
      title: $(el).find('h3 a').attr('title'),
      // "£51.77" -> 51.77
      price: parseFloat($(el).find('.price_color').text().replace('£', '')),
      rating: $(el).find('p.star-rating').attr('class').replace('star-rating ', ''),
      available: $(el).find('.instock').length > 0,
    })).get();
    return books;
  }

  /**
   * Scrape up to totalPages catalogue pages, this.concurrency at a time.
   * Failed pages are skipped (Promise.allSettled) instead of aborting.
   * @param {number} [totalPages=50]
   * @returns {Promise<Array>} the accumulated this.books array
   */
  async scrapeAll(totalPages = 50) {
    const urls = Array.from({ length: totalPages }, (_, i) =>
      `https://books.toscrape.com/catalogue/page-${i + 1}.html`
    );
    for (let i = 0; i < urls.length; i += this.concurrency) {
      const batch = urls.slice(i, i + this.concurrency);
      const results = await Promise.allSettled(
        batch.map(url => this.scrapePage(url))
      );
      results.forEach(r => {
        if (r.status === 'fulfilled') this.books.push(...r.value);
      });
      console.log(`Progress: ${this.books.length} books`);
      await new Promise(r => setTimeout(r, this.delay));
    }
    return this.books;
  }

  /**
   * Write collected books to disk as pretty-printed JSON.
   * Fix: the original logged a garbled "$(unknown)" placeholder instead
   * of interpolating the target filename.
   * @param {string} filename
   */
  save(filename) {
    fs.writeFileSync(filename, JSON.stringify(this.books, null, 2));
    console.log(`Saved ${this.books.length} books to ${filename}`);
  }

  /** Print the count and average price; guards the empty case so the
   *  average is never NaN (the original divided by zero). */
  stats() {
    if (this.books.length === 0) {
      console.log('Total: 0 books');
      return;
    }
    const avgPrice = this.books.reduce((sum, b) => sum + b.price, 0) / this.books.length;
    console.log(`Total: ${this.books.length} books, Avg price: £${avgPrice.toFixed(2)}`);
  }
}
const scraper = new BookScraper({ concurrency: 5, delay: 500 });
scraper.scrapeAll().then(() => {
scraper.stats();
scraper.save('books.json');
});
FAQ
When should I use Axios + Cheerio instead of Puppeteer?
Use Axios + Cheerio for static HTML pages that do not require JavaScript rendering. It is 10-50x faster and uses 80% less memory. Only switch to Puppeteer or Playwright when the page loads content dynamically via JavaScript.
Can I use fetch() instead of Axios?
Yes. Node.js 18+ includes native fetch(). However, Axios provides better error handling, request/response interceptors, automatic JSON parsing, timeout support, and proxy configuration. For simple requests, fetch works fine.
How do I handle rate limiting with Axios?
Use axios-retry for automatic retries with exponential backoff. Add delays between requests with setTimeout. Use the p-limit package to control concurrent request count. Monitor for 429 status codes and respect Retry-After headers.
Is Axios + Cheerio the same as Requests + BeautifulSoup?
Conceptually, yes. Axios is Node.js’s Requests (HTTP client), and Cheerio is Node.js’s BeautifulSoup (HTML parser). The APIs are different but the approach is identical: fetch HTML, parse it, extract data.
Explore more Node.js scraping: Cheerio tutorial, Puppeteer tutorial. For proxy setup, see our web scraping proxy guide.
External Resources:
- Axios Documentation
- Cheerio Documentation
- axios-retry Documentation
- aiohttp + BeautifulSoup: Async Python Scraping
- Beautiful Soup Tutorial: Python HTML Parsing Guide
- How Anti-Bot Systems Detect Scrapers (Cloudflare, Akamai, PerimeterX)
- API vs Web Scraping: When You Need Proxies (and When You Don’t)
- ASEAN Data Protection Laws: A Web Scraping Compliance Matrix
- How to Build an Ethical Web Scraping Policy for Your Company
- aiohttp + BeautifulSoup: Async Python Scraping
- Beautiful Soup Tutorial: Python HTML Parsing Guide
- How Anti-Bot Systems Detect Scrapers (Cloudflare, Akamai, PerimeterX)
- API vs Web Scraping: When You Need Proxies (and When You Don’t)
- ASEAN Data Protection Laws: A Web Scraping Compliance Matrix
- How to Build an Ethical Web Scraping Policy for Your Company
- aiohttp + BeautifulSoup: Async Python Scraping
- Beautiful Soup Tutorial: Python HTML Parsing Guide
- How Anti-Bot Systems Detect Scrapers (Cloudflare, Akamai, PerimeterX)
- API vs Web Scraping: When You Need Proxies (and When You Don’t)
- ASEAN Data Protection Laws: A Web Scraping Compliance Matrix
- How to Build an Ethical Web Scraping Policy for Your Company
- aiohttp + BeautifulSoup: Async Python Scraping
- Axios Retry for Web Scraping in Node.js: the Complete Guide
- How Anti-Bot Systems Detect Scrapers (Cloudflare, Akamai, PerimeterX)
- API vs Web Scraping: When You Need Proxies (and When You Don’t)
- ASEAN Data Protection Laws: A Web Scraping Compliance Matrix
- How to Build an Ethical Web Scraping Policy for Your Company
- aiohttp + BeautifulSoup: Async Python Scraping
- Axios Retry for Web Scraping in Node.js: the Complete Guide
- How Anti-Bot Systems Detect Scrapers (Cloudflare, Akamai, PerimeterX)
- API vs Web Scraping: When You Need Proxies (and When You Don’t)
- ASEAN Data Protection Laws: A Web Scraping Compliance Matrix
- How to Build an Ethical Web Scraping Policy for Your Company
- aiohttp + BeautifulSoup: Async Python Scraping
- Axios Retry for Web Scraping in Node.js: the Complete Guide
- How Anti-Bot Systems Detect Scrapers (Cloudflare, Akamai, PerimeterX)
- API vs Web Scraping: When You Need Proxies (and When You Don’t)
- ASEAN Data Protection Laws: A Web Scraping Compliance Matrix
- How to Build an Ethical Web Scraping Policy for Your Company
Related Reading
- aiohttp + BeautifulSoup: Async Python Scraping
- Axios Retry for Web Scraping in Node.js: the Complete Guide
- How Anti-Bot Systems Detect Scrapers (Cloudflare, Akamai, PerimeterX)
- API vs Web Scraping: When You Need Proxies (and When You Don’t)
- ASEAN Data Protection Laws: A Web Scraping Compliance Matrix
- How to Build an Ethical Web Scraping Policy for Your Company