Proxies for Scraping Business Directories (Yellow Pages, BBB, Clutch)
Business directories remain one of the most reliable sources of B2B lead data. Unlike social platforms that frequently change their interfaces and anti-bot systems, directories like Yellow Pages, BBB, Clutch, G2, and Capterra have relatively stable structures and contain verified business information. With the right proxy setup, you can extract thousands of qualified leads per day from these sources.
This guide covers proxy strategies and scraping techniques for the most popular business directories, each of which has unique challenges and data opportunities.
Directory Landscape for B2B Lead Generation
Different directories serve different markets. Choose your targets based on your ideal customer profile:
| Directory | Best For | Data Quality | Anti-Bot Level |
|---|---|---|---|
| Yellow Pages | Local service businesses | Medium | Low |
| BBB | Established businesses | High (accredited) | Medium |
| Clutch | B2B service providers | High | Medium |
| G2 | SaaS companies | High | High |
| Capterra | Software companies | High | Medium |
| Angi (HomeAdvisor) | Home services | Medium | Medium |
| ThomasNet | Manufacturers/suppliers | High | Low |
| Manta | Small businesses | Medium | Low |
Proxy Strategy by Directory
Each directory has different detection mechanisms. Mobile proxies work across all of them, but the optimal configuration varies.
Low-Protection Directories (Yellow Pages, Manta, ThomasNet)
These directories have basic rate limiting but minimal fingerprinting. Rotating mobile proxies with moderate concurrency work well:
import requests
from bs4 import BeautifulSoup
import time
import random
# Tuning for directories with only basic rate limiting (Yellow Pages,
# Manta, ThomasNet): plain HTTP requests, higher concurrency, short delays.
LOW_PROTECTION_CONFIG = {
    # Rotating mobile proxy gateway (credentials + host:port).
    "proxy_url": "http://user:pass@gateway.dataresearchtools.com:5000",
    # Parallel workers; these sites tolerate moderate request volume.
    "concurrency": 20,
    # (min, max) seconds of random delay between page requests.
    "delay_range": (1, 3),
    # Pages are server-rendered, so no headless browser is needed.
    "use_browser": False,
}
def scrape_yellowpages(category, location, proxy_url, max_pages=20):
    """Scrape Yellow Pages search results for a category/location pair.

    Args:
        category: Search term, e.g. "plumbers".
        location: Geographic term, e.g. "Austin, TX".
        proxy_url: Full proxy URL ("http://user:pass@host:port").
        max_pages: Upper bound on result pages to fetch.

    Returns:
        List of business dicts produced by parse_yellowpages_result().
        Stops early on HTTP errors, network errors, or an empty page,
        returning whatever was collected up to that point.
    """
    businesses = []
    # Map BOTH schemes to the proxy. The original only mapped "https",
    # so any plain-http request (e.g. a redirect hop) would leak the
    # local IP instead of going through the proxy.
    proxies = {"http": proxy_url, "https": proxy_url}
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    for page in range(1, max_pages + 1):
        url = (
            "https://www.yellowpages.com/search"
            f"?search_terms={category}&geo_location_terms={location}&page={page}"
        )
        try:
            response = requests.get(url, proxies=proxies, headers=headers, timeout=15)
        except requests.RequestException as e:
            # A single network/proxy failure shouldn't lose the leads
            # already scraped — report and return what we have.
            print(f"Page {page}: request failed ({e})")
            break
        if response.status_code != 200:
            print(f"Page {page}: HTTP {response.status_code}")
            break
        soup = BeautifulSoup(response.text, 'lxml')
        results = soup.find_all('div', class_='result')
        if not results:  # empty result list marks the end of pagination
            break
        for result in results:
            business = parse_yellowpages_result(result)
            if business:
                businesses.append(business)
        if page < max_pages:
            # Jittered delay between pages; no pointless sleep after the last one.
            time.sleep(random.uniform(1, 3))
    return businesses
def parse_yellowpages_result(result):
    """Parse one Yellow Pages search-result card into a plain dict.

    Args:
        result: BeautifulSoup Tag for a single 'div.result' element.

    Returns:
        Dict with whichever of name/yp_url/phone/address/city_state/
        categories/website/rating could be extracted, or None when no
        business name was found (a lead without a name is unusable).
    """
    business = {}
    # Name + canonical listing URL
    name_el = result.find('a', class_='business-name')
    if name_el:
        business['name'] = name_el.get_text(strip=True)
        business['yp_url'] = f"https://www.yellowpages.com{name_el.get('href', '')}"
    # Phone
    phone_el = result.find('div', class_='phones')
    if phone_el:
        business['phone'] = phone_el.get_text(strip=True)
    # Address — street and "City, ST" live in separate spans
    address_el = result.find('div', class_='adr')
    if address_el:
        street = address_el.find('span', class_='street-address')
        locality = address_el.find('span', class_='locality')
        business['address'] = street.get_text(strip=True) if street else None
        business['city_state'] = locality.get_text(strip=True) if locality else None
    # Categories
    cat_els = result.find_all('a', class_='category')
    business['categories'] = [c.get_text(strip=True) for c in cat_els]
    # Website (tracked outbound link)
    website_el = result.find('a', class_='track-visit-website')
    if website_el:
        business['website'] = website_el.get('href', '')
    # Star rating appears to be encoded as a word CSS class, e.g.
    # "three-half" — TODO confirm the half-star suffix format.
    # The original mapping only recognized three/four/five and dropped
    # the half-star suffix; cover the full one..five range instead.
    rating_el = result.find('div', class_='ratings')
    if rating_el:
        stars = rating_el.find('div', class_='result-rating')
        if stars:
            word_values = {'one': 1.0, 'two': 2.0, 'three': 3.0,
                           'four': 4.0, 'five': 5.0}
            for cls in stars.get('class', []):
                parts = cls.split('-')
                if parts[0] in word_values:
                    rating = word_values[parts[0]]
                    if 'half' in parts[1:]:
                        rating += 0.5
                    business['rating'] = rating
    return business if business.get('name') else None
# Medium-Protection Directories (BBB, Clutch, Capterra)
These require more careful handling. Use lower concurrency and longer delays:
# Tuning for directories with moderate protection (BBB, Clutch, Capterra):
# fewer parallel workers, longer jittered delays, and a real browser.
MEDIUM_PROTECTION_CONFIG = {
    # Rotating mobile proxy gateway (credentials + host:port).
    "proxy_url": "http://user:pass@gateway.dataresearchtools.com:5000",
    # Low concurrency to stay under per-IP rate limits.
    "concurrency": 5,
    # (min, max) seconds of random delay between requests.
    "delay_range": (3, 8),
    "use_browser": True,  # Recommended for JS-rendered content
}
async def scrape_bbb(category, location, proxy_config):
    """Scrape BBB directory search results using browser automation.

    Args:
        category: Search text (e.g. "roofing").
        location: Location filter (e.g. "Denver, CO").
        proxy_config: Playwright proxy dict
            ({"server": ..., "username": ..., "password": ...}).

    Returns:
        List of dicts with name/bbb_url/phone/bbb_rating/bbb_accredited.
    """
    from playwright.async_api import async_playwright

    businesses = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            proxy=proxy_config,
            # NOTE(review): headed mode — presumably to reduce headless
            # fingerprinting; confirm this is intentional at scale.
            headless=False,
        )
        try:
            page = await browser.new_page()
            search_url = (
                "https://www.bbb.org/search"
                f"?find_country=US&find_text={category}&find_loc={location}&find_type=Category"
            )
            await page.goto(search_url, wait_until="networkidle")
            # Human-like pause before touching the DOM.
            await page.wait_for_timeout(random.randint(3000, 6000))
            # Parse result cards
            results = await page.query_selector_all('[data-testid="result-card"]')
            for result in results:
                business = {}
                name_el = await result.query_selector('h3 a')
                if name_el:
                    business['name'] = await name_el.inner_text()
                    business['bbb_url'] = await name_el.get_attribute('href')
                phone_el = await result.query_selector('a[href^="tel:"]')
                if phone_el:
                    business['phone'] = await phone_el.inner_text()
                # BBB rating is a letter grade (A+, A, B+, ...), not numeric.
                rating_el = await result.query_selector('[class*="rating"]')
                if rating_el:
                    business['bbb_rating'] = await rating_el.inner_text()
                # Accreditation = presence of the accreditation badge element.
                accredited_el = await result.query_selector('[class*="accredited"]')
                business['bbb_accredited'] = accredited_el is not None
                if business.get('name'):
                    businesses.append(business)
        finally:
            # Always release the browser process — the original leaked it
            # whenever navigation or parsing raised.
            await browser.close()
    return businesses
# Scraping Clutch for B2B Service Providers
Clutch is particularly valuable for finding agencies, development shops, and consulting firms:
async def scrape_clutch(category_slug, proxy_config):
    """Scrape Clutch.co provider listings for one category.

    Args:
        category_slug: Clutch path segment, e.g. "web-developers".
        proxy_config: Playwright proxy dict.

    Returns:
        List of dicts with name/clutch_url/clutch_rating/location/
        min_project/employees (each field present only when found).
    """
    from playwright.async_api import async_playwright

    companies = []
    async with async_playwright() as p:
        browser = await p.chromium.launch(proxy=proxy_config)
        try:
            page = await browser.new_page()
            url = f"https://clutch.co/{category_slug}"
            await page.goto(url, wait_until="networkidle")
            # Human-like pause before scraping the DOM.
            await page.wait_for_timeout(random.randint(3000, 5000))
            # Extract company cards
            cards = await page.query_selector_all('.provider-row')
            for card in cards:
                company = {}
                name_el = await card.query_selector('h3 a')
                if name_el:
                    company['name'] = (await name_el.inner_text()).strip()
                    company['clutch_url'] = await name_el.get_attribute('href')
                # Clutch rating (numeric text; skip field when unparseable)
                rating_el = await card.query_selector('.rating')
                if rating_el:
                    rating_text = await rating_el.inner_text()
                    try:
                        company['clutch_rating'] = float(rating_text.strip())
                    except ValueError:
                        pass
                # Location
                loc_el = await card.query_selector('.locality')
                if loc_el:
                    company['location'] = (await loc_el.inner_text()).strip()
                # Minimum project size
                project_el = await card.query_selector('.field--min-project-size')
                if project_el:
                    company['min_project'] = (await project_el.inner_text()).strip()
                # Employee count
                emp_el = await card.query_selector('.field--employees')
                if emp_el:
                    company['employees'] = (await emp_el.inner_text()).strip()
                if company.get('name'):
                    companies.append(company)
        finally:
            # Release the browser even when navigation/parsing fails —
            # the original leaked it on any exception.
            await browser.close()
    return companies
# Multi-Directory Lead Aggregation
The real power comes from combining data across multiple directories. For technical proxy concepts referenced below, see our proxy glossary.
class DirectoryAggregator:
    """Aggregate and deduplicate leads collected from multiple directories.

    Leads are keyed by a normalized (name, city) pair, so the same
    business found on e.g. Yellow Pages and BBB is merged into a single
    record whose 'sources' list names every directory it appeared in.
    """

    def __init__(self):
        # dedup key -> merged lead dict
        self.leads = {}

    def add_leads(self, source, leads_list):
        """Add a batch of leads scraped from one directory.

        Args:
            source: Short identifier of the directory ("yp", "bbb", ...).
            leads_list: Iterable of lead dicts.
        """
        for lead in leads_list:
            key = self.generate_key(lead)
            if key in self.leads:
                # Seen before — merge the new source's data in.
                self.leads[key] = self.merge_lead(self.leads[key], lead, source)
            else:
                lead['sources'] = [source]
                self.leads[key] = lead

    def generate_key(self, lead):
        """Build a deduplication key from normalized name + city.

        Falls back to 'city_state' when 'city' is absent, because some
        parsers in this pipeline (e.g. Yellow Pages) emit 'city_state'
        rather than 'city'; without the fallback those leads all share
        an empty city component and over-merge.
        """
        import re
        name = re.sub(r'[^a-z0-9]', '', (lead.get('name', '')).lower())
        city_raw = lead.get('city') or lead.get('city_state', '')
        city = re.sub(r'[^a-z0-9]', '', city_raw.lower())
        return f"{name}_{city}"

    def merge_lead(self, existing, new, source):
        """Merge data from a new source into an existing lead record."""
        existing['sources'].append(source)
        # Fill gaps only — the first-seen value wins for populated fields.
        for field in ['phone', 'website', 'address', 'email']:
            if not existing.get(field) and new.get(field):
                existing[field] = new[field]
        # Keep the highest review count seen across sources.
        if new.get('review_count', 0) > existing.get('review_count', 0):
            existing['review_count'] = new['review_count']
        return existing

    def get_multi_source_leads(self, min_sources=2):
        """Return leads confirmed by at least min_sources directories."""
        return [
            lead for lead in self.leads.values()
            if len(lead.get('sources', [])) >= min_sources
        ]
# Handling Pagination at Scale
Most directories paginate results. Implement robust pagination handling:
async def paginate_directory(page, base_url, max_pages=50):
    """Walk a directory's paginated results with basic error recovery.

    Args:
        page: Playwright page object (already proxied/configured).
        base_url: Search URL without the page query parameter.
        max_pages: Hard cap on pages to visit.

    Returns:
        Combined list of results from parse_directory_page() across pages.
    """
    all_results = []
    current_page = 1
    retried = False  # whether the current page already got its one retry
    while current_page <= max_pages:
        url = f"{base_url}&page={current_page}"
        try:
            await page.goto(url, wait_until="networkidle", timeout=30000)
            await page.wait_for_timeout(random.randint(2000, 5000))
            # Explicit end-of-results marker from the site.
            no_results = await page.query_selector('text="No results found"')
            if no_results:
                break
            # Extract results from the current page
            html = await page.content()
            results = parse_directory_page(html)
            if not results:
                break
            all_results.extend(results)
            current_page += 1
            retried = False
            # Progressive delay: slow down as we go deeper to avoid
            # tripping rate limits on long crawls.
            delay = random.uniform(2, 5) + (current_page * 0.5)
            await page.wait_for_timeout(int(delay * 1000))
        except Exception as e:
            print(f"Error on page {current_page}: {e}")
            # Cool down, then retry the SAME page once before skipping it.
            # (The original incremented the counter here, silently skipping
            # the failed page despite its "retry once" comment.)
            await page.wait_for_timeout(30000)
            if retried:
                current_page += 1
                retried = False
            else:
                retried = True
    return all_results
# Industry-Specific Directory Strategies
ThomasNet (Manufacturing)
ThomasNet is ideal for finding manufacturers and industrial suppliers. Its anti-bot measures are minimal:
def scrape_thomasnet(product_category, proxy_url):
    """Scrape ThomasNet supplier listings for a product category.

    Args:
        product_category: URL slug of the product category.
        proxy_url: Full proxy URL ("http://user:pass@host:port").

    Returns:
        List of dicts with supplier 'name' and (when present) 'location'.
        Returns an empty list on network failure or a non-200 response.
    """
    url = f"https://www.thomasnet.com/products/{product_category}"
    try:
        response = requests.get(
            url,
            # Map both schemes so nothing bypasses the proxy.
            proxies={"http": proxy_url, "https": proxy_url},
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=15,
        )
    except requests.RequestException as e:
        print(f"ThomasNet request failed: {e}")
        return []
    if response.status_code != 200:
        # The original parsed error pages as if they were results.
        print(f"ThomasNet: HTTP {response.status_code}")
        return []
    soup = BeautifulSoup(response.text, 'lxml')
    suppliers = []
    for listing in soup.find_all('div', class_='supplier-result'):
        supplier = {}
        name_el = listing.find('h2')
        if name_el:
            supplier['name'] = name_el.get_text(strip=True)
        location_el = listing.find('span', class_='supplier-location')
        if location_el:
            supplier['location'] = location_el.get_text(strip=True)
        suppliers.append(supplier)
    return suppliers
# Capterra (Software Companies)
Capterra provides excellent data for SaaS sales teams. Each listing includes company size, pricing model, and feature lists — giving rich data for web scraping operations targeting the software industry.
Data Quality Assurance
Directory data has inherent quality issues. Implement validation:
import phonenumbers
def validate_lead(lead):
    """Clean a scraped lead in place and flag data-quality problems.

    Normalizes the phone number to E.164, ensures the website URL has a
    scheme, and checks required fields. Mutates and returns the lead,
    adding 'validation_issues' (list of problem strings) and 'is_valid'.
    """
    from urllib.parse import urlparse

    problems = []

    # --- Phone: normalize to E.164, dropping anything unusable ---
    raw_phone = lead.get('phone')
    if raw_phone:
        try:
            number = phonenumbers.parse(raw_phone, 'US')
        except phonenumbers.NumberParseException:
            problems.append("Unparseable phone number")
            lead['phone'] = None
        else:
            if phonenumbers.is_valid_number(number):
                lead['phone'] = phonenumbers.format_number(
                    number, phonenumbers.PhoneNumberFormat.E164
                )
            else:
                problems.append("Invalid phone number")
                lead['phone'] = None

    # --- Website: guarantee an https:// scheme ---
    site = lead.get('website')
    if site and not urlparse(site).scheme:
        lead['website'] = f"https://{site}"

    # --- Completeness: name is the only hard requirement ---
    if not lead.get('name'):
        problems.append("Missing required field: name")

    lead['validation_issues'] = problems
    lead['is_valid'] = not problems
    return lead
# Conclusion
Business directories provide structured, verified lead data that complements social platform scraping. By targeting multiple directories with appropriate proxy configurations — low concurrency for well-protected sites, higher throughput for simpler ones — you can build comprehensive lead databases with cross-referenced data points. The multi-source approach improves data accuracy and provides richer lead profiles than any single directory alone.
Start with the directories most relevant to your target market, build your aggregation pipeline, and scale to additional sources as you validate lead quality through conversion rates.