Hugging Face Web Scraping with Proxy Integration
Hugging Face has become the central hub for AI models, datasets, and tools. While most people associate it with NLP tasks like text classification or summarization, its ecosystem is increasingly useful for web scraping. You can use Hugging Face models to parse unstructured web content, classify scraped data, extract entities, and even power AI agents that scrape autonomously.
This guide covers practical ways to combine Hugging Face models with web scraping pipelines and proxy infrastructure for reliable, intelligent data collection.
Why Use Hugging Face for Web Scraping?
Traditional scraping relies on CSS selectors and XPath queries that break whenever a website changes its HTML structure. Hugging Face models add intelligence to your pipeline:
- entity extraction – pull out names, dates, prices, and locations from unstructured text without writing custom regex
- content classification – automatically categorize scraped pages by topic
- summarization – condense long scraped articles into key points
- question answering – extract specific facts from scraped content by asking natural language questions
- table understanding – parse complex tables with table QA or document vision models instead of brittle HTML parsing (see the sketch below)
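As a quick illustration of the last point, a table-question-answering pipeline can query an already-extracted table directly. This is a minimal sketch, assuming the TAPAS checkpoint google/tapas-base-finetuned-wtq and an illustrative table; swap in whatever table data your scraper produces:
from transformers import pipeline
# table QA pipeline (TAPAS); all cell values must be strings
table_qa = pipeline(
    "table-question-answering",
    model="google/tapas-base-finetuned-wtq",
    device=-1,
)
# illustrative table, as it might come out of a scraped pricing page
table = {
    "plan": ["Starter", "Pro", "Enterprise"],
    "price": ["$10", "$49", "$199"],
    "bandwidth": ["5 GB", "50 GB", "unlimited"],
}
result = table_qa(table=table, query="which plan costs $49?")
print(result["answer"])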
Setting Up the Environment
pip install transformers torch datasets curl-cffi beautifulsoup4 lxml accelerate
For GPU acceleration (recommended for larger models):
pip install torch --index-url https://download.pytorch.org/whl/cu121
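This pairs with the device arguments used in the pipelines below; a minimal sketch for picking the device automatically (assuming PyTorch is installed):
import torch
# 0 = first GPU, -1 = CPU, matching the device argument of transformers pipelines
device = 0 if torch.cuda.is_available() else -1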
Method 1: Named Entity Recognition on Scraped Content
Use a Hugging Face NER model to extract structured entities from scraped web pages.
from transformers import pipeline
from curl_cffi import requests
from bs4 import BeautifulSoup
import json
import random
# load NER pipeline
ner = pipeline(
"ner",
model="dslim/bert-base-NER",
aggregation_strategy="simple",
device=-1, # use -1 for CPU, 0 for GPU
)
# proxy configuration
PROXIES = [
"http://user:pass@residential1.proxy.com:port",
"http://user:pass@residential2.proxy.com:port",
]
def scrape_and_extract_entities(url):
"""scrape a page and extract named entities using BERT"""
session = requests.Session(impersonate="chrome124")
proxy = random.choice(PROXIES)
response = session.get(
url,
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
},
proxies={"http": proxy, "https": proxy},
timeout=30,
)
soup = BeautifulSoup(response.text, "lxml")
# remove non-content elements
for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
tag.decompose()
# get clean text
text = soup.get_text(separator=" ", strip=True)
# NER models have token limits, process in chunks
chunks = [text[i:i+512] for i in range(0, min(len(text), 5000), 512)]
all_entities = []
for chunk in chunks:
entities = ner(chunk)
for entity in entities:
all_entities.append({
"text": entity["word"],
"type": entity["entity_group"],
"confidence": round(entity["score"], 3),
})
# deduplicate entities
seen = set()
unique_entities = []
for entity in all_entities:
key = (entity["text"].lower(), entity["type"])
if key not in seen:
seen.add(key)
unique_entities.append(entity)
return {
"url": url,
"entities": {
"persons": [e for e in unique_entities if e["type"] == "PER"],
"organizations": [e for e in unique_entities if e["type"] == "ORG"],
"locations": [e for e in unique_entities if e["type"] == "LOC"],
"misc": [e for e in unique_entities if e["type"] == "MISC"],
},
"total_entities": len(unique_entities),
}
# usage
result = scrape_and_extract_entities("https://example-news-site.com/article")
print(json.dumps(result, indent=2))
Method 2: Content Classification of Scraped Pages
Automatically classify scraped content into categories using zero-shot classification.
from transformers import pipeline
from curl_cffi import requests
from bs4 import BeautifulSoup
import random
# zero-shot classifier - no training needed
classifier = pipeline(
"zero-shot-classification",
model="facebook/bart-large-mnli",
device=-1,
)
def classify_scraped_page(url, categories, proxy=None):
"""scrape a page and classify its content into categories"""
session = requests.Session(impersonate="chrome124")
proxy_dict = {"http": proxy, "https": proxy} if proxy else None
response = session.get(
url,
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
},
proxies=proxy_dict,
timeout=30,
)
soup = BeautifulSoup(response.text, "lxml")
for tag in soup(["script", "style", "nav", "footer"]):
tag.decompose()
# get title and first 500 chars of content
title = soup.title.get_text(strip=True) if soup.title else ""
text = soup.get_text(separator=" ", strip=True)[:500]
input_text = f"{title}. {text}"
# classify
result = classifier(input_text, candidate_labels=categories)
return {
"url": url,
"title": title,
"classification": {
label: round(score, 3)
for label, score in zip(result["labels"], result["scores"])
},
"top_category": result["labels"][0],
"confidence": round(result["scores"][0], 3),
}
# usage: classify scraped pages into topic categories
categories = [
"technology",
"business",
"health",
"politics",
"sports",
"entertainment",
"science",
]
proxy = random.choice(PROXIES)
result = classify_scraped_page(
"https://example.com/article-about-ai",
categories,
proxy=proxy,
)
print(f"category: {result['top_category']} ({result['confidence']:.1%})")
print(f"all scores: {result['classification']}")
Method 3: Question Answering on Scraped Content
Use a QA model to extract specific information from scraped pages by asking natural language questions.
from transformers import pipeline
from curl_cffi import requests
from bs4 import BeautifulSoup
import random
# question answering pipeline
qa = pipeline(
"question-answering",
model="deepset/roberta-base-squad2",
device=-1,
)
def extract_facts_from_page(url, questions, proxy=None):
"""scrape a page and answer specific questions about its content"""
session = requests.Session(impersonate="chrome124")
proxy_dict = {"http": proxy, "https": proxy} if proxy else None
response = session.get(
url,
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
},
proxies=proxy_dict,
timeout=30,
)
soup = BeautifulSoup(response.text, "lxml")
for tag in soup(["script", "style"]):
tag.decompose()
context = soup.get_text(separator=" ", strip=True)[:3000]
answers = {}
for question in questions:
try:
result = qa(question=question, context=context)
answers[question] = {
"answer": result["answer"],
"confidence": round(result["score"], 3),
"start": result["start"],
"end": result["end"],
}
except Exception as e:
answers[question] = {"error": str(e)}
return {"url": url, "answers": answers}
# usage: extract specific facts from a company page
facts = extract_facts_from_page(
"https://example-company.com/about",
questions=[
"when was the company founded?",
"who is the CEO?",
"how many employees does the company have?",
"where is the company headquartered?",
],
proxy=random.choice(PROXIES),
)
for question, answer in facts["answers"].items():
if "error" not in answer:
print(f"Q: {question}")
print(f"A: {answer['answer']} (confidence: {answer['confidence']:.1%})")
print()
Method 4: Summarizing Scraped Articles
When you scrape many articles, summarization helps you digest the content quickly.
from transformers import pipeline
from curl_cffi import requests
from bs4 import BeautifulSoup
import random
import time
summarizer = pipeline(
"summarization",
model="facebook/bart-large-cnn",
device=-1,
)
def scrape_and_summarize(urls, proxy=None, max_length=150, min_length=50):
"""scrape multiple articles and generate summaries"""
session = requests.Session(impersonate="chrome124")
proxy_dict = {"http": proxy, "https": proxy} if proxy else None
summaries = []
for url in urls:
try:
response = session.get(
url,
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
},
proxies=proxy_dict,
timeout=30,
)
soup = BeautifulSoup(response.text, "lxml")
for tag in soup(["script", "style", "nav", "footer", "header"]):
tag.decompose()
# extract article text
article = soup.find("article") or soup.find("main") or soup
text = article.get_text(separator=" ", strip=True)
# bart-large-cnn has a 1024 token limit
text = text[:2000]
if len(text) > 100:
summary = summarizer(
text,
max_length=max_length,
min_length=min_length,
do_sample=False,
)
summaries.append({
"url": url,
"title": soup.title.get_text(strip=True) if soup.title else "",
"summary": summary[0]["summary_text"],
"original_length": len(text),
})
else:
summaries.append({
"url": url,
"error": "content too short to summarize",
})
except Exception as e:
summaries.append({"url": url, "error": str(e)})
time.sleep(random.uniform(2, 5))
return summaries
# usage
urls_to_summarize = [
"https://news-site.com/article-1",
"https://news-site.com/article-2",
"https://news-site.com/article-3",
]
summaries = scrape_and_summarize(urls_to_summarize, proxy=random.choice(PROXIES))
for s in summaries:
if "summary" in s:
print(f"\n{s['title']}")
print(f" {s['summary']}")
Method 5: Building a Hugging Face Agent for Web Scraping
Hugging Face’s Transformers Agents framework lets you build tool-using AI agents, similar to what frameworks like Google ADK offer. Note that newer transformers releases have moved this functionality into the standalone smolagents library, so the code below assumes a transformers version that still ships the agents module.
from transformers import ReactCodeAgent, HfApiEngine, tool
from curl_cffi import requests
from bs4 import BeautifulSoup
import json
import random
# define scraping tools for the agent; the @tool decorator turns plain functions
# into agent tools (it expects type hints and an Args: section in the docstring)
@tool
def web_scrape(url: str) -> str:
    """
    Scrapes a webpage through a rotating proxy and returns its text content.

    Args:
        url: the URL of the webpage to scrape
    """
session = requests.Session(impersonate="chrome124")
proxy = random.choice(PROXIES)
response = session.get(
url,
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
},
proxies={"http": proxy, "https": proxy},
timeout=30,
)
soup = BeautifulSoup(response.text, "lxml")
for tag in soup(["script", "style", "nav", "footer"]):
tag.decompose()
text = soup.get_text(separator="\n", strip=True)
return text[:5000]
@tool
def extract_table_data(url: str) -> str:
    """
    Scrapes a webpage and extracts all table data, returned as a JSON string.

    Args:
        url: the URL of the webpage containing tables
    """
session = requests.Session(impersonate="chrome124")
proxy = random.choice(PROXIES)
response = session.get(
url,
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
},
proxies={"http": proxy, "https": proxy},
timeout=30,
)
soup = BeautifulSoup(response.text, "lxml")
tables = []
for table in soup.find_all("table"):
headers = [th.get_text(strip=True) for th in table.select("tr th")]
rows = []
for tr in table.select("tr")[1:]:
cells = [td.get_text(strip=True) for td in tr.select("td")]
if cells and headers:
row_dict = dict(zip(headers, cells))
rows.append(row_dict)
elif cells:
rows.append(cells)
if rows:
tables.append(rows)
return json.dumps(tables, indent=2)
# create the agent
llm_engine = HfApiEngine("Qwen/Qwen2.5-Coder-32B-Instruct")
agent = ReactCodeAgent(
tools=[web_scrape, extract_table_data],
llm_engine=llm_engine,
max_iterations=5,
)
# run the agent
result = agent.run(
"scrape the pricing page at https://example.com/pricing and extract "
"all pricing tiers and features into a structured format"
)
print(result)
Method 6: Using Hugging Face Inference API with Proxies
For running models without a local GPU, use Hugging Face’s Inference API.
import requests as stdlib_requests  # the standard requests library, distinct from curl_cffi.requests
class HuggingFaceInference:
def __init__(self, api_key, proxy=None):
self.api_key = api_key
self.base_url = "https://api-inference.huggingface.co/models"
self.proxy = {"http": proxy, "https": proxy} if proxy else None
def query(self, model, payload):
"""query a Hugging Face model via the Inference API"""
response = stdlib_requests.post(
f"{self.base_url}/{model}",
headers={"Authorization": f"Bearer {self.api_key}"},
json=payload,
proxies=self.proxy,
timeout=60,
)
return response.json()
def classify_text(self, text, labels):
"""zero-shot text classification"""
return self.query(
"facebook/bart-large-mnli",
{"inputs": text, "parameters": {"candidate_labels": labels}},
)
def extract_entities(self, text):
"""named entity recognition"""
return self.query("dslim/bert-base-NER", {"inputs": text})
def summarize(self, text, max_length=150):
"""text summarization"""
return self.query(
"facebook/bart-large-cnn",
{"inputs": text, "parameters": {"max_length": max_length}},
)
def answer_question(self, question, context):
"""question answering"""
return self.query(
"deepset/roberta-base-squad2",
{"inputs": {"question": question, "context": context}},
)
# usage
hf = HuggingFaceInference(
api_key="hf_YOUR_API_KEY",
proxy="http://user:pass@proxy:port" # route API calls through proxy if needed
)
# classify scraped content
result = hf.classify_text(
"the new proxy server supports HTTP/2 and offers residential IPs in 50 countries",
labels=["technology", "business", "sports", "politics"]
)
print(f"category: {result['labels'][0]}")
Building a Complete Scraping + ML Pipeline
Here’s a full pipeline that scrapes, processes, and enriches data using Hugging Face models.
from curl_cffi import requests
from bs4 import BeautifulSoup
from transformers import pipeline
import pandas as pd
import json
import time
import random
class IntelligentScraper:
def __init__(self, proxies, device=-1):
self.proxies = proxies
self.session = requests.Session(impersonate="chrome124")
# load ML models
print("loading models...")
self.ner = pipeline("ner", model="dslim/bert-base-NER",
aggregation_strategy="simple", device=device)
self.classifier = pipeline("zero-shot-classification",
model="facebook/bart-large-mnli", device=device)
print("models loaded")
def scrape(self, url):
"""scrape a URL through a rotating proxy"""
proxy = random.choice(self.proxies)
response = self.session.get(
url,
headers={
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
},
proxies={"http": proxy, "https": proxy},
timeout=30,
)
return response
def extract_text(self, html):
"""clean and extract text from HTML"""
soup = BeautifulSoup(html, "lxml")
for tag in soup(["script", "style", "nav", "footer", "header"]):
tag.decompose()
return soup.get_text(separator=" ", strip=True)
def process_page(self, url, categories=None):
"""scrape, extract, classify, and enrich a single page"""
response = self.scrape(url)
if response.status_code != 200:
return {"url": url, "error": f"HTTP {response.status_code}"}
text = self.extract_text(response.text)
soup = BeautifulSoup(response.text, "lxml")
title = soup.title.get_text(strip=True) if soup.title else ""
result = {
"url": url,
"title": title,
"text_length": len(text),
}
# extract entities
entities = self.ner(text[:1000])
result["entities"] = {
"organizations": list(set(
e["word"] for e in entities if e["entity_group"] == "ORG"
)),
"persons": list(set(
e["word"] for e in entities if e["entity_group"] == "PER"
)),
"locations": list(set(
e["word"] for e in entities if e["entity_group"] == "LOC"
)),
}
# classify content
if categories:
classification = self.classifier(
f"{title}. {text[:300]}",
candidate_labels=categories,
)
result["category"] = classification["labels"][0]
result["category_confidence"] = round(classification["scores"][0], 3)
return result
def process_batch(self, urls, categories=None, delay=3):
"""process multiple URLs with rate limiting"""
results = []
for i, url in enumerate(urls):
print(f"[{i+1}/{len(urls)}] processing {url}")
result = self.process_page(url, categories)
results.append(result)
time.sleep(random.uniform(delay * 0.5, delay * 1.5))
return results
# usage
scraper = IntelligentScraper(
proxies=[
"http://user:pass@proxy1.com:port",
"http://user:pass@proxy2.com:port",
],
device=-1, # CPU
)
urls = [
"https://example.com/article-1",
"https://example.com/article-2",
"https://example.com/article-3",
]
categories = ["proxy services", "web scraping", "data privacy", "cybersecurity"]
results = scraper.process_batch(urls, categories=categories)
# export to DataFrame
df = pd.DataFrame(results)
print(df[["url", "title", "category", "category_confidence"]].to_string())
Proxy Considerations for Hugging Face Workflows
When building Hugging Face scraping pipelines, proxy usage applies at two levels:
- scraping proxies – for collecting web data from target sites
- API proxies – for routing Hugging Face Inference API calls (less common, but useful in restricted environments)
# scraping with proxy rotation
def get_proxy_session():
session = requests.Session(impersonate="chrome124")
proxy = random.choice(PROXIES)
session.proxies = {"http": proxy, "https": proxy}
return session
# for HF Inference API (if behind a firewall or need to route through specific IPs)
hf_response = stdlib_requests.post(
"https://api-inference.huggingface.co/models/dslim/bert-base-NER",
headers={"Authorization": "Bearer hf_token"},
json={"inputs": "scraped text here"},
proxies={"https": "http://api-proxy:port"}, # optional
)
Use the Proxy Cost Calculator to estimate costs based on your scraping volume, and check your request fingerprint with the Browser Fingerprint Tester to make sure your traffic looks clean.
Performance Tips
- batch your model inference – processing 10 texts at once is faster than 10 individual calls (see the sketch after this list)
- use smaller models for classification – distilbert-base-uncased is 2x faster than BERT base with minimal quality loss
- cache model outputs – if you’re processing similar content, cache results to avoid redundant inference
- use GPU when available – set device=0 in pipeline creation for 5-10x speedup on NVIDIA GPUs
- separate scraping and processing – scrape first and store raw HTML, then process with models. this lets you re-process without re-scraping
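For the batching tip, here is a rough sketch that reuses the ner pipeline from Method 1 (the input texts are illustrative); Hugging Face pipelines accept a list of inputs and an optional batch_size:
# one call over a list lets the pipeline batch inputs internally
texts = [
    "Acme Corp opened a new office in Berlin last March.",
    "Maria Lopez joined DataWorks as head of engineering.",
]
results = ner(texts, batch_size=8)
for text, entities in zip(texts, results):
    print(text, "->", [e["word"] for e in entities])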
Summary
Hugging Face transforms web scraping from simple data extraction into intelligent data processing. By combining proxy-powered scraping with Hugging Face models, you can:
- extract named entities from scraped pages automatically
- classify content into categories without training data
- answer specific questions about scraped content
- summarize large volumes of scraped articles
- build AI agents that scrape and analyze autonomously
The key is to treat scraping and ML processing as separate stages in your pipeline: scrape with proxies for reliability, store the raw data, then process with models for intelligence. This separation makes your pipeline more maintainable and lets you upgrade either component independently.
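A minimal sketch of that separation, with raw HTML cached on disk between the two stages (directory and naming scheme are illustrative):
import hashlib
import pathlib
RAW_DIR = pathlib.Path("raw_html")
RAW_DIR.mkdir(exist_ok=True)
def save_raw(url, html):
    """stage 1: store raw HTML keyed by a hash of the URL"""
    name = hashlib.sha256(url.encode()).hexdigest()[:16] + ".html"
    (RAW_DIR / name).write_text(html, encoding="utf-8")
def iter_raw():
    """stage 2: iterate over stored pages for (re-)processing, no network needed"""
    for path in sorted(RAW_DIR.glob("*.html")):
        yield path.read_text(encoding="utf-8")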