you can build a custom search engine by combining a web crawler, a text indexer (whoosh or elasticsearch), and a query interface. the entire stack runs locally and can index any niche domain Google ignores.
custom search engines solve a real problem: Google does not index everything, and even when it does, results get drowned in ads and SEO noise. if you need precise results from a specific domain — a competitor’s site, a forum, a documentation set — building your own search engine is the practical solution.
this tutorial builds a working search engine in Python: a crawler that collects pages, a simple inverted index, and a query interface that returns ranked results.
architecture overview
three components work together:
- crawler — follows links and downloads pages
- indexer — tokenizes text and builds an inverted index
- query engine — scores documents against search queries
for scale, replace the local index with elasticsearch or meilisearch. for crawling at volume, see our "what is web scraping" guide for proxy strategies.
the crawler
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
from collections import deque
import time
def crawl(start_url, max_pages=200, delay=1.0):
    visited = set()
    queue = deque([start_url])
    pages = {}
    domain = urlparse(start_url).netloc
    while queue and len(pages) < max_pages:
        url = queue.popleft()
        if url in visited:
            continue
        visited.add(url)  # mark before fetching so failed URLs are not re-queued forever
        try:
            r = requests.get(url, timeout=10, headers={
                "User-Agent": "SearchBot/1.0"
            })
            r.raise_for_status()
            if "text/html" not in r.headers.get("Content-Type", ""):
                continue
            soup = BeautifulSoup(r.text, "html.parser")
            title = soup.title.string.strip() if soup.title and soup.title.string else url
            body = soup.get_text(separator=" ", strip=True)
            pages[url] = {"title": title, "body": body, "url": url}
            for a in soup.find_all("a", href=True):
                link = urljoin(url, a["href"])
                # stay on the start domain and skip anything already seen
                if urlparse(link).netloc == domain and link not in visited:
                    queue.append(link)
        except Exception as e:
            print(f"error crawling {url}: {e}")
        time.sleep(delay)  # throttle every request, including failed ones
    return pages
staying on one domain keeps the crawler focused. adjust the delay and add a proxy for larger crawls; see our "what is a proxy server" notes.
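one duplicate-killer worth adding: the crawler above treats /page and /page#intro as two different URLs. a small normalizer keeps the visited set tight; this is a sketch, and normalize_url is a hypothetical helper of our own, not something crawl() already calls:

from urllib.parse import urldefrag

def normalize_url(link):
    # drop #fragments so /page and /page#intro dedupe to one URL;
    # query strings are site-specific, so we leave them alone
    clean, _ = urldefrag(link)
    return clean

run every extracted link through it before the visited check in crawl().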
building the inverted index
import re
from collections import defaultdict
import math

def tokenize(text):
    return re.findall(r'[a-z]{2,}', text.lower())

def build_index(pages):
    index = defaultdict(set)
    for url, page in pages.items():
        tokens = tokenize(page["title"] + " " + page["body"])
        for token in set(tokens):
            index[token].add(url)
    return index

def tf_idf_score(query, pages, index):
    tokens = tokenize(query)
    n_docs = len(pages)
    scores = defaultdict(float)
    # tokenize each body once so tf counts whole words, not substrings
    # ("list" should not match inside "listing")
    body_tokens = {url: tokenize(page["body"]) for url, page in pages.items()}
    for token in tokens:
        matching = index.get(token, set())
        if not matching:
            continue
        # smoothed IDF: zero for a term in every document, never negative
        idf = math.log((n_docs + 1) / (1 + len(matching)))
        for url in matching:
            words = body_tokens[url]
            tf = words.count(token) / max(len(words), 1)
            scores[url] += tf * idf
    return sorted(scores.items(), key=lambda x: x[1], reverse=True)
TF-IDF (term frequency-inverse document frequency) weights rare terms higher. common words like “the” appear everywhere and score near zero after IDF weighting.
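to see the weighting concretely, compare a rare term against a ubiquitous one. a toy calculation, assuming a 100-page index and the smoothed IDF formula used above:

import math

n_docs = 100
# "metaclass" appears on 3 pages; "python" appears on all 100
rare_idf = math.log((n_docs + 1) / (1 + 3))      # ~3.23
common_idf = math.log((n_docs + 1) / (1 + 100))  # 0.0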
query interface
def search(query, pages, index, top_n=10):
    results = tf_idf_score(query, pages, index)[:top_n]
    output = []
    for url, score in results:
        page = pages[url]
        snippet = page["body"][:200].strip()
        output.append({
            "title": page["title"],
            "url": url,
            "score": round(score, 4),
            "snippet": snippet
        })
    return output
# run it
pages = crawl("https://docs.python.org/3/", max_pages=100)
index = build_index(pages)
results = search("list comprehension", pages, index)

for r in results:
    print(f"{r['title']} ({r['score']})")
    print(f"  {r['url']}")
    print(f"  {r['snippet'][:100]}...")
    print()
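the intro mentioned whoosh as an off-the-shelf alternative to the hand-rolled index: it stays pure Python but gives you BM25 ranking and a real query parser. a minimal sketch, assuming the pages dict returned by crawl() and an indexdir directory of our own choosing:

import os
from whoosh import index
from whoosh.fields import Schema, TEXT, ID
from whoosh.qparser import QueryParser

# stored=True keeps the field retrievable in results
schema = Schema(url=ID(stored=True, unique=True),
                title=TEXT(stored=True),
                body=TEXT)
os.makedirs("indexdir", exist_ok=True)
ix = index.create_in("indexdir", schema)

writer = ix.writer()
for url, page in pages.items():
    writer.add_document(url=url, title=page["title"], body=page["body"])
writer.commit()

with ix.searcher() as searcher:
    q = QueryParser("body", ix.schema).parse("list comprehension")
    for hit in searcher.search(q, limit=10):
        print(hit["title"], hit["url"])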
scaling with elasticsearch
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")

# index every crawled page
for url, page in pages.items():
    es.index(index="my-search", id=url, body={
        "title": page["title"],
        "body": page["body"],
        "url": url
    })

# query, boosting title matches 3x over body matches
results = es.search(index="my-search", body={
    "query": {"multi_match": {
        "query": "list comprehension",
        "fields": ["title^3", "body"]
    }}
})
elasticsearch handles millions of documents and supports fuzzy matching, faceted search, and real-time indexing. use it once your local index exceeds 50,000 pages.
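fuzzy matching, for example, is a one-line change to the query. a sketch against the same my-search index as above:

# tolerate typos: "comprehnsion" still matches "comprehension"
results = es.search(index="my-search", body={
    "query": {"match": {
        "body": {"query": "list comprehnsion", "fuzziness": "AUTO"}
    }}
})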
adding proxy rotation to the crawler
for large crawls across multiple domains, proxy rotation prevents IP bans. see our comparison of SOCKS5 vs HTTP proxy for the right proxy type.
import random

proxies_list = [
    "http://user:pass@proxy1:port",
    "http://user:pass@proxy2:port",
]

def get_proxy():
    # pick one proxy and use it for both schemes so a single
    # request does not split across two different exit IPs
    proxy = random.choice(proxies_list)
    return {"http": proxy, "https": proxy}

# inside the crawl loop, replace the plain requests.get with:
r = requests.get(url, proxies=get_proxy(), timeout=10)
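rotation pays off most when you also retry through a different proxy on failure. a minimal sketch; fetch() is a hypothetical helper you would call from crawl() in place of the bare requests.get:

def fetch(url, retries=3):
    # each attempt draws a fresh proxy from the pool
    for _ in range(retries):
        try:
            r = requests.get(url, proxies=get_proxy(), timeout=10,
                             headers={"User-Agent": "SearchBot/1.0"})
            r.raise_for_status()
            return r
        except requests.RequestException:
            continue  # rotate and retry
    return None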