How to Scrape Twitch Stream & Chat Data
Twitch is the world’s leading live streaming platform with over 140 million monthly active users and 7.5 million unique streamers. For gaming industry analysts, influencer marketers, and competitive intelligence teams, Twitch data provides insights into audience engagement, content trends, and streamer performance.
This guide covers how to scrape Twitch data using Python, the Twitch API (Helix), and chat scraping via IRC.
What Data Can You Extract from Twitch?
- Stream metadata (title, game, viewer count, uptime)
- Streamer profiles (followers, subscribers, broadcaster type)
- Chat messages (real-time and historical)
- Clip data (popular clips, view counts)
- VOD information (past broadcasts, highlights)
- Game/category statistics
- Emote and badge data
- Channel point redemptions
Example JSON Output
{
"stream_id": "12345678901",
"broadcaster": {
"id": "123456789",
"login": "popular_streamer",
"display_name": "Popular_Streamer",
"broadcaster_type": "partner",
"follower_count": 2500000,
"profile_image": "https://static-cdn.jtvnw.net/..."
},
"stream": {
"title": "Ranked Grind to Immortal! | !socials",
"game": "VALORANT",
"viewer_count": 15234,
"started_at": "2026-03-01T10:00:00Z",
"language": "en",
"tags": ["English", "FPS", "Competitive"]
},
"chat_sample": [
{"user": "viewer123", "message": "LET'S GO!", "timestamp": "2026-03-01T14:30:01Z"},
{"user": "subscriber456", "message": "PogChamp", "timestamp": "2026-03-01T14:30:02Z"}
]
}

Prerequisites

pip install requests twitchAPI websockets irc

Twitch provides a comprehensive API. Register your application at dev.twitch.tv to get a Client ID and Client Secret.
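To keep the Client ID and Secret out of source control, one option is to read them from environment variables (the variable names below are our own convention, not a Twitch requirement):

import os

CLIENT_ID = os.environ["TWITCH_CLIENT_ID"]
CLIENT_SECRET = os.environ["TWITCH_CLIENT_SECRET"]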
Method 1: Twitch Helix API
import requests
import json
import time
class TwitchAPIScraper:
def __init__(self, client_id, client_secret, proxy_url=None):
self.client_id = client_id
self.client_secret = client_secret
self.proxy_url = proxy_url
self.base_url = "https://api.twitch.tv/helix"
self.access_token = self._get_access_token()
def _get_proxies(self):
if self.proxy_url:
return {"http": self.proxy_url, "https": self.proxy_url}
return None
def _get_access_token(self):
url = "https://id.twitch.tv/oauth2/token"
params = {
"client_id": self.client_id,
"client_secret": self.client_secret,
"grant_type": "client_credentials"
}
        response = requests.post(url, params=params)
        response.raise_for_status()  # fail fast on bad credentials
        return response.json()["access_token"]
def _get_headers(self):
return {
"Client-ID": self.client_id,
"Authorization": f"Bearer {self.access_token}"
}
def get_streams(self, game_id=None, language="en", first=100):
"""Get currently live streams."""
url = f"{self.base_url}/streams"
params = {"first": first, "language": language}
if game_id:
params["game_id"] = game_id
response = requests.get(url, headers=self._get_headers(), params=params, proxies=self._get_proxies())
data = response.json()
streams = []
for stream in data.get("data", []):
streams.append({
"id": stream["id"],
"user_id": stream["user_id"],
"user_name": stream["user_name"],
"title": stream["title"],
"game_name": stream["game_name"],
"viewer_count": stream["viewer_count"],
"started_at": stream["started_at"],
"language": stream["language"],
"tags": stream.get("tags", []),
"thumbnail": stream["thumbnail_url"],
})
return streams
def get_user_info(self, usernames):
"""Get user/channel information."""
url = f"{self.base_url}/users"
params = [("login", u) for u in usernames]
response = requests.get(url, headers=self._get_headers(), params=params, proxies=self._get_proxies())
return response.json().get("data", [])
    def get_channel_followers(self, broadcaster_id, first=100):
        """Get follower data. Note: the follower list itself requires a user
        access token with the moderator:read:followers scope; with an app
        access token, Twitch returns only the total."""
url = f"{self.base_url}/channels/followers"
params = {"broadcaster_id": broadcaster_id, "first": first}
response = requests.get(url, headers=self._get_headers(), params=params, proxies=self._get_proxies())
data = response.json()
return {
"total": data.get("total", 0),
"followers": data.get("data", [])
}
def get_clips(self, broadcaster_id, first=20):
"""Get popular clips from a channel."""
url = f"{self.base_url}/clips"
params = {"broadcaster_id": broadcaster_id, "first": first}
response = requests.get(url, headers=self._get_headers(), params=params, proxies=self._get_proxies())
return response.json().get("data", [])
def search_channels(self, query, first=20):
"""Search for channels."""
url = f"{self.base_url}/search/channels"
params = {"query": query, "first": first}
response = requests.get(url, headers=self._get_headers(), params=params, proxies=self._get_proxies())
return response.json().get("data", [])
# Usage
scraper = TwitchAPIScraper(client_id="YOUR_CLIENT_ID", client_secret="YOUR_CLIENT_SECRET")
streams = scraper.get_streams(language="en", first=20)
for stream in streams[:5]:
print(f"{stream['user_name']}: {stream['title']} ({stream['viewer_count']} viewers)")Method 2: Twitch Chat Scraping via IRC
Method 2: Twitch Chat Scraping via IRC

import socket
import re
import json
import time
from datetime import datetime, timezone
class TwitchChatScraper:
def __init__(self, oauth_token, nickname):
self.server = "irc.chat.twitch.tv"
self.port = 6667
self.oauth_token = oauth_token
self.nickname = nickname
self.sock = None
def connect(self):
self.sock = socket.socket()
self.sock.connect((self.server, self.port))
self.sock.send(f"PASS oauth:{self.oauth_token}\r\n".encode("utf-8"))
self.sock.send(f"NICK {self.nickname}\r\n".encode("utf-8"))
self.sock.send("CAP REQ :twitch.tv/tags twitch.tv/commands\r\n".encode("utf-8"))
time.sleep(2)
def join_channel(self, channel):
self.sock.send(f"JOIN #{channel}\r\n".encode("utf-8"))
time.sleep(1)
def collect_messages(self, channel, duration_seconds=300, max_messages=1000):
"""Collect chat messages for a specified duration."""
self.join_channel(channel)
messages = []
start_time = time.time()
while time.time() - start_time < duration_seconds and len(messages) < max_messages:
try:
self.sock.settimeout(5)
                response = self.sock.recv(4096).decode("utf-8", errors="ignore")
                # A single recv() can contain several IRC lines; handle each one
                for line in response.split("\r\n"):
                    if line.startswith("PING"):
                        self.sock.send("PONG :tmi.twitch.tv\r\n".encode("utf-8"))
                        continue
                    parsed = self._parse_message(line)
                    if parsed:
                        messages.append(parsed)
except socket.timeout:
continue
except Exception as e:
print(f"Error: {e}")
break
return messages
def _parse_message(self, raw):
pattern = r":(\w+)!\w+@\w+\.tmi\.twitch\.tv PRIVMSG #(\w+) :(.*)"
match = re.search(pattern, raw)
if match:
return {
"username": match.group(1),
"channel": match.group(2),
"message": match.group(3).strip(),
"timestamp": datetime.utcnow().isoformat() + "Z",
}
return None
def disconnect(self):
if self.sock:
self.sock.close()
# Usage
# chat = TwitchChatScraper(oauth_token="YOUR_OAUTH", nickname="your_bot_name")
# chat.connect()
# messages = chat.collect_messages("pokimane", duration_seconds=60)
# print(json.dumps(messages[:5], indent=2))
# chat.disconnect()
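Once messages are collected, a quick frequency pass turns raw chat into engagement signals. A minimal sketch over the output of collect_messages above (the summarize_chat helper is our own addition):

from collections import Counter

def summarize_chat(messages, duration_seconds):
    """Basic engagement stats from collected chat messages."""
    users = Counter(m["username"] for m in messages)
    return {
        "total_messages": len(messages),
        "unique_chatters": len(users),
        "messages_per_minute": len(messages) / max(duration_seconds / 60, 1),
        "top_chatters": users.most_common(5),
    }

# print(summarize_chat(messages, duration_seconds=60))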
Handling Twitch Rate Limits

API Rate Limits

- 800 requests per minute for app access tokens (the default bucket; most endpoints cost one point per request)
- Helix requires an OAuth token on every request; calls without one are rejected with 401
- Rate limit headers: Ratelimit-Remaining, Ratelimit-Reset
IRC Chat Limits
- 20 messages per 30 seconds for standard accounts
- 100 messages per 30 seconds for known/verified bots
- Join rate: 20 joins per 10 seconds
The helper below reads these headers and backs off when the remaining request budget runs low:

import time

def respect_rate_limit(response):
remaining = int(response.headers.get("Ratelimit-Remaining", 1))
if remaining < 5:
reset = int(response.headers.get("Ratelimit-Reset", time.time() + 60))
wait_time = max(reset - time.time(), 1)
        time.sleep(wait_time)
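Wiring the helper into a polling loop might look like this (a sketch assuming the scraper instance from Method 1):

while True:
    response = requests.get(f"{scraper.base_url}/streams",
                            headers=scraper._get_headers(),
                            params={"first": 100, "language": "en"})
    respect_rate_limit(response)
    streams = response.json().get("data", [])
    print(f"Polled {len(streams)} live streams")
    time.sleep(60)  # poll once a minute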
Proxy Recommendations

| Method | Proxy Needed | Best Type |
|---|---|---|
| Helix API | For high volume | Datacenter |
| IRC Chat | Rarely needed | Any |
| Web scraping | Yes | Residential |
Twitch’s API is well-designed for programmatic access. Proxies are mainly needed for high-volume API requests. Visit our proxy setup guides for configuration details.
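If you do route API calls through a proxy, the TwitchAPIScraper above already accepts one; the proxy URL below is a placeholder:

scraper = TwitchAPIScraper(
    client_id="YOUR_CLIENT_ID",
    client_secret="YOUR_CLIENT_SECRET",
    proxy_url="http://username:password@proxy.example.com:8080",  # placeholder
)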
Legal Considerations
- Twitch Developer Agreement: API usage must comply with Twitch’s developer policies.
- Chat Data: Chat messages may contain personal data subject to privacy laws.
- Content Copyright: VODs, clips, and stream content are copyrighted by creators.
- Rate Limits: Exceeding API limits may result in temporary or permanent bans.
- Commercial Use: Check Twitch’s terms for commercial data usage restrictions.
See our web scraping compliance guide.
Rate Limiting Best Practices
- API: Stay under 800 requests/minute
- Chat: Join max 20 channels per 10 seconds
- Pagination: Use cursors for efficient data retrieval
- Caching: Cache user/game data to reduce redundant requests (see the sketch below)
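User and game records change slowly, so a small TTL cache avoids re-fetching them on every poll. A minimal sketch (TTLCache and get_user_cached are our own helpers, not part of any Twitch library):

import time

class TTLCache:
    """Tiny in-memory cache with per-entry expiry."""
    def __init__(self, ttl_seconds=3600):
        self.ttl = ttl_seconds
        self.store = {}

    def get(self, key):
        entry = self.store.get(key)
        if entry and time.time() - entry[1] < self.ttl:
            return entry[0]
        return None

    def set(self, key, value):
        self.store[key] = (value, time.time())

user_cache = TTLCache(ttl_seconds=3600)

def get_user_cached(scraper, username):
    """Check the cache before hitting /users."""
    cached = user_cache.get(username)
    if cached:
        return cached
    users = scraper.get_user_info([username])
    if users:
        user_cache.set(username, users[0])
        return users[0]
    return None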
Advanced Techniques
Handling Pagination

Helix list endpoints paginate with cursors rather than page numbers: each response carries a pagination.cursor value that you pass back as the after parameter on the next request. Implement robust pagination handling (reusing the scraper and imports from Method 1):

def get_streams_paginated(scraper, game_id=None, max_pages=20):
    """Follow Helix pagination cursors across pages of live streams."""
    all_streams, cursor = [], None
    for page in range(1, max_pages + 1):
        params = {"first": 100}
        if game_id:
            params["game_id"] = game_id
        if cursor:
            params["after"] = cursor
        response = requests.get(f"{scraper.base_url}/streams",
                                headers=scraper._get_headers(), params=params,
                                proxies=scraper._get_proxies())
        data = response.json()
        batch = data.get("data", [])
        if not batch:
            break
        all_streams.extend(batch)
        print(f"Page {page}: {len(batch)} streams (total: {len(all_streams)})")
        cursor = data.get("pagination", {}).get("cursor")
        if not cursor:
            break
        time.sleep(1)  # stay well under the rate limit
    return all_streams

Data Validation and Cleaning
Always validate scraped data before storage:
def validate_data(item):
required_fields = ["title", "url"]
for field in required_fields:
if not item.get(field):
return False
return True
import html
import re

def clean_text(text):
    if not text:
        return None
    # Collapse extra whitespace, then decode HTML entities
    text = re.sub(r'\s+', ' ', text).strip()
    return html.unescape(text)
# Apply to results
cleaned = [item for item in results if validate_data(item)]
for item in cleaned:
item["title"] = clean_text(item.get("title"))Monitoring and Alerting
Build monitoring into your scraping pipeline:
import logging
from datetime import datetime
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
class ScrapingMonitor:
def __init__(self):
self.start_time = datetime.now()
self.requests = 0
self.errors = 0
self.items = 0
def log_request(self, success=True):
self.requests += 1
if not success:
self.errors += 1
if self.requests % 50 == 0:
            elapsed = (datetime.now() - self.start_time).total_seconds()
            rate = self.requests / max(elapsed, 1) * 60
logger.info(f"Requests: {self.requests}, Errors: {self.errors}, "
f"Items: {self.items}, Rate: {rate:.1f}/min")
def log_item(self, count=1):
        self.items += count
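A sketch of wiring the monitor around Helix lookups (channel_list is a placeholder for your own list of logins):

monitor = ScrapingMonitor()
for username in channel_list:  # channel_list: placeholder list of logins
    try:
        users = scraper.get_user_info([username])
        monitor.log_request(success=True)
        monitor.log_item(len(users))
    except Exception:
        monitor.log_request(success=False)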
Error Handling and Retry Logic

Implement robust error handling:
import time
from requests.exceptions import RequestException
def retry_request(func, max_retries=3, base_delay=5):
for attempt in range(max_retries):
try:
return func()
except RequestException as e:
if attempt == max_retries - 1:
raise
delay = base_delay * (2 ** attempt)
print(f"Attempt {attempt + 1} failed: {e}. Retrying in {delay}s...")
time.sleep(delay)
    return None
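Any Helix call can then be wrapped in a lambda; for example, with the scraper from Method 1:

streams = retry_request(lambda: scraper.get_streams(first=100), max_retries=3)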
Data Storage Options

Choose the right storage for your scraping volume:
import json
import csv
import sqlite3
class DataStorage:
def __init__(self, db_path="scraped_data.db"):
self.conn = sqlite3.connect(db_path)
self.conn.execute('''CREATE TABLE IF NOT EXISTS items
(id TEXT PRIMARY KEY, title TEXT, url TEXT, data JSON, scraped_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP)''')
def save(self, item):
self.conn.execute(
"INSERT OR REPLACE INTO items (id, title, url, data) VALUES (?, ?, ?, ?)",
(item.get("id"), item.get("title"), item.get("url"), json.dumps(item))
)
self.conn.commit()
def export_json(self, output_path):
cursor = self.conn.execute("SELECT data FROM items")
items = [json.loads(row[0]) for row in cursor.fetchall()]
with open(output_path, "w") as f:
json.dump(items, f, indent=2)
def export_csv(self, output_path):
cursor = self.conn.execute("SELECT * FROM items")
rows = cursor.fetchall()
with open(output_path, "w", newline="") as f:
writer = csv.writer(f)
writer.writerow(["id", "title", "url", "data", "scraped_at"])
            writer.writerows(rows)
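Feeding Helix stream results into the store might look like this (deriving a channel URL from user_name is our own convention, not a field Twitch returns):

storage = DataStorage("twitch_streams.db")
for stream in scraper.get_streams(first=100):
    storage.save({
        "id": stream["id"],
        "title": stream["title"],
        "url": f"https://www.twitch.tv/{stream['user_name']}",
        "viewer_count": stream["viewer_count"],
    })
storage.export_json("streams.json")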
Frequently Asked Questions

How often should I scrape data?
The optimal frequency depends on how often the source data changes. For real-time data (stock prices, news), scrape every few minutes. For product listings, daily or weekly is usually sufficient. For reviews, weekly scraping captures new feedback without excessive load.
What happens if my IP gets blocked?
If you receive 403 or 429 status codes, your IP is likely blocked. Switch to a different proxy, implement exponential backoff, and slow your request rate. Rotating residential proxies automatically switch IPs to prevent blocks.
Should I use headless browsers or HTTP requests?
Use HTTP requests (with BeautifulSoup or similar) whenever possible; they are faster and use fewer resources. Switch to headless browsers (Selenium, Playwright) only when JavaScript rendering is required for the data you need.
How do I handle CAPTCHAs?
CAPTCHAs indicate aggressive bot detection. To minimize them: use residential or mobile proxies, implement realistic delays, rotate user agents, and maintain consistent session behavior. For persistent CAPTCHAs, consider CAPTCHA-solving services as a last resort.
Can I scrape data commercially?
The legality of commercial scraping depends on the platform’s ToS, the type of data collected, and your jurisdiction. Public data is generally more permissible, but always consult legal counsel for commercial use cases. See our compliance guide.
Conclusion
Twitch’s Helix API and IRC interface provide structured, well-documented access to streaming data. Start with API endpoints for stream and user data, and use IRC for real-time chat collection.
For more streaming data guides, visit dataresearchtools.com and our social media proxy guide.
Related Reading
- How to Scrape AliExpress Product Data
- How to Scrape Amazon Product Reviews in 2026
- aiohttp + BeautifulSoup: Async Python Scraping
- How Anti-Bot Systems Detect Scrapers (Cloudflare, Akamai, PerimeterX)
- API vs Web Scraping: When You Need Proxies (and When You Don’t)
- ASEAN Data Protection Laws: A Web Scraping Compliance Matrix