Cookbook
Real-world recipes for common scraping tasks.
Scrape a List of URLs
import easyscrape as es
urls = [
"https://example.com/page1",
"https://example.com/page2",
"https://example.com/page3",
]
results = []
for url in urls:
result = es.scrape(url)
results.append({
"url": url,
"title": result.title(),
"status": result.status_code,
})
Extract Table Data
result = es.scrape("https://example.com/data-table")
# Get all table rows
rows = result.css_all("table tbody tr")
# Parse each row
data = []
for row in rows:
cells = row.css_all("td")
data.append({
"name": cells[0] if cells else "",
"value": cells[1] if len(cells) > 1 else "",
})
Handle Pagination
import easyscrape as es
base_url = "https://example.com/products?page="
all_products = []
for page in range(1, 11): # Pages 1-10
result = es.scrape(f"{base_url}{page}")
products = result.css_all("div.product h3")
if not products:
break # No more pages
all_products.extend(products)
print(f"Found {len(all_products)} products")
Save Results to JSON
import json
import easyscrape as es
result = es.scrape("https://api.example.com/data")
data = result.json()
with open("output.json", "w") as f:
json.dump(data, f, indent=2)
Save Results to CSV
import csv
import easyscrape as es
result = es.scrape("https://example.com/products")
products = result.css_all("div.product")
with open("products.csv", "w", newline="") as f:
writer = csv.writer(f)
writer.writerow(["Name", "Price"])
for product in products:
name = product.css("h3")
price = product.css(".price")
writer.writerow([name, price])
Retry with Backoff
import time
import easyscrape as es
def scrape_with_backoff(url: str, max_retries: int = 3):
for attempt in range(max_retries):
try:
return es.scrape(url)
except es.ScrapeError:
if attempt < max_retries - 1:
wait = 2 ** attempt # 1, 2, 4 seconds
time.sleep(wait)
else:
raise
Check robots.txt
from urllib.robotparser import RobotFileParser
import easyscrape as es
def can_scrape(url: str) -> bool:
from urllib.parse import urlparse
parsed = urlparse(url)
robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
rp = RobotFileParser()
rp.set_url(robots_url)
rp.read()
return rp.can_fetch("*", url)
# Check before scraping
url = "https://example.com/page"
if can_scrape(url):
result = es.scrape(url)
Session Persistence
import easyscrape as es
# Create a session for multiple requests
with es.Session() as session:
# Login
session.scrape(
"https://example.com/login",
method="POST",
data={"user": "me", "pass": "secret"}
)
# Subsequent requests maintain cookies
dashboard = session.scrape("https://example.com/dashboard")
profile = session.scrape("https://example.com/profile")
Download Files
import easyscrape as es
result = es.scrape("https://example.com/document.pdf")
with open("document.pdf", "wb") as f:
f.write(result.content)
Extract Metadata
import easyscrape as es
result = es.scrape("https://example.com/article")
metadata = {
"title": result.css("meta[property='og:title']", attr="content"),
"description": result.css("meta[name='description']", attr="content"),
"author": result.css("meta[name='author']", attr="content"),
"published": result.css("time", attr="datetime"),
}