Official Python SDK for the ScrapeGraphAI API.
pip install scrapegraph-py
# or
uv add scrapegraph-py

from scrapegraph_py import ScrapeGraphAI, ScrapeRequest
# reads SGAI_API_KEY from env, or pass explicitly: ScrapeGraphAI(api_key="...")
sgai = ScrapeGraphAI()
result = sgai.scrape(ScrapeRequest(
url="https://example.com",
))
if result.status == "success":
print(result.data["results"]["markdown"]["data"])
else:
print(result.error)

Every method returns ApiResult[T] — no exceptions to catch:
@dataclass
class ApiResult(Generic[T]):
status: Literal["success", "error"]
data: T | None
error: str | None
elapsed_ms: int

Scrape a webpage in multiple formats (markdown, html, screenshot, json, etc).
from scrapegraph_py import (
ScrapeGraphAI, ScrapeRequest, FetchConfig,
MarkdownFormatConfig, ScreenshotFormatConfig, JsonFormatConfig
)
sgai = ScrapeGraphAI()
res = sgai.scrape(ScrapeRequest(
url="https://example.com",
formats=[
MarkdownFormatConfig(mode="reader"),
ScreenshotFormatConfig(full_page=True, width=1440, height=900),
JsonFormatConfig(prompt="Extract product info"),
],
content_type="text/html", # optional, auto-detected
fetch_config=FetchConfig( # optional
mode="js", # "auto" | "fast" | "js"
stealth=True,
timeout=30000,
wait=2000,
scrolls=3,
headers={"Accept-Language": "en"},
cookies={"session": "abc"},
country="us",
),
))

Formats:

- markdown — Clean markdown (modes: normal, reader, prune)
- html — Raw HTML (modes: normal, reader, prune)
- links — All links on the page
- images — All image URLs
- summary — AI-generated summary
- json — Structured extraction with prompt/schema
- branding — Brand colors, typography, logos
- screenshot — Page screenshot (full_page, width, height, quality)
Extract structured data from a URL, HTML, or markdown using AI.
from scrapegraph_py import ScrapeGraphAI, ExtractRequest
sgai = ScrapeGraphAI()
res = sgai.extract(ExtractRequest(
url="https://example.com",
prompt="Extract product names and prices",
schema={"type": "object", "properties": {...}}, # optional
mode="reader", # optional
fetch_config=FetchConfig(...), # optional
))
# Or pass html/markdown directly instead of url

Search the web and optionally extract structured data.
from scrapegraph_py import ScrapeGraphAI, SearchRequest
sgai = ScrapeGraphAI()
res = sgai.search(SearchRequest(
query="best programming languages 2024",
num_results=5, # 1-20, default 3
format="markdown", # "markdown" | "html"
prompt="Extract key points", # optional, for AI extraction
schema={...}, # optional
time_range="past_week", # optional
location_geo_code="us", # optional
fetch_config=FetchConfig(...), # optional
))

Crawl a website and its linked pages.
from scrapegraph_py import ScrapeGraphAI, CrawlRequest, MarkdownFormatConfig
sgai = ScrapeGraphAI()
# Start a crawl
start = sgai.crawl.start(CrawlRequest(
url="https://example.com",
formats=[MarkdownFormatConfig()],
max_pages=50,
max_depth=2,
max_links_per_page=10,
include_patterns=["/blog/*"],
exclude_patterns=["/admin/*"],
fetch_config=FetchConfig(...),
))
# Check status
status = sgai.crawl.get(start.data["id"])
# Control
sgai.crawl.stop(crawl_id)
sgai.crawl.resume(crawl_id)
sgai.crawl.delete(crawl_id)

Monitor a webpage for changes on a schedule.
from scrapegraph_py import ScrapeGraphAI, MonitorCreateRequest, MarkdownFormatConfig
sgai = ScrapeGraphAI()
# Create a monitor
mon = sgai.monitor.create(MonitorCreateRequest(
url="https://example.com",
name="Price Monitor",
interval="0 * * * *", # cron expression
formats=[MarkdownFormatConfig()],
webhook_url="https://...", # optional
fetch_config=FetchConfig(...),
))
# Manage monitors
sgai.monitor.list()
sgai.monitor.get(cron_id)
sgai.monitor.update(cron_id, MonitorUpdateRequest(interval="0 */6 * * *"))
sgai.monitor.pause(cron_id)
sgai.monitor.resume(cron_id)
sgai.monitor.delete(cron_id)

Fetch request history.
from scrapegraph_py import ScrapeGraphAI, HistoryFilter
sgai = ScrapeGraphAI()
history = sgai.history.list(HistoryFilter(
service="scrape", # optional filter
page=1,
limit=20,
))
entry = sgai.history.get("request-id")

from scrapegraph_py import ScrapeGraphAI
sgai = ScrapeGraphAI()
credits = sgai.credits()
# { remaining: 1000, used: 500, plan: "pro", jobs: { crawl: {...}, monitor: {...} } }
health = sgai.health()
# { status: "ok", uptime: 12345 }

All methods have async equivalents via AsyncScrapeGraphAI:
import asyncio
from scrapegraph_py import AsyncScrapeGraphAI, ScrapeRequest
async def main():
async with AsyncScrapeGraphAI() as sgai:
result = await sgai.scrape(ScrapeRequest(url="https://example.com"))
if result.status == "success":
print(result.data["results"]["markdown"]["data"])
else:
print(result.error)
asyncio.run(main())

async with AsyncScrapeGraphAI() as sgai:
res = await sgai.extract(ExtractRequest(
url="https://example.com",
prompt="Extract product names and prices",
))

async with AsyncScrapeGraphAI() as sgai:
res = await sgai.search(SearchRequest(
query="best programming languages 2024",
num_results=5,
))

async with AsyncScrapeGraphAI() as sgai:
start = await sgai.crawl.start(CrawlRequest(
url="https://example.com",
max_pages=50,
))
status = await sgai.crawl.get(start.data["id"])

async with AsyncScrapeGraphAI() as sgai:
mon = await sgai.monitor.create(MonitorCreateRequest(
url="https://example.com",
name="Price Monitor",
interval="0 * * * *",
))

| Service | Example | Description |
|---|---|---|
| scrape | scrape_basic.py | Basic markdown scraping |
| scrape | scrape_multi_format.py | Multiple formats |
| scrape | scrape_json_extraction.py | Structured JSON extraction |
| scrape | scrape_pdf.py | PDF document parsing |
| scrape | scrape_with_fetchconfig.py | JS rendering, stealth mode |
| extract | extract_basic.py | AI data extraction |
| extract | extract_with_schema.py | Extraction with JSON schema |
| search | search_basic.py | Web search |
| search | search_with_extraction.py | Search + AI extraction |
| crawl | crawl_basic.py | Start and monitor a crawl |
| crawl | crawl_with_formats.py | Crawl with formats |
| monitor | monitor_basic.py | Create a page monitor |
| monitor | monitor_with_webhook.py | Monitor with webhook |
| utilities | credits.py | Check credits and limits |
| utilities | health.py | API health check |
| utilities | history.py | Request history |
| Service | Example | Description |
|---|---|---|
| scrape | scrape_basic_async.py | Basic markdown scraping |
| scrape | scrape_multi_format_async.py | Multiple formats |
| scrape | scrape_json_extraction_async.py | Structured JSON extraction |
| scrape | scrape_pdf_async.py | PDF document parsing |
| scrape | scrape_with_fetchconfig_async.py | JS rendering, stealth mode |
| extract | extract_basic_async.py | AI data extraction |
| extract | extract_with_schema_async.py | Extraction with JSON schema |
| search | search_basic_async.py | Web search |
| search | search_with_extraction_async.py | Search + AI extraction |
| crawl | crawl_basic_async.py | Start and monitor a crawl |
| crawl | crawl_with_formats_async.py | Crawl with formats |
| monitor | monitor_basic_async.py | Create a page monitor |
| monitor | monitor_with_webhook_async.py | Monitor with webhook |
| utilities | credits_async.py | Check credits and limits |
| utilities | health_async.py | API health check |
| utilities | history_async.py | Request history |
| Variable | Description | Default |
|---|---|---|
| SGAI_API_KEY | Your ScrapeGraphAI API key | — |
| SGAI_API_URL | Override API base URL | https://api.scrapegraphai.com/api/v2 |
| SGAI_DEBUG | Enable debug logging ("1") | off |
| SGAI_TIMEOUT | Request timeout in seconds | 120 |
uv sync
uv run pytest tests/ # unit tests
uv run pytest tests/test_integration.py # live API tests (requires SGAI_API_KEY)
uv run ruff check . # lint

MIT - ScrapeGraphAI