From 906c1e60a8080cb1ee395b083869b039d0cc1ec0 Mon Sep 17 00:00:00 2001
From: Marco Vinciguerra
Date: Mon, 16 Feb 2026 12:19:11 +0100
Subject: [PATCH] feat: add wait_ms parameter to Crawler endpoint

Allows callers to configure rendering wait time per page during crawling,
matching the API-side change in sgai-api#402.

Co-Authored-By: Claude Opus 4.6
---
 scrapegraph-py/scrapegraph_py/async_client.py | 6 ++++++
 scrapegraph-py/scrapegraph_py/client.py       | 6 ++++++
 scrapegraph-py/scrapegraph_py/models/crawl.py | 5 +++++
 3 files changed, 17 insertions(+)

diff --git a/scrapegraph-py/scrapegraph_py/async_client.py b/scrapegraph-py/scrapegraph_py/async_client.py
index d2d5166..5111849 100644
--- a/scrapegraph-py/scrapegraph_py/async_client.py
+++ b/scrapegraph-py/scrapegraph_py/async_client.py
@@ -884,6 +884,7 @@ async def crawl(
         include_paths: Optional[list[str]] = None,
         exclude_paths: Optional[list[str]] = None,
         webhook_url: Optional[str] = None,
+        wait_ms: Optional[int] = None,
         return_toon: bool = False,
     ):
         """Send a crawl request with support for both AI extraction and
@@ -911,6 +912,7 @@
             exclude_paths: List of path patterns to exclude (e.g., ['/admin/*', '/api/*'])
                 Supports wildcards and takes precedence over include_paths
             webhook_url: URL to receive webhook notifications when the crawl completes
+            wait_ms: Number of milliseconds to wait before scraping each page
             return_toon: If True, return response in TOON format (reduces token usage by 30-60%)
         """
         logger.info("🔍 Starting crawl request")
@@ -944,6 +946,8 @@
             logger.debug(f"❌ Exclude paths: {exclude_paths}")
         if webhook_url:
             logger.debug(f"🔔 Webhook URL: {webhook_url}")
+        if wait_ms is not None:
+            logger.debug(f"⏱️ Wait ms: {wait_ms}")
         if return_toon:
             logger.debug("🎨 TOON format output enabled")
@@ -977,6 +981,8 @@
             request_data["exclude_paths"] = exclude_paths
         if webhook_url is not None:
             request_data["webhook_url"] = webhook_url
+        if wait_ms is not None:
+            request_data["wait_ms"] = wait_ms
         request = CrawlRequest(**request_data)
         logger.debug("✅ Request validation passed")
diff --git a/scrapegraph-py/scrapegraph_py/client.py b/scrapegraph-py/scrapegraph_py/client.py
index 1333342..28fc1bf 100644
--- a/scrapegraph-py/scrapegraph_py/client.py
+++ b/scrapegraph-py/scrapegraph_py/client.py
@@ -894,6 +894,7 @@ def crawl(
         include_paths: Optional[list[str]] = None,
         exclude_paths: Optional[list[str]] = None,
         webhook_url: Optional[str] = None,
+        wait_ms: Optional[int] = None,
         return_toon: bool = False,
     ):
         """Send a crawl request with support for both AI extraction and
@@ -921,6 +922,7 @@
             exclude_paths: List of path patterns to exclude (e.g., ['/admin/*', '/api/*'])
                 Supports wildcards and takes precedence over include_paths
             webhook_url: URL to receive webhook notifications when the crawl completes
+            wait_ms: Number of milliseconds to wait before scraping each page
             return_toon: If True, return response in TOON format (reduces token usage by 30-60%)
         """
         logger.info("🔍 Starting crawl request")
@@ -954,6 +956,8 @@
             logger.debug(f"❌ Exclude paths: {exclude_paths}")
         if webhook_url:
             logger.debug(f"🔔 Webhook URL: {webhook_url}")
+        if wait_ms is not None:
+            logger.debug(f"⏱️ Wait ms: {wait_ms}")
         if return_toon:
             logger.debug("🎨 TOON format output enabled")
@@ -987,6 +991,8 @@
             request_data["exclude_paths"] = exclude_paths
         if webhook_url is not None:
             request_data["webhook_url"] = webhook_url
+        if wait_ms is not None:
+            request_data["wait_ms"] = wait_ms
         request = CrawlRequest(**request_data)
         logger.debug("✅ Request validation passed")
diff --git a/scrapegraph-py/scrapegraph_py/models/crawl.py b/scrapegraph-py/scrapegraph_py/models/crawl.py
index e515b80..dd6cca9 100644
--- a/scrapegraph-py/scrapegraph_py/models/crawl.py
+++ b/scrapegraph-py/scrapegraph_py/models/crawl.py
@@ -108,6 +108,11 @@ class CrawlRequest(BaseModel):
         "The webhook will receive a POST request with the crawl results.",
         example="https://example.com/webhook"
     )
+    wait_ms: Optional[int] = Field(
+        default=None,
+        description="Number of milliseconds to wait before scraping each page. "
+        "Useful for pages with heavy JavaScript rendering that need extra time to load.",
+    )
 
     @model_validator(mode="after")
     def validate_url(self) -> "CrawlRequest":
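
Reviewer note: a minimal usage sketch of the new parameter, not part of the patch
itself. The Client import reflects this repository's package name, but the API key,
target URL, and prompt below are placeholders, and the exact crawl() positional
arguments are assumed rather than shown in this diff:

    from scrapegraph_py import Client

    # Placeholder credentials and target; replace with real values.
    client = Client(api_key="sgai-...")

    # Wait 2000 ms before scraping each page so JavaScript-heavy pages have
    # time to render. Omitting wait_ms keeps the previous behaviour, since the
    # new field defaults to None and is then left out of the request payload.
    result = client.crawl(
        url="https://example.com",
        prompt="Extract the page titles and main headings",
        wait_ms=2000,
    )
    print(result)

The async client takes the same keyword argument, so `await client.crawl(..., wait_ms=2000)`
works identically with AsyncClient.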