From b9e73f70d6828ed54fdf9a54eaf6cb3b4f49ce24 Mon Sep 17 00:00:00 2001 From: Cosmin Staicu Date: Thu, 5 Mar 2026 09:48:36 +0200 Subject: [PATCH] sample: E-commerce Product Scraper Agent --- samples/ecommerce-scraper-agent/agent.mermaid | 11 + samples/ecommerce-scraper-agent/bindings.json | 4 + .../evaluations/eval-sets/edge-cases.json | 57 ++ .../evaluations/eval-sets/happy-path.json | 39 ++ .../eval-sets/output-structure.json | 54 ++ .../evaluators/json-similarity.json | 9 + .../evaluators/llm-judge-output.json | 10 + .../evaluations/evaluators/trajectory.json | 10 + .../ecommerce-scraper-agent/langgraph.json | 5 + samples/ecommerce-scraper-agent/main.py | 611 ++++++++++++++++++ .../ecommerce-scraper-agent/pyproject.toml | 33 + samples/ecommerce-scraper-agent/uipath.json | 15 + 12 files changed, 858 insertions(+) create mode 100644 samples/ecommerce-scraper-agent/agent.mermaid create mode 100644 samples/ecommerce-scraper-agent/bindings.json create mode 100644 samples/ecommerce-scraper-agent/evaluations/eval-sets/edge-cases.json create mode 100644 samples/ecommerce-scraper-agent/evaluations/eval-sets/happy-path.json create mode 100644 samples/ecommerce-scraper-agent/evaluations/eval-sets/output-structure.json create mode 100644 samples/ecommerce-scraper-agent/evaluations/evaluators/json-similarity.json create mode 100644 samples/ecommerce-scraper-agent/evaluations/evaluators/llm-judge-output.json create mode 100644 samples/ecommerce-scraper-agent/evaluations/evaluators/trajectory.json create mode 100644 samples/ecommerce-scraper-agent/langgraph.json create mode 100644 samples/ecommerce-scraper-agent/main.py create mode 100644 samples/ecommerce-scraper-agent/pyproject.toml create mode 100644 samples/ecommerce-scraper-agent/uipath.json diff --git a/samples/ecommerce-scraper-agent/agent.mermaid b/samples/ecommerce-scraper-agent/agent.mermaid new file mode 100644 index 000000000..30967f521 --- /dev/null +++ b/samples/ecommerce-scraper-agent/agent.mermaid @@ -0,0 +1,11 
@@ +flowchart TB + __start__(__start__) + coordinator(coordinator) + scraper(scraper) + finalize(finalize) + __end__(__end__) + __start__ --> coordinator + coordinator --> finalize + coordinator --> scraper + scraper --> coordinator + finalize --> __end__ diff --git a/samples/ecommerce-scraper-agent/bindings.json b/samples/ecommerce-scraper-agent/bindings.json new file mode 100644 index 000000000..5dd5a0fd8 --- /dev/null +++ b/samples/ecommerce-scraper-agent/bindings.json @@ -0,0 +1,4 @@ +{ + "version": "2.0", + "resources": [] +} \ No newline at end of file diff --git a/samples/ecommerce-scraper-agent/evaluations/eval-sets/edge-cases.json b/samples/ecommerce-scraper-agent/evaluations/eval-sets/edge-cases.json new file mode 100644 index 000000000..009eb889e --- /dev/null +++ b/samples/ecommerce-scraper-agent/evaluations/eval-sets/edge-cases.json @@ -0,0 +1,57 @@ +{ + "version": "1.0", + "id": "edge-cases", + "name": "Edge Cases and Error Scenarios", + "description": "Tests for unusual inputs, single pages, and boundary conditions", + "evaluatorRefs": ["JsonSimilarityEvaluator", "LLMJudgeOutputEvaluator", "TrajectoryEvaluator"], + "evaluations": [ + { + "id": "test-1-single-product-page", + "name": "Starting from a single product page extracts that product", + "inputs": { + "start_url": "https://sandbox.oxylabs.io/products/1" + }, + "evaluationCriterias": { + "LLMJudgeOutputEvaluator": { + "expectedOutput": { + "products": "Should contain at least one product with name, price, currency, and url fields. The product URL should be https://sandbox.oxylabs.io/products/1 or related products discovered from that page.", + "total_products": "At least 1 product should be found" + } + }, + "TrajectoryEvaluator": { + "expectedAgentBehavior": "The agent should: 1) Start with the single product URL. 2) Fetch the page and classify it as a 'product' page via LLM analysis. 3) Extract product data fields (name, price, description, etc.) using CSS selectors from the strategy. 
4) Discover any related product links or navigation links on the product page. 5) Optionally follow discovered links to scrape more products. 6) Finalize with at least one product." + } + } + }, + { + "id": "test-2-last-pagination-page", + "name": "Scraping the last page of pagination works correctly", + "inputs": { + "start_url": "https://sandbox.oxylabs.io/products?page=94" + }, + "evaluationCriterias": { + "LLMJudgeOutputEvaluator": { + "expectedOutput": { + "products": "Should contain products from the last page. Each product should have valid name, price, and url fields.", + "total_products": "Should be a positive number, likely around 20-32 products from the last page and any discovered linked pages" + } + } + } + }, + { + "id": "test-3-nonexistent-page", + "name": "Handling a page that returns no products", + "inputs": { + "start_url": "https://sandbox.oxylabs.io/products?page=9999" + }, + "evaluationCriterias": { + "JsonSimilarityEvaluator": { + "expectedOutput": { + "total_products": 0, + "urls_scraped": 0 + } + } + } + } + ] +} diff --git a/samples/ecommerce-scraper-agent/evaluations/eval-sets/happy-path.json b/samples/ecommerce-scraper-agent/evaluations/eval-sets/happy-path.json new file mode 100644 index 000000000..1ad41ca3d --- /dev/null +++ b/samples/ecommerce-scraper-agent/evaluations/eval-sets/happy-path.json @@ -0,0 +1,39 @@ +{ + "version": "1.0", + "id": "happy-path", + "name": "Happy Path Scenarios", + "description": "Tests for normal scraping operations with the oxylabs sandbox site", + "evaluatorRefs": ["JsonSimilarityEvaluator", "TrajectoryEvaluator"], + "evaluations": [ + { + "id": "test-1-default-url-products-found", + "name": "Scrape default URL returns products with expected fields", + "inputs": { + "start_url": "https://sandbox.oxylabs.io/products" + }, + "evaluationCriterias": { + "JsonSimilarityEvaluator": { + "expectedOutput": { + "total_products": 2993, + "urls_scraped": 3301 + } + }, + "TrajectoryEvaluator": { + 
"expectedAgentBehavior": "The agent should: 1) Start at the coordinator node which seeds the start URL. 2) Dispatch URLs to parallel scraper sub-agents. 3) Scrapers fetch pages, call the LLM to analyze page type (listing vs product), and extract product links from listing pages and product data from product pages. 4) Return discovered URLs to coordinator for further rounds. 5) Continue until no new URLs remain. 6) Finalize by deduplicating products and resolving currency symbols. The agent should scrape approximately 3000 products across multiple rounds of coordinator-scraper cycles." + } + } + }, + { + "id": "test-2-category-page", + "name": "Scrape a specific category page discovers products", + "inputs": { + "start_url": "https://sandbox.oxylabs.io/products/category/nintendo" + }, + "evaluationCriterias": { + "TrajectoryEvaluator": { + "expectedAgentBehavior": "The agent should: 1) Start at the coordinator with the Nintendo category URL. 2) Dispatch to scrapers which classify the page as a listing page. 3) Extract product links and pagination links from the category listing. 4) Visit individual product pages to extract product data (name, price, description, etc.). 5) Follow pagination to discover all products in the category. 6) Finalize with deduplicated products, all having currency resolved to ISO codes. The total products should be fewer than the full site (~1000 or less for a single category)." 
+ } + } + } + ] +} diff --git a/samples/ecommerce-scraper-agent/evaluations/eval-sets/output-structure.json b/samples/ecommerce-scraper-agent/evaluations/eval-sets/output-structure.json new file mode 100644 index 000000000..944660a98 --- /dev/null +++ b/samples/ecommerce-scraper-agent/evaluations/eval-sets/output-structure.json @@ -0,0 +1,54 @@ +{ + "version": "1.0", + "id": "output-structure", + "name": "Output Structure Validation", + "description": "Validates that scraped products have the expected fields and data quality", + "evaluatorRefs": ["LLMJudgeOutputEvaluator"], + "evaluations": [ + { + "id": "test-1-product-fields-present", + "name": "Products contain required fields (name, price, url, currency)", + "inputs": { + "start_url": "https://sandbox.oxylabs.io/products?page=1" + }, + "evaluationCriterias": { + "LLMJudgeOutputEvaluator": { + "expectedOutput": { + "products": "A list of product objects where each product has at minimum: 'url' (a valid URL to the product page), 'name' (non-empty product name), 'price' (a numeric price value), and 'currency' (ISO 4217 code like EUR). Products may also have description, availability, developer, platform, and type fields.", + "total_products": "A positive integer greater than 0 representing the number of unique products scraped", + "urls_scraped": "A positive integer representing the total number of URLs visited during scraping" + } + } + } + }, + { + "id": "test-2-currency-resolved", + "name": "Currency symbols are resolved to ISO 4217 codes", + "inputs": { + "start_url": "https://sandbox.oxylabs.io/products?page=2" + }, + "evaluationCriterias": { + "LLMJudgeOutputEvaluator": { + "expectedOutput": { + "products": "Products should have a 'currency' field containing a valid ISO 4217 three-letter code (e.g. 'EUR', 'USD', 'GBP') rather than a raw currency symbol (e.g. not '€' or '$'). The price field should be a numeric string without currency symbols." 
+ } + } + } + }, + { + "id": "test-3-no-duplicate-products", + "name": "No duplicate products in output", + "inputs": { + "start_url": "https://sandbox.oxylabs.io/products?page=3" + }, + "evaluationCriterias": { + "LLMJudgeOutputEvaluator": { + "expectedOutput": { + "products": "Each product in the list should have a unique URL. There should be no two products with the same 'url' field. The total_products count should match the length of the products list.", + "total_products": "Should exactly match the number of items in the products array" + } + } + } + } + ] +} diff --git a/samples/ecommerce-scraper-agent/evaluations/evaluators/json-similarity.json b/samples/ecommerce-scraper-agent/evaluations/evaluators/json-similarity.json new file mode 100644 index 000000000..ca6074ee0 --- /dev/null +++ b/samples/ecommerce-scraper-agent/evaluations/evaluators/json-similarity.json @@ -0,0 +1,9 @@ +{ + "version": "1.0", + "id": "JsonSimilarityEvaluator", + "evaluatorTypeId": "uipath-json-similarity", + "evaluatorConfig": { + "name": "JsonSimilarityEvaluator", + "targetOutputKey": "*" + } +} diff --git a/samples/ecommerce-scraper-agent/evaluations/evaluators/llm-judge-output.json b/samples/ecommerce-scraper-agent/evaluations/evaluators/llm-judge-output.json new file mode 100644 index 000000000..1268c14a4 --- /dev/null +++ b/samples/ecommerce-scraper-agent/evaluations/evaluators/llm-judge-output.json @@ -0,0 +1,10 @@ +{ + "version": "1.0", + "id": "LLMJudgeOutputEvaluator", + "evaluatorTypeId": "uipath-llm-judge-output-semantic-similarity", + "evaluatorConfig": { + "name": "LLMJudgeOutputEvaluator", + "model": "gpt-4o-2024-11-20", + "temperature": 0.0 + } +} diff --git a/samples/ecommerce-scraper-agent/evaluations/evaluators/trajectory.json b/samples/ecommerce-scraper-agent/evaluations/evaluators/trajectory.json new file mode 100644 index 000000000..96113bec9 --- /dev/null +++ b/samples/ecommerce-scraper-agent/evaluations/evaluators/trajectory.json @@ -0,0 +1,10 @@ +{ + 
"version": "1.0", + "id": "TrajectoryEvaluator", + "evaluatorTypeId": "uipath-llm-judge-trajectory-similarity", + "evaluatorConfig": { + "name": "LLMJudgeTrajectoryEvaluator", + "model": "gpt-4o-2024-11-20", + "temperature": 0.0 + } +} diff --git a/samples/ecommerce-scraper-agent/langgraph.json b/samples/ecommerce-scraper-agent/langgraph.json new file mode 100644 index 000000000..0ef52751d --- /dev/null +++ b/samples/ecommerce-scraper-agent/langgraph.json @@ -0,0 +1,5 @@ +{ + "graphs": { + "agent": "./main.py:graph" + } +} diff --git a/samples/ecommerce-scraper-agent/main.py b/samples/ecommerce-scraper-agent/main.py new file mode 100644 index 000000000..b2b0d3676 --- /dev/null +++ b/samples/ecommerce-scraper-agent/main.py @@ -0,0 +1,611 @@ +"""General E-commerce Scraper Agent + +A UiPath LangGraph agent that uses LLM analysis to dynamically determine +how to extract product data from any e-commerce site. + +Architecture: + - Coordinator node: manages the URL queue, dispatches batches to sub-agents. + - Scraper node (sub-agent): fetches pages, uses LLM to classify page type + and determine extraction strategy (CSS selectors), extracts product data + and navigation/product links. + - Finalize node: deduplicates products, resolves currency symbols. + - Sub-agents run in parallel per round via Send. + +Flow: + START -> coordinator -> [scrapers in parallel via Send] -> coordinator + -> ... (until no new URLs) ... 
async def _get_browser() -> Browser:
    """Return the shared headless Chromium instance, launching it on demand.

    The module-level ``_browser`` is reused while it reports being connected.
    Otherwise a new Playwright driver is started and Chromium is launched
    under ``_browser_lock`` so concurrent callers do not launch twice.

    Returns:
        The shared ``Browser`` instance.

    Raises:
        Exception: Whatever Playwright raises when the driver or Chromium
            cannot be started; the freshly started driver is stopped first.
    """
    global _browser
    async with _browser_lock:
        if _browser is None or not _browser.is_connected():
            pw = await async_playwright().start()
            try:
                _browser = await pw.chromium.launch(headless=True)
            except Exception:
                # Launch failed: stop the driver we just started so its
                # subprocess is not leaked on every retry.
                await pw.stop()
                raise
            print("[browser] Launched headless Chromium")
        return _browser
async def _fetch_page(client: httpx.AsyncClient, url: str) -> str:
    """Return the HTML for *url*, preferring plain HTTP over a real browser.

    The page is requested through ``client`` first. When the response body
    has too little visible text (per ``_page_looks_empty``) the page is
    fetched again with the headless browser and that HTML is returned.

    Raises:
        httpx.HTTPStatusError: If the HTTP response has an error status.
    """
    response = await client.get(url)
    response.raise_for_status()
    static_html = response.text

    # Fast path: the static response already carries enough content.
    if not _page_looks_empty(static_html):
        return static_html

    print(f"[fetch] JS-rendered page detected, using browser: {url}")
    return await _fetch_with_browser(url)
elements", + ) + product_fields_css: dict[str, str] = Field( + default_factory=dict, + description=( + "For product detail pages: mapping of field names " + "(e.g. 'name', 'price', 'description') to CSS selectors" + ), + ) + total_pages: int = Field( + default=0, + description=( + "For listing pages: total number of pages if discoverable " + "from the HTML or embedded JSON (e.g. __NEXT_DATA__). 0 if unknown." + ), + ) + pagination_url_template: str = Field( + default="", + description=( + "For listing pages: URL template for generating all page URLs, " + "with {page} as placeholder for the page number. " + "E.g. 'https://example.com/products?page={page}'. Empty if unknown." + ), + ) + + +_strategy_cache: dict[str, PageStrategy] = {} + + +def _url_pattern(url: str) -> str: + """Normalize a URL to a cacheable pattern for strategy reuse. + + Replaces ID-like path segments with {id} and keeps only query param keys. + """ + parsed = urlparse(url) + path = parsed.path.rstrip("/") or "/" + segments = path.split("/") + normalized = [] + for i, seg in enumerate(segments): + if seg and i > 0 and (re.match(r"^\d+$", seg) or len(seg) > 30): + normalized.append("{id}") + else: + normalized.append(seg) + pattern = "/".join(normalized) + if parsed.query: + params = sorted(re.findall(r"([^&=]+)=", parsed.query)) + if params: + pattern += "?" 
+ "&".join(f"{p}=" for p in params) + return f"{parsed.scheme}://{parsed.netloc}{pattern}" + + +def _clean_html_for_llm(html: str, max_chars: int = 20000) -> str: + """Clean and truncate HTML for LLM analysis.""" + soup = BeautifulSoup(html, "html.parser") + for tag in soup.find_all(["style", "link", "meta", "noscript", "svg", "img"]): + tag.decompose() + for script in soup.find_all("script"): + if script.get("id") != "__NEXT_DATA__": + script.decompose() + text = str(soup) + if len(text) > max_chars: + text = text[:max_chars] + "\n" + return text + + +async def _analyze_page(html: str, url: str) -> PageStrategy: + """Analyze a page with LLM to determine extraction strategy. + + Cached per URL pattern so the LLM is only called once per page type. + """ + pattern = _url_pattern(url) + if pattern in _strategy_cache: + print(f"[analyze] Cache hit for pattern: {pattern}") + return _strategy_cache[pattern] + + cleaned = _clean_html_for_llm(html) + sdk = UiPath() + + response = await sdk.llm.chat_completions( + messages=[ + { + "role": "system", + "content": ( + "You are a web scraping expert. Analyze the HTML and return a JSON object with:\n" + "- page_type: 'listing' if it shows multiple products, 'product' if single product detail, 'other' otherwise\n" + "- product_links_css: CSS selector for tags linking to individual product pages (empty string if none)\n" + "- navigation_links_css: CSS selector for pagination/category tags (empty string if none)\n" + "- product_fields_css: for product detail pages only, map field names (name, price, description, etc.) to CSS selectors that extract each field's text content (empty object for listing/other pages)\n" + "- total_pages: for listing pages, the total number of pages if you can find it in the HTML or embedded JSON like __NEXT_DATA__ (look for pageCount, totalPages, last page number in pagination, etc.). 
0 if unknown.\n" + "- pagination_url_template: for listing pages, the URL pattern for paginated pages with {page} as placeholder for the page number (e.g. 'https://example.com/products?page={page}'). Derive this from pagination links in the HTML. Empty string if unknown.\n\n" + "IMPORTANT: Look carefully at embedded JSON data (like