diff --git a/pyproject.toml b/pyproject.toml index 518d1af4..4532050c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -119,5 +119,5 @@ disallow_untyped_calls = true ignore_missing_imports = true [tool.poe.tasks] -pylint-local = "pylint scraperaphai/**/*.py" +pylint-local = "pylint scrapegraphai/**/*.py" pylint-ci = "pylint --disable=C0114,C0115,C0116 --exit-zero scrapegraphai/**/*.py" diff --git a/scrapegraphai/docloaders/chromium.py b/scrapegraphai/docloaders/chromium.py index d8cd00ae..e69b5b9d 100644 --- a/scrapegraphai/docloaders/chromium.py +++ b/scrapegraphai/docloaders/chromium.py @@ -68,10 +68,10 @@ def __init__( self.load_state = load_state self.requires_js_support = requires_js_support self.storage_state = storage_state - self.backend = kwargs.get("backend", backend) - self.browser_name = kwargs.get("browser_name", browser_name) - self.retry_limit = kwargs.get("retry_limit", retry_limit) - self.timeout = kwargs.get("timeout", timeout) + self.backend = backend + self.browser_name = browser_name + self.retry_limit = retry_limit + self.timeout = timeout async def scrape(self, url: str) -> str: if self.backend == "playwright": @@ -159,7 +159,8 @@ async def ascrape_undetected_chromedriver(self, url: str) -> str: f"Error: Network error after {self.retry_limit} attempts - {e}" ) finally: - driver.quit() + if "driver" in dir(): + driver.quit() return results @@ -206,7 +207,7 @@ async def ascrape_playwright_scroll( # https://www.steelwood.amsterdam/. The site deos not scroll to the bottom. # In my browser I can scroll vertically but in Chromium it scrolls horizontally?!? - if timeout and timeout <= 0: + if timeout is not None and timeout <= 0: raise ValueError( "If set, timeout value for scrolling scraper must be greater than 0." ) @@ -316,7 +317,8 @@ async def ascrape_playwright_scroll( f"Error: Network error after {self.retry_limit} attempts - {e}" ) finally: - await browser.close() + if browser is not None: + await browser.close() return results @@ -434,7 +436,19 @@ async def ascrape_with_js_support( f"Failed to scrape after {self.retry_limit} attempts: {str(e)}" ) finally: - await browser.close() + if browser is not None: + await browser.close() + + def _get_scraping_fn(self): + """Return the appropriate scraping function based on backend config.""" + if self.requires_js_support: + return self.ascrape_with_js_support + if self.backend == "playwright": + return self.ascrape_playwright + elif self.backend == "selenium": + return self.ascrape_undetected_chromedriver + else: + raise ValueError(f"Unsupported backend: {self.backend}") def lazy_load(self) -> Iterator[Document]: """ @@ -446,11 +460,7 @@ def lazy_load(self) -> Iterator[Document]: Yields: Document: The scraped content encapsulated within a Document object. """ - scraping_fn = ( - self.ascrape_with_js_support - if self.requires_js_support - else getattr(self, f"ascrape_{self.backend}") - ) + scraping_fn = self._get_scraping_fn() for url in self.urls: html_content = asyncio.run(scraping_fn(url)) @@ -470,11 +480,7 @@ async def alazy_load(self) -> AsyncIterator[Document]: Document: A Document object containing the scraped content, along with its source URL as metadata. """ - scraping_fn = ( - self.ascrape_with_js_support - if self.requires_js_support - else getattr(self, f"ascrape_{self.backend}") - ) + scraping_fn = self._get_scraping_fn() tasks = [scraping_fn(url) for url in self.urls] results = await asyncio.gather(*tasks) diff --git a/tests/test_json_scraper_multi_graph.py b/tests/test_json_scraper_multi_graph.py index e69de29b..68d8537a 100644 --- a/tests/test_json_scraper_multi_graph.py +++ b/tests/test_json_scraper_multi_graph.py @@ -0,0 +1,36 @@ +""" +Tests for JSONScraperMultiGraph. +""" +import pytest +from scrapegraphai.graphs import JsonScraperMultiGraph + + +@pytest.fixture +def mock_json_config(): + return { + "llm": { + "model": "mock-model", + }, + } + + +class TestJsonScraperMultiGraph: + """Test suite for JsonScraperMultiGraph.""" + + def test_initialization(self, mock_json_config): + """Test that the graph can be initialized with basic config.""" + graph = JsonScraperMultiGraph( + prompt="Extract data", + source="[{\"test\": \"data\"}]", + config=mock_json_config, + ) + assert graph is not None + + def test_empty_config_raises_error(self): + """Test that empty config raises appropriate error.""" + with pytest.raises(Exception): + JsonScraperMultiGraph( + prompt="Extract data", + source="[{\"test\": \"data\"}]", + config={}, + ) diff --git a/tests/test_smart_scraper_multi_concat_graph.py b/tests/test_smart_scraper_multi_concat_graph.py index e69de29b..6ac1601e 100644 --- a/tests/test_smart_scraper_multi_concat_graph.py +++ b/tests/test_smart_scraper_multi_concat_graph.py @@ -0,0 +1,36 @@ +""" +Tests for SmartScraperMultiConcatGraph. +""" +import pytest +from scrapegraphai.graphs import SmartScraperMultiConcatGraph + + +@pytest.fixture +def mock_concat_config(): + return { + "llm": { + "model": "mock-model", + }, + } + + +class TestSmartScraperMultiConcatGraph: + """Test suite for SmartScraperMultiConcatGraph.""" + + def test_initialization(self, mock_concat_config): + """Test that the graph can be initialized with basic config.""" + graph = SmartScraperMultiConcatGraph( + prompt="Extract data", + source=["https://example.com"], + config=mock_concat_config, + ) + assert graph is not None + + def test_empty_sources_raises_error(self, mock_concat_config): + """Test that empty sources list raises appropriate error.""" + with pytest.raises(Exception): + SmartScraperMultiConcatGraph( + prompt="Extract data", + source=[], + config=mock_concat_config, + )