diff --git a/app/crawler/middlewares.py b/app/crawler/middlewares.py
index 1ed6ef7..d46bd7d 100644
--- a/app/crawler/middlewares.py
+++ b/app/crawler/middlewares.py
@@ -70,6 +70,17 @@ def process_response(self, request, response, spider):
                 f"Page limit reached. Ignoring request {request}"
             )

+        # Parse the domain from the response URL
+        response_domain = urlparse(response.url).netloc
+
+        # Check if the response domain is in the allowed domains
+        if spider.allowed_domains is not None and response_domain not in spider.allowed_domains:
+            raise IgnoreRequest(f"Domain not in allowed domains: {response.url}")
+
+        # Check if the response URL matches the allowed URL
+        if spider.allowed_url is not None and not response.url.rstrip('/').endswith(spider.allowed_url):
+            raise IgnoreRequest(f"URL not allowed: {response.url}")
+
         if request.url.endswith("robots.txt"):
             return response

@@ -79,12 +90,9 @@ def process_response(self, request, response, spider):
             spider.first_real_url = response.url
             parsed_url = urlparse(spider.first_real_url)
             if parsed_url.path:
-                spider.allowed_url = parsed_url.path
-            else:
-                spider.allowed_url = parsed_url.netloc
+                spider.allowed_url = parsed_url.path.strip('/')
             spider.allowed_domains = [parsed_url.netloc]
-
         if response.status == 200:
             self.current_page_count += 1
             self._save_html_locally(response, spider)

diff --git a/app/crawler/spider.py b/app/crawler/spider.py
index e56559e..ffcf9f2 100644
--- a/app/crawler/spider.py
+++ b/app/crawler/spider.py
@@ -9,6 +9,7 @@ class MenesrSpider(CrawlSpider):
     rules = (Rule(),)
     use_playwright = False
     allowed_url = None
+    allowed_domains = None
     first_real_url = None
     page_count = 0
     page_limit = 0
diff --git a/tests/tests_crawler/test_middlewares.py b/tests/tests_crawler/test_middlewares.py
index 3b86527..84d3f5d 100644
--- a/tests/tests_crawler/test_middlewares.py
+++ b/tests/tests_crawler/test_middlewares.py
@@ -87,8 +87,10 @@ def setUp(self):
     )
     def test_process_response(self, mock_mkdir, mock_write_text):
         mock_response = self.mock_response()
+        mock_spider = self.mock_spider()
+        mock_spider.allowed_url = "/test"
         result = self.middleware.process_response(
-            self.mock_request(), mock_response, self.mock_spider()
+            self.mock_request(), mock_response, mock_spider
         )
         mock_mkdir.assert_called_once()
         mock_write_text.assert_called_once_with(self.mock_response().text)
@@ -101,10 +103,12 @@ def test_process_response(self, mock_mkdir, mock_write_text):
     )
     def test_robots_txt_skip(self, mock_mkdir, mock_write_text):
         mock_response = self.mock_response(url="http://example.com/robots.txt")
+        mock_spider = self.mock_spider()
+        mock_spider.allowed_url = "/robots.txt"
         result = self.middleware.process_response(
             self.mock_request(url="http://example.com/robots.txt"),
             mock_response,
-            self.mock_spider(),
+            mock_spider,
         )
         mock_mkdir.assert_not_called()
         mock_write_text.assert_not_called()
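
For reference, a minimal sketch of how the two new guards in process_response behave once allowed_url has been normalised with strip('/'). The domain, path segment, and URLs are made up for illustration, and is_allowed is a hypothetical helper that mirrors the checks rather than part of the middleware:

from urllib.parse import urlparse

# Stand-ins for the spider attributes the patch reads.
allowed_domains = ["www.example.com"]
allowed_url = "dossiers/rapport"  # stored without slashes, per parsed_url.path.strip('/')

def is_allowed(url):
    # Guard 1: the response domain must be in allowed_domains.
    domain = urlparse(url).netloc
    if allowed_domains is not None and domain not in allowed_domains:
        return False  # middleware raises IgnoreRequest here
    # Guard 2: the URL, minus trailing slashes, must end with allowed_url.
    if allowed_url is not None and not url.rstrip('/').endswith(allowed_url):
        return False  # middleware raises IgnoreRequest here
    return True

print(is_allowed("https://www.example.com/dossiers/rapport/"))   # True
print(is_allowed("https://www.example.com/dossiers/autre"))      # False: path suffix differs
print(is_allowed("https://autre.example.net/dossiers/rapport"))  # False: domain not allowed

Note that endswith is a plain suffix match, so any URL whose path ends with the stored segment passes guard 2; trailing slashes on either side are neutralised by strip('/') at assignment and rstrip('/') at check time.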