From 0f3eb7d08dd0dd78c4226ac337b0485f937c7d11 Mon Sep 17 00:00:00 2001 From: Yilei Pan Date: Fri, 15 Dec 2023 09:50:03 +0100 Subject: [PATCH 1/2] fix: double-check the response URL when allowed_url and allowed_domains are set dynamically --- app/crawler/middlewares.py | 16 ++++++++++++---- app/crawler/spider.py | 1 + 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/app/crawler/middlewares.py b/app/crawler/middlewares.py index 1ed6ef7..d46bd7d 100644 --- a/app/crawler/middlewares.py +++ b/app/crawler/middlewares.py @@ -70,6 +70,17 @@ def process_response(self, request, response, spider): f"Page limit reached. Ignoring request {request}" ) + # Parse the domain from the response URL + response_domain = urlparse(response.url).netloc + + # Check if the response domain is in the allowed domains + if spider.allowed_domains is not None and response_domain not in spider.allowed_domains: + raise IgnoreRequest(f"Domain not in allowed domains: {response.url}") + + # Check if the response URL matches the allowed URL + if spider.allowed_url is not None and not response.url.rstrip('/').endswith(spider.allowed_url): + raise IgnoreRequest(f"URL not allowed: {response.url}") + if request.url.endswith("robots.txt"): return response @@ -79,12 +90,9 @@ def process_response(self, request, response, spider): spider.first_real_url = response.url parsed_url = urlparse(spider.first_real_url) if parsed_url.path: - spider.allowed_url = parsed_url.path - else: - spider.allowed_url = parsed_url.netloc + spider.allowed_url = parsed_url.path.strip('/') spider.allowed_domains = [parsed_url.netloc] - if response.status == 200: self.current_page_count += 1 self._save_html_locally(response, spider) diff --git a/app/crawler/spider.py b/app/crawler/spider.py index e56559e..ffcf9f2 100644 --- a/app/crawler/spider.py +++ b/app/crawler/spider.py @@ -9,6 +9,7 @@ class MenesrSpider(CrawlSpider): rules = (Rule(),) use_playwright = False allowed_url = None + allowed_domains = None first_real_url = 
None page_count = 0 page_limit = 0 From 513e8081eb78a63ec67c17a9c4532211321f75dc Mon Sep 17 00:00:00 2001 From: Yilei Pan Date: Fri, 15 Dec 2023 09:58:01 +0100 Subject: [PATCH 2/2] fix: repair mock errors in unit tests --- tests/tests_crawler/test_middlewares.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/tests/tests_crawler/test_middlewares.py b/tests/tests_crawler/test_middlewares.py index 3b86527..84d3f5d 100644 --- a/tests/tests_crawler/test_middlewares.py +++ b/tests/tests_crawler/test_middlewares.py @@ -87,8 +87,10 @@ def setUp(self): ) def test_process_response(self, mock_mkdir, mock_write_text): mock_response = self.mock_response() + mock_spider = self.mock_spider() + mock_spider.allowed_url = "/test" result = self.middleware.process_response( - self.mock_request(), mock_response, self.mock_spider() + self.mock_request(), mock_response, mock_spider ) mock_mkdir.assert_called_once() mock_write_text.assert_called_once_with(self.mock_response().text) @@ -101,10 +103,12 @@ def test_process_response(self, mock_mkdir, mock_write_text): ) def test_robots_txt_skip(self, mock_mkdir, mock_write_text): mock_response = self.mock_response(url="http://example.com/robots.txt") + mock_spider = self.mock_spider() + mock_spider.allowed_url = "/robots.txt" result = self.middleware.process_response( self.mock_request(url="http://example.com/robots.txt"), mock_response, - self.mock_spider(), + mock_spider, ) mock_mkdir.assert_not_called() mock_write_text.assert_not_called()