From b634825f12bf55fee74138ab15ee157a42bca76c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BF=9E=E4=B8=80=E7=82=85?= <73890704+yuyijiong@users.noreply.github.com> Date: Wed, 4 Oct 2023 16:17:04 +0800 Subject: [PATCH 1/2] Update baidu_spider.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 修复解析不出二级标题的bug --- encyclopediaCrawler/spiders/baidu_spider.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/encyclopediaCrawler/spiders/baidu_spider.py b/encyclopediaCrawler/spiders/baidu_spider.py index 8db3762..ad3b21b 100644 --- a/encyclopediaCrawler/spiders/baidu_spider.py +++ b/encyclopediaCrawler/spiders/baidu_spider.py @@ -84,9 +84,12 @@ def parse(self, response): else: basic_info_dict[dict_key] = dict_value - h2_title = '正文' - sibling = soup.find('div', attrs={'class': 'para-title level-2'}) + #找到第一个class为para-title且class为level-2的div标签 + sibling = soup.find('div', attrs={'class': lambda x: x and 'para-title' in x and 'level-2' in x}) + + #如果没有二级标题,那么就是正文 if not sibling: + h2_title = '正文' content_h2_dict[h2_title] = '' img_dict[h2_title] = list() for para in soup.find_all('div', attrs={'class': 'para'}): @@ -97,6 +100,8 @@ def parse(self, response): img_dict[h2_title].append(img_url) except AttributeError: pass + + #如果有二级标题,分别获取每个二级标题下的内容 else: while sibling is not None: if 'para-title level-2' in str(sibling): @@ -107,9 +112,9 @@ def parse(self, response): elif 'para-title level-3' in str(sibling): # 3级标题名称 content_h2_dict[h2_title] += '
' + re.sub(r'\r|\n', '', sibling.get_text()) + '
' + content_h2_dict[h2_title] += '' + re.sub(r'\r|\n', '', sibling.get_text()).strip() + '
' try: img_url = sibling.find('img').get('data-src') if img_url: @@ -120,6 +125,7 @@ def parse(self, response): sibling = next(sibling.next_siblings) except StopIteration: sibling = None + # 参考资料 try: reference_key = soup.find('dt', attrs={'class': 'reference-title'}).get_text() From b9ca88449c64fbf5f52152cf18459a855ed1fbc1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BF=9E=E4=B8=80=E7=82=85?= <73890704+yuyijiong@users.noreply.github.com> Date: Fri, 14 Jun 2024 11:50:48 +0800 Subject: [PATCH 2/2] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 45ac35b..817f444 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ [](https://www.python.org/downloads/) [](https://scrapy.org/) ## 百科类网站爬虫 + 此项目已经被整合于 [BaiduSpider](https://github.com/yuyijiong/BaiduSpider) 项目中,请勿直接使用本项目。 ### 特性 - 百科类网站全站词条抓取,包括百度百科、互动百科、wiki中英文站点;