From b634825f12bf55fee74138ab15ee157a42bca76c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BF=9E=E4=B8=80=E7=82=85?=
 <73890704+yuyijiong@users.noreply.github.com>
Date: Wed, 4 Oct 2023 16:17:04 +0800
Subject: [PATCH 1/2] Update baidu_spider.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

修复解析不出二级标题的bug
---
 encyclopediaCrawler/spiders/baidu_spider.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)
diff --git a/encyclopediaCrawler/spiders/baidu_spider.py b/encyclopediaCrawler/spiders/baidu_spider.py
index 8db3762..ad3b21b 100644
--- a/encyclopediaCrawler/spiders/baidu_spider.py
+++ b/encyclopediaCrawler/spiders/baidu_spider.py
@@ -84,9 +84,12 @@ def parse(self, response):
             else:
                 basic_info_dict[dict_key] = dict_value
 
-        h2_title = '正文'
-        sibling = soup.find('div', attrs={'class': 'para-title level-2'})
+        # 找到第一个 class 属性同时包含 para-title 和 level-2 的 div 标签
+        sibling = soup.find('div', attrs={'class': lambda x: x and 'para-title' in x and 'level-2' in x})
+
+        # 如果没有二级标题，则统一使用“正文”作为标题
         if not sibling:
+            h2_title = '正文'
             content_h2_dict[h2_title] = ''
             img_dict[h2_title] = list()
             for para in soup.find_all('div', attrs={'class': 'para'}):
@@ -97,6 +100,8 @@ def parse(self, response):
                         img_dict[h2_title].append(img_url)
                 except AttributeError:
                     pass
+
+        # 如果有二级标题，分别获取每个二级标题下的内容
         else:
             while sibling is not None:
                 if 'para-title level-2' in str(sibling):
@@ -107,9 +112,9 @@ def parse(self, response):
                 elif 'para-title level-3' in str(sibling):
                     # 3级标题名称
                     content_h2_dict[h2_title] += '<h3>' + sibling.find('h3').get_text('$$').split('$$')[-1] + '</h3>'
-                elif 'class="para"' in str(sibling):
+                elif 'class=\"para' in str(sibling):
                     # 对应的正文内容
-                    content_h2_dict[h2_title] += '<p>' + re.sub(r'\r|\n', '', sibling.get_text()) + '</p>'
+                    content_h2_dict[h2_title] += '<p>' + re.sub(r'\r|\n', '', sibling.get_text()).strip() + '</p>'
                     try:
                         img_url = sibling.find('img').get('data-src')
                         if img_url:
@@ -120,6 +125,7 @@ def parse(self, response):
                     sibling = next(sibling.next_siblings)
                 except StopIteration:
                     sibling = None
+
         # 参考资料
         try:
             reference_key = soup.find('dt', attrs={'class': 'reference-title'}).get_text()

From b9ca88449c64fbf5f52152cf18459a855ed1fbc1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E4=BF=9E=E4=B8=80=E7=82=85?=
 <73890704+yuyijiong@users.noreply.github.com>
Date: Fri, 14 Jun 2024 11:50:48 +0800
Subject: [PATCH 2/2] Update README.md

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 45ac35b..817f444 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@
 [![](https://img.shields.io/badge/python-3-brightgreen.svg)](https://www.python.org/downloads/)
 [![](https://img.shields.io/badge/scrapy-1.5-blue.svg)](https://scrapy.org/)
 ## 百科类网站爬虫
+此项目已整合到 [BaiduSpider](https://github.com/yuyijiong/BaiduSpider) 项目中，请勿直接使用本项目。
 
 ### 特性
 - 百科类网站全站词条抓取，包括百度百科、互动百科、wiki中英文站点；