Times125 · yuyijiong · Oct 4, 2023 · Jun 14, 2024
diff --git a/README.md b/README.md
@@ -2,6 +2,7 @@
 [![](https://img.shields.io/badge/python-3-brightgreen.svg)](https://www.python.org/downloads/)
 [![](https://img.shields.io/badge/scrapy-1.5-blue.svg)](https://scrapy.org/)
 ## 百科类网站爬虫
+ 本项目已整合进 [BaiduSpider](https://github.com/yuyijiong/BaiduSpider) 项目，请改用该项目，不要再直接使用本项目。
 
 ### 特性
 - 百科类网站全站词条抓取，包括百度百科、互动百科、wiki中英文站点；

diff --git a/encyclopediaCrawler/spiders/baidu_spider.py b/encyclopediaCrawler/spiders/baidu_spider.py
@@ -84,9 +84,12 @@ def parse(self, response):
             else:
                 basic_info_dict[dict_key] = dict_value
 
-        h2_title = '正文'
-        sibling = soup.find('div', attrs={'class': 'para-title level-2'})
+        # 找到第一个 class 同时包含 para-title 和 level-2 的 div 标签
+        sibling = soup.find('div', attrs={'class': lambda x: x and 'para-title' in x and 'level-2' in x})
+
+        # 如果没有二级标题，则把全部段落统一归入“正文”
         if not sibling:
+            h2_title = '正文'
             content_h2_dict[h2_title] = ''
             img_dict[h2_title] = list()
             for para in soup.find_all('div', attrs={'class': 'para'}):
@@ -97,6 +100,8 @@ def parse(self, response):
                         img_dict[h2_title].append(img_url)
                 except AttributeError:
                     pass
+
+        # 如果有二级标题，则依次遍历兄弟节点，分别收集每个二级标题下的内容
         else:
             while sibling is not None:
                 if 'para-title level-2' in str(sibling):
@@ -107,9 +112,9 @@ def parse(self, response):
                 elif 'para-title level-3' in str(sibling):
                     # 3级标题名称
                     content_h2_dict[h2_title] += '<h3>' + sibling.find('h3').get_text('$$').split('$$')[-1] + '</h3>'
-                elif 'class="para"' in str(sibling):
+                elif 'class=\"para' in str(sibling):
                     # 对应的正文内容
-                    content_h2_dict[h2_title] += '<p>' + re.sub(r'\r|\n', '', sibling.get_text()) + '</p>'
+                    content_h2_dict[h2_title] += '<p>' + re.sub(r'\r|\n', '', sibling.get_text()).strip() + '</p>'
                     try:
                         img_url = sibling.find('img').get('data-src')
                         if img_url:
@@ -120,6 +125,7 @@ def parse(self, response):
                     sibling = next(sibling.next_siblings)
                 except StopIteration:
                     sibling = None
+
         # 参考资料
         try:
             reference_key = soup.find('dt', attrs={'class': 'reference-title'}).get_text()