diff --git a/README.md b/README.md index 45ac35b..817f444 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,7 @@ [](https://www.python.org/downloads/) [](https://scrapy.org/) ## 百科类网站爬虫 + 此项目已经被整合于 [BaiduSpider](https://github.com/yuyijiong/BaiduSpider) 项目中,请勿直接使用本项目。 ### 特性 - 百科类网站全站词条抓取,包括百度百科、互动百科、wiki中英文站点; diff --git a/encyclopediaCrawler/spiders/baidu_spider.py b/encyclopediaCrawler/spiders/baidu_spider.py index 8db3762..ad3b21b 100644 --- a/encyclopediaCrawler/spiders/baidu_spider.py +++ b/encyclopediaCrawler/spiders/baidu_spider.py @@ -84,9 +84,12 @@ def parse(self, response): else: basic_info_dict[dict_key] = dict_value - h2_title = '正文' - sibling = soup.find('div', attrs={'class': 'para-title level-2'}) + #找到第一个class为para-title且class为level-2的div标签 + sibling = soup.find('div', attrs={'class': lambda x: x and 'para-title' in x and 'level-2' in x}) + + #如果没有二级标题,那么就是正文 if not sibling: + h2_title = '正文' content_h2_dict[h2_title] = '' img_dict[h2_title] = list() for para in soup.find_all('div', attrs={'class': 'para'}): @@ -97,6 +100,8 @@ def parse(self, response): img_dict[h2_title].append(img_url) except AttributeError: pass + + #如果有二级标题,分别获取每个二级标题下的内容 else: while sibling is not None: if 'para-title level-2' in str(sibling): @@ -107,9 +112,9 @@ def parse(self, response): elif 'para-title level-3' in str(sibling): # 3级标题名称 content_h2_dict[h2_title] += '
' + re.sub(r'\r|\n', '', sibling.get_text()) + '
' + content_h2_dict[h2_title] += '' + re.sub(r'\r|\n', '', sibling.get_text()).strip() + '
' try: img_url = sibling.find('img').get('data-src') if img_url: @@ -120,6 +125,7 @@ def parse(self, response): sibling = next(sibling.next_siblings) except StopIteration: sibling = None + # 参考资料 try: reference_key = soup.find('dt', attrs={'class': 'reference-title'}).get_text()