Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
[![](https://img.shields.io/badge/python-3-brightgreen.svg)](https://www.python.org/downloads/)
[![](https://img.shields.io/badge/scrapy-1.5-blue.svg)](https://scrapy.org/)
## 百科类网站爬虫
此项目已经被整合于 [BaiduSpider](https://github.com/yuyijiong/BaiduSpider) 项目中,请勿直接使用本项目。

### 特性
- 百科类网站全站词条抓取,包括百度百科、互动百科、wiki中英文站点;
Expand Down
14 changes: 10 additions & 4 deletions encyclopediaCrawler/spiders/baidu_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,12 @@ def parse(self, response):
else:
basic_info_dict[dict_key] = dict_value

h2_title = '正文'
sibling = soup.find('div', attrs={'class': 'para-title level-2'})
#找到第一个class为para-title且class为level-2的div标签
sibling = soup.find('div', attrs={'class': lambda x: x and 'para-title' in x and 'level-2' in x})

#如果没有二级标题,那么就是正文
if not sibling:
h2_title = '正文'
content_h2_dict[h2_title] = ''
img_dict[h2_title] = list()
for para in soup.find_all('div', attrs={'class': 'para'}):
Expand All @@ -97,6 +100,8 @@ def parse(self, response):
img_dict[h2_title].append(img_url)
except AttributeError:
pass

#如果有二级标题,分别获取每个二级标题下的内容
else:
while sibling is not None:
if 'para-title level-2' in str(sibling):
Expand All @@ -107,9 +112,9 @@ def parse(self, response):
elif 'para-title level-3' in str(sibling):
# 3级标题名称
content_h2_dict[h2_title] += '<h3>' + sibling.find('h3').get_text('$$').split('$$')[-1] + '</h3>'
elif 'class="para"' in str(sibling):
elif 'class=\"para' in str(sibling):
# 对应的正文内容
content_h2_dict[h2_title] += '<p>' + re.sub(r'\r|\n', '', sibling.get_text()) + '</p>'
content_h2_dict[h2_title] += '<p>' + re.sub(r'\r|\n', '', sibling.get_text()).strip() + '</p>'
try:
img_url = sibling.find('img').get('data-src')
if img_url:
Expand All @@ -120,6 +125,7 @@ def parse(self, response):
sibling = next(sibling.next_siblings)
except StopIteration:
sibling = None

# 参考资料
try:
reference_key = soup.find('dt', attrs={'class': 'reference-title'}).get_text()
Expand Down