
Commit e0217d0

Add crawling of the FIVB (国际排联, the international volleyball federation)
1 parent 0b3ab0f commit e0217d0

2 files changed: 102 additions & 2 deletions

crawler/volleyball-spider/main.py

Lines changed: 2 additions & 1 deletion
@@ -4,7 +4,7 @@
 
 from shutils import runtime
 from shutils.settings import Settings
-from spider import VolleyballSpider, VolleyballChinaSpider, VolSportsSpider, SportsVSpider, SportsSinaSpider
+from spider import VolleyballSpider, VolleyballChinaSpider, VolSportsSpider, SportsVSpider, SportsSinaSpider, FIVBSpider
 
 config = Settings.instance()
 CrawlInterval = config.getint('settings', 'crawl_interval')
@@ -21,6 +21,7 @@
     VolleyballChinaSpider().start()
     SportsVSpider().start()
     VolSportsSpider().start()
+    FIVBSpider().start()
 
     end = time.time()
 
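For context, a minimal sketch of how main.py presumably drives the spiders, inferred from the CrawlInterval setting and the start/end timing visible in these hunks; the surrounding loop and sleep logic are assumptions, not code from this commit:

    # A minimal sketch, assuming the spiders run in an endless loop and sleep
    # out the remainder of crawl_interval. Only the spider classes and the
    # CrawlInterval setting come from the diff; everything else is assumed.
    import time

    from shutils.settings import Settings
    from spider import (VolleyballSpider, VolleyballChinaSpider, VolSportsSpider,
                        SportsVSpider, SportsSinaSpider, FIVBSpider)

    config = Settings.instance()
    CrawlInterval = config.getint('settings', 'crawl_interval')

    if __name__ == '__main__':
        while True:
            start = time.time()

            VolleyballChinaSpider().start()
            SportsVSpider().start()
            VolSportsSpider().start()
            FIVBSpider().start()  # the spider this commit adds

            end = time.time()
            # Wait out the rest of the crawl window if this pass finished early.
            time.sleep(max(0, CrawlInterval - (end - start)))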

crawler/volleyball-spider/spider.py

Lines changed: 100 additions & 1 deletion
@@ -2,9 +2,10 @@
 import json
 import random
 import time
-
 import requests
+
 from bs4 import BeautifulSoup, Comment
+from datetime import datetime
 from urllib.parse import urlparse
 
 from selenium import webdriver
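The new datetime import backs the date normalization in FIVBSpider.parse_item below, which turns FIVB's English date strings into ISO dates. A quick illustration (the sample string is hypothetical, and %b assumes an English locale for month abbreviations):

    from datetime import datetime

    # Hypothetical sample in the format of FIVB's date element; "%b" matches
    # abbreviated English month names, so a non-English locale would break it.
    raw = 'Sep 28, 2024'
    print(datetime.strptime(raw, '%b %d, %Y').strftime('%Y-%m-%d'))  # 2024-09-28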
@@ -487,3 +488,101 @@ def parse_item(self, url):
         except Exception as e:
             self.printException(e)
         return content
+
+
+class FIVBSpider(VolleyballSpider):
+    def __init__(self):
+        super().__init__()
+        self.url = 'https://www.fivb.com/category/volley/'
+
+    def start(self):
+        self.parse_list(self.url)
+
+    def parse_list(self, url):
+        r = self.request(url)
+        if r is False:
+            return
+        soup = BeautifulSoup(r.content, "html.parser")
+
+        items = soup.find_all('article', class_='mod-article-item')
+
+        news = []
+        for item in items:
+            article = {
+                'source': 'fivb',
+                'title': '',
+                'url': '',
+                'author': '国际排联',  # FIVB, the international volleyball federation
+                'desc': '',
+                'poster': '',
+                'content': '',
+                'publish_time': ''
+            }
+
+            article['url'] = item.select_one('a').get('href')
+
+            info = self.parse_item(article['url'])
+
+            article['title'] = info['title']
+            article['desc'] = info['desc']
+            article['content'] = info['content']
+            article['publish_time'] = info['publish_time']
+            article['poster'] = info['poster']
+
+            news.append(article)
+            if len(news) == 10:
+                self.hound({'news': json.dumps(news)})  # push articles in batches of 10
+                news = []
+
+            time.sleep(1)
+
+        if len(news) > 0:
+            self.hound({'news': json.dumps(news)})
+
+    def parse_item(self, url):
+        r = self.request(url)
+        if r is False:
+            return
+
+        title = ''
+        desc = ''
+        content = ''
+        publish_time = ''
+        poster = ''
+
+        soup = BeautifulSoup(r.content, "html.parser")
+
+        # Remove all comments from the HTML string
+        for comment in soup.find_all(string=lambda string: isinstance(string, Comment)):
+            comment.extract()
+
+        main = soup.find('div', class_='single-new')
+
+        title_ = main.find('div', class_='title-wrapper')
+
+        title = title_.select_one('h1').text.strip()
+        desc = title_.select_one('h2').text.strip()
+
+        meta = main.find('div', class_='meta position-relative')
+
+        cover = meta.select_one('img.cover')
+        poster = self.parseHref(cover.get('src'), url)
+
+        publish_time_ = meta.find('div', class_='date').text.strip()
+
+        publish_time = datetime.strptime(publish_time_, "%b %d, %Y").strftime("%Y-%m-%d")  # e.g. 'Sep 28, 2024' -> '2024-09-28'
+
+        content_ = main.find('article', class_='post').find('div', class_='container').find('div', class_='row')
+        # Strip embedded widgets, spacers, iframes and scripts from the article body
+        for blockquote in content_.find_all('blockquote', class_='instagram-media'):
+            blockquote.extract()
+
+        for div in content_.find_all('div', class_='spacer-3'):
+            div.extract()
+
+        for iframe in content_.find_all(['iframe', 'script']):
+            iframe.extract()
+
+        content = content_.prettify().replace('\n', '')
+
+        return {'title': title, 'desc': desc, 'content': content, 'publish_time': publish_time, 'poster': poster}
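To try the new scraper outside the crawler framework (self.request, self.hound, and parseHref are framework helpers not shown in this diff), a standalone sketch using plain requests can fetch the category page and print the article links. The selectors mirror the commit, though FIVB's markup may of course change:

    # Standalone sketch of the list-page scrape, with plain requests standing
    # in for the framework's self.request helper. Selectors mirror the diff.
    import requests
    from bs4 import BeautifulSoup

    url = 'https://www.fivb.com/category/volley/'
    r = requests.get(url, timeout=10)
    r.raise_for_status()

    soup = BeautifulSoup(r.content, 'html.parser')
    for item in soup.find_all('article', class_='mod-article-item'):
        link = item.select_one('a')
        if link is not None:  # guard against teaser blocks without a link
            print(link.get('href'))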
