@@ -2,9 +2,10 @@
 import json
 import random
 import time
-
 import requests
+
 from bs4 import BeautifulSoup, Comment
+from datetime import datetime
 from urllib.parse import urlparse
 
 from selenium import webdriver
@@ -487,3 +488,101 @@ def parse_item(self, url):
         except Exception as e:
             self.printException(e)
         return content
+
+
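+# Spider for the FIVB volleyball news category: walks the listing page,
+# parses each story, and ships the results to the backend via self.hound().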
+class FIVBSpider(VolleyballSpider):
+    def __init__(self):
+        super().__init__()
+        self.url = 'https://www.fivb.com/category/volley/'
+
+    def start(self):
+        self.parse_list(self.url)
+
+    def parse_list(self, url):
+        r = self.request(url)
+        if r is False:
+            return
+        soup = BeautifulSoup(r.content, "html.parser")
+
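+        # Each story card on the listing page is an <article class="mod-article-item">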
+        items = soup.find_all('article', class_='mod-article-item')
+
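+        # Collect parsed articles and flush them to the backend in batches of 10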
+        news = []
+        for item in items:
+            article = {
+                'source': 'fivb',
+                'title': '',
+                'url': '',
+                'author': '国际排联',  # FIVB's Chinese name
+                'desc': '',
+                'poster': '',
+                'content': '',
+                'publish_time': ''
+            }
+
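+            # The card's first <a> carries the link to the full story page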
+            article['url'] = item.select_one('a').get('href')
+
+            info = self.parse_item(article['url'])
+            if info is None:
+                # Story page could not be fetched; skip this card
+                continue
+
+            article['title'] = info['title']
+            article['desc'] = info['desc']
+            article['content'] = info['content']
+            article['publish_time'] = info['publish_time']
+            article['poster'] = info['poster']
+
+            news.append(article)
+            if len(news) == 10:
+                self.hound({'news': json.dumps(news)})
+                news = []
+
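+            # Throttle: pause for a second between story requests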
+            time.sleep(1)
+
+        if len(news) > 0:
+            self.hound({'news': json.dumps(news)})
+
+    def parse_item(self, url):
+        r = self.request(url)
+        if r is False:
+            # Tell the caller this page could not be fetched
+            return None
+
+        title = ''
+        desc = ''
+        content = ''
+        publish_time = ''
+        poster = ''
+
+        soup = BeautifulSoup(r.content, "html.parser")
+
+        # Strip all HTML comments from the parsed document
+        for comment in soup.find_all(string=lambda string: isinstance(string, Comment)):
+            comment.extract()
+
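+        # The article body is wrapped in <div class="single-new">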
+        main = soup.find('div', class_='single-new')
+
+        title_ = main.find('div', class_='title-wrapper')
+
+        title = title_.select_one('h1').text.strip()
+        desc = title_.select_one('h2').text.strip()
+
+        meta = main.find('div', class_='meta position-relative')
+
+        cover = meta.select_one('img.cover')
+        poster = self.parseHref(cover.get('src'), url)
+
+        publish_time_ = meta.find('div', class_='date').text.strip()
+
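+        # Dates are rendered like "Jan 05, 2024"; normalize to ISO "2024-01-05"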
+        publish_time = datetime.strptime(publish_time_, "%b %d, %Y").strftime("%Y-%m-%d")
+
+        content_ = main.find('article', class_='post').find('div', class_='container').find('div', class_='row')
+
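+        # Drop embedded widgets so only the article markup remains:
+        # Instagram embeds, layout spacers, and any iframes or scripts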
+        for blockquote in content_.find_all('blockquote', class_='instagram-media'):
+            blockquote.extract()
+
+        for div in content_.find_all('div', class_='spacer-3'):
+            div.extract()
+
+        for tag in content_.find_all(['iframe', 'script']):
+            tag.extract()
+
+        # Flatten the cleaned markup into a single-line HTML string
+        content = content_.prettify().replace('\n', '')
+
+        return {'title': title, 'desc': desc, 'content': content, 'publish_time': publish_time, 'poster': poster}
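+
+
+# Usage sketch (assumes the VolleyballSpider base class provides request(),
+# hound(), parseHref() and printException(), as used above):
+#
+#     spider = FIVBSpider()
+#     spider.start()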