
Commit d18a2b1

Inherit from the BaseSpider base class
1 parent 599b094 commit d18a2b1
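This commit folds the request/retry, header, and reporting plumbing that JavBusSpider and AvmooSpider previously duplicated into a shared BaseSpider. base_spider.py itself is not part of this diff; judging from the call sites, it must provide self.session, self.host, self.source, request() with retries, and sendMovieData(). A minimal sketch of that interface, assuming only what the call sites and the removed code show (everything beyond that is a guess):

# Hypothetical reconstruction of base_spider.py -- not part of this commit.
# Only the names used by the subclasses (session, host, source, request,
# sendMovieData) are grounded in the diff; the rest mirrors the removed code.
import json
import random
import time
from urllib.parse import urlparse

import requests


class BaseSpider(object):
    def __init__(self, baseUrl, houndUrl, startUrl):
        self.baseUrl = baseUrl
        self.houndUrl = houndUrl
        self.startUrl = startUrl
        self.source = ''

        # Per-instance session replaces the old module-level `session`.
        self.session = requests.session()

        pr = urlparse(startUrl)
        self.host = pr.scheme + '://' + pr.netloc

    def getHeaders(self):
        # Rotate desktop user agents, as the removed getHeaders() methods did.
        userAgents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
        ]
        return {'User-Agent': random.choice(userAgents)}

    def request(self, url, tries=1):
        # GET with retry, as in the removed per-spider request() methods:
        # give up after 10 tries and return False, sleep 5s between retries.
        if tries >= 10:
            print(url, 'tries>=10')
            return False
        try:
            r = self.session.get(url, headers=self.getHeaders())
            print('%s %s' % (r.status_code, url))
            if r.status_code != 200:
                time.sleep(5)
                return self.request(url, tries + 1)
            return r
        except Exception as e:
            print(url, e)
            time.sleep(5)
            return self.request(url, tries + 1)

    def hound(self, data):
        # POST scraped data to the collector endpoint.
        try:
            r = requests.post(self.houndUrl, data, headers=self.getHeaders())
            print(r.content)
        except Exception as e:
            self.printException(e)

    def sendMovieData(self, movie):
        # Build the payload shape the removed inline code assembled by hand.
        self.hound({'type': 'movie', 'movies': json.dumps([movie])})

    def printException(self, e):
        print('Exception occurs at line %s' % e.__traceback__.tb_lineno)
        print(e)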

1 file changed: 35 additions and 155 deletions

File tree

crawler/av-spider/spider.py

@@ -1,76 +1,26 @@
 # coding=utf-8

 import json
-import random
-import requests
 import time
 from datetime import datetime
-from itertools import count
 from lxml import etree
 from multiprocessing.dummy import Pool
 from urllib.parse import urlparse

-session = requests.session()
+from base_spider import BaseSpider

-class JavBusSpider(object):
-    def __init__(self, baseUrl, houndUrl, startUrl):
-        self.baseUrl = baseUrl
-        self.houndUrl = houndUrl
-        self.startUrl = startUrl
-
-        pr = urlparse(self.startUrl)
-        self.host = pr.scheme + '://' + pr.netloc
-
-    def getHeaders(self):
-        userAgents = [
-            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
-            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3.1 Safari/605.1.15',
-            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
-            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
-            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36 Edge/18.17763'
-        ]
-        random.shuffle(userAgents)
-        headers = {
-            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
-            'Upgrade-Insecure-Requests': '1',
-            'Referer': 'https://www.javbus.com',
-            'User-Agent': userAgents[0]
-        }
-        return headers

-    def printException(self, e):
-        print('Exception occurs at line %s' %
-              (e.__traceback__.tb_lineno.__str__()))
-        print(e)
+class JavBusSpider(BaseSpider):
+    def __init__(self, baseUrl, houndUrl, startUrl):
+        super(JavBusSpider, self).__init__(baseUrl, houndUrl, startUrl)
+        self.source = 'javbus'

     def request(self, url, tries=1):
-        if tries >= 10:
-            print(url, 'tries>=10')
-            return False
-        try:
-            session.cookies.set('existmag', 'all', path='/', domain='www.javbus.com')
-            r = session.get(url, headers=self.getHeaders())
-            print('%s %s' % (r.status_code, url))
-            if r.status_code != 200:
-                time.sleep(5)
-                return self.request(url, tries + 1)
-
-            return r
-        except Exception as e:
-            print(url, e)
-            time.sleep(5)
-            return self.request(url, tries + 1)
-
-    def hound(self, data):
-        try:
-            r = requests.post(self.houndUrl, data, headers=self.getHeaders())
-            print(r.content)
-        except Exception as e:
-            self.printException(e)
+        self.session.cookies.set('existmag', 'all', path='/', domain='www.javbus.com')
+        return super(JavBusSpider, self).request(url, tries)

     def start(self):
-        r = requests.get(self.baseUrl, headers=self.getHeaders())
-
+        r = self.request(self.baseUrl)
         data = json.loads(r.text)
         self.ids = data.get('ids')
         self.stars = data.get('stars')
@@ -106,7 +56,7 @@ def start(self):
             print('Start crawling series ' + (series.get('aka') or series['name']))
             url = self.host + '/series/' + series['id']
             self.parseList(url)
-
+
         for genre in self.genres:
             if genre['status'] < 1:
                 continue
@@ -118,7 +68,7 @@ def start(self):

     def parseList(self, url):
         r = self.request(url)
-        if r == False:
+        if r is False:
             return

         pr = urlparse(url)
@@ -143,7 +93,7 @@ def parseList(self, url):
                 href = pr.scheme + ':' + href

             movie_id = href.split('/').pop()
-            if (movie_id in self.ids):
+            if movie_id in self.ids:
                 continue

             thumb = ''
@@ -178,14 +128,14 @@
     def parseMovie(self, item):
         url = item['url']
         r = self.request(url)
-        if r == False:
+        if r is False:
             return

         html = etree.HTML(r.content)

         movie = {
             'id': '',
-            'source': 'javbus',
+            'source': self.source,
             'title': '',
             'poster': '',
             'serial_number': '',
@@ -240,7 +190,7 @@ def parseMovie(self, item):
         stars = html.xpath("//div[@id='avatar-waterfall']/a")
         for _star in stars:
             star_id = _star.attrib.get('href').split('/').pop()
-            if star_names[star_id] is not None:
+            if star_id in star_names:
                 star_name = star_names[star_id]
             else:
                 try:
@@ -253,7 +203,7 @@ def parseMovie(self, item):
                     star_avatar = ''
             if star_avatar.startswith("/"):
                 star_avatar = self.host + star_avatar
-            star = {'id': star_id, 'source': 'javbus', 'name': star_name, 'avatar': star_avatar}
+            star = {'id': star_id, 'source': self.source, 'name': star_name, 'avatar': star_avatar}
             movie['stars'].append(star)

         infos = html.xpath("//div[@class='col-md-3 info']/p/a")
@@ -263,70 +213,28 @@ def parseMovie(self, item):
             info_name = info.text
             if 'series' in href:
                 movie['series'].append({'id': info_id, 'name': info_name})
-                continue
-            if 'label' in href:
+            elif 'label' in href:
                 movie['labels'].append({'id': info_id, 'name': info_name})
-                continue
-            if 'studio' in href:
+            elif 'studio' in href:
                 movie['studios'].append({'id': info_id, 'name': info_name})
-                continue
-            if 'director' in href:
+            elif 'director' in href:
                 movie['directors'].append({'id': info_id, 'name': info_name})
-                continue
-            print(info_id, info_name, href)
-
-        movies = []
-        movies.append(movie)
-        data = {
-            'type': 'movie',
-            'movies': json.dumps(movies)
-        }
-        self.hound(data)

+        self.sendMovieData(movie)
         time.sleep(1)

-class AvmooSpider(object):
+class AvmooSpider(BaseSpider):
     def __init__(self, baseUrl, houndUrl, startUrl):
-        self.baseUrl = baseUrl
-        self.houndUrl = houndUrl
-
-        pr = urlparse(startUrl)
-        self.host = pr.scheme + '://' + pr.netloc
+        super(AvmooSpider, self).__init__(baseUrl, houndUrl, startUrl)
+        self.source = 'avmoo'

         # List pages to crawl
         self.urls = [
             startUrl
         ]

-    def getHeaders(self):
-        userAgents = [
-            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
-            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3.1 Safari/605.1.15',
-            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
-            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
-            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36 Edge/18.17763'
-        ]
-        random.shuffle(userAgents)
-        headers = {
-            'User-Agent': userAgents[0]
-        }
-        return headers
-
-    def hound(self, data):
-        try:
-            r = requests.post(self.houndUrl, data, headers=self.getHeaders())
-            print(r.content)
-        except Exception as e:
-            self.printException(e)
-
-    def printException(self, e):
-        print('Exception occurs at line %s' %
-              (e.__traceback__.tb_lineno.__str__()))
-        print(e)
-
     def start(self):
-        r = requests.get(self.baseUrl, headers=self.getHeaders())
-
+        r = self.request(self.baseUrl)
         data = json.loads(r.text)
         self.ids = data.get('ids')
         self.stars = data.get('stars')
@@ -339,26 +247,9 @@ def start(self):
             url = self.host + '/cn/star/' + star['id']
             self.parseList(url, star['subscribe'] >= 1)

-    def request(self, url, tries=1):
-        if tries >= 10:
-            print(url, 'tries>=10')
-            return False
-        try:
-            r = requests.get(url, headers=self.getHeaders())
-            print('%s %s' % (r.status_code, url))
-            if r.status_code != 200:
-                time.sleep(5)
-                return self.request(url, tries + 1)
-
-            return r
-        except Exception as e:
-            print(url, e)
-            time.sleep(5)
-            return self.request(url, tries + 1)
-
-    def parseList(self, url, next, tries=1):
+    def parseList(self, url, next):
         r = self.request(url)
-        if r == False:
+        if r is False:
             return

         pr = urlparse(url)
@@ -369,8 +260,7 @@ def parseList(self, url, next, tries=1):
         if ('/star/' in url) & ('page' not in url):
             star_id = url.split('/').pop()
             _infos = []
-            infos = html.xpath(
-                '//div[@class="avatar-box"]//div[@class="photo-info"]/p')
+            infos = html.xpath('//div[@class="avatar-box"]//div[@class="photo-info"]/p')
             for i in infos:
                 if i.text is not None:
                     _infos.append(i.text)
@@ -394,7 +284,7 @@ def parseList(self, url, next, tries=1):
                 href = pr.scheme + ':' + href

             movie_id = href.split('/').pop()
-            if (movie_id in self.ids):
+            if movie_id in self.ids:
                 continue

             thumb = ''
@@ -424,17 +314,17 @@
             print('next page')
             self.parseList(href, next)

-    def parseMovie(self, item, tries=1):
+    def parseMovie(self, item):
         url = item['url']
         r = self.request(url)
-        if r == False:
+        if r is False:
             return

         html = etree.HTML(r.content)

         movie = {
             'id': '',
-            'source': 'avmoo',
+            'source': self.source,
             'title': '',
             'poster': '',
             'serial_number': '',
@@ -451,11 +341,9 @@ def parseMovie(self, item, tries=1):

         movie['id'] = url.split('/').pop()
         movie['title'] = html.xpath("//div[@class='container']/h3")[0].text
-        movie['poster'] = html.xpath(
-            "//a[@class='bigImage']/img")[0].attrib.get('src')
+        movie['poster'] = html.xpath("//a[@class='bigImage']/img")[0].attrib.get('src')

-        sample_images = html.xpath(
-            "//div[@id='sample-waterfall']//a[@class='sample-box']")
+        sample_images = html.xpath("//div[@id='sample-waterfall']//a[@class='sample-box']")
         for _img in sample_images:
             movie['samples'].append(_img.attrib.get('href'))

@@ -470,7 +358,7 @@
             star_avatar = _star.xpath(".//img")[0].attrib.get('src')
             if 'nowprinting' in star_avatar:
                 star_avatar = ''
-            star = {'id': star_id, 'source': 'avmoo', 'name': star_name, 'avatar': star_avatar}
+            star = {'id': star_id, 'source': self.source, 'name': star_name, 'avatar': star_avatar}
             movie['stars'].append(star)

         infos = html.xpath("//div[@class='col-md-3 info']/p")
@@ -485,8 +373,7 @@
             if nodes[0].text == '长度:':
                 movie['duration'] = str(nodes[1]).strip()

-        genres = html.xpath(
-            "//div[@class='col-md-3 info']/p/span[@class='genre']/a")
+        genres = html.xpath("//div[@class='col-md-3 info']/p/span[@class='genre']/a")
         for genre in genres:
             genre_id = genre.attrib.get('href').split('/').pop()
             genre_name = genre.text
@@ -511,12 +398,5 @@
                 continue
             print(info_id, info_name, href)

-        movies = []
-        movies.append(movie)
-        data = {
-            'type': 'movie',
-            'movies': json.dumps(movies)
-        }
-        self.hound(data)
-
+        self.sendMovieData(movie)
         time.sleep(1)
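
With this refactor, both spiders share one constructor signature, so a driver script would presumably look something like the sketch below (the endpoint URLs are placeholders, not values from this commit):

# Hypothetical usage sketch; baseUrl/houndUrl/startUrl values are placeholders.
from spider import JavBusSpider

if __name__ == '__main__':
    spider = JavBusSpider(
        baseUrl='https://example.com/api/meta',    # JSON with ids, stars, ...
        houndUrl='https://example.com/api/hound',  # collector endpoint
        startUrl='https://www.javbus.com',
    )
    spider.start()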
