11# coding=utf-8
22
33import json
4- import random
5- import requests
64import time
75from datetime import datetime
8- from itertools import count
96from lxml import etree
107from multiprocessing .dummy import Pool
118from urllib .parse import urlparse
129
13- session = requests . session ()
10+ from base_spider import BaseSpider
1411
15- class JavBusSpider (object ):
16- def __init__ (self , baseUrl , houndUrl , startUrl ):
17- self .baseUrl = baseUrl
18- self .houndUrl = houndUrl
19- self .startUrl = startUrl
20-
21- pr = urlparse (self .startUrl )
22- self .host = pr .scheme + '://' + pr .netloc
23-
24- def getHeaders (self ):
25- userAgents = [
26- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36' ,
27- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3.1 Safari/605.1.15' ,
28- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' ,
29- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' ,
30- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36 Edge/18.17763'
31- ]
32- random .shuffle (userAgents )
33- headers = {
34- 'Accept-Language' : 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6' ,
35- 'Upgrade-Insecure-Requests' : '1' ,
36- 'Referer' : 'https://www.javbus.com' ,
37- 'User-Agent' : userAgents [0 ]
38- }
39- return headers
4012
41- def printException ( self , e ):
42- print ( 'Exception occurs at line %s' %
43- ( e . __traceback__ . tb_lineno . __str__ ()) )
44- print ( e )
class JavBusSpider(BaseSpider):
    """Spider for www.javbus.com; shared crawling plumbing comes from BaseSpider."""

    def __init__(self, baseUrl, houndUrl, startUrl):
        # Common setup is delegated to the base class; only the per-site
        # source tag is set here.  NOTE(review): assumes BaseSpider.__init__
        # takes (baseUrl, houndUrl, startUrl) — matches the call below.
        super().__init__(baseUrl, houndUrl, startUrl)
        self.source = 'javbus'

    def request(self, url, tries=1):
        """GET *url* through the shared (retrying) BaseSpider.request.

        The 'existmag=all' cookie is (re)set on the session before every
        request — presumably so javbus list pages show every entry;
        confirm against site behaviour.
        """
        self.session.cookies.set('existmag', 'all', path='/', domain='www.javbus.com')
        return super().request(url, tries)
7021
7122 def start (self ):
72- r = requests .get (self .baseUrl , headers = self .getHeaders ())
73-
23+ r = self .request (self .baseUrl )
7424 data = json .loads (r .text )
7525 self .ids = data .get ('ids' )
7626 self .stars = data .get ('stars' )
@@ -106,7 +56,7 @@ def start(self):
10656 print ('开始爬取系列 ' + (series .get ('aka' ) or series ['name' ]))
10757 url = self .host + '/series/' + series ['id' ]
10858 self .parseList (url )
109-
59+
11060 for genre in self .genres :
11161 if genre ['status' ] < 1 :
11262 continue
@@ -118,7 +68,7 @@ def start(self):
11868
11969 def parseList (self , url ):
12070 r = self .request (url )
121- if r == False :
71+ if r is False :
12272 return
12373
12474 pr = urlparse (url )
@@ -143,7 +93,7 @@ def parseList(self, url):
14393 href = pr .scheme + ':' + href
14494
14595 movie_id = href .split ('/' ).pop ()
146- if ( movie_id in self .ids ) :
96+ if movie_id in self .ids :
14797 continue
14898
14999 thumb = ''
@@ -178,14 +128,14 @@ def parseList(self, url):
178128 def parseMovie (self , item ):
179129 url = item ['url' ]
180130 r = self .request (url )
181- if r == False :
131+ if r is False :
182132 return
183133
184134 html = etree .HTML (r .content )
185135
186136 movie = {
187137 'id' : '' ,
188- 'source' : 'javbus' ,
138+ 'source' : self . source ,
189139 'title' : '' ,
190140 'poster' : '' ,
191141 'serial_number' : '' ,
@@ -240,7 +190,7 @@ def parseMovie(self, item):
240190 stars = html .xpath ("//div[@id='avatar-waterfall']/a" )
241191 for _star in stars :
242192 star_id = _star .attrib .get ('href' ).split ('/' ).pop ()
243- if star_names [ star_id ] is not None :
193+ if star_id in star_names :
244194 star_name = star_names [star_id ]
245195 else :
246196 try :
@@ -253,7 +203,7 @@ def parseMovie(self, item):
253203 star_avatar = ''
254204 if star_avatar .startswith ("/" ):
255205 star_avatar = self .host + star_avatar
256- star = {'id' : star_id , 'source' : 'javbus' , 'name' : star_name , 'avatar' : star_avatar }
206+ star = {'id' : star_id , 'source' : self . source , 'name' : star_name , 'avatar' : star_avatar }
257207 movie ['stars' ].append (star )
258208
259209 infos = html .xpath ("//div[@class='col-md-3 info']/p/a" )
@@ -263,70 +213,28 @@ def parseMovie(self, item):
263213 info_name = info .text
264214 if 'series' in href :
265215 movie ['series' ].append ({'id' : info_id , 'name' : info_name })
266- continue
267- if 'label' in href :
216+ elif 'label' in href :
268217 movie ['labels' ].append ({'id' : info_id , 'name' : info_name })
269- continue
270- if 'studio' in href :
218+ elif 'studio' in href :
271219 movie ['studios' ].append ({'id' : info_id , 'name' : info_name })
272- continue
273- if 'director' in href :
220+ elif 'director' in href :
274221 movie ['directors' ].append ({'id' : info_id , 'name' : info_name })
275- continue
276- print (info_id , info_name , href )
277-
278- movies = []
279- movies .append (movie )
280- data = {
281- 'type' : 'movie' ,
282- 'movies' : json .dumps (movies )
283- }
284- self .hound (data )
285222
223+ self .sendMovieData (movie )
286224 time .sleep (1 )
287225
288- class AvmooSpider (object ):
class AvmooSpider(BaseSpider):
    """Spider for avmoo list/detail pages; shared plumbing lives in BaseSpider."""

    def __init__(self, baseUrl, houndUrl, startUrl):
        # NOTE(review): assumes BaseSpider.__init__ takes
        # (baseUrl, houndUrl, startUrl) — matches the call below.
        super().__init__(baseUrl, houndUrl, startUrl)
        self.source = 'avmoo'
        # List pages that need to be crawled, seeded with the start URL.
        self.urls = [startUrl]
300235
301- def getHeaders (self ):
302- userAgents = [
303- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36' ,
304- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3.1 Safari/605.1.15' ,
305- 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' ,
306- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36' ,
307- 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36 Edge/18.17763'
308- ]
309- random .shuffle (userAgents )
310- headers = {
311- 'User-Agent' : userAgents [0 ]
312- }
313- return headers
314-
315- def hound (self , data ):
316- try :
317- r = requests .post (self .houndUrl , data , headers = self .getHeaders ())
318- print (r .content )
319- except Exception as e :
320- self .printException (e )
321-
322- def printException (self , e ):
323- print ('Exception occurs at line %s' %
324- (e .__traceback__ .tb_lineno .__str__ ()))
325- print (e )
326-
327236 def start (self ):
328- r = requests .get (self .baseUrl , headers = self .getHeaders ())
329-
237+ r = self .request (self .baseUrl )
330238 data = json .loads (r .text )
331239 self .ids = data .get ('ids' )
332240 self .stars = data .get ('stars' )
@@ -339,26 +247,9 @@ def start(self):
339247 url = self .host + '/cn/star/' + star ['id' ]
340248 self .parseList (url , star ['subscribe' ] >= 1 )
341249
342- def request (self , url , tries = 1 ):
343- if tries >= 10 :
344- print (url , 'tries>=10' )
345- return False
346- try :
347- r = requests .get (url , headers = self .getHeaders ())
348- print ('%s %s' % (r .status_code , url ))
349- if r .status_code != 200 :
350- time .sleep (5 )
351- return self .request (url , tries + 1 )
352-
353- return r
354- except Exception as e :
355- print (url , e )
356- time .sleep (5 )
357- return self .request (url , tries + 1 )
358-
359- def parseList (self , url , next , tries = 1 ):
250+ def parseList (self , url , next ):
360251 r = self .request (url )
361- if r == False :
252+ if r is False :
362253 return
363254
364255 pr = urlparse (url )
@@ -369,8 +260,7 @@ def parseList(self, url, next, tries=1):
369260 if ('/star/' in url ) & ('page' not in url ):
370261 star_id = url .split ('/' ).pop ()
371262 _infos = []
372- infos = html .xpath (
373- '//div[@class="avatar-box"]//div[@class="photo-info"]/p' )
263+ infos = html .xpath ('//div[@class="avatar-box"]//div[@class="photo-info"]/p' )
374264 for i in infos :
375265 if i .text is not None :
376266 _infos .append (i .text )
@@ -394,7 +284,7 @@ def parseList(self, url, next, tries=1):
394284 href = pr .scheme + ':' + href
395285
396286 movie_id = href .split ('/' ).pop ()
397- if ( movie_id in self .ids ) :
287+ if movie_id in self .ids :
398288 continue
399289
400290 thumb = ''
@@ -424,17 +314,17 @@ def parseList(self, url, next, tries=1):
424314 print ('next page' )
425315 self .parseList (href , next )
426316
427- def parseMovie (self , item , tries = 1 ):
317+ def parseMovie (self , item ):
428318 url = item ['url' ]
429319 r = self .request (url )
430- if r == False :
320+ if r is False :
431321 return
432322
433323 html = etree .HTML (r .content )
434324
435325 movie = {
436326 'id' : '' ,
437- 'source' : 'avmoo' ,
327+ 'source' : self . source ,
438328 'title' : '' ,
439329 'poster' : '' ,
440330 'serial_number' : '' ,
@@ -451,11 +341,9 @@ def parseMovie(self, item, tries=1):
451341
452342 movie ['id' ] = url .split ('/' ).pop ()
453343 movie ['title' ] = html .xpath ("//div[@class='container']/h3" )[0 ].text
454- movie ['poster' ] = html .xpath (
455- "//a[@class='bigImage']/img" )[0 ].attrib .get ('src' )
344+ movie ['poster' ] = html .xpath ("//a[@class='bigImage']/img" )[0 ].attrib .get ('src' )
456345
457- sample_images = html .xpath (
458- "//div[@id='sample-waterfall']//a[@class='sample-box']" )
346+ sample_images = html .xpath ("//div[@id='sample-waterfall']//a[@class='sample-box']" )
459347 for _img in sample_images :
460348 movie ['samples' ].append (_img .attrib .get ('href' ))
461349
@@ -470,7 +358,7 @@ def parseMovie(self, item, tries=1):
470358 star_avatar = _star .xpath (".//img" )[0 ].attrib .get ('src' )
471359 if 'nowprinting' in star_avatar :
472360 star_avatar = ''
473- star = {'id' : star_id , 'source' : 'avmoo' , 'name' : star_name , 'avatar' : star_avatar }
361+ star = {'id' : star_id , 'source' : self . source , 'name' : star_name , 'avatar' : star_avatar }
474362 movie ['stars' ].append (star )
475363
476364 infos = html .xpath ("//div[@class='col-md-3 info']/p" )
@@ -485,8 +373,7 @@ def parseMovie(self, item, tries=1):
485373 if nodes [0 ].text == '长度:' :
486374 movie ['duration' ] = str (nodes [1 ]).strip ()
487375
488- genres = html .xpath (
489- "//div[@class='col-md-3 info']/p/span[@class='genre']/a" )
376+ genres = html .xpath ("//div[@class='col-md-3 info']/p/span[@class='genre']/a" )
490377 for genre in genres :
491378 genre_id = genre .attrib .get ('href' ).split ('/' ).pop ()
492379 genre_name = genre .text
@@ -511,12 +398,5 @@ def parseMovie(self, item, tries=1):
511398 continue
512399 print (info_id , info_name , href )
513400
514- movies = []
515- movies .append (movie )
516- data = {
517- 'type' : 'movie' ,
518- 'movies' : json .dumps (movies )
519- }
520- self .hound (data )
521-
401+ self .sendMovieData (movie )
522402 time .sleep (1 )
0 commit comments