
Commit d18a2b1

Inherit from the BaseSpider base class
1 parent 599b094 commit d18a2b1
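This commit folds the request/retry, header, and reporting plumbing that JavBusSpider and AvmooSpider previously duplicated into a shared BaseSpider. base_spider.py itself is not part of this diff; judging from the call sites, it must provide self.session, self.host, self.source, request() with retries, and sendMovieData(). A minimal sketch of that interface, assuming only what the call sites and the removed code show (everything beyond that is a guess):

# Hypothetical reconstruction of base_spider.py -- not part of this commit.
# Only the names used by the subclasses (session, host, source, request,
# sendMovieData) are grounded in the diff; the rest mirrors the removed code.
import json
import random
import time
from urllib.parse import urlparse

import requests


class BaseSpider(object):
    def __init__(self, baseUrl, houndUrl, startUrl):
        self.baseUrl = baseUrl
        self.houndUrl = houndUrl
        self.startUrl = startUrl
        self.source = ''

        # Per-instance session replaces the old module-level `session`.
        self.session = requests.session()

        pr = urlparse(startUrl)
        self.host = pr.scheme + '://' + pr.netloc

    def getHeaders(self):
        # Rotate desktop user agents, as the removed getHeaders() methods did.
        userAgents = [
            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
        ]
        return {'User-Agent': random.choice(userAgents)}

    def request(self, url, tries=1):
        # GET with retry, as in the removed per-spider request() methods:
        # give up after 10 tries and return False, sleep 5s between retries.
        if tries >= 10:
            print(url, 'tries>=10')
            return False
        try:
            r = self.session.get(url, headers=self.getHeaders())
            print('%s %s' % (r.status_code, url))
            if r.status_code != 200:
                time.sleep(5)
                return self.request(url, tries + 1)
            return r
        except Exception as e:
            print(url, e)
            time.sleep(5)
            return self.request(url, tries + 1)

    def hound(self, data):
        # POST scraped data to the collector endpoint.
        try:
            r = requests.post(self.houndUrl, data, headers=self.getHeaders())
            print(r.content)
        except Exception as e:
            self.printException(e)

    def sendMovieData(self, movie):
        # Build the payload shape the removed inline code assembled by hand.
        self.hound({'type': 'movie', 'movies': json.dumps([movie])})

    def printException(self, e):
        print('Exception occurs at line %s' % e.__traceback__.tb_lineno)
        print(e)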

1 file changed: 35 additions and 155 deletions

File tree

crawler/av-spider/spider.py

@@ -1,76 +1,26 @@
 # coding=utf-8

 import json
-import random
-import requests
 import time
 from datetime import datetime
-from itertools import count
 from lxml import etree
 from multiprocessing.dummy import Pool
 from urllib.parse import urlparse

-session = requests.session()
+from base_spider import BaseSpider

-class JavBusSpider(object):
-    def __init__(self, baseUrl, houndUrl, startUrl):
-        self.baseUrl = baseUrl
-        self.houndUrl = houndUrl
-        self.startUrl = startUrl
-
-        pr = urlparse(self.startUrl)
-        self.host = pr.scheme + '://' + pr.netloc
-
-    def getHeaders(self):
-        userAgents = [
-            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
-            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3.1 Safari/605.1.15',
-            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
-            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
-            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36 Edge/18.17763'
-        ]
-        random.shuffle(userAgents)
-        headers = {
-            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
-            'Upgrade-Insecure-Requests': '1',
-            'Referer': 'https://www.javbus.com',
-            'User-Agent': userAgents[0]
-        }
-        return headers

-    def printException(self, e):
-        print('Exception occurs at line %s' %
-              (e.__traceback__.tb_lineno.__str__()))
-        print(e)
+class JavBusSpider(BaseSpider):
+    def __init__(self, baseUrl, houndUrl, startUrl):
+        super(JavBusSpider, self).__init__(baseUrl, houndUrl, startUrl)
+        self.source = 'javbus'

     def request(self, url, tries=1):
-        if tries >= 10:
-            print(url, 'tries>=10')
-            return False
-        try:
-            session.cookies.set('existmag', 'all', path='/', domain='www.javbus.com')
-            r = session.get(url, headers=self.getHeaders())
-            print('%s %s' % (r.status_code, url))
-            if r.status_code != 200:
-                time.sleep(5)
-                return self.request(url, tries + 1)
-
-            return r
-        except Exception as e:
-            print(url, e)
-            time.sleep(5)
-            return self.request(url, tries + 1)
-
-    def hound(self, data):
-        try:
-            r = requests.post(self.houndUrl, data, headers=self.getHeaders())
-            print(r.content)
-        except Exception as e:
-            self.printException(e)
+        self.session.cookies.set('existmag', 'all', path='/', domain='www.javbus.com')
+        return super(JavBusSpider, self).request(url, tries)

     def start(self):
-        r = requests.get(self.baseUrl, headers=self.getHeaders())
-
+        r = self.request(self.baseUrl)
         data = json.loads(r.text)
         self.ids = data.get('ids')
         self.stars = data.get('stars')
@@ -106,7 +56,7 @@ def start(self):
             print('Start crawling series ' + (series.get('aka') or series['name']))
             url = self.host + '/series/' + series['id']
             self.parseList(url)
-
+
         for genre in self.genres:
             if genre['status'] < 1:
                 continue
@@ -118,7 +68,7 @@ def start(self):

     def parseList(self, url):
         r = self.request(url)
-        if r == False:
+        if r is False:
             return

         pr = urlparse(url)
@@ -143,7 +93,7 @@ def parseList(self, url):
                 href = pr.scheme + ':' + href

             movie_id = href.split('/').pop()
-            if (movie_id in self.ids):
+            if movie_id in self.ids:
                 continue

             thumb = ''
@@ -178,14 +128,14 @@
     def parseMovie(self, item):
         url = item['url']
         r = self.request(url)
-        if r == False:
+        if r is False:
             return

         html = etree.HTML(r.content)

         movie = {
             'id': '',
-            'source': 'javbus',
+            'source': self.source,
             'title': '',
             'poster': '',
             'serial_number': '',
@@ -240,7 +190,7 @@ def parseMovie(self, item):
         stars = html.xpath("//div[@id='avatar-waterfall']/a")
         for _star in stars:
             star_id = _star.attrib.get('href').split('/').pop()
-            if star_names[star_id] is not None:
+            if star_id in star_names:
                 star_name = star_names[star_id]
             else:
                 try:
@@ -253,7 +203,7 @@ def parseMovie(self, item):
                     star_avatar = ''
             if star_avatar.startswith("/"):
                 star_avatar = self.host + star_avatar
-            star = {'id': star_id, 'source': 'javbus', 'name': star_name, 'avatar': star_avatar}
+            star = {'id': star_id, 'source': self.source, 'name': star_name, 'avatar': star_avatar}
             movie['stars'].append(star)

         infos = html.xpath("//div[@class='col-md-3 info']/p/a")
@@ -263,70 +213,28 @@ def parseMovie(self, item):
             info_name = info.text
             if 'series' in href:
                 movie['series'].append({'id': info_id, 'name': info_name})
-                continue
-            if 'label' in href:
+            elif 'label' in href:
                 movie['labels'].append({'id': info_id, 'name': info_name})
-                continue
-            if 'studio' in href:
+            elif 'studio' in href:
                 movie['studios'].append({'id': info_id, 'name': info_name})
-                continue
-            if 'director' in href:
+            elif 'director' in href:
                 movie['directors'].append({'id': info_id, 'name': info_name})
-                continue
-            print(info_id, info_name, href)
-
-        movies = []
-        movies.append(movie)
-        data = {
-            'type': 'movie',
-            'movies': json.dumps(movies)
-        }
-        self.hound(data)

+        self.sendMovieData(movie)
         time.sleep(1)

-class AvmooSpider(object):
+class AvmooSpider(BaseSpider):
     def __init__(self, baseUrl, houndUrl, startUrl):
-        self.baseUrl = baseUrl
-        self.houndUrl = houndUrl
-
-        pr = urlparse(startUrl)
-        self.host = pr.scheme + '://' + pr.netloc
+        super(AvmooSpider, self).__init__(baseUrl, houndUrl, startUrl)
+        self.source = 'avmoo'

         # List pages to crawl
         self.urls = [
             startUrl
         ]

-    def getHeaders(self):
-        userAgents = [
-            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
-            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.3.1 Safari/605.1.15',
-            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
-            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36',
-            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/104.0.0.0 Safari/537.36 Edge/18.17763'
-        ]
-        random.shuffle(userAgents)
-        headers = {
-            'User-Agent': userAgents[0]
-        }
-        return headers
-
-    def hound(self, data):
-        try:
-            r = requests.post(self.houndUrl, data, headers=self.getHeaders())
-            print(r.content)
-        except Exception as e:
-            self.printException(e)
-
-    def printException(self, e):
-        print('Exception occurs at line %s' %
-              (e.__traceback__.tb_lineno.__str__()))
-        print(e)
-
     def start(self):
-        r = requests.get(self.baseUrl, headers=self.getHeaders())
-
+        r = self.request(self.baseUrl)
         data = json.loads(r.text)
         self.ids = data.get('ids')
         self.stars = data.get('stars')
@@ -339,26 +247,9 @@ def start(self):
             url = self.host + '/cn/star/' + star['id']
             self.parseList(url, star['subscribe'] >= 1)

-    def request(self, url, tries=1):
-        if tries >= 10:
-            print(url, 'tries>=10')
-            return False
-        try:
-            r = requests.get(url, headers=self.getHeaders())
-            print('%s %s' % (r.status_code, url))
-            if r.status_code != 200:
-                time.sleep(5)
-                return self.request(url, tries + 1)
-
-            return r
-        except Exception as e:
-            print(url, e)
-            time.sleep(5)
-            return self.request(url, tries + 1)
-
-    def parseList(self, url, next, tries=1):
+    def parseList(self, url, next):
         r = self.request(url)
-        if r == False:
+        if r is False:
             return

         pr = urlparse(url)
@@ -369,8 +260,7 @@ def parseList(self, url, next, tries=1):
         if ('/star/' in url) & ('page' not in url):
             star_id = url.split('/').pop()
             _infos = []
-            infos = html.xpath(
-                '//div[@class="avatar-box"]//div[@class="photo-info"]/p')
+            infos = html.xpath('//div[@class="avatar-box"]//div[@class="photo-info"]/p')
             for i in infos:
                 if i.text is not None:
                     _infos.append(i.text)
@@ -394,7 +284,7 @@ def parseList(self, url, next, tries=1):
                 href = pr.scheme + ':' + href

             movie_id = href.split('/').pop()
-            if (movie_id in self.ids):
+            if movie_id in self.ids:
                 continue

             thumb = ''
@@ -424,17 +314,17 @@
             print('next page')
             self.parseList(href, next)

-    def parseMovie(self, item, tries=1):
+    def parseMovie(self, item):
         url = item['url']
         r = self.request(url)
-        if r == False:
+        if r is False:
             return

         html = etree.HTML(r.content)

         movie = {
             'id': '',
-            'source': 'avmoo',
+            'source': self.source,
             'title': '',
             'poster': '',
             'serial_number': '',
@@ -451,11 +341,9 @@ def parseMovie(self, item, tries=1):

         movie['id'] = url.split('/').pop()
         movie['title'] = html.xpath("//div[@class='container']/h3")[0].text
-        movie['poster'] = html.xpath(
-            "//a[@class='bigImage']/img")[0].attrib.get('src')
+        movie['poster'] = html.xpath("//a[@class='bigImage']/img")[0].attrib.get('src')

-        sample_images = html.xpath(
-            "//div[@id='sample-waterfall']//a[@class='sample-box']")
+        sample_images = html.xpath("//div[@id='sample-waterfall']//a[@class='sample-box']")
         for _img in sample_images:
             movie['samples'].append(_img.attrib.get('href'))

@@ -470,7 +358,7 @@
             star_avatar = _star.xpath(".//img")[0].attrib.get('src')
             if 'nowprinting' in star_avatar:
                 star_avatar = ''
-            star = {'id': star_id, 'source': 'avmoo', 'name': star_name, 'avatar': star_avatar}
+            star = {'id': star_id, 'source': self.source, 'name': star_name, 'avatar': star_avatar}
             movie['stars'].append(star)

         infos = html.xpath("//div[@class='col-md-3 info']/p")
@@ -485,8 +373,7 @@
             if nodes[0].text == '长度:':
                 movie['duration'] = str(nodes[1]).strip()

-        genres = html.xpath(
-            "//div[@class='col-md-3 info']/p/span[@class='genre']/a")
+        genres = html.xpath("//div[@class='col-md-3 info']/p/span[@class='genre']/a")
         for genre in genres:
             genre_id = genre.attrib.get('href').split('/').pop()
             genre_name = genre.text
@@ -511,12 +398,5 @@
                 continue
             print(info_id, info_name, href)

-        movies = []
-        movies.append(movie)
-        data = {
-            'type': 'movie',
-            'movies': json.dumps(movies)
-        }
-        self.hound(data)
-
+        self.sendMovieData(movie)
         time.sleep(1)
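
With this refactor, both spiders share one constructor signature, so a driver script would presumably look something like the sketch below (the endpoint URLs are placeholders, not values from this commit):

# Hypothetical usage sketch; baseUrl/houndUrl/startUrl values are placeholders.
from spider import JavBusSpider

if __name__ == '__main__':
    spider = JavBusSpider(
        baseUrl='https://example.com/api/meta',    # JSON with ids, stars, ...
        houndUrl='https://example.com/api/hound',  # collector endpoint
        startUrl='https://www.javbus.com',
    )
    spider.start()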
