-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathserver.py
More file actions
80 lines (69 loc) · 2.96 KB
/
server.py
File metadata and controls
80 lines (69 loc) · 2.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python
# Copyright (c) Twisted Matrix Laboratories.
# See LICENSE for details.
from twisted.spread import pb
from twisted.internet import reactor
from collections import deque
import subprocess,psutil,random,math
class CrawlerServer(pb.Root):
    """Perspective Broker root that hands scrap jobs to spider worker processes.

    Jobs (scrapParams dicts) wait in a FIFO queue. While fewer than
    maxSpiders workers are running, a worker process is spawned for the next
    job. The job is parked in reservedParams under a numeric id that is also
    passed to the worker on its command line, and is delivered once the
    worker connects back and reports that id.
    """

    def __init__(self, maxSpiders, workerPath, handleData):
        """Set up an empty job queue.

        maxSpiders -- upper bound on simultaneously running worker processes.
        workerPath -- script path handed to the interpreter for each worker.
        handleData -- callable invoked with the response dict of every spider
            that reports back without an 'error' key.
        """
        self.maxSpiders = maxSpiders
        self.workerPath = workerPath
        self.handleData = handleData
        self.scrapParamsQueue = deque()
        self.spidersRunning = 0
        self.reservedParams = {}
        # Created before checkSpidersAvailable() runs, because that method
        # reads it.
        # NOTE(review): nothing in this class ever adds to this list;
        # presumably the owner (e.g. handleData) records finished urls here
        # -- confirm.
        self.scrapedsUrls = []
        self.checkSpidersAvailable()

    def appendParams(self, paramsList):
        """Queue new jobs and start spiders for them if slots are free.

        paramsList -- iterable of scrapParams dicts. Each dict must contain
            'url' and 'templatePath' (the template file the worker uses to
            do the actual scraping).
        """
        self.scrapParamsQueue.extend(paramsList)
        self.checkSpidersAvailable()

    def remote_spiderResponse(self, kwargs):
        """Handle the result a spider reports when it has finished.

        kwargs -- dict sent by the spider:
            'scrapParams' (mandatory): the same dict the spider received in
                its start() call, including the 'pid' added at launch.
            'error' (optional): message; it is printed and the job is put
                back on the queue.
            'links2add' (optional): list of scrapParams dicts to queue.
            Any other keys are left for handleData to interpret.
        """
        scrapParams = kwargs['scrapParams']
        try:
            psutil.Process(scrapParams['pid']).terminate()  # or .kill()
        except psutil.NoSuchProcess:
            # The worker already exited on its own. Its slot must still be
            # released below, otherwise the pool would shrink permanently.
            pass
        self.spidersRunning -= 1
        if 'error' in kwargs:
            print(kwargs['error'])
            # Re-queue the dict itself: queue entries are indexed with
            # params['url'], so wrapping it in a list would break dispatch.
            self.scrapParamsQueue.append(scrapParams)
        else:
            if 'links2add' in kwargs:
                self.appendParams(kwargs['links2add'])
            self.handleData(kwargs)
        self.checkSpidersAvailable()

    def checkSpidersAvailable(self):
        """Spawn a detached worker for each queued job while slots are free.

        Jobs whose 'url' is already in scrapedsUrls are dropped. Each started
        job gets a 'pid' key holding the worker's process id and is stored in
        reservedParams until the worker connects.

        Raises OSError if the worker process cannot be started; the job is
        put back at the front of the queue first.
        """
        while self.spidersRunning < self.maxSpiders and self.scrapParamsQueue:
            params = self._popUnscrapedParams()
            if params is None:
                break
            spiderId = self._newSpiderId()
            args = ["python.exe", self.workerPath, str(spiderId)]
            try:
                pid = subprocess.Popen(args, close_fds=True).pid
            except OSError:
                # Keep the job and leave the counters untouched so a failed
                # launch neither loses work nor leaks a slot.
                self.scrapParamsQueue.appendleft(params)
                raise
            params['pid'] = pid
            # Safe to register after the launch: the worker's connection is
            # only processed by the reactor once this method has returned.
            self.reservedParams[spiderId] = params
            self.spidersRunning += 1

    def _popUnscrapedParams(self):
        """Pop queued jobs until one with an unscraped url is found, else None."""
        while self.scrapParamsQueue:
            params = self.scrapParamsQueue.popleft()
            if params['url'] not in self.scrapedsUrls:
                return params
        return None

    def _newSpiderId(self):
        """Return a random 4-digit id that is not currently reserved."""
        while True:
            spiderId = random.randint(1000, 9999)
            # A reused id would overwrite the job of a worker that has been
            # launched but has not connected yet.
            if spiderId not in self.reservedParams:
                return spiderId

    def remote_spiderConnected(self, spider):
        """Deliver the reserved job to a spider that has just connected.

        spider -- remote reference to the worker. It is asked for its "id";
            the answer (converted to int) selects the entry in
            reservedParams, which is sent through a remote "start" call and
            then removed from the reservation table.
        """
        def sendReservedParams(spiderId):
            key = int(spiderId)
            spider.callRemote('start', self.reservedParams[key])
            del self.reservedParams[key]

        d = spider.callRemote("id")
        d.addCallback(sendReservedParams)
def start(server, port=8800):
    """Publish *server* over Perspective Broker and run the reactor.

    server -- pb.Root instance to expose (e.g. a CrawlerServer).
    port -- TCP port to listen on; defaults to 8800, the value that used to
        be hard-coded here.

    Does not return until the reactor stops running.
    """
    reactor.listenTCP(port, pb.PBServerFactory(server))
    reactor.run()