-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathirresistibleforceCrawl.py
More file actions
69 lines (57 loc) · 2.06 KB
/
irresistibleforceCrawl.py
File metadata and controls
69 lines (57 loc) · 2.06 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#this project is aimed to crawl interesting security-news information for hackers
#-*- coding:utf-8 -*-
import requests
import datetime
import re
import sys
# Python-2-only hack: reload(sys) re-exposes setdefaultencoding (hidden by
# site.py) so implicit str/unicode conversions default to UTF-8, not ASCII.
reload(sys)
sys.setdefaultencoding('utf-8')
def crawlknowledgebefore(url):
    """Scrape one day's digest page in the OLD page layout.

    Args:
        url: index.html URL of a single day's security-news digest.

    Returns:
        list of str, one per entry; the "[<i> category </i>]" prefix and the
        opening <a ...> tag are each collapsed to a comma, the trailing
        "</a></p>" is stripped.
    """
    knowledgebeforelink = []
    resp = requests.get(url)
    # Old layout: entries look like "[<i> cat </i>] title <a href=...>...</a></p>".
    # Raw strings so the regex escapes are not also string escapes.
    safesystem = re.findall(r"\[<i>.*?</i>\].*>", resp.content)
    for child in safesystem:
        child = child.replace("[<i> ", "")
        child = child.replace(" </i>] ", ",")
        # re.subn returns (new_string, count); only the string is kept below.
        child = re.subn(r"<a href=.*?rel=\"nofollow\">", ",", child)
        child = child[0].replace("</a></p>", "")
        knowledgebeforelink.append(child)
    return knowledgebeforelink
def crawlknowledge(url):
    """Scrape one day's digest page in the CURRENT page layout.

    Falls back to :func:`crawlknowledgebefore` when the page uses the old
    markup (no '<span class="category">' entries found).

    Args:
        url: index.html URL of a single day's security-news digest.

    Returns:
        list of str, one per entry, with the category span and the opening
        <a ...> tag each collapsed to a comma and "</a></p>" stripped.
    """
    resp = requests.get(url)
    # re.S lets '.' span newlines: an entry's markup may wrap across lines.
    safesystem = re.findall(
        r"<span class=\"category\">.*?</span>.*?<a href=.*?</a></p>",
        resp.content, re.S)
    if not safesystem:
        # Fix: original tested `len(...) is 0` — identity comparison on an int
        # only works by CPython small-int caching; use truthiness instead.
        return crawlknowledgebefore(url)
    knowledgelink = []
    for child in safesystem:
        child = child.replace("<span class=\"category\">[ ", "")
        child = child.replace(" ]</span> ", ",")
        # re.subn returns (new_string, count); only the string is kept below.
        child = re.subn(r"<a href=.*?rel=\"nofollow\">", ",", child)
        child = child[0].replace("</a></p>", "")
        knowledgelink.append(child)
    return knowledgelink
def generaldate(start, end, step=1, format="%Y-%m-%d"):
    """Generate date strings from `start` (inclusive) to `end` (exclusive).

    Args:
        start:  first date, as a string matching `format`.
        end:    stop date (excluded from the result), same format.
        step:   stride in whole days.
        format: strptime/strftime pattern used for both parsing and output.

    Returns:
        list of str dates, every `step` days; empty when start >= end.
    """
    strptime = datetime.datetime.strptime
    days = (strptime(end, format) - strptime(start, format)).days
    # range() instead of Python-2-only xrange(): identical behavior on py2
    # and also valid on py3; the list is tiny (one entry per day anyway).
    return [(strptime(start, format) + datetime.timedelta(i)).strftime(format)
            for i in range(0, days, step)]
def main():
    """Crawl every daily digest from 2016-01-02 up to (not including) 2018-05-18.

    Appends each day's entries to knowledge.txt; URLs that fail to download
    or parse are recorded in errorurl.txt so those days can be retried.
    """
    # `with` guarantees both files are closed even if the loop dies part-way.
    with open('errorurl.txt', 'w') as errorurl, open('knowledge.txt', 'w') as f:
        for day in generaldate("2016-01-02", "2018-05-18"):
            day = day.replace("-", "/")
            url = 'https://xuanwulab.github.io/cn/secnews/' + day + '/index.html'
            try:
                print(day)
                results = crawlknowledge(url)
                f.write(day)
                f.write('\n')
                for result in results:
                    f.write(result)
                    f.write('\n')
            except Exception:
                # Bug fix: the original only printed the failing URL and left
                # errorurl.txt empty. Record it so the day can be re-crawled.
                # (Exception, not bare except: let KeyboardInterrupt through.)
                print(url)
                errorurl.write(url + '\n')
if __name__ == '__main__':
    main()