-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcollector.py
More file actions
95 lines (86 loc) · 3.79 KB
/
collector.py
File metadata and controls
95 lines (86 loc) · 3.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/python
# -*- coding: utf-8 -*-
import urllib
import xml.sax
import urllib2
from xml.dom.minidom import parse
import xml.dom.minidom
import re
# Collector configuration.
# journalDict maps a journal code (used in the request URLs) to the journal's
# display name (used as the base name of the CSV output file).
# NOTE: the display name is a non-ASCII literal, so under Python 2 this file
# needs a source-encoding declaration (PEP 263) on its first or second line.
journalDict = {}
journalDict['KXTB'] = "科学通报"
journalName = 'KXTB'  # journal code to collect
year = "2015"  # kept as a string: it is compared with text and concatenated into URLs

# Create the CSV output file. The name gets a real ".csv" extension, and the
# `with` block guarantees the handle is closed even if a write fails.
with open(journalDict[journalName] + '.csv', 'w') as outputFile:
    outputFile.write('year,issue,articlTitle,autors,orgns\n')
    # Placeholder sample row: one value per header column, comma-separated so
    # the row actually lines up with the header.
    outputFile.write(','.join(['2015', '01', 'test1', 'autor1 author2', 'orgn1']) + '\n')
    # Begins a second placeholder row; nothing else is written to it here.
    outputFile.write('2015')
### request the year list
# Ask the journal-navigation endpoint for the year/issue listing of the
# configured journal code.
url = "http://navi.cnki.net/knavi/JournalDetail/GetJournalYearList?pcode=CJFD&pykm="+journalName+"&pIdx=0"
req = urllib2.Request(url)
resData = urllib2.urlopen(req)
try:
    res = resData.read()
finally:
    # Release the HTTP connection even if reading the body fails.
    resData.close()

### write the response buffer to a temp file
# `with` closes the handle even if the write raises.
with open('temp.xml', 'w') as file_object:
    file_object.write(res)

### read the temp file back and parse it as XML
DOMTree = xml.dom.minidom.parse("temp.xml")
collection = DOMTree.documentElement
# Every <dl> element of the response; the loop that follows treats each one
# as the entry for a single year.
years = collection.getElementsByTagName("dl")
### year list
# Walk the year entries, and for the configured year visit every issue and
# probe its article detail pages one by one until a page without a title is
# returned. The scraped title, author and organisation fragments are only
# printed; this loop does not write them to any file.

# Patterns are compiled once, outside the per-article loop.
# NOTE: no DOTALL flag is used, so '.' does not match a newline and an
# element is only found when its opening and closing tags are on one line.
titlePattern = re.compile(r'<h2\s*class="title".*?h2>')
authorPattern = re.compile(r'<div\s*class="author".*?div>')
orgnPattern = re.compile(r'<div\s*class="orgn".*?div>')
# Matches '>text<' runs; every match keeps its surrounding '>' and '<'.
textPattern = re.compile(r'>[^<][^>]+?<')
detailUrlPrefix = "http://kns.cnki.net/kcms/detail/detail.aspx?dbcode=CJFD&filename="


def _fragmentTexts(divPattern, page):
    """Return the '>text<' runs inside the first match of divPattern in page.

    Returns an empty list when divPattern does not match, so a page that has
    a title but lacks this section is skipped instead of raising
    AttributeError on a None match.
    """
    divMatch = divPattern.search(page)
    if divMatch is None:
        return []
    return textPattern.findall(divMatch.group())


for yearItem in years:
    em = yearItem.getElementsByTagName("em")
    # The text of the first <em> is compared against the configured year.
    yearNum = em.item(0).firstChild.data
    print(yearNum)
    if yearNum != year:
        continue

    ### issue list
    for dd in yearItem.getElementsByTagName("dd"):
        for aItem in dd.getElementsByTagName("a"):
            # The issue identifier is the anchor id minus its first six
            # characters (presumably a fixed prefix -- TODO confirm).
            issue = aItem.getAttribute("id")[6:]
            print(" ".join(["| ", issue]))

            # Article numbers start at 1 and are probed sequentially; the
            # first page without a title ends the issue.
            articlIndex = 1
            while True:
                # The article number is zero-padded to at least three digits.
                articlDetailUrl = (detailUrlPrefix + journalName + year + issue
                                   + str(articlIndex).zfill(3)
                                   + "&dbname=CJFDLAST" + year)
                print(articlDetailUrl)
                resData = urllib2.urlopen(urllib2.Request(articlDetailUrl))
                try:
                    articlRes = resData.read()
                finally:
                    # Do not leak one open connection per probed article.
                    resData.close()

                titleObj = titlePattern.search(articlRes)
                if titleObj is None:
                    break
                print(titleObj.group())
                for author in _fragmentTexts(authorPattern, articlRes):
                    print(author)
                for orgn in _fragmentTexts(orgnPattern, articlRes):
                    print(orgn)
                # TODO: fund information (the paragraph containing
                # "catalog_FUND") is not extracted yet.
                articlIndex += 1