-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathinsertdb.py
More file actions
127 lines (121 loc) · 5.09 KB
/
insertdb.py
File metadata and controls
127 lines (121 loc) · 5.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
"""Update articles.json using information from people.json.
This will potentially add orcid and affiliations to the
authors in articles.json. people.json is much larger, so
we have to read it as streaming JSON and update articles as
we go.
The algorithm is to read articles.json and build a dict
of authorkeys containing the DBLP key for each author of
an article in articles.json. Then we stream through
people.json and update each author entry with orcid and/or
affiliations. Then we pass back over articles.json and
update the authors to contain orcid and affiliations.
"""
from pathlib import Path
import json
import pymysql
import sys
from naya import stream_array, tokenize
def get_authors(articles_file):
"""Read the articles file and return a dict mapping author keys
to a dict that contains orcid and affiliations."""
articles = json.loads(Path(articles_file).read_text(encoding='UTF-8'))
authors = dict()
for paper in articles:
auths = paper.get('authors', [])
for auth in auths:
namekey = auth['n']
if namekey in authors:
if 'orcid' in auth:
authors[namekey]['orcid'] = auth['orcid']
else:
authors[namekey] = {}
return authors
def add_allpeopledata(authors):
"""Stream through allpeople.json and update the authors dict with
affiliations and orcid when they are found in allpeople.json.
"""
with open('allpeople.json', 'r') as fp:
people = stream_array(tokenize(fp))
for person in people:
for key in person.get('namekeys', []):
if key in authors:
authors[key]['affiliations'] = person.get('affiliations', [])
if 'orcid' in person:
authors[key]['orcid'] = person['orcid']
return authors
def update_authors(articles, authors):
for article in articles:
for auth in article.get('authors'):
authkey = auth['n']
authdata = authors.get(authkey, {})
affiliations = authdata.get('affiliations')
if affiliations:
auth['affiliations'] = affiliations
if 'orcid' in authdata:
auth['orcid'] = authdata['orcid']
return articles
db = pymysql.connect(host='localhost',
passwd='dblpd@t@',
db='dblpconflicts',
user='dblpconflicts',
charset='utf8mb4')
cursor = db.cursor(pymysql.cursors.DictCursor)
authors = get_authors('articles.json')
Path('authors.json').write_text(json.dumps(authors, indent=2), encoding='UTF-8')
authors = add_allpeopledata(authors)
articles = json.loads(Path('articles.json').read_text(encoding='UTF-8'))
articles = update_authors(articles, authors)
Path('articles2.json').write_text(json.dumps(articles, indent=2), encoding='UTF-8')
counter = 0
skipcounter = 0
cursor.execute('SELECT dblpkey from article')
rows = cursor.fetchall()
dblpkeys = {row['dblpkey'] for row in rows}
for article in articles:
if article.get('key') in dblpkeys:
skipcounter += 1
else:
venue = '/'.join(article['key'].split('/')[:2])
counter += 1
if (counter % 1000 == 0):
print('counter=', counter)
args = (article['mdate'],
article['key'],
venue,
article['type'],
article['title'],
article['year'],
article['pages'],
article.get('volume'),
article.get('number'),
article.get('publisher'),
article.get('isbn'),
article.get('series'),
article.get('booktitle'),
article.get('journal'),
article.get('doi'))
cursor.execute('INSERT INTO `article` (mdate,dblpkey,venue,type,title,year,pages,volume,number,publisher,isbn,series,booktitle,journal,doi) VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)', args)
pubkey = cursor.lastrowid
authornumber = 0
for author in article['authors']:
authornumber += 1
dblpkey = author.get('n')
orcid = author.get('orcid', None)
parts = dblpkey.split(' ')
if parts[-1].isnumeric():
parts.pop()
name = ' '.join(parts)
lastname = parts[-1]
cursor.execute("SELECT authorkey FROM author WHERE dblpkey=%s", (dblpkey,))
row = cursor.fetchone()
if row:
authorkey = row['authorkey']
else:
affiliations = ';'.join(author.get('affiliations', []))
cursor.execute('INSERT INTO author (name,lastname,orcid,dblpkey,affiliations) values (%s,%s,%s,%s,%s)', (name, lastname, orcid, dblpkey, affiliations))
authorkey = cursor.lastrowid
cursor.execute('INSERT INTO authorship (pubkey,authorkey,authornumber,publishedasname) VALUES (%s,%s,%s,%s)', (pubkey,authorkey,authornumber,name))
if counter % 1000 == 0:
db.commit()
db.commit()
print('counter=', counter, 'skipcounter=', skipcounter)