-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy patharxiv.py
More file actions
110 lines (94 loc) · 5.42 KB
/
arxiv.py
File metadata and controls
110 lines (94 loc) · 5.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/python3
from __future__ import annotations
import http.client
import logging
import socket
import time
from urllib import request, error
from xml.etree import ElementTree
import wd
class Element(wd.Element):
"""When called get_by_id() for a new pre-print, fill as many properties as possible via regular ArXiv API"""
__cache = None
def apply(self, parsed_data: Model):
super().apply(parsed_data)
if ('en' not in self.entity['labels']) and parsed_data and parsed_data.label:
self.entity['labels']['en'] = {'value': parsed_data.label, 'language': 'en'}
def obtain_claim(self, snak: dict):
if snak is not None:
if snak['property'] == 'P2093' and 'qualifiers' in snak and 'P1545' in snak['qualifiers']:
for property_id in ['P50', 'P2093']:
if property_id in self.entity['claims']:
for claim in self.entity['claims'][property_id]:
if 'qualifiers' in claim and 'P1545' in claim['qualifiers']:
if claim['qualifiers']['P1545'][0]['datavalue']['value'] == snak['qualifiers']['P1545']:
return
elif snak['property'] in self.entity['claims']:
return
return super().obtain_claim(snak)
class Model(wd.Model):
property, db_ref, item, chunk, suffix = 'P818', 'Q118398', Element, {}, 'metadataPrefix=arXiv'
def __init__(self, external_id: str, snaks: list = None):
super().__init__(external_id, snaks)
self.label, self.__doi = '', ''
@staticmethod
def arxiv_xml(query: str) -> ElementTree:
for retries in range(5):
try:
with request.urlopen((url := 'https://export.arxiv.org/' + query), timeout=180) as file:
return ElementTree.fromstring(file.read())
except error.HTTPError as e:
if e.status == 503 and 'Retry-After' in e.headers:
time.sleep(int(e.headers['Retry-After']))
continue
logging.error('While fetching {} got error: {}'.format(url, e.__str__()))
except (http.client.IncompleteRead, ConnectionResetError, socket.timeout) as e:
logging.error('While fetching {} got error: {}'.format(url, e.__str__()))
time.sleep(1800)
@classmethod
def prepare_data(cls, external_id: str) -> []:
if tree := Model.arxiv_xml('api/query?id_list=' + external_id):
model = super().prepare_data(external_id)
model.input_snaks.append(model.transform('P31', 'Q13442814'))
model.label = ' '.join(tree.findall('*/w3:title', Model.config('ns'))[0].text.split())
model.input_snaks.append(model.transform('P1476', model.label))
author_num = 0
for author in tree.findall('*/*/w3:name', Model.config('ns')):
if len(author.text.strip()) > 3:
snak = model.transform('P2093', author.text.strip())
snak['qualifiers'] = {'P1545': str(author_num := author_num + 1)}
model.input_snaks.append(snak)
if len(doi_list := tree.findall('*/arxiv:doi', Model.config('ns'))) == 1:
model.__doi = doi_list[0].text.upper()
model.input_snaks.append(model.transform('P356', model.__doi))
return model
@classmethod # -------------------- Arxiv Bulk Data Access part --------------------
def next(cls):
cls.chunk = {}
if tree := cls.arxiv_xml('oai2?verb=ListRecords&' + cls.suffix):
for pp in tree.findall('.//oa:arXiv', Model.config('ns')):
if len(lst := pp.findall('oa:doi', Model.config('ns'))) > 0:
doi = lst[0].text.split()[0].replace('\\', '').upper()
Model.chunk[pp.find('oa:id', Model.config('ns')).text] = doi
if (element := tree.find('.//oai:resumptionToken', Model.config('ns'))) is not None and element.text:
cls.suffix = 'resumptionToken=' + element.text
else:
return None
return cls.chunk.keys()
def get_qid(self):
if self.__doi and (result := Element.haswbstatement(self.__doi, 'P356')): # Found by DOI
self.input_snaks = [wd.Wikidata.create_snak(self.property, self.external_id)] # only ArXiv-ID needed
return result
if Model.initialize(__file__): # if not imported
# Model.get_by_id('2110.15392', forced=True).save() # Uncomment to debug processing single preprint
SUMMARY = 'extracted from [[Q118398]] based on [[Property:{}]]: {}'
QUERY = 'SELECT ?c ?i {{VALUES ?c {{\'{}\'}} ?i p:P356/ps:P356 ?c MINUS {{?i p:P818 []; p:P356 []}}}}'
no_doi_items = wd.Wikidata.query('SELECT ?c ?i {?i p:P818/ps:P818 ?c MINUS {?i p:P818 []; p:P356 []}}')
while Model.next() is not None:
no_arxiv_items = wd.Wikidata.query(QUERY.format('\' \''.join(Model.chunk.values())))
for p818 in Model.chunk:
if p356 := Model.chunk[p818]:
if no_doi_items and (qid := no_doi_items.pop(p818, 0)): # DOI is absent
wd.Claim.construct(wd.Wikidata.create_snak('P356', p356), qid).save(SUMMARY.format('P818', p818))
elif no_arxiv_items and (qid := no_arxiv_items.pop(p356, 0)): # Arxiv is absent
wd.Claim.construct(wd.Wikidata.create_snak('P818', p818), qid).save(SUMMARY.format('P356', p356))