Skip to content

Commit 2086028

Browse files
author
Steve Baskauf
authored
Merge pull request #8 from HeardLibrary/v1-4
v1.4 release
2 parents cc7609d + 3571972 commit 2086028

5 files changed

Lines changed: 317 additions & 46 deletions

File tree

vanderbot/README.md

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ Here are some queries that can be run to explore the data:
3232

3333
[Number of clinical trials at Vanderbilt by principal investigator](https://w.wiki/XKK)
3434

35-
The current release is [v1.3](https://github.com/HeardLibrary/linked-data/releases/tag/v1.3).
35+
The current release is [v1.4](https://github.com/HeardLibrary/linked-data/releases/tag/v1.4).
3636

3737
## How it works
3838

@@ -138,5 +138,11 @@ In the case where there are no reference properties, there also isn't any refere
138138
If there are reference property combinations other than this, the `generate_statement_data()` function can't be used and custom code must be written for that statement.
139139

140140

141+
## Release v1.4 (2020-08-17) notes
142+
143+
The changes made in this release were made following tests that used the `csv-metadata.json` mapping schema to emit RDF from the source CSV tables. In order to make it possible to create all of the kinds of statements present in the Wikidata data model, the `csv-metadata.json` file and `vb6_upload_wikidata.py` script were changed to use the `ps:` namespace (`http://www.wikidata.org/prop/statement/`) properties rather than the `wdt:` namespace properties. This makes it possible to construct the missing `wdt:` statements using SPARQL CONSTRUCT. [A new script](https://github.com/HeardLibrary/linked-data/blob/master/vanderbot/generate_direct_props.py) materializes those triples by sending a CONSTRUCT query to a SPARQL endpoint whose triplestore contains the triples generated by the schema. Those materialized triples are then loaded into the triplestore, making it possible to perform queries on any graph pattern that can be used at the Wikidata Query Service SPARQL endpoint.
144+
145+
The first five scripts were not changed in this release.
146+
141147
----
142-
Revised 2020-04-23
148+
Revised 2020-08-17

vanderbot/csv-metadata.json

Lines changed: 8 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,8 @@
5959
"titles": "orcid",
6060
"name": "orcid",
6161
"datatype": "string",
62-
"aboutUrl": "http://www.wikidata.org/entity/{wikidataId}",
63-
"propertyUrl": "http://www.wikidata.org/prop/direct/P496"
62+
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{orcidStatementUuid}",
63+
"propertyUrl": "http://www.wikidata.org/prop/statement/P496"
6464
},
6565
{
6666
"titles": "orcidReferenceHash",
@@ -89,8 +89,8 @@
8989
"titles": "employer",
9090
"name": "employer",
9191
"datatype": "string",
92-
"aboutUrl": "http://www.wikidata.org/entity/{wikidataId}",
93-
"propertyUrl": "http://www.wikidata.org/prop/direct/P108",
92+
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{employerStatementUuid}",
93+
"propertyUrl": "http://www.wikidata.org/prop/statement/P108",
9494
"valueUrl": "http://www.wikidata.org/entity/{employer}"
9595
},
9696
{
@@ -128,8 +128,8 @@
128128
"titles": "affiliation",
129129
"name": "affiliation",
130130
"datatype": "string",
131-
"aboutUrl": "http://www.wikidata.org/entity/{wikidataId}",
132-
"propertyUrl": "http://www.wikidata.org/prop/direct/P1416",
131+
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{affiliationStatementUuid}",
132+
"propertyUrl": "http://www.wikidata.org/prop/statement/P1416",
133133
"valueUrl": "http://www.wikidata.org/entity/{affiliation}"
134134
},
135135
{
@@ -167,8 +167,8 @@
167167
"titles": "instanceOf",
168168
"name": "instanceOf",
169169
"datatype": "string",
170-
"aboutUrl": "http://www.wikidata.org/entity/{wikidataId}",
171-
"propertyUrl": "http://www.wikidata.org/prop/direct/P31",
170+
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{instanceOfUuid}",
171+
"propertyUrl": "http://www.wikidata.org/prop/statement/P31",
172172
"valueUrl": "http://www.wikidata.org/entity/{instanceOf}"
173173
},
174174
{
@@ -183,34 +183,6 @@
183183
"titles": "sexOrGenderQId",
184184
"name": "sexOrGenderQId",
185185
"datatype": "string",
186-
"aboutUrl": "http://www.wikidata.org/entity/{wikidataId}",
187-
"propertyUrl": "http://www.wikidata.org/prop/direct/P21",
188-
"valueUrl": "http://www.wikidata.org/entity/{sexOrGenderQId}"
189-
},
190-
{
191-
"name": "employerPropertyStatement",
192-
"virtual": true,
193-
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{employerStatementUuid}",
194-
"propertyUrl": "http://www.wikidata.org/prop/statement/P108",
195-
"valueUrl": "http://www.wikidata.org/entity/{employer}"
196-
},
197-
{
198-
"name": "affiliationPropertyStatement",
199-
"virtual": true,
200-
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{affiliationStatementUuid}",
201-
"propertyUrl": "http://www.wikidata.org/prop/statement/P1416",
202-
"valueUrl": "http://www.wikidata.org/entity/{affiliation}"
203-
},
204-
{
205-
"name": "instanceOfPropertyStatement",
206-
"virtual": true,
207-
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{instanceOfUuid}",
208-
"propertyUrl": "http://www.wikidata.org/prop/statement/P31",
209-
"valueUrl": "http://www.wikidata.org/entity/{instanceOf}"
210-
},
211-
{
212-
"name": "sexOrGenderPropertyStatement",
213-
"virtual": true,
214186
"aboutUrl": "http://www.wikidata.org/entity/statement/{wikidataId}-{sexOrGenderUuid}",
215187
"propertyUrl": "http://www.wikidata.org/prop/statement/P21",
216188
"valueUrl": "http://www.wikidata.org/entity/{sexOrGenderQId}"
Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 43,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import requests # best library to manage HTTP transactions\n",
10+
"from bs4 import BeautifulSoup # web-scraping library\n",
11+
"import json\n",
12+
"from time import sleep\n",
13+
"import csv\n",
14+
"import math\n",
15+
"from fuzzywuzzy import fuzz # fuzzy logic matching\n",
16+
"from fuzzywuzzy import process\n",
17+
"import xml.etree.ElementTree as et # library to traverse XML tree\n",
18+
"import urllib\n",
19+
"import datetime\n",
20+
"import string\n",
21+
"from pathlib import Path\n",
22+
"\n",
23+
"# ---------------\n",
24+
"# Configuration data\n",
25+
"# ---------------\n",
26+
"\n",
27+
"graph_name = 'http://nursing'\n",
28+
"accept_media_type = 'text/turtle'\n",
29+
"sparql_endpoint = \"https://sparql.vanderbilt.edu/sparql\"\n",
30+
"request_header_dictionary = {\n",
31+
" #'Content-Type': 'application/sparql-query',\n",
32+
" 'Accept' : accept_media_type\n",
33+
"}\n",
34+
"\n",
35+
"# Load endpoint password from file in home directory\n",
36+
"directory = 'home'\n",
37+
"filename = 'sparql_vanderbilt_edu_password.txt'\n",
38+
"pwd = load_credential(filename, directory)\n",
39+
"\n",
40+
"# ---------------\n",
41+
"# Function definitions\n",
42+
"# ---------------\n",
43+
"\n",
44+
"# Load password from local drive\n",
45+
"# value of directory should be either 'home' or 'working'\n",
46+
"def load_credential(filename, directory):\n",
47+
" cred = ''\n",
48+
" # to change the script to look for the credential in the working directory, change the value of home to empty string\n",
49+
" if directory == 'home':\n",
50+
" home = str(Path.home()) #gets path to home directory; supposed to work for Win and Mac\n",
51+
" credential_path = home + '/' + filename\n",
52+
" else:\n",
53+
" directory = 'working'\n",
54+
" credential_path = filename\n",
55+
" try:\n",
56+
" with open(credential_path, 'rt', encoding='utf-8') as file_object:\n",
57+
" cred = file_object.read()\n",
58+
" except:\n",
59+
" print(filename + ' file not found - is it in your ' + directory + ' directory?')\n",
60+
" exit()\n",
61+
" return(cred)\n",
62+
"\n",
63+
"def retrieve_direct_statements(sparql_endpoint):\n",
64+
" query = '''\n",
65+
"construct {?item ?directProp ?value.}\n",
66+
"from <''' + graph_name + '''>\n",
67+
"where {\n",
68+
" ?item ?p ?statement.\n",
69+
" ?statement ?ps ?value.\n",
70+
" filter(substr(str(?ps),1,39)=\"http://www.wikidata.org/prop/statement/\")\n",
71+
" bind(substr(str(?ps),40) as ?id)\n",
72+
" bind(substr(str(?p),30) as ?id)\n",
73+
" bind(iri(concat(\"http://www.wikidata.org/prop/direct/\", ?id)) as ?directProp)\n",
74+
" }\n",
75+
"'''\n",
76+
" results = []\n",
77+
" r = requests.get(sparql_endpoint, params={'query' : query}, headers=request_header_dictionary)\n",
78+
" return r.text\n",
79+
"\n",
80+
"def perform_sparql_update(sparql_endpoint, pwd, update_command):\n",
81+
" # SPARQL Update requires HTTP POST\n",
82+
" hdr = {'Content-Type' : 'application/sparql-update'}\n",
83+
" r = requests.post(sparql_endpoint, auth=('admin', pwd), headers=hdr, data = update_command)\n",
84+
" print(str(r.status_code) + ' ' + r.url)\n",
85+
" print(r.text)\n"
86+
]
87+
},
88+
{
89+
"cell_type": "code",
90+
"execution_count": 44,
91+
"metadata": {},
92+
"outputs": [],
93+
"source": [
94+
"# ---------------\n",
95+
"# Construct the direct property statements entailed by the Wikibase model and retrieve from endpoint \n",
96+
"# ---------------\n",
97+
"\n",
98+
"graph_text = retrieve_direct_statements(sparql_endpoint)\n",
99+
"#print(graph_text)\n",
100+
"print('constructed triples retrieved')"
101+
]
102+
},
103+
{
104+
"cell_type": "code",
105+
"execution_count": 45,
106+
"metadata": {},
107+
"outputs": [],
108+
"source": [
109+
"# remove prefixes from response Turtle, which are not necessary since IRIs are unabbreviated\n",
110+
"graph_text_list = graph_text.split('\\n')\n",
111+
"# print(graph_text_list)\n",
112+
"graph_text = ''\n",
113+
"for line in graph_text_list:\n",
114+
" try:\n",
115+
" if line[0] != '@':\n",
116+
" graph_text += line + '\\n'\n",
117+
" except:\n",
118+
" pass\n",
119+
"#print()\n",
120+
"#print(graph_text)"
121+
]
122+
},
123+
{
124+
"cell_type": "code",
125+
"execution_count": 46,
126+
"metadata": {},
127+
"outputs": [
128+
{
129+
"name": "stdout",
130+
"output_type": "stream",
131+
"text": [
132+
"200 https://sparql.vanderbilt.edu/sparql\n",
133+
"<!DOCTYPE HTML PUBLIC \"-//W3C//DTD HTML 4.01 Transitional//EN\" \"http://www.w3.org/TR/html4/loose.dtd\"><html><head><meta http-equiv=\"Content-Type\" content=\"text&#47;html;charset=UTF-8\"><title>blazegraph&trade; by SYSTAP</title\n",
134+
"></head\n",
135+
"><body<p>totalElapsed=1ms, elapsed=1ms, connFlush=0ms, batchResolve=0, whereClause=0ms, deleteClause=0ms, insertClause=0ms</p\n",
136+
"><hr><p>COMMIT: totalElapsed=356ms, commitTime=1596944443099, mutationCount=776</p\n",
137+
"></html\n",
138+
">\n",
139+
"\n",
140+
"done\n"
141+
]
142+
}
143+
],
144+
"source": [
145+
"# Send SPARQL 1.1 UPDATE to endpoint to add the constructed triples into the graph\n",
146+
"\n",
147+
"update_command = '''INSERT DATA\n",
148+
"{ GRAPH <''' + graph_name + '''> { \n",
149+
"''' + graph_text + '''\n",
150+
"}}'''\n",
151+
"\n",
152+
"#print(update_command)\n",
153+
"\n",
154+
"perform_sparql_update(sparql_endpoint, pwd, update_command)\n",
155+
"\n",
156+
"print()\n",
157+
"print('done')"
158+
]
159+
},
160+
{
161+
"cell_type": "code",
162+
"execution_count": null,
163+
"metadata": {},
164+
"outputs": [],
165+
"source": []
166+
}
167+
],
168+
"metadata": {
169+
"kernelspec": {
170+
"display_name": "Python 3",
171+
"language": "python",
172+
"name": "python3"
173+
},
174+
"language_info": {
175+
"codemirror_mode": {
176+
"name": "ipython",
177+
"version": 3
178+
},
179+
"file_extension": ".py",
180+
"mimetype": "text/x-python",
181+
"name": "python",
182+
"nbconvert_exporter": "python",
183+
"pygments_lexer": "ipython3",
184+
"version": "3.7.1"
185+
}
186+
},
187+
"nbformat": 4,
188+
"nbformat_minor": 2
189+
}

vanderbot/generate_direct_props.py

Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
import requests # best library to manage HTTP transactions
2+
import json
3+
from time import sleep
4+
import csv
5+
import math
6+
import urllib
7+
import datetime
8+
import string
9+
from pathlib import Path
10+
11+
# ---------------
12+
# Configuration data
13+
# ---------------
14+
15+
graph_name = 'http://nursing'
16+
accept_media_type = 'text/turtle'
17+
sparql_endpoint = "https://sparql.vanderbilt.edu/sparql"
18+
request_header_dictionary = {
19+
#'Content-Type': 'application/sparql-query',
20+
'Accept' : accept_media_type
21+
}
22+
23+
# Load endpoint password from file in home directory
24+
directory = 'home'
25+
filename = 'sparql_vanderbilt_edu_password.txt'
26+
27+
# ---------------
28+
# Function definitions
29+
# ---------------
30+
31+
# Load password from local drive
32+
# value of directory should be either 'home' or 'working'
33+
def load_credential(filename, directory):
34+
cred = ''
35+
# to change the script to look for the credential in the working directory, change the value of home to empty string
36+
if directory == 'home':
37+
home = str(Path.home()) #gets path to home directory; supposed to work for Win and Mac
38+
credential_path = home + '/' + filename
39+
else:
40+
directory = 'working'
41+
credential_path = filename
42+
try:
43+
with open(credential_path, 'rt', encoding='utf-8') as file_object:
44+
cred = file_object.read()
45+
except:
46+
print(filename + ' file not found - is it in your ' + directory + ' directory?')
47+
exit()
48+
return(cred)
49+
50+
def retrieve_direct_statements(sparql_endpoint):
51+
query = '''
52+
construct {?item ?directProp ?value.}
53+
from <''' + graph_name + '''>
54+
where {
55+
?item ?p ?statement.
56+
?statement ?ps ?value.
57+
filter(substr(str(?ps),1,39)="http://www.wikidata.org/prop/statement/")
58+
bind(substr(str(?ps),40) as ?id)
59+
bind(substr(str(?p),30) as ?id)
60+
bind(iri(concat("http://www.wikidata.org/prop/direct/", ?id)) as ?directProp)
61+
}
62+
'''
63+
r = requests.get(sparql_endpoint, params={'query' : query}, headers=request_header_dictionary)
64+
return r.text
65+
66+
def perform_sparql_update(sparql_endpoint, pwd, update_command):
67+
# SPARQL Update requires HTTP POST
68+
hdr = {'Content-Type' : 'application/sparql-update'}
69+
r = requests.post(sparql_endpoint, auth=('admin', pwd), headers=hdr, data = update_command)
70+
print(str(r.status_code) + ' ' + r.url)
71+
print(r.text)
72+
73+
# ---------------
74+
# Construct the direct property statements entailed by the Wikibase model and retrieve from endpoint
75+
# ---------------
76+
77+
graph_text = retrieve_direct_statements(sparql_endpoint)
78+
print('constructed triples retrieved')
79+
80+
# remove prefixes from response Turtle, which are not necessary since IRIs are unabbreviated
81+
graph_text_list = graph_text.split('\n')
82+
graph_text = ''
83+
for line in graph_text_list:
84+
try:
85+
if line[0] != '@':
86+
graph_text += line + '\n'
87+
except:
88+
pass
89+
90+
# Send SPARQL 1.1 UPDATE to endpoint to add the constructed triples into the graph
91+
92+
update_command = '''INSERT DATA
93+
{ GRAPH <''' + graph_name + '''> {
94+
''' + graph_text + '''
95+
}}'''
96+
97+
pwd = load_credential(filename, directory)
98+
perform_sparql_update(sparql_endpoint, pwd, update_command)
99+
100+
print()
101+
print('done')

0 commit comments

Comments
 (0)