Skip to content

Commit eb1392d

Browse files
authored
Decommission ROR Schema v1 indexing support (#512)
1 parent 17ebe9e commit eb1392d

7 files changed

Lines changed: 45 additions & 107 deletions

File tree

rorapi/common/views.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -303,11 +303,10 @@ class IndexDataDump(APIView):
303303
permission_classes = [OurTokenPermission]
304304

305305
def get(self, request, filename, dataenv, version=REST_FRAMEWORK["DEFAULT_VERSION"]):
306-
schema = 1
306+
# Always use v2 schema - v1 indexing support has been removed
307+
schema = 2
307308
testdata = True
308309
st = 200
309-
if version == 'v2':
310-
schema = 2
311310
if dataenv == 'prod':
312311
testdata = False
313312
msg = management.call_command("setup", filename, schema=schema, testdata=testdata)

rorapi/management/commands/createindex.py

Lines changed: 3 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,12 +13,8 @@ def create_index(self, index, template_file):
1313
self.stdout.write('Created index {}'.format(index))
1414

1515
class Command(BaseCommand):
16-
help = 'Create ROR API index'
16+
help = 'Create ROR API v2 index'
1717

1818
def handle(self, *args, **options):
19-
if(options['schema']==1 or options['schema'] is None):
20-
print("creating v1 index")
21-
create_index(self, ES_VARS['INDEX_V1'], ES_VARS['INDEX_TEMPLATE_ES7_V1'])
22-
if(options['schema']==2 or options['schema'] is None):
23-
print("creating v2 index")
24-
create_index(self, ES_VARS['INDEX_V2'], ES_VARS['INDEX_TEMPLATE_ES7_V2'])
19+
self.stdout.write("creating v2 index")
20+
create_index(self, ES_VARS['INDEX_V2'], ES_VARS['INDEX_TEMPLATE_ES7_V2'])

rorapi/management/commands/deleteindex.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -10,13 +10,11 @@ def delete_index(self, index):
1010
self.stdout.write('Index {} does not exist'.format(index))
1111

1212
class Command(BaseCommand):
13-
help = 'Deletes ROR API index'
13+
help = 'Deletes ROR API v2 index'
1414

1515
def handle(self, *args, **options):
16-
if(options['schema']==1 or options['schema'] is None):
17-
print("deleting v1 index")
18-
delete_index(self, ES_VARS['INDEX_V1'])
19-
if(options['schema']==2 or options['schema'] is None):
20-
print("deleting v2 index")
16+
schema = options.get('schema', 2)
17+
if schema == 2 or schema is None:
18+
self.stdout.write("deleting v2 index")
2119
delete_index(self, ES_VARS['INDEX_V2'])
2220

rorapi/management/commands/indexror.py

Lines changed: 13 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -13,30 +13,10 @@
1313
from django.core.management.base import BaseCommand
1414
from elasticsearch import TransportError
1515

16-
def get_nested_names_v1(org):
17-
yield org['name']
18-
for label in org['labels']:
19-
yield label['label']
20-
for alias in org['aliases']:
21-
yield alias
22-
for acronym in org['acronyms']:
23-
yield acronym
24-
2516
def get_nested_names_v2(org):
2617
for name in org['names']:
2718
yield name['value']
2819

29-
def get_nested_ids_v1(org):
30-
yield org['id']
31-
yield re.sub('https://', '', org['id'])
32-
yield re.sub('https://ror.org/', '', org['id'])
33-
for ext_name, ext_id in org['external_ids'].items():
34-
if ext_name == 'GRID':
35-
yield ext_id['all']
36-
else:
37-
for eid in ext_id['all']:
38-
yield eid
39-
4020
def get_nested_ids_v2(org):
4121
yield org['id']
4222
yield re.sub('https://', '', org['id'])
@@ -150,10 +130,10 @@ def process_files(dir, version):
150130

151131
def index(dataset, version):
152132
err = {}
153-
if version == 'v2':
154-
index = ES_VARS['INDEX_V2']
155-
else:
156-
index = ES_VARS['INDEX_V1']
133+
if version != 'v2':
134+
err[index.__name__] = f"Only v2 schema version is supported. Received: {version}"
135+
return err
136+
index = ES_VARS['INDEX_V2']
157137
backup_index = '{}-tmp'.format(index)
158138
ES7.reindex(body={
159139
'source': {
@@ -174,22 +154,14 @@ def index(dataset, version):
174154
'_id': org['id']
175155
}
176156
})
177-
if 'v2' in index:
178-
org['names_ids'] = [{
179-
'name': n
180-
} for n in get_nested_names_v2(org)]
181-
org['names_ids'] += [{
182-
'id': n
183-
} for n in get_nested_ids_v2(org)]
184-
# experimental affiliations_match nested doc
185-
org['affiliation_match'] = get_affiliation_match_doc(org)
186-
else:
187-
org['names_ids'] = [{
188-
'name': n
189-
} for n in get_nested_names_v1(org)]
190-
org['names_ids'] += [{
191-
'id': n
192-
} for n in get_nested_ids_v1(org)]
157+
org['names_ids'] = [{
158+
'name': n
159+
} for n in get_nested_names_v2(org)]
160+
org['names_ids'] += [{
161+
'id': n
162+
} for n in get_nested_ids_v2(org)]
163+
# experimental affiliations_match nested doc
164+
org['affiliation_match'] = get_affiliation_match_doc(org)
193165
body.append(org)
194166
ES7.bulk(body)
195167
except TransportError:
@@ -211,11 +183,10 @@ class Command(BaseCommand):
211183

212184
def add_arguments(self, parser):
213185
parser.add_argument('dir', type=str, help='add directory name for S3 bucket to be processed')
214-
parser.add_argument('version', type=str, help='schema version of files to be processed')
215186

216187
def handle(self,*args, **options):
217188
dir = options['dir']
218-
version = options['version']
189+
version = 'v2'
219190
process_files(dir, version)
220191

221192

rorapi/management/commands/indexrordump.py

Lines changed: 20 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -12,30 +12,10 @@
1212

1313
HEADERS = {'Accept': 'application/vnd.github.v3+json'}
1414

15-
def get_nested_names_v1(org):
16-
yield org['name']
17-
for label in org['labels']:
18-
yield label['label']
19-
for alias in org['aliases']:
20-
yield alias
21-
for acronym in org['acronyms']:
22-
yield acronym
23-
2415
def get_nested_names_v2(org):
2516
for name in org['names']:
2617
yield name['value']
2718

28-
def get_nested_ids_v1(org):
29-
yield org['id']
30-
yield re.sub('https://', '', org['id'])
31-
yield re.sub('https://ror.org/', '', org['id'])
32-
for ext_name, ext_id in org['external_ids'].items():
33-
if ext_name == 'GRID':
34-
yield ext_id['all']
35-
else:
36-
for eid in ext_id['all']:
37-
yield eid
38-
3919
def get_nested_ids_v2(org):
4020
yield org['id']
4121
yield re.sub('https://', '', org['id'])
@@ -81,22 +61,14 @@ def index_dump(self, filename, index, dataset):
8161
'_id': org['id']
8262
}
8363
})
84-
if 'v2' in index:
85-
org['names_ids'] = [{
86-
'name': n
87-
} for n in get_nested_names_v2(org)]
88-
org['names_ids'] += [{
89-
'id': n
90-
} for n in get_nested_ids_v2(org)]
91-
# experimental affiliations_match nested doc
92-
org['affiliation_match'] = get_affiliation_match_doc(org)
93-
else:
94-
org['names_ids'] = [{
95-
'name': n
96-
} for n in get_nested_names_v1(org)]
97-
org['names_ids'] += [{
98-
'id': n
99-
} for n in get_nested_ids_v1(org)]
64+
org['names_ids'] = [{
65+
'name': n
66+
} for n in get_nested_names_v2(org)]
67+
org['names_ids'] += [{
68+
'id': n
69+
} for n in get_nested_ids_v2(org)]
70+
# experimental affiliations_match nested doc
71+
org['affiliation_match'] = get_affiliation_match_doc(org)
10072
body.append(org)
10173
ES7.bulk(body)
10274
except TransportError:
@@ -134,22 +106,25 @@ def handle(self, *args, **options):
134106
json_files.append(file)
135107
if json_files:
136108
for json_file in json_files:
137-
index = None
138109
json_path = os.path.join(DATA['WORKING_DIR'], filename, '') + json_file
139-
if 'schema_v2' in json_file and (options['schema']==2 or options['schema'] is None):
110+
# Check if file is v2.0+ format or legacy schema_v2 format
111+
version_match = re.match(r'v(\d+)\.(\d+)', json_file)
112+
is_v2_format = False
113+
if version_match:
114+
major, minor = map(int, version_match.groups())
115+
if major >= 2:
116+
is_v2_format = True
117+
elif 'schema_v2' in json_file:
118+
# Legacy format with schema_v2 in filename
119+
is_v2_format = True
120+
121+
if is_v2_format and (options.get('schema') == 2 or options.get('schema') is None):
140122
self.stdout.write('Loading JSON')
141123
with open(json_path, 'r') as it:
142124
dataset = json.load(it)
143125
self.stdout.write('Indexing ROR dataset ' + json_file)
144126
index = ES_VARS['INDEX_V2']
145127
index_dump(self, json_file, index, dataset)
146-
if 'schema_v2' not in json_file and (options['schema']==1 or options['schema'] is None):
147-
self.stdout.write('Loading JSON')
148-
with open(json_path, 'r') as it:
149-
dataset = json.load(it)
150-
self.stdout.write('Indexing ROR dataset ' + json_file)
151-
index = ES_VARS['INDEX_V1']
152-
index_dump(self, json_file, index, dataset)
153128
else:
154129
self.stdout.write("ROR data dump does not contain any JSON files")
155130

rorapi/management/commands/setup.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ class Command(BaseCommand):
3636

3737
def add_arguments(self, parser):
3838
parser.add_argument('filename', type=str, help='Name of data dump zip file to index without extension')
39-
parser.add_argument('-s', '--schema', type=int, choices=[1, 2], help='Schema version to index if only indexing 1 version. Only set if not indexing both versions.')
39+
parser.add_argument('-s', '--schema', type=int, choices=[2], default=2, help='Schema version to index (v2 only)')
4040
parser.add_argument('-t', '--testdata', action='store_true', help='Set flag to pull data dump from ror-data-test instead of ror-data')
4141

4242
def handle(self, *args, **options):
@@ -57,7 +57,7 @@ def handle(self, *args, **options):
5757
DeleteIndexCommand().handle(*args, **options)
5858
CreateIndexCommand().handle(*args, **options)
5959
IndexRorDumpCommand().handle(*args, **options)
60-
msg = 'SUCCESS: ROR dataset {} indexed in version {}. Using test repo: {}'.format(filename, str(options['schema']), str(use_test_data))
60+
msg = 'SUCCESS: ROR dataset {} indexed in v2. Using test repo: {}'.format(filename, str(use_test_data))
6161
except:
6262
msg = 'ERROR: Could not index ROR data dump. Check API logs for details.'
6363
else:

rorapi/settings.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -151,8 +151,7 @@
151151
STATIC_ROOT = os.path.join(BASE_DIR, 'static/')
152152

153153
ES_VARS = {
154-
'INDEX_V1': 'organizations',
155-
'INDEX_TEMPLATE_ES7_V1': os.path.join(BASE_DIR, 'rorapi', 'v1', 'index_template_es7.json'),
154+
'INDEX_V1': 'organizations', # Kept for v1 API queries (backward compatibility)
156155
'INDEX_V2': 'organizations-v2',
157156
'INDEX_TEMPLATE_ES7_V2': os.path.join(BASE_DIR, 'rorapi', 'v2', 'index_template_es7.json'),
158157
'BATCH_SIZE': 20,

0 commit comments

Comments
 (0)