Decommission ROR Schema v1 indexing support (#512)

jrhoads · web-flow · commit eb1392d7d81c · 2026-01-15T14:01:21.000-05:00
diff --git a/rorapi/common/views.py b/rorapi/common/views.py
@@ -303,11 +303,10 @@ class IndexDataDump(APIView):
     permission_classes = [OurTokenPermission]
 
     def get(self, request, filename, dataenv, version=REST_FRAMEWORK["DEFAULT_VERSION"]):
-        schema = 1
+        # Always use v2 schema - v1 indexing support has been removed
+        schema = 2
         testdata = True
         st = 200
-        if version == 'v2':
-            schema = 2
         if dataenv == 'prod':
             testdata = False
         msg = management.call_command("setup", filename, schema=schema, testdata=testdata)
diff --git a/rorapi/management/commands/createindex.py b/rorapi/management/commands/createindex.py
@@ -13,12 +13,8 @@ def create_index(self, index, template_file):
     self.stdout.write('Created index {}'.format(index))
 
 class Command(BaseCommand):
-    help = 'Create ROR API index'
+    help = 'Create ROR API v2 index'
 
     def handle(self, *args, **options):
-        if(options['schema']==1 or options['schema'] is None):
-            print("creating v1 index")
-            create_index(self, ES_VARS['INDEX_V1'], ES_VARS['INDEX_TEMPLATE_ES7_V1'])
-        if(options['schema']==2 or options['schema'] is None):
-            print("creating v2 index")
-            create_index(self, ES_VARS['INDEX_V2'], ES_VARS['INDEX_TEMPLATE_ES7_V2'])
+        self.stdout.write("creating v2 index")
+        create_index(self, ES_VARS['INDEX_V2'], ES_VARS['INDEX_TEMPLATE_ES7_V2'])
diff --git a/rorapi/management/commands/deleteindex.py b/rorapi/management/commands/deleteindex.py
@@ -10,13 +10,11 @@ def delete_index(self, index):
         self.stdout.write('Index {} does not exist'.format(index))
 
 class Command(BaseCommand):
-    help = 'Deletes ROR API index'
+    help = 'Deletes ROR API v2 index'
 
     def handle(self, *args, **options):
-        if(options['schema']==1 or options['schema'] is None):
-            print("deleting v1 index")
-            delete_index(self, ES_VARS['INDEX_V1'])
-        if(options['schema']==2 or options['schema'] is None):
-            print("deleting v2 index")
+        schema = options.get('schema', 2)
+        if schema == 2 or schema is None:
+            self.stdout.write("deleting v2 index")
             delete_index(self, ES_VARS['INDEX_V2'])
 
diff --git a/rorapi/management/commands/indexror.py b/rorapi/management/commands/indexror.py
@@ -13,30 +13,10 @@
 from django.core.management.base import BaseCommand
 from elasticsearch import TransportError
 
-def get_nested_names_v1(org):
-    yield org['name']
-    for label in org['labels']:
-        yield label['label']
-    for alias in org['aliases']:
-        yield alias
-    for acronym in org['acronyms']:
-        yield acronym
-
 def get_nested_names_v2(org):
     for name in org['names']:
         yield name['value']
 
-def get_nested_ids_v1(org):
-    yield org['id']
-    yield re.sub('https://', '', org['id'])
-    yield re.sub('https://ror.org/', '', org['id'])
-    for ext_name, ext_id in org['external_ids'].items():
-        if ext_name == 'GRID':
-            yield ext_id['all']
-        else:
-            for eid in ext_id['all']:
-                yield eid
-
 def get_nested_ids_v2(org):
     yield org['id']
     yield re.sub('https://', '', org['id'])
@@ -150,10 +130,10 @@ def process_files(dir, version):
 
 def index(dataset, version):
     err = {}
-    if version == 'v2':
-        index = ES_VARS['INDEX_V2']
-    else:
-        index = ES_VARS['INDEX_V1']
+    if version != 'v2':
+        # Literal key: `index` is a local assigned below, so index.__name__ would raise UnboundLocalError here.
+        return {'index': f"Only v2 schema version is supported. Received: {version}"}
+    index = ES_VARS['INDEX_V2']
     backup_index = '{}-tmp'.format(index)
     ES7.reindex(body={
         'source': {
@@ -174,22 +154,14 @@ def index(dataset, version):
                         '_id': org['id']
                     }
                 })
-                if 'v2' in index:
-                    org['names_ids'] = [{
-                        'name': n
-                    } for n in get_nested_names_v2(org)]
-                    org['names_ids'] += [{
-                        'id': n
-                    } for n in get_nested_ids_v2(org)]
-                    # experimental affiliations_match nested doc
-                    org['affiliation_match'] = get_affiliation_match_doc(org)
-                else:
-                    org['names_ids'] = [{
-                        'name': n
-                    } for n in get_nested_names_v1(org)]
-                    org['names_ids'] += [{
-                        'id': n
-                    } for n in get_nested_ids_v1(org)]
+                org['names_ids'] = [{
+                    'name': n
+                } for n in get_nested_names_v2(org)]
+                org['names_ids'] += [{
+                    'id': n
+                } for n in get_nested_ids_v2(org)]
+                # experimental affiliations_match nested doc
+                org['affiliation_match'] = get_affiliation_match_doc(org)
                 body.append(org)
             ES7.bulk(body)
     except TransportError:
@@ -211,11 +183,10 @@ class Command(BaseCommand):
 
     def add_arguments(self, parser):
         parser.add_argument('dir', type=str, help='add directory name for S3 bucket to be processed')
-        parser.add_argument('version', type=str, help='schema version of files to be processed')
 
     def handle(self,*args, **options):
         dir = options['dir']
-        version = options['version']
+        version = 'v2'
         process_files(dir, version)
 
 
diff --git a/rorapi/management/commands/indexrordump.py b/rorapi/management/commands/indexrordump.py
@@ -12,30 +12,10 @@
 
 HEADERS = {'Accept': 'application/vnd.github.v3+json'}
 
-def get_nested_names_v1(org):
-    yield org['name']
-    for label in org['labels']:
-        yield label['label']
-    for alias in org['aliases']:
-        yield alias
-    for acronym in org['acronyms']:
-        yield acronym
-
 def get_nested_names_v2(org):
     for name in org['names']:
         yield name['value']
 
-def get_nested_ids_v1(org):
-    yield org['id']
-    yield re.sub('https://', '', org['id'])
-    yield re.sub('https://ror.org/', '', org['id'])
-    for ext_name, ext_id in org['external_ids'].items():
-        if ext_name == 'GRID':
-            yield ext_id['all']
-        else:
-            for eid in ext_id['all']:
-                yield eid
-
 def get_nested_ids_v2(org):
     yield org['id']
     yield re.sub('https://', '', org['id'])
@@ -81,22 +61,14 @@ def index_dump(self, filename, index, dataset):
                         '_id': org['id']
                     }
                 })
-                if 'v2' in index:
-                    org['names_ids'] = [{
-                        'name': n
-                    } for n in get_nested_names_v2(org)]
-                    org['names_ids'] += [{
-                        'id': n
-                    } for n in get_nested_ids_v2(org)]
-                    # experimental affiliations_match nested doc
-                    org['affiliation_match'] = get_affiliation_match_doc(org)
-                else:
-                    org['names_ids'] = [{
-                        'name': n
-                    } for n in get_nested_names_v1(org)]
-                    org['names_ids'] += [{
-                        'id': n
-                    } for n in get_nested_ids_v1(org)]
+                org['names_ids'] = [{
+                    'name': n
+                } for n in get_nested_names_v2(org)]
+                org['names_ids'] += [{
+                    'id': n
+                } for n in get_nested_ids_v2(org)]
+                # experimental affiliations_match nested doc
+                org['affiliation_match'] = get_affiliation_match_doc(org)
                 body.append(org)
             ES7.bulk(body)
     except TransportError:
@@ -134,22 +106,25 @@ def handle(self, *args, **options):
                     json_files.append(file)
             if json_files:
                 for json_file in json_files:
-                    index = None
                     json_path = os.path.join(DATA['WORKING_DIR'], filename, '') + json_file
-                    if 'schema_v2' in json_file and (options['schema']==2 or options['schema'] is None):
+                    # A dump file holds v2 data when its name carries the legacy
+                    # 'schema_v2' marker OR its release version prefix is v2.0+.
+                    # The two tests are independent: a name such as
+                    # 'v1.45-...-ror-data_schema_v2.json' has a v1.x prefix but
+                    # must still be indexed because of the marker.
+                    version_match = re.match(r'v(\d+)\.\d+', json_file)
+                    is_v2_format = 'schema_v2' in json_file or (
+                        version_match is not None and int(version_match.group(1)) >= 2
+                    )
+                    schema = options.get('schema')
+
+                    if is_v2_format and (schema == 2 or schema is None):
                         self.stdout.write('Loading JSON')
                         with open(json_path, 'r') as it:
                             dataset = json.load(it)
                         self.stdout.write('Indexing ROR dataset ' + json_file)
                         index = ES_VARS['INDEX_V2']
                         index_dump(self, json_file, index, dataset)
-                    if 'schema_v2' not in json_file and (options['schema']==1 or options['schema'] is None):
-                        self.stdout.write('Loading JSON')
-                        with open(json_path, 'r') as it:
-                            dataset = json.load(it)
-                        self.stdout.write('Indexing ROR dataset ' + json_file)
-                        index = ES_VARS['INDEX_V1']
-                        index_dump(self, json_file, index, dataset)
             else:
                 self.stdout.write("ROR data dump does not contain any JSON files")
 
diff --git a/rorapi/management/commands/setup.py b/rorapi/management/commands/setup.py
@@ -36,7 +36,7 @@ class Command(BaseCommand):
 
     def add_arguments(self, parser):
         parser.add_argument('filename', type=str, help='Name of data dump zip file to index without extension')
-        parser.add_argument('-s', '--schema', type=int, choices=[1, 2], help='Schema version to index if only indexing 1 version. Only set if not indexing both versions.')
+        parser.add_argument('-s', '--schema', type=int, choices=[2], default=2, help='Schema version to index (v2 only)')
         parser.add_argument('-t', '--testdata', action='store_true', help='Set flag to pull data dump from ror-data-test instead of ror-data')
 
     def handle(self, *args, **options):
@@ -57,7 +57,7 @@ def handle(self, *args, **options):
                 DeleteIndexCommand().handle(*args, **options)
                 CreateIndexCommand().handle(*args, **options)
                 IndexRorDumpCommand().handle(*args, **options)
-                msg = 'SUCCESS: ROR dataset {} indexed in version {}. Using test repo: {}'.format(filename, str(options['schema']), str(use_test_data))
+                msg = 'SUCCESS: ROR dataset {} indexed in v2. Using test repo: {}'.format(filename, str(use_test_data))
             except:
                 msg = 'ERROR: Could not index ROR data dump. Check API logs for details.'
         else:
diff --git a/rorapi/settings.py b/rorapi/settings.py
@@ -151,8 +151,7 @@
 STATIC_ROOT = os.path.join(BASE_DIR, 'static/')
 
 ES_VARS = {
-    'INDEX_V1': 'organizations',
-    'INDEX_TEMPLATE_ES7_V1': os.path.join(BASE_DIR, 'rorapi', 'v1', 'index_template_es7.json'),
+    'INDEX_V1': 'organizations',  # Kept for v1 API queries (backward compatibility)
     'INDEX_V2': 'organizations-v2',
     'INDEX_TEMPLATE_ES7_V2': os.path.join(BASE_DIR, 'rorapi', 'v2', 'index_template_es7.json'),
     'BATCH_SIZE': 20,