|
12 | 12 |
|
13 | 13 | HEADERS = {'Accept': 'application/vnd.github.v3+json'} |
14 | 14 |
|
15 | | -def get_nested_names_v1(org): |
16 | | - yield org['name'] |
17 | | - for label in org['labels']: |
18 | | - yield label['label'] |
19 | | - for alias in org['aliases']: |
20 | | - yield alias |
21 | | - for acronym in org['acronyms']: |
22 | | - yield acronym |
23 | | - |
def get_nested_names_v2(org):
    """Yield each name string attached to a ROR schema-v2 organization.

    A v2 record stores its names as a list of dicts under ``org['names']``;
    only the ``'value'`` field of each entry is yielded.
    """
    yield from (entry['value'] for entry in org['names'])
27 | 18 |
|
28 | | -def get_nested_ids_v1(org): |
29 | | - yield org['id'] |
30 | | - yield re.sub('https://', '', org['id']) |
31 | | - yield re.sub('https://ror.org/', '', org['id']) |
32 | | - for ext_name, ext_id in org['external_ids'].items(): |
33 | | - if ext_name == 'GRID': |
34 | | - yield ext_id['all'] |
35 | | - else: |
36 | | - for eid in ext_id['all']: |
37 | | - yield eid |
38 | | - |
39 | 19 | def get_nested_ids_v2(org): |
40 | 20 | yield org['id'] |
41 | 21 | yield re.sub('https://', '', org['id']) |
@@ -81,22 +61,14 @@ def index_dump(self, filename, index, dataset): |
81 | 61 | '_id': org['id'] |
82 | 62 | } |
83 | 63 | }) |
84 | | - if 'v2' in index: |
85 | | - org['names_ids'] = [{ |
86 | | - 'name': n |
87 | | - } for n in get_nested_names_v2(org)] |
88 | | - org['names_ids'] += [{ |
89 | | - 'id': n |
90 | | - } for n in get_nested_ids_v2(org)] |
91 | | - # experimental affiliations_match nested doc |
92 | | - org['affiliation_match'] = get_affiliation_match_doc(org) |
93 | | - else: |
94 | | - org['names_ids'] = [{ |
95 | | - 'name': n |
96 | | - } for n in get_nested_names_v1(org)] |
97 | | - org['names_ids'] += [{ |
98 | | - 'id': n |
99 | | - } for n in get_nested_ids_v1(org)] |
| 64 | + org['names_ids'] = [{ |
| 65 | + 'name': n |
| 66 | + } for n in get_nested_names_v2(org)] |
| 67 | + org['names_ids'] += [{ |
| 68 | + 'id': n |
| 69 | + } for n in get_nested_ids_v2(org)] |
| 70 | + # experimental affiliations_match nested doc |
| 71 | + org['affiliation_match'] = get_affiliation_match_doc(org) |
100 | 72 | body.append(org) |
101 | 73 | ES7.bulk(body) |
102 | 74 | except TransportError: |
@@ -134,22 +106,25 @@ def handle(self, *args, **options): |
134 | 106 | json_files.append(file) |
135 | 107 | if json_files: |
136 | 108 | for json_file in json_files: |
137 | | - index = None |
138 | 109 | json_path = os.path.join(DATA['WORKING_DIR'], filename, '') + json_file |
139 | | - if 'schema_v2' in json_file and (options['schema']==2 or options['schema'] is None): |
| 110 | + # Check if file is v2.0+ format or legacy schema_v2 format |
| 111 | + version_match = re.match(r'v(\d+)\.(\d+)', json_file) |
| 112 | + is_v2_format = False |
| 113 | + if version_match: |
| 114 | + major, minor = map(int, version_match.groups()) |
| 115 | + if major >= 2: |
| 116 | + is_v2_format = True |
| 117 | + elif 'schema_v2' in json_file: |
| 118 | + # Legacy format with schema_v2 in filename |
| 119 | + is_v2_format = True |
| 120 | + |
| 121 | + if is_v2_format and (options.get('schema') == 2 or options.get('schema') is None): |
140 | 122 | self.stdout.write('Loading JSON') |
141 | 123 | with open(json_path, 'r') as it: |
142 | 124 | dataset = json.load(it) |
143 | 125 | self.stdout.write('Indexing ROR dataset ' + json_file) |
144 | 126 | index = ES_VARS['INDEX_V2'] |
145 | 127 | index_dump(self, json_file, index, dataset) |
146 | | - if 'schema_v2' not in json_file and (options['schema']==1 or options['schema'] is None): |
147 | | - self.stdout.write('Loading JSON') |
148 | | - with open(json_path, 'r') as it: |
149 | | - dataset = json.load(it) |
150 | | - self.stdout.write('Indexing ROR dataset ' + json_file) |
151 | | - index = ES_VARS['INDEX_V1'] |
152 | | - index_dump(self, json_file, index, dataset) |
153 | 128 | else: |
154 | 129 | self.stdout.write("ROR data dump does not contain any JSON files") |
155 | 130 |
|
|
0 commit comments