From de7595e84962c49c7874e62c0691bd3548bcdf81 Mon Sep 17 00:00:00 2001 From: Joseph Rhoads Date: Thu, 26 Feb 2026 17:14:18 +0100 Subject: [PATCH 1/2] Add configurable default and parameter-based strategy selection for affiliation matching (#526) * feat: add SINGLE_SEARCH_DEFAULT setting for affiliation matching * feat: add multisearch support and single search default to affiliation matching * test: add unit tests for affiliation single search and multisearch logic --- rorapi/common/views.py | 5 ++ rorapi/settings.py | 4 ++ rorapi/tests/tests_unit/tests_views_v2.py | 88 ++++++++++++++++++++++- 3 files changed, 96 insertions(+), 1 deletion(-) diff --git a/rorapi/common/views.py b/rorapi/common/views.py index cd699aa..557965a 100644 --- a/rorapi/common/views.py +++ b/rorapi/common/views.py @@ -14,6 +14,7 @@ from rorapi.common.create_update import new_record_from_json, update_record_from_json from rorapi.common.csv_bulk import process_csv from rorapi.common.csv_utils import validate_csv +from django.conf import settings from rorapi.settings import REST_FRAMEWORK, ES7, ES_VARS from rorapi.common.matching import match_organizations from rorapi.common.matching_single_search import match_organizations as single_search_match_organizations @@ -153,6 +154,10 @@ def list(self, request, version=REST_FRAMEWORK["DEFAULT_VERSION"]): if "affiliation" in params: if "single_search" in params: errors, organizations = single_search_match_organizations(params) + elif "multisearch" in params: + errors, organizations = match_organizations(params) + elif settings.SINGLE_SEARCH_DEFAULT: + errors, organizations = single_search_match_organizations(params) else: errors, organizations = match_organizations(params) else: diff --git a/rorapi/settings.py b/rorapi/settings.py index 6ae0cd5..d49ede2 100644 --- a/rorapi/settings.py +++ b/rorapi/settings.py @@ -299,6 +299,10 @@ # Toggle for behavior-based rate limiting ENABLE_BEHAVIORAL_LIMITING = os.getenv("ENABLE_BEHAVIORAL_LIMITING", "False") == "True" +# When True, affiliation matching defaults to single search; otherwise multisearch. +# Request params single_search and multisearch override this. +SINGLE_SEARCH_DEFAULT = os.getenv("SINGLE_SEARCH_DEFAULT", "False") == "True" + # Email settings for Django EMAIL_BACKEND = 'django_ses.SESBackend' AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID') diff --git a/rorapi/tests/tests_unit/tests_views_v2.py b/rorapi/tests/tests_unit/tests_views_v2.py index 8811197..88649e0 100644 --- a/rorapi/tests/tests_unit/tests_views_v2.py +++ b/rorapi/tests/tests_unit/tests_views_v2.py @@ -2,7 +2,7 @@ import mock import os -from django.test import SimpleTestCase, Client +from django.test import SimpleTestCase, Client, override_settings from rest_framework.test import APIRequestFactory from rorapi.common import views @@ -88,6 +88,92 @@ def test_query_redirect(self, search_mock): response = client.get('/v2/organizations?query.names=query') self.assertRedirects(response, '/v2/organizations?query=query') + +class _MockMatchingResult: + """Minimal object matching MatchingResultSerializer expectations.""" + number_of_results = 0 + items = [] + + +class AffiliationDefaultMatchingTestCase(SimpleTestCase): + """Test SINGLE_SEARCH_DEFAULT and single_search/multisearch params.""" + V2_VERSION = 'v2' + + @override_settings(SINGLE_SEARCH_DEFAULT=False) + @mock.patch('rorapi.common.views.single_search_match_organizations') + @mock.patch('rorapi.common.views.match_organizations') + def test_affiliation_default_uses_multisearch_when_setting_off( + self, match_organizations_mock, single_search_mock): + mock_result = (None, _MockMatchingResult()) + match_organizations_mock.return_value = mock_result + single_search_mock.return_value = mock_result + + view = views.OrganizationViewSet.as_view({'get': 'list'}) + request = factory.get('/v2/organizations', {'affiliation': 'Harvard University'}) + response = view(request, version=self.V2_VERSION) + response.render() + + match_organizations_mock.assert_called_once() + single_search_mock.assert_not_called() + + @override_settings(SINGLE_SEARCH_DEFAULT=True) + @mock.patch('rorapi.common.views.single_search_match_organizations') + @mock.patch('rorapi.common.views.match_organizations') + def test_affiliation_default_uses_single_search_when_setting_on( + self, match_organizations_mock, single_search_mock): + mock_result = (None, _MockMatchingResult()) + match_organizations_mock.return_value = mock_result + single_search_mock.return_value = mock_result + + view = views.OrganizationViewSet.as_view({'get': 'list'}) + request = factory.get('/v2/organizations', {'affiliation': 'Harvard University'}) + response = view(request, version=self.V2_VERSION) + response.render() + + single_search_mock.assert_called_once() + match_organizations_mock.assert_not_called() + + @override_settings(SINGLE_SEARCH_DEFAULT=True) + @mock.patch('rorapi.common.views.single_search_match_organizations') + @mock.patch('rorapi.common.views.match_organizations') + def test_affiliation_multisearch_param_overrides_setting( + self, match_organizations_mock, single_search_mock): + mock_result = (None, _MockMatchingResult()) + match_organizations_mock.return_value = mock_result + single_search_mock.return_value = mock_result + + view = views.OrganizationViewSet.as_view({'get': 'list'}) + request = factory.get( + '/v2/organizations', + {'affiliation': 'Harvard University', 'multisearch': ''} + ) + response = view(request, version=self.V2_VERSION) + response.render() + + match_organizations_mock.assert_called_once() + single_search_mock.assert_not_called() + + @override_settings(SINGLE_SEARCH_DEFAULT=False) + @mock.patch('rorapi.common.views.single_search_match_organizations') + @mock.patch('rorapi.common.views.match_organizations') + def test_affiliation_single_search_param_uses_single_search( + self, match_organizations_mock, single_search_mock): + mock_result = (None, _MockMatchingResult()) + match_organizations_mock.return_value = mock_result + single_search_mock.return_value = mock_result + + view = views.OrganizationViewSet.as_view({'get': 'list'}) + request = factory.get( + '/v2/organizations', + {'affiliation': 'Harvard University', 'single_search': ''} + ) + response = view(request, version=self.V2_VERSION) + response.render() + + single_search_mock.assert_called_once() + match_organizations_mock.assert_not_called() + + class ViewRetrievalTestCase(SimpleTestCase): V2_VERSION = 'v2' From d37038d5a4a8edd27059cac61e2920bfcc41bd63 Mon Sep 17 00:00:00 2001 From: Joseph Rhoads Date: Wed, 6 May 2026 18:11:30 +0200 Subject: [PATCH 2/2] Refactor setup.py to improve GitHub API error handling and configuration (#532) * refactor: Improve ROR setup command error handling and GitHub API interaction --- rorapi/management/commands/setup.py | 92 ++++++++++++++++++++++------- 1 file changed, 70 insertions(+), 22 deletions(-) diff --git a/rorapi/management/commands/setup.py b/rorapi/management/commands/setup.py index 0795078..d4e84da 100644 --- a/rorapi/management/commands/setup.py +++ b/rorapi/management/commands/setup.py @@ -1,35 +1,77 @@ import requests -import zipfile -import base64 -from django.core.management.base import BaseCommand +import logging +from django.core.management.base import BaseCommand, CommandError from rorapi.management.commands.deleteindex import Command as DeleteIndexCommand from rorapi.management.commands.createindex import Command as CreateIndexCommand from rorapi.management.commands.indexrordump import Command as IndexRorDumpCommand from rorapi.management.commands.getrordump import Command as GetRorDumpCommand from rorapi.settings import ROR_DUMP -HEADERS = {'Accept': 'application/vnd.github.v3+json'} +REQUEST_TIMEOUT_SECONDS = 30 +logger = logging.getLogger(__name__) + + +def build_github_headers(): + token = ROR_DUMP.get('GITHUB_TOKEN') + if not token: + raise CommandError('Missing GitHub authentication configuration; cannot authenticate to GitHub API.') + return { + 'Authorization': f'token {token}', + 'Accept': 'application/vnd.github.v3+json' + } + + +def get_contents_url(use_test_data): + repo_key = 'TEST_REPO_URL' if use_test_data else 'PROD_REPO_URL' + repo_url = ROR_DUMP.get(repo_key) + if not repo_url: + raise CommandError('Repository source URL is not configured; cannot build contents endpoint.') + return f"{repo_url.rstrip('/')}/contents" -HEADERS = {'Authorization': 'token {}'.format(ROR_DUMP['GITHUB_TOKEN']), 'Accept': 'application/vnd.github.v3+json'} def get_ror_dump_sha(filename, use_test_data): - sha = '' - if use_test_data: - contents_url = ROR_DUMP['TEST_REPO_URL'] + '/contents' - else: - contents_url = ROR_DUMP['PROD_REPO_URL'] + '/contents' + headers = build_github_headers() + contents_url = get_contents_url(use_test_data) + try: - response = requests.get(contents_url, headers=HEADERS) - except requests.exceptions.RequestException as e: - raise SystemExit(f"{contents_url}: is Not reachable \nErr: {e}") + response = requests.get(contents_url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS) + except requests.exceptions.Timeout as exc: + raise CommandError(f'Request timed out while reaching {contents_url}.') from exc + except requests.exceptions.ConnectionError as exc: + raise CommandError(f'Could not connect to {contents_url}.') from exc + except requests.exceptions.RequestException as exc: + raise CommandError(f'GitHub request failed for {contents_url}: {exc}') from exc + + try: + response.raise_for_status() + except requests.exceptions.HTTPError as exc: + status = response.status_code + if status in (401, 403): + raise CommandError('GitHub authentication/authorization failed. Check API credentials and permissions.') from exc + if status == 404: + raise CommandError(f'GitHub repository contents endpoint not found: {contents_url}.') from exc + raise CommandError(f'GitHub API returned HTTP {status} for {contents_url}.') from exc + try: repo_contents = response.json() - for file in repo_contents: - if filename in file['name']: - sha = file['sha'] - return sha - except: - return None + except ValueError as exc: + raise CommandError(f'GitHub API returned invalid JSON for {contents_url}.') from exc + + if not isinstance(repo_contents, list): + raise CommandError(f'Unexpected GitHub API response type for {contents_url}: {type(repo_contents).__name__}.') + + for entry in repo_contents: + if not isinstance(entry, dict): + continue + name = entry.get('name') + sha = entry.get('sha') + if not name or not sha: + continue + if filename in name: + return sha + + return None + class Command(BaseCommand): help = 'Setup ROR API' @@ -49,7 +91,12 @@ def handle(self, *args, **options): else: print("Using ror-data repo") - sha = get_ror_dump_sha(filename, use_test_data) + try: + sha = get_ror_dump_sha(filename, use_test_data) + except CommandError as exc: + msg = f'ERROR: Could not validate ROR data dump source. {exc}' + self.stdout.write(msg) + return msg if sha: try: @@ -58,11 +105,12 @@ def handle(self, *args, **options): CreateIndexCommand().handle(*args, **options) IndexRorDumpCommand().handle(*args, **options) msg = 'SUCCESS: ROR dataset {} indexed in v2. Using test repo: {}'.format(filename, str(use_test_data)) - except: + except Exception: + logger.exception('Failed while indexing ROR dataset %s (use_test_data=%s).', filename, use_test_data) msg = 'ERROR: Could not index ROR data dump. Check API logs for details.' else: msg = 'ERROR: ROR dataset for file {} not found. '.format(filename) \ - +'Please generate the data dump first.' + + 'Please generate the data dump first.' self.stdout.write(msg) return msg