From d95fe7894d306db0ccd980f3081bdcc6b8126e76 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Wed, 4 Mar 2026 17:04:40 -0500 Subject: [PATCH 1/5] refactor index/perms call --- .../edu/harvard/iq/dataverse/api/Index.java | 4 +- .../search/SolrIndexServiceBean.java | 101 +++++++++--------- 2 files changed, 52 insertions(+), 53 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Index.java b/src/main/java/edu/harvard/iq/dataverse/api/Index.java index f83506c7e27..39e93d445b1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Index.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Index.java @@ -372,8 +372,8 @@ public Response indexMod(@QueryParam("partitions") long partitions, @QueryParam( @GET @Path("perms") public Response indexAllPermissions() { - IndexResponse indexResponse = solrIndexService.indexAllPermissions(); - return ok(indexResponse.getMessage()); + solrIndexService.asyncIndexAllPermissions(); + return ok("Asynchronous indexing of all permissions has been started. 
Check the server logs for progress."); } @GET diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index dfcb61438e6..8eb3d1fdbbc 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -27,6 +27,7 @@ import java.util.logging.Logger; import java.util.stream.Stream; +import jakarta.ejb.Asynchronous; import jakarta.ejb.EJB; import jakarta.ejb.Stateless; import jakarta.ejb.TransactionAttribute; @@ -227,60 +228,58 @@ private String getDatasetOrDataFileSolrEnding(DatasetVersion.VersionState versio } } - public IndexResponse indexAllPermissions() { - Collection docs = new ArrayList<>(); - - List definitionPoints = new ArrayList<>(); - Map> filesPerDataset = new HashMap<>(); - List allExceptFiles = dvObjectService.findAll(); - for (DvObject dvObject : allExceptFiles) { - logger.fine("determining definition points for dvobject id " + dvObject.getId()); - if (dvObject.isInstanceofDataFile()) { - Long dataset = dvObject.getOwner().getId(); - Long datafile = dvObject.getId(); - - List files = filesPerDataset.get(dataset); - if (files == null) { - files = new ArrayList<>(); - filesPerDataset.put(dataset, files); - } - files.add(datafile); - } else { - definitionPoints.addAll(determineSolrDocs(dvObject)); - } - } - - List all = allExceptFiles; - for (Map.Entry> filePerDataset : filesPerDataset.entrySet()) { - definitionPoints.addAll(determineSolrDocsForFilesFromDataset(filePerDataset)); - for (long fileId : filePerDataset.getValue()) { - DvObject file = dvObjectService.findDvObject(fileId); - if (file != null) { - all.add(file); - } - } - } - - for (DvObjectSolrDoc dvObjectSolrDoc : definitionPoints) { - logger.fine("creating solr doc in memory for " + dvObjectSolrDoc.getSolrId()); - SolrInputDocument solrInputDocument = SearchUtil.createSolrDoc(dvObjectSolrDoc); - 
logger.fine("adding to list of docs to index " + dvObjectSolrDoc.getSolrId()); - docs.add(solrInputDocument); - } + @Asynchronous + public void asyncIndexAllPermissions() { + logger.info("Starting asynchronous indexing of all permissions"); + long startTime = System.currentTimeMillis(); + try { - persistToSolr(docs); - /** - * @todo Do we need a separate permissionIndexTime timestamp? - * Probably. Update it here. - */ - for (DvObject dvObject : all) { - dvObjectService.updatePermissionIndexTime(dvObject); + + // Get ALL dataverses in the system + List allDataverseIds = em.createQuery( + "SELECT d.id FROM Dataverse d ORDER BY d.id", Long.class) + .getResultList(); + + logger.info("Found " + allDataverseIds.size() + " dataverses to index (each will index its datasets and files)"); + + int processedCount = 0; + + // Index each dataverse (which will automatically index all its datasets and files) + for (Long dataverseId : allDataverseIds) { + try { + Dataverse dataverse = dataverseService.find(dataverseId); + if (dataverse == null) { + logger.warning("Dataverse not found: " + dataverseId); + continue; + } + + logger.fine("Indexing permissions for Dataverse " + dataverseId + + " (" + dataverse.getName() + ") and all its datasets/files"); + + // This will index the dataverse itself and all its direct dataset children (with their files) + IndexResponse response = indexPermissionsOnSelfAndChildren(dataverse); + processedCount++; + + logger.fine("Indexed Dataverse " + dataverseId + ": " + response.getMessage()); + + // Clear persistence context periodically to free memory + if (processedCount % 10 == 0) { + em.clear(); + logger.info("Processed " + processedCount + "/" + allDataverseIds.size() + " dataverses"); + } + + } catch (Exception e) { + logger.log(Level.WARNING, "Error indexing permissions for dataverse " + dataverseId, e); + } } - return new IndexResponse("indexed all permissions"); - } catch (SolrServerException | IOException ex) { - return new 
IndexResponse("problem indexing"); + + long duration = System.currentTimeMillis() - startTime; + logger.info("Completed asynchronous indexing of all permissions. Processed " + + processedCount + " dataverses (with all their datasets and files) in " + duration + "ms"); + + } catch (Exception e) { + logger.log(Level.SEVERE, "Error during asynchronous permission indexing", e); } - } public IndexResponse indexPermissionsForOneDvObject(DvObject dvObject) { From 3006085ec29eba83b83b71f290325b300a146278 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Mon, 9 Mar 2026 11:13:04 -0400 Subject: [PATCH 2/5] remove unused methods --- .../search/SolrIndexServiceBean.java | 37 ------------------- 1 file changed, 37 deletions(-) diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index 8eb3d1fdbbc..65997a1bc7e 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -104,20 +104,6 @@ public List determineSolrDocs(DvObject dvObject) { return solrDocs; } - private List determineSolrDocsForFilesFromDataset(Map.Entry> datasetHash) { - List emptyList = new ArrayList<>(); - List solrDocs = emptyList; - DvObject dvObject = dvObjectService.findDvObject(datasetHash.getKey()); - if (dvObject == null) { - return emptyList; - } - if (dvObject.isInstanceofDataset()) { - Dataset dataset = (Dataset) dvObject; - solrDocs.addAll(constructDatafileSolrDocsFromDataset(dataset)); - } - return solrDocs; - } - /** * @todo should this method return a List? The equivalent methods for * datasets and files return lists. 
@@ -156,29 +142,6 @@ private DvObjectSolrDoc constructDatafileSolrDoc(DataFileProxy fileProxy, List constructDatafileSolrDocsFromDataset(Dataset dataset) { - List datafileSolrDocs = new ArrayList<>(); - for (DatasetVersion datasetVersionFileIsAttachedTo : datasetVersionsToBuildCardsFor(dataset)) { - List perms = new ArrayList<>(); - if (datasetVersionFileIsAttachedTo.isReleased()) { - perms.add(IndexServiceBean.getPublicGroupString()); - } else { - perms = searchPermissionsService.findDatasetVersionPerms(datasetVersionFileIsAttachedTo); - } - - for (FileMetadata fileMetadata : datasetVersionFileIsAttachedTo.getFileMetadatas()) { - Long fileId = fileMetadata.getDataFile().getId(); - String solrIdStart = IndexServiceBean.solrDocIdentifierFile + fileId; - String solrIdEnd = getDatasetOrDataFileSolrEnding(datasetVersionFileIsAttachedTo.getVersionState()); - String solrId = solrIdStart + solrIdEnd; - DvObjectSolrDoc dataFileSolrDoc = new DvObjectSolrDoc(fileId.toString(), solrId, datasetVersionFileIsAttachedTo.getId(), fileMetadata.getLabel(), perms); - logger.finest("adding fileid " + fileId); - datafileSolrDocs.add(dataFileSolrDoc); - } - } - return datafileSolrDocs; - } - /** Find the versions to index. 
The overall logic is * If there is only one version, or no released version (all non-draft versions are deaccessioned) * then index it regardless of it's versionstate From 45359bafdadb5b3c419193cbdb7518121595ee7f Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 23 Jun 2026 10:52:31 -0400 Subject: [PATCH 3/5] updates per review --- .../12200-async-reindexing-permissions.md | 8 ++++ .../source/admin/solr-search-index.rst | 31 +++++++++++++ doc/sphinx-guides/source/api/changelog.rst | 8 ++++ .../edu/harvard/iq/dataverse/api/Index.java | 46 ++++++++++++++----- .../search/SolrIndexServiceBean.java | 17 ++++++- 5 files changed, 97 insertions(+), 13 deletions(-) create mode 100644 doc/release-notes/12200-async-reindexing-permissions.md diff --git a/doc/release-notes/12200-async-reindexing-permissions.md b/doc/release-notes/12200-async-reindexing-permissions.md new file mode 100644 index 00000000000..22641a917ca --- /dev/null +++ b/doc/release-notes/12200-async-reindexing-permissions.md @@ -0,0 +1,8 @@ +# Asynchronous Permissions Reindexing + +An asynchronous API endpoint has been added to re-index permissions for all objects (Dataverse collections, datasets, and files) in the system. This replaces the previous synchronous process which could cause timeouts on large installations. + +- The new endpoints are `POST /api/admin/index/perms` (asynchronous, all objects) and `POST /api/admin/index/perms/{id}` (synchronous, single object). +- Both endpoints require superuser access. +- For the asynchronous endpoint, if an indexing process is already in progress, the API will return a 409 Conflict status. +- These endpoints are documented in the Solr Search Index section of the Admin Guide. 
diff --git a/doc/sphinx-guides/source/admin/solr-search-index.rst b/doc/sphinx-guides/source/admin/solr-search-index.rst index 3f7b9d5b547..27eb7fab6f4 100644 --- a/doc/sphinx-guides/source/admin/solr-search-index.rst +++ b/doc/sphinx-guides/source/admin/solr-search-index.rst @@ -96,6 +96,37 @@ This API will clear the Solr entry for the dataset specified. It can be useful i This can be reversed of course by re-indexing the dataset with the API above. +Reindexing Permissions +---------------------- + +It is possible to just re-index the permissions on Solr entries without re-indexing the content. + +Reindexing Permissions for All Objects +++++++++++++++++++++++++++++++++++++++ + +Re-index permissions for all Dataverse collections, datasets, and files. This is an asynchronous operation that may take a long time to complete on large installations. This endpoint requires the API token of a superuser. + +.. code-block:: bash + + export SERVER_URL=https://demo.dataverse.org + curl -X POST -H "X-Dataverse-key:$API_TOKEN" "$SERVER_URL/api/admin/index/perms" + +If indexing is already in progress, the API will return a 409 ("CONFLICT") response. Otherwise, it will return a 200 ("OK") response and the indexing process will start in the background. Check the server logs for progress. + +Reindexing Permissions for a Single Object +++++++++++++++++++++++++++++++++++++++++++ + +It is also possible to re-index permissions for a single object by database ID: + +.. code-block:: bash + + export SERVER_URL=https://demo.dataverse.org + export ID=42 + curl -X POST -H "X-Dataverse-key:$API_TOKEN" "$SERVER_URL/api/admin/index/perms/$ID" + +This operation is performed synchronously. 
+ + Manually Querying Solr ---------------------- diff --git a/doc/sphinx-guides/source/api/changelog.rst b/doc/sphinx-guides/source/api/changelog.rst index fd2dd68f4c4..a9050e3b83b 100644 --- a/doc/sphinx-guides/source/api/changelog.rst +++ b/doc/sphinx-guides/source/api/changelog.rst @@ -7,6 +7,14 @@ This API changelog is experimental and we would love feedback on its usefulness. :local: :depth: 1 +v6.12 +----- +- The permission reindexing endpoints have been updated to use ``POST`` and require superuser access. They are now documented in the :doc:`/admin/solr-search-index` guide. + + - **/api/admin/index/perms** + + - **/api/admin/index/perms/{id}** + v6.10 ----- - The following GET APIs will now return ``400`` if a required Guestbook Response is not supplied. A Guestbook Response can be passed to these APIs in the JSON body using a POST call. See the notes under :ref:`basic-file-access` and :ref:`download-by-dataset-by-version` for details. diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Index.java b/src/main/java/edu/harvard/iq/dataverse/api/Index.java index 39e93d445b1..8a21de8e296 100644 --- a/src/main/java/edu/harvard/iq/dataverse/api/Index.java +++ b/src/main/java/edu/harvard/iq/dataverse/api/Index.java @@ -59,6 +59,7 @@ import jakarta.validation.ConstraintViolationException; import jakarta.ws.rs.DELETE; import jakarta.ws.rs.GET; +import jakarta.ws.rs.POST; import jakarta.ws.rs.Path; import jakarta.ws.rs.PathParam; import jakarta.ws.rs.QueryParam; @@ -369,22 +370,43 @@ public Response indexMod(@QueryParam("partitions") long partitions, @QueryParam( return ok(response); } - @GET + @POST + @AuthRequired @Path("perms") - public Response indexAllPermissions() { - solrIndexService.asyncIndexAllPermissions(); - return ok("Asynchronous indexing of all permissions has been started. 
Check the server logs for progress."); + public Response indexAllPermissions(@Context ContainerRequestContext crc) { + try { + AuthenticatedUser user = getRequestAuthenticatedUserOrDie(crc); + if (!user.isSuperuser()) { + return error(Status.FORBIDDEN, "Superusers only."); + } + if (solrIndexService.isIndexingPermissionsInProgress()) { + return conflict("Asynchronous indexing of all permissions is already in progress."); + } + solrIndexService.asyncIndexAllPermissions(); + return ok("Asynchronous indexing of all permissions has been started. Check the server logs for progress."); + } catch (WrappedResponse wr) { + return wr.getResponse(); + } } - @GET + @POST + @AuthRequired @Path("perms/{id}") - public Response indexPermissions(@PathParam("id") Long id) { - DvObject dvObject = dvObjectService.findDvObject(id); - if (dvObject == null) { - return error(Status.BAD_REQUEST, "Could not find DvObject based on id " + id); - } else { - IndexResponse indexResponse = solrIndexService.indexPermissionsForOneDvObject(dvObject); - return ok(indexResponse.getMessage()); + public Response indexPermissions(@Context ContainerRequestContext crc, @PathParam("id") Long id) { + try { + AuthenticatedUser user = getRequestAuthenticatedUserOrDie(crc); + if (!user.isSuperuser()) { + return error(Status.FORBIDDEN, "Superusers only."); + } + DvObject dvObject = dvObjectService.findDvObject(id); + if (dvObject == null) { + return error(Status.BAD_REQUEST, "Could not find DvObject based on id " + id); + } else { + IndexResponse indexResponse = solrIndexService.indexPermissionsForOneDvObject(dvObject); + return ok(indexResponse.getMessage()); + } + } catch (WrappedResponse wr) { + return wr.getResponse(); } } /** diff --git a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java index 3331172f894..e4794b88cb1 100644 --- a/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java +++ 
b/src/main/java/edu/harvard/iq/dataverse/search/SolrIndexServiceBean.java @@ -23,6 +23,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.logging.Level; import java.util.logging.Logger; import java.util.stream.Stream; @@ -48,6 +49,8 @@ public class SolrIndexServiceBean { private static final Logger logger = Logger.getLogger(SolrIndexServiceBean.class.getCanonicalName()); + private static final AtomicBoolean indexingInProgress = new AtomicBoolean(false); + @EJB private SolrIndexServiceBean self; // Self-injection to allow calling methods in new transactions (from other methods in this bean) @@ -191,11 +194,21 @@ private String getDatasetOrDataFileSolrEnding(DatasetVersion.VersionState versio } } + public boolean isIndexingPermissionsInProgress() { + return indexingInProgress.get(); + } + @Asynchronous + @TransactionAttribute(TransactionAttributeType.NOT_SUPPORTED) public void asyncIndexAllPermissions() { + if (!indexingInProgress.compareAndSet(false, true)) { + logger.info("Asynchronous indexing of all permissions is already in progress. 
Skipping this invocation."); return; + } + logger.info("Starting asynchronous indexing of all permissions"); long startTime = System.currentTimeMillis(); - + try { // Get ALL dataverses in the system @@ -242,6 +255,8 @@ public void asyncIndexAllPermissions() { } catch (Exception e) { logger.log(Level.SEVERE, "Error during asynchronous permission indexing", e); + } finally { + indexingInProgress.set(false); } } From 677277926ae131313cf1a020b0582d888a83df90 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 23 Jun 2026 11:10:58 -0400 Subject: [PATCH 4/5] release note update --- doc/release-notes/12200-async-reindexing-permissions.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/doc/release-notes/12200-async-reindexing-permissions.md b/doc/release-notes/12200-async-reindexing-permissions.md index 22641a917ca..4ca976b3331 100644 --- a/doc/release-notes/12200-async-reindexing-permissions.md +++ b/doc/release-notes/12200-async-reindexing-permissions.md @@ -1,8 +1,9 @@ # Asynchronous Permissions Reindexing -An asynchronous API endpoint has been added to re-index permissions for all objects (Dataverse collections, datasets, and files) in the system. This replaces the previous synchronous process which could cause timeouts on large installations. +The previously undocumented Solr permissions reindexing API endpoints have been improved. -- The new endpoints are `POST /api/admin/index/perms` (asynchronous, all objects) and `POST /api/admin/index/perms/{id}` (synchronous, single object). +- The endpoints `/api/admin/index/perms` (asynchronous, all objects) and `/api/admin/index/perms/{id}` (synchronous, single object) now use POST instead of GET. - Both endpoints require superuser access. -- For the asynchronous endpoint, if an indexing process is already in progress, the API will return a 409 Conflict status. -- These endpoints are documented in the Solr Search Index section of the Admin Guide. 
+- For the asynchronous reindex-all endpoint, if an indexing process is already in progress, the API will return a 409 Conflict status. +- The asynchronous reindex-all endpoint no longer runs as a single transaction. This avoids potential timeouts in larger installations. +- These endpoints are now documented in the Solr Search Index section of the Admin Guide. From 024799ff6f2a1d4af7f41d5053822aac73519fe7 Mon Sep 17 00:00:00 2001 From: Jim Myers Date: Tue, 23 Jun 2026 11:15:09 -0400 Subject: [PATCH 5/5] fix missing line --- doc/sphinx-guides/source/api/changelog.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/sphinx-guides/source/api/changelog.rst b/doc/sphinx-guides/source/api/changelog.rst index 906647d335a..10e368da51f 100644 --- a/doc/sphinx-guides/source/api/changelog.rst +++ b/doc/sphinx-guides/source/api/changelog.rst @@ -26,6 +26,7 @@ v6.11 - The ``GET /api/access/datafile/{id}/userPermissions`` endpoint now requires authentication. - The Croissant :ref:`metadata export format ` has been updated from version 1.0 to 1.1, which is reflected in the ``conformsTo`` property. The unused ``wd`` property has been dropped. + v6.10 ----- - The following GET APIs will now return ``400`` if a required Guestbook Response is not supplied. A Guestbook Response can be passed to these APIs in the JSON body using a POST call. See the notes under :ref:`basic-file-access` and :ref:`download-by-dataset-by-version` for details.