diff --git a/doc/release-notes/3437-new-index-api-added.md b/doc/release-notes/3437-new-index-api-added.md
new file mode 100644
index 00000000000..2f40c65073f
--- /dev/null
+++ b/doc/release-notes/3437-new-index-api-added.md
@@ -0,0 +1,4 @@
+(This API was added as a side feature of PR #10222. The main point of that PR was an improvement in the OAI set housekeeping logic, which I believe is too obscure a part of the system to warrant a release note by itself; but the new API below needs to be announced.)
+
+A new Index API endpoint has been added, allowing an admin to clear an individual dataset from Solr.
+
diff --git a/doc/sphinx-guides/source/admin/solr-search-index.rst b/doc/sphinx-guides/source/admin/solr-search-index.rst
index e6f7b588ede..3f7b9d5b547 100644
--- a/doc/sphinx-guides/source/admin/solr-search-index.rst
+++ b/doc/sphinx-guides/source/admin/solr-search-index.rst
@@ -26,8 +26,8 @@
 Remove all Solr documents that are orphaned (i.e. not associated with objects in the database):
 
 ``curl http://localhost:8080/api/admin/index/clear-orphans``
 
-Clearing Data from Solr
-~~~~~~~~~~~~~~~~~~~~~~~
+Clearing ALL Data from Solr
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Please note that the moment you issue this command, it will appear to end users looking at the root Dataverse installation page that all data is gone! This is because the root Dataverse installation page is powered by the search index.
 
@@ -86,6 +86,16 @@
 To re-index a dataset by its database ID:
 
 ``curl http://localhost:8080/api/admin/index/datasets/7504557``
 
+Clearing a Dataset from Solr
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This API clears the Solr entry for the specified dataset. It can be useful if you have reasons to hide a published dataset from search results and from Collection pages, but don't want to destroy and purge it from the database just yet.
+
+``curl -X DELETE http://localhost:8080/api/admin/index/datasets/7504557``
+
+This can of course be reversed by re-indexing the dataset with the API above.
+
+
 Manually Querying Solr
 ----------------------
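(A quick sanity check of the new endpoint; this aside is not part of the patch. It assumes a local installation at localhost:8080 and the example database id 7504557 used in the guide above:

    # Hide the dataset from search results without touching the database:
    curl -X DELETE http://localhost:8080/api/admin/index/datasets/7504557

    # Bring it back by re-indexing it:
    curl http://localhost:8080/api/admin/index/datasets/7504557

The DELETE removes only the dataset's Solr document; the database record, its files, and its OAI records are untouched.)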
diff --git a/src/main/java/edu/harvard/iq/dataverse/api/Index.java b/src/main/java/edu/harvard/iq/dataverse/api/Index.java
index 4910c460b6a..c30a77acb58 100644
--- a/src/main/java/edu/harvard/iq/dataverse/api/Index.java
+++ b/src/main/java/edu/harvard/iq/dataverse/api/Index.java
@@ -215,7 +215,7 @@ public Response clearSolrIndex() {
             return error(Status.INTERNAL_SERVER_ERROR, ex.getLocalizedMessage());
         }
     }
-    
+
     @GET
     @Path("{type}/{id}")
     public Response indexTypeById(@PathParam("type") String type, @PathParam("id") Long id) {
@@ -326,6 +326,29 @@ public Response indexDatasetByPersistentId(@QueryParam("persistentId") String pe
         }
     }
 
+    /**
+     * Clears the entry for a dataset from Solr
+     *
+     * @param id numeric database id of the dataset
+     * @return response;
+     * will return 404 if no such dataset is found in the database, but will
+     * attempt to clear the entry from Solr regardless.
+     */
+    @DELETE
+    @Path("datasets/{id}")
+    public Response clearDatasetFromIndex(@PathParam("id") Long id) {
+        Dataset dataset = datasetService.find(id);
+        // We'll attempt to delete the Solr document regardless of whether the
+        // dataset exists in the database:
+        String response = indexService.removeSolrDocFromIndex(IndexServiceBean.solrDocIdentifierDataset + id);
+        if (dataset != null) {
+            return ok("Sent request to clear Solr document for dataset " + id + ": " + response);
+        } else {
+            return notFound("Could not find dataset " + id + " in the database. Requested to clear from Solr anyway: " + response);
+        }
+    }
+
+
     /**
      * This is just a demo of the modular math logic we use for indexAll.
     */
diff --git a/src/main/java/edu/harvard/iq/dataverse/harvest/server/OAIRecordServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/harvest/server/OAIRecordServiceBean.java
index 1b4a7bc7db0..cc15d4c978b 100644
--- a/src/main/java/edu/harvard/iq/dataverse/harvest/server/OAIRecordServiceBean.java
+++ b/src/main/java/edu/harvard/iq/dataverse/harvest/server/OAIRecordServiceBean.java
@@ -40,10 +40,6 @@
 @Stateless
 @Named
 public class OAIRecordServiceBean implements java.io.Serializable {
-    @EJB
-    OAISetServiceBean oaiSetService;
-    @EJB
-    IndexServiceBean indexService;
     @EJB
     DatasetServiceBean datasetService;
     @EJB
@@ -55,13 +51,24 @@ public class OAIRecordServiceBean implements java.io.Serializable {
     EntityManager em;
 
     private static final Logger logger = Logger.getLogger("edu.harvard.iq.dataverse.harvest.server.OAIRecordServiceBean");
-    
-    public void updateOaiRecords(String setName, List<Long> datasetIds, Date updateTime, boolean doExport) {
-        updateOaiRecords(setName, datasetIds, updateTime, doExport, logger);
-    }
-    public void updateOaiRecords(String setName, List<Long> datasetIds, Date updateTime, boolean doExport, Logger setUpdateLogger) {
-        
+    /**
+     * Updates the OAI records for the set specified
+     * @param setName name of the OAI set
+     * @param datasetIds ids of the datasets that are candidates for this OAI set
+     * @param updateTime timestamp
+     * @param doExport attempt to export datasets that haven't been exported yet
+     * @param confirmed true if the datasetIds above were looked up in the
+     *          database, as opposed to the search engine; meaning, it is
+     *          confirmed that any dataset not on this list that's currently
+     *          in the set is no longer in the database and should be
+     *          marked as deleted without any further checks. Otherwise
+     *          we'll want to double-check whether the dataset still exists
+     *          as published. This is to prevent marking existing datasets
+     *          as deleted during a full reindex etc.
+     * @param setUpdateLogger dedicated Logger
+     */
+    public void updateOaiRecords(String setName, List<Long> datasetIds, Date updateTime, boolean doExport, boolean confirmed, Logger setUpdateLogger) {
         // create Map of OaiRecords
         List<OAIRecord> oaiRecords = findOaiRecordsBySetName(setName);
         Map<String, OAIRecord> recordMap = new HashMap<>();
@@ -101,9 +108,6 @@ public void updateOaiRecords(String setName, List<Long> datasetIds, Date updateT
                 DatasetVersion releasedVersion = dataset.getReleasedVersion();
                 Date publicationDate = releasedVersion == null ? null : releasedVersion.getReleaseTime();
 
-                //if (dataset.getPublicationDate() != null
-                //        && (dataset.getLastExportTime() == null
-                //        || dataset.getLastExportTime().before(dataset.getPublicationDate()))) {
                 if (publicationDate != null
                         && (dataset.getLastExportTime() == null
                         || dataset.getLastExportTime().before(publicationDate))) {
@@ -125,7 +129,9 @@ public void updateOaiRecords(String setName, List<Long> datasetIds, Date updateT
         }
 
         // anything left in the map should be marked as removed!
-        markOaiRecordsAsRemoved( recordMap.values(), updateTime, setUpdateLogger);
+        markOaiRecordsAsRemoved(recordMap.values(), updateTime, confirmed, setUpdateLogger);
+
+
     }
 
@@ -162,7 +168,7 @@ record = new OAIRecord(setName, dataset.getGlobalId().asString(), new Date());
             }
         }
-    
+/*
 // Updates any existing OAI records for this dataset
 // Should be called whenever there's a change in the release status of the Dataset
 // (i.e., when it's published or deaccessioned), so that the timestamps and
@@ -201,13 +207,31 @@ public void updateOaiRecordsForDataset(Dataset dataset) {
             logger.fine("Null returned - no records found.");
         }
     }
+*/
 
-    public void markOaiRecordsAsRemoved(Collection<OAIRecord> records, Date updateTime, Logger setUpdateLogger) {
+    public void markOaiRecordsAsRemoved(Collection<OAIRecord> records, Date updateTime, boolean confirmed, Logger setUpdateLogger) {
         for (OAIRecord oaiRecord : records) {
             if ( !oaiRecord.isRemoved() ) {
-                setUpdateLogger.fine("marking OAI record "+oaiRecord.getGlobalId()+" as removed");
-                oaiRecord.setRemoved(true);
-                oaiRecord.setLastUpdateTime(updateTime);
+                boolean confirmedRemoved = confirmed;
+                if (!confirmedRemoved) {
+                    Dataset lookedUp = datasetService.findByGlobalId(oaiRecord.getGlobalId());
+                    if (lookedUp == null) {
+                        confirmedRemoved = true;
+                    } else if (lookedUp.getLastExportTime() == null) {
+                        confirmedRemoved = true;
+                    } else {
+                        boolean isReleased = lookedUp.getReleasedVersion() != null;
+                        if (!isReleased) {
+                            confirmedRemoved = true;
+                        }
+                    }
+                }
+
+                if (confirmedRemoved) {
+                    setUpdateLogger.fine("marking OAI record "+oaiRecord.getGlobalId()+" as removed");
+                    oaiRecord.setRemoved(true);
+                    oaiRecord.setLastUpdateTime(updateTime);
+                }
             } else {
                 setUpdateLogger.fine("OAI record "+oaiRecord.getGlobalId()+" is already marked as removed.");
             }
diff --git a/src/main/java/edu/harvard/iq/dataverse/harvest/server/OAISetServiceBean.java b/src/main/java/edu/harvard/iq/dataverse/harvest/server/OAISetServiceBean.java
index d5c78c36b98..b3a09391bf3 100644
--- a/src/main/java/edu/harvard/iq/dataverse/harvest/server/OAISetServiceBean.java
+++ b/src/main/java/edu/harvard/iq/dataverse/harvest/server/OAISetServiceBean.java
@@ -171,6 +171,8 @@ public void exportOaiSet(OAISet oaiSet, Logger exportLogger) {
         String query = managedSet.getDefinition();
 
         List<Long> datasetIds;
+        boolean databaseLookup = false; // As opposed to a search engine lookup
+
         try {
             if (!oaiSet.isDefaultSet()) {
                 datasetIds = expandSetQuery(query);
@@ -181,6 +183,7 @@ public void exportOaiSet(OAISet oaiSet, Logger exportLogger) {
                 // including the unpublished drafts and deaccessioned ones.
                 // Those will be filtered out further down the line.
                 datasetIds = datasetService.findAllLocalDatasetIds();
+                databaseLookup = true;
             }
         } catch (OaiSetException ose) {
             datasetIds = null;
@@ -191,7 +194,7 @@ public void exportOaiSet(OAISet oaiSet, Logger exportLogger) {
         // they will be properly marked as "deleted"! -- L.A. 4.5
         //if (datasetIds != null && !datasetIds.isEmpty()) {
         exportLogger.info("Calling OAI Record Service to re-export " + datasetIds.size() + " datasets.");
-        oaiRecordService.updateOaiRecords(managedSet.getSpec(), datasetIds, new Date(), true, exportLogger);
+        oaiRecordService.updateOaiRecords(managedSet.getSpec(), datasetIds, new Date(), true, databaseLookup, exportLogger);
         //}
 
         managedSet.setUpdateInProgress(false);
diff --git a/src/test/java/edu/harvard/iq/dataverse/api/HarvestingServerIT.java b/src/test/java/edu/harvard/iq/dataverse/api/HarvestingServerIT.java
index cffe730a806..57a12224c89 100644
--- a/src/test/java/edu/harvard/iq/dataverse/api/HarvestingServerIT.java
+++ b/src/test/java/edu/harvard/iq/dataverse/api/HarvestingServerIT.java
@@ -23,6 +23,7 @@
 
 import static org.junit.jupiter.api.Assertions.assertFalse;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
@@ -39,6 +40,7 @@ public class HarvestingServerIT {
     private static String adminUserAPIKey;
     private static String singleSetDatasetIdentifier;
     private static String singleSetDatasetPersistentId;
+    private static Integer singleSetDatasetDatabaseId;
     private static List<String> extraDatasetsIdentifiers = new ArrayList<>();
 
     @BeforeAll
@@ -84,7 +86,7 @@ private static void setupDatasets() {
         // create dataset:
         Response createDatasetResponse = UtilIT.createRandomDatasetViaNativeApi(dataverseAlias, adminUserAPIKey);
         createDatasetResponse.prettyPrint();
-        Integer datasetId = UtilIT.getDatasetIdFromResponse(createDatasetResponse);
+        singleSetDatasetDatabaseId = UtilIT.getDatasetIdFromResponse(createDatasetResponse);
 
         // retrieve the global id:
         singleSetDatasetPersistentId = UtilIT.getDatasetPersistentIdFromResponse(createDatasetResponse);
@@ -104,13 +106,13 @@ private static void setupDatasets() {
         // So wait for all of this to finish.
         UtilIT.sleepForReexport(singleSetDatasetPersistentId, adminUserAPIKey, 10);
 
-        // ... And let's create 4 more datasets for a multi-dataset experiment:
+        // ... And let's create 5 more datasets for a multi-dataset experiment:
 
-        for (int i = 0; i < 4; i++) {
+        for (int i = 0; i < 5; i++) {
             // create dataset:
             createDatasetResponse = UtilIT.createRandomDatasetViaNativeApi(dataverseAlias, adminUserAPIKey);
             createDatasetResponse.prettyPrint();
-            datasetId = UtilIT.getDatasetIdFromResponse(createDatasetResponse);
+            Integer datasetId = UtilIT.getDatasetIdFromResponse(createDatasetResponse);
 
             // retrieve the global id:
             String thisDatasetPersistentId = UtilIT.getDatasetPersistentIdFromResponse(createDatasetResponse);
@@ -415,6 +417,11 @@ public void testSetEditAPIandOAIlistSets() throws InterruptedException {
     // OAI set with a single dataset, and attempt to retrieve
     // it and validate the OAI server responses of the corresponding
     // ListIdentifiers, ListRecords and GetRecord methods.
+    // Finally, we will make sure that the exported set survives
+    // a reexport when the control dataset is dropped from the search
+    // index temporarily (if, for example, the site admin cleared their
+    // solr index in order to reindex everything from scratch - which
+    // can take a while on a large database). This is per #3437.
     @Test
     public void testSingleRecordOaiSet() throws InterruptedException {
         // Let's try and create an OAI set with the "single set dataset" that
@@ -569,6 +576,83 @@ public void testSingleRecordOaiSet() throws InterruptedException {
         assertEquals("Medicine, Health and Life Sciences",
                 responseXmlPath.getString("OAI-PMH.GetRecord.record.metadata.dc.subject"));
         // ok, looks legit!
+
+        // Now, let's clear this dataset from Solr:
+        Response solrClearResponse = UtilIT.indexClearDataset(singleSetDatasetDatabaseId);
+        assertEquals(200, solrClearResponse.getStatusCode());
+        solrClearResponse.prettyPrint();
+
+        // Now, let's re-export the set. The search query that defines the set
+        // will no longer find it (todo: confirm this first?). However, since
+        // the dataset still exists in the database, and would in real life
+        // be reindexed again, we don't want to mark the OAI record for the
+        // dataset as "deleted" just yet. (This is a new feature, as of 6.2.)
+        // So, let's re-export the set...
+
+        exportSetResponse = UtilIT.exportOaiSet(setName);
+        assertEquals(200, exportSetResponse.getStatusCode());
+        Thread.sleep(1000L); // wait for just a second, to be safe
+
+        // OAI Test 5. Check ListIdentifiers again:
+
+        Response listIdentifiersResponse = UtilIT.getOaiListIdentifiers(setName, "oai_dc");
+        assertEquals(OK.getStatusCode(), listIdentifiersResponse.getStatusCode());
+
+        // Validate the service section of the OAI response:
+        responseXmlPath = validateOaiVerbResponse(listIdentifiersResponse, "ListIdentifiers");
+
+        // ... and confirm that the record for our dataset is still listed
+        // as active:
+        List<String> ret = responseXmlPath.getList("OAI-PMH.ListIdentifiers.header");
+
+        assertEquals(1, ret.size());
+        assertEquals(singleSetDatasetPersistentId, responseXmlPath
+                .getString("OAI-PMH.ListIdentifiers.header.identifier"));
+        assertEquals(setName, responseXmlPath
+                .getString("OAI-PMH.ListIdentifiers.header.setSpec"));
+        // ... and, most importantly, make sure the record does not have a
+        // `status="deleted"` attribute:
+        assertNull(responseXmlPath.getString("OAI-PMH.ListIdentifiers.header.@status"));
+
+        // While we are at it, let's now destroy this dataset for real, and
+        // make sure the "deleted" attribute is actually added once the set
+        // is re-exported:
+
+        Response destroyDatasetResponse = UtilIT.destroyDataset(singleSetDatasetPersistentId, adminUserAPIKey);
+        assertEquals(200, destroyDatasetResponse.getStatusCode());
+        destroyDatasetResponse.prettyPrint();
+
+        // Confirm that it no longer exists:
+        Response datasetNotFoundResponse = UtilIT.nativeGet(singleSetDatasetDatabaseId, adminUserAPIKey);
+        assertEquals(404, datasetNotFoundResponse.getStatusCode());
+
+        // Repeat the whole production with re-exporting the set and checking
+        // ListIdentifiers:
+
+        exportSetResponse = UtilIT.exportOaiSet(setName);
+        assertEquals(200, exportSetResponse.getStatusCode());
+        Thread.sleep(1000L); // wait for just a second, to be safe
+        System.out.println("re-exported the set again, with the control dataset destroyed");
+
+        // OAI Test 6. Check ListIdentifiers again:
+
+        listIdentifiersResponse = UtilIT.getOaiListIdentifiers(setName, "oai_dc");
+        assertEquals(OK.getStatusCode(), listIdentifiersResponse.getStatusCode());
+
+        // Validate the service section of the OAI response:
+        responseXmlPath = validateOaiVerbResponse(listIdentifiersResponse, "ListIdentifiers");
+
+        // ... and confirm that the record for our dataset is still listed...
+        ret = responseXmlPath.getList("OAI-PMH.ListIdentifiers.header");
+        assertEquals(1, ret.size());
+        assertEquals(singleSetDatasetPersistentId, responseXmlPath
+                .getString("OAI-PMH.ListIdentifiers.header.identifier"));
+
+        // ... BUT, it should be marked as "deleted" now:
+        assertEquals("deleted", responseXmlPath.getString("OAI-PMH.ListIdentifiers.header.@status"));
     }
 
@@ -589,9 +673,13 @@ public void testMultiRecordOaiSet() throws InterruptedException {
         // in the class init:
 
         String setName = UtilIT.getRandomString(6);
-        String setQuery = "(dsPersistentId:" + singleSetDatasetIdentifier;
+        String setQuery = "";
         for (String persistentId : extraDatasetsIdentifiers) {
-            setQuery = setQuery.concat(" OR dsPersistentId:" + persistentId);
+            if (setQuery.equals("")) {
+                setQuery = "(dsPersistentId:" + persistentId;
+            } else {
+                setQuery = setQuery.concat(" OR dsPersistentId:" + persistentId);
+            }
         }
         setQuery = setQuery.concat(")");
 
@@ -732,7 +820,6 @@ public void testMultiRecordOaiSet() throws InterruptedException {
 
         boolean allDatasetsListed = true;
 
-        allDatasetsListed = persistentIdsInListIdentifiers.contains(singleSetDatasetIdentifier);
         for (String persistentId : extraDatasetsIdentifiers) {
             allDatasetsListed = allDatasetsListed && persistentIdsInListIdentifiers.contains(persistentId);
         }
@@ -857,12 +944,11 @@ public void testMultiRecordOaiSet() throws InterruptedException {
         // Record the last identifier listed on this final page:
         persistentIdsInListRecords.add(ret.get(0).substring(ret.get(0).lastIndexOf('/') + 1));
 
-        // Finally, let's confirm that the expected 5 datasets have been listed
+        // Finally, let's confirm again that the expected 5 datasets have been listed
         // as part of this Set:
 
         allDatasetsListed = true;
 
-        allDatasetsListed = persistentIdsInListRecords.contains(singleSetDatasetIdentifier);
         for (String persistentId : extraDatasetsIdentifiers) {
             allDatasetsListed = allDatasetsListed && persistentIdsInListRecords.contains(persistentId);
         }
@@ -905,7 +991,7 @@ public void testInvalidQueryParams() {
     // TODO:
    // What else can we test?
     // Some ideas:
-    // - Test handling of deleted dataset records
+    // - Test handling of deleted dataset records - DONE!
     // - Test "from" and "until" time parameters
     // - Validate full verb response records against XML schema
     // (for each supported metadata format, possibly?)
diff --git a/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java b/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java
index ec41248a65f..93200f00d51 100644
--- a/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java
+++ b/src/test/java/edu/harvard/iq/dataverse/api/UtilIT.java
@@ -1494,6 +1494,11 @@ static Response reindexDataset(String persistentId) {
         return response;
     }
 
+    static Response indexClearDataset(Integer datasetId) {
+        return given()
+                .delete("/api/admin/index/datasets/" + datasetId);
+    }
+
     static Response reindexDataverse(String dvId) {
         Response response = given()
                 .get("/api/admin/index/dataverses/" + dvId);
@@ -2066,7 +2071,7 @@ static Response indexClear() {
         return given()
                 .get("/api/admin/index/clear");
     }
-    
+
     static Response index() {
         return given()
                 .get("/api/admin/index");
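(A manual spot-check of the behavior exercised by OAI Tests 5 and 6 above; this aside is not part of the patch. It assumes a local installation at localhost:8080 with its OAI server at /oai, a dataset with database id 7504557, and an OAI set named "mySet"; all three values are placeholders:

    # Clear the dataset from Solr only. After the set is re-exported, its
    # OAI record should still be listed as active:
    curl -X DELETE http://localhost:8080/api/admin/index/datasets/7504557
    curl "http://localhost:8080/oai?verb=ListIdentifiers&set=mySet&metadataPrefix=oai_dc"

    # Only after the dataset is destroyed for real, and the set is
    # re-exported once more, should its header carry status="deleted".
)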