Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport 2.x] remote publication checksum stats #15960

Merged
merged 1 commit into from
Sep 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import static org.opensearch.gateway.remote.RemoteClusterStateService.REMOTE_CLUSTER_STATE_ENABLED_SETTING;
import static org.opensearch.gateway.remote.RemoteClusterStateService.REMOTE_PUBLICATION_SETTING_KEY;
import static org.opensearch.gateway.remote.RemoteClusterStateUtils.DELIMITER;
import static org.opensearch.gateway.remote.RemoteDownloadStats.CHECKSUM_VALIDATION_FAILED_COUNT;
import static org.opensearch.gateway.remote.model.RemoteClusterBlocks.CLUSTER_BLOCKS;
import static org.opensearch.gateway.remote.model.RemoteCoordinationMetadata.COORDINATION_METADATA;
import static org.opensearch.gateway.remote.model.RemoteCustomMetadata.CUSTOM_METADATA;
Expand Down Expand Up @@ -228,10 +229,28 @@ private void assertDataNodeDownloadStats(NodesStatsResponse nodesStatsResponse)
assertTrue(dataNodeDiscoveryStats.getClusterStateStats().getPersistenceStats().get(0).getSuccessCount() > 0);
assertEquals(0, dataNodeDiscoveryStats.getClusterStateStats().getPersistenceStats().get(0).getFailedCount());
assertTrue(dataNodeDiscoveryStats.getClusterStateStats().getPersistenceStats().get(0).getTotalTimeInMillis() > 0);
assertEquals(
0,
dataNodeDiscoveryStats.getClusterStateStats()
.getPersistenceStats()
.get(0)
.getExtendedFields()
.get(CHECKSUM_VALIDATION_FAILED_COUNT)
.get()
);

assertTrue(dataNodeDiscoveryStats.getClusterStateStats().getPersistenceStats().get(1).getSuccessCount() > 0);
assertEquals(0, dataNodeDiscoveryStats.getClusterStateStats().getPersistenceStats().get(1).getFailedCount());
assertTrue(dataNodeDiscoveryStats.getClusterStateStats().getPersistenceStats().get(1).getTotalTimeInMillis() > 0);
assertEquals(
0,
dataNodeDiscoveryStats.getClusterStateStats()
.getPersistenceStats()
.get(1)
.getExtendedFields()
.get(CHECKSUM_VALIDATION_FAILED_COUNT)
.get()
);
}

private Map<String, Integer> getMetadataFiles(BlobStoreRepository repository, String subDirectory) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1619,6 +1619,12 @@ void validateClusterStateFromChecksum(
failedValidation
)
);
if (isFullStateDownload) {
remoteStateStats.stateFullDownloadValidationFailed();
} else {
remoteStateStats.stateDiffDownloadValidationFailed();
}

if (isFullStateDownload && remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.FAILURE)) {
throw new IllegalStateException(
"Cluster state checksums do not match during full state read. Validation failed for " + failedValidation
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* The OpenSearch Contributors require contributions made to
* this file be licensed under the Apache-2.0 license or a
* compatible open source license.
*/

package org.opensearch.gateway.remote;

import org.opensearch.cluster.coordination.PersistedStateStats;

import java.util.concurrent.atomic.AtomicLong;

/**
 * Download stats for remote state.
 *
 * <p>Extends {@link PersistedStateStats} with a counter of checksum validation failures. The
 * counter is registered as an extended field under the key
 * {@link #CHECKSUM_VALIDATION_FAILED_COUNT} when the instance is constructed.
 *
 * @opensearch.internal
 */
public class RemoteDownloadStats extends PersistedStateStats {
    /** Key under which the checksum validation failure counter is registered as an extended field. */
    static final String CHECKSUM_VALIDATION_FAILED_COUNT = "checksum_validation_failed_count";

    // Must be final: this exact AtomicLong instance is handed to addToExtendedFields in the
    // constructor, so reassigning the field would detach the registered counter from the one
    // that is incremented and read below.
    private final AtomicLong checksumValidationFailedCount = new AtomicLong(0);

    /**
     * Creates download stats with the given name and registers the checksum validation failure
     * counter as an extended field.
     *
     * @param statsName name passed to the {@link PersistedStateStats} constructor
     */
    public RemoteDownloadStats(String statsName) {
        super(statsName);
        addToExtendedFields(CHECKSUM_VALIDATION_FAILED_COUNT, checksumValidationFailedCount);
    }

    /**
     * Records one checksum validation failure by incrementing the counter by one.
     * Despite its name, this method is a mutator, not a getter.
     */
    public void checksumValidationFailedCount() {
        checksumValidationFailedCount.incrementAndGet();
    }

    /**
     * Returns the current value of the checksum validation failure counter.
     *
     * @return number of checksum validation failures recorded so far
     */
    public long getChecksumValidationFailedCount() {
        return checksumValidationFailedCount.get();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -18,16 +18,16 @@
public class RemotePersistenceStats {

RemoteUploadStats remoteUploadStats;
PersistedStateStats remoteDiffDownloadStats;
PersistedStateStats remoteFullDownloadStats;
RemoteDownloadStats remoteDiffDownloadStats;
RemoteDownloadStats remoteFullDownloadStats;

final String FULL_DOWNLOAD_STATS = "remote_full_download";
final String DIFF_DOWNLOAD_STATS = "remote_diff_download";

public RemotePersistenceStats() {
remoteUploadStats = new RemoteUploadStats();
remoteDiffDownloadStats = new PersistedStateStats(DIFF_DOWNLOAD_STATS);
remoteFullDownloadStats = new PersistedStateStats(FULL_DOWNLOAD_STATS);
remoteDiffDownloadStats = new RemoteDownloadStats(DIFF_DOWNLOAD_STATS);
remoteFullDownloadStats = new RemoteDownloadStats(FULL_DOWNLOAD_STATS);
}

public void cleanUpAttemptFailed() {
Expand Down Expand Up @@ -90,6 +90,22 @@ public void stateDiffDownloadFailed() {
remoteDiffDownloadStats.stateFailed();
}

/**
 * Records a checksum validation failure for a diff state download by delegating to
 * {@code remoteDiffDownloadStats.checksumValidationFailedCount()}.
 */
public void stateDiffDownloadValidationFailed() {
remoteDiffDownloadStats.checksumValidationFailedCount();
}

/**
 * Records a checksum validation failure for a full state download by delegating to
 * {@code remoteFullDownloadStats.checksumValidationFailedCount()}.
 */
public void stateFullDownloadValidationFailed() {
remoteFullDownloadStats.checksumValidationFailedCount();
}

/**
 * Returns the checksum validation failure count reported by the diff download stats.
 *
 * @return value of {@code remoteDiffDownloadStats.getChecksumValidationFailedCount()}
 */
public long getStateDiffDownloadValidationFailed() {
return remoteDiffDownloadStats.getChecksumValidationFailedCount();
}

/**
 * Returns the checksum validation failure count reported by the full download stats.
 *
 * @return value of {@code remoteFullDownloadStats.getChecksumValidationFailedCount()}
 */
public long getStateFullDownloadValidationFailed() {
return remoteFullDownloadStats.getChecksumValidationFailedCount();
}

/**
 * Returns the upload stats held by this instance.
 *
 * @return the {@code remoteUploadStats} field, exposed as a {@link PersistedStateStats}
 */
public PersistedStateStats getUploadStats() {
return remoteUploadStats;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3232,6 +3232,7 @@ public void testGetClusterStateForManifestWithChecksumValidationEnabledWithNullC
anyString(),
anyBoolean()
);
assertEquals(0, remoteClusterStateService.getRemoteStateStats().getStateFullDownloadValidationFailed());
}

public void testGetClusterStateForManifestWithChecksumValidationEnabled() throws IOException {
Expand Down Expand Up @@ -3264,6 +3265,7 @@ public void testGetClusterStateForManifestWithChecksumValidationEnabled() throws
);
mockService.getClusterStateForManifest(ClusterName.DEFAULT.value(), manifest, NODE_ID, true);
verify(mockService, times(1)).validateClusterStateFromChecksum(manifest, clusterState, ClusterName.DEFAULT.value(), NODE_ID, true);
assertEquals(0, remoteClusterStateService.getRemoteStateStats().getStateFullDownloadValidationFailed());
}

public void testGetClusterStateForManifestWithChecksumValidationModeNone() throws IOException {
Expand Down Expand Up @@ -3296,6 +3298,7 @@ public void testGetClusterStateForManifestWithChecksumValidationModeNone() throw
);
mockService.getClusterStateForManifest(ClusterName.DEFAULT.value(), manifest, NODE_ID, true);
verify(mockService, times(0)).validateClusterStateFromChecksum(any(), any(), anyString(), anyString(), anyBoolean());
assertEquals(0, remoteClusterStateService.getRemoteStateStats().getStateFullDownloadValidationFailed());
}

public void testGetClusterStateForManifestWithChecksumValidationEnabledWithMismatch() throws IOException {
Expand Down Expand Up @@ -3338,6 +3341,7 @@ public void testGetClusterStateForManifestWithChecksumValidationEnabledWithMisma
NODE_ID,
true
);
assertEquals(1, remoteClusterStateService.getRemoteStateStats().getStateFullDownloadValidationFailed());
}

public void testGetClusterStateForManifestWithChecksumValidationDebugWithMismatch() throws IOException {
Expand Down Expand Up @@ -3384,6 +3388,7 @@ public void testGetClusterStateForManifestWithChecksumValidationDebugWithMismatc
NODE_ID,
true
);
assertEquals(1, remoteClusterStateService.getRemoteStateStats().getStateFullDownloadValidationFailed());
}

public void testGetClusterStateUsingDiffWithChecksum() throws IOException {
Expand Down Expand Up @@ -3425,6 +3430,7 @@ public void testGetClusterStateUsingDiffWithChecksum() throws IOException {
eq(NODE_ID),
eq(false)
);
assertEquals(0, remoteClusterStateService.getRemoteStateStats().getStateDiffDownloadValidationFailed());
}

public void testGetClusterStateUsingDiffWithChecksumModeNone() throws IOException {
Expand Down Expand Up @@ -3466,6 +3472,7 @@ public void testGetClusterStateUsingDiffWithChecksumModeNone() throws IOExceptio
eq(NODE_ID),
eq(false)
);
assertEquals(0, remoteClusterStateService.getRemoteStateStats().getStateDiffDownloadValidationFailed());
}

public void testGetClusterStateUsingDiffWithChecksumModeDebugMismatch() throws IOException {
Expand Down Expand Up @@ -3506,6 +3513,7 @@ public void testGetClusterStateUsingDiffWithChecksumModeDebugMismatch() throws I
eq(NODE_ID),
eq(false)
);
assertEquals(1, remoteClusterStateService.getRemoteStateStats().getStateDiffDownloadValidationFailed());
}

public void testGetClusterStateUsingDiffWithChecksumModeTraceMismatch() throws IOException {
Expand Down Expand Up @@ -3567,6 +3575,7 @@ public void testGetClusterStateUsingDiffWithChecksumModeTraceMismatch() throws I
eq(NODE_ID),
eq(false)
);
assertEquals(1, remoteClusterStateService.getRemoteStateStats().getStateDiffDownloadValidationFailed());
}

public void testGetClusterStateUsingDiffWithChecksumMismatch() throws IOException {
Expand Down Expand Up @@ -3628,6 +3637,7 @@ public void testGetClusterStateUsingDiffWithChecksumMismatch() throws IOExceptio
eq(NODE_ID),
eq(false)
);
assertEquals(1, remoteClusterStateService.getRemoteStateStats().getStateDiffDownloadValidationFailed());
}

private void mockObjectsForGettingPreviousClusterUUID(Map<String, String> clusterUUIDsPointers) throws IOException {
Expand Down
Loading