Skip to content

Commit

Permalink
Track RequestedRangeNotSatisfiedException separately in S3 Metrics (#…
Browse files Browse the repository at this point in the history
…109657)

Due to RCO changes, we started getting a lot of `RequestedRangeNotSatisfiedExceptions` which are expected. We would like track them separately.

This change adds two new metrics to track all client errors analogous to other S3 errors.

* es.repositories.exceptions.request_range_not_satisfied.total
* es.repositories.exceptions.request_range_not_satisfied.histogram

In the future, we can add the error code as an attribute to the metrics, so we can adapt it to all client errors.
  • Loading branch information
arteam authored Jun 20, 2024
1 parent e79ee17 commit 3566ee9
Show file tree
Hide file tree
Showing 4 changed files with 88 additions and 11 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/109657.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 109657
summary: Track `RequestedRangeNotSatisfiedException` separately in S3 Metrics
area: Snapshot/Restore
type: enhancement
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.blobstore.BlobContainer;
import org.elasticsearch.common.blobstore.BlobPath;
import org.elasticsearch.common.blobstore.BlobStore;
import org.elasticsearch.common.blobstore.OperationPurpose;
import org.elasticsearch.common.bytes.BytesArray;
import org.elasticsearch.common.collect.Iterators;
Expand All @@ -23,6 +22,7 @@
import org.elasticsearch.plugins.PluginsService;
import org.elasticsearch.repositories.RepositoriesService;
import org.elasticsearch.repositories.blobstore.BlobStoreRepository;
import org.elasticsearch.repositories.blobstore.RequestedRangeNotSatisfiedException;
import org.elasticsearch.repositories.s3.S3BlobStore.Operation;
import org.elasticsearch.rest.RestStatus;
import org.elasticsearch.telemetry.Measurement;
Expand All @@ -39,6 +39,7 @@

import static org.elasticsearch.repositories.RepositoriesMetrics.HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM;
import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_EXCEPTIONS_HISTOGRAM;
import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_EXCEPTIONS_REQUEST_RANGE_NOT_SATISFIED_TOTAL;
import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_EXCEPTIONS_TOTAL;
import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_OPERATIONS_TOTAL;
import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_REQUESTS_TOTAL;
Expand All @@ -47,8 +48,10 @@
import static org.elasticsearch.repositories.RepositoriesMetrics.METRIC_UNSUCCESSFUL_OPERATIONS_TOTAL;
import static org.elasticsearch.rest.RestStatus.INTERNAL_SERVER_ERROR;
import static org.elasticsearch.rest.RestStatus.NOT_FOUND;
import static org.elasticsearch.rest.RestStatus.REQUESTED_RANGE_NOT_SATISFIED;
import static org.elasticsearch.rest.RestStatus.TOO_MANY_REQUESTS;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.instanceOf;

@SuppressForbidden(reason = "this test uses a HttpServer to emulate an S3 endpoint")
// Need to set up a new cluster for each test because cluster settings use randomized authentication settings
Expand Down Expand Up @@ -80,22 +83,29 @@ protected Settings nodeSettings(int nodeOrdinal, Settings otherSettings) {
.build();
}

public void testMetricsWithErrors() throws IOException {
final String repository = createRepository(randomRepositoryName());

final String dataNodeName = internalCluster().getNodeNameThat(DiscoveryNode::canContainData);
final var blobStoreRepository = (BlobStoreRepository) internalCluster().getInstance(RepositoriesService.class, dataNodeName)
.repository(repository);
final BlobStore blobStore = blobStoreRepository.blobStore();
final TestTelemetryPlugin plugin = internalCluster().getInstance(PluginsService.class, dataNodeName)
private static TestTelemetryPlugin getPlugin(String dataNodeName) {
var plugin = internalCluster().getInstance(PluginsService.class, dataNodeName)
.filterPlugins(TestTelemetryPlugin.class)
.findFirst()
.orElseThrow();

plugin.resetMeter();
return plugin;
}

private static BlobContainer getBlobContainer(String dataNodeName, String repository) {
final var blobStoreRepository = (BlobStoreRepository) internalCluster().getInstance(RepositoriesService.class, dataNodeName)
.repository(repository);
return blobStoreRepository.blobStore().blobContainer(BlobPath.EMPTY.add(randomIdentifier()));
}

public void testMetricsWithErrors() throws IOException {
final String repository = createRepository(randomRepositoryName());

final String dataNodeName = internalCluster().getNodeNameThat(DiscoveryNode::canContainData);
final TestTelemetryPlugin plugin = getPlugin(dataNodeName);

final OperationPurpose purpose = randomFrom(OperationPurpose.values());
final BlobContainer blobContainer = blobStore.blobContainer(BlobPath.EMPTY.add(randomIdentifier()));
final BlobContainer blobContainer = getBlobContainer(dataNodeName, repository);
final String blobName = randomIdentifier();

// Put a blob
Expand Down Expand Up @@ -132,6 +142,9 @@ public void testMetricsWithErrors() throws IOException {
assertThat(getLongHistogramValue(plugin, METRIC_EXCEPTIONS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));
assertThat(getLongHistogramValue(plugin, METRIC_THROTTLES_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));
assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));

// Make sure we don't hit the request range not satisfied counters
assertThat(getLongCounterValue(plugin, METRIC_EXCEPTIONS_REQUEST_RANGE_NOT_SATISFIED_TOTAL, Operation.GET_OBJECT), equalTo(0L));
}

// List retry exhausted
Expand Down Expand Up @@ -166,6 +179,39 @@ public void testMetricsWithErrors() throws IOException {
assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM, Operation.DELETE_OBJECTS), equalTo(1L));
}

public void testMetricsForRequestRangeNotSatisfied() {
final String repository = createRepository(randomRepositoryName());
final String dataNodeName = internalCluster().getNodeNameThat(DiscoveryNode::canContainData);
final BlobContainer blobContainer = getBlobContainer(dataNodeName, repository);
final TestTelemetryPlugin plugin = getPlugin(dataNodeName);

final OperationPurpose purpose = randomFrom(OperationPurpose.values());
final String blobName = randomIdentifier();

for (int i = 0; i < randomIntBetween(1, 3); i++) {
final long batch = i + 1;
addErrorStatus(TOO_MANY_REQUESTS, TOO_MANY_REQUESTS, REQUESTED_RANGE_NOT_SATISFIED);
try {
blobContainer.readBlob(purpose, blobName).close();
} catch (Exception e) {
assertThat(e, instanceOf(RequestedRangeNotSatisfiedException.class));
}

assertThat(getLongCounterValue(plugin, METRIC_REQUESTS_TOTAL, Operation.GET_OBJECT), equalTo(3 * batch));
assertThat(getLongCounterValue(plugin, METRIC_OPERATIONS_TOTAL, Operation.GET_OBJECT), equalTo(batch));
assertThat(getLongCounterValue(plugin, METRIC_UNSUCCESSFUL_OPERATIONS_TOTAL, Operation.GET_OBJECT), equalTo(batch));
assertThat(getLongCounterValue(plugin, METRIC_EXCEPTIONS_TOTAL, Operation.GET_OBJECT), equalTo(batch));
assertThat(getLongHistogramValue(plugin, METRIC_EXCEPTIONS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));
assertThat(
getLongCounterValue(plugin, METRIC_EXCEPTIONS_REQUEST_RANGE_NOT_SATISFIED_TOTAL, Operation.GET_OBJECT),
equalTo(batch)
);
assertThat(getLongCounterValue(plugin, METRIC_THROTTLES_TOTAL, Operation.GET_OBJECT), equalTo(2 * batch));
assertThat(getLongHistogramValue(plugin, METRIC_THROTTLES_HISTOGRAM, Operation.GET_OBJECT), equalTo(2 * batch));
assertThat(getNumberOfMeasurements(plugin, HTTP_REQUEST_TIME_IN_MICROS_HISTOGRAM, Operation.GET_OBJECT), equalTo(batch));
}
}

private void addErrorStatus(RestStatus... statuses) {
errorStatusQueue.addAll(Arrays.asList(statuses));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@
import java.util.stream.Collectors;

import static org.elasticsearch.core.Strings.format;
import static org.elasticsearch.rest.RestStatus.REQUESTED_RANGE_NOT_SATISFIED;

class S3BlobStore implements BlobStore {

Expand Down Expand Up @@ -177,6 +178,23 @@ public final void collectMetrics(Request<?> request, Response<?> response) {
.map(List::size)
.orElse(0);

if (exceptionCount > 0) {
final List<Object> statusCodes = Objects.requireNonNullElse(
awsRequestMetrics.getProperty(AWSRequestMetrics.Field.StatusCode),
List.of()
);
// REQUESTED_RANGE_NOT_SATISFIED errors are expected errors due to RCO
// TODO Add more expected client error codes?
final long amountOfRequestRangeNotSatisfiedErrors = statusCodes.stream()
.filter(e -> (Integer) e == REQUESTED_RANGE_NOT_SATISFIED.getStatus())
.count();
if (amountOfRequestRangeNotSatisfiedErrors > 0) {
s3RepositoriesMetrics.common()
.requestRangeNotSatisfiedExceptionCounter()
.incrementBy(amountOfRequestRangeNotSatisfiedErrors, attributes);
}
}

s3RepositoriesMetrics.common().operationCounter().incrementBy(1, attributes);
if (numberOfAwsErrors == requestCount) {
s3RepositoriesMetrics.common().unsuccessfulOperationCounter().incrementBy(1, attributes);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ public record RepositoriesMetrics(
MeterRegistry meterRegistry,
LongCounter requestCounter,
LongCounter exceptionCounter,
LongCounter requestRangeNotSatisfiedExceptionCounter,
LongCounter throttleCounter,
LongCounter operationCounter,
LongCounter unsuccessfulOperationCounter,
Expand All @@ -28,6 +29,8 @@ public record RepositoriesMetrics(

public static final String METRIC_REQUESTS_TOTAL = "es.repositories.requests.total";
public static final String METRIC_EXCEPTIONS_TOTAL = "es.repositories.exceptions.total";
public static final String METRIC_EXCEPTIONS_REQUEST_RANGE_NOT_SATISFIED_TOTAL =
"es.repositories.exceptions.request_range_not_satisfied.total";
public static final String METRIC_THROTTLES_TOTAL = "es.repositories.throttles.total";
public static final String METRIC_OPERATIONS_TOTAL = "es.repositories.operations.total";
public static final String METRIC_UNSUCCESSFUL_OPERATIONS_TOTAL = "es.repositories.operations.unsuccessful.total";
Expand All @@ -40,6 +43,11 @@ public RepositoriesMetrics(MeterRegistry meterRegistry) {
meterRegistry,
meterRegistry.registerLongCounter(METRIC_REQUESTS_TOTAL, "repository request counter", "unit"),
meterRegistry.registerLongCounter(METRIC_EXCEPTIONS_TOTAL, "repository request exception counter", "unit"),
meterRegistry.registerLongCounter(
METRIC_EXCEPTIONS_REQUEST_RANGE_NOT_SATISFIED_TOTAL,
"repository request RequestedRangeNotSatisfiedException counter",
"unit"
),
meterRegistry.registerLongCounter(METRIC_THROTTLES_TOTAL, "repository request throttle counter", "unit"),
meterRegistry.registerLongCounter(METRIC_OPERATIONS_TOTAL, "repository operation counter", "unit"),
meterRegistry.registerLongCounter(METRIC_UNSUCCESSFUL_OPERATIONS_TOTAL, "repository unsuccessful operation counter", "unit"),
Expand Down

0 comments on commit 3566ee9

Please sign in to comment.