From 1d7f4824b3da9b442f407e03a06563a509fcb466 Mon Sep 17 00:00:00 2001
From: Robert Niedziela <175605712+robsunday@users.noreply.github.com>
Date: Mon, 25 Nov 2024 18:16:34 +0100
Subject: [PATCH] JMX Scraper - YAML config and integration test for HBase
 (#1538)

---
 .../target_systems/HbaseIntegrationTest.java  |  28 +-
 .../resources/target-systems/hbase.groovy     |  30 +-
 .../target_systems/HBaseIntegrationTest.java  | 439 ++++++++++++++++++
 .../target_systems/MetricAssertions.java      |  33 +-
 jmx-scraper/src/main/resources/hbase.yaml     | 348 ++++++++++++++
 5 files changed, 839 insertions(+), 39 deletions(-)
 create mode 100644 jmx-scraper/src/integrationTest/java/io/opentelemetry/contrib/jmxscraper/target_systems/HBaseIntegrationTest.java
 create mode 100644 jmx-scraper/src/main/resources/hbase.yaml

diff --git a/jmx-metrics/src/integrationTest/java/io/opentelemetry/contrib/jmxmetrics/target_systems/HbaseIntegrationTest.java b/jmx-metrics/src/integrationTest/java/io/opentelemetry/contrib/jmxmetrics/target_systems/HbaseIntegrationTest.java
index 412805385..ba52e1158 100644
--- a/jmx-metrics/src/integrationTest/java/io/opentelemetry/contrib/jmxmetrics/target_systems/HbaseIntegrationTest.java
+++ b/jmx-metrics/src/integrationTest/java/io/opentelemetry/contrib/jmxmetrics/target_systems/HbaseIntegrationTest.java
@@ -43,7 +43,7 @@ void endToEnd() {
                 metric,
                 "hbase.master.region_server.count",
                 "The number of region servers.",
-                "{servers}",
+                "{server}",
                 attrs -> attrs.contains(entry("state", "dead")),
                 attrs -> attrs.contains(entry("state", "live"))),
         metric ->
@@ -51,14 +51,14 @@ void endToEnd() {
                 metric,
                 "hbase.master.regions_in_transition.count",
                 "The number of regions that are in transition.",
-                "{regions}",
+                "{region}",
                 /* isMonotonic= */ false),
         metric ->
             assertSum(
                 metric,
                 "hbase.master.regions_in_transition.over_threshold",
                 "The number of regions that have been in transition longer than a threshold time.",
-                "{regions}",
+                "{region}",
                 /* isMonotonic= */ false),
         metric ->
             assertGauge(
@@ -71,14 +71,14 @@ void endToEnd() {
                 metric,
                 "hbase.region_server.region.count",
                 "The number of regions hosted by the region server.",
-                "{regions}",
+                "{region}",
                 attrs -> attrs.containsKey("region_server")),
         metric ->
             assertSumWithAttributes(
                 metric,
                 "hbase.region_server.disk.store_file.count",
                 "The number of store files on disk currently managed by the region server.",
-                "{files}",
+                "{file}",
                 attrs -> attrs.containsKey("region_server")),
         metric ->
             assertSumWithAttributes(
@@ -92,14 +92,14 @@ void endToEnd() {
                 metric,
                 "hbase.region_server.write_ahead_log.count",
                 "The number of write ahead logs not yet archived.",
-                "{logs}",
+                "{log}",
                 attrs -> attrs.containsKey("region_server")),
         metric ->
             assertSumWithAttributes(
                 metric,
                 "hbase.region_server.request.count",
                 "The number of requests received.",
-                "{requests}",
+                "{request}",
                 attrs -> attrs.contains(entry("state", "write")),
                 attrs -> attrs.contains(entry("state", "read"))),
         metric ->
@@ -107,7 +107,7 @@ void endToEnd() {
                 metric,
                 "hbase.region_server.queue.length",
                 "The number of RPC handlers actively servicing requests.",
-                "{handlers}",
+                "{handler}",
                 attrs -> attrs.contains(entry("state", "flush")),
                 attrs -> attrs.contains(entry("state", "compaction"))),
         metric ->
@@ -122,7 +122,7 @@ void endToEnd() {
                 metric,
                 "hbase.region_server.request.count",
                 "The number of requests received.",
-                "{requests}",
+                "{request}",
                 attrs -> attrs.contains(entry("state", "write")),
                 attrs -> attrs.contains(entry("state", "read"))),
         metric ->
@@ -347,7 +347,7 @@ void endToEnd() {
                 metric,
"hbase.region_server.operations.slow", "Number of operations that took over 1000ms to complete.", - "{operations}", + "{operation}", attrs -> attrs.contains(entry("operation", "delete")), attrs -> attrs.contains(entry("operation", "append")), attrs -> attrs.contains(entry("operation", "get")), @@ -358,21 +358,21 @@ void endToEnd() { metric, "hbase.region_server.open_connection.count", "The number of open connections at the RPC layer.", - "{connections}", + "{connection}", attrs -> attrs.containsKey("region_server")), metric -> assertSumWithAttributes( metric, "hbase.region_server.active_handler.count", "The number of RPC handlers actively servicing requests.", - "{handlers}", + "{handler}", attrs -> attrs.containsKey("region_server")), metric -> assertSumWithAttributes( metric, "hbase.region_server.queue.request.count", "The number of currently enqueued requests.", - "{requests}", + "{request}", attrs -> attrs.contains(entry("state", "replication")), attrs -> attrs.contains(entry("state", "user")), attrs -> attrs.contains(entry("state", "priority"))), @@ -381,7 +381,7 @@ void endToEnd() { metric, "hbase.region_server.authentication.count", "Number of client connection authentication failures/successes.", - "{authentication requests}", + "{authentication request}", attrs -> attrs.contains(entry("state", "successes")), attrs -> attrs.contains(entry("state", "failures"))), metric -> diff --git a/jmx-metrics/src/main/resources/target-systems/hbase.groovy b/jmx-metrics/src/main/resources/target-systems/hbase.groovy index 7c03f75ab..4f9c3b02f 100644 --- a/jmx-metrics/src/main/resources/target-systems/hbase.groovy +++ b/jmx-metrics/src/main/resources/target-systems/hbase.groovy @@ -16,16 +16,16 @@ def beanMasterServer = otel.mbeans("Hadoop:service=HBase,name=Master,sub=Server") otel.instrument(beanMasterServer, "hbase.master.region_server.count", - "The number of region servers.", "{servers}", + "The number of region servers.", "{server}", ["numDeadRegionServers":["state" : {"dead"}], "numRegionServers": ["state" : {"live"}]], otel.&longUpDownCounterCallback) def beanMasterAssignmentManager = otel.mbean("Hadoop:service=HBase,name=Master,sub=AssignmentManager") otel.instrument(beanMasterAssignmentManager, "hbase.master.regions_in_transition.count", - "The number of regions that are in transition.", "{regions}", + "The number of regions that are in transition.", "{region}", "ritCount", otel.&longUpDownCounterCallback) otel.instrument(beanMasterAssignmentManager, "hbase.master.regions_in_transition.over_threshold", - "The number of regions that have been in transition longer than a threshold time.", "{regions}", + "The number of regions that have been in transition longer than a threshold time.", "{region}", "ritCountOverThreshold", otel.&longUpDownCounterCallback) otel.instrument(beanMasterAssignmentManager, "hbase.master.regions_in_transition.oldest_age", "The age of the longest region in transition.", "ms", @@ -33,11 +33,11 @@ otel.instrument(beanMasterAssignmentManager, "hbase.master.regions_in_transition def beanRegionServerServer = otel.mbean("Hadoop:service=HBase,name=RegionServer,sub=Server") otel.instrument(beanRegionServerServer, "hbase.region_server.region.count", - "The number of regions hosted by the region server.", "{regions}", + "The number of regions hosted by the region server.", "{region}", ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], "regionCount", otel.&longUpDownCounterCallback) otel.instrument(beanRegionServerServer, 
"hbase.region_server.disk.store_file.count", - "The number of store files on disk currently managed by the region server.", "{files}", + "The number of store files on disk currently managed by the region server.", "{file}", ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], "storeFileCount", otel.&longUpDownCounterCallback) otel.instrument(beanRegionServerServer, "hbase.region_server.disk.store_file.size", @@ -45,16 +45,16 @@ otel.instrument(beanRegionServerServer, "hbase.region_server.disk.store_file.siz ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], "storeFileSize", otel.&longUpDownCounterCallback) otel.instrument(beanRegionServerServer, "hbase.region_server.write_ahead_log.count", - "The number of write ahead logs not yet archived.", "{logs}", + "The number of write ahead logs not yet archived.", "{log}", ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], "hlogFileCount", otel.&longUpDownCounterCallback) otel.instrument(beanRegionServerServer, "hbase.region_server.request.count", - "The number of requests received.", "{requests}", + "The number of requests received.", "{request}", ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], ["writeRequestCount":["state" : {"write"}], "readRequestCount": ["state" : {"read"}]], otel.&longUpDownCounterCallback) otel.instrument(beanRegionServerServer, "hbase.region_server.queue.length", - "The number of RPC handlers actively servicing requests.", "{handlers}", + "The number of RPC handlers actively servicing requests.", "{handler}", ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], ["flushQueueLength":["state" : {"flush"}], "compactionQueueLength": ["state" : {"compaction"}]], otel.&longUpDownCounterCallback) @@ -63,7 +63,7 @@ otel.instrument(beanRegionServerServer, "hbase.region_server.blocked_update.time ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], "updatesBlockedTime", otel.&longValueCallback) otel.instrument(beanRegionServerServer, "hbase.region_server.block_cache.operation.count", - "Number of block cache hits/misses.", "{operations}", + "Number of block cache hits/misses.", "{operation}", ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], ["blockCacheMissCount":["state" : {"miss"}], "blockCacheHitCount": ["state" : {"hit"}]], otel.&longValueCallback) @@ -199,7 +199,7 @@ otel.instrument(beanRegionServerServer, "hbase.region_server.operation.increment "Increment_median", otel.&longValueCallback) otel.instrument(beanRegionServerServer, "hbase.region_server.operations.slow", - "Number of operations that took over 1000ms to complete.", "{operations}", + "Number of operations that took over 1000ms to complete.", "{operation}", ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], [ "slowDeleteCount":["operation" : {"delete"}], @@ -212,15 +212,15 @@ otel.instrument(beanRegionServerServer, "hbase.region_server.operations.slow", def beanRegionServerIPC = otel.mbean("Hadoop:service=HBase,name=RegionServer,sub=IPC") otel.instrument(beanRegionServerIPC, "hbase.region_server.open_connection.count", - "The number of open connections at the RPC layer.", "{connections}", + "The number of open connections at the RPC layer.", "{connection}", ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], "numOpenConnections", otel.&longUpDownCounterCallback) otel.instrument(beanRegionServerIPC, "hbase.region_server.active_handler.count", - "The number of RPC handlers actively servicing requests.", "{handlers}", + 
"The number of RPC handlers actively servicing requests.", "{handler}", ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], "numActiveHandler", otel.&longUpDownCounterCallback) otel.instrument(beanRegionServerIPC, "hbase.region_server.queue.request.count", - "The number of currently enqueued requests.", "{requests}", + "The number of currently enqueued requests.", "{request}", ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], [ "numCallsInReplicationQueue":["state" : {"replication"}], @@ -229,7 +229,7 @@ otel.instrument(beanRegionServerIPC, "hbase.region_server.queue.request.count", ], otel.&longUpDownCounterCallback) otel.instrument(beanRegionServerIPC, "hbase.region_server.authentication.count", - "Number of client connection authentication failures/successes.", "{authentication requests}", + "Number of client connection authentication failures/successes.", "{authentication request}", ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], ["authenticationSuccesses":["state" : {"successes"}], "authenticationFailures": ["state" : {"failures"}]], otel.&longUpDownCounterCallback) @@ -246,4 +246,4 @@ otel.instrument(beanJVMMetrics, "hbase.region_server.gc.young_gen.time", otel.instrument(beanJVMMetrics, "hbase.region_server.gc.old_gen.time", "Time spent in garbage collection of the old generation.", "ms", ["region_server" : { mbean -> mbean.getProperty("tag.Hostname") }], - "GcTimeMillisConcurrentMarkSweep", otel.&longCounterCallback) \ No newline at end of file + "GcTimeMillisConcurrentMarkSweep", otel.&longCounterCallback) diff --git a/jmx-scraper/src/integrationTest/java/io/opentelemetry/contrib/jmxscraper/target_systems/HBaseIntegrationTest.java b/jmx-scraper/src/integrationTest/java/io/opentelemetry/contrib/jmxscraper/target_systems/HBaseIntegrationTest.java new file mode 100644 index 000000000..93877e279 --- /dev/null +++ b/jmx-scraper/src/integrationTest/java/io/opentelemetry/contrib/jmxscraper/target_systems/HBaseIntegrationTest.java @@ -0,0 +1,439 @@ +/* + * Copyright The OpenTelemetry Authors + * SPDX-License-Identifier: Apache-2.0 + */ + +package io.opentelemetry.contrib.jmxscraper.target_systems; + +import static io.opentelemetry.contrib.jmxscraper.target_systems.MetricAssertions.assertGauge; +import static io.opentelemetry.contrib.jmxscraper.target_systems.MetricAssertions.assertGaugeWithAttributes; +import static io.opentelemetry.contrib.jmxscraper.target_systems.MetricAssertions.assertSum; +import static io.opentelemetry.contrib.jmxscraper.target_systems.MetricAssertions.assertSumWithAttributes; +import static org.assertj.core.data.MapEntry.entry; + +import io.opentelemetry.contrib.jmxscraper.JmxScraperContainer; +import java.nio.file.Path; +import java.time.Duration; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.wait.strategy.Wait; + +public class HBaseIntegrationTest extends TargetSystemIntegrationTest { + private static final int DEFAULT_MASTER_SERVICE_PORT = 16000; + + @Override + protected GenericContainer createTargetContainer(int jmxPort) { + return new GenericContainer<>("dajobe/hbase") + .withEnv("HBASE_MASTER_OPTS", genericJmxJvmArguments(jmxPort)) + .withStartupTimeout(Duration.ofMinutes(2)) + .withExposedPorts(jmxPort, DEFAULT_MASTER_SERVICE_PORT) + .waitingFor(Wait.forListeningPorts(jmxPort, DEFAULT_MASTER_SERVICE_PORT)); + } + + @Override + protected JmxScraperContainer customizeScraperContainer( + JmxScraperContainer scraper, GenericContainer target, Path tempDir) { + 
return scraper.withTargetSystem("hbase"); + } + + @Override + protected void verifyMetrics() { + waitAndAssertMetrics( + metric -> + assertSumWithAttributes( + metric, + "hbase.master.region_server.count", + "The number of region servers.", + "{server}", + /* isMonotonic= */ false, + attrs -> attrs.contains(entry("state", "dead")), + attrs -> attrs.contains(entry("state", "live"))), + metric -> + assertSum( + metric, + "hbase.master.regions_in_transition.count", + "The number of regions that are in transition.", + "{region}", + /* isMonotonic= */ false), + metric -> + assertSum( + metric, + "hbase.master.regions_in_transition.over_threshold", + "The number of regions that have been in transition longer than a threshold time.", + "{region}", + /* isMonotonic= */ false), + metric -> + assertGauge( + metric, + "hbase.master.regions_in_transition.oldest_age", + "The age of the longest region in transition.", + "ms"), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.region.count", + "The number of regions hosted by the region server.", + "{region}", + /* isMonotonic= */ false, + attrs -> attrs.containsKey("region_server")), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.disk.store_file.count", + "The number of store files on disk currently managed by the region server.", + "{file}", + /* isMonotonic= */ false, + attrs -> attrs.containsKey("region_server")), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.disk.store_file.size", + "Aggregate size of the store files on disk.", + "By", + /* isMonotonic= */ false, + attrs -> attrs.containsKey("region_server")), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.write_ahead_log.count", + "The number of write ahead logs not yet archived.", + "{log}", + /* isMonotonic= */ false, + attrs -> attrs.containsKey("region_server")), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.request.count", + "The number of requests received.", + "{request}", + /* isMonotonic= */ false, + attrs -> { + attrs.contains(entry("state", "write")); + attrs.containsKey("region_server"); + }, + attrs -> { + attrs.contains(entry("state", "read")); + attrs.containsKey("region_server"); + }), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.queue.length", + "The number of RPC handlers actively servicing requests.", + "{handler}", + /* isMonotonic= */ false, + attrs -> { + attrs.contains(entry("state", "flush")); + attrs.containsKey("region_server"); + }, + attrs -> { + attrs.contains(entry("state", "compaction")); + attrs.containsKey("region_server"); + }), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.blocked_update.time", + "Amount of time updates have been blocked so the memstore can be flushed.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.block_cache.operation.count", + "Number of block cache hits/misses.", + "{operation}", + attrs -> { + attrs.contains(entry("state", "miss")); + attrs.containsKey("region_server"); + }, + attrs -> { + attrs.contains(entry("state", "hit")); + attrs.containsKey("region_server"); + }), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.files.local", + "Percent of store file data that can be read from the local.", + "%", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.append.latency.p99", + 
"Append operation 99th Percentile latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.append.latency.max", + "Append operation max latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.append.latency.min", + "Append operation minimum latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.append.latency.mean", + "Append operation mean latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.append.latency.median", + "Append operation median latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.delete.latency.p99", + "Delete operation 99th Percentile latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.delete.latency.max", + "Delete operation max latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.delete.latency.min", + "Delete operation minimum latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.delete.latency.mean", + "Delete operation mean latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.delete.latency.median", + "Delete operation median latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.put.latency.p99", + "Put operation 99th Percentile latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.put.latency.max", + "Put operation max latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.put.latency.min", + "Put operation minimum latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.put.latency.mean", + "Put operation mean latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.put.latency.median", + "Put operation median latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.get.latency.p99", + "Get operation 99th Percentile latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.get.latency.max", + "Get operation max latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.get.latency.min", + "Get operation minimum latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + 
"hbase.region_server.operation.get.latency.mean", + "Get operation mean latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.get.latency.median", + "Get operation median latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.replay.latency.p99", + "Replay operation 99th Percentile latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.replay.latency.max", + "Replay operation max latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.replay.latency.min", + "Replay operation minimum latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.replay.latency.mean", + "Replay operation mean latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.replay.latency.median", + "Replay operation median latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.increment.latency.p99", + "Increment operation 99th Percentile latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.increment.latency.max", + "Increment operation max latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.increment.latency.min", + "Increment operation minimum latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.increment.latency.mean", + "Increment operation mean latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertGaugeWithAttributes( + metric, + "hbase.region_server.operation.increment.latency.median", + "Increment operation median latency.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.operations.slow", + "Number of operations that took over 1000ms to complete.", + "{operation}", + /* isMonotonic= */ false, + attrs -> attrs.contains(entry("operation", "delete")), + attrs -> attrs.contains(entry("operation", "append")), + attrs -> attrs.contains(entry("operation", "get")), + attrs -> attrs.contains(entry("operation", "put")), + attrs -> attrs.contains(entry("operation", "increment"))), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.open_connection.count", + "The number of open connections at the RPC layer.", + "{connection}", + /* isMonotonic= */ false, + attrs -> attrs.containsKey("region_server")), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.active_handler.count", + "The number of RPC handlers actively servicing requests.", + "{handler}", + /* isMonotonic= */ false, + attrs -> attrs.containsKey("region_server")), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.queue.request.count", + "The number of currently enqueued requests.", + "{request}", + /* isMonotonic= */ false, + attrs -> 
attrs.contains(entry("state", "replication")), + attrs -> attrs.contains(entry("state", "user")), + attrs -> attrs.contains(entry("state", "priority"))), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.authentication.count", + "Number of client connection authentication failures/successes.", + "{authentication request}", + /* isMonotonic= */ false, + attrs -> attrs.contains(entry("state", "successes")), + attrs -> attrs.contains(entry("state", "failures"))), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.gc.time", + "Time spent in garbage collection.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.gc.young_gen.time", + "Time spent in garbage collection of the young generation.", + "ms", + attrs -> attrs.containsKey("region_server")), + metric -> + assertSumWithAttributes( + metric, + "hbase.region_server.gc.old_gen.time", + "Time spent in garbage collection of the old generation.", + "ms", + attrs -> attrs.containsKey("region_server"))); + } +} diff --git a/jmx-scraper/src/integrationTest/java/io/opentelemetry/contrib/jmxscraper/target_systems/MetricAssertions.java b/jmx-scraper/src/integrationTest/java/io/opentelemetry/contrib/jmxscraper/target_systems/MetricAssertions.java index 881ccbf07..79e35fe45 100644 --- a/jmx-scraper/src/integrationTest/java/io/opentelemetry/contrib/jmxscraper/target_systems/MetricAssertions.java +++ b/jmx-scraper/src/integrationTest/java/io/opentelemetry/contrib/jmxscraper/target_systems/MetricAssertions.java @@ -27,7 +27,7 @@ static void assertGauge(Metric metric, String name, String description, String u assertThat(metric.getName()).isEqualTo(name); assertThat(metric.getDescription()).isEqualTo(description); assertThat(metric.getUnit()).isEqualTo(unit); - assertThat(metric.hasGauge()).isTrue(); + assertMetricWithGauge(metric); assertThat(metric.getGauge().getDataPointsList()) .satisfiesExactly(point -> assertThat(point.getAttributesList()).isEmpty()); } @@ -41,10 +41,9 @@ static void assertSum( assertThat(metric.getName()).isEqualTo(name); assertThat(metric.getDescription()).isEqualTo(description); assertThat(metric.getUnit()).isEqualTo(unit); - assertThat(metric.hasSum()).isTrue(); + assertMetricWithSum(metric, isMonotonic); assertThat(metric.getSum().getDataPointsList()) .satisfiesExactly(point -> assertThat(point.getAttributesList()).isEmpty()); - assertThat(metric.getSum().getIsMonotonic()).isEqualTo(isMonotonic); } static void assertTypedGauge( @@ -52,7 +51,7 @@ static void assertTypedGauge( assertThat(metric.getName()).isEqualTo(name); assertThat(metric.getDescription()).isEqualTo(description); assertThat(metric.getUnit()).isEqualTo(unit); - assertThat(metric.hasGauge()).isTrue(); + assertMetricWithGauge(metric); assertTypedPoints(metric.getGauge().getDataPointsList(), types); } @@ -61,7 +60,7 @@ static void assertTypedSum( assertThat(metric.getName()).isEqualTo(name); assertThat(metric.getDescription()).isEqualTo(description); assertThat(metric.getUnit()).isEqualTo(unit); - assertThat(metric.hasSum()).isTrue(); + assertMetricWithSum(metric); assertTypedPoints(metric.getSum().getDataPointsList(), types); } @@ -87,8 +86,7 @@ static void assertSumWithAttributes( assertThat(metric.getName()).isEqualTo(name); assertThat(metric.getDescription()).isEqualTo(description); assertThat(metric.getUnit()).isEqualTo(unit); - assertThat(metric.hasSum()).describedAs("sum expected").isTrue(); - 
-    assertThat(metric.getSum().getIsMonotonic()).isEqualTo(isMonotonic);
+    assertMetricWithSum(metric, isMonotonic);
     assertAttributedPoints(metric.getSum().getDataPointsList(), attributeGroupAssertions);
   }
 
@@ -103,8 +101,7 @@ static void assertSumWithAttributesMultiplePoints(
     assertThat(metric.getName()).isEqualTo(name);
     assertThat(metric.getDescription()).isEqualTo(description);
     assertThat(metric.getUnit()).isEqualTo(unit);
-    assertThat(metric.hasSum()).isTrue();
-    assertThat(metric.getSum().getIsMonotonic()).isEqualTo(isMonotonic);
+    assertMetricWithSum(metric, isMonotonic);
     assertAttributedMultiplePoints(metric.getSum().getDataPointsList(), attributeGroupAssertions);
   }
 
@@ -118,10 +115,25 @@ static void assertGaugeWithAttributes(
     assertThat(metric.getName()).isEqualTo(name);
     assertThat(metric.getDescription()).isEqualTo(description);
     assertThat(metric.getUnit()).isEqualTo(unit);
-    assertThat(metric.hasGauge()).isTrue();
+    assertMetricWithGauge(metric);
     assertAttributedPoints(metric.getGauge().getDataPointsList(), attributeGroupAssertions);
   }
 
+  private static void assertMetricWithGauge(Metric metric) {
+    assertThat(metric.hasGauge()).withFailMessage("Metric with gauge expected").isTrue();
+  }
+
+  private static void assertMetricWithSum(Metric metric) {
+    assertThat(metric.hasSum()).withFailMessage("Metric with sum expected").isTrue();
+  }
+
+  private static void assertMetricWithSum(Metric metric, boolean isMonotonic) {
+    assertMetricWithSum(metric);
+    assertThat(metric.getSum().getIsMonotonic())
+        .withFailMessage("Metric should " + (isMonotonic ? "" : "not ") + "be monotonic")
+        .isEqualTo(isMonotonic);
+  }
+
   @SuppressWarnings("unchecked")
   private static void assertTypedPoints(List<NumberDataPoint> points, List<String> types) {
     Consumer<MapAssert<Object, Object>>[] assertions =
@@ -145,6 +157,7 @@ private static void assertAttributedPoints(
             .toArray(Consumer[]::new);
 
     assertThat(points)
+        .withFailMessage("Invalid metric attributes. Actual: " + points)
         .extracting(
             numberDataPoint ->
                 numberDataPoint.getAttributesList().stream()
diff --git a/jmx-scraper/src/main/resources/hbase.yaml b/jmx-scraper/src/main/resources/hbase.yaml
new file mode 100644
index 000000000..90f41d97b
--- /dev/null
+++ b/jmx-scraper/src/main/resources/hbase.yaml
@@ -0,0 +1,348 @@
+---
+
+rules:
+
+  - bean: Hadoop:service=HBase,name=Master,sub=Server
+    prefix: hbase.master.
+    unit: "{server}"
+    type: updowncounter
+    mapping:
+      # hbase.master.region_server.count
+      numDeadRegionServers:
+        metric: &metric region_server.count
+        desc: &desc The number of region servers.
+        metricAttribute:
+          state: const(dead)
+      numRegionServers:
+        metric: *metric
+        desc: *desc
+        metricAttribute:
+          state: const(live)
+
+  - bean: Hadoop:service=HBase,name=Master,sub=AssignmentManager
+    prefix: hbase.master.regions_in_transition.
+    unit: "{region}"
+    type: updowncounter
+    mapping:
+      ritCount:
+        metric: count
+        desc: The number of regions that are in transition.
+
+      ritCountOverThreshold:
+        metric: over_threshold
+        desc: The number of regions that have been in transition longer than a threshold time.
+
+      ritOldestAge:
+        metric: oldest_age
+        unit: ms
+        type: gauge
+        desc: The age of the longest region in transition.
+
+  - bean: Hadoop:service=HBase,name=RegionServer,sub=Server
+    prefix: hbase.region_server.
+    type: updowncounter
+    metricAttribute:
+      region_server: &hostname beanattr(tag\.Hostname)
+    mapping:
+      regionCount:
+        metric: region.count
+        unit: "{region}"
+        desc: The number of regions hosted by the region server.
+
+      storeFileCount:
+        metric: disk.store_file.count
+        unit: "{file}"
+        desc: The number of store files on disk currently managed by the region server.
+
+      storeFileSize:
+        metric: disk.store_file.size
+        unit: By
+        desc: Aggregate size of the store files on disk.
+
+      hlogFileCount:
+        metric: write_ahead_log.count
+        unit: "{log}"
+        desc: The number of write ahead logs not yet archived.
+
+      percentFilesLocal:
+        metric: files.local
+        type: gauge
+        unit: "%"
+        desc: Percent of store file data that can be read from the local.
+
+      updatesBlockedTime:
+        metric: blocked_update.time
+        type: gauge
+        unit: ms
+        desc: Amount of time updates have been blocked so the memstore can be flushed.
+
+      # hbase.region_server.request.count
+      writeRequestCount:
+        metric: &metric request.count
+        unit: &unit "{request}"
+        desc: &desc The number of requests received.
+        metricAttribute:
+          state: const(write)
+          region_server: *hostname
+      readRequestCount:
+        metric: *metric
+        unit: *unit
+        desc: *desc
+        metricAttribute:
+          state: const(read)
+          region_server: *hostname
+
+      # hbase.region_server.queue.length
+      flushQueueLength:
+        metric: &metric queue.length
+        unit: &unit "{handler}"
+        desc: &desc The number of RPC handlers actively servicing requests.
+        metricAttribute:
+          state: const(flush)
+          region_server: *hostname
+      compactionQueueLength:
+        metric: *metric
+        unit: *unit
+        desc: *desc
+        metricAttribute:
+          state: const(compaction)
+          region_server: *hostname
+
+      # hbase.region_server.block_cache.operation.count
+      blockCacheMissCount:
+        metric: &metric block_cache.operation.count
+        type: &type gauge
+        unit: &unit "{operation}"
+        desc: &desc Number of block cache hits/misses.
+        metricAttribute:
+          state: const(miss)
+          region_server: *hostname
+      blockCacheHitCount:
+        metric: *metric
+        type: *type
+        unit: *unit
+        desc: *desc
+        metricAttribute:
+          state: const(hit)
+          region_server: *hostname
+
+      # hbase.region_server.operations.slow
+      slowDeleteCount:
+        metric: &metric operations.slow
+        unit: &unit "{operation}"
+        desc: &desc Number of operations that took over 1000ms to complete.
+        metricAttribute:
+          operation: const(delete)
+          region_server: *hostname
+      slowAppendCount:
+        metric: *metric
+        unit: *unit
+        desc: *desc
+        metricAttribute:
+          operation: const(append)
+          region_server: *hostname
+      slowGetCount:
+        metric: *metric
+        unit: *unit
+        desc: *desc
+        metricAttribute:
+          operation: const(get)
+          region_server: *hostname
+      slowPutCount:
+        metric: *metric
+        unit: *unit
+        desc: *desc
+        metricAttribute:
+          operation: const(put)
+          region_server: *hostname
+      slowIncrementCount:
+        metric: *metric
+        unit: *unit
+        desc: *desc
+        metricAttribute:
+          operation: const(increment)
+          region_server: *hostname
+
+  # RegionServer statistical metrics
+  - bean: Hadoop:service=HBase,name=RegionServer,sub=Server
+    prefix: hbase.region_server.
+    type: gauge
+    unit: ms
+    metricAttribute:
+      region_server: *hostname
+    mapping:
+      # Statistics for 'append' operation
+      Append_99th_percentile:
+        metric: operation.append.latency.p99
+        desc: Append operation 99th Percentile latency.
+      Append_max:
+        metric: operation.append.latency.max
+        desc: Append operation max latency.
+      Append_min:
+        metric: operation.append.latency.min
+        desc: Append operation minimum latency.
+      Append_mean:
+        metric: operation.append.latency.mean
+        desc: Append operation mean latency.
+      Append_median:
+        metric: operation.append.latency.median
+        desc: Append operation median latency.
+
+      # Statistics for 'delete' operation
+      Delete_99th_percentile:
+        metric: operation.delete.latency.p99
+        desc: Delete operation 99th Percentile latency.
+      Delete_max:
+        metric: operation.delete.latency.max
+        desc: Delete operation max latency.
+      Delete_min:
+        metric: operation.delete.latency.min
+        desc: Delete operation minimum latency.
+      Delete_mean:
+        metric: operation.delete.latency.mean
+        desc: Delete operation mean latency.
+      Delete_median:
+        metric: operation.delete.latency.median
+        desc: Delete operation median latency.
+
+      # Statistics for 'put' operation
+      Put_99th_percentile:
+        metric: operation.put.latency.p99
+        desc: Put operation 99th Percentile latency.
+      Put_max:
+        metric: operation.put.latency.max
+        desc: Put operation max latency.
+      Put_min:
+        metric: operation.put.latency.min
+        desc: Put operation minimum latency.
+      Put_mean:
+        metric: operation.put.latency.mean
+        desc: Put operation mean latency.
+      Put_median:
+        metric: operation.put.latency.median
+        desc: Put operation median latency.
+
+      # Statistics for 'get' operation
+      Get_99th_percentile:
+        metric: operation.get.latency.p99
+        desc: Get operation 99th Percentile latency.
+      Get_max:
+        metric: operation.get.latency.max
+        desc: Get operation max latency.
+      Get_min:
+        metric: operation.get.latency.min
+        desc: Get operation minimum latency.
+      Get_mean:
+        metric: operation.get.latency.mean
+        desc: Get operation mean latency.
+      Get_median:
+        metric: operation.get.latency.median
+        desc: Get operation median latency.
+
+      # Statistics for 'replay' operation
+      Replay_99th_percentile:
+        metric: operation.replay.latency.p99
+        desc: Replay operation 99th Percentile latency.
+      Replay_max:
+        metric: operation.replay.latency.max
+        desc: Replay operation max latency.
+      Replay_min:
+        metric: operation.replay.latency.min
+        desc: Replay operation minimum latency.
+      Replay_mean:
+        metric: operation.replay.latency.mean
+        desc: Replay operation mean latency.
+      Replay_median:
+        metric: operation.replay.latency.median
+        desc: Replay operation median latency.
+
+      # Statistics for 'increment' operation
+      Increment_99th_percentile:
+        metric: operation.increment.latency.p99
+        desc: Increment operation 99th Percentile latency.
+      Increment_max:
+        metric: operation.increment.latency.max
+        desc: Increment operation max latency.
+      Increment_min:
+        metric: operation.increment.latency.min
+        desc: Increment operation minimum latency.
+      Increment_mean:
+        metric: operation.increment.latency.mean
+        desc: Increment operation mean latency.
+      Increment_median:
+        metric: operation.increment.latency.median
+        desc: Increment operation median latency.
+
+  - bean: Hadoop:service=HBase,name=RegionServer,sub=IPC
+    prefix: hbase.region_server.
+    type: updowncounter
+    metricAttribute:
+      region_server: *hostname
+    mapping:
+      numOpenConnections:
+        metric: open_connection.count
+        unit: "{connection}"
+        desc: The number of open connections at the RPC layer.
+
+      numActiveHandler:
+        metric: active_handler.count
+        unit: "{handler}"
+        desc: The number of RPC handlers actively servicing requests.
+
+      # hbase.region_server.queue.request.count
+      numCallsInReplicationQueue:
+        metric: &metric queue.request.count
+        unit: &unit "{request}"
+        desc: &desc The number of currently enqueued requests.
+        metricAttribute:
+          state: const(replication)
+          region_server: *hostname
+      numCallsInGeneralQueue:
+        metric: *metric
+        unit: *unit
+        desc: *desc
+        metricAttribute:
+          state: const(user)
+          region_server: *hostname
+      numCallsInPriorityQueue:
+        metric: *metric
+        unit: *unit
+        desc: *desc
+        metricAttribute:
+          state: const(priority)
+          region_server: *hostname
+
+      # hbase.region_server.authentication.count
+      authenticationSuccesses:
+        metric: &metric authentication.count
+        unit: &unit "{authentication request}"
+        desc: &desc Number of client connection authentication failures/successes.
+        metricAttribute:
+          state: const(successes)
+          region_server: *hostname
+      authenticationFailures:
+        metric: *metric
+        unit: *unit
+        desc: *desc
+        metricAttribute:
+          state: const(failures)
+          region_server: *hostname
+
+  - bean: Hadoop:service=HBase,name=JvmMetrics
+    prefix: hbase.region_server.gc.
+    unit: ms
+    type: counter
+    metricAttribute:
+      region_server: *hostname
+    mapping:
+      GcTimeMillis:
+        metric: time
+        desc: Time spent in garbage collection.
+
+      GcTimeMillisParNew:
+        metric: young_gen.time
+        desc: Time spent in garbage collection of the young generation.
+
+      GcTimeMillisConcurrentMarkSweep:
+        metric: old_gen.time
+        desc: Time spent in garbage collection of the old generation.
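
The hbase.yaml rules above resolve every metric from a Hadoop JMX MBean attribute: const(...) supplies fixed metric attribute values (for example state=live/dead), and beanattr(tag\.Hostname) reads the region server hostname used for the region_server attribute. The same bean and attribute names appear verbatim in the groovy-based jmx-metrics config, which is what keeps the two integration tests asserting identical metrics. The following is a minimal, illustrative sketch for checking those beans directly over JMX before running the integration test; the class name and the localhost:9999 endpoint are assumptions, not part of the patch (any HBase master started with the standard com.sun.management.jmxremote flags will do), and only the standard javax.management API is used:

    import java.util.Arrays;
    import javax.management.MBeanServerConnection;
    import javax.management.ObjectName;
    import javax.management.remote.JMXConnector;
    import javax.management.remote.JMXConnectorFactory;
    import javax.management.remote.JMXServiceURL;

    public class HBaseJmxSanityCheck {
      public static void main(String[] args) throws Exception {
        // Assumed endpoint; the integration test exposes an equivalent port via
        // genericJmxJvmArguments(jmxPort).
        JMXServiceURL url =
            new JMXServiceURL("service:jmx:rmi:///jndi/rmi://localhost:9999/jmxrmi");
        try (JMXConnector connector = JMXConnectorFactory.connect(url)) {
          MBeanServerConnection conn = connector.getMBeanServerConnection();

          // Source of hbase.master.region_server.count (split by state=live/dead).
          ObjectName master = new ObjectName("Hadoop:service=HBase,name=Master,sub=Server");
          System.out.println("live: " + conn.getAttribute(master, "numRegionServers"));
          System.out.println("dead: " + conn.getAttribute(master, "numDeadRegionServers"));

          // Source of the region_server metric attribute: beanattr(tag\.Hostname).
          ObjectName rs = new ObjectName("Hadoop:service=HBase,name=RegionServer,sub=Server");
          System.out.println("region_server: " + conn.getAttribute(rs, "tag.Hostname"));

          // A few of the updowncounter sources mapped from the same bean.
          System.out.println(Arrays.toString(
              conn.getAttributes(rs, new String[] {"regionCount", "storeFileCount", "hlogFileCount"})
                  .toArray()));
        }
      }
    }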