From b4dfc09dcf528f61da22bea01e66dc3c450ec0b6 Mon Sep 17 00:00:00 2001 From: onebox-li Date: Thu, 28 Sep 2023 10:18:37 +0800 Subject: [PATCH] [CELEBORN-1007] Improve JVM metrics naming and add ThreadStates metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### What changes were proposed in this pull request? Since we use Codahale metrics to expose JVM metrics, the names without a prefix are unclear, and it's not easy to build a Grafana template for these metrics because the collector name or pool name is embedded in the metric name rather than exposed as a label. So this change adds JVM metric prefixes, removes the pool info from the name, and exposes the pool name as a label where needed. It also adds ThreadStates metrics. ### Why are the changes needed? To make JVM metrics easier to understand and easier to build templates for. ### Does this PR introduce _any_ user-facing change? Yes, JVM metric naming is changed, and thread state metrics are additionally exposed. Examples of the changes: For GarbageCollectorMetricSet, G1-Old-Generation.time -> jvm.gc.time{name="G1-Old-Generation"} For MemoryUsageGaugeSet, total.init -> jvm.memory.total.init ; pools.Metaspace.usage -> jvm.memory.pools.usage{name="Metaspace"} For BufferPoolMetricSet, direct.count -> jvm.direct.count For ThreadStatesGaugeSet, add jvm.thread.count. 
For G1, the jvm metrics exposed now: metrics_jvm_gc_time_Value{name="G1-Old-Generation",role="Worker"} 0 1695731141588 metrics_jvm_gc_count_Value{name="G1-Young-Generation",role="Worker"} 2 1695731141588 metrics_jvm_gc_time_Value{name="G1-Young-Generation",role="Worker"} 74 1695731141588 metrics_jvm_gc_count_Value{name="G1-Old-Generation",role="Worker"} 0 1695731141588 metrics_jvm_heap_committed_Value{role="Worker"} 2109734912 1695731141588 metrics_jvm_non_heap_used_Value{role="Worker"} 47700056 1695731141588 metrics_jvm_heap_used_Value{role="Worker"} 82801184 1695731141588 metrics_jvm_total_committed_Value{role="Worker"} 2160263168 1695731141588 metrics_jvm_total_init_Value{role="Worker"} 2112290816 1695731141588 metrics_jvm_non_heap_max_Value{role="Worker"} -1 1695731141588 metrics_jvm_heap_usage_Value{role="Worker"} 0.009639326483011246 1695731141588 metrics_jvm_total_used_Value{role="Worker"} 130502480 1695731141589 metrics_jvm_heap_init_Value{role="Worker"} 2109734912 1695731141589 metrics_jvm_non_heap_committed_Value{role="Worker"} 50528256 1695731141589 metrics_jvm_non_heap_init_Value{role="Worker"} 2555904 1695731141589 metrics_jvm_non_heap_usage_Value{role="Worker"} -4.7701296E7 1695731141589 metrics_jvm_heap_max_Value{role="Worker"} 8589934592 1695731141589 metrics_jvm_total_max_Value{role="Worker"} 8589934591 1695731141589 metrics_jvm_memory_pool_used_Value{name="Code-Cache",role="Worker"} 10314368 1695731141588 metrics_jvm_memory_pool_committed_Value{name="Code-Cache",role="Worker"} 10944512 1695731141588 metrics_jvm_memory_pool_init_Value{name="G1-Eden-Space",role="Worker"} 111149056 1695731141588 metrics_jvm_memory_pool_max_Value{name="G1-Old-Gen",role="Worker"} 8589934592 1695731141588 metrics_jvm_memory_pool_used_after_gc_Value{name="G1-Survivor-Space",role="Worker"} 14680064 1695731141588 metrics_jvm_memory_pool_used_Value{name="Compressed-Class-Space",role="Worker"} 4440192 1695731141588 
metrics_jvm_memory_pool_usage_Value{name="Metaspace",role="Worker"} 0.9449504192610433 1695731141588 metrics_jvm_memory_pool_max_Value{name="Metaspace",role="Worker"} -1 1695731141588 metrics_jvm_memory_pool_init_Value{name="G1-Survivor-Space",role="Worker"} 0 1695731141588 metrics_jvm_memory_pool_committed_Value{name="G1-Old-Gen",role="Worker"} 1998585856 1695731141588 metrics_jvm_memory_pool_committed_Value{name="G1-Survivor-Space",role="Worker"} 14680064 1695731141588 metrics_jvm_memory_pool_committed_Value{name="G1-Eden-Space",role="Worker"} 96468992 1695731141588 metrics_jvm_memory_pool_max_Value{name="G1-Survivor-Space",role="Worker"} -1 1695731141588 metrics_jvm_memory_pool_usage_Value{name="Compressed-Class-Space",role="Worker"} 0.004135251045227051 1695731141588 metrics_jvm_memory_pool_usage_Value{name="G1-Survivor-Space",role="Worker"} 1.0 1695731141588 metrics_jvm_memory_pool_max_Value{name="Code-Cache",role="Worker"} 251658240 1695731141588 metrics_jvm_memory_pool_init_Value{name="Compressed-Class-Space",role="Worker"} 0 1695731141589 metrics_jvm_memory_pool_usage_Value{name="G1-Eden-Space",role="Worker"} 0.34782608695652173 1695731141589 metrics_jvm_memory_pool_init_Value{name="Metaspace",role="Worker"} 0 1695731141589 metrics_jvm_memory_pool_max_Value{name="G1-Eden-Space",role="Worker"} -1 1695731141589 metrics_jvm_memory_pool_usage_Value{name="Code-Cache",role="Worker"} 0.04098917643229167 1695731141589 metrics_jvm_memory_pool_used_after_gc_Value{name="G1-Eden-Space",role="Worker"} 0 1695731141589 metrics_jvm_memory_pool_init_Value{name="Code-Cache",role="Worker"} 2555904 1695731141589 metrics_jvm_memory_pool_used_Value{name="G1-Survivor-Space",role="Worker"} 14680064 1695731141589 metrics_jvm_memory_pool_committed_Value{name="Compressed-Class-Space",role="Worker"} 4718592 1695731141589 metrics_jvm_memory_pool_used_Value{name="G1-Eden-Space",role="Worker"} 33554432 1695731141589 metrics_jvm_memory_pool_used_Value{name="G1-Old-Gen",role="Worker"} 
34566688 1695731141589 metrics_jvm_memory_pool_usage_Value{name="G1-Old-Gen",role="Worker"} 0.004024092108011246 1695731141589 metrics_jvm_memory_pool_used_after_gc_Value{name="G1-Old-Gen",role="Worker"} 0 1695731141589 metrics_jvm_memory_pool_committed_Value{name="Metaspace",role="Worker"} 34865152 1695731141589 metrics_jvm_memory_pool_init_Value{name="G1-Old-Gen",role="Worker"} 1998585856 1695731141589 metrics_jvm_memory_pool_used_Value{name="Metaspace",role="Worker"} 32945840 1695731141589 metrics_jvm_memory_pool_max_Value{name="Compressed-Class-Space",role="Worker"} 1073741824 1695731141589 metrics_jvm_direct_count_Value{role="Worker"} 8 1695731141589 metrics_jvm_direct_capacity_Value{role="Worker"} 1036 1695731141589 metrics_jvm_direct_used_Value{role="Worker"} 1037 1695731141589 metrics_jvm_mapped_used_Value{role="Worker"} 0 1695731141589 metrics_jvm_mapped_capacity_Value{role="Worker"} 0 1695731141589 metrics_jvm_mapped_count_Value{role="Worker"} 0 1695731141589 metrics_jvm_thread_timed_waiting_count_Value{role="Worker"} 23 1695731141589 metrics_jvm_thread_deadlock_count_Value{role="Worker"} 0 1695731141589 metrics_jvm_thread_count_Value{role="Worker"} 78 1695731141589 metrics_jvm_thread_waiting_count_Value{role="Worker"} 45 1695731141589 metrics_jvm_thread_daemon_count_Value{role="Worker"} 75 1695731141589 metrics_jvm_thread_new_count_Value{role="Worker"} 0 1695731141589 metrics_jvm_thread_blocked_count_Value{role="Worker"} 0 1695731141590 metrics_jvm_thread_deadlocks_Value{role="Worker"} [] 1695731141590 metrics_jvm_thread_runnable_count_Value{role="Worker"} 10 1695731141590 metrics_jvm_thread_terminated_count_Value{role="Worker"} 0 1695731141590 ### How was this patch tested? UT and cluster test with g1, PS-Scavenge/PS-MarkSweep and ParNew/CMS Closes #1939 from onebox-li/improve-jvm-metrics. 
Authored-by: onebox-li Signed-off-by: zky.zhoukeyong --- .../common/metrics/source/JVMSource.scala | 82 ++++++++++++++++--- .../metrics/source/JVMSourceSuite.scala | 55 +++++++++++++ 2 files changed, 126 insertions(+), 11 deletions(-) create mode 100644 common/src/test/scala/org/apache/celeborn/common/metrics/source/JVMSourceSuite.scala diff --git a/common/src/main/scala/org/apache/celeborn/common/metrics/source/JVMSource.scala b/common/src/main/scala/org/apache/celeborn/common/metrics/source/JVMSource.scala index 374024dd248..a8792486851 100644 --- a/common/src/main/scala/org/apache/celeborn/common/metrics/source/JVMSource.scala +++ b/common/src/main/scala/org/apache/celeborn/common/metrics/source/JVMSource.scala @@ -20,26 +20,86 @@ package org.apache.celeborn.common.metrics.source import java.lang.management.ManagementFactory import scala.collection.JavaConverters._ +import scala.collection.mutable -import com.codahale.metrics.Gauge -import com.codahale.metrics.jvm.{BufferPoolMetricSet, GarbageCollectorMetricSet, MemoryUsageGaugeSet} +import com.codahale.metrics.{Gauge, MetricRegistry} +import com.codahale.metrics.jvm.{BufferPoolMetricSet, GarbageCollectorMetricSet, MemoryUsageGaugeSet, ThreadStatesGaugeSet} import org.apache.celeborn.common.CelebornConf class JVMSource(conf: CelebornConf, role: String) extends AbstractSource(conf, role) { override val sourceName = "JVM" - // all of metrics of GCMetricSet and BufferPoolMetricSet are Gauge - Seq( - new GarbageCollectorMetricSet(), - new MemoryUsageGaugeSet(), - new BufferPoolMetricSet(ManagementFactory.getPlatformMBeanServer)) - .map { x => - x.getMetrics.asScala.map { - case (name: String, metric: Gauge[_]) => addGauge(name, metric) - case (name, metric) => new IllegalArgumentException(s"Unknown metric type: $name: $metric") + import JVMSource._ + + private val gcNames = ManagementFactory.getGarbageCollectorMXBeans.asScala.map(bean => + WHITESPACE.matcher(bean.getName).replaceAll("-")) + private val poolNames = 
ManagementFactory.getMemoryPoolMXBeans.asScala.map(bean => + WHITESPACE.matcher(bean.getName).replaceAll("-")) + + /** + * Add jvm metric prefix, remove pool info from name and obtain the pool name as labels if needed + * @param metricName metric name from MetricSet + * @param targets keywords need to be replaced + * @param prefix prefix for new metric name + * @param replacement replacement for pool name + * @return new metric without target, labels if exists + */ + def handleJVMMetricName( + metricName: String, + targets: mutable.Buffer[String], + prefix: String, + replacement: String): (String, Map[String, String]) = { + for (target <- targets) { + if (metricName.contains(target)) { + val labels = Map("name" -> target) + var replaceTarget = target + if (replacement.isEmpty) { + replaceTarget = target + "." + } + return (MetricRegistry.name(prefix, metricName.replace(replaceTarget, replacement)), labels) } } + (MetricRegistry.name(prefix, metricName), Map.empty[String, String]) + } + + // all metrics in MetricSet are gauges + Seq(new GarbageCollectorMetricSet()).map(_.getMetrics.asScala.map { + case (name: String, metric: Gauge[_]) => + val newMetrics = handleJVMMetricName(name, gcNames, JVM_METRIC_PREFIX, "gc") + addGauge(newMetrics._1, newMetrics._2, metric) + case (name, metric) => new IllegalArgumentException(s"Unknown metric type: $name: $metric") + }) + + Seq(new MemoryUsageGaugeSet()).map(_.getMetrics.asScala.map { + case (name: String, metric: Gauge[_]) => + val newMetrics = handleJVMMetricName(name, poolNames, JVM_METRIC_MEMORY_PREFIX, "") + addGauge(newMetrics._1, newMetrics._2, metric) + case (name, metric) => new IllegalArgumentException(s"Unknown metric type: $name: $metric") + }) + + Seq( + new BufferPoolMetricSet(ManagementFactory.getPlatformMBeanServer)).map( + _.getMetrics.asScala.map { + case (name: String, metric: Gauge[_]) => + addGauge(MetricRegistry.name(JVM_METRIC_PREFIX, name), metric) + case (name, metric) => new 
IllegalArgumentException(s"Unknown metric type: $name: $metric") + }) + + Seq(new ThreadStatesGaugeSet()).map(_.getMetrics.asScala.map { + case (name: String, metric: Gauge[_]) => + addGauge(MetricRegistry.name(JVM_METRIC_THREAD_PREFIX, name), metric) + case (name, metric) => new IllegalArgumentException(s"Unknown metric type: $name: $metric") + }) + // start cleaner startCleaner() } + +object JVMSource { + private val JVM_METRIC_PREFIX = "jvm" + private val JVM_METRIC_MEMORY_PREFIX = JVM_METRIC_PREFIX + ".memory" + private val JVM_METRIC_THREAD_PREFIX = JVM_METRIC_PREFIX + ".thread" + + private val WHITESPACE = "\\s+".r.pattern +} diff --git a/common/src/test/scala/org/apache/celeborn/common/metrics/source/JVMSourceSuite.scala b/common/src/test/scala/org/apache/celeborn/common/metrics/source/JVMSourceSuite.scala new file mode 100644 index 00000000000..6f4f8cac76b --- /dev/null +++ b/common/src/test/scala/org/apache/celeborn/common/metrics/source/JVMSourceSuite.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.celeborn.common.metrics.source + +import org.apache.celeborn.CelebornFunSuite +import org.apache.celeborn.common.CelebornConf + +class JVMSourceSuite extends CelebornFunSuite { + + val gcNames = Seq("G1-Young-Generation", "G1-Old-Generation").toBuffer + val poolNames = Seq("G1-Eden-Space", "G1-Survivor-Space", "G1-Old-Gen").toBuffer + + val JVM_METRIC_PREFIX = "jvm" + val JVM_MEMORY_PREFIX = "jvm.memory" + + test("Test handleJVMMetricName") { + + val jvmSource = new JVMSource(new CelebornConf(), "test") + + val gcMetric1 = "G1-Old-Generation.time" + val gcMetric2 = "G1-Young-Generation.count" + val gcResult1 = jvmSource.handleJVMMetricName(gcMetric1, gcNames, JVM_METRIC_PREFIX, "gc") + val gcResult2 = jvmSource.handleJVMMetricName(gcMetric2, gcNames, JVM_METRIC_PREFIX, "gc") + assert(gcResult1._1 == "jvm.gc.time") + assert(gcResult1._2 == Map("name" -> "G1-Old-Generation")) + assert(gcResult2._1 == "jvm.gc.count") + assert(gcResult2._2 == Map("name" -> "G1-Young-Generation")) + + val memoryMetric1 = "total.init" + val memoryMetrics = "pools.G1-Eden-Space.init" + val memoryResult1 = + jvmSource.handleJVMMetricName(memoryMetric1, poolNames, JVM_MEMORY_PREFIX, "") + val memoryResult2 = + jvmSource.handleJVMMetricName(memoryMetrics, poolNames, JVM_MEMORY_PREFIX, "") + assert(memoryResult1._1 == "jvm.memory.total.init") + assert(memoryResult1._2 == Map.empty[String, String]) + assert(memoryResult2._1 == "jvm.memory.pools.init") + assert(memoryResult2._2 == Map("name" -> "G1-Eden-Space")) + } +}