Skip to content

Commit

Permalink
[CELEBORN-1007] Improve JVM metrics naming and add ThreadStates metrics
Browse files Browse the repository at this point in the history
### What changes were proposed in this pull request?
Since we use codahale metrics to expose JVM metrics, the metric names have no prefix and are unclear, and it's not easy to build a Grafana template for them because the collector name or pool name is embedded in the metric name rather than exposed as a label.

So this PR adds JVM metric prefixes, removes the pool info from the metric name, and exposes the pool name as a label where needed.
It also adds ThreadStates metrics.

### Why are the changes needed?
To make JVM metrics easier to understand and easier to build dashboard templates for.

### Does this PR introduce _any_ user-facing change?
Yes, the JVM metrics naming is changed, and thread state metrics are additionally exposed.

change examples like below:
For GarbageCollectorMetricSet, G1-Old-Generation.time -> jvm.gc.time{name="G1-Old-Generation"}
For MemoryUsageGaugeSet, total.init -> jvm.memory.total.init ; pools.Metaspace.usage -> jvm.memory.pools.usage{name="Metaspace"}
For BufferPoolMetricSet, direct.count -> jvm.direct.count
For ThreadStatesGaugeSet, add jvm.thread.count.

For G1, the jvm metrics exposed now:
metrics_jvm_gc_time_Value{name="G1-Old-Generation",role="Worker"} 0 1695731141588
metrics_jvm_gc_count_Value{name="G1-Young-Generation",role="Worker"} 2 1695731141588
metrics_jvm_gc_time_Value{name="G1-Young-Generation",role="Worker"} 74 1695731141588
metrics_jvm_gc_count_Value{name="G1-Old-Generation",role="Worker"} 0 1695731141588

metrics_jvm_heap_committed_Value{role="Worker"} 2109734912 1695731141588
metrics_jvm_non_heap_used_Value{role="Worker"} 47700056 1695731141588
metrics_jvm_heap_used_Value{role="Worker"} 82801184 1695731141588
metrics_jvm_total_committed_Value{role="Worker"} 2160263168 1695731141588
metrics_jvm_total_init_Value{role="Worker"} 2112290816 1695731141588
metrics_jvm_non_heap_max_Value{role="Worker"} -1 1695731141588
metrics_jvm_heap_usage_Value{role="Worker"} 0.009639326483011246 1695731141588
metrics_jvm_total_used_Value{role="Worker"} 130502480 1695731141589
metrics_jvm_heap_init_Value{role="Worker"} 2109734912 1695731141589
metrics_jvm_non_heap_committed_Value{role="Worker"} 50528256 1695731141589
metrics_jvm_non_heap_init_Value{role="Worker"} 2555904 1695731141589
metrics_jvm_non_heap_usage_Value{role="Worker"} -4.7701296E7 1695731141589
metrics_jvm_heap_max_Value{role="Worker"} 8589934592 1695731141589
metrics_jvm_total_max_Value{role="Worker"} 8589934591 1695731141589
metrics_jvm_memory_pool_used_Value{name="Code-Cache",role="Worker"} 10314368 1695731141588
metrics_jvm_memory_pool_committed_Value{name="Code-Cache",role="Worker"} 10944512 1695731141588
metrics_jvm_memory_pool_init_Value{name="G1-Eden-Space",role="Worker"} 111149056 1695731141588
metrics_jvm_memory_pool_max_Value{name="G1-Old-Gen",role="Worker"} 8589934592 1695731141588
metrics_jvm_memory_pool_used_after_gc_Value{name="G1-Survivor-Space",role="Worker"} 14680064 1695731141588
metrics_jvm_memory_pool_used_Value{name="Compressed-Class-Space",role="Worker"} 4440192 1695731141588
metrics_jvm_memory_pool_usage_Value{name="Metaspace",role="Worker"} 0.9449504192610433 1695731141588
metrics_jvm_memory_pool_max_Value{name="Metaspace",role="Worker"} -1 1695731141588
metrics_jvm_memory_pool_init_Value{name="G1-Survivor-Space",role="Worker"} 0 1695731141588
metrics_jvm_memory_pool_committed_Value{name="G1-Old-Gen",role="Worker"} 1998585856 1695731141588
metrics_jvm_memory_pool_committed_Value{name="G1-Survivor-Space",role="Worker"} 14680064 1695731141588
metrics_jvm_memory_pool_committed_Value{name="G1-Eden-Space",role="Worker"} 96468992 1695731141588
metrics_jvm_memory_pool_max_Value{name="G1-Survivor-Space",role="Worker"} -1 1695731141588
metrics_jvm_memory_pool_usage_Value{name="Compressed-Class-Space",role="Worker"} 0.004135251045227051 1695731141588
metrics_jvm_memory_pool_usage_Value{name="G1-Survivor-Space",role="Worker"} 1.0 1695731141588
metrics_jvm_memory_pool_max_Value{name="Code-Cache",role="Worker"} 251658240 1695731141588
metrics_jvm_memory_pool_init_Value{name="Compressed-Class-Space",role="Worker"} 0 1695731141589
metrics_jvm_memory_pool_usage_Value{name="G1-Eden-Space",role="Worker"} 0.34782608695652173 1695731141589
metrics_jvm_memory_pool_init_Value{name="Metaspace",role="Worker"} 0 1695731141589
metrics_jvm_memory_pool_max_Value{name="G1-Eden-Space",role="Worker"} -1 1695731141589
metrics_jvm_memory_pool_usage_Value{name="Code-Cache",role="Worker"} 0.04098917643229167 1695731141589
metrics_jvm_memory_pool_used_after_gc_Value{name="G1-Eden-Space",role="Worker"} 0 1695731141589
metrics_jvm_memory_pool_init_Value{name="Code-Cache",role="Worker"} 2555904 1695731141589
metrics_jvm_memory_pool_used_Value{name="G1-Survivor-Space",role="Worker"} 14680064 1695731141589
metrics_jvm_memory_pool_committed_Value{name="Compressed-Class-Space",role="Worker"} 4718592 1695731141589
metrics_jvm_memory_pool_used_Value{name="G1-Eden-Space",role="Worker"} 33554432 1695731141589
metrics_jvm_memory_pool_used_Value{name="G1-Old-Gen",role="Worker"} 34566688 1695731141589
metrics_jvm_memory_pool_usage_Value{name="G1-Old-Gen",role="Worker"} 0.004024092108011246 1695731141589
metrics_jvm_memory_pool_used_after_gc_Value{name="G1-Old-Gen",role="Worker"} 0 1695731141589
metrics_jvm_memory_pool_committed_Value{name="Metaspace",role="Worker"} 34865152 1695731141589
metrics_jvm_memory_pool_init_Value{name="G1-Old-Gen",role="Worker"} 1998585856 1695731141589
metrics_jvm_memory_pool_used_Value{name="Metaspace",role="Worker"} 32945840 1695731141589
metrics_jvm_memory_pool_max_Value{name="Compressed-Class-Space",role="Worker"} 1073741824 1695731141589

metrics_jvm_direct_count_Value{role="Worker"} 8 1695731141589
metrics_jvm_direct_capacity_Value{role="Worker"} 1036 1695731141589
metrics_jvm_direct_used_Value{role="Worker"} 1037 1695731141589
metrics_jvm_mapped_used_Value{role="Worker"} 0 1695731141589
metrics_jvm_mapped_capacity_Value{role="Worker"} 0 1695731141589
metrics_jvm_mapped_count_Value{role="Worker"} 0 1695731141589

metrics_jvm_thread_timed_waiting_count_Value{role="Worker"} 23 1695731141589
metrics_jvm_thread_deadlock_count_Value{role="Worker"} 0 1695731141589
metrics_jvm_thread_count_Value{role="Worker"} 78 1695731141589
metrics_jvm_thread_waiting_count_Value{role="Worker"} 45 1695731141589
metrics_jvm_thread_daemon_count_Value{role="Worker"} 75 1695731141589
metrics_jvm_thread_new_count_Value{role="Worker"} 0 1695731141589
metrics_jvm_thread_blocked_count_Value{role="Worker"} 0 1695731141590
metrics_jvm_thread_deadlocks_Value{role="Worker"} [] 1695731141590
metrics_jvm_thread_runnable_count_Value{role="Worker"} 10 1695731141590
metrics_jvm_thread_terminated_count_Value{role="Worker"} 0 1695731141590

### How was this patch tested?
UT and cluster test with g1, PS-Scavenge/PS-MarkSweep and ParNew/CMS

Closes #1939 from onebox-li/improve-jvm-metrics.

Authored-by: onebox-li <[email protected]>
Signed-off-by: zky.zhoukeyong <[email protected]>
  • Loading branch information
onebox-li authored and waitinfuture committed Sep 28, 2023
1 parent 3e515c5 commit b4dfc09
Show file tree
Hide file tree
Showing 2 changed files with 126 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,26 +20,86 @@ package org.apache.celeborn.common.metrics.source
import java.lang.management.ManagementFactory

import scala.collection.JavaConverters._
import scala.collection.mutable

import com.codahale.metrics.Gauge
import com.codahale.metrics.jvm.{BufferPoolMetricSet, GarbageCollectorMetricSet, MemoryUsageGaugeSet}
import com.codahale.metrics.{Gauge, MetricRegistry}
import com.codahale.metrics.jvm.{BufferPoolMetricSet, GarbageCollectorMetricSet, MemoryUsageGaugeSet, ThreadStatesGaugeSet}

import org.apache.celeborn.common.CelebornConf

class JVMSource(conf: CelebornConf, role: String) extends AbstractSource(conf, role) {
override val sourceName = "JVM"

// all of metrics of GCMetricSet and BufferPoolMetricSet are Gauge
Seq(
new GarbageCollectorMetricSet(),
new MemoryUsageGaugeSet(),
new BufferPoolMetricSet(ManagementFactory.getPlatformMBeanServer))
.map { x =>
x.getMetrics.asScala.map {
case (name: String, metric: Gauge[_]) => addGauge(name, metric)
case (name, metric) => new IllegalArgumentException(s"Unknown metric type: $name: $metric")
import JVMSource._

// Names of the JVM's garbage collector MXBeans, with every run of whitespace
// (see WHITESPACE) replaced by a single "-".
private val gcNames = ManagementFactory.getGarbageCollectorMXBeans.asScala
  .map(_.getName)
  .map(collectorName => WHITESPACE.matcher(collectorName).replaceAll("-"))
// Names of the JVM's memory pool MXBeans, normalised the same way.
private val poolNames = ManagementFactory.getMemoryPoolMXBeans.asScala
  .map(_.getName)
  .map(memoryPoolName => WHITESPACE.matcher(memoryPoolName).replaceAll("-"))

/**
 * Builds the exported metric name, and optional labels, for a raw name coming from a MetricSet.
 *
 * The first entry of `targets` (in buffer order) that occurs in `metricName` is taken out of
 * the name and returned as the `name` label instead. When no entry occurs, the name is only
 * prefixed and no labels are returned.
 *
 * @param metricName metric name from MetricSet
 * @param targets keywords (collector or pool names) to look for in `metricName`
 * @param prefix prefix for the new metric name
 * @param replacement text substituted for the matched target; when empty, the target is
 *                    removed together with the "." that follows it
 * @return the new metric name and the labels (`"name" -> target` on a match, otherwise empty)
 */
def handleJVMMetricName(
    metricName: String,
    targets: mutable.Buffer[String],
    prefix: String,
    replacement: String): (String, Map[String, String]) = {
  // `find` yields the first matching target, exactly like the early exit of a loop,
  // but without a nonlocal `return` from inside a closure.
  targets.find(target => metricName.contains(target)) match {
    case Some(target) =>
      // An empty replacement would otherwise leave a dangling "." behind the removed target.
      val replaceTarget = if (replacement.isEmpty) target + "." else target
      (
        MetricRegistry.name(prefix, metricName.replace(replaceTarget, replacement)),
        Map("name" -> target))
    case None =>
      (MetricRegistry.name(prefix, metricName), Map.empty[String, String])
  }
}

// Register the GarbageCollectorMetricSet gauges: the collector name is moved out of the
// metric name into a label and replaced by "gc" (see handleJVMMetricName).
// All metrics in this MetricSet are expected to be gauges; anything else is reported by
// throwing instead of being silently dropped.
new GarbageCollectorMetricSet().getMetrics.asScala.foreach {
  case (name: String, metric: Gauge[_]) =>
    val (metricName, labels) = handleJVMMetricName(name, gcNames, JVM_METRIC_PREFIX, "gc")
    addGauge(metricName, labels, metric)
  case (name, metric) =>
    throw new IllegalArgumentException(s"Unknown metric type: $name: $metric")
}

// Register the MemoryUsageGaugeSet gauges under the memory prefix: a pool name found in the
// metric name is removed from it and exposed as a label (see handleJVMMetricName).
// A metric that is not a gauge is reported by throwing instead of being silently dropped.
new MemoryUsageGaugeSet().getMetrics.asScala.foreach {
  case (name: String, metric: Gauge[_]) =>
    val (metricName, labels) =
      handleJVMMetricName(name, poolNames, JVM_METRIC_MEMORY_PREFIX, "")
    addGauge(metricName, labels, metric)
  case (name, metric) =>
    throw new IllegalArgumentException(s"Unknown metric type: $name: $metric")
}

// Register the BufferPoolMetricSet gauges, prefixing each name with the JVM metric prefix.
// A metric that is not a gauge is reported by throwing instead of being silently dropped.
new BufferPoolMetricSet(ManagementFactory.getPlatformMBeanServer).getMetrics.asScala.foreach {
  case (name: String, metric: Gauge[_]) =>
    addGauge(MetricRegistry.name(JVM_METRIC_PREFIX, name), metric)
  case (name, metric) =>
    throw new IllegalArgumentException(s"Unknown metric type: $name: $metric")
}

// Register the ThreadStatesGaugeSet gauges, prefixing each name with the thread metric prefix.
// A metric that is not a gauge is reported by throwing instead of being silently dropped.
new ThreadStatesGaugeSet().getMetrics.asScala.foreach {
  case (name: String, metric: Gauge[_]) =>
    addGauge(MetricRegistry.name(JVM_METRIC_THREAD_PREFIX, name), metric)
  case (name, metric) =>
    throw new IllegalArgumentException(s"Unknown metric type: $name: $metric")
}

// start cleaner
startCleaner()
}

object JVMSource {
  // Common prefix of every JVM metric name.
  private val JVM_METRIC_PREFIX = "jvm"
  // Prefixes for the memory and thread metric groups, derived from the common prefix.
  private val JVM_METRIC_MEMORY_PREFIX = s"$JVM_METRIC_PREFIX.memory"
  private val JVM_METRIC_THREAD_PREFIX = s"$JVM_METRIC_PREFIX.thread"

  // Compiled pattern matching one or more consecutive whitespace characters.
  private val WHITESPACE = java.util.regex.Pattern.compile("\\s+")
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.celeborn.common.metrics.source

import org.apache.celeborn.CelebornFunSuite
import org.apache.celeborn.common.CelebornConf

/** Unit tests for the metric-name rewriting done by `JVMSource.handleJVMMetricName`. */
class JVMSourceSuite extends CelebornFunSuite {

  // Collector and pool names in the whitespace-free form the method is called with.
  val gcNames = List("G1-Young-Generation", "G1-Old-Generation").toBuffer
  val poolNames = List("G1-Eden-Space", "G1-Survivor-Space", "G1-Old-Gen").toBuffer

  val JVM_METRIC_PREFIX = "jvm"
  val JVM_MEMORY_PREFIX = "jvm.memory"

  test("Test handleJVMMetricName") {
    val source = new JVMSource(new CelebornConf(), "test")

    // GC metrics: the collector name is replaced by "gc" and returned as the "name" label.
    val (oldGenName, oldGenLabels) =
      source.handleJVMMetricName("G1-Old-Generation.time", gcNames, JVM_METRIC_PREFIX, "gc")
    val (youngGenName, youngGenLabels) =
      source.handleJVMMetricName("G1-Young-Generation.count", gcNames, JVM_METRIC_PREFIX, "gc")
    assert(oldGenName == "jvm.gc.time")
    assert(oldGenLabels == Map("name" -> "G1-Old-Generation"))
    assert(youngGenName == "jvm.gc.count")
    assert(youngGenLabels == Map("name" -> "G1-Young-Generation"))

    // Memory metrics: a name without a pool is only prefixed; a pool name is removed from
    // the metric name and returned as the "name" label.
    val (totalName, totalLabels) =
      source.handleJVMMetricName("total.init", poolNames, JVM_MEMORY_PREFIX, "")
    val (edenName, edenLabels) =
      source.handleJVMMetricName("pools.G1-Eden-Space.init", poolNames, JVM_MEMORY_PREFIX, "")
    assert(totalName == "jvm.memory.total.init")
    assert(totalLabels == Map.empty[String, String])
    assert(edenName == "jvm.memory.pools.init")
    assert(edenLabels == Map("name" -> "G1-Eden-Space"))
  }
}

0 comments on commit b4dfc09

Please sign in to comment.