Skip to content

Commit

Permalink
dbconsole: reorder overload page metrics for better readability
Browse files Browse the repository at this point in the history
This patch rearranges the existing metrics into a more usable order:
1. Metrics to help determine which resource is constrained (IO, CPU)
2. Metrics to narrow down which AC queues are seeing requests waiting
3. More advanced metrics about the system health (goroutine scheduler,
   L0 sublevels, etc.)

Informs #121572.

Release note (ui change): Metrics on the overload page have been reordered
to help categorize them better. They are now roughly in the following order:
1. Metrics to help determine which resource is constrained (IO, CPU)
2. Metrics to narrow down which AC queues are seeing requests waiting
3. More advanced metrics about the system health (goroutine scheduler,
   L0 sublevels, etc.)
  • Loading branch information
aadityasondhi committed May 21, 2024
1 parent 8770b52 commit e785089
Showing 1 changed file with 106 additions and 106 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -49,92 +49,39 @@ export default function (props: GraphDashboardProps) {
</LineGraph>,

<LineGraph
title="Goroutine Scheduling Latency: 99th percentile"
sources={nodeSources}
tenantSource={tenantSource}
tooltip={`P99 scheduling latency for goroutines`}
showMetricsInTooltip={true}
>
<Axis units={AxisUnits.Duration} label="latency">
{nodeIDs.map(nid => (
<>
<Metric
key={nid}
name="cr.node.go.scheduler_latency-p99"
title={nodeDisplayName(nodeDisplayNameByID, nid)}
sources={[nid]}
downsampleMax
/>
</>
))}
</Axis>
</LineGraph>,

<LineGraph
title="Runnable Goroutines per CPU"
title="KV Admission Slots Exhausted"
sources={nodeSources}
tenantSource={tenantSource}
tooltip={`The number of Goroutines waiting per CPU.`}
showMetricsInTooltip={true}
>
<Axis label="goroutines">
<Axis label="duration (micros/sec)">
{nodeIDs.map(nid => (
<Metric
name="cr.node.sys.runnable.goroutines.per.cpu"
title={nodeDisplayName(nodeDisplayNameByID, nid)}
key={nid}
name="cr.node.admission.granter.slots_exhausted_duration.kv"
title={
"Admission Slots Exhausted " +
nodeDisplayName(nodeDisplayNameByID, nid)
}
sources={[nid]}
nonNegativeRate
/>
))}
</Axis>
</LineGraph>,

<LineGraph
title="Elastic CPU Utilization"
sources={nodeSources}
tenantSource={tenantSource}
tooltip={`CPU utilization by elastic work, compared to the limit set for elastic work.`}
showMetricsInTooltip={true}
>
<Axis units={AxisUnits.Percentage} label="CPU Utilization">
{nodeIDs.map(nid => (
<>
<Metric
name="cr.node.admission.elastic_cpu.utilization"
title={
"Elastic CPU Utilization " +
nodeDisplayName(nodeDisplayNameByID, nid)
}
sources={[nid]}
/>
<Metric
name="cr.node.admission.elastic_cpu.utilization_limit"
title={
"Elastic CPU Utilization Limit " +
nodeDisplayName(nodeDisplayNameByID, nid)
}
sources={[nid]}
/>
</>
))}
</Axis>
</LineGraph>,

<LineGraph
title="Elastic CPU Exhausted Duration Per Second"
title="KV Admission IO Tokens Exhausted Duration Per Second"
sources={nodeSources}
tenantSource={tenantSource}
tooltip={`Duration of CPU exhaustion by elastic work, in microseconds.`}
showMetricsInTooltip={true}
>
<Axis label="duration (micros/sec)">
{nodeIDs.map(nid => (
<Metric
key={nid}
name="cr.node.admission.elastic_cpu.nanos_exhausted_duration"
title={
"Elastic CPU Exhausted " +
nodeDisplayName(nodeDisplayNameByID, nid)
}
name="cr.node.admission.granter.io_tokens_exhausted_duration.kv"
title={"IO Exhausted " + nodeDisplayName(nodeDisplayNameByID, nid)}
sources={[nid]}
nonNegativeRate
/>
Expand Down Expand Up @@ -164,18 +111,19 @@ export default function (props: GraphDashboardProps) {
</LineGraph>,

<LineGraph
title="KV Admission Slots Exhausted"
title="Elastic CPU Exhausted Duration Per Second"
sources={nodeSources}
tenantSource={tenantSource}
tooltip={`Duration of CPU exhaustion by elastic work, in microseconds.`}
showMetricsInTooltip={true}
>
<Axis label="duration (micros/sec)">
{nodeIDs.map(nid => (
<Metric
key={nid}
name="cr.node.admission.granter.slots_exhausted_duration.kv"
name="cr.node.admission.elastic_cpu.nanos_exhausted_duration"
title={
"Admission Slots Exhausted " +
"Elastic CPU Exhausted " +
nodeDisplayName(nodeDisplayNameByID, nid)
}
sources={[nid]}
Expand All @@ -186,49 +134,76 @@ export default function (props: GraphDashboardProps) {
</LineGraph>,

<LineGraph
title="KV Admission IO Tokens Exhausted Duration Per Second"
title="Flow Tokens Wait Time: 75th percentile"
sources={nodeSources}
tenantSource={tenantSource}
showMetricsInTooltip={true}
>
<Axis label="duration (micros/sec)">
<Axis units={AxisUnits.Duration} label="p75 flow token wait duration">
{nodeIDs.map(nid => (
<Metric
key={nid}
name="cr.node.admission.granter.io_tokens_exhausted_duration.kv"
title={"IO Exhausted " + nodeDisplayName(nodeDisplayNameByID, nid)}
sources={[nid]}
nonNegativeRate
/>
<>
<Metric
key={nid}
name="cr.node.kvadmission.flow_controller.regular_wait_duration-p75"
title={
"Regular flow token wait time " +
nodeDisplayName(nodeDisplayNameByID, nid)
}
sources={[nid]}
downsampleMax
/>
<Metric
key={nid}
name="cr.node.kvadmission.flow_controller.elastic_wait_duration-p75"
title={
"Elastic flow token wait time " +
nodeDisplayName(nodeDisplayNameByID, nid)
}
sources={[nid]}
downsampleMax
/>
</>
))}
</Axis>
</LineGraph>,

<LineGraph
title="Flow Tokens Wait Time: 75th percentile"
title="Admission Delay: 75th percentile"
sources={nodeSources}
tenantSource={tenantSource}
showMetricsInTooltip={true}
>
<Axis units={AxisUnits.Duration} label="p75 flow token wait duration">
<Axis units={AxisUnits.Duration} label="delay for requests that waited">
{nodeIDs.map(nid => (
<>
<Metric
key={nid}
name="cr.node.kvadmission.flow_controller.regular_wait_duration-p75"
name="cr.node.admission.wait_durations.kv-p75"
title={"KV " + nodeDisplayName(nodeDisplayNameByID, nid)}
sources={[nid]}
downsampleMax
/>
<Metric
key={nid}
name="cr.node.admission.wait_durations.kv-stores-p75"
title={"KV write " + nodeDisplayName(nodeDisplayNameByID, nid)}
sources={[nid]}
downsampleMax
/>
<Metric
key={nid}
name="cr.node.admission.wait_durations.sql-kv-response-p75"
title={
"Regular flow token wait time " +
nodeDisplayName(nodeDisplayNameByID, nid)
"SQL-KV response " + nodeDisplayName(nodeDisplayNameByID, nid)
}
sources={[nid]}
downsampleMax
/>
<Metric
key={nid}
name="cr.node.kvadmission.flow_controller.elastic_wait_duration-p75"
name="cr.node.admission.wait_durations.sql-sql-response-p75"
title={
"Elastic flow token wait time " +
nodeDisplayName(nodeDisplayNameByID, nid)
"SQL-SQL response " + nodeDisplayName(nodeDisplayNameByID, nid)
}
sources={[nid]}
downsampleMax
Expand Down Expand Up @@ -271,49 +246,74 @@ export default function (props: GraphDashboardProps) {
</LineGraph>,

<LineGraph
title="Admission Delay: 75th percentile"
title="Elastic CPU Utilization"
sources={nodeSources}
tenantSource={tenantSource}
tooltip={`CPU utilization by elastic work, compared to the limit set for elastic work.`}
showMetricsInTooltip={true}
>
<Axis units={AxisUnits.Duration} label="delay for requests that waited">
<Axis units={AxisUnits.Percentage} label="CPU Utilization">
{nodeIDs.map(nid => (
<>
<Metric
key={nid}
name="cr.node.admission.wait_durations.kv-p75"
title={"KV " + nodeDisplayName(nodeDisplayNameByID, nid)}
sources={[nid]}
downsampleMax
/>
<Metric
key={nid}
name="cr.node.admission.wait_durations.kv-stores-p75"
title={"KV write " + nodeDisplayName(nodeDisplayNameByID, nid)}
name="cr.node.admission.elastic_cpu.utilization"
title={
"Elastic CPU Utilization " +
nodeDisplayName(nodeDisplayNameByID, nid)
}
sources={[nid]}
downsampleMax
/>
<Metric
key={nid}
name="cr.node.admission.wait_durations.sql-kv-response-p75"
name="cr.node.admission.elastic_cpu.utilization_limit"
title={
"SQL-KV response " + nodeDisplayName(nodeDisplayNameByID, nid)
"Elastic CPU Utilization Limit " +
nodeDisplayName(nodeDisplayNameByID, nid)
}
sources={[nid]}
downsampleMax
/>
</>
))}
</Axis>
</LineGraph>,

<LineGraph
title="Goroutine Scheduling Latency: 99th percentile"
sources={nodeSources}
tenantSource={tenantSource}
tooltip={`P99 scheduling latency for goroutines`}
showMetricsInTooltip={true}
>
<Axis units={AxisUnits.Duration} label="latency">
{nodeIDs.map(nid => (
<>
<Metric
key={nid}
name="cr.node.admission.wait_durations.sql-sql-response-p75"
title={
"SQL-SQL response " + nodeDisplayName(nodeDisplayNameByID, nid)
}
name="cr.node.go.scheduler_latency-p99"
title={nodeDisplayName(nodeDisplayNameByID, nid)}
sources={[nid]}
downsampleMax
/>
</>
))}
</Axis>
</LineGraph>,

<LineGraph
title="Runnable Goroutines per CPU"
sources={nodeSources}
tenantSource={tenantSource}
tooltip={`The number of Goroutines waiting per CPU.`}
showMetricsInTooltip={true}
>
<Axis label="goroutines">
{nodeIDs.map(nid => (
<Metric
name="cr.node.sys.runnable.goroutines.per.cpu"
title={nodeDisplayName(nodeDisplayNameByID, nid)}
sources={[nid]}
/>
))}
</Axis>
</LineGraph>,
];
}

0 comments on commit e785089

Please sign in to comment.