-
Notifications
You must be signed in to change notification settings - Fork 3.9k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
69826: clusterversion: mint 21.2 cluster version r=celiala a=celiala Shortly after [Creating a release branch](https://cockroachlabs.atlassian.net/wiki/spaces/ENG/pages/187859111/Creating+a+release+branch) for release-21.2, we'll want to merge PRs as instructed by the [New major version checklist](https://cockroachlabs.atlassian.net/wiki/spaces/ENG/pages/1522270228/New+major+version+checklist). This PR is for Step 1a of the [New major version checklist](https://cockroachlabs.atlassian.net/wiki/spaces/ENG/pages/1522270228/New+major+version+checklist), where we add a cluster version StartX for the corresponding .0 release. Notes: - **This should NOT be merged until we've released a beta** (adding `do-not-merge` until this happens) - These are all PRs created as part of the steps of the [New major version checklist](https://cockroachlabs.atlassian.net/wiki/spaces/ENG/pages/1522270228/New+major+version+checklist): - Step 1a (you are here): #69826 - Step 1b: #69828 - Step 2a + Step 2b: #69827 - Step 2c: #69829 - Step 3: TODO Release justification: Non-production code change. Release note: None 70750: tenant: add endpoint with instant metrics r=darinpp a=darinpp Previously the tenant process was serving various metrics on `/_status/vars`. This endpoint has all the available metrics and these are updated every 10 sec. Many of the metrics show a rate that is calculated over the 10 sec interval. Some of the metrics are used by the cockroach operator to monitor the CPU workload of the tenant process and use that workload for automatic scaling. The 10 sec interval however is too long and causes a slow scaling up. The reporting of high CPU utilization can take up to 20 sec (to compute a delta). To resolve this, the PR adds a new endpoint `/_status/load` that provides an instant reading of a very small subset of the normal metrics - user and system CPU time for now. 
By having these be instant, the client can retrieve consecutive snapshots in quick succession and compute a precise CPU utilization. It also allows the client to control the interval between the two pulls (as opposed to having it hard coded to 10 sec). Release note: None Co-authored-by: Celia La <[email protected]> Co-authored-by: Darin Peshev <[email protected]>
- Loading branch information
Showing
11 changed files
with
181 additions
and
19 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,110 @@ | ||
// Copyright 2021 The Cockroach Authors. | ||
// | ||
// Licensed as a CockroachDB Enterprise file under the Cockroach Community | ||
// License (the "License"); you may not use this file except in compliance with | ||
// the License. You may obtain a copy of the License at | ||
// | ||
// https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt | ||
|
||
package serverccl | ||
|
||
import ( | ||
"context" | ||
"crypto/tls" | ||
"net/http" | ||
"os" | ||
"testing" | ||
|
||
"github.com/cockroachdb/cockroach/pkg/base" | ||
_ "github.com/cockroachdb/cockroach/pkg/ccl/kvccl" | ||
"github.com/cockroachdb/cockroach/pkg/roachpb" | ||
"github.com/cockroachdb/cockroach/pkg/sql/tests" | ||
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils" | ||
"github.com/cockroachdb/cockroach/pkg/util/leaktest" | ||
"github.com/cockroachdb/cockroach/pkg/util/log" | ||
"github.com/elastic/gosigar" | ||
io_prometheus_client "github.com/prometheus/client_model/go" | ||
"github.com/prometheus/common/expfmt" | ||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
func TestTenantVars(t *testing.T) { | ||
defer leaktest.AfterTest(t)() | ||
defer log.Scope(t).Close(t) | ||
|
||
ctx := context.Background() | ||
|
||
serverParams, _ := tests.CreateTestServerParams() | ||
testCluster := serverutils.StartNewTestCluster(t, 1 /* numNodes */, base.TestClusterArgs{ | ||
ServerArgs: serverParams, | ||
}) | ||
defer testCluster.Stopper().Stop(ctx) | ||
|
||
server := testCluster.Server(0 /* idx */) | ||
|
||
tenant, _ := serverutils.StartTenant(t, server, base.TestTenantArgs{ | ||
TenantID: roachpb.MakeTenantID(10 /* id */), | ||
}) | ||
|
||
url := "https://" + tenant.HTTPAddr() + "/_status/load" | ||
client := http.Client{ | ||
Transport: &http.Transport{ | ||
TLSClientConfig: &tls.Config{InsecureSkipVerify: true}, | ||
}, | ||
} | ||
resp, err := client.Get(url) | ||
require.NoError(t, err) | ||
defer resp.Body.Close() | ||
require.Equal(t, 200, resp.StatusCode, | ||
"invalid non-200 status code %v from tenant", resp.StatusCode) | ||
|
||
var parser expfmt.TextParser | ||
metrics, err := parser.TextToMetricFamilies(resp.Body) | ||
require.NoError(t, err) | ||
|
||
userCPU, found := metrics["sys_cpu_user_ns"] | ||
require.True(t, found) | ||
require.Len(t, userCPU.GetMetric(), 1) | ||
require.Equal(t, io_prometheus_client.MetricType_GAUGE, userCPU.GetType()) | ||
cpuUserNanos := userCPU.Metric[0].GetGauge().GetValue() | ||
|
||
sysCPU, found := metrics["sys_cpu_sys_ns"] | ||
require.True(t, found) | ||
require.True(t, found) | ||
require.Len(t, sysCPU.GetMetric(), 1) | ||
require.Equal(t, io_prometheus_client.MetricType_GAUGE, sysCPU.GetType()) | ||
cpuSysNanos := sysCPU.Metric[0].GetGauge().GetValue() | ||
|
||
// The values are between zero and whatever User/Sys time is observed after the get. | ||
require.Positive(t, cpuUserNanos) | ||
require.Positive(t, cpuSysNanos) | ||
cpuTime := gosigar.ProcTime{} | ||
require.NoError(t, cpuTime.Get(os.Getpid())) | ||
require.LessOrEqual(t, cpuUserNanos, float64(cpuTime.User)*1e6) | ||
require.LessOrEqual(t, cpuSysNanos, float64(cpuTime.Sys)*1e6) | ||
|
||
resp, err = client.Get(url) | ||
require.NoError(t, err) | ||
defer resp.Body.Close() | ||
require.Equal(t, 200, resp.StatusCode, | ||
"invalid non-200 status code %v from tenant", resp.StatusCode) | ||
|
||
metrics, err = parser.TextToMetricFamilies(resp.Body) | ||
require.NoError(t, err) | ||
|
||
userCPU, found = metrics["sys_cpu_user_ns"] | ||
require.True(t, found) | ||
require.Len(t, userCPU.GetMetric(), 1) | ||
require.Equal(t, io_prometheus_client.MetricType_GAUGE, userCPU.GetType()) | ||
cpuUserNanos2 := userCPU.Metric[0].GetGauge().GetValue() | ||
|
||
sysCPU, found = metrics["sys_cpu_sys_ns"] | ||
require.True(t, found) | ||
require.True(t, found) | ||
require.Len(t, sysCPU.GetMetric(), 1) | ||
require.Equal(t, io_prometheus_client.MetricType_GAUGE, sysCPU.GetType()) | ||
cpuSysNanos2 := sysCPU.Metric[0].GetGauge().GetValue() | ||
|
||
require.LessOrEqual(t, float64(cpuTime.User)*1e6, cpuUserNanos2) | ||
require.LessOrEqual(t, float64(cpuTime.Sys)*1e6, cpuSysNanos2) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters