Skip to content

Commit

Permalink
tenant: add endpoint with instant metrics
Browse files Browse the repository at this point in the history
Previously the tenant process was serving various metrics on
`/_status/vars`. This endpoint has all the available metrics and these are
updated every 10 sec. Many of the metrics show a rate that is calculated
over the 10 sec interval. Some of the metrics are used by the cockroach
operator to monitor the CPU workload of the tenant process and use that
workload for automatic scaling. The 10 sec interval however is too long
and causes a slow scaling up. The reporting of high CPU utilization can
take up to 20 sec (to compute a delta). To resolve this, the PR adds a
new endpoint `/_status/instantvars` that provides an instant reading of a
very small subset of the normal metrics - user and system CPU time for
now. By having these as instant, the client can retrieve in quick
succession, consecutive snapshots and compute a precise CPU utulization.
It also allows the client to control the interval between the two pulls
(as opposed to having it hard coded to 10 sec).

Release note: None
  • Loading branch information
darinpp committed Sep 27, 2021
1 parent 37d9f72 commit e1dbe12
Show file tree
Hide file tree
Showing 5 changed files with 153 additions and 0 deletions.
2 changes: 2 additions & 0 deletions pkg/ccl/serverccl/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ go_test(
"server_sql_test.go",
"tenant_grpc_test.go",
"tenant_status_test.go",
"tenant_vars_test.go",
],
embed = [":serverccl"],
deps = [
Expand Down Expand Up @@ -62,6 +63,7 @@ go_test(
"//pkg/util/log",
"//pkg/util/randutil",
"//pkg/util/timeutil",
"@com_github_elastic_gosigar//:gosigar",
"@com_github_lib_pq//:pq",
"@com_github_stretchr_testify//require",
"@org_golang_x_crypto//bcrypt",
Expand Down
112 changes: 112 additions & 0 deletions pkg/ccl/serverccl/tenant_vars_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
// Copyright 2021 The Cockroach Authors.
//
// Licensed as a CockroachDB Enterprise file under the Cockroach Community
// License (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// https://github.com/cockroachdb/cockroach/blob/master/licenses/CCL.txt

package serverccl

import (
"bufio"
"context"
"crypto/tls"
"net/http"
"os"
"regexp"
"strconv"
"testing"

"github.com/cockroachdb/cockroach/pkg/base"
_ "github.com/cockroachdb/cockroach/pkg/ccl/kvccl"
"github.com/cockroachdb/cockroach/pkg/roachpb"
"github.com/cockroachdb/cockroach/pkg/sql/tests"
"github.com/cockroachdb/cockroach/pkg/testutils/serverutils"
"github.com/cockroachdb/cockroach/pkg/util/leaktest"
"github.com/cockroachdb/cockroach/pkg/util/log"
"github.com/elastic/gosigar"
"github.com/stretchr/testify/require"
)

func TestTenantVars(t *testing.T) {
defer leaktest.AfterTest(t)()
defer log.Scope(t).Close(t)

ctx := context.Background()

serverParams, _ := tests.CreateTestServerParams()
testCluster := serverutils.StartNewTestCluster(t, 1 /* numNodes */, base.TestClusterArgs{
ServerArgs: serverParams,
})
defer testCluster.Stopper().Stop(ctx)

server := testCluster.Server(0 /* idx */)

tenant, _ := serverutils.StartTenant(t, server, base.TestTenantArgs{
TenantID: roachpb.MakeTenantID(10 /* id */),
})

url := "https://" + tenant.HTTPAddr() + "/_status/load"
client := http.Client{
Transport: &http.Transport{
TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
},
}
resp, err := client.Get(url)
require.NoError(t, err)
defer resp.Body.Close()
require.Equal(t, 200, resp.StatusCode,
"invalid non-200 status code %v from tenant", resp.StatusCode)

prometheusMetricStringPattern := `^(?P<metric>\w+)(?:\{` +
`(?P<labelvalues>(\w+=\".*\",)*(\w+=\".*\")?)\})?\s+(?P<value>.*)$`
promethusMetricStringRE := regexp.MustCompile(prometheusMetricStringPattern)

var cpuUserNS, cpuSysNS float64
scanner := bufio.NewScanner(resp.Body)
for scanner.Scan() {
matches := promethusMetricStringRE.FindStringSubmatch(scanner.Text())
if matches != nil {
if matches[1] == "sys_cpu_user_ns" {
cpuUserNS, err = strconv.ParseFloat(matches[5], 64)
require.NoError(t, err)
}
if matches[1] == "sys_cpu_sys_ns" {
cpuSysNS, err = strconv.ParseFloat(matches[5], 64)
require.NoError(t, err)
}
}
}
// The values are between zero and whatever User/Sys time is observed after the get.
require.Positive(t, cpuUserNS)
require.Positive(t, cpuSysNS)
cpuTime := gosigar.ProcTime{}
require.NoError(t, cpuTime.Get(os.Getpid()))
require.LessOrEqual(t, cpuUserNS, float64(cpuTime.User)*1e6)
require.LessOrEqual(t, cpuSysNS, float64(cpuTime.Sys)*1e6)

resp, err = client.Get(url)
require.NoError(t, err)
defer resp.Body.Close()
require.Equal(t, 200, resp.StatusCode,
"invalid non-200 status code %v from tenant", resp.StatusCode)

var cpuUserNS2, cpuSysNS2 float64
scanner = bufio.NewScanner(resp.Body)
for scanner.Scan() {
matches := promethusMetricStringRE.FindStringSubmatch(scanner.Text())
if matches != nil {
if matches[1] == "sys_cpu_user_ns" {
cpuUserNS2, err = strconv.ParseFloat(matches[5], 64)
require.NoError(t, err)
}
if matches[1] == "sys_cpu_sys_ns" {
cpuSysNS2, err = strconv.ParseFloat(matches[5], 64)
require.NoError(t, err)
}
}
}
require.LessOrEqual(t, float64(cpuTime.User)*1e6, cpuUserNS2)
require.LessOrEqual(t, float64(cpuTime.Sys)*1e6, cpuSysNS2)
}
1 change: 1 addition & 0 deletions pkg/server/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -203,6 +203,7 @@ go_library(
"@com_github_cockroachdb_pebble//vfs",
"@com_github_cockroachdb_redact//:redact",
"@com_github_cockroachdb_sentry_go//:sentry-go",
"@com_github_elastic_gosigar//:gosigar",
"@com_github_gogo_protobuf//proto",
"@com_github_gorilla_mux//:mux",
"@com_github_grpc_ecosystem_grpc_gateway//runtime:go_default_library",
Expand Down
3 changes: 3 additions & 0 deletions pkg/server/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,9 @@ const (
// statusVars exposes prometheus metrics for monitoring consumption.
statusVars = statusPrefix + "vars"

// loadStatusVars exposes prometheus metrics for instant monitoring of CPU load.
loadStatusVars = statusPrefix + "load"

// raftStateDormant is used when there is no known raft state.
raftStateDormant = "StateDormant"

Expand Down
35 changes: 35 additions & 0 deletions pkg/server/tenant.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import (
"context"
"crypto/tls"
"net/http"
"os"
"time"

"github.com/cockroachdb/cockroach/pkg/base"
Expand Down Expand Up @@ -52,6 +53,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/util/timeutil"
"github.com/cockroachdb/errors"
"github.com/cockroachdb/logtags"
"github.com/elastic/gosigar"
)

// StartTenant starts a stand-alone SQL server against a KV backend.
Expand Down Expand Up @@ -219,6 +221,8 @@ func StartTenant(
})
f := varsHandler{metricSource: args.recorder, st: args.Settings}.handleVars
mux.Handle(statusVars, http.HandlerFunc(f))
ff := loadVarsHandler(ctx, args.runtime)
mux.Handle(loadStatusVars, http.HandlerFunc(ff))

tlsConnManager := netutil.MakeServer(
args.stopper,
Expand Down Expand Up @@ -301,6 +305,37 @@ func StartTenant(
return s, pgLAddr, httpLAddr, nil
}

// Construct a handler responsible for serving the instant values of selected
// load metrics. These include user and system CPU time currently.
func loadVarsHandler(
ctx context.Context, rsr *status.RuntimeStatSampler,
) func(http.ResponseWriter, *http.Request) {
cpuTime := gosigar.ProcTime{}
cpuUserNS := rsr.CPUUserNS
cpuSysNS := rsr.CPUSysNS
registry := metric.NewRegistry()
registry.AddMetric(cpuUserNS)
registry.AddMetric(cpuSysNS)

return func(w http.ResponseWriter, r *http.Request) {
if err := cpuTime.Get(os.Getpid()); err != nil {
log.Ops.Errorf(ctx, "unable to get cpu usage: %v", err)
http.Error(w, err.Error(), http.StatusInternalServerError)
}
utime := int64(cpuTime.User) * 1e6
stime := int64(cpuTime.Sys) * 1e6
cpuUserNS.Update(utime)
cpuSysNS.Update(stime)

exporter := metric.MakePrometheusExporter()
exporter.ScrapeRegistry(registry, true)
if err := exporter.PrintAsText(w); err != nil {
log.Errorf(r.Context(), "%v", err)
http.Error(w, err.Error(), http.StatusInternalServerError)
}
}
}

func makeTenantSQLServerArgs(
stopper *stop.Stopper, kvClusterName string, baseCfg BaseConfig, sqlCfg SQLConfig,
) (sqlServerArgs, error) {
Expand Down

0 comments on commit e1dbe12

Please sign in to comment.