Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

executor: add diagnosis rule to check some metrics exceed thresholds #14801

Merged
merged 13 commits into from
Feb 18, 2020
118 changes: 118 additions & 0 deletions executor/diagnostics.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,13 +79,17 @@ type (
// criticalErrorInspection is used to check are there some critical errors
// occurred in the past
criticalErrorInspection struct{ inspectionName }

// thresholdCheckInspection is used to check some threshold value, like CPU usage, leader count change.
thresholdCheckInspection struct{ inspectionName }
)

var inspectionRules = []inspectionRule{
&configInspection{inspectionName: "config"},
&versionInspection{inspectionName: "version"},
&currentLoadInspection{inspectionName: "current-load"},
&criticalErrorInspection{inspectionName: "critical-error"},
&thresholdCheckInspection{inspectionName: "threshold-check"},
}

type inspectionRetriever struct {
Expand Down Expand Up @@ -363,3 +367,117 @@ func (criticalErrorInspection) inspect(ctx context.Context, sctx sessionctx.Cont
}
return results
}

func (thresholdCheckInspection) inspect(ctx context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
var rules = []struct {
item string
component string
configKey string
threshold float64
}{
{
item: "coprocessor_normal_cpu",
component: "cop_normal%",
configKey: "readpool.coprocessor.normal-concurrency",
threshold: 0.9},
{
item: "coprocessor_high_cpu",
component: "cop_high%",
configKey: "readpool.coprocessor.high-concurrency",
threshold: 0.9,
},
{
item: "coprocessor_low_cpu",
component: "cop_low%",
configKey: "readpool.coprocessor.low-concurrency",
threshold: 0.9,
},
{
item: "grpc_cpu",
component: "grpc%",
configKey: "server.grpc-concurrency",
threshold: 0.9,
},
{
item: "raftstore_cpu",
component: "raftstore_%",
configKey: "raftstore.store-pool-size",
threshold: 0.8,
},
{
item: "apply_cpu",
component: "apply_%",
configKey: "raftstore.apply-pool-size",
threshold: 0.8,
},
{
item: "storage_readpool_normal_cpu",
component: "store_read_norm%",
configKey: "readpool.storage.normal-concurrency",
threshold: 0.9,
},
{
item: "storage_readpool_high_cpu",
component: "store_read_high%",
configKey: "readpool.storage.high-concurrency",
threshold: 0.9,
},
{
item: "storage_readpool_low_cpu",
component: "store_read_low%",
configKey: "readpool.storage.low-concurrency",
threshold: 0.9,
},
{
item: "scheduler_worker_cpu",
component: "sched_%",
configKey: "storage.scheduler-worker-pool-size",
threshold: 0.85,
},
{
item: "split_check_cpu",
component: "split_check",
threshold: 0.9,
},
}
var results []inspectionResult
for _, rule := range rules {
var sql string
if len(rule.configKey) > 0 {
sql = fmt.Sprintf("select t1.instance, t1.cpu, t2.threshold, t2.value from "+
"(select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like '%[1]s' and time=now() group by instance) as t1,"+
"(select value * %[2]f as threshold, value from inspection_schema.cluster_config where type='tikv' and `key` = '%[3]s' limit 1) as t2 "+
"where t1.cpu > t2.threshold;", rule.component, rule.threshold, rule.configKey)
} else {
sql = fmt.Sprintf("select t1.instance, t1.cpu, %[2]f from "+
"(select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like '%[1]s' and time=now() group by instance) as t1 "+
"where t1.cpu > %[2]f;", rule.component, rule.threshold)
}
rows, _, err := sctx.(sqlexec.RestrictedSQLExecutor).ExecRestrictedSQLWithContext(ctx, sql)
if err != nil {
sctx.GetSessionVars().StmtCtx.AppendWarning(fmt.Errorf("execute '%s' failed: %v", sql, err))
continue
}
for _, row := range rows {
actual := fmt.Sprintf("%.2f", row.GetFloat64(1))
expected := ""
if len(rule.configKey) > 0 {
expected = fmt.Sprintf("< %.2f, config: %v=%v", row.GetFloat64(2), rule.configKey, row.GetString(3))
} else {
expected = fmt.Sprintf("< %.2f", row.GetFloat64(2))
}
detail := fmt.Sprintf("select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like '%[1]s' and time=now() group by instance", rule.component)
result := inspectionResult{
tp: "tikv",
instance: row.GetString(0),
item: rule.item,
actual: actual,
expected: expected,
severity: "warning",
detail: detail,
}
results = append(results, result)
}
}
return results
}
119 changes: 114 additions & 5 deletions executor/diagnostics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,13 @@ package executor_test

import (
"context"

. "github.com/pingcap/check"
"github.com/pingcap/failpoint"
"github.com/pingcap/parser/mysql"
"github.com/pingcap/tidb/domain"
"github.com/pingcap/tidb/infoschema"
"github.com/pingcap/tidb/kv"
"github.com/pingcap/tidb/session"
"github.com/pingcap/tidb/sessionctx/variable"
"github.com/pingcap/tidb/types"
"github.com/pingcap/tidb/util/testkit"
Expand Down Expand Up @@ -166,17 +166,126 @@ func (s *diagnosticsSuite) TestInspectionResult(c *C) {
}
}

func (s *diagnosticsSuite) parseTime(c *C, se session.Session, str string) types.Time {
t, err := types.ParseTime(se.GetSessionVars().StmtCtx, str, mysql.TypeDatetime, types.MaxFsp)
c.Assert(err, IsNil)
return t
}

func (s *diagnosticsSuite) TestThresholdCheckInspection(c *C) {
tk := testkit.NewTestKitWithInit(c, s.store)
// mock tikv configuration.
configurations := map[string]variable.TableSnapshot{}
configurations[infoschema.TableClusterConfig] = variable.TableSnapshot{
Rows: [][]types.Datum{
types.MakeDatums("tikv", "tikv-0", "raftstore.apply-pool-size", "2"),
types.MakeDatums("tikv", "tikv-0", "raftstore.store-pool-size", "2"),
types.MakeDatums("tikv", "tikv-0", "readpool.coprocessor.high-concurrency", "4"),
types.MakeDatums("tikv", "tikv-0", "readpool.coprocessor.low-concurrency", "4"),
types.MakeDatums("tikv", "tikv-0", "readpool.coprocessor.normal-concurrency", "4"),
types.MakeDatums("tikv", "tikv-0", "readpool.storage.high-concurrency", "4"),
types.MakeDatums("tikv", "tikv-0", "readpool.storage.low-concurrency", "4"),
types.MakeDatums("tikv", "tikv-0", "readpool.storage.normal-concurrency", "4"),
types.MakeDatums("tikv", "tikv-0", "server.grpc-concurrency", "8"),
types.MakeDatums("tikv", "tikv-0", "storage.scheduler-worker-pool-size", "6"),
},
}
datetime := func(str string) types.Time {
return s.parseTime(c, tk.Se, str)
}
// construct some mock abnormal data
mockData := map[string][][]types.Datum{
// columns: time, instance, name, value
"tikv_thread_cpu": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_normal0", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_normal1", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_high1", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_low1", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "grpc_1", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "raftstore_1", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "apply_0", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_norm1", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_high2", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_low0", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "sched_2", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "split_check", 10.0),
},
}

fpName := "github.com/pingcap/tidb/executor/mockMergeMockInspectionTables"
c.Assert(failpoint.Enable(fpName, "return"), IsNil)
defer func() { c.Assert(failpoint.Disable(fpName), IsNil) }()

// Mock for metric table data.
fpName2 := "github.com/pingcap/tidb/executor/mockMetricsTableData"
c.Assert(failpoint.Enable(fpName2, "return"), IsNil)
defer func() { c.Assert(failpoint.Disable(fpName2), IsNil) }()

ctx := context.WithValue(context.Background(), "__mockInspectionTables", configurations)
ctx = context.WithValue(ctx, "__mockMetricsTableData", mockData)
ctx = failpoint.WithHook(ctx, func(_ context.Context, fpname2 string) bool {
return fpName2 == fpname2 || fpname2 == fpName
})

rs, err := tk.Se.Execute(ctx, "select item, type, instance, value, reference, details from information_schema.inspection_result where rule='threshold-check' order by item")
c.Assert(err, IsNil)
result := tk.ResultSetToResultWithCtx(ctx, rs[0], Commentf("execute inspect SQL failed"))
c.Assert(tk.Se.GetSessionVars().StmtCtx.WarningCount(), Equals, uint16(0), Commentf("unexpected warnings: %+v", tk.Se.GetSessionVars().StmtCtx.GetWarnings()))
result.Check(testkit.Rows(
"apply_cpu tikv tikv-0 10.00 < 1.60, config: raftstore.apply-pool-size=2 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'apply_%' and time=now() group by instance",
"coprocessor_high_cpu tikv tikv-0 10.00 < 3.60, config: readpool.coprocessor.high-concurrency=4 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'cop_high%' and time=now() group by instance",
"coprocessor_low_cpu tikv tikv-0 10.00 < 3.60, config: readpool.coprocessor.low-concurrency=4 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'cop_low%' and time=now() group by instance",
"coprocessor_normal_cpu tikv tikv-0 20.00 < 3.60, config: readpool.coprocessor.normal-concurrency=4 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'cop_normal%' and time=now() group by instance",
"grpc_cpu tikv tikv-0 10.00 < 7.20, config: server.grpc-concurrency=8 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'grpc%' and time=now() group by instance",
"raftstore_cpu tikv tikv-0 10.00 < 1.60, config: raftstore.store-pool-size=2 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'raftstore_%' and time=now() group by instance",
"scheduler_worker_cpu tikv tikv-0 10.00 < 5.10, config: storage.scheduler-worker-pool-size=6 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'sched_%' and time=now() group by instance",
"split_check_cpu tikv tikv-0 10.00 < 0.00 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'split_check' and time=now() group by instance",
"storage_readpool_high_cpu tikv tikv-0 10.00 < 3.60, config: readpool.storage.high-concurrency=4 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'store_read_high%' and time=now() group by instance",
"storage_readpool_low_cpu tikv tikv-0 10.00 < 3.60, config: readpool.storage.low-concurrency=4 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'store_read_low%' and time=now() group by instance",
"storage_readpool_normal_cpu tikv tikv-0 10.00 < 3.60, config: readpool.storage.normal-concurrency=4 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'store_read_norm%' and time=now() group by instance",
))

// construct some mock normal data
mockData = map[string][][]types.Datum{
// columns: time, instance, name, value
"tikv_thread_cpu": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_normal0", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_high1", 0.1),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_low1", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "grpc_1", 7.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "grpc_2", 0.21),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "raftstore_1", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "apply_0", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_norm1", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_high2", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_low0", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "sched_2", 0.3),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "split_check", 0.5),
},
}

ctx = context.WithValue(context.Background(), "__mockInspectionTables", configurations)
ctx = context.WithValue(ctx, "__mockMetricsTableData", mockData)
ctx = failpoint.WithHook(ctx, func(_ context.Context, fpname2 string) bool {
return fpName2 == fpname2 || fpname2 == fpName
})
rs, err = tk.Se.Execute(ctx, "select item, type, instance, value, reference, details from information_schema.inspection_result where rule='threshold-check' order by item")
c.Assert(err, IsNil)
result = tk.ResultSetToResultWithCtx(ctx, rs[0], Commentf("execute inspect SQL failed"))
c.Assert(tk.Se.GetSessionVars().StmtCtx.WarningCount(), Equals, uint16(0), Commentf("unexpected warnings: %+v", tk.Se.GetSessionVars().StmtCtx.GetWarnings()))
result.Check(testkit.Rows(
"grpc_cpu tikv tikv-0 7.21 < 7.20, config: server.grpc-concurrency=8 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'grpc%' and time=now() group by instance"))
}

func (s *diagnosticsSuite) TestCriticalErrorInspection(c *C) {
tk := testkit.NewTestKitWithInit(c, s.store)

fpName := "github.com/pingcap/tidb/executor/mockMetricsTableData"
c.Assert(failpoint.Enable(fpName, "return"), IsNil)
defer func() { c.Assert(failpoint.Disable(fpName), IsNil) }()

datetime := func(s string) types.Time {
t, err := types.ParseTime(tk.Se.GetSessionVars().StmtCtx, s, mysql.TypeDatetime, types.MaxFsp)
c.Assert(err, IsNil)
return t
datetime := func(str string) types.Time {
return s.parseTime(c, tk.Se, str)
}

// construct some mock data
Expand Down
19 changes: 19 additions & 0 deletions infoschema/inspection_schema.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,10 @@ package infoschema

import (
"fmt"
"strings"

"github.com/pingcap/errors"
"github.com/pingcap/failpoint"
"github.com/pingcap/parser/model"
"github.com/pingcap/parser/mysql"
"github.com/pingcap/tidb/kv"
Expand Down Expand Up @@ -59,6 +61,23 @@ func (it *inspectionSchemaTable) IterRecords(ctx sessionctx.Context, startKey kv
return table.ErrUnsupportedOp
}

failpoint.Inject("mockInspectionSchemaClusterConfigData", func(val failpoint.Value) {
crazycs520 marked this conversation as resolved.
Show resolved Hide resolved
s, ok := val.(string)
if ok {
configs := strings.Split(s, ";")
rows := make([][]types.Datum, 0, len(configs))
for _, config := range configs {
parts := strings.Split(config, ",")
row := make([]types.Datum, 0, len(parts))
for _, part := range parts {
row = append(row, types.NewDatum(part))
}
rows = append(rows, row)
}
sessionVars.InspectionTableCache["cluster_config"] = variable.TableSnapshot{Rows: rows}
}
})

// Obtain data from cache first.
cached, found := sessionVars.InspectionTableCache[it.meta.Name.L]
if !found {
Expand Down
9 changes: 6 additions & 3 deletions infoschema/metric_table_def.go
Original file line number Diff line number Diff line change
Expand Up @@ -354,17 +354,20 @@ var MetricTableMap = map[string]MetricTableDef{
Quantile: 0.999,
},
"pd_tso_wait_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"wait\"}[$RANGE_DURATION])) by (le))",
PromQL: "histogram_quantile($QUANTILE, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"wait\"}[$RANGE_DURATION])) by (le,instance))",
Labels: []string{"instance"},
Quantile: 0.999,
Comment: "The quantile duration of a client starting to wait for the TS until received the TS result.",
},
"pd_tso_rpc_duration": {
Comment: "The quantile duration of a client sending TSO request until received the response.",
PromQL: "histogram_quantile($QUANTILE, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type=\"tso\"}[$RANGE_DURATION])) by (le))",
PromQL: "histogram_quantile($QUANTILE, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type=\"tso\"}[$RANGE_DURATION])) by (le,instance))",
Labels: []string{"instance"},
Quantile: 0.999,
},
"pd_start_tso_wait_duration": {
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_pdclient_ts_future_wait_seconds_bucket[$RANGE_DURATION])) by (le))",
PromQL: "histogram_quantile($QUANTILE, sum(rate(tidb_pdclient_ts_future_wait_seconds_bucket{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (le,instance))",
Labels: []string{"instance"},
Quantile: 0.999,
Comment: "The quantile duration of the waiting time for getting the start timestamp oracle",
},
Expand Down