Skip to content

Commit

Permalink
executor: add more diagnosis rule to check some metrics exceed thresh…
Browse files Browse the repository at this point in the history
…olds (#14843)
  • Loading branch information
crazycs520 authored Feb 20, 2020
1 parent 5a00c80 commit d0ad254
Show file tree
Hide file tree
Showing 3 changed files with 311 additions and 43 deletions.
167 changes: 166 additions & 1 deletion executor/inspection_result.go
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,20 @@ func (criticalErrorInspection) inspect(ctx context.Context, sctx sessionctx.Cont
return results
}

func (thresholdCheckInspection) inspect(ctx context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
func (c thresholdCheckInspection) inspect(ctx context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
inspects := []func(context.Context, sessionctx.Context, inspectionFilter) []inspectionResult{
c.inspectThreshold1,
c.inspectThreshold2,
}
var results []inspectionResult
for _, inspect := range inspects {
re := inspect(ctx, sctx, filter)
results = append(results, re...)
}
return results
}

func (thresholdCheckInspection) inspectThreshold1(ctx context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
var rules = []struct {
item string
component string
Expand Down Expand Up @@ -481,3 +494,155 @@ func (thresholdCheckInspection) inspect(ctx context.Context, sctx sessionctx.Con
}
return results
}

func (thresholdCheckInspection) inspectThreshold2(ctx context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult {
var rules = []struct {
tp string
item string
tbl string
condition string
threshold float64
isMin bool
}{
{
tp: "tidb",
item: "tso-duration",
tbl: "pd_tso_wait_duration",
condition: "quantile=0.999",
threshold: 0.05,
},
{
tp: "tidb",
item: "get-token-duration",
tbl: "tidb_get_token_duration",
condition: "quantile=0.999",
threshold: 0.001 * 10e5, // the unit is microsecond
},
{
tp: "tidb",
item: "load-schema-duration",
tbl: "tidb_load_schema_duration",
condition: "quantile=0.99",
threshold: 1,
},
{
tp: "tikv",
item: "scheduler-cmd-duration",
tbl: "tikv_scheduler_command_duration",
condition: "quantile=0.99",
threshold: 0.1,
},
{
tp: "tikv",
item: "handle-snapshot-duration",
tbl: "tikv_handle_snapshot_duration",
threshold: 30,
},
{
tp: "tikv",
item: "storage-write-duration",
tbl: "tikv_storage_async_request_duration",
condition: "type='write'",
threshold: 0.1,
},
{
tp: "tikv",
item: "storage-snapshot-duration",
tbl: "tikv_storage_async_request_duration",
condition: "type='snapshot'",
threshold: 0.05,
},
{
tp: "tikv",
item: "rocksdb-write-duration",
tbl: "tikv_engine_write_duration",
condition: "type='write_max'",
threshold: 0.1 * 10e5, // the unit is microsecond
},
{
tp: "tikv",
item: "rocksdb-get-duration",
tbl: "tikv_engine_max_get_duration",
condition: "type='get_max'",
threshold: 0.05 * 10e5, // the unit is microsecond
},
{
tp: "tikv",
item: "rocksdb-seek-duration",
tbl: "tikv_engine_max_seek_duration",
condition: "type='seek_max'",
threshold: 0.05 * 10e5, // the unit is microsecond
},
{
tp: "tikv",
item: "scheduler-pending-cmd-count",
tbl: "tikv_scheduler_pending_commands",
threshold: 1000,
},
{
tp: "tikv",
item: "index-block-cache-hit",
tbl: "tikv_block_index_cache_hit",
condition: "value > 0",
threshold: 0.95,
isMin: true,
},
{
tp: "tikv",
item: "filter-block-cache-hit",
tbl: "tikv_block_filter_cache_hit",
condition: "value > 0",
threshold: 0.95,
isMin: true,
},
{
tp: "tikv",
item: "data-block-cache-hit",
tbl: "tikv_block_data_cache_hit",
condition: "value > 0",
threshold: 0.80,
isMin: true,
},
}
var results []inspectionResult
for _, rule := range rules {
if !filter.enable(rule.item) {
continue
}
var sql string
condition := rule.condition
if len(condition) > 0 {
condition = "where " + condition
}
if rule.isMin {
sql = fmt.Sprintf("select instance, min(value) as min_value from metric_schema.%s %s group by instance having min_value < %f;", rule.tbl, condition, rule.threshold)
} else {
sql = fmt.Sprintf("select instance, max(value) as max_value from metric_schema.%s %s group by instance having max_value > %f;", rule.tbl, condition, rule.threshold)
}
rows, _, err := sctx.(sqlexec.RestrictedSQLExecutor).ExecRestrictedSQLWithContext(ctx, sql)
if err != nil {
sctx.GetSessionVars().StmtCtx.AppendWarning(fmt.Errorf("execute '%s' failed: %v", sql, err))
continue
}
for _, row := range rows {
actual := fmt.Sprintf("%.2f", row.GetFloat64(1))
expected := ""
if rule.isMin {
expected = fmt.Sprintf("> %.2f", rule.threshold)
} else {
expected = fmt.Sprintf("< %.2f", rule.threshold)
}
result := inspectionResult{
tp: rule.tp,
instance: row.GetString(0),
item: rule.item,
actual: actual,
expected: expected,
severity: "warning",
detail: sql,
}
results = append(results, result)
}
}
return results
}
128 changes: 113 additions & 15 deletions executor/inspection_result_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,19 @@ func (s *inspectionResultSuite) TestThresholdCheckInspection(c *C) {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "sched_2", 10.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "split_check", 10.0),
},
"pd_tso_wait_duration": {},
"tidb_get_token_duration": {},
"tidb_load_schema_duration": {},
"tikv_scheduler_command_duration": {},
"tikv_handle_snapshot_duration": {},
"tikv_storage_async_request_duration": {},
"tikv_engine_write_duration": {},
"tikv_engine_max_get_duration": {},
"tikv_engine_max_seek_duration": {},
"tikv_scheduler_pending_commands": {},
"tikv_block_index_cache_hit": {},
"tikv_block_data_cache_hit": {},
"tikv_block_filter_cache_hit": {},
}

fpName := "github.com/pingcap/tidb/executor/mockMergeMockInspectionTables"
Expand Down Expand Up @@ -246,22 +259,20 @@ func (s *inspectionResultSuite) TestThresholdCheckInspection(c *C) {
))

// construct some mock normal data
mockData = map[string][][]types.Datum{
mockData["tikv_thread_cpu"] = [][]types.Datum{
// columns: time, instance, name, value
"tikv_thread_cpu": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_normal0", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_high1", 0.1),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_low1", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "grpc_1", 7.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "grpc_2", 0.21),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "raftstore_1", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "apply_0", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_norm1", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_high2", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_low0", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "sched_2", 0.3),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "split_check", 0.5),
},
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_normal0", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_high1", 0.1),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "cop_low1", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "grpc_1", 7.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "grpc_2", 0.21),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "raftstore_1", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "apply_0", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_norm1", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_high2", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "store_read_low0", 1.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "sched_2", 0.3),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "split_check", 0.5),
}

ctx = context.WithValue(context.Background(), "__mockInspectionTables", configurations)
Expand All @@ -277,6 +288,93 @@ func (s *inspectionResultSuite) TestThresholdCheckInspection(c *C) {
"grpc_cpu tikv tikv-0 7.21 < 7.20, config: server.grpc-concurrency=8 select instance, sum(value) as cpu from metric_schema.tikv_thread_cpu where name like 'grpc%' and time=now() group by instance"))
}

func (s *inspectionResultSuite) TestThresholdCheckInspection2(c *C) {
tk := testkit.NewTestKitWithInit(c, s.store)
// Mock for metric table data.
fpName := "github.com/pingcap/tidb/executor/mockMetricsTableData"
c.Assert(failpoint.Enable(fpName, "return"), IsNil)
defer func() { c.Assert(failpoint.Disable(fpName), IsNil) }()

datetime := func(s string) types.Time {
t, err := types.ParseTime(tk.Se.GetSessionVars().StmtCtx, s, mysql.TypeDatetime, types.MaxFsp)
c.Assert(err, IsNil)
return t
}

// construct some mock abnormal data
mockData := map[string][][]types.Datum{
"pd_tso_wait_duration": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "pd-0", 0.999, 0.06),
},
"tidb_get_token_duration": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tidb-0", 0.999, 0.02*10e5),
},
"tidb_load_schema_duration": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tidb-0", 0.99, 2.0),
},
"tikv_scheduler_command_duration": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "get", 0.99, 2.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "write", 0.99, 5.0),
},
"tikv_handle_snapshot_duration": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "gen", 0.999, 40.0),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "read", 0.999, 10.0),
},
"tikv_storage_async_request_duration": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "write", 0.999, 0.2),
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "snapshot", 0.999, 0.06),
},
"tikv_engine_write_duration": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "write_max", "kv", 0.2*10e5),
},
"tikv_engine_max_get_duration": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "get_max", "kv", 0.06*10e5),
},
"tikv_engine_max_seek_duration": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "seek_max", "raft", 0.06*10e5),
},
"tikv_scheduler_pending_commands": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", 1001.0),
},
"tikv_block_index_cache_hit": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "kv", 0.94),
},
"tikv_block_data_cache_hit": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "kv", 0.79),
},
"tikv_block_filter_cache_hit": {
types.MakeDatums(datetime("2020-02-14 05:20:00"), "tikv-0", "kv", 0.93),
},
"tikv_thread_cpu": {},
}

ctx := context.WithValue(context.Background(), "__mockMetricsTableData", mockData)
ctx = failpoint.WithHook(ctx, func(_ context.Context, fpname string) bool {
return fpname == fpName
})

rs, err := tk.Se.Execute(ctx, "select item, type, instance, value, reference, details from information_schema.inspection_result where rule='threshold-check' order by item")
c.Assert(err, IsNil)
result := tk.ResultSetToResultWithCtx(ctx, rs[0], Commentf("execute inspect SQL failed"))
c.Assert(tk.Se.GetSessionVars().StmtCtx.WarningCount(), Equals, uint16(0), Commentf("unexpected warnings: %+v", tk.Se.GetSessionVars().StmtCtx.GetWarnings()))
result.Check(testkit.Rows(
"data-block-cache-hit tikv tikv-0 0.79 > 0.80 select instance, min(value) as min_value from metric_schema.tikv_block_data_cache_hit where value > 0 group by instance having min_value < 0.800000;",
"filter-block-cache-hit tikv tikv-0 0.93 > 0.95 select instance, min(value) as min_value from metric_schema.tikv_block_filter_cache_hit where value > 0 group by instance having min_value < 0.950000;",
"get-token-duration tidb tidb-0 20000.00 < 1000.00 select instance, max(value) as max_value from metric_schema.tidb_get_token_duration where quantile=0.999 group by instance having max_value > 1000.000000;",
"handle-snapshot-duration tikv tikv-0 40.00 < 30.00 select instance, max(value) as max_value from metric_schema.tikv_handle_snapshot_duration group by instance having max_value > 30.000000;",
"index-block-cache-hit tikv tikv-0 0.94 > 0.95 select instance, min(value) as min_value from metric_schema.tikv_block_index_cache_hit where value > 0 group by instance having min_value < 0.950000;",
"load-schema-duration tidb tidb-0 2.00 < 1.00 select instance, max(value) as max_value from metric_schema.tidb_load_schema_duration where quantile=0.99 group by instance having max_value > 1.000000;",
"rocksdb-get-duration tikv tikv-0 60000.00 < 50000.00 select instance, max(value) as max_value from metric_schema.tikv_engine_max_get_duration where type='get_max' group by instance having max_value > 50000.000000;",
"rocksdb-seek-duration tikv tikv-0 60000.00 < 50000.00 select instance, max(value) as max_value from metric_schema.tikv_engine_max_seek_duration where type='seek_max' group by instance having max_value > 50000.000000;",
"rocksdb-write-duration tikv tikv-0 200000.00 < 100000.00 select instance, max(value) as max_value from metric_schema.tikv_engine_write_duration where type='write_max' group by instance having max_value > 100000.000000;",
"scheduler-cmd-duration tikv tikv-0 5.00 < 0.10 select instance, max(value) as max_value from metric_schema.tikv_scheduler_command_duration where quantile=0.99 group by instance having max_value > 0.100000;",
"scheduler-pending-cmd-count tikv tikv-0 1001.00 < 1000.00 select instance, max(value) as max_value from metric_schema.tikv_scheduler_pending_commands group by instance having max_value > 1000.000000;",
"storage-snapshot-duration tikv tikv-0 0.20 < 0.05 select instance, max(value) as max_value from metric_schema.tikv_storage_async_request_duration where type='snapshot' group by instance having max_value > 0.050000;",
"storage-write-duration tikv tikv-0 0.20 < 0.10 select instance, max(value) as max_value from metric_schema.tikv_storage_async_request_duration where type='write' group by instance having max_value > 0.100000;",
"tso-duration tidb pd-0 0.06 < 0.05 select instance, max(value) as max_value from metric_schema.pd_tso_wait_duration where quantile=0.999 group by instance having max_value > 0.050000;",
))
}

func (s *inspectionResultSuite) TestCriticalErrorInspection(c *C) {
tk := testkit.NewTestKitWithInit(c, s.store)

Expand Down
Loading

0 comments on commit d0ad254

Please sign in to comment.