From 2f926df5017953863ef38ceae1865699e5d4630e Mon Sep 17 00:00:00 2001 From: Lonng Date: Thu, 13 Feb 2020 19:34:04 +0800 Subject: [PATCH] executor: add diagnosis rule to detect cluster critical errors (#14743) --- executor/cluster_reader_test.go | 60 ++++++--- executor/diagnostics.go | 139 +++++++++++++++----- executor/diagnostics_test.go | 162 ++++++++++++++++++++++++ executor/metric_reader.go | 12 +- infoschema/metric_table_def.go | 7 +- session/session.go | 5 +- util/sqlexec/restricted_sql_executor.go | 2 + 7 files changed, 331 insertions(+), 56 deletions(-) diff --git a/executor/cluster_reader_test.go b/executor/cluster_reader_test.go index 718579d4ca4be..5fa8bc72ef203 100644 --- a/executor/cluster_reader_test.go +++ b/executor/cluster_reader_test.go @@ -37,7 +37,6 @@ import ( "github.com/pingcap/tidb/session" "github.com/pingcap/tidb/util/pdapi" "github.com/pingcap/tidb/util/testkit" - "github.com/pingcap/tidb/util/testutil" pmodel "github.com/prometheus/common/model" "google.golang.org/grpc" ) @@ -60,11 +59,11 @@ func (s *testClusterReaderSuite) TearDownSuite(c *C) { } func (s *testClusterReaderSuite) TestMetricTableData(c *C) { - failPoint := "github.com/pingcap/tidb/executor/mockMetricRetrieverQueryPromQL" - c.Assert(failpoint.Enable(failPoint, "return"), IsNil) - defer func() { - c.Assert(failpoint.Disable(failPoint), IsNil) - }() + fpName := "github.com/pingcap/tidb/executor/mockMetricsPromData" + c.Assert(failpoint.Enable(fpName, "return"), IsNil) + defer func() { c.Assert(failpoint.Disable(fpName), IsNil) }() + + // mock prometheus data matrix := pmodel.Matrix{} metric := map[pmodel.LabelName]pmodel.LabelValue{ "instance": "127.0.0.1:10080", @@ -76,25 +75,46 @@ func (s *testClusterReaderSuite) TestMetricTableData(c *C) { Value: pmodel.SampleValue(0.1), } matrix = append(matrix, &pmodel.SampleStream{Metric: metric, Values: []pmodel.SamplePair{v1}}) - ctx := context.WithValue(context.Background(), "__mockMetricsData", matrix) + + ctx := 
context.WithValue(context.Background(), "__mockMetricsPromData", matrix) ctx = failpoint.WithHook(ctx, func(ctx context.Context, fpname string) bool { - return fpname == failPoint + return fpname == fpName }) tk := testkit.NewTestKit(c, s.store) tk.MustExec("use metric_schema") - rs, err := tk.Se.Execute(ctx, "select * from tidb_query_duration;") - c.Assert(err, IsNil) - result := tk.ResultSetToResultWithCtx(ctx, rs[0], Commentf("execute sql fail")) - result.Check(testutil.RowsWithSep("|", - "2019-12-23 20:11:35.000000|127.0.0.1:10080| 0.9|0.1|The quantile of TiDB query durations(second)")) - rs, err = tk.Se.Execute(ctx, "select time,instance,quantile,value from tidb_query_duration where quantile in (0.85, 0.95);") - c.Assert(err, IsNil) - result = tk.ResultSetToResultWithCtx(ctx, rs[0], Commentf("execute sql fail")) - result.Check(testkit.Rows( - "2019-12-23 20:11:35.000000 127.0.0.1:10080 0.85 0.1", - "2019-12-23 20:11:35.000000 127.0.0.1:10080 0.95 0.1")) + cases := []struct { + sql string + exp []string + }{ + { + sql: "select time,instance,quantile,value from tidb_query_duration;", + exp: []string{ + "2019-12-23 20:11:35.000000 127.0.0.1:10080 0.9 0.1", + }, + }, + { + sql: "select time,instance,quantile,value from tidb_query_duration where quantile in (0.85, 0.95);", + exp: []string{ + "2019-12-23 20:11:35.000000 127.0.0.1:10080 0.85 0.1", + "2019-12-23 20:11:35.000000 127.0.0.1:10080 0.95 0.1", + }, + }, + { + sql: "select time,instance,quantile,value from tidb_query_duration where quantile=0.5", + exp: []string{ + "2019-12-23 20:11:35.000000 127.0.0.1:10080 0.5 0.1", + }, + }, + } + + for _, cas := range cases { + rs, err := tk.Se.Execute(ctx, cas.sql) + c.Assert(err, IsNil) + result := tk.ResultSetToResultWithCtx(ctx, rs[0], Commentf("sql: %s", cas.sql)) + result.Check(testkit.Rows(cas.exp...)) + } } func (s *testClusterReaderSuite) TestTiDBClusterConfig(c *C) { @@ -406,7 +426,7 @@ func (s *testClusterReaderSuite) TestTiDBClusterLog(c *C) { tmpDir string 
logFile string } - // typ => testServer + // tp => testServer testServers := map[string]*testServer{} // create gRPC servers diff --git a/executor/diagnostics.go b/executor/diagnostics.go index f7b2cf6b29902..11adbeb60fb06 100644 --- a/executor/diagnostics.go +++ b/executor/diagnostics.go @@ -20,10 +20,12 @@ import ( "strings" "github.com/pingcap/failpoint" + "github.com/pingcap/tidb/infoschema" plannercore "github.com/pingcap/tidb/planner/core" "github.com/pingcap/tidb/sessionctx" "github.com/pingcap/tidb/sessionctx/variable" "github.com/pingcap/tidb/types" + "github.com/pingcap/tidb/util" "github.com/pingcap/tidb/util/chunk" "github.com/pingcap/tidb/util/set" "github.com/pingcap/tidb/util/sqlexec" @@ -32,7 +34,7 @@ import ( type ( // inspectionResult represents a abnormal diagnosis result inspectionResult struct { - typ string + tp string instance string // represents the diagnostics item, e.g: `ddl.lease` `raftstore.cpuusage` item string @@ -43,6 +45,8 @@ type ( detail string } + inspectionName string + inspectionFilter struct{ set.StringSet } inspectionRule interface { @@ -51,14 +55,37 @@ type ( } ) +func (n inspectionName) name() string { + return string(n) +} + func (f inspectionFilter) enable(name string) bool { return len(f.StringSet) == 0 || f.Exist(name) } +type ( + // configInspection is used to check whether a same configuration item has a + // different value between different instance in the cluster + configInspection struct{ inspectionName } + + // versionInspection is used to check whether the same component has different + // version in the cluster + versionInspection struct{ inspectionName } + + // currentLoadInspection is used to check the current load of memory/disk/cpu + // have reached a high-level threshold + currentLoadInspection struct{ inspectionName } + + // criticalErrorInspection is used to check are there some critical errors + // occurred in the past + criticalErrorInspection struct{ inspectionName } +) + var inspectionRules = 
[]inspectionRule{ - &configInspection{}, - &versionInspection{}, - &currentLoadInspection{}, + &configInspection{inspectionName: "config"}, + &versionInspection{inspectionName: "version"}, + &currentLoadInspection{inspectionName: "current-load"}, + &criticalErrorInspection{inspectionName: "critical-error"}, } type inspectionRetriever struct { @@ -118,7 +145,7 @@ func (e *inspectionRetriever) retrieve(ctx context.Context, sctx sessionctx.Cont finalRows = append(finalRows, types.MakeDatums( name, result.item, - result.typ, + result.tp, result.instance, result.actual, result.expected, @@ -130,12 +157,6 @@ func (e *inspectionRetriever) retrieve(ctx context.Context, sctx sessionctx.Cont return finalRows, nil } -type configInspection struct{} - -func (configInspection) name() string { - return "config" -} - func (configInspection) inspect(_ context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult { // check the configuration consistent sql := "select type, `key`, count(distinct value) as c from inspection_schema.cluster_config group by type, `key` having c > 1" @@ -148,7 +169,7 @@ func (configInspection) inspect(_ context.Context, sctx sessionctx.Context, filt for _, row := range rows { if filter.enable(row.GetString(1)) { results = append(results, inspectionResult{ - typ: row.GetString(0), + tp: row.GetString(0), instance: "", item: row.GetString(1), // key actual: "inconsistent", @@ -162,12 +183,6 @@ func (configInspection) inspect(_ context.Context, sctx sessionctx.Context, filt return results } -type versionInspection struct{} - -func (versionInspection) name() string { - return "version" -} - func (versionInspection) inspect(_ context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult { // check the configuration consistent sql := "select type, count(distinct git_hash) as c from inspection_schema.cluster_info group by type having c > 1;" @@ -181,7 +196,7 @@ func (versionInspection) inspect(_ context.Context, sctx 
sessionctx.Context, fil for _, row := range rows { if filter.enable(name) { results = append(results, inspectionResult{ - typ: row.GetString(0), + tp: row.GetString(0), instance: "", item: name, actual: "inconsistent", @@ -194,16 +209,10 @@ func (versionInspection) inspect(_ context.Context, sctx sessionctx.Context, fil return results } -type currentLoadInspection struct{} - -func (currentLoadInspection) name() string { - return "current-load" -} - func (currentLoadInspection) inspect(_ context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult { - var commonResult = func(item string, expected string, row chunk.Row) inspectionResult { + var commonResult = func(item, expected string, row chunk.Row) inspectionResult { return inspectionResult{ - typ: row.GetString(0), + tp: row.GetString(0), instance: row.GetString(1), item: item, actual: row.GetString(2), @@ -211,9 +220,9 @@ func (currentLoadInspection) inspect(_ context.Context, sctx sessionctx.Context, severity: "warning", } } - var diskResult = func(item string, expected string, row chunk.Row) inspectionResult { + var diskResult = func(item, expected string, row chunk.Row) inspectionResult { return inspectionResult{ - typ: row.GetString(0), + tp: row.GetString(0), instance: row.GetString(1), item: item, actual: row.GetString(3), @@ -282,3 +291,75 @@ func (currentLoadInspection) inspect(_ context.Context, sctx sessionctx.Context, } return results } + +func (criticalErrorInspection) inspect(ctx context.Context, sctx sessionctx.Context, filter inspectionFilter) []inspectionResult { + // TODO: specify the `begin` and `end` time of metric query + var rules = []struct { + tp string + item string + tbl string + }{ + {tp: "tidb", item: "failed-query-opm", tbl: "tidb_failed_query_opm"}, + {tp: "tikv", item: "critical-error", tbl: "tikv_critical_error"}, + {tp: "tidb", item: "panic-count", tbl: "tidb_panic_count"}, + {tp: "tidb", item: "binlog-error", tbl: "tidb_binlog_error_count"}, + {tp: 
"tidb", item: "pd-cmd-failed", tbl: "pd_cmd_fail_ops"}, + {tp: "tidb", item: "ticlient-region-error", tbl: "tidb_kv_region_error_ops"}, + {tp: "tidb", item: "lock-resolve", tbl: "tidb_lock_resolver_ops"}, + {tp: "tikv", item: "scheduler-is-busy", tbl: "tikv_scheduler_is_busy"}, + {tp: "tikv", item: "coprocessor-is-busy", tbl: "tikv_coprocessor_is_busy"}, + {tp: "tikv", item: "channel-is-full", tbl: "tikv_channel_full_total"}, + {tp: "tikv", item: "coprocessor-error", tbl: "tikv_coprocessor_request_error"}, + {tp: "tidb", item: "schema-lease-error", tbl: "tidb_schema_lease_error_opm"}, + {tp: "tidb", item: "txn-retry-error", tbl: "tidb_transaction_retry_error_ops"}, + {tp: "tikv", item: "grpc-errors", tbl: "tikv_grpc_errors"}, + } + + var results []inspectionResult + for _, rule := range rules { + if filter.enable(rule.item) { + def, ok := infoschema.MetricTableMap[rule.tbl] + if !ok { + sctx.GetSessionVars().StmtCtx.AppendWarning(fmt.Errorf("metrics table: %s not fouund", rule.tbl)) + continue + } + sql := fmt.Sprintf("select `%[1]s`, max(value) as max_value from `%[2]s`.`%[3]s` group by `%[1]s` having max_value > 0.0", + strings.Join(def.Labels, "`,`"), util.MetricSchemaName.L, rule.tbl) + rows, _, err := sctx.(sqlexec.RestrictedSQLExecutor).ExecRestrictedSQLWithContext(ctx, sql) + if err != nil { + sctx.GetSessionVars().StmtCtx.AppendWarning(fmt.Errorf("execute '%s' failed: %v", sql, err)) + continue + } + for _, row := range rows { + var actual, detail string + if rest := def.Labels[1:]; len(rest) > 0 { + pairs := make([]string, 0, len(rest)) + // `i+1` and `1+len(rest)` means skip the first field `instance` + for i, label := range rest { + pairs = append(pairs, fmt.Sprintf("`%s`='%s'", label, row.GetString(i+1))) + } + // TODO: find a better way to construct the `actual` field + actual = fmt.Sprintf("{%s}=%.2f", strings.Join(pairs, ","), row.GetFloat64(1+len(rest))) + detail = fmt.Sprintf("select * from `%s`.`%s` where `instance`='%s' and %s", + 
util.MetricSchemaName.L, rule.tbl, row.GetString(0), strings.Join(pairs, " and ")) + } else { + actual = fmt.Sprintf("%.2f", row.GetFloat64(1)) + detail = fmt.Sprintf("select * from `%s`.`%s` where `instance`='%s'", + util.MetricSchemaName.L, rule.tbl, row.GetString(0)) + } + result := inspectionResult{ + tp: rule.tp, + // NOTE: all tables which can be inspected here whose first label must be `instance` + instance: row.GetString(0), + item: rule.item, + actual: actual, + expected: "0", + severity: "warning", + detail: detail, + } + results = append(results, result) + } + } + } + return results +} diff --git a/executor/diagnostics_test.go b/executor/diagnostics_test.go index 268e55194afc5..e0e75808df7ef 100644 --- a/executor/diagnostics_test.go +++ b/executor/diagnostics_test.go @@ -18,6 +18,7 @@ import ( . "github.com/pingcap/check" "github.com/pingcap/failpoint" + "github.com/pingcap/parser/mysql" "github.com/pingcap/tidb/domain" "github.com/pingcap/tidb/infoschema" "github.com/pingcap/tidb/kv" @@ -164,3 +165,164 @@ func (s *diagnosticsSuite) TestInspectionResult(c *C) { result.Check(testkit.Rows(cs.rows...)) } } + +func (s *diagnosticsSuite) TestCriticalErrorInspection(c *C) { + tk := testkit.NewTestKitWithInit(c, s.store) + + fpName := "github.com/pingcap/tidb/executor/mockMetricsTableData" + c.Assert(failpoint.Enable(fpName, "return"), IsNil) + defer func() { c.Assert(failpoint.Disable(fpName), IsNil) }() + + datetime := func(s string) types.Time { + t, err := types.ParseTime(tk.Se.GetSessionVars().StmtCtx, s, mysql.TypeDatetime, types.MaxFsp) + c.Assert(err, IsNil) + return t + } + + // construct some mock data + mockData := map[string][][]types.Datum{ + // columns: time, instance, type, value + "tidb_failed_query_opm": { + types.MakeDatums(datetime("2020-02-12 10:35:00"), "tidb-0", "type1", 0.0), + types.MakeDatums(datetime("2020-02-12 10:36:00"), "tidb-0", "type2", 1.0), + types.MakeDatums(datetime("2020-02-12 10:37:00"), "tidb-1", "type3", 5.0), + }, + // 
columns: time, instance, type, value + "tikv_critical_error": { + types.MakeDatums(datetime("2020-02-12 10:35:00"), "tikv-0", "type1", 0.0), + types.MakeDatums(datetime("2020-02-12 10:36:00"), "tikv-1", "type1", 1.0), + types.MakeDatums(datetime("2020-02-12 10:37:00"), "tikv-2", "type2", 5.0), + }, + // columns: time, instance, value + "tidb_panic_count": { + types.MakeDatums(datetime("2020-02-12 10:35:00"), "tidb-0", 4.0), + types.MakeDatums(datetime("2020-02-12 10:36:00"), "tidb-0", 0.0), + types.MakeDatums(datetime("2020-02-12 10:37:00"), "tidb-1", 1.0), + }, + // columns: time, instance, value + "tidb_binlog_error_count": { + types.MakeDatums(datetime("2020-02-12 10:35:00"), "tidb-1", 4.0), + types.MakeDatums(datetime("2020-02-12 10:36:00"), "tidb-2", 0.0), + types.MakeDatums(datetime("2020-02-12 10:37:00"), "tidb-3", 1.0), + }, + // columns: time, instance, type, value + "pd_cmd_fail_ops": { + types.MakeDatums(datetime("2020-02-12 10:35:00"), "tidb-0", "type1", 0.0), + types.MakeDatums(datetime("2020-02-12 10:36:00"), "tidb-0", "type1", 1.0), + types.MakeDatums(datetime("2020-02-12 10:37:00"), "tidb-1", "type2", 5.0), + }, + // columns: time, instance, type, value + "tidb_lock_resolver_ops": { + types.MakeDatums(datetime("2020-02-12 10:35:00"), "tidb-0", "type1", 0.0), + types.MakeDatums(datetime("2020-02-12 10:36:00"), "tidb-0", "type1", 1.0), + types.MakeDatums(datetime("2020-02-12 10:37:00"), "tidb-1", "type2", 5.0), + }, + // columns: time, instance, db, type, stage, value + "tikv_scheduler_is_busy": { + types.MakeDatums(datetime("2020-02-12 10:35:00"), "tikv-0", "db1", "type1", "stage1", 1.0), + types.MakeDatums(datetime("2020-02-12 10:36:00"), "tikv-0", "db2", "type1", "stage2", 2.0), + types.MakeDatums(datetime("2020-02-12 10:37:00"), "tikv-1", "db1", "type2", "stage1", 3.0), + types.MakeDatums(datetime("2020-02-12 10:38:00"), "tikv-0", "db1", "type1", "stage2", 4.0), + types.MakeDatums(datetime("2020-02-12 10:39:00"), "tikv-0", "db2", "type1", 
"stage1", 5.0), + types.MakeDatums(datetime("2020-02-12 10:40:00"), "tikv-1", "db1", "type2", "stage2", 6.0), + }, + // columns: time, instance, db, value + "tikv_coprocessor_is_busy": { + types.MakeDatums(datetime("2020-02-12 10:35:00"), "tikv-0", "db1", 1.0), + types.MakeDatums(datetime("2020-02-12 10:36:00"), "tikv-0", "db2", 2.0), + types.MakeDatums(datetime("2020-02-12 10:37:00"), "tikv-1", "db1", 3.0), + types.MakeDatums(datetime("2020-02-12 10:38:00"), "tikv-0", "db1", 4.0), + types.MakeDatums(datetime("2020-02-12 10:39:00"), "tikv-0", "db2", 5.0), + types.MakeDatums(datetime("2020-02-12 10:40:00"), "tikv-1", "db1", 6.0), + }, + // columns: time, instance, db, type, value + "tikv_channel_full_total": { + types.MakeDatums(datetime("2020-02-12 10:35:00"), "tikv-0", "db1", "type1", 1.0), + types.MakeDatums(datetime("2020-02-12 10:36:00"), "tikv-0", "db2", "type1", 2.0), + types.MakeDatums(datetime("2020-02-12 10:37:00"), "tikv-1", "db1", "type2", 3.0), + types.MakeDatums(datetime("2020-02-12 10:38:00"), "tikv-0", "db1", "type1", 4.0), + types.MakeDatums(datetime("2020-02-12 10:39:00"), "tikv-0", "db2", "type1", 5.0), + types.MakeDatums(datetime("2020-02-12 10:40:00"), "tikv-1", "db1", "type2", 6.0), + }, + // columns: time, "instance", "reason", value + "tikv_coprocessor_request_error": { + types.MakeDatums(datetime("2020-02-12 10:35:00"), "tikv-0", "reason1", 1.0), + types.MakeDatums(datetime("2020-02-12 10:36:00"), "tikv-0", "reason2", 2.0), + types.MakeDatums(datetime("2020-02-12 10:37:00"), "tikv-1", "reason3", 3.0), + }, + // columns: time, instance, value + "tidb_schema_lease_error_opm": { + types.MakeDatums(datetime("2020-02-12 10:35:00"), "tidb-1", 4.0), + types.MakeDatums(datetime("2020-02-12 10:36:00"), "tidb-2", 0.0), + types.MakeDatums(datetime("2020-02-12 10:37:00"), "tidb-3", 1.0), + }, + // columns: time, instance, type, sql_type, value + "tidb_transaction_retry_error_ops": { + types.MakeDatums(datetime("2020-02-12 10:35:00"), "tidb-0", "db1", 
"sql_type1", 1.0), + types.MakeDatums(datetime("2020-02-12 10:36:00"), "tidb-0", "db2", "sql_type1", 2.0), + types.MakeDatums(datetime("2020-02-12 10:37:00"), "tidb-1", "db1", "sql_type2", 3.0), + }, + // columns: time, instance, type, value + "tikv_grpc_errors": { + types.MakeDatums(datetime("2020-02-12 10:35:00"), "tikv-0", "type1", 1.0), + types.MakeDatums(datetime("2020-02-12 10:36:00"), "tikv-0", "type2", 2.0), + types.MakeDatums(datetime("2020-02-12 10:37:00"), "tikv-1", "type3", 3.0), + }, + // columns: time, instance, type, value + "tidb_kv_region_error_ops": { + types.MakeDatums(datetime("2020-02-12 10:35:00"), "tikv-0", "type1", 1.0), + types.MakeDatums(datetime("2020-02-12 10:36:00"), "tikv-0", "type2", 2.0), + types.MakeDatums(datetime("2020-02-12 10:37:00"), "tikv-1", "type3", 3.0), + }, + } + + ctx := context.WithValue(context.Background(), "__mockMetricsTableData", mockData) + ctx = failpoint.WithHook(ctx, func(_ context.Context, fpname string) bool { + return fpName == fpname + }) + + rs, err := tk.Se.Execute(ctx, "select item, instance, value, details from information_schema.inspection_result where rule='critical-error'") + c.Assert(err, IsNil) + result := tk.ResultSetToResultWithCtx(ctx, rs[0], Commentf("execute inspect SQL failed")) + c.Assert(tk.Se.GetSessionVars().StmtCtx.WarningCount(), Equals, uint16(0), Commentf("unexpected warnings: %+v", tk.Se.GetSessionVars().StmtCtx.GetWarnings())) + result.Check(testkit.Rows( + "binlog-error tidb-3 1.00 select * from `metric_schema`.`tidb_binlog_error_count` where `instance`='tidb-3'", + "binlog-error tidb-1 4.00 select * from `metric_schema`.`tidb_binlog_error_count` where `instance`='tidb-1'", + "channel-is-full tikv-0 {`db`='db1',`type`='type1'}=4.00 select * from `metric_schema`.`tikv_channel_full_total` where `instance`='tikv-0' and `db`='db1' and `type`='type1'", + "channel-is-full tikv-1 {`db`='db1',`type`='type2'}=6.00 select * from `metric_schema`.`tikv_channel_full_total` where 
`instance`='tikv-1' and `db`='db1' and `type`='type2'", + "channel-is-full tikv-0 {`db`='db2',`type`='type1'}=5.00 select * from `metric_schema`.`tikv_channel_full_total` where `instance`='tikv-0' and `db`='db2' and `type`='type1'", + "coprocessor-error tikv-0 {`reason`='reason1'}=1.00 select * from `metric_schema`.`tikv_coprocessor_request_error` where `instance`='tikv-0' and `reason`='reason1'", + "coprocessor-error tikv-0 {`reason`='reason2'}=2.00 select * from `metric_schema`.`tikv_coprocessor_request_error` where `instance`='tikv-0' and `reason`='reason2'", + "coprocessor-error tikv-1 {`reason`='reason3'}=3.00 select * from `metric_schema`.`tikv_coprocessor_request_error` where `instance`='tikv-1' and `reason`='reason3'", + "coprocessor-is-busy tikv-0 {`db`='db1'}=4.00 select * from `metric_schema`.`tikv_coprocessor_is_busy` where `instance`='tikv-0' and `db`='db1'", + "coprocessor-is-busy tikv-1 {`db`='db1'}=6.00 select * from `metric_schema`.`tikv_coprocessor_is_busy` where `instance`='tikv-1' and `db`='db1'", + "coprocessor-is-busy tikv-0 {`db`='db2'}=5.00 select * from `metric_schema`.`tikv_coprocessor_is_busy` where `instance`='tikv-0' and `db`='db2'", + "critical-error tikv-1 {`type`='type1'}=1.00 select * from `metric_schema`.`tikv_critical_error` where `instance`='tikv-1' and `type`='type1'", + "critical-error tikv-2 {`type`='type2'}=5.00 select * from `metric_schema`.`tikv_critical_error` where `instance`='tikv-2' and `type`='type2'", + "failed-query-opm tidb-0 {`type`='type2'}=1.00 select * from `metric_schema`.`tidb_failed_query_opm` where `instance`='tidb-0' and `type`='type2'", + "failed-query-opm tidb-1 {`type`='type3'}=5.00 select * from `metric_schema`.`tidb_failed_query_opm` where `instance`='tidb-1' and `type`='type3'", + "grpc-errors tikv-0 {`type`='type1'}=1.00 select * from `metric_schema`.`tikv_grpc_errors` where `instance`='tikv-0' and `type`='type1'", + "grpc-errors tikv-0 {`type`='type2'}=2.00 select * from 
`metric_schema`.`tikv_grpc_errors` where `instance`='tikv-0' and `type`='type2'", + "grpc-errors tikv-1 {`type`='type3'}=3.00 select * from `metric_schema`.`tikv_grpc_errors` where `instance`='tikv-1' and `type`='type3'", + "lock-resolve tidb-0 {`type`='type1'}=1.00 select * from `metric_schema`.`tidb_lock_resolver_ops` where `instance`='tidb-0' and `type`='type1'", + "lock-resolve tidb-1 {`type`='type2'}=5.00 select * from `metric_schema`.`tidb_lock_resolver_ops` where `instance`='tidb-1' and `type`='type2'", + "panic-count tidb-1 1.00 select * from `metric_schema`.`tidb_panic_count` where `instance`='tidb-1'", + "panic-count tidb-0 4.00 select * from `metric_schema`.`tidb_panic_count` where `instance`='tidb-0'", + "pd-cmd-failed tidb-0 {`type`='type1'}=1.00 select * from `metric_schema`.`pd_cmd_fail_ops` where `instance`='tidb-0' and `type`='type1'", + "pd-cmd-failed tidb-1 {`type`='type2'}=5.00 select * from `metric_schema`.`pd_cmd_fail_ops` where `instance`='tidb-1' and `type`='type2'", + "scheduler-is-busy tikv-0 {`db`='db1',`type`='type1',`stage`='stage1'}=1.00 select * from `metric_schema`.`tikv_scheduler_is_busy` where `instance`='tikv-0' and `db`='db1' and `type`='type1' and `stage`='stage1'", + "scheduler-is-busy tikv-0 {`db`='db1',`type`='type1',`stage`='stage2'}=4.00 select * from `metric_schema`.`tikv_scheduler_is_busy` where `instance`='tikv-0' and `db`='db1' and `type`='type1' and `stage`='stage2'", + "scheduler-is-busy tikv-1 {`db`='db1',`type`='type2',`stage`='stage1'}=3.00 select * from `metric_schema`.`tikv_scheduler_is_busy` where `instance`='tikv-1' and `db`='db1' and `type`='type2' and `stage`='stage1'", + "scheduler-is-busy tikv-1 {`db`='db1',`type`='type2',`stage`='stage2'}=6.00 select * from `metric_schema`.`tikv_scheduler_is_busy` where `instance`='tikv-1' and `db`='db1' and `type`='type2' and `stage`='stage2'", + "scheduler-is-busy tikv-0 {`db`='db2',`type`='type1',`stage`='stage1'}=5.00 select * from 
`metric_schema`.`tikv_scheduler_is_busy` where `instance`='tikv-0' and `db`='db2' and `type`='type1' and `stage`='stage1'", + "scheduler-is-busy tikv-0 {`db`='db2',`type`='type1',`stage`='stage2'}=2.00 select * from `metric_schema`.`tikv_scheduler_is_busy` where `instance`='tikv-0' and `db`='db2' and `type`='type1' and `stage`='stage2'", + "schema-lease-error tidb-3 1.00 select * from `metric_schema`.`tidb_schema_lease_error_opm` where `instance`='tidb-3'", + "schema-lease-error tidb-1 4.00 select * from `metric_schema`.`tidb_schema_lease_error_opm` where `instance`='tidb-1'", + "ticlient-region-error tikv-0 {`type`='type1'}=1.00 select * from `metric_schema`.`tidb_kv_region_error_ops` where `instance`='tikv-0' and `type`='type1'", + "ticlient-region-error tikv-0 {`type`='type2'}=2.00 select * from `metric_schema`.`tidb_kv_region_error_ops` where `instance`='tikv-0' and `type`='type2'", + "ticlient-region-error tikv-1 {`type`='type3'}=3.00 select * from `metric_schema`.`tidb_kv_region_error_ops` where `instance`='tikv-1' and `type`='type3'", + "txn-retry-error tidb-0 {`type`='db1',`sql_type`='sql_type1'}=1.00 select * from `metric_schema`.`tidb_transaction_retry_error_ops` where `instance`='tidb-0' and `type`='db1' and `sql_type`='sql_type1'", + "txn-retry-error tidb-1 {`type`='db1',`sql_type`='sql_type2'}=3.00 select * from `metric_schema`.`tidb_transaction_retry_error_ops` where `instance`='tidb-1' and `type`='db1' and `sql_type`='sql_type2'", + "txn-retry-error tidb-0 {`type`='db2',`sql_type`='sql_type1'}=2.00 select * from `metric_schema`.`tidb_transaction_retry_error_ops` where `instance`='tidb-0' and `type`='db2' and `sql_type`='sql_type1'", + )) +} diff --git a/executor/metric_reader.go b/executor/metric_reader.go index 000e1a2c2da20..ba54b3233891b 100644 --- a/executor/metric_reader.go +++ b/executor/metric_reader.go @@ -53,6 +53,14 @@ func (e *MetricRetriever) retrieve(ctx context.Context, sctx sessionctx.Context) return nil, nil } e.retrieved = true + + 
failpoint.InjectContext(ctx, "mockMetricsTableData", func() { + m, ok := ctx.Value("__mockMetricsTableData").(map[string][][]types.Datum) + if ok && m[e.table.Name.L] != nil { + failpoint.Return(m[e.table.Name.L], nil) + } + }) + tblDef, err := infoschema.GetMetricTableDef(e.table.Name.L) if err != nil { return nil, err @@ -84,8 +92,8 @@ func (e *MetricRetriever) retrieve(ctx context.Context, sctx sessionctx.Context) } func (e *MetricRetriever) queryMetric(ctx context.Context, sctx sessionctx.Context, queryRange promv1.Range, quantile float64) (pmodel.Value, error) { - failpoint.InjectContext(ctx, "mockMetricRetrieverQueryPromQL", func() { - failpoint.Return(ctx.Value("__mockMetricsData").(pmodel.Matrix), nil) + failpoint.InjectContext(ctx, "mockMetricsPromData", func() { + failpoint.Return(ctx.Value("__mockMetricsPromData").(pmodel.Matrix), nil) }) addr, err := e.getMetricAddr(sctx) diff --git a/infoschema/metric_table_def.go b/infoschema/metric_table_def.go index cf1aa7bd97628..0cc7786de048a 100644 --- a/infoschema/metric_table_def.go +++ b/infoschema/metric_table_def.go @@ -841,17 +841,17 @@ var MetricTableMap = map[string]MetricTableDef{ }, "tikv_scheduler_is_busy": { PromQL: `sum(rate(tikv_scheduler_too_busy_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,db,type,stage)`, - Labels: []string{"db", "instance", "type", "stage"}, + Labels: []string{"instance", "db", "type", "stage"}, Comment: "Indicates occurrences of Scheduler Busy events that make the TiKV instance unavailable temporarily", }, "tikv_channel_full_total": { PromQL: `sum(rate(tikv_channel_full_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,type,db)`, - Labels: []string{"db", "instance", "type"}, + Labels: []string{"instance", "db", "type"}, Comment: "The total number of channel full errors on each TiKV instance, it will make the TiKV instance unavailable temporarily", }, "tikv_coprocessor_is_busy": { PromQL: `sum(rate(tikv_coprocessor_request_error{type='full'}[$RANGE_DURATION])) 
by (instance,db,type)`, - Labels: []string{"db", "instance"}, + Labels: []string{"instance", "db"}, Comment: "Indicates occurrences of Coprocessor Full events that make the TiKV instance unavailable temporarily", }, "tikv_engine_write_stall": { @@ -1384,7 +1384,6 @@ var MetricTableMap = map[string]MetricTableDef{ PromQL: `sum(rate(tikv_coprocessor_request_duration_seconds_count{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (req,instance)`, Labels: []string{"instance", "req"}, }, - "tikv_cop_total_request_errors": { PromQL: `sum(rate(tikv_coprocessor_request_error{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (reason,instance)`, Labels: []string{"instance", "reason"}, diff --git a/session/session.go b/session/session.go index b5b0da8ea71c6..f8262de8deae1 100644 --- a/session/session.go +++ b/session/session.go @@ -738,8 +738,11 @@ func (s *session) sysSessionPool() sessionPool { // Unlike normal Exec, it doesn't reset statement status, doesn't commit or rollback the current transaction // and doesn't write binlog. func (s *session) ExecRestrictedSQL(sql string) ([]chunk.Row, []*ast.ResultField, error) { - ctx := context.TODO() + return s.ExecRestrictedSQLWithContext(context.TODO(), sql) +} +// ExecRestrictedSQLWithContext implements RestrictedSQLExecutor interface. +func (s *session) ExecRestrictedSQLWithContext(ctx context.Context, sql string) ([]chunk.Row, []*ast.ResultField, error) { // Use special session to execute the sql. tmp, err := s.sysSessionPool().Get() if err != nil { diff --git a/util/sqlexec/restricted_sql_executor.go b/util/sqlexec/restricted_sql_executor.go index 6a9c55c70cb57..7416e411867d1 100644 --- a/util/sqlexec/restricted_sql_executor.go +++ b/util/sqlexec/restricted_sql_executor.go @@ -36,6 +36,8 @@ import ( type RestrictedSQLExecutor interface { // ExecRestrictedSQL run sql statement in ctx with some restriction. 
ExecRestrictedSQL(sql string) ([]chunk.Row, []*ast.ResultField, error) + // ExecRestrictedSQLWithContext run sql statement in ctx with some restriction. + ExecRestrictedSQLWithContext(ctx context.Context, sql string) ([]chunk.Row, []*ast.ResultField, error) // ExecRestrictedSQLWithSnapshot run sql statement in ctx with some restriction and with snapshot. // If current session sets the snapshot timestamp, then execute with this snapshot timestamp. // Otherwise, execute with the current transaction start timestamp if the transaction is valid.