From cbac09dbf55fd3e87491ad535ccabe0b055a06ef Mon Sep 17 00:00:00 2001 From: Calvin Neo Date: Fri, 7 Jun 2024 16:53:57 +0800 Subject: [PATCH] KVStore: More metrics for Prehandling (#9131) close pingcap/tiflash#8081 Signed-off-by: CalvinNeo --- dbms/src/Common/TiFlashMetrics.h | 5 ++ .../KVStore/MultiRaft/PrehandleSnapshot.cpp | 6 ++ metrics/grafana/tiflash_summary.json | 74 ++++++++++++++++++- 3 files changed, 84 insertions(+), 1 deletion(-) diff --git a/dbms/src/Common/TiFlashMetrics.h b/dbms/src/Common/TiFlashMetrics.h index 996c1b0a785..a679ed86bdf 100644 --- a/dbms/src/Common/TiFlashMetrics.h +++ b/dbms/src/Common/TiFlashMetrics.h @@ -428,6 +428,10 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva F(type_total, {{"type", "total"}}, ExpBucketsWithRange{0.2, 4, 300}), \ F(type_queue_stage, {{"type", "queue_stage"}}, ExpBucketsWithRange{0.2, 4, 300}), \ F(type_phase1_total, {{"type", "phase1_total"}}, ExpBucketsWithRange{0.2, 4, 300})) \ + M(tiflash_raft_command_throughput, \ + "", \ + Histogram, \ + F(type_prehandle_snapshot, {{"type", "prehandle_snapshot"}}, ExpBuckets{128, 2, 11})) \ M(tiflash_raft_command_duration_seconds, \ "Bucketed histogram of some raft command: apply snapshot and ingest SST", \ Histogram, /* these command usually cost several seconds, increase the start bucket to 50ms */ \ @@ -496,6 +500,7 @@ static_assert(RAFT_REGION_BIG_WRITE_THRES * 4 < RAFT_REGION_BIG_WRITE_MAX, "Inva F(type_flush_log_gap, {{"type", "flush_log_gap"}}), \ F(type_flush_size, {{"type", "flush_size"}}), \ F(type_flush_rowcount, {{"type", "flush_rowcount"}}), \ + F(type_prehandle, {{"type", "prehandle"}}), \ F(type_flush_eager_gc, {{"type", "flush_eager_gc"}})) \ M(tiflash_raft_raft_frequent_events_count, \ "Raft frequent event counter", \ diff --git a/dbms/src/Storages/KVStore/MultiRaft/PrehandleSnapshot.cpp b/dbms/src/Storages/KVStore/MultiRaft/PrehandleSnapshot.cpp index ea3aa4fc10d..db41fb67407 100644 --- a/dbms/src/Storages/KVStore/MultiRaft/PrehandleSnapshot.cpp +++ b/dbms/src/Storages/KVStore/MultiRaft/PrehandleSnapshot.cpp @@ -139,6 +139,7 @@ static inline std::tuple executeTransform trace.releaseSubtaskResources(region_id, split_id); CurrentMetrics::sub(CurrentMetrics::RaftNumPrehandlingSubTasks); }); + Stopwatch sw; LOG_INFO( log, "Add prehandle task split_id={} limit={}", @@ -200,6 +201,10 @@ static inline std::tuple executeTransform stream->cancel(); res = ReadFromStreamResult{.error = abort_reason.value(), .extra_msg = "", .region = new_region}; } + auto keys_per_second = (sst_stream->getProcessKeys().write_cf + sst_stream->getProcessKeys().lock_cf + + sst_stream->getProcessKeys().write_cf) + * 1.0 / sw.elapsedSeconds(); + GET_METRIC(tiflash_raft_command_throughput, type_prehandle_snapshot).Observe(keys_per_second); return std::make_pair( std::move(res), PrehandleResult{ @@ -254,6 +259,7 @@ PrehandleResult KVStore::preHandleSnapshotToFiles( std::optional deadline_index, TMTContext & tmt) { + GET_METRIC(tiflash_raft_raft_events_count, type_prehandle).Increment(); new_region->beforePrehandleSnapshot(new_region->id(), deadline_index); ongoing_prehandle_task_count.fetch_add(1); diff --git a/metrics/grafana/tiflash_summary.json b/metrics/grafana/tiflash_summary.json index 44c664a11b0..919b29d292c 100644 --- a/metrics/grafana/tiflash_summary.json +++ b/metrics/grafana/tiflash_summary.json @@ -52,7 +52,7 @@ "gnetId": null, "graphTooltip": 1, "id": null, - "iteration": 1716350266980, + "iteration": 1717744619239, "links": [], "panels": [ { @@ -13721,6 +13721,78 @@ "alignLevel": null } }, + { + "cards": { + "cardPadding": null, + "cardRound": null + }, + "color": { + "cardColor": "#b4ff00", + "colorScale": "sqrt", + "colorScheme": "interpolateSpectral", + "exponent": 0.5, + "min": 0, + "mode": "spectrum" + }, + "dataFormat": "tsbuckets", + "datasource": "${DS_TEST-CLUSTER}", + "description": "", + "fieldConfig": { + "defaults": {}, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 121 + }, + "heatmap": {}, + "hideZeroBuckets": true, + "highlightCards": true, + "id": 290, + "legend": { + "show": true + }, + "links": [], + "reverseYBuckets": false, + "targets": [ + { + "exemplar": true, + "expr": "sum(delta(tiflash_raft_command_throughput_seconds_bucket{k8s_cluster=\"$k8s_cluster\", tidb_cluster=\"$tidb_cluster\", instance=~\"$instance\", type=\"prehandle_snapshot\"}[1m])) by (le)", + "format": "heatmap", + "interval": "", + "intervalFactor": 2, + "legendFormat": "{{le}}", + "refId": "B" + } + ], + "timeFrom": null, + "timeShift": null, + "title": "Snapshot Prehandle Throughput Heatmap", + "tooltip": { + "show": true, + "showHistogram": true + }, + "type": "heatmap", + "xAxis": { + "show": true + }, + "xBucketNumber": null, + "xBucketSize": null, + "yAxis": { + "decimals": 0, + "format": "bytes", + "logBase": 1, + "max": null, + "min": null, + "show": true, + "splitFactor": null + }, + "yBucketBound": "upper", + "yBucketNumber": null, + "yBucketSize": null + }, { "cards": { "cardPadding": null,