From bc5cb068ad304c90797554dc56c32754f4929a4a Mon Sep 17 00:00:00 2001 From: crazycs Date: Fri, 7 Feb 2020 15:01:50 +0800 Subject: [PATCH 1/7] infoschema/metric_schema: add node cpu/mem/net load metric tables --- infoschema/metric_schema_test.go | 3 + infoschema/metric_table_def.go | 211 ++++++++++++++++++++++++++++++- 2 files changed, 210 insertions(+), 4 deletions(-) diff --git a/infoschema/metric_schema_test.go b/infoschema/metric_schema_test.go index 8a78b98a9c5d5..931e12adb2735 100644 --- a/infoschema/metric_schema_test.go +++ b/infoschema/metric_schema_test.go @@ -53,5 +53,8 @@ func (s *inspectionSuite) TestMetricSchemaDef(c *C) { c.Assert(strings.Contains(def.PromQL, label), IsTrue, Commentf("metric table %v has labels, but doesn't contain label %v in promQL", name, label)) } } + if name != strings.ToLower(name) { + c.Assert(name, Equals, strings.ToLower(name), Commentf("metric table name %v should be lower case", name)) + } } } diff --git a/infoschema/metric_table_def.go b/infoschema/metric_table_def.go index 5d5b15fb3cd00..e96195dbd1673 100644 --- a/infoschema/metric_table_def.go +++ b/infoschema/metric_table_def.go @@ -64,7 +64,7 @@ var MetricTableMap = map[string]MetricTableDef{ Labels: []string{"instance", "job"}, Comment: "process rss memory usage", }, - "heap_mem_usage": { + "go_heap_mem_usage": { PromQL: "go_memstats_heap_alloc_bytes{$LABEL_CONDITIONS}", Labels: []string{"instance", "job"}, Comment: "TiDB heap memory size in use", @@ -73,12 +73,12 @@ var MetricTableMap = map[string]MetricTableDef{ PromQL: "rate(process_cpu_seconds_total{$LABEL_CONDITIONS}[$RANGE_DURATION])", Labels: []string{"instance", "job"}, }, - "connection_count": { + "tidb_connection_count": { PromQL: "tidb_server_connections{$LABEL_CONDITIONS}", Labels: []string{"instance"}, Comment: "TiDB current connection counts", }, - "process_open_fd_count": { + "node_process_open_fd_count": { PromQL: "process_open_fds{$LABEL_CONDITIONS}", Labels: []string{"instance", "job"}, Comment:
"Process opened file descriptors count", @@ -632,7 +632,7 @@ var MetricTableMap = map[string]MetricTableDef{ PromQL: `pd_cluster_metadata{$LABEL_CONDITIONS}`, Labels: []string{"instance", "type"}, }, - "region_health": { + "pd_region_health": { PromQL: `sum(pd_regions_status{$LABEL_CONDITIONS}) by (instance, type)`, Labels: []string{"instance", "type"}, Comment: "It records the unusual Regions' count which may include pending peers, down peers, extra peers, offline peers, missing peers or learner peers", @@ -1835,4 +1835,207 @@ var MetricTableMap = map[string]MetricTableDef{ PromQL: `rate(tikv_backup_error_counter{$LABEL_CONDITIONS}[$RANGE_DURATION])`, Labels: []string{"error", "instance"}, }, + "node_virtual_cpus": { + PromQL: `count(node_cpu_seconds_total{mode="user"}) by (instance)`, + Labels: []string{"instance"}, + Comment: "node virtual cpu count", + }, + "node_total_memory": { + PromQL: `node_memory_MemTotal_bytes{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + Comment: "node total mem", + }, + "node_memory_available": { + PromQL: `node_memory_MemAvailable_bytes{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + }, + "node_total_memory_swap": { + PromQL: `node_memory_SwapTotal_bytes{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + Comment: "node total memory swap", + }, + "node_uptime": { + PromQL: `node_time_seconds{$LABEL_CONDITIONS} - node_boot_time_seconds{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + Comment: "node uptime", + }, + "node_load1": { + PromQL: `node_load1{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + Comment: "node cpu load1", + }, + "node_load5": { + PromQL: `node_load5{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + Comment: "node cpu load5", + }, + "node_load15": { + PromQL: `node_load15{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + Comment: "node cpu load15", + }, + "node_kernel_interrupts": { + PromQL: `rate(node_intr_total{$LABEL_CONDITIONS}[$RANGE_DURATION]) or 
irate(node_intr_total{$LABEL_CONDITIONS}[$RANGE_DURATION])`, + Labels: []string{"instance"}, + }, + "node_kernel_forks": { + PromQL: `rate(node_forks_total{$LABEL_CONDITIONS}[$RANGE_DURATION]) or irate(node_forks_total{$LABEL_CONDITIONS}[$RANGE_DURATION])`, + Labels: []string{"instance"}, + }, + "node_kernel_context_switches": { + PromQL: `rate(node_context_switches_total{$LABEL_CONDITIONS}[$RANGE_DURATION]) or irate(node_context_switches_total{$LABEL_CONDITIONS}[$RANGE_DURATION])`, + Labels: []string{"instance"}, + }, + "node_cpu_usage": { + PromQL: `sum(rate(node_cpu_seconds_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (mode,instance) * 100 / count(node_cpu_seconds_total{$LABEL_CONDITIONS}) by (mode,instance) or sum(irate(node_cpu_seconds_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (mode,instance) * 100 / count(node_cpu_seconds_total{$LABEL_CONDITIONS}) by (mode,instance)`, + Labels: []string{"instance", "mode"}, + }, + "node_memory_free": { + PromQL: `node_memory_MemFree_bytes{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + }, + "node_memory_buffers": { + PromQL: `node_memory_Buffers_bytes{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + }, + "node_memory_cached": { + PromQL: `node_memory_Cached_bytes{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + }, + "node_memory_active": { + PromQL: `node_memory_Active_bytes{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + }, + "node_memory_inactive": { + PromQL: `node_memory_Inactive_bytes{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + }, + "node_memory_writeback": { + PromQL: `node_memory_Writeback_bytes{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + }, + "node_memory_writeback_tmp": { + PromQL: `node_memory_WritebackTmp_bytes{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + }, + "node_memory_dirty": { + PromQL: `node_memory_Dirty_bytes{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + }, + "node_memory_shared": { + PromQL: 
`node_memory_Shmem_bytes{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + }, + "node_memory_mapped": { + PromQL: `node_memory_Mapped_bytes{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + }, + "node_disk_size": { + PromQL: `node_filesystem_size_bytes{$LABEL_CONDITIONS}`, + Labels: []string{"instance", "device", "fstype", "mountpoint"}, + }, + "node_disk_available_size": { + PromQL: `node_filesystem_avail_bytes{$LABEL_CONDITIONS}`, + Labels: []string{"instance", "device", "fstype", "mountpoint"}, + }, + "node_disk_state": { + PromQL: `node_filesystem_readonly{$LABEL_CONDITIONS}`, + Labels: []string{"instance", "device", "fstype", "mountpoint"}, + }, + "node_disk_io_util": { + PromQL: `rate(node_disk_io_time_seconds_total{$LABEL_CONDITIONS}[$RANGE_DURATION]) or irate(node_disk_io_time_seconds_total{$LABEL_CONDITIONS}[$RANGE_DURATION])`, + Labels: []string{"device", "instance"}, + }, + "node_disk_iops": { + PromQL: `sum(rate(node_disk_reads_completed_total{$LABEL_CONDITIONS}[$RANGE_DURATION]) + rate(node_disk_writes_completed_total{$LABEL_CONDITIONS}[$RANGE_DURATION])) by (instance,device)`, + Labels: []string{"device", "instance"}, + }, + "node_disk_write_latency": { + PromQL: `(rate(node_disk_write_time_seconds_total{$LABEL_CONDITIONS}[$RANGE_DURATION])/ rate(node_disk_writes_completed_total{$LABEL_CONDITIONS}[$RANGE_DURATION]))`, + Labels: []string{"device", "instance"}, + Comment: "node disk write latency(ms)", + }, + "node_disk_read_latency": { + PromQL: `(rate(node_disk_read_time_seconds_total{$LABEL_CONDITIONS}[$RANGE_DURATION])/ rate(node_disk_reads_completed_total{$LABEL_CONDITIONS}[$RANGE_DURATION]))`, + Labels: []string{"device", "instance"}, + Comment: "node disk read latency(ms)", + }, + "node_disk_throughput": { + PromQL: `irate(node_disk_read_bytes_total{$LABEL_CONDITIONS}[$RANGE_DURATION]) + irate(node_disk_written_bytes_total{$LABEL_CONDITIONS}[$RANGE_DURATION])`, + Labels: []string{"device", "instance"}, + }, + 
"node_filesystem_space_used": { + PromQL: `((node_filesystem_size_bytes{$LABEL_CONDITIONS} - node_filesystem_avail_bytes{$LABEL_CONDITIONS}) / node_filesystem_size_bytes{$LABEL_CONDITIONS}) * 100`, + Labels: []string{"device", "instance"}, + Comment: "Filesystem used space. If is > 80% then is Critical.", + }, + "node_file_descriptor_allocated": { + PromQL: `node_filefd_allocated{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + }, + "node_network_in_drops": { + PromQL: `rate(node_network_receive_drop_total{$LABEL_CONDITIONS}[$RANGE_DURATION]) `, + Labels: []string{"device", "instance"}, + }, + "node_network_out_drops": { + PromQL: `rate(node_network_transmit_drop_total{$LABEL_CONDITIONS}[$RANGE_DURATION])`, + Labels: []string{"device", "instance"}, + }, + "node_network_in_errors": { + PromQL: `rate(node_network_receive_errs_total{$LABEL_CONDITIONS}[$RANGE_DURATION])`, + Labels: []string{"device", "instance"}, + }, + "node_network_out_errors": { + PromQL: `rate(node_network_transmit_errs_total{$LABEL_CONDITIONS}[$RANGE_DURATION])`, + Labels: []string{"device", "instance"}, + }, + "node_network_in_traffic": { + PromQL: `rate(node_network_receive_bytes_total{$LABEL_CONDITIONS}[$RANGE_DURATION]) or irate(node_network_receive_bytes_total{$LABEL_CONDITIONS}[$RANGE_DURATION])`, + Labels: []string{"device", "instance"}, + }, + "node_network_out_traffic": { + PromQL: `rate(node_network_transmit_bytes_total{$LABEL_CONDITIONS}[$RANGE_DURATION]) or irate(node_network_transmit_bytes_total{$LABEL_CONDITIONS}[$RANGE_DURATION])`, + Labels: []string{"device", "instance"}, + }, + "node_network_in_packets": { + PromQL: `rate(node_network_receive_packets_total{$LABEL_CONDITIONS}[$RANGE_DURATION]) or irate(node_network_receive_packets_total{$LABEL_CONDITIONS}[$RANGE_DURATION])`, + Labels: []string{"device", "instance"}, + }, + "node_network_out_packets": { + PromQL: `rate(node_network_transmit_packets_total{$LABEL_CONDITIONS}[$RANGE_DURATION]) or 
irate(node_network_transmit_packets_total{$LABEL_CONDITIONS}[$RANGE_DURATION])`, + Labels: []string{"device", "instance"}, + }, + "node_network_interface_speed": { + PromQL: `node_network_transmit_queue_length{$LABEL_CONDITIONS}`, + Labels: []string{"device", "instance"}, + Comment: "node_network_transmit_queue_length = transmit_queue_length value of /sys/class/net/.", + }, + "node_network_utilization_in_hourly": { + PromQL: `sum(increase(node_network_receive_bytes_total{$LABEL_CONDITIONS}[1h]))`, + Labels: []string{"device", "instance"}, + }, + "node_network_utilization_out_hourly": { + PromQL: `sum(increase(node_network_transmit_bytes_total{$LABEL_CONDITIONS}[1h]))`, + Labels: []string{"device", "instance"}, + }, + "node_tcp_in_use": { + PromQL: `node_sockstat_TCP_inuse{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + }, + "node_segments_retransmitted": { + PromQL: `rate(node_netstat_Tcp_RetransSegs{$LABEL_CONDITIONS}[$RANGE_DURATION]) or irate(node_netstat_Tcp_RetransSegs{$LABEL_CONDITIONS}[$RANGE_DURATION])`, + Labels: []string{"instance"}, + }, + "node_tcp_connections": { + PromQL: `node_netstat_Tcp_CurrEstab{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + }, + "node_processes_running": { + PromQL: `node_procs_running{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + }, + "node_processes_blocked": { + PromQL: `node_procs_blocked{$LABEL_CONDITIONS}`, + Labels: []string{"instance"}, + }, } From f91d1f5cbbeff6c68e4f62de18ae5922f8cb8cf0 Mon Sep 17 00:00:00 2001 From: crazycs Date: Fri, 7 Feb 2020 15:32:11 +0800 Subject: [PATCH 2/7] add comment Signed-off-by: crazycs --- infoschema/metric_table_def.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/infoschema/metric_table_def.go b/infoschema/metric_table_def.go index e96195dbd1673..587f72724bcb8 100644 --- a/infoschema/metric_table_def.go +++ b/infoschema/metric_table_def.go @@ -1857,7 +1857,7 @@ var MetricTableMap = map[string]MetricTableDef{ "node_uptime": { PromQL: 
`node_time_seconds{$LABEL_CONDITIONS} - node_boot_time_seconds{$LABEL_CONDITIONS}`, Labels: []string{"instance"}, - Comment: "node uptime", + Comment: "node uptime, units are seconds", }, "node_load1": { PromQL: `node_load1{$LABEL_CONDITIONS}`, @@ -1961,8 +1961,9 @@ var MetricTableMap = map[string]MetricTableDef{ Comment: "node disk read latency(ms)", }, "node_disk_throughput": { - PromQL: `irate(node_disk_read_bytes_total{$LABEL_CONDITIONS}[$RANGE_DURATION]) + irate(node_disk_written_bytes_total{$LABEL_CONDITIONS}[$RANGE_DURATION])`, - Labels: []string{"device", "instance"}, + PromQL: `irate(node_disk_read_bytes_total{$LABEL_CONDITIONS}[$RANGE_DURATION]) + irate(node_disk_written_bytes_total{$LABEL_CONDITIONS}[$RANGE_DURATION])`, + Labels: []string{"device", "instance"}, + Comment: "Units is byte", }, "node_filesystem_space_used": { PromQL: `((node_filesystem_size_bytes{$LABEL_CONDITIONS} - node_filesystem_avail_bytes{$LABEL_CONDITIONS}) / node_filesystem_size_bytes{$LABEL_CONDITIONS}) * 100`, From f87124b51983996f88f4358a64c22550e810bbfa Mon Sep 17 00:00:00 2001 From: crazycs Date: Tue, 11 Feb 2020 13:11:35 +0800 Subject: [PATCH 3/7] Update infoschema/metric_table_def.go Co-Authored-By: Maxwell --- infoschema/metric_table_def.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infoschema/metric_table_def.go b/infoschema/metric_table_def.go index 587f72724bcb8..b4e9785c87e3a 100644 --- a/infoschema/metric_table_def.go +++ b/infoschema/metric_table_def.go @@ -1843,7 +1843,7 @@ var MetricTableMap = map[string]MetricTableDef{ "node_total_memory": { PromQL: `node_memory_MemTotal_bytes{$LABEL_CONDITIONS}`, Labels: []string{"instance"}, - Comment: "node total mem", + Comment: "total memory in node", }, "node_memory_available": { PromQL: `node_memory_MemAvailable_bytes{$LABEL_CONDITIONS}`, From 618c7dc1522b71ad1f5442de0d4b60990b543b6b Mon Sep 17 00:00:00 2001 From: crazycs Date: Tue, 11 Feb 2020 13:11:55 +0800 Subject: [PATCH 4/7] Update 
infoschema/metric_table_def.go Co-Authored-By: Maxwell --- infoschema/metric_table_def.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infoschema/metric_table_def.go b/infoschema/metric_table_def.go index b4e9785c87e3a..43fda9c60fa3d 100644 --- a/infoschema/metric_table_def.go +++ b/infoschema/metric_table_def.go @@ -1862,7 +1862,7 @@ var MetricTableMap = map[string]MetricTableDef{ "node_load1": { PromQL: `node_load1{$LABEL_CONDITIONS}`, Labels: []string{"instance"}, - Comment: "node cpu load1", + Comment: "1 minute load averages in node", }, "node_load5": { PromQL: `node_load5{$LABEL_CONDITIONS}`, From bb6d00c3a03ed2b787bd0660f2aae59a9f563c5e Mon Sep 17 00:00:00 2001 From: crazycs Date: Tue, 11 Feb 2020 13:12:03 +0800 Subject: [PATCH 5/7] Update infoschema/metric_table_def.go Co-Authored-By: Maxwell --- infoschema/metric_table_def.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infoschema/metric_table_def.go b/infoschema/metric_table_def.go index 43fda9c60fa3d..6e361adb24911 100644 --- a/infoschema/metric_table_def.go +++ b/infoschema/metric_table_def.go @@ -1867,7 +1867,7 @@ var MetricTableMap = map[string]MetricTableDef{ "node_load5": { PromQL: `node_load5{$LABEL_CONDITIONS}`, Labels: []string{"instance"}, - Comment: "node cpu load5", + Comment: "5 minutes load averages in node", }, "node_load15": { PromQL: `node_load15{$LABEL_CONDITIONS}`, From ac32ac105347ec5ac2a66fa78f6352c525cd9150 Mon Sep 17 00:00:00 2001 From: crazycs Date: Tue, 11 Feb 2020 13:12:11 +0800 Subject: [PATCH 6/7] Update infoschema/metric_table_def.go Co-Authored-By: Maxwell --- infoschema/metric_table_def.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infoschema/metric_table_def.go b/infoschema/metric_table_def.go index 6e361adb24911..64dfd9c6a9760 100644 --- a/infoschema/metric_table_def.go +++ b/infoschema/metric_table_def.go @@ -1872,7 +1872,7 @@ var MetricTableMap = map[string]MetricTableDef{ "node_load15": { PromQL: 
`node_load15{$LABEL_CONDITIONS}`, Labels: []string{"instance"}, - Comment: "node cpu load15", + Comment: "15 minutes load averages in node", }, "node_kernel_interrupts": { PromQL: `rate(node_intr_total{$LABEL_CONDITIONS}[$RANGE_DURATION]) or irate(node_intr_total{$LABEL_CONDITIONS}[$RANGE_DURATION])`, From 8ec74075f1d144add4e7968f00055cd0699f97a0 Mon Sep 17 00:00:00 2001 From: crazycs Date: Tue, 11 Feb 2020 14:23:38 +0800 Subject: [PATCH 7/7] fix name Signed-off-by: crazycs --- infoschema/metric_table_def.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/infoschema/metric_table_def.go b/infoschema/metric_table_def.go index 64dfd9c6a9760..cf1aa7bd97628 100644 --- a/infoschema/metric_table_def.go +++ b/infoschema/metric_table_def.go @@ -2023,7 +2023,7 @@ var MetricTableMap = map[string]MetricTableDef{ PromQL: `node_sockstat_TCP_inuse{$LABEL_CONDITIONS}`, Labels: []string{"instance"}, }, - "node_segments_retransmitted": { + "node_tcp_segments_retransmitted": { PromQL: `rate(node_netstat_Tcp_RetransSegs{$LABEL_CONDITIONS}[$RANGE_DURATION]) or irate(node_netstat_Tcp_RetransSegs{$LABEL_CONDITIONS}[$RANGE_DURATION])`, Labels: []string{"instance"}, },