From 0990cc94dbf68017af19ded532379c3e2e694915 Mon Sep 17 00:00:00 2001 From: Cyclinder Kuo Date: Wed, 8 Jan 2025 18:57:07 +0800 Subject: [PATCH] spiderpool v1.0.0-rc4 --- charts/spiderpool/config | 2 +- charts/spiderpool/spiderpool/Chart.yaml | 6 +- charts/spiderpool/spiderpool/README.md | 4 +- .../spiderpool/charts/spiderpool/Chart.yaml | 4 +- .../spiderpool/charts/spiderpool/README.md | 4 +- .../charts/spiderpool/files/grafana-ipam.json | 836 ++++++++++++++++++ .../files/grafana-rdma-cluster.json | 789 ++++++++++++++--- .../spiderpool/files/grafana-rdma-node.json | 95 +- .../spiderpool/files/grafana-rdma-pod.json | 163 ++-- .../files/grafana-rdma-workload.json | 278 +++++- .../spiderpool/templates/configmap.yaml | 152 +++- .../spiderpool/templates/daemonset.yaml | 63 +- .../templates/grafanaDashboard.yaml | 745 +--------------- .../charts/spiderpool/templates/pod.yaml | 6 - .../spiderpool/templates/servicemonitor.yaml | 13 + .../charts/spiderpool/templates/tls.yaml | 42 + .../spiderpool/charts/spiderpool/values.yaml | 11 +- charts/spiderpool/spiderpool/values.yaml | 12 +- 18 files changed, 2213 insertions(+), 1012 deletions(-) create mode 100644 charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-ipam.json diff --git a/charts/spiderpool/config b/charts/spiderpool/config index 29b872da4..2728b4b3f 100644 --- a/charts/spiderpool/config +++ b/charts/spiderpool/config @@ -4,7 +4,7 @@ export USE_OPENSOURCE_CHART=false export REPO_URL=https://spidernet-io.github.io/spiderpool export REPO_NAME=spiderpool export CHART_NAME=spiderpool -export VERSION=1.0.0-rc3 +export VERSION=1.0.0-rc4 # pr, issue, none export UPGRADE_METHOD=pr diff --git a/charts/spiderpool/spiderpool/Chart.yaml b/charts/spiderpool/spiderpool/Chart.yaml index 76cd95ec6..dbd70bf3c 100644 --- a/charts/spiderpool/spiderpool/Chart.yaml +++ b/charts/spiderpool/spiderpool/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v2 -appVersion: 1.0.0-rc3 +appVersion: 1.0.0-rc4 description: underlay CNI solution for 
kubernetes home: https://spidernet-io.github.io/spiderpool icon: https://raw.githubusercontent.com/spidernet-io/spiderpool/main/docs/images/spider.svg @@ -16,8 +16,8 @@ name: spiderpool sources: - https://github.com/spidernet-io/spiderpool type: application -version: 1.0.0-rc3 +version: 1.0.0-rc4 dependencies: - name: spiderpool - version: "1.0.0-rc3" + version: "1.0.0-rc4" repository: "https://spidernet-io.github.io/spiderpool" diff --git a/charts/spiderpool/spiderpool/README.md b/charts/spiderpool/spiderpool/README.md index 1cfb0f204..3f65ce21c 100644 --- a/charts/spiderpool/spiderpool/README.md +++ b/charts/spiderpool/spiderpool/README.md @@ -198,7 +198,7 @@ helm install spiderpool spiderpool/spiderpool --wait --namespace kube-system \ | `multus.multusCNI.image.repository` | the multus-CNI image repository | `k8snetworkplumbingwg/multus-cni` | | `multus.multusCNI.image.pullPolicy` | the multus-CNI image pullPolicy | `IfNotPresent` | | `multus.multusCNI.image.digest` | the multus-CNI image digest | `""` | -| `multus.multusCNI.image.tag` | the multus-CNI image tag | `v3.9.3` | +| `multus.multusCNI.image.tag` | the multus-CNI image tag | `v4.1.4` | | `multus.multusCNI.image.imagePullSecrets` | the multus-CNI image imagePullSecrets | `[]` | | `multus.multusCNI.defaultCniCRName` | if this value is empty, multus will automatically get default CNI according to the existed CNI conf file in /etc/cni/net.d/, if no cni files found in /etc/cni/net.d, A Spidermultusconfig CR named default will be created, please update the related SpiderMultusConfig for default CNI after installation. 
The namespace of defaultCniCRName follows with the release namespace of spdierpool | `""` | | `multus.multusCNI.securityContext.privileged` | the securityContext privileged of multus-CNI daemonset pod | `true` | @@ -222,7 +222,7 @@ helm install spiderpool spiderpool/spiderpool --wait --namespace kube-system \ | `plugins.image.repository` | the image repository of plugins | `spidernet-io/spiderpool/spiderpool-plugins` | | `plugins.image.pullPolicy` | the image pullPolicy of plugins | `IfNotPresent` | | `plugins.image.digest` | the image digest of plugins | `""` | -| `plugins.image.tag` | the image tag of plugins | `82659d90cae0d6a5169eac2869e47c989932d775` | +| `plugins.image.tag` | the image tag of plugins | `27c4f118b1cec3773f2679b772e7583fc77e5686` | | `plugins.image.imagePullSecrets` | the image imagePullSecrets of plugins | `[]` | ### clusterDefaultPool parameters diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/Chart.yaml b/charts/spiderpool/spiderpool/charts/spiderpool/Chart.yaml index d8943598d..a51fd5d9a 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/Chart.yaml +++ b/charts/spiderpool/spiderpool/charts/spiderpool/Chart.yaml @@ -1,5 +1,5 @@ apiVersion: v2 -appVersion: 1.0.0-rc3 +appVersion: 1.0.0-rc4 description: underlay CNI solution for kubernetes home: https://spidernet-io.github.io/spiderpool icon: https://raw.githubusercontent.com/spidernet-io/spiderpool/main/docs/images/spider.svg @@ -16,4 +16,4 @@ name: spiderpool sources: - https://github.com/spidernet-io/spiderpool type: application -version: 1.0.0-rc3 +version: 1.0.0-rc4 diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/README.md b/charts/spiderpool/spiderpool/charts/spiderpool/README.md index 1cfb0f204..3f65ce21c 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/README.md +++ b/charts/spiderpool/spiderpool/charts/spiderpool/README.md @@ -198,7 +198,7 @@ helm install spiderpool spiderpool/spiderpool --wait --namespace kube-system \ | 
`multus.multusCNI.image.repository` | the multus-CNI image repository | `k8snetworkplumbingwg/multus-cni` | | `multus.multusCNI.image.pullPolicy` | the multus-CNI image pullPolicy | `IfNotPresent` | | `multus.multusCNI.image.digest` | the multus-CNI image digest | `""` | -| `multus.multusCNI.image.tag` | the multus-CNI image tag | `v3.9.3` | +| `multus.multusCNI.image.tag` | the multus-CNI image tag | `v4.1.4` | | `multus.multusCNI.image.imagePullSecrets` | the multus-CNI image imagePullSecrets | `[]` | | `multus.multusCNI.defaultCniCRName` | if this value is empty, multus will automatically get default CNI according to the existed CNI conf file in /etc/cni/net.d/, if no cni files found in /etc/cni/net.d, A Spidermultusconfig CR named default will be created, please update the related SpiderMultusConfig for default CNI after installation. The namespace of defaultCniCRName follows with the release namespace of spdierpool | `""` | | `multus.multusCNI.securityContext.privileged` | the securityContext privileged of multus-CNI daemonset pod | `true` | @@ -222,7 +222,7 @@ helm install spiderpool spiderpool/spiderpool --wait --namespace kube-system \ | `plugins.image.repository` | the image repository of plugins | `spidernet-io/spiderpool/spiderpool-plugins` | | `plugins.image.pullPolicy` | the image pullPolicy of plugins | `IfNotPresent` | | `plugins.image.digest` | the image digest of plugins | `""` | -| `plugins.image.tag` | the image tag of plugins | `82659d90cae0d6a5169eac2869e47c989932d775` | +| `plugins.image.tag` | the image tag of plugins | `27c4f118b1cec3773f2679b772e7583fc77e5686` | | `plugins.image.imagePullSecrets` | the image imagePullSecrets of plugins | `[]` | ### clusterDefaultPool parameters diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-ipam.json b/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-ipam.json new file mode 100644 index 000000000..f97dc4de7 --- /dev/null +++ 
b/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-ipam.json @@ -0,0 +1,836 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "grafana", + "uid": "-- Grafana --" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "id": 24, + "links": [], + "liveNow": false, + "panels": [ + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 6, + "panels": [], + "title": "Row title", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + "value": 85 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 18, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.3.14", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "spiderpool_total_ippool_counts{cluster=~\"$cluster\"}", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "total ippool counts", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "mappings": [], + "thresholds": { + "mode": "percentage", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "orange", + "value": 70 + }, + { + "color": "red", + 
"value": 85 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 20, + "options": { + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": true + }, + "pluginVersion": "9.3.14", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "spiderpool_total_subnet_counts{cluster=~\"$cluster\"}", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "total subnet counts", + "type": "gauge" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "spiderpool IPAM IP allocation status", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 10, + "x": 0, + "y": 8 + }, + "id": 2, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": 
"spiderpool_ipam_allocation_counts_total{cluster=~\"$cluster\"}", + "legendFormat": "__auto", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "builder", + "expr": "spiderpool_ipam_allocation_failure_counts_total", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + } + ], + "title": "IP allocation counts", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 8, + "x": 10, + "y": 8 + }, + "id": 10, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.1.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "spiderpool_ipam_allocation_average_duration_seconds{cluster=~\"$cluster\"}", + "legendFormat": "__auto", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "spiderpool_ipam_allocation_max_duration_seconds{cluster=~\"$cluster\"}", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "spiderpool_ipam_allocation_min_duration_seconds{cluster=~\"$cluster\"}", + "hide": false, + 
"legendFormat": "__auto", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "spiderpool_ipam_allocation_latest_duration_seconds{cluster=~\"$cluster\"}", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "D" + } + ], + "title": "ip allocation durations", + "type": "state-timeline" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 9, + "w": 6, + "x": 18, + "y": 8 + }, + "id": 12, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.1.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "spiderpool_ipam_allocation_duration_seconds_bucket{cluster=~\"$cluster\"}", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + } + ], + "title": "ip allocation duration distribution", + "type": "piechart" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "spiderpool IP release and IP GC status", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 25, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + 
"pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "normal" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 10, + "x": 0, + "y": 17 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "spiderpool_ipam_release_counts_total{cluster=~\"$cluster\"}", + "legendFormat": "__auto", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "spiderpool_ipam_release_failure_counts_total{cluster=~\"$cluster\"}", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "builder", + "expr": "spiderpool_ip_gc_counts_total", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "spiderpool_ip_gc_failure_counts_total{cluster=~\"$cluster\"}", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "D" + } + ], + "title": "IP release&GC counts", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-GrYlRd" + }, + "custom": { + "fillOpacity": 70, + "lineWidth": 0, + "spanNulls": false + }, + "mappings": [], + "thresholds": 
{ + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 8, + "x": 10, + "y": 17 + }, + "id": 14, + "options": { + "alignValue": "left", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": true, + "rowHeight": 0.9, + "showValue": "auto", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "9.1.6", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "spiderpool_ipam_release_average_duration_seconds{cluster=~\"$cluster\"}", + "legendFormat": "__auto", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "spiderpool_ipam_release_max_duration_seconds{cluster=~\"$cluster\"}", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "spiderpool_ipam_release_min_duration_seconds{cluster=~\"$cluster\"}", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "C" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "spiderpool_ipam_release_latest_duration_seconds{cluster=~\"$cluster\"}", + "hide": false, + "legendFormat": "__auto", + "range": true, + "refId": "D" + } + ], + "title": "IP release durations", + "type": "state-timeline" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + } + }, + "mappings": [] + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 6, + "x": 18, + "y": 17 + }, 
+ "id": 16, + "options": { + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "pieType": "pie", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "spiderpool_ipam_release_duration_seconds_bucket{cluster=~\"$cluster\"}", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "IP release duration distribution", + "type": "piechart" + } + ], + "refresh": false, + "schemaVersion": 37, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "Prometheus", + "value": "Prometheus" + }, + "hide": 2, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(global_cluster_info, cluster_name)", + "hide": 0, + "includeAll": false, + "label": "Cluster", + "multi": false, + "name": "cluster_name", + "options": [], + "query": { + "query": "label_values(global_cluster_info, cluster_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(global_cluster_info{cluster_name=\"$cluster_name\"}, cluster)", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "cluster", + "options": [], + "query": { + "query": 
"label_values(global_cluster_info{cluster_name=\"$cluster_name\"}, cluster)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + } + ] + }, + "time": { + "from": "now-6h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Spiderpool", + "uid": "5FAGqFE4z", + "version": 2, + "weekStart": "" +} \ No newline at end of file diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-cluster.json b/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-cluster.json index 3f47706e7..84d5fbe51 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-cluster.json +++ b/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-cluster.json @@ -24,7 +24,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 10, + "id": 23, "links": [], "liveNow": false, "panels": [ @@ -97,7 +97,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "count(count(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\"}) by (node_name))", + "expr": "count(count(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\",cluster=~\"$cluster\"}) by (node_name))", "legendFormat": "__auto", "range": true, "refId": "A" @@ -162,7 +162,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "count(count(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\"}) by (pod_name))", + "expr": "count(count(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\",cluster=~\"$cluster\"}) by (pod_name))", "legendFormat": "__auto", "range": true, "refId": "A" @@ -262,6 +262,250 @@ "x": 0, "y": 5 }, + "id": 23, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(\n 
rate(rdma_rx_vport_rdma_unicast_bytes_total{is_root=\"true\",cluster=~\"$cluster\"}[5m]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{is_root=\"true\",cluster=~\"$cluster\"}[5m])\n)", + "legendFormat": "Read", + "range": true, + "refId": "A" + } + ], + "title": "Cluster Bandwidth | Read", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 3, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "10-20-1-50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "10-20-1-60" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 5 + }, + "id": 8, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum(\n 
rate(rdma_tx_vport_rdma_unicast_bytes_total{is_root=\"true\",cluster=~\"$cluster\"}[5m]) +\n rate(rdma_tx_vport_rdma_multicast_bytes_total{is_root=\"true\",cluster=~\"$cluster\"}[5m])\n)", + "legendFormat": "Write", + "range": true, + "refId": "A" + } + ], + "title": "Cluster Bandwidth | Write", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 20, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 3, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "10-20-1-60" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "10-20-1-50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 13 + }, "id": 7, "options": { "legend": { @@ -282,13 +526,253 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by (node_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total[5m]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total[5m])\n)", + "expr": "sum by (node_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{cluster=~\"$cluster\"}[5m]) +\n 
rate(rdma_rx_vport_rdma_multicast_bytes_total{cluster=~\"$cluster\"}[5m])\n)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Node Bandwidth | Read", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 3, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "bytes" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "10-20-1-50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "10-20-1-60" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 13 + }, + "id": 24, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by (node_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{cluster=~\"$cluster\"}[5m]) +\n 
rate(rdma_tx_vport_rdma_multicast_bytes_total{cluster=~\"$cluster\"}[5m])\n)", + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Node Bandwidth | Write", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 10, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + }, + "unit": "percent" + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "10-20-1-50" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "blue", + "mode": "fixed" + } + } + ] + }, + { + "matcher": { + "id": "byName", + "options": "10-20-1-60" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "purple", + "mode": "fixed" + } + } + ] + } + ] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 21 + }, + "id": 4, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "expr": "sum by (node_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{cluster=~\"$cluster\"}[3m]) + 
rate(rdma_rx_vport_rdma_multicast_bytes_total{cluster=~\"$cluster\"}[3m])\n)\n/ sum by (node_name) (rdma_vport_speed_mbps_total{is_root=\"true\",cluster=~\"$cluster\"} * 1000000 / 8) * 100", "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Node Bandwidth | Read", + "title": "Rate of Bandwidth | Node | Read", "type": "timeseries" }, { @@ -317,7 +801,7 @@ }, "lineInterpolation": "linear", "lineWidth": 1, - "pointSize": 3, + "pointSize": 5, "scaleDistribution": { "type": "linear" }, @@ -338,14 +822,10 @@ { "color": "green", "value": null - }, - { - "color": "red", - "value": 80 } ] }, - "unit": "bytes" + "unit": "percent" }, "overrides": [ { @@ -384,9 +864,9 @@ "h": 8, "w": 12, "x": 12, - "y": 5 + "y": 21 }, - "id": 8, + "id": 5, "options": { "legend": { "calcs": [], @@ -406,15 +886,28 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by (node_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total[5m]) +\n rate(rdma_tx_vport_rdma_multicast_bytes_total[5m])\n)", + "expr": "sum by (node_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{cluster=~\"$cluster\"}[3m]) + rate(rdma_tx_vport_rdma_multicast_bytes_total{cluster=~\"$cluster\"}[3m])\n)\n/ sum by (node_name) (rdma_vport_speed_mbps_total{is_root=\"true\",cluster=~\"$cluster\"} * 1000000 / 8) * 100", "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Node Bandwidth | Write", + "title": "Rate of Bandwidth | Node | Write", "type": "timeseries" }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 29 + }, + "id": 18, + "panels": [], + "title": "Top traffic", + "type": "row" + }, { "datasource": { "type": "prometheus", @@ -441,7 +934,7 @@ }, "lineInterpolation": "linear", "lineWidth": 1, - "pointSize": 5, + "pointSize": 3, "scaleDistribution": { "type": "linear" }, @@ -460,12 +953,15 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" + }, + { + "color": "red", + "value": 80 } ] }, - "unit": 
"percent" + "unit": "bytes" }, "overrides": [ { @@ -504,14 +1000,14 @@ "h": 8, "w": 12, "x": 0, - "y": 13 + "y": 30 }, - "id": 4, + "id": 11, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom", + "placement": "right", "showLegend": true }, "tooltip": { @@ -526,13 +1022,13 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by (node_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{}[3m]) + rate(rdma_rx_vport_rdma_multicast_bytes_total[3m])\n)\n/ sum by (node_name) (rdma_vport_speed_mbps_total{is_root=\"true\"} * 1000000 / 8) * 100", - "legendFormat": "__auto", + "expr": "topk(\n 10,\n sum by (node_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{cluster=~\"$cluster\"}[5m]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{cluster=~\"$cluster\"}[5m])\n )\n)", + "legendFormat": "{{node_name}}", "range": true, "refId": "A" } ], - "title": "Rate of Bandwidth | Node | Read", + "title": "Node Top 10 | Bandwidth | Read", "type": "timeseries" }, { @@ -561,7 +1057,7 @@ }, "lineInterpolation": "linear", "lineWidth": 1, - "pointSize": 5, + "pointSize": 3, "scaleDistribution": { "type": "linear" }, @@ -580,12 +1076,15 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" + }, + { + "color": "red", + "value": 80 } ] }, - "unit": "percent" + "unit": "bytes" }, "overrides": [ { @@ -624,14 +1123,14 @@ "h": 8, "w": 12, "x": 12, - "y": 13 + "y": 30 }, - "id": 5, + "id": 12, "options": { "legend": { "calcs": [], "displayMode": "list", - "placement": "bottom", + "placement": "right", "showLegend": true }, "tooltip": { @@ -646,28 +1145,15 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by (node_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{}[3m]) + rate(rdma_tx_vport_rdma_multicast_bytes_total[3m])\n)\n/ sum by (node_name) (rdma_vport_speed_mbps_total{is_root=\"true\"} * 1000000 / 8) * 100", - "legendFormat": "__auto", + "expr": "topk(\n 10,\n sum by (node_name) (\n 
rate(rdma_tx_vport_rdma_unicast_bytes_total{cluster=~\"$cluster\"}[5m]) +\n rate(rdma_tx_vport_rdma_multicast_bytes_total{cluster=~\"$cluster\"}[5m])\n )\n)", + "legendFormat": "{{node_name}}", "range": true, "refId": "A" } ], - "title": "Rate of Bandwidth | Node | Write", + "title": "Node Top 10 | Bandwidth | Write", "type": "timeseries" }, - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 21 - }, - "id": 18, - "panels": [], - "title": "Top traffic", - "type": "row" - }, { "datasource": { "type": "prometheus", @@ -694,7 +1180,7 @@ }, "lineInterpolation": "linear", "lineWidth": 1, - "pointSize": 3, + "pointSize": 5, "scaleDistribution": { "type": "linear" }, @@ -713,16 +1199,11 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 + "color": "green" } ] }, - "unit": "bytes" + "unit": "percent" }, "overrides": [ { @@ -761,9 +1242,9 @@ "h": 8, "w": 12, "x": 0, - "y": 22 + "y": 38 }, - "id": 11, + "id": 19, "options": { "legend": { "calcs": [], @@ -783,13 +1264,13 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "topk(\n 10,\n sum by (node_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total[5m]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total[5m])\n )\n)", - "legendFormat": "{{node_name}}", + "expr": "topk(\n 10,\nsum by (node_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{cluster=~\"$cluster\"}[3m]) + rate(rdma_rx_vport_rdma_multicast_bytes_total{cluster=~\"$cluster\"}[3m])\n)\n/ sum by (node_name) (rdma_vport_speed_mbps_total{is_root=\"true\",cluster=~\"$cluster\"} * 1000000 / 8) * 100\n)", + "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Node Top 10 | Bandwidth | Read", + "title": "Node Top 10 | Rate of Bandwidth | Read", "type": "timeseries" }, { @@ -818,7 +1299,7 @@ }, "lineInterpolation": "linear", "lineWidth": 1, - "pointSize": 3, + "pointSize": 5, "scaleDistribution": { "type": "linear" }, @@ -837,16 +1318,11 @@ "mode": 
"absolute", "steps": [ { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 + "color": "green" } ] }, - "unit": "bytes" + "unit": "percent" }, "overrides": [ { @@ -885,9 +1361,9 @@ "h": 8, "w": 12, "x": 12, - "y": 22 + "y": 38 }, - "id": 12, + "id": 20, "options": { "legend": { "calcs": [], @@ -907,13 +1383,13 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "topk(\n 10,\n sum by (node_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total[5m]) +\n rate(rdma_tx_vport_rdma_multicast_bytes_total[5m])\n )\n)", - "legendFormat": "{{node_name}}", + "expr": "topk(\n 10,\nsum by (node_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{cluster=~\"$cluster\"}[3m]) + rate(rdma_tx_vport_rdma_multicast_bytes_total{cluster=~\"$cluster\"}[3m])\n)\n/ sum by (node_name) (rdma_vport_speed_mbps_total{is_root=\"true\",cluster=~\"$cluster\"} * 1000000 / 8) * 100\n)", + "legendFormat": "__auto", "range": true, "refId": "A" } ], - "title": "Node Top 10 | Bandwidth | Write", + "title": "Node Top 10 | Rate of Bandwidth | Write", "type": "timeseries" }, { @@ -942,7 +1418,7 @@ }, "lineInterpolation": "linear", "lineWidth": 1, - "pointSize": 5, + "pointSize": 3, "scaleDistribution": { "type": "linear" }, @@ -961,24 +1437,27 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" + }, + { + "color": "red", + "value": 80 } ] }, - "unit": "percent" + "unit": "bytes" }, "overrides": [ { "matcher": { "id": "byName", - "options": "10-20-1-50" + "options": "rdma-test-gpu-tool-jgt9t" }, "properties": [ { "id": "color", "value": { - "fixedColor": "blue", + "fixedColor": "#73BF69", "mode": "fixed" } } @@ -987,13 +1466,13 @@ { "matcher": { "id": "byName", - "options": "10-20-1-60" + "options": "rdma-test-gpu-tool-n792j" }, "properties": [ { "id": "color", "value": { - "fixedColor": "purple", + "fixedColor": "#ffc0cb", "mode": "fixed" } } @@ -1005,9 +1484,9 @@ "h": 8, "w": 12, "x": 0, - "y": 30 + "y": 46 }, - "id": 19, + "id": 
13, "options": { "legend": { "calcs": [], @@ -1027,13 +1506,13 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "topk(\n 10,\nsum by (node_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{}[3m]) + rate(rdma_rx_vport_rdma_multicast_bytes_total[3m])\n)\n/ sum by (node_name) (rdma_vport_speed_mbps_total{is_root=\"true\"} * 1000000 / 8) * 100\n)", - "legendFormat": "__auto", + "expr": "topk(\n 10,\n sum by (pod_name) (\n (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name!=\"\",cluster=~\"$cluster\"}[5m]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\",cluster=~\"$cluster\"}[5m])\n )\n )\n)", + "legendFormat": "{{node_name}}", "range": true, "refId": "A" } ], - "title": "Node Top 10 | Rate of Bandwidth | Read", + "title": "Pod Top 10 | Bandwidth | Read", "type": "timeseries" }, { @@ -1062,7 +1541,7 @@ }, "lineInterpolation": "linear", "lineWidth": 1, - "pointSize": 5, + "pointSize": 3, "scaleDistribution": { "type": "linear" }, @@ -1081,39 +1560,27 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" + }, + { + "color": "red", + "value": 80 } ] }, - "unit": "percent" + "unit": "bytes" }, "overrides": [ { "matcher": { "id": "byName", - "options": "10-20-1-50" - }, - "properties": [ - { - "id": "color", - "value": { - "fixedColor": "blue", - "mode": "fixed" - } - } - ] - }, - { - "matcher": { - "id": "byName", - "options": "10-20-1-60" + "options": "rdma-test-gpu-tool-n792j" }, "properties": [ { "id": "color", "value": { - "fixedColor": "purple", + "fixedColor": "#ffc0cb", "mode": "fixed" } } @@ -1125,9 +1592,9 @@ "h": 8, "w": 12, "x": 12, - "y": 30 + "y": 46 }, - "id": 20, + "id": 14, "options": { "legend": { "calcs": [], @@ -1147,13 +1614,13 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "topk(\n 10,\nsum by (node_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{}[3m]) + rate(rdma_tx_vport_rdma_multicast_bytes_total[3m])\n)\n/ sum by (node_name) 
(rdma_vport_speed_mbps_total{is_root=\"true\"} * 1000000 / 8) * 100\n)", - "legendFormat": "__auto", + "expr": "topk(\n 10,\n sum by (pod_name) (\n (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\",cluster=~\"$cluster\"}[5m]) +\n rate(rdma_tx_vport_rdma_multicast_bytes_total{pod_name!=\"\",cluster=~\"$cluster\"}[5m])\n )\n )\n)", + "legendFormat": "{{node_name}}", "range": true, "refId": "A" } ], - "title": "Node Top 10 | Rate of Bandwidth | Write", + "title": "Pod Top 10 | Bandwidth | Write", "type": "timeseries" }, { @@ -1201,8 +1668,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1249,9 +1715,9 @@ "h": 8, "w": 12, "x": 0, - "y": 38 + "y": 54 }, - "id": 13, + "id": 21, "options": { "legend": { "calcs": [], @@ -1271,13 +1737,13 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "topk(\n 10,\n sum by (pod_name) (\n (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name!=\"\"}[5m]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\"}[5m])\n )\n )\n)", - "legendFormat": "{{node_name}}", + "expr": "topk(\n 10,\n sum by (owner_kind,owner_namespace,owner_name) (\n (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name!=\"\",cluster=~\"$cluster\"}[5m]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\",cluster=~\"$cluster\"}[5m])\n )\n )\n)", + "legendFormat": "{{owner_kind}} - {{owner_namespace}}/{{owner_name}}", "range": true, "refId": "A" } ], - "title": "Pod Top 10 | Bandwidth | Read", + "title": "Workload Top 10 | Bandwidth | Read", "type": "timeseries" }, { @@ -1325,8 +1791,7 @@ "mode": "absolute", "steps": [ { - "color": "green", - "value": null + "color": "green" }, { "color": "red", @@ -1337,6 +1802,21 @@ "unit": "bytes" }, "overrides": [ + { + "matcher": { + "id": "byName", + "options": "rdma-test-gpu-tool-jgt9t" + }, + "properties": [ + { + "id": "color", + "value": { + "fixedColor": "#73BF69", + "mode": "fixed" + } + } + ] + }, { 
"matcher": { "id": "byName", @@ -1358,9 +1838,9 @@ "h": 8, "w": 12, "x": 12, - "y": 38 + "y": 54 }, - "id": 14, + "id": 22, "options": { "legend": { "calcs": [], @@ -1380,13 +1860,13 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "topk(\n 10,\n sum by (pod_name) (\n (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\"}[5m]) +\n rate(rdma_tx_vport_rdma_multicast_bytes_total{pod_name!=\"\"}[5m])\n )\n )\n)", - "legendFormat": "{{node_name}}", + "expr": "topk(\n 10,\n sum by (owner_kind,owner_namespace,owner_name) (\n (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\",cluster=~\"$cluster\"}[5m]) +\n rate(rdma_tx_vport_rdma_multicast_bytes_total{pod_name!=\"\",cluster=~\"$cluster\"}[5m])\n )\n )\n)", + "legendFormat": "{{owner_kind}} - {{owner_namespace}}/{{owner_name}}", "range": true, "refId": "A" } ], - "title": "Pod Top 10 | Bandwidth | Write", + "title": "Workload Top 10 | Bandwidth | Write", "type": "timeseries" } ], @@ -1413,6 +1893,61 @@ "regex": "", "skipUrlSync": false, "type": "datasource" + }, + { + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(global_cluster_info, cluster_name)", + "hide": 0, + "includeAll": false, + "label": "Cluster", + "multi": false, + "name": "cluster_name", + "options": [], + "query": { + "query": "label_values(global_cluster_info, cluster_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(global_cluster_info{cluster_name=\"$cluster_name\"}, cluster)", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "cluster", + "options": [], + "query": { + "query": 
"label_values(global_cluster_info{cluster_name=\"$cluster_name\"}, cluster)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" } ] }, diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-node.json b/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-node.json index b75fa5b60..46a145726 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-node.json +++ b/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-node.json @@ -24,7 +24,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 7, + "id": 22, "links": [], "liveNow": false, "panels": [ @@ -140,7 +140,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by (rdma_parent_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{node_name=~\"$node\"}[1m]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{node_name=~\"$node\"}[1m])\n)", + "expr": "sum by (rdma_parent_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{node_name=~\"$node\",cluster=~\"$cluster\"}[1m]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{node_name=~\"$node\",cluster=~\"$cluster\"}[1m])\n)", "legendFormat": "{{net_dev_name}}", "range": true, "refId": "A" @@ -248,7 +248,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by (rdma_parent_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{node_name=~\"$node\"}[1m]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{node_name=~\"$node\"}[1m])\n)", + "expr": "sum by (rdma_parent_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{node_name=~\"$node\",cluster=~\"$cluster\"}[1m]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{node_name=~\"$node\",cluster=~\"$cluster\"}[1m])\n)", "legendFormat": "{{net_dev_name}}", "range": true, "refId": "A" @@ -339,7 +339,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by (rdma_parent_name) (\n 
rate(rdma_rx_vport_rdma_unicast_bytes_total{node_name=~\"$node\"}[1m])\n)\n/ sum by (rdma_parent_name) (rdma_vport_speed_mbps_total{node_name=~\"$node\", is_root=\"true\"} * 1000000 / 8) * 100", + "expr": "sum by (rdma_parent_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{node_name=~\"$node\",cluster=~\"$cluster\"}[1m])\n)\n/ sum by (rdma_parent_name) (rdma_vport_speed_mbps_total{node_name=~\"$node\", is_root=\"true\",cluster=~\"$cluster\"} * 1000000 / 8) * 100", "legendFormat": "__auto", "range": true, "refId": "A" @@ -431,7 +431,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by (rdma_parent_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{node_name=~\"$node\"}[1m])\n)\n/ sum by (rdma_parent_name) (rdma_vport_speed_mbps_total{node_name=~\"$node\", is_root=\"true\"} * 1000000 / 8) * 100", + "expr": "sum by (rdma_parent_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{node_name=~\"$node\",cluster=~\"$cluster\"}[1m])\n)\n/ sum by (rdma_parent_name) (rdma_vport_speed_mbps_total{node_name=~\"$node\", is_root=\"true\",cluster=~\"$cluster\"} * 1000000 / 8) * 100", "legendFormat": "__auto", "range": true, "refId": "A" @@ -564,7 +564,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name=\"\", node_name=~\"$node\"}[$__rate_interval]) +\nrate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name=\"\", node_name=~\"$node\"}[$__rate_interval])", + "expr": "rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name=\"\", node_name=~\"$node\",cluster=~\"$cluster\"}[$__rate_interval]) +\nrate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name=\"\", node_name=~\"$node\",cluster=~\"$cluster\"}[$__rate_interval])", "legendFormat": "{{net_dev_name}}", "range": true, "refId": "A" @@ -653,7 +653,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name=\"\", node_name=~\"$node\"}[$__rate_interval]) 
+\nrate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name=\"\", node_name=~\"$node\"}[$__rate_interval])", + "expr": "rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name=\"\", node_name=~\"$node\",cluster=~\"$cluster\"}[$__rate_interval]) +\nrate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name=\"\", node_name=~\"$node\",cluster=~\"$cluster\"}[$__rate_interval])", "legendFormat": "{{net_dev_name}}", "range": true, "refId": "A" @@ -720,7 +720,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -754,7 +755,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name!=\"\", node_name=~\"$node\"}[$__rate_interval])", + "expr": "rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name!=\"\", node_name=~\"$node\",cluster=~\"$cluster\"}[$__rate_interval])", "legendFormat": "{{ifname}}", "range": true, "refId": "A" @@ -808,7 +809,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null } ] }, @@ -842,7 +844,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\", node_name=~\"$node\"}[$__rate_interval]) +\nrate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\", node_name=~\"$node\"}[$__rate_interval])", + "expr": "rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\", node_name=~\"$node\",cluster=~\"$cluster\"}[$__rate_interval]) +\nrate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\", node_name=~\"$node\",cluster=~\"$cluster\"}[$__rate_interval])", "legendFormat": "{{ifname}}", "range": true, "refId": "A" @@ -896,7 +898,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -934,7 +937,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "topk(\n 10,\n sum by (pod_name, pod_namespace) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name!=\"\", 
node_name=~\"$node\"}[$__rate_interval]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\", node_name=~\"$node\"}[$__rate_interval])\n )\n)", + "expr": "topk(\n 10,\n sum by (pod_name, pod_namespace) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name!=\"\", node_name=~\"$node\",cluster=~\"$cluster\"}[$__rate_interval]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\", node_name=~\"$node\",cluster=~\"$cluster\"}[$__rate_interval])\n )\n)", "legendFormat": "{{pod_namespace}}/{{pod_name}}", "range": true, "refId": "A" @@ -988,7 +991,8 @@ "mode": "absolute", "steps": [ { - "color": "green" + "color": "green", + "value": null }, { "color": "red", @@ -1026,7 +1030,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "topk(\n 10, \n sum by (pod_name, pod_namespace) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\", node_name=~\"$node\"}[$__rate_interval]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\", node_name=~\"$node\"}[$__rate_interval])\n )\n)", + "expr": "topk(\n 10, \n sum by (pod_name, pod_namespace) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\", node_name=~\"$node\",cluster=~\"$cluster\"}[$__rate_interval]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\", node_name=~\"$node\",cluster=~\"$cluster\"}[$__rate_interval])\n )\n)", "legendFormat": "{{pod_namespace}}/{{pod_name}}", "range": true, "refId": "A" @@ -1060,6 +1064,61 @@ "skipUrlSync": false, "type": "datasource" }, + { + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(global_cluster_info, cluster_name)", + "hide": 0, + "includeAll": false, + "label": "Cluster", + "multi": false, + "name": "cluster_name", + "options": [], + "query": { + "query": "label_values(global_cluster_info, cluster_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + 
"regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(global_cluster_info{cluster_name=\"$cluster_name\"}, cluster)", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "cluster", + "options": [], + "query": { + "query": "label_values(global_cluster_info{cluster_name=\"$cluster_name\"}, cluster)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, { "current": { "selected": false, @@ -1070,14 +1129,14 @@ "type": "prometheus", "uid": "${datasource}" }, - "definition": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{}, node_name)", + "definition": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{cluster=~\"$cluster\"}, node_name)", "hide": 0, "includeAll": false, "multi": false, "name": "node", "options": [], "query": { - "query": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{}, node_name)", + "query": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{cluster=~\"$cluster\"}, node_name)", "refId": "StandardVariableQuery" }, "refresh": 1, @@ -1096,6 +1155,6 @@ "timezone": "", "title": "Spiderpool RDMA | Node", "uid": "A0T4f2ZNz", - "version": 22, + "version": 23, "weekStart": "" } \ No newline at end of file diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-pod.json b/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-pod.json index faef3fde4..ecd0befe1 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-pod.json +++ b/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-pod.json @@ -24,7 +24,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 6, + "id": 21, "links": [], "liveNow": false, "panels": [ @@ -156,7 +156,7 @@ }, "editorMode": 
"code", "exemplar": false, - "expr": "rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval]) + rate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval]) ", + "expr": "rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval]) + rate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval]) ", "format": "time_series", "instant": false, "interval": "", @@ -283,7 +283,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval]) + rate(rdma_tx_vport_rdma_multicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval]) ", + "expr": "rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval]) + rate(rdma_tx_vport_rdma_multicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval]) ", "format": "time_series", "instant": false, "interval": "", @@ -410,7 +410,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "(rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval]) +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])\n)\n/\n(rdma_vport_speed_mbps_total{pod_name!=\"\"} * 1000000 / 8) * 100", + "expr": "(rate(rdma_rx_vport_rdma_unicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval]) +\n 
rate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])\n)\n/\n(rdma_vport_speed_mbps_total{pod_name!=\"\",cluster=~\"$cluster\"} * 1000000 / 8) * 100", "format": "time_series", "instant": false, "interval": "", @@ -537,7 +537,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "(rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval]) +\n rate(rdma_tx_vport_rdma_multicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])\n)\n/\n(rdma_vport_speed_mbps_total{pod_name!=\"\"} * 1000000 / 8) * 100", + "expr": "(rate(rdma_tx_vport_rdma_unicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval]) +\n rate(rdma_tx_vport_rdma_multicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])\n)\n/\n(rdma_vport_speed_mbps_total{pod_name!=\"\",cluster=~\"$cluster\"} * 1000000 / 8) * 100", "format": "time_series", "instant": false, "interval": "", @@ -633,7 +633,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_rx_vport_rdma_unicast_packets_total{pod_name!=\"\", pod_namespace!=\"\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_rx_vport_rdma_unicast_packets_total{pod_name!=\"\", pod_namespace!=\"\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -728,7 +728,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_rx_vport_rdma_multicast_packets_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_rx_vport_rdma_multicast_packets_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", 
"format": "time_series", "instant": false, "interval": "", @@ -824,7 +824,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_rx_vport_rdma_multicast_bytes_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -918,7 +918,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_rx_read_requests_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_rx_read_requests_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -1014,7 +1014,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_tx_vport_rdma_unicast_packets_total{pod_name!=\"\", pod_namespace!=\"\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_tx_vport_rdma_unicast_packets_total{pod_name!=\"\", pod_namespace!=\"\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -1109,7 +1109,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "irate(rdma_tx_vport_rdma_multicast_packets_total{pod_namespace!=\"\", pod_name=~\"$pod\"}[1m])", + "expr": "irate(rdma_tx_vport_rdma_multicast_packets_total{pod_namespace!=\"\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[1m])", "format": "time_series", "instant": false, "interval": "", @@ -1205,7 +1205,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_tx_vport_rdma_multicast_bytes_total{pod_name!=\"\", pod_name=~\"$pod\"}[1m]) * 8 / 1000000", + "expr": "rate(rdma_tx_vport_rdma_multicast_bytes_total{pod_name!=\"\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[1m]) * 8 / 1000000", 
"format": "time_series", "instant": false, "interval": "", @@ -1300,7 +1300,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_rx_write_requests_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_rx_write_requests_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -1394,7 +1394,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_req_cqe_error_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_req_cqe_error_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -1488,7 +1488,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "irate(rdma_duplicate_request_total{pod_namespace!=\"\", pod_name=~\"$pod\"}[1m])", + "expr": "irate(rdma_duplicate_request_total{pod_namespace!=\"\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[1m])", "format": "time_series", "instant": false, "interval": "", @@ -1582,7 +1582,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_resp_remote_access_errors_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_resp_remote_access_errors_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -1676,7 +1676,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_req_remote_access_errors_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_req_remote_access_errors_total{pod_name!=\"\", pod_namespace=~\"$namespace\", 
pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -1770,7 +1770,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_rx_dct_connect_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_rx_dct_connect_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -1864,7 +1864,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_rx_atomic_requests_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_rx_atomic_requests_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -1958,7 +1958,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_req_remote_invalid_request_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_req_remote_invalid_request_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -2052,7 +2052,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_duplicate_request_total{pod_namespace!=\"\",pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_duplicate_request_total{pod_namespace!=\"\",pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -2146,7 +2146,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_rx_atomic_requests_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_rx_atomic_requests_total{pod_name!=\"\", 
pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -2240,7 +2240,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_resp_cqe_flush_error_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_resp_cqe_flush_error_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -2334,7 +2334,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_req_cqe_flush_error_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_req_cqe_flush_error_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -2428,7 +2428,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_resp_cqe_error_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_resp_cqe_error_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -2522,7 +2522,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_rnr_nak_retry_err_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_rnr_nak_retry_err_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -2616,7 +2616,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_out_of_sequence_total{pod_name!=\"\", pod_namespace=~\"$namespace\", 
pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_out_of_sequence_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -2710,7 +2710,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_packet_seq_err_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_packet_seq_err_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -2804,7 +2804,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_resp_local_length_error_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_resp_local_length_error_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -2898,7 +2898,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_implied_nak_seq_err_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_implied_nak_seq_err_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -2992,7 +2992,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_local_ack_timeout_err_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_local_ack_timeout_err_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -3086,7 +3086,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": 
"rate(rdma_out_of_buffer_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_out_of_buffer_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -3180,7 +3180,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_req_cqe_error_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_req_cqe_error_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -3274,7 +3274,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_np_cnp_sent_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_np_cnp_sent_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\", cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -3368,7 +3368,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_roce_adp_retrans_to_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_roce_adp_retrans_to_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -3462,7 +3462,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_roce_slow_restart_cnps_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_roce_slow_restart_cnps_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -3556,7 +3556,7 @@ 
}, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_np_ecn_marked_roce_packets_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_np_ecn_marked_roce_packets_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -3650,7 +3650,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_rp_cnp_handled_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_rp_cnp_handled_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -3744,7 +3744,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_roce_slow_restart_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_roce_slow_restart_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -3838,7 +3838,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_rp_cnp_ignored_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_rp_cnp_ignored_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": "time_series", "instant": false, "interval": "", @@ -3932,7 +3932,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "rate(rdma_roce_adp_retrans_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\"}[$__rate_interval])", + "expr": "rate(rdma_roce_adp_retrans_total{pod_name!=\"\", pod_namespace=~\"$namespace\", pod_name=~\"$pod\",cluster=~\"$cluster\"}[$__rate_interval])", "format": 
"time_series", "instant": false, "interval": "", @@ -3969,25 +3969,80 @@ "skipUrlSync": false, "type": "datasource" }, + { + "current": { + "isNone": true, + "selected": true, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(global_cluster_info, cluster_name)", + "hide": 0, + "includeAll": false, + "label": "Cluster", + "multi": false, + "name": "cluster_name", + "options": [], + "query": { + "query": "label_values(global_cluster_info, cluster_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(global_cluster_info{cluster_name=\"$cluster_name\"}, cluster)", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "cluster", + "options": [], + "query": { + "query": "label_values(global_cluster_info{cluster_name=\"$cluster_name\"}, cluster)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, { "allValue": ".+", "current": { "selected": false, - "text": "All", - "value": "$__all" + "text": "huailou", + "value": "huailou" }, "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "definition": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{pod_namespace=~\"$namespace\"}, pod_namespace)", + "definition": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{pod_namespace=~\"$namespace\",cluster=~\"$cluster\"}, pod_namespace)", "hide": 0, "includeAll": true, "multi": false, "name": "namespace", "options": [], "query": { - "query": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{pod_namespace=~\"$namespace\"}, pod_namespace)", + "query": 
"label_values(rdma_tx_vport_rdma_unicast_bytes_total{pod_namespace=~\"$namespace\",cluster=~\"$cluster\"}, pod_namespace)", "refId": "StandardVariableQuery" }, "refresh": 2, @@ -3998,7 +4053,7 @@ }, { "current": { - "selected": false, + "selected": true, "text": "10-20-1-50", "value": "10-20-1-50" }, @@ -4006,14 +4061,14 @@ "type": "prometheus", "uid": "${datasource}" }, - "definition": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{}, node_name)", + "definition": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{cluster=~\"$cluster\"}, node_name)", "hide": 0, "includeAll": true, "multi": false, "name": "node", "options": [], "query": { - "query": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{}, node_name)", + "query": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{cluster=~\"$cluster\"}, node_name)", "refId": "StandardVariableQuery" }, "refresh": 1, @@ -4025,14 +4080,14 @@ { "current": { "selected": false, - "text": "pytorch-sample-master-0", - "value": "pytorch-sample-master-0" + "text": "rdma-test-gpu-tool-pdrvn", + "value": "rdma-test-gpu-tool-pdrvn" }, "datasource": { "type": "prometheus", "uid": "${datasource}" }, - "definition": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{pod_namespace=~\"$namespace\"}, pod_name)", + "definition": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{pod_namespace=~\"$namespace\",cluster=~\"$cluster\"}, pod_name)", "hide": 0, "includeAll": false, "label": "pod", @@ -4040,7 +4095,7 @@ "name": "pod", "options": [], "query": { - "query": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{pod_namespace=~\"$namespace\"}, pod_name)", + "query": "label_values(rdma_tx_vport_rdma_unicast_bytes_total{pod_namespace=~\"$namespace\",cluster=~\"$cluster\"}, pod_name)", "refId": "StandardVariableQuery" }, "refresh": 2, @@ -4052,13 +4107,13 @@ ] }, "time": { - "from": "now-3h", + "from": "now-30d", "to": "now" }, "timepicker": {}, "timezone": "", "title": "Spiderpool RDMA | Pod", "uid": "DenUibiNk", - 
"version": 17, + "version": 19, "weekStart": "" } \ No newline at end of file diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-workload.json b/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-workload.json index 0d6b78cdc..2e59e0442 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-workload.json +++ b/charts/spiderpool/spiderpool/charts/spiderpool/files/grafana-rdma-workload.json @@ -24,7 +24,7 @@ "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, - "id": 12, + "id": 20, "links": [], "liveNow": false, "panels": [ @@ -121,7 +121,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by (owner_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\"}[1m])\n)", + "expr": "sum by (owner_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\",cluster=~\"$cluster\"}[1m])\n +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\",cluster=~\"$cluster\"}[1m])\n)", "legendFormat": "__auto", "range": true, "refId": "A" @@ -211,7 +211,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by (owner_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\"}[1m])\n)", + "expr": "sum by (owner_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\",cluster=~\"$cluster\"}[1m])\n +\n rate(rdma_tx_vport_rdma_multicast_bytes_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\",cluster=~\"$cluster\"}[1m])\n)", "legendFormat": "__auto", "range": true, "refId": "A" @@ -330,7 +330,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by (pod_name) (\n 
rate(rdma_rx_vport_rdma_unicast_bytes_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\"}[1m])\n)", + "expr": "sum by (pod_name) (\n rate(rdma_rx_vport_rdma_unicast_bytes_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\",cluster=~\"$cluster\"}[1m])\n +\n rate(rdma_rx_vport_rdma_multicast_bytes_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\",cluster=~\"$cluster\"}[1m])\n)", "interval": "", "legendFormat": "{{pod_name}}", "range": true, @@ -436,7 +436,7 @@ "uid": "${datasource}" }, "editorMode": "code", - "expr": "sum by (pod_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\"}[1m])\n)", + "expr": "sum by (pod_name) (\n rate(rdma_tx_vport_rdma_unicast_bytes_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\",cluster=~\"$cluster\"}[1m])\n +\n rate(rdma_tx_vport_rdma_multicast_bytes_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\",cluster=~\"$cluster\"}[1m])\n)", "interval": "", "legendFormat": "{{pod_name}}", "range": true, @@ -546,7 +546,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum by (pod_name) (\n rate(rdma_rx_vport_rdma_unicast_packets_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\"}[1m])\n)", + "expr": "sum by (pod_name) (\n rate(rdma_rx_vport_rdma_unicast_packets_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\",cluster=~\"$cluster\"}[1m])\n)", "format": "time_series", "instant": false, "interval": "", @@ -658,7 +658,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum by (pod_name) (\n rate(rdma_tx_vport_rdma_unicast_packets_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\"}[1m])\n)", + "expr": "sum by (pod_name) (\n 
rate(rdma_tx_vport_rdma_unicast_packets_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\",cluster=~\"$cluster\"}[1m])\n)", "format": "time_series", "instant": false, "interval": "", @@ -770,7 +770,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum by (pod_name) (\n rate(rdma_tx_vport_rdma_multicast_packets_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\"}[1m])\n)", + "expr": "sum by (pod_name) (\n rate(rdma_tx_vport_rdma_multicast_packets_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\",cluster=~\"$cluster\"}[1m])\n)", "format": "time_series", "instant": false, "interval": "", @@ -882,7 +882,7 @@ }, "editorMode": "code", "exemplar": false, - "expr": "sum by (pod_name) (\n rate(rdma_tx_vport_rdma_multicast_packets_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\"}[1m])\n)", + "expr": "sum by (pod_name) (\n rate(rdma_tx_vport_rdma_multicast_packets_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\",cluster=~\"$cluster\"}[1m])\n)", "format": "time_series", "instant": false, "interval": "", @@ -893,6 +893,194 @@ ], "title": "Packets per Pod | multicast | Write", "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + 
"thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 26 + }, + "id": 37, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_out_of_sequence_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\",cluster=~\"$cluster\"}[1m])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{pod_name}} - {{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "Pod Out of sequence ", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "description": "", + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "", + "axisPlacement": "auto", + "axisSoftMin": 0, + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 2, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 26 + }, + "id": 38, + "options": { + "legend": { + "calcs": [], + 
"displayMode": "list", + "placement": "right", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "rate(rdma_packet_seq_err_total{owner_namespace=~\"$namespace\", owner_kind=~\"$kind\", owner_name=~\"$name\",cluster=~\"$cluster\"}[1m])", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{pod_name}} - {{net_dev_name}} - {{ifname}}", + "range": true, + "refId": "A" + } + ], + "title": "Pod Packet sequence error", + "type": "timeseries" } ], "refresh": false, @@ -919,6 +1107,62 @@ "skipUrlSync": false, "type": "datasource" }, + { + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(global_cluster_info, cluster_name)", + "hide": 0, + "includeAll": false, + "label": "Cluster", + "multi": false, + "name": "cluster_name", + "options": [], + "query": { + "query": "label_values(global_cluster_info, cluster_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "isNone": true, + "selected": false, + "text": "None", + "value": "" + }, + "datasource": { + "type": "prometheus", + "uid": "${datasource}" + }, + "definition": "label_values(global_cluster_info{cluster_name=\"$cluster_name\"}, cluster)", + "hide": 2, + "includeAll": false, + "label": "", + "multi": false, + "name": "cluster", + "options": [], + "query": { + "query": "label_values(global_cluster_info{cluster_name=\"$cluster_name\"}, cluster)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 1, + "type": "query" + }, { "current": { "selected": false, @@ -929,14 +1173,14 @@ "type": "prometheus", "uid": 
"${datasource}" }, - "definition": "label_values(rdma_vport_speed_mbps_total{}, owner_kind)", + "definition": "label_values(rdma_vport_speed_mbps_total{cluster=~\"$cluster\"}, owner_kind)", "hide": 0, "includeAll": false, "multi": false, "name": "kind", "options": [], "query": { - "query": "label_values(rdma_vport_speed_mbps_total{}, owner_kind)", + "query": "label_values(rdma_vport_speed_mbps_total{cluster=~\"$cluster\"}, owner_kind)", "refId": "StandardVariableQuery" }, "refresh": 1, @@ -955,14 +1199,14 @@ "type": "prometheus", "uid": "${datasource}" }, - "definition": "label_values(rdma_vport_speed_mbps_total{owner_kind=~\"$kind\"}, owner_namespace)", + "definition": "label_values(rdma_vport_speed_mbps_total{owner_kind=~\"$kind\",cluster=~\"$cluster\"}, owner_namespace)", "hide": 0, "includeAll": false, "multi": false, "name": "namespace", "options": [], "query": { - "query": "label_values(rdma_vport_speed_mbps_total{owner_kind=~\"$kind\"}, owner_namespace)", + "query": "label_values(rdma_vport_speed_mbps_total{owner_kind=~\"$kind\",cluster=~\"$cluster\"}, owner_namespace)", "refId": "StandardVariableQuery" }, "refresh": 1, @@ -981,14 +1225,14 @@ "type": "prometheus", "uid": "${datasource}" }, - "definition": "label_values(rdma_vport_speed_mbps_total{owner_kind=~\"$kind\", owner_namespace=~\"$namespace\"}, owner_name)", + "definition": "label_values(rdma_vport_speed_mbps_total{owner_kind=~\"$kind\", owner_namespace=~\"$namespace\",cluster=~\"$cluster\"}, owner_name)", "hide": 0, "includeAll": false, "multi": false, "name": "name", "options": [], "query": { - "query": "label_values(rdma_vport_speed_mbps_total{owner_kind=~\"$kind\", owner_namespace=~\"$namespace\"}, owner_name)", + "query": "label_values(rdma_vport_speed_mbps_total{owner_kind=~\"$kind\", owner_namespace=~\"$namespace\",cluster=~\"$cluster\"}, owner_name)", "refId": "StandardVariableQuery" }, "refresh": 1, @@ -1000,13 +1244,13 @@ ] }, "time": { - "from": "now-3h", + "from": "now-90d", "to": "now" 
}, "timepicker": {}, "timezone": "", "title": "Spiderpool RDMA | AI Workload", "uid": "AAT6f2ZNz", - "version": 40, + "version": 45, "weekStart": "" } \ No newline at end of file diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/templates/configmap.yaml b/charts/spiderpool/spiderpool/charts/spiderpool/templates/configmap.yaml index 2202605c5..f51ae95a3 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/templates/configmap.yaml +++ b/charts/spiderpool/spiderpool/charts/spiderpool/templates/configmap.yaml @@ -13,6 +13,7 @@ metadata: {{- include "tplvalues.render" ( dict "value" .Values.global.commonAnnotations "context" $ ) | nindent 4 }} {{- end }} data: + clusterNetwork: {{ .Values.multus.multusCNI.defaultCniCRName | quote }} conf.yml: | ipamUnixSocketPath: {{ .Values.global.ipamUNIXSocketHostPath }} enableIPv4: {{ .Values.ipam.enableIPv4 }} @@ -36,7 +37,7 @@ data: kind: ConfigMap apiVersion: v1 metadata: - name: {{ .Values.multus.multusCNI.name | trunc 63 | trimSuffix "-" }} + name: {{ .Values.multus.multusCNI.name | trunc 63 | trimSuffix "-" }}-entrypoint namespace: {{ .Release.Namespace | quote }} labels: {{- include "spiderpool.multus.labels" . | nindent 4 }} @@ -44,23 +45,160 @@ metadata: {{- include "tplvalues.render" ( dict "value" .Values.global.commonLabels "context" $ ) | nindent 4 }} {{- end }} data: - cni-conf.json: | + entrypoint.sh: | + #!/bin/bash + set -e + + function log(){ + echo "INFO: $(date --iso-8601=seconds) ${1}" + } + function error(){ + log "ERR: {$1}" + } + function warn(){ + log "WARN: {$1}" + } + + function generateKubeConfig { + # Check if we're running as a k8s pod. + if [ -f "$SERVICE_ACCOUNT_TOKEN_PATH" ]; then + # We're running as a k8d pod - expect some variables. 
+ if [ -z ${KUBERNETES_SERVICE_HOST} ]; then + error "KUBERNETES_SERVICE_HOST not set"; exit 1; + fi + if [ -z ${KUBERNETES_SERVICE_PORT} ]; then + error "KUBERNETES_SERVICE_PORT not set"; exit 1; + fi + + if [ "$SKIP_TLS_VERIFY" == "true" ]; then + TLS_CFG="insecure-skip-tls-verify: true" + elif [ -f "$KUBE_CA_FILE" ]; then + TLS_CFG="certificate-authority-data: $(cat $KUBE_CA_FILE | base64 | tr -d '\n')" + fi + + # Get the contents of service account token. + SERVICEACCOUNT_TOKEN=$(cat $SERVICE_ACCOUNT_TOKEN_PATH) + + SKIP_TLS_VERIFY=${SKIP_TLS_VERIFY:-false} + + # Write a kubeconfig file for the CNI plugin. Do this + # to skip TLS verification for now. We should eventually support + # writing more complete kubeconfig files. This is only used + # if the provided CNI network config references it. + touch $MULTUS_TEMP_KUBECONFIG + chmod ${KUBECONFIG_MODE:-600} $MULTUS_TEMP_KUBECONFIG + # Write the kubeconfig to a temp file first. + timenow=$(date) + cat > $MULTUS_TEMP_KUBECONFIG < $MULTUS_TEMP_CONFIG << EOF { "cniVersion": "0.3.1", "name": "multus-cni-network", "type": "multus", "confDir": "/etc/cni/net.d/" , - "logLevel": "{{ .Values.multus.multusCNI.log.logLevel }}", - "logFile": "{{ .Values.multus.multusCNI.log.logFile }}", + "logLevel": "debug", + "logFile": "/var/log/multus.log", "capabilities": { "portMappings": true, "bandwidth": true }, "namespaceIsolation": false, - "clusterNetwork": "{{ .Values.multus.multusCNI.defaultCniCRName }}", + "clusterNetwork": "$MULTUS_CLUSTER_NETWORK", "defaultNetworks": [], - "multusNamespace": "{{ .Release.Namespace }}", + "multusNamespace": "$MULTUS_NAMESPACE", "systemNamespaces": [], "kubeconfig": "/etc/cni/net.d/multus.d/multus.kubeconfig" } -{{- end }} + EOF + + if [ -z "${MULTUS_CLUSTER_NETWORK}" ]; then + log "ENV MULTUS_CLUSTER_NETWORK is empty, Detecting default cni in the ${CNI_CONF_DIR}" + DEFAULT_CNI_FILEPATH=$(ls -l ${CNI_CONF_DIR} | grep ^- | grep -v -i multus | awk '{print $9}' | grep -E 
'(*\.conf|*\.conflist|*\.json)' | head -n 1) + if [ -z "$DEFAULT_CNI_FILEPATH" ] ; then + error "No default cni file found in ${CNI_CONF_DIR}, please install your default cni in the cluster first" && exit 1 + fi + + log "Found the default-cni file: ${DEFAULT_CNI_FILEPATH}" + log "cat /host/etc/cni/net.d/${DEFAULT_CNI_FILEPATH}:" + cat /host/etc/cni/net.d/${DEFAULT_CNI_FILEPATH} + + echo "" + DEFAULT_CNI_NAME=$(grep '"name":' ${CNI_CONF_DIR}/${DEFAULT_CNI_FILEPATH} | awk '{print $2}' | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' | tr -d ',' | tr -d '"') + if [ -z "$DEFAULT_CNI_NAME" ] ; then + error "The name fleid shouldn't be empty, please check the default cni: ${DEFAULT_CNI_FILEPATH}" && exit 1 + fi + + log "Updating the clusterNetwork of the multus-cni config to $DEFAULT_CNI_NAME" + sed -i "s?\"clusterNetwork\": \"\"?\"clusterNetwork\": \"${DEFAULT_CNI_NAME}\"?g" /tmp/00-multus.conf + else + log "User set multus ClusterNetwork: $MULTUS_CLUSTER_NETWORK" + fi + + generateKubeConfig + log "multus kubeconfig is generated." + + cp $MULTUS_TEMP_CONFIG /host/etc/cni/net.d + log "multus config file ${MULTUS_TEMP_CONFIG} is copied to ${CNI_CONF_DIR}." + log "cat ${CNI_CONF_DIR}/00-multus.conf" + cat ${CNI_CONF_DIR}/00-multus.conf + + log "Entering watch loop..." + while true; do + + # Check the md5sum of the service account token and ca. + svcaccountsum=$(md5sum $SERVICE_ACCOUNT_TOKEN_PATH | awk '{print $1}') + casum=$(md5sum $KUBE_CA_FILE | awk '{print $1}') + if [ "$svcaccountsum" != "$LAST_SERVICEACCOUNT_MD5SUM" ] || [ "$casum" != "$LAST_KUBE_CA_FILE_MD5SUM" ]; then + log "Detected service account or CA file change, regenerating kubeconfig..." + generateKubeConfig + fi + + # todo: watch the default cni file is changed. 
+ sleep 10 + done +{{- end }} \ No newline at end of file diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/templates/daemonset.yaml b/charts/spiderpool/spiderpool/charts/spiderpool/templates/daemonset.yaml index af672a82b..2478b3562 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/templates/daemonset.yaml +++ b/charts/spiderpool/spiderpool/charts/spiderpool/templates/daemonset.yaml @@ -106,6 +106,22 @@ spec: - name: cni-bin-path mountPath: /host/opt/cni/bin {{- end }} + {{- if .Values.multus.multusCNI.install }} + - name: install-multus-binary + image: {{ include "spiderpool.multus.image" . | quote }} + imagePullPolicy: IfNotPresent + command: + - /install_multus + args: + - --type + - thin + securityContext: + privileged: true + volumeMounts: + - mountPath: /host/opt/cni/bin + mountPropagation: Bidirectional + name: cni-bin-path + {{- end }} containers: - name: {{ .Values.spiderpoolAgent.name | trunc 63 | trimSuffix "-" }} image: {{ include "spiderpool.spiderpoolAgent.image" . | quote }} @@ -234,21 +250,30 @@ spec: {{- end }} {{- if .Values.multus.multusCNI.install }} - name: multus-cni - imagePullPolicy: {{ .Values.multus.multusCNI.image.pullPolicy }} - image: {{ include "spiderpool.multus.image" . | quote }} + image: {{ include "spiderpool.spiderpoolAgent.image" . 
| quote }} + imagePullPolicy: {{ .Values.spiderpoolAgent.image.pullPolicy }} command: - - "/bin/sh" - - "-c" - - | - ITEM="multus" - rm -f /host/opt/cni/bin/${ITEM}.old || true - ( [ -f "/host/opt/cni/bin/${ITEM}" ] && mv /host/opt/cni/bin/${ITEM} /host/opt/cni/bin/${ITEM}.old ) || true - cp /usr/src/multus-cni/bin/${ITEM} /host/opt/cni/bin/${ITEM} - rm -f /host/opt/cni/bin/${ITEM}.old &>/dev/null || true - ./entrypoint.sh --multus-conf-file=/tmp/multus-conf/00-multus.conf \ - --cni-version=0.3.1 + - "/home/entrypoint.sh" securityContext: privileged: true + env: + - name: MULTUS_CLUSTER_NETWORK + valueFrom: + configMapKeyRef: + key: clusterNetwork + name: spiderpool-conf + - name: MULTUS_NAMESPACE + valueFrom: + fieldRef: + apiVersion: v1 + fieldPath: metadata.namespace + resources: + limits: + cpu: 100m + memory: 50Mi + requests: + cpu: 100m + memory: 50Mi {{- if .Values.multus.multusCNI.uninstall }} lifecycle: preStop: @@ -264,11 +289,8 @@ spec: volumeMounts: - name: cni mountPath: /host/etc/cni/net.d - - name: cni-bin-path - mountPath: /host/opt/cni/bin - mountPropagation: Bidirectional - - name: multus-cfg - mountPath: /tmp/multus-conf + - mountPath: /home + name: multus-entrypoint {{- if .Values.multus.multusCNI.extraVolumes }} {{- include "tplvalues.render" ( dict "value" .Values.multus.multusCNI.extraVolumeMounts "context" $ ) | nindent 12 }} {{- end }} @@ -304,6 +326,13 @@ spec: items: - key: cni-conf.json path: 00-multus.conf + - name: multus-entrypoint + configMap: + name: {{ .Values.multus.multusCNI.name | trunc 63 | trimSuffix "-" }}-entrypoint + defaultMode: 511 + items: + - key: entrypoint.sh + path: entrypoint.sh {{- end }} {{- if .Values.spiderpoolAgent.extraVolumeMounts }} {{- include "tplvalues.render" ( dict "value" .Values.spiderpoolAgent.extraVolumeMounts "context" $ ) | nindent 6 }} diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/templates/grafanaDashboard.yaml 
b/charts/spiderpool/spiderpool/charts/spiderpool/templates/grafanaDashboard.yaml index 451bff910..9c6be12e3 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/templates/grafanaDashboard.yaml +++ b/charts/spiderpool/spiderpool/charts/spiderpool/templates/grafanaDashboard.yaml @@ -2,7 +2,7 @@ apiVersion: integreatly.org/v1alpha1 kind: GrafanaDashboard metadata: - name: {{ default "spiderpool" .Values.global.nameOverride }} + name: {{ default "spiderpool" .Values.global.nameOverride }}-ipam namespace: {{ default .Release.Namespace .Values.grafanaDashboard.namespace }} labels: {{- if .Values.global.commonLabels }} @@ -21,746 +21,5 @@ metadata: {{- end }} {{- end }} spec: - json: |- - { - "annotations": { - "list": [ - { - "builtIn": 1, - "datasource": { - "type": "grafana", - "uid": "-- Grafana --" - }, - "enable": true, - "hide": true, - "iconColor": "rgba(0, 211, 255, 1)", - "name": "Annotations & Alerts", - "target": { - "limit": 100, - "matchAny": false, - "tags": [], - "type": "dashboard" - }, - "type": "dashboard" - } - ] - }, - "editable": true, - "fiscalYearStartMonth": 0, - "graphTooltip": 0, - "id": 1, - "links": [], - "liveNow": false, - "panels": [ - { - "collapsed": false, - "gridPos": { - "h": 1, - "w": 24, - "x": 0, - "y": 0 - }, - "id": 6, - "panels": [], - "title": "Row title", - "type": "row" - }, - { - "datasource": { - "type": "prometheus" - }, - "fieldConfig": { - "defaults": { - "mappings": [], - "thresholds": { - "mode": "percentage", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "orange", - "value": 70 - }, - { - "color": "red", - "value": 85 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 0, - "y": 1 - }, - "id": 18, - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "pluginVersion": "9.1.6", - "targets": [ - { - 
"datasource": { - "type": "prometheus" - }, - "editorMode": "builder", - "expr": "spiderpool_total_ippool_counts", - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "title": "total ippool counts", - "type": "gauge" - }, - { - "datasource": { - "type": "prometheus" - }, - "fieldConfig": { - "defaults": { - "mappings": [], - "thresholds": { - "mode": "percentage", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "orange", - "value": 70 - }, - { - "color": "red", - "value": 85 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 7, - "w": 12, - "x": 12, - "y": 1 - }, - "id": 20, - "options": { - "orientation": "auto", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "showThresholdLabels": false, - "showThresholdMarkers": true - }, - "pluginVersion": "9.1.6", - "targets": [ - { - "datasource": { - "type": "prometheus" - }, - "editorMode": "builder", - "expr": "spiderpool_total_subnet_counts", - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "title": "total subnet counts", - "type": "gauge" - }, - { - "datasource": { - "type": "prometheus" - }, - "description": "spiderpool IPAM IP allocation status", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 25, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - "lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": 
"red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 10, - "x": 0, - "y": 8 - }, - "id": 2, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus" - }, - "editorMode": "builder", - "expr": "spiderpool_ipam_allocation_counts_total", - "legendFormat": "__auto", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus" - }, - "editorMode": "builder", - "expr": "spiderpool_ipam_allocation_failure_counts_total", - "hide": false, - "legendFormat": "__auto", - "range": true, - "refId": "B" - } - ], - "title": "IP allocation counts", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus" - }, - "description": "", - "fieldConfig": { - "defaults": { - "color": { - "mode": "continuous-GrYlRd" - }, - "custom": { - "fillOpacity": 70, - "lineWidth": 0, - "spanNulls": false - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 8, - "x": 10, - "y": 8 - }, - "id": 10, - "options": { - "alignValue": "left", - "legend": { - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "mergeValues": true, - "rowHeight": 0.9, - "showValue": "auto", - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "9.1.6", - "targets": [ - { - "datasource": { - "type": "prometheus" - }, - "editorMode": "builder", - "expr": "spiderpool_ipam_allocation_average_duration_seconds", - "legendFormat": "__auto", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus" - }, - "editorMode": "builder", - "expr": "spiderpool_ipam_allocation_max_duration_seconds", - "hide": false, - "legendFormat": "__auto", - 
"range": true, - "refId": "B" - }, - { - "datasource": { - "type": "prometheus" - }, - "editorMode": "builder", - "expr": "spiderpool_ipam_allocation_min_duration_seconds", - "hide": false, - "legendFormat": "__auto", - "range": true, - "refId": "C" - }, - { - "datasource": { - "type": "prometheus" - }, - "editorMode": "builder", - "expr": "spiderpool_ipam_allocation_latest_duration_seconds", - "hide": false, - "legendFormat": "__auto", - "range": true, - "refId": "D" - } - ], - "title": "ip allocation durations", - "type": "state-timeline" - }, - { - "datasource": { - "type": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - } - }, - "mappings": [] - }, - "overrides": [] - }, - "gridPos": { - "h": 9, - "w": 6, - "x": 18, - "y": 8 - }, - "id": 12, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "9.1.6", - "targets": [ - { - "datasource": { - "type": "prometheus" - }, - "editorMode": "builder", - "expr": "spiderpool_ipam_allocation_duration_seconds_bucket", - "hide": false, - "legendFormat": "__auto", - "range": true, - "refId": "B" - } - ], - "title": "ip allocation duration distribution", - "type": "piechart" - }, - { - "datasource": { - "type": "prometheus" - }, - "description": "spiderpool IP release and IP GC status", - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "axisCenteredZero": false, - "axisColorMode": "text", - "axisLabel": "", - "axisPlacement": "auto", - "barAlignment": 0, - "drawStyle": "line", - "fillOpacity": 25, - "gradientMode": "none", - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - }, - 
"lineInterpolation": "linear", - "lineWidth": 1, - "pointSize": 5, - "scaleDistribution": { - "type": "linear" - }, - "showPoints": "auto", - "spanNulls": false, - "stacking": { - "group": "A", - "mode": "normal" - }, - "thresholdsStyle": { - "mode": "off" - } - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 10, - "x": 0, - "y": 17 - }, - "id": 4, - "options": { - "legend": { - "calcs": [], - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": "prometheus" - }, - "editorMode": "builder", - "expr": "spiderpool_ipam_release_counts_total", - "legendFormat": "__auto", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus" - }, - "editorMode": "builder", - "expr": "spiderpool_ipam_release_failure_counts_total", - "hide": false, - "legendFormat": "__auto", - "range": true, - "refId": "B" - }, - { - "datasource": { - "type": "prometheus" - }, - "editorMode": "builder", - "expr": "spiderpool_ip_gc_counts_total", - "hide": false, - "legendFormat": "__auto", - "range": true, - "refId": "C" - }, - { - "datasource": { - "type": "prometheus" - }, - "editorMode": "builder", - "expr": "spiderpool_ip_gc_failure_counts_total", - "hide": false, - "legendFormat": "__auto", - "range": true, - "refId": "D" - } - ], - "title": "IP release&GC counts", - "type": "timeseries" - }, - { - "datasource": { - "type": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "continuous-GrYlRd" - }, - "custom": { - "fillOpacity": 70, - "lineWidth": 0, - "spanNulls": false - }, - "mappings": [], - "thresholds": { - "mode": "absolute", - "steps": [ - { - "color": "green", - "value": null - }, - { - "color": "red", - "value": 80 - } - ] - } - }, - 
"overrides": [] - }, - "gridPos": { - "h": 8, - "w": 8, - "x": 10, - "y": 17 - }, - "id": 14, - "options": { - "alignValue": "left", - "legend": { - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "mergeValues": true, - "rowHeight": 0.9, - "showValue": "auto", - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "pluginVersion": "9.1.6", - "targets": [ - { - "datasource": { - "type": "prometheus" - }, - "editorMode": "builder", - "expr": "spiderpool_ipam_release_average_duration_seconds", - "legendFormat": "__auto", - "range": true, - "refId": "A" - }, - { - "datasource": { - "type": "prometheus" - }, - "editorMode": "builder", - "expr": "spiderpool_ipam_release_max_duration_seconds", - "hide": false, - "legendFormat": "__auto", - "range": true, - "refId": "B" - }, - { - "datasource": { - "type": "prometheus" - }, - "editorMode": "builder", - "expr": "spiderpool_ipam_release_min_duration_seconds", - "hide": false, - "legendFormat": "__auto", - "range": true, - "refId": "C" - }, - { - "datasource": { - "type": "prometheus" - }, - "editorMode": "builder", - "expr": "spiderpool_ipam_release_latest_duration_seconds", - "hide": false, - "legendFormat": "__auto", - "range": true, - "refId": "D" - } - ], - "title": "IP release durations", - "type": "state-timeline" - }, - { - "datasource": { - "type": "prometheus" - }, - "fieldConfig": { - "defaults": { - "color": { - "mode": "palette-classic" - }, - "custom": { - "hideFrom": { - "legend": false, - "tooltip": false, - "viz": false - } - }, - "mappings": [] - }, - "overrides": [] - }, - "gridPos": { - "h": 8, - "w": 6, - "x": 18, - "y": 17 - }, - "id": 16, - "options": { - "legend": { - "displayMode": "list", - "placement": "bottom", - "showLegend": true - }, - "pieType": "pie", - "reduceOptions": { - "calcs": [ - "lastNotNull" - ], - "fields": "", - "values": false - }, - "tooltip": { - "mode": "single", - "sort": "none" - } - }, - "targets": [ - { - "datasource": { - "type": 
"prometheus" - }, - "editorMode": "builder", - "expr": "spiderpool_ipam_release_duration_seconds_bucket", - "legendFormat": "__auto", - "range": true, - "refId": "A" - } - ], - "title": "IP release duration distribution", - "type": "piechart" - } - ], - "refresh": false, - "schemaVersion": 37, - "style": "dark", - "tags": [], - "templating": { - "list": [] - }, - "time": { - "from": "now-6h", - "to": "now" - }, - "timepicker": {}, - "timezone": "", - "title": "spiderpool", - "uid": "5FAGqFE4z", - "version": 3, - "weekStart": "" - } + json: {{ .Files.Get "files/grafana-ipam.json" | toJson }} {{- end }} diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/templates/pod.yaml b/charts/spiderpool/spiderpool/charts/spiderpool/templates/pod.yaml index 5e178ea69..8ccf724b8 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/templates/pod.yaml +++ b/charts/spiderpool/spiderpool/charts/spiderpool/templates/pod.yaml @@ -80,19 +80,13 @@ spec: {{- end }} - name: SPIDERPOOL_INIT_ENABLE_MULTUS_CONFIG value: {{ .Values.multus.enableMultusConfig | quote }} - - name: SPIDERPOOL_INIT_INSTALL_MULTUS - value: {{ .Values.multus.multusCNI.install | quote }} - name: SPIDERPOOL_INIT_DEFAULT_CNI_NAME value: {{ .Values.multus.multusCNI.defaultCniCRName | quote }} - name: SPIDERPOOL_INIT_DEFAULT_CNI_NAMESPACE value: {{ .Release.Namespace | quote }} - - name: SPIDERPOOL_INIT_MULTUS_CONFIGMAP - value: {{ .Values.multus.multusCNI.name | trunc 63 | trimSuffix "-" | quote }} {{- if eq .Values.multus.multusCNI.defaultCniCRName "" }} - name: SPIDERPOOL_INIT_DEFAULT_CNI_DIR value: {{ .Values.global.cniConfHostPath | quote }} - - name: SPIDERPOOL_INIT_READINESS_FILE - value: "/etc/spiderpool/ready" volumeMounts: - name: cni mountPath: {{ .Values.global.cniConfHostPath }} diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/templates/servicemonitor.yaml b/charts/spiderpool/spiderpool/charts/spiderpool/templates/servicemonitor.yaml index 1efa34623..4e682105a 100644 --- 
a/charts/spiderpool/spiderpool/charts/spiderpool/templates/servicemonitor.yaml +++ b/charts/spiderpool/spiderpool/charts/spiderpool/templates/servicemonitor.yaml @@ -5,6 +5,12 @@ metadata: name: {{ .Values.spiderpoolAgent.name | trunc 63 | trimSuffix "-" }} namespace: {{ default .Release.Namespace .Values.spiderpoolAgent.prometheus.serviceMonitor.namespace }} labels: + {{- if .Values.global.commonLabels }} + {{- include "tplvalues.render" ( dict "value" .Values.global.commonLabels "context" $ ) | nindent 4 }} + {{- end }} + {{- if .Values.spiderpoolAgent.prometheus.serviceMonitor.labels }} + {{- include "tplvalues.render" ( dict "value" .Values.spiderpoolAgent.prometheus.serviceMonitor.labels "context" $ ) | nindent 4 }} + {{- end }} {{- if or .Values.global.commonAnnotations .Values.spiderpoolAgent.prometheus.serviceMonitor.annotations }} annotations: {{- if .Values.global.commonAnnotations }} @@ -36,6 +42,13 @@ kind: ServiceMonitor metadata: name: {{ .Values.spiderpoolController.name | trunc 63 | trimSuffix "-" }} namespace: {{ default .Release.Namespace .Values.spiderpoolController.prometheus.serviceMonitor.namespace }} + labels: + {{- if .Values.global.commonLabels }} + {{- include "tplvalues.render" ( dict "value" .Values.global.commonLabels "context" $ ) | nindent 4 }} + {{- end }} + {{- if .Values.spiderpoolController.prometheus.serviceMonitor.labels }} + {{- include "tplvalues.render" ( dict "value" .Values.spiderpoolController.prometheus.serviceMonitor.labels "context" $ ) | nindent 4 }} + {{- end }} {{- if or .Values.global.commonAnnotations .Values.spiderpoolController.prometheus.serviceMonitor.annotations }} annotations: {{- if .Values.global.commonAnnotations }} diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/templates/tls.yaml b/charts/spiderpool/spiderpool/charts/spiderpool/templates/tls.yaml index 940d4274b..325485815 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/templates/tls.yaml +++ b/charts/spiderpool/spiderpool/charts/spiderpool/templates/tls.yaml @@ -144,6 +144,48 @@
webhooks: - spidercoordinators sideEffects: None {{- end }} +{{- if .Values.spiderpoolController.podResourceInject.enabled }} +- admissionReviewVersions: + - v1 + clientConfig: + service: + name: {{ .Values.spiderpoolController.name | trunc 63 | trimSuffix "-" }} + namespace: {{ .Release.Namespace }} + path: /mutate--v1-pod + port: {{ .Values.spiderpoolController.webhookPort }} + {{- if (eq .Values.spiderpoolController.tls.method "provided") }} + caBundle: {{ .Values.spiderpoolController.tls.provided.tlsCa | required "missing spiderpoolController.tls.provided.tlsCa" }} + {{- else if (eq .Values.spiderpoolController.tls.method "auto") }} + caBundle: {{ .ca.Cert | b64enc }} + {{- end }} + failurePolicy: Fail + name: pods.spiderpool.spidernet.io + {{- if or .Values.spiderpoolController.podResourceInject.namespacesExclude .Values.spiderpoolController.podResourceInject.namespacesInclude }} + namespaceSelector: + matchExpressions: + {{- if .Values.spiderpoolController.podResourceInject.namespacesExclude }} + - key: kubernetes.io/metadata.name + operator: NotIn + values: {{ toYaml .Values.spiderpoolController.podResourceInject.namespacesExclude | nindent 8 }} + {{- end }} + {{- if .Values.spiderpoolController.podResourceInject.namespacesInclude }} + - key: kubernetes.io/metadata.name + operator: In + values: {{ toYaml .Values.spiderpoolController.podResourceInject.namespacesInclude | nindent 8 }} + {{- end }} + {{- end }} + rules: + - apiGroups: + - "" + apiVersions: + - v1 + operations: + - CREATE + - UPDATE + resources: + - pods + sideEffects: None +{{- end }} --- apiVersion: admissionregistration.k8s.io/v1 kind: ValidatingWebhookConfiguration diff --git a/charts/spiderpool/spiderpool/charts/spiderpool/values.yaml b/charts/spiderpool/spiderpool/charts/spiderpool/values.yaml index ab33cc0f8..a8bc59219 100644 --- a/charts/spiderpool/spiderpool/charts/spiderpool/values.yaml +++ b/charts/spiderpool/spiderpool/charts/spiderpool/values.yaml @@ -233,8 +233,7 @@ multus: digest: 
"" ## @param multus.multusCNI.image.tag the multus-CNI image tag - tag: v3.9.3 - # tag: v4.0.2-thick + tag: v4.1.4 ## @param multus.multusCNI.image.imagePullSecrets the multus-CNI image imagePullSecrets imagePullSecrets: [] @@ -302,7 +301,7 @@ plugins: digest: "" ## @param plugins.image.tag the image tag of plugins - tag: 82659d90cae0d6a5169eac2869e47c989932d775 + tag: 27c4f118b1cec3773f2679b772e7583fc77e5686 ## @param plugins.image.imagePullSecrets the image imagePullSecrets of plugins imagePullSecrets: [] @@ -369,7 +368,7 @@ spiderpoolAgent: digest: "" ## @param spiderpoolAgent.image.tag the image tag of spiderpoolAgent, overrides the image tag whose default is the chart appVersion. - tag: v1.0.0-rc3 + tag: v1.0.0-rc4 ## @param spiderpoolAgent.image.imagePullSecrets the image imagePullSecrets of spiderpoolAgent imagePullSecrets: [] @@ -559,7 +558,7 @@ spiderpoolController: digest: "" ## @param spiderpoolController.image.tag the image tag of spiderpoolController, overrides the image tag whose default is the chart appVersion. - tag: v1.0.0-rc3 + tag: v1.0.0-rc4 ## @param spiderpoolController.image.imagePullSecrets the image imagePullSecrets of spiderpoolController imagePullSecrets: [] @@ -805,7 +804,7 @@ spiderpoolInit: digest: "" ## @param spiderpoolInit.image.tag the image tag of spiderpoolInit, overrides the image tag whose default is the chart appVersion. 
- tag: v1.0.0-rc3 + tag: v1.0.0-rc4 ## @param spiderpoolInit.image.imagePullSecrets the image imagePullSecrets of spiderpoolInit imagePullSecrets: [] diff --git a/charts/spiderpool/spiderpool/values.yaml b/charts/spiderpool/spiderpool/values.yaml index 330011bb3..d5a0470f2 100644 --- a/charts/spiderpool/spiderpool/values.yaml +++ b/charts/spiderpool/spiderpool/values.yaml @@ -179,9 +179,7 @@ spiderpool: ## @param multus.multusCNI.image.digest the multus-CNI image digest digest: "" ## @param multus.multusCNI.image.tag the multus-CNI image tag - tag: v3.9.3 - # tag: v4.0.2-thick - + tag: v4.1.4 ## @param multus.multusCNI.image.imagePullSecrets the multus-CNI image imagePullSecrets imagePullSecrets: [] # - name: "image-pull-secret" @@ -232,7 +230,7 @@ spiderpool: ## @param plugins.image.digest the image digest of plugins digest: "" ## @param plugins.image.tag the image tag of plugins - tag: 82659d90cae0d6a5169eac2869e47c989932d775 + tag: 27c4f118b1cec3773f2679b772e7583fc77e5686 ## @param plugins.image.imagePullSecrets the image imagePullSecrets of plugins imagePullSecrets: [] ## @section clusterDefaultPool parameters @@ -281,7 +279,7 @@ spiderpool: ## @param spiderpoolAgent.image.digest the image digest of spiderpoolAgent, which takes preference over tag digest: "" ## @param spiderpoolAgent.image.tag the image tag of spiderpoolAgent, overrides the image tag whose default is the chart appVersion. - tag: v1.0.0-rc3 + tag: v1.0.0-rc4 ## @param spiderpoolAgent.image.imagePullSecrets the image imagePullSecrets of spiderpoolAgent imagePullSecrets: [] # - name: "image-pull-secret" @@ -424,7 +422,7 @@ spiderpool: ## @param spiderpoolController.image.digest the image digest of spiderpoolController, which takes preference over tag digest: "" ## @param spiderpoolController.image.tag the image tag of spiderpoolController, overrides the image tag whose default is the chart appVersion. 
- tag: v1.0.0-rc3 + tag: v1.0.0-rc4 ## @param spiderpoolController.image.imagePullSecrets the image imagePullSecrets of spiderpoolController imagePullSecrets: [] # - name: "image-pull-secret" @@ -638,7 +636,7 @@ spiderpool: ## @param spiderpoolInit.image.digest the image digest of spiderpoolInit, which takes preference over tag digest: "" ## @param spiderpoolInit.image.tag the image tag of spiderpoolInit, overrides the image tag whose default is the chart appVersion. - tag: v1.0.0-rc3 + tag: v1.0.0-rc4 ## @param spiderpoolInit.image.imagePullSecrets the image imagePullSecrets of spiderpoolInit imagePullSecrets: [] # - name: "image-pull-secret"