From ca118d3111bdba8853c7a1e81d6879a814f02ba1 Mon Sep 17 00:00:00 2001 From: Yusuke Kadowaki Date: Mon, 30 Oct 2023 13:32:26 +0900 Subject: [PATCH] Add index correction metrics (#2215) * Add observability pkg for index correction * Add template grafana dashboard page of index correction * Update label * Add checked index count * Fix o11y service name * Add correction metrics fields in the grafana * Init correction metrics * Organized grafana dashboard for index correction * Fix misspell * Remove unused method receiver --- charts/vald/values.yaml | 2 +- .../index/job/correction/correction.go | 108 ++ .../dashboards/09-vald-index-correction.yaml | 1410 +++++++++++++++++ k8s/metrics/grafana/deployment.yaml | 6 + pkg/index/job/correction/service/corrector.go | 37 +- pkg/index/job/correction/usecase/corrector.go | 6 +- 6 files changed, 1560 insertions(+), 9 deletions(-) create mode 100644 internal/observability/metrics/index/job/correction/correction.go create mode 100644 k8s/metrics/grafana/dashboards/09-vald-index-correction.yaml diff --git a/charts/vald/values.yaml b/charts/vald/values.yaml index 46035e6197..6d302054fc 100644 --- a/charts/vald/values.yaml +++ b/charts/vald/values.yaml @@ -2676,7 +2676,7 @@ manager: observability: otlp: attribute: - service_name: vald-manager-index + service_name: vald-index-correction # @schema {"name": "manager.index.corrector.enabled", "type": "boolean"} # manager.index.corrector.enabled -- enable index correction CronJob enabled: false diff --git a/internal/observability/metrics/index/job/correction/correction.go b/internal/observability/metrics/index/job/correction/correction.go new file mode 100644 index 0000000000..0e31345e2b --- /dev/null +++ b/internal/observability/metrics/index/job/correction/correction.go @@ -0,0 +1,108 @@ +package correction + +import ( + "context" + + "github.com/vdaas/vald/internal/observability/metrics" + "github.com/vdaas/vald/pkg/index/job/correction/service" + "go.opentelemetry.io/otel/sdk/metric/aggregation" + "go.opentelemetry.io/otel/sdk/metric/view" +) + +const ( + checkedIndexCount = "index_job_correction_checked_index_count" + checkedIndexCountDesc = "The number of checked indexes while index correction job" + + correctedOldIndexCount = "index_job_correction_corrected_old_index_count" + correctedOldIndexCountDesc = "The number of corrected old indexes while index correction job" + + correctedReplicationCount = "index_job_correction_corrected_replication_count" + correctedReplicationCountDesc = "The number of operation happened to correct replication number while index correction job" +) + +type correctionMetrics struct { + correction service.Corrector +} + +func New(c service.Corrector) metrics.Metric { + return &correctionMetrics{ + correction: c, + } +} + +func (*correctionMetrics) View() ([]*metrics.View, error) { + checkedIndexCount, err := view.New( + view.MatchInstrumentName(checkedIndexCount), + view.WithSetDescription(checkedIndexCountDesc), + view.WithSetAggregation(aggregation.LastValue{}), + ) + if err != nil { + return nil, err + } + + oldIndexCount, err := view.New( + view.MatchInstrumentName(correctedOldIndexCount), + view.WithSetDescription(correctedOldIndexCountDesc), + view.WithSetAggregation(aggregation.LastValue{}), + ) + if err != nil { + return nil, err + } + + replicationCount, err := view.New( + view.MatchInstrumentName(correctedReplicationCount), + view.WithSetDescription(correctedReplicationCountDesc), + view.WithSetAggregation(aggregation.LastValue{}), + ) + if err != nil { + return nil, err + } + + return []*metrics.View{ + &checkedIndexCount, + &oldIndexCount, + &replicationCount, + }, nil +} + +func (c *correctionMetrics) Register(m metrics.Meter) error { + checkedIndexCount, err := m.AsyncInt64().Gauge( + checkedIndexCount, + metrics.WithDescription(checkedIndexCountDesc), + metrics.WithUnit(metrics.Dimensionless), + ) + if err != nil { + return err + } + + oldIndexCount, err := m.AsyncInt64().Gauge( + correctedOldIndexCount, + metrics.WithDescription(correctedOldIndexCountDesc), + metrics.WithUnit(metrics.Dimensionless), + ) + if err != nil { + return err + } + + replicationCount, err := m.AsyncInt64().Gauge( + correctedReplicationCount, + metrics.WithDescription(correctedReplicationCountDesc), + metrics.WithUnit(metrics.Dimensionless), + ) + if err != nil { + return err + } + + return m.RegisterCallback( + []metrics.AsynchronousInstrument{ + checkedIndexCount, + oldIndexCount, + replicationCount, + }, + func(ctx context.Context) { + checkedIndexCount.Observe(ctx, int64(c.correction.NumberOfCheckedIndex())) + oldIndexCount.Observe(ctx, int64(c.correction.NumberOfCorrectedOldIndex())) + replicationCount.Observe(ctx, int64(c.correction.NumberOfCorrectedReplication())) + }, + ) +} diff --git a/k8s/metrics/grafana/dashboards/09-vald-index-correction.yaml b/k8s/metrics/grafana/dashboards/09-vald-index-correction.yaml new file mode 100644 index 0000000000..031f16864a --- /dev/null +++ b/k8s/metrics/grafana/dashboards/09-vald-index-correction.yaml @@ -0,0 +1,1410 @@ +--- +# +# Copyright (C) 2019-2023 vdaas.org vald team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +apiVersion: v1 +kind: ConfigMap +metadata: + name: grafana-dashboards-vald-index-correction +data: + vald-index-correction.json: | + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 0, + "links": [], + "liveNow": false, + "panels": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 0, + "y": 0 + }, + "id": 2, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "/^vald_version$/", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "app_version_info{exported_kubernetes_namespace=\"$Namespace\", kubernetes_name=~\"$ReplicaSet\", target_pod=~\"$PodName\"}", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Vald Version", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 4, + "y": 0 + }, + "id": 4, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "/^go_version$/", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "label_replace(app_version_info{exported_kubernetes_namespace=\"$Namespace\", kubernetes_name=~\"$ReplicaSet\", target_pod=~\"$PodName\"}, \"go_version\", \"v$1\", \"go_version\", \"([^v].*)\")", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Go Version", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 8, + "y": 0 + }, + "id": 6, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "/^go_os$/", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "app_version_info{exported_kubernetes_namespace=\"$Namespace\", kubernetes_name=~\"$ReplicaSet\", target_pod=~\"$PodName\"}", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Go OS", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#299c46", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 100 + }, + { + "color": "#d44a3a", + "value": 300 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 12, + "y": 0 + }, + "id": 8, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "count(kube_pod_info{namespace=\"$Namespace\", pod=~\"$ReplicaSet.*\"})", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Pods ($ReplicaSet)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "decimals": 2, + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#299c46", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 10000000000 + }, + { + "color": "#d44a3a", + "value": 1000000000000 + } + ] + }, + "unit": "decbytes" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 8, + "x": 16, + "y": 0 + }, + "id": 10, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "area", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(container_memory_working_set_bytes{namespace=\"$Namespace\", container=\"$ReplicaSet\", image!=\"\"})", + "format": "time_series", + "interval": "", + "legendFormat": "", + "range": true, + "refId": "A" + } + ], + "title": "Total memory working set ($ReplicaSet)", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 8, + "x": 0, + "y": 3 + }, + "id": 12, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "/^git_commit$/", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "app_version_info{exported_kubernetes_namespace=\"$Namespace\", kubernetes_name=~\"$ReplicaSet\", target_pod=~\"$PodName\"}", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Git Commit", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 8, + "y": 3 + }, + "id": 14, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "lastNotNull" + ], + "fields": "/^build_time$/", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "app_version_info{exported_kubernetes_namespace=\"$Namespace\", kubernetes_name=~\"$ReplicaSet\", target_pod=~\"$PodName\"}", + "format": "table", + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "Build at", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 6 + }, + "hiddenSeries": false, + "id": 16, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.2.0", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(irate(container_cpu_usage_seconds_total{namespace=\"$Namespace\", container=\"$ReplicaSet\", pod=~\"$PodName\", image!=\"\"}[$interval])) by (pod) and on() count(kube_job_created{job_name=\"$ReplicaSet\"}) >= 1", + "interval": "", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "CPU", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:76", + "format": "short", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:77", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#299c46", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 100 + }, + { + "color": "#d44a3a", + "value": 300 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 12, + "y": 6 + }, + "id": 27, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "index_job_correction_checked_index_count{exported_kubernetes_namespace=\"$Namespace\", kubernetes_name=~\"$ReplicaSet\", target_pod=~\"$PodName\"}", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "checked index count", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#299c46", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 100 + }, + { + "color": "#d44a3a", + "value": 300 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 16, + "y": 6 + }, + "id": 28, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "index_job_correction_corrected_old_index_count{exported_kubernetes_namespace=\"$Namespace\", kubernetes_name=~\"$ReplicaSet\", target_pod=~\"$PodName\"}", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "corrected old index count", + "type": "stat" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "mappings": [ + { + "options": { + "match": "null", + "result": { + "text": "N/A" + } + }, + "type": "special" + } + ], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "#299c46", + "value": null + }, + { + "color": "rgba(237, 129, 40, 0.89)", + "value": 100 + }, + { + "color": "#d44a3a", + "value": 300 + } + ] + }, + "unit": "none" + }, + "overrides": [] + }, + "gridPos": { + "h": 6, + "w": 4, + "x": 20, + "y": 6 + }, + "id": 29, + "links": [], + "maxDataPoints": 100, + "options": { + "colorMode": "value", + "graphMode": "none", + "justifyMode": "auto", + "orientation": "horizontal", + "reduceOptions": { + "calcs": [ + "mean" + ], + "fields": "", + "values": false + }, + "text": {}, + "textMode": "auto" + }, + "pluginVersion": "10.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "index_job_correction_corrected_replication_count{exported_kubernetes_namespace=\"$Namespace\", kubernetes_name=~\"$ReplicaSet\", target_pod=~\"$PodName\"}", + "format": "table", + "hide": false, + "instant": true, + "interval": "", + "legendFormat": "", + "refId": "A" + } + ], + "title": "corrected replication count", + "type": "stat" + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 12 + }, + "hiddenSeries": false, + "id": 18, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.2.0", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "exemplar": true, + "expr": "sum(container_memory_working_set_bytes{namespace=\"$Namespace\", container=\"$ReplicaSet\", pod=~\"$PodName\", image!=\"\"}) by (pod) and on() count(kube_job_created{job_name=\"$ReplicaSet\"}) >= 1", + "interval": "", + "legendFormat": "{{pod}}", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "Memory working set", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "$$hashKey": "object:154", + "format": "decbytes", + "logBase": 1, + "min": "0", + "show": true + }, + { + "$$hashKey": "object:155", + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 0, + "y": 14 + }, + "hiddenSeries": false, + "id": 24, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.2.0", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "goroutine_count{exported_kubernetes_namespace=\"$Namespace\", kubernetes_name=~\"$ReplicaSet\", target_pod=~\"$PodName\"}", + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{target_pod}}", + "range": true, + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "goroutine count", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "min": "0", + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + }, + { + "aliasColors": {}, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "links": [] + }, + "overrides": [] + }, + "fill": 0, + "fillGradient": 0, + "gridPos": { + "h": 8, + "w": 12, + "x": 12, + "y": 20 + }, + "hiddenSeries": false, + "id": 26, + "interval": "", + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 1, + "nullPointMode": "null", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "10.2.0", + "pointradius": 2, + "points": false, + "renderer": "flot", + "seriesOverrides": [], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "increase(gc_count{exported_kubernetes_namespace=\"$Namespace\", kubernetes_name=~\"$ReplicaSet\", target_node=~\".+\"}[$interval])", + "format": "time_series", + "instant": false, + "interval": "", + "intervalFactor": 1, + "legendFormat": "{{target_pod}}", + "refId": "A" + } + ], + "thresholds": [], + "timeRegions": [], + "title": "GC count /s", + "tooltip": { + "shared": true, + "sort": 0, + "value_type": "individual" + }, + "type": "graph", + "xaxis": { + "mode": "time", + "show": true, + "values": [] + }, + "yaxes": [ + { + "format": "short", + "logBase": 1, + "show": true + }, + { + "format": "short", + "logBase": 1, + "show": true + } + ], + "yaxis": { + "align": false + } + } + ], + "refresh": "", + "schemaVersion": 38, + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false, + "text": "default", + "value": "default" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(kube_pod_info, namespace)", + "hide": 0, + "includeAll": false, + "label": "namespace", + "multi": false, + "name": "Namespace", + "options": [], + "query": { + "query": "label_values(kube_pod_info, namespace)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "current": { + "selected": false, + "text": "vald-index-correction", + "value": "vald-index-correction" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(app_version_info{server_name=~\"index correction job\"}, kubernetes_name)", + "hide": 0, + "includeAll": false, + "label": "name", + "multi": false, + "name": "ReplicaSet", + "options": [], + "query": { + "query": "label_values(app_version_info{server_name=~\"index correction job\"}, kubernetes_name)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".+", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(app_version_info{server_name=~\"index correction job\", kubernetes_name=\"$ReplicaSet\"}, target_pod)", + "hide": 0, + "includeAll": true, + "label": "pod", + "multi": false, + "name": "PodName", + "options": [], + "query": { + "query": "label_values(app_version_info{server_name=~\"index correction job\", kubernetes_name=\"$ReplicaSet\"}, target_pod)", + "refId": "StandardVariableQuery" + }, + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "type": "query" + }, + { + "auto": false, + "auto_count": 30, + "auto_min": "10s", + "current": { + "selected": true, + "text": "1m", + "value": "1m" + }, + "hide": 0, + "label": "interval", + "name": "interval", + "options": [ + { + "selected": true, + "text": "1m", + "value": "1m" + }, + { + "selected": false, + "text": "2m", + "value": "2m" + }, + { + "selected": false, + "text": "5m", + "value": "5m" + }, + { + "selected": false, + "text": "10m", + "value": "10m" + }, + { + "selected": false, + "text": "30m", + "value": "30m" + }, + { + "selected": false, + "text": "1h", + "value": "1h" + }, + { + "selected": false, + "text": "6h", + "value": "6h" + }, + { + "selected": false, + "text": "12h", + "value": "12h" + }, + { + "selected": false, + "text": "1d", + "value": "1d" + } + ], + "query": "1m,2m,5m,10m,30m,1h,6h,12h,1d", + "refresh": 2, + "skipUrlSync": false, + "type": "interval" + } + ] + }, + "time": { + "from": "now-3h", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Vald Index Correction", + "uid": "a8fc1362-e1b8-419f-91de-5205c1c82476", + "version": 1, + "weekStart": "" + } diff --git a/k8s/metrics/grafana/deployment.yaml b/k8s/metrics/grafana/deployment.yaml index 44abf79f98..19fcf14070 100644 --- a/k8s/metrics/grafana/deployment.yaml +++ b/k8s/metrics/grafana/deployment.yaml @@ -50,6 +50,8 @@ spec: mountPath: /var/lib/grafana/dashboards-vald/07 - name: grafana-dashboards-vald-lb-gateway mountPath: /var/lib/grafana/dashboards-vald/08 + - name: grafana-dashboards-vald-index-correction + mountPath: /var/lib/grafana/dashboards-vald/09 volumes: - name: grafana-datasource-provider configMap: @@ -83,3 +85,7 @@ spec: configMap: defaultMode: 420 name: grafana-dashboards-vald-lb-gateway + - name: grafana-dashboards-vald-index-correction + configMap: + defaultMode: 420 + name: grafana-dashboards-vald-index-correction diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index f5553c73ba..9dfe5abb91 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -52,16 +52,23 @@ const ( type Corrector interface { Start(ctx context.Context) (<-chan error, error) PreStop(ctx context.Context) error + // For metrics + NumberOfCheckedIndex() uint64 + NumberOfCorrectedOldIndex() uint64 + NumberOfCorrectedReplication() uint64 } type correct struct { - cfg *config.Data - discoverer discoverer.Client - agentAddrs []string - indexInfos sync.Map[string, *payload.Info_Index_Count] - uuidsCount uint32 - uncommittedUUIDsCount uint32 - checkedID bbolt.Bbolt + cfg *config.Data + discoverer discoverer.Client + agentAddrs []string + indexInfos sync.Map[string, *payload.Info_Index_Count] + uuidsCount uint32 + uncommittedUUIDsCount uint32 + checkedID bbolt.Bbolt + checkedIndexCount atomic.Uint64 + correctedOldIndexCount atomic.Uint64 + correctedReplicationCount atomic.Uint64 } const filemode = 0o600 @@ -127,6 +134,18 @@ func (c *correct) PreStop(_ context.Context) error { return c.checkedID.Close(true) } +func (c *correct) NumberOfCheckedIndex() uint64 { + return c.checkedIndexCount.Load() +} + +func (c *correct) NumberOfCorrectedOldIndex() uint64 { + return c.correctedOldIndexCount.Load() +} + +func (c *correct) NumberOfCorrectedReplication() uint64 { + return c.correctedReplicationCount.Load() +} + // skipcq: GO-R1005 func (c *correct) correct(ctx context.Context) (err error) { // leftAgentAddrs is the agents' addr that hasn't been corrected yet. @@ -258,6 +277,7 @@ func (c *correct) correct(ctx context.Context) (err error) { // now this id is checked so set it to the disk cache c.checkedID.AsyncSet(bolteg, []byte(id), nil) + c.checkedIndexCount.Add(1) return nil })) @@ -377,6 +397,7 @@ func (c *correct) correctTimestamp(ctx context.Context, targetReplica *vectorRep latest.vec.GetId(), latest.vec.GetTimestamp(), ) + c.correctedOldIndexCount.Add(1) if err := c.updateObject(ctx, replica.addr, latest.vec); err != nil { return err } @@ -417,6 +438,7 @@ func (c *correct) correctReplica( // when there are less replicas than the correct number, add the extra replicas if diff < 0 { log.Infof("replica shortage of vector %s. inserting to other agents...", targetReplica.vec.GetId()) + c.correctedReplicationCount.Add(1) if len(availableAddrs) == 0 { return errors.ErrNoAvailableAgentToInsert } @@ -442,6 +464,7 @@ func (c *correct) correctReplica( // when there are more replicas than the correct number, delete the extra replicas log.Infof("replica oversupply of vector %s. deleting...", targetReplica.vec.GetId()) + c.correctedReplicationCount.Add(1) // delete from myself if err := c.deleteObject(ctx, targetReplica.addr, targetReplica.vec); err != nil { log.Errorf("failed to delete object from agent(%s): %v", targetReplica.addr, err) diff --git a/pkg/index/job/correction/usecase/corrector.go b/pkg/index/job/correction/usecase/corrector.go index 337b308f3c..44907b9c86 100644 --- a/pkg/index/job/correction/usecase/corrector.go +++ b/pkg/index/job/correction/usecase/corrector.go @@ -26,6 +26,7 @@ import ( "github.com/vdaas/vald/internal/net/grpc" "github.com/vdaas/vald/internal/net/grpc/interceptor/server/recover" "github.com/vdaas/vald/internal/observability" + "github.com/vdaas/vald/internal/observability/metrics/index/job/correction" "github.com/vdaas/vald/internal/runner" "github.com/vdaas/vald/internal/safety" "github.com/vdaas/vald/internal/servers/server" @@ -115,7 +116,10 @@ func New(cfg *config.Data) (r runner.Runner, err error) { var obs observability.Observability if cfg.Observability.Enabled { - obs, err = observability.NewWithConfig(cfg.Observability) + obs, err = observability.NewWithConfig( + cfg.Observability, + correction.New(corrector), + ) if err != nil { log.Error("failed to initialize observability") return nil, err