From a1ee9d850554e3a4f88e364abb1744a464d5596a Mon Sep 17 00:00:00 2001 From: ykadowak Date: Fri, 4 Aug 2023 07:04:25 +0000 Subject: [PATCH 001/101] implement the initial framework --- cmd/index/job/correction/main.go | 45 ++++++++++++++++++ pkg/index/job/correction/usecase/corrector.go | 46 +++++++++++++++++++ 2 files changed, 91 insertions(+) create mode 100644 cmd/index/job/correction/main.go create mode 100644 pkg/index/job/correction/usecase/corrector.go diff --git a/cmd/index/job/correction/main.go b/cmd/index/job/correction/main.go new file mode 100644 index 0000000000..423bdfbbe5 --- /dev/null +++ b/cmd/index/job/correction/main.go @@ -0,0 +1,45 @@ +package main + +import ( + "context" + + "github.com/vdaas/vald/internal/errors" + "github.com/vdaas/vald/internal/info" + "github.com/vdaas/vald/internal/log" + "github.com/vdaas/vald/internal/runner" + "github.com/vdaas/vald/internal/safety" + "github.com/vdaas/vald/pkg/index/job/correction/usecase" + + + "github.com/vdaas/vald/pkg/manager/index/config" // FIXME: あとで独自のconfigに切り替え +) + +const ( + maxVersion = "v0.0.10" + minVersion = "v0.0.0" + name = "index correction job" +) + +func main() { + // FIXME: demon前提なので基本的に止まらない。独自のrunnerを作る必要があるか + if err := safety.RecoverFunc(func() error { + return runner.Do( + context.Background(), + runner.WithName(name), + runner.WithVersion(info.Version, maxVersion, minVersion), + runner.WithConfigLoader(func(path string) (interface{}, *config.GlobalConfig, error) { + cfg, err := config.NewConfig(path) + if err != nil { + return nil, nil, errors.Wrap(err, "failed to load "+name+"'s configuration") + } + return cfg, &cfg.GlobalConfig, nil + }), + runner.WithDaemonInitializer(func(cfg interface{}) (runner.Runner, error) { + return usecase.New() + }), + ) + })(); err != nil { + log.Fatal(err, info.Get()) + return + } +} diff --git a/pkg/index/job/correction/usecase/corrector.go b/pkg/index/job/correction/usecase/corrector.go new file mode 100644 index 0000000000..47a1ff58fe --- 
/dev/null +++ b/pkg/index/job/correction/usecase/corrector.go @@ -0,0 +1,46 @@ +package usecase + +import ( + "context" + + "github.com/vdaas/vald/internal/errgroup" + "github.com/vdaas/vald/internal/runner" +) + +type run struct { + eg errgroup.Group + // cfg *config.Data + // server starter.Server + // observability observability.Observability + // indexer service.Indexer +} + +// FIXME: add config +func New() (r runner.Runner, err error) { + eg := errgroup.Get() + return &run{ + eg: eg, + }, nil +} + +func (r *run) PreStart(ctx context.Context) error { + return nil +} + +func (r *run) Start(ctx context.Context) (<-chan error, error) { + ech := make(chan error, 5) // FIXME: magic number 5 + return ech, nil +} + +func (*run) PreStop(context.Context) error { + return nil +} + +func (*run) Stop(context.Context) error { + return nil +} + + +func (r *run) PostStop(ctx context.Context) error { + return nil +} From 8a1221d8da2645fc3e603a173516dc6e86dbdf84 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Mon, 7 Aug 2023 08:33:21 +0000 Subject: [PATCH 002/101] add corrector configuration --- cmd/index/job/correction/sample.yaml | 200 ++++++++++++++++++++++ internal/config/corrector.go | 54 ++++++ pkg/index/job/correction/config/config.go | 75 ++++++++ 3 files changed, 329 insertions(+) create mode 100644 cmd/index/job/correction/sample.yaml create mode 100644 internal/config/corrector.go create mode 100644 pkg/index/job/correction/config/config.go diff --git a/cmd/index/job/correction/sample.yaml b/cmd/index/job/correction/sample.yaml new file mode 100644 index 0000000000..80ddd09082 --- /dev/null +++ b/cmd/index/job/correction/sample.yaml @@ -0,0 +1,200 @@ +# +# Copyright (C) 2019-2023 vdaas.org vald team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +--- +version: v0.0.0 +time_zone: JST +logging: + format: raw + level: debug + logger: glg +server_config: + servers: + - name: grpc + host: 0.0.0.0 + port: 8081 + grpc: + bidirectional_stream_concurrency: 20 + connection_timeout: "" + header_table_size: 0 + initial_conn_window_size: 0 + initial_window_size: 0 + interceptors: [] + keepalive: + max_conn_age: "" + max_conn_age_grace: "" + max_conn_idle: "" + time: "" + timeout: "" + max_header_list_size: 0 + max_receive_message_size: 0 + max_send_message_size: 0 + read_buffer_size: 0 + write_buffer_size: 0 + mode: GRPC + probe_wait_time: 3s + restart: true + health_check_servers: + - name: readiness + host: 0.0.0.0 + port: 3001 + http: + handler_timeout: "" + idle_timeout: "" + read_header_timeout: "" + read_timeout: "" + shutdown_duration: 0s + write_timeout: "" + mode: "" + probe_wait_time: 3s + metrics_servers: + startup_strategy: + - grpc + - readiness + full_shutdown_duration: 600s + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + key: /path/to/key +corrector: + agent_port: 8081 + agent_name: "vald-agent-ngt" + agent_dns: vald-agent-ngt.default.svc.cluster.local + agent_namespace: "_MY_POD_NAMESPACE_" + node_name: "" + discoverer: + duration: 500ms + client: + addrs: + - vald-discoverer.default.svc.cluster.local:8081 + health_check_duration: "1s" + connection_pool: + enable_dns_resolver: true + enable_rebalance: true + old_conn_close_duration: 3s + rebalance_duration: 30m + size: 3 + backoff: + backoff_factor: 1.1 + backoff_time_limit: 5s + enable_error_log: true + initial_duration: 
5ms + jitter_limit: 100ms + maximum_duration: 5s + retry_count: 100 + call_option: + max_recv_msg_size: 0 + max_retry_rpc_buffer_size: 0 + max_send_msg_size: 0 + wait_for_ready: true + dial_option: + backoff_base_delay: 1s + backoff_jitter: 0.2 + backoff_max_delay: 120s + backoff_multiplier: 1.6 + enable_backoff: false + initial_connection_window_size: 0 + initial_window_size: 0 + insecure: true + keepalive: + permit_without_stream: false + time: "" + timeout: "" + max_msg_size: 0 + min_connection_timeout: 20s + read_buffer_size: 0 + tcp: + dialer: + dual_stack_enabled: true + keepalive: "" + timeout: "" + dns: + cache_enabled: true + cache_expiration: 1h + refresh_duration: 30m + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + key: /path/to/key + timeout: "" + write_buffer_size: 0 + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + key: /path/to/key + agent_client_options: + addrs: [] + health_check_duration: "1s" + connection_pool: + enable_dns_resolver: true + enable_rebalance: true + old_conn_close_duration: 3s + rebalance_duration: 30m + size: 3 + backoff: + backoff_factor: 1.1 + backoff_time_limit: 5s + enable_error_log: true + initial_duration: 5ms + jitter_limit: 100ms + maximum_duration: 5s + retry_count: 100 + call_option: + max_recv_msg_size: 0 + max_retry_rpc_buffer_size: 0 + max_send_msg_size: 0 + wait_for_ready: true + dial_option: + write_buffer_size: 0 + read_buffer_size: 0 + initial_window_size: 0 + initial_connection_window_size: 0 + max_msg_size: 0 + backoff_max_delay: "120s" + backoff_base_delay: "1s" + backoff_multiplier: 1.6 + backoff_jitter: 0.2 + min_connection_timeout: "20s" + enable_backoff: false + insecure: true + timeout: "" + tcp: + dns: + cache_enabled: true + cache_expiration: 1h + refresh_duration: 30m + dialer: + timeout: "" + keepalive: "15m" + dual_stack_enabled: true + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + key: /path/to/key + keepalive: + permit_without_stream: false + 
time: "" + timeout: "" + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + key: /path/to/key + \ No newline at end of file diff --git a/internal/config/corrector.go b/internal/config/corrector.go new file mode 100644 index 0000000000..a17657b516 --- /dev/null +++ b/internal/config/corrector.go @@ -0,0 +1,54 @@ +// +// Copyright (C) 2019-2023 vdaas.org vald team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// Package config providers configuration type and load configuration logic +package config + +// Corrector represents the index correction configurations. +type Corrector struct { + // AgentPort represent agent port number + AgentPort int `json:"agent_port" yaml:"agent_port"` + + // AgentName represent agents meta_name for service discovery + AgentName string `json:"agent_name" yaml:"agent_name"` + + // AgentNamespace represent agent namespace location + AgentNamespace string `json:"agent_namespace" yaml:"agent_namespace"` + + // AgentDNS represent agents dns A record for service discovery + AgentDNS string `json:"agent_dns" yaml:"agent_dns"` + + CreationPoolSize uint32 `yaml:"creation_pool_size" json:"creation_pool_size"` + + // NodeName represents node name + NodeName string `json:"node_name" yaml:"node_name"` + + // Discoverer represent agent discoverer service configuration + Discoverer *DiscovererClient `json:"discoverer" yaml:"discoverer"` +} + +// Bind binds the actual data from the Indexer receiver field. 
+func (im *Corrector) Bind() *Corrector { + im.AgentName = GetActualValue(im.AgentName) + im.AgentNamespace = GetActualValue(im.AgentNamespace) + im.AgentDNS = GetActualValue(im.AgentDNS) + im.NodeName = GetActualValue(im.NodeName) + + if im.Discoverer != nil { + im.Discoverer = im.Discoverer.Bind() + } + return im +} diff --git a/pkg/index/job/correction/config/config.go b/pkg/index/job/correction/config/config.go new file mode 100644 index 0000000000..611a3435e1 --- /dev/null +++ b/pkg/index/job/correction/config/config.go @@ -0,0 +1,75 @@ +// +// Copyright (C) 2019-2023 vdaas.org vald team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +// Package setting stores all server application settings +package config + +import ( + "github.com/vdaas/vald/internal/config" + "github.com/vdaas/vald/internal/errors" +) + +type GlobalConfig = config.GlobalConfig + +// Config represent a application setting data content (config.yaml). +// In K8s environment, this configuration is stored in K8s ConfigMap. 
+type Data struct { + config.GlobalConfig `json:",inline" yaml:",inline"` + + // Server represent all server configurations + Server *config.Servers `json:"server_config" yaml:"server_config"` + + // Observability represent observability configurations + Observability *config.Observability `json:"observability" yaml:"observability"` + + // Indexer represent agent auto indexing service configuration + Corrector *config.Corrector `json:"corrector" yaml:"corrector"` +} + +func NewConfig(path string) (cfg *Data, err error) { + cfg = new(Data) + + err = config.Read(path, &cfg) + + if err != nil { + return nil, err + } + + if cfg != nil { + cfg.Bind() + } else { + return nil, errors.ErrInvalidConfig + } + + if cfg.Server != nil { + cfg.Server = cfg.Server.Bind() + } else { + return nil, errors.ErrInvalidConfig + } + + if cfg.Observability != nil { + cfg.Observability = cfg.Observability.Bind() + } else { + cfg.Observability = new(config.Observability).Bind() + } + + if cfg.Corrector != nil { + cfg.Corrector = cfg.Corrector.Bind() + } else { + cfg.Corrector = new(config.Corrector).Bind() + } + return cfg, nil +} From fa1eb7552f8915e1d493260507303bd149ad31bc Mon Sep 17 00:00:00 2001 From: ykadowak Date: Mon, 14 Aug 2023 08:00:12 +0000 Subject: [PATCH 003/101] add corrector logic --- charts/vald/values.yaml | 26 + cmd/index/job/correction/main.go | 20 +- cmd/index/job/correction/sample.yaml | 5 +- internal/config/corrector.go | 18 +- internal/errors/corrector.go | 20 + internal/runner/runner.go | 2 + internal/servers/server/server.go | 1 - pkg/index/job/correction/config/config.go | 11 + pkg/index/job/correction/service/corrector.go | 460 ++++++++++++++++++ pkg/index/job/correction/usecase/corrector.go | 119 ++++- 10 files changed, 652 insertions(+), 30 deletions(-) create mode 100644 internal/errors/corrector.go create mode 100644 pkg/index/job/correction/service/corrector.go diff --git a/charts/vald/values.yaml b/charts/vald/values.yaml index b177e5a4e1..e619964f29 100644 
--- a/charts/vald/values.yaml +++ b/charts/vald/values.yaml @@ -2631,3 +2631,29 @@ manager: net: dialer: keepalive: 15m #indexer fetches uncommitted index length, which includes huge payload so we need to set keepalive longer than usual + # @schema {"name": "manager.index.corrector", "type": "object"} + corrector: + # @schema {"name": "manager.index.corrector.agent_namespace", "type": "string"} + # manager.index.corrector.agent_namespace -- namespace of agent pods to manage + agent_namespace: _MY_POD_NAMESPACE_ + # @schema {"name": "manager.index.corrector.node_name", "type": "string"} + # manager.index.corrector.node_name -- node name + node_name: "" # _MY_NODE_NAME_ + # @schema {"name": "manager.index.corrector.concurrency", "type": "integer", "minimum": 1} + # manager.index.corrector.concurrency -- concurrency + concurrency: 1 + # @schema {"name": "manager.index.corrector.discoverer", "type": "object"} + discoverer: + # @schema {"name": "manager.index.corrector.discoverer.duration", "type": "string"} + # manager.index.corrector.discoverer.duration -- refresh duration to discover + duration: 500ms + # @schema {"name": "manager.index.corrector.discoverer.client", "alias": "grpc.client"} + # manager.index.corrector.discoverer.client -- gRPC client for discoverer (overrides defaults.grpc.client) + client: {} + # @schema {"name": "manager.index.corrector.discoverer.agent_client_options", "alias": "grpc.client"} + # manager.index.corrector.discoverer.agent_client_options -- gRPC client options for agents (overrides defaults.grpc.client) + agent_client_options: + dial_option: + net: + dialer: + keepalive: 15m #indexer fetches uncommitted index length, which includes huge payload so we need to set keepalive longer than usual diff --git a/cmd/index/job/correction/main.go b/cmd/index/job/correction/main.go index 423bdfbbe5..04a604a04b 100644 --- a/cmd/index/job/correction/main.go +++ b/cmd/index/job/correction/main.go @@ -1,3 +1,16 @@ +// Copyright (C) 2019-2023 vdaas.org 
vald team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. package main import ( @@ -8,10 +21,8 @@ import ( "github.com/vdaas/vald/internal/log" "github.com/vdaas/vald/internal/runner" "github.com/vdaas/vald/internal/safety" + "github.com/vdaas/vald/pkg/index/job/correction/config" "github.com/vdaas/vald/pkg/index/job/correction/usecase" - - - "github.com/vdaas/vald/pkg/manager/index/config" // FIXME: あとで独自のconfigに切り替え ) const ( @@ -21,7 +32,6 @@ const ( ) func main() { - // FIXME: demon前提なので基本的に止まらない。独自のrunnerを作る必要があるか if err := safety.RecoverFunc(func() error { return runner.Do( context.Background(), @@ -35,7 +45,7 @@ func main() { return cfg, &cfg.GlobalConfig, nil }), runner.WithDaemonInitializer(func(cfg interface{}) (runner.Runner, error) { - return usecase.New() + return usecase.New(cfg.(*config.Data)) }), ) })(); err != nil { diff --git a/cmd/index/job/correction/sample.yaml b/cmd/index/job/correction/sample.yaml index 80ddd09082..44dabf9c97 100644 --- a/cmd/index/job/correction/sample.yaml +++ b/cmd/index/job/correction/sample.yaml @@ -70,11 +70,13 @@ server_config: cert: /path/to/cert enabled: false key: /path/to/key +gateway: + index_replica: 3 corrector: agent_port: 8081 agent_name: "vald-agent-ngt" agent_dns: vald-agent-ngt.default.svc.cluster.local - agent_namespace: "_MY_POD_NAMESPACE_" + agent_namespace: "default" node_name: "" discoverer: duration: 500ms @@ -197,4 +199,3 @@ corrector: cert: /path/to/cert enabled: false key: /path/to/key - \ 
No newline at end of file diff --git a/internal/config/corrector.go b/internal/config/corrector.go index a17657b516..28924b447b 100644 --- a/internal/config/corrector.go +++ b/internal/config/corrector.go @@ -41,14 +41,14 @@ type Corrector struct { } // Bind binds the actual data from the Indexer receiver field. -func (im *Corrector) Bind() *Corrector { - im.AgentName = GetActualValue(im.AgentName) - im.AgentNamespace = GetActualValue(im.AgentNamespace) - im.AgentDNS = GetActualValue(im.AgentDNS) - im.NodeName = GetActualValue(im.NodeName) - - if im.Discoverer != nil { - im.Discoverer = im.Discoverer.Bind() +func (c *Corrector) Bind() *Corrector { + c.AgentName = GetActualValue(c.AgentName) + c.AgentNamespace = GetActualValue(c.AgentNamespace) + c.AgentDNS = GetActualValue(c.AgentDNS) + c.NodeName = GetActualValue(c.NodeName) + + if c.Discoverer != nil { + c.Discoverer = c.Discoverer.Bind() } - return im + return c } diff --git a/internal/errors/corrector.go b/internal/errors/corrector.go new file mode 100644 index 0000000000..b47296e424 --- /dev/null +++ b/internal/errors/corrector.go @@ -0,0 +1,20 @@ +// +// Copyright (C) 2019-2023 vdaas.org vald team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// Package errors provides error types and function +package errors + +var ErrIndexReplicaOne = New("nothing to correct when index replica is 1") diff --git a/internal/runner/runner.go b/internal/runner/runner.go index fcf741c51d..4dccd14c90 100644 --- a/internal/runner/runner.go +++ b/internal/runner/runner.go @@ -215,6 +215,8 @@ func Run(ctx context.Context, run Runner, name string) (err error) { emap[err.Error()]++ } + // wait for all the goroutines to finish. + // this errgroup is global across the program err = errgroup.Wait() if err != nil && !errors.Is(err, context.DeadlineExceeded) && diff --git a/internal/servers/server/server.go b/internal/servers/server/server.go index 904e76d371..9e70fb9512 100644 --- a/internal/servers/server/server.go +++ b/internal/servers/server/server.go @@ -334,7 +334,6 @@ func (s *server) ListenAndServe(ctx context.Context, ech chan<- error) (err erro s.mu.RUnlock() log.Infof("%s server %s stopped", s.mode.String(), s.name) } - return nil })) } return nil diff --git a/pkg/index/job/correction/config/config.go b/pkg/index/job/correction/config/config.go index 611a3435e1..d704bf59d0 100644 --- a/pkg/index/job/correction/config/config.go +++ b/pkg/index/job/correction/config/config.go @@ -37,6 +37,10 @@ type Data struct { // Indexer represent agent auto indexing service configuration Corrector *config.Corrector `json:"corrector" yaml:"corrector"` + + // FIXME: ここから読み込むときLB側の設定とのconsistencyをどう担保するのか + // Gateway represent agent gateway service configuration + Gateway *config.LB `json:"gateway" yaml:"gateway"` } func NewConfig(path string) (cfg *Data, err error) { @@ -71,5 +75,12 @@ func NewConfig(path string) (cfg *Data, err error) { } else { cfg.Corrector = new(config.Corrector).Bind() } + + if cfg.Gateway != nil { + cfg.Gateway = cfg.Gateway.Bind() + } else { + cfg.Gateway = new(config.LB).Bind() + } + return cfg, nil } diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go 
new file mode 100644 index 0000000000..9d1a6281fb --- /dev/null +++ b/pkg/index/job/correction/service/corrector.go @@ -0,0 +1,460 @@ +// Copyright (C) 2019-2023 vdaas.org vald team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +package service + +import ( + "context" + "fmt" + "io" + "sync" + "sync/atomic" + + agent "github.com/vdaas/vald/apis/grpc/v1/agent/core" + "github.com/vdaas/vald/apis/grpc/v1/payload" + "github.com/vdaas/vald/apis/grpc/v1/vald" + "github.com/vdaas/vald/internal/client/v1/client/discoverer" + "github.com/vdaas/vald/internal/errgroup" + "github.com/vdaas/vald/internal/errors" + "github.com/vdaas/vald/internal/log" + "github.com/vdaas/vald/internal/net/grpc" + "github.com/vdaas/vald/internal/net/grpc/codes" + "github.com/vdaas/vald/internal/net/grpc/status" + "github.com/vdaas/vald/internal/slices" + valdsync "github.com/vdaas/vald/internal/sync" + "github.com/vdaas/vald/pkg/index/job/correction/config" +) + +type Corrector interface { + Start(ctx context.Context) (<-chan error, error) +} + +type correct struct { + eg errgroup.Group + cfg *config.Data + discoverer discoverer.Client + indexInfos valdsync.Map[string, *payload.Info_Index_Count] + uuidsCount uint32 + uncommittedUUIDsCount uint32 +} + +func New(cfg *config.Data, discoverer discoverer.Client) (Corrector, error) { + return &correct{ + cfg: cfg, + discoverer: discoverer, + }, nil +} + +func (c *correct) Start(ctx context.Context) (<-chan error, error) { + dech, err := 
c.discoverer.Start(ctx) + if err != nil { + return nil, err + } + + addrs := c.discoverer.GetAddrs(ctx) + log.Debug("agent addrs found:", addrs) + + if l := len(addrs); l <= 1 { + log.Warn("only %d agent found, there must be more than two agents for correction to happen", l) + return nil, err + } + + err = c.loadInfos(ctx) + if err != nil { + return nil, err + } + + // This blocks. Should we run with errorgroup? + log.Info("starting correction...") + if err := c.correct(ctx, addrs); err != nil { + log.Errorf("there's some errors while correction: %v", err) + return nil, err + } + log.Info("correction finished successfully") + + // ech := make(chan error, 100) + // c.eg.Go(safety.RecoverFunc(func() (err error) { + // defer close(ech) + // for { + // select { + // case <-ctx.Done(): + // err = ctx.Err() + // if err != nil && err != context.Canceled { + // return err + // } + // return nil + // case err = <-dech: + // ech <- err + // } + // } + // })) + return dech, nil +} + +func (c *correct) correct(ctx context.Context, addrs []string) (err error) { + if err := c.discoverer.GetClient().OrderedRange(ctx, addrs, + func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { + vc := vald.NewValdClient(conn) + stream, err := vc.StreamListObject(ctx, &payload.Object_List_Request{}) + if err != nil { + return err + } + + for { + select { + case <-ctx.Done(): + return ctx.Err() + default: + res, err := stream.Recv() + if errors.Is(err, io.EOF) { + log.Debugf("StreamListObject stream finished for agent %s", addr) + return nil + } + if err != nil { + return err + } + + if res.GetVector() == nil { + st := res.GetStatus() + // TODO: errors.Join? 
+ log.Error(st.GetCode(), st.GetMessage(), st.GetDetails()) + continue + } + + log.Debugf("received object in StreamListObject: agent(%s), id(%s), timestamp(%v)", addr, res.GetVector().GetId(), res.GetVector().GetTimestamp()) + if err := c.checkConsistency( + ctx, + &vectorReplica{ + addr: addr, + vec: res.GetVector(), + }, + addrs, + ); err != nil { + // TODO: errors.Join? + // keep processing other vectors even if one vector failed + log.Error(err) + continue + } + } + } + }, + ); err != nil { + return err + } + + return nil +} + +type vectorReplica struct { + addr string + vec *payload.Object_Vector +} + +// Validate len(addrs) >= 2 before calling this function +func (c *correct) checkConsistency(ctx context.Context, targetReplica *vectorReplica, addrs []string) error { + // copy the addrs slice but delete the curAgentAddr + otherAddrs := make([]string, 0, len(addrs)-1) + availableAddrs := make(map[string]struct{}) + for _, addr := range addrs { + if addr != targetReplica.addr { + otherAddrs = append(otherAddrs, addr) + availableAddrs[addr] = struct{}{} + } + } + + foundReplicas := make([]*vectorReplica, 0, len(otherAddrs)) + var mu sync.Mutex + if err := c.discoverer.GetClient().OrderedRangeConcurrent(ctx, otherAddrs, len(otherAddrs), + func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { + select { + case <-ctx.Done(): + return ctx.Err() + default: + } + vc := vald.NewValdClient(conn) + v, err := vc.GetObject(ctx, &payload.Object_VectorRequest{ + Id: &payload.Object_ID{ + Id: targetReplica.vec.GetId(), + }, + }) + if err != nil { + if st, ok := status.FromError(err); !ok { + log.Errorf("gRPC call returned not a gRPC status error: %v", err) + return err + } else if st.Code() == codes.NotFound { + // when replica of agent > index replica, this happens + return nil + } else { + log.Errorf("failed to GetObject with unexpected error. 
code: %v, message: %s", st.Code(), st.Message()) + return err + } + } + + log.Debugf("object found: agent(%s), id(%v), timestamp(%v)", addr, v.GetId(), v.GetTimestamp()) + mu.Lock() + foundReplicas = append(foundReplicas, &vectorReplica{ + addr: addr, + vec: v, + }) + delete(availableAddrs, addr) + mu.Unlock() + + return nil + }, + ); err != nil { + return err + } + + // check timestamps + if err := c.correctTimestamp(ctx, targetReplica, foundReplicas); err != nil { + return fmt.Errorf("failed to fix timestamp: %w", err) + } + + // check replica number + replica := len(foundReplicas) + 1 + if err := c.correctReplica(ctx, targetReplica, foundReplicas, replica, availableAddrs); err != nil { + return fmt.Errorf("failed to fix index replica: %w", err) + } + + return nil +} + +func (c *correct) correctTimestamp(ctx context.Context, targetReplica *vectorReplica, foundReplicas []*vectorReplica) error { + if len(foundReplicas) == 0 { + // no replica found. nothing to do about timestamp + return nil + } + + allReplicas := append(foundReplicas, targetReplica) + + // sort by timestamp + slices.SortFunc(allReplicas, func(i, j *vectorReplica) bool { + // largest timestamp means the latest + return i.vec.GetTimestamp() > j.vec.GetTimestamp() + }) + + latest := allReplicas[0] + latestTs := latest.vec.GetTimestamp() + for _, replica := range allReplicas { + if replica.vec.GetTimestamp() == latestTs { + // no inconsistency + continue + } + + // update the vector with the new one + log.Infof("timestamp inconsistency detected with vector(id: %s, timestamp: %v). 
updating with the latest vector(id: %s, timestamp: %v)", + replica.vec.GetId(), + replica.vec.GetTimestamp(), + latest.vec.GetId(), + latest.vec.GetTimestamp(), + ) + if err := c.updateObject(ctx, replica.addr, latest.vec); err != nil { + return err + } + } + + return nil +} + +func (c *correct) correctReplica( + ctx context.Context, + targetReplica *vectorReplica, + foundReplicas []*vectorReplica, + replica int, + availableAddrs map[string]struct{}, +) error { + // diff < 0 means there is less replica than the correct number + diff := replica - c.cfg.Gateway.IndexReplica + if diff == 0 { + // replica number is correct + return nil + } + + // when there are less replicas than the correct number, add the extra replicas + // TODO: refine this logic. pretty complicated + if diff < 0 { + log.Infof("replica shortage of vector %s. inserting to other agents...", + targetReplica.vec.GetId()) + if len(availableAddrs) == 0 { + // TODO: define errors in errors pkg + return fmt.Errorf("no available agent to insert replica") + } + + // availableAddrsからdiff個選んでinsert処理する + // TODO: どのagentにinsertするのが最適化のロジックを考える + // とりあえずはランダムに入れとく + for addr := range availableAddrs { + if diff == 0 { + break + } + log.Infof("inserting replica to %s", addr) + if err := c.insertObject(ctx, addr, targetReplica.vec); err != nil { + log.Errorf("failed to insert object to agent(%s): %v", addr, err) + continue + } + diff++ + } + + if diff < 0 { + return fmt.Errorf("failed to insert the sufficient amount of index to meet the replica setting") + } + + return nil + } + + // when there are more replicas than the correct number, delete the extra replicas + log.Infof("replica oversupply of vector %s. 
deleting...", + targetReplica.vec.GetId()) + // delete from myself + if err := c.deleteObject(ctx, targetReplica.addr, targetReplica.vec); err != nil { + log.Errorf("failed to delete object from agent(%s): %v", targetReplica.addr, err) + } else { + diff-- + } + + // delete from others + for _, replica := range foundReplicas { + if diff == 0 { + break + } + if err := c.deleteObject(ctx, replica.addr, replica.vec); err != nil { + log.Errorf("failed to delete object from agent(%s): %v", replica.addr, err) + continue + } + diff-- + } + + if diff > 0 { + return fmt.Errorf("failed to delete the sufficient amount of index to meet the replica setting") + } + + return nil +} + +func (c *correct) updateObject(ctx context.Context, addr string, vector *payload.Object_Vector) error { + res, err := c.discoverer.GetClient(). + Do(grpc.WithGRPCMethod(ctx, "core.v1.Vald/Update"), addr, func(ctx context.Context, conn *grpc.ClientConn, copts ...grpc.CallOption) (interface{}, error) { + return vald.NewUpdateClient(conn).Update(ctx, &payload.Update_Request{ + Vector: vector, + // FIXME: this should be deleted after Config.Timestamp deprecation + Config: &payload.Update_Config{ + // TODO: Decrementing because it's gonna be incremented before being pushed + // to vqueue in the agent. This is not an ideal workaround for the current vqueue implementation + // so we should consider refactoring vqueue. + Timestamp: vector.GetTimestamp() - 1, + }, + }, copts...) + }) + if err != nil { + return err + } + + if v, ok := res.(*payload.Object_Location); ok { + log.Infof("vector successfully updated. address: %s, uuid: %v", addr, v.GetUuid()) + } + + return nil +} + +func (c *correct) insertObject(ctx context.Context, addr string, vector *payload.Object_Vector) error { + res, err := c.discoverer.GetClient(). 
+ Do(grpc.WithGRPCMethod(ctx, "core.v1.Vald/Insert"), addr, func(ctx context.Context, conn *grpc.ClientConn, copts ...grpc.CallOption) (interface{}, error) { + return vald.NewInsertClient(conn).Insert(ctx, &payload.Insert_Request{ + Vector: vector, + // FIXME: this should be deleted after Config.Timestamp deprecation + Config: &payload.Insert_Config{ + Timestamp: vector.GetTimestamp(), + }, + }, copts...) + }) + if err != nil { + return err + } + + if v, ok := res.(*payload.Object_Location); ok { + log.Infof("vector successfully inserted. address: %s, uuid: %v", addr, v.GetUuid()) + } + + return nil +} + +func (c *correct) deleteObject(ctx context.Context, addr string, vector *payload.Object_Vector) error { + res, err := c.discoverer.GetClient(). + Do(grpc.WithGRPCMethod(ctx, "core.v1.Vald/Delete"), addr, func(ctx context.Context, conn *grpc.ClientConn, copts ...grpc.CallOption) (interface{}, error) { + return vald.NewRemoveClient(conn).Remove(ctx, &payload.Remove_Request{ + Id: &payload.Object_ID{ + Id: vector.GetId(), + }, + }, copts...) + }) + if err != nil { + return err + } + + if v, ok := res.(*payload.Object_Location); ok { + log.Infof("vector successfully deleted. address: %s, uuid: %v", addr, v.GetUuid()) + } + + return nil +} + +func (c *correct) loadInfos(ctx context.Context) (err error) { + // FIXME: o11yは最後に整える + // ctx, span := trace.StartSpan(grpc.WithGRPCMethod(ctx, "core.v1.Agent/IndexInfo"), "vald/manager-index/service/Indexer.loadInfos") + // defer func() { + // if span != nil { + // span.End() + // } + // }() + + var u, ucu uint32 + var infoMap valdsync.Map[string, *payload.Info_Index_Count] + err = c.discoverer.GetClient().RangeConcurrent(ctx, len(c.discoverer.GetAddrs(ctx)), + func(ctx context.Context, + addr string, conn *grpc.ClientConn, copts ...grpc.CallOption, + ) (err error) { + select { + case <-ctx.Done(): + return nil + default: + info, err := agent.NewAgentClient(conn).IndexInfo(ctx, new(payload.Empty), copts...) 
+ if err != nil { + log.Warnf("an error occurred while calling IndexInfo of %s: %s", addr, err) + return nil + } + infoMap.Store(addr, info) + atomic.AddUint32(&u, info.GetStored()) + atomic.AddUint32(&ucu, info.GetUncommitted()) + } + return nil + }) + if err != nil { + return err + } + atomic.StoreUint32(&c.uuidsCount, atomic.LoadUint32(&u)) + atomic.StoreUint32(&c.uncommittedUUIDsCount, atomic.LoadUint32(&ucu)) + c.indexInfos.Range(func(addr string, _ *payload.Info_Index_Count) bool { + info, ok := infoMap.Load(addr) + if !ok { + c.indexInfos.Delete(addr) + } + c.indexInfos.Store(addr, info) + infoMap.Delete(addr) + return true + }) + infoMap.Range(func(addr string, info *payload.Info_Index_Count) bool { + c.indexInfos.Store(addr, info) + return true + }) + return nil +} diff --git a/pkg/index/job/correction/usecase/corrector.go b/pkg/index/job/correction/usecase/corrector.go index 47a1ff58fe..ad1bf4e1c1 100644 --- a/pkg/index/job/correction/usecase/corrector.go +++ b/pkg/index/job/correction/usecase/corrector.go @@ -1,34 +1,128 @@ +// Copyright (C) 2019-2023 vdaas.org vald team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
package usecase import ( "context" + "github.com/vdaas/vald/internal/client/v1/client/discoverer" "github.com/vdaas/vald/internal/errgroup" + "github.com/vdaas/vald/internal/errors" + "github.com/vdaas/vald/internal/log" + "github.com/vdaas/vald/internal/net/grpc" "github.com/vdaas/vald/internal/runner" + "github.com/vdaas/vald/internal/safety" + "github.com/vdaas/vald/pkg/index/job/correction/config" + "github.com/vdaas/vald/pkg/index/job/correction/service" ) type run struct { - eg errgroup.Group - // cfg *config.Data - // server starter.Server - // observability observability.Observability - // indexer service.Indexer + eg errgroup.Group + cfg *config.Data + corrector service.Corrector } -// FIXME: add config -func New() (r runner.Runner, err error) { +func New(cfg *config.Data) (r runner.Runner, err error) { + if cfg.Gateway.IndexReplica == 1 { + return nil, errors.ErrIndexReplicaOne + } + eg := errgroup.Get() + + cOpts, err := cfg.Corrector.Discoverer.Client.Opts() + if err != nil { + return nil, err + } + // skipcq: CRT-D0001 + dopts := append( + cOpts, + grpc.WithErrGroup(eg)) + + acOpts, err := cfg.Corrector.Discoverer.AgentClientOptions.Opts() + if err != nil { + return nil, err + } + // skipcq: CRT-D0001 + aopts := append( + acOpts, + grpc.WithErrGroup(eg)) + + // Construct discoverer + discoverer, err := discoverer.New( + discoverer.WithAutoConnect(true), + discoverer.WithName(cfg.Corrector.AgentName), + discoverer.WithNamespace(cfg.Corrector.AgentNamespace), + discoverer.WithPort(cfg.Corrector.AgentPort), + discoverer.WithServiceDNSARecord(cfg.Corrector.AgentDNS), + discoverer.WithDiscovererClient(grpc.New(dopts...)), + discoverer.WithDiscoverDuration(cfg.Corrector.Discoverer.Duration), + discoverer.WithOptions(aopts...), + discoverer.WithNodeName(cfg.Corrector.NodeName), + discoverer.WithOnDiscoverFunc(func(ctx context.Context, c discoverer.Client, addrs []string) error { + last := len(addrs) - 1 + for i := 0; i < len(addrs)/2; i++ { + addrs[i], 
addrs[last-i] = addrs[last-i], addrs[i] + } + return nil + }), + ) + if err != nil { + return nil, err + } + + corrector, err := service.New(cfg, discoverer) + if err != nil { + return nil, err + } + return &run{ - eg: eg, + eg: eg, + cfg: cfg, + corrector: corrector, }, nil } -func (r *run) PreStart(ctx context.Context) error { +func (c *run) PreStart(ctx context.Context) error { return nil } -func (r *run) Start(ctx context.Context) (<-chan error, error) { - ech := make(chan error, 5) // FIXME: magic number 5 +func (c *run) Start(ctx context.Context) (<-chan error, error) { + // TODO: timeoutはconfigから指定 + // Setting timeout because job resource needs to be finished at some point + // ここでcancelしても親は終了しないので、結局self SIGTERMしかなさそう + // timeout設定はして、finalizeを呼ぶのが良いか + // ctx, cancel = context.WithTimeout(ctx, time.Second*20) + // defer cancel() // ここでdeferすると関数はすぐ抜けちゃうので意味ない + + log.Info("starting index correction...") + + dech, err := c.corrector.Start(ctx) + + // FIXME: 以下をやめてシンプルにStartを抜けたらself SIGTERMで終了させる方がいいかも + // その場合echは無視する + ech := make(chan error, 100) + c.eg.Go(safety.RecoverFunc(func() error { + for { + select { + case <-ctx.Done(): + log.Debug("======= ctx.Done at corrector start") + return ctx.Err() + case err = <-dech: + ech <- err + } + } + })) return ech, nil } @@ -40,7 +134,6 @@ func (*run) Stop(context.Context) error { return nil } - -func (r *run) PostStop(ctx context.Context) error { +func (*run) PostStop(ctx context.Context) error { return nil } From 39e7d71573c118299eb85a5561f5dece97f5cd3f Mon Sep 17 00:00:00 2001 From: ykadowak Date: Mon, 14 Aug 2023 08:25:20 +0000 Subject: [PATCH 004/101] add build make command for index correction binary --- Makefile.d/build.mk | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/Makefile.d/build.mk b/Makefile.d/build.mk index 580cf34599..f62dbdf76a 100644 --- a/Makefile.d/build.mk +++ b/Makefile.d/build.mk @@ -206,6 +206,35 @@ cmd/manager/index/index: \ $(dir $@)main.go $@ 
-version +cmd/index/job/correction/correction: \ + $(GO_SOURCES_INTERNAL) \ + $(PBGOS) \ + $(shell find $(ROOTDIR)/cmd/index/job/correction/correction -type f -name '*.go' -not -name '*_test.go' -not -name 'doc.go') \ + $(shell find $(ROOTDIR)/pkg/index/job/correction -type f -name '*.go' -not -name '*_test.go' -not -name 'doc.go') + $(eval CGO_ENABLED = 0) + CGO_ENABLED=$(CGO_ENABLED) \ + GO111MODULE=on \ + GOPRIVATE=$(GOPRIVATE) \ + go build \ + --ldflags "-w -extldflags=-static \ + -X '$(GOPKG)/internal/info.Version=$(VERSION)' \ + -X '$(GOPKG)/internal/info.GitCommit=$(GIT_COMMIT)' \ + -X '$(GOPKG)/internal/info.BuildTime=$(DATETIME)' \ + -X '$(GOPKG)/internal/info.GoVersion=$(GO_VERSION)' \ + -X '$(GOPKG)/internal/info.GoOS=$(GOOS)' \ + -X '$(GOPKG)/internal/info.GoArch=$(GOARCH)' \ + -X '$(GOPKG)/internal/info.CGOEnabled=$(CGO_ENABLED)' \ + -X '$(GOPKG)/internal/info.BuildCPUInfoFlags=$(CPU_INFO_FLAGS)' \ + -buildid=" \ + -mod=readonly \ + -modcacherw \ + -a \ + -tags "osusergo netgo static_build" \ + -trimpath \ + -o $@ \ + $(dir $@)main.go + $@ -version + .PHONY: binary/build/zip ## build all binaries and zip them binary/build/zip: \ From 2bd41e9facef6231b8cb25e600459970a67432ce Mon Sep 17 00:00:00 2001 From: ykadowak Date: Mon, 14 Aug 2023 08:25:36 +0000 Subject: [PATCH 005/101] add Dockerfile for index correction --- dockers/index/job/correction/Dockerfile | 93 +++++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 dockers/index/job/correction/Dockerfile diff --git a/dockers/index/job/correction/Dockerfile b/dockers/index/job/correction/Dockerfile new file mode 100644 index 0000000000..0e8f717e7a --- /dev/null +++ b/dockers/index/job/correction/Dockerfile @@ -0,0 +1,93 @@ +# +# Copyright (C) 2019-2023 vdaas.org vald team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +ARG GO_VERSION=latest +ARG DISTROLESS_IMAGE=gcr.io/distroless/static +ARG DISTROLESS_IMAGE_TAG=nonroot +ARG MAINTAINER="vdaas.org vald team " + +FROM golang:${GO_VERSION} AS golang + +FROM ubuntu:devel AS builder + +ENV GO111MODULE on +ENV DEBIAN_FRONTEND noninteractive +ENV INITRD No +ENV LANG en_US.UTF-8 +ENV GOROOT /opt/go +ENV GOPATH /go +ENV PATH ${PATH}:${GOROOT}/bin:${GOPATH}/bin +ENV ORG vdaas +ENV REPO vald +ENV PKG index/job/correction +ENV APP_NAME correction + +# skipcq: DOK-DL3008 +RUN apt-get update && apt-get install -y --no-install-recommends \ + ca-certificates \ + build-essential \ + curl \ + upx \ + git \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +COPY --from=golang /usr/local/go $GOROOT +RUN mkdir -p "$GOPATH/src" + +WORKDIR ${GOPATH}/src/github.com/${ORG}/${REPO}/Makefile.d +COPY Makefile.d . +WORKDIR ${GOPATH}/src/github.com/${ORG}/${REPO} +COPY Makefile . +COPY .git . +COPY go.mod . +COPY go.sum . + +RUN make go/download + +WORKDIR ${GOPATH}/src/github.com/${ORG}/${REPO}/internal +COPY internal . + +WORKDIR ${GOPATH}/src/github.com/${ORG}/${REPO}/apis/grpc +COPY apis/grpc . + +WORKDIR ${GOPATH}/src/github.com/${ORG}/${REPO}/pkg/${PKG} +COPY pkg/${PKG} . + +WORKDIR ${GOPATH}/src/github.com/${ORG}/${REPO}/cmd/${PKG} +COPY cmd/${PKG} . + +WORKDIR ${GOPATH}/src/github.com/${ORG}/${REPO}/versions +COPY versions . 
+ +WORKDIR ${GOPATH}/src/github.com/${ORG}/${REPO} +RUN make REPO=${ORG} NAME=${REPO} cmd/${PKG}/${APP_NAME} \ + && mv "cmd/${PKG}/${APP_NAME}" "/usr/bin/${APP_NAME}" + +WORKDIR ${GOPATH}/src/github.com/${ORG}/${REPO}/cmd/${PKG} +RUN cp sample.yaml /tmp/config.yaml + +FROM ${DISTROLESS_IMAGE}:${DISTROLESS_IMAGE_TAG} +LABEL maintainer="${MAINTAINER}" + +ENV APP_NAME correction + +COPY --from=builder /usr/bin/${APP_NAME} /go/bin/${APP_NAME} +COPY --from=builder /tmp/config.yaml /etc/server/config.yaml + +USER nonroot:nonroot + +ENTRYPOINT ["/go/bin/correction"] From 6ef29e3fc8a106be726d48a3206b17c1208e1584 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Mon, 14 Aug 2023 08:48:06 +0000 Subject: [PATCH 006/101] add Docker image for index job correction --- .../dockers-index-job-correction.yml | 188 ++++++++++++++++++ Makefile | 1 + Makefile.d/docker.mk | 14 ++ 3 files changed, 203 insertions(+) create mode 100644 .github/workflows/dockers-index-job-correction.yml diff --git a/.github/workflows/dockers-index-job-correction.yml b/.github/workflows/dockers-index-job-correction.yml new file mode 100644 index 0000000000..e67b051def --- /dev/null +++ b/.github/workflows/dockers-index-job-correction.yml @@ -0,0 +1,188 @@ +# +# Copyright (C) 2019-2023 vdaas.org vald team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +name: "Build docker image: index-job-correction" +on: + push: + branches: + - main + tags: + - "*.*.*" + - "v*.*.*" + - "*.*.*-*" + - "v*.*.*-*" + paths: + - ".github/actions/docker-build/actions.yaml" + - ".github/workflows/dockers-index-job-correction.yml" + - "go.mod" + - "go.sum" + - "internal/**" + - "!internal/**/*_test.go" + - "!internal/db/**" + - "!internal/k8s/**" + - "apis/grpc/**" + - "pkg/index/job/correction/**" + - "cmd/index/job/correction/**" + - "dockers/index/job/correction/Dockerfile" + - "versions/GO_VERSION" + pull_request: + paths: + - ".github/actions/docker-build/actions.yaml" + - ".github/workflows/dockers-index-job-correction.yml" + - "go.mod" + - "go.sum" + - "internal/**" + - "!internal/**/*_test.go" + - "!internal/db/**" + - "!internal/k8s/**" + - "apis/grpc/**" + - "pkg/index/job/correction/**" + - "cmd/index/job/correction/**" + - "dockers/index/job/correction/Dockerfile" + - "versions/GO_VERSION" + pull_request_target: + paths: + - ".github/actions/docker-build/actions.yaml" + - ".github/workflows/dockers-index-job-correction.yml" + - "go.mod" + - "go.sum" + - "internal/**" + - "!internal/**/*_test.go" + - "!internal/db/**" + - "!internal/k8s/**" + - "apis/grpc/**" + - "pkg/index/job/correction/**" + - "cmd/index/job/correction/**" + - "dockers/index/job/correction/Dockerfile" + - "versions/GO_VERSION" + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref != 'refs/heads/main' && github.ref || github.sha }}-${{ github.event_name }} + cancel-in-progress: true + +jobs: + dump_contexts_to_log: + runs-on: ubuntu-latest + steps: + - name: Dump GitHub context + id: github_context_step + run: echo $JSON + env: + JSON: ${{ toJSON(github) }} + - name: Dump job context + run: echo $JSON + env: + JSON: ${{ toJSON(job) }} + - name: Dump steps context + run: echo $JSON + env: + JSON: ${{ toJSON(steps) }} + - name: Dump runner context + run: echo $JSON + env: + JSON: ${{ toJSON(runner) }} + - name: 
Dump strategy context + run: echo $JSON + env: + JSON: ${{ toJSON(strategy) }} + - name: Dump matrix context + run: echo $JSON + env: + JSON: ${{ toJSON(matrix) }} + build: + strategy: + max-parallel: 4 + runs-on: ubuntu-latest + if: ${{ (github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false) || (github.event.pull_request.head.repo.fork == true && github.event_name == 'pull_request_target' && contains(github.event.pull_request.labels.*.name, 'ci/approved')) || (github.event_name == 'push' && github.ref == 'refs/heads/main') || startsWith( github.ref, 'refs/tags/') }} + steps: + - name: Get ref + id: ref + run: | + if [ ${{ github.event.pull_request.head.sha }} != "" ]; then + echo ref=${{ github.event.pull_request.head.sha }} >> $GITHUB_OUTPUT + else + echo ref=${{ github.sha }} >> $GITHUB_OUTPUT + fi + - uses: actions/checkout@v3 + with: + ref: ${{ steps.ref.outputs.ref }} + - name: set git config + run: | + git config --global --add safe.directory ${GITHUB_WORKSPACE} + - name: Setup QEMU + uses: docker/setup-qemu-action@v2 + with: + platforms: all + - name: Setup Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v2 + with: + buildkitd-flags: "--debug" + - name: Login to DockerHub + uses: docker/login-action@v2 + with: + username: ${{ secrets.DOCKERHUB_USER }} + password: ${{ secrets.DOCKERHUB_PASS }} + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ secrets.PACKAGE_USER }} + password: ${{ secrets.PACKAGE_TOKEN }} + - name: Build and Publish + id: build_and_publish + uses: ./.github/actions/docker-build + with: + target: index-job-correction + builder: ${{ steps.buildx.outputs.name }} + - name: Initialize CodeQL + if: startsWith( github.ref, 'refs/tags/') + uses: github/codeql-action/init@v2 + - name: Run vulnerability scanner (table) + if: startsWith( github.ref, 'refs/tags/') + uses: aquasecurity/trivy-action@master + with: + image-ref: "${{ 
steps.build_and_publish.outputs.IMAGE_NAME }}:${{ steps.build_and_publish.outputs.PRIMARY_TAG }}" + format: "table" + - name: Run vulnerability scanner (sarif) + if: startsWith( github.ref, 'refs/tags/') + uses: aquasecurity/trivy-action@master + with: + image-ref: "${{ steps.build_and_publish.outputs.IMAGE_NAME }}:${{ steps.build_and_publish.outputs.PRIMARY_TAG }}" + format: "template" + template: "@/contrib/sarif.tpl" + output: "trivy-results.sarif" + - name: Upload Trivy scan results to Security tab + if: startsWith( github.ref, 'refs/tags/') + uses: github/codeql-action/upload-sarif@v2 + with: + sarif_file: "trivy-results.sarif" + slack: + name: Slack notification + needs: build + runs-on: ubuntu-latest + if: github.ref == 'refs/heads/main' || startsWith( github.ref, 'refs/tags/') + steps: + - uses: technote-space/workflow-conclusion-action@v2 + with: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + - uses: 8398a7/action-slack@v3 + with: + author_name: index-job-correction image build + status: ${{ env.WORKFLOW_CONCLUSION }} + only_mention_fail: channel + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_NOTIFY_WEBHOOK_URL }} diff --git a/Makefile b/Makefile index 03ec84dfa7..7933a89689 100644 --- a/Makefile +++ b/Makefile @@ -30,6 +30,7 @@ FILTER_GATEWAY_IMAGE = $(NAME)-filter-gateway HELM_OPERATOR_IMAGE = $(NAME)-helm-operator LB_GATEWAY_IMAGE = $(NAME)-lb-gateway LOADTEST_IMAGE = $(NAME)-loadtest +INDEX_JOB_CORRECTION_IMAGE = $(NAME)-index-job-correction MANAGER_INDEX_IMAGE = $(NAME)-manager-index MAINTAINER = "$(ORG).org $(NAME) team <$(NAME)@$(ORG).org>" diff --git a/Makefile.d/docker.mk b/Makefile.d/docker.mk index 0997f7e1ef..eed7283a46 100644 --- a/Makefile.d/docker.mk +++ b/Makefile.d/docker.mk @@ -188,3 +188,17 @@ docker/build/loadtest: -t $(ORG)/$(LOADTEST_IMAGE):$(TAG) . 
\ --build-arg MAINTAINER=$(MAINTAINER) \ --build-arg GO_VERSION=$(GO_VERSION) + +.PHONY: docker/name/index-job-correction +docker/name/index-job-correction: + @echo "$(ORG)/$(INDEX_JOB_CORRECTION_IMAGE)" + +.PHONY: docker/build/index-job-correction +## build index-job-correction image +docker/build/index-job-correction: + $(DOCKER) build \ + $(DOCKER_OPTS) \ + -f dockers/index/job/correction/Dockerfile \ + -t $(ORG)/$(INDEX_JOB_CORRECTION_IMAGE):$(TAG) . \ + --build-arg MAINTAINER=$(MAINTAINER) \ + --build-arg GO_VERSION=$(GO_VERSION) From c7e1ff277cbaa2a5e3ac41aa477b60d9da705441 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 15 Aug 2023 02:49:58 +0000 Subject: [PATCH 007/101] add timer --- pkg/index/job/correction/usecase/corrector.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pkg/index/job/correction/usecase/corrector.go b/pkg/index/job/correction/usecase/corrector.go index ad1bf4e1c1..e442c57f57 100644 --- a/pkg/index/job/correction/usecase/corrector.go +++ b/pkg/index/job/correction/usecase/corrector.go @@ -15,6 +15,7 @@ package usecase import ( "context" + "time" "github.com/vdaas/vald/internal/client/v1/client/discoverer" "github.com/vdaas/vald/internal/errgroup" @@ -107,7 +108,10 @@ func (c *run) Start(ctx context.Context) (<-chan error, error) { log.Info("starting index correction...") + start := time.Now() dech, err := c.corrector.Start(ctx) + end := time.Since(start) + log.Infof("correction finished in %v", end) // FIXME: 以下をやめてシンプルにStartを抜けたらself SIGTERMで終了させる方がいいかも // その場合echは無視する From 22dcaaa78f9db2f00cc9fee257d756ebfe5f3076 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 15 Aug 2023 04:15:32 +0000 Subject: [PATCH 008/101] fix tag align --- internal/config/corrector.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/config/corrector.go b/internal/config/corrector.go index 28924b447b..25817b2ecd 100644 --- a/internal/config/corrector.go +++ b/internal/config/corrector.go @@ -31,7 +31,7 @@ type Corrector 
struct { // AgentDNS represent agents dns A record for service discovery AgentDNS string `json:"agent_dns" yaml:"agent_dns"` - CreationPoolSize uint32 `yaml:"creation_pool_size" json:"creation_pool_size"` + CreationPoolSize uint32 `json:"creation_pool_size" yaml:"creation_pool_size"` // NodeName represents node name NodeName string `json:"node_name" yaml:"node_name"` From d51e26b2f336198ab16dc1cf9eb5981d1d57b2df Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 16 Aug 2023 02:31:37 +0000 Subject: [PATCH 009/101] tmp --- pkg/index/job/correction/service/corrector.go | 52 +++++++++++++------ pkg/index/job/correction/usecase/corrector.go | 6 ++- 2 files changed, 42 insertions(+), 16 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 9d1a6281fb..662a3dcbcb 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -40,7 +40,6 @@ type Corrector interface { } type correct struct { - eg errgroup.Group cfg *config.Data discoverer discoverer.Client indexInfos valdsync.Map[string, *payload.Info_Index_Count] @@ -74,6 +73,12 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { return nil, err } + // DEBUG: + c.indexInfos.Range(func(addr string, info *payload.Info_Index_Count) bool { + log.Debugf("index info: addr(%s), stored(%d), uncommitted(%d)", addr, info.GetStored(), info.GetUncommitted()) + return true + }) + // This blocks. Should we run with errorgroup? 
log.Info("starting correction...") if err := c.correct(ctx, addrs); err != nil { @@ -104,12 +109,24 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { func (c *correct) correct(ctx context.Context, addrs []string) (err error) { if err := c.discoverer.GetClient().OrderedRange(ctx, addrs, func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { + eg, ctx := errgroup.New(ctx) + eg.Limitation(c.cfg.Server.GetGRPCStreamConcurrency()) + vc := vald.NewValdClient(conn) stream, err := vc.StreamListObject(ctx, &payload.Object_List_Request{}) if err != nil { return err } + finalize := func() error { + err = eg.Wait() + if err != nil { + log.Errorf("err group returned error: %v", err) + return err + } + return nil + } + for { select { case <-ctx.Done(): @@ -118,9 +135,10 @@ func (c *correct) correct(ctx context.Context, addrs []string) (err error) { res, err := stream.Recv() if errors.Is(err, io.EOF) { log.Debugf("StreamListObject stream finished for agent %s", addr) - return nil + return finalize() } if err != nil { + log.Errorf("StreamListObject stream finished unexpectedly: %v", err) return err } @@ -132,19 +150,23 @@ func (c *correct) correct(ctx context.Context, addrs []string) (err error) { } log.Debugf("received object in StreamListObject: agent(%s), id(%s), timestamp(%v)", addr, res.GetVector().GetId(), res.GetVector().GetTimestamp()) - if err := c.checkConsistency( - ctx, - &vectorReplica{ - addr: addr, - vec: res.GetVector(), - }, - addrs, - ); err != nil { - // TODO: errors.Join? - // keep processing other vectors even if one vector failed - log.Error(err) - continue - } + eg.Go(func() error { + if err := c.checkConsistency( + ctx, + &vectorReplica{ + addr: addr, + vec: res.GetVector(), + }, + addrs, + ); err != nil { + // TODO: errors.Join? 
+ // keep processing other vectors even if one vector failed + log.Error(err) + // continue + return err + } + return nil + }) } } }, diff --git a/pkg/index/job/correction/usecase/corrector.go b/pkg/index/job/correction/usecase/corrector.go index e442c57f57..9a921e2f98 100644 --- a/pkg/index/job/correction/usecase/corrector.go +++ b/pkg/index/job/correction/usecase/corrector.go @@ -110,11 +110,15 @@ func (c *run) Start(ctx context.Context) (<-chan error, error) { start := time.Now() dech, err := c.corrector.Start(ctx) + if err != nil { + log.Errorf("index correction process failed: %v", err) + return nil, err + } end := time.Since(start) log.Infof("correction finished in %v", end) // FIXME: 以下をやめてシンプルにStartを抜けたらself SIGTERMで終了させる方がいいかも - // その場合echは無視する + // その場合echは無視することになる ech := make(chan error, 100) c.eg.Go(safety.RecoverFunc(func() error { for { From 6aa8fdeb803b56153205967e2e1bf5056669f9d9 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 16 Aug 2023 08:30:13 +0000 Subject: [PATCH 010/101] fix log --- pkg/index/job/correction/service/corrector.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 662a3dcbcb..5ba5246bdd 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -144,7 +144,6 @@ func (c *correct) correct(ctx context.Context, addrs []string) (err error) { if res.GetVector() == nil { st := res.GetStatus() - // TODO: errors.Join? log.Error(st.GetCode(), st.GetMessage(), st.GetDetails()) continue } @@ -159,10 +158,7 @@ func (c *correct) correct(ctx context.Context, addrs []string) (err error) { }, addrs, ); err != nil { - // TODO: errors.Join? 
- // keep processing other vectors even if one vector failed - log.Error(err) - // continue + log.Errorf("failed to check consistency: %v", err) return err } return nil @@ -171,6 +167,7 @@ func (c *correct) correct(ctx context.Context, addrs []string) (err error) { } }, ); err != nil { + log.Errorf("failed to range over agents(%v): %v", addrs, err) return err } From 41faffd0fd4742333d807399685f6e3c144c0f68 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Thu, 17 Aug 2023 09:03:50 +0000 Subject: [PATCH 011/101] temporarily implement two versions of correct function --- pkg/index/job/correction/service/corrector.go | 97 ++++++++++++++++++- 1 file changed, 96 insertions(+), 1 deletion(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 5ba5246bdd..efecd2f4af 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -33,6 +33,7 @@ import ( "github.com/vdaas/vald/internal/slices" valdsync "github.com/vdaas/vald/internal/sync" "github.com/vdaas/vald/pkg/index/job/correction/config" + stdeg "golang.org/x/sync/errgroup" ) type Corrector interface { @@ -107,10 +108,104 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { } func (c *correct) correct(ctx context.Context, addrs []string) (err error) { + if err := c.discoverer.GetClient().OrderedRange(ctx, addrs, + func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { + vc := vald.NewValdClient(conn) + stream, err := vc.StreamListObject(ctx, &payload.Object_List_Request{}) + if err != nil { + return err + } + + seg, ctx := stdeg.WithContext(ctx) + seg.SetLimit(100) // FIXME: server settingsをそのまま流用で良いのか?
+ + finalize := func() error { + err = seg.Wait() + if err != nil { + log.Errorf("err group returned error: %v", err) + return err + } + log.Infof("correction finished for agent %s", addr) + return nil + } + defer finalize() + + streamEnd := make(chan struct{}) + var once sync.Once + var mu sync.Mutex + // これをさらにerrgroupで囲みたくなるが、さすがに頭がおかしくなりそう + // 事前にRecvすべき件数はわかるのだからその回数だけfor文を回すようにする方がいいか + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-streamEnd: + return nil + default: + // TODO: when vald internal errgroup is changed to block when eg limitation is reached, + // switch to vald version of errgroup. + seg.Go(func() error { + mu.Lock() + // As long as we don't stream.Recv() from the stream, we do not consume the memory of the message. + // So by limiting the number of this errgroup.Go instances, we can limit the memory usage + // https://github.com/grpc/grpc-go/blob/33f9fa2e6e5bcf4cf8fe45133e23779ae6e43f6c/rpc_util.go#L795 + res, err := stream.Recv() + mu.Unlock() + + if errors.Is(err, io.EOF) { + log.Debugf("StreamListObject stream finished for agent %s", addr) + once.Do(func() { + close(streamEnd) + }) + return nil + } + if err != nil { + log.Errorf("StreamListObject stream finished unexpectedly: %v", err) + return err + } + + if res.GetVector() == nil { + st := res.GetStatus() + log.Error(st.GetCode(), st.GetMessage(), st.GetDetails()) + // continue + return nil + } + + log.Debugf("received object in StreamListObject: agent(%s), id(%s), timestamp(%v)", addr, res.GetVector().GetId(), res.GetVector().GetTimestamp()) + if err := c.checkConsistency( + ctx, + &vectorReplica{ + addr: addr, + vec: res.GetVector(), + }, + addrs, + ); err != nil { + // TODO: valdとstdでerrorの処理が違うので注意 + // (valdはerrが着信するまでにスタートしていた処理は行われる) + // (stdはerrが着信すると他は全て止まる) + log.Errorf("failed to check consistency: %v", err) + return nil // continue other processes + } + + return nil + }) + } + } + }, + ); err != nil { + log.Errorf("failed to range over agents(%v): %v", 
addrs, err) + return err + } + + return nil +} + +// stream.Recvぶん回しバージョン。メモリ使用率が高くなる可能性があるのでできれば避けたい。パフォーマンスとしては理論上最も良いはず +func (c *correct) correct2(ctx context.Context, addrs []string) (err error) { if err := c.discoverer.GetClient().OrderedRange(ctx, addrs, func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { eg, ctx := errgroup.New(ctx) - eg.Limitation(c.cfg.Server.GetGRPCStreamConcurrency()) + eg.Limitation(100) vc := vald.NewValdClient(conn) stream, err := vc.StreamListObject(ctx, &payload.Object_List_Request{}) From 8f5bcf05dc66dfa8fc2b109f0f812dd4d416748d Mon Sep 17 00:00:00 2001 From: ykadowak Date: Fri, 18 Aug 2023 01:51:48 +0000 Subject: [PATCH 012/101] set eg limit from config --- pkg/index/job/correction/service/corrector.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index efecd2f4af..4894965489 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -117,7 +117,7 @@ func (c *correct) correct(ctx context.Context, addrs []string) (err error) { } seg, ctx := stdeg.WithContext(ctx) - seg.SetLimit(100) // FIXME: server settingsをそのまま流用で良いのか? + seg.SetLimit(c.cfg.Server.GetGRPCStreamConcurrency()) // FIXME: server settingsをそのまま流用で良いのか? 
finalize := func() error { err = seg.Wait() @@ -205,7 +205,7 @@ func (c *correct) correct2(ctx context.Context, addrs []string) (err error) { if err := c.discoverer.GetClient().OrderedRange(ctx, addrs, func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { eg, ctx := errgroup.New(ctx) - eg.Limitation(100) + eg.Limitation(c.cfg.Server.GetGRPCStreamConcurrency()) vc := vald.NewValdClient(conn) stream, err := vc.StreamListObject(ctx, &payload.Object_List_Request{}) From 065e9e2c909b34576bcdf8d79c086d95086c1e48 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Fri, 18 Aug 2023 02:32:20 +0000 Subject: [PATCH 013/101] add stream list concurrency config --- cmd/index/job/correction/sample.yaml | 1 + internal/config/corrector.go | 12 ++++++++++++ pkg/index/job/correction/service/corrector.go | 7 +++++-- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/cmd/index/job/correction/sample.yaml b/cmd/index/job/correction/sample.yaml index 44dabf9c97..e6a549a003 100644 --- a/cmd/index/job/correction/sample.yaml +++ b/cmd/index/job/correction/sample.yaml @@ -78,6 +78,7 @@ corrector: agent_dns: vald-agent-ngt.default.svc.cluster.local agent_namespace: "default" node_name: "" + stream_list_concurrency: 100 discoverer: duration: 500ms client: diff --git a/internal/config/corrector.go b/internal/config/corrector.go index 25817b2ecd..6e4064a09d 100644 --- a/internal/config/corrector.go +++ b/internal/config/corrector.go @@ -36,6 +36,10 @@ type Corrector struct { // NodeName represents node name NodeName string `json:"node_name" yaml:"node_name"` + // StreamListConcurrency represents the stream concurrency for the StreamListObject rpc client + // this directly affects the memory usage of this job + StreamListConcurrency int `json:"stream_list_concurrency" yaml:"stream_list_concurrency"` + // Discoverer represent agent discoverer service configuration Discoverer *DiscovererClient `json:"discoverer" yaml:"discoverer"` } @@ -52,3 +56,11 @@ func (c
*Corrector) Bind() *Corrector { } return c } + +// GetStreamListConcurrency returns the StreamListConcurrency field value if set, -1 otherwise, which means no limit. +func (c *Corrector) GetStreamListConcurrency() int { + if c != nil { + return c.StreamListConcurrency + } + return -1 +} diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 4894965489..96a41fa120 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -117,7 +117,10 @@ func (c *correct) correct(ctx context.Context, addrs []string) (err error) { } seg, ctx := stdeg.WithContext(ctx) - seg.SetLimit(c.cfg.Server.GetGRPCStreamConcurrency()) // FIXME: server settingsをそのまま流用で良いのか? + concurrency := c.cfg.Corrector.GetStreamListConcurrency() + seg.SetLimit(concurrency) // FIXME: server settingsをそのまま流用で良いのか? + + log.Infof("starting correction for agent %s, concurrency: %d", addr, concurrency) finalize := func() error { err = seg.Wait() @@ -205,7 +208,7 @@ func (c *correct) correct2(ctx context.Context, addrs []string) (err error) { if err := c.discoverer.GetClient().OrderedRange(ctx, addrs, func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { eg, ctx := errgroup.New(ctx) - eg.Limitation(c.cfg.Server.GetGRPCStreamConcurrency()) + eg.Limitation(c.cfg.Corrector.GetStreamListConcurrency()) vc := vald.NewValdClient(conn) stream, err := vc.StreamListObject(ctx, &payload.Object_List_Request{}) From 8562a99f17599e8f2c202f2e3c45102ec5ffd6a5 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Mon, 21 Aug 2023 09:24:57 +0000 Subject: [PATCH 014/101] implement index id caching --- pkg/index/job/correction/service/corrector.go | 87 ++++--------------- 1 file changed, 17 insertions(+), 70 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 96a41fa120..4d87c28cb6 100644 --- 
a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -24,7 +24,6 @@ import ( "github.com/vdaas/vald/apis/grpc/v1/payload" "github.com/vdaas/vald/apis/grpc/v1/vald" "github.com/vdaas/vald/internal/client/v1/client/discoverer" - "github.com/vdaas/vald/internal/errgroup" "github.com/vdaas/vald/internal/errors" "github.com/vdaas/vald/internal/log" "github.com/vdaas/vald/internal/net/grpc" @@ -46,12 +45,15 @@ type correct struct { indexInfos valdsync.Map[string, *payload.Info_Index_Count] uuidsCount uint32 uncommittedUUIDsCount uint32 + checkedId map[string]struct{} // TODO: use mmap if necessary + rwmu sync.RWMutex } func New(cfg *config.Data, discoverer discoverer.Client) (Corrector, error) { return &correct{ cfg: cfg, discoverer: discoverer, + checkedId: make(map[string]struct{}), }, nil } @@ -118,7 +120,7 @@ func (c *correct) correct(ctx context.Context, addrs []string) (err error) { seg, ctx := stdeg.WithContext(ctx) concurrency := c.cfg.Corrector.GetStreamListConcurrency() - seg.SetLimit(concurrency) // FIXME: server settingsをそのまま流用で良いのか? 
+ seg.SetLimit(concurrency) log.Infof("starting correction for agent %s, concurrency: %d", addr, concurrency) @@ -175,6 +177,16 @@ func (c *correct) correct(ctx context.Context, addrs []string) (err error) { } log.Debugf("received object in StreamListObject: agent(%s), id(%s), timestamp(%v)", addr, res.GetVector().GetId(), res.GetVector().GetTimestamp()) + + // check if the index is already checked + c.rwmu.RLock() + _, ok := c.checkedId[res.GetVector().GetId()] + c.rwmu.RUnlock() + if ok { + // already checked index + return nil + } + if err := c.checkConsistency( ctx, &vectorReplica{ @@ -190,75 +202,10 @@ func (c *correct) correct(ctx context.Context, addrs []string) (err error) { return nil // continue other processes } - return nil - }) - } - } - }, - ); err != nil { - log.Errorf("failed to range over agents(%v): %v", addrs, err) - return err - } - - return nil -} - -// stream.Recvぶん回しバージョン。メモリ使用率が高くなる可能性があるのでできれば避けたい。パフォーマンスとしては理論上最も良いはず -func (c *correct) correct2(ctx context.Context, addrs []string) (err error) { - if err := c.discoverer.GetClient().OrderedRange(ctx, addrs, - func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { - eg, ctx := errgroup.New(ctx) - eg.Limitation(c.cfg.Corrector.GetStreamListConcurrency()) - - vc := vald.NewValdClient(conn) - stream, err := vc.StreamListObject(ctx, &payload.Object_List_Request{}) - if err != nil { - return err - } + c.rwmu.Lock() + c.checkedId[res.GetVector().GetId()] = struct{}{} + c.rwmu.Unlock() - finalize := func() error { - err = eg.Wait() - if err != nil { - log.Errorf("err group returned error: %v", err) - return err - } - return nil - } - - for { - select { - case <-ctx.Done(): - return ctx.Err() - default: - res, err := stream.Recv() - if errors.Is(err, io.EOF) { - log.Debugf("StreamListObject stream finished for agent %s", addr) - return finalize() - } - if err != nil { - log.Errorf("StreamListObject stream finished unexpectedly: %v", err) - return err - } - - 
if res.GetVector() == nil { - st := res.GetStatus() - log.Error(st.GetCode(), st.GetMessage(), st.GetDetails()) - continue - } - - log.Debugf("received object in StreamListObject: agent(%s), id(%s), timestamp(%v)", addr, res.GetVector().GetId(), res.GetVector().GetTimestamp()) - eg.Go(func() error { - if err := c.checkConsistency( - ctx, - &vectorReplica{ - addr: addr, - vec: res.GetVector(), - }, - addrs, - ); err != nil { - log.Errorf("failed to check consistency: %v", err) - return err - } return nil }) } From 81345f3beb97b12835bd3a24ea019b1a737b0b9c Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 22 Aug 2023 01:48:42 +0000 Subject: [PATCH 015/101] add config to use cache or not --- cmd/index/job/correction/sample.yaml | 1 + internal/config/corrector.go | 3 + pkg/index/job/correction/service/corrector.go | 116 +++++++++++++++++- 3 files changed, 117 insertions(+), 3 deletions(-) diff --git a/cmd/index/job/correction/sample.yaml b/cmd/index/job/correction/sample.yaml index e6a549a003..117ddd99c7 100644 --- a/cmd/index/job/correction/sample.yaml +++ b/cmd/index/job/correction/sample.yaml @@ -79,6 +79,7 @@ corrector: agent_namespace: "default" node_name: "" stream_list_concurrency: 100 + use_cache: false discoverer: duration: 500ms client: diff --git a/internal/config/corrector.go b/internal/config/corrector.go index 6e4064a09d..dd0d27fd43 100644 --- a/internal/config/corrector.go +++ b/internal/config/corrector.go @@ -42,6 +42,9 @@ type Corrector struct { // Discoverer represent agent discoverer service configuration Discoverer *DiscovererClient `json:"discoverer" yaml:"discoverer"` + + // FIXME: Debug + UseCache bool `json:"use_cache" yaml:"use_cache"` } // Bind binds the actual data from the Indexer receiver field. 
diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 4d87c28cb6..b3e9f1c4e4 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -84,10 +84,23 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { // This blocks. Should we run with errorgroup? log.Info("starting correction...") - if err := c.correct(ctx, addrs); err != nil { - log.Errorf("there's some errors while correction: %v", err) - return nil, err + if c.cfg.Corrector.UseCache{ + log.Info("with cache...") + if err := c.correctWithCache(ctx, addrs); err != nil { + log.Errorf("there's some errors while correction: %v", err) + return nil, err + } + } else { + log.Info("without cache...") + if err := c.correct(ctx, addrs); err != nil { + log.Errorf("there's some errors while correction: %v", err) + return nil, err + } } + // if err := c.correct(ctx, addrs); err != nil { + // log.Errorf("there's some errors while correction: %v", err) + // return nil, err + // } log.Info("correction finished successfully") // ech := make(chan error, 100) @@ -110,6 +123,102 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { } func (c *correct) correct(ctx context.Context, addrs []string) (err error) { + if err := c.discoverer.GetClient().OrderedRange(ctx, addrs, + func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { + vc := vald.NewValdClient(conn) + stream, err := vc.StreamListObject(ctx, &payload.Object_List_Request{}) + if err != nil { + return err + } + + seg, ctx := stdeg.WithContext(ctx) + concurrency := c.cfg.Corrector.GetStreamListConcurrency() + seg.SetLimit(concurrency) // FIXME: server settingsをそのまま流用で良いのか? 
+ + log.Infof("starting correction for agent %s, concurrency: %d", addr, concurrency) + + finalize := func() error { + err = seg.Wait() + if err != nil { + log.Errorf("err group returned error: %v", err) + return err + } + log.Infof("correction finished for agent %s", addr) + return nil + } + defer finalize() + + streamEnd := make(chan struct{}) + var once sync.Once + var mu sync.Mutex + // これをさらにerrgroupで囲みたくなるが、さすがに頭がおかしくなりそう + // 事前にRecvすべき件数はわかるのだからその回数だけfor文を回すようにする方がいいか + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-streamEnd: + return nil + default: + // TODO: when vald internal errgroup is changed to block when eg limitation is reached, + // switch to vald version of errgroup. + seg.Go(func() error { + mu.Lock() + // As long as we don't stream.Recv() from the stream, we do not consume the memory of the message. + // So by limiting the number of this errgroup.Go instances, we can limit the memory usage + // https://github.com/grpc/grpc-go/blob/33f9fa2e6e5bcf4cf8fe45133e23779ae6e43f6c/rpc_util.go#L795 + res, err := stream.Recv() + mu.Unlock() + + if errors.Is(err, io.EOF) { + log.Debugf("StreamListObject stream finished for agent %s", addr) + once.Do(func() { + close(streamEnd) + }) + return nil + } + if err != nil { + log.Errorf("StreamListObject stream finished unexpectedly: %v", err) + return err + } + + if res.GetVector() == nil { + st := res.GetStatus() + log.Error(st.GetCode(), st.GetMessage(), st.GetDetails()) + // continue + return nil + } + + log.Debugf("received object in StreamListObject: agent(%s), id(%s), timestamp(%v)", addr, res.GetVector().GetId(), res.GetVector().GetTimestamp()) + if err := c.checkConsistency( + ctx, + &vectorReplica{ + addr: addr, + vec: res.GetVector(), + }, + addrs, + ); err != nil { + // TODO: valdとstdでerrorの処理が違うので注意 + // (valdはerrが着信するまでにスタートしていた処理は行われる) + // (stdはerrが着信すると他は全て止まる) + log.Errorf("failed to check consistency: %v", err) + return nil // continue other processes + } + + return nil + }) + 
} + } + }, + ); err != nil { + log.Errorf("failed to range over agents(%v): %v", addrs, err) + return err + } + + return nil +} + +func (c *correct) correctWithCache(ctx context.Context, addrs []string) (err error) { if err := c.discoverer.GetClient().OrderedRange(ctx, addrs, func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { vc := vald.NewValdClient(conn) @@ -187,6 +296,7 @@ func (c *correct) correct(ctx context.Context, addrs []string) (err error) { return nil } + // FIXME: When caching mode, already checked agent can be omitted here if err := c.checkConsistency( ctx, &vectorReplica{ From 77c33f0c93a78dc3cd42a3be9c8376c8b218f04e Mon Sep 17 00:00:00 2001 From: "deepsource-autofix[bot]" <62050782+deepsource-autofix[bot]@users.noreply.github.com> Date: Tue, 22 Aug 2023 01:49:01 +0000 Subject: [PATCH 016/101] style: Format code with prettier and gofumpt --- pkg/index/job/correction/service/corrector.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index b3e9f1c4e4..2d0a755f8b 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -84,7 +84,7 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { // This blocks. Should we run with errorgroup? 
log.Info("starting correction...") - if c.cfg.Corrector.UseCache{ + if c.cfg.Corrector.UseCache { log.Info("with cache...") if err := c.correctWithCache(ctx, addrs); err != nil { log.Errorf("there's some errors while correction: %v", err) From 72c3956647792a97b6602f37ecf4fcee9a6389e4 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 23 Aug 2023 09:06:11 +0000 Subject: [PATCH 017/101] refactor availableAddrs --- pkg/index/job/correction/service/corrector.go | 44 ++++++++++--------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 2d0a755f8b..8d339ed7eb 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -63,6 +63,9 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { return nil, err } + // addrs is sorted by the memory usage of each agent(descending order) + // this is decending because it's supposed to be used for index manager to decide + // which pod to make a create index rpc(higher memory, first to commit) addrs := c.discoverer.GetAddrs(ctx) log.Debug("agent addrs found:", addrs) @@ -146,7 +149,6 @@ func (c *correct) correct(ctx context.Context, addrs []string) (err error) { log.Infof("correction finished for agent %s", addr) return nil } - defer finalize() streamEnd := make(chan struct{}) var once sync.Once @@ -156,9 +158,9 @@ func (c *correct) correct(ctx context.Context, addrs []string) (err error) { for { select { case <-ctx.Done(): - return ctx.Err() + return finalize() case <-streamEnd: - return nil + return finalize() default: // TODO: when vald internal errgroup is changed to block when eg limitation is reached, // switch to vald version of errgroup. 
@@ -247,7 +249,6 @@ func (c *correct) correctWithCache(ctx context.Context, addrs []string) (err err streamEnd := make(chan struct{}) var once sync.Once var mu sync.Mutex - // これをさらにerrgroupで囲みたくなるが、さすがに頭がおかしくなりそう // 事前にRecvすべき件数はわかるのだからその回数だけfor文を回すようにする方がいいか for { select { @@ -336,19 +337,18 @@ type vectorReplica struct { // Validate len(addrs) >= 2 before calling this function func (c *correct) checkConsistency(ctx context.Context, targetReplica *vectorReplica, addrs []string) error { - // copy the addrs slice but delete the curAgentAddr - otherAddrs := make([]string, 0, len(addrs)-1) - availableAddrs := make(map[string]struct{}) + // availableAddrs is the agents' addr that doesn't have the target replica thus is available to insert the replica + // to fix the index replica number if required. + availableAddrs := make([]string, 0, len(addrs)-1) for _, addr := range addrs { if addr != targetReplica.addr { - otherAddrs = append(otherAddrs, addr) - availableAddrs[addr] = struct{}{} + availableAddrs = append(availableAddrs, addr) } } - foundReplicas := make([]*vectorReplica, 0, len(otherAddrs)) + foundReplicas := make([]*vectorReplica, 0, len(availableAddrs)) var mu sync.Mutex - if err := c.discoverer.GetClient().OrderedRangeConcurrent(ctx, otherAddrs, len(otherAddrs), + if err := c.discoverer.GetClient().OrderedRangeConcurrent(ctx, availableAddrs, len(availableAddrs), func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { select { case <-ctx.Done(): @@ -375,12 +375,18 @@ func (c *correct) checkConsistency(ctx context.Context, targetReplica *vectorRep } log.Debugf("object found: agent(%s), id(%v), timestamp(%v)", addr, v.GetId(), v.GetTimestamp()) + mu.Lock() foundReplicas = append(foundReplicas, &vectorReplica{ addr: addr, vec: v, }) - delete(availableAddrs, addr) + for i, a := range availableAddrs { + if a == addr { + availableAddrs = availableAddrs[:i+copy(availableAddrs[i:], availableAddrs[i+1:])] + break + } + } 
mu.Unlock() return nil @@ -445,7 +451,7 @@ func (c *correct) correctReplica( targetReplica *vectorReplica, foundReplicas []*vectorReplica, replica int, - availableAddrs map[string]struct{}, + availableAddrs []string, ) error { // diff < 0 means there is less replica than the correct number diff := replica - c.cfg.Gateway.IndexReplica @@ -464,13 +470,9 @@ func (c *correct) correctReplica( return fmt.Errorf("no available agent to insert replica") } - // availableAddrsからdiff個選んでinsert処理する - // TODO: どのagentにinsertするのが最適化のロジックを考える - // とりあえずはランダムに入れとく - for addr := range availableAddrs { - if diff == 0 { - break - } + // inserting with the reverse order of availableAddrs since the last agent has the lowest memory usage + for i := len(availableAddrs) - 1; i >= 0 && diff < 0; i-- { + addr := availableAddrs[i] log.Infof("inserting replica to %s", addr) if err := c.insertObject(ctx, addr, targetReplica.vec); err != nil { log.Errorf("failed to insert object to agent(%s): %v", addr, err) @@ -496,7 +498,7 @@ func (c *correct) correctReplica( diff-- } - // delte from others + // delte from others if there's more to delete for _, replica := range foundReplicas { if diff == 0 { break From 5ffd9b3bd3493a542be8b8314f8f5d4010174ee2 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Thu, 24 Aug 2023 06:24:46 +0000 Subject: [PATCH 018/101] add kvs range duration --- .../metrics/agent/core/ngt/ngt.go | 24 +++++++++++++++++++ pkg/agent/core/ngt/service/kvs/kvs.go | 20 ++++++++++++++++ pkg/agent/core/ngt/service/ngt.go | 5 ++++ 3 files changed, 49 insertions(+) diff --git a/internal/observability/metrics/agent/core/ngt/ngt.go b/internal/observability/metrics/agent/core/ngt/ngt.go index a7016d3512..b7adb23c2d 100644 --- a/internal/observability/metrics/agent/core/ngt/ngt.go +++ b/internal/observability/metrics/agent/core/ngt/ngt.go @@ -49,6 +49,9 @@ const ( brokenIndexStoreCountMetricsName = "agent_core_ngt_broken_index_store_count" brokenIndexStoreCountMetricsDescription = "How many broken 
index generations have been stored" + + kvsRangeDurationMetricsName = "agent_core_ngt_kvs_range_duration" + kvsRangeDurationMetricsDescription = "The duration of the kvs range method" ) type ngtMetrics struct { @@ -143,6 +146,15 @@ func (n *ngtMetrics) View() ([]*metrics.View, error) { return nil, err } + kvsRangeDuration, err := view.New( + view.MatchInstrumentName(kvsRangeDurationMetricsName), + view.WithSetDescription(kvsRangeDurationMetricsDescription), + view.WithSetAggregation(aggregation.LastValue{}), + ) + if err != nil { + return nil, err + } + return []*metrics.View{ &indexCount, &uncommittedIndexCount, @@ -153,6 +165,7 @@ func (n *ngtMetrics) View() ([]*metrics.View, error) { &isIndexing, &isSaving, &brokenIndexCount, + &kvsRangeDuration, }, nil } @@ -238,6 +251,15 @@ func (n *ngtMetrics) Register(m metrics.Meter) error { return err } + kvsRangeDuration, err := m.AsyncInt64().Gauge( + kvsRangeDurationMetricsName, + metrics.WithDescription(kvsRangeDurationMetricsDescription), + metrics.WithUnit(metrics.Dimensionless), + ) + if err != nil { + return err + } + return m.RegisterCallback( []metrics.AsynchronousInstrument{ indexCount, @@ -249,6 +271,7 @@ func (n *ngtMetrics) Register(m metrics.Meter) error { isIndexing, isSaving, brokenIndexCount, + kvsRangeDuration, }, func(ctx context.Context) { var indexing int64 @@ -270,6 +293,7 @@ func (n *ngtMetrics) Register(m metrics.Meter) error { isIndexing.Observe(ctx, int64(indexing)) isSaving.Observe(ctx, int64(saving)) brokenIndexCount.Observe(ctx, int64(n.ngt.BrokenIndexCount())) + kvsRangeDuration.Observe(ctx, n.ngt.KvsRangeDuration()) }, ) } diff --git a/pkg/agent/core/ngt/service/kvs/kvs.go b/pkg/agent/core/ngt/service/kvs/kvs.go index 7ee078a08b..a2d7ecde77 100644 --- a/pkg/agent/core/ngt/service/kvs/kvs.go +++ b/pkg/agent/core/ngt/service/kvs/kvs.go @@ -19,6 +19,7 @@ package kvs import ( "context" "sync/atomic" + "time" "github.com/vdaas/vald/internal/safety" "github.com/vdaas/vald/internal/sync" @@ -36,6 
+37,7 @@ type BidiMap interface { Range(ctx context.Context, f func(string, uint32, int64) bool) Len() uint64 Close() error + RangeDuration() int64 } type valueStructOu struct { @@ -65,6 +67,11 @@ const ( // mask = 0xFFF. ) +var ( + tmu sync.RWMutex + rangeDur int64 +) + // New returns the bidi that satisfies the BidiMap interface. func New(opts ...Option) BidiMap { b := &bidi{ @@ -151,6 +158,8 @@ func (b *bidi) DeleteInverse(val uint32) (key string, ok bool) { // Range retrieves all set keys and values and calls the callback function f. func (b *bidi) Range(ctx context.Context, f func(string, uint32, int64) bool) { + start := time.Now() + var wg sync.WaitGroup for i := range b.uo { idx := i @@ -169,6 +178,11 @@ func (b *bidi) Range(ctx context.Context, f func(string, uint32, int64) bool) { })) } wg.Wait() + + dur := time.Since(start).Nanoseconds() + tmu.Lock() + rangeDur = dur + tmu.Unlock() } // Len returns the length of the cache that is set in the bidi. @@ -192,3 +206,9 @@ func getShardID(key string) (id uint64) { } return xxh3.HashString(key) & mask } + +func (b *bidi) RangeDuration() int64 { + tmu.RLock() + defer tmu.RUnlock() + return rangeDur +} diff --git a/pkg/agent/core/ngt/service/ngt.go b/pkg/agent/core/ngt/service/ngt.go index 1e1a75d291..84caf77c97 100644 --- a/pkg/agent/core/ngt/service/ngt.go +++ b/pkg/agent/core/ngt/service/ngt.go @@ -83,6 +83,7 @@ type NGT interface { GetDimensionSize() int Close(ctx context.Context) error BrokenIndexCount() uint64 + KvsRangeDuration() int64 } type ngt struct { @@ -1747,3 +1748,7 @@ func (n *ngt) toSearchResponse(sr []core.SearchResult) (res *payload.Search_Resp } return res, nil } + +func (n *ngt) KvsRangeDuration() int64 { + return n.kvs.RangeDuration() +} From 5d3c6c72ac7ae62afce1786ab5fe6624cbfa34ac Mon Sep 17 00:00:00 2001 From: ykadowak Date: Fri, 25 Aug 2023 04:27:39 +0000 Subject: [PATCH 019/101] add leftAgentAddrs for performance --- pkg/index/job/correction/service/corrector.go | 57 ++++++++++++------- 
1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 8d339ed7eb..c34ee63916 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -42,6 +42,7 @@ type Corrector interface { type correct struct { cfg *config.Data discoverer discoverer.Client + agentAddrs []string indexInfos valdsync.Map[string, *payload.Info_Index_Count] uuidsCount uint32 uncommittedUUIDsCount uint32 @@ -66,10 +67,10 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { // addrs is sorted by the memory usage of each agent(descending order) // this is decending because it's supposed to be used for index manager to decide // which pod to make a create index rpc(higher memory, first to commit) - addrs := c.discoverer.GetAddrs(ctx) - log.Debug("agent addrs found:", addrs) + c.agentAddrs = c.discoverer.GetAddrs(ctx) + log.Debug("agent addrs found:", c.agentAddrs) - if l := len(addrs); l <= 1 { + if l := len(c.agentAddrs); l <= 1 { log.Warn("only %d agent found, there must be more than two agents for correction to happen", l) return nil, err } @@ -89,13 +90,13 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { log.Info("starting correction...") if c.cfg.Corrector.UseCache { log.Info("with cache...") - if err := c.correctWithCache(ctx, addrs); err != nil { + if err := c.correctWithCache(ctx); err != nil { log.Errorf("there's some errors while correction: %v", err) return nil, err } } else { log.Info("without cache...") - if err := c.correct(ctx, addrs); err != nil { + if err := c.correct(ctx); err != nil { log.Errorf("there's some errors while correction: %v", err) return nil, err } @@ -125,8 +126,8 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { return dech, nil } -func (c *correct) correct(ctx context.Context, addrs []string) (err error) { - if err := 
c.discoverer.GetClient().OrderedRange(ctx, addrs, +func (c *correct) correct(ctx context.Context) (err error) { + if err := c.discoverer.GetClient().OrderedRange(ctx, c.agentAddrs, func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { vc := vald.NewValdClient(conn) stream, err := vc.StreamListObject(ctx, &payload.Object_List_Request{}) @@ -198,7 +199,7 @@ func (c *correct) correct(ctx context.Context, addrs []string) (err error) { addr: addr, vec: res.GetVector(), }, - addrs, + c.agentAddrs, // FIXME: no cache pattern always have to check all the agents ); err != nil { // TODO: valdとstdでerrorの処理が違うので注意 // (valdはerrが着信するまでにスタートしていた処理は行われる) @@ -213,16 +214,30 @@ func (c *correct) correct(ctx context.Context, addrs []string) (err error) { } }, ); err != nil { - log.Errorf("failed to range over agents(%v): %v", addrs, err) + log.Errorf("failed to range over agents(%v): %v", c.agentAddrs, err) return err } return nil } -func (c *correct) correctWithCache(ctx context.Context, addrs []string) (err error) { - if err := c.discoverer.GetClient().OrderedRange(ctx, addrs, +func (c *correct) correctWithCache(ctx context.Context) (err error) { + // leftAgentAddrs is the agents' addr that hasn't been corrected yet. + // This is used to know which agents possibly have the same index as the target replica. + // We can say this because, thanks to caching, there is no way that the target replica is + // in the agent that has already been corrected. 
+ leftAgentAddrs := make([]string, len(c.agentAddrs)) + n := copy(leftAgentAddrs, c.agentAddrs) + if n != len(c.agentAddrs) { + return fmt.Errorf("failed to copy agentAddrs") + } + + if err := c.discoverer.GetClient().OrderedRange(ctx, c.agentAddrs, func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { + // current address is the leftAgentAddrs[0] because this is OrderedRange and + // leftAgentAddrs is copied from c.agentAddrs + leftAgentAddrs = leftAgentAddrs[1:] + vc := vald.NewValdClient(conn) stream, err := vc.StreamListObject(ctx, &payload.Object_List_Request{}) if err != nil { @@ -297,14 +312,13 @@ func (c *correct) correctWithCache(ctx context.Context, addrs []string) (err err return nil } - // FIXME: When caching mode, already checked agent can be omitted here if err := c.checkConsistency( ctx, &vectorReplica{ addr: addr, vec: res.GetVector(), }, - addrs, + leftAgentAddrs, ); err != nil { // TODO: valdとstdでerrorの処理が違うので注意 // (valdはerrが着信するまでにスタートしていた処理は行われる) @@ -323,7 +337,7 @@ func (c *correct) correctWithCache(ctx context.Context, addrs []string) (err err } }, ); err != nil { - log.Errorf("failed to range over agents(%v): %v", addrs, err) + log.Errorf("failed to range over agents(%v): %v", c.agentAddrs, err) return err } @@ -336,11 +350,11 @@ type vectorReplica struct { } // Validate len(addrs) >= 2 before calling this function -func (c *correct) checkConsistency(ctx context.Context, targetReplica *vectorReplica, addrs []string) error { +func (c *correct) checkConsistency(ctx context.Context, targetReplica *vectorReplica, leftAgentAddrs []string) error { // availableAddrs is the agents' addr that doesn't have the target replica thus is available to insert the replica // to fix the index replica number if required. 
- availableAddrs := make([]string, 0, len(addrs)-1) - for _, addr := range addrs { + availableAddrs := make([]string, 0, len(c.agentAddrs)-1) + for _, addr := range c.agentAddrs { if addr != targetReplica.addr { availableAddrs = append(availableAddrs, addr) } @@ -348,7 +362,7 @@ func (c *correct) checkConsistency(ctx context.Context, targetReplica *vectorRep foundReplicas := make([]*vectorReplica, 0, len(availableAddrs)) var mu sync.Mutex - if err := c.discoverer.GetClient().OrderedRangeConcurrent(ctx, availableAddrs, len(availableAddrs), + if err := c.discoverer.GetClient().OrderedRangeConcurrent(ctx, leftAgentAddrs, len(leftAgentAddrs), func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { select { case <-ctx.Done(): @@ -401,8 +415,7 @@ func (c *correct) checkConsistency(ctx context.Context, targetReplica *vectorRep } // check replica number - replica := len(foundReplicas) + 1 - if err := c.correctReplica(ctx, targetReplica, foundReplicas, replica, availableAddrs); err != nil { + if err := c.correctReplica(ctx, targetReplica, foundReplicas, availableAddrs); err != nil { return fmt.Errorf("failed to fix index replica: %w", err) } @@ -450,11 +463,11 @@ func (c *correct) correctReplica( ctx context.Context, targetReplica *vectorReplica, foundReplicas []*vectorReplica, - replica int, availableAddrs []string, ) error { // diff < 0 means there is less replica than the correct number - diff := replica - c.cfg.Gateway.IndexReplica + existReplica := len(foundReplicas) + 1 + diff := existReplica - c.cfg.Gateway.IndexReplica if diff == 0 { // replica number is correct return nil From 3b3710bf9fcd364003c49420a0c58d73e93721a3 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Fri, 25 Aug 2023 04:53:30 +0000 Subject: [PATCH 020/101] Revert "add kvs range duration" This reverts commit 5b647be6ccc0f9be7e78e38c89ea8897fa3ee574. 
--- .../metrics/agent/core/ngt/ngt.go | 24 ------------------- pkg/agent/core/ngt/service/kvs/kvs.go | 20 ---------------- pkg/agent/core/ngt/service/ngt.go | 5 ---- 3 files changed, 49 deletions(-) diff --git a/internal/observability/metrics/agent/core/ngt/ngt.go b/internal/observability/metrics/agent/core/ngt/ngt.go index b7adb23c2d..a7016d3512 100644 --- a/internal/observability/metrics/agent/core/ngt/ngt.go +++ b/internal/observability/metrics/agent/core/ngt/ngt.go @@ -49,9 +49,6 @@ const ( brokenIndexStoreCountMetricsName = "agent_core_ngt_broken_index_store_count" brokenIndexStoreCountMetricsDescription = "How many broken index generations have been stored" - - kvsRangeDurationMetricsName = "agent_core_ngt_kvs_range_duration" - kvsRangeDurationMetricsDescription = "The duration of the kvs range method" ) type ngtMetrics struct { @@ -146,15 +143,6 @@ func (n *ngtMetrics) View() ([]*metrics.View, error) { return nil, err } - kvsRangeDuration, err := view.New( - view.MatchInstrumentName(kvsRangeDurationMetricsName), - view.WithSetDescription(kvsRangeDurationMetricsDescription), - view.WithSetAggregation(aggregation.LastValue{}), - ) - if err != nil { - return nil, err - } - return []*metrics.View{ &indexCount, &uncommittedIndexCount, @@ -165,7 +153,6 @@ func (n *ngtMetrics) View() ([]*metrics.View, error) { &isIndexing, &isSaving, &brokenIndexCount, - &kvsRangeDuration, }, nil } @@ -251,15 +238,6 @@ func (n *ngtMetrics) Register(m metrics.Meter) error { return err } - kvsRangeDuration, err := m.AsyncInt64().Gauge( - kvsRangeDurationMetricsName, - metrics.WithDescription(kvsRangeDurationMetricsDescription), - metrics.WithUnit(metrics.Dimensionless), - ) - if err != nil { - return err - } - return m.RegisterCallback( []metrics.AsynchronousInstrument{ indexCount, @@ -271,7 +249,6 @@ func (n *ngtMetrics) Register(m metrics.Meter) error { isIndexing, isSaving, brokenIndexCount, - kvsRangeDuration, }, func(ctx context.Context) { var indexing int64 @@ -293,7 +270,6 @@ 
func (n *ngtMetrics) Register(m metrics.Meter) error { isIndexing.Observe(ctx, int64(indexing)) isSaving.Observe(ctx, int64(saving)) brokenIndexCount.Observe(ctx, int64(n.ngt.BrokenIndexCount())) - kvsRangeDuration.Observe(ctx, n.ngt.KvsRangeDuration()) }, ) } diff --git a/pkg/agent/core/ngt/service/kvs/kvs.go b/pkg/agent/core/ngt/service/kvs/kvs.go index a2d7ecde77..7ee078a08b 100644 --- a/pkg/agent/core/ngt/service/kvs/kvs.go +++ b/pkg/agent/core/ngt/service/kvs/kvs.go @@ -19,7 +19,6 @@ package kvs import ( "context" "sync/atomic" - "time" "github.com/vdaas/vald/internal/safety" "github.com/vdaas/vald/internal/sync" @@ -37,7 +36,6 @@ type BidiMap interface { Range(ctx context.Context, f func(string, uint32, int64) bool) Len() uint64 Close() error - RangeDuration() int64 } type valueStructOu struct { @@ -67,11 +65,6 @@ const ( // mask = 0xFFF. ) -var ( - tmu sync.RWMutex - rangeDur int64 -) - // New returns the bidi that satisfies the BidiMap interface. func New(opts ...Option) BidiMap { b := &bidi{ @@ -158,8 +151,6 @@ func (b *bidi) DeleteInverse(val uint32) (key string, ok bool) { // Range retrieves all set keys and values and calls the callback function f. func (b *bidi) Range(ctx context.Context, f func(string, uint32, int64) bool) { - start := time.Now() - var wg sync.WaitGroup for i := range b.uo { idx := i @@ -178,11 +169,6 @@ func (b *bidi) Range(ctx context.Context, f func(string, uint32, int64) bool) { })) } wg.Wait() - - dur := time.Since(start).Nanoseconds() - tmu.Lock() - rangeDur = dur - tmu.Unlock() } // Len returns the length of the cache that is set in the bidi. 
@@ -206,9 +192,3 @@ func getShardID(key string) (id uint64) { } return xxh3.HashString(key) & mask } - -func (b *bidi) RangeDuration() int64 { - tmu.RLock() - defer tmu.RUnlock() - return rangeDur -} diff --git a/pkg/agent/core/ngt/service/ngt.go b/pkg/agent/core/ngt/service/ngt.go index 84caf77c97..1e1a75d291 100644 --- a/pkg/agent/core/ngt/service/ngt.go +++ b/pkg/agent/core/ngt/service/ngt.go @@ -83,7 +83,6 @@ type NGT interface { GetDimensionSize() int Close(ctx context.Context) error BrokenIndexCount() uint64 - KvsRangeDuration() int64 } type ngt struct { @@ -1748,7 +1747,3 @@ func (n *ngt) toSearchResponse(sr []core.SearchResult) (res *payload.Search_Resp } return res, nil } - -func (n *ngt) KvsRangeDuration() int64 { - return n.kvs.RangeDuration() -} From 6b3934edcee815d737f9574d5d268b941af4f782 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Mon, 28 Aug 2023 02:05:36 +0000 Subject: [PATCH 021/101] refactor --- pkg/index/job/correction/service/corrector.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index c34ee63916..af230fd65a 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -154,8 +154,8 @@ func (c *correct) correct(ctx context.Context) (err error) { streamEnd := make(chan struct{}) var once sync.Once var mu sync.Mutex - // これをさらにerrgroupで囲みたくなるが、さすがに頭がおかしくなりそう - // 事前にRecvすべき件数はわかるのだからその回数だけfor文を回すようにする方がいいか + // maybe just iterate through the number of indexes is ok? 
+ // that way, we don't have to use this `streamEnd` channel for { select { case <-ctx.Done(): @@ -304,8 +304,9 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { log.Debugf("received object in StreamListObject: agent(%s), id(%s), timestamp(%v)", addr, res.GetVector().GetId(), res.GetVector().GetTimestamp()) // check if the index is already checked + id := res.GetVector().GetId() c.rwmu.RLock() - _, ok := c.checkedId[res.GetVector().GetId()] + _, ok := c.checkedId[id] c.rwmu.RUnlock() if ok { // already checked index @@ -328,7 +329,7 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { } c.rwmu.Lock() - c.checkedId[res.GetVector().GetId()] = struct{}{} + c.checkedId[id] = struct{}{} c.rwmu.Unlock() return nil From bc591aef9e87a0520734a630c431b91729776e43 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Mon, 28 Aug 2023 05:59:13 +0000 Subject: [PATCH 022/101] fix without cache bug --- pkg/index/job/correction/service/corrector.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index af230fd65a..bec861fff2 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -137,7 +137,7 @@ func (c *correct) correct(ctx context.Context) (err error) { seg, ctx := stdeg.WithContext(ctx) concurrency := c.cfg.Corrector.GetStreamListConcurrency() - seg.SetLimit(concurrency) // FIXME: server settingsをそのまま流用で良いのか? + seg.SetLimit(concurrency) log.Infof("starting correction for agent %s, concurrency: %d", addr, concurrency) @@ -365,6 +365,11 @@ func (c *correct) checkConsistency(ctx context.Context, targetReplica *vectorRep var mu sync.Mutex if err := c.discoverer.GetClient().OrderedRangeConcurrent(ctx, leftAgentAddrs, len(leftAgentAddrs), func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { + // To avoid GetObject to myself. 
To maintain backward compatibility for withoug cache operation + if addr == targetReplica.addr { + return nil + } + select { case <-ctx.Done(): return ctx.Err() From 5209a72b2011341314921dddc1b532efd9d4c83b Mon Sep 17 00:00:00 2001 From: ykadowak Date: Mon, 28 Aug 2023 06:46:04 +0000 Subject: [PATCH 023/101] enable observability --- pkg/index/job/correction/usecase/corrector.go | 40 ++++++++++++++----- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/pkg/index/job/correction/usecase/corrector.go b/pkg/index/job/correction/usecase/corrector.go index 9a921e2f98..d592dff82b 100644 --- a/pkg/index/job/correction/usecase/corrector.go +++ b/pkg/index/job/correction/usecase/corrector.go @@ -22,6 +22,7 @@ import ( "github.com/vdaas/vald/internal/errors" "github.com/vdaas/vald/internal/log" "github.com/vdaas/vald/internal/net/grpc" + "github.com/vdaas/vald/internal/observability" "github.com/vdaas/vald/internal/runner" "github.com/vdaas/vald/internal/safety" "github.com/vdaas/vald/pkg/index/job/correction/config" @@ -29,9 +30,10 @@ import ( ) type run struct { - eg errgroup.Group - cfg *config.Data - corrector service.Corrector + eg errgroup.Group + cfg *config.Data + observability observability.Observability + corrector service.Corrector } func New(cfg *config.Data) (r runner.Runner, err error) { @@ -87,18 +89,31 @@ func New(cfg *config.Data) (r runner.Runner, err error) { return nil, err } + var obs observability.Observability + if cfg.Observability.Enabled { + obs, err = observability.NewWithConfig(cfg.Observability) + if err != nil { + log.Error("failed to initialize observability") + return nil, err + } + } + return &run{ - eg: eg, - cfg: cfg, - corrector: corrector, + eg: eg, + cfg: cfg, + observability: obs, + corrector: corrector, }, nil } -func (c *run) PreStart(ctx context.Context) error { +func (r *run) PreStart(ctx context.Context) error { + if r.observability != nil { + return r.observability.PreStart(ctx) + } return nil } -func (c *run) Start(ctx 
context.Context) (<-chan error, error) { +func (r *run) Start(ctx context.Context) (<-chan error, error) { // TODO: timeoutはconfigから指定 // Setting timeout because job resource needs to be finished at some point // ここでcancelしても親は終了しないので、結局self SIGTERMしかなさそう @@ -106,10 +121,13 @@ func (c *run) Start(ctx context.Context) (<-chan error, error) { // ctx, cancel = context.WithTimeout(ctx, time.Second*20) // defer cancel() // ここでdeferすると関数はすぐ抜けちゃうので意味ない - log.Info("starting index correction...") + log.Info("starting index correction job") + if r.observability != nil { + _ = r.observability.Start(ctx) // FIXME: listen this returned err channel + } start := time.Now() - dech, err := c.corrector.Start(ctx) + dech, err := r.corrector.Start(ctx) if err != nil { log.Errorf("index correction process failed: %v", err) return nil, err @@ -120,7 +138,7 @@ func (c *run) Start(ctx context.Context) (<-chan error, error) { // FIXME: 以下をやめてシンプルにStartを抜けたらself SIGTERMで終了させる方がいいかも // その場合echは無視することになる ech := make(chan error, 100) - c.eg.Go(safety.RecoverFunc(func() error { + r.eg.Go(safety.RecoverFunc(func() error { for { select { case <-ctx.Done(): From a0cb7aa0aa11177c594587012d8a81faf8d28ac1 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 29 Aug 2023 02:20:16 +0000 Subject: [PATCH 024/101] refactor --- pkg/index/job/correction/service/corrector.go | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index bec861fff2..47cce8f4a3 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -80,7 +80,7 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { return nil, err } - // DEBUG: + // For debugging c.indexInfos.Range(func(addr string, info *payload.Info_Index_Count) bool { log.Debugf("index info: addr(%s), stored(%d), uncommitted(%d)", addr, info.GetStored(), info.GetUncommitted()) return 
true @@ -101,10 +101,6 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { return nil, err } } - // if err := c.correct(ctx, addrs); err != nil { - // log.Errorf("there's some errors while correction: %v", err) - // return nil, err - // } log.Info("correction finished successfully") // ech := make(chan error, 100) @@ -604,14 +600,6 @@ func (c *correct) deleteObject(ctx context.Context, addr string, vector *payload } func (c *correct) loadInfos(ctx context.Context) (err error) { - // FIXME: o11yは最後に整える - // ctx, span := trace.StartSpan(grpc.WithGRPCMethod(ctx, "core.v1.Agent/IndexInfo"), "vald/manager-index/service/Indexer.loadInfos") - // defer func() { - // if span != nil { - // span.End() - // } - // }() - var u, ucu uint32 var infoMap valdsync.Map[string, *payload.Info_Index_Count] err = c.discoverer.GetClient().RangeConcurrent(ctx, len(c.discoverer.GetAddrs(ctx)), From b453e613445414eff9cce334b83a318e27719147 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 29 Aug 2023 02:50:14 +0000 Subject: [PATCH 025/101] SIGTERM after complete --- pkg/index/job/correction/service/corrector.go | 2 +- pkg/index/job/correction/usecase/corrector.go | 48 ++++++++++--------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 47cce8f4a3..df7b2ce8d9 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -86,7 +86,7 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { return true }) - // This blocks. Should we run with errorgroup? + // FIXME: This blocks. Should we run with errorgroup? 
log.Info("starting correction...") if c.cfg.Corrector.UseCache { log.Info("with cache...") diff --git a/pkg/index/job/correction/usecase/corrector.go b/pkg/index/job/correction/usecase/corrector.go index d592dff82b..5bb53bf738 100644 --- a/pkg/index/job/correction/usecase/corrector.go +++ b/pkg/index/job/correction/usecase/corrector.go @@ -15,6 +15,8 @@ package usecase import ( "context" + "os" + "syscall" "time" "github.com/vdaas/vald/internal/client/v1/client/discoverer" @@ -24,7 +26,6 @@ import ( "github.com/vdaas/vald/internal/net/grpc" "github.com/vdaas/vald/internal/observability" "github.com/vdaas/vald/internal/runner" - "github.com/vdaas/vald/internal/safety" "github.com/vdaas/vald/pkg/index/job/correction/config" "github.com/vdaas/vald/pkg/index/job/correction/service" ) @@ -114,20 +115,32 @@ func (r *run) PreStart(ctx context.Context) error { } func (r *run) Start(ctx context.Context) (<-chan error, error) { - // TODO: timeoutはconfigから指定 - // Setting timeout because job resource needs to be finished at some point - // ここでcancelしても親は終了しないので、結局self SIGTERMしかなさそう - // timeout設定はして、finalizeを呼ぶのが良いか - // ctx, cancel = context.WithTimeout(ctx, time.Second*20) - // defer cancel() // ここでdeferすると関数はすぐ抜けちゃうので意味ない + // TODO: Set timeout? 
+ // ctx, cancel := context.WithTimeout(ctx, time.Microsecond*10) + // defer cancel() + + defer func() { + log.Info("fiding my pid to kill myself") + p, err := os.FindProcess(os.Getpid()) + if err != nil { + // using Fatal to avoid this process to be zombie + log.Fatalf("failed to find my pid to kill %v", err) + return + } + + log.Info("sending SIGTERM to myself to stop this job") + if err := p.Signal(syscall.SIGTERM); err != nil { + log.Error(err) + } + }() log.Info("starting index correction job") if r.observability != nil { - _ = r.observability.Start(ctx) // FIXME: listen this returned err channel + _ = r.observability.Start(ctx) // TODO: listen this returned err channel } start := time.Now() - dech, err := r.corrector.Start(ctx) + _, err := r.corrector.Start(ctx) if err != nil { log.Errorf("index correction process failed: %v", err) return nil, err @@ -135,20 +148,9 @@ func (r *run) Start(ctx context.Context) (<-chan error, error) { end := time.Since(start) log.Infof("correction finished in %v", end) - // FIXME: 以下をやめてシンプルにStartを抜けたらself SIGTERMで終了させる方がいいかも - // その場合echは無視することになる - ech := make(chan error, 100) - r.eg.Go(safety.RecoverFunc(func() error { - for { - select { - case <-ctx.Done(): - log.Debug("======= ctx.Done at corrector start") - return ctx.Err() - case err = <-dech: - ech <- err - } - } - })) + // this ech is just a placeholder to return. this is not a daemon but a job. + // so after returning, this process will be SIGTERMed by myself immediately. 
+ ech := make(chan error) return ech, nil } From db6868d91250390d0cf1ebc59b5d085fbfb781b3 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Mon, 4 Sep 2023 02:57:04 +0000 Subject: [PATCH 026/101] add metrics server --- pkg/index/job/correction/usecase/corrector.go | 103 +++++++++++++----- 1 file changed, 75 insertions(+), 28 deletions(-) diff --git a/pkg/index/job/correction/usecase/corrector.go b/pkg/index/job/correction/usecase/corrector.go index 5bb53bf738..78c37da9ed 100644 --- a/pkg/index/job/correction/usecase/corrector.go +++ b/pkg/index/job/correction/usecase/corrector.go @@ -20,12 +20,17 @@ import ( "time" "github.com/vdaas/vald/internal/client/v1/client/discoverer" - "github.com/vdaas/vald/internal/errgroup" + iconf "github.com/vdaas/vald/internal/config" "github.com/vdaas/vald/internal/errors" "github.com/vdaas/vald/internal/log" "github.com/vdaas/vald/internal/net/grpc" + "github.com/vdaas/vald/internal/net/grpc/interceptor/server/recover" "github.com/vdaas/vald/internal/observability" "github.com/vdaas/vald/internal/runner" + "github.com/vdaas/vald/internal/safety" + "github.com/vdaas/vald/internal/servers/server" + "github.com/vdaas/vald/internal/servers/starter" + "github.com/vdaas/vald/internal/sync/errgroup" "github.com/vdaas/vald/pkg/index/job/correction/config" "github.com/vdaas/vald/pkg/index/job/correction/service" ) @@ -34,6 +39,7 @@ type run struct { eg errgroup.Group cfg *config.Data observability observability.Observability + server starter.Server corrector service.Corrector } @@ -85,6 +91,23 @@ func New(cfg *config.Data) (r runner.Runner, err error) { return nil, err } + grpcServerOptions := []server.Option{ + server.WithGRPCOption( + grpc.ChainUnaryInterceptor(recover.RecoverInterceptor()), + grpc.ChainStreamInterceptor(recover.RecoverStreamInterceptor()), + ), + } + + // For health check and metrics + srv, err := starter.New(starter.WithConfig(cfg.Server), + starter.WithGRPC(func(sc *iconf.Server) []server.Option { + return grpcServerOptions 
+ }), + ) + if err != nil { + return nil, err + } + corrector, err := service.New(cfg, discoverer) if err != nil { return nil, err @@ -103,6 +126,7 @@ func New(cfg *config.Data) (r runner.Runner, err error) { eg: eg, cfg: cfg, observability: obs, + server: srv, corrector: corrector, }, nil } @@ -119,38 +143,61 @@ func (r *run) Start(ctx context.Context) (<-chan error, error) { // ctx, cancel := context.WithTimeout(ctx, time.Microsecond*10) // defer cancel() - defer func() { - log.Info("fiding my pid to kill myself") - p, err := os.FindProcess(os.Getpid()) - if err != nil { - // using Fatal to avoid this process to be zombie - log.Fatalf("failed to find my pid to kill %v", err) - return + log.Info("starting servers") + ech := make(chan error, 3) + var oech, nech, sech <-chan error + r.eg.Go(safety.RecoverFunc(func() (err error) { + defer close(ech) + if r.observability != nil { + oech = r.observability.Start(ctx) } - - log.Info("sending SIGTERM to myself to stop this job") - if err := p.Signal(syscall.SIGTERM); err != nil { - log.Error(err) + sech = r.server.ListenAndServe(ctx) + for { + select { + case <-ctx.Done(): + return ctx.Err() + case err = <-oech: + case err = <-nech: + case err = <-sech: + } + if err != nil { + select { + case <-ctx.Done(): + return ctx.Err() + case ech <- err: + } + } } - }() + })) + + // main groutine to run the job + r.eg.Go(safety.RecoverFunc(func() (err error) { + defer func() { + log.Info("fiding my pid to kill myself") + p, err := os.FindProcess(os.Getpid()) + if err != nil { + // using Fatal to avoid this process to be zombie + log.Fatalf("failed to find my pid to kill %v", err) + return + } - log.Info("starting index correction job") - if r.observability != nil { - _ = r.observability.Start(ctx) // TODO: listen this returned err channel - } + log.Info("sending SIGTERM to myself to stop this job") + if err := p.Signal(syscall.SIGTERM); err != nil { + log.Error(err) + } + }() - start := time.Now() - _, err := r.corrector.Start(ctx) 
- if err != nil { - log.Errorf("index correction process failed: %v", err) - return nil, err - } - end := time.Since(start) - log.Infof("correction finished in %v", end) + start := time.Now() + _, err = r.corrector.Start(ctx) + if err != nil { + log.Errorf("index correction process failed: %v", err) + return err + } + end := time.Since(start) + log.Infof("correction finished in %v", end) + return nil + })) - // this ech is just a placeholder to return. this is not a daemon but a job. - // so after returning, this process will be SIGTERMed by myself immediately. - ech := make(chan error) return ech, nil } From 7a630b3581e3aae5000d19e5df9a71344edf4608 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 5 Sep 2023 09:30:31 +0000 Subject: [PATCH 027/101] add pcache --- internal/cache/persistent/pcache.go | 217 +++++++++++++++++++++++ internal/cache/persistent/pcache_test.go | 101 +++++++++++ 2 files changed, 318 insertions(+) create mode 100644 internal/cache/persistent/pcache.go create mode 100644 internal/cache/persistent/pcache_test.go diff --git a/internal/cache/persistent/pcache.go b/internal/cache/persistent/pcache.go new file mode 100644 index 0000000000..18b6a74ffb --- /dev/null +++ b/internal/cache/persistent/pcache.go @@ -0,0 +1,217 @@ +package persistent + +import ( + "encoding/gob" + "io/fs" + "os" + "sync" + + "github.com/vdaas/vald/internal/file" + "github.com/zeebo/xxh3" +) + +type PCache interface { + Get(string) (struct{}, bool, error) + Set(string, struct{}) error + Delete(string) error + Close() error +} + +type Shard interface { + Get(string) (struct{}, bool, error) + Set(string, struct{}) error + Delete(string) error + Close() error +} + +var _ PCache = (*pcache)(nil) +var _ Shard = (*shard)(nil) + +type pcache struct { + shards [slen]Shard +} + +type shard struct { + path string + dl int + m map[string]struct{} + mu sync.Mutex + perm fs.FileMode +} + +const ( + // slen is shards length. + slen = 512 + // slen = 4096 + // mask is slen-1 Hex value. 
+ mask = 0x1FF + // mask = 0xFFF. +) + +func NewPCache(basePath string) (PCache, error) { + var shards [slen]Shard + for i := range shards { + s, err := newShard(basePath) + if err != nil { + return nil, err + } + shards[i] = s + } + return &pcache{ + shards: shards, + }, nil +} + +// New returns the pcache that satisfies the PCache interface. +func (p *pcache) Get(key string) (struct{}, bool, error) { + data, ok, err := p.shards[getShardID(key)].Get(key) + if err != nil { + return data, false, err + } + if !ok { + return data, false, nil + } + + return data, true, nil +} + +func (p *pcache) Set(key string, data struct{}) error { + return p.shards[getShardID(key)].Set(key, data) +} + +func (p *pcache) Delete(key string) error { + return p.shards[getShardID(key)].Delete(key) +} + +func (p *pcache) Close() error { + for _, s := range p.shards { + err := s.Close() + if err != nil { + return err + } + } + return nil +} + +func newShard(basePath string) (*shard, error) { + f, err := os.CreateTemp(basePath, "pcache-*") + if err != nil { + return nil, err + } + defer f.Close() + + return &shard{ + m: make(map[string]struct{}), + perm: 0600, + path: f.Name(), + }, nil +} + +func (s *shard) Get(key string) (data struct{}, ok bool, err error) { + s.mu.Lock() + defer s.mu.Unlock() + + f, err := file.Open(s.path, os.O_RDWR, s.perm) + if err != nil { + return + } + defer f.Close() + + err = gob.NewDecoder(f).Decode(&s.m) + if err != nil { + return + } + + data, ok = s.m[key] + + s.m = nil // TODO: clear + + return data, ok, nil +} + +func (s *shard) Set(key string, data struct{}) (err error) { + s.mu.Lock() + defer s.mu.Unlock() + + f, err := file.Open(s.path, os.O_RDWR, s.perm) + if err != nil { + return err + } + defer f.Close() + + s.m[key] = data + + err = gob.NewEncoder(f).Encode(s.m) + if err != nil { + return err + } + + fi, err := f.Stat() + if err != nil { + return err + } + s.dl = int(fi.Size()) + + return f.Sync() +} + +func (s *shard) Delete(key string) (err error) 
{ + s.mu.Lock() + defer s.mu.Unlock() + + f, err := file.Open(s.path, os.O_RDWR, s.perm) + if err != nil { + return err + } + defer f.Close() + + err = gob.NewDecoder(f).Decode(&s.m) + if err != nil { + return + } + + delete(s.m, key) + + // Write the updated data to the file + err = f.Truncate(0) + if err != nil { + return err + } + _, err = f.Seek(0, 0) + if err != nil { + return err + } + + err = gob.NewEncoder(f).Encode(s.m) + if err != nil { + return err + } + s.m = nil // TODO: use clear after 1.21 + + fi, err := f.Stat() + if err != nil { + return err + } + s.dl = int(fi.Size()) + + return f.Sync() +} + +func (s *shard) Close() error { + s.mu.Lock() + defer s.mu.Unlock() + + s.m = nil + + if err := os.Remove(s.path); err != nil { + return err + } + return nil +} + +func getShardID(key string) (id uint64) { + if len(key) > 128 { + return xxh3.HashString(key[:128]) & mask + } + return xxh3.HashString(key) & mask +} diff --git a/internal/cache/persistent/pcache_test.go b/internal/cache/persistent/pcache_test.go new file mode 100644 index 0000000000..e95fa7b759 --- /dev/null +++ b/internal/cache/persistent/pcache_test.go @@ -0,0 +1,101 @@ +package persistent_test + +import ( + "fmt" + "testing" + + "github.com/stretchr/testify/require" + "github.com/vdaas/vald/internal/cache/persistent" + "github.com/vdaas/vald/internal/sync" +) + +func TestPersistentCache(t *testing.T) { + base := t.TempDir() + pc, err := persistent.NewPCache(base) + require.NoError(t, err) + + len := 4096 + + for i := 0; i < len; i++ { + err := pc.Set(fmt.Sprint(i), struct{}{}) + require.NoError(t, err) + } + + for i := 0; i < len; i++ { + _, ok, err := pc.Get(fmt.Sprint(i)) + require.NoError(t, err) + require.True(t, ok, fmt.Sprintf("i: %d", i)) + } + + for i := 0; i < len; i++ { + err := pc.Delete(fmt.Sprint(i)) + require.NoError(t, err) + } + + for i := 0; i < len; i++ { + _, ok, err := pc.Get(fmt.Sprint(i)) + require.NoError(t, err) + require.False(t, ok, fmt.Sprintf("i: %d", i)) + } + + 
err = pc.Close() + require.NoError(t, err) +} + +func TestPersistentCacheConcurrent(t *testing.T) { + base := t.TempDir() + pc, err := persistent.NewPCache(base) + require.NoError(t, err) + + len := 4096 + + var wg sync.WaitGroup + for i := 0; i < len; i++ { + wg.Add(1) + go func(key int) { + defer wg.Done() + err := pc.Set(fmt.Sprint(key), struct{}{}) + require.NoError(t, err) + }(i) + } + + wg.Wait() + + for i := 0; i < len; i++ { + wg.Add(1) + go func(key int) { + defer wg.Done() + _, ok, err := pc.Get(fmt.Sprint(key)) + require.NoError(t, err) + require.True(t, ok, fmt.Sprintf("i: %d", key)) + }(i) + } + + wg.Wait() + + for i := 0; i < len; i++ { + wg.Add(1) + go func(key int) { + defer wg.Done() + err := pc.Delete(fmt.Sprint(key)) + require.NoError(t, err) + }(i) + } + + wg.Wait() + + for i := 0; i < len; i++ { + wg.Add(1) + go func(key int) { + defer wg.Done() + _, ok, err := pc.Get(fmt.Sprint(key)) + require.NoError(t, err) + require.False(t, ok, fmt.Sprintf("i: %d", key)) + }(i) + } + + wg.Wait() + + err = pc.Close() + require.NoError(t, err) +} From ca84f5630c97efa061a6bb7741b91830ea43325a Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 5 Sep 2023 09:38:38 +0000 Subject: [PATCH 028/101] remove comment --- pkg/index/job/correction/service/corrector.go | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index df7b2ce8d9..ba57e58135 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -103,22 +103,6 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { } log.Info("correction finished successfully") - // ech := make(chan error, 100) - // c.eg.Go(safety.RecoverFunc(func() (err error) { - // defer close(ech) - // for { - // select { - // case <-ctx.Done(): - // err = ctx.Err() - // if err != nil && err != context.Canceled { - // return err - // } - // return nil - // case 
err = <-dech: - // ech <- err - // } - // } - // })) return dech, nil } From feda42861b42023da2fe034e3bbe0d53fa8359a5 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 5 Sep 2023 09:56:44 +0000 Subject: [PATCH 029/101] [TEMP] use pcache --- pkg/index/job/correction/service/corrector.go | 35 ++++++++++++++----- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index ba57e58135..5009ef576b 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -17,12 +17,14 @@ import ( "context" "fmt" "io" + "os" "sync" "sync/atomic" agent "github.com/vdaas/vald/apis/grpc/v1/agent/core" "github.com/vdaas/vald/apis/grpc/v1/payload" "github.com/vdaas/vald/apis/grpc/v1/vald" + "github.com/vdaas/vald/internal/cache/persistent" "github.com/vdaas/vald/internal/client/v1/client/discoverer" "github.com/vdaas/vald/internal/errors" "github.com/vdaas/vald/internal/log" @@ -47,14 +49,21 @@ type correct struct { uuidsCount uint32 uncommittedUUIDsCount uint32 checkedId map[string]struct{} // TODO: use mmap if necessary + checkedIdPersistent persistent.PCache rwmu sync.RWMutex } func New(cfg *config.Data, discoverer discoverer.Client) (Corrector, error) { + p, err := persistent.NewPCache(os.TempDir()) + if err != nil { + return nil, err + } + return &correct{ - cfg: cfg, - discoverer: discoverer, - checkedId: make(map[string]struct{}), + cfg: cfg, + discoverer: discoverer, + checkedId: make(map[string]struct{}), + checkedIdPersistent: p, }, nil } @@ -285,9 +294,13 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { // check if the index is already checked id := res.GetVector().GetId() - c.rwmu.RLock() - _, ok := c.checkedId[id] - c.rwmu.RUnlock() + // c.rwmu.RLock() + // _, ok := c.checkedId[id] + // c.rwmu.RUnlock() + _, ok, err := c.checkedIdPersistent.Get(id) + if err != nil { + return err + } if ok { // already 
checked index return nil @@ -308,9 +321,13 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { return nil // continue other processes } - c.rwmu.Lock() - c.checkedId[id] = struct{}{} - c.rwmu.Unlock() + // c.rwmu.Lock() + // c.checkedId[id] = struct{}{} + // c.rwmu.Unlock() + err = c.checkedIdPersistent.Set(id, struct{}{}) + if err != nil { + return err + } return nil }) From c229a87ef20ac95281f67d1f2188365c4f3ea7d3 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 6 Sep 2023 01:36:49 +0000 Subject: [PATCH 030/101] [TMP] use pcache --- pkg/index/job/correction/service/corrector.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 5009ef576b..71cdf6017f 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -297,7 +297,9 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { // c.rwmu.RLock() // _, ok := c.checkedId[id] // c.rwmu.RUnlock() + c.rwmu.RLock() _, ok, err := c.checkedIdPersistent.Get(id) + c.rwmu.RUnlock() if err != nil { return err } @@ -324,7 +326,9 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { // c.rwmu.Lock() // c.checkedId[id] = struct{}{} // c.rwmu.Unlock() + c.rwmu.Lock() err = c.checkedIdPersistent.Set(id, struct{}{}) + c.rwmu.Unlock() if err != nil { return err } From fa0b68bdfba0abee71a5a682c127ee36e5240d66 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 6 Sep 2023 01:40:12 +0000 Subject: [PATCH 031/101] fix empty shard returns error --- internal/cache/persistent/pcache.go | 8 +++++++- internal/cache/persistent/pcache_test.go | 6 ++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/internal/cache/persistent/pcache.go b/internal/cache/persistent/pcache.go index 18b6a74ffb..17f3b5679f 100644 --- a/internal/cache/persistent/pcache.go +++ b/internal/cache/persistent/pcache.go @@ -2,10 +2,12 @@ package 
persistent import ( "encoding/gob" + "io" "io/fs" "os" "sync" + "github.com/vdaas/vald/internal/errors" "github.com/vdaas/vald/internal/file" "github.com/zeebo/xxh3" ) @@ -119,7 +121,11 @@ func (s *shard) Get(key string) (data struct{}, ok bool, err error) { err = gob.NewDecoder(f).Decode(&s.m) if err != nil { - return + // empty shard file returns EOF + if errors.Is(err, io.EOF) { + return data, false, nil + } + return data, false, err } data, ok = s.m[key] diff --git a/internal/cache/persistent/pcache_test.go b/internal/cache/persistent/pcache_test.go index e95fa7b759..e7318d5e1b 100644 --- a/internal/cache/persistent/pcache_test.go +++ b/internal/cache/persistent/pcache_test.go @@ -16,6 +16,12 @@ func TestPersistentCache(t *testing.T) { len := 4096 + for i := 0; i < len; i++ { + _, ok, err := pc.Get(fmt.Sprint(i)) + require.NoError(t, err) + require.False(t, ok, fmt.Sprintf("i: %d", i)) + } + for i := 0; i < len; i++ { err := pc.Set(fmt.Sprint(i), struct{}{}) require.NoError(t, err) From 06b312b621a243bff26585f90d9c3d0487fd1d96 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 6 Sep 2023 02:33:16 +0000 Subject: [PATCH 032/101] fix to use local map --- internal/cache/persistent/pcache.go | 40 ++++++++++++++++-------- internal/cache/persistent/pcache_test.go | 5 +++ 2 files changed, 32 insertions(+), 13 deletions(-) diff --git a/internal/cache/persistent/pcache.go b/internal/cache/persistent/pcache.go index 17f3b5679f..10325143dc 100644 --- a/internal/cache/persistent/pcache.go +++ b/internal/cache/persistent/pcache.go @@ -36,7 +36,6 @@ type pcache struct { type shard struct { path string dl int - m map[string]struct{} mu sync.Mutex perm fs.FileMode } @@ -103,7 +102,6 @@ func newShard(basePath string) (*shard, error) { defer f.Close() return &shard{ - m: make(map[string]struct{}), perm: 0600, path: f.Name(), }, nil @@ -119,7 +117,8 @@ func (s *shard) Get(key string) (data struct{}, ok bool, err error) { } defer f.Close() - err = gob.NewDecoder(f).Decode(&s.m) + m 
:= make(map[string]struct{}, 1000) + err = gob.NewDecoder(f).Decode(&m) if err != nil { // empty shard file returns EOF if errors.Is(err, io.EOF) { @@ -128,9 +127,9 @@ func (s *shard) Get(key string) (data struct{}, ok bool, err error) { return data, false, err } - data, ok = s.m[key] + data, ok = m[key] - s.m = nil // TODO: clear + m = nil // TODO: clear return data, ok, nil } @@ -145,9 +144,25 @@ func (s *shard) Set(key string, data struct{}) (err error) { } defer f.Close() - s.m[key] = data + m := make(map[string]struct{}, s.dl) + if s.dl != 0 { + err = gob.NewDecoder(f).Decode(&m) + if err != nil { + return err + } + } - err = gob.NewEncoder(f).Encode(s.m) + m[key] = data + + err = f.Truncate(0) + if err != nil { + return err + } + _, err = f.Seek(0, 0) + if err != nil { + return err + } + err = gob.NewEncoder(f).Encode(m) if err != nil { return err } @@ -171,12 +186,13 @@ func (s *shard) Delete(key string) (err error) { } defer f.Close() - err = gob.NewDecoder(f).Decode(&s.m) + m := make(map[string]struct{}, s.dl) + err = gob.NewDecoder(f).Decode(&m) if err != nil { return } - delete(s.m, key) + delete(m, key) // Write the updated data to the file err = f.Truncate(0) @@ -188,11 +204,11 @@ func (s *shard) Delete(key string) (err error) { return err } - err = gob.NewEncoder(f).Encode(s.m) + err = gob.NewEncoder(f).Encode(m) if err != nil { return err } - s.m = nil // TODO: use clear after 1.21 + m = nil // TODO: use clear after 1.21 fi, err := f.Stat() if err != nil { @@ -207,8 +223,6 @@ func (s *shard) Close() error { s.mu.Lock() defer s.mu.Unlock() - s.m = nil - if err := os.Remove(s.path); err != nil { return err } diff --git a/internal/cache/persistent/pcache_test.go b/internal/cache/persistent/pcache_test.go index e7318d5e1b..34a850f123 100644 --- a/internal/cache/persistent/pcache_test.go +++ b/internal/cache/persistent/pcache_test.go @@ -33,6 +33,11 @@ func TestPersistentCache(t *testing.T) { require.True(t, ok, fmt.Sprintf("i: %d", i)) } + for i := 0; i 
< len; i++ { + err := pc.Set(fmt.Sprint(i), struct{}{}) + require.NoError(t, err) + } + for i := 0; i < len; i++ { err := pc.Delete(fmt.Sprint(i)) require.NoError(t, err) From 490345fd79de25718045737f3f415e43f681bea4 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 6 Sep 2023 04:09:02 +0000 Subject: [PATCH 033/101] [TMP] add prestop for pcache --- pkg/index/job/correction/service/corrector.go | 15 ++++++++++++--- pkg/index/job/correction/usecase/corrector.go | 3 ++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 71cdf6017f..d1352e6361 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -18,6 +18,7 @@ import ( "fmt" "io" "os" + "path/filepath" "sync" "sync/atomic" @@ -27,6 +28,7 @@ import ( "github.com/vdaas/vald/internal/cache/persistent" "github.com/vdaas/vald/internal/client/v1/client/discoverer" "github.com/vdaas/vald/internal/errors" + "github.com/vdaas/vald/internal/file" "github.com/vdaas/vald/internal/log" "github.com/vdaas/vald/internal/net/grpc" "github.com/vdaas/vald/internal/net/grpc/codes" @@ -39,6 +41,7 @@ import ( type Corrector interface { Start(ctx context.Context) (<-chan error, error) + PreStop(ctx context.Context) error } type correct struct { @@ -54,7 +57,9 @@ type correct struct { } func New(cfg *config.Data, discoverer discoverer.Client) (Corrector, error) { - p, err := persistent.NewPCache(os.TempDir()) + p := filepath.Join(os.TempDir(), "pcache") + file.MkdirAll(p, os.ModePerm) + pc, err := persistent.NewPCache(p) if err != nil { return nil, err } @@ -63,7 +68,7 @@ func New(cfg *config.Data, discoverer discoverer.Client) (Corrector, error) { cfg: cfg, discoverer: discoverer, checkedId: make(map[string]struct{}), - checkedIdPersistent: p, + checkedIdPersistent: pc, }, nil } @@ -95,7 +100,6 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { 
return true }) - // FIXME: This blocks. Should we run with errorgroup? log.Info("starting correction...") if c.cfg.Corrector.UseCache { log.Info("with cache...") @@ -115,6 +119,11 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { return dech, nil } +func (c *correct) PreStop(_ context.Context) error { + log.Info("removing persistent cache files...") + return c.checkedIdPersistent.Close() +} + func (c *correct) correct(ctx context.Context) (err error) { if err := c.discoverer.GetClient().OrderedRange(ctx, c.agentAddrs, func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { diff --git a/pkg/index/job/correction/usecase/corrector.go b/pkg/index/job/correction/usecase/corrector.go index 78c37da9ed..8633584f72 100644 --- a/pkg/index/job/correction/usecase/corrector.go +++ b/pkg/index/job/correction/usecase/corrector.go @@ -201,7 +201,8 @@ func (r *run) Start(ctx context.Context) (<-chan error, error) { return ech, nil } -func (*run) PreStop(context.Context) error { +func (r *run) PreStop(ctx context.Context) error { + r.corrector.PreStop(ctx) return nil } From 05838c3d6cf1aca68d5719a91d67270536ccd1bc Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 6 Sep 2023 04:12:07 +0000 Subject: [PATCH 034/101] [TEMP] add pcache config --- internal/config/corrector.go | 1 + pkg/index/job/correction/service/corrector.go | 39 +++++++++++-------- 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/internal/config/corrector.go b/internal/config/corrector.go index dd0d27fd43..55570834b7 100644 --- a/internal/config/corrector.go +++ b/internal/config/corrector.go @@ -45,6 +45,7 @@ type Corrector struct { // FIXME: Debug UseCache bool `json:"use_cache" yaml:"use_cache"` + PCache bool `json:"p_cache" yaml:"p_cache"` } // Bind binds the actual data from the Indexer receiver field. 
diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index d1352e6361..2178e325f8 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -303,15 +303,20 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { // check if the index is already checked id := res.GetVector().GetId() - // c.rwmu.RLock() - // _, ok := c.checkedId[id] - // c.rwmu.RUnlock() - c.rwmu.RLock() - _, ok, err := c.checkedIdPersistent.Get(id) - c.rwmu.RUnlock() - if err != nil { - return err + + // DEBUG: configで切り替え + ok := false + if c.cfg.Corrector.PCache { + _, ok, err = c.checkedIdPersistent.Get(id) + if err != nil { + return err + } + } else { + c.rwmu.RLock() + _, ok = c.checkedId[id] + c.rwmu.RUnlock() } + if ok { // already checked index return nil @@ -332,14 +337,16 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { return nil // continue other processes } - // c.rwmu.Lock() - // c.checkedId[id] = struct{}{} - // c.rwmu.Unlock() - c.rwmu.Lock() - err = c.checkedIdPersistent.Set(id, struct{}{}) - c.rwmu.Unlock() - if err != nil { - return err + // DEBUG: Testing pcache + if c.cfg.Corrector.PCache { + err = c.checkedIdPersistent.Set(id, struct{}{}) + if err != nil { + return err + } + } else { + c.rwmu.Lock() + c.checkedId[id] = struct{}{} + c.rwmu.Unlock() } return nil From 9805e7ef85b7ceeefd41e80f12a8457dea7dec03 Mon Sep 17 00:00:00 2001 From: "deepsource-autofix[bot]" <62050782+deepsource-autofix[bot]@users.noreply.github.com> Date: Wed, 6 Sep 2023 04:12:37 +0000 Subject: [PATCH 035/101] style: Format code with prettier and gofumpt --- internal/cache/persistent/pcache.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/internal/cache/persistent/pcache.go b/internal/cache/persistent/pcache.go index 10325143dc..04b03d055c 100644 --- a/internal/cache/persistent/pcache.go +++ b/internal/cache/persistent/pcache.go @@ 
-26,8 +26,10 @@ type Shard interface { Close() error } -var _ PCache = (*pcache)(nil) -var _ Shard = (*shard)(nil) +var ( + _ PCache = (*pcache)(nil) + _ Shard = (*shard)(nil) +) type pcache struct { shards [slen]Shard From a0d698e6f07725f39e8f5bfb41adc4bb893c9003 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 6 Sep 2023 04:18:07 +0000 Subject: [PATCH 036/101] [TEMP] add pcache log --- pkg/index/job/correction/service/corrector.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 2178e325f8..74da34d433 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -102,7 +102,11 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { log.Info("starting correction...") if c.cfg.Corrector.UseCache { - log.Info("with cache...") + if c.cfg.Corrector.PCache { + log.Info("with persistent cache...") + } else { + log.Info("with in-memory cache...") + } if err := c.correctWithCache(ctx); err != nil { log.Errorf("there's some errors while correction: %v", err) return nil, err From 9e5f2aef397ffdcd25a513631e08831ebd2e92bd Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 6 Sep 2023 04:52:31 +0000 Subject: [PATCH 037/101] fix map alloc size --- internal/cache/persistent/pcache.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/internal/cache/persistent/pcache.go b/internal/cache/persistent/pcache.go index 04b03d055c..d9bda78b50 100644 --- a/internal/cache/persistent/pcache.go +++ b/internal/cache/persistent/pcache.go @@ -38,6 +38,7 @@ type pcache struct { type shard struct { path string dl int + l int mu sync.Mutex perm fs.FileMode } @@ -119,7 +120,7 @@ func (s *shard) Get(key string) (data struct{}, ok bool, err error) { } defer f.Close() - m := make(map[string]struct{}, 1000) + m := make(map[string]struct{}, s.l) err = gob.NewDecoder(f).Decode(&m) if err != 
nil { // empty shard file returns EOF @@ -146,7 +147,7 @@ func (s *shard) Set(key string, data struct{}) (err error) { } defer f.Close() - m := make(map[string]struct{}, s.dl) + m := make(map[string]struct{}, s.l) if s.dl != 0 { err = gob.NewDecoder(f).Decode(&m) if err != nil { @@ -174,6 +175,7 @@ func (s *shard) Set(key string, data struct{}) (err error) { return err } s.dl = int(fi.Size()) + s.l++ return f.Sync() } @@ -188,7 +190,7 @@ func (s *shard) Delete(key string) (err error) { } defer f.Close() - m := make(map[string]struct{}, s.dl) + m := make(map[string]struct{}, s.l) err = gob.NewDecoder(f).Decode(&m) if err != nil { return @@ -217,6 +219,7 @@ func (s *shard) Delete(key string) (err error) { return err } s.dl = int(fi.Size()) + s.l-- return f.Sync() } From 37282762c52e33a10f9d859a5428d3f517d4bbaf Mon Sep 17 00:00:00 2001 From: ykadowak Date: Fri, 8 Sep 2023 04:28:47 +0000 Subject: [PATCH 038/101] [TMP] Add bbolt cache --- internal/cache/bbolt/bbolt.go | 82 +++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) create mode 100644 internal/cache/bbolt/bbolt.go diff --git a/internal/cache/bbolt/bbolt.go b/internal/cache/bbolt/bbolt.go new file mode 100644 index 0000000000..941a35175a --- /dev/null +++ b/internal/cache/bbolt/bbolt.go @@ -0,0 +1,82 @@ +package bbolt + +import ( + bolt "go.etcd.io/bbolt" +) + +type Bbolt struct { + db *bolt.DB + bucket string +} + +func New(path string) *Bbolt { + // TODO: 初期化をここでするか、DIするか。ライフタイムを管理するのだるいのでDIの方がいいかも + return &Bbolt{} +} + +func (b *Bbolt) Set(key string, val []byte) error { + if err := b.db.Update(func(tx *bolt.Tx) error { + b := tx.Bucket(val) + err := b.Put([]byte(key), nil) + return err + }); err != nil { + return err + } + + return nil +} + +func (b *Bbolt) Get(key string) ([]byte, bool, error) { + var val []byte + if err := b.db.View(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte(b.bucket)) + copy(val, b.Get([]byte(key))) + return nil + }); err != nil { + return nil, false, err + } + + 
if val == nil { + return nil, false, nil + } + + return val, true, nil +} + +// func main() { +// f, err := os.Create("my.db") +// f.Close() +// defer os.Remove(f.Name()) + +// // Open the my.db data file in your current directory. +// // It will be created if it doesn't exist. +// db, err := bolt.Open("my.db", 0600, nil) +// if err != nil { +// log.Fatal(err) +// } +// defer db.Close() + +// db.Update(func(tx *bolt.Tx) error { +// _, err := tx.CreateBucket([]byte("MyBucket")) +// if err != nil { +// return fmt.Errorf("create bucket: %s", err) +// } +// return nil +// }) + +// db.Update(func(tx *bolt.Tx) error { +// b := tx.Bucket([]byte("MyBucket")) +// err := b.Put([]byte("answer"), nil) +// return err +// }) + +// db.View(func(tx *bolt.Tx) error { +// b := tx.Bucket([]byte("MyBucket")) +// v := b.Get([]byte("answer")) +// if v == nil { +// fmt.Println("No answer found. We can differentiate this from the empty value") +// } +// fmt.Println("The key exists") +// return nil +// }) +// } From 1678ff74ccf37c0dd4eb899f9fc47d45e977204e Mon Sep 17 00:00:00 2001 From: ykadowak Date: Fri, 8 Sep 2023 04:49:37 +0000 Subject: [PATCH 039/101] update bbolt --- internal/cache/bbolt/bbolt.go | 73 ++++++++++++++++------------------- 1 file changed, 34 insertions(+), 39 deletions(-) diff --git a/internal/cache/bbolt/bbolt.go b/internal/cache/bbolt/bbolt.go index 941a35175a..656417da0a 100644 --- a/internal/cache/bbolt/bbolt.go +++ b/internal/cache/bbolt/bbolt.go @@ -1,17 +1,37 @@ package bbolt import ( + "fmt" + "os" + + "github.com/vdaas/vald/internal/errors" bolt "go.etcd.io/bbolt" ) type Bbolt struct { - db *bolt.DB - bucket string + db *bolt.DB + file string } -func New(path string) *Bbolt { +const bucket = "vald-bbolt-bucket" + +func New(filepath string) (*Bbolt, error) { // TODO: 初期化をここでするか、DIするか。ライフタイムを管理するのだるいのでDIの方がいいかも - return &Bbolt{} + db, err := bolt.Open(filepath, 0600, nil) + if err != nil { + return nil, err + } + db.Update(func(tx *bolt.Tx) error { + _, err := 
tx.CreateBucket([]byte(bucket)) + if err != nil { + return fmt.Errorf("failed to create bucket: %w", err) + } + return nil + }) + return &Bbolt{ + db: db, + file: filepath, + }, nil } func (b *Bbolt) Set(key string, val []byte) error { @@ -29,7 +49,7 @@ func (b *Bbolt) Set(key string, val []byte) error { func (b *Bbolt) Get(key string) ([]byte, bool, error) { var val []byte if err := b.db.View(func(tx *bolt.Tx) error { - b := tx.Bucket([]byte(b.bucket)) + b := tx.Bucket([]byte(bucket)) copy(val, b.Get([]byte(key))) return nil }); err != nil { @@ -43,40 +63,15 @@ func (b *Bbolt) Get(key string) ([]byte, bool, error) { return val, true, nil } -// func main() { -// f, err := os.Create("my.db") -// f.Close() -// defer os.Remove(f.Name()) - -// // Open the my.db data file in your current directory. -// // It will be created if it doesn't exist. -// db, err := bolt.Open("my.db", 0600, nil) -// if err != nil { -// log.Fatal(err) -// } -// defer db.Close() +func (b *Bbolt) Close() (err error) { + if cerr := b.db.Close(); cerr != nil { + err = cerr + } -// db.Update(func(tx *bolt.Tx) error { -// _, err := tx.CreateBucket([]byte("MyBucket")) -// if err != nil { -// return fmt.Errorf("create bucket: %s", err) -// } -// return nil -// }) + if rerr := os.RemoveAll(b.file); rerr != nil { + err = errors.Wrap(rerr, err.Error()) + } -// db.Update(func(tx *bolt.Tx) error { -// b := tx.Bucket([]byte("MyBucket")) -// err := b.Put([]byte("answer"), nil) -// return err -// }) + return err +} -// db.View(func(tx *bolt.Tx) error { -// b := tx.Bucket([]byte("MyBucket")) -// v := b.Get([]byte("answer")) -// if v == nil { -// fmt.Println("No answer found. 
We can differentiate this from the empty value") -// } -// fmt.Println("The key exists") -// return nil -// }) -// } From 4f876019fa9b41c066a9d9448f72f8a31959186e Mon Sep 17 00:00:00 2001 From: ykadowak Date: Fri, 8 Sep 2023 05:50:06 +0000 Subject: [PATCH 040/101] fix bbolt bug --- internal/cache/bbolt/bbolt.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/internal/cache/bbolt/bbolt.go b/internal/cache/bbolt/bbolt.go index 656417da0a..a48c45515e 100644 --- a/internal/cache/bbolt/bbolt.go +++ b/internal/cache/bbolt/bbolt.go @@ -36,8 +36,8 @@ func New(filepath string) (*Bbolt, error) { func (b *Bbolt) Set(key string, val []byte) error { if err := b.db.Update(func(tx *bolt.Tx) error { - b := tx.Bucket(val) - err := b.Put([]byte(key), nil) + b := tx.Bucket([]byte(bucket)) + err := b.Put([]byte(key), val) return err }); err != nil { return err @@ -50,7 +50,13 @@ func (b *Bbolt) Get(key string) ([]byte, bool, error) { var val []byte if err := b.db.View(func(tx *bolt.Tx) error { b := tx.Bucket([]byte(bucket)) - copy(val, b.Get([]byte(key))) + ret := b.Get([]byte(key)) + if ret == nil { + // key not found + return nil + } + val = make([]byte, len(ret)) + copy(val, ret) return nil }); err != nil { return nil, false, err From 01225aa7f94bb104973ce673004174e6e4caedc8 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Fri, 8 Sep 2023 05:50:12 +0000 Subject: [PATCH 041/101] add bbolt test --- internal/cache/bbolt/bbolt_test.go | 32 ++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 internal/cache/bbolt/bbolt_test.go diff --git a/internal/cache/bbolt/bbolt_test.go b/internal/cache/bbolt/bbolt_test.go new file mode 100644 index 0000000000..d4608e9a3b --- /dev/null +++ b/internal/cache/bbolt/bbolt_test.go @@ -0,0 +1,32 @@ +package bbolt_test + +import ( + "path/filepath" + "testing" + + "github.com/stretchr/testify/require" + "github.com/vdaas/vald/internal/cache/bbolt" +) + +func TestBbolt(t *testing.T) { + tempdir := 
t.TempDir() + tmpfile := filepath.Join(tempdir, "test.db") + b, err := bbolt.New(tmpfile) + require.NoError(t, err) + + err = b.Set("key", []byte("value")) + require.NoError(t, err) + + val, ok, err := b.Get("key") + require.NoError(t, err) + require.True(t, ok) + require.Equal(t, []byte("value"), val) + + val, ok, err = b.Get("no exist key") + require.NoError(t, err) + require.False(t, ok) + require.Nil(t, val) + + err = b.Close() + require.NoError(t, err) +} From 428ee35f8b057fa49fa905689c29ec7ee655714b Mon Sep 17 00:00:00 2001 From: ykadowak Date: Fri, 8 Sep 2023 05:50:30 +0000 Subject: [PATCH 042/101] [TEMP] use bbolt as persistent cache --- pkg/index/job/correction/service/corrector.go | 22 +++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 74da34d433..8fe6f86cc8 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -25,6 +25,7 @@ import ( agent "github.com/vdaas/vald/apis/grpc/v1/agent/core" "github.com/vdaas/vald/apis/grpc/v1/payload" "github.com/vdaas/vald/apis/grpc/v1/vald" + "github.com/vdaas/vald/internal/cache/bbolt" "github.com/vdaas/vald/internal/cache/persistent" "github.com/vdaas/vald/internal/client/v1/client/discoverer" "github.com/vdaas/vald/internal/errors" @@ -53,6 +54,7 @@ type correct struct { uncommittedUUIDsCount uint32 checkedId map[string]struct{} // TODO: use mmap if necessary checkedIdPersistent persistent.PCache + checkedIdBbolt *bbolt.Bbolt rwmu sync.RWMutex } @@ -64,11 +66,20 @@ func New(cfg *config.Data, discoverer discoverer.Client) (Corrector, error) { return nil, err } + d := filepath.Join(os.TempDir(), "bbolt") + file.MkdirAll(d, os.ModePerm) + p = filepath.Join(d, "checkedid.db") + b, err := bbolt.New(p) + if err != nil { + return nil, err + } + return &correct{ cfg: cfg, discoverer: discoverer, checkedId: make(map[string]struct{}), 
checkedIdPersistent: pc, + checkedIdBbolt: b, }, nil } @@ -125,7 +136,9 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { func (c *correct) PreStop(_ context.Context) error { log.Info("removing persistent cache files...") - return c.checkedIdPersistent.Close() + err1 := c.checkedIdPersistent.Close() + err2 := c.checkedIdBbolt.Close() + return errors.Join(err1, err2) } func (c *correct) correct(ctx context.Context) (err error) { @@ -308,10 +321,10 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { // check if the index is already checked id := res.GetVector().GetId() - // DEBUG: configで切り替え ok := false if c.cfg.Corrector.PCache { - _, ok, err = c.checkedIdPersistent.Get(id) + // _, ok, err = c.checkedIdPersistent.Get(id) + _, ok, err = c.checkedIdBbolt.Get(id) if err != nil { return err } @@ -343,7 +356,8 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { // DEBUG: Testing pcache if c.cfg.Corrector.PCache { - err = c.checkedIdPersistent.Set(id, struct{}{}) + // err = c.checkedIdPersistent.Set(id, struct{}{}) + err = c.checkedIdBbolt.Set(id, nil) if err != nil { return err } From 81b800e320c80fa5109d7e4a5862805b4ffc87d1 Mon Sep 17 00:00:00 2001 From: "deepsource-autofix[bot]" <62050782+deepsource-autofix[bot]@users.noreply.github.com> Date: Fri, 8 Sep 2023 05:50:53 +0000 Subject: [PATCH 043/101] style: Format code with prettier and gofumpt --- internal/cache/bbolt/bbolt.go | 1 - 1 file changed, 1 deletion(-) diff --git a/internal/cache/bbolt/bbolt.go b/internal/cache/bbolt/bbolt.go index a48c45515e..2c666c91b0 100644 --- a/internal/cache/bbolt/bbolt.go +++ b/internal/cache/bbolt/bbolt.go @@ -80,4 +80,3 @@ func (b *Bbolt) Close() (err error) { return err } - From bfc73c486ec0440d4dc27b1188c3c93f6af4a5fd Mon Sep 17 00:00:00 2001 From: ykadowak Date: Fri, 8 Sep 2023 07:02:48 +0000 Subject: [PATCH 044/101] add SetBatch to bbolt --- internal/cache/bbolt/bbolt.go | 20 ++++++++++++++++++++ 
internal/cache/bbolt/bbolt_test.go | 27 +++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/internal/cache/bbolt/bbolt.go b/internal/cache/bbolt/bbolt.go index 2c666c91b0..ef2664d13e 100644 --- a/internal/cache/bbolt/bbolt.go +++ b/internal/cache/bbolt/bbolt.go @@ -5,6 +5,7 @@ import ( "os" "github.com/vdaas/vald/internal/errors" + "github.com/vdaas/vald/internal/sync" bolt "go.etcd.io/bbolt" ) @@ -46,6 +47,25 @@ func (b *Bbolt) Set(key string, val []byte) error { return nil } +func (b *Bbolt) SetBatch(kv map[string]struct{}) error { + var wg sync.WaitGroup + for k := range kv { + wg.Add(1) + go func(key string) { + defer wg.Done() + b.db.Batch(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte(bucket)) + // FIXME: for index correction, value doesn't matter, but for more general use, it should be considered + err := b.Put([]byte(key), nil) + return err + }) + }(k) + } + wg.Wait() + + return nil +} + func (b *Bbolt) Get(key string) ([]byte, bool, error) { var val []byte if err := b.db.View(func(tx *bolt.Tx) error { diff --git a/internal/cache/bbolt/bbolt_test.go b/internal/cache/bbolt/bbolt_test.go index d4608e9a3b..362a6dbe1d 100644 --- a/internal/cache/bbolt/bbolt_test.go +++ b/internal/cache/bbolt/bbolt_test.go @@ -30,3 +30,30 @@ func TestBbolt(t *testing.T) { err = b.Close() require.NoError(t, err) } + +func TestSetBatch(t *testing.T) { + tempdir := t.TempDir() + tmpfile := filepath.Join(tempdir, "test.db") + b, err := bbolt.New(tmpfile) + require.NoError(t, err) + + kv := map[string]struct{}{ + "key1": {}, + "key2": {}, + "key3": {}, + "key4": {}, + "key5": {}, + } + + err = b.SetBatch(kv) + require.NoError(t, err) + + for k := range kv { + _, ok, err := b.Get(k) + require.NoError(t, err) + require.True(t, ok) + } + + err = b.Close() + require.NoError(t, err) +} From d4f3695fd7e5b5cb69fa64066f3ecd906aa15765 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Fri, 8 Sep 2023 07:16:48 +0000 Subject: [PATCH 045/101] use batch to write map to disk 
--- pkg/index/job/correction/service/corrector.go | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 8fe6f86cc8..d2d2581c16 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -249,6 +249,11 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { if err := c.discoverer.GetClient().OrderedRange(ctx, c.agentAddrs, func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { + + // DEBUG: + tmpSet := make(map[string]struct{}) + // ~DEBUG: + // current address is the leftAgentAddrs[0] because this is OrderedRange and // leftAgentAddrs is copied from c.agentAddrs leftAgentAddrs = leftAgentAddrs[1:] @@ -271,6 +276,15 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { log.Errorf("err group returned error: %v", err) return err } + + // DEBUG: + log.Info("writing cache to disk...") + if err := c.checkedIdBbolt.SetBatch(tmpSet); err != nil { + log.Errorf("SetBatch failed: %v", err) + return err + } + // ~DEBUG: + log.Infof("correction finished for agent %s", addr) return nil } @@ -357,10 +371,13 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { // DEBUG: Testing pcache if c.cfg.Corrector.PCache { // err = c.checkedIdPersistent.Set(id, struct{}{}) - err = c.checkedIdBbolt.Set(id, nil) - if err != nil { - return err - } + // err = c.checkedIdBbolt.Set(id, nil) + // if err != nil { + // return err + // } + c.rwmu.Lock() + tmpSet[id] = struct{}{} + c.rwmu.Unlock() } else { c.rwmu.Lock() c.checkedId[id] = struct{}{} From 9975237fbf5811c50e0af19bd40ffd4ad7fcc439 Mon Sep 17 00:00:00 2001 From: "deepsource-autofix[bot]" <62050782+deepsource-autofix[bot]@users.noreply.github.com> Date: Fri, 8 Sep 2023 07:17:06 +0000 Subject: [PATCH 046/101] style: Format code with prettier and gofumpt --- 
pkg/index/job/correction/service/corrector.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index d2d2581c16..e899826d89 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -249,7 +249,6 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { if err := c.discoverer.GetClient().OrderedRange(ctx, c.agentAddrs, func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { - // DEBUG: tmpSet := make(map[string]struct{}) // ~DEBUG: From 2b00e05cb5cea5e23da0e4c95c2c952dc0cf5cfd Mon Sep 17 00:00:00 2001 From: ykadowak Date: Fri, 8 Sep 2023 07:52:41 +0000 Subject: [PATCH 047/101] delete the map elements on finalize --- pkg/index/job/correction/service/corrector.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index e899826d89..4690a40f2a 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -282,6 +282,11 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { log.Errorf("SetBatch failed: %v", err) return err } + // delete all the key from the tmpSet + for k := range tmpSet { + delete(tmpSet, k) + } + tmpSet = nil // ~DEBUG: log.Infof("correction finished for agent %s", addr) From a611e88f7ffc68f8a0462ca17a1c9cb78027eef8 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Fri, 8 Sep 2023 08:12:34 +0000 Subject: [PATCH 048/101] manually call GC after the map shrink --- pkg/index/job/correction/service/corrector.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 4690a40f2a..7bdcc607ec 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ 
-19,6 +19,7 @@ import ( "io" "os" "path/filepath" + "runtime" "sync" "sync/atomic" @@ -287,6 +288,7 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { delete(tmpSet, k) } tmpSet = nil + runtime.GC() // ~DEBUG: log.Infof("correction finished for agent %s", addr) From e21441cecf8cc70ef9cdf0777db7147f14e339c4 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Fri, 8 Sep 2023 08:31:37 +0000 Subject: [PATCH 049/101] add limit to SetBatch goroutine number --- internal/cache/bbolt/bbolt.go | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/internal/cache/bbolt/bbolt.go b/internal/cache/bbolt/bbolt.go index ef2664d13e..43318c5efc 100644 --- a/internal/cache/bbolt/bbolt.go +++ b/internal/cache/bbolt/bbolt.go @@ -1,12 +1,13 @@ package bbolt import ( + "context" "fmt" "os" "github.com/vdaas/vald/internal/errors" - "github.com/vdaas/vald/internal/sync" bolt "go.etcd.io/bbolt" + "golang.org/x/sync/errgroup" ) type Bbolt struct { @@ -48,20 +49,21 @@ func (b *Bbolt) Set(key string, val []byte) error { } func (b *Bbolt) SetBatch(kv map[string]struct{}) error { - var wg sync.WaitGroup + eg, _ := errgroup.WithContext(context.Background()) + eg.SetLimit(200) for k := range kv { - wg.Add(1) - go func(key string) { - defer wg.Done() + key := k + eg.Go(func() error { b.db.Batch(func(tx *bolt.Tx) error { b := tx.Bucket([]byte(bucket)) // FIXME: for index correction, value doesn't matter, but for more general use, it should be considered err := b.Put([]byte(key), nil) return err }) - }(k) + return nil + }) } - wg.Wait() + eg.Wait() return nil } From 66bc79cae3d2c32599dcc155d1f85b2c101af2d1 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Fri, 8 Sep 2023 08:49:52 +0000 Subject: [PATCH 050/101] stop unnecesarry GC --- pkg/index/job/correction/service/corrector.go | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 7bdcc607ec..15b349d737 100644 
--- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -19,7 +19,6 @@ import ( "io" "os" "path/filepath" - "runtime" "sync" "sync/atomic" @@ -284,11 +283,7 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { return err } // delete all the key from the tmpSet - for k := range tmpSet { - delete(tmpSet, k) - } tmpSet = nil - runtime.GC() // ~DEBUG: log.Infof("correction finished for agent %s", addr) From 3a854053ec65e4d2a86851beeb0420a1a96078f0 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Fri, 8 Sep 2023 09:11:40 +0000 Subject: [PATCH 051/101] increase eg limit to the MaxBatchSize --- internal/cache/bbolt/bbolt.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/internal/cache/bbolt/bbolt.go b/internal/cache/bbolt/bbolt.go index 43318c5efc..174a98feee 100644 --- a/internal/cache/bbolt/bbolt.go +++ b/internal/cache/bbolt/bbolt.go @@ -50,7 +50,7 @@ func (b *Bbolt) Set(key string, val []byte) error { func (b *Bbolt) SetBatch(kv map[string]struct{}) error { eg, _ := errgroup.WithContext(context.Background()) - eg.SetLimit(200) + eg.SetLimit(b.db.MaxBatchSize) for k := range kv { key := k eg.Go(func() error { From 16d298605db845e6d1bdd96f915dc6178e085ad2 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Mon, 11 Sep 2023 05:47:37 +0000 Subject: [PATCH 052/101] use ch to set batch bbolt --- internal/cache/bbolt/bbolt.go | 14 +++++++ pkg/index/job/correction/service/corrector.go | 37 ++++++++++++------- 2 files changed, 38 insertions(+), 13 deletions(-) diff --git a/internal/cache/bbolt/bbolt.go b/internal/cache/bbolt/bbolt.go index 174a98feee..920138ba7d 100644 --- a/internal/cache/bbolt/bbolt.go +++ b/internal/cache/bbolt/bbolt.go @@ -68,6 +68,20 @@ func (b *Bbolt) SetBatch(kv map[string]struct{}) error { return nil } +// wait for this eg to make sure all the batches finished +func (b *Bbolt) SetBatch2(eg *errgroup.Group, key string, val []byte) error { + eg.Go(func() error { + 
b.db.Batch(func(tx *bolt.Tx) error { + b := tx.Bucket([]byte(bucket)) + // FIXME: for index correction, value doesn't matter, but for more general use, it should be considered + err := b.Put([]byte(key), nil) + return err + }) + return nil + }) + return nil +} + func (b *Bbolt) Get(key string) ([]byte, bool, error) { var val []byte if err := b.db.View(func(tx *bolt.Tx) error { diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 15b349d737..c939b1f0d2 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -249,9 +249,8 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { if err := c.discoverer.GetClient().OrderedRange(ctx, c.agentAddrs, func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { - // DEBUG: - tmpSet := make(map[string]struct{}) - // ~DEBUG: + // FIXME: set ch size with cfg or something + wch := make(chan string, 1024) // current address is the leftAgentAddrs[0] because this is OrderedRange and // leftAgentAddrs is copied from c.agentAddrs @@ -269,6 +268,20 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { log.Infof("starting correction for agent %s, concurrency: %d", addr, concurrency) + bolteg := stdeg.Group{} + bolteg.SetLimit(1000) + go func() { + for { + select { + case <-ctx.Done(): + log.Info("bbolt write goroutine finished") + return + case id := <-wch: + c.checkedIdBbolt.SetBatch2(&bolteg, id, nil) + } + } + }() + finalize := func() error { err = seg.Wait() if err != nil { @@ -276,15 +289,12 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { return err } - // DEBUG: - log.Info("writing cache to disk...") - if err := c.checkedIdBbolt.SetBatch(tmpSet); err != nil { - log.Errorf("SetBatch failed: %v", err) + err = bolteg.Wait() + if err != nil { + log.Errorf("bolt err group returned error: %v", err) return err } - // delete all 
the key from the tmpSet - tmpSet = nil - // ~DEBUG: + log.Info("bbolt all batch finished") log.Infof("correction finished for agent %s", addr) return nil @@ -376,9 +386,10 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { // if err != nil { // return err // } - c.rwmu.Lock() - tmpSet[id] = struct{}{} - c.rwmu.Unlock() + // c.rwmu.Lock() + // tmpSet[id] = struct{}{} + wch <- id + // c.rwmu.Unlock() } else { c.rwmu.Lock() c.checkedId[id] = struct{}{} From 919c9f9dd26d910e5a985065414e48f9a06f9cc8 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Fri, 15 Sep 2023 06:58:11 +0000 Subject: [PATCH 053/101] fix servers shutdown properly --- pkg/index/job/correction/usecase/corrector.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pkg/index/job/correction/usecase/corrector.go b/pkg/index/job/correction/usecase/corrector.go index 8633584f72..e738bba803 100644 --- a/pkg/index/job/correction/usecase/corrector.go +++ b/pkg/index/job/correction/usecase/corrector.go @@ -206,7 +206,13 @@ func (r *run) PreStop(ctx context.Context) error { return nil } -func (*run) Stop(context.Context) error { +func (r *run) Stop(ctx context.Context) error { + if r.observability != nil { + r.observability.Stop(ctx) + } + if r.server != nil { + r.server.Shutdown(ctx) + } return nil } From 9bd57c0b08dc106c44a562546ec13c67095c6b89 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Fri, 15 Sep 2023 08:24:41 +0000 Subject: [PATCH 054/101] use internal/kvs/bbolt --- internal/cache/bbolt/bbolt.go | 118 --------- internal/cache/bbolt/bbolt_test.go | 59 ----- internal/cache/persistent/pcache.go | 242 ------------------ internal/cache/persistent/pcache_test.go | 112 -------- pkg/index/job/correction/service/corrector.go | 93 ++----- 5 files changed, 22 insertions(+), 602 deletions(-) delete mode 100644 internal/cache/bbolt/bbolt.go delete mode 100644 internal/cache/bbolt/bbolt_test.go delete mode 100644 internal/cache/persistent/pcache.go delete mode 100644 
internal/cache/persistent/pcache_test.go diff --git a/internal/cache/bbolt/bbolt.go b/internal/cache/bbolt/bbolt.go deleted file mode 100644 index 920138ba7d..0000000000 --- a/internal/cache/bbolt/bbolt.go +++ /dev/null @@ -1,118 +0,0 @@ -package bbolt - -import ( - "context" - "fmt" - "os" - - "github.com/vdaas/vald/internal/errors" - bolt "go.etcd.io/bbolt" - "golang.org/x/sync/errgroup" -) - -type Bbolt struct { - db *bolt.DB - file string -} - -const bucket = "vald-bbolt-bucket" - -func New(filepath string) (*Bbolt, error) { - // TODO: 初期化をここでするか、DIするか。ライフタイムを管理するのだるいのでDIの方がいいかも - db, err := bolt.Open(filepath, 0600, nil) - if err != nil { - return nil, err - } - db.Update(func(tx *bolt.Tx) error { - _, err := tx.CreateBucket([]byte(bucket)) - if err != nil { - return fmt.Errorf("failed to create bucket: %w", err) - } - return nil - }) - return &Bbolt{ - db: db, - file: filepath, - }, nil -} - -func (b *Bbolt) Set(key string, val []byte) error { - if err := b.db.Update(func(tx *bolt.Tx) error { - b := tx.Bucket([]byte(bucket)) - err := b.Put([]byte(key), val) - return err - }); err != nil { - return err - } - - return nil -} - -func (b *Bbolt) SetBatch(kv map[string]struct{}) error { - eg, _ := errgroup.WithContext(context.Background()) - eg.SetLimit(b.db.MaxBatchSize) - for k := range kv { - key := k - eg.Go(func() error { - b.db.Batch(func(tx *bolt.Tx) error { - b := tx.Bucket([]byte(bucket)) - // FIXME: for index correction, value doesn't matter, but for more general use, it should be considered - err := b.Put([]byte(key), nil) - return err - }) - return nil - }) - } - eg.Wait() - - return nil -} - -// wait for this eg to make sure all the batches finished -func (b *Bbolt) SetBatch2(eg *errgroup.Group, key string, val []byte) error { - eg.Go(func() error { - b.db.Batch(func(tx *bolt.Tx) error { - b := tx.Bucket([]byte(bucket)) - // FIXME: for index correction, value doesn't matter, but for more general use, it should be considered - err := b.Put([]byte(key), 
nil) - return err - }) - return nil - }) - return nil -} - -func (b *Bbolt) Get(key string) ([]byte, bool, error) { - var val []byte - if err := b.db.View(func(tx *bolt.Tx) error { - b := tx.Bucket([]byte(bucket)) - ret := b.Get([]byte(key)) - if ret == nil { - // key not found - return nil - } - val = make([]byte, len(ret)) - copy(val, ret) - return nil - }); err != nil { - return nil, false, err - } - - if val == nil { - return nil, false, nil - } - - return val, true, nil -} - -func (b *Bbolt) Close() (err error) { - if cerr := b.db.Close(); cerr != nil { - err = cerr - } - - if rerr := os.RemoveAll(b.file); rerr != nil { - err = errors.Wrap(rerr, err.Error()) - } - - return err -} diff --git a/internal/cache/bbolt/bbolt_test.go b/internal/cache/bbolt/bbolt_test.go deleted file mode 100644 index 362a6dbe1d..0000000000 --- a/internal/cache/bbolt/bbolt_test.go +++ /dev/null @@ -1,59 +0,0 @@ -package bbolt_test - -import ( - "path/filepath" - "testing" - - "github.com/stretchr/testify/require" - "github.com/vdaas/vald/internal/cache/bbolt" -) - -func TestBbolt(t *testing.T) { - tempdir := t.TempDir() - tmpfile := filepath.Join(tempdir, "test.db") - b, err := bbolt.New(tmpfile) - require.NoError(t, err) - - err = b.Set("key", []byte("value")) - require.NoError(t, err) - - val, ok, err := b.Get("key") - require.NoError(t, err) - require.True(t, ok) - require.Equal(t, []byte("value"), val) - - val, ok, err = b.Get("no exist key") - require.NoError(t, err) - require.False(t, ok) - require.Nil(t, val) - - err = b.Close() - require.NoError(t, err) -} - -func TestSetBatch(t *testing.T) { - tempdir := t.TempDir() - tmpfile := filepath.Join(tempdir, "test.db") - b, err := bbolt.New(tmpfile) - require.NoError(t, err) - - kv := map[string]struct{}{ - "key1": {}, - "key2": {}, - "key3": {}, - "key4": {}, - "key5": {}, - } - - err = b.SetBatch(kv) - require.NoError(t, err) - - for k := range kv { - _, ok, err := b.Get(k) - require.NoError(t, err) - require.True(t, ok) - } - - 
err = b.Close() - require.NoError(t, err) -} diff --git a/internal/cache/persistent/pcache.go b/internal/cache/persistent/pcache.go deleted file mode 100644 index d9bda78b50..0000000000 --- a/internal/cache/persistent/pcache.go +++ /dev/null @@ -1,242 +0,0 @@ -package persistent - -import ( - "encoding/gob" - "io" - "io/fs" - "os" - "sync" - - "github.com/vdaas/vald/internal/errors" - "github.com/vdaas/vald/internal/file" - "github.com/zeebo/xxh3" -) - -type PCache interface { - Get(string) (struct{}, bool, error) - Set(string, struct{}) error - Delete(string) error - Close() error -} - -type Shard interface { - Get(string) (struct{}, bool, error) - Set(string, struct{}) error - Delete(string) error - Close() error -} - -var ( - _ PCache = (*pcache)(nil) - _ Shard = (*shard)(nil) -) - -type pcache struct { - shards [slen]Shard -} - -type shard struct { - path string - dl int - l int - mu sync.Mutex - perm fs.FileMode -} - -const ( - // slen is shards length. - slen = 512 - // slen = 4096 - // mask is slen-1 Hex value. - mask = 0x1FF - // mask = 0xFFF. -) - -func NewPCache(basePath string) (PCache, error) { - var shards [slen]Shard - for i := range shards { - s, err := newShard(basePath) - if err != nil { - return nil, err - } - shards[i] = s - } - return &pcache{ - shards: shards, - }, nil -} - -// New returns the pcache that satisfies the PCache interface. 
-func (p *pcache) Get(key string) (struct{}, bool, error) { - data, ok, err := p.shards[getShardID(key)].Get(key) - if err != nil { - return data, false, err - } - if !ok { - return data, false, nil - } - - return data, true, nil -} - -func (p *pcache) Set(key string, data struct{}) error { - return p.shards[getShardID(key)].Set(key, data) -} - -func (p *pcache) Delete(key string) error { - return p.shards[getShardID(key)].Delete(key) -} - -func (p *pcache) Close() error { - for _, s := range p.shards { - err := s.Close() - if err != nil { - return err - } - } - return nil -} - -func newShard(basePath string) (*shard, error) { - f, err := os.CreateTemp(basePath, "pcache-*") - if err != nil { - return nil, err - } - defer f.Close() - - return &shard{ - perm: 0600, - path: f.Name(), - }, nil -} - -func (s *shard) Get(key string) (data struct{}, ok bool, err error) { - s.mu.Lock() - defer s.mu.Unlock() - - f, err := file.Open(s.path, os.O_RDWR, s.perm) - if err != nil { - return - } - defer f.Close() - - m := make(map[string]struct{}, s.l) - err = gob.NewDecoder(f).Decode(&m) - if err != nil { - // empty shard file returns EOF - if errors.Is(err, io.EOF) { - return data, false, nil - } - return data, false, err - } - - data, ok = m[key] - - m = nil // TODO: clear - - return data, ok, nil -} - -func (s *shard) Set(key string, data struct{}) (err error) { - s.mu.Lock() - defer s.mu.Unlock() - - f, err := file.Open(s.path, os.O_RDWR, s.perm) - if err != nil { - return err - } - defer f.Close() - - m := make(map[string]struct{}, s.l) - if s.dl != 0 { - err = gob.NewDecoder(f).Decode(&m) - if err != nil { - return err - } - } - - m[key] = data - - err = f.Truncate(0) - if err != nil { - return err - } - _, err = f.Seek(0, 0) - if err != nil { - return err - } - err = gob.NewEncoder(f).Encode(m) - if err != nil { - return err - } - - fi, err := f.Stat() - if err != nil { - return err - } - s.dl = int(fi.Size()) - s.l++ - - return f.Sync() -} - -func (s *shard) Delete(key 
string) (err error) { - s.mu.Lock() - defer s.mu.Unlock() - - f, err := file.Open(s.path, os.O_RDWR, s.perm) - if err != nil { - return err - } - defer f.Close() - - m := make(map[string]struct{}, s.l) - err = gob.NewDecoder(f).Decode(&m) - if err != nil { - return - } - - delete(m, key) - - // Write the updated data to the file - err = f.Truncate(0) - if err != nil { - return err - } - _, err = f.Seek(0, 0) - if err != nil { - return err - } - - err = gob.NewEncoder(f).Encode(m) - if err != nil { - return err - } - m = nil // TODO: use clear after 1.21 - - fi, err := f.Stat() - if err != nil { - return err - } - s.dl = int(fi.Size()) - s.l-- - - return f.Sync() -} - -func (s *shard) Close() error { - s.mu.Lock() - defer s.mu.Unlock() - - if err := os.Remove(s.path); err != nil { - return err - } - return nil -} - -func getShardID(key string) (id uint64) { - if len(key) > 128 { - return xxh3.HashString(key[:128]) & mask - } - return xxh3.HashString(key) & mask -} diff --git a/internal/cache/persistent/pcache_test.go b/internal/cache/persistent/pcache_test.go deleted file mode 100644 index 34a850f123..0000000000 --- a/internal/cache/persistent/pcache_test.go +++ /dev/null @@ -1,112 +0,0 @@ -package persistent_test - -import ( - "fmt" - "testing" - - "github.com/stretchr/testify/require" - "github.com/vdaas/vald/internal/cache/persistent" - "github.com/vdaas/vald/internal/sync" -) - -func TestPersistentCache(t *testing.T) { - base := t.TempDir() - pc, err := persistent.NewPCache(base) - require.NoError(t, err) - - len := 4096 - - for i := 0; i < len; i++ { - _, ok, err := pc.Get(fmt.Sprint(i)) - require.NoError(t, err) - require.False(t, ok, fmt.Sprintf("i: %d", i)) - } - - for i := 0; i < len; i++ { - err := pc.Set(fmt.Sprint(i), struct{}{}) - require.NoError(t, err) - } - - for i := 0; i < len; i++ { - _, ok, err := pc.Get(fmt.Sprint(i)) - require.NoError(t, err) - require.True(t, ok, fmt.Sprintf("i: %d", i)) - } - - for i := 0; i < len; i++ { - err := 
pc.Set(fmt.Sprint(i), struct{}{}) - require.NoError(t, err) - } - - for i := 0; i < len; i++ { - err := pc.Delete(fmt.Sprint(i)) - require.NoError(t, err) - } - - for i := 0; i < len; i++ { - _, ok, err := pc.Get(fmt.Sprint(i)) - require.NoError(t, err) - require.False(t, ok, fmt.Sprintf("i: %d", i)) - } - - err = pc.Close() - require.NoError(t, err) -} - -func TestPersistentCacheConcurrent(t *testing.T) { - base := t.TempDir() - pc, err := persistent.NewPCache(base) - require.NoError(t, err) - - len := 4096 - - var wg sync.WaitGroup - for i := 0; i < len; i++ { - wg.Add(1) - go func(key int) { - defer wg.Done() - err := pc.Set(fmt.Sprint(key), struct{}{}) - require.NoError(t, err) - }(i) - } - - wg.Wait() - - for i := 0; i < len; i++ { - wg.Add(1) - go func(key int) { - defer wg.Done() - _, ok, err := pc.Get(fmt.Sprint(key)) - require.NoError(t, err) - require.True(t, ok, fmt.Sprintf("i: %d", key)) - }(i) - } - - wg.Wait() - - for i := 0; i < len; i++ { - wg.Add(1) - go func(key int) { - defer wg.Done() - err := pc.Delete(fmt.Sprint(key)) - require.NoError(t, err) - }(i) - } - - wg.Wait() - - for i := 0; i < len; i++ { - wg.Add(1) - go func(key int) { - defer wg.Done() - _, ok, err := pc.Get(fmt.Sprint(key)) - require.NoError(t, err) - require.False(t, ok, fmt.Sprintf("i: %d", key)) - }(i) - } - - wg.Wait() - - err = pc.Close() - require.NoError(t, err) -} diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index c939b1f0d2..8b49a1b3f6 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -25,9 +25,8 @@ import ( agent "github.com/vdaas/vald/apis/grpc/v1/agent/core" "github.com/vdaas/vald/apis/grpc/v1/payload" "github.com/vdaas/vald/apis/grpc/v1/vald" - "github.com/vdaas/vald/internal/cache/bbolt" - "github.com/vdaas/vald/internal/cache/persistent" "github.com/vdaas/vald/internal/client/v1/client/discoverer" + 
"github.com/vdaas/vald/internal/db/kvs/bbolt" "github.com/vdaas/vald/internal/errors" "github.com/vdaas/vald/internal/file" "github.com/vdaas/vald/internal/log" @@ -52,34 +51,23 @@ type correct struct { indexInfos valdsync.Map[string, *payload.Info_Index_Count] uuidsCount uint32 uncommittedUUIDsCount uint32 - checkedId map[string]struct{} // TODO: use mmap if necessary - checkedIdPersistent persistent.PCache - checkedIdBbolt *bbolt.Bbolt + checkedId bbolt.Bbolt rwmu sync.RWMutex } func New(cfg *config.Data, discoverer discoverer.Client) (Corrector, error) { - p := filepath.Join(os.TempDir(), "pcache") - file.MkdirAll(p, os.ModePerm) - pc, err := persistent.NewPCache(p) - if err != nil { - return nil, err - } - d := filepath.Join(os.TempDir(), "bbolt") file.MkdirAll(d, os.ModePerm) - p = filepath.Join(d, "checkedid.db") - b, err := bbolt.New(p) + dbfile := filepath.Join(d, "checkedid.db") + bolt, err := bbolt.New(dbfile, "", os.FileMode(0o600)) if err != nil { return nil, err } return &correct{ - cfg: cfg, - discoverer: discoverer, - checkedId: make(map[string]struct{}), - checkedIdPersistent: pc, - checkedIdBbolt: b, + cfg: cfg, + discoverer: discoverer, + checkedId: bolt, }, nil } @@ -113,11 +101,7 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { log.Info("starting correction...") if c.cfg.Corrector.UseCache { - if c.cfg.Corrector.PCache { - log.Info("with persistent cache...") - } else { - log.Info("with in-memory cache...") - } + log.Info("with bbolt disk cache...") if err := c.correctWithCache(ctx); err != nil { log.Errorf("there's some errors while correction: %v", err) return nil, err @@ -136,9 +120,10 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { func (c *correct) PreStop(_ context.Context) error { log.Info("removing persistent cache files...") - err1 := c.checkedIdPersistent.Close() - err2 := c.checkedIdBbolt.Close() - return errors.Join(err1, err2) + if err := c.checkedId.Close(true); err != nil { + return 
err + } + return nil } func (c *correct) correct(ctx context.Context) (err error) { @@ -249,9 +234,6 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { if err := c.discoverer.GetClient().OrderedRange(ctx, c.agentAddrs, func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { - // FIXME: set ch size with cfg or something - wch := make(chan string, 1024) - // current address is the leftAgentAddrs[0] because this is OrderedRange and // leftAgentAddrs is copied from c.agentAddrs leftAgentAddrs = leftAgentAddrs[1:] @@ -266,21 +248,8 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { concurrency := c.cfg.Corrector.GetStreamListConcurrency() seg.SetLimit(concurrency) - log.Infof("starting correction for agent %s, concurrency: %d", addr, concurrency) - - bolteg := stdeg.Group{} - bolteg.SetLimit(1000) - go func() { - for { - select { - case <-ctx.Done(): - log.Info("bbolt write goroutine finished") - return - case id := <-wch: - c.checkedIdBbolt.SetBatch2(&bolteg, id, nil) - } - } - }() + bolteg, ctx := stdeg.WithContext(ctx) + bolteg.SetLimit(2048) finalize := func() error { err = seg.Wait() @@ -304,6 +273,9 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { streamEnd := make(chan struct{}) var once sync.Once var mu sync.Mutex + + log.Infof("starting correction for agent %s, concurrency: %d", addr, concurrency) + // 事前にRecvすべき件数はわかるのだからその回数だけfor文を回すようにする方がいいか for { select { @@ -347,16 +319,9 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { id := res.GetVector().GetId() ok := false - if c.cfg.Corrector.PCache { - // _, ok, err = c.checkedIdPersistent.Get(id) - _, ok, err = c.checkedIdBbolt.Get(id) - if err != nil { - return err - } - } else { - c.rwmu.RLock() - _, ok = c.checkedId[id] - c.rwmu.RUnlock() + _, ok, err = c.checkedId.Get([]byte(id)) + if err != nil { + log.Errorf("failed to perform Get from bbolt: %v", err) } if ok { @@ -379,22 
+344,8 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { return nil // continue other processes } - // DEBUG: Testing pcache - if c.cfg.Corrector.PCache { - // err = c.checkedIdPersistent.Set(id, struct{}{}) - // err = c.checkedIdBbolt.Set(id, nil) - // if err != nil { - // return err - // } - // c.rwmu.Lock() - // tmpSet[id] = struct{}{} - wch <- id - // c.rwmu.Unlock() - } else { - c.rwmu.Lock() - c.checkedId[id] = struct{}{} - c.rwmu.Unlock() - } + // TODO: define error group + c.checkedId.AsyncSet(bolteg, []byte(id), nil) return nil }) From 55c658653c902167e8a765dc9d1803d8ac6b3421 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Fri, 15 Sep 2023 08:33:38 +0000 Subject: [PATCH 055/101] refactor --- pkg/index/job/correction/service/corrector.go | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 8b49a1b3f6..68fbc05a60 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -51,8 +51,7 @@ type correct struct { indexInfos valdsync.Map[string, *payload.Info_Index_Count] uuidsCount uint32 uncommittedUUIDsCount uint32 - checkedId bbolt.Bbolt - rwmu sync.RWMutex + checkedID bbolt.Bbolt } func New(cfg *config.Data, discoverer discoverer.Client) (Corrector, error) { @@ -67,7 +66,7 @@ func New(cfg *config.Data, discoverer discoverer.Client) (Corrector, error) { return &correct{ cfg: cfg, discoverer: discoverer, - checkedId: bolt, + checkedID: bolt, }, nil } @@ -120,7 +119,7 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { func (c *correct) PreStop(_ context.Context) error { log.Info("removing persistent cache files...") - if err := c.checkedId.Close(true); err != nil { + if err := c.checkedID.Close(true); err != nil { return err } return nil @@ -319,7 +318,7 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { id := 
res.GetVector().GetId() ok := false - _, ok, err = c.checkedId.Get([]byte(id)) + _, ok, err = c.checkedID.Get([]byte(id)) if err != nil { log.Errorf("failed to perform Get from bbolt: %v", err) } @@ -345,7 +344,7 @@ func (c *correct) correctWithCache(ctx context.Context) (err error) { } // TODO: define error group - c.checkedId.AsyncSet(bolteg, []byte(id), nil) + c.checkedID.AsyncSet(bolteg, []byte(id), nil) return nil }) From bc92686c7288f9e08aca1d9d8864f74b9fc2f7f8 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 19 Sep 2023 01:46:43 +0000 Subject: [PATCH 056/101] always use bbolt cache for correction --- internal/config/corrector.go | 4 - pkg/index/job/correction/service/corrector.go | 112 +----------------- 2 files changed, 4 insertions(+), 112 deletions(-) diff --git a/internal/config/corrector.go b/internal/config/corrector.go index 55570834b7..6e4064a09d 100644 --- a/internal/config/corrector.go +++ b/internal/config/corrector.go @@ -42,10 +42,6 @@ type Corrector struct { // Discoverer represent agent discoverer service configuration Discoverer *DiscovererClient `json:"discoverer" yaml:"discoverer"` - - // FIXME: Debug - UseCache bool `json:"use_cache" yaml:"use_cache"` - PCache bool `json:"p_cache" yaml:"p_cache"` } // Bind binds the actual data from the Indexer receiver field. 
diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 68fbc05a60..bcd871a05f 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -98,19 +98,10 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { return true }) - log.Info("starting correction...") - if c.cfg.Corrector.UseCache { - log.Info("with bbolt disk cache...") - if err := c.correctWithCache(ctx); err != nil { - log.Errorf("there's some errors while correction: %v", err) - return nil, err - } - } else { - log.Info("without cache...") - if err := c.correct(ctx); err != nil { - log.Errorf("there's some errors while correction: %v", err) - return nil, err - } + log.Info("starting correction with bbolt disk cache...") + if err := c.correct(ctx); err != nil { + log.Errorf("there's some errors while correction: %v", err) + return nil, err } log.Info("correction finished successfully") @@ -126,101 +117,6 @@ func (c *correct) PreStop(_ context.Context) error { } func (c *correct) correct(ctx context.Context) (err error) { - if err := c.discoverer.GetClient().OrderedRange(ctx, c.agentAddrs, - func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { - vc := vald.NewValdClient(conn) - stream, err := vc.StreamListObject(ctx, &payload.Object_List_Request{}) - if err != nil { - return err - } - - seg, ctx := stdeg.WithContext(ctx) - concurrency := c.cfg.Corrector.GetStreamListConcurrency() - seg.SetLimit(concurrency) - - log.Infof("starting correction for agent %s, concurrency: %d", addr, concurrency) - - finalize := func() error { - err = seg.Wait() - if err != nil { - log.Errorf("err group returned error: %v", err) - return err - } - log.Infof("correction finished for agent %s", addr) - return nil - } - - streamEnd := make(chan struct{}) - var once sync.Once - var mu sync.Mutex - // maybe just iterate through the number of indexes is ok? 
- // that way, we don't have to use this `streamEnd` channel - for { - select { - case <-ctx.Done(): - return finalize() - case <-streamEnd: - return finalize() - default: - // TODO: when vald internal errgroup is changed to block when eg limitation is reached, - // switch to vald version of errgroup. - seg.Go(func() error { - mu.Lock() - // As long as we don't stream.Recv() from the stream, we do not consume the memory of the message. - // So by limiting the number of this errgroup.Go instances, we can limit the memory usage - // https://github.com/grpc/grpc-go/blob/33f9fa2e6e5bcf4cf8fe45133e23779ae6e43f6c/rpc_util.go#L795 - res, err := stream.Recv() - mu.Unlock() - - if errors.Is(err, io.EOF) { - log.Debugf("StreamListObject stream finished for agent %s", addr) - once.Do(func() { - close(streamEnd) - }) - return nil - } - if err != nil { - log.Errorf("StreamListObject stream finished unexpectedly: %v", err) - return err - } - - if res.GetVector() == nil { - st := res.GetStatus() - log.Error(st.GetCode(), st.GetMessage(), st.GetDetails()) - // continue - return nil - } - - log.Debugf("received object in StreamListObject: agent(%s), id(%s), timestamp(%v)", addr, res.GetVector().GetId(), res.GetVector().GetTimestamp()) - if err := c.checkConsistency( - ctx, - &vectorReplica{ - addr: addr, - vec: res.GetVector(), - }, - c.agentAddrs, // FIXME: no cache pattern always have to check all the agents - ); err != nil { - // TODO: valdとstdでerrorの処理が違うので注意 - // (valdはerrが着信するまでにスタートしていた処理は行われる) - // (stdはerrが着信すると他は全て止まる) - log.Errorf("failed to check consistency: %v", err) - return nil // continue other processes - } - - return nil - }) - } - } - }, - ); err != nil { - log.Errorf("failed to range over agents(%v): %v", c.agentAddrs, err) - return err - } - - return nil -} - -func (c *correct) correctWithCache(ctx context.Context) (err error) { // leftAgentAddrs is the agents' addr that hasn't been corrected yet. 
// This is used to know which agents possibly have the same index as the target replica. // We can say this because, thanks to caching, there is no way that the target replica is From 319ec8b29990b2e76117deb046c8020a74964ce2 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 19 Sep 2023 01:48:00 +0000 Subject: [PATCH 057/101] update sample.yaml for correction --- cmd/index/job/correction/sample.yaml | 34 ++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/cmd/index/job/correction/sample.yaml b/cmd/index/job/correction/sample.yaml index 117ddd99c7..61d156e47d 100644 --- a/cmd/index/job/correction/sample.yaml +++ b/cmd/index/job/correction/sample.yaml @@ -19,7 +19,7 @@ version: v0.0.0 time_zone: JST logging: format: raw - level: debug + level: info logger: glg server_config: servers: @@ -79,7 +79,6 @@ corrector: agent_namespace: "default" node_name: "" stream_list_concurrency: 100 - use_cache: false discoverer: duration: 500ms client: @@ -201,3 +200,34 @@ corrector: cert: /path/to/cert enabled: false key: /path/to/key +observability: + enabled: false + otlp: + collector_endpoint: "otel-collector.monitoring.svc.cluster.local:4317" + trace_batch_timeout: "1s" + trace_export_timeout: "1m" + trace_max_export_batch_size: 1024 + trace_max_queue_size: 256 + metrics_export_interval: "1s" + metrics_export_timeout: "1m" + attribute: + namespace: "_MY_POD_NAMESPACE_" + pod_name: "_MY_POD_NAME_" + node_name: "_MY_NODE_NAME_" + service_name: "vald-index-job-correction" + metrics: + enable_cgo: true + enable_goroutine: true + enable_memory: true + enable_version_info: true + version_info_labels: + - vald_version + - server_name + - git_commit + - build_time + - go_version + - go_os + - go_arch + - ngt_version + trace: + enabled: true From eb5cdb089c9756acb7ec9113e95c225b43d0153e Mon Sep 17 00:00:00 2001 From: "deepsource-autofix[bot]" <62050782+deepsource-autofix[bot]@users.noreply.github.com> Date: Tue, 19 Sep 2023 01:49:46 +0000 Subject: 
[PATCH 058/101] style: format code with Prettier and Gofumpt This commit fixes the style issues introduced in 319ec8b according to the output from Prettier and Gofumpt. Details: https://github.com/vdaas/vald/pull/2152 --- cmd/index/job/correction/sample.yaml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cmd/index/job/correction/sample.yaml b/cmd/index/job/correction/sample.yaml index 61d156e47d..87ddd08657 100644 --- a/cmd/index/job/correction/sample.yaml +++ b/cmd/index/job/correction/sample.yaml @@ -221,13 +221,13 @@ observability: enable_memory: true enable_version_info: true version_info_labels: - - vald_version - - server_name - - git_commit - - build_time - - go_version - - go_os - - go_arch - - ngt_version + - vald_version + - server_name + - git_commit + - build_time + - go_version + - go_os + - go_arch + - ngt_version trace: enabled: true From d3616e51cece56a6cd6804bbcabf88321d6a74d1 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 19 Sep 2023 06:21:26 +0000 Subject: [PATCH 059/101] use go std slices pkg --- pkg/index/job/correction/service/corrector.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index bcd871a05f..20f22aada1 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -14,11 +14,13 @@ package service import ( + "cmp" "context" "fmt" "io" "os" "path/filepath" + "slices" "sync" "sync/atomic" @@ -33,7 +35,6 @@ import ( "github.com/vdaas/vald/internal/net/grpc" "github.com/vdaas/vald/internal/net/grpc/codes" "github.com/vdaas/vald/internal/net/grpc/status" - "github.com/vdaas/vald/internal/slices" valdsync "github.com/vdaas/vald/internal/sync" "github.com/vdaas/vald/pkg/index/job/correction/config" stdeg "golang.org/x/sync/errgroup" @@ -347,9 +348,9 @@ func (c *correct) correctTimestamp(ctx context.Context, targetReplica *vectorRep 
allReplicas := append(foundReplicas, targetReplica) // sort by timestamp - slices.SortFunc(allReplicas, func(i, j *vectorReplica) bool { + slices.SortFunc(allReplicas, func(i, j *vectorReplica) int { // largest timestamp means the latest - return i.vec.GetTimestamp() > j.vec.GetTimestamp() + return cmp.Compare(j.vec.GetTimestamp(), i.vec.GetTimestamp()) }) latest := allReplicas[0] From 89e23390a6f1d4850a0437bb7f9eef73da2f1667 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 19 Sep 2023 06:43:08 +0000 Subject: [PATCH 060/101] refactor --- pkg/index/job/correction/service/corrector.go | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 20f22aada1..c7c8c223e1 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -213,13 +213,10 @@ func (c *correct) correct(ctx context.Context) (err error) { // check if the index is already checked id := res.GetVector().GetId() - - ok := false - _, ok, err = c.checkedID.Get([]byte(id)) + _, ok, err := c.checkedID.Get([]byte(id)) if err != nil { log.Errorf("failed to perform Get from bbolt: %v", err) } - if ok { // already checked index return nil @@ -240,7 +237,7 @@ func (c *correct) correct(ctx context.Context) (err error) { return nil // continue other processes } - // TODO: define error group + // now this id is checked so set it to the disk cache c.checkedID.AsyncSet(bolteg, []byte(id), nil) return nil @@ -305,6 +302,7 @@ func (c *correct) checkConsistency(ctx context.Context, targetReplica *vectorRep } } + // the target replica is found in this agent with the addr log.Debugf("object found: agent(%s), id(%v), timestamp(%v)", addr, v.GetId(), v.GetTimestamp()) mu.Lock() @@ -312,12 +310,12 @@ func (c *correct) checkConsistency(ctx context.Context, targetReplica *vectorRep addr: addr, vec: v, }) - for i, a := range availableAddrs { - if a == 
addr { - availableAddrs = availableAddrs[:i+copy(availableAddrs[i:], availableAddrs[i+1:])] - break - } - } + + // Remove this addr from availableAddrs because this addr has the target replica + // and not available to insert the replica to fix the index replica number + slices.DeleteFunc(availableAddrs, func(availableAddr string) bool { + return availableAddr == addr + }) mu.Unlock() return nil From e2eeabce3f8d89d3c5652f4e869be55445317554 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 19 Sep 2023 06:54:13 +0000 Subject: [PATCH 061/101] add comment --- pkg/index/job/correction/service/corrector.go | 1 + 1 file changed, 1 insertion(+) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index c7c8c223e1..95e78e31fe 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -448,6 +448,7 @@ func (c *correct) correctReplica( func (c *correct) updateObject(ctx context.Context, addr string, vector *payload.Object_Vector) error { res, err := c.discoverer.GetClient(). 
Do(grpc.WithGRPCMethod(ctx, "core.v1.Vald/Update"), addr, func(ctx context.Context, conn *grpc.ClientConn, copts ...grpc.CallOption) (interface{}, error) { + // TODO: use UpdateTimestamp when it's implemented because here we just want to update only the timestamp but not the vector return vald.NewUpdateClient(conn).Update(ctx, &payload.Update_Request{ Vector: vector, // FIXME: this should be deleted after Config.Timestamp deprecation From 9a9e622aef33ca81cc9880604ced7ed17f049edf Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 19 Sep 2023 06:56:53 +0000 Subject: [PATCH 062/101] remove valdsync --- pkg/index/job/correction/service/corrector.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 95e78e31fe..a6e16cdccb 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -21,7 +21,6 @@ import ( "os" "path/filepath" "slices" - "sync" "sync/atomic" agent "github.com/vdaas/vald/apis/grpc/v1/agent/core" @@ -35,7 +34,7 @@ import ( "github.com/vdaas/vald/internal/net/grpc" "github.com/vdaas/vald/internal/net/grpc/codes" "github.com/vdaas/vald/internal/net/grpc/status" - valdsync "github.com/vdaas/vald/internal/sync" + "github.com/vdaas/vald/internal/sync" "github.com/vdaas/vald/pkg/index/job/correction/config" stdeg "golang.org/x/sync/errgroup" ) @@ -49,7 +48,7 @@ type correct struct { cfg *config.Data discoverer discoverer.Client agentAddrs []string - indexInfos valdsync.Map[string, *payload.Info_Index_Count] + indexInfos sync.Map[string, *payload.Info_Index_Count] uuidsCount uint32 uncommittedUUIDsCount uint32 checkedID bbolt.Bbolt @@ -515,7 +514,7 @@ func (c *correct) deleteObject(ctx context.Context, addr string, vector *payload func (c *correct) loadInfos(ctx context.Context) (err error) { var u, ucu uint32 - var infoMap valdsync.Map[string, *payload.Info_Index_Count] + var infoMap 
sync.Map[string, *payload.Info_Index_Count] err = c.discoverer.GetClient().RangeConcurrent(ctx, len(c.discoverer.GetAddrs(ctx)), func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption, From 4c22bcc1a779b6d4db9b3b7f1135b844c993f9a5 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 19 Sep 2023 06:59:04 +0000 Subject: [PATCH 063/101] use vald errgroup --- pkg/index/job/correction/service/corrector.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index a6e16cdccb..0c5c5d0617 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -35,8 +35,8 @@ import ( "github.com/vdaas/vald/internal/net/grpc/codes" "github.com/vdaas/vald/internal/net/grpc/status" "github.com/vdaas/vald/internal/sync" + "github.com/vdaas/vald/internal/sync/errgroup" "github.com/vdaas/vald/pkg/index/job/correction/config" - stdeg "golang.org/x/sync/errgroup" ) type Corrector interface { @@ -139,11 +139,11 @@ func (c *correct) correct(ctx context.Context) (err error) { return err } - seg, ctx := stdeg.WithContext(ctx) + seg, ctx := errgroup.WithContext(ctx) concurrency := c.cfg.Corrector.GetStreamListConcurrency() seg.SetLimit(concurrency) - bolteg, ctx := stdeg.WithContext(ctx) + bolteg, ctx := errgroup.WithContext(ctx) bolteg.SetLimit(2048) finalize := func() error { From 4dc480167487b3a9f430c550c0ab4beef1ae615b Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 19 Sep 2023 08:45:18 +0000 Subject: [PATCH 064/101] refactor --- pkg/index/job/correction/service/corrector.go | 66 +++++++++---------- 1 file changed, 30 insertions(+), 36 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 0c5c5d0617..9bee36eec5 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ 
-139,48 +139,30 @@ func (c *correct) correct(ctx context.Context) (err error) { return err } - seg, ctx := errgroup.WithContext(ctx) + // context and errgroup for stream.Recv and correction + sctx, scancel := context.WithCancel(ctx) + defer scancel() + seg, sctx := errgroup.WithContext(sctx) concurrency := c.cfg.Corrector.GetStreamListConcurrency() seg.SetLimit(concurrency) + // errgroup for bbolt AsyncSet bolteg, ctx := errgroup.WithContext(ctx) bolteg.SetLimit(2048) - finalize := func() error { - err = seg.Wait() - if err != nil { - log.Errorf("err group returned error: %v", err) - return err - } - - err = bolteg.Wait() - if err != nil { - log.Errorf("bolt err group returned error: %v", err) - return err - } - log.Info("bbolt all batch finished") - - log.Infof("correction finished for agent %s", addr) - return nil - } - defer finalize() - - streamEnd := make(chan struct{}) - var once sync.Once var mu sync.Mutex - log.Infof("starting correction for agent %s, concurrency: %d", addr, concurrency) - // 事前にRecvすべき件数はわかるのだからその回数だけfor文を回すようにする方がいいか + // 事前にRecvすべき件数は事前にわからない。なぜなら処理中に新規でinsertされる可能性があるため + // TODO: そういうものはtimestampで判断して弾かないといけない for { select { - case <-ctx.Done(): - return ctx.Err() - case <-streamEnd: - return nil + case <-sctx.Done(): + if !errors.Is(sctx.Err(), context.Canceled) { + log.Errorf("context done unexpectedly: %v", sctx.Err()) + } + goto Finalize default: - // TODO: when vald internal errgroup is changed to block when eg limitation is reached, - // switch to vald version of errgroup. seg.Go(func() error { mu.Lock() // As long as we don't stream.Recv() from the stream, we do not consume the memory of the message. 
@@ -191,9 +173,7 @@ func (c *correct) correct(ctx context.Context) (err error) { if errors.Is(err, io.EOF) { log.Debugf("StreamListObject stream finished for agent %s", addr) - once.Do(func() { - close(streamEnd) - }) + scancel() return nil } if err != nil { @@ -229,9 +209,6 @@ func (c *correct) correct(ctx context.Context) (err error) { }, leftAgentAddrs, ); err != nil { - // TODO: valdとstdでerrorの処理が違うので注意 - // (valdはerrが着信するまでにスタートしていた処理は行われる) - // (stdはerrが着信すると他は全て止まる) log.Errorf("failed to check consistency: %v", err) return nil // continue other processes } @@ -243,6 +220,22 @@ func (c *correct) correct(ctx context.Context) (err error) { }) } } + + Finalize: + err = seg.Wait() + if err != nil { + log.Errorf("err group returned error: %v", err) + } + + berr := bolteg.Wait() + if berr != nil { + log.Errorf("bolt err group returned error: %v", err) + err = errors.Join(err, berr) + } + log.Info("bbolt all batch finished") + + log.Infof("correction finished for agent %s", addr) + return err }, ); err != nil { log.Errorf("failed to range over agents(%v): %v", c.agentAddrs, err) @@ -288,6 +281,7 @@ func (c *correct) checkConsistency(ctx context.Context, targetReplica *vectorRep Id: targetReplica.vec.GetId(), }, }) + if err != nil { if st, ok := status.FromError(err); !ok { log.Errorf("gRPC call returned not a gRPC status error: %v", err) From 705dc2952291eafa6ab0b169d136b7ee5cae50cc Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 20 Sep 2023 01:50:16 +0000 Subject: [PATCH 065/101] Define ErrNoAvailableAgentToInsert --- internal/errors/corrector.go | 2 ++ pkg/index/job/correction/service/corrector.go | 6 ++---- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/internal/errors/corrector.go b/internal/errors/corrector.go index b47296e424..53c4a136d8 100644 --- a/internal/errors/corrector.go +++ b/internal/errors/corrector.go @@ -18,3 +18,5 @@ package errors var ErrIndexReplicaOne = New("nothing to correct when index replica is 1") + +var 
ErrNoAvailableAgentToInsert = New("no available agent to insert replica") diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 9bee36eec5..b4f52da4bb 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -384,11 +384,9 @@ func (c *correct) correctReplica( // when there are less replicas than the correct number, add the extra replicas // TODO: refine this logic. pretty complicated if diff < 0 { - log.Infof("replica shortage of vector %s. inserting to other agents...", - targetReplica.vec.GetId()) + log.Infof("replica shortage of vector %s. inserting to other agents...", targetReplica.vec.GetId()) if len(availableAddrs) == 0 { - // TODO: define errors in errors pkg - return fmt.Errorf("no available agent to insert replica") + return errors.ErrNoAvailableAgentToInsert } // inserting with the reverse order of availableAddrs since the last agent has the lowest memory usage From 952a6e730f76da867b1330b653d87384654a388e Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 20 Sep 2023 01:52:05 +0000 Subject: [PATCH 066/101] update comment in English --- pkg/index/job/correction/service/corrector.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index b4f52da4bb..8d9353d3f2 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -153,8 +153,9 @@ func (c *correct) correct(ctx context.Context) (err error) { var mu sync.Mutex log.Infof("starting correction for agent %s, concurrency: %d", addr, concurrency) - // 事前にRecvすべき件数は事前にわからない。なぜなら処理中に新規でinsertされる可能性があるため - // TODO: そういうものはtimestampで判断して弾かないといけない + // The number of items to be received in advance is not known in advance. + // This is because there is a possibility of new items being inserted during processing. 
+ // TODO: BTW, we need to ignore these index by checking the timestamp. for { select { case <-sctx.Done(): From 5f933988668a541a90062a5440c04ec87dce9131 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 20 Sep 2023 01:55:27 +0000 Subject: [PATCH 067/101] Apply new actions yaml format --- .../dockers-index-job-correction.yml | 122 +----------------- 1 file changed, 6 insertions(+), 116 deletions(-) diff --git a/.github/workflows/dockers-index-job-correction.yml b/.github/workflows/dockers-index-job-correction.yml index e67b051def..7e6742d247 100644 --- a/.github/workflows/dockers-index-job-correction.yml +++ b/.github/workflows/dockers-index-job-correction.yml @@ -40,6 +40,7 @@ on: pull_request: paths: - ".github/actions/docker-build/actions.yaml" + - ".github/workflows/_docker-image.yaml" - ".github/workflows/dockers-index-job-correction.yml" - "go.mod" - "go.sum" @@ -55,6 +56,7 @@ on: pull_request_target: paths: - ".github/actions/docker-build/actions.yaml" + - ".github/workflows/_docker-image.yaml" - ".github/workflows/dockers-index-job-correction.yml" - "go.mod" - "go.sum" @@ -68,121 +70,9 @@ on: - "dockers/index/job/correction/Dockerfile" - "versions/GO_VERSION" -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref != 'refs/heads/main' && github.ref || github.sha }}-${{ github.event_name }} - cancel-in-progress: true - jobs: - dump_contexts_to_log: - runs-on: ubuntu-latest - steps: - - name: Dump GitHub context - id: github_context_step - run: echo $JSON - env: - JSON: ${{ toJSON(github) }} - - name: Dump job context - run: echo $JSON - env: - JSON: ${{ toJSON(job) }} - - name: Dump steps context - run: echo $JSON - env: - JSON: ${{ toJSON(steps) }} - - name: Dump runner context - run: echo $JSON - env: - JSON: ${{ toJSON(runner) }} - - name: Dump strategy context - run: echo $JSON - env: - JSON: ${{ toJSON(strategy) }} - - name: Dump matrix context - run: echo $JSON - env: - JSON: ${{ toJSON(matrix) }} build: - 
strategy: - max-parallel: 4 - runs-on: ubuntu-latest - if: ${{ (github.event_name == 'pull_request' && github.event.pull_request.head.repo.fork == false) || (github.event.pull_request.head.repo.fork == true && github.event_name == 'pull_request_target' && contains(github.event.pull_request.labels.*.name, 'ci/approved')) || (github.event_name == 'push' && github.ref == 'refs/heads/main') || startsWith( github.ref, 'refs/tags/') }} - steps: - - name: Get ref - id: ref - run: | - if [ ${{ github.event.pull_request.head.sha }} != "" ]; then - echo ref=${{ github.event.pull_request.head.sha }} >> $GITHUB_OUTPUT - else - echo ref=${{ github.sha }} >> $GITHUB_OUTPUT - fi - - uses: actions/checkout@v3 - with: - ref: ${{ steps.ref.outputs.ref }} - - name: set git config - run: | - git config --global --add safe.directory ${GITHUB_WORKSPACE} - - name: Setup QEMU - uses: docker/setup-qemu-action@v2 - with: - platforms: all - - name: Setup Docker Buildx - id: buildx - uses: docker/setup-buildx-action@v2 - with: - buildkitd-flags: "--debug" - - name: Login to DockerHub - uses: docker/login-action@v2 - with: - username: ${{ secrets.DOCKERHUB_USER }} - password: ${{ secrets.DOCKERHUB_PASS }} - - name: Login to GitHub Container Registry - uses: docker/login-action@v2 - with: - registry: ghcr.io - username: ${{ secrets.PACKAGE_USER }} - password: ${{ secrets.PACKAGE_TOKEN }} - - name: Build and Publish - id: build_and_publish - uses: ./.github/actions/docker-build - with: - target: index-job-correction - builder: ${{ steps.buildx.outputs.name }} - - name: Initialize CodeQL - if: startsWith( github.ref, 'refs/tags/') - uses: github/codeql-action/init@v2 - - name: Run vulnerability scanner (table) - if: startsWith( github.ref, 'refs/tags/') - uses: aquasecurity/trivy-action@master - with: - image-ref: "${{ steps.build_and_publish.outputs.IMAGE_NAME }}:${{ steps.build_and_publish.outputs.PRIMARY_TAG }}" - format: "table" - - name: Run vulnerability scanner (sarif) - if: startsWith( 
github.ref, 'refs/tags/') - uses: aquasecurity/trivy-action@master - with: - image-ref: "${{ steps.build_and_publish.outputs.IMAGE_NAME }}:${{ steps.build_and_publish.outputs.PRIMARY_TAG }}" - format: "template" - template: "@/contrib/sarif.tpl" - output: "trivy-results.sarif" - - name: Upload Trivy scan results to Security tab - if: startsWith( github.ref, 'refs/tags/') - uses: github/codeql-action/upload-sarif@v2 - with: - sarif_file: "trivy-results.sarif" - slack: - name: Slack notification - needs: build - runs-on: ubuntu-latest - if: github.ref == 'refs/heads/main' || startsWith( github.ref, 'refs/tags/') - steps: - - uses: technote-space/workflow-conclusion-action@v2 - with: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - uses: 8398a7/action-slack@v3 - with: - author_name: index-job-correction image build - status: ${{ env.WORKFLOW_CONCLUSION }} - only_mention_fail: channel - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - SLACK_WEBHOOK_URL: ${{ secrets.SLACK_NOTIFY_WEBHOOK_URL }} + uses: ./.github/workflows/_docker-image.yaml + with: + target: index-job-correction + secrets: inherit From c860ddc919d6dec77f657bf81b6234ba4b8dbf54 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 20 Sep 2023 01:58:07 +0000 Subject: [PATCH 068/101] Disable godox --- .golangci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.golangci.yml b/.golangci.yml index a3bb179b38..065e2b5550 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -51,7 +51,6 @@ linters: - gochecknoinits - goconst - godot - - godox - gofumpt - goimports - gomnd @@ -99,6 +98,7 @@ linters: # - gocognit # - gocritic # - gocyclo + # - godox # - goerr113 # - gofmt # - goheader From 2c2bb09482a5c82b8741fb63ba4072ab9bd1dfed Mon Sep 17 00:00:00 2001 From: "deepsource-autofix[bot]" <62050782+deepsource-autofix[bot]@users.noreply.github.com> Date: Wed, 20 Sep 2023 01:58:40 +0000 Subject: [PATCH 069/101] style: format code with Prettier and Gofumpt This commit fixes the style issues introduced in 
c860ddc according to the output from Prettier and Gofumpt. Details: https://github.com/vdaas/vald/pull/2194 --- pkg/index/job/correction/service/corrector.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 8d9353d3f2..f0b2892b81 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -282,7 +282,6 @@ func (c *correct) checkConsistency(ctx context.Context, targetReplica *vectorRep Id: targetReplica.vec.GetId(), }, }) - if err != nil { if st, ok := status.FromError(err); !ok { log.Errorf("gRPC call returned not a gRPC status error: %v", err) From 2ecf8cfba9e22da973a3062163c50df5b7817e6c Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 20 Sep 2023 02:04:54 +0000 Subject: [PATCH 070/101] remove comment --- internal/runner/runner.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/internal/runner/runner.go b/internal/runner/runner.go index 4dccd14c90..fcf741c51d 100644 --- a/internal/runner/runner.go +++ b/internal/runner/runner.go @@ -215,8 +215,6 @@ func Run(ctx context.Context, run Runner, name string) (err error) { emap[err.Error()]++ } - // waif for all the goroutines to finish. 
- // this errgroup is global across the program err = errgroup.Wait() if err != nil && !errors.Is(err, context.DeadlineExceeded) && From 8bb271d139d0601d37eff72464a2b40812d91927 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 20 Sep 2023 02:19:56 +0000 Subject: [PATCH 071/101] Apply format --- internal/config/corrector.go | 2 +- internal/errors/corrector.go | 2 +- pkg/index/job/correction/config/config.go | 2 +- pkg/index/job/correction/service/corrector.go | 2 +- pkg/index/job/correction/usecase/corrector.go | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/internal/config/corrector.go b/internal/config/corrector.go index 6e4064a09d..fe72c16d89 100644 --- a/internal/config/corrector.go +++ b/internal/config/corrector.go @@ -2,7 +2,7 @@ // Copyright (C) 2019-2023 vdaas.org vald team // // Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. +// You may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 diff --git a/internal/errors/corrector.go b/internal/errors/corrector.go index 53c4a136d8..9b3aeb0efe 100644 --- a/internal/errors/corrector.go +++ b/internal/errors/corrector.go @@ -2,7 +2,7 @@ // Copyright (C) 2019-2023 vdaas.org vald team // // Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. +// You may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 diff --git a/pkg/index/job/correction/config/config.go b/pkg/index/job/correction/config/config.go index d704bf59d0..90eafa50f8 100644 --- a/pkg/index/job/correction/config/config.go +++ b/pkg/index/job/correction/config/config.go @@ -2,7 +2,7 @@ // Copyright (C) 2019-2023 vdaas.org vald team // // Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. +// You may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index f0b2892b81..cd41a801df 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -1,7 +1,7 @@ // Copyright (C) 2019-2023 vdaas.org vald team // // Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. +// You may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 diff --git a/pkg/index/job/correction/usecase/corrector.go b/pkg/index/job/correction/usecase/corrector.go index e738bba803..e3040e9ad8 100644 --- a/pkg/index/job/correction/usecase/corrector.go +++ b/pkg/index/job/correction/usecase/corrector.go @@ -1,7 +1,7 @@ // Copyright (C) 2019-2023 vdaas.org vald team // // Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. +// You may not use this file except in compliance with the License. 
// You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 From e5f73a342f341450267ad5b6727b2d9bd7f0c25b Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 20 Sep 2023 02:28:45 +0000 Subject: [PATCH 072/101] Add type check for type assertion --- cmd/index/job/correction/main.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/cmd/index/job/correction/main.go b/cmd/index/job/correction/main.go index 04a604a04b..d549626691 100644 --- a/cmd/index/job/correction/main.go +++ b/cmd/index/job/correction/main.go @@ -1,7 +1,7 @@ // Copyright (C) 2019-2023 vdaas.org vald team // // Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. +// You may not use this file except in compliance with the License. // You may obtain a copy of the License at // // https://www.apache.org/licenses/LICENSE-2.0 @@ -45,7 +45,11 @@ func main() { return cfg, &cfg.GlobalConfig, nil }), runner.WithDaemonInitializer(func(cfg interface{}) (runner.Runner, error) { - return usecase.New(cfg.(*config.Data)) + c, ok := cfg.(*config.Data) + if !ok { + return nil, errors.ErrInvalidConfig + } + return usecase.New(c) }), ) })(); err != nil { From d3e4d78f4695dd6d32d755fcaf418e9116b8781a Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 20 Sep 2023 02:32:12 +0000 Subject: [PATCH 073/101] use const to specify filemode --- pkg/index/job/correction/service/corrector.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index cd41a801df..6302b916fe 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -54,11 +54,13 @@ type correct struct { checkedID bbolt.Bbolt } +const filemode = 0o600 + func New(cfg *config.Data, discoverer discoverer.Client) (Corrector, error) { d := filepath.Join(os.TempDir(), "bbolt") 
file.MkdirAll(d, os.ModePerm) dbfile := filepath.Join(d, "checkedid.db") - bolt, err := bbolt.New(dbfile, "", os.FileMode(0o600)) + bolt, err := bbolt.New(dbfile, "", os.FileMode(filemode)) if err != nil { return nil, err } From 7e5cd714107e98fa3a95487a179a82eff05c8de6 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 20 Sep 2023 02:41:38 +0000 Subject: [PATCH 074/101] Add bbolt concurrency as config --- cmd/index/job/correction/sample.yaml | 3 ++- internal/config/corrector.go | 13 ++++++++++++- pkg/index/job/correction/service/corrector.go | 9 +++++---- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/cmd/index/job/correction/sample.yaml b/cmd/index/job/correction/sample.yaml index 87ddd08657..7b2eaf263c 100644 --- a/cmd/index/job/correction/sample.yaml +++ b/cmd/index/job/correction/sample.yaml @@ -78,7 +78,8 @@ corrector: agent_dns: vald-agent-ngt.default.svc.cluster.local agent_namespace: "default" node_name: "" - stream_list_concurrency: 100 + stream_list_concurrency: 200 + bbolt_async_write_concurrency: 2048 discoverer: duration: 500ms client: diff --git a/internal/config/corrector.go b/internal/config/corrector.go index fe72c16d89..fa0b6621b7 100644 --- a/internal/config/corrector.go +++ b/internal/config/corrector.go @@ -40,6 +40,9 @@ type Corrector struct { // this directly affects the memory usage of this job StreamListConcurrency int `json:"stream_list_concurrency" yaml:"stream_list_concurrency"` + // BboltAsyncWriteConcurrency represent concurrency for bbolt async write + BboltAsyncWriteConcurrency int `json:"bbolt_async_write_concurrency" yaml:"bbolt_async_write_concurrency"` + // Discoverer represent agent discoverer service configuration Discoverer *DiscovererClient `json:"discoverer" yaml:"discoverer"` } @@ -62,5 +65,13 @@ func (c *Corrector) GetStreamListConcurrency() int { if c != nil { return c.StreamListConcurrency } - return -1 + return 200 +} + +// Returns 2048 when not specified since not setting this could use up all the 
available momory +func (c *Corrector) GetBboltAsyncWriteConcurrency() int { + if c != nil { + return c.BboltAsyncWriteConcurrency + } + return 2048 } diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 6302b916fe..0a0279fb40 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -145,15 +145,16 @@ func (c *correct) correct(ctx context.Context) (err error) { sctx, scancel := context.WithCancel(ctx) defer scancel() seg, sctx := errgroup.WithContext(sctx) - concurrency := c.cfg.Corrector.GetStreamListConcurrency() - seg.SetLimit(concurrency) + sconcurrency := c.cfg.Corrector.GetStreamListConcurrency() + seg.SetLimit(sconcurrency) // errgroup for bbolt AsyncSet bolteg, ctx := errgroup.WithContext(ctx) - bolteg.SetLimit(2048) + bconcurrency := c.cfg.Corrector.GetBboltAsyncWriteConcurrency() + bolteg.SetLimit(bconcurrency) var mu sync.Mutex - log.Infof("starting correction for agent %s, concurrency: %d", addr, concurrency) + log.Infof("starting correction for agent %s, stream concurrency: %d, bbolt concurrency: %d", addr, sconcurrency, bconcurrency) // The number of items to be received in advance is not known in advance. // This is because there is a possibility of new items being inserted during processing. 
From a184b22326e462639740ac6abe2a42be660df267 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 20 Sep 2023 02:43:50 +0000 Subject: [PATCH 075/101] fix var style --- pkg/index/job/correction/service/corrector.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 0a0279fb40..c828dd2be9 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -348,9 +348,9 @@ func (c *correct) correctTimestamp(ctx context.Context, targetReplica *vectorRep }) latest := allReplicas[0] - latestTs := latest.vec.GetTimestamp() + latestTS := latest.vec.GetTimestamp() for _, replica := range allReplicas { - if replica.vec.GetTimestamp() == latestTs { + if replica.vec.GetTimestamp() == latestTS { // no inconsistency continue } From 4972294e3bce22c6dddc9c22e1362b6966868521 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 20 Sep 2023 02:46:17 +0000 Subject: [PATCH 076/101] Suppress linter --- pkg/index/job/correction/usecase/corrector.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/index/job/correction/usecase/corrector.go b/pkg/index/job/correction/usecase/corrector.go index e3040e9ad8..5567016f7a 100644 --- a/pkg/index/job/correction/usecase/corrector.go +++ b/pkg/index/job/correction/usecase/corrector.go @@ -144,7 +144,7 @@ func (r *run) Start(ctx context.Context) (<-chan error, error) { // defer cancel() log.Info("starting servers") - ech := make(chan error, 3) + ech := make(chan error, 3) //nolint:gomnd var oech, nech, sech <-chan error r.eg.Go(safety.RecoverFunc(func() (err error) { defer close(ech) From ef931239d8ef5db83a66b5c3c022d20b5b649582 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 20 Sep 2023 07:43:26 +0000 Subject: [PATCH 077/101] fix comment --- internal/config/corrector.go | 7 ++++--- pkg/index/job/correction/service/corrector.go | 2 +- 2 files changed, 5 insertions(+), 
4 deletions(-) diff --git a/internal/config/corrector.go b/internal/config/corrector.go index fa0b6621b7..56424e455f 100644 --- a/internal/config/corrector.go +++ b/internal/config/corrector.go @@ -60,12 +60,13 @@ func (c *Corrector) Bind() *Corrector { return c } -// GetStreamListConcurrency returns the StreamListConcurrency field value if set, -1 otherwise, which means no limit. +// GetStreamListConcurrency returns the StreamListConcurrency field value if set, otherwise 200 is set, +// since not setting this could use up all the available momory func (c *Corrector) GetStreamListConcurrency() int { if c != nil { return c.StreamListConcurrency } - return 200 + return 200 //nolint:gomnd } // Returns 2048 when not specified since not setting this could use up all the available momory @@ -73,5 +74,5 @@ func (c *Corrector) GetBboltAsyncWriteConcurrency() int { if c != nil { return c.BboltAsyncWriteConcurrency } - return 2048 + return 2048 //nolint:gomnd } diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index c828dd2be9..02eee789a6 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -445,7 +445,7 @@ func (c *correct) updateObject(ctx context.Context, addr string, vector *payload // TODO: use UpdateTimestamp when it's implemented because here we just want to update only the timestamp but not the vector return vald.NewUpdateClient(conn).Update(ctx, &payload.Update_Request{ Vector: vector, - // FIXME: this should be deleted after Config.Timestamp deprecation + // TODO: this should be deleted after Config.Timestamp deprecation Config: &payload.Update_Config{ // TODO: Decrementing because it's gonna be incremented befor being pushed // to vqueue in the agent. 
This is a not ideal workaround for the current vqueue implementation From 375c04efb9e57f19396e59b7ed615903fa97f407 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 20 Sep 2023 07:44:01 +0000 Subject: [PATCH 078/101] add test template --- internal/config/corrector_test.go | 366 ++++++++++++++++++ .../job/correction/config/config_test.go | 93 +++++ .../job/correction/service/corrector_test.go | 346 +++++++++++++++++ .../job/correction/usecase/corrector_test.go | 93 +++++ 4 files changed, 898 insertions(+) create mode 100644 internal/config/corrector_test.go create mode 100644 pkg/index/job/correction/config/config_test.go create mode 100644 pkg/index/job/correction/service/corrector_test.go create mode 100644 pkg/index/job/correction/usecase/corrector_test.go diff --git a/internal/config/corrector_test.go b/internal/config/corrector_test.go new file mode 100644 index 0000000000..8cd0119742 --- /dev/null +++ b/internal/config/corrector_test.go @@ -0,0 +1,366 @@ +package config + +// NOT IMPLEMENTED BELOW +// +// func TestCorrector_Bind(t *testing.T) { +// type fields struct { +// AgentPort int +// AgentName string +// AgentNamespace string +// AgentDNS string +// CreationPoolSize uint32 +// NodeName string +// StreamListConcurrency int +// BboltAsyncWriteConcurrency int +// Discoverer *DiscovererClient +// } +// type want struct { +// want *Corrector +// } +// type test struct { +// name string +// fields fields +// want want +// checkFunc func(want, *Corrector) error +// beforeFunc func(*testing.T) +// afterFunc func(*testing.T) +// } +// defaultCheckFunc := func(w want, got *Corrector) error { +// if !reflect.DeepEqual(got, w.want) { +// return errors.Errorf("got: \"%#v\",\n\t\t\t\twant: \"%#v\"", got, w.want) +// } +// return nil +// } +// tests := []test{ +// // TODO test cases +// /* +// { +// name: "test_case_1", +// fields: fields { +// AgentPort:0, +// AgentName:"", +// AgentNamespace:"", +// AgentDNS:"", +// CreationPoolSize:0, +// NodeName:"", +// 
StreamListConcurrency:0, +// BboltAsyncWriteConcurrency:0, +// Discoverer:DiscovererClient{}, +// }, +// want: want{}, +// checkFunc: defaultCheckFunc, +// beforeFunc: func(t *testing.T,) { +// t.Helper() +// }, +// afterFunc: func(t *testing.T,) { +// t.Helper() +// }, +// }, +// */ +// +// // TODO test cases +// /* +// func() test { +// return test { +// name: "test_case_2", +// fields: fields { +// AgentPort:0, +// AgentName:"", +// AgentNamespace:"", +// AgentDNS:"", +// CreationPoolSize:0, +// NodeName:"", +// StreamListConcurrency:0, +// BboltAsyncWriteConcurrency:0, +// Discoverer:DiscovererClient{}, +// }, +// want: want{}, +// checkFunc: defaultCheckFunc, +// beforeFunc: func(t *testing.T,) { +// t.Helper() +// }, +// afterFunc: func(t *testing.T,) { +// t.Helper() +// }, +// } +// }(), +// */ +// } +// +// for _, tc := range tests { +// test := tc +// t.Run(test.name, func(tt *testing.T) { +// tt.Parallel() +// defer goleak.VerifyNone(tt, goleak.IgnoreCurrent()) +// if test.beforeFunc != nil { +// test.beforeFunc(tt) +// } +// if test.afterFunc != nil { +// defer test.afterFunc(tt) +// } +// checkFunc := test.checkFunc +// if test.checkFunc == nil { +// checkFunc = defaultCheckFunc +// } +// c := &Corrector{ +// AgentPort: test.fields.AgentPort, +// AgentName: test.fields.AgentName, +// AgentNamespace: test.fields.AgentNamespace, +// AgentDNS: test.fields.AgentDNS, +// CreationPoolSize: test.fields.CreationPoolSize, +// NodeName: test.fields.NodeName, +// StreamListConcurrency: test.fields.StreamListConcurrency, +// BboltAsyncWriteConcurrency: test.fields.BboltAsyncWriteConcurrency, +// Discoverer: test.fields.Discoverer, +// } +// +// got := c.Bind() +// if err := checkFunc(test.want, got); err != nil { +// tt.Errorf("error = %v", err) +// } +// +// }) +// } +// } +// +// func TestCorrector_GetStreamListConcurrency(t *testing.T) { +// type fields struct { +// AgentPort int +// AgentName string +// AgentNamespace string +// AgentDNS string +// 
CreationPoolSize uint32 +// NodeName string +// StreamListConcurrency int +// BboltAsyncWriteConcurrency int +// Discoverer *DiscovererClient +// } +// type want struct { +// want int +// } +// type test struct { +// name string +// fields fields +// want want +// checkFunc func(want, int) error +// beforeFunc func(*testing.T) +// afterFunc func(*testing.T) +// } +// defaultCheckFunc := func(w want, got int) error { +// if !reflect.DeepEqual(got, w.want) { +// return errors.Errorf("got: \"%#v\",\n\t\t\t\twant: \"%#v\"", got, w.want) +// } +// return nil +// } +// tests := []test{ +// // TODO test cases +// /* +// { +// name: "test_case_1", +// fields: fields { +// AgentPort:0, +// AgentName:"", +// AgentNamespace:"", +// AgentDNS:"", +// CreationPoolSize:0, +// NodeName:"", +// StreamListConcurrency:0, +// BboltAsyncWriteConcurrency:0, +// Discoverer:DiscovererClient{}, +// }, +// want: want{}, +// checkFunc: defaultCheckFunc, +// beforeFunc: func(t *testing.T,) { +// t.Helper() +// }, +// afterFunc: func(t *testing.T,) { +// t.Helper() +// }, +// }, +// */ +// +// // TODO test cases +// /* +// func() test { +// return test { +// name: "test_case_2", +// fields: fields { +// AgentPort:0, +// AgentName:"", +// AgentNamespace:"", +// AgentDNS:"", +// CreationPoolSize:0, +// NodeName:"", +// StreamListConcurrency:0, +// BboltAsyncWriteConcurrency:0, +// Discoverer:DiscovererClient{}, +// }, +// want: want{}, +// checkFunc: defaultCheckFunc, +// beforeFunc: func(t *testing.T,) { +// t.Helper() +// }, +// afterFunc: func(t *testing.T,) { +// t.Helper() +// }, +// } +// }(), +// */ +// } +// +// for _, tc := range tests { +// test := tc +// t.Run(test.name, func(tt *testing.T) { +// tt.Parallel() +// defer goleak.VerifyNone(tt, goleak.IgnoreCurrent()) +// if test.beforeFunc != nil { +// test.beforeFunc(tt) +// } +// if test.afterFunc != nil { +// defer test.afterFunc(tt) +// } +// checkFunc := test.checkFunc +// if test.checkFunc == nil { +// checkFunc = defaultCheckFunc 
+// } +// c := &Corrector{ +// AgentPort: test.fields.AgentPort, +// AgentName: test.fields.AgentName, +// AgentNamespace: test.fields.AgentNamespace, +// AgentDNS: test.fields.AgentDNS, +// CreationPoolSize: test.fields.CreationPoolSize, +// NodeName: test.fields.NodeName, +// StreamListConcurrency: test.fields.StreamListConcurrency, +// BboltAsyncWriteConcurrency: test.fields.BboltAsyncWriteConcurrency, +// Discoverer: test.fields.Discoverer, +// } +// +// got := c.GetStreamListConcurrency() +// if err := checkFunc(test.want, got); err != nil { +// tt.Errorf("error = %v", err) +// } +// +// }) +// } +// } +// +// func TestCorrector_GetBboltAsyncWriteConcurrency(t *testing.T) { +// type fields struct { +// AgentPort int +// AgentName string +// AgentNamespace string +// AgentDNS string +// CreationPoolSize uint32 +// NodeName string +// StreamListConcurrency int +// BboltAsyncWriteConcurrency int +// Discoverer *DiscovererClient +// } +// type want struct { +// want int +// } +// type test struct { +// name string +// fields fields +// want want +// checkFunc func(want, int) error +// beforeFunc func(*testing.T) +// afterFunc func(*testing.T) +// } +// defaultCheckFunc := func(w want, got int) error { +// if !reflect.DeepEqual(got, w.want) { +// return errors.Errorf("got: \"%#v\",\n\t\t\t\twant: \"%#v\"", got, w.want) +// } +// return nil +// } +// tests := []test{ +// // TODO test cases +// /* +// { +// name: "test_case_1", +// fields: fields { +// AgentPort:0, +// AgentName:"", +// AgentNamespace:"", +// AgentDNS:"", +// CreationPoolSize:0, +// NodeName:"", +// StreamListConcurrency:0, +// BboltAsyncWriteConcurrency:0, +// Discoverer:DiscovererClient{}, +// }, +// want: want{}, +// checkFunc: defaultCheckFunc, +// beforeFunc: func(t *testing.T,) { +// t.Helper() +// }, +// afterFunc: func(t *testing.T,) { +// t.Helper() +// }, +// }, +// */ +// +// // TODO test cases +// /* +// func() test { +// return test { +// name: "test_case_2", +// fields: fields { +// 
AgentPort:0, +// AgentName:"", +// AgentNamespace:"", +// AgentDNS:"", +// CreationPoolSize:0, +// NodeName:"", +// StreamListConcurrency:0, +// BboltAsyncWriteConcurrency:0, +// Discoverer:DiscovererClient{}, +// }, +// want: want{}, +// checkFunc: defaultCheckFunc, +// beforeFunc: func(t *testing.T,) { +// t.Helper() +// }, +// afterFunc: func(t *testing.T,) { +// t.Helper() +// }, +// } +// }(), +// */ +// } +// +// for _, tc := range tests { +// test := tc +// t.Run(test.name, func(tt *testing.T) { +// tt.Parallel() +// defer goleak.VerifyNone(tt, goleak.IgnoreCurrent()) +// if test.beforeFunc != nil { +// test.beforeFunc(tt) +// } +// if test.afterFunc != nil { +// defer test.afterFunc(tt) +// } +// checkFunc := test.checkFunc +// if test.checkFunc == nil { +// checkFunc = defaultCheckFunc +// } +// c := &Corrector{ +// AgentPort: test.fields.AgentPort, +// AgentName: test.fields.AgentName, +// AgentNamespace: test.fields.AgentNamespace, +// AgentDNS: test.fields.AgentDNS, +// CreationPoolSize: test.fields.CreationPoolSize, +// NodeName: test.fields.NodeName, +// StreamListConcurrency: test.fields.StreamListConcurrency, +// BboltAsyncWriteConcurrency: test.fields.BboltAsyncWriteConcurrency, +// Discoverer: test.fields.Discoverer, +// } +// +// got := c.GetBboltAsyncWriteConcurrency() +// if err := checkFunc(test.want, got); err != nil { +// tt.Errorf("error = %v", err) +// } +// +// }) +// } +// } diff --git a/pkg/index/job/correction/config/config_test.go b/pkg/index/job/correction/config/config_test.go new file mode 100644 index 0000000000..ac346d089c --- /dev/null +++ b/pkg/index/job/correction/config/config_test.go @@ -0,0 +1,93 @@ +package config + +// NOT IMPLEMENTED BELOW +// +// func TestNewConfig(t *testing.T) { +// type args struct { +// path string +// } +// type want struct { +// wantCfg *Data +// err error +// } +// type test struct { +// name string +// args args +// want want +// checkFunc func(want, *Data, error) error +// beforeFunc 
func(*testing.T, args) +// afterFunc func(*testing.T, args) +// } +// defaultCheckFunc := func(w want, gotCfg *Data, err error) error { +// if !errors.Is(err, w.err) { +// return errors.Errorf("got_error: \"%#v\",\n\t\t\t\twant: \"%#v\"", err, w.err) +// } +// if !reflect.DeepEqual(gotCfg, w.wantCfg) { +// return errors.Errorf("got: \"%#v\",\n\t\t\t\twant: \"%#v\"", gotCfg, w.wantCfg) +// } +// return nil +// } +// tests := []test{ +// // TODO test cases +// /* +// { +// name: "test_case_1", +// args: args { +// path:"", +// }, +// want: want{}, +// checkFunc: defaultCheckFunc, +// beforeFunc: func(t *testing.T, args args) { +// t.Helper() +// }, +// afterFunc: func(t *testing.T, args args) { +// t.Helper() +// }, +// }, +// */ +// +// // TODO test cases +// /* +// func() test { +// return test { +// name: "test_case_2", +// args: args { +// path:"", +// }, +// want: want{}, +// checkFunc: defaultCheckFunc, +// beforeFunc: func(t *testing.T, args args) { +// t.Helper() +// }, +// afterFunc: func(t *testing.T, args args) { +// t.Helper() +// }, +// } +// }(), +// */ +// } +// +// for _, tc := range tests { +// test := tc +// t.Run(test.name, func(tt *testing.T) { +// tt.Parallel() +// defer goleak.VerifyNone(tt, goleak.IgnoreCurrent()) +// if test.beforeFunc != nil { +// test.beforeFunc(tt, test.args) +// } +// if test.afterFunc != nil { +// defer test.afterFunc(tt, test.args) +// } +// checkFunc := test.checkFunc +// if test.checkFunc == nil { +// checkFunc = defaultCheckFunc +// } +// +// gotCfg, err := NewConfig(test.args.path) +// if err := checkFunc(test.want, gotCfg, err); err != nil { +// tt.Errorf("error = %v", err) +// } +// +// }) +// } +// } diff --git a/pkg/index/job/correction/service/corrector_test.go b/pkg/index/job/correction/service/corrector_test.go new file mode 100644 index 0000000000..837611d41f --- /dev/null +++ b/pkg/index/job/correction/service/corrector_test.go @@ -0,0 +1,346 @@ +package service + +// NOT IMPLEMENTED BELOW +// +// func 
TestNew(t *testing.T) { +// type args struct { +// cfg *config.Data +// discoverer discoverer.Client +// } +// type want struct { +// want Corrector +// err error +// } +// type test struct { +// name string +// args args +// want want +// checkFunc func(want, Corrector, error) error +// beforeFunc func(*testing.T, args) +// afterFunc func(*testing.T, args) +// } +// defaultCheckFunc := func(w want, got Corrector, err error) error { +// if !errors.Is(err, w.err) { +// return errors.Errorf("got_error: \"%#v\",\n\t\t\t\twant: \"%#v\"", err, w.err) +// } +// if !reflect.DeepEqual(got, w.want) { +// return errors.Errorf("got: \"%#v\",\n\t\t\t\twant: \"%#v\"", got, w.want) +// } +// return nil +// } +// tests := []test{ +// // TODO test cases +// /* +// { +// name: "test_case_1", +// args: args { +// cfg:nil, +// discoverer:nil, +// }, +// want: want{}, +// checkFunc: defaultCheckFunc, +// beforeFunc: func(t *testing.T, args args) { +// t.Helper() +// }, +// afterFunc: func(t *testing.T, args args) { +// t.Helper() +// }, +// }, +// */ +// +// // TODO test cases +// /* +// func() test { +// return test { +// name: "test_case_2", +// args: args { +// cfg:nil, +// discoverer:nil, +// }, +// want: want{}, +// checkFunc: defaultCheckFunc, +// beforeFunc: func(t *testing.T, args args) { +// t.Helper() +// }, +// afterFunc: func(t *testing.T, args args) { +// t.Helper() +// }, +// } +// }(), +// */ +// } +// +// for _, tc := range tests { +// test := tc +// t.Run(test.name, func(tt *testing.T) { +// tt.Parallel() +// defer goleak.VerifyNone(tt, goleak.IgnoreCurrent()) +// if test.beforeFunc != nil { +// test.beforeFunc(tt, test.args) +// } +// if test.afterFunc != nil { +// defer test.afterFunc(tt, test.args) +// } +// checkFunc := test.checkFunc +// if test.checkFunc == nil { +// checkFunc = defaultCheckFunc +// } +// +// got, err := New(test.args.cfg, test.args.discoverer) +// if err := checkFunc(test.want, got, err); err != nil { +// tt.Errorf("error = %v", err) +// } +// 
+// }) +// } +// } +// +// func Test_correct_Start(t *testing.T) { +// type args struct { +// ctx context.Context +// } +// type fields struct { +// cfg *config.Data +// discoverer discoverer.Client +// agentAddrs []string +// indexInfos sync.Map[string, *payload.Info_Index_Count] +// uuidsCount uint32 +// uncommittedUUIDsCount uint32 +// checkedID bbolt.Bbolt +// } +// type want struct { +// want <-chan error +// err error +// } +// type test struct { +// name string +// args args +// fields fields +// want want +// checkFunc func(want, <-chan error, error) error +// beforeFunc func(*testing.T, args) +// afterFunc func(*testing.T, args) +// } +// defaultCheckFunc := func(w want, got <-chan error, err error) error { +// if !errors.Is(err, w.err) { +// return errors.Errorf("got_error: \"%#v\",\n\t\t\t\twant: \"%#v\"", err, w.err) +// } +// if !reflect.DeepEqual(got, w.want) { +// return errors.Errorf("got: \"%#v\",\n\t\t\t\twant: \"%#v\"", got, w.want) +// } +// return nil +// } +// tests := []test{ +// // TODO test cases +// /* +// { +// name: "test_case_1", +// args: args { +// ctx:nil, +// }, +// fields: fields { +// cfg:nil, +// discoverer:nil, +// agentAddrs:nil, +// indexInfos:nil, +// uuidsCount:0, +// uncommittedUUIDsCount:0, +// checkedID:nil, +// }, +// want: want{}, +// checkFunc: defaultCheckFunc, +// beforeFunc: func(t *testing.T, args args) { +// t.Helper() +// }, +// afterFunc: func(t *testing.T, args args) { +// t.Helper() +// }, +// }, +// */ +// +// // TODO test cases +// /* +// func() test { +// return test { +// name: "test_case_2", +// args: args { +// ctx:nil, +// }, +// fields: fields { +// cfg:nil, +// discoverer:nil, +// agentAddrs:nil, +// indexInfos:nil, +// uuidsCount:0, +// uncommittedUUIDsCount:0, +// checkedID:nil, +// }, +// want: want{}, +// checkFunc: defaultCheckFunc, +// beforeFunc: func(t *testing.T, args args) { +// t.Helper() +// }, +// afterFunc: func(t *testing.T, args args) { +// t.Helper() +// }, +// } +// }(), +// */ +// } 
+// +// for _, tc := range tests { +// test := tc +// t.Run(test.name, func(tt *testing.T) { +// tt.Parallel() +// defer goleak.VerifyNone(tt, goleak.IgnoreCurrent()) +// if test.beforeFunc != nil { +// test.beforeFunc(tt, test.args) +// } +// if test.afterFunc != nil { +// defer test.afterFunc(tt, test.args) +// } +// checkFunc := test.checkFunc +// if test.checkFunc == nil { +// checkFunc = defaultCheckFunc +// } +// c := &correct{ +// cfg: test.fields.cfg, +// discoverer: test.fields.discoverer, +// agentAddrs: test.fields.agentAddrs, +// indexInfos: test.fields.indexInfos, +// uuidsCount: test.fields.uuidsCount, +// uncommittedUUIDsCount: test.fields.uncommittedUUIDsCount, +// checkedID: test.fields.checkedID, +// } +// +// got, err := c.Start(test.args.ctx) +// if err := checkFunc(test.want, got, err); err != nil { +// tt.Errorf("error = %v", err) +// } +// +// }) +// } +// } +// +// func Test_correct_PreStop(t *testing.T) { +// type args struct { +// in0 context.Context +// } +// type fields struct { +// cfg *config.Data +// discoverer discoverer.Client +// agentAddrs []string +// indexInfos sync.Map[string, *payload.Info_Index_Count] +// uuidsCount uint32 +// uncommittedUUIDsCount uint32 +// checkedID bbolt.Bbolt +// } +// type want struct { +// err error +// } +// type test struct { +// name string +// args args +// fields fields +// want want +// checkFunc func(want, error) error +// beforeFunc func(*testing.T, args) +// afterFunc func(*testing.T, args) +// } +// defaultCheckFunc := func(w want, err error) error { +// if !errors.Is(err, w.err) { +// return errors.Errorf("got_error: \"%#v\",\n\t\t\t\twant: \"%#v\"", err, w.err) +// } +// return nil +// } +// tests := []test{ +// // TODO test cases +// /* +// { +// name: "test_case_1", +// args: args { +// in0:nil, +// }, +// fields: fields { +// cfg:nil, +// discoverer:nil, +// agentAddrs:nil, +// indexInfos:nil, +// uuidsCount:0, +// uncommittedUUIDsCount:0, +// checkedID:nil, +// }, +// want: want{}, +// 
checkFunc: defaultCheckFunc, +// beforeFunc: func(t *testing.T, args args) { +// t.Helper() +// }, +// afterFunc: func(t *testing.T, args args) { +// t.Helper() +// }, +// }, +// */ +// +// // TODO test cases +// /* +// func() test { +// return test { +// name: "test_case_2", +// args: args { +// in0:nil, +// }, +// fields: fields { +// cfg:nil, +// discoverer:nil, +// agentAddrs:nil, +// indexInfos:nil, +// uuidsCount:0, +// uncommittedUUIDsCount:0, +// checkedID:nil, +// }, +// want: want{}, +// checkFunc: defaultCheckFunc, +// beforeFunc: func(t *testing.T, args args) { +// t.Helper() +// }, +// afterFunc: func(t *testing.T, args args) { +// t.Helper() +// }, +// } +// }(), +// */ +// } +// +// for _, tc := range tests { +// test := tc +// t.Run(test.name, func(tt *testing.T) { +// tt.Parallel() +// defer goleak.VerifyNone(tt, goleak.IgnoreCurrent()) +// if test.beforeFunc != nil { +// test.beforeFunc(tt, test.args) +// } +// if test.afterFunc != nil { +// defer test.afterFunc(tt, test.args) +// } +// checkFunc := test.checkFunc +// if test.checkFunc == nil { +// checkFunc = defaultCheckFunc +// } +// c := &correct{ +// cfg: test.fields.cfg, +// discoverer: test.fields.discoverer, +// agentAddrs: test.fields.agentAddrs, +// indexInfos: test.fields.indexInfos, +// uuidsCount: test.fields.uuidsCount, +// uncommittedUUIDsCount: test.fields.uncommittedUUIDsCount, +// checkedID: test.fields.checkedID, +// } +// +// err := c.PreStop(test.args.in0) +// if err := checkFunc(test.want, err); err != nil { +// tt.Errorf("error = %v", err) +// } +// +// }) +// } +// } diff --git a/pkg/index/job/correction/usecase/corrector_test.go b/pkg/index/job/correction/usecase/corrector_test.go new file mode 100644 index 0000000000..c4669fa90d --- /dev/null +++ b/pkg/index/job/correction/usecase/corrector_test.go @@ -0,0 +1,93 @@ +package usecase + +// NOT IMPLEMENTED BELOW +// +// func TestNew(t *testing.T) { +// type args struct { +// cfg *config.Data +// } +// type want struct { +// 
wantR runner.Runner +// err error +// } +// type test struct { +// name string +// args args +// want want +// checkFunc func(want, runner.Runner, error) error +// beforeFunc func(*testing.T, args) +// afterFunc func(*testing.T, args) +// } +// defaultCheckFunc := func(w want, gotR runner.Runner, err error) error { +// if !errors.Is(err, w.err) { +// return errors.Errorf("got_error: \"%#v\",\n\t\t\t\twant: \"%#v\"", err, w.err) +// } +// if !reflect.DeepEqual(gotR, w.wantR) { +// return errors.Errorf("got: \"%#v\",\n\t\t\t\twant: \"%#v\"", gotR, w.wantR) +// } +// return nil +// } +// tests := []test{ +// // TODO test cases +// /* +// { +// name: "test_case_1", +// args: args { +// cfg:nil, +// }, +// want: want{}, +// checkFunc: defaultCheckFunc, +// beforeFunc: func(t *testing.T, args args) { +// t.Helper() +// }, +// afterFunc: func(t *testing.T, args args) { +// t.Helper() +// }, +// }, +// */ +// +// // TODO test cases +// /* +// func() test { +// return test { +// name: "test_case_2", +// args: args { +// cfg:nil, +// }, +// want: want{}, +// checkFunc: defaultCheckFunc, +// beforeFunc: func(t *testing.T, args args) { +// t.Helper() +// }, +// afterFunc: func(t *testing.T, args args) { +// t.Helper() +// }, +// } +// }(), +// */ +// } +// +// for _, tc := range tests { +// test := tc +// t.Run(test.name, func(tt *testing.T) { +// tt.Parallel() +// defer goleak.VerifyNone(tt, goleak.IgnoreCurrent()) +// if test.beforeFunc != nil { +// test.beforeFunc(tt, test.args) +// } +// if test.afterFunc != nil { +// defer test.afterFunc(tt, test.args) +// } +// checkFunc := test.checkFunc +// if test.checkFunc == nil { +// checkFunc = defaultCheckFunc +// } +// +// gotR, err := New(test.args.cfg) +// if err := checkFunc(test.want, gotR, err); err != nil { +// tt.Errorf("error = %v", err) +// } +// +// }) +// } +// } From 33ad603a3dd7529f7121383e7859dbbe06afea45 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Mon, 25 Sep 2023 02:24:45 +0000 Subject: [PATCH 079/101] Refactor 
parameters for index correction --- charts/vald/values.yaml | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/charts/vald/values.yaml b/charts/vald/values.yaml index e619964f29..0acfe58e23 100644 --- a/charts/vald/values.yaml +++ b/charts/vald/values.yaml @@ -2633,15 +2633,24 @@ manager: keepalive: 15m #indexer fetches uncommitted index length, which includes huge payload so we need to set keepalive longer than usual # @schema {"name": "manager.index.corrector", "type": "object"} corrector: + # @schema {"name": "manager.index.corrector.enabled", "type": "bool"} + # manager.index.corrector.enabled -- enable index correction CronJob + enabled: false + # @schema {"name": "manager.index.corrector.check_duration", "type": "string"} + # manager.index.corrector.check_duration -- check duration of index correction CronJob + check_duration: 24h + # @schema {"name": "manager.index.corrector.stream_list_concurrency", "type": "integer", "minimum": 1} + # manager.index.corrector.stream_list_concurrency -- concurrency for stream list object rpc + stream_list_concurrency: 200 + # @schema {"name": "manager.index.corrector.bbolt_async_write_concurrency", "type": "integer", "minimum": 1} + # manager.index.corrector.bbolt_async_write_concurrency -- concurrency for bbolt async write + bbolt_async_write_concurrency: 2048 # @schema {"name": "manager.index.corrector.agent_namespace", "type": "string"} # manager.index.corrector.agent_namespace -- namespace of agent pods to manage agent_namespace: _MY_POD_NAMESPACE_ # @schema {"name": "manager.index.corrector.node_name", "type": "string"} # manager.index.corrector.node_name -- node name node_name: "" # _MY_NODE_NAME_ - # @schema {"name": "manager.index.corrector.concurrency", "type": "integer", "minimum": 1} - # manager.index.corrector.concurrency -- concurrency - concurrency: 1 # @schema {"name": "manager.index.corrector.discoverer", "type": "object"} discoverer: # @schema {"name": 
"manager.index.corrector.discoverer.duration", "type": "string"} From 55bc82122658c5e4aa35cea240d1b9a1dc41d748 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Mon, 25 Sep 2023 02:48:28 +0000 Subject: [PATCH 080/101] Refactor config --- internal/config/corrector.go | 3 +++ pkg/index/job/correction/config/config.go | 1 - pkg/index/job/correction/service/corrector.go | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/internal/config/corrector.go b/internal/config/corrector.go index 56424e455f..806f5babeb 100644 --- a/internal/config/corrector.go +++ b/internal/config/corrector.go @@ -43,6 +43,9 @@ type Corrector struct { // BboltAsyncWriteConcurrency represent concurrency for bbolt async write BboltAsyncWriteConcurrency int `json:"bbolt_async_write_concurrency" yaml:"bbolt_async_write_concurrency"` + // IndexReplica represent index replica count. This should be equal to the lb setting + IndexReplica int `json:"index_replica" yaml:"index_replica"` + // Discoverer represent agent discoverer service configuration Discoverer *DiscovererClient `json:"discoverer" yaml:"discoverer"` } diff --git a/pkg/index/job/correction/config/config.go b/pkg/index/job/correction/config/config.go index 90eafa50f8..98e0a8b519 100644 --- a/pkg/index/job/correction/config/config.go +++ b/pkg/index/job/correction/config/config.go @@ -38,7 +38,6 @@ type Data struct { // Indexer represent agent auto indexing service configuration Corrector *config.Corrector `json:"corrector" yaml:"corrector"` - // FIXME: ここから読み込むときLB側の設定とのconsistencyをどう担保するのか // Gateway represent agent gateway service configuration Gateway *config.LB `json:"gateway" yaml:"gateway"` } diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 02eee789a6..c2624258b2 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -378,7 +378,7 @@ func (c *correct) correctReplica( ) error { // diff < 0 means there 
is less replica than the correct number existReplica := len(foundReplicas) + 1 - diff := existReplica - c.cfg.Gateway.IndexReplica + diff := existReplica - c.cfg.Corrector.IndexReplica if diff == 0 { // replica number is correct return nil From 004bf815e5637227c6d149ded96f4799c05f2c92 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Mon, 25 Sep 2023 08:51:22 +0000 Subject: [PATCH 081/101] Add corrector test --- internal/errors/corrector.go | 2 + internal/net/grpc/context.go | 11 +- internal/test/mock/grpc_testify_mock.go | 113 +++++ pkg/index/job/correction/service/corrector.go | 17 +- .../job/correction/service/corrector_test.go | 447 ++++++++++++++++++ 5 files changed, 579 insertions(+), 11 deletions(-) diff --git a/internal/errors/corrector.go b/internal/errors/corrector.go index 9b3aeb0efe..ab0f03b844 100644 --- a/internal/errors/corrector.go +++ b/internal/errors/corrector.go @@ -20,3 +20,5 @@ package errors var ErrIndexReplicaOne = New("nothing to correct when index replica is 1") var ErrNoAvailableAgentToInsert = New("no available agent to insert replica") + +var ErrFailedToCorrectReplicaNum = New("failed to correct replica number after correction process") diff --git a/internal/net/grpc/context.go b/internal/net/grpc/context.go index a95bfca402..e3077fb67c 100644 --- a/internal/net/grpc/context.go +++ b/internal/net/grpc/context.go @@ -20,28 +20,29 @@ import ( type contextKey string -const grpcMethodContextKey contextKey = "grpc_method" +// exported only for testing +const GrpcMethodContextKey contextKey = "grpc_method" // WrapGRPCMethod returns a copy of parent in which the method associated with key (grpcMethodContextKey). 
func WrapGRPCMethod(ctx context.Context, method string) context.Context { m := FromGRPCMethod(ctx) if m == "" { - return context.WithValue(ctx, grpcMethodContextKey, method) + return context.WithValue(ctx, GrpcMethodContextKey, method) } if strings.HasSuffix(m, method) { return ctx } - return context.WithValue(ctx, grpcMethodContextKey, m+"/"+method) + return context.WithValue(ctx, GrpcMethodContextKey, m+"/"+method) } // WithGRPCMethod returns a copy of parent in which the method associated with key (grpcMethodContextKey). func WithGRPCMethod(ctx context.Context, method string) context.Context { - return context.WithValue(ctx, grpcMethodContextKey, method) + return context.WithValue(ctx, GrpcMethodContextKey, method) } // FromGRPCMethod returns the value associated with this context for key (grpcMethodContextKey). func FromGRPCMethod(ctx context.Context) string { - if v := ctx.Value(grpcMethodContextKey); v != nil { + if v := ctx.Value(GrpcMethodContextKey); v != nil { if method, ok := v.(string); ok { return method } diff --git a/internal/test/mock/grpc_testify_mock.go b/internal/test/mock/grpc_testify_mock.go index 5fcd6a41fa..641962594b 100644 --- a/internal/test/mock/grpc_testify_mock.go +++ b/internal/test/mock/grpc_testify_mock.go @@ -18,6 +18,9 @@ import ( "github.com/stretchr/testify/mock" "github.com/vdaas/vald/apis/grpc/v1/payload" + "github.com/vdaas/vald/internal/backoff" + "github.com/vdaas/vald/internal/net/grpc/pool" + "google.golang.org/grpc" "google.golang.org/grpc/metadata" ) @@ -63,3 +66,113 @@ func (losm *ListObjectStreamMock) Send(res *payload.Object_List_Response) error args := losm.Called(res) return args.Error(0) } + +type ClientInternal struct { + mock.Mock +} + +type ( + CallOption = grpc.CallOption + DialOption = pool.DialOption + ClientConn = pool.ClientConn +) + +func (c *ClientInternal) StartConnectionMonitor(ctx context.Context) (<-chan error, error) { + args := c.Called(ctx) + return args.Get(0).(<-chan error), args.Error(1) +} + 
+func (c *ClientInternal) Connect(ctx context.Context, addr string, dopts ...DialOption) (pool.Conn, error) { + args := c.Called(ctx, addr, dopts) + return args.Get(0).(pool.Conn), args.Error(1) +} + +func (c *ClientInternal) IsConnected(ctx context.Context, addr string) bool { + args := c.Called(ctx, addr) + return args.Bool(0) +} + +func (c *ClientInternal) Disconnect(ctx context.Context, addr string) error { + args := c.Called(ctx, addr) + return args.Error(0) +} + +func (c *ClientInternal) Range(ctx context.Context, + f func(ctx context.Context, + addr string, + conn *ClientConn, + copts ...CallOption) error) error { + args := c.Called(ctx, f) + return args.Error(0) +} + +func (c *ClientInternal) RangeConcurrent(ctx context.Context, + concurrency int, + f func(ctx context.Context, + addr string, + conn *ClientConn, + copts ...CallOption) error) error { + args := c.Called(ctx, concurrency, f) + return args.Error(0) +} + +func (c *ClientInternal) OrderedRange(ctx context.Context, + order []string, + f func(ctx context.Context, + addr string, + conn *ClientConn, + copts ...CallOption) error) error { + args := c.Called(ctx, order, f) + return args.Error(0) +} + +func (c *ClientInternal) OrderedRangeConcurrent(ctx context.Context, + order []string, + concurrency int, + f func(ctx context.Context, + addr string, + conn *ClientConn, + copts ...CallOption) error) error { + args := c.Called(ctx, order, concurrency, f) + return args.Error(0) +} + +func (c *ClientInternal) Do(ctx context.Context, addr string, + f func(ctx context.Context, + conn *ClientConn, + copts ...CallOption) (interface{}, error)) (interface{}, error) { + args := c.Called(ctx, addr, f) + return args.Get(0), args.Error(1) +} + +func (c *ClientInternal) RoundRobin(ctx context.Context, f func(ctx context.Context, + conn *ClientConn, + copts ...CallOption) (interface{}, error)) (interface{}, error) { + args := c.Called(ctx, f) + return args.Get(0), args.Error(1) +} + +func (c *ClientInternal) 
GetDialOption() []DialOption { + args := c.Called() + return args.Get(0).([]DialOption) +} + +func (c *ClientInternal) GetCallOption() []CallOption { + args := c.Called() + return args.Get(0).([]CallOption) +} + +func (c *ClientInternal) GetBackoff() backoff.Backoff { + args := c.Called() + return args.Get(0).(backoff.Backoff) +} + +func (c *ClientInternal) ConnectedAddrs() []string { + args := c.Called() + return args.Get(0).([]string) +} + +func (c *ClientInternal) Close(ctx context.Context) error { + args := c.Called(ctx) + return args.Error(0) +} diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index c2624258b2..e57bdcaf0e 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -39,6 +39,12 @@ import ( "github.com/vdaas/vald/pkg/index/job/correction/config" ) +const ( + insertMethod = "core.v1.Vald/Insert" + updateMethod = "core.v1.Vald/Update" + deleteMethod = "core.v1.Vald/Delete" +) + type Corrector interface { Start(ctx context.Context) (<-chan error, error) PreStop(ctx context.Context) error @@ -385,7 +391,6 @@ func (c *correct) correctReplica( } // when there are less replicas than the correct number, add the extra replicas - // TODO: refine this logic. pretty complicated if diff < 0 { log.Infof("replica shortage of vector %s. 
inserting to other agents...", targetReplica.vec.GetId()) if len(availableAddrs) == 0 { @@ -404,7 +409,7 @@ func (c *correct) correctReplica( } if diff < 0 { - return fmt.Errorf("failed to insert the sufficient amount of index to meet the replica setting") + return errors.ErrFailedToCorrectReplicaNum } return nil @@ -433,7 +438,7 @@ func (c *correct) correctReplica( } if diff > 0 { - return fmt.Errorf("failed to delete the sufficient amount of index to meet the replica setting") + return errors.ErrFailedToCorrectReplicaNum } return nil @@ -441,7 +446,7 @@ func (c *correct) correctReplica( func (c *correct) updateObject(ctx context.Context, addr string, vector *payload.Object_Vector) error { res, err := c.discoverer.GetClient(). - Do(grpc.WithGRPCMethod(ctx, "core.v1.Vald/Update"), addr, func(ctx context.Context, conn *grpc.ClientConn, copts ...grpc.CallOption) (interface{}, error) { + Do(grpc.WithGRPCMethod(ctx, updateMethod), addr, func(ctx context.Context, conn *grpc.ClientConn, copts ...grpc.CallOption) (interface{}, error) { // TODO: use UpdateTimestamp when it's implemented because here we just want to update only the timestamp but not the vector return vald.NewUpdateClient(conn).Update(ctx, &payload.Update_Request{ Vector: vector, @@ -467,7 +472,7 @@ func (c *correct) updateObject(ctx context.Context, addr string, vector *payload func (c *correct) insertObject(ctx context.Context, addr string, vector *payload.Object_Vector) error { res, err := c.discoverer.GetClient(). 
- Do(grpc.WithGRPCMethod(ctx, "core.v1.Vald/Insert"), addr, func(ctx context.Context, conn *grpc.ClientConn, copts ...grpc.CallOption) (interface{}, error) { + Do(grpc.WithGRPCMethod(ctx, insertMethod), addr, func(ctx context.Context, conn *grpc.ClientConn, copts ...grpc.CallOption) (interface{}, error) { return vald.NewInsertClient(conn).Insert(ctx, &payload.Insert_Request{ Vector: vector, // FIXME: this should be deleted after Config.Timestamp deprecation @@ -489,7 +494,7 @@ func (c *correct) insertObject(ctx context.Context, addr string, vector *payload func (c *correct) deleteObject(ctx context.Context, addr string, vector *payload.Object_Vector) error { res, err := c.discoverer.GetClient(). - Do(grpc.WithGRPCMethod(ctx, "core.v1.Vald/Delete"), addr, func(ctx context.Context, conn *grpc.ClientConn, copts ...grpc.CallOption) (interface{}, error) { + Do(grpc.WithGRPCMethod(ctx, deleteMethod), addr, func(ctx context.Context, conn *grpc.ClientConn, copts ...grpc.CallOption) (interface{}, error) { return vald.NewRemoveClient(conn).Remove(ctx, &payload.Remove_Request{ Id: &payload.Object_ID{ Id: vector.GetId(), diff --git a/pkg/index/job/correction/service/corrector_test.go b/pkg/index/job/correction/service/corrector_test.go index 837611d41f..c0f5e5c605 100644 --- a/pkg/index/job/correction/service/corrector_test.go +++ b/pkg/index/job/correction/service/corrector_test.go @@ -1,5 +1,452 @@ package service +import ( + "context" + "testing" + + tmock "github.com/stretchr/testify/mock" + "github.com/stretchr/testify/require" + "github.com/vdaas/vald/apis/grpc/v1/payload" + iconfig "github.com/vdaas/vald/internal/config" + "github.com/vdaas/vald/internal/errors" + "github.com/vdaas/vald/internal/net/grpc" + "github.com/vdaas/vald/internal/test/mock" + "github.com/vdaas/vald/pkg/index/job/correction/config" +) + +type mockDiscovererClient struct { + client mock.ClientInternal +} + +func (*mockDiscovererClient) Start(ctx context.Context) (<-chan error, error) { + return 
nil, nil +} + +func (*mockDiscovererClient) GetAddrs(ctx context.Context) []string { + return nil +} + +func (m *mockDiscovererClient) GetClient() grpc.Client { + return &m.client +} + +func Test_correct_correctTimestamp(t *testing.T) { + t.Parallel() + + // This mock just returns nil and record args inside + m := mockDiscovererClient{} + m.client.On("Do", tmock.Anything, tmock.Anything, tmock.Anything).Return(nil, nil) + c := &correct{ + discoverer: &m, + } + + type args struct { + target *vectorReplica + found []*vectorReplica + } + + type want struct { + addrs []string + err error + } + + type test struct { + name string + args args + want want + } + + tests := []test{ + { + name: "nothing happens when no replica is found", + args: args{ + target: &vectorReplica{ + addr: "target", + vec: &payload.Object_Vector{ + Id: "target", + Timestamp: 100, + }, + }, + found: []*vectorReplica{}, + }, + want: want{ + addrs: nil, + err: nil, + }, + }, + { + name: "updates one found vec when found vecs are older than target", + args: args{ + target: &vectorReplica{ + addr: "target", + vec: &payload.Object_Vector{ + Id: "target", + Timestamp: 100, + }, + }, + found: []*vectorReplica{ + { + addr: "found", + vec: &payload.Object_Vector{ + Id: "found", + Timestamp: 99, + }, + }, + }, + }, + want: want{ + addrs: []string{"found"}, + err: nil, + }, + }, + { + name: "updates multiple found vecs when found vecs are older than target", + args: args{ + target: &vectorReplica{ + addr: "target", + vec: &payload.Object_Vector{ + Id: "target", + Timestamp: 100, + }, + }, + found: []*vectorReplica{ + { + addr: "found1", + vec: &payload.Object_Vector{ + Id: "found", + Timestamp: 99, + }, + }, + { + addr: "found2", + vec: &payload.Object_Vector{ + Id: "found", + Timestamp: 98, + }, + }, + }, + }, + want: want{ + addrs: []string{"found1", "found2"}, + err: nil, + }, + }, + { + name: "updates target vec when found vecs are newer than target", + args: args{ + target: &vectorReplica{ + addr: 
"target", + vec: &payload.Object_Vector{ + Id: "target", + Timestamp: 0, + }, + }, + found: []*vectorReplica{ + { + addr: "found1", + vec: &payload.Object_Vector{ + Id: "found", + Timestamp: 99, + }, + }, + }, + }, + want: want{ + addrs: []string{"target"}, + err: nil, + }, + }, + { + name: "updates target vec and one of found vecs with the latest found vec", + args: args{ + target: &vectorReplica{ + addr: "target", + vec: &payload.Object_Vector{ + Id: "target", + Timestamp: 0, + }, + }, + found: []*vectorReplica{ + { + addr: "found1", + vec: &payload.Object_Vector{ + Id: "found", + Timestamp: 99, + }, + }, + { + addr: "latest", + vec: &payload.Object_Vector{ + Id: "found", + Timestamp: 100, + }, + }, + }, + }, + want: want{ + addrs: []string{"target", "found1"}, + err: nil, + }, + }, + } + + for _, tc := range tests { + test := tc + t.Run(test.name, func(tt *testing.T) { + tt.Parallel() + err := c.correctTimestamp(context.Background(), test.args.target, test.args.found) + require.Equal(tt, test.want.err, err) + + for _, addr := range test.want.addrs { + // check if the agents which need to be corrected are called + // checking calling parameter, like timestamp, is impossible because its inside of the function arg + m.client.AssertCalled(tt, "Do", tmock.Anything, addr, tmock.Anything) + } + }) + } +} + +func Test_correct_correctReplica(t *testing.T) { + t.Parallel() + + // This mock just returns nil and record args inside + m := mockDiscovererClient{} + m.client.On("Do", tmock.Anything, tmock.Anything, tmock.Anything).Return(nil, nil) + + type args struct { + indexReplica int + target *vectorReplica + found []*vectorReplica + availableAddrs []string + } + + type addrMethod struct { + addr string + method string + } + + type want struct { + addrMethods []addrMethod + err error + } + + type test struct { + name string + args args + want want + } + + tests := []test{ + { + name: "nothing happens when replica number sutisfies", + args: args{ + indexReplica: 2, + 
target: &vectorReplica{ + addr: "target", + vec: &payload.Object_Vector{ + Id: "target", + }, + }, + found: []*vectorReplica{ + { + addr: "found", + vec: &payload.Object_Vector{ + Id: "found", + }, + }, + }, + availableAddrs: []string{}, + }, + want: want{ + addrMethods: nil, + err: nil, + }, + }, + { + name: "insert replica when replica number is not enough", + args: args{ + indexReplica: 2, + target: &vectorReplica{ + addr: "target", + vec: &payload.Object_Vector{ + Id: "target", + }, + }, + found: []*vectorReplica{}, + availableAddrs: []string{"available"}, + }, + want: want{ + addrMethods: []addrMethod{ + { + addr: "available", + method: insertMethod, + }, + }, + err: nil, + }, + }, + { + name: "insert replica to the agent with most memory available", + args: args{ + indexReplica: 2, + target: &vectorReplica{ + addr: "target", + vec: &payload.Object_Vector{ + Id: "target", + }, + }, + found: []*vectorReplica{}, + // this is supposed to be sorted by memory usage with descending order + availableAddrs: []string{"most memory used", "second memory used"}, + }, + want: want{ + addrMethods: []addrMethod{ + { + addr: "second memory used", + method: insertMethod, + }, + }, + err: nil, + }, + }, + { + name: "delete replica from myself when replica number is too much by one", + args: args{ + indexReplica: 2, + target: &vectorReplica{ + addr: "target", + vec: &payload.Object_Vector{ + Id: "target", + }, + }, + found: []*vectorReplica{ + { + addr: "found1", + }, + { + addr: "found2", + }, + }, + availableAddrs: []string{}, + }, + want: want{ + addrMethods: []addrMethod{ + { + addr: "target", + method: deleteMethod, + }, + }, + err: nil, + }, + }, + { + name: "delete replica from myself and most memory used agent when replica number is too much by more than one", + args: args{ + indexReplica: 2, + target: &vectorReplica{ + addr: "target", + vec: &payload.Object_Vector{ + Id: "target", + }, + }, + found: []*vectorReplica{ + { + addr: "found1", + }, + { + addr: "found2", + }, 
+ { + addr: "found3", + }, + }, + availableAddrs: []string{}, + }, + want: want{ + addrMethods: []addrMethod{ + { + addr: "target", + method: deleteMethod, + }, + { + addr: "found1", + method: deleteMethod, + }, + }, + err: nil, + }, + }, + { + name: "return ErrNoAvailableAgentToInsert when availableAddrs is empty when insertion required", + args: args{ + indexReplica: 2, + target: &vectorReplica{ + addr: "target", + vec: &payload.Object_Vector{ + Id: "target", + }, + }, + found: []*vectorReplica{}, + availableAddrs: []string{}, + }, + want: want{ + addrMethods: nil, + err: errors.ErrNoAvailableAgentToInsert, + }, + }, + { + name: "return ErrFailedToCorrectReplicaNum when there is not enough number of availableAddrs", + args: args{ + indexReplica: 3, + target: &vectorReplica{ + addr: "target", + vec: &payload.Object_Vector{ + Id: "target", + }, + }, + found: []*vectorReplica{}, + availableAddrs: []string{"available"}, + }, + want: want{ + addrMethods: nil, + err: errors.ErrFailedToCorrectReplicaNum, + }, + }, + } + + for _, tc := range tests { + test := tc + c := &correct{ + discoverer: &m, + cfg: &config.Data{ + Corrector: &iconfig.Corrector{ + IndexReplica: test.args.indexReplica, + }, + }, + } + t.Run(test.name, func(tt *testing.T) { + tt.Parallel() + err := c.correctReplica(context.Background(), test.args.target, test.args.found, test.args.availableAddrs) + if test.want.err != nil { + require.ErrorIs(t, test.want.err, err) + } + + for _, am := range test.want.addrMethods { + // check if the agents which need to be corrected are called with the required method + // checking calling parameter, like timestamp, is impossible because its inside of the function arg + m.client.AssertCalled(tt, "Do", tmock.MatchedBy(func(ctx context.Context) bool { + method := ctx.Value(grpc.GrpcMethodContextKey) + val, ok := method.(string) + if !ok { + return false + } + return val == am.method + }), am.addr, tmock.Anything) + } + }) + } +} + // NOT IMPLEMENTED BELOW // // func 
TestNew(t *testing.T) { From 97ff3c021360ea711916a605140bb18a11ee1562 Mon Sep 17 00:00:00 2001 From: "deepsource-autofix[bot]" <62050782+deepsource-autofix[bot]@users.noreply.github.com> Date: Mon, 25 Sep 2023 08:51:31 +0000 Subject: [PATCH 082/101] style: format code with Prettier and Gofumpt This commit fixes the style issues introduced in 004bf81 according to the output from Prettier and Gofumpt. Details: https://github.com/vdaas/vald/pull/2194 --- internal/test/mock/grpc_testify_mock.go | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/internal/test/mock/grpc_testify_mock.go b/internal/test/mock/grpc_testify_mock.go index 641962594b..2a5c75535f 100644 --- a/internal/test/mock/grpc_testify_mock.go +++ b/internal/test/mock/grpc_testify_mock.go @@ -101,7 +101,8 @@ func (c *ClientInternal) Range(ctx context.Context, f func(ctx context.Context, addr string, conn *ClientConn, - copts ...CallOption) error) error { + copts ...CallOption) error, +) error { args := c.Called(ctx, f) return args.Error(0) } @@ -111,7 +112,8 @@ func (c *ClientInternal) RangeConcurrent(ctx context.Context, f func(ctx context.Context, addr string, conn *ClientConn, - copts ...CallOption) error) error { + copts ...CallOption) error, +) error { args := c.Called(ctx, concurrency, f) return args.Error(0) } @@ -121,7 +123,8 @@ func (c *ClientInternal) OrderedRange(ctx context.Context, f func(ctx context.Context, addr string, conn *ClientConn, - copts ...CallOption) error) error { + copts ...CallOption) error, +) error { args := c.Called(ctx, order, f) return args.Error(0) } @@ -132,7 +135,8 @@ func (c *ClientInternal) OrderedRangeConcurrent(ctx context.Context, f func(ctx context.Context, addr string, conn *ClientConn, - copts ...CallOption) error) error { + copts ...CallOption) error, +) error { args := c.Called(ctx, order, concurrency, f) return args.Error(0) } @@ -140,14 +144,16 @@ func (c *ClientInternal) OrderedRangeConcurrent(ctx context.Context, func (c 
*ClientInternal) Do(ctx context.Context, addr string, f func(ctx context.Context, conn *ClientConn, - copts ...CallOption) (interface{}, error)) (interface{}, error) { + copts ...CallOption) (interface{}, error), +) (interface{}, error) { args := c.Called(ctx, addr, f) return args.Get(0), args.Error(1) } func (c *ClientInternal) RoundRobin(ctx context.Context, f func(ctx context.Context, conn *ClientConn, - copts ...CallOption) (interface{}, error)) (interface{}, error) { + copts ...CallOption) (interface{}, error), +) (interface{}, error) { args := c.Called(ctx, f) return args.Get(0), args.Error(1) } From 8b0540dd76e19748e76db21e51c4e3c6e1f4cd5e Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 26 Sep 2023 02:22:15 +0000 Subject: [PATCH 083/101] Add timestamp check --- pkg/index/job/correction/service/corrector.go | 76 ++++++++++++++++--- 1 file changed, 64 insertions(+), 12 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index e57bdcaf0e..faa5b19d3a 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -22,6 +22,7 @@ import ( "path/filepath" "slices" "sync/atomic" + "time" agent "github.com/vdaas/vald/apis/grpc/v1/agent/core" "github.com/vdaas/vald/apis/grpc/v1/payload" @@ -39,10 +40,13 @@ import ( "github.com/vdaas/vald/pkg/index/job/correction/config" ) +type contextTimeKey string + const ( - insertMethod = "core.v1.Vald/Insert" - updateMethod = "core.v1.Vald/Update" - deleteMethod = "core.v1.Vald/Delete" + insertMethod = "core.v1.Vald/Insert" + updateMethod = "core.v1.Vald/Update" + deleteMethod = "core.v1.Vald/Delete" + correctionStartTimeKey contextTimeKey = "correctionStartTimeKey" ) type Corrector interface { @@ -79,6 +83,9 @@ func New(cfg *config.Data, discoverer discoverer.Client) (Corrector, error) { } func (c *correct) Start(ctx context.Context) (<-chan error, error) { + // set current time to context + ctx = 
embedTime(ctx) + dech, err := c.discoverer.Start(ctx) if err != nil { return nil, err @@ -135,6 +142,13 @@ func (c *correct) correct(ctx context.Context) (err error) { return fmt.Errorf("failed to copy agentAddrs") } + // Vector with time after this should not be processed + correctionStartTime, err := getCorrectionStartTime(ctx) + if err != nil { + log.Errorf("cannot determine correction start time: %w", err) + return err + } + if err := c.discoverer.GetClient().OrderedRange(ctx, c.agentAddrs, func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { // current address is the leftAgentAddrs[0] because this is OrderedRange and @@ -182,7 +196,6 @@ func (c *correct) correct(ctx context.Context) (err error) { mu.Unlock() if errors.Is(err, io.EOF) { - log.Debugf("StreamListObject stream finished for agent %s", addr) scancel() return nil } @@ -191,17 +204,26 @@ func (c *correct) correct(ctx context.Context) (err error) { return err } - if res.GetVector() == nil { + vec := res.GetVector() + if vec == nil { st := res.GetStatus() log.Error(st.GetCode(), st.GetMessage(), st.GetDetails()) // continue return nil } - log.Debugf("received object in StreamListObject: agent(%s), id(%s), timestamp(%v)", addr, res.GetVector().GetId(), res.GetVector().GetTimestamp()) + // skip if the vector is inserted after correction start + if vec.GetTimestamp() > correctionStartTime.UnixNano() { + log.Debugf("timestamp of vector(id: %s, timestamp: %v) is newer than correction start time(%v). 
skipping...", + vec.GetId(), + vec.GetTimestamp(), + correctionStartTime.UnixNano(), + ) + return nil + } // check if the index is already checked - id := res.GetVector().GetId() + id := vec.GetId() _, ok, err := c.checkedID.Get([]byte(id)) if err != nil { log.Errorf("failed to perform Get from bbolt: %v", err) @@ -215,7 +237,7 @@ func (c *correct) correct(ctx context.Context) (err error) { ctx, &vectorReplica{ addr: addr, - vec: res.GetVector(), + vec: vec, }, leftAgentAddrs, ); err != nil { @@ -271,6 +293,13 @@ func (c *correct) checkConsistency(ctx context.Context, targetReplica *vectorRep } } + // Vector with time after this should not be processed + correctionStartTime, err := getCorrectionStartTime(ctx) + if err != nil { + log.Errorf("cannot determine correction start time: %w", err) + return err + } + foundReplicas := make([]*vectorReplica, 0, len(availableAddrs)) var mu sync.Mutex if err := c.discoverer.GetClient().OrderedRangeConcurrent(ctx, leftAgentAddrs, len(leftAgentAddrs), @@ -286,7 +315,7 @@ func (c *correct) checkConsistency(ctx context.Context, targetReplica *vectorRep default: } vc := vald.NewValdClient(conn) - v, err := vc.GetObject(ctx, &payload.Object_VectorRequest{ + vec, err := vc.GetObject(ctx, &payload.Object_VectorRequest{ Id: &payload.Object_ID{ Id: targetReplica.vec.GetId(), }, @@ -304,13 +333,20 @@ func (c *correct) checkConsistency(ctx context.Context, targetReplica *vectorRep } } - // the target replica is found in this agent with the addr - log.Debugf("object found: agent(%s), id(%v), timestamp(%v)", addr, v.GetId(), v.GetTimestamp()) + // skip if the vector is inserted after correction start + if vec.GetTimestamp() > correctionStartTime.UnixNano() { + log.Debugf("timestamp of vector(id: %s, timestamp: %v) is newer than correction start time(%v). 
skipping...", + vec.GetId(), + vec.GetTimestamp(), + correctionStartTime.UnixNano(), + ) + return nil + } mu.Lock() foundReplicas = append(foundReplicas, &vectorReplica{ addr: addr, - vec: v, + vec: vec, }) // Remove this addr from availableAddrs because this addr has the target replica @@ -554,3 +590,19 @@ func (c *correct) loadInfos(ctx context.Context) (err error) { }) return nil } + +func embedTime(ctx context.Context) context.Context { + v := ctx.Value(correctionStartTimeKey) + if _, ok := v.(time.Time); ok { + return ctx + } + return context.WithValue(ctx, correctionStartTimeKey, time.Now()) +} + +func getCorrectionStartTime(ctx context.Context) (time.Time, error) { + v := ctx.Value(correctionStartTimeKey) + if t, ok := v.(time.Time); ok { + return t, nil + } + return time.Time{}, fmt.Errorf("timeKey is not embeded in context") +} From 5430f6762fb91f32caf13160f24a1c42aa7b33c9 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 26 Sep 2023 02:27:24 +0000 Subject: [PATCH 084/101] Apply format --- .github/workflows/dockers-index-job-correction.yml | 2 +- cmd/index/job/correction/sample.yaml | 2 +- dockers/index/job/correction/Dockerfile | 2 +- internal/config/corrector_test.go | 13 +++++++++++++ pkg/index/job/correction/config/config_test.go | 13 +++++++++++++ pkg/index/job/correction/service/corrector_test.go | 13 +++++++++++++ pkg/index/job/correction/usecase/corrector_test.go | 13 +++++++++++++ 7 files changed, 55 insertions(+), 3 deletions(-) diff --git a/.github/workflows/dockers-index-job-correction.yml b/.github/workflows/dockers-index-job-correction.yml index 7e6742d247..624b3e5cbe 100644 --- a/.github/workflows/dockers-index-job-correction.yml +++ b/.github/workflows/dockers-index-job-correction.yml @@ -2,7 +2,7 @@ # Copyright (C) 2019-2023 vdaas.org vald team # # Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
+# You may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 diff --git a/cmd/index/job/correction/sample.yaml b/cmd/index/job/correction/sample.yaml index 7b2eaf263c..09ad7dc5ca 100644 --- a/cmd/index/job/correction/sample.yaml +++ b/cmd/index/job/correction/sample.yaml @@ -2,7 +2,7 @@ # Copyright (C) 2019-2023 vdaas.org vald team # # Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. +# You may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 diff --git a/dockers/index/job/correction/Dockerfile b/dockers/index/job/correction/Dockerfile index 0e8f717e7a..a065c2cb40 100644 --- a/dockers/index/job/correction/Dockerfile +++ b/dockers/index/job/correction/Dockerfile @@ -2,7 +2,7 @@ # Copyright (C) 2019-2023 vdaas.org vald team # # Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. +# You may not use this file except in compliance with the License. # You may obtain a copy of the License at # # https://www.apache.org/licenses/LICENSE-2.0 diff --git a/internal/config/corrector_test.go b/internal/config/corrector_test.go index 8cd0119742..a66af0181a 100644 --- a/internal/config/corrector_test.go +++ b/internal/config/corrector_test.go @@ -1,3 +1,16 @@ +// Copyright (C) 2019-2023 vdaas.org vald team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// You may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. package config // NOT IMPLEMENTED BELOW diff --git a/pkg/index/job/correction/config/config_test.go b/pkg/index/job/correction/config/config_test.go index ac346d089c..0cf6858bf7 100644 --- a/pkg/index/job/correction/config/config_test.go +++ b/pkg/index/job/correction/config/config_test.go @@ -1,3 +1,16 @@ +// Copyright (C) 2019-2023 vdaas.org vald team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// You may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. package config // NOT IMPLEMENTED BELOW diff --git a/pkg/index/job/correction/service/corrector_test.go b/pkg/index/job/correction/service/corrector_test.go index c0f5e5c605..5083eadf10 100644 --- a/pkg/index/job/correction/service/corrector_test.go +++ b/pkg/index/job/correction/service/corrector_test.go @@ -1,3 +1,16 @@ +// Copyright (C) 2019-2023 vdaas.org vald team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// You may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. package service import ( diff --git a/pkg/index/job/correction/usecase/corrector_test.go b/pkg/index/job/correction/usecase/corrector_test.go index c4669fa90d..c8759d29e4 100644 --- a/pkg/index/job/correction/usecase/corrector_test.go +++ b/pkg/index/job/correction/usecase/corrector_test.go @@ -1,3 +1,16 @@ +// Copyright (C) 2019-2023 vdaas.org vald team +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// You may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
package usecase // NOT IMPLEMENTED BELOW From bdc9c70b90a19b744ccfbab53e245a82a51fb57b Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 26 Sep 2023 02:32:25 +0000 Subject: [PATCH 085/101] fix schema type --- charts/vald/values.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/charts/vald/values.yaml b/charts/vald/values.yaml index 0acfe58e23..0cc8c02bbb 100644 --- a/charts/vald/values.yaml +++ b/charts/vald/values.yaml @@ -2633,7 +2633,7 @@ manager: keepalive: 15m #indexer fetches uncommitted index length, which includes huge payload so we need to set keepalive longer than usual # @schema {"name": "manager.index.corrector", "type": "object"} corrector: - # @schema {"name": "manager.index.corrector.enabled", "type": "bool"} + # @schema {"name": "manager.index.corrector.enabled", "type": "boolean"} # manager.index.corrector.enabled -- enable index correction CronJob enabled: false # @schema {"name": "manager.index.corrector.check_duration", "type": "string"} From f7ffd6a125eea956a20a4401072290f54c567ee5 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 26 Sep 2023 04:06:33 +0000 Subject: [PATCH 086/101] Fix DeepSource errors --- internal/config/corrector.go | 2 +- internal/net/grpc/context.go | 3 ++- internal/servers/server/server.go | 10 +++++++--- pkg/index/job/correction/service/corrector.go | 7 +++---- pkg/index/job/correction/service/corrector_test.go | 4 ++-- pkg/index/job/correction/usecase/corrector.go | 3 ++- 6 files changed, 17 insertions(+), 12 deletions(-) diff --git a/internal/config/corrector.go b/internal/config/corrector.go index 806f5babeb..c29f508472 100644 --- a/internal/config/corrector.go +++ b/internal/config/corrector.go @@ -72,7 +72,7 @@ func (c *Corrector) GetStreamListConcurrency() int { return 200 //nolint:gomnd } -// Returns 2048 when not specified since not setting this could use up all the available momory +// GetBboltAsyncWriteConcurrency returns 2048 when not specified since not setting this could use up all the 
available momory func (c *Corrector) GetBboltAsyncWriteConcurrency() int { if c != nil { return c.BboltAsyncWriteConcurrency diff --git a/internal/net/grpc/context.go b/internal/net/grpc/context.go index e3077fb67c..2e06fb7897 100644 --- a/internal/net/grpc/context.go +++ b/internal/net/grpc/context.go @@ -20,7 +20,8 @@ import ( type contextKey string -// exported only for testing +// GrpcMethodContextKey represents a context key for grpc method. +// This is exported only for testing. const GrpcMethodContextKey contextKey = "grpc_method" // WrapGRPCMethod returns a copy of parent in which the method associated with key (grpcMethodContextKey). diff --git a/internal/servers/server/server.go b/internal/servers/server/server.go index 9e70fb9512..709f5c973e 100644 --- a/internal/servers/server/server.go +++ b/internal/servers/server/server.go @@ -22,6 +22,7 @@ import ( "crypto/tls" "net/http" "os" + "path/filepath" "reflect" "strconv" "syscall" @@ -128,6 +129,7 @@ type grpcKeepalive struct { permitWithoutStream bool } +// skipcq: GO-R1005 func New(opts ...Option) (Server, error) { srv := new(server) @@ -253,6 +255,7 @@ func (s *server) Name() string { return s.name } +// skipcq: GO-R1005 func (s *server) ListenAndServe(ctx context.Context, ech chan<- error) (err error) { if !s.IsRunning() { s.mu.Lock() @@ -274,8 +277,8 @@ func (s *server) ListenAndServe(ctx context.Context, ech chan<- error) (err erro return s.network.String() }(), func() string { if s.network == net.UNIX { - if len(s.socketPath) == 0 { - s.socketPath = os.TempDir() + string(os.PathSeparator) + s.name + "." 
+ strconv.Itoa(os.Getpid()) + ".sock" + if s.socketPath == "" { + s.socketPath = filepath.Join(os.TempDir(), string(os.PathSeparator), s.name, ".", strconv.Itoa(os.Getpid()), ".sock") } return s.socketPath } @@ -339,6 +342,7 @@ func (s *server) ListenAndServe(ctx context.Context, ech chan<- error) (err erro return nil } +// skipcq: GO-R1005 func (s *server) Shutdown(ctx context.Context) (rerr error) { if !s.IsRunning() { return nil @@ -385,7 +389,7 @@ func (s *server) Shutdown(ctx context.Context) (rerr error) { } } - if len(s.socketPath) != 0 { + if s.socketPath != "" { defer func() { err := os.RemoveAll(s.socketPath) if err != nil { diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index faa5b19d3a..912c52deca 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -125,12 +125,10 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { func (c *correct) PreStop(_ context.Context) error { log.Info("removing persistent cache files...") - if err := c.checkedID.Close(true); err != nil { - return err - } - return nil + return c.checkedID.Close(true) } +// skipcq: GO-R1005 func (c *correct) correct(ctx context.Context) (err error) { // leftAgentAddrs is the agents' addr that hasn't been corrected yet. // This is used to know which agents possibly have the same index as the target replica. 
@@ -381,6 +379,7 @@ func (c *correct) correctTimestamp(ctx context.Context, targetReplica *vectorRep return nil } + // skipcq: CRT-D0001 allReplicas := append(foundReplicas, targetReplica) // sort by timestamp diff --git a/pkg/index/job/correction/service/corrector_test.go b/pkg/index/job/correction/service/corrector_test.go index 5083eadf10..533a8c2fd1 100644 --- a/pkg/index/job/correction/service/corrector_test.go +++ b/pkg/index/job/correction/service/corrector_test.go @@ -31,11 +31,11 @@ type mockDiscovererClient struct { client mock.ClientInternal } -func (*mockDiscovererClient) Start(ctx context.Context) (<-chan error, error) { +func (*mockDiscovererClient) Start(context.Context) (<-chan error, error) { return nil, nil } -func (*mockDiscovererClient) GetAddrs(ctx context.Context) []string { +func (*mockDiscovererClient) GetAddrs(context.Context) []string { return nil } diff --git a/pkg/index/job/correction/usecase/corrector.go b/pkg/index/job/correction/usecase/corrector.go index 5567016f7a..5f4bc4c2ea 100644 --- a/pkg/index/job/correction/usecase/corrector.go +++ b/pkg/index/job/correction/usecase/corrector.go @@ -177,6 +177,7 @@ func (r *run) Start(ctx context.Context) (<-chan error, error) { p, err := os.FindProcess(os.Getpid()) if err != nil { // using Fatal to avoid this process to be zombie + // skipcq: RVV-A0003 log.Fatalf("failed to find my pid to kill %v", err) return } @@ -216,6 +217,6 @@ func (r *run) Stop(ctx context.Context) error { return nil } -func (*run) PostStop(ctx context.Context) error { +func (*run) PostStop(_ context.Context) error { return nil } From 5e73905f3866137e2ffe6d075688f2a7e347895c Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 26 Sep 2023 04:07:26 +0000 Subject: [PATCH 087/101] Fix misspell --- pkg/index/job/correction/service/corrector.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 
912c52deca..ef57dd6e04 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -603,5 +603,5 @@ func getCorrectionStartTime(ctx context.Context) (time.Time, error) { if t, ok := v.(time.Time); ok { return t, nil } - return time.Time{}, fmt.Errorf("timeKey is not embeded in context") + return time.Time{}, fmt.Errorf("timeKey is not embedded in context") } From ce78ccf7e948bd85044271321142f8f5c76e0548 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 26 Sep 2023 04:15:32 +0000 Subject: [PATCH 088/101] Add type check --- internal/test/mock/grpc_testify_mock.go | 29 +++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/internal/test/mock/grpc_testify_mock.go b/internal/test/mock/grpc_testify_mock.go index 2a5c75535f..9083d65554 100644 --- a/internal/test/mock/grpc_testify_mock.go +++ b/internal/test/mock/grpc_testify_mock.go @@ -15,6 +15,7 @@ package mock import ( "context" + "fmt" "github.com/stretchr/testify/mock" "github.com/vdaas/vald/apis/grpc/v1/payload" @@ -160,22 +161,42 @@ func (c *ClientInternal) RoundRobin(ctx context.Context, f func(ctx context.Cont func (c *ClientInternal) GetDialOption() []DialOption { args := c.Called() - return args.Get(0).([]DialOption) + v, ok := args.Get(0).([]DialOption) + if !ok { + // panic here like testify mock does + panic(fmt.Sprintf("The provided arg(%v) is not type []DialOption", args.Get(0))) + } + return v } func (c *ClientInternal) GetCallOption() []CallOption { args := c.Called() - return args.Get(0).([]CallOption) + v, ok := args.Get(0).([]CallOption) + if !ok { + // panic here like testify mock does + panic(fmt.Sprintf("The provided arg(%v) is not type []CallOption", args.Get(0))) + } + return v } func (c *ClientInternal) GetBackoff() backoff.Backoff { args := c.Called() - return args.Get(0).(backoff.Backoff) + v, ok := args.Get(0).(backoff.Backoff) + if !ok { + // panic here like testify mock does + panic(fmt.Sprintf("The 
provided arg(%v) is not type backoff.Backoff", args.Get(0))) + } + return v } func (c *ClientInternal) ConnectedAddrs() []string { args := c.Called() - return args.Get(0).([]string) + v, ok := args.Get(0).([]string) + if !ok { + // panic here like testify mock does + panic(fmt.Sprintf("The provided arg(%v) is not type []string", args.Get(0))) + } + return v } func (c *ClientInternal) Close(ctx context.Context) error { From ef2e20d60cf32eef9b10f2408fedf2f27e20dc76 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 26 Sep 2023 04:29:04 +0000 Subject: [PATCH 089/101] Remove unused config --- pkg/index/job/correction/config/config.go | 9 --------- pkg/index/job/correction/usecase/corrector.go | 2 +- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/pkg/index/job/correction/config/config.go b/pkg/index/job/correction/config/config.go index 98e0a8b519..72d4f7d758 100644 --- a/pkg/index/job/correction/config/config.go +++ b/pkg/index/job/correction/config/config.go @@ -37,9 +37,6 @@ type Data struct { // Indexer represent agent auto indexing service configuration Corrector *config.Corrector `json:"corrector" yaml:"corrector"` - - // Gateway represent agent gateway service configuration - Gateway *config.LB `json:"gateway" yaml:"gateway"` } func NewConfig(path string) (cfg *Data, err error) { @@ -75,11 +72,5 @@ func NewConfig(path string) (cfg *Data, err error) { cfg.Corrector = new(config.Corrector).Bind() } - if cfg.Gateway != nil { - cfg.Gateway = cfg.Gateway.Bind() - } else { - cfg.Gateway = new(config.LB).Bind() - } - return cfg, nil } diff --git a/pkg/index/job/correction/usecase/corrector.go b/pkg/index/job/correction/usecase/corrector.go index 5f4bc4c2ea..5a168a00eb 100644 --- a/pkg/index/job/correction/usecase/corrector.go +++ b/pkg/index/job/correction/usecase/corrector.go @@ -44,7 +44,7 @@ type run struct { } func New(cfg *config.Data) (r runner.Runner, err error) { - if cfg.Gateway.IndexReplica == 1 { + if cfg.Corrector.IndexReplica == 1 { return 
nil, errors.ErrIndexReplicaOne } From 93d6ab7bdc387bd33c4b74649da817f681eebfdf Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 26 Sep 2023 04:58:51 +0000 Subject: [PATCH 090/101] Fix DeepSource error --- internal/servers/server/server.go | 1 + pkg/index/job/correction/config/config.go | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/internal/servers/server/server.go b/internal/servers/server/server.go index 709f5c973e..7141e883a5 100644 --- a/internal/servers/server/server.go +++ b/internal/servers/server/server.go @@ -129,6 +129,7 @@ type grpcKeepalive struct { permitWithoutStream bool } +// New returns Server implementation. // skipcq: GO-R1005 func New(opts ...Option) (Server, error) { srv := new(server) diff --git a/pkg/index/job/correction/config/config.go b/pkg/index/job/correction/config/config.go index 72d4f7d758..70a48b5baa 100644 --- a/pkg/index/job/correction/config/config.go +++ b/pkg/index/job/correction/config/config.go @@ -24,7 +24,7 @@ import ( type GlobalConfig = config.GlobalConfig -// Config represent a application setting data content (config.yaml). +// Data represents a application setting data content (config.yaml). // In K8s environment, this configuration is stored in K8s ConfigMap. 
type Data struct { config.GlobalConfig `json:",inline" yaml:",inline"` From 31058f5d4eda359975a6b6562082a9f74ad2634e Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 26 Sep 2023 05:10:43 +0000 Subject: [PATCH 091/101] Add required go:build e2e tag --- tests/e2e/pkg/agent/core/ngt/service/ngt_e2s_test.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/e2e/pkg/agent/core/ngt/service/ngt_e2s_test.go b/tests/e2e/pkg/agent/core/ngt/service/ngt_e2s_test.go index 77528a2de0..81ee7cf22d 100644 --- a/tests/e2e/pkg/agent/core/ngt/service/ngt_e2s_test.go +++ b/tests/e2e/pkg/agent/core/ngt/service/ngt_e2s_test.go @@ -1,3 +1,5 @@ +//go:build e2e + // // Copyright (C) 2019-2023 vdaas.org vald team // From 6f7973619ebec4cfec02788965ed60d9b9aeddf9 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 26 Sep 2023 05:27:11 +0000 Subject: [PATCH 092/101] Remove memo --- pkg/index/job/correction/usecase/corrector.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pkg/index/job/correction/usecase/corrector.go b/pkg/index/job/correction/usecase/corrector.go index 5a168a00eb..337b308f3c 100644 --- a/pkg/index/job/correction/usecase/corrector.go +++ b/pkg/index/job/correction/usecase/corrector.go @@ -139,10 +139,6 @@ func (r *run) PreStart(ctx context.Context) error { } func (r *run) Start(ctx context.Context) (<-chan error, error) { - // TODO: Set timeout? 
- // ctx, cancel := context.WithTimeout(ctx, time.Microsecond*10) - // defer cancel() - log.Info("starting servers") ech := make(chan error, 3) //nolint:gomnd var oech, nech, sech <-chan error From 0d4d9d24f3b1eaeab9443e22b34c8383d4789e41 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 26 Sep 2023 05:30:19 +0000 Subject: [PATCH 093/101] Refactor comment --- pkg/index/job/correction/service/corrector.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index ef57dd6e04..25fd5c4deb 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -510,7 +510,7 @@ func (c *correct) insertObject(ctx context.Context, addr string, vector *payload Do(grpc.WithGRPCMethod(ctx, insertMethod), addr, func(ctx context.Context, conn *grpc.ClientConn, copts ...grpc.CallOption) (interface{}, error) { return vald.NewInsertClient(conn).Insert(ctx, &payload.Insert_Request{ Vector: vector, - // FIXME: this should be deleted after Config.Timestamp deprecation + // TODO: this should be deleted after Config.Timestamp deprecation Config: &payload.Insert_Config{ Timestamp: vector.GetTimestamp(), }, From 0eebe24850213c9de5791cfb9753752b1f5fbf5c Mon Sep 17 00:00:00 2001 From: ykadowak Date: Wed, 27 Sep 2023 02:32:17 +0000 Subject: [PATCH 094/101] Remove TODO comment that is already done --- pkg/index/job/correction/service/corrector.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 25fd5c4deb..f9ba9c78a3 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -176,7 +176,6 @@ func (c *correct) correct(ctx context.Context) (err error) { // The number of items to be received in advance is not known in advance. 
// This is because there is a possibility of new items being inserted during processing. - // TODO: BTW, we need to ignore these index by checking the timestamp. for { select { case <-sctx.Done(): From 0eba31e641ca6a85ab60cea8d763852baca01430 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Mon, 2 Oct 2023 02:40:17 +0000 Subject: [PATCH 095/101] Remove unused config --- internal/config/corrector.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/internal/config/corrector.go b/internal/config/corrector.go index c29f508472..2b5b56beec 100644 --- a/internal/config/corrector.go +++ b/internal/config/corrector.go @@ -31,8 +31,6 @@ type Corrector struct { // AgentDNS represent agents dns A record for service discovery AgentDNS string `json:"agent_dns" yaml:"agent_dns"` - CreationPoolSize uint32 `json:"creation_pool_size" yaml:"creation_pool_size"` - // NodeName represents node name NodeName string `json:"node_name" yaml:"node_name"` From 5a73044f8586e5ea087c9df0d289388e1cb14b69 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Mon, 2 Oct 2023 02:44:18 +0000 Subject: [PATCH 096/101] Add comment to errors --- internal/errors/corrector.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/internal/errors/corrector.go b/internal/errors/corrector.go index ab0f03b844..757c66820e 100644 --- a/internal/errors/corrector.go +++ b/internal/errors/corrector.go @@ -17,8 +17,11 @@ // Package errors provides error types and function package errors +// ErrIndexReplicaOne represents an error that nothing to correct when index replica is 1. var ErrIndexReplicaOne = New("nothing to correct when index replica is 1") +// ErrNoAvailableAgentToInsert represents an error that no available agent to insert replica. var ErrNoAvailableAgentToInsert = New("no available agent to insert replica") +// ErrFailedToCorrectReplicaNum represents an error that failed to correct replica number after correction process. 
var ErrFailedToCorrectReplicaNum = New("failed to correct replica number after correction process") From 5117332f5a304aea2e5762f55206645f052418c3 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Mon, 2 Oct 2023 06:20:43 +0000 Subject: [PATCH 097/101] change app name --- ...correction.yml => dockers-index-correction.yml} | 10 +++++----- Makefile | 2 +- Makefile.d/build.mk | 2 +- Makefile.d/docker.mk | 14 +++++++------- dockers/index/job/correction/Dockerfile | 6 +++--- 5 files changed, 17 insertions(+), 17 deletions(-) rename .github/workflows/{dockers-index-job-correction.yml => dockers-index-correction.yml} (88%) diff --git a/.github/workflows/dockers-index-job-correction.yml b/.github/workflows/dockers-index-correction.yml similarity index 88% rename from .github/workflows/dockers-index-job-correction.yml rename to .github/workflows/dockers-index-correction.yml index 624b3e5cbe..3d6ada85a4 100644 --- a/.github/workflows/dockers-index-job-correction.yml +++ b/.github/workflows/dockers-index-correction.yml @@ -13,7 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -name: "Build docker image: index-job-correction" +name: "Build docker image: index-correction" on: push: branches: @@ -25,7 +25,7 @@ on: - "v*.*.*-*" paths: - ".github/actions/docker-build/actions.yaml" - - ".github/workflows/dockers-index-job-correction.yml" + - ".github/workflows/dockers-index-correction.yml" - "go.mod" - "go.sum" - "internal/**" @@ -41,7 +41,7 @@ on: paths: - ".github/actions/docker-build/actions.yaml" - ".github/workflows/_docker-image.yaml" - - ".github/workflows/dockers-index-job-correction.yml" + - ".github/workflows/dockers-index-correction.yml" - "go.mod" - "go.sum" - "internal/**" @@ -57,7 +57,7 @@ on: paths: - ".github/actions/docker-build/actions.yaml" - ".github/workflows/_docker-image.yaml" - - ".github/workflows/dockers-index-job-correction.yml" + - ".github/workflows/dockers-index-correction.yml" - "go.mod" - "go.sum" - "internal/**" @@ -74,5 +74,5 @@ jobs: build: uses: ./.github/workflows/_docker-image.yaml with: - target: index-job-correction + target: index-correction secrets: inherit diff --git a/Makefile b/Makefile index 7933a89689..d859cef45d 100644 --- a/Makefile +++ b/Makefile @@ -30,7 +30,7 @@ FILTER_GATEWAY_IMAGE = $(NAME)-filter-gateway HELM_OPERATOR_IMAGE = $(NAME)-helm-operator LB_GATEWAY_IMAGE = $(NAME)-lb-gateway LOADTEST_IMAGE = $(NAME)-loadtest -INDEX_JOB_CORRECTION_IMAGE = $(NAME)-index-job-correction +INDEX_CORRECTION_IMAGE = $(NAME)-index-correction MANAGER_INDEX_IMAGE = $(NAME)-manager-index MAINTAINER = "$(ORG).org $(NAME) team <$(NAME)@$(ORG).org>" diff --git a/Makefile.d/build.mk b/Makefile.d/build.mk index f62dbdf76a..5f0d2089e1 100644 --- a/Makefile.d/build.mk +++ b/Makefile.d/build.mk @@ -206,7 +206,7 @@ cmd/manager/index/index: \ $(dir $@)main.go $@ -version -cmd/index/job/correction/correction: \ +cmd/index/job/correction/index-correction: \ $(GO_SOURCES_INTERNAL) \ $(PBGOS) \ $(shell find $(ROOTDIR)/cmd/index/job/correction/correction -type f -name '*.go' -not -name '*_test.go' -not -name 'doc.go') \ 
diff --git a/Makefile.d/docker.mk b/Makefile.d/docker.mk index eed7283a46..5771c857e6 100644 --- a/Makefile.d/docker.mk +++ b/Makefile.d/docker.mk @@ -189,16 +189,16 @@ docker/build/loadtest: --build-arg MAINTAINER=$(MAINTAINER) \ --build-arg GO_VERSION=$(GO_VERSION) -.PHONY: docker/name/index-job-correction -docker/name/index-job-correction: - @echo "$(ORG)/$(INDEX_JOB_CORRECTION_IMAGE)" +.PHONY: docker/name/index-correction +docker/name/index-correction: + @echo "$(ORG)/$(INDEX_CORRECTION_IMAGE)" -.PHONY: docker/build/index-job-correction -## build index-job-correction image -docker/build/index-job-correction: +.PHONY: docker/build/index-correction +## build index-correction image +docker/build/index-correction: $(DOCKER) build \ $(DOCKER_OPTS) \ -f dockers/index/job/correction/Dockerfile \ - -t $(ORG)/$(INDEX_JOB_CORRECTION_IMAGE):$(TAG) . \ + -t $(ORG)/$(INDEX_CORRECTION_IMAGE):$(TAG) . \ --build-arg MAINTAINER=$(MAINTAINER) \ --build-arg GO_VERSION=$(GO_VERSION) diff --git a/dockers/index/job/correction/Dockerfile b/dockers/index/job/correction/Dockerfile index a065c2cb40..1938e2531f 100644 --- a/dockers/index/job/correction/Dockerfile +++ b/dockers/index/job/correction/Dockerfile @@ -33,7 +33,7 @@ ENV PATH ${PATH}:${GOROOT}/bin:${GOPATH}/bin ENV ORG vdaas ENV REPO vald ENV PKG index/job/correction -ENV APP_NAME correction +ENV APP_NAME index-correction # skipcq: DOK-DL3008 RUN apt-get update && apt-get install -y --no-install-recommends \ @@ -83,11 +83,11 @@ RUN cp sample.yaml /tmp/config.yaml FROM ${DISTROLESS_IMAGE}:${DISTROLESS_IMAGE_TAG} LABEL maintainer="${MAINTAINER}" -ENV APP_NAME correction +ENV APP_NAME index-correction COPY --from=builder /usr/bin/${APP_NAME} /go/bin/${APP_NAME} COPY --from=builder /tmp/config.yaml /etc/server/config.yaml USER nonroot:nonroot -ENTRYPOINT ["/go/bin/correction"] +ENTRYPOINT ["/go/bin/index-correction"] From 74bd1c6ddad75291b5d864802838c4547df5f3f3 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Mon, 2 Oct 2023 04:25:22 
+0000 Subject: [PATCH 098/101] replace filepath pkg with internal file replace filepath pkg with internal file refactor --- internal/servers/server/server.go | 5 +++-- pkg/index/job/correction/service/corrector.go | 5 ++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/internal/servers/server/server.go b/internal/servers/server/server.go index 7141e883a5..7d094906c5 100644 --- a/internal/servers/server/server.go +++ b/internal/servers/server/server.go @@ -22,13 +22,13 @@ import ( "crypto/tls" "net/http" "os" - "path/filepath" "reflect" "strconv" "syscall" "time" "github.com/vdaas/vald/internal/errors" + "github.com/vdaas/vald/internal/file" "github.com/vdaas/vald/internal/log" "github.com/vdaas/vald/internal/net" "github.com/vdaas/vald/internal/net/control" @@ -279,7 +279,8 @@ func (s *server) ListenAndServe(ctx context.Context, ech chan<- error) (err erro }(), func() string { if s.network == net.UNIX { if s.socketPath == "" { - s.socketPath = filepath.Join(os.TempDir(), string(os.PathSeparator), s.name, ".", strconv.Itoa(os.Getpid()), ".sock") + sockFile := strings.Join([]string{s.name, strconv.Itoa(os.Getpid()), "sock"}, ".") + s.socketPath = file.Join(os.TempDir(), sockFile) } return s.socketPath } diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index f9ba9c78a3..37519f8bb9 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -19,7 +19,6 @@ import ( "fmt" "io" "os" - "path/filepath" "slices" "sync/atomic" "time" @@ -67,9 +66,9 @@ type correct struct { const filemode = 0o600 func New(cfg *config.Data, discoverer discoverer.Client) (Corrector, error) { - d := filepath.Join(os.TempDir(), "bbolt") + d := file.Join(os.TempDir(), "bbolt") file.MkdirAll(d, os.ModePerm) - dbfile := filepath.Join(d, "checkedid.db") + dbfile := file.Join(d, "checkedid.db") bolt, err := bbolt.New(dbfile, "", os.FileMode(filemode)) if err != nil { return 
nil, err From d687013c6b7e508a07c59b0d9540d41e4b65abd8 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Mon, 2 Oct 2023 07:31:39 +0000 Subject: [PATCH 099/101] Refactor refactor --- pkg/index/job/correction/service/corrector.go | 126 ++++++++---------- .../job/correction/service/corrector_test.go | 10 +- 2 files changed, 68 insertions(+), 68 deletions(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 37519f8bb9..d0570e5977 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -34,6 +34,7 @@ import ( "github.com/vdaas/vald/internal/net/grpc" "github.com/vdaas/vald/internal/net/grpc/codes" "github.com/vdaas/vald/internal/net/grpc/status" + "github.com/vdaas/vald/internal/safety" "github.com/vdaas/vald/internal/sync" "github.com/vdaas/vald/internal/sync/errgroup" "github.com/vdaas/vald/pkg/index/job/correction/config" @@ -106,9 +107,8 @@ func (c *correct) Start(ctx context.Context) (<-chan error, error) { return nil, err } - // For debugging c.indexInfos.Range(func(addr string, info *payload.Info_Index_Count) bool { - log.Debugf("index info: addr(%s), stored(%d), uncommitted(%d)", addr, info.GetStored(), info.GetUncommitted()) + log.Infof("index info: addr(%s), stored(%d), uncommitted(%d)", addr, info.GetStored(), info.GetUncommitted()) return true }) @@ -133,30 +133,22 @@ func (c *correct) correct(ctx context.Context) (err error) { // This is used to know which agents possibly have the same index as the target replica. // We can say this because, thanks to caching, there is no way that the target replica is // in the agent that has already been corrected. 
- leftAgentAddrs := make([]string, len(c.agentAddrs)) - n := copy(leftAgentAddrs, c.agentAddrs) - if n != len(c.agentAddrs) { - return fmt.Errorf("failed to copy agentAddrs") - } // Vector with time after this should not be processed - correctionStartTime, err := getCorrectionStartTime(ctx) + correctionStartTime, err := correctionStartTime(ctx) if err != nil { log.Errorf("cannot determine correction start time: %w", err) return err } + curTargetAgent := 0 if err := c.discoverer.GetClient().OrderedRange(ctx, c.agentAddrs, func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { // current address is the leftAgentAddrs[0] because this is OrderedRange and // leftAgentAddrs is copied from c.agentAddrs - leftAgentAddrs = leftAgentAddrs[1:] - - vc := vald.NewValdClient(conn) - stream, err := vc.StreamListObject(ctx, &payload.Object_List_Request{}) - if err != nil { - return err - } + defer func() { + curTargetAgent++ + }() // context and errgroup for stream.Recv and correction sctx, scancel := context.WithCancel(ctx) @@ -173,6 +165,12 @@ func (c *correct) correct(ctx context.Context) (err error) { var mu sync.Mutex log.Infof("starting correction for agent %s, stream concurrency: %d, bbolt concurrency: %d", addr, sconcurrency, bconcurrency) + vc := vald.NewValdClient(conn) + stream, err := vc.StreamListObject(ctx, &payload.Object_List_Request{}) + if err != nil { + return err + } + // The number of items to be received in advance is not known in advance. // This is because there is a possibility of new items being inserted during processing. 
for { @@ -181,9 +179,26 @@ func (c *correct) correct(ctx context.Context) (err error) { if !errors.Is(sctx.Err(), context.Canceled) { log.Errorf("context done unexpectedly: %v", sctx.Err()) } - goto Finalize + + // Finalize + err = seg.Wait() + if err != nil { + log.Errorf("err group returned error: %v", err) + } + + berr := bolteg.Wait() + if berr != nil { + log.Errorf("bbolt err group returned error: %v", err) + err = errors.Join(err, berr) + } else { + log.Info("bbolt all batch finished") + } + + log.Infof("correction finished for agent %s", addr) + return err + default: - seg.Go(func() error { + seg.Go(safety.RecoverFunc(func() error { mu.Lock() // As long as we don't stream.Recv() from the stream, we do not consume the memory of the message. // So by limiting the number of this errgroup.Go instances, we can limit the memory usage @@ -235,7 +250,7 @@ func (c *correct) correct(ctx context.Context) (err error) { addr: addr, vec: vec, }, - leftAgentAddrs, + curTargetAgent, ); err != nil { log.Errorf("failed to check consistency: %v", err) return nil // continue other processes @@ -245,25 +260,9 @@ func (c *correct) correct(ctx context.Context) (err error) { c.checkedID.AsyncSet(bolteg, []byte(id), nil) return nil - }) + })) } } - - Finalize: - err = seg.Wait() - if err != nil { - log.Errorf("err group returned error: %v", err) - } - - berr := bolteg.Wait() - if berr != nil { - log.Errorf("bolt err group returned error: %v", err) - err = errors.Join(err, berr) - } - log.Info("bbolt all batch finished") - - log.Infof("correction finished for agent %s", addr) - return err }, ); err != nil { log.Errorf("failed to range over agents(%v): %v", c.agentAddrs, err) @@ -279,39 +278,23 @@ type vectorReplica struct { } // Validate len(addrs) >= 2 before calling this function -func (c *correct) checkConsistency(ctx context.Context, targetReplica *vectorReplica, leftAgentAddrs []string) error { - // availableAddrs is the agents' addr that doesn't have the target replica thus is 
available to insert the replica - // to fix the index replica number if required. - availableAddrs := make([]string, 0, len(c.agentAddrs)-1) - for _, addr := range c.agentAddrs { - if addr != targetReplica.addr { - availableAddrs = append(availableAddrs, addr) - } - } +// idxだけ渡せば良い?c.addrsに全ての情報があるので? +func (c *correct) checkConsistency(ctx context.Context, targetReplica *vectorReplica, targetAgentIdx int) error { + // leftAgentAddrs is the agents' addr that hasn't been corrected yet. + leftAgentAddrs := c.agentAddrs[targetAgentIdx+1:] // Vector with time after this should not be processed - correctionStartTime, err := getCorrectionStartTime(ctx) + correctionStartTime, err := correctionStartTime(ctx) if err != nil { log.Errorf("cannot determine correction start time: %w", err) return err } - foundReplicas := make([]*vectorReplica, 0, len(availableAddrs)) + foundReplicas := make([]*vectorReplica, 0, len(c.agentAddrs)) var mu sync.Mutex if err := c.discoverer.GetClient().OrderedRangeConcurrent(ctx, leftAgentAddrs, len(leftAgentAddrs), func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { - // To avoid GetObject to myself. 
To maintain backward compatibility for withoug cache operation - if addr == targetReplica.addr { - return nil - } - - select { - case <-ctx.Done(): - return ctx.Err() - default: - } - vc := vald.NewValdClient(conn) - vec, err := vc.GetObject(ctx, &payload.Object_VectorRequest{ + vec, err := vald.NewValdClient(conn).GetObject(ctx, &payload.Object_VectorRequest{ Id: &payload.Object_ID{ Id: targetReplica.vec.GetId(), }, @@ -344,12 +327,6 @@ func (c *correct) checkConsistency(ctx context.Context, targetReplica *vectorRep addr: addr, vec: vec, }) - - // Remove this addr from availableAddrs because this addr has the target replica - // and not available to insert the replica to fix the index replica number - slices.DeleteFunc(availableAddrs, func(availableAddr string) bool { - return availableAddr == addr - }) mu.Unlock() return nil @@ -364,7 +341,7 @@ func (c *correct) checkConsistency(ctx context.Context, targetReplica *vectorRep } // check replica number - if err := c.correctReplica(ctx, targetReplica, foundReplicas, availableAddrs); err != nil { + if err := c.correctReplica(ctx, targetReplica, foundReplicas); err != nil { return fmt.Errorf("failed to fix index replica: %w", err) } @@ -409,11 +386,12 @@ func (c *correct) correctTimestamp(ctx context.Context, targetReplica *vectorRep return nil } +// correctReplica corrects the number of replicas of the target vector. 
+// skipcq: GO-R1005 func (c *correct) correctReplica( ctx context.Context, targetReplica *vectorReplica, foundReplicas []*vectorReplica, - availableAddrs []string, ) error { // diff < 0 means there is less replica than the correct number existReplica := len(foundReplicas) + 1 @@ -423,6 +401,20 @@ func (c *correct) correctReplica( return nil } + // availableAddrs = c.agentAddrs - foundReplicas - targetReplica.addr + availableAddrs := make([]string, 0, len(c.agentAddrs)) + for _, addr := range c.agentAddrs { + if addr == targetReplica.addr { + continue + } + if slices.ContainsFunc(foundReplicas, func(replica *vectorReplica) bool { + return replica.addr == addr + }) { + continue + } + availableAddrs = append(availableAddrs, addr) + } + // when there are less replicas than the correct number, add the extra replicas if diff < 0 { log.Infof("replica shortage of vector %s. inserting to other agents...", targetReplica.vec.GetId()) @@ -596,7 +588,7 @@ func embedTime(ctx context.Context) context.Context { return context.WithValue(ctx, correctionStartTimeKey, time.Now()) } -func getCorrectionStartTime(ctx context.Context) (time.Time, error) { +func correctionStartTime(ctx context.Context) (time.Time, error) { v := ctx.Value(correctionStartTimeKey) if t, ok := v.(time.Time); ok { return t, nil diff --git a/pkg/index/job/correction/service/corrector_test.go b/pkg/index/job/correction/service/corrector_test.go index 533a8c2fd1..0f5b7f3dd7 100644 --- a/pkg/index/job/correction/service/corrector_test.go +++ b/pkg/index/job/correction/service/corrector_test.go @@ -437,9 +437,17 @@ func Test_correct_correctReplica(t *testing.T) { }, }, } + + // agentAddrs = availableAddrs + target.addr + found.addr + // skipcq: CRT-D0001 + c.agentAddrs = append(test.args.availableAddrs, test.args.target.addr) + for _, found := range test.args.found { + c.agentAddrs = append(c.agentAddrs, found.addr) + } + t.Run(test.name, func(tt *testing.T) { tt.Parallel() - err := 
c.correctReplica(context.Background(), test.args.target, test.args.found, test.args.availableAddrs) + err := c.correctReplica(context.Background(), test.args.target, test.args.found) if test.want.err != nil { require.ErrorIs(t, test.want.err, err) } From ba241cb21fe0c953a9367c8aa7764a9fd32e8b23 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 3 Oct 2023 04:06:35 +0000 Subject: [PATCH 100/101] Fix gRPC spelling --- internal/net/grpc/context.go | 12 ++++++------ pkg/index/job/correction/service/corrector_test.go | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/internal/net/grpc/context.go b/internal/net/grpc/context.go index 2e06fb7897..90e5ee32a6 100644 --- a/internal/net/grpc/context.go +++ b/internal/net/grpc/context.go @@ -20,30 +20,30 @@ import ( type contextKey string -// GrpcMethodContextKey represents a context key for grpc method. +// GRPCMethodContextKey represents a context key for gRPC method. // This is exported only for testing. -const GrpcMethodContextKey contextKey = "grpc_method" +const GRPCMethodContextKey contextKey = "grpc_method" // WrapGRPCMethod returns a copy of parent in which the method associated with key (grpcMethodContextKey). func WrapGRPCMethod(ctx context.Context, method string) context.Context { m := FromGRPCMethod(ctx) if m == "" { - return context.WithValue(ctx, GrpcMethodContextKey, method) + return context.WithValue(ctx, GRPCMethodContextKey, method) } if strings.HasSuffix(m, method) { return ctx } - return context.WithValue(ctx, GrpcMethodContextKey, m+"/"+method) + return context.WithValue(ctx, GRPCMethodContextKey, m+"/"+method) } // WithGRPCMethod returns a copy of parent in which the method associated with key (grpcMethodContextKey). 
func WithGRPCMethod(ctx context.Context, method string) context.Context { - return context.WithValue(ctx, GrpcMethodContextKey, method) + return context.WithValue(ctx, GRPCMethodContextKey, method) } // FromGRPCMethod returns the value associated with this context for key (grpcMethodContextKey). func FromGRPCMethod(ctx context.Context) string { - if v := ctx.Value(GrpcMethodContextKey); v != nil { + if v := ctx.Value(GRPCMethodContextKey); v != nil { if method, ok := v.(string); ok { return method } diff --git a/pkg/index/job/correction/service/corrector_test.go b/pkg/index/job/correction/service/corrector_test.go index 0f5b7f3dd7..91a6b2fd4c 100644 --- a/pkg/index/job/correction/service/corrector_test.go +++ b/pkg/index/job/correction/service/corrector_test.go @@ -456,7 +456,7 @@ func Test_correct_correctReplica(t *testing.T) { // check if the agents which need to be corrected are called with the required method // checking calling parameter, like timestamp, is impossible because its inside of the function arg m.client.AssertCalled(tt, "Do", tmock.MatchedBy(func(ctx context.Context) bool { - method := ctx.Value(grpc.GrpcMethodContextKey) + method := ctx.Value(grpc.GRPCMethodContextKey) val, ok := method.(string) if !ok { return false From 436da6efad95e55028839e98d40254c442146465 Mon Sep 17 00:00:00 2001 From: ykadowak Date: Tue, 3 Oct 2023 04:21:25 +0000 Subject: [PATCH 101/101] Remove memo --- pkg/index/job/correction/service/corrector.go | 1 - 1 file changed, 1 deletion(-) diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index d0570e5977..f5553c73ba 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -278,7 +278,6 @@ type vectorReplica struct { } // Validate len(addrs) >= 2 before calling this function -// idxだけ渡せば良い?c.addrsに全ての情報があるので? 
func (c *correct) checkConsistency(ctx context.Context, targetReplica *vectorReplica, targetAgentIdx int) error { // leftAgentAddrs is the agents' addr that hasn't been corrected yet. leftAgentAddrs := c.agentAddrs[targetAgentIdx+1:]