From 8968ae5ec4163f290219310b68515a6be3c9a635 Mon Sep 17 00:00:00 2001 From: Lyndon-Li Date: Wed, 29 Nov 2023 13:16:57 +0800 Subject: [PATCH] add node-agent concurrency doc Signed-off-by: Lyndon-Li --- changelogs/unreleased/7161-Lyndon-Li | 1 + design/node-agent-concurrency.md | 14 ++-- pkg/cmd/cli/nodeagent/server.go | 8 +-- pkg/cmd/cli/nodeagent/server_test.go | 18 ++--- pkg/nodeagent/node_agent.go | 11 ++-- pkg/nodeagent/node_agent_test.go | 10 +-- .../docs/main/node-agent-concurrency.md | 66 +++++++++++++++++++ site/data/docs/main-toc.yml | 4 +- 8 files changed, 100 insertions(+), 32 deletions(-) create mode 100644 changelogs/unreleased/7161-Lyndon-Li create mode 100644 site/content/docs/main/node-agent-concurrency.md diff --git a/changelogs/unreleased/7161-Lyndon-Li b/changelogs/unreleased/7161-Lyndon-Li new file mode 100644 index 0000000000..19524276fc --- /dev/null +++ b/changelogs/unreleased/7161-Lyndon-Li @@ -0,0 +1 @@ +Add node-agent concurrency doc and change the config name from dataPathConcurrency to loadConcurrency \ No newline at end of file diff --git a/design/node-agent-concurrency.md b/design/node-agent-concurrency.md index 597da89792..359c6a2b61 100644 --- a/design/node-agent-concurrency.md +++ b/design/node-agent-concurrency.md @@ -28,16 +28,16 @@ Therefore, in order to gain the optimized performance with the limited resources We introduce a configMap named ```node-agent-configs``` for users to specify the node-agent related configurations. This configMap is not created by Velero, users should create it manually on demand. The configMap should be in the same namespace where Velero is installed. If multiple Velero instances are installed in different namespaces, there should be one configMap in each namespace which applies to node-agent in that namespace only. Node-agent server checks these configurations at startup time and use it to initiate the related VGDP modules. 
Therefore, users could edit this configMap any time, but in order to make the changes effective, node-agent server needs to be restarted. -The ```node-agent-configs``` configMap may be used for other purpose of configuring node-agent in future, at present, there is only one kind of configuration as the data in the configMap, the name is ```dataPathConcurrency```. +The ```node-agent-configs``` configMap may be used for other purpose of configuring node-agent in future, at present, there is only one kind of configuration as the data in the configMap, the name is ```loadConcurrency```. The data structure for ```node-agent-configs``` is as below: ```go type Configs struct { - // DataPathConcurrency is the config for data path concurrency per node. - DataPathConcurrency *DataPathConcurrency `json:"dataPathConcurrency,omitempty"` + // LoadConcurrency is the config for load concurrency per node. + LoadConcurrency *LoadConcurrency `json:"loadConcurrency,omitempty"` } -type DataPathConcurrency struct { +type LoadConcurrency struct { // GlobalConfig specifies the concurrency number to all nodes for which per-node config is not specified GlobalConfig int `json:"globalConfig,omitempty"` @@ -55,7 +55,7 @@ type RuledConfigs struct { ``` ### Global concurrent number -We allow users to specify a concurrent number that will be applied to all nodes if the per-node number is not specified. This number is set through ```globalConfig``` field in ```dataPathConcurrency```. +We allow users to specify a concurrent number that will be applied to all nodes if the per-node number is not specified. This number is set through ```globalConfig```. The number starts from 1 which means there is no concurrency, only one instance of VGDP is allowed. There is no roof limit. If this number is not specified or not valid, a hard-coded default value will be used, the value is set to 1. 
@@ -67,7 +67,7 @@ We allow users to specify different concurrent number per node, for example, use The range of Per-node concurrent number is the same with Global concurrent number. Per-node concurrent number is preferable to Global concurrent number, so it will overwrite the Global concurrent number for that node. -Per-node concurrent number is implemented through ```perNodeConfig``` field in ```dataPathConcurrency```. +Per-node concurrent number is implemented through ```perNodeConfig``` field. ```perNodeConfig``` is a list of ```RuledConfigs``` each item of which matches one or more nodes by label selectors and specify the concurrent number for the matched nodes. This means, the nodes are identified by labels. @@ -85,7 +85,7 @@ If one node falls into more than one rules, e.g., if node1 also has the label `` A sample of the ```node-agent-configs``` configMap is as below: ```json { - "dataPathConcurrency": { + "loadConcurrency": { "globalConfig": 2, "perNodeConfig": [ { diff --git a/pkg/cmd/cli/nodeagent/server.go b/pkg/cmd/cli/nodeagent/server.go index 86340a02d6..e648be91af 100644 --- a/pkg/cmd/cli/nodeagent/server.go +++ b/pkg/cmd/cli/nodeagent/server.go @@ -498,19 +498,19 @@ func (s *nodeAgentServer) getDataPathConcurrentNum(defaultNum int) int { return defaultNum } - if configs == nil || configs.DataPathConcurrency == nil { + if configs == nil || configs.LoadConcurrency == nil { s.logger.Infof("Concurrency configs are not found, use the default number %v", defaultNum) return defaultNum } - globalNum := configs.DataPathConcurrency.GlobalConfig + globalNum := configs.LoadConcurrency.GlobalConfig if globalNum <= 0 { s.logger.Warnf("Global number %v is invalid, use the default value %v", globalNum, defaultNum) globalNum = defaultNum } - if len(configs.DataPathConcurrency.PerNodeConfig) == 0 { + if len(configs.LoadConcurrency.PerNodeConfig) == 0 { return globalNum } @@ -522,7 +522,7 @@ func (s *nodeAgentServer) getDataPathConcurrentNum(defaultNum int) int { 
concurrentNum := math.MaxInt32 - for _, rule := range configs.DataPathConcurrency.PerNodeConfig { + for _, rule := range configs.LoadConcurrency.PerNodeConfig { selector, err := metav1.LabelSelectorAsSelector(&rule.NodeSelector) if err != nil { s.logger.WithError(err).Warnf("Failed to parse rule with label selector %s, skip it", rule.NodeSelector.String()) diff --git a/pkg/cmd/cli/nodeagent/server_test.go b/pkg/cmd/cli/nodeagent/server_test.go index 4472dfce12..d062a71866 100644 --- a/pkg/cmd/cli/nodeagent/server_test.go +++ b/pkg/cmd/cli/nodeagent/server_test.go @@ -176,7 +176,7 @@ func Test_getDataPathConcurrentNum(t *testing.T) { name: "global number is invalid", getFunc: func(context.Context, string, kubernetes.Interface) (*nodeagent.Configs, error) { return &nodeagent.Configs{ - DataPathConcurrency: &nodeagent.DataPathConcurrency{ + LoadConcurrency: &nodeagent.LoadConcurrency{ GlobalConfig: -1, }, }, nil @@ -188,7 +188,7 @@ func Test_getDataPathConcurrentNum(t *testing.T) { name: "global number is valid", getFunc: func(context.Context, string, kubernetes.Interface) (*nodeagent.Configs, error) { return &nodeagent.Configs{ - DataPathConcurrency: &nodeagent.DataPathConcurrency{ + LoadConcurrency: &nodeagent.LoadConcurrency{ GlobalConfig: globalNum, }, }, nil @@ -199,7 +199,7 @@ func Test_getDataPathConcurrentNum(t *testing.T) { name: "node is not found", getFunc: func(context.Context, string, kubernetes.Interface) (*nodeagent.Configs, error) { return &nodeagent.Configs{ - DataPathConcurrency: &nodeagent.DataPathConcurrency{ + LoadConcurrency: &nodeagent.LoadConcurrency{ GlobalConfig: globalNum, PerNodeConfig: []nodeagent.RuledConfigs{ { @@ -217,7 +217,7 @@ func Test_getDataPathConcurrentNum(t *testing.T) { name: "failed to get selector", getFunc: func(context.Context, string, kubernetes.Interface) (*nodeagent.Configs, error) { return &nodeagent.Configs{ - DataPathConcurrency: &nodeagent.DataPathConcurrency{ + LoadConcurrency: &nodeagent.LoadConcurrency{ 
GlobalConfig: globalNum, PerNodeConfig: []nodeagent.RuledConfigs{ { @@ -237,7 +237,7 @@ func Test_getDataPathConcurrentNum(t *testing.T) { name: "rule number is invalid", getFunc: func(context.Context, string, kubernetes.Interface) (*nodeagent.Configs, error) { return &nodeagent.Configs{ - DataPathConcurrency: &nodeagent.DataPathConcurrency{ + LoadConcurrency: &nodeagent.LoadConcurrency{ GlobalConfig: globalNum, PerNodeConfig: []nodeagent.RuledConfigs{ { @@ -257,7 +257,7 @@ func Test_getDataPathConcurrentNum(t *testing.T) { name: "label doesn't match", getFunc: func(context.Context, string, kubernetes.Interface) (*nodeagent.Configs, error) { return &nodeagent.Configs{ - DataPathConcurrency: &nodeagent.DataPathConcurrency{ + LoadConcurrency: &nodeagent.LoadConcurrency{ GlobalConfig: globalNum, PerNodeConfig: []nodeagent.RuledConfigs{ { @@ -277,7 +277,7 @@ func Test_getDataPathConcurrentNum(t *testing.T) { name: "match one rule", getFunc: func(context.Context, string, kubernetes.Interface) (*nodeagent.Configs, error) { return &nodeagent.Configs{ - DataPathConcurrency: &nodeagent.DataPathConcurrency{ + LoadConcurrency: &nodeagent.LoadConcurrency{ GlobalConfig: globalNum, PerNodeConfig: []nodeagent.RuledConfigs{ { @@ -297,7 +297,7 @@ func Test_getDataPathConcurrentNum(t *testing.T) { name: "match multiple rules", getFunc: func(context.Context, string, kubernetes.Interface) (*nodeagent.Configs, error) { return &nodeagent.Configs{ - DataPathConcurrency: &nodeagent.DataPathConcurrency{ + LoadConcurrency: &nodeagent.LoadConcurrency{ GlobalConfig: globalNum, PerNodeConfig: []nodeagent.RuledConfigs{ { @@ -321,7 +321,7 @@ func Test_getDataPathConcurrentNum(t *testing.T) { name: "match multiple rules 2", getFunc: func(context.Context, string, kubernetes.Interface) (*nodeagent.Configs, error) { return &nodeagent.Configs{ - DataPathConcurrency: &nodeagent.DataPathConcurrency{ + LoadConcurrency: &nodeagent.LoadConcurrency{ GlobalConfig: globalNum, PerNodeConfig: 
[]nodeagent.RuledConfigs{ { diff --git a/pkg/nodeagent/node_agent.go b/pkg/nodeagent/node_agent.go index ff93ed5967..4320baf3b0 100644 --- a/pkg/nodeagent/node_agent.go +++ b/pkg/nodeagent/node_agent.go @@ -34,16 +34,15 @@ import ( const ( // daemonSet is the name of the Velero node agent daemonset. - daemonSet = "node-agent" - configName = "node-agent-configs" - dataPathConConfigName = "data-path-concurrency" + daemonSet = "node-agent" + configName = "node-agent-configs" ) var ( ErrDaemonSetNotFound = errors.New("daemonset not found") ) -type DataPathConcurrency struct { +type LoadConcurrency struct { // GlobalConfig specifies the concurrency number to all nodes for which per-node config is not specified GlobalConfig int `json:"globalConfig,omitempty"` @@ -60,8 +59,8 @@ type RuledConfigs struct { } type Configs struct { - // DataPathConcurrency is the config for data path concurrency per node. - DataPathConcurrency *DataPathConcurrency `json:"dataPathConcurrency,omitempty"` + // LoadConcurrency is the config for data path load concurrency per node. + LoadConcurrency *LoadConcurrency `json:"loadConcurrency,omitempty"` } // IsRunning checks if the node agent daemonset is running properly. 
If not, return the error found diff --git a/pkg/nodeagent/node_agent_test.go b/pkg/nodeagent/node_agent_test.go index a18e45b140..a482a5a3ed 100644 --- a/pkg/nodeagent/node_agent_test.go +++ b/pkg/nodeagent/node_agent_test.go @@ -244,7 +244,7 @@ func TestGetConfigs(t *testing.T) { cm := builder.ForConfigMap("fake-ns", "node-agent-configs").Result() cmWithInvalidDataFormat := builder.ForConfigMap("fake-ns", "node-agent-configs").Data("fake-key", "wrong").Result() cmWithoutCocurrentData := builder.ForConfigMap("fake-ns", "node-agent-configs").Data("fake-key", "{\"someothers\":{\"someother\": 10}}").Result() - cmWithValidData := builder.ForConfigMap("fake-ns", "node-agent-configs").Data("fake-key", "{\"dataPathConcurrency\":{\"globalConfig\": 5}}").Result() + cmWithValidData := builder.ForConfigMap("fake-ns", "node-agent-configs").Data("fake-key", "{\"loadConcurrency\":{\"globalConfig\": 5}}").Result() tests := []struct { name string @@ -303,7 +303,7 @@ func TestGetConfigs(t *testing.T) { cmWithValidData, }, expectResult: &Configs{ - DataPathConcurrency: &DataPathConcurrency{ + LoadConcurrency: &LoadConcurrency{ GlobalConfig: 5, }, }, @@ -324,10 +324,10 @@ func TestGetConfigs(t *testing.T) { if test.expectResult == nil { assert.Nil(t, result) - } else if test.expectResult.DataPathConcurrency == nil { - assert.Nil(t, result.DataPathConcurrency) + } else if test.expectResult.LoadConcurrency == nil { + assert.Nil(t, result.LoadConcurrency) } else { - assert.Equal(t, *test.expectResult.DataPathConcurrency, *result.DataPathConcurrency) + assert.Equal(t, *test.expectResult.LoadConcurrency, *result.LoadConcurrency) } } else { assert.EqualError(t, err, test.expectErr) diff --git a/site/content/docs/main/node-agent-concurrency.md b/site/content/docs/main/node-agent-concurrency.md new file mode 100644 index 0000000000..20564eac58 --- /dev/null +++ b/site/content/docs/main/node-agent-concurrency.md @@ -0,0 +1,66 @@ +--- +title: "Node-agent Concurrency" +layout: docs +--- + 
+Velero node-agent is a daemonset hosting modules to complete the concrete tasks of backups/restores, i.e., file system backup/restore, CSI snapshot data movement. +Depending on the data size, data complexity and resource availability, the tasks may take a long time and considerable resources (CPU, memory, network bandwidth, etc.). These tasks make up the loads of node-agent. + +Node-agent concurrency configurations allow you to configure the concurrent number of node-agent loads per node. When the resources are sufficient in nodes, you can set a large concurrent number, so as to reduce the backup/restore time; otherwise, the concurrency should be reduced, or the backup/restore may encounter problems, e.g., time lagging, hang or OOM kill. + +To set Node-agent concurrency configurations, a configMap named ```node-agent-configs``` should be created manually. The configMap should be in the same namespace where Velero is installed. If multiple Velero instances are installed in different namespaces, there should be one configMap in each namespace which applies to node-agent in that namespace only. +Node-agent server checks these configurations at startup time. Therefore, you could edit this configMap any time, but in order to make the changes effective, node-agent server needs to be restarted. + +### Global concurrent number +You can specify a concurrent number that will be applied to all nodes if the per-node number is not specified. This number is set through the ```globalConfig``` field in ```loadConcurrency```. +The number starts from 1 which means there is no concurrency, only one load is allowed. There is no upper limit. If this number is not specified or not valid, a hard-coded default value will be used, the value is set to 1. + +### Per-node concurrent number +You can specify a different concurrent number per node, for example, you can set 3 concurrent instances in Node-1, 2 instances in Node-2 and 1 instance in Node-3. 
+The range of Per-node concurrent number is the same as Global concurrent number. Per-node concurrent number takes precedence over Global concurrent number, so it will override the Global concurrent number for that node. + +Per-node concurrent number is implemented through the ```perNodeConfig``` field in ```loadConcurrency```. +```perNodeConfig``` is a list of ```RuledConfigs``` each item of which matches one or more nodes by label selectors and specifies the concurrent number for the matched nodes. +Here is an example of the ```perNodeConfig```: +``` +"nodeSelector: kubernetes.io/hostname=node1; number: 3" +"nodeSelector: beta.kubernetes.io/instance-type=Standard_B4ms; number: 5" +``` +The first element means the node with host name ```node1``` gets the Per-node concurrent number of 3. +The second element means all the nodes with label ```beta.kubernetes.io/instance-type``` of value ```Standard_B4ms``` get the Per-node concurrent number of 5. +At least one node is expected to have a label with the specified ```RuledConfigs``` element (rule). If no node is with this label, the Per-node rule has no effect. +If one node falls into more than one rule, e.g., if node1 also has the label ```beta.kubernetes.io/instance-type=Standard_B4ms```, the smallest number (3) will be used. 
+ +### Sample +A sample of the complete ```node-agent-configs``` configMap is as below: +```json +{ + "loadConcurrency": { + "globalConfig": 2, + "perNodeConfig": [ + { + "nodeSelector": { + "matchLabels": { + "kubernetes.io/hostname": "node1" + } + }, + "number": 3 + }, + { + "nodeSelector": { + "matchLabels": { + "beta.kubernetes.io/instance-type": "Standard_B4ms" + } + }, + "number": 5 + } + ] + } +} +``` +To create the configMap, save something like the above sample to a json file and then run the command below: +``` +kubectl create cm node-agent-configs -n velero --from-file=<json file name> +``` + + diff --git a/site/data/docs/main-toc.yml b/site/data/docs/main-toc.yml index f3138778b5..beeee2efd6 100644 --- a/site/data/docs/main-toc.yml +++ b/site/data/docs/main-toc.yml @@ -50,7 +50,9 @@ toc: - page: CSI Support url: /csi - page: CSI Snapshot Data Movement - url: /csi-snapshot-data-movement + url: /csi-snapshot-data-movement + - page: Node-agent Concurrency + url: /node-agent-concurrency - page: Verifying Self-signed Certificates url: /self-signed-certificates - page: Changing RBAC permissions