diff --git a/README.md b/README.md
index b04a87f8..0365f65e 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ See [install/uninstall nebula operator](doc/user/install_guide.md) .
 ### Create and destroy a nebula cluster
 
 ```bash
-$ kubectl create -f config/samples/apps_v1alpha1_nebulacluster.yaml
+$ kubectl create -f config/samples/nebulacluster.yaml
 ```
 
 A non-HA-mode nebula cluster will be created.
@@ -53,7 +53,7 @@ Welcome to NebulaGraph!
 Destroy the nebula cluster:
 
 ```bash
-$ kubectl delete -f config/samples/apps_v1alpha1_nebulacluster.yaml
+$ kubectl delete -f config/samples/nebulacluster.yaml
 ```
 
 ### Resize a nebula cluster
@@ -61,10 +61,10 @@ $ kubectl delete -f config/samples/apps_v1alpha1_nebulacluster.yaml
 Create a nebula cluster:
 
 ```bash
-$ kubectl create -f config/samples/apps_v1alpha1_nebulacluster.yaml
+$ kubectl create -f config/samples/nebulacluster.yaml
 ```
 
-In `config/samples/apps_v1alpha1_nebulacluster.yaml` the initial storaged replicas is 3.
+In `config/samples/nebulacluster.yaml`, the initial number of storaged replicas is 3.
 Modify the file and change `replicas` from 3 to 5.
 
 ```yaml
@@ -89,7 +89,7 @@ Modify the file and change `replicas` from 3 to 5.
 Apply the replicas change to the cluster CR:
 
 ```bash
-$ kubectl apply -f config/samples/apps_v1alpha1_nebulacluster.yaml
+$ kubectl apply -f config/samples/nebulacluster.yaml
 ```
 
 The storaged cluster will scale to 5 members (5 pods):
@@ -147,7 +147,7 @@ In addition, you can [Install Nebula Cluster with helm](doc/user/nebula_cluster_
 Create a nebula cluster with the version specified (v3.6.0):
 
 ```bash
-$ kubectl apply -f config/samples/apps_v1alpha1_nebulacluster.yaml
+$ kubectl apply -f config/samples/nebulacluster.yaml
 $ kubectl get pods -l app.kubernetes.io/cluster=nebula
 NAME              READY   STATUS    RESTARTS   AGE
 nebula-graphd-0   1/1     Running   0          25m
@@ -166,12 +166,12 @@ $ kubectl get pods -l app.kubernetes.io/cluster=nebula -o jsonpath="{.items[*].
       3 vesoft/nebula-storaged:v3.6.0
 ```
 
-Now modify the file `apps_v1alpha1_nebulacluster.yaml` and change the `version` from v3.6.0 to v3.6.x:
+Now modify the file `nebulacluster.yaml` and change the `version` from v3.6.0 to v3.6.x.
 
 Apply the version change to the cluster CR:
 
 ```bash
-$ kubectl apply -f config/samples/apps_v1alpha1_nebulacluster.yaml
+$ kubectl apply -f config/samples/nebulacluster.yaml
 ```
 
 Wait a few minutes. The container image version should be updated to v3.6.x:
@@ -199,7 +199,7 @@ through this in the following steps.
 Create a nebula cluster:
 
 ```bash
-$ kubectl create -f config/samples/apps_v1alpha1_nebulacluster.yaml
+$ kubectl create -f config/samples/nebulacluster.yaml
 ```
 
 Wait until pods are up.
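+
+One way to wait for readiness (a sketch; the label selector follows the `kubectl get pods` examples above, and the timeout is an assumption):
+
+```bash
+# Block until every pod in the cluster reports Ready, or give up after 10 minutes.
+$ kubectl wait --for=condition=Ready pod -l app.kubernetes.io/cluster=nebula --timeout=10m
+```
+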
 Simulate a member failure by deleting a storaged pod:
diff --git a/apis/pkg/annotation/annotation.go b/apis/pkg/annotation/annotation.go
index c1048578..b180d183 100644
--- a/apis/pkg/annotation/annotation.go
+++ b/apis/pkg/annotation/annotation.go
@@ -24,7 +24,7 @@ const (
 	AnnPodNameKey = "nebula-graph.io/pod-name"
 	// AnnLastSyncTimestampKey is annotation key to indicate the last timestamp the operator sync the workload
 	AnnLastSyncTimestampKey = "nebula-graph.io/sync-timestamp"
-	// AnnHaModeKey is annotation key to indicate whether in ha mode
+	// AnnHaModeKey is the annotation key to indicate whether the cluster is in HA mode
 	AnnHaModeKey = "nebula-graph.io/ha-mode"
 	// AnnLastAppliedDynamicFlagsKey is annotation key to indicate the last applied custom dynamic flags
 	AnnLastAppliedDynamicFlagsKey = "nebula-graph.io/last-applied-dynamic-flags"
@@ -48,7 +48,7 @@ const (
 	// AnnRestoreStageKey is the annotation key to indicate what is the current stage
 	AnnRestoreStageKey = "restore-stage"
 
-	// AnnHaModeVal is annotation value to indicate whether in ha mode
+	// AnnHaModeVal is the annotation value that indicates HA mode
 	AnnHaModeVal = "true"
 
 	// AnnRestoreMetadStepVal is annotation value to indicate whether Metad restore step is completed in stage 1
@@ -100,7 +100,7 @@ func IsInRestoreStage2(ann map[string]string) bool {
 	return false
 }
 
-// IsInHaMode check whether in ha mode
+// IsInHaMode checks whether the cluster is in HA mode
 func IsInHaMode(ann map[string]string) bool {
 	if ann != nil {
 		val, ok := ann[AnnHaModeKey]
diff --git a/config/samples/autoscaling_v1alpha1_nebulaautoscaler.yaml b/config/samples/autoscaling_v1alpha1_nebulaautoscaler.yaml
deleted file mode 100644
index a82cf67a..00000000
--- a/config/samples/autoscaling_v1alpha1_nebulaautoscaler.yaml
+++ /dev/null
@@ -1,18 +0,0 @@
-apiVersion: autoscaling.nebula-graph.io/v1alpha1
-kind: NebulaAutoscaler
-metadata:
-  name: nebula-autoscaler
-spec:
-  nebulaClusterRef:
-    name: nebula
-  graphdPolicy:
-    minReplicas: 2
-    maxReplicas: 5
-    metrics:
-      - type: Resource
-        resource:
-          name: cpu
-          target:
-            type: Utilization
-            averageUtilization: 50
-  pollingPeriod: 30s
diff --git a/config/samples/nebulaautoscaler.yaml b/config/samples/nebulaautoscaler.yaml
new file mode 100644
index 00000000..d16e1bda
--- /dev/null
+++ b/config/samples/nebulaautoscaler.yaml
@@ -0,0 +1,35 @@
+apiVersion: autoscaling.nebula-graph.io/v1alpha1
+kind: NebulaAutoscaler
+metadata:
+  name: nebula-autoscaler
+spec:
+  nebulaClusterRef:
+    name: nebula
+  graphdPolicy:
+    minReplicas: 2
+    maxReplicas: 5
+    metrics:
+      - type: Resource
+        resource:
+          name: cpu
+          target:
+            type: Utilization
+            averageUtilization: 50
+    behavior:
+      scaleDown:
+        stabilizationWindowSeconds: 300
+        policies:
+          - type: Percent
+            value: 100
+            periodSeconds: 15
+      scaleUp:
+        stabilizationWindowSeconds: 0
+        policies:
+          - type: Percent
+            value: 100
+            periodSeconds: 15
+          - type: Pods
+            value: 4
+            periodSeconds: 15
+        selectPolicy: Max
+  pollingPeriod: 30s
diff --git a/config/samples/nebulacluster-auth-sidecar.yaml b/config/samples/nebulacluster-auth-sidecar.yaml
new file mode 100644
index 00000000..237a6192
--- /dev/null
+++ b/config/samples/nebulacluster-auth-sidecar.yaml
@@ -0,0 +1,108 @@
+apiVersion: apps.nebula-graph.io/v1alpha1
+kind: NebulaCluster
+metadata:
+  name: nebula
+spec:
+  graphd:
+    config:
+      ca_client_path: certs/root.crt
+      ca_path: certs/root.crt
+      cert_path: certs/server.crt
+      key_path: certs/server.key
+      enable_graph_ssl: "true"
+    initContainers:
+      - name: init-auth-sidecar
+        command:
+          - /bin/sh
+          - -c
+        args:
+          - cp /certs/* /credentials/
+        imagePullPolicy: Always
+        image: reg.vesoft-inc.com/nebula-certs:latest
+        volumeMounts:
+          - name: credentials
+            mountPath: /credentials
+    sidecarContainers:
+      - name: auth-sidecar
+        imagePullPolicy: Always
+        image: reg.vesoft-inc.com/nebula-certs:latest
+        volumeMounts:
+          - name: credentials
+            mountPath: /credentials
+    volumes:
+      - name: credentials
+        emptyDir:
+          medium: Memory
+    volumeMounts:
+      - name: credentials
+        mountPath: /usr/local/nebula/certs
+    logVolumeClaim:
+      resources:
+        requests:
+          storage: 1Gi
+      storageClassName: local-path
+    resources:
+      requests:
+        cpu: "500m"
+        memory: "500Mi"
+      limits:
+        cpu: "2"
+        memory: "2Gi"
+    replicas: 1
+    image: reg.vesoft-inc.com/nebula-graphd-ent
+    version: v3.6.0
+  metad:
+    licenseManagerURL: "nebula-license-manager-svc:9119"
+    resources:
+      requests:
+        cpu: "500m"
+        memory: "500Mi"
+      limits:
+        cpu: "1"
+        memory: "1Gi"
+    replicas: 1
+    image: reg.vesoft-inc.com/rc/nebula-metad-ent
+    version: v3.6.0
+    dataVolumeClaim:
+      resources:
+        requests:
+          storage: 2Gi
+      storageClassName: local-path
+    logVolumeClaim:
+      resources:
+        requests:
+          storage: 1Gi
+      storageClassName: local-path
+  storaged:
+    resources:
+      requests:
+        cpu: "500m"
+        memory: "500Mi"
+      limits:
+        cpu: "2"
+        memory: "2Gi"
+    replicas: 1
+    image: reg.vesoft-inc.com/nebula-storaged-ent
+    version: v3.6.0
+    dataVolumeClaims:
+      - resources:
+          requests:
+            storage: 2Gi
+        storageClassName: local-path
+    logVolumeClaim:
+      resources:
+        requests:
+          storage: 1Gi
+      storageClassName: local-path
+    enableAutoBalance: true
+  reference:
+    name: statefulsets.apps
+    version: v1
+  schedulerName: default-scheduler
+  imagePullPolicy: Always
+  imagePullSecrets:
+    - name: nebula-image
+  enablePVReclaim: true
+  topologySpreadConstraints:
+    - topologyKey: "kubernetes.io/hostname"
+      whenUnsatisfiable: "ScheduleAnyway"
\ No newline at end of file
diff --git a/config/samples/nebulacluster-zone.yaml b/config/samples/nebulacluster-zone.yaml
new file mode 100644
index 00000000..5a9c197e
--- /dev/null
+++ b/config/samples/nebulacluster-zone.yaml
@@ -0,0 +1,82 @@
+apiVersion: apps.nebula-graph.io/v1alpha1
+kind: NebulaCluster
+metadata:
+  name: nebula
+spec:
+  # Alpine image built with Linux tools.
+  alpineImage: "reg.vesoft-inc.com/nebula-alpine:latest"
+  graphd:
+    config:
+      prioritize_intra_zone_reading: "true"
+      stick_to_intra_zone_on_failure: "true"
+    resources:
+      requests:
+        cpu: "500m"
+        memory: "500Mi"
+      limits:
+        cpu: "2"
+        memory: "2Gi"
+    replicas: 1
+    image: reg.vesoft-inc.com/nebula-graphd-ent
+    version: v3.6.0
+    service:
+      type: NodePort
+      externalTrafficPolicy: Local
+    logVolumeClaim:
+      resources:
+        requests:
+          storage: 1Gi
+      storageClassName: ebs-sc
+  metad:
+    licenseManagerURL: "nebula-license-manager-svc:9119"
+    config:
+      zone_list: az1,az2,az3
+    resources:
+      requests:
+        cpu: "500m"
+        memory: "500Mi"
+      limits:
+        cpu: "1"
+        memory: "1Gi"
+    replicas: 1
+    image: reg.vesoft-inc.com/nebula-metad-ent
+    version: v3.6.0
+    dataVolumeClaim:
+      resources:
+        requests:
+          storage: 5Gi
+      storageClassName: ebs-sc
+    logVolumeClaim:
+      resources:
+        requests:
+          storage: 1Gi
+      storageClassName: ebs-sc
+  storaged:
+    resources:
+      requests:
+        cpu: "500m"
+        memory: "500Mi"
+      limits:
+        cpu: "2"
+        memory: "2Gi"
+    replicas: 3
+    image: reg.vesoft-inc.com/nebula-storaged-ent
+    version: v3.6.0
+    dataVolumeClaims:
+      - resources:
+          requests:
+            storage: 10Gi
+        storageClassName: ebs-sc
+    logVolumeClaim:
+      resources:
+        requests:
+          storage: 1Gi
+      storageClassName: ebs-sc
+  reference:
+    name: statefulsets.apps
+    version: v1
+  schedulerName: nebula-scheduler
+  imagePullPolicy: Always
+  topologySpreadConstraints:
+    - topologyKey: "topology.kubernetes.io/zone"
+      whenUnsatisfiable: "DoNotSchedule"
diff --git a/config/samples/apps_v1alpha1_nebulacluster.yaml b/config/samples/nebulacluster.yaml
similarity index 91%
rename from config/samples/apps_v1alpha1_nebulacluster.yaml
rename to config/samples/nebulacluster.yaml
index bddec234..5e792153 100644
--- a/config/samples/apps_v1alpha1_nebulacluster.yaml
+++ b/config/samples/nebulacluster.yaml
@@ -9,8 +9,8 @@ spec:
         cpu: "500m"
         memory: "500Mi"
       limits:
-        cpu: "1"
-        memory: "1Gi"
+        cpu: "2"
+        memory: "2Gi"
     replicas: 1
     image: vesoft/nebula-graphd
     version: v3.6.0
@@ -23,9 +23,7 @@ spec:
           storage: 1Gi
       storageClassName: ebs-sc
   metad:
-#  license:
-#    secretName: "nebula-license"
-#    licenseKey: "nebula.license"
+#  licenseManagerURL: "nebula-license-manager-svc:9119"
     resources:
       requests:
         cpu: "500m"
@@ -52,8 +50,8 @@ spec:
         cpu: "500m"
         memory: "500Mi"
       limits:
-        cpu: "1"
-        memory: "1Gi"
+        cpu: "2"
+        memory: "2Gi"
     replicas: 3
     image: vesoft/nebula-storaged
     version: v3.6.0
diff --git a/config/samples/apps_v1alpha1_nebularestore.yaml b/config/samples/nebularestore.yaml
similarity index 100%
rename from config/samples/apps_v1alpha1_nebularestore.yaml
rename to config/samples/nebularestore.yaml
diff --git a/doc/user/add-ons.md b/doc/user/add-ons.md
index ad1c54f9..2b2d7d07 100644
--- a/doc/user/add-ons.md
+++ b/doc/user/add-ons.md
@@ -20,7 +20,7 @@ If you set helm chart nebula-operator _.Values.admissionWebhook.create_ to false
 
 [cert-manager](https://cert-manager.io/) is a tool that automates certificate management. It makes use of extending the
 Kubernetes API server using a Webhook server to provide dynamic admission control over cert-manager resources.
 
-Refer to the [cert-manager installation documentation](https://cert-manager.io/docs/installation/kubernetes/) to get
+Refer to the [cert-manager installation documentation](https://cert-manager.io/docs/installation/) to get
 started.
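+
+For example, a typical Helm-based install looks like this (a sketch; the commands follow the cert-manager docs, while the release and namespace names are assumptions):
+
+```shell
+# Add the Jetstack chart repo and install cert-manager together with its CRDs.
+$ helm repo add jetstack https://charts.jetstack.io
+$ helm repo update
+$ helm install cert-manager jetstack/cert-manager --namespace cert-manager --create-namespace --set installCRDs=true
+```
+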
 cert-manager is used for validating the replicas of each NebulaGraph component; if you run it in a production environment and
diff --git a/doc/user/br_guide.md b/doc/user/br_guide.md
index 7d880f4c..f941192b 100644
--- a/doc/user/br_guide.md
+++ b/doc/user/br_guide.md
@@ -51,7 +51,7 @@ The restore flow:
 
 ![avatar](../pictures/restore.png)
 
-Update the [apps_v1alpha1_nebularestore.yaml](../../config/samples/apps_v1alpha1_nebularestore.yaml) fields:
+Update the [nebularestore.yaml](../../config/samples/nebularestore.yaml) fields:
 
 * clusterName
 * backupName
diff --git a/doc/user/custom_config.md b/doc/user/custom_config.md
index e0e96d17..65e188cf 100644
--- a/doc/user/custom_config.md
+++ b/doc/user/custom_config.md
@@ -47,33 +47,30 @@ Afterward, the custom flags _enable_authorize_, _auth_type_ and _foo_ will be c
 This is a table of dynamic runtime flags; a pod rolling update will not be triggered after you apply updates in this scenario:
 - All flags updated are in this table
 
-| Flag | Description | Default |
-|:---|:---|:---|
-| `minloglevel` | Log level, 0, 1, 2, 3 for INFO, WARNING, ERROR, FATAL respectively | `0` |
-| `v` | Verbose log level, 1, 2, 3, 4, the higher of the level, the more verbose of the logging | `0` |
-| `accept_partial_success` | This flag is only used for Read-only access, and Modify access always treats partial success as an error | `false` |
-| `session_reclaim_interval_secs` | Period we try to reclaim expired sessions | `60` |
-| `max_allowed_query_size` | Maximum sentence length, unit byte | `4194304` |
-| `system_memory_high_watermark_ratio` | System memory high watermark ratio, cancel the memory checking when the ratio greater than 1.0 | `0.8` |
-| `ng_black_box_file_lifetime_seconds` | Black box log files expire time | `1800` |
-| `memory_tracker_limit_ratio` | Trackable memory ratio (trackable_memory / (total_memory - untracked_reserved_memory) ) | `0.8` |
-| `memory_tracker_untracked_reserved_memory_mb` | Untracked reserved memory in Mib | `50` |
-| `memory_tracker_detail_log` | Enable log memory tracker stats periodically | `false` |
-| `memory_tracker_detail_log_interval_ms` | Log memory tacker stats interval in milliseconds | `60000` |
-| `memory_purge_enabled` | Enable memory background purge (if jemalloc is used) | `true` |
-| `memory_purge_interval_seconds` | Memory background purge interval in seconds | `10` |
-| `heartbeat_interval_secs` | Heartbeat interval in seconds | `10` |
-| `raft_heartbeat_interval_secs` | Raft election timeout | `30` |
-| `raft_rpc_timeout_ms` | RPC timeout for raft client (ms) | `500` |
-| `query_concurrently` | Whether turn on query in multiple thread | `true` |
-| `wal_ttl` | Recycle Raft WAL | `14400` |
-| `auto_remove_invalid_space` | Whether remove outdated space data | `true` |
-| `num_io_threads` | Network IO threads number | `16` |
-| `num_worker_threads` | Worker threads number to handle request | `32` |
-| `max_concurrent_subtasks` | Maximum subtasks to run admin jobs concurrently | `10` |
-| `snapshot_part_rate_limit` | The rate limit in bytes when leader synchronizes snapshot data | `10485760` |
-| `snapshot_batch_size` | The amount of data sent in each batch when leader synchronizes snapshot data | `1048576` |
-| `rebuild_index_part_rate_limit` | The rate limit in bytes when leader synchronizes rebuilding index | `4194304` |
-| `rocksdb_db_options` | Rocksdb DBOptions in json, each name and value of option is a string, given as "option_name":"option_value" separated by comma | `{}` |
-| `rocksdb_column_family_options` | Rocksdb ColumnFamilyOptions in json, each name and value of option is string, given as "option_name":"option_value" separated by comma | `{"write_buffer_size":"67108864","max_write_buffer_number":"4","max_bytes_for_level_base":"268435456"}` |
-| `rocksdb_block_based_table_options` | Rocksdb BlockBasedTableOptions in json, each name and value of option is string, given as "option_name":"option_value" separated by comma | `{"block_size":"8192"}` |
\ No newline at end of file
+| Flag | Description | Default |
+|:---|:---|:---|
+| `minloglevel` | Log level, 0, 1, 2, 3 for INFO, WARNING, ERROR, FATAL respectively | `0` |
+| `v` | Verbose log level, 1, 2, 3, 4; the higher the level, the more verbose the logging | `0` |
+| `accept_partial_success` | This flag is only used for Read-only access, and Modify access always treats partial success as an error | `false` |
+| `session_reclaim_interval_secs` | Period we try to reclaim expired sessions | `60` |
+| `max_allowed_query_size` | Maximum sentence length, unit byte | `4194304` |
+| `system_memory_high_watermark_ratio` | System memory high watermark ratio; memory checking is canceled when the ratio is greater than 1.0 | `0.8` |
+| `ng_black_box_file_lifetime_seconds` | Black box log files expire time | `1800` |
+| `memory_tracker_limit_ratio` | Trackable memory ratio (trackable_memory / (total_memory - untracked_reserved_memory)) | `0.8` |
+| `memory_tracker_untracked_reserved_memory_mb` | Untracked reserved memory in MiB | `50` |
+| `memory_tracker_detail_log` | Enable logging memory tracker stats periodically | `false` |
+| `memory_tracker_detail_log_interval_ms` | Log memory tracker stats interval in milliseconds | `60000` |
+| `memory_purge_enabled` | Enable memory background purge (if jemalloc is used) | `true` |
+| `memory_purge_interval_seconds` | Memory background purge interval in seconds | `10` |
+| `heartbeat_interval_secs` | Heartbeat interval in seconds | `10` |
+| `raft_heartbeat_interval_secs` | Raft election timeout | `30` |
+| `raft_rpc_timeout_ms` | RPC timeout for raft client (ms) | `500` |
+| `query_concurrently` | Whether to run queries in multiple threads | `true` |
+| `wal_ttl` | Recycle Raft WAL | `14400` |
+| `auto_remove_invalid_space` | Whether to remove outdated space data | `true` |
+| `snapshot_part_rate_limit` | The rate limit in bytes when the leader synchronizes snapshot data | `10485760` |
+| `snapshot_batch_size` | The amount of data sent in each batch when the leader synchronizes snapshot data | `1048576` |
+| `rebuild_index_part_rate_limit` | The rate limit in bytes when the leader synchronizes rebuilding index | `4194304` |
+| `stick_to_intra_zone_on_failure` | Stick to intra-zone routing if unable to find the storaged hosting the requested part in the same zone | `false` |
+| `sync_meta_when_use_space` | Whether to sync the session to meta when using a space | `false` |
+| `validate_session_timestamp` | Whether to validate the timestamp when updating a session | `true` |
diff --git a/doc/user/intra_zone.md b/doc/user/intra_zone.md
index 918f8960..5e0d9d6d 100644
--- a/doc/user/intra_zone.md
+++ b/doc/user/intra_zone.md
@@ -26,11 +26,11 @@ spec:
       stick_to_intra_zone_on_failure: "true"
     resources:
       requests:
-        cpu: "200m"
+        cpu: "500m"
         memory: "500Mi"
       limits:
-        cpu: "1"
-        memory: "1Gi"
+        cpu: "2"
+        memory: "2Gi"
     logVolumeClaim:
       resources:
         requests:
@@ -46,7 +46,7 @@ spec:
     licenseManagerURL: "192.168.8.53:9119"
     resources:
       requests:
-        cpu: "300m"
+        cpu: "500m"
         memory: "500Mi"
       limits:
         cpu: "1"
diff --git a/doc/user/log_guide.md b/doc/user/log_guide.md
index 48e6720d..4607c75b 100644
--- a/doc/user/log_guide.md
+++ b/doc/user/log_guide.md
@@ -42,9 +42,8 @@ spec:
       redirect_stdout: "false"
       # The numbers of severity level INFO, WARNING, ERROR, and FATAL are 0, 1, 2, and 3, respectively.
       stderrthreshold: "0"
-    env:
-      - name: GLOG_logtostderr # Logs are written to standard error instead of to files
-        value: "1"
+      # Logs are written to standard error instead of to files
+      logtostderr: "true"
     image: vesoft/nebula-graphd
     replicas: 1
     resources:
@@ -60,14 +59,12 @@ spec:
     config:
       redirect_stdout: "false"
       stderrthreshold: "0"
+      logtostderr: "true"
     dataVolumeClaim:
       resources:
         requests:
           storage: 1Gi
       storageClassName: ebs-sc
-    env:
-      - name: GLOG_logtostderr
-        value: "1"
     image: vesoft/nebula-metad
     replicas: 1
     resources:
@@ -83,6 +80,7 @@ spec:
     config:
       redirect_stdout: "false"
       stderrthreshold: "0"
+      logtostderr: "true"
     dataVolumeClaims:
       - resources:
           requests:
@@ -90,9 +88,6 @@ spec:
           storageClassName: ebs-sc
     enableAutoBalance: true
     enableForceUpdate: false
-    env:
-      - name: GLOG_logtostderr
-        value: "1"
     image: vesoft/nebula-storaged
     replicas: 1
     resources:
diff --git a/doc/user/nebula_autoscaler.md b/doc/user/nebula_autoscaler.md
new file mode 100644
index 00000000..25cee5fa
--- /dev/null
+++ b/doc/user/nebula_autoscaler.md
@@ -0,0 +1,152 @@
+# nebula-autoscaler
+
+The nebula-autoscaler is fully compatible with the Kubernetes HPA, and you can use it according to the HPA's operating mechanism.
+Currently, the nebula-autoscaler only supports automatic scaling of Graphd.
+Please refer to the [HPA](https://kubernetes.io/docs/tasks/run-application/horizontal-pod-autoscale/) documentation for more information.
+
+### Deploy metrics-server
+Please refer to the repo of [metrics-server](https://github.com/kubernetes-sigs/metrics-server).
+```shell
+kubectl apply -f https://github.com/kubernetes-sigs/metrics-server/releases/latest/download/components.yaml
+
+# Modify the startup parameters and add the flag kubelet-insecure-tls to skip certificate verification between metrics-server and the kubelet.
+# For testing purposes only.
+- --cert-dir=/tmp
+- --secure-port=4443
+- --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname
+- --kubelet-use-node-status-port
+- --kubelet-insecure-tls
+- --metric-resolution=15s
+```
+
+### Test metrics API
+```shell
+$ kubectl get --raw "/apis/metrics.k8s.io/v1beta1/namespaces/default/pods/nebula-graphd-1" | jq '.'
+{
+  "kind": "PodMetrics",
+  "apiVersion": "metrics.k8s.io/v1beta1",
+  "metadata": {
+    "name": "nebula-graphd-1",
+    "namespace": "default",
+    "creationTimestamp": "2023-09-27T13:39:54Z",
+    "labels": {
+      "app.kubernetes.io/cluster": "nebula",
+      "app.kubernetes.io/component": "graphd",
+      "app.kubernetes.io/managed-by": "nebula-operator",
+      "app.kubernetes.io/name": "nebula-graph",
+      "controller-revision-hash": "nebula-graphd-56cf5f8b66",
+      "statefulset.kubernetes.io/pod-name": "nebula-graphd-1"
+    }
+  },
+  "timestamp": "2023-09-27T13:39:48Z",
+  "window": "15.015s",
+  "containers": [
+    {
+      "name": "graphd",
+      "usage": {
+        "cpu": "323307n",
+        "memory": "12644Ki"
+      }
+    }
+  ]
+}
+
+$ kubectl get --raw "/apis/metrics.k8s.io/v1beta1/nodes/192-168-8-35" | jq '.'
+{
+  "kind": "NodeMetrics",
+  "apiVersion": "metrics.k8s.io/v1beta1",
+  "metadata": {
+    "name": "192-168-8-35",
+    "creationTimestamp": "2023-09-27T14:00:13Z",
+    "labels": {
+      "beta.kubernetes.io/arch": "amd64",
+      "beta.kubernetes.io/os": "linux",
+      "kubernetes.io/arch": "amd64",
+      "kubernetes.io/hostname": "192-168-8-35",
+      "kubernetes.io/os": "linux",
+      "nebula": "cloud",
+      "node-role.kubernetes.io/control-plane": "",
+      "node.kubernetes.io/exclude-from-external-load-balancers": ""
+    }
+  },
+  "timestamp": "2023-09-27T14:00:00Z",
+  "window": "20.045s",
+  "usage": {
+    "cpu": "164625163n",
+    "memory": "8616740Ki"
+  }
+}
+```
+
+### Autoscaler
+Here is a sample without behavior:
+```yaml
+apiVersion: autoscaling.nebula-graph.io/v1alpha1
+kind: NebulaAutoscaler
+metadata:
+  name: nebula-autoscaler
+spec:
+  nebulaClusterRef:
+    name: nebula
+  graphdPolicy:
+    minReplicas: 2
+    maxReplicas: 5
+    metrics:
+      - type: Resource
+        resource:
+          name: cpu
+          target:
+            type: Utilization
+            averageUtilization: 50
+  pollingPeriod: 30s
+
+```
+
+Here is a sample with behavior:
+```yaml
+apiVersion: autoscaling.nebula-graph.io/v1alpha1
+kind: NebulaAutoscaler
+metadata:
+  name: nebula-autoscaler
+spec:
+  nebulaClusterRef:
+    name: nebula
+  graphdPolicy:
+    minReplicas: 2
+    maxReplicas: 5
+    metrics:
+      - type: Resource
+        resource:
+          name: cpu
+          target:
+            type: Utilization
+            averageUtilization: 50
+    behavior:
+      scaleDown:
+        stabilizationWindowSeconds: 300
+        policies:
+          - type: Percent
+            value: 100
+            periodSeconds: 15
+      scaleUp:
+        stabilizationWindowSeconds: 0
+        policies:
+          - type: Percent
+            value: 100
+            periodSeconds: 15
+          - type: Pods
+            value: 4
+            periodSeconds: 15
+        selectPolicy: Max
+  pollingPeriod: 30s
+```
+
+### Verify status
+```shell
+$ kubectl get nc
+NAME     READY   GRAPHD-DESIRED   GRAPHD-READY   METAD-DESIRED   METAD-READY   STORAGED-DESIRED   STORAGED-READY   AGE
+nebula   True    2                2              1               1             3                  3                20h
+$ kubectl get na
+NAME                REFERENCE   MIN-REPLICAS   MAX-REPLICAS   CURRENT-REPLICAS   ACTIVE   ABLETOSCALE   LIMITED   READY   AGE
+nebula-autoscaler   nebula      2              5              2                  True     True          True      True    19h
+```
\ No newline at end of file
diff --git a/doc/user/nebula_cluster_helm_guide.md b/doc/user/nebula_cluster_helm_guide.md
index 0af47a27..b0000de0 100644
--- a/doc/user/nebula_cluster_helm_guide.md
+++ b/doc/user/nebula_cluster_helm_guide.md
@@ -2,7 +2,7 @@
 
 Please install [nebula-operator](install_guide.md) before installing NebulaGraph cluster.
 
-### Get Repo Info
+### Get repo info
 
 ```shell script
 # If you have already added it, please skip.
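+# A sketch of the typical commands; the chart repo URL is an assumption taken from the nebula-operator install guide.
+helm repo add nebula-operator https://vesoft-inc.github.io/nebula-operator/charts
+helm repo update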
diff --git a/doc/user/nebula_port.md b/doc/user/nebula_port.md
new file mode 100644
index 00000000..9da79f31
--- /dev/null
+++ b/doc/user/nebula_port.md
@@ -0,0 +1,100 @@
+### Configurable Nebula ports
+
+We provide the fields `port` and `httpPort` in the CRD to define the port settings for each component in NebulaGraph.
+- The Thrift port can be configured upon creation, but changes are prohibited while the cluster is running.
+- The HTTP port can be configured at any time.
+
+Here is a configuration file for a NebulaCluster with custom HTTP ports:
+```yaml
+apiVersion: apps.nebula-graph.io/v1alpha1
+kind: NebulaCluster
+metadata:
+  name: nebula
+  namespace: default
+spec:
+  graphd:
+    port: 9669
+    httpPort: 8080
+    config:
+      logtostderr: "true"
+      redirect_stdout: "false"
+      stderrthreshold: "0"
+    resources:
+      requests:
+        cpu: "200m"
+        memory: "500Mi"
+      limits:
+        cpu: "1"
+        memory: "1Gi"
+    replicas: 1
+    image: vesoft/nebula-graphd
+    version: v3.6.0
+  metad:
+    port: 9559
+    httpPort: 8081
+    config:
+      redirect_stdout: "false"
+      stderrthreshold: "0"
+      logtostderr: "true"
+    resources:
+      requests:
+        cpu: "300m"
+        memory: "500Mi"
+      limits:
+        cpu: "1"
+        memory: "1Gi"
+    replicas: 1
+    image: vesoft/nebula-metad
+    version: v3.6.0
+    dataVolumeClaim:
+      resources:
+        requests:
+          storage: 2Gi
+      storageClassName: local-path
+  storaged:
+    port: 9779
+    httpPort: 8082
+    config:
+      redirect_stdout: "false"
+      stderrthreshold: "0"
+      logtostderr: "true"
+    resources:
+      requests:
+        cpu: "300m"
+        memory: "500Mi"
+      limits:
+        cpu: "1"
+        memory: "1Gi"
+    replicas: 1
+    image: vesoft/nebula-storaged
+    version: v3.6.0
+    dataVolumeClaims:
+      - resources:
+          requests:
+            storage: 2Gi
+        storageClassName: local-path
+    enableAutoBalance: true
+  reference:
+    name: statefulsets.apps
+    version: v1
+  schedulerName: default-scheduler
+  imagePullPolicy: IfNotPresent
+  imagePullSecrets:
+    - name: nebula-image
+  enablePVReclaim: true
+  topologySpreadConstraints:
+    - topologyKey: kubernetes.io/hostname
+      whenUnsatisfiable: "ScheduleAnyway"
+```
+
+Verify the configuration:
+```shell
+$ kubectl get svc
+NAME                       TYPE        CLUSTER-IP      EXTERNAL-IP   PORT(S)                      AGE
+nebula-graphd-headless     ClusterIP   None            <none>        9669/TCP,8080/TCP            10m
+nebula-graphd-svc          ClusterIP   10.102.13.115   <none>        9669/TCP,8080/TCP            10m
+nebula-metad-headless      ClusterIP   None            <none>        9559/TCP,8081/TCP            11m
+nebula-storaged-headless   ClusterIP   None            <none>        9779/TCP,8082/TCP,9778/TCP   11m
+$ curl 10.102.13.115:8080/status
+{"git_info_sha":"537f942","status":"running"}
+```
\ No newline at end of file
diff --git a/doc/user/pv_expansion.md b/doc/user/pv_expansion.md
new file mode 100644
index 00000000..2c312791
--- /dev/null
+++ b/doc/user/pv_expansion.md
@@ -0,0 +1,30 @@
+# PV Expansion
+
+Volume expansion was introduced as an alpha feature in Kubernetes 1.8, went beta in 1.11, and reached general
+availability (GA) in Kubernetes 1.24.
+
+This feature allows Kubernetes users to simply edit their PersistentVolumeClaim objects and specify a new size in the PVC spec;
+Kubernetes will automatically expand the volume using the storage backend and also expand the underlying file system in use
+by the Pod, without requiring any downtime at all if possible.
+
+Not every volume type, however, is expandable by default. Some volume types, such as in-tree hostPath volumes, are not expandable at all.
+For CSI volumes, the CSI driver must have the EXPAND_VOLUME capability in the controller or node service (or both, as appropriate).
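+
+To check which storage classes in a cluster currently allow expansion, one option is standard kubectl output formatting (a sketch; `allowVolumeExpansion` is a top-level StorageClass field):
+
+```shell
+# Print each storage class together with its expansion setting.
+$ kubectl get storageclass -o custom-columns='NAME:.metadata.name,PROVISIONER:.provisioner,EXPANDABLE:.allowVolumeExpansion'
+```
+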
+Please refer to the volume expansion documentation for the in-tree volume types that support volume expansion: [Expanding Persistent Volumes](https://kubernetes.io/docs/concepts/storage/persistent-volumes/#expanding-persistent-volumes-claims).
+
+In general, to provide some degree of control over which volumes can be expanded, only dynamically provisioned PVCs whose storage class
+has the allowVolumeExpansion parameter set to true are expandable.
+
+A Kubernetes cluster administrator must edit the appropriate StorageClass object and set the allowVolumeExpansion field to true. For example:
+```shell
+$ kubectl patch storageclass ebs-sc -p '{"allowVolumeExpansion": true}'
+```
+
+After allowVolumeExpansion is enabled, perform the following operation to expand the PV capacity:
+```shell
+$ kubectl patch nc nebula --type='merge' --patch '{"spec": {"storaged": {"dataVolumeClaims":[{"resources": {"requests": {"storage": "100Gi"}}, "storageClassName": "ebs-sc"}]}}}'
+```
+
+After the expansion is successful, `kubectl get pvc -n <namespace>` still displays the original size, but viewing the PV will show that it has expanded to the expected size.
+```shell
+kubectl get pv | grep <pv-name>
+```
diff --git a/doc/user/webhook.md b/doc/user/webhook.md
new file mode 100644
index 00000000..d902fc87
--- /dev/null
+++ b/doc/user/webhook.md
@@ -0,0 +1,129 @@
+# Admission webhook
+
+Admission webhooks are HTTP callbacks that receive admission requests and do something with them. There are two types of admission webhooks:
+validating admission webhooks and mutating admission webhooks. Mutating admission webhooks are invoked first, and can modify objects sent to the API server
+to enforce custom defaults. After all object modifications are complete, and after the incoming object is validated by the API server,
+validating admission webhooks are invoked and can reject requests to enforce custom policies.
+
+The nebula-operator controller-manager starts a built-in admission webhook server and manages policies about how to validate NebulaCluster resources.
+
+Follow this guide to enable the webhook.
+
+### Deploy cert-manager
+Refer to the [cert-manager installation](https://cert-manager.io/docs/installation) to get started.
+
+### Enable admission webhook
+```yaml
+# In the helm chart nebula-operator values.yaml, set `create` to true
+admissionWebhook:
+  create: true
+  # The TCP port the webhook server binds to. (default 9443)
+  webhookBindPort: 9443
+```
+
+Verify the status of the Issuer and Certificate resources:
+```yaml
+apiVersion: cert-manager.io/v1
+kind: Issuer
+metadata:
+  annotations:
+    meta.helm.sh/release-name: nebula-operator
+    meta.helm.sh/release-namespace: default
+  creationTimestamp: "2023-09-29T04:15:20Z"
+  generation: 1
+  labels:
+    app.kubernetes.io/component: admission-webhook
+    app.kubernetes.io/instance: nebula-operator
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/name: nebula-operator
+    app.kubernetes.io/version: 1.7.0
+    helm.sh/chart: nebula-operator-1.7.0
+  name: nebula-operator-webhook-issuer
+  namespace: default
+  resourceVersion: "109935202"
+  uid: 244015eb-2991-4cb9-befc-a35fba0eadce
+spec:
+  selfSigned: {}
+status:
+  conditions:
+    - lastTransitionTime: "2023-09-29T04:15:20Z"
+      observedGeneration: 1
+      reason: IsReady
+      status: "True"
+      type: Ready
+
+---
+apiVersion: cert-manager.io/v1
+kind: Certificate
+metadata:
+  annotations:
+    meta.helm.sh/release-name: nebula-operator
+    meta.helm.sh/release-namespace: default
+  creationTimestamp: "2023-09-29T04:15:20Z"
+  generation: 1
+  labels:
+    app.kubernetes.io/component: admission-webhook
+    app.kubernetes.io/instance: nebula-operator
+    app.kubernetes.io/managed-by: Helm
+    app.kubernetes.io/name: nebula-operator
+    app.kubernetes.io/version: 1.7.0
+    helm.sh/chart: nebula-operator-1.7.0
+  name: nebula-operator-webhook-cert
+  namespace: default
+  resourceVersion: "109935196"
+  uid: 7b03e317-354e-4cad-9832-96a74453c462
+spec:
+  dnsNames:
+    - nebula-operator-webhook-service.default.svc
+    - nebula-operator-webhook-service.default.svc.cluster.local
+  issuerRef:
+    kind: Issuer
+    name: nebula-operator-webhook-issuer
+  secretName: nebula-operator-webhook-secret
+status:
+  conditions:
+    - lastTransitionTime: "2023-09-29T04:15:20Z"
+      message: Certificate is up to date and has not expired
+      observedGeneration: 1
+      reason: Ready
+      status: "True"
+      type: Ready
+  notAfter: "2023-12-19T18:03:06Z"
+  notBefore: "2023-09-20T18:03:06Z"
+  renewalTime: "2023-11-19T18:03:06Z"
+```
+
+### Validate rules
+- Append a storage volume
+```shell
+$ kubectl patch nc nebula --type='merge' --patch '{"spec": {"storaged": {"dataVolumeClaims":[{"resources": {"requests": {"storage": "2Gi"}}, "storageClassName": "local-path"},{"resources": {"requests": {"storage": "3Gi"}}, "storageClassName": "fast-disks"}]}}}'
+Error from server: admission webhook "nebulaclustervalidating.nebula-graph.io" denied the request: spec.storaged.dataVolumeClaims: Forbidden: storaged dataVolumeClaims is immutable
+```
+
+- Shrink a PV
+```shell
+$ kubectl patch nc nebula --type='merge' --patch '{"spec": {"storaged": {"dataVolumeClaims":[{"resources": {"requests": {"storage": "1Gi"}}, "storageClassName": "fast-disks"}]}}}'
+Error from server: admission webhook "nebulaclustervalidating.nebula-graph.io" denied the request: spec.storaged.dataVolumeClaims: Invalid value: resource.Quantity{i:resource.int64Amount{value:1073741824, scale:0}, d:resource.infDecAmount{Dec:(*inf.Dec)(nil)}, s:"1Gi", Format:"BinarySI"}: data volume size can only be increased
+```
+
+- Modify Thrift ports
+```shell
+$ kubectl patch nc nebula --type='merge' --patch '{"spec": {"graphd": {"port": 8669}}}'
+Error from server: admission webhook "nebulaclustervalidating.nebula-graph.io" denied the request: spec.graphd.port: Invalid value: 8669: field is immutable
+```
+
+- Intermediate state scaling
+```shell
+$ kubectl patch nc nebula --type='merge' --patch '{"spec": {"storaged": {"replicas": 5}}}'
+nebulacluster.apps.nebula-graph.io/nebula patched
+$ kubectl patch nc nebula --type='merge' --patch '{"spec": {"storaged": {"replicas": 3}}}'
+Error from server: admission webhook "nebulaclustervalidating.nebula-graph.io" denied the request: [spec.storaged: Forbidden: field is immutable while in ScaleOut phase, spec.storaged.replicas: Invalid value: 3: field is immutable while not in Running phase]
+```
+
+- HA mode
+```shell
+# Create a nebula cluster with 2 graphd, 3 metad, and 3 storaged to meet the minimum HA configuration requirement.
+$ kubectl annotate nc nebula nebula-graph.io/ha-mode=true
+$ kubectl patch nc nebula --type='merge' --patch '{"spec": {"graphd": {"replicas":1}}}'
+Error from server: admission webhook "nebulaclustervalidating.nebula-graph.io" denied the request: spec.graphd.replicas: Invalid value: 1: should be at least 2 in HA mode
+```
\ No newline at end of file
diff --git a/pkg/webhook/util/validation/validation.go b/pkg/webhook/util/validation/validation.go
index eb6e2a8a..6a0dedd4 100644
--- a/pkg/webhook/util/validation/validation.go
+++ b/pkg/webhook/util/validation/validation.go
@@ -23,8 +23,8 @@ import (
 )
 
 const (
-	fmtNotHaModeErrorDetail = "should be at least %d not in ha mode"
-	fmtHaModeErrorDetail    = "should be at least %d in ha mode"
+	fmtNotHaModeErrorDetail = "should be at least %d not in HA mode"
+	fmtHaModeErrorDetail    = "should be at least %d in HA mode"
 
 	oddNumberDetail = "should be odd number"
 )
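
As a quick end-to-end check that the admission webhook described in doc/user/webhook.md is registered and serving, you can list the validating webhook configurations (a sketch; the exact object name depends on the Helm release name):

```shell
# The nebula-operator chart registers a ValidatingWebhookConfiguration; grep for it by name.
$ kubectl get validatingwebhookconfigurations | grep nebula
```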