diff --git a/CHANGELOG.md b/CHANGELOG.md index cb711f034..f1a06d5a4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,8 +3,9 @@ ## [master](https://github.com/arangodb/kube-arangodb/tree/master) (N/A) - (Maintenance) Update go-driver to v1.6.0, update IsNotFound() checks - (Improvement) Print assigned node name to log and condition message when pod is scheduled +- (Maintenance) Remove obsolete docs, restructure for better UX, generate index files -## [1.2.34](https://github.com/arangodb/kube-arangodb/tree/1.2.34) (2023-10-16 +## [1.2.34](https://github.com/arangodb/kube-arangodb/tree/1.2.34) (2023-10-16) - (Bugfix) Fix make manifests-crd-file command - (Improvement) Allow tcp:// and ssl:// protocols in endpoints for members - (Maintenance) Reorganize package imports / move common code to separate repos diff --git a/README.md b/README.md index 4439c4a72..3ad0657a1 100644 --- a/README.md +++ b/README.md @@ -58,33 +58,33 @@ covers individual newer features separately. #### Operator Features -| Feature | Operator Version | Introduced | ArangoDB Version | ArangoDB Edition | State | Enabled | Flag | Remarks | -|:-------------------------------------------------------------------------------------|:-----------------|:-----------|:-----------------|:----------------------|:-------------|:--------|:------------------------------------------------------|:-----------------------------------------------------------------------------------| -| Enforced ResignLeadership | 1.2.34 | 1.2.34 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.enforced-resign-leadership | Enforce ResignLeadership and ensure that Leaders are moved from restarted DBServer | -| Copy resources spec to init containers | 1.2.33 | 1.2.33 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.init-containers-copy-resources | Copy resources spec to built-in init containers if they are not specified | -| [Rebalancer V2](docs/design/features/rebalancer_v2.md) | 1.2.31 | 1.2.31 | >= 3.10.0 | Community, Enterprise | Alpha | False | --deployment.feature.rebalancer-v2 | N/A | -| [Secured containers](docs/design/features/secured_containers.md) | 1.2.31 | 1.2.31 | >= 3.8.0 | Community, Enterprise | Alpha | False | --deployment.feature.secured-containers | If set to True Operator will run containers in secure mode | -| Version Check V2 | 1.2.31 | 1.2.31 | >= 3.8.0 | Community, Enterprise | Alpha | False | --deployment.feature.upgrade-version-check-V2 | N/A | -| [Operator Ephemeral Volumes](docs/design/features/ephemeral_volumes.md) | 1.2.31 | 1.2.2 | >= 3.8.0 | Community, Enterprise | Beta | False | --deployment.feature.ephemeral-volumes | N/A | -| [Force Rebuild Out Synced Shards](docs/design/features/rebuild_out_synced_shards.md) | 1.2.27 | 1.2.27 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.force-rebuild-out-synced-shards | It should be used only if user is aware of the risks. 
| -| [Spec Default Restore](docs/design/features/deployment_spec_defaults.md) | 1.2.25 | 1.2.21 | >= 3.8.0 | Community, Enterprise | Beta | True | --deployment.feature.deployment-spec-defaults-restore | If set to False Operator will not change ArangoDeployment Spec | -| Version Check | 1.2.23 | 1.1.4 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.upgrade-version-check | N/A | -| [Failover Leader service](docs/design/features/failover_leader_service.md) | 1.2.13 | 1.2.13 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.failover-leadership | N/A | -| Graceful Restart | 1.2.5 | 1.0.7 | >= 3.8.0 | Community, Enterprise | Production | True | ---deployment.feature.graceful-shutdown | N/A | -| Optional Graceful Restart | 1.2.0 | 1.2.5 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.optional-graceful-shutdown | N/A | -| Operator Internal Metrics Exporter | 1.2.0 | 1.2.0 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.metrics-exporter | N/A | -| Operator Maintenance Management Support | 1.2.0 | 1.0.7 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.maintenance | N/A | -| Encryption Key Rotation Support | 1.2.0 | 1.0.3 | >= 3.8.0 | Enterprise | NotSupported | False | --deployment.feature.encryption-rotation | N/A | -| TLS Runtime Rotation Support | 1.1.0 | 1.0.4 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.tls-rotation | N/A | -| JWT Rotation Support | 1.1.0 | 1.0.3 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.jwt-rotation | N/A | -| Operator Single Mode | 1.0.4 | 1.0.4 | >= 3.8.0 | Community, Enterprise | Production | False | --mode.single | Only 1 instance of Operator allowed in namespace when feature is enabled | -| TLS SNI Support | 1.0.3 | 1.0.3 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.tls-sni | N/A | -| Disabling of liveness probes | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | -| Pod Disruption Budgets | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | -| Prometheus Metrics Exporter | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | Prometheus required | -| Sidecar Containers | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | -| Volume Claim Templates | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | -| Volume Resizing | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | +| Feature | Operator Version | Introduced | ArangoDB Version | ArangoDB Edition | State | Enabled | Flag | Remarks | +|:------------------------------------------------------------------------------|:-----------------|:-----------|:-----------------|:----------------------|:-------------|:--------|:------------------------------------------------------|:-----------------------------------------------------------------------------------| +| Enforced ResignLeadership | 1.2.34 | 1.2.34 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.enforced-resign-leadership | Enforce ResignLeadership and ensure that Leaders are moved from restarted DBServer | +| Copy resources spec to init containers | 1.2.33 | 1.2.33 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.init-containers-copy-resources | Copy resources spec to built-in init containers if they are 
not specified | +| [Rebalancer V2](docs/features/rebalancer_v2.md) | 1.2.31 | 1.2.31 | >= 3.10.0 | Community, Enterprise | Alpha | False | --deployment.feature.rebalancer-v2 | N/A | +| [Secured containers](docs/features/secured_containers.md) | 1.2.31 | 1.2.31 | >= 3.8.0 | Community, Enterprise | Alpha | False | --deployment.feature.secured-containers | If set to True Operator will run containers in secure mode | +| Version Check V2 | 1.2.31 | 1.2.31 | >= 3.8.0 | Community, Enterprise | Alpha | False | --deployment.feature.upgrade-version-check-V2 | N/A | +| [Operator Ephemeral Volumes](docs/features/ephemeral_volumes.md) | 1.2.31 | 1.2.2 | >= 3.8.0 | Community, Enterprise | Beta | False | --deployment.feature.ephemeral-volumes | N/A | +| [Force Rebuild Out Synced Shards](docs/features/rebuild_out_synced_shards.md) | 1.2.27 | 1.2.27 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.force-rebuild-out-synced-shards | It should be used only if user is aware of the risks. | +| [Spec Default Restore](docs/features/deployment_spec_defaults.md) | 1.2.25 | 1.2.21 | >= 3.8.0 | Community, Enterprise | Beta | True | --deployment.feature.deployment-spec-defaults-restore | If set to False Operator will not change ArangoDeployment Spec | +| Version Check | 1.2.23 | 1.1.4 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.upgrade-version-check | N/A | +| [Failover Leader service](docs/features/failover_leader_service.md) | 1.2.13 | 1.2.13 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.failover-leadership | N/A | +| Graceful Restart | 1.2.5 | 1.0.7 | >= 3.8.0 | Community, Enterprise | Production | True | ---deployment.feature.graceful-shutdown | N/A | +| Optional Graceful Restart | 1.2.0 | 1.2.5 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.optional-graceful-shutdown | N/A | +| Operator Internal Metrics Exporter | 1.2.0 | 1.2.0 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.metrics-exporter | N/A | +| Operator Maintenance Management Support | 1.2.0 | 1.0.7 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.maintenance | N/A | +| Encryption Key Rotation Support | 1.2.0 | 1.0.3 | >= 3.8.0 | Enterprise | NotSupported | False | --deployment.feature.encryption-rotation | N/A | +| TLS Runtime Rotation Support | 1.1.0 | 1.0.4 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.tls-rotation | N/A | +| JWT Rotation Support | 1.1.0 | 1.0.3 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.jwt-rotation | N/A | +| Operator Single Mode | 1.0.4 | 1.0.4 | >= 3.8.0 | Community, Enterprise | Production | False | --mode.single | Only 1 instance of Operator allowed in namespace when feature is enabled | +| TLS SNI Support | 1.0.3 | 1.0.3 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.tls-sni | N/A | +| Disabling of liveness probes | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | +| Pod Disruption Budgets | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | +| Prometheus Metrics Exporter | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | Prometheus required | +| Sidecar Containers | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | +| Volume Claim Templates | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | +| Volume Resizing | 
0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | @@ -97,7 +97,7 @@ To upgrade to the Enterprise Edition, you need to get in touch with the ArangoDB |:-------------------------------------------------------|:-----------------|:-----------|:-----------------|:-----------------|:-----------|:--------|:-----|:----------------------------------------------------------------------------| | AgencyCache | 1.2.30 | 1.2.30 | >= 3.8.0 | Enterprise | Production | True | N/A | Enable Agency Cache mechanism in the Operator (Increase limit of the nodes) | | Member Maintenance Support | 1.2.25 | 1.2.16 | >= 3.8.0 | Enterprise | Production | True | N/A | Enable Member Maintenance during planned restarts | -| [Rebalancer](docs/design/features/rebalancer.md) | 1.2.15 | 1.2.5 | >= 3.8.0 | Enterprise | Production | True | N/A | N/A | +| [Rebalancer](docs/features/rebalancer.md) | 1.2.15 | 1.2.5 | >= 3.8.0 | Enterprise | Production | True | N/A | N/A | | [TopologyAwareness](docs/design/topology_awareness.md) | 1.2.4 | 1.2.4 | >= 3.8.0 | Enterprise | Production | True | N/A | N/A | diff --git a/docs/README.md b/docs/README.md index 4bb0b7274..1b6e9a982 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2,10 +2,10 @@ - [Tutorial](https://www.arangodb.com/docs/stable/tutorials-kubernetes.html) - [Documentation](https://www.arangodb.com/docs/stable/deployment-kubernetes.html) -- [Design documents](./design/README.md) -- [Providers](./providers/README.md) - - -# ArangoDB Kubernetes Operator Generated Documentation -- [ArangoDB Operator Metrics & Alerts](./generated/metrics/README.md) -- [ArangoDB Actions](./generated/actions.md) +- [Architecture](./design/README.md) +- [Features description and usage](./features/README.md) +- [Custom Resources API Reference](./api/README.md) +- [Operator Metrics & Alerts](./generated/metrics/README.md) +- [Operator Actions](./generated/actions.md) +- Known issues (TBD) +- [How-to ...](how-to/README.md) diff --git a/docs/api/ArangoDeployment.V1.md b/docs/api/ArangoDeployment.V1.md index 8e2631c91..4bf44623d 100644 --- a/docs/api/ArangoDeployment.V1.md +++ b/docs/api/ArangoDeployment.V1.md @@ -597,7 +597,7 @@ Architecture defines the list of supported architectures. First element on the list is marked as default architecture. Links: -* [Architecture Change](/docs/design/arch_change.md) +* [Architecture Change](/docs/how-to/arch_change.md) Default Value: ['amd64'] diff --git a/docs/api/ArangoMember.V1.md b/docs/api/ArangoMember.V1.md index 8828eacb1..f295d0ae0 100644 --- a/docs/api/ArangoMember.V1.md +++ b/docs/api/ArangoMember.V1.md @@ -4,7 +4,11 @@ ### .spec.deletion_priority: int -[Code Reference](/pkg/apis/deployment/v1/arango_member_spec.go#L44) +DeletionPriority define Deletion Priority. +Higher value means higher priority. Default is 0. +Example: set 1 for Coordinator which should be deleted first and scale down coordinators by one. 
+ +[Code Reference](/pkg/apis/deployment/v1/arango_member_spec.go#L47) ### .spec.deploymentUID: string diff --git a/docs/api/README.md b/docs/api/README.md new file mode 100644 index 000000000..1a109ee38 --- /dev/null +++ b/docs/api/README.md @@ -0,0 +1,5 @@ +# Custom Resources API Reference + + - [ArangoDeployment.V1](./ArangoDeployment.V1.md) + - [ArangoMember.V1](./ArangoMember.V1.md) + diff --git a/docs/bare-metal.md b/docs/bare-metal.md deleted file mode 100644 index 75a4ef590..000000000 --- a/docs/bare-metal.md +++ /dev/null @@ -1,524 +0,0 @@ -# ArangoDB on bare metal Kubernetes - -A note of warning for lack of a better word upfront: Kubernetes is -awesome and powerful. As with awesome and powerful things, there is -infinite ways of setting up a k8s cluster. With great flexibility -comes great complexity. There are infinite ways of hitting barriers. - -This guide is a walk through for, again in lack of a better word, -a reasonable and flexible setup to get to an ArangoDB cluster setup on -a bare metal kubernetes setup. - -## BEWARE: Do not use this setup for production! - -This guide does not involve setting up dedicated master nodes or high -availability for Kubernetes, but uses for sake of simplicity a single untainted -master. This is the very definition of a test environment. - -If you are interested in running a high available Kubernetes setup, please -refer to: [Creating Highly Available Clusters with kubeadm](https://kubernetes.io/docs/setup/independent/high-availability/) - -## Requirements - -Let there be 3 Linux boxes, `kube01 (192.168.10.61)`, `kube02 (192.168.10.62)` -and `kube03 (192.168.10.3)`, with `kubeadm` and `kubectl` installed and off we go: - -* `kubeadm`, `kubectl` version `>=1.10` - -## Initialize the master node - -The master node is outstanding in that it handles the API server and some other -vital infrastructure - -``` -sudo kubeadm init --pod-network-cidr=10.244.0.0/16 -``` - -``` - [init] Using Kubernetes version: v1.13.2 - [preflight] Running pre-flight checks - [preflight] Pulling images required for setting up a Kubernetes cluster - [preflight] This might take a minute or two, depending on the speed of your internet connection - [preflight] You can also perform this action in beforehand using 'kubeadm config images pull' - [kubelet-start] Writing kubelet environment file with flags to file "/var/lib/kubelet/kubeadm-flags.env" - [kubelet-start] Writing kubelet configuration to file "/var/lib/kubelet/config.yaml" - [kubelet-start] Activating the kubelet service - [certs] Using certificateDir folder "/etc/kubernetes/pki" - [certs] Generating "ca" certificate and key - [certs] Generating "apiserver" certificate and key - [certs] apiserver serving cert is signed for DNS names [kube01 kubernetes kubernetes.default kubernetes.default.svc kubernetes.default.svc.cluster.local] and IPs [10.96.0.1 192.168.10.61] - [certs] Generating "apiserver-kubelet-client" certificate and key - [certs] Generating "front-proxy-ca" certificate and key - [certs] Generating "front-proxy-client" certificate and key - [certs] Generating "etcd/ca" certificate and key - [certs] Generating "apiserver-etcd-client" certificate and key - [certs] Generating "etcd/server" certificate and key - [certs] etcd/server serving cert is signed for DNS names [kube01 localhost] and IPs [192.168.10.61 127.0.0.1 ::1] - [certs] Generating "etcd/peer" certificate and key - [certs] etcd/peer serving cert is signed for DNS names [kube01 localhost] and IPs [192.168.10.61 127.0.0.1 ::1] - [certs] Generating 
"etcd/healthcheck-client" certificate and key - [certs] Generating "sa" key and public key - [kubeconfig] Using kubeconfig folder "/etc/kubernetes" - [kubeconfig] Writing "admin.conf" kubeconfig file - [kubeconfig] Writing "kubelet.conf" kubeconfig file - [kubeconfig] Writing "controller-manager.conf" kubeconfig file - [kubeconfig] Writing "scheduler.conf" kubeconfig file - [control-plane] Using manifest folder "/etc/kubernetes/manifests" - [control-plane] Creating static Pod manifest for "kube-apiserver" - [control-plane] Creating static Pod manifest for "kube-controller-manager" - [control-plane] Creating static Pod manifest for "kube-scheduler" - [etcd] Creating static Pod manifest for local etcd in "/etc/kubernetes/manifests" - [wait-control-plane] Waiting for the kubelet to boot up the control plane as static Pods from directory "/etc/kubernetes/manifests". This can take up to 4m0s - [apiclient] All control plane components are healthy after 23.512869 seconds - [uploadconfig] storing the configuration used in ConfigMap "kubeadm-config" in the "kube-system" Namespace - [kubelet] Creating a ConfigMap "kubelet-config-1.13" in namespace kube-system with the configuration for the kubelets in the cluster - [patchnode] Uploading the CRI Socket information "/var/run/dockershim.sock" to the Node API object "kube01" as an annotation - [mark-control-plane] Marking the node kube01 as control-plane by adding the label "node-role.kubernetes.io/master=''" - [mark-control-plane] Marking the node kube01 as control-plane by adding the taints [node-role.kubernetes.io/master:NoSchedule] - [bootstrap-token] Using token: blcr1y.49wloegyaugice8a - [bootstrap-token] Configuring bootstrap tokens, cluster-info ConfigMap, RBAC Roles - [bootstraptoken] configured RBAC rules to allow Node Bootstrap tokens to post CSRs in order for nodes to get long term certificate credentials - [bootstraptoken] configured RBAC rules to allow the csrapprover controller automatically approve CSRs from a Node Bootstrap Token - [bootstraptoken] configured RBAC rules to allow certificate rotation for all node client certificates in the cluster - [bootstraptoken] creating the "cluster-info" ConfigMap in the "kube-public" namespace - [addons] Applied essential addon: CoreDNS - [addons] Applied essential addon: kube-proxy - - Your Kubernetes master has initialized successfully! - - To start using your cluster, you need to run the following as a regular user: - - mkdir -p $HOME/.kube - sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config - sudo chown $(id -u):$(id -g) $HOME/.kube/config - - You should now deploy a pod network to the cluster. 
- Run "kubectl apply -f [podnetwork].yaml" with one of the options listed at: - https://kubernetes.io/docs/concepts/cluster-administration/addons/ - - You can now join any number of machines by running the following on each node as root: - - kubeadm join 192.168.10.61:6443 --token blcr1y.49wloegyaugice8a --discovery-token-ca-cert-hash sha256:0505933664d28054a62298c68dc91e9b2b5cf01ecfa2228f3c8fa2412b7a78c8 -``` - -Go ahead and do as above instructed and see into getting kubectl to work on the master: - -``` -mkdir -p $HOME/.kube -sudo cp -i /etc/kubernetes/admin.conf $HOME/.kube/config -sudo chown $(id -u):$(id -g) $HOME/.kube/config -``` - -## Deploy a pod network - -For this guide, we go with **flannel**, as it is an easy way of setting up a -layer 3 network, which uses the Kubernetes API and just works anywhere, where a -network between the involved machines works: - -``` -kubectl apply -f \ - https://raw.githubusercontent.com/coreos/flannel/bc79dd1505b0c8681ece4de4c0d86c5cd2643275/Documentation/kube-flannel.yml -``` -``` - clusterrole.rbac.authorization.k8s.io/flannel created - clusterrolebinding.rbac.authorization.k8s.io/flannel created - serviceaccount/flannel created - configmap/kube-flannel-cfg created - daemonset.extensions/kube-flannel-ds-amd64 created - daemonset.extensions/kube-flannel-ds-arm64 created - daemonset.extensions/kube-flannel-ds-arm created - daemonset.extensions/kube-flannel-ds-ppc64le created - daemonset.extensions/kube-flannel-ds-s390x created -``` - -## Join remaining nodes - -Run the above join commands on the nodes `kube02` and `kube03`. Below is the -output on `kube02` for the setup for this guide: - -``` -sudo kubeadm join 192.168.10.61:6443 --token blcr1y.49wloegyaugice8a --discovery-token-ca-cert-hash sha256:0505933664d28054a62298c68dc91e9b2b5cf01ecfa2228f3c8fa2412b7a78c8 -``` -``` - [preflight] Running pre-flight checks - [discovery] Trying to connect to API Server "192.168.10.61:6443" - [discovery] Created cluster-info discovery client, requesting info from "https:// 192.168.10.61:6443" - [discovery] Requesting info from "https://192.168.10.61:6443" again to validate TLS against the pinned public key - [discovery] Cluster info signature and contents are valid and TLS certificate validates against pinned roots, will use API Server "192.168.10.61:6443" - [discovery] Successfully established connection with API Server "192.168.10.61:6443" - [join] Reading configuration from the cluster... - [join] FYI: You can look at this config file with 'kubectl -n kube-system get cm kubeadm-config -oyaml' - [kubelet] Downloading configuration for the kubelet from the "kubelet-config-1.13" ConfigMap in the kube-system namespace - [kubelet-start] Writing kubelet configuration to file "/var/lib/kubelet/config.yaml" - [kubelet-start] Writing kubelet environment file with flags to file "/var/lib/kubelet/kubeadm-flags.env" - [kubelet-start] Activating the kubelet service - [tlsbootstrap] Waiting for the kubelet to perform the TLS Bootstrap... - [patchnode] Uploading the CRI Socket information "/var/run/dockershim.sock" to the Node API object "kube02" as an annotation - -This node has joined the cluster: -* Certificate signing request was sent to apiserver and a response was received. -* The Kubelet was informed of the new secure connection details. - -Run 'kubectl get nodes' on the master to see this node join the cluster. 
-``` - -## Untaint master node - -``` -kubectl taint nodes --all node-role.kubernetes.io/master- -``` -``` - node/kube01 untainted - taint "node-role.kubernetes.io/master:" not found - taint "node-role.kubernetes.io/master:" not found -``` - -## Wait for nodes to get ready and sanity checking - -After some brief period, you should see that your nodes are good to go: - -``` -kubectl get nodes -``` -``` - NAME STATUS ROLES AGE VERSION - kube01 Ready master 38m v1.13.2 - kube02 Ready 13m v1.13.2 - kube03 Ready 63s v1.13.2 -``` - -Just a quick sanity check to see, that your cluster is up and running: - -``` -kubectl get all --all-namespaces -``` -``` - NAMESPACE NAME READY STATUS RESTARTS AGE - kube-system pod/coredns-86c58d9df4-r9l5c 1/1 Running 2 41m - kube-system pod/coredns-86c58d9df4-swzpx 1/1 Running 2 41m - kube-system pod/etcd-kube01 1/1 Running 2 40m - kube-system pod/kube-apiserver-kube01 1/1 Running 2 40m - kube-system pod/kube-controller-manager-kube01 1/1 Running 2 40m - kube-system pod/kube-flannel-ds-amd64-hppt4 1/1 Running 3 16m - kube-system pod/kube-flannel-ds-amd64-kt6jh 1/1 Running 1 3m41s - kube-system pod/kube-flannel-ds-amd64-tg7gz 1/1 Running 2 20m - kube-system pod/kube-proxy-f2g2q 1/1 Running 2 41m - kube-system pod/kube-proxy-gt9hh 1/1 Running 0 3m41s - kube-system pod/kube-proxy-jwmq7 1/1 Running 2 16m - kube-system pod/kube-scheduler-kube01 1/1 Running 2 40m - - NAMESPACE NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE - default service/kubernetes ClusterIP 10.96.0.1 443/TCP 41m - kube-system service/kube-dns ClusterIP 10.96.0.10 53/UDP,53/TCP 41m -``` - -## Deploy helm - -- Obtain current [helm release](https://github.com/helm/helm/releases) for your architecture - -- Create tiller user - - ``` - kubectl create serviceaccount --namespace kube-system tiller - ``` - ``` - serviceaccount/tiller created - ``` - -- Attach `tiller` to proper role - - ``` - kubectl create clusterrolebinding tiller-cluster-rule \ - --clusterrole=cluster-admin --serviceaccount=kube-system:tiller - ``` - ``` - clusterrolebinding.rbac.authorization.k8s.io/tiller-cluster-rule created - ``` - -- Initialise helm - - ``` - helm init --service-account tiller - ``` - ``` - $HELM_HOME has been configured at /home/xxx/.helm. - ... - Happy Helming! - - Tiller (the Helm server-side component) has been - installed into your Kubernetes Cluster. - ``` - -## Deploy ArangoDB operator charts - -- Deploy ArangoDB custom resource definition chart - -``` -helm install https://github.com/arangodb/kube-arangodb/releases/download/0.3.7/kube-arangodb-crd.tgz -``` -``` - NAME: hoping-gorilla - LAST DEPLOYED: Mon Jan 14 06:10:27 2019 - NAMESPACE: default - STATUS: DEPLOYED - - RESOURCES: - ==> v1beta1/CustomResourceDefinition - NAME AGE - arangodeployments.database.arangodb.com 0s - arangodeploymentreplications.replication.database.arangodb.com 0s - - - NOTES: - - kube-arangodb-crd has been deployed successfully! - - Your release is named 'hoping-gorilla'. - - You can now continue install kube-arangodb chart. 
-``` -- Deploy ArangoDB operator chart - -``` -helm install https://github.com/arangodb/kube-arangodb/releases/download/0.3.7/kube-arangodb.tgz -``` -``` - NAME: illocutionary-whippet - LAST DEPLOYED: Mon Jan 14 06:11:58 2019 - NAMESPACE: default - STATUS: DEPLOYED - - RESOURCES: - ==> v1beta1/ClusterRole - NAME AGE - illocutionary-whippet-deployment-replications 0s - illocutionary-whippet-deployment-replication-operator 0s - illocutionary-whippet-deployments 0s - illocutionary-whippet-deployment-operator 0s - - ==> v1beta1/ClusterRoleBinding - NAME AGE - illocutionary-whippet-deployment-replication-operator-default 0s - illocutionary-whippet-deployment-operator-default 0s - - ==> v1beta1/RoleBinding - NAME AGE - illocutionary-whippet-deployment-replications 0s - illocutionary-whippet-deployments 0s - - ==> v1/Service - NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE - arango-deployment-replication-operator ClusterIP 10.107.2.133 8528/TCP 0s - arango-deployment-operator ClusterIP 10.104.189.81 8528/TCP 0s - - ==> v1beta1/Deployment - NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE - arango-deployment-replication-operator 2 2 2 0 0s - arango-deployment-operator 2 2 2 0 0s - - ==> v1/Pod(related) - NAME READY STATUS RESTARTS AGE - arango-deployment-replication-operator-5f679fbfd8-nk8kz 0/1 Pending 0 0s - arango-deployment-replication-operator-5f679fbfd8-pbxdl 0/1 ContainerCreating 0 0s - arango-deployment-operator-65f969fc84-gjgl9 0/1 Pending 0 0s - arango-deployment-operator-65f969fc84-wg4nf 0/1 ContainerCreating 0 0s - - -NOTES: - -kube-arangodb has been deployed successfully! - -Your release is named 'illocutionary-whippet'. - -You can now deploy ArangoDeployment & ArangoDeploymentReplication resources. - -See https://www.arangodb.com/docs/stable/tutorials-kubernetes.html -for how to get started. -``` -- As unlike cloud k8s offerings no file volume infrastructure exists, we need - to still deploy the storage operator chart: - -``` -helm install \ - https://github.com/arangodb/kube-arangodb/releases/download/0.3.7/kube-arangodb-storage.tgz -``` -``` - NAME: sad-newt - LAST DEPLOYED: Mon Jan 14 06:14:15 2019 - NAMESPACE: default - STATUS: DEPLOYED - - RESOURCES: - ==> v1/ServiceAccount - NAME SECRETS AGE - arango-storage-operator 1 1s - - ==> v1beta1/CustomResourceDefinition - NAME AGE - arangolocalstorages.storage.arangodb.com 1s - - ==> v1beta1/ClusterRole - NAME AGE - sad-newt-storages 1s - sad-newt-storage-operator 1s - - ==> v1beta1/ClusterRoleBinding - NAME AGE - sad-newt-storage-operator 1s - - ==> v1beta1/RoleBinding - NAME AGE - sad-newt-storages 1s - - ==> v1/Service - NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE - arango-storage-operator ClusterIP 10.104.172.100 8528/TCP 1s - - ==> v1beta1/Deployment - NAME DESIRED CURRENT UP-TO-DATE AVAILABLE AGE - arango-storage-operator 2 2 2 0 1s - - ==> v1/Pod(related) - NAME READY STATUS RESTARTS AGE - arango-storage-operator-6bc64ccdfb-tzllq 0/1 ContainerCreating 0 0s - arango-storage-operator-6bc64ccdfb-zdlxk 0/1 Pending 0 0s - - - NOTES: - - kube-arangodb-storage has been deployed successfully! - - Your release is named 'sad-newt'. - - You can now deploy an ArangoLocalStorage resource. - - See https://www.arangodb.com/docs/stable/deployment-kubernetes-storage-resource.html - for further instructions. 
- -``` -## Deploy ArangoDB cluster - -- Deploy local storage - -``` -kubectl apply -f https://raw.githubusercontent.com/arangodb/kube-arangodb/master/examples/arango-local-storage.yaml -``` -``` - arangolocalstorage.storage.arangodb.com/arangodb-local-storage created -``` - -- Deploy simple cluster - -``` -kubectl apply -f https://raw.githubusercontent.com/arangodb/kube-arangodb/master/examples/simple-cluster.yaml -``` -``` - arangodeployment.database.arangodb.com/example-simple-cluster created -``` - -## Access your cluster - -- Find your cluster's network address: - -``` -kubectl get services -``` -``` - NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE - arango-deployment-operator ClusterIP 10.104.189.81 8528/TCP 14m - arango-deployment-replication-operator ClusterIP 10.107.2.133 8528/TCP 14m - example-simple-cluster ClusterIP 10.109.170.64 8529/TCP 5m18s - example-simple-cluster-ea NodePort 10.98.198.7 8529:30551/TCP 4m8s - example-simple-cluster-int ClusterIP None 8529/TCP 5m19s - kubernetes ClusterIP 10.96.0.1 443/TCP 69m -``` - -- In this case, according to the access service, `example-simple-cluster-ea`, - the cluster's coordinators are reachable here: - -https://kube01:30551, https://kube02:30551 and https://kube03:30551 - -## LoadBalancing - -For this guide we like to use the `metallb` load balancer, which can be easiy -installed as a simple layer 2 load balancer: - -- install the `metalllb` controller: - -``` -kubectl apply -f \ - https://raw.githubusercontent.com/google/metallb/v0.7.3/manifests/metallb.yaml -``` -``` - namespace/metallb-system created - serviceaccount/controller created - serviceaccount/speaker created - clusterrole.rbac.authorization.k8s.io/metallb-system:controller created - clusterrole.rbac.authorization.k8s.io/metallb-system:speaker created - role.rbac.authorization.k8s.io/config-watcher created - clusterrolebinding.rbac.authorization.k8s.io/metallb-system:controller created - clusterrolebinding.rbac.authorization.k8s.io/metallb-system:speaker created - rolebinding.rbac.authorization.k8s.io/config-watcher created - daemonset.apps/speaker created - deployment.apps/controller created -``` - -- Deploy network range configurator. Assuming that the range for the IP addresses, - which are granted to `metalllb` for load balancing is 192.168.10.224/28, - download the [exmample layer2 configurator](https://raw.githubusercontent.com/google/metallb/v0.7.3/manifests/example-layer2-config.yaml). - -``` -wget https://raw.githubusercontent.com/google/metallb/v0.7.3/manifests/example-layer2-config.yaml -``` - -- Edit the `example-layer2-config.yaml` file to use the according addresses. - Do this with great care, as YAML files are indention sensitive. 
- -``` -apiVersion: v1 -kind: ConfigMap -metadata: - namespace: metallb-system - name: config -data: - config: | - address-pools: - - name: my-ip-space - protocol: layer2 - addresses: - - 192.168.10.224/28 -``` - -- deploy the configuration map: - -``` -kubectl apply -f example-layer2-config.yaml -``` -``` - configmap/config created -``` - -- restart ArangoDB's endpoint access service: - -``` -kubectl delete service example-simple-cluster-ea -``` -``` - service "example-simple-cluster-ea" deleted -``` - -- watch, how the service goes from `Nodeport` to `LoadBalancer` the output above - -``` -kubectl get services -``` -``` NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE - arango-deployment-operator ClusterIP 10.104.189.81 8528/TCP 34m - arango-deployment-replication-operator ClusterIP 10.107.2.133 8528/TCP 34m - example-simple-cluster ClusterIP 10.109.170.64 8529/TCP 24m - example-simple-cluster-ea LoadBalancer 10.97.217.222 192.168.10.224 8529:30292/TCP 22s - example-simple-cluster-int ClusterIP None 8529/TCP 24m - kubernetes ClusterIP 10.96.0.1 443/TCP 89m -``` - -- Now you are able of accessing all 3 coordinators through https://192.168.10.224:8529 diff --git a/docs/customer_questions.md b/docs/customer_questions.md deleted file mode 100644 index e172b61c8..000000000 --- a/docs/customer_questions.md +++ /dev/null @@ -1,11 +0,0 @@ -# Customer questions - -- What is your experience with using Kubernetes? -- What is your experience with using ArangoDB on Kubernetes? -- What do you think of the operator concept for an ArangoDB Kubernetes offering? -- What is the minimum version of Kubernetes you're running / need? -- What kind of persistent volumes do you use / plan to use? -- What kind of load-balancer support do you use / need for ArangoDB in Kubernetes? -- Do you have a need to limit ArangoDB Pods to a sub-section of your Kubernetes cluster? -- Do you see a need to shutdown a cluster and bring it back alive later (with its data!)? -- In which cloud/on premises environment are you going to use Kubernetes (AWS, GCE, on premise...)? 
diff --git a/docs/design/README.md b/docs/design/README.md index bdd89f717..d50fb3fbb 100644 --- a/docs/design/README.md +++ b/docs/design/README.md @@ -1,30 +1,13 @@ -# ArangoDB operator design documents +# ArangoDB operator architecture details -- [Architecture change](./arch_change.md) -- [Constraints](./constraints.md) +- [Operator API](./api.md) +- [Backups](./backup.md) +- [Constraints for high-availability](./constraints.md) +- [ArangoDB Exporter](./exporter.md) - [Health](./health.md) -- [Metrics](./metrics.md) +- [Lifecycle hooks and Finalizers](./lifecycle_hooks_and_finalizers.md) +- [Pod eviction and replacement](./pod_eviction_and_replacement.md) - [Kubernetes Pod name versus cluster ID](./pod_name_versus_cluster_id.md) -- [Resource & labels](./resource_and_labels.md) -- [Resource Management](./resource_management.md) +- [Resources & labels](./resources_and_labels.md) - [Scaling](./scaling.md) -- [Status](./status.md) -- [Upgrading](./upgrading.md) -- [Rotating Pods](./rotating.md) -- [Maintenance](./maintenance.md) -- [Additional configuration](./additional_configuration.md) -- [Topology awareness](./topology_awareness.md) -- [Configuring timezone](./configuring_tz.md) -- [Operator API](./api.md) -- [Logging](./logging.md) -- [Manual Recovery](./recovery.md) -- [Backup](./backup.md) - -## Features -- [Force rebuild out-synced Shards with broken Merkle Tree](./features/rebuild_out_synced_shards.md) -- [Failover Leader service](./features/failover_leader_service.md) -- [Restore defaults from last accepted state of deployment](./features/deployment_spec_defaults.md) - -## Debugging -- [Collecting debug info](./debugging.md) -- \ No newline at end of file +- [Topology awareness](./topology_awareness.md) \ No newline at end of file diff --git a/docs/design/acceptance_test.md b/docs/design/acceptance_test.md deleted file mode 100644 index 5967035dc..000000000 --- a/docs/design/acceptance_test.md +++ /dev/null @@ -1,533 +0,0 @@ -# Acceptance test for kube-arangodb operator on specific Kubernetes platform - -This acceptance test plan describes all test scenario's that must be executed -successfully in order to consider the kube-arangodb operator production ready -on a specific Kubernetes setup (from now on we'll call a Kubernetes setup a platform). - -## Platform parameters - -Before the test, record the following parameters for the platform the test is executed on. - -- Name of the platform -- Version of the platform -- Upstream Kubernetes version used by the platform (run `kubectl version`) -- Number of nodes used by the Kubernetes cluster (run `kubectl get node`) -- `StorageClasses` provided by the platform (run `kubectl get storageclass`) -- Does the platform use RBAC? (run `kubectl describe clusterrolebinding`) -- Does the platform support services of type `LoadBalancer`? - -If one of the above questions can have multiple answers (e.g. different Kubernetes versions) -then make the platform more specific. E.g. consider "GKE with Kubernetes 1.10.2" a platform -instead of "GKE" which can have version "1.8", "1.9" & "1.10.2". - -## Platform preparations - -Before the tests can be run, the platform has to be prepared. - -### Deploy the ArangoDB operators - -Deploy the following ArangoDB operators: - -- `ArangoDeployment` operator -- `ArangoDeploymentReplication` operator -- `ArangoLocalStorage` operator - -To do so, follow the [instructions in the documentation](https://www.arangodb.com/docs/stable/deployment-kubernetes-usage.html). 
- -### `PersistentVolume` provider - -If the platform does not provide a `PersistentVolume` provider, create one by running: - -```bash -kubectl apply -f examples/arango-local-storage.yaml -``` - -## Basis tests - -The basis tests are executed on every platform with various images: - -Run the following tests with the following images: - -- Community -- Enterprise - -For every tests, one of these images can be chosen, as long as each image -is used in a test at least once. - -### Test 1a: Create single server deployment - -Create an `ArangoDeployment` of mode `Single`. - -Hint: Use `tests/acceptance/single.yaml`. - -- [ ] The deployment must start -- [ ] The deployment must yield 1 `Pod` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -### Test 1b: Create active failover deployment - -Create an `ArangoDeployment` of mode `ActiveFailover`. - -Hint: Use `tests/acceptance/activefailover.yaml`. - -- [ ] The deployment must start -- [ ] The deployment must yield 5 `Pods` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -### Test 1c: Create cluster deployment - -Create an `ArangoDeployment` of mode `Cluster`. - -Hint: Use `tests/acceptance/cluster.yaml`. - -- [ ] The deployment must start -- [ ] The deployment must yield 9 `Pods` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -### Test 1d: Create cluster deployment with dc2dc - -This test requires the use of the enterprise image. - -Create an `ArangoDeployment` of mode `Cluster` and dc2dc enabled. - -Hint: Derive from `tests/acceptance/cluster-sync.yaml`. - -- [ ] The deployment must start -- [ ] The deployment must yield 15 `Pods` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The deployment must yield a `Service` named `-sync` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -### Test 2a: Scale an active failover deployment - -Create an `ArangoDeployment` of mode `ActiveFailover`. - -- [ ] The deployment must start -- [ ] The deployment must yield 5 `Pods` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -Change the value of `spec.single.count` from 2 to 3. - -- [ ] A single server is added -- [ ] The deployment must yield 6 `Pods` - -Change the value of `spec.single.count` from 3 to 2. - -- [ ] A single server is removed -- [ ] The deployment must yield 5 `Pods` - -### Test 2b: Scale a cluster deployment - -Create an `ArangoDeployment` of mode `Cluster`. - -Hint: Use `tests/acceptance/cluster.yaml`. 
- -- [ ] The deployment must start -- [ ] The deployment must yield 9 `Pods` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -Change the value of `spec.dbservers.count` from 3 to 5. - -- [ ] Two dbservers are added -- [ ] The deployment must yield 11 `Pods` - -Change the value of `spec.coordinators.count` from 3 to 4. - -- [ ] A coordinator is added -- [ ] The deployment must yield 12 `Pods` - -Change the value of `spec.dbservers.count` from 5 to 2. - -- [ ] Three dbservers are removed (one by one) -- [ ] The deployment must yield 9 `Pods` - -Change the value of `spec.coordinators.count` from 4 to 1. - -- [ ] Three coordinators are removed (one by one) -- [ ] The deployment must yield 6 `Pods` - -### Test 3: Production environment - -Production environment tests are only relevant if there are enough nodes -available that `Pods` can be scheduled on. - -The number of available nodes must be >= the maximum server count in -any group. - -### Test 3a: Create single server deployment in production environment - -Create an `ArangoDeployment` of mode `Single` with an environment of `Production`. - -Hint: Derive from `tests/acceptance/single.yaml`. - -- [ ] The deployment must start -- [ ] The deployment must yield 1 `Pod` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -### Test 3b: Create active failover deployment in production environment - -Create an `ArangoDeployment` of mode `ActiveFailover` with an environment of `Production`. - -Hint: Derive from `tests/acceptance/activefailover.yaml`. - -- [ ] The deployment must start -- [ ] The deployment must yield 5 `Pods` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -### Test 3c: Create cluster deployment in production environment - -Create an `ArangoDeployment` of mode `Cluster` with an environment of `Production`. - -Hint: Derive from `tests/acceptance/cluster.yaml`. - -- [ ] The deployment must start -- [ ] The deployment must yield 9 `Pods` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -### Test 3d: Create cluster deployment in production environment and scale it - -Create an `ArangoDeployment` of mode `Cluster` with an environment of `Production`. - -Hint: Derive from `tests/acceptance/cluster.yaml`. - -- [ ] The deployment must start -- [ ] The deployment must yield 9 `Pods` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -Change the value of `spec.dbservers.count` from 3 to 4. - -- [ ] Two dbservers are added -- [ ] The deployment must yield 10 `Pods` - -Change the value of `spec.coordinators.count` from 3 to 4. - -- [ ] A coordinator is added -- [ ] The deployment must yield 11 `Pods` - -Change the value of `spec.dbservers.count` from 4 to 2. 
- -- [ ] Three dbservers are removed (one by one) -- [ ] The deployment must yield 9 `Pods` - -Change the value of `spec.coordinators.count` from 4 to 2. - -- [ ] Three coordinators are removed (one by one) -- [ ] The deployment must yield 7 `Pods` - -### Test 4a: Create cluster deployment with `ArangoLocalStorage` provided volumes - -Ensure an `ArangoLocalStorage` is deployed. - -Hint: Use from `tests/acceptance/local-storage.yaml`. - -Create an `ArangoDeployment` of mode `Cluster` with a `StorageClass` that is -mapped to an `ArangoLocalStorage` provider. - -Hint: Derive from `tests/acceptance/cluster.yaml`. - -- [ ] The deployment must start -- [ ] The deployment must yield 9 `Pods` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -### Test 4b: Create cluster deployment with a platform provided `StorageClass` - -This test only applies to platforms that provide their own `StorageClasses`. - -Create an `ArangoDeployment` of mode `Cluster` with a `StorageClass` that is -provided by the platform. - -Hint: Derive from `tests/acceptance/cluster.yaml`. - -- [ ] The deployment must start -- [ ] The deployment must yield 9 `Pods` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -### Test 5a: Test `Pod` resilience on single servers - -Create an `ArangoDeployment` of mode `Single`. - -Hint: Use from `tests/acceptance/single.yaml`. - -- [ ] The deployment must start -- [ ] The deployment must yield 1 `Pod` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -Delete the `Pod` of the deployment that contains the single server. - -- [ ] The `Pod` must be restarted -- [ ] After the `Pod` has restarted, the server must have the same data and be responsive again - -### Test 5b: Test `Pod` resilience on active failover - -Create an `ArangoDeployment` of mode `ActiveFailover`. - -Hint: Use from `tests/acceptance/activefailover.yaml`. - -- [ ] The deployment must start -- [ ] The deployment must yield 5 `Pods` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -Delete a `Pod` of the deployment that contains an agent. - -- [ ] While the `Pod` is gone & restarted, the cluster must still respond to requests (R/W) -- [ ] The `Pod` must be restarted - -Delete a `Pod` of the deployment that contains a single server. - -- [ ] While the `Pod` is gone & restarted, the cluster must still respond to requests (R/W) -- [ ] The `Pod` must be restarted - -### Test 5c: Test `Pod` resilience on clusters - -Create an `ArangoDeployment` of mode `Cluster`. - -Hint: Use from `tests/acceptance/cluster.yaml`. - -- [ ] The deployment must start -- [ ] The deployment must yield 9 `Pods` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -Delete a `Pod` of the deployment that contains an agent. 
- -- [ ] While the `Pod` is gone & restarted, the cluster must still respond to requests (R/W) -- [ ] The `Pod` must be restarted - -Delete a `Pod` of the deployment that contains a dbserver. - -- [ ] While the `Pod` is gone & restarted, the cluster must still respond to requests (R/W), except - for requests to collections with a replication factor of 1. -- [ ] The `Pod` must be restarted - -Delete a `Pod` of the deployment that contains an coordinator. - -- [ ] While the `Pod` is gone & restarted, the cluster must still respond to requests (R/W), except - requests targeting the restarting coordinator. -- [ ] The `Pod` must be restarted - -### Test 6a: Test `Node` reboot on single servers - -Create an `ArangoDeployment` of mode `Single`. - -Hint: Use from `tests/acceptance/single.yaml`. - -- [ ] The deployment must start -- [ ] The deployment must yield 1 `Pod` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -Reboot the `Node` of the deployment that contains the single server. - -- [ ] The `Pod` running on the `Node` must be restarted -- [ ] After the `Pod` has restarted, the server must have the same data and be responsive again - -### Test 6b: Test `Node` reboot on active failover - -Create an `ArangoDeployment` of mode `ActiveFailover` with an environment of `Production`. - -Hint: Use from `tests/acceptance/activefailover.yaml`. - -- [ ] The deployment must start -- [ ] The deployment must yield 5 `Pods` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -Reboot a `Node`. - -- [ ] While the `Node` is restarting, the cluster must still respond to requests (R/W) -- [ ] All `Pods` on the `Node` must be restarted - -### Test 6c: Test `Node` reboot on clusters - -Create an `ArangoDeployment` of mode `Cluster` with an environment of `Production`. - -Hint: Use from `tests/acceptance/cluster.yaml`. - -- [ ] The deployment must start -- [ ] The deployment must yield 9 `Pods` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -Reboot a `Node`. - -- [ ] While the `Node` is restarting, the cluster must still respond to requests (R/W) -- [ ] All `Pods` on the `Node` must be restarted - -### Test 6d: Test `Node` removal on single servers - -This test is only valid when `StorageClass` is used that provides network attached `PersistentVolumes`. - -Create an `ArangoDeployment` of mode `Single`. - -Hint: Use from `tests/acceptance/single.yaml`. - -- [ ] The deployment must start -- [ ] The deployment must yield 1 `Pod` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -Remove the `Node` containing the deployment from the Kubernetes cluster. 
- -- [ ] The `Pod` running on the `Node` must be restarted on another `Node` -- [ ] After the `Pod` has restarted, the server must have the same data and be responsive again - -### Test 6e: Test `Node` removal on active failover - -Create an `ArangoDeployment` of mode `ActiveFailover` with an environment of `Production`. - -Hint: Use from `tests/acceptance/activefailover.yaml`. - -- [ ] The deployment must start -- [ ] The deployment must yield 5 `Pods` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -Remove a `Node` containing the `Pods` of the deployment from the Kubernetes cluster. - -- [ ] While the `Pods` are being restarted on new `Nodes`, the cluster must still respond to requests (R/W) -- [ ] The `Pods` running on the `Node` must be restarted on another `Node` -- [ ] After the `Pods` have restarted, the server must have the same data and be responsive again - -### Test 6f: Test `Node` removal on clusters - -This test is only valid when: - -- A `StorageClass` is used that provides network attached `PersistentVolumes` -- or all collections have a replication factor of 2 or higher - -Create an `ArangoDeployment` of mode `Cluster` with an environment of `Production`. - -Hint: Use from `tests/acceptance/cluster.yaml`. - -- [ ] The deployment must start -- [ ] The deployment must yield 9 `Pods` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -Remove a `Node` containing the `Pods` of the deployment from the Kubernetes cluster. - -- [ ] While the `Pods` are being restarted on new `Nodes`, the cluster must still respond to requests (R/W) -- [ ] The `Pods` running on the `Node` must be restarted on another `Node` -- [ ] After the `Pods` have restarted, the server must have the same data and be responsive again - -### Test 6g: Test `Node` removal on clusters with replication factor 1 - -This test is only valid when: - -- A `StorageClass` is used that provides `Node` local `PersistentVolumes` -- and at least some collections have a replication factor of 1 - -Create an `ArangoDeployment` of mode `Cluster` with an environment of `Production`. - -Hint: Use from `tests/acceptance/cluster.yaml`. - -- [ ] The deployment must start -- [ ] The deployment must yield 9 `Pods` -- [ ] The deployment must yield a `Service` named `` -- [ ] The deployment must yield a `Service` named `-ea` -- [ ] The `Service` named `-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -Remove a `Node`, containing the dbserver `Pod` that holds a collection with replication factor 1, -from the Kubernetes cluster. 
- -- [ ] While the `Pods` are being restarted on new `Nodes`, the cluster must still respond to requests (R/W), - except requests involving collections with a replication factor of 1 -- [ ] The `Pod` running the dbserver with a collection that has a replication factor of 1 must NOT be restarted on another `Node` - -Remove the collections with the replication factor of 1 - -- [ ] The remaining `Pods` running on the `Node` must be restarted on another `Node` -- [ ] After the `Pods` have restarted, the server must have the same data, except for the removed collections, and be responsive again - -### Test 7a: Test DC2DC on 2 clusters, running in the same Kubernetes cluster - -This test requires the use of the enterprise image. - -Create 2 `ArangoDeployment` of mode `Cluster` and dc2dc enabled. - -Hint: Derive from `tests/acceptance/cluster-sync.yaml`, name the deployments `cluster1` and `cluster2`. - -Make sure to include a name ('cluster1-to-2`) for an external access package. - -```yaml -apiVersion: "database.arangodb.com/v1alpha" -kind: "ArangoDeployment" -metadata: - name: "cluster1" -spec: - mode: Cluster - image: ewoutp/arangodb:3.3.14 - sync: - enabled: true - externalAccess: - accessPackageSecretNames: ["cluster1-to-2"] -``` - -- [ ] The deployments must start -- [ ] The deployments must yield 15 `Pods` -- [ ] The deployments must yield a `Service` named `cluster[1|2]` -- [ ] The deployments must yield a `Service` named `cluster[1|2]-ea` -- [ ] The deployments must yield a `Service` named `cluster[1|2]-sync` -- [ ] The `Services` named `cluster[1|2]-ea` must be accessible from outside (LoadBalancer or NodePort) and show WebUI - -Create an `ArangoDeploymentReplication` from `tests/acceptance/cluster12-replication.yaml`. - -It will take some time until the synchronization (from `cluster1` to `cluster2`) is configured. - -- [ ] The status of the `cluster12-replication` resource shows .... -- [ ] The webUI of `cluster1` shows that you can create a new collection there. -- [ ] The webUI of `cluster2` shows that you cannot create a new collection there. - -Create a collection named `testcol` with a replication factor 2 and 3 shards (using the webUI of `cluster1`). - -- [ ] The webUI of `cluster2` shows collection `testcol` with the given replication factor and number of shards. - -Create multiple documents in the collection named `testcol` (using the webUI of `cluster1`). - -- [ ] The documents are visible in webUI of `cluster2`. - -Modify multiple documents in the collection named `testcol` (using the webUI of `cluster1`). - -- [ ] The modified documents are visible in webUI of `cluster2`. - -Remove one or more documents from the collection named `testcol` (using the webUI of `cluster1`). - -- [ ] The documents are no longer visible in webUI of `cluster2`. - -Create a new database called `db2` (using the webUI of `cluster1`). - -- [ ] The webUI of `cluster2` shows database `db2`. 
diff --git a/docs/design/acceptance_test_platforms.md b/docs/design/acceptance_test_platforms.md deleted file mode 100644 index 61f31807d..000000000 --- a/docs/design/acceptance_test_platforms.md +++ /dev/null @@ -1,13 +0,0 @@ -# Acceptance test platforms - -The [kube-arangodb acceptance tests](./acceptance_test.md) must be -executed on the following platforms: - -- Google GKE, with Kubernetes version 1.10 -- Amazon EKS, with Kubernetes version 1.10 -- Amazon & Kops, with Kubernetes version 1.10 -- Azure AKS, with Kubernetes version 1.10 -- Openshift, based on Kubernetes version 1.10 -- Bare metal with kubeadm 1.10 -- Minikube with Kubernetes version 1.10 -- Kubernetes on docker for Mac, with Kubernetes version 1.10 diff --git a/docs/design/dashboard.md b/docs/design/dashboard.md index 01da1eaab..022606b1c 100644 --- a/docs/design/dashboard.md +++ b/docs/design/dashboard.md @@ -1,64 +1,3 @@ # Deployment Operator Dashboard -To inspect the state of an `ArangoDeployment` you can use `kubectl get ...` to inspect -the `status` of the resource itself, but to get the entire "picture" you also -must inspect the status of the `Pods` created for the deployment, the `PersistentVolumeClaims`, -the `PersistentVolumes`, the `Services` and some `Secrets`. - -The goal of the operator dashboard is to simplify this inspection process. - -The deployment operator dashboard provides: - -- A status overview of all `ArangoDeployments` it controls -- A status overview of all resources created by the operator (for an `ArangoDeployment`) -- Run the arangoinspector on deployments -- Instructions for upgrading deployments to newer versions - -It does not provide: - -- Direct access to the deployed database -- Anything that can already be done in the web-UI of the database or naturaly belongs there. - -The dashboard is a single-page web application that is served by the operator itself. - -## Design decisions - -### Leader only - -Since only the operator instance that won the leader election has the latest state of all -deployments, only that instance will serve dashboard requests. - -For this purpose, a `Service` is created when deploying the operator. -This service uses a `role=leader` selector to ensure that only the right instance -will be included in its list of endpoints. - -### Exposing the dashboard - -By default the `Service` that selects the leading operator instance is not exposed outside the Kubernetes cluster. -Users must use `kubectl expose service ...` to add additional `Services` of type `LoadBalancer` -or `NodePort` to expose the dashboard if and how they want to. - -### Readonly behavior - -The dashboard only provides readonly functions. -When modifications to an `ArangoDeployment` are needed (e.g. when upgrading to a new version), the dashboard -will provide instructions for doing so using `kubectl` commands. - -In doing so, the requirements for authentication & access control of the dashboard itself remain limited, -while all possible authentication & access control features of Kubernetes are still available to ensure -a secure deployment. - -### Authentication - -The dashboard requires a username+password to gain access, unless it is started with an option to disable authentication. -This username+password pair is stored in a standard basic authentication `Secret` in the Kubernetes cluster. - -### Frontend technology - -The frontend part of the dashboard will be built with React. -This aligns with future developments in the context of the web-UI of the database itself. 
- -### Backend technology - -The backend of the dashboard contains an HTTPS server that serves the dashboard webpage (including all required web resources) -and all API methods it needs. +### The Dashboard UI is now deprecated and will be removed in the next minor version diff --git a/docs/design/lifecycle_hooks_and_finalizers.md b/docs/design/lifecycle_hooks_and_finalizers.md index d30b4723d..778f705e3 100644 --- a/docs/design/lifecycle_hooks_and_finalizers.md +++ b/docs/design/lifecycle_hooks_and_finalizers.md @@ -1,7 +1,7 @@ # Lifecycle hooks & Finalizers The ArangoDB operator expects full control of the `Pods` and `PersistentVolumeClaims` it creates. -Therefore it takes measures to prevent the removal of those resources +Therefore, it takes measures to prevent the removal of those resources until it is safe to do so. To achieve this, the server containers in the `Pods` have @@ -27,11 +27,17 @@ is shared between the init-container and the server container. ## Finalizers -The ArangoDB operators adds the following finalizers to `Pods`. - +The ArangoDB operator adds the following finalizers to `Pods`: - `dbserver.database.arangodb.com/drain`: Added to DBServers, removed only when the dbserver can be restarted or is completely drained - `agent.database.arangodb.com/agency-serving`: Added to Agents, removed only when enough agents are left to keep the agency serving +- `pod.database.arangodb.com/delay`: Delays pod termination +- `database.arangodb.com/graceful-shutdown`: Added to all members, indicating the need for a graceful shutdown + +The ArangoDB operator adds the following finalizers to `PersistentVolumeClaims`: +- `pvc.database.arangodb.com/member-exists`: Removed only when its member no longer exists or can be safely rebuilt -The ArangoDB operators adds the following finalizers to `PersistentVolumeClaims`. +The ArangoDB operator adds the following finalizers to `ArangoDeployment`: +- `database.arangodb.com/remove-child-finalizers`: Cleans up finalizers on all child resources -- `pvc.database.arangodb.com/member-exists`: removed only when its member exists no longer exists or can be safely rebuild +The ArangoDB operator adds the following finalizers to `ArangoDeploymentReplication`: +- `replication.database.arangodb.com/stop-sync`: Stops deployment-to-deployment replication diff --git a/docs/design/maintenance.md b/docs/design/maintenance.md deleted file mode 100644 index afbf9f6af..000000000 --- a/docs/design/maintenance.md +++ /dev/null @@ -1,14 +0,0 @@ -# Maintenance - -## ArangoDeployment - -Maintenance on ArangoDeployment can be enabled using annotation.
- -Key: `deployment.arangodb.com/maintenance` -Value: `true` - -To enable maintenance mode for ArangoDeployment kubectl command can be used: -`kubectl annotate arangodeployment deployment deployment.arangodb.com/maintenance=true` - -To disable maintenance mode for ArangoDeployment kubectl command can be used: -`kubectl annotate --overwrite arangodeployment deployment deployment.arangodb.com/maintenance-` \ No newline at end of file diff --git a/docs/design/pod_evication_and_replacement.md b/docs/design/pod_eviction_and_replacement.md similarity index 100% rename from docs/design/pod_evication_and_replacement.md rename to docs/design/pod_eviction_and_replacement.md diff --git a/docs/design/pod_name_versus_cluster_id.md b/docs/design/pod_name_versus_cluster_id.md index 52b2fb8e8..a55f32f3f 100644 --- a/docs/design/pod_name_versus_cluster_id.md +++ b/docs/design/pod_name_versus_cluster_id.md @@ -3,13 +3,13 @@ All resources being created will get a name that contains the user provided cluster name and a unique part. -The unique part will be difference for every pod that +The unique part will be different for every pod that is being created. E.g. when upgrading to a new version, we generate a new unique pod name. The servers in the ArangoDB cluster will be assigned -a persistent, unique ID. +a persistent, unique ID which is stored in ArangoMember CR. When a Pod changes (e.g. because of an upgrade) the Pod name changes, but the cluster ID remains the same. diff --git a/docs/design/resource_and_labels.md b/docs/design/resources_and_labels.md similarity index 80% rename from docs/design/resource_and_labels.md rename to docs/design/resources_and_labels.md index ebb93ee72..467bb8dc2 100644 --- a/docs/design/resource_and_labels.md +++ b/docs/design/resources_and_labels.md @@ -60,17 +60,8 @@ For a full cluster deployment, the following Kubernetes resources are created: - `arangodb_deployment: ` - `role: dbserver` -- Headless `Service` for accessing the all server, named `_servers`. - The service will provide access all server server from within the k8s cluster. - - Labels: - - `app=arangodb` - - `arangodb_deployment: ` - - Selector: - - `app=arangodb` - - `arangodb_deployment: ` - -- `Service` for accessing the all coordinators, named ``. - The service will provide access all coordinators from within the k8s cluster. +- `Service` for accessing all coordinators, named ``. + The service will provide access to all coordinators from within the k8s cluster. - Labels: - `app=arangodb` - `arangodb_deployment: ` @@ -86,17 +77,17 @@ For a full cluster with datacenter replication deployment, the same resources are created as for a Full cluster, with the following additions: -- `Pods` running ArangoSync workers named `_syncworker_`. +- `Pods` running ArangoSync workers named `-syncworker-`. - Labels: - `app=arangodb` - `arangodb_deployment: ` - `role: syncworker` -- `Pods` running ArangoSync master named `_syncmaster_`. +- `Pods` running ArangoSync master named `-syncmaster-`. - Labels: - `app=arangodb` - `arangodb_deployment: ` - `role: syncmaster` -- `Service` for accessing the sync masters, named `_sync`. +- `Service` for accessing the sync masters, named `-sync`. The service will provide access to all syncmaster from within the Kubernetes cluster. 
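To make the finalizer list added to `lifecycle_hooks_and_finalizers.md` above more concrete, here is a minimal, hypothetical sketch of how these finalizers show up on an operator-managed DBServer `Pod` (the Pod name is illustrative, not taken from this diff):

```yaml
# Fragment of a DBServer Pod created by the operator (name is illustrative)
metadata:
  name: cluster1-prmr-abcdef
  finalizers:
    - dbserver.database.arangodb.com/drain        # removed when the DBServer is drained or can be restarted
    - database.arangodb.com/graceful-shutdown     # removed once a graceful shutdown has been performed
    - pod.database.arangodb.com/delay             # delays Pod termination
```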
diff --git a/docs/design/scaling.md b/docs/design/scaling.md index 68c479f07..fe9ddaef2 100644 --- a/docs/design/scaling.md +++ b/docs/design/scaling.md @@ -1,28 +1,21 @@ # Scaling -The internal process followed by the ArangoDB operator -when scaling up is as follows: +The number of running servers is controlled through the `spec..count` field. -- Set CR state to `Scaling` -- Create an additional server Pod -- Wait until server is ready before continuing -- Set CR state to `Ready` +### Scale-up +When increasing the `count`, the operator will try to create the missing Pods. +When scaling up, make sure that you have enough computational resources / nodes, otherwise the Pod will be stuck in the Pending state. -The internal process followed by the ArangoDB operator -when scaling down a dbserver is as follows: -- Set CR state to `Scaling` -- Drain the dbserver (TODO fill in procedure) -- Shutdown the dbserver such that it removes itself from the agency -- Remove the dbserver Pod -- Set CR state to `Ready` +### Scale-down -The internal process followed by the ArangoDB operator -when scaling down a coordinator is as follows: +Scaling down is always done 1 server at a time. -- Set CR state to `Scaling` -- Shutdown the coordinator such that it removes itself from the agency -- Remove the coordinator Pod -- Set CR state to `Ready` +Scaling down is possible only when all other actions on the ArangoDeployment are finished. -Note: Scaling is always done 1 server at a time. +The internal process followed by the ArangoDB operator when scaling down is as follows: +- It chooses a member to be evicted. First it will try to remove unhealthy members, or fall back to the member with the highest deletion_priority. +- Using internal calls, it forces the server to resign leadership. + In the case of DB servers this means that all shard leaders will be switched to other servers. +- Wait until the server is cleaned out of the cluster +- The Pod is finalized diff --git a/docs/design/status.md b/docs/design/status.md deleted file mode 100644 index 01b4f2e81..000000000 --- a/docs/design/status.md +++ /dev/null @@ -1,33 +0,0 @@ -# Status - -The status field of the `CustomResource` must contain all persistent state needed to -create & maintain the cluster. - -## `status.state: string` - -This field contains the current status of the cluster. -Possible values are: - -- `Creating` when the cluster is first be created. -- `Ready` when all pods if the cluster are in running state. -- `Scaling` when pods are being added to an existing cluster or removed from an existing cluster. -- `Upgrading` when cluster is in the process of being upgraded to another version. - -## `status.members..[x].state: string` - -This field contains the pod state of server x of this group. -Possible values are: - -- `Creating` when the pod is about to be created. -- `Ready` when the pod has been created. -- `Draining` when a dbserver pod is being drained. -- `ShuttingDown` when a server is in the process of shutting down. - -## `status.members..[x].podName: string` - -This field contains the name of the current pod that runs server x of this group. - -## `status.members..[x].clusterID: string` - -This field contains the unique cluster ID of server x of this group. -The field is only valid for groups `single`, `agents`, `dbservers` & `coordinators`.
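The rewritten `scaling.md` above states that the number of servers is driven by the `count` field of each server group. As a minimal sketch (the group names `dbservers`/`coordinators` and the `v1` apiVersion are assumed here, not taken from this diff), scaling could be expressed as:

```yaml
apiVersion: "database.arangodb.com/v1"
kind: "ArangoDeployment"
metadata:
  name: "cluster"
spec:
  mode: Cluster
  dbservers:
    count: 5        # raise to scale up; lower to scale down (one server at a time)
  coordinators:
    count: 3
```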
diff --git a/docs/design/test_clusters.md b/docs/design/test_clusters.md deleted file mode 100644 index 73d1c461d..000000000 --- a/docs/design/test_clusters.md +++ /dev/null @@ -1,16 +0,0 @@ -# Test clusters - -The ArangoDB operator is tested on various types of kubernetes clusters. - -To prepare a cluster for running the ArangoDB operator tests, -do the following: - -- Create a `kubectl` config file for accessing the cluster. -- Use that config file. -- Run `./scripts/kube_configure_test_cluster.sh`. This creates a `ConfigMap` - named `arango-operator-test` in the `kube-system` namespace containing the - following environment variables. - -```bash -REQUIRE_LOCAL_STORAGE=1 -``` diff --git a/docs/design/testing.md b/docs/design/testing.md deleted file mode 100644 index 25e4680ab..000000000 --- a/docs/design/testing.md +++ /dev/null @@ -1,40 +0,0 @@ -# Testing - -## Scenario's - -The following test scenario's must be covered by automated tests: - -- Creating 1 deployment (all modes, all environments, all storage engines) -- Creating multiple deployments (all modes, all environments, all storage engines), - controlling each individually -- Creating deployment with/without authentication -- Creating deployment with/without TLS - -- Updating deployment wrt: - - Number of servers (scaling, up/down) - - Image version (upgrading, downgrading within same minor version range (e.g. 3.2.x)) - - Immutable fields (should be reset automatically) - -- Resilience: - - Delete individual pods - - Delete individual PVCs - - Delete individual Services - - Delete Node - - Restart Node - - API server unavailable - -- Persistent Volumes: - - hint: RBAC file might need to be changed - - hint: get info via - client-go.CoreV1() - - Number of volumes should stay in reasonable bounds - - For some cases it might be possible to check that, the amount before and after the test stays the same - - A Cluster start should need 6 Volumes (DBServer + Agents) - - The release of a volume-claim should result in a release of the volume - -## Test environments - -- Kubernetes clusters - - Single node - - Multi node - - Access control mode (RBAC, ...) - - Persistent volumes ... 
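The deleted `test_clusters.md` above describes a `ConfigMap` produced by `./scripts/kube_configure_test_cluster.sh`. Purely as an illustration of what that description implies (the actual script may set additional keys), the resulting object would be roughly:

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: arango-operator-test
  namespace: kube-system
data:
  REQUIRE_LOCAL_STORAGE: "1"   # environment variable described in the deleted doc
```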
diff --git a/docs/design/upgrading.md b/docs/design/upgrading.md deleted file mode 100644 index e30937f4c..000000000 --- a/docs/design/upgrading.md +++ /dev/null @@ -1,32 +0,0 @@ -# Upgrade procedure - -## Upgrading ArangoDB single to another version - -The process for upgrading an existing ArangoDB single server -to another version is as follows: - -- Set CR state to `Upgrading` -- Remove the server Pod (keep persistent volume) -- Create a new server Pod with new version -- Wait until server is ready before continuing -- Set CR state to `Ready` - -## Upgrading ArangoDB cluster to another version - -The process for upgrading an existing ArangoDB cluster -to another version is as follows: - -- Set CR state to `Upgrading` -- For each agent: - - Remove the agent Pod (keep persistent volume) - - Create new agent Pod with new version - - Wait until agent is ready before continuing -- For each dbserver: - - Remove the dbserver Pod (keep persistent volume) - - Create new dbserver Pod with new version - - Wait until dbserver is ready before continuing -- For each coordinator: - - Remove the coordinator Pod (keep persistent volume) - - Create new coordinator Pod with new version - - Wait until coordinator is ready before continuing -- Set CR state to `Ready` diff --git a/docs/features/README.md b/docs/features/README.md new file mode 100644 index 000000000..7b14cda09 --- /dev/null +++ b/docs/features/README.md @@ -0,0 +1,41 @@ +## List of Community Edition features + +| Feature | Operator Version | Introduced | ArangoDB Version | ArangoDB Edition | State | Enabled | Flag | Remarks | +|:-----------------------------------------------------|:-----------------|:-----------|:-----------------|:-----------------|:-----------|:--------|:-----|:----------------------------------------------------------------------------| +| AgencyCache | 1.2.30 | 1.2.30 | >= 3.8.0 | Enterprise | Production | True | N/A | Enable Agency Cache mechanism in the Operator (Increase limit of the nodes) | +| Member Maintenance Support | 1.2.25 | 1.2.16 | >= 3.8.0 | Enterprise | Production | True | N/A | Enable Member Maintenance during planned restarts | +| [Rebalancer](rebalancer.md) | 1.2.15 | 1.2.5 | >= 3.8.0 | Enterprise | Production | True | N/A | N/A | +| [TopologyAwareness](../design/topology_awareness.md) | 1.2.4 | 1.2.4 | >= 3.8.0 | Enterprise | Production | True | N/A | N/A | + + +## List of Enterprise Edition features + +| Feature | Operator Version | Introduced | ArangoDB Version | ArangoDB Edition | State | Enabled | Flag | Remarks | +|:----------------------------------------------------------------|:-----------------|:-----------|:-----------------|:----------------------|:-------------|:--------|:------------------------------------------------------|:-----------------------------------------------------------------------------------| +| Enforced ResignLeadership | 1.2.34 | 1.2.34 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.enforced-resign-leadership | Enforce ResignLeadership and ensure that Leaders are moved from restarted DBServer | +| Copy resources spec to init containers | 1.2.33 | 1.2.33 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.init-containers-copy-resources | Copy resources spec to built-in init containers if they are not specified | +| [Rebalancer V2](rebalancer_v2.md) | 1.2.31 | 1.2.31 | >= 3.10.0 | Community, Enterprise | Alpha | False | --deployment.feature.rebalancer-v2 | N/A | +| [Secured containers](secured_containers.md) | 
1.2.31 | 1.2.31 | >= 3.8.0 | Community, Enterprise | Alpha | False | --deployment.feature.secured-containers | If set to True Operator will run containers in secure mode | +| Version Check V2 | 1.2.31 | 1.2.31 | >= 3.8.0 | Community, Enterprise | Alpha | False | --deployment.feature.upgrade-version-check-V2 | N/A | +| [Operator Ephemeral Volumes](ephemeral_volumes.md) | 1.2.31 | 1.2.2 | >= 3.8.0 | Community, Enterprise | Beta | False | --deployment.feature.ephemeral-volumes | N/A | +| [Force Rebuild Out Synced Shards](rebuild_out_synced_shards.md) | 1.2.27 | 1.2.27 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.force-rebuild-out-synced-shards | It should be used only if user is aware of the risks. | +| [Spec Default Restore](deployment_spec_defaults.md) | 1.2.25 | 1.2.21 | >= 3.8.0 | Community, Enterprise | Beta | True | --deployment.feature.deployment-spec-defaults-restore | If set to False Operator will not change ArangoDeployment Spec | +| Version Check | 1.2.23 | 1.1.4 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.upgrade-version-check | N/A | +| [Failover Leader service](failover_leader_service.md) | 1.2.13 | 1.2.13 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.failover-leadership | N/A | +| Graceful Restart | 1.2.5 | 1.0.7 | >= 3.8.0 | Community, Enterprise | Production | True | ---deployment.feature.graceful-shutdown | N/A | +| Optional Graceful Restart | 1.2.0 | 1.2.5 | >= 3.8.0 | Community, Enterprise | Production | False | --deployment.feature.optional-graceful-shutdown | N/A | +| Operator Internal Metrics Exporter | 1.2.0 | 1.2.0 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.metrics-exporter | N/A | +| Operator Maintenance Management Support | 1.2.0 | 1.0.7 | >= 3.8.0 | Community, Enterprise | Production | True | --deployment.feature.maintenance | N/A | +| Encryption Key Rotation Support | 1.2.0 | 1.0.3 | >= 3.8.0 | Enterprise | NotSupported | False | --deployment.feature.encryption-rotation | N/A | +| TLS Runtime Rotation Support | 1.1.0 | 1.0.4 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.tls-rotation | N/A | +| JWT Rotation Support | 1.1.0 | 1.0.3 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.jwt-rotation | N/A | +| Operator Single Mode | 1.0.4 | 1.0.4 | >= 3.8.0 | Community, Enterprise | Production | False | --mode.single | Only 1 instance of Operator allowed in namespace when feature is enabled | +| TLS SNI Support | 1.0.3 | 1.0.3 | >= 3.8.0 | Enterprise | Production | True | --deployment.feature.tls-sni | N/A | +| Disabling of liveness probes | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | +| Pod Disruption Budgets | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | +| Prometheus Metrics Exporter | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | Prometheus required | +| Sidecar Containers | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | +| Volume Claim Templates | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | +| Volume Resizing | 0.3.11 | 0.3.10 | >= 3.8.0 | Community, Enterprise | Production | True | N/A | N/A | + + diff --git a/docs/design/features/deployment_spec_defaults.md b/docs/features/deployment_spec_defaults.md similarity index 100% rename from docs/design/features/deployment_spec_defaults.md rename to 
docs/features/deployment_spec_defaults.md diff --git a/docs/design/features/ephemeral_volumes.md b/docs/features/ephemeral_volumes.md similarity index 100% rename from docs/design/features/ephemeral_volumes.md rename to docs/features/ephemeral_volumes.md diff --git a/docs/design/features/failover_leader_service.md b/docs/features/failover_leader_service.md similarity index 100% rename from docs/design/features/failover_leader_service.md rename to docs/features/failover_leader_service.md diff --git a/docs/design/features/rebalancer.md b/docs/features/rebalancer.md similarity index 100% rename from docs/design/features/rebalancer.md rename to docs/features/rebalancer.md diff --git a/docs/design/features/rebalancer_v2.md b/docs/features/rebalancer_v2.md similarity index 100% rename from docs/design/features/rebalancer_v2.md rename to docs/features/rebalancer_v2.md diff --git a/docs/design/features/rebuild_out_synced_shards.md b/docs/features/rebuild_out_synced_shards.md similarity index 100% rename from docs/design/features/rebuild_out_synced_shards.md rename to docs/features/rebuild_out_synced_shards.md diff --git a/docs/design/features/secured_containers.md b/docs/features/secured_containers.md similarity index 88% rename from docs/design/features/secured_containers.md rename to docs/features/secured_containers.md index 9de169207..08870c7cd 100644 --- a/docs/design/features/secured_containers.md +++ b/docs/features/secured_containers.md @@ -15,7 +15,7 @@ Change Default settings of: ## Dependencies -- [Operator Ephemeral Volumes](./ephemeral_volumes.md) should be Enabled and Supported. +- [Operator Ephemeral Volumes](ephemeral_volumes.md) should be Enabled and Supported. ## How to use diff --git a/docs/how-to/README.md b/docs/how-to/README.md new file mode 100644 index 000000000..775268386 --- /dev/null +++ b/docs/how-to/README.md @@ -0,0 +1,12 @@ +## How-to... 
+ +- [Pass additional params to operator](additional_configuration.md) +- [Change architecture / enable ARM support](arch_change.md) +- [Configure timezone for cluster](configuring_tz.md) +- [Collect debug data for support case](debugging.md) +- [Configure logging](logging.md) +- [Enable maintenance mode](maintenance.md) +- [Start metrics collection and monitoring](metrics.md) +- [Override detected total memory](override_detected_memory.md) +- [Manually recover cluster if you still have volumes with data](recovery.md) +- [How to rotate Pod](rotate-pod.md) diff --git a/docs/design/additional_configuration.md b/docs/how-to/additional_configuration.md similarity index 100% rename from docs/design/additional_configuration.md rename to docs/how-to/additional_configuration.md diff --git a/docs/design/arch_change.md b/docs/how-to/arch_change.md similarity index 100% rename from docs/design/arch_change.md rename to docs/how-to/arch_change.md diff --git a/docs/design/configuring_tz.md b/docs/how-to/configuring_tz.md similarity index 100% rename from docs/design/configuring_tz.md rename to docs/how-to/configuring_tz.md diff --git a/docs/design/debugging.md b/docs/how-to/debugging.md similarity index 100% rename from docs/design/debugging.md rename to docs/how-to/debugging.md diff --git a/docs/design/logging.md b/docs/how-to/logging.md similarity index 92% rename from docs/design/logging.md rename to docs/how-to/logging.md index fb9f64774..03b128a90 100644 --- a/docs/design/logging.md +++ b/docs/how-to/logging.md @@ -5,7 +5,7 @@ ### Log level To adjust logging level of the operator, you can use `operator.args` in chart template value -as described in [Additional configuration](./additional_configuration.md). +as described in [Additional configuration](additional_configuration.md). For example, to set log level to `INFO` and `DEBUG` for `requests` package, you can use the following value: ```yaml diff --git a/docs/how-to/maintenance.md b/docs/how-to/maintenance.md new file mode 100644 index 000000000..b016ad86c --- /dev/null +++ b/docs/how-to/maintenance.md @@ -0,0 +1,29 @@ +# Maintenance mode + +## ArangoDeployment maintenance + +When enabled, the operator will pause the reconciliation loop for the specified ArangoDeployment. + +Maintenance on an ArangoDeployment can be enabled using an annotation. + +Key: `deployment.arangodb.com/maintenance` +Value: `true` + +To enable maintenance mode for an ArangoDeployment, the following kubectl command can be used: +`kubectl annotate arangodeployment deployment deployment.arangodb.com/maintenance=true` + +To disable maintenance mode for an ArangoDeployment, the following kubectl command can be used: +`kubectl annotate --overwrite arangodeployment deployment deployment.arangodb.com/maintenance-` + +## Cluster maintenance + +It is possible to put the ArangoDB cluster into [agency supervision mode](https://docs.arangodb.com/3.11/develop/http/cluster/#maintenance). + +Use the `spec.database.maintenance` field of the ArangoDeployment CR to configure this: +```yaml +spec: + # ... + database: + maintenance: true + +``` diff --git a/docs/design/metrics.md b/docs/how-to/metrics.md similarity index 97% rename from docs/design/metrics.md rename to docs/how-to/metrics.md index 2e31770b6..f66ca446d 100644 --- a/docs/design/metrics.md +++ b/docs/how-to/metrics.md @@ -1,10 +1,10 @@ -# Metrics +# Metrics collection Operator provides metrics of its operations in a format supported by [Prometheus](https://prometheus.io/). The metrics are exposed through HTTPS on port `8528` under path `/metrics`.
-For a full list of available metrics, see [here](./../generated/metrics/README.md). +For a full list of available metrics, see [here](../generated/metrics/README.md). #### Contents - [Integration with standard Prometheus installation (no TLS)](#Integration-with-standard-Prometheus-installation-no-TLS) diff --git a/docs/design/resource_management.md b/docs/how-to/override_detected_memory.md similarity index 94% rename from docs/design/resource_management.md rename to docs/how-to/override_detected_memory.md index 324466a29..1954b8811 100644 --- a/docs/design/resource_management.md +++ b/docs/how-to/override_detected_memory.md @@ -1,4 +1,4 @@ -# Resource Management +# Override detected total memory ## overrideDetectedTotalMemory diff --git a/docs/design/recovery.md b/docs/how-to/recovery.md similarity index 100% rename from docs/design/recovery.md rename to docs/how-to/recovery.md diff --git a/docs/design/rotating.md b/docs/how-to/rotate-pod.md similarity index 71% rename from docs/design/rotating.md rename to docs/how-to/rotate-pod.md index bffc003c6..94264b7e7 100644 --- a/docs/design/rotating.md +++ b/docs/how-to/rotate-pod.md @@ -1,10 +1,8 @@ -# Rotation - -## ArangoDeployment +# How to rotate Pod Rotation of ArangoDeployment Pods can be triggered by Pod deletion or by annotation (safe way). -Using annotation Pods gonna be rotated one-by-one which will keep cluster alive. +Using the annotation is the preferred way to rotate Pods while keeping the cluster in a healthy state. Key: `deployment.arangodb.com/rotate` Value: `true` diff --git a/docs/providers/README.md b/docs/providers/README.md deleted file mode 100644 index d59b40570..000000000 --- a/docs/providers/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# Supported Providers - -- [Amazon EKS](./eks/README.md) \ No newline at end of file diff --git a/internal/docs_test.go b/internal/docs_test.go index 4e25d3f29..a6cf35534 100644 --- a/internal/docs_test.go +++ b/internal/docs_test.go @@ -141,26 +141,25 @@ func Test_GenerateAPIDocs(t *testing.T) { root := os.Getenv("ROOT") require.NotEmpty(t, root) - generateDocs(t, map[string]map[string]interface{}{ + docs := map[string]map[string]interface{}{ "ArangoDeployment.V1": { "Spec": api.ArangoDeployment{}.Spec, }, - }, - fmt.Sprintf("%s/pkg/apis/deployment/v1", root)) - - generateDocs(t, map[string]map[string]interface{}{ "ArangoMember.V1": { "Spec": api.ArangoMember{}.Spec, }, - }, - fmt.Sprintf("%s/pkg/apis/deployment/v1", root)) + } + resultPaths := generateDocs(t, docs, fmt.Sprintf("%s/pkg/apis/deployment/v1", root)) + + generateIndex(t, resultPaths) } -func generateDocs(t *testing.T, objects map[string]map[string]interface{}, paths ...string) { +func generateDocs(t *testing.T, objects map[string]map[string]interface{}, paths ...string) map[string]string { root := os.Getenv("ROOT") require.NotEmpty(t, root) docs, fs := getDocs(t, paths...)
+ outPaths := make(map[string]string) for object, sections := range objects { t.Run(object, func(t *testing.T) { @@ -237,7 +236,10 @@ func generateDocs(t *testing.T, objects map[string]map[string]interface{}, paths }) } - out, err := os.OpenFile(path.Join(root, "docs/api", fmt.Sprintf("%s.md", object)), os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) + fileName := fmt.Sprintf("%s.md", object) + outPaths[object] = fileName + outPath := path.Join(root, "docs/api", fmt.Sprintf("%s.md", object)) + out, err := os.OpenFile(outPath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) require.NoError(t, err) defer func() { @@ -254,6 +256,26 @@ func generateDocs(t *testing.T, objects map[string]map[string]interface{}, paths } }) } + return outPaths +} + +func generateIndex(t *testing.T, apiDocs map[string]string) { + root := os.Getenv("ROOT") + require.NotEmpty(t, root) + outPath := path.Join(root, "docs/api/README.md") + + out, err := os.OpenFile(outPath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) + require.NoError(t, err) + defer func() { + require.NoError(t, out.Close()) + }() + + write(t, out, "# Custom Resources API Reference\n\n") + + for name, filePath := range apiDocs { + write(t, out, " - [%s](./%s)\n", name, filePath) + } + write(t, out, "\n") } func write(t *testing.T, out io.Writer, format string, args ...interface{}) { diff --git a/internal/features.yaml b/internal/features.yaml index 0637631e2..8bc50a283 100644 --- a/internal/features.yaml +++ b/internal/features.yaml @@ -128,7 +128,7 @@ features: - operatorVersion: 1.2.0 state: Production - name: Operator Ephemeral Volumes - doc: docs/design/features/ephemeral_volumes.md + doc: docs/features/ephemeral_volumes.md flag: --deployment.feature.ephemeral-volumes enabled: false releases: @@ -137,14 +137,14 @@ features: - operatorVersion: 1.2.31 state: Beta - name: Failover Leader service - doc: docs/design/features/failover_leader_service.md + doc: docs/features/failover_leader_service.md flag: --deployment.feature.failover-leadership enabled: false releases: - operatorVersion: 1.2.13 state: Production - name: Spec Default Restore - doc: docs/design/features/deployment_spec_defaults.md + doc: docs/features/deployment_spec_defaults.md flag: --deployment.feature.deployment-spec-defaults-restore enabled: true remarks: If set to False Operator will not change ArangoDeployment Spec @@ -154,7 +154,7 @@ features: - operatorVersion: 1.2.25 state: Beta - name: Force Rebuild Out Synced Shards - doc: docs/design/features/rebuild_out_synced_shards.md + doc: docs/features/rebuild_out_synced_shards.md flag: --deployment.feature.force-rebuild-out-synced-shards enabled: false remarks: It should be used only if user is aware of the risks. 
@@ -162,7 +162,7 @@ features: - operatorVersion: 1.2.27 state: Production - name: Rebalancer - doc: docs/design/features/rebalancer.md + doc: docs/features/rebalancer.md enabled: true operatorEditions: Enterprise arangoDBEditions: Enterprise @@ -172,7 +172,7 @@ features: - operatorVersion: 1.2.15 state: Production - name: Rebalancer V2 - doc: docs/design/features/rebalancer_v2.md + doc: docs/features/rebalancer_v2.md arangoDBVersion: ">= 3.10.0" flag: --deployment.feature.rebalancer-v2 enabled: false @@ -180,7 +180,7 @@ features: - operatorVersion: 1.2.31 state: Alpha - name: Secured containers - doc: docs/design/features/secured_containers.md + doc: docs/features/secured_containers.md flag: --deployment.feature.secured-containers enabled: false remarks: If set to True Operator will run containers in secure mode diff --git a/internal/features_test.go b/internal/features_test.go new file mode 100644 index 000000000..d203645f1 --- /dev/null +++ b/internal/features_test.go @@ -0,0 +1,55 @@ +// +// DISCLAIMER +// +// Copyright 2016-2023 ArangoDB GmbH, Cologne, Germany +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Copyright holder is ArangoDB GmbH, Cologne, Germany +// + +package internal + +import ( + "os" + "path" + "testing" + + "github.com/stretchr/testify/require" +) + +func Test_GenerateFeaturesIndex(t *testing.T) { + root := os.Getenv("ROOT") + require.NotEmpty(t, root) + + outPath := path.Join(root, "docs/features/README.md") + + out, err := os.OpenFile(outPath, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644) + require.NoError(t, err) + defer func() { + require.NoError(t, out.Close()) + }() + + const basePath = "docs/features" + write(t, out, "## List of Community Edition features\n") + section, err := GenerateReadmeFeatures(root, basePath, true) + require.NoError(t, err) + write(t, out, section) + write(t, out, "\n") + + write(t, out, "## List of Enterprise Edition features\n") + section, err = GenerateReadmeFeatures(root, basePath, false) + require.NoError(t, err) + write(t, out, section) + write(t, out, "\n") +} diff --git a/internal/readme.go b/internal/readme.go index cfc48d5a3..04f73b769 100644 --- a/internal/readme.go +++ b/internal/readme.go @@ -24,6 +24,7 @@ import ( "fmt" "os" "path" + "path/filepath" "sort" "strings" @@ -109,13 +110,14 @@ func GenerateReadme(root string) error { readmeSections["kubernetesVersionsTable"] = section } - if section, err := GenerateReadmeFeatures(root, true); err != nil { + const basePath = "" + if section, err := GenerateReadmeFeatures(root, basePath, true); err != nil { return err } else { readmeSections["featuresEnterpriseTable"] = section } - if section, err := GenerateReadmeFeatures(root, false); err != nil { + if section, err := GenerateReadmeFeatures(root, basePath, false); err != nil { return err } else { readmeSections["featuresCommunityTable"] = section @@ -134,7 +136,7 @@ func GenerateReadme(root string) error { return nil } -func GenerateReadmeFeatures(root string, eeOnly bool) (string, error) { +func 
GenerateReadmeFeatures(root, basePath string, eeOnly bool) (string, error) { feature := md.NewColumn("Feature", md.ColumnLeftAlign) introduced := md.NewColumn("Introduced", md.ColumnLeftAlign) oVersion := md.NewColumn("Operator Version", md.ColumnLeftAlign) @@ -204,7 +206,12 @@ func GenerateReadmeFeatures(root string, eeOnly bool) (string, error) { n := f.Name if v := util.First(r.Doc, f.Doc); v != nil { - n = fmt.Sprintf("[%s](%s)", n, *v) + p, err := filepath.Rel(basePath, *v) + if err != nil { + return "", err + } + + n = fmt.Sprintf("[%s](%s)", n, p) } if err := t.AddRow(map[md.Column]string{ diff --git a/pkg/apis/deployment/v1/deployment_spec.go b/pkg/apis/deployment/v1/deployment_spec.go index 535f6f783..9c0b6567f 100644 --- a/pkg/apis/deployment/v1/deployment_spec.go +++ b/pkg/apis/deployment/v1/deployment_spec.go @@ -252,7 +252,7 @@ type DeploymentSpec struct { // Architecture defines the list of supported architectures. // First element on the list is marked as default architecture. - // +doc/link: Architecture Change|/docs/design/arch_change.md + // +doc/link: Architecture Change|/docs/how-to/arch_change.md // +doc/type: []string // +doc/default: ['amd64'] Architecture ArangoDeploymentArchitecture `json:"architecture,omitempty"`
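The `deployment_spec.go` hunk above only updates the documentation link for the `Architecture` field. For context, a minimal sketch of how that field is set in an `ArangoDeployment` (values assumed; the default remains `['amd64']`) could look like this:

```yaml
apiVersion: "database.arangodb.com/v1"
kind: "ArangoDeployment"
metadata:
  name: "cluster"
spec:
  mode: Cluster
  # The first entry is treated as the default architecture; see docs/how-to/arch_change.md
  architecture:
    - arm64
    - amd64
```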