diff --git a/.github/workflows/auto-approve-pr.yml b/.github/workflows/auto-approve-pr.yml deleted file mode 100644 index 1840f92f..00000000 --- a/.github/workflows/auto-approve-pr.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: Auto-approve a pull request - -on: - pull_request - -env: - PR_OWNER: ${{ github.event.pull_request.user.login }} - GITHUB_OAUTH_TOKEN: ${{ secrets.DOCUMENT_REVIEW_GITHUB }} - TEAM_NAME: "WebOps" - -jobs: - check-diff: - runs-on: ${{ matrix.os }} - - strategy: - matrix: - os: [ubuntu-latest] - - steps: - - name: Checkout PR code - uses: actions/checkout@v4 - - run: | - git fetch --no-tags --prune --depth=1 origin +refs/heads/*:refs/remotes/origin/* - - name: Run git diff against repository - run: | - git diff origin/main HEAD > changes - - name: Auto-approval check - id: approve_pr_check - uses: ministryofjustice/cloud-platform-doc-checker@v1.0.0 - env: - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - - - name: Approving PR - uses: hmarr/auto-approve-action@v2 - - if: steps.approve_pr_check.outputs.review_pr == 'true' - with: - github-token: "${{ secrets.GITHUB_TOKEN }}" diff --git a/.github/workflows/dependency-review.yml b/.github/workflows/dependency-review.yml index 0925fb8a..0c730823 100644 --- a/.github/workflows/dependency-review.yml +++ b/.github/workflows/dependency-review.yml @@ -15,7 +15,7 @@ jobs: - name: Checkout Repository uses: actions/checkout@v4 - name: Dependency Review - uses: actions/dependency-review-action@v2 + uses: actions/dependency-review-action@v4 with: # Possible values: critical, high, moderate, low fail-on-severity: critical diff --git a/.github/workflows/format-code.yml b/.github/workflows/format-code.yml index 6150b114..6ae7167b 100644 --- a/.github/workflows/format-code.yml +++ b/.github/workflows/format-code.yml @@ -9,6 +9,6 @@ jobs: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - - uses: ministryofjustice/github-actions/code-formatter@e08cbcac12ec9c09d867ab2b803d4ea1a87300ad # v18.2.4 + - uses: ministryofjustice/github-actions/code-formatter@ccf9e3a4a828df1ec741f6c8e6ed9d0acaef3490 # v18.5.0 env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/link-checker.yml b/.github/workflows/link-checker.yml index 9da9f894..b1a5eb74 100644 --- a/.github/workflows/link-checker.yml +++ b/.github/workflows/link-checker.yml @@ -13,7 +13,7 @@ jobs: - uses: actions/checkout@v4 - name: Link Checker - uses: lycheeverse/lychee-action@v1.9.1 + uses: lycheeverse/lychee-action@v2.1.0 with: args: --verbose --no-progress **/*.md **/*.html **/*.erb --accept 200,429,403,400,301,302,401 --exclude-mail env: @@ -21,7 +21,7 @@ jobs: - name: Create Issue From File if: env.lychee_exit_code != 0 - uses: peter-evans/create-issue-from-file@v3 + uses: peter-evans/create-issue-from-file@v5 with: title: Link Checker Report content-filepath: ./lychee/out.md diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml index 01df813e..55a8347f 100644 --- a/.github/workflows/publish.yml +++ b/.github/workflows/publish.yml @@ -22,7 +22,7 @@ jobs: apk update && apk add rsync which rsync - name: Deploy - uses: JamesIves/github-pages-deploy-action@v4.3.3 + uses: JamesIves/github-pages-deploy-action@v4.7.1 with: token: ${{ secrets.PUBLISHING_GIT_TOKEN }} git-config-name: cloud-platform-moj diff --git a/architecture-decision-record/022-EKS.md b/architecture-decision-record/022-EKS.md index 94942929..8480dc45 100644 --- a/architecture-decision-record/022-EKS.md +++ b/architecture-decision-record/022-EKS.md @@ -1,6 +1,6 @@ # EKS -Date: 
02/05/2021 +Date: 11/11/2024 ## Status @@ -14,13 +14,13 @@ Use Amazon EKS for running the main cluster, which hosts MOJ service teams' appl Benefits of EKS: -* a managed control plane (master nodes), reducing operational overhead compared to kOps, such as scaling the control plane nodes. And reduces risk to k8s API availability, if there was a sudden increase in k8s API traffic. -* [managed nodes](https://docs.aws.amazon.com/eks/latest/userguide/managed-node-groups.html), further reducing operational overhead -* Kubernetes upgrades are smoother: - * kOps rolling upgrades have been problematic. e.g. during 1.18 to 1.19 upgrade kOps caused us to have to [work around a networking issue](https://docs.google.com/document/d/1HzmTk0IvuW1XsXmVJEsSOzB4jkiMpjwZWlUwIF7P9Gc/edit) - * CP team sees kOps upgrades as particularly stressful, and 3rd on our risk register -* it opens the door to using [ELB for ingress](https://docs.aws.amazon.com/eks/latest/userguide/aws-load-balancer-controller.html). Being managed, it is seen as preferable to self-managed nginx, which requires upgrades, scaling etc. -* avoid security challenge of managing tokens that are exported with `kops export kubeconfig` +- a managed control plane (master nodes), reducing operational overhead compared to kOps, such as scaling the control plane nodes. And reduces risk to k8s API availability, if there was a sudden increase in k8s API traffic. +- [managed nodes](https://docs.aws.amazon.com/eks/latest/userguide/managed-node-groups.html), further reducing operational overhead +- Kubernetes upgrades are smoother: + - kOps rolling upgrades have been problematic. e.g. during 1.18 to 1.19 upgrade kOps caused us to have to [work around a networking issue](https://docs.google.com/document/d/1HzmTk0IvuW1XsXmVJEsSOzB4jkiMpjwZWlUwIF7P9Gc/edit) + - CP team sees kOps upgrades as particularly stressful, and 3rd on our risk register +- it opens the door to using [ELB for ingress](https://docs.aws.amazon.com/eks/latest/userguide/aws-load-balancer-controller.html). Being managed, it is seen as preferable to self-managed nginx, which requires upgrades, scaling etc. +- avoid security challenge of managing tokens that are exported with `kops export kubeconfig` We already run the Manager cluster on EKS, and have gained a lot of insight and experience of using it. @@ -32,15 +32,15 @@ We already run the Manager cluster on EKS, and have gained a lot of insight and Developers in service teams need to use the k8s auth, and GitHub continues to be the most common SSO amongst them with good tie-in to JML processes - see [ADR 6 Use GitHub as our identity provider](006-Use-github-as-user-directory.md) -Auth0 is useful as a broker, for a couple of important [rules that it runs at login time](https://github.com/ministryofjustice/cloud-platform-infrastructure/tree/main/terraform/global-resources/resources/auth0-rules): +Auth0 is useful as a broker, for a couple of important [rules that it runs at login time](https://github.com/ministryofjustice/cloud-platform-terraform-global-resources-auth0): -* it ensures that the user is in the ministryofjustice GitHub organization, so only staff can get a kubeconfig and login to CP websites like Grafana -* it inserts the user's GitHub teams into the OIDC ID token as claims. 
These are used by k8s RBAC to authorize the user for the correct namespaces +- it ensures that the user is in the ministryofjustice GitHub organization, so only staff can get a kubeconfig and login to CP websites like Grafana +- it inserts the user's GitHub teams into the OIDC ID token as claims. These are used by k8s RBAC to authorize the user for the correct namespaces Future options: -* Azure AD SSO is growing in MOJ - there's a case for switching to that, if it is adopted amongst our users -* IAM auth has the benefit of immediately revoking access. Maybe we could use federated login with GitHub? (But would that give only temporary kubecfg?) Or sync the GitHub team info to IAM? +- Azure AD SSO is growing in MOJ - there's a case for switching to that, if it is adopted amongst our users +- IAM auth has the benefit of immediately revoking access. Maybe we could use federated login with GitHub? (But would that give only temporary kubecfg?) Or sync the GitHub team info to IAM? **Status**: Completed 2/6/21 [#2854](https://github.com/ministryofjustice/cloud-platform/issues/2854) @@ -52,12 +52,12 @@ We've long used Kuberos for issuing kubecfg credentials to users. The [original Other options considered: -* [Gangway](https://github.com/heptiolabs/gangway) - similar to Kuberos, it has not had releases for 2 years (v3.2.0) -* [kubelogin](https://github.com/int128/kubelogin) - * CP team would have to distribute the client secret to all users. It seems odd to go to the trouble of securely sharing that secret, to overcome the perceived difficulty of issuing kubecfg credentials. - * Requires all of our users to install the software, rather than doing it server-side centrally -* [kubehook](https://github.com/negz/kubehook) - not compatible with EKS - doesn't support web hook authn -* [dex](https://github.com/dexidp/dex) - doesn't have a web front-end for issuing creds - it is more of an OIDC broker +- [Gangway](https://github.com/heptiolabs/gangway) - similar to Kuberos, it has not had releases for 2 years (v3.2.0) +- [kubelogin](https://github.com/int128/kubelogin) + - CP team would have to distribute the client secret to all users. It seems odd to go to the trouble of securely sharing that secret, to overcome the perceived difficulty of issuing kubecfg credentials. + - Requires all of our users to install the software, rather than doing it server-side centrally +- [kubehook](https://github.com/negz/kubehook) - not compatible with EKS - doesn't support web hook authn +- [dex](https://github.com/dexidp/dex) - doesn't have a web front-end for issuing creds - it is more of an OIDC broker **Status:** Completed 24/6/21 [#1254](https://github.com/ministryofjustice/cloud-platform/issues/2945) @@ -75,9 +75,9 @@ We'll continue to use our existing RBAC configuration from the previous cluster. Options: -* Self-managed nodes -* Managed node groups - automates various aspects of the node lifecycle, including creating the EC2s, the auto scaling group, registration of nodes with kubernetes and recycling nodes -* Fargate nodes - fully automated nodes, the least to manage. Benefits from more isolation between pods and automatic scaling. Doesn't support daemonsets. +- Self-managed nodes +- Managed node groups - automates various aspects of the node lifecycle, including creating the EC2s, the auto scaling group, registration of nodes with kubernetes and recycling nodes +- Fargate nodes - fully automated nodes, the least to manage. Benefits from more isolation between pods and automatic scaling. 
Doesn't support daemonsets. We aim to take advantage of as much automation as possible, to minimize the team's operational overhead and risk. Initially we'll use managed node groups, before looking at Fargate for workloads. @@ -85,18 +85,18 @@ We aim to take advantage of as much automation as possible, to minimize the team #### Future Fargate considerations -*Pod limits* - there is a quota limit of [500 Fargate pods per region per AWS Account](https://aws.amazon.com/about-aws/whats-new/2020/09/aws-fargate-increases-default-resource-count-service-quotas/) which could be an issue, considering we currently run ~2000 pods. We can request AWS raise the limit - not currently sure what scope there is. With Multi-cluster stage 5, the separation of loads into different AWS accounts will settle this issue. +_Pod limits_ - there is a quota limit of [500 Fargate pods per region per AWS Account](https://aws.amazon.com/about-aws/whats-new/2020/09/aws-fargate-increases-default-resource-count-service-quotas/) which could be an issue, considering we currently run ~2000 pods. We can request AWS raise the limit - not currently sure what scope there is. With Multi-cluster stage 5, the separation of loads into different AWS accounts will settle this issue. -*Daemonset functionality* - needs replacement: +_Daemonset functionality_ - needs replacement: -* fluent-bit - currently used for log shipping to ElasticSearch. AWS provides a managed version of [Fluent Bit on Fargate](https://aws.amazon.com/blogs/containers/fluent-bit-for-amazon-eks-on-aws-fargate-is-here/) which can be configured to ship logs to ElasticSearch. -* prometheus-node-exporter - currently used to export node metrics to prometheus. In Fargate the node itself is managed by AWS and therefore hidden. However we can [collect some useful metrics about pods running in Fargate from scraping cAdvisor](https://aws.amazon.com/blogs/containers/monitoring-amazon-eks-on-aws-fargate-using-prometheus-and-grafana/), including on CPU, memory, disk and network +- fluent-bit - currently used for log shipping to ElasticSearch. AWS provides a managed version of [Fluent Bit on Fargate](https://aws.amazon.com/blogs/containers/fluent-bit-for-amazon-eks-on-aws-fargate-is-here/) which can be configured to ship logs to ElasticSearch. +- prometheus-node-exporter - currently used to export node metrics to prometheus. In Fargate the node itself is managed by AWS and therefore hidden. However we can [collect some useful metrics about pods running in Fargate from scraping cAdvisor](https://aws.amazon.com/blogs/containers/monitoring-amazon-eks-on-aws-fargate-using-prometheus-and-grafana/), including on CPU, memory, disk and network -*No EBS support* - Prometheus will run still in a managed node group. Likely other workloads too to consider. +_No EBS support_ - Prometheus will run still in a managed node group. Likely other workloads too to consider. -*how people check the status of their deployments* - to be investigated +_how people check the status of their deployments_ - to be investigated -*ingress can't be nginx? - just the load balancer in front* - to be investigated +_ingress can't be nginx? - just the load balancer in front_ - to be investigated If we don't use Fargate then we should take advantage of Spot instances for reduced costs. However Fargate is the priority, because the main driver here is engineer time, not EC2 cost. 
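As a point of reference for the node group discussion above, the sketch below shows the general shape of a managed node group definition in eksctl-style YAML. The Cloud Platform clusters are actually provisioned with Terraform, so this is illustrative only, and the cluster name and sizes here are hypothetical:

```
# Illustrative only: an eksctl-style managed node group. The real node groups
# are defined in Terraform; the name and sizes here are made up.
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: cp-example
  region: eu-west-2
managedNodeGroups:
  - name: default-ng
    instanceType: r5.xlarge
    minSize: 3
    maxSize: 30
    desiredCapacity: 10
    labels:
      role: worker
```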
@@ -124,12 +124,12 @@ The choice of AWS CNI networking in the new cluster, initially added a constrain So the primary choice is: -* r5.xlarge - 4 vCPUs, 32GB memory +- r5.xlarge - 4 vCPUs, 32GB memory With fallbacks (should the cloud provider run out of these in the AZ): -* r5.2xlarge - 8 vCPUs, 64GB memory -* r5a.xlarge - 4 vCPUs, 32GB memory +- r5.2xlarge - 8 vCPUs, 64GB memory +- r5a.xlarge - 4 vCPUs, 32GB memory In the future we might consider the ARM processor ranges, but we'd need to consider the added complexity of cross-compiled container images. @@ -137,12 +137,13 @@ In the future we might consider the ARM processor ranges, but we'd need to consi The existing cluster uses r5.2xlarge, so we'll continue with that, and add some fall-backs: -* r5.2xlarge - memory optimized range - 8 cores 64 GB -* r4.2xlarge - memory optimized range - 8 cores 61 GB +- r5.2xlarge - memory optimized range - 8 cores 64 GB +- r4.2xlarge - memory optimized range - 8 cores 61 GB **Status:** -* 1/9/21 r5.xlarge is in place for main node group - a temporarily a high number of instances -* IP prefixes is in the backlog [#3086](https://github.com/ministryofjustice/cloud-platform/issues/3086) + +- 1/9/21 r5.xlarge is in place for main node group - a temporarily a high number of instances +- IP prefixes is in the backlog [#3086](https://github.com/ministryofjustice/cloud-platform/issues/3086) ### Pod networking (CNI) @@ -154,16 +155,16 @@ AWS's CNI is used for the pod networking (IPAM, CNI and Routing). Each pod is gi Advantages of AWS's CNI: -* it is the default with EKS, native to AWS, is fully supported by AWS - low management overhead -* offers good network performance +- it is the default with EKS, native to AWS, is fully supported by AWS - low management overhead +- offers good network performance -The concern with AWS's CNI would be that it uses an IP address for every pod, and there is a [limit per node](https://github.com/awslabs/amazon-eks-ami/blob/master/files/eni-max-pods.txt), depending on the EC2 instance type and the number of ENIs it supports. The calculations in [Node Instance Types](#node-instance-types) show that with a change of instance type, the cost of the cluster increases by 17% or $8k, which is acceptable - likely less than the engineering cost of maintaining and supporting full Calico networking and custom node image. +The concern with AWS's CNI would be that it uses an IP address for every pod, and there is a [limit per node](https://github.com/awslabs/amazon-eks-ami/blob/main/nodeadm/internal/kubelet/eni-max-pods.txt), depending on the EC2 instance type and the number of ENIs it supports. The calculations in [Node Instance Types](#node-instance-types) show that with a change of instance type, the cost of the cluster increases by 17% or $8k, which is acceptable - likely less than the engineering cost of maintaining and supporting full Calico networking and custom node image. The alternative considered was [Calico networking](https://docs.projectcalico.org/getting-started/kubernetes/managed-public-cloud/eks#install-eks-with-calico-networking). This has the advantage of not needing an IP address per pod, and associated instance limit. And it is open source. However: -* We wouldn't have any support from the cloud provider if there were networking issues. -* We have to maintain a customized image with Calico installed. It's likely that changes to EKS over time will frequently cause breakages with this networking setup. 
-* Installation requires recycling the nodes, which is not a good fit with declarative config. +- We wouldn't have any support from the cloud provider if there were networking issues. +- We have to maintain a customized image with Calico installed. It's likely that changes to EKS over time will frequently cause breakages with this networking setup. +- Installation requires recycling the nodes, which is not a good fit with declarative config. **Status**: Completed 2/6/21 [#2854](https://github.com/ministryofjustice/cloud-platform/issues/2854) @@ -187,13 +188,14 @@ Cluster auto-scaling should be considered soon though. This is to embrace one of Considerations for auto-scaler: -* we need to maintain spare capacity, so that workloads that scale up don't have to wait for nodes to start-up, which can take about 7 minutes. This may require some tuning. -* tenants should be encouraged to auto-scale their pods effectively (e.g. using the Horizontal pod autoscaler), to capitalize on cluster auto-scaling -* scaling down non-prod namespaces will need agreement from service teams +- we need to maintain spare capacity, so that workloads that scale up don't have to wait for nodes to start-up, which can take about 7 minutes. This may require some tuning. +- tenants should be encouraged to auto-scale their pods effectively (e.g. using the Horizontal pod autoscaler), to capitalize on cluster auto-scaling +- scaling down non-prod namespaces will need agreement from service teams **Status:** -* 18/8/21 Manual scaling in place [#3033](https://github.com/ministryofjustice/cloud-platform/issues/3033) -* 23/9/21 Auto-scaler is still desired + +- 18/8/21 Manual scaling in place [#3033](https://github.com/ministryofjustice/cloud-platform/issues/3033) +- 23/9/21 Auto-scaler is still desired ### Network policy enforcement @@ -231,29 +233,30 @@ The logs go to CloudWatch. Maybe we need to export them elsewhere. 
Further discu Links: -* https://docs.aws.amazon.com/prescriptive-guidance/latest/patterns/install-ssm-agent-on-amazon-eks-worker-nodes-by-using-kubernetes-daemonset.html -* https://github.com/aws/containers-roadmap/issues/593 +- https://docs.aws.amazon.com/prescriptive-guidance/latest/patterns/install-ssm-agent-on-amazon-eks-worker-nodes-by-using-kubernetes-daemonset.html +- https://github.com/aws/containers-roadmap/issues/593 AWS Systems Manager Session Manager benefits: -* easy to install - daemonset -* auth is via a team member's AWS creds, so it's tied into JML processes and access can be removed immediately if they leave the team, and 2FA is the norm -* terminal commands are logged - useful for audit purposes -* [it's an EKS best practice](https://aws.github.io/aws-eks-best-practices/security/docs/hosts/#minimize-access-to-worker-nodes) -* we can take advantage of other Systems Manager features in future, including diagnostic and compliance monitoring +- easy to install - daemonset +- auth is via a team member's AWS creds, so it's tied into JML processes and access can be removed immediately if they leave the team, and 2FA is the norm +- terminal commands are logged - useful for audit purposes +- [it's an EKS best practice](https://aws.github.io/aws-eks-best-practices/security/docs/hosts/#minimize-access-to-worker-nodes) +- we can take advantage of other Systems Manager features in future, including diagnostic and compliance monitoring To note: -* requires permissions `hostNetwork: true` and `privileged: true` so may need its own PSP -* it's no use if the node is failing to boot or join the cluster properly, but we can live with that - it's likely that it's the pods we want to characterize, not the node, because the node is managed +- requires permissions `hostNetwork: true` and `privileged: true` so may need its own PSP +- it's no use if the node is failing to boot or join the cluster properly, but we can live with that - it's likely that it's the pods we want to characterize, not the node, because the node is managed The traditional method of node access would be to SSH in via a bastion. This involves a shared ssh key, and shared credentials is not an acceptable security practice. **Status** Completed 2/9/21 -* Implementation ticket: https://github.com/ministryofjustice/cloud-platform/issues/2962 -* Runbook for usage: https://runbooks.cloud-platform.service.justice.gov.uk/eks-node-terminal-access.html + +- Implementation ticket: https://github.com/ministryofjustice/cloud-platform/issues/2962 +- Runbook for usage: https://runbooks.cloud-platform.service.justice.gov.uk/eks-node-terminal-access.html ### PodSecurityPolicies @@ -287,11 +290,11 @@ PSPs are [deprecated](https://kubernetes.io/blog/2021/04/06/podsecuritypolicy-de Benefits of IRSA over kiam or kube2iam: -* kiam/kube2iam require running and managing a daemonset container. -* kiam/kube2iam require [powerful AWS credentials](https://github.com/jtblin/kube2iam#iam-roles), which allow EC2 boxes to assume any role. Appropriate configuration of kiam/kube2iam aims to provide containers with only a specific role. However there are security concerns with this approach: - * With kube2iam you have to remember to set a `--default-role` to use when annotation is not set on a pod. - * When a node boots, there may be a short window until kiam/kube2iam starts up, when there is no protection of the instance metadata. In comparison, IRSA injects the token into the pod, avoiding this concern. 
- * With kube2iam/kiam, an attacker able to get root on the node could access the credentials and therefore any AWS Role. In comparison, with IRSA a breach of k8s might only have bring access to the AWS Roles that are associated with k8s service roles. +- kiam/kube2iam require running and managing a daemonset container. +- kiam/kube2iam require [powerful AWS credentials](https://github.com/jtblin/kube2iam#iam-roles), which allow EC2 boxes to assume any role. Appropriate configuration of kiam/kube2iam aims to provide containers with only a specific role. However there are security concerns with this approach: + - With kube2iam you have to remember to set a `--default-role` to use when annotation is not set on a pod. + - When a node boots, there may be a short window until kiam/kube2iam starts up, when there is no protection of the instance metadata. In comparison, IRSA injects the token into the pod, avoiding this concern. + - With kube2iam/kiam, an attacker able to get root on the node could access the credentials and therefore any AWS Role. In comparison, with IRSA a breach of k8s might only give access to the AWS Roles that are associated with k8s service roles. #### Blocking access instance metadata diff --git a/architecture-decision-record/023-Logging.md b/architecture-decision-record/023-Logging.md index 1eb16149..13f1f846 100644 --- a/architecture-decision-record/023-Logging.md +++ b/architecture-decision-record/023-Logging.md @@ -1,6 +1,6 @@ # 23 Logging -Date: 02/06/2021 +Date: 11/11/2024 ## Status @@ -8,7 +8,10 @@ Date: 02/06/2021 ## Context -Cloud Platform's existing strategy for logs has been to **centralize** them in an ElasticSearch instance (Saas hosted by AWS OpenSearch). This allows [service teams](https://user-guide.cloud-platform.service.justice.gov.uk/documentation/logging-an-app/access-logs.html#accessing-application-log-data) and Cloud Platform team to use Kibana's search and browse functionality, for the purpose of debug and resolving incidents. All pods' stdout get [shipped using Fluentbit](https://user-guide.cloud-platform.service.justice.gov.uk/documentation/logging-an-app/log-collection-and-storage.html#application-log-collection-and-storage) and ElasticSearch stored them for 30 days. +> Cloud Platform's existing strategy for logs has been to **centralize** them in an ElasticSearch instance (SaaS hosted by AWS OpenSearch). +As of November 2024, we have migrated the logging service over to AWS OpenSearch, with ElasticSearch due for retirement (pending some decisions and actions on how to manage existing data retention on that cluster). +Service teams can use OpenSearch's [search and browse functionality](https://app-logs.cloud-platform.service.justice.gov.uk/_dashboards/app/home#/) for the purposes of debugging and resolving incidents. All pods' stdout gets [shipped using Fluentbit](https://user-guide.cloud-platform.service.justice.gov.uk/documentation/logging-an-app/log-collection-and-storage.html#application-log-collection-and-storage) and OpenSearch stores it for 30 days. The full lifecycle policy configuration for OpenSearch can be viewed [here](https://github.com/ministryofjustice/cloud-platform-infrastructure/blob/main/terraform/aws-accounts/cloud-platform-aws/account/resources/opensearch/ism-policy.json.tpl).
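To make the retention mechanism concrete, the sketch below shows the general shape of an OpenSearch ISM policy that deletes indices after roughly 30 days. It is illustrative only and is not the linked `ism-policy.json.tpl`; OpenSearch accepts these policies as JSON, and the YAML form is used here just for readability:

```
# Illustrative ISM policy sketch (not the production template linked above)
policy:
  description: "Delete application log indices after roughly 30 days"
  default_state: hot
  states:
    - name: hot
      actions: []
      transitions:
        - state_name: delete
          conditions:
            min_index_age: 30d
    - name: delete
      actions:
        - delete: {}
      transitions: []
```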
Concerns with existing ElasticSearch logging: diff --git a/architecture-decision-record/026-Managed-Prometheus.md b/architecture-decision-record/026-Managed-Prometheus.md index face2c2a..39b8d3a9 100644 --- a/architecture-decision-record/026-Managed-Prometheus.md +++ b/architecture-decision-record/026-Managed-Prometheus.md @@ -1,6 +1,6 @@ # 26 Managed Prometheus -Date: 2021-10-08 +Date: 2024-11-11 ## Status @@ -67,7 +67,9 @@ We also need to address: **Sharding**: We could split/shard the Prometheus instance: perhaps dividing into two - tenants and platform. Or if we did multi-cluster we could have one Prometheus instance per cluster. This appears relatively straightforward to do. There would be concern that however we split it, as we scale in the future we'll hit future scaling thresholds, where it will be necessary to change how to divide it into shards, so a bit of planning would be needed. -**High Availability**: The recommended approach would be to run multiple instances of Prometheus configured the same, scraping the same endpoints independently. [Source](https://prometheus-operator.dev/docs/operator/high-availability/#prometheus) There is a `replicas` option to do this. However for HA we would also need to have a load balancer for the PromQL queries to the Prometheus API, to fail-over if the primary is unresponsive. And it's not clear how this works with duplicate alerts being sent to AlertManager. This doesn't feel like a very paved path, with Prometheus Operator [saying](https://prometheus-operator.dev/docs/operator/high-availability/) "We are currently implementing some of the groundwork to make this possible, and figuring out the best approach to do so, but it is definitely on the roadmap!" - Jan 2017, and not updated since. +**High Availability**: We are now running Prometheus in HA mode [with 3 replicas](https://github.com/ministryofjustice/cloud-platform-terraform-monitoring/pull/239). Keeping the findings below as we may have some additional elements of HA to consider in the future: + +> [Source](https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/high-availability.md#prometheus) There is a `replicas` option to do this. However for HA we would also need to have a load balancer for the PromQL queries to the Prometheus API, to fail-over if the primary is unresponsive. And it's not clear how this works with duplicate alerts being sent to AlertManager. This doesn't feel like a very paved path, with Prometheus Operator [saying](https://github.com/prometheus-operator/prometheus-operator/blob/main/Documentation/high-availability.md) "We are currently implementing some of the groundwork to make this possible, and figuring out the best approach to do so, but it is definitely on the roadmap!" - Jan 2017, and not updated since. **Managed Prometheus**: Using a managed service of prometheus, such as AMP, would address most of these concerns, and is evaluated in detail in the next section. 
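Relating to the HA configuration above, a minimal sketch of the kube-prometheus-stack Helm values that turn on the `replicas` option is shown below; the real settings live in cloud-platform-terraform-monitoring and may differ from this illustration:

```
# Sketch only: run Prometheus (and Alertmanager) as multiple identically
# configured replicas via kube-prometheus-stack values.
prometheus:
  prometheusSpec:
    replicas: 3
alertmanager:
  alertmanagerSpec:
    replicas: 3   # clustered Alertmanagers deduplicate the alerts they receive
```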
diff --git a/runbooks/source/add-new-opa-policy.html.md.erb b/runbooks/source/add-new-opa-policy.html.md.erb index 8cd36c6b..b9e47058 100644 --- a/runbooks/source/add-new-opa-policy.html.md.erb +++ b/runbooks/source/add-new-opa-policy.html.md.erb @@ -1,7 +1,7 @@ --- title: Add a new OPA policy weight: 9000 -last_reviewed_on: 2024-05-24 +last_reviewed_on: 2024-11-25 review_in: 6 months --- diff --git a/runbooks/source/auth0-rotation.html.md.erb b/runbooks/source/auth0-rotation.html.md.erb index 40976ccd..a6266d35 100644 --- a/runbooks/source/auth0-rotation.html.md.erb +++ b/runbooks/source/auth0-rotation.html.md.erb @@ -1,7 +1,7 @@ --- title: Credentials rotation for auth0 apps weight: 68 -last_reviewed_on: 2024-05-24 +last_reviewed_on: 2024-11-25 review_in: 6 months --- diff --git a/runbooks/source/bastion-node.html.md.erb b/runbooks/source/bastion-node.html.md.erb index 9dd5fd3a..40f48f2e 100644 --- a/runbooks/source/bastion-node.html.md.erb +++ b/runbooks/source/bastion-node.html.md.erb @@ -1,7 +1,7 @@ --- title: Create and access bastion node weight: 97 -last_reviewed_on: 2024-05-24 +last_reviewed_on: 2024-11-25 review_in: 6 months --- diff --git a/runbooks/source/container-images.html.md.erb b/runbooks/source/container-images.html.md.erb index 8feef289..197b92ec 100644 --- a/runbooks/source/container-images.html.md.erb +++ b/runbooks/source/container-images.html.md.erb @@ -1,7 +1,7 @@ --- title: Container Images used by Cluster Components weight: 55 -last_reviewed_on: 2024-10-09 +last_reviewed_on: 2024-11-14 review_in: 3 months --- @@ -19,9 +19,9 @@ To grab the current image versions for all containers within components namespac kubectl get pods -n [NAMESPACE] -o jsonpath='{range .items[*]}{"\n"}{.metadata.name}{":\t"}{range .spec.containers[*]}{.image}{", "}{end}{end}' | sort ``` -### Latest version for k8s 1.28 +### Latest version for k8s 1.29 -The latest versions of some of the components might not be compatible with k8s 1.28. For this, click the link to check the Compatibility Matrix +The latest versions of some of the components might not be compatible with k8s 1.29. For this, click the link to check the Compatibility Matrix ### Latest version available That's the latest version available in the public repository. Update the version when there is a new release. 
You can find the latest version by clicking on the link or by checking the @@ -41,122 +41,126 @@ This depends on several factors, some of them are: πŸ”΄ - urgent, within this sprint ## calico-apiserver -| container image | urgency | latest version for k8s 1.28 | latest version available | latest helm chart | +| container image | urgency | latest version for k8s 1.29 | latest version available | latest helm chart | |-|-|-|-|-| -| docker.io/calico/apiserver:v3.25.0 | 🟒 | [v3.28.0](https://docs.tigera.io/calico/latest/getting-started/kubernetes/requirements#kubernetes-requirements) |[v3.28.0](https://github.com/projectcalico/calico/releases/tag/v3.28.0) | [v1.34.1](https://github.com/tigera/operator/releases/tag/v1.34.1) | +| docker.io/calico/apiserver:v3.28.1 | 🟠 | [v3.29.0](https://docs.tigera.io/calico/latest/getting-started/kubernetes/requirements#kubernetes-requirements) |[v3.29.0](https://github.com/projectcalico/calico/releases/tag/v3.29.0) | [v1.36.1](https://github.com/tigera/operator/releases/tag/v1.36.1) | ## calico-system -| container image | urgency | latest version for k8s 1.28 | latest version available | latest helm chart | +| container image | urgency | latest version for k8s 1.29 | latest version available | latest helm chart | |-|-|-|-|-| -| docker.io/calico/csi:v3.25.0 | 🟒 | [v3.28.0](https://docs.tigera.io/calico/latest/getting-started/kubernetes/requirements#kubernetes-requirements) | [v3.28.0](https://github.com/projectcalico/calico/releases/tag/v3.28.0) | [v1.34.1](https://github.com/tigera/operator/releases/tag/v1.34.1) | -| docker.io/calico/kube-controllers:v3.25.0 | 🟒 | v3.28.0 | [v3.28.0](https://github.com/projectcalico/calico/releases/tag/v3.28.0) | [v1.34.1](https://github.com/tigera/operator/releases/tag/v1.34.1) | -| docker.io/calico/node-driver-registrar:v3.25.0 | 🟒 | v3.28.0 | [v3.28.0](https://github.com/projectcalico/calico/releases/tag/v3.28.0) | [v1.34.1](https://github.com/tigera/operator/releases/tag/v1.34.1) | -| docker.io/calico/node:v3.25.0 | 🟒 | v3.28.0 | [v3.28.0](https://github.com/projectcalico/calico/releases/tag/v3.28.0) | [v1.34.1](https://github.com/tigera/operator/releases/tag/v1.34.1) | -| docker.io/calico/typha:v3.25.0 | 🟒 | v3.28.0 | [v3.28.0](https://github.com/projectcalico/calico/releases/tag/v3.28.0) | [v1.34.1](https://github.com/tigera/operator/releases/tag/v1.34.1) | +| docker.io/calico/csi:v3.28.1 | 🟠 | [v3.29.0](https://docs.tigera.io/calico/latest/getting-started/kubernetes/requirements#kubernetes-requirements) |[v3.29.0](https://github.com/projectcalico/calico/releases/tag/v3.29.0) | [v1.36.1](https://github.com/tigera/operator/releases/tag/v1.36.1) | +| docker.io/calico/kube-controllers:v3.28.1 | 🟠 | [v3.29.0](https://docs.tigera.io/calico/latest/getting-started/kubernetes/requirements#kubernetes-requirements) |[v3.29.0](https://github.com/projectcalico/calico/releases/tag/v3.29.0) | [v1.36.1](https://github.com/tigera/operator/releases/tag/v1.36.1) | +| docker.io/calico/node-driver-registrar:v3.28.1 | 🟠 | [v3.29.0](https://docs.tigera.io/calico/latest/getting-started/kubernetes/requirements#kubernetes-requirements) |[v3.29.0](https://github.com/projectcalico/calico/releases/tag/v3.29.0) | [v1.36.1](https://github.com/tigera/operator/releases/tag/v1.36.1) | +| docker.io/calico/node:v3.28.1 | 🟠 | [v3.29.0](https://docs.tigera.io/calico/latest/getting-started/kubernetes/requirements#kubernetes-requirements) |[v3.29.0](https://github.com/projectcalico/calico/releases/tag/v3.29.0) | 
[v1.36.1](https://github.com/tigera/operator/releases/tag/v1.36.1) | +| docker.io/calico/typha:v3.28.1 | 🟠 | [v3.29.0](https://docs.tigera.io/calico/latest/getting-started/kubernetes/requirements#kubernetes-requirements) |[v3.29.0](https://github.com/projectcalico/calico/releases/tag/v3.29.0) | [v1.36.1](https://github.com/tigera/operator/releases/tag/v1.36.1) | ## cert-manager -| container image | urgency | latest version for k8s 1.28 | latest version available | latest helm chart | +| container image | urgency | latest version for k8s 1.29 | latest version available | latest helm chart | |-|-|-|-|-| -| quay.io/jetstack/cert-manager-cainjector:v1.13.1 | 🟒 | [v1.15.0](https://cert-manager.io/docs/releases/#currently-supported-releases) | [v1.15.0](https://github.com/cert-manager/cert-manager/releases/tag/v1.15.0) | [v1.15.0](https://github.com/cert-manager/cert-manager/releases/tag/v1.15.0) | -| quay.io/jetstack/cert-manager-controller:v1.13.1 | 🟒 | [v1.15.0](https://cert-manager.io/docs/releases/#currently-supported-releases) | [v1.15.0](https://github.com/cert-manager/cert-manager/releases/tag/v1.15.0) | [v1.15.0](https://github.com/cert-manager/cert-manager/releases/tag/v1.15.0) | -| quay.io/jetstack/cert-manager-webhook:v1.13.1 | 🟒 | [v1.15.0](https://cert-manager.io/docs/releases/#currently-supported-releases) | [v1.15.0](https://github.com/cert-manager/cert-manager/releases/tag/v1.15.0) | [v1.15.0](https://github.com/cert-manager/cert-manager/releases/tag/v1.15.0) | +| quay.io/jetstack/cert-manager-cainjector:v1.13.1 | 🟠 | [v1.16.0](https://cert-manager.io/docs/releases/#currently-supported-releases) | [v1.16.1](https://github.com/cert-manager/cert-manager/releases/tag/v1.16.1) | [v1.16.1](https://github.com/cert-manager/cert-manager/releases/tag/v1.16.1) | +| quay.io/jetstack/cert-manager-controller:v1.13.1 | 🟠 | [v1.16.0](https://cert-manager.io/docs/releases/#currently-supported-releases) | [v1.16.1](https://github.com/cert-manager/cert-manager/releases/tag/v1.16.1) | [v1.16.1](https://github.com/cert-manager/cert-manager/releases/tag/v1.16.1) | +| quay.io/jetstack/cert-manager-webhook:v1.13.1 | 🟠 | [v1.16.0](https://cert-manager.io/docs/releases/#currently-supported-releases) | [v1.16.1](https://github.com/cert-manager/cert-manager/releases/tag/v1.16.1) | [v1.16.1](https://github.com/cert-manager/cert-manager/releases/tag/v1.16.1) | ## concourse -| container image | urgency | latest version for k8s 1.28 | latest version available | latest helm chart | +| container image | urgency | latest version for k8s 1.29 | latest version available | latest helm chart | |-|-|-|-|-| -| concourse/concourse:7.10.0 | 🟒 | [v7.11.2](https://github.com/concourse/concourse/releases) | [v7.11.2](https://github.com/concourse/concourse/releases) | [v17.3.1](https://github.com/concourse/concourse-chart/releases/tag/v17.3.1) +| concourse/concourse:7.10.0 | 🟒 | [v7.12.0](https://github.com/concourse/concourse/releases) | [v7.12.0](https://github.com/concourse/concourse/releases) | [v17.3.1](https://github.com/concourse/concourse-chart/releases/tag/v17.3.1) ## external-secrets-operator -| container image | urgency | latest version for k8s 1.28 | latest version available | latest helm chart | +| container image | urgency | latest version for k8s 1.29 | latest version available | latest helm chart | |-|-|-|-|-| -| ghcr.io/external-secrets/external-secrets:v0.8.1 | 🟒 | [v0.9.19](https://external-secrets.io/latest/introduction/stability-support/#supported-versions) | 
[v0.9.19](https://github.com/external-secrets/external-secrets/releases/tag/v0.9.19) | [v0.9.19](https://github.com/external-secrets/external-secrets/releases/tag/helm-chart-0.9.19) +| ghcr.io/external-secrets/external-secrets:v0.8.1 | 🟒 | [v0.10.5](https://external-secrets.io/latest/introduction/stability-support/#supported-versions) | [v0.10.15](https://github.com/external-secrets/external-secrets/releases/tag/v0.10.15) | [v0.10.5](https://github.com/external-secrets/external-secrets/releases/tag/helm-chart-0.10.5) ## gatekeeper-system -| container image | urgency | latest version for k8s 1.28 | latest version available | latest helm chart | -|-|-|-|-|-| -| openpolicyagent/gatekeeper:v3.15.1: | 🟒 | v3.15.1 | [v3.16.3](https://github.com/open-policy-agent/gatekeeper/releases/tag/v3.16.3) | [v3.16.3](https://github.com/open-policy-agent/gatekeeper/releases/tag/v3.16.3) | +| container image | urgency | latest version for k8s 1.29 | latest version available | latest helm chart | +|-|-|-|-|-| +| openpolicyagent/gatekeeper:v3.15.1 | 🟠 | v3.17.1 | [v3.17.1](https://github.com/open-policy-agent/gatekeeper/releases/tag/v3.17.1) | [v3.17.1](https://github.com/open-policy-agent/gatekeeper/releases/tag/v3.17.1) | ## ingress-controllers -| container image | urgency | latest version for k8s 1.28 | latest version available | latest helm chart | +| container image | urgency | latest version for k8s 1.29 | latest version available | latest helm chart | |-|-|-|-|-| -| debian:bookworm-slim | 🟒 | latest | n/a | -| fluent/fluent-bit:3.0.2-amd64 | 🟒 | v3.0.7 | [v3.0.7](https://github.com/fluent/fluent-bit/releases/tag/v3.0.7) | n/a | -| ministryofjustice/cloud-platform-custom-error-pages:0.6 | 🟠 | [managed by us](https://github.com/ministryofjustice/cloud-platform-custom-error-pages) | [managed by us](https://github.com/ministryofjustice/cloud-platform-custom-error-pages/releases/tag/1.1.3) | n/a | -| registry.k8s.io/ingress-nginx/controller:v1.8.4| 🟒 | [v1.10.1](https://github.com/kubernetes/ingress-nginx?tab=readme-ov-file#supported-versions-table) | [v1.10.1](https://github.com/kubernetes/ingress-nginx/releases/tag/controller-v1.10.1) | [v4.10.1](https://github.com/kubernetes/ingress-nginx/tree/main/charts/ingress-nginx) +| debian:bookworm-20241016-slim | 🟒 | latest | n/a | +| fluent/fluent-bit:3.0.2-amd64 | 🟒 | v3.1.10 | [v3.1.10](https://github.com/fluent/fluent-bit/releases/tag/v3.1.10) | n/a | +| ministryofjustice/cloud-platform-custom-error-pages:1.1.5 | 🟒 | [managed by us](https://github.com/ministryofjustice/cloud-platform-custom-error-pages) | [managed by us](https://github.com/ministryofjustice/cloud-platform-custom-error-pages/releases/tag/1.1.5) | n/a | +| registry.k8s.io/ingress-nginx/controller:v1.10.1 | 🟒 | [v1.11.3](https://github.com/kubernetes/ingress-nginx?tab=readme-ov-file#supported-versions-table) | [v1.11.3](https://github.com/kubernetes/ingress-nginx/releases/tag/controller-v1.11.3) | [v4.11.3](https://github.com/kubernetes/ingress-nginx/releases/tag/helm-chart-4.11.3) ## kube-system -| container image | urgency | latest version for k8s 1.28 | latest version available | latest helm chart | -|-|-|-|-|-| -| 602401143452.dkr.ecr.eu-west-2.amazonaws.com/amazon-k8s-cni:v1.18.2-eksbuild.1 | 🟒 | [v1.18.2-eksbuild.1](https://docs.aws.amazon.com/eks/latest/userguide/managing-vpc-cni.html) | [v1.18.2-eksbuild.1](https://docs.aws.amazon.com/eks/latest/userguide/managing-vpc-cni.html) | n/a | -|
602401143452.dkr.ecr.eu-west-2.amazonaws.com/amazon/aws-network-policy-agent:v1.1.2-eksbuild.1 | 🟒 | [v1.1.2-eksbuild.1](https://docs.aws.amazon.com/eks/latest/userguide/managing-vpc-cni.html) | [v1.1.2-eksbuild.1](https://docs.aws.amazon.com/eks/latest/userguide/managing-vpc-cni.html) | n/a -| 602401143452.dkr.ecr.eu-west-2.amazonaws.com/eks/coredns:v1.10.1-eksbuild.11 | 🟒 | [v1.10.1-eksbuild.11](https://docs.aws.amazon.com/eks/latest/userguide/managing-coredns.html) | [v1.11.1-eksbuild.9](https://docs.aws.amazon.com/eks/latest/userguide/managing-coredns.html) | n/a | -| 602401143452.dkr.ecr.eu-west-2.amazonaws.com/eks/kube-proxy:v1.28.8-minimal-eksbuild.5 | 🟒 | [v1.28.8-eksbuild.5](https://docs.aws.amazon.com/eks/latest/userguide/managing-kube-proxy.html) | [v1.30.0-eksbuild.3](https://docs.aws.amazon.com/eks/latest/userguide/managing-kube-proxy.html) | n/a -| docker.io/bitnami/external-dns:0.13.4-debian-11-r14 | 🟒 | v0.14.x | [v0.14.x](https://github.com/kubernetes-sigs/external-dns/releases/tag/v0.14.0) | [v0.14.x](https://github.com/bitnami/charts/blob/main/bitnami/external-dns/Chart.yaml#L11) | -| public.ecr.aws/ebs-csi-driver/aws-ebs-csi-driver:v1.29.1 | 🟒 | [v1.30.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver?tab=readme-ov-file#compatibility) | [v1.30.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/v1.30.0) | [2.30.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/helm-chart-aws-ebs-csi-driver-2.30.0) | -| public.ecr.aws/eks-distro/kubernetes-csi/external-attacher:v4.5.0-eks-1-29-7 | 🟒 | [v4.5.0](https://distro.eks.amazonaws.com/releases/1-26/28/) | [v1.30.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/v1.29.1) | [2.30.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/helm-chart-aws-ebs-csi-driver-2.30.0) | -| public.ecr.aws/eks-distro/kubernetes-csi/external-provisioner:v4.0.0-eks-1-29-7 | 🟒 | [v4.0.0](https://distro.eks.amazonaws.com/releases/1-26/28/) | [v1.30.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/v1.29.1) | [2.30.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/helm-chart-aws-ebs-csi-driver-2.30.0) | -| public.ecr.aws/eks-distro/kubernetes-csi/external-resizer:v1.10.0-eks-1-29-7 | 🟒 | [v1.10.0](https://distro.eks.amazonaws.com/releases/1-26/28/) | [v1.30.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/v1.29.1) | [2.30.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/helm-chart-aws-ebs-csi-driver-2.30.0) | -| public.ecr.aws/eks-distro/kubernetes-csi/livenessprobe:v2.12.0-eks-1-29-7 | 🟒 | [v2.12.0](https://distro.eks.amazonaws.com/releases/1-26/28/) | [v1.30.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/v1.29.1) | [2.30.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/helm-chart-aws-ebs-csi-driver-2.30.0) | -| public.ecr.aws/eks-distro/kubernetes-csi/node-driver-registrar:v2.10.0-eks-1-29-7 | 🟒 | [v2.10.0](https://distro.eks.amazonaws.com/releases/1-26/28/) | [v1.30.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/v1.29.1) | [2.30.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/helm-chart-aws-ebs-csi-driver-2.30.0) | -| registry.k8s.io/autoscaling/cluster-autoscaler:v1.28.5 | 🟒 | [v1.28.5](https://github.com/kubernetes/autoscaler/tree/master/cluster-autoscaler#releases) | [v1.30.1](https://github.com/kubernetes/autoscaler/releases/tag/cluster-autoscaler-1.30.1) | 
[9.37.0](https://github.com/kubernetes/autoscaler/releases/tag/cluster-autoscaler-chart-9.37.0) | -| registry.k8s.io/descheduler/descheduler:v0.27.1 | 🟒 | [v0.27.1](https://github.com/kubernetes-sigs/descheduler?tab=readme-ov-file#%EF%B8%8F--documentation-versions-by-release) | [v0.29.0](https://github.com/kubernetes-sigs/descheduler/releases/tag/v0.29.0) | [0.29.0](https://github.com/kubernetes-sigs/descheduler/releases/tag/descheduler-helm-chart-0.29.0) | -| registry.k8s.io/metrics-server/metrics-server:v0.7.1 | 🟒 | [v0.7.1](https://github.com/kubernetes-sigs/metrics-server?tab=readme-ov-file#compatibility-matrix) | [v0.7.1](https://github.com/kubernetes-sigs/metrics-server/releases/tag/v0.7.1) | [3.12.1](https://github.com/kubernetes-sigs/metrics-server/releases/tag/metrics-server-helm-chart-3.12.1) | +| container image | urgency | latest version for k8s 1.29 | latest version available | latest helm chart | +|-|-|-|-|-| +| 602401143452.dkr.ecr.eu-west-2.amazonaws.com/amazon-k8s-cni:v1.19.0-eksbuild.1 | 🟒 | [v1.19.0-eksbuild.1](https://docs.aws.amazon.com/eks/latest/userguide/managing-vpc-cni.html) | [v1.19.0-eksbuild.1](https://docs.aws.amazon.com/eks/latest/userguide/managing-vpc-cni.html) | n/a | +| 602401143452.dkr.ecr.eu-west-2.amazonaws.com/amazon/aws-network-policy-agent:v1.1.2 | 🟒 | [v1.1.4](https://github.com/aws/aws-network-policy-agent/releases/tag/v1.1.4) | [v1.1.4](https://github.com/aws/aws-network-policy-agent/releases/tag/v1.1.4) | n/a +| 602401143452.dkr.ecr.eu-west-2.amazonaws.com/eks/coredns:v1.11.3-eksbuild.2 | 🟒 | [v1.11.3-eksbuild.2](https://docs.aws.amazon.com/eks/latest/userguide/managing-coredns.html) | [v1.11.3-eksbuild.9](https://docs.aws.amazon.com/eks/latest/userguide/managing-coredns.html) | n/a | +| 602401143452.dkr.ecr.eu-west-2.amazonaws.com/eks/kube-proxy:v1.29.10-eksbuild.3 | 🟒 | [v1.29.10-eksbuild.3](https://docs.aws.amazon.com/eks/latest/userguide/managing-kube-proxy.html#managing-kube-proxy-images) | [v1.31.1-minimal-eksbuild.2](https://docs.aws.amazon.com/eks/latest/userguide/managing-kube-proxy.html) | n/a +| docker.io/bitnami/external-dns:0.13.4-debian-11-r14 | 🟠 | v0.15.x | [v0.15.x](https://github.com/kubernetes-sigs/external-dns/releases/tag/v0.15.0) | [v0.15.x](https://github.com/bitnami/charts/blob/main/bitnami/external-dns/Chart.yaml#L11) | +| registry.k8s.io/autoscaling/cluster-autoscaler:v1.28.5 | 🟒 | [v1.29.4](https://github.com/kubernetes/autoscaler/releases/tag/cluster-autoscaler-1.29.4) | [v1.31.0](https://github.com/kubernetes/autoscaler/releases/tag/cluster-autoscaler-1.31.0) | [9.38.0](https://github.com/kubernetes/autoscaler/releases/tag/cluster-autoscaler-chart-9.38.0) | +| registry.k8s.io/descheduler/descheduler:v0.27.1 | 🟠 | [v0.29.x](https://github.com/kubernetes-sigs/descheduler?tab=readme-ov-file#%EF%B8%8F--documentation-versions-by-release) | [v0.29.0](https://github.com/kubernetes-sigs/descheduler/releases/tag/v0.29.0) | [0.31.0](https://github.com/kubernetes-sigs/descheduler/releases/tag/descheduler-helm-chart-0.31.0) | +| registry.k8s.io/metrics-server/metrics-server:v0.7.1 | 🟒 | [v0.7.2](https://github.com/kubernetes-sigs/metrics-server?tab=readme-ov-file#compatibility-matrix) | [v0.7.2](https://github.com/kubernetes-sigs/metrics-server/releases/tag/v0.7.2) | [3.12.2](https://github.com/kubernetes-sigs/metrics-server/releases/tag/metrics-server-helm-chart-3.12.2) | + +#### included with the ebs-cbs-driver in `kube-system` +| container image | urgency | latest version for k8s 1.29 | latest version available | latest helm 
chart | +|-|-|-|-|-| +| public.ecr.aws/ebs-csi-driver/aws-ebs-csi-driver:v1.29.1 | 🟠 | [v1.37.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases) | [v1.37.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/v1.37.0) | [2.37.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/helm-chart-aws-ebs-csi-driver-2.37.0) | +| public.ecr.aws/eks-distro/kubernetes-csi/external-attacher:v4.5.0-eks-1-29-7 | 🟠 | [v4.7.0](https://distro.eks.amazonaws.com/releases/1-29/24/) | [v1.37.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/v1.37.0) | [2.37.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/helm-chart-aws-ebs-csi-driver-2.37.0) | +| public.ecr.aws/eks-distro/kubernetes-csi/external-provisioner:v4.0.0-eks-1-29-7 | 🟠 | [v5.1.0](https://distro.eks.amazonaws.com/releases/1-29/24/) | [v1.37.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/v1.37.0) | [2.37.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/helm-chart-aws-ebs-csi-driver-2.37.0) | +| public.ecr.aws/eks-distro/kubernetes-csi/external-resizer:v1.10.0-eks-1-29-7 | 🟠 | [v1.12.0](https://distro.eks.amazonaws.com/releases/1-29/24/) | [v1.37.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/v1.37.0) | [2.37.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/helm-chart-aws-ebs-csi-driver-2.37.0) | +| public.ecr.aws/eks-distro/kubernetes-csi/livenessprobe:v2.12.0-eks-1-29-7 | 🟠 | [v2.14.0](https://distro.eks.amazonaws.com/releases/1-29/24/) | [v1.37.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/v1.37.0) | [2.37.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/helm-chart-aws-ebs-csi-driver-2.37.0) | +| public.ecr.aws/eks-distro/kubernetes-csi/node-driver-registrar:v2.10.0-eks-1-29-7 | 🟠 | [v2.12.0](https://distro.eks.amazonaws.com/releases/1-29/24/) | [v1.37.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/v1.37.0) | [2.37.0](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/releases/tag/helm-chart-aws-ebs-csi-driver-2.37.0) | ## kuberhealthy -| container image | urgency | latest version for k8s 1.28 | latest version available | latest helm chart | +| container image | urgency | latest version for k8s 1.29 | latest version available | latest helm chart | |-|-|-|-|-| | 754256621582.dkr.ecr.eu-west-2.amazonaws.com/webops/cloud-platform-kuberhealthy-checks:1.9 | 🟒 | managed by us | [1.9](https://github.com/ministryofjustice/cloud-platform-kuberhealthy-checks/releases/tag/1.9) | n/a | -| docker.io/kuberhealthy/daemonset-check:v3.3.0 | 🟒 | v3.3.0 | [v3.3.0](https://github.com/kuberhealthy/kuberhealthy/releases/tag/v2.7.1) | [104](https://github.com/kuberhealthy/kuberhealthy/tree/master/deploy/helm/kuberhealthy) | -| docker.io/kuberhealthy/deployment-check:v1.9.0 | 🟒 | v1.9.0 | [v3.3.0](https://github.com/kuberhealthy/kuberhealthy/releases/tag/v2.7.1) | [104](https://github.com/kuberhealthy/kuberhealthy/tree/master/deploy/helm/kuberhealthy) | -| docker.io/kuberhealthy/dns-resolution-check:v1.5.0 | 🟒 | v1.5.0 | [v3.3.0](https://github.com/kuberhealthy/kuberhealthy/releases/tag/v2.7.1) | [104](https://github.com/kuberhealthy/kuberhealthy/tree/master/deploy/helm/kuberhealthy) | -| docker.io/kuberhealthy/kuberhealthy:v2.8.0-rc2 __[pre-release]__| 🟒 | v2.7.1 | [v3.3.0](https://github.com/kuberhealthy/kuberhealthy/releases/tag/v2.7.1) | [104](https://github.com/kuberhealthy/kuberhealthy/tree/master/deploy/helm/kuberhealthy) | 
+| docker.io/kuberhealthy/daemonset-check:v3.3.0 | 🟒 | v3.3.0 | [v2.7.1](https://github.com/kuberhealthy/kuberhealthy/releases/tag/v2.7.1) | [104](https://github.com/kuberhealthy/kuberhealthy/tree/master/deploy/helm/kuberhealthy) | +| docker.io/kuberhealthy/deployment-check:v1.9.0 | 🟒 | v1.9.1 | [v2.7.1](https://github.com/kuberhealthy/kuberhealthy/releases/tag/v2.7.1) | [104](https://github.com/kuberhealthy/kuberhealthy/tree/master/deploy/helm/kuberhealthy) | +| docker.io/kuberhealthy/dns-resolution-check:v1.5.0 | 🟒 | v1.5.0 | [v2.7.1](https://github.com/kuberhealthy/kuberhealthy/releases/tag/v2.7.1) | [104](https://github.com/kuberhealthy/kuberhealthy/tree/master/deploy/helm/kuberhealthy) | +| docker.io/kuberhealthy/kuberhealthy:v2.8.0-rc2 __[pre-release]__| 🟒 | v2.7.1 | [v2.7.1](https://github.com/kuberhealthy/kuberhealthy/releases/tag/v2.7.1) | [104](https://github.com/kuberhealthy/kuberhealthy/tree/master/deploy/helm/kuberhealthy) | ## kuberos -| container image | urgency | latest version for k8s 1.28 | latest version available | latest helm chart | +| container image | urgency | latest version for k8s 1.29 | latest version available | latest helm chart | |-|-|-|-|-| | ministryofjustice/cloud-platform-kuberos:2.7.0 | 🟒 | managed by us | [0.4.0](https://github.com/ministryofjustice/cloud-platform-helm-charts/tree/main/kuberos) | [0.4.0](https://github.com/ministryofjustice/cloud-platform-helm-charts/tree/main/kuberos) ## logging -| container image | urgency | latest version for k8s 1.28 | latest version available | latest helm chart | +| container image | urgency | latest version for k8s 1.29 | latest version available | latest helm chart | |-|-|-|-|-| -| fluent/fluent-bit:2.2.1 | 🟠 | v3.0.2 | [v3.0.7](https://github.com/fluent/fluent-bit/releases/tag/v3.0.7) | [0.46.11](https://github.com/fluent/helm-charts) | +| fluent/fluent-bit:2.2.1 | πŸ”΄ | v3.1.10 | [v3.1.10](https://github.com/fluent/fluent-bit/releases/tag/v3.1.10) | [0.47.11](https://github.com/fluent/helm-charts) | ## monitoring -| container image | urgency | latest version for k8s 1.28 | latest version available | latest helm chart | +| container image | urgency | latest version for k8s 1.29 | latest version available | latest helm chart | |-|-|-|-|-| -| docker.io/bitnami/redis:7.2.4-debian-11-r5 | 🟒 | v7.2.5-debian-12-r1 | [v7.2.5-debian-12-r1](https://hub.docker.com/layers/bitnami/redis/7.2.5-debian-12-r1/images/sha256-4c7ac96a3d576ce06603c2809d32f0c0e1754699aeb5bc3cb727d158d14caefd?context=explore | n/a | -| docker.io/bitnami/thanos:0.34.1-debian-12-r1 | 🟒 | v0.36.0 | [v0.36.0](https://github.com/thanos-io/thanos/releases/tag/v0.36.0-rc.0) | [v0.35.1](https://github.com/bitnami/charts/blob/main/bitnami/thanos/Chart.yaml#L13) | -| docker.io/grafana/grafana:10.4.0 | 🟠 | v11.1.0| [v11.1.0](https://github.com/grafana/grafana/releases/tag/v11.1.0) | [60.4.0](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/Chart.yaml#L26) | +| docker.io/bitnami/redis:7.2.4-debian-11-r5 | 🟒 | 7.4.1-debian-12-r2 | [v7.4.1-debian-12-r2](https://hub.docker.com/layers/bitnami/redis/7.4.1-debian-12-r2/images/sha256-3413f16342b05f07b31c246240b8bf2295553c46c7b81294f88e2855ba1cb026?context=explore) | n/a | +| docker.io/bitnami/thanos:0.34.1-debian-12-r1 | 🟠 | v0.36.1 | [v0.36.1](https://github.com/thanos-io/thanos/releases/tag/v0.36.1) | [v0.36.1](https://github.com/bitnami/charts/blob/main/bitnami/thanos/Chart.yaml#L13) | +| docker.io/grafana/grafana:11.3.0 | 🟒 | v11.3.0+security-01| 
[v11.3.0+security-01](https://github.com/grafana/grafana/releases/tag/v11.3.0%2Bsecurity-01) | [66.2.1](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/Chart.yaml#L26) | | ministryofjustice/prometheus-ecr-exporter:0.2.0 | 🟒 | managed by us | n/a | [0.4.0](https://github.com/ministryofjustice/cloud-platform-helm-charts/blob/main/prometheus-ecr-exporter/Chart.yaml#L5) | | ghcr.io/nerdswords/yet-another-cloudwatch-exporter:v0.61.2 | 🟒 | v0.61.2 | [v0.61.2](https://github.com/nerdswords/yet-another-cloudwatch-exporter/releases) | [0.38.0](https://github.com/nerdswords/helm-charts/releases) -| quay.io/kiwigrid/k8s-sidecar:1.26.1 | 🟒 | v1.26.2 | [v1.26.2](https://github.com/kiwigrid/k8s-sidecar/releases/tag/1.26.2) | [60.4.0](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/Chart.yaml#L26) | -| quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 | 🟒 | v7.6.0 | [v7.6.0](https://github.com/oauth2-proxy/oauth2-proxy/releases/tag/v7.6.0) | [7.7.7](https://github.com/oauth2-proxy/manifests/releases/tag/oauth2-proxy-7.7.7) | -| quay.io/prometheus-operator/prometheus-config-reloader:v0.72.0 | 🟒 | v0.75.0 | [v0.75.0](https://github.com/prometheus-operator/prometheus-operator/releases/tag/v0.73.0) | [60.4.0](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/Chart.yaml#L26) | -| quay.io/prometheus-operator/prometheus-operator:v0.72.0 | 🟒 | v0.75.0 | [v0.75.0](https://github.com/prometheus-operator/prometheus-operator/releases/tag/v0.75.0) | [60.4.0](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/Chart.yaml#L26) | +| quay.io/kiwigrid/k8s-sidecar:1.28.0 | 🟒 | v1.28.0 | [v1.28.0](https://github.com/kiwigrid/k8s-sidecar/releases/tag/1.28.0) | [66.2.1](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/Chart.yaml#L26) | +| quay.io/oauth2-proxy/oauth2-proxy:v7.6.0 | 🟠 | v7.7.1 | [v7.7.1](https://github.com/oauth2-proxy/oauth2-proxy/releases/tag/v7.7.1) | [7.7.29](https://github.com/oauth2-proxy/manifests/releases/tag/oauth2-proxy-7.7.29) | +| quay.io/prometheus-operator/prometheus-config-reloader:v0.78.1 | 🟒 | v0.78.1 | [v0.78.1](https://github.com/prometheus-operator/prometheus-operator/releases/tag/v0.78.1) | [66.2.1](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/Chart.yaml#L26) | +| quay.io/prometheus-operator/prometheus-operator:v0.78.1 | 🟒 | v0.78.1 | [v0.78.1](https://github.com/prometheus-operator/prometheus-operator/releases/tag/v0.78.1) | [66.1.1](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/Chart.yaml#L26) | | quay.io/prometheus/alertmanager:v0.27.0 | 🟒 | v0.27.0 | [v0.27.0](https://github.com/prometheus/alertmanager/releases/tag/v0.27.0) | [60.4.0](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/Chart.yaml#L26) | -| quay.io/prometheus/node-exporter:v1.7.0 | 🟒 | v1.7.0 | [v1.8.1](https://github.com/prometheus/node_exporter/releases/tag/v1.8.1) | [60.4.0](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/Chart.yaml#L26) | -| quay.io/prometheus/prometheus:v2.51.0 | 🟒 | v2.53.0 | [v2.53.0](https://github.com/prometheus/prometheus/releases/tag/v2.53.0) | [60.4.0](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/Chart.yaml#L26) | -| quay.io/thanos/thanos:v0.33.0 | 🟒 | v0.36.0 | 
[v0.36.0](https://github.com/thanos-io/thanos/releases/tag/v0.36.0-rc.0) | [60.4.0](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/Chart.yaml#L26) | -| registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.11.0 | 🟒 | [v2.10.1](https://github.com/kubernetes/kube-state-metrics?tab=readme-ov-file#compatibility-matrix) | [2.12.0](https://github.com/kubernetes/kube-state-metrics/releases) | [60.4.0](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/Chart.yaml#L26) | +| quay.io/prometheus/node-exporter:v1.7.0 | 🟒 | v1.7.0 | [v1.8.2](https://github.com/prometheus/node_exporter/releases/tag/v1.8.2) | [60.4.0](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/Chart.yaml#L26) | +| quay.io/prometheus/prometheus:v2.51.0 | 🟒 | v3.0.0 | [v3.0.0](https://github.com/prometheus/prometheus/releases/tag/v3.0.0) | [66.1.1](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/Chart.yaml#L26) | +| quay.io/thanos/thanos:v0.34.1 | 🟠 | v0.36.1 | [v0.36.1](https://github.com/bitnami/charts/blob/c52ccd47ba9334bd99eeb438d2dc188497e50703/bitnami/thanos/Chart.yaml#L13) | [15.8.1](https://github.com/bitnami/charts/blob/c52ccd47ba9334bd99eeb438d2dc188497e50703/bitnami/thanos/Chart.yaml#L38) | +| registry.k8s.io/kube-state-metrics/kube-state-metrics:v2.14.0 | 🟒 | [v2.14.0](https://github.com/kubernetes/kube-state-metrics?tab=readme-ov-file#compatibility-matrix) | [2.14.0](https://github.com/kubernetes/kube-state-metrics/releases) | [66.2.1](https://github.com/prometheus-community/helm-charts/blob/main/charts/kube-prometheus-stack/Chart.yaml#L26) | ## overprovision -| container image | urgency | latest version for k8s 1.28 | latest version available | latest helm chart | +| container image | urgency | latest version for k8s 1.29 | latest version available | latest helm chart | |-|-|-|-|-| | registry.k8s.io/cpa/cluster-proportional-autoscaler:1.8.6 | 🟒 | v1.8.9 | [v1.8.9](https://github.com/kubernetes-sigs/cluster-proportional-autoscaler/releases/tag/v1.8.9) | [1.1.0](https://github.com/kubernetes-sigs/cluster-proportional-autoscaler/tree/master/charts/cluster-proportional-autoscaler) | registry.k8s.io/pause:3.9 | 🟒 | v3.9 | [v3.9](https://github.com/kubernetes/kubernetes/tree/master/build/pause) | [registry](https://github.com/kubernetes/registry.k8s.io/blob/main/docs/debugging.md#verify-image-repositories-and-tags) | ## tigera-operator -| container image | urgency | latest version for k8s 1.28 | latest version available | latest helm chart | +| container image | urgency | latest version for k8s 1.29 | latest version available | latest helm chart | |-|-|-|-|-| -| quay.io/tigera/operator:v1.30.0 | 🟠 | v1.34.1 | [v1.34.1](https://github.com/tigera/operator/releases/tag/v1.34.1) | [3.28.0](https://github.com/projectcalico/calico/tree/master/charts/tigera-operator) +| quay.io/tigera/operator:v1.30.0 | πŸ”΄ | v1.36.1 | [v1.36.1](https://github.com/tigera/operator/releases/tag/v1.36.1) | [3.28.0](https://github.com/projectcalico/calico/tree/master/charts/tigera-operator) ## trivy-system -| container image | urgency | latest version for k8s 1.28 | latest version available | latest helm chart | +| container image | urgency | latest version for k8s 1.29 | latest version available | latest helm chart | |-|-|-|-|-| -| ghcr.io/aquasecurity/trivy-operator:0.16.4 | 🟠 | v0.21.3| [v0.21.3](https://github.com/aquasecurity/trivy-operator/releases/tag/v0.21.3) | 
[0.23.3](https://github.com/aquasecurity/trivy-operator/blob/main/deploy/helm/Chart.yaml)
-| ghcr.io/aquasecurity/trivy:0.47.0 | 🟠 | v0.52.2 | [v0.52.2](https://github.com/aquasecurity/trivy/releases) | [0.23.3](https://github.com/aquasecurity/trivy-operator/blob/main/deploy/helm/Chart.yaml)
+| ghcr.io/aquasecurity/trivy-operator:0.23.0 | 🟒 | v0.23.0 | [v0.23.0](https://github.com/aquasecurity/trivy-operator/releases/tag/v0.23.0) | [0.23.0](https://github.com/aquasecurity/trivy-operator/blob/main/deploy/helm/Chart.yaml)
+| ghcr.io/aquasecurity/trivy:0.57.1 | 🟒 | v0.57.0 | [v0.57.1](https://github.com/aquasecurity/trivy/releases) | [0.24.1](https://github.com/aquasecurity/trivy-operator/blob/main/deploy/helm/Chart.yaml)

## velero

-| container image | urgency | latest version for k8s 1.28 | latest version available | latest helm chart |
+| container image | urgency | latest version for k8s 1.29 | latest version available | latest helm chart |
|-|-|-|-|-|
-| velero/velero:v1.13.0 | 🟒 | [v1.14.0](https://github.com/vmware-tanzu/velero?tab=readme-ov-file#velero-compatibility-matrix) | [v1.14.0](https://github.com/vmware-tanzu/velero/releases) | [ 7.0.0](https://github.com/vmware-tanzu/helm-charts/blob/main/charts/velero/Chart.yaml) |
+| velero/velero:v1.13.0 | πŸ”΄ | [v1.15.0](https://github.com/vmware-tanzu/velero?tab=readme-ov-file#velero-compatibility-matrix) | [v1.15.0](https://github.com/vmware-tanzu/velero/releases) | [8.0.0](https://github.com/vmware-tanzu/helm-charts/blob/main/charts/velero/Chart.yaml) |
diff --git a/runbooks/source/create-custom-cluster.html.md.erb b/runbooks/source/create-custom-cluster.html.md.erb
new file mode 100644
index 00000000..018ae758
--- /dev/null
+++ b/runbooks/source/create-custom-cluster.html.md.erb
@@ -0,0 +1,46 @@
+---
+title: Create Custom Cluster
+weight: 8600
+last_reviewed_on: 2024-11-27
+review_in: 6 months
+---
+
+# Create custom cluster
+
+In Concourse we have the ability to create custom clusters based off whichever branch you specify.
+
+First we need to update the pipeline config with our branch. Run the following command from the `cloud-platform-terraform-concourse` repo root folder:
+
+```
+fly -t moj-cp set-pipeline --pipeline custom-cluster --config pipelines/manager/main/custom-cluster.yaml -v branch_name=migrate-eks-csi
+```
+
+From here you can then kick off the build from the Concourse UI.
+
+## Deleting your custom cluster
+
+Our delete pipeline allows you to specify the `branch_name` in the config, so to delete your custom cluster you run:
+
+```
+fly -t moj-cp set-pipeline --pipeline delete-cluster --config pipelines/manager/main/delete-cluster.yaml -v branch_name=migrate-eks-csi -v cluster_name=tp-0000-0000
+```
+
+You need to pass both `branch_name` and `cluster_name` to the command.
+
+## Rerunning stages in the create custom cluster pipeline
+
+If you need to re-run a plan or apply at any stage after the cluster has been created, you can do this by running the following:
+
+```
+fly -t moj-cp set-pipeline --pipeline custom-cluster --config pipelines/manager/main/custom-cluster.yaml -v branch_name=migrate-eks-csi -v cluster_name=tp-0000-0000
+```
+
+Then you can select the step in the pipeline that you'd like to re-run.
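+
+As an alternative to the Concourse UI, you can also trigger a specific job from the command line with `fly trigger-job` (a sketch; the job name shown is illustrative, so check the pipeline for the actual job names):
+
+```
+fly -t moj-cp trigger-job --job custom-cluster/create-cluster-run --watch
+```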
+ +## Run integration tests against custom cluster + +You can also use the `custom-integration-tests` job within the `custom-cluster` pipeline to execute the go integration test suite against a cluster of your choice: + +`fly -t manager set-pipeline -p custom-cluster -c pipelines/manager/main/custom-cluster.yaml -v branch_name= -v cluster_name=` + +And then trigger a new build in `custom-integration-tests` job view. diff --git a/runbooks/source/delete-prometheus-metrics.html.md.erb b/runbooks/source/delete-prometheus-metrics.html.md.erb index 8807c51c..ce65bbb8 100644 --- a/runbooks/source/delete-prometheus-metrics.html.md.erb +++ b/runbooks/source/delete-prometheus-metrics.html.md.erb @@ -1,7 +1,7 @@ --- title: Delete Prometheus Metrics weight: 170 -last_reviewed_on: 2024-05-24 +last_reviewed_on: 2024-11-25 review_in: 6 months --- diff --git a/runbooks/source/delete-state-lock.html.md.erb b/runbooks/source/delete-state-lock.html.md.erb index 70684230..680e054b 100644 --- a/runbooks/source/delete-state-lock.html.md.erb +++ b/runbooks/source/delete-state-lock.html.md.erb @@ -1,7 +1,7 @@ --- title: Delete terraform state lock weight: 199 -last_reviewed_on: 2024-05-24 +last_reviewed_on: 2024-11-25 review_in: 6 months --- diff --git a/runbooks/source/disaster-recovery-scenarios.html.md.erb b/runbooks/source/disaster-recovery-scenarios.html.md.erb index d74a3454..aaf401f2 100644 --- a/runbooks/source/disaster-recovery-scenarios.html.md.erb +++ b/runbooks/source/disaster-recovery-scenarios.html.md.erb @@ -1,7 +1,7 @@ --- title: Cloud Platform Disaster Recovery Scenarios weight: 91 -last_reviewed_on: 2024-05-20 +last_reviewed_on: 2024-11-25 review_in: 6 months --- @@ -152,7 +152,7 @@ This way of restoring the whole cluster have been tested with below procedure Any namespaces over 3 hours old can be recovered using Velero (newer namespaces might not have been backed up before the incident occurred). Create the cluster with the **same** name from the [source code](https://github.com/ministryofjustice/cloud-platform-infrastructure/blob/main/create-cluster.rb) -and provide the exisiting `vpc-name`. This will link the velero backup locations to the lost cluster. +and provide the existing `vpc-name`. This will link the velero backup locations to the lost cluster. Find the name of the most recent backup of the `allnamespacebackup` schedule: diff --git a/runbooks/source/export-elasticsearch-to-csv.html.md.erb b/runbooks/source/export-elasticsearch-to-csv.html.md.erb index db88f15e..e35a1daa 100644 --- a/runbooks/source/export-elasticsearch-to-csv.html.md.erb +++ b/runbooks/source/export-elasticsearch-to-csv.html.md.erb @@ -1,7 +1,7 @@ --- title: Export data from AWS Elasticsearch into a CSV file weight: 190 -last_reviewed_on: 2024-05-24 +last_reviewed_on: 2024-11-25 review_in: 6 months --- diff --git a/runbooks/source/get-audit-log-from-modsec.html.md.erb b/runbooks/source/get-audit-log-from-modsec.html.md.erb index 9211340a..b0616be7 100644 --- a/runbooks/source/get-audit-log-from-modsec.html.md.erb +++ b/runbooks/source/get-audit-log-from-modsec.html.md.erb @@ -1,42 +1,39 @@ --- title: Get an audit log from modsec weight: 8600 -last_reviewed_on: 2024-05-24 +last_reviewed_on: 2024-11-27 review_in: 6 months --- # OpenSearch modsec setup -We have introduced an openSearch dashboard which collects all modsec logs and has document level security enabled. 
This means users can only access the logs for the github team they are in [see here for more details](https://user-guide.cloud-platform.service.justice.gov.uk/documentation/networking/modsecurity.html). With this feature in place users can self serve and access their own modsec logs. In the case of a rare error and logs aren't flowing to OpenSearch, then you must use the instructions below to access modsec logs on behalf of the user.
+We have introduced an OpenSearch dashboard which collects all modsec logs and has document level security enabled. This means users can only access the logs for the GitHub team they are in ([see here for more details](https://user-guide.cloud-platform.service.justice.gov.uk/documentation/networking/modsecurity.html)). With this feature in place users can self-serve and access their own modsec logs. In the rare case that logs aren't flowing to OpenSearch, use the instructions below to access modsec logs on behalf of the user.

## Get an audit log from modsec (when fluent-bit is not pushing to OpenSearch)

-On occasion users may need you to provide them with audit log information on an modsec event from our ingress-controllers. This information may be sensitive so it can't be placed in our org-wide Elasticsearch cluster. You'll need to fetch this information from the pod that generated the log.
+In the event that audit logs have failed to ship to OpenSearch, you'll need to fetch the audit log information from the pod that generated the log.

### How do I check the audit log

-As mentioned above, the audit log cannot be placed into Elasticsearch so you'll need the following:
-
-- A Kibana event from the user. A request will come into the ask-cloud-platform channel asking something like:
+- An OpenSearch ingress event from the user. A request will come into the ask-cloud-platform channel asking something like:

```
Good afternoon, could I ask for the detailed logs for this block from ModSecurity, please?
-https://kibana.cloud-platform.service.justice.gov.uk/_plugin/kibana/app/kibana#/doc/fb2e6550-0186-11ec-a2cf-6b21[…]lue:0),time:(from:now-3h,to:now))
-(I need to find out which rules triggered the block, it has 2 critical fails)
+https://app-logs.cloud-platform.service.justice.gov.uk/_dashboards/app/data-explorer/discover#?[SOME-SEARCH-QUERY.....]
+(I need to find out which rules triggered the block)

-example: https://mojdt.slack.com/archives/C57UPMZLY/p1630936971082200
```

-- The Kibana event above should provide you with the following key information
+- The OpenSearch event above should provide you with the following key information:

```
- modsec pod name (optional): This will allow you to hone in on the correct pod.
- unique_id: This is a hash of the event in question, e.g. 16494071776.005464
+ kubernetes.pod_name (optional): This will allow you to hone in on the correct modsec ingress pod.
+ unique_id: This is a hash of the event in question, e.g. 16494071776.005464, and can be located in the log entry.
```

- Kubectl access to the live cluster and access to the `ingress-controllers` namespace.
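+
+If the event didn't include `kubernetes.pod_name`, you can list the modsec ingress-controller pods first and then search each of them in the next step (a minimal sketch; it assumes the relevant pod names contain `modsec`):
+
+```
+kubectl -n ingress-controllers get pods | grep modsec
+```
+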
-### Perform a search for the unique-id (obtained from the Kibana entry) +### Perform a search for the unique-id (obtained from the OpenSearch entry) ``` # assuming the event id is 16494071776.005464 diff --git a/runbooks/source/grafana-dashboards.html.md.erb b/runbooks/source/grafana-dashboards.html.md.erb index 8aa3ac0f..1e236f3d 100644 --- a/runbooks/source/grafana-dashboards.html.md.erb +++ b/runbooks/source/grafana-dashboards.html.md.erb @@ -1,7 +1,7 @@ --- title: Grafana Dashboards weight: 9106 -last_reviewed_on: 2024-10-09 +last_reviewed_on: 2024-11-15 review_in: 3 months --- @@ -36,7 +36,7 @@ kubectl describe node ### Fixing "failed to load dashboard" errors -The kibana alert has reported an error similar to: +The OpenSearch alert has reported an error similar to: > Grafana failed to load one or more dashboards - This could prevent new dashboards from being created ⚠️ @@ -68,7 +68,7 @@ Contact the user in the given slack-channel and ask them to fix it. Provide the ### Fixing "duplicate dashboard uid" errors -The kibana alert has reported an error similar to: +The OpenSearch alert has reported an error similar to: > Duplicate Grafana dashboard UIDs found diff --git a/runbooks/source/kibana-podsecurity-violations-alert.html.md.erb b/runbooks/source/kibana-podsecurity-violations-alert.html.md.erb deleted file mode 100644 index 4d4c0bea..00000000 --- a/runbooks/source/kibana-podsecurity-violations-alert.html.md.erb +++ /dev/null @@ -1,39 +0,0 @@ ---- -title: Kibana PodSecurity Violations Alert -weight: 191 -last_reviewed_on: 2024-09-11 -review_in: 3 months ---- - -# Kibana PodSecurity Violations Alert -This runbook will document the Kibana PodSecurity (PSA) violations monitor and how to debug the offending namespace and resources. - -## Kibana Alert/Monitor - -[This Kibana monitor](https://kibana.cloud-platform.service.justice.gov.uk/_plugin/kibana/app/opendistro-alerting#/monitors/jR-J3YsBP8PE0GofcRIF) has been created that will alert if any PSA violations are detected. - -You can see when previous alerts have been triggered under the `Alerts` section on the monitor. - -## Checking logs for PSA violations in Kibana - -To diagnose which namespace(s) are violating and to see the reason in the logs, either go to the [discover section on Kibana](https://kibana.cloud-platform.service.justice.gov.uk/_plugin/kibana/app/discover#/) and search for the following query: - -``` -"violates PodSecurity" AND NOT "smoketest-restricted" AND NOT "smoketest-privileged" -``` - -Or follow [this link](https://kibana.cloud-platform.service.justice.gov.uk/_plugin/kibana/app/discover#/?_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-5h,to:now))&_a=(columns:!(_source),filters:!(),index:'167701b0-f8c0-11ec-b95c-1d65c3682287',interval:auto,query:(language:kuery,query:'%22violates%20PodSecurity%22%20AND%20NOT%20%22smoketest-restricted%22%20AND%20NOT%20%22smoketest-privileged%22'),sort:!())) to get the same search. - -This will show any logs of PSA violations (excluding smoketests). If no logs appear, then increase the time frame to match when the alert was triggered. You can check this on the monitor under the `Alerts` heading. - -In the logs, it will provide information such as the offending namespace and the reason it has been triggered. 
- -## Fixing PSA Violations - -To fix a PSA violation and stop the monitor from triggering, gather the namespace and violation reason from the logs and then contact a member of the team that owns the violating namespace with details of what is causing the issue, the user then should resolve this issue. - -## Slack Alert - -Kibana will put a message into the `#low-priority-alarms` slack channel whenever the [PodSecurity Violations monitor](https://kibana.cloud-platform.service.justice.gov.uk/_plugin/kibana/app/opendistro-alerting#/monitors/jR-J3YsBP8PE0GofcRIF) first goes into the `Triggered` status. - -The monitor is throttled to only send 1 message every 24 hours per trigger. This means if a namespace is already triggering the monitor then when another violation occurs, then it will not send another message. The best way to check what is triggering the monitor is to use the steps mentioned above under [Checking logs for PSA violation in Kibana](#checking-logs-for-psa-violations-in-kibana). diff --git a/runbooks/source/leavers-guide.html.md.erb b/runbooks/source/leavers-guide.html.md.erb index cd8f3c1a..718188cd 100644 --- a/runbooks/source/leavers-guide.html.md.erb +++ b/runbooks/source/leavers-guide.html.md.erb @@ -70,7 +70,7 @@ Below are the list of 3rd party accounts that need to be removed when a member l 4. [Pagerduty](https://moj-digital-tools.pagerduty.com/users) -5. [DockerHub MoJ teams](https://cloud.docker.com/orgs/ministryofjustice/teams) +5. DockerHub MoJ teams 6. [Pingdom](https://www.pingdom.com) diff --git a/runbooks/source/manually-delete-namespace-resources.html.md.erb b/runbooks/source/manually-delete-namespace-resources.html.md.erb index e8cf8fcd..ae70b95b 100644 --- a/runbooks/source/manually-delete-namespace-resources.html.md.erb +++ b/runbooks/source/manually-delete-namespace-resources.html.md.erb @@ -54,7 +54,7 @@ Locate the PR number for the namespace deletion PR, and execute the following co ```bash cloud-platform environment destroy \ - --prNumber [namespace-deletion-PR] \ + --pr-number [namespace-deletion-PR] \ --cluster arn:aws:eks:eu-west-2:754256621582:cluster/live \ --kubecfg ~/.kube/config \ --clusterdir live.cloud-platform.service.justice.gov.uk \ diff --git a/runbooks/source/opensearch-podsecurity-violations-alert.html.md.erb b/runbooks/source/opensearch-podsecurity-violations-alert.html.md.erb new file mode 100644 index 00000000..21e3fd0f --- /dev/null +++ b/runbooks/source/opensearch-podsecurity-violations-alert.html.md.erb @@ -0,0 +1,39 @@ +--- +title: OpenSearch PodSecurity Violations Alert +weight: 191 +last_reviewed_on: 2024-11-15 +review_in: 3 months +--- + +# OpenSearch PodSecurity Violations Alert +This runbook will document the OpenSearch PodSecurity (PSA) violations monitor and how to debug the offending namespace and resources. + +## OpenSearch Alert/Monitor + +[This OpenSearch monitor](https://app-logs.cloud-platform.service.justice.gov.uk/_dashboards/app/alerting#/monitors/t4z3XI8BxtKHqtnhcXO2) has been created that will alert if any PSA violations are detected. + +You can see when previous alerts have been triggered under the `Alerts` section on the monitor. 
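+
+The sections below explain how to find the offending namespace and the violation reason in the logs. As a quick complementary check once you know the namespace, you can inspect which Pod Security Standards level it enforces (a sketch; `my-namespace` is a placeholder):
+
+```
+kubectl get namespace my-namespace -o yaml | grep pod-security
+```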
+ +## Checking logs for PSA violations in OpenSearch + +To diagnose which namespace(s) are violating and to see the reason in the logs, either go to the [discover section on OpenSearch](https://app-logs.cloud-platform.service.justice.gov.uk/_dashboards/app/data-explorer/discover/) and search for the following query: + +``` +"violates PodSecurity" AND NOT "smoketest-restricted" AND NOT "smoketest-privileged" +``` + +Or follow [this link](https://app-logs.cloud-platform.service.justice.gov.uk/_dashboards/app/data-explorer/discover#?_q=(filters:!(),query:(language:kuery,query:'%22violates%20PodSecurity%22%20AND%20NOT%20%22smoketest-restricted%22%20AND%20NOT%20%22smoketest-privileged%22'))&_a=(discover:(columns:!(_source),isDirty:!f,sort:!()),metadata:(indexPattern:bb90f230-0d2e-11ef-bf63-53113938c53a,view:discover))&_g=(filters:!(),refreshInterval:(pause:!t,value:0),time:(from:now-5h,to:now))) to get the same search. + +This will show any logs of PSA violations (excluding smoketests). If no logs appear, then increase the time frame to match when the alert was triggered. You can check this on the monitor under the `Alerts` heading. + +In the logs, it will provide information such as the offending namespace and the reason it has been triggered. + +## Fixing PSA Violations + +To fix a PSA violation and stop the monitor from triggering, gather the namespace and violation reason from the logs and then contact a member of the team that owns the violating namespace with details of what is causing the issue, the user then should resolve this issue. + +## Slack Alert + +OpenSearch will put a message into the `#low-priority-alarms` slack channel whenever the [PodSecurity Violations monitor](https://app-logs.cloud-platform.service.justice.gov.uk/_dashboards/app/alerting#/monitors/t4z3XI8BxtKHqtnhcXO2) first goes into the `Triggered` status. + +The monitor is throttled to only send 1 message every 24 hours per trigger. This means if a namespace is already triggering the monitor then when another violation occurs, then it will not send another message. The best way to check what is triggering the monitor is to use the steps mentioned above under [Checking logs for PSA violation in OpenSearch](#checking-logs-for-psa-violations-in-opensearch). diff --git a/runbooks/source/resolve-opensearch-no-logs.html.md.erb b/runbooks/source/resolve-opensearch-no-logs.html.md.erb index 1713d664..ba4c48c2 100644 --- a/runbooks/source/resolve-opensearch-no-logs.html.md.erb +++ b/runbooks/source/resolve-opensearch-no-logs.html.md.erb @@ -1,7 +1,7 @@ --- title: Resolving no logs in modsec OpenSearch weight: 190 -last_reviewed_on: 2024-05-24 +last_reviewed_on: 2024-11-25 review_in: 6 months --- diff --git a/runbooks/source/resolve-opensearch-shard-issues.html.md.erb b/runbooks/source/resolve-opensearch-shard-issues.html.md.erb index d86fd11f..4da41717 100644 --- a/runbooks/source/resolve-opensearch-shard-issues.html.md.erb +++ b/runbooks/source/resolve-opensearch-shard-issues.html.md.erb @@ -1,7 +1,7 @@ --- title: Resolving OpenSearch shard problems weight: 190 -last_reviewed_on: 2024-05-24 +last_reviewed_on: 2024-11-25 review_in: 6 months --- @@ -52,7 +52,7 @@ kubectl run curl-pod -n --image="alpine/curl" --restart=Never - ## Connecting to the OpenSearch api -Because we have fine-grained access enabled on OpenSearch connection isn't based on ip. It's based on SAML. To link your cli with OpenSearch there is a manual step of adding your aws user arn to the `all_access` OpenSearh role. 
+Because we have fine-grained access enabled on OpenSearch, connection isn't based on IP; it's based on SAML. To link your CLI with OpenSearch there is a manual step of adding your AWS user ARN to the `all_access` OpenSearch role.

1. login to the OpenSearch dashboard using github via saml
1. as a webops team member you have permissions to edit roles so head to Security -> Roles -> `all_access` (see screenshot below)
diff --git a/runbooks/source/upgrade-eks-addons.html.md.erb b/runbooks/source/upgrade-eks-addons.html.md.erb
index 0a774b3a..7c4c279f 100644
--- a/runbooks/source/upgrade-eks-addons.html.md.erb
+++ b/runbooks/source/upgrade-eks-addons.html.md.erb
@@ -45,6 +45,8 @@ aws eks describe-addon-versions --kubernetes-version=$K8S_VERSION | jq '.addons['

this will pull out the default compatible value for the k8s version for your addon.

+You can use the [helper script](https://github.com/ministryofjustice/cloud-platform-infrastructure/blob/main/scripts/addons-upgrade.bash) to get the most up-to-date addon versions available for each Kubernetes cluster version.
+
## Preparing for upgrade

Check the changelog for each of the addons and determine if there are any breaking changes.

@@ -53,26 +55,24 @@ Create a thread in #cloud-platform notifying the team that upgrades are starting

## Starting the upgrade

-1. Bump the version number in cloud-platform-terraform-eks-add-ons
-2. Commit changes on a new branch and create a pull request
-3. Request review from someone on the team
-4. Merge pull request and create a new release through the Github UI
-5. Bump the version number of the cloud-platform-terraform-eks-add-ons in cloud-platform-infrastructure
-6. Commit changes on a new branch and create a pull request
-7. Request review from someone on the team
-8. Check the terraform plan in concourse and pause the following pipelines:
+1. Run the helper [script](https://github.com/ministryofjustice/cloud-platform-infrastructure/blob/main/scripts/addons-upgrade.bash) before the upgrade
+2. Bump the version of the addon
+3. Commit changes on a new branch and create a pull request
+4. Request review from someone on the team
+5. Check the terraform plan in concourse and pause the following pipelines:
    * bootstrap
    * infrastructure-live
    * infrastructure-manager
    * infrastructure-live-2
-9. Create an output of the configuration of a pod before the upgrade. `kubectl -n kube-system get pod $addon -oyaml` there is also a helper [script](https://github.com/ministryofjustice/cloud-platform-infrastructure/blob/main/scripts/addons-upgrade.bash).
-10. Merge the pull request
-11. Unpause an infrastructure pipeline and wait for it to complete
-12. While running:
+6. Create an output of the configuration of a pod before the upgrade with `kubectl -n kube-system get pod $addon -oyaml`; there is also a helper [script](https://github.com/ministryofjustice/cloud-platform-infrastructure/blob/main/scripts/addons-upgrade.bash) for this (see the sketch after this list)
+7. Merge the pull request
+8. Unpause an infrastructure pipeline and wait for it to complete
+9. While running:
    * Keep an eye on pods recycling `watch -n 1 "kubectl -n kube-system get pods"`
    * Keep an eye on events `watch -n 1 "kubectl -n kube-system get events"`
-13. Run the reporting pipeline on the infrastructure environment
-14. If everything is green repeat steps 11-14 on each environment.
+10. Run the helper [script](https://github.com/ministryofjustice/cloud-platform-infrastructure/blob/main/scripts/addons-upgrade.bash) after the upgrade
+11. Run the reporting pipeline on the infrastructure environment
+12. If everything is green, repeat steps 8-11 on each environment.
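+
+A minimal sketch of the before/after capture in step 6, so the two pod specs can be compared once the upgrade has rolled out (this assumes `$addon` holds the name of the addon pod you are checking, as in the step above):
+
+```bash
+# capture the pod spec before the upgrade
+kubectl -n kube-system get pod $addon -oyaml > "${addon}-before.yaml"
+
+# after the upgrade the pod will have been recreated, so re-set $addon
+# to the new pod name before capturing again
+kubectl -n kube-system get pod $addon -oyaml > "${addon}-after.yaml"
+
+# compare the two outputs to spot unexpected changes
+diff "${addon}-before.yaml" "${addon}-after.yaml"
+```
+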
## Finish the upgrade diff --git a/runbooks/source/upgrade-eks-cluster.html.md.erb b/runbooks/source/upgrade-eks-cluster.html.md.erb index 195abdd8..00fe480e 100644 --- a/runbooks/source/upgrade-eks-cluster.html.md.erb +++ b/runbooks/source/upgrade-eks-cluster.html.md.erb @@ -73,6 +73,8 @@ Pause the following pipelines: * infrastructure-live-2 * infrastructure-manager +> **IMPORTANT:** Add a Pull Request to pause the Dependabot action in the infrastructure repository before pausing as you do not want any changes going through concourse after unpausing the pipeline. + Update `cluster.tf` in `cloud-platform-infrastructure` with the version of Kubernetes you are upgrading to. Run a `tf plan` against the cluster your upgrading to check to see if everything is expected, the only changes should be to resources relating to the the version upgrade. @@ -106,7 +108,12 @@ As with preparing for the upgrade communication is really important, keep the th #### Increasing coredns pods -To ensure that coredns stays up and running during the cluster upgrade replications should be scaled up to 10. +To ensure that coredns stays up and running during the cluster upgrade replications should be scaled up to 10. This can be done with the following command: + +```bash +kubectl scale deployment coredns --replicas=10 -n kube-system +``` +> **NOTE:** This is a temporary measure, double check the deployment for the current replicaset, as you will need this for when you scale back after the completion of the upgrade. #### Upgrading the control pane @@ -130,7 +137,7 @@ Click `Update` From the cluster control panel select `Compute` tab. -Select `Upgrade now` next to the monitoring node group. +Select `Upgrade now` next to the default node group. For update strategy select "Force update" @@ -154,6 +161,10 @@ Unpause the bootstrap pipeline. Scale down the coredns pods. +```bash +kubectl scale deployment coredns --replicas=3 -n kube-system +``` + ### Finishing touches The `kubectl` version in the `cloud-platform-cli` and `cloud-platform-tools-image` needs updating to match the current Kubernetes version. diff --git a/runbooks/source/upgrade-terraform-version.html.md.erb b/runbooks/source/upgrade-terraform-version.html.md.erb index 76887546..d6b60493 100644 --- a/runbooks/source/upgrade-terraform-version.html.md.erb +++ b/runbooks/source/upgrade-terraform-version.html.md.erb @@ -1,7 +1,7 @@ --- title: Upgrade Terraform Version weight: 54 -last_reviewed_on: 2024-05-24 +last_reviewed_on: 2024-11-25 review_in: 6 months --- @@ -126,10 +126,10 @@ Here is a snapshot of how our directory looks but this is likely to change: aws-accounts β”œβ”€β”€ cloud-platform-aws β”‚ β”œβ”€β”€ account # AWS Account specific configuration. -β”‚ └── vpc # VPC creation. Workspaces for individual clusters +β”‚ └── vpc # VPC creation. Workspaces for individual clusters β”‚ β”œβ”€β”€ eks # Holding EKS, workspaces for individual clusters. -β”‚ β”‚ └── components # EKS components. Workspaces for individual clusters -β”‚ └── kops # Holding KOPS, workspaces for individual clusters. +β”‚ β”‚ └── core # EKS core. Workspaces for individual clusters + | └── components # EKS components. 
β”œβ”€β”€ cloud-platform-dsd β”‚ └── main.tf β”œβ”€β”€ cloud-platform-ephemeral-test diff --git a/runbooks/source/upgrade-user-components.html.md.erb b/runbooks/source/upgrade-user-components.html.md.erb index d0e041a4..335538e6 100644 --- a/runbooks/source/upgrade-user-components.html.md.erb +++ b/runbooks/source/upgrade-user-components.html.md.erb @@ -1,7 +1,7 @@ --- title: Upgrade user components weight: 55 -last_reviewed_on: 2024-05-24 +last_reviewed_on: 2024-11-25 review_in: 6 months ---