Commit 26fafe0

Merge pull request #3148 from GoogleCloudPlatform/release-candidate
Release v1.41.0
harshthakkar01 authored Oct 25, 2024
2 parents eb00254 + 0c0f1c4 commit 26fafe0
Showing 258 changed files with 3,633 additions and 813 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/pr-label-validation.yml
@@ -28,11 +28,13 @@ on:
       - ready_for_review
       - unlocked
     branches:
       - main
+      - develop
+      - release-candidate
 
 jobs:
   pr-label-validation:
-    if: github.repository == 'GoogleCloudPlatform/cluster-toolkit'
+    if: github.repository == 'GoogleCloudPlatform/cluster-toolkit' && github.event.pull_request.draft == false
     runs-on: ubuntu-latest
     permissions:
       pull-requests: read
3 changes: 1 addition & 2 deletions .github/workflows/pr-precommit.yml
@@ -19,9 +19,8 @@ name: 'Use pre-commit to validate Pull Request'
 on:
   pull_request:
     types:
-      - edited
       - opened
-      - labeled
+      - reopened
       - synchronize
     branches:
       - main
18 changes: 18 additions & 0 deletions cmd/create.go
@@ -125,9 +125,27 @@ func expandOrDie(path string) (config.Blueprint, *config.YamlCtx) {
 	// Expand the blueprint
 	checkErr(bp.Expand(), ctx)
 	validateMaybeDie(bp, *ctx)
+	v5DeprecationWarning(bp)
 
 	return bp, ctx
 }
 
+// TODO: Remove this warning when v5 deprecation is complete
+func v5DeprecationWarning(bp config.Blueprint) {
+	alreadyContainsV5 := false
+	bp.WalkModulesSafe(func(mp config.ModulePath, m *config.Module) {
+		if strings.Contains(m.Source, "schedmd-slurm-gcp-v5-controller") && !alreadyContainsV5 {
+			logging.Info(boldYellow(
+				"We have supported slurm-gcp v5 since July 2022 and are now deprecating it, having launched slurm-gcp v6 in June 2024.\n" +
+					"Toolkit blueprints using slurm-gcp v5 will be marked \"deprecated\" starting October 2024, and slurm-gcp v6 will become the default deployment.\n" +
+					"We won't begin removing slurm-gcp v5 blueprints until January 6, 2025, when the Cluster Toolkit team will cease support for slurm-gcp v5.\n" +
+					"While this will not directly or immediately impact running clusters, we recommend replacing any v5 clusters with slurm-gcp v6.",
+			))
+			alreadyContainsV5 = true // Avoid repeating the message when a blueprint contains multiple v5 controllers.
+		}
+	})
+}
+
 // TODO: move to expand.go
 func validateMaybeDie(bp config.Blueprint, ctx config.YamlCtx) {
 	err := validators.Execute(bp)
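For context, the warning fires on any blueprint whose module sources contain "schedmd-slurm-gcp-v5-controller". A hypothetical before/after sketch of the migration it points users toward (the module ID is illustrative, not from this diff):

    # Triggers the deprecation warning:
    - id: slurm_controller
      source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller

    # Suggested replacement (slurm-gcp v6 controller):
    - id: slurm_controller
      source: community/modules/scheduler/schedmd-slurm-gcp-v6-controller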
2 changes: 1 addition & 1 deletion cmd/root.go
@@ -53,7 +53,7 @@ HPC deployments on the Google Cloud Platform.`,
 				logging.Fatal("cmd.Help function failed: %s", err)
 			}
 		},
-		Version:     "v1.40.0",
+		Version:     "v1.41.0",
 		Annotations: annotation,
 	}
 )
2 changes: 1 addition & 1 deletion community/examples/AMD/hpc-amd-slurm.yaml
@@ -168,7 +168,7 @@ deployment_groups:
         # these images must match the images used by Slurm modules below because
         # we are building OpenMPI with PMI support in libraries contained in
         # Slurm installation
-        family: slurm-gcp-6-6-hpc-rocky-linux-8
+        family: slurm-gcp-6-7-hpc-rocky-linux-8
         project: schedmd-slurm-public
 
   - id: low_cost_nodeset
2 changes: 1 addition & 1 deletion community/examples/hpc-build-slurm-image.yaml
@@ -23,7 +23,7 @@ vars:
   image_build_machine_type: n2d-standard-16
   build_from_image_family: hpc-rocky-linux-8
   build_from_image_project: cloud-hpc-image-public
-  build_from_git_ref: 6.7.0
+  build_from_git_ref: 6.8.2
   built_image_family: my-custom-slurm
   built_instance_image:
     family: $(vars.built_image_family)
1 change: 0 additions & 1 deletion community/examples/hpc-slurm-ramble-gromacs.yaml
@@ -31,7 +31,6 @@ deployment_groups:
   modules:
   # Source is an embedded module, denoted by "modules/*" without ./, ../, /
   # as a prefix. To refer to a local module, prefix with ./, ../ or /
-  # Example - ./modules/network/vpc
   - id: network
     source: modules/network/vpc
 
1 change: 0 additions & 1 deletion community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml
@@ -34,7 +34,6 @@ deployment_groups:
   modules:
   # Source is an embedded module, denoted by "modules/*" without ./, ../, /
   # as a prefix. To refer to a local module, prefix with ./, ../ or /
-  # Example - ./modules/network/vpc
   - id: network1
     source: modules/network/vpc
 
3 changes: 1 addition & 2 deletions community/examples/hpc-slurm-ubuntu2004.yaml
@@ -24,7 +24,7 @@ vars:
   slurm_image:
     # Please refer to the following link for the latest images:
     # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems
-    family: slurm-gcp-6-6-ubuntu-2004-lts
+    family: slurm-gcp-6-7-ubuntu-2004-lts
     project: schedmd-slurm-public
   instance_image_custom: true
 
@@ -33,7 +33,6 @@ deployment_groups:
   modules:
   # Source is an embedded module, denoted by "modules/*" without ./, ../, /
   # as a prefix. To refer to a local module, prefix with ./, ../ or /
-  # Example - ./modules/network/vpc
   - id: network1
     source: modules/network/vpc
 
2 changes: 1 addition & 1 deletion community/examples/hpc-slurm6-apptainer.yaml
@@ -60,7 +60,7 @@ deployment_groups:
     settings:
       source_image_project_id: [schedmd-slurm-public]
       # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family
-      source_image_family: slurm-gcp-6-6-hpc-rocky-linux-8
+      source_image_family: slurm-gcp-6-7-hpc-rocky-linux-8
       # You can find size of source image by using following command
       # gcloud compute images describe-from-family <source_image_family> --project schedmd-slurm-public
       disk_size: $(vars.disk_size)
1 change: 0 additions & 1 deletion community/examples/htc-slurm-v5-legacy.yaml
@@ -42,7 +42,6 @@ deployment_groups:
   modules:
   # Source is an embedded module, denoted by "modules/*" without ./, ../, /
   # as a prefix. To refer to a local or community module, prefix with ./, ../ or /
-  # Example - ./modules/network/pre-existing-vpc
   - id: network1
     source: modules/network/vpc
 
1 change: 0 additions & 1 deletion community/examples/htc-slurm.yaml
@@ -42,7 +42,6 @@ deployment_groups:
   modules:
   # Source is an embedded module, denoted by "modules/*" without ./, ../, /
   # as a prefix. To refer to a local or community module, prefix with ./, ../ or /
-  # Example - ./modules/network/pre-existing-vpc
   - id: network
     source: modules/network/vpc
 
3 changes: 2 additions & 1 deletion community/examples/tutorial-starccm-slurm.yaml
@@ -15,6 +15,8 @@
 ---
 
 blueprint_name: starccm-on-slurm
+toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
+toolkit_modules_version: v1.41.0
 
 vars:
   project_id: ## Set GCP Project ID Here ##
@@ -30,7 +32,6 @@ deployment_groups:
   modules:
   # Source is an embedded module, denoted by "modules/*" without ./, ../, /
   # as a prefix. To refer to a local module, prefix with ./, ../ or /
-  # Example - ./modules/network/vpc
   - id: network1
     source: modules/network/vpc
 
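If I read the two new toolkit_modules_* fields correctly, they pin the blueprint's embedded module sources to a released Toolkit version instead of whatever is checked out locally. Under that assumption, a source such as modules/network/vpc would resolve roughly as if it had been written:

    - id: network1
      source: github.com/GoogleCloudPlatform/cluster-toolkit//modules/network/vpc?ref=v1.41.0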
6 changes: 6 additions & 0 deletions community/examples/xpk-gke-a3-megagpu-files/config-map.yaml.tftpl
@@ -0,0 +1,6 @@
kind: ConfigMap
apiVersion: v1
metadata:
  name: ${name}
data:
  h100-mega-80gb-8: "${num_nodes}"
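The workload_configmap module at the end of this diff renders this template with name: "xpk-gke-a3-megagpu-resources-configmap" and num_nodes: "4", so the resulting manifest would come out as:

    kind: ConfigMap
    apiVersion: v1
    metadata:
      name: xpk-gke-a3-megagpu-resources-configmap
    data:
      h100-mega-80gb-8: "4"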
73 changes: 73 additions & 0 deletions community/examples/xpk-gke-a3-megagpu-files/kueue-xpk-configuration.yaml.tftpl
@@ -0,0 +1,73 @@
apiVersion: kueue.x-k8s.io/v1beta1
kind: ResourceFlavor
metadata:
  name: 1xh100-mega-80gb-8
spec:
  nodeLabels:
    cloud.google.com/gke-accelerator: nvidia-h100-mega-80gb
---

apiVersion: kueue.x-k8s.io/v1beta1
kind: ClusterQueue
metadata:
  name: cluster-queue
spec:
  preemption:
    reclaimWithinCohort: Never # Don't preempt other queues in the cohort.
    withinClusterQueue: LowerPriority
  namespaceSelector: {} # match all.
  resourceGroups:
  - coveredResources: ["nvidia.com/gpu"]
    flavors:
    - name: 1xh100-mega-80gb-8
      resources:
      - name: "nvidia.com/gpu"
        nominalQuota: ${num_chips}
---
apiVersion: kueue.x-k8s.io/v1beta1
kind: LocalQueue
metadata:
  namespace: default
  name: multislice-queue
spec:
  clusterQueue: cluster-queue
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: very-low
value: 100
globalDefault: false
description: "Very Low"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: low
value: 250
globalDefault: false
description: "Low"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: medium
value: 500
globalDefault: false
description: "Medium"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: high
value: 750
globalDefault: false
description: "High"
---
apiVersion: scheduling.k8s.io/v1
kind: PriorityClass
metadata:
  name: very-high
value: 1000
globalDefault: false
description: "Very High"
118 changes: 118 additions & 0 deletions community/examples/xpk-gke-a3-megagpu.yaml
@@ -0,0 +1,118 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---

blueprint_name: xpk-gke-a3-megagpu

vars:
  project_id: ## Set GCP Project ID Here ##
  deployment_name: xpk-gke-a3-megagpu
  region: us-central1
  zone: us-central1-c

  # CIDR block containing the IP of the machine calling terraform.
  # The following line must be updated for this example to work.
  authorized_cidr: <your-ip-address>/32

deployment_groups:
- group: primary
  modules:
  - id: network1
    source: modules/network/vpc
    settings:
      subnetwork_name: xpk-gke-a3-megagpu-subnet
      secondary_ranges:
        xpk-gke-a3-megagpu-subnet:
        - range_name: pods
          ip_cidr_range: 10.4.0.0/14
        - range_name: services
          ip_cidr_range: 10.0.32.0/20

  - id: gpunets
    source: modules/network/multivpc
    settings:
      network_name_prefix: $(vars.deployment_name)-gpunet
      global_ip_address_range: 192.169.0.0/16
      network_count: 8
      subnetwork_cidr_suffix: 24

  - id: gke_cluster
    source: modules/scheduler/gke-cluster
    use: [network1, gpunets]
    settings:
      master_authorized_networks:
      - cidr_block: $(vars.authorized_cidr) # Allows your machine to run kubectl commands; required for the multi-network setup.
        display_name: "kubectl-access-network"
      system_node_pool_machine_type: "e2-standard-32"
    outputs: [instructions]

  - id: group_placement_0
    source: modules/compute/resource-policy
    settings:
      name: $(vars.deployment_name)-gp-np-0
      group_placement_max_distance: 2

  - id: group_placement_1
    source: modules/compute/resource-policy
    settings:
      name: $(vars.deployment_name)-gp-np-1
      group_placement_max_distance: 2

  - id: a3_megagpu_pool_0
    source: modules/compute/gke-node-pool
    use: [gke_cluster, gpunets, group_placement_0]
    settings:
      name: a3-megagpu-pool-0
      machine_type: a3-megagpu-8g
      autoscaling_total_min_nodes: 2
      initial_node_count: 2
      zones: [$(vars.zone)]
      host_maintenance_interval: PERIODIC
    outputs: [instructions]

  - id: a3_megagpu_pool_1
    source: modules/compute/gke-node-pool
    use: [gke_cluster, gpunets, group_placement_1]
    settings:
      name: a3-megagpu-pool-1
      machine_type: a3-megagpu-8g
      autoscaling_total_min_nodes: 2
      initial_node_count: 2
      zones: [$(vars.zone)]
      host_maintenance_interval: PERIODIC
    outputs: [instructions]

  - id: workload_component_install
    source: modules/management/kubectl-apply
    use: [gke_cluster]
    settings:
      kueue:
        install: true
        config_path: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/kueue-xpk-configuration.yaml.tftpl
        config_template_vars: {num_chips: "32"}
      jobset:
        install: true

  - id: topology_aware_scheduler_install
    source: community/modules/compute/gke-topology-scheduler
    use: [gke_cluster]

  - id: workload_configmap
    source: modules/management/kubectl-apply
    use: [gke_cluster]
    settings:
      apply_manifests:
      - source: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/config-map.yaml.tftpl
        template_vars: {name: "xpk-gke-a3-megagpu-resources-configmap", num_nodes: "4"}
2 changes: 1 addition & 1 deletion community/front-end/ofe/requirements.txt
@@ -19,7 +19,7 @@ dill==0.3.6
 distlib==0.3.6
 # django-revproxy==0.11.0 released but not yet in pypi
 git+https://github.com/jazzband/django-revproxy.git@d2234005135dc0771b7c4e0bb0465664ccfa5787
-Django==4.2.15
+Django==4.2.16
 django-allauth==0.54.0
 django-extensions==3.2.3
 djangorestframework==3.15.2
(Diff truncated: only a subset of the 258 changed files is shown above.)
