Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feat/fine tuning 6b #243

Merged
merged 7 commits into from
Sep 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## UNRELEASED

### **Added**
- added new manifest `manifests/fine-tuning-6B`

### **Changed**

Expand Down
26 changes: 26 additions & 0 deletions manifests/fine-tuning-6B/base-modules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Base group: three module documents (networking, buckets, ray-ecr).
# Outputs of these modules are read by later groups through
# `valueFrom.moduleMetadata` with `group: base`.
name: networking
path: git::https://github.com/awslabs/idf-modules.git//modules/network/basic-cdk?ref=release/1.11.0&depth=1
parameters:
  - name: InternetAccessible
    value: true
---
name: buckets
path: git::https://github.com/awslabs/idf-modules.git//modules/storage/buckets?ref=release/1.11.0&depth=1
parameters:
  - name: EncryptionType
    value: SSE
  - name: RetentionType
    value: DESTROY
---
# ECR repository whose `EcrRepositoryName` metadata is consumed by the
# `ray` module in the images group.
name: ray-ecr
path: git::https://github.com/awslabs/idf-modules.git//modules/storage/ecr?ref=release/1.11.0&depth=1
targetAccount: primary
parameters:
  - name: ImageTagMutability
    value: MUTABLE
  - name: ImageScanOnPush
    value: true
  - name: Encryption
    value: KMS_MANAGED
  - name: RemovalPolicy
    value: DESTROY
112 changes: 112 additions & 0 deletions manifests/fine-tuning-6B/core-modules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# Core group: the EKS cluster and the FSx for Lustre file system.
# Network and bucket identifiers are pulled from the base group's metadata.
name: eks
path: git::https://github.com/awslabs/idf-modules.git//modules/compute/eks?ref=release/1.11.0&depth=1
dataFiles:
  - filePath: git::https://github.com/awslabs/idf-modules.git//data/eks_dockerimage-replication/versions/1.29.yaml?ref=release/1.11.0&depth=1
  - filePath: git::https://github.com/awslabs/idf-modules.git//data/eks_dockerimage-replication/versions/default.yaml?ref=release/1.11.0&depth=1
parameters:
  - name: VpcId
    valueFrom:
      moduleMetadata:
        group: base
        name: networking
        key: VpcId
  # Control plane and data plane both use the private subnets.
  - name: ControlplaneSubnetIds
    valueFrom:
      moduleMetadata:
        group: base
        name: networking
        key: PrivateSubnetIds
  - name: DataplaneSubnetIds
    valueFrom:
      moduleMetadata:
        group: base
        name: networking
        key: PrivateSubnetIds
  - name: EksAdminRoleName
    value: Admin
  - name: EksPoweruserRoleName
    value: PowerUser
  - name: EksReadOnlyRoleName
    value: ReadOnly
  # Quoted so the version stays a string; keep it in step with the
  # versions/1.29.yaml data file referenced above.
  - name: EksVersion
    value: "1.29"
    # valueFrom:
    #   envVariable: GLOBAL_EKS_VERSION
  - name: EksCompute
    value:
      eks_nodegroup_config:
        - eks_ng_name: ng1
          eks_node_quantity: 1
          eks_node_max_quantity: 1
          eks_node_min_quantity: 1
          eks_node_disk_size: 400
          eks_node_instance_type: "m5.xlarge"
          eks_node_labels:
            usage: core
        - eks_ng_name: ng-gpu
          eks_node_quantity: 6
          eks_node_max_quantity: 15
          eks_node_min_quantity: 6
          eks_node_disk_size: 400
          eks_node_instance_type: "g4dn.4xlarge"
          # Labels and taints here must stay in sync with WorkerLabels /
          # WorkerTolerations in ray-cluster-modules.yaml.
          eks_node_labels:
            usage: gpu
            nvidia.com/gpu.present: "true"
          use_gpu_ami: true
          eks_node_taints:
            - key: "nvidia.com/gpu"
              value: "true"
              # operator: "Equal"
              effect: "NoSchedule"
          # NOTE(review): nesting reconstructed from a flattened source;
          # assumed to be a per-nodegroup setting - confirm against the
          # idf-modules eks module README.
          install_nvidia_device_plugin: true
      eks_node_spot: false
      eks_secrets_envelope_encryption: true
      eks_api_endpoint_private: false
  - name: EksAddons
    value:
      # Autoscaling
      deploy_cluster_autoscaler: true
      deploy_metrics_server: true
      # Observability
      deploy_cloudwatch_observability_addon: true
      # Storage
      deploy_aws_fsx_csi: true
---
name: fsx-lustre
path: git::https://github.com/awslabs/idf-modules.git//modules/storage/fsx-lustre?ref=release/1.11.0&depth=1
parameters:
  - name: VpcId
    valueFrom:
      moduleMetadata:
        group: base
        name: networking
        key: VpcId
  - name: PrivateSubnetIds
    valueFrom:
      moduleMetadata:
        group: base
        name: networking
        key: PrivateSubnetIds
  - name: FsDeploymentType
    value: SCRATCH_2
  # NOTE(review): FSx documents per-unit storage throughput for the
  # PERSISTENT deployment types; confirm the module ignores this value
  # for SCRATCH_2.
  - name: StorageThroughput
    value: 50
  - name: DataBucketName
    valueFrom:
      moduleMetadata:
        group: base
        name: buckets
        key: ArtifactsBucketName
  # The DRA paths and namespace are resolved from `parametersGlobal`
  # in deployment.yaml.
  - name: DraExportPath
    valueFrom:
      parameterValue: draExportPath
  - name: DraImportPath
    valueFrom:
      parameterValue: draImportPath
  # Quoted so the version is not parsed as a float.
  - name: FsxVersion
    value: "2.15"
  - name: Namespace
    valueFrom:
      parameterValue: rayNamespaceName
  - name: ImportPolicy
    value: "NEW_CHANGED_DELETED"
30 changes: 30 additions & 0 deletions manifests/fine-tuning-6B/deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Deployment manifest for the fine-tuning-6B example.
# Groups are listed in dependency order: later groups read metadata
# exported by earlier ones (base -> images/core -> integration ->
# ray-operator -> ray-cluster).
name: fine-tuning-6B
forceDependencyRedeploy: true
toolchainRegion: us-east-1
groups:
  - name: base
    path: manifests/fine-tuning-6B/base-modules.yaml
  - name: images
    path: manifests/fine-tuning-6B/images-modules.yaml
  - name: core
    path: manifests/fine-tuning-6B/core-modules.yaml
  - name: integration
    path: manifests/fine-tuning-6B/integration-modules.yaml
  - name: ray-operator
    path: manifests/fine-tuning-6B/ray-operator-modules.yaml
  - name: ray-cluster
    path: manifests/fine-tuning-6B/ray-cluster-modules.yaml
targetAccountMappings:
  - alias: primary
    # The account id is read from the PRIMARY_ACCOUNT environment variable.
    accountId:
      valueFrom:
        envVariable: PRIMARY_ACCOUNT
    default: true
    codebuildImage: aws/codebuild/standard:7.0
    # Referenced from module manifests via `valueFrom.parameterValue`.
    parametersGlobal:
      rayNamespaceName: ray
      draImportPath: /ray/import/
      draExportPath: /ray/export/
    regionMappings:
      - region: us-east-1
        default: true
10 changes: 10 additions & 0 deletions manifests/fine-tuning-6B/images-modules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Images group: the Ray image module, pointed at the ECR repository
# created by the `ray-ecr` module in the base group.
name: ray
path: git::https://github.com/awslabs/aiops-modules.git//modules/eks/ray-image?ref=release/1.5.0&depth=1
targetAccount: primary
parameters:
  - name: EcrRepoName
    valueFrom:
      moduleMetadata:
        group: base
        name: ray-ecr
        key: EcrRepositoryName
63 changes: 63 additions & 0 deletions manifests/fine-tuning-6B/integration-modules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# Integration group: wires the FSx for Lustre file system into the EKS
# cluster. All cluster and file-system identifiers come from the core
# group's `eks` and `fsx-lustre` module metadata.
name: lustre-on-eks
path: git::https://github.com/awslabs/idf-modules.git//modules/integration/fsx-lustre-on-eks?ref=release/1.11.0&depth=1
parameters:
  # Note: the admin role parameter is fed from the `EksClusterMasterRoleArn`
  # metadata key (the names differ on purpose).
  - name: EksClusterAdminRoleArn
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterMasterRoleArn
  - name: EksHandlerRoleArn
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksHandlerRoleArn
  - name: EksClusterName
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterName
  - name: EksOidcArn
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksOidcArn
  - name: EksClusterSecurityGroupId
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterSecurityGroupId
  - name: Namespace
    valueFrom:
      parameterValue: rayNamespaceName
  - name: FsxFileSystemId
    valueFrom:
      moduleMetadata:
        group: core
        name: fsx-lustre
        key: FSxLustreFileSystemId
  - name: FsxSecurityGroupId
    valueFrom:
      moduleMetadata:
        group: core
        name: fsx-lustre
        key: FSxLustreSecurityGroup
  - name: FsxMountName
    valueFrom:
      moduleMetadata:
        group: core
        name: fsx-lustre
        key: FSxLustreMountName
  - name: FsxDnsName
    valueFrom:
      moduleMetadata:
        group: core
        name: fsx-lustre
        key: FSxLustreAttrDnsName
  - name: DraExportPath
    valueFrom:
      parameterValue: draExportPath
82 changes: 82 additions & 0 deletions manifests/fine-tuning-6B/ray-cluster-modules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
# Ray cluster group: deploys the Ray cluster onto the EKS cluster from
# the core group, using the image from the images group, the service
# account from the ray-operator group and the PVC from the integration
# group.
name: ray-cluster
path: git::https://github.com/awslabs/aiops-modules.git//modules/eks/ray-cluster?ref=release/1.5.0&depth=1
parameters:
  - name: EksClusterAdminRoleArn
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterMasterRoleArn
  - name: EksClusterName
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterName
  - name: EksOidcArn
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksOidcArn
  - name: Namespace
    valueFrom:
      parameterValue: rayNamespaceName
  - name: ServiceAccountName
    valueFrom:
      moduleMetadata:
        group: ray-operator
        name: ray-operator
        key: EksServiceAccountName
  # CPU and memory quantities are quoted so they stay strings.
  - name: HeadResources
    value:
      requests:
        cpu: "1"
        memory: "8G"
      limits:
        cpu: "4"
        memory: "16G"
  - name: WorkerReplicas
    value: 1
  - name: WorkerMinReplicas
    value: 1
  # Same value as `eks_node_max_quantity` of the `ng-gpu` node group
  # in core-modules.yaml.
  - name: WorkerMaxReplicas
    value: 15
  - name: WorkerResources
    value:
      requests:
        cpu: "4"
        memory: "8G"
      limits:
        cpu: "14"
        memory: "60G"
  - name: DataBucketName
    valueFrom:
      moduleMetadata:
        group: base
        name: buckets
        key: ArtifactsBucketName
  - name: ImageUri
    valueFrom:
      moduleMetadata:
        group: images
        name: ray
        key: ImageUri
  - name: WorkerTolerations
    value:  # make sure to match w/ the taints on the GPU Nodegroup
      - key: "nvidia.com/gpu"
        value: "true"
        # operator: "Equal"
        effect: "NoSchedule"
  - name: WorkerLabels
    value:  # make sure to match w/ the labels on the GPU Nodegroup
      usage: gpu
  - name: PvcName
    valueFrom:
      moduleMetadata:
        group: integration
        name: lustre-on-eks
        key: PersistentVolumeClaimName
  - name: DraExportPath
    valueFrom:
      parameterValue: draExportPath
60 changes: 60 additions & 0 deletions manifests/fine-tuning-6B/ray-operator-modules.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Ray operator group: installs the Ray operator on the EKS cluster from
# the core group. Its `EksServiceAccountName` metadata is consumed by
# the ray-cluster module.
name: ray-operator
path: git::https://github.com/awslabs/aiops-modules.git//modules/eks/ray-operator?ref=release/1.5.0&depth=1
parameters:
  - name: EksClusterAdminRoleArn
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterMasterRoleArn
  - name: EksHandlerRoleArn
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksHandlerRoleArn
  - name: EksClusterName
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterName
  - name: EksClusterEndpoint
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterEndpoint
  - name: EksOidcArn
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksOidcArn
  - name: EksOpenidIssuer
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterOpenIdConnectIssuer
  - name: EksCertAuthData
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterCertAuthData
  - name: EksClusterSecurityGroupId
    valueFrom:
      moduleMetadata:
        group: core
        name: eks
        key: EksClusterSecurityGroupId
  - name: Namespace
    valueFrom:
      parameterValue: rayNamespaceName
  - name: DataBucketName
    valueFrom:
      moduleMetadata:
        group: base
        name: buckets
        key: ArtifactsBucketName
Loading
Loading