diff --git a/CHANGELOG.md b/CHANGELOG.md index 7db33318..340ab526 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## UNRELEASED ### **Added** +- added new manifest `manifests/fine-tuning-6B` ### **Changed** diff --git a/manifests/fine-tuning-6B/base-modules.yaml b/manifests/fine-tuning-6B/base-modules.yaml new file mode 100644 index 00000000..f9f2e8f5 --- /dev/null +++ b/manifests/fine-tuning-6B/base-modules.yaml @@ -0,0 +1,26 @@ +name: networking +path: git::https://github.com/awslabs/idf-modules.git//modules/network/basic-cdk?ref=release/1.11.0&depth=1 +parameters: + - name: InternetAccessible + value: true +--- +name: buckets +path: git::https://github.com/awslabs/idf-modules.git//modules/storage/buckets?ref=release/1.11.0&depth=1 +parameters: + - name: EncryptionType + value: SSE + - name: RetentionType + value: DESTROY +--- +name: ray-ecr +path: git::https://github.com/awslabs/idf-modules.git//modules/storage/ecr?ref=release/1.11.0&depth=1 +targetAccount: primary +parameters: + - name: ImageTagMutability + value: MUTABLE + - name: ImageScanOnPush + value: True + - name: Encryption + value: KMS_MANAGED + - name: RemovalPolicy + value: DESTROY diff --git a/manifests/fine-tuning-6B/core-modules.yaml b/manifests/fine-tuning-6B/core-modules.yaml new file mode 100644 index 00000000..ba3ac288 --- /dev/null +++ b/manifests/fine-tuning-6B/core-modules.yaml @@ -0,0 +1,112 @@ +name: eks +path: git::https://github.com/awslabs/idf-modules.git//modules/compute/eks?ref=release/1.11.0&depth=1 +dataFiles: + - filePath: git::https://github.com/awslabs/idf-modules.git//data/eks_dockerimage-replication/versions/1.29.yaml?ref=release/1.11.0&depth=1 + - filePath: git::https://github.com/awslabs/idf-modules.git//data/eks_dockerimage-replication/versions/default.yaml?ref=release/1.11.0&depth=1 +parameters: + - name: VpcId + valueFrom: + moduleMetadata: + group: base + name: networking + key: 
VpcId + - name: ControlplaneSubnetIds + valueFrom: + moduleMetadata: + group: base + name: networking + key: PrivateSubnetIds + - name: DataplaneSubnetIds + valueFrom: + moduleMetadata: + group: base + name: networking + key: PrivateSubnetIds + - name: EksAdminRoleName + value: Admin + - name: EksPoweruserRoleName + value: PowerUser + - name: EksReadOnlyRoleName + value: ReadOnly + - name: EksVersion + value: "1.29" + # valueFrom: + # envVariable: GLOBAL_EKS_VERSION + - name: EksCompute + value: + eks_nodegroup_config: + - eks_ng_name: ng1 + eks_node_quantity: 1 + eks_node_max_quantity: 1 + eks_node_min_quantity: 1 + eks_node_disk_size: 400 + eks_node_instance_type: "m5.xlarge" + eks_node_labels: + usage: core + - eks_ng_name: ng-gpu + eks_node_quantity: 6 + eks_node_max_quantity: 15 + eks_node_min_quantity: 6 + eks_node_disk_size: 400 + eks_node_instance_type: "g4dn.4xlarge" + eks_node_labels: + usage: gpu + nvidia.com/gpu.present: "true" + use_gpu_ami: True + eks_node_taints: + - key: "nvidia.com/gpu" + value: "true" + # operator: "Equal" + effect: "NoSchedule" + install_nvidia_device_plugin: True + eks_node_spot: False + eks_secrets_envelope_encryption: True + eks_api_endpoint_private: False + - name: EksAddons + value: + # Autoscaling + deploy_cluster_autoscaler: True + deploy_metrics_server: True + # Observability + deploy_cloudwatch_observability_addon: True + # Storage + deploy_aws_fsx_csi: True +--- +name: fsx-lustre +path: git::https://github.com/awslabs/idf-modules.git//modules/storage/fsx-lustre?ref=release/1.11.0&depth=1 +parameters: + - name: VpcId + valueFrom: + moduleMetadata: + group: base + name: networking + key: VpcId + - name: PrivateSubnetIds + valueFrom: + moduleMetadata: + group: base + name: networking + key: PrivateSubnetIds + - name: FsDeploymentType + value: SCRATCH_2 + - name: StorageThroughput + value: 50 + - name: DataBucketName + valueFrom: + moduleMetadata: + group: base + name: buckets + key: ArtifactsBucketName + - name: 
DraExportPath + valueFrom: + parameterValue: draExportPath + - name: DraImportPath + valueFrom: + parameterValue: draImportPath + - name: FsxVersion + value: "2.15" + - name: Namespace + valueFrom: + parameterValue: rayNamespaceName + - name: ImportPolicy + value: "NEW_CHANGED_DELETED" diff --git a/manifests/fine-tuning-6B/deployment.yaml b/manifests/fine-tuning-6B/deployment.yaml new file mode 100644 index 00000000..40672468 --- /dev/null +++ b/manifests/fine-tuning-6B/deployment.yaml @@ -0,0 +1,30 @@ +name: fine-tuning-6B +forceDependencyRedeploy: True +toolchainRegion: us-east-1 +groups: + - name: base + path: manifests/fine-tuning-6B/base-modules.yaml + - name: images + path: manifests/fine-tuning-6B/images-modules.yaml + - name: core + path: manifests/fine-tuning-6B/core-modules.yaml + - name: integration + path: manifests/fine-tuning-6B/integration-modules.yaml + - name: ray-operator + path: manifests/fine-tuning-6B/ray-operator-modules.yaml + - name: ray-cluster + path: manifests/fine-tuning-6B/ray-cluster-modules.yaml +targetAccountMappings: + - alias: primary + accountId: + valueFrom: + envVariable: PRIMARY_ACCOUNT + default: true + codebuildImage: aws/codebuild/standard:7.0 + parametersGlobal: + rayNamespaceName: ray + draImportPath: /ray/import/ + draExportPath: /ray/export/ + regionMappings: + - region: us-east-1 + default: true diff --git a/manifests/fine-tuning-6B/images-modules.yaml b/manifests/fine-tuning-6B/images-modules.yaml new file mode 100644 index 00000000..a4705241 --- /dev/null +++ b/manifests/fine-tuning-6B/images-modules.yaml @@ -0,0 +1,10 @@ +name: ray +path: git::https://github.com/awslabs/aiops-modules.git//modules/eks/ray-image?ref=release/1.5.0&depth=1 +targetAccount: primary +parameters: + - name: EcrRepoName + valueFrom: + moduleMetadata: + group: base + name: ray-ecr + key: EcrRepositoryName diff --git a/manifests/fine-tuning-6B/integration-modules.yaml b/manifests/fine-tuning-6B/integration-modules.yaml new file mode 100644 index 
00000000..f4b1fb89 --- /dev/null +++ b/manifests/fine-tuning-6B/integration-modules.yaml @@ -0,0 +1,63 @@ +name: lustre-on-eks +path: git::https://github.com/awslabs/idf-modules.git//modules/integration/fsx-lustre-on-eks?ref=release/1.11.0&depth=1 +parameters: + - name: EksClusterAdminRoleArn + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksClusterMasterRoleArn + - name: EksHandlerRoleArn + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksHandlerRoleArn + - name: EksClusterName + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksClusterName + - name: EksOidcArn + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksOidcArn + - name: EksClusterSecurityGroupId + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksClusterSecurityGroupId + - name: Namespace + valueFrom: + parameterValue: rayNamespaceName + - name: FsxFileSystemId + valueFrom: + moduleMetadata: + group: core + name: fsx-lustre + key: FSxLustreFileSystemId + - name: FsxSecurityGroupId + valueFrom: + moduleMetadata: + group: core + name: fsx-lustre + key: FSxLustreSecurityGroup + - name: FsxMountName + valueFrom: + moduleMetadata: + group: core + name: fsx-lustre + key: FSxLustreMountName + - name: FsxDnsName + valueFrom: + moduleMetadata: + group: core + name: fsx-lustre + key: FSxLustreAttrDnsName + - name: DraExportPath + valueFrom: + parameterValue: draExportPath diff --git a/manifests/fine-tuning-6B/ray-cluster-modules.yaml b/manifests/fine-tuning-6B/ray-cluster-modules.yaml new file mode 100644 index 00000000..c72b7688 --- /dev/null +++ b/manifests/fine-tuning-6B/ray-cluster-modules.yaml @@ -0,0 +1,82 @@ +name: ray-cluster +path: git::https://github.com/awslabs/aiops-modules.git//modules/eks/ray-cluster?ref=release/1.5.0&depth=1 +parameters: + - name: EksClusterAdminRoleArn + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksClusterMasterRoleArn + - name: EksClusterName + valueFrom: + moduleMetadata: + group: 
core + name: eks + key: EksClusterName + - name: EksOidcArn + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksOidcArn + - name: Namespace + valueFrom: + parameterValue: rayNamespaceName + - name: ServiceAccountName + valueFrom: + moduleMetadata: + group: ray-operator + name: ray-operator + key: EksServiceAccountName + - name: HeadResources + value: + requests: + cpu: "1" + memory: "8G" + limits: + cpu: "4" + memory: "16G" + - name: WorkerReplicas + value: 1 + - name: WorkerMinReplicas + value: 1 + - name: WorkerMaxReplicas + value: 15 + - name: WorkerResources + value: + requests: + cpu: "4" + memory: "8G" + limits: + cpu: "14" + memory: "60G" + - name: DataBucketName + valueFrom: + moduleMetadata: + group: base + name: buckets + key: ArtifactsBucketName + - name: ImageUri + valueFrom: + moduleMetadata: + group: images + name: ray + key: ImageUri + - name: WorkerTolerations + value: # make sure to match w/ the taints on the GPU Nodegroup + - key: "nvidia.com/gpu" + value: "true" + # operator: "Equal" + effect: "NoSchedule" + - name: WorkerLabels + value: # make sure to match w/ the labels on the GPU Nodegroup + usage: gpu + - name: PvcName + valueFrom: + moduleMetadata: + group: integration + name: lustre-on-eks + key: PersistentVolumeClaimName + - name: DraExportPath + valueFrom: + parameterValue: draExportPath diff --git a/manifests/fine-tuning-6B/ray-operator-modules.yaml b/manifests/fine-tuning-6B/ray-operator-modules.yaml new file mode 100644 index 00000000..45bfb2e6 --- /dev/null +++ b/manifests/fine-tuning-6B/ray-operator-modules.yaml @@ -0,0 +1,60 @@ +name: ray-operator +path: git::https://github.com/awslabs/aiops-modules.git//modules/eks/ray-operator?ref=release/1.5.0&depth=1 +parameters: + - name: EksClusterAdminRoleArn + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksClusterMasterRoleArn + - name: EksHandlerRoleArn + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksHandlerRoleArn + - name: EksClusterName 
+ valueFrom: + moduleMetadata: + group: core + name: eks + key: EksClusterName + - name: EksClusterEndpoint + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksClusterEndpoint + - name: EksOidcArn + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksOidcArn + - name: EksOpenidIssuer + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksClusterOpenIdConnectIssuer + - name: EksCertAuthData + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksClusterCertAuthData + - name: EksClusterSecurityGroupId + valueFrom: + moduleMetadata: + group: core + name: eks + key: EksClusterSecurityGroupId + - name: Namespace + valueFrom: + parameterValue: rayNamespaceName + - name: DataBucketName + valueFrom: + moduleMetadata: + group: base + name: buckets + key: ArtifactsBucketName diff --git a/manifests/fine-tuning-6B/scripts/inference-6B.py b/manifests/fine-tuning-6B/scripts/inference-6B.py new file mode 100644 index 00000000..b33a8546 --- /dev/null +++ b/manifests/fine-tuning-6B/scripts/inference-6B.py @@ -0,0 +1,21 @@ +import torch +import torchvision + +from transformers import pipeline, AutoTokenizer, GPTJForCausalLM + +model = GPTJForCausalLM.from_pretrained("/ray/export/.../checkpoint") +tokenizer = AutoTokenizer.from_pretrained("/ray/export/.../checkpoint") + +pipe = pipeline( + model=model, + tokenizer=tokenizer, + task="text-generation", + torch_dtype=torch.float16, + device_map="auto", +) + +# Generate from prompts! 
+for sentence in pipe( + ["Romeo and Juliet", "war", "blood"], do_sample=True, min_length=20 +): + print(sentence) \ No newline at end of file diff --git a/manifests/fine-tuning-6B/scripts/training-6B.py b/manifests/fine-tuning-6B/scripts/training-6B.py new file mode 100644 index 00000000..e04ee198 --- /dev/null +++ b/manifests/fine-tuning-6B/scripts/training-6B.py @@ -0,0 +1,230 @@ +import numpy as np +import pandas as pd +import os + +import ray +import ray.data +from datasets import load_dataset +import evaluate +import torch +from transformers import ( + Trainer, + TrainingArguments, + GPTJForCausalLM, + AutoTokenizer, + default_data_collator, +) +from transformers.utils.logging import disable_progress_bar, enable_progress_bar +from ray import train +from ray.train.huggingface.transformers import prepare_trainer, RayTrainReportCallback +from ray.train.torch import TorchTrainer +from ray.train import RunConfig, ScalingConfig + + +model_name = "EleutherAI/gpt-j-6B" +use_gpu = True +num_workers = 5 +cpus_per_worker = 12 +block_size = 512 +storage_path = "/ray/export" + + +ray.init( + runtime_env={ + "pip": [ + "datasets", + "evaluate", + # The latest combination accelerate==0.25.0, transformers==4.36.0, deepspeed==0.12.4 + # has issues with DeepSpeed process group initialization, + # and will result in a batch_size validation problem. + # TODO(ml-team): get rid of the pins once the issue is fixed. 
+ "accelerate==0.18.0", + "transformers==4.26.0", + "torch>=1.12.0", + "deepspeed==0.12.3", + ], + }, +) + +print("Loading tiny_shakespeare dataset") +current_dataset = load_dataset("tiny_shakespeare") + +ray_datasets = { + "train": ray.data.from_huggingface(current_dataset["train"]), + "validation": ray.data.from_huggingface(current_dataset["validation"]), +} + + +def split_text(batch: pd.DataFrame) -> pd.DataFrame: + text = list(batch["text"]) + flat_text = "".join(text) + split_text = [ + x.strip() + for x in flat_text.split("\n") + if x.strip() and not x.strip()[-1] == ":" + ] + return pd.DataFrame(split_text, columns=["text"]) + + +def tokenize(batch: pd.DataFrame) -> dict: + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False) + tokenizer.pad_token = tokenizer.eos_token + ret = tokenizer( + list(batch["text"]), + truncation=True, + max_length=block_size, + padding="max_length", + return_tensors="np", + ) + ret["labels"] = ret["input_ids"].copy() + return dict(ret) + + +processed_datasets = { + key: ( + ds.map_batches(split_text, batch_format="pandas") + .map_batches(tokenize, batch_format="pandas") + ) + for key, ds in ray_datasets.items() +} + + +def train_func(config): + # Use the actual number of CPUs assigned by Ray + os.environ["OMP_NUM_THREADS"] = str( + train.get_context().get_trial_resources().bundles[-1].get("CPU", 1) + ) + # Enable tf32 for better performance + torch.backends.cuda.matmul.allow_tf32 = True + + batch_size = config.get("batch_size", 4) + epochs = config.get("epochs", 2) + warmup_steps = config.get("warmup_steps", 0) + learning_rate = config.get("learning_rate", 0.00002) + weight_decay = config.get("weight_decay", 0.01) + steps_per_epoch = config.get("steps_per_epoch") + + deepspeed = { + "fp16": { + "enabled": "auto", + "initial_scale_power": 8, + "hysteresis": 4, + "consecutive_hysteresis": True, + }, + "bf16": {"enabled": "auto"}, + "optimizer": { + "type": "AdamW", + "params": { + "lr": "auto", + "betas": "auto", + 
"eps": "auto", + }, + }, + "zero_optimization": { + "stage": 3, + "offload_optimizer": { + "device": "cpu", + "pin_memory": False, # out of memory + }, + "overlap_comm": False, # running out of GPU RAM + "contiguous_gradients": True, + "reduce_bucket_size": "auto", + "stage3_prefetch_bucket_size": "auto", + "stage3_param_persistence_threshold": "auto", + "gather_16bit_weights_on_model_save": True, + "round_robin_gradients": True, + }, + "gradient_accumulation_steps": "auto", + "gradient_clipping": "auto", + "steps_per_print": 10, + "train_batch_size": "auto", + "train_micro_batch_size_per_gpu": "auto", + "wall_clock_breakdown": False, + } + + print("Preparing training arguments") + training_args = TrainingArguments( + "output", + logging_steps=1, + save_strategy="steps", + save_steps=steps_per_epoch, + max_steps=steps_per_epoch * epochs, + per_device_train_batch_size=batch_size, + gradient_accumulation_steps=1, + learning_rate=learning_rate, + weight_decay=weight_decay, + warmup_steps=warmup_steps, + label_names=["input_ids", "attention_mask"], + push_to_hub=False, + report_to="none", + disable_tqdm=True, # declutter the output a little + fp16=True, + gradient_checkpointing=True, + deepspeed=deepspeed, + ) + disable_progress_bar() + + tokenizer = AutoTokenizer.from_pretrained(model_name) + tokenizer.pad_token = tokenizer.eos_token + + print("Loading model") + + model = GPTJForCausalLM.from_pretrained(model_name, use_cache=False) + model.resize_token_embeddings(len(tokenizer)) + + print("Model loaded") + + enable_progress_bar() + + metric = evaluate.load("accuracy") + + train_ds = train.get_dataset_shard("train") + eval_ds = train.get_dataset_shard("validation") + + train_ds_iterable = train_ds.iter_torch_batches( + batch_size=batch_size, + local_shuffle_buffer_size=train.get_context().get_world_size() * batch_size, + ) + eval_ds_iterable = eval_ds.iter_torch_batches(batch_size=batch_size) + + def compute_metrics(eval_pred): + logits, labels = eval_pred + predictions = 
np.argmax(logits, axis=-1) + return metric.compute(predictions=predictions, references=labels) + + trainer = Trainer( + model=model, + args=training_args, + train_dataset=train_ds_iterable, + eval_dataset=eval_ds_iterable, + compute_metrics=compute_metrics, + tokenizer=tokenizer, + data_collator=default_data_collator, + ) + + # Add callback to report checkpoints to Ray Train + trainer.add_callback(RayTrainReportCallback()) + trainer = prepare_trainer(trainer) + trainer.train() + +batch_size = 12 +train_ds_size = processed_datasets["train"].count() +steps_per_epoch = train_ds_size // (batch_size * num_workers) + +trainer = TorchTrainer( + train_loop_per_worker=train_func, + train_loop_config={ + "epochs": 1, + "batch_size": batch_size, # per device + "steps_per_epoch": steps_per_epoch, + }, + scaling_config=ScalingConfig( + num_workers=num_workers, + use_gpu=use_gpu, + resources_per_worker={"GPU": 1, "CPU": cpus_per_worker}, + ), + datasets=processed_datasets, + run_config=RunConfig(storage_path=storage_path), +) + +results = trainer.fit() \ No newline at end of file