From 98072ef821882fb24868aecd81eb7369c72c08e4 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 5 Nov 2024 13:30:58 +0100 Subject: [PATCH 01/15] [fine_tuning] toolbox: fine_tuning_ray_fine_tuning_job: refactor to use a workload (entrypoint+app) directory --- .../Fine_Tuning.ray_fine_tuning_job.rst | 22 +-- projects/fine_tuning/toolbox/fine_tuning.py | 19 ++- .../defaults/main/config.yml | 24 ++-- .../tasks/main.yml | 131 ++++++++++-------- .../templates/base_config.yaml.j2 | 26 +++- .../templates/ray_cluster.yaml.j2 | 43 ++---- .../vars/main/resources.yml | 3 +- .../ray-finetune-llm-deepspeed/app}/.source | 0 .../ray-finetune-llm-deepspeed/app}/README.md | 0 .../deepspeed_configs/zero_3_llama_2_13b.json | 0 .../deepspeed_configs/zero_3_llama_2_70b.json | 0 .../deepspeed_configs/zero_3_llama_2_7b.json | 0 .../ray-finetune-llm-deepspeed/app}/lora.json | 0 .../app}/ray_finetune_llm_deepspeed.ipynb | 0 .../app}/ray_finetune_llm_deepspeed.py | 0 .../app}/ray_test.py | 0 .../app}/requirements.txt | 0 .../app}/training.py | 0 .../ray-finetune-llm-deepspeed/app}/utils.py | 0 .../app}/zero_3_offload_optim_param.json | 0 .../entrypoint/convert_alpaca.py | 0 .../entrypoint/convert_dataset_helper.py | 0 .../entrypoint/convert_replicate.py | 0 .../entrypoint/entrypoint.sh} | 10 +- .../entrypoint/prepare_dataset.sh | 6 +- .../entrypoint/study_dataset.py | 0 .../entrypoint/synthetic_dataset.txt | 0 .../vars/main/resources.yml | 1 - 28 files changed, 146 insertions(+), 139 deletions(-) rename projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/{files/ray-finetune-llm-deepspeed => workloads/ray-finetune-llm-deepspeed/app}/.source (100%) rename projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/{files/ray-finetune-llm-deepspeed => workloads/ray-finetune-llm-deepspeed/app}/README.md (100%) rename projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/{files/ray-finetune-llm-deepspeed => workloads/ray-finetune-llm-deepspeed/app}/deepspeed_configs/zero_3_llama_2_13b.json (100%) rename projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/{files/ray-finetune-llm-deepspeed => workloads/ray-finetune-llm-deepspeed/app}/deepspeed_configs/zero_3_llama_2_70b.json (100%) rename projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/{files/ray-finetune-llm-deepspeed => workloads/ray-finetune-llm-deepspeed/app}/deepspeed_configs/zero_3_llama_2_7b.json (100%) rename projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/{files/ray-finetune-llm-deepspeed => workloads/ray-finetune-llm-deepspeed/app}/lora.json (100%) rename projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/{files/ray-finetune-llm-deepspeed => workloads/ray-finetune-llm-deepspeed/app}/ray_finetune_llm_deepspeed.ipynb (100%) rename projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/{files/ray-finetune-llm-deepspeed => workloads/ray-finetune-llm-deepspeed/app}/ray_finetune_llm_deepspeed.py (100%) rename projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/{files/ray-finetune-llm-deepspeed => workloads/ray-finetune-llm-deepspeed/app}/ray_test.py (100%) rename projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/{files/ray-finetune-llm-deepspeed => workloads/ray-finetune-llm-deepspeed/app}/requirements.txt (100%) rename projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/{files/ray-finetune-llm-deepspeed => workloads/ray-finetune-llm-deepspeed/app}/training.py (100%) rename projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/{files/ray-finetune-llm-deepspeed 
=> workloads/ray-finetune-llm-deepspeed/app}/utils.py (100%)
 rename projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/{files/ray-finetune-llm-deepspeed => workloads/ray-finetune-llm-deepspeed/app}/zero_3_offload_optim_param.json (100%)
 rename projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/{files => workloads/ray-finetune-llm-deepspeed}/entrypoint/convert_alpaca.py (100%)
 rename projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/{files => workloads/ray-finetune-llm-deepspeed}/entrypoint/convert_dataset_helper.py (100%)
 rename projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/{files => workloads/ray-finetune-llm-deepspeed}/entrypoint/convert_replicate.py (100%)
 rename projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/{files/entrypoint/job_entrypoint.sh => workloads/ray-finetune-llm-deepspeed/entrypoint/entrypoint.sh} (93%)
 rename projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/{files => workloads/ray-finetune-llm-deepspeed}/entrypoint/prepare_dataset.sh (89%)
 rename projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/{files => workloads/ray-finetune-llm-deepspeed}/entrypoint/study_dataset.py (100%)
 rename projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/{files => workloads/ray-finetune-llm-deepspeed}/entrypoint/synthetic_dataset.txt (100%)

diff --git a/docs/toolbox.generated/Fine_Tuning.ray_fine_tuning_job.rst b/docs/toolbox.generated/Fine_Tuning.ray_fine_tuning_job.rst
index 454fd71033..f6e9797d0e 100644
--- a/docs/toolbox.generated/Fine_Tuning.ray_fine_tuning_job.rst
+++ b/docs/toolbox.generated/Fine_Tuning.ray_fine_tuning_job.rst
@@ -38,9 +38,11 @@ Parameters
 * The name of the model to use inside the /model directory of the PVC
 
 
-``ft_scripts_dir``
+``workload``
 
-* Directory where the fine-tuning scripts are stored
+* The name of the workload job to run (see the role's workload directory)
+
+* default value: ``ray-finetune-llm-deepspeed``
 
 
 ``dataset_name``
 
@@ -72,13 +74,6 @@ Parameters
 * If True, only prepare the dataset cache file and do not run the fine-tuning.
 
 
-``dataset_response_template``
-
-* The delimiter marking the beginning of the response in the dataset samples
-
-* default value: ``\n### Label:``
-
-
 ``container_image``
 
 * The image to use for the fine-tuning container
 
@@ -97,8 +92,6 @@ Parameters
 
 * The number of GPUs to request for the fine-tuning job
 
-* default value: ``1``
-
 
 ``memory``
 
@@ -145,3 +138,10 @@ Parameters
 
 * If true, sleeps forever instead of running the fine-tuning command.
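+
+Example of use (an illustrative sketch only: the job name and namespace below
+are made up, and the flags mirror the parameters documented above)::
+
+    ./run_toolbox.py fine_tuning ray_fine_tuning_job \
+        --name=ray-bench \
+        --namespace=fine-tuning-testing \
+        --workload=ray-benchmark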
+
+``capture_artifacts``
+
+* If enabled, captures the artifacts that will help post-mortem analyses
+
+* default value: ``True``
+
diff --git a/projects/fine_tuning/toolbox/fine_tuning.py b/projects/fine_tuning/toolbox/fine_tuning.py
index ea9b8fcfd1..d478b102b7 100644
--- a/projects/fine_tuning/toolbox/fine_tuning.py
+++ b/projects/fine_tuning/toolbox/fine_tuning.py
@@ -131,20 +131,19 @@ def ray_fine_tuning_job(
         self,
         name,
         namespace,
-        pvc_name,
+        pvc_name=None,
 
-        model_name,
-        ft_scripts_dir,
+        model_name=None,
+        workload="ray-finetune-llm-deepspeed",
 
-        dataset_name,
+        dataset_name=None,
         dataset_replication=1,
         dataset_transform=None,
         dataset_prefer_cache=True,
         dataset_prepare_cache_only=False,
-        dataset_response_template="\n### Label:",
         container_image="quay.io/rhoai/ray:2.35.0-py39-cu121-torch24-fa26",
         ray_version="2.35.0",
-        gpu=1,
+        gpu=0,
         memory=10,
         cpu=1,
         request_equals_limits=False,
@@ -157,6 +156,7 @@ def ray_fine_tuning_job(
 
         hyper_parameters={},
         sleep_forever=False,
+        capture_artifacts=True,
     ):
         """
         Run a simple Ray fine-tuning Job.
@@ -175,7 +175,6 @@ def ray_fine_tuning_job(
         dataset_transform: name of the transformation to apply to the dataset
         dataset_prefer_cache: if True, and the dataset has to be transformed/duplicated, save and/or load it from the PVC
         dataset_prepare_cache_only: if True, only prepare the dataset cache file and do not run the fine-tuning.
-        dataset_response_template: the delimiter marking the beginning of the response in the dataset samples
         container_image: the image to use for the fine-tuning container
         gpu: the number of GPUs to request for the fine-tuning job
         memory: the number of RAM gigs to request for the fine-tuning job (in Gigs)
@@ -191,6 +190,12 @@ def ray_fine_tuning_job(
 
         sleep_forever: if true, sleeps forever instead of running the fine-tuning command.
         ray_version: the version identifier passed to the RayCluster object
+        capture_artifacts: if enabled, captures the artifacts that will help post-mortem analyses
+
+        workload: the name of the workload job to run (see the role's workload directory)
         """
 
+        if dataset_name is None:
+            dataset_replication = None
+
         return RunAnsibleRole(locals())

diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/defaults/main/config.yml b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/defaults/main/config.yml
index f7e0d891dc..ac72a038bf 100644
--- a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/defaults/main/config.yml
+++ b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/defaults/main/config.yml
@@ -12,20 +12,16 @@ fine_tuning_ray_fine_tuning_job_name:
 fine_tuning_ray_fine_tuning_job_namespace:
 
 # the name of the PVC where the model and dataset are stored
-# Mandatory value
-fine_tuning_ray_fine_tuning_job_pvc_name:
+fine_tuning_ray_fine_tuning_job_pvc_name: null
 
 # the name of the model to use inside the /model directory of the PVC
-# Mandatory value
-fine_tuning_ray_fine_tuning_job_model_name:
+fine_tuning_ray_fine_tuning_job_model_name: null
 
-# directory where the fine-tuning scripts are stored
-# Mandatory value
-fine_tuning_ray_fine_tuning_job_ft_scripts_dir:
+# the name of the workload job to run (see the role's workload directory)
+fine_tuning_ray_fine_tuning_job_workload: ray-finetune-llm-deepspeed
 
 # the name of the dataset to use inside the /dataset directory of the PVC
-# Mandatory value
-fine_tuning_ray_fine_tuning_job_dataset_name:
+fine_tuning_ray_fine_tuning_job_dataset_name: null
 
 # number of replications of the dataset to use, to artificially extend or reduce the fine-tuning effort
 fine_tuning_ray_fine_tuning_job_dataset_replication: 1
@@ -39,11 +35,6 @@ fine_tuning_ray_fine_tuning_job_dataset_prefer_cache: true
 # if True, only prepare the dataset cache file and do not run the fine-tuning.
 fine_tuning_ray_fine_tuning_job_dataset_prepare_cache_only: false
 
-# the delimiter marking the beginning of the response in the dataset samples
-fine_tuning_ray_fine_tuning_job_dataset_response_template: '
-
-  ### Label:'
-
 # the image to use for the fine-tuning container
 fine_tuning_ray_fine_tuning_job_container_image: quay.io/rhoai/ray:2.35.0-py39-cu121-torch24-fa26
 
@@ -51,7 +42,7 @@ fine_tuning_ray_fine_tuning_job_container_image: quay.io/rhoai/ray:2.35.0-py39-c
 fine_tuning_ray_fine_tuning_job_ray_version: 2.35.0
 
 # the number of GPUs to request for the fine-tuning job
-fine_tuning_ray_fine_tuning_job_gpu: 1
+fine_tuning_ray_fine_tuning_job_gpu: 0
 
 # the number of RAM gigs to request for the fine-tuning job (in Gigs)
 fine_tuning_ray_fine_tuning_job_memory: 10
 
@@ -76,3 +67,6 @@ fine_tuning_ray_fine_tuning_job_hyper_parameters: {}
 
 # if true, sleeps forever instead of running the fine-tuning command.
fine_tuning_ray_fine_tuning_job_sleep_forever: false + +# if enabled, captures the artifacts that will help post-mortem analyses +fine_tuning_ray_fine_tuning_job_capture_artifacts: true diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/tasks/main.yml b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/tasks/main.yml index 9494388762..8f3c1aeb71 100644 --- a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/tasks/main.yml +++ b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/tasks/main.yml @@ -16,6 +16,10 @@ set_fact: job_name_safe: "{{ fine_tuning_ray_fine_tuning_job_name | replace('.', '-') | replace('_', '-') }}" +- name: Prepare the workload_dir variable + set_fact: + workload_dir: "{{ role_path }}/workloads/{{ fine_tuning_ray_fine_tuning_job_workload }}" + - name: Delete the fine-tuning job configmaps, if any command: oc delete configmap @@ -62,36 +66,36 @@ | tee -a "{{ artifact_extra_logs_dir }}/src/configmap_config.yaml" \ | oc apply -f- -- name: Prepare the entrypoint ConfigMap +- name: Prepare the workload entrypoint ConfigMap shell: | set -o pipefail; oc create cm {{ job_name_safe }}-entrypoint \ -n {{ fine_tuning_ray_fine_tuning_job_namespace }} \ - --from-file=$(find "{{ fine_tuning_job_entrypoint_dir }}" -maxdepth 1 -not -type d | tr '\n' ,)/dev/null \ + --from-file=$(find "{{ workload_dir }}/entrypoint" -maxdepth 1 -not -type d | tr '\n' ,)/dev/null \ --dry-run=client \ -oyaml \ | yq -Y '. | .metadata.labels = {"topsail.fine-tuning-jobname": "{{ job_name_safe }}"}' \ | tee -a "{{ artifact_extra_logs_dir }}/src/configmap_entrypoint.yaml" \ | oc apply -f- -- name: Prepare the fine-tuning scripts ConfigMap +- name: Prepare the workload app ConfigMap shell: | set -o pipefail; - oc create cm {{ job_name_safe }}-ft-scripts \ + oc create cm {{ job_name_safe }}-app \ -n {{ fine_tuning_ray_fine_tuning_job_namespace }} \ - --from-file="$(find "{{ fine_tuning_ray_fine_tuning_job_ft_scripts_dir }}" -not -type d -not -name '*.pyc' | tr '\n' ,)/dev/null" \ + --from-file="$(find "{{ workload_dir }}/app" -not -type d -not -name '*.pyc' | tr '\n' ,)/dev/null" \ --dry-run=client \ -oyaml \ | yq -Y '. 
| .metadata.labels = {"topsail.fine-tuning-jobname": "{{ job_name_safe }}"}' \ - | tee -a "{{ artifact_extra_logs_dir }}/src/configmap_ft_scripts.yaml" \ + | tee -a "{{ artifact_extra_logs_dir }}/src/configmap_app.yaml" \ | oc apply -f- - name: Load the content of the requirement file shell: set -o pipefail; - cat "{{ fine_tuning_ray_fine_tuning_job_ft_scripts_dir }}/requirements.txt" | sed 's/^/- /' + (cat "{{ workload_dir }}/app/requirements.txt" || true) | sed 's/^/- /' register: requirements_cmd - name: Prepare the cluster template file @@ -177,7 +181,8 @@ set -o pipefail; oc get pods -l 'ray.io/identifier={{ ray_cluster_name }}-head' -n {{ fine_tuning_ray_fine_tuning_job_namespace }} - --no-headers | awk '{print $3}' + --no-headers + | awk '{print $3}' register: wait_pod_fetch retries: 720 delay: 10 @@ -188,7 +193,10 @@ when: wait_pod_fetch.stdout in ["Error", "Init:Error", "CrashLoopBackOff", "ImagePullBackOff"] - name: Wait for the cluster to become Ready - shell: oc get rayjobs/{{ job_name_safe }} -ojsonpath={.status.jobDeploymentStatus} + command: + oc get rayjobs/{{ job_name_safe }} + -n {{ fine_tuning_ray_fine_tuning_job_namespace }} + -ojsonpath={.status.jobDeploymentStatus} register: ray_job_deployment_status retries: 12 delay: 10 @@ -204,7 +212,9 @@ - name: Wait for the job to complete command: - oc get rayjob/{{ job_name_safe }} -ojsonpath={.status.endTime} + oc get rayjob/{{ job_name_safe }} + -n {{ fine_tuning_ray_fine_tuning_job_namespace }} + -ojsonpath={.status.endTime} register: ray_job_end_time_cmd retries: 720 delay: 30 @@ -212,26 +222,64 @@ - name: Check if deployment succeeded command: - oc get rayjob/{{ job_name_safe }} -ojsonpath={.status.jobDeploymentStatus} + oc get rayjob/{{ job_name_safe }} + -ojsonpath={.status.jobDeploymentStatus} + -n {{ fine_tuning_ray_fine_tuning_job_namespace }} register: ray_job_deployment_status_cmd failed_when: ray_job_deployment_status_cmd.stdout not in ["Complete"] - name: Check if job succeeded command: - oc get rayjob/{{ job_name_safe }} -ojsonpath={.status.jobStatus} + oc get rayjob/{{ job_name_safe }} + -ojsonpath={.status.jobStatus} + -n {{ fine_tuning_ray_fine_tuning_job_namespace }} register: ray_job_status_cmd failed_when: ray_job_status_cmd.stdout not in ["SUCCEEDED"] - - name: Check if the script succeeded + always: + - name: Get the name of the job pods shell: set -o pipefail; - oc logs job/{{ job_name_safe }} + oc get pods -lbatch.kubernetes.io/job-name={{ job_name_safe }} + --sort-by=.metadata.creationTimestamp -n {{ fine_tuning_ray_fine_tuning_job_namespace }} - | grep "SCRIPT SUCCEEDED" - register: script_succeeded_cmd + > {{ artifact_extra_logs_dir }}/artifacts/job_pods.status; + + oc get pods -lbatch.kubernetes.io/job-name={{ job_name_safe }} + --sort-by=.metadata.creationTimestamp + --no-headers + -oname + -n {{ fine_tuning_ray_fine_tuning_job_namespace }} + failed_when: false + register: job_pod_name_cmd + + - name: Save the job pod logs + shell: + set -o pipefail; + + oc logs {{ item }} + -n {{ fine_tuning_ray_fine_tuning_job_namespace }} + > {{ artifact_extra_logs_dir }}/artifacts/rayjob_pod_{{ item|replace('pod/', '') }}.log + failed_when: false + loop: "{{ job_pod_name_cmd.stdout_lines }}" + + - name: Capture the state of the RayJobs + shell: + oc get rayjob/{{ job_name_safe }} + -oyaml + -n {{ fine_tuning_ray_fine_tuning_job_namespace }} + > {{ artifact_extra_logs_dir }}/artifacts/rayjob.yaml; + + oc get rayjob/{{ job_name_safe }} + -ojson + -n {{ fine_tuning_ray_fine_tuning_job_namespace }} + > {{ 
artifact_extra_logs_dir }}/artifacts/rayjob.json; + + oc get rayjob/{{ job_name_safe }} + -n {{ fine_tuning_ray_fine_tuning_job_namespace }} + > {{ artifact_extra_logs_dir }}/artifacts/rayjob.status; - always: - name: Capture the state of the fine-tuning Pod resource shell: set -o pipefail; @@ -259,48 +307,21 @@ -n {{ fine_tuning_ray_fine_tuning_job_namespace }} > {{ artifact_extra_logs_dir }}/artifacts/pod.desc - oc logs $( - oc get pods -l 'ray.io/identifier={{ ray_cluster_name }}-head' - -n {{ fine_tuning_ray_fine_tuning_job_namespace }} - -oname | head -1) - -n {{ fine_tuning_ray_fine_tuning_job_namespace }} - > {{ artifact_extra_logs_dir }}/artifacts/pod.log ignore_errors: true + when: fine_tuning_ray_fine_tuning_job_capture_artifacts | bool - - name: Capture the state of the RayCluster resource - shell: - set -o pipefail; - oc get RayCluster/{{ ray_cluster_name }} - -oyaml - -n {{ fine_tuning_ray_fine_tuning_job_namespace }} - > {{ artifact_extra_logs_dir }}/artifacts/raycluster.yaml; - - oc get RayCluster/{{ ray_cluster_name }} - -owide - -n {{ fine_tuning_ray_fine_tuning_job_namespace }} - > {{ artifact_extra_logs_dir }}/artifacts/raycluster.status; - - oc describe RayCluster/{{ ray_cluster_name }} - -n {{ fine_tuning_ray_fine_tuning_job_namespace }} - > {{ artifact_extra_logs_dir }}/artifacts/raycluster.desc - ignore_errors: true +- name: Ensure that the script succeeded + shell: + set -o pipefail; - - name: Capture the state of the RayJob resource - shell: - set -o pipefail; + cat "{{ artifact_extra_logs_dir }}/artifacts/"rayjob_pod_*.log | grep "SCRIPT SUCCEEDED" - oc get RayJob/{{ job_name_safe }} - -oyaml - -n {{ fine_tuning_ray_fine_tuning_job_namespace }} - > {{ artifact_extra_logs_dir }}/artifacts/rayjob.yaml; +- name: Save the logs of the successful Pod + shell: + set -o pipefail; + set -e; - oc get RayCluster/{{ job_name_safe }} - -owide - -n {{ fine_tuning_ray_fine_tuning_job_namespace }} - > {{ artifact_extra_logs_dir }}/artifacts/rayjob.status; + success_pod_name=$(cat {{ artifact_extra_logs_dir }}/artifacts/job_pods.status | grep Completed | cut -d" " -f1) - oc describe RayCluster/{{ job_name_safe }} - -n {{ fine_tuning_ray_fine_tuning_job_namespace }} - > {{ artifact_extra_logs_dir }}/artifacts/rayjob.desc - ignore_errors: true + cp "{{ artifact_extra_logs_dir }}/artifacts/rayjob_pod_${success_pod_name}.log" "{{ artifact_extra_logs_dir }}/artifacts/job_pod.log" diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/templates/base_config.yaml.j2 b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/templates/base_config.yaml.j2 index a4c6b0528c..3e6013b6e6 100644 --- a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/templates/base_config.yaml.j2 +++ b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/templates/base_config.yaml.j2 @@ -1,13 +1,25 @@ --- -# DATASET_SOURCE: {{ fine_tuning_ray_fine_tuning_job_dataset_name }} -# DATASET_TRANSFORM: {{ fine_tuning_ray_fine_tuning_job_dataset_transform }} -# DATASET_REPLICATION: {{ fine_tuning_ray_fine_tuning_job_dataset_replication }} -training_data_path: "/mnt/output/dataset.json" # aka DATASET_DEST +{% if fine_tuning_ray_fine_tuning_job_model_name %} model_name_or_path: "/mnt/storage/model/{{ fine_tuning_ray_fine_tuning_job_model_name }}" +model_name: "{{ fine_tuning_ray_fine_tuning_job_model_name }}" +{% endif %} -response_template: "{{ fine_tuning_ray_fine_tuning_job_dataset_response_template }}" +{% if fine_tuning_ray_fine_tuning_job_dataset_name %} +dataset_source: 
"/mnt/storage/dataset/{{ fine_tuning_ray_fine_tuning_job_dataset_name }}" +dataset_replication: {{ fine_tuning_ray_fine_tuning_job_dataset_replication }} -output_dir: "/mnt/output/fine-tuning" +{% if fine_tuning_ray_fine_tuning_job_dataset_transform %} +dataset_transform: "/mnt/entrypoint/{{ fine_tuning_ray_fine_tuning_job_dataset_transform }}" +{% endif %} +{% if fine_tuning_ray_fine_tuning_job_dataset_prefer_cache %} +dataset_prefer_cache: true +{% endif %} +{% if fine_tuning_ray_fine_tuning_job_dataset_prepare_cache_only %} +dataset_prepare_cache_only: true +{% endif %} +{% endif %} -max_seq_length: 4096 +{% if fine_tuning_ray_fine_tuning_job_sleep_forever %} +sleep_forever: true +{% endif %} diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/templates/ray_cluster.yaml.j2 b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/templates/ray_cluster.yaml.j2 index eb9970e896..7311043478 100644 --- a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/templates/ray_cluster.yaml.j2 +++ b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/templates/ray_cluster.yaml.j2 @@ -31,36 +31,10 @@ spec: image: &head_image "{{ fine_tuning_ray_fine_tuning_job_container_image }}" env: &head_env - - - name: FT_CONFIG_JSON_PATH + - name: CONFIG_JSON_PATH value: /mnt/config/config.json - - name: DATASET_SOURCE - value: "/mnt/storage/dataset/{{ fine_tuning_ray_fine_tuning_job_dataset_name }}" - name: DATASET_REPLICATION - value: "{{ fine_tuning_ray_fine_tuning_job_dataset_replication }}" -{% if fine_tuning_ray_fine_tuning_job_dataset_transform %} - - name: DATASET_TRANSFORM - value: "/mnt/entrypoint/{{ fine_tuning_ray_fine_tuning_job_dataset_transform }}" -{% endif %} -{% if fine_tuning_ray_fine_tuning_job_dataset_prefer_cache %} - - name: DATASET_PREFER_CACHE - value: "true" -{% endif %} -{% if fine_tuning_ray_fine_tuning_job_dataset_prepare_cache_only %} - - name: DATASET_PREPARE_CACHE_ONLY - value: "true" -{% endif %} - -{% if fine_tuning_ray_fine_tuning_job_gpu %} - - name: NUM_GPUS - value: "{{ fine_tuning_ray_fine_tuning_job_gpu }}" -{% endif %} - - name: MODEL_NAME - value: "{{ fine_tuning_ray_fine_tuning_job_model_name}}" -{% if fine_tuning_ray_fine_tuning_job_sleep_forever %} - - name: SLEEP_FOREVER - value: "true" -{% endif %} + value: resources: &head_resources requests: &head_request_block {% if fine_tuning_ray_fine_tuning_job_gpu %} @@ -79,11 +53,12 @@ spec: {% endif %} volumeMounts: &head_volume_mounts - +{% if fine_tuning_ray_fine_tuning_job_pvc_name %} - name: storage-volume mountPath: /mnt/storage - - name: ft-scripts-volume - mountPath: /mnt/ft-scripts +{% endif %} + - name: app-volume + mountPath: /mnt/app - name: entrypoint-volume mountPath: /mnt/entrypoint - name: config-volume @@ -92,18 +67,20 @@ spec: mountPath: /mnt/output volumes: &head_volumes +{% if fine_tuning_ray_fine_tuning_job_pvc_name %} - name: storage-volume persistentVolumeClaim: claimName: {{ fine_tuning_ray_fine_tuning_job_pvc_name }} +{% endif %} - name: config-volume configMap: name: {{ job_name_safe }}-config - name: entrypoint-volume configMap: name: {{ job_name_safe }}-entrypoint - - name: ft-scripts-volume + - name: app-volume configMap: - name: {{ job_name_safe }}-ft-scripts + name: {{ job_name_safe }}-app - name: output-volume emptyDir: {} diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/vars/main/resources.yml b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/vars/main/resources.yml index 54ea6d2a1e..1845a526e0 100644 --- 
a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/vars/main/resources.yml +++ b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/vars/main/resources.yml @@ -3,6 +3,5 @@ fine_tuning_cluster_template: templates/ray_cluster.yaml.j2 fine_tuning_job_template: templates/ray_job.yaml.j2 fine_tuning_job_config_template: templates/base_config.yaml.j2 -fine_tuning_job_entrypoint_dir: "{{ role_path }}/files/entrypoint" -fine_tuning_job_entrypoint_name: job_entrypoint.sh +fine_tuning_job_entrypoint_name: entrypoint.sh __safe: [fine_tuning_job_entrypoint_name] diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/.source b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/.source similarity index 100% rename from projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/.source rename to projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/.source diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/README.md b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/README.md similarity index 100% rename from projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/README.md rename to projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/README.md diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/deepspeed_configs/zero_3_llama_2_13b.json b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/deepspeed_configs/zero_3_llama_2_13b.json similarity index 100% rename from projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/deepspeed_configs/zero_3_llama_2_13b.json rename to projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/deepspeed_configs/zero_3_llama_2_13b.json diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/deepspeed_configs/zero_3_llama_2_70b.json b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/deepspeed_configs/zero_3_llama_2_70b.json similarity index 100% rename from projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/deepspeed_configs/zero_3_llama_2_70b.json rename to projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/deepspeed_configs/zero_3_llama_2_70b.json diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/deepspeed_configs/zero_3_llama_2_7b.json b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/deepspeed_configs/zero_3_llama_2_7b.json similarity index 100% rename from projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/deepspeed_configs/zero_3_llama_2_7b.json rename to projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/deepspeed_configs/zero_3_llama_2_7b.json diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/lora.json 
b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/lora.json similarity index 100% rename from projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/lora.json rename to projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/lora.json diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.ipynb b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/ray_finetune_llm_deepspeed.ipynb similarity index 100% rename from projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.ipynb rename to projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/ray_finetune_llm_deepspeed.ipynb diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.py b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/ray_finetune_llm_deepspeed.py similarity index 100% rename from projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/ray_finetune_llm_deepspeed.py rename to projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/ray_finetune_llm_deepspeed.py diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/ray_test.py b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/ray_test.py similarity index 100% rename from projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/ray_test.py rename to projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/ray_test.py diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/requirements.txt b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/requirements.txt similarity index 100% rename from projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/requirements.txt rename to projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/requirements.txt diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/training.py b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/training.py similarity index 100% rename from projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/training.py rename to projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/training.py diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/utils.py b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/utils.py similarity index 100% rename from projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/utils.py rename to projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/utils.py diff --git 
a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/zero_3_offload_optim_param.json b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/zero_3_offload_optim_param.json similarity index 100% rename from projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/ray-finetune-llm-deepspeed/zero_3_offload_optim_param.json rename to projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/app/zero_3_offload_optim_param.json diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/entrypoint/convert_alpaca.py b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/entrypoint/convert_alpaca.py similarity index 100% rename from projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/entrypoint/convert_alpaca.py rename to projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/entrypoint/convert_alpaca.py diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/entrypoint/convert_dataset_helper.py b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/entrypoint/convert_dataset_helper.py similarity index 100% rename from projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/entrypoint/convert_dataset_helper.py rename to projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/entrypoint/convert_dataset_helper.py diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/entrypoint/convert_replicate.py b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/entrypoint/convert_replicate.py similarity index 100% rename from projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/entrypoint/convert_replicate.py rename to projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/entrypoint/convert_replicate.py diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/entrypoint/job_entrypoint.sh b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/entrypoint/entrypoint.sh similarity index 93% rename from projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/entrypoint/job_entrypoint.sh rename to projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/entrypoint/entrypoint.sh index 55462c56fc..6f25ba9678 100644 --- a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/entrypoint/job_entrypoint.sh +++ b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/entrypoint/entrypoint.sh @@ -8,7 +8,7 @@ set -x # echo "Source dataset: $DATASET_SOURCE" -# MAX_SEQ_LENGTH=$(cat "$FT_CONFIG_JSON_PATH" | grep max_seq_length | awk '{print $2}' | cut -d"," -f1) +# MAX_SEQ_LENGTH=$(cat "$CONFIG_JSON_PATH" | grep max_seq_length | awk '{print $2}' | cut -d"," -f1) # DATASET_CACHE_FILE="/mnt/storage/dataset/$(basename "${DATASET_TRANSFORM:-}")_replicate_${DATASET_REPLICATION}_max${MAX_SEQ_LENGTH}tokens_$(basename "${DATASET_SOURCE}")" # prepare_dataset() { @@ -46,7 +46,7 @@ if [[ "${DATASET_PREPARE_CACHE_ONLY:-0}" == true ]]; then fi echo "# configuration:" -cat "$FT_CONFIG_JSON_PATH" +cat "$CONFIG_JSON_PATH" echo "# sha256sum of the $MODEL_NAME 
files" if [[ -f "/mnt/storage/model/${MODEL_NAME}.sha256sum" ]]; then @@ -62,7 +62,7 @@ else echo "No GPU seem to be available." fi -cd /mnt/ft-scripts +cd /mnt/app if [[ "${SLEEP_FOREVER:-}" ]]; then set +x @@ -70,8 +70,8 @@ if [[ "${SLEEP_FOREVER:-}" ]]; then echo "Fine-tuning command:" cat < "$CACHE_FILE" + SFT_TRAINER_CONFIG_JSON_PATH="$CONFIG_JSON_PATH" python /mnt/entrypoint/study_dataset.py > "$CACHE_FILE" fi cat "$CACHE_FILE" @@ -55,7 +55,7 @@ if [[ "${DATASET_PREPARE_CACHE_ONLY:-0}" == true ]]; then fi echo "# configuration:" -cat "$FT_CONFIG_JSON_PATH" +cat "$CONFIG_JSON_PATH" echo "# sha256sum of the $MODEL_NAME files" if [[ -f "/mnt/storage/model/${MODEL_NAME}.sha256sum" ]]; then diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/entrypoint/study_dataset.py b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/entrypoint/study_dataset.py similarity index 100% rename from projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/entrypoint/study_dataset.py rename to projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/entrypoint/study_dataset.py diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/entrypoint/synthetic_dataset.txt b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/entrypoint/synthetic_dataset.txt similarity index 100% rename from projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/files/entrypoint/synthetic_dataset.txt rename to projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-finetune-llm-deepspeed/entrypoint/synthetic_dataset.txt diff --git a/projects/fine_tuning/toolbox/fine_tuning_run_quality_evaluation/vars/main/resources.yml b/projects/fine_tuning/toolbox/fine_tuning_run_quality_evaluation/vars/main/resources.yml index a3511af5b3..485ddf585e 100644 --- a/projects/fine_tuning/toolbox/fine_tuning_run_quality_evaluation/vars/main/resources.yml +++ b/projects/fine_tuning/toolbox/fine_tuning_run_quality_evaluation/vars/main/resources.yml @@ -1,6 +1,5 @@ --- fine_tuning_job_template: templates/quality_evaluation_job.yaml.j2 - fine_tuning_job_entrypoint_dir: "{{ role_path }}/files/entrypoint" fine_tuning_job_entrypoint_name: entrypoint.sh fine_tuning_job_config_template: templates/base_config.yaml.j2 From fb088d73fc57679eb93dfb7d212fa55e48f250cf Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 5 Nov 2024 15:49:33 +0100 Subject: [PATCH 02/15] [fine_tuning] toolbox: fine_tuning_ray_fine_tuning_job: workloads/ray-benchmark: new workload --- .../app/test_network_overhead.py | 59 +++++++++++++++++++ .../ray-benchmark/entrypoint/entrypoint.sh | 25 ++++++++ 2 files changed, 84 insertions(+) create mode 100644 projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-benchmark/app/test_network_overhead.py create mode 100644 projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-benchmark/entrypoint/entrypoint.sh diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-benchmark/app/test_network_overhead.py b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-benchmark/app/test_network_overhead.py new file mode 100644 index 0000000000..3d037ef7fc --- /dev/null +++ b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-benchmark/app/test_network_overhead.py @@ -0,0 +1,59 @@ +"""Networking overhead (200 trials on 200 
nodes)
+
+In this run, we will start 100 trials and run them on 100 different nodes.
+This test will thus measure the overhead that comes with network communication
+and specifically log synchronization.
+
+Test owner: krfricke
+
+Acceptance criteria: Should run faster than 500 seconds.
+
+Theoretical minimum time: 300 seconds
+"""
+
+# https://github.com/ray-project/ray/blob/130cb3d4f28e7486fad46697fd893dd84b5a096b/release/tune_tests/scalability_tests/workloads/test_network_overhead.py
+
+import argparse
+import ray
+
+from ray.tune.utils.release_test_util import timed_tune_run
+
+
+def main(smoke_test: bool = False):
+    ray.init(address="auto")
+
+    num_samples = 100 if not smoke_test else 20
+    results_per_second = 0.01
+    trial_length_s = 300
+
+    max_runtime = 500
+
+    success = timed_tune_run(
+        name="result network overhead",
+        num_samples=num_samples,
+        results_per_second=results_per_second,
+        trial_length_s=trial_length_s,
+        max_runtime=max_runtime,
+        # One trial per worker node, none get scheduled on the head node.
+        # See the compute config.
+        resources_per_trial={"cpu": 2},
+    )
+
+    if not success:
+        raise RuntimeError(
+            f"Test did not finish within the max_runtime ({max_runtime} s). "
+            "See above for details."
+        )
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--smoke-test",
+        action="store_true",
+        default=False,
+        help="Finish quickly for training.",
+    )
+    args = parser.parse_args()
+
+    main(args.smoke_test)
diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-benchmark/entrypoint/entrypoint.sh b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-benchmark/entrypoint/entrypoint.sh
new file mode 100644
index 0000000000..5aef47159e
--- /dev/null
+++ b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-benchmark/entrypoint/entrypoint.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+
+set -o pipefail
+set -o errexit
+set -o nounset
+set -o errtrace
+set -x
+
+cd /mnt/app
+
+echo "# configuration:"
+cat "$CONFIG_JSON_PATH"
+
+if python3 ./test_network_overhead.py --smoke-test; then
+    echo "SCRIPT SUCCEEDED"
+else
+    echo "SCRIPT FAILED"
+    # don't exit with a return code != 0, otherwise the RayJob->Job retries 3 times ...
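+    #
+    # An illustrative sketch of how the marker is consumed afterwards: the
+    # role's "Ensure that the script succeeded" task greps the saved pod
+    # logs, roughly:
+    #
+    #   cat "$ARTIFACT_DIR"/artifacts/rayjob_pod_*.log | grep "SCRIPT SUCCEEDED"
+    #
+    # ($ARTIFACT_DIR is a placeholder for the test's artifacts directory.)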
+fi + +echo "*********" +echo "*********" +echo "*********" +echo "*********" +echo "********* Bye" From 7d808e2871821254f99ba589582f67e005eed07e Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Tue, 5 Nov 2024 15:58:27 +0100 Subject: [PATCH 03/15] [fine_tuning] testing: config: remove outdated IBM regression comparison presets --- projects/fine_tuning/testing/config.yaml | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/projects/fine_tuning/testing/config.yaml b/projects/fine_tuning/testing/config.yaml index f563c2f104..570054115f 100644 --- a/projects/fine_tuning/testing/config.yaml +++ b/projects/fine_tuning/testing/config.yaml @@ -64,12 +64,6 @@ ci_presets: tests.fine_tuning.multi_model.enabled: true tests.fine_tuning.test_settings.model_name: null - ibm_release_comparison: - tests.fine_tuning.test_settings.container_image: - - quay.io/modh/fms-hf-tuning:release-ec50c3d7dc09f50d9885f25efc3d2fc98a379709 # RHOAI-2.12 - - quay.io/modh/fms-hf-tuning:release-5e4e9441febdb5b2beb21eaecdda1103abd1db05 # RHOAI-2.11 - - quay.io/modh/fms-hf-tuning:release-7a8ff0f4114ba43398d34fd976f6b17bb1f665f3 # RHOAI-2.10 - hf_evaluation: fine_tuning.pvc.size: 2000Gi tests.fine_tuning.matbenchmarking.enabled: true @@ -78,20 +72,6 @@ ci_presets: tests.fine_tuning.test_settings.dataset_replication: 0.1 tests.fine_tuning.matbenchmarking.stop_on_error: false - ibm_regression: - extends: [hf_evaluation, dgx_small_footprint] - tests.fine_tuning.test_settings.model_name: - - ibm-granite/granite-3b-code-instruct - - ibm/merlinite-7b - - meta-llama/Llama-2-13b-chat-hf - - ibm-granite/granite-20b-code-instruct - - ibm-granite/granite-34b-code-instruct - tests.fine_tuning.test_settings.gpu: [2, 4, 8] - tests.fine_tuning.test_settings.container_image: - - quay.io/modh/fms-hf-tuning:main-5e965e4676bff71f0c4e8219a59aba37ce542083 # 4.42 - - quay.io/modh/fms-hf-tuning:main-abbb2e2dfac0f92a34714147f3bd4696758037c6 # 4.41 - - quay.io/modh/fms-hf-tuning:release-5e4e9441febdb5b2beb21eaecdda1103abd1db05 # RHOAI 2.11 image - dgx_small_footprint: tests.fine_tuning.test_settings.gpu: 8 # -- # From 55f2adb70fb830175b4c8c1bd14f6a7e6ecd55d2 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 6 Nov 2024 11:35:26 +0100 Subject: [PATCH 04/15] [core] library: ansible_toolbox: remove stray pdb call --- projects/core/library/ansible_toolbox.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/projects/core/library/ansible_toolbox.py b/projects/core/library/ansible_toolbox.py index ea9ae65591..a3ca2f4b2e 100644 --- a/projects/core/library/ansible_toolbox.py +++ b/projects/core/library/ansible_toolbox.py @@ -26,10 +26,7 @@ def __init__(self): if toolbox_file.name.startswith("."): continue project_toolbox_module = str(toolbox_file.relative_to(TOPSAIL_DIR).with_suffix("")).replace(os.path.sep, ".") - try: - mod = importlib.import_module(project_toolbox_module) - except: - import pdb;pdb.set_trace() + mod = importlib.import_module(project_toolbox_module) toolbox_name = toolbox_file.with_suffix("").name if toolbox_name.startswith("_"): continue From cadb64db01e542a9495265d61d49b145eafe686c Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 6 Nov 2024 11:35:49 +0100 Subject: [PATCH 05/15] [fine_tuning] testing: config: add ray_bench preset --- projects/fine_tuning/testing/config.yaml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/projects/fine_tuning/testing/config.yaml b/projects/fine_tuning/testing/config.yaml index 570054115f..086844dcb8 100644 --- 
a/projects/fine_tuning/testing/config.yaml +++ b/projects/fine_tuning/testing/config.yaml @@ -285,6 +285,13 @@ ci_presets: matbench.lts.opensearch.export.enabled: true matbench.lts.regression_analyses.enabled: true 'ci_presets.light["tests.fine_tuning.test_settings.gpu"]': 1 + + no_model: + fine_tuning.pvc.name: null + tests.fine_tuning.test_settings.dataset_name: null + tests.fine_tuning.test_settings.model_name: null + tests.fine_tuning.test_settings.dataset_replication: null + # --- quality_evaluation: @@ -304,6 +311,16 @@ ci_presets: ray: tests.fine_tuning.fms.enabled: false tests.fine_tuning.ray.enabled: true + tests.capture_prom: false # not needed for the time being + tests.visualize: false # not needed for the time being + tests.capture_state: false # not needed for the time being + tests.fine_tuning.test_settings.hyper_parameters: {} + + ray_bench: + extends: [ray, no_model] + tests.fine_tuning.ray.workload: ray-benchmark + tests.fine_tuning.test_settings.hyper_parameters: + num_samples: 10 # --- From a46bde1e522646ca4c7ac2d76649745b27e7cd18 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Wed, 6 Nov 2024 11:36:12 +0100 Subject: [PATCH 06/15] [fine_tuning] testing: prepare_finetuning: make it work without a dataset/model/pvc --- .../fine_tuning/testing/prepare_finetuning.py | 28 +++++++++++++++---- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/projects/fine_tuning/testing/prepare_finetuning.py b/projects/fine_tuning/testing/prepare_finetuning.py index da1e9d8aac..d1dd6f7e93 100644 --- a/projects/fine_tuning/testing/prepare_finetuning.py +++ b/projects/fine_tuning/testing/prepare_finetuning.py @@ -91,23 +91,41 @@ def set_namespace_annotations(): def download_data_sources(test_settings): namespace = config.project.get_config("tests.fine_tuning.namespace") - model_name = test_settings["model_name"] - dataset_name = test_settings["dataset_name"] + model_name = test_settings.get("model_name") + dataset_name = test_settings.get("dataset_name") pvc_name = config.project.get_config("fine_tuning.pvc.name") sources = config.project.get_config(f"fine_tuning.sources") dry_mode = config.project.get_config("tests.dry_mode") - sources_name = [dataset_name] - if model_name is None: + sources_name = [] + if dataset_name: + sources_name.append(dataset_name) + + if config.project.get_config("tests.fine_tuning.multi_model.enabled"): multi_models = config.project.get_config("tests.fine_tuning.multi_model.models") for model in multi_models: sources_name.append(model["name"]) + if model_name is None: + pass # nothing to do elif isinstance(model_name, str): sources_name.append(model_name) - else: + elif isinstance(model_name, list): sources_name += model_name + else: + msg = f"Received an unexpected value of 'model_name': {model_name} ({model_name.__class__.__name__})" + logging.error(msg) + raise ValueError(msg) + + if not sources_name: + logging.info("download_data_sources: Nothing to download.") + return # nothing to do + + if not pvc_name: + msg = f"Found {len(sources_name)} sources to download, but fine_tuning.pvc.name={pvc_name}" + logging.error(msg) + raise ValueError(msg) def do_download(extra, secret_key=None, image_key=None): name = extra["name"] From d88a442743fa6ae286104beed427d0b0b65209d7 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 7 Nov 2024 15:15:40 +0100 Subject: [PATCH 07/15] [fine_tuning] toolbox: fine_tuning_ray_fine_tuning_job: workloads/ray-benchmark/app/test_network_overhead: allow configuring the num_samples --- .../app/test_network_overhead.py | 22 
++++++++----------- .../ray-benchmark/entrypoint/entrypoint.sh | 4 ++-- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-benchmark/app/test_network_overhead.py b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-benchmark/app/test_network_overhead.py index 3d037ef7fc..7928c9ff38 100644 --- a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-benchmark/app/test_network_overhead.py +++ b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-benchmark/app/test_network_overhead.py @@ -13,16 +13,21 @@ # https://github.com/ray-project/ray/blob/130cb3d4f28e7486fad46697fd893dd84b5a096b/release/tune_tests/scalability_tests/workloads/test_network_overhead.py -import argparse +import os +import json + import ray from ray.tune.utils.release_test_util import timed_tune_run +with open(os.environ["CONFIG_JSON_PATH"]) as f: + CONFIG = json.load(f) -def main(smoke_test: bool = False): +def main(): ray.init(address="auto") - num_samples = 100 if not smoke_test else 20 + num_samples = CONFIG.get("num_samples", 20) + results_per_second = 0.01 trial_length_s = 300 @@ -47,13 +52,4 @@ def main(smoke_test: bool = False): if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument( - "--smoke-test", - action="store_true", - default=False, - help="Finish quickly for training.", - ) - args = parser.parse_args() - - main(args.smoke_test) + main() diff --git a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-benchmark/entrypoint/entrypoint.sh b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-benchmark/entrypoint/entrypoint.sh index 5aef47159e..6e58c7b55d 100644 --- a/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-benchmark/entrypoint/entrypoint.sh +++ b/projects/fine_tuning/toolbox/fine_tuning_ray_fine_tuning_job/workloads/ray-benchmark/entrypoint/entrypoint.sh @@ -11,13 +11,13 @@ cd /mnt/app echo "# configuration:" cat "$CONFIG_JSON_PATH" -if python3 ./test_network_overhead.py --smoke-test; then +if python3 ./test_network_overhead.py; then echo "SCRIPT SUCCEEDED" else echo "SCRIPT FAILED" # don't exit with a return code != 0, otherwise the RayJob->Job retries 3 times ... 
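     #
     # A sketch of the configuration consumed above, assuming the ray_bench
     # preset (CONFIG_JSON_PATH is set to /mnt/config/config.json in the
     # RayCluster template; the exact contents are illustrative):
     #
     #   $ cat "$CONFIG_JSON_PATH"
     #   {"num_samples": 10}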
fi - +set +x echo "*********" echo "*********" echo "*********" From 2b8c33e9524a078b17b282d72e824186041f2a56 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 7 Nov 2024 17:24:46 +0100 Subject: [PATCH 08/15] [fine_tuning] testing: config: re-enable visualization for Ray --- projects/fine_tuning/testing/config.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/projects/fine_tuning/testing/config.yaml b/projects/fine_tuning/testing/config.yaml index 086844dcb8..a884c685cd 100644 --- a/projects/fine_tuning/testing/config.yaml +++ b/projects/fine_tuning/testing/config.yaml @@ -312,12 +312,11 @@ ci_presets: tests.fine_tuning.fms.enabled: false tests.fine_tuning.ray.enabled: true tests.capture_prom: false # not needed for the time being - tests.visualize: false # not needed for the time being - tests.capture_state: false # not needed for the time being tests.fine_tuning.test_settings.hyper_parameters: {} ray_bench: extends: [ray, no_model] + matbench.config_file: ray_benchmark.yaml tests.fine_tuning.ray.workload: ray-benchmark tests.fine_tuning.test_settings.hyper_parameters: num_samples: 10 From a08a624c0a2ecd52b16bc9d4a3062f173e46bd95 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 7 Nov 2024 17:25:02 +0100 Subject: [PATCH 09/15] [fine_tuning] visualizations: fine_tuning: plotting/error_report: make compatible with Ray --- .../fine_tuning/plotting/error_report.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/projects/fine_tuning/visualizations/fine_tuning/plotting/error_report.py b/projects/fine_tuning/visualizations/fine_tuning/plotting/error_report.py index 62dbf9f5aa..137f9df46a 100644 --- a/projects/fine_tuning/visualizations/fine_tuning/plotting/error_report.py +++ b/projects/fine_tuning/visualizations/fine_tuning/plotting/error_report.py @@ -70,8 +70,8 @@ def _get_test_setup(entry): setup_info += [html.Li([f"Test UUID:", html.Code(entry.results.test_uuid, style={"white-space": "pre-wrap"})])] - setup_info += [html.Li([f"Job configuration:", - html.A(html.Code("config_final.json"), href=artifacts_basedir / entry.results.locations.tuning_config_file, target="_blank"), + setup_info += [html.Li([f"Workload configuration:", + html.A(html.Code("config_final.json"), href=artifacts_basedir / entry.results.locations.workload_config_file, target="_blank"), html.Code(yaml.dump(entry.results.job_config), style={"white-space": "pre-wrap"})])] setup_info += [html.Li([f"Job execution"])] @@ -82,9 +82,15 @@ def _get_test_setup(entry): if entry.results.finish_reason.message: exec_info += [html.Li([f"Exit message:", html.Code(entry.results.finish_reason.message, style={"white-space": "pre-wrap"})])] - metrics = yaml.safe_load(json.dumps(entry.results.sfttrainer_metrics, default=functools.partial(json_dumper, strict=False))) - if metrics.get("progress") or metrics.get("summary"): - exec_info += [html.Li([f"Fine-tuning metrics:", html.Code(yaml.dump(metrics), style={"white-space": "pre-wrap"})])] + if entry.results.locations.has_fms: + metrics = yaml.safe_load(json.dumps(entry.results.sfttrainer_metrics, default=functools.partial(json_dumper, strict=False))) + if metrics.get("progress") or metrics.get("summary"): + exec_info += [html.Li([f"Fine-tuning metrics:", html.Code(yaml.dump(metrics), style={"white-space": "pre-wrap"})])] + + elif entry.results.locations.has_ray: + metrics = yaml.safe_load(json.dumps(entry.results.ray_metrics, default=functools.partial(json_dumper, strict=False))) + if metrics.get("progress") or 
metrics.get("summary"): + exec_info += [html.Li([f"Fine-tuning metrics:", html.Code(yaml.dump(metrics), style={"white-space": "pre-wrap"})])] if entry.results.locations.job_logs: exec_info += [html.Li(html.A("Job logs", href=artifacts_basedir / entry.results.locations.job_logs, target="_blank"))] From 27c250d35e894e43062436f1d5b6fecb91ea57e5 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 7 Nov 2024 17:25:50 +0100 Subject: [PATCH 10/15] [fine_tuning] visualizations: fine_tuning: store: extend the parsers to work with Ray --- .../fine_tuning/store/lts_parser.py | 29 +++-- .../fine_tuning/store/parsers.py | 104 ++++++++++++++---- 2 files changed, 102 insertions(+), 31 deletions(-) diff --git a/projects/fine_tuning/visualizations/fine_tuning/store/lts_parser.py b/projects/fine_tuning/visualizations/fine_tuning/store/lts_parser.py index 89f91022df..a6a354e7d9 100644 --- a/projects/fine_tuning/visualizations/fine_tuning/store/lts_parser.py +++ b/projects/fine_tuning/visualizations/fine_tuning/store/lts_parser.py @@ -36,6 +36,9 @@ def generate_lts_metadata(results, import_settings): def generate_lts_results(results): results_lts = types.SimpleNamespace() + if not results.locations.has_fms: + return results_lts + if not results.sfttrainer_metrics.summary or not results.sfttrainer_metrics.summary.__dict__: return results_lts @@ -78,11 +81,12 @@ def generate_lts_settings(lts_metadata, results, import_settings): lts_settings.ocp_version = results.ocp_version lts_settings.rhoai_version = results.rhods_info.full_version + lts_settings.container_image = results.job_config["container_image"].split("/")[-1] lts_settings.instance_type = results.test_config.get("clusters.sutest.compute.machineset.type") lts_settings.model_name = results.job_config["model_name"] - lts_settings.tuning_method = results.tuning_config.get("peft_method", "none") + lts_settings.tuning_method = results.workload_config.get("peft_method", "none") if lts_settings.tuning_method in ("none" , None): lts_settings.tuning_method = "full" @@ -94,17 +98,18 @@ def generate_lts_settings(lts_metadata, results, import_settings): lts_settings.replicas = replicas lts_settings.accelerators_per_replica = accelerators_per_replica lts_settings.accelerator_count = replicas * accelerators_per_replica - lts_settings.per_device_train_batch_size = results.tuning_config["per_device_train_batch_size"] - lts_settings.batch_size = results.tuning_config["per_device_train_batch_size"] * lts_settings.accelerator_count - lts_settings.max_seq_length = results.tuning_config["max_seq_length"] - - lts_settings.lora_rank = results.tuning_config.get("r") - lts_settings.lora_alpha = results.tuning_config.get("lora_alpha") - lts_settings.lora_dropout = results.tuning_config.get("lora_dropout") - lts_settings.lora_modules = ", ".join(sorted(results.tuning_config.get("target_modules", []))) or None - - lts_settings.dataset_name = results.job_config["dataset_name"] - lts_settings.dataset_replication = results.job_config["dataset_replication"] + if results.locations.has_fms: + lts_settings.per_device_train_batch_size = results.workload_config["per_device_train_batch_size"] + lts_settings.batch_size = results.workload_config["per_device_train_batch_size"] * lts_settings.accelerator_count + lts_settings.max_seq_length = results.workload_config["max_seq_length"] + + lts_settings.lora_rank = results.workload_config.get("r") + lts_settings.lora_alpha = results.workload_config.get("lora_alpha") + lts_settings.lora_dropout = results.workload_config.get("lora_dropout") + 
lts_settings.lora_modules = ", ".join(sorted(results.workload_config.get("target_modules", []))) or None + + lts_settings.dataset_name = results.job_config["dataset_name"] + lts_settings.dataset_replication = results.job_config["dataset_replication"] lts_settings.ci_engine = results.from_env.test.ci_engine lts_settings.run_id = results.from_env.test.run_id diff --git a/projects/fine_tuning/visualizations/fine_tuning/store/parsers.py b/projects/fine_tuning/visualizations/fine_tuning/store/parsers.py index 62227a0b6d..100c93fbd8 100644 --- a/projects/fine_tuning/visualizations/fine_tuning/store/parsers.py +++ b/projects/fine_tuning/visualizations/fine_tuning/store/parsers.py @@ -24,6 +24,7 @@ artifact_dirnames = types.SimpleNamespace() artifact_dirnames.CLUSTER_CAPTURE_ENV_DIR = "*__cluster__capture_environment" artifact_dirnames.FINE_TUNING_RUN_FINE_TUNING_DIR = "*__fine_tuning__run_fine_tuning_job" +artifact_dirnames.FINE_TUNING_RAY_FINE_TUNING_DIR = "*__fine_tuning__ray_fine_tuning_job" artifact_dirnames.RHODS_CAPTURE_STATE = "*__rhods__capture_state" artifact_paths = types.SimpleNamespace() # will be dynamically populated @@ -33,10 +34,15 @@ f"{artifact_dirnames.CLUSTER_CAPTURE_ENV_DIR}/_ansible.log", f"{artifact_dirnames.CLUSTER_CAPTURE_ENV_DIR}/nodes.json", f"{artifact_dirnames.CLUSTER_CAPTURE_ENV_DIR}/ocp_version.yml", + f"{artifact_dirnames.FINE_TUNING_RUN_FINE_TUNING_DIR}/src/config_final.json", f"{artifact_dirnames.FINE_TUNING_RUN_FINE_TUNING_DIR}/artifacts/pod.log", f"{artifact_dirnames.FINE_TUNING_RUN_FINE_TUNING_DIR}/artifacts/pod.json", f"{artifact_dirnames.FINE_TUNING_RUN_FINE_TUNING_DIR}/_ansible.play.yaml", + + f"{artifact_dirnames.FINE_TUNING_RAY_FINE_TUNING_DIR}/src/config_final.json", + f"{artifact_dirnames.FINE_TUNING_RAY_FINE_TUNING_DIR}/artifacts/pod.log", + f"{artifact_dirnames.RHODS_CAPTURE_STATE}/rhods.createdAt", f"{artifact_dirnames.RHODS_CAPTURE_STATE}/rhods.version", ] @@ -62,12 +68,21 @@ def parse_once(results, dirname): results.test_start_end_time = _parse_start_end_time(dirname) - results.sfttrainer_metrics = _parse_sfttrainer_logs(dirname) - results.allocated_resources = _parse_allocated_resources(dirname) - results.finish_reason = _parse_finish_reason(dirname) results.locations = _prepare_file_locations(dirname) - results.job_config = _parse_job_config(dirname) - results.tuning_config = _parse_tuning_config(dirname, results.locations.tuning_config_file) + + results.job_config = _parse_job_config(dirname, results.locations) + + results.workload_config = _parse_workload_config(dirname, results.locations) + + if results.locations.has_fms: + results.sfttrainer_metrics = _parse_fms_logs(dirname) + results.allocated_resources = _parse_fms_allocated_resources(dirname) + results.finish_reason = _parse_fms_finish_reason(dirname) + + if results.locations.has_ray: + results.ray_metrics = _parse_ray_logs(dirname) + results.allocated_resources = _parse_ray_allocated_resources(dirname) + results.finish_reason = _parse_ray_finish_reason(dirname) @helpers_store_parsers.ignore_file_not_found @@ -120,7 +135,7 @@ def _parse_start_end_time(dirname): @helpers_store_parsers.ignore_file_not_found -def _parse_sfttrainer_logs(dirname): +def _parse_fms_logs(dirname): sfttrainer_metrics = types.SimpleNamespace() sfttrainer_metrics.summary = types.SimpleNamespace() sfttrainer_metrics.progress = [] @@ -177,8 +192,9 @@ def parse_dataset_stats(data): return sfttrainer_metrics + @helpers_store_parsers.ignore_file_not_found -def _parse_allocated_resources(dirname): +def 
_parse_fms_allocated_resources(dirname):
     allocated_resources = types.SimpleNamespace()
     with open(register_important_file(dirname, artifact_paths.FINE_TUNING_RUN_FINE_TUNING_DIR / "artifacts/pod.json")) as f:
         pod_def = json.load(f)
@@ -190,8 +206,14 @@
     return allocated_resources
 
+
+@helpers_store_parsers.ignore_file_not_found
+def _parse_ray_allocated_resources(dirname):
+    pass
+
+
 @helpers_store_parsers.ignore_file_not_found
-def _parse_finish_reason(dirname):
+def _parse_fms_finish_reason(dirname):
     finish_reason = types.SimpleNamespace()
     finish_reason.exit_code = None
     finish_reason.message = "Parsing did not complete"
@@ -216,41 +238,85 @@
     return finish_reason
 
+@helpers_store_parsers.ignore_file_not_found
+def _parse_ray_finish_reason(dirname):
+    finish_reason = types.SimpleNamespace()
+    finish_reason.exit_code = None
+    finish_reason.message = "_parse_ray_finish_reason: not implemented"
+
+    return finish_reason
+
+
 def _prepare_file_locations(dirname):
     locations = types.SimpleNamespace()
 
-    locations.job_logs = artifact_paths.FINE_TUNING_RUN_FINE_TUNING_DIR / "artifacts/pod.log"
+    locations.has_fms = artifact_paths.FINE_TUNING_RUN_FINE_TUNING_DIR is not None
+    locations.has_ray = artifact_paths.FINE_TUNING_RAY_FINE_TUNING_DIR is not None
+
+    if locations.has_fms:
+        locations.job_dir = artifact_paths.FINE_TUNING_RUN_FINE_TUNING_DIR
+        locations.job_logs = artifact_paths.FINE_TUNING_RUN_FINE_TUNING_DIR / "artifacts/pod.log"
+
+    elif locations.has_ray:
+        locations.job_dir = artifact_paths.FINE_TUNING_RAY_FINE_TUNING_DIR
+        locations.job_logs = artifact_paths.FINE_TUNING_RAY_FINE_TUNING_DIR / "artifacts/pod.log"
+    else:
+        logging.error("Couldn't find the FMS or Ray job directory ...")
+        locations.job_dir = None
+        locations.job_logs = None
+
     job_logs_file = register_important_file(dirname, locations.job_logs)
     if not job_logs_file.exists():
         locations.job_logs = None
         logging.info(f"Job log file {job_logs_file} does not exist ...")
 
-    locations.tuning_config_file = (job_logs_file.parent.parent / "src" / "config_final.json").relative_to(dirname)
+    locations.workload_config_file = locations.job_dir / "src" / "config_final.json"
 
     return locations
 
 
 @helpers_store_parsers.ignore_file_not_found
-def _parse_job_config(dirname):
+def _parse_job_config(dirname, locations):
     job_config = {}
 
-    PREFIX = "fine_tuning_run_fine_tuning_job_"
+    if locations.has_fms:
+        prefix = "fine_tuning_run_fine_tuning_job_"
+    elif locations.has_ray:
+        prefix = "fine_tuning_ray_fine_tuning_job_"
 
-    with open(register_important_file(dirname, artifact_paths.FINE_TUNING_RUN_FINE_TUNING_DIR / "_ansible.play.yaml")) as f:
+    with open(register_important_file(dirname, locations.job_dir / "_ansible.play.yaml")) as f:
         ansible_play = yaml.safe_load(f)
 
     for k, v in ansible_play[0]["vars"].items():
-        if not k.startswith(PREFIX): continue
+        if not k.startswith(prefix): continue
 
-        job_config[k.replace(PREFIX, "")] = v
+        job_config[k.replace(prefix, "")] = v
 
     return job_config
 
 
 @helpers_store_parsers.ignore_file_not_found
-def _parse_tuning_config(dirname, tuning_config_file_location):
-    with open(register_important_file(dirname, tuning_config_file_location)) as f:
-        tuning_config = json.load(f)
+def _parse_workload_config(dirname, locations):
+    with open(register_important_file(dirname, locations.workload_config_file)) as f:
+        workload_config = json.load(f)
+
+    return workload_config
+
+
+@helpers_store_parsers.ignore_file_not_found
+def _parse_ray_logs(dirname):
+    
ray_metrics = types.SimpleNamespace() + ray_metrics.summary = types.SimpleNamespace() + ray_metrics.summary.time = None + + ray_metrics.progress = [] + + with open(register_important_file(dirname, artifact_paths.FINE_TUNING_RAY_FINE_TUNING_DIR / "artifacts/job_pod.log")) as f: + for line in f.readlines(): + if not line.startswith("---"): + continue + ray_metrics.summary.time = float(line.strip().split("::: ")[-1].split()[0]) + break - return tuning_config + return ray_metrics From 2dcd572c48ea7ad87726b924c93591e47c74c2cf Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 7 Nov 2024 17:26:16 +0100 Subject: [PATCH 11/15] [matrix_benchmarking] visualizations: helpers: store/parsers: log an error message if RHOAI version cannot be parsed but don't crash --- .../visualizations/helpers/store/parsers.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/projects/matrix_benchmarking/visualizations/helpers/store/parsers.py b/projects/matrix_benchmarking/visualizations/helpers/store/parsers.py index ca166dd7d1..8b5f45366c 100644 --- a/projects/matrix_benchmarking/visualizations/helpers/store/parsers.py +++ b/projects/matrix_benchmarking/visualizations/helpers/store/parsers.py @@ -255,6 +255,13 @@ def extract_cluster_info(nodes_info): def parse_rhods_info(dirname, capture_state_dir, version_name=None): rhods_info = types.SimpleNamespace() + if capture_state_dir is None: + logging.error("parse_rhods_info: `capture_state_dir` not available, returning dummy values :/") + rhods_info.version = "not available" + rhods_info.createdAt_raw = "not available" + rhods_info.full_version = "0.0.0" + return rhods_info + with open(register_important_file(dirname, capture_state_dir / "rhods.version")) as f: rhods_info.version = f.read().strip() From 0b36dd02d83ac35204ec4ee1d59c6cf5ea8380a5 Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 7 Nov 2024 17:29:08 +0100 Subject: [PATCH 12/15] WIP: [fine_tuning] visualizations: fine_tuning: data/ray_benchmark: add a Ray Benchmark visualization --- .../visualizations/fine_tuning/data/ray_benchmark.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 projects/fine_tuning/visualizations/fine_tuning/data/ray_benchmark.yaml diff --git a/projects/fine_tuning/visualizations/fine_tuning/data/ray_benchmark.yaml b/projects/fine_tuning/visualizations/fine_tuning/data/ray_benchmark.yaml new file mode 100644 index 0000000000..f7ad08d7d3 --- /dev/null +++ b/projects/fine_tuning/visualizations/fine_tuning/data/ray_benchmark.yaml @@ -0,0 +1,8 @@ +visualize: +- id: scale_test + generate: + - "report: Error report" + #- "report: Ray Benchmark Progress" + #- "report: Ray Benchmark Summary" + - "report: LTS Documentation" + - "report: KPI Table Report" From 7e92c3b882337bc2067b50375a9ba7576c2a8ace Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 7 Nov 2024 17:32:29 +0100 Subject: [PATCH 13/15] [fine_tuning] testing: config: add a 'ray_bench_scale' preset --- projects/fine_tuning/testing/config.yaml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/projects/fine_tuning/testing/config.yaml b/projects/fine_tuning/testing/config.yaml index a884c685cd..960a31ac2e 100644 --- a/projects/fine_tuning/testing/config.yaml +++ b/projects/fine_tuning/testing/config.yaml @@ -313,14 +313,25 @@ ci_presets: tests.fine_tuning.ray.enabled: true tests.capture_prom: false # not needed for the time being tests.fine_tuning.test_settings.hyper_parameters: {} + matbench.lts.generate: false + tests.fine_tuning.test_settings.name: ray ray_bench: extends: [ray, no_model] 
matbench.config_file: ray_benchmark.yaml
     tests.fine_tuning.ray.workload: ray-benchmark
+
     tests.fine_tuning.test_settings.hyper_parameters:
       num_samples: 10
 
+  ray_bench_scale:
+    extends: [ray_bench]
+    tests.fine_tuning.matbenchmarking.enabled: true
+    tests.fine_tuning.matbenchmarking.stop_on_error: false
+    tests.fine_tuning.test_settings.worker_replicas: [2, 8, 16, 32]
+    tests.fine_tuning.test_settings.hyper_parameters.num_samples: [20, 50, 100, 150]
+    tests.fine_tuning.test_settings.gpu: 0
+
   # ---
 
   cluster_ibm_dgx:
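Note on the ray_bench_scale preset above: with tests.fine_tuning.matbenchmarking.enabled set, the list-valued settings are meant to be benchmarked element by element, one run per (worker_replicas, num_samples) combination. A minimal sketch of that expansion, assuming cartesian-product semantics; the expand_settings helper below is hypothetical, the real expansion lives in the matbenchmarking test driver:

    # Hypothetical sketch: expand list-valued settings into one dict per run.
    import itertools

    settings = {
        "worker_replicas": [2, 8, 16, 32],
        "hyper_parameters.num_samples": [20, 50, 100, 150],
    }

    def expand_settings(settings):
        keys = list(settings)
        for values in itertools.product(*(settings[k] for k in keys)):
            yield dict(zip(keys, values))

    for test_settings in expand_settings(settings):
        print(test_settings)
    # 4 x 4 = 16 runs, e.g. {'worker_replicas': 2, 'hyper_parameters.num_samples': 20}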
"+("Lower is better" if y_lower_better else "Higher is better") + + if cfg__filter_key == "gpu": + gpu_count = cfg__filter_value + title += f". {gpu_count} GPU{'s' if gpu_count > 1 else ''}." + + fig.update_yaxes(title=("❮ " if y_lower_better else "") + y_title + (" ❯" if not y_lower_better else "")) + fig.update_layout(title=title, title_x=0.5,) + fig.update_layout(legend_title_text="Configuration") + + fig.update_xaxes(title=x_name) + # ❯ or ❮ + + msg = [] + + values_df = df[y_key][df["is_computed"] != True] + + min_row_idx = values_df.idxmin() + max_row_idx = values_df.idxmax() + + if any(map(numpy.isnan, [min_row_idx, max_row_idx])): + return fig, ["Max or Min is NaN"] + + min_count = values_df[min_row_idx] + max_count = values_df[max_row_idx] + + if has_gpu: + min_name = f"{min_count} GPU" + ("s" if min_count > 1 else "") + max_name = f"{max_count} GPU" + ("s" if max_count > 1 else "") + else: + min_name = min_count + max_name = max_count + + if len(data) > 1: + if y_lower_better: + fastest = df[y_key][min_row_idx] + slowest = df[y_key][max_row_idx] + else: + fastest = df[y_key][max_row_idx] + slowest = df[y_key][min_row_idx] + + slower = (fastest-slowest)/fastest + faster = (fastest-slowest)/slowest + msg.append(f"Fastest: {fastest:.2f} {y_units} ({abs(faster)*100:.0f}% faster, best)") + msg.append(html.Br()) + msg.append(f"Slowest: {slowest:.2f} {y_units} ({abs(slower)*100:.0f}% slower)") + + return fig, msg + + +def generateRayProgressData(entries, x_key, variables, progress_key): + data = [] + + for entry in entries: + progress_entries = entry.results.sfttrainer_metrics.progress + entry_name = entry.get_name(variables) + + for progress in progress_entries: + datum = dict() + datum[x_key] = getattr(progress, x_key) + datum[progress_key] = getattr(progress, progress_key, None) + datum["name"] = entry_name + data.append(datum) + + return data + + +class RayBenchmarkProgress(): + def __init__(self): + self.name = "Ray Benchmark Progress" + self.id_name = self.name + + table_stats.TableStats._register_stat(self) + common.Matrix.settings["stats"].add(self.name) + + def do_hover(self, meta_value, variables, figure, data, click_info): + return "nothing" + + def do_plot(self, ordered_vars, settings, setting_lists, variables, cfg): + cfg__progress_key = cfg.get("progress_key", False) + + if not cfg__progress_key: + raise ValueError("'progress_key' is a mandatory parameter ...") + + from ..store import parsers + progress_key_properties = parsers.SFT_TRAINER_PROGRESS_KEYS[cfg__progress_key] + + entries = common.Matrix.all_records(settings, setting_lists) + + x_key = "epoch" + + data = generateSFTTrainerProgressData(entries, x_key, variables, cfg__progress_key) + df = pd.DataFrame(data) + + if df.empty: + return None, "Not data available ..." + + df = df.sort_values(by=[x_key], ascending=False) + + y_key = cfg__progress_key + y_lower_better = progress_key_properties.lower_better + + fig = px.line(df, hover_data=df.columns, x=x_key, y=y_key, color="name") + + for i in range(len(fig.data)): + fig.data[i].update(mode='lines+markers+text') + fig.update_yaxes(rangemode='tozero') + + fig.update_xaxes(title="epochs") + + y_title = f"Training {y_key}. " + title = f"Fine-tuning '{y_key}' progress over the training {x_key}s" + title += "
"+("Lower is better" if y_lower_better else "Higher is better") + y_title += ("Lower is better" if y_lower_better else "Higher is better") + fig.update_yaxes(title=("❮ " if y_lower_better else "") + y_title + (" ❯" if not y_lower_better else "")) + fig.update_layout(title=title, title_x=0.5) + fig.update_layout(legend_title_text="Configuration") + + return fig, "" From 9337da1e04c6281497e7091cde79dea4bea4769d Mon Sep 17 00:00:00 2001 From: Kevin Pouget Date: Thu, 7 Nov 2024 17:33:32 +0100 Subject: [PATCH 15/15] [fine_tuning] visualizations: fine_tuning: plotting/__init__: enable Ray plots --- .../visualizations/fine_tuning/plotting/__init__.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/projects/fine_tuning/visualizations/fine_tuning/plotting/__init__.py b/projects/fine_tuning/visualizations/fine_tuning/plotting/__init__.py index e66d959bf3..ed53bbaba8 100644 --- a/projects/fine_tuning/visualizations/fine_tuning/plotting/__init__.py +++ b/projects/fine_tuning/visualizations/fine_tuning/plotting/__init__.py @@ -1,5 +1,7 @@ from . import error_report from . import sfttrainer +#from . import ray_benchmark + import projects.matrix_benchmarking.visualizations.helpers.plotting.lts_documentation as lts_documentation import projects.matrix_benchmarking.visualizations.helpers.plotting.kpi_table as kpi_table @@ -9,3 +11,4 @@ def register(): lts_documentation.register() sfttrainer.register() kpi_table.register() + #ray_benchmark.register()