[fine_tuning] Work on Ray testing and visualization #584

Open · wants to merge 15 commits into base: main
22 changes: 11 additions & 11 deletions docs/toolbox.generated/Fine_Tuning.ray_fine_tuning_job.rst
@@ -38,9 +38,11 @@ Parameters
* The name of the model to use inside the /model directory of the PVC


``ft_scripts_dir``
``workload``

* Directory where the fine-tuning scripts are stored
* The name of the workload job to run (see the role's workload directory)

* default value: ``ray-finetune-llm-deepspeed``


``dataset_name``
@@ -72,13 +74,6 @@ Parameters
* If True, only prepare the dataset cache file and do not run the fine-tuning.


``dataset_response_template``

* The delimiter marking the beginning of the response in the dataset samples

* default value: ``\n### Label:``


``container_image``

* The image to use for the fine-tuning container
@@ -97,8 +92,6 @@ Parameters

* The number of GPUs to request for the fine-tuning job

* default value: ``1``


``memory``

@@ -145,3 +138,10 @@ Parameters

* If true, sleeps forever instead of running the fine-tuning command.


``capture_artifacts``

* If enabled, captures the artifacts that will help post-mortem analyses

* default value: ``True``

5 changes: 1 addition & 4 deletions projects/core/library/ansible_toolbox.py
@@ -26,10 +26,7 @@ def __init__(self):
if toolbox_file.name.startswith("."): continue

project_toolbox_module = str(toolbox_file.relative_to(TOPSAIL_DIR).with_suffix("")).replace(os.path.sep, ".")
try:
mod = importlib.import_module(project_toolbox_module)
except:
import pdb;pdb.set_trace()
mod = importlib.import_module(project_toolbox_module)
toolbox_name = toolbox_file.with_suffix("").name

if toolbox_name.startswith("_"): continue
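The change in `ansible_toolbox.py` drops the `pdb` fallback so that a broken toolbox module fails loudly with its traceback instead of hanging non-interactive CI runs in a debugger. A minimal standalone sketch of the same dynamic-import pattern (the module names below are stand-ins, not TOPSAIL's real modules):

```python
import importlib


def import_toolbox_modules(module_names):
    """Import each named module, letting ImportError propagate.

    No try/except wrapper: a broken module should fail with its full
    traceback, which is far more useful in CI than a pdb prompt.
    """
    mods = []
    for name in module_names:
        mods.append(importlib.import_module(name))
    return mods


print([m.__name__ for m in import_toolbox_modules(["json", "math"])])
# → ['json', 'math']
```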
47 changes: 27 additions & 20 deletions projects/fine_tuning/testing/config.yaml
@@ -64,12 +64,6 @@ ci_presets:
tests.fine_tuning.multi_model.enabled: true
tests.fine_tuning.test_settings.model_name: null

ibm_release_comparison:
tests.fine_tuning.test_settings.container_image:
- quay.io/modh/fms-hf-tuning:release-ec50c3d7dc09f50d9885f25efc3d2fc98a379709 # RHOAI-2.12
- quay.io/modh/fms-hf-tuning:release-5e4e9441febdb5b2beb21eaecdda1103abd1db05 # RHOAI-2.11
- quay.io/modh/fms-hf-tuning:release-7a8ff0f4114ba43398d34fd976f6b17bb1f665f3 # RHOAI-2.10

hf_evaluation:
fine_tuning.pvc.size: 2000Gi
tests.fine_tuning.matbenchmarking.enabled: true
@@ -78,20 +72,6 @@ ci_presets:
tests.fine_tuning.test_settings.dataset_replication: 0.1
tests.fine_tuning.matbenchmarking.stop_on_error: false

ibm_regression:
extends: [hf_evaluation, dgx_small_footprint]
tests.fine_tuning.test_settings.model_name:
- ibm-granite/granite-3b-code-instruct
- ibm/merlinite-7b
- meta-llama/Llama-2-13b-chat-hf
- ibm-granite/granite-20b-code-instruct
- ibm-granite/granite-34b-code-instruct
tests.fine_tuning.test_settings.gpu: [2, 4, 8]
tests.fine_tuning.test_settings.container_image:
- quay.io/modh/fms-hf-tuning:main-5e965e4676bff71f0c4e8219a59aba37ce542083 # 4.42
- quay.io/modh/fms-hf-tuning:main-abbb2e2dfac0f92a34714147f3bd4696758037c6 # 4.41
- quay.io/modh/fms-hf-tuning:release-5e4e9441febdb5b2beb21eaecdda1103abd1db05 # RHOAI 2.11 image

dgx_small_footprint:
tests.fine_tuning.test_settings.gpu: 8
# -- #
@@ -305,6 +285,13 @@ ci_presets:
matbench.lts.opensearch.export.enabled: true
matbench.lts.regression_analyses.enabled: true
'ci_presets.light["tests.fine_tuning.test_settings.gpu"]': 1

no_model:
fine_tuning.pvc.name: null
tests.fine_tuning.test_settings.dataset_name: null
tests.fine_tuning.test_settings.model_name: null
tests.fine_tuning.test_settings.dataset_replication: null

# ---

quality_evaluation:
@@ -324,6 +311,26 @@ ci_presets:
ray:
tests.fine_tuning.fms.enabled: false
tests.fine_tuning.ray.enabled: true
tests.capture_prom: false # not needed for the time being
tests.fine_tuning.test_settings.hyper_parameters: {}
matbench.lts.generate: false
tests.fine_tuning.test_settings.name: ray

ray_bench:
extends: [ray, no_model]
matbench.config_file: ray_benchmark.yaml
tests.fine_tuning.ray.workload: ray-benchmark

tests.fine_tuning.test_settings.hyper_parameters:
num_samples: 10

ray_bench_scale:
extends: [ray_bench]
tests.fine_tuning.matbenchmarking.enabled: true
tests.fine_tuning.matbenchmarking.stop_on_error: false
tests.fine_tuning.test_settings.worker_replicas: [2, 8, 16, 32]
tests.fine_tuning.test_settings.hyper_parameters.num_samples: [20, 50, 100, 150]
tests.fine_tuning.test_settings.gpu: 0

# ---

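The new `ray_bench` and `ray_bench_scale` presets rely on `extends` chaining. A minimal sketch of how such chaining could resolve, assuming simple last-writer-wins key merging (an approximation for illustration, not TOPSAIL's actual resolver):

```python
def resolve_preset(presets, name):
    """Resolve a preset by merging its parents (listed in 'extends') first,
    then overlaying the preset's own keys, so the child wins on conflicts."""
    preset = presets[name]
    merged = {}
    for parent in preset.get("extends", []):
        merged.update(resolve_preset(presets, parent))
    merged.update({k: v for k, v in preset.items() if k != "extends"})
    return merged


# Trimmed-down stand-ins for the presets defined in config.yaml above
presets = {
    "ray": {"tests.fine_tuning.ray.enabled": True},
    "ray_bench": {"extends": ["ray"],
                  "tests.fine_tuning.ray.workload": "ray-benchmark"},
    "ray_bench_scale": {"extends": ["ray_bench"],
                        "tests.fine_tuning.test_settings.gpu": 0},
}
```

Resolving `ray_bench_scale` then yields the `ray` and `ray_bench` keys plus its own `gpu: 0` override.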
28 changes: 23 additions & 5 deletions projects/fine_tuning/testing/prepare_finetuning.py
@@ -91,23 +91,41 @@ def set_namespace_annotations():

def download_data_sources(test_settings):
namespace = config.project.get_config("tests.fine_tuning.namespace")
model_name = test_settings["model_name"]
dataset_name = test_settings["dataset_name"]
model_name = test_settings.get("model_name")
dataset_name = test_settings.get("dataset_name")

pvc_name = config.project.get_config("fine_tuning.pvc.name")
sources = config.project.get_config(f"fine_tuning.sources")

dry_mode = config.project.get_config("tests.dry_mode")

sources_name = [dataset_name]
if model_name is None:
sources_name = []
if dataset_name:
sources_name.append(dataset_name)

if config.project.get_config("tests.fine_tuning.multi_model.enabled"):
multi_models = config.project.get_config("tests.fine_tuning.multi_model.models")
for model in multi_models:
sources_name.append(model["name"])
if model_name is None:
pass # nothing to do
elif isinstance(model_name, str):
sources_name.append(model_name)
else:
elif isinstance(model_name, list):
sources_name += model_name
else:
msg = f"Received an unexpected value of 'model_name': {model_name} ({model_name.__class__.__name__})"
logging.error(msg)
raise ValueError(msg)

if not sources_name:
logging.info("download_data_sources: Nothing to download.")
return # nothing to do

if not pvc_name:
msg = f"Found {len(sources_name)} sources to download, but fine_tuning.pvc.name={pvc_name}"
logging.error(msg)
raise ValueError(msg)

def do_download(extra, secret_key=None, image_key=None):
name = extra["name"]
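The reworked branching in `download_data_sources` can be sketched as a standalone function (`collect_source_names` is a hypothetical name; the multi-model and PVC handling are omitted for brevity):

```python
def collect_source_names(model_name, dataset_name):
    """Collect the names of the sources to download.

    Either argument may be None (nothing to fetch); model_name may also
    be a single name or a list of names.
    """
    sources = []
    if dataset_name:
        sources.append(dataset_name)

    if model_name is None:
        pass  # nothing to do
    elif isinstance(model_name, str):
        sources.append(model_name)
    elif isinstance(model_name, list):
        sources += model_name
    else:
        raise ValueError(f"Received an unexpected value of 'model_name': "
                         f"{model_name} ({model_name.__class__.__name__})")
    return sources
```

With both arguments `None`, the list comes back empty, which is what lets the caller short-circuit with "Nothing to download" instead of requiring a PVC.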
19 changes: 12 additions & 7 deletions projects/fine_tuning/toolbox/fine_tuning.py
@@ -131,20 +131,19 @@ def ray_fine_tuning_job(
self,
name,
namespace,
pvc_name,
pvc_name=None,

model_name,
ft_scripts_dir,
model_name=None,
workload="ray-finetune-llm-deepspeed",

dataset_name,
dataset_name=None,
dataset_replication=1,
dataset_transform=None,
dataset_prefer_cache=True,
dataset_prepare_cache_only=False,
dataset_response_template="\n### Label:",
container_image="quay.io/rhoai/ray:2.35.0-py39-cu121-torch24-fa26",
ray_version="2.35.0",
gpu=1,
gpu=0,
memory=10,
cpu=1,
request_equals_limits=False,
@@ -157,6 +156,7 @@ def ray_fine_tuning_job(
hyper_parameters={},

sleep_forever=False,
capture_artifacts=True,
):
"""
Run a simple Ray fine-tuning Job.
@@ -175,7 +175,6 @@ def ray_fine_tuning_job(
dataset_transform: name of the transformation to apply to the dataset
dataset_prefer_cache: if True, and the dataset has to be transformed/duplicated, save and/or load it from the PVC
dataset_prepare_cache_only: if True, only prepare the dataset cache file and do not run the fine-tuning.
dataset_response_template: the delimiter marking the beginning of the response in the dataset samples
container_image: the image to use for the fine-tuning container
gpu: the number of GPUs to request for the fine-tuning job
memory: the number of RAM gigs to request for the fine-tuning job (in Gigs)
@@ -191,6 +190,12 @@ def ray_fine_tuning_job(

sleep_forever: if true, sleeps forever instead of running the fine-tuning command.
ray_version: the version identifier passed to the RayCluster object
capture_artifacts: if enabled, captures the artifacts that will help post-mortem analyses

workload: the name of the workload job to run (see the role's workload directory)
"""

if dataset_name is None:
dataset_replication = None

return RunAnsibleRole(locals())
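The guard added just before `RunAnsibleRole` can be exercised in isolation; this hypothetical helper mirrors the two-line check (`normalize_dataset_args` is an illustration, not part of the role):

```python
def normalize_dataset_args(dataset_name, dataset_replication=1):
    """Replication is meaningless without a dataset, so null it out."""
    if dataset_name is None:
        dataset_replication = None
    return dataset_name, dataset_replication
```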
@@ -12,20 +12,16 @@ fine_tuning_ray_fine_tuning_job_name:
fine_tuning_ray_fine_tuning_job_namespace:

# the name of the PVC where the model and dataset are stored
# Mandatory value
fine_tuning_ray_fine_tuning_job_pvc_name:
fine_tuning_ray_fine_tuning_job_pvc_name: null

# the name of the model to use inside the /dataset directory of the PVC
# Mandatory value
fine_tuning_ray_fine_tuning_job_model_name:
fine_tuning_ray_fine_tuning_job_model_name: null

# directory where the fine-tuning scripts are stored
# Mandatory value
fine_tuning_ray_fine_tuning_job_ft_scripts_dir:
# the name of the workload job to run (see the role's workload directory)
fine_tuning_ray_fine_tuning_job_workload: ray-finetune-llm-deepspeed

# the name of the dataset to use inside the /dataset directory of the PVC
# Mandatory value
fine_tuning_ray_fine_tuning_job_dataset_name:
fine_tuning_ray_fine_tuning_job_dataset_name: null

# number of replications of the dataset to use, to artificially extend or reduce the fine-tuning effort
fine_tuning_ray_fine_tuning_job_dataset_replication: 1
@@ -39,19 +35,14 @@ fine_tuning_ray_fine_tuning_job_dataset_prefer_cache: true
# if True, only prepare the dataset cache file and do not run the fine-tuning.
fine_tuning_ray_fine_tuning_job_dataset_prepare_cache_only: false

# the delimiter marking the beginning of the response in the dataset samples
fine_tuning_ray_fine_tuning_job_dataset_response_template: '

### Label:'

# the image to use for the fine-tuning container
fine_tuning_ray_fine_tuning_job_container_image: quay.io/rhoai/ray:2.35.0-py39-cu121-torch24-fa26

# the version identifier passed to the RayCluster object
fine_tuning_ray_fine_tuning_job_ray_version: 2.35.0

# the number of GPUs to request for the fine-tuning job
fine_tuning_ray_fine_tuning_job_gpu: 1
fine_tuning_ray_fine_tuning_job_gpu: 0

# the number of RAM gigs to request for the fine-tuning job (in Gigs)
fine_tuning_ray_fine_tuning_job_memory: 10
@@ -76,3 +67,6 @@ fine_tuning_ray_fine_tuning_job_hyper_parameters: {}

# if true, sleeps forever instead of running the fine-tuning command.
fine_tuning_ray_fine_tuning_job_sleep_forever: false

# if enabled, captures the artifacts that will help post-mortem analyses
fine_tuning_ray_fine_tuning_job_capture_artifacts: true