[fine-tuning] Integrate Ray benchmarking as an alternative fine-tuning job (#580)
kpouget authored Nov 7, 2024
2 parents 5d13472 + ca3ecbf commit a268e0b
Showing 36 changed files with 384 additions and 345 deletions.
20 changes: 11 additions & 9 deletions docs/toolbox.generated/Fine_Tuning.ray_fine_tuning_job.rst
@@ -38,9 +38,11 @@ Parameters
* The name of the model to use inside the /dataset directory of the PVC


``ft_scripts_dir``
``workload``

* Directory where the fine-tuning scripts are stored
* The name of the workload job to run (see the role's workload directory)

* default value: ``ray-finetune-llm-deepspeed``


``dataset_name``
@@ -72,13 +74,6 @@ Parameters
* If True, only prepare the dataset cache file and do not run the fine-tuning.


``dataset_response_template``

* The delimiter marking the beginning of the response in the dataset samples

* default value: ``\n### Label:``


``container_image``

* The image to use for the fine-tuning container
@@ -145,3 +140,10 @@ Parameters

* If true, sleeps forever instead of running the fine-tuning command.


``capture_artifacts``

* If enabled, captures the artifacts that help with post-mortem analyses

* default value: ``True``
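
A minimal invocation sketch (Python), mirroring how ``test_finetuning.py`` drives this role elsewhere in this commit. Every parameter value below is illustrative, and ``run.run_toolbox`` is assumed to be TOPSAIL's usual Python entry point::

    from projects.core.library import run  # TOPSAIL toolbox helper

    # Launch the Ray benchmark workload; all values are illustrative.
    run.run_toolbox(
        "fine_tuning", "ray_fine_tuning_job",
        name="ray-bench",
        namespace="fine-tuning-testing",
        container_image="quay.io/rhoai/ray:2.35.0-py311-cu121-torch24-fa26",
        workload="ray-benchmark",  # default is ray-finetune-llm-deepspeed
        capture_artifacts=True,
    )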

5 changes: 1 addition & 4 deletions projects/core/library/ansible_toolbox.py
@@ -26,10 +26,7 @@ def __init__(self):
if toolbox_file.name.startswith("."): continue

project_toolbox_module = str(toolbox_file.relative_to(TOPSAIL_DIR).with_suffix("")).replace(os.path.sep, ".")
try:
mod = importlib.import_module(project_toolbox_module)
except:
import pdb;pdb.set_trace()
mod = importlib.import_module(project_toolbox_module)
toolbox_name = toolbox_file.with_suffix("").name

if toolbox_name.startswith("_"): continue
17 changes: 15 additions & 2 deletions projects/fine_tuning/testing/command_args.yml.j2
@@ -56,7 +56,7 @@ fine_tuning run_fine_tuning_job:
name: {{ tests.fine_tuning.test_settings.job_name }}
namespace: "{{ tests.fine_tuning.namespace }}"
pvc_name: "{{ fine_tuning.pvc.name }}"
container_image: "{{ fine_tuning.image }}"
container_image: "{{ tests.fine_tuning.fms.image }}"

model_name: {{ tests.fine_tuning.test_settings.model_name }}
dataset_name: {{ tests.fine_tuning.test_settings.dataset_name }}
@@ -66,9 +66,22 @@ fine_tuning run_quality_evaluation:
name: {{ tests.fine_tuning.test_settings.job_name }}
namespace: "{{ tests.fine_tuning.namespace }}"
pvc_name: "{{ fine_tuning.pvc.name }}"
container_image: "{{ fine_tuning.image }}"
container_image: "{{ tests.fine_tuning.fms.image }}"
model_name: "{{ tests.fine_tuning.test_settings.model_name }}"

fine_tuning ray_fine_tuning_job:
name: {{ tests.fine_tuning.test_settings.job_name }}
namespace: "{{ tests.fine_tuning.namespace }}"
{% if fine_tuning.pvc.name %}
pvc_name: "{{ fine_tuning.pvc.name }}"
{% endif %}
container_image: "{{ tests.fine_tuning.ray.image }}"

model_name: {{ tests.fine_tuning.test_settings.model_name }}
dataset_name: {{ tests.fine_tuning.test_settings.dataset_name }}
dataset_replication: {{ tests.fine_tuning.test_settings.dataset_replication }}

workload: {{ tests.fine_tuning.ray.workload }}

storage download_to_pvc:
name: SET_AT_RUNTIME
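The new fine_tuning ray_fine_tuning_job entry above resolves, with the defaults this commit adds to config.yaml, to arguments roughly equivalent to the following Python mapping. The job name, namespace, and the null values are illustrative (they match the no_model preset); pvc_name is omitted whenever fine_tuning.pvc.name is null:

    ray_fine_tuning_job_args = dict(
        name="ray-bench",                 # tests.fine_tuning.test_settings.job_name (illustrative)
        namespace="fine-tuning-testing",  # tests.fine_tuning.namespace (illustrative)
        container_image="quay.io/rhoai/ray:2.35.0-py311-cu121-torch24-fa26",
        model_name=None,                  # nulled by the no_model preset
        dataset_name=None,
        dataset_replication=None,
        workload="ray-benchmark",         # tests.fine_tuning.ray.workload
    )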
58 changes: 36 additions & 22 deletions projects/fine_tuning/testing/config.yaml
@@ -64,12 +64,6 @@ ci_presets:
tests.fine_tuning.multi_model.enabled: true
tests.fine_tuning.test_settings.model_name: null

ibm_release_comparison:
tests.fine_tuning.test_settings.container_image:
- quay.io/modh/fms-hf-tuning:release-ec50c3d7dc09f50d9885f25efc3d2fc98a379709 # RHOAI-2.12
- quay.io/modh/fms-hf-tuning:release-5e4e9441febdb5b2beb21eaecdda1103abd1db05 # RHOAI-2.11
- quay.io/modh/fms-hf-tuning:release-7a8ff0f4114ba43398d34fd976f6b17bb1f665f3 # RHOAI-2.10

hf_evaluation:
fine_tuning.pvc.size: 2000Gi
tests.fine_tuning.matbenchmarking.enabled: true
@@ -78,20 +72,6 @@ ci_presets:
tests.fine_tuning.test_settings.dataset_replication: 0.1
tests.fine_tuning.matbenchmarking.stop_on_error: false

ibm_regression:
extends: [hf_evaluation, dgx_small_footprint]
tests.fine_tuning.test_settings.model_name:
- ibm-granite/granite-3b-code-instruct
- ibm/merlinite-7b
- meta-llama/Llama-2-13b-chat-hf
- ibm-granite/granite-20b-code-instruct
- ibm-granite/granite-34b-code-instruct
tests.fine_tuning.test_settings.gpu: [2, 4, 8]
tests.fine_tuning.test_settings.container_image:
- quay.io/modh/fms-hf-tuning:main-5e965e4676bff71f0c4e8219a59aba37ce542083 # 4.42
- quay.io/modh/fms-hf-tuning:main-abbb2e2dfac0f92a34714147f3bd4696758037c6 # 4.41
- quay.io/modh/fms-hf-tuning:release-5e4e9441febdb5b2beb21eaecdda1103abd1db05 # RHOAI 2.11 image

dgx_small_footprint:
tests.fine_tuning.test_settings.gpu: 8
# -- #
@@ -305,15 +285,43 @@ ci_presets:
matbench.lts.opensearch.export.enabled: true
matbench.lts.regression_analyses.enabled: true
'ci_presets.light["tests.fine_tuning.test_settings.gpu"]': 1

no_model:
fine_tuning.pvc.name: null
tests.fine_tuning.test_settings.dataset_name: null
tests.fine_tuning.test_settings.model_name: null
tests.fine_tuning.test_settings.dataset_replication: null

# ---

quality_evaluation:
tests.fine_tuning.quality_evaluation.enabled: true
tests.fine_tuning.fms.enabled: false
tests.capture_prom: false
matbench.workload: projects.fine_tuning.visualizations.quality_evaluation
matbench.lts.generate: false
tests.capture_state: false
tests.fine_tuning.test_settings.name: quality-evaluation

#

fms:
tests.fine_tuning.fms.enabled: false

ray:
tests.fine_tuning.fms.enabled: false
tests.fine_tuning.ray.enabled: true
tests.capture_prom: false # not needed for the time being
tests.visualize: false # not needed for the time being
tests.capture_state: false # not needed for the time being
tests.fine_tuning.test_settings.hyper_parameters: {}

ray_bench:
extends: [ray, no_model]
tests.fine_tuning.ray.workload: ray-benchmark
tests.fine_tuning.test_settings.hyper_parameters:
num_samples: 10

# ---

cluster_ibm_dgx:
@@ -423,7 +431,6 @@ gpu:
replicas: 1

fine_tuning:
image: quay.io/modh/fms-hf-tuning:v2.0.1
pvc:
name: fine-tuning-storage
access_mode: ReadWriteOnce
@@ -435,7 +442,7 @@ fine_tuning:
registry_type: model
source_dir: "dmf://"
secret_key: "secrets.dmf_token"
download_pod_image_key: "fine_tuning.image"
download_pod_image_key: "tests.fine_tuning.fms.image"
hf:
type: model-registry
registry_type: model
@@ -518,9 +525,16 @@ tests:
count: 20
kueue_name: local-queue
timespan: 0
fms:
enabled: true
image: quay.io/modh/fms-hf-tuning:v2.0.1
quality_evaluation:
enabled: false
image: registry.redhat.io/ubi9
ray:
enabled: false
workload: ray-benchmark
image: quay.io/rhoai/ray:2.35.0-py311-cu121-torch24-fa26
matbench:
preset: null
workload: projects.fine_tuning.visualizations.fine_tuning
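The ray and ray_bench presets added above compose through the extends keyword: applying ray_bench first applies ray and no_model, then overlays its own keys, so the preset's own values win on conflict. A minimal Python sketch of that assumed resolution order (an illustration of the semantics, not TOPSAIL's actual implementation):

    # Hypothetical sketch of ci-preset resolution; not TOPSAIL's real code.
    def apply_preset(name, presets, config):
        preset = dict(presets[name])
        for parent in preset.pop("extends", []):
            apply_preset(parent, presets, config)  # parents first, in list order
        for dotted_key, value in preset.items():
            config[dotted_key] = value             # own keys override the parents'

    presets = {
        "no_model": {"tests.fine_tuning.test_settings.model_name": None},
        "ray": {"tests.fine_tuning.ray.enabled": True},
        "ray_bench": {"extends": ["ray", "no_model"],
                      "tests.fine_tuning.ray.workload": "ray-benchmark"},
    }
    config = {}
    apply_preset("ray_bench", presets, config)
    # -> ray enabled, model_name nulled, workload set to "ray-benchmark"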
37 changes: 30 additions & 7 deletions projects/fine_tuning/testing/prepare_finetuning.py
@@ -49,7 +49,7 @@ def prepare_rhoai():
has_dsc = run.run("oc get dsc -oname", capture_stdout=True).stdout
run.run_toolbox(
"rhods", "update_datasciencecluster",
enable=["kueue", "codeflare", "trainingoperator"],
enable=["kueue", "codeflare", "trainingoperator", "ray"],
name=None if has_dsc else "default-dsc",
)

@@ -91,23 +91,41 @@ def set_namespace_annotations():

def download_data_sources(test_settings):
namespace = config.project.get_config("tests.fine_tuning.namespace")
model_name = test_settings["model_name"]
dataset_name = test_settings["dataset_name"]
model_name = test_settings.get("model_name")
dataset_name = test_settings.get("dataset_name")

pvc_name = config.project.get_config("fine_tuning.pvc.name")
sources = config.project.get_config(f"fine_tuning.sources")

dry_mode = config.project.get_config("tests.dry_mode")

sources_name = [dataset_name]
if model_name is None:
sources_name = []
if dataset_name:
sources_name.append(dataset_name)

if config.project.get_config("tests.fine_tuning.multi_model.enabled"):
multi_models = config.project.get_config("tests.fine_tuning.multi_model.models")
for model in multi_models:
sources_name.append(model["name"])
if model_name is None:
pass # nothing to do
elif isinstance(model_name, str):
sources_name.append(model_name)
else:
elif isinstance(model_name, list):
sources_name += model_name
else:
msg = f"Received an unexpected value of 'model_name': {model_name} ({model_name.__class__.__name__})"
logging.error(msg)
raise ValueError(msg)

if not sources_name:
logging.info("download_data_sources: Nothing to download.")
return # nothing to do

if not pvc_name:
msg = f"Found {len(sources_name)} sources to download, but fine_tuning.pvc.name={pvc_name}"
logging.error(msg)
raise ValueError(msg)

def do_download(extra, secret_key=None, image_key=None):
name = extra["name"]
@@ -288,5 +306,10 @@ def preload(image, name):
logging.warning(f"Preloading of '{image}' try #{i+1}/{RETRIES} failed :/")
if i+1 == RETRIES:
raise
do_ray = config.project.get_config("tests.fine_tuning.ray.enabled")
do_fms = config.project.get_config("tests.fine_tuning.fms.enabled")
if do_fms:
preload(config.project.get_config("tests.fine_tuning.fms.image"), "fine-tuning-image")

preload(config.project.get_config("fine_tuning.image"), "fine-tuning-image")
elif do_ray:
preload(config.project.get_config("tests.fine_tuning.ray.image"), "fine-tuning-image")
77 changes: 60 additions & 17 deletions projects/fine_tuning/testing/test_finetuning.py
@@ -74,24 +74,27 @@ def _run_test(test_artifact_dir_p, test_override_values, job_index=None):
test_settings = config.project.get_config("tests.fine_tuning.test_settings") | test_override_values
do_multi_model = config.project.get_config("tests.fine_tuning.multi_model.enabled")
do_many_model = config.project.get_config("tests.fine_tuning.many_model.enabled")
do_fms = config.project.get_config("tests.fine_tuning.fms.enabled")
do_quality_evaluation = config.project.get_config("tests.fine_tuning.quality_evaluation.enabled")
do_ray = config.project.get_config("tests.fine_tuning.ray.enabled")

test_settings["hyper_parameters"] = {k: v for k, v in test_settings["hyper_parameters"].items()
if v is not None}

logging.info(f"Test configuration to run: \n{yaml.dump(test_settings, sort_keys=False)}")

sources = config.project.get_config(f"fine_tuning.sources")
dataset_source = sources[test_settings["dataset_name"]]
if test_settings["dataset_name"]:
dataset_source = sources[test_settings["dataset_name"]]

if transform := dataset_source.get("transform", False):
test_settings["dataset_transform"] = transform
if transform := dataset_source.get("transform", False):
test_settings["dataset_transform"] = transform

if (prefer_cache := dataset_source.get("prefer_cache")) is not None:
test_settings["dataset_prefer_cache"] = prefer_cache
if (prefer_cache := dataset_source.get("prefer_cache")) is not None:
test_settings["dataset_prefer_cache"] = prefer_cache

if (response_template := dataset_source.get("response_template")) is not None:
test_settings["dataset_response_template"] = response_template
if (response_template := dataset_source.get("response_template")) is not None:
test_settings["dataset_response_template"] = response_template

remove_none_values(test_settings)

@@ -103,8 +106,14 @@ def _run_test(test_artifact_dir_p, test_override_values, job_index=None):
if not do_multi_model:
prom_start_ts = prom.reset_prometheus()

test_dir_name = "evaluate_quality" if do_quality_evaluation \
else "test_fine_tuning"
if do_fms:
test_dir_name = "fms_fine_tuning"
elif do_quality_evaluation:
test_dir_name = "evaluate_quality"
elif do_ray:
workload = config.project.get_config("tests.fine_tuning.ray.workload")
test_dir_name = f"ray__{workload}"

with env.NextArtifactDir(test_dir_name):
test_artifact_dir_p[0] = env.ARTIFACT_DIR

@@ -122,9 +131,16 @@ def _run_test(test_artifact_dir_p, test_override_values, job_index=None):
with open(env.ARTIFACT_DIR / "settings.mode.yaml", "w") as f:
yaml.dump(dict(mode="single-model"), f, indent=4)

test_settings["model_name"] = prepare_finetuning.get_safe_model_name(test_settings["model_name"])
run.run_toolbox_from_config("fine_tuning", "run_fine_tuning_job",
extra=test_settings)
if do_fms:
test_settings["model_name"] = prepare_finetuning.get_safe_model_name(test_settings["model_name"])
run.run_toolbox_from_config("fine_tuning", "run_fine_tuning_job",
extra=test_settings)
elif do_ray:
run.run_toolbox_from_config(
"fine_tuning", "ray_fine_tuning_job",
extra=test_settings,
)

failed = False
finally:
with open(env.ARTIFACT_DIR / "exit_code", "w") as f:
@@ -227,6 +243,16 @@ def _run_test_and_visualize(test_override_values=None):
do_matbenchmarking = test_override_values is None and config.project.get_config("tests.fine_tuning.matbenchmarking.enabled")
do_multi_model = config.project.get_config("tests.fine_tuning.multi_model.enabled")

ray_enabled = config.project.get_config("tests.fine_tuning.ray.enabled")
fms_enabled = config.project.get_config("tests.fine_tuning.fms.enabled")
quality_enabled = config.project.get_config("tests.fine_tuning.quality_evaluation.enabled")

enabled = sum(1 for opt in (fms_enabled, quality_enabled, ray_enabled) if opt)
if enabled != 1:
msg = f"FMS or Quality or Ray testing must be enabled. Found {enabled} enabled. Cannot proceed."
logging.error(msg)
raise RuntimeError(msg)

if not do_matbenchmarking and config.project.get_config("tests.fine_tuning.test_extra_settings"):
msg = "Cannot use 'test_extra_settings' when 'tests.fine_tuning.tests.fine_tuning.matbenchmarking' isn't enabled."
logging.error(msg)
@@ -242,8 +268,13 @@ def _run_test_and_visualize(test_override_values=None):
logging.error(msg)
raise RuntimeError(msg)

if not prepare_rhoai_mod.is_component_deployed("trainingoperator"):
msg = "Training Operator not installed, cluster not prepared for fine-tuning"
if fms_enabled and not prepare_rhoai_mod.is_component_deployed("trainingoperator"):
msg = "Training Operator not enabled, cluster not prepared for fine-tuning"
logging.error(msg)
raise RuntimeError(msg)

if ray_enabled and not prepare_rhoai_mod.is_component_deployed("ray"):
msg = "Ray Operator not enabled, cluster not prepared for fine-tuning"
logging.error(msg)
raise RuntimeError(msg)

@@ -475,11 +506,23 @@ def matbench_run_one():


def _run_test_many_model(test_settings):
run.run_toolbox_from_config("fine_tuning", "run_fine_tuning_job",
extra=test_settings | dict(prepare_only=True, delete_other=True))
ray_enabled = config.project.get_config("tests.fine_tuning.ray.enabled")
fms_enabled = config.project.get_config("tests.fine_tuning.fms.enabled")
extra = test_settings | dict(prepare_only=True, delete_other=True)

if fms_enabled:
run.run_toolbox_from_config("fine_tuning", "run_fine_tuning_job", extra)
elif ray_enabled:
run.run_toolbox_from_config("fine_tuning", "ray_fine_tuning_job", extra)

artifact_dir = list(env.ARTIFACT_DIR.glob("*__fine_tuning__run_fine_tuning_job"))[-1]
if fms_enabled:
fine_tuning_job_base = artifact_dir / "src" / "pytorchjob_fine_tuning.yaml"

elif ray_enabled:
# fine_tuning_job_base = artifact_dir / "src" / "ray_job.yaml"
raise NotImplementedError("Ray many-model fine-tuning is not implemented yet")

fine_tuning_job_base = artifact_dir / "src" / "pytorchjob_fine_tuning.yaml"
if not fine_tuning_job_base.exists():
raise FileNotFoundError(f"Something went wrong with the fine tuning job preparation. {fine_tuning_job_base} does not exist.")
