From aacd5bd0cfe0fc9be241cc4eec7a898e78e18e03 Mon Sep 17 00:00:00 2001 From: Yu Chin Fabian Lim Date: Fri, 11 Oct 2024 03:51:20 +0000 Subject: [PATCH 1/8] address issue 2 in #83 Signed-off-by: Yu Chin Fabian Lim --- .../framework_plugin_fast_quantized_peft.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_quantized_peft.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_quantized_peft.py index 6ec4cd9..78e90d7 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_quantized_peft.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_quantized_peft.py @@ -54,10 +54,19 @@ def _all_reduce_hook(grad): # because we will ignore these from FSDP, we need to manually # move them to gpu if they are already not on them + # - if the adapters are on meta, we assume that this is for FSDP + # low_cpu_mem_mode purposes, and that the values will be synced over + # - So just initialize them to empty. if not A.weight.is_cuda: - set_module_tensor_to_device(A, "weight", "cuda") + value = None + if A.weight.device == torch.device('meta'): + value = torch.empty(*A.weight.size(), dtype=A.weight.dtype) + set_module_tensor_to_device(A, "weight", "cuda", value) if not B.weight.is_cuda: - set_module_tensor_to_device(B, "weight", "cuda") + value = None + if B.weight.device == torch.device('meta'): + value = torch.empty(*B.weight.size(), dtype=B.weight.dtype) + set_module_tensor_to_device(B, "weight", "cuda", value) def register_foak_model_patch_rules(base_type): From 1e99ee9138f09a5bee2272ea6277dfe89d8b69b8 Mon Sep 17 00:00:00 2001 From: Yu Chin Fabian Lim Date: Fri, 11 Oct 2024 06:55:30 +0000 Subject: [PATCH 2/8] properly handle broadcast of adapters Signed-off-by: Yu Chin Fabian Lim --- .../framework_plugin_fast_quantized_peft.py | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_quantized_peft.py b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_quantized_peft.py index 78e90d7..0558d6b 100644 --- a/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_quantized_peft.py +++ b/plugins/fused-ops-and-kernels/src/fms_acceleration_foak/framework_plugin_fast_quantized_peft.py @@ -21,10 +21,10 @@ from peft import LoraConfig from peft.tuners.lora.layer import LoraLayer from transformers import TrainingArguments +from transformers.modeling_utils import is_fsdp_enabled import torch import torch.distributed as dist - # consider moving this somewhere else later def lora_adapters_switch_ddp_from_fsdp(modules, fsdp_plugin): """ @@ -58,16 +58,27 @@ def _all_reduce_hook(grad): # low_cpu_mem_mode purposes, and that the values will be synced over # - So just initialize them to empty. 
if not A.weight.is_cuda: - value = None - if A.weight.device == torch.device('meta'): - value = torch.empty(*A.weight.size(), dtype=A.weight.dtype) + value = A.weight + + if is_fsdp_enabled() and value.device == torch.device('meta'): + # if low_cpu_mem_mode + value = torch.empty(*value.size(), dtype=value.dtype) + set_module_tensor_to_device(A, "weight", "cuda", value) + + if is_fsdp_enabled(): + dist.broadcast(A.weight, src=0) + if not B.weight.is_cuda: - value = None - if B.weight.device == torch.device('meta'): - value = torch.empty(*B.weight.size(), dtype=B.weight.dtype) + value = B.weight + + if is_fsdp_enabled() and value.device == torch.device('meta'): + value = torch.empty(*value.size(), dtype=value.dtype) + set_module_tensor_to_device(B, "weight", "cuda", value) + if is_fsdp_enabled(): + dist.broadcast(B.weight, src=0) def register_foak_model_patch_rules(base_type): # Third Party From 0f5c9ae08e44d2a0279f37bc95e678ecda8bec2d Mon Sep 17 00:00:00 2001 From: Yu Chin Fabian Lim Date: Fri, 11 Oct 2024 11:51:30 +0000 Subject: [PATCH 3/8] handle param_init_fn_tied_param Signed-off-by: Yu Chin Fabian Lim --- .../framework_plugin_bnb.py | 93 +++++++------------ .../src/fms_acceleration_peft/fsdp_utils.py | 69 ++++++++++++++ 2 files changed, 103 insertions(+), 59 deletions(-) create mode 100644 plugins/accelerated-peft/src/fms_acceleration_peft/fsdp_utils.py diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py index 1aadc08..0c3cddf 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py @@ -116,64 +116,12 @@ def model_loader(self, model_name: str, **kwargs): except ValueError: world_size = 1 # pg not init - patched_is_local_dist_rank_0 = None if ( world_size > 1 and os.environ.get("ACCELERATE_USE_FSDP", "false").lower() == "true" ): config_kwargs["bnb_4bit_quant_storage"] = torch_dtype - # - of course assume that this package must exist, simply need the version - _, _transformers_version = _is_package_available( - "transformers", return_version=True - ) - - # this is a workaround that disables low_cpu_mem_mode for quant QLORA - # - this issue was introduced in https://github.com/huggingface/transformers/pull/33154 - # whereby the low_cpu_mem_mode was actually fixed. - # - However fixing it causes some problems with the current impl. - # 1. For lora fused ops, the adapters cannot be managed by FSDP, as - # forwards are not called. This causes issue 2) in - # https://github.com/foundation-model-stack/fms-acceleration/issues/83 - # where the adapters are still sharded when passed in the fused-ops. - # However, if low_cpu_mem_mode=True, then we NEED FSDP to intialize - # their state, which contradicts the above point. - # - # 2. We have observed, - # see https://github.com/foundation-model-stack/fms-acceleration/pull/86 - # that low_cpu_mem_mode=True can cause torch distributed primitives - # to hang. 
- - if _transformers_version >= "4.45": - - # pylint: disable=import-outside-toplevel - # Third Party - from fms_acceleration.model_patcher import patch_target_module - import transformers.modeling_utils - - def _truthy(): - return ( - True # use this to always return True to is_local_dist_rank_0 - ) - - # - we cannot use the model patcher and this needs to be called immediately below - # at the model_loader - # - but we immediately revert the patch after loading - patched_is_local_dist_rank_0 = ( - transformers.modeling_utils.is_local_dist_rank_0 - ) - patch_target_module( - "transformers.modeling_utils.is_local_dist_rank_0", - _truthy, - ) - - warnings.warn( - "Disabling low_cpu_mem_mode in the BNBAccelerationPlugin as this may " - "potentiall cause problems with: " - "1. the fused-ops-and-kernels package, and, " - "2. the syncing of FSDP modules across devices." - ) - elif world_size > 1: warnings.warn( "Running in distributed mode but bnb_4bit_quant_storage is not set. " @@ -206,13 +154,6 @@ def _truthy(): attn_implementation=attn_implementation, ) - if patched_is_local_dist_rank_0 is not None: - # replace it - patch_target_module( - "transformers.modeling_utils.is_local_dist_rank_0", - patched_is_local_dist_rank_0, - ) - return model @property @@ -252,6 +193,40 @@ def augmentation( modifiable_args = (None,) # return a None return model, modifiable_args + def get_callbacks_and_ready_for_train( + self, model: torch.nn.Module = None, accelerator=None + ): + callbacks = [] + if ( + accelerator is not None + and getattr(accelerator.state, "fsdp_plugin", None) is not None + ): + _, _transformers_version = _is_package_available("transformers", return_version=True) + _trl_installed, _trl_version = _is_package_available("trl", return_version=True) + + # the meta device fix for quantized models is since this transformers version + # or if trl is installed then its only for this version + if ( + _transformers_version >= "4.45" and ( + not _trl_installed or (_trl_installed and _trl_version >= "12") + ) + ): + # guarded + # NOTE: replace this later with a more specific accelerate version check + try: + from torch.distributed.utils import ensure_weights_retied + # then its handled internally and there is nothing to do + except ImportError: + # need to use our internal version + from .fsdp_utils import ensure_weights_retied + accelerator.state.fsdp_plugin.param_init_fn = ensure_weights_retied( + accelerator.state.fsdp_plugin.param_init_fn, + model if self._no_peft_model else model.get_base_model(), + accelerator.device + ) + + return callbacks + # register AccelerationPlugin.register_plugin( diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/fsdp_utils.py b/plugins/accelerated-peft/src/fms_acceleration_peft/fsdp_utils.py new file mode 100644 index 0000000..95a22eb --- /dev/null +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/fsdp_utils.py @@ -0,0 +1,69 @@ +from collections import defaultdict +import torch + + +# Copyright The IBM Tuning Team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +# SPDX-License-Identifier: Apache-2.0 +# https://spdx.dev/learn/handling-license-info/ + +def ensure_weights_retied( + param_init_fn, model: torch.nn.Module, device: torch.cuda.device +): + + _tied_names = model._tied_weights_keys + if not _tied_names: + # if no tied names just passthrough + return param_init_fn + + # get map of parameter instances to params. + # - needed for replacement later + _tied_params = {} + for name in _tied_names: + name = name.split('.') + name, param_name = '.'.join(name[:-1]), name[-1] + mod = model.get_submodule(name) + param = getattr(mod, param_name) + + _tied_params[id(param)] = None # placeholder for the param first + + # build param_init_fn for the case with tied params + def param_init_fn_tied_param(module: torch.nn.Module): + + # track which params to tie + # - usually only 1, but for completeness consider > 1 + params_to_tie = defaultdict(list) + for n, param in module.named_parameters(recurse=False): + if id(param) in _tied_params: + params_to_tie[id(param)].append(n) + + # call the param init fn, which potentially re-allocates the + # parameters + module = param_init_fn(module) + + # search the parameters again and tie them up again + for id_key, _param_names in params_to_tie.items(): + for param_name in _param_names: + param = _tied_params[id_key] + if param is None: + # everything will be tied to the first time the + # param is observed + _tied_params[id_key] = getattr(module, param_name) + else: + setattr(module, param_name, param) # tie + + return module + + return param_init_fn_tied_param \ No newline at end of file From 18aa160b9c49479c7f5e759455904c440c1f57a6 Mon Sep 17 00:00:00 2001 From: Yu Chin Fabian Lim Date: Sun, 13 Oct 2024 06:05:35 +0000 Subject: [PATCH 4/8] trl version error Signed-off-by: Yu Chin Fabian Lim --- .../src/fms_acceleration_peft/framework_plugin_bnb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py index 0c3cddf..8560bd4 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py @@ -208,7 +208,7 @@ def get_callbacks_and_ready_for_train( # or if trl is installed then its only for this version if ( _transformers_version >= "4.45" and ( - not _trl_installed or (_trl_installed and _trl_version >= "12") + not _trl_installed or (_trl_installed and _trl_version >= "0.12") ) ): # guarded From e02993d2511aeb4bec9f346d496953ea2ff325bd Mon Sep 17 00:00:00 2001 From: Yu Chin Fabian Lim Date: Sun, 13 Oct 2024 12:45:52 +0000 Subject: [PATCH 5/8] tied weights fix and meta fix for autogptq Signed-off-by: Yu Chin Fabian Lim --- .../framework_plugin_autogptq.py | 34 ++++++++ .../gptqmodel/models/base.py | 79 ++++++++++--------- 2 files changed, 76 insertions(+), 37 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py index e03a40d..8fd10df 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py @@ -29,6 +29,7 @@ from peft.tuners.lora.model import LoraModel from transformers import AutoModelForCausalLM, 
TrainingArguments from transformers.modeling_utils import is_fsdp_enabled +from transformers.utils.import_utils import _is_package_available import torch import torch.distributed @@ -351,6 +352,39 @@ def augmentation( return model, modifiable_args + def get_callbacks_and_ready_for_train( + self, model: torch.nn.Module = None, accelerator=None + ): + callbacks = [] + if ( + accelerator is not None + and getattr(accelerator.state, "fsdp_plugin", None) is not None + ): + _, _transformers_version = _is_package_available("transformers", return_version=True) + _trl_installed, _trl_version = _is_package_available("trl", return_version=True) + + # the meta device fix for quantized models is since this transformers version + # or if trl is installed then its only for this version + if ( + _transformers_version >= "4.45" and ( + not _trl_installed or (_trl_installed and _trl_version >= "0.12") + ) + ): + # guarded + # NOTE: replace this later with a more specific accelerate version check + try: + from torch.distributed.utils import ensure_weights_retied + # then its handled internally and there is nothing to do + except ImportError: + # need to use our internal version + from .fsdp_utils import ensure_weights_retied + accelerator.state.fsdp_plugin.param_init_fn = ensure_weights_retied( + accelerator.state.fsdp_plugin.param_init_fn, + model.get_base_model(), + accelerator.device + ) + return callbacks + # register AccelerationPlugin.register_plugin( diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py index 40f80cb..8782744 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py @@ -32,7 +32,7 @@ PretrainedConfig, PreTrainedModel, ) -from transformers.modeling_utils import no_init_weights, shard_checkpoint +from transformers.modeling_utils import no_init_weights, shard_checkpoint, is_local_dist_rank_0 from transformers.utils.generic import ContextManagers import accelerate import torch @@ -1105,45 +1105,50 @@ def skip(*args, **kwargs): # prepares the model on gpu in `trainer.train` to avoid unnecessary gpu usage device_map = {"": "cpu"} - load_checkpoint_in_model = False - # compat: runtime convert checkpoint gptq(v1) to gptq_v2 format - if quantize_config.format == FORMAT.GPTQ: - accelerate.load_checkpoint_in_model( - model, - dtype=torch_dtype, - # This is very hacky but works due to https://github.com/huggingface/accelerate/blob/bd72a5f1a80d5146554458823f8aeda0a9db5297/src/accelerate/utils/modeling.py#L292 - checkpoint=model_save_name, - device_map=device_map, - ) - # validate sym=False v1 loading needs to be protected for models produced with new v2 format codebase - if ( - not quantize_config.sym - and not quantize_config.is_quantized_or_packed_by_v2() - ): - raise ValueError( - f"Loading of a sym=False model with format={FORMAT.GPTQ} is only supported if produced by gptqmodel version >= {MIN_VERSION_WITH_V2}" + # low_cpu_mem_usage fix by flim@sg.ibm.com + # - load the checkpoint only if not low_cpu_mem_usage + # - or if low_cpu_mem_usage then only in the rank_0 + if not low_cpu_mem_usage or is_local_dist_rank_0(): + load_checkpoint_in_model = False + # compat: runtime convert checkpoint gptq(v1) to gptq_v2 format + if quantize_config.format == FORMAT.GPTQ: + accelerate.load_checkpoint_in_model( + model, + dtype=torch_dtype, + # This is very hacky but works due to 
https://github.com/huggingface/accelerate/blob/bd72a5f1a80d5146554458823f8aeda0a9db5297/src/accelerate/utils/modeling.py#L292 + checkpoint=model_save_name, + device_map=device_map, ) + # validate sym=False v1 loading needs to be protected for models produced with new v2 format codebase + if ( + not quantize_config.sym + and not quantize_config.is_quantized_or_packed_by_v2() + ): + raise ValueError( + f"Loading of a sym=False model with format={FORMAT.GPTQ} is only supported if produced by gptqmodel version >= {MIN_VERSION_WITH_V2}" + ) - logger.info( - f"Compatibility: converting `{FORMAT_FIELD_JSON}` from `{FORMAT.GPTQ}` to `{FORMAT.GPTQ_V2}`." - ) - model = convert_gptq_v1_to_v2_format( - model, - quantize_config=quantize_config, - qlinear_kernel=preload_qlinear_kernel, - ) - load_checkpoint_in_model = True - quantize_config.format = FORMAT.GPTQ_V2 + logger.info( + f"Compatibility: converting `{FORMAT_FIELD_JSON}` from `{FORMAT.GPTQ}` to `{FORMAT.GPTQ_V2}`." + ) + model = convert_gptq_v1_to_v2_format( + model, + quantize_config=quantize_config, + qlinear_kernel=preload_qlinear_kernel, + ) + load_checkpoint_in_model = True + quantize_config.format = FORMAT.GPTQ_V2 + + if not load_checkpoint_in_model and backend == Backend.TRITON: + accelerate.load_checkpoint_in_model( + model, + dtype=torch_dtype, # This is very hacky but works due to https://github.com/huggingface/accelerate/blob/bd72a5f1a80d5146554458823f8aeda0a9db5297/src/accelerate/utils/modeling.py#L292 + checkpoint=model_save_name, + device_map=device_map, + ) - if not load_checkpoint_in_model and backend == Backend.TRITON: - accelerate.load_checkpoint_in_model( - model, - dtype=torch_dtype, # This is very hacky but works due to https://github.com/huggingface/accelerate/blob/bd72a5f1a80d5146554458823f8aeda0a9db5297/src/accelerate/utils/modeling.py#L292 - checkpoint=model_save_name, - device_map=device_map, - ) - # TODO: Why are we using this custom function and not dispatch_model? - model = simple_dispatch_model(model, device_map) + # TODO: Why are we using this custom function and not dispatch_model? + model = simple_dispatch_model(model, device_map) qlinear_kernel = select_quant_linear( bits=quantize_config.bits, From 510e0d18d5054d0035c21d637c9efef5f83812de Mon Sep 17 00:00:00 2001 From: Yu Chin Fabian Lim Date: Sun, 13 Oct 2024 15:08:18 +0000 Subject: [PATCH 6/8] update readme Signed-off-by: Yu Chin Fabian Lim --- plugins/accelerated-peft/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/plugins/accelerated-peft/README.md b/plugins/accelerated-peft/README.md index fc2cf62..8fb1afb 100644 --- a/plugins/accelerated-peft/README.md +++ b/plugins/accelerated-peft/README.md @@ -6,8 +6,8 @@ Currently only supports LoRA-related techniques, but more are in the pipeline to Plugin | Description | Depends | Loading | Augmentation | Callbacks --|--|--|--|--|-- -[autogptq](./src/fms_acceleration_peft/framework_plugin_autogptq.py) | Loads 4bit GPTQ-LoRA with quantized GPTQ as base | AutoGPTQ | ✅ | ✅ -[bnb](./src/fms_acceleration_peft/framework_plugin_bnb.py) | Loads 4bit QLoRA with quantized bitsandbytes Linear4 | Huggingface
bitsandbytes | ✅ | ✅ +[autogptq](./src/fms_acceleration_peft/framework_plugin_autogptq.py) | Loads 4bit GPTQ-LoRA with quantized GPTQ as base | AutoGPTQ | ✅ | ✅ | ✅ +[bnb](./src/fms_acceleration_peft/framework_plugin_bnb.py) | Loads 4bit QLoRA with quantized bitsandbytes Linear4 | Huggingface
bitsandbytes | ✅ | ✅ | ✅ ### Key Points @@ -43,6 +43,7 @@ GPTQ-LORA depends on an AutoGPTQ backend to run. There are 2 backend options ## Known Issues + - GPTQ-LORA sometimes observed to have `nan` grad norms in the begining of training, but training proceeds well otherwise. -- `low_cpu_mem_usage` temporarily disabled for AutoGPTQ until bug with `make_sure_no_tensor_in_meta_device` is resolved. From a85faacceeead022273bcb3058cc768422f96a21 Mon Sep 17 00:00:00 2001 From: Yu Chin Fabian Lim Date: Mon, 14 Oct 2024 03:11:49 +0000 Subject: [PATCH 7/8] fmt + lint Signed-off-by: Yu Chin Fabian Lim --- .../framework_plugin_autogptq.py | 31 +++++++++++-------- .../framework_plugin_bnb.py | 27 ++++++++++------ .../src/fms_acceleration_peft/fsdp_utils.py | 25 ++++++++------- .../gptqmodel/models/base.py | 6 +++- 4 files changed, 55 insertions(+), 34 deletions(-) diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py index 8fd10df..e1fd277 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py @@ -62,10 +62,6 @@ def __init__(self, configurations: Dict[str, Dict]): ) if self.use_external_lib: - # Third Party - from transformers.utils.import_utils import ( # pylint: disable=import-outside-toplevel - _is_package_available, - ) assert _is_package_available("auto_gptq") is True, ( "Unable to use external library, auto_gptq module not found. " @@ -360,28 +356,37 @@ def get_callbacks_and_ready_for_train( accelerator is not None and getattr(accelerator.state, "fsdp_plugin", None) is not None ): - _, _transformers_version = _is_package_available("transformers", return_version=True) - _trl_installed, _trl_version = _is_package_available("trl", return_version=True) + _, _transformers_version = _is_package_available( + "transformers", return_version=True + ) + _trl_installed, _trl_version = _is_package_available( + "trl", return_version=True + ) # the meta device fix for quantized models is since this transformers version # or if trl is installed then its only for this version - if ( - _transformers_version >= "4.45" and ( - not _trl_installed or (_trl_installed and _trl_version >= "0.12") - ) + if _transformers_version >= "4.45" and ( + not _trl_installed or (_trl_installed and _trl_version >= "0.12") ): # guarded # NOTE: replace this later with a more specific accelerate version check try: + # Third Party + # pylint: disable=import-outside-toplevel from torch.distributed.utils import ensure_weights_retied + # then its handled internally and there is nothing to do except ImportError: # need to use our internal version - from .fsdp_utils import ensure_weights_retied + # Local + from .fsdp_utils import ( # pylint: disable=import-outside-toplevel + ensure_weights_retied, + ) + accelerator.state.fsdp_plugin.param_init_fn = ensure_weights_retied( - accelerator.state.fsdp_plugin.param_init_fn, + accelerator.state.fsdp_plugin.param_init_fn, model.get_base_model(), - accelerator.device + accelerator.device, ) return callbacks diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py index 8560bd4..3a4c12a 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_bnb.py @@ 
-201,28 +201,37 @@ def get_callbacks_and_ready_for_train( accelerator is not None and getattr(accelerator.state, "fsdp_plugin", None) is not None ): - _, _transformers_version = _is_package_available("transformers", return_version=True) - _trl_installed, _trl_version = _is_package_available("trl", return_version=True) + _, _transformers_version = _is_package_available( + "transformers", return_version=True + ) + _trl_installed, _trl_version = _is_package_available( + "trl", return_version=True + ) # the meta device fix for quantized models is since this transformers version # or if trl is installed then its only for this version - if ( - _transformers_version >= "4.45" and ( - not _trl_installed or (_trl_installed and _trl_version >= "0.12") - ) + if _transformers_version >= "4.45" and ( + not _trl_installed or (_trl_installed and _trl_version >= "0.12") ): # guarded # NOTE: replace this later with a more specific accelerate version check try: + # Third Party + # pylint: disable=import-outside-toplevel from torch.distributed.utils import ensure_weights_retied + # then its handled internally and there is nothing to do except ImportError: # need to use our internal version - from .fsdp_utils import ensure_weights_retied + # Local + from .fsdp_utils import ( # pylint: disable=import-outside-toplevel + ensure_weights_retied, + ) + accelerator.state.fsdp_plugin.param_init_fn = ensure_weights_retied( - accelerator.state.fsdp_plugin.param_init_fn, + accelerator.state.fsdp_plugin.param_init_fn, model if self._no_peft_model else model.get_base_model(), - accelerator.device + accelerator.device, ) return callbacks diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/fsdp_utils.py b/plugins/accelerated-peft/src/fms_acceleration_peft/fsdp_utils.py index 95a22eb..3086cf7 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/fsdp_utils.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/fsdp_utils.py @@ -1,6 +1,8 @@ +# Standard from collections import defaultdict -import torch +# Third Party +import torch # Copyright The IBM Tuning Team # @@ -19,6 +21,7 @@ # SPDX-License-Identifier: Apache-2.0 # https://spdx.dev/learn/handling-license-info/ + def ensure_weights_retied( param_init_fn, model: torch.nn.Module, device: torch.cuda.device ): @@ -28,28 +31,28 @@ def ensure_weights_retied( # if no tied names just passthrough return param_init_fn - # get map of parameter instances to params. + # get map of parameter instances to params. 
# - needed for replacement later _tied_params = {} for name in _tied_names: - name = name.split('.') - name, param_name = '.'.join(name[:-1]), name[-1] + name = name.split(".") + name, param_name = ".".join(name[:-1]), name[-1] mod = model.get_submodule(name) param = getattr(mod, param_name) - _tied_params[id(param)] = None # placeholder for the param first - + _tied_params[id(param)] = None # placeholder for the param first + # build param_init_fn for the case with tied params def param_init_fn_tied_param(module: torch.nn.Module): - # track which params to tie + # track which params to tie # - usually only 1, but for completeness consider > 1 params_to_tie = defaultdict(list) for n, param in module.named_parameters(recurse=False): if id(param) in _tied_params: params_to_tie[id(param)].append(n) - # call the param init fn, which potentially re-allocates the + # call the param init fn, which potentially re-allocates the # parameters module = param_init_fn(module) @@ -62,8 +65,8 @@ def param_init_fn_tied_param(module: torch.nn.Module): # param is observed _tied_params[id_key] = getattr(module, param_name) else: - setattr(module, param_name, param) # tie - + setattr(module, param_name, param) # tie + return module - return param_init_fn_tied_param \ No newline at end of file + return param_init_fn_tied_param diff --git a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py index 8782744..85c17ee 100644 --- a/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py +++ b/plugins/accelerated-peft/src/fms_acceleration_peft/gptqmodel/models/base.py @@ -32,7 +32,11 @@ PretrainedConfig, PreTrainedModel, ) -from transformers.modeling_utils import no_init_weights, shard_checkpoint, is_local_dist_rank_0 +from transformers.modeling_utils import ( + is_local_dist_rank_0, + no_init_weights, + shard_checkpoint, +) from transformers.utils.generic import ContextManagers import accelerate import torch From 5a24f02e0c6d42cf9b5634d035502a0503741a74 Mon Sep 17 00:00:00 2001 From: Yu Chin Fabian Lim Date: Mon, 14 Oct 2024 14:01:48 +0000 Subject: [PATCH 8/8] upgrade granite benches Signed-off-by: Yu Chin Fabian Lim --- scripts/benchmarks/refs/a100_80gb_granite.csv | 58 ++++----- .../benchmarks/refs/requirements_granite.txt | 110 +++++++++++++++--- 2 files changed, 123 insertions(+), 45 deletions(-) diff --git a/scripts/benchmarks/refs/a100_80gb_granite.csv b/scripts/benchmarks/refs/a100_80gb_granite.csv index 0637a4c..353efcd 100644 --- a/scripts/benchmarks/refs/a100_80gb_granite.csv +++ b/scripts/benchmarks/refs/a100_80gb_granite.csv @@ -1,29 +1,29 @@ -epoch,framework_config,learning_rate,lora_alpha,lora_dropout,mem_nvidia_mem_reserved,mem_peak_torch_mem_alloc_in_bytes,mem_torch_mem_alloc_in_bytes,model_name_or_path,num_gpus,output_dir,peft_method,per_device_train_batch_size,r,target_modules,torch_dtype,train_loss,train_runtime,train_samples_per_second,train_steps_per_second,train_tokens_per_second -0.14,none,2e-5,,,47683.0,35134836736,21089537536,ibm/PowerLM-3b,1,benchmark_outputs3/exp_0/hf,,4,,,bfloat16,0.9698258590698242,322.7029,1.24,0.31,5077.116 -0.14,none,2e-5,,,43779.0,35198512128,28135299584,ibm/PowerLM-3b,2,benchmark_outputs3/exp_1/hf,,2,,,bfloat16,0.929816370010376,185.4019,2.157,0.539,4418.51 -0.29,none,2e-5,,,52927.0,47059390976,21089930752,ibm/PowerLM-3b,1,benchmark_outputs3/exp_2/hf,,8,,,bfloat16,0.9660552883148193,629.0534,1.272,0.159,5209.097 
-0.29,none,2e-5,,,45698.0,35169178112,28129300992,ibm/PowerLM-3b,2,benchmark_outputs3/exp_3/hf,,4,,,bfloat16,0.9272240352630615,334.0146,2.395,0.299,4905.174 -0.14,foak-fast-kernels,2e-5,,,43333.0,35133716992,21088791040,ibm/PowerLM-3b,1,benchmark_outputs3/exp_4/hf,,4,,,bfloat16,0.9692397594451905,273.7817,1.461,0.365,5984.329 -0.14,foak-fast-kernels,2e-5,,,42204.0,35208045568,28144833024,ibm/PowerLM-3b,2,benchmark_outputs3/exp_5/hf,,2,,,bfloat16,0.9299186515808106,160.0673,2.499,0.625,5117.847 -0.29,foak-fast-kernels,2e-5,,,50077.0,40316365824,21089184256,ibm/PowerLM-3b,1,benchmark_outputs3/exp_6/hf,,8,,,bfloat16,0.9664452648162842,532.0588,1.504,0.188,6158.718 -0.29,foak-fast-kernels,2e-5,,,47155.0,35173639168,28114620928,ibm/PowerLM-3b,2,benchmark_outputs3/exp_7/hf,,4,,,bfloat16,0.9271490287780761,285.7146,2.8,0.35,5734.394 -0.14,none,2e-4,16,0.1,24383.0,20193463808,7183026688,ibm/PowerLM-3b,1,benchmark_outputs3/exp_8/hf,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,1.0136762142181397,310.9178,1.287,0.322,5269.56 -0.14,none,2e-4,16,0.1,13849.0,10322532864,3625665536,ibm/PowerLM-3b,2,benchmark_outputs3/exp_9/hf,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,1.014874858856201,215.9738,1.852,0.463,3793.053 -0.29,none,2e-4,16,0.1,41093.0,33176473088,7183419904,ibm/PowerLM-3b,1,benchmark_outputs3/exp_10/hf,lora,8,16,q_proj k_proj v_proj o_proj,bfloat16,1.013114538192749,611.955,1.307,0.163,5354.642 -0.29,none,2e-4,16,0.1,22076.0,16835885568,3646137344,ibm/PowerLM-3b,2,benchmark_outputs3/exp_11/hf,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,1.0142618751525878,332.7819,2.404,0.3,4923.344 -0.14,foak-fast-kernels,2e-4,16,0.1,21311.0,16897035264,7183026688,ibm/PowerLM-3b,1,benchmark_outputs3/exp_12/hf,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,1.0154188919067382,262.241,1.525,0.381,6247.687 -0.14,foak-fast-kernels,2e-4,16,0.1,12241.0,8673304576,3624651776,ibm/PowerLM-3b,2,benchmark_outputs3/exp_13/hf,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,1.0135305118560791,195.6867,2.044,0.511,4186.284 -0.29,foak-fast-kernels,2e-4,16,0.1,34661.0,26585189376,7183419904,ibm/PowerLM-3b,1,benchmark_outputs3/exp_14/hf,lora,8,16,q_proj k_proj v_proj o_proj,bfloat16,1.0131592655181885,516.5368,1.549,0.194,6343.788 -0.29,foak-fast-kernels,2e-4,16,0.1,19014.0,13533374464,3640054784,ibm/PowerLM-3b,2,benchmark_outputs3/exp_15/hf,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,1.011543436050415,290.371,2.755,0.344,5642.437 -0.14,baseline-peft-bnb,2e-4,16,0.1,23127.0,18174161920,2143825920,ibm/PowerLM-3b,1,benchmark_outputs3/exp_16/hf,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,1.0207978439331056,391.2937,1.022,0.256,4187.137 -0.14,baseline-peft-bnb,2e-4,16,0.1,11254.0,7826759168,1129891840,ibm/PowerLM-3b,2,benchmark_outputs3/exp_17/hf,lora,2,16,q_proj k_proj v_proj o_proj,bfloat16,1.0213402462005616,220.9544,1.81,0.453,3707.553 -0.29,baseline-peft-bnb,2e-4,16,0.1,43327.0,34177070080,2144219136,ibm/PowerLM-3b,1,benchmark_outputs3/exp_18/hf,lora,8,16,q_proj k_proj v_proj o_proj,bfloat16,1.018005132675171,760.3699,1.052,0.132,4309.481 -0.29,baseline-peft-bnb,2e-4,16,0.1,19415.0,14319836672,1130088448,ibm/PowerLM-3b,2,benchmark_outputs3/exp_19/hf,lora,4,16,q_proj k_proj v_proj o_proj,bfloat16,1.0211043930053711,336.4976,2.377,0.297,4868.98 -0.14,accelerated-peft-bnb,2e-4,16,0.1,19379.0,15151677952,2141240832,ibm/PowerLM-3b,1,benchmark_outputs3/exp_20/hf,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.0230310726165772,296.6505,1.348,0.337,5522.998 
-0.14,accelerated-peft-bnb,2e-4,16,0.1,11269.0,7826759168,1129891840,ibm/PowerLM-3b,2,benchmark_outputs3/exp_21/hf,lora,2,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.0242974948883057,227.579,1.758,0.439,3599.629 -0.29,accelerated-peft-bnb,2e-4,16,0.1,35837.0,28134687232,2141634048,ibm/PowerLM-3b,1,benchmark_outputs3/exp_22/hf,lora,8,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.020608777999878,579.2578,1.381,0.173,5656.894 -0.29,accelerated-peft-bnb,2e-4,16,0.1,19406.0,14319836672,1130088448,ibm/PowerLM-3b,2,benchmark_outputs3/exp_23/hf,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.0224126720428466,334.7691,2.39,0.299,4894.12 -0.14,accelerated-peft-bnb-foak,2e-4,16,0.1,16347.0,11831656448,2141240832,ibm/PowerLM-3b,1,benchmark_outputs3/exp_24/hf,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.0216326427459717,247.0632,1.619,0.405,6631.502 -0.14,accelerated-peft-bnb-foak,2e-4,16,0.1,9705.0,6202137600,1129891840,ibm/PowerLM-3b,2,benchmark_outputs3/exp_25/hf,lora,2,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.0239610195159912,137.0723,2.918,0.73,5976.408 -0.29,accelerated-peft-bnb-foak,2e-4,16,0.1,29777.0,21519810560,2141634048,ibm/PowerLM-3b,1,benchmark_outputs3/exp_26/hf,lora,8,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.0218979930877685,480.7611,1.664,0.208,6815.859 -0.29,accelerated-peft-bnb-foak,2e-4,16,0.1,16454.0,11047001088,1130088448,ibm/PowerLM-3b,2,benchmark_outputs3/exp_27/hf,lora,4,16,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.0228086185455323,252.2261,3.172,0.396,6495.76 +bf16,epoch,fp16,framework_config,learning_rate,lora_alpha,lora_dropout,mem_nvidia_mem_reserved,mem_peak_torch_mem_alloc_in_bytes,mem_torch_mem_alloc_in_bytes,model_name_or_path,num_gpus,peft_method,per_device_train_batch_size,r,target_modules,torch_dtype,train_loss,train_runtime,train_samples_per_second,train_steps_per_second,train_tokens_per_second +,0.14,,none,2e-05,,,47683.0,35134836736,21089537536,ibm/PowerLM-3b,1,,4,,,bfloat16,0.9708438396453858,321.2872,1.245,0.311,5099.488 +,0.14,,none,2e-05,,,25563.0,17626672128,14116037120,ibm/PowerLM-3b,2,,2,,,bfloat16,0.9699858856201172,178.9566,2.235,0.559,4577.647 +,0.29,,none,2e-05,,,52927.0,47059390976,21089930752,ibm/PowerLM-3b,1,,8,,,bfloat16,0.9677363204956054,627.2531,1.275,0.159,5224.047 +,0.29,,none,2e-05,,,28962.0,23781051904,14120288768,ibm/PowerLM-3b,2,,4,,,bfloat16,0.967378387451172,328.2929,2.437,0.305,4990.665 +,0.14,,foak-fast-kernels,2e-05,,,43333.0,35133716992,21088791040,ibm/PowerLM-3b,1,,4,,,bfloat16,0.9708970737457276,272.6346,1.467,0.367,6009.509 +,0.14,,foak-fast-kernels,2e-05,,,24945.0,17618562048,14107927040,ibm/PowerLM-3b,2,,2,,,bfloat16,0.9701127910614014,155.0031,2.581,0.645,5285.056 +,0.29,,foak-fast-kernels,2e-05,,,50077.0,40316365824,21089184256,ibm/PowerLM-3b,1,,8,,,bfloat16,0.9673656368255616,530.8662,1.507,0.188,6172.553 +,0.29,,foak-fast-kernels,2e-05,,,25856.0,20396960768,14108123648,ibm/PowerLM-3b,2,,4,,,bfloat16,0.9672414493560793,280.7892,2.849,0.356,5834.982 +,0.14,,none,0.0002,16.0,0.1,24591.0,20169870848,7183026688,ibm/PowerLM-3b,1,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0160441780090332,325.5188,1.229,0.307,5033.196 +,0.14,,none,0.0002,16.0,0.1,13919.0,10321519104,3624651776,ibm/PowerLM-3b,2,lora,2,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0179708290100098,205.3199,1.948,0.487,3989.872 +,0.29,,none,0.0002,16.0,0.1,41237.0,33152880128,7183419904,ibm/PowerLM-3b,1,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.015061044692993,641.7595,1.247,0.156,5105.963 
+,0.29,,none,0.0002,16.0,0.1,22182.0,16819665408,3629917184,ibm/PowerLM-3b,2,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0122298526763915,342.5583,2.335,0.292,4782.835 +,0.14,,foak-fast-kernels,0.0002,16.0,0.1,21375.0,16873442304,7183026688,ibm/PowerLM-3b,1,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0166709804534912,277.343,1.442,0.361,5907.487 +,0.14,,foak-fast-kernels,0.0002,16.0,0.1,12274.0,8673304576,3624651776,ibm/PowerLM-3b,2,lora,2,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0182081413269044,181.9223,2.199,0.55,4503.022 +,0.29,,foak-fast-kernels,0.0002,16.0,0.1,34803.0,26561596416,7183419904,ibm/PowerLM-3b,1,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0157270526885986,546.8807,1.463,0.183,5991.8 +,0.29,,foak-fast-kernels,0.0002,16.0,0.1,19151.0,13518168064,3624848384,ibm/PowerLM-3b,2,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0141748046875,299.1818,2.674,0.334,5476.269 +True,0.14,,baseline-peft-bnb,0.0002,16.0,0.1,23127.0,18174161920,2143825920,ibm/PowerLM-3b,1,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.020770034790039,390.2779,1.025,0.256,4198.034 +True,0.14,,baseline-peft-bnb,0.0002,16.0,0.1,11366.0,7826759168,1129891840,ibm/PowerLM-3b,2,lora,2,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.026328239440918,217.3217,1.841,0.46,3769.527 +True,0.29,,baseline-peft-bnb,0.0002,16.0,0.1,43327.0,34177070080,2144219136,ibm/PowerLM-3b,1,lora,8,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0214418220520018,759.0108,1.054,0.132,4317.198 +True,0.29,,baseline-peft-bnb,0.0002,16.0,0.1,19662.0,14319836672,1130088448,ibm/PowerLM-3b,2,lora,4,16.0,q_proj k_proj v_proj o_proj,bfloat16,1.0208434104919433,325.6275,2.457,0.307,5031.517 +True,0.14,,accelerated-peft-bnb,0.0002,16.0,0.1,19379.0,15151677952,2141240832,ibm/PowerLM-3b,1,lora,4,16.0,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.0252878952026367,295.7236,1.353,0.338,5540.308 +True,0.14,,accelerated-peft-bnb,0.0002,16.0,0.1,11316.0,7826759168,1129891840,ibm/PowerLM-3b,2,lora,2,16.0,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.0245559310913086,216.0996,1.851,0.463,3790.845 +True,0.29,,accelerated-peft-bnb,0.0002,16.0,0.1,35837.0,28134687232,2141634048,ibm/PowerLM-3b,1,lora,8,16.0,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.0224976921081543,578.3764,1.383,0.173,5665.515 +True,0.29,,accelerated-peft-bnb,0.0002,16.0,0.1,19600.0,14319836672,1130088448,ibm/PowerLM-3b,2,lora,4,16.0,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.0212884616851807,328.1765,2.438,0.305,4992.436 +True,0.14,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,16347.0,11831656448,2141240832,ibm/PowerLM-3b,1,lora,4,16.0,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.0251048755645753,246.4572,1.623,0.406,6647.808 +True,0.14,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,9796.0,6202137600,1129891840,ibm/PowerLM-3b,2,lora,2,16.0,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.0391525173187257,137.496,2.909,0.727,5957.992 +True,0.29,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,29777.0,21519810560,2141634048,ibm/PowerLM-3b,1,lora,8,16.0,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.0225242614746093,479.5512,1.668,0.209,6833.055 +True,0.29,,accelerated-peft-bnb-foak,0.0002,16.0,0.1,16722.0,11047001088,1130088448,ibm/PowerLM-3b,2,lora,4,16.0,q_proj k_proj v_proj o_proj c_attn,bfloat16,1.0340285396575928,252.7203,3.166,0.396,6483.058 diff --git a/scripts/benchmarks/refs/requirements_granite.txt b/scripts/benchmarks/refs/requirements_granite.txt index 9d58b46..54221b0 100644 --- a/scripts/benchmarks/refs/requirements_granite.txt +++ 
b/scripts/benchmarks/refs/requirements_granite.txt @@ -1,41 +1,86 @@ -accelerate @ git+https://github.com/huggingface/accelerate.git@4305033f8035defad0a87cd38e5c918e78510ba5 -aiohappyeyeballs==2.4.0 -aiohttp==3.10.6 +accelerate==0.34.2 +aiohappyeyeballs==2.4.3 +aiohttp==3.10.9 aiosignal==1.3.1 +anyio==4.6.0 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==2.4.1 +async-lru==2.0.4 async-timeout==4.0.3 attrs==24.2.0 +babel==2.16.0 +beautifulsoup4==4.12.3 bitsandbytes==0.43.3 +bleach==6.1.0 certifi==2024.8.30 -charset-normalizer==3.3.2 +cffi==1.17.1 +charset-normalizer==3.4.0 +comm==0.2.2 contourpy==1.3.0 cycler==0.12.1 datasets==2.21.0 +debugpy==1.8.6 +decorator==5.1.1 +defusedxml==0.7.1 dill==0.3.8 docstring_parser==0.16 einops==0.8.0 +exceptiongroup==1.2.2 +executing==2.1.0 +fastjsonschema==2.20.0 filelock==3.16.1 flash-attn==2.6.3 --e git+https://github.com/foundation-model-stack/fms-acceleration.git@36c7fa7cb4ef413ddfbf55b75eb6135b369de038#egg=fms_acceleration&subdirectory=plugins/framework --e git+https://github.com/foundation-model-stack/fms-acceleration.git@36c7fa7cb4ef413ddfbf55b75eb6135b369de038#egg=fms_acceleration_aadp&subdirectory=plugins/attention-and-distributed-packing --e git+https://github.com/foundation-model-stack/fms-acceleration.git@36c7fa7cb4ef413ddfbf55b75eb6135b369de038#egg=fms_acceleration_foak&subdirectory=plugins/fused-ops-and-kernels --e git+https://github.com/foundation-model-stack/fms-acceleration.git@36c7fa7cb4ef413ddfbf55b75eb6135b369de038#egg=fms_acceleration_peft&subdirectory=plugins/accelerated-peft -fms-hf-tuning @ git+https://github.com/foundation-model-stack/fms-hf-tuning.git@8676d0177c2f9436cf176342f38c3b5602f91751 +-e git+https://github.com/foundation-model-stack/fms-acceleration.git@10cc000c12a57774f68e9da861dbbc7eaa559816#egg=fms_acceleration&subdirectory=plugins/framework +-e git+https://github.com/foundation-model-stack/fms-acceleration.git@10cc000c12a57774f68e9da861dbbc7eaa559816#egg=fms_acceleration_aadp&subdirectory=plugins/attention-and-distributed-packing +-e git+https://github.com/foundation-model-stack/fms-acceleration.git@10cc000c12a57774f68e9da861dbbc7eaa559816#egg=fms_acceleration_foak&subdirectory=plugins/fused-ops-and-kernels +-e git+https://github.com/foundation-model-stack/fms-acceleration.git@10cc000c12a57774f68e9da861dbbc7eaa559816#egg=fms_acceleration_peft&subdirectory=plugins/accelerated-peft +fms-hf-tuning @ git+https://github.com/foundation-model-stack/fms-hf-tuning.git@b33634c8ce5c85b6d10daa187fd1a069db969b67 fonttools==4.54.1 +fqdn==1.5.1 frozenlist==1.4.1 fsspec==2024.6.1 -huggingface-hub==0.25.1 +h11==0.14.0 +httpcore==1.0.6 +httpx==0.27.2 +huggingface-hub==0.25.2 idna==3.10 +ipykernel==6.29.5 +ipython==8.28.0 +isoduration==20.11.0 +jedi==0.19.1 Jinja2==3.1.4 +json5==0.9.25 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +jupyter-events==0.10.0 +jupyter-lsp==2.2.5 +jupyter_client==8.6.3 +jupyter_core==5.7.2 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.5 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 kiwisolver==1.4.7 llvmlite==0.43.0 markdown-it-py==3.0.0 -MarkupSafe==2.1.5 +MarkupSafe==3.0.1 matplotlib==3.9.2 +matplotlib-inline==0.1.7 mdurl==0.1.2 +mistune==3.0.2 mpmath==1.3.0 multidict==6.1.0 multiprocess==0.70.16 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +nest-asyncio==1.6.0 networkx==3.3 +notebook_shim==0.2.4 numba==0.60.0 numpy==1.26.4 nvidia-cublas-cu12==12.1.3.1 @@ -48,39 +93,72 @@ nvidia-curand-cu12==10.3.2.106 
nvidia-cusolver-cu12==11.4.5.107 nvidia-cusparse-cu12==12.1.0.106 nvidia-nccl-cu12==2.20.5 -nvidia-nvjitlink-cu12==12.6.68 +nvidia-nvjitlink-cu12==12.6.77 nvidia-nvtx-cu12==12.1.105 +overrides==7.7.0 packaging==24.1 pandas==2.2.3 +pandocfilters==1.5.1 +parso==0.8.4 peft==0.12.0 +pexpect==4.9.0 pillow==10.4.0 +platformdirs==4.3.6 +prometheus_client==0.21.0 +prompt_toolkit==3.0.48 +propcache==0.2.0 protobuf==5.28.2 psutil==6.0.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 pyarrow==17.0.0 +pycparser==2.22 Pygments==2.18.0 pyparsing==3.1.4 python-dateutil==2.9.0.post0 +python-json-logger==2.0.7 pytz==2024.2 PyYAML==6.0.2 +pyzmq==26.2.0 +referencing==0.35.1 regex==2024.9.11 requests==2.32.3 -rich==13.8.1 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rich==13.9.2 +rpds-py==0.20.0 safetensors==0.4.5 +Send2Trash==1.8.3 sentencepiece==0.2.0 shtab==1.7.1 simpleeval==0.9.13 six==1.16.0 +sniffio==1.3.1 +soupsieve==2.6 +stack-data==0.6.3 sympy==1.13.3 +tabulate==0.9.0 +terminado==0.18.1 threadpoolctl==3.5.0 +tinycss2==1.3.0 tokenizers==0.20.0 +tomli==2.0.2 torch==2.4.1 +tornado==6.4.1 tqdm==4.66.5 -transformers==4.45.0 +traitlets==5.14.3 +transformers==4.45.2 triton==3.0.0 -trl @ git+https://github.com/huggingface/trl.git@9af4734178d4436a8dc98a069042eedd2ccf178f +trl @ git+https://github.com/huggingface/trl.git@9b80f3d50ccb98ceee94bab4145a36e7e58aa4eb +types-python-dateutil==2.9.0.20241003 typing_extensions==4.12.2 tyro==0.8.11 tzdata==2024.2 +uri-template==1.3.0 urllib3==2.2.3 +wcwidth==0.2.13 +webcolors==24.8.0 +webencodings==0.5.1 +websocket-client==1.8.0 xxhash==3.5.0 -yarl==1.12.1 +yarl==1.14.0
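
The central technique in this series (patches 3 and 7) is the `ensure_weights_retied` wrapper in `fsdp_utils.py`: when FSDP materializes modules from the meta device, its `param_init_fn` re-allocates each module's own parameters, which silently breaks weight tying (for example an `lm_head` tied to the input embeddings), so the wrapper records which parameters are tied before re-allocation and re-ties them afterwards. The sketch below is illustrative only and is not part of the patch series; `ToyLM`, `plain_init`, and `make_retied_init` are hypothetical, simplified stand-ins for the plugin code, and it assumes a PyTorch version recent enough to support `torch.device("meta")` as a context manager and `Module.to_empty(..., recurse=False)` (roughly 2.1+).

from collections import defaultdict

import torch


class ToyLM(torch.nn.Module):
    # mirrors the transformers convention that ensure_weights_retied reads
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self):
        super().__init__()
        self.embed = torch.nn.Embedding(8, 4)
        self.lm_head = torch.nn.Linear(4, 8, bias=False)
        self.lm_head.weight = self.embed.weight  # tie output head to embeddings


def plain_init(module):
    # roughly what accelerate installs as the FSDP param_init_fn in
    # low_cpu_mem mode: re-allocate only this module's own parameters
    module.to_empty(device="cpu", recurse=False)
    return module


def make_retied_init(init_fn, model):
    # simplified re-implementation of ensure_weights_retied from fsdp_utils.py
    tied = {}
    for key in model._tied_weights_keys:
        mod_name, _, param_name = key.rpartition(".")
        param = getattr(model.get_submodule(mod_name), param_name)
        tied[id(param)] = None  # filled the first time the param is materialized

    def init_fn_tied(module):
        # note which of this module's own params are tied, before re-allocation
        to_tie = defaultdict(list)
        for name, param in module.named_parameters(recurse=False):
            if id(param) in tied:
                to_tie[id(param)].append(name)
        init_fn(module)  # may replace the meta params with fresh tensors
        for key, names in to_tie.items():
            for name in names:
                if tied[key] is None:
                    tied[key] = getattr(module, name)  # first occurrence wins
                else:
                    setattr(module, name, tied[key])  # later occurrences re-tied
        return module

    return init_fn_tied


with torch.device("meta"):
    broken, fixed = ToyLM(), ToyLM()

# materialize module by module, as FSDP does
for m in broken.modules():
    plain_init(m)
print(broken.lm_head.weight is broken.embed.weight)  # False: tying was lost

retied_init = make_retied_init(plain_init, fixed)
for m in fixed.modules():
    retied_init(m)
print(fixed.lm_head.weight is fixed.embed.weight)  # True: tying preserved

In the plugins themselves, `get_callbacks_and_ready_for_train` applies the real wrapper to `accelerator.state.fsdp_plugin.param_init_fn`, passing the base (non-PEFT-wrapped) model so that its `_tied_weights_keys` are used, and only when the transformers (>= 4.45) and trl (>= 0.12) version gates in patches 3 to 5 are satisfied.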