Refactored Model Patcher Class #55

Merged: 28 commits, Jul 29, 2024

Changes from 17 commits

Commits (28)
7941ed7 set main to track current plugin versions (achew010, Jul 17, 2024)
4b871d0 move model_patcher to framework (achew010, Jul 17, 2024)
3bf9a55 replace local patching with model_patcher (achew010, Jul 18, 2024)
815b0c8 add additional unit tests (achew010, Jul 18, 2024)
7efbfed remove redundant patch function (achew010, Jul 18, 2024)
33258ba shifted patch summary logging to framework plugin and patch id renames (achew010, Jul 18, 2024)
af7009c modified unit tests from PR comments (achew010, Jul 20, 2024)
6b6fca9 incremental refactor of unit tests (achew010, Jul 22, 2024)
252a73c changes to mp trigger unit tests (achew010, Jul 23, 2024)
94e217e additional changes to trigger unit tests (achew010, Jul 23, 2024)
a31bf6e adding MP Rule unit tests (achew010, Jul 23, 2024)
2683d9e add context manager to isolate patching unit tests (achew010, Jul 24, 2024)
748595c some fixes (fabianlim, Jul 24, 2024)
9438aba clarified comments (fabianlim, Jul 25, 2024)
8c825d9 modelpatcher unit tests (achew010, Jul 24, 2024)
df95ece added forward_builder fn unit test (achew010, Jul 25, 2024)
e653b80 lint changes (achew010, Jul 25, 2024)
e6f2284 more lint changes (achew010, Jul 25, 2024)
736e706 file renaming and added license headers on new files (achew010, Jul 26, 2024)
7c302ba added guard to patch model only if model exist in framework plugin ca… (achew010, Jul 26, 2024)
cd253b3 replaced buggy partial wrapping on ModelPatcher.patch and set tox env… (achew010, Jul 27, 2024)
1d498e0 additional linting (achew010, Jul 28, 2024)
a4f8800 shifted patch trigger to main framework class (achew010, Jul 29, 2024)
ac31192 additional modifications to foak patch rules (achew010, Jul 29, 2024)
8895cad linting (achew010, Jul 29, 2024)
f6848a7 additional changes from comments (achew010, Jul 29, 2024)
5e535b2 fixes to mp unit test (achew010, Jul 29, 2024)
c204c86 updated with new benchmark results (achew010, Jul 29, 2024)
2 changes: 1 addition & 1 deletion plugins/accelerated-peft/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "fms-acceleration-peft"
-version = '0.0.1'
+version = '0.1.0.1.dev'
 description = "FMS Acceleration for PeFT"
 authors = [
     {name = "Fabian Lim", email = "[email protected]"},
plugins/accelerated-peft/src/fms_acceleration_peft/autogptq_utils.py
@@ -24,35 +24,45 @@
 from peft.tuners.lora.gptq import QuantLinear as LoraLinearGPTQ
 import torch
 
+from fms_acceleration.model_patcher import ModelPatcher, ModelPatcherRule, ModelPatcherTrigger
+from functools import partial
 
 # these parameters are to be patched for triton v2
 # consider making a map if patching more kernels
 PATCH_FOR_FSDP_TRITON_V2 = ["qweight", "qzeros"]
 
 
-# This function may be moved after merging
-# https://github.com/foundation-model-stack/fms-acceleration/pull/25
-def _patch_target_module(
-    to_patch: str,
-    replace_with: Any,
-    target_module: str = None,
+def build_patch_to_view_tensor_to_parameter_for_fsdp_gptq(
+    module,
+    torch_dtype,
 ):
-    to_patch = to_patch.split(".")
-    assert len(to_patch) > 1, "must have an object to patch"
-
-    to_patch, obj_name_to_patch = to_patch[:-1], to_patch[-1]
-    to_patch = ".".join(to_patch)
-    source = importlib.import_module(to_patch)
-    original_obj = getattr(source, obj_name_to_patch)
-    setattr(source, obj_name_to_patch, replace_with)
-
-    if target_module is not None:
-        # reload and this should get the patched object
-        target_module = importlib.import_module(target_module)
-        importlib.reload(target_module)
-
-        # replace it
-        setattr(source, obj_name_to_patch, original_obj)
+    # convert all patched attributes to Parameters of torch_dtype
+    # so FSDP can shard them
+    for attr_name in PATCH_FOR_FSDP_TRITON_V2:
+        attr = getattr(module, attr_name)
+        attr = torch.nn.Parameter(
+            attr.view(torch_dtype), requires_grad=False
+        )
+        setattr(module, attr_name, attr)
+
+    # this patches the forward to convert them back to original
+    # type (i.e. int32) before the function call into the kernels
+    return patch_forward_to_view_attributes_before_call(
+        module.forward,
+        attribute_names=PATCH_FOR_FSDP_TRITON_V2,
+        torch_dtype=torch.int32,  # patch it back to
+    )
+
+def register_tensors_as_parameters_patch_rule(target_module, torch_dtype):
+    # Register patch
+    ModelPatcher.register(
+        ModelPatcherRule(
+            rule_id="autogptq_patch_tensors_as_float_parameters",
+            trigger=ModelPatcherTrigger(check=target_module),
+            forward_builder = build_patch_to_view_tensor_to_parameter_for_fsdp_gptq,
+            forward_builder_args=["torch_dtype"],
+        )
+    )
+    ModelPatcher.patch = partial(ModelPatcher.patch, torch_dtype=torch_dtype)
 
 def make_sure_no_tensor_in_meta_device(
     model,
@@ -124,7 +134,6 @@ def create_new_module_peft(
     # if module cannot be found, return None which results in a raise in the call-stack
     return new_module
 
-
 # consider to move this somewhere more general
 def patch_forward_to_view_attributes_before_call(
     old_forward: Callable,
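The two helpers added above encode one trick: FSDP only shards `nn.Parameter`s, so the packed int32 GPTQ buffers listed in `PATCH_FOR_FSDP_TRITON_V2` are re-viewed as the training dtype and wrapped as non-trainable parameters, while the patched forward views them back to int32 just before the kernel runs. Below is a minimal standalone sketch of that view round-trip; the tensor name and shapes are illustrative, not the plugin code.

```python
import torch

# pretend this is a packed int32 GPTQ buffer such as `qweight`
qweight_int32 = torch.randint(0, 2**31 - 1, (4, 4), dtype=torch.int32)

# re-view the same memory as float16 and wrap it as a non-trainable Parameter,
# which is what makes FSDP willing to shard it
# (needs a torch version that supports dtype views across element sizes)
qweight_param = torch.nn.Parameter(
    qweight_int32.view(torch.float16),  # (4, 4) int32 -> (4, 8) float16, no copy
    requires_grad=False,
)

# before the kernel call, view it back to int32: same storage, identical bits
restored = qweight_param.data.view(torch.int32)
assert restored.shape == (4, 4)
assert torch.equal(restored, qweight_int32)
```

Because `.view(dtype)` reinterprets the same storage, no copy is made and the bit pattern survives the round trip, which is what lets the triton kernels keep working on FSDP-sharded weights.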
plugins/accelerated-peft/src/fms_acceleration_peft/framework_plugin_autogptq.py
@@ -24,6 +24,7 @@

 # Third Party
 from fms_acceleration import AccelerationPlugin
+from fms_acceleration.model_patcher import patch_target_module
 from peft import LoraConfig, prepare_model_for_kbit_training
 from peft.tuners.lora.model import LoraModel
 from transformers import AutoModelForCausalLM, TrainingArguments
@@ -81,11 +82,6 @@ def model_loader(self, model_name: str, **kwargs):
             from .gptqmodel.nn_modules.qlinear.qlinear_tritonv2 import (  # pylint: disable=import-outside-toplevel,import-error
                 QuantLinear,
             )
-            # Local
-            from .autogptq_utils import (  # pylint: disable=import-outside-toplevel
-                PATCH_FOR_FSDP_TRITON_V2,
-                patch_forward_to_view_attributes_before_call,
-            )
 
             # Currently we allow only a quantized checkpoint to be loaded, we do not
             # implement the quantization process here.
@@ -143,14 +139,11 @@ def model_loader(self, model_name: str, **kwargs):
             kwargs["low_cpu_mem_usage"] = True
             if self.use_external_lib:
                 # Local
-                from .autogptq_utils import (  # pylint: disable=import-outside-toplevel
-                    _patch_target_module,
-                    make_sure_no_tensor_in_meta_device,
-                )
+                from .autogptq_utils import make_sure_no_tensor_in_meta_device  # pylint: disable=import-outside-toplevel
 
                 # We patch `make_sure_no_tensor_in_meta_device`
                 # from autogptq to avoid errors on models without bias
-                _patch_target_module(
+                patch_target_module(
                     to_patch="auto_gptq.modeling._utils.make_sure_no_tensor_in_meta_device",
                     replace_with=make_sure_no_tensor_in_meta_device,
                     target_module="auto_gptq.modeling._base",
@@ -201,31 +194,15 @@
                 world_size > 1
                 and os.environ.get("ACCELERATE_USE_FSDP", "false").lower() == "true"
             ):
+                # register FSDP patch
+                from .autogptq_utils import register_tensors_as_parameters_patch_rule
+                register_tensors_as_parameters_patch_rule(
+                    target_module=QuantLinear,
+                    torch_dtype=torch_dtype,
+                )
 
-                # patch all the QuantLinear base layers
-                for mod in model.modules():
-                    if isinstance(mod, QuantLinear):
-
-                        # convert all patched attributes to Parameters of torch_dtype
-                        # so FSDP can shard them
-                        for attr_name in PATCH_FOR_FSDP_TRITON_V2:
-                            attr = getattr(mod, attr_name)
-                            attr = torch.nn.Parameter(
-                                attr.view(torch_dtype), requires_grad=False
-                            )
-                            setattr(mod, attr_name, attr)
-
-                        # this patches the forward to convert them back to original
-                        # type (i.e. int32) before the function call into the kernels
-                        _forward = patch_forward_to_view_attributes_before_call(
-                            mod.forward,
-                            attribute_names=PATCH_FOR_FSDP_TRITON_V2,
-                            torch_dtype=torch.int32,  # patch it back to
-                        )
-                        mod.forward = MethodType(_forward, mod)
-
-        # replace
-        AutoModelForCausalLM.from_config = _old_from_config
+        # replace
+        AutoModelForCausalLM.from_config = _old_from_config
 
         # AutoGPTQ does not set the torch_dtype of the model carefully
         model.config.torch_dtype = torch_dtype
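The plugin now calls the framework-level `patch_target_module` instead of its local `_patch_target_module`. Judging from the removed helper, the call boils down to importing the module that owns the dotted target, rebinding that attribute to the replacement, and optionally reloading a dependent module so it picks up the change. A simplified standalone sketch of the rebinding step follows; the helper name here is made up for the sketch, and the real `patch_target_module` may differ in details such as the `target_module` reload.

```python
import importlib
from typing import Any

def rebind_dotted_target(to_patch: str, replace_with: Any):
    # "pkg.mod.obj" -> import "pkg.mod" and rebind its attribute "obj"
    module_path, _, obj_name = to_patch.rpartition(".")
    assert module_path, "must have an object to patch"
    module = importlib.import_module(module_path)
    setattr(module, obj_name, replace_with)

# illustration only: stub out os.path.exists for this process, then restore it
import os.path
_original_exists = os.path.exists
rebind_dotted_target("os.path.exists", lambda p: True)
assert os.path.exists("surely/not/a/real/path")
os.path.exists = _original_exists
```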
2 changes: 1 addition & 1 deletion plugins/framework/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "fms-acceleration"
-version = '0.1.1.dev'
+version = '0.1.2.dev'
 description = "FMS Acceleration Plugin Framework"
 authors = [
     {name = "Fabian Lim", email = "[email protected]"},
32 changes: 31 additions & 1 deletion plugins/framework/src/fms_acceleration/framework_plugin.py
@@ -14,16 +14,38 @@

 # Standard
 from dataclasses import dataclass
-from typing import Any, Dict, List, Optional, Set, Tuple
+from typing import Any, Dict, List, Optional, Set, Tuple, Callable
 import importlib
 import sys
 
 # Third Party
 from accelerate import Accelerator
 from peft import LoraConfig
+from transformers.utils import logging
 from transformers import TrainingArguments
 import torch
 
+# want to use the transformers logger, but a bit of pain
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+logger.setLevel(logging._get_default_logging_level())
+logger.addHandler(logging._default_handler)
+
+def log_patch_summary(
+    logging_func: Callable = None,
+):
+    if logging_func is None:
+        logging_func = print
+
+    # this is a guarded import, because the model rule registration
+    # does not need to be loaded unless patch_model is required
+    # Local
+    from fms_acceleration.model_patcher import (  # pylint: disable=import-outside-toplevel
+        patch_model_summary,
+    )
+
+    for line in patch_model_summary().split("\n"):
+        logging_func(line)
+
 
 @dataclass
 class PluginRegistration:
@@ -146,6 +168,14 @@ def augmentation(
     def get_callbacks_and_ready_for_train(
         self, model: torch.nn.Module = None, accelerator: Accelerator = None
     ):
+        # Finally apply all registered patches to the model
+        from .model_patcher import ModelPatcher  # pylint: disable=import-outside-toplevel
+        ModelPatcher.patch(model)
+
+        # if patching is done, print patch summary to logger
+        if len(ModelPatcher.history)>0:
+            log_patch_summary(logging_func=logger.info)
+
         return []
 
     def _check_config_and_maybe_check_values(
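Together with the AutoGPTQ changes above, the intended flow is: a plugin registers `ModelPatcherRule`s while loading or augmenting the model, and the base plugin class in this file applies them once in `get_callbacks_and_ready_for_train`, then logs the patch summary. Below is a hedged end-to-end sketch that uses only names appearing in this PR; the toy rule, builder, and model are illustrative, and the `forward_builder` contract (take the module, return an unbound forward) is assumed from the AutoGPTQ rule above.

```python
import torch
from fms_acceleration.model_patcher import (
    ModelPatcher,
    ModelPatcherRule,
    ModelPatcherTrigger,
    patch_model_summary,
)

def passthrough_forward_builder(module):
    # assumed contract (from the autogptq rule above): receive the module,
    # return an unbound forward that ModelPatcher installs on it
    original_forward = module.forward
    def forward(self, *args, **kwargs):
        return original_forward(*args, **kwargs)  # no-op wrap, for illustration
    return forward

# 1. a plugin registers its rule (normally inside model_loader / augmentation)
ModelPatcher.register(
    ModelPatcherRule(
        rule_id="toy_passthrough_on_linear",
        trigger=ModelPatcherTrigger(check=torch.nn.Linear),
        forward_builder=passthrough_forward_builder,
    )
)

# 2. the framework applies all registered rules right before training
model = torch.nn.Sequential(torch.nn.Linear(4, 4))
ModelPatcher.patch(model)

# 3. and, if anything was patched, logs the patch summary
if len(ModelPatcher.history) > 0:
    for line in patch_model_summary().split("\n"):
        print(line)
```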