diff --git a/deepspeed/module_inject/auto_tp.py b/deepspeed/module_inject/auto_tp.py
index 396fe7db2447..d80f0bf612b3 100644
--- a/deepspeed/module_inject/auto_tp.py
+++ b/deepspeed/module_inject/auto_tp.py
@@ -108,6 +108,10 @@ def tp_parser(model):
                     gem_list = gem_list + [layer]
                 elif 'down_proj' in layer:
                     gem_list = gem_list + [layer]
+                elif 'self_attention.dense' in layer and 'falcon' in str(
+                        type(module)):  # this is a hack to get the right linear layer for this model!
+                    gem_list = gem_list + [layer]
+
             layer_list = []
             if gem_list != []:
                 gem_list = list(set(gem_list))
diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py
index f0fe81f28714..4753f55b9ea3 100644
--- a/deepspeed/module_inject/replace_module.py
+++ b/deepspeed/module_inject/replace_module.py
@@ -426,38 +426,14 @@ def _slice_embedding(child, name, conv_linear_layer):
         def update_mp_params(child):
             if getattr(child, "replaced", False) == True:
                 return
-            if hasattr(child, 'n_heads'):
-                assert child.n_heads % mp_size == 0, "n_heads ({}) must be divisible by mp_size ({})".format(
-                    child.n_heads, mp_size)
-                child.n_heads = child.n_heads // mp_size
-            if hasattr(child, 'inner_dim'):
-                assert child.inner_dim % mp_size == 0, "inner_dim ({}) must be divisible by mp_size ({})".format(
-                    child.inner_dim, mp_size)
-                child.inner_dim = child.inner_dim // mp_size
-            if hasattr(child, 'num_heads'):
-                assert child.num_heads % mp_size == 0, "num_heads ({}) must be divisible by mp_size ({})".format(
-                    child.num_heads, mp_size)
-                child.num_heads = child.num_heads // mp_size
-            if hasattr(child, 'num_attention_heads'):
-                assert child.num_attention_heads % mp_size == 0, "num_attention_heads ({}) must be divisible by mp_size ({})".format(
-                    child.num_attention_heads, mp_size)
-                child.num_attention_heads = child.num_attention_heads // mp_size
-            if hasattr(child, 'num_attn_heads'):
-                assert child.num_attn_heads % mp_size == 0, "num_attn_heads ({}) must be divisible by mp_size ({})".format(
-                    child.num_attn_heads, mp_size)
-                child.num_attn_heads = child.num_attn_heads // mp_size
-            if hasattr(child, 'all_head_size'):
-                assert child.all_head_size % mp_size == 0, "all_head_size ({}) must be divisible by mp_size ({})".format(
-                    child.all_head_size, mp_size)
-                child.all_head_size = child.all_head_size // mp_size
-            if hasattr(child, 'embed_dim'):
-                assert child.embed_dim % mp_size == 0, "embed_dim must ({}) be divisible by mp_size ({})".format(
-                    child.embed_dim, mp_size)
-                child.embed_dim = child.embed_dim // mp_size
-            if hasattr(child, 'hidden_size'):
-                assert child.hidden_size % mp_size == 0, "hidden_size ({}) must be divisible by mp_size ({})".format(
-                    child.hidden_size, mp_size)
-                child.hidden_size = child.hidden_size // mp_size
+            for param in [
+                    "n_heads", "inner_dim", "num_heads", "num_kv", "num_attention_heads", "num_attn_heads",
+                    "all_head_size", "embed_dim", "hidden_size"
+            ]:
+                if hasattr(child, param):
+                    param_val = getattr(child, param)
+                    assert param_val % mp_size == 0, f"{param} ({param_val}) must be divisible by mp_size ({mp_size})"
+                    setattr(child, param, param_val // mp_size)
             setattr(child, "replaced", True)

         conv_linear_layer = False
@@ -495,6 +471,16 @@ def _replace_module(r_module, prev_name='', prev_class_name=''):
                 if child.__class__ in linear_policies:
                     setattr(r_module, name, linear_policies[child.__class__](child, prev_name + '.'
                                                                              + name, conv_linear_layer))
+                elif any(isinstance(child, lp) for lp in linear_policies):
+                    # Added for falcon model support
+                    # Note: isinstance will account for class inheritance, child.__class__ does not
+                    key = None
+                    for lp in linear_policies:
+                        if isinstance(child, lp):
+                            key = lp
+                            break
+                    assert key is not None
+                    setattr(r_module, name, linear_policies[key](child, prev_name + '.' + name, conv_linear_layer))
                 else:
                     update_mp_params(child)
                     _replace_module(child, name, class_name)
diff --git a/tests/unit/inference/test_inference.py b/tests/unit/inference/test_inference.py
index a9da94d5d30f..c9073e66b67b 100644
--- a/tests/unit/inference/test_inference.py
+++ b/tests/unit/inference/test_inference.py
@@ -13,7 +13,7 @@
 from unit.common import DistributedTest
 from packaging import version as pkg_version
 from deepspeed.ops.op_builder import OpBuilder
-from transformers import pipeline
+from transformers import pipeline, AutoTokenizer
 from transformers.models.t5.modeling_t5 import T5Block
 from transformers.models.roberta.modeling_roberta import RobertaLayer
 from huggingface_hub import HfApi
@@ -380,6 +380,46 @@ def test(
         assert assert_fn(bs_output, ds_output)


+@pytest.mark.seq_inference
+@pytest.mark.parametrize("model_w_task", [("tiiuae/falcon-7b", "text-generation")], ids=["falcon"])
+class TestAutoTP(DistributedTest):
+    world_size = 1
+
+    def test(
+        self,
+        model_w_task,
+        query,
+        inf_kwargs,
+        assert_fn,
+    ):
+        # TODO: enable this test for H100 tests
+        pytest.skip("Not enough GPU memory for this on V100 runners")
+        model, task = model_w_task
+        dtype = torch.bfloat16
+        local_rank = int(os.getenv("LOCAL_RANK", "0"))
+
+        # We have to load these large models on CPU with pipeline because not
+        # enough GPU memory
+        tokenizer = AutoTokenizer.from_pretrained(model, use_fast=True)
+        pipe = pipeline(task,
+                        model=model,
+                        tokenizer=tokenizer,
+                        torch_dtype=dtype,
+                        trust_remote_code=True,
+                        device=torch.device("cpu"),
+                        framework="pt")
+        #bs_output = pipe(query, **inf_kwargs)
+
+        pipe.model = deepspeed.init_inference(pipe.model, mp_size=self.world_size, replace_with_kernel_inject=False)
+        # Switch device to GPU so that input tensors are not on CPU
+        pipe.device = torch.device(get_accelerator().device_name(local_rank))
+        ds_output = pipe(query, **inf_kwargs)
+
+        #print(local_rank, "baseline", bs_output)
+        print(local_rank, "deepspeed", ds_output)
+        #assert assert_fn(bs_output, ds_output)
+
+
 @pytest.mark.seq_inference
 @pytest.mark.parametrize(
     "model_w_task, injection_policy",