diff --git a/python/llm/dev/benchmark/all-in-one/run.py b/python/llm/dev/benchmark/all-in-one/run.py index f5c0ecfacad..ddd130d2ac4 100644 --- a/python/llm/dev/benchmark/all-in-one/run.py +++ b/python/llm/dev/benchmark/all-in-one/run.py @@ -121,7 +121,7 @@ def run_model(repo_id, test_api, in_out_pairs, local_model_hub=None, warm_up=1, low_bit, cpu_embedding if 'win' in test_api else 'N/A', round(result[in_out_pair][-1][5], 2), - result[in_out_pair][-1][6] if any(keyword in test_api for keyword in ['int4_gpu', 'int4_fp16_gpu_win', 'int4_loadlowbit_gpu' ]) else 'N/A', + result[in_out_pair][-1][6] if any(keyword in test_api for keyword in ['int4_gpu', 'int4_fp16_gpu_win', 'int4_loadlowbit_gpu', 'fp16_gpu']) else 'N/A', streaming if 'win' in test_api else 'N/A'], ) @@ -716,7 +716,7 @@ def run_bigdl_fp16_gpu(repo_id, print(output[0]) if i >= warm_up: result[in_out].append([model.first_cost, model.rest_cost_mean, model.encoder_time, - actual_in_len, actual_out_len, load_time]) + actual_in_len, actual_out_len, load_time, model.peak_memory]) del model torch.xpu.empty_cache() return result diff --git a/python/llm/src/ipex_llm/transformers/models/llama.py b/python/llm/src/ipex_llm/transformers/models/llama.py index 45d944c5e4d..68391499d0d 100644 --- a/python/llm/src/ipex_llm/transformers/models/llama.py +++ b/python/llm/src/ipex_llm/transformers/models/llama.py @@ -622,7 +622,7 @@ def llama_attention_forward_4_31_original( is_causal=True) attn_weights = None elif not self.training and not hidden_states.requires_grad and \ - use_esimd_sdp(q_len, key_states.shape[2], self.head_dim, query_states): + use_esimd_sdp(q_len, key_states.shape[2], self.head_dim, query_states, attention_mask): import linear_fp16_esimd attn_output = linear_fp16_esimd.sdp_forward(query_states, key_states, diff --git a/python/llm/src/ipex_llm/transformers/models/utils.py b/python/llm/src/ipex_llm/transformers/models/utils.py index 1a4e1f0b94d..79c1ffecf71 100644 --- a/python/llm/src/ipex_llm/transformers/models/utils.py +++ b/python/llm/src/ipex_llm/transformers/models/utils.py @@ -299,7 +299,7 @@ def use_flash_attention(query, key, attention_mask=None): return True -def use_esimd_sdp(q_len, k_len, head_dim, query_states): +def use_esimd_sdp(q_len, k_len, head_dim, query_states, attention_mask=None): if head_dim != 128: # esimd_sdp only support head_dim = 128 now return False @@ -315,17 +315,23 @@ def use_esimd_sdp(q_len, k_len, head_dim, query_states): elif query_states.dtype != torch.float16: # esimd_sdp only has optimization for FP16 now return False - else: - device_name = torch.xpu.get_device_name(query_states.device.index) - if device_name.startswith("Intel(R) Arc(TM) A") or \ - device_name.startswith("Intel(R) Data Center GPU Flex"): - import linear_fp16_esimd - if hasattr(linear_fp16_esimd, "sdp_forward"): - return True - else: - return False - else: + elif query_states.shape[0] > 1 and attention_mask is not None: + # for batched input, can't accept attention_mask + # TODO: this check needs some time + if not torch.all(attention_mask.eq(0)): + return False + + device_name = torch.xpu.get_device_name(query_states.device.index) + if device_name.startswith("Intel(R) Arc(TM) A") or \ + device_name.startswith("Intel(R) Data Center GPU Flex") or \ + device_name.startswith("Intel(R) Data Center GPU Max"): + import linear_fp16_esimd + if not hasattr(linear_fp16_esimd, "sdp_forward"): return False + else: + return False + + return True def mlp_fusion_check(x, qtype, training):