
Commit

remove obsolete npu code (#11967)
yangw1234 authored Aug 29, 2024
1 parent a9e485e commit fbf088f
Showing 3 changed files with 5 additions and 780 deletions.
15 changes: 5 additions & 10 deletions python/llm/src/ipex_llm/transformers/npu_models/convert.py
@@ -81,21 +81,16 @@ def optimize_llm(model: torch.nn.Module):
         from ipex_llm.transformers.npu_models.llama import merge_qkv
         from ipex_llm.transformers.npu_models.llama import merge_mlp
         from ipex_llm.transformers.npu_models.llama import llama_model_forward
-        from ipex_llm.transformers.npu_models.llama import llama_fused_model_forward
         from ipex_llm.transformers.npu_models.llama import llama_attention_forward
         from ipex_llm.transformers.npu_models.llama import llama_mlp_forward
         from transformers.models.llama.modeling_llama import LlamaModel
         from transformers.models.llama.modeling_llama import LlamaAttention
         from transformers.models.llama.modeling_llama import LlamaMLP
-        if hasattr(model, 'pipeline_parallel_stages'):
-            # experimental support for fused decoderlayer implementation
-            convert_forward(model, LlamaModel, llama_fused_model_forward)
-        else:
-            model.apply(merge_qkv)
-            model.apply(merge_mlp)
-            convert_forward(model, LlamaModel, llama_model_forward)
-            convert_forward(model, LlamaAttention, llama_attention_forward)
-            convert_forward(model, LlamaMLP, llama_mlp_forward)
+        model.apply(merge_qkv)
+        model.apply(merge_mlp)
+        convert_forward(model, LlamaModel, llama_model_forward)
+        convert_forward(model, LlamaAttention, llama_attention_forward)
+        convert_forward(model, LlamaMLP, llama_mlp_forward)
 
     elif model.config.model_type == "mistral":
         from ipex_llm.transformers.npu_models.mistral import merge_qkv
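For orientation, the convert_forward(model, ModuleClass, new_forward) calls in the hunk above follow the usual forward-replacement (monkey-patching) pattern: walk the module tree and rebind forward on every instance of the target class. The sketch below only illustrates that pattern; convert_forward_sketch is a hypothetical name and this is not the actual ipex-llm implementation.

# Minimal sketch of a forward-replacement helper in the style of the
# convert_forward(model, module_class, new_forward) calls above.
# NOT the actual ipex-llm code: it only shows the common pattern of
# walking the module tree and rebinding forward on matching modules.
import types

import torch


def convert_forward_sketch(model: torch.nn.Module, target_cls: type, new_forward) -> None:
    for module in model.modules():
        if isinstance(module, target_cls):
            # Bind new_forward to this instance so nn.Module.__call__
            # dispatches to it instead of the stock forward.
            module.forward = types.MethodType(new_forward, module)

The model.apply(merge_qkv) and model.apply(merge_mlp) lines rest on the same traversal idea: nn.Module.apply visits every submodule, which is how those merge helpers get a chance to rewrite matching submodules in place before the patched forwards run.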
131 changes: 0 additions & 131 deletions python/llm/src/ipex_llm/transformers/npu_models/llama.py
@@ -182,137 +182,6 @@ def llama_model_forward(
)


def llama_fused_model_forward(
self,
input_ids: torch.LongTensor = None,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.LongTensor] = None,
past_key_values: Optional[List[torch.FloatTensor]] = None,
inputs_embeds: Optional[torch.FloatTensor] = None,
use_cache: Optional[bool] = None,
output_attentions: Optional[bool] = None,
output_hidden_states: Optional[bool] = None,
return_dict: Optional[bool] = None,
cache_position: Optional[torch.LongTensor] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
output_attentions = (
output_attentions if output_attentions is not None
else self.config.output_attentions
)
output_hidden_states = (
output_hidden_states if output_hidden_states is not None
else self.config.output_hidden_states
)
use_cache = use_cache if use_cache is not None else self.config.use_cache
return_dict = return_dict if return_dict is not None else self.config.use_return_dict

if (input_ids is None) ^ (inputs_embeds is not None):
invalidInputError(False,
("You cannot specify both input_ids and inputs_embeds at the same time, "
"and must specify either one"))

if self.gradient_checkpointing and self.training and use_cache:
use_cache = False

if inputs_embeds is None:
inputs_embeds = self.embed_tokens(input_ids)

past_seen_tokens = 0

# ipex-llm changes start
from ipex_llm.transformers.npu_models.kv import DynamicFusedNormalCache
if use_cache and not isinstance(past_key_values, DynamicFusedNormalCache):
past_key_values = DynamicFusedNormalCache.from_legacy_cache(past_key_values)
past_seen_tokens = past_key_values.get_seq_length()

if cache_position is None:
cache_position = torch.arange(past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1],
device=inputs_embeds.device)
# ipex-llm changes end

if position_ids is None:
position_ids = cache_position.unsqueeze(0)

causal_mask = self._update_causal_mask(attention_mask, inputs_embeds,
cache_position, past_seen_tokens)

# embed positions
hidden_states = inputs_embeds

# decoder layers
all_hidden_states = () if output_hidden_states else None
all_self_attns = () if output_attentions else None
next_decoder_cache = None

seq_len = hidden_states.size(1)

if seq_len == 1:
# multi_decoder = self.layers[(self.layer_end + 1) % num_layers]
layer_outputs = self.multi_decoder(hidden_states,
attention_mask=causal_mask,
position_ids=position_ids,
past_key_value=past_key_values,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,)
hidden_states = layer_outputs[0]

next_decoder_cache = layer_outputs[1]
else:
for decoder_layer in self.layers:
if output_hidden_states:
all_hidden_states += (hidden_states,)

if self.gradient_checkpointing and self.training:
layer_outputs = self._gradient_checkpointing_func(
decoder_layer.__call__,
hidden_states,
causal_mask,
position_ids,
past_key_values,
output_attentions,
use_cache,
cache_position,
)
else:
layer_outputs = decoder_layer(
hidden_states,
attention_mask=causal_mask,
position_ids=position_ids,
past_key_value=past_key_values,
output_attentions=output_attentions,
use_cache=use_cache,
cache_position=cache_position,
)

hidden_states = layer_outputs[0]

if use_cache:
next_decoder_cache = layer_outputs[2 if output_attentions else 1]

if output_attentions:
all_self_attns += (layer_outputs[1],)

hidden_states = self.norm(hidden_states)

# add hidden states from the last decoder layer
if output_hidden_states:
all_hidden_states += (hidden_states,)

# ipex-llm changes start
next_cache = next_decoder_cache if use_cache else None
# ipex-llm changes end
if not return_dict:
return tuple(v for v in [hidden_states, next_cache,
all_hidden_states, all_self_attns] if v is not None)
return BaseModelOutputWithPast(
last_hidden_state=hidden_states,
past_key_values=next_cache,
hidden_states=all_hidden_states,
attentions=all_self_attns,
)


def llama_attention_forward(
self,
hidden_states: torch.Tensor,
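With this commit the fused dispatch above is gone: it existed only for the experimental pipeline_parallel_stages branch removed from convert.py, and models now always use the retained llama_model_forward path. A condensed, hypothetical restatement of the dispatch the deleted function performed (cache handling, masks, and output bookkeeping omitted):

# Hypothetical condensed restatement of the dispatch inside the deleted
# llama_fused_model_forward above; cache handling, masks, and output
# bookkeeping are omitted.
from typing import Callable, Sequence

import torch


def fused_or_layerwise(hidden_states: torch.Tensor,
                       layers: Sequence[Callable],
                       multi_decoder: Callable,
                       norm: Callable) -> torch.Tensor:
    if hidden_states.size(1) == 1:
        # Decode step (one new token): a single fused module stands in
        # for the whole decoder stack.
        hidden_states = multi_decoder(hidden_states)[0]
    else:
        # Prefill / multi-token input: the ordinary per-layer loop.
        for decoder_layer in layers:
            hidden_states = decoder_layer(hidden_states)[0]
    return norm(hidden_states)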
