Style fixes
Oscilloscope98 committed Nov 18, 2024
1 parent 2196f35 commit 61e482b
Showing 1 changed file with 32 additions and 18 deletions:
python/llm/src/ipex_llm/transformers/speculative.py
@@ -18,7 +18,8 @@
 # https://github.com/huggingface/transformers/blob/v4.45.2/src/transformers/generation/utils.py
 # which are licensed under Apache License 2.0:
 #
-# Copyright 2020 The Google AI Language Team Authors, Facebook AI Research authors and The HuggingFace Inc. team.
+# Copyright 2020 The Google AI Language Team Authors, Facebook AI Research authors
+# and The HuggingFace Inc. team.
 # Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -510,8 +511,9 @@ def _crop_past_key_values(self, past_key_values, new_cache_size, _enable_ipex=Fa
             for k, v in past_key_values
         ]
     elif self.config.model_type == "chatglm":
-        if isinstance(self.config.eos_token_id, list) and not hasattr(self.transformer, "vision") \
-                and self.config.num_layers in [28, 40]:
+        if isinstance(self.config.eos_token_id, list) and \
+                not hasattr(self.transformer, "vision") and \
+                self.config.num_layers in [28, 40]:
             # glm4 models
             past_key_values = [
                 (k[:, :, :-(new_cache_size), :],
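For context, `_crop_past_key_values` rolls the key/value cache back after the target model rejects some draft tokens during speculative decoding. A minimal sketch of the idea, assuming the common cache layout of one (key, value) pair per layer shaped [batch, heads, seq_len, head_dim]; the function name and layout are illustrative, not this file's exact code:

    import torch

    def crop_kv_cache(past_key_values, new_cache_size):
        # Drop the trailing `new_cache_size` positions from every layer's
        # (key, value) pair, i.e. forget the cache entries for rejected tokens.
        return [
            (k[:, :, :-new_cache_size, :], v[:, :, :-new_cache_size, :])
            for k, v in past_key_values
        ]

    # Example: a 2-layer cache of [batch=1, heads=4, seq=10, head_dim=8],
    # rolled back by 3 rejected draft tokens, leaves seq_len == 7.
    cache = [(torch.zeros(1, 4, 10, 8), torch.zeros(1, 4, 10, 8)) for _ in range(2)]
    assert crop_kv_cache(cache, 3)[0][0].shape[2] == 7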
@@ -653,17 +655,20 @@ def _prepare_generate_args(self, inputs, generation_config, streamer=None, **sam
 def _prepare_generate_args_4_45(self, inputs, generation_config, streamer=None, **kwargs):
     # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
     self._validate_model_class()
-    tokenizer = kwargs.pop("tokenizer", None)  # Pull this out first, we only use it for stopping criteria
+    # Pull this out first, we only use it for stopping criteria
+    tokenizer = kwargs.pop("tokenizer", None)
     generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs)
     self._validate_model_kwargs(model_kwargs.copy())
 
     # 2. Set generation parameters if not already defined
     logits_processor = kwargs.pop("logits_processor", None)
     stopping_criteria = kwargs.pop("stopping_criteria", None)
     logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
-    stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+    stopping_criteria = \
+        stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
 
-    accepts_attention_mask = "attention_mask" in set(inspect.signature(self.forward).parameters.keys())
+    accepts_attention_mask = \
+        "attention_mask" in set(inspect.signature(self.forward).parameters.keys())
     requires_attention_mask = "encoder_outputs" not in model_kwargs
     kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None
@@ -680,16 +685,18 @@ def _prepare_generate_args_4_45(self, inputs, generation_config, streamer=None,
     from transformers.utils import is_torchdynamo_compiling
     if not self.config.is_encoder_decoder and not is_torchdynamo_compiling():
         # If `input_ids` was given, check if the last id in any sequence is `pad_token_id`
-        # Note: If using, `inputs_embeds` this check does not work, because we want to be more hands-off.
+        # Note: If using, `inputs_embeds` this check does not work
+        # because we want to be more hands-off.
         if (
             generation_config._pad_token_tensor is not None
             and batch_size > 1
             and len(inputs_tensor.shape) == 2
             and torch.sum(inputs_tensor[:, -1] == generation_config._pad_token_tensor) > 0
         ):
             logger.warning(
-                "A decoder-only architecture is being used, but right-padding was detected! For correct "
-                "generation results, please set `padding_side='left'` when initializing the tokenizer."
+                "A decoder-only architecture is being used, but right-padding was detected! "
+                "For correct generation results, please set `padding_side='left'` "
+                "when initializing the tokenizer."
             )
     else:
         perf_mode = os.environ.get("IPEX_LLM_PERFORMANCE_MODE", None)
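The warning in this hunk points at a concrete setup step: decoder-only generation needs left padding so that the last position of every batch row is a real token. A typical caller-side fix using the standard Hugging Face tokenizer API (a generic example, not code from this repository):

    from transformers import AutoTokenizer

    # Pad on the left so generation continues from real tokens, not pad ids.
    tokenizer = AutoTokenizer.from_pretrained("gpt2", padding_side="left")
    tokenizer.pad_token = tokenizer.eos_token  # gpt2 ships without a pad token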
@@ -700,8 +707,9 @@ def _prepare_generate_args_4_45(self, inputs, generation_config, streamer=None,
         invalidInputError(False, f"Encoder-decoder models are not supported now for {error_str}.")
 
     # 4. Define other model kwargs
-    # decoder-only models with inputs_embeds forwarding must use caching (otherwise we can't detect whether we are
-    # generating the first new token or not, and we only want to use the embeddings for the first new token)
+    # decoder-only models with inputs_embeds forwarding must use caching
+    # (otherwise we can't detect whether we are generating the first new token or not,
+    # and we only want to use the embeddings for the first new token)
     if not self.config.is_encoder_decoder and model_input_name == "inputs_embeds":
         model_kwargs["use_cache"] = True
     else:
@@ -713,8 +721,8 @@ def _prepare_generate_args_4_45(self, inputs, generation_config, streamer=None,
         )
     elif kwargs_has_attention_mask:
         if model_input_name == "input_ids" and len(model_kwargs["attention_mask"].shape) > 2:
-            raise ValueError("`attention_mask` passed to `generate` must be 2D.")
+            raise invalidInputError(False, "`attention_mask` passed to `generate` must be 2D.")
 
     # 5. Prepare `input_ids` which will be used for auto-regressive generation
     input_ids = inputs_tensor if model_input_name == "input_ids" else model_kwargs.pop("input_ids")

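This hunk and the next replace `raise ValueError(...)` with ipex-llm's `invalidInputError`, keeping validation failures on a single code path. The real helper lives in `ipex_llm.utils.common`; a minimal stand-in matching the calling convention seen in this file (condition first, then message) might look like the sketch below, an assumption for illustration rather than the library's actual source:

    def invalidInputError(condition, errMsg, fixMsg=None):
        # Assumed behavior of ipex_llm.utils.common.invalidInputError:
        # raise when the stated condition does not hold.
        if not condition:
            raise RuntimeError(errMsg if fixMsg is None else f"{errMsg} {fixMsg}")

Called with a literal False, the helper raises before the surrounding `raise` statement is ever reached, so `raise invalidInputError(False, ...)` behaves the same as a bare call.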
@@ -734,8 +742,10 @@ def _prepare_generate_args_4_45(self, inputs, generation_config, streamer=None,
     # 8. determine generation mode
     # skip
     if streamer is not None and (generation_config.num_beams > 1):
-        raise ValueError(
-            "`streamer` cannot be used with beam search (yet!). Make sure that `num_beams` is set to 1."
+        raise invalidInputError(
+            False,
+            "`streamer` cannot be used with beam search (yet!). "
+            "Make sure that `num_beams` is set to 1."
         )
 
     # 9. prepare logits processors and stopping criteria
@@ -752,10 +762,14 @@ def _prepare_generate_args_4_45(self, inputs, generation_config, streamer=None,
         negative_prompt_attention_mask=None,
     )
     prepared_stopping_criteria = self._get_stopping_criteria(
-        generation_config=generation_config, stopping_criteria=stopping_criteria, tokenizer=tokenizer, **kwargs
+        generation_config=generation_config,
+        stopping_criteria=stopping_criteria,
+        tokenizer=tokenizer,
+        **kwargs
     )
 
-    return input_ids, generation_config, prepared_logits_processor, prepared_stopping_criteria, model_kwargs
+    return input_ids, generation_config, prepared_logits_processor, prepared_stopping_criteria,\
+        model_kwargs


@@ -771,7 +785,7 @@ def _non_cpu_ipex_verify(self, verify_input_ids, past_key_values, cur_attention_mask=None,
 
     if self.config.model_type == "chatglm":
         if isinstance(self.config.eos_token_id, list) and not hasattr(self.transformer, "vision") \
-            and self.config.num_layers in [28, 40]:
+                and self.config.num_layers in [28, 40]:
             # glm4 models
             past_key_value_len = past_key_values[0][0].shape[2]
         else:
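Both chatglm hunks in this commit rewrap the same three-part glm4 test. A small predicate would avoid wrapping it twice; the helper below is hypothetical and not part of the commit:

    def _is_glm4(model):
        # Mirrors the checks above: glm4 checkpoints report a list of EOS ids,
        # carry no vision tower, and have 28 or 40 transformer layers.
        return (isinstance(model.config.eos_token_id, list)
                and not hasattr(model.transformer, "vision")
                and model.config.num_layers in [28, 40])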
