diff --git a/paddlemix/models/blip2/modeling.py b/paddlemix/models/blip2/modeling.py index 366ac9991f59e..0a042efbe4e49 100644 --- a/paddlemix/models/blip2/modeling.py +++ b/paddlemix/models/blip2/modeling.py @@ -23,9 +23,9 @@ from paddlenlp.transformers.llama.modeling import LlamaForCausalLM from paddlenlp.transformers.model_outputs import ModelOutput from paddlenlp.transformers.model_utils import PretrainedModel +from paddlenlp.transformers.opt.modeling import OPTForCausalLM from paddlenlp.transformers.t5.modeling import T5ForConditionalGeneration -from paddlemix.models.blip2.modeling_opt import OPTForCausalLM from paddlemix.models.blip2.modeling_utils import ( all_gather_with_grad, concat_all_gather, @@ -414,7 +414,6 @@ def __init__( language_model = OPTForCausalLM.from_pretrained( config.text_config, load_state_as_np=True, - mp_degree=config.mp_degree, ignore_mismatched_sizes=True, ) elif "llama" in config.text_config: diff --git a/paddlemix/models/blip2/modeling_opt.py b/paddlemix/models/blip2/modeling_opt.py deleted file mode 100644 index ee9e8250159d1..0000000000000 --- a/paddlemix/models/blip2/modeling_opt.py +++ /dev/null @@ -1,1287 +0,0 @@ -# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team. -# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -from __future__ import annotations - -import collections -from functools import partial -from typing import Any, Dict, List - -import numpy as np -import paddle -import paddle.nn as nn -import paddle.nn.functional as F -import paddle.tensor as tensor -from paddle.distributed import fleet -from paddle.distributed.fleet.meta_parallel import get_rng_state_tracker -from paddle.fluid import layers -from paddle.nn import Layer -from paddle.nn.functional.flash_attention import flash_attention -from paddle.nn.layer.transformer import _convert_param_attr_to_list -from paddlenlp.transformers.conversion_utils import StateDictNameMapping -from paddlenlp.transformers.model_outputs import ( - BaseModelOutputWithPastAndCrossAttentions, - CausalLMOutputWithCrossAttentions, -) -from paddlenlp.transformers.model_utils import PretrainedModel, register_base_model -from paddlenlp.transformers.opt.configuration import ( - OPT_PRETRAINED_INIT_CONFIGURATION, - OPT_PRETRAINED_RESOURCE_FILES_MAP, - OPTConfig, -) -from paddlenlp.utils.log import logger - -__all__ = ["OPTModel", "OPTPretrainedModel", "OPTForCausalLM", "OPTForConditionalGeneration"] - - -def finfo(dtype): - if dtype == "float32": - return np.finfo(np.float32) - if dtype == "float16": - return np.finfo(np.float16) - if dtype == "float64": - return np.finfo(np.float64) - - -def _make_causal_mask(input_ids_shape, past_key_values_length, dtype): - """ - Make causal mask used for self-attention. 
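# A minimal sketch of what `_make_causal_mask` produces, assuming a 3-token
# prompt with no cached prefix:
import paddle

causal = _make_causal_mask([1, 3], past_key_values_length=0, dtype="float32")
print(causal.shape)  # [1, 1, 3, 3]
# row i of causal[0, 0] is 0.0 up to column i and finfo(dtype).min beyond it,
# so softmax assigns (near-)zero weight to future positions.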
- """ - batch_size, target_length = input_ids_shape - - mask = paddle.full((target_length, target_length), float(finfo(paddle.get_default_dtype()).min)) - - mask_cond = paddle.arange(mask.shape[-1]) - mask_cond = mask_cond < (mask_cond + 1).reshape([mask.shape[-1], 1]) - mask = paddle.where(mask_cond, paddle.full(mask_cond.shape, 0), mask) - - if past_key_values_length > 0: - mask = paddle.concat([paddle.zeros([target_length, past_key_values_length], dtype=mask.dtype), mask], axis=-1) - - expanded_mask = mask.unsqueeze(0).expand([batch_size, 1, target_length, target_length + past_key_values_length]) - return expanded_mask - - -def _expand_mask(mask, tgt_length): - """ - Expands attention_mask from `[batch_size, src_length]` to `[batch_size, 1, tgt_length, src_length]`. - """ - batch_size, src_length = mask.shape[0], mask.shape[-1] - tgt_length = tgt_length if tgt_length is not None else src_length - - expanded_mask = ~(paddle.cast(mask[:, None, None, :], "bool")) - expanded_mask = paddle.cast(expanded_mask, dtype=paddle.float32) - - expanded_mask = expanded_mask.expand([batch_size, 1, tgt_length, src_length]) - expanded_mask = expanded_mask * float(finfo("float16").min) - return expanded_mask - - -class MultiHeadAttention(nn.Layer): - """ - Attention mapps queries and a set of key-value pairs to outputs, and - Multi-Head Attention performs multiple parallel attention to jointly attending - to information from different representation subspaces. - - """ - - Cache = collections.namedtuple("Cache", ["k", "v"]) - StaticCache = collections.namedtuple("StaticCache", ["k", "v"]) - - def __init__( - self, - config: OPTConfig, - need_weights=False, - ): - super(MultiHeadAttention, self).__init__() - self.use_flash_attn = config.get("use_flash_attn", False) - self.num_heads = config.num_attention_heads - self.head_dim = config.hidden_size // self.num_heads - - # get the `num_heads` - assert self.num_heads % config.mp_degree == 0 - self.num_heads = self.num_heads // config.mp_degree - - self.dropout = config.attention_probs_dropout_prob - self.need_weights = need_weights - self.fuse_attention_qkv = config.fuse_attention_qkv - self.mp_degree = config.mp_degree - - assert ( - self.head_dim * self.num_heads * config.mp_degree == config.hidden_size - ), "hidden_size must be divisible by num_heads" - - if config.mp_degree > 1: - if self.fuse_attention_qkv: - self.qkv_proj = fleet.meta_parallel.ColumnParallelLinear( - config.hidden_size, - config.hidden_size * 3, - has_bias=True, - input_is_parallel=True, - ) - else: - self.q_proj = fleet.meta_parallel.ColumnParallelLinear( - config.hidden_size, - config.hidden_size, - has_bias=True, - gather_output=False, - ) - self.k_proj = fleet.meta_parallel.ColumnParallelLinear( - config.hidden_size, - config.hidden_size, - has_bias=True, - gather_output=False, - ) - self.v_proj = fleet.meta_parallel.ColumnParallelLinear( - config.hidden_size, - config.hidden_size, - has_bias=True, - gather_output=False, - ) - - self.out_proj = fleet.meta_parallel.RowParallelLinear( - config.hidden_size, config.hidden_size, input_is_parallel=True, has_bias=True - ) - else: - if self.fuse_attention_qkv: - self.qkv_proj = nn.Linear(config.hidden_size, 3 * config.hidden_size) - else: - self.q_proj = nn.Linear(config.hidden_size, config.hidden_size) - self.k_proj = nn.Linear(config.hidden_size, config.hidden_size) - self.v_proj = nn.Linear(config.hidden_size, config.hidden_size) - - self.out_proj = nn.Linear(config.hidden_size, config.hidden_size) - - def _fuse_prepare_qkv(self, query, 
-         mix_layer = self.qkv_proj(query)
-         mix_layer = paddle.reshape_(mix_layer, [0, 0, self.num_heads, 3 * self.head_dim])
-         if not self.use_flash_attn:
-             mix_layer = paddle.transpose(mix_layer, [0, 2, 1, 3])
-
-         q, k, v = paddle.split(mix_layer, num_or_sections=3, axis=-1)
-
-         assert not isinstance(cache, self.StaticCache), "cache currently does not support the StaticCache type"
-
-         if isinstance(cache, self.Cache):
-             # for decoder self-attention in inference
-             if not self.use_flash_attn:
-                 k = paddle.concat([cache.k, k], axis=2)
-                 v = paddle.concat([cache.v, v], axis=2)
-             else:
-                 # flash attention keeps the [batch, seq, heads, dim] layout, so
-                 # cached states are concatenated along the sequence axis
-                 k = paddle.concat([cache.k, k], axis=1)
-                 v = paddle.concat([cache.v, v], axis=1)
-         if use_cache is True:
-             cache = self.Cache(k, v)
-
-         return (q, k, v, cache) if use_cache else (q, k, v, None)
-
-     def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):
-         r"""
-         Prepares linearly projected queries, keys and values for use in subsequent
-         multi-head attention. If `cache` is not None, cached results are reused to
-         reduce redundant calculations.
-         """
-         q = self.q_proj(query)
-         q = paddle.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
-         if not self.use_flash_attn:
-             q = paddle.transpose(x=q, perm=[0, 2, 1, 3])
-         if isinstance(cache, self.StaticCache):
-             # for encoder-decoder attention in inference with cached static key/value
-             k, v = cache.k, cache.v
-         else:
-             k, v = self.compute_kv(key, value)
-
-         if isinstance(cache, self.Cache):
-             # for decoder self-attention in inference
-             if not self.use_flash_attn:
-                 k = paddle.concat([cache.k, k], axis=2)
-                 v = paddle.concat([cache.v, v], axis=2)
-             else:
-                 k = paddle.concat([cache.k, k], axis=1)
-                 v = paddle.concat([cache.v, v], axis=1)
-         if use_cache is True:
-             cache = self.Cache(k, v)
-
-         return (q, k, v, None) if use_cache is False else (q, k, v, cache)
-
-     def compute_kv(self, key, value):
-         r"""
-         Applies linear projection on input keys and values, then splits heads
-         (reshape and transpose) to get keys and values from different representation
-         subspaces. The results are used as key-value pairs for subsequent multi-head
-         attention.
-
-         It is part of the calculations in multi-head attention, and is provided as
-         a method to pre-compute and prefetch these results, so they can be used
-         to construct cache for inference.
-         """
-         k = self.k_proj(key)
-         v = self.v_proj(value)
-         k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
-         if not self.use_flash_attn:
-             k = tensor.transpose(x=k, perm=[0, 2, 1, 3])
-         v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
-         if not self.use_flash_attn:
-             v = tensor.transpose(x=v, perm=[0, 2, 1, 3])
-         return k, v
-
-     def gen_cache(self, key, value=None, type=Cache):
-         """
-         Generates cache for `forward` usage in inference according to arguments.
-         The generated cache is an instance of `MultiHeadAttention.Cache` or an
-         instance of `MultiHeadAttention.StaticCache`.
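# A minimal sketch of how a decoding-time Cache grows, assuming the non-flash
# layout [batch, num_heads, seq_len, head_dim] used by `_prepare_qkv`:
import paddle

cache_k = paddle.zeros([1, 12, 0, 64])  # empty cache, as built by gen_cache
new_k = paddle.randn([1, 12, 1, 64])    # key states for one new token
cache_k = paddle.concat([cache_k, new_k], axis=2)  # seq axis grows 0 -> 1
# with use_flash_attn the layout is [batch, seq_len, num_heads, head_dim],
# hence the axis=1 concatenation in the flash branches above.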
- """ - if type == MultiHeadAttention.StaticCache: # static_kv - k, v = self.compute_kv(key, value) - return self.StaticCache(k, v) - elif value is None: # incremental_state - k = layers.fill_constant_batch_size_like( - input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0 - ) - v = layers.fill_constant_batch_size_like( - input=key, shape=[-1, self.num_heads, 0, self.head_dim], dtype=key.dtype, value=0 - ) - return self.Cache(k, v) - else: - # incremental_state with initial value, mainly for usage like UniLM - return self.Cache(key, value) - - def forward( - self, query, key, value, attn_mask=None, use_cache=False, cache=None, output_attention=None, is_causal=True - ): - r""" - Applies multi-head attention to map queries and a set of key-value pairs - to outputs. - """ - key = query if key is None else key - value = query if value is None else value - - if self.fuse_attention_qkv: - q, k, v, cache = self._fuse_prepare_qkv(query, use_cache, cache) - else: - q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, cache) - if self.use_flash_attn: - bsz, q_len, num_heads, head_dim = q.shape - out, weights = flash_attention( - q, - k, - v, - causal=is_causal and q.shape[1] != 1, - return_softmax=self.need_weights and output_attention, - dropout=self.dropout, - ) - out = out.reshape([bsz, q_len, head_dim * num_heads]) - # scale dot product attention - else: - product = paddle.matmul(x=q * (self.head_dim**-0.5), y=k, transpose_y=True) - - if attn_mask is not None: - product = product + attn_mask - - weights = F.softmax(product) - if self.dropout: - if self.mp_degree > 1: - with get_rng_state_tracker().rng_state("local_seed"): - weights = F.dropout(weights, self.dropout, training=self.training, mode="upscale_in_train") - else: - weights = F.dropout(weights, self.dropout, training=self.training, mode="upscale_in_train") - - out = tensor.matmul(weights, v) - - # combine heads - out = tensor.transpose(out, perm=[0, 2, 1, 3]) - out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]]) - - # project to output - out = self.out_proj(out) - - outs = [out, weights] - if use_cache: - outs.append(cache) - return out if len(outs) == 1 else tuple(outs) - - -class TransformerDecoderLayer(nn.Layer): - """ - The transformer decoder layer. - - It contains multiheadattention and some linear layers. 
- """ - - def __init__(self, config): - - d_model = config.hidden_size - dim_feedforward = config.intermediate_size - dropout = config.hidden_dropout_prob - activation = config.hidden_act - attn_dropout = config.attention_probs_dropout_prob - act_dropout = config.hidden_dropout_prob - normalize_before = getattr(config, "normalize_before", True) - - weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal(mean=0.0, std=config.initializer_range)) - bias_attr = None - - self._config = locals() - self._config.pop("self") - self._config.pop("__class__", None) # py3 - - super(TransformerDecoderLayer, self).__init__() - attn_dropout = dropout if attn_dropout is None else attn_dropout - act_dropout = dropout if act_dropout is None else act_dropout - self.normalize_before = normalize_before - - weight_attrs = _convert_param_attr_to_list(weight_attr, 3) - bias_attrs = _convert_param_attr_to_list(bias_attr, 3) - - self.self_attn = MultiHeadAttention(config, need_weights=True) - if config.mp_degree > 1: - self.linear1 = fleet.meta_parallel.ColumnParallelLinear( - d_model, dim_feedforward, has_bias=True, gather_output=True - ) - - else: - self.linear1 = nn.Linear(d_model, dim_feedforward, weight_attrs[2], bias_attr=bias_attrs[2]) - - if config.mp_degree > 1: - self.linear2 = fleet.meta_parallel.ColumnParallelLinear( - dim_feedforward, d_model, has_bias=True, gather_output=True - ) - """ - self.linear2 = fleet.meta_parallel.RowParallelLinear( - dim_feedforward, - d_model, - input_is_parallel=True, - has_bias=False, - ) - """ - else: - self.linear2 = nn.Linear(dim_feedforward, d_model, weight_attrs[2], bias_attr=bias_attrs[2]) - - self.norm1 = nn.LayerNorm(d_model, epsilon=1e-5) - self.norm2 = nn.LayerNorm(d_model, epsilon=1e-5) - self.dropout1 = nn.Dropout(dropout, mode="upscale_in_train") - self.dropout2 = nn.Dropout(act_dropout, mode="upscale_in_train") - - if activation == "gelu": - self.activation = nn.GELU(approximate=True) - else: - self.activation = getattr(F, activation) - self.mp_degree = config.mp_degree - - def forward(self, tgt, memory, tgt_mask=None, use_cache=False, cache=None, output_attentions=False): - residual = tgt - - if self.normalize_before: - tgt = self.norm1(tgt) - - # self.self_attn(...) 
-         # --> hidden_states, weights, (cache)
-         if use_cache is False:
-             tgt, attn_weights = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache, output_attention=None)
-         else:
-             tgt, attn_weights, incremental_cache = self.self_attn(tgt, tgt, tgt, tgt_mask, use_cache, cache)
-         if self.mp_degree > 1:
-             with get_rng_state_tracker().rng_state("global_seed"):
-                 tgt = residual + self.dropout1(tgt)
-         else:
-             tgt = residual + self.dropout1(tgt)
-         if not self.normalize_before:
-             tgt = self.norm1(tgt)
-
-         residual = tgt
-         if self.normalize_before:
-             tgt = self.norm2(tgt)
-         if self.mp_degree > 1:
-             with get_rng_state_tracker().rng_state("global_seed"):
-                 tgt = self.dropout2(self.linear2(self.activation(self.linear1(tgt))))
-         else:
-             tgt = self.dropout2(self.linear2(self.activation(self.linear1(tgt))))
-         tgt = residual + tgt
-
-         if not self.normalize_before:
-             tgt = self.norm2(tgt)
-
-         if not (output_attentions or use_cache):
-             return tgt
-
-         temp_list = [tgt, attn_weights, incremental_cache if use_cache else None]
-
-         return tuple(v for v in temp_list if v is not None)
-
-     def gen_cache(self, memory):
-         incremental_cache = self.self_attn.gen_cache(memory, type=self.self_attn.Cache)
-         return incremental_cache
-
-
- class TransformerDecoder(Layer):
-     """
-     TransformerDecoder is a stack of N decoder layers.
-     """
-
-     def __init__(self, config: OPTConfig, decoder_layers: List[Layer]):
-         super(TransformerDecoder, self).__init__()
-
-         if config.word_embed_proj_dim != config.hidden_size:
-             if config.mp_degree > 1:
-                 self.project_out = fleet.meta_parallel.ColumnParallelLinear(
-                     config.hidden_size,
-                     config.word_embed_proj_dim,
-                     gather_output=True,
-                     has_bias=False,
-                 )
-             else:
-                 if config.use_fusedlinear:
-                     self.project_out = paddle.incubate.nn.FusedLinear(
-                         config.hidden_size, config.word_embed_proj_dim, bias_attr=False
-                     )
-                 else:
-                     self.project_out = nn.Linear(config.hidden_size, config.word_embed_proj_dim, bias_attr=False)
-         else:
-             self.project_out = None
-
-         self.num_layers = config.num_hidden_layers
-         self.layers = decoder_layers
-
-         if config.normalize_before:
-             self.final_layer_norm = nn.LayerNorm(config.hidden_size)
-         else:
-             self.final_layer_norm = None
-
-         self.checkpoints = []
-
-     def forward(
-         self,
-         tgt,
-         memory,
-         tgt_mask=None,
-         memory_mask=None,
-         use_cache: bool = False,
-         cache=None,
-         output_attentions=False,
-         output_hidden_states=False,
-         return_dict=False,
-     ):
-         r"""
-         Applies a stack of N Transformer decoder layers on inputs. If `final_layer_norm`
-         is set, layer normalization is also applied to the output of the last
-         decoder layer.
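# A condensed sketch of the pre-norm (normalize_before=True) residual pattern
# implemented by TransformerDecoderLayer.forward above; `attn` and `ffn` are
# stand-ins for the self-attention and feed-forward sublayers:
def pre_norm_block(x, norm1, attn, dropout1, norm2, ffn, dropout2):
    x = x + dropout1(attn(norm1(x)))  # normalize, attend, then add the residual
    x = x + dropout2(ffn(norm2(x)))   # the same pattern for the feed-forward part
    return x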
- """ - output = tgt - new_caches = [] if use_cache else None - self.checkpoints = [] - all_self_attentions = () if output_attentions else None - all_hidden_states = () if output_hidden_states else None - - for i, mod in enumerate(self.layers): - outputs = mod( - output, - memory, - tgt_mask=tgt_mask, - use_cache=use_cache, - cache=cache[i] if cache is not None else cache, - output_attentions=output_attentions, - ) - - # outputs = hidden_states if both use_cache and output_attentions are False - # Otherwise, outputs = (hidden_states, attention if output_attentions, cache if use_cache) - output = outputs[0] if (use_cache or output_attentions) else outputs - - if output_attentions: - all_self_attentions = all_self_attentions + (outputs[1],) - if use_cache: - new_caches.append(outputs[-1]) - if output_hidden_states: - all_hidden_states = all_hidden_states + (output,) - self.checkpoints.append(output.name) - - if self.final_layer_norm: - output = self.final_layer_norm(output) - - if self.project_out: - output = self.project_out(output) - - if not return_dict: - temp_list = [output, new_caches, all_hidden_states, all_self_attentions] - - if not (use_cache or output_attentions or output_hidden_states): - return output - - return tuple(v for v in temp_list if v is not None) - - return BaseModelOutputWithPastAndCrossAttentions( - last_hidden_state=output, - past_key_values=new_caches, - hidden_states=all_hidden_states, - attentions=all_self_attentions, - cross_attentions=None, - ) - - def gen_cache(self, memory, do_zip=False): - r""" - Generates cache for `forward` usage. The generated cache is a list, and - each element in it is a tuple( :code:`(incremental_cache, static_cache)` ) - produced by `TransformerDecoderLayer.gen_cache`. See `TransformerDecoderLayer.gen_cache` - for more details. If `do_zip` is True, apply `zip` on these tuples to get - a list with two elements. - """ - cache = [layer.gen_cache(memory) for layer in self.layers] - if do_zip: - cache = list(zip(*cache)) - return cache - - -class OPTLearnedPositionEmbedding(nn.Embedding): - """this module learns postional embeddings up to a fixed maximum size""" - - def __init__(self, num_embeddings: int, embedding_dim: int, initializer_range: float): - """OPT is set up so that if padding_idx is specified then offset the embedding ids by 2 - and adjust num_embeddings appropriately. Other models don't have this hack. - - Args: - num_embeddings (int): the number of embedding size - embedding_dim (int): the dim of embedding - """ - self.offset = 2 - super().__init__(num_embeddings + self.offset, embedding_dim) - - def forward(self, attention_mask, past_key_values_length: int = 0): - """get the position embedding with attention mask - - Args: - attention_mask: (paddle.Tensor): # create positions depending on attention_mask - past_key_values_length (int, optional): the past key value which will . Defaults to 0. - - Returns: - paddle.Tensor: the position embedding - """ - # create positions depending on attention_mask - if attention_mask.dtype not in [paddle.bool, paddle.int64]: - attention_mask = attention_mask == 1.0 - - position_ids = paddle.cumsum(paddle.cast(attention_mask, "int64"), axis=-1) * attention_mask - 1 - - # cut positions if `past_key_values_length` is > 0 - position_ids = position_ids[:, past_key_values_length:] - return nn.Embedding.forward(self, position_ids + self.offset) - - -class OPTEmbeddings(Layer): - """ - Include embeddings from word and position embeddings. 
- """ - - def __init__(self, config: OPTConfig): - super(OPTEmbeddings, self).__init__() - if config.mp_degree > 1: - self.word_embeddings = fleet.meta_parallel.VocabParallelEmbedding( - config.vocab_size, - config.word_embed_proj_dim, - weight_attr=paddle.ParamAttr( - initializer=nn.initializer.Normal(mean=0.0, std=config.initializer_range) - ), - ) - else: - self.word_embeddings = nn.Embedding( - config.vocab_size, - config.word_embed_proj_dim, - # padding_idx=config.pad_token_id, - weight_attr=paddle.ParamAttr( - initializer=nn.initializer.Normal(mean=0.0, std=config.initializer_range) - ), - ) - - if config.word_embed_proj_dim != config.hidden_size: - if config.mp_degree > 1: - self.project_in = fleet.meta_parallel.ColumnParallelLinear( - config.word_embed_proj_dim, - config.hidden_size, - gather_output=True, - has_bias=False, - ) - else: - if config.use_fusedlinear: - self.project_in = paddle.incubate.nn.FusedLinear( - config.word_embed_proj_dim, config.hidden_size, bias_attr=False - ) - else: - self.project_in = nn.Linear(config.word_embed_proj_dim, config.hidden_size, bias_attr=False) - else: - self.project_in = None - - self.position_embeddings = OPTLearnedPositionEmbedding( - num_embeddings=config.max_position_embeddings, - embedding_dim=config.hidden_size, - initializer_range=config.initializer_range, - ) - self.mp_degree = config.mp_degree - self.dropout = nn.Dropout(config.hidden_dropout_prob) - - def forward(self, input_ids=None, attention_mask=None, input_embeddings=None, past_key_values_length=None): - if input_ids is not None: - input_embeddings = self.word_embeddings(input_ids) - - if self.project_in: - input_embeddings = self.project_in(input_embeddings) - - position_embeddings = self.position_embeddings(attention_mask, past_key_values_length) - - embeddings = input_embeddings + position_embeddings - if self.mp_degree > 1: - with get_rng_state_tracker().rng_state("global_seed"): - embeddings = self.dropout(embeddings) - else: - embeddings = self.dropout(embeddings) - return embeddings - - -class OPTPretrainedModel(PretrainedModel): - """ - An abstract class for pretrained OPT models. It provides OPT related - `model_config_file`, `resource_files_names`, `pretrained_resource_files_map`, - `pretrained_init_configuration`, `base_model_prefix` for downloading and - loading pretrained models. - See :class:`~paddlenlp.transformers.model_utils.PretrainedModel` for more details. 
- """ - - config_class = OPTConfig - base_model_prefix = "opt" - - pretrained_init_configuration = OPT_PRETRAINED_INIT_CONFIGURATION - pretrained_resource_files_map = OPT_PRETRAINED_RESOURCE_FILES_MAP - - @classmethod - def _get_tensor_parallel_mappings(cls, config: OPTConfig, is_split=True): - - from paddlenlp.transformers.conversion_utils import split_or_merge_func - - fn = split_or_merge_func( - is_split=is_split, - tensor_parallel_degree=config.tensor_parallel_degree, - tensor_parallel_rank=config.tensor_parallel_rank, - num_attention_heads=config.num_attention_heads, - ) - actions = { - "word_embeddings.weight": partial(fn, is_column=False), - } - for layer_index in range(config.num_hidden_layers): - actions.update( - { - # Column Linear - f"decoder.layers.{layer_index}.self_attn.q_proj.weight": partial(fn, is_column=True), - f"decoder.layers.{layer_index}.self_attn.k_proj.weight": partial(fn, is_column=True), - f"decoder.layers.{layer_index}.self_attn.v_proj.weight": partial(fn, is_column=True), - f"decoder.layers.{layer_index}.linear1.weight": partial(fn, is_column=True), - # Row Linear - f"decoder.layers.{layer_index}.linear2.weight": partial(fn, is_column=False), - f"decoder.layers.{layer_index}.self_attn.out_proj.weight": partial(fn, is_column=False), - } - ) - - if config.word_embed_proj_dim != config.hidden_size: - actions.update( - { - "decoder.project_out.weight": partial(fn, is_column=True), - "decoder.project_in.weight": partial(fn, is_column=True), - } - ) - - if cls.__name__ != "OPTModel": - for key in list(actions.keys()): - actions["opt." + key] = actions.pop(key) - - return actions - - @classmethod - def _get_name_mappings(cls, config: OPTConfig) -> list[StateDictNameMapping]: - mappings: list[StateDictNameMapping] = [] - model_mappings = [ - ["decoder.embed_tokens.weight", "embeddings.word_embeddings.weight"], - ["decoder.embed_positions.weight", "embeddings.position_embeddings.weight"], - ["decoder.final_layer_norm.weight", "decoder.final_layer_norm.weight"], - ["decoder.final_layer_norm.bias", "decoder.final_layer_norm.bias"], - ] - for layer_index in range(config.num_hidden_layers): - layer_mappings = [ - [ - f"decoder.layers.{layer_index}.self_attn.k_proj.weight", - f"decoder.layers.{layer_index}.self_attn.k_proj.weight", - "transpose", - ], - [ - f"decoder.layers.{layer_index}.self_attn.k_proj.bias", - f"decoder.layers.{layer_index}.self_attn.k_proj.bias", - ], - [ - f"decoder.layers.{layer_index}.self_attn.v_proj.weight", - f"decoder.layers.{layer_index}.self_attn.v_proj.weight", - "transpose", - ], - [ - f"decoder.layers.{layer_index}.self_attn.v_proj.bias", - f"decoder.layers.{layer_index}.self_attn.v_proj.bias", - ], - [ - f"decoder.layers.{layer_index}.self_attn.q_proj.weight", - f"decoder.layers.{layer_index}.self_attn.q_proj.weight", - "transpose", - ], - [ - f"decoder.layers.{layer_index}.self_attn.q_proj.bias", - f"decoder.layers.{layer_index}.self_attn.q_proj.bias", - ], - [ - f"decoder.layers.{layer_index}.self_attn.out_proj.weight", - f"decoder.layers.{layer_index}.self_attn.out_proj.weight", - "transpose", - ], - [ - f"decoder.layers.{layer_index}.self_attn.out_proj.bias", - f"decoder.layers.{layer_index}.self_attn.out_proj.bias", - ], - [ - f"decoder.layers.{layer_index}.self_attn_layer_norm.weight", - f"decoder.layers.{layer_index}.norm1.weight", - ], - [ - f"decoder.layers.{layer_index}.self_attn_layer_norm.bias", - f"decoder.layers.{layer_index}.norm1.bias", - ], - [ - f"decoder.layers.{layer_index}.fc1.weight", - 
f"decoder.layers.{layer_index}.linear1.weight", - "transpose", - ], - [f"decoder.layers.{layer_index}.fc1.bias", f"decoder.layers.{layer_index}.linear1.bias"], - [ - f"decoder.layers.{layer_index}.fc2.weight", - f"decoder.layers.{layer_index}.linear2.weight", - "transpose", - ], - [f"decoder.layers.{layer_index}.fc2.bias", f"decoder.layers.{layer_index}.linear2.bias"], - [ - f"decoder.layers.{layer_index}.final_layer_norm.weight", - f"decoder.layers.{layer_index}.norm2.weight", - ], - [f"decoder.layers.{layer_index}.final_layer_norm.bias", f"decoder.layers.{layer_index}.norm2.bias"], - ] - model_mappings.extend(layer_mappings) - - # base-model prefix "OPTModel" - if cls.__name__ != "OPTModel": - for mapping in model_mappings: - mapping[0] = "model." + mapping[0] - mapping[1] = "opt." + mapping[1] - - # downstream mappings - mappings = [StateDictNameMapping(*mapping, index=index) for index, mapping in enumerate(model_mappings)] - return mappings - - def _init_weights(self, layer): - """Initialization hook""" - if isinstance(layer, (paddle.incubate.nn.FusedLinear, nn.Linear, nn.Embedding)): - # In the dygraph mode, use the `set_value` to reset the parameter directly, - # and reset the `state_dict` to update parameter in static mode. - if isinstance(layer.weight, paddle.Tensor): - layer.weight.set_value( - paddle.tensor.normal( - mean=0.0, - std=self.initializer_range - if hasattr(self, "initializer_range") - else self.opt.config["initializer_range"], - shape=layer.weight.shape, - ) - ) - - -@register_base_model -class OPTModel(OPTPretrainedModel): - r""" - The bare OPT Model transformer outputting raw hidden-states. - - This model inherits from :class:`~paddlenlp.transformers.model_utils.PretrainedModel`. - Refer to the superclass documentation for the generic methods. - - This model is also a Paddle `paddle.nn.Layer `__ subclass. Use it as a regular Paddle Layer - and refer to the Paddle documentation for all matter related to general usage and behavior. - - Args: - config (:class:`OPTConfig`): - An instance of OPTConfig used to construct OPTModel. 
- """ - - def __init__(self, config: OPTConfig): - super(OPTModel, self).__init__(config) - self.pad_token_id = config.pad_token_id - self.initializer_range = config.initializer_range - self.hidden_size = config.hidden_size - self.vocab_size = config.vocab_size - self.embeddings = OPTEmbeddings(config) - - config.fuse_attention_qkv = False - decoder_layers = nn.LayerList() - for i in range(config.num_hidden_layers): - decoder_layers.append(TransformerDecoderLayer(config)) - self.decoder = TransformerDecoder(config, decoder_layers) - self.checkpoints = [] - - def _prepare_decoder_attention_mask(self, attention_mask, input_shape, past_key_values_length): - # create causal mask - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - combined_attention_mask = None - if input_shape[-1] > 1: - combined_attention_mask = _make_causal_mask( - input_shape, past_key_values_length=past_key_values_length, dtype=attention_mask.dtype - ) - - if attention_mask is not None: - # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] - expanded_attn_mask = _expand_mask(attention_mask, tgt_length=input_shape[-1]) - combined_attention_mask = ( - expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask - ) - - return combined_attention_mask - - def forward( - self, - input_ids=None, - position_ids=None, - attention_mask=None, - inputs_embeds=None, - use_cache=False, - cache=None, - output_attentions=None, - output_hidden_states=None, - return_dict=None, - ): - r""" - The OPTModel forward method, overrides the `__call__()` special method. - - Args: - input_ids (Tensor): - Indices of input sequence tokens in the vocabulary. They are - numerical representations of tokens that build the input sequence. - Its data type should be `int64` and it has a shape of [batch_size, sequence_length]. - position_ids(Tensor, optional): - Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0, - max_position_embeddings - 1]``. - Shape as `(batch_size, num_tokens)` and dtype as int64. Defaults to `None`. - attention_mask (Tensor, optional): - Mask used in self attention to avoid performing attention to some unwanted positions, - usually the subsequent positions. - It is a tensor with shape broadcasted to `[batch_size, num_attention_heads, sequence_length, sequence_length]`. - For example, its shape can be [batch_size, sequence_length], [batch_size, sequence_length, sequence_length], - [batch_size, num_attention_heads, sequence_length, sequence_length]. - Its data type should be float32. - The `masked` tokens have `-1e9` values, and the `unmasked` tokens have `0` values. - Defaults to `None`, which means nothing needed to be prevented attention to. - inputs_embeds (Tensor, optional): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation - of shape `(batch_size, sequence_length, hidden_size)`. This is useful if you want more control over - how to convert `input_ids` indices into associated vectors than the model's internal embedding lookup matrix. - Default to None. - use_cache (bool, optional): - Whether or not to use cache. Defaults to `False`. If set to `True`, key value states will be returned and - can be used to speed up decoding. - cache (list, optional): - It is a list, and each element in the list is a tuple `(incremental_cache, static_cache)`. - See `TransformerDecoder.gen_cache `__ for more details. - It is only used for inference and should be None for training. 
-                 Defaults to `None`.
-             output_attentions (bool, optional):
-                 Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
-                 tensors for more detail. Defaults to `None`.
-             output_hidden_states (bool, optional):
-                 Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
-                 more detail. Defaults to `None`.
-             return_dict (bool, optional):
-                 Whether to return a :class:`~paddlenlp.transformers.model_outputs.BaseModelOutputWithPastAndCrossAttentions`
-                 object. If `False`, the output will be a tuple of tensors. Defaults to `None`.
-
-         Returns:
-             Tensor: Returns tensor `encoder_output`, which is the output at the last layer of the model.
-             Its data type should be float32 and it has a shape of [batch_size, sequence_length, hidden_size].
-
-         Example:
-             .. code-block::
-
-                 import paddle
-                 from paddlenlp.transformers import OPTModel, GPTTokenizer
-
-                 tokenizer = GPTTokenizer.from_pretrained('facebook/opt-125m')
-
-                 model = OPTModel.from_pretrained('facebook/opt-125m')
-
-                 inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!", return_token_type_ids=False)
-                 inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
-                 output = model(**inputs)
-         """
-         if position_ids is not None:
-             logger.warning("position_ids is not required for OPTModel.")
-
-         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-         output_hidden_states = (
-             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-         )
-         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-         if input_ids is not None and inputs_embeds is not None:
-             raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
-         elif input_ids is not None:
-             input_shape = paddle.shape(input_ids)
-             input_ids = input_ids.reshape((-1, input_shape[-1]))
-         elif inputs_embeds is not None:
-             input_shape = paddle.shape(inputs_embeds)[:-1]
-         else:
-             raise ValueError("You have to specify either input_ids or inputs_embeds")
-
-         self.checkpoints = []
-         past_key_values_length = paddle.shape(cache[0].k)[2] if cache is not None else 0
-
-         seq_length_with_past = input_shape[-1] + past_key_values_length
-
-         if attention_mask is None:
-             attention_mask = paddle.ones((input_shape[0], seq_length_with_past), dtype=paddle.bool)
-
-         embedding_output = self.embeddings(
-             input_ids=input_ids,
-             attention_mask=attention_mask,
-             input_embeddings=inputs_embeds,
-             past_key_values_length=past_key_values_length,
-         )
-
-         attention_mask = self._prepare_decoder_attention_mask(attention_mask, input_shape, past_key_values_length)
-         attention_mask.stop_gradient = True
-
-         outputs = self.decoder.forward(
-             embedding_output,
-             memory=None,
-             tgt_mask=attention_mask,
-             use_cache=use_cache,
-             cache=cache,
-             output_attentions=output_attentions,
-             output_hidden_states=output_hidden_states,
-             return_dict=return_dict,
-         )
-
-         if output_hidden_states:
-             if return_dict:
-                 outputs.hidden_states = (embedding_output,) + outputs.hidden_states
-             else:
-                 # [last_hidden_state, caches, all_hidden_states, all_self_attentions]
-                 idx = 2 if use_cache else 1
-                 all_hidden_states = ((embedding_output,) + outputs[idx],)
-                 outputs = outputs[:idx] + all_hidden_states + outputs[idx + 1 :]
-
-         self.checkpoints.extend(self.decoder.checkpoints)
-         return outputs
-
-     def get_input_embeddings(self):
-         """Get the OPT model's input word embedding.
-
-         Returns:
-             nn.Embedding: the input word embedding of the OPT model
-         """
-         return self.embeddings.word_embeddings
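# A sketch of how the causal and padding masks combine additively in
# `_prepare_decoder_attention_mask`, assuming a 3-token input whose last
# position is padding:
import paddle

causal = _make_causal_mask([1, 3], past_key_values_length=0, dtype="float32")
padding = _expand_mask(paddle.to_tensor([[1, 1, 0]], dtype="int64"), tgt_length=3)
combined = causal + padding  # any large-negative entry keeps the position masked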
-
-     def set_input_embeddings(self, embedding: nn.Embedding):
-         """Set the OPT model's input word embedding.
-
-         Args:
-             embedding (nn.Embedding): the new input word embedding
-         """
-         self.embeddings.word_embeddings = embedding
-
-
- class OPTLMHead(Layer):
-     def __init__(self, hidden_size: int, vocab_size: int, embedding_weights=None):
-         super(OPTLMHead, self).__init__()
-         self.decoder_weight = (
-             self.create_parameter(shape=[vocab_size, hidden_size], dtype=paddle.get_default_dtype(), is_bias=True)
-             if embedding_weights is None
-             else embedding_weights
-         )
-
-     def forward(self, hidden_states):
-         if isinstance(hidden_states, BaseModelOutputWithPastAndCrossAttentions):
-             hidden_states = hidden_states["last_hidden_state"]
-
-         logits = paddle.tensor.matmul(hidden_states, self.decoder_weight, transpose_y=True)
-         return logits
-
-
- class OPTForCausalLM(OPTPretrainedModel):
-     """
-     The OPT Model with a `language modeling` head on top.
-
-     Args:
-         config (:class:`OPTConfig`):
-             An instance of OPTConfig used to construct OPTForCausalLM.
-     """
-
-     def __init__(self, config: OPTConfig, **kwargs):
-         super(OPTForCausalLM, self).__init__(config)
-
-         config.use_fusedlinear = config.get("use_fusedlinear", False)
-         config.mp_degree = config.mp_degree
-
-         self.opt = OPTModel(config)
-         self.lm_head = OPTLMHead(
-             hidden_size=self.opt.config.hidden_size,
-             vocab_size=self.opt.config.vocab_size,
-             # embedding_weights=self.opt.embeddings.word_embeddings.weight,
-         )
-
-     def forward(
-         self,
-         input_ids=None,
-         attention_mask=None,
-         inputs_embeds=None,
-         labels=None,
-         use_cache=False,
-         cache=None,
-         output_attentions=None,
-         output_hidden_states=None,
-         return_dict=None,
-         **kwargs,
-     ):
-         r"""
-         Args:
-             input_ids (Tensor):
-                 See :class:`OPTModel`.
-             attention_mask (Tensor, optional):
-                 See :class:`OPTModel`.
-             inputs_embeds (Tensor, optional):
-                 See :class:`OPTModel`.
-             use_cache (bool, optional):
-                 See :class:`OPTModel`.
-             cache (Tensor, optional):
-                 See :class:`OPTModel`.
-             labels (paddle.Tensor, optional):
-                 A Tensor of shape `(batch_size, sequence_length)`.
-                 Labels for language modeling. Note that the labels are shifted inside the model, i.e. you can set
-                 `labels = input_ids`. Indices are selected in `[-100, 0, ..., vocab_size]`. All labels set to `-100`
-                 are ignored (masked); the loss is only computed for labels in `[0, ..., vocab_size]`.
-                 Defaults to None.
-             output_attentions (bool, optional):
-                 See :class:`OPTModel`.
-             output_hidden_states (bool, optional):
-                 See :class:`OPTModel`.
-             return_dict (bool, optional):
-                 See :class:`OPTModel`.
-
-         Returns:
-             Tensor or tuple: Returns tensor `logits` or tuple `(logits, cached_kvs)`. If `use_cache` is True,
-             tuple (`logits, cached_kvs`) will be returned. Otherwise, tensor `logits` will be returned.
-             `logits` is the output of the OPT model.
-             `cached_kvs` is the cache output of the OPT model if `use_cache` is True.
-
-         Example:
-             .. code-block::
-
-                 import paddle
-                 from paddlenlp.transformers import OPTForCausalLM, GPTTokenizer
-
-                 tokenizer = GPTTokenizer.from_pretrained('facebook/opt-125m')
-                 model = OPTForCausalLM.from_pretrained('facebook/opt-125m')
-
-                 inputs = tokenizer("Welcome to use PaddlePaddle and PaddleNLP!")
-                 inputs = {k: paddle.to_tensor([v]) for (k, v) in inputs.items()}
-                 output_ids, score = model.generate(input_ids=inputs['input_ids'])
-                 print(tokenizer.batch_decode(output_ids[0]))
-         """
-         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-         output_hidden_states = (
-             output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-         )
-
-         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-         outputs = self.opt(
-             input_ids,
-             attention_mask=attention_mask,
-             inputs_embeds=inputs_embeds,
-             use_cache=use_cache,
-             cache=cache,
-             output_attentions=output_attentions,
-             output_hidden_states=output_hidden_states,
-             return_dict=return_dict,
-         )
-
-         if use_cache:
-             encoder_outputs, cached_kvs = outputs[:2]
-         else:
-             encoder_outputs = outputs
-
-         logits = self.lm_head(encoder_outputs)
-
-         loss = None
-         if labels is not None:
-             # align the logits with the label window, then shift for next-token prediction
-             logits = logits[:, -labels.shape[1] :, :]
-             shift_logits = logits[:, :-1, :]
-             shift_labels = labels[:, 1:]
-
-             loss_fct = CrossEntropyLoss(reduction="mean", label_smoothing=None)
-             labels = shift_labels.reshape((-1,))
-
-             # drop positions labelled -100 before computing the loss
-             valid_index = paddle.where(labels != -100)[0].flatten()
-             logits = shift_logits.reshape((-1, shift_logits.shape[-1]))
-             logits = paddle.gather(logits, valid_index, axis=0)
-             labels = paddle.gather(labels, valid_index, axis=0)
-             lm_loss = loss_fct(logits, labels)
-
-             loss = lm_loss
-
-         if not return_dict:
-             if not use_cache:
-                 return (loss, logits) if loss is not None else logits
-
-             outputs = (logits,) + outputs[1:]
-             return ((loss,) + outputs) if loss is not None else outputs
-
-         return CausalLMOutputWithCrossAttentions(
-             loss=loss,
-             logits=logits,
-             past_key_values=outputs.past_key_values,
-             hidden_states=outputs.hidden_states,
-             attentions=outputs.attentions,
-             cross_attentions=outputs.cross_attentions,
-         )
-
-     def prepare_fast_entry(self, kwargs: Dict[str, Any]):
-         # import FasterOPT here to avoid a circular import
-         from paddlenlp.ops import FasterOPT
-
-         use_fp16_decoding = kwargs.get("use_fp16_decoding", False)
-         decode_strategy = kwargs.get("decode_strategy")
-         # decoding_lib can be passed into FasterOPT
-         decoding_lib = kwargs.get("decoding_lib", None)
-
-         if decode_strategy == "beam_search":
-             raise AttributeError("'beam_search' is not supported yet in the fast version of OPT")
-         # Currently, FasterTransformer only supports restricted values of size_per_head.
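# For example (an assumption, using facebook/opt-125m's published config of
# hidden_size=768 and num_attention_heads=12): size_per_head = 768 // 12 = 64,
# which falls in the supported set [32, 64, 80, 96, 128].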
- - size_per_head = self.opt.config["hidden_size"] // self.opt.config["num_attention_heads"] - - if size_per_head not in [32, 64, 80, 96, 128]: - raise AttributeError( - "'size_per_head = %d' is not supported yet in the fast version of OPT" % size_per_head - ) - if kwargs["forced_bos_token_id"] is not None: - # not support for forced_bos_token_id yet in the fast version - raise AttributeError("'forced_bos_token_id != None' is not supported yet in the fast version") - if kwargs["min_length"] != 0: - # not support for min_length yet in the fast version - raise AttributeError("'min_length != 0' is not supported yet in the fast version") - self._fast_entry = FasterOPT(self, use_fp16_decoding=use_fp16_decoding, decoding_lib=decoding_lib).forward - return self._fast_entry - - def prepare_inputs_for_generation( - self, input_ids, use_cache=False, cache=None, attention_mask=None, inputs_embeds=None, **kwargs - ): - if cache is not None: - input_ids = input_ids[:, -1:] - - # if `inputs_embeds` are passed, we only want to use them in the 1st generation step - if inputs_embeds is not None and cache is None: - model_inputs = {"inputs_embeds": inputs_embeds} - else: - model_inputs = {"input_ids": input_ids} - - model_inputs.update( - { - "cache": cache, - "use_cache": True, - "attention_mask": attention_mask, - } - ) - return model_inputs - - @staticmethod - def prepare_attention_mask_for_generation(input_ids, pad_token_id, eos_token_id): - is_pad_token_in_inputs_ids = (pad_token_id is not None) and paddle.any( - input_ids == pad_token_id - ).numpy().item() - is_pad_token_not_equal_to_eos_token_id = (eos_token_id is None) or ( - (eos_token_id is not None) and (pad_token_id != eos_token_id) - ) - if is_pad_token_in_inputs_ids and is_pad_token_not_equal_to_eos_token_id: - attention_mask = (input_ids != pad_token_id).astype("int64") - else: - attention_mask = paddle.ones_like(input_ids, dtype="int64") - return attention_mask - - def __getattr__(self, name): - try: - return super().__getattr__(name) - except AttributeError as e: - try: - return getattr(getattr(self, self.base_model_prefix), name) - except AttributeError: - try: - return getattr(self, self.base_model_prefix).config[name] - except KeyError: - raise e - - -class CrossEntropyLoss(nn.Layer): - """ - Softmax Cross entropy loss - """ - - def __init__(self, reduction="mean", label_smoothing=None): - super().__init__() - if label_smoothing is not None: - assert label_smoothing >= 0 and label_smoothing <= 1, "label_smoothing must be in [0, 1]" - self.epsilon = label_smoothing - self.reduction = reduction - - def _labelsmoothing(self, target, class_num): - if len(target.shape) == 1 or target.shape[-1] != class_num: - one_hot_target = F.one_hot(target, class_num) - else: - one_hot_target = target - soft_target = F.label_smooth(one_hot_target, epsilon=self.epsilon) - soft_target = paddle.reshape(soft_target, shape=[-1, class_num]) - return soft_target - - def forward(self, x, label): - if isinstance(x, dict): - x = x["logits"] - if self.epsilon is not None: - class_num = x.shape[-1] - label = self._labelsmoothing(label, class_num) - - x = -F.log_softmax(x, axis=-1) - loss = paddle.sum(x * label, axis=-1) - else: - if label.shape[-1] == x.shape[-1]: - loss = paddle.sum(-label * F.log_softmax(x, axis=-1), axis=-1) - else: - if label.dtype == paddle.int32: - label = paddle.cast(label, "int64") - loss = F.cross_entropy(x, label=label, soft_label=False) - - if self.reduction == "sum": - return loss.sum() - elif self.reduction == "mean": - return loss.mean() 
- else: - return loss - - -OPTForConditionalGeneration = OPTForCausalLM
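# With this file deleted, BLIP-2 relies on PaddleNLP's OPT implementation; a
# minimal usage sketch of the replacement import from the top of this diff
# (the checkpoint name is illustrative):
from paddlenlp.transformers.opt.modeling import OPTForCausalLM

language_model = OPTForCausalLM.from_pretrained(
    "facebook/opt-2.7b",
    load_state_as_np=True,
    ignore_mismatched_sizes=True,
)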