Skip to content

Commit

Permalink
Fix typo: "initilizer" → "initializer" (#61189)
Browse files: browse the repository at this point in the history
  • Loading branch information
co63oc authored Jan 26, 2024
1 parent 5bbb9a0 commit 1532abc
Show file tree
Hide file tree
Showing 20 changed files with 80 additions and 80 deletions.
6 changes: 3 additions & 3 deletions test/legacy_test/auto_parallel_gpt_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -403,7 +403,7 @@ def gen_cache(self, memory, do_zip=False):
class TransformerDecoderLayer(nn.Layer):
"""
The transformer decoder layer.
It contains multiheadattention and some linear layers.
It contains multi-head attention and some linear layers.
"""

def __init__(
Expand Down Expand Up @@ -634,8 +634,8 @@ def __init__(
self.recompute_granularity = recompute_granularity

self.layer_per_stage = None
self.pipline_mode = pp_degree is not None and pp_degree > 1
if self.pipline_mode:
self.pipeline_mode = pp_degree is not None and pp_degree > 1
if self.pipeline_mode:
self.layer_per_stage = num_hidden_layers // pp_degree
self.embeddings = GPTEmbeddings(
vocab_size,
Expand Down
14 changes: 7 additions & 7 deletions test/legacy_test/test_auto_parallel_completion_gpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def _fuse_prepare_qkv(self, query):

def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):
r"""
Prapares linear projected queries, keys and values for usage of subsequnt
Prepares linear projected queries, keys and values for usage of subsequent
multiple parallel attention. If `cache` is not None, using cached results
to reduce redundant calculations.
"""
Expand Down Expand Up @@ -163,7 +163,7 @@ def compute_kv(self, key, value):

def gen_cache(self, key, value=None, type=Cache):
"""
Generates cache for `forward` usage in inference accroding to arguments.
Generates cache for `forward` usage in inference according to arguments.
The generated cache is an instance of `MultiHeadAttention.Cache` or an
instance of `MultiHeadAttention.StaticCache`.
"""
Expand Down Expand Up @@ -329,7 +329,7 @@ def gen_cache(self, memory, do_zip=False):
class TransformerDecoderLayer(nn.Layer):
"""
The transformer decoder layer.
It contains multiheadattention and some linear layers.
It contains multi-head attention and some linear layers.
"""

def __init__(
Expand Down Expand Up @@ -484,7 +484,7 @@ def forward(self, input_ids, position_ids=None):
seq_length = paddle.cumsum(ones, axis=-1)
position_ids = seq_length - ones

input_embedings = self.word_embeddings(input_ids)
input_embeddings = self.word_embeddings(input_ids)

if _global_parallel_strategy in ["mp", "dp_mp"]:
auto.shard_tensor(
Expand All @@ -494,7 +494,7 @@ def forward(self, input_ids, position_ids=None):
)

position_embeddings = self.position_embeddings(position_ids)
embeddings = input_embedings + position_embeddings
embeddings = input_embeddings + position_embeddings
embeddings = self.dropout(embeddings)
return embeddings

Expand Down Expand Up @@ -528,8 +528,8 @@ def __init__(
self.hidden_size = hidden_size
self.vocab_size = vocab_size

self.pipline_mode = topo is not None and topo.pp_info.size > 1
if self.pipline_mode:
self.pipeline_mode = topo is not None and topo.pp_info.size > 1
if self.pipeline_mode:
self.layer_per_stage = num_hidden_layers // self.topo.pp_info.size

self.embeddings = GPTEmbeddings(
Expand Down
14 changes: 7 additions & 7 deletions test/legacy_test/test_auto_parallel_partitioner_gpt.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ def _fuse_prepare_qkv(self, query):

def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):
r"""
Prapares linear projected queries, keys and values for usage of subsequnt
Prepares linear projected queries, keys and values for usage of subsequent
multiple parallel attention. If `cache` is not None, using cached results
to reduce redundant calculations.
"""
Expand Down Expand Up @@ -209,7 +209,7 @@ def compute_kv(self, key, value):

def gen_cache(self, key, value=None, type=Cache):
"""
Generates cache for `forward` usage in inference accroding to arguments.
Generates cache for `forward` usage in inference according to arguments.
The generated cache is an instance of `MultiHeadAttention.Cache` or an
instance of `MultiHeadAttention.StaticCache`.
"""
Expand Down Expand Up @@ -375,7 +375,7 @@ def gen_cache(self, memory, do_zip=False):
class TransformerDecoderLayer(nn.Layer):
"""
The transformer decoder layer.
It contains multiheadattention and some linear layers.
It contains multi-head attention and some linear layers.
"""

def __init__(
Expand Down Expand Up @@ -530,7 +530,7 @@ def forward(self, input_ids, position_ids=None):
seq_length = paddle.cumsum(ones, axis=-1)
position_ids = seq_length - ones

input_embedings = self.word_embeddings(input_ids)
input_embeddings = self.word_embeddings(input_ids)

if _global_parallel_strategy in ["mp", "dp_mp"]:
auto.shard_tensor(
Expand All @@ -540,7 +540,7 @@ def forward(self, input_ids, position_ids=None):
)

position_embeddings = self.position_embeddings(position_ids)
embeddings = input_embedings + position_embeddings
embeddings = input_embeddings + position_embeddings
embeddings = self.dropout(embeddings)
return embeddings

Expand Down Expand Up @@ -574,8 +574,8 @@ def __init__(
self.hidden_size = hidden_size
self.vocab_size = vocab_size

self.pipline_mode = topo is not None and topo.pp_info.size > 1
if self.pipline_mode:
self.pipeline_mode = topo is not None and topo.pp_info.size > 1
if self.pipeline_mode:
self.layer_per_stage = num_hidden_layers // self.topo.pp_info.size

self.embeddings = GPTEmbeddings(
Expand Down
4 changes: 2 additions & 2 deletions test/legacy_test/test_dropout_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -984,7 +984,7 @@ def test_errors(self):
with paddle.static.program_guard(main_prog, startup_prog):

def test_xdim():
# dimentions of x should be 4
# dimensions of x should be 4
x = paddle.static.data(
name='x1', shape=[2, 3, 4, 5, 6], dtype="int32"
)
Expand Down Expand Up @@ -1116,7 +1116,7 @@ def test_errors(self):
with paddle.static.program_guard(main_prog, startup_prog):

def test_xdim():
# dimentions of x should be 5
# dimensions of x should be 5
x = paddle.static.data(
name='x1', shape=[2, 3, 4, 5], dtype="int32"
)
Expand Down
2 changes: 1 addition & 1 deletion test/legacy_test/test_elementwise_max_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ def init_data(self):
core.cudnn_version() < 8100
or paddle.device.cuda.get_device_capability()[0] < 8
),
"run test when gpu is availble and the minimum cudnn version is 8.1.0 and gpu's compute capability is at least 8.0.",
"run test when gpu is available and the minimum cudnn version is 8.1.0 and gpu's compute capability is at least 8.0.",
)
class TestElementwiseBF16Op(OpTest):
def init_data(self):
Expand Down
2 changes: 1 addition & 1 deletion test/legacy_test/test_elementwise_min_op.py
Original file line number Diff line number Diff line change
Expand Up @@ -315,7 +315,7 @@ def setUp(self):
core.cudnn_version() < 8100
or paddle.device.cuda.get_device_capability()[0] < 8
),
"run test when gpu is availble and the minimum cudnn version is 8.1.0 and gpu's compute capability is at least 8.0.",
"run test when gpu is available and the minimum cudnn version is 8.1.0 and gpu's compute capability is at least 8.0.",
)
class TestElementwiseBF16Op(OpTest):
def init_data(self):
Expand Down
4 changes: 2 additions & 2 deletions test/legacy_test/test_imperative_mnist.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,13 +103,13 @@ def forward(self, inputs):

class TestImperativeMnist(unittest.TestCase):
def reader_decorator(self, reader):
def _reader_imple():
def _reader_simple():
for item in reader():
image = np.array(item[0]).reshape(1, 28, 28)
label = np.array(item[1]).astype('int64').reshape(1)
yield image, label

return _reader_imple
return _reader_simple

def test_mnist_float32(self):
seed = 90
Expand Down
4 changes: 2 additions & 2 deletions test/legacy_test/test_imperative_optimizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,13 @@ def get_optimizer(self):
raise NotImplementedError()

def reader_decorator(self, reader):
def _reader_imple():
def _reader_simple():
for item in reader():
image = np.array(item[0]).reshape(1, 784)
label = np.array(item[1]).astype('int64').reshape(1)
yield image, label

return _reader_imple
return _reader_simple

def _check_exception(self, exception_message, place=None):
seed = 90
Expand Down
6 changes: 3 additions & 3 deletions test/legacy_test/test_imperative_optimizer_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,13 @@ def get_optimizer(self):
raise NotImplementedError()

def reader_decorator(self, reader):
def _reader_imple():
def _reader_simple():
for item in reader():
image = np.array(item[0]).reshape(1, 784)
label = np.array(item[1]).astype('int64').reshape(1)
yield image, label

return _reader_imple
return _reader_simple

def _check_exception(self, exception_message, place=None):
seed = 90
Expand Down Expand Up @@ -832,7 +832,7 @@ def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.incubate.optimizer.PipelineOptimizer(optimizer)
return optimizer

def test_pipline(self):
def test_pipeline(self):
exception_message = "In dygraph, don't support PipelineOptimizer."
self._check_exception(exception_message)

Expand Down
4 changes: 2 additions & 2 deletions test/legacy_test/test_imperative_resnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,13 +242,13 @@ def forward(self, inputs):

class TestDygraphResnet(unittest.TestCase):
def reader_decorator(self, reader):
def _reader_imple():
def _reader_simple():
for item in reader():
doc = np.array(item[0]).reshape(3, 224, 224)
label = np.array(item[1]).astype('int64').reshape(1)
yield doc, label

return _reader_imple
return _reader_simple

def test_resnet_float32(self):
seed = 90
Expand Down
4 changes: 2 additions & 2 deletions test/legacy_test/test_imperative_se_resnext.py
Original file line number Diff line number Diff line change
Expand Up @@ -312,13 +312,13 @@ def forward(self, inputs):

class TestImperativeResneXt(unittest.TestCase):
def reader_decorator(self, reader):
def _reader_imple():
def _reader_simple():
for item in reader():
doc = np.array(item[0]).reshape(3, 224, 224)
label = np.array(item[1]).astype('int64').reshape(1)
yield doc, label

return _reader_imple
return _reader_simple

def test_se_resnext_float32(self):
seed = 90
Expand Down
22 changes: 11 additions & 11 deletions test/legacy_test/test_imperative_transformer_sorted_gradient.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ class ModelHyperParams:
# automatically according to the passed vocabulary path and special tokens.
# size of source word dictionary.
src_vocab_size = 10000
# size of target word dictionay
# size of target word dictionary
trg_vocab_size = 10000
# index for <bos> token
bos_idx = 0
Expand Down Expand Up @@ -235,7 +235,7 @@ def make_all_inputs(input_fields):
# consistent with some ops' infer-shape output in compile time, such as the
# sequence_expand op used in beamsearch decoder.
batch_size = -1
# The placeholder for squence length in compile time.
# The placeholder for sequence length in compile time.
seq_len = ModelHyperParams.max_length
# Here list the data shapes and data types of all inputs.
# The shapes here act as placeholder and are set to pass the infer-shape in
Expand Down Expand Up @@ -496,11 +496,11 @@ def forward(self, queries, keys, values, attn_bias):
product += attn_bias
weights = paddle.nn.functional.softmax(product)
if self._dropout_rate:
weights_droped = paddle.nn.functional.dropout(
weights_dropped = paddle.nn.functional.dropout(
weights,
p=self._dropout_rate,
)
out = paddle.matmul(weights_droped, transpose_v)
out = paddle.matmul(weights_dropped, transpose_v)
else:
out = paddle.matmul(weights, transpose_v)

Expand Down Expand Up @@ -775,7 +775,7 @@ def __init__(
super().__init__()
self._postprocess_cmd = postprocess_cmd
self._preprocess_cmd = preprocess_cmd
self._prepostprcess_dropout = prepostprocess_dropout
self._prepostprocess_dropout = prepostprocess_dropout
self._pre_process_layer = PrePostProcessLayer(
d_model, preprocess_cmd, 3
)
Expand Down Expand Up @@ -819,7 +819,7 @@ def __init__(

def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
pre_process_rlt = self._pre_process_layer(
None, dec_input, self._preprocess_cmd, self._prepostprcess_dropout
None, dec_input, self._preprocess_cmd, self._prepostprocess_dropout
)
slf_attn_output = self._multihead_attention_layer(
pre_process_rlt, None, None, slf_attn_bias
Expand All @@ -828,13 +828,13 @@ def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
dec_input,
slf_attn_output,
self._postprocess_cmd,
self._prepostprcess_dropout,
self._prepostprocess_dropout,
)
pre_process_rlt2 = self._pre_process_layer2(
None,
slf_attn_output_pp,
self._preprocess_cmd,
self._prepostprcess_dropout,
self._prepostprocess_dropout,
)
enc_attn_output_pp = self._multihead_attention_layer2(
pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias
Expand All @@ -843,20 +843,20 @@ def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
slf_attn_output_pp,
enc_attn_output_pp,
self._postprocess_cmd,
self._prepostprcess_dropout,
self._prepostprocess_dropout,
)
pre_process_rlt3 = self._pre_process_layer3(
None,
enc_attn_output,
self._preprocess_cmd,
self._prepostprcess_dropout,
self._prepostprocess_dropout,
)
ffd_output = self._positionwise_feed_forward_layer(pre_process_rlt3)
dec_output = self._post_process_layer3(
enc_attn_output,
ffd_output,
self._postprocess_cmd,
self._prepostprcess_dropout,
self._prepostprocess_dropout,
)
return dec_output

Expand Down
Loading

0 comments on commit 1532abc

Please sign in to comment.