Fix initilizer initializer #61189

Merged · 1 commit · Jan 26, 2024
6 changes: 3 additions & 3 deletions test/legacy_test/auto_parallel_gpt_model.py
@@ -403,7 +403,7 @@ def gen_cache(self, memory, do_zip=False):
class TransformerDecoderLayer(nn.Layer):
"""
The transformer decoder layer.
-It contains multiheadattention and some linear layers.
+It contains multi-head attention and some linear layers.
"""

def __init__(
@@ -634,8 +634,8 @@ def __init__(
self.recompute_granularity = recompute_granularity

self.layer_per_stage = None
-self.pipline_mode = pp_degree is not None and pp_degree > 1
-if self.pipline_mode:
+self.pipeline_mode = pp_degree is not None and pp_degree > 1
+if self.pipeline_mode:
self.layer_per_stage = num_hidden_layers // pp_degree
self.embeddings = GPTEmbeddings(
vocab_size,
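The pipline_mode → pipeline_mode rename above touches the flag that turns on pipeline parallelism, where the decoder layers are split evenly across pp_degree stages via layer_per_stage = num_hidden_layers // pp_degree. A minimal sketch of that placement rule; layer_to_stage is a hypothetical helper for illustration, not part of the model file:

```python
# Sketch of the stage-placement rule implied by the constructor above.
# `layer_to_stage` is illustrative only; the model file just stores layer_per_stage.
def layer_to_stage(layer_idx, num_hidden_layers, pp_degree):
    pipeline_mode = pp_degree is not None and pp_degree > 1
    if not pipeline_mode:
        return 0  # no pipeline parallelism: every layer sits on the single stage
    layer_per_stage = num_hidden_layers // pp_degree
    return layer_idx // layer_per_stage

# 24 hidden layers over 4 stages: layers 0-5 -> stage 0, 6-11 -> stage 1, ...
assert layer_to_stage(7, 24, 4) == 1
assert layer_to_stage(23, 24, 4) == 3
```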
14 changes: 7 additions & 7 deletions test/legacy_test/test_auto_parallel_completion_gpt.py
@@ -99,7 +99,7 @@ def _fuse_prepare_qkv(self, query):

def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):
r"""
-Prapares linear projected queries, keys and values for usage of subsequnt
+Prepares linear projected queries, keys and values for usage of subsequent
multiple parallel attention. If `cache` is not None, using cached results
to reduce redundant calculations.
"""
@@ -163,7 +163,7 @@ def compute_kv(self, key, value):

def gen_cache(self, key, value=None, type=Cache):
"""
-Generates cache for `forward` usage in inference accroding to arguments.
+Generates cache for `forward` usage in inference according to arguments.
The generated cache is an instance of `MultiHeadAttention.Cache` or an
instance of `MultiHeadAttention.StaticCache`.
"""
@@ -329,7 +329,7 @@ def gen_cache(self, memory, do_zip=False):
class TransformerDecoderLayer(nn.Layer):
"""
The transformer decoder layer.
-It contains multiheadattention and some linear layers.
+It contains multi-head attention and some linear layers.
"""

def __init__(
@@ -484,7 +484,7 @@ def forward(self, input_ids, position_ids=None):
seq_length = paddle.cumsum(ones, axis=-1)
position_ids = seq_length - ones

-input_embedings = self.word_embeddings(input_ids)
+input_embeddings = self.word_embeddings(input_ids)

if _global_parallel_strategy in ["mp", "dp_mp"]:
auto.shard_tensor(
@@ -494,7 +494,7 @@ def forward(self, input_ids, position_ids=None):
)

position_embeddings = self.position_embeddings(position_ids)
-embeddings = input_embedings + position_embeddings
+embeddings = input_embeddings + position_embeddings
embeddings = self.dropout(embeddings)
return embeddings

@@ -528,8 +528,8 @@ def __init__(
self.hidden_size = hidden_size
self.vocab_size = vocab_size

-self.pipline_mode = topo is not None and topo.pp_info.size > 1
-if self.pipline_mode:
+self.pipeline_mode = topo is not None and topo.pp_info.size > 1
+if self.pipeline_mode:
self.layer_per_stage = num_hidden_layers // self.topo.pp_info.size

self.embeddings = GPTEmbeddings(
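For readers skimming the embeddings hunk above: position ids are built by a cumulative sum over a tensor of ones (so each token gets its zero-based offset), and the output is the element-wise sum of word and position embeddings. A small NumPy sketch of the same arithmetic, with toy shapes assumed:

```python
import numpy as np

# Toy batch of token ids, shape [batch=1, seq_len=4] (assumed for illustration).
input_ids = np.array([[5, 9, 2, 7]])
ones = np.ones_like(input_ids)
seq_length = np.cumsum(ones, axis=-1)   # [[1, 2, 3, 4]]
position_ids = seq_length - ones        # [[0, 1, 2, 3]] -- zero-based positions

# Plain lookup tables stand in for self.word_embeddings / self.position_embeddings.
rng = np.random.default_rng(0)
word_table = rng.standard_normal((10, 8))   # vocab_size=10, hidden_size=8 (assumed)
pos_table = rng.standard_normal((4, 8))     # max position count = 4 (assumed)

input_embeddings = word_table[input_ids]
position_embeddings = pos_table[position_ids]
embeddings = input_embeddings + position_embeddings   # dropout from the model omitted here
print(embeddings.shape)   # (1, 4, 8)
```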
14 changes: 7 additions & 7 deletions test/legacy_test/test_auto_parallel_partitioner_gpt.py
@@ -145,7 +145,7 @@ def _fuse_prepare_qkv(self, query):

def _prepare_qkv(self, query, key, value, use_cache=False, cache=None):
r"""
-Prapares linear projected queries, keys and values for usage of subsequnt
+Prepares linear projected queries, keys and values for usage of subsequent
multiple parallel attention. If `cache` is not None, using cached results
to reduce redundant calculations.
"""
@@ -209,7 +209,7 @@ def compute_kv(self, key, value):

def gen_cache(self, key, value=None, type=Cache):
"""
-Generates cache for `forward` usage in inference accroding to arguments.
+Generates cache for `forward` usage in inference according to arguments.
The generated cache is an instance of `MultiHeadAttention.Cache` or an
instance of `MultiHeadAttention.StaticCache`.
"""
@@ -375,7 +375,7 @@ def gen_cache(self, memory, do_zip=False):
class TransformerDecoderLayer(nn.Layer):
"""
The transformer decoder layer.
-It contains multiheadattention and some linear layers.
+It contains multi-head attention and some linear layers.
"""

def __init__(
@@ -530,7 +530,7 @@ def forward(self, input_ids, position_ids=None):
seq_length = paddle.cumsum(ones, axis=-1)
position_ids = seq_length - ones

-input_embedings = self.word_embeddings(input_ids)
+input_embeddings = self.word_embeddings(input_ids)

if _global_parallel_strategy in ["mp", "dp_mp"]:
auto.shard_tensor(
@@ -540,7 +540,7 @@ def forward(self, input_ids, position_ids=None):
)

position_embeddings = self.position_embeddings(position_ids)
-embeddings = input_embedings + position_embeddings
+embeddings = input_embeddings + position_embeddings
embeddings = self.dropout(embeddings)
return embeddings

@@ -574,8 +574,8 @@ def __init__(
self.hidden_size = hidden_size
self.vocab_size = vocab_size

-self.pipline_mode = topo is not None and topo.pp_info.size > 1
-if self.pipline_mode:
+self.pipeline_mode = topo is not None and topo.pp_info.size > 1
+if self.pipeline_mode:
self.layer_per_stage = num_hidden_layers // self.topo.pp_info.size

self.embeddings = GPTEmbeddings(
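The gen_cache docstrings corrected above refer to the two cache flavours of MultiHeadAttention: Cache carries key/value tensors that grow step by step during incremental decoding, while StaticCache holds keys/values precomputed once from a fixed memory. A rough sketch of the growing-cache usage; the tuple layout and concatenation axis are assumptions for illustration, not code shown in this diff:

```python
from collections import namedtuple

import numpy as np

# Hypothetical stand-in for MultiHeadAttention.Cache (field names assumed).
Cache = namedtuple("Cache", ["k", "v"])

def append_step(cache, new_k, new_v):
    # Concatenate this decoding step's keys/values onto the cached ones
    # along the (assumed) sequence-length axis.
    return Cache(
        np.concatenate([cache.k, new_k], axis=-2),
        np.concatenate([cache.v, new_v], axis=-2),
    )

# batch=1, heads=2, cached_len=3, head_dim=4 (all assumed toy sizes)
cache = Cache(np.zeros((1, 2, 3, 4)), np.zeros((1, 2, 3, 4)))
cache = append_step(cache, np.ones((1, 2, 1, 4)), np.ones((1, 2, 1, 4)))
print(cache.k.shape)   # (1, 2, 4, 4) -- one more cached position
```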
4 changes: 2 additions & 2 deletions test/legacy_test/test_dropout_op.py
@@ -984,7 +984,7 @@ def test_errors(self):
with paddle.static.program_guard(main_prog, startup_prog):

def test_xdim():
-# dimentions of x should be 4
+# dimensions of x should be 4
x = paddle.static.data(
name='x1', shape=[2, 3, 4, 5, 6], dtype="int32"
)
@@ -1116,7 +1116,7 @@ def test_errors(self):
with paddle.static.program_guard(main_prog, startup_prog):

def test_xdim():
-# dimentions of x should be 5
+# dimensions of x should be 5
x = paddle.static.data(
name='x1', shape=[2, 3, 4, 5], dtype="int32"
)
2 changes: 1 addition & 1 deletion test/legacy_test/test_elementwise_max_op.py
@@ -169,7 +169,7 @@ def init_data(self):
core.cudnn_version() < 8100
or paddle.device.cuda.get_device_capability()[0] < 8
),
-"run test when gpu is availble and the minimum cudnn version is 8.1.0 and gpu's compute capability is at least 8.0.",
+"run test when gpu is available and the minimum cudnn version is 8.1.0 and gpu's compute capability is at least 8.0.",
)
class TestElementwiseBF16Op(OpTest):
def init_data(self):
2 changes: 1 addition & 1 deletion test/legacy_test/test_elementwise_min_op.py
@@ -315,7 +315,7 @@ def setUp(self):
core.cudnn_version() < 8100
or paddle.device.cuda.get_device_capability()[0] < 8
),
-"run test when gpu is availble and the minimum cudnn version is 8.1.0 and gpu's compute capability is at least 8.0.",
+"run test when gpu is available and the minimum cudnn version is 8.1.0 and gpu's compute capability is at least 8.0.",
)
class TestElementwiseBF16Op(OpTest):
def init_data(self):
4 changes: 2 additions & 2 deletions test/legacy_test/test_imperative_mnist.py
@@ -103,13 +103,13 @@ def forward(self, inputs):

class TestImperativeMnist(unittest.TestCase):
def reader_decorator(self, reader):
-def _reader_imple():
+def _reader_simple():
for item in reader():
image = np.array(item[0]).reshape(1, 28, 28)
label = np.array(item[1]).astype('int64').reshape(1)
yield image, label

-return _reader_imple
+return _reader_simple

def test_mnist_float32(self):
seed = 90
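reader_decorator above only adapts a Paddle-style sample reader (a generator of (image, label) pairs) so that each image comes out as a 1x28x28 float array and each label as a one-element int64 array. A self-contained sketch with a fake reader in place of the MNIST dataset, so the reshaping is easy to see:

```python
import numpy as np

def fake_mnist_reader():
    # Stand-in for a Paddle dataset reader: yields (flat_image, label) pairs.
    for label in range(3):
        yield np.zeros(784, dtype='float32'), label

def reader_decorator(reader):
    # Same shape-adapting pattern as in the test above.
    def _reader_simple():
        for item in reader():
            image = np.array(item[0]).reshape(1, 28, 28)
            label = np.array(item[1]).astype('int64').reshape(1)
            yield image, label

    return _reader_simple

for image, label in reader_decorator(fake_mnist_reader)():
    print(image.shape, label.shape)   # (1, 28, 28) (1,)
```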
4 changes: 2 additions & 2 deletions test/legacy_test/test_imperative_optimizer.py
@@ -50,13 +50,13 @@ def get_optimizer(self):
raise NotImplementedError()

def reader_decorator(self, reader):
-def _reader_imple():
+def _reader_simple():
for item in reader():
image = np.array(item[0]).reshape(1, 784)
label = np.array(item[1]).astype('int64').reshape(1)
yield image, label

-return _reader_imple
+return _reader_simple

def _check_exception(self, exception_message, place=None):
seed = 90
6 changes: 3 additions & 3 deletions test/legacy_test/test_imperative_optimizer_v2.py
@@ -52,13 +52,13 @@ def get_optimizer(self):
raise NotImplementedError()

def reader_decorator(self, reader):
-def _reader_imple():
+def _reader_simple():
for item in reader():
image = np.array(item[0]).reshape(1, 784)
label = np.array(item[1]).astype('int64').reshape(1)
yield image, label

-return _reader_imple
+return _reader_simple

def _check_exception(self, exception_message, place=None):
seed = 90
@@ -832,7 +832,7 @@ def get_optimizer_dygraph(self, parameter_list):
optimizer = paddle.incubate.optimizer.PipelineOptimizer(optimizer)
return optimizer

-def test_pipline(self):
+def test_pipeline(self):
exception_message = "In dygraph, don't support PipelineOptimizer."
self._check_exception(exception_message)

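The test_pipline → test_pipeline rename above belongs to a test that only checks PipelineOptimizer refuses to wrap an optimizer under dygraph. The body of _check_exception is not part of this diff; a rough sketch of an equivalent standalone check, with the surrounding setup assumed:

```python
import unittest

import paddle

class PipelineOptimizerDygraphSketch(unittest.TestCase):
    def test_pipeline(self):
        paddle.disable_static()   # imperative (dygraph) mode
        linear = paddle.nn.Linear(2, 2)
        sgd = paddle.optimizer.SGD(learning_rate=0.01, parameters=linear.parameters())
        # Wrapping in PipelineOptimizer is expected to fail in dygraph.
        with self.assertRaises(Exception) as ctx:
            paddle.incubate.optimizer.PipelineOptimizer(sgd)
        self.assertIn("don't support PipelineOptimizer", str(ctx.exception))

if __name__ == '__main__':
    unittest.main()
```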
4 changes: 2 additions & 2 deletions test/legacy_test/test_imperative_resnet.py
@@ -242,13 +242,13 @@ def forward(self, inputs):

class TestDygraphResnet(unittest.TestCase):
def reader_decorator(self, reader):
-def _reader_imple():
+def _reader_simple():
for item in reader():
doc = np.array(item[0]).reshape(3, 224, 224)
label = np.array(item[1]).astype('int64').reshape(1)
yield doc, label

-return _reader_imple
+return _reader_simple

def test_resnet_float32(self):
seed = 90
4 changes: 2 additions & 2 deletions test/legacy_test/test_imperative_se_resnext.py
@@ -312,13 +312,13 @@ def forward(self, inputs):

class TestImperativeResneXt(unittest.TestCase):
def reader_decorator(self, reader):
-def _reader_imple():
+def _reader_simple():
for item in reader():
doc = np.array(item[0]).reshape(3, 224, 224)
label = np.array(item[1]).astype('int64').reshape(1)
yield doc, label

-return _reader_imple
+return _reader_simple

def test_se_resnext_float32(self):
seed = 90
22 changes: 11 additions & 11 deletions test/legacy_test/test_imperative_transformer_sorted_gradient.py
@@ -88,7 +88,7 @@ class ModelHyperParams:
# automatically according to the passed vocabulary path and special tokens.
# size of source word dictionary.
src_vocab_size = 10000
-# size of target word dictionay
+# size of target word dictionary
trg_vocab_size = 10000
# index for <bos> token
bos_idx = 0
@@ -235,7 +235,7 @@ def make_all_inputs(input_fields):
# consistent with some ops' infer-shape output in compile time, such as the
# sequence_expand op used in beamsearch decoder.
batch_size = -1
-# The placeholder for squence length in compile time.
+# The placeholder for sequence length in compile time.
seq_len = ModelHyperParams.max_length
# Here list the data shapes and data types of all inputs.
# The shapes here act as placeholder and are set to pass the infer-shape in
@@ -496,11 +496,11 @@ def forward(self, queries, keys, values, attn_bias):
product += attn_bias
weights = paddle.nn.functional.softmax(product)
if self._dropout_rate:
-weights_droped = paddle.nn.functional.dropout(
+weights_dropped = paddle.nn.functional.dropout(
weights,
p=self._dropout_rate,
)
-out = paddle.matmul(weights_droped, transpose_v)
+out = paddle.matmul(weights_dropped, transpose_v)
else:
out = paddle.matmul(weights, transpose_v)

@@ -775,7 +775,7 @@ def __init__(
super().__init__()
self._postprocess_cmd = postprocess_cmd
self._preprocess_cmd = preprocess_cmd
-self._prepostprcess_dropout = prepostprocess_dropout
+self._prepostprocess_dropout = prepostprocess_dropout
self._pre_process_layer = PrePostProcessLayer(
d_model, preprocess_cmd, 3
)
@@ -819,7 +819,7 @@ def __init__(

def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
pre_process_rlt = self._pre_process_layer(
-None, dec_input, self._preprocess_cmd, self._prepostprcess_dropout
+None, dec_input, self._preprocess_cmd, self._prepostprocess_dropout
)
slf_attn_output = self._multihead_attention_layer(
pre_process_rlt, None, None, slf_attn_bias
@@ -828,13 +828,13 @@ def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
dec_input,
slf_attn_output,
self._postprocess_cmd,
-self._prepostprcess_dropout,
+self._prepostprocess_dropout,
)
pre_process_rlt2 = self._pre_process_layer2(
None,
slf_attn_output_pp,
self._preprocess_cmd,
-self._prepostprcess_dropout,
+self._prepostprocess_dropout,
)
enc_attn_output_pp = self._multihead_attention_layer2(
pre_process_rlt2, enc_output, enc_output, dec_enc_attn_bias
@@ -843,20 +843,20 @@ def forward(self, dec_input, enc_output, slf_attn_bias, dec_enc_attn_bias):
slf_attn_output_pp,
enc_attn_output_pp,
self._postprocess_cmd,
-self._prepostprcess_dropout,
+self._prepostprocess_dropout,
)
pre_process_rlt3 = self._pre_process_layer3(
None,
enc_attn_output,
self._preprocess_cmd,
-self._prepostprcess_dropout,
+self._prepostprocess_dropout,
)
ffd_output = self._positionwise_feed_forward_layer(pre_process_rlt3)
dec_output = self._post_process_layer3(
enc_attn_output,
ffd_output,
self._postprocess_cmd,
-self._prepostprcess_dropout,
+self._prepostprocess_dropout,
)
return dec_output

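The weights_droped → weights_dropped hunk above sits in a plain scaled-dot-product attention: scores are softmax(Q·K^T + bias), optionally dropped out, then multiplied with V. A NumPy sketch of that flow with toy shapes assumed (the real test uses Paddle tensors and paddle.nn.functional.dropout):

```python
import numpy as np

def softmax(x, axis=-1):
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    return e / e.sum(axis=axis, keepdims=True)

# Toy shapes (assumed): batch=1, heads=2, query_len=3, key_len=3, head_dim=4.
rng = np.random.default_rng(0)
q = rng.standard_normal((1, 2, 3, 4))
transpose_k = rng.standard_normal((1, 2, 4, 3))   # keys already transposed, as in the layer
transpose_v = rng.standard_normal((1, 2, 3, 4))
attn_bias = np.zeros((1, 2, 3, 3))                # padding/causal mask in the real model

product = q @ transpose_k / np.sqrt(4.0)          # scaled attention scores
product += attn_bias
weights = softmax(product)

dropout_rate = 0.1
keep = rng.random(weights.shape) >= dropout_rate  # inverted-dropout mask (training-time sketch)
weights_dropped = weights * keep / (1.0 - dropout_rate)

out = weights_dropped @ transpose_v
print(out.shape)   # (1, 2, 3, 4)
```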