Commit

add unittest

Caozhou1995 committed Mar 22, 2023
1 parent 1d58586 commit a2878f5
Showing 2 changed files with 142 additions and 0 deletions.
CMakeLists.txt
@@ -127,6 +127,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU)
  py_test_modules(test_pass_bf16 MODULES test_pass_bf16)
  py_test_modules(test_dist_saver MODULES test_dist_saver)
  py_test_modules(test_engine_save_load MODULES test_engine_save_load)
  py_test_modules(test_rule_based_tuner MODULES test_rule_based_tuner)
  # End of unittests WITH single card WITHOUT timeout

endif()
test_rule_based_tuner.py
@@ -0,0 +1,141 @@
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
import unittest

import numpy as np

import paddle
import paddle.static as static

sys.path.append("..")
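# auto_parallel_gpt_model (the shared GPT test helper) is expected one directory up.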
import auto_parallel_gpt_model as modeling
from auto_parallel_gpt_model import (
    GPTForPretraining,
    GPTModel,
    GPTPretrainingCriterion,
)


def get_gpt_model(
    train_program, start_program, place, batch_size, sequence_len, vocab_size
):
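    # Build a small static-graph GPT pretraining network (2 layers, hidden size 64)
    # and return the programs, the loss, and a synthetic data generator.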
    with static.program_guard(train_program, start_program):
        tokens = paddle.static.data(
            name="tokens", shape=[batch_size, sequence_len], dtype='int64'
        )
        position_ids = paddle.static.data(
            name="position_ids", shape=[batch_size, sequence_len], dtype='int64'
        )
        attention_mask = paddle.static.data(
            name="attention_mask",
            shape=[batch_size, 1, sequence_len, sequence_len],
            dtype='float32',
        )
        labels = paddle.static.data(
            name="labels", shape=[batch_size, sequence_len], dtype='int64'
        )
        loss_mask = paddle.static.data(
            name="loss_mask", shape=[batch_size, sequence_len], dtype='float32'
        )

        gpt = GPTModel(
            vocab_size=1000,
            hidden_size=64,
            num_hidden_layers=2,
            num_attention_heads=8,
            intermediate_size=256,
            hidden_act="gelu",
            hidden_dropout_prob=0.0,
            attention_probs_dropout_prob=0.0,
            max_position_embeddings=1024,
            type_vocab_size=1,
            initializer_range=0.02,
            pad_token_id=0,
            eos_token_id=7,
            bos_token_id=0,
            eol_token_id=3,
        )

        model = GPTForPretraining(
            gpt, vocab_size=1000, hidden_size=64, initializer_range=0.02
        )
        preds = model(tokens, position_ids, attention_mask)
        criterion = GPTPretrainingCriterion()
        loss = criterion(preds, labels, loss_mask)

    def gen_data():
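        # Deterministic synthetic batch: random tokens and labels, sequential
        # position ids, a lower-triangular (causal) attention mask, and an
        # all-ones loss mask.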
        np.random.seed(2021)
        tokens = []
        position_ids = []
        attention_mask = []
        labels = []
        loss_mask = []
        for _ in range(batch_size):
            tokens.append(np.random.randint(vocab_size, size=sequence_len))
            position_ids.append(np.arange(sequence_len))
            attention_mask.append([np.tril(np.ones(sequence_len))])
            labels.append(np.random.randint(vocab_size, size=sequence_len))
            loss_mask.append(np.ones(sequence_len))

        return tokens, position_ids, attention_mask, labels, loss_mask

    return train_program, start_program, loss, gen_data


class TestRuleBasedTuner(unittest.TestCase):
    def test_gpt(self):
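        # Build the serial GPT program, then drive the rule-based tuner on it.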
        modeling.init_global()
        train_program = static.Program()
        start_program = static.Program()
        place = paddle.set_device("gpu")
        batch_size = 8
        sequence_len = 512
        vocab_size = 1000
        train_program, start_program, loss, gen_data = get_gpt_model(
            train_program,
            start_program,
            place,
            batch_size,
            sequence_len,
            vocab_size,
        )
        from paddle.distributed.auto_parallel.dist_context import (
            DistributedContext,
        )
        from paddle.distributed.auto_parallel.process_mesh import ProcessMesh
        from paddle.distributed.auto_parallel.tuner.rule_based_tuner import (
            RuleBasedTuner,
        )

        clip = paddle.nn.ClipGradByGlobalNorm(0.2)
        opt = paddle.optimizer.AdamW(learning_rate=0.00001, grad_clip=clip)
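        # Wrap the serial programs, optimizer, and loss into a DistributedContext
        # for the tuner to analyze.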
        dist_context = DistributedContext(
            serial_main_prog=train_program,
            serial_startup_prog=start_program,
            serial_optimizer=opt,
            serial_loss=loss,
        )
        dist_context.initialize()
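        # Run the tuner stages on the serial program: operator clustering,
        # full-program generation, and pattern matching.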
        tuner = RuleBasedTuner(dist_context)
        tuner.cluster_operators()
        tuner.gen_full_program()
        tuner.match_program(tuner._dist_context.serial_main_program)
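        # Complete the forward sub-programs on a two-process mesh.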
        process_mesh = ProcessMesh([0, 1])
        tuner.complete_sub_fwd_programs(process_mesh)


if __name__ == "__main__":
    unittest.main()
