From 9f92b4a3d2e38e8036ae8ca92eb007c4acebc4dc Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Sat, 7 May 2022 15:15:10 +0800
Subject: [PATCH 01/48] add ernie-large config

---
 paddlenlp/transformers/ernie/modeling.py  | 13 +++++++++++++
 paddlenlp/transformers/ernie/tokenizer.py |  5 +++++
 paddlenlp/transformers/model_utils.py     | 13 +++++++++++--
 3 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/paddlenlp/transformers/ernie/modeling.py b/paddlenlp/transformers/ernie/modeling.py
index 64931eb6b5ec..21cc05f9abc2 100644
--- a/paddlenlp/transformers/ernie/modeling.py
+++ b/paddlenlp/transformers/ernie/modeling.py
@@ -132,6 +132,19 @@ class ErniePretrainedModel(PretrainedModel):
             "vocab_size": 18000,
             "pad_token_id": 0,
         },
+        "ernie-1.0-large": {
+            "attention_probs_dropout_prob": 0.1,
+            "hidden_act": "relu",
+            "hidden_dropout_prob": 0.1,
+            "hidden_size": 1024,
+            "initializer_range": 0.02,
+            "max_position_embeddings": 512,
+            "num_attention_heads": 16,
+            "num_hidden_layers": 24,
+            "type_vocab_size": 2,
+            "vocab_size": 18000,
+            "pad_token_id": 0,
+        },
         "ernie-tiny": {
             "attention_probs_dropout_prob": 0.1,
             "hidden_act": "relu",
diff --git a/paddlenlp/transformers/ernie/tokenizer.py b/paddlenlp/transformers/ernie/tokenizer.py
index 1547770631c1..75aef0d3d0e5 100644
--- a/paddlenlp/transformers/ernie/tokenizer.py
+++ b/paddlenlp/transformers/ernie/tokenizer.py
@@ -80,6 +80,8 @@ class ErnieTokenizer(PretrainedTokenizer):
         "vocab_file": {
             "ernie-1.0":
             "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/vocab.txt",
+            "ernie-1.0-large":
+            "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/vocab.txt",
             "ernie-tiny":
             "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_tiny/vocab.txt",
             "ernie-2.0-en":
@@ -116,6 +118,9 @@ class ErnieTokenizer(PretrainedTokenizer):
         "ernie-1.0": {
             "do_lower_case": True
         },
+        "ernie-1.0-large": {
+            "do_lower_case": True
+        },
         "ernie-tiny": {
             "do_lower_case": True
         },
diff --git a/paddlenlp/transformers/model_utils.py b/paddlenlp/transformers/model_utils.py
index acb75ca0538c..048585bd5a57 100644
--- a/paddlenlp/transformers/model_utils.py
+++ b/paddlenlp/transformers/model_utils.py
@@ -211,8 +211,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
         # From built-in pretrained models
         if pretrained_model_name_or_path in pretrained_models:
             for file_id, map_list in cls.pretrained_resource_files_map.items():
-                resource_files[file_id] = map_list[
-                    pretrained_model_name_or_path]
+                if pretrained_model_name_or_path not in map_list:
+                    resource_files[file_id] = None
+                else:
+                    resource_files[file_id] = map_list[
+                        pretrained_model_name_or_path]
             init_configuration = copy.deepcopy(
                 cls.pretrained_init_configuration[
                     pretrained_model_name_or_path])
@@ -335,6 +338,12 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
 
         # Maybe need more ways to load resources.
         weight_path = resolved_resource_files["model_state"]
+        if weight_path is None:
+            logger.warning(
+                "No model weight found for %s, return with random initialization !!!"
+                % pretrained_model_name_or_path)
+            return model
+
         assert weight_path.endswith(
             ".pdparams"), "suffix of weight must be .pdparams"
 

From 7243573f4b1ebcee1b1fe23ec3af22935c48a863 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Sat, 7 May 2022 16:28:19 +0800
Subject: [PATCH 02/48] update

---
 examples/language_model/data_tools/dataset_utils.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/language_model/data_tools/dataset_utils.py b/examples/language_model/data_tools/dataset_utils.py
index 81e4e5e2e6c7..a25c2182c1dc 100644
--- a/examples/language_model/data_tools/dataset_utils.py
+++ b/examples/language_model/data_tools/dataset_utils.py
@@ -83,8 +83,9 @@ def __init__(self, datasets, weights):
         self.dataset_index = np.zeros(self.size, dtype=np.uint8)
         self.dataset_sample_index = np.zeros(self.size, dtype=np.int64)
 
-        local_rank = 0 if fleet.local_rank() is None else int(fleet.local_rank(
-        ))
+        # local_rank = 0 if fleet.local_rank() is None else int(fleet.local_rank(
+        # ))
+        local_rank = get_local_rank()
 
         while True:
             try:

From 8dace3d5d36501dc40837761a51421af33be796a Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Wed, 18 May 2022 22:07:30 +0800
Subject: [PATCH 03/48] update clue finetune.

---
 .../clue/classification/run_clue_classifier_trainer.py      | 2 +-
 paddlenlp/transformers/ernie/modeling.py                    | 3 +++
 paddlenlp/transformers/tokenizer_utils_base.py              | 6 +++---
 3 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/examples/benchmark/clue/classification/run_clue_classifier_trainer.py b/examples/benchmark/clue/classification/run_clue_classifier_trainer.py
index 2efca4c68d2a..8264fd9b7479 100644
--- a/examples/benchmark/clue/classification/run_clue_classifier_trainer.py
+++ b/examples/benchmark/clue/classification/run_clue_classifier_trainer.py
@@ -288,7 +288,7 @@ def compute_metrics(p):
     if training_args.do_train:
         train_result = trainer.train(resume_from_checkpoint=checkpoint)
         metrics = train_result.metrics
-        trainer.save_model()  # Saves the tokenizer too for easy upload
+        # trainer.save_model()  # Saves the tokenizer too for easy upload
         trainer.log_metrics("train", metrics)
         trainer.save_metrics("train", metrics)
         trainer.save_state()
diff --git a/paddlenlp/transformers/ernie/modeling.py b/paddlenlp/transformers/ernie/modeling.py
index 21cc05f9abc2..7bb4d573069c 100644
--- a/paddlenlp/transformers/ernie/modeling.py
+++ b/paddlenlp/transformers/ernie/modeling.py
@@ -312,6 +312,9 @@ class ErniePretrainedModel(PretrainedModel):
         "model_state": {
             "ernie-1.0":
             "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/ernie_v1_chn_base.pdparams",
+            "ernie-1.0-large":
+            # "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/ernie-1.0-large-dp16-gb1024-phase2-0507-110w.pdparams",
+            "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/ernie-1.0-large-dp16-gb1024-phase2-0511-60w.pdparams",
             "ernie-tiny":
             "https://bj.bcebos.com/paddlenlp/models/transformers/ernie_tiny/ernie_tiny.pdparams",
             "ernie-2.0-en":
diff --git a/paddlenlp/transformers/tokenizer_utils_base.py b/paddlenlp/transformers/tokenizer_utils_base.py
index 7768c36d7b31..19f6d0d1ce88 100644
--- a/paddlenlp/transformers/tokenizer_utils_base.py
+++ b/paddlenlp/transformers/tokenizer_utils_base.py
@@ -543,8 +543,8 @@ def token_to_chars(self,
         else:
             batch_index = 0
             token_index = batch_or_token_index
-        return CharSpan(*(self._encodings[batch_index].token_to_chars(
-            token_index)))
+        return CharSpan(*(
+            self._encodings[batch_index].token_to_chars(token_index)))
 
     def char_to_token(self,
                       batch_or_char_index: int,
@@ -2884,7 +2884,7 @@ def truncate_sequences(
                     )
                 logger.error(error_msg)
         elif truncation_strategy == TruncationStrategy.LONGEST_FIRST:
-            logger.warning(
+            warnings.warn(
                 f"Be aware, overflowing tokens are not returned for the setting you have chosen,"
                 f" i.e. sequence pairs with the '{TruncationStrategy.LONGEST_FIRST.value}' "
                 f"truncation strategy. So the returned list will always be empty even if some "

From 80c4ac94d817a4177059de5421cefb75b11b8a02 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Thu, 19 May 2022 16:42:16 +0800
Subject: [PATCH 04/48] delete unused code.

---
 .../run_clue_classifier_trainer.py            |  3 +++
 paddlenlp/trainer/trainer_base.py             | 22 +++----------------
 2 files changed, 6 insertions(+), 19 deletions(-)

diff --git a/examples/benchmark/clue/classification/run_clue_classifier_trainer.py b/examples/benchmark/clue/classification/run_clue_classifier_trainer.py
index 8264fd9b7479..2623e620e593 100644
--- a/examples/benchmark/clue/classification/run_clue_classifier_trainer.py
+++ b/examples/benchmark/clue/classification/run_clue_classifier_trainer.py
@@ -266,6 +266,9 @@ def compute_metrics(p):
         metric.update(result)
         accu = metric.accumulate()
         metric.reset()
+        del metric
+        del preds
+        del label
         return {"accuracy": accu}
 
     trainer = Trainer(
diff --git a/paddlenlp/trainer/trainer_base.py b/paddlenlp/trainer/trainer_base.py
index 5962ebd374fc..b8a0d85b2924 100644
--- a/paddlenlp/trainer/trainer_base.py
+++ b/paddlenlp/trainer/trainer_base.py
@@ -510,8 +510,9 @@ def train(
                         tr_loss_step = self.training_step(model, inputs)
                 else:
                     tr_loss_step = self.training_step(model, inputs)
+                del inputs
 
-                tr_loss += tr_loss_step
+                tr_loss.add_(tr_loss_step)
 
                 if (step + 1) % args.gradient_accumulation_steps == 0 or (
                         # last step in epoch but step is always smaller than gradient_accumulation_steps
@@ -556,10 +557,6 @@ def train(
             if self.control.should_training_stop:
                 break
 
-        if args.past_index and hasattr(self, "_past"):
-            # Clean the state at the end of training
-            delattr(self, "_past")
-
         logger.info("\nTraining completed. \n")
         if args.load_best_model_at_end and self.state.best_model_checkpoint is not None:
             if args.local_rank != -1:
@@ -935,8 +932,6 @@ def _prepare_inputs(self, inputs: Dict[str, Union[paddle.Tensor, Any]]
         handling potential state.
         """
         inputs = self._prepare_input(inputs)
-        if self.args.past_index >= 0 and self._past is not None:
-            inputs["mems"] = self._past
 
         return inputs
 
@@ -980,11 +975,6 @@ def compute_loss(self, model, inputs, return_outputs=False):
             loss = self.criterion(outputs, labels)
             outputs = (loss, outputs)
 
-        # Save past state if it exists
-        # TODO: this needs to be fixed and made cleaner later.
-        if self.args.past_index >= 0:
-            self._past = outputs[self.args.past_index]
-
         # We don't use .loss here since the model may return tuples instead of ModelOutput.
         loss = outputs["loss"] if isinstance(outputs, dict) else outputs[0]
 
@@ -1011,7 +1001,7 @@ def training_step(
             `paddle.Tensor`: The tensor with training loss on this batch.
         """
         model.train()
-        inputs = self._prepare_inputs(inputs)
+        # inputs = self._prepare_inputs(inputs)
 
         with self.autocast_smart_context_manager():
             loss = self.compute_loss(model, inputs)
@@ -1412,9 +1402,6 @@ def evaluation_loop(
         # Do this before wrapping.
         eval_dataset = dataloader.dataset
 
-        if args.past_index >= 0:
-            self._past = None
-
         # Initialize containers
         # losses/preds/labels on GPU (accumulated for eval_accumulation_steps)
         losses_host = None
@@ -1624,9 +1611,6 @@ def prediction_step(
                                    if k not in ignore_keys)
                 else:
                     logits = outputs
-                # TODO: this needs to be fixed and made cleaner later.
-                if self.args.past_index >= 0:
-                    self._past = outputs[self.args.past_index - 1]
 
         if prediction_loss_only:
             return (loss, None, None)

From 8921177a1116c415bed32347ffa373b1e749563d Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Wed, 25 May 2022 20:14:00 +0800
Subject: [PATCH 05/48] update

---
 .../run_clue_classifier_trainer.py            | 10 ++++--
 examples/benchmark/clue/mrc/run_c3.py         | 26 ++++++++------
 examples/benchmark/clue/mrc/run_chid.py       | 26 ++++++++------
 examples/benchmark/clue/mrc/run_cmrc.py       | 34 +++++++++++--------
 4 files changed, 60 insertions(+), 36 deletions(-)

diff --git a/examples/benchmark/clue/classification/run_clue_classifier_trainer.py b/examples/benchmark/clue/classification/run_clue_classifier_trainer.py
index 2623e620e593..10fb45069e86 100644
--- a/examples/benchmark/clue/classification/run_clue_classifier_trainer.py
+++ b/examples/benchmark/clue/classification/run_clue_classifier_trainer.py
@@ -154,12 +154,18 @@ def convert_clue(example,
     if tokenizer is None:
         return example
     if 'sentence' in example:
-        example = tokenizer(example['sentence'], max_seq_len=max_seq_length)
+        example = tokenizer(
+            example['sentence'],
+            padding=True,
+            truncation=True,
+            max_length=max_seq_length)
     elif 'sentence1' in example:
         example = tokenizer(
             example['sentence1'],
             text_pair=example['sentence2'],
-            max_seq_len=max_seq_length)
+            padding=True,
+            truncation=True,
+            max_length=max_seq_length)
 
     if not is_test:
         return {
diff --git a/examples/benchmark/clue/mrc/run_c3.py b/examples/benchmark/clue/mrc/run_c3.py
index ece49c921e07..6ecc28a2c46e 100644
--- a/examples/benchmark/clue/mrc/run_c3.py
+++ b/examples/benchmark/clue/mrc/run_c3.py
@@ -31,6 +31,9 @@
 from paddlenlp.transformers import LinearDecayWithWarmup
 from paddlenlp.transformers import AutoModelForMultipleChoice, AutoTokenizer
 
+from datasets import set_caching_enabled
+set_caching_enabled(False)
+
 
 def parse_args():
     parser = argparse.ArgumentParser()
@@ -100,7 +103,7 @@ def parse_args():
         help="Batch size per GPU/CPU for training.", )
     parser.add_argument(
         "--eval_batch_size",
-        default=32,
+        default=16,
         type=int,
         help="Batch size per GPU/CPU for training.", )
     parser.add_argument(
@@ -260,7 +263,8 @@ def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length):
         train_ds = train_ds.map(preprocess_function,
                                 batched=True,
                                 batch_size=len(train_ds),
-                                num_proc=1,
+                                num_proc=4,
+                                load_from_cache_file=False,
                                 remove_columns=column_names)
         batchify_fn = lambda samples, fn=Dict({
             'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
@@ -280,7 +284,8 @@ def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length):
                             batched=True,
                             batch_size=len(dev_ds),
                             remove_columns=column_names,
-                            num_proc=1)
+                            load_from_cache_file=False,
+                            num_proc=4)
         dev_batch_sampler = paddle.io.BatchSampler(
             dev_ds, batch_size=args.eval_batch_size, shuffle=False)
         dev_data_loader = paddle.io.DataLoader(
@@ -340,12 +345,12 @@ def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length):
                   (acc, time.time() - tic_eval))
             if paddle.distributed.get_rank() == 0 and acc > best_acc:
                 best_acc = acc
-                model_to_save = model._layers if isinstance(
-                    model, paddle.DataParallel) else model
-                if not os.path.exists(args.output_dir):
-                    os.makedirs(args.output_dir)
-                model_to_save.save_pretrained(args.output_dir)
-                tokenizer.save_pretrained(args.output_dir)
+                # model_to_save = model._layers if isinstance(
+                #     model, paddle.DataParallel) else model
+                # if not os.path.exists(args.output_dir):
+                #     os.makedirs(args.output_dir)
+                # model_to_save.save_pretrained(args.output_dir)
+                # tokenizer.save_pretrained(args.output_dir)
         print("best_acc: ", best_acc)
 
     if args.do_predict:
@@ -355,7 +360,8 @@ def _truncate_seq_tuple(tokens_a, tokens_b, tokens_c, max_length):
                               batched=True,
                               batch_size=len(test_ds),
                               remove_columns=column_names,
-                              num_proc=1)
+                              load_from_cache_file=False,
+                              num_proc=4)
         # Serveral samples have more than four choices.
         test_batch_sampler = paddle.io.BatchSampler(
             test_ds, batch_size=1, shuffle=False)
diff --git a/examples/benchmark/clue/mrc/run_chid.py b/examples/benchmark/clue/mrc/run_chid.py
index 6da515a672fc..602f76b86c51 100644
--- a/examples/benchmark/clue/mrc/run_chid.py
+++ b/examples/benchmark/clue/mrc/run_chid.py
@@ -31,6 +31,9 @@
 from paddlenlp.transformers import AutoModelForMultipleChoice, AutoTokenizer
 from paddlenlp.transformers import LinearDecayWithWarmup
 
+from datasets import set_caching_enabled
+set_caching_enabled(False)
+
 
 def parse_args():
     parser = argparse.ArgumentParser()
@@ -100,7 +103,7 @@ def parse_args():
         help="Batch size per GPU/CPU for training.", )
     parser.add_argument(
         "--eval_batch_size",
-        default=24,
+        default=12,
         type=int,
         help="Batch size per GPU/CPU for training.", )
     parser.add_argument(
@@ -397,7 +400,8 @@ def add_tokens_for_around(tokens, pos, num_tokens):
         train_ds = train_ds.map(partial(preprocess_function),
                                 batched=True,
                                 batch_size=len(train_ds),
-                                num_proc=1,
+                                num_proc=4,
+                                load_from_cache_file=False,
                                 remove_columns=column_names)
         batchify_fn = lambda samples, fn=Dict({
             'input_ids': Pad(axis=1, pad_val=tokenizer.pad_token_id),  # input
@@ -419,7 +423,8 @@ def add_tokens_for_around(tokens, pos, num_tokens):
                             batched=True,
                             batch_size=len(dev_ds),
                             remove_columns=column_names,
-                            num_proc=1)
+                            load_from_cache_file=False,
+                            num_proc=4)
 
         dev_batch_sampler = paddle.io.BatchSampler(
             dev_ds, batch_size=args.eval_batch_size, shuffle=False)
@@ -481,12 +486,12 @@ def add_tokens_for_around(tokens, pos, num_tokens):
                   (acc, time.time() - tic_eval))
             if paddle.distributed.get_rank() == 0 and acc > best_acc:
                 best_acc = acc
-                model_to_save = model._layers if isinstance(
-                    model, paddle.DataParallel) else model
-                if not os.path.exists(args.output_dir):
-                    os.makedirs(args.output_dir)
-                model_to_save.save_pretrained(args.output_dir)
-                tokenizer.save_pretrained(args.output_dir)
+                # model_to_save = model._layers if isinstance(
+                #     model, paddle.DataParallel) else model
+                # if not os.path.exists(args.output_dir):
+                #     os.makedirs(args.output_dir)
+                # model_to_save.save_pretrained(args.output_dir)
+                # tokenizer.save_pretrained(args.output_dir)
         print("best_acc: ", best_acc)
 
     if args.do_predict:
@@ -496,7 +501,8 @@ def add_tokens_for_around(tokens, pos, num_tokens):
                               batched=True,
                               batch_size=len(test_ds),
                               remove_columns=column_names,
-                              num_proc=1)
+                              load_from_cache_file=False,
+                              num_proc=4)
         test_batch_sampler = paddle.io.BatchSampler(
             test_ds, batch_size=args.eval_batch_size, shuffle=False)
 
diff --git a/examples/benchmark/clue/mrc/run_cmrc.py b/examples/benchmark/clue/mrc/run_cmrc.py
index 7f587da8762f..22edf1681498 100644
--- a/examples/benchmark/clue/mrc/run_cmrc.py
+++ b/examples/benchmark/clue/mrc/run_cmrc.py
@@ -233,7 +233,9 @@ def run(args):
     set_seed(args)
 
     train_examples, dev_examples, test_examples = load_dataset(
-        'cmrc2018', split=["train", "validation", "test"])
+        'cmrc2018',
+        split=["train", "validation", "test"],
+        cache_dir="./cache_dir")
 
     column_names = train_examples.column_names
     if rank == 0:
@@ -374,7 +376,8 @@ def prepare_validation_features(examples):
         train_ds = train_examples.map(prepare_train_features,
                                       batched=True,
                                       remove_columns=column_names,
-                                      num_proc=1)
+                                      load_from_cache_file=False,
+                                      num_proc=4)
         train_batch_sampler = paddle.io.DistributedBatchSampler(
             train_ds, batch_size=args.batch_size, shuffle=True)
         train_batchify_fn = lambda samples, fn=Dict({
@@ -392,7 +395,8 @@ def prepare_validation_features(examples):
         dev_ds = dev_examples.map(prepare_validation_features,
                                   batched=True,
                                   remove_columns=column_names,
-                                  num_proc=1)
+                                  load_from_cache_file=False,
+                                  num_proc=4)
         dev_batch_sampler = paddle.io.BatchSampler(
             dev_ds, batch_size=args.eval_batch_size, shuffle=False)
         dev_batchify_fn = lambda samples, fn=Dict({
@@ -455,16 +459,17 @@ def prepare_validation_features(examples):
 
                     if global_step % args.save_steps == 0 or global_step == num_training_steps:
                         if rank == 0:
-                            output_dir = os.path.join(args.output_dir,
-                                                      "model_%d" % global_step)
-                            if not os.path.exists(output_dir):
-                                os.makedirs(output_dir)
-                            # need better way to get inner model of DataParallel
-                            model_to_save = model._layers if isinstance(
-                                model, paddle.DataParallel) else model
-                            model_to_save.save_pretrained(output_dir)
-                            tokenizer.save_pretrained(output_dir)
-                            print('Saving checkpoint to:', output_dir)
+                            pass
+                            # output_dir = os.path.join(args.output_dir,
+                            #                           "model_%d" % global_step)
+                            # if not os.path.exists(output_dir):
+                            #     os.makedirs(output_dir)
+                            # # need better way to get inner model of DataParallel
+                            # model_to_save = model._layers if isinstance(
+                            #     model, paddle.DataParallel) else model
+                            # model_to_save.save_pretrained(output_dir)
+                            # tokenizer.save_pretrained(output_dir)
+                            # print('Saving checkpoint to:', output_dir)
                         if global_step == num_training_steps:
                             break
             evaluate(model, dev_examples, dev_data_loader, args)
@@ -473,7 +478,8 @@ def prepare_validation_features(examples):
         test_ds = test_examples.map(prepare_validation_features,
                                     batched=True,
                                     remove_columns=column_names,
-                                    num_proc=1)
+                                    load_from_cache_file=False,
+                                    num_proc=4)
         test_batch_sampler = paddle.io.BatchSampler(
             test_ds, batch_size=args.eval_batch_size, shuffle=False)
         test_batchify_fn = lambda samples, fn=Dict({

From c229380a114e80a228adf7ca4a8047e393e77306 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Thu, 2 Jun 2022 15:59:09 +0800
Subject: [PATCH 06/48] support ernie pretraining without nsp.

---
 model_zoo/ernie-1.0/run_gb512_s1m_trainer.sh |   1 +
 model_zoo/ernie-1.0/run_pretrain.py          | 145 +++++++++++++------
 model_zoo/ernie-1.0/run_pretrain_trainer.py  |  62 ++++++--
 paddlenlp/transformers/ernie/modeling.py     |  13 +-
 4 files changed, 153 insertions(+), 68 deletions(-)

diff --git a/model_zoo/ernie-1.0/run_gb512_s1m_trainer.sh b/model_zoo/ernie-1.0/run_gb512_s1m_trainer.sh
index 2b294475ff99..8ca0f0320548 100644
--- a/model_zoo/ernie-1.0/run_gb512_s1m_trainer.sh
+++ b/model_zoo/ernie-1.0/run_gb512_s1m_trainer.sh
@@ -21,6 +21,7 @@ python -u  -m paddle.distributed.launch \
     --fp16  \
     --fp16_opt_level "O2"  \
     --learning_rate 0.0001 \
+    --min_learning_rate 0.00001 \
     --max_steps 1000000 \
     --save_steps 50000 \
     --weight_decay 0.01 \
diff --git a/model_zoo/ernie-1.0/run_pretrain.py b/model_zoo/ernie-1.0/run_pretrain.py
index 0ff0015a7df1..334be92d0600 100644
--- a/model_zoo/ernie-1.0/run_pretrain.py
+++ b/model_zoo/ernie-1.0/run_pretrain.py
@@ -30,7 +30,7 @@
 from paddle.io import DataLoader, Dataset
 from visualdl import LogWriter
 
-from paddlenlp.transformers import ErnieModel, ErnieForPretraining, ErniePretrainingCriterion, ErnieTokenizer
+from paddlenlp.transformers import ErnieModel, ErnieForPretraining, ErniePretrainingCriterion, ErnieTokenizer, ErnieForMaskedLM
 from paddlenlp.transformers import CosineAnnealingWithWarmupDecay, LinearAnnealingWithWarmupDecay
 from paddlenlp.utils.batch_sampler import DistributedBatchSampler
 from paddlenlp.data import Stack, Tuple, Pad
@@ -55,6 +55,7 @@ def create_pretrained_dataset(
         max_seq_len,
         places=None,
         data_holders=None,
+        binary_head=True,
         current_step=0, ):
 
     train_valid_test_num_samples = [
@@ -74,7 +75,7 @@ def create_pretrained_dataset(
         short_seq_prob=args.short_seq_prob,
         seed=args.seed,
         skip_warmup=True,
-        binary_head=True,
+        binary_head=binary_head,
         max_seq_length_dec=None,
         dataset_type='ernie')
 
@@ -136,12 +137,26 @@ def loader(dataset, consumed_samples=0):
 
 
 def get_train_data_file(args):
-    files = [
-        os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
-        if (os.path.isfile(os.path.join(args.input_dir, f)) and "_idx.npz" in
-            str(f))
-    ]
-    files = [x.replace("_idx.npz", "") for x in files]
+    if len(args.input_dir.split()) > 1:
+        # weight-1 data-prefix-1 weight-2 data-prefix-2 ...
+        return args.input_dir.split()
+    else:
+        files = [
+            os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
+            if (os.path.isfile(os.path.join(args.input_dir, f)) and "_idx.npz"
+                in str(f))
+        ]
+        files = [x.replace("_idx.npz", "") for x in files]
+
+        if len(files) > 1:
+            ret = []
+            logger.info("You are using multi-dataset:")
+            for x in files:
+                ret.append(1.0)
+                ret.append(x)
+                logger.info("    > set weight of %s dataset to 1.0" % x)
+            return ret
+
     return files
 
 
@@ -166,11 +181,14 @@ def run_evaluate(data_loader,
     model.eval()
     all_loss, all_lm_loss, all_sop_loss = [], [], []
 
-    loss_global = {
-        "loss": paddle.to_tensor(0.0),
-        "lm_loss": paddle.to_tensor(0.0),
-        "sop_loss": paddle.to_tensor(0.0),
-    }
+    if args.binary_head:
+        loss_global = {
+            "loss": paddle.to_tensor(0.0),
+            "lm_loss": paddle.to_tensor(0.0),
+            "sop_loss": paddle.to_tensor(0.0),
+        }
+    else:
+        loss_global = {"loss": paddle.to_tensor(0.0), }
 
     local_time = time.time()
 
@@ -186,13 +204,19 @@ def run_evaluate(data_loader,
             attention_mask=input_mask,
             masked_positions=masked_lm_positions)
 
-        lm_loss, sop_loss = criterion(prediction_scores, seq_relationship_score,
-                                      masked_lm_labels, next_sentence_labels)
-        loss = lm_loss + sop_loss
+        if args.binary_head:
+            lm_loss, sop_loss = criterion(
+                prediction_scores, seq_relationship_score, masked_lm_labels,
+                next_sentence_labels)
+            loss = lm_loss + sop_loss
+        else:
+            loss = criterion(prediction_scores, seq_relationship_score,
+                             masked_lm_labels)
 
         loss_global["loss"] += loss.detach()
-        loss_global["lm_loss"] += lm_loss.detach()
-        loss_global["sop_loss"] += sop_loss.detach()
+        if args.binary_head:
+            loss_global["lm_loss"] += lm_loss.detach()
+            loss_global["sop_loss"] += sop_loss.detach()
 
         if eval_step >= iter_steps - 1:
             log_info_dict = dict()
@@ -203,12 +227,14 @@ def run_evaluate(data_loader,
                 log_info_dict[
                     "samples_per_second"] = iter_steps * args.micro_batch_size / (
                         time.time() - local_time)
-                logger.info(
-                    "%s step %d, batch: %d, loss: %f, lm_loss: %.6f, sop_loss: %.6f, speed: %.0f seqs/s"
-                    % (task_name, global_step, iter_steps,
-                       log_info_dict["loss"], log_info_dict["lm_loss"],
-                       log_info_dict["sop_loss"],
-                       log_info_dict["samples_per_second"]))
+                loss_info = ", ".join([
+                    "{}: {:.6f}".format(k, log_info_dict[k])
+                    for k in log_info_dict.keys() if k.endswith("loss")
+                ])
+
+                logger.info("%s step %d, batch: %d, %s, speed: %.0f seqs/s" %
+                            (task_name, global_step, iter_steps, loss_info,
+                             log_info_dict["samples_per_second"]))
 
                 for k, v in log_info_dict.items():
                     log_writer.add_scalar("%s/%s" % (task_name, k), v,
@@ -303,6 +329,9 @@ def do_train(args):
     # Define the input data in the static mode
     base_class, model_class, criterion_class, tokenizer_class = MODEL_CLASSES[
         args.model_type]
+    if args.binary_head is False:
+        model_class = ErnieForMaskedLM
+
     pretrained_models_list = list(
         model_class.pretrained_init_configuration.keys())
 
@@ -333,7 +362,9 @@ def do_train(args):
             hidden_dropout_prob=args.hidden_dropout_prob,
             attention_probs_dropout_prob=args.attention_probs_dropout_prob)
 
-    criterion = criterion_class()
+    # criterion = criterion_class(with_nsp_loss=args.binary_head)
+
+    criterion = criterion_class(with_nsp_loss=args.binary_head)
 
     if worker_index == 0:
         # log the model config and args 
@@ -420,11 +451,15 @@ def do_train(args):
             logger.info("Checkpoint loaded from global step: {}".format(
                 global_step))
 
-    loss_global = {
-        "loss": paddle.to_tensor(0.0),
-        "lm_loss": paddle.to_tensor(0.0),
-        "sop_loss": paddle.to_tensor(0.0),
-    }
+    if args.binary_head:
+        loss_global = {
+            "loss": paddle.to_tensor(0.0),
+            "lm_loss": paddle.to_tensor(0.0),
+            "sop_loss": paddle.to_tensor(0.0),
+        }
+    else:
+        loss_global = {"loss": paddle.to_tensor(0.0), }
+
     tic_train = time.time()
     while True:
         # If not call valid_data_loader, the enumerate will call valid_data_loader
@@ -460,17 +495,26 @@ def do_train(args):
                     level='O2'):
 
                 # Create the model for the ernie pretrain
-                prediction_scores, seq_relationship_score = model(
-                    input_ids=input_ids,
-                    token_type_ids=segment_ids,
-                    position_ids=None,
-                    attention_mask=input_mask,
-                    masked_positions=masked_lm_positions)
-
-                lm_loss, sop_loss = criterion(
-                    prediction_scores, seq_relationship_score, masked_lm_labels,
-                    next_sentence_labels)
-                loss = lm_loss + sop_loss
+                if args.binary_head:
+                    prediction_scores, seq_relationship_score = model(
+                        input_ids=input_ids,
+                        token_type_ids=segment_ids,
+                        position_ids=None,
+                        attention_mask=input_mask,
+                        masked_positions=masked_lm_positions)
+                    lm_loss, sop_loss = criterion(
+                        prediction_scores, seq_relationship_score,
+                        masked_lm_labels, next_sentence_labels)
+                    loss = lm_loss + sop_loss
+                else:
+                    prediction_scores = model(
+                        input_ids=input_ids,
+                        token_type_ids=segment_ids,
+                        position_ids=None,
+                        attention_mask=input_mask,
+                        masked_positions=masked_lm_positions)
+
+                    loss = criterion(prediction_scores, None, masked_lm_labels)
 
             if args.use_amp:
                 scaler.scale(loss).backward()
@@ -489,8 +533,9 @@ def do_train(args):
             global_step += 1
 
             loss_global["loss"] += loss.detach()
-            loss_global["lm_loss"] += lm_loss.detach()
-            loss_global["sop_loss"] += sop_loss.detach()
+            if args.binary_head:
+                loss_global["lm_loss"] += lm_loss.detach()
+                loss_global["sop_loss"] += sop_loss.detach()
 
             if global_step % args.logging_freq == 0:
                 log_info_dict = dict()
@@ -508,10 +553,14 @@ def do_train(args):
                     for k, v in log_info_dict.items():
                         log_writer.add_scalar("train/%s" % k, v, global_step)
 
-                    common_loginfo = "global step %d, loss: %.9f, lm_loss: %.6f, sop_loss: %.6f, speed: %.2f steps/s, ips: %.2f seqs/s, learning rate: %.5e" % (
-                        global_step, log_info_dict["loss"],
-                        log_info_dict["lm_loss"], log_info_dict["sop_loss"],
-                        speed, log_info_dict["samples_per_second"],
+                    loss_info = ", ".join([
+                        "{}: {:.6f}".format(k, log_info_dict[k])
+                        for k in log_info_dict.keys() if k.endswith("loss")
+                    ])
+
+                    common_loginfo = "global step %d, %s, speed: %.2f steps/s, ips: %.2f seqs/s, learning rate: %.5e" % (
+                        global_step, loss_info, speed,
+                        log_info_dict["samples_per_second"],
                         log_info_dict["learning_rate"])
 
                     addition_info = ""
@@ -521,7 +570,7 @@ def do_train(args):
                             "incr_count": scaler._incr_count,
                             "decr_count": scaler._decr_count
                         }
-                        addition_info = ", ".join("%s: %d" % (k, v)
+                        addition_info = ", ".join("%s: %.2f" % (k, v)
                                                   for k, v in amp_info.items())
                         addition_info = " " + addition_info
                         for k, v in amp_info.items():
diff --git a/model_zoo/ernie-1.0/run_pretrain_trainer.py b/model_zoo/ernie-1.0/run_pretrain_trainer.py
index d1462dfd8aba..1762eb697f2c 100644
--- a/model_zoo/ernie-1.0/run_pretrain_trainer.py
+++ b/model_zoo/ernie-1.0/run_pretrain_trainer.py
@@ -140,7 +140,12 @@ class ModelArguments:
         })
 
 
-def create_pretrained_dataset(data_args, training_args, data_file, tokenizer):
+def create_pretrained_dataset(
+        data_args,
+        training_args,
+        data_file,
+        tokenizer,
+        binary_head=True, ):
 
     train_valid_test_num_samples = [
         training_args.per_device_train_batch_size * training_args.world_size *
@@ -162,7 +167,7 @@ def create_pretrained_dataset(data_args, training_args, data_file, tokenizer):
         short_seq_prob=data_args.short_seq_prob,
         seed=training_args.seed,
         skip_warmup=True,
-        binary_head=True,
+        binary_head=binary_head,
         max_seq_length_dec=None,
         dataset_type='ernie')
 
@@ -206,12 +211,26 @@ def _collate_data(data, stack_fn=Stack()):
 
 
 def get_train_data_file(args):
-    files = [
-        os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
-        if (os.path.isfile(os.path.join(args.input_dir, f)) and "_idx.npz" in
-            str(f))
-    ]
-    files = [x.replace("_idx.npz", "") for x in files]
+    if len(args.input_dir.split()) > 1:
+        # weight-1 data-prefix-1 weight-2 data-prefix-2 ...
+        return args.input_dir.split()
+    else:
+        files = [
+            os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
+            if (os.path.isfile(os.path.join(args.input_dir, f)) and "_idx.npz"
+                in str(f))
+        ]
+        files = [x.replace("_idx.npz", "") for x in files]
+
+        if len(files) > 1:
+            ret = []
+            logger.info("You are using multi-dataset:")
+            for x in files:
+                ret.append(1.0)
+                ret.append(x)
+                logger.info("    > set weight of %s dataset to 1.0" % x)
+            return ret
+
     return files
 
 
@@ -330,6 +349,10 @@ def main():
 
     base_class, model_class, criterion_class, tokenizer_class = MODEL_CLASSES[
         model_args.model_type]
+
+    if model_args.binary_head is False:
+        model_class = ErnieForMaskedLM
+
     pretrained_models_list = list(
         model_class.pretrained_init_configuration.keys())
 
@@ -355,7 +378,8 @@ def __init__(self):
             """CriterionWrapper
             """
             super(CriterionWrapper, self).__init__()
-            self.criterion = criterion_class()
+            self.criterion = criterion_class(
+                with_nsp_loss=model_args.binary_head)
 
         def forward(self, output, labels):
             """forward function
@@ -367,14 +391,22 @@ def forward(self, output, labels):
             Returns:
                 Tensor: final loss.
             """
-            prediction_scores, seq_relationship_score = output
             masked_lm_labels, next_sentence_labels = labels
+            if model_args.binary_head:
+                prediction_scores, seq_relationship_score = output
+
+                lm_loss, sop_loss = self.criterion(
+                    prediction_scores, seq_relationship_score, masked_lm_labels,
+                    next_sentence_labels)
+
+                loss = lm_loss + sop_loss
 
-            lm_loss, sop_loss = self.criterion(
-                prediction_scores, seq_relationship_score, masked_lm_labels,
-                next_sentence_labels)
+            else:
+                prediction_scores = output
+                print(prediction_scores)
+                loss = self.criterion(prediction_scores, None, masked_lm_labels)
+                print(loss)
 
-            loss = lm_loss + sop_loss
             return loss
 
     # Create the learning_rate sheduler and optimizer
@@ -392,7 +424,7 @@ def forward(self, output, labels):
     tokenizer = tokenizer_class.from_pretrained(model_args.model_name_or_path)
 
     train_dataset, eval_dataset, test_dataset, data_collator = create_pretrained_dataset(
-        data_args, training_args, data_file, tokenizer)
+        data_args, training_args, data_file, tokenizer, model_args.binary_head)
 
     trainer = PretrainingTrainer(
         model=model,
diff --git a/paddlenlp/transformers/ernie/modeling.py b/paddlenlp/transformers/ernie/modeling.py
index 937904c394fe..78a4943fd4a2 100644
--- a/paddlenlp/transformers/ernie/modeling.py
+++ b/paddlenlp/transformers/ernie/modeling.py
@@ -139,7 +139,7 @@ class ErniePretrainedModel(PretrainedModel):
             "hidden_dropout_prob": 0.1,
             "hidden_size": 768,
             "initializer_range": 0.02,
-            "max_position_embeddings": 513,
+            "max_position_embeddings": 512,
             "num_attention_heads": 12,
             "num_hidden_layers": 12,
             "type_vocab_size": 2,
@@ -413,8 +413,6 @@ class ErniePretrainedModel(PretrainedModel):
             # Deprecated, alias for ernie-1.0-base-zh
             "ernie-1.0":
             "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/ernie_v1_chn_base.pdparams",
-            "ernie-1.0-base-zh":
-            "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/ernie_v1_chn_base.pdparams",
             "ernie-1.0-large-zh":
             "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/ernie_1.0_large_zh.pdparams",
             "ernie-tiny":
@@ -1005,6 +1003,7 @@ def forward(self,
             sequence_output, pooled_output = outputs[:2]
             prediction_scores, seq_relationship_score = self.cls(
                 sequence_output, pooled_output, masked_positions)
+
             return prediction_scores, seq_relationship_score
 
 
@@ -1103,7 +1102,8 @@ def forward(self,
                 input_ids,
                 token_type_ids=None,
                 position_ids=None,
-                attention_mask=None):
+                attention_mask=None,
+                masked_positions=None):
         r"""
 
         Args:
@@ -1115,6 +1115,8 @@ def forward(self,
                 See :class:`ErnieModel`.
             attention_mask (Tensor, optional):
                 See :class:`ErnieModel`.
+            masked_positions (optional):
+                Masked positions passed to `self.cls` when computing `prediction_scores`.
 
         Returns:
             Tensor: Returns tensor `prediction_scores`, The scores of masked token prediction.
@@ -1144,7 +1146,8 @@ def forward(self,
             position_ids=position_ids,
             attention_mask=attention_mask)
         sequence_output = outputs[0]
-        prediction_scores = self.cls(sequence_output, masked_positions=None)
+        prediction_scores = self.cls(sequence_output,
+                                     masked_positions=masked_positions)
         return prediction_scores
 
 

From c5c48282f3ca5238614930f7206b276f3bcb79d3 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Thu, 2 Jun 2022 19:51:42 +0800
Subject: [PATCH 07/48] fix evaluation

---
 model_zoo/ernie-1.0/run_pretrain.py | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/model_zoo/ernie-1.0/run_pretrain.py b/model_zoo/ernie-1.0/run_pretrain.py
index 334be92d0600..1144cd6cd5ea 100644
--- a/model_zoo/ernie-1.0/run_pretrain.py
+++ b/model_zoo/ernie-1.0/run_pretrain.py
@@ -196,22 +196,27 @@ def run_evaluate(data_loader,
         input_ids, segment_ids, input_mask, masked_lm_positions, \
         masked_lm_labels, next_sentence_labels = batch
 
-        # Create the model for the gpt pretrain
-        prediction_scores, seq_relationship_score = model(
-            input_ids=input_ids,
-            token_type_ids=segment_ids,
-            position_ids=None,
-            attention_mask=input_mask,
-            masked_positions=masked_lm_positions)
-
         if args.binary_head:
+            prediction_scores, seq_relationship_score = model(
+                input_ids=input_ids,
+                token_type_ids=segment_ids,
+                position_ids=None,
+                attention_mask=input_mask,
+                masked_positions=masked_lm_positions)
+
             lm_loss, sop_loss = criterion(
                 prediction_scores, seq_relationship_score, masked_lm_labels,
                 next_sentence_labels)
             loss = lm_loss + sop_loss
         else:
-            loss = criterion(prediction_scores, seq_relationship_score,
-                             masked_lm_labels)
+            prediction_scores = model(
+                input_ids=input_ids,
+                token_type_ids=segment_ids,
+                position_ids=None,
+                attention_mask=input_mask,
+                masked_positions=masked_lm_positions)
+
+            loss = criterion(prediction_scores, None, masked_lm_labels)
 
         loss_global["loss"] += loss.detach()
         if args.binary_head:

From c6f406d5fab2909c2f9ba87dad4e7e7b38ab2d0f Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Tue, 14 Jun 2022 11:43:02 +0800
Subject: [PATCH 08/48] fix amp o2 save_dtype bugs.

---
 model_zoo/ernie-1.0/run_pretrain.py         | 44 +++++++++++++++++----
 model_zoo/ernie-1.0/run_pretrain_trainer.py |  4 +-
 2 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/model_zoo/ernie-1.0/run_pretrain.py b/model_zoo/ernie-1.0/run_pretrain.py
index 1144cd6cd5ea..f1250282b749 100644
--- a/model_zoo/ernie-1.0/run_pretrain.py
+++ b/model_zoo/ernie-1.0/run_pretrain.py
@@ -418,8 +418,9 @@ def do_train(args):
     if args.use_amp:
         scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
         scaler = fleet.distributed_scaler(scaler)
-        model = paddle.amp.decorate(
-            models=model, level='O2', save_dtype='float32')
+        model = paddle.amp.decorate(models=model, level='O2')
+    else:
+        scaler = None
 
     if paddle.distributed.get_world_size() > 1:
         model = fleet.distributed_model(model)
@@ -446,13 +447,28 @@ def do_train(args):
             params_path = os.path.join(checkpoint_dir, "model_state.pdparams")
 
             if os.path.exists(opt_path):
+                load_dict = paddle.load(params_path)
+                model_dict = model.state_dict()
+                if args.use_amp:
+                    for k, v in load_dict.items():
+                        if k not in model_dict:
+                            logger.warning(
+                                f"Checkpoint have too much keys: {k}")
+                            continue
+                        if "layer_norm" not in model_dict[k].name:
+                            load_dict[k] = v.astype("float16")
+                model.set_state_dict(load_dict)
                 opt_dict = paddle.load(opt_path)
                 optimizer.set_state_dict(opt_dict)
-                model_dict = paddle.load(params_path)
-                model.set_state_dict(model_dict)
             else:
                 logger.warning("No optimizer checkpoint file found in %s." %
                                opt_path)
+            if scaler is not None and os.path.isfile(
+                    os.path.join(checkpoint_dir, "scaler.pdparams")):
+                scaler.load_state_dict(
+                    paddle.load(
+                        os.path.join(checkpoint_dir, "scaler.pdparams"),
+                        return_numpy=True))
             logger.info("Checkpoint loaded from global step: {}".format(
                 global_step))
 
@@ -602,7 +618,8 @@ def do_train(args):
                     task_name="valid")
                 tic_train = time.time()
 
-            def save_ckpt(output_dir, model, tokenizer, args, global_step):
+            def save_ckpt(output_dir, model, tokenizer, optimizer, scaler, args,
+                          global_step):
                 step_config = {
                     "model_name": args.model_name_or_path,
                     "global_step": global_step,
@@ -614,8 +631,17 @@ def save_ckpt(output_dir, model, tokenizer, args, global_step):
                 model_to_save = model._layers if isinstance(
                     model, paddle.DataParallel) else model
 
-                model_to_save.save_pretrained(output_dir)
                 tokenizer.save_pretrained(output_dir)
+                model_to_save.save_model_config(output_dir)
+                model_dict = model_to_save.state_dict()
+                if scaler is not None:
+                    paddle.save(scaler.state_dict(),
+                                os.path.join(output_dir, "scaler.pdparams"))
+                    for k, v in model_dict.items():
+                        if v.dtype is paddle.float16:
+                            model_dict[k] = v.astype("float32")
+                paddle.save(model_dict,
+                            os.path.join(output_dir, "model_state.pdparams"))
                 paddle.save(optimizer.state_dict(),
                             os.path.join(output_dir, "model_state.pdopt"))
 
@@ -627,7 +653,8 @@ def save_ckpt(output_dir, model, tokenizer, args, global_step):
                 output_dir = os.path.join(args.output_dir,
                                           "model_%d" % global_step)
                 if worker_index == 0:
-                    save_ckpt(output_dir, model, tokenizer, args, global_step)
+                    save_ckpt(output_dir, model, tokenizer, optimizer, scaler,
+                              args, global_step)
 
                 if worker_num > 1:
                     paddle.distributed.barrier()
@@ -645,7 +672,8 @@ def save_ckpt(output_dir, model, tokenizer, args, global_step):
                             shutil.rmtree(output_dir_bak)
                         shutil.move(output_dir, output_dir_bak)
                         os.mkdir(output_dir)
-                    save_ckpt(output_dir, model, tokenizer, args, global_step)
+                    save_ckpt(output_dir, model, tokenizer, optimizer, scaler,
+                              args, global_step)
 
                 if worker_num > 1:
                     paddle.distributed.barrier()
diff --git a/model_zoo/ernie-1.0/run_pretrain_trainer.py b/model_zoo/ernie-1.0/run_pretrain_trainer.py
index 1762eb697f2c..979d76716c5a 100644
--- a/model_zoo/ernie-1.0/run_pretrain_trainer.py
+++ b/model_zoo/ernie-1.0/run_pretrain_trainer.py
@@ -26,7 +26,7 @@
 
 import numpy as np
 import paddle
-from paddlenlp.transformers import ErnieModel, ErnieForPretraining, ErniePretrainingCriterion, ErnieTokenizer
+from paddlenlp.transformers import ErnieModel, ErnieForPretraining, ErniePretrainingCriterion, ErnieTokenizer, ErnieForMaskedLM
 from paddlenlp.transformers import CosineAnnealingWithWarmupDecay, LinearAnnealingWithWarmupDecay
 from paddlenlp.utils.batch_sampler import DistributedBatchSampler
 from paddlenlp.data import Stack, Tuple, Pad
@@ -403,9 +403,7 @@ def forward(self, output, labels):
 
             else:
                 prediction_scores = output
-                print(prediction_scores)
                 loss = self.criterion(prediction_scores, None, masked_lm_labels)
-                print(loss)
 
             return loss
 

From c9b991ce0d6e935e909c153cbafa5d0b6344283c Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Mon, 27 Jun 2022 15:54:51 +0800
Subject: [PATCH 09/48] extend ernie.

---
 paddlenlp/transformers/ernie/tokenizer.py | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/paddlenlp/transformers/ernie/tokenizer.py b/paddlenlp/transformers/ernie/tokenizer.py
index b82752222633..b1ca291b3466 100644
--- a/paddlenlp/transformers/ernie/tokenizer.py
+++ b/paddlenlp/transformers/ernie/tokenizer.py
@@ -236,6 +236,28 @@ def vocab_size(self):
         """
         return len(self.vocab)
 
+    def extend_chinese_char(self):
+        """
+        For char-level models such as ERNIE, we need to add ## Chinese tokens
+        to indicate the segment information.
+        """
+        vocab_set = set(self.vocab.token_to_idx.keys())
+        extend_list = []
+        for i in range(len(self.vocab)):
+            if i not in self.vocab.idx_to_token:
+                continue
+            w = self.vocab.idx_to_token[i]
+            if len(w) == 1 and ord(w) >= 0x4E00 and ord(w) <= 0x9FA5:
+                new_char = "##" + w
+                if new_char not in vocab_set:
+                    extend_list.append(new_char)
+        if len(self.vocab) + len(extend_list) > 2**16:
+            warnings.warn("The vocab size if larger than uint16")
+        verbose = self.verbose
+        self.verbose = False
+        self.add_tokens(extend_list)
+        self.verbose = verbose
+
     def _tokenize(self, text):
         r"""
         End-to-end tokenization for ERNIE models.

From e6fd720e7f0dca93b4c6d858ef072112c689f6e4 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Tue, 28 Jun 2022 13:50:05 +0800
Subject: [PATCH 10/48] fix ernie pretrain with ## vocab.

---
 model_zoo/ernie-1.0/args.py                   |  3 ++
 .../ernie-1.0/data_tools/ernie_dataset.py     |  3 ++
 model_zoo/ernie-1.0/run_pretrain.py           |  3 +-
 model_zoo/ernie-1.0/run_pretrain_static.py    |  1 +
 model_zoo/ernie-1.0/run_pretrain_trainer.py   |  1 +
 paddlenlp/transformers/bert/tokenizer.py      |  4 +--
 paddlenlp/transformers/ernie/tokenizer.py     | 28 ++++++++++++++++---
 7 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/model_zoo/ernie-1.0/args.py b/model_zoo/ernie-1.0/args.py
index ab9a6f1749ea..867e6472206b 100644
--- a/model_zoo/ernie-1.0/args.py
+++ b/model_zoo/ernie-1.0/args.py
@@ -34,6 +34,9 @@ def parse_args(MODEL_CLASSES):
     parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
         help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(
         sum([ list(classes[-1].pretrained_init_configuration.keys()) for classes in MODEL_CLASSES.values() ], [])),)
+    parser.add_argument("--tokenize_name_or_path", default=None, type=str, required=True,
+        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(
+        sum([ list(classes[-1].pretrained_init_configuration.keys()) for classes in MODEL_CLASSES.values() ], [])),)
 
     # Train I/O config
     parser.add_argument("--input_dir", default=None, type=str, required=True, help="The input directory where the data will be read from.", )
diff --git a/model_zoo/ernie-1.0/data_tools/ernie_dataset.py b/model_zoo/ernie-1.0/data_tools/ernie_dataset.py
index ecffce3aa01c..da732efd9581 100644
--- a/model_zoo/ernie-1.0/data_tools/ernie_dataset.py
+++ b/model_zoo/ernie-1.0/data_tools/ernie_dataset.py
@@ -79,6 +79,9 @@ def __init__(self,
         self.vocab_id_to_token_dict = tokenizer.vocab.idx_to_token
         self.vocab_token_to_id_dict = tokenizer.vocab.token_to_idx
 
+        self.vocab_id_to_token_dict.update(tokenizer.added_tokens_decoder)
+        self.vocab_token_to_id_dict.update(tokenizer.added_tokens_encoder)
+
         self.cls_id = tokenizer.cls_token_id
         self.sep_id = tokenizer.sep_token_id
         self.mask_id = tokenizer.mask_token_id
diff --git a/model_zoo/ernie-1.0/run_pretrain.py b/model_zoo/ernie-1.0/run_pretrain.py
index addb7ecd3755..9850ec151460 100644
--- a/model_zoo/ernie-1.0/run_pretrain.py
+++ b/model_zoo/ernie-1.0/run_pretrain.py
@@ -391,7 +391,8 @@ def do_train(args):
         model = fleet.distributed_model(model)
         optimizer = fleet.distributed_optimizer(optimizer)
 
-    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenize_name_or_path)
+    tokenizer.extend_chinese_char()
 
     data_file = get_train_data_file(args)
     train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset(
diff --git a/model_zoo/ernie-1.0/run_pretrain_static.py b/model_zoo/ernie-1.0/run_pretrain_static.py
index 154112fee086..809506522a6f 100644
--- a/model_zoo/ernie-1.0/run_pretrain_static.py
+++ b/model_zoo/ernie-1.0/run_pretrain_static.py
@@ -404,6 +404,7 @@ def do_train(args):
         ] = data_holders
 
         tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
+        tokenizer.extend_chinese_char()
 
         train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset(
             args,
diff --git a/model_zoo/ernie-1.0/run_pretrain_trainer.py b/model_zoo/ernie-1.0/run_pretrain_trainer.py
index b9ffbcf8cf73..eef97d2ea996 100644
--- a/model_zoo/ernie-1.0/run_pretrain_trainer.py
+++ b/model_zoo/ernie-1.0/run_pretrain_trainer.py
@@ -402,6 +402,7 @@ def forward(self, output, labels):
 
     data_file = get_train_data_file(data_args)
     tokenizer = tokenizer_class.from_pretrained(model_args.model_name_or_path)
+    tokenizer.extend_chinese_char()
 
     train_dataset, eval_dataset, test_dataset, data_collator = create_pretrained_dataset(
         data_args, training_args, data_file, tokenizer)
diff --git a/paddlenlp/transformers/bert/tokenizer.py b/paddlenlp/transformers/bert/tokenizer.py
index 8946e27496d8..cdd4e9c8f4b3 100644
--- a/paddlenlp/transformers/bert/tokenizer.py
+++ b/paddlenlp/transformers/bert/tokenizer.py
@@ -18,7 +18,7 @@
 import unicodedata
 
 from .. import PretrainedTokenizer, AddedToken
-from ..tokenizer_utils import convert_to_unicode, whitespace_tokenize, _is_whitespace, _is_control, _is_punctuation
+from ..tokenizer_utils import convert_to_unicode, whitespace_tokenize, _is_whitespace, _is_control, _is_punctuation, _is_symbol
 
 __all__ = [
     'BasicTokenizer',
@@ -105,7 +105,7 @@ def _run_split_on_punc(self, text):
         output = []
         while i < len(chars):
             char = chars[i]
-            if _is_punctuation(char):
+            if _is_punctuation(char) or _is_symbol(char):
                 output.append([char])
                 start_new_word = True
             else:
diff --git a/paddlenlp/transformers/ernie/tokenizer.py b/paddlenlp/transformers/ernie/tokenizer.py
index b1ca291b3466..1904ede218cc 100644
--- a/paddlenlp/transformers/ernie/tokenizer.py
+++ b/paddlenlp/transformers/ernie/tokenizer.py
@@ -253,10 +253,30 @@ def extend_chinese_char(self):
                     extend_list.append(new_char)
         if len(self.vocab) + len(extend_list) > 2**16:
             warnings.warn("The vocab size if larger than uint16")
-        verbose = self.verbose
-        self.verbose = False
-        self.add_tokens(extend_list)
-        self.verbose = verbose
+        new_tokens = [str(tok) for tok in extend_list]
+
+        tokens_to_add = []
+        for token in new_tokens:
+            if not isinstance(token, str):
+                raise TypeError(
+                    f"Token {token} is not a string but a {type(token)}.")
+            if hasattr(self, "do_lower_case") and self.do_lower_case:
+                token = token.lower()
+            if (token != self.unk_token and self.convert_tokens_to_ids(token)
+                    == self.convert_tokens_to_ids(self.unk_token)
+                    and token not in tokens_to_add):
+                tokens_to_add.append(token)
+
+        if self.verbose:
+            print(
+                f"Adding {len(tokens_to_add)} ## chinese tokens to the vocabulary"
+            )
+
+        added_tok_encoder = dict(
+            (tok, len(self) + i) for i, tok in enumerate(tokens_to_add))
+        added_tok_decoder = {v: k for k, v in added_tok_encoder.items()}
+        self.added_tokens_encoder.update(added_tok_encoder)
+        self.added_tokens_decoder.update(added_tok_decoder)
 
     def _tokenize(self, text):
         r"""

From 815cae4a682f4a5d610f8435bb2d92aeb79dd6a8 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Tue, 28 Jun 2022 15:37:51 +0800
Subject: [PATCH 11/48] extend vocab

---
 model_zoo/ernie-1.0/data_tools/ernie_dataset.py | 11 ++++++++---
 model_zoo/ernie-1.0/run_pretrain.py             | 16 ++++++++++++++++
 model_zoo/ernie-1.0/run_pretrain_static.py      | 16 ++++++++++++++++
 model_zoo/ernie-1.0/run_pretrain_trainer.py     | 16 ++++++++++++++++
 4 files changed, 56 insertions(+), 3 deletions(-)

diff --git a/model_zoo/ernie-1.0/data_tools/ernie_dataset.py b/model_zoo/ernie-1.0/data_tools/ernie_dataset.py
index da732efd9581..003c902aa056 100644
--- a/model_zoo/ernie-1.0/data_tools/ernie_dataset.py
+++ b/model_zoo/ernie-1.0/data_tools/ernie_dataset.py
@@ -14,9 +14,11 @@
 # limitations under the License.
 """BERT Style dataset."""
 
+import re
+import copy
+
 import numpy as np
 import paddle
-import re
 
 from .dataset_utils import (
     get_samples_mapping,
@@ -76,8 +78,10 @@ def __init__(self,
         # self.vocab_id_list = list(tokenizer.inv_vocab.keys())
         # self.vocab_id_to_token_dict = tokenizer.inv_vocab
         self.vocab_id_list = list(tokenizer.vocab.idx_to_token.keys())
-        self.vocab_id_to_token_dict = tokenizer.vocab.idx_to_token
-        self.vocab_token_to_id_dict = tokenizer.vocab.token_to_idx
+        self.vocab_id_to_token_dict = copy.deepcopy(
+            tokenizer.vocab.idx_to_token)
+        self.vocab_token_to_id_dict = copy.deepcopy(
+            tokenizer.vocab.token_to_idx)
 
         self.vocab_id_to_token_dict.update(tokenizer.added_tokens_decoder)
         self.vocab_token_to_id_dict.update(tokenizer.added_tokens_encoder)
@@ -93,6 +97,7 @@ def __len__(self):
     def __getitem__(self, idx):
         start_idx, end_idx, seq_length = self.samples_mapping[idx]
         sample = [self.indexed_dataset[i] for i in range(start_idx, end_idx)]
+
         # Note that this rng state should be numpy and not python since
         # python randint is inclusive whereas the numpy one is exclusive.
         # We % 2**32 since numpy requres the seed to be between 0 and 2**32 - 1
diff --git a/model_zoo/ernie-1.0/run_pretrain.py b/model_zoo/ernie-1.0/run_pretrain.py
index 9850ec151460..80034579d073 100644
--- a/model_zoo/ernie-1.0/run_pretrain.py
+++ b/model_zoo/ernie-1.0/run_pretrain.py
@@ -79,6 +79,22 @@ def create_pretrained_dataset(
         max_seq_length_dec=None,
         dataset_type='ernie')
 
+    def print_dataset(data, mode="train"):
+        logger.info(f"Sample data for {mode} mode")
+        input_ids, segment_ids, input_mask, masked_lm_positions, masked_lm_labels, next_sentence_labels = data
+        if tokenizer.pad_token_id in input_ids:
+            input_ids = input_ids[0:list(input_ids).index(tokenizer.pad_token_id
+                                                          )]
+        logger.info(tokenizer._decode(input_ids))
+        for pos, label in zip(masked_lm_positions, masked_lm_labels):
+            input_ids[pos] = label
+        logger.info(tokenizer._decode(input_ids))
+        logger.info(tokenizer.convert_ids_to_tokens(masked_lm_labels))
+
+    print_dataset(train_ds[0], "train")
+    print_dataset(valid_ds[0], "valid")
+    print_dataset(test_ds[0], "test")
+
     def _collate_data(data, stack_fn=Stack()):
         num_fields = len(data[0])
         out = [None] * num_fields
diff --git a/model_zoo/ernie-1.0/run_pretrain_static.py b/model_zoo/ernie-1.0/run_pretrain_static.py
index 809506522a6f..7bc92b224c3b 100644
--- a/model_zoo/ernie-1.0/run_pretrain_static.py
+++ b/model_zoo/ernie-1.0/run_pretrain_static.py
@@ -84,6 +84,22 @@ def create_pretrained_dataset(
         max_seq_length_dec=None,
         dataset_type='ernie')
 
+    def print_dataset(data, mode="train"):
+        logger.info(f"Sample data for {mode} mode")
+        input_ids, segment_ids, input_mask, masked_lm_positions, masked_lm_labels, next_sentence_labels = data
+        if tokenizer.pad_token_id in input_ids:
+            input_ids = input_ids[0:list(input_ids).index(tokenizer.pad_token_id
+                                                          )]
+        logger.info(tokenizer._decode(input_ids))
+        for pos, label in zip(masked_lm_positions, masked_lm_labels):
+            input_ids[pos] = label
+        logger.info(tokenizer._decode(input_ids))
+        logger.info(tokenizer.convert_ids_to_tokens(masked_lm_labels))
+
+    print_dataset(train_ds[0], "train")
+    print_dataset(valid_ds[0], "valid")
+    print_dataset(test_ds[0], "test")
+
     def _collate_data(data, stack_fn=Stack()):
         num_fields = len(data[0])
         out = [None] * num_fields
diff --git a/model_zoo/ernie-1.0/run_pretrain_trainer.py b/model_zoo/ernie-1.0/run_pretrain_trainer.py
index eef97d2ea996..188fd5bc5507 100644
--- a/model_zoo/ernie-1.0/run_pretrain_trainer.py
+++ b/model_zoo/ernie-1.0/run_pretrain_trainer.py
@@ -174,6 +174,22 @@ def create_pretrained_dataset(data_args, training_args, data_file, tokenizer):
         max_seq_length_dec=None,
         dataset_type='ernie')
 
+    def print_dataset(data, mode="train"):
+        logger.info(f"Sample data for {mode} mode")
+        input_ids, segment_ids, input_mask, masked_lm_positions, masked_lm_labels, next_sentence_labels = data
+        if tokenizer.pad_token_id in input_ids:
+            input_ids = input_ids[0:list(input_ids).index(tokenizer.pad_token_id
+                                                          )]
+        logger.info(tokenizer._decode(input_ids))
+        for pos, label in zip(masked_lm_positions, masked_lm_labels):
+            input_ids[pos] = label
+        logger.info(tokenizer._decode(input_ids))
+        logger.info(tokenizer.convert_ids_to_tokens(masked_lm_labels))
+
+    print_dataset(train_ds[0], "train")
+    print_dataset(valid_ds[0], "valid")
+    print_dataset(test_ds[0], "test")
+
     def _collate_data(data, stack_fn=Stack()):
         num_fields = len(data[0])
         out = [None] * num_fields

From 70c41894086011f50d7dccc5885860dd4317da10 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Tue, 28 Jun 2022 16:05:11 +0800
Subject: [PATCH 12/48] support custom tokenizer.

---
 model_zoo/ernie-1.0/args.py                 | 5 ++++-
 model_zoo/ernie-1.0/run_gb512_s1m_static.sh | 3 ++-
 model_zoo/ernie-1.0/run_pretrain.py         | 2 +-
 model_zoo/ernie-1.0/run_pretrain_static.py  | 2 +-
 model_zoo/ernie-1.0/run_pretrain_trainer.py | 8 ++++++--
 5 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/model_zoo/ernie-1.0/args.py b/model_zoo/ernie-1.0/args.py
index 867e6472206b..0fd54ac49241 100644
--- a/model_zoo/ernie-1.0/args.py
+++ b/model_zoo/ernie-1.0/args.py
@@ -34,7 +34,7 @@ def parse_args(MODEL_CLASSES):
     parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
         help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(
         sum([ list(classes[-1].pretrained_init_configuration.keys()) for classes in MODEL_CLASSES.values() ], [])),)
-    parser.add_argument("--tokenize_name_or_path", default=None, type=str, required=True,
+    parser.add_argument("--tokenizer_name_or_path", default=None, type=str,
         help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(
         sum([ list(classes[-1].pretrained_init_configuration.keys()) for classes in MODEL_CLASSES.values() ], [])),)
 
@@ -99,6 +99,9 @@ def parse_args(MODEL_CLASSES):
     # yapf: enable
 
     args = parser.parse_args()
+
+    if args.tokenizer_name_or_path is None:
+        args.tokenizer_name_or_path = args.model_name_or_path
     args.test_iters = args.eval_iters * 10
 
     if args.check_accuracy:
diff --git a/model_zoo/ernie-1.0/run_gb512_s1m_static.sh b/model_zoo/ernie-1.0/run_gb512_s1m_static.sh
index 05beae7ebc63..905272629b9e 100644
--- a/model_zoo/ernie-1.0/run_gb512_s1m_static.sh
+++ b/model_zoo/ernie-1.0/run_gb512_s1m_static.sh
@@ -16,7 +16,8 @@ python -u  -m paddle.distributed.launch \
     --log_dir "output/$task_name/log" \
     run_pretrain_static.py \
     --model_type "ernie" \
-    --model_name_or_path "ernie-1.0-base-zh" \
+    --model_name_or_path "ernie-3.0-base-zh" \
+    --tokenize_name_or_path "./final_vocab" \
     --input_dir "./data" \
     --output_dir "output/$task_name" \
     --split 949,50,1 \
diff --git a/model_zoo/ernie-1.0/run_pretrain.py b/model_zoo/ernie-1.0/run_pretrain.py
index 87cbed32146c..59febd223e07 100644
--- a/model_zoo/ernie-1.0/run_pretrain.py
+++ b/model_zoo/ernie-1.0/run_pretrain.py
@@ -445,7 +445,7 @@ def do_train(args):
         model = fleet.distributed_model(model)
         optimizer = fleet.distributed_optimizer(optimizer)
 
-    tokenizer = tokenizer_class.from_pretrained(args.tokenize_name_or_path)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name_or_path)
     tokenizer.extend_chinese_char()
 
     data_file = get_train_data_file(args)
diff --git a/model_zoo/ernie-1.0/run_pretrain_static.py b/model_zoo/ernie-1.0/run_pretrain_static.py
index 7bc92b224c3b..ba046a3a1cb1 100644
--- a/model_zoo/ernie-1.0/run_pretrain_static.py
+++ b/model_zoo/ernie-1.0/run_pretrain_static.py
@@ -419,7 +419,7 @@ def do_train(args):
             masked_lm_labels, next_sentence_labels
         ] = data_holders
 
-        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
+        tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name_or_path)
         tokenizer.extend_chinese_char()
 
         train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset(
diff --git a/model_zoo/ernie-1.0/run_pretrain_trainer.py b/model_zoo/ernie-1.0/run_pretrain_trainer.py
index 05fd929fe451..75cbaab4ae2b 100644
--- a/model_zoo/ernie-1.0/run_pretrain_trainer.py
+++ b/model_zoo/ernie-1.0/run_pretrain_trainer.py
@@ -140,7 +140,7 @@ class ModelArguments:
             "help":
             "Pretrained config name or path if not the same as model_name"
         })
-    tokenizer_name: Optional[str] = field(
+    tokenizer_name_or_path: Optional[str] = field(
         default=None,
         metadata={
             "help":
@@ -339,6 +339,9 @@ def main():
     parser = PdArgumentParser(
         (ModelArguments, DataArguments, PreTrainingArguments))
     model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+    if model_args.tokenizer_name_or_path is None:
+        model_args.tokenizer_name_or_path = model_args.model_name_or_path
+
     set_seed(training_args)
     paddle.set_device(training_args.device)
     if paddle.distributed.get_world_size() > 1:
@@ -448,7 +451,8 @@ def forward(self, output, labels):
         decay_step=training_args.decay_steps)
 
     data_file = get_train_data_file(data_args)
-    tokenizer = tokenizer_class.from_pretrained(model_args.model_name_or_path)
+    tokenizer = tokenizer_class.from_pretrained(
+        model_args.tokenizer_name_or_path)
     tokenizer.extend_chinese_char()
 
     train_dataset, eval_dataset, test_dataset, data_collator = create_pretrained_dataset(

From 50d39c59141d926752630a1d4bd39eddd1ff1513 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Wed, 29 Jun 2022 16:28:34 +0800
Subject: [PATCH 13/48] add some comments.

---
 model_zoo/ernie-1.0/run_gb512_s1m.sh         | 1 +
 model_zoo/ernie-1.0/run_gb512_s1m_static.sh  | 4 ++--
 model_zoo/ernie-1.0/run_gb512_s1m_trainer.sh | 1 +
 paddlenlp/transformers/bert/tokenizer.py     | 1 +
 paddlenlp/transformers/ernie/tokenizer.py    | 1 +
 5 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/model_zoo/ernie-1.0/run_gb512_s1m.sh b/model_zoo/ernie-1.0/run_gb512_s1m.sh
index aa6c4ac51c0b..b651d93a9c65 100644
--- a/model_zoo/ernie-1.0/run_gb512_s1m.sh
+++ b/model_zoo/ernie-1.0/run_gb512_s1m.sh
@@ -15,6 +15,7 @@ python -u  -m paddle.distributed.launch \
     run_pretrain.py \
     --model_type "ernie" \
     --model_name_or_path "ernie-1.0-base-zh" \
+    --tokenizer_name_or_path "ernie-1.0-base-zh" \
     --input_dir "./data" \
     --output_dir "output/$task_name" \
     --split 949,50,1 \
diff --git a/model_zoo/ernie-1.0/run_gb512_s1m_static.sh b/model_zoo/ernie-1.0/run_gb512_s1m_static.sh
index 905272629b9e..99baa1463ba0 100644
--- a/model_zoo/ernie-1.0/run_gb512_s1m_static.sh
+++ b/model_zoo/ernie-1.0/run_gb512_s1m_static.sh
@@ -16,8 +16,8 @@ python -u  -m paddle.distributed.launch \
     --log_dir "output/$task_name/log" \
     run_pretrain_static.py \
     --model_type "ernie" \
-    --model_name_or_path "ernie-3.0-base-zh" \
-    --tokenize_name_or_path "./final_vocab" \
+    --model_name_or_path "ernie-1.0-base-zh" \
+    --tokenizer_name_or_path "ernie-1.0-base-zh" \
     --input_dir "./data" \
     --output_dir "output/$task_name" \
     --split 949,50,1 \
diff --git a/model_zoo/ernie-1.0/run_gb512_s1m_trainer.sh b/model_zoo/ernie-1.0/run_gb512_s1m_trainer.sh
index 7aea663b3aa9..eebd6f9c2be5 100644
--- a/model_zoo/ernie-1.0/run_gb512_s1m_trainer.sh
+++ b/model_zoo/ernie-1.0/run_gb512_s1m_trainer.sh
@@ -13,6 +13,7 @@ python -u  -m paddle.distributed.launch \
     run_pretrain_trainer.py \
     --model_type "ernie" \
     --model_name_or_path "ernie-1.0-base-zh" \
+    --tokenizer_name_or_path "ernie-1.0-base-zh" \
     --input_dir "./data" \
     --output_dir "output/$task_name" \
     --split 949,50,1 \
diff --git a/paddlenlp/transformers/bert/tokenizer.py b/paddlenlp/transformers/bert/tokenizer.py
index cdd4e9c8f4b3..939ff9a35119 100644
--- a/paddlenlp/transformers/bert/tokenizer.py
+++ b/paddlenlp/transformers/bert/tokenizer.py
@@ -105,6 +105,7 @@ def _run_split_on_punc(self, text):
         output = []
         while i < len(chars):
             char = chars[i]
+            # Punctuation and symbols should each be treated as a single char.
             if _is_punctuation(char) or _is_symbol(char):
                 output.append([char])
                 start_new_word = True
diff --git a/paddlenlp/transformers/ernie/tokenizer.py b/paddlenlp/transformers/ernie/tokenizer.py
index 1904ede218cc..e807f33e4f46 100644
--- a/paddlenlp/transformers/ernie/tokenizer.py
+++ b/paddlenlp/transformers/ernie/tokenizer.py
@@ -247,6 +247,7 @@ def extend_chinese_char(self):
             if i not in self.vocab.idx_to_token:
                 continue
             w = self.vocab.idx_to_token[i]
+            # Choose Chinese chars in [0x4E00, 0x9FA5], and try to add the ## char to vocab.
             if len(w) == 1 and ord(w) >= 0x4E00 and ord(w) <= 0x9FA5:
                 new_char = "##" + w
                 if new_char not in vocab_set:

From 95a67ba4d04a0d749978aeb933a349cba135343a Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Wed, 29 Jun 2022 16:32:36 +0800
Subject: [PATCH 14/48] fix bugs.

---
 paddlenlp/transformers/ernie/modeling.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddlenlp/transformers/ernie/modeling.py b/paddlenlp/transformers/ernie/modeling.py
index 173b68802da8..1b203ab03adf 100644
--- a/paddlenlp/transformers/ernie/modeling.py
+++ b/paddlenlp/transformers/ernie/modeling.py
@@ -143,7 +143,7 @@ class ErniePretrainedModel(PretrainedModel):
             "hidden_dropout_prob": 0.1,
             "hidden_size": 768,
             "initializer_range": 0.02,
-            "max_position_embeddings": 512,
+            "max_position_embeddings": 513,
             "num_attention_heads": 12,
             "num_hidden_layers": 12,
             "type_vocab_size": 2,

From 21cbc7f0d5af71600991dec67eb48ecab73a026e Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Wed, 29 Jun 2022 16:41:04 +0800
Subject: [PATCH 15/48] add comments.

---
 model_zoo/ernie-1.0/data_tools/dataset_utils.py | 1 +
 model_zoo/ernie-1.0/data_tools/ernie_dataset.py | 3 +++
 2 files changed, 4 insertions(+)

diff --git a/model_zoo/ernie-1.0/data_tools/dataset_utils.py b/model_zoo/ernie-1.0/data_tools/dataset_utils.py
index e2dd21b1bbfc..c6b3dd57346d 100755
--- a/model_zoo/ernie-1.0/data_tools/dataset_utils.py
+++ b/model_zoo/ernie-1.0/data_tools/dataset_utils.py
@@ -368,6 +368,7 @@ def create_masked_lm_predictions(tokens,
                 token_boundary[i] = 1
 
     if to_chinese_char:
+        # Set ## Chinese char to the original Chinese char.
         char_tokens = []
         assert vocab_token_to_id_dict is not None
         for i, b in enumerate(token_boundary):
diff --git a/model_zoo/ernie-1.0/data_tools/ernie_dataset.py b/model_zoo/ernie-1.0/data_tools/ernie_dataset.py
index 003c902aa056..6fab9ad53473 100644
--- a/model_zoo/ernie-1.0/data_tools/ernie_dataset.py
+++ b/model_zoo/ernie-1.0/data_tools/ernie_dataset.py
@@ -83,6 +83,9 @@ def __init__(self,
         self.vocab_token_to_id_dict = copy.deepcopy(
             tokenizer.vocab.token_to_idx)
 
+        # ERNIE is a Chinese char-level model, so sometimes we need to
+        # add ## Chinese chars to encode and decode.
+        # Here we extend the vocab dict.
         self.vocab_id_to_token_dict.update(tokenizer.added_tokens_decoder)
         self.vocab_token_to_id_dict.update(tokenizer.added_tokens_encoder)
 

From c6d6c72ca76de75cda6702155d9823bdf64701b7 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Wed, 29 Jun 2022 17:08:18 +0800
Subject: [PATCH 16/48] fix bug.

---
 model_zoo/ernie-1.0/run_pretrain.py       | 6 +++---
 paddlenlp/transformers/ernie/tokenizer.py | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/model_zoo/ernie-1.0/run_pretrain.py b/model_zoo/ernie-1.0/run_pretrain.py
index 59febd223e07..cc98de93c1e1 100644
--- a/model_zoo/ernie-1.0/run_pretrain.py
+++ b/model_zoo/ernie-1.0/run_pretrain.py
@@ -249,14 +249,14 @@ def run_evaluate(data_loader,
                 v.subtract_(v)
             if dist.get_rank() == 0:
                 log_info_dict[
-                    "samples_per_second"] = iter_steps * args.micro_batch_size / (
-                        time.time() - local_time)
+                    "samples_per_second"] = iter_steps * args.micro_batch_size * dist.get_world_size(
+                    ) / (time.time() - local_time)
                 loss_info = ", ".join([
                     "{}: {:.6f}".format(k, log_info_dict[k])
                     for k in log_info_dict.keys() if k.endswith("loss")
                 ])
 
-                logger.info("%s step %d, batch: %d, %s, speed: %.0f seqs/s" %
+                logger.info("%s step %d, batch: %d, %s, ips: %.0f seqs/s" %
                             (task_name, global_step, iter_steps, loss_info,
                              log_info_dict["samples_per_second"]))
 
diff --git a/paddlenlp/transformers/ernie/tokenizer.py b/paddlenlp/transformers/ernie/tokenizer.py
index e807f33e4f46..5625ccba45c4 100644
--- a/paddlenlp/transformers/ernie/tokenizer.py
+++ b/paddlenlp/transformers/ernie/tokenizer.py
@@ -253,7 +253,7 @@ def extend_chinese_char(self):
                 if new_char not in vocab_set:
                     extend_list.append(new_char)
         if len(self.vocab) + len(extend_list) > 2**16:
-            warnings.warn("The vocab size if larger than uint16")
+            warnings.warn("The vocab size is larger than uint16")
         new_tokens = [str(tok) for tok in extend_list]
 
         tokens_to_add = []

From de24d7b8b88c130f7111e2528ae9cadd906c16eb Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Sat, 2 Jul 2022 20:16:17 +0800
Subject: [PATCH 17/48] fix run_pretrain_static logging.

---
 model_zoo/ernie-1.0/run_pretrain_static.py | 106 +++++++++++++++------
 1 file changed, 76 insertions(+), 30 deletions(-)

diff --git a/model_zoo/ernie-1.0/run_pretrain_static.py b/model_zoo/ernie-1.0/run_pretrain_static.py
index ba046a3a1cb1..590f30142862 100644
--- a/model_zoo/ernie-1.0/run_pretrain_static.py
+++ b/model_zoo/ernie-1.0/run_pretrain_static.py
@@ -21,11 +21,13 @@
 import time
 import yaml
 import shutil
+import json
 import collections
 
 import numpy as np
 import paddle
 import paddle.distributed.fleet as fleet
+import paddle.distributed as dist
 from paddle.distributed.fleet.meta_optimizers.sharding.utils import save_persistables
 from paddle.io import DataLoader, Dataset
 from paddlenlp.utils.batch_sampler import DistributedBatchSampler
@@ -302,6 +304,7 @@ def run_evaluate(data_loader,
     average_ret = collections.defaultdict(float)
 
     local_time = time.time()
+    worker_num = fleet.worker_num()
 
     for eval_step, batch in enumerate(data_loader):
         ret = exe.run(program, feed=batch, fetch_list=list(eval_fetch.values()))
@@ -310,15 +313,15 @@ def run_evaluate(data_loader,
                 all_ret[k].append(float(v[0]))
 
         if eval_step >= iter_steps - 1:
-            if not is_last:
+            if not is_last or log_writer is None:
                 break
 
             for k in list(eval_fetch.keys()):
                 average_ret[k] = sum(all_ret[k]) / len(all_ret[k])
 
             speed = iter_steps / (time.time() - local_time)
-            speed_tokens = speed * args.micro_batch_size * args.max_seq_len
-            ips = speed * args.micro_batch_size
+            speed_tokens = speed * args.micro_batch_size * args.max_seq_len * worker_num
+            ips = speed * args.micro_batch_size * worker_num
 
             loss_info = ", ".join([
                 "{}: {:.6f}".format(k, average_ret[k])
@@ -337,6 +340,26 @@ def run_evaluate(data_loader,
             break
 
 
+def all_gather(v):
+    if fleet.worker_num() <= 1:
+        return v
+    ret = []
+    dist.all_gather(ret, v)
+    concat = paddle.concat(ret, axis=0)
+    return concat.mean()
+
+
+def default_logdir() -> str:
+    """
+    Return the default log dir: "runs/<current time>_<hostname>".
+    """
+    import socket
+    from datetime import datetime
+
+    current_time = datetime.now().strftime("%b%d_%H-%M-%S")
+    return os.path.join("runs", current_time + "_" + socket.gethostname())
+
+
 def do_train(args):
     # Initialize the paddle and paddle fleet execute environment
     paddle.enable_static()
@@ -372,15 +395,10 @@ def do_train(args):
     dist_strategy = dist_optimizer(args, topo)
 
     # Create log write, train results show on last card of pipeline.
-    if topo.is_last:
-        log_writer_path = os.path.join(
-            args.output_dir, "train_log",
-            "{}_globalbsz_{}_amp_{}_recompute_{}_card_{}".format(
-                args.model_name_or_path, args.global_batch_size, args.use_amp,
-                args.use_recompute, worker_index).lower())
-        # if os.path.exists(log_writer_path):
-        #     shutil.rmtree(log_writer_path)
-        log_writer = LogWriter(log_writer_path)
+    # Create the log writer on worker 0 only; it stays None elsewhere.
+    log_writer = None
+    if worker_index == 0:
+        log_writer = LogWriter(os.path.join(args.output_dir, default_logdir()))
 
     # Define the input data in the static mode
     base_class, model_class, criterion_class, tokenizer_class = MODEL_CLASSES[
@@ -466,10 +484,14 @@ def do_train(args):
                                           masked_lm_labels,
                                           next_sentence_labels)
             loss = lm_loss + sop_loss
+            lm_loss_gather = all_gather(lm_loss)
+            sop_loss_gather = all_gather(sop_loss)
         else:
             loss = criterion(prediction_scores, seq_relationship_score,
                              masked_lm_labels)
 
+        loss_gather = all_gather(loss)
+
         # Create the learning_rate sheduler and optimizer
         if args.decay_steps is None:
             args.decay_steps = args.max_steps
@@ -529,6 +551,17 @@ def do_train(args):
               'w') as f:
         f.write(str(startup_program))
 
+    if worker_index == 0:
+        # log the model config and args
+        model_config_json = json.dumps(model.get_model_config(),
+                                       ensure_ascii=False,
+                                       indent=2)
+        log_writer.add_text("model_config", model_config_json)
+        args_dict = {"paddle commit id": str(paddle.version.commit)}
+        for arg in vars(args):
+            args_dict[arg] = str(getattr(args, arg))
+        log_writer.add_text("args", json.dumps(args_dict, indent=2))
+
     # Define the Executor for running the static model
     exe = paddle.static.Executor(place)
     exe.run(startup_program)
@@ -571,10 +604,10 @@ def do_train(args):
 
     fetch_loss_vars = collections.OrderedDict()
     fetch_other_vars = collections.OrderedDict()
-    fetch_loss_vars["loss"] = loss
+    fetch_loss_vars["loss"] = loss_gather
     if args.binary_head:
-        fetch_loss_vars["lm_loss"] = lm_loss
-        fetch_loss_vars["sop_loss"] = sop_loss
+        fetch_loss_vars["lm_loss"] = lm_loss_gather
+        fetch_loss_vars["sop_loss"] = sop_loss_gather
 
     fetch_other_vars["learning_rate"] = main_program.global_block(
     ).vars["learning_rate_0"]
@@ -599,12 +632,19 @@ def do_train(args):
         valid_data_loader = valid_data_loader()
         test_data_loader = test_data_loader()
 
+        loss_res = collections.defaultdict(list)
         for step, batch in enumerate(train_data_loader()):
             ret = exe.run(main_program,
                           feed=batch,
                           fetch_list=fetchs,
                           use_program_cache=True)
             # Skip for accumulate_steps in global step
+
+            if log_writer is not None:
+                for k, v in zip(fetchs_keys, ret):
+                    if k in fetch_loss_vars:
+                        loss_res[k].append(v[0])
+
             if (step + 1) % args.accumulate_steps != 0:
                 continue
             global_step += 1
@@ -612,15 +652,17 @@ def do_train(args):
             lr_scheduler.step()
 
             if global_step % args.logging_freq == 0:
-                if topo.is_last:
+                if topo.is_last and log_writer is not None:
                     res = collections.defaultdict(float)
                     for k, v in zip(fetchs_keys, ret):
-                        res[k] = v[0]
+                        if k in fetch_loss_vars:
+                            res[k] = sum(loss_res[k]) / len(loss_res[k])
+                            loss_res[k] = []
+                        else:
+                            res[k] = v[0]
 
                     speed = args.logging_freq / (time.time() - tic_train)
 
-                    loss_info = "loss: %.6f, lm_loss: %.6f, sop_loss: %.6f"
-
                     loss_info = ", ".join([
                         "{}: {:.6f}".format(k, res[k])
                         for k in fetch_loss_vars.keys()
@@ -636,8 +678,12 @@ def do_train(args):
                     if additional_loginfo:
                         common_loginfo += ", " + additional_loginfo
                     logger.info(common_loginfo)
+
                     for k, v in res.items():
-                        log_writer.add_scalar("train/" + k, v, global_step)
+                        if k in additional_vars:
+                            log_writer.add_scalar("amp/" + k, v, global_step)
+                        else:
+                            log_writer.add_scalar("train/" + k, v, global_step)
 
                 tic_train = time.time()
 
@@ -650,11 +696,11 @@ def do_train(args):
             if global_step % args.eval_freq == 0:
                 # TODO, check the input data of validation
                 eval_fetch = collections.OrderedDict()
-                if topo.is_last:
-                    eval_fetch["loss"] = loss
-                    if args.binary_head:
-                        eval_fetch["lm_loss"] = lm_loss
-                        eval_fetch["sop_loss"] = sop_loss
+                # if topo.is_last:
+                eval_fetch["loss"] = loss_gather
+                if args.binary_head:
+                    eval_fetch["lm_loss"] = lm_loss_gather
+                    eval_fetch["sop_loss"] = sop_loss_gather
 
                 run_evaluate(valid_data_loader, exe, test_program,
                              args.eval_iters, log_writer, global_step, args,
@@ -718,11 +764,11 @@ def do_train(args):
 
             if global_step >= args.max_steps:
                 eval_fetch = collections.OrderedDict()
-                if topo.is_last:
-                    eval_fetch["loss"] = loss
-                    if args.binary_head:
-                        eval_fetch["lm_loss"] = lm_loss
-                        eval_fetch["sop_loss"] = sop_loss
+                # if topo.is_last:
+                eval_fetch["loss"] = loss_gather
+                if args.binary_head:
+                    eval_fetch["lm_loss"] = lm_loss_gather
+                    eval_fetch["sop_loss"] = sop_loss_gather
 
                 run_evaluate(test_data_loader, exe, test_program,
                              args.test_iters, log_writer, global_step, args,

From 0ff8c8ff710d367d47ebb9c1a58d490f5c2435e5 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Tue, 5 Jul 2022 22:54:07 +0800
Subject: [PATCH 18/48] fix all gather.

---
 model_zoo/ernie-1.0/run_pretrain_static.py | 38 +++++++++++-----------
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/model_zoo/ernie-1.0/run_pretrain_static.py b/model_zoo/ernie-1.0/run_pretrain_static.py
index 590f30142862..c3864fcf3ee6 100644
--- a/model_zoo/ernie-1.0/run_pretrain_static.py
+++ b/model_zoo/ernie-1.0/run_pretrain_static.py
@@ -317,7 +317,7 @@ def run_evaluate(data_loader,
                 break
 
             for k in list(eval_fetch.keys()):
-                average_ret[k] = sum(all_ret[k]) / len(all_ret[k])
+                average_ret[k] = sum(all_ret[k]) / len(all_ret[k]) / worker_num
 
             speed = iter_steps / (time.time() - local_time)
             speed_tokens = speed * args.micro_batch_size * args.max_seq_len * worker_num
@@ -340,13 +340,12 @@ def run_evaluate(data_loader,
             break
 
 
-def all_gather(v):
+def all_reduce(v):
     if fleet.worker_num() <= 1:
         return v
-    ret = []
-    dist.all_gather(ret, v)
-    concat = paddle.concat(ret, axis=0)
-    return concat.mean()
+    v = v + 0
+    dist.all_reduce(v)
+    return v
 
 
 def default_logdir() -> str:
@@ -484,13 +483,13 @@ def do_train(args):
                                           masked_lm_labels,
                                           next_sentence_labels)
             loss = lm_loss + sop_loss
-            lm_loss_gather = all_gather(lm_loss)
-            sop_loss_gather = all_gather(sop_loss)
+            lm_loss_reduce = all_reduce(lm_loss)
+            sop_loss_reduce = all_reduce(sop_loss)
         else:
             loss = criterion(prediction_scores, seq_relationship_score,
                              masked_lm_labels)
 
-        loss_gather = all_gather(loss)
+        loss_reduce = all_reduce(loss)
 
         # Create the learning_rate sheduler and optimizer
         if args.decay_steps is None:
@@ -604,10 +603,10 @@ def do_train(args):
 
     fetch_loss_vars = collections.OrderedDict()
     fetch_other_vars = collections.OrderedDict()
-    fetch_loss_vars["loss"] = loss_gather
+    fetch_loss_vars["loss"] = loss_reduce
     if args.binary_head:
-        fetch_loss_vars["lm_loss"] = lm_loss_gather
-        fetch_loss_vars["sop_loss"] = sop_loss_gather
+        fetch_loss_vars["lm_loss"] = lm_loss_reduce
+        fetch_loss_vars["sop_loss"] = sop_loss_reduce
 
     fetch_other_vars["learning_rate"] = main_program.global_block(
     ).vars["learning_rate_0"]
@@ -656,7 +655,8 @@ def do_train(args):
                     res = collections.defaultdict(float)
                     for k, v in zip(fetchs_keys, ret):
                         if k in fetch_loss_vars:
-                            res[k] = sum(loss_res[k]) / len(loss_res[k])
+                            res[k] = sum(loss_res[k]) / len(
+                                loss_res[k]) / worker_num
                             loss_res[k] = []
                         else:
                             res[k] = v[0]
@@ -697,10 +697,10 @@ def do_train(args):
                 # TODO, check the input data of validation
                 eval_fetch = collections.OrderedDict()
                 # if topo.is_last:
-                eval_fetch["loss"] = loss_gather
+                eval_fetch["loss"] = loss_reduce
                 if args.binary_head:
-                    eval_fetch["lm_loss"] = lm_loss_gather
-                    eval_fetch["sop_loss"] = sop_loss_gather
+                    eval_fetch["lm_loss"] = lm_loss_reduce
+                    eval_fetch["sop_loss"] = sop_loss_reduce
 
                 run_evaluate(valid_data_loader, exe, test_program,
                              args.eval_iters, log_writer, global_step, args,
@@ -765,10 +765,10 @@ def do_train(args):
             if global_step >= args.max_steps:
                 eval_fetch = collections.OrderedDict()
                 # if topo.is_last:
-                eval_fetch["loss"] = loss_gather
+                eval_fetch["loss"] = loss_reduce
                 if args.binary_head:
-                    eval_fetch["lm_loss"] = lm_loss_gather
-                    eval_fetch["sop_loss"] = sop_loss_gather
+                    eval_fetch["lm_loss"] = lm_loss_reduce
+                    eval_fetch["sop_loss"] = sop_loss_reduce
 
                 run_evaluate(test_data_loader, exe, test_program,
                              args.test_iters, log_writer, global_step, args,

From 994db93e44a3cf45a681eb0ae1399daa1bc09cba Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Tue, 5 Jul 2022 23:20:20 +0800
Subject: [PATCH 19/48] fix a100

---
 model_zoo/ernie-1.0/run_pretrain_static.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/model_zoo/ernie-1.0/run_pretrain_static.py b/model_zoo/ernie-1.0/run_pretrain_static.py
index c3864fcf3ee6..5711f66adc34 100644
--- a/model_zoo/ernie-1.0/run_pretrain_static.py
+++ b/model_zoo/ernie-1.0/run_pretrain_static.py
@@ -205,7 +205,7 @@ def dist_optimizer(args, topo):
     exec_strategy.num_iteration_per_drop_scope = 10000
 
     build_strategy = paddle.static.BuildStrategy()
-    #build_strategy.enable_sequential_execution = True # for profile
+    build_strategy.enable_sequential_execution = True  # for profile
     build_strategy.fuse_broadcast_ops = True
     build_strategy.enable_inplace = True
     build_strategy.enable_addto = args.enable_addto

From a18f6214b8fa1fa0157bce070fdf42ccdc116f88 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Wed, 6 Jul 2022 21:04:30 +0800
Subject: [PATCH 20/48] fix

---
 model_zoo/ernie-1.0/run_pretrain_static.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/model_zoo/ernie-1.0/run_pretrain_static.py b/model_zoo/ernie-1.0/run_pretrain_static.py
index 5711f66adc34..e154f30682fe 100644
--- a/model_zoo/ernie-1.0/run_pretrain_static.py
+++ b/model_zoo/ernie-1.0/run_pretrain_static.py
@@ -211,6 +211,7 @@ def dist_optimizer(args, topo):
     build_strategy.enable_addto = args.enable_addto
 
     dist_strategy = fleet.DistributedStrategy()
+    dist_strategy.without_graph_optimization = True
     dist_strategy.execution_strategy = exec_strategy
     dist_strategy.build_strategy = build_strategy
     dist_strategy.nccl_comm_num = 3

From 54a821f7f62cdac0f7bf861cc765bdb08d890d1f Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Fri, 8 Jul 2022 00:59:02 +0800
Subject: [PATCH 21/48] fix bugs

---
 model_zoo/ernie-1.0/args.py                   |  3 +
 .../ernie-1.0/data_tools/dataset_utils.py     |  1 +
 .../ernie-1.0/data_tools/ernie_dataset.py     | 57 ++++++++++++-------
 model_zoo/ernie-1.0/run_pretrain.py           |  2 +
 model_zoo/ernie-1.0/run_pretrain_static.py    | 13 ++++-
 5 files changed, 53 insertions(+), 23 deletions(-)

diff --git a/model_zoo/ernie-1.0/args.py b/model_zoo/ernie-1.0/args.py
index 0fd54ac49241..a755abb3142b 100644
--- a/model_zoo/ernie-1.0/args.py
+++ b/model_zoo/ernie-1.0/args.py
@@ -96,6 +96,9 @@ def parse_args(MODEL_CLASSES):
     # Argument for bert
     parser.add_argument("--masked_lm_prob", type=float, default=0.15, help="Mask token prob.")
     parser.add_argument("--short_seq_prob", type=float, default=0.1, help="Short sequence prob.")
+    parser.add_argument("--favor_longer_ngram", type=str2bool, default=False, help="Whether to favor longer n-grams when sampling mask spans.")
+    parser.add_argument("--max_ngrams", type=int, default=3, help="Maximum n-gram length of a masked span.")
+
     # yapf: enable
 
     args = parser.parse_args()
diff --git a/model_zoo/ernie-1.0/data_tools/dataset_utils.py b/model_zoo/ernie-1.0/data_tools/dataset_utils.py
index c6b3dd57346d..b56e9251e8f6 100755
--- a/model_zoo/ernie-1.0/data_tools/dataset_utils.py
+++ b/model_zoo/ernie-1.0/data_tools/dataset_utils.py
@@ -733,6 +733,7 @@ def build_dataset(index, name):
                 max_seq_length=max_seq_length,
                 seed=seed,
                 share_folder=args.share_folder,
+                args=args,
             )
             if dataset_type == DSET_TYPE_T5:
                 dataset = T5Dataset(indexed_dataset=indexed_dataset,
diff --git a/model_zoo/ernie-1.0/data_tools/ernie_dataset.py b/model_zoo/ernie-1.0/data_tools/ernie_dataset.py
index 6fab9ad53473..b8c174aea5a9 100644
--- a/model_zoo/ernie-1.0/data_tools/ernie_dataset.py
+++ b/model_zoo/ernie-1.0/data_tools/ernie_dataset.py
@@ -35,19 +35,22 @@
 
 class ErnieDataset(paddle.io.Dataset):
 
-    def __init__(self,
-                 name,
-                 tokenizer,
-                 indexed_dataset,
-                 data_prefix,
-                 num_epochs,
-                 max_num_samples,
-                 masked_lm_prob,
-                 max_seq_length,
-                 short_seq_prob,
-                 seed,
-                 binary_head,
-                 share_folder=False):
+    def __init__(
+        self,
+        name,
+        tokenizer,
+        indexed_dataset,
+        data_prefix,
+        num_epochs,
+        max_num_samples,
+        masked_lm_prob,
+        max_seq_length,
+        short_seq_prob,
+        seed,
+        binary_head,
+        share_folder=False,
+        args=None,
+    ):
 
         # Params to store.
         self.name = name
@@ -56,6 +59,7 @@ def __init__(self,
         self.max_seq_length = max_seq_length
         self.binary_head = binary_head
         self.share_folder = share_folder
+        self.args = args
 
         # Dataset.
         self.indexed_dataset = indexed_dataset
@@ -118,13 +122,24 @@ def __getitem__(self, idx):
             self.pad_id,
             self.masked_lm_prob,
             np_rng,
-            self.binary_head)
-
-
-def build_training_sample(sample, target_seq_length, max_seq_length,
-                          vocab_id_list, vocab_id_to_token_dict,
-                          vocab_token_to_id_dict, cls_id, sep_id, mask_id,
-                          pad_id, masked_lm_prob, np_rng, binary_head):
+            self.binary_head,
+            self.args)
+
+
+def build_training_sample(sample,
+                          target_seq_length,
+                          max_seq_length,
+                          vocab_id_list,
+                          vocab_id_to_token_dict,
+                          vocab_token_to_id_dict,
+                          cls_id,
+                          sep_id,
+                          mask_id,
+                          pad_id,
+                          masked_lm_prob,
+                          np_rng,
+                          binary_head,
+                          args=None):
     """Biuld training sample.
 
     Arguments:
@@ -186,6 +201,8 @@ def build_training_sample(sample, target_seq_length, max_seq_length,
          vocab_token_to_id_dict=vocab_token_to_id_dict,
          to_chinese_char=True,
          inplace_random_mask=False,
+         favor_longer_ngram=False if args is None else args.favor_longer_ngram,
+         max_ngrams=3 if args is None else args.max_ngrams,
      )
 
     # Padding.
diff --git a/model_zoo/ernie-1.0/run_pretrain.py b/model_zoo/ernie-1.0/run_pretrain.py
index cc98de93c1e1..f34f6aca3749 100644
--- a/model_zoo/ernie-1.0/run_pretrain.py
+++ b/model_zoo/ernie-1.0/run_pretrain.py
@@ -710,6 +710,8 @@ def save_ckpt(output_dir, model, tokenizer, optimizer, scaler, args,
                              args,
                              task_name="test")
                 del train_data_loader
+                del valid_data_loader
+                del test_data_loader
                 return
 
 
diff --git a/model_zoo/ernie-1.0/run_pretrain_static.py b/model_zoo/ernie-1.0/run_pretrain_static.py
index e154f30682fe..7e616811c8b3 100644
--- a/model_zoo/ernie-1.0/run_pretrain_static.py
+++ b/model_zoo/ernie-1.0/run_pretrain_static.py
@@ -206,12 +206,14 @@ def dist_optimizer(args, topo):
 
     build_strategy = paddle.static.BuildStrategy()
     build_strategy.enable_sequential_execution = True  # for profile
+    # build_strategy.reduce_strategy = paddle.static.BuildStrategy.ReduceStrategy._NoReduce
     build_strategy.fuse_broadcast_ops = True
+    build_strategy.fix_op_run_order = True
     build_strategy.enable_inplace = True
     build_strategy.enable_addto = args.enable_addto
 
     dist_strategy = fleet.DistributedStrategy()
-    dist_strategy.without_graph_optimization = True
+    # dist_strategy.without_graph_optimization = True
     dist_strategy.execution_strategy = exec_strategy
     dist_strategy.build_strategy = build_strategy
     dist_strategy.nccl_comm_num = 3
@@ -663,6 +665,9 @@ def do_train(args):
                             res[k] = v[0]
 
                     speed = args.logging_freq / (time.time() - tic_train)
+                    res["global_step"] = global_step
+                    res["steps_per_second"] = speed
+                    res["samples_per_second"] = speed * args.global_batch_size
 
                     loss_info = ", ".join([
                         "{}: {:.6f}".format(k, res[k])
@@ -670,8 +675,8 @@ def do_train(args):
                     ])
 
                     common_loginfo = "global step %d, %s, speed: %.2f steps/s, ips: %.2f seqs/s, learning rate: %.5e" % (
-                        global_step, loss_info, speed,
-                        speed * args.global_batch_size, res["learning_rate"])
+                        global_step, loss_info, res["steps_per_second"],
+                        res["samples_per_second"], res["learning_rate"])
                     additional_loginfo = ", ".join([
                         "{}: {}".format(k, res[k])
                         for k in additional_vars.keys()
@@ -775,6 +780,8 @@ def do_train(args):
                              args.test_iters, log_writer, global_step, args,
                              topo.is_last, eval_fetch, "test")
                 del train_data_loader
+                del valid_data_loader
+                del test_data_loader
                 return
 
 

From 9942c37f48f02f30510207976acf4413e5ed9f8a Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Fri, 8 Jul 2022 10:40:55 +0800
Subject: [PATCH 22/48] fix save

---
 model_zoo/ernie-1.0/run_pretrain_static.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/model_zoo/ernie-1.0/run_pretrain_static.py b/model_zoo/ernie-1.0/run_pretrain_static.py
index 7e616811c8b3..989ca6fec9c0 100644
--- a/model_zoo/ernie-1.0/run_pretrain_static.py
+++ b/model_zoo/ernie-1.0/run_pretrain_static.py
@@ -719,9 +719,9 @@ def do_train(args):
                 logger.debug("saving models to {}".format(output_dir))
                 save_persistables(exe, os.path.join(output_dir, "static_vars"),
                                   main_program)
-                if global_step == args.save_steps:
-                    model.init_config["init_args"][0].init_config.pop(
-                        "topo", None)
+                # if global_step == args.save_steps:
+                #     model.init_config["init_args"][0].init_config.pop(
+                #         "topo", None)
                 model.save_pretrained(output_dir)
                 tokenizer.save_pretrained(output_dir)
                 tic_train = time.time()

From 918441914c1c0f3bc595168a9871fd4808be656d Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Tue, 9 Aug 2022 15:48:38 +0800
Subject: [PATCH 23/48] tmp commit for pre-process.

---
 .copyright.hook                              | 134 ++++++++++++++
 .pre-commit-config.yaml                      |   7 +
 model_zoo/ernie-1.0/scripts/README.md        |  18 ++
 model_zoo/ernie-1.0/scripts/clue_process.py  | 174 +++++++++++++++++++
 model_zoo/ernie-1.0/scripts/gen_char.py      |  64 +++++++
 model_zoo/ernie-1.0/scripts/gen_vocab.py     |  22 +++
 model_zoo/ernie-1.0/scripts/merge_vocab.py   | 139 +++++++++++++++
 model_zoo/ernie-1.0/scripts/trans_to_json.py | 172 ++++++++++++++++++
 model_zoo/ernie-1.0/scripts/wudao_process.py | 174 +++++++++++++++++++
 9 files changed, 904 insertions(+)
 create mode 100644 .copyright.hook
 create mode 100644 model_zoo/ernie-1.0/scripts/README.md
 create mode 100644 model_zoo/ernie-1.0/scripts/clue_process.py
 create mode 100644 model_zoo/ernie-1.0/scripts/gen_char.py
 create mode 100644 model_zoo/ernie-1.0/scripts/gen_vocab.py
 create mode 100644 model_zoo/ernie-1.0/scripts/merge_vocab.py
 create mode 100644 model_zoo/ernie-1.0/scripts/trans_to_json.py
 create mode 100644 model_zoo/ernie-1.0/scripts/wudao_process.py

diff --git a/.copyright.hook b/.copyright.hook
new file mode 100644
index 000000000000..d25ac074d8c9
--- /dev/null
+++ b/.copyright.hook
@@ -0,0 +1,134 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import print_function
+from __future__ import unicode_literals
+
+import argparse
+import io
+import re
+import sys
+import os
+import datetime
+
+COPYRIGHT = '''Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.'''
+
+def _generate_copyright(comment_mark):
+    """Return the license text as comment lines whose first line carries this year."""
+    # splitlines(): the literal is "\n"-separated even where os.linesep is "\r\n".
+    copyright = COPYRIGHT.splitlines()
+    header = copyright[0].rstrip()
+
+    # Raw string avoids the invalid "\d" escape; swap the template year for now's.
+    p = re.search(r'(\d{4})', header).group(0)
+    header = header.replace(p, str(datetime.datetime.now().year))
+
+    ans = [comment_mark + " " + header + os.linesep]
+    for line in copyright[1:]:
+        ans.append(comment_mark + " " + line.rstrip() + os.linesep)
+    return ans
+
+def _get_comment_mark(path):
+    """Return the line-comment prefix for *path*'s extension, or None if unsupported."""
+    if re.search(r"\.(py|sh)$", path) is not None:
+        return "#"
+
+    # cxx/hxx/xpu/kps pass the pre-commit `files` filter; assumed C++-style -- confirm xpu/kps.
+    if re.search(r"\.(h|c|hpp|cc|cpp|cu|go|cuh|proto|cxx|hxx|xpu|kps)$", path) is not None:
+        return "//"
+
+    return None
+
+
+RE_ENCODE = re.compile(r"^[ \t\v]*#.*?coding[:=]", re.IGNORECASE)
+RE_COPYRIGHT = re.compile(r".*Copyright \(c\) \d{4}", re.IGNORECASE)
+RE_SHEBANG = re.compile(r"^[ \t\v]*#[ \t]?\!")
+
+def _check_copyright(path):
+    """Return True if any of the first four lines of *path* has a copyright notice.
+
+    Read as UTF-8, like generate_copyright. readline() returns "" past EOF, so
+    files shorter than four lines are still checked; the previous next()-based
+    read raised StopIteration before ``head`` was assigned and saw no lines.
+    """
+    with io.open(path, encoding="utf-8") as f:
+        head = [f.readline() for _ in range(4)]
+    for line in head:
+        if RE_COPYRIGHT.search(line) is not None:
+            return True
+    return False
+
+def generate_copyright(path, comment_mark):
+    """Insert the license header into *path*, after any shebang/coding line in its first four lines."""
+    with io.open(path, encoding="utf-8") as f:
+        original_contents = f.readlines()
+    head = original_contents[0:4]
+    insert_line_no = 0
+    for i, line in enumerate(head):
+        if RE_ENCODE.search(line) or RE_SHEBANG.search(line):
+            insert_line_no = i + 1
+
+    copyright = _generate_copyright(comment_mark)
+    if insert_line_no == 0:
+        new_contents = copyright
+        if len(original_contents) > 0 and len(original_contents[0].strip()) != 0:
+            new_contents.append(os.linesep)
+        new_contents.extend(original_contents)
+    else:
+        new_contents = original_contents[0:insert_line_no]
+        new_contents.append(os.linesep)
+        new_contents.extend(copyright)
+        if len(original_contents) > insert_line_no and len(original_contents[insert_line_no].strip()) != 0:
+            new_contents.append(os.linesep)
+        new_contents.extend(original_contents[insert_line_no:])
+    # Write back in the same encoding the file was read with.
+    with io.open(path, 'w', encoding="utf-8") as output_file:
+        output_file.write("".join(new_contents))
+
+
+
+def main(argv=None):
+    """Add the license header to every supported file that lacks one; return 0."""
+    parser = argparse.ArgumentParser(
+        description='Checker for copyright declaration.')
+    parser.add_argument('filenames', nargs='*', help='Filenames to check')
+    args = parser.parse_args(argv)
+
+    for path in args.filenames:
+        comment_mark = _get_comment_mark(path)
+        if comment_mark is None:
+            print("warning:Unsupported file", path, file=sys.stderr)
+            continue
+        if _check_copyright(path):
+            continue
+        generate_copyright(path, comment_mark)
+
+    return 0
+
+
+if __name__ == '__main__':
+    exit(main())
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 079b318a7b4f..9cc79be0fc65 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -26,3 +26,10 @@ repos:
         files: \.md$
     -   id: remove-tabs
         files: \.md$
+-   repo: local
+    hooks:
+    -   id: copyright_checker
+        name: copyright_checker
+        entry: python .copyright.hook
+        language: system
+        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|xpu|kps|py|sh)$
diff --git a/model_zoo/ernie-1.0/scripts/README.md b/model_zoo/ernie-1.0/scripts/README.md
new file mode 100644
index 000000000000..91ada5c2a94f
--- /dev/null
+++ b/model_zoo/ernie-1.0/scripts/README.md
@@ -0,0 +1,18 @@
+# ERNIE 数据制作全流程
+
+
+## 数据下载
+
+### CLUE 200G数据集 & WuDaoCorpusBase 200G
+
+### 英文部分，下载了 WikiText 数据
+
+
+## 词表制作
+
+1. 统计字符
+2. 制作英文词表
+3. 合并词表
+
+注：此方法拼接产出的词表容易出现UNK的情况。
+如issue[2927](https://github.com/PaddlePaddle/PaddleNLP/issues/2927)
diff --git a/model_zoo/ernie-1.0/scripts/clue_process.py b/model_zoo/ernie-1.0/scripts/clue_process.py
new file mode 100644
index 000000000000..fb24511695c2
--- /dev/null
+++ b/model_zoo/ernie-1.0/scripts/clue_process.py
@@ -0,0 +1,174 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+input_path = "WuDaoCorpus2.0_base_200G/"  # NOTE(review): never read below; main() uses the --input_path argument
+
+import json
+import re
+import argparse
+import multiprocessing
+import os
+import time
+import jieba
+import sys
+from functools import partial
+
+
+def get_args():  # Parse this script's command-line options and return the namespace.
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input_path',
+                        type=str,
+                        required=True,
+                        help='Path to you raw files. Folder or file path.')
+    parser.add_argument('--workers',
+                        type=int,
+                        default=1,
+                        help='Number of worker processes to launch')
+    parser.add_argument('--output_path',
+                        type=str,
+                        help='Path to save the output json files.')  # NOTE(review): parsed but never read; output goes to ./tmp
+    parser.add_argument('--log_interval',
+                        type=int,
+                        default=1,
+                        help='Interval between progress updates.')  # counted in files, see main()
+    args = parser.parse_args()
+    return args
+
+
+def lexical_analysis_fn():
+    """Build a segmenter backed by LAC in "lac" mode; the second result is discarded."""
+    from LAC import LAC
+    analyzer = LAC(mode="lac")
+
+    def process(line):
+        tokens, _unused = analyzer.run(line)
+        return tokens
+    return process
+
+
+def chinese_segmentation_fn():
+    """Build a segmenter backed by LAC in 'seg' mode."""
+    from LAC import LAC
+    segmenter = LAC(mode='seg')
+
+    def process(line):
+        return segmenter.run(line)
+
+    return process
+
+
+def jieba_segmentation_fn():
+    """Build a segmenter that returns jieba's tokens as a list."""
+    import jieba
+
+    def process(line):
+        return list(jieba.cut(line))
+
+    return process
+
+
+CHINESE_SEG_FUNC = {  # every factory is called right here, so importing this module needs both LAC and jieba
+    'lac': lexical_analysis_fn(),
+    'seg': chinese_segmentation_fn(),
+    'jieba': jieba_segmentation_fn(),
+}
+
+special_chars = ['\n', '。', '?', '？', ' ', ';', '；', '！', '!']  # text_to_text collapses runs of each to one char
+split_chars = ['。', '?', '？', ';', '；', '!', '！']  # text_to_text inserts a newline after each of these
+
+
+def text_to_text(path):
+    """Segment each document of one JSON file and write the result under ./tmp.
+
+    Returns (utf8_bytes_processed, None); (0, None) if the output file already
+    exists or the input cannot be loaded.
+    """
+    out_name = "./tmp/" + path[-20:]
+    # Look for existing output first so a finished file is not parsed again.
+    if os.path.exists(out_name):
+        print("File exists %s" % out_name)
+        return 0, None
+
+    print("Loading %s" % path)
+    with open(path, "r", encoding="utf-8") as f:
+        try:
+            contents = json.load(f)
+        except Exception:
+            print("Failed to load %s" % path)
+            return 0, None
+
+    print("Write into %s" % out_name)
+    # ./tmp may not exist yet, and the 20-char suffix may itself contain "/".
+    os.makedirs(os.path.dirname(out_name), exist_ok=True)
+    seg_func = CHINESE_SEG_FUNC["seg"]
+    data_len = 0
+    with open(out_name, "w", encoding="utf-8") as f:
+        for js in contents:
+            text = js["content"]
+            data_len += len(text.encode("utf-8"))
+            # Collapse each run of a special char (plus trailing spaces) to one
+            # char; some of these chars are used as sentence splitters below.
+            for char in special_chars:
+                text = re.sub('[' + char + ']+[ ]*', char, text)
+            for char in split_chars:
+                text = text.replace(char, char + "\n")
+            final = []
+            for line in text.split("\n"):
+                if line:
+                    final.append(" ".join(seg_func(line)) + "\n")
+            # One segmented sentence per line, blank line between documents.
+            f.write("".join(final) + "\n")
+
+    return data_len, None
+
+
+def main():
+    """Segment every file under --input_path in a worker pool, logging throughput."""
+    args = get_args()
+    startup_start = time.time()
+
+    file_paths = []
+    if os.path.isfile(args.input_path):
+        file_paths.append(args.input_path)
+    else:
+        for root, _, fs in os.walk(args.input_path):
+            for f in fs:
+                file_paths.append(os.path.join(root, f))
+
+    pool = multiprocessing.Pool(args.workers)
+
+    startup_end = time.time()
+    proc_start = time.time()
+    total_bytes_processed = 0
+    print("Time to startup:", startup_end - startup_start)
+
+    encoded_files = pool.imap(text_to_text, file_paths, 1)
+
+    # text_to_text returns (bytes, None); only the byte count is needed here.
+    for i, (bytes_processed, _) in enumerate(encoded_files, start=1):
+        total_bytes_processed += bytes_processed
+
+        if i % args.log_interval == 0:
+            elapsed = time.time() - proc_start
+            mbs = total_bytes_processed / elapsed / 1024 / 1024
+            print(f"Processed {i} files",
+                  f"({i/elapsed} files/s, {mbs} MB/s).",
+                  file=sys.stderr)
+    pool.close()
+    # Wait for the workers to exit rather than leaving them to interpreter shutdown.
+    pool.join()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/model_zoo/ernie-1.0/scripts/gen_char.py b/model_zoo/ernie-1.0/scripts/gen_char.py
new file mode 100644
index 000000000000..dabf678c1e4e
--- /dev/null
+++ b/model_zoo/ernie-1.0/scripts/gen_char.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import time
+import sys
+import pickle
+from collections import defaultdict
+
+input_path = sys.argv[1]
+print(input_path)
+
+char_dict = defaultdict(int)
+
+file_paths = []
+if os.path.isfile(input_path):
+    file_paths.append(input_path)
+else:
+    for root, _, fs in os.walk(input_path):
+        for f in fs:
+            file_paths.append(os.path.join(root, f))
+
+def _dump_counts(counts, out_path="char_dict.txt"):
+    """Write ``char count`` lines to *out_path*, most frequent first."""
+    with open(out_path, "w", encoding="utf-8") as out:
+        for k, v in sorted(counts.items(), key=lambda x: -x[1]):
+            out.write(f"{k} {v}\n")
+
+
+count = 0
+s = time.time()
+data_len = 0
+for file_name in file_paths:
+    print(f" > reading file {file_name}")
+    # Explicit UTF-8: the locale default may not be able to decode the corpus.
+    with open(file_name, 'r', encoding="utf-8") as f:
+        for line in f:
+            count += 1
+            data_len += len(line.encode("utf-8"))
+            for char in line:
+                char_dict[char] += 1
+            if count % 10000 == 0:
+                print(
+                    f"processed doc {count}, char size: {len(char_dict)}, speed: {data_len/1024/1024/(time.time() - s)} MB/s"
+                )
+                # Periodic snapshot of the counts gathered so far.
+                _dump_counts(char_dict)
+
+# Final text dump, plus the pickle that merge_vocab.py loads.
+_dump_counts(char_dict)
+
+with open("char_dict.pickle", "wb") as f:
+    pickle.dump(char_dict, f)
diff --git a/model_zoo/ernie-1.0/scripts/gen_vocab.py b/model_zoo/ernie-1.0/scripts/gen_vocab.py
new file mode 100644
index 000000000000..595dabcf9567
--- /dev/null
+++ b/model_zoo/ernie-1.0/scripts/gen_vocab.py
@@ -0,0 +1,22 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sentencepiece as spm
+
+spm.SentencePieceTrainer.train(
+    input='../wikitext/wiki.all.raw',  # hard-coded corpus path, resolved against the current directory
+    model_prefix='eng',  # output prefix; merge_vocab.py later reads "eng.vocab"
+    vocab_size=17000,
+    model_type="BPE",
+)
diff --git a/model_zoo/ernie-1.0/scripts/merge_vocab.py b/model_zoo/ernie-1.0/scripts/merge_vocab.py
new file mode 100644
index 000000000000..8907f8ae56d9
--- /dev/null
+++ b/model_zoo/ernie-1.0/scripts/merge_vocab.py
@@ -0,0 +1,139 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import pickle
+import re
+from paddlenlp.transformers import BasicTokenizer
+from paddlenlp.transformers.tokenizer_utils import (
+    _is_punctuation,
+    _is_control,
+    _is_whitespace,
+    is_chinese_char,
+    tokenize_special_chars,
+)
+
+re_eng = re.compile(r'[#a-zA-Z0-9]', re.U)  # token starts with '#' or an ASCII letter/digit
+re_sep = re.compile(r'\[[A-Z]+\]', re.U)  # raw strings: same patterns, no invalid "\[" / "\<" escapes
+re_sep_eng = re.compile(r'\<[\/a-z]+\>', re.U)  # angle-bracket tokens such as <unk> or </s>
+
+bt = BasicTokenizer()
+normalize_chars = lambda x: "".join(bt.tokenize(x))
+
+
+def chinese_char():
+    return {chr(code) for code in range(0x4E00, 0x9FA5 + 1)}  # every char in U+4E00..U+9FA5 inclusive
+
+
+def jk_vocab(c):
+    """Return True if *c* lies in U+3040..U+33FF or in U+1100..U+11FF (Hangul Jamo)."""
+    code_point = ord(c)
+    return 0x3040 <= code_point <= 0x33FF or 0x1100 <= code_point <= 0x11FF
+
+
+def add_special_token():  # Special tokens; the script below writes them to vocab.txt before anything else.
+    return ["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]
+
+
+# NOTE: unpickling runs arbitrary code; only load a char_dict.pickle you produced (gen_char.py).
+with open("char_dict.pickle", "rb") as _char_dict_file:
+    char_dict = pickle.load(_char_dict_file)
+cjk_vocab = chinese_char()
+
+final_vocab = set()
+# Not in use char
+# final_vocab.add(" ")
+# final_vocab.add("\n")
+other_char = []
+
+
+def add_vocab(char, f):
+    """Write *char* to vocab file *f* once, tracking it in ``final_vocab``.
+
+    Word-like, ``[XXX]`` and ``##``+JK tokens go in verbatim; the rest per char.
+    """
+    if re_sep_eng.match(char):
+        return
+    is_word = re_eng.match(char) or re_sep.match(char)
+    # `cjk_vocab` is a collection, not a predicate: calling it raised TypeError
+    # if ever reached. `jk_vocab` is the intended test; index 2 needs len > 2.
+    is_jk_piece = len(char) > 2 and char.startswith("##") and jk_vocab(char[2])
+    if is_word or is_jk_piece:
+        if char not in final_vocab:
+            final_vocab.add(char)
+            f.write(f"{char}\n")
+        return
+
+    for k in normalize_chars(char):
+        if _is_whitespace(k) or _is_control(k):
+            continue
+        if k not in final_vocab:
+            if not _is_punctuation(k) and not is_chinese_char(
+                    ord(k)) and k == tokenize_special_chars(k):
+                other_char.append(k)
+            final_vocab.add(k)
+            f.write(f"{k}\n")
+            if jk_vocab(k):
+                add_vocab("##" + k, f)
+
+
+with open("vocab.txt", "w") as f:
+    for x in add_special_token():
+        add_vocab(x, f)
+
+    res = sorted(char_dict.items(), key=lambda x: -x[1])  # (char, count) pairs, highest count first
+    # Add the U+4E00..U+9FA5 chars found in the corpus, most frequent first
+    for x in res:
+        k, v = x
+        k = normalize_chars(k)
+        if k in cjk_vocab:
+            add_vocab(k, f)
+            cjk_vocab.remove(k)
+    # Then add the chars of that range that never appeared in the corpus
+    cjk_vocab = sorted(cjk_vocab)
+    while len(cjk_vocab) > 0:
+        k = cjk_vocab.pop()  # pops from the end of the sorted list, i.e. highest code point first
+        if k not in final_vocab:
+            f.write(f"{k}\n")
+            final_vocab.add(k)
+    with open("eng.vocab") as ec:
+        line = ec.readline()
+        while line:
+            k, v = line.strip().split()  # expects exactly two whitespace-separated fields per line
+            if "▁" in k:
+                k = k[1:]  # drops the first char; assumes the ▁ marker is leading -- TODO confirm
+            elif re_sep_eng.match(k):
+                pass  # left as is; add_vocab skips angle-bracket tokens
+            else:
+                k = "##" + k  # piece without ▁ gets the "##" prefix
+
+            add_vocab(k, f)
+            line = ec.readline()
+    for x in res:
+        k, v = x
+        if v >= 200:  # finally, any remaining char counted at least 200 times
+            add_vocab(k, f)
+
+    # addition = []
+    # for x in res:
+    #     oldk,v = x
+    #     k = normalize_chars(oldk)
+    #     for c in k:
+    #         if c not in final_vocab and  v >= 200:
+    #             addition.append(c)
+    #             final_vocab.add(c)
+    # for k in sorted(addition):
+    #     f.write(f"{k}\n")
+
+# for k in sorted(other_char, key= lambda x:ord(x)):
+#     print(k)
diff --git a/model_zoo/ernie-1.0/scripts/trans_to_json.py b/model_zoo/ernie-1.0/scripts/trans_to_json.py
new file mode 100644
index 000000000000..bd04aa919a7a
--- /dev/null
+++ b/model_zoo/ernie-1.0/scripts/trans_to_json.py
@@ -0,0 +1,172 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import re
+import argparse
+import json
+import multiprocessing
+import sys
+import time
+import shutil
+from functools import partial
+
+import numpy as np
+from tqdm import tqdm
+
+
+def get_args():  # Parse this script's command-line options and return the namespace.
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input_path',
+                        type=str,
+                        required=True,
+                        help='Path to you raw files. Folder or file path.')
+    parser.add_argument('--output_path',
+                        type=str,
+                        required=True,
+                        help='Path to save the output json files.')  # merge_file appends ".jsonl" if missing
+    parser.add_argument('--json_key',
+                        type=str,
+                        default='text',
+                        help='The content key of json file.')
+    parser.add_argument(
+        '--doc_spliter',
+        type=str,
+        default='',
+        help=
+        "Spliter between documents. We will strip the line, if you use blank line to split doc, leave it blank."
+    )
+    parser.add_argument('--min_doc_length',
+                        type=int,
+                        default=10,
+                        help="Minimal char of a documment.")  # compared with a strict ">" in raw_text_to_json
+    parser.add_argument('--workers',
+                        type=int,
+                        default=1,
+                        help='Number of worker processes to launch')
+    parser.add_argument('--log_interval',
+                        type=int,
+                        default=1,
+                        help='Interval between progress updates.')  # counted in files, see main()
+    parser.add_argument('--no-merge',
+                        action='store_true',
+                        help='Don\'t merge the file.')  # read as args.no_merge; also disables the shuffle step
+    parser.add_argument('--no-shuffle',
+                        action='store_true',
+                        help='Don\'t shuffle the file.')  # read as args.no_shuffle
+    args = parser.parse_args()
+    return args
+
+
+def raw_text_to_json(path, doc_spliter="", json_key="text", min_doc_length=10):
+    """Convert one raw text file to ``<path>.jsonl``; return (chars_read, out_path).
+
+    A line whose stripped text equals ``doc_spliter`` ends the current document;
+    docs of at most ``min_doc_length`` chars are dropped. Missing file: (0, None).
+    """
+    path = os.path.abspath(path)
+    if not os.path.exists(path):
+        print("No found file %s" % path)
+        return 0, None
+
+    out_filepath = path + ".jsonl"
+    len_files = 0
+    doc = ""
+    # Context-manage both handles so the output is closed before it is merged.
+    with open(path, "r", encoding="utf-8") as f, \
+            open(out_filepath, "w", encoding="utf-8") as fout:
+        for line in f:
+            len_files += len(line)
+            if line.strip() != doc_spliter:
+                doc += line
+                continue
+            if len(doc) > min_doc_length:
+                fout.write(json.dumps({json_key: doc}, ensure_ascii=False) + "\n")
+            doc = ""
+        if len(doc) > min_doc_length:
+            fout.write(json.dumps({json_key: doc}, ensure_ascii=False) + "\n")
+    return len_files, out_filepath
+
+
+def merge_file(file_paths, output_path):
+    """Concatenate the existing part files into one ``.jsonl`` file, deleting each part."""
+    merged_path = output_path if output_path.endswith(".jsonl") else output_path + ".jsonl"
+    print("Merging files into %s" % merged_path)
+    with open(merged_path, 'wb') as sink:
+        for part in file_paths:
+            if part is not None and os.path.exists(part):
+                with open(part, 'rb') as src:
+                    shutil.copyfileobj(src, sink)
+                os.remove(part)
+    print("File save in %s" % merged_path)
+    return merged_path
+
+
+def shuffle_file(output_path):  # Shuffle the lines of output_path in place with the external `shuf` tool.
+    import subprocess  # local import; argv list (no shell) keeps paths with spaces/metacharacters safe
+    print("Shuffling the jsonl file...")
+    if not os.path.exists(output_path):
+        raise ValueError("File not found: %s" % output_path)
+    subprocess.run(["shuf", output_path, "-o", output_path], check=True)  # raises if shuf fails
+    print("File shuffled!!!")
+
+
+def main():
+    """Convert every input file to JSONL in a worker pool, then merge and shuffle."""
+    args = get_args()
+    startup_start = time.time()
+
+    file_paths = []
+    if os.path.isfile(args.input_path):
+        file_paths.append(args.input_path)
+    else:
+        for root, _, fs in os.walk(args.input_path):
+            for f in fs:
+                file_paths.append(os.path.join(root, f))
+
+    pool = multiprocessing.Pool(args.workers)
+    startup_end = time.time()
+    proc_start = time.time()
+    total_bytes_processed = 0
+    print("Time to startup:", startup_end - startup_start)
+
+    trans_json = partial(raw_text_to_json,
+                         doc_spliter=args.doc_spliter,
+                         json_key=args.json_key,
+                         min_doc_length=args.min_doc_length)
+    encoded_files = pool.imap(trans_json, file_paths, 1)
+    out_paths = []
+    for i, (bytes_processed, out_path) in enumerate(encoded_files, start=1):
+        total_bytes_processed += bytes_processed
+        out_paths.append(out_path)
+
+        if i % args.log_interval == 0:
+            elapsed = time.time() - proc_start
+            mbs = total_bytes_processed / elapsed / 1024 / 1024
+            print(f"Processed {i} files",
+                  f"({i/elapsed} files/s, {mbs} MB/s).",
+                  file=sys.stderr)
+    # All results are consumed; release the workers before the merge step.
+    pool.close()
+    pool.join()
+
+    if not args.no_merge:
+        output_path = merge_file(out_paths, args.output_path)
+        if not args.no_shuffle:
+            shuffle_file(output_path)
+
+
+if __name__ == "__main__":
+    main()
+    #profile.run("main()", "testprof")
diff --git a/model_zoo/ernie-1.0/scripts/wudao_process.py b/model_zoo/ernie-1.0/scripts/wudao_process.py
new file mode 100644
index 000000000000..fb24511695c2
--- /dev/null
+++ b/model_zoo/ernie-1.0/scripts/wudao_process.py
@@ -0,0 +1,174 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+input_path = "WuDaoCorpus2.0_base_200G/"
+
+import json
+import re
+import argparse
+import multiprocessing
+import os
+import time
+import jieba
+import sys
+from functools import partial
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--input_path',
+                        type=str,
+                        required=True,
+                        help='Path to you raw files. Folder or file path.')
+    parser.add_argument('--workers',
+                        type=int,
+                        default=1,
+                        help='Number of worker processes to launch')
+    parser.add_argument('--output_path',
+                        type=str,
+                        help='Path to save the output json files.')
+    parser.add_argument('--log_interval',
+                        type=int,
+                        default=1,
+                        help='Interval between progress updates.')
+    args = parser.parse_args()
+    return args
+
+
+def lexical_analysis_fn():
+    from LAC import LAC
+    lac = LAC(mode="lac")
+
+    def process(line):
+        words, _ = lac.run(line)
+        return words
+
+    return process
+
+
+def chinese_segmentation_fn():
+    from LAC import LAC
+    lac_cws = LAC(mode='seg')
+
+    def process(line):
+        words = lac_cws.run(line)
+        return words
+
+    return process
+
+
+def jieba_segmentation_fn():
+    import jieba
+
+    def process(line):
+        words = jieba.cut(line)
+        return list(words)
+
+    return process
+
+
+CHINESE_SEG_FUNC = {
+    'lac': lexical_analysis_fn(),
+    'seg': chinese_segmentation_fn(),
+    'jieba': jieba_segmentation_fn(),
+}
+
+special_chars = ['\n', '。', '?', '？', ' ', ';', '；', '！', '!']
+split_chars = ['。', '?', '？', ';', '；', '!', '！']
+
+
+def text_to_text(path):
+    out_name = "./tmp/" + path[-20:]
+    print("Loading %s" % path)
+    with open(path, "r") as f:
+        try:
+            contents = json.load(f)
+        except Exception as e:
+            print("Failed to load %s" % path)
+            return 0, None
+
+    print("Write into %s" % out_name)
+    if os.path.exists(out_name):
+        print("File exists %s" % out_name)
+        return 0, None
+
+    seg_func = CHINESE_SEG_FUNC["seg"]
+    import time
+    s = time.time()
+    data_len = 0
+    count = 0
+    with open(out_name, "w") as f:
+        for js in contents:
+            count += 1
+            text = js["content"]
+            data_len += len(text.encode("utf-8"))
+            # make special char only once, because of those token will be treat as sentence spliter.
+            for char in special_chars:
+                text = re.sub('[' + char + ']+[ ]*', char, text)
+            # space will be treat as comma, WARM, not in eng
+            # text = text.replace(" ", "，")
+            for char in split_chars:
+                text = text.replace(char, char + "\n")
+            final = ""
+            for line in text.split("\n"):
+                if len(line) == 0:
+                    continue
+                words = seg_func(line)
+                final += " ".join(words) + "\n"
+            f.write(final + "\n")
+            # if count % 100 == 0:
+            #     print("speed: ", data_len/1024/(time.time() - s))
+
+    return data_len, None
+
+
+def main():
+    args = get_args()
+    startup_start = time.time()
+
+    file_paths = []
+    if os.path.isfile(args.input_path):
+        file_paths.append(args.input_path)
+    else:
+        for root, _, fs in os.walk(args.input_path):
+            for f in fs:
+                file_paths.append(os.path.join(root, f))
+
+    pool = multiprocessing.Pool(args.workers)
+
+    startup_end = time.time()
+    proc_start = time.time()
+    total_bytes_processed = 0
+    print("Time to startup:", startup_end - startup_start)
+
+    encoded_files = pool.imap(text_to_text, file_paths, 1)
+
+    out_paths = []
+    for i, (bytes_processed, out_path) in enumerate(encoded_files, start=1):
+        total_bytes_processed += bytes_processed
+        out_paths.append(out_path)
+        master_start = time.time()
+
+        if i % args.log_interval == 0:
+            current = time.time()
+            elapsed = current - proc_start
+            mbs = total_bytes_processed / elapsed / 1024 / 1024
+            print(f"Processed {i} files",
+                  f"({i/elapsed} files/s, {mbs} MB/s).",
+                  file=sys.stderr)
+    pool.close()
+
+
+if __name__ == "__main__":
+    main()

From dc4d29999f43099714512537ad8c37ce12a781d7 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Fri, 12 Aug 2022 17:29:02 +0800
Subject: [PATCH 24/48] Update README.md

---
 model_zoo/ernie-3.0/README.md | 56 +++++++++++++++++++++++++++++------
 1 file changed, 47 insertions(+), 9 deletions(-)

diff --git a/model_zoo/ernie-3.0/README.md b/model_zoo/ernie-3.0/README.md
index 081870253396..b88b67d5b64c 100644
--- a/model_zoo/ernie-3.0/README.md
+++ b/model_zoo/ernie-3.0/README.md
@@ -137,12 +137,50 @@ batch_size=32 和 1，预测精度为 FP16 时，GPU 下的效果-时延图：
             </td>
         </tr>
         <tr>
-            <td rowspan=2 align=center> 24L1024H </td>
+            <td rowspan=3 align=center> 24L1024H </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>ERNIE 2.0-Large-zh</b></span>
+                <span style="font-size:18px">ERNIE 1.0-Large-CW</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>77.03</b></span>
+                <span style="font-size:18px"><b>79.03</b></span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px">75.97</span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px">59.65</span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px"><b>62.91</b></span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px"><b>85.09</b></span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px"><b>81.73</b></span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px"><b>93.09</b></span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px"><b>84.53</b></span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px"><b>74.22/91.88</b></span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px"><b>88.57</b></span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px"><b>84.54</b></span>
+            </td>
+        </tr>
+        <tr>
+            <td style="text-align:center">
+                <span style="font-size:18px">ERNIE 2.0-Large-zh</span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px">77.03</span>
             </td>
             <td style="text-align:center">
                 <span style="font-size:18px"><b>76.41</b></span>
@@ -157,16 +195,16 @@ batch_size=32 和 1，预测精度为 FP16 时，GPU 下的效果-时延图：
                 <span style="font-size:18px">83.82</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>79.69</b></span>
+                <span style="font-size:18px">79.69</span>
             </td>
             <td style="text-align:center">
                 <span style="font-size:18px">89.14</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>84.10</b></span>
+                <span style="font-size:18px">84.10</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>71.48/90.35</b></span>
+                <span style="font-size:18px">71.48/90.35</span>
             </td>
             <td style="text-align:center">
                 <span style="font-size:18px">85.52</span>
@@ -192,13 +230,13 @@ batch_size=32 和 1，预测精度为 FP16 时，GPU 下的效果-时延图：
                 <span style="font-size:18px">62.02</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>83.88</b></span>
+                <span style="font-size:18px">83.88</span>
             </td>
             <td style="text-align:center">
                 <span style="font-size:18px">78.81</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>90.79</b></span>
+                <span style="font-size:18px">90.79</span>
             </td>
             <td style="text-align:center">
                 <span style="font-size:18px">83.67</span>
@@ -207,7 +245,7 @@ batch_size=32 和 1，预测精度为 FP16 时，GPU 下的效果-时延图：
                 <span style="font-size:18px">70.58/89.82</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>85.72</b></span>
+                <span style="font-size:18px">85.72</span>
             </td>
             <td style="text-align:center">
                 <span style="font-size:18px">75.26</span>

From c1fc1e16b774fb50ad7c5aeb92ba47c59d1a7761 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Fri, 12 Aug 2022 17:39:03 +0800
Subject: [PATCH 25/48] Update README.md

---
 examples/benchmark/clue/README.md | 54 ++++++++++++++++++++++++++-----
 1 file changed, 46 insertions(+), 8 deletions(-)

diff --git a/examples/benchmark/clue/README.md b/examples/benchmark/clue/README.md
index 1dbb56473f8d..8ab13498068f 100644
--- a/examples/benchmark/clue/README.md
+++ b/examples/benchmark/clue/README.md
@@ -67,14 +67,51 @@
             <td style="text-align:center;">
                 <span style="font-size:18px;">C<sup>3</sup></span>
             </td>
+        </tr>        <tr>
+            <td rowspan=3 align=center> 24L1024H </td>
+            <td style="text-align:center">
+                <span style="font-size:18px">ERNIE 1.0-Large-zh-CW</span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px"><b>79.03</b></span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px">75.97</span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px">59.65</span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px"><b>62.91</b></span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px"><b>85.09</b></span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px"><b>81.73</b></span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px"><b>93.09</b></span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px"><b>84.53</b></span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px"><b>74.22/91.88</b></span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px"><b>88.57</b></span>
+            </td>
+            <td style="text-align:center">
+                <span style="font-size:18px"><b>84.54</b></span>
+            </td>
         </tr>
         <tr>
-            <td rowspan=2 align=center> 24L1024H </td>
             <td style="text-align:center">
                 <span style="font-size:18px">ERNIE 2.0-Large-zh</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>77.03</b></span>
+                <span style="font-size:18px">77.03</span>
             </td>
             <td style="text-align:center">
                 <span style="font-size:18px"><b>76.41</b></span>
@@ -89,16 +126,16 @@
                 <span style="font-size:18px">83.82</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>79.69</b></span>
+                <span style="font-size:18px">79.69</span>
             </td>
             <td style="text-align:center">
                 <span style="font-size:18px">89.14</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>84.10</b></span>
+                <span style="font-size:18px">84.10</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>71.48/90.35</b></span>
+                <span style="font-size:18px">71.48/90.35</span>
             </td>
             <td style="text-align:center">
                 <span style="font-size:18px">85.52</span>
@@ -124,13 +161,13 @@
                 <span style="font-size:18px">62.02</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>83.88</b></span>
+                <span style="font-size:18px">83.88</span>
             </td>
             <td style="text-align:center">
                 <span style="font-size:18px">78.81</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>90.79</b></span>
+                <span style="font-size:18px">90.79</span>
             </td>
             <td style="text-align:center">
                 <span style="font-size:18px">83.67</span>
@@ -139,7 +176,7 @@
                 <span style="font-size:18px">70.58/89.82</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>85.72</b></span>
+                <span style="font-size:18px">85.72</span>
             </td>
             <td style="text-align:center">
                 <span style="font-size:18px">75.26</span>
@@ -1154,6 +1191,7 @@ AFQMC（语义相似度）、TNEWS（文本分类）、IFLYTEK（长文本分类
 
 | Model                            | AFQMC   | TNEWS   | IFLYTEK | CMNLI    | OCNLI    | CLUEWSC2020 | CSL     | CMRC2018 | CHID    | C<sup>3</sup> |
 | -------------------------------- | ------- | ------- | ------- | -------- | -------- | ----------- | ------- | -------- | ------- | ------------- |
+| ERNIE 1.0-Large-zh-cw            | 2e-5,64 | 3e-5,32 | 5e-5,16 | 2e-5,16  | 2e-5,32  | 1e-5,32     | 1e-5,16 | 2e-5,24  | 1e-5,24 | 2e-5,32       |
 | ERNIE 3.0-Xbase-zh               | 2e-5,16 | 3e-5,32 | 3e-5,32 | 3e-5,64  | 3e-5,64  | 2e-5,32     | 1e-5,16 | 3e-5,24  | 2e-5,24 | 3e-5,24       |
 | ERNIE 2.0-Large-zh               | 1e-5,32 | 3e-5,64 | 3e-5,32 | 2e-5,32  | 1e-5,16  | 3e-5,32     | 1e-5,64 | 2e-5,24  | 2e-5,24 | 3e-5,32       |
 | HFL/RoBERTa-wwm-ext-large        | 1e-5,32 | 3e-5,32 | 2e-5,32 | 1e-5,16  | 1e-5,16  | 2e-5,16     | 2e-5,16 | 3e-5,32  | 1e-5,24 | 2e-5,24       |

From 21a578857bf267417d5fac29afff990a28f7f7cf Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Tue, 9 Aug 2022 17:01:49 +0800
Subject: [PATCH 26/48] add amp o1 support

---
 model_zoo/ernie-1.0/args.py           |  1 +
 model_zoo/ernie-1.0/run_pretrain.py   | 13 +++++++-----
 model_zoo/ernie-1.0/scripts/README.md | 30 +++++++++++++++++++++++++--
 3 files changed, 37 insertions(+), 7 deletions(-)

diff --git a/model_zoo/ernie-1.0/args.py b/model_zoo/ernie-1.0/args.py
index a755abb3142b..fd2b6a27c6be 100644
--- a/model_zoo/ernie-1.0/args.py
+++ b/model_zoo/ernie-1.0/args.py
@@ -80,6 +80,7 @@ def parse_args(MODEL_CLASSES):
 
     # AMP config
     parser.add_argument("--use_amp", type=str2bool, nargs='?', const=False, help="Enable mixed precision training.")
+    parser.add_argument("--fp16_opt_level", type=str, default="O2", help="Mixed precision training optimization level.")
     parser.add_argument("--enable_addto", type=str2bool, nargs='?', const=True, default=True, help="Whether to enable the addto strategy for gradient accumulation or not. This is only used for AMP training.")
     parser.add_argument("--scale_loss", type=float, default=32768, help="The value of scale_loss for fp16. This is only used for AMP training.")
     parser.add_argument("--hidden_dropout_prob", type=float, default=0.1, help="The hidden dropout prob.")
diff --git a/model_zoo/ernie-1.0/run_pretrain.py b/model_zoo/ernie-1.0/run_pretrain.py
index f34f6aca3749..a49cda8c823f 100644
--- a/model_zoo/ernie-1.0/run_pretrain.py
+++ b/model_zoo/ernie-1.0/run_pretrain.py
@@ -437,7 +437,7 @@ def do_train(args):
     if args.use_amp:
         scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
         scaler = fleet.distributed_scaler(scaler)
-        model = paddle.amp.decorate(models=model, level='O2')
+        model = paddle.amp.decorate(models=model, level=args.fp16_opt_level)
     else:
         scaler = None
 
@@ -468,7 +468,7 @@ def do_train(args):
             if os.path.exists(opt_path):
                 load_dict = paddle.load(params_path)
                 model_dict = model.state_dict()
-                if args.use_amp:
+                if args.use_amp and args.fp16_opt_level == "O2":
                     for k, v in load_dict.items():
                         if k not in model_dict:
                             logger.warning(
@@ -528,12 +528,15 @@ def do_train(args):
             masked_lm_labels, next_sentence_labels = batch
 
             with paddle.amp.auto_cast(args.use_amp,
+                                      custom_white_list=[
+                                          'softmax',
+                                          'layer_norm',
+                                          'gelu',
+                                      ],
                                       custom_black_list=[
-                                          "reduce_sum",
                                           "c_softmax_with_cross_entropy",
-                                          "elementwise_div"
                                       ],
-                                      level='O2'):
+                                      level=args.fp16_opt_level):
 
                 # Create the model for the ernie pretrain
                 if args.binary_head:
diff --git a/model_zoo/ernie-1.0/scripts/README.md b/model_zoo/ernie-1.0/scripts/README.md
index 91ada5c2a94f..aef873e51174 100644
--- a/model_zoo/ernie-1.0/scripts/README.md
+++ b/model_zoo/ernie-1.0/scripts/README.md
@@ -3,9 +3,33 @@
 
 ## 数据下载
 
-### CLUE 200G数据集 & WuDaoCorpusBase 200G
+### CLUECorpus2020  & WuDaoCorpus2.0 Base 数据集
+
+CLUE 提供了约200G左右的语料文本，详细介绍见[官网](https://github.com/CLUEbenchmark/CLUECorpus2020#%E6%95%B0%E6%8D%AE%E4%B8%8B%E8%BD%BD)，用户可以申请下载，方式如下：
+> 数据下载
+> 申请方式： 将使用语料研究目的和用途，计划、研究机构和申请者介绍，发送到邮箱，并承诺不向第三方提供。
+>
+> 邮箱: CLUEbenchmark@163.com，标题是：CLUECorpus2020 200G语料库
+
+
+WuDaoCorpora是悟道爬取的中文大规模预料。整体数量为3TB，目前开源的部分为WuDaoCorpus2.0 bases数据集，大小为200GB。
+用户微信登录[官网](https://resource.wudaoai.cn/home)，即可直接下载数据。
+
+### 中文预料分词
+
+ERNIE 使用知识嵌入的方式进行预训练，如何尽可能精确的从原始文本中提取知识，直接关系预训练模型的效果。
+目前采用的分词方式的有jieba，lac，Wordtag，效果以此
+速度对比，假设CPU使用40线程，GPU使用16卡：
+
+| 切词方式 | 效果 | 速度 | 耗时（处理200G）
+|-|-|-|-|
+| jieba | 一般 | 607 KB/s |  2.5 h |
+| lac   | 好 | 106 KB/s | 13.9 h
+| wordtag| 最好 | 0.94 KB/s | 159D|
+
+综合考虑分词的效果与速度，我们选择百度的LAC作为我们的分词工具。
+
 
-### 英文部分，下载了 WikiText 数据
 
 
 ## 词表制作
@@ -16,3 +40,5 @@
 
 注：此方法拼接产出的词表容易出现UNK的情况。
 如issue[2927](https://github.com/PaddlePaddle/PaddleNLP/issues/2927)
+
+### 英文部分，下载了 WikiText 数据

From 8af8082648941be986815b63dd0443a61ee80c0c Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Mon, 22 Aug 2022 23:10:15 +0800
Subject: [PATCH 27/48] ernie cw readme.

---
 model_zoo/ernie-1.0/README.md                 | 144 +++++++++++++
 .../data_tools/create_pretraining_data.py     |  11 +
 model_zoo/ernie-1.0/scripts/README.md         | 194 ++++++++++++++++--
 model_zoo/ernie-1.0/scripts/gen_vocab.py      |   6 +-
 model_zoo/ernie-1.0/scripts/merge_vocab.py    |  59 +++---
 model_zoo/ernie-1.0/scripts/wudao_process.py  |  24 ++-
 6 files changed, 380 insertions(+), 58 deletions(-)

diff --git a/model_zoo/ernie-1.0/README.md b/model_zoo/ernie-1.0/README.md
index 9920d87da66d..87d395c46a61 100644
--- a/model_zoo/ernie-1.0/README.md
+++ b/model_zoo/ernie-1.0/README.md
@@ -56,6 +56,10 @@ ERNIE预训练采用的是MLM（Mask Language Model）的训练方式，采用WW
 用户可以根据自己的需求，灵活修改mask方式。具体可以参考修改`data_tools/dataset_utils.py`中`create_masked_lm_predictions`函数。
 用户可以设置`checkpoint_steps`，间隔`checkpoint_steps`数，即保留最新的checkpoint到`model_last`文件夹。重启训练时，程序默认从最新checkpoint重启训练，学习率、数据集都可以恢复到checkpoint时候的状态。
 
+下面是使用CLUECorpusSmall 14G文本进行预训练的流程：
+<details>
+<summary><b>CLUECorpusSmall 数据集预训练</b></summary>
+
 ### 数据准备
 数据下载部分请参考[data_tools]目录，根据文档中`CLUECorpusSmall 数据集处理教程`，下载数据。下载好后:
 
@@ -182,6 +186,146 @@ ERINE-1.0-cluecorpussmall | 12L768H | 73.24(-0.54) | 74.26 | 57.24 | 60.79 | 81.
 注:
 - `ERNIE-1.0 Base`官方预训练参数，采用的训练配置是batch_size=1024、steps=100w，
 - `ERINE-1.0-cluecorpussmall`复现版本，采用的是batch_size=512、steps=100w。
+</details>
+
+
+### ERNIE-CW 预训练流程
+
+PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 总共400GB，发布ERNIE-CW项目。让用户可以从零开始构建你的预训练模型。
+
+ERNIE-CW项目，从数据下载，词表制作，数据转化，模型训练，所有流程，完全开源开放，可复现。
+并训练发布开源最优的模型参数。
+
+数据下载，词表制作，数据转化部分，请参见[此处](./scripts/README.md)。
+接下来我们主要介绍训练流程部分的特性
+
+
+训练结构：
+- 支持SOP损失，灵活可配置。
+训练方式：
+- 同时支持动态图和静态图训练
+
+**训练速度方面**，我们支持了如下策略，加速计算过程，减小显存占用，扩大batch_size：
+
+- **多卡多机训练**：
+    - 基于飞桨Fleet分布式API，用户可以十分方便的通过数据并行的方法，将训练扩展到多机多卡。
+- **混合精度训练**：
+    - 部分算子使用FP16计算kernel，加速计算过程。支持AMP混合精度O1，和Pure FP16全FP训练策略O2。
+- **梯度累积训练**：
+    - 用户可以指定梯度累积的步数，在梯度累积的step中，减少多卡之间梯度的通信，减少更新的次数，可以扩大训练的batch_size.
+- **重计算训练**：
+    -  通过重新计算前向的方式，减少前向网络中间变量的存储，可以显著减少显存占用，
+
+
+**训练体验方面**，我们针对训练数据流、重启、可视化等方面做了针对性优化提升
+
+数据流
+- **多机扩展**
+    - 用户可以将数据放置到 NFS 服务器上，多机同时挂载数据即可。训练数据与计算资源分离。
+- **多数据混合**
+    - 训练数据集支持多个文件，即插即用，设置权重，传入参数即可data_dir="1.0  dataset_a  2.0 dataset_b"
+- **稳定可复现**
+    - MLM任务具有一定随机性，需要随机mask数据。本数据流通过固定每一个step数据的随机种子，实验数据流稳定可复现。
+- **快加载**
+    - 数据文件使用mmap读取，加载数百GB文件几乎不耗时。
+
+其他：
+- **断点重启**
+    - 用户可以单独设置，checkpoints steps 参数可设置较小，重启训练默认加载最新checkpoint。
+    - 断点数据自动恢复，学习率等参数也自动恢复。
+- **可视化日志记录**
+    - 日志展示为全局loss，波动小。
+    - 记录混合精度，loss_scaling等信息，方便用户debug。
+    - 对模型结构，配置参数，paddle版本信息进行记录，方便复现环境
+
+**训练效果方面**，我们release了base、large两个模型。均取得了较好的预训练效果。
+
+**ERNIE 3.0-Base-zh-CW** 模型：
+
+- 使用CLUE，WuDao共计400GB的语料，batch_size 1024, 训练 400w step，即可训练得到`ernie-3.0-base-zh`类似的模型效果。相关模型参数，开源为`ernie-3.0-base-zh-cw`，用户加载即可使用。
+使用CLUE benchmark 对超参数进行Grid Search搜索，最优结果如下：
+
+Model | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUEWSC2020 | CSL | CMRC | CHID | C3
+-- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
+Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1 | Acc | Acc
+ERNIE 3.0-Base-zh-CW | 12L768H | 76.44 | 76.04 |	58.02 |	60.87 |	83.56 | 78.61 |	89.14 |	84.00 |  72.26/90.40 |	84.73 |	77.15 |
+ERNIE 2.0-Base-zh | 12L768H | 74.95  | 76.25 |	58.53 |	61.72 |	83.07 |	78.81 |	84.21 |	82.77 | 68.22/88.71	| 82.78	| 73.19
+ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |	58.91 |	62.25 |	81.68 |	76.58 |	85.20 |	82.77 | 67.32/87.83 | 82.47 | 69.68
+
+**ERNIE 1.0-Large-zh-CW** 模型：
+- 除了base模型外，我们还训练并放出了large模型。此模型采用的词表与ernie-1.0相同，因此命名为`ernie-1.0-large-zh-cw`。
+使用开源语料，batch_size 512, 训练 400w step，训练去除SOP任务，只保留MLM损失：
+
+Model | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUEWSC2020 | CSL | CMRC | CHID | C3
+-- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
+Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1 | Acc | Acc
+ERNIE 1.0-Large-zh-CW | 24L1024H | 79.03 | 75.97 |	59.65 |	62.91 |	85.09 |	81.73 |	93.09 |	84.53 | 74.22/91.88 | 88.57 | 84.54
+ERNIE 3.0-Xbase-zh| 20L1024H | 78.71 | 76.85 |	59.89 |	62.41 |	84.76 |	82.51 |	89.80 |	84.47 |	75.49/92.67 | 86.36 | 84.59
+RoBERTa-wwm-ext-large | 24L1024H | 76.61 |	76.00 |	59.33 |	62.02 |	83.88 |	78.81 |	90.79 |	83.67 |	70.58/89.82 |	85.72 |	75.26
+
+
+
+<details>
+<summary><b>训练脚本如下</b></summary>
+
+训练脚本如下
+```shell
+set -x
+
+# cd PaddleNLP/model_zoo/ernie-1.0
+export PYTHONPATH=$PYTHONPATH:../../
+
+export FLAGS_call_stack_level=2
+# export NCCL_SOCKET_IFNAME=xgbe0
+export FLAGS_gemm_use_half_precision_compute_type=False
+export FLAGS_enable_eager_mode=1
+unset CUDA_VISIBLE_DEVICES
+
+trainer_id=${PADDLE_TRAINER_ID:-"0"}
+task_name="0809-ernie-3.0-base-cw-dp16-gb1024"
+
+base_nfs="/path/to/your/nfs/mount/point"
+base_dir="${base_nfs}/ernie-cw/output/${task_name}"
+data_dir="5.0 ${base_nfs}/clue_oscar/clue_corpus_oscar_0630 7.0 ${base_nfs}/clue_train/clue_corpus_train_0629 12.0 ${base_nfs}/wudao_200g/wudao_200g_0703"
+vocab_dir="${base_nfs}/"
+
+python3 -u  -m paddle.distributed.launch \
+    --gpus "0,1,2,3,4,5,6,7" \
+    --log_dir "${base_dir}/log_${trainer_id}" \
+    run_pretrain.py \
+    --model_type "ernie" \
+    --model_name_or_path "ernie-3.0-base-zh" \
+    --tokenizer_name_or_path "${vocab_dir}" \
+    --input_dir "${data_dir}" \
+    --output_dir "${base_dir}" \
+    --fp16_opt_level "O1" \
+    --max_seq_len 512 \
+    --binary_head true \
+    --micro_batch_size 64 \
+    --sharding_degree 1\
+    --dp_degree 16 \
+    --use_sharding false \
+    --use_amp true \
+    --use_recompute false \
+    --max_lr 0.0001 \
+    --min_lr 0.00001 \
+    --max_steps 4000000 \
+    --save_steps 100000 \
+    --checkpoint_steps 5000 \
+    --decay_steps 3900000 \
+    --weight_decay 0.01 \
+    --warmup_rate 0.01 \
+    --grad_clip 1.0 \
+    --logging_freq 20 \
+    --num_workers 3 \
+    --eval_freq 1000 \
+    --device "gpu"\
+    --share_folder true \
+    --hidden_dropout_prob 0.1 \
+    --attention_probs_dropout_prob 0.1 \
+    --seed 1234 \
+```
+</details>
 
 ### 预训练模型贡献
 PaddleNLP为开发者提供了[community](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/docs/community/contribute_models/contribute_awesome_pretrained_models.rst)模块，用户可以上传自己训练的模型，开源给其他用户使用。
diff --git a/model_zoo/ernie-1.0/data_tools/create_pretraining_data.py b/model_zoo/ernie-1.0/data_tools/create_pretraining_data.py
index d49c511160fb..f74b1e3301a4 100644
--- a/model_zoo/ernie-1.0/data_tools/create_pretraining_data.py
+++ b/model_zoo/ernie-1.0/data_tools/create_pretraining_data.py
@@ -263,6 +263,17 @@ def initializer(self):
 
         def process(text):
             words = Converter.segment_func(text)
+            # Two consecutive empty words mean a delimiter was there: restore it as " ".
+            pre_dimer = False
+            for index, w in enumerate(words):
+                if pre_dimer and len(w) == 0:
+                    words[index] = " "
+                    pre_dimer = False
+                elif len(w) == 0:
+                    pre_dimer = True
+                else:
+                    pre_dimer = False
+
             tokens = Converter.tokenizer.tokenize("".join(words))
             tokens = Converter.whole_word_mask(tokens, words)
             tokens = Converter.tokenizer.convert_tokens_to_ids(tokens)
diff --git a/model_zoo/ernie-1.0/scripts/README.md b/model_zoo/ernie-1.0/scripts/README.md
index aef873e51174..a4be04de95f1 100644
--- a/model_zoo/ernie-1.0/scripts/README.md
+++ b/model_zoo/ernie-1.0/scripts/README.md
@@ -1,44 +1,206 @@
-# ERNIE 数据制作全流程
+# ERNIE-CW 从零开始构建预训练模型
 
+ERNIE是百度提出的大规模预训练模型，曾在中文场景下取得了SOTA效果。
+PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 总共400GB，发布ERNIE-CW项目。项目目标：从零开始构建你的预训练模型。
 
-## 数据下载
+ERNIE-CW项目，从数据下载，词表制作，数据转化，模型训练，所有流程，完全开源开放，可复现。
+并训练发布开源最优的模型参数。
+
+接下来将从下面几个方面，详细介绍整个数据制作全流程，从零开始，构建一个预训练模型。
+
+- **大规模**中文数据
+- **高精准**中文分词
+- **全字符**中文词表制作
+- **快速**Token ID 转化
+
+**目录**
+* [大规模中文数据](#大规模中文数据)
+* [高精准中文分词](#高精准中文分词)
+* [中文全字符词表制作](#中文全字符词表制作)
+* [快速Token ID 转化](#快速TokenID转化)
+
+
+## 大规模中文数据
 
 ### CLUECorpus2020  & WuDaoCorpus2.0 Base 数据集
 
-CLUE 提供了约200G左右的语料文本，详细介绍见[官网](https://github.com/CLUEbenchmark/CLUECorpus2020#%E6%95%B0%E6%8D%AE%E4%B8%8B%E8%BD%BD)，用户可以申请下载，方式如下：
+CLUECorpus2020 通过对Common Crawl的中文部分进行语料清洗得到。开源部分提供了约200G左右的语料文本，详细介绍见[官网](https://github.com/CLUEbenchmark/CLUECorpus2020#%E6%95%B0%E6%8D%AE%E4%B8%8B%E8%BD%BD)，用户可以通过邮件申请下载，方式如下：
 > 数据下载
 > 申请方式： 将使用语料研究目的和用途，计划、研究机构和申请者介绍，发送到邮箱，并承诺不向第三方提供。
 >
 > 邮箱: CLUEbenchmark@163.com，标题是：CLUECorpus2020 200G语料库
 
+WuDaoCorpora是悟道爬取的中文大规模语料。整体数量为3TB，目前开源的部分为WuDaoCorpus2.0 bases数据集，大小为200GB。
+用户微信登录[官网](https://resource.wudaoai.cn/home)，即可直接下载数据。下载好的压缩数据约 64GB
+```
+64GB WuDaoCorpus2.0_base_200G.rar
+```
 
-WuDaoCorpora是悟道爬取的中文大规模预料。整体数量为3TB，目前开源的部分为WuDaoCorpus2.0 bases数据集，大小为200GB。
-用户微信登录[官网](https://resource.wudaoai.cn/home)，即可直接下载数据。
 
-### 中文预料分词
+## 高精准中文分词
 
 ERNIE 使用知识嵌入的方式进行预训练，如何尽可能精确的从原始文本中提取知识，直接关系预训练模型的效果。
-目前采用的分词方式的有jieba，lac，Wordtag，效果以此
-速度对比，假设CPU使用40线程，GPU使用16卡：
+目前PaddleNLP常用的分词方式有`jieba`，`lac`，`Wordtag`，
+效果、速度对比表格如下，假设CPU使用40线程，GPU使用16卡，处理200G文本：
 
-| 切词方式 | 效果 | 速度 | 耗时（处理200G）
+| 切词方式 | 效果 | 速度 | 预估耗时
 |-|-|-|-|
 | jieba | 一般 | 607 KB/s |  2.5 h |
 | lac   | 好 | 106 KB/s | 13.9 h
-| wordtag| 最好 | 0.94 KB/s | 159D|
+| wordtag| 最好 | 0.94 KB/s | 159 D (GPU)|
+
+综合考虑分词的效果与速度，我们选择百度的LAC作为我们的文本分词工具。
 
-综合考虑分词的效果与速度，我们选择百度的LAC作为我们的分词工具。
+本文档以WuDao数据为例，对数据进行分词：
 
+```shell
+python wudao_process.py \
+    --input_path WuDaoCorpus2.0_base_200G \
+    --workers 40  \
+    --output_path ./wudao_lac_cut
+```
+注：预训练需要实现 SOP( Sentence Order Predict) 任务，在分词的同时，我们使用 简单规则 进行了文本断句。
 
+文本转化完成后。我们使用 `../data_tools/trans_to_json.py`重新转换为jsonl格式（分词完毕）。
+```shell
+python ../data_tools/trans_to_json.py  \
+    --input_path ./wudao_lac_cut \
+    --output_path wudao_corpus_200g_0623.jsonl \
+    --workers 40 \
+    --no-shuffle
+```
 
+## 中文全字符词表制作
 
-## 词表制作
+词表的制作有两种方案：
 
+第一种，词表组合思路
 1. 统计字符
 2. 制作英文词表
 3. 合并词表
 
-注：此方法拼接产出的词表容易出现UNK的情况。
-如issue[2927](https://github.com/PaddlePaddle/PaddleNLP/issues/2927)
-
-### 英文部分，下载了 WikiText 数据
+第二种，预处理后直接生成
+1. 文本预处理（中文加空格，文本normalize）
+2. 使用sentencepiece制作词表
+
+第二种方案需要对文本先使用`BasicTokenizer`切分一遍语料。
+第一种方案，自定义程度高，但存在一些局限性。本项目采用了第一种方案，详细介绍如下：
+
+### 分析准备
+词表大小： 这里我们考虑的因素主要有两个
+- 已有模型对照：
+    - ERNIE 3.0系列模型的词表，词表大小为 40000 左右。
+- 预训练数据存储占用：
+    - 文本token id化后，希望使用uint16表示，此时表示的最大字符为65536。
+    - 同时考虑到ERNIE虽然是字模型，我们仍然需要 `##中` 之类的中文字符表示分词信息。假设使用全部中文字符（0x4E00-0x9FA5，共20902个），那么剩余 vocab 大小不能超过 44634。
+
+综上，ERNIE-CW决定采用 40000 左右的 vocab 容量。
+其中：
+- 中文全字符 `20902`
+- 英文字符 `17000`
+- 其他字符约 `2000` 左右
+
+
+### 文本字符统计
+首先第一步是对文本字符进行统计。字符统计的目的主要是添加常用的中文字符、特殊字符。
+
+由于语料文本过大，我们随机选取 10G 左右的原始文本进行了字符统计。
+```
+python gen_char.py path_to_corpus.txt
+```
+可以在本地文件夹得到`char_dict.pickle`字符频率文件。同时我们也提供了自己统计的词频文件，方便用户复现：
+```
+wget https://xxx.bos/data/char_dict.pickle
+```
+
+### 英文字符词表
+基于字符的词频统计，使得英文字符也切割为字母，为此我们需要添加英文词表。
+英文部分，我们使用了 [WikiText](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip)  数据集，来构造词表。
+下载解压数据，使用BPE切词
+```
+wget  https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip
+unzip wikitext-103-v1.zip
+python gen_vocab.py ./wikitext-103-raw/wiki.train.raw
+```
+即可产生英文部分的词表。这里我们也提供了处理好的 vocab 方便用户验证。
+```
+wget  https://xxx.bos/data/eng.vocab
+```
+
+
+### 合并词表
+
+目前我们得到了字符统计表，和英文字符词表。下一步，我们将词表进行合并。
+
+将`char_dict.pickle`，`eng.vocab`放置到当前目录，使用下面命令
+```
+python merge_vocab.py
+```
+即可在 当前 目录生成 vocab.txt 得到最终词表。
+
+此阶段需要注意的一些问题是：
+1. 对于一些日文、谚文文字字符，需要进行 normalize
+2. 添加special_tokens
+
+### 问题遗留
+本项目采用的第一种方式，即拼接产出的词表，对连续非中、英文字符文本，会出现UNK的情况。
+如issue: [#2927](https://github.com/PaddlePaddle/PaddleNLP/issues/2927)、 [#2585](https://github.com/PaddlePaddle/PaddleNLP/issues/2585)。ERNIE-CW做了两点改进:
+
+1. 对 Symbol 字符默认添加空格，变成独立字符
+2. 对 日文、谚文 在合并词表阶段默认添加 ## 字符。
+
+虽然有上述两点修复，仍然无法避免 [#2927](https://github.com/PaddlePaddle/PaddleNLP/issues/2927) 现象。
+彻底解决的话，建议使用第二种方式制作vocab文件。
+
+### 方案二：预处理后直接生成
+此方案没有被采用，这里也简单说明一下具体的方案：
+1. 对语料使用 BasicTokenizer 转换
+```python
+from paddlenlp.transformers import BasicTokenizer
+tokenizer = BasicTokenizer()
+basic_toknizer = lambda x: " ".join(tokenizer.tokenize(x))
+# 对语料使用 basic_toknizer 转换
+# 并存储为新的语料 afer_basic_toknizer_corpus.txt
+```
+2. 处理转换后的语料
+```shell
+python gen_vocab.py afer_basic_toknizer_corpus.txt
+```
+对处理好的vocab文件手动替换一些`<pad> -> [PAD]`之类的special_tokens，即可产出词表。
+
+
+## 快速Token ID 转化
+
+语料、词表准备妥当后，我们可以开始进行最后的数据ID转化。
+
+- 高效的 Multiprocessing 多进程实现
+- 使用内存BytesIO存储ID数据
+
+由于转换的逻辑复杂，需要定义`class Converter`对象来进行转化处理。如果每次处理新的文本，都实例化一次class对象，速度瓶颈会在处理函数的实例化。
+我们使用了multiprocessing.Pool的`initializer`，对处理函数进行提前实例化，提高处理效率。
+
+处理后的token id数量巨大，可以达到数百Billion，如果使用普通的数据结构，如python的list保存，会出现存储瓶颈，不仅占用空间大，list对象还需要重新分配内存空间。这里我们采用了 BytesIO 的方式，类似写入内存文件的方式，速度快，可以非常方便转化为numpy文件保存。
+
+使用 Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz CPU测试，40线程，处理速度 8+MB/s，约7个小时左右，即可完成 200GB 文本转化为ID.
+
+```
+python -u  create_pretraining_data.py \
+    --model_name ./vocab_path/vocab.txt \
+    --tokenizer_name ErnieTokenizer \
+    --input_path wudao_corpus_200g_0623.jsonl \
+    --split_sentences\
+    --chinese \
+    --cn_splited \
+    --cn_whole_word_segment \
+    --output_prefix wudao_200g_0703 \
+    --workers 40 \
+    --log_interval 1000
+```
+转化后的数据如下，使用这份数据，即可开始ERNIE-CW预训练
+```
+-rw-rw-r-- 1 500 501 129G Jul  4 03:39 wudao_200g_0703_ids.npy
+-rw-rw-r-- 1 500 501 6.4G Jul  4 03:39 wudao_200g_0703_idx.npz
+```
+
+## 其他
+- 感谢CLUE，WuDao提供的开源数据
diff --git a/model_zoo/ernie-1.0/scripts/gen_vocab.py b/model_zoo/ernie-1.0/scripts/gen_vocab.py
index 595dabcf9567..79480f9a4f44 100644
--- a/model_zoo/ernie-1.0/scripts/gen_vocab.py
+++ b/model_zoo/ernie-1.0/scripts/gen_vocab.py
@@ -12,10 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import sys
 import sentencepiece as spm
 
+input_path = sys.argv[1]
+print("Generate vocabulary file for corpus:  ", input_path)
+
 spm.SentencePieceTrainer.train(
-    input='../wikitext/wiki.all.raw',
+    input=input_path,
     model_prefix='eng',
     vocab_size=17000,
     model_type="BPE",
diff --git a/model_zoo/ernie-1.0/scripts/merge_vocab.py b/model_zoo/ernie-1.0/scripts/merge_vocab.py
index 8907f8ae56d9..07f472e4a391 100644
--- a/model_zoo/ernie-1.0/scripts/merge_vocab.py
+++ b/model_zoo/ernie-1.0/scripts/merge_vocab.py
@@ -31,48 +31,49 @@
 normalize_chars = lambda x: "".join(bt.tokenize(x))
 
 
+# 20902 个中文全字符
 def chinese_char():
     return set([chr(x) for x in range(0x4E00, 0x9FA5 + 1)])
 
 
+# 日文 或 谚文字母
 def jk_vocab(c):
     c = ord(c)
     return (c >= 0x3040 and c<= 0x33FF) or \
               (c>= 0x1100 and c<=0x11FF)   #  谚文字母
 
 
+# 特殊 TOKEN
 def add_special_token():
     return ["[PAD]", "[CLS]", "[SEP]", "[MASK]", "[UNK]"]
 
 
 char_dict = pickle.load(open("char_dict.pickle", "rb"))
-cjk_vocab = chinese_char()
-
+chinese_vocab = chinese_char()
 final_vocab = set()
-
-# Not in use char
-# final_vocab.add(" ")
-# final_vocab.add("\n")
-
 other_char = []
 
 
 def add_vocab(char, f):
     if re_sep_eng.match(char):
+        # Delete <pad> tokens in eng.vocab
         return
-    # add eng vocab and specical token
+
+    # Add eng vocab and special token
     if re_eng.match(char) or re_sep.match(char):
         if char not in final_vocab:
             final_vocab.add(char)
             f.write(f"{char}\n")
         return
-    # add japanese and Korean char
-    if len(char) > 1 and char.startswith("##") and cjk_vocab(char[2]):
+
+    # Add "##"-prefixed japanese and korean char
+    if len(char) > 2 and char.startswith("##") and jk_vocab(char[2]):
         if char not in final_vocab:
             final_vocab.add(char)
             f.write(f"{char}\n")
         return
 
+    # Normalize char， 部分字符 nioe
     char = normalize_chars(char)
     for i, k in enumerate(char):
         if _is_whitespace(k) or _is_control(k):
@@ -84,6 +85,7 @@ def add_vocab(char, f):
             final_vocab.add(k)
             f.write(f"{k}\n")
             if jk_vocab(k):
+                # add "##" for japanese and korean char
                 add_vocab("##" + k, f)
 
 
@@ -92,48 +94,43 @@ def add_vocab(char, f):
         add_vocab(x, f)
 
     res = sorted(char_dict.items(), key=lambda x: -x[1])
-    # Add cjk by freq
+
+    # Add chinese char by freq
     for x in res:
         k, v = x
         k = normalize_chars(k)
-        if k in cjk_vocab:
+        if k in chinese_vocab:
             add_vocab(k, f)
-            cjk_vocab.remove(k)
-    # if cjk not in freq add it
-    cjk_vocab = sorted(cjk_vocab)
-    while len(cjk_vocab) > 0:
-        k = cjk_vocab.pop()
+            chinese_vocab.remove(k)
+
+    # If chinese char not in freq add it
+    chinese_vocab = sorted(chinese_vocab)
+    while len(chinese_vocab) > 0:
+        k = chinese_vocab.pop()
         if k not in final_vocab:
             f.write(f"{k}\n")
             final_vocab.add(k)
+
+    # Add english vocab part
     with open("eng.vocab") as ec:
         line = ec.readline()
         while line:
             k, v = line.strip().split()
             if "▁" in k:
+                # remove "▁" in eng vocab
                 k = k[1:]
             elif re_sep_eng.match(k):
                 pass
             else:
+                # add "##" for eng vocab
                 k = "##" + k
 
             add_vocab(k, f)
             line = ec.readline()
+
+    # Add additional tokens in corpus
+    # such as japanese and korean char and other symbols
     for x in res:
         k, v = x
         if v >= 200:
             add_vocab(k, f)
-
-    # addition = []
-    # for x in res:
-    #     oldk,v = x
-    #     k = normalize_chars(oldk)
-    #     for c in k:
-    #         if c not in final_vocab and  v >= 200:
-    #             addition.append(c)
-    #             final_vocab.add(c)
-    # for k in sorted(addition):
-    #     f.write(f"{k}\n")
-
-# for k in sorted(other_char, key= lambda x:ord(x)):
-#     print(k)
diff --git a/model_zoo/ernie-1.0/scripts/wudao_process.py b/model_zoo/ernie-1.0/scripts/wudao_process.py
index fb24511695c2..14443bb55c09 100644
--- a/model_zoo/ernie-1.0/scripts/wudao_process.py
+++ b/model_zoo/ernie-1.0/scripts/wudao_process.py
@@ -12,8 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-input_path = "WuDaoCorpus2.0_base_200G/"
-
 import json
 import re
 import argparse
@@ -37,6 +35,7 @@ def get_args():
                         help='Number of worker processes to launch')
     parser.add_argument('--output_path',
                         type=str,
+                        default="./tmp",
                         help='Path to save the output json files.')
     parser.add_argument('--log_interval',
                         type=int,
@@ -88,8 +87,8 @@ def process(line):
 split_chars = ['。', '?', '？', ';', '；', '!', '！']
 
 
-def text_to_text(path):
-    out_name = "./tmp/" + path[-20:]
+def text_to_text(output_path, path):
+    out_name = os.path.join(output_path, path[-20:])
     print("Loading %s" % path)
     with open(path, "r") as f:
         try:
@@ -113,13 +112,15 @@ def text_to_text(path):
             count += 1
             text = js["content"]
             data_len += len(text.encode("utf-8"))
-            # make special char only once, because of those token will be treat as sentence spliter.
+            # make special char only once,
+            # because those tokens will be treated as sentence splitters.
+            # 此处为断句逻辑
             for char in special_chars:
                 text = re.sub('[' + char + ']+[ ]*', char, text)
-            # space will be treat as comma, WARM, not in eng
-            # text = text.replace(" ", "，")
             for char in split_chars:
                 text = text.replace(char, char + "\n")
+
+            # 此处为分词逻辑
             final = ""
             for line in text.split("\n"):
                 if len(line) == 0:
@@ -127,8 +128,6 @@ def text_to_text(path):
                 words = seg_func(line)
                 final += " ".join(words) + "\n"
             f.write(final + "\n")
-            # if count % 100 == 0:
-            #     print("speed: ", data_len/1024/(time.time() - s))
 
     return data_len, None
 
@@ -152,7 +151,12 @@ def main():
     total_bytes_processed = 0
     print("Time to startup:", startup_end - startup_start)
 
-    encoded_files = pool.imap(text_to_text, file_paths, 1)
+    if not os.path.exists(args.output_path):
+        os.makedirs(args.output_path)
+
+    trans_func = partial(text_to_text, args.output_path)
+
+    encoded_files = pool.imap(trans_func, file_paths, 1)
 
     out_paths = []
     for i, (bytes_processed, out_path) in enumerate(encoded_files, start=1):

From 3dbb34f99acbb0c765a70a39dc14822a14887c19 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Mon, 22 Aug 2022 23:22:49 +0800
Subject: [PATCH 28/48] fix

---
 model_zoo/ernie-1.0/README.md         | 2 +-
 model_zoo/ernie-1.0/scripts/README.md | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/model_zoo/ernie-1.0/README.md b/model_zoo/ernie-1.0/README.md
index 69a66859dd84..6e32059b8351 100644
--- a/model_zoo/ernie-1.0/README.md
+++ b/model_zoo/ernie-1.0/README.md
@@ -264,7 +264,7 @@ ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |	58.91 |	62.25 |	81.68 |	76.58 |	85
 Model | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUEWSC2020 | CSL | CMRC | CHID | C3
 -- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
 Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc| Acc| Acc
-ERNIE 1.0-Large-zh-CW | 24L1024H | 79.03 | 75.97 |	59.65 |	62.91 |	85.09 |	81.73||	93.09 |	84.53 | 74.22/91.88 | 88.57 | 84.54
+ERNIE 1.0-Large-zh-CW | 24L1024H | 79.03 | 75.97 |	59.65 |	62.91 |	85.09 |	81.73| 93.09 |	84.53 | 74.22/91.88 | 88.57 | 84.54
 ERNIE 3.0-Xbase-zh| 20L1024H | 78.71 | 76.85 |	59.89 |	62.41 |	84.76 |	82.51 |	89.80 |	84.47 |	75.49/92.67 | 86.36 | 84.59
 RoBERTa-wwm-ext-large | 24L1024H | 76.61 |	76.00 |	59.33 |	62.02 |	83.88 |	78.81 |	90.79 |	83.67 |	70.58/89.82 |	85.72 |	75.26
 
diff --git a/model_zoo/ernie-1.0/scripts/README.md b/model_zoo/ernie-1.0/scripts/README.md
index a4be04de95f1..a7f7c2a55e1d 100644
--- a/model_zoo/ernie-1.0/scripts/README.md
+++ b/model_zoo/ernie-1.0/scripts/README.md
@@ -22,7 +22,7 @@ ERNIE-CW项目，从数据下载，词表制作，数据转化，模型训练，
 
 ## 大规模中文数据
 
-### CLUECorpus2020  & WuDaoCorpus2.0 Base 数据集
+**CLUECorpus2020语料**
 
 CLUECorpus2020 过对Common Crawl的中文部分进行语料清洗得到。开源部分提供了约200G左右的语料文本，详细介绍见[官网](https://github.com/CLUEbenchmark/CLUECorpus2020#%E6%95%B0%E6%8D%AE%E4%B8%8B%E8%BD%BD)，用户可以通过邮件申请下载，方式如下：
 > 数据下载
@@ -30,6 +30,8 @@ CLUECorpus2020 过对Common Crawl的中文部分进行语料清洗得到。开
 >
 > 邮箱: CLUEbenchmark@163.com，标题是：CLUECorpus2020 200G语料库
 
+**WuDaoCorpus2.0 Base 语料**
+
 WuDaoCorpora是悟道爬取的中文大规模语料。整体数量为3TB，目前开源的部分为WuDaoCorpus2.0 bases数据集，大小为200GB。
 用户微信登录[官网](https://resource.wudaoai.cn/home)，即可直接下载数据。下载好的压缩数据约 64GB
 ```
@@ -74,12 +76,12 @@ python ../data_tools/trans_to_json.py  \
 
 词表的制作有两种方案：
 
-第一种，词表组合思路
+第一种，词表组合方案
 1. 统计字符
 2. 制作英文词表
 3. 合并词表
 
-第二种，预处理后直接生成
+第二种，预处理后直接生成方案
 1. 文本预处理（中文加空格，文本normalize）
 2. 使用sentencepeice制作词表
 

From 4bbd3fb5d4d11d051b2f8a748e431a227b0afb26 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Wed, 24 Aug 2022 16:04:29 +0800
Subject: [PATCH 29/48] throw error when dataset is invalid.

---
 model_zoo/ernie-1.0/data_tools/README.md      |  9 ++++---
 .../ernie-1.0/data_tools/dataset_utils.py     | 26 ++++++++++++++-----
 model_zoo/ernie-1.0/data_tools/helpers.cpp    | 10 +++++--
 model_zoo/ernie-1.0/run_pretrain.py           |  1 +
 model_zoo/ernie-1.0/run_pretrain_static.py    |  1 +
 model_zoo/gpt/README.md                       |  3 ++-
 model_zoo/gpt/dataset.py                      | 13 +++++++---
 7 files changed, 47 insertions(+), 16 deletions(-)

diff --git a/model_zoo/ernie-1.0/data_tools/README.md b/model_zoo/ernie-1.0/data_tools/README.md
index b0cd63ae86f9..4cf60cbb1348 100644
--- a/model_zoo/ernie-1.0/data_tools/README.md
+++ b/model_zoo/ernie-1.0/data_tools/README.md
@@ -29,10 +29,11 @@
  - tqdm
  - numpy
  - pybind11
+ - tool_helpers
  - lac (可选)
  - zstandard (可选)
 
-安装命令`pip install tqdm numpy pybind11 lac zstandard`。另，部分功能需要`g++>=4.8`编译支持
+安装命令`pip install tqdm numpy pybind11 tool_helpers lac zstandard`。另，部分功能需要`g++>=4.8`编译支持
 
 
 ## 训练全流程数据Pipeline
@@ -154,14 +155,14 @@ common config:
 ```
 通过下面脚本转化，我们可以得到处理好的预训练数据，token ids:`baike_sample_ids.npy`, 文章索引信息`baike_sample_idx.npz`.
 ```
-python -u  create_pretraining_data.py \
+python -u  data_tools/create_pretraining_data.py \
     --model_name ernie-1.0-base-zh \
     --tokenizer_name ErnieTokenizer \
-    --input_path baike_sample.jsonl \
+    --input_path eng_sample.jsonl \
     --split_sentences\
     --chinese \
     --cn_whole_word_segment \
-    --output_prefix baike_sample  \
+    --output_prefix eng_sample  \
     --workers 1 \
     --log_interval 5
 ```
diff --git a/model_zoo/ernie-1.0/data_tools/dataset_utils.py b/model_zoo/ernie-1.0/data_tools/dataset_utils.py
index b56e9251e8f6..4023413eff21 100755
--- a/model_zoo/ernie-1.0/data_tools/dataset_utils.py
+++ b/model_zoo/ernie-1.0/data_tools/dataset_utils.py
@@ -1,5 +1,7 @@
 # coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors, and NVIDIA, and PaddlePaddle Authors.
+
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+# Copyright 2018 The Google AI Language Team Authors, and NVIDIA.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -89,7 +91,13 @@ def __init__(self, datasets, weights):
 
         while True:
             try:
-                import data_tools.helpers as helpers
+                try:
+                    from tool_helpers import helpers
+                except Exception as ine:
+                    print_rank_0(
+                        ' > missing tool_helpers, pip install tool_helpers please, try to compile locally.'
+                    )
+                    import data_tools.helpers as helpers
                 break
             except Exception as e:
                 if local_rank == 0:
@@ -97,7 +105,6 @@ def __init__(self, datasets, weights):
                 print_rank_0('> wait for hepers to be compiled!')
                 time.sleep(1)
 
-        import data_tools.helpers as helpers
         helpers.build_blending_indices(self.dataset_index,
                                        self.dataset_sample_index, weights,
                                        num_datasets, self.size, local_rank == 0)
@@ -868,9 +875,16 @@ def get_samples_mapping(indexed_dataset, data_prefix, num_epochs,
         print_rank_0(
             ' > building sapmles index mapping for {} ...'.format(name))
         # First compile and then import.
-        if local_rank == 0:
-            compile_helper()
-        import data_tools.helpers as helpers
+        try:
+            from tool_helpers import helpers
+        except ModuleNotFoundError:
+            print_rank_0(
+                ' > missing tool_helpers, pip install tool_helpers please, try to compile locally.'
+            )
+            if local_rank == 0:
+                compile_helper()
+            import data_tools.helpers as helpers
+
         samples_mapping = helpers.build_mapping(indexed_dataset.doc_idx,
                                                 indexed_dataset.sizes,
                                                 num_epochs, max_num_samples,
diff --git a/model_zoo/ernie-1.0/data_tools/helpers.cpp b/model_zoo/ernie-1.0/data_tools/helpers.cpp
index 1b7c9b5e50d9..c66e740bc1ee 100644
--- a/model_zoo/ernie-1.0/data_tools/helpers.cpp
+++ b/model_zoo/ernie-1.0/data_tools/helpers.cpp
@@ -250,6 +250,8 @@ py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
          << std::flush;
     cout << "     maximum sequence length:        " << max_seq_length << endl
          << std::flush;
+    cout << "     minimum sentences num:          " << min_num_sent << endl
+         << std::flush;
     cout << "     short sequence probability:     " << short_seq_prob << endl
          << std::flush;
     cout << "     short sequence ration (1/prob): " << short_seq_ratio << endl
@@ -290,12 +292,17 @@ py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
         }
         break;
       }
+      if( epoch > 0 &&  map_index == 0 ){
+        cout << endl << "     No available document found in this dataset." << endl << std::flush;
+        throw std::invalid_argument(
+          "Invalid dataset! The document should be with more than "
+          + std::to_string(min_num_sent) + " sentences.");
+      }
       // For each document:
       for (int32_t doc = 0; doc < (docs.shape(0) - 1); ++doc) {
         // Document sentences are in [sent_index_first, sent_index_last)
         const auto sent_index_first = docs[doc];
         const auto sent_index_last = docs[doc + 1];
-
         // At the begining of the document previous index is the
         // start index.
         auto prev_start_index = sent_index_first;
@@ -327,7 +334,6 @@ py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
             }
           }
         }
-
         // If we have more than two sentences.
         if ((num_remain_sent >= min_num_sent) && (!contains_long_sentence)) {
           // Set values.
diff --git a/model_zoo/ernie-1.0/run_pretrain.py b/model_zoo/ernie-1.0/run_pretrain.py
index 2e47f8bd89b1..2afe4b3657c7 100644
--- a/model_zoo/ernie-1.0/run_pretrain.py
+++ b/model_zoo/ernie-1.0/run_pretrain.py
@@ -461,6 +461,7 @@ def do_train(args):
         data_world_size=worker_num,
         data_world_rank=worker_index,
         max_seq_len=args.max_seq_len,
+        binary_head=args.binary_head,
         current_step=global_step)
 
     # load checkpoint vars
diff --git a/model_zoo/ernie-1.0/run_pretrain_static.py b/model_zoo/ernie-1.0/run_pretrain_static.py
index e10d13d60333..71809268958a 100644
--- a/model_zoo/ernie-1.0/run_pretrain_static.py
+++ b/model_zoo/ernie-1.0/run_pretrain_static.py
@@ -451,6 +451,7 @@ def do_train(args):
             max_seq_len=args.max_seq_len,
             places=paddle.static.cuda_places(),
             data_holders=data_holders,
+            binary_head=args.binary_head,
             current_step=global_step)
         fleet.init(is_collective=True)
 
diff --git a/model_zoo/gpt/README.md b/model_zoo/gpt/README.md
index c46dd35f3586..65e4b62aa43a 100644
--- a/model_zoo/gpt/README.md
+++ b/model_zoo/gpt/README.md
@@ -29,13 +29,14 @@ GPT-[2](https://cdn.openai.com/better-language-models/language_models_are_unsupe
 - regex
 - sentencepiece >= 0.1.94
 - tqdm
+- tool_helpers
 - visualdl
 - paddlepaddle-gpu >= 2.2rc
 - pybind11
 - lac (可选)
 - zstandard (可选)
 
-安装命令 `pip install regex sentencepiece tqdm visualdl pybind11 lac zstandard`。
+安装命令 `pip install regex sentencepiece tqdm visualdl tool_helpers pybind11 lac zstandard`。
 注：需要PaddlePaddle版本大于等于2.2rc，或者使用最新develop版本，安装方法请参见Paddle[官网](https://www.paddlepaddle.org.cn)。
 
 ### 数据准备
diff --git a/model_zoo/gpt/dataset.py b/model_zoo/gpt/dataset.py
index 8bb8f19a742f..88d4c15deec9 100755
--- a/model_zoo/gpt/dataset.py
+++ b/model_zoo/gpt/dataset.py
@@ -87,7 +87,10 @@ def construct_samples_and_shuffle_data(name, data_prefix, documents, sizes,
             assert doc_idx.dtype == np.int32
             assert sizes.dtype == np.int32
 
-            import data_tools.helpers as helpers
+            try:
+                from tool_helpers import helpers
+            except Exception as e:
+                import data_tools.helpers as helpers
 
             sample_idx = helpers.build_sample_idx(sizes, doc_idx, seq_length,
                                                   num_epochs, tokens_per_epoch)
@@ -275,7 +278,7 @@ def create_pretrained_dataset(
 
     if local_rank == 0:
         try:
-            import data_tools.helpers as helpers
+            from tool_helpers import helpers
         except Exception as e:
             start_time = time.time()
             print('> compiling dataset index builder ...')
@@ -285,6 +288,7 @@ def create_pretrained_dataset(
                 '>>> done with dataset index builder. Compilation time: {:.3f} '
                 'seconds'.format(time.time() - start_time),
                 flush=True)
+            import data_tools.helpers as helpers
 
     device_world_size = paddle.distributed.get_world_size()
     device_world_rank = paddle.distributed.get_rank()
@@ -292,7 +296,10 @@ def create_pretrained_dataset(
     if device_world_size > 1 and local_rank != 0:
         while True:
             try:
-                import data_tools.helpers as helpers
+                try:
+                    from tool_helpers import helpers
+                except Exception as ine:
+                    import data_tools.helpers as helpers
                 break
             except Exception as e:
                 print("> wait for helpers to be compiled!")

From 93b89a0e1eedcb90a247537e21d335db9ede04e7 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Thu, 25 Aug 2022 00:45:04 +0800
Subject: [PATCH 30/48] update document.

---
 model_zoo/ernie-1.0/README.md                 | 124 +++++++++++++-----
 model_zoo/ernie-1.0/data_tools/README.md      |   8 +-
 .../data_tools/create_pretraining_data.py     |  19 +--
 model_zoo/ernie-1.0/run_pretrain.py           |  55 +++++---
 4 files changed, 147 insertions(+), 59 deletions(-)

diff --git a/model_zoo/ernie-1.0/README.md b/model_zoo/ernie-1.0/README.md
index 6e32059b8351..956ebb439816 100644
--- a/model_zoo/ernie-1.0/README.md
+++ b/model_zoo/ernie-1.0/README.md
@@ -1,12 +1,34 @@
 # ERNIE: Enhanced Representation through kNowledge IntEgration
 
+**目录**
+- [1. 模型简介](#模型简介)
+    - [1.1 目录结构](#目录结构)
+    - [1.2 环境依赖](#环境依赖)
+- [2. 中文预训练](#中文预训练)
+    - [2.1 小规模语料预训练: 14GB - CLUECorpusSmall](#CLUECorpusSmall)
+    - [2.2 大规模语料预训练: 400GB - CLUE & WuDao](#ERNIE-CW)
+    - [2.3 预训练模型贡献](#预训练模型贡献)
+- [3. 下游任务微调](#下游任务微调)
+  - [3.1 序列分类](#序列分类)
+  - [3.2 Token分类](#Token分类)
+  - [3.3 阅读理解](#阅读理解)
+- [4. 预测部署](#预测部署)
+- [5. 参考文献](#参考文献)
+
+
+
+
+<a name="模型简介"></a>
+
+## 1. 模型简介
+
 ERNIE是百度开创性提出的基于知识增强的持续学习语义理解框架，它将大数据预训练与多源丰富知识相结合，通过持续学习技术，不断吸收海量文本数据中词汇、结构、语义等方面的知识，实现模型效果不断进化。
 
 ERNIE在情感分析、文本匹配、自然语言推理、词法分析、阅读理解、智能问答等16个公开数据集上全面显著超越世界领先技术，在国际权威的通用语言理解评估基准GLUE上，得分首次突破90分，获得全球第一。
 相关创新成果也被国际顶级学术会议AAAI、IJCAI收录。
 同时，ERNIE在工业界得到了大规模应用，如搜索引擎、新闻推荐、广告系统、语音交互、智能客服等。
 
-ERNIE 1.0 通过建模海量数据中的词、实体及实体关系，学习真实世界的语义知识。相较于 BERT 学习原始语言信号，ERNIE 直接对先验语义知识单元进行建模，增强了模型语义表示能力。
+ERNIE 通过建模海量数据中的词、实体及实体关系，学习真实世界的语义知识。相较于 BERT 学习原始语言信号，ERNIE 直接对先验语义知识单元进行建模，增强了模型语义表示能力。
 
 这里我们举个例子：
 ```
@@ -15,6 +37,21 @@ Learnt by ERNIE：[mask] [mask] [mask] 是黑龙江的省会，国际 [mask] [ma
 ```
 在 BERT 模型中，我们通过『哈』与『滨』的局部共现，即可判断出『尔』字，模型没有学习与『哈尔滨』相关的任何知识。而 ERNIE 通过学习词与实体的表达，使模型能够建模出『哈尔滨』与『黑龙江』的关系，学到『哈尔滨』是 『黑龙江』的省会以及『哈尔滨』是个冰雪城市。
 
+<a name="项目特色"></a>
+
+**项目特色**
+- **中文预训练**
+    - 提供了完整中文预训练流程，从词表构造、数据处理、任务训练，到下游任务。
+    - 提供中文Whole Word Mask，支持文本动态Mask。
+- **数据流程**
+    - 数据预处理流程高效，40分钟即可完成14G ERNIE数据制作。
+    - 数据稳定可复现，多数据集即插即用。
+- **分布式训练**
+    - 支持多机多卡，支持混合精度、重计算、梯度累积等功能。
+
+<a name="目录结构"></a>
+
+### 1.1 目录结构
 
 整体的目录结构如下：
 
@@ -41,7 +78,10 @@ Learnt by ERNIE：[mask] [mask] [mask] 是黑龙江的省会，国际 [mask] [ma
 ├── run_pretrain_static.py
 └── run_pretrain_trainer.py
 ```
-## 环境依赖
+
+<a name="环境依赖"></a>
+
+### 1.2 环境依赖
 
 - visualdl
 - pybind11
@@ -49,19 +89,26 @@ Learnt by ERNIE：[mask] [mask] [mask] 是黑龙江的省会，国际 [mask] [ma
 安装命令 `pip install visualdl pybind11`
 
 
-## 中文预训练
-ERNIE预训练采用的是MLM（Mask Language Model）的训练方式，采用WWM（Whole Word Mask）方式，对于完整语义单元的Token，会同时进行Mask。整体的训练损失loss是mlm_loss + nsp_loss。
+<a name="中文预训练"></a>
+
+## 2. 中文预训练
+ERNIE预训练采用的是MLM（Mask Language Model）的训练方式，采用WWM（Whole Word Mask）方式，对于完整语义单元的Token，会同时进行Mask。整体的训练损失loss是mlm_loss + sop_loss。
+
+本样例为用户提供了高效的训练流程，
+- **支持动态文本mask**： 用户可以根据自己的需求，灵活修改mask方式。具体可以参考修改`data_tools/dataset_utils.py`中`create_masked_lm_predictions`函数。
+- **支持自动断点训练重启恢复**。 用户可以设置`checkpoint_steps`，间隔`checkpoint_steps`数，即保留最新的checkpoint到`model_last`文件夹。重启训练时，程序默认从最新checkpoint重启训练，学习率、数据集都可以恢复到checkpoint时候的状态。
 
-本样例为用户提供了高效的训练流程，支持动态文本mask，自动断点训练重启等功能。
-用户可以根据自己的需求，灵活修改mask方式。具体可以参考修改`data_tools/dataset_utils.py`中`create_masked_lm_predictions`函数。
-用户可以设置`checkpoint_steps`，间隔`checkpoint_steps`数，即保留最新的checkpoint到`model_last`文件夹。重启训练时，程序默认从最新checkpoint重启训练，学习率、数据集都可以恢复到checkpoint时候的状态。
 
+<a name="CLUECorpusSmall"></a>
+
+### 2.1 小规模语料预训练: 14GB - CLUECorpusSmall
 下面是使用CLUECorpusSmall 14G文本进行预训练的流程：
+
 <details>
 <summary><b>CLUECorpusSmall 数据集预训练</b></summary>
 
-### 数据准备
-数据下载部分请参考[data_tools]目录，根据文档中`CLUECorpusSmall 数据集处理教程`，下载数据。下载好后:
+#### 数据准备
+数据下载部分请参考[data_tools](./data_tools)目录，根据文档中`CLUECorpusSmall 数据集处理教程`，下载数据。下载好后:
 
 解压文件
 ```shell
@@ -94,7 +141,7 @@ clue_corpus_small_14g_20220104_ids.npy
 clue_corpus_small_14g_20220104_idx.npz
 ```
 
-###  开始训练
+####  开始训练
 
 将制作好的数据`clue_corpus_small_14g_20220104_ids.npy,clue_corpus_small_14g_20220104_idx.npz`移动到input_dir中，即可开始训练。
 这里以8卡训练为例任务脚本为例：
@@ -160,7 +207,7 @@ python -u  -m paddle.distributed.launch \
 - visualdl的日志在 `./output/ernie-1.0-dp8-gb512/train_log/xxx` 中。
 
 
-### CLUECorpusSmall 数据集训练效果
+#### CLUECorpusSmall 数据集训练效果
 
 使用创建好的训练clue_corpus_small_14g数据集。使用本训练脚本, batch_size=512, max_steps=100w，[详细训练日志](https://www.paddlepaddle.org.cn/paddle/visualdl/service/app/index?id=3fddf650db14b9319f9dc3a91dfe4ac6)
 
@@ -193,16 +240,19 @@ ERINE-1.0-cluecorpussmall | 12L768H | 73.24(-0.54) | 74.26 | 57.24 | 60.79 | 81.
 - `ERINE-1.0-cluecorpussmall`复现版本，采用的是batch_size=512、steps=100w。
 </details>
 
+<a name="ERNIE-CW"></a>
 
-### ERNIE-CW 预训练流程
+### 2.2 大规模语料预训练: 400GB - CLUE & WuDao
 
-PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 总共400GB，发布ERNIE-CW项目。让用户可以从零开始构建你的预训练模型。
+PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 总共400GB，提供大规模语料训练教程，让用户可以从零开始构建，基于大规模语料，训练预训练模型。
 
-ERNIE-CW项目，从数据下载，词表制作，数据转化，模型训练，所有流程，完全开源开放，可复现。
+本教程，从数据下载，词表制作，数据转化，模型训练，所有流程，完全开源开放，可复现。
 并训练发布开源最优的模型参数。
 
+#### 数据制作
+
 数据下载，词表制作，数据转化部分，请参见[此处](./scripts/README.md)。
-接下来我们主要介绍训练流程部分的特性
+接下来我们主要介绍训练流程部分的特性：
 
 
 训练结构：
@@ -245,31 +295,28 @@ ERNIE-CW项目，从数据下载，词表制作，数据转化，模型训练，
 
 **训练效果方面**，我们release了base、large两个模型。均取得了较好的预训练效果。
 
-**ERNIE 3.0-Base-zh-CW** 模型：
-
-- 使用CLUE，WuDao共计400GB的语料，batch_size 1024, 训练 400w step，即可训练得到`ernie-3.0-base-zh`类似的模型效果。相关模型参数，开源为`ernie-3.0-base-zh-cw`，用户加载即可使用。
-使用CLUE benchmark 对最优超参数进行GradSearch搜索：
+- **ERNIE 3.0-Base-zh-CW** 模型：
+    - 使用CLUE，WuDao共计400GB的语料，batch_size 1024, 训练 400w step，即可训练得到`ernie-3.0-base-zh`类似的模型效果。相关模型参数，开源为`ernie-3.0-base-zh-cw`，用户加载即可使用。使用CLUE benchmark 对最优超参数进行GradSearch搜索：
 
 Model | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUEWSC2020 | CSL | CMRC | CHID | C3
 -- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
 Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc| Acc| Acc
-ERNIE 3.0-Base-zh-CW | 12L768H | 76.44 | 76.04 |	58.02 |	60.87 |	83.56 | 78.61 |	89.14 |	84.00 |  72.26/90.40 |	84.73 |	77.15 |
-ERNIE 2.0-Base-zh | 12L768H | 74.95  | 76.25 |	58.53 |	61.72 |	83.07 |	78.81 |	84.21 |	82.77 | 68.22/88.71	| 82.78	| 73.19
-ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |	58.91 |	62.25 |	81.68 |	76.58 |	85.20 |	82.77 | 67.32/87.83 | 82.47 | 69.68
+ERNIE 3.0-Base-zh-CW | 12L768H | 76.44 | 76.04 |    58.02 |    60.87 |    83.56 | 78.61 |    89.14 |    84.00 |  72.26/90.40 |    84.73 |    77.15 |
+ERNIE 2.0-Base-zh | 12L768H | 74.95  | 76.25 |    58.53 |    61.72 |    83.07 |    78.81 |    84.21 |    82.77 | 68.22/88.71    | 82.78    | 73.19
+ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |    58.91 |    62.25 |    81.68 |    76.58 |    85.20 |    82.77 | 67.32/87.83 | 82.47 | 69.68
 
-**ERNIE 1.0-Large-zh-CW** 模型：
-- 除了base模型外，我们还训练了放出了large模型。此模型参数采用的是词表与ernie-1.0相同，因此命名为`ernie-1.0-large-zh-cw`。
-使用开源语料，batch_size 512, 训练 400w step，训练去除SOP任务，只保留MLM损失：
+- **ERNIE 1.0-Large-zh-CW** 模型：
+    - 除了base模型外，我们还训练并发布了large模型。此模型采用的词表与ernie-1.0相同，因此命名为`ernie-1.0-large-zh-cw`。使用开源语料，batch_size 512, 训练 400w step，训练去除SOP任务，只保留MLM损失：
 
 Model | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUEWSC2020 | CSL | CMRC | CHID | C3
 -- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
 Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc| Acc| Acc
-ERNIE 1.0-Large-zh-CW | 24L1024H | 79.03 | 75.97 |	59.65 |	62.91 |	85.09 |	81.73| 93.09 |	84.53 | 74.22/91.88 | 88.57 | 84.54
-ERNIE 3.0-Xbase-zh| 20L1024H | 78.71 | 76.85 |	59.89 |	62.41 |	84.76 |	82.51 |	89.80 |	84.47 |	75.49/92.67 | 86.36 | 84.59
-RoBERTa-wwm-ext-large | 24L1024H | 76.61 |	76.00 |	59.33 |	62.02 |	83.88 |	78.81 |	90.79 |	83.67 |	70.58/89.82 |	85.72 |	75.26
-
+ERNIE 1.0-Large-zh-CW | 24L1024H | 79.03 | 75.97 |    59.65 |    62.91 |    85.09 |    81.73| 93.09 |    84.53 | 74.22/91.88 | 88.57 | 84.54
+ERNIE 3.0-Xbase-zh| 20L1024H | 78.71 | 76.85 |    59.89 |    62.41 |    84.76 |    82.51 |    89.80 |    84.47 |    75.49/92.67 | 86.36 | 84.59
+RoBERTa-wwm-ext-large | 24L1024H | 76.61 |    76.00 |    59.33 |    62.02 |    83.88 |    78.81 |    90.79 |    83.67 |    70.58/89.82 |    85.72 |    75.26
 
 
+#### 开始训练
 <details>
 <summary><b>训练脚本如下</b></summary>
 
@@ -332,6 +379,8 @@ python3 -u  -m paddle.distributed.launch \
 ```
 </details>
 
+<a name="预训练模型贡献"></a>
+
 ### 预训练模型贡献
 PaddleNLP为开发者提供了[community](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/docs/community/contribute_models/contribute_awesome_pretrained_models.rst)模块，用户可以上传自己训练的模型，开源给其他用户使用。
 使用本文档给出的参数配置，在CLUECorpusSmall数据集上训练，可以得到`zhui/ernie-1.0-cluecorpussmall`参数，可直接使用。
@@ -341,14 +390,17 @@ model = AutoModelForMaskedLM.from_pretrained('zhui/ernie-1.0-cluecorpussmall')
 
 贡献预训练模型的方法，可以参考[贡献预训练模型权重](https://github.com/PaddlePaddle/PaddleNLP/blob/develop/docs/community/contribute_models/contribute_awesome_pretrained_models.rst)教程。
 
+<a name="下游任务微调"></a>
 
-## 下游任务finetune
+## 3. 下游任务微调
 
 使用训练中产出的checkpoint，或者paddlenlp内置的模型权重，使用本脚本，用户可以快速对当前模型效果进行评估。
 
 ### 运行示例
 本文档适配了三大主流下游任务，用户可以根据自己的需求，评估自己所需的数据集。
 
+<a name="序列分类"></a>
+
 1. 序列分类
 ```shell
 cd finetune
@@ -362,6 +414,8 @@ python run_seq_cls.py \
     --output_dir ./tmp/$dataset
 ```
 
+<a name="Token分类"></a>
+
 2. Token分类
 ```shell
 cd finetune
@@ -375,6 +429,8 @@ python run_ner.py \
     --output_dir ./tmp/$dataset
 ```
 
+<a name="阅读理解"></a>
+
 3. 阅读理解
 ```shell
 cd finetune
@@ -388,7 +444,9 @@ python run_qa.py \
 ```
 
 
-## 预测部署
+<a name="预测部署"></a>
+
+## 4. 预测部署
 以中文文本情感分类问题为例，介绍一下从模型finetune到部署的过程。
 
 与之前的finetune参数配置稍有区别，此处加入了一些配置选项。
@@ -427,5 +485,7 @@ Data: 挺失望的,还不如买一本张爱玲文集呢,以<色戒>命名,可这
 ```
 更多关于部署的情况可以参考[此处](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/text_classification/pretrained_models#%E6%A8%A1%E5%9E%8B%E9%A2%84%E6%B5%8B)。
 
-## 参考文献
+<a name="参考文献"></a>
+
+## 5. 参考文献
 - [ERNIE: Enhanced Representation through Knowledge Integration](https://arxiv.org/pdf/1904.09223.pdf)
diff --git a/model_zoo/ernie-1.0/data_tools/README.md b/model_zoo/ernie-1.0/data_tools/README.md
index 4cf60cbb1348..4a489fa698b2 100644
--- a/model_zoo/ernie-1.0/data_tools/README.md
+++ b/model_zoo/ernie-1.0/data_tools/README.md
@@ -155,17 +155,19 @@ common config:
 ```
 通过下面脚本转化，我们可以得到处理好的预训练数据，token ids:`baike_sample_ids.npy`, 文章索引信息`baike_sample_idx.npz`.
 ```
-python -u  data_tools/create_pretraining_data.py \
+python -u  create_pretraining_data.py \
     --model_name ernie-1.0-base-zh \
     --tokenizer_name ErnieTokenizer \
-    --input_path eng_sample.jsonl \
+    --input_path baike_sample.jsonl \
     --split_sentences\
     --chinese \
     --cn_whole_word_segment \
-    --output_prefix eng_sample  \
+    --output_prefix baike_sample  \
     --workers 1 \
     --log_interval 5
 ```
+1. 如果您使用已经分好词的语料，可以设置 --cn_splited 为 True，同时指定--cn_split_dimer如空格。
+2. 使用自定义词表的话，请指定model_name为词表所在的文件夹地址。
 
 ### Ernie预训练开始
 得到了处理好的训练数据，就可以开始Ernie模型的预训练了。ernie预训练的代码在`model_zoo/ernie-1.0`。
diff --git a/model_zoo/ernie-1.0/data_tools/create_pretraining_data.py b/model_zoo/ernie-1.0/data_tools/create_pretraining_data.py
index cce5ef1ebe66..e59844d5b352 100644
--- a/model_zoo/ernie-1.0/data_tools/create_pretraining_data.py
+++ b/model_zoo/ernie-1.0/data_tools/create_pretraining_data.py
@@ -267,15 +267,16 @@ def initializer(self):
         def process(text):
             words = Converter.segment_func(text)
             # if there are two empty word, the should a split dimer in the pos
-            pre_dimer = False
-            for index, w in enumerate(words):
-                if pre_dimer and len(w) == 0:
-                    words[index] = " "
-                    pre_dimer = False
-                elif len(w) == 0:
-                    pre_dimer = True
-                else:
-                    pre_dimer = False
+            if self.args.cn_splited:
+                pre_dimer = False
+                for index, w in enumerate(words):
+                    if pre_dimer and len(w) == 0:
+                        words[index] = " "
+                        pre_dimer = False
+                    elif len(w) == 0:
+                        pre_dimer = True
+                    else:
+                        pre_dimer = False
 
             tokens = Converter.tokenizer.tokenize("".join(words))
             tokens = Converter.whole_word_mask(tokens, words)
diff --git a/model_zoo/ernie-1.0/run_pretrain.py b/model_zoo/ernie-1.0/run_pretrain.py
index 2afe4b3657c7..d6bb1cfccc38 100644
--- a/model_zoo/ernie-1.0/run_pretrain.py
+++ b/model_zoo/ernie-1.0/run_pretrain.py
@@ -15,6 +15,7 @@
 ERNIE-1.0 pretraining scripts.
 """
 import argparse
+import contextlib
 import os
 import sys
 import random
@@ -66,6 +67,7 @@ def create_pretrained_dataset(
         args.eval_iters * data_world_size,
         args.micro_batch_size * args.test_iters * data_world_size
     ]
+
     train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
         data_prefix=data_file,
         args=args,
@@ -291,6 +293,7 @@ def args_post_process(args, worker_num):
         "cannot do gradient accumulate, global_batch_size: {} micro_batch_size: {}".format(
         args.global_batch_size, micro_batch_size)
     accumulate_steps = bsz_per_dp // micro_batch_size
+    assert accumulate_steps >= 1, f"Larger global_batch_size: {args.global_batch_size} is expect, micro_batch_size is {micro_batch_size}, but only {bsz_per_dp} on each card!"
 
     args.eval_iters *= accumulate_steps
     args.test_iters *= accumulate_steps
@@ -451,6 +454,7 @@ def do_train(args):
         optimizer = fleet.distributed_optimizer(optimizer)
 
     tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name_or_path)
+    # Must extend chinese char for ErnieTokenizer
     tokenizer.extend_chinese_char()
 
     data_file = get_train_data_file(args)
@@ -517,6 +521,7 @@ def do_train(args):
         # time count
         train_reader_cost = 0.0
         train_run_cost = 0.0
+        tr_loss = paddle.to_tensor(0.0)
         reader_start = time.time()
 
         for step, batch in enumerate(train_data_loader()):
@@ -533,7 +538,17 @@ def do_train(args):
             input_ids, segment_ids, input_mask, masked_lm_positions, \
             masked_lm_labels, next_sentence_labels = batch
 
-            with model.no_sync():
+            ctx_manager = contextlib.nullcontext() if sys.version_info >= (
+                3, 7) else contextlib.suppress()
+
+            if worker_num > 1 and (args.use_recompute
+                                   or args.accumulate_steps > 1):
+                ctx_manager = model.no_sync()
+            else:
+                ctx_manager = contextlib.nullcontext() if sys.version_info >= (
+                    3, 7) else contextlib.suppress()
+
+            with ctx_manager:
                 with paddle.amp.auto_cast(args.use_amp,
                                           custom_white_list=[
                                               'softmax',
@@ -569,37 +584,47 @@ def do_train(args):
                         loss = criterion(prediction_scores, None,
                                          masked_lm_labels)
 
+                if args.accumulate_steps >= 1:
+                    tr_loss_step = loss / args.accumulate_steps
+                else:
+                    tr_loss_step = loss
+
                 if args.use_amp:
-                    scaler.scale(loss).backward()
+                    scaler.scale(tr_loss_step).backward()
                 else:
-                    loss.backward()
+                    tr_loss_step.backward()
 
-            fused_allreduce_gradients(list(model.parameters()), None)
+            tr_loss += tr_loss_step
+
+            loss_global["loss"] += loss.detach()
+            if args.binary_head:
+                loss_global["lm_loss"] += lm_loss.detach()
+                loss_global["sop_loss"] += sop_loss.detach()
+
+            # Skip for accumulate_steps in global step
+            if (step + 1) % args.accumulate_steps != 0:
+                continue
+
+            if worker_num > 1 and args.use_recompute:
+                fused_allreduce_gradients(list(model.parameters()), None)
 
             if args.use_amp:
-                scaler.minimize(optimizer, loss)
+                scaler.minimize(optimizer, tr_loss)
             else:
                 optimizer.step()
 
             optimizer.clear_grad()
             train_run_cost += time.time() - train_start
-
-            # Skip for accumulate_steps in global step
-            if (step + 1) % args.accumulate_steps != 0:
-                continue
+            tr_loss.subtract_(tr_loss)
 
             global_step += 1
 
-            loss_global["loss"] += loss.detach()
-            if args.binary_head:
-                loss_global["lm_loss"] += lm_loss.detach()
-                loss_global["sop_loss"] += sop_loss.detach()
-
             if global_step % args.logging_freq == 0:
                 log_info_dict = dict()
                 log_info_dict["global_step"] = global_step
                 for k, v in loss_global.items():
-                    log_info_dict[k] = all_gather(v) / args.logging_freq
+                    log_info_dict[k] = all_gather(
+                        v) / args.logging_freq / args.accumulate_steps
                     v.subtract_(v)
                 if worker_index == 0:
                     speed = args.logging_freq / (time.time() - tic_train)

From d565fb473f6c851338876d9fc6ddb287756bda6b Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Thu, 25 Aug 2022 12:09:41 +0800
Subject: [PATCH 31/48] refine readme.

---
 .copyright.hook                       |  2 +-
 model_zoo/ernie-1.0/README.md         |  1 -
 model_zoo/ernie-1.0/scripts/README.md | 51 +++++++++++++++------------
 3 files changed, 30 insertions(+), 24 deletions(-)

diff --git a/.copyright.hook b/.copyright.hook
index d25ac074d8c9..0537474749d6 100644
--- a/.copyright.hook
+++ b/.copyright.hook
@@ -23,7 +23,7 @@ import sys
 import os
 import datetime
 
-COPYRIGHT = '''Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+COPYRIGHT = '''Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
diff --git a/model_zoo/ernie-1.0/README.md b/model_zoo/ernie-1.0/README.md
index 956ebb439816..0c9545c3f66e 100644
--- a/model_zoo/ernie-1.0/README.md
+++ b/model_zoo/ernie-1.0/README.md
@@ -17,7 +17,6 @@
 
 
 
-
 <a name="模型简介"></a>
 
 ## 1. 模型简介
diff --git a/model_zoo/ernie-1.0/scripts/README.md b/model_zoo/ernie-1.0/scripts/README.md
index a7f7c2a55e1d..e85c89b07130 100644
--- a/model_zoo/ernie-1.0/scripts/README.md
+++ b/model_zoo/ernie-1.0/scripts/README.md
@@ -1,26 +1,24 @@
-# ERNIE-CW 从零开始构建预训练模型
+# **大规模** **开源** **中文** 语料预训练-<small>从零开始构建预训练模型</small>
 
 ERNIE是百度提出的大规模预训练模型，曾在中文场景下取得了SOTA效果。
-PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 总共400GB，发布ERNIE-CW项目。项目目标：从零开始构建你的预训练模型。
+PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 总共400GB，发布大规模开源语料预训练全流程。从零开始，轻松构建预训练模型。
 
-ERNIE-CW项目，从数据下载，词表制作，数据转化，模型训练，所有流程，完全开源开放，可复现。
+本项目，从数据下载，词表制作，数据转化，模型训练，所有流程，完全开源开放，可复现。
 并训练发布开源最优的模型参数。
 
 接下来将从下面几个方面，详细介绍整个数据制作全流程，从零开始，构建一个预训练模型。
 
-- **大规模**中文数据
-- **高精准**中文分词
-- **全字符**中文词表制作
-- **快速**Token ID 转化
-
 **目录**
-* [大规模中文数据](#大规模中文数据)
-* [高精准中文分词](#高精准中文分词)
-* [中文全字符词表制作](#中文全字符词表制作)
-* [快速Token ID 转化](#快速TokenID转化)
+* [1. **大规模**中文数据](#大规模中文数据)
+* [2. **高精准**中文分词](#高精准中文分词)
+* [3. **全字符**中文词表制作](#中文中文词表制作)
+* [4. **快速**Token ID 转化](#快速TokenID转化)
+* [5. 参考](#参考)
+
 
+<a name="大规模中文数据"> </a>
 
-## 大规模中文数据
+## 1. 大规模中文数据
 
 **CLUECorpus2020语料**
 
@@ -38,8 +36,9 @@ WuDaoCorpora是悟道爬取的中文大规模语料。整体数量为3TB，目
 64GB WuDaoCorpus2.0_base_200G.rar
 ```
 
+<a name="高精准中文分词"> </a>
 
-## 高精准中文分词
+## 2. 高精准中文分词
 
 ERNIE 使用知识嵌入的方式进行预训练，如何尽可能精确的从原始文本中提取知识，直接关系预训练模型的效果。
 目前PaddleNLP常用的分词方式的有`jieba`，`lac`，`Wordtag`，
@@ -72,7 +71,9 @@ python ../data_tools/trans_to_json.py  \
     --no-shuffle
 ```
 
-## 中文全字符词表制作
+<a name="全字符中文词表制作"> </a>
+
+## 3. 全字符中文词表制作
 
 词表的制作有两种方案：
 
@@ -96,7 +97,7 @@ python ../data_tools/trans_to_json.py  \
     - 文本token id化后，希望使用uint16表示，此时表示的最大字符为65536。
     - 同时考虑到ERNIE虽然是字模型，我们的仍然需要 `##中` 之类的中文字符表示分词信息。假设使用中文全字符20902(0x4E00, 0x9FA5)个字符，那么剩余 vocab 大小不能超过 44634。
 
-综上，ERNIE-CW决定采用 40000 左右的 vocab 容量。
+综上，本项目决定采用 40000 左右的 vocab 容量。
 其中：
 - 中文全字符 `20902`
 - 英文字符 `17000`
@@ -146,7 +147,7 @@ python merge_vocab.py
 
 ### 问题遗留
 本项目采用的第一种方式，即拼接产出的词表，对连续非中、英文字符文本，会出现UNK的情况。
-如issue: [#2927](https://github.com/PaddlePaddle/PaddleNLP/issues/2927)、 [#2585](https://github.com/PaddlePaddle/PaddleNLP/issues/2585)。ERNIE-CW做了两点改进:
+如issue: [#2927](https://github.com/PaddlePaddle/PaddleNLP/issues/2927)、 [#2585](https://github.com/PaddlePaddle/PaddleNLP/issues/2585)。本项目做了两点改进:
 
 1. 对 Symbol 字符默认添加空格，变成独立字符
 2. 对 日文、谚文 在合并词表阶段默认添加 ## 字符。
@@ -171,7 +172,9 @@ python gen_vocab.py afer_basic_toknizer_corpus.txt
 对处理好的vocab文件手动替换一些`<pad> -> [PAD]`之类的special_tokens，即可产出词表。
 
 
-## 快速Token ID 转化
+<a name="快速TokenID转化"> </a>
+
+## 4. 快速Token ID 转化
 
 预料、词表准备妥当后，我们可以开始进行最后的数据ID转化。
 
@@ -186,7 +189,7 @@ python gen_vocab.py afer_basic_toknizer_corpus.txt
 使用 Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz CPU测试，40线程，处理速度 8+MB/s，约7个小时左右，即可完成 200GB 文本转化为ID.
 
 ```
-python -u  create_pretraining_data.py \
+python -u  ../data_tools/create_pretraining_data.py \
     --model_name ./vocab_path/vocab.txt \
     --tokenizer_name ErnieTokenizer \
     --input_path wudao_corpus_200g_0623.jsonl \
@@ -198,11 +201,15 @@ python -u  create_pretraining_data.py \
     --workers 40 \
     --log_interval 1000
 ```
-转化后的数据如下，使用这份数据，即可开始ERNIE-CW预训练
+转化后的数据如下，使用这份数据，即可开始ERNIE预训练
 ```
 -rw-rw-r-- 1 500 501 129G Jul  4 03:39 wudao_200g_0703_ids.npy
 -rw-rw-r-- 1 500 501 6.4G Jul  4 03:39 wudao_200g_0703_idx.npz
 ```
 
-## 其他
-- 感谢CLUE，WuDao提供的开源数据
+## 5. 参考
+感谢CLUE，WuDao提供的开源文本语料，参考资料：
+- Xu, L., Zhang, X. and Dong, Q., 2020. CLUECorpus2020: A large-scale Chinese corpus for pre-training language model. arXiv preprint arXiv:2003.01355.
+- Yuan, S., Zhao, H., Du, Z., Ding, M., Liu, X., Cen, Y., Zou, X., Yang, Z. and Tang, J., 2021. Wudaocorpora: A super large-scale chinese corpora for pre-training language models. AI Open, 2, pp.65-68.
+- https://github.com/CLUEbenchmark/CLUECorpus2020
+- https://resource.wudaoai.cn

From 2b55c66ae6f5a6755e2561b8bedd6859809729c6 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Thu, 25 Aug 2022 19:31:47 +0800
Subject: [PATCH 32/48] fix

---
 model_zoo/ernie-1.0/README.md                 | 29 ++++++++++++++++---
 .../{scripts => clue_wudao_process}/README.md | 26 ++++++++++-------
 .../clue_process.py                           |  0
 .../gen_char.py                               |  0
 .../gen_vocab.py                              |  0
 .../merge_vocab.py                            |  0
 .../trans_to_json.py                          |  0
 .../wudao_process.py                          |  0
 8 files changed, 40 insertions(+), 15 deletions(-)
 rename model_zoo/ernie-1.0/{scripts => clue_wudao_process}/README.md (92%)
 rename model_zoo/ernie-1.0/{scripts => clue_wudao_process}/clue_process.py (100%)
 rename model_zoo/ernie-1.0/{scripts => clue_wudao_process}/gen_char.py (100%)
 rename model_zoo/ernie-1.0/{scripts => clue_wudao_process}/gen_vocab.py (100%)
 rename model_zoo/ernie-1.0/{scripts => clue_wudao_process}/merge_vocab.py (100%)
 rename model_zoo/ernie-1.0/{scripts => clue_wudao_process}/trans_to_json.py (100%)
 rename model_zoo/ernie-1.0/{scripts => clue_wudao_process}/wudao_process.py (100%)

diff --git a/model_zoo/ernie-1.0/README.md b/model_zoo/ernie-1.0/README.md
index 0c9545c3f66e..19bf389dc87c 100644
--- a/model_zoo/ernie-1.0/README.md
+++ b/model_zoo/ernie-1.0/README.md
@@ -250,7 +250,7 @@ PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 
 
 #### 数据制作
 
-数据下载，词表制作，数据转化部分，请参见[此处](./scripts/README.md)。
+数据下载，词表制作，数据转化部分，请参见[CLUE WuDao数据预处理](./clue_wudao_process/README.md)。
 接下来我们主要介绍训练流程部分的特性：
 
 
@@ -316,10 +316,17 @@ RoBERTa-wwm-ext-large | 24L1024H | 76.61 |    76.00 |    59.33 |    62.02 |    8
 
 
 ###  开始训练
-<details>
-<summary><b>训练脚本如下</b></summary>
 
 训练脚本如下
+
+<b>环境配置</b>
+
+- PYTHONPATH 设置为当前目录（适合paddlenlp develop运行）
+- 设置了一些FLAGS，包括增强报错，动态图Flag，提高矩阵乘法精度。
+
+<details>
+<summary>环境配置脚本</summary>
+
 ```shell
 set -x
 
@@ -331,7 +338,17 @@ export FLAGS_call_stack_level=2
 export FLAGS_gemm_use_half_precision_compute_type=False
 export FLAGS_enable_eager_mode=1
 unset CUDA_VISIBLE_DEVICES
+```
+</details>
+
+<b>路径配置</b>
 
+- 主要配置任务输出目录（base_dir）、训练数据路径及配比（data_dir）和词表目录（vocab_dir）
+
+<details>
+<summary>路径配置</summary>
+
+```shell
 trainer_id=${PADDLE_TRAINER_ID:-"0"}
 task_name="0809-ernie-3.0-base-cw-dp16-gb1024"
 
@@ -339,7 +356,11 @@ base_nfs="/path/to/your/nfs/mount/point"
 base_dir="${base_nfs}/ernie-cw/output/${task_name}"
 data_dir="5.0 ${base_nfs}/clue_oscar/clue_corpus_oscar_0630 7.0 ${base_nfs}/clue_train/clue_corpus_train_0629 12.0 ${base_nfs}/wudao_200g/wudao_200g_0703"
 vocab_dir="${base_nfs}/"
+```
+</details>
 
+**启动训练**：这里启动的是两机16卡任务，dp_degree=16，整体全局的batch_size 1024
+```shell
 python3 -u  -m paddle.distributed.launch \
     --gpus "0,1,2,3,4,5,6,7" \
     --log_dir "${base_dir}/log_${trainer_id}" \
@@ -376,7 +397,7 @@ python3 -u  -m paddle.distributed.launch \
     --attention_probs_dropout_prob 0.1 \
     --seed 1234 \
 ```
-</details>
+
 
 <a name="预训练模型贡献"></a>
 
diff --git a/model_zoo/ernie-1.0/scripts/README.md b/model_zoo/ernie-1.0/clue_wudao_process/README.md
similarity index 92%
rename from model_zoo/ernie-1.0/scripts/README.md
rename to model_zoo/ernie-1.0/clue_wudao_process/README.md
index e85c89b07130..29536da9ffa9 100644
--- a/model_zoo/ernie-1.0/scripts/README.md
+++ b/model_zoo/ernie-1.0/clue_wudao_process/README.md
@@ -12,6 +12,10 @@ PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 
 * [1. **大规模**中文数据](#大规模中文数据)
 * [2. **高精准**中文分词](#高精准中文分词)
 * [3. **全字符**中文词表制作](#中文中文词表制作)
+    - [3.1 分析准备](#分析准备)
+    - [3.2 文本字符统计](#文本字符统计)
+    - [3.3 英文字符词表](#英文字符词表)
+    - [3.4 合并词表](#合并词表)
 * [4. **快速**Token ID 转化](#快速TokenID转化)
 * [5. 参考](#参考)
 
@@ -20,7 +24,7 @@ PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 
 
 ## 1. 大规模中文数据
 
-**CLUECorpus2020语料**
+**CLUECorpus2020 语料**
 
 CLUECorpus2020 过对Common Crawl的中文部分进行语料清洗得到。开源部分提供了约200G左右的语料文本，详细介绍见[官网](https://github.com/CLUEbenchmark/CLUECorpus2020#%E6%95%B0%E6%8D%AE%E4%B8%8B%E8%BD%BD)，用户可以通过邮件申请下载，方式如下：
 > 数据下载
@@ -60,7 +64,7 @@ python wudao_process.py \
     --workers 40  \
     --ouput_path ./wudao_lac_cut \
 ```
-注：预训练需要实现 SOP( Sentence Order Predict) 任务，在分词的同时，我们使用 简单规则 进行了文本断句。
+注：预训练需要实现 SOP( Sentence Order Predict) 任务，在分词的同时，我们使用 简单规则 进行了文本断句。如果语料只有一句话，建议去除SOP loss，训练时设置 `binary_head=False`。
 
 文本转化完成后。我们使用 `../data_tools/trans_to_json.py`重新转换为jsonl格式（分词完毕）。
 ```shell
@@ -89,13 +93,13 @@ python ../data_tools/trans_to_json.py  \
 第二种方案需要对文本先使用`BasicTokenizer`切分一遍语料。
 第一种方案，自定义程度高，但存在一些局限性。本项目采用了第一种方案，详细介绍如下：
 
-### 分析准备
+### 3.1 分析准备
 词表大小： 这里我们考虑的因素主要有两个
 - 已有模型对照：
     - ERNIE 3.0系列模型的词表，词表大小为 40000 左右。
 - 预训练数据存储占用：
     - 文本token id化后，希望使用uint16表示，此时表示的最大字符为65536。
-    - 同时考虑到ERNIE虽然是字模型，我们的仍然需要 `##中` 之类的中文字符表示分词信息。假设使用中文全字符20902(0x4E00, 0x9FA5)个字符，那么剩余 vocab 大小不能超过 44634。
+    - 同时考虑到ERNIE虽然是字模型，我们的仍然需要 `##中` 之类的中文字符表示分词信息。假设使用中文全字符20902(0x4E00-0x9FA5)个字符，那么剩余 vocab 大小不能超过 44634。
 
 综上，本项目决定采用 40000 左右的 vocab 容量。
 其中：
@@ -104,7 +108,7 @@ python ../data_tools/trans_to_json.py  \
 - 其他字符约 `2000` 左右
 
 
-### 文本字符统计
+### 3.2 文本字符统计
 首先第一步是对文本字符进行统计。字符统计的目的主要是添加常用的中文字符、特殊字符。
 
 由于语料文本过大，我们随机选取 10G 左右的原始文本进行了字符统计。
@@ -113,10 +117,10 @@ python gen_char.py path_to_corpus.txt
 ```
 可以在本地文件夹得到`char_dict.pickle`字符频率文件。同时我们也提供了自己统计的词频文件，方便用户复现：
 ```
-wget https://xxx.bos/data/char_dict.pickle
+wget https://paddlenlp.bj.bcebos.com/models/transformers/data_tools/char_dict.pickle
 ```
 
-### 英文字符词表
+### 3.3 英文字符词表
 基于字符的词频统计，使得英文字符也切割为字母，为此我们需要添加英文词表。
 英文部分，我们使用了 [WikiText](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip)  数据集，来构造词表。
 下载解压数据，使用BPE切词
@@ -127,11 +131,11 @@ python gen_vocab.py ./wikitext-103-raw/wiki.train.raw
 ```
 即可产生英文部分的词表。这里我们也提供了处理好的 vocab 方便用户验证。
 ```
-wget  https://xxx.bos/data/eng.vocab
+wget https://paddlenlp.bj.bcebos.com/models/transformers/data_tools/eng.vocab
 ```
 
 
-### 合并词表
+### 3.4 合并词表
 
 目前我们得到了字符统计表，和英文字符词表。下一步，我们将词表进行合并。
 
@@ -145,7 +149,7 @@ python merge_vocab.py
 1. 对于一些日文、谚文文字字符，需要进行 normalize
 2. 添加special_tokens
 
-### 问题遗留
+### 3.5 问题遗留
 本项目采用的第一种方式，即拼接产出的词表，对连续非中、英文字符文本，会出现UNK的情况。
 如issue: [#2927](https://github.com/PaddlePaddle/PaddleNLP/issues/2927)、 [#2585](https://github.com/PaddlePaddle/PaddleNLP/issues/2585)。本项目做了两点改进:
 
@@ -155,7 +159,7 @@ python merge_vocab.py
 虽然有上述两点修复，任然无法避免 [#2927](https://github.com/PaddlePaddle/PaddleNLP/issues/2927) 现象。
 彻底解决的话，建议使用第二种方式制作vocab文件。
 
-### 方案二：预处理后直接生成
+### 3.6 方案二：预处理后直接生成
 此方案没有被采用，这里也简单说明一下具体的方案：
 1. 对语料使用 BasicTokenizer 转换
 ```python
diff --git a/model_zoo/ernie-1.0/scripts/clue_process.py b/model_zoo/ernie-1.0/clue_wudao_process/clue_process.py
similarity index 100%
rename from model_zoo/ernie-1.0/scripts/clue_process.py
rename to model_zoo/ernie-1.0/clue_wudao_process/clue_process.py
diff --git a/model_zoo/ernie-1.0/scripts/gen_char.py b/model_zoo/ernie-1.0/clue_wudao_process/gen_char.py
similarity index 100%
rename from model_zoo/ernie-1.0/scripts/gen_char.py
rename to model_zoo/ernie-1.0/clue_wudao_process/gen_char.py
diff --git a/model_zoo/ernie-1.0/scripts/gen_vocab.py b/model_zoo/ernie-1.0/clue_wudao_process/gen_vocab.py
similarity index 100%
rename from model_zoo/ernie-1.0/scripts/gen_vocab.py
rename to model_zoo/ernie-1.0/clue_wudao_process/gen_vocab.py
diff --git a/model_zoo/ernie-1.0/scripts/merge_vocab.py b/model_zoo/ernie-1.0/clue_wudao_process/merge_vocab.py
similarity index 100%
rename from model_zoo/ernie-1.0/scripts/merge_vocab.py
rename to model_zoo/ernie-1.0/clue_wudao_process/merge_vocab.py
diff --git a/model_zoo/ernie-1.0/scripts/trans_to_json.py b/model_zoo/ernie-1.0/clue_wudao_process/trans_to_json.py
similarity index 100%
rename from model_zoo/ernie-1.0/scripts/trans_to_json.py
rename to model_zoo/ernie-1.0/clue_wudao_process/trans_to_json.py
diff --git a/model_zoo/ernie-1.0/scripts/wudao_process.py b/model_zoo/ernie-1.0/clue_wudao_process/wudao_process.py
similarity index 100%
rename from model_zoo/ernie-1.0/scripts/wudao_process.py
rename to model_zoo/ernie-1.0/clue_wudao_process/wudao_process.py

From 666be5749726632dbe9e905efe89d9512babdf72 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Fri, 26 Aug 2022 17:36:01 +0800
Subject: [PATCH 33/48] refactor

---
 .../clue_wudao_process/clue_process.py        |  2 -
 model_zoo/ernie-1.0/data_tools/README.md      | 64 +++++--------------
 .../data_tools/docs/CLUECorpus2020.md         | 12 ++++
 .../data_tools/docs/CLUECorpusSmall.md        | 59 +++++++++++++++++
 .../ernie-1.0/data_tools/docs/OpenWebText2.md | 47 ++++++++++++++
 .../data_tools/docs/WuDaoCorpusBase.md        | 31 +++++++++
 .../vocab}/gen_char.py                        |  0
 .../vocab}/gen_vocab.py                       |  0
 .../vocab}/merge_vocab.py                     |  0
 9 files changed, 164 insertions(+), 51 deletions(-)
 create mode 100644 model_zoo/ernie-1.0/data_tools/docs/CLUECorpus2020.md
 create mode 100644 model_zoo/ernie-1.0/data_tools/docs/CLUECorpusSmall.md
 create mode 100644 model_zoo/ernie-1.0/data_tools/docs/OpenWebText2.md
 create mode 100644 model_zoo/ernie-1.0/data_tools/docs/WuDaoCorpusBase.md
 rename model_zoo/ernie-1.0/{clue_wudao_process => data_tools/vocab}/gen_char.py (100%)
 rename model_zoo/ernie-1.0/{clue_wudao_process => data_tools/vocab}/gen_vocab.py (100%)
 rename model_zoo/ernie-1.0/{clue_wudao_process => data_tools/vocab}/merge_vocab.py (100%)

diff --git a/model_zoo/ernie-1.0/clue_wudao_process/clue_process.py b/model_zoo/ernie-1.0/clue_wudao_process/clue_process.py
index fb24511695c2..bea70ef7bf67 100644
--- a/model_zoo/ernie-1.0/clue_wudao_process/clue_process.py
+++ b/model_zoo/ernie-1.0/clue_wudao_process/clue_process.py
@@ -127,8 +127,6 @@ def text_to_text(path):
                 words = seg_func(line)
                 final += " ".join(words) + "\n"
             f.write(final + "\n")
-            # if count % 100 == 0:
-            #     print("speed: ", data_len/1024/(time.time() - s))
 
     return data_len, None
 
diff --git a/model_zoo/ernie-1.0/data_tools/README.md b/model_zoo/ernie-1.0/data_tools/README.md
index 4a489fa698b2..52b61ef20ab7 100644
--- a/model_zoo/ernie-1.0/data_tools/README.md
+++ b/model_zoo/ernie-1.0/data_tools/README.md
@@ -24,6 +24,7 @@
 `dataset_utils.py`中包含了index生成、动态mask的实现。
 `ernie_dataset.py`通过调用`dataset_utils.py`的一些函数，产生ernie的输入dataset。
 
+
 ### 环境依赖
 
  - tqdm
@@ -49,7 +50,19 @@
 |token动态mask（可选）| Dataset取数据 | 无 |-
 
 
-## ERNIE预训练例子
+## 数据教程汇总
+
+针对目前开源的数据集，PaddleNLP提供了详细的数据教程，点击对应数据集的链接，即可开始进行数据制作：
+
+| 名称 | 文本类型 | 纯文本大小 | 适配模型
+|-|-|-|-|
+| [CLUECorpusSmall](./docs/CLUECorpusSmall.md)| 中文 | 14GB
+| ERNIE
+| [OpenWebText2](./docs/OpenWebText2.md) | 英文 | 70GB | GPT
+| [WuDaoCorpus2.0 Base](./docs/WuDaoCorpusBase.md)| 中文 |  200GB | ERNIE
+| [CLUECorpus2020](./docs/CLUECorpus2020.md)| 中文 | 200GB | ERNIE
+
+## ERNIE预训练详细准备
 
 下面以ERNIE预训练为例，简要介绍一下预训练的全流程。
 
@@ -169,6 +182,7 @@ python -u  create_pretraining_data.py \
 1. 如果您使用已经分好词的语料，可以设置 --cn_splited 为 True，同时指定--cn_split_dimer如空格。
 2. 使用自定义词表的话，请指定model_name为词表所在的文件夹地址。
 
+
 ### Ernie预训练开始
 得到了处理好的训练数据，就可以开始Ernie模型的预训练了。ernie预训练的代码在`model_zoo/ernie-1.0`。
 简单将预处理好的数据，拷贝到data目录，即可开始Ernie模型预训练。
@@ -196,51 +210,3 @@ sh run_static.sh
 ## 参考内容
 
 注: 大部分数据流程，参考自[Megatron](https://github.com/NVIDIA/Megatron-LM)，特此表达感谢。
-
-
-# 附录
-
-## CLUECorpusSmall 数据集处理教程
-**数据集简介**：可用于语言建模、预训练或生成型任务等，数据量超过14G，近4000个定义良好的txt文件、50亿个字。主要部分来自于nlp_chinese_corpus项目
-包含如下子语料库（总共14G语料）：新闻语料[news2016zh_corpus.zip](https://bj.bcebos.com/v1/ai-studio-online/6bac09db4e6d4857b6d680d34447457490cb2dbdd8b8462ea1780a407f38e12b?responseContentDisposition=attachment%3B%20filename%3Dnews2016zh_corpus.zip)， 社区互动语料[webText2019zh_corpus.zip](https://bj.bcebos.com/v1/ai-studio-online/83da03f7b4974871a52348b41c16c7e3b34a26d5ca644f558df8435be4de51c3?responseContentDisposition=attachment%3B%20filename%3DwebText2019zh_corpus.zip)，维基百科语料[wiki2019zh_corpus.zip](https://bj.bcebos.com/v1/ai-studio-online/d7a166408d8b4ffdaf4de9cfca09f6ee1e2340260f26440a92f78134d068b28f?responseContentDisposition=attachment%3B%20filename%3Dwiki2019zh_corpus.zip)，评论数据语料[comment2019zh_corpus.zip](https://bj.bcebos.com/v1/ai-studio-online/b66ddd445735408383c42322850ac4bb82faf9cc611447c2affb925443de7a6d?responseContentDisposition=attachment%3B%20filename%3Dcomment2019zh_corpus.zip)。
-
-**数据集下载**：
-用户可以通过官方github网页下载，https://github.com/CLUEbenchmark/CLUECorpus2020 。同时，为方便用户，我们也提供了aistudio数据集下载地址。[part1](https://aistudio.baidu.com/aistudio/datasetdetail/60598)，[part2](https://aistudio.baidu.com/aistudio/datasetdetail/124357)。使用aistudio版本的数据，下载好后，可以核对md5值：
-```shell
-> md5sum ./*
- 8a8be341ebce39cfe9524fb0b46b08c5  ./comment2019zh_corpus.zip
- 4bdc2c941a7adb4a061caf273fea42b8  ./news2016zh_corpus.zip
- fc582409f078b10d717caf233cc58ddd  ./webText2019zh_corpus.zip
- 157dacde91dcbd2e52a60af49f710fa5  ./wiki2019zh_corpus.zip
-```
-解压文件
-```shell
-unzip comment2019zh_corpus.zip -d  clue_corpus_small_14g/comment2019zh_corpus
-unzip news2016zh_corpus.zip    -d  clue_corpus_small_14g/news2016zh_corpus
-unzip webText2019zh_corpus.zip -d  clue_corpus_small_14g/webText2019zh_corpus
-unzip wiki2019zh_corpus.zip    -d  clue_corpus_small_14g/wiki2019zh_corpus
-```
-将txt文件转换为jsonl格式
-```
-python trans_to_json.py  --input_path ./clue_corpus_small_14g --output_path clue_corpus_small_14g.jsonl
-```
-现在我们得到了jsonl格式的数据集，下面是针对训练任务的数据集应用，此处以ernie为例。
-```
-python -u  create_pretraining_data.py \
-    --model_name ernie-1.0-base-zh \
-    --tokenizer_name ErnieTokenizer \
-    --input_path clue_corpus_small_14g.jsonl \
-    --split_sentences\
-    --chinese \
-    --cn_whole_word_segment \
-    --cn_seg_func jieba \
-    --output_prefix clue_corpus_small_14g_20220104 \
-    --workers 48 \
-    --log_interval 10000
-```
-数据共有文档`15702702`条左右，由于分词比较耗时，大概一小时左右可以完成。在当前目录下产出训练所需数据。
-```
-clue_corpus_small_14g_20220104_ids.npy
-clue_corpus_small_14g_20220104_idx.npz
-```
-用户可以使用此数据进行预训练任务。
diff --git a/model_zoo/ernie-1.0/data_tools/docs/CLUECorpus2020.md b/model_zoo/ernie-1.0/data_tools/docs/CLUECorpus2020.md
new file mode 100644
index 000000000000..3c6727fab4c7
--- /dev/null
+++ b/model_zoo/ernie-1.0/data_tools/docs/CLUECorpus2020.md
@@ -0,0 +1,12 @@
+## CLUECorpus2020 语料
+
+| 名称 | 文本类型 | 纯文本大小 |
+|-|-|-|
+| CLUECorpus2020| 中文 | 200GB |
+
+CLUECorpus2020 通过对Common Crawl的中文部分进行语料清洗得到。开源部分提供了约200G左右的语料文本，详细介绍见[官网](https://github.com/CLUEbenchmark/CLUECorpus2020#%E6%95%B0%E6%8D%AE%E4%B8%8B%E8%BD%BD)，用户可以通过邮件申请下载，方式如下：
+
+> 数据下载
+> 申请方式： 将使用语料研究目的和用途，计划、研究机构和申请者介绍，发送到邮箱，并承诺不向第三方提供。
+>
+> 邮箱: CLUEbenchmark@163.com，标题是：CLUECorpus2020 200G语料库
diff --git a/model_zoo/ernie-1.0/data_tools/docs/CLUECorpusSmall.md b/model_zoo/ernie-1.0/data_tools/docs/CLUECorpusSmall.md
new file mode 100644
index 000000000000..0dadb1ca4447
--- /dev/null
+++ b/model_zoo/ernie-1.0/data_tools/docs/CLUECorpusSmall.md
@@ -0,0 +1,59 @@
+# CLUECorpusSmall
+
+| 名称 | 文本类型 | 纯文本大小 |
+|-|-|-|
+| CLUECorpusSmall| 中文 | 14GB |
+
+**数据集简介**：可用于语言建模、预训练或生成型任务等，数据量超过14G，近4000个定义良好的txt文件、50亿个字。主要部分来自于nlp_chinese_corpus项目
+包含如下子语料库（总共14G语料）：新闻语料[news2016zh_corpus.zip](https://bj.bcebos.com/v1/ai-studio-online/6bac09db4e6d4857b6d680d34447457490cb2dbdd8b8462ea1780a407f38e12b?responseContentDisposition=attachment%3B%20filename%3Dnews2016zh_corpus.zip)， 社区互动语料[webText2019zh_corpus.zip](https://bj.bcebos.com/v1/ai-studio-online/83da03f7b4974871a52348b41c16c7e3b34a26d5ca644f558df8435be4de51c3?responseContentDisposition=attachment%3B%20filename%3DwebText2019zh_corpus.zip)，维基百科语料[wiki2019zh_corpus.zip](https://bj.bcebos.com/v1/ai-studio-online/d7a166408d8b4ffdaf4de9cfca09f6ee1e2340260f26440a92f78134d068b28f?responseContentDisposition=attachment%3B%20filename%3Dwiki2019zh_corpus.zip)，评论数据语料[comment2019zh_corpus.zip](https://bj.bcebos.com/v1/ai-studio-online/b66ddd445735408383c42322850ac4bb82faf9cc611447c2affb925443de7a6d?responseContentDisposition=attachment%3B%20filename%3Dcomment2019zh_corpus.zip)。
+
+## 数据获取
+
+用户可以通过官方github网页下载，https://github.com/CLUEbenchmark/CLUECorpus2020 。同时，为方便用户，我们也提供了aistudio数据集下载地址。[part1](https://aistudio.baidu.com/aistudio/datasetdetail/60598)，[part2](https://aistudio.baidu.com/aistudio/datasetdetail/124357)。使用aistudio版本的数据，下载好后，可以核对md5值：
+```shell
+> md5sum ./*
+ 8a8be341ebce39cfe9524fb0b46b08c5  ./comment2019zh_corpus.zip
+ 4bdc2c941a7adb4a061caf273fea42b8  ./news2016zh_corpus.zip
+ fc582409f078b10d717caf233cc58ddd  ./webText2019zh_corpus.zip
+ 157dacde91dcbd2e52a60af49f710fa5  ./wiki2019zh_corpus.zip
+```
+解压文件
+```shell
+unzip comment2019zh_corpus.zip -d  clue_corpus_small_14g/comment2019zh_corpus
+unzip news2016zh_corpus.zip    -d  clue_corpus_small_14g/news2016zh_corpus
+unzip webText2019zh_corpus.zip -d  clue_corpus_small_14g/webText2019zh_corpus
+unzip wiki2019zh_corpus.zip    -d  clue_corpus_small_14g/wiki2019zh_corpus
+```
+将txt文件转换为jsonl格式
+```
+python trans_to_json.py  --input_path ./clue_corpus_small_14g --output_path clue_corpus_small_14g.jsonl
+```
+现在我们得到了jsonl格式的数据集。
+
+## ERNIE 中文预训练数据制作
+
+下面是针对训练任务的数据集应用，此处以ernie为例。
+
+```
+python -u  create_pretraining_data.py \
+    --model_name ernie-1.0-base-zh \
+    --tokenizer_name ErnieTokenizer \
+    --input_path clue_corpus_small_14g.jsonl \
+    --split_sentences \
+    --chinese \
+    --cn_whole_word_segment \
+    --cn_seg_func jieba \
+    --output_prefix clue_corpus_small_14g_20220104 \
+    --workers 48 \
+    --log_interval 10000
+```
+
+- model_name 可以更换为其他 ERNIE 系列模型，如: `ernie-3.0-base-zh`
+- workers 表示转化的线程数目
+
+数据共有文档`15702702`条左右，由于分词比较耗时，大概一小时左右可以完成。在当前目录下产出训练所需数据。
+```
+clue_corpus_small_14g_20220104_ids.npy
+clue_corpus_small_14g_20220104_idx.npz
+```
+用户可以使用此数据进行预训练任务。
diff --git a/model_zoo/ernie-1.0/data_tools/docs/OpenWebText2.md b/model_zoo/ernie-1.0/data_tools/docs/OpenWebText2.md
new file mode 100644
index 000000000000..fd0830aeadce
--- /dev/null
+++ b/model_zoo/ernie-1.0/data_tools/docs/OpenWebText2.md
@@ -0,0 +1,47 @@
+# OpenWebText2
+
+| 名称 | 文本类型 | 纯文本大小 |
+|-|-|-|
+| OpenWebText2 | 英文 | 70GB |
+
+## 数据获取
+
+[OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/)是一个开源的英文网页文本数据集，数据来源于Reddit，经过去重、清洗、提取，最终包含800多万个文档。
+本示例采用EleutherAI清洗好的[OpenWebText2数据](https://openwebtext2.readthedocs.io/en/latest/index.html#download-plug-and-play-version)
+
+下载以后通过以下命令解压：
+
+```shell
+wget https://mystic.the-eye.eu/public/AI/pile_preliminary_components/openwebtext2.jsonl.zst.tar
+tar -xvf openwebtext2.jsonl.zst.tar -C  /path/to/openwebtext
+```
+
+## GPT训练数据制作
+
+然后使用[data_tools](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/gpt/../ernie-1.0/data_tools) 工具下的`create_pretraining_data.py`脚本进行数据集制作：
+```
+python -u  create_pretraining_data.py \
+    --model_name gpt2-en \
+    --tokenizer_name GPTTokenizer \
+    --data_format JSON \
+    --input_path /path/to/openwebtext/ \
+    --append_eos \
+    --output_prefix gpt_openwebtext  \
+    --workers 40 \
+    --log_interval 10000
+```
+处理时间约一个小时左右，就可以得到我们需要的`gpt_openwebtext_ids.npy`, `gpt_openwebtext_idx.npz`数据集文件。
+
+为了方便用户运行测试本模型，本项目提供了处理好的300M的训练样本：
+```shell
+wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_ids.npy
+wget https://bj.bcebos.com/paddlenlp/models/transformers/gpt/data/gpt_en_dataset_300m_idx.npz
+```
+
+将所有预处理得到的文件统一放入一个文件夹中，以备训练使用：
+
+```
+mkdir data
+mv gpt_en_dataset_300m_ids.npy ./data
+mv gpt_en_dataset_300m_idx.npz ./data
+```
diff --git a/model_zoo/ernie-1.0/data_tools/docs/WuDaoCorpusBase.md b/model_zoo/ernie-1.0/data_tools/docs/WuDaoCorpusBase.md
new file mode 100644
index 000000000000..4a88651df42a
--- /dev/null
+++ b/model_zoo/ernie-1.0/data_tools/docs/WuDaoCorpusBase.md
@@ -0,0 +1,31 @@
+# WuDaoCorpus2.0 Base 语料
+
+
+| 名称 | 文本类型 | 纯文本大小 |
+|-|-|-|
+| WuDaoCorpus2.0 Base| 中文 | 200GB |
+
+
+WuDaoCorpora是悟道爬取的中文大规模语料。整体数量为3TB，目前开源的部分为WuDaoCorpus2.0 Base数据集，大小为200GB。
+用户微信登录[官网](https://resource.wudaoai.cn/home)，即可直接下载数据。下载好的压缩数据约 64GB
+```
+64GB WuDaoCorpus2.0_base_200G.rar
+```
+
+数据解压后，使用 `wudao_process.py` 对语料进行分词和断句：
+```shell
+python wudao_process.py \
+    --input_path WuDaoCorpus2.0_base_200G \
+    --workers 40  \
+    --ouput_path ./wudao_lac_cut \
+```
+注：预训练需要实现 SOP( Sentence Order Predict) 任务，在分词的同时，我们使用 简单规则 进行了文本断句。如果语料只有一句话，建议去除SOP loss，训练时设置 `binary_head=False`。
+
+文本转化完成后。我们使用 `../data_tools/trans_to_json.py`重新转换为jsonl格式（分词完毕）。
+```shell
+python ../data_tools/trans_to_json.py  \
+    --input_path ./wudao_lac_cut \
+    --output_path wudao_corpus_200g_0623.jsonl \
+    --workers 40 \
+    --no-shuffle
+```
diff --git a/model_zoo/ernie-1.0/clue_wudao_process/gen_char.py b/model_zoo/ernie-1.0/data_tools/vocab/gen_char.py
similarity index 100%
rename from model_zoo/ernie-1.0/clue_wudao_process/gen_char.py
rename to model_zoo/ernie-1.0/data_tools/vocab/gen_char.py
diff --git a/model_zoo/ernie-1.0/clue_wudao_process/gen_vocab.py b/model_zoo/ernie-1.0/data_tools/vocab/gen_vocab.py
similarity index 100%
rename from model_zoo/ernie-1.0/clue_wudao_process/gen_vocab.py
rename to model_zoo/ernie-1.0/data_tools/vocab/gen_vocab.py
diff --git a/model_zoo/ernie-1.0/clue_wudao_process/merge_vocab.py b/model_zoo/ernie-1.0/data_tools/vocab/merge_vocab.py
similarity index 100%
rename from model_zoo/ernie-1.0/clue_wudao_process/merge_vocab.py
rename to model_zoo/ernie-1.0/data_tools/vocab/merge_vocab.py

From 8639e62e01127e050ee407056bfa32b6cc46b8c4 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Fri, 26 Aug 2022 19:01:38 +0800
Subject: [PATCH 34/48] refator2

---
 .../ernie-1.0/data_tools/trans_to_json.py     | 172 ------------------
 .../{data_tools => preprocess}/README.md      |   5 +-
 .../clue_process.py                           |   0
 .../create_pretraining_data.py                |   0
 .../docs/CLUECorpus2020.md                    |   0
 .../docs/CLUECorpusSmall.md                   |   0
 .../docs/OpenWebText2.md                      |   0
 .../docs/WuDaoCorpusBase.md                   |   0
 .../trans_to_json.py                          |   0
 .../wudao_process.py                          |   0
 .../{clue_wudao_process => vocab}/README.md   |   0
 .../{data_tools => }/vocab/gen_char.py        |   0
 .../{data_tools => }/vocab/gen_vocab.py       |   0
 .../{data_tools => }/vocab/merge_vocab.py     |   0
 14 files changed, 3 insertions(+), 174 deletions(-)
 delete mode 100644 model_zoo/ernie-1.0/data_tools/trans_to_json.py
 rename model_zoo/ernie-1.0/{data_tools => preprocess}/README.md (99%)
 rename model_zoo/ernie-1.0/{clue_wudao_process => preprocess}/clue_process.py (100%)
 rename model_zoo/ernie-1.0/{data_tools => preprocess}/create_pretraining_data.py (100%)
 rename model_zoo/ernie-1.0/{data_tools => preprocess}/docs/CLUECorpus2020.md (100%)
 rename model_zoo/ernie-1.0/{data_tools => preprocess}/docs/CLUECorpusSmall.md (100%)
 rename model_zoo/ernie-1.0/{data_tools => preprocess}/docs/OpenWebText2.md (100%)
 rename model_zoo/ernie-1.0/{data_tools => preprocess}/docs/WuDaoCorpusBase.md (100%)
 rename model_zoo/ernie-1.0/{clue_wudao_process => preprocess}/trans_to_json.py (100%)
 rename model_zoo/ernie-1.0/{clue_wudao_process => preprocess}/wudao_process.py (100%)
 rename model_zoo/ernie-1.0/{clue_wudao_process => vocab}/README.md (100%)
 rename model_zoo/ernie-1.0/{data_tools => }/vocab/gen_char.py (100%)
 rename model_zoo/ernie-1.0/{data_tools => }/vocab/gen_vocab.py (100%)
 rename model_zoo/ernie-1.0/{data_tools => }/vocab/merge_vocab.py (100%)

diff --git a/model_zoo/ernie-1.0/data_tools/trans_to_json.py b/model_zoo/ernie-1.0/data_tools/trans_to_json.py
deleted file mode 100644
index bd04aa919a7a..000000000000
--- a/model_zoo/ernie-1.0/data_tools/trans_to_json.py
+++ /dev/null
@@ -1,172 +0,0 @@
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import os
-import re
-import argparse
-import json
-import multiprocessing
-import sys
-import time
-import shutil
-from functools import partial
-
-import numpy as np
-from tqdm import tqdm
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--input_path',
-                        type=str,
-                        required=True,
-                        help='Path to you raw files. Folder or file path.')
-    parser.add_argument('--output_path',
-                        type=str,
-                        required=True,
-                        help='Path to save the output json files.')
-    parser.add_argument('--json_key',
-                        type=str,
-                        default='text',
-                        help='The content key of json file.')
-    parser.add_argument(
-        '--doc_spliter',
-        type=str,
-        default='',
-        help=
-        "Spliter between documents. We will strip the line, if you use blank line to split doc, leave it blank."
-    )
-    parser.add_argument('--min_doc_length',
-                        type=int,
-                        default=10,
-                        help="Minimal char of a documment.")
-    parser.add_argument('--workers',
-                        type=int,
-                        default=1,
-                        help='Number of worker processes to launch')
-    parser.add_argument('--log_interval',
-                        type=int,
-                        default=1,
-                        help='Interval between progress updates.')
-    parser.add_argument('--no-merge',
-                        action='store_true',
-                        help='Don\'t merge the file.')
-    parser.add_argument('--no-shuffle',
-                        action='store_true',
-                        help='Don\'t shuffle the file.')
-    args = parser.parse_args()
-    return args
-
-
-def raw_text_to_json(path, doc_spliter="", json_key="text", min_doc_length=10):
-    path = os.path.abspath(path)
-    if not os.path.exists(path):
-        print("No found file %s" % path)
-        return 0, None
-
-    out_filepath = path + ".jsonl"
-    fout = open(out_filepath, "w", encoding="utf-8")
-    len_files = 0
-    with open(path, "r") as f:
-        doc = ""
-        line = f.readline()
-        while line:
-            len_files += len(line)
-            if line.strip() == doc_spliter:
-                if len(doc) > min_doc_length:
-                    fout.write(
-                        json.dumps({json_key: doc}, ensure_ascii=False) + "\n")
-                doc = ""
-            else:
-                doc += line
-            line = f.readline()
-
-        if len(doc) > min_doc_length:
-            fout.write(json.dumps({json_key: doc}, ensure_ascii=False) + "\n")
-        doc = ""
-
-    return len_files, out_filepath
-
-
-def merge_file(file_paths, output_path):
-    if not output_path.endswith(".jsonl"):
-        output_path = output_path + ".jsonl"
-    print("Merging files into %s" % output_path)
-    with open(output_path, 'wb') as wfd:
-        for f in file_paths:
-            if f is not None and os.path.exists(f):
-                with open(f, 'rb') as fd:
-                    shutil.copyfileobj(fd, wfd)
-                os.remove(f)
-    print("File save in %s" % output_path)
-    return output_path
-
-
-def shuffle_file(output_path):
-    print("Shuffling the jsonl file...")
-    if os.path.exists(output_path):
-        os.system("shuf %s -o %s" % (output_path, output_path))
-        print("File shuffled!!!")
-    else:
-        raise ValueError("File not found: %s" % output_path)
-
-
-def main():
-    args = get_args()
-    startup_start = time.time()
-
-    file_paths = []
-    if os.path.isfile(args.input_path):
-        file_paths.append(args.input_path)
-    else:
-        for root, _, fs in os.walk(args.input_path):
-            for f in fs:
-                file_paths.append(os.path.join(root, f))
-
-    pool = multiprocessing.Pool(args.workers)
-
-    startup_end = time.time()
-    proc_start = time.time()
-    total_bytes_processed = 0
-    print("Time to startup:", startup_end - startup_start)
-
-    trans_json = partial(raw_text_to_json,
-                         doc_spliter=args.doc_spliter,
-                         json_key=args.json_key,
-                         min_doc_length=args.min_doc_length)
-    encoded_files = pool.imap(trans_json, file_paths, 1)
-
-    out_paths = []
-    for i, (bytes_processed, out_path) in enumerate(encoded_files, start=1):
-        total_bytes_processed += bytes_processed
-        out_paths.append(out_path)
-        master_start = time.time()
-
-        if i % args.log_interval == 0:
-            current = time.time()
-            elapsed = current - proc_start
-            mbs = total_bytes_processed / elapsed / 1024 / 1024
-            print(f"Processed {i} files",
-                  f"({i/elapsed} files/s, {mbs} MB/s).",
-                  file=sys.stderr)
-
-    if not args.no_merge:
-        output_path = merge_file(out_paths, args.output_path)
-        if not args.no_shuffle:
-            shuffle_file(output_path)
-
-
-if __name__ == "__main__":
-    main()
-    #profile.run("main()", "testprof")
diff --git a/model_zoo/ernie-1.0/data_tools/README.md b/model_zoo/ernie-1.0/preprocess/README.md
similarity index 99%
rename from model_zoo/ernie-1.0/data_tools/README.md
rename to model_zoo/ernie-1.0/preprocess/README.md
index 52b61ef20ab7..f7af96c4e7c9 100644
--- a/model_zoo/ernie-1.0/data_tools/README.md
+++ b/model_zoo/ernie-1.0/preprocess/README.md
@@ -18,6 +18,7 @@
 ├── Makefile
 ├── README.md
 └── trans_to_json.py
+
 ```
 其中，`trans_to_json.py`是原始数据转化的脚本，将数据转化为json串格式。
 `create_pretraining_data.py`将jsonl文本，断句、分词后，tokenizer转化为token id。
@@ -56,8 +57,7 @@
 
 | 名称 | 文本类型 | 纯文本大小 | 适配模型
 |-|-|-|-|
-| [CLUECorpusSmall](./docs/CLUECorpusSmall.md)| 中文 | 14GB
-| ERNIE
+| [CLUECorpusSmall](./docs/CLUECorpusSmall.md)| 中文 | 14GB | ERNIE
 | [OpenWebText2](./docs/OpenWebText2.md) | 英文 | 70GB | GPT
 | [WuDaoCorpus2.0 Base](./docs/WuDaoCorpusBase.md)| 中文 |  200GB | ERNIE
 | [CLUECorpus2020](./docs/CLUECorpus2020.md)| 中文 | 200GB | ERNIE
@@ -73,6 +73,7 @@ mkdir data && cd data
 wget https://bj.bcebos.com/paddlenlp/models/transformers/data_tools/baike.txt
 cd ..
 ```
+
 ### 原始数据转换 jsonl 格式
 使用`trans_to_json.py`转化为json串格式，下面是脚本的使用说明
 ```
diff --git a/model_zoo/ernie-1.0/clue_wudao_process/clue_process.py b/model_zoo/ernie-1.0/preprocess/clue_process.py
similarity index 100%
rename from model_zoo/ernie-1.0/clue_wudao_process/clue_process.py
rename to model_zoo/ernie-1.0/preprocess/clue_process.py
diff --git a/model_zoo/ernie-1.0/data_tools/create_pretraining_data.py b/model_zoo/ernie-1.0/preprocess/create_pretraining_data.py
similarity index 100%
rename from model_zoo/ernie-1.0/data_tools/create_pretraining_data.py
rename to model_zoo/ernie-1.0/preprocess/create_pretraining_data.py
diff --git a/model_zoo/ernie-1.0/data_tools/docs/CLUECorpus2020.md b/model_zoo/ernie-1.0/preprocess/docs/CLUECorpus2020.md
similarity index 100%
rename from model_zoo/ernie-1.0/data_tools/docs/CLUECorpus2020.md
rename to model_zoo/ernie-1.0/preprocess/docs/CLUECorpus2020.md
diff --git a/model_zoo/ernie-1.0/data_tools/docs/CLUECorpusSmall.md b/model_zoo/ernie-1.0/preprocess/docs/CLUECorpusSmall.md
similarity index 100%
rename from model_zoo/ernie-1.0/data_tools/docs/CLUECorpusSmall.md
rename to model_zoo/ernie-1.0/preprocess/docs/CLUECorpusSmall.md
diff --git a/model_zoo/ernie-1.0/data_tools/docs/OpenWebText2.md b/model_zoo/ernie-1.0/preprocess/docs/OpenWebText2.md
similarity index 100%
rename from model_zoo/ernie-1.0/data_tools/docs/OpenWebText2.md
rename to model_zoo/ernie-1.0/preprocess/docs/OpenWebText2.md
diff --git a/model_zoo/ernie-1.0/data_tools/docs/WuDaoCorpusBase.md b/model_zoo/ernie-1.0/preprocess/docs/WuDaoCorpusBase.md
similarity index 100%
rename from model_zoo/ernie-1.0/data_tools/docs/WuDaoCorpusBase.md
rename to model_zoo/ernie-1.0/preprocess/docs/WuDaoCorpusBase.md
diff --git a/model_zoo/ernie-1.0/clue_wudao_process/trans_to_json.py b/model_zoo/ernie-1.0/preprocess/trans_to_json.py
similarity index 100%
rename from model_zoo/ernie-1.0/clue_wudao_process/trans_to_json.py
rename to model_zoo/ernie-1.0/preprocess/trans_to_json.py
diff --git a/model_zoo/ernie-1.0/clue_wudao_process/wudao_process.py b/model_zoo/ernie-1.0/preprocess/wudao_process.py
similarity index 100%
rename from model_zoo/ernie-1.0/clue_wudao_process/wudao_process.py
rename to model_zoo/ernie-1.0/preprocess/wudao_process.py
diff --git a/model_zoo/ernie-1.0/clue_wudao_process/README.md b/model_zoo/ernie-1.0/vocab/README.md
similarity index 100%
rename from model_zoo/ernie-1.0/clue_wudao_process/README.md
rename to model_zoo/ernie-1.0/vocab/README.md
diff --git a/model_zoo/ernie-1.0/data_tools/vocab/gen_char.py b/model_zoo/ernie-1.0/vocab/gen_char.py
similarity index 100%
rename from model_zoo/ernie-1.0/data_tools/vocab/gen_char.py
rename to model_zoo/ernie-1.0/vocab/gen_char.py
diff --git a/model_zoo/ernie-1.0/data_tools/vocab/gen_vocab.py b/model_zoo/ernie-1.0/vocab/gen_vocab.py
similarity index 100%
rename from model_zoo/ernie-1.0/data_tools/vocab/gen_vocab.py
rename to model_zoo/ernie-1.0/vocab/gen_vocab.py
diff --git a/model_zoo/ernie-1.0/data_tools/vocab/merge_vocab.py b/model_zoo/ernie-1.0/vocab/merge_vocab.py
similarity index 100%
rename from model_zoo/ernie-1.0/data_tools/vocab/merge_vocab.py
rename to model_zoo/ernie-1.0/vocab/merge_vocab.py

From a6c08df4acb63c99251126dbffaa627536d12df9 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Mon, 29 Aug 2022 18:10:52 +0800
Subject: [PATCH 35/48] Add pre-training introduction.

---
 model_zoo/ernie-1.0/args.py                   |   2 +-
 .../ernie-1.0/pretraining_introduction.md     | 342 ++++++++++++++++++
 2 files changed, 343 insertions(+), 1 deletion(-)
 create mode 100644 model_zoo/ernie-1.0/pretraining_introduction.md

diff --git a/model_zoo/ernie-1.0/args.py b/model_zoo/ernie-1.0/args.py
index a8ea8e42a52e..042cbcab0604 100644
--- a/model_zoo/ernie-1.0/args.py
+++ b/model_zoo/ernie-1.0/args.py
@@ -96,7 +96,7 @@ def parse_args(MODEL_CLASSES):
     parser.add_argument("--lr_decay_style", type=str, default="cosine", choices=["cosine", "none"], help="Learning rate decay style.")
     parser.add_argument("--share_folder", type=str2bool, nargs='?', const=False, help="Use share folder for data dir and output dir on multi machine.")
 
-    # Argument for bert
+    # Argument for bert/ernie
     parser.add_argument("--masked_lm_prob", type=float, default=0.15, help="Mask token prob.")
     parser.add_argument("--short_seq_prob", type=float, default=0.1, help="Short sequence prob.")
     parser.add_argument("--favor_longer_ngram", type=str2bool, default=False, help="Short sequence prob.")
diff --git a/model_zoo/ernie-1.0/pretraining_introduction.md b/model_zoo/ernie-1.0/pretraining_introduction.md
new file mode 100644
index 000000000000..edaffdbf8784
--- /dev/null
+++ b/model_zoo/ernie-1.0/pretraining_introduction.md
@@ -0,0 +1,342 @@
+# ERNIE 中文预训练
+
+## 背景
+
+ERNIE是百度提出的大规模预训练模型，曾在中文场景下取得了SOTA效果。
+PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 总共400GB，发布大规模开源语料预训练全流程。从零开始，轻松构建预训练模型。
+
+本项目，从数据下载，词表制作，数据转化，模型训练，所有流程，完全开源开放，可复现。
+并训练发布开源最优的模型参数。
+
+接下来将从下面几个方面，详细介绍整个数据制作全流程，从零开始，构建一个预训练模型。
+
+* [1. **数据准备**](#数据准备)
+    * [1.1 **大规模**中文数据](#大规模中文数据)
+    * [1.2 **高精准**中文分词](#高精准中文分词)
+    * [1.3 **快速**Token ID 转化](#快速TokenID转化)
+* [2. **全字符**中文词表制作](#全字符中文词表制作)
+    - [2.1 分析准备](#分析准备)
+    - [2.2 文本字符统计](#文本字符统计)
+    - [2.3 英文字符词表](#英文字符词表)
+    - [2.4 合并词表](#合并词表)
+* [3. **开始训练**](#开始训练)
+    - [3.1 训练样例](训练样例)
+        - 环境准备
+        - 启动训练
+    - [3.2 功能支持](功能支持)
+        - 训练速度
+        - 训练体验
+    - [3.3 观察评估](观察评估)
+        - VisualDL 可视化
+        - CLUE Benchmark 效果评估
+- [4. 训练效果](#训练效果)
+    - [ERNIE 3.0-Base-zh-CW 模型](#ernie-3.0-base-zh-cw)
+    - [ERNIE 1.0-Large-zh-CW 模型](#ernie-1.0-large-zh-cw)
+* [5. 参考](#参考)
+
+整体全部流程图如下：
+![image](https://user-images.githubusercontent.com/16911935/187170152-0778a6c1-6510-4c01-84d0-8e0ea3c05231.png)
+
+
+<a name="数据准备"> </a>
+
+## 1. 数据准备
+
+<a name="大规模中文数据"> </a>
+
+### 1.1 大规模中文数据
+
+**CLUECorpus2020 语料**
+
+CLUECorpus2020 通过对Common Crawl的中文部分进行语料清洗得到。开源部分提供了约200G的语料文本，详细介绍见[官网](https://github.com/CLUEbenchmark/CLUECorpus2020#%E6%95%B0%E6%8D%AE%E4%B8%8B%E8%BD%BD)，用户可以通过邮件申请下载。
+
+**WuDaoCorpus2.0 Base 语料**
+
+WuDaoCorpora是悟道爬取的中文大规模语料。整体数量为3TB，目前开源的部分为WuDaoCorpus2.0 Base 数据集，大小为200GB。
+用户微信登录[官网](https://resource.wudaoai.cn/home)，即可直接下载数据。下载好的压缩数据约 64GB。
+
+
+<a name="高精准中文分词"> </a>
+
+### 1.2 高精准中文分词
+
+ERNIE 使用知识嵌入的方式进行预训练，如何尽可能精确的从原始文本中提取知识，直接关系预训练模型的效果。
+目前PaddleNLP常用的分词方式有`jieba`，`lac`，`Wordtag`，
+效果、速度对比表格如下，假设CPU使用40线程，GPU使用16卡，处理200G文本：
+
+| 切词方式 | 效果 | 速度 | 预估耗时 |
+|-|-|-|-|
+| jieba | 一般 | 607 KB/s |  2.5 h |
+| lac   | 好 | 106 KB/s | 13.9 h |
+| wordtag| 最好 | 0.94 KB/s | 159 D (GPU)|
+
+综合考虑分词的效果与速度，我们选择百度的LAC作为我们的文本分词工具。
+
+本文档以WuDao数据为例，对数据进行分词：
+
+```shell
+python ./preprocess/wudao_process.py \
+    --input_path WuDaoCorpus2.0_base_200G \
+    --workers 40  \
+    --output_path ./wudao_lac_cut
+```
+注：预训练需要实现 SOP( Sentence Order Predict) 任务，在分词的同时，我们使用 简单规则 进行了文本断句。如果语料只有一句话，建议去除SOP loss，训练时设置 `binary_head=False`。
+
+文本转化完成后，我们使用 `./preprocess/trans_to_json.py`重新转换为jsonl格式（分词完毕）。
+```shell
+python ./preprocess/trans_to_json.py  \
+    --input_path ./wudao_lac_cut \
+    --output_path wudao_corpus_200g_0623.jsonl \
+    --workers 40 \
+    --no-shuffle
+```
+
+
+<a name="快速TokenID转化"> </a>
+
+### 1.3 快速Token ID 转化
+
+语料、词表准备妥当后，我们可以开始进行最后的数据ID转化。
+
+- 高效的 Multiprocessing 多进程实现
+- 使用内存BytesIO存储ID数据
+
+由于转换的逻辑复杂，需要定义`class Converter`对象来进行转化处理。如果每次处理新的文本，都实例化一次class对象，速度瓶颈会在处理函数的实例化。
+我们使用了 multiprocessing.Pool 的`initializer`，对处理函数进行提前实例化，提高处理效率。
+
+处理后的token id数量巨大，可以达到数百Billion，如果使用普通的数据结构，如python的list保存，会出现存储瓶颈，不仅占用空间大，list对象还需要重新分配内存空间。这里我们采用了 BytesIO 的方式，类似写入内存文件的方式，速度快，可以非常方便转化为numpy文件保存。
+
+使用 Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz CPU测试，40线程，处理速度 8+MB/s，约7个小时左右，即可完成 200GB 文本转化为ID.
+
+```
+python -u  ./preprocess/create_pretraining_data.py \
+    --model_name ./vocab_path/vocab.txt \
+    --tokenizer_name ErnieTokenizer \
+    --input_path wudao_corpus_200g_0623.jsonl \
+    --split_sentences\
+    --chinese \
+    --cn_splited \
+    --cn_whole_word_segment \
+    --output_prefix wudao_200g_0703 \
+    --workers 40 \
+    --log_interval 1000
+```
+转化后的数据如下，使用这份数据，即可开始ERNIE预训练
+```
+-rw-rw-r-- 1 500 501 129G Jul  4 03:39 wudao_200g_0703_ids.npy
+-rw-rw-r-- 1 500 501 6.4G Jul  4 03:39 wudao_200g_0703_idx.npz
+```
+
+
+
+<a name="全字符中文词表制作"> </a>
+
+## 2. 全字符中文词表制作
+
+之前的 数据 id 化中，使用了已有的词表进行转化，当没有词表时，需要从头开始进行词表制作。这里提供了ERNIE模型词表制作的两种方案：
+
+第一种，词表组合方案
+1. 统计字符
+2. 制作英文词表
+3. 合并词表
+
+第二种，预处理后直接生成方案
+1. 文本预处理（中文加空格，文本normalize）
+2. 使用sentencepiece制作词表
+
+第二种方案需要对文本先使用`BasicTokenizer`切分一遍语料。
+第一种方案，自定义程度高，但存在一些局限性。本项目采用了第一种方案，详细介绍如下：
+
+### 2.1 分析准备
+词表大小： 这里我们考虑的因素主要有两个
+- 已有模型对照：
+    - ERNIE 3.0系列模型的词表，词表大小为 40000 左右。
+- 预训练数据存储占用：
+    - 文本token id化后，希望使用uint16表示，此时最多可表示 65536 个不同的 token id（0~65535）。
+    - 同时考虑到ERNIE虽然是字模型，我们仍然需要 `##中` 之类的中文字符表示分词信息。假设使用中文全字符20902(0x4E00-0x9FA5)个字符，那么剩余 vocab 大小不能超过 44634。
+
+综上，本项目决定采用 40000 左右的 vocab 容量。
+其中：
+- 中文全字符 `20902`
+- 英文字符 `17000`
+- 其他字符约 `2000` 左右
+
+
+### 2.2 文本字符统计
+首先第一步是对文本字符进行统计。字符统计的目的主要是添加常用的中文字符、特殊字符。
+
+由于语料文本过大，我们随机选取 10G 左右的原始文本进行了字符统计。
+```
+python ./vocab/gen_char.py path_to_corpus.txt
+```
+可以在本地文件夹得到`char_dict.pickle`字符频率文件。同时我们也提供了自己统计的词频文件，方便用户复现：
+```
+wget https://paddlenlp.bj.bcebos.com/models/transformers/data_tools/char_dict.pickle
+```
+
+### 2.3 英文字符词表
+基于字符的词频统计，使得英文字符也切割为字母，为此我们需要添加英文词表。
+英文部分，我们使用了 [WikiText](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip)  数据集，来构造词表。
+下载解压数据，使用BPE切词
+```
+wget  https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip
+unzip wikitext-103-v1.zip
+python ./vocab/gen_vocab.py ./wikitext-103-raw/wiki.train.raw
+```
+即可产生英文部分的词表。这里我们也提供了处理好的 vocab 方便用户验证。
+```
+wget https://paddlenlp.bj.bcebos.com/models/transformers/data_tools/eng.vocab
+```
+
+
+### 2.4 合并词表
+
+目前我们得到了字符统计表，和英文字符词表。下一步，我们将词表进行合并。
+
+将`char_dict.pickle`，`eng.vocab`放置到当前目录，使用下面命令
+```
+python ./vocab/merge_vocab.py
+```
+即可在 当前 目录生成 vocab.txt 得到最终词表。
+
+此阶段需要注意的一些问题是：
+1. 对于一些日文、谚文文字字符，需要进行 normalize
+2. 添加special_tokens
+
+### 2.5 问题遗留
+本项目采用的第一种方式，即拼接产出的词表，对连续非中、英文字符文本，会出现UNK的情况。
+如issue: [#2927](https://github.com/PaddlePaddle/PaddleNLP/issues/2927)、 [#2585](https://github.com/PaddlePaddle/PaddleNLP/issues/2585)。本项目做了两点改进:
+
+1. 对 Symbol 字符默认添加空格，变成独立字符
+2. 对 日文、谚文 在合并词表阶段默认添加 ## 字符。
+
+虽然有上述两点修复，仍然无法避免 [#2927](https://github.com/PaddlePaddle/PaddleNLP/issues/2927) 现象。
+彻底解决的话，建议使用第二种方式制作vocab文件。
+
+### 2.6 方案二：预处理后直接生成
+此方案没有被采用，这里也简单说明一下具体的方案：
+1. 对语料使用 BasicTokenizer 转换
+```python
+from paddlenlp.transformers import BasicTokenizer
+tokenizer = BasicTokenizer()
+basic_toknizer = lambda x: " ".join(tokenizer.tokenize(x))
+# 对语料使用 basic_toknizer 转换
+# 并存储为新的语料 afer_basic_toknizer_corpus.txt
+```
+2. 处理转换后的语料
+```shell
+python ./vocab/gen_vocab.py afer_basic_toknizer_corpus.txt
+```
+对处理好的vocab文件手动替换一些`<pad> -> [PAD]`之类的special_tokens，即可产出词表。
+
+
+## 3. 开始训练
+
+使用开源中文语料CLUE、WuDao 总共400GB，提供上面提供的大规模语料数据集制作教程。接下来，看是模型训练。
+
+![image](https://user-images.githubusercontent.com/16911935/187134299-72628dce-cc04-49d7-89ef-078fad487724.png)
+
+### 3.1 网络配置
+
+- SOP Loss
+    - SOP (Sentence Order Predict) 损失，是 模型训练的常用损失。将文本中的句子顺序分为两段打乱，最后判断文本是否被打乱。下图是数据组织形式的展示： ![image](https://user-images.githubusercontent.com/16911935/187140981-924fd21c-fb67-4ba8-a421-490fd293175c.png)
+    - 此开关由 `binary_head` 选项开启，`binary_head=True`添加sop loss， `binary_head=False` 关闭 sop loss。
+    - **注意：如果你使用的语料文本中，只有一句话，无法分为多个句子段落，请设置 `binary_head=False`。否则，不符合要求的数据默认被删去，导致可训练的数据过小。**
+- MASK
+    -  MLM (Mask Language Model) 是通过随机将文本中的部分token，随机替换为`[MASK]` token，最后预测出真实的token值。ERNIE默认采用了Whole Word MASK方式，选定一些词语进行MASK。
+    - 用户可以设置 `masked_lm_prob` 控制mask的token占文本总token长度的比例。默认`masked_lm_prob=0.15` 随机mask 15% 的token数目。
+    - 设置`short_seq_prob`， 控制长度小于max_seq_length的样本比例，默认值`short_seq_prob=0.1`。制作数据时候，会有相应比例的数据 最大长度会设置为 一个小于 max_seq_length 的随机值。
+- Ngram MASK
+    - 项目还支持了n-gram mask策略，如下图所示，在 WWM 进行词语级别MASK的基础上（如此处mask掉的`[模型]`词组），n-gram 可以MASK掉连续n个词组。下面例子中，连续mask了2个词组，`【[语言][模型]】`同时进行了mask。 ![image](https://user-images.githubusercontent.com/16911935/187145669-7c55386d-f57a-4589-9e6d-e4a36b93e24c.png)
+    - 用户通过`max_ngrams`设置最大的`ngram`长度。默认`max_ngrams=3`。
+- Dropout
+    - Dropout 是常用的防止过拟合策略。对于大规模数据集训练，如`ernie-3.0`系列4T文本语料，可以设置 `dropout=0`，不考虑过拟合。实际`ernie-3.0-base-zh`训练中，没有开启Dropout。
+    - 用户可以设置 `hidden_dropout_prob`，`attention_probs_dropout_prob`。默认值为 `0.1`。
+
+### 3.2 训练速度
+
+**训练速度方面**，我们支持了如下策略，
+加速计算过程，减小显存占用，扩大batch_size：
+
+- **多卡多机**训练：
+    - 基于飞桨Fleet分布式API，用户可以十分方便的通过数据并行的方法，将训练扩展到多机多卡。
+- **混合精度**训练：
+    - 部分算子使用FP16计算kernel，加速计算过程。支持AMP混合精度O1，和Pure FP16全FP训练策略O2。
+- **梯度累积**训练：
+    - 用户可以指定梯度累积的步数，在梯度累积的step中，减少多卡之间梯度的通信，减少更新的次数，可以扩大训练的batch_size.
+- **重计算**训练：
+    - 通过重新计算前向的方式，减少前向网络中间变量的存储，可以显著减少显存占用。理论上，该方式以时间换空间，但在batch size显著扩大的情况下，速度下降幅度较少。
+    - 如图所示，训练过程中占用显存的中间变量，修改成了反向需要时，重新计算，避免常驻显存。![image](https://user-images.githubusercontent.com/16911935/187176881-06103714-3061-42ab-8322-0b63422e7087.png)
+
+
+### 3.3 训练体验
+**训练体验方面**，我们针对训练数据流、重启、可视化等方面做了针对性优化提升
+
+数据流
+- **多机扩展**
+    - 用户可以将数据放置到 NFS 服务器上，多机同时挂载数据即可。训练数据与计算资源分离。
+- **多数据混合**
+    - 训练数据集支持多个文件，即插即用，设置权重，传入参数即可data_dir="1.0  dateset_a  2.0 dataset_b"
+- **稳定可复现**
+    - MLM任务具有一定随机性，需要随机mask数据。本数据流通过固定每一个step数据的随机种子，实验数据流稳定可复现。
+- **快加载**
+    - 数据文件使用mmap读取，加载数百GB文件几乎不耗时。
+
+其他：
+- **断点重启**
+    - 用户可以单独设置，checkpoints steps 参数可设置较小，重启训练默认加载最新checkpoint。
+    - 断点数据自动恢复，学习率等参数也自动恢复。
+
+
+### 观察评估
+
+VisualDL训练可视化
+
+- **可视化日志记录**
+    - 日志展示为全局loss，波动小。
+    - 记录混合精度，loss_scaling等信息，方便用户debug。
+    - 对模型结构，配置参数，paddle版本信息进行记录，方便复现环境
+
+- CLUE Benchmark搜索评估参数效果
+pass
+
+
+## 训练效果
+
+**训练效果方面**，我们release了 base、large两个模型。均取得了较好的预训练效果。
+
+<a name="ernie-3.0-base-zh-cw"></a>
+
+### **ERNIE 3.0-Base-zh-CW** 模型
+
+使用CLUE，WuDao共计400GB的语料，batch_size 1024, 训练 400w step，即可训练得到`ernie-3.0-base-zh`类似的模型效果。相关模型参数，开源为`ernie-3.0-base-zh-cw`，用户加载即可使用。使用CLUE benchmark 对最优超参数进行GradSearch搜索：
+
+Model | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUEWSC2020 | CSL | CMRC | CHID | C3
+-- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
+Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc| Acc| Acc
+ERNIE 3.0-Base-zh-CW | 12L768H | 76.44 | 76.04 |    58.02 |    60.87 |    83.56 | 78.61 |    89.14 |    84.00 |  72.26/90.40 |    84.73 |    77.15 |
+ERNIE 2.0-Base-zh | 12L768H | 74.95  | 76.25 |    58.53 |    61.72 |    83.07 |    78.81 |    84.21 |    82.77 | 68.22/88.71    | 82.78    | 73.19
+ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |    58.91 |    62.25 |    81.68 |    76.58 |    85.20 |    82.77 | 67.32/87.83 | 82.47 | 69.68
+
+
+<a name="ernie-1.0-large-zh-cw"> </a>
+
+### **ERNIE 1.0-Large-zh-CW** 模型
+
+- 除了base模型外，我们还训练并放出了large模型。此模型采用的词表与ernie-1.0相同，因此命名为`ernie-1.0-large-zh-cw`。使用开源语料，batch_size 512, 训练 400w step，训练去除SOP任务，只保留MLM损失：
+
+Model | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUEWSC2020 | CSL | CMRC | CHID | C3
+-- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
+Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc| Acc| Acc
+ERNIE 1.0-Large-zh-CW | 24L1024H | <b>79.03</b> | 75.97 |    59.65 |    62.91 |    85.09 |    81.73| 93.09 |    84.53 | 74.22/91.88 | 88.57 | 84.54
+ERNIE 3.0-Xbase-zh| 20L1024H | 78.71 | 76.85 |    59.89 |    62.41 |    84.76 |    82.51 |    89.80 |    84.47 |    75.49/92.67 | 86.36 | 84.59
+RoBERTa-wwm-ext-large | 24L1024H | 76.61 |    76.00 |    59.33 |    62.02 |    83.88 |    78.81 |    90.79 |    83.67 |    70.58/89.82 |    85.72 |    75.26
+
+
+## 5. 参考文献
+
+感谢CLUE，WuDao提供的开源文本语料，参考资料：
+- Xu, L., Zhang, X. and Dong, Q., 2020. CLUECorpus2020: A large-scale Chinese corpus for pre-training language model. arXiv preprint arXiv:2003.01355.
+- Yuan, S., Zhao, H., Du, Z., Ding, M., Liu, X., Cen, Y., Zou, X., Yang, Z. and Tang, J., 2021. Wudaocorpora: A super large-scale chinese corpora for pre-training language models. AI Open, 2, pp.65-68.
+- https://github.com/CLUEbenchmark/CLUECorpus2020
+- https://resource.wudaoai.cn

From 0d0c23be18cba19858cf0433945adea44210eafb Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Mon, 29 Aug 2022 19:18:02 +0800
Subject: [PATCH 36/48] update  image width.

---
 .../ernie-1.0/pretraining_introduction.md     | 26 +++++++++++++++----
 1 file changed, 21 insertions(+), 5 deletions(-)

diff --git a/model_zoo/ernie-1.0/pretraining_introduction.md b/model_zoo/ernie-1.0/pretraining_introduction.md
index edaffdbf8784..8cac15686295 100644
--- a/model_zoo/ernie-1.0/pretraining_introduction.md
+++ b/model_zoo/ernie-1.0/pretraining_introduction.md
@@ -35,7 +35,10 @@ PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 
 * [5. 参考](#参考)
 
 整体全部流程图如下：
-![image](https://user-images.githubusercontent.com/16911935/187170152-0778a6c1-6510-4c01-84d0-8e0ea3c05231.png)
+
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/16911935/187170152-0778a6c1-6510-4c01-84d0-8e0ea3c05231.png" align="middle"  width="500" />
+</p>
 
 
 <a name="数据准备"> </a>
@@ -234,12 +237,18 @@ python ./vocab/gen_vocab.py afer_basic_toknizer_corpus.txt
 
 使用开源中文语料CLUE、WuDao 总共400GB，提供上面提供的大规模语料数据集制作教程。接下来，看是模型训练。
 
-![image](https://user-images.githubusercontent.com/16911935/187134299-72628dce-cc04-49d7-89ef-078fad487724.png)
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/16911935/187134299-72628dce-cc04-49d7-89ef-078fad487724.png" align="middle"  width="500" />
+</p>
 
 ### 3.1 网络配置
 
 - SOP Loss
-    - SOP (Sentence Order Predict) 损失，是 模型训练的常用损失。将文本中的句子顺序分为两段打乱，最后判断文本是否被打乱。下图是数据组织形式的展示： ![image](https://user-images.githubusercontent.com/16911935/187140981-924fd21c-fb67-4ba8-a421-490fd293175c.png)
+    - SOP (Sentence Order Predict) 损失，是 模型训练的常用损失。将文本中的句子顺序分为两段打乱，最后判断文本是否被打乱。下图是数据组织形式的展示：
+    <p align="center">
+    <img src="https://user-images.githubusercontent.com/16911935/187140981-924fd21c-fb67-4ba8-a421-490fd293175c.png" align="middle"  width="500" />
+    </p>
+
     - 此开关由 `binary_head` 选项开启，`binary_head=True`添加sop loss， `binary_head=False` 关闭 sop loss。
     - **注意：如果你使用的语料文本中，只有一句话，无法分为多个句子段落，请设置 `binary_head=False`。否则，不符合要求的数据默认被删去，导致可训练的数据过小。**
 - MASK
@@ -247,7 +256,11 @@ python ./vocab/gen_vocab.py afer_basic_toknizer_corpus.txt
     - 用户可以设置 `masked_lm_prob` 控制mask的token占文本总token长度的比例。默认`masked_lm_prob=0.15` 随机mask 15% 的token数目。
     - 设置`short_seq_prob`， 控制长度小于max_seq_length的样本比例，默认值`short_seq_prob=0.1`。制作数据时候，会有相应比例的数据 最大长度会设置为 一个小于 max_seq_length 的随机值。
 - Ngram MASK
-    - 项目还支持了n-gram mask策略，如下图所示，在 WWM 进行词语级别MASK的基础上（如此处mask掉的`[模型]`词组），n-gram 可以MASK掉连续n个词组。下面例子中，连续mask了2个词组，`【[语言][模型]】`同时进行了mask。 ![image](https://user-images.githubusercontent.com/16911935/187145669-7c55386d-f57a-4589-9e6d-e4a36b93e24c.png)
+    - 项目还支持了n-gram mask策略，如下图所示，在 WWM 进行词语级别MASK的基础上（如此处mask掉的`[模型]`词组），n-gram 可以MASK掉连续n个词组。下面例子中，连续mask了2个词组，`【[语言][模型]】`同时进行了mask。
+    <p align="center">
+    <img src="https://user-images.githubusercontent.com/16911935/187145669-7c55386d-f57a-4589-9e6d-e4a36b93e24c.png" align="middle"  width="500" />
+    </p>
+
     - 用户通过`max_ngrams`设置最大的`ngram`长度。默认`max_ngrams=3`。
 - Dropout
     - Dropout 是常用的防止过拟合策略。对于大规模数据集训练，如`ernie-3.0`系列4T文本语料，可以设置 `dropout=0`，不考虑过拟合。实际`ernie-3.0-base-zh`训练中，没有开启Dropout。
@@ -266,7 +279,10 @@ python ./vocab/gen_vocab.py afer_basic_toknizer_corpus.txt
     - 用户可以指定梯度累积的步数，在梯度累积的step中，减少多卡之间梯度的通信，减少更新的次数，可以扩大训练的batch_size.
 - **重计算**训练：
     - 通过重新计算前向的方式，减少前向网络中间变量的存储，可以显著减少显存占用。理论上，该方式以时间换空间，但在batch size显著扩大的情况下，速度下降幅度较少。
-    - 如图所示，训练过程中占用显存的中间变量，修改成了反向需要时，重新计算，避免常驻显存。![image](https://user-images.githubusercontent.com/16911935/187176881-06103714-3061-42ab-8322-0b63422e7087.png)
+    - 如图所示，训练过程中占用显存的中间变量，修改成了反向需要时，重新计算，避免常驻显存。
+    <p align="center">
+    <img src="https://user-images.githubusercontent.com/16911935/187176881-06103714-3061-42ab-8322-0b63422e7087.png" align="middle"  width="500" />
+    </p>
 
 
 ### 3.3 训练体验

From 4096e469ffc8c284585b7839c4830e2acc010e98 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Tue, 30 Aug 2022 13:38:19 +0800
Subject: [PATCH 37/48] refine doc

---
 model_zoo/ernie-1.0/README.md                 |  8 +--
 .../ernie-1.0/pretraining_introduction.md     | 70 +++++++++++++------
 2 files changed, 54 insertions(+), 24 deletions(-)

diff --git a/model_zoo/ernie-1.0/README.md b/model_zoo/ernie-1.0/README.md
index 19bf389dc87c..009fee7def65 100644
--- a/model_zoo/ernie-1.0/README.md
+++ b/model_zoo/ernie-1.0/README.md
@@ -294,13 +294,13 @@ PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 
 
 **训练效果方面**，我们release了base、large两个模型。均取得了较好的预训练效果。
 
-- **ERNIE 3.0-Base-zh-CW** 模型：
-    - 使用CLUE，WuDao共计400GB的语料，batch_size 1024, 训练 400w step，即可训练得到`ernie-3.0-base-zh`类似的模型效果。相关模型参数，开源为`ernie-3.0-base-zh-cw`，用户加载即可使用。使用CLUE benchmark 对最优超参数进行GradSearch搜索：
+- **ERNIE 1.0-Base-zh-CW** 模型：
+    - 使用CLUE，WuDao共计400GB的语料，batch_size 1024, 训练 400w step，即可训练得到`ernie-3.0-base-zh`类似的模型效果。相关模型参数，开源为`ernie-1.0-base-zh-cw`，用户加载即可使用。使用CLUE benchmark 对最优超参数进行GradSearch搜索：
 
 Model | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUEWSC2020 | CSL | CMRC | CHID | C3
 -- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
 Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc| Acc| Acc
-ERNIE 3.0-Base-zh-CW | 12L768H | 76.44 | 76.04 |    58.02 |    60.87 |    83.56 | 78.61 |    89.14 |    84.00 |  72.26/90.40 |    84.73 |    77.15 |
+ERNIE 1.0-Base-zh-CW | 12L768H | 76.44 | 76.04 |    58.02 |    60.87 |    83.56 | 78.61 |    89.14 |    84.00 |  72.26/90.40 |    84.73 |    77.15 |
 ERNIE 2.0-Base-zh | 12L768H | 74.95  | 76.25 |    58.53 |    61.72 |    83.07 |    78.81 |    84.21 |    82.77 | 68.22/88.71    | 82.78    | 73.19
 ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |    58.91 |    62.25 |    81.68 |    76.58 |    85.20 |    82.77 | 67.32/87.83 | 82.47 | 69.68
 
@@ -350,7 +350,7 @@ unset CUDA_VISIBLE_DEVICES
 
 ```shell
 trainer_id=${PADDLE_TRAINER_ID:-"0"}
-task_name="0809-ernie-3.0-base-cw-dp16-gb1024"
+task_name="0809-ernie-1.0-base-cw-dp16-gb1024"
 
 base_nfs="/path/to/your/nfs/mount/point"
 base_dir="${base_nfs}/ernie-cw/output/${task_name}"
diff --git a/model_zoo/ernie-1.0/pretraining_introduction.md b/model_zoo/ernie-1.0/pretraining_introduction.md
index 8cac15686295..de85a2b6988a 100644
--- a/model_zoo/ernie-1.0/pretraining_introduction.md
+++ b/model_zoo/ernie-1.0/pretraining_introduction.md
@@ -20,17 +20,18 @@ PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 
     - [2.3 英文字符词表](#英文字符词表)
     - [2.4 合并词表](#合并词表)
 * [3. **开始训练**](#开始训练)
-    - [3.1 训练样例](训练样例)
+    - [3.1 训练样例](#训练样例)
         - 环境准备
         - 启动训练
-    - [3.2 功能支持](功能支持)
+    - [3.2 功能支持](#功能支持)
+        - [网络配置](#网络配置)
         - 训练速度
         - 训练体验
-    - [3.3 观察评估](观察评估)
+    - [3.3 观察评估](#观察评估)
         - VisualDL 可视化
         - CLUE Benchmark 效果评估
 - [4. 训练效果](#训练效果)
-    - [ERNIE 3.0-Base-zh-CW 模型](#ernie-3.0-base-zh-cw)
+    - [ERNIE 1.0-Base-zh-CW 模型](#ernie-1.0-base-zh-cw)
     - [ERNIE 1.0-Large-zh-CW 模型](#ernie-1.0-large-zh-cw)
 * [5. 参考](#参考)
 
@@ -246,7 +247,7 @@ python ./vocab/gen_vocab.py afer_basic_toknizer_corpus.txt
 - SOP Loss
     - SOP (Sentence Order Predict) 损失，是 模型训练的常用损失。将文本中的句子顺序分为两段打乱，最后判断文本是否被打乱。下图是数据组织形式的展示：
     <p align="center">
-    <img src="https://user-images.githubusercontent.com/16911935/187140981-924fd21c-fb67-4ba8-a421-490fd293175c.png" align="middle"  width="500" />
+    <img src="https://user-images.githubusercontent.com/16911935/187140981-924fd21c-fb67-4ba8-a421-490fd293175c.png" align="middle"  width="600" />
     </p>
 
     - 此开关由 `binary_head` 选项开启，`binary_head=True`添加sop loss， `binary_head=False` 关闭 sop loss。
@@ -258,7 +259,7 @@ python ./vocab/gen_vocab.py afer_basic_toknizer_corpus.txt
 - Ngram MASK
     - 项目还支持了n-gram mask策略，如下图所示，在 WWM 进行词语级别MASK的基础上（如此处mask掉的`[模型]`词组），n-gram 可以MASK掉连续n个词组。下面例子中，连续mask了2个词组，`【[语言][模型]】`同时进行了mask。
     <p align="center">
-    <img src="https://user-images.githubusercontent.com/16911935/187145669-7c55386d-f57a-4589-9e6d-e4a36b93e24c.png" align="middle"  width="500" />
+    <img src="https://user-images.githubusercontent.com/16911935/187145669-7c55386d-f57a-4589-9e6d-e4a36b93e24c.png" align="middle"  width="600" />
     </p>
 
     - 用户通过`max_ngrams`设置最大的`ngram`长度。默认`max_ngrams=3`。
@@ -273,32 +274,61 @@ python ./vocab/gen_vocab.py afer_basic_toknizer_corpus.txt
 
 - **多卡多机**训练：
     - 基于飞桨Fleet分布式API，用户可以十分方便的通过数据并行的方法，将训练扩展到多机多卡。
+    - *<u>使用方法</u>*：
+        - 单机八卡
+        ```shell
+        python3 -u  -m paddle.distributed.launch \
+            --gpus "0,1,2,3,4,5,6,7" \
+            run_pretrain.py
+        ```
+        - 多机，假设机器ip为 `192.168.1.101,192.168.1.102`
+        ```shell
+        python3 -u  -m paddle.distributed.launch \
+            --gpus "0,1,2,3,4,5,6,7" \
+            --ips "192.168.1.101,192.168.1.102" \
+            run_pretrain.py
+        ```
 - **混合精度**训练：
     - 部分算子使用FP16计算kernel，加速计算过程。支持AMP混合精度O1，和Pure FP16全FP训练策略O2。
+    - 如下图所示，使用AMP O1时，一些参数自动从fp32 cast为FP16类型计算。使用`O2` pure fp16时，模型参数为 fp16。
+    - *<u>使用方法</u>*:  设置`use_amp=True`开启混合精度训练。设置`fp16_opt_level=O1`，切换pure_fp16请设置为`O2`。
+    <p align="center">
+    <img src="https://user-images.githubusercontent.com/16911935/187338824-8b522935-4d6e-48d4-a5f6-55695ed3b182.png" align="middle" width=600 />
+    </p>
 - **梯度累积**训练：
-    - 用户可以指定梯度累积的步数，在梯度累积的step中，减少多卡之间梯度的通信，减少更新的次数，可以扩大训练的batch_size.
+    - 用户可以指定梯度累积的步数。
+    - 在梯度累积的step中，减少多卡之间梯度的通信，减少更新的次数，扩大训练的batch_size。
+    - <u>*使用方法*</u>：用户设置 `global_batch_size`为 `micro_batch_size*卡数`的倍数，即可开启梯度累积。如：单卡bs=16，8卡，此时如果设置`global_batch_size=512`，则梯度累积次数为`global_batch_size/bs/card_num=512/16/8=4`。
 - **重计算**训练：
-    - 通过重新计算前向的方式，减少前向网络中间变量的存储，可以显著减少显存占用。理论上，该方式以时间换空间，但在batch size显著扩大的情况下，速度下降幅度较少。
-    - 如图所示，训练过程中占用显存的中间变量，修改成了反向需要时，重新计算，避免常驻显存。
+    - 通过重新计算前向的方式，减少前向网络中间变量的存储，可以显著减少显存占用。理论上，该方式以时间换空间，但在batch size显著扩大的情况下，速度下降幅度较小。
+    - 如图所示：原来训练过程中，中间变量需要常驻显存，等待反向计算。使用重计算之后，修改成了反向需要时，再重新计算一遍前向过程，生成中间变量。避免常驻显存，减小显存占用。
+    - <u>*使用方法*</u>：用户设置`use_recompute=True`即可使用。注意使用时，可同时扩大`micro_batch_size`参数。
     <p align="center">
-    <img src="https://user-images.githubusercontent.com/16911935/187176881-06103714-3061-42ab-8322-0b63422e7087.png" align="middle"  width="500" />
+    <img src="https://user-images.githubusercontent.com/16911935/187176881-06103714-3061-42ab-8322-0b63422e7087.png" align="middle"  width="600" />
     </p>
 
 
+
 ### 3.3 训练体验
 **训练体验方面**，我们针对训练数据流、重启、可视化等方面做了针对性优化提升
 
 数据流
 - **多机扩展**
-    - 用户可以将数据放置到 NFS 服务器上，多机同时挂载数据即可。训练数据与计算资源分离。
+    - 用户可以将数据放置到 NFS 服务器上，多机同时挂载数据即可。
+    - 解析：当用户需要在多台机器之间，一起多机训练，或者切换到空闲的机器上训练时。由于数据集很大(数百GB)，迁移不方便。训练数据与计算资源分离，是非常适合的策略。
+    - <u>*使用方法*</u>：参考[NFS服务搭建教程](https://blog.csdn.net/eijiyey/article/details/123184529)，用户将制作好的数据，放到NFS机器，然后挂载到有训练资源的其他机器训练即可。
+    <p align="center">
+    <img src="https://user-images.githubusercontent.com/16911935/187355897-478e7aeb-560f-4ea7-a29c-4bea9d8a7712.png" align="middle"  width="500" />
+    </p>
+
 - **多数据混合**
-    - 训练数据集支持多个文件，即插即用，设置权重，传入参数即可data_dir="1.0  dateset_a  2.0 dataset_b"
+    - 训练数据集支持多个文件，即插即用，设置权重。
+    - <u>*使用方法*</u>：传入参数即可data_dir="1.0  dateset_a  2.0 dataset_b"
 - **稳定可复现**
     - MLM任务具有一定随机性，需要随机mask数据。本数据流通过固定每一个step数据的随机种子，实验数据流稳定可复现。
+    - <u>*使用方法*</u>： 传入`seed`参数即可，修改参数后会重新生成 index 数据，打乱数据顺序。
 - **快加载**
-    - 数据文件使用mmap读取，加载数百GB文件几乎不耗时。
-
-其他：
+    - 数据文件使用mmap读取，避免直接将数据加载到内存，加载数百GB文件几乎不耗时。
 - **断点重启**
     - 用户可以单独设置，checkpoints steps 参数可设置较小，重启训练默认加载最新checkpoint。
     - 断点数据自动恢复，学习率等参数也自动恢复。
@@ -314,23 +344,23 @@ VisualDL训练可视化
     - 对模型结构，配置参数，paddle版本信息进行记录，方便复现环境
 
 - CLUE Benchmark搜索评估参数效果
-pass
+    - 使用
 
 
 ## 训练效果
 
 **训练效果方面**，我们release了 base、large两个模型。均取得了较好的预训练效果。
 
-<a name="ernie-3.0-base-zh-cw"></a>
+<a name="ernie-1.0-base-zh-cw"></a>
 
-### **ERNIE 3.0-Base-zh-CW** 模型
+### **ERNIE 1.0-Base-zh-CW** 模型
 
-使用CLUE，WuDao共计400GB的语料，batch_size 1024, 训练 400w step，即可训练得到`ernie-3.0-base-zh`类似的模型效果。相关模型参数，开源为`ernie-3.0-base-zh-cw`，用户加载即可使用。使用CLUE benchmark 对最优超参数进行GradSearch搜索：
+使用CLUE，WuDao共计400GB的语料，batch_size 1024, 训练 400w step，即可训练得到`ernie-3.0-base-zh`类似的模型效果。相关模型参数，开源为`ernie-1.0-base-zh-cw`，用户加载即可使用。使用CLUE benchmark 对最优超参数进行GradSearch搜索：
 
 Model | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUEWSC2020 | CSL | CMRC | CHID | C3
 -- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
 Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc| Acc| Acc
-ERNIE 3.0-Base-zh-CW | 12L768H | 76.44 | 76.04 |    58.02 |    60.87 |    83.56 | 78.61 |    89.14 |    84.00 |  72.26/90.40 |    84.73 |    77.15 |
+ERNIE 1.0-Base-zh-CW | 12L768H | 76.44 | 76.04 |    58.02 |    60.87 |    83.56 | 78.61 |    89.14 |    84.00 |  72.26/90.40 |    84.73 |    77.15 |
 ERNIE 2.0-Base-zh | 12L768H | 74.95  | 76.25 |    58.53 |    61.72 |    83.07 |    78.81 |    84.21 |    82.77 | 68.22/88.71    | 82.78    | 73.19
 ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |    58.91 |    62.25 |    81.68 |    76.58 |    85.20 |    82.77 | 67.32/87.83 | 82.47 | 69.68
 

From 7f7b4e0de90916c2dd91eba441ff96265c92aa09 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Tue, 30 Aug 2022 14:32:09 +0800
Subject: [PATCH 38/48] fit table width.

---
 model_zoo/ernie-1.0/README.md                   | 10 +++++-----
 model_zoo/ernie-1.0/pretraining_introduction.md | 11 +++++------
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/model_zoo/ernie-1.0/README.md b/model_zoo/ernie-1.0/README.md
index 009fee7def65..bc9cddba75cf 100644
--- a/model_zoo/ernie-1.0/README.md
+++ b/model_zoo/ernie-1.0/README.md
@@ -297,9 +297,9 @@ PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 
 - **ERNIE 1.0-Base-zh-CW** 模型：
     - 使用CLUE，WuDao共计400GB的语料，batch_size 1024, 训练 400w step，即可训练得到`ernie-3.0-base-zh`类似的模型效果。相关模型参数，开源为`ernie-1.0-base-zh-cw`，用户加载即可使用。使用CLUE benchmark 对最优超参数进行GradSearch搜索：
 
-Model | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUEWSC2020 | CSL | CMRC | CHID | C3
+<div style="width:180px">Model</div> | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUE WSC2020 | CSL | CMRC | CHID | C3
 -- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
-Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc| Acc| Acc
+Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1| Acc| Acc
 ERNIE 1.0-Base-zh-CW | 12L768H | 76.44 | 76.04 |    58.02 |    60.87 |    83.56 | 78.61 |    89.14 |    84.00 |  72.26/90.40 |    84.73 |    77.15 |
 ERNIE 2.0-Base-zh | 12L768H | 74.95  | 76.25 |    58.53 |    61.72 |    83.07 |    78.81 |    84.21 |    82.77 | 68.22/88.71    | 82.78    | 73.19
 ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |    58.91 |    62.25 |    81.68 |    76.58 |    85.20 |    82.77 | 67.32/87.83 | 82.47 | 69.68
@@ -307,10 +307,10 @@ ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |    58.91 |    62.25 |    81.68 |
 - **ERNIE 1.0-Large-zh-CW** 模型：
     - 除了base模型外，我们还训练了放出了large模型。此模型参数采用的是词表与ernie-1.0相同，因此命名为`ernie-1.0-large-zh-cw`。使用开源语料，batch_size 512, 训练 400w step，训练去除SOP任务，只保留MLM损失：
 
-Model | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUEWSC2020 | CSL | CMRC | CHID | C3
+<div style="width:190px">Model</div>  | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUE WSC2020 | CSL | CMRC | CHID | C3
 -- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
-Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc| Acc| Acc
-ERNIE 1.0-Large-zh-CW | 24L1024H | 79.03 | 75.97 |    59.65 |    62.91 |    85.09 |    81.73| 93.09 |    84.53 | 74.22/91.88 | 88.57 | 84.54
+Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1 | Acc| Acc
+ERNIE 1.0-Large-zh-CW| 24L1024H | 79.03 | 75.97 |    59.65 |    62.91 |    85.09 |    81.73| 93.09 |    84.53 | 74.22/91.88 | 88.57 | 84.54
 ERNIE 3.0-Xbase-zh| 20L1024H | 78.71 | 76.85 |    59.89 |    62.41 |    84.76 |    82.51 |    89.80 |    84.47 |    75.49/92.67 | 86.36 | 84.59
 RoBERTa-wwm-ext-large | 24L1024H | 76.61 |    76.00 |    59.33 |    62.02 |    83.88 |    78.81 |    90.79 |    83.67 |    70.58/89.82 |    85.72 |    75.26
 
diff --git a/model_zoo/ernie-1.0/pretraining_introduction.md b/model_zoo/ernie-1.0/pretraining_introduction.md
index de85a2b6988a..986b131af533 100644
--- a/model_zoo/ernie-1.0/pretraining_introduction.md
+++ b/model_zoo/ernie-1.0/pretraining_introduction.md
@@ -357,9 +357,9 @@ VisualDL训练可视化
 
 使用CLUE，WuDao共计400GB的语料，batch_size 1024, 训练 400w step，即可训练得到`ernie-3.0-base-zh`类似的模型效果。相关模型参数，开源为`ernie-1.0-base-zh-cw`，用户加载即可使用。使用CLUE benchmark 对最优超参数进行GradSearch搜索：
 
-Model | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUEWSC2020 | CSL | CMRC | CHID | C3
+<div style="width:180px">Model</div> | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUE WSC2020 | CSL | CMRC | CHID | C3
 -- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
-Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc| Acc| Acc
+Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1| Acc| Acc
 ERNIE 1.0-Base-zh-CW | 12L768H | 76.44 | 76.04 |    58.02 |    60.87 |    83.56 | 78.61 |    89.14 |    84.00 |  72.26/90.40 |    84.73 |    77.15 |
 ERNIE 2.0-Base-zh | 12L768H | 74.95  | 76.25 |    58.53 |    61.72 |    83.07 |    78.81 |    84.21 |    82.77 | 68.22/88.71    | 82.78    | 73.19
 ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |    58.91 |    62.25 |    81.68 |    76.58 |    85.20 |    82.77 | 67.32/87.83 | 82.47 | 69.68
@@ -371,14 +371,13 @@ ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |    58.91 |    62.25 |    81.68 |
 
 - 除了base模型外，我们还训练了放出了large模型。此模型参数采用的是词表与ernie-1.0相同，因此命名为`ernie-1.0-large-zh-cw`。使用开源语料，batch_size 512, 训练 400w step，训练去除SOP任务，只保留MLM损失：
 
-Model | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUEWSC2020 | CSL | CMRC | CHID | C3
+<div style="width:190px">Model</div>  | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUE WSC2020 | CSL | CMRC | CHID | C3
 -- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
-Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc| Acc| Acc
-ERNIE 1.0-Large-zh-CW | 24L1024H | <b>79.03</b> | 75.97 |    59.65 |    62.91 |    85.09 |    81.73| 93.09 |    84.53 | 74.22/91.88 | 88.57 | 84.54
+Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1 | Acc| Acc
+ERNIE 1.0-Large-zh-CW| 24L1024H | 79.03 | 75.97 |    59.65 |    62.91 |    85.09 |    81.73| 93.09 |    84.53 | 74.22/91.88 | 88.57 | 84.54
 ERNIE 3.0-Xbase-zh| 20L1024H | 78.71 | 76.85 |    59.89 |    62.41 |    84.76 |    82.51 |    89.80 |    84.47 |    75.49/92.67 | 86.36 | 84.59
 RoBERTa-wwm-ext-large | 24L1024H | 76.61 |    76.00 |    59.33 |    62.02 |    83.88 |    78.81 |    90.79 |    83.67 |    70.58/89.82 |    85.72 |    75.26
 
-
 ## 6. 参考文献
 
 感谢CLUE，WuDao提供的开源文本语料，参考资料：

From 68caef01f4df6290f6ec92ceeaf49509f08d9cfd Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Tue, 30 Aug 2022 14:41:14 +0800
Subject: [PATCH 39/48] fix c++ style

---
 model_zoo/ernie-1.0/data_tools/helpers.cpp |  5 ++---
 model_zoo/ernie-1.0/preprocess/README.md   | 14 +++++++-------
 2 files changed, 9 insertions(+), 10 deletions(-)

diff --git a/model_zoo/ernie-1.0/data_tools/helpers.cpp b/model_zoo/ernie-1.0/data_tools/helpers.cpp
index c66e740bc1ee..ebd71fabd1fb 100644
--- a/model_zoo/ernie-1.0/data_tools/helpers.cpp
+++ b/model_zoo/ernie-1.0/data_tools/helpers.cpp
@@ -32,7 +32,6 @@ using namespace std;
 
 const int32_t LONG_SENTENCE_LEN = 512;
 
-
 void build_blending_indices(py::array_t<uint8_t>& dataset_index,
                             py::array_t<int64_t>& dataset_sample_index,
                             const py::array_t<double>& weights,
@@ -292,10 +291,10 @@ py::array build_mapping_impl(const py::array_t<int64_t>& docs_,
         }
         break;
       }
-      if( epoch > 0 &&  map_index == 0 ){
+      if(epoch > 0 && map_index == 0){
         cout << endl << "     No available documtment find this dataset." << endl << std::flush;
         throw std::invalid_argument(
-          "Invalid dataset! the documtment should be with more than " 
+          "Invalid dataset! the document should be with more than " 
           + std::to_string(min_num_sent) + " scentences.");
       }
       // For each document:
diff --git a/model_zoo/ernie-1.0/preprocess/README.md b/model_zoo/ernie-1.0/preprocess/README.md
index f7af96c4e7c9..3cc5700fee92 100644
--- a/model_zoo/ernie-1.0/preprocess/README.md
+++ b/model_zoo/ernie-1.0/preprocess/README.md
@@ -125,7 +125,7 @@ optional arguments:
                         必须设置，如：ernie-1.0-base-zh, 可以参考已有的模型名称 https://paddlenlp.readthedocs.io/zh/latest/model_zoo/index.html#transformer
   --tokenizer_name {ErnieTokenizer,BertTokenizer,GPTTokenizer,GPTChineseTokenizer}
                         What type of tokenizer to use.
-                        模型对应的tokenizer, 目前暂时只支持 Ernie，Bert，GPT
+                        模型对应的tokenizer, 目前暂时只支持 ERNIE，BERT，GPT
 data input/output:
   --input_path INPUT_PATH
                         Path to input JSON files.
@@ -140,14 +140,14 @@ data input/output:
   --json_key JSON_KEY   For JSON format. Space separate listed of keys to extract from json
                         文本串json的key值。同前面trans_to_json.py的json_key，默认text为key
   --split_sentences     Split documents into sentences.
-                        是否需要将文章划分成句子。一般而言，GPT不需要，Bert/Ernie模型需要
+                        是否需要将文章划分成句子。一般而言，GPT不需要，BERT/ERNIE模型需要
 
 chinese words:
   --chinese             Is corpus need words segmentation step for chinese words.
                         中文情形必须设置。处理的文本类型是否是中文。
   --cn_whole_word_segment
                         Is corpus need words segmentation step for chinese words WWM.
-                        可选。是否需要WWM策略。一般而言，Bert/Ernie模型需要，GPT不需要。
+                        可选。是否需要WWM策略。一般而言，BERT/ERNIE模型需要，GPT不需要。
   --cn_seg_func {lac,seg,jieba}
                         Words segment function for chinese words.
                         默认jieba，jieba速度较快，lac模型更准确，计算量高。
@@ -184,12 +184,12 @@ python -u  create_pretraining_data.py \
 2. 使用自定义词表的话，请指定model_name为词表所在的文件夹地址。
 
 
-### Ernie预训练开始
-得到了处理好的训练数据，就可以开始Ernie模型的预训练了。ernie预训练的代码在`model_zoo/ernie-1.0`。
-简单将预处理好的数据，拷贝到data目录，即可开始Ernie模型预训练。
+### ERNIE 预训练开始
+得到了处理好的训练数据，就可以开始ERNIE模型的预训练了。ERNIE预训练的代码在`model_zoo/ernie-1.0`。
+简单将预处理好的数据，拷贝到data目录，即可开始ERNIE模型预训练。
 ```
 mkdir data
-mv ./data_tools/baike_sample* ./data
+mv ./preprocess/baike_sample* ./data
 sh run_static.sh
 # 建议修改 run_static.sh 中的配置，将max_steps设置小一些。
 ```

From c355c51a369198763dbd8c7f87b290ce7b26a619 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Tue, 30 Aug 2022 15:55:19 +0800
Subject: [PATCH 40/48] fix table

---
 model_zoo/ernie-1.0/README.md                   | 11 +++++------
 model_zoo/ernie-1.0/pretraining_introduction.md | 10 +++++-----
 2 files changed, 10 insertions(+), 11 deletions(-)

diff --git a/model_zoo/ernie-1.0/README.md b/model_zoo/ernie-1.0/README.md
index bc9cddba75cf..1a35b6f19675 100644
--- a/model_zoo/ernie-1.0/README.md
+++ b/model_zoo/ernie-1.0/README.md
@@ -297,24 +297,23 @@ PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 
 - **ERNIE 1.0-Base-zh-CW** 模型：
     - 使用CLUE，WuDao共计400GB的语料，batch_size 1024, 训练 400w step，即可训练得到`ernie-3.0-base-zh`类似的模型效果。相关模型参数，开源为`ernie-1.0-base-zh-cw`，用户加载即可使用。使用CLUE benchmark 对最优超参数进行GradSearch搜索：
 
-<div style="width:180px">Model</div> | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUE WSC2020 | CSL | CMRC | CHID | C3
+Model&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUE WSC2020 | CSL | CMRC | CHID | C3
 -- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
-Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1| Acc| Acc
-ERNIE 1.0-Base-zh-CW | 12L768H | 76.44 | 76.04 |    58.02 |    60.87 |    83.56 | 78.61 |    89.14 |    84.00 |  72.26/90.40 |    84.73 |    77.15 |
+ Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1| Acc| Acc
+ERNIE 1.0-Base-zh-CW | 12L768H | <b>76.44</b> | 76.04 |    58.02 |    60.87 |    83.56 | 78.61 |    89.14 |    84.00 |  72.26/90.40 |    84.73 |    77.15 |
 ERNIE 2.0-Base-zh | 12L768H | 74.95  | 76.25 |    58.53 |    61.72 |    83.07 |    78.81 |    84.21 |    82.77 | 68.22/88.71    | 82.78    | 73.19
 ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |    58.91 |    62.25 |    81.68 |    76.58 |    85.20 |    82.77 | 67.32/87.83 | 82.47 | 69.68
 
 - **ERNIE 1.0-Large-zh-CW** 模型：
     - 除了base模型外，我们还训练了放出了large模型。此模型参数采用的是词表与ernie-1.0相同，因此命名为`ernie-1.0-large-zh-cw`。使用开源语料，batch_size 512, 训练 400w step，训练去除SOP任务，只保留MLM损失：
 
-<div style="width:190px">Model</div>  | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUE WSC2020 | CSL | CMRC | CHID | C3
+Model&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;  | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUE WSC2020 | CSL | CMRC | CHID | C3
 -- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
 Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1 | Acc| Acc
-ERNIE 1.0-Large-zh-CW| 24L1024H | 79.03 | 75.97 |    59.65 |    62.91 |    85.09 |    81.73| 93.09 |    84.53 | 74.22/91.88 | 88.57 | 84.54
+ERNIE 1.0-Large-zh-CW| 24L1024H | <b>79.03</b> | 75.97 |    59.65 |    62.91 |    85.09 |    81.73| 93.09 |    84.53 | 74.22/91.88 | 88.57 | 84.54
 ERNIE 3.0-Xbase-zh| 20L1024H | 78.71 | 76.85 |    59.89 |    62.41 |    84.76 |    82.51 |    89.80 |    84.47 |    75.49/92.67 | 86.36 | 84.59
 RoBERTa-wwm-ext-large | 24L1024H | 76.61 |    76.00 |    59.33 |    62.02 |    83.88 |    78.81 |    90.79 |    83.67 |    70.58/89.82 |    85.72 |    75.26
 
-
 ###  开始训练
 
 训练脚本如下
diff --git a/model_zoo/ernie-1.0/pretraining_introduction.md b/model_zoo/ernie-1.0/pretraining_introduction.md
index 986b131af533..3abb442ec255 100644
--- a/model_zoo/ernie-1.0/pretraining_introduction.md
+++ b/model_zoo/ernie-1.0/pretraining_introduction.md
@@ -357,10 +357,10 @@ VisualDL训练可视化
 
 使用CLUE，WuDao共计400GB的语料，batch_size 1024, 训练 400w step，即可训练得到`ernie-3.0-base-zh`类似的模型效果。相关模型参数，开源为`ernie-1.0-base-zh-cw`，用户加载即可使用。使用CLUE benchmark 对最优超参数进行GradSearch搜索：
 
-<div style="width:180px">Model</div> | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUE WSC2020 | CSL | CMRC | CHID | C3
+Model&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUE WSC2020 | CSL | CMRC | CHID | C3
 -- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
-Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1| Acc| Acc
-ERNIE 1.0-Base-zh-CW | 12L768H | 76.44 | 76.04 |    58.02 |    60.87 |    83.56 | 78.61 |    89.14 |    84.00 |  72.26/90.40 |    84.73 |    77.15 |
+ Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1| Acc| Acc
+ERNIE 1.0-Base-zh-CW | 12L768H | <b>76.44</b> | 76.04 |    58.02 |    60.87 |    83.56 | 78.61 |    89.14 |    84.00 |  72.26/90.40 |    84.73 |    77.15 |
 ERNIE 2.0-Base-zh | 12L768H | 74.95  | 76.25 |    58.53 |    61.72 |    83.07 |    78.81 |    84.21 |    82.77 | 68.22/88.71    | 82.78    | 73.19
 ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |    58.91 |    62.25 |    81.68 |    76.58 |    85.20 |    82.77 | 67.32/87.83 | 82.47 | 69.68
 
@@ -371,10 +371,10 @@ ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |    58.91 |    62.25 |    81.68 |
 
 - 除了base模型外，我们还训练了放出了large模型。此模型参数采用的是词表与ernie-1.0相同，因此命名为`ernie-1.0-large-zh-cw`。使用开源语料，batch_size 512, 训练 400w step，训练去除SOP任务，只保留MLM损失：
 
-<div style="width:190px">Model</div>  | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUE WSC2020 | CSL | CMRC | CHID | C3
+Model&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;  | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUE WSC2020 | CSL | CMRC | CHID | C3
 -- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
 Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1 | Acc| Acc
-ERNIE 1.0-Large-zh-CW| 24L1024H | 79.03 | 75.97 |    59.65 |    62.91 |    85.09 |    81.73| 93.09 |    84.53 | 74.22/91.88 | 88.57 | 84.54
+ERNIE 1.0-Large-zh-CW| 24L1024H | <b>79.03</b> | 75.97 |    59.65 |    62.91 |    85.09 |    81.73| 93.09 |    84.53 | 74.22/91.88 | 88.57 | 84.54
 ERNIE 3.0-Xbase-zh| 20L1024H | 78.71 | 76.85 |    59.89 |    62.41 |    84.76 |    82.51 |    89.80 |    84.47 |    75.49/92.67 | 86.36 | 84.59
 RoBERTa-wwm-ext-large | 24L1024H | 76.61 |    76.00 |    59.33 |    62.02 |    83.88 |    78.81 |    90.79 |    83.67 |    70.58/89.82 |    85.72 |    75.26
 

From 8629207ead8b24d4b5204c50891fee7fbf027d15 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Tue, 30 Aug 2022 18:06:49 +0800
Subject: [PATCH 41/48] refine docs

---
 model_zoo/ernie-1.0/README.md                 |   2 +-
 .../ernie-1.0/pretraining_introduction.md     | 146 ++++++++++++++++--
 2 files changed, 132 insertions(+), 16 deletions(-)

diff --git a/model_zoo/ernie-1.0/README.md b/model_zoo/ernie-1.0/README.md
index 1a35b6f19675..7887d89c5287 100644
--- a/model_zoo/ernie-1.0/README.md
+++ b/model_zoo/ernie-1.0/README.md
@@ -299,7 +299,7 @@ PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 
 
 Model&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUE WSC2020 | CSL | CMRC | CHID | C3
 -- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
- Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1| Acc| Acc
+ Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1| Acc| Acc | Acc
 ERNIE 1.0-Base-zh-CW | 12L768H | <b>76.44</b> | 76.04 |    58.02 |    60.87 |    83.56 | 78.61 |    89.14 |    84.00 |  72.26/90.40 |    84.73 |    77.15 |
 ERNIE 2.0-Base-zh | 12L768H | 74.95  | 76.25 |    58.53 |    61.72 |    83.07 |    78.81 |    84.21 |    82.77 | 68.22/88.71    | 82.78    | 73.19
 ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |    58.91 |    62.25 |    81.68 |    76.58 |    85.20 |    82.77 | 67.32/87.83 | 82.47 | 69.68
diff --git a/model_zoo/ernie-1.0/pretraining_introduction.md b/model_zoo/ernie-1.0/pretraining_introduction.md
index 3abb442ec255..23ea11ac7450 100644
--- a/model_zoo/ernie-1.0/pretraining_introduction.md
+++ b/model_zoo/ernie-1.0/pretraining_introduction.md
@@ -242,6 +242,116 @@ python ./vocab/gen_vocab.py afer_basic_toknizer_corpus.txt
   <img src="https://user-images.githubusercontent.com/16911935/187134299-72628dce-cc04-49d7-89ef-078fad487724.png" align="middle"  width="500" />
 </p>
 
+### 3.0 训练脚本
+
+训练脚本如下
+
+<b>环境配置</b>
+
+- PYTHONPATH 设置为当前目录（适合paddlenlp develop运行）
+- 设置了一些FLAGS，包括增强报错，动态图Flag，提高矩阵乘法精度。
+
+<details>
+<summary>环境配置脚本</summary>
+
+```shell
+set -x
+
+# cd PaddleNLP/model_zoo/ernie-1.0
+export PYTHONPATH=$PYTHONPATH:../../
+
+export FLAGS_call_stack_level=2
+# export NCCL_SOCKET_IFNAME=xgbe0
+export FLAGS_gemm_use_half_precision_compute_type=False
+export FLAGS_enable_eager_mode=1
+unset CUDA_VISIBLE_DEVICES
+```
+</details>
+
+<b>路径配置</b>
+
+- 主要配置输入输出目录
+
+<details>
+<summary>路径配置</summary>
+
+```shell
+trainer_id=${PADDLE_TRAINER_ID:-"0"}
+task_name="0809-ernie-1.0-base-cw-dp16-gb1024"
+
+base_nfs="/path/to/your/nfs/mount/point"
+base_dir="${base_nfs}/ernie-cw/output/${task_name}"
+data_dir="5.0 ${base_nfs}/clue_oscar/clue_corpus_oscar_0630 7.0 ${base_nfs}/clue_train/clue_corpus_train_0629 12.0 ${base_nfs}/wudao_200g/wudao_200g_0703"
+vocab_dir="${base_nfs}/"
+```
+</details>
+
+**启动训练**：这里启动的是单机8卡任务，整体全局的batch_size 512。假如使用两机训练，请设置dp_degree=16。
+```shell
+python3 -u  -m paddle.distributed.launch \
+    --gpus "0,1,2,3,4,5,6,7" \
+    --log_dir "${base_dir}/log_${trainer_id}" \
+    run_pretrain.py \
+    --model_type "ernie" \
+    --model_name_or_path "ernie-3.0-base-zh" \
+    --tokenizer_name_or_path "${vocab_dir}" \
+    --input_dir "${data_dir}" \
+    --output_dir "${base_dir}" \
+    --fp16_opt_level "O1" \
+    --max_seq_len 512 \
+    --binary_head true \
+    --micro_batch_size 64 \
+    --sharding_degree 1\
+    --dp_degree 8 \
+    --use_sharding false \
+    --use_amp true \
+    --use_recompute false \
+    --max_lr 0.0001 \
+    --min_lr 0.00001 \
+    --max_steps 4000000 \
+    --save_steps 100000 \
+    --checkpoint_steps 5000 \
+    --decay_steps 3900000 \
+    --weight_decay 0.01 \
+    --warmup_rate 0.01 \
+    --grad_clip 1.0 \
+    --logging_freq 20 \
+    --num_workers 3 \
+    --eval_freq 1000 \
+    --device "gpu"\
+    --share_folder true \
+    --hidden_dropout_prob 0.1 \
+    --attention_probs_dropout_prob 0.1 \
+    --seed 1234 \
+```
+
+
+其中参数释义如下：
+- `model_name_or_path` 要训练的模型或者之前训练的checkpoint。
+- `tokenizer_name_or_path` 模型词表文件所在的文件夹，或者PaddleNLP内置tokenizer的名字。
+- `continue_training` 默认false，模型从随机初始化，开始训练。如果为True，从已有的预训练权重加载，开始训练。如果为True， 训练初始loss 为2.x 是正常loss，如果为False，随机初始化，初始loss一般为10+。
+- `input_dir` 指定输入文件，可以使用目录，指定目录时将包括目录中的所有文件。
+- `output_dir` 指定输出文件。
+- `split` 划分数据集为train、valid、test的比例。整个数据集会按照这个比例划分数据。默认1/1000的数据为test，当样本数太少时，请修改此比例。
+- `max_seq_len` 输入文本序列的长度。
+- `micro_batch_size` 单卡batch size大小，比如此处单卡bs=64, 采用8卡训练`global_batch_size=64*8=512`。
+- `use_amp` 开启混合精度策略。
+- `fp16_opt_level` 混合精度策略，支持O1 自动混合精度，O2 pure fp16精度训练。
+- `max_lr` 训练学习率。
+- `min_lr` 学习率衰减的最小值。
+- `max_steps` 最大训练步数。
+- `save_steps` 保存模型间隔。默认保存地址格式为`output_dir/model_50000`(5w 步时的权重)。
+- `checkpoint_steps` 模型checkpoint间隔，用于模型断点重启训练。默认地址为`output_dir/model_last`.
+- `weight_decay` 权重衰减参数。
+- `warmup_rate` 学习率warmup参数。
+- `grad_clip` 梯度裁剪范围。
+- `logging_freq` 日志输出间隔。
+- `num_workers` DataLoader采样进程，当数据输入为瓶颈时，可尝试提高采样进程数目。
+- `eval_freq` 模型评估间隔。
+- `device` 训练设备，默认为GPU。
+- `share_folder` 多机训练时，如果多机input_dir为挂载的同一个nfs网络位置，可以开启此选项，多机共享同一份数据。
+
+
 ### 3.1 网络配置
 
 - SOP Loss
@@ -281,11 +391,11 @@ python ./vocab/gen_vocab.py afer_basic_toknizer_corpus.txt
             --gpus "0,1,2,3,4,5,6,7" \
             run_pretrain.py
         ```
-        - 多机，假设机器ip为 `192.168.1.101,192.168.1.102`
+        - 多机，假设机器ip为 `192.168.1.101,192.168.1.102` **注**：多台机器启动的ips参数需要顺序一致。
         ```shell
         python3 -u  -m paddle.distributed.launch \
             --gpus "0,1,2,3,4,5,6,7" \
-            --ips "192.168.1.101,192.168.1.102" \
+            --ips 192.168.1.101,192.168.1.102 \
             run_pretrain.py
         ```
 - **混合精度**训练：
@@ -322,29 +432,35 @@ python ./vocab/gen_vocab.py afer_basic_toknizer_corpus.txt
     </p>
 
 - **多数据混合**
-    - 训练数据集支持多个文件，即插即用，设置权重。
-    - <u>*使用方法*</u>：传入参数即可data_dir="1.0  dateset_a  2.0 dataset_b"
+    -  <u>*简介*</u>：训练数据集支持多个文件，即插即用，可设置不同数据集占比权重。上面的多机训练的架构，混合使用了四份数据集。
+    - <u>*使用方法*</u>：传入参数即可`input_dir="1.0  dataset_a/prefix  2.0 dataset_b/prefix"`
+    - **注意**：如果文件夹中只有一份数据如`data/wudao_200g_0703_ids.npy data/wudao_200g_0703_idx.npz`，可以直接设置`input_dir=./data`为输入目录即可。如果需要设定多份数据集，必须写上数据集前缀，如`input_dir="1.0 data/wudao_200g_0703 1.0 data2/clue_corpus_train_0629"`。写前缀即可，不要加上后面类似`_ids.npy _idx.npz`的尾缀。
 - **稳定可复现**
-    - MLM任务具有一定随机性，需要随机mask数据。本数据流通过固定每一个step数据的随机种子，实验数据流稳定可复现。
+    - <u>*简介*</u>：MLM任务具有一定随机性，需要随机mask数据。本数据流通过固定每一个step数据的随机种子，实验数据流稳定可复现。
     - <u>*使用方法*</u>： 传入`seed`参数即可，修改参数后会重新生成 index 数据，打乱数据顺序。
 - **快加载**
-    - 数据文件使用mmap读取，避免直接将数据加载到内存，加载数百GB文件几乎不耗时。
+    -  <u>*简介*</u>：数据文件使用mmap读取，避免直接将数据加载到内存，加载数百GB文件几乎不耗时。
 - **断点重启**
-    - 用户可以单独设置，checkpoints steps 参数可设置较小，重启训练默认加载最新checkpoint。
+    -  <u>*简介*</u>：用户可以单独设置，`checkpoint_steps` 参数可设置较小，重启训练默认加载最新checkpoint。
     - 断点数据自动恢复，学习率等参数也自动恢复。
+    - **注意：** 此`checkpoint_steps`参数仅保留最后一个`checkpoint`到`model_last`文件夹，默认每次覆盖。用户需要永久保存参数，请设置`save_steps`。建议可以设置`checkpoint_steps`为需要间隔训练半小时、一小时左右的时间，一旦环境故障，可以获取到最新的`checkpoint`。
 
 
-### 观察评估
+### 3.4 观察评估
 
-VisualDL训练可视化
-
-- **可视化日志记录**
+- **训练过程观察**：VisualDL可视化日志记录
     - 日志展示为全局loss，波动小。
     - 记录混合精度，loss_scaling等信息，方便用户debug。
     - 对模型结构，配置参数，paddle版本信息进行记录，方便复现环境
 
-- CLUE Benchmark搜索评估参数效果
-    - 使用
+<p align="center">
+<img src="https://user-images.githubusercontent.com/16911935/187404575-52d53892-4272-4c9d-b29d-064352628951.png" align="middle"  width="900" />
+</p>
+
+
+- **下游任务评估**：CLUE Benchmark搜索评估参数效果
+    - 使用[批量启动-grid-search](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/benchmark/clue#%E6%89%B9%E9%87%8F%E5%90%AF%E5%8A%A8-grid-search)，可以进行批量搜索任务
+    - 注意，这里使用的是训练中的checkpoint进行评估，直接传入待评估参数所在的路径地址即可，即如 `python grid_seach.py output/ernie-base-outdir/model_100000` 之类的checkpoint地址。
 
 
 ## 训练效果
@@ -359,7 +475,7 @@ VisualDL训练可视化
 
 Model&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUE WSC2020 | CSL | CMRC | CHID | C3
 -- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
- Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1| Acc| Acc
+ Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1| Acc| Acc
 ERNIE 1.0-Base-zh-CW | 12L768H | <b>76.44</b> | 76.04 |    58.02 |    60.87 |    83.56 | 78.61 |    89.14 |    84.00 |  72.26/90.40 |    84.73 |    77.15 |
 ERNIE 2.0-Base-zh | 12L768H | 74.95  | 76.25 |    58.53 |    61.72 |    83.07 |    78.81 |    84.21 |    82.77 | 68.22/88.71    | 82.78    | 73.19
 ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |    58.91 |    62.25 |    81.68 |    76.58 |    85.20 |    82.77 | 67.32/87.83 | 82.47 | 69.68
@@ -369,7 +485,7 @@ ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |    58.91 |    62.25 |    81.68 |
 
 ### **ERNIE 1.0-Large-zh-CW** 模型
 
-- 除了base模型外，我们还训练了放出了large模型。此模型参数采用的是词表与ernie-1.0相同，因此命名为`ernie-1.0-large-zh-cw`。使用开源语料，batch_size 512, 训练 400w step，训练去除SOP任务，只保留MLM损失：
+除了base模型外，我们还训练了large模型。命名为`ernie-1.0-large-zh-cw`。使用开源语料，batch_size 512, 训练 400w step，训练去除SOP任务，只保留MLM损失，使用CLUE benchmark 对最优超参数进行GradSearch搜索：
 
 Model&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;  | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUE WSC2020 | CSL | CMRC | CHID | C3
 -- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |

From 7afe8f418e151f1418cea202bdc7eba626755b7b Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Thu, 1 Sep 2022 09:46:21 +0800
Subject: [PATCH 42/48] refine model_zoo/ernie-1.0/README.md

---
 model_zoo/ernie-1.0/README.md                 | 214 ++++++++++++------
 model_zoo/ernie-1.0/args.py                   |   3 +
 .../ernie-1.0/pretraining_introduction.md     | 122 +++++-----
 3 files changed, 216 insertions(+), 123 deletions(-)

diff --git a/model_zoo/ernie-1.0/README.md b/model_zoo/ernie-1.0/README.md
index 7887d89c5287..747aa44ef7c0 100644
--- a/model_zoo/ernie-1.0/README.md
+++ b/model_zoo/ernie-1.0/README.md
@@ -93,6 +93,7 @@ Learnt by ERNIE：[mask] [mask] [mask] 是黑龙江的省会，国际 [mask] [ma
 ## 2. 中文预训练
 ERNIE预训练采用的是MLM（Mask Language Model）的训练方式，采用WWM（Whole Word Mask）方式，对于完整语义单元的Token，会同时进行Mask。整体的训练损失loss是mlm_loss + sop_loss。
 
+
 本样例为用户提供了高效的训练流程，
 - **支持动态文本mask**： 用户可以根据自己的需求，灵活修改mask方式。具体可以参考修改`data_tools/dataset_utils.py`中`create_masked_lm_predictions`函数。
 - **支持自动断点训练重启恢复**。 用户可以设置`checkpoint_steps`，间隔`checkpoint_steps`数，即保留最新的checkpoint到`model_last`文件夹。重启训练时，程序默认从最新checkpoint重启训练，学习率、数据集都可以恢复到checkpoint时候的状态。
@@ -248,80 +249,26 @@ PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 
 本教程，从数据下载，词表制作，数据转化，模型训练，所有流程，完全开源开放，可复现。
 并训练发布开源最优的模型参数。
 
-#### 数据制作
-
-数据下载，词表制作，数据转化部分，请参见[CLUE WuDao数据预处理](./clue_wudao_process/README.md)。
-接下来我们主要介绍训练流程部分的特性：
-
-
-训练结构：
-- 支持SOP损失，灵活可配置。
-训练方式：
-- 同时支持动态图和静态图训练
-
-**训练速度方面**，我们支持了如下策略，加速计算过程，减小显存占用，扩大batch_size：
-
-- **多卡多机训练**：
-    - 基于飞桨Fleet分布式API，用户可以十分方便的通过数据并行的方法，将训练扩展到多机多卡。
-- **混合精度训练**：
-    - 部分算子使用FP16计算kernel，加速计算过程。支持AMP混合精度O1，和Pure FP16全FP训练策略O2。
-- **梯度累积训练**：
-    - 用户可以指定梯度累积的步数，在梯度累积的step中，减少多卡之间梯度的通信，减少更新的次数，可以扩大训练的batch_size.
-- **重计算训练**：
-    -  通过重新计算前向的方式，减少前向网络中间变量的存储，可以显著减少显存占用，
-
-
-**训练体验方面**，我们针对训练数据流、重启、可视化等方面做了针对性优化提升
-
-数据流
-- **多机扩展**
-    - 用户可以将数据放置到 NFS 服务器上，多机同时挂载数据即可。训练数据与计算资源分离。
-- **多数据混合**
-    - 训练数据集支持多个文件，即插即用，设置权重，传入参数即可data_dir="1.0  dateset_a  2.0 dataset_b"
-- **稳定可复现**
-    - MLM任务具有一定随机性，需要随机mask数据。本数据流通过固定每一个step数据的随机种子，实验数据流稳定可复现。
-- **快加载**
-    - 数据文件使用mmap读取，加载数百GB文件几乎不耗时。
+#### 数据准备
 
-其他：
-- **断点重启**
-    - 用户可以单独设置，checkpoints steps 参数可设置较小，重启训练默认加载最新checkpoint。
-    - 断点数据自动恢复，学习率等参数也自动恢复。
-- **可视化日志记录**
-    - 日志展示为全局loss，波动小。
-    - 记录混合精度，loss_scaling等信息，方便用户debug。
-    - 对模型结构，配置参数，paddle版本信息进行记录，方便复现环境
+数据下载，数据转化部分，请参见[数据预处理文档](./preprocess/README.md)，
+- [CLUECorpus2020数据处理](./preprocess/docs/CLUECorpus2020.md)
+- [WuDaoCorpusBase数据处理](./preprocess/docs/WuDaoCorpusBase.md)
 
-**训练效果方面**，我们release了base、large两个模型。均取得了较好的预训练效果。
+如果需要定制化词表，词表制作部分请参考[词表制作](./vocab/README.md)
 
-- **ERNIE 1.0-Base-zh-CW** 模型：
-    - 使用CLUE，WuDao共计400GB的语料，batch_size 1024, 训练 400w step，即可训练得到`ernie-3.0-base-zh`类似的模型效果。相关模型参数，开源为`ernie-1.0-base-zh-cw`，用户加载即可使用。使用CLUE benchmark 对最优超参数进行GradSearch搜索：
-
-Model&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUE WSC2020 | CSL | CMRC | CHID | C3
--- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
- Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1| Acc| Acc | Acc
-ERNIE 1.0-Base-zh-CW | 12L768H | <b>76.44</b> | 76.04 |    58.02 |    60.87 |    83.56 | 78.61 |    89.14 |    84.00 |  72.26/90.40 |    84.73 |    77.15 |
-ERNIE 2.0-Base-zh | 12L768H | 74.95  | 76.25 |    58.53 |    61.72 |    83.07 |    78.81 |    84.21 |    82.77 | 68.22/88.71    | 82.78    | 73.19
-ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |    58.91 |    62.25 |    81.68 |    76.58 |    85.20 |    82.77 | 67.32/87.83 | 82.47 | 69.68
-
-- **ERNIE 1.0-Large-zh-CW** 模型：
-    - 除了base模型外，我们还训练了放出了large模型。此模型参数采用的是词表与ernie-1.0相同，因此命名为`ernie-1.0-large-zh-cw`。使用开源语料，batch_size 512, 训练 400w step，训练去除SOP任务，只保留MLM损失：
-
-Model&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;  | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUE WSC2020 | CSL | CMRC | CHID | C3
--- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
-Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1 | Acc| Acc
-ERNIE 1.0-Large-zh-CW| 24L1024H | <b>79.03</b> | 75.97 |    59.65 |    62.91 |    85.09 |    81.73| 93.09 |    84.53 | 74.22/91.88 | 88.57 | 84.54
-ERNIE 3.0-Xbase-zh| 20L1024H | 78.71 | 76.85 |    59.89 |    62.41 |    84.76 |    82.51 |    89.80 |    84.47 |    75.49/92.67 | 86.36 | 84.59
-RoBERTa-wwm-ext-large | 24L1024H | 76.61 |    76.00 |    59.33 |    62.02 |    83.88 |    78.81 |    90.79 |    83.67 |    70.58/89.82 |    85.72 |    75.26
 
 ###  开始训练
 
+### 3.1 训练脚本
+
 训练脚本如下
 
 <b>环境配置</b>
 
 - PYTHONPATH 设置为当前目录（适合paddlenlp develop运行）
 - 设置了一些FLAGS，包括增强报错，动态图Flag，提高矩阵乘法精度。
+- 多机情况下，可以设置`NCCL_SOCKET_IFNAME`指明NCCL使用的通信网口。
 
 <details>
 <summary>环境配置脚本</summary>
@@ -342,7 +289,9 @@ unset CUDA_VISIBLE_DEVICES
 
 <b>路径配置</b>
 
-- 主要配置
+- 主要配置输入输出目录
+- 这里的`vocab_dir`如果没有使用自定义词表的话，请设置为内置的tokenizer，如`ernie-1.0-base-zh,ernie-3.0-base-zh`等。
+- 这里的 `data_dir` 设置多份数据集，用户不使用多份数据集的话，直接`data_dir="./data"`即可。
 
 <details>
 <summary>路径配置</summary>
@@ -358,7 +307,7 @@ vocab_dir="${base_nfs}/"
 ```
 </details>
 
-**启动训练**：这里启动的是两机16卡任务，dp_degree=16，整体全局的batch_size 1024
+**启动训练**：这里启动的是单机8卡任务，整体全局的batch_size 512 (64*8)。如需多机运行，请指定ips参数，如 `python3 -u  -m paddle.distributed.launch  --gpus "0,1,2,3,4,5,6,7" --ips 192.168.1.101,192.168.1.102 `
 ```shell
 python3 -u  -m paddle.distributed.launch \
     --gpus "0,1,2,3,4,5,6,7" \
@@ -369,14 +318,12 @@ python3 -u  -m paddle.distributed.launch \
     --tokenizer_name_or_path "${vocab_dir}" \
     --input_dir "${data_dir}" \
     --output_dir "${base_dir}" \
-    --fp16_opt_level "O1" \
+    --split 949,50,1 \
     --max_seq_len 512 \
     --binary_head true \
     --micro_batch_size 64 \
-    --sharding_degree 1\
-    --dp_degree 16 \
-    --use_sharding false \
     --use_amp true \
+    --fp16_opt_level "O1" \
     --use_recompute false \
     --max_lr 0.0001 \
     --min_lr 0.00001 \
@@ -398,6 +345,137 @@ python3 -u  -m paddle.distributed.launch \
 ```
 
 
+其中参数释义如下：
+- `model_name_or_path` 要训练的模型或者之前训练的checkpoint。
+- `tokenizer_name_or_path` 模型词表文件所在的文件夹(对于ernie，词表文件名一般命名为vocab.txt)，或者PaddleNLP内置tokenizer的名字。
+- `continue_training` 默认false，模型从随机初始化，开始训练。如果为True，从已有的预训练权重加载，开始训练。如果为True， 训练初始loss 为2.x 是正常loss，如果为False，随机初始化，初始loss一般为10+。
+- `input_dir` 指定输入文件，可以使用目录，指定目录时将包括目录中的所有文件。
+- `output_dir` 指定输出文件。
+- `split` 划分数据集为train、valid、test的比例。整个数据集会按照这个比例划分数据。默认`split=949,50,1`, 使用1/1000的数据为test，当样本数太少时，增大测试的样本数目。
+- `max_seq_len` 输入文本序列的长度，默认值`512`。
+- `binary_head` 是否使用SOP(Sentence Order Prediction) loss，默认为 True，使用此loss。如果用户句子语料很短，无法组合成句子对，请设置此参数为`false`。
+- `micro_batch_size` 单卡batch size大小，比如此处单卡bs=64, 采用8卡训练`global_batch_size=64*8=512`。
+- `use_amp` 开启混合精度策略。
+- `fp16_opt_level` 混合精度策略，支持O1 自动混合精度，O2 pure fp16精度训练。
+- `max_lr` 训练学习率。
+- `min_lr` 学习率衰减到最小值后，学习率将一直保持为`min_lr`。
+- `max_steps` 最大训练步数。训练不支持通过`epoch`控制，第一次制造数据index时候，日志会显示数据会被计算的epoch数，请注意查看。
+- `save_steps` 保存模型间隔。默认保存地址格式为`output_dir/model_50000`(5w 步时的权重)。
+- `checkpoint_steps` 模型checkpoint间隔，用于模型断点重启训练。默认地址为`output_dir/model_last`.
+- `weight_decay` 权重衰减参数。
+- `warmup_rate` 学习率warmup参数。
+- `grad_clip` 梯度裁剪范围。
+- `logging_freq` 日志输出间隔。
+- `num_workers` DataLoader采样进程，当数据输入为瓶颈时，可尝试提高采样进程数目。
+- `eval_freq` 模型评估间隔。
+- `device` 训练设备，默认为GPU。
+- `share_folder` 多机训练时，如果多机`input_dir`为挂载的同一个nfs网络位置，可以开启此选项，多机共享同一份数据。（每次运行，会制作训练的index数据，如果为挂载的同一nfs位置，则一台机器制作数据即可，否则每台机器都需要制作）
+
+
+<p align="center">
+  <img src="https://user-images.githubusercontent.com/16911935/187134299-72628dce-cc04-49d7-89ef-078fad487724.png" align="middle"  width="500" />
+</p>
+
+接下来我们对训练流程部分的特性进行简单介绍，详细参数配置介绍请参见[ERNIE 中文预训练介绍](./pretraining_introduction.md)。
+
+- **训练网络配置方面：**
+
+    本小节主要针对，任务的损失函数、MASK参数等配置进行了简单介绍。
+    - SOP Loss
+        - SOP (Sentence Order Predict) 损失，是 模型训练的常用损失。将文本中的句子顺序分为两段打乱，最后判断文本是否被打乱。可以通过设置`binary_head`开启或者关闭。
+    - MASK
+        -  MLM (Mask Language Model) 是通过随机将文本中的部分token，随机替换为`[MASK]` token，最后预测出真实的token值。ERNIE默认采用了Whole Word MASK方式，选定一些词语进行MASK。
+        - *<u>使用方法</u>*: 用户可以设置 `masked_lm_prob` 控制mask的token占文本总token长度的比例。默认`masked_lm_prob=0.15` 随机mask 15% 的token数目。
+    - Ngram MASK
+        - 项目还支持了n-gram mask策略，如下图所示，在 WWM 进行词语级别MASK的基础上（如此处mask掉的`[模型]`词组），n-gram 可以MASK掉连续n个词组。下面例子中，连续mask了2个词组，`【[语言][模型]】`同时进行了mask。
+        <p align="center">
+        <img src="https://user-images.githubusercontent.com/16911935/187145669-7c55386d-f57a-4589-9e6d-e4a36b93e24c.png" align="middle"  width="600" />
+        </p>
+
+        - *<u>使用方法</u>*: 用户通过`max_ngrams`设置最大的`ngram`长度。默认`max_ngrams=3`。
+
+    - Dropout
+        - Dropout 是常用的防止过拟合策略。对于大规模数据集训练，如`ernie-3.0`系列4T文本语料，可以设置 `dropout=0`，不考虑过拟合。实际`ernie-3.0-base-zh`训练中，没有开启Dropout。
+
+详细参数配置介绍请参见[ERNIE 中文预训练介绍](./pretraining_introduction.md)。
+
+
+- **训练速度方面**
+
+    我们支持了如下策略，加速计算过程，减小显存占用，扩大batch_size：
+
+    - **多卡多机训练**：
+        - 基于飞桨Fleet分布式API，用户可以十分方便的通过数据并行的方法，将训练扩展到多机多卡。
+    - **混合精度训练**：
+        - 部分算子使用FP16计算kernel，加速计算过程。支持AMP混合精度O1，和Pure FP16全FP训练策略O2。
+    - **梯度累积训练**：
+        - 用户可以指定梯度累积的步数，在梯度累积的step中，减少多卡之间梯度的通信，减少更新的次数，可以扩大训练的batch_size.
+    - **重计算训练**：
+        -  通过重新计算前向的方式，减少前向网络中间变量的存储，可以显著减少显存占用，
+
+详细参数配置介绍请参见[ERNIE 中文预训练介绍](./pretraining_introduction.md)。
+
+
+- **训练数据流方面**
+
+    我们针对训练数据流扩展、混合、重启等方面做了针对性优化提升
+    <p align="center">
+    <img src="https://user-images.githubusercontent.com/16911935/187355897-478e7aeb-560f-4ea7-a29c-4bea9d8a7712.png" align="middle"  width="500" />
+    </p>
+
+    - **多机扩展**
+        - 用户可以将数据放置到 NFS 服务器上，多机同时挂载数据即可。训练数据与计算资源分离。
+    - **多数据混合**
+        - 训练数据集支持多个文件，即插即用，设置权重，传入参数即可`input_dir="1.0  dataset_a/prefix  2.0 dataset_b/prefix"`
+    - **稳定可复现**
+        - MLM任务具有一定随机性，需要随机mask数据。本数据流通过固定每一个step数据的随机种子，实验数据流稳定可复现。
+    - **快加载**
+        - 数据文件使用mmap读取，加载数百GB文件几乎不耗时。
+    - **断点重启**
+        - 用户可以单独设置，checkpoints steps 参数可设置较小，重启训练默认加载最新checkpoint。
+        - 断点数据自动恢复，学习率等参数也自动恢复。
+
+详细参数配置介绍请参见[ERNIE 中文预训练介绍](./pretraining_introduction.md)。
+
+- **观察评估方面**
+
+    - **可视化日志记录**
+        - 日志展示为全局loss，波动小。
+        - 记录混合精度，loss_scaling等信息，方便用户debug。
+        - 对模型结构，配置参数，paddle版本信息进行记录，方便复现环境
+    - **下游任务评估**：CLUE Benchmark搜索评估参数效果
+        - 使用[批量启动-grid-search](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/examples/benchmark/clue#%E6%89%B9%E9%87%8F%E5%90%AF%E5%8A%A8-grid-search)，可以进行批量搜索任务
+        - 注意，这里使用的是训练中的checkpoint进行评估，直接传入待评估参数所在的路径地址即可，如 `python grid_seach.py output/ernie-base-outdir/model_100000` 之类的checkpoint地址。
+
+详细介绍请参见[ERNIE 中文预训练介绍](./pretraining_introduction.md)。
+
+
+- **训练效果方面**
+
+    我们release了base、large两个模型。均取得了较好的预训练效果。
+
+    - **ERNIE 1.0-Base-zh-CW** 模型：
+        - 使用CLUE，WuDao共计400GB的语料，batch_size 1024, 训练 400w step，即可训练得到`ernie-3.0-base-zh`类似的模型效果。相关模型参数，开源为`ernie-1.0-base-zh-cw`，用户加载即可使用。使用CLUE benchmark 对最优超参数进行Grid Search搜索：
+
+Model&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUE WSC2020 | CSL | CMRC | CHID | C3
+-- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
+ Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1| Acc| Acc
+ERNIE 1.0-Base-zh-CW | 12L768H | <b>76.44</b> | 76.04 |    58.02 |    60.87 |    83.56 | 78.61 |    89.14 |    84.00 |  72.26/90.40 |    84.73 |    77.15 |
+ERNIE 2.0-Base-zh | 12L768H | 74.95  | 76.25 |    58.53 |    61.72 |    83.07 |    78.81 |    84.21 |    82.77 | 68.22/88.71    | 82.78    | 73.19
+ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |    58.91 |    62.25 |    81.68 |    76.58 |    85.20 |    82.77 | 67.32/87.83 | 82.47 | 69.68
+
+    - **ERNIE 1.0-Large-zh-CW** 模型：
+
+        - 除了base模型外，我们还训练并放出了large模型。此模型采用的词表与ernie-1.0相同，因此命名为`ernie-1.0-large-zh-cw`。使用开源语料，batch_size 512, 训练 400w step，训练去除SOP任务，只保留MLM损失：
+
+Model&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;  | Arch | CLUE AVG |  AFQMC | TNEWS | IFLYTEK | CMNLI | OCNLI | CLUE WSC2020 | CSL | CMRC | CHID | C3
+-- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
+Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1 | Acc| Acc
+ERNIE 1.0-Large-zh-CW| 24L1024H | <b>79.03</b> | 75.97 |    59.65 |    62.91 |    85.09 |    81.73| 93.09 |    84.53 | 74.22/91.88 | 88.57 | 84.54
+ERNIE 3.0-Xbase-zh| 20L1024H | 78.71 | 76.85 |    59.89 |    62.41 |    84.76 |    82.51 |    89.80 |    84.47 |    75.49/92.67 | 86.36 | 84.59
+RoBERTa-wwm-ext-large | 24L1024H | 76.61 |    76.00 |    59.33 |    62.02 |    83.88 |    78.81 |    90.79 |    83.67 |    70.58/89.82 |    85.72 |    75.26
+
+
 <a name="预训练模型贡献"></a>
 
 ### 预训练模型贡献
diff --git a/model_zoo/ernie-1.0/args.py b/model_zoo/ernie-1.0/args.py
index 042cbcab0604..790d1b8852cd 100644
--- a/model_zoo/ernie-1.0/args.py
+++ b/model_zoo/ernie-1.0/args.py
@@ -121,5 +121,8 @@ def parse_args(MODEL_CLASSES):
             logger.warning(
                 "The attention_probs_dropout_prob should set to 0 for accuracy checking."
             )
+    if args.dp_degree * args.mp_degree * args.pp_degree * args.sharding_degree == 1:
+        if paddle.distributed.get_world_size() > 1:
+            args.dp_degree = paddle.distributed.get_world_size()
 
     return args
diff --git a/model_zoo/ernie-1.0/pretraining_introduction.md b/model_zoo/ernie-1.0/pretraining_introduction.md
index 23ea11ac7450..fd1162b16403 100644
--- a/model_zoo/ernie-1.0/pretraining_introduction.md
+++ b/model_zoo/ernie-1.0/pretraining_introduction.md
@@ -1,6 +1,4 @@
-# ERNIE 中文预训练
-
-## 背景
+# ERNIE 中文预训练介绍
 
 ERNIE是百度提出的大规模预训练模型，曾在中文场景下取得了SOTA效果。
 PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 总共400GB，发布大规模开源语料预训练全流程。从零开始，轻松构建预训练模型。
@@ -10,38 +8,32 @@ PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 
 
 接下来将从下面几个方面，详细介绍整个数据制作全流程，从零开始，构建一个预训练模型。
 
-* [1. **数据准备**](数据准备)
-    * [1.1 **大规模**中文数据](#大规模中文数据)
-    * [1.2 **高精准**中文分词](#高精准中文分词)
-    * [1.3 **快速**Token ID 转化](#快速TokenID转化)
-* [2. **全字符**中文词表制作](#中文中文词表制作)
+* [1. 数据准备](#数据准备)
+    * [1.1 大规模中文数据](#大规模中文数据)
+    * [1.2 高精准中文分词](#高精准中文分词)
+    * [1.3 快速Token ID 转化](#快速TokenID转化)
+* [2. 全字符中文词表制作](#中文词表制作)
     - [2.1 分析准备](#分析准备)
     - [2.2 文本字符统计](#文本字符统计)
     - [2.3 英文字符词表](#英文字符词表)
     - [2.4 合并词表](#合并词表)
-* [3. **开始训练**](#开始训练)
-    - [3.1 训练样例](#训练样例)
-        - 环境准备
-        - 启动训练
-    - [3.2 功能支持](#功能支持)
-        - [网络配置](#网络配置)
-        - 训练速度
-        - 训练体验
-    - [3.3 观察评估](#观察评估)
-        - VisualDL 可视化
-        - CLUE Benchmark 效果评估
-- [4. 训练效果](#训练效果)
-    - [ERNIE 1.0-Base-zh-CW 模型](#ernie-1.0-base-zh-cw)
-    - [ERNIE 1.0-Large-zh-CW 模型](#ernie-1.0-large-zh-cw)
-* [5. 参考](#参考)
-
-整体全部流程图如下：
+* [3. 开始训练](#开始训练)
+    - [3.1 训练脚本](#训练脚本)
+    - [3.2 训练网络配置](#networks)
+    - [3.3 训练速度配置](#speed)
+    - [3.4 训练数据流配置](#data_pipe)
+    - [3.5 观察评估](#观察评估)
+- [4. 训练效果](#release_models)
+    - [4.1 ERNIE 1.0-Base-zh-CW 模型](#ernie-1.0-base-zh-cw)
+    - [4.2 ERNIE 1.0-Large-zh-CW 模型](#ernie-1.0-large-zh-cw)
+* [5. 参考](#references)
+
+全部流程介绍图如下：
 
 <p align="center">
   <img src="https://user-images.githubusercontent.com/16911935/187170152-0778a6c1-6510-4c01-84d0-8e0ea3c05231.png" align="middle"  width="500" />
 </p>
 
-
 <a name="数据准备"> </a>
 
 ## 1. 数据准备
@@ -125,15 +117,16 @@ python -u  ../data_tools/create_pretraining_data.py \
     --workers 40 \
     --log_interval 1000
 ```
-转化后的数据如下，使用这份数据，即可开始ERNIE预训练
+此处需要指定词表文件进行ID转化，用户可以使用paddlenlp内置的部分词表如`ernie-1.0-base-zh,ernie-3.0-base-zh`，设置`model_name`参数为对应参数名即可。
+也可以根据自己的需求，重新开始制作词表，然后`model_name`传入词表所在的文件夹目录即可。词表制作，请参考下一章节[全字符中文词表制作](#中文词表制作)。
+
+转化后的数据如下，使用这份数据，即可开始ERNIE预训练：
 ```
 -rw-rw-r-- 1 500 501 129G Jul  4 03:39 wudao_200g_0703_ids.npy
 -rw-rw-r-- 1 500 501 6.4G Jul  4 03:39 wudao_200g_0703_idx.npz
 ```
 
-
-
-<a name="全字符中文词表制作"> </a>
+<a name="中文词表制作"> </a>
 
 ### 2. 全字符中文词表制作
 
@@ -242,7 +235,7 @@ python ./vocab/gen_vocab.py afer_basic_toknizer_corpus.txt
   <img src="https://user-images.githubusercontent.com/16911935/187134299-72628dce-cc04-49d7-89ef-078fad487724.png" align="middle"  width="500" />
 </p>
 
-### 3.0 训练脚本
+### 3.1 训练脚本
 
 训练脚本如下
 
@@ -250,6 +243,7 @@ python ./vocab/gen_vocab.py afer_basic_toknizer_corpus.txt
 
 - PYTHONPATH 设置为当前目录（适合paddlenlp develop运行）
 - 设置了一些FLAGS，包括增强报错，动态图Flag，提高矩阵乘法精度。
+- 多机情况下，可以设置`NCCL_SOCKET_IFNAME`指明NCCL使用的通信网口。
 
 <details>
 <summary>环境配置脚本</summary>
@@ -271,6 +265,8 @@ unset CUDA_VISIBLE_DEVICES
 <b>路径配置</b>
 
 - 主要配置输入输出目录
+- 这里的`vocab_dir`如果没有使用自定义词表的话，请设置为内置的tokenizer，如`ernie-1.0-base-zh,ernie-3.0-base-zh`等。
+- 这里的 `data_dir` 设置多份数据集，用户不使用多份数据集的话，直接`data_dir="./data"`即可。
 
 <details>
 <summary>路径配置</summary>
@@ -286,7 +282,7 @@ vocab_dir="${base_nfs}/"
 ```
 </details>
 
-**启动训练**：这里启动的是单机8卡任务，整体全局的batch_size 512。加入两机训练，请设置dp_degree=16。
+**启动训练**：这里启动的是单机8卡任务，整体全局的batch_size 512 (64*8)。如需多机运行，请指定ips参数，如 `python3 -u  -m paddle.distributed.launch  --gpus "0,1,2,3,4,5,6,7" --ips 192.168.1.101,192.168.1.102 `
 ```shell
 python3 -u  -m paddle.distributed.launch \
     --gpus "0,1,2,3,4,5,6,7" \
@@ -297,14 +293,12 @@ python3 -u  -m paddle.distributed.launch \
     --tokenizer_name_or_path "${vocab_dir}" \
     --input_dir "${data_dir}" \
     --output_dir "${base_dir}" \
-    --fp16_opt_level "O1" \
+    --split 949,50,1 \
     --max_seq_len 512 \
     --binary_head true \
     --micro_batch_size 64 \
-    --sharding_degree 1\
-    --dp_degree 8 \
-    --use_sharding false \
     --use_amp true \
+    --fp16_opt_level "O1" \
     --use_recompute false \
     --max_lr 0.0001 \
     --min_lr 0.00001 \
@@ -328,18 +322,19 @@ python3 -u  -m paddle.distributed.launch \
 
 其中参数释义如下：
 - `model_name_or_path` 要训练的模型或者之前训练的checkpoint。
-- `tokenizer_name_or_path` 模型词表文件所在的文件夹，或者PaddleNLP内置tokenizer的名字。
+- `tokenizer_name_or_path` 模型词表文件所在的文件夹(对于ernie，词表文件名一般命名为vocab.txt)，或者PaddleNLP内置tokenizer的名字。
 - `continue_training` 默认false，模型从随机初始化，开始训练。如果为True，从已有的预训练权重加载，开始训练。如果为True， 训练初始loss 为2.x 是正常loss，如果未False，随机初始化，初始loss一般为10+。
 - `input_dir` 指定输入文件，可以使用目录，指定目录时将包括目录中的所有文件。
 - `output_dir` 指定输出文件。
-- `split` 划分数据集为train、valid、test的比例。整个数据集会按照这个比例划分数据。默认1/1000的数据为test，当样本数太少时，请修改此比例。
-- `max_seq_len` 输入文本序列的长度。
+- `split` 划分数据集为train、valid、test的比例。整个数据集会按照这个比例划分数据。默认`split=949,50,1`, 使用1/1000的数据为test，当样本数太少时，增大测试的样本数目。
+- `max_seq_len` 输入文本序列的长度，默认值`512`。
+- `binary_head` 是否使用SOP(Sentence Order Prediction) loss，默认为 True，使用此loss。如果用户句子语料很短，无法组合成句子对，请设置此参数为`false`。
 - `micro_batch_size` 单卡batch size大小，比如此处单卡bs=64, 采用8卡训练`global_batch_size=64*8=512`。
 - `use_amp` 开启混合精度策略。
 - `fp16_opt_level` 混合精度策略，支持O1 自动混合精度，O2 pure fp16精度训练。
 - `max_lr` 训练学习率。
-- `min_lr` 学习率衰减的最小值。
-- `max_steps` 最大训练步数。
+- `min_lr` 学习率衰减到最小值后，学习率将一直保持为`min_lr`。
+- `max_steps` 最大训练步数。训练不支持通过`epoch`控制，第一次制造数据index时候，日志会显示数据会被计算的epoch数，请注意查看。
 - `save_steps` 保存模型间隔。默认保存地址格式为`output_dir/model_50000`(5w 步时的权重)。
 - `checkpoint_steps` 模型checkpoint间隔，用于模型断点重启训练。默认地址为`output_dir/model_last`.
 - `weight_decay` 权重衰减参数。
@@ -349,10 +344,14 @@ python3 -u  -m paddle.distributed.launch \
 - `num_workers` DataLoader采样进程，当数据输入为瓶颈时，可尝试提高采样进程数目。
 - `eval_freq` 模型评估间隔。
 - `device` 训练设备，默认为GPU。
-- `share_folder` 多机训练时，如果多机input_dir为挂载的同一个nfs网络位置，可以开启次选项，多机共享同一份数据。
+- `share_folder` 多机训练时，如果多机`input_dir`为挂载的同一个nfs网络位置，可以开启此选项，多机共享同一份数据。（每次运行，会制作训练的index数据，如果为挂载的同一nfs位置，则一台机器制作数据即可，否则每台机器都需要制作）
 
 
-### 3.1 网络配置
+<a name="networks"> </a>
+
+### 3.2 训练网络配置
+
+本小节主要针对任务的损失函数、MASK参数等配置进行介绍。
 
 - SOP Loss
     - SOP (Sentence Order Predict) 损失，是 模型训练的常用损失。将文本中的句子顺序分为两段打乱，最后判断文本是否被打乱。下图是数据组织形式的展示：
@@ -360,11 +359,11 @@ python3 -u  -m paddle.distributed.launch \
     <img src="https://user-images.githubusercontent.com/16911935/187140981-924fd21c-fb67-4ba8-a421-490fd293175c.png" align="middle"  width="600" />
     </p>
 
-    - 此开关由 `binary_head` 选项开启，`binary_head=True`添加sop loss， `binary_head=False` 关闭 sop loss。
+    - *<u>使用方法</u>*: 此开关由 `binary_head` 选项开启，`binary_head=True`添加sop loss， `binary_head=False` 关闭 sop loss。
     - **注意：如果你使用的语料文本中，只有一句话，无法分为多个句子段落，请设置 `binary_head=False`。否则，不符合要求的数据默认被删去，导致可训练的数据过小。**
 - MASK
     -  MLM (Mask Language Model) 是通过随机将文本中的部分token，随机替换为`[MASK]` token，最后预测出真实的token值。ERNIE默认采用了Whole Word MASK方式，选定一些词语进行MASK。
-    - 用户可以设置 `masked_lm_prob` 控制mask的token占文本总token长度的比例。默认`masked_lm_prob=0.15` 随机mask 15% 的token数目。
+    - *<u>使用方法</u>*: 用户可以设置 `masked_lm_prob` 控制mask的token占文本总token长度的比例。默认`masked_lm_prob=0.15` 随机mask 15% 的token数目。
     - 设置`short_seq_prob`， 控制长度小于max_seq_length的样本比例，默认值`short_seq_prob=0.1`。制作数据时候，会有相应比例的数据 最大长度会设置为 一个小于 max_seq_length 的随机值。
 - Ngram MASK
     - 项目还支持了n-gram mask策略，如下图所示，在 WWM 进行词语级别MASK的基础上（如此处mask掉的`[模型]`词组），n-gram 可以MASK掉连续n个词组。下面例子中，连续mask了2个词组，`【[语言][模型]】`同时进行了mask。
@@ -372,12 +371,21 @@ python3 -u  -m paddle.distributed.launch \
     <img src="https://user-images.githubusercontent.com/16911935/187145669-7c55386d-f57a-4589-9e6d-e4a36b93e24c.png" align="middle"  width="600" />
     </p>
 
-    - 用户通过`max_ngrams`设置最大的`ngram`长度。默认`max_ngrams=3`。
+    - *<u>使用方法</u>*: 用户通过`max_ngrams`设置最大的`ngram`长度。默认`max_ngrams=3`。
+    - 注：
+        - ernie预训练使用的 dataset 代码文件在 `./data_tools/ernie_dataset.py`
+        - 数据集index生成，动态mask相关代码实现在`./data_tools/dataset_utils.py`
+
+        - 用户可以根据自己的需求，灵活修改mask方式。具体可以参考`dataset_utils.py`中`create_masked_lm_predictions`函数。可以自定义的选项有do_whole_word_mask, favor_longer_ngram, do_permutation, geometric_dist等，可以参考[Megatron](https://github.com/NVIDIA/Megatron-LM)使用这些lm_mask策略。
+
 - Dropout
     - Dropout 是常用的防止过拟合策略。对于大规模数据集训练，如`ernie-3.0`系列4T文本语料，可以设置 `dropout=0`，不考虑过拟合。实际`ernie-3.0-base-zh`训练中，没有开启Dropout。
-    - 用户可以设置 `hidden_dropout_prob`，`attention_probs_dropout_prob`。默认值为 `0.1`。
+    - *<u>使用方法</u>*: 用户可以设置 `hidden_dropout_prob`，`attention_probs_dropout_prob`。默认值为 `0.1`。
 
-### 3.2 训练速度
+
+<a name="speed"> </a>
+
+### 3.3 训练速度配置
 
 **训练速度方面**，我们支持了如下策略，加
 速计算过程，减小显存占用，扩大batch_size：
@@ -401,7 +409,7 @@ python3 -u  -m paddle.distributed.launch \
 - **混合精度**训练：
     - 部分算子使用FP16计算kernel，加速计算过程。支持AMP混合精度O1，和Pure FP16全FP训练策略O2。
     - 如下图所示，使用AMP O1时，一些参数自动从fp32 cast为FP16类型计算。使用`O2` pure fp16时，模型参数为 fp16。
-    - *<u>使用方法</u>*:  设置`use_amp=True`开启混合精度训练。设置`fp16_opt_level=O1`，切换pure_fp16请设置为`O2`。
+    - *<u>使用方法</u>*: 设置`use_amp=True`开启混合精度训练。设置`fp16_opt_level=O1`，切换pure_fp16请设置为`O2`。
     <p align="center">
     <img src="https://user-images.githubusercontent.com/16911935/187338824-8b522935-4d6e-48d4-a5f6-55695ed3b182.png" align="middle" width=600 />
     </p>
@@ -418,9 +426,10 @@ python3 -u  -m paddle.distributed.launch \
     </p>
 
 
+<a name="data_pipe"> </a>
 
-### 3.3 训练体验
-**训练体验方面**，我们针对训练数据流、重启、可视化等方面做了针对性优化提升
+### 3.4 训练数据流配置
+**训练数据流方面**，我们针对训练数据流扩展、混合、重启等方面做了针对性优化提升
 
 数据流
 - **多机扩展**
@@ -463,13 +472,14 @@ python3 -u  -m paddle.distributed.launch \
     - 注意，这里使用的是训练中的checkpoint进行评估，可以直接试着 评估待评估的参数为，所在的路径地址，即如 `python grid_seach.py ouput/ernie-base-outdir/model_100000` 之类的checkpoint地址。
 
 
-## 训练效果
+<a name="release_models"></a>
+## 4. 训练效果
 
 **训练效果方面**，我们release了 base、large两个模型。均取得了较好的预训练效果。
 
 <a name="ernie-1.0-base-zh-cw"></a>
 
-### **ERNIE 1.0-Base-zh-CW** 模型
+### 4.1 ERNIE 1.0-Base-zh-CW 模型
 
 使用CLUE，WuDao共计400GB的语料，batch_size 1024, 训练 400w step，即可训练得到`ernie-3.0-base-zh`类似的模型效果。相关模型参数，开源为`ernie-1.0-base-zh-cw`，用户加载即可使用。使用CLUE benchmark 对最优超参数进行GradSearch搜索：
 
@@ -483,7 +493,7 @@ ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |    58.91 |    62.25 |    81.68 |
 
 <a name="ernie-1.0-large-zh-cw"> </a>
 
-### **ERNIE 1.0-Large-zh-CW** 模型
+### 4.2 ERNIE 1.0-Large-zh-CW 模型
 
 除了base模型外，我们还训练了large模型。命名为`ernie-1.0-large-zh-cw`。使用开源语料，batch_size 512, 训练 400w step，训练去除SOP任务，只保留MLM损失，使用CLUE benchmark 对最优超参数进行GradSearch搜索：
 
@@ -494,7 +504,9 @@ ERNIE 1.0-Large-zh-CW| 24L1024H | <b>79.03</b> | 75.97 |    59.65 |    62.91 |
 ERNIE 3.0-Xbase-zh| 20L1024H | 78.71 | 76.85 |    59.89 |    62.41 |    84.76 |    82.51 |    89.80 |    84.47 |    75.49/92.67 | 86.36 | 84.59
 RoBERTa-wwm-ext-large | 24L1024H | 76.61 |    76.00 |    59.33 |    62.02 |    83.88 |    78.81 |    90.79 |    83.67 |    70.58/89.82 |    85.72 |    75.26
 
-## 6. 参考文献
+<a name="references"> </a>
+
+## 5. 参考文献
 
 感谢CLUE，WuDao提供的开源文本语料，参考资料：
 - Xu, L., Zhang, X. and Dong, Q., 2020. CLUECorpus2020: A large-scale Chinese corpus for pre-training language model. arXiv preprint arXiv:2003.01355.

From 1805538163a9bb297f260f438f14a287a34d40f6 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Thu, 1 Sep 2022 16:07:42 +0800
Subject: [PATCH 43/48] readfine readme.

---
 model_zoo/ernie-1.0/preprocess/README.md      |  26 ++-
 .../preprocess/docs/WuDaoCorpusBase.md        |  64 ++++++--
 ...wudao_process.py => words_segmentation.py} |  57 +++++--
 .../ernie-1.0/pretraining_introduction.md     |   3 +
 model_zoo/ernie-1.0/vocab/README.md           | 148 ++++++++----------
 5 files changed, 189 insertions(+), 109 deletions(-)
 rename model_zoo/ernie-1.0/preprocess/{wudao_process.py => words_segmentation.py} (78%)

diff --git a/model_zoo/ernie-1.0/preprocess/README.md b/model_zoo/ernie-1.0/preprocess/README.md
index 3cc5700fee92..6d222e4560e8 100644
--- a/model_zoo/ernie-1.0/preprocess/README.md
+++ b/model_zoo/ernie-1.0/preprocess/README.md
@@ -1,6 +1,7 @@
 # PaddleNLP 预训练数据流程
 
-本示例致力于打造基于PaddleNLP预训练模型的最佳实践。
+本示例致力于打造基于PaddleNLP预训练模型的最佳实践。预训练全部流程的整体详细介绍文档，请参考[ERNIE 中文预训练介绍](../pretraining_introduction.md)。本文档主要介绍预训练数据流程。
+
 
 我们将预训练数据过程划分为以下部分
 
@@ -42,13 +43,24 @@
 
 飞桨是自主研发、功能完备、开源开放的产业级深度学习平台，集深度学习核心训练和推理框架、基础模型库、端到端开发套件和丰富的工具组件于一体
 
-|步骤|阶段|数据格式| 样例|
+|步骤|阶段&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;|数据格式| 样例|
 |-|-|-|-|
-| - |-|原始数据： <br/> 每个doc之间用空行间隔开 <br/> - 中文，默认每句换行符，作为句子结束。<br/> - 英文，默认使用nltk判断句子结束  | ```飞桨是功能完备、开源开放的产业级深度学习平台。``` <br/> ```飞桨拥有核心训练和推理框架、基础模型库。``` <br/><br/> ```PaddleNLP是自然语言处理领域的优秀工具。```  |
-|原始数据转换<br/>`trans_to_json.py`|预处理|jsonl格式：每个doc对应一行json字符串| ```{"text": "飞桨是功能完备、开源开放的产业级深度学习平台。飞桨拥有..."}```<br/>```{"text": "PaddleNLP是自然语言..."}```
-|数据ID化<br/>`create_pretrain_data.py`|预处理| npy格式：数据id化后的token id <br/>npz格式：数据句子、文章位置索引 | -
-|训练index文件生成|训练启动|npy格式：<br/> 根据训练步数max_steps生成<br/>train、valid、test的每个样本索引文件| -
-|token动态mask（可选）| Dataset取数据 | 无 |-
+| 0️⃣初始状态 | -|原始数据： <br/> **每个doc之间用空行间隔开** <br/> - 中文，默认每句换行符，作为句子结束。<br/> - 英文，默认使用nltk判断句子结束  | ```飞桨是功能完备、开源开放的产业级深度学习平台。``` <br/> ```飞桨拥有核心训练和推理框架、基础模型库。``` <br/><br/> ```PaddleNLP是自然语言处理领域的优秀工具。```  |
+|1️⃣原始数据转换<br/>`trans_to_json.py`|预处理 <br>输入：0️⃣初始状态 <br>输出：jsonl|jsonl格式：每个doc对应一行json字符串| ```{"text": "飞桨是功能完备、开源开放的产业级深度学习平台。飞桨拥有..."}```<br/>```{"text": "PaddleNLP是自然语言..."}```
+|❇️(**可选**)数据中文分词<br/>`words_segmentation.py`|语料分词：中文WWM <br>输入：jsonl  <br> 输出：0️⃣初始状态| 将jsonl格式的数据，恢复成分词后的原始格式数据 <br> | ```飞桨 是 功能 完备、开源 开放的 产业级 深度学习 平台。``` <br/> ```飞桨 拥有 核心 训练和推理 框架、基础 模型库。``` <br/><br/> ```PaddleNLP 是 自然语言处理领域 的 优秀工具。```
+|2️⃣数据ID化<br/>`create_pretrain_data.py`|预处理| npy格式：数据id化后的token id <br/>npz格式：数据句子、文章位置索引 | -
+|3️⃣训练index文件生成|训练启动|npy格式：<br/> 根据训练步数max_steps生成<br/>train、valid、test的每个样本索引文件| -
+|4️⃣token动态mask（可选）| Dataset取数据 | 无 |-
+
+
+注意：
+- **❇️(可选)数据中文分词** 是中文预训练做 WWM 的可选步骤
+  - 当你的数据比较少时，分词耗时较少，不需要此步骤。直接在`create_pretrain_data.py`步骤中分词即可。
+  - 目的是为了提前分词，加快后续数据ID转化步骤。
+  - 如果这里输入的是 jsonl格式文件，最好为多文件，`trans_to_json.py` 时候开启`no-merge`选项。
+  - 当你的数据集比较大，或者需要尝试多次转换数据的时候，提前分词可以避免`create_pretrain_data.py`时每次都运行一次分词程序。
+- 转换后，需要重新 进行步骤 1️⃣`原始数据转换 trans_to_json.py`，最后2️⃣`数据ID化`步骤设置`--cn_splited=True`参数。
+- 2️⃣`数据ID化`也可以在转化ID的同时，一起实现分词。不需要❇️`数据中文分词`步骤。
 
 
 ## 数据教程汇总
diff --git a/model_zoo/ernie-1.0/preprocess/docs/WuDaoCorpusBase.md b/model_zoo/ernie-1.0/preprocess/docs/WuDaoCorpusBase.md
index 4a88651df42a..2ca81b59cc5e 100644
--- a/model_zoo/ernie-1.0/preprocess/docs/WuDaoCorpusBase.md
+++ b/model_zoo/ernie-1.0/preprocess/docs/WuDaoCorpusBase.md
@@ -5,27 +5,71 @@
 |-|-|-|
 | WuDaoCorpus2.0 Base| 中文 | 200GB |
 
-
 WuDaoCorpora是悟道爬取的中文大规模语料。整体数量为3TB，目前开源的部分为WuDaoCorpus2.0 bases数据集，大小为200GB。
-用户微信登录[官网](https://resource.wudaoai.cn/home)，即可直接下载数据。下载好的压缩数据约 64GB
+
+## 数据获取
+
+**1. 下载解压**
+
+用户微信登录[官网](https://resource.wudaoai.cn/home)，即可直接下载数据。下载好的压缩数据约 64GB。解压
 ```
-64GB WuDaoCorpus2.0_base_200G.rar
+unrar x WuDaoCorpus2.0_base_200G.rar
 ```
+**2. 语料分词**
 
-
+由于WuDao数据集比较大，分词比较耗时，这里先进行了语料分词：
 ```shell
-python wudao_process.py \
-    --input_path WuDaoCorpus2.0_base_200G \
+python words_segmentation.py \
+    --input_path ./WuDaoCorpus2.0_base_200G \
     --workers 40  \
-    --ouput_path ./wudao_lac_cut \
+    --data_format wudao \
+    --cn_seg_func seg \
+    --output_path ./wudao_lac_cut \
 ```
+
 注：预训练需要实现 SOP( Sentence Order Predict) 任务，在分词的同时，我们使用 简单规则 进行了文本断句。如果语料只有一句话，建议去除SOP loss，训练时设置 `binary_head=False`。
 
+**3. 转换为jsonl格式**
+
 文本转化完成后。我们使用 `../data_tools/trans_to_json.py`重新转换为jsonl格式（分词完毕）。
 ```shell
-python ../data_tools/trans_to_json.py  \
+python ./trans_to_json.py  \
     --input_path ./wudao_lac_cut \
     --output_path wudao_corpus_200g_0623.jsonl \
-    --workers 40 \
-    --no-shuffle
+    --workers 40
+```
+在当前目录下产出数据`wudao_corpus_200g_0623.jsonl`。格式如下：
+```
+{"text": "主持人 : 作为 一个 曲线救国 的 路线 我们 没 办法 。\n金鑫 : 考试 和 分数 只是 一个 阶段性 的 评价 手段 , 不是 目的 , 就 像 人 活着 的 目的 不是 为了 吃饭 , 吃饭 是 为了 让 我们 活下去 , 我们 学习 的 目的 不是 为了 考试 , 不是 为了 那个 分数 , 而是 我 掌握 了 知识 , 成为 我 内在 的 能力 , 将来 我 去 创作 创造 工作 , 我能 把 它 做 得 更好 。\n主持人 : 特别感谢 金总 今天 接受 我 的 访谈 , 也 让 我 从 别的 层面 看到 了 一对一 到底 存在 的 道理 是 什么 , 并且 能 发展 那么 好 的 原因 在 哪里 。\n在 节目 后 您 谈谈 您 对 一对一 未来 的 希望 , 包括 您 对 它 未来 的 设想 是 什么 ？\n金鑫 : 一对一 个性化 教育 现在 还是 在 初级阶段 , 如果 是 四个 阶段 的话 , 现在 还是 在 第一阶段 到 第二阶段 迈进 的 , 学大 在 这方面 我们 希望 能 做 得 更 快 更 远 一些 。\n将来 个性化 教育 一定 是 能够 帮助 学生 在 成绩 上 的 提升 , 能够 更好 的 成长 , 进而 成为 对 社会 对 国家 更 有用 的 人才 , 就是 我们 的 成绩 、 成长 、 成才 。\n学大 1 对 1 教育 的 教师 团队 由 各科 优秀教师 、 考试 指导 专家 、 心理 辅导 专家 及 学习 方法 指导 专家 组成 , 同时 配备 专职 班主任 及 学习 监管 师 , 全方位 辅导   顺利 而 有序 的 运作 。\n其中 部分 教师 担任 多年 毕业班 教学 工作 , 多次 参与 中 考试 命题 研究 及 阅卷 工作 , 深谙 中 考试 精髓 , 能够 在 短 的 时间 内 引领 学生 掌握 中 考试 知识   重点 , 快速 提分 。\n■   对于 成绩 差 的 学生 : 注重 学生 基础知识 , 力求 让 学生 在 基础 中 找 自信 , 在 自信 中 提升 ；\n注重 主观题 的 解题 方法 及 思路 , 以此 来 加强 对 基础知识 的 运用 。\n■   对于 成绩 需要 拔高 的 学生 : 找出 学生 弱点 , 加强 基础 , 重点 提高 弱势 项目 。\n"}
+{"text": "武田信玄 是 天生 的 武将 , 一生 开拓 了 八十五万 石至 九十余万 石之多 的 领地 。\n武田信玄  他 21 岁 时 流放 自己 的 父亲 武田信虎  至骏河 , 避免 父亲 传位 给 弟弟 , 从而 登上 了 第 19 代家督 之位 。\n他 将 信 浓国 ( 现 长野县 ) 纳入 控制 范围 后 , 又 与 当时 的 豪强 今井氏 、 北条 氏 结成 三国 军事同盟 , 与 上 杉谦信 在 川 中岛 前后 展开 了 五次 大战 。\n武田信玄  勇于 进攻 。\n他 连续 攻打 邻国 , 扩大 自己 势力范围 , 可称 遇神 杀神 , 遇佛 杀佛 。\n他 不仅 流放 了 自己 的 父亲 , 连 自己 的 嫡子 武田义信 因 与 他 在 战略 方向 上 相左 , 也 被 他 幽禁 于 佛寺 , 随即 被迫 自杀 。\n武田信玄  虽然 是 战国 武将 中 的 最强者 , 但 他 的 弱点 是 年龄 。\n信玄比 织田信长 年长 13 岁 , 比上 杉谦信 年长 9 岁 。\n当信 玄年 届 五十 之 时 , 信长 和 谦信 犹 在 壮年 。\n上杉谦信 而且 , 武田信玄  虽 驰骋 天下 , 却 未率 军 进过 京都 , 而 织田信长 在 永禄 十一年 ( 1568 年 ) 就 以 拥立 第 15 代 将军 足利义 昭 为名 率兵 上洛 了 。\n所谓 \" 制 京都 者 得 天下 \" , 所以 , 想要 一统天下 , 武田信玄  的 时间 很 紧迫 。\n元龟 三年 ( 1572 年 ) , 武田信玄  与 室 町 幕府 第 15 代 将军 足利义 昭 、 本愿 寺 显如 , 以及 浅井 氏 、 朝仓氏 等 反 织田信长 实力 组成 联盟 , 编织 \" 反信长 包围圈 \" 。\n同年 10 月 3 日 , 武田信玄  率领 大军 , 开始 了 第一次 上洛之行 。\n是 年 , 信玄 52 岁 , 这 也许 是 他 统一天下 的 最后 一次 机会 。\n武田信玄 所 率领 的 是 当时 战国 最强 的 3 万甲州 精兵 。\n打着 \" 风林火山 \" 的 旗帜 , 武田军 第一站 就 到达 了 织田信长 的 同盟 德川家康  所在 的 三河 远江 。\n织田信长 德川家康  的 军队 在 甲州 精兵 之前 显得 不堪一击 , 到 了 10 月 13 日 , 只来 成 、 天 方城 、 一 宫城 、 饭田 城 、 各和城 、 向 笠 城 等 城池 纷纷 被 攻陷 。\n德川家康  见势不妙 , 决定 在 浜松 城中 闭门不出 。\n但是 武田信玄  毫不 松懈 , 又 将 家康 在 远江 地区 的 重要 据点 二俣城 攻破 。\n德川家康  集合 所有 军队 共 1 万 1 千人 , 出城 与 信玄 决一死战 , 但 大败 而 还 , 险些 失 了 性命 。\n这次 战争 被 称为 \" 三方 原战 \" , 德川家康  曾经 承认 这次 战争 是 他 生平 最大 的 失败 。\n"}
+```
+
+## ERNIE 中文预训练数据制作
+
+下面是针对训练任务的数据集应用，此处以ernie为例。
+
+```
+python -u  create_pretraining_data.py \
+    --model_name ernie-1.0-base-zh \
+    --tokenizer_name ErnieTokenizer \
+    --input_path wudao_corpus_200g_0623.jsonl \
+    --split_sentences \
+    --chinese \
+    --cn_whole_word_segment \
+    --cn_seg_func jieba \
+    --cn_splited \
+    --output_prefix wudao_corpus_200g_0623 \
+    --workers 48 \
+    --log_interval 10000
+```
+
+- 我们提前分词好了，所以加上了 `cn_splited`，否则不需要使用此选项。
+- model_name 可以更换为其他 ERNIE 系列模型，如: `ernie-3.0-base-zh`
+- workers 表示转化的线程数目
+
+在当前目录下产出训练所需数据。
+```
+wudao_corpus_200g_0623_ids.npy
+wudao_corpus_200g_0623_idx.npz
 ```
+用户可以使用此数据进行预训练任务。
diff --git a/model_zoo/ernie-1.0/preprocess/wudao_process.py b/model_zoo/ernie-1.0/preprocess/words_segmentation.py
similarity index 78%
rename from model_zoo/ernie-1.0/preprocess/wudao_process.py
rename to model_zoo/ernie-1.0/preprocess/words_segmentation.py
index 14443bb55c09..fa42454a099d 100644
--- a/model_zoo/ernie-1.0/preprocess/wudao_process.py
+++ b/model_zoo/ernie-1.0/preprocess/words_segmentation.py
@@ -37,6 +37,16 @@ def get_args():
                         type=str,
                         default="./tmp",
                         help='Path to save the output json files.')
+    parser.add_argument('--data_format',
+                        type=str,
+                        default="jsonl",
+                        choices=["jsonl", "wudao"],
+                        help='Path to you raw files. Folder or file path.')
+    parser.add_argument('--cn_seg_func',
+                        type=str,
+                        default='jieba',
+                        choices=['lac', 'seg', 'jieba'],
+                        help='Words segment function for chinese words.')
     parser.add_argument('--log_interval',
                         type=int,
                         default=1,
@@ -83,34 +93,58 @@ def process(line):
     'jieba': jieba_segmentation_fn(),
 }
 
-special_chars = ['\n', '。', '?', '？', ' ', ';', '；', '！', '!']
-split_chars = ['。', '?', '？', ';', '；', '!', '！']
-
 
-def text_to_text(output_path, path):
-    out_name = os.path.join(output_path, path[-20:])
+def read_wudao(path):
     print("Loading %s" % path)
     with open(path, "r") as f:
         try:
             contents = json.load(f)
         except Exception as e:
             print("Failed to load %s" % path)
-            return 0, None
+            return  # end the generator cleanly; `raise StopIteration` here becomes RuntimeError (PEP 479)
+    for js in contents:
+        yield js["content"]
+
+
+def read_jsonl(path):
+    print("Loading %s" % path)
+    with open(path, "r") as f:
+        line = f.readline()
+        while line:
+            contents = json.loads(line)  # parse the line just read; json.load(f) would consume the rest of the file
+            yield contents["text"]
+            line = f.readline()
+
+
+READFILE_FUNC = {  # reader generators, keyed by the --data_format choice
+    'jsonl': read_jsonl,
+    'wudao': read_wudao,
+}
+
+special_chars = ['\n', '。', '?', '？', ' ', ';', '；', '！', '!']
+split_chars = ['。', '?', '？', ';', '；', '!', '！']
+
+
+def text_to_text(path, output_path, read_func, seg_func):
+    out_name = os.path.join(output_path, path[-20:])
 
     print("Write into %s" % out_name)
     if os.path.exists(out_name):
         print("File exists %s" % out_name)
         return 0, None
 
-    seg_func = CHINESE_SEG_FUNC["seg"]
+    seg_func = CHINESE_SEG_FUNC[seg_func]
+    read_func = READFILE_FUNC[read_func]
+
     import time
     s = time.time()
     data_len = 0
     count = 0
     with open(out_name, "w") as f:
-        for js in contents:
+        for text in read_func(path):
+            # for js in contents:
             count += 1
-            text = js["content"]
+            # text = js["content"]
             data_len += len(text.encode("utf-8"))
             # make special char only once,
             # because of those token will be treat as sentence spliter.
@@ -154,7 +188,10 @@ def main():
     if not os.path.exists(args.output_path):
         os.makedirs(args.output_path)
 
-    trans_func = partial(text_to_text, output_path=args.output_path)
+    trans_func = partial(text_to_text,
+                         output_path=args.output_path,
+                         seg_func=args.cn_seg_func,
+                         read_func=args.data_format)
 
     encoded_files = pool.imap(trans_func, file_paths, 1)
 
diff --git a/model_zoo/ernie-1.0/pretraining_introduction.md b/model_zoo/ernie-1.0/pretraining_introduction.md
index fd1162b16403..b1ee7e4fb315 100644
--- a/model_zoo/ernie-1.0/pretraining_introduction.md
+++ b/model_zoo/ernie-1.0/pretraining_introduction.md
@@ -38,6 +38,9 @@ PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 
 
 ## 1. 数据准备
 
+数据流是预训练中非常重要的一环，[预处理文档](./preprocess/README.md)提供了整体数据变动的流程示意，用户可以查看其中数据制作的细节文档。
+
+
 <a name="大规模中文数据"> </a>
 
 ### 1.1 大规模中文数据
diff --git a/model_zoo/ernie-1.0/vocab/README.md b/model_zoo/ernie-1.0/vocab/README.md
index 29536da9ffa9..acd30634c41d 100644
--- a/model_zoo/ernie-1.0/vocab/README.md
+++ b/model_zoo/ernie-1.0/vocab/README.md
@@ -1,83 +1,43 @@
-# **大规模** **开源** **中文** 语料预训练-<small>从零开始构建预训练模型</small>
+# ERNIE 中文词表制作
 
 ERNIE是百度提出的大规模预训练模型，曾在中文场景下取得了SOTA效果。
-PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 总共400GB，发布大规模开源语料预训练全流程。从零开始，轻松构建预训练模型。
+PaddleNLP致力于预训练开源工作，本文档提供了ERNIE词表的制作方法。
 
-本项目，从数据下载，词表制作，数据转化，模型训练，所有流程，完全开源开放，可复现。
-并训练发布开源最优的模型参数。
-
-接下来将从下面几个方面，详细介绍整个数据制作全流程，从零开始，构建一个预训练模型。
+预训练全部流程的整体详细介绍文档，请参考[ERNIE 中文预训练介绍](../pretraining_introduction.md)。
 
 **目录**
-* [1. **大规模**中文数据](#大规模中文数据)
-* [2. **高精准**中文分词](#高精准中文分词)
-* [3. **全字符**中文词表制作](#中文中文词表制作)
-    - [3.1 分析准备](#分析准备)
-    - [3.2 文本字符统计](#文本字符统计)
-    - [3.3 英文字符词表](#英文字符词表)
-    - [3.4 合并词表](#合并词表)
-* [4. **快速**Token ID 转化](#快速TokenID转化)
-* [5. 参考](#参考)
-
+* [1. 数据获取](#数据获取)
+* [2. 全字符中文词表制作](#全字符中文词表制作)
+    - [2.1 分析准备](#分析准备)
+    - [2.2 文本字符统计](#文本字符统计)
+    - [2.3 英文字符词表](#英文字符词表)
+    - [2.4 合并词表](#合并词表)
+* [3. 词表使用](#vocab_usage)
+    - [3.1 转化为jsonl格式数据](#jsonl)
+    - [3.2 TokenID转化](#快速TokenID转化)
+* [4. 参考](#ref)
 
-<a name="大规模中文数据"> </a>
 
-## 1. 大规模中文数据
+<a name="数据获取"> </a>
 
-**CLUECorpus2020 语料**
+## 1. 数据获取
 
-CLUECorpus2020 过对Common Crawl的中文部分进行语料清洗得到。开源部分提供了约200G左右的语料文本，详细介绍见[官网](https://github.com/CLUEbenchmark/CLUECorpus2020#%E6%95%B0%E6%8D%AE%E4%B8%8B%E8%BD%BD)，用户可以通过邮件申请下载，方式如下：
-> 数据下载
-> 申请方式： 将使用语料研究目的和用途，计划、研究机构和申请者介绍，发送到邮箱，并承诺不向第三方提供。
->
-> 邮箱: CLUEbenchmark@163.com，标题是：CLUECorpus2020 200G语料库
 
 **WuDaoCorpus2.0 Base 语料**
 
-WuDaoCorpora是悟道爬取的中文大规模语料。整体数量为3TB，目前开源的部分为WuDaoCorpus2.0 bases数据集，大小为200GB。
-用户微信登录[官网](https://resource.wudaoai.cn/home)，即可直接下载数据。下载好的压缩数据约 64GB
-```
-64GB WuDaoCorpus2.0_base_200G.rar
-```
+WuDaoCorpora是悟道爬取的中文大规模语料。整体数量为3TB，目前开源的部分为WuDaoCorpus2.0 base数据集，大小为200GB。用户请参考[这里](../preprocess/docs/WuDaoCorpusBase.md)获取原始文本数据。
 
-<a name="高精准中文分词"> </a>
 
-## 2. 高精准中文分词
+**CLUECorpus2020 语料**
 
-ERNIE 使用知识嵌入的方式进行预训练，如何尽可能精确的从原始文本中提取知识，直接关系预训练模型的效果。
-目前PaddleNLP常用的分词方式的有`jieba`，`lac`，`Wordtag`，
-效果、速度对比表格如下，假设CPU使用40线程，GPU使用16卡，处理200G文本：
+CLUECorpus2020 通过对Common Crawl的中文部分进行语料清洗得到。开源部分提供了约200G左右的语料文本，详细介绍见[官网](https://github.com/CLUEbenchmark/CLUECorpus2020#%E6%95%B0%E6%8D%AE%E4%B8%8B%E8%BD%BD)，用户请参考[这里](../preprocess/docs/CLUECorpus2020.md)获取原始文本数据。
 
-| 切词方式 | 效果 | 速度 | 预估耗时
-|-|-|-|-|
-| jieba | 一般 | 607 KB/s |  2.5 h |
-| lac   | 好 | 106 KB/s | 13.9 h
-| wordtag| 最好 | 0.94 KB/s | 159 D (GPU)|
 
-综合考虑分词的效果与速度，我们选择百度的LAC作为我们的文本分词工具。
 
-本文档以WuDao数据为例，对数据进行分词：
-
-```shell
-python wudao_process.py \
-    --input_path WuDaoCorpus2.0_base_200G \
-    --workers 40  \
-    --ouput_path ./wudao_lac_cut \
-```
-注：预训练需要实现 SOP( Sentence Order Predict) 任务，在分词的同时，我们使用 简单规则 进行了文本断句。如果语料只有一句话，建议去除SOP loss，训练时设置 `binary_head=False`。
-
-文本转化完成后。我们使用 `../data_tools/trans_to_json.py`重新转换为jsonl格式（分词完毕）。
-```shell
-python ../data_tools/trans_to_json.py  \
-    --input_path ./wudao_lac_cut \
-    --output_path wudao_corpus_200g_0623.jsonl \
-    --workers 40 \
-    --no-shuffle
-```
 
 <a name="全字符中文词表制作"> </a>
 
-## 3. 全字符中文词表制作
+## 2. 全字符中文词表制作
 
 词表的制作有两种方案：
 
@@ -93,7 +53,7 @@ python ../data_tools/trans_to_json.py  \
 第二种方案需要对文本先使用`BasicTokenizer`切分一遍语料。
 第一种方案，自定义程度高，但存在一些局限性。本项目采用了第一种方案，详细介绍如下：
 
-### 3.1 分析准备
+### 2.1 分析准备
 词表大小： 这里我们考虑的因素主要有两个
 - 已有模型对照：
     - ERNIE 3.0系列模型的词表，词表大小为 40000 左右。
@@ -108,7 +68,7 @@ python ../data_tools/trans_to_json.py  \
 - 其他字符约 `2000` 左右
 
 
-### 3.2 文本字符统计
+### 2.2 文本字符统计
 首先第一步是对文本字符进行统计。字符统计的目的主要是添加常用的中文字符、特殊字符。
 
 由于语料文本过大，我们随机选取 10G 左右的原始文本进行了字符统计。
@@ -120,7 +80,7 @@ python gen_char.py path_to_corpus.txt
 wget https://paddlenlp.bj.bcebos.com/models/transformers/data_tools/char_dict.pickle
 ```
 
-### 3.3 英文字符词表
+### 2.3 英文字符词表
 基于字符的词频统计，使得英文字符也切割为字母，为此我们需要添加英文词表。
 英文部分，我们使用了 [WikiText](https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip)  数据集，来构造词表。
 下载解压数据，使用BPE切词
@@ -135,7 +95,7 @@ wget https://paddlenlp.bj.bcebos.com/models/transformers/data_tools/eng.vocab
 ```
 
 
-### 3.4 合并词表
+### 2.4 合并词表
 
 目前我们得到了字符统计表，和英文字符词表。下一步，我们将词表进行合并。
 
@@ -149,7 +109,7 @@ python merge_vocab.py
 1. 对于一些日文、谚文文字字符，需要进行 normalize
 2. 添加special_tokens
 
-### 3.5 问题遗留
+### 2.5 问题遗留
 本项目采用的第一种方式，即拼接产出的词表，对连续非中、英文字符文本，会出现UNK的情况。
 如issue: [#2927](https://github.com/PaddlePaddle/PaddleNLP/issues/2927)、 [#2585](https://github.com/PaddlePaddle/PaddleNLP/issues/2585)。本项目做了两点改进:
 
@@ -159,7 +119,7 @@ python merge_vocab.py
 虽然有上述两点修复，任然无法避免 [#2927](https://github.com/PaddlePaddle/PaddleNLP/issues/2927) 现象。
 彻底解决的话，建议使用第二种方式制作vocab文件。
 
-### 3.6 方案二：预处理后直接生成
+### 2.6 方案二：预处理后直接生成
 此方案没有被采用，这里也简单说明一下具体的方案：
 1. 对语料使用 BasicTokenizer 转换
 ```python
@@ -176,42 +136,66 @@ python gen_vocab.py afer_basic_toknizer_corpus.txt
 对处理好的vocab文件手动替换一些`<pad> -> [PAD]`之类的special_tokens，即可产出词表。
 
 
-<a name="快速TokenID转化"> </a>
+<a name="vocab_usage"></a>
+## 3. 词表使用
 
-## 4. 快速Token ID 转化
+<a name="jsonl"> </a>
 
-预料、词表准备妥当后，我们可以开始进行最后的数据ID转化。
+### 3.1 转化为jsonl格式数据
 
-- 高效的 Multiprocessing 多进程实现
-- 使用内存BytesIO存储ID数据
+本文档以WuDao数据为例，对数据进行分词：
 
-由于转换的逻辑复杂，需要定义`class Converter`对象来进行转化处理。如果每次处理新的文本，都实例化一次class对象，速度瓶颈会在处理函数的实例化。
-我们使用了提前multiprocessing.Pool的`initializer`，对处理函数进行提前实例化，提高处理效率。
+```shell
+python ../preprocess/words_segmentation.py \
+    --input_path ./WuDaoCorpus2.0_base_200G \
+    --workers 40  \
+    --data_format wudao \
+    --cn_seg_func seg \
+    --output_path ./wudao_lac_cut
+```
 
-处理后的token id数量巨大，可以达到数百Billion，如果使用普通的数据结构，如python的list保存，会出现存储瓶颈，不仅占用空间大，list对象还需要重新分配内存空间。这里我们采用了 BytesIO 的方式，类似写入内存文件的方式，速度快，可以非常方便转化为numpy文件保存。
+文本转化完成后，我们使用 `../preprocess/trans_to_json.py`重新转换为jsonl格式（分词完毕）。
+```shell
+python ../preprocess/trans_to_json.py  \
+    --input_path ./wudao_lac_cut \
+    --output_path wudao_corpus_200g_0623.jsonl \
+    --workers 40
+```
 
-使用 Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz CPU测试，40线程，处理速度 8+MB/s，约7个小时左右，即可完成 200GB 文本转化为ID.
+<a name="快速TokenID转化"> </a>
+
+### 3.2 Token ID 转化
+
+语料、新建的词表准备妥当后，我们可以开始进行最后的数据ID转化。
 
 ```
-python -u  ../data_tools/create_pretraining_data.py \
-    --model_name ./vocab_path/vocab.txt \
+python -u  ../preprocess/create_pretraining_data.py \
+    --model_name /path/to/your/vocab.txt \
     --tokenizer_name ErnieTokenizer \
     --input_path wudao_corpus_200g_0623.jsonl \
-    --split_sentences\
+    --split_sentences \
     --chinese \
-    --cn_splited \
     --cn_whole_word_segment \
-    --output_prefix wudao_200g_0703 \
-    --workers 40 \
-    --log_interval 1000
+    --cn_seg_func jieba \
+    --cn_splited \
+    --output_prefix wudao_corpus_200g_0623 \
+    --workers 48 \
+    --log_interval 10000
 ```
+
+- 我们提前分词好了，所以加上了 `cn_splited`，否则不需要使用此选项。
+- model_name 指定为我们准备的词表路径。也可以更换为其他 ERNIE 系列模型，如: `ernie-3.0-base-zh`
+- workers 表示转化的线程数目
+
 转化后的数据如下，使用这份数据，即可开始ERNIE预训练
 ```
 -rw-rw-r-- 1 500 501 129G Jul  4 03:39 wudao_200g_0703_ids.npy
 -rw-rw-r-- 1 500 501 6.4G Jul  4 03:39 wudao_200g_0703_idx.npz
 ```
 
-## 5. 参考
+<a name='ref'></a>
+## 4. 参考
+
 感谢CLUE，WuDao提供的开源文本语料，参考资料：
 - Xu, L., Zhang, X. and Dong, Q., 2020. CLUECorpus2020: A large-scale Chinese corpus for pre-training language model. arXiv preprint arXiv:2003.01355.
 - Yuan, S., Zhao, H., Du, Z., Ding, M., Liu, X., Cen, Y., Zou, X., Yang, Z. and Tang, J., 2021. Wudaocorpora: A super large-scale chinese corpora for pre-training language models. AI Open, 2, pp.65-68.

From c0b64796d4047af82055c3168a114b0c4d885bc4 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Thu, 1 Sep 2022 19:26:48 +0800
Subject: [PATCH 44/48] fix link

---
 docs/FAQ.md                                   |  2 +-
 model_zoo/ernie-1.0/README.md                 | 43 ++++++++++++++-----
 .../ernie-1.0/preprocess/docs/OpenWebText2.md |  2 +-
 .../ernie-1.0/pretraining_introduction.md     |  9 ++--
 model_zoo/gpt/README.md                       |  2 +-
 5 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/docs/FAQ.md b/docs/FAQ.md
index e58e3da290e2..713c0783a0ba 100644
--- a/docs/FAQ.md
+++ b/docs/FAQ.md
@@ -182,7 +182,7 @@ emb.set_state_dict(load_layer_state_dict) # 加载模型参数
 
 **A:** 预训练模型通常会有配套的tokenzier和词典，对于大多数中文预训练模型，如ERNIE-3.0，使用的都是字粒度的输入，tokenzier会将句子转换为字粒度的形式，模型无法收到词粒度的输入。如果希望引入额外的词典，需要修改预训练模型的tokenizer和词典，可以参考这里[blog](https://kexue.fm/archives/7758/comment-page-1#Tokenizer )，另外注意embedding矩阵也要加上这些新增词的embedding表示。
 
-另外还有一种方式可以使用这些字典信息，可以将数据中在词典信息中的词进行整体mask进行一个mask language model的二次预训练，这样经过二次训练的模型就包含了对额外字典的表征。可参考 [PaddleNLP 预训练数据流程](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/ernie-1.0/data_tools)。
+另外还有一种方式可以使用这些字典信息，可以将数据中在词典信息中的词进行整体mask进行一个mask language model的二次预训练，这样经过二次训练的模型就包含了对额外字典的表征。可参考 [PaddleNLP 预训练数据流程](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/ernie-1.0/)。
 
 
 此外还有些词粒度及字词混合粒度的预训练模型，在这些词粒度的模型下引入额外的词表也会容易些，我们也将持续丰富PaddleNLP中的预训练模型。
diff --git a/model_zoo/ernie-1.0/README.md b/model_zoo/ernie-1.0/README.md
index 747aa44ef7c0..62820075ce13 100644
--- a/model_zoo/ernie-1.0/README.md
+++ b/model_zoo/ernie-1.0/README.md
@@ -70,6 +70,11 @@ Learnt by ERNIE：[mask] [mask] [mask] 是黑龙江的省会，国际 [mask] [ma
 │   ├── run_seq_cls.py  序列分类任务运行脚本
 │   └── utils.py
 ├── README.md  说明文档
+├── pretraining_introduction.md 中文预训练详细介绍文档
+├── preprocess
+│   ├── docs                部分数据制作文档，包括CLUECorpusSmall，WuDaoCorpusBase
+│   └── xxx.py              文件处理的python脚本。
+├── vocab                   全中文字符词表制作教程
 ├── run_gb512_s1m.sh        训练启动shell脚本，batch size 512. max steps 100w
 ├── run_gb512_s1m_static.sh
 ├── run_gb512_s1m_trainer.sh
@@ -82,17 +87,20 @@ Learnt by ERNIE：[mask] [mask] [mask] 是黑龙江的省会，国际 [mask] [ma
 
 ### 1.2 环境依赖
 
+- tool_helpers
 - visualdl
 - pybind11
 
-安装命令 `pip install visualdl pybind11`
-
+安装命令 `pip install visualdl pybind11 tool_helpers`
 
 <a name="中文预训练"></a>
 
 ## 2. 中文预训练
+
 ERNIE预训练采用的是MLM（Mask Language Model）的训练方式，采用WWM（Whole Word Mask）方式，对于完整语义单元的Token，会同时进行Mask。整体的训练损失loss是mlm_loss + sop_loss。
 
+ERNIE 中文预训练更详细的介绍文档请参见[ERNIE 中文预训练介绍](./pretraining_introduction.md)。
+
 
 本样例为用户提供了高效的训练流程，
 - **支持动态文本mask**： 用户可以根据自己的需求，灵活修改mask方式。具体可以参考修改`data_tools/dataset_utils.py`中`create_masked_lm_predictions`函数。
@@ -105,7 +113,7 @@ ERNIE预训练采用的是MLM（Mask Language Model）的训练方式，采用WW
 下面是使用CLUECorpusSmall 14G文本进行预训练的流程：
 
 <details>
-<summary><b>CLUECorpusSmall 数据集预训练</b></summary>
+<summary><b>CLUECorpusSmall 数据准备</b></summary>
 
 #### 数据准备
 数据下载部分请参考[data_tools](./data_tools)目录，根据文档中`CLUECorpusSmall 数据集处理教程`，下载数据。下载好后:
@@ -141,6 +149,13 @@ clue_corpus_small_14g_20220104_ids.npy
 clue_corpus_small_14g_20220104_idx.npz
 ```
 
+</details>
+
+
+<details>
+<summary><b>CLUECorpusSmall 开始训练</b></summary>
+
+
 ####  开始训练
 
 将制作好的数据`clue_corpus_small_14g_20220104_ids.npy,clue_corpus_small_14g_20220104_idx.npz`移动到input_dir中，即可开始训练。
@@ -205,7 +220,12 @@ python -u  -m paddle.distributed.launch \
 注：
 - 训练支持断点重启，直接启动即可，程序会找到最新的checkpoint(`output_dir/model_last`)，开始重启训练。请确保重启的训练配置与之前相同。
 - visualdl的日志在 `./output/ernie-1.0-dp8-gb512/train_log/xxx` 中。
+</details>
+
+
 
+<details>
+<summary><b>CLUECorpusSmall 数据集训练效果</b></summary>
 
 #### CLUECorpusSmall 数据集训练效果
 
@@ -238,6 +258,7 @@ ERINE-1.0-cluecorpussmall | 12L768H | 73.24(-0.54) | 74.26 | 57.24 | 60.79 | 81.
 注:
 - `ERNIE-1.0 Base`官方预训练参数，采用的训练配置是batch_size=1024、steps=100w，
 - `ERINE-1.0-cluecorpussmall`复现版本，采用的是batch_size=512、steps=100w。
+
 </details>
 
 <a name="ERNIE-CW"></a>
@@ -246,7 +267,7 @@ ERINE-1.0-cluecorpussmall | 12L768H | 73.24(-0.54) | 74.26 | 57.24 | 60.79 | 81.
 
 PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 总共400GB，提供大规模语料训练教程，让用户可以从零开始构建，基于大规模语料，训练预训练模型。
 
-本教程，从数据下载，词表制作，数据转化，模型训练，所有流程，完全开源开放，可复现。
+[ERNIE 中文预训练介绍](./pretraining_introduction.md)，从数据下载，词表制作，数据转化，模型训练，所有流程，完全开源开放，可复现。
 并训练发布开源最优的模型参数。
 
 #### 数据准备
@@ -255,16 +276,14 @@ PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 
 - [CLUECorpus2020数据处理](./preprocess/docs/CLUECorpus2020.md)
 - [WuDaoCorpusBase数据处理](./preprocess/docs/WuDaoCorpusBase.md)
 
-如果需要定制化词表，词表制作部分请参考[词表制作](./vocab/README.md)
+如果需要定制化词表，词表制作部分请参考[词表制作](./vocab/README.md)。
 
 
-###  开始训练
-
-### 3.1 训练脚本
+#### 训练脚本
 
 训练脚本如下
 
-<b>环境配置</b>
+**环境配置**
 
 - PYTHONPATH 设置为当前目录（适合paddlenlp develop运行）
 - 设置了一些FLAGS，包括增强报错，动态图Flag，提高矩阵乘法精度。
@@ -287,7 +306,7 @@ unset CUDA_VISIBLE_DEVICES
 ```
 </details>
 
-<b>路径配置</b>
+**路径配置**
 
 - 主要配置输入输出目录
 - 这里的`vocab_dir`如果没有使用自定义词表的话，请设置为内置的tokenizer，如`ernie-1.0-base-zh,ernie-3.0-base-zh`等。
@@ -307,7 +326,9 @@ vocab_dir="${base_nfs}/"
 ```
 </details>
 
-**启动训练**：这里启动的是单机8卡任务，整体全局的batch_size 512 (64*8)。如果指定ips参数，进行多机运行，如 `python3 -u  -m paddle.distributed.launch  --gpus "0,1,2,3,4,5,6,7" --ips 192.168.1.101,192.168.1.101 `
+**启动训练**：
+
+这里启动的是单机8卡任务，整体全局的batch_size 512 (64*8)。如果指定ips参数，进行多机运行，如 `python3 -u  -m paddle.distributed.launch  --gpus "0,1,2,3,4,5,6,7" --ips 192.168.1.101,192.168.1.101 `
 ```shell
 python3 -u  -m paddle.distributed.launch \
     --gpus "0,1,2,3,4,5,6,7" \
diff --git a/model_zoo/ernie-1.0/preprocess/docs/OpenWebText2.md b/model_zoo/ernie-1.0/preprocess/docs/OpenWebText2.md
index fd0830aeadce..03766a70cac5 100644
--- a/model_zoo/ernie-1.0/preprocess/docs/OpenWebText2.md
+++ b/model_zoo/ernie-1.0/preprocess/docs/OpenWebText2.md
@@ -18,7 +18,7 @@ tar -xvf openwebtext2.json.zst.tar -C  /path/to/openwebtext
 
 ## GPT训练数据制作
 
-然后使用[data_tools](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/gpt/../ernie-1.0/data_tools) 工具下的`create_pretraining_data.py`脚本进行数据集制作：
+然后使用[preprocess](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/ernie-1.0/preprocess) 工具下的`create_pretraining_data.py`脚本进行数据集制作：
 ```
 python -u  create_pretraining_data.py \
     --model_name gpt2-en \
diff --git a/model_zoo/ernie-1.0/pretraining_introduction.md b/model_zoo/ernie-1.0/pretraining_introduction.md
index b1ee7e4fb315..c6967ad26226 100644
--- a/model_zoo/ernie-1.0/pretraining_introduction.md
+++ b/model_zoo/ernie-1.0/pretraining_introduction.md
@@ -81,9 +81,9 @@ python wudao_process.py \
 ```
 注：预训练需要实现 SOP( Sentence Order Predict) 任务，在分词的同时，我们使用 简单规则 进行了文本断句。如果语料只有一句话，建议去除SOP loss，训练时设置 `binary_head=False`。
 
-文本转化完成后。我们使用 `../data_tools/trans_to_json.py`重新转换为jsonl格式（分词完毕）。
+文本转化完成后。我们使用 `./preprocess/trans_to_json.py`重新转换为jsonl格式（分词完毕）。
 ```shell
-python ../data_tools/trans_to_json.py  \
+python ./preprocess/trans_to_json.py  \
     --input_path ./wudao_lac_cut \
     --output_path wudao_corpus_200g_0623.jsonl \
     --workers 40 \
@@ -108,7 +108,7 @@ python ../data_tools/trans_to_json.py  \
 使用 Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz CPU测试，40线程，处理速度 8+MB/s，约7个小时左右，即可完成 200GB 文本转化为ID.
 
 ```
-python -u  ../data_tools/create_pretraining_data.py \
+python -u  ./preprocess/create_pretraining_data.py \
     --model_name ./vocab_path/vocab.txt \
     --tokenizer_name ErnieTokenizer \
     --input_path wudao_corpus_200g_0623.jsonl \
@@ -511,8 +511,9 @@ RoBERTa-wwm-ext-large | 24L1024H | 76.61 |    76.00 |    59.33 |    62.02 |    8
 
 ## 5. 参考文献
 
-感谢CLUE，WuDao提供的开源文本语料，参考资料：
+感谢CLUE，WuDao提供的开源文本语料，主要数据流部分参考自[Megatron](https://github.com/NVIDIA/Megatron-LM)，参考资料：
 - Xu, L., Zhang, X. and Dong, Q., 2020. CLUECorpus2020: A large-scale Chinese corpus for pre-training language model. arXiv preprint arXiv:2003.01355.
 - Yuan, S., Zhao, H., Du, Z., Ding, M., Liu, X., Cen, Y., Zou, X., Yang, Z. and Tang, J., 2021. Wudaocorpora: A super large-scale chinese corpora for pre-training language models. AI Open, 2, pp.65-68.
 - https://github.com/CLUEbenchmark/CLUECorpus2020
 - https://resource.wudaoai.cn
+- https://github.com/NVIDIA/Megatron-LM
diff --git a/model_zoo/gpt/README.md b/model_zoo/gpt/README.md
index 65e4b62aa43a..7c6920ca7c22 100644
--- a/model_zoo/gpt/README.md
+++ b/model_zoo/gpt/README.md
@@ -53,7 +53,7 @@ wget https://mystic.the-eye.eu/public/AI/pile_preliminary_components/openwebtext
 tar -xvf openwebtext2.json.zst.tar -C  /path/to/openwebtext
 ```
 
-然后使用[data_tools](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/gpt/../ernie-1.0/data_tools) 工具下的`create_pretraining_data.py`脚本进行数据集制作：
+然后使用[preprocess](https://github.com/PaddlePaddle/PaddleNLP/tree/develop/model_zoo/gpt/../ernie-1.0/preprocess) 工具下的`create_pretraining_data.py`脚本进行数据集制作：
 ```
 python -u  create_pretraining_data.py \
     --model_name gpt2-en \

From 4ac204f7ac6a102c83af388c47023abc139e207b Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Fri, 2 Sep 2022 14:39:30 +0800
Subject: [PATCH 45/48] fix bug

---
 model_zoo/ernie-1.0/preprocess/create_pretraining_data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/model_zoo/ernie-1.0/preprocess/create_pretraining_data.py b/model_zoo/ernie-1.0/preprocess/create_pretraining_data.py
index e59844d5b352..96082bebe513 100644
--- a/model_zoo/ernie-1.0/preprocess/create_pretraining_data.py
+++ b/model_zoo/ernie-1.0/preprocess/create_pretraining_data.py
@@ -271,7 +271,7 @@ def process(text):
                 pre_dimer = False
                 for index, w in enumerate(words):
                     if pre_dimer and len(w) == 0:
-                        words[index] = " "
+                        words[index] = self.args.cn_split_dimer
                         pre_dimer = False
                     elif len(w) == 0:
                         pre_dimer = True

From 02e35a53db1dff98528d750f9be65ec4e6dbc84e Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Wed, 7 Sep 2022 19:30:56 +0800
Subject: [PATCH 46/48] fix documents.

---
 examples/benchmark/clue/README.md             |  40 +++---
 model_zoo/ernie-1.0/README.md                 |   9 ++
 .../ernie-1.0/pretraining_introduction.md     | 133 +++++++++++++++---
 model_zoo/ernie-3.0/README.md                 |  88 ++++++------
 4 files changed, 187 insertions(+), 83 deletions(-)

diff --git a/examples/benchmark/clue/README.md b/examples/benchmark/clue/README.md
index 8ab13498068f..58b4a294558a 100644
--- a/examples/benchmark/clue/README.md
+++ b/examples/benchmark/clue/README.md
@@ -188,37 +188,37 @@
                 <span style="font-size:18px">ERNIE 3.0-Xbase-zh</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>78.71</b></span>
+                <span style="font-size:18px"><b>78.39</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>76.85</b></span>
+                <span style="font-size:18px"><b>76.16</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>59.89</b></span>
+                <span style="font-size:18px"><b>59.55</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>62.41</b></span>
+                <span style="font-size:18px"><b>61.87</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>84.76</b></span>
+                <span style="font-size:18px"><b>84.40</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>82.51</b></span>
+                <span style="font-size:18px"><b>81.73</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>89.80</b></span>
+                <span style="font-size:18px"><b>88.82</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>84.47</b></span>
+                <span style="font-size:18px"><b>83.60</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>75.49/92.67</b></span>
+                <span style="font-size:18px"><b>75.99/93.00</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>86.36</b></span>
+                <span style="font-size:18px"><b>86.78</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>84.59</b></span>
+                <span style="font-size:18px"><b>84.98</b></span>
             </td>
         </tr>
         <tr>
@@ -307,31 +307,31 @@
                 <span style="font-size:18px">ERNIE 2.0-Base-zh</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">74.95</span>
+                <span style="font-size:18px">74.32</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">76.25</span>
+                <span style="font-size:18px">75.65</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">58.53</span>
+                <span style="font-size:18px">58.25</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">61.72</span>
+                <span style="font-size:18px">61.64</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">83.07</span>
+                <span style="font-size:18px">82.62</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">78.81</span>
+                <span style="font-size:18px">78.71</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">84.21</span>
+                <span style="font-size:18px">81.91</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">82.77</span>
+                <span style="font-size:18px">82.33</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">68.22/88.71</span>
+                <span style="font-size:18px">66.08/87.46</span>
             </td>
             <td style="text-align:center">
                 <span style="font-size:18px">82.78</span>
diff --git a/model_zoo/ernie-1.0/README.md b/model_zoo/ernie-1.0/README.md
index 62820075ce13..8d6c4b9fddbd 100644
--- a/model_zoo/ernie-1.0/README.md
+++ b/model_zoo/ernie-1.0/README.md
@@ -328,6 +328,15 @@ vocab_dir="${base_nfs}/"
 
 **启动训练**：
 
+对于`ernie-3.0-base-zh`我们提供了悟道的一个小规模样本的数据：
+```
+mkdir data && cd data
+wget https://paddlenlp.bj.bcebos.com/models/transformers/data_tools/wudao_200g_sample_ernie-3.0-bash-zh_ids.npy
+wget https://paddlenlp.bj.bcebos.com/models/transformers/data_tools/wudao_200g_sample_ernie-3.0-bash-zh_idx.npz
+cd -
+```
+可以指定`tokenizer_name_or_path=ernie-3.0-base-zh`,`input_dir=./data` 用下面的脚本训练。
+
 这里启动的是单机8卡任务，整体全局的batch_size 512 (64*8)。如果指定ips参数，进行多机运行，如 `python3 -u  -m paddle.distributed.launch  --gpus "0,1,2,3,4,5,6,7" --ips 192.168.1.101,192.168.1.101 `
 ```shell
 python3 -u  -m paddle.distributed.launch \
diff --git a/model_zoo/ernie-1.0/pretraining_introduction.md b/model_zoo/ernie-1.0/pretraining_introduction.md
index c6967ad26226..6f338f0eb07c 100644
--- a/model_zoo/ernie-1.0/pretraining_introduction.md
+++ b/model_zoo/ernie-1.0/pretraining_introduction.md
@@ -34,6 +34,15 @@ PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 
   <img src="https://user-images.githubusercontent.com/16911935/187170152-0778a6c1-6510-4c01-84d0-8e0ea3c05231.png" align="middle"  width="500" />
 </p>
 
+
+**环境依赖**
+
+- tool_helpers
+- visualdl
+- pybind11
+
+安装命令 `pip install visualdl pybind11 tool_helpers`
+
 <a name="数据准备"> </a>
 
 ## 1. 数据准备
@@ -45,6 +54,12 @@ PaddleNLP致力于预训练开源工作，使用开源中文语料CLUE、WuDao 
 
 ### 1.1 大规模中文数据
 
+模型的根本是数据，大数据才能有望获得更好的训练效果。我们希望语料有如下特点:
+- **大规模**：目前像ERNIE-3.0，GPT-3，CPM等模型，动辄数T的文本语料。而目前开源的一些中文模型，却是基于15G左右的CLUECorpus语料训练，大大限制了模型的效果。
+- **开源开放**：为了让用户也可以比较容易复现整体的数据流程，采用的数据希望是**开源**的，人人可以获取的。
+
+综上，我们选用的语料为 CLUECorpus2020 语料 200G， WuDaoCorpus2.0 Base 语料 200G。
+
 **CLUECorpus2020 语料**
 
 CLUECorpus2020 过对Common Crawl的中文部分进行语料清洗得到。开源部分提供了约200G左右的语料文本，详细介绍见[官网](https://github.com/CLUEbenchmark/CLUECorpus2020#%E6%95%B0%E6%8D%AE%E4%B8%8B%E8%BD%BD)，用户可以通过邮件申请下载。
@@ -52,15 +67,25 @@ CLUECorpus2020 过对Common Crawl的中文部分进行语料清洗得到。开
 **WuDaoCorpus2.0 Base 语料**
 
 WuDaoCorpora是悟道爬取的中文大规模语料。整体数量为3TB，目前开源的部分为WuDaoCorpus2.0 bases数据集，大小为200GB。
-用户微信登录[官网](https://resource.wudaoai.cn/home)，即可直接下载数据。下载好的压缩数据约 64GB
+用户微信登录[官网](https://resource.wudaoai.cn/home)，即可直接下载数据。下载好的压缩数据约 64GB。
+
+
+为了方便用户测试，我们提供了少量part的WuDao数据供大家使用，（如有侵权，请联系我们删除）
+```
+wget https://paddlenlp.bj.bcebos.com/models/transformers/data_tools/WuDaoCorpus2.0_base_200G_sample.tar.gz
+tar -xvf WuDaoCorpus2.0_base_200G_sample.tar.gz
+```
+用户可以用这份数据跑完后续全程。数据量约为2GB。
 
 
 <a name="高精准中文分词"> </a>
 
 ### 1.2 高精准中文分词
 
-ERNIE 使用知识嵌入的方式进行预训练，如何尽可能精确的从原始文本中提取知识，直接关系预训练模型的效果。
-目前PaddleNLP常用的分词方式的有`jieba`，`lac`，`Wordtag`，
+ERNIE 使用知识嵌入的方式进行预训练。文本中的知识，比如文本中的人名、地名、成语、短语等都是知识。如何把这些知识训练融合到模型中呢？ERNIE给出的方案，是对这些知识短语一起MASK，然后预测，也就是Whole Words MASK。
+
+在我们数据处理层面，如何尽可能精确的从原始文本中提取知识，直接关系预训练模型的效果。我们对目前PaddleNLP常用的分词方式`jieba`，`lac`，`Wordtag`进行了分析：`jieba`采用HMM隐马尔可夫模型，`lac`是LSTM模型，`wordtag`是基于Transformer的模型。
+
 效果、速度对比表格如下，假设CPU使用40线程，GPU使用16卡，处理200G文本：
 
 | 切词方式 | 效果 | 速度 | 预估耗时
@@ -71,14 +96,19 @@ ERNIE 使用知识嵌入的方式进行预训练，如何尽可能精确的从
 
 综合考虑分词的效果与速度，我们选择百度的LAC作为我们的文本分词工具。
 
+
 本文档以WuDao数据为例，对数据进行分词：
 
+
 ```shell
-python wudao_process.py \
-    --input_path WuDaoCorpus2.0_base_200G \
+python ./preprocess/words_segmentation.py \
+    --input_path ./WuDaoCorpus2.0_base_200G \
     --workers 40  \
-    --ouput_path ./wudao_lac_cut \
+    --data_format wudao \
+    --cn_seg_func seg \
+    --output_path ./wudao_lac_cut
 ```
+
 注：预训练需要实现 SOP( Sentence Order Predict) 任务，在分词的同时，我们使用 简单规则 进行了文本断句。如果语料只有一句话，建议去除SOP loss，训练时设置 `binary_head=False`。
 
 文本转化完成后。我们使用 `./preprocess/trans_to_json.py`重新转换为jsonl格式（分词完毕）。
@@ -89,6 +119,11 @@ python ./preprocess/trans_to_json.py  \
     --workers 40 \
     --no-shuffle
 ```
+使用 WuDaoCorpus2.0_base_200G_sample.tar.gz 数据可以得到jsonl文本为:
+```
+wget https://paddlenlp.bj.bcebos.com/models/transformers/data_tools/wudao_corpus_200g_sample.jsonl
+```
+用户可以下载处理好的数据，进行tokenizer转换。
 
 
 <a name="快速TokenID转化"> </a>
@@ -109,7 +144,7 @@ python ./preprocess/trans_to_json.py  \
 
 ```
 python -u  ./preprocess/create_pretraining_data.py \
-    --model_name ./vocab_path/vocab.txt \
+    --model_name ernie-3.0-base-zh \
     --tokenizer_name ErnieTokenizer \
     --input_path wudao_corpus_200g_0623.jsonl \
     --split_sentences\
@@ -120,6 +155,7 @@ python -u  ./preprocess/create_pretraining_data.py \
     --workers 40 \
     --log_interval 1000
 ```
+
 此处需要指定词表文件进行ID转化，用户可以使用paddlenlp内置的部分词表如`ernie-1.0-base-zh,ernie-3.0-base-zh`，设置`model_name`参数为对应参数名即可。
 也可以根据自己的需求，重新开始制作词表，然后`model_name`传入词表所在的文件夹目录即可。词表制作，请参考下一章节[全字符中文词表制作](#全字符中文词表制作)。
 
@@ -128,21 +164,36 @@ python -u  ./preprocess/create_pretraining_data.py \
 -rw-rw-r-- 1 500 501 129G Jul  4 03:39 wudao_200g_0703_ids.npy
 -rw-rw-r-- 1 500 501 6.4G Jul  4 03:39 wudao_200g_0703_idx.npz
 ```
+同样，对于 WuDaoCorpus2.0_base_200G_sample.tar.gz 数据，使用`ernie-3.0-base-zh`的tokenizer，可以得到数据。
+```
+mkdir data && cd data
+wget https://paddlenlp.bj.bcebos.com/models/transformers/data_tools/wudao_200g_sample_ernie-3.0-bash-zh_ids.npy
+wget https://paddlenlp.bj.bcebos.com/models/transformers/data_tools/wudao_200g_sample_ernie-3.0-bash-zh_idx.npz
+cd -
+```
 
 <a name="中文词表制作"> </a>
 
 ### 2. 全字符中文词表制作
 
-之前的 数据 id 化中，使用了已有的词表进行转化，当没有词表时，需要从头开始进行词表制作。这里提供了ERNIE模型词表制作的两种方案：
+之前的 数据 id 化中，使用了已有的词表进行转化，当没有词表时，需要从头开始进行词表制作。如果你没有制作新词表的需求，请跳过此部分，直接阅读 [第三节，开始训练](#开始训练)。
+
+那制作ERNIE的词表有什么特点需要注意呢？常见的方法是使用 sentencepiece 切词，使用BPE去找通用的子词串。但是，ERNIE之类的中文模型，是属于字模型，不会出现连续汉字作为子词 如`##中国`。一般是通过 BasicTokenizer，给所有中文汉字之间，添加空格，然后再去切分 子词 subword，这样每个汉字就都是独立的。
+```
+china -> ch ##ina
+我爱china -> 我 爱 china -> 我 爱 ch ##ina
+```
 
-第一种，词表组合方案
-1. 统计字符
-2. 制作英文词表
-3. 合并词表
+这里提供了ERNIE模型词表制作的两种方案：
 
-第二种，预处理后直接生成，方案
-1. 文本预处理（中文加空格，文本normalize）
-2. 使用sentencepeice制作词表
+- 第一种，词表组合方案
+    1. 统计字符
+    2. 制作英文词表
+    3. 合并词表
+
+- 第二种，预处理后直接生成，方案
+    1. 文本预处理（中文加空格，文本normalize）
+    2. 使用sentencepiece制作词表
 
 第二种方案需要对文本先使用`BasicTokenizer`切分一遍语料。
 第一种方案，自定义程度高，但存在一些局限性。本项目采用了第一种方案，详细介绍如下：
@@ -240,10 +291,9 @@ python ./vocab/gen_vocab.py afer_basic_toknizer_corpus.txt
 
 ### 3.1 训练脚本
 
-训练脚本如下
+训练脚本如下。环境配置和路径配置，不是必要的，如果用户只想简单训练，可以直接跳到[继续训练](#继续训练)部分，直接训练。
 
 <b>环境配置</b>
-
 - PYTHONPATH 设置为当前目录（适合paddlenlp develop运行）
 - 设置了一些FLAGS，包括增强报错，动态图Flag，提高矩阵乘法精度。
 - 多机情况下，可以设置`NCCL_SOCKET_IFNAME`指明NCCL使用的通信网口。
@@ -286,6 +336,7 @@ vocab_dir="${base_nfs}/"
 </details>
 
 **启动训练**：这里启动的是单机8卡任务，整体全局的batch_size 512 (64*8)。如果指定ips参数，进行多机运行，如 `python3 -u  -m paddle.distributed.launch  --gpus "0,1,2,3,4,5,6,7" --ips 192.168.1.101,192.168.1.101 `
+
 ```shell
 python3 -u  -m paddle.distributed.launch \
     --gpus "0,1,2,3,4,5,6,7" \
@@ -349,6 +400,50 @@ python3 -u  -m paddle.distributed.launch \
 - `device` 训练设备，默认为GPU。
 - `share_folder` 多机训练时，如果多机`input_dir`为挂载的同一个nfs网络位置，可以开启次选项，多机共享同一份数据。（每次运行，会制作训练的index数据，如果为挂载的统一nfs位置，则一台机器制作数据即可，否则每台机器都需要制作）
 
+<b>继续训练</b>
+<a name="继续训练"> </a>
+
+很多同学的需求，是从已有的预训练参数开始，继续训练过程，这里我们使用前面教程提供的`WuDaoCorpus2.0_base_200G_sample.tar.gz`样本数据，在`ernie-3.0-base-zh`权重上继续训练。脚本如下：
+
+<details>
+<summary><b>展开脚本</b></summary>
+
+```
+python3 -u  -m paddle.distributed.launch \
+    --gpus "0,1,2,3,4,5,6,7" \
+    --log_dir "output/ernie_continue_training/logs" \
+    run_pretrain.py \
+    --model_type "ernie" \
+    --model_name_or_path "ernie-3.0-base-zh" \
+    --tokenizer_name_or_path  "ernie-3.0-base-zh" \
+    --continue_training true \
+    --input_dir ./data \
+    --output_dir output/ernie_continue_training/ \
+    --split 949,50,1 \
+    --max_seq_len 512 \
+    --binary_head true \
+    --micro_batch_size 8 \
+    --use_amp true \
+    --fp16_opt_level "O1" \
+    --use_recompute false \
+    --max_lr 0.0001 \
+    --min_lr 0.00001 \
+    --max_steps 500000 \
+    --save_steps 100000 \
+    --checkpoint_steps 5000 \
+    --decay_steps 490000 \
+    --weight_decay 0.01 \
+    --warmup_rate 0.01 \
+    --grad_clip 1.0 \
+    --logging_freq 1 \
+    --num_workers 3 \
+    --eval_freq 1000 \
+    --device "gpu" \
+    --scale_loss 1024 \
+    --seed 1234
+```
+</details>
+
 
 <a name="networks"> </a>
 
@@ -490,7 +585,7 @@ Model&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nb
 -- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
  Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1| Acc| Acc
 ERNIE 1.0-Base-zh-CW | 12L768H | <b>76.44</b> | 76.04 |    58.02 |    60.87 |    83.56 | 78.61 |    89.14 |    84.00 |  72.26/90.40 |    84.73 |    77.15 |
-ERNIE 2.0-Base-zh | 12L768H | 74.95  | 76.25 |    58.53 |    61.72 |    83.07 |    78.81 |    84.21 |    82.77 | 68.22/88.71    | 82.78    | 73.19
+ERNIE 2.0-Base-zh | 12L768H | 74.32  | 75.65 |  58.25 | 61.64 |  82.62 |  78.71 |    81.91 |  82.33 | 66.08/87.46    | 82.78    | 73.19
 ERNIE 1.0-Base-zh | 12L768H | 74.17 | 74.84 |    58.91 |    62.25 |    81.68 |    76.58 |    85.20 |    82.77 | 67.32/87.83 | 82.47 | 69.68
 
 
@@ -504,7 +599,7 @@ Model&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nb
 -- | -- | -- | -- | -- | -- | -- |  -- | -- | -- | -- | -- |  -- |
 Metrics |   |   | Acc | Acc | Acc | Acc | Acc | Acc | Acc | Exact/F1 | Acc| Acc
 ERNIE 1.0-Large-zh-CW| 24L1024H | <b>79.03</b> | 75.97 |    59.65 |    62.91 |    85.09 |    81.73| 93.09 |    84.53 | 74.22/91.88 | 88.57 | 84.54
-ERNIE 3.0-Xbase-zh| 20L1024H | 78.71 | 76.85 |    59.89 |    62.41 |    84.76 |    82.51 |    89.80 |    84.47 |    75.49/92.67 | 86.36 | 84.59
+ERNIE 3.0-Xbase-zh| 20L1024H | 78.39 | 76.16 | 59.55 | 61.87 | 84.40 |  81.73 | 88.82 | 83.60 |    75.99/93.00 | 86.78 | 84.98
 RoBERTa-wwm-ext-large | 24L1024H | 76.61 |    76.00 |    59.33 |    62.02 |    83.88 |    78.81 |    90.79 |    83.67 |    70.58/89.82 |    85.72 |    75.26
 
 <a name="references"> </a>
diff --git a/model_zoo/ernie-3.0/README.md b/model_zoo/ernie-3.0/README.md
index 096d9810e3da..a8fef6755dcf 100644
--- a/model_zoo/ernie-3.0/README.md
+++ b/model_zoo/ernie-3.0/README.md
@@ -180,34 +180,34 @@ batch_size=32 和 1，预测精度为 FP16 时，GPU 下的效果-时延图：
                 <span style="font-size:18px">ERNIE 2.0-Large-zh</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">77.03</span>
+                <span style="font-size:18px">76.90</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>76.41</b></span>
+                <span style="font-size:18px"><b>76.23</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>59.67</b></span>
+                <span style="font-size:18px"><b>59.33</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>62.29</b></span>
+                <span style="font-size:18px">61.91</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">83.82</span>
+                <span style="font-size:18px">83.85</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">79.69</span>
+                <span style="font-size:18px">79.93</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">89.14</span>
+                <span style="font-size:18px">89.82</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">84.10</span>
+                <span style="font-size:18px">83.23</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">71.48/90.35</span>
+                <span style="font-size:18px">70.95/90.31</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">85.52</span>
+                <span style="font-size:18px">86.78</span>
             </td>
             <td style="text-align:center">
                 <span style="font-size:18px"><b>78.12</b></span>
@@ -257,37 +257,37 @@ batch_size=32 和 1，预测精度为 FP16 时，GPU 下的效果-时延图：
                 <span style="font-size:18px"><b>ERNIE 3.0-Xbase-zh</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>78.71</b></span>
+                <span style="font-size:18px"><b>78.39</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>76.85</b></span>
+                <span style="font-size:18px"><b>76.16</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>59.89</b></span>
+                <span style="font-size:18px"><b>59.55</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>62.41</b></span>
+                <span style="font-size:18px"><b>61.87</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>84.76</b></span>
+                <span style="font-size:18px"><b>84.40</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>82.51</b></span>
+                <span style="font-size:18px"><b>81.73</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>89.80</b></span>
+                <span style="font-size:18px"><b>88.82</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>84.47</b></span>
+                <span style="font-size:18px"><b>83.60</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>75.49/92.67</b></span>
+                <span style="font-size:18px"><b>75.99/93.00</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>86.36</b></span>
+                <span style="font-size:18px"><b>86.78</b></span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px"><b>84.59</b></span>
+                <span style="font-size:18px"><b>84.98</b></span>
             </td>
         </tr>
         <tr>
@@ -373,78 +373,78 @@ batch_size=32 和 1，预测精度为 FP16 时，GPU 下的效果-时延图：
         </tr>
         <tr>
             <td style="text-align:center">
-                <span style="font-size:18px">ERNIE 2.0-Base-zh</span>
+                <span style="font-size:18px">Langboat/Mengzi-BERT-Base</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">74.95</span>
+                <span style="font-size:18px">74.69</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">76.25</span>
+                <span style="font-size:18px">75.35</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">58.53</span>
+                <span style="font-size:18px">57.76</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">61.72</span>
+                <span style="font-size:18px">61.64</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">83.07</span>
+                <span style="font-size:18px">82.41</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">78.81</span>
+                <span style="font-size:18px">77.93</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">84.21</span>
+                <span style="font-size:18px">88.16</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">82.77</span>
+                <span style="font-size:18px">82.20</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">68.22/88.71</span>
+                <span style="font-size:18px">67.04/88.35</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">82.78</span>
+                <span style="font-size:18px">83.74</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">73.19</span>
+                <span style="font-size:18px">70.70</span>
             </td>
         </tr>
         <tr>
             <td style="text-align:center">
-                <span style="font-size:18px">Langboat/Mengzi-BERT-Base</span>
+                <span style="font-size:18px">ERNIE 2.0-Base-zh</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">74.69</span>
+                <span style="font-size:18px">74.32</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">75.35</span>
+                <span style="font-size:18px">75.65</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">57.76</span>
+                <span style="font-size:18px">58.25</span>
             </td>
             <td style="text-align:center">
                 <span style="font-size:18px">61.64</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">82.41</span>
+                <span style="font-size:18px">82.62</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">77.93</span>
+                <span style="font-size:18px">78.71</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">88.16</span>
+                <span style="font-size:18px">81.91</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">82.20</span>
+                <span style="font-size:18px">82.33</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">67.04/88.35</span>
+                <span style="font-size:18px">66.08/87.46</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">83.74</span>
+                <span style="font-size:18px">82.78</span>
             </td>
             <td style="text-align:center">
-                <span style="font-size:18px">70.70</span>
+                <span style="font-size:18px">73.19</span>
             </td>
         </tr>
         <tr>

From e86664a90c59a9e3266692613575b4e9f64e1035 Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Thu, 8 Sep 2022 10:35:35 +0800
Subject: [PATCH 47/48] add weight.

---
 docs/model_zoo/transformers/ERNIE/contents.rst | 12 ++++++++++++
 paddlenlp/transformers/ernie/modeling.py       | 14 ++++++++++++++
 paddlenlp/transformers/ernie/tokenizer.py      |  6 ++++++
 3 files changed, 32 insertions(+)

diff --git a/docs/model_zoo/transformers/ERNIE/contents.rst b/docs/model_zoo/transformers/ERNIE/contents.rst
index b40fa43c7aa6..5bab4d1dc2ee 100644
--- a/docs/model_zoo/transformers/ERNIE/contents.rst
+++ b/docs/model_zoo/transformers/ERNIE/contents.rst
@@ -16,6 +16,14 @@ ERNIE模型汇总
 |                                                                                  |              | 12-heads, 108M parameters.                                                       |
 |                                                                                  |              | Trained on Chinese text.                                                         |
 +----------------------------------------------------------------------------------+--------------+----------------------------------------------------------------------------------+
+|``ernie-1.0-base-zh-cw``                                                          | Chinese      | 12-layer, 768-hidden,                                                            |
+|                                                                                  |              | 12-heads, 118M parameters.                                                       |
+|                                                                                  |              | Trained on Chinese text.                                                         |
++----------------------------------------------------------------------------------+--------------+----------------------------------------------------------------------------------+
+|``ernie-1.0-large-zh-cw``                                                         | Chinese      | 24-layer, 1024-hidden,                                                           |
+|                                                                                  |              | 16-heads, 272M parameters.                                                       |
+|                                                                                  |              | Trained on Chinese text.                                                         |
++----------------------------------------------------------------------------------+--------------+----------------------------------------------------------------------------------+
 |``ernie-tiny``                                                                    | Chinese      | 3-layer, 1024-hidden,                                                            |
 |                                                                                  |              | 16-heads, _M parameters.                                                         |
 |                                                                                  |              | Trained on Chinese text.                                                         |
@@ -32,6 +40,10 @@ ERNIE模型汇总
 |                                                                                  |              | 16-heads, 336M parameters.                                                       |
 |                                                                                  |              | Trained on lower-cased English text.                                             |
 +----------------------------------------------------------------------------------+--------------+----------------------------------------------------------------------------------+
+|``ernie-3.0-xbase-zh``                                                            | Chinese      | 20-layer, 1024-hidden,                                                           |
+|                                                                                  |              | 16-heads, 296M parameters.                                                       |
+|                                                                                  |              | Trained on Chinese text.                                                         |
++----------------------------------------------------------------------------------+--------------+----------------------------------------------------------------------------------+
 |``ernie-3.0-base-zh``                                                             | Chinese      | 12-layer, 768-hidden,                                                            |
 |                                                                                  |              | 12-heads, 118M parameters.                                                       |
 |                                                                                  |              | Trained on Chinese text.                                                         |
diff --git a/paddlenlp/transformers/ernie/modeling.py b/paddlenlp/transformers/ernie/modeling.py
index c7f6528eb993..f4218b924a0f 100644
--- a/paddlenlp/transformers/ernie/modeling.py
+++ b/paddlenlp/transformers/ernie/modeling.py
@@ -178,6 +178,18 @@ class ErniePretrainedModel(PretrainedModel):
             "vocab_size": 18000,
             "pad_token_id": 0,
         },
+        "ernie-1.0-base-zh-cw": {
+            "attention_probs_dropout_prob": 0.1,
+            "hidden_act": "gelu",
+            "hidden_dropout_prob": 0.1,
+            "hidden_size": 768,
+            "initializer_range": 0.02,
+            "max_position_embeddings": 512,
+            "num_attention_heads": 12,
+            "num_hidden_layers": 12,
+            "type_vocab_size": 4,
+            "vocab_size": 40000
+        },
         "ernie-1.0-large-zh-cw": {
             "attention_probs_dropout_prob": 0.1,
             "hidden_act": "relu",
@@ -669,6 +681,8 @@ class ErniePretrainedModel(PretrainedModel):
             "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/ernie_v1_chn_base.pdparams",
             "ernie-1.0-base-zh":
             "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/ernie_v1_chn_base.pdparams",
+            "ernie-1.0-base-zh-cw":
+            "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/ernie_1.0_base_zh_cw.pdparams",
             "ernie-1.0-large-zh-cw":
             "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/ernie_1.0_large_zh_cw.pdparams",
             "ernie-tiny":
diff --git a/paddlenlp/transformers/ernie/tokenizer.py b/paddlenlp/transformers/ernie/tokenizer.py
index dd49bdab40e2..7ec4aad88113 100644
--- a/paddlenlp/transformers/ernie/tokenizer.py
+++ b/paddlenlp/transformers/ernie/tokenizer.py
@@ -83,6 +83,8 @@ class ErnieTokenizer(PretrainedTokenizer):
             "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/vocab.txt",
             "ernie-1.0-base-zh":
             "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/vocab.txt",
+            "ernie-1.0-base-zh-cw":
+            "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/ernie_1.0_base_zh_cw_vocab.txt",
             "ernie-1.0-large-zh-cw":
             "https://bj.bcebos.com/paddlenlp/models/transformers/ernie/vocab.txt",
             "ernie-tiny":
@@ -166,6 +168,9 @@ class ErnieTokenizer(PretrainedTokenizer):
         "ernie-1.0-base-zh": {
             "do_lower_case": True
         },
+        "ernie-1.0-base-zh-cw": {
+            "do_lower_case": True
+        },
         "ernie-1.0-large-zh-cw": {
             "do_lower_case": True
         },
@@ -281,6 +286,7 @@ class ErnieTokenizer(PretrainedTokenizer):
     max_model_input_sizes = {
         "ernie-1.0": 513,
         "ernie-1.0-base-zh": 513,
+        "ernie-1.0-base-zh-cw": 512,
         "ernie-1.0-large-zh-cw": 512,
         "ernie-tiny": 600,
         "ernie-2.0-base-zh": 513,

From a7ff81dc78592edfc18a963c5e778656949d860e Mon Sep 17 00:00:00 2001
From: Zhong Hui <zhonghui.net@gmail.com>
Date: Fri, 9 Sep 2022 10:50:03 +0800
Subject: [PATCH 48/48] fix config

---
 .../ernie-1.0/preprocess/clue_process.py      | 172 ------------------
 .../ernie-1.0/pretraining_introduction.md     |   2 +-
 paddlenlp/transformers/ernie/modeling.py      |   2 +
 3 files changed, 3 insertions(+), 173 deletions(-)
 delete mode 100644 model_zoo/ernie-1.0/preprocess/clue_process.py

diff --git a/model_zoo/ernie-1.0/preprocess/clue_process.py b/model_zoo/ernie-1.0/preprocess/clue_process.py
deleted file mode 100644
index bea70ef7bf67..000000000000
--- a/model_zoo/ernie-1.0/preprocess/clue_process.py
+++ /dev/null
@@ -1,172 +0,0 @@
-# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-input_path = "WuDaoCorpus2.0_base_200G/"
-
-import json
-import re
-import argparse
-import multiprocessing
-import os
-import time
-import jieba
-import sys
-from functools import partial
-
-
-def get_args():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--input_path',
-                        type=str,
-                        required=True,
-                        help='Path to you raw files. Folder or file path.')
-    parser.add_argument('--workers',
-                        type=int,
-                        default=1,
-                        help='Number of worker processes to launch')
-    parser.add_argument('--output_path',
-                        type=str,
-                        help='Path to save the output json files.')
-    parser.add_argument('--log_interval',
-                        type=int,
-                        default=1,
-                        help='Interval between progress updates.')
-    args = parser.parse_args()
-    return args
-
-
-def lexical_analysis_fn():
-    from LAC import LAC
-    lac = LAC(mode="lac")
-
-    def process(line):
-        words, _ = lac.run(line)
-        return words
-
-    return process
-
-
-def chinese_segmentation_fn():
-    from LAC import LAC
-    lac_cws = LAC(mode='seg')
-
-    def process(line):
-        words = lac_cws.run(line)
-        return words
-
-    return process
-
-
-def jieba_segmentation_fn():
-    import jieba
-
-    def process(line):
-        words = jieba.cut(line)
-        return list(words)
-
-    return process
-
-
-CHINESE_SEG_FUNC = {
-    'lac': lexical_analysis_fn(),
-    'seg': chinese_segmentation_fn(),
-    'jieba': jieba_segmentation_fn(),
-}
-
-special_chars = ['\n', '。', '?', '？', ' ', ';', '；', '！', '!']
-split_chars = ['。', '?', '？', ';', '；', '!', '！']
-
-
-def text_to_text(path):
-    out_name = "./tmp/" + path[-20:]
-    print("Loading %s" % path)
-    with open(path, "r") as f:
-        try:
-            contents = json.load(f)
-        except Exception as e:
-            print("Failed to load %s" % path)
-            return 0, None
-
-    print("Write into %s" % out_name)
-    if os.path.exists(out_name):
-        print("File exists %s" % out_name)
-        return 0, None
-
-    seg_func = CHINESE_SEG_FUNC["seg"]
-    import time
-    s = time.time()
-    data_len = 0
-    count = 0
-    with open(out_name, "w") as f:
-        for js in contents:
-            count += 1
-            text = js["content"]
-            data_len += len(text.encode("utf-8"))
-            # make special char only once, because of those token will be treat as sentence spliter.
-            for char in special_chars:
-                text = re.sub('[' + char + ']+[ ]*', char, text)
-            # space will be treat as comma, WARM, not in eng
-            # text = text.replace(" ", "，")
-            for char in split_chars:
-                text = text.replace(char, char + "\n")
-            final = ""
-            for line in text.split("\n"):
-                if len(line) == 0:
-                    continue
-                words = seg_func(line)
-                final += " ".join(words) + "\n"
-            f.write(final + "\n")
-
-    return data_len, None
-
-
-def main():
-    args = get_args()
-    startup_start = time.time()
-
-    file_paths = []
-    if os.path.isfile(args.input_path):
-        file_paths.append(args.input_path)
-    else:
-        for root, _, fs in os.walk(args.input_path):
-            for f in fs:
-                file_paths.append(os.path.join(root, f))
-
-    pool = multiprocessing.Pool(args.workers)
-
-    startup_end = time.time()
-    proc_start = time.time()
-    total_bytes_processed = 0
-    print("Time to startup:", startup_end - startup_start)
-
-    encoded_files = pool.imap(text_to_text, file_paths, 1)
-
-    out_paths = []
-    for i, (bytes_processed, out_path) in enumerate(encoded_files, start=1):
-        total_bytes_processed += bytes_processed
-        out_paths.append(out_path)
-        master_start = time.time()
-
-        if i % args.log_interval == 0:
-            current = time.time()
-            elapsed = current - proc_start
-            mbs = total_bytes_processed / elapsed / 1024 / 1024
-            print(f"Processed {i} files",
-                  f"({i/elapsed} files/s, {mbs} MB/s).",
-                  file=sys.stderr)
-    pool.close()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/model_zoo/ernie-1.0/pretraining_introduction.md b/model_zoo/ernie-1.0/pretraining_introduction.md
index 6f338f0eb07c..4489e9b87285 100644
--- a/model_zoo/ernie-1.0/pretraining_introduction.md
+++ b/model_zoo/ernie-1.0/pretraining_introduction.md
@@ -422,7 +422,7 @@ python3 -u  -m paddle.distributed.launch \
     --split 949,50,1 \
     --max_seq_len 512 \
     --binary_head true \
-    --micro_batch_size 8 \
+    --micro_batch_size 64 \
     --use_amp true \
     --fp16_opt_level "O1" \
     --use_recompute false \
diff --git a/paddlenlp/transformers/ernie/modeling.py b/paddlenlp/transformers/ernie/modeling.py
index f4218b924a0f..a329abdfc17b 100644
--- a/paddlenlp/transformers/ernie/modeling.py
+++ b/paddlenlp/transformers/ernie/modeling.py
@@ -187,7 +187,9 @@ class ErniePretrainedModel(PretrainedModel):
             "max_position_embeddings": 512,
             "num_attention_heads": 12,
             "num_hidden_layers": 12,
+            "task_type_vocab_size": 3,
             "type_vocab_size": 4,
+            "use_task_id": True,
             "vocab_size": 40000
         },
         "ernie-1.0-large-zh-cw": {