diff --git a/docs/compression.md b/docs/compression.md index a339c9b7238a..0de62d368a8b 100644 --- a/docs/compression.md +++ b/docs/compression.md @@ -117,7 +117,7 @@ compression_args = parser.parse_args_into_dataclasses() #### Trainer 实例化参数介绍 -- **--model** 待压缩的模型,目前支持 ERNIE、BERT、RoBERTa、ERNIE-M、ERNIE-Gram、PP-MiniLM、TinyBERT 等结构相似的模型,是在下游任务中微调后的模型,当预训练模型选择 ERNIE 时,需要继承 `ErniePretrainedModel`。以分类任务为例,可通过`AutoModelForSequenceClassification.from_pretrained(model_name_or_path)` 等方式来获取,这种情况下,`model_name_or_path`目录下需要有 model_config.json, model_state.pdparams 文件; +- **--model** 待压缩的模型,目前支持 ERNIE、BERT、RoBERTa、ERNIE-M、ELECTRA、ERNIE-Gram、PP-MiniLM、TinyBERT 等结构相似的模型,是在下游任务中微调后的模型,当预训练模型选择 ERNIE 时,需要继承 `ErniePretrainedModel`。以分类任务为例,可通过`AutoModelForSequenceClassification.from_pretrained(model_name_or_path)` 等方式来获取,这种情况下,`model_name_or_path`目录下需要有 model_config.json, model_state.pdparams 文件; - **--data_collator** 三类任务均可使用 PaddleNLP 预定义好的 [DataCollator 类](../../paddlenlp/data/data_collator.py),`data_collator` 可对数据进行 `Pad` 等操作。使用方法参考 [示例代码](../model_zoo/ernie-3.0/compress_seq_cls.py) 即可; - **--train_dataset** 裁剪训练需要使用的训练集,是任务相关的数据。自定义数据集的加载可参考 [文档](https://huggingface.co/docs/datasets/loading)。不启动裁剪时,可以为 None; - **--eval_dataset** 裁剪训练使用的评估集,也是量化使用的校准数据,是任务相关的数据。自定义数据集的加载可参考 [文档](https://huggingface.co/docs/datasets/loading)。是 Trainer 的必选参数; diff --git a/paddlenlp/trainer/trainer_compress.py b/paddlenlp/trainer/trainer_compress.py index b2657435681c..8804d016bc94 100644 --- a/paddlenlp/trainer/trainer_compress.py +++ b/paddlenlp/trainer/trainer_compress.py @@ -268,6 +268,24 @@ def _dynabert_init(self, model, eval_dataloader): return ofa_model, teacher_model +def check_dynabert_config(net_config, width_mult): + ''' + Corrects net_config for OFA model if necessary. + ''' + if 'electra.embeddings_project' in net_config: + net_config["electra.embeddings_project"]['expand_ratio'] = 1.0 + for key in net_config: + # Makes sure to expands the size of the last dim to `width_mult` for + # these Linear weights. + if 'q_proj' in key or 'k_proj' in key or 'v_proj' in key or 'linear1' in key: + net_config[key]['expand_ratio'] = width_mult + # Keeps the size of the last dim of these Linear weights same as + # before. + elif 'out_proj' in key or 'linear2' in key: + net_config[key]['expand_ratio'] = 1.0 + return net_config + + def _dynabert_training(self, ofa_model, model, teacher_model, train_dataloader, eval_dataloader, num_train_epochs): @@ -388,6 +406,7 @@ def evaluate_token_cls(model, data_loader): # Step8: Broadcast supernet config from width_mult, # and use this config in supernet training. net_config = utils.dynabert_config(ofa_model, width_mult) + net_config = check_dynabert_config(net_config, width_mult) ofa_model.set_net_config(net_config) if "token_type_ids" in batch: logits, teacher_logits = ofa_model( @@ -424,6 +443,7 @@ def evaluate_token_cls(model, data_loader): if global_step % self.args.save_steps == 0: for idx, width_mult in enumerate(self.args.width_mult_list): net_config = utils.dynabert_config(ofa_model, width_mult) + net_config = check_dynabert_config(net_config, width_mult) ofa_model.set_net_config(net_config) tic_eval = time.time() logger.info("width_mult %s:" % round(width_mult, 2)) @@ -453,7 +473,7 @@ def evaluate_token_cls(model, data_loader): model_to_save = model._layers if isinstance( model, paddle.DataParallel) else model model_to_save.save_pretrained(output_dir_width) - logger.info("Best acc of width_mult %.2f: %.4f" % + logger.info("Best result of width_mult %.2f: %.4f" % (width_mult, best_acc[idx])) return ofa_model @@ -479,6 +499,7 @@ def _dynabert_export(self, ofa_model): origin_model = self.model.__class__.from_pretrained(model_dir) ofa_model.model.set_state_dict(state_dict) best_config = utils.dynabert_config(ofa_model, width_mult) + best_config = check_dynabert_config(best_config, width_mult) origin_model_new = ofa_model.export(best_config, input_shapes=[[1, 1], [1, 1]], input_dtypes=['int64', 'int64'], @@ -561,7 +582,9 @@ def _batch_generator_func(): optimize_model=False) post_training_quantization.quantize() post_training_quantization.save_quantized_model( - save_model_path=os.path.join(model_dir, algo + str(batch_size)), + save_model_path=os.path.join( + model_dir, algo + + "_".join([str(batch_size), str(batch_nums)])), model_filename=args.output_filename_prefix + ".pdmodel", params_filename=args.output_filename_prefix + ".pdiparams") @@ -632,6 +655,8 @@ def auto_model_forward(self, embedding_kwargs["input_ids"] = input_ids embedding_output = self.embeddings(**embedding_kwargs) + if hasattr(self, "embeddings_project"): + embedding_output = self.embeddings_project(embedding_output) self.encoder._use_cache = use_cache # To be consistent with HF