From 0c8824dd3b2e629db1b28c0e2a5e75cf8c0111a9 Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Tue, 23 Feb 2021 19:08:36 +0800
Subject: [PATCH 1/2] add keep_extend_vocab_only option

---
 examples/word_embedding/README.md       | 14 +-----------
 paddlenlp/embeddings/token_embedding.py | 30 ++++++++++++++-----------
 2 files changed, 18 insertions(+), 26 deletions(-)

diff --git a/examples/word_embedding/README.md b/examples/word_embedding/README.md
index 1d2e8bc3b25fa..26e353e5b9dae 100644
--- a/examples/word_embedding/README.md
+++ b/examples/word_embedding/README.md
@@ -35,20 +35,8 @@ wget https://paddlenlp.bj.bcebos.com/data/dict.txt
 We use the public Chinese sentiment classification dataset ChnSentiCorp as the example dataset. Run the commands below to train the model on the training set (train.tsv) and evaluate it on the dev set (dev.tsv).
 
-Run on CPU:
-
-```
-# Use paddlenlp.embeddings.TokenEmbedding
-python train.py --vocab_path='./dict.txt' --use_gpu=False --lr=5e-4 --batch_size=64 --epochs=20 --use_token_embedding=True --vdl_dir='./vdl_dir'
-
-# Use paddle.nn.Embedding
-python train.py --vocab_path='./dict.txt' --use_gpu=False --lr=1e-4 --batch_size=64 --epochs=20 --use_token_embedding=False --vdl_dir='./vdl_dir'
-```
-
-Run on GPU:
+Start training:
 
 ```
-export CUDA_VISIBLE_DEVICES=0
-
 # Use paddlenlp.embeddings.TokenEmbedding
 python train.py --vocab_path='./dict.txt' --use_gpu=True --lr=5e-4 --batch_size=64 --epochs=20 --use_token_embedding=True --vdl_dir='./vdl_dir'
 
diff --git a/paddlenlp/embeddings/token_embedding.py b/paddlenlp/embeddings/token_embedding.py
index 2c17055cba2fd..901682af44165 100644
--- a/paddlenlp/embeddings/token_embedding.py
+++ b/paddlenlp/embeddings/token_embedding.py
@@ -58,6 +58,8 @@ class TokenEmbedding(nn.Embedding):
             The file path of extended vocabulary.
         trainable (object: `bool`, optional, default to True):
             Whether the weight of embedding can be trained.
+        keep_extend_vocab_only (object: `bool`, optional, default to True):
+            Whether keep the extend vocabulary only, only effective if provides extended_vocab_path
     """
 
     def __init__(self,
@@ -65,7 +67,8 @@ def __init__(self,
                  unknown_token=UNK_TOKEN,
                  unknown_token_vector=None,
                  extended_vocab_path=None,
-                 trainable=True):
+                 trainable=True,
+                 keep_extend_vocab_only=False):
         vector_path = osp.join(EMBEDDING_HOME, embedding_name + ".npz")
         if not osp.exists(vector_path):
             # download
@@ -87,7 +90,8 @@ def __init__(self,
             [0] * self.embedding_dim).astype(paddle.get_default_dtype())
         if extended_vocab_path is not None:
             embedding_table = self._extend_vocab(extended_vocab_path, vector_np,
-                                                 pad_vector, unk_vector)
+                                                 pad_vector, unk_vector,
+                                                 keep_extend_vocab_only)
             trainable = True
         else:
             embedding_table = self._init_without_extend_vocab(
@@ -138,7 +142,7 @@ def _read_vocab_list_from_file(self, extended_vocab_path):
         return vocab_list
 
     def _extend_vocab(self, extended_vocab_path, vector_np, pad_vector,
-                      unk_vector):
+                      unk_vector, keep_extend_vocab_only):
         """
         Construct index to word list, word to index dict and embedding weight using extended vocab.
@@ -182,16 +186,16 @@ def _extend_vocab(self, extended_vocab_path, vector_np, pad_vector,
             embedding_table[
                 extend_vocab_intersect_index] = pretrained_embedding_table[
                     pretrained_vocab_intersect_index]
-
-        for idx in pretrained_vocab_subtract_index:
-            word = pretrained_idx_to_word[idx]
-            self._idx_to_word.append(word)
-            self._word_to_idx[word] = len(self._idx_to_word) - 1
-
-        embedding_table = np.append(
-            embedding_table,
-            pretrained_embedding_table[pretrained_vocab_subtract_index],
-            axis=0)
+        if not keep_extend_vocab_only:
+            for idx in pretrained_vocab_subtract_index:
+                word = pretrained_idx_to_word[idx]
+                self._idx_to_word.append(word)
+                self._word_to_idx[word] = len(self._idx_to_word) - 1
+
+            embedding_table = np.append(
+                embedding_table,
+                pretrained_embedding_table[pretrained_vocab_subtract_index],
+                axis=0)
 
         if self.unknown_token not in extend_vocab_set:
             self._idx_to_word.append(self.unknown_token)

From 261c14c481c22540f2b2b06fb8c6fb156338a5c6 Mon Sep 17 00:00:00 2001
From: zhoushunjie
Date: Wed, 24 Feb 2021 09:59:57 +0800
Subject: [PATCH 2/2] keep_extend_vocab_only->keep_extended_vocab_only

---
 paddlenlp/embeddings/token_embedding.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/paddlenlp/embeddings/token_embedding.py b/paddlenlp/embeddings/token_embedding.py
index 901682af44165..c1caef1792951 100644
--- a/paddlenlp/embeddings/token_embedding.py
+++ b/paddlenlp/embeddings/token_embedding.py
@@ -58,8 +58,8 @@ class TokenEmbedding(nn.Embedding):
             The file path of extended vocabulary.
         trainable (object: `bool`, optional, default to True):
             Whether the weight of embedding can be trained.
-        keep_extend_vocab_only (object: `bool`, optional, default to True):
-            Whether keep the extend vocabulary only, only effective if provides extended_vocab_path
+        keep_extended_vocab_only (object: `bool`, optional, default to False):
+            Whether to keep only the extended vocabulary; effective only when extended_vocab_path is provided.
     """
 
     def __init__(self,
@@ -68,7 +68,7 @@ def __init__(self,
                  unknown_token_vector=None,
                  extended_vocab_path=None,
                  trainable=True,
-                 keep_extend_vocab_only=False):
+                 keep_extended_vocab_only=False):
         vector_path = osp.join(EMBEDDING_HOME, embedding_name + ".npz")
         if not osp.exists(vector_path):
             # download
@@ -91,7 +91,7 @@ def __init__(self,
         if extended_vocab_path is not None:
             embedding_table = self._extend_vocab(extended_vocab_path, vector_np,
                                                  pad_vector, unk_vector,
-                                                 keep_extend_vocab_only)
+                                                 keep_extended_vocab_only)
             trainable = True
         else:
             embedding_table = self._init_without_extend_vocab(
@@ -142,7 +142,7 @@ def _read_vocab_list_from_file(self, extended_vocab_path):
         return vocab_list
 
     def _extend_vocab(self, extended_vocab_path, vector_np, pad_vector,
-                      unk_vector, keep_extend_vocab_only):
+                      unk_vector, keep_extended_vocab_only):
         """
         Construct index to word list, word to index dict and embedding weight using extended vocab.
 
@@ -186,7 +186,7 @@ def _extend_vocab(self, extended_vocab_path, vector_np, pad_vector,
         embedding_table[
             extend_vocab_intersect_index] = pretrained_embedding_table[
                 pretrained_vocab_intersect_index]
-        if not keep_extend_vocab_only:
+        if not keep_extended_vocab_only:
             for idx in pretrained_vocab_subtract_index:
                 word = pretrained_idx_to_word[idx]
                 self._idx_to_word.append(word)
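For reference, a minimal usage sketch of the option introduced by these patches (illustrative only, not part of the patch series). The pretrained embedding name below is just an example identifier, and `./dict.txt` is the extended vocabulary file from the README example:

```python
# Illustrative sketch, assuming a PaddleNLP build that includes the
# keep_extended_vocab_only option added by this patch series.
from paddlenlp.embeddings import TokenEmbedding

# With keep_extended_vocab_only=True, pretrained tokens that are not listed in
# the extended vocabulary file are dropped instead of being appended to the
# embedding table.
token_embedding = TokenEmbedding(
    embedding_name="w2v.baidu_encyclopedia.target.word-word.dim300",  # example name
    extended_vocab_path="./dict.txt",
    keep_extended_vocab_only=True)

# Roughly [extended vocab size (+ special tokens such as UNK/PAD), embedding_dim]
print(token_embedding.weight.shape)
```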