Update DuReader-yesno example, fix run_glue bug (PaddlePaddle#68)
* Update DuReader-yesno example, fix run_glue bug

* minor fix

* minor fix

* add predict() func

* minor fix

* add paddle.no_grad() for eval
smallv0221 authored Mar 4, 2021
1 parent 72f9ccb commit ab330f3
Showing 12 changed files with 158 additions and 198 deletions.
2 changes: 1 addition & 1 deletion examples/glue/README.md
@@ -14,7 +14,7 @@
### 2.1 Environment Setup
- Python >= 3.6
- paddlepaddle >= 2.0.0. For installation instructions, see [Quick Install](https://www.paddlepaddle.org.cn/install/quick)
- paddlenlp >= 2.0.0rc. Install with: `pip install paddlenlp==2.0.0rc`
- paddlenlp >= 2.0.0rc. Install with: `pip install paddlenlp\>=2.0.0rc`

### 2.2 Launching a GLUE Task
Using the GLUE/SST-2 task as an example, GLUE fine-tuning is launched as follows:
5 changes: 3 additions & 2 deletions examples/glue/run_glue.py
@@ -67,7 +67,7 @@ def parse_args():
type=str,
required=True,
help="The name of the task to train selected in the list: " +
", ".join(TASK_CLASSES.keys()), )
", ".join(METRIC_CLASSES.keys()), )
parser.add_argument(
"--model_type",
default=None,
@@ -172,6 +172,7 @@ def set_seed(args):
paddle.seed(args.seed)


@paddle.no_grad()
def evaluate(model, loss_fct, metric, data_loader):
model.eval()
metric.reset()
@@ -240,7 +241,7 @@ def do_train(args):
set_seed(args)

args.task_name = args.task_name.lower()
metric_class = TASK_CLASSES[args.task_name]
metric_class = METRIC_CLASSES[args.task_name]
args.model_type = args.model_type.lower()
model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

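The run_glue.py change above replaces the lookup on `TASK_CLASSES`, presumably a stale name from an earlier version of the script, with `METRIC_CLASSES`, the dict that maps each GLUE task to its evaluation metric. As a rough orientation, a mapping of that shape typically looks like the sketch below; the task keys and metric choices here are illustrative, not copied from the repository.

```python
from paddle.metric import Accuracy
from paddlenlp.metrics import AccuracyAndF1, Mcc, PearsonAndSpearman

# Illustrative task-to-metric mapping; the real METRIC_CLASSES in
# run_glue.py may differ in keys and metric choices.
METRIC_CLASSES = {
    "cola": Mcc,
    "sst-2": Accuracy,
    "mrpc": AccuracyAndF1,
    "sts-b": PearsonAndSpearman,
    "qqp": AccuracyAndF1,
    "mnli": Accuracy,
    "qnli": Accuracy,
    "rte": Accuracy,
}

task_name = "sst-2"
metric = METRIC_CLASSES[task_name]()  # same lookup the fixed code performs
```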
@@ -32,7 +32,7 @@ The DuReader-robust dataset is a single-passage, extractive reading comprehension dataset; specifically
* Installing PaddleNLP

```shell
pip install paddlenlp==2.0.0rc
pip install paddlenlp\>=2.0.0rc
```

* Environment requirements
@@ -31,6 +31,7 @@ def set_seed(args):
paddle.seed(args.seed)


@paddle.no_grad()
def evaluate(model, data_loader, args):
model.eval()

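The hunk above, like the matching ones in run_glue.py and run_squad.py, decorates the evaluation function with `@paddle.no_grad()` so no gradient state is tracked while scoring the dev set. A minimal, self-contained sketch of the pattern, following the evaluate loop used in this commit:

```python
import paddle

@paddle.no_grad()           # gradients are not recorded inside this function
def evaluate(model, metric, data_loader):
    model.eval()            # switch dropout etc. to inference behavior
    metric.reset()
    for input_ids, token_type_ids, labels in data_loader:
        logits = model(input_ids, token_type_ids)
        metric.update(metric.compute(logits, labels))
    print("accu: %f" % metric.accumulate())
    model.train()           # restore training mode for the caller
```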
@@ -46,15 +46,15 @@
* Installing PaddleNLP

```shell
pip install paddlenlp==2.0.0rc
pip install paddlenlp\>=2.0.0rc
```

* Environment requirements

Python 3.6+ is required

### Data Preparation
To make it easy for developers to run tests, we provide a built-in data download script. You can also pass the location of a local dataset via `--data_path`; the dataset must be in the same format as the DuReader-yesno dataset.
To make it easy for developers to run tests, we provide a built-in data download script.


### Fine-tune
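With `--data_path` removed (see the args.py change below), the DuReader-yesno example now relies entirely on `paddlenlp.datasets.load_dataset`, which downloads the splits on first use; this matches the call added in run_du.py later in this diff. A small usage sketch, where the printed fields follow the example keys used by the new `convert_example`:

```python
from paddlenlp.datasets import load_dataset

# Splits are fetched automatically; no local data directory is required.
train_ds, dev_ds, test_ds = load_dataset(
    'dureader_yesno', splits=['train', 'dev', 'test'])

print(train_ds.label_list)   # the yes/no/depends style label set
print(train_ds[0]['question'], train_ds[0]['answer'], train_ds[0]['labels'])
```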
5 changes: 0 additions & 5 deletions examples/machine_reading_comprehension/DuReader-yesno/args.py
@@ -3,11 +3,6 @@

def parse_args():
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--data_path",
type=str,
default=None,
help="Directory of all the data for train, valid, test.")
parser.add_argument(
"--model_type",
default=None,
210 changes: 74 additions & 136 deletions examples/machine_reading_comprehension/DuReader-yesno/run_du.py
@@ -24,13 +24,19 @@
from paddle.io import DataLoader
from args import parse_args
import json
import paddlenlp as ppnlp

from paddlenlp.data import Pad, Stack, Tuple
from paddlenlp.datasets import load_dataset
from paddlenlp.data import Pad, Stack, Dict
from paddlenlp.transformers import BertForSequenceClassification, BertTokenizer
from paddlenlp.transformers import ErnieForSequenceClassification, ErnieTokenizer
from paddlenlp.transformers import RobertaForSequenceClassification, RobertaTokenizer
from paddlenlp.transformers import LinearDecayWithWarmup

MODEL_CLASSES = {"bert": (BertForSequenceClassification, BertTokenizer)}
MODEL_CLASSES = {
"bert": (BertForSequenceClassification, BertTokenizer),
"ernie": (ErnieForSequenceClassification, ErnieTokenizer),
"roberta": (RobertaForSequenceClassification, RobertaTokenizer),
}


def set_seed(args):
@@ -39,99 +45,46 @@ def set_seed(args):
paddle.seed(args.seed)


def convert_example(example,
tokenizer,
label_list,
max_seq_length=512,
is_test=False):
"""convert a DuReaderYesNo example into necessary features"""

def _truncate_seqs(seqs, max_seq_length):
# Account for [CLS], [SEP], [SEP] with "- 3"
tokens_a, tokens_b = seqs
max_seq_length -= 3
while True: # truncate with longest_first strategy
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_seq_length:
break
if len(tokens_a) > len(tokens_b):
tokens_a.pop()
else:
tokens_b.pop()
return seqs

def _concat_seqs(seqs, separators, seq_mask=0, separator_mask=1):
concat = sum((seq + sep for sep, seq in zip(separators, seqs)), [])
segment_ids = sum(
([i] * (len(seq) + len(sep))
for i, (sep, seq) in enumerate(zip(separators, seqs))), [])
if isinstance(seq_mask, int):
seq_mask = [[seq_mask] * len(seq) for seq in seqs]
if isinstance(separator_mask, int):
separator_mask = [[separator_mask] * len(sep) for sep in separators]
p_mask = sum((s_mask + mask
for sep, seq, s_mask, mask in zip(
separators, seqs, seq_mask, separator_mask)), [])
return concat, segment_ids, p_mask

if not is_test:
# `label_list == None` is for regression task
label_dtype = "int64" if label_list else "float32"
# get the label
label = example[-2]
example = example[:-2]
#create label maps if classification task
if label_list:
label_map = {}
for (i, l) in enumerate(label_list):
label_map[l] = i
label = label_map[label]
label = np.array([label], dtype=label_dtype)
else:
qas_id = example[-1]
example = example[:-2]
# tokenize raw text
tokens_raw = [tokenizer.tokenize(l) for l in example]
# truncate to the truncate_length,
tokens_trun = _truncate_seqs(tokens_raw, max_seq_length)
# concate the sequences with special tokens
tokens_trun[0] = [tokenizer.cls_token] + tokens_trun[0]
tokens, segment_ids, _ = _concat_seqs(tokens_trun, [[tokenizer.sep_token]] *
len(tokens_trun))
# convert the token to ids
input_ids = tokenizer.convert_tokens_to_ids(tokens)
valid_length = len(input_ids)

if not is_test:
return input_ids, segment_ids, valid_length, label
else:
return input_ids, segment_ids, valid_length, qas_id
def convert_example(example, tokenizer):
"""convert a Dureader-yesno example into necessary features"""

feature = tokenizer(
text=example['question'],
text_pair=example['answer'],
max_seq_len=args.max_seq_length)
feature['labels'] = example['labels']
feature['id'] = example['id']

return feature

def evaluate(model, metric, data_loader, do_pred=False):

@paddle.no_grad()
def evaluate(model, metric, data_loader):
model.eval()
if not do_pred:
metric.reset()
for batch in data_loader:
input_ids, segment_ids, labels = batch
logits = model(input_ids, segment_ids)
correct = metric.compute(logits, labels)
metric.update(correct)
accu = metric.accumulate()
print("accu: %f" % (accu))
else:
res = {}
for batch in data_loader:
input_ids, segment_ids, qas_id = batch
logits = model(input_ids, segment_ids)
qas_id = qas_id.numpy()
preds = paddle.argmax(logits, axis=1).numpy()
for i in range(len(preds)):
res[str(qas_id[i])] = data_loader.dataset.get_labels()[preds[i]]
with open('prediction.json', "w") as writer:
writer.write(json.dumps(res, ensure_ascii=False, indent=4) + "\n")
metric.reset()
for batch in data_loader:
input_ids, segment_ids, labels = batch
logits = model(input_ids, segment_ids)
correct = metric.compute(logits, labels)
metric.update(correct)
accu = metric.accumulate()
print("accu: %f" % (accu))
model.train()


@paddle.no_grad()
def predict(model, data_loader):
model.eval()
res = {}
for batch in data_loader:
input_ids, segment_ids, qas_id = batch
logits = model(input_ids, segment_ids)
qas_id = qas_id.numpy()
preds = paddle.argmax(logits, axis=1).numpy()
for i in range(len(preds)):
res[str(qas_id[i])] = data_loader.dataset.label_list[preds[i]]
model.train()
return res


def do_train(args):
@@ -145,78 +98,61 @@ def do_train(args):

set_seed(args)

train_ds, dev_ds, test_ds = ppnlp.datasets.DuReaderYesNo.get_datasets(
['train', 'dev', 'test'])
train_ds, dev_ds, test_ds = load_dataset(
'dureader_yesno', splits=['train', 'dev', 'test'])

trans_func = partial(convert_example, tokenizer=tokenizer)

trans_func = partial(
convert_example,
tokenizer=tokenizer,
label_list=train_ds.get_labels(),
max_seq_length=args.max_seq_length)
train_batchify_fn = lambda samples, fn=Dict({
'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),
'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
'labels': Stack(dtype="int64")
}): fn(samples)

train_ds = train_ds.apply(trans_func, lazy=True)
test_batchify_fn = lambda samples, fn=Dict({
'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),
'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
'id': Stack()
}): fn(samples)

train_ds = train_ds.map(trans_func, lazy=True)
train_batch_sampler = paddle.io.DistributedBatchSampler(
train_ds, batch_size=args.batch_size, shuffle=True)

batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # input
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # segment
Stack(), # length
Stack(dtype="int64"), # start_pos
): [data for i, data in enumerate(fn(samples)) if i != 2]

train_data_loader = DataLoader(
dataset=train_ds,
batch_sampler=train_batch_sampler,
collate_fn=batchify_fn,
collate_fn=train_batchify_fn,
return_list=True)

dev_ds = dev_ds.apply(trans_func, lazy=True)

dev_ds = dev_ds.map(trans_func, lazy=True)
dev_batch_sampler = paddle.io.BatchSampler(
dev_ds, batch_size=args.batch_size, shuffle=False)

dev_data_loader = DataLoader(
dataset=dev_ds,
batch_sampler=dev_batch_sampler,
collate_fn=batchify_fn,
collate_fn=train_batchify_fn,
return_list=True)

test_trans_func = partial(
convert_example,
tokenizer=tokenizer,
label_list=train_ds.get_labels(),
max_seq_length=args.max_seq_length,
is_test=True)

test_ds = test_ds.apply(test_trans_func, lazy=True)
test_ds = test_ds.map(trans_func, lazy=True)
test_batch_sampler = paddle.io.BatchSampler(
test_ds, batch_size=args.batch_size, shuffle=False)

test_batchify_fn = lambda samples, fn=Tuple(
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # input
Pad(axis=0, pad_val=tokenizer.vocab[tokenizer.pad_token]), # segment
Stack() # length
): fn(samples)

test_data_loader = DataLoader(
dataset=test_ds,
batch_sampler=test_batch_sampler,
collate_fn=batchify_fn,
collate_fn=test_batchify_fn,
return_list=True)

model = model_class.from_pretrained(args.model_name_or_path, num_classes=3)
model = model_class.from_pretrained(
args.model_name_or_path, num_classes=len(train_ds.label_list))

if paddle.distributed.get_world_size() > 1:
model = paddle.DataParallel(model)

num_training_steps = args.max_steps if args.max_steps > 0 else len(
train_ds.examples) // args.batch_size * args.num_train_epochs
train_data_loader) * args.num_train_epochs

lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
args.warmup_proportion)

optimizer = paddle.optimizer.AdamW(
learning_rate=lr_scheduler,
epsilon=args.adam_epsilon,
@@ -250,8 +186,9 @@ def do_train(args):
lr_scheduler.step()
optimizer.clear_gradients()

if global_step % args.save_steps == 0:
if global_step % args.save_steps == 0 or global_step == num_training_steps:
if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
evaluate(model, metric, dev_data_loader)
output_dir = os.path.join(args.output_dir,
"model_%d" % global_step)
if not os.path.exists(output_dir):
@@ -263,11 +200,12 @@ def do_train(args):
tokenizer.save_pretrained(output_dir)
print('Saving checkpoint to:', output_dir)

if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
evaluate(model, metric, dev_data_loader)

if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
evaluate(model, metric, test_data_loader, True)
predictions = predict(model, test_data_loader)
with open('prediction.json', "w") as writer:
writer.write(
json.dumps(
predictions, ensure_ascii=False, indent=4) + "\n")


if __name__ == "__main__":
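The rewritten run_du.py above drops the old positional `Tuple(...)` batchify function in favor of `Dict(...)`, which pads and stacks each feature by name, so the tokenizer's output dicts can be collated without depending on field order. Below is a minimal sketch of how such a keyed collate function behaves; the `bert-base-chinese` checkpoint and the toy samples are illustrative, not taken from the example.

```python
from paddlenlp.data import Dict, Pad, Stack
from paddlenlp.transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')  # illustrative checkpoint

# Each key is padded or stacked independently and returned in key order.
batchify_fn = Dict({
    'input_ids': Pad(axis=0, pad_val=tokenizer.pad_token_id),
    'token_type_ids': Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
    'labels': Stack(dtype='int64'),
})

samples = [
    {'input_ids': [1, 5, 7, 2], 'token_type_ids': [0, 0, 0, 0], 'labels': 0},
    {'input_ids': [1, 9, 2],    'token_type_ids': [0, 0, 0],    'labels': 2},
]
input_ids, token_type_ids, labels = batchify_fn(samples)
print(input_ids.shape)   # (2, 4): the shorter sample was padded to length 4
```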
2 changes: 1 addition & 1 deletion examples/machine_reading_comprehension/SQuAD/README.md
@@ -32,7 +32,7 @@ SQuAD v2.0
* Installing PaddleNLP

```shell
pip install paddlenlp==2.0.0rc
pip install paddlenlp\>=2.0.0rc
```

* Environment requirements
1 change: 1 addition & 0 deletions examples/machine_reading_comprehension/SQuAD/run_squad.py
@@ -30,6 +30,7 @@ def set_seed(args):
paddle.seed(args.seed)


@paddle.no_grad()
def evaluate(model, data_loader, args):
model.eval()
