Skip to content

Commit

Permalink
Merge pull request PaddlePaddle#36 from AIPG/wojtuss/lac
Browse files Browse the repository at this point in the history
Lexical Analysis for Chinese (LAC) model
  • Loading branch information
Uss, Wojciech authored and GitHub Enterprise committed Aug 27, 2018
2 parents efda369 + 2cb04c8 commit afee083
Show file tree
Hide file tree
Showing 13 changed files with 291 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[submodule "fluid/lac/baidu/lac"]
path = fluid/lac/baidu/lac
url = https://github.com/baidu/lac
1 change: 1 addition & 0 deletions fluid/lac/baidu/lac
Submodule lac added at 5e08d9
1 change: 1 addition & 0 deletions fluid/lac/conf
1 change: 1 addition & 0 deletions fluid/lac/data
214 changes: 214 additions & 0 deletions fluid/lac/infer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,214 @@
import numpy as np
import argparse
import time

import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
import paddle

from train import to_lodtensor
import reader


def parse_args():
parser = argparse.ArgumentParser("Run LAC inference.")
parser.add_argument(
'--batch_size',
type=int,
default=6,
help='The size of a batch. (default: %(default)d)')
parser.add_argument(
'--device',
type=str,
default='GPU',
choices=['CPU', 'GPU'],
help='The device type. (default: %(default)s)')
parser.add_argument(
'--model_path',
type=str,
default='saved_models/params_batch_1',
help='A path to the model. (default: %(default)s)')
parser.add_argument(
"--testdata_dir",
type=str,
default="./data/test_data",
help="The folder where the training data is located.")
parser.add_argument(
"--word_dict_path",
type=str,
default="./conf/word.dic",
help="The path of the word dictionary.")
parser.add_argument(
"--label_dict_path",
type=str,
default="./conf/tag.dic",
help="The path of the label dictionary.")
parser.add_argument(
"--word_rep_dict_path",
type=str,
default="./conf/q2b.dic",
help="The path of the word replacement Dictionary.")
parser.add_argument(
"--word_emb_dim",
type=int,
default=128,
help="The dimension in which a word is embedded.")
parser.add_argument(
"--grnn_hidden_dim",
type=int,
default=256,
help="The number of hidden nodes in the GRNN layer.")
parser.add_argument(
"--bigru_num",
type=int,
default=2,
help="The number of bi_gru layers in the network.")
parser.add_argument(
"--emb_learning_rate",
type=float,
default=5,
help="The real learning rate of the embedding layer will be" \
" (emb_learning_rate * base_learning_rate).")
parser.add_argument(
"--crf_learning_rate",
type=float,
default=0.2,
help="The real learning rate of the embedding layer will be" \
" (crf_learning_rate * base_learning_rate).")
parser.add_argument(
'--num_passes',
type=int,
default=1,
help='The number of passes.')
parser.add_argument(
'--skip_pass_num',
type=int,
default=0,
help='The first num of passes to skip in statistics calculations.')
parser.add_argument(
'--profile',
action='store_true',
help='If set, do profiling.')
args = parser.parse_args()
return args


def print_arguments(args):
print('----------- Configuration Arguments -----------')
for arg, value in sorted(vars(args).iteritems()):
print('%s: %s' % (arg, value))
print('------------------------------------------------')


def load_reverse_dict2(dict_path):
return dict((line.strip().split("\t")[0], line.strip().split("\t")[1])
for idx, line in enumerate(open(dict_path, "r").readlines()))


def infer(args):
word = fluid.layers.data(name='word', shape=[1], dtype='int64', lod_level=1)
target = fluid.layers.data(
name='target', shape=[1], dtype='int64', lod_level=1)

place = fluid.CUDAPlace(0) if args.device == 'GPU' else fluid.CPUPlace()
feeder = fluid.DataFeeder(feed_list=[word, target], place=place)
exe = fluid.Executor(place)

word2id_dict = reader.load_reverse_dict(args.word_dict_path)
label2id_dict = reader.load_reverse_dict(args.label_dict_path)
word_rep_dict = reader.load_dict(args.word_rep_dict_path)
word_dict_len = max(map(int, word2id_dict.values())) + 1
label_dict_len = max(map(int, label2id_dict.values())) + 1

# TODO: load the labels properly
# label_reverse_dict = load_reverse_dict2(args.label_dict_path)

test_reader = paddle.batch(reader.file_reader(args.testdata_dir,
word2id_dict, label2id_dict,
word_rep_dict),
batch_size=args.batch_size)

inference_scope = fluid.core.Scope()
with fluid.scope_guard(inference_scope):
[inference_program, feed_target_names,
fetch_targets] = fluid.io.load_inference_model(args.model_path, exe)
total_passes = args.num_passes + args.skip_pass_num
batch_times = [0] * total_passes
word_counts = [0] * total_passes
wpses = [0] * total_passes
all_iters = 0
print("passes: {}".format(total_passes))
for pass_id in range(total_passes):
if pass_id < args.skip_pass_num:
print("Warm-up pass")
if pass_id == args.skip_pass_num:
profiler.reset_profiler()
iters = 0
for data in test_reader():
word = to_lodtensor(map(lambda x: x[0], data), place)
start = time.time()
crf_decode = exe.run(inference_program,
feed={"word": word},
fetch_list=fetch_targets,
return_numpy=False)
batch_time = time.time() - start
lod_info = (crf_decode[0].lod())[0]
np_data = np.array(crf_decode[0])
word_count = 0
assert len(data) == len(lod_info) - 1
for sen_index in xrange(len(data)):
assert len(data[sen_index][0]) == lod_info[
sen_index + 1] - lod_info[sen_index]
word_index = 0
for tag_index in xrange(lod_info[sen_index],
lod_info[sen_index + 1]):
word = str(data[sen_index][0][word_index])
# TODO: get the tags properly
# gold_tag = label_reverse_dict[data[sen_index][2][
# word_index]]
# tag = label_reverse_dict[np_data[tag_index][0]]
word_index += 1
word_count += word_index
batch_times[pass_id] += batch_time
word_counts[pass_id] += word_count
iters += 1
all_iters += 1
batch_times[pass_id] /= iters
word_counts[pass_id] /= iters
wps = word_counts[pass_id] / batch_times[pass_id]
wpses[pass_id] = wps

print("Pass: %d, iterations (total): %d (%d), latency: %.5f s, words: %d, wps: %f" %
(pass_id, iters, all_iters, batch_times[pass_id], word_counts[pass_id], wps))

# Postprocess benchmark data
latencies = batch_times[args.skip_pass_num:]
latency_avg = np.average(latencies)
latency_std = np.std(latencies)
latency_pc99 = np.percentile(latencies, 99)
wps_avg = np.average(wpses)
wps_std = np.std(wpses)
wps_pc01 = np.percentile(wpses, 1)

# Benchmark output
print('\nTotal passes (incl. warm-up): %d' % (total_passes))
print('Total iterations (incl. warm-up): %d' % (all_iters))
print('Total examples (incl. warm-up): %d' % (all_iters * args.batch_size))
print('avg latency: %.5f, std latency: %.5f, 99pc latency: %.5f' %
(latency_avg, latency_std, latency_pc99))
print('avg wps: %.5f, std wps: %.5f, wps for 99pc latency: %.5f' %
(wps_avg, wps_std, wps_pc01))


if __name__ == "__main__":
args = parse_args()
print_arguments(args)
if args.profile:
if args.device == 'GPU':
with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
infer(args)
else:
with profiler.profiler('CPU', sorted_key='total') as cpuprof:
infer(args)
else:
infer(args)
1 change: 1 addition & 0 deletions fluid/lac/network.py
1 change: 1 addition & 0 deletions fluid/lac/reader.py
30 changes: 30 additions & 0 deletions fluid/lac/scripts/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
## Purpose of this directory
The purpose of this directory is to provide exemplary execution commands. The commands are inside bash scripts described below.

## Preparation
To add execution permissions for shell scripts, run in this directory:
`chmod +x *.sh`

## Performance tips
Use the below environment flags for best performance:
```
KMP_AFFINITY=granularity=fine,compact,1,0
OMP_NUM_THREADS=<num_of_physical_cores>
```
For example, you can export them, or add them inside the specific files.

## Training
### CPU with mkldnn
Run:
`train_mkldnn.sh`
### CPU without mkldnn
Run:
`train.sh`

## Inference
### CPU with mkldnn, with profiling
Run:
`infer_profile_mkldnn.sh`
### CPU without mkldnn, with profiling
Run:
`infer_profile.sh`
9 changes: 9 additions & 0 deletions fluid/lac/scripts/infer_profile.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
cd ..
time python infer.py \
--device CPU \
--model_path saved_models/params_batch_1 \
--skip_pass_num 0 \
--num_passes 100 \
--profile
cd -
10 changes: 10 additions & 0 deletions fluid/lac/scripts/infer_profile_mkldnn.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
cd ..
export FLAGS_use_mkldnn=True
time python infer.py \
--device CPU \
--model_path saved_models/params_batch_1 \
--skip_pass_num 0 \
--num_passes 100 \
--profile
cd -
9 changes: 9 additions & 0 deletions fluid/lac/scripts/train.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
cd ..
time python train.py \
--use_gpu 0 \
--model_save_dir saved_models \
--num_iterations 5 \
--save_model_per_batchs 5
cd -

10 changes: 10 additions & 0 deletions fluid/lac/scripts/train_mkldnn.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
#!/bin/bash
cd ..
export FLAGS_use_mkldnn=True
time python train.py \
--use_gpu 0 \
--model_save_dir saved_models \
--num_iterations 5 \
--save_model_per_batchs 5
cd -

1 change: 1 addition & 0 deletions fluid/lac/train.py

0 comments on commit afee083

Please sign in to comment.