From 1de853ee6cf1030e3a3549a1b971b177e1318009 Mon Sep 17 00:00:00 2001
From: Bai Yifan
Date: Tue, 21 Jan 2020 11:37:41 +0800
Subject: [PATCH] Refine distillation demo (#51)

* refine distillation demo
---
 demo/distillation/README.md                  |  40 +++
 .../{distillation_demo.py => distill.py}     |  63 ++--
 demo/models/__init__.py                      |   5 +-
 demo/models/resnet_vd.py                     | 291 ++++++++++++++++++
 4 files changed, 370 insertions(+), 29 deletions(-)
 create mode 100644 demo/distillation/README.md
 rename demo/distillation/{distillation_demo.py => distill.py} (80%)
 create mode 100644 demo/models/resnet_vd.py

diff --git a/demo/distillation/README.md b/demo/distillation/README.md
new file mode 100644
index 0000000000000..ce3bc6fa71a82
--- /dev/null
+++ b/demo/distillation/README.md
@@ -0,0 +1,40 @@
+# Knowledge Distillation Example
+
+This example shows how to train a model with the knowledge distillation API. Compared with a baseline model trained without distillation, the distilled model achieves a measurable accuracy gain.
+
+## API Introduction
+
+Please refer to the [knowledge distillation API documentation](https://paddlepaddle.github.io/PaddleSlim/api/single_distiller_api/).
+
+### 1. Distillation training configuration
+
+This example uses ResNet50_vd as the teacher model to distill a student network with the MobileNet architecture.
+
+Default configuration:
+
+```yaml
+batch_size: 256
+init_lr: 0.1
+lr_strategy: piecewise_decay
+l2_decay: 3e-5
+momentum_rate: 0.9
+num_epochs: 120
+data: imagenet
+```
+
+Training can be launched with this default configuration as-is.
+
+### 2. Launching training
+
+After the ImageNet dataset is set up, launch training with the following command:
+
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 python distill.py
+```
+
+### 3. Training results
+
+Compared with the baseline model trained without distillation (Top-1/Top-5: 70.99%/89.68%), the MobileNet model reaches a Top-1/Top-5 accuracy of 72.77%/90.68% after 120 epochs of distillation training, a gain of +1.78%/+1.00%.
+
+For detailed experimental data, see the [distillation section of the PaddleSlim model zoo](https://paddlepaddle.github.io/PaddleSlim/model_zoo/#13).
diff --git a/demo/distillation/distillation_demo.py b/demo/distillation/distill.py
similarity index 80%
rename from demo/distillation/distillation_demo.py
rename to demo/distillation/distill.py
index b3467e4809956..8a6a90d4464f1 100644
--- a/demo/distillation/distillation_demo.py
+++ b/demo/distillation/distill.py
@@ -23,7 +23,7 @@
 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
-add_arg('batch_size', int, 64*4, "Minibatch size.")
+add_arg('batch_size', int, 64, "Minibatch size.")
 add_arg('use_gpu', bool, True, "Whether to use GPU or not.")
 add_arg('total_images', int, 1281167, "Training image number.")
 add_arg('image_shape', str, "3,224,224", "Input image size")
@@ -32,12 +32,12 @@
 add_arg('l2_decay', float, 3e-5, "The l2_decay parameter.")
 add_arg('momentum_rate', float, 0.9, "The value of momentum_rate.")
 add_arg('num_epochs', int, 120, "The number of total epochs.")
-add_arg('data', str, "cifar10", "Which data to use. 'cifar10' or 'imagenet'")
+add_arg('data', str, "imagenet", "Which data to use. 'cifar10' or 'imagenet'")
 add_arg('log_period', int, 20, "Log period in batches.")
 add_arg('model', str, "MobileNet", "Set the network to use.")
 add_arg('pretrained_model', str, None, "Whether to use pretrained model.")
-add_arg('teacher_model', str, "ResNet50", "Set the teacher network to use.")
-add_arg('teacher_pretrained_model', str, "./ResNet50_pretrained", "Whether to use pretrained model.")
+add_arg('teacher_model', str, "ResNet50_vd", "Set the teacher network to use.")
+add_arg('teacher_pretrained_model', str, "./ResNet50_vd_pretrained", "Whether to use pretrained model.")
 parser.add_argument('--step_epochs', nargs='+', type=int, default=[30, 60, 90], help="piecewise decay step")
 # yapf: enable
@@ -45,7 +45,12 @@
 
 
 def piecewise_decay(args):
-    step = int(math.ceil(float(args.total_images) / args.batch_size))
+    if args.use_gpu:
+        devices_num = fluid.core.get_cuda_device_count()
+    else:
+        devices_num = int(os.environ.get('CPU_NUM', 1))
+    step = int(math.ceil(float(args.total_images) /
+                         args.batch_size)) * devices_num
     bd = [step * e for e in args.step_epochs]
     lr = [args.lr * (0.1**i) for i in range(len(bd) + 1)]
     learning_rate = fluid.layers.piecewise_decay(boundaries=bd, values=lr)
@@ -53,18 +58,23 @@ def piecewise_decay(args):
         learning_rate=learning_rate,
         momentum=args.momentum_rate,
         regularization=fluid.regularizer.L2Decay(args.l2_decay))
-    return optimizer
+    return learning_rate, optimizer
 
 
 def cosine_decay(args):
-    step = int(math.ceil(float(args.total_images) / args.batch_size))
+    if args.use_gpu:
+        devices_num = fluid.core.get_cuda_device_count()
+    else:
+        devices_num = int(os.environ.get('CPU_NUM', 1))
+    step = int(math.ceil(float(args.total_images) /
+                         args.batch_size)) * devices_num
     learning_rate = fluid.layers.cosine_decay(
         learning_rate=args.lr, step_each_epoch=step, epochs=args.num_epochs)
     optimizer = fluid.optimizer.Momentum(
         learning_rate=learning_rate,
         momentum=args.momentum_rate,
         regularization=fluid.regularizer.L2Decay(args.l2_decay))
-    return optimizer
+    return learning_rate, optimizer
 
 
 def create_optimizer(args):
@@ -118,9 +128,6 @@ def compress(args):
     avg_cost = fluid.layers.mean(x=cost)
     acc_top1 = fluid.layers.accuracy(input=out, label=label, k=1)
     acc_top5 = fluid.layers.accuracy(input=out, label=label, k=5)
-    #print("="*50+"student_model_params"+"="*50)
-    #for v in student_program.list_vars():
-    #    print(v.name, v.shape)
 
     place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
     exe = fluid.Executor(place)
@@ -145,23 +152,19 @@
             name='image', shape=image_shape, dtype='float32')
 
         predict = teacher_model.net(image, class_dim=class_dim)
-    #print("="*50+"teacher_model_params"+"="*50)
-    #for v in teacher_program.list_vars():
-    #    print(v.name, v.shape)
-
     exe.run(t_startup)
-    _download(
-        'http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_pretrained.tar',
-        '.')
-    _decompress('./ResNet50_pretrained.tar')
+    if not os.path.exists(args.teacher_pretrained_model):
+        _download(
+            'http://paddle-imagenet-models-name.bj.bcebos.com/ResNet50_vd_pretrained.tar',
+            '.')
+        _decompress('./ResNet50_vd_pretrained.tar')
     assert args.teacher_pretrained_model and os.path.exists(
         args.teacher_pretrained_model
     ), "teacher_pretrained_model should be set when teacher_model is not None."
 
     def if_exist(var):
         return os.path.exists(
-            os.path.join(args.teacher_pretrained_model, var.name)
-        ) and var.name != 'fc_0.w_0' and var.name != 'fc_0.b_0'
+            os.path.join(args.teacher_pretrained_model, var.name))
 
     fluid.io.load_vars(
         exe,
@@ -173,9 +176,10 @@ def if_exist(var):
     merge(teacher_program, student_program, data_name_map, place)
 
     with fluid.program_guard(student_program, s_startup):
-        l2_loss = l2_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0", student_program)
-        loss = avg_cost + l2_loss
-        opt = create_optimizer(args)
+        distill_loss = soft_label_loss("teacher_fc_0.tmp_0", "fc_0.tmp_0",
+                                       student_program)
+        loss = avg_cost + distill_loss
+        lr, opt = create_optimizer(args)
         opt.minimize(loss)
     exe.run(s_startup)
     build_strategy = fluid.BuildStrategy()
@@ -185,14 +189,17 @@
 
     for epoch_id in range(args.num_epochs):
         for step_id, data in enumerate(train_loader):
-            loss_1, loss_2, loss_3 = exe.run(
+            lr_np, loss_1, loss_2, loss_3 = exe.run(
                 parallel_main,
                 feed=data,
-                fetch_list=[loss.name, avg_cost.name, l2_loss.name])
+                fetch_list=[
+                    lr.name, loss.name, avg_cost.name, distill_loss.name
+                ])
             if step_id % args.log_period == 0:
                 _logger.info(
-                    "train_epoch {} step {} loss {:.6f}, class loss {:.6f}, l2 loss {:.6f}".
-                    format(epoch_id, step_id, loss_1[0], loss_2[0], loss_3[0]))
+                    "train_epoch {} step {} lr {:.6f}, loss {:.6f}, class loss {:.6f}, distill loss {:.6f}".
+                    format(epoch_id, step_id, lr_np[0], loss_1[0], loss_2[0],
+                           loss_3[0]))
         val_acc1s = []
         val_acc5s = []
         for step_id, data in enumerate(valid_loader):
diff --git a/demo/models/__init__.py b/demo/models/__init__.py
index c6c0b76075bc1..bf32c5d90e9a5 100644
--- a/demo/models/__init__.py
+++ b/demo/models/__init__.py
@@ -1,6 +1,9 @@
 from .mobilenet import MobileNet
 from .resnet import ResNet34, ResNet50
+from .resnet_vd import ResNet50_vd
 from .mobilenet_v2 import MobileNetV2
 from .pvanet import PVANet
 
-__all__ = ['MobileNet', 'ResNet34', 'ResNet50', 'MobileNetV2', 'PVANet']
+__all__ = [
+    'MobileNet', 'ResNet34', 'ResNet50', 'MobileNetV2', 'PVANet', 'ResNet50_vd'
+]
diff --git a/demo/models/resnet_vd.py b/demo/models/resnet_vd.py
new file mode 100644
index 0000000000000..c93606de906b5
--- /dev/null
+++ b/demo/models/resnet_vd.py
@@ -0,0 +1,291 @@
+# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import math
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.param_attr import ParamAttr
+
+__all__ = [
+    "ResNet", "ResNet18_vd", "ResNet34_vd", "ResNet50_vd", "ResNet101_vd",
+    "ResNet152_vd", "ResNet200_vd"
+]
+
+
+class ResNet():
+    def __init__(self, layers=50, is_3x3=False):
+        self.layers = layers
+        self.is_3x3 = is_3x3
+
+    def net(self, input, class_dim=1000):
+        is_3x3 = self.is_3x3
+        layers = self.layers
+        supported_layers = [18, 34, 50, 101, 152, 200]
+        assert layers in supported_layers, \
+            "supported layers are {} but input layer is {}".format(supported_layers, layers)
+
+        if layers == 18:
+            depth = [2, 2, 2, 2]
+        elif layers == 34 or layers == 50:
+            depth = [3, 4, 6, 3]
+        elif layers == 101:
+            depth = [3, 4, 23, 3]
+        elif layers == 152:
+            depth = [3, 8, 36, 3]
+        elif layers == 200:
+            depth = [3, 12, 48, 3]
+        num_filters = [64, 128, 256, 512]
+        if not is_3x3:
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=64,
+                filter_size=7,
+                stride=2,
+                act='relu')
+        else:
+            conv = self.conv_bn_layer(
+                input=input,
+                num_filters=32,
+                filter_size=3,
+                stride=2,
+                act='relu',
+                name='conv1_1')
+            conv = self.conv_bn_layer(
+                input=conv,
+                num_filters=32,
+                filter_size=3,
+                stride=1,
+                act='relu',
+                name='conv1_2')
+            conv = self.conv_bn_layer(
+                input=conv,
+                num_filters=64,
+                filter_size=3,
+                stride=1,
+                act='relu',
+                name='conv1_3')
+
+        conv = fluid.layers.pool2d(
+            input=conv,
+            pool_size=3,
+            pool_stride=2,
+            pool_padding=1,
+            pool_type='max')
+
+        if layers >= 50:
+            for block in range(len(depth)):
+                for i in range(depth[block]):
+                    if layers in [101, 152, 200] and block == 2:
+                        if i == 0:
+                            conv_name = "res" + str(block + 2) + "a"
+                        else:
+                            conv_name = "res" + str(block + 2) + "b" + str(i)
+                    else:
+                        conv_name = "res" + str(block + 2) + chr(97 + i)
+                    conv = self.bottleneck_block(
+                        input=conv,
+                        num_filters=num_filters[block],
+                        stride=2 if i == 0 and block != 0 else 1,
+                        if_first=block == i == 0,
+                        name=conv_name)
+        else:
+            for block in range(len(depth)):
+                for i in range(depth[block]):
+                    conv_name = "res" + str(block + 2) + chr(97 + i)
+                    conv = self.basic_block(
+                        input=conv,
+                        num_filters=num_filters[block],
+                        stride=2 if i == 0 and block != 0 else 1,
+                        if_first=block == i == 0,
+                        name=conv_name)
+
+        pool = fluid.layers.pool2d(
+            input=conv, pool_type='avg', global_pooling=True)
+        stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0)
+
+        out = fluid.layers.fc(
+            input=pool,
+            size=class_dim,
+            param_attr=fluid.param_attr.ParamAttr(
+                initializer=fluid.initializer.Uniform(-stdv, stdv)))
+
+        return out
+
+    def conv_bn_layer(self,
+                      input,
+                      num_filters,
+                      filter_size,
+                      stride=1,
+                      groups=1,
+                      act=None,
+                      name=None):
+        conv = fluid.layers.conv2d(
+            input=input,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=stride,
+            padding=(filter_size - 1) // 2,
+            groups=groups,
+            act=None,
+            param_attr=ParamAttr(name=name + "_weights"),
+            bias_attr=False)
+        if name == "conv1":
+            bn_name = "bn_" + name
+        else:
+            bn_name = "bn" + name[3:]
+        return fluid.layers.batch_norm(
+            input=conv,
+            act=act,
+            param_attr=ParamAttr(name=bn_name + '_scale'),
+            bias_attr=ParamAttr(bn_name + '_offset'),
+            moving_mean_name=bn_name + '_mean',
+            moving_variance_name=bn_name + '_variance')
+
+    def conv_bn_layer_new(self,
+                          input,
+                          num_filters,
+                          filter_size,
+                          stride=1,
+                          groups=1,
+                          act=None,
+                          name=None):
+        pool = fluid.layers.pool2d(
+            input=input,
+            pool_size=2,
+            pool_stride=2,
+            pool_padding=0,
+            pool_type='avg',
+            ceil_mode=True)
+
+        conv = fluid.layers.conv2d(
+            input=pool,
+            num_filters=num_filters,
+            filter_size=filter_size,
+            stride=1,
+            padding=(filter_size - 1) // 2,
+            groups=groups,
+            act=None,
+            param_attr=ParamAttr(name=name + "_weights"),
+            bias_attr=False)
+        if name == "conv1":
+            bn_name = "bn_" + name
+        else:
+            bn_name = "bn" + name[3:]
+        return fluid.layers.batch_norm(
+            input=conv,
+            act=act,
+            param_attr=ParamAttr(name=bn_name + '_scale'),
+            bias_attr=ParamAttr(bn_name + '_offset'),
+            moving_mean_name=bn_name + '_mean',
+            moving_variance_name=bn_name + '_variance')
+
+    def shortcut(self, input, ch_out, stride, name, if_first=False):
+        ch_in = input.shape[1]
+        if ch_in != ch_out or stride != 1:
+            if if_first:
+                return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
+            else:
+                return self.conv_bn_layer_new(
+                    input, ch_out, 1, stride, name=name)
+        elif if_first:
+            return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
+        else:
+            return input
+
+    def bottleneck_block(self, input, num_filters, stride, name, if_first):
+        conv0 = self.conv_bn_layer(
+            input=input,
+            num_filters=num_filters,
+            filter_size=1,
+            act='relu',
+            name=name + "_branch2a")
+        conv1 = self.conv_bn_layer(
+            input=conv0,
+            num_filters=num_filters,
+            filter_size=3,
+            stride=stride,
+            act='relu',
+            name=name + "_branch2b")
+        conv2 = self.conv_bn_layer(
+            input=conv1,
+            num_filters=num_filters * 4,
+            filter_size=1,
+            act=None,
+            name=name + "_branch2c")
+
+        short = self.shortcut(
+            input,
+            num_filters * 4,
+            stride,
+            if_first=if_first,
+            name=name + "_branch1")
+
+        return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')
+
+    def basic_block(self, input, num_filters, stride, name, if_first):
+        conv0 = self.conv_bn_layer(
+            input=input,
+            num_filters=num_filters,
+            filter_size=3,
+            act='relu',
+            stride=stride,
+            name=name + "_branch2a")
+        conv1 = self.conv_bn_layer(
+            input=conv0,
+            num_filters=num_filters,
+            filter_size=3,
+            act=None,
+            name=name + "_branch2b")
+        short = self.shortcut(
+            input,
+            num_filters,
+            stride,
+            if_first=if_first,
+            name=name + "_branch1")
+        return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')
+
+
+def ResNet18_vd():
+    model = ResNet(layers=18, is_3x3=True)
+    return model
+
+
+def ResNet34_vd():
+    model = ResNet(layers=34, is_3x3=True)
+    return model
+
+
+def ResNet50_vd():
+    model = ResNet(layers=50, is_3x3=True)
+    return model
+
+
+def ResNet101_vd():
+    model = ResNet(layers=101, is_3x3=True)
+    return model
+
+
+def ResNet152_vd():
+    model = ResNet(layers=152, is_3x3=True)
+    return model
+
+
+def ResNet200_vd():
+    model = ResNet(layers=200, is_3x3=True)
+    return model
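
A note on the schedule arithmetic introduced in `piecewise_decay()` above: the per-epoch step count is now multiplied by the device count, so the decay boundaries are expressed in global steps across all devices. The following is a standalone sketch of that computation in plain Python (no Paddle required); `batch_size=64` is the per-device flag default from `distill.py`, while `devices_num=4` is an assumption matching the four-GPU launch shown in the README, not something fixed by the patch.

```python
import math

# Sketch of the boundary/value computation in piecewise_decay(), using
# the demo defaults; devices_num=4 is an assumed 4-GPU launch.
total_images = 1281167           # ImageNet training-set size
batch_size = 64                  # per-device minibatch (flag default)
devices_num = 4                  # e.g. CUDA_VISIBLE_DEVICES=0,1,2,3
step_epochs = [30, 60, 90]       # epochs at which the lr decays
init_lr = 0.1

# Steps per epoch, scaled by the number of devices, as in the patched code.
step = int(math.ceil(float(total_images) / batch_size)) * devices_num
boundaries = [step * e for e in step_epochs]
values = [init_lr * (0.1**i) for i in range(len(boundaries) + 1)]

print(boundaries)  # [2402280, 4804560, 7206840]
print(values)      # approximately [0.1, 0.01, 0.001, 0.0001]
```

With these values, `fluid.layers.piecewise_decay(boundaries=bd, values=lr)` holds the learning rate at 0.1 until epoch 30 and divides it by 10 at epochs 30, 60, and 90, which is the schedule described in the README's default configuration.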