diff --git a/examples/trials/nas_cifar10/README.md b/examples/trials/nas_cifar10/README.md
index 2f3b52a869..e6f03e0b58 100644
--- a/examples/trials/nas_cifar10/README.md
+++ b/examples/trials/nas_cifar10/README.md
@@ -2,7 +2,14 @@
 ===
 
 Now we have an NAS example [NNI-NAS-Example](https://github.com/Crysple/NNI-NAS-Example) run in NNI using NAS interface from our contributors.
+
+We have included its trial code in this folder and provide example config files that show how to use the PPO tuner to tune the trial code.
+
+> Download data
+
+- `cd data && . download.sh`
+- `tar xzf cifar-10-python.tar.gz && mv cifar-10-batches-py cifar10`
 
 Thanks our lovely contributors.
 
-And welcome more and more people to join us!
\ No newline at end of file
+And welcome more and more people to join us!
diff --git a/examples/trials/nas_cifar10/config_pai_ppo.yml b/examples/trials/nas_cifar10/config_pai_ppo.yml
new file mode 100644
index 0000000000..38156376bd
--- /dev/null
+++ b/examples/trials/nas_cifar10/config_pai_ppo.yml
@@ -0,0 +1,31 @@
+authorName: Unknown
+experimentName: enas_macro
+trialConcurrency: 20
+maxExecDuration: 2400h
+maxTrialNum: 20000
+#choice: local, remote, pai
+trainingServicePlatform: pai
+#choice: true, false
+useAnnotation: true
+multiPhase: false
+versionCheck: false
+nniManagerIp: 0.0.0.0
+tuner:
+  builtinTunerName: PPOTuner
+  classArgs:
+    optimize_mode: maximize
+    trials_per_update: 60
+    epochs_per_update: 20
+    minibatch_size: 6
+trial:
+  command: sh ./macro_cifar10_pai.sh
+  codeDir: ./
+  gpuNum: 1
+  cpuNum: 1
+  memoryMB: 8196
+  image: msranni/nni:latest
+  virtualCluster: nni
+paiConfig:
+  userName: your_account
+  passWord: your_pwd
+  host: 0.0.0.0
diff --git a/examples/trials/nas_cifar10/config_ppo.yml b/examples/trials/nas_cifar10/config_ppo.yml
new file mode 100644
index 0000000000..74c0dbea8e
--- /dev/null
+++ b/examples/trials/nas_cifar10/config_ppo.yml
@@ -0,0 +1,21 @@
+authorName: Unknown
+experimentName: enas_macro
+trialConcurrency: 4
+maxExecDuration: 2400h
+maxTrialNum: 20000
+#choice: local, remote, pai
+trainingServicePlatform: local
+#choice: true, false
+useAnnotation: true
+multiPhase: false
+tuner:
+  builtinTunerName: PPOTuner
+  classArgs:
+    optimize_mode: maximize
+    trials_per_update: 60
+    epochs_per_update: 12
+    minibatch_size: 10
+trial:
+  command: sh ./macro_cifar10.sh
+  codeDir: ./
+  gpuNum: 1
diff --git a/examples/trials/nas_cifar10/data/download.sh b/examples/trials/nas_cifar10/data/download.sh
new file mode 100755
index 0000000000..f00ac25724
--- /dev/null
+++ b/examples/trials/nas_cifar10/data/download.sh
@@ -0,0 +1 @@
+wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
diff --git a/examples/trials/nas_cifar10/macro_cifar10.sh b/examples/trials/nas_cifar10/macro_cifar10.sh
new file mode 100644
index 0000000000..0451fbd2d8
--- /dev/null
+++ b/examples/trials/nas_cifar10/macro_cifar10.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+set -e
+export PYTHONPATH="$(pwd)"
+
+python3 src/cifar10/nni_child_cifar10.py \
+  --data_format="NCHW" \
+  --search_for="macro" \
+  --reset_output_dir \
+  --data_path="data/cifar10" \
+  --output_dir="outputs" \
+  --train_data_size=45000 \
+  --batch_size=100 \
+  --num_epochs=8 \
+  --log_every=50 \
+  --eval_every_epochs=1 \
+  --child_use_aux_heads \
+  --child_num_layers=12 \
+  --child_out_filters=36 \
+  --child_l2_reg=0.0002 \
+  --child_num_branches=6 \
+  --child_num_cell_layers=5 \
+  --child_keep_prob=0.50 \
+  --child_drop_path_keep_prob=0.60 \
+  --child_lr_cosine \
+  --child_lr_max=0.05 \
+  --child_lr_min=0.001 \
+  --child_lr_T_0=10 \
+ 
--child_lr_T_mul=2 \ + --controller_search_whole_channels \ + --controller_train_every=1 \ + --controller_num_aggregate=20 \ + --controller_train_steps=50 \ + --child_mode="subgraph" \ + "$@" + diff --git a/examples/trials/nas_cifar10/macro_cifar10_pai.sh b/examples/trials/nas_cifar10/macro_cifar10_pai.sh new file mode 100644 index 0000000000..2e172c54ca --- /dev/null +++ b/examples/trials/nas_cifar10/macro_cifar10_pai.sh @@ -0,0 +1,35 @@ +#!/bin/bash +set -e +export PYTHONPATH="$(pwd)" + +python3 src/cifar10/nni_child_cifar10.py \ + --data_format="NCHW" \ + --search_for="macro" \ + --reset_output_dir \ + --data_path="data/cifar10" \ + --output_dir="outputs" \ + --train_data_size=45000 \ + --batch_size=100 \ + --num_epochs=30 \ + --log_every=50 \ + --eval_every_epochs=1 \ + --child_use_aux_heads \ + --child_num_layers=12 \ + --child_out_filters=36 \ + --child_l2_reg=0.0002 \ + --child_num_branches=6 \ + --child_num_cell_layers=5 \ + --child_keep_prob=0.50 \ + --child_drop_path_keep_prob=0.60 \ + --child_lr_cosine \ + --child_lr_max=0.05 \ + --child_lr_min=0.001 \ + --child_lr_T_0=10 \ + --child_lr_T_mul=2 \ + --controller_search_whole_channels \ + --controller_train_every=1 \ + --controller_num_aggregate=20 \ + --controller_train_steps=50 \ + --child_mode="subgraph" \ + "$@" + diff --git a/examples/trials/nas_cifar10/src/__init__.py b/examples/trials/nas_cifar10/src/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/trials/nas_cifar10/src/cifar10/__init__.py b/examples/trials/nas_cifar10/src/cifar10/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/examples/trials/nas_cifar10/src/cifar10/data_utils.py b/examples/trials/nas_cifar10/src/cifar10/data_utils.py new file mode 100644 index 0000000000..b8a8c36339 --- /dev/null +++ b/examples/trials/nas_cifar10/src/cifar10/data_utils.py @@ -0,0 +1,74 @@ +import os +import sys +import pickle +import numpy as np +import tensorflow as tf + + +def _read_data(data_path, train_files): + """Reads CIFAR-10 format data. Always returns NHWC format. 
+ + Returns: + images: np tensor of size [N, H, W, C] + labels: np tensor of size [N] + """ + images, labels = [], [] + for file_name in train_files: + print(file_name) + full_name = os.path.join(data_path, file_name) + with open(full_name, "rb") as finp: + data = pickle.load(finp, encoding='latin1') + batch_images = data["data"].astype(np.float32) / 255.0 + batch_labels = np.array(data["labels"], dtype=np.int32) + images.append(batch_images) + labels.append(batch_labels) + images = np.concatenate(images, axis=0) + labels = np.concatenate(labels, axis=0) + images = np.reshape(images, [-1, 3, 32, 32]) + images = np.transpose(images, [0, 2, 3, 1]) + + return images, labels + + +def read_data(data_path, num_valids=5000): + print("-" * 80) + print("Reading data") + + images, labels = {}, {} + + train_files = [ + "data_batch_1", + "data_batch_2", + "data_batch_3", + "data_batch_4", + "data_batch_5", + ] + test_file = [ + "test_batch", + ] + images["train"], labels["train"] = _read_data(data_path, train_files) + + if num_valids: + images["valid"] = images["train"][-num_valids:] + labels["valid"] = labels["train"][-num_valids:] + + images["train"] = images["train"][:-num_valids] + labels["train"] = labels["train"][:-num_valids] + else: + images["valid"], labels["valid"] = None, None + + images["test"], labels["test"] = _read_data(data_path, test_file) + + print("Prepropcess: [subtract mean], [divide std]") + mean = np.mean(images["train"], axis=(0, 1, 2), keepdims=True) + std = np.std(images["train"], axis=(0, 1, 2), keepdims=True) + + print("mean: {}".format(np.reshape(mean * 255.0, [-1]))) + print("std: {}".format(np.reshape(std * 255.0, [-1]))) + + images["train"] = (images["train"] - mean) / std + if num_valids: + images["valid"] = (images["valid"] - mean) / std + images["test"] = (images["test"] - mean) / std + + return images, labels diff --git a/examples/trials/nas_cifar10/src/cifar10/general_child.py b/examples/trials/nas_cifar10/src/cifar10/general_child.py new file mode 100644 index 0000000000..4266f82997 --- /dev/null +++ b/examples/trials/nas_cifar10/src/cifar10/general_child.py @@ -0,0 +1,425 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import numpy as np +import tensorflow as tf +from src.common_ops import create_weight, batch_norm, batch_norm_with_mask, global_avg_pool, conv_op, pool_op +from src.utils import count_model_params, get_train_ops, get_C, get_strides +from src.cifar10.models import Model + + +class GeneralChild(Model): + def __init__(self, + images, + labels, + cutout_size=None, + whole_channels=False, + fixed_arc=None, + out_filters_scale=1, + num_layers=2, + num_branches=6, + out_filters=24, + keep_prob=1.0, + batch_size=32, + clip_mode=None, + grad_bound=None, + l2_reg=1e-4, + lr_init=0.1, + lr_dec_start=0, + lr_dec_every=10000, + lr_dec_rate=0.1, + lr_cosine=False, + lr_max=None, + lr_min=None, + lr_T_0=None, + lr_T_mul=None, + optim_algo=None, + sync_replicas=False, + num_aggregate=None, + num_replicas=None, + data_format="NHWC", + name="child", + mode="subgraph", + *args, + **kwargs + ): + + super(self.__class__, self).__init__( + images, + labels, + cutout_size=cutout_size, + batch_size=batch_size, + clip_mode=clip_mode, + grad_bound=grad_bound, + l2_reg=l2_reg, + lr_init=lr_init, + lr_dec_start=lr_dec_start, + lr_dec_every=lr_dec_every, + lr_dec_rate=lr_dec_rate, + keep_prob=keep_prob, + optim_algo=optim_algo, + sync_replicas=sync_replicas, + num_aggregate=num_aggregate, + 
num_replicas=num_replicas, + data_format=data_format, + name=name) + + self.whole_channels = whole_channels + self.lr_cosine = lr_cosine + self.lr_max = lr_max + self.lr_min = lr_min + self.lr_T_0 = lr_T_0 + self.lr_T_mul = lr_T_mul + self.out_filters = out_filters * out_filters_scale + self.num_layers = num_layers + self.mode = mode + + self.num_branches = num_branches + self.fixed_arc = fixed_arc + self.out_filters_scale = out_filters_scale + + pool_distance = self.num_layers // 3 + self.pool_layers = [pool_distance - 1, 2 * pool_distance - 1] + + + + def _factorized_reduction(self, x, out_filters, stride, is_training): + """Reduces the shape of x without information loss due to striding.""" + assert out_filters % 2 == 0, ( + "Need even number of filters when using this factorized reduction.") + if stride == 1: + with tf.variable_scope("path_conv"): + inp_c = get_C(x, self.data_format) + w = create_weight("w", [1, 1, inp_c, out_filters]) + x = tf.nn.conv2d(x, w, [1, 1, 1, 1], "SAME", + data_format=self.data_format) + x = batch_norm(x, is_training, data_format=self.data_format) + return x + + stride_spec = get_strides(stride, self.data_format) + # Skip path 1 + path1 = tf.nn.avg_pool( + x, [1, 1, 1, 1], stride_spec, "VALID", data_format=self.data_format) + with tf.variable_scope("path1_conv"): + inp_c = get_C(path1, self.data_format) + w = create_weight("w", [1, 1, inp_c, out_filters // 2]) + path1 = tf.nn.conv2d(path1, w, [1, 1, 1, 1], "SAME", + data_format=self.data_format) + + # Skip path 2 + # First pad with 0"s on the right and bottom, then shift the filter to + # include those 0"s that were added. + if self.data_format == "NHWC": + pad_arr = [[0, 0], [0, 1], [0, 1], [0, 0]] + path2 = tf.pad(x, pad_arr)[:, 1:, 1:, :] + concat_axis = 3 + else: + pad_arr = [[0, 0], [0, 0], [0, 1], [0, 1]] + path2 = tf.pad(x, pad_arr)[:, :, 1:, 1:] + concat_axis = 1 + + path2 = tf.nn.avg_pool( + path2, [1, 1, 1, 1], stride_spec, "VALID", data_format=self.data_format) + with tf.variable_scope("path2_conv"): + inp_c = get_C(path2, self.data_format) + w = create_weight("w", [1, 1, inp_c, out_filters // 2]) + path2 = tf.nn.conv2d(path2, w, [1, 1, 1, 1], "SAME", + data_format=self.data_format) + + # Concat and apply BN + final_path = tf.concat(values=[path1, path2], axis=concat_axis) + final_path = batch_norm(final_path, is_training, + data_format=self.data_format) + + return final_path + + def _model(self, images, is_training, reuse=False): + '''Build model''' + with tf.variable_scope(self.name, reuse=reuse): + layers = [] + + out_filters = self.out_filters + with tf.variable_scope("stem_conv"): + w = create_weight("w", [3, 3, 3, out_filters]) + x = tf.nn.conv2d( + images, w, [1, 1, 1, 1], "SAME", data_format=self.data_format) + x = batch_norm(x, is_training, data_format=self.data_format) + layers.append(x) + + def add_fixed_pooling_layer(layer_id, layers, out_filters, is_training): + '''Add a fixed pooling layer every four layers''' + out_filters *= 2 + with tf.variable_scope("pool_at_{0}".format(layer_id)): + pooled_layers = [] + for i, layer in enumerate(layers): + with tf.variable_scope("from_{0}".format(i)): + x = self._factorized_reduction( + layer, out_filters, 2, is_training) + pooled_layers.append(x) + return pooled_layers, out_filters + + def post_process_out(out, optional_inputs): + '''Form skip connection and perform batch norm''' + with tf.variable_scope("skip"): + inputs = layers[-1] + if self.data_format == "NHWC": + inp_h = inputs.get_shape()[1].value + inp_w = inputs.get_shape()[2].value + 
inp_c = inputs.get_shape()[3].value + out.set_shape([None, inp_h, inp_w, out_filters]) + elif self.data_format == "NCHW": + inp_c = inputs.get_shape()[1].value + inp_h = inputs.get_shape()[2].value + inp_w = inputs.get_shape()[3].value + out.set_shape([None, out_filters, inp_h, inp_w]) + optional_inputs.append(out) + pout = tf.add_n(optional_inputs) + out = batch_norm(pout, is_training, + data_format=self.data_format) + layers.append(out) + return out + + global layer_id + layer_id = -1 + + def get_layer_id(): + global layer_id + layer_id += 1 + return 'layer_' + str(layer_id) + + def conv3(inputs): + # res_layers is pre_layers that are chosen to form skip connection + # layers[-1] is always the latest input + with tf.variable_scope(get_layer_id()): + with tf.variable_scope('branch_0'): + out = conv_op( + inputs[0][0], 3, is_training, out_filters, out_filters, self.data_format, start_idx=None) + out = post_process_out(out, inputs[1]) + return out + + def conv3_sep(inputs): + with tf.variable_scope(get_layer_id()): + with tf.variable_scope('branch_1'): + out = conv_op( + inputs[0][0], 3, is_training, out_filters, out_filters, self.data_format, start_idx=None, separable=True) + out = post_process_out(out, inputs[1]) + return out + + def conv5(inputs): + with tf.variable_scope(get_layer_id()): + with tf.variable_scope('branch_2'): + out = conv_op( + inputs[0][0], 5, is_training, out_filters, out_filters, self.data_format, start_idx=None) + out = post_process_out(out, inputs[1]) + return out + + def conv5_sep(inputs): + with tf.variable_scope(get_layer_id()): + with tf.variable_scope('branch_3'): + out = conv_op( + inputs[0][0], 5, is_training, out_filters, out_filters, self.data_format, start_idx=None, separable=True) + out = post_process_out(out, inputs[1]) + return out + + def avg_pool(inputs): + with tf.variable_scope(get_layer_id()): + with tf.variable_scope('branch_4'): + out = pool_op( + inputs[0][0], is_training, out_filters, out_filters, "avg", self.data_format, start_idx=None) + out = post_process_out(out, inputs[1]) + return out + + def max_pool(inputs): + with tf.variable_scope(get_layer_id()): + with tf.variable_scope('branch_5'): + out = pool_op( + inputs[0][0], is_training, out_filters, out_filters, "max", self.data_format, start_idx=None) + out = post_process_out(out, inputs[1]) + return out + + """@nni.mutable_layers( + { + layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()], + fixed_inputs:[x], + layer_output: layer_0_out + }, + { + layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()], + fixed_inputs:[layer_0_out], + optional_inputs: [layer_0_out], + optional_input_size: [0, 1], + layer_output: layer_1_out + }, + { + layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()], + fixed_inputs:[layer_1_out], + optional_inputs: [layer_0_out, layer_1_out], + optional_input_size: [0, 1], + layer_output: layer_2_out + }, + { + layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()], + fixed_inputs:[layer_2_out], + optional_inputs: [layer_0_out, layer_1_out, layer_2_out], + optional_input_size: [0, 1], + layer_output: layer_3_out + } + )""" + layers, out_filters = add_fixed_pooling_layer( + 3, layers, out_filters, is_training) + layer_0_out, layer_1_out, layer_2_out, layer_3_out = layers[-4:] + """@nni.mutable_layers( + { + layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()], + fixed_inputs: [layer_3_out], + optional_inputs: [layer_0_out, 
layer_1_out, layer_2_out, layer_3_out], + optional_input_size: [0, 1], + layer_output: layer_4_out + }, + { + layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()], + fixed_inputs: [layer_4_out], + optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out], + optional_input_size: [0, 1], + layer_output: layer_5_out + }, + { + layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()], + fixed_inputs: [layer_5_out], + optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out], + optional_input_size: [0, 1], + layer_output: layer_6_out + }, + { + layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()], + fixed_inputs: [layer_6_out], + optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out], + optional_input_size: [0, 1], + layer_output: layer_7_out + } + )""" + layers, out_filters = add_fixed_pooling_layer( + 7, layers, out_filters, is_training) + layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out, layer_7_out = layers[ + -8:] + """@nni.mutable_layers( + { + layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()], + fixed_inputs: [layer_7_out], + optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out, layer_7_out], + optional_input_size: [0, 1], + layer_output: layer_8_out + }, + { + layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()], + fixed_inputs: [layer_8_out], + optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out, layer_7_out, layer_8_out], + optional_input_size: [0, 1], + layer_output: layer_9_out + }, + { + layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()], + fixed_inputs: [layer_9_out], + optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out, layer_7_out, layer_8_out, layer_9_out], + optional_input_size: [0, 1], + layer_output: layer_10_out + }, + { + layer_choice: [conv3(), conv3_sep(), conv5(), conv5_sep(), avg_pool(), max_pool()], + fixed_inputs:[layer_10_out], + optional_inputs: [layer_0_out, layer_1_out, layer_2_out, layer_3_out, layer_4_out, layer_5_out, layer_6_out, layer_7_out, layer_8_out, layer_9_out, layer_10_out], + optional_input_size: [0, 1], + layer_output: layer_11_out + } + )""" + + x = global_avg_pool(layer_11_out, data_format=self.data_format) + if is_training: + x = tf.nn.dropout(x, self.keep_prob) + with tf.variable_scope("fc"): + if self.data_format == "NHWC": + inp_c = x.get_shape()[3].value + elif self.data_format == "NCHW": + inp_c = x.get_shape()[1].value + else: + raise ValueError( + "Unknown data_format {0}".format(self.data_format)) + w = create_weight("w", [inp_c, 10]) + x = tf.matmul(x, w) + return x + + + # override + def _build_train(self): + print("-" * 80) + print("Build train graph") + logits = self._model(self.x_train, is_training=True) + log_probs = tf.nn.sparse_softmax_cross_entropy_with_logits( + logits=logits, labels=self.y_train) + self.loss = tf.reduce_mean(log_probs) + + self.train_preds = tf.argmax(logits, axis=1) + self.train_preds = tf.to_int32(self.train_preds) + self.train_acc = tf.equal(self.train_preds, self.y_train) + self.train_acc = tf.to_int32(self.train_acc) + self.train_acc = tf.reduce_sum(self.train_acc) + + tf_variables = [var + for var in 
tf.trainable_variables() if var.name.startswith(self.name)] + self.num_vars = count_model_params(tf_variables) + print("Model has {} params".format(self.num_vars)) + + self.global_step = tf.Variable( + 0, dtype=tf.int32, trainable=False, name="global_step") + + self.train_op, self.lr, self.grad_norm, self.optimizer = get_train_ops( + self.loss, + tf_variables, + self.global_step, + clip_mode=self.clip_mode, + grad_bound=self.grad_bound, + l2_reg=self.l2_reg, + lr_init=self.lr_init, + lr_dec_start=self.lr_dec_start, + lr_dec_every=self.lr_dec_every, + lr_dec_rate=self.lr_dec_rate, + lr_cosine=self.lr_cosine, + lr_max=self.lr_max, + lr_min=self.lr_min, + lr_T_0=self.lr_T_0, + lr_T_mul=self.lr_T_mul, + num_train_batches=self.num_train_batches, + optim_algo=self.optim_algo, + sync_replicas=False, + num_aggregate=self.num_aggregate, + num_replicas=self.num_replicas) + + # override + def _build_valid(self): + if self.x_valid is not None: + print("-" * 80) + print("Build valid graph") + logits = self._model(self.x_valid, False, reuse=True) + self.valid_preds = tf.argmax(logits, axis=1) + self.valid_preds = tf.to_int32(self.valid_preds) + self.valid_acc = tf.equal(self.valid_preds, self.y_valid) + self.valid_acc = tf.to_int32(self.valid_acc) + self.valid_acc = tf.reduce_sum(self.valid_acc) + + # override + def _build_test(self): + print("-" * 80) + print("Build test graph") + logits = self._model(self.x_test, False, reuse=True) + self.test_preds = tf.argmax(logits, axis=1) + self.test_preds = tf.to_int32(self.test_preds) + self.test_acc = tf.equal(self.test_preds, self.y_test) + self.test_acc = tf.to_int32(self.test_acc) + self.test_acc = tf.reduce_sum(self.test_acc) + + + def build_model(self): + + self._build_train() + self._build_valid() + self._build_test() diff --git a/examples/trials/nas_cifar10/src/cifar10/models.py b/examples/trials/nas_cifar10/src/cifar10/models.py new file mode 100644 index 0000000000..089fe846a6 --- /dev/null +++ b/examples/trials/nas_cifar10/src/cifar10/models.py @@ -0,0 +1,196 @@ +import os +import sys + +import numpy as np +import tensorflow as tf + + +class Model(object): + def __init__(self, + images, + labels, + cutout_size=None, + batch_size=32, + eval_batch_size=100, + clip_mode=None, + grad_bound=None, + l2_reg=1e-4, + lr_init=0.1, + lr_dec_start=0, + lr_dec_every=100, + lr_dec_rate=0.1, + keep_prob=1.0, + optim_algo=None, + sync_replicas=False, + num_aggregate=None, + num_replicas=None, + data_format="NHWC", + name="generic_model", + seed=None, + ): + """ + Args: + lr_dec_every: number of epochs to decay + """ + print("-" * 80) + print("Build model {}".format(name)) + + self.cutout_size = cutout_size + self.batch_size = batch_size + self.eval_batch_size = eval_batch_size + self.clip_mode = clip_mode + self.grad_bound = grad_bound + self.l2_reg = l2_reg + self.lr_init = lr_init + self.lr_dec_start = lr_dec_start + self.lr_dec_rate = lr_dec_rate + self.keep_prob = keep_prob + self.optim_algo = optim_algo + self.sync_replicas = sync_replicas + self.num_aggregate = num_aggregate + self.num_replicas = num_replicas + self.data_format = data_format + self.name = name + self.seed = seed + + self.global_step = None + self.valid_acc = None + self.test_acc = None + print("Build data ops") + with tf.device("/cpu:0"): + # training data + self.num_train_examples = np.shape(images["train"])[0] + + self.num_train_batches = ( + self.num_train_examples + self.batch_size - 1) // self.batch_size + x_train, y_train = tf.train.shuffle_batch( + [images["train"], labels["train"]], + 
batch_size=self.batch_size, + capacity=50000, + enqueue_many=True, + min_after_dequeue=0, + num_threads=16, + seed=self.seed, + allow_smaller_final_batch=True, + ) + self.lr_dec_every = lr_dec_every * self.num_train_batches + + def _pre_process(x): + x = tf.pad(x, [[4, 4], [4, 4], [0, 0]]) + x = tf.random_crop(x, [32, 32, 3], seed=self.seed) + x = tf.image.random_flip_left_right(x, seed=self.seed) + if self.cutout_size is not None: + mask = tf.ones( + [self.cutout_size, self.cutout_size], dtype=tf.int32) + start = tf.random_uniform( + [2], minval=0, maxval=32, dtype=tf.int32) + mask = tf.pad(mask, [[self.cutout_size + start[0], 32 - start[0]], + [self.cutout_size + start[1], 32 - start[1]]]) + mask = mask[self.cutout_size: self.cutout_size + 32, + self.cutout_size: self.cutout_size + 32] + mask = tf.reshape(mask, [32, 32, 1]) + mask = tf.tile(mask, [1, 1, 3]) + x = tf.where(tf.equal(mask, 0), x=x, y=tf.zeros_like(x)) + if self.data_format == "NCHW": + x = tf.transpose(x, [2, 0, 1]) + + return x + self.x_train = tf.map_fn(_pre_process, x_train, back_prop=False) + self.y_train = y_train + + # valid data + self.x_valid, self.y_valid = None, None + if images["valid"] is not None: + images["valid_original"] = np.copy(images["valid"]) + labels["valid_original"] = np.copy(labels["valid"]) + if self.data_format == "NCHW": + images["valid"] = tf.transpose( + images["valid"], [0, 3, 1, 2]) + self.num_valid_examples = np.shape(images["valid"])[0] + self.num_valid_batches = ( + (self.num_valid_examples + self.eval_batch_size - 1) + // self.eval_batch_size) + self.x_valid, self.y_valid = tf.train.batch( + [images["valid"], labels["valid"]], + batch_size=self.eval_batch_size, + capacity=5000, + enqueue_many=True, + num_threads=1, + allow_smaller_final_batch=True, + ) + + # test data + if self.data_format == "NCHW": + images["test"] = tf.transpose(images["test"], [0, 3, 1, 2]) + self.num_test_examples = np.shape(images["test"])[0] + self.num_test_batches = ( + (self.num_test_examples + self.eval_batch_size - 1) + // self.eval_batch_size) + self.x_test, self.y_test = tf.train.batch( + [images["test"], labels["test"]], + batch_size=self.eval_batch_size, + capacity=10000, + enqueue_many=True, + num_threads=1, + allow_smaller_final_batch=True, + ) + + # cache images and labels + self.images = images + self.labels = labels + + def eval_once(self, sess, eval_set, child_model, verbose=False): + """Expects self.acc and self.global_step to be defined. + + Args: + sess: tf.Session() or one of its wrap arounds. + feed_dict: can be used to give more information to sess.run(). 
+ eval_set: "valid" or "test" + """ + + assert self.global_step is not None + global_step = sess.run(self.global_step) + print("Eval at {}".format(global_step)) + + if eval_set == "valid": + assert self.x_valid is not None + assert self.valid_acc is not None + num_examples = self.num_valid_examples + num_batches = self.num_valid_batches + acc_op = self.valid_acc + elif eval_set == "test": + assert self.test_acc is not None + num_examples = self.num_test_examples + num_batches = self.num_test_batches + acc_op = self.test_acc + else: + raise NotImplementedError("Unknown eval_set '{}'".format(eval_set)) + + total_acc = 0 + total_exp = 0 + + for batch_id in range(num_batches): + acc = sess.run(acc_op) + + total_acc += acc + total_exp += self.eval_batch_size + if verbose: + sys.stdout.write( + "\r{:<5d}/{:>5d}".format(total_acc, total_exp)) + if verbose: + print("") + print("{}_accuracy: {:<6.4f}".format( + eval_set, float(total_acc) / total_exp)) + return float(total_acc) / total_exp + + def _model(self, images, is_training, reuse=None): + raise NotImplementedError("Abstract method") + + def _build_train(self): + raise NotImplementedError("Abstract method") + + def _build_valid(self): + raise NotImplementedError("Abstract method") + + def _build_test(self): + raise NotImplementedError("Abstract method") diff --git a/examples/trials/nas_cifar10/src/cifar10/nni_child_cifar10.py b/examples/trials/nas_cifar10/src/cifar10/nni_child_cifar10.py new file mode 100644 index 0000000000..d5aed489ee --- /dev/null +++ b/examples/trials/nas_cifar10/src/cifar10/nni_child_cifar10.py @@ -0,0 +1,163 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import os +import shutil +import logging +import tensorflow as tf +from src.cifar10.data_utils import read_data +from src.cifar10.general_child import GeneralChild +import src.cifar10_flags +from src.cifar10_flags import FLAGS + + +def build_logger(log_name): + logger = logging.getLogger(log_name) + logger.setLevel(logging.DEBUG) + fh = logging.FileHandler(log_name+'.log') + fh.setLevel(logging.DEBUG) + logger.addHandler(fh) + return logger + + +logger = build_logger("nni_child_cifar10") + + +def build_trial(images, labels, ChildClass): + '''Build child class''' + child_model = ChildClass( + images, + labels, + use_aux_heads=FLAGS.child_use_aux_heads, + cutout_size=FLAGS.child_cutout_size, + whole_channels=FLAGS.controller_search_whole_channels, + num_layers=FLAGS.child_num_layers, + num_cells=FLAGS.child_num_cells, + num_branches=FLAGS.child_num_branches, + fixed_arc=FLAGS.child_fixed_arc, + out_filters_scale=FLAGS.child_out_filters_scale, + out_filters=FLAGS.child_out_filters, + keep_prob=FLAGS.child_keep_prob, + drop_path_keep_prob=FLAGS.child_drop_path_keep_prob, + num_epochs=FLAGS.num_epochs, + l2_reg=FLAGS.child_l2_reg, + data_format=FLAGS.data_format, + batch_size=FLAGS.batch_size, + clip_mode="norm", + grad_bound=FLAGS.child_grad_bound, + lr_init=FLAGS.child_lr, + lr_dec_every=FLAGS.child_lr_dec_every, + lr_dec_rate=FLAGS.child_lr_dec_rate, + lr_cosine=FLAGS.child_lr_cosine, + lr_max=FLAGS.child_lr_max, + lr_min=FLAGS.child_lr_min, + lr_T_0=FLAGS.child_lr_T_0, + lr_T_mul=FLAGS.child_lr_T_mul, + optim_algo="momentum", + sync_replicas=FLAGS.child_sync_replicas, + num_aggregate=FLAGS.child_num_aggregate, + num_replicas=FLAGS.child_num_replicas + ) + + return child_model + + +def get_child_ops(child_model): + '''Assemble child op to a dict''' + child_ops = { + "global_step": child_model.global_step, 
+ "loss": child_model.loss, + "train_op": child_model.train_op, + "lr": child_model.lr, + "grad_norm": child_model.grad_norm, + "train_acc": child_model.train_acc, + "optimizer": child_model.optimizer, + "num_train_batches": child_model.num_train_batches, + "eval_every": child_model.num_train_batches * FLAGS.eval_every_epochs, + "eval_func": child_model.eval_once, + } + return child_ops + + +class NASTrial(): + + def __init__(self): + images, labels = read_data(FLAGS.data_path, num_valids=0) + + self.output_dir = os.path.join(os.getenv('NNI_OUTPUT_DIR'), '../..') + self.file_path = os.path.join( + self.output_dir, 'trainable_variable.txt') + + self.graph = tf.Graph() + with self.graph.as_default(): + self.child_model = build_trial(images, labels, GeneralChild) + + self.total_data = {} + + self.child_model.build_model() + self.child_ops = get_child_ops(self.child_model) + config = tf.ConfigProto( + intra_op_parallelism_threads=0, + inter_op_parallelism_threads=0, + allow_soft_placement=True) + + self.sess = tf.train.SingularMonitoredSession(config=config) + + logger.debug('initlize NASTrial done.') + + def run_one_step(self): + '''Run this model on a batch of data''' + run_ops = [ + self.child_ops["loss"], + self.child_ops["lr"], + self.child_ops["grad_norm"], + self.child_ops["train_acc"], + self.child_ops["train_op"], + ] + loss, lr, gn, tr_acc, _ = self.sess.run(run_ops) + global_step = self.sess.run(self.child_ops["global_step"]) + log_string = "" + log_string += "ch_step={:<6d}".format(global_step) + log_string += " loss={:<8.6f}".format(loss) + log_string += " lr={:<8.4f}".format(lr) + log_string += " |g|={:<8.4f}".format(gn) + log_string += " tr_acc={:<3d}/{:>3d}".format(tr_acc, FLAGS.batch_size) + if int(global_step) % FLAGS.log_every == 0: + logger.debug(log_string) + return loss, global_step + + def run(self): + '''Run this model according to the `epoch` set in FALGS''' + max_acc = 0 + while True: + _, global_step = self.run_one_step() + if global_step % self.child_ops['num_train_batches'] == 0: + acc = self.child_ops["eval_func"]( + self.sess, "test", self.child_model) + max_acc = max(max_acc, acc) + '''@nni.report_intermediate_result(acc)''' + if global_step / self.child_ops['num_train_batches'] >= FLAGS.num_epochs: + '''@nni.report_final_result(max_acc)''' + break + + +def main(_): + logger.debug("-" * 80) + + if not os.path.isdir(FLAGS.output_dir): + logger.debug( + "Path {} does not exist. Creating.".format(FLAGS.output_dir)) + os.makedirs(FLAGS.output_dir) + elif FLAGS.reset_output_dir: + logger.debug( + "Path {} exists. 
Remove and remake.".format(FLAGS.output_dir)) + shutil.rmtree(FLAGS.output_dir) + os.makedirs(FLAGS.output_dir) + logger.debug("-" * 80) + trial = NASTrial() + + trial.run() + + +if __name__ == "__main__": + tf.app.run() diff --git a/examples/trials/nas_cifar10/src/cifar10_flags.py b/examples/trials/nas_cifar10/src/cifar10_flags.py new file mode 100644 index 0000000000..338fcea8f8 --- /dev/null +++ b/examples/trials/nas_cifar10/src/cifar10_flags.py @@ -0,0 +1,48 @@ +import tensorflow as tf +from src.utils import DEFINE_boolean +from src.utils import DEFINE_float +from src.utils import DEFINE_integer +from src.utils import DEFINE_string +flags = tf.app.flags +FLAGS = flags.FLAGS + +DEFINE_boolean("reset_output_dir", False, "Delete output_dir if exists.") +DEFINE_string("data_path", "", "") +DEFINE_string("output_dir", "", "") +DEFINE_string("data_format", "NHWC", "'NHWC' or 'NCWH'") +DEFINE_string("search_for", None, "Must be [macro|micro]") +DEFINE_integer("train_data_size", 45000, "") +DEFINE_integer("batch_size", 32, "") + +DEFINE_integer("num_epochs", 300, "") +DEFINE_integer("child_lr_dec_every", 100, "") +DEFINE_integer("child_num_layers", 5, "") +DEFINE_integer("child_num_cells", 5, "") +DEFINE_integer("child_filter_size", 5, "") +DEFINE_integer("child_out_filters", 48, "") +DEFINE_integer("child_out_filters_scale", 1, "") +DEFINE_integer("child_num_branches", 4, "") +DEFINE_integer("child_num_aggregate", None, "") +DEFINE_integer("child_num_replicas", 1, "") +DEFINE_integer("child_block_size", 3, "") +DEFINE_integer("child_lr_T_0", None, "for lr schedule") +DEFINE_integer("child_lr_T_mul", None, "for lr schedule") +DEFINE_integer("child_cutout_size", None, "CutOut size") +DEFINE_float("child_grad_bound", 5.0, "Gradient clipping") +DEFINE_float("child_lr", 0.1, "") +DEFINE_float("child_lr_dec_rate", 0.1, "") +DEFINE_float("child_keep_prob", 0.5, "") +DEFINE_float("child_drop_path_keep_prob", 1.0, "minimum drop_path_keep_prob") +DEFINE_float("child_l2_reg", 1e-4, "") +DEFINE_float("child_lr_max", None, "for lr schedule") +DEFINE_float("child_lr_min", None, "for lr schedule") +DEFINE_string("child_skip_pattern", None, "Must be ['dense', None]") +DEFINE_string("child_fixed_arc", None, "") +DEFINE_boolean("child_use_aux_heads", False, "Should we use an aux head") +DEFINE_boolean("child_sync_replicas", False, "To sync or not to sync.") +DEFINE_boolean("child_lr_cosine", False, "Use cosine lr schedule") +DEFINE_integer("controller_train_steps", 50, "") +DEFINE_boolean("controller_search_whole_channels", False, "") +DEFINE_integer("controller_num_aggregate", 1, "") +DEFINE_integer("log_every", 50, "How many steps to log") +DEFINE_integer("eval_every_epochs", 1, "How many epochs to eval") diff --git a/examples/trials/nas_cifar10/src/common_ops.py b/examples/trials/nas_cifar10/src/common_ops.py new file mode 100644 index 0000000000..e0933f6e53 --- /dev/null +++ b/examples/trials/nas_cifar10/src/common_ops.py @@ -0,0 +1,255 @@ +import numpy as np +import tensorflow as tf +from tensorflow.python.training import moving_averages + + +def lstm(x, prev_c, prev_h, w): + ifog = tf.matmul(tf.concat([x, prev_h], axis=1), w) + i, f, o, g = tf.split(ifog, 4, axis=1) + i = tf.sigmoid(i) + f = tf.sigmoid(f) + o = tf.sigmoid(o) + g = tf.tanh(g) + next_c = i * g + f * prev_c + next_h = o * tf.tanh(next_c) + return next_c, next_h + + +def stack_lstm(x, prev_c, prev_h, w): + next_c, next_h = [], [] + for layer_id, (_c, _h, _w) in enumerate(zip(prev_c, prev_h, w)): + inputs = x if layer_id == 0 else next_h[-1] 
+ curr_c, curr_h = lstm(inputs, _c, _h, _w) + next_c.append(curr_c) + next_h.append(curr_h) + return next_c, next_h + + +def create_weight(name, shape, initializer=None, trainable=True, seed=None): + if initializer is None: + initializer = tf.contrib.keras.initializers.he_normal(seed=seed) + return tf.get_variable(name, shape, initializer=initializer, trainable=trainable) + + +def create_bias(name, shape, initializer=None): + if initializer is None: + initializer = tf.constant_initializer(0.0, dtype=tf.float32) + return tf.get_variable(name, shape, initializer=initializer) + + +def conv_op(inputs, filter_size, is_training, count, out_filters, + data_format, ch_mul=1, start_idx=None, separable=False): + """ + Args: + start_idx: where to start taking the output channels. if None, assuming + fixed_arc mode + count: how many output_channels to take. + """ + + if data_format == "NHWC": + inp_c = inputs.get_shape()[3].value + elif data_format == "NCHW": + inp_c = inputs.get_shape()[1].value + + with tf.variable_scope("inp_conv_1"): + w = create_weight("w", [1, 1, inp_c, out_filters]) + x = tf.nn.conv2d(inputs, w, [1, 1, 1, 1], + "SAME", data_format=data_format) + x = batch_norm(x, is_training, data_format=data_format) + x = tf.nn.relu(x) + + with tf.variable_scope("out_conv_{}".format(filter_size)): + if start_idx is None: + if separable: + w_depth = create_weight( + "w_depth", [filter_size, filter_size, out_filters, ch_mul]) + w_point = create_weight( + "w_point", [1, 1, out_filters * ch_mul, count]) + x = tf.nn.separable_conv2d(x, w_depth, w_point, strides=[1, 1, 1, 1], + padding="SAME", data_format=data_format) + x = batch_norm( + x, is_training, data_format=data_format) + else: + w = create_weight( + "w", [filter_size, filter_size, inp_c, count]) + x = tf.nn.conv2d( + x, w, [1, 1, 1, 1], "SAME", data_format=data_format) + x = batch_norm( + x, is_training, data_format=data_format) + else: + if separable: + w_depth = create_weight( + "w_depth", [filter_size, filter_size, out_filters, ch_mul]) + #test_depth = w_depth + w_point = create_weight( + "w_point", [out_filters, out_filters * ch_mul]) + w_point = w_point[start_idx:start_idx+count, :] + w_point = tf.transpose(w_point, [1, 0]) + w_point = tf.reshape( + w_point, [1, 1, out_filters * ch_mul, count]) + + x = tf.nn.separable_conv2d(x, w_depth, w_point, strides=[1, 1, 1, 1], + padding="SAME", data_format=data_format) + mask = tf.range(0, out_filters, dtype=tf.int32) + mask = tf.logical_and( + start_idx <= mask, mask < start_idx + count) + x = batch_norm_with_mask( + x, is_training, mask, out_filters, data_format=data_format) + else: + w = create_weight( + "w", [filter_size, filter_size, out_filters, out_filters]) + w = tf.transpose(w, [3, 0, 1, 2]) + w = w[start_idx:start_idx+count, :, :, :] + w = tf.transpose(w, [1, 2, 3, 0]) + x = tf.nn.conv2d( + x, w, [1, 1, 1, 1], "SAME", data_format=data_format) + mask = tf.range(0, out_filters, dtype=tf.int32) + mask = tf.logical_and( + start_idx <= mask, mask < start_idx + count) + x = batch_norm_with_mask( + x, is_training, mask, out_filters, data_format=data_format) + x = tf.nn.relu(x) + return x + +def pool_op(inputs, is_training, count, out_filters, avg_or_max, data_format, start_idx=None): + """ + Args: + start_idx: where to start taking the output channels. if None, assuming + fixed_arc mode + count: how many output_channels to take. 
+ """ + + if data_format == "NHWC": + inp_c = inputs.get_shape()[3].value + elif data_format == "NCHW": + inp_c = inputs.get_shape()[1].value + + with tf.variable_scope("conv_1"): + w = create_weight("w", [1, 1, inp_c, out_filters]) + x = tf.nn.conv2d(inputs, w, [1, 1, 1, 1], + "SAME", data_format=data_format) + x = batch_norm(x, is_training, data_format=data_format) + x = tf.nn.relu(x) + + with tf.variable_scope("pool"): + if data_format == "NHWC": + actual_data_format = "channels_last" + elif data_format == "NCHW": + actual_data_format = "channels_first" + + if avg_or_max == "avg": + x = tf.layers.average_pooling2d( + x, [3, 3], [1, 1], "SAME", data_format=actual_data_format) + elif avg_or_max == "max": + x = tf.layers.max_pooling2d( + x, [3, 3], [1, 1], "SAME", data_format=actual_data_format) + else: + raise ValueError("Unknown pool {}".format(avg_or_max)) + + if start_idx is not None: + if data_format == "NHWC": + x = x[:, :, :, start_idx: start_idx+count] + elif data_format == "NCHW": + x = x[:, start_idx: start_idx+count, :, :] + + return x + + +def global_avg_pool(x, data_format="NHWC"): + if data_format == "NHWC": + x = tf.reduce_mean(x, [1, 2]) + elif data_format == "NCHW": + x = tf.reduce_mean(x, [2, 3]) + else: + raise NotImplementedError("Unknown data_format {}".format(data_format)) + return x + + +def batch_norm(x, is_training, name="bn", decay=0.9, epsilon=1e-5, + data_format="NHWC"): + if data_format == "NHWC": + shape = [x.get_shape()[3]] + elif data_format == "NCHW": + shape = [x.get_shape()[1]] + else: + raise NotImplementedError("Unknown data_format {}".format(data_format)) + + with tf.variable_scope(name, reuse=None if is_training else True): + offset = tf.get_variable( + "offset", shape, + initializer=tf.constant_initializer(0.0, dtype=tf.float32)) + scale = tf.get_variable( + "scale", shape, + initializer=tf.constant_initializer(1.0, dtype=tf.float32)) + moving_mean = tf.get_variable( + "moving_mean", shape, trainable=False, + initializer=tf.constant_initializer(0.0, dtype=tf.float32)) + moving_variance = tf.get_variable( + "moving_variance", shape, trainable=False, + initializer=tf.constant_initializer(1.0, dtype=tf.float32)) + + if is_training: + x, mean, variance = tf.nn.fused_batch_norm( + x, scale, offset, epsilon=epsilon, data_format=data_format, + is_training=True) + update_mean = moving_averages.assign_moving_average( + moving_mean, mean, decay) + update_variance = moving_averages.assign_moving_average( + moving_variance, variance, decay) + with tf.control_dependencies([update_mean, update_variance]): + x = tf.identity(x) + else: + x, _, _ = tf.nn.fused_batch_norm(x, scale, offset, mean=moving_mean, + variance=moving_variance, + epsilon=epsilon, data_format=data_format, + is_training=False) + return x + + +def batch_norm_with_mask(x, is_training, mask, num_channels, name="bn", + decay=0.9, epsilon=1e-3, data_format="NHWC"): + + shape = [num_channels] + indices = tf.where(mask) + indices = tf.to_int32(indices) + indices = tf.reshape(indices, [-1]) + + with tf.variable_scope(name, reuse=None if is_training else True): + offset = tf.get_variable( + "offset", shape, + initializer=tf.constant_initializer(0.0, dtype=tf.float32)) + scale = tf.get_variable( + "scale", shape, + initializer=tf.constant_initializer(1.0, dtype=tf.float32)) + offset = tf.boolean_mask(offset, mask) + scale = tf.boolean_mask(scale, mask) + + moving_mean = tf.get_variable( + "moving_mean", shape, trainable=False, + initializer=tf.constant_initializer(0.0, dtype=tf.float32)) + moving_variance 
= tf.get_variable( + "moving_variance", shape, trainable=False, + initializer=tf.constant_initializer(1.0, dtype=tf.float32)) + + if is_training: + x, mean, variance = tf.nn.fused_batch_norm( + x, scale, offset, epsilon=epsilon, data_format=data_format, + is_training=True) + mean = (1.0 - decay) * (tf.boolean_mask(moving_mean, mask) - mean) + variance = (1.0 - decay) * \ + (tf.boolean_mask(moving_variance, mask) - variance) + update_mean = tf.scatter_sub( + moving_mean, indices, mean, use_locking=True) + update_variance = tf.scatter_sub( + moving_variance, indices, variance, use_locking=True) + with tf.control_dependencies([update_mean, update_variance]): + x = tf.identity(x) + else: + masked_moving_mean = tf.boolean_mask(moving_mean, mask) + masked_moving_variance = tf.boolean_mask(moving_variance, mask) + x, _, _ = tf.nn.fused_batch_norm(x, scale, offset, + mean=masked_moving_mean, + variance=masked_moving_variance, + epsilon=epsilon, data_format=data_format, + is_training=False) + return x diff --git a/examples/trials/nas_cifar10/src/utils.py b/examples/trials/nas_cifar10/src/utils.py new file mode 100644 index 0000000000..65d57af7f1 --- /dev/null +++ b/examples/trials/nas_cifar10/src/utils.py @@ -0,0 +1,262 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import sys +import numpy as np +import tensorflow as tf + + +user_flags = [] + + +def DEFINE_string(name, default_value, doc_string): + tf.app.flags.DEFINE_string(name, default_value, doc_string) + global user_flags + user_flags.append(name) + + +def DEFINE_integer(name, default_value, doc_string): + tf.app.flags.DEFINE_integer(name, default_value, doc_string) + global user_flags + user_flags.append(name) + + +def DEFINE_float(name, default_value, doc_string): + tf.app.flags.DEFINE_float(name, default_value, doc_string) + global user_flags + user_flags.append(name) + + +def DEFINE_boolean(name, default_value, doc_string): + tf.app.flags.DEFINE_boolean(name, default_value, doc_string) + global user_flags + user_flags.append(name) + + +def print_user_flags(line_limit=80): + print("-" * 80) + + global user_flags + FLAGS = tf.app.flags.FLAGS + + for flag_name in sorted(user_flags): + value = "{}".format(getattr(FLAGS, flag_name)) + log_string = flag_name + log_string += "." 
* (line_limit - len(flag_name) - len(value)) + log_string += value + print(log_string) + + +def get_C(x, data_format): + """ + Args: + x: tensor of shape [N, H, W, C] or [N, C, H, W] + """ + if data_format == "NHWC": + return x.get_shape()[3].value + elif data_format == "NCHW": + return x.get_shape()[1].value + else: + raise ValueError( + "Unknown data_format '{0}'".format(data_format)) + +def get_HW(x, data_format): + """ + Args: + x: tensor of shape [N, H, W, C] or [N, C, H, W] + """ + return x.get_shape()[2].value + +def get_strides(stride, data_format): + """ + Args: + x: tensor of shape [N, H, W, C] or [N, C, H, W] + """ + if data_format == "NHWC": + return [1, stride, stride, 1] + elif data_format == "NCHW": + return [1, 1, stride, stride] + else: + raise ValueError( + "Unknown data_format '{0}'".format(data_format)) + + +class TextColors: + HEADER = '\033[95m' + OKBLUE = '\033[94m' + OKGREEN = '\033[92m' + WARNING = '\033[93m' + FAIL = '\033[91m' + ENDC = '\033[0m' + BOLD = '\033[1m' + UNDERLINE = '\033[4m' + + +class Logger(object): + def __init__(self, output_file): + self.terminal = sys.stdout + self.log = open(output_file, "a") + + def write(self, message): + self.terminal.write(message) + self.terminal.flush() + self.log.write(message) + self.log.flush() + + +def count_model_params(tf_variables): + """ + Args: + tf_variables: list of all model variables + """ + + num_vars = 0 + for var in tf_variables: + num_vars += np.prod([dim.value for dim in var.get_shape()]) + return num_vars + + +def get_train_ops( + loss, + tf_variables, + train_step, + clip_mode=None, + grad_bound=None, + l2_reg=1e-4, + lr_warmup_val=None, + lr_warmup_steps=100, + lr_init=0.1, + lr_dec_start=0, + lr_dec_every=10000, + lr_dec_rate=0.1, + lr_dec_min=None, + lr_cosine=False, + lr_max=None, + lr_min=None, + lr_T_0=None, + lr_T_mul=None, + num_train_batches=None, + optim_algo=None, + sync_replicas=False, + num_aggregate=None, + num_replicas=None, + get_grad_norms=False, + moving_average=None): + """ + Args: + clip_mode: "global", "norm", or None. + moving_average: store the moving average of parameters + """ + + if l2_reg > 0: + l2_losses = [] + for var in tf_variables: + l2_losses.append(tf.reduce_sum(var ** 2)) + l2_loss = tf.add_n(l2_losses) + loss += l2_reg * l2_loss + + grads = tf.gradients(loss, tf_variables) + grad_norm = tf.global_norm(grads) + + grad_norms = {} + for v, g in zip(tf_variables, grads): + if v is None or g is None: + continue + if isinstance(g, tf.IndexedSlices): + grad_norms[v.name] = tf.sqrt(tf.reduce_sum(g.values ** 2)) + else: + grad_norms[v.name] = tf.sqrt(tf.reduce_sum(g ** 2)) + + if clip_mode is not None: + assert grad_bound is not None, "Need grad_bound to clip gradients." 
+ if clip_mode == "global": + grads, _ = tf.clip_by_global_norm(grads, grad_bound) + elif clip_mode == "norm": + clipped = [] + for g in grads: + if isinstance(g, tf.IndexedSlices): + c_g = tf.clip_by_norm(g.values, grad_bound) + c_g = tf.IndexedSlices(g.indices, c_g) + else: + c_g = tf.clip_by_norm(g, grad_bound) + clipped.append(g) + grads = clipped + else: + raise NotImplementedError("Unknown clip_mode {}".format(clip_mode)) + + if lr_cosine: + assert lr_max is not None, "Need lr_max to use lr_cosine" + assert lr_min is not None, "Need lr_min to use lr_cosine" + assert lr_T_0 is not None, "Need lr_T_0 to use lr_cosine" + assert lr_T_mul is not None, "Need lr_T_mul to use lr_cosine" + assert num_train_batches is not None, ("Need num_train_batches to use" + " lr_cosine") + + curr_epoch = train_step // num_train_batches + + last_reset = tf.Variable(0, dtype=tf.int32, trainable=False, + name="last_reset") + T_i = tf.Variable(lr_T_0, dtype=tf.int32, trainable=False, name="T_i") + T_curr = curr_epoch - last_reset + + def _update(): + update_last_reset = tf.assign( + last_reset, curr_epoch, use_locking=True) + update_T_i = tf.assign(T_i, T_i * lr_T_mul, use_locking=True) + with tf.control_dependencies([update_last_reset, update_T_i]): + rate = tf.to_float(T_curr) / tf.to_float(T_i) * 3.1415926 + lr = lr_min + 0.5 * (lr_max - lr_min) * (1.0 + tf.cos(rate)) + return lr + + def _no_update(): + rate = tf.to_float(T_curr) / tf.to_float(T_i) * 3.1415926 + lr = lr_min + 0.5 * (lr_max - lr_min) * (1.0 + tf.cos(rate)) + return lr + + learning_rate = tf.cond( + tf.greater_equal(T_curr, T_i), _update, _no_update) + else: + learning_rate = tf.train.exponential_decay( + lr_init, tf.maximum(train_step - lr_dec_start, 0), lr_dec_every, + lr_dec_rate, staircase=True) + if lr_dec_min is not None: + learning_rate = tf.maximum(learning_rate, lr_dec_min) + + if lr_warmup_val is not None: + learning_rate = tf.cond(tf.less(train_step, lr_warmup_steps), + lambda: lr_warmup_val, lambda: learning_rate) + + if optim_algo == "momentum": + opt = tf.train.MomentumOptimizer( + learning_rate, 0.9, use_locking=True, use_nesterov=True) + elif optim_algo == "sgd": + opt = tf.train.GradientDescentOptimizer( + learning_rate, use_locking=True) + elif optim_algo == "adam": + opt = tf.train.AdamOptimizer(learning_rate, beta1=0.0, epsilon=1e-3, + use_locking=True) + else: + raise ValueError("Unknown optim_algo {}".format(optim_algo)) + + if sync_replicas: + assert num_aggregate is not None, "Need num_aggregate to sync." + assert num_replicas is not None, "Need num_replicas to sync." + + opt = tf.train.SyncReplicasOptimizer( + opt, + replicas_to_aggregate=num_aggregate, + total_num_replicas=num_replicas, + use_locking=True) + + if moving_average is not None: + opt = tf.contrib.opt.MovingAverageOptimizer( + opt, average_decay=moving_average) + + train_op = opt.apply_gradients( + zip(grads, tf_variables), global_step=train_step) + + if get_grad_norms: + return train_op, learning_rate, grad_norm, opt, grad_norms + else: + return train_op, learning_rate, grad_norm, opt
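Usage note: the config files added above are launched with NNI's `nnictl` CLI. Below is a minimal end-to-end sketch, assuming NNI is installed and the commands are run from `examples/trials/nas_cifar10`; the `cifar-10-batches-py` directory name is simply what the official CIFAR-10 archive extracts to, matching the `data/cifar10` path expected by `macro_cifar10.sh`.

```bash
# Fetch CIFAR-10 and place it where macro_cifar10.sh expects it (data/cifar10).
cd data && . download.sh
tar xzf cifar-10-python.tar.gz && mv cifar-10-batches-py cifar10
cd ..

# Start a local experiment with the PPO tuner config added in this diff;
# use config_pai_ppo.yml instead to run the trials on OpenPAI.
nnictl create --config config_ppo.yml
```

Each trial then runs `macro_cifar10.sh`, whose search space is exposed to the PPO tuner through the `@nni.mutable_layers` annotations in `general_child.py` (hence `useAnnotation: true` in both configs).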