diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 15db63e0bff2..fa1302046474 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -794,6 +794,9 @@ def write_all_str(module_file, module_all_list): _NP_EXT_OP_PREFIX = '_npx_' _NP_EXT_OP_SUBMODULE_LIST = ['_image_', '_random_'] +_NP_EXT_OP_IMPLEMENTED_SET = {'_npx_softmax', '_npx_log_softmax', '_npx_masked_softmax', + '_npx_masked_log_softmax', '_npx_activation', + '_npx_batch_norm', '_npx_fully_connected'} _NP_INTERNAL_OP_PREFIX = '_npi_' @@ -855,7 +858,7 @@ def _init_np_op_module(root_module_name, np_module_name, mx_module_name, make_op elif np_module_name == 'numpy_extension': op_name_prefix = _NP_EXT_OP_PREFIX submodule_name_list = _NP_EXT_OP_SUBMODULE_LIST - op_implemented_set = set() + op_implemented_set = _NP_EXT_OP_IMPLEMENTED_SET elif np_module_name == 'numpy._internal': op_name_prefix = _NP_INTERNAL_OP_PREFIX submodule_name_list = [] diff --git a/python/mxnet/ndarray/numpy_extension/_api_internal.py b/python/mxnet/ndarray/numpy_extension/_api_internal.py new file mode 100644 index 000000000000..b7b2216b1f83 --- /dev/null +++ b/python/mxnet/ndarray/numpy_extension/_api_internal.py @@ -0,0 +1,24 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Namespace for numpy_extension api.""" + +from ..._ffi.function import _init_api + +__all__ = [] + +_init_api("_npx", "mxnet.ndarray.numpy_extension._api_internal") diff --git a/python/mxnet/ndarray/numpy_extension/_op.py b/python/mxnet/ndarray/numpy_extension/_op.py index 22738a0f1950..8ada24f77039 100644 --- a/python/mxnet/ndarray/numpy_extension/_op.py +++ b/python/mxnet/ndarray/numpy_extension/_op.py @@ -18,4 +18,403 @@ """Namespace for the operators not belonging to the official numpy package used in Gluon dispatched by F=ndarray module.""" -__all__ = [] +import numpy as _np +from .. import numpy as np # pylint: disable=reimported +from . import _api_internal +from ...util import set_module + + +__all__ = ['softmax', 'log_softmax', 'masked_softmax', 'masked_log_softmax', + 'activation', 'batch_norm', 'fully_connected'] + + +# pylint: disable=too-many-arguments +@set_module('mxnet.ndarray.numpy_extension') +def softmax(data, axis=-1, length=None, temperature=None, use_length=False, dtype=None): + r"""Applies the softmax function. + + The resulting array contains elements in the range (0,1) and the elements along the given axis sum up to 1. + + .. math:: + softmax(\mathbf{z/t})_j = \frac{e^{z_j/t}}{\sum_{k=1}^K e^{z_k/t}} + + for :math:`j = 1, ..., K` + + t is the temperature parameter in softmax function. By default, t equals 1.0 + + Parameters + ---------- + data : NDArray + The input array. + axis : int, optional, default='-1' + The axis along which to compute softmax. + length : NDArray + The length array. 
+ temperature : double or None, optional, default=None + Temperature parameter in softmax + dtype : {None, 'float16', 'float32', 'float64'},optional, default='None' + DType of the output in case this can't be inferred. Defaults to + the same as input's dtype if not defined (dtype=None). + use_length : boolean or None, optional, default=0 + Whether to use the length input as a mask over the data input. + + Returns + ------- + out : NDArray or list of NDArrays + The output of this function. + + Example + ------- + >>> data = np.ones((2, 3)) + >>> npx.softmax(data, axis=0) + array([[0.5, 0.5, 0.5], + [0.5, 0.5, 0.5]]) + >>> npx.softmax(data, axis=1) + array([[0.33333334, 0.33333334, 0.33333334], + [0.33333334, 0.33333334, 0.33333334]]) + """ + if dtype and not isinstance(dtype, str): + dtype = _np.dtype(dtype).name + if use_length: + assert length is not None, "Missing length input" + return _api_internal.softmax(data, length, axis, temperature, True, dtype) + else: + assert length is None, "Length input is not used" + return _api_internal.softmax(data, axis, temperature, False, dtype) + + +# pylint: disable=too-many-arguments +@set_module('mxnet.ndarray.numpy_extension') +def log_softmax(data, axis=-1, length=None, temperature=None, use_length=False, dtype=None): + r"""Computes the log softmax of the input. + This is equivalent to computing softmax followed by log. + + Parameters + ---------- + data : NDArray + The input array. + axis : int, optional, default='-1' + The axis along which to compute softmax. + length : NDArray + The length array. + temperature : double or None, optional, default=None + Temperature parameter in softmax + dtype : {None, 'float16', 'float32', 'float64'},optional, default='None' + DType of the output in case this can't be inferred. Defaults to + the same as input's dtype if not defined (dtype=None). + use_length : boolean or None, optional, default=0 + Whether to use the length input as a mask over the data input. + + Returns + ------- + out : NDArray or list of NDArrays + The output of this function. + + Examples + -------- + >>> data = np.array([1, 2, .1]) + >>> npx.log_softmax(data) + array([-1.4170278, -0.4170278, -2.3170278]) + >>> data = np.array([[1, 2, .1],[.1, 2, 1]]) + >>> npx.log_softmax(data, axis=0) + array([[-0.34115386, -0.6931472 , -1.2411538 ], + [-1.2411538 , -0.6931472 , -0.34115386]]) + """ + if dtype and not isinstance(dtype, str): + dtype = _np.dtype(dtype).name + if use_length: + assert length is not None, "Missing length input" + return _api_internal.log_softmax(data, length, axis, temperature, True, dtype) + else: + assert length is None, "Length input is not used" + return _api_internal.log_softmax(data, axis, temperature, False, dtype) + + +# pylint: disable=too-many-arguments +@set_module('mxnet.ndarray.numpy_extension') +def masked_softmax(data, mask, axis=-1, temperature=1.0, dtype=None): + r"""Applies the softmax function masking elements according to the mask provided + + Parameters + ---------- + data : NDArray + The input array. + mask : NDArray + Mask to apply. + axis : int, optional, default='-1' + The axis along which to compute softmax. + temperature : double or None, optional, default=None + Temperature parameter in softmax + dtype : {None, 'float16', 'float32', 'float64'},optional, default='None' + DType of the output in case this can't be inferred. Defaults to + the same as input's dtype if not defined (dtype=None). 
+ normalize : boolean or None, optional, default=1 + Whether to normalize input data x: x = x - max(x) + + Returns + ------- + out : NDArray or list of NDArrays + The output of this function. + + Examples + -------- + >>> data = np.arange(5) + >>> mask = np.array([1, 0, 1, 0, 1]) + >>> npx.masked_softmax(data, mask) + array([0.01587624, 0. , 0.11731042, 0. , 0.8668133 ]) + >>> data = np.arange(10).reshape((2, 5)) + >>> npx.masked_softmax(data, mask, axis=0) + array([[0.00669285, 0. , 0.00669285, 0. , 0.00669285], + [0.9933072 , 0. , 0.9933072 , 0. , 0.9933072 ]]) + """ + if mask is not None: + neg = -1e18 + if _np.dtype(dtype) == _np.float16: + neg = -1e4 + data = np.where(mask, data, neg) + logits = (softmax(data, axis=axis) / temperature) * mask + else: + logits = softmax(data, axis=axis) / temperature + return logits + + +# pylint: disable=too-many-arguments +@set_module('mxnet.ndarray.numpy_extension') +def masked_log_softmax(data, mask, axis=-1, temperature=1.0, dtype=None): + r"""Computes the masked log softmax of the input. + This is equivalent to computing masked softmax followed by log. + + Parameters + ---------- + data : NDArray + The input array. + mask : NDArray + Mask to apply. + axis : int, optional, default='-1' + The axis along which to compute softmax. + temperature : double or None, optional, default=None + Temperature parameter in softmax + dtype : {None, 'float16', 'float32', 'float64'},optional, default='None' + DType of the output in case this can't be inferred. Defaults to + the same as input's dtype if not defined (dtype=None). + normalize : boolean or None, optional, default=1 + Whether to normalize input data x: x = x - max(x) + + Returns + ------- + out : NDArray or list of NDArrays + The output of this function. + + Examples + -------- + >>> data = np.arange(5) + >>> mask = np.array([1, 0, 1, 0, 1]) + >>> npx.masked_log_softmax(data, mask) + array([-4.1429286 , -inf, -2.1429286 , -inf, -0.14292854]) + >>> data = np.arange(10).reshape((2, 5)) + >>> npx.masked_log_softmax(data, mask, axis=0) + array([[-5.0067153 , -inf, -5.0067153 , -inf, -5.0067153 ], + [-0.00671535, -inf, -0.00671535, -inf, -0.00671535]]) + """ + if mask is not None: + neg = -1e18 + inf = -_np.inf + if _np.dtype(dtype) == _np.float16: + neg = -1e4 + data = np.where(mask, data, neg) + logits = np.where(mask, log_softmax(data, axis=axis) / temperature, inf) + else: + logits = log_softmax(data, axis=axis) / temperature + return logits + + +# pylint: disable=too-many-arguments, unused-argument +@set_module('mxnet.ndarray.numpy_extension') +def activation(data, act_type='relu', **kwargs): + r"""Applies an activation function element-wise to the input. + + The following activation functions are supported: + + - `relu`: Rectified Linear Unit, :math:`y = max(x, 0)` + - `sigmoid`: :math:`y = \frac{1}{1 + exp(-x)}` + - `tanh`: Hyperbolic tangent, :math:`y = \frac{exp(x) - exp(-x)}{exp(x) + exp(-x)}` + - `softrelu`: Soft ReLU, or SoftPlus, :math:`y = log(1 + exp(x))` + - `softsign`: :math:`y = \frac{x}{1 + abs(x)}` + + Parameters + ---------- + data : NDArray + The input array. + act_type : {'relu', 'sigmoid', 'softrelu', 'softsign', 'tanh'}, required + Activation function to be applied. + + Returns + ------- + out : NDArray or list of NDArrays + The output of this function. 
+ """ + return _api_internal.activation(data, act_type) + + +# pylint: disable=too-many-arguments, unused-argument +@set_module('mxnet.ndarray.numpy_extension') +def batch_norm(x, gamma, beta, running_mean, running_var, eps=1e-3, momentum=0.9, + fix_gamma=True, use_global_stats=False, output_mean_var=False, axis=1, + cudnn_off=False, min_calib_range=None, max_calib_range=None, **kwargs): + r"""Batch normalization. + + Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as + well as offset ``beta``. + + Assume the input has more than one dimension and we normalize along axis 1. + We first compute the mean and variance along this axis: + + .. math:: + + data\_mean[i] = mean(data[:,i,:,...]) \\ + data\_var[i] = var(data[:,i,:,...]) + + Then compute the normalized output, which has the same shape as input, as following: + + .. math:: + + out[:,i,:,...] = \frac{data[:,i,:,...] - data\_mean[i]}{\sqrt{data\_var[i]+\epsilon}} * gamma[i] + beta[i] + + Both *mean* and *var* returns a scalar by treating the input as a vector. + + Assume the input has size *k* on axis 1, then both ``gamma`` and ``beta`` + have shape *(k,)*. If ``output_mean_var`` is set to be true, then outputs both ``data_mean`` and + the inverse of ``data_var``, which are needed for the backward pass. Note that gradient of these + two outputs are blocked. + + Besides the inputs and the outputs, this operator accepts two auxiliary + states, ``moving_mean`` and ``moving_var``, which are *k*-length + vectors. They are global statistics for the whole dataset, which are updated + by:: + + moving_mean = moving_mean * momentum + data_mean * (1 - momentum) + moving_var = moving_var * momentum + data_var * (1 - momentum) + + If ``use_global_stats`` is set to be true, then ``moving_mean`` and + ``moving_var`` are used instead of ``data_mean`` and ``data_var`` to compute + the output. It is often used during inference. + + The parameter ``axis`` specifies which axis of the input shape denotes + the 'channel' (separately normalized groups). The default is 1. Specifying -1 sets the channel + axis to be the last item in the input shape. + + Both ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is true, + then set ``gamma`` to 1 and its gradient to 0. + + .. Note:: + When ``fix_gamma`` is set to True, no sparse support is provided. If ``fix_gamma is`` set to False, + the sparse tensors will fallback. + + Parameters + ---------- + data : NDArray + Input data to batch normalization + gamma : NDArray + gamma array + beta : NDArray + beta array + moving_mean : NDArray + running mean of input + moving_var : NDArray + running variance of input + eps : double, optional, default=0.0010000000474974513 + Epsilon to prevent div 0. Must be no less than CUDNN_BN_MIN_EPSILON + defined in cudnn.h when using cudnn (usually 1e-5) + momentum : float, optional, default=0.899999976 + Momentum for moving average + fix_gamma : boolean, optional, default=1 + Fix gamma while training + use_global_stats : boolean, optional, default=0 + Whether use global moving statistics instead of local batch-norm. + This will force change batch-norm into a scale shift operator. 
+ output_mean_var : boolean, optional, default=0 + Output the mean and inverse std + axis : int, optional, default='1' + Specify which shape axis the channel is specified + cudnn_off : boolean, optional, default=0 + Do not select CUDNN operator, if available + min_calib_range : float or None, optional, default=None + The minimum scalar value in the form of float32 obtained through calibration. + If present, it will be used to by quantized batch norm op to calculate primitive scale. + Note: this calib_range is to calib bn output. + max_calib_range : float or None, optional, default=None + The maximum scalar value in the form of float32 obtained through calibration. + If present, it will be used to by quantized batch norm op to calculate primitive scale. + Note: this calib_range is to calib bn output. + + Returns + ------- + out : NDArray or list of NDArrays + The output of this function. + """ + return _api_internal.batch_norm(x, gamma, beta, running_mean, running_var, eps, momentum, + fix_gamma, use_global_stats, output_mean_var, axis, + cudnn_off, min_calib_range, max_calib_range) + + +# pylint: disable=too-many-arguments, unused-argument +@set_module('mxnet.ndarray.numpy_extension') +def fully_connected(x, weight, bias=None, num_hidden=None, + no_bias=True, flatten=True, **kwargs): + r"""Applies a linear transformation: :math:`Y = XW^T + b`. + + If ``flatten`` is set to be true, then the shapes are: + + - **data**: `(batch_size, x1, x2, ..., xn)` + - **weight**: `(num_hidden, x1 * x2 * ... * xn)` + - **bias**: `(num_hidden,)` + - **out**: `(batch_size, num_hidden)` + + If ``flatten`` is set to be false, then the shapes are: + + - **data**: `(x1, x2, ..., xn, input_dim)` + - **weight**: `(num_hidden, input_dim)` + - **bias**: `(num_hidden,)` + - **out**: `(x1, x2, ..., xn, num_hidden)` + + The learnable parameters include both ``weight`` and ``bias``. + + If ``no_bias`` is set to be true, then the ``bias`` term is ignored. + + .. Note:: + + The sparse support for FullyConnected is limited to forward evaluation with `row_sparse` + weight and bias, where the length of `weight.indices` and `bias.indices` must be equal + to `num_hidden`. This could be useful for model inference with `row_sparse` weights + trained with importance sampling or noise contrastive estimation. + + To compute linear transformation with 'csr' sparse data, sparse.dot is recommended instead + of sparse.FullyConnected. + + Parameters + ---------- + data : NDArray + Input data. + weight : NDArray + Weight matrix. + bias : NDArray + Bias parameter. + num_hidden : int, required + Number of hidden nodes of the output. + no_bias : boolean, optional, default=0 + Whether to disable bias parameter. + flatten : boolean, optional, default=1 + Whether to collapse all but the first axis of the input data tensor. + + Returns + ------- + out : NDArray or list of NDArrays + The output of this function. 
+ """ + assert num_hidden is not None, "Please provide number of hidden nodes" + if no_bias: + return _api_internal.fully_connected(x, weight, num_hidden, no_bias, flatten) + else: + assert bias is not None, "Missing bias parameter" + return _api_internal.fully_connected(x, weight, bias, num_hidden, + no_bias, flatten) diff --git a/python/mxnet/numpy_extension/_op.py b/python/mxnet/numpy_extension/_op.py index a995e480221a..d168af6b10aa 100644 --- a/python/mxnet/numpy_extension/_op.py +++ b/python/mxnet/numpy_extension/_op.py @@ -17,4 +17,371 @@ """Namespace for registering numpy_extension ops for imperative programming.""" -__all__ = [] +from ..ndarray import numpy_extension as _mx_nd_npx +from ..util import set_module + + +__all__ = ['softmax', 'log_softmax', 'masked_softmax', 'masked_log_softmax', + 'activation', 'batch_norm', 'fully_connected'] + + +# pylint: disable=too-many-arguments +@set_module('mxnet.numpy_extension') +def softmax(data, axis=-1, length=None, temperature=None, use_length=False, dtype=None): + r"""Applies the softmax function. + + The resulting array contains elements in the range (0,1) and the elements along the given axis sum up to 1. + + .. math:: + softmax(\mathbf{z/t})_j = \frac{e^{z_j/t}}{\sum_{k=1}^K e^{z_k/t}} + + for :math:`j = 1, ..., K` + + t is the temperature parameter in softmax function. By default, t equals 1.0 + + Parameters + ---------- + data : NDArray + The input array. + axis : int, optional, default='-1' + The axis along which to compute softmax. + length : NDArray + The length array. + temperature : double or None, optional, default=None + Temperature parameter in softmax + dtype : {None, 'float16', 'float32', 'float64'},optional, default='None' + DType of the output in case this can't be inferred. Defaults to + the same as input's dtype if not defined (dtype=None). + use_length : boolean or None, optional, default=0 + Whether to use the length input as a mask over the data input. + + Returns + ------- + out : NDArray or list of NDArrays + The output of this function. + + Example + ------- + >>> data = np.ones((2, 3)) + >>> npx.softmax(data, axis=0) + array([[0.5, 0.5, 0.5], + [0.5, 0.5, 0.5]]) + >>> npx.softmax(data, axis=1) + array([[0.33333334, 0.33333334, 0.33333334], + [0.33333334, 0.33333334, 0.33333334]]) + """ + return _mx_nd_npx.softmax(data, axis=axis, length=length, temperature=temperature, + use_length=use_length, dtype=dtype) + + +# pylint: disable=too-many-arguments +@set_module('mxnet.numpy_extension') +def log_softmax(data, axis=-1, length=None, temperature=None, use_length=False, dtype=None): + r"""Computes the log softmax of the input. + This is equivalent to computing softmax followed by log. + + Parameters + ---------- + data : NDArray + The input array. + axis : int, optional, default='-1' + The axis along which to compute softmax. + length : NDArray + The length array. + temperature : double or None, optional, default=None + Temperature parameter in softmax + dtype : {None, 'float16', 'float32', 'float64'},optional, default='None' + DType of the output in case this can't be inferred. Defaults to + the same as input's dtype if not defined (dtype=None). + use_length : boolean or None, optional, default=0 + Whether to use the length input as a mask over the data input. + + Returns + ------- + out : NDArray or list of NDArrays + The output of this function. 
+ + Examples + -------- + >>> data = np.array([1, 2, .1]) + >>> npx.log_softmax(data) + array([-1.4170278, -0.4170278, -2.3170278]) + >>> data = np.array([[1, 2, .1],[.1, 2, 1]]) + >>> npx.log_softmax(data, axis=0) + array([[-0.34115386, -0.6931472 , -1.2411538 ], + [-1.2411538 , -0.6931472 , -0.34115386]]) + """ + return _mx_nd_npx.log_softmax(data, axis=axis, length=length, temperature=temperature, + use_length=use_length, dtype=dtype) + + +# pylint: disable=too-many-arguments +@set_module('mxnet.numpy_extension') +def masked_softmax(data, mask, axis=-1, temperature=1.0, dtype=None): + r"""Applies the softmax function masking elements according to the mask provided + + Parameters + ---------- + data : NDArray + The input array. + mask : NDArray + Mask to apply. + axis : int, optional, default='-1' + The axis along which to compute softmax. + temperature : double or None, optional, default=None + Temperature parameter in softmax + dtype : {None, 'float16', 'float32', 'float64'},optional, default='None' + DType of the output in case this can't be inferred. Defaults to + the same as input's dtype if not defined (dtype=None). + normalize : boolean or None, optional, default=1 + Whether to normalize input data x: x = x - max(x) + + Returns + ------- + out : NDArray or list of NDArrays + The output of this function. + + Examples + -------- + >>> data = np.arange(5) + >>> mask = np.array([1, 0, 1, 0, 1]) + >>> npx.masked_softmax(data, mask) + array([0.01587624, 0. , 0.11731042, 0. , 0.8668133 ]) + >>> data = np.arange(10).reshape((2, 5)) + >>> npx.masked_softmax(data, mask, axis=0) + array([[0.00669285, 0. , 0.00669285, 0. , 0.00669285], + [0.9933072 , 0. , 0.9933072 , 0. , 0.9933072 ]]) + """ + return _mx_nd_npx.masked_softmax(data, mask, axis=axis, temperature=temperature, + dtype=dtype) + + +# pylint: disable=too-many-arguments +@set_module('mxnet.numpy_extension') +def masked_log_softmax(data, mask, axis=-1, temperature=1.0, dtype=None): + r"""Computes the masked log softmax of the input. + This is equivalent to computing masked softmax followed by log. + + Parameters + ---------- + data : NDArray + The input array. + mask : NDArray + Mask to apply. + axis : int, optional, default='-1' + The axis along which to compute softmax. + temperature : double or None, optional, default=None + Temperature parameter in softmax + dtype : {None, 'float16', 'float32', 'float64'},optional, default='None' + DType of the output in case this can't be inferred. Defaults to + the same as input's dtype if not defined (dtype=None). + normalize : boolean or None, optional, default=1 + Whether to normalize input data x: x = x - max(x) + + Returns + ------- + out : NDArray or list of NDArrays + The output of this function. + + Examples + -------- + >>> data = np.arange(5) + >>> mask = np.array([1, 0, 1, 0, 1]) + >>> npx.masked_log_softmax(data, mask) + array([-4.1429286 , -inf, -2.1429286 , -inf, -0.14292854]) + >>> data = np.arange(10).reshape((2, 5)) + >>> npx.masked_log_softmax(data, mask, axis=0) + array([[-5.0067153 , -inf, -5.0067153 , -inf, -5.0067153 ], + [-0.00671535, -inf, -0.00671535, -inf, -0.00671535]]) + """ + return _mx_nd_npx.masked_log_softmax(data, mask, axis=axis, temperature=temperature, + dtype=dtype) + + +# pylint: disable=too-many-arguments, unused-argument +@set_module('mxnet.numpy_extension') +def activation(data, act_type='relu', **kwargs): + r"""Applies an activation function element-wise to the input. 
+ + The following activation functions are supported: + + - `relu`: Rectified Linear Unit, :math:`y = max(x, 0)` + - `sigmoid`: :math:`y = \frac{1}{1 + exp(-x)}` + - `tanh`: Hyperbolic tangent, :math:`y = \frac{exp(x) - exp(-x)}{exp(x) + exp(-x)}` + - `softrelu`: Soft ReLU, or SoftPlus, :math:`y = log(1 + exp(x))` + - `softsign`: :math:`y = \frac{x}{1 + abs(x)}` + + Parameters + ---------- + data : NDArray + The input array. + act_type : {'relu', 'sigmoid', 'softrelu', 'softsign', 'tanh'}, required + Activation function to be applied. + + Returns + ------- + out : NDArray or list of NDArrays + The output of this function. + """ + return _mx_nd_npx.activation(data, act_type=act_type) + + +# pylint: disable=too-many-arguments, unused-argument +@set_module('mxnet.numpy_extension') +def batch_norm(x, gamma, beta, running_mean, running_var, eps=1e-3, momentum=0.9, + fix_gamma=True, use_global_stats=False, output_mean_var=False, axis=1, + cudnn_off=False, min_calib_range=None, max_calib_range=None, **kwargs): + r"""Batch normalization. + + Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as + well as offset ``beta``. + + Assume the input has more than one dimension and we normalize along axis 1. + We first compute the mean and variance along this axis: + + .. math:: + + data\_mean[i] = mean(data[:,i,:,...]) \\ + data\_var[i] = var(data[:,i,:,...]) + + Then compute the normalized output, which has the same shape as input, as following: + + .. math:: + + out[:,i,:,...] = \frac{data[:,i,:,...] - data\_mean[i]}{\sqrt{data\_var[i]+\epsilon}} * gamma[i] + beta[i] + + Both *mean* and *var* returns a scalar by treating the input as a vector. + + Assume the input has size *k* on axis 1, then both ``gamma`` and ``beta`` + have shape *(k,)*. If ``output_mean_var`` is set to be true, then outputs both ``data_mean`` and + the inverse of ``data_var``, which are needed for the backward pass. Note that gradient of these + two outputs are blocked. + + Besides the inputs and the outputs, this operator accepts two auxiliary + states, ``moving_mean`` and ``moving_var``, which are *k*-length + vectors. They are global statistics for the whole dataset, which are updated + by:: + + moving_mean = moving_mean * momentum + data_mean * (1 - momentum) + moving_var = moving_var * momentum + data_var * (1 - momentum) + + If ``use_global_stats`` is set to be true, then ``moving_mean`` and + ``moving_var`` are used instead of ``data_mean`` and ``data_var`` to compute + the output. It is often used during inference. + + The parameter ``axis`` specifies which axis of the input shape denotes + the 'channel' (separately normalized groups). The default is 1. Specifying -1 sets the channel + axis to be the last item in the input shape. + + Both ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is true, + then set ``gamma`` to 1 and its gradient to 0. + + .. Note:: + When ``fix_gamma`` is set to True, no sparse support is provided. If ``fix_gamma is`` set to False, + the sparse tensors will fallback. + + Parameters + ---------- + data : NDArray + Input data to batch normalization + gamma : NDArray + gamma array + beta : NDArray + beta array + moving_mean : NDArray + running mean of input + moving_var : NDArray + running variance of input + eps : double, optional, default=0.0010000000474974513 + Epsilon to prevent div 0. 
Must be no less than CUDNN_BN_MIN_EPSILON + defined in cudnn.h when using cudnn (usually 1e-5) + momentum : float, optional, default=0.899999976 + Momentum for moving average + fix_gamma : boolean, optional, default=1 + Fix gamma while training + use_global_stats : boolean, optional, default=0 + Whether use global moving statistics instead of local batch-norm. + This will force change batch-norm into a scale shift operator. + output_mean_var : boolean, optional, default=0 + Output the mean and inverse std + axis : int, optional, default='1' + Specify which shape axis the channel is specified + cudnn_off : boolean, optional, default=0 + Do not select CUDNN operator, if available + min_calib_range : float or None, optional, default=None + The minimum scalar value in the form of float32 obtained through calibration. + If present, it will be used to by quantized batch norm op to calculate primitive scale. + Note: this calib_range is to calib bn output. + max_calib_range : float or None, optional, default=None + The maximum scalar value in the form of float32 obtained through calibration. + If present, it will be used to by quantized batch norm op to calculate primitive scale. + Note: this calib_range is to calib bn output. + + Returns + ------- + out : NDArray or list of NDArrays + The output of this function. + """ + return _mx_nd_npx.batch_norm(x, gamma, beta, running_mean, running_var, eps=eps, + momentum=momentum, fix_gamma=fix_gamma, + use_global_stats=use_global_stats, + output_mean_var=output_mean_var, axis=axis, cudnn_off=cudnn_off, + min_calib_range=min_calib_range, max_calib_range=max_calib_range) + + +# pylint: disable=too-many-arguments, unused-argument +@set_module('mxnet.numpy_extension') +def fully_connected(x, weight, bias=None, num_hidden=None, + no_bias=True, flatten=True, **kwargs): + r"""Applies a linear transformation: :math:`Y = XW^T + b`. + + If ``flatten`` is set to be true, then the shapes are: + + - **data**: `(batch_size, x1, x2, ..., xn)` + - **weight**: `(num_hidden, x1 * x2 * ... * xn)` + - **bias**: `(num_hidden,)` + - **out**: `(batch_size, num_hidden)` + + If ``flatten`` is set to be false, then the shapes are: + + - **data**: `(x1, x2, ..., xn, input_dim)` + - **weight**: `(num_hidden, input_dim)` + - **bias**: `(num_hidden,)` + - **out**: `(x1, x2, ..., xn, num_hidden)` + + The learnable parameters include both ``weight`` and ``bias``. + + If ``no_bias`` is set to be true, then the ``bias`` term is ignored. + + .. Note:: + + The sparse support for FullyConnected is limited to forward evaluation with `row_sparse` + weight and bias, where the length of `weight.indices` and `bias.indices` must be equal + to `num_hidden`. This could be useful for model inference with `row_sparse` weights + trained with importance sampling or noise contrastive estimation. + + To compute linear transformation with 'csr' sparse data, sparse.dot is recommended instead + of sparse.FullyConnected. + + Parameters + ---------- + data : NDArray + Input data. + weight : NDArray + Weight matrix. + bias : NDArray + Bias parameter. + num_hidden : int, required + Number of hidden nodes of the output. + no_bias : boolean, optional, default=0 + Whether to disable bias parameter. + flatten : boolean, optional, default=1 + Whether to collapse all but the first axis of the input data tensor. + + Returns + ------- + out : NDArray or list of NDArrays + The output of this function. 
+ """ + return _mx_nd_npx.fully_connected(x, weight, bias, num_hidden=num_hidden, + no_bias=no_bias, flatten=flatten) diff --git a/src/api/operator/numpy_extension/npx_activation_op.cc b/src/api/operator/numpy_extension/npx_activation_op.cc new file mode 100644 index 000000000000..c072f6e9fc70 --- /dev/null +++ b/src/api/operator/numpy_extension/npx_activation_op.cc @@ -0,0 +1,69 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file npx_activation_op.cc + * \brief Implementation of the API of functions in src/operator/numpy_extension/npx_activation_op.cc + */ +#include +#include +#include "../utils.h" +#include "../../../operator/nn/activation-inl.h" + +namespace mxnet { + +inline int String2MXNetActType(const std::string& s) { + using namespace op; + if (s == "relu") { + return activation::kReLU; + } else if (s == "sigmoid") { + return activation::kSigmoid; + } else if (s == "tanh") { + return activation::kTanh; + } else if (s == "softrelu") { + return activation::kSoftReLU; + } else if (s == "softsign") { + return activation::kSoftSign; + } else { + LOG(FATAL) << "unknown activation type " << s; + } + LOG(FATAL) << "should not reach here "; + return 0; +} + +MXNET_REGISTER_API("_npx.activation") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + nnvm::NodeAttrs attrs; + const nnvm::Op* op = Op::Get("_npx_activation"); + op::ActivationParam param; + // act_type + param.act_type = String2MXNetActType(args[1].operator std::string()); + attrs.parsed = param; + attrs.op = op; + SetAttrDict(&attrs); + // inputs + NDArray* inputs[] = {args[0].operator NDArray*()}; + int num_inputs = 1; + int num_outputs = 0; + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs, &num_outputs, nullptr); + *ret = ndoutputs[0]; +}); + +} // namespace mxnet diff --git a/src/api/operator/numpy_extension/npx_batch_norm_op.cc b/src/api/operator/numpy_extension/npx_batch_norm_op.cc new file mode 100644 index 000000000000..dcf3ac4f0df7 --- /dev/null +++ b/src/api/operator/numpy_extension/npx_batch_norm_op.cc @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file npx_batch_norm_op.cc + * \brief Implementation of the API of functions in src/operator/numpy_extension/npx_batch_norm_op.cc + */ +#include +#include +#include "../utils.h" +#include "../../../operator/nn/batch_norm-inl.h" + +namespace mxnet { + +MXNET_REGISTER_API("_npx.batch_norm") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + nnvm::NodeAttrs attrs; + const nnvm::Op* op = Op::Get("_npx_batch_norm"); + op::BatchNormParam param; + // eps + param.eps = args[5].operator double(); + // momentum + param.momentum = args[6].operator double(); + // fix_gamma + param.fix_gamma = args[7].operator bool(); + // use_global_stats + param.use_global_stats = args[8].operator bool(); + // output_mean_var + param.output_mean_var = args[9].operator bool(); + // axis + param.axis = args[10].operator int(); + // cudnn_off + param.cudnn_off = args[11].operator bool(); + // min_calib_range + if (args[12].type_code() == kDLFloat || args[12].type_code() == kDLInt) { + param.min_calib_range = args[12].operator double(); + } else { + param.min_calib_range = dmlc::nullopt; + } + // max_calib_range + if (args[13].type_code() == kDLFloat || args[13].type_code() == kDLInt) { + param.max_calib_range = args[13].operator double(); + } else { + param.max_calib_range = dmlc::nullopt; + } + attrs.parsed = param; + attrs.op = op; + SetAttrDict(&attrs); + // inputs + int num_inputs = 5; + std::vector inputs; + inputs.reserve(num_inputs); + for (int i = 0; i < num_inputs; ++i) { + inputs.push_back(args[i].operator mxnet::NDArray*()); + } + int num_outputs = 0; + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs.data(), &num_outputs, nullptr); + if (num_outputs == 1) { + *ret = ndoutputs[0]; + } else { + std::vector ndarray_handles; + ndarray_handles.reserve(num_outputs); + for (int i = 0; i < num_outputs; ++i) { + ndarray_handles.emplace_back(ndoutputs[i]); + } + *ret = ADT(0, ndarray_handles.begin(), ndarray_handles.end()); + } +}); + +} // namespace mxnet diff --git a/src/api/operator/numpy_extension/npx_fully_connected_op.cc b/src/api/operator/numpy_extension/npx_fully_connected_op.cc new file mode 100644 index 000000000000..d9ab3c02c61b --- /dev/null +++ b/src/api/operator/numpy_extension/npx_fully_connected_op.cc @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/*! + * \file npx_fully_connected_op.cc + * \brief Implementation of the API of functions in src/operator/numpy_extension/npx_fully_connected_op.cc + */ +#include +#include +#include "../utils.h" +#include "../../../operator/nn/fully_connected-inl.h" + +namespace mxnet { + +MXNET_REGISTER_API("_npx.fully_connected") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + int args_size = args.size(); + nnvm::NodeAttrs attrs; + const nnvm::Op* op = Op::Get("_npx_fully_connected"); + op::FullyConnectedParam param; + // no_bias + param.no_bias = args[args_size - 2].operator bool(); + // inputs + int num_inputs = 2; + if (param.no_bias) { + num_inputs = 2; + } else { + num_inputs = 3; + } + std::vector inputs; + inputs.reserve(num_inputs); + for (int i = 0; i < num_inputs; ++i) { + inputs.push_back(args[i].operator mxnet::NDArray*()); + } + // num_hidden + param.num_hidden = args[args_size - 3].operator int(); + // flatten + param.flatten = args[args_size - 1].operator bool(); + + attrs.parsed = param; + attrs.op = op; + SetAttrDict(&attrs); + + int num_outputs = 0; + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs.data(), &num_outputs, nullptr); + *ret = ndoutputs[0]; +}); + +} // namespace mxnet diff --git a/src/api/operator/numpy_extension/npx_softmax_op.cc b/src/api/operator/numpy_extension/npx_softmax_op.cc new file mode 100644 index 000000000000..641129e00ae9 --- /dev/null +++ b/src/api/operator/numpy_extension/npx_softmax_op.cc @@ -0,0 +1,136 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file npx_softmax_op.cc + * \brief Implementation of the API of functions in src/operator/numpy_extension/npx_softmax_op.cc + */ +#include +#include +#include "../utils.h" +#include "../../../operator/nn/softmax-inl.h" + +namespace mxnet { + +MXNET_REGISTER_API("_npx.softmax") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + nnvm::NodeAttrs attrs; + static const nnvm::Op* op = Op::Get("_npx_softmax"); + op::SoftmaxParam param; + int args_size = args.size(); + // inputs + int num_inputs = args_size - 4; + std::vector inputs; + inputs.reserve(num_inputs); + for (int i = 0; i < num_inputs; ++i) { + inputs.push_back(args[i].operator mxnet::NDArray*()); + } + + // parse use_length + if (args[args_size - 2].type_code() == kNull) { + param.use_length = false; + } else { + param.use_length = args[args_size - 2].operator bool(); + } + + // parse axis + if (args[args_size - 4].type_code() == kDLInt) { + param.axis = args[args_size - 4].operator int(); + } else { + param.axis = static_cast(args[args_size - 4].operator double()); + } + + // parse temperature + if (args[args_size - 3].type_code() == kNull) { + param.temperature = dmlc::nullopt; + } else { + param.temperature = args[args_size - 3].operator int64_t(); + } + + // parse dtype + if (args[args_size - 1].type_code() == kNull) { + param.dtype = dmlc::nullopt; + } else { + param.dtype = String2MXNetTypeWithBool(args[args_size - 1].operator std::string()); + } + + attrs.parsed = param; + attrs.op = op; + SetAttrDict(&attrs); + + int num_outputs = 0; + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs.data(), &num_outputs, nullptr); + *ret = ndoutputs[0]; +}); + +MXNET_REGISTER_API("_npx.log_softmax") +.set_body([](runtime::MXNetArgs args, runtime::MXNetRetValue* ret) { + using namespace runtime; + nnvm::NodeAttrs attrs; + static const nnvm::Op* op = Op::Get("_npx_log_softmax"); + op::SoftmaxParam param; + + int args_size = args.size(); + // inputs + int num_inputs = args_size - 4; + std::vector inputs; + inputs.reserve(num_inputs); + for (int i = 0; i < num_inputs; ++i) { + inputs.push_back(args[i].operator mxnet::NDArray*()); + } + + // parse use_length + if (args[args_size - 2].type_code() == kNull) { + param.use_length = false; + } else { + param.use_length = args[args_size - 2].operator bool(); + } + + // parse axis + if (args[args_size - 4].type_code() == kDLInt) { + param.axis = args[args_size - 4].operator int(); + } else { + param.axis = static_cast(args[args_size - 4].operator double()); + } + + // parse temperature + if (args[args_size - 3].type_code() == kNull) { + param.temperature = dmlc::nullopt; + } else { + param.temperature = args[args_size - 3].operator int64_t(); + } + + // parse dtype + if (args[args_size - 1].type_code() == kNull) { + param.dtype = dmlc::nullopt; + } else { + param.dtype = String2MXNetTypeWithBool(args[args_size - 1].operator std::string()); + } + + attrs.parsed = param; + attrs.op = op; + SetAttrDict(&attrs); + + int num_outputs = 0; + auto ndoutputs = Invoke(op, &attrs, num_inputs, inputs.data(), &num_outputs, nullptr); + *ret = ndoutputs[0]; +}); + +} // namespace mxnet diff --git a/src/operator/nn/activation-inl.h b/src/operator/nn/activation-inl.h index 06ff1fe1bedb..1111464b9697 100644 --- a/src/operator/nn/activation-inl.h +++ b/src/operator/nn/activation-inl.h @@ -69,6 +69,29 @@ struct ActivationParam : public dmlc::Parameter { bool operator==(const ActivationParam& other) const { return this->act_type == other.act_type; } + std::string 
MXNetActType2String(int act_type) { + switch (act_type) { + case activation::kReLU: + return "relu"; + case activation::kSigmoid: + return "sigmoid"; + case activation::kTanh: + return "tanh"; + case activation::kSoftReLU: + return "softrelu"; + case activation::kSoftSign: + return "softsign"; + default: + LOG(FATAL) << "Unknown act_type enum " << act_type; + } + LOG(FATAL) << "should not reach here "; + return ""; + } + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream act_type_s; + act_type_s << act_type; + (*dict)["act_type"] = MXNetActType2String(act_type); + } }; } // namespace op diff --git a/src/operator/nn/batch_norm-inl.h b/src/operator/nn/batch_norm-inl.h index 485b3b33f6a8..bb8313d3cd0e 100644 --- a/src/operator/nn/batch_norm-inl.h +++ b/src/operator/nn/batch_norm-inl.h @@ -125,6 +125,28 @@ struct BatchNormParam : public dmlc::Parameter { } return flag; } + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream eps_s, momentum_s, fix_gamma_s, use_global_stats_s, output_mean_var_s, + axis_s, cudnn_off_s, min_calib_range_s, max_calib_range_s; + eps_s << eps; + momentum_s << momentum; + fix_gamma_s << fix_gamma; + use_global_stats_s << use_global_stats; + output_mean_var_s << output_mean_var; + axis_s << axis; + cudnn_off_s << cudnn_off; + min_calib_range_s << min_calib_range; + max_calib_range_s << max_calib_range; + (*dict)["eps"] = eps_s.str(); + (*dict)["momentum"] = momentum_s.str(); + (*dict)["fix_gamma"] = fix_gamma_s.str(); + (*dict)["use_global_stats"] = use_global_stats_s.str(); + (*dict)["output_mean_var"] = output_mean_var_s.str(); + (*dict)["axis"] = axis_s.str(); + (*dict)["cudnn_off"] = cudnn_off_s.str(); + (*dict)["min_calib_range"] = min_calib_range_s.str(); + (*dict)["max_calib_range"] = max_calib_range_s.str(); + } }; } // namespace op diff --git a/src/operator/nn/fully_connected-inl.h b/src/operator/nn/fully_connected-inl.h index c90e8ce014e7..51d6f5c8d46d 100644 --- a/src/operator/nn/fully_connected-inl.h +++ b/src/operator/nn/fully_connected-inl.h @@ -80,6 +80,15 @@ struct FullyConnectedParam : public dmlc::Parameter { this->no_bias == other.no_bias && this->flatten == other.flatten; } + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream num_hidden_s, no_bias_s, flatten_s; + num_hidden_s << num_hidden; + no_bias_s << no_bias; + flatten_s << flatten; + (*dict)["num_hidden"] = num_hidden_s.str(); + (*dict)["no_bias"] = no_bias_s.str(); + (*dict)["flatten"] = flatten_s.str(); + } }; /** diff --git a/src/operator/nn/softmax-inl.h b/src/operator/nn/softmax-inl.h index 512d8d2febbb..7f64b7426c3f 100644 --- a/src/operator/nn/softmax-inl.h +++ b/src/operator/nn/softmax-inl.h @@ -1179,6 +1179,21 @@ struct SoftmaxParam : public dmlc::Parameter { this->dtype == other.dtype && this->use_length == other.use_length; } + void SetAttrDict(std::unordered_map* dict) { + std::ostringstream axis_s, temperature_s, dtype_s, use_length_s; + axis_s << axis; + temperature_s << temperature; + dtype_s << dtype; + use_length_s << use_length; + (*dict)["axis"] = axis_s.str(); + (*dict)["temperature"] = temperature_s.str(); + if (dtype.has_value()) { + (*dict)["dtype"] = MXNetTypeWithBool2String(dtype.value()); + } else { + (*dict)["dtype"] = dtype_s.str(); + } + (*dict)["use_length"] = use_length_s.str(); + } }; struct MaskedSoftmaxParam : public dmlc::Parameter {
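For reference, the new front ends can be exercised imperatively the same way the doctest examples in the docstrings do. A minimal sketch, assuming an MXNet build that already contains this patch and the public namespaces imported as in those examples:

    from mxnet import np, npx

    data = np.ones((2, 3))
    print(npx.softmax(data, axis=0))        # every entry 0.5: each column sums to 1
    print(npx.softmax(data, axis=1))        # every entry ~0.3333: each row sums to 1

    data = np.array([1, 2, .1])
    mask = np.array([1, 0, 1])
    print(npx.log_softmax(data))            # ~[-1.417, -0.417, -2.317], as in the docstring
    print(npx.masked_softmax(data, mask))   # masked position is exactly 0 in the output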
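The masked variants are implemented in the Python front end as a composite: np.where fills the masked positions with -1e18 (-1e4 for float16) before the plain (log_)softmax, and the result is rescaled by the mask and temperature. A pure-NumPy reference of that composite, useful for checking the doctest values; this is a sketch of the fallback path above, not the operator's fused kernel:

    import numpy as onp

    def masked_softmax_ref(data, mask, axis=-1, temperature=1.0):
        # fill masked entries with a large negative value, as in the front end
        filled = onp.where(mask.astype(bool), data, -1e18)
        shifted = filled - filled.max(axis=axis, keepdims=True)   # shift-invariant softmax
        e = onp.exp(shifted)
        return e / e.sum(axis=axis, keepdims=True) / temperature * mask

    data = onp.arange(5, dtype='float32')
    mask = onp.array([1, 0, 1, 0, 1], dtype='float32')
    print(masked_softmax_ref(data, mask))
    # ~[0.01587624, 0., 0.11731042, 0., 0.8668133], matching the doctest output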
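A sketch of the fully_connected and activation front ends under the docstring's shape contract (weight is (num_hidden, input_dim), bias is (num_hidden,), and flatten collapses all but the first axis of the data); the array shapes below are illustrative only:

    from mxnet import np, npx

    x = np.random.uniform(size=(4, 8))      # (batch_size, input_dim)
    w = np.random.uniform(size=(16, 8))     # (num_hidden, input_dim)
    b = np.zeros((16,))                     # (num_hidden,)

    h = npx.fully_connected(x, w, b, num_hidden=16, no_bias=False, flatten=True)
    h = npx.activation(h, act_type='relu')
    print(h.shape)                          # (4, 16)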
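batch_norm takes the (k,)-shaped gamma, beta and moving statistics described in its docstring, with the channel axis selected by axis (default 1). A sketch with made-up NCHW shapes, again assuming a build that includes this patch:

    from mxnet import np, npx

    x = np.random.uniform(size=(2, 3, 4, 4))    # NCHW layout, k = 3 channels
    gamma = np.ones((3,))
    beta = np.zeros((3,))
    moving_mean = np.zeros((3,))
    moving_var = np.ones((3,))

    y = npx.batch_norm(x, gamma, beta, moving_mean, moving_var,
                       eps=1e-3, momentum=0.9, fix_gamma=False,
                       use_global_stats=False, output_mean_var=False, axis=1)
    print(y.shape)                              # (2, 3, 4, 4), same shape as the input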
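On the plumbing: _init_api("_npx", ...) turns each MXNET_REGISTER_API("_npx.*") entry into a callable on _api_internal, and the softmax handler sizes its NDArray inputs as args.size() - 4, treating the last four positions as (axis, temperature, use_length, dtype). That is why the Python wrapper can simply insert the optional length array before the scalar arguments: npx.softmax(data) forwards as _api_internal.softmax(data, -1, None, False, None), while the length-masked form forwards as _api_internal.softmax(data, length, -1, None, True, None). A sketch of the masked call through the public front end; the int32 per-row lengths and their (batch,) shape are assumptions about what the underlying operator accepts:

    from mxnet import np, npx

    data = np.ones((2, 3))
    length = np.array([2, 3], dtype='int32')    # assumed: one valid length per row
    print(npx.softmax(data, axis=-1, length=length, use_length=True))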