diff --git a/.gitmodules b/.gitmodules index 42f0027505fd..170c105a6f48 100644 --- a/.gitmodules +++ b/.gitmodules @@ -22,7 +22,3 @@ [submodule "3rdparty/googletest"] path = 3rdparty/googletest url = https://github.com/google/googletest.git -[submodule "3rdparty/mkldnn"] - path = 3rdparty/mkldnn - url = https://github.com/intel/mkl-dnn.git - branch = master diff --git a/3rdparty/mkldnn b/3rdparty/mkldnn deleted file mode 160000 index 3e1f8f53f684..000000000000 --- a/3rdparty/mkldnn +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 3e1f8f53f6845dce23abf8089501c2eb45420b9e diff --git a/CMakeLists.txt b/CMakeLists.txt index dfa9834ffbab..14b40e4f7be4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,8 +33,8 @@ mxnet_option(USE_OPENMP "Build with Openmp support" ON) mxnet_option(USE_CUDNN "Build with cudnn support" ON) # one could set CUDNN_ROOT for search path mxnet_option(USE_LAPACK "Build with lapack support" ON IF NOT MSVC) mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON) -mxnet_option(USE_MKLDNN "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE)) -mxnet_option(USE_MKLML_MKL "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE)) +mxnet_option(USE_MKLML_MKL "Use MKLML variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE)) +mxnet_option(USE_MKL_EXPERIMENTAL "Use experimental MKL (if MKL enabled and found)" OFF) mxnet_option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON AND NOT MSVC) mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support (if found)" ON) mxnet_option(USE_JEMALLOC "Build with Jemalloc support" ON) @@ -138,11 +138,14 @@ if(USE_VTUNE) endif() if(USE_MKL_IF_AVAILABLE) + if(USE_MKL_EXPERIMENTAL AND NOT USE_MKLML_MKL) + message(ERROR " USE_MKL_EXPERIMENTAL can only be used when USE_MKLML_MKL is enabled") + endif() find_package(MKL) if(MKL_FOUND) include_directories(${MKL_INCLUDE_DIR}) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/operator/mkl) - add_definitions(-DMXNET_USE_MKLDNN=1) + add_definitions(-DMXNET_USE_MKL2017=1) add_definitions(-DUSE_MKL=1) add_definitions(-DCUB_MKL=1) list(APPEND mxnet_LINKER_LIBS ${MKL_LIBRARIES}) @@ -151,6 +154,11 @@ if(USE_MKL_IF_AVAILABLE) endif() # If using MKL, use the Intel OMP libraries list(APPEND mxnet_LINKER_LIBS iomp5) + if(USE_MKL_EXPERIMENTAL) + add_definitions(-DMKL_EXPERIMENTAL=1) + else() + add_definitions(-DMKL_EXPERIMENTAL=0) + endif() else() message(STATUS " MKL not found") endif() diff --git a/Jenkinsfile b/Jenkinsfile index 80f9424d6812..05cda74066f9 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -24,7 +24,6 @@ mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a' // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default. mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/dmlc-core/libdmlc.a' -mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_gnu.so, lib/libmkldnn.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a' // command to start a docker container docker_run = 'tests/ci_build/ci_build.sh' // timeout in minutes @@ -162,18 +161,18 @@ def python3_gpu_ut(docker_type) { } // Python 2 -def python2_mkldnn_ut(docker_type) { +def python2_mklml_ut(docker_type) { timeout(time: max_time, unit: 'MINUTES') { sh "${docker_run} ${docker_type} find . 
-name '*.pyc' -type f -delete" - sh "${docker_run} ${docker_type} PYTHONPATH=./python/ MXNET_MKLDNN_DEBUG=1 nosetests-2.7 --with-timer --verbose tests/python/cpu" + sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-2.7 --with-timer --verbose tests/python/cpu" } } // Python 3 -def python3_mkldnn_ut(docker_type) { +def python3_mklml_ut(docker_type) { timeout(time: max_time, unit: 'MINUTES') { sh "${docker_run} ${docker_type} find . -name '*.pyc' -type f -delete" - sh "${docker_run} ${docker_type} PYTHONPATH=./python/ MXNET_MKLDNN_DEBUG=1 nosetests-3.4 --with-timer --verbose tests/python/cpu" + sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-3.4 --with-timer --verbose tests/python/cpu" } } @@ -244,20 +243,21 @@ try { } } }, - 'CPU: MKLDNN': { + 'CPU: MKLML': { node('mxnetlinux-cpu') { - ws('workspace/build-mkldnn-cpu') { + ws('workspace/build-mklml-cpu') { init_git() def flag = """ \ DEV=1 \ USE_PROFILER=1 \ USE_CPP_PACKAGE=1 \ USE_BLAS=openblas \ - USE_MKLDNN=1 \ + USE_MKL2017=1 \ + USE_MKL2017_EXPERIMENTAL=1 \ -j\$(nproc) """ make("cpu_mklml", flag) - pack_lib('mkldnn_cpu', mx_mkldnn_lib) + pack_lib('mklml_cpu') } } }, @@ -278,23 +278,24 @@ try { } } }, - 'GPU: MKLDNN': { + 'GPU: MKLML': { node('mxnetlinux-cpu') { - ws('workspace/build-mkldnn-gpu') { + ws('workspace/build-mklml-gpu') { init_git() def flag = """ \ DEV=1 \ USE_PROFILER=1 \ USE_CPP_PACKAGE=1 \ USE_BLAS=openblas \ - USE_MKLDNN=1 \ + USE_MKL2017=1 \ + USE_MKL2017_EXPERIMENTAL=1 \ USE_CUDA=1 \ USE_CUDA_PATH=/usr/local/cuda \ USE_CUDNN=1 \ -j\$(nproc) """ make("build_cuda", flag) - pack_lib('mkldnn_gpu', mx_mkldnn_lib) + pack_lib('mklml_gpu') } } }, @@ -441,43 +442,43 @@ try { } } }, - 'Python2: MKLDNN-CPU': { + 'Python2: MKLML-CPU': { node('mxnetlinux-cpu') { - ws('workspace/ut-python2-mkldnn-cpu') { + ws('workspace/ut-python2-mklml-cpu') { init_git() - unpack_lib('mkldnn_cpu', mx_mkldnn_lib) + unpack_lib('mklml_cpu') python2_ut('cpu_mklml') - python2_mkldnn_ut('cpu_mklml') + python2_mklml_ut('cpu_mklml') } } }, - 'Python2: MKLDNN-GPU': { + 'Python2: MKLML-GPU': { node('mxnetlinux-gpu') { - ws('workspace/ut-python2-mkldnn-gpu') { + ws('workspace/ut-python2-mklml-gpu') { init_git() - unpack_lib('mkldnn_gpu', mx_mkldnn_lib) + unpack_lib('mklml_gpu') python2_gpu_ut('gpu_mklml') - python2_mkldnn_ut('gpu_mklml') + python2_mklml_ut('gpu_mklml') } } }, - 'Python3: MKLDNN-CPU': { + 'Python3: MKLML-CPU': { node('mxnetlinux-cpu') { - ws('workspace/ut-python3-mkldnn-cpu') { + ws('workspace/ut-python3-mklml-cpu') { init_git() - unpack_lib('mkldnn_cpu', mx_mkldnn_lib) + unpack_lib('mklml_cpu') python3_ut('cpu_mklml') - python3_mkldnn_ut('cpu_mklml') + python3_mklml_ut('cpu_mklml') } } }, - 'Python3: MKLDNN-GPU': { + 'Python3: MKLML-GPU': { node('mxnetlinux-gpu') { - ws('workspace/ut-python3-mkldnn-gpu') { + ws('workspace/ut-python3-mklml-gpu') { init_git() - unpack_lib('mkldnn_gpu', mx_mkldnn_lib) + unpack_lib('mklml_gpu') python3_gpu_ut('gpu_mklml') - python3_mkldnn_ut('gpu_mklml') + python3_mklml_ut('gpu_mklml') } } }, diff --git a/Makefile b/Makefile index d325aa65ab01..976035b1087c 100644 --- a/Makefile +++ b/Makefile @@ -59,11 +59,11 @@ endif # use customized config file include $(config) -ifeq ($(USE_MKLDNN), 1) - RETURN_STRING := $(shell ./prepare_mkldnn.sh $(MKLDNN_ROOT)) - MKLDNNROOT := $(firstword $(RETURN_STRING)) - MKLROOT := $(lastword $(RETURN_STRING)) - export USE_MKLML = 1 +ifeq ($(USE_MKL2017), 1) +# must run ./prepare_mkl before including mshadow.mk + RETURN_STRING := $(shell ./prepare_mkl.sh 
$(MKLML_ROOT)) + MKLROOT := $(firstword $(RETURN_STRING)) + export USE_MKLML = $(lastword $(RETURN_STRING)) endif include mshadow/make/mshadow.mk @@ -131,16 +131,23 @@ ifeq ($(USE_NNPACK), 1) LDFLAGS += -lnnpack endif -ifeq ($(USE_MKLDNN), 1) - CFLAGS += -DMXNET_USE_MKLDNN=1 +ifeq ($(USE_MKL2017), 1) + CFLAGS += -DMXNET_USE_MKL2017=1 CFLAGS += -DUSE_MKL=1 - CFLAGS += -I$(ROOTDIR)/src/operator/nn/mkldnn/ - ifneq ($(MKLDNNROOT), $(MKLROOT)) - CFLAGS += -I$(MKLROOT)/include - LDFLAGS += -L$(MKLROOT)/lib + CFLAGS += -I$(ROOTDIR)/src/operator/mkl/ + CFLAGS += -I$(MKLML_ROOT)/include + LDFLAGS += -L$(MKLML_ROOT)/lib + ifeq ($(USE_MKL2017_EXPERIMENTAL), 1) + CFLAGS += -DMKL_EXPERIMENTAL=1 + else + CFLAGS += -DMKL_EXPERIMENTAL=0 + endif + ifeq ($(UNAME_S), Darwin) + LDFLAGS += -lmklml + else + LDFLAGS += -Wl,--as-needed -lmklml_intel -lmklml_gnu endif - CFLAGS += -I$(MKLDNNROOT)/include - LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}' + LDFLAGS += -liomp5 endif ifeq ($(USE_OPERATOR_TUNING), 1) @@ -154,7 +161,7 @@ endif # - for Ubuntu, installing atlas will not automatically install the atlas provided lapack library # silently switching lapack off instead of letting the build fail because of backward compatibility ifeq ($(USE_LAPACK), 1) -ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas mkl)) +ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas)) ifeq (,$(wildcard /lib/liblapack.a)) ifeq (,$(wildcard /usr/lib/liblapack.a)) ifeq (,$(wildcard /usr/lib64/liblapack.a)) @@ -172,7 +179,7 @@ ifeq ($(USE_LAPACK), 1) ifneq ($(USE_LAPACK_PATH), ) LDFLAGS += -L$(USE_LAPACK_PATH) endif - ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas mkl)) + ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas)) LDFLAGS += -llapack endif CFLAGS += -DMXNET_USE_LAPACK @@ -562,8 +569,7 @@ clean: cyclean $(EXTRA_PACKAGES_CLEAN) else clean: cyclean testclean $(EXTRA_PACKAGES_CLEAN) $(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ R-package/NAMESPACE R-package/man R-package/R/mxnet_generated.R \ - R-package/inst R-package/src/image_recordio.h R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz \ - external/mkldnn/install/* + R-package/inst R-package/src/image_recordio.h R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz cd $(DMLC_CORE); $(MAKE) clean; cd - cd $(PS_PATH); $(MAKE) clean; cd - cd $(NNVM_PATH); $(MAKE) clean; cd - diff --git a/amalgamation/mxnet_predict0.cc b/amalgamation/mxnet_predict0.cc index cfee60559501..f35591d82b22 100644 --- a/amalgamation/mxnet_predict0.cc +++ b/amalgamation/mxnet_predict0.cc @@ -66,7 +66,7 @@ #include "src/operator/operator_util.cc" #include "src/operator/nn/activation.cc" #include "src/operator/nn/batch_norm.cc" -#include "src/operator/nn/concat.cc" +#include "src/operator/concat.cc" #include "src/operator/nn/convolution.cc" #include "src/operator/nn/deconvolution.cc" #include "src/operator/nn/dropout.cc" diff --git a/cmake/ChooseBlas.cmake b/cmake/ChooseBlas.cmake index 13d7083f3d12..3a8723a5dd5e 100644 --- a/cmake/ChooseBlas.cmake +++ b/cmake/ChooseBlas.cmake @@ -23,7 +23,7 @@ if(USE_MKL_IF_AVAILABLE) find_package(MKL) endif() if(MKL_FOUND) - if(USE_MKLDNN) + if(USE_MKLML_MKL) set(BLAS "open") else() set(BLAS "MKL") @@ -55,4 +55,4 @@ elseif(BLAS STREQUAL "apple") list(APPEND mshadow_LINKER_LIBS ${Accelerate_LIBRARIES}) add_definitions(-DMSHADOW_USE_MKL=0) add_definitions(-DMSHADOW_USE_CBLAS=1) -endif() +endif() \ No newline at end of file diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake index 70405566d8ae..743a871ee7cd 
100644 --- a/cmake/Modules/FindMKL.cmake +++ b/cmake/Modules/FindMKL.cmake @@ -19,7 +19,7 @@ # # Options: # -# USE_MKLDNN : Search for MKL:ML library variant +# USE_MKLML_MKL : Search for MKL:ML library variant # # MKL_USE_SINGLE_DYNAMIC_LIBRARY : use single dynamic library interface # MKL_USE_STATIC_LIBS : use static libraries @@ -33,7 +33,7 @@ # MKL_INCLUDE_DIR : unclude directory # MKL_LIBRARIES : the libraries to link against. # -# cjolivier01: Changed to also look for MKLDNN library (subset of mkl) instead of standard MKL package +# cjolivier01: Changed to also look for MKLML library (subset of mkl) instead of standard MKL package # if(MKL_FOUND) @@ -43,7 +43,7 @@ endif() # ---[ Root folders set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs") -if(USE_MKLDNN) +if(USE_MKLML_MKL) find_path(MKL_ROOT include/mkl_blas.h PATHS $ENV{MKL_ROOT} @@ -66,14 +66,13 @@ if(USE_MKLDNN) set(__mkl_libs "") if(WIN32) - list(APPEND __mkl_libs mklml_intel) + list(APPEND __mkl_libs intel) else() - list(APPEND __mkl_libs mklml_gnu) + list(APPEND __mkl_libs gnu) endif() - list(APPEND __mkl_libs mkldnn) foreach (__lib ${__mkl_libs}) - set(__mkl_lib "${__lib}") + set(__mkl_lib "mklml_${__lib}") string(TOUPPER ${__mkl_lib} __mkl_lib_upper) if(MKL_USE_STATIC_LIBS) @@ -91,7 +90,8 @@ if(USE_MKLDNN) list(APPEND MKL_LIBRARIES ${${__mkl_lib_upper}_LIBRARY}) endforeach() -else(USE_MKLDNN) + +else(USE_MKLML_MKL) # ---[ Options mxnet_option(MKL_USE_SINGLE_DYNAMIC_LIBRARY "Use single dynamic library interface" ON) @@ -193,7 +193,7 @@ else(USE_MKLDNN) list(APPEND MKL_LIBRARIES ${MKL_RTL_LIBRARY}) endif() -endif(USE_MKLDNN) +endif(USE_MKLML_MKL) include(FindPackageHandleStandardArgs) find_package_handle_standard_args(MKL DEFAULT_MSG ${__looked_for}) diff --git a/example/image-classification/common/data.py b/example/image-classification/common/data.py index 05f5ddc4506e..dc8915cda4c8 100755 --- a/example/image-classification/common/data.py +++ b/example/image-classification/common/data.py @@ -112,8 +112,7 @@ def get_rec_iter(args, kv=None): image_shape = tuple([int(l) for l in args.image_shape.split(',')]) if 'benchmark' in args and args.benchmark: data_shape = (args.batch_size,) + image_shape - train = SyntheticDataIter(args.num_classes, data_shape, - args.num_examples / args.batch_size, np.float32) + train = SyntheticDataIter(args.num_classes, data_shape, 500, np.float32) return (train, None) if kv: (rank, nworker) = (kv.rank, kv.num_workers) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index 23c24766a0d3..47582fa59527 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -34,14 +34,14 @@ #include #include #include -#include #include -#if MXNET_USE_MKLDNN == 1 -#include -#endif +#include #include "./base.h" #include "./storage.h" #include "./engine.h" +#if MKL_EXPERIMENTAL == 1 +#include +#endif // check c++11 #if DMLC_USE_CXX11 == 0 #error "cxx11 was required for ndarray module" @@ -73,7 +73,6 @@ enum NDArrayFormatErr { kRSPIdxErr, // indices error for row sparse }; -class MKLDNNMemory; /*! * \brief ndarray interface @@ -82,6 +81,9 @@ class NDArray { public: /*! \brief default constructor */ NDArray() { +#if MKL_EXPERIMENTAL == 1 + Mkl_mem_ = MKLMemHolder::create(); +#endif } /*! * \brief constructs a new dynamic NDArray @@ -95,14 +97,56 @@ class NDArray { : ptr_(std::make_shared(shape, ctx, delay_alloc, dtype)), shape_(shape), dtype_(dtype), storage_type_(kDefaultStorage), entry_({nullptr, 0, 0}) { +#if MKL_EXPERIMENTAL == 1 + Mkl_mem_ = std::make_shared(); +#endif } /*! 
\brief constructor for NDArray with storage type */ NDArray(const NDArrayStorageType stype, const TShape &shape, Context ctx, bool delay_alloc = true, int dtype = mshadow::default_type_flag, std::vector aux_types = {}, std::vector aux_shapes = {}, - TShape storage_shape = TShape(mshadow::Shape1(0))); - + TShape storage_shape = TShape(mshadow::Shape1(0))) + : shape_(shape), dtype_(dtype), storage_type_(stype), + entry_({nullptr, 0, 0}) { + // Assign default aux types if not given + if (aux_types.size() == 0) { + if (stype == kRowSparseStorage) { + aux_types = {mshadow::kInt64}; + } else if (stype == kCSRStorage) { + aux_types = {mshadow::kInt64, mshadow::kInt64}; + } else { + LOG(FATAL) << "Unknown storage type " << stype; + } + } + // Assign default shapes if not given + // unknown shapes are intialized as {0} such that Size() would return 0 + if (aux_shapes.size() == 0) { + if (stype == kRowSparseStorage) { + aux_shapes = {TShape(mshadow::Shape1(0))}; + } else if (stype == kCSRStorage) { + // aux shapes for indptr and indices + aux_shapes = {TShape(mshadow::Shape1(0)), TShape(mshadow::Shape1(0))}; + } else { + LOG(FATAL) << "Unknown storage type " << stype; + } + } + if (storage_shape.Size() == 0) { + if (stype == kRowSparseStorage) { + storage_shape = shape; + storage_shape[0] = aux_shapes[rowsparse::kIdx][0]; + } else if (stype == kCSRStorage) { + storage_shape = aux_shapes[csr::kIdx]; + } else { + LOG(FATAL) << "Unknown storage type " << stype; + } + } + ptr_ = std::make_shared(stype, storage_shape, ctx, delay_alloc, + dtype, aux_types, aux_shapes); +#if MKL_EXPERIMENTAL == 1 + Mkl_mem_ = std::make_shared(); +#endif + } /*! * \brief constructing a static NDArray that shares data with TBlob * Use with caution: allocate ONLY ONE NDArray for each TBlob, @@ -114,11 +158,17 @@ class NDArray { : ptr_(std::make_shared(data, dev_id)), shape_(data.shape_), dtype_(data.type_flag_), storage_type_(kDefaultStorage), entry_({nullptr, 0, 0}) { +#if MKL_EXPERIMENTAL == 1 + Mkl_mem_ = std::make_shared(); +#endif } /*! \brief create ndarray from shared memory */ NDArray(int shared_pid, int shared_id, const TShape& shape, int dtype) : ptr_(std::make_shared(shared_pid, shared_id, shape, dtype)), shape_(shape), dtype_(dtype), storage_type_(kDefaultStorage), entry_({nullptr, 0, 0}) { +#if MKL_EXPERIMENTAL == 1 + Mkl_mem_ = std::make_shared(); +#endif } /*! @@ -135,24 +185,11 @@ class NDArray { const TBlob &data, const std::vector &aux_data, int dev_id) : ptr_(std::make_shared(stype, data, aux_data, dev_id)), shape_(shape), dtype_(data.type_flag_), storage_type_(stype), entry_({nullptr, 0, 0}) { +#if MKL_EXPERIMENTAL == 1 + Mkl_mem_ = std::make_shared(); +#endif } - /* - * This indicates whether an array is a view of another array (created by - * reshape or slice). If an array is a view and the the data is stored in - * MKLDNN format, we need to convert the data to the default format when - * data in the view is accessed. - */ - inline bool IsView() const { - // View only works on the default storage - if (storage_type() != kDefaultStorage) - return false; - // If the array reuses memory, its shape may be different from the storage - // shape. However, we shouldn't consider it as a view. - if (reuse_) - return false; - return byte_offset_ > 0 || shape() != ptr_->storage_shape; - } /*! * \return the shape of current NDArray. 
@@ -235,6 +272,9 @@ class NDArray { << "Unexpected storage type: " << stype; res = TBlob(dptr, shape, ptr_->aux_handles[i].ctx.dev_mask(), type); }); +#if MKL_EXPERIMENTAL == 1 + res.Mkl_mem_ = Mkl_mem_; +#endif return res; } /*! @@ -495,12 +535,15 @@ class NDArray { CHECK_GE(ptr_->shandle.size, shape.Size() * mshadow::mshadow_sizeof(dtype)) << "NDArray.AsArray: target memory size is bigger"; - // We can't reuse memory in a view. - CHECK(!IsView()); +#if MKL_EXPERIMENTAL == 1 + if (Mkl_mem_ != nullptr) { + // convert prv to cpu + Mkl_mem_->check_and_prv_to_cpu(ptr_->shandle.dptr); + } +#endif NDArray ret = *this; ret.shape_ = shape; ret.dtype_ = dtype; - ret.reuse_ = true; return ret; } /*! @@ -569,83 +612,6 @@ class NDArray { << "CheckAndAllocAuxData is not intended for kDefaultStorage"; ptr_->CheckAndAllocAuxData(i, aux_shape); } - -#if MXNET_USE_MKLDNN == 1 - /* - * Test if the data is stored in one of special MKLDNN format. - */ - bool IsMKLDNNData() const { - return ptr_->IsMKLDNN(); - } - /* - * Test if the data is stored in one of default MXNet formats. - */ - bool IsDefaultData() const { - return ptr_->IsDefault(); - } - /* - * All functions below return a raw pointer to mkldnn memory. Actually there - * is a shared pointer that hold the memory either in NDArray or in MKLDNN - * stream. As long as we call these functions inside an operator, the return - * memory is always valid. - */ - - /* - * This function returns mkldnn::memory with the default primitive_desc. - */ - const mkldnn::memory *GetMKLDNNData() const; - /* - * This function returns mkldnn::memory with the given primitive_desc - * as long as the array size meets the required size in the given primitive_desc. - */ - const mkldnn::memory *GetMKLDNNData( - const mkldnn::memory::primitive_desc &desc) const; - /* - * This function returns mkldnn::memory with the given primitive_desc. - * The returned mkldnn::memory will have the same physical layout as - * the given primitive_desc. - */ - const mkldnn::memory *GetMKLDNNDataReorder( - const mkldnn::memory::primitive_desc &desc) const; - - /* - * This function copies data from mkldnn memory. - */ - void CopyFrom(const mkldnn::memory &mem); - /* - * This function allocates memory for array and creates mkldnn memory - * with the specified format. - */ - mkldnn::memory *CreateMKLDNNData( - const mkldnn::memory::primitive_desc &desc); - - /* - * Reorder the memory to the specified layout. - */ - void MKLDNNDataReorder(const mkldnn::memory::primitive_desc &desc); - void Reorder2Default() { - CHECK_EQ(storage_type(), kDefaultStorage); - ptr_->Reorder2Default(); - } - - void InvalidateMKLDNNData() { - // Removing mkl_mem_ means the NDArray will store data in the default format. - ptr_->mkl_mem_ = nullptr; - } - - /* - * This function is used inside operators to reshape an array. - * It doesn't change the layout of the original array and allocate memory from - * the temporary buffer. The returned array is only valid inside the current - * invocation of this operator. - * This is different from Reshape. Reshape will cause data in the array to be - * converted to the default layout and allocate memory from malloc directly, - * which can be expensive. - * It's used by FullyConnected right now. - */ - NDArray MKLDNNDataReshape(const TShape &shape) const; -#endif - /*! * \brief Save list of ndarray into the Stream.x * \param fo The stream of output. 
@@ -680,12 +646,6 @@ class NDArray { for csr, aux_handles[0] = indptr, aux_handles[1] = indices */ std::vector aux_handles; - -#if MXNET_USE_MKLDNN == 1 - /*! This is created when data is stored in MKLDNN format. - */ - std::shared_ptr mkl_mem_; -#endif /*! \brief variable from engine */ Engine::VarHandle var; /*! @@ -747,7 +707,7 @@ class NDArray { : static_data(false), delay_alloc(false) { var = Engine::Get()->NewVariable(); ctx = Context::CPUShared(0); - shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype); + shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);; shandle.ctx = ctx; shandle.shared_pid = shared_pid; shandle.shared_id = shared_id; @@ -822,9 +782,6 @@ class NDArray { inline void CheckAndAlloc(void) { if (delay_alloc) { shandle = Storage::Get()->Alloc(shandle.size, shandle.ctx); -#if MXNET_USE_MKLDNN == 1 - mkl_mem_ = nullptr; -#endif delay_alloc = false; } } @@ -837,18 +794,12 @@ class NDArray { dbytes = std::max(dbytes, static_cast(shandle.size)); if (delay_alloc) { shandle = Storage::Get()->Alloc(dbytes, shandle.ctx); -#if MXNET_USE_MKLDNN == 1 - mkl_mem_ = nullptr; -#endif delay_alloc = false; } else if (shandle.size < dbytes) { // free storage if necessary and alloc again if (shandle.size > 0) Storage::Get()->Free(shandle); // init storage shandle = Storage::Get()->Alloc(dbytes, shandle.ctx); -#if MXNET_USE_MKLDNN == 1 - mkl_mem_ = nullptr; -#endif } } @@ -874,19 +825,20 @@ class NDArray { // storage shape is also updated // if data is already allocated, try reuse the storage. Otherwise, free the current one // and allocate new storage - void CheckAndAllocData(const TShape &shape, int dtype); - -#if MXNET_USE_MKLDNN == 1 - // Have MKL memory reference to the data in the default storage - // or create memory for MKLDNN. - void SetMKLMem(const TShape &shape, int dtype); - // In the data is stored in MKLDNN layout, we reorder data in mkl_mem_ and - // save the result in shandle. - void Reorder2Default(); - bool IsMKLDNN() const; - bool IsDefault() const; -#endif - + inline void CheckAndAllocData(const TShape &shape, int dtype) { + CHECK_NE(aux_shapes.size(), 0) << "data is expected to be allocated after aux_data"; + auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype); + if (shandle.size < dbytes) { + // free storage if necessary and alloc again + if (shandle.size > 0) Storage::Get()->Free(shandle); + // init storage + shandle = Storage::Get()->Alloc(dbytes, ctx); + } + // init shape + storage_shape = shape; + // delay_alloc is only set when data storage handle is present + delay_alloc = false; + } // create storage handle for aux data based on shape // this function assumes ctx, aux shapes and aux types are set // aux shape is also updated @@ -912,11 +864,45 @@ class NDArray { set_aux_shape(i, shape); } /*! 
\brief destructor */ - ~Chunk(); + ~Chunk() { + bool skip_free = static_data || delay_alloc; + Storage::Handle h = this->shandle; + std::vector aux_h = this->aux_handles; + Engine::Get()->DeleteVariable([h, aux_h, skip_free](RunContext s) { + if (skip_free == false) { + Storage::Get()->Free(h); + for (size_t i = 0; i < aux_h.size(); i++) { + if (aux_h[i].size > 0) Storage::Get()->Free(aux_h[i]); + } + } + }, shandle.ctx, var); + } }; // struct Chunk - void SetTBlob() const; + void SetTBlob() const { + CHECK(ptr_ != nullptr); + TShape shape = shape_; + char *dptr = static_cast(ptr_->shandle.dptr); + auto stype = storage_type(); + if (stype == kDefaultStorage) { + dptr += byte_offset_; + } else if (stype == kCSRStorage || stype == kRowSparseStorage) { + shape = storage_shape(); + } else { + LOG(FATAL) << "unknown storage type " << stype; + } + tblob_.dptr_ = dptr; + tblob_.shape_ = shape; + tblob_.type_flag_ = dtype_; + tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id); +#if MKL_EXPERIMENTAL == 1 + tblob_.Mkl_mem_ = Mkl_mem_; +#endif + } +#if MKL_EXPERIMENTAL == 1 + std::shared_ptr Mkl_mem_; +#endif /*! \brief internal data of NDArray */ std::shared_ptr ptr_{nullptr}; /*! \brief shape of current NDArray */ @@ -925,8 +911,6 @@ class NDArray { size_t byte_offset_ = 0; /*! \brief type of data */ int dtype_ = -1; - /*! \brief whether the NDArray uses memory of another NDArray. */ - bool reuse_ = false; /*! \brief storage type of data */ NDArrayStorageType storage_type_ = kUndefinedStorage; /*! \brief node entry for autograd */ diff --git a/include/mxnet/tensor_blob.h b/include/mxnet/tensor_blob.h index 168ddcca24b7..b65cd2b434e4 100755 --- a/include/mxnet/tensor_blob.h +++ b/include/mxnet/tensor_blob.h @@ -36,6 +36,9 @@ #include #include #include "./base.h" +#if MXNET_USE_MKL2017 == 1 +#include +#endif namespace mxnet { /* Forward declaration for friend declaration in TBlob */ @@ -63,10 +66,17 @@ class TBlob { /*! \brief type flag of the tensor blob */ int type_flag_; + /*! \brief storing mkl chunk buffer blob, use for experimental only */ +#if MKL_EXPERIMENTAL == 1 + std::shared_ptr Mkl_mem_; +#endif /*! \brief default constructor, default copy assign will work */ TBlob(void) : dptr_(NULL), type_flag_(mshadow::DataType::kFlag) { +#if MKL_EXPERIMENTAL == 1 + Mkl_mem_ = NULL; +#endif SetDLTensor(cpu::kDevMask, 0); } /*! @@ -80,6 +90,9 @@ class TBlob { TBlob(DType *dptr, const TShape &shape, int dev_mask, int dev_id = -1) : dptr_(dptr), shape_(shape), type_flag_(mshadow::DataType::kFlag) { +#if MKL_EXPERIMENTAL == 1 + Mkl_mem_ = NULL; +#endif SetDLTensor(dev_mask, dev_id); } /*! @@ -92,6 +105,9 @@ class TBlob { */ TBlob(void *dptr, const TShape &shape, int dev_mask, int type_flag, int dev_id = -1) : dptr_(dptr), shape_(shape), type_flag_(type_flag) { +#if MKL_EXPERIMENTAL == 1 + Mkl_mem_ = NULL; +#endif SetDLTensor(dev_mask, dev_id); } /*! @@ -119,6 +135,9 @@ class TBlob { shape_ = src.shape_; type_flag_ = mshadow::DataType::kFlag; SetDLTensor(Device::kDevMask, -1); +#if MKL_EXPERIMENTAL == 1 + Mkl_mem_ = NULL; +#endif return *this; } /*! @@ -153,6 +172,11 @@ class TBlob { CHECK(mshadow::DataType::kFlag == type_flag_) << "TBlob.get_with_shape: data type do not match specified type." << "Expected: " << type_flag_ << " v.s. 
given " << mshadow::DataType::kFlag; +#if MKL_EXPERIMENTAL == 1 + if (Mkl_mem_ != nullptr) { + Mkl_mem_->check_and_prv_to_cpu(dptr_); + } +#endif return mshadow::Tensor(static_cast(dptr_), shape_.FlatTo2D(), shape_[shape_.ndim() - 1], @@ -193,6 +217,11 @@ class TBlob { CHECK(mshadow::DataType::kFlag == type_flag_) << "TBlob.get_with_shape: data type do not match specified type." << "Expected: " << type_flag_ << " v.s. given " << mshadow::DataType::kFlag; +#if MKL_EXPERIMENTAL == 1 + if (Mkl_mem_ != nullptr) { + Mkl_mem_->check_and_prv_to_cpu(dptr_); + } +#endif return static_cast(dptr_); } /*! \brief device mask of the corresponding device */ diff --git a/prepare_mkldnn.sh b/prepare_mkldnn.sh deleted file mode 100755 index 7cd7d6af0609..000000000000 --- a/prepare_mkldnn.sh +++ /dev/null @@ -1,118 +0,0 @@ -#!/bin/bash - -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. - -# set -ex -# -# All modification made by Intel Corporation: © 2016 Intel Corporation -# -# All contributions by the University of California: -# Copyright (c) 2014, 2015, The Regents of the University of California (Regents) -# All rights reserved. -# -# All other contributions: -# Copyright (c) 2014, 2015, the respective contributors -# All rights reserved. -# For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md -# -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are met: -# -# * Redistributions of source code must retain the above copyright notice, -# this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * Neither the name of Intel Corporation nor the names of its contributors -# may be used to endorse or promote products derived from this software -# without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE -# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE -# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -# - -MXNET_ROOTDIR="$(pwd)" -MKLDNN_ROOTDIR="$MXNET_ROOTDIR/3rdparty/mkldnn/" -MKLDNN_SRCDIR="$MKLDNN_ROOTDIR/src" -MKLDNN_BUILDDIR="$MKLDNN_ROOTDIR/build" -MKLDNN_INSTALLDIR="$MKLDNN_ROOTDIR/install" -MKLDNN_LIBDIR="$MXNET_ROOTDIR/lib" - -# MKLDNN install destination -HOME_MKLDNN=$1 -if [ ! -z "$HOME_MKLDNN" ]; then - mkdir -p $HOME_MKLDNN - if [ ! -w $HOME_MKLDNN ]; then - echo "MKLDNN install to $HOME_MKLDNN failed, please try with sudo" >&2 - exit 1 - fi -fi - -if [ -z $MKLDNNROOT ]; then -if [ ! -f "$MKLDNN_INSTALLDIR/lib/libmkldnn.so" ]; then - mkdir -p $MKLDNN_INSTALLDIR - cd $MKLDNN_ROOTDIR - if [ -z $MKLROOT ] && [ ! -f $MKLDNN_INSTALLDIR/include/mkl_cblas.h ]; then - rm -rf external && cd scripts && ./prepare_mkl.sh && cd .. - cp -a external/*/* $MKLDNN_INSTALLDIR/. - fi - echo "Building MKLDNN ..." >&2 - cd $MXNET_ROOTDIR - g++ --version >&2 - if [ -z $ARCH_OPT ]; then - cmake $MKLDNN_ROOTDIR -DCMAKE_INSTALL_PREFIX=$MKLDNN_INSTALLDIR -B$MKLDNN_BUILDDIR - else - cmake $MKLDNN_ROOTDIR -DCMAKE_INSTALL_PREFIX=$MKLDNN_INSTALLDIR -B$MKLDNN_BUILDDIR -DARCH_OPT_FLAGS=$ARCH_OPT - fi - make -C $MKLDNN_BUILDDIR -j$(cat /proc/cpuinfo | grep processor | wc -l) VERBOSE=1 >&2 - make -C $MKLDNN_BUILDDIR install - rm -rf $MKLDNN_BUILDDIR - mkdir -p $MKLDNN_LIBDIR - cp $MKLDNN_INSTALLDIR/lib/* $MKLDNN_LIBDIR -fi -MKLDNNROOT=$MKLDNN_INSTALLDIR -fi - -if [ -z $MKLROOT ] && [ -f $MKLDNNROOT/include/mkl_cblas.h ]; then - MKLROOT=$MKLDNNROOT; -fi - -# user specified MKLDNN install folder -if [ -d "$HOME_MKLDNN" ]; then - # skip if user specificed MKLDNNROOT - [ "$MKLDNNROOT" != "$HOME_MKLDNN" ] && rsync -a $MKLDNNROOT/include $MKLDNNROOT/lib $HOME_MKLDNN/. - [ "$MKLROOT" != "$HOME_MKLDNN" ] && rsync -a $MKLROOT/include $MKLROOT/lib $HOME_MKLDNN/. - # update ldconfig if possible - if [ -w /etc/ld.so.conf.d ]; then - echo "$HOME_MKLDNN/lib" > /etc/ld.so.conf.d/mxnmkldnn.conf && ldconfig - fi -# return value to calling script (Makefile,cmake) - echo $HOME_MKLDNN $HOME_MKLDNN -else - echo $MKLDNNROOT $MKLROOT -fi - diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index 56f4b9c83e77..64619044862b 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -1287,10 +1287,6 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write', arr[:] = arg_params[name] for name, arr in exe.aux_dict.items(): arr[:] = aux_params[name] - # We need to initialize the gradient arrays if it's add. 
- if (grad_req == "add"): - for arr in exe.grad_arrays: - arr[:] = np.zeros(arr.shape, dtype=arr.dtype) dtypes = [np.dtype(exe.outputs[0].dtype) for exe in exe_list] max_idx = np.argmax(dtypes) diff --git a/src/common/exec_utils.h b/src/common/exec_utils.h index 5fd1a9b1d1b9..dcd1504fb88e 100644 --- a/src/common/exec_utils.h +++ b/src/common/exec_utils.h @@ -43,61 +43,19 @@ namespace common { indices are not recorded * \return true if any source NDArray need to cast storage */ -inline bool SetupDefaultBlobsIn(const std::vector& src, - const std::vector *bufs, - std::vector *blobs, - std::vector *temp_src, - std::vector *temp_dst, - std::unordered_map *idx_map) { +inline bool SetupDefaultBlobs(const std::vector& src, + std::vector *blobs, + std::vector *temp_src, + std::vector *temp_dst, + std::unordered_map *idx_map = nullptr) { bool require_cast = false; for (size_t i = 0; i < src.size(); i++) { auto& nd = src[i]; - bool is_default = nd.storage_type() == kDefaultStorage; -#if MXNET_USE_MKLDNN == 1 - // We have to make sure it's default storage and default layout. - is_default = nd.IsDefaultData(); -#endif - if (!is_default) { - (*idx_map)[i] = temp_dst->size(); - NDArray temp = bufs != nullptr ? bufs->at(i) : NDArray(nd.shape(), nd.ctx(), - true, nd.dtype()); -#if MXNET_USE_MKLDNN == 1 - CHECK(temp.IsDefaultData()); -#endif - temp_src->emplace_back(nd); - temp_dst->emplace_back(temp); - blobs->emplace_back(temp.data()); - require_cast = true; - } else { - blobs->push_back(nd.data()); - } - } - return require_cast; -} - -inline bool SetupDefaultBlobsOut(const std::vector& src, - const std::vector &req, - const std::vector *bufs, - std::vector *blobs, - std::vector *temp_src, - std::vector *temp_dst) { - bool require_cast = false; - for (size_t i = 0; i < src.size(); i++) { - auto& nd = src[i]; - bool is_default = nd.storage_type() == kDefaultStorage; -#if MXNET_USE_MKLDNN == 1 - // If it's writeTo, we don't need to worry whether it contains valid data. - if (req[i] == kWriteTo && is_default) - const_cast(nd).InvalidateMKLDNNData(); - // We have to make sure it's default storage and default layout. - is_default = nd.IsDefaultData(); -#endif - if (!is_default) { - NDArray temp = bufs != nullptr ? 
bufs->at(i) : NDArray(nd.shape(), nd.ctx(), - true, nd.dtype()); -#if MXNET_USE_MKLDNN == 1 - CHECK(temp.IsDefaultData()); -#endif + if (nd.storage_type() != kDefaultStorage) { + if (idx_map != nullptr) { + (*idx_map)[i] = temp_dst->size(); + } + NDArray temp(nd.shape(), nd.ctx(), false, nd.dtype()); temp_src->emplace_back(nd); temp_dst->emplace_back(temp); blobs->emplace_back(temp.data()); @@ -118,9 +76,6 @@ inline bool SetupDefaultBlobsOut(const std::vector& src, */ inline void SetupDefaultBlobsInOut(const std::vector &ndinputs, const std::vector &ndoutputs, - const std::vector &req, - const std::vector *in_bufs, - const std::vector *out_bufs, std::vector *input_blobs, std::vector *output_blobs, std::vector *pre_temp_src, @@ -130,11 +85,9 @@ inline void SetupDefaultBlobsInOut(const std::vector &ndinputs, std::unordered_map *in_temp_idx_map, const std::vector &mutate_idx) { // populate input blobs - SetupDefaultBlobsIn(ndinputs, in_bufs, input_blobs, pre_temp_src, pre_temp_dst, - in_temp_idx_map); + SetupDefaultBlobs(ndinputs, input_blobs, pre_temp_src, pre_temp_dst, in_temp_idx_map); // populate output blobs - SetupDefaultBlobsOut(ndoutputs, req, out_bufs, output_blobs, post_temp_dst, - post_temp_src); + SetupDefaultBlobs(ndoutputs, output_blobs, post_temp_dst, post_temp_src); // add mutable inputs to post temp list for (const auto idx : mutate_idx) { auto map_iter = in_temp_idx_map->find(idx); diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc index e4d49554620f..1bcc40a894dd 100644 --- a/src/executor/attach_op_execs_pass.cc +++ b/src/executor/attach_op_execs_pass.cc @@ -30,8 +30,11 @@ #include "../common/utils.h" #include "../common/exec_utils.h" #include "./exec_pass.h" -#include "../operator/nn/mkldnn/mkldnn_base-inl.h" - +#if MXNET_USE_MKL2017 == 1 +#include +#include "../operator/mkl/mkl_memory-inl.h" +#include "../operator/mkl/mkl_util-inl.h" +#endif namespace mxnet { namespace op { @@ -55,34 +58,23 @@ class StorageFallbackOpExecutor : public OpExecutor { protected: // initialize the data blobs void InitBlobs() { + using namespace common; if (!init_) { - pre_temp_buf_.clear(); - post_temp_buf_.clear(); - for (size_t i = 0; i < in_array.size(); i++) { - auto &nd = in_array[i]; - pre_temp_buf_.emplace_back(nd.shape(), nd.ctx(), true, nd.dtype()); - } - for (size_t i = 0; i < out_array.size(); i++) { - auto &nd = out_array[i]; - post_temp_buf_.emplace_back(nd.shape(), nd.ctx(), true, nd.dtype()); - } + in_data_.clear(); out_data_.clear(); + pre_temp_src_.clear(); pre_temp_dst_.clear(); + post_temp_src_.clear(); post_temp_dst_.clear(); + in_temp_idx_map_.clear(); + SetupDefaultBlobsInOut(in_array, out_array, &in_data_, &out_data_, + &pre_temp_src_, &pre_temp_dst_, + &post_temp_src_, &post_temp_dst_, + &in_temp_idx_map_, mutate_idx_); init_ = true; } } // storage fallback before fcompute is launched void PreFCompute(bool is_gpu) { - using namespace common; InitBlobs(); - in_data_.clear(); out_data_.clear(); - pre_temp_src_.clear(); pre_temp_dst_.clear(); - post_temp_src_.clear(); post_temp_dst_.clear(); - in_temp_idx_map_.clear(); - SetupDefaultBlobsInOut(in_array, out_array, req, &pre_temp_buf_, &post_temp_buf_, - &in_data_, &out_data_, - &pre_temp_src_, &pre_temp_dst_, - &post_temp_src_, &post_temp_dst_, - &in_temp_idx_map_, mutate_idx_); common::CastNonDefaultStorage(pre_temp_src_, pre_temp_dst_, op_ctx, is_gpu); } @@ -93,8 +85,6 @@ class StorageFallbackOpExecutor : public OpExecutor { // default storage tensor blobs for fcompute std::vector 
in_data_, out_data_; - // These are NDArray buffers for cast storage. - std::vector pre_temp_buf_, post_temp_buf_; // source NDArray for cast storage std::vector pre_temp_src_, post_temp_src_; // destination NDArray for cast storage @@ -116,6 +106,10 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor { PreFCompute(is_gpu); fcompute_(state_, op_ctx, in_data_, req, out_data_); PostFCompute(is_gpu); +#if MKL_EXPERIMENTAL == 1 + mkl_tblobs_prv_to_cpu(in_data_); + mkl_tblobs_prv_to_cpu(out_data_); +#endif } ExecType exec_type() const override { @@ -181,6 +175,10 @@ class FComputeExecutor : public StorageFallbackOpExecutor { PreFCompute(is_gpu); fcompute_(attrs_, op_ctx, in_data_, req, out_data_); PostFCompute(is_gpu); +#if MKL_EXPERIMENTAL == 1 + mkl_tblobs_prv_to_cpu(in_data_); + mkl_tblobs_prv_to_cpu(out_data_); +#endif } ExecType exec_type() const override { @@ -204,9 +202,6 @@ class FComputeExExecutor : public OpExecutor { public: void Run(RunContext rctx, bool is_gpu) override { op_ctx.run_ctx = rctx; -#if MXNET_USE_MKLDNN == 1 - InvalidateOutputs(out_array, req); -#endif fcompute_(attrs_, op_ctx, in_array, req, out_array); } diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index f685370619f2..2a7d2b906684 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -1209,8 +1209,7 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { const NDArray& src = data_pool_.at(storage_id); data_entry_[i] = src.AsArray(vshape[i], vdtype[i]); } else { - data_entry_[i] = NDArray(storage_type, vshape[i], data_context[i], - true, vdtype[i]); + data_entry_[i] = NDArray(storage_type, vshape[i], data_context[i]); } if (log_verbose_) { LOG(INFO) << "\tinit data entry\t" << i << "\tas " << common::stype_string(storage_type); diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc index 01fab2240952..73a34c8b0f0d 100644 --- a/src/executor/infer_graph_attr_pass.cc +++ b/src/executor/infer_graph_attr_pass.cc @@ -423,6 +423,11 @@ nnvm::Graph InferStorageType(nnvm::Graph&& graph, DispatchModeVector dispatch_modes(graph.indexed_graph().num_nodes(), DispatchMode::kUndefined); graph.attrs["dispatch_mode"] = std::make_shared(std::move(dispatch_modes)); } + // initialize unknown values for dispatch modes + if (graph.attrs.count("dispatch_mode") == 0) { + DispatchModeVector dispatch_modes(graph.indexed_graph().num_nodes(), DispatchMode::kUndefined); + graph.attrs["dispatch_mode"] = std::make_shared(std::move(dispatch_modes)); + } // initialize the dev_mask vector from the context vector if (graph.attrs.count("dev_mask") == 0) { CHECK_GT(graph.attrs.count("context"), 0); diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index 93a8bc6c54b2..eaa95a5f2418 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -214,12 +214,6 @@ nnvm::Graph Imperative::CachedOp::GetForwardGraph( StorageVector storage(idx.num_node_entries(), exec::kBadStorageID); for (const auto i : idx.input_nodes()) storage[idx.entry_id(i, 0)] = exec::kExternalStorageID; - const auto& stypes = g.GetAttr("storage_type"); - CHECK_EQ(stypes.size(), storage.size()); - for (size_t i = 0; i < stypes.size(); i++) { - if (stypes[i] != kDefaultStorage) - storage[i] = exec::kDynamicStorageID; - } auto mem_plan = PlanMemory( &g, std::move(storage), g.GetAttr >( @@ -326,10 +320,6 @@ nnvm::Graph Imperative::CachedOp::GetBackwardGraph( for (size_t i = 0; i < num_forward_entries; ++i) storage[i] = 
exec::kExternalStorageID; for (const auto i : idx.input_nodes()) storage[idx.entry_id(i, 0)] = exec::kExternalStorageID; for (const auto i : idx.outputs()) storage[idx.entry_id(i)] = exec::kExternalStorageID; - for (size_t i = 0; i < stypes.size(); i++) { - if (stypes[i] != kDefaultStorage) - storage[i] = exec::kDynamicStorageID; - } auto mem_plan = PlanMemory( &g, std::move(storage), g.GetAttr >("backward_ref_count"), diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 966a753dc120..fc28f50103b0 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -362,9 +362,9 @@ inline void PushFCompute(const FCompute& fn, // mapping from index in input_blobs to index in pre_temp_dst std::unordered_map in_temp_idx_map; // setup blobs - SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr, - &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst, - &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx); + SetupDefaultBlobsInOut(inputs, outputs, &input_blobs, &output_blobs, + &pre_temp_src, &pre_temp_dst, &post_temp_src, + &post_temp_dst, &in_temp_idx_map, mutate_idx); // setup context OpContext opctx{is_train, rctx, engine::CallbackOnComplete(), requested}; bool is_gpu = ctx.dev_mask() == gpu::kDevMask; @@ -460,9 +460,9 @@ inline void PushOperator(const OpStatePtr& state, // mapping from index in input_blobs to index in pre_temp_dst std::unordered_map in_temp_idx_map; // populate input blobs and output blobs - SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr, - &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst, - &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx); + SetupDefaultBlobsInOut(inputs, outputs, &input_blobs, &output_blobs, + &pre_temp_src, &pre_temp_dst, &post_temp_src, &post_temp_dst, + &in_temp_idx_map, mutate_idx); // setup contexts bool is_gpu = rctx.get_ctx().dev_mask() == gpu::kDevMask; // pre-fcompute fallback @@ -607,7 +607,6 @@ inline bool CheckAndInferStorageType(nnvm::Graph* p_g, exec::DevMaskVector&& dev } if (match) return true; } - g.attrs.erase("dispatch_mode"); g.attrs.erase("storage_type"); g.attrs.erase("storage_type_inputs"); if (node_range.second > node_range.first) { diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index e01cc4206b37..e98102b6b0a3 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -32,6 +32,11 @@ #include "mxnet/engine.h" #include "ps/ps.h" #include "./kvstore_dist_server.h" +#if MKL_EXPERIMENTAL == 1 +#include +#include "../operator/mkl/mkl_memory-inl.h" +#include "../operator/mkl/mkl_util-inl.h" +#endif namespace mxnet { namespace kvstore { @@ -232,6 +237,9 @@ class KVStoreDist : public KVStoreLocal { PSKV& pskv = (gradient_compression_->get_type() == CompressionType::kNone) ? EncodeDefaultKey(key, size, false) : EncodeCompressedKey(key, size, false); +#if MKL_EXPERIMENTAL == 1 + mkl_set_tblob_eager_mode(recv_buf.data()); +#endif real_t* data = recv_buf.data().dptr(); // false means not to delete data when SArray is deleted auto vals = new ps::SArray(data, size, false); @@ -381,6 +389,9 @@ class KVStoreDist : public KVStoreLocal { [this, key, pskv, small_buf](RunContext rctx, Engine::CallbackOnComplete cb) { size_t size = small_buf.shape().Size(); real_t* data = small_buf.data().dptr(); +#if MKL_EXPERIMENTAL == 1 + mkl_set_tblob_eager_mode(small_buf.data()); +#endif // do push. 
false means no delete ps::SArray vals(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPush( @@ -405,6 +416,9 @@ class KVStoreDist : public KVStoreLocal { // convert to ps keys size_t size = send_buf.shape().Size(); real_t* data = send_buf.data().dptr(); +#if MKL_EXPERIMENTAL == 1 + mkl_set_tblob_eager_mode(send_buf.data()); +#endif // do push. false means no delete ps::SArray vals(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPush( @@ -426,6 +440,9 @@ class KVStoreDist : public KVStoreLocal { using namespace rowsparse; auto push_to_servers = [this, key, send_buf] (RunContext rctx, Engine::CallbackOnComplete cb) { +#if MKL_EXPERIMENTAL == 1 + mkl_set_tblob_eager_mode(send_buf.data()); +#endif real_t* data = send_buf.data().dptr(); const int64_t num_rows = send_buf.aux_shape(kIdx)[0]; const auto offsets = send_buf.aux_data(kIdx).dptr(); @@ -464,6 +481,9 @@ class KVStoreDist : public KVStoreLocal { // allocate memory for the buffer size_t num_rows = indices.shape().Size(); recv_buf.CheckAndAlloc({mshadow::Shape1(num_rows)}); +#if MKL_EXPERIMENTAL == 1 + mkl_set_tblob_eager_mode(recv_buf.data()); +#endif real_t* data = recv_buf.data().dptr(); const auto offsets = indices.data().dptr(); const auto unit_len = recv_buf.shape().ProdShape(1, recv_buf.shape().ndim()); diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index ae7209e272b0..4db314f9cf4b 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -31,14 +31,10 @@ #include #include #include -#if MXNET_USE_MKLDNN == 1 -#include -#endif #include "./ndarray_function.h" #include "../common/utils.h" #include "../operator/tensor/matrix_op-inl.h" #include "../operator/tensor/init_op.h" -#include "../operator/nn/mkldnn/mkldnn_base-inl.h" #if MXNET_USE_OPENCV #include @@ -50,104 +46,6 @@ DMLC_REGISTRY_ENABLE(::mxnet::NDArrayFunctionReg); namespace mxnet { -NDArray::NDArray(const NDArrayStorageType stype, const TShape &shape, Context ctx, - bool delay_alloc, int dtype, std::vector aux_types, - std::vector aux_shapes, TShape storage_shape) : shape_(shape), - dtype_(dtype), storage_type_(stype), entry_({nullptr, 0, 0}) { - // Assign default aux types if not given - if (aux_types.size() == 0 - && stype != kDefaultStorage) { - if (stype == kRowSparseStorage) { - aux_types = {mshadow::kInt64}; - } else if (stype == kCSRStorage) { - aux_types = {mshadow::kInt64, mshadow::kInt64}; - } else { - LOG(FATAL) << "Unknown storage type " << stype; - } - } - // Assign default shapes if not given - // unknown shapes are intialized as {0} such that Size() would return 0 - if (aux_shapes.size() == 0 - && stype != kDefaultStorage) { - if (stype == kRowSparseStorage) { - aux_shapes = {TShape(mshadow::Shape1(0))}; - } else if (stype == kCSRStorage) { - // aux shapes for indptr and indices - aux_shapes = {TShape(mshadow::Shape1(0)), TShape(mshadow::Shape1(0))}; - } else { - LOG(FATAL) << "Unknown storage type " << stype; - } - } - if (storage_shape.Size() == 0 - && stype != kDefaultStorage) { - if (stype == kRowSparseStorage) { - storage_shape = shape; - storage_shape[0] = aux_shapes[rowsparse::kIdx][0]; - } else if (stype == kCSRStorage) { - storage_shape = aux_shapes[csr::kIdx]; - } else { - LOG(FATAL) << "Unknown storage type " << stype; - } - } - if (stype == kDefaultStorage) - ptr_ = std::make_shared(shape, ctx, delay_alloc, dtype); - else - ptr_ = std::make_shared(stype, storage_shape, ctx, delay_alloc, - dtype, aux_types, aux_shapes); -} - -struct ChunkMem { - Storage::Handle h; - std::vector aux_h; -#if MXNET_USE_MKLDNN == 1 - 
std::shared_ptr mem; -#endif -}; - -NDArray::Chunk::~Chunk() { - bool skip_free = static_data || delay_alloc; - ChunkMem mem; - mem.h = this->shandle; - mem.aux_h = this->aux_handles; -#if MXNET_USE_MKLDNN == 1 - // We want to delete mkldnn memory after deleting the variable. - mem.mem = this->mkl_mem_; -#endif - Engine::Get()->DeleteVariable([mem, skip_free](RunContext s) { - if (skip_free == false) { -#if MXNET_USE_MKLDNN == 1 - if (mem.mem) { - CHECK_LE(mem.mem->get_primitive_desc().get_size(), mem.h.size); - CHECK_EQ(mem.mem->get_data_handle(), mem.h.dptr); - } -#endif - if (mem.h.size > 0) Storage::Get()->Free(mem.h); - for (size_t i = 0; i < mem.aux_h.size(); i++) { - if (mem.aux_h[i].size > 0) Storage::Get()->Free(mem.aux_h[i]); - } - } - }, shandle.ctx, var); -} - -void NDArray::Chunk::CheckAndAllocData(const TShape &shape, int dtype) { - CHECK_NE(aux_shapes.size(), 0) - << "data is expected to be allocated after aux_data"; - auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype); - if (shandle.size < dbytes) { - // free storage if necessary and alloc again - if (shandle.size > 0) Storage::Get()->Free(shandle); - // init storage - shandle = Storage::Get()->Alloc(dbytes, ctx); -#if MXNET_USE_MKLDNN == 1 - mkl_mem_ = nullptr; -#endif - } - // init shape - storage_shape = shape; - // delay_alloc is only set when data storage handle is present - delay_alloc = false; -} - NDArray NDArray::grad() const { if (Imperative::AGInfo::IsNone(*this)) return NDArray(); Imperative::AGInfo& info = Imperative::AGInfo::Get(entry_.node); @@ -166,55 +64,15 @@ nnvm::Symbol NDArray::get_autograd_symbol() const { return ret; } -#if MXNET_USE_MKLDNN == 1 - -NDArray NDArray::MKLDNNDataReshape(const TShape &shape) const { - CHECK(!is_none()) << "NDArray is not initialized"; - CHECK_GE(shape_.Size(), shape.Size()) - << "NDArray.Reshape: target shape size is larger current shape"; - CHECK_EQ(storage_type(), kDefaultStorage); - if (!IsMKLDNNData()) { - NDArray ret = this->Detach(); - ret.shape_ = shape; - return ret; - } else { - NDArray ret(shape, ctx(), true, dtype()); - // We shouldn't submit the reorder primitive here because submit will - // be called in operators. - auto format = GetDefaultFormat(ptr_->mkl_mem_->get_primitive_desc().desc()); - CHECK_NE(format, ptr_->mkl_mem_->get_primitive_desc().desc().data.format); - auto def_pd = GetPrimitiveDesc(ptr_->mkl_mem_->get_primitive_desc(), format); - auto def_mem = TmpMemMgr::Get()->Alloc(def_pd); - MKLDNNStream *stream = MKLDNNStream::Get(); - stream->RegisterMem(ptr_->mkl_mem_); - stream->RegisterPrim(mkldnn::reorder(*ptr_->mkl_mem_, *def_mem)); - // def_mem points to a memory region in the temp space. It's only valid - // inside an operator. As such, the returned NDArray can only be valid - // inside an operator and the shared point doesn't need to do anything - // when it's destroyed. 
- ret.ptr_->mkl_mem_ = std::shared_ptr(def_mem, - [](mkldnn::memory *mem){}); - ret.ptr_->shandle.dptr = def_mem->get_data_handle(); - ret.ptr_->shandle.size = def_mem->get_primitive_desc().get_size(); - ret.ptr_->delay_alloc = false; - ret.ptr_->static_data = true; - ret.byte_offset_ = byte_offset_; - return ret; - } -} - -#endif - NDArray NDArray::Reshape(const TShape &shape) const { CHECK(!is_none()) << "NDArray is not initialized"; + auto stype = storage_type(); + // reshape is not supported for non-default ndarray with dismatching shapes + CHECK((shape_ == shape) || stype == kDefaultStorage) + << "Reshape for storage type " << stype << " is not implemented yet"; CHECK_GE(shape_.Size(), shape.Size()) << "NDArray.Reshape: target shape size is larger current shape"; NDArray ret = this->Detach(); - // If the shape doesn't change, we can just return it now. - if (ret.shape_ == shape) - return ret; - // Otherwise, reshape only works on the default layout. - CHECK_EQ(storage_type(), kDefaultStorage); ret.shape_ = shape; return ret; } @@ -237,6 +95,7 @@ NDArray NDArray::ReshapeWithRecord(const TShape &shape) { return ret; } + NDArray NDArray::Slice(index_t begin, index_t end) const { CHECK(!is_none()) << "NDArray is empty"; CHECK_LE(begin, end) @@ -268,8 +127,8 @@ NDArray NDArray::SliceWithRecord(index_t begin, index_t end) { } NDArray NDArray::At(index_t idx) const { - CHECK(storage_type() == kDefaultStorage) - << "Storage type " << storage_type() << " doesn't support At()"; + CHECK(storage_type() == kDefaultStorage) << "Storage type " + << storage_type() << " doesn't support At()"; NDArray ret = this->Slice(idx, idx+1); if (shape_.ndim() > 1) { return ret.Reshape(TShape(shape_.data()+1, shape_.data()+shape_.ndim())); @@ -322,400 +181,6 @@ void NDArray::set_fresh_out_grad(bool state) const { info.fresh_out_grad = state; } -#if MXNET_USE_MKLDNN == 1 -static inline bool same_shape(const TShape &shape, mkldnn_dims_t dims, int ndims) { - if (shape.ndim() != (size_t)ndims) - return false; - for (int i = 0; i < ndims; i++) - if (shape[i] != dims[i]) - return false; - return true; -} - -static inline bool same_shape(const TShape &shape, int dtype, mkldnn::memory::desc desc) { - return same_shape(shape, desc.data.dims, desc.data.ndims) - && get_mkldnn_type(dtype) == desc.data.data_type; -} - -bool NDArray::Chunk::IsMKLDNN() const { - if (storage_type != kDefaultStorage) - return false; - if (mkl_mem_ == nullptr) - return false; - auto desc = mkl_mem_->get_primitive_desc().desc(); - return desc.data.format != GetDefaultFormat(desc); -} - -bool NDArray::Chunk::IsDefault() const { - if (storage_type != kDefaultStorage) - return false; - // If we don't have mkldnn memory yet, we just assume it's not the default - // format. - if (mkl_mem_ == nullptr) - return true; - auto desc = mkl_mem_->get_primitive_desc().desc(); - return desc.data.format == GetDefaultFormat(desc); -} - -void NDArray::Chunk::Reorder2Default() { - if (mkl_mem_ == nullptr) - return; - - auto format = GetDefaultFormat(mkl_mem_->get_primitive_desc().desc()); - CHECK(format != mkl_mem_->get_primitive_desc().desc().data.format); - - auto def_pd = GetPrimitiveDesc(mkl_mem_->get_primitive_desc(), format); - mkldnn_mem_ptr def_mem(new mkldnn::memory(def_pd)); - // This may be called in MKLDNN operators. We can't use MKLDNNStream here. 
- std::vector net; - net.push_back(mkldnn::reorder(*mkl_mem_, *def_mem)); - mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); - - CHECK(shandle.size >= def_pd.get_size()); - CheckAndAlloc(def_pd.get_size()); - // TODO(zhengda) We need to avoid memory copy here. - memcpy(shandle.dptr, def_mem->get_data_handle(), def_pd.get_size()); - mkl_mem_.reset(new mkldnn::memory(def_pd, shandle.dptr)); -} - -void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { - // The shape of the array and the one of the MKL memory may mismatch. - // For example, if the array stores parameters, the MKL memory may store data - // in 5 dimensions while the NDArray stores data in 4 dimensions. - if (mkl_mem_ && mkl_mem_->get_data_handle() == shandle.dptr - && same_shape(shape, dtype, mkl_mem_->get_primitive_desc().desc())) { - return; - } - - mkldnn::memory::dims dims; - // These are shapes supprted by MKLDNN. - if (shape.ndim() == 1 || shape.ndim() == 2 || shape.ndim() == 4 - || shape.ndim() == 5) { - dims.resize(shape.ndim()); - for (size_t i = 0; i < dims.size(); i++) - dims[i] = shape[i]; - } else if (shape.ndim() == 3) { - // If there are 3 dimensions, we'll force it to 4 dimensions. - dims.resize(shape.ndim() + 1); - dims[0] = 1; - for (size_t i = 0; i < shape.ndim(); i++) - dims[i + 1] = shape[i]; - } else { - LOG(FATAL) << "MKLDNN doesn't support " << shape.ndim() << " dimensions"; - } - mkldnn::memory::format layout = mkldnn::memory::format::format_undef; - switch (dims.size()) { - case 1: layout = mkldnn::memory::format::x; break; - case 2: layout = mkldnn::memory::format::nc; break; - case 4: layout = mkldnn::memory::format::nchw; break; - // This isn't the right layout when the data has 5 dimensions in MXNet. - // MXNet interprets 5 dimensions as ncdhw, but MKLDNN doesn't have - // a corresponding format. - case 5: layout = mkldnn::memory::format::goihw; break; - } - mkldnn::memory::desc data_md{dims, get_mkldnn_type(dtype), layout}; - auto cpu_engine = CpuEngine::Get()->get_engine(); - if (shandle.dptr == nullptr) { - CHECK(delay_alloc); - CheckAndAlloc(); - } - mkldnn::memory::primitive_desc pd(data_md, cpu_engine); - CHECK(shandle.size >= pd.get_size()); - mkl_mem_.reset(new mkldnn::memory(pd, shandle.dptr)); -} - -/* - * Here we want to get MKLDNN memory whose primitive desc is exactly the same as - * the given one. operator== can't guarantee that. == can return true even if - * the formats are different. I need to double check its format. - */ -static inline mkldnn::memory *GetMKLDNNExact( - const mkldnn::memory *mem, mkldnn::memory::primitive_desc desc) { - auto src_desc = mem->get_primitive_desc(); - if (desc == src_desc && desc.desc().data.format == src_desc.desc().data.format) { - return const_cast(mem); - } else { - std::shared_ptr ret(new mkldnn::memory( - desc, mem->get_data_handle())); - MKLDNNStream::Get()->RegisterMem(ret); - return ret.get(); - } -} - -const mkldnn::memory *NDArray::GetMKLDNNData( - const mkldnn::memory::primitive_desc &desc) const { - if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { - LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; - return nullptr; - } - auto mem = GetMKLDNNData(); - mkldnn::memory::primitive_desc _desc = desc; - auto desc1 = mem->get_primitive_desc().desc(); - auto desc2 = _desc.desc(); - // The MKL memory has the same format and shape as required, - // or both use the default format, we can return the MKL memory. 
- if (mem->get_primitive_desc() == desc - || (desc1.data.format == GetDefaultFormat(desc1) - && desc2.data.format == GetDefaultFormat(desc2))) { - return GetMKLDNNExact(ptr_->mkl_mem_.get(), desc); - } else { - return nullptr; - } -} - -const mkldnn::memory *NDArray::GetMKLDNNDataReorder( - const mkldnn::memory::primitive_desc &desc) const { - if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { - LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; - return nullptr; - } - CHECK(storage_type() == kDefaultStorage); - - auto mem = GetMKLDNNData(); - // If the memory descriptor matches, it's easy. - MKLDNNStream *stream = MKLDNNStream::Get(); - if (mem->get_primitive_desc() == desc) { - return GetMKLDNNExact(mem, desc); - } - - mkldnn::memory::primitive_desc _desc = desc; - // Now we need to determine if we should reorder the memory. - // If both use the default formats, we think we don't need to reorder. - auto desc1 = mem->get_primitive_desc().desc(); - auto desc2 = _desc.desc(); - if (desc1.data.format == GetDefaultFormat(desc1) && - desc2.data.format == GetDefaultFormat(desc2)) { - mkldnn_mem_ptr ret(new mkldnn::memory(desc, mem->get_data_handle())); - stream->RegisterMem(ret); - return ret.get(); - } else { - auto ret = TmpMemMgr::Get()->Alloc(desc); - stream->RegisterPrim(mkldnn::reorder(*mem, *ret)); - return ret; - } -} - -const mkldnn::memory *NDArray::GetMKLDNNData() const { - CHECK(storage_type() == kDefaultStorage); - // If this array uses MKLDNN layout and it's a view, we have to change its - // layout to the default layout. - if (IsMKLDNNData() && IsView()) - ptr_->Reorder2Default(); - ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_, dtype_); - // If shandle has data, the data in shandle and mkl_mem_ should match. - if (ptr_->shandle.dptr) - CHECK(ptr_->shandle.dptr == ptr_->mkl_mem_->get_data_handle()); - MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_); - auto pd = ptr_->mkl_mem_->get_primitive_desc(); - if (IsView()) { - // Sliced array must use the default layout. - CHECK_EQ(GetDefaultFormat(pd.desc()), pd.desc().data.format); - } - if (IsView()) { - void *off_addr = static_cast(ptr_->mkl_mem_->get_data_handle()) - + byte_offset_; - - // Create the primitive desc for the new mkldnn memory. - mkldnn::memory::dims dims(shape().ndim()); - for (size_t i = 0; i < dims.size(); i++) - dims[i] = shape()[i]; - mkldnn::memory::format cpp_format = static_cast( - GetDefaultFormat(shape().ndim())); - mkldnn::memory::data_type cpp_type = static_cast( - pd.desc().data.data_type); - mkldnn::memory::desc data_md(dims, cpp_type, cpp_format); - mkldnn::memory::primitive_desc new_pd(data_md, pd.get_engine()); - - std::shared_ptr ret(new mkldnn::memory(new_pd, off_addr)); - MKLDNNStream::Get()->RegisterMem(ret); - return ret.get(); - } else { - return ptr_->mkl_mem_.get(); - } -} - -void NDArray::MKLDNNDataReorder(const mkldnn::memory::primitive_desc &pd) { - CHECK_EQ(storage_type(), kDefaultStorage); - // If the memory already uses the specified layout, don't do anything. - if (ptr_->mkl_mem_ != nullptr && ptr_->mkl_mem_->get_primitive_desc() == pd) - return; - auto _pd = pd; - auto _desc = _pd.desc(); - auto def_format = GetDefaultFormat(_desc); - // If the memory is default, don't do anything. - if (def_format == _desc.data.format && ptr_->IsDefault()) - return; - // If the specified layout is default, we should use Reorder2Default. 
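GetMKLDNNDataReorder above only issues a reorder when it must: if the requested primitive descriptor equals the existing one the buffer is handed back, if both sides use the default (plain) format the old pointer is merely re-described, and only otherwise is data actually shuffled. A small standalone sketch of that decision, with a hypothetical MemDesc struct standing in for mkldnn::memory::primitive_desc:

```cpp
#include <cstdio>

// Hypothetical stand-in for a memory descriptor: a layout tag plus the tag the
// plain ("default") layout would have for the same shape.
struct MemDesc {
  int format;
  int def_format;
  bool operator==(const MemDesc &o) const { return format == o.format; }
};

enum class Action { ReuseAsIs, ReinterpretOnly, Reorder };

// Mirrors the branch order of the removed GetMKLDNNDataReorder.
Action DecideReorder(const MemDesc &have, const MemDesc &want) {
  if (have == want)
    return Action::ReuseAsIs;        // identical layout: same buffer
  if (have.format == have.def_format && want.format == want.def_format)
    return Action::ReinterpretOnly;  // both plain: wrap the pointer in the new desc
  return Action::Reorder;            // otherwise a real data shuffle is needed
}

int main() {
  MemDesc plain{0, 0}, blocked{7, 0};
  std::printf("%d\n", static_cast<int>(DecideReorder(plain, blocked)));  // 2 = Reorder
  return 0;
}
```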
- if (def_format == _desc.data.format) { - ptr_->Reorder2Default(); - return; - } - - std::shared_ptr new_mem(new mkldnn::memory(pd)); - ptr_->SetMKLMem(shape_, dtype_); - auto old_mem = ptr_->mkl_mem_; - // It's possible that the specified layout has a different number of dimensions. - if (old_mem->get_primitive_desc().desc().data.ndims != _desc.data.ndims) { - // For now, we only support reorder from the default layout. - CHECK(ptr_->IsDefault()); - auto def_pd = GetPrimitiveDesc(pd, def_format); - old_mem.reset(new mkldnn::memory(def_pd, old_mem->get_data_handle())); - } - // This may be called in MKLDNN operators. We can't use MKLDNNStream here. - std::vector net; - net.push_back(mkldnn::reorder(*old_mem, *new_mem)); - mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); - - CHECK(ptr_->shandle.size >= pd.get_size()); - ptr_->CheckAndAlloc(pd.get_size()); - // TODO(zhengda) We need to avoid memory copy here. - memcpy(ptr_->shandle.dptr, new_mem->get_data_handle(), pd.get_size()); - ptr_->mkl_mem_.reset(new mkldnn::memory(pd, ptr_->shandle.dptr)); -} - -void NDArray::CopyFrom(const mkldnn::memory &mem) { - CHECK(ptr_ != nullptr) << "The NDArray hasn't been initialized"; - if (ptr_->mkl_mem_.get() == &mem) - return; - - CHECK(mem.get_primitive_desc().get_size() == shape().Size() * GetTypeSize(dtype_)) - << "The size of NDArray doesn't match the requested MKLDNN memory desc"; - MKLDNNStream *stream = MKLDNNStream::Get(); - // If this array uses MKLDNN layout and it's a view, we have to change its - // layout to the default layout. - if (IsMKLDNNData() && IsView()) - ptr_->Reorder2Default(); - ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_, - dtype_); - stream->RegisterMem(ptr_->mkl_mem_); - auto from_desc = mem.get_primitive_desc().desc(); - auto this_desc = ptr_->mkl_mem_->get_primitive_desc().desc(); - auto from_def_format = GetDefaultFormat(from_desc); - if (IsView()) { - // Sliced array must use the default layout. - CHECK_EQ(GetDefaultFormat(this_desc), this_desc.data.format); - } - // It's possible that the memory and the NDArray don't have the same shape. - if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims) - // If the source memory uses the default layout, we can reshape directly. - && from_def_format == from_desc.data.format) { - // In this case, we can simply create a new MKLDNN memory for the required - // shape. - mkldnn::memory::dims dims(this_desc.data.dims, - this_desc.data.dims + this_desc.data.ndims); - auto this_dtype = static_cast(this_desc.data.data_type); - auto this_format = static_cast(GetDefaultFormat(this_desc)); - mkldnn::memory::desc data_md(dims, this_dtype, this_format); - mkldnn::memory::primitive_desc pd(data_md, mem.get_primitive_desc().get_engine()); - mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle())); - stream->RegisterMem(tmp_mem); - stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->mkl_mem_)); - } else if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims)) { - // In this case, the source memory stores data in a customized layout. We - // need to reorganize the data in memory before we can reshape. 
- auto def_pd = GetPrimitiveDesc(mem.get_primitive_desc(), from_def_format); - auto def_mem = TmpMemMgr::Get()->Alloc(def_pd); - stream->RegisterPrim(mkldnn::reorder(mem, *def_mem)); - // Now we can reshape it - mkldnn::memory::dims dims(this_desc.data.dims, - this_desc.data.dims + this_desc.data.ndims); - auto this_dtype = static_cast(this_desc.data.data_type); - auto this_format = static_cast(GetDefaultFormat(this_desc)); - mkldnn::memory::desc data_md(dims, this_dtype, this_format); - mkldnn::memory::primitive_desc pd(data_md, mem.get_primitive_desc().get_engine()); - mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, def_mem->get_data_handle())); - stream->RegisterMem(tmp_mem); - stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->mkl_mem_)); - } else if (mem.get_primitive_desc() == ptr_->mkl_mem_->get_primitive_desc()) { - // If the layout is the same, we can just copy data. - stream->RegisterPrim(mkldnn::reorder(mem, *ptr_->mkl_mem_)); - } else { - auto src_def = GetDefaultFormat(mem.get_primitive_desc().desc()); - auto dst_def = GetDefaultFormat(ptr_->mkl_mem_->get_primitive_desc().desc()); - // If both are not using the default layouts. There isn't much we can do, - // other than reorder data layout directly. - if (dst_def != ptr_->mkl_mem_->get_primitive_desc().desc().data.format - && src_def != mem.get_primitive_desc().desc().data.format) { - stream->RegisterPrim(mkldnn::reorder(mem, *ptr_->mkl_mem_)); - } else if (dst_def == ptr_->mkl_mem_->get_primitive_desc().desc().data.format) { - // If the dest mem uses the default memory layout, we can simply use - // the default format of the source memory to improve perf of reorder. - auto pd = GetPrimitiveDesc(ptr_->mkl_mem_->get_primitive_desc(), src_def); - mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, ptr_->mkl_mem_->get_data_handle())); - stream->RegisterMem(tmp_mem); - stream->RegisterPrim(mkldnn::reorder(mem, *tmp_mem)); - } else { - // If the src mem uses the default memory layout, we can use - // the default format of the source memory to improve perf. - auto pd = GetPrimitiveDesc(mem.get_primitive_desc(), dst_def); - mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle())); - stream->RegisterMem(tmp_mem); - stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->mkl_mem_)); - } - } -} -mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd, - mkldnn_memory_format_t format); - -mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::primitive_desc &desc) { - // This array shouldn't be a view. - CHECK(!IsView()); - - if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { - LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; - return nullptr; - } - - mkldnn::memory::primitive_desc _desc = desc; - auto required_format = _desc.desc().data.format; - auto def_format = GetDefaultFormat(_desc.desc()); - // If the required format is a default format, we don't need to worry about the shape. - // If the shape isn't the same, it actually implicitly reshapes data. 
- if (required_format == def_format) { - ptr_->SetMKLMem(shape_, dtype_); - MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_); - return GetMKLDNNExact(ptr_->mkl_mem_.get(), desc); - } - - if (ptr_->mkl_mem_) - CHECK(ptr_->mkl_mem_->get_data_handle() == ptr_->shandle.dptr); - if (ptr_->mkl_mem_ && ptr_->mkl_mem_->get_primitive_desc() == desc) { - MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_); - return GetMKLDNNExact(ptr_->mkl_mem_.get(), desc); - } - - CHECK(ptr_->shandle.size >= desc.get_size()); - ptr_->CheckAndAlloc(desc.get_size()); - ptr_->mkl_mem_.reset(new mkldnn::memory(desc, ptr_->shandle.dptr)); - MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_); - return ptr_->mkl_mem_.get(); -} -#endif - -void NDArray::SetTBlob() const { - CHECK(ptr_ != nullptr); - TShape shape = shape_; - char *dptr = static_cast(ptr_->shandle.dptr); - auto stype = storage_type(); - if (stype == kDefaultStorage) { -#if MXNET_USE_MKLDNN == 1 - if (IsMKLDNNData()) { - ptr_->Reorder2Default(); - dptr = static_cast(ptr_->shandle.dptr); - } -#endif - dptr += byte_offset_; - } else if (stype == kCSRStorage || stype == kRowSparseStorage) { - CHECK_EQ(byte_offset_, 0); - shape = storage_shape(); - } else { - LOG(FATAL) << "unknown storage type " << stype; - } - tblob_.dptr_ = dptr; - tblob_.shape_ = shape; - tblob_.type_flag_ = dtype_; - tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id); -} /*! * \brief run a ternary operation @@ -984,51 +449,11 @@ inline void CopyFromToRspImpl(const NDArray& from, const NDArray& to, RunContext // Make a copy of a dense NDArray template inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext ctx) { -#if MXNET_USE_MKLDNN == 1 - // If neither is MKLDNN, we can copy data normally. - if (!from.IsMKLDNNData() && !to.IsMKLDNNData()) { -#endif - using namespace mshadow; - CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type"; - TBlob tmp = to.data(); - ndarray::Copy(from.data(), &tmp, - from.ctx(), to.ctx(), ctx); -#if MXNET_USE_MKLDNN == 1 - } else if (SupportMKLDNN(from.dtype(), from.shape()) - && SupportMKLDNN(to.dtype(), to.shape()) - && from.ctx().dev_mask() == cpu::kDevMask - && to.ctx().dev_mask() == cpu::kDevMask) { - // If we copy data directly, we need to make sure both NDArrays are supported - // by MKLDNN. - auto from_mem = from.GetMKLDNNData(); - auto to_mem = to.GetMKLDNNData(); - if (from_mem->get_primitive_desc() == to_mem->get_primitive_desc()) { - size_t size = std::min(from_mem->get_primitive_desc().get_size(), - to_mem->get_primitive_desc().get_size()); - memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size); - } else { - std::vector net; - net.push_back(mkldnn::reorder(*from_mem, *to_mem)); - mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); - } - } else { - // In this case, one of the NDArray isn't supported by MKLDNN, we need - // to convert the MKLDNN array to the default format first and copy data - // with Copy(). 
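The removed CPU copy path above chooses one of three strategies: a plain element copy when neither array carries an MKL-DNN layout, a memcpy or reorder when both sides are MKL-DNN-capable CPU tensors, and a convert-to-default-then-copy fallback otherwise. A compact standalone sketch of that dispatch, with booleans standing in for the layout and support queries (IsMKLDNNData, SupportMKLDNN) used in the real code:

```cpp
#include <cstdio>

enum class CopyPath { PlainCopy, DirectMklCopy, ConvertThenCopy };

// Mirrors the branch order of the removed CopyFromToDnsImpl: the MKL-DNN fast
// path is only taken when both arrays are supported CPU tensors.
CopyPath ChooseCopyPath(bool from_is_mkldnn, bool to_is_mkldnn,
                        bool both_supported_on_cpu) {
  if (!from_is_mkldnn && !to_is_mkldnn)
    return CopyPath::PlainCopy;        // ordinary TBlob copy
  if (both_supported_on_cpu)
    return CopyPath::DirectMklCopy;    // memcpy or reorder between MKL buffers
  return CopyPath::ConvertThenCopy;    // reorder the MKL side to default first
}

int main() {
  std::printf("%d\n", static_cast<int>(ChooseCopyPath(true, false, false)));  // 2
  return 0;
}
```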
- NDArray tmp_from = from; - if (tmp_from.IsMKLDNNData()) { - tmp_from = NDArray(from.shape(), from.ctx(), false, from.dtype()); - auto tmp_mem = from.GetMKLDNNData(); - tmp_from.CopyFrom(*tmp_mem); - MKLDNNStream::Get()->Submit(); - } - CHECK(tmp_from.IsDefaultData()); - CHECK(to.IsDefaultData()); - TBlob tmp = to.data(); - ndarray::Copy(from.data(), &tmp, - from.ctx(), to.ctx(), ctx); - } -#endif + using namespace mshadow; + CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type"; + TBlob tmp = to.data(); + ndarray::Copy(from.data(), &tmp, + from.ctx(), to.ctx(), ctx); } // Make a copy of an NDArray based on storage type diff --git a/src/operator/concat-inl.h b/src/operator/concat-inl.h new file mode 100644 index 000000000000..4225ddf4eac0 --- /dev/null +++ b/src/operator/concat-inl.h @@ -0,0 +1,264 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2015 by Contributors + * \file concat-inl.h + * \brief + * \author Bing Xu +*/ +#ifndef MXNET_OPERATOR_CONCAT_INL_H_ +#define MXNET_OPERATOR_CONCAT_INL_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" +#include "./channel_op_common.h" +#include "./tensor/broadcast_reduce_op.h" + +namespace mxnet { +namespace op { + +namespace concat_enum { +enum ConcatOpInputs {kData0, kData1, kData2, kData3, kData4}; +enum ConcatOpOutputs {kOut}; +} // namespace concat_enum + +struct ConcatParam : public dmlc::Parameter { + int num_args; + int dim; + DMLC_DECLARE_PARAMETER(ConcatParam) { + DMLC_DECLARE_FIELD(num_args).set_lower_bound(1) + .describe("Number of inputs to be concated."); + DMLC_DECLARE_FIELD(dim).set_default(1) + .describe("the dimension to be concated."); + } +}; // struct ConcatParam + +template +class ConcatOp : public Operator { + public: + explicit ConcatOp(ConcatParam param) + : size_(param.num_args), dimension_(param.dim) {} + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(static_cast(in_data.size()), size_); + CHECK_EQ(out_data.size(), 1U); + int axis = CheckAxis(dimension_, in_data[concat_enum::kData0].ndim()); + Stream *s = ctx.get_stream(); + std::vector > data(size_); + Tensor out; + size_t leading = 1, trailing = 1; + for (int i = 0; i < axis; ++i) { + leading *= out_data[concat_enum::kOut].shape_[i]; + } + for (int i = axis + 1; i < out_data[concat_enum::kOut].ndim(); ++i) { + trailing *= out_data[concat_enum::kOut].shape_[i]; + } + size_t mid = out_data[concat_enum::kOut].shape_[axis]; + Shape<3> oshape = Shape3(leading, mid, trailing); + out = 
out_data[concat_enum::kOut].get_with_shape(oshape, s); + + for (int i = 0; i < size_; ++i) { + Shape<3> dshape = Shape3(leading, in_data[i].shape_[axis], trailing); + data[i] = in_data[i].get_with_shape(dshape, s); + } + Concatenate(data, &out, 1, req[concat_enum::kOut]); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1U); + CHECK_EQ(in_grad.size(), static_cast(size_)); + int axis = CheckAxis(dimension_, out_grad[concat_enum::kData0].ndim()); + Stream *s = ctx.get_stream(); + std::vector > grad_in(size_); + Tensor grad; + size_t leading = 1, trailing = 1; + for (int i = 0; i < axis; ++i) { + leading *= out_grad[concat_enum::kOut].shape_[i]; + } + for (int i = axis + 1; i < out_grad[concat_enum::kOut].ndim(); ++i) { + trailing *= out_grad[concat_enum::kOut].shape_[i]; + } + size_t mid = out_grad[concat_enum::kOut].shape_[axis]; + Shape<3> oshape = Shape3(leading, mid, trailing); + grad = out_grad[concat_enum::kOut].get_with_shape(oshape, s); + + for (int i = 0; i < size_; ++i) { + Shape<3> dshape = Shape3(leading, in_grad[i].shape_[axis], trailing); + grad_in[i] = in_grad[i].get_with_shape(dshape, s); + } + Split(grad, &grad_in, 1, req); + } + + private: + int size_; + int dimension_; +}; // class ConcatOp + +template +Operator *CreateOp(ConcatParam param, int dtype, std::vector *in_shape); + +#if DMLC_USE_CXX11 +class ConcatProp : public OperatorProperty { + public: + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + std::vector ListArguments() const override { + std::vector ret; + for (int i = 0; i < param_.num_args; ++i) { + ret.push_back(std::string("arg") + std::to_string(i)); + } + return ret; + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); + TShape dshape; + index_t size = 0; + bool has_zero = false; + int axis = -1; + for (int i = 0; i < param_.num_args; ++i) { + TShape tmp = (*in_shape)[i]; + if (tmp.ndim()) { + axis = CheckAxis(param_.dim, tmp.ndim()); + has_zero = tmp[axis] == 0 || has_zero; + size += tmp[axis]; + tmp[axis] = 0; + shape_assign(&dshape, tmp); + } + } + + TShape tmp = (*out_shape)[0]; + if (tmp.ndim()) { + axis = CheckAxis(param_.dim, tmp.ndim()); + tmp[axis] = 0; + shape_assign(&dshape, tmp); + } + + if (dshape.ndim() == 0) return false; + + for (int i = 0; i < param_.num_args; ++i) { + CHECK(shape_assign(&(*in_shape)[i], dshape)) + << "Incompatible input shape: expected " << dshape << ", got " << (*in_shape)[i]; + } + + if (!has_zero) dshape[axis] = size; + CHECK(shape_assign(&(*out_shape)[0], dshape)) + << "Incompatible output shape: expected " << dshape << ", got " << (*out_shape)[0]; + + return dshape.Size() != 0; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + int dtype = -1; + + for (size_t i = 0; i < in_type->size(); ++i) { + if (dtype == -1) { + dtype = in_type->at(i); + } else { + CHECK(in_type->at(i) == dtype || + in_type->at(i) == -1) << + "Non-uniform data type in Concat"; + } + } + + if (dtype == -1) { + LOG(FATAL) << "Not enough information to infer type in 
Concat."; + return false; + } + + size_t nin = this->ListArguments().size(); + in_type->clear(); + for (size_t i = 0; i < nin; ++i) in_type->push_back(dtype); + + size_t naux = this->ListAuxiliaryStates().size(); + aux_type->clear(); + for (size_t i = 0; i < naux; ++i) aux_type->push_back(dtype); + + size_t nout = this->ListOutputs().size(); + out_type->clear(); + for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype); + + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new ConcatProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "Concat"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return out_grad; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not implemented"; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + ConcatParam param_; +}; // class ConcatProp +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_CONCAT_INL_H_ diff --git a/src/operator/concat.cc b/src/operator/concat.cc new file mode 100644 index 000000000000..4d3c2fa1661f --- /dev/null +++ b/src/operator/concat.cc @@ -0,0 +1,112 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2015 by Contributors + * \file concat.cc + * \brief + * \author Bing Xu +*/ + +#include "./concat-inl.h" +#if MXNET_USE_MKL2017 == 1 +#include +#include "./mkl/mkl_memory-inl.h" +#include "./mkl/mkl_concat-inl.h" +#endif // MXNET_USE_MKL2017 + +namespace mxnet { +namespace op { +template<> +Operator* CreateOp(ConcatParam param, int dtype, std::vector *in_shape) { + Operator *op = NULL; +#if MXNET_USE_MKL2017 == 1 + // MKL supports 4D input tensors only for concat operation + // 2D/3D input tensors are reshaped to 4D in mkl_concat-inl.h + // hence MKL supports 2D/3D/4D input tensors for concat operation + size_t dims = (*in_shape)[0].ndim(); + bool supportedDim = (dims >= 2 && dims <= 4); + if ((1 == param.dim) && supportedDim && + (param.num_args < (dnnResourceMultipleDst - dnnResourceMultipleSrc))) { + switch (dtype) { + case mshadow::kFloat32: + return new MKLConcatOp(param); + case mshadow::kFloat64: + return new MKLConcatOp(param); + default: + break; + } + } + if (enableMKLWarnGenerated()) + LOG(INFO) << MKLConcatOp::getName() << " Skip MKL optimization"; +#endif + MSHADOW_TYPE_SWITCH(dtype, DType, { + op = new ConcatOp(param); + }); + return op; +} + +Operator* ConcatProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0), in_shape); +} + +DMLC_REGISTER_PARAMETER(ConcatParam); + +MXNET_REGISTER_OP_PROPERTY(Concat, ConcatProp) +.describe(R"code(Joins input arrays along a given axis. + +.. note:: `Concat` is deprecated. Use `concat` instead. + +The dimensions of the input arrays should be the same except the axis along +which they will be concatenated. +The dimension of the output array along the concatenated axis will be equal +to the sum of the corresponding dimensions of the input arrays. + +Example:: + + x = [[1,1],[2,2]] + y = [[3,3],[4,4],[5,5]] + z = [[6,6], [7,7],[8,8]] + + concat(x,y,z,dim=0) = [[ 1., 1.], + [ 2., 2.], + [ 3., 3.], + [ 4., 4.], + [ 5., 5.], + [ 6., 6.], + [ 7., 7.], + [ 8., 8.]] + + Note that you cannot concat x,y,z along dimension 1 since dimension + 0 is not the same for all the input arrays. 
+ + concat(y,z,dim=1) = [[ 3., 3., 6., 6.], + [ 4., 4., 7., 7.], + [ 5., 5., 8., 8.]] + +)code" ADD_FILELINE) +.add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate") +.add_arguments(ConcatParam::__FIELDS__()) +.set_key_var_num_args("num_args"); + +NNVM_REGISTER_OP(Concat).add_alias("concat"); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/nn/concat.cu b/src/operator/concat.cu similarity index 81% rename from src/operator/nn/concat.cu rename to src/operator/concat.cu index f6bf5ece5c78..394fa736ee84 100644 --- a/src/operator/nn/concat.cu +++ b/src/operator/concat.cu @@ -28,12 +28,14 @@ namespace mxnet { namespace op { - -NNVM_REGISTER_OP(Concat) -.set_attr("FCompute", ConcatCompute); - -NNVM_REGISTER_OP(_backward_Concat) -.set_attr("FCompute", ConcatGradCompute); +template<> +Operator* CreateOp(ConcatParam param, int dtype, std::vector *in_shape) { + Operator *op = NULL; + MSHADOW_TYPE_SWITCH(dtype, DType, { + op = new ConcatOp(param); + }); + return op; +} } // namespace op } // namespace mxnet diff --git a/src/operator/convolution_v1.cc b/src/operator/convolution_v1.cc index 86c0fbb33291..7de6a34425f5 100644 --- a/src/operator/convolution_v1.cc +++ b/src/operator/convolution_v1.cc @@ -25,6 +25,11 @@ */ #include "./convolution_v1-inl.h" +#if MXNET_USE_MKL2017 == 1 +#include +#include "./mkl/mkl_memory-inl.h" +#include "./mkl/mkl_convolution-inl.h" +#endif // MXNET_USE_MKL2017 #if MXNET_USE_NNPACK == 1 #include "./nnpack/nnpack_convolution-inl.h" #endif // MXNET_USE_NNPACK diff --git a/src/operator/lrn-inl.h b/src/operator/lrn-inl.h new file mode 100644 index 000000000000..adfe4676702d --- /dev/null +++ b/src/operator/lrn-inl.h @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2015 by Contributors + * \file lrn-inl.h + * \brief + * \author Bing Xu +*/ +#ifndef MXNET_OPERATOR_LRN_INL_H_ +#define MXNET_OPERATOR_LRN_INL_H_ +#include +#include +#include +#include +#include +#include +#include +#include "./operator_common.h" +#include "./mshadow_op.h" + +namespace mxnet { +namespace op { + +namespace lrn_enum { +enum LRNInputs {kData}; +enum LRNOutputs {kOut, kTmpNorm}; +} // namespace lrn_enum + +struct LRNParam : public dmlc::Parameter { + float alpha; + float beta; + float knorm; + uint32_t nsize; + DMLC_DECLARE_PARAMETER(LRNParam) { + DMLC_DECLARE_FIELD(alpha).set_default(1e-4f) + .describe("The variance scaling parameter :math:`\alpha` in the LRN expression."); + DMLC_DECLARE_FIELD(beta).set_default(0.75f) + .describe("The power parameter :math:`\beta` in the LRN expression."); + DMLC_DECLARE_FIELD(knorm).set_default(2.0f) + .describe("The parameter :math:`k` in the LRN expression."); + DMLC_DECLARE_FIELD(nsize) + .describe("normalization window width in elements."); + } +}; // struct LRNParam + +template +class LocalResponseNormOp : public Operator { + public: + explicit LocalResponseNormOp(LRNParam param) { + param_ = param; + } + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + // TODO(xxx): Test with gradient chceker + CHECK_EQ(in_data.size(), 1U); + CHECK_EQ(out_data.size(), 2U); + // CHECK_EQ(req.size(), 2); + CHECK_EQ(param_.nsize % 2, 1U) << "LRN only supports odd values for local_size"; + const real_t salpha = param_.alpha / param_.nsize; + Stream *s = ctx.get_stream(); + Tensor data = in_data[lrn_enum::kData].get(s); + Tensor out = out_data[lrn_enum::kOut].get(s); + Tensor tmp_norm = out_data[lrn_enum::kTmpNorm].get(s); + tmp_norm = chpool(F(data) , param_.nsize) * salpha + param_.knorm; + Assign(out, req[lrn_enum::kOut], data * F(tmp_norm, -param_.beta)); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1U); + CHECK_EQ(in_data.size(), 1U); + CHECK_EQ(out_data.size(), 2U); + const real_t salpha = param_.alpha / param_.nsize; + Stream *s = ctx.get_stream(); + Tensor grad = out_grad[lrn_enum::kOut].get(s); + Tensor tmp_norm = out_data[lrn_enum::kTmpNorm].get(s); + Tensor data = in_data[lrn_enum::kData].get(s); + Tensor grad_in = in_grad[lrn_enum::kData].get(s); + grad_in = grad * F(tmp_norm, -param_.beta); + grad_in += (- 2.0f * param_.beta * salpha) * + chpool(grad * data * + F(tmp_norm, -param_.beta - 1.0f), + param_.nsize) * data; + } + + private: + LRNParam param_; +}; // class LocalResponseNormOp + +template +Operator *CreateOp(LRNParam param, int dtype); + +#if DMLC_USE_CXX11 +class LocalResponseNormProp : public OperatorProperty { + public: + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 1U) << "Input:[data]"; + const TShape &dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + out_shape->clear(); + 
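The Forward and Backward implementations above express LRN with mshadow expressions (a cross-channel chpool over squared activations, then a power). The following standalone reference computation of the forward pass at a single spatial position makes the window arithmetic concrete; alpha, beta, knorm and nsize correspond to the LRNParam fields, and the code is only a sketch, not the operator itself.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Reference LRN forward across C channels at one (x, y) location:
//   tmp_norm[c] = knorm + (alpha / nsize) * sum_{j in window(c)} a[j]^2
//   out[c]      = a[c] * tmp_norm[c]^(-beta)
std::vector<float> LrnForward(const std::vector<float> &a, int nsize,
                              float alpha, float beta, float knorm) {
  const int C = static_cast<int>(a.size());
  const float salpha = alpha / nsize;  // same scaling as in the operator
  std::vector<float> out(C);
  for (int c = 0; c < C; ++c) {
    const int lo = std::max(0, c - nsize / 2);
    const int hi = std::min(C - 1, c + nsize / 2);
    float acc = 0.f;
    for (int j = lo; j <= hi; ++j) acc += a[j] * a[j];
    out[c] = a[c] * std::pow(knorm + salpha * acc, -beta);
  }
  return out;
}

int main() {
  std::vector<float> act = {1.f, 2.f, 3.f, 4.f};
  auto out = LrnForward(act, /*nsize=*/3, /*alpha=*/1e-4f, /*beta=*/0.75f, /*knorm=*/2.f);
  for (float v : out) std::printf("%f ", v);
  std::printf("\n");
  return 0;
}
```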
out_shape->push_back(dshape); + out_shape->push_back(dshape); + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 1U); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); + } + } + int n_out = this->ListOutputs().size(); + out_type->clear(); + for (int i = 0; i < n_out; ++i ) out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new LocalResponseNormProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "LRN"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return { + out_grad[lrn_enum::kOut], in_data[lrn_enum::kData], + out_data[lrn_enum::kTmpNorm], out_data[lrn_enum::kOut] + }; + } + + int NumVisibleOutputs() const override { + return 1; + } + + int NumOutputs() const override { + return 2; + } + + std::vector ListArguments() const override { + return {"data"}; + } + + std::vector ListOutputs() const override { + return {"output", "tmp_norm"}; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + LRNParam param_; +}; // LocalResponseNormProp +#endif // DMLC_USE_CXX11 +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_LRN_INL_H_ diff --git a/src/operator/lrn.cc b/src/operator/lrn.cc new file mode 100644 index 000000000000..9b3afd80cd18 --- /dev/null +++ b/src/operator/lrn.cc @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2015 by Contributors + * \file lrn.cc + * \brief + * \author Bing Xu +*/ + +#include "./lrn-inl.h" +#if MXNET_USE_CUDNN == 1 +#include "./cudnn_lrn-inl.h" +#endif +#if MXNET_USE_MKL2017 == 1 +#include +#include "./mkl/mkl_memory-inl.h" +#include "./mkl/mkl_lrn-inl.h" +#endif + +namespace mxnet { +namespace op { +template<> +Operator* CreateOp(LRNParam param, int dtype) { +#if MXNET_USE_MKL2017 == 1 + return new MKLLRNOp(param); +#endif + return new LocalResponseNormOp(param); +} + +// DO_BIND_DISPATCH comes from operator_common.h +Operator* LocalResponseNormProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); +} + +DMLC_REGISTER_PARAMETER(LRNParam); + +MXNET_REGISTER_OP_PROPERTY(LRN, LocalResponseNormProp) +.add_argument("data", "NDArray-or-Symbol", "Input data.") +.add_arguments(LRNParam::__FIELDS__()) +.describe(R"code(Applies local response normalization to the input. + +The local response normalization layer performs "lateral inhibition" by normalizing +over local input regions. + +If :math:`a_{x,y}^{i}` is the activity of a neuron computed by applying kernel :math:`i` at position +:math:`(x, y)` and then applying the ReLU nonlinearity, the response-normalized +activity :math:`b_{x,y}^{i}` is given by the expression: + +.. math:: + b_{x,y}^{i} = \frac{a_{x,y}^{i}}{\Bigg({k + \alpha \sum_{j=max(0, i-\frac{n}{2})}^{min(N-1, i+\frac{n}{2})} (a_{x,y}^{j})^{2}}\Bigg)^{\beta}} + +where the sum runs over :math:`n` "adjacent" kernel maps at the same spatial position, and :math:`N` is the total +number of kernels in the layer. + +)code" ADD_FILELINE); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/nn/lrn.cu b/src/operator/lrn.cu similarity index 64% rename from src/operator/nn/lrn.cu rename to src/operator/lrn.cu index 4c31ca96025c..ba872f1d26d0 100644 --- a/src/operator/nn/lrn.cu +++ b/src/operator/lrn.cu @@ -25,15 +25,29 @@ */ #include "./lrn-inl.h" +#if MXNET_USE_CUDNN == 1 +#include "./cudnn_lrn-inl.h" +#endif namespace mxnet { namespace op { - -NNVM_REGISTER_OP(LRN) -.set_attr("FCompute", LRNCompute); - -NNVM_REGISTER_OP(_backward_LRN) -.set_attr("FCompute", LRNGradCompute); +template<> +Operator* CreateOp(LRNParam param, int dtype) { + Operator *op = NULL; +#if MXNET_USE_CUDNN == 1 + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new CuDNNLocalResponseNormOp(param); + }) +#else +#if CUDA_VERSION == 7000 + LOG(FATAL) << "Due to old CUDA compiler bug, LRN is disabled." + << "Please upgrade CUDA to 7.5+ or use CUDNN"; +#else + op = new LocalResponseNormOp(param); +#endif // CUDA_VERSION +#endif // MXNET_USE_CUDNN + return op; +} } // namespace op } // namespace mxnet diff --git a/src/operator/mkl/mkl_batch_norm-inl.h b/src/operator/mkl/mkl_batch_norm-inl.h new file mode 100644 index 000000000000..b5967f4de294 --- /dev/null +++ b/src/operator/mkl/mkl_batch_norm-inl.h @@ -0,0 +1,391 @@ +/******************************************************************************* +* Copyright 2016 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+* See the License for the specific language governing permissions and +* limitations under the License. +* +* \file mkl_batch_norm-inl.h +* \brief +* \author lingyan.guo@intel.com +* zhenlin.luo@intel.com +* +*******************************************************************************/ +#ifndef MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_ +#define MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include "../operator_common.h" +#include "../mshadow_op.h" +#include "./mkl_util-inl.h" + +namespace mxnet { +namespace op { + +template +class MKLBatchNormOp : public Operator { + public: + explicit MKLBatchNormOp(BatchNormParam param) { + this->param_ = param; + fwd_top_data = MKLData::create(); + fwd_bottom_data = MKLData::create(); + bwd_top_diff = MKLData::create(); + bwd_bottom_diff = MKLData::create(); + scaleShift_space.dptr = NULL; + scaleShiftDiff_space.dptr = NULL; + } + virtual ~MKLBatchNormOp() { + if (batchNormFwdInference != NULL) dnnDelete(batchNormFwdInference); + if (batchNormFwdTraining != NULL) dnnDelete(batchNormFwdTraining); + if (batchNormBwdScaleShift != NULL) dnnDelete(batchNormBwdScaleShift); + dnnLayoutDelete(layout_usr_); + if (scaleShift_space.dptr) + Storage::Get()->Free(scaleShift_space); + if (scaleShiftDiff_space.dptr) + Storage::Get()->Free(scaleShiftDiff_space); + } + static std::string getName() { + return "MKLBatchNormOp"; + } + + private: + void LayerSetUp(const mshadow::Tensor &data, + const mshadow::Tensor &out) { + eps_ = param_.eps; + size_t dim = 4, sizes[4], strides[4]; + channels_ = data.shape_[1]; + height_ = data.shape_[2]; + width_ = data.shape_[3]; + num_ = data.shape_[0]; + + sizes[0] = width_; + sizes[1] = height_; + sizes[2] = channels_; + sizes[3] = num_; + + strides[0] = 1; + strides[1] = sizes[0]; + strides[2] = sizes[0] * sizes[1]; + strides[3] = sizes[0] * sizes[1] * sizes[2]; + + // Names are for debugging only + fwd_bottom_data->name = "fwd_bottom_data @ " + getName(); + fwd_top_data->name = "fwd_top_data @ " + getName(); + bwd_bottom_diff->name = "bwd_bottom_diff @ " + getName(); + bwd_top_diff->name = "bwd_top_diff @ " + getName(); + + dnnError_t e; + e = dnnLayoutCreate(&layout_usr_, dim, sizes, strides); + CHECK_EQ(e, E_SUCCESS); + + fwd_bottom_data->create_user_layout(dim, sizes, strides); + fwd_top_data->create_user_layout(dim, sizes, strides); + bwd_bottom_diff->create_user_layout(dim, sizes, strides); + bwd_top_diff->create_user_layout(dim, sizes, strides); + + // Primitives will be allocated during the first fwd pass + batchNormFwdInference = NULL; + batchNormFwdTraining = NULL; + batchNormBwdScaleShift = NULL; + int scaleShift_size = channels_*2*sizeof(DType); + scaleShift_space = Storage::Get()->Alloc(scaleShift_size, Context::CPU()); + scaleShiftDiff_space = Storage::Get()->Alloc(scaleShift_size, Context::CPU()); + DType * scaleShift_buf = reinterpret_cast(scaleShift_space.dptr); + /*!use_weight_bias_*/ + for (int i = 0; i < channels_; i++) { + scaleShift_buf[i] = 1.0; + scaleShift_buf[channels_ + i] = 0; + } + } + + public: + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), 3); + CHECK_EQ(aux_states.size(), 2); + if (ctx.is_train) { + CHECK_EQ(out_data.size(), 3); + CHECK_EQ(req.size(), 3); + } else { + CHECK_GE(out_data.size(), 1); + CHECK_GE(req.size(), 
1); + CHECK_EQ(req[batchnorm::kOut], kWriteTo); + } + + Stream *s = ctx.get_stream(); + Tensor data; + Tensor out; + if (in_data[batchnorm::kData].ndim() == 2) { + Shape<4> dshape = Shape4(in_data[batchnorm::kData].shape_[0], + in_data[batchnorm::kData].shape_[1], 1, 1); + data = mkl_experimental_direct_get_with_shape( + in_data[batchnorm::kData], dshape, s); + out = mkl_experimental_direct_get_with_shape( + out_data[batchnorm::kOut], dshape, s); + } else { + data = mkl_experimental_direct_get(in_data[batchnorm::kData], s); + out = mkl_experimental_direct_get(out_data[batchnorm::kOut], s); + } + + // const real_t scale = static_cast(in_data[batchnorm::kData].shape_[1]) / + // static_cast(in_data[batchnorm::kData].shape_.Size()); + + Tensor slope = in_data[batchnorm::kGamma].get(s); + Tensor bias = in_data[batchnorm::kBeta].get(s); + Tensor moving_mean = aux_states[batchnorm::kMovingMean].get(s); + Tensor moving_var = aux_states[batchnorm::kMovingVar].get(s); + + if (param_.fix_gamma) + slope = 1.f; + + dnnError_t e; + if (!init_mkldnn_) { + LayerSetUp(data, out); + init_mkldnn_ = true; + } + void* bottom_data = NULL; +#if MKL_EXPERIMENTAL == 1 + bottom_data = + reinterpret_cast(mkl_prv_data(in_data[batchnorm::kData])); +#endif + int bwd_flags = dnnUseScaleShift; + if (param_.use_global_stats) + bwd_flags = dnnUseScaleShift | dnnUseInputMeanVariance; +#if MKL_EXPERIMENTAL == 1 + if (NULL != bottom_data) { + // Is it the first pass? Create a primitive. + if (batchNormFwdInference == NULL) { + std::shared_ptr bottom_data_mem = in_data[batchnorm::kData].Mkl_mem_; + std::shared_ptr bottom_prv_desc = bottom_data_mem->get_prv_descriptor(); + CHECK(bottom_prv_desc->get_descr_type() == PrvMemDescr::PRV_DESCR_MKL2017); + std::shared_ptr > mem_descr + = std::static_pointer_cast>(bottom_prv_desc); + CHECK(mem_descr != NULL); + fwd_bottom_data = mem_descr; + + e = dnnBatchNormalizationCreateForward_v2( + &batchNormFwdInference, NULL, mem_descr->layout_int, eps_, + dnnUseInputMeanVariance | dnnUseScaleShift); + CHECK_EQ(e, E_SUCCESS); + + e = dnnBatchNormalizationCreateForward_v2( + &batchNormFwdTraining, NULL, mem_descr->layout_int, eps_, + dnnUseScaleShift); + CHECK_EQ(e, E_SUCCESS); + + fwd_top_data->create_internal_layout(batchNormFwdInference, dnnResourceDst); + bwd_top_diff->create_internal_layout(batchNormFwdInference, dnnResourceDst); + bwd_bottom_diff->create_internal_layout(batchNormFwdInference, dnnResourceSrc); + + e = dnnBatchNormalizationCreateBackward_v2( + &batchNormBwdScaleShift, NULL, mem_descr->layout_int, eps_, bwd_flags); + CHECK_EQ(e, E_SUCCESS); + } + } +#endif + if (NULL == bottom_data) { + if (batchNormFwdInference == NULL) { + e = dnnBatchNormalizationCreateForward_v2( + &batchNormFwdInference, NULL, layout_usr_, eps_, + dnnUseInputMeanVariance | dnnUseScaleShift); + CHECK_EQ(e, E_SUCCESS); + + e = dnnBatchNormalizationCreateForward_v2( + &batchNormFwdTraining, NULL, layout_usr_, eps_, dnnUseScaleShift); + CHECK_EQ(e, E_SUCCESS); + + e = dnnBatchNormalizationCreateBackward_v2( + &batchNormBwdScaleShift, NULL, layout_usr_, eps_, bwd_flags); + CHECK_EQ(e, E_SUCCESS); + } + bottom_data = reinterpret_cast(data.dptr_); + } + + DType * scaleShift_buf = reinterpret_cast(scaleShift_space.dptr); + // use_weight_bias_ + for (int i = 0; i < channels_; i++) { + scaleShift_buf[i] = (slope.dptr_)[i]; + } + for (int i = 0; i < channels_; i++) { + scaleShift_buf[channels_ + i] = (bias.dptr_)[i]; + } + + void* BatchNorm_res[dnnResourceNumber]; + BatchNorm_res[dnnResourceSrc] = bottom_data; + 
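Before executing the primitive, the code above packs gamma into the first channels_ entries of scaleShift_space and beta into the next channels_, then passes the buffer as dnnResourceScaleShift. Below is a standalone sketch of what the inference-mode primitive computes per channel under that packed-buffer convention; it is a plain reference computation, not the MKL call itself, and assumes channel-major data.

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// y = gamma * (x - mean) / sqrt(var + eps) + beta, with gamma and beta packed
// into one buffer: scale_shift[c] = gamma_c, scale_shift[C + c] = beta_c.
void BatchNormInference(const std::vector<float> &x, int channels,
                        const std::vector<float> &scale_shift,
                        const std::vector<float> &mean,
                        const std::vector<float> &var,
                        float eps, std::vector<float> *y) {
  const int spatial = static_cast<int>(x.size()) / channels;  // elements per channel
  y->resize(x.size());
  for (int c = 0; c < channels; ++c) {
    const float gamma = scale_shift[c];
    const float beta = scale_shift[channels + c];
    const float inv_std = 1.f / std::sqrt(var[c] + eps);
    for (int i = 0; i < spatial; ++i) {
      const int idx = c * spatial + i;
      (*y)[idx] = gamma * (x[idx] - mean[c]) * inv_std + beta;
    }
  }
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f};   // 2 channels x 2 elements
  std::vector<float> ss = {1.f, 1.f, 0.f, 0.f};  // gamma = 1, beta = 0
  std::vector<float> mean = {1.5f, 3.5f}, var = {0.25f, 0.25f}, y;
  BatchNormInference(x, 2, ss, mean, var, 1e-3f, &y);
  for (float v : y) std::printf("%f ", v);
  std::printf("\n");
  return 0;
}
```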
BatchNorm_res[dnnResourceScaleShift] = scaleShift_space.dptr; + + BatchNorm_res[dnnResourceDst] = fwd_top_data->get_output_ptr(out.dptr_, + fwd_top_data, out_data[batchnorm::kOut]); + if (ctx.is_train && !param_.use_global_stats) { + Tensor mean = out_data[batchnorm::kMean].get(s); + Tensor var = out_data[batchnorm::kVar].get(s); + CHECK(req[batchnorm::kMean] == kNullOp || req[batchnorm::kMean] == kWriteTo); + CHECK(req[batchnorm::kVar] == kNullOp || req[batchnorm::kVar] == kWriteTo); + BatchNorm_res[dnnResourceMean] = mean.dptr_; + BatchNorm_res[dnnResourceVariance] = var.dptr_; + e = dnnExecute(batchNormFwdTraining, BatchNorm_res); + CHECK_EQ(e, E_SUCCESS); + } else { + BatchNorm_res[dnnResourceMean] = moving_mean.dptr_; + BatchNorm_res[dnnResourceVariance] = moving_var.dptr_; + e = dnnExecute(batchNormFwdInference, BatchNorm_res); + CHECK_EQ(e, E_SUCCESS); + } + +#if MKL_EXPERIMENTAL == 0 + if (fwd_top_data->conversion_needed()) { + fwd_top_data->convert_from_prv(out.dptr_); + } +#endif + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1); + CHECK_EQ(in_data.size(), 3); + CHECK_EQ(out_data.size(), 3); + CHECK_EQ(in_grad.size(), 3); + Stream *s = ctx.get_stream(); + Tensor data, grad, grad_in; + + if (in_data[batchnorm::kData].ndim() == 2) { + Shape<4> dshape = Shape4(out_grad[batchnorm::kOut].shape_[0], + out_grad[batchnorm::kOut].shape_[1], 1, 1); + data = mkl_experimental_direct_get_with_shape( + in_data[batchnorm::kData], dshape, s); + grad = mkl_experimental_direct_get_with_shape( + out_grad[batchnorm::kOut], dshape, s); + grad_in = mkl_experimental_direct_get_with_shape( + in_grad[batchnorm::kData], dshape, s); + } else { + data = mkl_experimental_direct_get(in_data[batchnorm::kData], s); + grad = mkl_experimental_direct_get(out_grad[batchnorm::kOut], s); + grad_in = mkl_experimental_direct_get(in_grad[batchnorm::kData], s); + } + + Tensor slope = in_data[batchnorm::kGamma].get(s); + Tensor gslope = in_grad[batchnorm::kGamma].get(s); + Tensor gbias = in_grad[batchnorm::kBeta].get(s); + Tensor mean = out_data[batchnorm::kMean].get(s); + Tensor var = out_data[batchnorm::kVar].get(s); + Tensor moving_mean = aux_states[batchnorm::kMovingMean].get(s); + Tensor moving_var = aux_states[batchnorm::kMovingVar].get(s); + + if (param_.fix_gamma) slope = 1.f; + + void* bottom_data = NULL; +#if MKL_EXPERIMENTAL == 1 + bottom_data = reinterpret_cast(mkl_prv_data(in_data[batchnorm::kData])); +#endif + if (NULL == bottom_data) + bottom_data = reinterpret_cast(data.dptr_); + + dnnError_t e; + void* BatchNorm_res[dnnResourceNumber]; + BatchNorm_res[dnnResourceSrc] = bottom_data; + BatchNorm_res[dnnResourceScaleShift] = scaleShift_space.dptr; + if (ctx.is_train && !param_.use_global_stats) { + int size = mean.size(0); // Tensor + float * moving_mean_ptr = reinterpret_cast(moving_mean.dptr_); + float * mean_ptr = reinterpret_cast(mean.dptr_); + float * moving_var_ptr = reinterpret_cast(moving_var.dptr_); + float * var_ptr = reinterpret_cast(var.dptr_); + float minus_mom = (1 - param_.momentum); + for (int i = 0; i < size; i++) { + moving_mean_ptr[i] = moving_mean_ptr[i] * param_.momentum + + mean_ptr[i] * minus_mom; + } + for (int i = 0; i < size; i++) { + moving_var_ptr[i] = moving_var_ptr[i] * param_.momentum + + var_ptr[i] * 
minus_mom; + } + BatchNorm_res[dnnResourceMean] = mean.dptr_; + BatchNorm_res[dnnResourceVariance] = var.dptr_; + } else { + BatchNorm_res[dnnResourceMean] = moving_mean.dptr_; + BatchNorm_res[dnnResourceVariance] = moving_var.dptr_; + } + + + BatchNorm_res[dnnResourceDiffSrc] = bwd_bottom_diff->get_output_ptr(grad_in.dptr_, + bwd_bottom_diff, in_grad[batchnorm::kData]); + BatchNorm_res[dnnResourceDiffDst] = bwd_top_diff->get_converted_prv(grad.dptr_, + true, out_grad[batchnorm::kOut]); + BatchNorm_res[dnnResourceDiffScaleShift] = scaleShiftDiff_space.dptr; + e = dnnExecute(batchNormBwdScaleShift, BatchNorm_res); + CHECK_EQ(e, E_SUCCESS); +#if MKL_EXPERIMENTAL == 0 + if (bwd_bottom_diff->conversion_needed()) { + bwd_bottom_diff->convert_from_prv(grad_in.dptr_); + } +#endif + DType * scaleShiftDiff_buf = reinterpret_cast(scaleShiftDiff_space.dptr); + if (!param_.fix_gamma) { + // Store ScaleShift blobs + DType* diff_scale = gslope.dptr_; + for (int i = 0; i < channels_; i++) { + diff_scale[i] = scaleShiftDiff_buf[i]; + } + } else { + int gslope_size = gslope.size(0); + float * gslope_ptr = reinterpret_cast(gslope.dptr_); + for (int i = 0; i < gslope_size; i++) { + *gslope_ptr++ = 0.0f; + } + } + DType* diff_shift = gbias.dptr_; + for (int i = 0; i < channels_; i++) { + diff_shift[i] = scaleShiftDiff_buf[channels_ + i]; + } + } + + private: + BatchNormParam param_; + DType eps_; + bool use_weight_bias_; + + int num_; + int channels_; + int height_; + int width_; + bool init_mkldnn_ = false; + std::shared_ptr > fwd_top_data; + std::shared_ptr > fwd_bottom_data; + std::shared_ptr > bwd_top_diff; + std::shared_ptr > bwd_bottom_diff; + dnnPrimitive_t batchNormFwdInference = NULL; + dnnPrimitive_t batchNormFwdTraining = NULL; + dnnPrimitive_t batchNormBwdScaleShift = NULL; + Storage::Handle scaleShift_space; + Storage::Handle scaleShiftDiff_space; + dnnLayout_t layout_usr_ = NULL; +}; // class BatchNormOp +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_ diff --git a/src/operator/mkl/mkl_concat-inl.h b/src/operator/mkl/mkl_concat-inl.h new file mode 100644 index 000000000000..1ed1e81d1303 --- /dev/null +++ b/src/operator/mkl/mkl_concat-inl.h @@ -0,0 +1,314 @@ +/******************************************************************************* +* Copyright 2016 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
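In the training branch of Backward above, the running statistics are blended with the batch statistics using the operator's momentum. A short worked sketch of that update, applied identically to moving_mean and moving_var:

```cpp
#include <cstdio>
#include <vector>

// moving = momentum * moving + (1 - momentum) * batch, element-wise per channel.
void UpdateRunningStat(std::vector<float> *moving, const std::vector<float> &batch,
                       float momentum) {
  const float minus_mom = 1.f - momentum;
  for (size_t i = 0; i < moving->size(); ++i)
    (*moving)[i] = (*moving)[i] * momentum + batch[i] * minus_mom;
}

int main() {
  std::vector<float> moving_mean = {0.f, 10.f};
  UpdateRunningStat(&moving_mean, /*batch=*/{1.f, 12.f}, /*momentum=*/0.9f);
  std::printf("%f %f\n", moving_mean[0], moving_mean[1]);  // 0.1 10.2
  return 0;
}
```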
+* +* \file mkl_concat-inl.h +* \brief +* \author lingyan.guo@intel.com +* zhenlin.luo@intel.com +* +*******************************************************************************/ +#ifndef MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_ +#define MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include "../operator_common.h" +#include "../channel_op_common.h" +#include "./mkl_util-inl.h" +namespace mxnet { +namespace op { + + +template +class MKLConcatOp : public Operator { + public: + static std::string getName() { + return "MKLConcatOp"; + } + explicit MKLConcatOp(ConcatParam param) + : size_(param.num_args), dimension_(param.dim), init_mkldnn_(false) { + concatFwd_ = static_cast(NULL); + concatBwd_ = static_cast(NULL); + fwd_top_data_ = MKLData::create(); + bwd_top_diff_ = MKLData::create(); + + num_concats_ = param.num_args; + } + virtual ~MKLConcatOp() { + dnnDelete(concatFwd_); + dnnDelete(concatBwd_); + } + + private: + void LayerSetUp(const std::vector > &data, + const mshadow::Tensor &out, + size_t data_shape_size, size_t *split_channels_) { + size_t dim_src = data_shape_size; + size_t dim_dst = dim_src; + num_concats_ = size_; + channels_ = 0; + + for (size_t i = 1; i < num_concats_; ++i) { + for (size_t j = 1; j < data_shape_size; ++j) { + if (j == dimension_) continue; + CHECK_EQ(data[0].shape_[j], data[i].shape_[j]); + } + } + + for (size_t i = 0; i < num_concats_; ++i) { + CHECK_EQ((int)dim_src, data[i].shape_.kDimension); + + fwd_bottom_data_.push_back(MKLData::create()); + bwd_bottom_diff_.push_back(MKLData::create()); + fwd_bottom_data_[i]->name = "fwd_bottom_data_[i]"; + bwd_bottom_diff_[i]->name = "bwd_bottom_data[i]"; + + size_t *sizes_src = new size_t[dim_src]; + size_t *strides_src = new size_t[dim_src]; + for (size_t d = 0; d < dim_src; ++d) { + sizes_src[d] = data[i].shape_[dim_src - d - 1]; + strides_src[d] = (d == 0) ? 1 : strides_src[d - 1] * sizes_src[d - 1]; + } + + split_channels_[i] = data[i].shape_[1]; + channels_ += split_channels_[i]; + fwd_bottom_data_[i]->create_user_layout(dim_src, sizes_src, strides_src); + bwd_bottom_diff_[i]->create_user_layout(dim_src, sizes_src, strides_src); + delete[] sizes_src; + delete[] strides_src; + } + size_t *sizes_dst = new size_t[dim_dst]; + size_t *strides_dst = new size_t[dim_dst]; + for (size_t d = 0; d < dim_dst; ++d) { + if (d == 2) + sizes_dst[d] = channels_; + else + sizes_dst[d] = data[0].shape_[dim_dst - 1 - d]; + strides_dst[d] = (d == 0) ? 
1 : strides_dst[d - 1] * sizes_dst[d - 1]; + } + bwd_top_diff_->create_user_layout(dim_dst, sizes_dst, strides_dst); + fwd_top_data_->create_user_layout(dim_dst, sizes_dst, strides_dst); + delete[] sizes_dst; + delete[] strides_dst; + concatFwd_ = NULL; + concatBwd_ = NULL; + } + + public: + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(static_cast(in_data.size()), size_); + CHECK_EQ(out_data.size(), 1); + CHECK_LT(dimension_, (size_t)in_data[concat_enum::kData0].ndim()); + Stream *s = ctx.get_stream(); + std::vector > data(size_); + Tensor out; + if (in_data[0].ndim() == 2) { + for (int i = 0; i < size_; ++i) { + Shape<4> dshape = Shape4(in_data[i].shape_[0], + in_data[i].shape_[1], 1, 1); + data[i] = mkl_experimental_direct_get_with_shape( + in_data[i], dshape, s); + } + Shape<4> dshape = Shape4(out_data[concat_enum::kOut].shape_[0], + out_data[concat_enum::kOut].shape_[1], 1, 1); + out = mkl_experimental_direct_get_with_shape( + out_data[concat_enum::kOut], dshape, s); + } else if (in_data[0].ndim() == 3) { + for (int i = 0; i < size_; ++i) { + Shape<4> dshape = Shape4(in_data[i].shape_[0], + in_data[i].shape_[1], in_data[i].shape_[2], 1); + data[i] = mkl_experimental_direct_get_with_shape( + in_data[i], dshape, s); + } + Shape<4> dshape = Shape4(out_data[concat_enum::kOut].shape_[0], + out_data[concat_enum::kOut].shape_[1], + out_data[concat_enum::kOut].shape_[2], 1); + out = mkl_experimental_direct_get_with_shape( + out_data[concat_enum::kOut], dshape, s); + } else { + for (int i = 0; i < size_; ++i) { + data[i] = mkl_experimental_direct_get(in_data[i], s); + } + out = mkl_experimental_direct_get(out_data[concat_enum::kOut], s); + } + size_t *split_channels_ = new size_t[num_concats_]; + if (!init_mkldnn_) { + init_mkldnn_ = true; + LayerSetUp(data, out, 4, split_channels_); + } + + dnnError_t e; + std::vector bottom_data; + bool isFirstPass = (concatFwd_ == NULL); + dnnLayout_t *layouts = NULL; + if (isFirstPass) { + layouts = new dnnLayout_t[num_concats_]; + } + + for (size_t i = 0; i < num_concats_; i++) { + void * bottom_i = NULL; +#if MKL_EXPERIMENTAL == 1 + bottom_i = mkl_prv_data(in_data[i]); + if (bottom_i != NULL) { + if (isFirstPass) { + std::shared_ptr > mem_descr = + mkl_get_mem_desc(in_data[i].Mkl_mem_); + fwd_bottom_data_[i] = mem_descr; + layouts[i] = mem_descr->layout_int; + } + } +#endif + if (bottom_i == NULL) { + bottom_i = data[i].dptr_; + if (isFirstPass) { + layouts[i] = fwd_bottom_data_[i]->layout_usr; + } + } + + bottom_data.push_back(reinterpret_cast(bottom_i)); + } + + if (isFirstPass) { + e = dnnConcatCreate(&concatFwd_, NULL, num_concats_, layouts); + CHECK_EQ(e, E_SUCCESS); + + fwd_top_data_->create_internal_layout(concatFwd_, dnnResourceDst); + bwd_top_diff_->create_internal_layout(concatFwd_, dnnResourceDst); + + e = dnnSplitCreate(&concatBwd_, NULL, num_concats_, + bwd_top_diff_->layout_int, split_channels_); + CHECK_EQ(e, E_SUCCESS); + + for (size_t n = 0; n < num_concats_; ++n) { + fwd_bottom_data_[n]->create_internal_layout(concatFwd_, + (dnnResourceType_t)(dnnResourceMultipleSrc + n)); + bwd_bottom_diff_[n]->create_internal_layout(concatBwd_, + (dnnResourceType_t)(dnnResourceMultipleDst + n)); + } + } + delete[] layouts; + + void *concat_res[dnnResourceNumber]; + for (size_t i = 0; i < num_concats_; ++i) { + concat_res[dnnResourceMultipleSrc + i] + = 
reinterpret_cast(bottom_data[i]); + } + + concat_res[dnnResourceDst] = fwd_top_data_->get_output_ptr(out.dptr_, + fwd_top_data_, out_data[concat_enum::kOut]); + e = dnnExecute(concatFwd_, concat_res); + CHECK_EQ(e, E_SUCCESS); + delete[] split_channels_; + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1); + CHECK_EQ(in_grad.size(), static_cast(size_)); + Stream *s = ctx.get_stream(); + std::vector > grad_in(size_); + Tensor grad; + if (in_grad[0].ndim() == 2) { + Shape<4> dshape = Shape4(out_grad[concat_enum::kOut].shape_[0], + out_grad[concat_enum::kOut].shape_[1], 1, 1); + grad = mkl_experimental_direct_get_with_shape( + out_grad[concat_enum::kOut], dshape, s); + for (int i = 0; i < size_; ++i) { + dshape = Shape4(in_grad[i].shape_[0], + in_grad[i].shape_[1], 1, 1); + grad_in[i] = mkl_experimental_direct_get_with_shape( + in_grad[i], dshape, s); + } + } else if (in_grad[0].ndim() == 3) { + Shape<4> dshape = Shape4(out_grad[concat_enum::kOut].shape_[0], + out_grad[concat_enum::kOut].shape_[1], + out_grad[concat_enum::kOut].shape_[2], 1); + grad = mkl_experimental_direct_get_with_shape( + out_grad[concat_enum::kOut], dshape, s); + for (int i = 0; i < size_; ++i) { + dshape = Shape4(in_grad[i].shape_[0], + in_grad[i].shape_[1], in_grad[i].shape_[2], 1); + grad_in[i] = mkl_experimental_direct_get_with_shape( + in_grad[i], dshape, s); + } + } else { + grad = mkl_experimental_direct_get(out_grad[concat_enum::kOut], s); + for (int i = 0; i < size_; ++i) { + grad_in[i] = mkl_experimental_direct_get(in_grad[i], s); + } + } + + int need_bwd = 0; + for (size_t n = 0; n < num_concats_; n++) { + need_bwd += req[n]; + } + if (!need_bwd) { + return; + } + + dnnError_t e; + void *concat_res[dnnResourceNumber]; + concat_res[dnnResourceSrc] = bwd_top_diff_->get_converted_prv(grad.dptr_, true, + out_grad[concat_enum::kOut]); + for (size_t i = 0; i < num_concats_; ++i) { + concat_res[dnnResourceMultipleDst + i] = bwd_bottom_diff_[i]->get_output_ptr( + grad_in[i].dptr_, bwd_bottom_diff_[i], in_grad[i]); + } + e = dnnExecute(concatBwd_, concat_res); + CHECK_EQ(e, E_SUCCESS); + } + + private: + int size_; + size_t dimension_; + + bool init_mkldnn_; + + dnnPrimitive_t concatFwd_; + dnnPrimitive_t concatBwd_; + std::shared_ptr > fwd_top_data_; + std::vector< std::shared_ptr > > fwd_bottom_data_; + std::shared_ptr > bwd_top_diff_; + std::vector< std::shared_ptr > > bwd_bottom_diff_; + + + size_t width_; + size_t height_; + size_t channels_; + size_t num_; + size_t num_concats_; +}; // class MKLConcatOp +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_ diff --git a/src/operator/mkl/mkl_convolution-inl.h b/src/operator/mkl/mkl_convolution-inl.h new file mode 100644 index 000000000000..813d061f172b --- /dev/null +++ b/src/operator/mkl/mkl_convolution-inl.h @@ -0,0 +1,490 @@ +/******************************************************************************* +* Copyright 2016 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +* \file mkl_convolution-inl.h +* \brief +* \author lingyan.guo@intel.com +* zhenlin.luo@intel.com +* +*******************************************************************************/ +#ifndef MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_ +#define MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../operator_common.h" +#include "../nn/convolution-inl.h" +#include "./mkl_util-inl.h" + +namespace mxnet { +namespace op { + +template +class MKLConvolutionOp : public Operator { + public: + static std::string getName() { + return "MKLConvolutionOp"; + } + void SetupBuffer() { + convolutionBwdBias = static_cast(NULL); + convolutionBwdFilter = static_cast(NULL); + convolutionBwdData = static_cast(NULL); + convolutionFwd = static_cast(NULL); + fwd_bottom_data = MKLData::create(); + fwd_top_data = MKLData::create(); + fwd_filter_data = MKLData::create(); + fwd_bias_data = MKLData::create(); + bwdd_top_diff = MKLData::create(); + bwdd_bottom_diff = MKLData::create(); + bwdd_filter_data = MKLData::create(); + bwdf_top_diff = MKLData::create(); + bwdf_filter_diff = MKLData::create(); + bwdf_bottom_data = MKLData::create(); + bwdb_top_diff = MKLData::create(); + bwdb_bias_diff = MKLData::create(); + // Names are for debugging purposes only. + fwd_bottom_data->name = "fwd_bottom_data @ " + this->getName(); + fwd_top_data->name = "fwd_top_data @ " + this->getName(); + fwd_filter_data->name = "fwd_filter_data @ " + this->getName(); + fwd_bias_data->name = "fwd_bias_data @ " + this->getName(); + bwdd_top_diff->name = "bwdd_top_diff @ " + this->getName(); + bwdd_bottom_diff->name = "bwdd_bottom_diff @ " + this->getName(); + bwdd_filter_data->name = "bwdd_filter_data @ " + this->getName(); + bwdf_top_diff->name = "bwdf_top_diff @ " + this->getName(); + bwdf_bottom_data->name = "bwdf_bottom_data @ " + this->getName(); + bwdf_filter_diff->name = "bwdf_filter_diff @ " + this->getName(); + bwdb_top_diff->name = "bwdb_top_diff @ " + this->getName(); + bwdb_bias_diff->name = "bwdb_bias_diff @ " + this->getName(); + } + + explicit MKLConvolutionOp(ConvolutionParam p): + convolutionFwd(NULL), + convolutionBwdData(static_cast(NULL)), + convolutionBwdFilter(static_cast(NULL)), + convolutionBwdBias(static_cast(NULL)) { + this->param_ = p; + init_mkldnn_ = false; + // convert MBytes first to Bytes and then to elements. 
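+    // Illustrative arithmetic (example values only): with DType = float (4 bytes) and
+    // param_.workspace = 1024 MB, (1024 << 20) = 1073741824 bytes, and
+    // 1073741824 / sizeof(float) = 268435456 workspace elements.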
+ param_.workspace = (param_.workspace << 20) / sizeof(DType); + SetupBuffer(); + } + void ReleaseBuffer() { + if (convolutionFwd != NULL) { + dnnDelete(convolutionFwd); + convolutionFwd = NULL; + } + if (convolutionBwdData != NULL) { + dnnDelete(convolutionBwdData); + convolutionBwdData = NULL; + } + if (convolutionBwdFilter != NULL) { + dnnDelete(convolutionBwdFilter); + convolutionBwdFilter = NULL; + } + if (!param_.no_bias && convolutionBwdBias != NULL) { + dnnDelete(convolutionBwdBias); + convolutionBwdBias = NULL; + } + } + virtual ~MKLConvolutionOp() { + ReleaseBuffer(); + } + + private: + void LayerSetUp(const mshadow::Tensor &data, + const mshadow::Tensor &out) { + this->width_ = data.shape_[3]; + this->height_ = data.shape_[2]; + this->channels_ = data.shape_[1]; + this->num_ = data.shape_[0]; + this->group_ = param_.num_group; + this->width_out_ = out.shape_[3]; + this->height_out_ = out.shape_[2]; + int channel_out_ = out.shape_[1]; + this->num_output_ = channel_out_; + kernel_w_ = param_.kernel[1]; + kernel_h_ = param_.kernel[0]; + stride_w_ = param_.stride[1]; + stride_h_ = param_.stride[0]; + pad_w_ = param_.pad[1]; + pad_h_ = param_.pad[0]; + int status; + size_t n, g; + size_t iw, ih, ic; + size_t ow, oh, oc; + size_t kw, kh; + size_t dimension = 4; + g = std::max(this->group_, 1); + n = this->num_; + iw = this->width_; + ih = this->height_; + ic = this->channels_; + ow = this->width_out_; + oh = this->height_out_; + oc = this->num_output_; + kw = this->kernel_w_; + kh = this->kernel_h_; + oc = this->num_output_; + size_t bdata_sizes[4] = { iw, ih, ic, n }; + size_t bdata_strides[4] = { 1, iw, iw*ih, iw*ih*ic }; + /* starting with MKL 2017 Gold in case of groups filter layout + * becomes 5D, i.e. groups become a separate dimension */ + size_t g_mkl2017 = g; + size_t f_dimension = dimension + (g != 1); + if (getMKLBuildDate() < 20160701) { + g_mkl2017 = 1; + f_dimension = dimension; + } + size_t fdata_sizes[5] = { kw, kh, ic / g, oc / g_mkl2017, g_mkl2017 }; + size_t fdata_strides[5] = { 1, kw, kw*kh, kw*kh*ic / g, kw*kh*ic / g*oc / g }; + size_t bias_sizes[1] = { oc }; + size_t bias_strides[1] = { 1 }; + size_t tdata_sizes[4] = { ow, oh, oc, n }; + size_t tdata_strides[4] = { 1, ow, ow*oh, ow*oh*oc }; + size_t convolutionStrides[2] = { this->stride_w_, this->stride_h_ }; + int inputOffset[2] = { -this->pad_w_, -this->pad_h_ }; + // Names are for debugging purposes only. 
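+    // Worked example of the dense (user) layouts above, with made-up shapes: for a
+    // batch of n = 32 NCHW inputs with ic = 16 channels and ih = iw = 28, MKL orders
+    // dimensions fastest-varying first, so bdata_sizes = {28, 28, 16, 32} and
+    // bdata_strides = {1, 28, 28*28 = 784, 28*28*16 = 12544}; tdata_* follows the same
+    // pattern for the output, and fdata_* gains a fifth (group) dimension when g > 1.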
+ /*** convolution section ***/ + if (!param_.no_bias) { + status = dnnGroupsConvolutionCreateForwardBias(&convolutionFwd, + NULL, + dnnAlgorithmConvolutionDirect, + g, + dimension, + bdata_sizes, + tdata_sizes, + fdata_sizes, + convolutionStrides, + inputOffset, + dnnBorderZeros); + } else { + status = dnnGroupsConvolutionCreateForward(&convolutionFwd, + NULL, + dnnAlgorithmConvolutionDirect, + g, + dimension, + bdata_sizes, + tdata_sizes, + fdata_sizes, + convolutionStrides, + inputOffset, + dnnBorderZeros); + } + CHECK_EQ(status, 0) + << "Failed dnnCreateConvolution(dnnForward) with status " + << status << "\n"; + fwd_bottom_data->create_layouts(convolutionFwd, dnnResourceSrc, dimension, + bdata_sizes, bdata_strides); + fwd_top_data->create_layouts(convolutionFwd, dnnResourceDst, dimension, + tdata_sizes, tdata_strides); + fwd_filter_data->create_layouts(convolutionFwd, dnnResourceFilter, + f_dimension, fdata_sizes, fdata_strides); + if (!param_.no_bias) + fwd_bias_data->create_layouts(convolutionFwd, dnnResourceBias, 1, + bias_sizes, bias_strides); + /* + * Backward by data layer setup + */ + status = dnnGroupsConvolutionCreateBackwardData(&convolutionBwdData, + NULL, + dnnAlgorithmConvolutionDirect, + g, + dimension, + bdata_sizes, + tdata_sizes, + fdata_sizes, + convolutionStrides, + inputOffset, + dnnBorderZeros); + CHECK_EQ(status, 0) + << "Failed dnnConvolutionCreateBackwardData with status " + << status << "\n"; + bwdd_bottom_diff->create_layouts(convolutionBwdData, dnnResourceDiffSrc, + dimension, bdata_sizes, bdata_strides); + bwdd_top_diff->create_layouts(convolutionBwdData, dnnResourceDiffDst, + dimension, tdata_sizes, tdata_strides); + bwdd_filter_data->create_layouts(convolutionBwdData, dnnResourceFilter, + f_dimension, fdata_sizes, fdata_strides); + /* + * Backward by filter layer setup + */ + status = dnnGroupsConvolutionCreateBackwardFilter(&convolutionBwdFilter, + NULL, + dnnAlgorithmConvolutionDirect, + g, + dimension, + bdata_sizes, + tdata_sizes, + fdata_sizes, + convolutionStrides, + inputOffset, + dnnBorderZeros); + CHECK_EQ(status, 0) + << "Failed dnnConvolutionCreateBackwardFilter with status " + << status << "\n"; + bwdf_bottom_data->create_layouts(convolutionBwdFilter, dnnResourceSrc, + dimension, bdata_sizes, bdata_strides); + bwdf_top_diff->create_layouts(convolutionBwdFilter, dnnResourceDiffDst, + dimension, tdata_sizes, tdata_strides); + bwdf_filter_diff->create_layouts(convolutionBwdFilter, dnnResourceDiffFilter, + f_dimension, fdata_sizes, fdata_strides); + /* + * Backward by bias layer setup + */ + if (!param_.no_bias) { + status = dnnGroupsConvolutionCreateBackwardBias(&convolutionBwdBias, + NULL, + dnnAlgorithmConvolutionDirect, + g, + dimension, + tdata_sizes); + CHECK_EQ(status, 0) + << "Failed dnnConvolutionCreateBackwardBias with status " + << status << "\n"; + bwdb_top_diff->create_layouts(convolutionBwdBias, dnnResourceDiffDst, + dimension, tdata_sizes, tdata_strides); + bwdb_bias_diff->create_layouts(convolutionBwdBias, dnnResourceDiffBias, 1, + bias_sizes, bias_strides); + } + } + + public: + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + Stream *s = ctx.get_stream(); + DType *data_ptr = NULL; + DType *wmat_ptr = NULL; + DType *out_ptr = NULL; + Tensor data = + mkl_experimental_direct_get(in_data[conv::kData], s); + Tensor out = + mkl_experimental_direct_get(out_data[conv::kOut], s); + Tensor wmat = + 
      mkl_experimental_direct_get<xpu, 4, DType>(in_data[conv::kWeight], s);
+    if (!init_mkldnn_) {
+      LayerSetUp(data, out);
+      init_mkldnn_ = true;
+    }
+    CHECK_EQ(data.CheckContiguous(), true);
+    CHECK_EQ(wmat.CheckContiguous(), true);
+    CHECK_EQ(out.CheckContiguous(), true);
+    data_ptr = data.dptr_;
+    wmat_ptr = wmat.dptr_;
+    out_ptr = out.dptr_;
+    int status;
+    void *res_convolutionFwd[dnnResourceNumber];
+    res_convolutionFwd[dnnResourceSrc] =
+      fwd_bottom_data->get_converted_prv(data_ptr, false, in_data[conv::kData]);
+    res_convolutionFwd[dnnResourceFilter] =
+      fwd_filter_data->get_converted_prv(wmat_ptr, true, in_data[conv::kWeight]);
+    if (!param_.no_bias) {
+      Tensor<xpu, 1, DType> bias =
+        mkl_experimental_direct_get<xpu, 1, DType>(in_data[conv::kBias], s);
+      res_convolutionFwd[dnnResourceBias] =
+        fwd_bias_data->get_converted_prv(bias.dptr_, true, in_data[conv::kBias]);
+    }
+
+    res_convolutionFwd[dnnResourceDst] = fwd_top_data->get_output_ptr(out_ptr,
+      fwd_top_data, out_data[conv::kOut]);
+    status = dnnExecute<DType>(convolutionFwd, res_convolutionFwd);
+    CHECK_EQ(status, 0) << "Forward convolution failed with status " << status;
+#if MKL_EXPERIMENTAL == 0
+    if (fwd_top_data->conversion_needed()) {
+      fwd_top_data->convert_from_prv(out_ptr);
+    }
+#endif
+  }
+  void AddToModeAllocAndStoreBuffer(void *src, int blob_size, Storage::Handle *pws) {
+    int blob_byte_size = blob_size * sizeof(DType);
+    *pws = Storage::Get()->Alloc(blob_byte_size, Context::CPU());
+    memcpy(pws->dptr, src, blob_byte_size);
+  }
+  void AddToModeAddAndReleaseBuffer(Storage::Handle *pws, void *dst_, int blob_size) {
+    DType *dst = reinterpret_cast<DType*>(dst_);
+    DType *src = reinterpret_cast<DType*>(pws->dptr);
+#pragma omp parallel for
+    for (int i = 0; i < blob_size; i++) {
+      dst[i] += src[i];
+    }
+    if (pws->dptr)
+      Storage::Get()->Free(*pws);
+    pws->dptr = NULL;
+  }
+  virtual void Backward(const OpContext &ctx,
+                        const std::vector<TBlob> &out_grad,
+                        const std::vector<TBlob> &in_data,
+                        const std::vector<TBlob> &out_data,
+                        const std::vector<OpReqType> &req,
+                        const std::vector<TBlob> &in_grad,
+                        const std::vector<TBlob> &aux_args) {
+    using namespace mshadow;
+    if (param_.kernel.ndim() > 2) {
+      LOG(FATAL) << "Volume convolution is not implemented in mshadow";
+    }
+    CHECK_EQ(out_grad.size(), 1);
+    size_t expected = param_.no_bias == 0 ? 3 : 2;
+    CHECK(in_data.size() == expected && in_grad.size() == expected);
+    CHECK_EQ(req.size(), expected);
+    CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true);
+    Stream<xpu> *s = ctx.get_stream<xpu>();
+    Tensor<xpu, 4, DType> data =
+      mkl_experimental_direct_get<xpu, 4, DType>(in_data[conv::kData], s);
+    Shape<3> wmat_shape =
+      Shape3(param_.num_group,
+             param_.num_filter / param_.num_group,
+             data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1]);
+    Tensor<xpu, 3, DType> wmat =
+      mkl_experimental_direct_get_with_shape<xpu, 3, DType>(
+      in_data[conv::kWeight], wmat_shape, s);
+    Tensor<xpu, 4, DType> grad =
+      mkl_experimental_direct_get<xpu, 4, DType>(out_grad[conv::kOut], s);
+    Tensor<xpu, 4, DType> gdata =
+      mkl_experimental_direct_get<xpu, 4, DType>(in_grad[conv::kData], s);
+    Tensor<xpu, 3, DType> gwmat =
+      mkl_experimental_direct_get_with_shape<xpu, 3, DType>(
+      in_grad[conv::kWeight], wmat_shape, s);
+
+    if (!init_mkldnn_) {
+      init_mkldnn_ = true;
+      LayerSetUp(data, grad);
+    }
+    int status;
+    if (req[0]) {
+      void *res_convolutionBwdData[dnnResourceNumber];
+      res_convolutionBwdData[dnnResourceDiffDst] =
+        bwdd_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]);
+
+      res_convolutionBwdData[dnnResourceFilter] =
+        bwdd_filter_data->get_converted_prv(wmat.dptr_, false, in_data[conv::kWeight]);
+      Storage::Handle addtoWorkspace;
+      if (req[0] == kAddTo) {
+        // MKL has no native kAddTo support: stash the current gradient and add it back below.
+        AddToModeAllocAndStoreBuffer(gdata.dptr_, in_grad[conv::kData].Size(), &addtoWorkspace);
+      }
+
+      res_convolutionBwdData[dnnResourceDiffSrc] = bwdd_bottom_diff->get_output_ptr(gdata.dptr_,
+        bwdd_bottom_diff, in_grad[conv::kData]);
+      status = dnnExecute<DType>(convolutionBwdData, res_convolutionBwdData);
+      CHECK_EQ(status, 0) << "Backward Data conv failed with status " << status;
+#if MKL_EXPERIMENTAL == 0
+      if (bwdd_bottom_diff->conversion_needed()) {
+        bwdd_bottom_diff->convert_from_prv(gdata.dptr_);
+      }
+#endif
+      if (req[0] == kAddTo) {
+        if (bwdd_bottom_diff->conversion_needed()) {
+          bwdd_bottom_diff->convert_from_prv(gdata.dptr_);
+        }
+        AddToModeAddAndReleaseBuffer(&addtoWorkspace, gdata.dptr_, in_grad[conv::kData].Size());
+      }
+    }
+    if (req[1]) {
+      void *res_convolutionBwdFilter[dnnResourceNumber];
+
+      res_convolutionBwdFilter[dnnResourceDiffDst] =
+        bwdf_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]);
+
+      res_convolutionBwdFilter[dnnResourceSrc] =
+        bwdf_bottom_data->get_converted_prv(data.dptr_, false,
+          in_data[conv::kData]);
+      Storage::Handle addtoWorkspace;
+      if (req[1] == kAddTo) {
+        // MKL has no native kAddTo support: stash the current gradient and add it back below.
+        AddToModeAllocAndStoreBuffer(gwmat.dptr_, in_grad[conv::kWeight].Size(), &addtoWorkspace);
+      }
+
+      res_convolutionBwdFilter[dnnResourceDiffFilter] = bwdf_filter_diff->get_output_ptr(
+        gwmat.dptr_, bwdf_filter_diff, in_grad[conv::kWeight]);
+      status = dnnExecute<DType>(convolutionBwdFilter, res_convolutionBwdFilter);
+      CHECK_EQ(status, 0) << "Backward Filter conv failed with status " << status;
+#if MKL_EXPERIMENTAL == 0
+      if (bwdf_filter_diff->conversion_needed()) {
+        bwdf_filter_diff->convert_from_prv(gwmat.dptr_);
+      }
+#endif
+      if (req[1] == kAddTo) {
+        if (bwdf_filter_diff->conversion_needed()) {
+          bwdf_filter_diff->convert_from_prv(gwmat.dptr_);
+        }
+        AddToModeAddAndReleaseBuffer(&addtoWorkspace, gwmat.dptr_, in_grad[conv::kWeight].Size());
+      }
+    }
+    if (!param_.no_bias) {
+      Tensor<xpu, 1, DType> gbias =
+        mkl_experimental_direct_get<xpu, 1, DType>(in_grad[conv::kBias], s);
+      void *res_convolutionBwdBias[dnnResourceNumber];
+      res_convolutionBwdBias[dnnResourceDiffDst] =
+        bwdb_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]);
+
+      res_convolutionBwdBias[dnnResourceDiffBias] =
bwdb_bias_diff->get_output_ptr(gbias.dptr_, + bwdb_bias_diff, in_grad[conv::kBias]); + status = dnnExecute(convolutionBwdBias, res_convolutionBwdBias); + CHECK_EQ(status, 0) << "Backward Bias failed with status " << status; +#if MKL_EXPERIMENTAL == 0 + if (bwdb_bias_diff->conversion_needed()) { + bwdb_bias_diff->convert_from_prv(gbias.dptr_); + } +#endif + } + } + + private: + ConvolutionParam param_; + size_t width_, + height_, + width_out_, + height_out_, + kernel_w_, + kernel_h_, + stride_w_, + stride_h_; + int group_, + num_, + num_output_; + size_t channels_; + int pad_w_, + pad_h_; + bool init_mkldnn_; + dnnPrimitive_t convolutionFwd; + dnnPrimitive_t convolutionBwdData; + dnnPrimitive_t convolutionBwdFilter; + dnnPrimitive_t convolutionBwdBias; + /* Fwd step */ + std::shared_ptr > fwd_bottom_data, fwd_top_data, fwd_filter_data, + fwd_bias_data; + /* Bwd data step */ + std::shared_ptr > bwdd_top_diff, bwdd_bottom_diff; + std::shared_ptr > bwdd_filter_data; + /* Bwd filter step */ + std::shared_ptr > bwdf_top_diff, bwdf_filter_diff; + std::shared_ptr > bwdf_bottom_data; + std::shared_ptr > bwdf_filter_diff_iter, bwdf2fwd_filter_diff, + bwdb_bias_diff_iter; + /* Bwd bias step */ + std::shared_ptr > bwdb_top_diff, bwdb_bias_diff; +}; // class ConvolutionOp +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_ diff --git a/src/operator/mkl/mkl_cppwrapper.cc b/src/operator/mkl/mkl_cppwrapper.cc new file mode 100644 index 000000000000..507e5498c85b --- /dev/null +++ b/src/operator/mkl/mkl_cppwrapper.cc @@ -0,0 +1,44 @@ +/******************************************************************************* +* Copyright 2016 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +* \file mkl_cppwrapper.cc +* \brief +* \author lingyan.guo@intel.com +* zhenlin.luo@intel.com +* +*******************************************************************************/ + + + +#include "mkl_cppwrapper.h" +#include +#if MXNET_USE_MKL2017 == 1 +#include "mkl_service.h" + +int getMKLBuildDate() { + static int build = 0; + if (build == 0) { + MKLVersion v; + mkl_get_version(&v); + build = atoi(v.Build); + printf("MKL Build:%d\n", build); + } + return build; +} + +bool enableMKLWarnGenerated() { + return false; +} +#endif // MSHADOW_USE_MKL2017 diff --git a/src/operator/mkl/mkl_cppwrapper.h b/src/operator/mkl/mkl_cppwrapper.h new file mode 100644 index 000000000000..7d66f20ad308 --- /dev/null +++ b/src/operator/mkl/mkl_cppwrapper.h @@ -0,0 +1,1020 @@ +/******************************************************************************* +* Copyright 2016 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +* \file mkl_cppwrapper.h +* \brief +* \author lingyan.guo@intel.com +* zhenlin.luo@intel.com +* +*******************************************************************************/ +#ifndef MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_ +#define MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_ + + +#include +#include +#if MXNET_USE_MKL2017 == 1 +#include "mkl_dnn_types.h" +#include "mkl_dnn.h" +#include "mkl_version.h" + + +extern int getMKLBuildDate(); +extern bool enableMKLWarnGenerated(); + + +template inline dnnError_t dnnLayoutCreate( + dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]); +template <> inline dnnError_t dnnLayoutCreate( + dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]) { + return dnnLayoutCreate_F32(pLayout, dimension, size, strides); +} +template <> inline dnnError_t dnnLayoutCreate( + dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]) { + return dnnLayoutCreate_F64(pLayout, dimension, size, strides); +} + +template inline dnnError_t dnnLayoutCreateFromPrimitive( + dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type); +template <> inline dnnError_t dnnLayoutCreateFromPrimitive( + dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type) { + return dnnLayoutCreateFromPrimitive_F32(pLayout, primitive, type); +} +template <> inline dnnError_t dnnLayoutCreateFromPrimitive( + dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type) { + return dnnLayoutCreateFromPrimitive_F64(pLayout, primitive, type); +} + +template inline size_t dnnLayoutGetMemorySize( + const dnnLayout_t layout); +template <> inline size_t dnnLayoutGetMemorySize( + const dnnLayout_t layout) { + return dnnLayoutGetMemorySize_F32(layout); +} +template <> inline size_t dnnLayoutGetMemorySize( + const dnnLayout_t layout) { + return dnnLayoutGetMemorySize_F64(layout); +} + +template inline int dnnLayoutCompare( + const dnnLayout_t l1, const dnnLayout_t l2); +template <> inline int dnnLayoutCompare( + const dnnLayout_t l1, const dnnLayout_t l2) { + return dnnLayoutCompare_F32(l1, l2); +} +template <> inline int dnnLayoutCompare( + const dnnLayout_t l1, const dnnLayout_t l2) { + return dnnLayoutCompare_F64(l1, l2); +} + + +template inline dnnError_t dnnAllocateBuffer( + void **pPtr, dnnLayout_t layout); +template <> inline dnnError_t dnnAllocateBuffer( + void **pPtr, dnnLayout_t layout) { + return dnnAllocateBuffer_F32(pPtr, layout); +} +template <> inline dnnError_t dnnAllocateBuffer( + void **pPtr, dnnLayout_t layout) { + return dnnAllocateBuffer_F64(pPtr, layout); +} + +template inline dnnError_t dnnReleaseBuffer( + void *ptr); +template <> inline dnnError_t dnnReleaseBuffer( + void *ptr) { + return dnnReleaseBuffer_F32(ptr); +} +template <> inline dnnError_t dnnReleaseBuffer( + void *ptr) { + return dnnReleaseBuffer_F64(ptr); +} + +template inline dnnError_t dnnLayoutDelete( + dnnLayout_t layout); +template <> inline dnnError_t dnnLayoutDelete( + dnnLayout_t layout) { + return dnnLayoutDelete_F32(layout); +} +template <> inline dnnError_t 
dnnLayoutDelete( + dnnLayout_t layout) { + return dnnLayoutDelete_F64(layout); +} + +template inline dnnError_t dnnPrimitiveAttributesCreate( + dnnPrimitiveAttributes_t *attributes); +template <> inline dnnError_t dnnPrimitiveAttributesCreate( + dnnPrimitiveAttributes_t *attributes) { + return dnnPrimitiveAttributesCreate_F32(attributes); +} +template <> inline dnnError_t dnnPrimitiveAttributesCreate( + dnnPrimitiveAttributes_t *attributes) { + return dnnPrimitiveAttributesCreate_F64(attributes); +} + + +template inline dnnError_t dnnPrimitiveAttributesDestroy( + dnnPrimitiveAttributes_t attributes); +template <> inline dnnError_t dnnPrimitiveAttributesDestroy( + dnnPrimitiveAttributes_t attributes) { + return dnnPrimitiveAttributesDestroy_F32(attributes); +} +template <> inline dnnError_t dnnPrimitiveAttributesDestroy( + dnnPrimitiveAttributes_t attributes) { + return dnnPrimitiveAttributesDestroy_F64(attributes); +} + +template inline dnnError_t dnnPrimitiveGetAttributes( + dnnPrimitive_t primitive, + dnnPrimitiveAttributes_t *attributes); +template <> inline dnnError_t dnnPrimitiveGetAttributes( + dnnPrimitive_t primitive, + dnnPrimitiveAttributes_t *attributes) { + return dnnPrimitiveGetAttributes_F32(primitive, attributes); +} +template <> inline dnnError_t dnnPrimitiveGetAttributes( + dnnPrimitive_t primitive, + dnnPrimitiveAttributes_t *attributes) { + return dnnPrimitiveGetAttributes_F64(primitive, attributes); +} + +template inline dnnError_t dnnExecute( + dnnPrimitive_t primitive, void *resources[]); +template <> inline dnnError_t dnnExecute( + dnnPrimitive_t primitive, void *resources[]) { + return dnnExecute_F32(primitive, resources); +} +template <> inline dnnError_t dnnExecute( + dnnPrimitive_t primitive, void *resources[]) { + return dnnExecute_F64(primitive, resources); +} + +template inline dnnError_t dnnExecuteAsync( + dnnPrimitive_t primitive, void *resources[]); +template <> inline dnnError_t dnnExecuteAsync( + dnnPrimitive_t primitive, void *resources[]) { + return dnnExecuteAsync_F32(primitive, resources); +} +template <> inline dnnError_t dnnExecuteAsync( + dnnPrimitive_t primitive, void *resources[]) { + return dnnExecuteAsync_F64(primitive, resources); +} + +template inline dnnError_t dnnWaitFor( + dnnPrimitive_t primitive); +template <> inline dnnError_t dnnWaitFor( + dnnPrimitive_t primitive) { + return dnnWaitFor_F32(primitive); +} +template <> inline dnnError_t dnnWaitFor( + dnnPrimitive_t primitive) { + return dnnWaitFor_F64(primitive); +} + +template inline dnnError_t dnnDelete( + dnnPrimitive_t primitive); +template <> inline dnnError_t dnnDelete( + dnnPrimitive_t primitive) { + return dnnDelete_F32(primitive); +} +template <> inline dnnError_t dnnDelete( + dnnPrimitive_t primitive) { + return dnnDelete_F64(primitive); +} + + +template inline dnnError_t dnnConversionCreate( + dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to); +template <> inline dnnError_t dnnConversionCreate( + dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to) { + return dnnConversionCreate_F32(pConversion, from, to); +} +template <> inline dnnError_t dnnConversionCreate( + dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to) { + return dnnConversionCreate_F64(pConversion, from, to); +} + + +template inline dnnError_t dnnConversionExecute( + dnnPrimitive_t conversion, void *from, void *to); +template <> inline dnnError_t dnnConversionExecute( + dnnPrimitive_t conversion, void *from, void *to) { + return 
dnnConversionExecute_F32(conversion, from, to); +} +template <> inline dnnError_t dnnConversionExecute( + dnnPrimitive_t conversion, void *from, void *to) { + return dnnConversionExecute_F64(conversion, from, to); +} + + +template inline dnnError_t dnnConvolutionCreateForward( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); +template <> inline dnnError_t dnnConvolutionCreateForward( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { + return dnnConvolutionCreateForward_F32( + pConvolution, + attributes, + algorithm, + dimension, srcSize, dstSize, filterSize, + convolutionStrides, inputOffset, border_type); +} + +template <> inline dnnError_t dnnConvolutionCreateForward( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { + return dnnConvolutionCreateForward_F64( + pConvolution, + attributes, + algorithm, + dimension, srcSize, dstSize, filterSize, + convolutionStrides, inputOffset, border_type); +} + + +template inline dnnError_t dnnConvolutionCreateForwardBias( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); +template <> inline dnnError_t dnnConvolutionCreateForwardBias( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { + return dnnConvolutionCreateForwardBias_F32( + pConvolution, + attributes, + algorithm, + dimension, srcSize, dstSize, filterSize, + convolutionStrides, inputOffset, border_type); +} +template <> inline dnnError_t dnnConvolutionCreateForwardBias( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { + return dnnConvolutionCreateForwardBias_F64( + pConvolution, + attributes, + algorithm, + dimension, srcSize, dstSize, filterSize, + convolutionStrides, inputOffset, border_type); +} + + +template inline dnnError_t dnnConvolutionCreateBackwardData( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t dimension, const size_t srcSize[], + const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); +template <> inline dnnError_t dnnConvolutionCreateBackwardData( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + 
dnnAlgorithm_t algorithm, + size_t dimension, const size_t srcSize[], + const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { + return dnnConvolutionCreateBackwardData_F32( + pConvolution, + attributes, + algorithm, + dimension, srcSize, dstSize, filterSize, + convolutionStrides, inputOffset, border_type); +} +template <> inline dnnError_t dnnConvolutionCreateBackwardData( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t dimension, const size_t srcSize[], + const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { + return dnnConvolutionCreateBackwardData_F64( + pConvolution, + attributes, + algorithm, + dimension, srcSize, dstSize, filterSize, + convolutionStrides, inputOffset, border_type); +} + +template inline dnnError_t dnnConvolutionCreateBackwardFilter( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); +template <> inline dnnError_t dnnConvolutionCreateBackwardFilter( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { + return dnnConvolutionCreateBackwardFilter_F32( + pConvolution, + attributes, + algorithm, + dimension, srcSize, dstSize, filterSize, + convolutionStrides, inputOffset, border_type); +} +template <> inline dnnError_t dnnConvolutionCreateBackwardFilter( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { + return dnnConvolutionCreateBackwardFilter_F64( + pConvolution, + attributes, + algorithm, + dimension, srcSize, dstSize, filterSize, + convolutionStrides, inputOffset, border_type); +} + +template inline dnnError_t dnnConvolutionCreateBackwardBias( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t dimension, const size_t dstSize[]); +template <> inline dnnError_t dnnConvolutionCreateBackwardBias( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t dimension, const size_t dstSize[]) { + return dnnConvolutionCreateBackwardBias_F32( + pConvolution, + attributes, + algorithm, + dimension, dstSize); +} +template <> inline dnnError_t dnnConvolutionCreateBackwardBias( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t dimension, const size_t dstSize[]) { + return dnnConvolutionCreateBackwardBias_F64( + pConvolution, + attributes, + algorithm, + dimension, dstSize); +} + +template inline dnnError_t dnnGroupsConvolutionCreateForward( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t groups, size_t dimension, const size_t srcSize[], + const size_t dstSize[], const size_t filterSize[], + const size_t 
convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); +template <> inline dnnError_t dnnGroupsConvolutionCreateForward( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t groups, size_t dimension, const size_t srcSize[], + const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { + return dnnGroupsConvolutionCreateForward_F32( + pConvolution, + attributes, + algorithm, + groups, dimension, srcSize, dstSize, filterSize, + convolutionStrides, inputOffset, border_type); +} +template <> inline dnnError_t dnnGroupsConvolutionCreateForward( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t groups, size_t dimension, const size_t srcSize[], + const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { + return dnnGroupsConvolutionCreateForward_F64( + pConvolution, + attributes, + algorithm, + groups, dimension, srcSize, dstSize, filterSize, + convolutionStrides, inputOffset, border_type); +} + +template inline dnnError_t dnnGroupsConvolutionCreateForwardBias( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t groups, size_t dimension, const size_t srcSize[], + const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); +template <> inline dnnError_t dnnGroupsConvolutionCreateForwardBias( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t groups, size_t dimension, const size_t srcSize[], + const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { + return dnnGroupsConvolutionCreateForwardBias_F32( + pConvolution, + attributes, + algorithm, + groups, dimension, srcSize, dstSize, filterSize, + convolutionStrides, inputOffset, border_type); +} +template <> inline dnnError_t dnnGroupsConvolutionCreateForwardBias( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t groups, size_t dimension, const size_t srcSize[], + const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { + return dnnGroupsConvolutionCreateForwardBias_F64( + pConvolution, + attributes, + algorithm, + groups, dimension, srcSize, dstSize, filterSize, + convolutionStrides, inputOffset, border_type); +} + +template inline dnnError_t dnnGroupsConvolutionCreateBackwardData( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t groups, size_t dimension, const size_t srcSize[], + const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); +template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardData( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t groups, size_t dimension, const size_t srcSize[], + const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { + return dnnGroupsConvolutionCreateBackwardData_F32( + pConvolution, 
+ attributes, + algorithm, + groups, dimension, srcSize, dstSize, filterSize, + convolutionStrides, inputOffset, border_type); +} +template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardData( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t groups, size_t dimension, const size_t srcSize[], + const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { + return dnnGroupsConvolutionCreateBackwardData_F64( + pConvolution, + attributes, + algorithm, + groups, dimension, srcSize, dstSize, filterSize, + convolutionStrides, inputOffset, border_type); +} + + +template inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t groups, size_t dimension, const size_t srcSize[], + const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); +template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t groups, size_t dimension, const size_t srcSize[], + const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { + return dnnGroupsConvolutionCreateBackwardFilter_F32( + pConvolution, + attributes, + algorithm, + groups, dimension, srcSize, dstSize, filterSize, + convolutionStrides, inputOffset, border_type); +} +template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t groups, size_t dimension, const size_t srcSize[], + const size_t dstSize[], const size_t filterSize[], + const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { + return dnnGroupsConvolutionCreateBackwardFilter_F64( + pConvolution, + attributes, + algorithm, + groups, dimension, srcSize, dstSize, filterSize, + convolutionStrides, inputOffset, border_type); +} + +template inline dnnError_t dnnGroupsConvolutionCreateBackwardBias( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t groups, size_t dimension, const size_t dstSize[]); +template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardBias( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t groups, size_t dimension, const size_t dstSize[]) { + return dnnGroupsConvolutionCreateBackwardBias_F32( + pConvolution, + attributes, + algorithm, + groups, dimension, dstSize); +} +template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardBias( + dnnPrimitive_t* pConvolution, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t algorithm, + size_t groups, size_t dimension, const size_t dstSize[]) { + return dnnGroupsConvolutionCreateBackwardBias_F64( + pConvolution, + attributes, + algorithm, + groups, dimension, dstSize); +} + +template inline dnnError_t dnnReLUCreateForward( + dnnPrimitive_t* pRelu, + dnnPrimitiveAttributes_t attributes, + const dnnLayout_t dataLayout, float negativeSlope); +template <> inline dnnError_t dnnReLUCreateForward( + dnnPrimitive_t* pRelu, + dnnPrimitiveAttributes_t attributes, + const dnnLayout_t dataLayout, float negativeSlope) { + return 
dnnReLUCreateForward_F32( + pRelu, + attributes, + dataLayout, negativeSlope); +} +template <> inline dnnError_t dnnReLUCreateForward( + dnnPrimitive_t* pRelu, + dnnPrimitiveAttributes_t attributes, + const dnnLayout_t dataLayout, float negativeSlope) { + return dnnReLUCreateForward_F64( + pRelu, + attributes, + dataLayout, negativeSlope); +} + +template inline dnnError_t dnnReLUCreateBackward( + dnnPrimitive_t* pRelu, + dnnPrimitiveAttributes_t attributes, + const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope); +template <> inline dnnError_t dnnReLUCreateBackward( + dnnPrimitive_t* pRelu, + dnnPrimitiveAttributes_t attributes, + const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope) { + return dnnReLUCreateBackward_F32( + pRelu, + attributes, + diffLayout, dataLayout, negativeSlope); +} +template <> inline dnnError_t dnnReLUCreateBackward( + dnnPrimitive_t* pRelu, + dnnPrimitiveAttributes_t attributes, + const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope) { + return dnnReLUCreateBackward_F64( + pRelu, + attributes, + diffLayout, dataLayout, negativeSlope); +} + +template inline dnnError_t dnnLRNCreateForward( + dnnPrimitive_t* pLrn, + dnnPrimitiveAttributes_t attributes, + const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k); +template <> inline dnnError_t dnnLRNCreateForward( + dnnPrimitive_t* pLrn, + dnnPrimitiveAttributes_t attributes, + const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k) { + return dnnLRNCreateForward_F32( + pLrn, + attributes, + dataLayout, kernel_size, alpha, beta, k); +} +template <> inline dnnError_t dnnLRNCreateForward( + dnnPrimitive_t* pLrn, + dnnPrimitiveAttributes_t attributes, + const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k) { + return dnnLRNCreateForward_F64( + pLrn, + attributes, + dataLayout, kernel_size, alpha, beta, k); +} + + +template inline dnnError_t dnnLRNCreateBackward( + dnnPrimitive_t* pLrn, + dnnPrimitiveAttributes_t attributes, + const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, + size_t kernel_size, float alpha, float beta, float k); +template <> inline dnnError_t dnnLRNCreateBackward( + dnnPrimitive_t* pLrn, + dnnPrimitiveAttributes_t attributes, + const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, + size_t kernel_size, float alpha, float beta, float k) { + return dnnLRNCreateBackward_F32( + pLrn, + attributes, + diffLayout, dataLayout, kernel_size, alpha, beta, k); +} +template <> inline dnnError_t dnnLRNCreateBackward( + dnnPrimitive_t* pLrn, + dnnPrimitiveAttributes_t attributes, + const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, + size_t kernel_size, float alpha, float beta, float k) { + return dnnLRNCreateBackward_F64( + pLrn, + attributes, + diffLayout, dataLayout, kernel_size, alpha, beta, k); +} + + +template inline dnnError_t dnnPoolingCreateForward( + dnnPrimitive_t* pPooling, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t op, + const dnnLayout_t srcLayout, + const size_t kernelSize[], const size_t kernelStride[], + const int inputOffset[], const dnnBorder_t border_type); +template <> inline dnnError_t dnnPoolingCreateForward( + dnnPrimitive_t* pPooling, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t op, + const dnnLayout_t srcLayout, + const size_t kernelSize[], const size_t kernelStride[], + const int inputOffset[], const dnnBorder_t border_type) { + return dnnPoolingCreateForward_F32( + pPooling, + 
attributes, + op, + srcLayout, + kernelSize, kernelStride, + inputOffset, border_type); +} +template <> inline dnnError_t dnnPoolingCreateForward( + dnnPrimitive_t* pPooling, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t op, + const dnnLayout_t srcLayout, + const size_t kernelSize[], const size_t kernelStride[], + const int inputOffset[], const dnnBorder_t border_type) { + return dnnPoolingCreateForward_F64( + pPooling, + attributes, + op, + srcLayout, + kernelSize, kernelStride, + inputOffset, border_type); +} + + +template inline dnnError_t dnnPoolingCreateBackward( + dnnPrimitive_t* pPooling, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t op, + const dnnLayout_t srcLayout, + const size_t kernelSize[], const size_t kernelStride[], + const int inputOffset[], const dnnBorder_t border_type); +template <> inline dnnError_t dnnPoolingCreateBackward( + dnnPrimitive_t* pPooling, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t op, + const dnnLayout_t srcLayout, + const size_t kernelSize[], const size_t kernelStride[], + const int inputOffset[], const dnnBorder_t border_type) { + return dnnPoolingCreateBackward_F32( + pPooling, + attributes, + op, + srcLayout, + kernelSize, kernelStride, + inputOffset, border_type); +} +template <> inline dnnError_t dnnPoolingCreateBackward( + dnnPrimitive_t* pPooling, + dnnPrimitiveAttributes_t attributes, + dnnAlgorithm_t op, + const dnnLayout_t srcLayout, + const size_t kernelSize[], const size_t kernelStride[], + const int inputOffset[], const dnnBorder_t border_type) { + return dnnPoolingCreateBackward_F64( + pPooling, + attributes, + op, + srcLayout, + kernelSize, kernelStride, + inputOffset, border_type); +} + +template inline dnnError_t dnnConcatCreate( + dnnPrimitive_t *pConcat, + dnnPrimitiveAttributes_t attributes, + const size_t N, + dnnLayout_t src[]); +template <> inline dnnError_t dnnConcatCreate( + dnnPrimitive_t *pConcat, + dnnPrimitiveAttributes_t attributes, + const size_t N, + dnnLayout_t src[]) { + return dnnConcatCreate_F32( + pConcat, + attributes, + N, + src); +} +template <> inline dnnError_t dnnConcatCreate( + dnnPrimitive_t *pConcat, + dnnPrimitiveAttributes_t attributes, + const size_t N, + dnnLayout_t src[]) { + return dnnConcatCreate_F64( + pConcat, + attributes, + N, + src); +} + + +template inline dnnError_t dnnSplitCreate( + dnnPrimitive_t *pSplit, + dnnPrimitiveAttributes_t attributes, + const size_t N, + dnnLayout_t src, + size_t dst[]); +template <> inline dnnError_t dnnSplitCreate( + dnnPrimitive_t *pSplit, + dnnPrimitiveAttributes_t attributes, + const size_t N, + dnnLayout_t src, + size_t dst[]) { + return dnnSplitCreate_F32( + pSplit, + attributes, + N, + src, + dst); +} +template <> inline dnnError_t dnnSplitCreate( + dnnPrimitive_t *pSplit, + dnnPrimitiveAttributes_t attributes, + const size_t N, + dnnLayout_t src, + size_t dst[]) { + return dnnSplitCreate_F64( + pSplit, + attributes, + N, + src, + dst); +} + +template inline dnnError_t dnnSumCreate( + dnnPrimitive_t *pSum, + dnnPrimitiveAttributes_t attributes, + const size_t nSummands, dnnLayout_t layout, Dtype *coefficients); +template <> inline dnnError_t dnnSumCreate( + dnnPrimitive_t *pSum, + dnnPrimitiveAttributes_t attributes, + const size_t nSummands, dnnLayout_t layout, float *coefficients) { + return dnnSumCreate_F32( + pSum, + attributes, + nSummands, + layout, coefficients); +} +template <> inline dnnError_t dnnSumCreate( + dnnPrimitive_t *pSum, + dnnPrimitiveAttributes_t attributes, + const size_t nSummands, dnnLayout_t layout, 
double *coefficients) { + return dnnSumCreate_F64( + pSum, + attributes, + nSummands, + layout, coefficients); +} + +template inline dnnError_t dnnBatchNormalizationCreateForward_v2( + dnnPrimitive_t* pBatchNormalization, + dnnPrimitiveAttributes_t attributes, + const dnnLayout_t dataLayout, float eps, + int flags); + +template <> inline dnnError_t dnnBatchNormalizationCreateForward_v2( + dnnPrimitive_t* pBatchNormalization, + dnnPrimitiveAttributes_t attributes, + const dnnLayout_t dataLayout, float eps, + int flags) { + return dnnBatchNormalizationCreateForward_v2_F32( + pBatchNormalization, + attributes, + dataLayout, eps, flags); +} +template <> inline dnnError_t dnnBatchNormalizationCreateForward_v2( + dnnPrimitive_t* pBatchNormalization, + dnnPrimitiveAttributes_t attributes, + const dnnLayout_t dataLayout, float eps, + int flags) { + return dnnBatchNormalizationCreateForward_v2_F64( + pBatchNormalization, + attributes, + dataLayout, eps, flags); +} + + +template inline dnnError_t dnnBatchNormalizationCreateBackward_v2( + dnnPrimitive_t* pBatchNormalization, + dnnPrimitiveAttributes_t attributes, + const dnnLayout_t dataLayout, float eps, + int flags); + +template <> inline dnnError_t dnnBatchNormalizationCreateBackward_v2( + dnnPrimitive_t* pBatchNormalization, + dnnPrimitiveAttributes_t attributes, + const dnnLayout_t dataLayout, float eps, + int flags) { + return dnnBatchNormalizationCreateBackward_v2_F32( + pBatchNormalization, + attributes, + dataLayout, eps, flags); +} + +template <> inline dnnError_t dnnBatchNormalizationCreateBackward_v2( + dnnPrimitive_t* pBatchNormalization, + dnnPrimitiveAttributes_t attributes, + const dnnLayout_t dataLayout, float eps, + int flags) { + return dnnBatchNormalizationCreateBackward_v2_F64( + pBatchNormalization, + attributes, + dataLayout, eps, flags); +} + +template inline dnnError_t dnnInnerProductCreateForward( + dnnPrimitive_t *pInnerProduct, + dnnPrimitiveAttributes_t attributes, + size_t dimensions, + const size_t srcSize[], + size_t outputChannels); +template <> inline dnnError_t dnnInnerProductCreateForward( + dnnPrimitive_t *pInnerProduct, + dnnPrimitiveAttributes_t attributes, + size_t dimensions, + const size_t srcSize[], + size_t outputChannels) { + return dnnInnerProductCreateForward_F32(pInnerProduct, + attributes, dimensions, + srcSize, outputChannels); +} +template <> inline dnnError_t dnnInnerProductCreateForward( + dnnPrimitive_t *pInnerProduct, + dnnPrimitiveAttributes_t attributes, + size_t dimensions, + const size_t srcSize[], + size_t outputChannels) { + return dnnInnerProductCreateForward_F64(pInnerProduct, + attributes, dimensions, + srcSize, outputChannels); +} + +template inline dnnError_t dnnInnerProductCreateForwardBias( + dnnPrimitive_t *pInnerProduct, + dnnPrimitiveAttributes_t attributes, + size_t dimensions, + const size_t srcSize[], + size_t outputChannels); + +template <> inline dnnError_t dnnInnerProductCreateForwardBias( + dnnPrimitive_t *pInnerProduct, + dnnPrimitiveAttributes_t attributes, + size_t dimensions, + const size_t srcSize[], + size_t outputChannels) { + return dnnInnerProductCreateForwardBias_F32(pInnerProduct, + attributes, dimensions, + srcSize, outputChannels); +} +template <> inline dnnError_t dnnInnerProductCreateForwardBias( + dnnPrimitive_t *pInnerProduct, + dnnPrimitiveAttributes_t attributes, + size_t dimensions, + const size_t srcSize[], + size_t outputChannels) { + return dnnInnerProductCreateForwardBias_F64(pInnerProduct, + attributes, dimensions, + srcSize, outputChannels); +} + + 
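+// Usage sketch for the typed wrappers above (sizes are made-up example values):
+//   dnnPrimitive_t ip = NULL;
+//   const size_t src_sizes[4] = {1, 1, 512, 32};   // {width, height, input channels, batch}
+//   dnnError_t e = dnnInnerProductCreateForwardBias<float>(&ip, NULL, 4, src_sizes, 128);
+//   CHECK_EQ(e, E_SUCCESS);
+//   void *res[dnnResourceNumber];                   // fill Src/Filter/Bias/Dst pointers
+//   e = dnnExecute<float>(ip, res);                 // run, then dnnDelete<float>(ip);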
+template inline dnnError_t dnnInnerProductCreateBackwardData( + dnnPrimitive_t *pInnerProduct, + dnnPrimitiveAttributes_t attributes, + size_t dimensions, + const size_t srcSize[], + size_t outputChannels); + +template <> inline dnnError_t dnnInnerProductCreateBackwardData( + dnnPrimitive_t *pInnerProduct, + dnnPrimitiveAttributes_t attributes, + size_t dimensions, + const size_t srcSize[], + size_t outputChannels) { + return dnnInnerProductCreateBackwardData_F32(pInnerProduct, + attributes, dimensions, + srcSize, outputChannels); +} +template <> inline dnnError_t dnnInnerProductCreateBackwardData( + dnnPrimitive_t *pInnerProduct, + dnnPrimitiveAttributes_t attributes, + size_t dimensions, + const size_t srcSize[], + size_t outputChannels) { + return dnnInnerProductCreateBackwardData_F64(pInnerProduct, + attributes, dimensions, + srcSize, outputChannels); +} + + + + +template inline dnnError_t dnnInnerProductCreateBackwardFilter( + dnnPrimitive_t *pInnerProduct, + dnnPrimitiveAttributes_t attributes, + size_t dimensions, + const size_t srcSize[], + size_t outputChannels); + +template <> inline dnnError_t dnnInnerProductCreateBackwardFilter( + dnnPrimitive_t *pInnerProduct, + dnnPrimitiveAttributes_t attributes, + size_t dimensions, + const size_t srcSize[], + size_t outputChannels) { + return dnnInnerProductCreateBackwardFilter_F32(pInnerProduct, + attributes, dimensions, + srcSize, outputChannels); +} +template <> inline dnnError_t dnnInnerProductCreateBackwardFilter( + dnnPrimitive_t *pInnerProduct, + dnnPrimitiveAttributes_t attributes, + size_t dimensions, + const size_t srcSize[], + size_t outputChannels) { + return dnnInnerProductCreateBackwardFilter_F64(pInnerProduct, + attributes, dimensions, + srcSize, outputChannels); +} + + + +template inline dnnError_t dnnInnerProductCreateBackwardBias( + dnnPrimitive_t *pInnerProduct, + dnnPrimitiveAttributes_t attributes, + size_t dimensions, + const size_t dstSize[]); + +template <> inline dnnError_t dnnInnerProductCreateBackwardBias( + dnnPrimitive_t *pInnerProduct, + dnnPrimitiveAttributes_t attributes, + size_t dimensions, + const size_t dstSize[]) { + return dnnInnerProductCreateBackwardBias_F32(pInnerProduct, + attributes, dimensions, + dstSize); +} +template <> inline dnnError_t dnnInnerProductCreateBackwardBias( + dnnPrimitive_t *pInnerProduct, + dnnPrimitiveAttributes_t attributes, + size_t dimensions, + const size_t dstSize[]) { + return dnnInnerProductCreateBackwardBias_F64(pInnerProduct, + attributes, dimensions, + dstSize); +} +#endif // #MXNET_USE_MKL2017 == 1 +#endif // MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_ diff --git a/src/operator/mkl/mkl_elementwise_copy-inl.h b/src/operator/mkl/mkl_elementwise_copy-inl.h new file mode 100644 index 000000000000..48c931291150 --- /dev/null +++ b/src/operator/mkl/mkl_elementwise_copy-inl.h @@ -0,0 +1,69 @@ +/******************************************************************************* +* Copyright 2016 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+* +* \file mkl_elementwise-inl.h +* \brief +* \author lingyan.guo@intel.com +* zhenlin.luo@intel.com +* +*******************************************************************************/ +#ifndef MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_ +#define MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "../operator_common.h" +#include "../mshadow_op.h" +#include "./mkl_util-inl.h" + + +namespace mxnet { +namespace op { + +template +void MKLIdentityCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + if (!req[0]) return; +#if MKL_EXPERIMENTAL == 1 + if (op::mkl_prv_data(inputs[0])) { + std::shared_ptr in_data_mem = inputs[0].Mkl_mem_; + // User copy to avoid potential problem + std::shared_ptr > top_data = MKLData::create(); + std::shared_ptr top_mem = outputs[0].Mkl_mem_; + top_data->copy_from(in_data_mem); + top_mem->set_prv_descriptor(top_data); + return; + } +#endif + int in_blob_size = inputs[0].Size(); + int out_blob_size = outputs[0].Size(); + CHECK_EQ(in_blob_size, out_blob_size) << "MKLIdentityCompute CPU Size not Match "; + memcpy(outputs[0].dptr_, inputs[0].dptr_, in_blob_size * sizeof(DType)); +} + + + +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_ diff --git a/src/operator/mkl/mkl_elementwise_sum-inl.h b/src/operator/mkl/mkl_elementwise_sum-inl.h new file mode 100644 index 000000000000..d313fd15a5be --- /dev/null +++ b/src/operator/mkl/mkl_elementwise_sum-inl.h @@ -0,0 +1,117 @@ +/******************************************************************************* +* Copyright 2016 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +* \file mkl_elementwise-inl.h +* \brief +* \author lingyan.guo@intel.com +* zhenlin.luo@intel.com +* +*******************************************************************************/ +#ifndef MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_ +#define MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "../operator_common.h" +#include "../mshadow_op.h" +#include "./mkl_util-inl.h" + + +namespace mxnet { +namespace op { +template +static void LayerSetUp(const std::vector > &data, + size_t data_shape_size, + std::shared_ptr > fwd_top_data) { + // Whether to use an asymptotically slower (for >2 inputs) but stabler method + // of computing the gradient for the PROD operation. (No effect for SUM op.) + // stable_prod_grad_ = 1; + size_t dim_src = data_shape_size; + size_t *sizes_src = new size_t[dim_src]; + size_t *strides_src = new size_t[dim_src]; + for (size_t d = 0; d < dim_src; ++d) { + sizes_src[d] = data[0].shape_[dim_src - d - 1]; + strides_src[d] = (d == 0) ? 
1 : strides_src[d - 1] * sizes_src[d - 1]; + } + + fwd_top_data->create_user_layout(dim_src, sizes_src, strides_src); + delete[] sizes_src; + delete[] strides_src; +} + +template +void MKLElementWiseSumCompute_(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data) { + using namespace mshadow; + using namespace mshadow::expr; + if (req[0] == kNullOp) return; + size_t size = in_data.size(); + Stream *s = ctx.get_stream(); + std::vector > data(size); + Tensor out = out_data[0].FlatTo1D(s); + bool in_place_flag = false; + int in_place_idx = 0; + + for (size_t i = 0; i < size; ++i) { + data[i] = in_data[i].FlatTo1D(s); + if (data[i].dptr_ == out.dptr_) { + in_place_idx = i; + in_place_flag = true; + } + } + std::shared_ptr > fwd_top_data = MKLData::create(); + std::vector coeffs_ = std::vector(data.size(), 1); + LayerSetUp(data, 1, fwd_top_data); + + + dnnError_t e; + void *eltwise_res[dnnResourceNumber]; + dnnPrimitive_t sumPrimitive = NULL; + e = dnnSumCreate(&sumPrimitive, NULL, size, fwd_top_data->layout_usr, + &coeffs_[0]); + CHECK_EQ(e, E_SUCCESS); + + eltwise_res[dnnResourceDst] = reinterpret_cast(const_cast(out.dptr_)); + eltwise_res[dnnResourceMultipleSrc] = + reinterpret_cast(reinterpret_cast(in_data[in_place_idx].dptr_)); + for (size_t i = 1; i < size; ++i) { + if (i == in_place_idx) continue; + eltwise_res[dnnResourceMultipleSrc + i] = + reinterpret_cast(reinterpret_cast(in_data[i].dptr_)); + } + + e = dnnExecute(sumPrimitive, eltwise_res); + CHECK_EQ(e, E_SUCCESS); + + if (sumPrimitive != NULL) { + dnnDelete(sumPrimitive); + sumPrimitive = NULL; + } +} + + + +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_ diff --git a/src/operator/mkl/mkl_fully_connected-inl.h b/src/operator/mkl/mkl_fully_connected-inl.h new file mode 100644 index 000000000000..5e296704b6dd --- /dev/null +++ b/src/operator/mkl/mkl_fully_connected-inl.h @@ -0,0 +1,192 @@ +/******************************************************************************* +* Copyright 2016 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+* +* \file mkl_fully_connected-inl.h +* \brief +* \author zhenlin.luo@intel.com +* lingyan.guo@intel.com +* +* +*******************************************************************************/ +#ifndef MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_ +#define MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_ +#include +#include +#include +#include "../activation-inl.h" +#include "./mkl_util-inl.h" + +namespace mxnet { +namespace op { + +template +class MKLFullyConnectedOp : public Operator { + public: + explicit MKLFullyConnectedOp(const FullyConnectedParam& p, + const std::vector& in_shapes, + const std::vector& out_shapes): + param_(p) { + LayerSetUp(in_shapes, out_shapes); + } + + ~MKLFullyConnectedOp() { + dnnDelete(fullyConnectedFwd); + dnnDelete(fullyConnectedBwdData); + dnnDelete(fullyConnectedBwdFilter); + dnnDelete(fullyConnectedBwdBias); + } + static std::string getName() { + return "MKLFullyConnectedOp"; + } + + private: + void LayerSetUp(const std::vector& in_shapes, + const std::vector& out_shapes) { + const TShape& ishape = in_shapes[fullc::kData]; + + const size_t dim = 4; + const size_t src_sizes[4] = {1, 1, ishape.ProdShape(1, ishape.ndim()), ishape[0]}; + const size_t dst_sizes[2] = {param_.num_hidden, ishape[0]}; + const size_t output_channels = param_.num_hidden; + + dnnPrimitiveAttributes_t attributes = NULL; + MKLDNN_CALL(dnnPrimitiveAttributesCreate(&attributes)); + if (!param_.no_bias) { + MKLDNN_CALL(dnnInnerProductCreateForwardBias( + &fullyConnectedFwd, + attributes, + dim, + src_sizes, + output_channels)); + } else { + MKLDNN_CALL(dnnInnerProductCreateForward( + &fullyConnectedFwd, + attributes, + dim, + src_sizes, + output_channels)); + } + MKLDNN_CALL(dnnInnerProductCreateBackwardData( + &fullyConnectedBwdData, + attributes, + dim, + src_sizes, + output_channels)); + MKLDNN_CALL(dnnInnerProductCreateBackwardFilter( + &fullyConnectedBwdFilter, + attributes, + dim, + src_sizes, + output_channels)); + if (!param_.no_bias) { + MKLDNN_CALL(dnnInnerProductCreateBackwardBias( + &fullyConnectedBwdBias, + attributes, + 2, + dst_sizes)); + } + // TODO(minjie): Shouldn't `attributes` be destroyed? + } + + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + + void* res_fullyConnected[dnnResourceNumber]; + if (req[fullc::kOut] == kNullOp) return; + CHECK_EQ(req[fullc::kOut], kWriteTo); + CHECK_EQ(in_data.size(), param_.no_bias ? 
2 : 3); + CHECK_EQ(out_data.size(), 1); + Stream *s = ctx.get_stream(); + + const TShape& ishape = in_data[fullc::kData].shape_; + const TShape& oshape = out_data[fullc::kOut].shape_; + + Tensor data; + Tensor out; + + Shape4(in_data[fullc::kData].shape_[0], in_data[fullc::kData].shape_[1], 1, 1); + + Shape<4> dshape = Shape4(ishape[0], ishape.ProdShape(1, ishape.ndim()), 1, 1); + Shape<4> odshape = Shape4(oshape[0], oshape.ProdShape(1, oshape.ndim()), 1, 1); + + data = in_data[fullc::kData].get_with_shape(dshape, s); + out = out_data[fullc::kOut].get_with_shape(odshape, s); + res_fullyConnected[dnnResourceSrc] = + reinterpret_cast(in_data[fullc::kData].dptr_); + res_fullyConnected[dnnResourceDst] = + reinterpret_cast(out_data[fullc::kOut].dptr_); + res_fullyConnected[dnnResourceFilter] = + reinterpret_cast(in_data[fullc::kWeight].dptr_); + if (!param_.no_bias) { + res_fullyConnected[dnnResourceBias] = reinterpret_cast(in_data[fullc::kBias].dptr_); + } + + MKLDNN_CALL(dnnExecute(fullyConnectedFwd, res_fullyConnected)); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + + void* res_fullyConnected[dnnResourceNumber]; + CHECK_EQ(out_grad.size(), 1); + const size_t expected = param_.no_bias ? 2 : 3; + CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(req.size(), expected); + res_fullyConnected[dnnResourceSrc] = + reinterpret_cast(in_data[fullc::kData].dptr_); + res_fullyConnected[dnnResourceFilter] = + reinterpret_cast(in_data[fullc::kWeight].dptr_); + + res_fullyConnected[dnnResourceDiffDst] = + reinterpret_cast(out_grad[fullc::kOut].dptr_); + res_fullyConnected[dnnResourceDiffSrc] = + reinterpret_cast(in_grad[fullc::kData].dptr_); + res_fullyConnected[dnnResourceDiffFilter] = + reinterpret_cast(in_grad[fullc::kWeight].dptr_); + if (!param_.no_bias) { + res_fullyConnected[dnnResourceDiffBias] = + reinterpret_cast(in_grad[fullc::kBias].dptr_); + } + MKLDNN_CALL(dnnExecute(fullyConnectedBwdFilter, res_fullyConnected)); + if (!param_.no_bias) { + MKLDNN_CALL(dnnExecute(fullyConnectedBwdBias, res_fullyConnected)); + } + MKLDNN_CALL(dnnExecute(fullyConnectedBwdData, res_fullyConnected)); + } + + private: + dnnPrimitive_t fullyConnectedFwd{nullptr}; + dnnPrimitive_t fullyConnectedBwdData{nullptr}; + dnnPrimitive_t fullyConnectedBwdFilter{nullptr}; + dnnPrimitive_t fullyConnectedBwdBias{nullptr}; + const FullyConnectedParam param_; +}; // class MKLFullyConnectedOp +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_ diff --git a/src/operator/mkl/mkl_lrn-inl.h b/src/operator/mkl/mkl_lrn-inl.h new file mode 100644 index 000000000000..90dfad50fa62 --- /dev/null +++ b/src/operator/mkl/mkl_lrn-inl.h @@ -0,0 +1,265 @@ +/******************************************************************************* +* Copyright 2016 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
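Annotation on MKLFullyConnectedOp above (not part of the patch): both LayerSetUp and Forward collapse whatever rank the input has into a 4-D MKL tensor of {1, 1, flattened features, batch}, with the output described as {num_hidden, batch}. A small self-contained sketch of that flattening, where fc_src_sizes is a hypothetical helper name:

#include <cstddef>
#include <cstdio>
#include <vector>

// Flatten an input shape the way MKLFullyConnectedOp::LayerSetUp does:
// everything after the batch axis collapses into one "feature" dimension,
// yielding the {1, 1, features, batch} sizes handed to the inner-product
// primitive creation calls.
std::vector<size_t> fc_src_sizes(const std::vector<size_t>& ishape) {
  size_t features = 1;
  for (size_t i = 1; i < ishape.size(); ++i) features *= ishape[i];
  return {1, 1, features, ishape[0]};
}

int main() {
  // A (batch=32, 3, 28, 28) input becomes {1, 1, 2352, 32}.
  std::vector<size_t> src = fc_src_sizes({32, 3, 28, 28});
  std::printf("src_sizes = {%zu, %zu, %zu, %zu}\n", src[0], src[1], src[2], src[3]);
  return 0;
}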
+* See the License for the specific language governing permissions and +* limitations under the License. +* +* \file mkl_lrn-inl.h +* \brief +* \author zhenlin.luo@intel.com +* lingyan.guo@intel.com +* +*******************************************************************************/ +#ifndef MXNET_OPERATOR_MKL_MKL_LRN_INL_H_ +#define MXNET_OPERATOR_MKL_MKL_LRN_INL_H_ +#include +#include +#include +#include +#include +#include +#include +#include "../operator_common.h" +#include "../mshadow_op.h" +#include "./mkl_util-inl.h" + +namespace mxnet { +namespace op { + +template +class MKLLRNOp : public Operator { + public: + static std::string getName() { + return "MKLLRNOp"; + } + + explicit MKLLRNOp(LRNParam param) : + lrnFwd(static_cast(NULL)), + lrnBwd(static_cast(NULL)), + lrn_buffer_(NULL) { + this->param_ = param; + fwd_top_data_ = MKLData::create(); + fwd_bottom_data_ = MKLData::create(); + bwd_top_diff_ = MKLData::create(); + bwd_bottom_diff_ = MKLData::create(); + init_mkldnn_ = false; + } + + virtual ~MKLLRNOp() { + if (lrnFwd != NULL) { + dnnDelete(lrnFwd); + lrnFwd = NULL; + } + if (lrnBwd != NULL) { + dnnDelete(lrnBwd); + lrnBwd = NULL; + } + dnnReleaseBuffer(lrn_buffer_); + } + + private: + void LayerSetup(const mshadow::Tensor &data, + const mshadow::Tensor &out) { + size_ = param_.nsize; + CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local size"; + + alpha_ = param_.alpha; + beta_ = param_.beta; + k_ = param_.knorm; + size_t dim = 4, sizes[4], strides[4]; + channels_ = data.shape_[1]; + height_ = data.shape_[2]; + width_ = data.shape_[3]; + num_ = data.shape_[0]; + sizes[0] = width_; + sizes[1] = height_; + sizes[2] = channels_; + sizes[3] = num_; + + strides[0] = 1; + strides[1] = sizes[0]; + strides[2] = sizes[0] * sizes[1]; + strides[3] = sizes[0] * sizes[1] * sizes[2]; + + fwd_bottom_data_->name = "fwd_bottom_data_ @ " + getName(); + fwd_top_data_->name = "fwd_top_data_ @ " + getName(); + bwd_top_diff_->name = "bwd_top_diff_ @ " + getName(); + bwd_bottom_diff_->name = "bwd_bottom_diff_ @ " + getName(); + + fwd_bottom_data_->create_user_layout(dim, sizes, strides); + fwd_top_data_->create_user_layout(dim, sizes, strides); + bwd_bottom_diff_->create_user_layout(dim, sizes, strides); + bwd_top_diff_->create_user_layout(dim, sizes, strides); + } + + public: + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), 1U); + CHECK_EQ(out_data.size(), 2U); + CHECK_EQ(param_.nsize % 2, 1U) << "LRN only supports odd values for local_size"; + Stream *s = ctx.get_stream(); + Tensor data = mkl_experimental_direct_get( + in_data[lrn_enum::kData], s); + Tensor out = mkl_experimental_direct_get( + out_data[lrn_enum::kOut], s); + if (!init_mkldnn_) { + LayerSetup(data, out); + init_mkldnn_ = true; + } + + const void* bottom_data = NULL; +#if MKL_EXPERIMENTAL == 1 + bottom_data = + reinterpret_cast(mkl_prv_data(in_data[lrn_enum::kData])); +#endif +#if MKL_EXPERIMENTAL == 1 + if (NULL != bottom_data) { + if (lrnFwd == NULL) { + std::shared_ptr bottom_data_mem = + in_data[lrn_enum::kData].Mkl_mem_; + std::shared_ptr bottom_prv_descriptor = + bottom_data_mem->get_prv_descriptor(); + CHECK_EQ(bottom_prv_descriptor->get_descr_type(), + PrvMemDescr::PRV_DESCR_MKL2017); + std::shared_ptr > mem_descr + = std::static_pointer_cast>(bottom_prv_descriptor); + CHECK(mem_descr != nullptr); + 
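// Annotation, not part of the patch: at this point the input blob already carries
// an MKL private ("prv") buffer, so the op adopts that blob's descriptor as its own
// fwd_bottom_data_ and creates the LRN forward/backward primitives directly against
// the internal layout (layout_int). The data is then consumed in exactly the layout
// the previous MKL op produced, with no user<->internal conversion. The
// "bottom_data == NULL" branch further down is the fallback for plain CPU tensors,
// where the primitives are created from the user layout instead.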
fwd_bottom_data_ = mem_descr; + + dnnError_t e; + dnnLayout_t lrn_buffer_l = NULL; + + e = dnnLRNCreateForward(&lrnFwd, NULL, fwd_bottom_data_->layout_int, + size_, alpha_, beta_, k_); + CHECK_EQ(e, E_SUCCESS); + + fwd_top_data_->create_internal_layout(lrnFwd, dnnResourceDst); + + e = dnnLRNCreateBackward(&lrnBwd, NULL, + fwd_bottom_data_->layout_int, fwd_bottom_data_->layout_int, + size_, alpha_, beta_, k_); + CHECK_EQ(e, E_SUCCESS); + + e = dnnLayoutCreateFromPrimitive( + &lrn_buffer_l, lrnFwd, dnnResourceWorkspace); + CHECK_EQ(e, E_SUCCESS); + e = dnnAllocateBuffer( + reinterpret_cast(&lrn_buffer_), lrn_buffer_l); + CHECK_EQ(e, E_SUCCESS); + dnnLayoutDelete(lrn_buffer_l); + + bwd_top_diff_->create_internal_layout(lrnBwd, dnnResourceDiffDst); + bwd_bottom_diff_->create_internal_layout(lrnBwd, dnnResourceDiffSrc); + } + } +#endif + if (bottom_data == NULL) { + if (lrnFwd == NULL) { + dnnError_t e; + dnnLayout_t lrn_buffer_l = NULL; + e = dnnLRNCreateForward(&lrnFwd, NULL, fwd_bottom_data_->layout_usr, + size_, alpha_, beta_, k_); + CHECK_EQ(e, E_SUCCESS); + + e = dnnLayoutCreateFromPrimitive( + &lrn_buffer_l, lrnFwd, dnnResourceWorkspace); + CHECK_EQ(e, E_SUCCESS); + e = dnnAllocateBuffer( + reinterpret_cast(&lrn_buffer_), lrn_buffer_l); + CHECK_EQ(e, E_SUCCESS); + dnnLayoutDelete(lrn_buffer_l); + + e = dnnLRNCreateBackward(&lrnBwd, NULL, + fwd_bottom_data_->layout_usr, fwd_bottom_data_->layout_usr, + size_, alpha_, beta_, k_); + CHECK_EQ(e, E_SUCCESS); + } + bottom_data = data.dptr_; + } + + dnnError_t e; + void* lrn_res[dnnResourceNumber]; + lrn_res[dnnResourceSrc] = const_cast(bottom_data); + + lrn_res[dnnResourceDst] = fwd_top_data_->get_output_ptr( + out.dptr_, fwd_top_data_, out_data[lrn_enum::kOut]); + lrn_res[dnnResourceWorkspace] = lrn_buffer_; + e = dnnExecute(lrnFwd, lrn_res); + CHECK_EQ(e, E_SUCCESS); + } + + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1); + CHECK_EQ(in_data.size(), 1); + CHECK_EQ(out_data.size(), 2); + Stream *s = ctx.get_stream(); + Tensor grad = mkl_experimental_direct_get( + out_grad[lrn_enum::kOut], s); + Tensor data = mkl_experimental_direct_get( + in_data[lrn_enum::kData], s); + Tensor grad_in = mkl_experimental_direct_get( + in_grad[lrn_enum::kData], s); + dnnError_t e; + void* lrn_res[dnnResourceNumber]; + lrn_res[dnnResourceDiffDst] = + bwd_top_diff_->get_converted_prv(grad.dptr_, true, out_grad[lrn_enum::kOut]); + lrn_res[dnnResourceWorkspace] = lrn_buffer_; + lrn_res[dnnResourceSrc] = + fwd_bottom_data_->get_converted_prv(data.dptr_, false, in_data[lrn_enum::kData]); + + lrn_res[dnnResourceDiffSrc] = bwd_bottom_diff_->get_output_ptr( + grad_in.dptr_, bwd_bottom_diff_, in_grad[lrn_enum::kData]); + e = dnnExecute(lrnBwd, lrn_res); + CHECK_EQ(e, E_SUCCESS); + } + + private: + LRNParam param_; + int size_; + int pre_pad_; + DType alpha_; + DType beta_; + DType k_; + int num_; + int channels_; + int height_; + int width_; + bool init_mkldnn_; + + private: + dnnPrimitive_t lrnFwd, lrnBwd; + std::shared_ptr > fwd_top_data_; + std::shared_ptr > fwd_bottom_data_; + + std::shared_ptr > bwd_top_diff_; + std::shared_ptr > bwd_bottom_diff_; + + DType *lrn_buffer_; +}; // class LocalResponseNormOp +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_MKL_MKL_LRN_INL_H_ + diff --git 
a/src/operator/mkl/mkl_memory-inl.h b/src/operator/mkl/mkl_memory-inl.h new file mode 100644 index 000000000000..71af10254b2a --- /dev/null +++ b/src/operator/mkl/mkl_memory-inl.h @@ -0,0 +1,137 @@ +/******************************************************************************* +* Copyright 2016 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +* \file mkl_memory-inl.h +* \brief +* \author lingyan.guo@intel.com +* zhenlin.luo@intel.com +* +*******************************************************************************/ +#ifndef MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_ +#define MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_ + + +#include +#include +#include +#include "mkl_cppwrapper.h" + +namespace mxnet { + +template +struct MKLMemoryDescriptorBase : public PrvMemDescr, + public std::enable_shared_from_this > { + MKLMemoryDescriptorBase() : layout_usr(NULL), layout_int(NULL), + convert_to_int(NULL), convert_from_int(NULL), convert_prv2prv(NULL), + name("UNKNOWN"), internal_ptr(NULL) {} + virtual ~MKLMemoryDescriptorBase() { + dnnLayoutDelete(layout_usr); + dnnLayoutDelete(layout_int); + if (internal_ptr != NULL) { + dnnReleaseBuffer(internal_ptr); + internal_ptr = NULL; + } + if (convert_to_int != NULL) { + dnnDelete(convert_to_int); + convert_to_int = NULL; + } + if (convert_from_int != NULL) { + dnnDelete(convert_from_int); + convert_from_int = NULL; + } + if (convert_prv2prv != NULL) { + dnnDelete(convert_prv2prv); + convert_prv2prv = NULL; + } + } + std::shared_ptr > get_shared_ptr() { + return this->shared_from_this(); + } + + dnnLayout_t layout_usr; + dnnLayout_t layout_int; + dnnPrimitive_t convert_to_int; + dnnPrimitive_t convert_from_int; + dnnPrimitive_t convert_prv2prv; + std::shared_ptr > descr_prv2prv_conversion; + + + std::string name; // for debugging purposes + void allocate() { + if (internal_ptr == NULL) { + int status = dnnAllocateBuffer( + reinterpret_cast(&internal_ptr), layout_int); + CHECK_EQ(status, E_SUCCESS) + << "Failed internal_ptr memory allocation with status " + << status << "\n"; + } + } + virtual void* prv_ptr(bool allocate_when_uninit = true) { + if (internal_ptr == NULL && allocate_when_uninit) + allocate(); + return internal_ptr; + } + inline bool conversion_needed() { + return (convert_to_int != NULL); + } + void create_conversions(); + void create_internal_layout(const dnnPrimitive_t primitive, + dnnResourceType_t type); + void create_user_layout(size_t dimension, const size_t size[], + const size_t strides[]); + void create_layouts( + const dnnPrimitive_t primitive, dnnResourceType_t type, + size_t dimension, const size_t size[], const size_t strides[]); + + virtual PrvDescrType get_descr_type() { + return PRV_DESCR_MKL2017; + } + virtual size_t prv_size() { + return dnnLayoutGetMemorySize(layout_int); + } + virtual size_t prv_count() { + return dnnLayoutGetMemorySize(layout_int) / sizeof(DType); + } + virtual void convert_from_prv(void* cpu_ptr); + virtual void convert_to_prv(void* cpu_ptr); + virtual bool layout_compare(std::shared_ptr other); + 
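// Annotation, not part of the patch: layout_compare() lets a consumer ask whether
// another descriptor's internal MKL layout already matches its own, so an existing
// private buffer can be reused as-is, while convert_from_other() below is the
// fallback that creates and executes a prv-to-prv dnnConversion when the two
// internal layouts differ. Both are implemented in mkl_memory.cc further down.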
virtual void convert_from_other(std::shared_ptr other); + protected: + DType* internal_ptr; +}; + +template +struct MKLMemoryDescriptor : MKLMemoryDescriptorBase { + // The last get_converted_prv() argument is a hack for reusing + // in backward a conversion done already in the forward direction. + DType* get_converted_prv(DType *data_ptr, bool set_prv_ptr, + const TBlob &blob); + void* get_output_ptr(DType *data_ptr, std::shared_ptr > self_ptr, + const TBlob &blob, bool in_place = false); + bool copy_from(std::shared_ptr dnn_chunk); + MKLMemoryDescriptor() {} +}; + +template struct MKLData : MKLMemoryDescriptor { + static std::shared_ptr > create() { + return std::make_shared >(); + } +}; + +template struct MKLData; +template struct MKLData; + +} // namespace mxnet +#endif // MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_ diff --git a/src/operator/mkl/mkl_memory.cc b/src/operator/mkl/mkl_memory.cc new file mode 100644 index 000000000000..7682fe1c1f37 --- /dev/null +++ b/src/operator/mkl/mkl_memory.cc @@ -0,0 +1,291 @@ +/******************************************************************************* +* Copyright 2016 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +* \file mkl_memory.cc +* \brief +* \author lingyan.guo@intel.com +* zhenlin.luo@intel.com +* +*******************************************************************************/ +#include "../operator_common.h" + +#if MXNET_USE_MKL2017 == 1 +#include +#include "mkl_memory-inl.h" +#include "mkl_util-inl.h" + +namespace mxnet { + +template +void MKLMemoryDescriptorBase::create_conversions() { + int status; + if (this->convert_from_int) { + status = dnnDelete(this->convert_from_int); + CHECK_EQ(status, E_SUCCESS); + this->convert_from_int = NULL; + } + if (this->convert_to_int) { + status = dnnDelete(this->convert_to_int); + CHECK_EQ(status, E_SUCCESS); + this->convert_to_int = NULL; + } + if (layout_int + && !dnnLayoutCompare(layout_usr, layout_int)) { + CHECK(layout_usr); + status = dnnConversionCreate(&convert_to_int, layout_usr, + layout_int); + CHECK_EQ(status, E_SUCCESS) + << "Failed creation convert_to_int with status " + << status << " for buffer: " << this->name << "\n"; + status = dnnConversionCreate(&convert_from_int, layout_int, + layout_usr); + CHECK_EQ(status, E_SUCCESS) + << "Failed creation convert_from_int with status " + << status << " for buffer: " << this->name << "\n"; + } +} + +template +void MKLMemoryDescriptorBase::create_internal_layout( + const dnnPrimitive_t primitive, dnnResourceType_t type) { + int status; + if (this->layout_int) { + status = dnnLayoutDelete(this->layout_int); + CHECK_EQ(status, E_SUCCESS); + } + status = dnnLayoutCreateFromPrimitive( + &this->layout_int, primitive, type); + CHECK_EQ(status, E_SUCCESS) + << "Failed dnnLayoutCreateFromPrimitive with status " + << status << " for buffer: " << this->name << "\n"; + + if (this->layout_usr) + this->create_conversions(); +} + +template +void MKLMemoryDescriptorBase::create_user_layout( + size_t dimension, const size_t size[], const size_t 
strides[]) { + int status; + if (this->layout_usr) { + status = dnnLayoutDelete(this->layout_usr); + CHECK_EQ(status, E_SUCCESS); + } + + status = dnnLayoutCreate( + &this->layout_usr, dimension, size, strides); + CHECK_EQ(status, E_SUCCESS) << "Failed dnnLayoutCreate with status " + << status << " for buffer: " << this->name << "\n"; + + if (this->layout_int) + this->create_conversions(); +} + +template +void MKLMemoryDescriptorBase::create_layouts( + const dnnPrimitive_t primitive, dnnResourceType_t type, + size_t dimension, const size_t size[], const size_t strides[]) { + this->create_internal_layout(primitive, type); + this->create_user_layout(dimension, size, strides); +} + + +template +void MKLMemoryDescriptorBase::convert_from_prv(void* cpu_ptr) { + CHECK(cpu_ptr); + CHECK(this->convert_from_int); + int status; + void *convert_resources[dnnResourceNumber]; + + convert_resources[dnnResourceFrom] = this->prv_ptr(); + convert_resources[dnnResourceTo] = cpu_ptr; + status = dnnExecute(this->convert_from_int, convert_resources); + CHECK_EQ(status, 0) << "Conversion from prv failed with status " << status; +} + +template +void MKLMemoryDescriptorBase::convert_to_prv(void* cpu_ptr) { + CHECK(cpu_ptr); + CHECK(this->convert_to_int); + int status; + void *convert_resources[dnnResourceNumber]; + + convert_resources[dnnResourceFrom] = cpu_ptr; + convert_resources[dnnResourceTo] = this->prv_ptr(); + status = dnnExecute(this->convert_to_int, convert_resources); + CHECK_EQ(status, 0) << "Conversion from prv failed with status " << status; +} + + +template +bool MKLMemoryDescriptorBase::layout_compare( + std::shared_ptr other) { + CHECK_EQ(other->get_descr_type(), + PrvMemDescr::PRV_DESCR_MKL2017); + std::shared_ptr >other_descr = + std::static_pointer_cast > + (other); + + if (dnnLayoutCompare(other_descr->layout_int, + this->layout_int)) + return true; + else + return false; +} + +template +void MKLMemoryDescriptorBase::convert_from_other( + std::shared_ptr other) { + std::shared_ptr > other_descr = + std::static_pointer_cast > + (other); + + int status; + dnnPrimitive_t convert; + status = dnnConversionCreate(&convert, + other_descr->layout_int, this->layout_int); + + void *convert_resources[dnnResourceNumber]; + convert_resources[dnnResourceFrom] = other_descr->prv_ptr(); + convert_resources[dnnResourceTo] = this->prv_ptr(); + status = dnnExecute(convert, convert_resources); + CHECK_EQ(status, 0) << "Conversion from other failed with status " + << status; + + dnnDelete(convert); +} + + +template +Dtype* MKLMemoryDescriptor::get_converted_prv( + Dtype *cpu_ptr, bool set_prv_ptr, const TBlob &blob) { + Dtype* prv_ptr = NULL; + std::shared_ptr dnn_chunk = NULL; +#if MKL_EXPERIMENTAL == 1 + dnn_chunk = blob.Mkl_mem_; +#endif +#if MKL_EXPERIMENTAL == 1 + if (dnn_chunk != NULL) + prv_ptr = static_cast(dnn_chunk->prv_data()); +#endif + + if (this->convert_to_int != NULL) { +#if MKL_EXPERIMENTAL == 1 + int status; + void *convert_resources[dnnResourceNumber]; +#endif + if (prv_ptr == NULL) { + this->allocate(); + this->convert_to_prv(cpu_ptr); +#if MKL_EXPERIMENTAL == 1 + if (set_prv_ptr) { + dnn_chunk->set_prv_descriptor(this->get_shared_ptr(), true); + } +#endif + return this->internal_ptr; + } +#if MKL_EXPERIMENTAL == 1 + if (prv_ptr != NULL) { + std::shared_ptr > current_descr = + op::mkl_get_mem_desc(dnn_chunk); + if (!dnnLayoutCompare(current_descr->layout_int, + this->layout_int)) { + if (this->convert_prv2prv) { + CHECK_EQ(dnnLayoutCompare( + this->descr_prv2prv_conversion->layout_int, + 
this->layout_int), 0); + status = 0; + } else { + status = dnnConversionCreate(&this->convert_prv2prv, + current_descr->layout_int, this->layout_int); + if (status == 0) + this->descr_prv2prv_conversion = current_descr; + } + if (status != 0) { + this->allocate(); + convert_resources[dnnResourceFrom] = cpu_ptr; + convert_resources[dnnResourceTo] = + reinterpret_cast(this->internal_ptr); + status = dnnExecute(this->convert_to_int, convert_resources); + CHECK_EQ(status, 0) << "Conversion failed with status " << status; + } else { + this->allocate(); + convert_resources[dnnResourceFrom] = reinterpret_cast(prv_ptr); + convert_resources[dnnResourceTo] = + reinterpret_cast(this->internal_ptr); + status = dnnExecute(this->convert_prv2prv, convert_resources); + CHECK_EQ(status, 0) << "Conversion failed with status " << status; + } + if (set_prv_ptr) { + dnn_chunk->set_prv_descriptor(this->get_shared_ptr(), true); + } + return this->internal_ptr; + } else if (current_descr.get() != this) { + // MKL_DLOG(INFO) << "layout OK " + // << current_descr->name << " == " << this->name; + } + } +#endif + return const_cast(prv_ptr); + } else { + if (prv_ptr != NULL) { +#if MKL_EXPERIMENTAL == 1 + std::shared_ptr > other_descr = + std::static_pointer_cast > + (dnn_chunk->prv_descriptor_); + dnn_chunk->check_and_prv_to_cpu(cpu_ptr); +#endif + // printf("get_converted_prv release %s\n", other_descr->name.c_str()); + } + } + return cpu_ptr; +} + +template +void* MKLMemoryDescriptor::get_output_ptr(Dtype *data_ptr, + std::shared_ptr > self_ptr, const TBlob &blob, bool in_place) { +#if MKL_EXPERIMENTAL == 1 + std::shared_ptr dnn_chunk = blob.Mkl_mem_; +#endif + if (this->conversion_needed()) { + void * prv_ptr = this->prv_ptr(); +#if MKL_EXPERIMENTAL == 1 + if (!in_place) { + dnn_chunk->set_prv_descriptor(self_ptr); + } else { + Dtype * blob_prv = op::mkl_prv_data(blob); + if (blob_prv != NULL) + return blob_prv; + } +#endif + return prv_ptr; + } else { +#if MKL_EXPERIMENTAL == 1 + std::shared_ptr > other_descr = + std::static_pointer_cast > + (dnn_chunk->prv_descriptor_); + dnn_chunk->check_and_prv_to_cpu(data_ptr); +#endif + return data_ptr; + } +} + +template class MKLMemoryDescriptor; +template class MKLMemoryDescriptor; + +template class MKLMemoryDescriptorBase; +template class MKLMemoryDescriptorBase; +} // namespace mxnet +#endif diff --git a/src/operator/mkl/mkl_memory.h b/src/operator/mkl/mkl_memory.h new file mode 100644 index 000000000000..13f1fd27b12b --- /dev/null +++ b/src/operator/mkl/mkl_memory.h @@ -0,0 +1,123 @@ +/******************************************************************************* +* Copyright 2016 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
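Annotation on get_converted_prv()/get_output_ptr() above (not part of the patch): when an operator asks for an input pointer it receives one of four things, depending on whether the op wants MKL's internal layout and on what the blob already holds. A simplified, standalone model of that decision (the enum and function names here are illustrative only):

#include <cstdio>

// Simplified model of the choice get_converted_prv() makes when an operator
// requests an input pointer in its preferred layout.
enum class Source { CpuConverted, PrvReused, PrvReconverted, CpuPassthrough };

Source pick_input_source(bool op_needs_internal_layout,   // convert_to_int != NULL
                         bool blob_has_prv,                // blob already holds prv data
                         bool prv_layout_matches) {        // dnnLayoutCompare(...) passes
  if (op_needs_internal_layout) {
    if (!blob_has_prv) return Source::CpuConverted;        // cpu -> internal buffer
    if (prv_layout_matches) return Source::PrvReused;      // hand back existing prv ptr
    return Source::PrvReconverted;                         // prv -> prv conversion
  }
  // Operator is happy with the user layout: fall back to the cpu pointer,
  // converting any stale prv copy back first (check_and_prv_to_cpu).
  return Source::CpuPassthrough;
}

int main() {
  // Blob carries prv data in a different layout than the op wants: re-convert.
  std::printf("%d\n", static_cast<int>(pick_input_source(true, true, false)));
  return 0;
}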
+* +* \file mkl_memory.cc +* \brief +* \author lingyan.guo@intel.com +* zhenlin.luo@intel.com +* +*******************************************************************************/ +#ifndef MXNET_OPERATOR_MKL_MKL_MEMORY_H_ +#define MXNET_OPERATOR_MKL_MKL_MEMORY_H_ + +#include +#include +#include + + +namespace mxnet { +// Base class +struct PrvMemDescr { + virtual void convert_from_prv(void* cpu_ptr) = 0; + virtual void convert_to_prv(void* cpu_ptr) = 0; + virtual void convert_from_other(std::shared_ptr other) = 0; + virtual void* prv_ptr(bool allocate_when_uninit = true) = 0; + // returns true for matching layouts + virtual bool layout_compare(std::shared_ptr other) = 0; + virtual size_t prv_count() = 0; + virtual size_t prv_size() = 0; + // This might help using prv_ptr_ by different accelerators/engines + enum PrvDescrType { + PRV_DESCR_MKL2017, + PRV_DESCR_MKLDNN + }; + virtual PrvDescrType get_descr_type() = 0; +}; + +#if MKL_EXPERIMENTAL == 1 +// Currently HEAD_AT_PRV do not free CPU data +enum SyncedHead { + HEAD_AT_CPU, + HEAD_AT_PRV, +}; +struct MKLMemHolder { + SyncedHead head_; + std::shared_ptr prv_descriptor_; + bool b_disable_prv_2_cpu; + bool b_eager_mode; + void disable_prv_2_cpu(bool flag) { + b_disable_prv_2_cpu = flag; + } + void set_eager_mode(bool eager_mode) { + b_eager_mode = eager_mode; + } + void set_prv_descriptor(std::shared_ptr descriptor, bool same_data = false) { + head_ = HEAD_AT_PRV; + prv_descriptor_ = descriptor; + } + std::shared_ptr get_prv_descriptor() { + return prv_descriptor_; + } + bool head_at_prv() { + return (head_ == HEAD_AT_PRV) ? true : false; + } + void* prv_data(bool allocate_when_uninit = true) { + if (head_ != HEAD_AT_PRV) { + return NULL; + } + if (prv_descriptor_ == NULL) { + LOG(FATAL) << " prv_descriptor_ is NULL"; + } + CHECK(prv_descriptor_.get()); + return reinterpret_cast(prv_descriptor_->prv_ptr(allocate_when_uninit)); + } + + int prv_count() { + if (head_ != HEAD_AT_PRV) { + return 0; + } + if (prv_descriptor_ == NULL) { + LOG(FATAL) << " prv_descriptor_ is NULL"; + } + CHECK(prv_descriptor_.get()); + return prv_descriptor_->prv_count(); + } + static std::shared_ptr create() { + return std::make_shared(); + } + void check_and_prv_to_cpu(void *dptr_) { + if (!b_disable_prv_2_cpu && head_ == HEAD_AT_PRV) { + CHECK(prv_descriptor_ != nullptr); + prv_descriptor_->convert_from_prv(dptr_); + // Because operator use CPU & maybe change it, change to CPU Flag + head_ = HEAD_AT_CPU; + } + if (b_disable_prv_2_cpu) { + b_disable_prv_2_cpu = false; + } + } + MKLMemHolder() : + head_(HEAD_AT_CPU), prv_descriptor_(nullptr), + b_disable_prv_2_cpu(false), b_eager_mode(false) {} +}; +#else +struct MKLMemHolder { + public: + virtual std::shared_ptr get_prv_descriptor() = 0; +}; +#endif + +} // namespace mxnet +#endif // MXNET_OPERATOR_MKL_MKL_MEMORY_H_ diff --git a/src/operator/mkl/mkl_pooling-inl.h b/src/operator/mkl/mkl_pooling-inl.h new file mode 100644 index 000000000000..5662a61aebd3 --- /dev/null +++ b/src/operator/mkl/mkl_pooling-inl.h @@ -0,0 +1,357 @@ +/******************************************************************************* +* Copyright 2016 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
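Annotation on MKLMemHolder above (not part of the patch): under MKL_EXPERIMENTAL the holder tracks whether the authoritative copy of a blob currently lives in the plain CPU buffer or in an MKL private buffer, and converts back lazily only when a non-MKL consumer is about to read it. A toy standalone model of that two-state synchronisation (ToyHolder is purely illustrative):

#include <cassert>

// Toy model of the holder: the blob's authoritative copy is either at the CPU
// buffer or at the MKL private buffer, and check_and_prv_to_cpu() converts back
// on demand.
struct ToyHolder {
  enum Head { AT_CPU, AT_PRV } head = AT_CPU;
  bool skip_next_sync = false;   // mirrors b_disable_prv_2_cpu
  int conversions = 0;

  void produced_by_mkl_op() { head = AT_PRV; }   // like set_prv_descriptor()
  void cpu_op_will_read() {                      // like check_and_prv_to_cpu()
    if (!skip_next_sync && head == AT_PRV) {
      ++conversions;   // convert_from_prv(cpu_ptr)
      head = AT_CPU;   // the CPU copy is authoritative again
    }
    skip_next_sync = false;
  }
};

int main() {
  ToyHolder h;
  h.produced_by_mkl_op();
  h.cpu_op_will_read();   // one conversion happens here
  h.cpu_op_will_read();   // already at CPU: no extra conversion
  assert(h.conversions == 1);
  return 0;
}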
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +* \file mkl_pooling-inl.h +* \brief +* \author zhenlin.luo@intel.com +* lingyan.guo@intel.com +* +*******************************************************************************/ + +#ifndef MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_ +#define MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_ +#include +#include +#include +#include "../operator_common.h" +#include "../nn/pooling-inl.h" +#include "./mkl_util-inl.h" + +namespace mxnet { +namespace op { + + +template +class MKLPoolingOp : public Operator { + public: + static std::string getName() { + return "MKLPoolingOp"; + } + explicit MKLPoolingOp(PoolingParam p) { + poolingFwd = static_cast(NULL); + poolingBwd = static_cast(NULL); + max_idx_data = static_cast(NULL); + fwd_top_data = MKLData::create(); + fwd_bottom_data = MKLData::create(); + bwd_top_diff = MKLData::create(); + bwd_bottom_diff = MKLData::create(); + this->param_ = p; + init_mkldnn_ = false; + } + virtual ~MKLPoolingOp() { + if (poolingFwd != NULL) { + dnnDelete(poolingFwd); + poolingFwd = NULL; + } + if (poolingBwd != NULL) { + dnnDelete(poolingBwd); + poolingBwd = NULL; + } + if (max_idx_data != NULL) { + dnnReleaseBuffer(max_idx_data); + max_idx_data = NULL; + } + } + + private: + void LayerSetUp(const mshadow::Tensor &data, + const mshadow::Tensor &out) { + channels_ = data.shape_[1]; + height_ = data.shape_[2]; + width_ = data.shape_[3]; + num_ = data.shape_[0]; + global_pooling_ = param_.global_pool; + if (global_pooling_) { + kernel_h_ = height_; + kernel_w_ = width_; + } else { + kernel_h_ = param_.kernel[0]; + kernel_w_ = param_.kernel[1]; + } + CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero."; + CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero."; + pad_h_ = param_.pad[0]; + pad_w_ = param_.pad[1]; + if (global_pooling_) { + stride_h_ = stride_w_ = 1; + } else { + stride_h_ = param_.stride[0]; + stride_w_ = param_.stride[1]; + } + if (global_pooling_) { + CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1) + << "With Global_pooling: true; only pad = 0 and stride = 1"; + } + if (pad_h_ != 0 || pad_w_ != 0) { + CHECK(param_.pool_type == pool_enum::kAvgPooling + || param_.pool_type == pool_enum::kMaxPooling) + << "Padding implemented only for average and max pooling."; + CHECK_LT(pad_h_, kernel_h_); + CHECK_LT(pad_w_, kernel_w_); + } + pooled_height_ = out.shape_[2]; + pooled_width_ = out.shape_[3]; + + size_t dim = 4; + size_t src_sizes[4], src_strides[4]; + size_t dst_sizes[4], dst_strides[4]; + src_sizes[0] = width_; + src_sizes[1] = height_; + src_sizes[2] = channels_; + src_sizes[3] = num_; + src_strides[0] = 1; + src_strides[1] = src_sizes[0]; + src_strides[2] = src_sizes[0] * src_sizes[1]; + src_strides[3] = src_sizes[0] * src_sizes[1] * src_sizes[2]; + dst_sizes[0] = pooled_width_; + dst_sizes[1] = pooled_height_; + dst_sizes[2] = src_sizes[2]; + dst_sizes[3] = src_sizes[3]; + dst_strides[0] = 1; + dst_strides[1] = dst_sizes[0]; + dst_strides[2] = dst_sizes[0] * dst_sizes[1]; + dst_strides[3] = dst_sizes[0] * dst_sizes[1] * dst_sizes[2]; + src_offset[0] = -pad_w_; + src_offset[1] = -pad_h_; + src_offset[2] = -pad_w_; + 
src_offset[3] = -pad_h_; + kernel_stride[0] = stride_w_; + kernel_stride[1] = stride_h_; + kernel_size[0] = kernel_w_; + kernel_size[1] = kernel_h_; + + // Names are for debugging only + fwd_bottom_data->name = "fwd_bottom_data @ " + getName(); + fwd_top_data->name = "fwd_top_data @ " + getName(); + bwd_top_diff->name = "bwd_top_diff @ " + getName(); + bwd_bottom_diff->name = "bwd_bottom_diff @ " + getName(); + + fwd_bottom_data->create_user_layout(dim, src_sizes, src_strides); + fwd_top_data->create_user_layout(dim, dst_sizes, dst_strides); + bwd_bottom_diff->create_user_layout(dim, src_sizes, src_strides); + bwd_top_diff->create_user_layout(dim, dst_sizes, dst_strides); + + // Primitives will be allocated during the first fwd pass + poolingFwd = NULL; + poolingBwd = NULL; + max_idx_data = NULL; + } + + public: + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), 1); + CHECK_EQ(out_data.size(), 1); + Stream *s = ctx.get_stream(); + if (param_.kernel.ndim() >= 3) { + LOG(FATAL) << "Not implmented"; + } + Tensor data = mkl_experimental_direct_get( + in_data[pool_enum::kData], s); + Tensor out = mkl_experimental_direct_get( + out_data[pool_enum::kOut], s); + if (!init_mkldnn_) { + LayerSetUp(data, out); + init_mkldnn_ = true; + } + auto first_pass = false; + if (poolingFwd == NULL) first_pass = true; + + dnnAlgorithm_t algorithm = dnnAlgorithmPoolingMax; + + switch (param_.pool_type) { + case pool_enum::kMaxPooling: + algorithm = dnnAlgorithmPoolingMax; + break; + case pool_enum::kAvgPooling: + algorithm = dnnAlgorithmPoolingAvgIncludePadding; + + break; + default: + LOG(FATAL) << "Unknown pooling method."; + } + + dnnError_t status; + void* pooling_res[dnnResourceNumber]; + + void* bottom_data = NULL; +#if MKL_EXPERIMENTAL == 1 + bottom_data = + reinterpret_cast(mkl_prv_data(in_data[pool_enum::kData])); +#endif + dnnBorder_t border_type = dnnBorderZerosAsymm; + switch (param_.pooling_convention) { + case pool_enum::kFull: + border_type = dnnBorderZeros; + break; + case pool_enum::kValid: + border_type = dnnBorderZerosAsymm; + break; + default: + border_type = dnnBorderZerosAsymm; + break; + } + if (NULL == bottom_data) { + bottom_data = data.dptr_; + if (NULL == poolingFwd) { + status = dnnPoolingCreateForward(&poolingFwd, NULL, + algorithm, fwd_bottom_data->layout_usr, + kernel_size, kernel_stride, + src_offset, border_type); + CHECK_EQ(status, E_SUCCESS); + // Now create poolingBwd + status = dnnPoolingCreateBackward(&poolingBwd, NULL, + algorithm, fwd_bottom_data->layout_usr, + kernel_size, kernel_stride, + src_offset, border_type); + CHECK_EQ(status, E_SUCCESS); + } + } +#if MKL_EXPERIMENTAL == 1 + if (NULL != bottom_data) { + if (NULL == poolingFwd) { + std::shared_ptr bottom_data_mem = in_data[pool_enum::kData].Mkl_mem_; + std::shared_ptr bottom_prv_descriptor = + bottom_data_mem->get_prv_descriptor(); + CHECK_EQ(bottom_prv_descriptor->get_descr_type(), + PrvMemDescr::PRV_DESCR_MKL2017); + std::shared_ptr > mem_descr + = std::static_pointer_cast>(bottom_prv_descriptor); + CHECK(mem_descr != nullptr); + fwd_bottom_data = mem_descr; + + status = dnnPoolingCreateForward(&poolingFwd, NULL, + algorithm, fwd_bottom_data->layout_int, + kernel_size, kernel_stride, + src_offset, border_type); + CHECK_EQ(status, E_SUCCESS); + fwd_top_data->create_internal_layout(poolingFwd, dnnResourceDst); + + // 
Now create poolingBwd + status = dnnPoolingCreateBackward(&poolingBwd, NULL, + algorithm, fwd_bottom_data->layout_int, + kernel_size, kernel_stride, + src_offset, border_type); + CHECK_EQ(status, E_SUCCESS); + bwd_top_diff->create_internal_layout(poolingFwd, dnnResourceDst); + bwd_bottom_diff->create_internal_layout(poolingFwd, dnnResourceSrc); + } + } +#endif + + if (first_pass) { + dnnLayout_t max_idx_datal = NULL; + status = dnnLayoutCreateFromPrimitive( + &max_idx_datal, poolingFwd, dnnResourceWorkspace); + CHECK_EQ(status, E_SUCCESS); + status = dnnAllocateBuffer(reinterpret_cast(&max_idx_data), max_idx_datal); + CHECK_EQ(status, E_SUCCESS); +#if MKL_EXPERIMENTAL == 0 + fwd_bottom_data->create_internal_layout(poolingFwd, dnnResourceSrc); + fwd_top_data->create_internal_layout(poolingFwd, dnnResourceDst); + bwd_top_diff->create_internal_layout(poolingBwd, dnnResourceDiffDst); + bwd_bottom_diff->create_internal_layout(poolingBwd, dnnResourceDiffSrc); +#endif + dnnLayoutDelete(max_idx_datal); + first_pass = false; + } + pooling_res[dnnResourceSrc] = bottom_data; + pooling_res[dnnResourceWorkspace] = max_idx_data; + + pooling_res[dnnResourceDst] = fwd_top_data->get_output_ptr( + out.dptr_, fwd_top_data, out_data[pool_enum::kOut]); + status = dnnExecute(poolingFwd, pooling_res); + CHECK_EQ(status, E_SUCCESS); +#if MKL_EXPERIMENTAL == 0 + if (fwd_top_data->conversion_needed()) { + fwd_top_data->convert_from_prv(out.dptr_); + } +#endif + } + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + if (!req[0]) { + return; + } + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1); + CHECK_EQ(in_data.size(), 1); + CHECK_EQ(out_data.size(), 1); + CHECK_EQ(req.size(), 1); + CHECK_EQ(in_grad.size(), 1); + if (param_.kernel.ndim() >= 3) { + LOG(FATAL) << "Not implmented"; + } + Stream *s = ctx.get_stream(); + Tensor grad = mkl_experimental_direct_get( + out_grad[pool_enum::kOut], s); + Tensor input_grad = mkl_experimental_direct_get( + in_grad[pool_enum::kData], s); + dnnError_t e; + void* pooling_res[dnnResourceNumber]; + pooling_res[dnnResourceWorkspace] = reinterpret_cast(max_idx_data); + + pooling_res[dnnResourceDiffDst] = + bwd_top_diff->get_converted_prv(grad.dptr_, true, out_grad[pool_enum::kOut]); + + pooling_res[dnnResourceDiffSrc] = bwd_bottom_diff->get_output_ptr( + input_grad.dptr_, bwd_bottom_diff, in_grad[pool_enum::kData]); + e = dnnExecute(poolingBwd, pooling_res); + CHECK_EQ(e, E_SUCCESS); +#if MKL_EXPERIMENTAL == 0 + if (bwd_bottom_diff->conversion_needed()) { + bwd_bottom_diff->convert_from_prv(input_grad.dptr_); + } +#endif + } + + private: + PoolingParam param_; + int kernel_h_, kernel_w_; + int stride_h_, stride_w_; + int pad_h_, pad_w_; + int channels_, num_; + int height_, width_; + int pooled_height_, pooled_width_; + bool global_pooling_; + + private: + size_t kernel_size[2], + kernel_stride[4]; + int src_offset[4]; // 2*(dimension-2) + dnnPrimitive_t poolingFwd, poolingBwd; + DType *max_idx_data; + + std::shared_ptr > fwd_top_data; + std::shared_ptr > fwd_bottom_data; + std::shared_ptr > bwd_top_diff; + std::shared_ptr > bwd_bottom_diff; + bool init_mkldnn_; +}; // class MKLPoolingOp +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_ diff --git a/src/operator/mkl/mkl_relu-inl.h b/src/operator/mkl/mkl_relu-inl.h new file mode 
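Annotation on MKLPoolingOp above (not part of the patch): two of its setup decisions are easy to miss in the long Forward body, namely how pooling_convention maps to the MKL border type and how global pooling rewrites the kernel and stride. A standalone sketch with local stand-in enums (not MKL's own types):

#include <cassert>
#include <cstdio>

enum Border { BORDER_ZEROS, BORDER_ZEROS_ASYMM };
enum Convention { CONV_FULL, CONV_VALID };

Border border_for(Convention c) {
  // Mirrors the switch on param_.pooling_convention: kFull -> dnnBorderZeros,
  // kValid (and anything else) -> dnnBorderZerosAsymm.
  return c == CONV_FULL ? BORDER_ZEROS : BORDER_ZEROS_ASYMM;
}

void setup_global_pool(int height, int width,
                       int pad_h, int pad_w,
                       int* kernel_h, int* kernel_w,
                       int* stride_h, int* stride_w) {
  // With global_pool the window is the whole feature map and the stride is
  // forced to 1; LayerSetUp additionally CHECKs that padding is zero.
  assert(pad_h == 0 && pad_w == 0);
  *kernel_h = height;
  *kernel_w = width;
  *stride_h = 1;
  *stride_w = 1;
}

int main() {
  int kh, kw, sh, sw;
  setup_global_pool(7, 7, 0, 0, &kh, &kw, &sh, &sw);
  std::printf("kernel=%dx%d stride=%dx%d border=%d\n",
              kh, kw, sh, sw, static_cast<int>(border_for(CONV_VALID)));
  return 0;
}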
100644 index 000000000000..8d7ab5e1e2db --- /dev/null +++ b/src/operator/mkl/mkl_relu-inl.h @@ -0,0 +1,272 @@ +/******************************************************************************* +* Copyright 2016 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +* \file mkl_relu-inl.h +* \brief +* \author zhenlin.luo@intel.com +* lingyan.guo@intel.com +* +*******************************************************************************/ +#ifndef MXNET_OPERATOR_MKL_MKL_RELU_INL_H_ +#define MXNET_OPERATOR_MKL_MKL_RELU_INL_H_ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include "../operator_common.h" +#include "./mkl_util-inl.h" + +namespace mxnet { +namespace op { + +template +class MKLReluOp : public Operator { + public: + static std::string getName() { + return "MKLReluOp"; + } + MKLReluOp(): + reluFwd_(NULL), + reluBwd_(NULL) { + init_mkldnn_ = false; + fwd_top_data_ = MKLData::create(); + fwd_bottom_data_ = MKLData::create(); + bwd_top_diff_ = MKLData::create(); + bwd_bottom_diff_ = MKLData::create(); + } + + ~MKLReluOp() { + if (reluFwd_ != NULL) { + dnnDelete(reluFwd_); + reluFwd_ = NULL; + } + if (reluBwd_ != NULL) { + dnnDelete(reluBwd_); + reluBwd_ = NULL; + } + } + + private: + void LayerSetUp(const mshadow::Tensor &data, + const mshadow::Tensor &out) { + size_t dim = 4; + size_t *sizes = new size_t[dim]; + size_t *strides = new size_t[dim]; + for (size_t d = 0; d < dim; ++d) { + (sizes)[d] = data.shape_[dim - 1 - d]; + (strides)[d] = (d == 0) ? 
1 : (strides)[d - 1] * (sizes)[d - 1]; + } + // Names are for debugging only + fwd_bottom_data_->name = "fwd_bottom_data @ " + getName(); + fwd_top_data_->name = "fwd_top_data @ " + getName(); + bwd_bottom_diff_->name = "bwd_bottom_diff @ " + getName(); + bwd_top_diff_->name = "bwd_top_diff @ " + getName(); + fwd_bottom_data_->create_user_layout(dim, (sizes), (strides)); + fwd_top_data_->create_user_layout(dim, (sizes), (strides)); + bwd_bottom_diff_->create_user_layout(dim, (sizes), (strides)); + bwd_top_diff_->create_user_layout(dim, (sizes), (strides)); + delete[] sizes; + delete[] strides; + } + + public: + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), 1); + CHECK_EQ(out_data.size(), 1); + Stream *s = ctx.get_stream(); + Tensor data; + Tensor out; + if (in_data[activation::kData].ndim() == 1) { + Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], 1, 1, 1); + data = mkl_experimental_direct_get_with_shape( + in_data[activation::kData], dshape, s); + out = mkl_experimental_direct_get_with_shape( + out_data[activation::kOut], dshape, s); + } else if (in_data[activation::kData].ndim() == 2) { + Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], + in_data[activation::kData].shape_[1], 1, 1); + data = mkl_experimental_direct_get_with_shape( + in_data[activation::kData], dshape, s); + out = mkl_experimental_direct_get_with_shape( + out_data[activation::kOut], dshape, s); + } else if (in_data[activation::kData].ndim() == 3) { + Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], + in_data[activation::kData].shape_[1], + in_data[activation::kData].shape_[2], 1); + data = mkl_experimental_direct_get_with_shape( + in_data[activation::kData], dshape, s); + out = mkl_experimental_direct_get_with_shape( + out_data[activation::kOut], dshape, s); + } else { + data = mkl_experimental_direct_get(in_data[activation::kData], s); + out = mkl_experimental_direct_get(out_data[activation::kOut], s); + } + if (!init_mkldnn_) { + LayerSetUp(data, out); + init_mkldnn_ = true; + } + void* bottom_data = NULL; +#if MKL_EXPERIMENTAL == 1 + bottom_data = + reinterpret_cast(mkl_prv_data(in_data[activation::kData])); +#endif +#if MKL_EXPERIMENTAL == 1 + if (bottom_data != NULL) { + if (reluFwd_ == NULL) { + std::shared_ptr > mem_descr = + mkl_get_mem_desc(in_data[activation::kData].Mkl_mem_); + DType negative_slope = 0; + dnnError_t e; + e = dnnReLUCreateForward(&reluFwd_, NULL, mem_descr->layout_int, + negative_slope); + CHECK_EQ(e, E_SUCCESS); + e = dnnReLUCreateBackward(&reluBwd_, NULL, mem_descr->layout_int, + mem_descr->layout_int, negative_slope); + CHECK_EQ(e, E_SUCCESS); + + fwd_bottom_data_ = mem_descr; + fwd_top_data_->create_internal_layout(reluFwd_, dnnResourceDst); + bwd_top_diff_->create_internal_layout(reluFwd_, dnnResourceDst); + bwd_bottom_diff_->create_internal_layout(reluFwd_, dnnResourceSrc); + } + } +#endif + if (bottom_data == NULL) { + bottom_data = data.dptr_; + if (reluFwd_ == NULL) { + dnnError_t e; + DType negative_slope = 0; + e = dnnReLUCreateForward(&reluFwd_, NULL, + fwd_bottom_data_->layout_usr, negative_slope); + CHECK_EQ(e, E_SUCCESS); + e = dnnReLUCreateBackward(&reluBwd_, NULL, + fwd_bottom_data_->layout_usr, fwd_bottom_data_->layout_usr, + negative_slope); + CHECK_EQ(e, E_SUCCESS); + } + } + dnnError_t e; + void* relu_res[dnnResourceNumber]; + 
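// Annotation, not part of the patch: MKL 2017 primitives are executed by filling a
// void* array indexed by dnnResourceType_t and handing it to dnnExecute. Here the
// Src slot receives either the blob's private buffer (MKL_EXPERIMENTAL path) or the
// plain CPU pointer, and the Dst slot comes from get_output_ptr(), which returns the
// internal buffer when a conversion is needed, the blob's existing private buffer in
// the in-place case signalled by (data.dptr_ == out.dptr_), and out.dptr_ otherwise.
// In MKL_EXPERIMENTAL == 0 builds the result is converted back to the user layout
// right after dnnExecute.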
relu_res[dnnResourceSrc] = bottom_data; + + relu_res[dnnResourceDst] = fwd_top_data_->get_output_ptr( + out.dptr_, fwd_top_data_, out_data[activation::kOut], (data.dptr_ == out.dptr_)); + e = dnnExecute(reluFwd_, relu_res); + CHECK_EQ(e, E_SUCCESS); +#if MKL_EXPERIMENTAL == 0 + if (fwd_top_data_->conversion_needed()) { + fwd_top_data_->convert_from_prv(out.dptr_); + } +#endif + } + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + if (!req[0]) { + return; + } + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1); + CHECK(in_data.size() == 1 && in_grad.size() == 1); + CHECK_EQ(req.size(), 1); + Stream *s = ctx.get_stream(); + Tensor m_out_grad; + Tensor m_out_data; + Tensor m_in_grad; + + if (out_grad[activation::kOut].ndim() == 1) { + Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0], 1, 1, 1); + m_out_grad = mkl_experimental_direct_get_with_shape( + out_grad[activation::kOut], dshape, s); + m_out_data = mkl_experimental_direct_get_with_shape( + out_data[activation::kOut], dshape, s); + m_in_grad = mkl_experimental_direct_get_with_shape( + in_grad[activation::kData], dshape, s); + } else if (out_grad[activation::kOut].ndim() == 2) { + Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0], + out_grad[activation::kOut].shape_[1], 1, 1); + m_out_grad = mkl_experimental_direct_get_with_shape( + out_grad[activation::kOut], dshape, s); + m_out_data = mkl_experimental_direct_get_with_shape( + out_data[activation::kOut], dshape, s); + m_in_grad = mkl_experimental_direct_get_with_shape( + in_grad[activation::kData], dshape, s); + } else if (out_grad[activation::kOut].ndim() == 3) { + Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0], + out_grad[activation::kOut].shape_[1], + out_grad[activation::kOut].shape_[2], 1); + m_out_grad = mkl_experimental_direct_get_with_shape( + out_grad[activation::kOut], dshape, s); + m_out_data = mkl_experimental_direct_get_with_shape( + out_data[activation::kOut], dshape, s); + m_in_grad = mkl_experimental_direct_get_with_shape( + in_grad[activation::kData], dshape, s); + } else { + m_out_grad = mkl_experimental_direct_get(out_grad[activation::kOut], s); + m_out_data = mkl_experimental_direct_get(out_data[activation::kOut], s); + m_in_grad = mkl_experimental_direct_get(in_grad[activation::kData], s); + } + dnnError_t e; + void* relu_res[dnnResourceNumber]; + + void* bottom_data = NULL; +#if MKL_EXPERIMENTAL == 1 + bottom_data = reinterpret_cast(mkl_prv_data(out_data[activation::kOut])); +#endif + if (NULL == bottom_data) { + bottom_data = reinterpret_cast(const_cast(m_out_data.dptr_)); + } + relu_res[dnnResourceSrc] = bottom_data; + relu_res[dnnResourceDiffDst] = bwd_top_diff_->get_converted_prv(m_out_grad.dptr_, + true, out_grad[activation::kOut]); + relu_res[dnnResourceDiffSrc] = bwd_bottom_diff_->get_output_ptr( + m_in_grad.dptr_, bwd_bottom_diff_, in_grad[activation::kData]); + e = dnnExecute(reluBwd_, relu_res); + CHECK_EQ(e, E_SUCCESS); +#if MKL_EXPERIMENTAL == 0 + if (bwd_bottom_diff_->conversion_needed()) { + bwd_bottom_diff_->convert_from_prv(m_in_grad.dptr_); + } +#endif + } + + private: + bool init_mkldnn_; + std::shared_ptr > fwd_top_data_; + std::shared_ptr > fwd_bottom_data_; + std::shared_ptr > bwd_top_diff_; + std::shared_ptr > bwd_bottom_diff_; + dnnPrimitive_t reluFwd_, reluBwd_; +}; // class MKLReluOp +} // 
namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_MKL_MKL_RELU_INL_H_ diff --git a/src/operator/mkl/mkl_util-inl.h b/src/operator/mkl/mkl_util-inl.h new file mode 100644 index 000000000000..4ad786a2ce93 --- /dev/null +++ b/src/operator/mkl/mkl_util-inl.h @@ -0,0 +1,110 @@ +/******************************************************************************* +* Copyright 2016 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +* \file mkl_util-inl.h +* \brief +* \author lingyan.guo@intel.com +* zhenlin.luo@intel.com +* +*******************************************************************************/ +#ifndef MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_ +#define MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_ +#include +#define MKLDNN_CALL(func) \ + { \ + dnnError_t status = (func); \ + CHECK_EQ(status, E_SUCCESS) << "MKL DNN call failed (status: " << status << ")."; \ + } + + +namespace mxnet { +namespace op { + +#if MKL_EXPERIMENTAL == 1 + template + inline DType * mkl_prv_data(const TBlob &b) { + std::shared_ptr bottom_data_mem = b.Mkl_mem_; + bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv(); + if (mem_valid) { + return reinterpret_cast(bottom_data_mem->prv_data()); + } + return NULL; + } + + template + inline int mkl_prv_count(const TBlob &b) { + std::shared_ptr bottom_data_mem = b.Mkl_mem_; + bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv(); + if (mem_valid) { + return bottom_data_mem->prv_count(); + } + return 0; + } +#endif + inline void mkl_set_priv_flag(const TBlob &b) { +#if MKL_EXPERIMENTAL == 1 + std::shared_ptr bottom_data_mem = b.Mkl_mem_; + bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv(); + if (mem_valid) { + bottom_data_mem->disable_prv_2_cpu(true); + } +#endif + } +#if MKL_EXPERIMENTAL == 1 + template + inline std::shared_ptr > mkl_get_mem_desc( + const std::shared_ptr data_mem) { + std::shared_ptr prv_descriptor = + data_mem->get_prv_descriptor(); + CHECK_EQ(prv_descriptor->get_descr_type(), + PrvMemDescr::PRV_DESCR_MKL2017); + std::shared_ptr > mem_descr + = std::static_pointer_cast> + (prv_descriptor); + CHECK(mem_descr != NULL); + return mem_descr; + } +#endif + template + inline mshadow::Tensor mkl_experimental_direct_get( + const TBlob &b, mshadow::Stream *s) { + mkl_set_priv_flag(b); + return b.get(s); + } + template + inline mshadow::Tensor mkl_experimental_direct_get_with_shape( + const TBlob &b, const mshadow::Shape &shape, mshadow::Stream *s) { + mkl_set_priv_flag(b); + return b.get_with_shape(shape, s); + } +} // namespace op +#if MKL_EXPERIMENTAL == 1 +inline void mkl_tblobs_prv_to_cpu(const std::vector &data) { + for (size_t i = 0; i < data.size(); i++) { + std::shared_ptr mem_holder = data[i].Mkl_mem_; + if (mem_holder != nullptr && mem_holder->b_eager_mode) { + mem_holder->check_and_prv_to_cpu(data[i].dptr_); + } + } +} +inline void mkl_set_tblob_eager_mode(const TBlob &data) { + std::shared_ptr mem_holder = data.Mkl_mem_; + if (mem_holder != nullptr) { + 
mem_holder->set_eager_mode(true); + } +} +#endif +} // namespace mxnet +#endif // MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_ diff --git a/src/operator/nn/activation-inl.h b/src/operator/nn/activation-inl.h index a440f97e1382..ac8b747f0f39 100644 --- a/src/operator/nn/activation-inl.h +++ b/src/operator/nn/activation-inl.h @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file activation-inl.h * \brief Activation operator - * \author Bing Xu, Da Zheng + * \author Bing Xu */ #ifndef MXNET_OPERATOR_NN_ACTIVATION_INL_H_ @@ -37,7 +37,6 @@ #include #include "../operator_common.h" #include "../mxnet_op.h" -#include "../mshadow_op.h" namespace mxnet { namespace op { @@ -46,7 +45,6 @@ namespace op { namespace activation { enum ActivationOpInputs {kData}; enum ActivationOpOutputs {kOut}; -enum ActivationOpResource {kTempSpace}; enum ActivationOpType {kReLU, kSigmoid, kTanh, kSoftReLU}; } // activation @@ -61,148 +59,160 @@ struct ActivationParam : public dmlc::Parameter { .add_enum("softrelu", activation::kSoftReLU) .describe("Activation function to be applied."); } - - bool operator==(const ActivationParam& other) const { - return this->act_type == other.act_type; - } }; -} // namespace op -} // namespace mxnet +/** + * \brief This is the implementation of activation operator. + * \tparam xpu The device that the op will be executed on. + */ +template +class ActivationOp : public Operator { + public: + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), 1U); + CHECK_EQ(out_data.size(), 1U); + Stream *s = ctx.get_stream(); + const TBlob& input = in_data[activation::kData]; + const size_t sz = input.shape_.Size(); + if (sz) { + MXNET_ASSIGN_REQ_SWITCH(req[activation::kOut], Req, { + mxnet_op::Kernel, xpu>::Launch( + s, sz, + out_data[activation::kOut].dptr(), + input.dptr()); + }); + } + } -namespace std { -template<> -struct hash { - size_t operator()(const mxnet::op::ActivationParam& val) { - return val.act_type; + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1U); + CHECK(in_data.size() == 1 && in_grad.size() == 1); + CHECK_EQ(req.size(), 1U); + Stream *s = ctx.get_stream(); + const TBlob& m_out_grad = out_grad[activation::kOut]; + const TBlob& m_out_data = out_data[activation::kOut]; + const TBlob& m_in_grad = in_grad[activation::kData]; + const size_t sz = m_out_data.shape_.Size(); + if (sz) { + MXNET_ASSIGN_REQ_SWITCH(req[activation::kData], Req, { + mxnet_op::Kernel, Req>, xpu>::Launch( + s, sz, + m_in_grad.dptr(), + m_out_grad.dptr(), + m_out_data.dptr()); + }); + } } -}; -} // namespace std +}; // class ActivationOp -namespace mxnet { -namespace op { +// Declare Factory function, used for dispatch specialization +template +Operator* CreateOp(ActivationParam type, int dtype, const TShape& dshape); -template -void ActivationForward(const OpContext &ctx, const TBlob &in_data, - const OpReqType &req, const TBlob &out_data) { - using namespace mshadow; - using namespace mshadow::expr; - Stream *s = ctx.get_stream(); - const size_t sz = in_data.shape_.Size(); - if (sz) { - MXNET_ASSIGN_REQ_SWITCH(req, Req, { - mxnet_op::Kernel, xpu>::Launch( 
- s, sz, - out_data.dptr(), - in_data.dptr()); - }); +#if DMLC_USE_CXX11 +class ActivationProp : public OperatorProperty { + public: + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); } -} -template -void ActivationBackward(const OpContext &ctx, const TBlob &out_grad, - const TBlob &out_data, const OpReqType &req, - const TBlob &in_grad) { - using namespace mshadow; - using namespace mshadow::expr; - Stream *s = ctx.get_stream(); - const size_t sz = out_data.shape_.Size(); - if (sz) { - MXNET_ASSIGN_REQ_SWITCH(req, Req, { - mxnet_op::Kernel, Req>, xpu>::Launch( - s, sz, - in_grad.dptr(), - out_grad.dptr(), - out_data.dptr()); - }); + std::map GetParams() const override { + return param_.__DICT__(); } -} -template -void ActivationComputeImpl(const ActivationParam ¶m, const OpContext &ctx, - const TBlob &input, OpReqType req, const TBlob &output) { - MSHADOW_REAL_TYPE_SWITCH(input.type_flag_, DType, { - switch (param.act_type) { - case activation::kReLU: - ActivationForward( - ctx, input, req, output); - break; - case activation::kSigmoid: - ActivationForward( - ctx, input, req, output); - break; - case activation::kTanh: - ActivationForward( - ctx, input, req, output); - break; - case activation::kSoftReLU: - ActivationForward( - ctx, input, req, output); - break; - default: - LOG(FATAL) << "unknown activation type"; - } - }); -} + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 1U) << "Input:[data]"; + const TShape &dshape = in_shape->at(activation::kData); + if (dshape.ndim() == 0) return false; + out_shape->clear(); + out_shape->push_back(dshape); + return true; + } -template -void ActivationGradComputeImpl(const ActivationParam ¶m, const OpContext &ctx, - const TBlob &out_grad, const TBlob &out_data, - OpReqType req, const TBlob &output) { - MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, { - switch (param.act_type) { - case activation::kReLU: - ActivationBackward( - ctx, out_grad, out_data, req, output); - break; - case activation::kSigmoid: - ActivationBackward( - ctx, out_grad, out_data, req, output); - break; - case activation::kTanh: - ActivationBackward( - ctx, out_grad, out_data, req, output); - break; - case activation::kSoftReLU: - ActivationBackward( - ctx, out_grad, out_data, req, output); - break; - default: - LOG(FATAL) << "unknown activation type"; + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 1U); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); + } } - }); -} + out_type->clear(); + out_type->push_back(dtype); + return true; + } -template -void ActivationCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); - const ActivationParam& param = nnvm::get(attrs.parsed); - ActivationComputeImpl(param, ctx, inputs[0], req[0], outputs[0]); -} + OperatorProperty* Copy() const override { + auto ptr = new ActivationProp(); + ptr->param_ = param_; + return ptr; + } -template -void ActivationGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& 
inputs, - const std::vector& req, - const std::vector& outputs) { + std::string TypeString() const override { + return "Activation"; + } + + // decalre dependency and inplace optimization options + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { #if MXNET_USE_CUDNN == 1 - CHECK_EQ(inputs.size(), 3U); + return {out_grad[activation::kOut], out_data[activation::kOut], in_data[activation::kData]}; #else - CHECK_EQ(inputs.size(), 2U); -#endif - CHECK_EQ(outputs.size(), 1U); - CHECK_EQ(req.size(), 1U); - const ActivationParam& param = nnvm::get(attrs.parsed); - ActivationGradComputeImpl(param, ctx, inputs[0], inputs[1], req[0], outputs[0]); -} + return {out_grad[activation::kOut], out_data[activation::kOut]}; +#endif // MXNET_USE_CUDNN + } + std::vector > BackwardInplaceOption( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &in_grad) const override { + return {{out_grad[activation::kOut], in_grad[activation::kData]}}; + } + + std::vector > ForwardInplaceOption( + const std::vector &in_data, + const std::vector &out_data) const override { + return {{in_data[activation::kData], out_data[activation::kOut]}}; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + ActivationParam param_; +}; +#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_NN_ACTIVATION_INL_H_ diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc index 0da644cb1f70..401a9e3eaa56 100644 --- a/src/operator/nn/activation.cc +++ b/src/operator/nn/activation.cc @@ -17,130 +17,69 @@ * under the License. */ - /*! * Copyright (c) 2015 by Contributors * \file activation.cc * \brief activation op - * \author Bing Xu, Da Zheng + * \author Bing Xu */ #include "./activation-inl.h" #include "../mshadow_op.h" -#include "../tensor/elemwise_unary_op.h" -#if MXNET_USE_MKLDNN == 1 -#include "./mkldnn/mkldnn_base-inl.h" -#include "./mkldnn/mkldnn_ops-inl.h" -#endif // MXNET_USE_MKLDNN +#if MXNET_USE_MKL2017 == 1 +#include +#include "../mkl/mkl_memory-inl.h" +#include "../mkl/mkl_relu-inl.h" +#endif // MXNET_USE_MKL2017 namespace mxnet { namespace op { - -DMLC_REGISTER_PARAMETER(ActivationParam); - -// This will determine the order of the inputs for backward computation. 
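// --- Illustrative aside (not part of this patch): the kernels registered above are
// purely element-wise. The standalone C++ sketch below shows, under no MXNet headers,
// the math each act_type maps to, with the gradient written as a function of the
// forward output y = f(x); that is why the backward pass only needs out_grad and
// out_data. Names here (act_forward, act_grad_from_output) are illustrative only.
#include <algorithm>
#include <cmath>
#include <cstddef>

enum ActType { kActReLU, kActSigmoid, kActTanh, kActSoftReLU };

inline float act_forward(ActType t, float x) {
  switch (t) {
    case kActReLU:     return std::max(x, 0.0f);
    case kActSigmoid:  return 1.0f / (1.0f + std::exp(-x));
    case kActTanh:     return std::tanh(x);
    case kActSoftReLU: return std::log1p(std::exp(x));  // softplus
  }
  return x;
}

// Derivative expressed through the output y = f(x).
inline float act_grad_from_output(ActType t, float y) {
  switch (t) {
    case kActReLU:     return y > 0.0f ? 1.0f : 0.0f;
    case kActSigmoid:  return y * (1.0f - y);
    case kActTanh:     return 1.0f - y * y;
    case kActSoftReLU: return 1.0f - std::exp(-y);  // equals sigmoid(x)
  }
  return 1.0f;
}

// in_grad[i] = out_grad[i] * f'(y[i]), mirroring the element-wise kernel launch.
inline void act_backward(ActType t, const float* out_grad, const float* out_data,
                         float* in_grad, std::size_t n) {
  for (std::size_t i = 0; i < n; ++i)
    in_grad[i] = out_grad[i] * act_grad_from_output(t, out_data[i]);
}
// --- end aside ---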
-struct ActivationGrad { - const char *op_name; - std::vector operator()(const nnvm::NodePtr& n, - const std::vector& ograds) const { - std::vector heads(ograds.begin(), ograds.end()); - heads.emplace_back(nnvm::NodeEntry{n, activation::kOut, 0}); -#if MXNET_USE_CUDNN == 1 - heads.push_back(n->inputs[activation::kData]); -#endif - return MakeGradNode(op_name, n, heads, n->attrs.dict); - } -}; - -#if MXNET_USE_MKLDNN == 1 -static void ActivationComputeExCPU(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const ActivationParam& param = nnvm::get(attrs.parsed); - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); - if (SupportMKLDNN(inputs[0])) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNNActivationForward(attrs, ctx, inputs[0], req[0], outputs[0]); - MKLDNN_OPCHECK_RUN(ActivationCompute, attrs, ctx, inputs, req, outputs); - return; +template<> +Operator *CreateOp(ActivationParam param, int dtype, const TShape& dshape) { + Operator *op = NULL; +#if MXNET_USE_MKL2017 == 1 + if (param.act_type == activation::kReLU && dshape.ndim() <= 4) { + switch (dtype) { + case mshadow::kFloat32: + return new MKLReluOp(); + case mshadow::kFloat64: + return new MKLReluOp(); + default: + break; + } } - ActivationComputeImpl(param, ctx, inputs[0].data(), req[0], outputs[0].data()); -} - -void ActivationGradComputeExCPU(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { -#if MXNET_USE_CUDNN == 1 - CHECK_EQ(inputs.size(), 3U); -#else - CHECK_EQ(inputs.size(), 2U); + if (enableMKLWarnGenerated()) + LOG(INFO) << MKLReluOp::getName() << " Skip MKL optimization"; #endif - const ActivationParam& param = nnvm::get(attrs.parsed); - if (SupportMKLDNN(inputs[0])) { - MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); - MKLDNNActivationBackward(attrs, ctx, inputs[0], inputs[1], req[0], - outputs[0]); - MKLDNN_OPCHECK_RUN(ActivationGradCompute, attrs, ctx, inputs, req, outputs); - return; - } - ActivationGradComputeImpl(param, ctx, inputs[0].data(), inputs[1].data(), - req[0], outputs[0].data()); + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + switch (param.act_type) { + case activation::kReLU: + op = new ActivationOp(); + break; + case activation::kSigmoid: + op = new ActivationOp(); + break; + case activation::kTanh: + op = new ActivationOp(); + break; + case activation::kSoftReLU: + op = new ActivationOp(); + break; + default: + LOG(FATAL) << "unknown activation type"; + } + }) + return op; } -#endif -inline static bool ActivationStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK_EQ(in_attrs->size(), 1); - CHECK_EQ(out_attrs->size(), 1); - const ActivationParam& param = nnvm::get(attrs.parsed); - bool ret = ElemwiseStorageType<1, 1, false, false, false>(attrs, dev_mask, - dispatch_mode, - in_attrs, out_attrs); -#if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) { - *dispatch_mode = DispatchMode::kFComputeEx; - } -#endif - return ret; +// DO_BIND_DISPATCH comes from operator_common.h +Operator *ActivationProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_shape)[0]); } -inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs, - const int 
dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { -#if MXNET_USE_CUDNN == 1 - CHECK_EQ(in_attrs->size(), 3U); -#else - CHECK_EQ(in_attrs->size(), 2U); -#endif - CHECK_EQ(out_attrs->size(), 1U); - const ActivationParam& param = nnvm::get(attrs.parsed); -#if MXNET_USE_CUDNN == 1 - bool ret = ElemwiseStorageType<3, 1, false, false, false>(attrs, dev_mask, - dispatch_mode, - in_attrs, out_attrs); -#else - bool ret = ElemwiseStorageType<2, 1, false, false, false>(attrs, dev_mask, - dispatch_mode, - in_attrs, out_attrs); -#endif -#if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) { - *dispatch_mode = DispatchMode::kFComputeEx; - } -#endif - return ret; -} +DMLC_REGISTER_PARAMETER(ActivationParam); -MXNET_OPERATOR_REGISTER_UNARY(Activation) +MXNET_REGISTER_OP_PROPERTY(Activation, ActivationProp) .describe(R"code(Applies an activation function element-wise to the input. The following activation functions are supported: @@ -151,35 +90,8 @@ The following activation functions are supported: - `softrelu`: Soft ReLU, or SoftPlus, :math:`y = log(1 + exp(x))` )code" ADD_FILELINE) -.set_attr_parser(ParamParser) -.set_attr("FInferStorageType", ActivationStorageType) -.set_attr("FCompute", ActivationCompute) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FComputeEx", ActivationComputeExCPU) -#endif -.set_attr("FGradient", ActivationGrad{"_backward_Activation"}) +.add_argument("data", "NDArray-or-Symbol", "Input array to activation function.") .add_arguments(ActivationParam::__FIELDS__()); -NNVM_REGISTER_OP(_backward_Activation) -.set_num_inputs(3) -.set_num_outputs(1) -.set_attr("TIsBackward", true) -.set_attr("FInferStorageType", BackwardActStorageType) -.set_attr("FInferShape", ElemwiseShape<3, 1>) -.set_attr("FInferType", ElemwiseType<3, 1>) -.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ - return std::vector >{{0, 0}}; -}) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; -}) -#endif -.set_attr_parser(ParamParser) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FComputeEx", ActivationGradComputeExCPU) -#endif -.set_attr("FCompute", ActivationGradCompute); - } // namespace op } // namespace mxnet diff --git a/src/operator/nn/activation.cu b/src/operator/nn/activation.cu index dc435b2acc17..c2f6be9f37c8 100644 --- a/src/operator/nn/activation.cu +++ b/src/operator/nn/activation.cu @@ -31,73 +31,39 @@ namespace mxnet { namespace op { - -#if MXNET_USE_CUDNN == 1 - -template -static CuDNNActivationOp &get_cudnn_op(const ActivationParam& param) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local CuDNNActivationOp cudnn_op; -#else - static MX_THREAD_LOCAL CuDNNActivationOp cudnn_op; -#endif - cudnn_op.Init(param); - return cudnn_op; -} - template<> -void ActivationCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); - const ActivationParam& param = nnvm::get(attrs.parsed); - +Operator *CreateOp(ActivationParam param, int dtype, const TShape& dshape) { + Operator *op = NULL; // SoftReLU not supported by CUDNN yet if (param.act_type == activation::kSoftReLU) { - MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - ActivationForward(ctx, - inputs[0], req[0], outputs[0]); - }); - } else { - MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - get_cudnn_op(param).Forward(ctx, inputs[0], 
req[0], outputs[0]); - }); + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new ActivationOp(); + }) + return op; } -} - -template<> -void ActivationGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CHECK_EQ(inputs.size(), 3U); - CHECK_EQ(outputs.size(), 1U); - CHECK_EQ(req.size(), 1U); - const ActivationParam& param = nnvm::get(attrs.parsed); - // SoftReLU not supported by CUDNN yet - if (param.act_type == activation::kSoftReLU) { - MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - ActivationBackward( - ctx, inputs[0], inputs[1], req[0], outputs[0]); - }); - } else { - MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - get_cudnn_op(param).Backward(ctx, inputs[0], inputs[2], inputs[1], req[0], outputs[0]); - }); - } +#if MXNET_USE_CUDNN == 1 + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new CuDNNActivationOp(param); + }) +#else + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + switch (param.act_type) { + case activation::kReLU: + op = new ActivationOp(); + break; + case activation::kSigmoid: + op = new ActivationOp(); + break; + case activation::kTanh: + op = new ActivationOp(); + break; + default: + LOG(FATAL) << "unknown activation"; + } + }) +#endif // MXNET_USE_CUDNN + return op; } -#endif - -NNVM_REGISTER_OP(Activation) -.set_attr("FCompute", ActivationCompute); - -NNVM_REGISTER_OP(_backward_Activation) -.set_attr("FCompute", ActivationGradCompute); - } // namespace op } // namespace mxnet diff --git a/src/operator/nn/batch_norm-inl.h b/src/operator/nn/batch_norm-inl.h index 27e0a8434d77..2a9dee2cf845 100644 --- a/src/operator/nn/batch_norm-inl.h +++ b/src/operator/nn/batch_norm-inl.h @@ -21,7 +21,7 @@ * Copyright (c) 2017 by Contributors * \file batch_norm-inl.h * \brief - * \author Bing Xu, Chris Olivier, Da Zheng + * \author Bing Xu, Chris Olivier */ #ifndef MXNET_OPERATOR_NN_BATCH_NORM_INL_H_ #define MXNET_OPERATOR_NN_BATCH_NORM_INL_H_ @@ -47,10 +47,8 @@ namespace mxnet { namespace op { namespace batchnorm { -enum BatchNormOpInputs {kData, kGamma, kBeta, kInMovingMean, - kInMovingVar}; // kGamma: weights, kBeta: biases +enum BatchNormOpInputs {kData, kGamma, kBeta}; // kGamma: weights, kBeta: biases enum BatchNormOpOutputs {kOut, kMean, kVar}; // req, out_data -enum BatchNormOpResource {kTempSpace}; enum BatchNormOpAuxiliary {kMovingMean, kMovingVar}; // aux_states /*! \brief Default channel axis if none specified int he params */ @@ -85,203 +83,280 @@ struct BatchNormParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(cudnn_off).set_default(false) .describe("Do not select CUDNN operator, if available"); } - - bool operator==(const BatchNormParam& other) const { - return this->eps == other.eps && - this->momentum == other.momentum && - this->fix_gamma == other.fix_gamma && - this->use_global_stats == other.use_global_stats && - this->output_mean_var == other.output_mean_var && - this->axis == other.axis && - this->cudnn_off == other.cudnn_off; - } }; -} // namespace op -} // namespace mxnet - -namespace std { -template<> -struct hash { - size_t operator()(const mxnet::op::BatchNormParam& val) { - size_t ret = 0; - ret = dmlc::HashCombine(ret, val.momentum); - ret = dmlc::HashCombine(ret, val.fix_gamma); - ret = dmlc::HashCombine(ret, val.use_global_stats); - ret = dmlc::HashCombine(ret, val.output_mean_var); - ret = dmlc::HashCombine(ret, val.axis); - return ret; +/*! 
\brief Batch normalization operator */ +template +class BatchNormOp : public Operator { + public: + explicit BatchNormOp(BatchNormParam param) { + this->param_ = param; } -}; -} // namespace std -namespace mxnet { -namespace op { + static inline bool IsWriting(const OpReqType ort) { + return ort == kWriteTo || ort == kWriteInplace; + } -static inline bool IsBNWriting(const OpReqType ort) { - return ort == kWriteTo || ort == kWriteInplace; -} + /*! + * \brief perform a forward operation of Operator, save the output to TBlob. + * \param ctx runtime context available to this call + * \param in_data array of input data, it is const + * \param req the request types of saving operation, can only be kWriteTo or kWriteInplace. + * \param out_data array of output data, pointer is used to indicate that this is holder + * the space of TBlob in out_data must be pre-allocated with InferShape + * \param aux_states Auxiliary states of operator. Normally operator doesn't + * need, epecial case like Batch Norm requires. + * \sa OpReqType, OpContext + */ + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + + CHECK_EQ(in_data.size(), 3U); + CHECK_EQ(aux_states.size(), 2U); + if (ctx.is_train) { + CHECK_EQ(out_data.size(), 3U); + CHECK_EQ(req.size(), 3U); + } else { + CHECK_GE(out_data.size(), 1U); + CHECK_GE(req.size(), 1U); + CHECK_EQ(req[batchnorm::kOut], kWriteTo); + } + Stream *s = ctx.get_stream(); + DoForward(s, ctx, in_data, req, out_data, aux_states); + } -template -void BatchNormForwardImpl(mshadow::Stream *stream, - const OpContext &ctx, const BatchNormParam& param, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states); + /*! + * \brief Perform a Backward Operation, write gradient to the in_grad. + * + * \note + * Convention: + * out_grad.size() == OperatorProperty.NumVisibleOutputs() + * out_data.size() == OperatorProperty.NumOutputs() + * out_data can contain additional invisible returns that remembers the + * state carried from the Forward pass. For example mask in the dropout. + * The gradients are passed from visible returns in this function. + * + * \par + * Not all the TBlobs in the arguments will be available + * if you override the DeclareBackwardDependency of corresponding OperatorProperty class. + * Only the dependencies you declared will be available at corresponding position, + * the rest of the parameters are simply dummy where you will get a nullptr. + * You will be safe if you use the default DeclareBackwardDependency. + * But only declare what you need will give engine more chance for optimization. + * + * \param ctx runtime context available to this call + * \param out_grad the gradient value we get from of the Operator. + * \param in_data the array of input data. + * \param out_data the array of output data. + * \param req request types of the saving operation, can be all types. + * \param in_grad the array of gradient we need to write to. + * \param aux_states Auxiliary states of operator. 
Normally operator doesn't need + * \sa OperatorProperty, OpReqType, OpContext + */ + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { + CHECK_EQ(out_grad.size(), param_.output_mean_var ? 3U : 1U); + CHECK_EQ(in_data.size(), 3U); + CHECK_EQ(out_data.size(), 3U); + CHECK_EQ(in_grad.size(), 3U); + mshadow::Stream *s = ctx.get_stream(); + DoBackward(s, ctx, out_grad, in_data, + out_data, req, in_grad, aux_states); + } -template -void BatchNormBackwardImpl(mshadow::Stream *stream, - const OpContext &ctx, const BatchNormParam& param, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states); + private: + void DoForward(mshadow::Stream *stream, + const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states); + + void DoBackward(mshadow::Stream *stream, + const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states); #if MXNET_USE_CUDA -template -void BatchNormForwardImpl(mshadow::Stream *stream, - const OpContext &ctx, const BatchNormParam& param, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states); -template -void BatchNormBackwardImpl(mshadow::Stream *stream, - const OpContext &ctx, const BatchNormParam& param, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states); + void DoForward(mshadow::Stream *stream, + const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states); + void DoBackward(mshadow::Stream *stream, + const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states); #endif // MXNET_USE_CUDA -/*! - * \brief perform a forward operation of Operator, save the output to TBlob. - * \param ctx runtime context available to this call - * \param in_data array of input data, it is const - * \param req the request types of saving operation, can only be kWriteTo or kWriteInplace. - * \param out_data array of output data, pointer is used to indicate that this is holder - * the space of TBlob in out_data must be pre-allocated with InferShape - * \param aux_states Auxiliary states of operator. Normally operator doesn't - * need, epecial case like Batch Norm requires. 
- * \sa OpReqType, OpContext - */ -template -void BatchNormForward(const OpContext &ctx, const BatchNormParam& param, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - - CHECK_EQ(in_data.size(), 3U); - CHECK_EQ(aux_states.size(), 2U); - if (ctx.is_train) { - CHECK_EQ(out_data.size(), 3U); - CHECK_EQ(req.size(), 3U); - } else { - CHECK_GE(out_data.size(), 1U); - CHECK_GE(req.size(), 1U); - CHECK_EQ(req[batchnorm::kOut], kWriteTo); - } - Stream *s = ctx.get_stream(); - BatchNormForwardImpl(s, ctx, param, in_data, req, - out_data, aux_states); -} - -/*! - * \brief Perform a Backward Operation, write gradient to the in_grad. - * - * \note - * Convention: - * out_grad.size() == OperatorProperty.NumVisibleOutputs() - * out_data.size() == OperatorProperty.NumOutputs() - * out_data can contain additional invisible returns that remembers the - * state carried from the Forward pass. For example mask in the dropout. - * The gradients are passed from visible returns in this function. - * - * \par - * Not all the TBlobs in the arguments will be available - * if you override the DeclareBackwardDependency of corresponding OperatorProperty class. - * Only the dependencies you declared will be available at corresponding position, - * the rest of the parameters are simply dummy where you will get a nullptr. - * You will be safe if you use the default DeclareBackwardDependency. - * But only declare what you need will give engine more chance for optimization. - * - * \param ctx runtime context available to this call - * \param out_grad the gradient value we get from of the Operator. - * \param in_data the array of input data. - * \param out_data the array of output data. - * \param req request types of the saving operation, can be all types. - * \param in_grad the array of gradient we need to write to. - * \param aux_states Auxiliary states of operator. Normally operator doesn't need - * \sa OperatorProperty, OpReqType, OpContext - */ -template -void BatchNormBackward(const OpContext &ctx, const BatchNormParam& param, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { - CHECK_EQ(out_grad.size(), param.output_mean_var ? 3U : 1U); - CHECK_EQ(in_data.size(), 3U); - CHECK_EQ(out_data.size(), 3U); - CHECK_EQ(in_grad.size(), 3U); - mshadow::Stream *s = ctx.get_stream(); - BatchNormBackwardImpl(s, ctx, param, out_grad, in_data, - out_data, req, in_grad, aux_states); -} + /*! 
\brief Batch normalization operator parameters */ + BatchNormParam param_; +}; // class BatchNormOp template -void BatchNormCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const BatchNormParam& param = nnvm::get(attrs.parsed); - CHECK_EQ(inputs.size(), 5U); - std::vector in_data(inputs.begin(), - inputs.begin() + batchnorm::kInMovingMean); - std::vector aux_states(inputs.begin() + batchnorm::kInMovingMean, - inputs.end()); - MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, { - BatchNormForward(ctx, param, in_data, req, outputs, - aux_states); - }); -} - -template -void BatchNormGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CHECK_EQ(inputs.size(), 11U); - const BatchNormParam& param = nnvm::get(attrs.parsed); - int num_out_grads = param.output_mean_var ? 3U : 1U; - int in_data_start = 3; - int aux_states_start = in_data_start + batchnorm::kInMovingMean; - int out_data_start = in_data_start + batchnorm::kInMovingVar + 1; - std::vector out_grad(inputs.begin(), inputs.begin() + num_out_grads); - std::vector in_data(inputs.begin() + in_data_start, - inputs.begin() + aux_states_start); - std::vector aux_states(inputs.begin() + aux_states_start, - inputs.begin() + out_data_start); - std::vector out_data(inputs.begin() + out_data_start, inputs.end()); - std::vector in_grad(outputs.begin(), outputs.begin() + 3); - - MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, { - BatchNormBackward(ctx, param, out_grad, in_data, out_data, req, - in_grad, aux_states); - }); -} +Operator *CreateOp(BatchNormParam param, const int dtype, const TShape& shape); #if DMLC_USE_CXX11 +class BatchNormProp : public OperatorProperty { + public: + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, gamma, beta]"; + const TShape &dshape = in_shape->at(0); + + const size_t channelAxis = static_cast(param_.axis < 0 + ? static_cast(dshape.ndim()) + param_.axis + : param_.axis); + CHECK_LT(channelAxis, dshape.ndim()) << "Channel axis out of range: " << param_.axis; + + const int channelCount = dshape[channelAxis]; + + if (dshape.ndim() == 0) { + return false; + } + + in_shape->at(1) = TShape(Shape1(channelCount)); + in_shape->at(2) = TShape(Shape1(channelCount)); + + out_shape->clear(); + out_shape->push_back(dshape); // kOut + out_shape->push_back(Shape1(channelCount)); // kMean + out_shape->push_back(Shape1(channelCount)); // kVar + + aux_shape->clear(); + aux_shape->push_back(Shape1(channelCount)); // kMovingMean + aux_shape->push_back(Shape1(channelCount)); // kMovingVar + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + using namespace mshadow; + CHECK_GE(in_type->size(), 1U); + const int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + // For float16 input type beta, gamma, mean, and average are stored in float32. + // For other input types, these parameters have the same type as input + // NOTE: This requirement is from cuDNN (v. 
4 and 5) + int dtype_param; + MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, { + dtype_param = mshadow::DataType::kFlag; }); + for (index_t i = 1; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype_param; + } else { + UNIFORM_TYPE_CHECK((*in_type)[i], dtype_param, ListArguments()[i]); + } + } + for (index_t i = 0; i < aux_type->size(); ++i) { + if ((*aux_type)[i] != -1) { + UNIFORM_TYPE_CHECK((*aux_type)[i], dtype_param, ListArguments()[i]); + } + } + const size_t n_aux = this->ListAuxiliaryStates().size(); + aux_type->clear(); + for (size_t i = 0; i < n_aux; ++i) { + aux_type->push_back(dtype_param); + } + const size_t n_out = this->ListOutputs().size(); + out_type->clear(); + out_type->push_back(dtype); + for (size_t i = 1; i < n_out; ++i) { + out_type->push_back(dtype_param); + } + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new BatchNormProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "BatchNorm"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {out_grad[batchnorm::kOut], + out_data[batchnorm::kMean], + out_data[batchnorm::kVar], + in_data[batchnorm::kData], + in_data[batchnorm::kGamma] + }; + } + + int NumVisibleOutputs() const override { + if (param_.output_mean_var) { + return 3; + } + return 1; + } + + int NumOutputs() const override { + return 3; + } + + std::vector ListArguments() const override { + return {"data", "gamma", "beta"}; + } + + std::vector ListOutputs() const override { + return {"output", "mean", "var"}; + } + + std::vector ListAuxiliaryStates() const override { + return {"moving_mean", "moving_var"}; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + inline const BatchNormParam& getParam() const { + return param_; + } + + private: + BatchNormParam param_; +}; // class BatchNormProp namespace batchnorm { diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index ba6c413819e4..ca2883239488 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -21,15 +21,16 @@ * Copyright (c) 2015 by Contributors * \file batch_norm.cc * \brief - * \author Bing Xu, Chris Olivier, Da Zheng + * \author Bing Xu, Chris Olivier */ #include "batch_norm-inl.h" #include -#include "../elemwise_op_common.h" -#if MXNET_USE_MKLDNN == 1 -#include "./mkldnn/mkldnn_batch_norm-inl.h" -#endif +#if MXNET_USE_MKL2017 == 1 +#include +#include "../mkl/mkl_memory-inl.h" +#include "../mkl/mkl_batch_norm-inl.h" +#endif // MXNET_USE_MKL2017 /*! \brief inverse standard deviation <-> variance */ #define VARIANCE_TO_INVSTD(__var$, __eps$) (1.0/sqrt((__var$) + DType(__eps$))) @@ -88,12 +89,12 @@ static inline void ForEachFast(const BNTensor3 &in_data, /*! 
\brief Forward CPU */ template -void BatchNormForwardImpl(mshadow::Stream *, - const OpContext &ctx, const BatchNormParam& param_, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { +void BatchNormOp::DoForward(mshadow::Stream *, + const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { // Input batchnorm::BNTensor3 inputData(in_data[batchnorm::kData], param_.axis); const TBlob &weights = in_data[batchnorm::kGamma]; @@ -163,7 +164,7 @@ void BatchNormForwardImpl(mshadow::Stream *, // note that var is still invstd if (!param_.fix_gamma) { - if (IsBNWriting(req[batchnorm::kData])) { + if (IsWriting(req[batchnorm::kData])) { ForEachFast(inputData, outputData, channel, [thisWeight, thisBias, thisMean, thisInvstd](const DType *in_data, DType *out_data) { @@ -172,10 +173,10 @@ void BatchNormForwardImpl(mshadow::Stream *, }); } } else { - if (IsBNWriting(req[batchnorm::kGamma])) { + if (IsWriting(req[batchnorm::kGamma])) { w[channel] = AccReal(1); } - if (IsBNWriting(req[batchnorm::kData])) { + if (IsWriting(req[batchnorm::kData])) { ForEachFast(inputData, outputData, channel, [thisWeight, thisBias, thisMean, thisInvstd](const DType *in_data, DType *out_data) { @@ -188,14 +189,14 @@ void BatchNormForwardImpl(mshadow::Stream *, } template -void BatchNormBackwardImpl(mshadow::Stream *, - const OpContext &ctx, const BatchNormParam& param_, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { +void BatchNormOp::DoBackward(mshadow::Stream *, + const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { // Input Data batchnorm::BNTensor3 inputData(in_data[batchnorm::kData], param_.axis); const TBlob &weights = in_data[batchnorm::kGamma]; @@ -263,7 +264,7 @@ void BatchNormBackwardImpl(mshadow::Stream *, dotp += (*thisInputData - mean) * (*gradOut_data); }); - if (!gradIn.IsEmpty() && IsBNWriting(req[batchnorm::kData])) { // if there's a grad input + if (!gradIn.IsEmpty() && IsWriting(req[batchnorm::kData])) { // if there's a grad input if (is_train_and_not_global_stats) { // when in training mode // Q(X) = X - E[x] ; i.e. input centered to zero mean @@ -299,7 +300,7 @@ void BatchNormBackwardImpl(mshadow::Stream *, // May want to make this a param eventually const AccReal scale = 1.0f; - if (IsBNWriting(req[batchnorm::kGamma])) { + if (IsWriting(req[batchnorm::kGamma])) { if (!param_.fix_gamma) { gradWeightData[channel] = scale * dotp * invstd; } else { @@ -307,185 +308,51 @@ void BatchNormBackwardImpl(mshadow::Stream *, } } - if (IsBNWriting(req[batchnorm::kBeta])) { + if (IsWriting(req[batchnorm::kBeta])) { gradBiasData[channel] = scale * sumGradOut; } } } -DMLC_REGISTER_PARAMETER(BatchNormParam); - -static bool BatchNormShape(const nnvm::NodeAttrs& attrs, - std::vector *in_shape, - std::vector *out_shape) { - const BatchNormParam& param = nnvm::get(attrs.parsed); - using namespace mshadow; - CHECK_EQ(in_shape->size(), 5U) << "Input:[data, gamma, beta, MovingMean, MovingVar]"; - const TShape &dshape = in_shape->at(batchnorm::kData); - - const size_t channelAxis = static_cast(param.axis < 0 - ? 
static_cast(dshape.ndim()) + param.axis - : param.axis); - CHECK_LT(channelAxis, dshape.ndim()) << "Channel axis out of range: " << param.axis; - - const int channelCount = dshape[channelAxis]; - - if (dshape.ndim() == 0) { - return false; - } - - in_shape->at(batchnorm::kGamma) = TShape(Shape1(channelCount)); - in_shape->at(batchnorm::kBeta) = TShape(Shape1(channelCount)); - in_shape->at(batchnorm::kInMovingMean) = TShape(Shape1(channelCount)); // kMovingMean - in_shape->at(batchnorm::kInMovingVar) = TShape(Shape1(channelCount)); // kMovingVar - - out_shape->clear(); - out_shape->push_back(dshape); // kOut - out_shape->push_back(Shape1(channelCount)); // kMean - out_shape->push_back(Shape1(channelCount)); // kVar - - return true; -} - -static bool BatchNormType(const nnvm::NodeAttrs& attrs, - std::vector *in_type, std::vector *out_type) { - using namespace mshadow; - CHECK_GE(in_type->size(), 1U); - const int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - // For float16 input type beta, gamma, mean, and average are stored in float32. - // For other input types, these parameters have the same type as input - // NOTE: This requirement is from cuDNN (v. 4 and 5) - int dtype_param; - MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, { - dtype_param = mshadow::DataType::kFlag; }); - std::vector args{"data", "gamma", "beta", "mean", "var"}; - CHECK_LE(in_type->size(), args.size()); - for (index_t i = 1; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype_param; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype_param, args[i]); - } - } - const size_t n_out = 3; - out_type->clear(); - out_type->push_back(dtype); - for (size_t i = 1; i < n_out; ++i) { - out_type->push_back(dtype_param); - } - return true; -} - -#if MXNET_USE_MKLDNN == 1 -static inline bool SupportMKLDNNBN(const NDArray &input, const BatchNormParam ¶m) { - TShape shape = input.shape(); - return SupportMKLDNN(input) && shape.ndim() == 4 +template<> +Operator *CreateOp(BatchNormParam param, const int dtype, const TShape& shape) { + param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis); + Operator *op = nullptr; +#if MXNET_USE_MKL2017 == 1 + if (shape.ndim() == 4 && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS - && shape[param.axis] % 8 == 0; -} - -void BatchNormComputeExCPU(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - CHECK_EQ(inputs.size(), 5U); - const BatchNormParam ¶m = nnvm::get(attrs.parsed); - // MKLDNN batchnorm only works well on the special MKLDNN layout. 
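// --- Illustrative aside (not part of this patch): a minimal standalone sketch of the
// per-channel arithmetic the CPU forward pass performs on an NCHW tensor: compute the
// channel mean and inverse standard deviation, normalize, apply gamma/beta, and blend
// the moving statistics with `momentum`. The struct and function names, the default
// eps/momentum values, and the assumption that the output vectors are pre-sized are
// all illustrative assumptions, not MXNet's actual interfaces.
#include <cmath>
#include <cstddef>
#include <vector>

struct BNArgs {
  float eps = 1e-3f;
  float momentum = 0.9f;
  bool fix_gamma = true;  // when true, gamma is treated as 1
};

void batch_norm_forward(const std::vector<float>& x, std::size_t N, std::size_t C,
                        std::size_t S,  // S = H * W
                        const std::vector<float>& gamma, const std::vector<float>& beta,
                        std::vector<float>* y,
                        std::vector<float>* moving_mean, std::vector<float>* moving_var,
                        const BNArgs& args) {
  const double per_channel = static_cast<double>(N) * S;
  for (std::size_t c = 0; c < C; ++c) {
    // First pass: mean and variance over every element of channel c.
    double sum = 0.0, sq_sum = 0.0;
    for (std::size_t n = 0; n < N; ++n)
      for (std::size_t s = 0; s < S; ++s) {
        const float v = x[(n * C + c) * S + s];
        sum += v;
        sq_sum += static_cast<double>(v) * v;
      }
    const float mean = static_cast<float>(sum / per_channel);
    const float var = static_cast<float>(sq_sum / per_channel) - mean * mean;
    const float invstd = 1.0f / std::sqrt(var + args.eps);
    const float w = args.fix_gamma ? 1.0f : gamma[c];

    // Second pass: y = (x - mean) * invstd * gamma + beta.
    for (std::size_t n = 0; n < N; ++n)
      for (std::size_t s = 0; s < S; ++s) {
        const std::size_t i = (n * C + c) * S + s;
        (*y)[i] = (x[i] - mean) * invstd * w + beta[c];
      }

    // Running statistics used at inference time.
    (*moving_mean)[c] = (*moving_mean)[c] * args.momentum + mean * (1.0f - args.momentum);
    (*moving_var)[c]  = (*moving_var)[c] * args.momentum + var * (1.0f - args.momentum);
  }
}
// --- end aside ---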
- if (SupportMKLDNNBN(inputs[0], param) && inputs[0].IsMKLDNNData()) { - std::vector in_data(inputs.begin(), inputs.begin() + batchnorm::kInMovingMean); - std::vector aux_states(inputs.begin() + batchnorm::kInMovingMean, inputs.end()); - - if (inputs[0].dtype() == mshadow::kFloat32) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNNBatchNormForward(ctx, param, in_data, req, outputs, aux_states); - MKLDNN_OPCHECK_RUN(BatchNormCompute, attrs, ctx, inputs, req, outputs); - return; + && !mxnet::op::batchnorm::disable_mkl) { + switch (dtype) { + case mshadow::kFloat32: + op = new MKLBatchNormOp(param); + break; + case mshadow::kFloat64: + op = new MKLBatchNormOp(param); + break; + default: + // MKL operator doesn't support half_t, so fall through + break; } } - FallBackCompute(BatchNormCompute, attrs, ctx, inputs, req, outputs); -} - -void BatchNormGradComputeExCPU(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - CHECK_EQ(inputs.size(), 11U); - const BatchNormParam ¶m = nnvm::get(attrs.parsed); - int num_out_grads = param.output_mean_var ? 3U : 1U; - int in_data_start = 3; - int aux_states_start = in_data_start + batchnorm::kInMovingMean; - int out_data_start = in_data_start + batchnorm::kInMovingVar + 1; - - TShape shape = inputs[0].shape(); - // MKLDNN batchnorm only works well on the special MKLDNN layout. - if (SupportMKLDNNBN(inputs[0], param) - && (inputs[in_data_start].IsMKLDNNData() || inputs[0].IsMKLDNNData())) { - std::vector out_grad(inputs.begin(), inputs.begin() + num_out_grads); - std::vector in_data(inputs.begin() + in_data_start, - inputs.begin() + aux_states_start); - std::vector aux_states(inputs.begin() + aux_states_start, - inputs.begin() + out_data_start); - std::vector out_data(inputs.begin() + out_data_start, inputs.end()); - std::vector in_grad(outputs.begin(), outputs.begin() + 3); - - if (inputs[0].dtype() == mshadow::kFloat32) { - MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); - MKLDNNBatchNormBackward(ctx, param, out_grad, in_data, - out_data, req, in_grad, aux_states); - MKLDNN_OPCHECK_RUN(BatchNormGradCompute, attrs, ctx, inputs, req, outputs); - return; - } - } - FallBackCompute(BatchNormGradCompute, attrs, ctx, inputs, req, outputs); -} #endif - -static inline bool BatchNormStorageType(const nnvm::NodeAttrs &attrs, - const int dev_mask, - DispatchMode *dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK_EQ(in_attrs->size(), 5); - CHECK_EQ(out_attrs->size(), 3); - DispatchMode wanted_mode; -#if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) - wanted_mode = DispatchMode::kFComputeEx; - else -#endif - wanted_mode = DispatchMode::kFCompute; - for (int& v : *in_attrs) { - if (v == - 1) v = kDefaultStorage; + if (!op) { + MSHADOW_REAL_TYPE_SWITCH_EX(dtype, + DType, + AccReal, { + op = new BatchNormOp(param); }); } - return storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, wanted_mode); + return op; } -static inline bool backward_BatchNormStorageType(const nnvm::NodeAttrs &attrs, - const int dev_mask, - DispatchMode *dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK_EQ(in_attrs->size(), 11); - CHECK_EQ(out_attrs->size(), 5); - DispatchMode wanted_mode; -#if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) - wanted_mode = DispatchMode::kFComputeEx; - else -#endif - wanted_mode = DispatchMode::kFCompute; - for (int& v : *in_attrs) { - if (v 
== - 1) v = kDefaultStorage; - } - return storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, wanted_mode); +// DO_BIND_DISPATCH comes from operator_common.h +Operator *BatchNormProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_shape)[0]); } -NNVM_REGISTER_OP(BatchNorm) +DMLC_REGISTER_PARAMETER(BatchNormParam); + +MXNET_REGISTER_OP_PROPERTY(BatchNorm, BatchNormProp) .describe(R"code(Batch normalization. Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as @@ -531,44 +398,14 @@ Both ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is tr then set ``gamma`` to 1 and its gradient to 0. )code" ADD_FILELINE) -.set_num_inputs(5) -.set_num_outputs(3) -.set_attr_parser(ParamParser) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - return std::vector{"data", "gamma", "beta", "moving_mean", "moving_var"}; -}) -.set_attr("FListOutputNames", - [](const NodeAttrs& attrs) { - return std::vector{"output", "mean", "var"}; -}) -.set_attr("FNumVisibleOutputs", - [](const NodeAttrs& attrs) { - const BatchNormParam& param = nnvm::get(attrs.parsed); - return param.output_mean_var ? 3 : 1; -}) -.set_attr("FMutateInputs", [](const nnvm::NodeAttrs& attrs) { - return std::vector{3, 4}; -}) -.set_attr("FInferShape", BatchNormShape) -.set_attr("FInferType", BatchNormType) -.set_attr("FInferStorageType", BatchNormStorageType) -.set_attr("FCompute", BatchNormCompute) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FComputeEx", BatchNormComputeExCPU) -#endif -.set_attr("FGradient", ElemwiseGradUseInOut{"_backward_BatchNorm"}) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; -}) -#endif .add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization") .add_argument("gamma", "NDArray-or-Symbol", "gamma array") .add_argument("beta", "NDArray-or-Symbol", "beta array") .add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input") .add_argument("moving_var", "NDArray-or-Symbol", "running variance of input") -.add_arguments(BatchNormParam::__FIELDS__()) +.add_arguments(BatchNormParam::__FIELDS__()); + +NNVM_REGISTER_OP(BatchNorm) .set_attr( "FSetInputVarAttrOnCompose", [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) { @@ -580,20 +417,5 @@ then set ``gamma`` to 1 and its gradient to 0. } }); -NNVM_REGISTER_OP(_backward_BatchNorm) -.set_num_outputs(5) -.set_attr("TIsBackward", true) -.set_attr("FInferStorageType", backward_BatchNormStorageType) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; -}) -#endif -.set_attr_parser(ParamParser) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FComputeEx", BatchNormGradComputeExCPU) -#endif -.set_attr("FCompute", BatchNormGradCompute); - } // namespace op } // namespace mxnet diff --git a/src/operator/nn/batch_norm.cu b/src/operator/nn/batch_norm.cu index 80c15976b65f..59317b7fa837 100644 --- a/src/operator/nn/batch_norm.cu +++ b/src/operator/nn/batch_norm.cu @@ -21,7 +21,7 @@ * Copyright (c) 2017 by Contributors * \file batch_norm.cu * \brief CUDA Batch Normalization code - * \author Chris Olivier, Bing Xu, Da Zheng + * \author Chris Olivier, Bing Xu * Adapted from Torch */ #include @@ -579,13 +579,13 @@ static inline uint32_t SetupFlags(const OpContext &ctx, flags |= ctx.is_train ? 
IS_TRAINING_FLAG : 0; flags |= params.fix_gamma ? FIX_GAMMA_FLAG : 0; flags |= params.use_global_stats ? USE_GLOBAL_STATS_FLAG : 0; - if (IsBNWriting(req[batchnorm::kData])) { + if (BatchNormOp::IsWriting(req[batchnorm::kData])) { flags |= WRITE_DATA_FLAG; } - if (IsBNWriting(req[batchnorm::kGamma])) { + if (BatchNormOp::IsWriting(req[batchnorm::kGamma])) { flags |= WRITE_GAMMA_FLAG; } - if (IsBNWriting(req[batchnorm::kBeta])) { + if (BatchNormOp::IsWriting(req[batchnorm::kBeta])) { flags |= WRITE_BETA_FLAG; } return flags; @@ -593,12 +593,12 @@ static inline uint32_t SetupFlags(const OpContext &ctx, /*! \brief Forward batch-norm pass on GPU */ template -void BatchNormForwardImpl(mshadow::Stream *stream, - const OpContext &ctx, const BatchNormParam& param_, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { +void BatchNormOp::DoForward(mshadow::Stream *stream, + const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { batchnorm::cuda::BatchNormalizationUpdateOutput( stream, ctx, @@ -614,14 +614,14 @@ void BatchNormForwardImpl(mshadow::Stream *stream, /*! \brief Backward batch-norm pass on GPU */ template -void BatchNormBackwardImpl(mshadow::Stream *stream, - const OpContext &ctx, const BatchNormParam& param_, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { +void BatchNormOp::DoBackward(mshadow::Stream *stream, + const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { batchnorm::cuda::BatchNormalizationBackward( stream, ctx, @@ -637,92 +637,30 @@ void BatchNormBackwardImpl(mshadow::Stream *stream, MSHADOW_CUDA_POST_KERNEL_CHECK(BatchNormOp_DoBackward_gpu); } -#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4 -template -static CuDNNBatchNormOp &GetCuDNNOp(const BatchNormParam& param) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local CuDNNBatchNormOp op; -#else - static MX_THREAD_LOCAL CuDNNBatchNormOp op; -#endif - op.Init(param); - return op; -} -#endif - -template<> -void BatchNormCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - BatchNormParam param = nnvm::get(attrs.parsed); - CHECK_EQ(inputs.size(), 5U); - std::vector in_data(inputs.begin(), inputs.begin() + 3); - std::vector aux_states(inputs.begin() + 3, inputs.end()); - int dtype = inputs[0].type_flag_; - TShape shape = inputs[0].shape_; - - param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis); -#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 - if (!param.use_global_stats && !param.cudnn_off && shape.ndim() <= 4 - && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS) { - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - GetCuDNNOp(param).Forward(ctx, in_data, req, outputs, aux_states); - }) - } else { - MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, { - BatchNormForward(ctx, param, in_data, req, outputs, aux_states); - }) - } -#else - MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, { - BatchNormForward(ctx, param, in_data, req, outputs, aux_states); - }); -#endif -} - +/*! 
\brief Create GPU operator for batch normalization */ template<> -void BatchNormGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CHECK_EQ(inputs.size(), 11U); - BatchNormParam param = nnvm::get(attrs.parsed); - std::vector out_grad(1, inputs[0]); - std::vector in_data(inputs.begin() + 3, inputs.begin() + 6); - std::vector aux_states(inputs.begin() + 6, inputs.begin() + 8); - std::vector out_data(inputs.begin() + 8, inputs.end()); - std::vector in_grad(outputs.begin(), outputs.begin() + 3); - int dtype = inputs[0].type_flag_; - TShape shape = inputs[0].shape_; - +Operator *CreateOp(BatchNormParam param, const int dtype, const TShape& shape) { param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis); + Operator *op = NULL; #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 if (!param.use_global_stats && !param.cudnn_off && shape.ndim() <= 4 && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS) { MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - GetCuDNNOp(param).Backward(ctx, out_grad, in_data, out_data, - req, in_grad, aux_states); + op = new CuDNNBatchNormOp(param); }) } else { MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, { - BatchNormBackward(ctx, param, out_grad, - in_data, out_data, req, in_grad, aux_states); + op = new BatchNormOp(param); }) } #else - MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, { - BatchNormBackward(ctx, param, out_grad, - in_data, out_data, req, in_grad, aux_states); - }); + MSHADOW_REAL_TYPE_SWITCH_EX(dtype, + DType, + AccReal, + { op = new BatchNormOp(param); }); #endif + return op; } -NNVM_REGISTER_OP(BatchNorm) -.set_attr("FCompute", BatchNormCompute); - -NNVM_REGISTER_OP(_backward_BatchNorm) -.set_attr("FCompute", BatchNormGradCompute); - } // namespace op } // namespace mxnet diff --git a/src/operator/nn/concat-inl.h b/src/operator/nn/concat-inl.h deleted file mode 100644 index a7f1fa85f612..000000000000 --- a/src/operator/nn/concat-inl.h +++ /dev/null @@ -1,160 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * Copyright (c) 2015 by Contributors - * \file concat-inl.h - * \brief - * \author Bing Xu -*/ -#ifndef MXNET_OPERATOR_NN_CONCAT_INL_H_ -#define MXNET_OPERATOR_NN_CONCAT_INL_H_ -#include -#include -#include -#include -#include -#include -#include -#include -#include "../operator_common.h" -#include "../channel_op_common.h" -#include "../tensor/broadcast_reduce_op.h" - -namespace mxnet { -namespace op { - -namespace concat_enum { -enum ConcatOpInputs {kData0, kData1, kData2, kData3, kData4}; -enum ConcatOpResource {kTempSpace}; -enum ConcatOpOutputs {kOut}; -} // namespace concat_enum - -struct ConcatParam : public dmlc::Parameter { - int num_args; - int dim; - DMLC_DECLARE_PARAMETER(ConcatParam) { - DMLC_DECLARE_FIELD(num_args).set_lower_bound(1) - .describe("Number of inputs to be concated."); - DMLC_DECLARE_FIELD(dim).set_default(1) - .describe("the dimension to be concated."); - } -}; // struct ConcatParam - -template -class ConcatOp { - public: - void Init(const ConcatParam ¶m) { - this->size_ = param.num_args; - this->dimension_ = param.dim; - } - - void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(static_cast(in_data.size()), size_); - CHECK_EQ(out_data.size(), 1U); - int axis = CheckAxis(dimension_, in_data[concat_enum::kData0].ndim()); - Stream *s = ctx.get_stream(); - std::vector > data(size_); - Tensor out; - size_t leading = 1, trailing = 1; - for (int i = 0; i < axis; ++i) { - leading *= out_data[concat_enum::kOut].shape_[i]; - } - for (int i = axis + 1; i < out_data[concat_enum::kOut].ndim(); ++i) { - trailing *= out_data[concat_enum::kOut].shape_[i]; - } - size_t mid = out_data[concat_enum::kOut].shape_[axis]; - Shape<3> oshape = Shape3(leading, mid, trailing); - out = out_data[concat_enum::kOut].get_with_shape(oshape, s); - - for (int i = 0; i < size_; ++i) { - Shape<3> dshape = Shape3(leading, in_data[i].shape_[axis], trailing); - data[i] = in_data[i].get_with_shape(dshape, s); - } - Concatenate(data, &out, 1, req[concat_enum::kOut]); - } - - void Backward(const OpContext &ctx, const TBlob &out_grad, - const std::vector &req, - const std::vector &in_grad) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_grad.size(), static_cast(size_)); - int axis = CheckAxis(dimension_, out_grad.ndim()); - Stream *s = ctx.get_stream(); - std::vector > grad_in(size_); - Tensor grad; - size_t leading = 1, trailing = 1; - for (int i = 0; i < axis; ++i) { - leading *= out_grad.shape_[i]; - } - for (int i = axis + 1; i < out_grad.ndim(); ++i) { - trailing *= out_grad.shape_[i]; - } - size_t mid = out_grad.shape_[axis]; - Shape<3> oshape = Shape3(leading, mid, trailing); - grad = out_grad.get_with_shape(oshape, s); - - for (int i = 0; i < size_; ++i) { - Shape<3> dshape = Shape3(leading, in_grad[i].shape_[axis], trailing); - grad_in[i] = in_grad[i].get_with_shape(dshape, s); - } - Split(grad, &grad_in, 1, req); - } - - private: - int size_; - int dimension_; -}; // class ConcatOp - -template -void ConcatCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const ConcatParam& param = nnvm::get(attrs.parsed); - MSHADOW_TYPE_SWITCH(inputs[concat_enum::kData0].type_flag_, DType, { - ConcatOp op; - op.Init(param); - op.Forward(ctx, inputs, req, outputs); - }); -} - -template -void ConcatGradCompute(const nnvm::NodeAttrs& attrs, const 
OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const ConcatParam& param = nnvm::get(attrs.parsed); - MSHADOW_TYPE_SWITCH(inputs[concat_enum::kOut].type_flag_, DType, { - ConcatOp op; - op.Init(param); - op.Backward(ctx, inputs[concat_enum::kOut], req, outputs); - }); -} - -} // namespace op -} // namespace mxnet - -#endif // MXNET_OPERATOR_NN_CONCAT_INL_H_ diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc deleted file mode 100644 index 81dc95f1a5a5..000000000000 --- a/src/operator/nn/concat.cc +++ /dev/null @@ -1,289 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2015 by Contributors - * \file concat.cc - * \brief - * \author Bing Xu -*/ - -#include "./concat-inl.h" -#include "./mkldnn/mkldnn_ops-inl.h" -#include "./mkldnn/mkldnn_base-inl.h" -#include "../../common/utils.h" - -namespace mxnet { -namespace op { - -static bool ConcatShape(const nnvm::NodeAttrs& attrs, - std::vector *in_shape, - std::vector *out_shape) { - using namespace mshadow; - const ConcatParam& param_ = nnvm::get(attrs.parsed); - CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); - TShape dshape; - index_t size = 0; - bool has_zero = false; - int axis = -1; - for (int i = 0; i < param_.num_args; ++i) { - TShape tmp = (*in_shape)[i]; - if (tmp.ndim()) { - axis = CheckAxis(param_.dim, tmp.ndim()); - has_zero = tmp[axis] == 0 || has_zero; - size += tmp[axis]; - tmp[axis] = 0; - shape_assign(&dshape, tmp); - } - } - - TShape tmp = (*out_shape)[0]; - if (tmp.ndim()) { - axis = CheckAxis(param_.dim, tmp.ndim()); - tmp[axis] = 0; - shape_assign(&dshape, tmp); - } - - if (dshape.ndim() == 0) return false; - - for (int i = 0; i < param_.num_args; ++i) { - CHECK(shape_assign(&(*in_shape)[i], dshape)) - << "Incompatible input shape: expected " << dshape << ", got " << (*in_shape)[i]; - } - - if (!has_zero) dshape[axis] = size; - CHECK(shape_assign(&(*out_shape)[0], dshape)) - << "Incompatible output shape: expected " << dshape << ", got " << (*out_shape)[0]; - - return dshape.Size() != 0; -} - -static bool ConcatType(const nnvm::NodeAttrs& attrs, - std::vector *in_type, - std::vector *out_type) { - const ConcatParam& param_ = nnvm::get(attrs.parsed); - int dtype = -1; - - for (size_t i = 0; i < in_type->size(); ++i) { - if (dtype == -1) { - dtype = in_type->at(i); - } else { - CHECK(in_type->at(i) == dtype || - in_type->at(i) == -1) << - "Non-uniform data type in Concat"; - } - } - - if (dtype == -1) { - LOG(FATAL) << "Not enough information to infer type in Concat."; - return false; - } - - size_t nin = param_.num_args; - in_type->clear(); - for (size_t i = 0; i < nin; ++i) in_type->push_back(dtype); - - out_type->clear(); - out_type->push_back(dtype); - - 
return true; -} - -inline static bool ConcatForwardInferStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK(!in_attrs->empty()); - CHECK_EQ(out_attrs->size(), 1U); - DispatchMode wanted_mode; -#if MXNET_USE_MKLDNN == 1 - const ConcatParam& param = nnvm::get(attrs.parsed); - if (dev_mask == mshadow::cpu::kDevMask - && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage) - && param.dim > 0) - wanted_mode = DispatchMode::kFComputeEx; - else -#endif - wanted_mode = DispatchMode::kFCompute; - return storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, wanted_mode); -} - -inline static bool BackwardConcatStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - DispatchMode wanted_mode; -#if MXNET_USE_MKLDNN == 1 - const ConcatParam& param = nnvm::get(attrs.parsed); - CHECK_EQ(out_attrs->size(), in_attrs->size() - 1); - if (dev_mask == mshadow::cpu::kDevMask - && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage) - && param.dim > 0) - wanted_mode = DispatchMode::kFComputeEx; - else -#endif - wanted_mode = DispatchMode::kFCompute; - return storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, wanted_mode); -} - -#if MXNET_USE_MKLDNN == 1 -static void ConcatComputeExCPU(const nnvm::NodeAttrs& attrs, - const OpContext& op_ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CHECK(!inputs.empty()); - CHECK_EQ(outputs.size(), 1U); - CHECK_EQ(req.size(), 1U); - if (req[0] == kNullOp) return; - // MKLDNN support 2D and 4D concat - if ((inputs[0].shape().ndim() == 2 || inputs[0].shape().ndim() == 4) - && inputs[0].dtype() == mshadow::kFloat32) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNNConcatForward(attrs, op_ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(ConcatCompute, attrs, op_ctx, inputs, req, outputs); - return; - } - FallBackCompute(ConcatCompute, attrs, op_ctx, inputs, req, outputs); -} - -static void ConcatGradComputeExCPU(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - if ((inputs[0].shape().ndim() == 2 || inputs[0].shape().ndim() == 4) - && inputs[0].dtype() == mshadow::kFloat32) { - MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); - MKLDNNConcatBackward(attrs, ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(ConcatGradCompute, attrs, ctx, inputs, req, outputs); - return; - } - FallBackCompute(ConcatGradCompute, attrs, ctx, inputs, req, outputs); -} -#endif - -struct ConcatGrad { - const char *op_name; - std::vector operator()(const nnvm::NodePtr& n, - const std::vector& ograds) const { - CHECK_EQ(ograds.size(), 1); - std::vector heads(ograds.begin(), ograds.end()); -#if MXNET_USE_MKLDNN == 1 - for (size_t i = 0; i < n->inputs.size(); i++) { - heads.push_back(n->inputs[i]); - } -#endif - return MakeGradNode(op_name, n, heads, n->attrs.dict); - } -}; - -DMLC_REGISTER_PARAMETER(ConcatParam); - -NNVM_REGISTER_OP(Concat) -.describe(R"code(Joins input arrays along a given axis. - -.. note:: `Concat` is deprecated. Use `concat` instead. - -The dimensions of the input arrays should be the same except the axis along -which they will be concatenated. -The dimension of the output array along the concatenated axis will be equal -to the sum of the corresponding dimensions of the input arrays. 
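
The ConcatOp::Forward/Backward removed above implement this rule by collapsing every tensor into a 3-D view (leading, mid, trailing) around the concatenation axis, so the copy is always a concatenation along the middle axis regardless of the input rank. A minimal NumPy sketch of that reshape trick (an illustration with made-up helper names, not MXNet code)::

    import numpy as np

    def concat_via_3d_views(arrays, axis):
        # Mirror of the removed ConcatOp::Forward: collapse each input to
        # (leading, mid_i, trailing) and concatenate along the middle axis.
        out_shape = list(arrays[0].shape)
        out_shape[axis] = sum(a.shape[axis] for a in arrays)
        leading = int(np.prod(out_shape[:axis]))
        trailing = int(np.prod(out_shape[axis + 1:]))
        views = [a.reshape(leading, a.shape[axis], trailing) for a in arrays]
        return np.concatenate(views, axis=1).reshape(out_shape)

    x = np.array([[1, 1], [2, 2]])
    y = np.array([[3, 3], [4, 4], [5, 5]])
    print(concat_via_3d_views([x, y], axis=0))  # rows of x followed by rows of y
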
- -Example:: - - x = [[1,1],[2,2]] - y = [[3,3],[4,4],[5,5]] - z = [[6,6], [7,7],[8,8]] - - concat(x,y,z,dim=0) = [[ 1., 1.], - [ 2., 2.], - [ 3., 3.], - [ 4., 4.], - [ 5., 5.], - [ 6., 6.], - [ 7., 7.], - [ 8., 8.]] - - Note that you cannot concat x,y,z along dimension 1 since dimension - 0 is not the same for all the input arrays. - - concat(y,z,dim=1) = [[ 3., 3., 6., 6.], - [ 4., 4., 7., 7.], - [ 5., 5., 8., 8.]] - -)code" ADD_FILELINE) -.set_num_inputs([](const NodeAttrs& attrs) { - const ConcatParam& params = nnvm::get(attrs.parsed); - return params.num_args; -}) -.set_num_outputs(1) -.set_attr_parser(ParamParser) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - const ConcatParam& params = nnvm::get(attrs.parsed); - std::vector ret; - for (int i = 0; i < params.num_args; ++i) { - ret.push_back(std::string("arg") + std::to_string(i)); - } - return ret; -}) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; -}) -#endif -.set_attr("FInferShape", ConcatShape) -.set_attr("FInferType", ConcatType) -.set_attr("FInferStorageType", ConcatForwardInferStorageType) -.set_attr("FCompute", ConcatCompute) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FComputeEx", ConcatComputeExCPU) -#endif -.set_attr("FGradient", ConcatGrad{"_backward_Concat"}) -.set_attr("key_var_num_args", "num_args") -.add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate") -.add_arguments(ConcatParam::__FIELDS__()); - -NNVM_REGISTER_OP(Concat).add_alias("concat"); - -NNVM_REGISTER_OP(_backward_Concat) -.set_num_outputs([](const NodeAttrs& attrs) { - const ConcatParam& params = nnvm::get(attrs.parsed); - return params.num_args; -}) -.set_attr_parser(ParamParser) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; -}) -#endif -.set_attr("TIsBackward", true) -.set_attr("FInferStorageType", BackwardConcatStorageType) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FComputeEx", ConcatGradComputeExCPU) -#endif -.set_attr("FCompute", ConcatGradCompute); - -} // namespace op -} // namespace mxnet diff --git a/src/operator/nn/convolution-inl.h b/src/operator/nn/convolution-inl.h index 6204f75c4697..1613da6c85d1 100644 --- a/src/operator/nn/convolution-inl.h +++ b/src/operator/nn/convolution-inl.h @@ -22,7 +22,7 @@ * \file convolution-inl.h * \brief * \ref: https://github.com/Yangqing/caffe/wiki/Convolution-in-Caffe:-a-memo - * \author Bing Xu, Jun Wu, Da Zheng + * \author Bing Xu, Jun Wu */ #ifndef MXNET_OPERATOR_NN_CONVOLUTION_INL_H_ #define MXNET_OPERATOR_NN_CONVOLUTION_INL_H_ @@ -148,9 +148,9 @@ namespace mxnet { namespace op { template -class ConvolutionOp { +class ConvolutionOp : public Operator { public: - void Init(ConvolutionParam p) { + explicit ConvolutionOp(ConvolutionParam p) { this->param_ = p; // convert MBytes first to Bytes and then to elements. 
param_.workspace = (param_.workspace << 20) / sizeof(DType); @@ -160,10 +160,11 @@ class ConvolutionOp { << "Only support NCW, NCHW and NCDHW layout"; } - void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(req[conv::kOut], kWriteTo); @@ -232,19 +233,18 @@ class ConvolutionOp { } } - void Backward(const OpContext &ctx, - const std::vector& out_grad, - const std::vector& in_data, - const std::vector& req, - const std::vector& in_grad) { + virtual void Backward(const OpContext &ctx, + const std::vector& out_grad, + const std::vector& in_data, + const std::vector& out_data, + const std::vector& req, + const std::vector& in_grad, + const std::vector& aux_args) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(out_grad.size(), 1U); - // We expect 2 inputs: in data and weight. We don't need bias for - // computing gradient. size_t expected = param_.no_bias == 0 ? 3 : 2; - CHECK_EQ(in_data.size(), expected); - CHECK_EQ(in_grad.size(), expected); + CHECK(in_data.size() == expected && in_grad.size() == expected); CHECK_EQ(req.size(), expected); CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true); LayerSetUp(in_grad[conv::kData].shape_, out_grad[conv::kOut].shape_); @@ -386,35 +386,299 @@ class ConvolutionOp { }; // class ConvolutionOp template -void ConvolutionCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const ConvolutionParam& param = nnvm::get(attrs.parsed); - MSHADOW_REAL_TYPE_SWITCH(inputs[conv::kData].type_flag_, DType, { - ConvolutionOp op; - op.Init(param); - op.Forward(ctx, inputs, req, outputs); - }); -} +Operator* CreateOp(ConvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx); -template -void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const ConvolutionParam& param = nnvm::get(attrs.parsed); - std::vector in_data(inputs.begin() + 1, inputs.end()); - const TBlob &out_grad = inputs[0]; - const std::vector &in_grad = outputs; - - MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, { - ConvolutionOp op; - op.Init(param); - op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); - }); -} +#if DMLC_USE_CXX11 +class ConvolutionProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (!param_.no_bias) { + return {"data", "weight", "bias"}; + } else { + return {"data", "weight"}; + } + } + + void Init(const std::vector >& kwargs) override { + using namespace mshadow; + param_.Init(kwargs); + if (param_.kernel.ndim() == 1) { + param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW; + if (param_.stride.ndim() == 0) param_.stride = Shape1(1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1); + if (param_.pad.ndim() == 0) param_.pad = Shape1(0); + } else if (param_.kernel.ndim() == 2) { + param_.layout = param_.layout ? 
param_.layout.value() : mshadow::kNCHW; + if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); + } else { + CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D convolution not supported"; + param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW; + if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); + } + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + if (!param_.no_bias) { + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; + } else { + CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; + } + // CHECK_EQ(out_shape->size(), 1) << "Output: [output]"; + out_shape->resize(1, TShape()); + const TShape &dshp = (*in_shape)[conv::kData]; + if (dshp.ndim() == 0) return false; + + if (param_.kernel.ndim() == 1) { + // 1d conv + CHECK_EQ(dshp.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x"; + Shape<3> dshape = ConvertLayout(dshp.get<3>(), param_.layout.value(), kNCW); + Shape<3> wshape = Shape3(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, + param_.kernel[0]); + wshape = ConvertLayout(wshape, kNCW, param_.layout.value()); + wshape[0] *= param_.num_group; + SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); + } + + const index_t dilated_ksize_x = param_.DilatedKernelSize(0); + CHECK_EQ(dshape[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + Shape<3> oshape; + oshape[0] = dshape[0]; + oshape[1] = param_.num_filter; + oshape[2] = dshape[2] ? + (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : 0; + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value())); + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. + // 2) We can back-calculate the input height/width if the corresponding stride is 1. 
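
The 1-D/2-D/3-D branches all use the same output-size arithmetic: compute the dilated kernel extent, then (input + 2*pad - dilated_kernel) / stride + 1, and back-solve the input only when the stride is 1. A small Python sketch of both directions, assuming DilatedKernelSize(i) is 1 + (kernel[i] - 1) * dilate[i] (the usual definition)::

    def conv_out_dim(in_dim, kernel, stride, pad, dilate=1):
        # out = (in + 2*pad - dilated_kernel) / stride + 1, as in InferShape above.
        dilated_k = 1 + (kernel - 1) * dilate
        return (in_dim + 2 * pad - dilated_k) // stride + 1

    def back_calc_in_dim(out_dim, kernel, stride, pad, dilate=1):
        # The "incomplete shape inference" step: exact only for unit stride.
        assert stride == 1, "input size is only recoverable when stride == 1"
        dilated_k = 1 + (kernel - 1) * dilate
        return out_dim + dilated_k - 1 - 2 * pad

    assert conv_out_dim(224, kernel=3, stride=1, pad=1) == 224
    assert back_calc_in_dim(224, kernel=3, stride=1, pad=1) == 224
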
+ oshape = ConvertLayout((*out_shape)[0].get<3>(), param_.layout.value(), kNCW); + dshape[0] = oshape[0]; + if (oshape[2] && param_.stride[0] == 1) { + dshape[2] = oshape[2] + dilated_ksize_x - 1 - 2 * param_.pad[0]; + } + SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, + ConvertLayout(dshape, kNCW, param_.layout.value())); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(dilated_ksize_x, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; + } + return true; + } else if (param_.kernel.ndim() == 2) { + // 2d conv + CHECK_EQ(dshp.ndim(), 4U) \ + << "Input data should be 4D in batch-num_filter-y-x"; + Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW); + Shape<4> wshape = Shape4(param_.num_filter / param_.num_group, + dshape[1] / param_.num_group, + param_.kernel[0], param_.kernel[1]); + wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); + wshape[0] *= param_.num_group; + SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); + } + + const index_t dilated_ksize_y = param_.DilatedKernelSize(0); + const index_t dilated_ksize_x = param_.DilatedKernelSize(1); + CHECK_EQ(dshape[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + Shape<4> oshape; + oshape[0] = dshape[0]; + oshape[1] = param_.num_filter; + oshape[2] = dshape[2] ? + (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_y) / param_.stride[0] + 1 : 0; + oshape[3] = dshape[3] ? + (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_x) / param_.stride[1] + 1 : 0; + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. + // 2) We can back-calculate the input height/width if the corresponding stride is 1. 
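
The weight-shape handling is likewise shared across the branches: each group sees only in_channels / num_group input channels, the per-group filter count is num_filter / num_group, and the leading dimension is multiplied back by num_group before being assigned. A sketch with hypothetical helper names, assuming NCHW layout::

    def grouped_conv_weight_shape(num_filter, in_channels, kernel_hw, num_group):
        # Mirrors wshape = (num_filter/num_group, in_channels/num_group, kh, kw)
        # followed by wshape[0] *= num_group (NCHW layout assumed).
        assert in_channels % num_group == 0, "in_channels must divide evenly by num_group"
        assert num_filter % num_group == 0, "num_filter must divide evenly by num_group"
        kh, kw = kernel_hw
        return (num_filter, in_channels // num_group, kh, kw)

    assert grouped_conv_weight_shape(64, 32, (3, 3), num_group=2) == (64, 16, 3, 3)
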
+ oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW); + dshape[0] = oshape[0]; + if (oshape[2] && param_.stride[0] == 1) { + dshape[2] = oshape[2] + dilated_ksize_y - 1 - 2 * param_.pad[0]; + } + if (oshape[3] && param_.stride[1] == 1) { + dshape[3] = oshape[3] + dilated_ksize_x - 1 - 2 * param_.pad[1]; + } + SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, + ConvertLayout(dshape, kNCHW, param_.layout.value())); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(dilated_ksize_y, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; + } + if (dshape[3] != 0) { + CHECK_LE(dilated_ksize_x, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input"; + } + return true; + } else if (param_.kernel.ndim() == 3) { + // 3d conv + CHECK_EQ(dshp.ndim(), 5U) \ + << "Input data should be 5D in batch-num_filter-depth-y-x"; + Shape<5> dshape = ConvertLayout(dshp.get<5>(), param_.layout.value(), kNCDHW); + Shape<5> wshape = Shape5(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, + param_.kernel[0], param_.kernel[1], param_.kernel[2]); + wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value()); + wshape[0] *= param_.num_group; + SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); + } + + // Note: 3D dilation currently not supported. + // Calculations below done to preserve symmetry with 1D/2D code. + const index_t dilated_ksize_d = param_.DilatedKernelSize(0); + const index_t dilated_ksize_y = param_.DilatedKernelSize(1); + const index_t dilated_ksize_x = param_.DilatedKernelSize(2); + CHECK_EQ(dshape[1] % param_.num_group, 0U) + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + CHECK_EQ(param_.dilate.Size(), 1U) + << "Dilate is not supported in 3d convolution"; + Shape<5> oshape; + oshape[0] = dshape[0]; + oshape[1] = param_.num_filter; + oshape[2] = dshape[2] ? + (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_d) / param_.stride[0] + 1 : 0; + oshape[3] = dshape[3] ? + (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_y) / param_.stride[1] + 1 : 0; + oshape[4] = dshape[4] ? + (AddPad(dshape[4], param_.pad[2]) - dilated_ksize_x) / param_.stride[2] + 1 : 0; + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value())); + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. + // 2) We can back-calculate the input depth/height/width if the corresponding stride is 1. 
+ oshape = ConvertLayout((*out_shape)[0].get<5>(), param_.layout.value(), kNCDHW); + dshape[0] = oshape[0]; + if (oshape[2] && param_.stride[0] == 1) { + dshape[2] = oshape[2] + dilated_ksize_d - 1 - 2 * param_.pad[0]; + } + if (oshape[3] && param_.stride[1] == 1) { + dshape[3] = oshape[3] + dilated_ksize_y - 1 - 2 * param_.pad[1]; + } + if (oshape[4] && param_.stride[2] == 1) { + dshape[4] = oshape[4] + dilated_ksize_x - 1 - 2 * param_.pad[2]; + } + SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, + ConvertLayout(dshape, kNCDHW, param_.layout.value())); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(dilated_ksize_d, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; + } + if (dshape[3] != 0) { + CHECK_LE(dilated_ksize_y, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input"; + } + if (dshape[4] != 0) { + CHECK_LE(dilated_ksize_x, AddPad(dshape[4], param_.pad[2])) << "kernel size exceed input"; + } + return true; + } else { + LOG(FATAL) << "Unknown convolution type"; + return false; + } + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 1U); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); + } + } + out_type->clear(); + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new ConvolutionProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "Convolution"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {out_grad[conv::kOut], in_data[conv::kData], in_data[conv::kWeight]}; + } + + std::vector ForwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + std::vector BackwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + private: + // Adds symmetric padding to a data input (in one dimension) + index_t AddPad(index_t dsize, index_t pad) const { + return dsize + 2 * pad; + } + + ConvolutionParam param_; +}; // class ConvolutionProp +#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_NN_CONVOLUTION_INL_H_ diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index 951063fb4b2f..ef8ec9034db2 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -21,13 +21,15 @@ * Copyright (c) 2017 by Contributors * \file convolution.cc * \brief - * \author Bing Xu, Jun Wu, Da Zheng + * \author Bing Xu, Jun Wu */ #include "./convolution-inl.h" -#include "../elemwise_op_common.h" -#include "./mkldnn/mkldnn_ops-inl.h" -#include "./mkldnn/mkldnn_base-inl.h" +#if MXNET_USE_MKL2017 == 1 +#include +#include "../mkl/mkl_memory-inl.h" +#include "../mkl/mkl_convolution-inl.h" +#endif // MXNET_USE_MKL2017 #if MXNET_USE_NNPACK == 1 #include "./nnpack/nnpack_convolution-inl.h" #endif // MXNET_USE_NNPACK @@ -36,351 +38,63 @@ namespace mxnet 
{ namespace op { DMLC_REGISTER_PARAMETER(ConvolutionParam); -static inline index_t AddPad(index_t dsize, index_t pad) { - return dsize + 2 * pad; -} - -static inline std::vector ListArguments(const ConvolutionParam& param_) { - if (!param_.no_bias) { - return {"data", "weight", "bias"}; - } else { - return {"data", "weight"}; +template<> +Operator* CreateOp(ConvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx) { + Operator *op = NULL; + // If 1D convolution, use MXNet implementation + if (param.kernel.ndim() == 1) { + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new ConvolutionOp(param); + }) + return op; } -} - -#if MXNET_USE_MKLDNN == 1 -static void ConvolutionComputeExCPU(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - if (SupportMKLDNNConv(inputs[0])) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNNConvolutionForward(attrs, ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(ConvolutionCompute, attrs, ctx, inputs, req, outputs); - return; - } - FallBackCompute(ConvolutionCompute, attrs, ctx, inputs, req, outputs); -} - -static void ConvolutionGradComputeExCPU(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - if (SupportMKLDNNConv(inputs[0])) { - MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); - MKLDNNConvolutionBackward(attrs, ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(ConvolutionGradCompute, attrs, ctx, inputs, req, outputs); - return; - } - FallBackCompute(ConvolutionGradCompute, attrs, ctx, inputs, req, outputs); -} -#endif - -static bool ConvolutionShape(const nnvm::NodeAttrs& attrs, - std::vector *in_shape, - std::vector *out_shape) { - using namespace mshadow; - const ConvolutionParam& param_ = nnvm::get(attrs.parsed); - if (!param_.no_bias) { - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; - } else { - CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; - } - // CHECK_EQ(out_shape->size(), 1) << "Output: [output]"; - out_shape->resize(1, TShape()); - const TShape &dshp = (*in_shape)[conv::kData]; - if (dshp.ndim() == 0) return false; - - if (param_.kernel.ndim() == 1) { - // 1d conv - CHECK_EQ(dshp.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x"; - Shape<3> dshape = ConvertLayout(dshp.get<3>(), param_.layout.value(), kNCW); - Shape<3> wshape = Shape3(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, - param_.kernel[0]); - wshape = ConvertLayout(wshape, kNCW, param_.layout.value()); - wshape[0] *= param_.num_group; - SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); - } - - const index_t dilated_ksize_x = param_.DilatedKernelSize(0); - CHECK_EQ(dshape[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - Shape<3> oshape; - oshape[0] = dshape[0]; - oshape[1] = param_.num_filter; - oshape[2] = dshape[2] ? 
- (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : 0; - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value())); - // Perform incomplete shape inference. Fill in the missing values in data shape. - // 1) We can always fill in the batch_size. - // 2) We can back-calculate the input height/width if the corresponding stride is 1. - oshape = ConvertLayout((*out_shape)[0].get<3>(), param_.layout.value(), kNCW); - dshape[0] = oshape[0]; - if (oshape[2] && param_.stride[0] == 1) { - dshape[2] = oshape[2] + dilated_ksize_x - 1 - 2 * param_.pad[0]; - } - SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, - ConvertLayout(dshape, kNCW, param_.layout.value())); - // Check whether the kernel sizes are valid - if (dshape[2] != 0) { - CHECK_LE(dilated_ksize_x, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; - } - return true; - } else if (param_.kernel.ndim() == 2) { - // 2d conv - CHECK_EQ(dshp.ndim(), 4U) \ - << "Input data should be 4D in batch-num_filter-y-x"; - Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW); - Shape<4> wshape = Shape4(param_.num_filter / param_.num_group, - dshape[1] / param_.num_group, - param_.kernel[0], param_.kernel[1]); - wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); - wshape[0] *= param_.num_group; - SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); - } - - const index_t dilated_ksize_y = param_.DilatedKernelSize(0); - const index_t dilated_ksize_x = param_.DilatedKernelSize(1); - CHECK_EQ(dshape[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - Shape<4> oshape; - oshape[0] = dshape[0]; - oshape[1] = param_.num_filter; - oshape[2] = dshape[2] ? - (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_y) / param_.stride[0] + 1 : 0; - oshape[3] = dshape[3] ? - (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_x) / param_.stride[1] + 1 : 0; - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); - // Perform incomplete shape inference. Fill in the missing values in data shape. - // 1) We can always fill in the batch_size. - // 2) We can back-calculate the input height/width if the corresponding stride is 1. 
- oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW); - dshape[0] = oshape[0]; - if (oshape[2] && param_.stride[0] == 1) { - dshape[2] = oshape[2] + dilated_ksize_y - 1 - 2 * param_.pad[0]; - } - if (oshape[3] && param_.stride[1] == 1) { - dshape[3] = oshape[3] + dilated_ksize_x - 1 - 2 * param_.pad[1]; - } - SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, - ConvertLayout(dshape, kNCHW, param_.layout.value())); - // Check whether the kernel sizes are valid - if (dshape[2] != 0) { - CHECK_LE(dilated_ksize_y, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; - } - if (dshape[3] != 0) { - CHECK_LE(dilated_ksize_x, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input"; - } - return true; - } else if (param_.kernel.ndim() == 3) { - // 3d conv - CHECK_EQ(dshp.ndim(), 5U) \ - << "Input data should be 5D in batch-num_filter-depth-y-x"; - Shape<5> dshape = ConvertLayout(dshp.get<5>(), param_.layout.value(), kNCDHW); - Shape<5> wshape = Shape5(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, - param_.kernel[0], param_.kernel[1], param_.kernel[2]); - wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value()); - wshape[0] *= param_.num_group; - SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); - } - - // Note: 3D dilation currently not supported. - // Calculations below done to preserve symmetry with 1D/2D code. - const index_t dilated_ksize_d = param_.DilatedKernelSize(0); - const index_t dilated_ksize_y = param_.DilatedKernelSize(1); - const index_t dilated_ksize_x = param_.DilatedKernelSize(2); - CHECK_EQ(dshape[1] % param_.num_group, 0U) - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - CHECK_EQ(param_.dilate.Size(), 1U) - << "Dilate is not supported in 3d convolution"; - Shape<5> oshape; - oshape[0] = dshape[0]; - oshape[1] = param_.num_filter; - oshape[2] = dshape[2] ? - (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_d) / param_.stride[0] + 1 : 0; - oshape[3] = dshape[3] ? - (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_y) / param_.stride[1] + 1 : 0; - oshape[4] = dshape[4] ? - (AddPad(dshape[4], param_.pad[2]) - dilated_ksize_x) / param_.stride[2] + 1 : 0; - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value())); - // Perform incomplete shape inference. Fill in the missing values in data shape. - // 1) We can always fill in the batch_size. - // 2) We can back-calculate the input depth/height/width if the corresponding stride is 1. 
- oshape = ConvertLayout((*out_shape)[0].get<5>(), param_.layout.value(), kNCDHW); - dshape[0] = oshape[0]; - if (oshape[2] && param_.stride[0] == 1) { - dshape[2] = oshape[2] + dilated_ksize_d - 1 - 2 * param_.pad[0]; +#if MXNET_USE_MKL2017 == 1 + if ((param.dilate[0] == 1 && param.dilate[1] == 1) + && param.kernel.ndim() == 2) { + switch (dtype) { + case mshadow::kFloat32: + return new MKLConvolutionOp(param); + case mshadow::kFloat64: + return new MKLConvolutionOp(param); + default: + break; } - if (oshape[3] && param_.stride[1] == 1) { - dshape[3] = oshape[3] + dilated_ksize_y - 1 - 2 * param_.pad[1]; - } - if (oshape[4] && param_.stride[2] == 1) { - dshape[4] = oshape[4] + dilated_ksize_x - 1 - 2 * param_.pad[2]; - } - SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, - ConvertLayout(dshape, kNCDHW, param_.layout.value())); - // Check whether the kernel sizes are valid - if (dshape[2] != 0) { - CHECK_LE(dilated_ksize_d, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; - } - if (dshape[3] != 0) { - CHECK_LE(dilated_ksize_y, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input"; - } - if (dshape[4] != 0) { - CHECK_LE(dilated_ksize_x, AddPad(dshape[4], param_.pad[2])) << "kernel size exceed input"; - } - return true; - } else { - LOG(FATAL) << "Unknown convolution type"; - return false; } -} - -static bool ConvolutionType(const nnvm::NodeAttrs& attrs, - std::vector *in_type, std::vector *out_type) { - const ConvolutionParam& param_ = nnvm::get(attrs.parsed); - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param_)[i]); +#endif +#if MXNET_USE_NNPACK == 1 + const size_t batch_size = (*in_shape)[0][0]; + if ((param.dilate[0] == 1 && param.dilate[1] == 1) + && param.kernel.ndim() == 2 && (!param.no_bias) + && param.num_group == 1 && (batch_size == 1 || + ((batch_size > 1) && (param.stride[0] == 1) && + (param.stride[1] == 1)))) { + switch (dtype) { + case mshadow::kFloat32: + return new NNPACKConvolutionOp(param); + default: + break; } } - out_type->clear(); - out_type->push_back(dtype); - return true; -} - -inline static bool ConvStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - const ConvolutionParam& param = nnvm::get(attrs.parsed); - uint32_t in_expected = param.no_bias ? 2 : 3; - CHECK_EQ(in_attrs->size(), in_expected); - CHECK_EQ(out_attrs->size(), 1); - - DispatchMode wanted_mode; -#if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) - wanted_mode = DispatchMode::kFComputeEx; - else #endif - wanted_mode = DispatchMode::kFCompute; - return storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, wanted_mode); + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new ConvolutionOp(param); + }) + return op; } -inline static bool BackwardConvStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - const ConvolutionParam& param = nnvm::get(attrs.parsed); - uint32_t in_expected = param.no_bias ? 3 : 4; - uint32_t out_expected = param.no_bias ? 
2 : 3; - CHECK_EQ(in_attrs->size(), in_expected); - CHECK_EQ(out_attrs->size(), out_expected); - - DispatchMode wanted_mode; -#if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) - wanted_mode = DispatchMode::kFComputeEx; - else -#endif - wanted_mode = DispatchMode::kFCompute; - return storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, wanted_mode); -} - -static void ConvolutionParamParser(nnvm::NodeAttrs* attrs) { - using namespace mshadow; - ConvolutionParam param_; - try { - param_.Init(attrs->dict); - } catch (const dmlc::ParamError& e) { - std::ostringstream os; - os << e.what(); - os << ", in operator " << attrs->op->name << "(" - << "name=\"" << attrs->name << "\""; - for (const auto& k : attrs->dict) { - os << ", " << k.first << "=\"" << k.second << "\""; - } - os << ")"; - throw dmlc::ParamError(os.str()); - } - - if (param_.kernel.ndim() == 1) { - param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW; - if (param_.stride.ndim() == 0) param_.stride = Shape1(1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1); - if (param_.pad.ndim() == 0) param_.pad = Shape1(0); - } else if (param_.kernel.ndim() == 2) { - param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW; - if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); - } else { - CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D convolution not supported"; - param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW; - if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); - } - attrs->parsed = std::move(param_); +// DO_BIND_DISPATCH comes from operator_common.h +Operator *ConvolutionProp::CreateOperatorEx(Context ctx, + std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx); } -struct ConvolutionGrad { - const char *op_name; - std::vector operator()(const nnvm::NodePtr& n, - const std::vector& ograds) const { - const ConvolutionParam& param = nnvm::get(n->attrs.parsed); - std::vector heads(ograds.begin(), ograds.end()); - heads.push_back(n->inputs[conv::kData]); - heads.push_back(n->inputs[conv::kWeight]); - if (!param.no_bias) - heads.push_back(n->inputs[conv::kBias]); - return MakeGradNode(op_name, n, heads, n->attrs.dict); - } -}; - -NNVM_REGISTER_OP(Convolution) +MXNET_REGISTER_OP_PROPERTY(Convolution, ConvolutionProp) .describe(R"code(Compute *N*-D convolution on *(N+2)*-D input. In the 2-D convolution, given input data with shape *(batch_size, @@ -454,51 +168,10 @@ There are other options to tune the performance. the performance. )code" ADD_FILELINE) -.set_num_inputs([](const NodeAttrs& attrs) { - const ConvolutionParam& params = nnvm::get(attrs.parsed); - return params.no_bias ? 
2 : 3; -}) -.set_num_outputs(1) -.set_attr_parser(ConvolutionParamParser) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - const ConvolutionParam& params = nnvm::get(attrs.parsed); - if (params.no_bias) - return std::vector{"data", "weight"}; - else - return std::vector{"data", "weight", "bias"}; -}) -.set_attr("FInferShape", ConvolutionShape) -.set_attr("FInferType", ConvolutionType) -.set_attr("FInferStorageType", ConvStorageType) -.set_attr("FCompute", ConvolutionCompute) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FComputeEx", ConvolutionComputeExCPU) -#endif -.set_attr("FGradient", ConvolutionGrad{"_backward_Convolution"}) -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; -}) .add_argument("data", "NDArray-or-Symbol", "Input data to the ConvolutionOp.") .add_argument("weight", "NDArray-or-Symbol", "Weight matrix.") .add_argument("bias", "NDArray-or-Symbol", "Bias parameter.") .add_arguments(ConvolutionParam::__FIELDS__()); -NNVM_REGISTER_OP(_backward_Convolution) -.set_num_outputs([](const NodeAttrs& attrs) { - const ConvolutionParam& params = nnvm::get(attrs.parsed); - return params.no_bias ? 2 : 3; -}) -.set_attr("TIsBackward", true) -.set_attr("FInferStorageType", BackwardConvStorageType) -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; -}) -.set_attr_parser(ConvolutionParamParser) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FComputeEx", ConvolutionGradComputeExCPU) -#endif -.set_attr("FCompute", ConvolutionGradCompute); - } // namespace op } // namespace mxnet diff --git a/src/operator/nn/convolution.cu b/src/operator/nn/convolution.cu index d7f9e564a603..7234daf0d614 100644 --- a/src/operator/nn/convolution.cu +++ b/src/operator/nn/convolution.cu @@ -21,136 +21,36 @@ * Copyright (c) 2017 by Contributors * \file convolution.cu * \brief - * \author Bing Xu, Jun Wu, Da Zheng + * \author Bing Xu, Jun Wu */ #include "./convolution-inl.h" #include -#include "./depthwise_convolution-inl.h" #if MXNET_USE_CUDNN == 1 #include "./cudnn/cudnn_convolution-inl.h" #endif // MXNET_USE_CUDNN +#include "./depthwise_convolution-inl.h" + namespace mxnet { namespace op { -#if MXNET_USE_CUDNN == 1 -template -static CuDNNConvolutionOp &GetCuDNNConvOp(const ConvolutionParam& param, - int forward_compute_type, int backward_compute_type, - const std::vector& in_shape, const std::vector& out_shape, - const Context& ctx) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local CuDNNConvolutionOp op; -#else - static MX_THREAD_LOCAL CuDNNConvolutionOp op; -#endif - op.Init(param, forward_compute_type, backward_compute_type, - in_shape, out_shape, ctx); - return op; -} -#endif - -template<> -void ConvolutionCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const ConvolutionParam& param = nnvm::get(attrs.parsed); - int dtype = inputs[conv::kData].type_flag_; - - // If 1D convolution, use MXNet implementation - if (param.kernel.ndim() == 1) { - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - ConvolutionOp op; - op.Init(param); - op.Forward(ctx, inputs, req, outputs); - }) - return; - } else if (param.num_filter == param.num_group && - param.layout.value() == mshadow::kNCHW && - param.num_filter == inputs[conv::kData].shape_[1] && - param.kernel.ndim() == 2 && - param.dilate == mshadow::Shape2(1, 1) && - dtype == mshadow::kFloat32) { - std::vector in_shape(inputs.size()); - std::vector out_shape(1, 
outputs[0].shape_); - for (size_t i = 0; i < in_shape.size(); i++) - in_shape[i] = inputs[i].shape_; - DepthwiseConvolutionOp op; - op.Init(param, in_shape, out_shape); - op.Forward(ctx, inputs, req, outputs); - return; - } - -#if MXNET_USE_CUDNN == 1 - // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16). - int compute_type = (dtype == mshadow::kFloat16) ? mshadow::kFloat32 : dtype; - - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - if (param.cudnn_off) { - ConvolutionOp op; - op.Init(param); - op.Forward(ctx, inputs, req, outputs); - } else if (!CuDNNConvolutionOp::Supports(param, - compute_type, compute_type, ctx.run_ctx.ctx)) { - LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied."; - ConvolutionOp op; - op.Init(param); - op.Forward(ctx, inputs, req, outputs); - } else { - std::vector in_shape(inputs.size()); - std::vector out_shape(1, outputs[0].shape_); - for (size_t i = 0; i < in_shape.size(); i++) - in_shape[i] = inputs[i].shape_; - CuDNNConvolutionOp &op = GetCuDNNConvOp(param, - compute_type, compute_type, in_shape, out_shape, ctx.run_ctx.ctx); - op.Forward(ctx, inputs, req, outputs); - } - }) -#else - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - ConvolutionOp op; - op.Init(param); - op.Forward(ctx, inputs, req, outputs); - }) -#endif // MXNET_USE_CUDNN -} - template<> -void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const ConvolutionParam& param = nnvm::get(attrs.parsed); - std::vector in_data(inputs.begin() + 1, inputs.end()); - const TBlob &out_grad = inputs[0]; - const std::vector &in_grad = outputs; - int dtype = out_grad.type_flag_; - - // If 1D convolution, use MXNet implementation - if (param.kernel.ndim() == 1) { - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - ConvolutionOp op; - op.Init(param); - op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); - }) - return; - } else if (param.num_filter == param.num_group && +Operator* CreateOp(ConvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx) { + Operator *op = NULL; + + // depth wise conv + if (param.num_filter == param.num_group && param.layout.value() == mshadow::kNCHW && - param.num_filter == in_data[conv::kData].shape_[1] && + param.num_filter == (*in_shape)[conv::kData][1] && param.kernel.ndim() == 2 && param.dilate == mshadow::Shape2(1, 1) && dtype == mshadow::kFloat32) { - // The first element stores out grad. 
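
Both the removed FCompute dispatch and the restored CreateOp select the depthwise kernel under the same conditions: NCHW layout, 2-D kernel, no dilation, float32 data, and num_filter equal to both num_group and the number of input channels. An illustrative predicate (the helper name and string encodings are hypothetical)::

    def use_depthwise_conv(num_filter, num_group, in_channels, layout,
                           kernel_ndim, dilate, dtype):
        # Condition under which the GPU path picks DepthwiseConvolutionOp.
        return (num_filter == num_group
                and layout == "NCHW"
                and num_filter == in_channels
                and kernel_ndim == 2
                and dilate == (1, 1)
                and dtype == "float32")

    assert use_depthwise_conv(32, 32, 32, "NCHW", 2, (1, 1), "float32")
    assert not use_depthwise_conv(32, 1, 32, "NCHW", 2, (1, 1), "float32")
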
- std::vector in_shape(in_data.size()); - std::vector out_shape(1, out_grad.shape_); - for (size_t i = 0; i < in_shape.size(); i++) - in_shape[i] = in_data[i].shape_; - DepthwiseConvolutionOp op; - op.Init(param, in_shape, out_shape); - op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); - return; + op = new DepthwiseConvolutionOp(param, *in_shape, *out_shape); + return op; } #if MXNET_USE_CUDNN == 1 @@ -159,41 +59,23 @@ void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { if (param.cudnn_off) { - ConvolutionOp op; - op.Init(param); - op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); - } else if (!CuDNNConvolutionOp::Supports(param, - compute_type, compute_type, ctx.run_ctx.ctx)) { + op = new ConvolutionOp(param); + } else if (!CuDNNConvolutionOp::Supports(param, compute_type, compute_type, ctx)) { LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied."; - ConvolutionOp op; - op.Init(param); - op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + op = new ConvolutionOp(param); } else { - // The first element stores out grad. - std::vector in_shape(in_data.size()); - std::vector out_shape(1, out_grad.shape_); - for (size_t i = 0; i < in_shape.size(); i++) - in_shape[i] = in_data[i].shape_; - CuDNNConvolutionOp &op = GetCuDNNConvOp(param, - compute_type, compute_type, in_shape, out_shape, ctx.run_ctx.ctx); - op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + op = new CuDNNConvolutionOp(param, compute_type, compute_type, + *in_shape, *out_shape, ctx); } }) #else MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - ConvolutionOp op; - op.Init(param); - op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + op = new ConvolutionOp(param); }) #endif // MXNET_USE_CUDNN + return op; } -NNVM_REGISTER_OP(Convolution) -.set_attr("FCompute", ConvolutionCompute); - -NNVM_REGISTER_OP(_backward_Convolution) -.set_attr("FCompute", ConvolutionGradCompute); - } // namespace op } // namespace mxnet diff --git a/src/operator/nn/cudnn/cudnn_activation-inl.h b/src/operator/nn/cudnn/cudnn_activation-inl.h index a89e7bfaf080..888528309cdf 100644 --- a/src/operator/nn/cudnn/cudnn_activation-inl.h +++ b/src/operator/nn/cudnn/cudnn_activation-inl.h @@ -33,19 +33,12 @@ namespace mxnet { namespace op { template -class CuDNNActivationOp { +class CuDNNActivationOp : public Operator { public: - CuDNNActivationOp() { - dtype_ = mshadow::DataType::kCudnnFlag; - #if CUDNN_MAJOR >= 5 - nan_prop_ = CUDNN_NOT_PROPAGATE_NAN; - CUDNN_CALL(cudnnCreateActivationDescriptor(&desc_)); - #endif - CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_)); - } - - void Init(const ActivationParam ¶m) { + explicit CuDNNActivationOp(ActivationParam param) { param_ = param; + init_cudnn_ = false; + dtype_ = mshadow::DataType::kCudnnFlag; switch (param_.act_type) { case activation::kReLU: mode_ = CUDNN_ACTIVATION_RELU; @@ -61,54 +54,67 @@ class CuDNNActivationOp { break; } #if CUDNN_MAJOR >= 5 + nan_prop_ = CUDNN_NOT_PROPAGATE_NAN; + CUDNN_CALL(cudnnCreateActivationDescriptor(&desc_)); CUDNN_CALL(cudnnSetActivationDescriptor(desc_, mode_, nan_prop_, relu_ceil_)); #endif } ~CuDNNActivationOp() { - CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_)); - #if CUDNN_MAJOR >= 5 - CUDNN_CALL(cudnnDestroyActivationDescriptor(desc_)); - #endif + if (init_cudnn_) { + CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_)); + #if CUDNN_MAJOR >= 5 + CUDNN_CALL(cudnnDestroyActivationDescriptor(desc_)); + #endif + } } - void 
Forward(const OpContext &ctx, const TBlob &in_data, - const OpReqType &req, const TBlob &out_data) { + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; + CHECK_EQ(in_data.size(), 1U); + CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); Tensor data; Tensor out; - if (in_data.ndim() == 2) { - Shape<4> dshape = Shape4(in_data.shape_[0], - in_data.shape_[1], 1, 1); - data = in_data.get_with_shape(dshape, s); - out = out_data.get_with_shape(dshape, s); + if (in_data[activation::kData].ndim() == 2) { + Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], + in_data[activation::kData].shape_[1], 1, 1); + data = in_data[activation::kData].get_with_shape(dshape, s); + out = out_data[activation::kOut].get_with_shape(dshape, s); } else { Shape<4> dshape; - index_t size_left = in_data.Size(); + index_t size_left = in_data[activation::kData].Size(); for (int i = 0; i < 3; ++i) { - if (i < in_data.ndim()) { - dshape[i] = in_data.shape_[i]; + if (i < in_data[activation::kData].ndim()) { + dshape[i] = in_data[activation::kData].shape_[i]; } else { dshape[i] = 1; } size_left /= dshape[i]; } dshape[3] = size_left; - data = in_data.get_with_shape(dshape, s); - out = out_data.get_with_shape(dshape, s); + data = in_data[activation::kData].get_with_shape(dshape, s); + out = out_data[activation::kOut].get_with_shape(dshape, s); } typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); - CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - data.shape_[0], - data.shape_[1], - data.shape_[2], - data.shape_[3])); + if (!init_cudnn_) { + init_cudnn_ = true; + CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_)); + CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + data.shape_[0], + data.shape_[1], + data.shape_[2], + data.shape_[3])); + } #if CUDNN_MAJOR <= 4 CUDNN_CALL(cudnnActivationForward(s->dnn_handle_, mode_, @@ -130,11 +136,20 @@ class CuDNNActivationOp { #endif } - void Backward(const OpContext &ctx, const TBlob &out_grad, - const TBlob &in_data, const TBlob &out_data, - const OpReqType &req, const TBlob &in_grad) { + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1U); + CHECK_EQ(in_data.size(), 1U); + CHECK_EQ(out_data.size(), 1U); + CHECK_EQ(req.size(), 1U); + CHECK_EQ(in_grad.size(), 1U); typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; Stream *s = ctx.get_stream(); @@ -142,38 +157,31 @@ class CuDNNActivationOp { Tensor data; Tensor output_data; Tensor input_grad; - if (in_grad.ndim() == 2) { - Shape<4> dshape = Shape4(in_grad.shape_[0], - in_grad.shape_[1], 1, 1); - data = in_data.get_with_shape(dshape, s); - grad = out_grad.get_with_shape(dshape, s); - output_data = out_data.get_with_shape(dshape, s); - input_grad = in_grad.get_with_shape(dshape, s); + if (in_grad[activation::kData].ndim() == 2) { + Shape<4> dshape = Shape4(in_grad[activation::kData].shape_[0], + in_grad[activation::kData].shape_[1], 1, 1); + data = in_data[activation::kData].get_with_shape(dshape, s); + 
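
The cuDNN activation descriptors are always 4-D, so CuDNNActivationOp folds an input of any rank into four dimensions: the first three axes are kept (padded with 1s when missing) and everything remaining is collapsed into the last axis. The same folding expressed in Python (fold_to_4d is an illustrative name)::

    def fold_to_4d(shape):
        # Keep up to the first three axes and fold the rest into the last one,
        # matching the dshape computation in CuDNNActivationOp.
        size_left = 1
        for d in shape:
            size_left *= d
        dshape = [1, 1, 1, 1]
        for i in range(3):
            if i < len(shape):
                dshape[i] = shape[i]
            size_left //= dshape[i]
        dshape[3] = size_left
        return tuple(dshape)

    assert fold_to_4d((8, 16)) == (8, 16, 1, 1)
    assert fold_to_4d((2, 3, 4, 5, 6)) == (2, 3, 4, 30)
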
grad = out_grad[activation::kOut].get_with_shape(dshape, s); + output_data = out_data[activation::kOut].get_with_shape(dshape, s); + input_grad = in_grad[activation::kData].get_with_shape(dshape, s); } else { Shape<4> dshape; - index_t size_left = in_grad.Size(); + index_t size_left = in_grad[activation::kData].Size(); for (int i = 0; i < 3; ++i) { - if (i < in_grad.ndim()) { - dshape[i] = in_grad.shape_[i]; + if (i < in_grad[activation::kData].ndim()) { + dshape[i] = in_grad[activation::kData].shape_[i]; } else { dshape[i] = 1; } size_left /= dshape[i]; } dshape[3] = size_left; - data = in_data.get_with_shape(dshape, s); - output_data = out_data.get_with_shape(dshape, s); - grad = out_grad.get_with_shape(dshape, s); - input_grad = in_grad.get_with_shape(dshape, s); + data = in_data[activation::kData].get_with_shape(dshape, s); + output_data = out_data[activation::kOut].get_with_shape(dshape, s); + grad = out_grad[activation::kOut].get_with_shape(dshape, s); + input_grad = in_grad[activation::kData].get_with_shape(dshape, s); } CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); - CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - data.shape_[0], - data.shape_[1], - data.shape_[2], - data.shape_[3])); #if CUDNN_MAJOR <= 4 CUDNN_CALL(cudnnActivationBackward(s->dnn_handle_, mode_, @@ -204,6 +212,7 @@ class CuDNNActivationOp { } private: + bool init_cudnn_; cudnnDataType_t dtype_; cudnnActivationMode_t mode_; cudnnTensorDescriptor_t shape_desc_; diff --git a/src/operator/nn/cudnn/cudnn_batch_norm-inl.h b/src/operator/nn/cudnn/cudnn_batch_norm-inl.h index e2337049060e..3dc9c8353a35 100644 --- a/src/operator/nn/cudnn/cudnn_batch_norm-inl.h +++ b/src/operator/nn/cudnn/cudnn_batch_norm-inl.h @@ -43,30 +43,28 @@ enum CuDNNBatchNormOpAuxiliary {kMovingMean, kMovingInvVar}; #if defined(__CUDACC__) template -class CuDNNBatchNormOp { +class CuDNNBatchNormOp : public Operator { public: - CuDNNBatchNormOp() { + explicit CuDNNBatchNormOp(BatchNormParam param) { using namespace mshadow; + CHECK_GE(param.eps, CUDNN_BN_MIN_EPSILON) + << "CuDNN requires eps to be no less than " << CUDNN_BN_MIN_EPSILON; + this->param_ = param; + init_cudnn_ = false; dtype_ = DataType::kCudnnFlag; // For float16 input type beta, gamma, mean, and average are stored in float32. // For other input types, these parameters have the same type as input dtype_param_ = (dtype_ == CUDNN_DATA_HALF) ? 
kFloat32 : DataType::kFlag; - CUDNN_CALL(cudnnCreateTensorDescriptor(&io_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&mean_desc_)); - } - - void Init(const BatchNormParam ¶m) { - CHECK_GE(param.eps, CUDNN_BN_MIN_EPSILON) - << "CuDNN requires eps to be no less than " << CUDNN_BN_MIN_EPSILON; - this->param_ = param; } ~CuDNNBatchNormOp() { - CUDNN_CALL(cudnnDestroyTensorDescriptor(io_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(mean_desc_)); + if (init_cudnn_) { + CUDNN_CALL(cudnnDestroyTensorDescriptor(io_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(mean_desc_)); + } } - void Forward(const OpContext &ctx, + virtual void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data, @@ -86,7 +84,29 @@ class CuDNNBatchNormOp { CHECK_GE(in_data[cudnnbatchnorm::kData].ndim(), 2); CHECK_LE(in_data[cudnnbatchnorm::kData].ndim(), 4); - Init(in_data[cudnnbatchnorm::kData]); + if (!init_cudnn_) { + for (int i = 0; i < 4; ++i) { + if (i < in_data[cudnnbatchnorm::kData].ndim()) { + shape_[i] = in_data[cudnnbatchnorm::kData].shape_[i]; + } else { + shape_[i] = 1; + } + } + CUDNN_CALL(cudnnCreateTensorDescriptor(&io_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&mean_desc_)); + CUDNN_CALL(cudnnSetTensor4dDescriptor(io_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + shape_[0], + shape_[1], + shape_[2], + shape_[3])); + CUDNN_CALL(cudnnDeriveBNTensorDescriptor(mean_desc_, + io_desc_, + CUDNN_BATCHNORM_SPATIAL)); + init_cudnn_ = true; + } + Stream *s = ctx.get_stream(); Tensor x = in_data[cudnnbatchnorm::kData].get_with_shape(shape_, s); @@ -157,7 +177,7 @@ class CuDNNBatchNormOp { }) } - void Backward(const OpContext &ctx, + virtual void Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data, @@ -173,7 +193,6 @@ class CuDNNBatchNormOp { CHECK(ctx.is_train && !param_.use_global_stats) << "use global statistics is not yet supported in CuDNNBatchNorm"; - Init(in_data[cudnnbatchnorm::kData]); Stream *s = ctx.get_stream(); Tensor x = in_data[cudnnbatchnorm::kData].get_with_shape(shape_, s); @@ -271,27 +290,7 @@ class CuDNNBatchNormOp { } private: - void Init(const TBlob &in_data) { - for (int i = 0; i < 4; ++i) { - if (i < in_data.ndim()) { - shape_[i] = in_data.shape_[i]; - } else { - shape_[i] = 1; - } - } - - CUDNN_CALL(cudnnSetTensor4dDescriptor(io_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - shape_[0], - shape_[1], - shape_[2], - shape_[3])); - CUDNN_CALL(cudnnDeriveBNTensorDescriptor(mean_desc_, - io_desc_, - CUDNN_BATCHNORM_SPATIAL)); - } - + bool init_cudnn_; cudnnDataType_t dtype_; int dtype_param_; cudnnTensorDescriptor_t io_desc_, mean_desc_; @@ -300,6 +299,91 @@ class CuDNNBatchNormOp { }; #endif // defined(__CUDACC__) +template +Operator *CreateOp_CuDNNv4(BatchNormParam param); + + +#if DMLC_USE_CXX11 +class CuDNNBatchNormProp : public OperatorProperty { + public: + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, gamma, beta]"; + const TShape &dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + in_shape->at(1) = TShape(Shape1(dshape[1])); + in_shape->at(2) = TShape(Shape1(dshape[1])); + + out_shape->clear(); + out_shape->push_back(dshape); + 
out_shape->push_back(Shape1(dshape[1])); + out_shape->push_back(Shape1(dshape[1])); + + aux_shape->clear(); + aux_shape->push_back(Shape1(dshape[1])); + aux_shape->push_back(Shape1(dshape[1])); + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new CuDNNBatchNormProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "CuDNNBatchNorm"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {out_grad[cudnnbatchnorm::kOut], + out_data[cudnnbatchnorm::kMean], + out_data[cudnnbatchnorm::kInvVar], + in_data[cudnnbatchnorm::kData], + in_data[cudnnbatchnorm::kGamma] + }; + } + + int NumVisibleOutputs() const override { + return 1; + } + + int NumOutputs() const override { + return 3; + } + + std::vector ListArguments() const override { + return {"data", "gamma", "beta"}; + } + + std::vector ListOutputs() const override { + return {"output", "mean", "inv_var"}; + } + + std::vector ListAuxiliaryStates() const override { + return {"moving_mean", "moving_inv_var"}; + } + + Operator* CreateOperator(Context ctx) const override; + + private: + BatchNormParam param_; +}; // class CuDNNBatchNormProp + +#endif // DMLC_USE_CXX11 #endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4 } // namespace op } // namespace mxnet diff --git a/src/operator/nn/cudnn/cudnn_batch_norm.cc b/src/operator/nn/cudnn/cudnn_batch_norm.cc index f1d229dd5421..e1e0c999b1fb 100644 --- a/src/operator/nn/cudnn/cudnn_batch_norm.cc +++ b/src/operator/nn/cudnn/cudnn_batch_norm.cc @@ -21,100 +21,46 @@ * Copyright (c) 2015 by Contributors * \file cudnn_batch_norm.cc * \brief - * \author Junyuan Xie, Da Zheng + * \author Junyuan Xie */ #include "./cudnn_batch_norm-inl.h" #include -#include "../../elemwise_op_common.h" namespace mxnet { namespace op { -#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4 - -static bool BatchNormShape(const nnvm::NodeAttrs& attrs, std::vector *in_shape, - std::vector *out_shape) { - using namespace mshadow; - CHECK_EQ(in_shape->size(), 5U) << "Input:[data, gamma, beta, moving_mean, moving_var]"; - const TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; - in_shape->at(1) = TShape(Shape1(dshape[1])); - in_shape->at(2) = TShape(Shape1(dshape[1])); - in_shape->at(3) = TShape(Shape1(dshape[1])); - in_shape->at(4) = TShape(Shape1(dshape[1])); - - out_shape->clear(); - out_shape->push_back(dshape); - out_shape->push_back(Shape1(dshape[1])); - out_shape->push_back(Shape1(dshape[1])); - - return true; -} - -static void BatchNormCompute_CPU(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { +#if CUDNN_MAJOR >= 4 +template<> +Operator *CreateOp_CuDNNv4(BatchNormParam param) { LOG(FATAL) << "CuDNNBatchNormOp is only available for gpu."; + return NULL; } -static void BatchNormGradCompute_CPU(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - LOG(FATAL) << "CuDNNBatchNormOp is only available for gpu."; +Operator *CuDNNBatchNormProp::CreateOperator(Context ctx) const { +#if CUDNN_MAJOR >= 5 + LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5." 
+ "Use the later instead."; + return nullptr; +#else + DO_BIND_DISPATCH(CreateOp_CuDNNv4, param_); +#endif } -NNVM_REGISTER_OP(CuDNNBatchNorm) +MXNET_REGISTER_OP_PROPERTY(CuDNNBatchNorm, CuDNNBatchNormProp) .describe("Apply batch normalization to input.") -.set_num_inputs(5) -.set_num_outputs(3) -.set_attr_parser(ParamParser) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - return std::vector{"data", "gamma", "beta", "moving_mean", "moving_var"}; -}) -.set_attr("FListOutputNames", - [](const NodeAttrs& attrs) { - return std::vector{"output", "mean", "var"}; -}) -.set_attr("FNumVisibleOutputs", - [](const NodeAttrs& attrs) { - return 1; -}) -.set_attr("FMutateInputs", [](const nnvm::NodeAttrs& attrs) { - return std::vector{3, 4}; -}) -.set_attr("FInferShape", BatchNormShape) -.set_attr("FCompute", BatchNormCompute_CPU) -.set_attr("FGradient", ElemwiseGradUseInOut{"_backward_CuDNNBatchNorm"}) .add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization") -.add_argument("gamma", "NDArray-or-Symbol", "gamma array") -.add_argument("beta", "NDArray-or-Symbol", "beta array") -.add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input") -.add_argument("moving_var", "NDArray-or-Symbol", "running variance of input") -.add_arguments(BatchNormParam::__FIELDS__()) -.set_attr( - "FSetInputVarAttrOnCompose", - [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) { - if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return; - if (index == 3) { - var->attrs.dict["__init__"] = "[\"zero\", {}]"; - } else if (index == 4) { - var->attrs.dict["__init__"] = "[\"one\", {}]"; - } - }); - -NNVM_REGISTER_OP(_backward_CuDNNBatchNorm) -.set_num_outputs(5) -.set_attr("FMutateInputs", [](const nnvm::NodeAttrs& attrs) { - return std::vector{6, 7}; -}) -.set_attr("TIsBackward", true) -.set_attr_parser(ParamParser) -.set_attr("FCompute", BatchNormGradCompute_CPU); +.add_arguments(BatchNormParam::__FIELDS__()); +NNVM_REGISTER_OP(CuDNNBatchNorm) +.set_attr("FSetInputVarAttrOnCompose", + [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) { + if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return; + if (index == 3) { + var->attrs.dict["__init__"] = "[\"zero\", {}]"; + } else if (index == 4) { + var->attrs.dict["__init__"] = "[\"zero\", {}]"; + } + }); #endif // CUDNN_MAJOR >= 4 - } // namespace op } // namespace mxnet diff --git a/src/operator/nn/cudnn/cudnn_batch_norm.cu b/src/operator/nn/cudnn/cudnn_batch_norm.cu index e07cd1e6c8f6..e96db2e5e73f 100644 --- a/src/operator/nn/cudnn/cudnn_batch_norm.cu +++ b/src/operator/nn/cudnn/cudnn_batch_norm.cu @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file cudnn_batch_norm.cu * \brief - * \author Junyuan Xie, Da Zheng + * \author Junyuan Xie */ #include "./cudnn_batch_norm-inl.h" @@ -30,60 +30,10 @@ namespace mxnet { namespace op { #if CUDNN_MAJOR == 4 - -template -static CuDNNBatchNormOp &GetCuDNNOp(const BatchNormParam& param) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local CuDNNBatchNormOp op; -#else - static MX_THREAD_LOCAL CuDNNBatchNormOp op; -#endif - op.Init(param); - return op; -} - -static void BatchNormCompute_CuDNNv4(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { -#if CUDNN_MAJOR >= 5 - LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5." 
- "Use the later instead."; -#else - const BatchNormParam& param = nnvm::get(attrs.parsed); - CHECK_EQ(inputs.size(), 5U); - std::vector in_data(inputs.begin(), inputs.begin() + 3); - std::vector aux_states(inputs.begin() + 3, inputs.end()); - GetCuDNNOp(param).Forward(ctx, in_data, req, outputs, aux_states); -#endif -} - -static void BatchNormGradCompute_CuDNNv4(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { -#if CUDNN_MAJOR >= 5 - LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5." - "Use the later instead."; -#else - CHECK_EQ(inputs.size(), 11U); - const BatchNormParam& param = nnvm::get(attrs.parsed); - std::vector out_grad(1, inputs[0]); - std::vector in_data(inputs.begin() + 3, inputs.begin() + 6); - std::vector aux_states(inputs.begin() + 6, inputs.begin() + 8); - std::vector out_data(inputs.begin() + 8, inputs.end()); - std::vector in_grad(outputs.begin(), outputs.begin() + 3); - GetCuDNNOp(param).Backward(ctx, out_grad, in_data, out_data, - req, in_grad, aux_states); -#endif +template<> +Operator *CreateOp_CuDNNv4(BatchNormParam param) { + return new CuDNNBatchNormOp(param); } - -NNVM_REGISTER_OP(CuDNNBatchNorm) -.set_attr("FCompute", BatchNormCompute_CuDNNv4); - -NNVM_REGISTER_OP(_backward_CuDNNBatchNorm) -.set_attr("FCompute", BatchNormGradCompute_CuDNNv4); - #endif // CUDNN_MAJOR == 4 } // namespace op } // namespace mxnet diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h index 229ba3cb1a8e..8ffe97d94310 100644 --- a/src/operator/nn/cudnn/cudnn_convolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h @@ -42,19 +42,9 @@ namespace op { * \brief The Operator used to perform convolution using cuDNN kernels. */ template -class CuDNNConvolutionOp { +class CuDNNConvolutionOp : public Operator { public: - CuDNNConvolutionOp() { - CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_)); - CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_)); - } - - void Init(const ConvolutionParam& param, + explicit CuDNNConvolutionOp(const ConvolutionParam& param, int forward_compute_type, int backward_compute_type, const std::vector& in_shape, @@ -67,6 +57,8 @@ class CuDNNConvolutionOp { auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type); // convert MB to words param_.workspace = (param_.workspace << 20) / sizeof(DType); + init_cudnn_ = false; + init_temp_size_ = false; dtype_ = DataType::kCudnnFlag; // TensorCore algos only allowed on fp16-I/O convolutions if permitted by the global policy. 
cudnn_tensor_core_ = DataType::kFlag == kFloat16 && GetEnvAllowTensorCore(); @@ -110,19 +102,22 @@ class CuDNNConvolutionOp { } ~CuDNNConvolutionOp() { - CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_)); - CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_)); + if (init_cudnn_) { + CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_)); + CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_)); + } } - void Forward(const OpContext &ctx, + virtual void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, - const std::vector &out_data) { + const std::vector &out_data, + const std::vector &aux_args) { using namespace mshadow; size_t expected = param_.no_bias ? 2 : 3; CHECK_EQ(in_data.size(), expected); @@ -179,17 +174,18 @@ class CuDNNConvolutionOp { } } - void Backward(const OpContext &ctx, + virtual void Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, + const std::vector &out_data, const std::vector &req, - const std::vector &in_grad) { + const std::vector &in_grad, + const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; size_t expected = param_.no_bias == 0 ? 3 : 2; CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(in_data.size(), expected); - CHECK_EQ(in_grad.size(), expected); + CHECK(in_data.size() == expected && in_grad.size() == expected); Stream *s = ctx.get_stream(); // I/O's should have 2 more dims than the kernel dim @@ -199,7 +195,6 @@ class CuDNNConvolutionOp { DType *data_ptr = GetNdPtr(in_data[conv::kData], param_.kernel.ndim() + 2, s); DType *gdata_ptr = GetNdPtr(in_grad[conv::kData], param_.kernel.ndim() + 2, s); - GetTempSize(ctx); Tensor workspace = AllocateTempWorkspace(ctx, backward_workspace_byte_); size_t workspace_size = TensorSizeBytes(workspace); for (uint32_t g = 0; g < param_.num_group; ++g) { @@ -338,6 +333,13 @@ class CuDNNConvolutionOp { size_t expected = param_.no_bias ? 
2 : 3; CHECK_EQ(in_shape.size(), expected); CHECK_EQ(out_shape.size(), 1U); + CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_)); + CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_)); TShape dshape = in_shape[conv::kData]; TShape wshape = in_shape[conv::kWeight]; @@ -510,6 +512,7 @@ class CuDNNConvolutionOp { &bias_shape[0], &bias_stride[0])); } + init_cudnn_ = true; } void SelectAlgo(const Context& ctx, @@ -753,6 +756,7 @@ class CuDNNConvolutionOp { } void GetTempSize(const OpContext& ctx) { + if (init_temp_size_) return; mshadow::Stream *s = ctx.get_stream(); size_t back_size = 0, back_size_w = 0; CUDNN_CALL(cudnnGetConvolutionBackwardDataWorkspaceSize(s->dnn_handle_, @@ -777,6 +781,8 @@ class CuDNNConvolutionOp { out_desc_, forward_algo_.AlgoNumber(), &forward_workspace_byte_)); + + init_temp_size_ = true; } int *CastTShapeToIntPtr(const TShape& s, std::vector *buffer) { @@ -841,6 +847,8 @@ class CuDNNConvolutionOp { std::vector param_dilate_; std::vector param_pad_; + bool init_cudnn_; + bool init_temp_size_; // Temp workspace size in bytes needed for Forward() operation. size_t forward_workspace_byte_; // Temp workspace size in bytes needed for Backward() operation. diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h index 3c80cdcba4c2..bc02d1b73f45 100644 --- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h @@ -39,19 +39,9 @@ namespace op { #if MXNET_USE_CUDNN == 1 template -class CuDNNDeconvolutionOp { +class CuDNNDeconvolutionOp : public Operator { public: - CuDNNDeconvolutionOp() { - CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_)); - CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_)); - } - - void Init(DeconvolutionParam param, + explicit CuDNNDeconvolutionOp(DeconvolutionParam param, int forward_compute_type, int backward_compute_type, const std::vector& in_shape, @@ -64,6 +54,8 @@ class CuDNNDeconvolutionOp { auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type); // convert MB to words param_.workspace = (param_.workspace << 20) / sizeof(DType); + init_cudnn_ = false; + init_temp_size_ = false; dtype_ = mshadow::DataType::kCudnnFlag; // TensorCore algos only allowed on fp16-I/O deconvolutions if permitted by the global policy. 
cudnn_tensor_core_ = DataType::kFlag == kFloat16 && GetEnvAllowTensorCore(); @@ -107,19 +99,22 @@ class CuDNNDeconvolutionOp { } ~CuDNNDeconvolutionOp() { - CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_)); - CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_)); + if (init_cudnn_) { + CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_)); + CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_)); + } } - void Forward(const OpContext &ctx, + virtual void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, - const std::vector &out_data) { + const std::vector &out_data, + const std::vector &aux_args) { using namespace mshadow; size_t expected = param_.no_bias ? 2 : 3; CHECK_EQ(in_data.size(), expected); @@ -192,17 +187,18 @@ class CuDNNDeconvolutionOp { } } - void Backward(const OpContext &ctx, + virtual void Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, + const std::vector &out_data, const std::vector &req, - const std::vector &in_grad) { + const std::vector &in_grad, + const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; size_t expected = param_.no_bias == 0 ? 3 : 2; CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(in_data.size(), param_.no_bias ? 2U : 3U); - CHECK_EQ(in_grad.size(), expected); + CHECK(in_data.size() == expected && in_grad.size() == expected); Stream *s = ctx.get_stream(); // I/O's should have 2 more dims than the kernel dim @@ -217,7 +213,6 @@ class CuDNNDeconvolutionOp { CHECK_NE(req[deconv::kBias], kWriteInplace); } CHECK_NE(req[deconv::kData], kWriteInplace); - GetTempSize(ctx); Tensor workspace = AllocateTempWorkspace(ctx, backward_workspace_byte_); size_t workspace_size = TensorSizeBytes(workspace); for (uint32_t g = 0; g < param_.num_group; ++g) { @@ -353,6 +348,13 @@ class CuDNNDeconvolutionOp { size_t expected = param_.no_bias ? 
2 : 3; CHECK_EQ(in_shape.size(), expected); CHECK_EQ(out_shape.size(), 1U); + CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_)); + CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_)); TShape dshape = in_shape[deconv::kData]; TShape wshape = in_shape[deconv::kWeight]; @@ -534,6 +536,7 @@ class CuDNNDeconvolutionOp { &bias_shape[0], &bias_stride[0])); } + init_cudnn_ = true; } void SelectAlgo(const Context& ctx, @@ -786,6 +789,7 @@ class CuDNNDeconvolutionOp { } void GetTempSize(const OpContext& ctx) { + if (init_temp_size_) return; mshadow::Stream *s = ctx.get_stream(); size_t back_data_algo_workspace_size = 0; size_t back_filter_algo_workspace_size = 0; @@ -815,6 +819,7 @@ class CuDNNDeconvolutionOp { forward_workspace_byte_ = back_data_algo_workspace_size; backward_workspace_byte_ = std::max(forward_algo_workspace_size, back_filter_algo_workspace_size); + init_temp_size_ = true; } int *CastTShapeToIntPtr(const TShape& s, std::vector *buffer) { @@ -877,11 +882,8 @@ class CuDNNDeconvolutionOp { std::vector param_stride_; std::vector param_dilate_; - int forward_compute_type_; - int backward_compute_type_; - const std::vector in_shapes_; - const std::vector out_shapes_; - + bool init_cudnn_; + bool init_temp_size_; // Temp workspace size in bytes needed for Forward() operation. Note that // in deconvolution, this is handled by the cuDNN backprop-to-data kernel. size_t forward_workspace_byte_; diff --git a/src/operator/nn/cudnn/cudnn_pooling-inl.h b/src/operator/nn/cudnn/cudnn_pooling-inl.h index 8442b37058d4..104ed8546dca 100644 --- a/src/operator/nn/cudnn/cudnn_pooling-inl.h +++ b/src/operator/nn/cudnn/cudnn_pooling-inl.h @@ -34,18 +34,13 @@ namespace mxnet { namespace op { template -class CuDNNPoolingOp { +class CuDNNPoolingOp : public Operator { public: - CuDNNPoolingOp() { + explicit CuDNNPoolingOp(PoolingParam p) { + param_ = p; + init_cudnn_ = false; // TODO(xxx): fp16 dtype_ = mshadow::DataType::kCudnnFlag; - CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); - } - - void Init(const PoolingParam &p) { - param_ = p; switch (param_.pool_type) { case pool_enum::kMaxPooling: mode_ = CUDNN_POOLING_MAX; @@ -59,24 +54,33 @@ class CuDNNPoolingOp { } ~CuDNNPoolingOp() { - CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); - CUDNN_CALL(cudnnDestroyPoolingDescriptor(pooling_desc_)); + if (init_cudnn_) { + CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); + CUDNN_CALL(cudnnDestroyPoolingDescriptor(pooling_desc_)); + } } - void Forward(const OpContext &ctx, const TBlob &in_data, - const OpReqType &req, const TBlob &out_data) { + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; + CHECK_EQ(in_data.size(), 1U); + CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); typename DataType::ScaleType alpha = 1.0f; typename 
DataType::ScaleType beta = 0.0f; - this->Init(s, in_data, out_data); if (param_.kernel.ndim() == 2) { // 2d pool - Tensor data = in_data.get(s); - Tensor out = out_data.get(s); + Tensor data = in_data[pool_enum::kData].get(s); + Tensor out = out_data[pool_enum::kOut].get(s); + if (!init_cudnn_) { + this->Init(s, in_data, out_data); + } CHECK_EQ(data.CheckContiguous(), true); CHECK_EQ(out.CheckContiguous(), true); CUDNN_CALL(cudnnPoolingForward(s->dnn_handle_, @@ -89,8 +93,11 @@ class CuDNNPoolingOp { out.dptr_)); } else if (param_.kernel.ndim() == 3) { // 3d pool - Tensor data = in_data.get(s); - Tensor out = out_data.get(s); + Tensor data = in_data[pool_enum::kData].get(s); + Tensor out = out_data[pool_enum::kOut].get(s); + if (!init_cudnn_) { + this->Init(s, in_data, out_data); + } CHECK_EQ(data.CheckContiguous(), true); CHECK_EQ(out.CheckContiguous(), true); CUDNN_CALL(cudnnPoolingForward(s->dnn_handle_, @@ -106,23 +113,31 @@ class CuDNNPoolingOp { } } - void Backward(const OpContext &ctx, const TBlob &out_grad, - const TBlob &in_data, const TBlob &out_data, - const OpReqType &req, const TBlob &in_grad) { + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1U); + CHECK_EQ(in_data.size(), 1U); + CHECK_EQ(out_data.size(), 1U); + CHECK_EQ(req.size(), 1U); + CHECK_EQ(in_grad.size(), 1U); Stream *s = ctx.get_stream(); CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; - this->Init(s, in_data, out_data); if (param_.kernel.ndim() == 2) { // 2d pool - Tensor m_out_grad = out_grad.get(s); - Tensor m_in_data = in_data.get(s); - Tensor m_out_data = out_data.get(s); - Tensor m_in_grad = in_grad.get(s); + Tensor m_out_grad = out_grad[pool_enum::kOut].get(s); + Tensor m_in_data = in_data[pool_enum::kData].get(s); + Tensor m_out_data = out_data[pool_enum::kOut].get(s); + Tensor m_in_grad = in_grad[pool_enum::kData].get(s); CUDNN_CALL(cudnnPoolingBackward(s->dnn_handle_, pooling_desc_, &alpha, @@ -137,10 +152,10 @@ class CuDNNPoolingOp { m_in_grad.dptr_)); } else if (param_.kernel.ndim() == 3) { // 3d pool - Tensor m_out_grad = out_grad.get(s); - Tensor m_in_data = in_data.get(s); - Tensor m_out_data = out_data.get(s); - Tensor m_in_grad = in_grad.get(s); + Tensor m_out_grad = out_grad[pool_enum::kOut].get(s); + Tensor m_in_data = in_data[pool_enum::kData].get(s); + Tensor m_out_data = out_data[pool_enum::kOut].get(s); + Tensor m_in_grad = in_grad[pool_enum::kData].get(s); CUDNN_CALL(cudnnPoolingBackward(s->dnn_handle_, pooling_desc_, &alpha, @@ -159,115 +174,129 @@ class CuDNNPoolingOp { } private: - inline void Init(mshadow::Stream *s, const TBlob &in_data, - const TBlob &out_data) { + inline void Init(mshadow::Stream *s, + const std::vector &in_data, + const std::vector &out_data) { using namespace mshadow; #if CUDNN_MAJOR >= 5 nan_prop_ = CUDNN_NOT_PROPAGATE_NAN; #endif - if (param_.kernel.ndim() == 2) { - // 2d conv - Tensor data = in_data.get(s); - Tensor out = out_data.get(s); - mshadow::Shape<4> dshape = data.shape_; - CUDNN_CALL(cudnnSetTensor4dDescriptor(in_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - data.shape_[0], - data.shape_[1], - data.shape_[2], - data.shape_[3])); - CUDNN_CALL(cudnnSetTensor4dDescriptor(out_desc_, - CUDNN_TENSOR_NCHW, - 
dtype_, - out.shape_[0], - out.shape_[1], - out.shape_[2], - out.shape_[3])); - #if CUDNN_MAJOR >= 5 - CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_, - mode_, - nan_prop_, - param_.global_pool ? dshape[2] : param_.kernel[0], - param_.global_pool ? dshape[3] : param_.kernel[1], - param_.pad[0], - param_.pad[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 1 :param_.stride[1])); - #else - CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_, - mode_, - param_.global_pool ? dshape[2] : param_.kernel[0], - param_.global_pool ? dshape[3] : param_.kernel[1], - param_.pad[0], - param_.pad[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 1 : param_.stride[1])); - #endif - } else { - Tensor data = in_data.get(s); - Tensor out = out_data.get(s); - std::vector ishape = {static_cast(data.shape_[0]), - static_cast(data.shape_[1]), - static_cast(data.shape_[2]), - static_cast(data.shape_[3]), - static_cast(data.shape_[4])}; + CHECK_EQ(in_data.size(), 1U); + CHECK_EQ(out_data.size(), 1U); + if (!init_cudnn_) { + init_cudnn_ = true; + if (param_.kernel.ndim() == 2) { + // 2d conv + Tensor data = in_data[pool_enum::kData].get(s); + Tensor out = out_data[pool_enum::kOut].get(s); + mshadow::Shape<4> dshape = data.shape_; + CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); + CUDNN_CALL(cudnnSetTensor4dDescriptor(in_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + data.shape_[0], + data.shape_[1], + data.shape_[2], + data.shape_[3])); + CUDNN_CALL(cudnnSetTensor4dDescriptor(out_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + out.shape_[0], + out.shape_[1], + out.shape_[2], + out.shape_[3])); + #if CUDNN_MAJOR >= 5 + CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_, + mode_, + nan_prop_, + param_.global_pool ? dshape[2] : param_.kernel[0], + param_.global_pool ? dshape[3] : param_.kernel[1], + param_.pad[0], + param_.pad[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 1 :param_.stride[1])); + #else + CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_, + mode_, + param_.global_pool ? dshape[2] : param_.kernel[0], + param_.global_pool ? dshape[3] : param_.kernel[1], + param_.pad[0], + param_.pad[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 
1 : param_.stride[1])); + #endif + } else { + Tensor data = in_data[pool_enum::kData].get(s); + Tensor out = out_data[pool_enum::kOut].get(s); + CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); + std::vector ishape = {static_cast(data.shape_[0]), + static_cast(data.shape_[1]), + static_cast(data.shape_[2]), + static_cast(data.shape_[3]), + static_cast(data.shape_[4])}; - std::vector istride = {static_cast(ishape[1] * ishape[2] * ishape[3] * ishape[4]), - static_cast(ishape[2] * ishape[3] * ishape[4]), - static_cast(ishape[3] * ishape[4]), - static_cast(ishape[4]), 1}; + std::vector istride = {static_cast(ishape[1] * ishape[2] * ishape[3] * ishape[4]), + static_cast(ishape[2] * ishape[3] * ishape[4]), + static_cast(ishape[3] * ishape[4]), + static_cast(ishape[4]), + 1}; - std::vector oshape = {static_cast(out.shape_[0]), - static_cast(out.shape_[1]), - static_cast(out.shape_[2]), - static_cast(out.shape_[3]), - static_cast(out.shape_[4])}; + std::vector oshape = {static_cast(out.shape_[0]), + static_cast(out.shape_[1]), + static_cast(out.shape_[2]), + static_cast(out.shape_[3]), + static_cast(out.shape_[4])}; - std::vector ostride = {static_cast(oshape[1] * oshape[2] * oshape[3] * oshape[4]), - static_cast(oshape[2] * oshape[3] * oshape[4]), - static_cast(oshape[3] * oshape[4]), - static_cast(oshape[4]), 1}; + std::vector ostride = {static_cast(oshape[1] * oshape[2] * oshape[3] * oshape[4]), + static_cast(oshape[2] * oshape[3] * oshape[4]), + static_cast(oshape[3] * oshape[4]), + static_cast(oshape[4]), + 1}; - std::vector kernel_vec = {param_.global_pool ? ishape[2] : - static_cast(param_.kernel[0]), - param_.global_pool ? ishape[3] : - static_cast(param_.kernel[1]), - param_.global_pool ? ishape[4] : - static_cast(param_.kernel[2])}; + std::vector kernel_vec = {param_.global_pool ? ishape[2] : + static_cast(param_.kernel[0]), + param_.global_pool ? ishape[3] : + static_cast(param_.kernel[1]), + param_.global_pool ? ishape[4] : + static_cast(param_.kernel[2])}; - std::vector pad_vec = {param_.global_pool ? 0 : static_cast(param_.pad[0]), - param_.global_pool ? 0 : static_cast(param_.pad[1]), - param_.global_pool ? 0 : static_cast(param_.pad[2])}; + std::vector pad_vec = {param_.global_pool ? 0 : static_cast(param_.pad[0]), + param_.global_pool ? 0 : static_cast(param_.pad[1]), + param_.global_pool ? 0 : static_cast(param_.pad[2])}; - std::vector stride_vec = {param_.global_pool ? 1 : static_cast(param_.stride[0]), - param_.global_pool ? 1 : static_cast(param_.stride[1]), - param_.global_pool ? 1 : static_cast(param_.stride[2])}; + std::vector stride_vec = {param_.global_pool ? 1 : static_cast(param_.stride[0]), + param_.global_pool ? 1 : static_cast(param_.stride[1]), + param_.global_pool ? 
1 : static_cast(param_.stride[2])}; - CUDNN_CALL(cudnnSetTensorNdDescriptor(in_desc_, - dtype_, - static_cast(ishape.size()), - &ishape[0], - &istride[0])); - CUDNN_CALL(cudnnSetTensorNdDescriptor(out_desc_, - dtype_, - static_cast(oshape.size()), - &oshape[0], - &ostride[0])); - #if CUDNN_MAJOR >= 5 - CUDNN_CALL(cudnnSetPoolingNdDescriptor(pooling_desc_, - mode_, - nan_prop_, - static_cast(kernel_vec.size()), - &(kernel_vec[0]), - &(pad_vec[0]), - &(stride_vec[0]))); - #else - LOG(FATAL) << "3D pooling only support CUDNN v5 and abouve"; - #endif + CUDNN_CALL(cudnnSetTensorNdDescriptor(in_desc_, + dtype_, + static_cast(ishape.size()), + &ishape[0], + &istride[0])); + CUDNN_CALL(cudnnSetTensorNdDescriptor(out_desc_, + dtype_, + static_cast(oshape.size()), + &oshape[0], + &ostride[0])); + #if CUDNN_MAJOR >= 5 + CUDNN_CALL(cudnnSetPoolingNdDescriptor(pooling_desc_, + mode_, + nan_prop_, + static_cast(kernel_vec.size()), + &(kernel_vec[0]), + &(pad_vec[0]), + &(stride_vec[0]))); + #else + LOG(FATAL) << "3D pooling only support CUDNN v5 and abouve"; + #endif + } } } - + bool init_cudnn_; cudnnDataType_t dtype_; cudnnHandle_t handle_; cudnnPoolingMode_t mode_; diff --git a/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h b/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h index 239da023668d..5afdb4844364 100644 --- a/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h +++ b/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h @@ -32,64 +32,73 @@ namespace mxnet { namespace op { -class CuDNNSoftmaxActivationOp { +class CuDNNSoftmaxActivationOp : public Operator { public: - CuDNNSoftmaxActivationOp() { - dtype_ = CUDNN_DATA_FLOAT; - CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_)); - } - - void Init(SoftmaxActivationParam param) { + explicit CuDNNSoftmaxActivationOp(SoftmaxActivationParam param) { this->param_ = param; + init_cudnn_ = false; + dtype_ = CUDNN_DATA_FLOAT; } ~CuDNNSoftmaxActivationOp() { - CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_)); + if (init_cudnn_) { + CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_)); + } } - void Forward(const OpContext &ctx, const TBlob &in_data, - const OpReqType &req, const TBlob &out_data) { + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; + CHECK_EQ(in_data.size(), 1U); + CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); Tensor data; Tensor out; cudnnSoftmaxMode_t softmax_mode; if (param_.mode == softmax_activation::kInstance) { - CHECK_EQ(in_data.ndim(), 2) + CHECK_EQ(in_data[softmax_activation::kData].ndim(), 2) << "Input need to have 2 dimensions when mode=instance."; - Shape<4> dshape = Shape4(in_data.shape_[0], in_data.shape_[1], 1, 1); - data = in_data.get_with_shape(dshape, s); - out = out_data.get_with_shape(dshape, s); + Shape<4> dshape = Shape4(in_data[softmax_activation::kData].shape_[0], + in_data[softmax_activation::kData].shape_[1], 1, 1); + data = in_data[softmax_activation::kData].get_with_shape(dshape, s); + out = out_data[softmax_activation::kOut].get_with_shape(dshape, s); softmax_mode = CUDNN_SOFTMAX_MODE_INSTANCE; } else { - CHECK_GE(in_data.ndim(), 3) + CHECK_GE(in_data[softmax_activation::kData].ndim(), 3) << "Input need to have a least 3 dimensions when mode=channel"; Shape<4> dshape; - index_t size_left = in_data.Size(); + index_t size_left = in_data[softmax_activation::kData].Size(); for (int i = 0; i < 3; ++i) { - if (i < 
in_data.ndim()) { - dshape[i] = in_data.shape_[i]; + if (i < in_data[softmax_activation::kData].ndim()) { + dshape[i] = in_data[softmax_activation::kData].shape_[i]; } else { dshape[i] = 1; } size_left /= dshape[i]; } dshape[3] = size_left; - data = in_data.get_with_shape(dshape, s); - out = out_data.get_with_shape(dshape, s); + data = in_data[softmax_activation::kData].get_with_shape(dshape, s); + out = out_data[softmax_activation::kOut].get_with_shape(dshape, s); softmax_mode = CUDNN_SOFTMAX_MODE_CHANNEL; } float alpha = 1.0f; float beta = 0.0f; CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); - CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - data.shape_[0], - data.shape_[1], - data.shape_[2], - data.shape_[3])); + if (!init_cudnn_) { + init_cudnn_ = true; + CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_)); + CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + data.shape_[0], + data.shape_[1], + data.shape_[2], + data.shape_[3])); + } CUDNN_CALL(cudnnSoftmaxForward(s->dnn_handle_, CUDNN_SOFTMAX_ACCURATE, softmax_mode, @@ -101,10 +110,19 @@ class CuDNNSoftmaxActivationOp { out.dptr_)); } - void Backward(const OpContext &ctx, const TBlob &out_grad, - const TBlob &out_data, const OpReqType &req, const TBlob &in_grad) { + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1U); + CHECK_EQ(out_data.size(), 1U); + CHECK_EQ(req.size(), 1U); + CHECK_EQ(in_grad.size(), 1U); float alpha = 1.0f; float beta = 0.0f; Stream *s = ctx.get_stream(); @@ -114,30 +132,31 @@ class CuDNNSoftmaxActivationOp { Tensor input_grad; cudnnSoftmaxMode_t softmax_mode; if (param_.mode == softmax_activation::kInstance) { - CHECK_EQ(in_grad.ndim(), 2) + CHECK_EQ(in_grad[softmax_activation::kData].ndim(), 2) << "Input need to have 2 dimensions when mode=instance."; - Shape<4> dshape = Shape4(in_grad.shape_[0], in_grad.shape_[1], 1, 1); - grad = out_grad.get_with_shape(dshape, s); - output_data = out_data.get_with_shape(dshape, s); - input_grad = in_grad.get_with_shape(dshape, s); + Shape<4> dshape = Shape4(in_grad[softmax_activation::kData].shape_[0], + in_grad[softmax_activation::kData].shape_[1], 1, 1); + grad = out_grad[softmax_activation::kOut].get_with_shape(dshape, s); + output_data = out_data[softmax_activation::kOut].get_with_shape(dshape, s); + input_grad = in_grad[softmax_activation::kData].get_with_shape(dshape, s); softmax_mode = CUDNN_SOFTMAX_MODE_INSTANCE; } else { - CHECK_GE(in_grad.ndim(), 3) + CHECK_GE(in_grad[softmax_activation::kData].ndim(), 3) << "Input need to have a least 3 dimensions when mode=channel"; Shape<4> dshape; - index_t size_left = in_grad.Size(); + index_t size_left = in_grad[softmax_activation::kData].Size(); for (int i = 0; i < 3; ++i) { - if (i < in_grad.ndim()) { - dshape[i] = in_grad.shape_[i]; + if (i < in_grad[softmax_activation::kData].ndim()) { + dshape[i] = in_grad[softmax_activation::kData].shape_[i]; } else { dshape[i] = 1; } size_left /= dshape[i]; } dshape[3] = size_left; - output_data = out_data.get_with_shape(dshape, s); - grad = out_grad.get_with_shape(dshape, s); - input_grad = in_grad.get_with_shape(dshape, s); + output_data = out_data[softmax_activation::kOut].get_with_shape(dshape, s); + grad = 
out_grad[softmax_activation::kOut].get_with_shape(dshape, s); + input_grad = in_grad[softmax_activation::kData].get_with_shape(dshape, s); softmax_mode = CUDNN_SOFTMAX_MODE_CHANNEL; } CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); @@ -155,6 +174,7 @@ class CuDNNSoftmaxActivationOp { } private: + bool init_cudnn_; cudnnDataType_t dtype_; cudnnTensorDescriptor_t shape_desc_; SoftmaxActivationParam param_; diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index b6d522b9e6f9..fbdfaa84faab 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file deconvolution-inl.h * \brief - * \author Wei Wu, Da Zheng + * \author Wei Wu */ #ifndef MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_ #define MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_ @@ -195,18 +195,19 @@ namespace mxnet { namespace op { template -class DeconvolutionOp { +class DeconvolutionOp : public Operator { public: - void Init(DeconvolutionParam p) { + explicit DeconvolutionOp(DeconvolutionParam p) { this->param_ = p; // convert MBytes first to Bytes and then to elements. param_.workspace = (param_.workspace << 20) / sizeof(real_t); } - void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; @@ -321,18 +322,19 @@ class DeconvolutionOp { } } - void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &req, - const std::vector &in_grad) { + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { using namespace mshadow; using namespace mshadow::expr; // TODO(bing): check the BLAS Handle, be careful CHECK_EQ(out_grad.size(), 1U); size_t expected = param_.no_bias == 0 ? 
3 : 2; - CHECK_EQ(in_data.size(), expected); - CHECK_EQ(in_grad.size(), expected); + CHECK(in_data.size() == expected && in_grad.size() == expected); CHECK_EQ(req.size(), expected); CHECK_EQ(in_data[deconv::kWeight].CheckContiguous(), true); // get data @@ -487,52 +489,300 @@ class DeconvolutionOp { }; // class DeconvolutionOp template -void _DeconvolutionCompute(const DeconvolutionParam& param, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, { - DeconvolutionOp op; - op.Init(param); - op.Forward(ctx, inputs, req, outputs); - }); -} +Operator* CreateOp(DeconvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx); -template -void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const DeconvolutionParam& param = nnvm::get(attrs.parsed); - _DeconvolutionCompute(param, ctx, inputs, req, outputs); -} +#if DMLC_USE_CXX11 +class DeconvolutionProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (!param_.no_bias) { + return {"data", "weight", "bias"}; + } else { + return {"data", "weight"}; + } + } -template -void _DeconvolutionGradCompute(const DeconvolutionParam& param, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - std::vector in_data(inputs.begin() + 1, inputs.end()); - const TBlob &out_grad = inputs[0]; - const std::vector &in_grad = outputs; - - MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, { - DeconvolutionOp op; - op.Init(param); - op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); - }); -} + void Init(const std::vector >& kwargs) override { + using namespace mshadow; + param_.Init(kwargs); + if (param_.kernel.ndim() == 1) { + param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW; + if (param_.stride.ndim() == 0) param_.stride = Shape1(1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1); + if (param_.pad.ndim() == 0) param_.pad = Shape1(0); + if (param_.adj.ndim() == 0) param_.adj = Shape1(0); + } else if (param_.kernel.ndim() == 2) { + param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW; + if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); + if (param_.adj.ndim() == 0) param_.adj = Shape2(0, 0); + } else { + CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D deconvolution not supported"; + param_.layout = param_.layout ? 
param_.layout.value(): mshadow::kNCDHW; + if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); + if (param_.adj.ndim() == 0) param_.adj = Shape3(0, 0, 0); + } + } + std::map GetParams() const override { + return param_.__DICT__(); + } -template -void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const DeconvolutionParam& param = nnvm::get(attrs.parsed); - _DeconvolutionGradCompute(param, ctx, inputs, req, outputs); -} + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { +#if MXNET_USE_CUDNN == 0 + if (param_.kernel.ndim() > 2) { + LOG(FATAL) << "If not using CUDNN, only 1D or 2D Deconvolution is supported"; + return false; + } +#endif // CUDNN + + using namespace mshadow; + if (!param_.no_bias) { + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; + } else { + CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; + } + out_shape->resize(1, TShape()); + const TShape &dshape = (*in_shape)[deconv::kData]; + if (dshape.ndim() == 0) return false; + + if (param_.kernel.ndim() == 1) { + // 1d conv + CHECK_EQ(dshape.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x"; + Shape<3> dshape_ncw = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW); + Shape<3> wshape = Shape3(dshape_ncw[1], param_.num_filter / param_.num_group, + param_.kernel[0]); + wshape = ConvertLayout(wshape, kNCW, param_.layout.value()); + SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter)); + } + + const index_t dilated_ksize_x = param_.DilatedKernelSize(0); + + index_t o_pad[1]; + index_t o_adj[1]; + param_.InferPad(dshape_ncw, o_pad, o_adj); + + CHECK_EQ(dshape_ncw[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + + CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(x) must be samller than stride[0]"; + + Shape<3> oshape; + oshape[0] = dshape_ncw[0]; + oshape[1] = param_.num_filter; + oshape[2] = param_.stride[0] * (dshape_ncw[2] - 1) + + dilated_ksize_x - 2 * o_pad[0] + o_adj[0]; + + if (param_.target_shape[0] > 0) { + CHECK_EQ(param_.target_shape[0], oshape[2]) \ + << "param_.target_shape[0] was not reasonable, please set it carefully"; + } + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value())); + + return true; + } else if (param_.kernel.ndim() == 2) { + // 2d conv + CHECK_EQ(dshape.ndim(), 4U) \ + << "Input data should be 4D in batch-num_filter-y-x"; + Shape<4> dshape_nchw = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW); + Shape<4> wshape = Shape4(dshape_nchw[1], + param_.num_filter / param_.num_group, + param_.kernel[0], param_.kernel[1]); + wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); + SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, 
Shape1(param_.num_filter)); + } + + const index_t dilated_ksize_y = param_.DilatedKernelSize(0); + const index_t dilated_ksize_x = param_.DilatedKernelSize(1); + + index_t o_pad[2]; + index_t o_adj[2]; + param_.InferPad(dshape_nchw, o_pad, o_adj); + + CHECK_EQ(dshape_nchw[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + + CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(y) must be samller than stride[0]"; + CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(x) must be samller than stride[1]"; + + Shape<4> oshape; + oshape[0] = dshape_nchw[0]; + oshape[1] = param_.num_filter; + oshape[2] = param_.stride[0] * (dshape_nchw[2] - 1) + + dilated_ksize_y - 2 * o_pad[0] + o_adj[0]; + oshape[3] = param_.stride[1] * (dshape_nchw[3] - 1) + + dilated_ksize_x - 2 * o_pad[1] + o_adj[1]; + + if (param_.target_shape[0] > 0) { + CHECK_EQ(param_.target_shape[0], oshape[2]) \ + << "param_.target_shape[0] was not reasonable, please set it carefully"; + } + if (param_.target_shape[1] > 0) { + CHECK_EQ(param_.target_shape[1], oshape[3]) \ + << "param_.target_shape[1] was not reasonable, please set it carefully"; + } + + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); + + return true; + } else if (param_.kernel.ndim() == 3) { + // 3d conv + CHECK_EQ(dshape.ndim(), 5U) \ + << "Input data should be 5D in batch-num_filter-depth-y-x"; + Shape<5> dshape_ncdhw = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW); + Shape<5> wshape = Shape5(dshape_ncdhw[1], param_.num_filter / param_.num_group, + param_.kernel[0], param_.kernel[1], param_.kernel[2]); + wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value()); + SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter)); + } + + // Note: 3D dilation currently not supported. + // Calculations below done to preserve symmetry with 1D/2D code. 
+ const index_t dilated_ksize_d = param_.DilatedKernelSize(0); + const index_t dilated_ksize_y = param_.DilatedKernelSize(1); + const index_t dilated_ksize_x = param_.DilatedKernelSize(2); + + index_t o_pad[3]; + index_t o_adj[3]; + param_.InferPad(dshape_ncdhw, o_pad, o_adj); + + CHECK_EQ(dshape_ncdhw[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + CHECK_EQ(param_.dilate.Size(), 1U) + << "Dilate is not supported in 3d deconvolution"; + + CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(d) must be samller than stride[0]"; + CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(y) must be samller than stride[1]"; + CHECK_GE(param_.stride[2]-1, o_adj[2]) << "adj(x) must be samller than stride[2]"; + + Shape<5> oshape; + oshape[0] = dshape_ncdhw[0]; + oshape[1] = param_.num_filter; + oshape[2] = param_.stride[0] * (dshape_ncdhw[2] - 1) + + dilated_ksize_d - 2 * o_pad[0] + o_adj[0]; + oshape[3] = param_.stride[1] * (dshape_ncdhw[3] - 1) + + dilated_ksize_y - 2 * o_pad[1] + o_adj[1]; + oshape[4] = param_.stride[2] * (dshape_ncdhw[4] - 1) + + dilated_ksize_x - 2 * o_pad[2] + o_adj[2]; + + if (param_.target_shape[0] > 0) { + CHECK_EQ(param_.target_shape[0], oshape[2]) \ + << "param_.target_shape[0] was not reasonable, please it carefully"; + } + if (param_.target_shape[1] > 0) { + CHECK_EQ(param_.target_shape[1], oshape[3]) \ + << "param_.target_shape[1] was not reasonable, please set it carefully"; + } + if (param_.target_shape[2] > 0) { + CHECK_EQ(param_.target_shape[2], oshape[4]) \ + << "param_.target_shape[2] was not reasonable, please set it carefully"; + } + + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value())); + + return true; + } else { + LOG(FATAL) << "Unknown convolution type"; + return false; + } + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 1U); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); + } + } + out_type->clear(); + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new DeconvolutionProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "Deconvolution"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {out_grad[deconv::kOut], in_data[deconv::kData], in_data[deconv::kWeight]}; + } + + std::vector ForwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + std::vector BackwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented"; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) 
const override; + + private: + DeconvolutionParam param_; +}; // class DeconvolutionProp +#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_ diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index a3fc915eb0fe..9d3c040c1d63 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -21,408 +21,45 @@ * Copyright (c) 2015 by Contributors * \file deconvolution.cc * \brief - * \author Wei Wu, Da Zheng + * \author Wei Wu */ #include "./deconvolution-inl.h" -#include "./mkldnn/mkldnn_ops-inl.h" -#include "./mkldnn/mkldnn_base-inl.h" namespace mxnet { namespace op { - -static bool DeconvolutionShape(const nnvm::NodeAttrs& attrs, - std::vector *in_shape, - std::vector *out_shape) { - const DeconvolutionParam& param_ = nnvm::get(attrs.parsed); -#if MXNET_USE_CUDNN == 0 - if (param_.kernel.ndim() > 2) { - LOG(FATAL) << "If not using CUDNN, only 1D or 2D Deconvolution is supported"; - return false; - } -#endif // CUDNN - - using namespace mshadow; - if (!param_.no_bias) { - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; - } else { - CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; - } - out_shape->resize(1, TShape()); - const TShape &dshape = (*in_shape)[deconv::kData]; - if (dshape.ndim() == 0) return false; - - if (param_.kernel.ndim() == 1) { - // 1d conv - CHECK_EQ(dshape.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x"; - Shape<3> dshape_ncw = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW); - Shape<3> wshape = Shape3(dshape_ncw[1], param_.num_filter / param_.num_group, - param_.kernel[0]); - wshape = ConvertLayout(wshape, kNCW, param_.layout.value()); - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter)); - } - - const index_t dilated_ksize_x = param_.DilatedKernelSize(0); - - index_t o_pad[1]; - index_t o_adj[1]; - param_.InferPad(dshape_ncw, o_pad, o_adj); - - CHECK_EQ(dshape_ncw[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - - CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(x) must be samller than stride[0]"; - - Shape<3> oshape; - oshape[0] = dshape_ncw[0]; - oshape[1] = param_.num_filter; - oshape[2] = param_.stride[0] * (dshape_ncw[2] - 1) + - dilated_ksize_x - 2 * o_pad[0] + o_adj[0]; - - if (param_.target_shape.ndim() > 0) { - if (param_.target_shape[0] > 0) { - CHECK_EQ(param_.target_shape[0], oshape[2]) \ - << "param_.target_shape[0] was not reasonable, please set it carefully"; - } - } - - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value())); - - return true; - } else if (param_.kernel.ndim() == 2) { - // 2d conv - CHECK_EQ(dshape.ndim(), 4U) \ - << "Input data should be 4D in batch-num_filter-y-x"; - Shape<4> dshape_nchw = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW); - Shape<4> wshape = Shape4(dshape_nchw[1], - param_.num_filter / param_.num_group, - param_.kernel[0], param_.kernel[1]); - wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); - 
SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter)); - } - - const index_t dilated_ksize_y = param_.DilatedKernelSize(0); - const index_t dilated_ksize_x = param_.DilatedKernelSize(1); - - index_t o_pad[2]; - index_t o_adj[2]; - param_.InferPad(dshape_nchw, o_pad, o_adj); - - CHECK_EQ(dshape_nchw[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - - CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(y) must be samller than stride[0]"; - CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(x) must be samller than stride[1]"; - - Shape<4> oshape; - oshape[0] = dshape_nchw[0]; - oshape[1] = param_.num_filter; - oshape[2] = param_.stride[0] * (dshape_nchw[2] - 1) + - dilated_ksize_y - 2 * o_pad[0] + o_adj[0]; - oshape[3] = param_.stride[1] * (dshape_nchw[3] - 1) + - dilated_ksize_x - 2 * o_pad[1] + o_adj[1]; - - if (param_.target_shape.ndim() > 1) { - if (param_.target_shape[0] > 0) { - CHECK_EQ(param_.target_shape[0], oshape[2]) \ - << "param_.target_shape[0] was not reasonable, please set it carefully"; - } - if (param_.target_shape[1] > 0) { - CHECK_EQ(param_.target_shape[1], oshape[3]) \ - << "param_.target_shape[1] was not reasonable, please set it carefully"; - } - } - - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); - - return true; - } else if (param_.kernel.ndim() == 3) { - // 3d conv - CHECK_EQ(dshape.ndim(), 5U) \ - << "Input data should be 5D in batch-num_filter-depth-y-x"; - Shape<5> dshape_ncdhw = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW); - Shape<5> wshape = Shape5(dshape_ncdhw[1], param_.num_filter / param_.num_group, - param_.kernel[0], param_.kernel[1], param_.kernel[2]); - wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value()); - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter)); - } - - // Note: 3D dilation currently not supported. - // Calculations below done to preserve symmetry with 1D/2D code. 
- const index_t dilated_ksize_d = param_.DilatedKernelSize(0); - const index_t dilated_ksize_y = param_.DilatedKernelSize(1); - const index_t dilated_ksize_x = param_.DilatedKernelSize(2); - - index_t o_pad[3]; - index_t o_adj[3]; - param_.InferPad(dshape_ncdhw, o_pad, o_adj); - - CHECK_EQ(dshape_ncdhw[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - CHECK_EQ(param_.dilate.Size(), 1U) - << "Dilate is not supported in 3d deconvolution"; - - CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(d) must be samller than stride[0]"; - CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(y) must be samller than stride[1]"; - CHECK_GE(param_.stride[2]-1, o_adj[2]) << "adj(x) must be samller than stride[2]"; - - Shape<5> oshape; - oshape[0] = dshape_ncdhw[0]; - oshape[1] = param_.num_filter; - oshape[2] = param_.stride[0] * (dshape_ncdhw[2] - 1) + - dilated_ksize_d - 2 * o_pad[0] + o_adj[0]; - oshape[3] = param_.stride[1] * (dshape_ncdhw[3] - 1) + - dilated_ksize_y - 2 * o_pad[1] + o_adj[1]; - oshape[4] = param_.stride[2] * (dshape_ncdhw[4] - 1) + - dilated_ksize_x - 2 * o_pad[2] + o_adj[2]; - - if (param_.target_shape.ndim() > 2) { - if (param_.target_shape[0] > 0) { - CHECK_EQ(param_.target_shape[0], oshape[2]) \ - << "param_.target_shape[0] was not reasonable, please it carefully"; - } - if (param_.target_shape[1] > 0) { - CHECK_EQ(param_.target_shape[1], oshape[3]) \ - << "param_.target_shape[1] was not reasonable, please set it carefully"; - } - if (param_.target_shape[2] > 0) { - CHECK_EQ(param_.target_shape[2], oshape[4]) \ - << "param_.target_shape[2] was not reasonable, please set it carefully"; - } - } - - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value())); - - return true; - } else { - LOG(FATAL) << "Unknown convolution type"; - return false; - } -} - -static inline std::vector ListArguments(const DeconvolutionParam& param_) { - if (!param_.no_bias) { - return {"data", "weight", "bias"}; - } else { - return {"data", "weight"}; - } -} - -static bool DeconvolutionType(const nnvm::NodeAttrs& attrs, - std::vector *in_type, std::vector *out_type) { - const DeconvolutionParam& param_ = nnvm::get(attrs.parsed); - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param_)[i]); - } - } - out_type->clear(); - out_type->push_back(dtype); - return true; -} - -inline static bool DeconvStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - const DeconvolutionParam& param = nnvm::get(attrs.parsed); - uint32_t in_expected = param.no_bias ? 
2 : 3; - CHECK_EQ(in_attrs->size(), in_expected); - CHECK_EQ(out_attrs->size(), 1); - - DispatchMode wanted_mode; -#if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) - wanted_mode = DispatchMode::kFComputeEx; - else -#endif - wanted_mode = DispatchMode::kFCompute; - return storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, wanted_mode); -} - -inline static bool BackwardDeconvStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - const DeconvolutionParam& param = nnvm::get(attrs.parsed); - uint32_t out_expected = param.no_bias ? 2 : 3; - CHECK_EQ(in_attrs->size(), param.no_bias ? 3U : 4U); - CHECK_EQ(out_attrs->size(), out_expected); - - DispatchMode wanted_mode; -#if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) - wanted_mode = DispatchMode::kFComputeEx; - else -#endif - wanted_mode = DispatchMode::kFCompute; - return storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, wanted_mode); -} - -#if MXNET_USE_MKLDNN == 1 -static void DeconvolutionComputeExCPU(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - if (SupportMKLDNNConv(inputs[0])) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNNDeconvolutionForward(attrs, ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(DeconvolutionCompute, attrs, ctx, inputs, req, - outputs); - return; - } - FallBackCompute(DeconvolutionCompute, attrs, ctx, inputs, req, - outputs); +template<> +Operator* CreateOp(DeconvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new DeconvolutionOp(param); + }); + return op; } -static void DeconvolutionGradComputeExCPU(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - if (SupportMKLDNNConv(inputs[0])) { - MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); - MKLDNNDeconvolutionBackward(attrs, ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(DeconvolutionGradCompute, attrs, ctx, inputs, req, - outputs); - return; - } - FallBackCompute(DeconvolutionGradCompute, attrs, ctx, inputs, req, - outputs); +Operator* DeconvolutionProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape, aux_shape; + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0), in_shape, &out_shape, ctx); } -#endif - -static void DeconvolutionParamParser(nnvm::NodeAttrs* attrs) { - using namespace mshadow; - DeconvolutionParam param_; - param_.Init(attrs->dict); - if (param_.kernel.ndim() == 1) { - param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW; - if (param_.stride.ndim() == 0) param_.stride = Shape1(1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1); - if (param_.pad.ndim() == 0) param_.pad = Shape1(0); - if (param_.adj.ndim() == 0) param_.adj = Shape1(0); - } else if (param_.kernel.ndim() == 2) { - param_.layout = param_.layout ? 
param_.layout.value() : mshadow::kNCHW; - if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); - if (param_.adj.ndim() == 0) param_.adj = Shape2(0, 0); - } else { - CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D deconvolution not supported"; - param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW; - if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); - if (param_.adj.ndim() == 0) param_.adj = Shape3(0, 0, 0); - } - attrs->parsed = std::move(param_); -} - -struct DeconvolutionGrad { - const char *op_name; - std::vector operator()(const nnvm::NodePtr& n, - const std::vector& ograds) const { - std::vector heads(ograds.begin(), ograds.end()); - heads.push_back(n->inputs[deconv::kData]); - heads.push_back(n->inputs[deconv::kWeight]); - const DeconvolutionParam& param = nnvm::get(n->attrs.parsed); - if (!param.no_bias) - heads.push_back(n->inputs[deconv::kBias]); - return MakeGradNode(op_name, n, heads, n->attrs.dict); - } -}; DMLC_REGISTER_PARAMETER(DeconvolutionParam); -NNVM_REGISTER_OP(Deconvolution) -.describe("Computes 1D or 2D transposed convolution (aka fractionally strided convolution) of the " - "input tensor. This operation can be seen as the gradient of Convolution operation with " - "respect to its input. Convolution usually reduces the size of the input. Transposed " - "convolution works the other way, going from a smaller input to a larger output while " - "preserving the connectivity pattern.") -.set_num_inputs([](const NodeAttrs& attrs) { - const DeconvolutionParam& params = nnvm::get(attrs.parsed); - return params.no_bias ? 2 : 3; -}) -.set_num_outputs(1) -.set_attr_parser(DeconvolutionParamParser) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - return ListArguments(nnvm::get(attrs.parsed)); -}) -.set_attr("FInferShape", DeconvolutionShape) -.set_attr("FInferType", DeconvolutionType) -.set_attr("FInferStorageType", DeconvStorageType) -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; -}) -.set_attr("FCompute", DeconvolutionCompute) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FComputeEx", DeconvolutionComputeExCPU) -#endif -.set_attr("FGradient", DeconvolutionGrad{"_backward_Deconvolution"}) +MXNET_REGISTER_OP_PROPERTY(Deconvolution, DeconvolutionProp) .add_argument("data", "NDArray-or-Symbol", "Input tensor to the deconvolution operation.") .add_argument("weight", "NDArray-or-Symbol", "Weights representing the kernel.") .add_argument("bias", "NDArray-or-Symbol", "Bias added to the result after the deconvolution " "operation.") -.add_arguments(DeconvolutionParam::__FIELDS__()); - -NNVM_REGISTER_OP(_backward_Deconvolution) -.set_num_outputs([](const NodeAttrs& attrs) { - const DeconvolutionParam& params = nnvm::get(attrs.parsed); - return params.no_bias ? 
2 : 3; -}) -.set_attr("TIsBackward", true) -.set_attr("FInferStorageType", BackwardDeconvStorageType) -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; -}) -.set_attr_parser(DeconvolutionParamParser) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FComputeEx", DeconvolutionGradComputeExCPU) -#endif -.set_attr("FCompute", DeconvolutionGradCompute); +.add_arguments(DeconvolutionParam::__FIELDS__()) +.describe("Computes 1D or 2D transposed convolution (aka fractionally strided convolution) of the " + "input tensor. This operation can be seen as the gradient of Convolution operation with " + "respect to its input. Convolution usually reduces the size of the input. Transposed " + "convolution works the other way, going from a smaller input to a larger output while " + "preserving the connectivity pattern."); } // namespace op } // namespace mxnet diff --git a/src/operator/nn/deconvolution.cu b/src/operator/nn/deconvolution.cu index c7395428c2a0..623770170d50 100644 --- a/src/operator/nn/deconvolution.cu +++ b/src/operator/nn/deconvolution.cu @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file deconvolution.cu * \brief - * \author Wei Wu, Da Zheng + * \author Wei Wu */ #include "./deconvolution-inl.h" @@ -31,29 +31,13 @@ namespace mxnet { namespace op { - -#if MXNET_USE_CUDNN == 1 -template -static CuDNNDeconvolutionOp &GetCuDNNDeconvOp(const DeconvolutionParam& param, - int forward_compute_type, - int backward_compute_type, - const std::vector& in_shape, - const std::vector& out_shape, - const Context& ctx) { - static thread_local CuDNNDeconvolutionOp op; - op.Init(param, forward_compute_type, backward_compute_type, in_shape, out_shape, ctx); - return op; -} -#endif - template<> -void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const DeconvolutionParam& param = nnvm::get(attrs.parsed); - int dtype = inputs[0].type_flag_; +Operator* CreateOp(DeconvolutionParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx) { + // Logic here parallels that in Convolution.cu + Operator *op = NULL; #if MXNET_USE_CUDNN == 1 // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16). 
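The hunk above removes the helper that kept a cuDNN deconvolution operator in thread-local storage and re-initialized it on every call, and restores the older factory that builds the operator once when the graph is bound. The caching pattern being dropped looks roughly like this (a simplified sketch, not the actual MXNet code; Op is assumed to expose an Init() that reconfigures an existing instance):

// One operator instance per thread, reconfigured per call instead of
// re-allocated; this is the shape of the removed GetCuDNNDeconvOp() helper.
template <typename Op, typename Param>
Op& GetCachedOp(const Param& param) {
  static thread_local Op op;  // constructed once per thread
  op.Init(param);             // cheap reconfiguration for the current call
  return op;
}

With the revert, the restored CreateOp factory constructs the operator at bind time and keeps it alive, so no per-call cache is needed on this path.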
@@ -61,88 +45,23 @@ void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { if (param.cudnn_off) { - DeconvolutionOp op; - op.Init(param); - op.Forward(ctx, inputs, req, outputs); - } else if (!CuDNNDeconvolutionOp::Supports(param, - compute_type, compute_type, ctx.run_ctx.ctx)) { + op = new DeconvolutionOp(param); + } else if (!CuDNNDeconvolutionOp::Supports(param, compute_type, compute_type, ctx)) { LOG(WARNING) << "This deconvolution is not supported by cudnn, MXNET deconvolution is applied."; - DeconvolutionOp op; - op.Init(param); - op.Forward(ctx, inputs, req, outputs); + op = new DeconvolutionOp(param); } else { - std::vector in_shape(inputs.size()); - std::vector out_shape(1, outputs[0].shape_); - for (size_t i = 0; i < in_shape.size(); i++) { - in_shape[i] = inputs[i].shape_; - } - GetCuDNNDeconvOp(param, compute_type, compute_type, - in_shape, out_shape, ctx.run_ctx.ctx).Forward(ctx, inputs, req, outputs); + op = new CuDNNDeconvolutionOp(param, compute_type, compute_type, + *in_shape, *out_shape, ctx); } }) #else MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - DeconvolutionOp op; - op.Init(param); - op.Forward(ctx, inputs, req, outputs); - }) -#endif // MXNET_USE_CUDNN -} - -template<> -void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const DeconvolutionParam& param = nnvm::get(attrs.parsed); - std::vector in_data(inputs.begin() + 1, inputs.end()); - const TBlob &out_grad = inputs[0]; - const std::vector &in_grad = outputs; - int dtype = out_grad.type_flag_; - -#if MXNET_USE_CUDNN == 1 - // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16). - int compute_type = (dtype == mshadow::kFloat16) ? 
mshadow::kFloat32 : dtype; - - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - if (param.cudnn_off) { - DeconvolutionOp op; - op.Init(param); - op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); - } else if (!CuDNNDeconvolutionOp::Supports(param, - compute_type, compute_type, ctx.run_ctx.ctx)) { - LOG(WARNING) << - "This deconvolution is not supported by cudnn, MXNET deconvolution is applied."; - DeconvolutionOp op; - op.Init(param); - op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); - } else { - std::vector in_shape(in_data.size()); - std::vector out_shape(1, out_grad.shape_); - for (size_t i = 0; i < in_shape.size(); i++) { - in_shape[i] = in_data[i].shape_; - } - GetCuDNNDeconvOp(param, compute_type, compute_type, - in_shape, out_shape, ctx.run_ctx.ctx).Backward(ctx, - std::vector{out_grad}, in_data, req, in_grad); - } - }) -#else - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - DeconvolutionOp op; - op.Init(param); - op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + op = new DeconvolutionOp(param); }) #endif // MXNET_USE_CUDNN + return op; } -NNVM_REGISTER_OP(Deconvolution) -.set_attr("FCompute", DeconvolutionCompute); - -NNVM_REGISTER_OP(_backward_Deconvolution) -.set_attr("FCompute", DeconvolutionGradCompute); - } // namespace op } // namespace mxnet diff --git a/src/operator/nn/depthwise_convolution-inl.h b/src/operator/nn/depthwise_convolution-inl.h index 0af8cae51c84..c4b7a4787554 100644 --- a/src/operator/nn/depthwise_convolution-inl.h +++ b/src/operator/nn/depthwise_convolution-inl.h @@ -39,11 +39,11 @@ namespace mxnet { namespace op { using namespace tf::depthwise_conv; template -class DepthwiseConvolutionOp { +class DepthwiseConvolutionOp : public Operator { public: - void Init(const ConvolutionParam& param, - const std::vector& in_shape, - const std::vector& out_shape) { + explicit DepthwiseConvolutionOp(const ConvolutionParam& param, + const std::vector& in_shape, + const std::vector& out_shape) { args_.batch = in_shape[conv::kData][0]; args_.in_channel = in_shape[conv::kData][1]; args_.in_height = in_shape[conv::kData][2]; @@ -62,16 +62,19 @@ class DepthwiseConvolutionOp { ~DepthwiseConvolutionOp() {} - void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data); + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args); - void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &req, - const std::vector &in_grad); + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args); private: DepthwiseArgs args_; @@ -279,7 +282,8 @@ template void DepthwiseConvolutionOp::Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, - const std::vector &out_data) { + const std::vector &out_data, + const std::vector &aux_states) { using namespace mshadow; using namespace mshadow::expr; auto stream = ctx.get_stream(); @@ -301,8 +305,10 @@ template void DepthwiseConvolutionOp::Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, + const std::vector &out_data, const std::vector &req, - const std::vector &in_grad) { + const std::vector &in_grad, + const std::vector &aux_states) { using namespace mshadow; using 
namespace mshadow::expr; auto stream = ctx.get_stream(); diff --git a/src/operator/nn/depthwise_convolution_tf.cuh b/src/operator/nn/depthwise_convolution_tf.cuh index e4dfd8292d2d..c7f48e686136 100644 --- a/src/operator/nn/depthwise_convolution_tf.cuh +++ b/src/operator/nn/depthwise_convolution_tf.cuh @@ -24,8 +24,8 @@ * are different with origin version. * \author shuqian.qu@hobot.cc */ -#ifndef MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_ -#define MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_ +#ifndef MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_ +#define MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_ #include "../../common/cuda_utils.h" #include "../mxnet_op.h" @@ -730,4 +730,4 @@ bool TryLaunchDepthwiseConv2dBackwardFilterGPUSmall(mshadow::Stream } // namespace depthwise_conv } // namespace tf -#endif // MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_ +#endif // MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_ diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h index cff35a3cef7f..715a6f4ee219 100644 --- a/src/operator/nn/dropout-inl.h +++ b/src/operator/nn/dropout-inl.h @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file dropout-inl.h * \brief - * \author Bing Xu, Da Zheng + * \author Bing Xu */ #ifndef MXNET_OPERATOR_NN_DROPOUT_INL_H_ @@ -71,7 +71,7 @@ struct DropoutParam : public dmlc::Parameter { }; // struct DropoutParam template -class DropoutOp { +class DropoutOp : public Operator { #if defined(USE_MKL) && defined(_OPENMP) static void BernoulliGenerate(common::random::RandGenerator gen, int n, double p, int* r) { @@ -206,15 +206,16 @@ class DropoutOp { } }; - void Init(const DropoutParam ¶m) { + explicit DropoutOp(DropoutParam param) { this->pkeep_ = 1.0f - param.p; this->mode_ = static_cast(param.mode); } - void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { if (req[dropout::kOut] != kNullOp) { CHECK_EQ(in_data.size(), 1U); if (ctx.is_train) { @@ -248,13 +249,17 @@ class DropoutOp { } } - void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad) { + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { using namespace mshadow; using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1U); + CHECK_EQ(in_grad.size(), 1U); Stream *s = ctx.get_stream(); if (ctx.is_train || mode_ == dropout::kAlways) { if (!MKLBackward(s, this->pkeep_, in_grad, out_data, out_grad)) { @@ -288,42 +293,110 @@ class DropoutOp { dropout::DropoutOpMode mode_; }; // class DropoutOp -template -void DropoutCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const DropoutParam& param = nnvm::get(attrs.parsed); - MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - static thread_local DropoutOp op; - op.Init(param); - op.Forward(ctx, inputs, req, outputs); - }); -} template -void DropoutGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const DropoutParam& param = 
nnvm::get(attrs.parsed); - CHECK_EQ(inputs.size(), 2U); - CHECK_EQ(outputs.size(), 1); - CHECK_EQ(req.size(), 1); - std::vector out_grads(2); - std::vector out_data(2); - out_grads[dropout::kOut] = inputs[0]; - out_data[dropout::kMask] = inputs[1]; - - MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - static thread_local DropoutOp op; - op.Init(param); - op.Backward(ctx, out_grads, out_data, req, outputs); - }); -} +Operator *CreateOp(DropoutParam param, int dtype); + +#if DMLC_USE_CXX11 +class DropoutProp : public OperatorProperty { + public: + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 1U); + const TShape &dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + out_shape->clear(); + out_shape->push_back(dshape); + out_shape->push_back(dshape); + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_EQ(in_type->size(), 1U); + int dtype = in_type->at(0); + + if (dtype == -1) { + LOG(FATAL) << "input type to dropout is not specified."; + return false; + } + + size_t nout = this->ListOutputs().size(); + out_type->clear(); + for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new DropoutProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "Dropout"; + } + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {out_grad[dropout::kOut], out_data[dropout::kMask]}; + } + + std::vector > BackwardInplaceOption( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &in_grad) const override { + return {{out_grad[dropout::kOut], in_grad[dropout::kData]}}; + } + + std::vector > ForwardInplaceOption( + const std::vector &in_data, + const std::vector &out_data) const override { + return {{in_data[dropout::kData], out_data[dropout::kOut]}}; + } + + std::vector ForwardResource(const std::vector &in_shape) const override { + return { ResourceRequest::kParallelRandom }; + } + + int NumVisibleOutputs() const override { + return 1; + } + + int NumOutputs() const override { + return 2; + } + + std::vector ListOutputs() const override { + return {"output", "mask"}; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented"; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + DropoutParam param_; +}; // class DropoutProp +#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_NN_DROPOUT_INL_H_ diff --git a/src/operator/nn/dropout.cc b/src/operator/nn/dropout.cc index dd5f1e58fbe5..3aa832a71356 100644 --- a/src/operator/nn/dropout.cc +++ b/src/operator/nn/dropout.cc @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file dropout.cc * \brief - * \author Bing Xu, Da Zheng + * \author Bing Xu */ #include "./dropout-inl.h" @@ -29,21 +29,24 @@ namespace mxnet { namespace op { - -struct DropoutGrad { - const char *op_name; - std::vector operator()(const nnvm::NodePtr& n, - const std::vector& 
ograds) const { - std::vector heads; - heads.push_back(ograds[0]); - heads.emplace_back(nnvm::NodeEntry{n, dropout::kMask, 0}); - return MakeGradNode(op_name, n, heads, n->attrs.dict); - } -}; +template<> +Operator *CreateOp(DropoutParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new DropoutOp(param); + }); + return op; +} + +// DO_BIND_DISPATCH comes from operator_common.h +Operator *DropoutProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); +} DMLC_REGISTER_PARAMETER(DropoutParam); -NNVM_REGISTER_OP(Dropout) +MXNET_REGISTER_OP_PROPERTY(Dropout, DropoutProp) .describe(R"(Applies dropout operation to input array. - During training, each element of the input is set to zero with probability p. @@ -74,66 +77,8 @@ Example:: [[ 3. 0.5 -0.5 2. 7. ] [ 2. -0.4 7. 3. 0.2 ]] )" ADD_FILELINE) -.set_num_inputs(1) -.set_num_outputs(2) -.set_attr_parser(ParamParser) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - return std::vector{"data"}; -}) -.set_attr("FListOutputNames", - [](const NodeAttrs& attrs) { - return std::vector{"output", "mask"}; -}) -.set_attr("FNumVisibleOutputs", - [](const NodeAttrs& attrs) { - return 1; -}) -.set_attr("FInferShape", [](const nnvm::NodeAttrs& attrs, - std::vector *in_shape, std::vector *out_shape){ - using namespace mshadow; - CHECK_EQ(in_shape->size(), 1U); - const TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; - out_shape->clear(); - out_shape->push_back(dshape); - out_shape->push_back(dshape); - return true; -}) -.set_attr("FInferType", [](const nnvm::NodeAttrs& attrs, - std::vector *in_type, std::vector *out_type) { - CHECK_EQ(in_type->size(), 1U); - int dtype = in_type->at(0); - - if (dtype == -1) { - LOG(FATAL) << "input type to dropout is not specified."; - return false; - } - - size_t nout = 2; - out_type->clear(); - for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype); - return true; -}) -.set_attr("FCompute", DropoutCompute) -.set_attr("FGradient", DropoutGrad{"_backward_Dropout"}) -.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ - return std::vector >{{0, 0}}; -}) -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ ResourceRequest::kParallelRandom }; -}) .add_argument("data", "NDArray-or-Symbol", "Input array to which dropout will be applied.") .add_arguments(DropoutParam::__FIELDS__()); -NNVM_REGISTER_OP(_backward_Dropout) -.set_num_outputs(1) -.set_attr("TIsBackward", true) -.set_attr_parser(ParamParser) -.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ - return std::vector >{{0, 0}}; -}) -.set_attr("FCompute", DropoutGradCompute); - } // namespace op } // namespace mxnet diff --git a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu index e655278822a4..f416c5883203 100644 --- a/src/operator/nn/dropout.cu +++ b/src/operator/nn/dropout.cu @@ -21,20 +21,21 @@ * Copyright (c) 2015 by Contributors * \file dropout.cc * \brief - * \author Bing Xu, Da Zheng + * \author Bing Xu */ #include "./dropout-inl.h" namespace mxnet { namespace op { - -NNVM_REGISTER_OP(Dropout) -.set_attr("FCompute", DropoutCompute); - -NNVM_REGISTER_OP(_backward_Dropout) -.set_attr("FCompute", DropoutGradCompute); - +template<> +Operator *CreateOp(DropoutParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + op = new DropoutOp(param); + }); + return op; +} } // namespace op } // namespace mxnet diff --git 
a/src/operator/nn/fully_connected-inl.h b/src/operator/nn/fully_connected-inl.h index e8e95643e647..9f3deec2449f 100644 --- a/src/operator/nn/fully_connected-inl.h +++ b/src/operator/nn/fully_connected-inl.h @@ -43,7 +43,6 @@ namespace op { // These enums are only visible within this header namespace fullc { enum FullyConnectedOpInputs {kData, kWeight, kBias}; -enum FullyConnectedOpResource {kTempSpace}; enum FullyConnectedOpOutputs {kOut}; } // fullc @@ -62,160 +61,240 @@ struct FullyConnectedParam : public dmlc::Parameter { } }; +/** + * \brief This is the implementation of fully connected operator. + * \tparam xpu The device that the op will be executed on. + */ template -void FCForward(const OpContext &ctx, const FullyConnectedParam ¶m, - const std::vector &in_data, const std::vector &req, - const std::vector &out_data) { - using namespace mshadow; - using namespace mshadow::expr; - if (req[fullc::kOut] == kNullOp) return; - CHECK_EQ(req[fullc::kOut], kWriteTo); - // TODO(bing): check the BLAS Handle, be careful - // maybe need blas handle from context - // TODO(bing): judge shape to remove flatten op - Stream *s = ctx.get_stream(); +class FullyConnectedOp : public Operator { + public: + explicit FullyConnectedOp(FullyConnectedParam p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + if (req[fullc::kOut] == kNullOp) return; + CHECK_EQ(req[fullc::kOut], kWriteTo); + size_t expected = param_.no_bias ? 2 : 3; + CHECK_EQ(in_data.size(), expected); + CHECK_EQ(out_data.size(), 1U); + // TODO(bing): check the BLAS Handle, be careful + // maybe need blas handle from context + // TODO(bing): judge shape to remove flatten op + Stream *s = ctx.get_stream(); #if defined(__CUDACC__) - CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) - << "Must init CuBLAS handle in stream"; + CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) + << "Must init CuBLAS handle in stream"; #endif // __CUDACC__ - const TShape& ishape = in_data[fullc::kData].shape_; - const TShape& oshape = out_data[fullc::kOut].shape_; - - Tensor wmat = in_data[fullc::kWeight].get(s); - Tensor data, out; - if (!param.flatten) { - data = in_data[fullc::kData].get_with_shape( - Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); - out = out_data[fullc::kOut].get_with_shape( - Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); - } else { - data = in_data[fullc::kData].get_with_shape( - Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); - out = out_data[fullc::kOut].get_with_shape( - Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s); - } + const TShape& ishape = in_data[fullc::kData].shape_; + const TShape& oshape = out_data[fullc::kOut].shape_; - // Legacy approach shown here for comparison: - // out = dot(data, wmat.T()); - linalg_gemm(data, wmat, out, false, true, s); - if (!param.no_bias) { - Tensor bias = in_data[fullc::kBias].get(s); - out += repmat(bias, data.size(0)); - } -} + Tensor wmat = in_data[fullc::kWeight].get(s); + Tensor data, out; + if (!param_.flatten) { + data = in_data[fullc::kData].get_with_shape( + Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); + out = out_data[fullc::kOut].get_with_shape( + Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); + } else { + data = 
in_data[fullc::kData].get_with_shape( + Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); + out = out_data[fullc::kOut].get_with_shape( + Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s); + } -template -void FCBackward(const OpContext &ctx, const FullyConnectedParam ¶m, - const std::vector &out_grad, const std::vector &in_data, - const std::vector &req, const std::vector &in_grad) { - using namespace mshadow; - using namespace mshadow::expr; - // TODO(bing): check the BLAS Handle, be careful - // maybe need blas handle from context - Stream *s = ctx.get_stream(); - const TShape& ishape = in_data[fullc::kData].shape_; - const TShape& oshape = out_grad[fullc::kOut].shape_; - - Tensor wmat = in_data[fullc::kWeight].get(s); - Tensor data, grad, gdata; - if (!param.flatten) { - data = in_data[fullc::kData].get_with_shape( - Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); - grad = out_grad[fullc::kOut].get_with_shape( - Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); - gdata = in_grad[fullc::kData].get_with_shape( - Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); - } else { - data = in_data[fullc::kData].get_with_shape( - Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); - grad = out_grad[fullc::kOut].get_with_shape( - Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s); - gdata = in_grad[fullc::kData].get_with_shape( - Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); + // Legacy approach shown here for comparison: + // out = dot(data, wmat.T()); + linalg_gemm(data, wmat, out, false, true, s); + if (!param_.no_bias) { + Tensor bias = in_data[fullc::kBias].get(s); + out += repmat(bias, data.size(0)); + } } + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1U); + size_t expected = param_.no_bias ? 
2 : 3; + CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(req.size(), expected); + // TODO(bing): check the BLAS Handle, be careful + // maybe need blas handle from context + Stream *s = ctx.get_stream(); + const TShape& ishape = in_data[fullc::kData].shape_; + const TShape& oshape = out_grad[fullc::kOut].shape_; + + Tensor wmat = in_data[fullc::kWeight].get(s); + Tensor data, grad, gdata; + if (!param_.flatten) { + data = in_data[fullc::kData].get_with_shape( + Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); + grad = out_grad[fullc::kOut].get_with_shape( + Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); + gdata = in_grad[fullc::kData].get_with_shape( + Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); + } else { + data = in_data[fullc::kData].get_with_shape( + Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); + grad = out_grad[fullc::kOut].get_with_shape( + Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s); + gdata = in_grad[fullc::kData].get_with_shape( + Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); + } + #if defined(__CUDACC__) - CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) - << "Must init CuBLAS handle in stream"; + CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) + << "Must init CuBLAS handle in stream"; #endif - // backprop - CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace"; - // gradient of weight - Tensor gwmat = in_grad[fullc::kWeight].get(s); - // Legacy approach shown here for comparison: - // out = Assign(gwmat, req[fullc::kWeight], dot(grad.T(), data)); - linalg_gemm(grad, data, gwmat, true, false, s, req[fullc::kWeight]); - // gradient of bias - if (!param.no_bias) { - Tensor gbias = in_grad[fullc::kBias].get(s); - Assign(gbias, req[fullc::kBias], sum_rows(grad)); + // backprop + CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace"; + // gradient of weight + Tensor gwmat = in_grad[fullc::kWeight].get(s); + // Legacy approach shown here for comparison: + // out = Assign(gwmat, req[fullc::kWeight], dot(grad.T(), data)); + linalg_gemm(grad, data, gwmat, true, false, s, req[fullc::kWeight]); + // gradient of bias + if (!param_.no_bias) { + Tensor gbias = in_grad[fullc::kBias].get(s); + Assign(gbias, req[fullc::kBias], sum_rows(grad)); + } + // gradient of data + // Legacy approach shown here for comparison: + // Assign(gdata, req[fullc::kData], dot(grad, wmat)); + linalg_gemm(grad, wmat, gdata, false, false, s, req[fullc::kData]); } - // gradient of data - // Legacy approach shown here for comparison: - // Assign(gdata, req[fullc::kData], dot(grad, wmat)); - linalg_gemm(grad, wmat, gdata, false, false, s, req[fullc::kData]); -} + private: + FullyConnectedParam param_; +}; // class FullyConnectedOp + +// Decalre Factory function, used for dispatch specialization template -void FullyConnectedCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const FullyConnectedParam& param = nnvm::get(attrs.parsed); - uint32_t in_expected = param.no_bias ? 
2 : 3; - CHECK_EQ(inputs.size(), in_expected); - CHECK_EQ(outputs.size(), 1U); - int dtype = inputs[0].type_flag_; - - switch (dtype) { - case mshadow::kFloat32: - FCForward(ctx, param, inputs, req, outputs); - break; - case mshadow::kFloat64: - FCForward(ctx, param, inputs, req, outputs); - break; - case mshadow::kFloat16: - LOG(FATAL) << "float16 fully connected layer is currently" - "only supported by CuDNN version."; - break; - default: - LOG(FATAL) << "Unsupported type " << dtype; +Operator* CreateOp(FullyConnectedParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx); + +#if DMLC_USE_CXX11 +class FullyConnectedProp : public OperatorProperty { + public: + std::vector ListArguments() const override { + if (!param_.no_bias) { + return {"data", "weight", "bias"}; + } else { + return {"data", "weight"}; + } } -} -template -void FullyConnectedGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const FullyConnectedParam& param = nnvm::get(attrs.parsed); - uint32_t out_expected = param.no_bias ? 2 : 3; - CHECK_EQ(inputs.size(), 3U); - CHECK_EQ(outputs.size(), out_expected); - CHECK_EQ(req.size(), out_expected); - - std::vector out_grad{inputs[0]}; - std::vector in_data(inputs.begin() + 1, inputs.end()); - int dtype = inputs[0].type_flag_; - - switch (dtype) { - case mshadow::kFloat32: - FCBackward(ctx, param, out_grad, in_data, req, outputs); - break; - case mshadow::kFloat64: - FCBackward(ctx, param, out_grad, in_data, req, outputs); - break; - case mshadow::kFloat16: - LOG(FATAL) << "float16 fully connected layer is currently" - "only supported by CuDNN version."; - break; - default: - LOG(FATAL) << "Unsupported type " << dtype; + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); } -} + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + if (!param_.no_bias) { + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; + } else { + CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; + } + CHECK_EQ(out_shape->size(), 1U); + TShape dshape = (*in_shape)[fullc::kData]; + TShape oshape = (*out_shape)[0]; + // require data to be known + if (dshape.ndim() == 0) return false; + + index_t num_input; + if (!param_.flatten) { + num_input = dshape[dshape.ndim()-1]; + } else { + num_input = dshape.ProdShape(1, dshape.ndim()); + } + SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param_.num_hidden, num_input)); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, fullc::kBias, Shape1(param_.num_hidden)); + } + + if (!param_.flatten) { + TShape result_shape(dshape); + result_shape[dshape.ndim()-1] = param_.num_hidden; + SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape); + } else { + SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param_.num_hidden)); + } + if (oshape.ndim() != 0) { + dshape[0] = oshape[0]; + SHAPE_ASSIGN_CHECK(*in_shape, fullc::kData, dshape); + } + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 1U); + nnvm::NodeAttrs attrs; + attrs.name = "FullyConnected"; + return ElemwiseAttr( + attrs, in_type, out_type, -1); + } + + OperatorProperty* Copy() const override { + FullyConnectedProp* fc_sym = new FullyConnectedProp(); + fc_sym->param_ = 
this->param_; + return fc_sym; + } + + std::string TypeString() const override { + return "FullyConnected"; + } + + // decalre dependency and inplace optimization options + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {out_grad[fullc::kOut], in_data[fullc::kData], in_data[fullc::kWeight]}; + } + + std::vector > BackwardInplaceOption( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &in_grad) const override { + return {{in_data[fullc::kData], in_grad[fullc::kData]}}; + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + FullyConnectedParam param_; +}; // class FullyConnectedSymbol +#endif } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_NN_FULLY_CONNECTED_INL_H_ diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index 4362408a23a1..9a978160297d 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -23,153 +23,58 @@ * \brief fully connect operator */ #include "./fully_connected-inl.h" -#include "./mkldnn/mkldnn_ops-inl.h" -#include "./mkldnn/mkldnn_base-inl.h" #if MXNET_USE_NNPACK == 1 #include "./nnpack/nnpack_fully_connected-inl.h" #endif // MXNET_USE_NNPACK namespace mxnet { namespace op { - -static bool FullyConnectedShape(const nnvm::NodeAttrs& attrs, - std::vector *in_shape, - std::vector *out_shape) { - const FullyConnectedParam& param = nnvm::get(attrs.parsed); - using namespace mshadow; - if (!param.no_bias) { - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; - } else { - CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; - } - CHECK_EQ(out_shape->size(), 1U); - TShape dshape = (*in_shape)[fullc::kData]; - TShape oshape = (*out_shape)[0]; - // require data to be known - if (dshape.ndim() == 0) return false; - - index_t num_input; - if (!param.flatten) { - num_input = dshape[dshape.ndim()-1]; - } else { - num_input = dshape.ProdShape(1, dshape.ndim()); - } - SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param.num_hidden, num_input)); - if (!param.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, fullc::kBias, Shape1(param.num_hidden)); - } - - if (!param.flatten) { - TShape result_shape(dshape); - result_shape[dshape.ndim()-1] = param.num_hidden; - SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape); - } else { - SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param.num_hidden)); - } - if (oshape.ndim() != 0) { - dshape[0] = oshape[0]; - SHAPE_ASSIGN_CHECK(*in_shape, fullc::kData, dshape); - } - return true; -} - -#if MXNET_USE_MKLDNN == 1 -void FullyConnectedComputeExCPU(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - if (SupportMKLDNN(inputs[0])) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNNFCForward(attrs, ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(FullyConnectedCompute, attrs, ctx, inputs, req, - outputs); - return; - } - FallBackCompute(FullyConnectedCompute, attrs, ctx, inputs, req, outputs); -} - -void FullyConnectedGradComputeExCPU(const nnvm::NodeAttrs& attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - if 
(SupportMKLDNN(inputs[0])) { - MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); - MKLDNNFCBackward(attrs, ctx, inputs, req, outputs); - MKLDNN_OPCHECK_RUN(FullyConnectedGradCompute, attrs, ctx, inputs, req, - outputs); - return; +template<> +Operator* CreateOp(FullyConnectedParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx) { + Operator *op = NULL; +#if MXNET_USE_NNPACK == 1 + const size_t batch_size = (*in_shape)[0][0]; + // nnp_fully_connected_inference will do optimization for batch-size = 1 + // nnp_fully_connected_output will do optimization for batch-size > 1 + switch (dtype) { + case mshadow::kFloat32: + return new NNPACKFullyConnectedOp(param); + default: + break; } - FallBackCompute(FullyConnectedGradCompute, attrs, ctx, inputs, req, outputs); -} #endif - -static bool FullyConnectedType(const nnvm::NodeAttrs& attrs, - std::vector *in_type, std::vector *out_type) { - CHECK_GE(in_type->size(), 1U); - return ElemwiseAttr( - attrs, in_type, out_type, -1); -} - -struct FullyConnectedGrad { - const char *op_name; - std::vector operator()(const nnvm::NodePtr& n, - const std::vector& ograds) const { - std::vector heads(ograds.begin(), ograds.end()); - heads.push_back(n->inputs[fullc::kData]); - heads.push_back(n->inputs[fullc::kWeight]); - return MakeGradNode(op_name, n, heads, n->attrs.dict); + switch (dtype) { + case mshadow::kFloat32: + op = new FullyConnectedOp(param); + break; + case mshadow::kFloat64: + op = new FullyConnectedOp(param); + break; + case mshadow::kFloat16: + LOG(FATAL) << "float16 fully connected layer is currently" + "only supported by CuDNN version."; + break; + default: + LOG(FATAL) << "Unsupported type " << dtype; } -}; - -inline static bool FCStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - const FullyConnectedParam& param = nnvm::get(attrs.parsed); - uint32_t in_expected = param.no_bias ? 2 : 3; - CHECK_EQ(in_attrs->size(), in_expected); - CHECK_EQ(out_attrs->size(), 1); - DispatchMode wanted_mode; -#if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) - wanted_mode = DispatchMode::kFComputeEx; - else -#endif - wanted_mode = DispatchMode::kFCompute; - return storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, wanted_mode); + return op; } -inline static bool BackwardFCStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - const FullyConnectedParam& param = nnvm::get(attrs.parsed); - uint32_t out_expected = param.no_bias ? 2 : 3; - CHECK_EQ(in_attrs->size(), 3U); - CHECK_EQ(out_attrs->size(), out_expected); - - DispatchMode wanted_mode; -#if 0 - // TODO(zhengda) let's disable MKLDNN for FullyConnected for now. - // It seems there is a bug. 
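The FullyConnectedOp::Forward/Backward bodies earlier in this file's companion header express the layer entirely through linalg_gemm calls: with X the flattened n x k input, W the m x k weight and Y the n x m output, they compute Y = X W^T + b, dW = dY^T X, dX = dY W, and db as the column-sums of dY. A loop-based reference of the forward contraction (illustrative only; the names and shapes here are mine, not MXNet's):

#include <cstddef>
#include <vector>

// Y = X * W^T + b, the same contraction as linalg_gemm(data, wmat, out, false, true).
// X: n x k, W: m x k, b: length m (empty when no_bias), Y: n x m.
void fc_forward_ref(const std::vector<float>& X, const std::vector<float>& W,
                    const std::vector<float>& b, std::vector<float>* Y,
                    std::size_t n, std::size_t k, std::size_t m) {
  for (std::size_t i = 0; i < n; ++i) {
    for (std::size_t j = 0; j < m; ++j) {
      float acc = b.empty() ? 0.0f : b[j];
      for (std::size_t t = 0; t < k; ++t)
        acc += X[i * k + t] * W[j * k + t];
      (*Y)[i * m + j] = acc;
    }
  }
}

The transpose flags on the gemm calls are what encode dY^T X versus dY W in the backward pass, so the weight gradient never needs an explicit transpose of the activation matrix.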
- if (dev_mask == mshadow::cpu::kDevMask) - *dispatch_mode = DispatchMode::kFComputeEx; - else -#endif - wanted_mode = DispatchMode::kFCompute; - return storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, wanted_mode); +// DO_BIND_DISPATCH comes from operator_common.h +Operator *FullyConnectedProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + std::vector out_shape(1, TShape()), aux_shape; + CHECK(InferShape(in_shape, &out_shape, &aux_shape)); + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx); } DMLC_REGISTER_PARAMETER(FullyConnectedParam); -NNVM_REGISTER_OP(FullyConnected) +MXNET_REGISTER_OP_PROPERTY(FullyConnected, FullyConnectedProp) .describe(R"code(Applies a linear transformation: :math:`Y = XW^T + b`. If ``flatten`` is set to be true, then the shapes are: @@ -191,59 +96,9 @@ The learnable parameters include both ``weight`` and ``bias``. If ``no_bias`` is set to be true, then the ``bias`` term is ignored. )code" ADD_FILELINE) -.set_num_inputs([](const NodeAttrs& attrs) { - const FullyConnectedParam& params = nnvm::get(attrs.parsed); - return params.no_bias ? 2 : 3; -}) -.set_num_outputs(1) -.set_attr_parser(ParamParser) -.set_attr("FInferStorageType", FCStorageType) -.set_attr("FListInputNames", [](const NodeAttrs& attrs) { - const FullyConnectedParam& params = nnvm::get(attrs.parsed); - if (!params.no_bias) { - return std::vector{"data", "weight", "bias"}; - } else { - return std::vector{"data", "weight"}; - } -}) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; -}) -#endif -.set_attr("FInferShape", FullyConnectedShape) -.set_attr("FInferType", FullyConnectedType) -.set_attr("FCompute", FullyConnectedCompute) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FComputeEx", FullyConnectedComputeExCPU) -#endif -.set_attr("FGradient", FullyConnectedGrad{"_backward_FullyConnected"}) .add_argument("data", "NDArray-or-Symbol", "Input data.") .add_argument("weight", "NDArray-or-Symbol", "Weight matrix.") .add_argument("bias", "NDArray-or-Symbol", "Bias parameter.") .add_arguments(FullyConnectedParam::__FIELDS__()); - -NNVM_REGISTER_OP(_backward_FullyConnected) -.set_num_inputs(3) -.set_num_outputs([](const NodeAttrs& attrs) { - const FullyConnectedParam& params = nnvm::get(attrs.parsed); - return params.no_bias ? 2 : 3; -}) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; -}) -#endif -.set_attr("TIsBackward", true) -.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ - return std::vector >{{1, 0}}; -}) -.set_attr("FInferStorageType", BackwardFCStorageType) -.set_attr_parser(ParamParser) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FComputeEx", FullyConnectedGradComputeExCPU) -#endif -.set_attr("FCompute", FullyConnectedGradCompute); - } // namespace op } // namespace mxnet diff --git a/src/operator/nn/fully_connected.cu b/src/operator/nn/fully_connected.cu index c89d37767c4a..279a378e2ad4 100644 --- a/src/operator/nn/fully_connected.cu +++ b/src/operator/nn/fully_connected.cu @@ -25,50 +25,16 @@ #include "./fully_connected-inl.h" namespace mxnet { namespace op { - template<> -void FullyConnectedCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const FullyConnectedParam& param = nnvm::get(attrs.parsed); - uint32_t in_expected = param.no_bias ? 
2 : 3; - CHECK_EQ(inputs.size(), in_expected); - CHECK_EQ(outputs.size(), 1U); - int dtype = inputs[0].type_flag_; - +Operator* CreateOp(FullyConnectedParam param, int dtype, + std::vector *in_shape, + std::vector *out_shape, + Context ctx) { + Operator *op = NULL; MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - FCForward(ctx, param, inputs, req, outputs); - }); + op = new FullyConnectedOp(param); + }) + return op; } - -template<> -void FullyConnectedGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const FullyConnectedParam& param = nnvm::get(attrs.parsed); - uint32_t out_expected = param.no_bias ? 2 : 3; - CHECK_EQ(inputs.size(), 3U); - CHECK_EQ(outputs.size(), out_expected); - CHECK_EQ(req.size(), out_expected); - - std::vector out_grad{inputs[0]}; - std::vector in_data(inputs.begin() + 1, inputs.end()); - int dtype = inputs[0].type_flag_; - - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - FCBackward(ctx, param, out_grad, in_data, req, outputs); - }); -} - -NNVM_REGISTER_OP(FullyConnected) -.set_attr("FCompute", FullyConnectedCompute); - -NNVM_REGISTER_OP(_backward_FullyConnected) -.set_attr("FCompute", FullyConnectedGradCompute); - } // namespace op } // namespace mxnet diff --git a/src/operator/nn/lrn-inl.h b/src/operator/nn/lrn-inl.h deleted file mode 100644 index fdae1eca0aef..000000000000 --- a/src/operator/nn/lrn-inl.h +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * Copyright (c) 2015 by Contributors - * \file lrn-inl.h - * \brief - * \author Bing Xu -*/ -#ifndef MXNET_OPERATOR_NN_LRN_INL_H_ -#define MXNET_OPERATOR_NN_LRN_INL_H_ -#include -#include -#include -#include -#include -#include -#include -#include "../operator_common.h" -#include "../mshadow_op.h" - -namespace mxnet { -namespace op { - -namespace lrn_enum { -enum LRNInputs {kData}; -enum LRNOutputs {kOut, kTmpNorm}; -} // namespace lrn_enum - -struct LRNParam : public dmlc::Parameter { - float alpha; - float beta; - float knorm; - uint32_t nsize; - DMLC_DECLARE_PARAMETER(LRNParam) { - DMLC_DECLARE_FIELD(alpha).set_default(1e-4f) - .describe("The variance scaling parameter :math:`\alpha` in the LRN expression."); - DMLC_DECLARE_FIELD(beta).set_default(0.75f) - .describe("The power parameter :math:`\beta` in the LRN expression."); - DMLC_DECLARE_FIELD(knorm).set_default(2.0f) - .describe("The parameter :math:`k` in the LRN expression."); - DMLC_DECLARE_FIELD(nsize) - .describe("normalization window width in elements."); - } -}; // struct LRNParam - -template -void LRNForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { - using namespace mshadow; - using namespace mshadow::expr; - const LRNParam& param_ = nnvm::get(attrs.parsed); - // TODO(xxx): Test with gradient chceker - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 2U); - // CHECK_EQ(req.size(), 2); - CHECK_EQ(param_.nsize % 2, 1U) << "LRN only supports odd values for local_size"; - const real_t salpha = param_.alpha / param_.nsize; - Stream *s = ctx.get_stream(); - Tensor data = in_data[lrn_enum::kData].get(s); - Tensor out = out_data[lrn_enum::kOut].get(s); - Tensor tmp_norm = out_data[lrn_enum::kTmpNorm].get(s); - tmp_norm = chpool(F(data) , param_.nsize) * salpha + param_.knorm; - Assign(out, req[lrn_enum::kOut], data * F(tmp_norm, -param_.beta)); -} - -template -void LRNBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const TBlob &out_grad, const TBlob &in_data, - const TBlob &out_norm, const OpReqType &req, - const TBlob &in_grad) { - using namespace mshadow; - using namespace mshadow::expr; - const LRNParam& param_ = nnvm::get(attrs.parsed); - const real_t salpha = param_.alpha / param_.nsize; - Stream *s = ctx.get_stream(); - Tensor grad = out_grad.get(s); - Tensor tmp_norm = out_norm.get(s); - Tensor data = in_data.get(s); - Tensor grad_in = in_grad.get(s); - grad_in = grad * F(tmp_norm, -param_.beta); - grad_in += (- 2.0f * param_.beta * salpha) * - chpool(grad * data * - F(tmp_norm, -param_.beta - 1.0f), - param_.nsize) * data; -} - -template -void LRNCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - LRNForward(attrs, ctx, inputs, req, outputs); -} - -template -void LRNGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - LRNBackward(attrs, ctx, inputs[0], // out_grad - inputs[1], // in_data - inputs[2], // out_norm - req[lrn_enum::kData], outputs[lrn_enum::kData]); -} - -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_NN_LRN_INL_H_ diff --git a/src/operator/nn/lrn.cc b/src/operator/nn/lrn.cc deleted file mode 100644 index 2359b49abab6..000000000000 --- a/src/operator/nn/lrn.cc +++ /dev/null @@ -1,203 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more 
contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2015 by Contributors - * \file lrn.cc - * \brief - * \author Bing Xu, Patric Zhao (patric.zhao@intel.com) -*/ - -#include "./lrn-inl.h" -#include "../operator_common.h" -#if MXNET_USE_MKLDNN == 1 -#include "./mkldnn/mkldnn_lrn-inl.h" -#endif - -namespace mxnet { -namespace op { - -bool LRNShape(const nnvm::NodeAttrs& attrs, - std::vector *in_shape, - std::vector *out_shape) { - using namespace mshadow; - CHECK_EQ(in_shape->size(), 1U) << "Input:[data]"; - const TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; - out_shape->clear(); - out_shape->push_back(dshape); - out_shape->push_back(dshape); - return true; -} - -inline std::vector ListArguments() { - return {"data"}; -} - -bool LRNType(const nnvm::NodeAttrs& attrs, - std::vector *in_type, - std::vector *out_type) { - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); - } - } - int n_out = 2; - out_type->clear(); - for (int i = 0; i < n_out; ++i ) out_type->push_back(dtype); - return true; -} - -struct LRNGrad { - const char *op_name; - std::vector operator()(const nnvm::NodePtr& n, - const std::vector& ograds) const { - std::vector heads; - heads.push_back(ograds[0]); // out_grad - heads.push_back(n->inputs[lrn_enum::kData]); - heads.emplace_back(nnvm::NodeEntry{n, lrn_enum::kTmpNorm, 0}); - return MakeGradNode(op_name, n, heads, n->attrs.dict); - } -}; - -bool LRNForwardInferStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK(!in_attrs->empty()); -#if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { - storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, DispatchMode::kFComputeEx); - return true; - } -#endif - storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, DispatchMode::kFCompute); - return true; -} - -bool LRNBackwardInferStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK(!in_attrs->empty()); -#if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { - storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, DispatchMode::kFComputeEx); - return true; - } -#endif - storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, DispatchMode::kFCompute); - return true; -} - -#if MXNET_USE_MKLDNN == 1 -void LRNComputeExCPU(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const 
std::vector &req, - const std::vector &outputs) { - const LRNParam ¶m = nnvm::get(attrs.parsed); - if (SupportMKLDNN(inputs[0])) { - // We only need to test one output array. - MKLDNN_OPCHECK_INIT(false, 1, inputs, outputs); - MKLDNNLRNForward(ctx, param, inputs[0], req[0], outputs[0]); - MKLDNN_OPCHECK_RUN(LRNCompute, attrs, ctx, inputs, req, outputs); - return; - } - FallBackCompute(LRNCompute, attrs, ctx, inputs, req, outputs); -} - -void LRNGradComputeExCPU(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - const LRNParam ¶m = nnvm::get(attrs.parsed); - const NDArray &out_grad = inputs[0]; - const NDArray &in_data = inputs[1]; - const NDArray &in_grad = outputs[0]; - - if (SupportMKLDNN(inputs[0])) { - MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); - MKLDNNLRNBackward(ctx, param, out_grad, in_data, req[0], in_grad); - MKLDNN_OPCHECK_RUN(LRNGradCompute, attrs, ctx, inputs, req, outputs); - return; - } - FallBackCompute(LRNGradCompute, attrs, ctx, inputs, req, outputs); -} -#endif - -DMLC_REGISTER_PARAMETER(LRNParam); - -NNVM_REGISTER_OP(LRN) -.describe(R"code(Applies local response normalization to the input. - -The local response normalization layer performs "lateral inhibition" by normalizing -over local input regions. - -If :math:`a_{x,y}^{i}` is the activity of a neuron computed by applying kernel :math:`i` at position -:math:`(x, y)` and then applying the ReLU nonlinearity, the response-normalized -activity :math:`b_{x,y}^{i}` is given by the expression: - -.. math:: - b_{x,y}^{i} = \frac{a_{x,y}^{i}}{\Bigg({k + \alpha \sum_{j=max(0, i-\frac{n}{2})}^{min(N-1, i+\frac{n}{2})} (a_{x,y}^{j})^{2}}\Bigg)^{\beta}} - -where the sum runs over :math:`n` "adjacent" kernel maps at the same spatial position, and :math:`N` is the total -number of kernels in the layer. - -)code" ADD_FILELINE) -.set_num_inputs(1) -.set_num_outputs(2) -.set_attr("FNumVisibleOutputs", - [](const NodeAttrs& attrs) { return 1; }) -.set_attr_parser(ParamParser) -.set_attr("FInferShape", LRNShape) -.set_attr("FInferType", LRNType) -.set_attr("FInferStorageType", LRNForwardInferStorageType) -.set_attr("FCompute", LRNCompute) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FComputeEx", LRNComputeExCPU) -#endif -.set_attr("FGradient", LRNGrad{"_backward_LRN"}) -.add_argument("data", "NDArray-or-Symbol", "Input data to LRN") -.add_arguments(LRNParam::__FIELDS__()); - -NNVM_REGISTER_OP(_backward_LRN) -.set_num_outputs(1) -.set_attr_parser(ParamParser) -.set_attr("FInferStorageType", LRNBackwardInferStorageType) -.set_attr("TIsBackward", true) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FComputeEx", LRNGradComputeExCPU) -#endif -.set_attr("FCompute", LRNGradCompute); - -} // namespace op -} // namespace mxnet diff --git a/src/operator/nn/mkldnn/mkldnn_act.cc b/src/operator/nn/mkldnn/mkldnn_act.cc deleted file mode 100644 index 71fdf4ca585b..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_act.cc +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn_act.cc - * \brief - * \author Da Zheng -*/ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "../../operator_common.h" -#include "../activation-inl.h" -#include "./mkldnn_base-inl.h" - -#if MXNET_USE_MKLDNN == 1 - -#include - -namespace mxnet { -namespace op { - -bool SupportMKLDNNAct(const ActivationParam& param) { - // We only enable ReLU for now. It seems other activations have some precision - // problems. - return param.act_type == activation::kReLU; -#if 0 - || param.act_type == activation::kSigmoid - || param.act_type == activation::kSoftReLU; -#endif -} - -static inline mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) { - switch (param.act_type) { - case activation::kReLU: - return mkldnn::algorithm::eltwise_relu; - case activation::kSigmoid: - return mkldnn::algorithm::eltwise_logistic; - case activation::kTanh: - return mkldnn::algorithm::eltwise_tanh; - case activation::kSoftReLU: - return mkldnn::algorithm::eltwise_soft_relu; - default: - LOG(FATAL) << "unknown activation type"; - return mkldnn::algorithm::eltwise_relu; - } -} - -typedef std::shared_ptr mkldnn_act_pdesc_ptr; - -static mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl( - const ActivationParam& param, bool is_train, - const mkldnn::memory &input_mem, int dtype) { - mkldnn::memory::primitive_desc data_mpd = input_mem.get_primitive_desc(); - mkldnn::memory::desc data_md = data_mpd.desc(); - auto cpu_engine = data_mpd.get_engine(); - - auto alg = GetMKLDNNActAlgo(param); - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - DType alpha = 0; - mkldnn::eltwise_forward::desc desc = is_train - ? 
mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_training, - alg, data_md, alpha) - : mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_scoring, - alg, data_md, alpha); - return mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine); - }); - LOG(INFO) << "Unsupported data type for MKLDNN activation"; - mkldnn::eltwise_forward::desc desc = mkldnn::eltwise_forward::desc( - mkldnn::prop_kind::forward_training, alg, data_md, 0.0); - return mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine); -} - -typedef MKLDNNParamOpSign MKLDNNActSignature; - -class MKLDNNActForward { - std::shared_ptr fwd; - std::shared_ptr data; - std::shared_ptr out; - - public: - const mkldnn::eltwise_forward::primitive_desc fwd_pd; - - MKLDNNActForward(const ActivationParam& param, bool is_train, - const NDArray &data, const mkldnn::memory &mem): fwd_pd( - GetActFwdDescImpl(param, is_train, mem, data.dtype())) { - } - - void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &output) { - if (this->data == nullptr) - this->data = std::shared_ptr(new mkldnn::memory( - data.get_primitive_desc(), data.get_data_handle())); - else - this->data->set_data_handle(data.get_data_handle()); - - CHECK(fwd_pd.dst_primitive_desc() == output.get_primitive_desc()); - if (this->out == nullptr) - this->out = std::shared_ptr(new mkldnn::memory( - fwd_pd.dst_primitive_desc(), output.get_data_handle())); - else - this->out->set_data_handle(output.get_data_handle()); - - if (this->fwd == nullptr) { - this->fwd = std::shared_ptr( - new mkldnn::eltwise_forward(fwd_pd, mkldnn::primitive::at(*this->data), - *this->out)); - } - } - - const mkldnn::eltwise_forward &GetFwd() const { - return *fwd; - } -}; - -static MKLDNNActForward &GetActForward(const ActivationParam& param, - const OpContext &ctx, const NDArray &in_data, - const mkldnn::memory &in_mem) { - static thread_local std::unordered_map fwds; - MKLDNNActSignature key(param); - key.AddSign(ctx.is_train); - key.AddSign(param.act_type); - key.AddSign(in_data); - - auto it = fwds.find(key); - if (it == fwds.end()) { - MKLDNNActForward fwd(param, ctx.is_train, in_data, in_mem); - auto ins_ret = fwds.insert(std::pair( - key, fwd)); - CHECK(ins_ret.second); - it = ins_ret.first; - } - return it->second; -} - -void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const NDArray &in_data, const OpReqType &req, - const NDArray &out_data) { - const ActivationParam& param = nnvm::get(attrs.parsed); - auto input_mem = in_data.GetMKLDNNData(); - MKLDNNActForward &fwd = GetActForward(param, ctx, in_data, *input_mem); - auto out_mem = const_cast(out_data).CreateMKLDNNData( - fwd.fwd_pd.dst_primitive_desc()); - fwd.SetNewMem(*input_mem, *out_mem); - MKLDNNStream *stream = MKLDNNStream::Get(); - stream->RegisterPrim(fwd.GetFwd()); - stream->Submit(); -} - -void MKLDNNActivationBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const NDArray &out_grad, const NDArray &in_data, - const OpReqType &req, const NDArray &in_grad) { - if (req == kNullOp) { - return; - } - - const ActivationParam& param = nnvm::get(attrs.parsed); - TmpMemMgr::Get()->Init(ctx.requested[activation::kTempSpace]); - auto diff_dst_memory = out_grad.GetMKLDNNData(); - auto input_mem = in_data.GetMKLDNNData(); - // We need to make sure the two inputs to eltwise_backward has the same memory - // descriptor. Otherwise, the perf will suffer. 
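The lookup in GetActForward above is a memoization pattern that recurs throughout these files: build a signature from the parameters and input metadata, then keep fully constructed primitives in a thread_local map. A stripped-down sketch of that pattern with a placeholder Primitive type (all names here are illustrative, not the deleted classes):

#include <cstdint>
#include <cstdio>
#include <unordered_map>
#include <vector>

struct Signature {
  std::vector<int> eles;
  uint64_t hash = 0;
  void Add(int v) { hash = hash * 2 + v; eles.push_back(v); }
  bool operator==(const Signature& o) const { return hash == o.hash && eles == o.eles; }
};
struct SignatureHash {
  size_t operator()(const Signature& s) const { return static_cast<size_t>(s.hash); }
};

struct Primitive {                 // stand-in for an expensive-to-build object
  explicit Primitive(int c) : cost(c) {}
  int cost;
};

Primitive& GetCachedPrimitive(int act_type, int ndim, int dim0) {
  static thread_local std::unordered_map<Signature, Primitive, SignatureHash> cache;
  Signature key;
  key.Add(act_type);
  key.Add(ndim);
  key.Add(dim0);
  auto it = cache.find(key);
  if (it == cache.end())
    it = cache.emplace(key, Primitive(act_type + ndim + dim0)).first;  // build once
  return it->second;
}

int main() {
  Primitive& p1 = GetCachedPrimitive(0, 4, 32);
  Primitive& p2 = GetCachedPrimitive(0, 4, 32);   // cache hit, same object
  std::printf("%d %d %d\n", p1.cost, p2.cost, &p1 == &p2);
  return 0;
}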
- if (input_mem->get_primitive_desc() != diff_dst_memory->get_primitive_desc()) - input_mem = in_data.GetMKLDNNDataReorder(diff_dst_memory->get_primitive_desc()); - mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); - mkldnn::memory::desc data_md = data_mpd.desc(); - mkldnn::memory::desc diff_md = diff_dst_memory->get_primitive_desc().desc(); - auto cpu_engine = data_mpd.get_engine(); - - MKLDNNStream *stream = MKLDNNStream::Get(); - auto alg = GetMKLDNNActAlgo(param); - mkldnn_output_t diff_src_memory; - - MSHADOW_REAL_TYPE_SWITCH(in_data.dtype(), DType, { - DType alpha = 0; - mkldnn::eltwise_forward::desc fw_desc(mkldnn::prop_kind::forward_training, - alg, data_md, alpha); - mkldnn::eltwise_forward::primitive_desc fw_pdesc(fw_desc, cpu_engine); - mkldnn::eltwise_backward::desc bw_desc(alg, diff_md, data_md, alpha); - mkldnn::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine, - fw_pdesc); - - diff_src_memory = CreateMKLDNNMem(in_grad, - bw_pdesc.diff_src_primitive_desc(), req); - stream->RegisterPrim(mkldnn::eltwise_backward(bw_pdesc, *input_mem, - *diff_dst_memory, - *diff_src_memory.second)); - }); - CommitOutput(in_grad, diff_src_memory); - stream->Submit(); -} - -} // namespace op -} // namespace mxnet - -#endif diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h deleted file mode 100644 index 1c583e1f671e..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_base-inl.h +++ /dev/null @@ -1,488 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/******************************************************************************* -* Copyright 2016-2017 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-* -* \file mkldnn_base-inl.h -* \brief -* \author young.jin.kim@intel.com -* ashok.emani@intel.com -* deepthi.karkada@intel.com -* louis.feng@intel.com -* adam.d.straw@intel.com -* zhengda1936@gmail.com -* -*******************************************************************************/ - -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_ - -#if MXNET_USE_MKLDNN == 1 -#include -#include -#include -#include -#include -#include -#include -#include "mkldnn.hpp" -#include "mxnet/ndarray.h" -#include "mxnet/resource.h" -#include "mxnet/op_attr_types.h" -using namespace mkldnn; -namespace mxnet { -extern bool EnableMkldnnWarnGenerated(); -// ===== CpuEngine ======================================= -// cpu_engine singleton -class CpuEngine { - public: - static CpuEngine *Get() { - // I's thread-safe in C++11. - static thread_local CpuEngine myInstance; - return &myInstance; - } - CpuEngine(CpuEngine const &) = delete; // Copy construct - CpuEngine(CpuEngine &&) = delete; // Move construct - CpuEngine &operator=(CpuEngine const &) = delete; // Copy assign - CpuEngine &operator=(CpuEngine &&) = delete; // Move assign - - mkldnn::engine &get_engine() { return _cpu_engine; } - - protected: - CpuEngine() : _cpu_engine(mkldnn::engine::cpu, 0) {} - ~CpuEngine() {} - - private: - mkldnn::engine _cpu_engine; -}; - -// type enumerator -template -struct data_type_enum {}; - -template <> -struct data_type_enum { - enum { type = mkldnn::memory::data_type::f32 }; -}; - -template <> -struct data_type_enum { - enum { type = mkldnn::memory::data_type::s32 }; -}; - -template <> -struct data_type_enum { - enum { type = mkldnn::memory::data_type::s16 }; -}; - -template <> -struct data_type_enum { - enum { type = mkldnn::memory::data_type::s8 }; -}; - -template <> -struct data_type_enum { - enum { type = mkldnn::memory::data_type::u8 }; -}; - -static inline bool SupportMKLDNNArray(int dtype, const TShape &shape) { - int ndim = shape.ndim(); - bool support = ndim == 1 || ndim == 2 || ndim == 4; - support = support && (dtype == mshadow::kFloat32 || dtype == mshadow::kInt32 - || dtype == mshadow::kInt8 || dtype == mshadow::kUint8); - return support; -} - -static inline bool SupportStorageMKLDNN(int stype) { - return stype == kDefaultStorage; -} - -static inline bool SupportMKLDNN(int dtype, const TShape &shape) { - int ndim = shape.ndim(); - return dtype == mshadow::kFloat32 && (ndim == 1 || ndim == 2 || ndim == 4); -} - -static inline bool SupportMKLDNN(const NDArray &input) { - return SupportMKLDNN(input.dtype(), input.shape()) - && SupportStorageMKLDNN(input.storage_type()); -} - -static inline bool SupportMKLDNNConv(const NDArray &input) { - return input.dtype() == mshadow::kFloat32 && input.shape().ndim() == 4; -} - -/* - * This is to align address to a certain alignment. 
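The alignment helper whose comment ends just above is plain pointer arithmetic; the standard library's std::align does the same carving of an aligned sub-buffer out of a workspace. A minimal sketch of the idea (not the deleted AlignMem implementation):

#include <cstddef>
#include <cstdio>
#include <memory>   // std::align

// Carve an aligned region of 'bytes' out of a raw workspace; 'space' is
// reduced by the padding that was skipped. Returns nullptr if it cannot fit.
void* AlignWithin(void* mem, std::size_t bytes, std::size_t alignment, std::size_t* space) {
  return std::align(alignment, bytes, mem, *space);
}

int main() {
  alignas(64) char workspace[4096];
  std::size_t space = sizeof(workspace) - 3;
  void* p = workspace + 3;                       // deliberately misaligned start
  void* aligned = AlignWithin(p, 256, 64, &space);
  std::printf("%p, %zu bytes left\n", aligned, space);
  return 0;
}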
- */ -void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space); - -namespace op { -struct ActivationParam; -bool SupportMKLDNNAct(const op::ActivationParam& param); -} - -static int GetTypeSize(int dtype) { - int size = -1; - MSHADOW_TYPE_SWITCH(dtype, DType, { - size = sizeof(DType); - }); - return size; -} - -static inline size_t GetArraySize(const NDArray &arr) { - return arr.shape().Size() * GetTypeSize(arr.dtype()); -} - -static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) { - switch (dtype) { - case mshadow::kFloat32: - return mkldnn::memory::data_type::f32; - case mshadow::kInt32: - return mkldnn::memory::data_type::s32; - case mshadow::kInt8: - return mkldnn::memory::data_type::s8; - case mshadow::kUint8: - return mkldnn::memory::data_type::u8; - default: - LOG(FATAL) << "unknown type for MKLDNN"; - return mkldnn::memory::data_type::data_undef; - } -} - -inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr, int ndim) { - mkldnn::memory::dims dims(ndim); - for (size_t i = 0; i < dims.size(); i++) dims[i] = arr.shape()[i]; - return mkldnn::memory::desc{dims, get_mkldnn_type(arr.dtype()), - mkldnn::memory::format::any}; -} - -inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr) { - return GetMemDesc(arr, arr.shape().ndim()); -} - -inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr, - int num_groups) { - if (num_groups == 1) { - return GetMemDesc(arr); - } else { - CHECK_EQ(arr.shape().ndim(), 4U); - mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups, - static_cast(arr.shape()[0] / num_groups), - static_cast(arr.shape()[1]), - static_cast(arr.shape()[2]), - static_cast(arr.shape()[3])}; - return mkldnn::memory::desc{tz, get_mkldnn_type(arr.dtype()), - mkldnn::memory::format::any}; - } -} - -typedef std::shared_ptr mkldnn_mem_ptr; -typedef std::shared_ptr mkldnn_mem_const_ptr; - -/* - * This is to manage the temporary memory provided by MXNet for operators. - * The temp memory is mainly used to keep the reordered data. In an operator, we - * may need multiple pieces of memory for them. But MXNet can only provide - * a single piece of memory. This class is to help break the temporary memory - * from MXNet to store the reordered data. - * The amount of temporary memory used in an operator depends on the layout of - * input arrays and the operator. It's difficult to calculate it manually, so - * the class also estimate the amount of memory automatically. - */ -class TmpMemMgr { - // This points to the memory buffer where we can allocate temp memory. - char *curr_mem; - // The total size of the temp memory. - size_t mem_size; - // This contains the current available memory size. - size_t curr_size; - // This estimate the required temp memory size in an operator. - size_t est_size; - const size_t alignment = 4096; - - public: - static TmpMemMgr *Get() { - static thread_local TmpMemMgr mgr; - return &mgr; - } - - TmpMemMgr() { - Reset(); - est_size = 0; - mem_size = 0; - } - - void Reset() { - curr_mem = nullptr; - curr_size = 0; - // We don't reset est_size and mem_size because est_size contains the - // estimated temp memory size from the last run and mem_size contains the - // memroy size allocated in the last run. - } - - void Init(const Resource &r) { - // If the last time, if we estimate that we need more memory, we should the - // larger memory size. - mem_size = std::max(mem_size, est_size); - if (mem_size > 0) { - // Let's allocate some extra memory. 
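TmpMemMgr, sketched above, is essentially a bump allocator over the single workspace MXNet grants an operator, with a running estimate of how much the next run should reserve and a heap fallback when the workspace is too small. A toy version of that bookkeeping (names are illustrative):

#include <cstddef>
#include <cstdio>
#include <vector>

class TmpArena {
  char* cur = nullptr;
  std::size_t left = 0;        // bytes still free in the workspace
  std::size_t asked = 0;       // bytes requested during the current run
  std::vector<std::vector<char>> overflow;

 public:
  void Init(char* workspace, std::size_t bytes) {
    cur = workspace;
    left = bytes;
    asked = 0;
    overflow.clear();
  }
  std::size_t BytesAsked() const { return asked; }  // size to reserve next run
  void* Alloc(std::size_t bytes) {
    asked += bytes;
    if (bytes <= left) {
      void* ret = cur;
      cur += bytes;
      left -= bytes;
      return ret;
    }
    overflow.emplace_back(bytes);                   // heap fallback path
    return overflow.back().data();
  }
};

int main() {
  char workspace[64];
  TmpArena arena;
  arena.Init(workspace, sizeof(workspace));
  arena.Alloc(48);
  arena.Alloc(48);                                  // spills to the heap
  std::printf("reserve %zu bytes next run\n", arena.BytesAsked());  // 96
  return 0;
}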
If we don't use some of them all the time, - // the OS won't physically allocate pages for them any way. - this->curr_size = mem_size * 2; - this->curr_mem = static_cast(r.get_host_space_internal(this->curr_size)); - } - // reset est_size, so we can start to estimate the temp memory size. - this->est_size = 0; - } - - mkldnn::memory *Alloc(const mkldnn::memory::primitive_desc &pd); -}; - -class MKLDNNStream { - std::vector net; - // Here we hold all memory related to the operators in the stream. - std::vector > mem_holder; - - public: - static MKLDNNStream *Get() { - static thread_local MKLDNNStream stream; - return &stream; - } - - void RegisterPrim(const mkldnn::primitive &prim) { net.push_back(prim); } - - void RegisterMem(std::shared_ptr mem) { - mem_holder.push_back(mem); - } - - bool HasOps() const { - return !net.empty(); - } - - void Submit() { - if (!net.empty()) - mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); - net.clear(); - mem_holder.clear(); - TmpMemMgr::Get()->Reset(); - } -}; - -class MKLDNNOpSignature { - std::vector eles; - uint64_t hash; - - public: - MKLDNNOpSignature() { - hash = 0; - } - - explicit MKLDNNOpSignature(uint64_t hash) { - this->hash = hash; - } - - /* - * We provide different methods to add signature to an op. - * For operations, such as convolutin and fully connected, which determines - * the optimal data layout for the op, we only need to use the shape and data - * type to sign the op. For other operations, such as activation, which uses - * whatever layout in the input array, we have to use the shape, the data type - * and the layout to sign the op. - */ - - void AddSign(const mkldnn::memory &mem) { - auto desc = mem.get_primitive_desc().desc(); - hash = hash * 2 + desc.data.format; - eles.push_back(desc.data.format); - hash = hash * 2 + desc.data.data_type; - eles.push_back(desc.data.data_type); - for (int i = 0; i < desc.data.ndims; i++) { - hash = hash * 2 + desc.data.dims[i]; - eles.push_back(desc.data.dims[i]); - } - } - - void AddSign(const std::vector &arrs) { - for (auto &arr : arrs) { - AddSign(arr); - } - } - - void AddSign(const NDArray &arr) { - if (arr.IsMKLDNNData()) { - AddSign(*(arr.GetMKLDNNData())); - } else { - hash = hash * 2 + arr.dtype(); - eles.push_back(arr.dtype()); - AddSign(arr.shape()); - } - } - - void AddSign(const TShape &shape) { - for (size_t i = 0; i < shape.ndim(); i++) { - hash = hash * 2 + shape[i]; - eles.push_back(shape[i]); - } - } - - void AddSign(int val) { - hash = hash * 2 + val; - eles.push_back(val); - } - - bool operator==(const MKLDNNOpSignature &sign) const { - if (hash != sign.hash) - return false; - if (eles.size() != sign.eles.size()) - return false; - for (size_t i = 0; i < eles.size(); i++) - if (eles[i] != sign.eles[i]) - return false; - return true; - } - - uint64_t GetHash() const { - return hash; - } -}; - -struct MKLDNNOpHash { - size_t operator()(const MKLDNNOpSignature &sign) const { - return sign.GetHash(); - } -}; - -template -class MKLDNNParamOpSign: public MKLDNNOpSignature { - const ParamType param; - - static size_t hash(const ParamType ¶m) { - std::hash fn; - return fn(param); - } - - public: - explicit MKLDNNParamOpSign(const ParamType &_param): MKLDNNOpSignature( - hash(_param)), param(_param) { - } - - bool operator==(const MKLDNNParamOpSign &sign) const { - const MKLDNNOpSignature &this_upper = *this; - const MKLDNNOpSignature &other_upper = sign; - return this_upper == other_upper && param == sign.param; - } -}; - -enum OutDataOp { - Noop, - CopyBack, - AddBack, 
-}; - -typedef std::pair mkldnn_output_t; - -/* - * These two functions try to create MKLDNN memory in an NDArray based on `req'. - * The difference is that the first function can create MKLDNN memory with - * special layouts in an NDArray, while the second one can only create MKLDNN - * memory with default layouts. - * If these two functions are used, we have to call CommitOutput to write - * the output back to the output NDArray. - */ -mkldnn_output_t CreateMKLDNNMem(const NDArray &arr, - const mkldnn::memory::primitive_desc &desc, - OpReqType req); -mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &arr, - const mkldnn::memory::primitive_desc &desc, - OpReqType req); -/* This function has to be used with one of the functions above. */ -void CommitOutput(const NDArray &arr, const mkldnn_output_t &res); - -static inline void InvalidateOutputs(const std::vector &arrs, - const std::vector &reqs) { - for (size_t i = 0; i < arrs.size(); i++) { - if (reqs[i] == kWriteTo || reqs[i] == kNullOp) { - const_cast(arrs[i]).InvalidateMKLDNNData(); - } - } -} - -const mkldnn::memory *GetWeights(const NDArray &arr, - const mkldnn::memory::primitive_desc &target_pd, - int num_groups); - -mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc); -mkldnn_memory_format_t GetDefaultFormat(int num_dims); -mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd, - mkldnn_memory_format_t format); - -void FallBackCompute(FCompute fn, const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs); - -/* - * This class is used to check the correctness of MKLDNN operators. - */ -class OpCheck { - std::vector inputs; - std::vector outputs; - bool backward; - size_t num_checks; - - public: - OpCheck(bool backward, size_t num_checks) { - this->backward = backward; - this->num_checks = num_checks; - } - - void Init(const std::vector &inputs_, - const std::vector &outputs_); - - void Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs, - const mxnet::OpContext &ctx, - const std::vector &inputs_, - const std::vector &req, - const std::vector &outputs_); -}; - -#define MKLDNN_OPCHECK_INIT(backward, num_checks, inputs, outputs) \ - static bool debug = dmlc::GetEnv("MXNET_MKLDNN_DEBUG", false); \ - OpCheck check(backward, num_checks); \ - if (debug) check.Init(inputs, outputs); - -#define MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs) \ - if (debug) check.Run(fn, attrs, ctx, inputs, req, outputs); - -} // namespace mxnet -#endif -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc deleted file mode 100644 index c34ca03a2809..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_base.cc +++ /dev/null @@ -1,385 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. 
See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -#if MXNET_USE_MKLDNN == 1 - -#include -#include "./mkldnn_base-inl.h" -#include "./mkldnn_ops-inl.h" - -namespace mxnet { - -void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space) { - if (size > *space) - return nullptr; - intptr_t addr = reinterpret_cast(mem); - // If the address has been aligned, don't do anything. - intptr_t last_chunk = addr % alignment; - if (last_chunk == 0) - return mem; - intptr_t padding = alignment - last_chunk; - // If the buffer doesn't have enough space, we should return null here. - if (padding + size > *space) - return nullptr; - addr += padding; - *space -= padding; - CHECK_EQ(addr % alignment, 0); - return reinterpret_cast(addr); -} - -mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::primitive_desc &pd) { - // We need to include the size of the memory used for alignment. - this->est_size += pd.get_size() + alignment; - void *mem = AlignMem(this->curr_mem, pd.get_size(), alignment, &this->curr_size); - if (mem) { - // The memory is allocated from the temporary memory space in the - // operator. It'll only become invalid after we exit from the operator. - mkldnn_mem_ptr ret(new mkldnn::memory(pd, mem)); - MKLDNNStream::Get()->RegisterMem(ret); - CHECK_EQ(mem, mem); - this->curr_size -= pd.get_size(); - this->curr_mem = static_cast(mem) + pd.get_size(); - return ret.get(); - } else { - LOG(WARNING) << "Allocate " << pd.get_size() - << " bytes with malloc directly"; - mkldnn_mem_ptr ret(new mkldnn::memory(pd)); - MKLDNNStream::Get()->RegisterMem(ret); - return ret.get(); - } -} - -mkldnn_output_t CreateMKLDNNMem(const NDArray &arr, - const mkldnn::memory::primitive_desc &desc, - OpReqType req) { - if (kAddTo == req) { - auto tmp = TmpMemMgr::Get()->Alloc(desc); - return mkldnn_output_t(OutDataOp::AddBack, tmp); - } else if (kWriteInplace == req) { - // MKLDNN ops may not support the case that the input and the output uses - // the same memory. Let's use an extra copy to make sure it always works. 
- auto tmp = TmpMemMgr::Get()->Alloc(desc); - return mkldnn_output_t(OutDataOp::CopyBack, tmp); - } else { - mkldnn::memory *mem = const_cast(arr).CreateMKLDNNData(desc); - if (mem == nullptr) { - auto tmp = TmpMemMgr::Get()->Alloc(desc); - return mkldnn_output_t(OutDataOp::CopyBack, tmp); - } else { - return mkldnn_output_t(OutDataOp::Noop, mem); - } - } -} - -mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &arr, - const mkldnn::memory::primitive_desc &desc, - OpReqType req) { - if (kAddTo == req) { - auto tmp = TmpMemMgr::Get()->Alloc(desc); - return mkldnn_output_t(OutDataOp::AddBack, tmp); - } else if (kWriteInplace == req) { - auto tmp = TmpMemMgr::Get()->Alloc(desc); - return mkldnn_output_t(OutDataOp::CopyBack, tmp); - } else { - auto _desc = desc; - auto def_format = GetDefaultFormat(_desc.desc()); - mkldnn::memory *mem = nullptr; - if (def_format == _desc.desc().data.format) { - mem = const_cast(arr).CreateMKLDNNData(desc); - } - if (mem == nullptr) { - auto tmp = TmpMemMgr::Get()->Alloc(desc); - return mkldnn_output_t(OutDataOp::CopyBack, tmp); - } else { - return mkldnn_output_t(OutDataOp::Noop, mem); - } - } -} - -void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) { - if (res.first == CopyBack) { - const_cast(arr).CopyFrom(*res.second); - } else if (res.first == AddBack) { - auto mem = arr.GetMKLDNNData(res.second->get_primitive_desc()); - CHECK(mem != nullptr); - // We have to allocate new memory for the sum result. - auto sum_res = TmpMemMgr::Get()->Alloc( - res.second->get_primitive_desc()); - op::Sum(*res.second, *mem, *sum_res); - const_cast(arr).CopyFrom(*sum_res); - } -} - -const mkldnn::memory *GetWeights(const NDArray &arr, - const mkldnn::memory::primitive_desc &target_pd, - int num_groups) { - const mkldnn::memory *mem = arr.GetMKLDNNData(target_pd); - // If the weight array already uses the target layout, simply return it - // directly. 
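CommitOutput above handles req == kAddTo by letting the primitive overwrite a scratch buffer and then summing that buffer into the existing output, since the kernel itself only knows how to overwrite its destination. The same dance on plain arrays, as an illustrative sketch only:

#include <cstdio>
#include <vector>

// Pretend "kernel" that can only overwrite its destination.
void Kernel(const std::vector<float>& in, std::vector<float>* out) {
  for (size_t i = 0; i < in.size(); ++i) (*out)[i] = 2.f * in[i];
}

// req == kAddTo: run into a temporary, then accumulate into the real output.
void RunAddTo(const std::vector<float>& in, std::vector<float>* grad) {
  std::vector<float> tmp(in.size());
  Kernel(in, &tmp);
  for (size_t i = 0; i < tmp.size(); ++i) (*grad)[i] += tmp[i];
}

int main() {
  std::vector<float> in = {1.f, 2.f, 3.f};
  std::vector<float> grad = {10.f, 10.f, 10.f};   // gradients already accumulated
  RunAddTo(in, &grad);
  std::printf("%g %g %g\n", grad[0], grad[1], grad[2]);  // 12 14 16
  return 0;
}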
- if (mem) - return mem; - - mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); - auto engine = CpuEngine::Get()->get_engine(); - if (arr.shape().ndim() == 2) { - mkldnn::memory::dims tz = mkldnn::memory::dims{ - static_cast(arr.shape()[0]), static_cast(arr.shape()[1])}; - mkldnn::memory::desc md = - mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi}; - mkldnn::memory::primitive_desc pd = - mkldnn::memory::primitive_desc{md, engine}; - mem = arr.GetMKLDNNData(pd); - } else if (arr.shape().ndim() == 4 && num_groups == 1) { - mkldnn::memory::dims tz = mkldnn::memory::dims{ - static_cast(arr.shape()[0]), static_cast(arr.shape()[1]), - static_cast(arr.shape()[2]), static_cast(arr.shape()[3])}; - mkldnn::memory::desc md = - mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw}; - mkldnn::memory::primitive_desc pd = - mkldnn::memory::primitive_desc{md, engine}; - mem = arr.GetMKLDNNData(pd); - } else if (arr.shape().ndim() == 4) { - mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups, - static_cast(arr.shape()[0] / num_groups), - static_cast(arr.shape()[1]), - static_cast(arr.shape()[2]), - static_cast(arr.shape()[3])}; - mkldnn::memory::desc md = - mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw}; - mkldnn::memory::primitive_desc pd = - mkldnn::memory::primitive_desc{md, engine}; - mem = arr.GetMKLDNNData(pd); - } else { - LOG(FATAL) << "The weight array has an unsupported number of dimensions"; - return nullptr; - } - if (mem == nullptr) - mem = arr.GetMKLDNNDataReorder(target_pd); - if (mem->get_primitive_desc() == target_pd) return mem; - - auto ret = TmpMemMgr::Get()->Alloc(target_pd); - MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(*mem, *ret)); - return ret; -} - -mkldnn_memory_format_t GetDefaultFormat(int num_dims) { - switch (num_dims) { - case 1: return mkldnn_x; - case 2: return mkldnn_nc; - case 4: return mkldnn_nchw; - case 5: return mkldnn_goihw; - default: - LOG(FATAL) << "Unsupported MKLDNN dimensions: " << num_dims; - return mkldnn_format_undef; - } -} - -mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc) { - if (desc.data.ndims == 1) { - return desc.data.format; - } else if (desc.data.ndims == 2) { - if (desc.data.format == mkldnn_io) - return mkldnn_oi; - else - return desc.data.format; - } else if (desc.data.ndims == 4) { - switch (desc.data.format) { - case mkldnn_nchw: - case mkldnn_nhwc: - case mkldnn_chwn: - case mkldnn_nChw8c: - case mkldnn_nChw16c: - return mkldnn_nchw; - case mkldnn_oihw: - case mkldnn_ihwo: - case mkldnn_hwio: - case mkldnn_OIhw8i8o: - case mkldnn_OIhw16i16o: - case mkldnn_OIhw8i16o2i: - case mkldnn_OIhw8o16i2o: - case mkldnn_OIhw8o8i: - case mkldnn_OIhw16o16i: - case mkldnn_IOhw16o16i: - case mkldnn_Oihw8o: - case mkldnn_Oihw16o: - case mkldnn_Ohwi8o: - case mkldnn_Ohwi16o: - case mkldnn_OhIw16o4i: - return mkldnn_oihw; - default: - LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format; - return mkldnn_format_undef; - } - } else if (desc.data.ndims == 5) { - switch (desc.data.format) { - case mkldnn_goihw: - case mkldnn_gOIhw8i8o: - case mkldnn_gOIhw16i16o: - case mkldnn_gOIhw8i16o2i: - case mkldnn_gOIhw8o16i2o: - case mkldnn_gOIhw8o8i: - case mkldnn_gOIhw16o16i: - case mkldnn_gIOhw16o16i: - case mkldnn_gOihw8o: - case mkldnn_gOihw16o: - case mkldnn_gOhwi8o: - case mkldnn_gOhwi16o: - case mkldnn_gOhIw16o4i: - return mkldnn_goihw; - default: - LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format; - return mkldnn_format_undef; - } - } else 
{ - LOG(FATAL) << "Unsupported dimensions: " << desc.data.ndims; - return mkldnn_format_undef; - } -} - -mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd, - mkldnn_memory_format_t format) { - mkldnn::memory::dims dims(pd.desc().data.ndims); - for (size_t i = 0; i < dims.size(); i++) - dims[i] = pd.desc().data.dims[i]; - mkldnn::memory::format cpp_format = static_cast(format); - mkldnn::memory::data_type cpp_type = static_cast( - pd.desc().data.data_type); - mkldnn::memory::desc data_md(dims, cpp_type, cpp_format); - return mkldnn::memory::primitive_desc(data_md, pd.get_engine()); -} - -void FallBackCompute(FCompute fn, const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - std::vector in_blobs(inputs.size()); - for (size_t i = 0; i < in_blobs.size(); i++) { - in_blobs[i] = inputs[i].data(); - } - std::vector out_blobs(outputs.size()); - for (size_t i = 0; i < out_blobs.size(); i++) { - if (req[i] == kWriteTo) - const_cast(outputs[i]).InvalidateMKLDNNData(); - CHECK(outputs[i].IsDefaultData()); - out_blobs[i] = outputs[i].data(); - } - fn(attrs, ctx, in_blobs, req, out_blobs); -} - -template -void print_diff(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2) { - DType *data1 = reinterpret_cast(arr1.data().dptr_); - DType *data2 = reinterpret_cast(arr2.data().dptr_); - for (size_t i = 0; i < arr1.shape().Size(); i++) - std::cout << data1[i] - data2[i] << ", "; - std::cout << std::endl; -} - -template -static bool SimilarArray(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2, - DType rtol, DType atol) { - if (arr1.shape().Size() != arr2.shape().Size()) - return false; - - // This function should be used outside an MKLDNN operator. - // There shouldn't be any operators in the stream. - CHECK(!MKLDNNStream::Get()->HasOps()); - // We need to reorder data in the arrays to the default layout. - // But we shouldn't reorder data in the original array. - NDArray buf1, buf2; - if (arr1.IsMKLDNNData()) { - buf1 = NDArray(arr1.shape(), arr1.ctx(), false, arr1.dtype()); - auto mem = arr1.GetMKLDNNData(); - buf1.CopyFrom(*mem); - } - if (arr2.IsMKLDNNData()) { - buf2 = NDArray(arr2.shape(), arr2.ctx(), false, arr2.dtype()); - auto mem = arr2.GetMKLDNNData(); - buf2.CopyFrom(*mem); - } - MKLDNNStream::Get()->Submit(); - - DType *data1 = reinterpret_cast( - arr1.IsMKLDNNData() ? buf1.data().dptr_: arr1.data().dptr_); - DType *data2 = reinterpret_cast( - arr2.IsMKLDNNData() ? 
buf2.data().dptr_: arr2.data().dptr_); - std::atomic success(true); -#pragma omp parallel for - for (size_t i = 0; i < arr1.shape().Size(); i++) { - if (std::abs(data1[i] - data2[i]) > atol + rtol * std::abs(data2[i])) - success.store(false); - } - return success.load(); -} - -void OpCheck::Init(const std::vector &inputs_, - const std::vector &outputs_) { - auto ctx = inputs_[0].ctx(); - CHECK(!MKLDNNStream::Get()->HasOps()); - for (size_t i = 0; i < inputs_.size(); i++) { - inputs.emplace_back(inputs_[i].shape(), ctx, - false, inputs_[i].dtype()); - auto mem = inputs_[i].GetMKLDNNData(); - inputs[i].CopyFrom(*mem); - } - for (size_t i = 0; i < outputs_.size(); i++) { - outputs.emplace_back(outputs_[i].shape(), ctx, - false, outputs_[i].dtype()); - if (backward) { - auto mem = outputs_[i].GetMKLDNNData(); - outputs[i].CopyFrom(*mem); - } - } - MKLDNNStream::Get()->Submit(); -} - -void OpCheck::Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs, - const mxnet::OpContext &ctx, - const std::vector &inputs_, - const std::vector &req, - const std::vector &outputs_) { - std::vector in_blobs(inputs.size()); - for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data(); - std::vector out_blobs(outputs.size()); - for (size_t i = 0; i < out_blobs.size(); i++) - out_blobs[i] = outputs[i].data(); - fn(attrs, ctx, in_blobs, req, out_blobs); - - LOG(INFO) << "test " << attrs.op->name; - size_t num = std::min(outputs.size(), outputs_.size()); - num = std::min(num_checks, num); - for (size_t i = 0; i < num; i++) { - // We don't need to compare if it doesn't need to output data. - if (req[i] == kNullOp) - continue; - MSHADOW_TYPE_SWITCH(outputs[i].dtype(), DType, { - bool similar = SimilarArray(outputs[i], outputs_[i], 1e-3, 1e-4); - if (!similar) { - LOG(ERROR) << attrs.op->name << " fails"; - print_diff(outputs[i], outputs_[i]); - } - CHECK(similar); - }); - } -} - -} // namespace mxnet - -#endif diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h deleted file mode 100644 index 19a98da6af83..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h +++ /dev/null @@ -1,431 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * \file mkldnn_batch_norm.cc - * \brief - * \author Tao Lv -*/ - -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_ - -#if MXNET_USE_MKLDNN == 1 -#include -#include -#include -#include "../batch_norm-inl.h" -#include "./mkldnn_ops-inl.h" -#include "./mkldnn_base-inl.h" - -#define VARIANCE_TO_INVSTD(__var$, __eps$) (1.0/sqrt((__var$) + DType(__eps$))) -#define INVSTD_TO_VARIANCE(__invstd$, __eps$) ((1.0 / ((__invstd$) * (__invstd$))) - (__eps$)) -namespace mxnet { -namespace op { - -typedef mkldnn::batch_normalization_forward::primitive_desc t_bn_f_pdesc; -typedef mkldnn::batch_normalization_forward::desc t_bn_f_desc; -typedef mkldnn::batch_normalization_backward::primitive_desc t_bn_b_pdesc; -typedef mkldnn::batch_normalization_backward::desc t_bn_b_desc; - -using mkldnn::use_global_stats; -using mkldnn::use_scale_shift; -using mkldnn::forward_training; -using mkldnn::forward_inference; - -inline static unsigned _GetFlags(const std::vector &in_data, - const std::vector &aux_states, - const BatchNormParam ¶m, bool is_train) { - unsigned flags = 0U; - if (in_data.size() == 3U) { - flags |= use_scale_shift; - } - - // aux_states[0]: inMean - // aux_states[1]: inVariance - if (aux_states.size() == 2U && !is_train) { - flags |= use_global_stats; - } - return flags; -} - -template -inline static t_bn_f_pdesc _GetFwd(const mkldnn::memory &data_mem, - bool is_train, - DType eps, - unsigned flags) { - auto data_mpd = data_mem.get_primitive_desc(); - auto data_md = data_mpd.desc(); - auto engine = CpuEngine::Get()->get_engine(); - - if (is_train) { - t_bn_f_desc bnFwd_desc(forward_training, data_md, eps, flags); - return t_bn_f_pdesc(bnFwd_desc, engine); - } else { - t_bn_f_desc bnFwd_desc(forward_inference, data_md, eps, flags); - return t_bn_f_pdesc(bnFwd_desc, engine); - } -} - -template -inline static t_bn_b_pdesc _GetBwd(const mkldnn::memory &data_mem, - const mkldnn::memory &diff_mem, - DType eps, - unsigned flags) { - auto data_mpd = data_mem.get_primitive_desc(); - auto data_md = data_mpd.desc(); - auto diff_mpd = diff_mem.get_primitive_desc(); - auto diff_md = diff_mpd.desc(); - auto engine = CpuEngine::Get()->get_engine(); - - t_bn_b_desc bnBwd_desc(mkldnn::prop_kind::backward, diff_md, data_md, eps, flags); - return t_bn_b_pdesc(bnBwd_desc, engine, _GetFwd(data_mem, true, eps, flags)); -} - -typedef MKLDNNParamOpSign MKLDNNBNSignature; - -class MKLDNNBNForward { - std::shared_ptr data_m; - std::shared_ptr weight_m; - std::shared_ptr out_m; - std::shared_ptr mean_m; - std::shared_ptr var_m; - std::shared_ptr fwd; - bool is_train; - t_bn_f_pdesc pd; - - public: - MKLDNNBNForward(const t_bn_f_pdesc &_pd, bool is_train): pd(_pd) { - weight_m.reset(new mkldnn::memory(pd.weights_primitive_desc())); - this->is_train = is_train; - } - - const mkldnn::memory &GetWeight() const { - return *weight_m; - } - - const t_bn_f_pdesc &GetPd() const { - return pd; - } - - const mkldnn::memory &GetMean() const { - return *mean_m; - } - - const mkldnn::memory &GetVar() const { - return *var_m; - } - - void SetDataHandle(const NDArray &data, const NDArray &mean, - const NDArray &var, const mkldnn::memory &out) { - auto _data = data.GetMKLDNNData(); - if (data_m) { - data_m->set_data_handle(_data->get_data_handle()); - } else { - data_m.reset(new mkldnn::memory(_data->get_primitive_desc(), - _data->get_data_handle())); - } - if (out_m) { - out_m->set_data_handle(out.get_data_handle()); - } else { - out_m.reset(new 
mkldnn::memory(out.get_primitive_desc(), - out.get_data_handle())); - } - auto mean_ptr = mean.data().dptr_; - if (mean_m) { - mean_m->set_data_handle(mean_ptr); - } else { - mean_m.reset(new mkldnn::memory(pd.mean_primitive_desc(), - mean_ptr)); - } - auto var_ptr = var.data().dptr_; - if (var_m) { - var_m->set_data_handle(var_ptr); - } else { - var_m.reset(new mkldnn::memory(pd.variance_primitive_desc(), - var_ptr)); - } - - if (fwd == nullptr) { - if (!is_train) - fwd.reset(new mkldnn::batch_normalization_forward( - pd, *data_m, mkldnn::primitive::at(*mean_m), - mkldnn::primitive::at(*var_m), *weight_m, *out_m)); - else - fwd.reset(new mkldnn::batch_normalization_forward( - pd, mkldnn::primitive::at(*data_m), - mkldnn::primitive::at(*weight_m), *out_m, - *mean_m, *var_m)); - } - } - - const mkldnn::batch_normalization_forward &GetFwd() const { - return *fwd; - } -}; - -template -static MKLDNNBNForward &GetBNForward(const BatchNormParam& param, - const OpContext &ctx, const NDArray &in_data, - unsigned flags) { - static thread_local std::unordered_map fwds; - MKLDNNBNSignature key(param); - key.AddSign(ctx.is_train); - key.AddSign(in_data); - - auto it = fwds.find(key); - if (it == fwds.end()) { - auto fwd_pd = _GetFwd(*in_data.GetMKLDNNData(), ctx.is_train, - (DType) param.eps, flags); - MKLDNNBNForward fwd(fwd_pd, ctx.is_train); - auto ins_ret = fwds.insert(std::pair( - key, fwd)); - CHECK(ins_ret.second); - it = ins_ret.first; - } - return it->second; -} - -template -void MKLDNNBatchNormForward(const OpContext &ctx, const BatchNormParam ¶m, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { - TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]); - unsigned flags = _GetFlags(in_data, aux_states, param, ctx.is_train); - const NDArray &data = in_data[batchnorm::kData]; - - auto &fwd = GetBNForward(param, ctx, data, flags); - const NDArray &out = out_data[batchnorm::kOut]; - - // for output memory - auto out_mem = const_cast(out).CreateMKLDNNData(fwd.GetPd().dst_primitive_desc()); - - // mxnet will always use scale shift. 
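The VARIANCE_TO_INVSTD and INVSTD_TO_VARIANCE macros near the top of this header are inverses of each other up to floating-point error, which is what lets the code convert between the variance the library works with and the inverse standard deviation stored in the moments outputs. A quick self-contained check (float versions of the same expressions):

#include <cmath>
#include <cstdio>

// Same conversions as the VARIANCE_TO_INVSTD / INVSTD_TO_VARIANCE macros above.
float VarianceToInvstd(float var, float eps) { return 1.0f / std::sqrt(var + eps); }
float InvstdToVariance(float invstd, float eps) { return 1.0f / (invstd * invstd) - eps; }

int main() {
  const float eps = 1e-3f;
  const float var = 0.25f;
  float invstd = VarianceToInvstd(var, eps);
  float back = InvstdToVariance(invstd, eps);
  std::printf("invstd = %f, variance recovered = %f\n", invstd, back);  // ~0.25
  return 0;
}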
- // But if fix_gamma is true, then all scale elements will be set to 1.0f - if (flags & use_scale_shift) { - const NDArray &gamma = in_data[batchnorm::kGamma]; - const NDArray &beta = in_data[batchnorm::kBeta]; - CHECK_EQ(gamma.storage_type(), mxnet::kDefaultStorage); - CHECK_EQ(beta.storage_type(), mxnet::kDefaultStorage); - - const mkldnn::memory &weight_mem = fwd.GetWeight(); - DType* weight_buf = reinterpret_cast(weight_mem.get_data_handle()); - - nnvm::dim_t channels_ = data.shape()[1]; - CHECK(weight_mem.get_primitive_desc().get_size() == channels_ * sizeof(DType) * 2); - DType* weight_ptr = gamma.data().dptr(); - DType* bias_ptr = beta.data().dptr(); - if (!param.fix_gamma) { -#pragma omp parallel for simd - for (int i = 0; i < channels_; i++) { - weight_buf[i] = weight_ptr[i]; - weight_buf[channels_ + i] = bias_ptr[i]; // bias - } - } else if (IsBNWriting(req[batchnorm::kGamma])) { -#pragma omp parallel for simd - for (int i = 0; i < channels_; i++) { - weight_buf[i] = (DType)1.0f; - weight_ptr[i] = (DType)1.0f; - weight_buf[channels_ + i] = bias_ptr[i]; // bias - } - } else { -#pragma omp parallel for simd - for (int i = 0; i < channels_; i++) { - weight_buf[i] = (DType)1.0f; - weight_buf[channels_ + i] = bias_ptr[i]; // bias - } - } - - if (!ctx.is_train) { - DType* omean = out_data[batchnorm::kMean].data().dptr(); - DType* ovar = out_data[batchnorm::kVar].data().dptr(); - DType* inmean = aux_states[batchnorm::kMovingMean].data().dptr(); - DType* invar = aux_states[batchnorm::kMovingVar].data().dptr(); - // to align with origin implmentation: batch_norm.cc: L164 -#pragma omp parallel for simd - for (int i = 0; i < channels_; i++) { - omean[i] = inmean[i]; - ovar[i] = VARIANCE_TO_INVSTD(invar[i], param.eps); - } - - fwd.SetDataHandle(data, aux_states[batchnorm::kMovingMean], - aux_states[batchnorm::kMovingVar], - *out_mem); - MKLDNNStream::Get()->RegisterPrim(fwd.GetFwd()); - MKLDNNStream::Get()->Submit(); - } else { // training - const NDArray &outMean = out_data[batchnorm::kMean]; - const NDArray &outVar = out_data[batchnorm::kVar]; - DType* omean = outMean.data().dptr(); - DType* ovar = outVar.data().dptr(); - - fwd.SetDataHandle(data, outMean, outVar, *out_mem); - MKLDNNStream::Get()->RegisterPrim(fwd.GetFwd()); - MKLDNNStream::Get()->Submit(); - DType* mean_mem_ptr = reinterpret_cast(fwd.GetMean().get_data_handle()); - DType* var_mem_ptr = reinterpret_cast(fwd.GetVar().get_data_handle()); -#pragma omp parallel for simd - for (int i = 0; i < channels_; i++) { - omean[i] = mean_mem_ptr[i]; - ovar[i] = VARIANCE_TO_INVSTD(var_mem_ptr[i], param.eps); - } - } - } else { // no input gamma and beta - LOG(FATAL) << "MKLDNN batch normalization: should not reach here ..."; - } -} - -template -void MKLDNNBatchNormBackward(const OpContext &ctx, const BatchNormParam ¶m, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { - TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]); - CHECK_EQ(out_grad.size(), param.output_mean_var ? 
3U : 1U); - CHECK_EQ(in_data.size(), 3U); - CHECK_EQ(out_data.size(), 3U); - CHECK_EQ(in_grad.size(), 3U); - unsigned flags = _GetFlags(in_data, aux_states, param, ctx.is_train); - - const NDArray &data = in_data[batchnorm::kData]; - const NDArray &diff = out_grad[batchnorm::kOut]; - const NDArray &gradIn = in_grad[batchnorm::kData]; - const NDArray &moving_mean = aux_states[batchnorm::kMovingMean]; - const NDArray &moving_var = aux_states[batchnorm::kMovingVar]; - const NDArray &out_mean = out_data[batchnorm::kMean]; - const NDArray &out_var = out_data[batchnorm::kVar]; - - CHECK(out_mean.IsDefaultData()); - CHECK(out_var.IsDefaultData()); - CHECK(moving_mean.IsDefaultData()); - CHECK(moving_var.IsDefaultData()); - - auto data_mem = data.GetMKLDNNData(); - auto diff_mem = diff.GetMKLDNNData(); - // MKLDNN batchnorm should run on special layouts. If one of them isn't, we - // should reorder them. - if (data.IsDefaultData()) - data_mem = data.GetMKLDNNDataReorder(diff_mem->get_primitive_desc()); - else if (diff.IsDefaultData()) - diff_mem = diff.GetMKLDNNDataReorder(data_mem->get_primitive_desc()); - auto bwd_pd = _GetBwd(*data_mem, *diff_mem, param.eps, flags); - auto gradi_mem = const_cast(gradIn).CreateMKLDNNData(data_mem->get_primitive_desc()); - - if (flags & use_scale_shift) { - const NDArray &gamma = in_data[batchnorm::kGamma]; - const NDArray &beta = in_data[batchnorm::kBeta]; - // TODO(tao): how to reuse this memory? - std::shared_ptr weight_mem( - new mkldnn::memory(bwd_pd.weights_primitive_desc())); - - DType* weight_buf = reinterpret_cast(weight_mem->get_data_handle()); - nnvm::dim_t channels_ = data.shape()[1]; - for (int i = 0; i < channels_; i++) { - if (!param.fix_gamma) - weight_buf[i] = (gamma.data().dptr())[i]; // weight - else - weight_buf[i] = (DType)1.0f; - } - - for (int i = 0; i < channels_; i++) { - weight_buf[channels_ + i] = (beta.data().dptr())[i]; // bias - } - - std::shared_ptr gradw_mem( - new mkldnn::memory(bwd_pd.diff_weights_primitive_desc())); - // training but no input mean and variance - if (ctx.is_train && !param.use_global_stats) { - DType* moving_mean_ptr = reinterpret_cast(moving_mean.data().dptr()); - DType* moving_var_ptr = reinterpret_cast(moving_var.data().dptr()); - DType* out_mean_ptr = reinterpret_cast(out_mean.data().dptr()); - DType* out_var_ptr = reinterpret_cast(out_var.data().dptr()); - mkldnn::memory var_mem(bwd_pd.variance_primitive_desc()); - DType *tmp_var_ptr = reinterpret_cast(var_mem.get_data_handle()); - - DType minus_mom = (1.0f - param.momentum); - for (int i = 0; i < channels_; i++) { - moving_mean_ptr[i] = moving_mean_ptr[i] * param.momentum + - out_mean_ptr[i] * minus_mom; - float variance = INVSTD_TO_VARIANCE(out_var_ptr[i], param.eps); - tmp_var_ptr[i] = variance; - moving_var_ptr[i] = moving_var_ptr[i] * param.momentum + - variance * minus_mom; - } - - std::shared_ptr out_mean_mem( - new mkldnn::memory(bwd_pd.mean_primitive_desc(), out_mean_ptr)); - std::shared_ptr out_var_mem( - new mkldnn::memory(bwd_pd.variance_primitive_desc(), out_var_ptr)); - - auto bn_bwd = mkldnn::batch_normalization_backward(bwd_pd, - *data_mem, - mkldnn::primitive::at(*out_mean_mem), - mkldnn::primitive::at(var_mem), - *diff_mem, - *weight_mem, - *gradi_mem, - *gradw_mem); - - MKLDNNStream::Get()->RegisterPrim(bn_bwd); - MKLDNNStream::Get()->Submit(); - } else { - std::shared_ptr imean_mem( - new mkldnn::memory(bwd_pd.mean_primitive_desc(), - moving_mean.data().dptr())); - std::shared_ptr ivar_mem( - new 
mkldnn::memory(bwd_pd.variance_primitive_desc(), - moving_var.data().dptr())); - auto bn_bwd = mkldnn::batch_normalization_backward(bwd_pd, - *data_mem, - mkldnn::primitive::at(*imean_mem), - mkldnn::primitive::at(*ivar_mem), - *diff_mem, - *weight_mem, - *gradi_mem, - *gradw_mem); - - MKLDNNStream::Get()->RegisterPrim(bn_bwd); - MKLDNNStream::Get()->Submit(); - } - - // copy data from gradw_mem to in_grad[1] and in_grad[2] - DType* gw_buf = reinterpret_cast(gradw_mem->get_data_handle()); - for (int i = 0; i < channels_; i++) { - if (!param.fix_gamma) - (in_grad[1].data().dptr())[i] = gw_buf[i]; - else - (in_grad[1].data().dptr())[i] = 0.0f; - } - - for (int i = 0; i < channels_; i++) { - (in_grad[2].data().dptr())[i] = gw_buf[i + channels_]; - } - } else { - LOG(FATAL) << "MKLDNN batch normalization backward: should not reach here ..."; - } -} -} // namespace op -} // namespace mxnet -#endif // MXNET_USE_MKLDNN -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_concat.cc b/src/operator/nn/mkldnn/mkldnn_concat.cc deleted file mode 100644 index d3e6e775020d..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_concat.cc +++ /dev/null @@ -1,95 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * \file mkldnn_concat.cc - * \brief - * \author Wenting Jiang -*/ -#include "../concat-inl.h" -#include "./mkldnn_ops-inl.h" -#include "./mkldnn_base-inl.h" - -#if MXNET_USE_MKLDNN == 1 -namespace mxnet { -namespace op { - -void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { - TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]); - const ConcatParam& param = nnvm::get(attrs.parsed); - int num_in_data = param.num_args; - int concat_dim = param.dim; - std::vector data_md; - std::vector data_mem; - for (int i =0; i < num_in_data; i++) { - auto tmp_mem = in_data[i].GetMKLDNNData(); - auto tmp_pd = tmp_mem->get_primitive_desc(); - data_md.push_back(tmp_pd); - data_mem.push_back(*tmp_mem); - } - mkldnn::concat::primitive_desc fwd_pd(concat_dim, data_md); - auto engine = CpuEngine::Get()->get_engine(); - auto out_mem = CreateMKLDNNMem(out_data[concat_enum::kOut], - fwd_pd.dst_primitive_desc(), req[concat_enum::kOut]); - MKLDNNStream::Get()->RegisterPrim(mkldnn::concat(fwd_pd, data_mem, *out_mem.second)); - CommitOutput(out_data[concat_enum::kOut], out_mem); - MKLDNNStream::Get()->Submit(); -} - -void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]); - const ConcatParam& param = nnvm::get(attrs.parsed); - int num_in_data = param.num_args; - int axis_ = param.dim; - auto engine = CpuEngine::Get()->get_engine(); - auto gz_mem = inputs[0].GetMKLDNNData(); - mkldnn::memory::primitive_desc gz_pd = gz_mem->get_primitive_desc(); - /* init the offset */ - mkldnn::memory::dims offsets = {0, 0, 0, 0}; - for (int i = 0; i < num_in_data; i++) { - mkldnn::memory::dims diff_src_tz - = {static_cast(inputs[i+1].shape()[0]), - static_cast(inputs[i+1].shape()[1]), - static_cast(inputs[i+1].shape()[2]), - static_cast(inputs[i+1].shape()[3])}; - auto diff_src_mpd = inputs[i+1].GetMKLDNNData()->get_primitive_desc(); - auto gradi_mem_ = CreateMKLDNNMem(outputs[i], diff_src_mpd, req[i]); - // create view from gy to gxs[i] - std::shared_ptr view_pd; - view_pd.reset(new mkldnn::view::primitive_desc(gz_pd, diff_src_tz, offsets)); - // create reorder primitive from gy to gxs[i] - mkldnn::reorder::primitive_desc reorder_pd( - view_pd.get()->dst_primitive_desc(), diff_src_mpd); - offsets[axis_] += diff_src_tz[axis_]; - MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder( - reorder_pd, *gz_mem, *gradi_mem_.second)); - CommitOutput(outputs[i], gradi_mem_); - } - MKLDNNStream::Get()->Submit(); -} - -} // namespace op -} // namespace mxnet -#endif diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc deleted file mode 100644 index b94850aa620b..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_convolution.cc +++ /dev/null @@ -1,357 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
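MKLDNNConcatBackward above splits the output gradient by creating a view at a running offset along the concat axis and reordering each slice into the matching input gradient. The offset bookkeeping, shown on flat row-major buffers for a 2-D concat along axis 1 (illustrative only):

#include <cstdio>
#include <vector>

// Split a row-major [rows x total_cols] gradient into per-input gradients,
// advancing a column offset just like offsets[axis] += diff_src_tz[axis].
void SplitAlongCols(const std::vector<float>& gz, int rows, int total_cols,
                    const std::vector<int>& cols_per_input,
                    std::vector<std::vector<float>>* grads) {
  int offset = 0;
  for (int cols : cols_per_input) {
    std::vector<float> g(rows * cols);
    for (int r = 0; r < rows; ++r)
      for (int c = 0; c < cols; ++c)
        g[r * cols + c] = gz[r * total_cols + offset + c];
    grads->push_back(std::move(g));
    offset += cols;   // running offset along the concat axis
  }
}

int main() {
  // 2 rows; inputs of width 1 and 2 were concatenated into width 3.
  std::vector<float> gz = {1, 2, 3,
                           4, 5, 6};
  std::vector<std::vector<float>> grads;
  SplitAlongCols(gz, 2, 3, {1, 2}, &grads);
  std::printf("%zu inputs, first = {%g, %g}\n", grads.size(), grads[0][0], grads[0][1]);
  return 0;
}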
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn_convolution.cc - * \brief - * \author Da Zheng -*/ - -#include "../convolution-inl.h" -#include "./mkldnn_ops-inl.h" -#include "./mkldnn_base-inl.h" - -#if MXNET_USE_MKLDNN == 1 -namespace mxnet { -namespace op { - -static mkldnn::convolution_forward::primitive_desc GetConvFwdImpl( - const ConvolutionParam& param, bool is_train, const NDArray &data, - const NDArray &weights, const NDArray *bias, const NDArray &output) { - auto prop = is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring; - auto data_md = GetMemDesc(data); - auto weight_md = GetWeightDesc(weights, param.num_group); - auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Get()->get_engine(); - mkldnn::memory::dims strides{0, 0}; - if (param.stride.ndim() == 2) { - strides[0] = param.stride[0]; - strides[1] = param.stride[1]; - } - mkldnn::memory::dims padding{0, 0}; - if (param.pad.ndim() == 2) { - padding[0] = param.pad[0]; - padding[1] = param.pad[1]; - } - if (param.dilate.ndim() == 0 && bias == nullptr) { - mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, - data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); - return mkldnn::convolution_forward::primitive_desc(desc, engine); - } else if (param.dilate.ndim() == 0) { - auto bias_md = GetMemDesc(*bias); - mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, - data_md, weight_md, bias_md, out_md, strides, padding, padding, - mkldnn::padding_kind::zero); - return mkldnn::convolution_forward::primitive_desc(desc, engine); - } else { - mkldnn::memory::dims dilates{0, 0}; - if (param.dilate.ndim() == 2) { - dilates[0] = param.dilate[0] - 1; - dilates[1] = param.dilate[1] - 1; - } - if (bias == nullptr) { - mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, - data_md, weight_md, out_md, strides, dilates, padding, padding, - mkldnn::padding_kind::zero); - return mkldnn::convolution_forward::primitive_desc(desc, engine); - } else { - auto bias_md = GetMemDesc(*bias); - mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, - data_md, weight_md, bias_md, out_md, strides, - dilates, padding, padding, - mkldnn::padding_kind::zero); - return mkldnn::convolution_forward::primitive_desc(desc, engine); - } - } -} - -static mkldnn::convolution_backward_data::primitive_desc GetConvBwdData( - const ConvolutionParam& param, const NDArray &data, const NDArray &weights, - const NDArray &output, const mkldnn::convolution_forward::primitive_desc &fwd_pd) { - auto data_md = GetMemDesc(data); - auto weight_md = GetWeightDesc(weights, param.num_group); - auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Get()->get_engine(); - mkldnn::memory::dims strides{0, 0}; - if (param.stride.ndim() == 2) { - strides[0] = param.stride[0]; - strides[1] = param.stride[1]; - } - mkldnn::memory::dims padding{0, 0}; - if (param.pad.ndim() == 2) { - padding[0] = param.pad[0]; - padding[1] = param.pad[1]; - } - if (param.dilate.ndim() == 0) { - 
mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, - data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); - return mkldnn::convolution_backward_data::primitive_desc(desc, engine, fwd_pd); - } else { - mkldnn::memory::dims dilates{0, 0}; - if (param.dilate.ndim() == 2) { - dilates[0] = param.dilate[0] - 1; - dilates[1] = param.dilate[1] - 1; - } - mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, - data_md, weight_md, out_md, strides, dilates, padding, padding, - mkldnn::padding_kind::zero); - return mkldnn::convolution_backward_data::primitive_desc(desc, engine, fwd_pd); - } -} - -static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights( - const ConvolutionParam& param, const NDArray &data, - const NDArray &weights, const NDArray *bias, const NDArray &output, - const mkldnn::convolution_forward::primitive_desc &fwd_pd) { - auto data_md = GetMemDesc(data); - auto weight_md = GetWeightDesc(weights, param.num_group); - auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Get()->get_engine(); - mkldnn::memory::dims strides{0, 0}; - if (param.stride.ndim() == 2) { - strides[0] = param.stride[0]; - strides[1] = param.stride[1]; - } - mkldnn::memory::dims padding{0, 0}; - if (param.pad.ndim() == 2) { - padding[0] = param.pad[0]; - padding[1] = param.pad[1]; - } - if (param.dilate.ndim() == 0 && bias == nullptr) { - mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, - data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); - return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); - } else if (param.dilate.ndim() == 0) { - auto bias_md = GetMemDesc(*bias); - mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, - data_md, weight_md, bias_md, out_md, strides, padding, padding, - mkldnn::padding_kind::zero); - return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); - } else { - mkldnn::memory::dims dilates{0, 0}; - if (param.dilate.ndim() == 2) { - dilates[0] = param.dilate[0] - 1; - dilates[1] = param.dilate[1] - 1; - } - if (bias == nullptr) { - mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, - data_md, weight_md, out_md, strides, dilates, padding, padding, - mkldnn::padding_kind::zero); - return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); - } else { - auto bias_md = GetMemDesc(*bias); - mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, - data_md, weight_md, bias_md, out_md, - strides, dilates, padding, padding, - mkldnn::padding_kind::zero); - return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); - } - } -} - -class MKLDNNConvForward { - std::shared_ptr fwd; - std::shared_ptr data; - std::shared_ptr weight; - std::shared_ptr bias; - std::shared_ptr out; - - public: - mkldnn::convolution_forward::primitive_desc fwd_pd; - - MKLDNNConvForward(const ConvolutionParam& param, bool is_train, - const NDArray &data, const NDArray &weights, - const NDArray *bias, const NDArray &output): fwd_pd( - GetConvFwdImpl(param, is_train, data, weights, bias, output)) { - } - - void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &weight, - const mkldnn::memory *bias, const mkldnn::memory &output) { - if (this->data == nullptr) - this->data = std::shared_ptr(new mkldnn::memory( - 
fwd_pd.src_primitive_desc(), data.get_data_handle())); - else - this->data->set_data_handle(data.get_data_handle()); - - if (this->weight == nullptr) - this->weight = std::shared_ptr(new mkldnn::memory( - fwd_pd.weights_primitive_desc(), weight.get_data_handle())); - else - this->weight->set_data_handle(weight.get_data_handle()); - - if (this->out == nullptr) - this->out = std::shared_ptr(new mkldnn::memory( - fwd_pd.dst_primitive_desc(), output.get_data_handle())); - else - this->out->set_data_handle(output.get_data_handle()); - - if (bias != nullptr) { - if (this->bias == nullptr) - this->bias = std::shared_ptr(new mkldnn::memory( - fwd_pd.bias_primitive_desc(), bias->get_data_handle())); - else - this->bias->set_data_handle(bias->get_data_handle()); - if (this->fwd == nullptr) - this->fwd = std::shared_ptr( - new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data), - mkldnn::primitive::at(*this->weight), - mkldnn::primitive::at(*this->bias), - *this->out)); - } else if (this->fwd == nullptr) { - this->fwd = std::shared_ptr( - new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data), - mkldnn::primitive::at(*this->weight), - *this->out)); - } - } - - const mkldnn::convolution_forward &GetFwd() const { - return *fwd; - } -}; - -typedef MKLDNNParamOpSign MKLDNNConvSignature; - -static inline MKLDNNConvForward &GetConvFwd( - const nnvm::NodeAttrs& attrs, bool is_train, - const NDArray &data, const NDArray &weights, - const NDArray *bias, const NDArray &output) { - static thread_local std::unordered_map fwds; - const ConvolutionParam& param = nnvm::get(attrs.parsed); - MKLDNNConvSignature key(param); - key.AddSign(is_train); - // Here we can sign the conv op with NDArray because conv primitive will - // decide the right layout for the, so we only need to get the shape and the - // data type of the arrays. - key.AddSign(data); - key.AddSign(weights); - key.AddSign(output); - if (bias) - key.AddSign(*bias); - - auto it = fwds.find(key); - if (it == fwds.end()) { - MKLDNNConvForward fwd(param, is_train, data, weights, bias, output); - auto ins_ret = fwds.insert( - std::pair(key, fwd)); - CHECK(ins_ret.second); - it = ins_ret.first; - } - return it->second; -} - -void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { - TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]); - const ConvolutionParam& param = nnvm::get(attrs.parsed); - MKLDNNConvForward &fwd = GetConvFwd(attrs, - ctx.is_train, in_data[conv::kData], in_data[conv::kWeight], - param.no_bias ? nullptr : &in_data[conv::kBias], out_data[conv::kOut]); - - auto data_mem = in_data[conv::kData].GetMKLDNNDataReorder(fwd.fwd_pd.src_primitive_desc()); - const mkldnn::memory *weight_mem; - if (ctx.is_train) { - // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it - // to the default format for now. - if (in_data[conv::kWeight].IsMKLDNNData()) - const_cast(in_data[conv::kWeight]).Reorder2Default(); - weight_mem = GetWeights(in_data[conv::kWeight], fwd.fwd_pd.weights_primitive_desc(), - param.num_group); - } else { - // For inference, we want to reorder the weight array so we don't need to - // reorder data every time. 
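/*
 * The comment above describes a reorder-once strategy for inference: convert the
 * weights to the primitive's preferred layout a single time and reuse them on every
 * later call. A generic, library-free sketch of that idea under the assumption that
 * a layout tag and an in-place conversion exist; Weights and GetReorderedOnce are
 * illustrative names only.
 */
#include <vector>
struct Weights {
  std::vector<float> buf;  // flat storage
  int layout;              // whatever layout tag the backend prefers
};
static const Weights &GetReorderedOnce(Weights *w, int preferred_layout) {
  // Convert only when the stored layout differs; subsequent calls hit the fast path
  // and skip the conversion entirely, which is the point of the reorder above.
  if (w->layout != preferred_layout) {
    // ... permute w->buf into preferred_layout here ...
    w->layout = preferred_layout;
  }
  return *w;
}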
- const_cast(in_data[conv::kWeight]).MKLDNNDataReorder( - fwd.fwd_pd.weights_primitive_desc()); - weight_mem = in_data[conv::kWeight].GetMKLDNNData(); - } - auto out_mem = CreateMKLDNNMem(out_data[conv::kOut], fwd.fwd_pd.dst_primitive_desc(), - req[conv::kOut]); - const mkldnn::memory *bias_mem = nullptr; - if (!param.no_bias) - bias_mem = in_data[conv::kBias].GetMKLDNNDataReorder(fwd.fwd_pd.bias_primitive_desc()); - fwd.SetNewMem(*data_mem, *weight_mem, bias_mem, *out_mem.second); - MKLDNNStream::Get()->RegisterPrim(fwd.GetFwd()); - - CommitOutput(out_data[conv::kOut], out_mem); - MKLDNNStream::Get()->Submit(); -} - -void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]); - const std::vector &in_grad = outputs; - const ConvolutionParam& param = nnvm::get(attrs.parsed); - mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwdImpl(param, ctx.is_train, - inputs[conv::kData + 1], inputs[conv::kWeight + 1], - param.no_bias ? nullptr : &inputs[conv::kBias + 1], inputs[conv::kOut]); - - CHECK_NE(req[conv::kWeight], kWriteInplace) << "cannot write weight inplace"; - mkldnn::convolution_backward_data::primitive_desc bwdData_pd - = GetConvBwdData(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1], - inputs[conv::kOut], fwd_pd); - auto out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder( - bwdData_pd.diff_dst_primitive_desc()); - if (req[conv::kData]) { - auto weight_mem = GetWeights(inputs[conv::kWeight + 1], - bwdData_pd.weights_primitive_desc(), param.num_group); - auto in_grad_mem = CreateMKLDNNMem(in_grad[conv::kData], - bwdData_pd.diff_src_primitive_desc(), req[conv::kData]); - MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_data(bwdData_pd, - *out_grad_mem, *weight_mem, *in_grad_mem.second)); - CommitOutput(in_grad[conv::kData], in_grad_mem); - } - if (req[conv::kWeight]) { - mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd - = GetConvBwdWeights(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1], - param.no_bias ? 
nullptr : &inputs[conv::kBias + 1], - inputs[conv::kOut], fwd_pd); - if (bwdData_pd.diff_dst_primitive_desc() != bwdWeights_pd.diff_dst_primitive_desc()) - out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder( - bwdWeights_pd.diff_dst_primitive_desc()); - auto data_mem = inputs[conv::kData + 1].GetMKLDNNDataReorder( - bwdWeights_pd.src_primitive_desc()); - auto in_grad_weight = CreateMKLDNNWeightGrad(in_grad[conv::kWeight], - bwdWeights_pd.diff_weights_primitive_desc(), - req[conv::kWeight]); - mkldnn_output_t in_grad_bias; - if (param.no_bias) { - MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights( - bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second)); - } else { - in_grad_bias = CreateMKLDNNMem(in_grad[conv::kBias], - bwdWeights_pd.diff_bias_primitive_desc(), - req[conv::kBias]); - MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights( - bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second, - *in_grad_bias.second)); - CommitOutput(in_grad[conv::kBias], in_grad_bias); - } - CommitOutput(in_grad[conv::kWeight], in_grad_weight); - } - MKLDNNStream::Get()->Submit(); -} - -} // namespace op -} // namespace mxnet - -#endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_copy.cc b/src/operator/nn/mkldnn/mkldnn_copy.cc deleted file mode 100644 index 71d540c969cd..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_copy.cc +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn_softmax.cc - * \brief - * \author Da Zheng -*/ - -#include "../softmax-inl.h" -#include "./mkldnn_ops-inl.h" -#include "./mkldnn_base-inl.h" - -#if MXNET_USE_MKLDNN == 1 -namespace mxnet { -namespace op { - -void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const NDArray &in_data, const OpReqType &req, - const NDArray &out_data) { - TmpMemMgr::Get()->Init(ctx.requested[0]); - auto in_mem = in_data.GetMKLDNNData(); - if (req == kAddTo) { - TmpMemMgr::Get()->Init(ctx.requested[0]); - // We should try and force the output memory has the same format - // as the input memory. If not, we'll have to reorder memory. 
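/*
 * The kAddTo branch above computes "out += result" by summing the existing output
 * with the new value into a scratch buffer and copying it back. A library-free
 * sketch of the same accumulate-into-output pattern; a real implementation would
 * first reconcile memory formats, which is what the reorder above is for.
 */
#include <cstddef>
#include <vector>
static void AccumulateInto(std::vector<float> *out, const std::vector<float> &in) {
  const std::size_t n = out->size() < in.size() ? out->size() : in.size();
  for (std::size_t i = 0; i < n; ++i)
    (*out)[i] += in[i];  // element-wise accumulation into the destination
}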
- auto out_mem = out_data.GetMKLDNNData(in_mem->get_primitive_desc()); - if (out_mem == nullptr) - out_mem = out_data.GetMKLDNNData(); - auto sum_res = TmpMemMgr::Get()->Alloc(out_mem->get_primitive_desc()); - Sum(*in_mem, *out_mem, *sum_res); - const_cast(out_data).CopyFrom(*sum_res); - } else { - const_cast(out_data).CopyFrom(*in_mem); - } - MKLDNNStream::Get()->Submit(); -} - -} // namespace op -} // namespace mxnet -#endif diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc deleted file mode 100644 index d336d6dedbea..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc +++ /dev/null @@ -1,390 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn_deconvolution.cc - * \brief - * \author Da Zheng, Rong Zhang (rong.a.zhang@intel.com) -*/ - -#if MXNET_USE_MKLDNN == 1 - -#include "../deconvolution-inl.h" -#include "./mkldnn_ops-inl.h" -#include "./mkldnn_base-inl.h" - -namespace mxnet { -namespace op { - -static inline mkldnn::memory::desc GetBiasDesc(mkldnn::memory::desc md) { - mkldnn::memory::dims dims(1); - // This is convolution on 4D data. The second dimension is the channel. 
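/*
 * In this file the deconvolution forward pass is expressed through
 * convolution_backward_data, so the output spatial extent follows the usual
 * transposed-convolution arithmetic. A small sketch of that formula, ignoring any
 * output-padding adjustment; DeconvOutSize is an illustrative helper name only.
 */
static inline int DeconvOutSize(int in, int kernel, int stride, int pad, int dilate) {
  // A dilated kernel spans dilate*(kernel-1)+1 inputs; the transposed output is
  // stride*(in-1) + dilated_kernel - 2*pad.
  const int dilated_kernel = dilate * (kernel - 1) + 1;
  return stride * (in - 1) + dilated_kernel - 2 * pad;
}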
- dims[0] = md.data.dims[1]; - return mkldnn::memory::desc(dims, - static_cast(md.data.data_type), - mkldnn::memory::format::any); -} - -static mkldnn::convolution_forward::primitive_desc GetDeconvBwd_( - const mkldnn::memory::desc &data_md, const mkldnn::memory::desc &weights_md, - bool has_bias, const mkldnn::memory::desc &out_md, - const mkldnn::engine &engine, const mkldnn::memory::dims &strides, - const mkldnn::memory::dims &padding, const mkldnn::memory::dims &dilates) { - if (!has_bias) { - mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training, - mkldnn::algorithm::convolution_direct, out_md, weights_md, data_md, strides, - dilates, padding, padding, mkldnn::padding_kind::zero); - return mkldnn::convolution_forward::primitive_desc(desc, engine); - } else { - auto bias_md = GetBiasDesc(data_md); - mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training, - mkldnn::algorithm::convolution_direct, out_md, weights_md, bias_md, - data_md, strides, dilates, padding, padding, mkldnn::padding_kind::zero); - return mkldnn::convolution_forward::primitive_desc(desc, engine); - } -} - -static mkldnn::convolution_backward_data::primitive_desc GetDeconvFwdImpl( - const DeconvolutionParam& param, const NDArray &data, const NDArray &weights, - bool has_bias, const NDArray &output) { - auto data_md = GetMemDesc(data); - auto weight_md = GetWeightDesc(weights, param.num_group); - auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Get()->get_engine(); - mkldnn::memory::dims strides{0, 0}; - if (param.stride.ndim() == 2) { - strides[0] = param.stride[0]; - strides[1] = param.stride[1]; - } else if (param.stride.ndim() == 1) { - strides[0] = param.stride[0]; - strides[1] = param.stride[0]; - } else { - LOG(FATAL) << "Unsupported stride dim"; - } - mkldnn::memory::dims padding{0, 0}; - if (param.pad.ndim() == 2) { - padding[0] = param.pad[0]; - padding[1] = param.pad[1]; - } else if (param.pad.ndim() == 1) { - padding[0] = param.pad[0]; - padding[1] = param.pad[0]; - } else { - LOG(FATAL) << "Unsupported pad dim"; - } - mkldnn::memory::dims dilate{0, 0}; - if (param.dilate.ndim() == 2) { - dilate[0] = param.dilate[0] - 1; - dilate[1] = param.dilate[1] - 1; - } - auto bwd_pd = GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine, - strides, padding, dilate); - mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, - out_md, weight_md, data_md, strides, dilate, padding, padding, - mkldnn::padding_kind::zero); - return mkldnn::convolution_backward_data::primitive_desc(desc, engine, bwd_pd); -} - -static mkldnn::convolution_forward::primitive_desc GetDeconvBwdData( - const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, - bool has_bias, const NDArray &output) { - auto data_md = GetMemDesc(data); - auto weight_md = GetWeightDesc(weights, param.num_group); - auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Get()->get_engine(); - mkldnn::memory::dims strides{0, 0}; - if (param.stride.ndim() == 2) { - strides[0] = param.stride[0]; - strides[1] = param.stride[1]; - } else if (param.stride.ndim() == 1) { - strides[0] = param.stride[0]; - strides[1] = param.stride[0]; - } else { - LOG(FATAL) << "Unsupported stride dim"; - } - mkldnn::memory::dims padding{0, 0}; - if (param.pad.ndim() == 2) { - padding[0] = param.pad[0]; - padding[1] = param.pad[1]; - } else if (param.pad.ndim() == 1) { - padding[0] = param.pad[0]; - padding[1] = param.pad[0]; - } else { - LOG(FATAL) << "Unsupported pad dim"; - } - 
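/*
 * The stride/pad handling above broadcasts a single value to both spatial
 * dimensions and rejects any other rank with LOG(FATAL). A compact, library-free
 * sketch of that normalization; NormalizePair is an illustrative name only.
 */
#include <array>
#include <stdexcept>
#include <vector>
static std::array<int, 2> NormalizePair(const std::vector<int> &v) {
  if (v.size() == 2) return {v[0], v[1]};        // already (h, w)
  if (v.size() == 1) return {v[0], v[0]};        // broadcast one value to both dims
  throw std::runtime_error("unsupported ndim");  // mirrors the fatal branches above
}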
mkldnn::memory::dims dilate{0, 0}; - if (param.dilate.ndim() == 2) { - dilate[0] = param.dilate[0] - 1; - dilate[1] = param.dilate[1] - 1; - } - return GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine, - strides, padding, dilate); -} - -static mkldnn::convolution_backward_weights::primitive_desc GetDeconvBwdWeights( - const DeconvolutionParam& param, const NDArray &data, const NDArray &weights, - bool has_bias, const NDArray &output, - const mkldnn::convolution_forward::primitive_desc &fwd_pd) { - auto data_md = GetMemDesc(data); - auto weight_md = GetWeightDesc(weights, param.num_group); - auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Get()->get_engine(); - mkldnn::memory::dims strides{0, 0}; - if (param.stride.ndim() == 2) { - strides[0] = param.stride[0]; - strides[1] = param.stride[1]; - } else if (param.stride.ndim() == 1) { - strides[0] = param.stride[0]; - strides[1] = param.stride[0]; - } else { - LOG(FATAL) << "Unsupported stride dim"; - } - mkldnn::memory::dims padding{0, 0}; - if (param.pad.ndim() == 2) { - padding[0] = param.pad[0]; - padding[1] = param.pad[1]; - } else if (param.pad.ndim() == 1) { - padding[0] = param.pad[0]; - padding[1] = param.pad[0]; - } else { - LOG(FATAL) << "Unsupported pad dim"; - } - mkldnn::memory::dims dilate{0, 0}; - if (param.dilate.ndim() == 2) { - dilate[0] = param.dilate[0] - 1; - dilate[1] = param.dilate[1] - 1; - } - if (!has_bias) { - mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, - out_md, weight_md, data_md, strides, dilate, padding, padding, mkldnn::padding_kind::zero); - return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); - } else { - auto bias_md = GetBiasDesc(data_md); - mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, - out_md, weight_md, bias_md, data_md, strides, dilate, padding, padding, - mkldnn::padding_kind::zero); - return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); - } -} - -class MKLDNNDeconvForward { - std::shared_ptr fwd; - std::shared_ptr data; - std::shared_ptr weight; - std::shared_ptr bias; - std::shared_ptr out; - OutDataOp data_op; - - public: - MKLDNNDeconvForward(const DeconvolutionParam& param, - const NDArray &data, - const NDArray &weights, - bool has_bias, - const NDArray &output); - void SetDataHandle(const DeconvolutionParam& param, - const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data); - - void Execute(const std::vector &out_data); - - private: - mkldnn::convolution_backward_data::primitive_desc fwd_pd; -}; // class MKLDNNDeconvForward - -MKLDNNDeconvForward::MKLDNNDeconvForward(const DeconvolutionParam& param, - const NDArray &data, - const NDArray &weights, - bool has_bias, - const NDArray &output) - :fwd_pd(GetDeconvFwdImpl(param, data, weights, has_bias, output)) { - this->data = std::shared_ptr(new mkldnn::memory( - fwd_pd.diff_dst_primitive_desc())); - this->weight = std::shared_ptr(new mkldnn::memory( - fwd_pd.weights_primitive_desc())); - this->out = std::shared_ptr(new mkldnn::memory( - fwd_pd.diff_src_primitive_desc())); - this->fwd = std::shared_ptr( - new mkldnn::convolution_backward_data(fwd_pd, - mkldnn::primitive::at(*this->data), - mkldnn::primitive::at(*this->weight), - *this->out)); -} - -void MKLDNNDeconvForward::SetDataHandle(const DeconvolutionParam& param, - const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { - 
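/*
 * The class above builds its primitive and placeholder memories once in the
 * constructor; SetDataHandle below only swaps the underlying data pointers before
 * each execution. A generic sketch of that "construct once, rebind buffers per
 * call" pattern in plain C++; CachedKernel is an illustrative name only.
 */
#include <cstddef>
class CachedKernel {
 public:
  // Rebind input/output buffers without rebuilding anything expensive.
  void SetHandles(const float *in, float *out, std::size_t n) {
    in_ = in; out_ = out; n_ = n;
  }
  // Stands in for pushing the cached primitive onto the execution stream.
  void Execute() const {
    for (std::size_t i = 0; i < n_; ++i) out_[i] = in_[i];
  }
 private:
  const float *in_ = nullptr;
  float *out_ = nullptr;
  std::size_t n_ = 0;
};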
auto data_mem = in_data[deconv::kData].GetMKLDNNDataReorder( - fwd_pd.diff_dst_primitive_desc()); - const mkldnn::memory *weight_mem; - if (ctx.is_train) { - // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it - // to the default format for now. - if (in_data[deconv::kWeight].IsMKLDNNData()) - const_cast(in_data[deconv::kWeight]).Reorder2Default(); - weight_mem = GetWeights(in_data[deconv::kWeight], - fwd_pd.weights_primitive_desc(), - param.num_group); - } else { - // For inference, we want to reorder the weight array so we don't need to - // reorder data every time. - const_cast(in_data[deconv::kWeight]).MKLDNNDataReorder( - fwd_pd.weights_primitive_desc()); - weight_mem = in_data[deconv::kWeight].GetMKLDNNData(); - } - auto out_mem = CreateMKLDNNMem(out_data[deconv::kOut], - fwd_pd.diff_src_primitive_desc(), req[deconv::kOut]); - auto output = out_mem.second; - this->data->set_data_handle(data_mem->get_data_handle()); - this->weight->set_data_handle(weight_mem->get_data_handle()); - this->out->set_data_handle(output->get_data_handle()); - this->data_op = out_mem.first; -} - -void MKLDNNDeconvForward::Execute(const std::vector &out_data) { - MKLDNNStream::Get()->RegisterPrim(*fwd); - CommitOutput(out_data[deconv::kOut], mkldnn_output_t(this->data_op, this->out.get())); - MKLDNNStream::Get()->Submit(); -} - -static void MKLDNNDeconvFwdBiasPostProcess(const DeconvolutionParam& param, - const OpContext &ctx, - const std::vector &in_data, - const std::vector &out_data) { - // add bias, broadcast bias to dim 1: channel - if (!param.no_bias) { - // MKLDNN only supports float right now. - typedef float DType; - Stream *s = ctx.get_stream(); - Tensor bias = in_data[deconv::kBias].data().get(s); - // If the output data is stored in a special MKLDNN format, data() - // automatically converts its format to the default format. - // Unfortunately, MKLDNN doesn't support broadcast. - Tensor out_cpu = out_data[deconv::kOut].data().get(s); - out_cpu += mshadow::expr::broadcast<1>(bias, out_cpu.shape_); - } -} - -typedef MKLDNNParamOpSign MKLDNNDeconvSignature; - -static inline MKLDNNDeconvForward &GetDeconvFwd( - const nnvm::NodeAttrs& attrs, const NDArray &data, - const NDArray &weights, const NDArray *bias, - const NDArray &output) { - static thread_local - std::unordered_map fwds; - const DeconvolutionParam& param = nnvm::get(attrs.parsed); - MKLDNNDeconvSignature key(param); - // Here we can sign the conv op with NDArray because conv primitive will - // decide the right layout for the, so we only need to get the shape and the - // data type of the arrays. - key.AddSign(data); - key.AddSign(weights); - key.AddSign(output); - if (bias) - key.AddSign(*bias); - - auto it = fwds.find(key); - if (it == fwds.end()) { - bool has_bias = (bias != nullptr); - MKLDNNDeconvForward fwd(param, data, weights, has_bias, output); - auto ins_ret = fwds.insert( - std::pair(key, fwd)); - CHECK(ins_ret.second); - it = ins_ret.first; - } - return it->second; -} - -void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { - TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); - const DeconvolutionParam& param = nnvm::get(attrs.parsed); - - MKLDNNDeconvForward &deconvFwd = GetDeconvFwd( - attrs, in_data[deconv::kData], in_data[deconv::kWeight], - param.no_bias ? 
nullptr : &in_data[deconv::kBias], out_data[deconv::kOut]); - - deconvFwd.SetDataHandle(param, ctx, in_data, req, out_data); - - deconvFwd.Execute(out_data); - - MKLDNNDeconvFwdBiasPostProcess(param, ctx, in_data, out_data); -} - -void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); - const std::vector &in_grad = outputs; - const DeconvolutionParam& param = nnvm::get(attrs.parsed); - CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace"; - mkldnn::convolution_forward::primitive_desc bwdData_pd = GetDeconvBwdData( - param, inputs[deconv::kData + 1], inputs[deconv::kWeight + 1], false, - inputs[deconv::kOut]); - auto out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder( - bwdData_pd.src_primitive_desc()); - if (req[deconv::kData]) { - auto weight_mem = GetWeights(inputs[deconv::kWeight + 1], - bwdData_pd.weights_primitive_desc(), - param.num_group); - auto in_grad_mem = CreateMKLDNNMem(in_grad[deconv::kData], - bwdData_pd.dst_primitive_desc(), - req[deconv::kData]); - MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_forward(bwdData_pd, - *out_grad_mem, *weight_mem, *in_grad_mem.second)); - CommitOutput(in_grad[deconv::kData], in_grad_mem); - } - if (req[deconv::kWeight]) { - mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd - = GetDeconvBwdWeights(param, inputs[deconv::kData + 1], - inputs[deconv::kWeight + 1], false, inputs[deconv::kOut], bwdData_pd); - if (bwdData_pd.src_primitive_desc() != bwdWeights_pd.src_primitive_desc()) - out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder( - bwdWeights_pd.src_primitive_desc()); - auto data_mem = inputs[deconv::kData + 1].GetMKLDNNDataReorder( - bwdWeights_pd.diff_dst_primitive_desc()); - auto in_grad_weight = CreateMKLDNNWeightGrad(in_grad[deconv::kWeight], - bwdWeights_pd.diff_weights_primitive_desc(), - req[deconv::kWeight]); - MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights( - bwdWeights_pd, *out_grad_mem, *data_mem, *in_grad_weight.second)); - CommitOutput(in_grad[deconv::kWeight], in_grad_weight); - } - MKLDNNStream::Get()->Submit(); - if (!param.no_bias) { - typedef float DType; - Stream *s = ctx.get_stream(); - Tensor gbias = in_grad[deconv::kBias].data().get(s); - // If there is bias, the out grad has already been converted to the default - // format, so this shouldn't cause any performance issues. - Tensor grad = inputs[deconv::kOut].data().get(s); - Assign(gbias, req[deconv::kBias], mshadow::expr::sumall_except_dim<1>(grad)); - } -} - -} // namespace op -} // namespace mxnet - -#endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc deleted file mode 100644 index a8b85bbeb151..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc +++ /dev/null @@ -1,200 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn_fully_connected.cc - * \brief - * \author Da Zheng -*/ - -#include "../fully_connected-inl.h" -#include "./mkldnn_base-inl.h" - -#if MXNET_USE_MKLDNN == 1 -namespace mxnet { -namespace op { - -inline static mkldnn::inner_product_forward::primitive_desc GetIPFwd( - const NDArray &data, const NDArray &weight, const NDArray *bias, - const mkldnn::memory::desc &out_md) { - auto data_md = GetMemDesc(data); - auto weight_md = GetMemDesc(weight); - auto engine = CpuEngine::Get()->get_engine(); - if (bias) { - auto bias_md = GetMemDesc(*bias); - mkldnn::inner_product_forward::desc ipFwd_desc(mkldnn::prop_kind::forward_training, - data_md, weight_md, bias_md, out_md); - return mkldnn::inner_product_forward::primitive_desc(ipFwd_desc, engine); - } else { - mkldnn::inner_product_forward::desc ipFwd_desc(mkldnn::prop_kind::forward_training, - data_md, weight_md, out_md); - return mkldnn::inner_product_forward::primitive_desc(ipFwd_desc, engine); - } -} - -inline static mkldnn::inner_product_backward_data::primitive_desc GetIpBwdData( - const NDArray &data, const NDArray &weight, const NDArray &output, - mkldnn::inner_product_forward::primitive_desc ipFwd_pd) { - auto data_md = GetMemDesc(data); - auto weight_md = GetMemDesc(weight); - auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Get()->get_engine(); - mkldnn::inner_product_backward_data::desc desc(data_md, weight_md, out_md); - return mkldnn::inner_product_backward_data::primitive_desc(desc, engine, ipFwd_pd); -} - -inline static mkldnn::inner_product_backward_weights::primitive_desc GetIPBwdWeights( - const NDArray &data, const NDArray &weight, const NDArray *bias, - const NDArray &output, mkldnn::inner_product_forward::primitive_desc ipFwd_pd) { - auto data_md = GetMemDesc(data); - auto weight_md = GetMemDesc(weight); - auto out_md = GetMemDesc(output); - auto engine = CpuEngine::Get()->get_engine(); - if (bias) { - auto bias_md = GetMemDesc(*bias); - mkldnn::inner_product_backward_weights::desc ipBwdWeights_desc(data_md, - weight_md, bias_md, out_md); - return mkldnn::inner_product_backward_weights::primitive_desc( - ipBwdWeights_desc, engine, ipFwd_pd); - } else { - mkldnn::inner_product_backward_weights::desc ipBwdWeights_desc(data_md, - weight_md, out_md); - return mkldnn::inner_product_backward_weights::primitive_desc( - ipBwdWeights_desc, engine, ipFwd_pd); - } -} - -void MKLDNNFCForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data) { - TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]); - const FullyConnectedParam& param = nnvm::get(attrs.parsed); - const TShape& ishape = in_data[fullc::kData].shape(); - const TShape& oshape = out_data[fullc::kOut].shape(); - NDArray weight = in_data[fullc::kWeight]; - NDArray data = in_data[fullc::kData]; - auto out_md = GetMemDesc(out_data[fullc::kOut]); - if (data.shape().ndim() != 2 && !param.flatten) { - data = data.MKLDNNDataReshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1), - ishape[ishape.ndim()-1])); - mkldnn::memory::dims 
out_dims{static_cast(oshape.ProdShape(0, oshape.ndim()-1)), - static_cast(oshape[ishape.ndim()-1])}; - out_md = mkldnn::memory::desc(out_dims, get_mkldnn_type(out_data[fullc::kOut].dtype()), - mkldnn::memory::format::any); - } else if (data.shape().ndim() != 2) { - data = data.MKLDNNDataReshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim()))); - mkldnn::memory::dims out_dims{static_cast(oshape[0]), - static_cast(oshape.ProdShape(1, oshape.ndim()))}; - out_md = mkldnn::memory::desc(out_dims, get_mkldnn_type(out_data[fullc::kOut].dtype()), - mkldnn::memory::format::any); - } - - mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data, weight, - param.no_bias ? nullptr : &in_data[fullc::kBias], out_md); - auto data_mem = data.GetMKLDNNDataReorder(ipFwd_pd.src_primitive_desc()); - auto weight_mem = weight.GetMKLDNNDataReorder(ipFwd_pd.weights_primitive_desc()); - auto out_mem = CreateMKLDNNMem(out_data[fullc::kOut], - ipFwd_pd.dst_primitive_desc(), req[fullc::kOut]); - if (param.no_bias) { - MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_forward( - ipFwd_pd, *data_mem, *weight_mem, *out_mem.second)); - } else { - auto bias_mem = in_data[fullc::kBias].GetMKLDNNDataReorder(ipFwd_pd.bias_primitive_desc()); - MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_forward(ipFwd_pd, - *data_mem, *weight_mem, *bias_mem, *out_mem.second)); - } - CommitOutput(out_data[fullc::kOut], out_mem); - MKLDNNStream::Get()->Submit(); -} - -void MKLDNNFCBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]); - const std::vector &in_grad = outputs; - const FullyConnectedParam& param = nnvm::get(attrs.parsed); - const TShape& ishape = inputs[fullc::kData + 1].shape(); - const TShape& oshape = inputs[fullc::kOut].shape(); - - NDArray weight = inputs[fullc::kWeight + 1]; - NDArray data = inputs[fullc::kData + 1]; - if (data.shape().ndim() != 2 && !param.flatten) - data = data.MKLDNNDataReshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1), - ishape[ishape.ndim()-1])); - else if (data.shape().ndim() != 2) - data = data.MKLDNNDataReshape(Shape2(ishape[0], - ishape.ProdShape(1, ishape.ndim()))); - NDArray out_grad = inputs[fullc::kOut]; - if (out_grad.shape().ndim() != 2 && !param.flatten) - out_grad = out_grad.MKLDNNDataReshape(Shape2(oshape.ProdShape(0, oshape.ndim()-1), - oshape[oshape.ndim()-1])); - else if (out_grad.shape().ndim() != 2) - out_grad = out_grad.MKLDNNDataReshape(Shape2(oshape[0], - oshape.ProdShape(1, oshape.ndim()))); - - mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data, weight, - param.no_bias ? 
nullptr : &in_grad[fullc::kBias], GetMemDesc(out_grad)); - - CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace"; - if (req[fullc::kData]) { - mkldnn::inner_product_backward_data::primitive_desc ipBwdData_pd = GetIpBwdData( - data, weight, out_grad, ipFwd_pd); - auto out_grad_mem = out_grad.GetMKLDNNDataReorder( - ipBwdData_pd.diff_dst_primitive_desc()); - auto weight_mem = weight.GetMKLDNNDataReorder(ipBwdData_pd.weights_primitive_desc()); - auto in_grad_mem = CreateMKLDNNMem(in_grad[fullc::kData], - ipBwdData_pd.diff_src_primitive_desc(), - req[fullc::kData]); - MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_data( - ipBwdData_pd, *out_grad_mem, *weight_mem, *in_grad_mem.second)); - CommitOutput(in_grad[fullc::kData], in_grad_mem); - } - if (req[fullc::kWeight]) { - mkldnn::inner_product_backward_weights::primitive_desc ipBwdWeights_pd - = GetIPBwdWeights(data, weight, param.no_bias ? nullptr : &in_grad[fullc::kBias], - out_grad, ipFwd_pd); - auto out_grad_mem = out_grad.GetMKLDNNDataReorder( - ipBwdWeights_pd.diff_dst_primitive_desc()); - auto data_mem = data.GetMKLDNNDataReorder(ipBwdWeights_pd.src_primitive_desc()); - auto in_grad_weight = CreateMKLDNNWeightGrad(in_grad[fullc::kWeight], - ipBwdWeights_pd.diff_weights_primitive_desc(), - req[fullc::kWeight]); - mkldnn_output_t in_grad_bias; - if (param.no_bias) { - MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_weights( - ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second)); - } else { - in_grad_bias = CreateMKLDNNMem(in_grad[fullc::kBias], - ipBwdWeights_pd.diff_bias_primitive_desc(), - req[fullc::kBias]); - MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_weights( - ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second, - *in_grad_bias.second)); - } - CommitOutput(in_grad[fullc::kWeight], in_grad_weight); - CommitOutput(in_grad[fullc::kBias], in_grad_bias); - } - MKLDNNStream::Get()->Submit(); -} - -} // namespace op -} // namespace mxnet -#endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_lrn-inl.h b/src/operator/nn/mkldnn/mkldnn_lrn-inl.h deleted file mode 100644 index 9a9bf62b67d0..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_lrn-inl.h +++ /dev/null @@ -1,143 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * \file mkldnn_lrn-inl.h - * \brief - * \Author: Patric Zhao, patric.zhao@intel.com -*/ -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_ - -#if MXNET_USE_MKLDNN == 1 -#include -#include "../lrn-inl.h" -#include "./mkldnn_base-inl.h" - -namespace mxnet { -namespace op { - -inline algorithm GetMKLDNNLRNAlgo(const LRNParam ¶m) { - // TODO(Patric): lrn_within_channel will cause core dump in MKLDNN backward - // Need to confirm with MKLDNN team and fix later - return algorithm::lrn_across_channels; -} - -inline lrn_forward::primitive_desc GetLRNFwd(const LRNParam ¶m, - const bool is_train, - const memory::desc &src_md) { - const auto engine = CpuEngine::Get()->get_engine(); - const auto alg = GetMKLDNNLRNAlgo(param); - const float alpha = param.alpha; - const float beta = param.beta; - const int nsize = param.nsize; - const float k = param.knorm; - auto kind = prop_kind::forward_training; - if (is_train) { - kind = prop_kind::forward_training; - } else { - kind = prop_kind::forward_scoring; - } - lrn_forward::desc fwd_desc(kind, alg, src_md, nsize, alpha, beta, k); - return mkldnn::lrn_forward::primitive_desc(fwd_desc, engine); -} - -inline mkldnn::lrn_backward::primitive_desc -GetLRNBwd(const LRNParam ¶m, - const mkldnn::memory::desc &diff_in_md, - const mkldnn::memory::desc &diff_md, - const lrn_forward::primitive_desc &lrnFwd_desc) { - const auto engine = CpuEngine::Get()->get_engine(); - const auto alg = GetMKLDNNLRNAlgo(param); - const float alpha = param.alpha; - const float beta = param.beta; - const int nsize = param.nsize; - const float k = param.knorm; - - lrn_backward::desc lrnBwd_desc(alg, diff_in_md, - diff_md, nsize, alpha, beta, k); - return mkldnn::lrn_backward::primitive_desc(lrnBwd_desc, - engine, lrnFwd_desc); -} - -void MKLDNNLRNForward(const OpContext &ctx, - const LRNParam ¶m, - const NDArray &in_data, - const OpReqType req, - const NDArray &out_data) { - auto src_mem = in_data.GetMKLDNNData(); - const auto src_md = src_mem->get_primitive_desc().desc(); - const auto pdesc = GetLRNFwd(param, ctx.is_train, src_md); - auto dst_mem = const_cast(out_data).CreateMKLDNNData( - pdesc.dst_primitive_desc()); - if (ctx.is_train) { - std::shared_ptr ws_mem( - new mkldnn::memory(pdesc.workspace_primitive_desc())); - MKLDNNStream::Get()->RegisterPrim( - lrn_forward(pdesc, mkldnn::primitive::at(*src_mem), - *ws_mem, *dst_mem)); - MKLDNNStream::Get()->Submit(); - } else { - MKLDNNStream::Get()->RegisterPrim( - lrn_forward(pdesc, mkldnn::primitive::at(*src_mem), *dst_mem)); - MKLDNNStream::Get()->Submit(); - } -} - -void MKLDNNLRNBackward(const OpContext &ctx, const LRNParam ¶m, - const NDArray &out_grad, - const NDArray &in_data, - const OpReqType req, - const NDArray &in_grad) { - if (req == kNullOp) { - return; - } - // Repeat FW for getting workspace - auto data_mem = in_data.GetMKLDNNData(); - const auto data_md = data_mem->get_primitive_desc().desc(); - const auto pdesc_fwd = GetLRNFwd(param, ctx.is_train, data_md); - - // TODO(Patric): To keep the function stateless, we can't pass workspace - // from LRN forward to backward. We have to re-compute - // LRN forward to get the workspace. - // Will refine this code later. 
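/*
 * The TODO above explains that, to keep the backward path stateless, the forward
 * computation is re-run purely to regenerate the workspace it produced. A minimal
 * sketch of that structure with placeholder callables; the extra forward pass is
 * the cost paid for not carrying state between forward and backward.
 */
#include <functional>
#include <vector>
static void StatelessBackward(
    const std::function<std::vector<float>()> &forward,                 // recomputed here
    const std::function<void(const std::vector<float> &)> &backward) {  // real gradient work
  std::vector<float> workspace = forward();  // run forward again only for its workspace
  backward(workspace);
}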
- std::shared_ptr ws_mem( - new mkldnn::memory(pdesc_fwd.workspace_primitive_desc())); - std::shared_ptr dst_temp( - new mkldnn::memory(pdesc_fwd.dst_primitive_desc())); - MKLDNNStream::Get()->RegisterPrim( - lrn_forward(pdesc_fwd, mkldnn::primitive::at(*data_mem), - *ws_mem, *dst_temp)); - - const auto data_in_md = pdesc_fwd.src_primitive_desc().desc(); - auto diff_mem = out_grad.GetMKLDNNData(); - const auto diff_md = diff_mem->get_primitive_desc().desc(); - const auto pdesc_bwd = GetLRNBwd(param, data_in_md, diff_md, pdesc_fwd); - auto diff_src_mem = CreateMKLDNNMem(in_grad, - pdesc_bwd.diff_src_primitive_desc(), req); - - MKLDNNStream::Get()->RegisterPrim( - lrn_backward(pdesc_bwd, mkldnn::primitive::at(*data_mem), - mkldnn::primitive::at(*diff_mem), *ws_mem, *diff_src_mem.second)); - MKLDNNStream::Get()->Submit(); -} -} // namespace op -} // namespace mxnet -#endif // MXNET_USE_MKLDNN == 1 -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H__ diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h deleted file mode 100644 index 9149cb0c6a94..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h +++ /dev/null @@ -1,114 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn_ops-inl.h - * \brief - * \author Da Zheng -*/ - -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ - -#if MXNET_USE_MKLDNN == 1 - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace mxnet { -namespace op { - -/* For fully connected. */ -void MKLDNNFCForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data); -void MKLDNNFCBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs); - -/* For convolution. 
*/ -void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data); -void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs); - -/* For deconvolution */ -void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data); -void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs); - -/* For softmax */ -void MKLDNNSoftmaxForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const NDArray &in_data, const OpReqType &req, - const NDArray &out_data); - -/* For sum */ -void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &inputs, const OpReqType &req, - const NDArray &out_data); - -/* For copy */ -void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const NDArray &in_data, const OpReqType &req, - const NDArray &out_data); - -/* For concat */ -void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data); -void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs); - -/* For activation */ -void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const NDArray &in_data, const OpReqType &req, - const NDArray &out_data); -void MKLDNNActivationBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const NDArray &out_grad, const NDArray &in_data, - const OpReqType &req, const NDArray &in_grad); - -void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2, - const mkldnn::memory &out); - -} // namespace op -} // namespace mxnet -#endif // MXNET_USE_MKLDNN == 1 - -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h deleted file mode 100644 index 4f2f71866e14..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h +++ /dev/null @@ -1,121 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * \file mkldnn_pooling-inl.h - * \brief -*/ -#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_ -#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_ - -#if MXNET_USE_MKLDNN == 1 - -#include -#include -#include "../pooling-inl.h" -#include "./mkldnn_base-inl.h" - -namespace mxnet { -namespace op { - -class MKLDNNPoolingFwd { - public: - MKLDNNPoolingFwd(const mxnet::NDArray &input, - const mxnet::NDArray &output, - const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, - const int padding_t, const int padding_b, - const int padding_l, const int padding_r, - const mkldnn::algorithm alg_kind, - const bool with_workspace, const bool is_train) : - is_train_(is_train), - with_workspace_(with_workspace), - alg_kind_(alg_kind), - fwd_(nullptr), data_(nullptr), out_(nullptr), workspace_(nullptr) { - Init(input, output, - kernel_h, kernel_w, stride_h, stride_w, - padding_t, padding_b, padding_l, padding_r); - } - - ~MKLDNNPoolingFwd() {} - void SetDataHandle(const mxnet::NDArray &data, - const mxnet::NDArray &output, - const mxnet::NDArray *workspace = nullptr); - void Execute(); - - private: - bool is_train_; - bool with_workspace_; - mkldnn::algorithm alg_kind_; - std::shared_ptr fwd_pd_; - std::shared_ptr fwd_; - std::shared_ptr data_; - std::shared_ptr out_; - std::shared_ptr workspace_; - - private: - void Init(const mxnet::NDArray &input, - const mxnet::NDArray &output, - const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, - const int padding_t, const int padding_b, - const int padding_l, const int padding_r); -}; - -inline bool SupportMKLDNNPooling(const PoolingParam ¶m) { - return param.kernel.ndim() == 2 && - (param.pool_type == pool_enum::kMaxPooling || - param.pool_type == pool_enum::kAvgPooling) - // This is a temporary fix. There is a bug in global pooling of MKLDNN. - && !param.global_pool; -} - -inline bool SupportMKLDNNPooling(const PoolingParam ¶m, - const TShape &dshape) { - bool ret = SupportMKLDNNPooling(param); - if (!ret) - return false; - - if (param.pooling_convention == pool_enum::kValid) - return true; - - if (((dshape[2] + 2 * param.pad[0] - param.kernel[0]) % param.stride[0] == 0) && - ((dshape[3] + 2 * param.pad[1] - param.kernel[1]) % param.stride[1] == 0)) - return true; - else - return false; -} - -inline bool MKLDNNRequireWorkspace(const PoolingParam ¶m) { - return param.pool_type != pool_enum::kAvgPooling; -} - -typedef MKLDNNParamOpSign MKLDNNPoolingSignature; -void MKLDNNPoolingCompute(const OpContext &ctx, const PoolingParam ¶m, - const NDArray &in_data, const OpReqType req, - const NDArray &out_data, const NDArray *workspace); - -void MKLDNNPoolingGradCompute(const OpContext &ctx, const PoolingParam ¶m, - const NDArray &out_grad, const NDArray &in_data, - const NDArray *workspace, const OpReqType req, - const NDArray &in_grad); -} // namespace op -} // namespace mxnet -#endif // MXNET_USE_MKLDNN == 1 -#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_pooling.cc b/src/operator/nn/mkldnn/mkldnn_pooling.cc deleted file mode 100644 index 6eeecaf07271..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_pooling.cc +++ /dev/null @@ -1,322 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. 
The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn_pooling.cc - * \brief - * \author Tao Lv -*/ - -#if MXNET_USE_MKLDNN == 1 - -#include "./mkldnn_pooling-inl.h" - -namespace mxnet { -namespace op { - -void MKLDNNPoolingFwd::Init(const mxnet::NDArray &input, const mxnet::NDArray &output, - const int kernel_h, const int kernel_w, - const int stride_h, const int stride_w, - const int padding_t, const int padding_b, - const int padding_l, const int padding_r) { - // mkldnn::memory::desc - auto src_md = input.GetMKLDNNData()->get_primitive_desc().desc(); - mkldnn::memory::dims dims = {src_md.data.dims[0], - src_md.data.dims[1], - static_cast(output.shape()[2]), - static_cast(output.shape()[3])}; - auto dst_md = mkldnn::memory::desc({dims}, - static_cast(src_md.data.data_type), - static_cast(src_md.data.format)); - const mkldnn::engine engine = CpuEngine::Get()->get_engine(); - const mkldnn::algorithm alg_kind = this->alg_kind_; - if (alg_kind != mkldnn::algorithm::pooling_max && - alg_kind != mkldnn::algorithm::pooling_avg && - alg_kind != mkldnn::algorithm::pooling_avg_include_padding && - alg_kind != mkldnn::algorithm::pooling_avg_exclude_padding) { - LOG(FATAL) << "MKLDNN Pooling: algorithm is not supported"; - } - - mkldnn::prop_kind prop = mkldnn::prop_kind::forward_scoring; - if (this->is_train_ && alg_kind != mkldnn::algorithm::pooling_avg) { - prop = mkldnn::prop_kind::forward_training; - } - if (this->is_train_ && prop == mkldnn::prop_kind::forward_scoring) { - LOG(INFO) << "MKLDNN Pooling: training with prop_kind is forward_scoring"; - } - - const mkldnn::memory::dims strides = {stride_h, stride_w }; - const mkldnn::memory::dims pad_l = {padding_t, padding_l }; - const mkldnn::memory::dims pad_r = {padding_b, padding_r }; - const mkldnn::memory::dims kernel = {kernel_h, kernel_w }; - // mkldnn::pooling_forward::desc - const auto fwd_desc = mkldnn::pooling_forward::desc(prop, alg_kind, src_md, dst_md, - strides, kernel, pad_l, pad_r, - mkldnn::padding_kind::zero); - this->fwd_pd_.reset(new mkldnn::pooling_forward::primitive_desc(fwd_desc, engine)); - this->data_.reset(new mkldnn::memory(input.GetMKLDNNData()->get_primitive_desc())); - this->out_.reset(new mkldnn::memory(this->fwd_pd_->dst_primitive_desc())); - if (this->with_workspace_) { - this->workspace_.reset(new mkldnn::memory(this->fwd_pd_->workspace_primitive_desc())); - this->fwd_.reset(new mkldnn::pooling_forward(*(this->fwd_pd_), - mkldnn::primitive::at(*(this->data_)), - *(this->out_), - *(this->workspace_))); - } else { - this->fwd_.reset(new mkldnn::pooling_forward(*(this->fwd_pd_), - mkldnn::primitive::at(*(this->data_)), - *(this->out_))); - } - return; -} - -void MKLDNNPoolingFwd::SetDataHandle(const mxnet::NDArray &data, - const mxnet::NDArray &output, - const mxnet::NDArray *workspace) { - // mkldnn::memory - auto data_mem = data.GetMKLDNNData(); - auto out_mem = const_cast(output).CreateMKLDNNData( - this->fwd_pd_->dst_primitive_desc()); - 
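/*
 * The workspace checked just below exists because max pooling must remember, for
 * each output element, which input element produced the maximum, so the backward
 * pass can route the gradient there. A tiny 1-D sketch of that idea; the MKLDNN
 * workspace encodes the same information in an opaque format.
 */
#include <cstddef>
#include <vector>
static void MaxPoolBackward1D(const std::vector<std::size_t> &argmax,  // winner per output
                              const std::vector<float> &out_grad,
                              std::vector<float> *in_grad) {
  for (std::size_t o = 0; o < argmax.size(); ++o)
    (*in_grad)[argmax[o]] += out_grad[o];  // gradient flows only to the max location
}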
this->data_->set_data_handle(data_mem->get_data_handle()); - this->out_->set_data_handle(out_mem->get_data_handle()); - if (this->with_workspace_ && workspace == nullptr) { - LOG(FATAL) << "MKLDNN Pooling: incorrect workspace input"; - } - - if (this->with_workspace_) { - // mkldnn::memory - auto ws_mem = workspace->GetMKLDNNData(); - this->workspace_->set_data_handle(ws_mem->get_data_handle()); - } -} - -void MKLDNNPoolingFwd::Execute() { - if (this->fwd_) { - MKLDNNStream::Get()->RegisterPrim(*(this->fwd_)); - MKLDNNStream::Get()->Submit(); - } else { - LOG(FATAL) << "MKLDNN Pooling: forward primitive is nullptr"; - } -} - -mkldnn::algorithm GetMKLDNNPoolAlgo(const PoolingParam ¶m) { - switch (param.pool_type) { - case pool_enum::kMaxPooling: - return mkldnn::algorithm::pooling_max; - break; - case pool_enum::kAvgPooling: - return mkldnn::algorithm::pooling_avg_include_padding; - break; - default: - LOG(FATAL) << "MKLDNN Pooling: Unknown pooling method."; - return mkldnn::algorithm::pooling_max; - } -} - -mkldnn::pooling_forward::primitive_desc GetPoolingFwd(const PoolingParam ¶m, - const bool is_train, - const memory::desc &data_md, - const memory::desc &out_md) { - CHECK_EQ(param.kernel.ndim(), 2) << "Not Implemented"; - int kernel_h_, kernel_w_; - if (param.global_pool) { - kernel_h_ = data_md.data.dims[2]; - kernel_w_ = data_md.data.dims[3]; - } else { - kernel_h_ = param.kernel[0]; - kernel_w_ = param.kernel[1]; - } - - CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero."; - CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero."; - - const int pad_t_ = param.pad[0], pad_b_ = param.pad[0]; - const int pad_l_ = param.pad[1], pad_r_ = param.pad[1]; - const int stride_h_ = param.stride[0], stride_w_ = param.stride[1]; - - const mkldnn::engine engine = CpuEngine::Get()->get_engine(); - if (param.global_pool) { - CHECK(pad_t_ == 0 && pad_l_ == 0 && stride_h_ == 1 && stride_w_ == 1) - << "With Global_pooling: true; only pad = 0 and stride = 1"; - } - if (pad_t_ != 0 || pad_l_ != 0) { - CHECK(param.pool_type == pool_enum::kAvgPooling || - param.pool_type == pool_enum::kMaxPooling) - << "Padding implemented only for average and max pooling."; - CHECK_LT(pad_l_, kernel_w_); - CHECK_LT(pad_t_, kernel_h_); - } - - - const mkldnn::algorithm alg = GetMKLDNNPoolAlgo(param); - mkldnn::prop_kind kind = mkldnn::prop_kind::forward_scoring; - if (is_train && alg != algorithm::pooling_avg) { - kind = mkldnn::prop_kind::forward_training; - } - - const pooling_forward::desc poolingFwd_desc(kind, alg, data_md, out_md, - {static_cast(stride_h_), - static_cast(stride_w_)}, - {kernel_h_, kernel_w_}, - {static_cast(pad_t_), - static_cast(pad_l_)}, - {static_cast(pad_b_), - static_cast(pad_r_)}, - padding_kind::zero); - return mkldnn::pooling_forward::primitive_desc(poolingFwd_desc, engine); -} - -MKLDNNPoolingFwd &GetPoolingFwd(const PoolingParam ¶m, - const bool is_train, - const NDArray &data, - const NDArray &output) { - static thread_local std::unordered_map pooling_fwds; - - bool with_workspace = is_train && MKLDNNRequireWorkspace(param); - MKLDNNPoolingSignature key(param); - key.AddSign(is_train); - key.AddSign(with_workspace); - key.AddSign(data); - key.AddSign(output); - - auto it = pooling_fwds.find(key); - if (it == pooling_fwds.end()) { - CHECK_EQ(param.kernel.ndim(), 2) << "Not Implemented"; - auto data_md = data.GetMKLDNNData()->get_primitive_desc().desc(); - int kernel_h_, kernel_w_; - if (param.global_pool) { - kernel_h_ = data_md.data.dims[2]; - kernel_w_ = 
data_md.data.dims[3]; - } else { - kernel_h_ = param.kernel[0]; - kernel_w_ = param.kernel[1]; - } - - CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero."; - CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero."; - - const int pad_t_ = param.pad[0], pad_b_ = param.pad[0]; - const int pad_l_ = param.pad[1], pad_r_ = param.pad[1]; - const int stride_h_ = param.stride[0], stride_w_ = param.stride[1]; - - if (param.global_pool) { - CHECK(pad_t_ == 0 && pad_l_ == 0 && stride_h_ == 1 && stride_w_ == 1) - << "With Global_pooling: true; only pad = 0 and stride = 1"; - } - - if (pad_t_ != 0 || pad_l_ != 0) { - CHECK(param.pool_type == pool_enum::kAvgPooling || - param.pool_type == pool_enum::kMaxPooling) - << "Padding implemented only for average and max pooling."; - CHECK_LT(pad_l_, kernel_w_); - CHECK_LT(pad_t_, kernel_h_); - } - - const mkldnn::algorithm alg = GetMKLDNNPoolAlgo(param); - MKLDNNPoolingFwd fwd(data, output, kernel_h_, kernel_w_, stride_h_, stride_w_, - pad_t_, pad_b_, pad_l_, pad_r_, alg, with_workspace, is_train); - auto ins_ret = pooling_fwds.insert( - std::pair(key, fwd)); - CHECK(ins_ret.second); - it = ins_ret.first; - } - return it->second; -} - -void MKLDNNPoolingCompute(const OpContext &ctx, const PoolingParam ¶m, - const NDArray &in_data, const OpReqType req, - const NDArray &out_data, const NDArray *workspace) { - auto fwd = GetPoolingFwd(param, ctx.is_train, in_data, out_data); - fwd.SetDataHandle(in_data, out_data, workspace); - fwd.Execute(); -} - -void MKLDNNPoolingGradCompute(const OpContext &ctx, const PoolingParam ¶m, - const NDArray &out_grad, const NDArray &in_data, - const NDArray *workspace, const OpReqType req, - const NDArray &in_grad) { - if (req == kNullOp) { - return; - } - - TmpMemMgr::Get()->Init(ctx.requested[0]); - // mkldnn::memory - auto diff_dst_mem = out_grad.GetMKLDNNData(); - auto input_mem = in_data.GetMKLDNNData(); - mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); - const mkldnn::memory::desc data_md = data_mpd.desc(); - const memory::dims dims = {data_md.data.dims[0], data_md.data.dims[1], - static_cast(out_grad.shape()[2]), - static_cast(out_grad.shape()[3])}; - const memory::desc out_md({dims}, - static_cast(data_md.data.data_type), - static_cast(data_md.data.format)); - auto pdesc_fwd = GetPoolingFwd(param, ctx.is_train, data_md, out_md); - - const mkldnn::memory::desc diff_md = diff_dst_mem->get_primitive_desc().desc(); - const memory::dims dims1 = {diff_md.data.dims[0], diff_md.data.dims[1], - static_cast(in_grad.shape()[2]), - static_cast(in_grad.shape()[3])}; - const memory::desc diff_in_md( - {dims1}, static_cast(diff_md.data.data_type), - static_cast(diff_md.data.format)); - const mkldnn::engine cpu_engine = data_mpd.get_engine(); - const mkldnn::algorithm alg = GetMKLDNNPoolAlgo(param); - - int kernel_h_, kernel_w_; - if (param.global_pool) { - kernel_h_ = data_md.data.dims[2]; - kernel_w_ = data_md.data.dims[3]; - } else { - kernel_h_ = param.kernel[0]; - kernel_w_ = param.kernel[1]; - } - const pooling_backward::desc desc(alg, diff_in_md, diff_md, - {static_cast(param.stride[0]), - static_cast(param.stride[1])}, - {kernel_h_, kernel_w_}, - {static_cast(param.pad[0]), - static_cast(param.pad[1])}, - {static_cast(param.pad[0]), - static_cast(param.pad[1])}, - mkldnn::padding_kind::zero); - const pooling_backward::primitive_desc pdesc(desc, cpu_engine, pdesc_fwd); - - auto diff_src_mem = - CreateMKLDNNMem(in_grad, pdesc.diff_src_primitive_desc(), req); - - if 
(MKLDNNRequireWorkspace(param)) { - CHECK(workspace != nullptr); - auto workspace_mem = workspace->GetMKLDNNData(); - MKLDNNStream::Get()->RegisterPrim( - pooling_backward(pdesc, *diff_dst_mem, primitive::at(*workspace_mem), - *diff_src_mem.second)); - } else { - MKLDNNStream::Get()->RegisterPrim( - pooling_backward(pdesc, *diff_dst_mem, *diff_src_mem.second)); - } - CommitOutput(in_grad, diff_src_mem); - MKLDNNStream::Get()->Submit(); -} - -} // namespace op -} // namespace mxnet -#endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_softmax.cc b/src/operator/nn/mkldnn/mkldnn_softmax.cc deleted file mode 100644 index aa59f13d06da..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_softmax.cc +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn_softmax.cc - * \brief - * \author Da Zheng -*/ - -#include "../softmax-inl.h" -#include "./mkldnn_ops-inl.h" -#include "./mkldnn_base-inl.h" - -#if MXNET_USE_MKLDNN == 1 -namespace mxnet { -namespace op { - -void MKLDNNSoftmaxForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const NDArray &in_data, const OpReqType &req, - const NDArray &out_data) { - const SoftmaxParam& param = nnvm::get(attrs.parsed); - auto input_mem = in_data.GetMKLDNNData(); - mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); - mkldnn::memory::desc data_md = data_mpd.desc(); - auto cpu_engine = data_mpd.get_engine(); - auto prop = ctx.is_train - ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring; - mkldnn::softmax_forward::desc desc = mkldnn::softmax_forward::desc(prop, - data_md, param.axis); - mkldnn::softmax_forward::primitive_desc pdesc(desc, cpu_engine); - - auto output_memory = out_data.GetMKLDNNData(); - MKLDNNStream *stream = MKLDNNStream::Get(); - stream->RegisterPrim(mkldnn::softmax_forward(pdesc, *input_mem, *output_memory)); - stream->Submit(); -} - -} // namespace op -} // namespace mxnet -#endif diff --git a/src/operator/nn/mkldnn/mkldnn_sum.cc b/src/operator/nn/mkldnn/mkldnn_sum.cc deleted file mode 100644 index f3aeacf17dd1..000000000000 --- a/src/operator/nn/mkldnn/mkldnn_sum.cc +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn_sum.cc - * \brief - * \author Da Zheng -*/ -#include - -#include "./mkldnn_ops-inl.h" -#include "./mkldnn_base-inl.h" - -#if MXNET_USE_MKLDNN == 1 -namespace mxnet { -namespace op { - -void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2, - const mkldnn::memory &out) { - std::vector input_pds(2); - std::vector scales(2, 1); - std::vector inputs; - input_pds[0] = arr1.get_primitive_desc(); - input_pds[1] = arr2.get_primitive_desc(); - CHECK(input_pds[0] == input_pds[1]); - inputs.push_back(arr1); - inputs.push_back(arr2); - // TODO(zhengda) I need to reorder memory here. - mkldnn::sum::primitive_desc sum_pd(scales, input_pds); - MKLDNNStream::Get()->RegisterPrim(mkldnn::sum(sum_pd, inputs, out)); -} - -void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, - const std::vector &inputs, const OpReqType &req, - const NDArray &out_data) { - TmpMemMgr::Get()->Init(ctx.requested[0]); - std::vector in_prims; - std::vector in_pds(inputs.size()); - std::vector scales(inputs.size(), 1); - in_prims.reserve(inputs.size()); - for (size_t i = 0; i < inputs.size(); i++) { - auto in_mem = inputs[i].GetMKLDNNData(); - in_prims.push_back(*in_mem); - in_pds[i] = in_mem->get_primitive_desc(); - } - mkldnn::sum::primitive_desc pdesc(scales, in_pds); - - auto out_mem = CreateMKLDNNMem(out_data, pdesc.dst_primitive_desc(), req); - MKLDNNStream *stream = MKLDNNStream::Get(); - stream->RegisterPrim(mkldnn::sum(pdesc, in_prims, *out_mem.second)); - CommitOutput(out_data, out_mem); - stream->Submit(); -} - -} // namespace op -} // namespace mxnet -#endif diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h index 7a20f026f7b9..a32aaa2152e9 100644 --- a/src/operator/nn/pooling-inl.h +++ b/src/operator/nn/pooling-inl.h @@ -21,7 +21,7 @@ * Copyright (c) 2017 by Contributors * \file pooling-inl.h * \brief - * \author Bing Xu, Jun Wu, Da Zheng + * \author Bing Xu, Jun Wu */ #ifndef MXNET_OPERATOR_NN_POOLING_INL_H_ @@ -78,138 +78,257 @@ struct PoolingParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(pad).set_default(TShape()) .describe("Pad for pooling: (y, x) or (d, y, x). 
Defaults to no padding."); } +}; - bool operator==(const PoolingParam& other) const { - return this->kernel == other.kernel && - this->stride == other.stride && - this->pad == other.pad && - this->pool_type == other.pool_type && - this->pooling_convention == other.pooling_convention && - this->global_pool == other.global_pool && - this->cudnn_off == other.cudnn_off; +template +class PoolingOp : public Operator { + public: + explicit PoolingOp(PoolingParam p) { + this->param_ = p; } -}; -} // namespace op -} // namespace mxnet + virtual void Forward(const OpContext& ctx, + const std::vector& in_data, + const std::vector& req, + const std::vector& out_data, + const std::vector& aux_args) { + using namespace mshadow; + CHECK_EQ(in_data.size(), 1U); + CHECK_EQ(out_data.size(), 1U); + Stream *s = ctx.get_stream(); + const TShape& ishape = in_data[pool_enum::kData].shape_; -namespace std { -template<> -struct hash { - size_t operator()(const mxnet::op::PoolingParam& val) { - size_t ret = 0; - ret = dmlc::HashCombine(ret, val.kernel); - ret = dmlc::HashCombine(ret, val.stride); - ret = dmlc::HashCombine(ret, val.pad); - ret = dmlc::HashCombine(ret, val.pool_type); - ret = dmlc::HashCombine(ret, val.pooling_convention); - ret = dmlc::HashCombine(ret, val.global_pool); - ret = dmlc::HashCombine(ret, val.cudnn_off); - return ret; + pool(s, in_data[pool_enum::kData].dptr(), + in_data[pool_enum::kData].shape_, + out_data[pool_enum::kOut].shape_, + param_.global_pool? + TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim()) + : param_.kernel, + param_.pad, + param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride, + param_.pool_type, + req[pool_enum::kOut], + out_data[pool_enum::kOut].dptr()); } -}; -} // namespace std -namespace mxnet { -namespace op { - -/* - * When MKLDNN is enabled, we might want 2 outputs instead of one output, which - * also changes the number of inputs for backward. - */ -int GetNumOutputs(const PoolingParam &param); -int GetNumBackInputs(const PoolingParam &param); + virtual void Backward(const OpContext& ctx, + const std::vector& out_grad, + const std::vector& in_data, + const std::vector& out_data, + const std::vector& req, + const std::vector& in_grad, + const std::vector& aux_args) { + using namespace mshadow; + CHECK_EQ(out_grad.size(), 1U); + CHECK_EQ(in_data.size(), 1U); + CHECK_EQ(out_data.size(), 1U); + CHECK_EQ(req.size(), 1U); + CHECK_EQ(in_grad.size(), 1U); + Stream *s = ctx.get_stream(); + const TShape& ishape = in_data[pool_enum::kData].shape_; -template -void PoolingForward(const OpContext& ctx, const PoolingParam &param, - const TBlob& in_data, const OpReqType& req, - const TBlob& out_data) { - using namespace mshadow; - Stream *s = ctx.get_stream(); - const TShape& ishape = in_data.shape_; - - pool(s, in_data.dptr(), in_data.shape_, out_data.shape_, - param.global_pool? - TShape(ishape.data()+ishape.ndim()-param.kernel.ndim(), ishape.data()+ishape.ndim()) - : param.kernel, - param.pad, - param.global_pool? TShape(param.kernel.ndim()) : param.stride, - param.pool_type, req, out_data.dptr()); -} + unpool(s, out_grad[pool_enum::kOut].dptr(), + in_data[pool_enum::kData].dptr(), + out_data[pool_enum::kOut].dptr(), + in_grad[pool_enum::kData].shape_, + out_grad[pool_enum::kOut].shape_, + param_.global_pool? + TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim()) + : param_.kernel, + param_.pad, + param_.global_pool?
TShape(param_.kernel.ndim()) : param_.stride, + param_.pool_type, + req[pool_enum::kData], + in_grad[pool_enum::kData].dptr()); + } -template -void PoolingBackward(const OpContext& ctx, const PoolingParam ¶m, - const TBlob& out_grad, const TBlob& in_data, - const TBlob& out_data, const OpReqType& req, - const TBlob& in_grad) { - using namespace mshadow; - Stream *s = ctx.get_stream(); - const TShape& ishape = in_data.shape_; - - unpool(s, out_grad.dptr(), in_data.dptr(), out_data.dptr(), - in_grad.shape_, out_grad.shape_, - param.global_pool? - TShape(ishape.data()+ishape.ndim()-param.kernel.ndim(), ishape.data()+ishape.ndim()) - : param.kernel, - param.pad, - param.global_pool? TShape(param.kernel.ndim()) : param.stride, - param.pool_type, req, in_grad.dptr()); -} + private: + PoolingParam param_; +}; // class PoolingOp template -void PoolingCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const PoolingParam& param = nnvm::get(attrs.parsed); - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), GetNumOutputs(param)); - MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - if (pool_enum::kMaxPooling == param.pool_type - || pool_enum::kAvgPooling == param.pool_type - || pool_enum::kSumPooling == param.pool_type) { - PoolingForward(ctx, param, inputs[0], req[0], outputs[0]); +Operator* CreateOp(PoolingParam param, int dtype); + + +#if DMLC_USE_CXX11 +class PoolingProp : public OperatorProperty { + public: + void Init(const std::vector >& kwargs) override { + using namespace mshadow; + param_.Init(kwargs); + if (param_.kernel.ndim() == 1) { + if (param_.stride.ndim() == 0) param_.stride = Shape1(1); + if (param_.pad.ndim() == 0) param_.pad = Shape1(0); + } else if (param_.kernel.ndim() == 2) { + if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); } else { - LOG(FATAL) << "unknown pooling type"; + CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D pooling not supported"; + if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); } - }); -} + CHECK_EQ(param_.stride.ndim(), param_.kernel.ndim()) + << "stride and kernel should have the same length"; + CHECK_EQ(param_.pad.ndim(), param_.kernel.ndim()) + << "pad and kernel should have the same length"; + } -template -void PoolingGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const PoolingParam& param = nnvm::get(attrs.parsed); - CHECK_EQ(inputs.size(), GetNumBackInputs(param)); - CHECK_EQ(outputs.size(), 1U); - CHECK_EQ(req.size(), 1U); - off_t ograd_idx, in_data_idx, out_data_idx; - // When MKLDNN is enabled, the input data may contains arrays for workspace. 
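// Editorial comment (not in the original patch): the index selection below covers the two
// gradient-input layouts described elsewhere in this patch. When the extra MKLDNN workspace
// output exists, backward receives five arrays,
//   [out_grad, workspace_grad, in_data, out_data, workspace]  -> it reads indices 0, 2 and 3;
// otherwise it receives the usual three,
//   [out_grad, in_data, out_data]                              -> it reads indices 0, 1 and 2.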
- if (GetNumBackInputs(param) == 5) { - ograd_idx = 0; - in_data_idx = 2; - out_data_idx = 3; - } else { - ograd_idx = 0; - in_data_idx = 1; - out_data_idx = 2; + std::map GetParams() const override { + return param_.__DICT__(); } - MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - if (pool_enum::kMaxPooling == param.pool_type - || pool_enum::kAvgPooling == param.pool_type - || pool_enum::kSumPooling == param.pool_type) { - PoolingBackward(ctx, param, inputs[ograd_idx], - inputs[in_data_idx], inputs[out_data_idx], - req[0], outputs[0]); - } else { - LOG(FATAL) << "unknown pooling type"; + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + CHECK_EQ(in_shape->size(), 1U); + const TShape &dshape = (*in_shape)[0]; + CHECK_GE(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)" + << " Or 4D in (batch, channel, y, x) " + << " Or 5D in (batch, channel, d, y, x)"; + TShape oshape = dshape; + if (dshape.ndim() == 0) return false; + if (param_.kernel.ndim() == 1) { + CHECK_EQ(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)"; + if (param_.global_pool) { + oshape[2] = 1; + } else { + CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]) + << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2] + << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")"; + if (param_.pooling_convention == pool_enum::kValid) { + oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / + param_.stride[0]; + } else { + oshape[2] = 1 + static_cast(ceil(static_cast( + dshape[2] + 2 * param_.pad[0] - + param_.kernel[0]) / param_.stride[0])); + } + } + out_shape->clear(); + out_shape->push_back(oshape); // save output shape + } else if (param_.kernel.ndim() == 2) { + CHECK_EQ(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x)"; + if (param_.global_pool) { + oshape[2] = 1; + oshape[3] = 1; + } else { + CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]) + << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2] + << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")"; + CHECK(param_.kernel[1] <= dshape[3] + 2 * param_.pad[1]) + << "kernel size (" << param_.kernel[1] << ") exceeds input (" << dshape[3] + << " padded to " << (dshape[3] + 2*param_.pad[1]) << ")"; + if (param_.pooling_convention == pool_enum::kValid) { + oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / + param_.stride[0]; + oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / + param_.stride[1]; + } else { + oshape[2] = 1 + static_cast(ceil(static_cast( + dshape[2] + 2 * param_.pad[0] - + param_.kernel[0]) / param_.stride[0])); + oshape[3] = 1 + static_cast(ceil(static_cast( + dshape[3] + 2 * param_.pad[1] - + param_.kernel[1]) / param_.stride[1])); + } + } + out_shape->clear(); + out_shape->push_back(oshape); // save output shape + } else if (param_.kernel.ndim() == 3) { + CHECK_EQ(dshape.ndim(), 5U) + << "Pooling: Input data should be 5D in (batch, channel, d, y, x)"; + CHECK_LE(param_.kernel[0], dshape[2] + 2 * param_.pad[0]) << "kernel size exceeds input"; + CHECK_LE(param_.kernel[1], dshape[3] + 2 * param_.pad[1]) << "kernel size exceeds input"; + CHECK_LE(param_.kernel[2], dshape[4] + 2 * param_.pad[2]) << "kernel size exceeds input"; + if (param_.global_pool) { + oshape[2] = 1; + oshape[3] = 1; + oshape[4] = 1; + } else { + if (param_.pooling_convention == pool_enum::kValid) { + oshape[2] = 1 + (dshape[2] + 2 
* param_.pad[0] - param_.kernel[0]) / + param_.stride[0]; + oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / + param_.stride[1]; + oshape[4] = 1 + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) / + param_.stride[2]; + } else { + oshape[2] = 1 + static_cast(ceil(static_cast( + dshape[2] + 2 * param_.pad[0] - + param_.kernel[0]) / param_.stride[0])); + oshape[3] = 1 + static_cast(ceil(static_cast( + dshape[3] + 2 * param_.pad[1] - + param_.kernel[1]) / param_.stride[1])); + oshape[4] = 1 + static_cast(ceil(static_cast( + dshape[4] + 2 * param_.pad[2] - + param_.kernel[2]) / param_.stride[2])); + } + } + + out_shape->clear(); + out_shape->push_back(oshape); // save output shape + } + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_EQ(in_type->size(), 1U); + int dtype = (*in_type)[0]; + + if (dtype == -1) { + LOG(FATAL) << "Input type to pooling is not specified."; + return false; } - }); -} + out_type->clear(); + out_type->push_back(dtype); + return true; + } + + OperatorProperty* Copy() const override { + PoolingProp *prop_sym = new PoolingProp(); + prop_sym->param_ = this->param_; + return prop_sym; + } + + std::string TypeString() const override { + return "Pooling"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {out_grad[pool_enum::kOut], in_data[pool_enum::kData], + out_data[pool_enum::kOut]}; + } + + std::vector > BackwardInplaceOption( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &in_grad) const override { +#if MXNET_USE_CUDNN == 1 + return {}; +#else + return {{in_data[pool_enum::kData], in_grad[pool_enum::kData]}}; +#endif + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented."; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; + + private: + PoolingParam param_; +}; // class PoolingProp +#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index f719e0753e08..8345ea3886d4 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -21,300 +21,78 @@ * Copyright (c) 2017 by Contributors * \file pooling.cc * \brief - * \author Bing Xu, Jun Wu, Da Zheng + * \author Bing Xu, Jun Wu */ -#include "../elemwise_op_common.h" #include "./pooling-inl.h" +#if MXNET_USE_MKL2017 == 1 +#include +#include "../mkl/mkl_memory-inl.h" +#include "../mkl/mkl_pooling-inl.h" +#endif // MXNET_USE_MKL2017 #if MXNET_USE_NNPACK == 1 #include "./nnpack/nnpack_pooling-inl.h" #endif // MXNET_USE_NNPACK -#if MXNET_USE_MKLDNN == 1 -#include "./mkldnn/mkldnn_pooling-inl.h" -#endif // MXNET_USE_MKLDNN namespace mxnet { namespace op { -static void PoolingParamParser(nnvm::NodeAttrs *attrs) { - using namespace mshadow; - PoolingParam param; - param.Init(attrs->dict); - if (param.kernel.ndim() == 1) { - if (param.stride.ndim() == 0) param.stride = Shape1(1); - if (param.pad.ndim() == 0) param.pad = Shape1(0); - } else if (param.kernel.ndim() == 2) { - if (param.stride.ndim() == 0) param.stride = Shape2(1, 1); - if (param.pad.ndim() == 0) param.pad = Shape2(0, 0); - } else { - CHECK_EQ(param.kernel.ndim(), 3U) << param.kernel.ndim() - << "D pooling not supported"; - if (param.stride.ndim() == 0) param.stride = Shape3(1, 1, 
1); - if (param.pad.ndim() == 0) param.pad = Shape3(0, 0, 0); - } - CHECK_EQ(param.stride.ndim(), param.kernel.ndim()) - << "stride and kernel should have the same length"; - CHECK_EQ(param.pad.ndim(), param.kernel.ndim()) - << "pad and kernel should have the same length"; - attrs->parsed = std::move(param); -} - -int GetNumOutputs(const PoolingParam ¶m) { -#if MXNET_USE_MKLDNN == 1 - return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 2 : 1; -#else - return 1; -#endif -} - -int GetNumBackInputs(const PoolingParam ¶m) { -#if MXNET_USE_MKLDNN == 1 - return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 5 : 3; -#else - return 3; -#endif -} - -static bool PoolingType(const nnvm::NodeAttrs& attrs, - std::vector *in_attrs, - std::vector *out_attrs) { - out_attrs->at(0) = in_attrs->at(0); -#if MXNET_USE_MKLDNN == 1 - const PoolingParam ¶m = nnvm::get(attrs.parsed); - if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) { - CHECK_GT(out_attrs->size(), 1U); - out_attrs->at(1) = mshadow::kInt32; - } -#endif - return true; -} - -static bool PoolingShape(const nnvm::NodeAttrs &attrs, - std::vector *in_shape, - std::vector *out_shape) { - const PoolingParam ¶m = nnvm::get(attrs.parsed); - CHECK_EQ(in_shape->size(), 1U); - const TShape &dshape = (*in_shape)[0]; - CHECK_GE(dshape.ndim(), 3U) - << "Pooling: Input data should be 3D in (batch, channel, x)" - << " Or 4D in (batch, channel, y, x) " - << " Or 5D in (batch, channel, d, y, x)"; - TShape oshape = dshape; - if (dshape.ndim() == 0) return false; - if (param.kernel.ndim() == 1) { - CHECK_EQ(dshape.ndim(), 3U) - << "Pooling: Input data should be 3D in (batch, channel, x)"; - if (param.global_pool) { - oshape[2] = 1; - } else { - CHECK(param.kernel[0] <= dshape[2] + 2 * param.pad[0]) - << "kernel size (" << param.kernel[0] << ") exceeds input (" - << dshape[2] << " padded to " << (dshape[2] + 2 * param.pad[0]) - << ")"; - if (param.pooling_convention == pool_enum::kValid) { - oshape[2] = 1 + - (dshape[2] + 2 * param.pad[0] - param.kernel[0]) / - param.stride[0]; - } else { - oshape[2] = 1 + static_cast(ceil( - static_cast(dshape[2] + 2 * param.pad[0] - - param.kernel[0]) / - param.stride[0])); +template<> +Operator *CreateOp(PoolingParam param, int dtype) { + Operator *op = NULL; +#if MXNET_USE_MKL2017 == 1 + if (param.kernel.ndim() == 2 + && ((param.pool_type == pool_enum::kMaxPooling) + || (param.pool_type == pool_enum::kAvgPooling))) { + switch (dtype) { + case mshadow::kFloat32: + return new MKLPoolingOp(param); + case mshadow::kFloat64: + return new MKLPoolingOp(param); + default: + break; } } - out_shape->clear(); - out_shape->push_back(oshape); // save output shape -#if MXNET_USE_MKLDNN == 1 - if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) - out_shape->push_back(oshape); // for workspace #endif - } else if (param.kernel.ndim() == 2) { - CHECK_EQ(dshape.ndim(), 4U) - << "Pooling: Input data should be 4D in (batch, channel, y, x)"; - if (param.global_pool) { - oshape[2] = 1; - oshape[3] = 1; - } else { - CHECK(param.kernel[0] <= dshape[2] + 2 * param.pad[0]) - << "kernel size (" << param.kernel[0] << ") exceeds input (" - << dshape[2] << " padded to " << (dshape[2] + 2 * param.pad[0]) - << ")"; - CHECK(param.kernel[1] <= dshape[3] + 2 * param.pad[1]) - << "kernel size (" << param.kernel[1] << ") exceeds input (" - << dshape[3] << " padded to " << (dshape[3] + 2 * param.pad[1]) - << ")"; - if (param.pooling_convention == pool_enum::kValid) { - oshape[2] = 1 + - (dshape[2] + 2 * 
param.pad[0] - param.kernel[0]) / - param.stride[0]; - oshape[3] = 1 + - (dshape[3] + 2 * param.pad[1] - param.kernel[1]) / - param.stride[1]; - } else { - oshape[2] = 1 + static_cast(ceil( - static_cast(dshape[2] + 2 * param.pad[0] - - param.kernel[0]) / - param.stride[0])); - oshape[3] = 1 + static_cast(ceil( - static_cast(dshape[3] + 2 * param.pad[1] - - param.kernel[1]) / - param.stride[1])); - } +#if MXNET_USE_NNPACK == 1 + // NNPACK only support max-pooling with kernel = 2, stride = 2, pooling_convention + // = kFull(note that the default value is kValid in MXNet) + if ((param.pool_type == pool_enum::kMaxPooling) + && (param.pooling_convention == pool_enum::kFull) + && (param.kernel.ndim() == 2) && (param.stride.ndim() == 2) + && (param.kernel[0] == 2) && (param.kernel[1] == 2) + && (param.stride[0] == 2) && (param.stride[1] == 2)) { + switch (dtype) { + case mshadow::kFloat32: + return new NNPACKPoolingOp(param); + default: + break; } - out_shape->clear(); - out_shape->push_back(oshape); // save output shape -#if MXNET_USE_MKLDNN == 1 - if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) - out_shape->push_back(oshape); // for workspace + } #endif - } else if (param.kernel.ndim() == 3) { - CHECK_EQ(dshape.ndim(), 5U) - << "Pooling: Input data should be 5D in (batch, channel, d, y, x)"; - CHECK_LE(param.kernel[0], dshape[2] + 2 * param.pad[0]) - << "kernel size exceeds input"; - CHECK_LE(param.kernel[1], dshape[3] + 2 * param.pad[1]) - << "kernel size exceeds input"; - CHECK_LE(param.kernel[2], dshape[4] + 2 * param.pad[2]) - << "kernel size exceeds input"; - if (param.global_pool) { - oshape[2] = 1; - oshape[3] = 1; - oshape[4] = 1; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + if (pool_enum::kMaxPooling == param.pool_type + || pool_enum::kAvgPooling == param.pool_type + || pool_enum::kSumPooling == param.pool_type) { + op = new PoolingOp(param); } else { - if (param.pooling_convention == pool_enum::kValid) { - oshape[2] = 1 + - (dshape[2] + 2 * param.pad[0] - param.kernel[0]) / - param.stride[0]; - oshape[3] = 1 + - (dshape[3] + 2 * param.pad[1] - param.kernel[1]) / - param.stride[1]; - oshape[4] = 1 + - (dshape[4] + 2 * param.pad[2] - param.kernel[2]) / - param.stride[2]; - } else { - oshape[2] = 1 + static_cast(ceil( - static_cast(dshape[2] + 2 * param.pad[0] - - param.kernel[0]) / - param.stride[0])); - oshape[3] = 1 + static_cast(ceil( - static_cast(dshape[3] + 2 * param.pad[1] - - param.kernel[1]) / - param.stride[1])); - oshape[4] = 1 + static_cast(ceil( - static_cast(dshape[4] + 2 * param.pad[2] - - param.kernel[2]) / - param.stride[2])); - } + LOG(FATAL) << "unknown pooling type"; + return NULL; } + }); - out_shape->clear(); - out_shape->push_back(oshape); // save output shape -#if MXNET_USE_MKLDNN == 1 - if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) - out_shape->push_back(oshape); // for workspace -#endif - } - return true; -} - -#if MXNET_USE_MKLDNN == 1 -void PoolingComputeExCPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - const PoolingParam ¶m = nnvm::get(attrs.parsed); - const NDArray *workspace = nullptr; - if (MKLDNNRequireWorkspace(param)) { - CHECK_GT(outputs.size(), 1U); - workspace = &outputs[1]; - } - if (SupportMKLDNN(inputs[0]) - && SupportMKLDNNPooling(param, inputs[0].shape())) { - MKLDNN_OPCHECK_INIT(false, 1, inputs, outputs); - MKLDNNPoolingCompute(ctx, param, inputs[0], req[0], outputs[0], workspace); - 
MKLDNN_OPCHECK_RUN(PoolingCompute, attrs, ctx, inputs, req, outputs); - return; - } - FallBackCompute(PoolingCompute, attrs, ctx, inputs, req, outputs); -} - -void PoolingGradComputeExCPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - const PoolingParam ¶m = nnvm::get(attrs.parsed); - const NDArray &out_grad = inputs[0]; - const NDArray *workspace = nullptr; - const NDArray *in_data = nullptr; - if (MKLDNNRequireWorkspace(param)) { - // The first two elements are the gradient of the outputs in forward. - // The third is the input of forward. - // The fourth and the fifth are the outputs of forward. - CHECK_EQ(inputs.size(), 5U); - in_data = &inputs[2]; - workspace = &inputs[4]; - } else { - CHECK_EQ(inputs.size(), 3U); - in_data = &inputs[1]; - } - const NDArray &in_grad = outputs[0]; - if (SupportMKLDNN(inputs[0]) - && SupportMKLDNNPooling(param, inputs[0].shape())) { - MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); - MKLDNNPoolingGradCompute(ctx, param, out_grad, *in_data, workspace, - req[0], in_grad); - MKLDNN_OPCHECK_RUN(PoolingGradCompute, attrs, ctx, inputs, req, - outputs); - return; - } - FallBackCompute(PoolingGradCompute, attrs, ctx, inputs, req, outputs); -} -#endif - -inline static bool PoolingStorageType(const nnvm::NodeAttrs &attrs, - const int dev_mask, - DispatchMode *dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK_EQ(in_attrs->size(), 1); - -#if MXNET_USE_MKLDNN == 1 - const PoolingParam ¶m = nnvm::get(attrs.parsed); - if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) { - return storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, DispatchMode::kFComputeEx); - } -#else - CHECK_EQ(out_attrs->size(), 1); -#endif - return storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, DispatchMode::kFCompute); + return op; } -inline static bool BackwardPoolingStorageType(const nnvm::NodeAttrs &attrs, - const int dev_mask, - DispatchMode *dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - const PoolingParam ¶m = nnvm::get(attrs.parsed); - CHECK_EQ(in_attrs->size(), GetNumBackInputs(param)); - CHECK_EQ(out_attrs->size(), 1); - -#if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) { - return storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, DispatchMode::kFComputeEx); - } -#else - CHECK_EQ(in_attrs->size(), 3); -#endif - return storage_type_assign(out_attrs, mxnet::kDefaultStorage, - dispatch_mode, DispatchMode::kFCompute); +// DO_BIND_DISPATCH comes from operator_common.h +Operator* PoolingProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); } DMLC_REGISTER_PARAMETER(PoolingParam); -NNVM_REGISTER_OP(Pooling) - .describe(R"code(Performs pooling on the input. +MXNET_REGISTER_OP_PROPERTY(Pooling, PoolingProp) +.describe(R"code(Performs pooling on the input. The shapes for 1-D pooling are @@ -353,61 +131,8 @@ For 3-D pooling, an additional *depth* dimension is added before height, width)*. 
)code" ADD_FILELINE) -.set_num_inputs(1) -.set_num_outputs([](const NodeAttrs& attrs) { - const PoolingParam ¶m = nnvm::get(attrs.parsed); - return GetNumOutputs(param); -}) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FNumVisibleOutputs", - [](const NodeAttrs& attrs) { return 1; }) -#endif -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - return std::vector{"data"}; -}) -.set_attr("FListOutputNames", - [](const NodeAttrs& attrs) { - return std::vector{"output"}; -}) -.set_attr_parser(PoolingParamParser) -.set_attr("FInferStorageType", PoolingStorageType) -.set_attr("FInferType", PoolingType) -.set_attr("FInferShape", PoolingShape) -.set_attr("FCompute", PoolingCompute) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FComputeEx", PoolingComputeExCPU) -#endif -.set_attr("FGradient", - ElemwiseGradUseInOut{"_backward_Pooling"}) -.add_argument("data", "NDArray-or-Symbol", - "Input data to the pooling operator.") +.add_argument("data", "NDArray-or-Symbol", "Input data to the pooling operator.") .add_arguments(PoolingParam::__FIELDS__()); -NNVM_REGISTER_OP(_backward_Pooling) -.set_num_outputs(1) -.set_attr("TIsBackward", true) -.set_attr( - "FInplaceOption", - [](const NodeAttrs &attrs) { -#if MXNET_USE_CUDNN == 1 - return std::vector >(); -#else - return std::vector >{{1, 0}}; -#endif -}) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; -}) -#endif -.set_attr("FInferStorageType", - BackwardPoolingStorageType) -.set_attr_parser(PoolingParamParser) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FComputeEx", PoolingGradComputeExCPU) -#endif -.set_attr("FCompute", PoolingGradCompute); - } // namespace op } // namespace mxnet diff --git a/src/operator/nn/pooling.cu b/src/operator/nn/pooling.cu index c3bcecfc77b7..dcebe6798263 100644 --- a/src/operator/nn/pooling.cu +++ b/src/operator/nn/pooling.cu @@ -21,7 +21,7 @@ * Copyright (c) 2017 by Contributors * \file pooling.cu * \brief - * \author Bing Xu, Jun Wu, Da Zheng + * \author Bing Xu, Jun Wu */ #include #include "./pooling-inl.h" @@ -32,112 +32,38 @@ namespace mxnet { namespace op { -#if MXNET_USE_CUDNN == 1 -template -static CuDNNPoolingOp &GetCuDNNPoolingOp(const PoolingParam ¶m) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local CuDNNPoolingOp op; -#else - static MX_THREAD_LOCAL CuDNNPoolingOp op; -#endif - op.Init(param); - return op; -} -#endif - template<> -void PoolingCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const PoolingParam& param = nnvm::get(attrs.parsed); - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), GetNumOutputs(param)); - +Operator *CreateOp(PoolingParam param, int dtype) { + Operator *op = NULL; #if MXNET_USE_CUDNN == 1 if (!param.cudnn_off && param.kernel.ndim() > 1) { - MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { switch (param.pool_type) { case pool_enum::kMaxPooling: - case pool_enum::kAvgPooling: - GetCuDNNPoolingOp(param).Forward(ctx, inputs[0], req[0], outputs[0]); - return; - case pool_enum::kSumPooling: - LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied."; + op = new CuDNNPoolingOp(param); break; - } - }); - } -#endif // MXNET_USE_CUDNN - - MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - if (pool_enum::kMaxPooling == param.pool_type - || pool_enum::kAvgPooling == param.pool_type - || pool_enum::kSumPooling == param.pool_type) { 
- PoolingForward(ctx, param, inputs[0], req[0], outputs[0]); - } else { - LOG(FATAL) << "unknown pooling type"; - } - }); -} - -template<> -void PoolingGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const PoolingParam& param = nnvm::get(attrs.parsed); - CHECK_EQ(inputs.size(), GetNumBackInputs(param)); - CHECK_EQ(outputs.size(), 1U); - CHECK_EQ(req.size(), 1U); - off_t ograd_idx, in_data_idx, out_data_idx; - // When MKLDNN is enabled, the input data may contains arrays for workspace. - if (GetNumBackInputs(param) == 5) { - ograd_idx = 0; - in_data_idx = 2; - out_data_idx = 3; - } else { - ograd_idx = 0; - in_data_idx = 1; - out_data_idx = 2; - } - -#if MXNET_USE_CUDNN == 1 - if (!param.cudnn_off && param.kernel.ndim() > 1) { - MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { - switch (param.pool_type) { - case pool_enum::kMaxPooling: case pool_enum::kAvgPooling: - GetCuDNNPoolingOp(param).Backward(ctx, inputs[ograd_idx], - inputs[in_data_idx], inputs[out_data_idx], req[0], outputs[0]); - return; + op = new CuDNNPoolingOp(param); + break; case pool_enum::kSumPooling: LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied."; break; } }); } + if (op) return op; #endif // MXNET_USE_CUDNN - - MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { if (pool_enum::kMaxPooling == param.pool_type || pool_enum::kAvgPooling == param.pool_type || pool_enum::kSumPooling == param.pool_type) { - PoolingBackward(ctx, param, inputs[ograd_idx], - inputs[in_data_idx], inputs[out_data_idx], req[0], outputs[0]); + op = new PoolingOp(param); } else { LOG(FATAL) << "unknown pooling type"; } }); + return op; } -NNVM_REGISTER_OP(Pooling) -.set_attr("FCompute", PoolingCompute); - -NNVM_REGISTER_OP(_backward_Pooling) -.set_attr("FCompute", PoolingGradCompute); - } // namespace op } // namespace mxnet diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc index 0f559475d1c2..4686fb8c0dc1 100644 --- a/src/operator/nn/softmax.cc +++ b/src/operator/nn/softmax.cc @@ -25,54 +25,11 @@ #include "./softmax-inl.h" #include "../tensor/elemwise_unary_op.h" #include "../tensor/elemwise_binary_op.h" -#include "mkldnn/mkldnn_base-inl.h" -#include "mkldnn/mkldnn_ops-inl.h" namespace mxnet { namespace op { DMLC_REGISTER_PARAMETER(SoftmaxParam); -#if MXNET_USE_MKLDNN == 1 -static void SoftmaxComputeExCPU(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const SoftmaxParam& param = nnvm::get(attrs.parsed); - // It seems MKLDNN softmax doesn't support training. - // and it only supports non-negative axis. - if (SupportMKLDNN(inputs[0]) && !ctx.is_train && param.axis >= 0) { - MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); - MKLDNNSoftmaxForward(attrs, ctx, inputs[0], req[0], outputs[0]); - auto fn = SoftmaxCompute; - MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs); - return; - } - FallBackCompute(SoftmaxCompute, attrs, ctx, - inputs, req, outputs); -} -#endif - -inline static bool SoftmaxStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK_EQ(in_attrs->size(), 1); - CHECK_EQ(out_attrs->size(), 1); - - DispatchMode wanted_mode; -#if MXNET_USE_MKLDNN == 1 - // We only run MKLDNN op if it runs on CPU. 
- if (dev_mask == mshadow::cpu::kDevMask) - wanted_mode = DispatchMode::kFComputeEx; - else -#endif - wanted_mode = DispatchMode::kFCompute; - return storage_type_assign(out_attrs, static_cast((*in_attrs)[0]), - dispatch_mode, wanted_mode); -} - MXNET_OPERATOR_REGISTER_UNARY(softmax) .describe(R"code(Applies the softmax function. @@ -97,10 +54,6 @@ Example:: )code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", SoftmaxCompute) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FComputeEx", SoftmaxComputeExCPU) -#endif -.set_attr("FInferStorageType", SoftmaxStorageType) .set_attr("FGradient", ElemwiseGradUseOut{"_backward_softmax"}) .add_arguments(SoftmaxParam::__FIELDS__()); diff --git a/src/operator/nn/softmax_activation-inl.h b/src/operator/nn/softmax_activation-inl.h index b1d542e4068c..500bf51ccd1f 100644 --- a/src/operator/nn/softmax_activation-inl.h +++ b/src/operator/nn/softmax_activation-inl.h @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file softmax_activation-inl.h * \brief SoftmaxActivation operator - * \author Junyuan Xie, Da Zheng + * \author Junyuan Xie */ #ifndef MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_ #define MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_ @@ -61,74 +61,153 @@ struct SoftmaxActivationParam : public dmlc::Parameter { } }; +/** + * \brief This is the implementation of softmax_activation operator. + * \tparam xpu The device that the op will be executed on. + */ template -void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& reqs, - const std::vector& outputs) { - using namespace mshadow; - using namespace mshadow::expr; - const SoftmaxActivationParam& param = nnvm::get(attrs.parsed); - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); - const TBlob &in_data = inputs[softmax_activation::kData]; - const OpReqType &req = reqs[softmax_activation::kOut]; - const TBlob &out_data = outputs[softmax_activation::kOut]; - Stream *s = ctx.get_stream(); - if (param.mode == softmax_activation::kInstance) { - Tensor data = in_data.FlatTo2D(s); - Tensor out = out_data.FlatTo2D(s); - Softmax(out, data); - } else { - CHECK_GE(in_data.ndim(), 3) +class SoftmaxActivationOp : public Operator { + public: + explicit SoftmaxActivationOp(SoftmaxActivationParam p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), 1U); + CHECK_EQ(out_data.size(), 1U); + Stream *s = ctx.get_stream(); + if (param_.mode == softmax_activation::kInstance) { + Tensor data = in_data[softmax_activation::kData].FlatTo2D(s); + Tensor out = out_data[softmax_activation::kOut].FlatTo2D(s); + Softmax(out, data); + } else { + CHECK_GE(in_data[softmax_activation::kData].ndim(), 3) << "Input need to have a least 3 dimensions when mode=channel"; - int n = in_data.size(0); - int k = in_data.size(1); - Shape<3> s3 = Shape3(n, k, static_cast(in_data.Size()/n/k)); - Tensor data = in_data.get_with_shape(s3, s); - Tensor out = out_data.get_with_shape(s3, s); - Softmax(out, data); + int n = in_data[softmax_activation::kData].size(0); + int k = in_data[softmax_activation::kData].size(1); + Shape<3> s3 = Shape3(n, k, static_cast(in_data[softmax_activation::kData].Size()/n/k)); + Tensor data = + in_data[softmax_activation::kData].get_with_shape(s3, s); + Tensor out = + 
out_data[softmax_activation::kOut].get_with_shape(s3, s); + Softmax(out, data); + } } -} + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1U); + CHECK(in_data.size() == 1 && in_grad.size() == 1); + CHECK_EQ(req.size(), 1U); + // Use 3d tensor for both mode -> {instance, channel}. Get shapes + int total_size = in_grad[softmax_activation::kData].Size(); + int batch_size = in_grad[softmax_activation::kData].shape_[0]; + int channel_num = in_grad[softmax_activation::kData].shape_[1]; + int rest_size = total_size / (batch_size * channel_num); + const Shape<3> data_shape = Shape3(batch_size, channel_num, rest_size); + // Get tensors + Stream *s = ctx.get_stream(); + Tensor m_out_grad = + out_grad[softmax_activation::kOut].get_with_shape(data_shape, s); + Tensor m_out_data = + out_data[softmax_activation::kOut].get_with_shape(data_shape, s); + Tensor m_in_grad = + in_grad[softmax_activation::kData].get_with_shape(data_shape, s); + // get requested temp space + Tensor workspace = ctx.requested[softmax_activation::kTempSpace].get_space( + Shape2(batch_size, rest_size), s); + workspace = reduce_with_axis(m_out_grad * m_out_data, 1); + Assign(m_in_grad, req[softmax_activation::kData], + m_out_data * (m_out_grad - broadcast_with_axis(workspace, 0, channel_num))); + } + + private: + SoftmaxActivationParam param_; +}; // class SoftmaxActivationOp + +// Decalre Factory function, used for dispatch specialization template -void SoftmaxActivationGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& reqs, - const std::vector& outputs) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(inputs.size(), 2U); - CHECK_EQ(outputs.size(), 1); - CHECK_EQ(reqs.size(), 1); - const TBlob &out_grad = inputs[0]; - const TBlob &out_data = inputs[1]; - const OpReqType &req = reqs[0]; - const TBlob &in_grad = outputs[0]; - // Use 3d tensor for both mode -> {instance, channel}. 
Get shapes - int total_size = in_grad.Size(); - int batch_size = in_grad.shape_[0]; - int channel_num = in_grad.shape_[1]; - int rest_size = total_size / (batch_size * channel_num); - const Shape<3> data_shape = Shape3(batch_size, channel_num, rest_size); - // Get tensors - Stream *s = ctx.get_stream(); - Tensor m_out_grad = - out_grad.get_with_shape(data_shape, s); - Tensor m_out_data = - out_data.get_with_shape(data_shape, s); - Tensor m_in_grad = - in_grad.get_with_shape(data_shape, s); - // get requested temp space - Tensor workspace = ctx.requested[softmax_activation::kTempSpace].get_space( - Shape2(batch_size, rest_size), s); - workspace = reduce_with_axis(m_out_grad * m_out_data, 1); - Assign(m_in_grad, req, - m_out_data * (m_out_grad - broadcast_with_axis(workspace, 0, channel_num))); -} +Operator* CreateOp(SoftmaxActivationParam type); + +#if DMLC_USE_CXX11 +class SoftmaxActivationProp : public OperatorProperty { + public: + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 1U) << "Input:[data]"; + const TShape &dshape = in_shape->at(softmax_activation::kData); + if (dshape.ndim() == 0) return false; + out_shape->clear(); + out_shape->push_back(dshape); + return true; + } + + OperatorProperty* Copy() const override { + auto ptr = new SoftmaxActivationProp(); + ptr->param_ = param_; + return ptr; + } + + std::string TypeString() const override { + return "SoftmaxActivation"; + } + + // decalre dependency and inplace optimization options + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + return {out_grad[softmax_activation::kOut], out_data[softmax_activation::kOut]}; + } + + std::vector BackwardResource( + const std::vector &in_shape) const override { + return {ResourceRequest::kTempSpace}; + } + std::vector > BackwardInplaceOption( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &in_grad) const override { + return {{out_grad[softmax_activation::kOut], in_grad[softmax_activation::kData]}}; + } + + std::vector > ForwardInplaceOption( + const std::vector &in_data, + const std::vector &out_data) const override { + return {{in_data[softmax_activation::kData], out_data[softmax_activation::kOut]}}; + } + + Operator* CreateOperator(Context ctx) const override; + + private: + SoftmaxActivationParam param_; +}; +#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_ diff --git a/src/operator/nn/softmax_activation.cc b/src/operator/nn/softmax_activation.cc index bdfd8b065de1..657b382c6e03 100644 --- a/src/operator/nn/softmax_activation.cc +++ b/src/operator/nn/softmax_activation.cc @@ -21,18 +21,26 @@ * Copyright (c) 2015 by Contributors * \file activation.cc * \brief softmax_activation op - * \author Junyuan Xie, Da Zheng + * \author Junyuan Xie */ #include "./softmax_activation-inl.h" -#include "../tensor/elemwise_unary_op.h" #include "../mshadow_op.h" namespace mxnet { namespace op { +template<> +Operator *CreateOp(SoftmaxActivationParam param) { + return new SoftmaxActivationOp(param); +} + +// DO_BIND_DISPATCH comes from operator_common.h +Operator *SoftmaxActivationProp::CreateOperator(Context ctx) 
const { + DO_BIND_DISPATCH(CreateOp, param_); +} DMLC_REGISTER_PARAMETER(SoftmaxActivationParam); -MXNET_OPERATOR_REGISTER_UNARY(SoftmaxActivation) +MXNET_REGISTER_OP_PROPERTY(SoftmaxActivation, SoftmaxActivationProp) .describe(R"code(Applies softmax activation to input. This is intended for internal layers. .. note:: @@ -57,22 +65,8 @@ Example:: [ 6.56221947e-03 5.95310994e-04 9.73919690e-01 1.78379621e-02 1.08472735e-03]] )code" ADD_FILELINE) -.set_attr_parser(ParamParser) -.set_attr("FCompute", SoftmaxActivationCompute) -.set_attr("FGradient", ElemwiseGradUseOut{"_backward_SoftmaxActivation"}) +.add_argument("data", "NDArray-or-Symbol", "Input array to activation function.") .add_arguments(SoftmaxActivationParam::__FIELDS__()); -NNVM_REGISTER_OP(_backward_SoftmaxActivation) -.set_num_outputs(1) -.set_attr("TIsBackward", true) -.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ - return std::vector >{{0, 0}}; -}) -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; -}) -.set_attr_parser(ParamParser) -.set_attr("FCompute", SoftmaxActivationGradCompute); - } // namespace op } // namespace mxnet diff --git a/src/operator/nn/softmax_activation.cu b/src/operator/nn/softmax_activation.cu index f3997e00052e..0810483e1262 100644 --- a/src/operator/nn/softmax_activation.cu +++ b/src/operator/nn/softmax_activation.cu @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file softmax_activation.cu * \brief - * \author Junyuan Xie, Da Zheng + * \author Junyuan Xie */ #include "./softmax_activation-inl.h" #include "../mshadow_op.h" @@ -31,51 +31,14 @@ namespace mxnet { namespace op { - +template<> +Operator *CreateOp(SoftmaxActivationParam param) { #if MXNET_USE_CUDNN == 1 - -static inline CuDNNSoftmaxActivationOp &GetCuDNNSoftmaxActOp(const SoftmaxActivationParam& param) { -#if DMLC_CXX11_THREAD_LOCAL - static thread_local CuDNNSoftmaxActivationOp op; + return new CuDNNSoftmaxActivationOp(param); #else - static MX_THREAD_LOCAL CuDNNSoftmaxActivationOp op; -#endif - op.Init(param); - return op; -} - -template<> -void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const SoftmaxActivationParam& param = nnvm::get(attrs.parsed); - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); - GetCuDNNSoftmaxActOp(param).Forward(ctx, inputs[0], req[0], outputs[0]); + return new SoftmaxActivationOp(param); +#endif // MXNET_USE_CUDNN } - -template<> -void SoftmaxActivationGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const SoftmaxActivationParam& param = nnvm::get(attrs.parsed); - CHECK_EQ(inputs.size(), 2U); - CHECK_EQ(outputs.size(), 1); - CHECK_EQ(req.size(), 1); - GetCuDNNSoftmaxActOp(param).Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]); -} -#endif - -NNVM_REGISTER_OP(SoftmaxActivation) -.set_attr("FCompute", SoftmaxActivationCompute); - -NNVM_REGISTER_OP(_backward_SoftmaxActivation) -.set_attr("FCompute", SoftmaxActivationGradCompute); - } // namespace op } // namespace mxnet diff --git a/src/operator/nn/upsampling-inl.h b/src/operator/nn/upsampling-inl.h index 4b9159edd174..f660609ace28 100644 --- a/src/operator/nn/upsampling-inl.h +++ b/src/operator/nn/upsampling-inl.h @@ -35,7 +35,6 @@ #include #include #include "../operator_common.h" -#include "./deconvolution-inl.h" namespace mxnet { namespace op { 
@@ -83,147 +82,253 @@ struct UpSamplingParam : public dmlc::Parameter { }; // struct UpSamplingParam template -void UpSamplingForward(const OpContext &ctx, const UpSamplingParam ¶m, +class UpSamplingNearestOp : public Operator { + public: + explicit UpSamplingNearestOp(UpSamplingParam p) { + this->param_ = p; + } + + virtual void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, - const std::vector &out_data) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), static_cast(param.num_args)); - CHECK_EQ(out_data.size(), 1U); - if (req[up_enum::kOut] == kNullOp) { - return; - } - Stream *s = ctx.get_stream(); - Tensor out = out_data[up_enum::kOut].get(s); - if (param.num_args > 1) { - int begin = 0; - for (int i = 0; i < param.num_args; ++i) { - Tensor data = in_data[i].get(s); - int end = begin + data.size(1); - int scale = out_data[up_enum::kOut].size(2)/in_data[i].size(2); - if (param.multi_input_mode == up_enum::kSum) { - if (i == 0) { - Assign(out, req[up_enum::kOut], upsampling_nearest(data, scale)); + const std::vector &out_data, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), static_cast(param_.num_args)); + CHECK_EQ(out_data.size(), 1U); + if (req[up_enum::kOut] == kNullOp) { + return; + } + Stream *s = ctx.get_stream(); + Tensor out = out_data[up_enum::kOut].get(s); + if (param_.num_args > 1) { + int begin = 0; + for (int i = 0; i < param_.num_args; ++i) { + Tensor data = in_data[i].get(s); + int end = begin + data.size(1); + int scale = out_data[up_enum::kOut].size(2)/in_data[i].size(2); + if (param_.multi_input_mode == up_enum::kSum) { + if (i == 0) { + Assign(out, req[up_enum::kOut], upsampling_nearest(data, scale)); + } else { + out += upsampling_nearest(data, scale); + } } else { - out += upsampling_nearest(data, scale); + Assign(slice<1>(out, begin, end), req[up_enum::kOut], upsampling_nearest(data, scale)); } - } else { - Assign(slice<1>(out, begin, end), req[up_enum::kOut], upsampling_nearest(data, scale)); + begin = end; } - begin = end; + } else { + Tensor data = in_data[up_enum::kData].get(s); + Assign(out, req[up_enum::kOut], upsampling_nearest(data, param_.scale)); } - } else { - Tensor data = in_data[up_enum::kData].get(s); - Assign(out, req[up_enum::kOut], upsampling_nearest(data, param.scale)); } -} -template -void UpSamplingBackward(const OpContext &ctx, const UpSamplingParam ¶m, - const TBlob &out_grad, const std::vector &req, - const std::vector &in_grad) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_grad.size(), static_cast(param.num_args)); - Stream *s = ctx.get_stream(); - Tensor grad = out_grad.get(s); - if (param.num_args > 1) { - int begin = 0; - for (int i = 0; i < param.num_args; ++i) { - Tensor input_grad = in_grad[i].get(s); + virtual void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_args) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(out_grad.size(), 1U); + CHECK_EQ(in_grad.size(), static_cast(param_.num_args)); + Stream *s = ctx.get_stream(); + Tensor grad = out_grad[up_enum::kOut].get(s); + if (param_.num_args > 1) { + int begin = 0; + for (int i = 0; i < param_.num_args; ++i) { + Tensor input_grad = in_grad[i].get(s); + mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]); + 
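// Editorial note: conceptually, the backward pass of nearest-neighbour upsampling folds each
// scale x scale block of the output gradient back onto a single input cell, which is why the
// code below expresses it as a pooling over the gradient with window and stride both equal
// to scale.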
int end = begin + input_grad.size(1); + int scale = grad.size(2)/in_shape[0]; + if (param_.multi_input_mode == up_enum::kSum) { + Assign(input_grad, req[i], + pool(grad, + in_shape, + scale, + scale, + scale, + scale)); + } else { + Assign(input_grad, req[i], + pool(slice<1>(grad, begin, end), + in_shape, + scale, + scale, + scale, + scale)); + } + begin = end; + } + } else { + Tensor input_grad = in_grad[up_enum::kData].get(s); mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]); - int end = begin + input_grad.size(1); - int scale = grad.size(2)/in_shape[0]; - if (param.multi_input_mode == up_enum::kSum) { - Assign(input_grad, req[i], - pool(grad, - in_shape, - scale, - scale, - scale, - scale)); + Assign(input_grad, req[up_enum::kData], + pool(grad, + in_shape, + param_.scale, + param_.scale, + param_.scale, + param_.scale)); + } + } + + private: + UpSamplingParam param_; +}; // class UpSamplingNearestOp + +template +Operator *CreateOp(UpSamplingParam param, int dtype); + + +#if DMLC_USE_CXX11 +class UpSamplingProp : public OperatorProperty { + public: + void Init(const std::vector >& kwargs) override { + param_.Init(kwargs); + } + + std::map GetParams() const override { + return param_.__DICT__(); + } + + std::vector ListArguments() const override { + if (param_.sample_type == up_enum::kNearest) { + std::vector ret; + for (int i = 0; i < param_.num_args; ++i) { + ret.push_back(std::string("arg") + std::to_string(i)); + } + return ret; + } else { + return {"data", "weight"}; + } + } + + bool InferShape(std::vector *in_shape, + std::vector *out_shape, + std::vector *aux_shape) const override { + CHECK_GE(in_shape->size(), 1U); + const TShape &dshape = (*in_shape)[0]; + TShape oshape = dshape; + if (param_.sample_type == up_enum::kNearest) { + CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); + oshape[1] = 0; + for (auto& shape : *in_shape) { + CHECK_EQ(shape.ndim(), 4U) << \ + "UpSamplingNearest: Input data should be 4D in (batch, channel, y, x)"; + int oh = dshape[2]*param_.scale, ow = dshape[3]*param_.scale; + CHECK_EQ(oh%shape[2], 0U) << "UpSamplingNearest: input height of " << shape[2] << \ + "does not divide output height of " << oh; + CHECK_EQ(ow%shape[3], 0U) << "UpSamplingNearest: input width of " << shape[3] << \ + "does not divide output width of " << ow; + if (param_.multi_input_mode == up_enum::kSum) { + CHECK(oshape[1] == 0 || oshape[1] == shape[1]) << \ + "Number of channels must be the same when multi_input_mode==sum"; + oshape[1] = shape[1]; + } else { + oshape[1] += shape[1]; + } + } + } else { + CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; + CHECK_EQ(dshape.ndim(), 4U) << \ + "UpSamplingBilinear: Input data should be 4D in (batch, channel, y, x)"; + if (dshape.ndim() == 0) return false; + int kernel = 2 * param_.scale - param_.scale % 2; + SHAPE_ASSIGN_CHECK(*in_shape, + up_enum::kWeight, + mshadow::Shape4(dshape[1], 1, kernel, kernel)); + oshape = dshape; + } + oshape[2] = dshape[2] * param_.scale; + oshape[3] = dshape[3] * param_.scale; + out_shape->clear(); + out_shape->push_back(oshape); + return true; + } + + bool InferType(std::vector *in_type, + std::vector *out_type, + std::vector *aux_type) const override { + CHECK_GE(in_type->size(), 1U); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; } else { - Assign(input_grad, req[i], - pool(slice<1>(grad, begin, end), - 
in_shape, - scale, - scale, - scale, - scale)); + UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); } - begin = end; } - } else { - Tensor input_grad = in_grad[up_enum::kData].get(s); - mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]); - Assign(input_grad, req[up_enum::kData], - pool(grad, - in_shape, - param.scale, - param.scale, - param.scale, - param.scale)); - } -} - -static inline DeconvolutionParam GetDeconvolutionParam(const UpSamplingParam& param) { - DeconvolutionParam p = DeconvolutionParam(); - int kernel = 2 * param.scale - param.scale % 2; - int stride = param.scale; - int pad = static_cast(ceil((param.scale - 1) / 2.)); - p.workspace = param.workspace; - p.num_group = param.num_filter; - p.num_filter = param.num_filter; - p.no_bias = true; - int shape[] = {1, 1}; - p.dilate = TShape(shape, shape + 2); - shape[0] = shape[1] = kernel; - p.kernel = TShape(shape, shape + 2); - shape[0] = shape[1] = stride; - p.stride = TShape(shape, shape + 2); - shape[0] = shape[1] = pad; - p.pad = TShape(shape, shape + 2); - return p; -} + out_type->clear(); + out_type->push_back(dtype); + return true; + } -template -void UpSamplingCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const UpSamplingParam& param = nnvm::get(attrs.parsed); - if (param.sample_type == up_enum::kNearest) { - MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, { - UpSamplingForward(ctx, param, inputs, req, outputs); - }); - } else if (param.sample_type == up_enum::kBilinear) { - DeconvolutionParam p = GetDeconvolutionParam(param); - _DeconvolutionCompute(p, ctx, inputs, req, outputs); - } else { - LOG(FATAL) << "Unknown sample type"; - } -} + OperatorProperty* Copy() const override { + auto ptr = new UpSamplingProp(); + ptr->param_ = this->param_; + return ptr; + } + + std::string TypeString() const override { + return "UpSampling"; + } + + std::vector DeclareBackwardDependency( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data) const override { + if (param_.sample_type == up_enum::kNearest) { + return {out_grad[up_enum::kOut]}; + } else { + return {out_grad[up_enum::kOut], in_data[up_enum::kData], in_data[up_enum::kWeight]}; + } + } + + std::vector > BackwardInplaceOption( + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &in_grad) const override { + return {}; + } + + std::vector ForwardResource( + const std::vector &in_shape) const override { + if (param_.sample_type == up_enum::kNearest) { + return {}; + } else { + return {ResourceRequest::kTempSpace}; + } + } + + std::vector BackwardResource( + const std::vector &in_shape) const override { + if (param_.sample_type == up_enum::kNearest) { + return {}; + } else { + return {ResourceRequest::kTempSpace}; + } + } + + Operator* CreateOperator(Context ctx) const override { + LOG(FATAL) << "Not Implemented"; + return NULL; + } + + Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const override; -template -void UpSamplingGradCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - const UpSamplingParam& param = nnvm::get(attrs.parsed); - if (param.sample_type == up_enum::kNearest) { - MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, { - CHECK_EQ(inputs.size(), 1U); - 
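As a rough standalone illustration (an editorial sketch, not part of the patch) of the kernel/stride/pad arithmetic that GetDeconvolutionParam above applies when lowering bilinear UpSampling onto Deconvolution:

#include <cmath>
#include <cstdio>

// Bilinear UpSampling with an integer scale maps to a Deconvolution with
//   kernel = 2*scale - scale%2, stride = scale, pad = ceil((scale - 1) / 2).
int main() {
  for (int scale = 2; scale <= 4; ++scale) {
    const int kernel = 2 * scale - scale % 2;
    const int stride = scale;
    const int pad = static_cast<int>(std::ceil((scale - 1) / 2.0));
    std::printf("scale=%d -> kernel=%d stride=%d pad=%d\n", scale, kernel, stride, pad);
  }
  return 0;
}
// Output: scale=2 -> kernel=4 stride=2 pad=1; scale=3 -> kernel=5 stride=3 pad=1;
//         scale=4 -> kernel=8 stride=4 pad=2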
UpSamplingBackward(ctx, param, inputs[0], req, outputs); - }); - } else if (param.sample_type == up_enum::kBilinear) { - DeconvolutionParam p = GetDeconvolutionParam(param); - _DeconvolutionGradCompute(p, ctx, inputs, req, outputs); - } else { - LOG(FATAL) << "Unknown sample type"; - } -} + private: + UpSamplingParam param_; +}; // class UpSamplingProp +#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet diff --git a/src/operator/nn/upsampling.cc b/src/operator/nn/upsampling.cc index 44b619ac9516..8942e35ab325 100644 --- a/src/operator/nn/upsampling.cc +++ b/src/operator/nn/upsampling.cc @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file upsampling_nearest.cc * \brief - * \author Bing Xu, Da Zheng + * \author Bing Xu */ #include "./upsampling-inl.h" @@ -30,123 +30,51 @@ namespace mxnet { namespace op { - -static bool UpSamplingShape(const nnvm::NodeAttrs& attrs, - std::vector *in_shape, std::vector *out_shape) { - const UpSamplingParam& param_ = nnvm::get(attrs.parsed); - CHECK_GE(in_shape->size(), 1U); - const TShape &dshape = (*in_shape)[0]; - TShape oshape = dshape; - if (param_.sample_type == up_enum::kNearest) { - CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); - oshape[1] = 0; - for (auto& shape : *in_shape) { - CHECK_EQ(shape.ndim(), 4U) << \ - "UpSamplingNearest: Input data should be 4D in (batch, channel, y, x)"; - int oh = dshape[2]*param_.scale, ow = dshape[3]*param_.scale; - CHECK_EQ(oh%shape[2], 0U) << "UpSamplingNearest: input height of " << shape[2] << \ - "does not divide output height of " << oh; - CHECK_EQ(ow%shape[3], 0U) << "UpSamplingNearest: input width of " << shape[3] << \ - "does not divide output width of " << ow; - if (param_.multi_input_mode == up_enum::kSum) { - CHECK(oshape[1] == 0 || oshape[1] == shape[1]) << \ - "Number of channels must be the same when multi_input_mode==sum"; - oshape[1] = shape[1]; - } else { - oshape[1] += shape[1]; - } - } - } else { - CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; - CHECK_EQ(dshape.ndim(), 4U) << \ - "UpSamplingBilinear: Input data should be 4D in (batch, channel, y, x)"; - if (dshape.ndim() == 0) return false; - int kernel = 2 * param_.scale - param_.scale % 2; - SHAPE_ASSIGN_CHECK(*in_shape, - up_enum::kWeight, - mshadow::Shape4(dshape[1], 1, kernel, kernel)); - oshape = dshape; - } - oshape[2] = dshape[2] * param_.scale; - oshape[3] = dshape[3] * param_.scale; - out_shape->clear(); - out_shape->push_back(oshape); - return true; -} - -static inline std::vector ListArguments(const UpSamplingParam& param) { - if (param.sample_type == up_enum::kNearest) { - std::vector ret; - for (int i = 0; i < param.num_args; ++i) { - ret.push_back(std::string("arg") + std::to_string(i)); - } - return ret; - } else { - return {"data", "weight"}; - } -} - -static bool UpSamplingType(const nnvm::NodeAttrs& attrs, - std::vector *in_type, std::vector *out_type) { - const UpSamplingParam& param = nnvm::get(attrs.parsed); - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; +template<> +Operator *CreateOp(UpSamplingParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + if (param.sample_type == up_enum::kNearest) { + op = new UpSamplingNearestOp(param); + } else if (param.sample_type == up_enum::kBilinear) { + DeconvolutionParam p = DeconvolutionParam(); + int kernel = 2 * param.scale 
- param.scale % 2; + int stride = param.scale; + int pad = static_cast(ceil((param.scale - 1) / 2.)); + p.workspace = param.workspace; + p.num_group = param.num_filter; + p.num_filter = param.num_filter; + p.no_bias = true; + int shape[] = {1, 1}; + p.dilate = TShape(shape, shape + 2); + shape[0] = shape[1] = kernel; + p.kernel = TShape(shape, shape + 2); + shape[0] = shape[1] = stride; + p.stride = TShape(shape, shape + 2); + shape[0] = shape[1] = pad; + p.pad = TShape(shape, shape + 2); + op = new DeconvolutionOp(p); } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param)[i]); + LOG(FATAL) << "Unknown sample type"; } - } - out_type->clear(); - out_type->push_back(dtype); - return true; + }); + return op; } -struct UpSamplingGrad { - const char *op_name; - std::vector operator()(const nnvm::NodePtr& n, - const std::vector& ograds) const { - const UpSamplingParam& param_ = nnvm::get(n->attrs.parsed); - std::vector heads(ograds.begin(), ograds.end()); - if (param_.sample_type != up_enum::kNearest) { - heads.push_back(n->inputs[up_enum::kData]); - heads.push_back(n->inputs[up_enum::kWeight]); - } - return MakeGradNode(op_name, n, heads, n->attrs.dict); - } -}; +Operator* UpSamplingProp::CreateOperatorEx(Context ctx, std::vector *in_shape, + std::vector *in_type) const { + DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); +} DMLC_REGISTER_PARAMETER(UpSamplingParam); -NNVM_REGISTER_OP(UpSampling) +MXNET_REGISTER_OP_PROPERTY(UpSampling, UpSamplingProp) .describe("Performs nearest neighbor/bilinear up sampling to inputs.") -.set_num_inputs([](const NodeAttrs& attrs) { - const UpSamplingParam& params = nnvm::get(attrs.parsed); - return params.sample_type == up_enum::kNearest ? params.num_args : 2; -}) -.set_num_outputs(1) -.set_attr_parser(ParamParser) -.set_attr("FListInputNames", - [](const NodeAttrs& attrs) { - return ListArguments(nnvm::get(attrs.parsed)); -}) -.set_attr("FInferShape", UpSamplingShape) -.set_attr("FInferType", UpSamplingType) -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - const UpSamplingParam& param = nnvm::get(n.parsed); - if (param.sample_type == up_enum::kNearest) { - return std::vector(); - } else { - return std::vector{ResourceRequest::kTempSpace}; - } -}) -.set_attr("FCompute", UpSamplingCompute) -.set_attr("FGradient", UpSamplingGrad{"_backward_UpSampling"}) -.set_attr("key_var_num_args", "num_args") .add_argument("data", "NDArray-or-Symbol[]", "Array of tensors to upsample") .add_arguments(UpSamplingParam::__FIELDS__()) +.set_key_var_num_args("num_args"); + +NNVM_REGISTER_OP(UpSampling) .set_attr("FSetInputVarAttrOnCompose", [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) { if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return; @@ -154,23 +82,5 @@ NNVM_REGISTER_OP(UpSampling) var->attrs.dict["__init__"] = "[\"bilinear\", {}]"; } }); - -NNVM_REGISTER_OP(_backward_UpSampling) -.set_num_outputs([](const NodeAttrs& attrs) { - const UpSamplingParam& params = nnvm::get(attrs.parsed); - return params.sample_type == up_enum::kNearest ? 
params.num_args : 2; -}) -.set_attr("TIsBackward", true) -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - const UpSamplingParam& param = nnvm::get(n.parsed); - if (param.sample_type == up_enum::kNearest) { - return std::vector(); - } else { - return std::vector{ResourceRequest::kTempSpace}; - } -}) -.set_attr_parser(ParamParser) -.set_attr("FCompute", UpSamplingGradCompute); - } // namespace op } // namespace mxnet diff --git a/src/operator/nn/upsampling.cu b/src/operator/nn/upsampling.cu index c5ff2fafd64a..f83535a2b2e6 100644 --- a/src/operator/nn/upsampling.cu +++ b/src/operator/nn/upsampling.cu @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file upsampling_nearest.cc * \brief - * \author Bing Xu, Da Zheng + * \author Bing Xu */ #include "./deconvolution-inl.h" @@ -29,12 +29,36 @@ namespace mxnet { namespace op { - -NNVM_REGISTER_OP(UpSampling) -.set_attr("FCompute", UpSamplingCompute); - -NNVM_REGISTER_OP(_backward_UpSampling) -.set_attr("FCompute", UpSamplingGradCompute); +template<> +Operator *CreateOp(UpSamplingParam param, int dtype) { + Operator *op = NULL; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + if (param.sample_type == up_enum::kNearest) { + op = new UpSamplingNearestOp(param); + } else if (param.sample_type == up_enum::kBilinear) { + DeconvolutionParam p = DeconvolutionParam(); + int kernel = 2 * param.scale - param.scale % 2; + int stride = param.scale; + int pad = static_cast(ceil((param.scale - 1) / 2.)); + p.workspace = param.workspace; + p.num_group = param.num_filter; + p.num_filter = param.num_filter; + p.no_bias = true; + int shape[] = {1, 1}; + p.dilate = TShape(shape, shape + 2); + shape[0] = shape[1] = kernel; + p.kernel = TShape(shape, shape + 2); + shape[0] = shape[1] = stride; + p.stride = TShape(shape, shape + 2); + shape[0] = shape[1] = pad; + p.pad = TShape(shape, shape + 2); + op = new DeconvolutionOp(p); + } else { + LOG(FATAL) << "Unknown sample type"; + } + }); + return op; +} } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/cast_storage-inl.h b/src/operator/tensor/cast_storage-inl.h index e345bb2193f4..ed200273854d 100644 --- a/src/operator/tensor/cast_storage-inl.h +++ b/src/operator/tensor/cast_storage-inl.h @@ -27,15 +27,11 @@ #include #include #include -#include #include "../mxnet_op.h" #include "../operator_common.h" #ifdef __CUDACC__ #include "./cast_storage-inl.cuh" #endif // __CUDACC__ -#if MXNET_USE_MKLDNN == 1 -#include "../nn/mkldnn/mkldnn_base-inl.h" -#endif namespace mxnet { @@ -346,20 +342,8 @@ void CastStorageComputeImpl(const OpContext& ctx, } else if (src_stype == kCSRStorage && dst_stype == kDefaultStorage) { TBlob ret = output.data(); CastStorageCsrDnsImpl(ctx, input, &ret); -#if MXNET_USE_MKLDNN == 1 - } else if (src_stype == kDefaultStorage && dst_stype == kDefaultStorage) { - CHECK_EQ(output.ctx().dev_type, input.ctx().dev_type); - // If one of them uses the MKLDNN layout. 
- if (input.IsMKLDNNData() || output.IsMKLDNNData()) { - auto in_mem = input.GetMKLDNNData(); - const_cast(output).CopyFrom(*in_mem); - MKLDNNStream::Get()->Submit(); - } else { - mxnet_op::copy(ctx.get_stream(), output.data(), input.data()); - } -#endif } else { - LOG(FATAL) << "Not implemented from " << src_stype << " to " << dst_stype; + LOG(FATAL) << "Not implemented"; } } @@ -392,14 +376,8 @@ inline bool CastStorageInferStorageType(const nnvm::NodeAttrs& attrs, // dns -> dns, dns -> rsp, dns -> csr if (!dispatched && in_stype == kDefaultStorage && param_stype == kDefaultStorage) { // dns -> dns - DispatchMode mode = DispatchMode::kFCompute; -#if MXNET_USE_MKLDNN == 1 - // If we use MKLDNN and the arrays are in CPU memory, the array may store - // MKLDNN layout, we should convert its layout explicitly. - if (dev_mask == kCPU) - mode = DispatchMode::kFComputeEx; -#endif - dispatched = storage_type_assign(out_attrs, kDefaultStorage, dispatch_mode, mode); + dispatched = storage_type_assign(out_attrs, kDefaultStorage, + dispatch_mode, DispatchMode::kFCompute); } if (!dispatched && in_stype == kDefaultStorage && (param_stype == kRowSparseStorage || param_stype == kCSRStorage)) { diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc index d73edc723520..d7e5e04ce87a 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_op_basic.cc @@ -24,68 +24,11 @@ */ #include "./elemwise_unary_op.h" #include "./elemwise_binary_op-inl.h" -#include "../nn/mkldnn/mkldnn_ops-inl.h" -#include "../nn/mkldnn/mkldnn_base-inl.h" namespace mxnet { namespace op { -static void ElemwiseAddEx(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CHECK_EQ(inputs.size(), 2U); - CHECK_EQ(outputs.size(), 1U); -#if MXNET_USE_MKLDNN == 1 - if (SupportMKLDNN(inputs[0]) && SupportMKLDNN(inputs[1])) { - MKLDNNSumForward(attrs, ctx, inputs, req[0], outputs[0]); - return; - } else if (inputs[0].storage_type() == kDefaultStorage - && inputs[1].storage_type() == kDefaultStorage) { - // This happens if inputs are supposed to be in MKLDNN format - // but MKLDNN doesn't support the data type or the shape. We're - // forced to convert it to the default format. 
- std::vector in_blobs(2); - std::vector out_blobs(1); - in_blobs[0] = inputs[0].data(); - in_blobs[1] = inputs[1].data(); - out_blobs[0] = outputs[0].data(); - ElemwiseBinaryOp::Compute(attrs, ctx, in_blobs, - req, out_blobs); - return; - } -#endif - ElemwiseBinaryOp::ComputeEx(attrs, ctx, inputs, - req, outputs); -} - -static inline bool ElemwiseAddStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK_EQ(in_attrs->size(), 2); - CHECK_EQ(out_attrs->size(), 1); - bool ret = ElemwiseStorageType<2, 1, true, true, true>(attrs, dev_mask, dispatch_mode, - in_attrs, out_attrs); -#if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask - && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage) - && out_attrs->at(0) == kDefaultStorage) { - *dispatch_mode = DispatchMode::kFComputeEx; - } -#endif - return ret; -} - -MXNET_OPERATOR_REGISTER_BINARY(elemwise_add) -.set_attr("FInferStorageType", ElemwiseAddStorageType) -.set_attr("FCompute", ElemwiseBinaryOp::Compute) -.set_attr("FComputeEx", ElemwiseAddEx) -.set_attr("FResourceRequest", /* For Sparse CSR */ - [](const NodeAttrs& attrs) { - return std::vector{ResourceRequest::kTempSpace};}) +MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(elemwise_add, op::mshadow_op::plus) MXNET_ADD_SPARSE_OP_ALIAS(elemwise_add) .add_alias("_add").add_alias("_plus").add_alias("_Plus") .describe(R"code(Adds arguments element-wise. @@ -103,41 +46,6 @@ The storage type of ``elemwise_add`` output depends on storage types of inputs // this must differ from elemwise_add to prevent add to optimization in forward pass. MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(_grad_add, op::mshadow_op::plus); -static void _backward_ElemwiseAddEx(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 2U); -#if MXNET_USE_MKLDNN == 1 - if (inputs[0].IsMKLDNNData()) { - MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]); - MKLDNNCopy(attrs, ctx, inputs[0], req[1], outputs[1]); - return; - } -#endif - ElemwiseBinaryOp::BackwardUseNoneEx( - attrs, ctx, inputs, req, outputs); -} - -static inline bool ElemwiseAddBackwardStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK_EQ(in_attrs->size(), 1); - CHECK_EQ(out_attrs->size(), 2); - bool ret = ElemwiseStorageType<1, 2, true, true, true>(attrs, dev_mask, dispatch_mode, - in_attrs, out_attrs); -#if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask) { - *dispatch_mode = DispatchMode::kFComputeEx; - } -#endif - return ret; -} - NNVM_REGISTER_OP(_backward_add) .set_num_inputs(1) .set_num_outputs(2) @@ -147,15 +55,13 @@ NNVM_REGISTER_OP(_backward_add) return std::vector >{{0, 0}, {0, 1}}; }) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; -}) -#endif .set_attr("FCompute", ElemwiseBinaryOp::BackwardUseNone< cpu, mshadow_op::identity, mshadow_op::identity>) -.set_attr("FComputeEx", _backward_ElemwiseAddEx) -.set_attr("FInferStorageType", ElemwiseAddBackwardStorageType); +.set_attr("FComputeEx", + ElemwiseBinaryOp::BackwardUseNoneEx) +.set_attr("FInferStorageType", + ElemwiseStorageType<1, 2, true, true, true>); MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(elemwise_sub, op::mshadow_op::minus) 
MXNET_ADD_SPARSE_OP_ALIAS(elemwise_sub) diff --git a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc index 6118ddf19c30..8c12218be062 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc @@ -65,7 +65,7 @@ static bool BinaryScalarStorageTypeWithDenseResultStorageType(const NodeAttrs& a const auto dispatch_ex = invalid_ctx ? DispatchMode::kFComputeFallback : DispatchMode::kFComputeEx; const double alpha = nnvm::get(attrs.parsed); - if (common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) { + if (instype == kDefaultStorage) { dispatched = storage_type_assign(&out_attrs[0], kDefaultStorage, dispatch_mode, DispatchMode::kFCompute); } @@ -89,7 +89,7 @@ static bool BinaryScalarStorageType(const nnvm::NodeAttrs& attrs, const auto in_stype = in_attrs->at(0); auto &out_stype = out_attrs->at(0); bool dispatched = false; - if (!dispatched && (in_stype == kDefaultStorage)) { + if (!dispatched && in_stype == kDefaultStorage) { // dns -> dns dispatched = storage_type_assign(&out_stype, kDefaultStorage, dispatch_mode, DispatchMode::kFCompute); diff --git a/src/operator/tensor/elemwise_sum.cc b/src/operator/tensor/elemwise_sum.cc index 10154bc9646d..b31dbb2598f0 100644 --- a/src/operator/tensor/elemwise_sum.cc +++ b/src/operator/tensor/elemwise_sum.cc @@ -24,8 +24,6 @@ */ #include "./elemwise_sum.h" #include "../../ndarray/ndarray_function.h" -#include "../nn/mkldnn/mkldnn_ops-inl.h" -#include "../../common/utils.h" namespace mxnet { namespace op { @@ -81,28 +79,9 @@ bool ElementWiseSumForwardInferStorageType(const nnvm::NodeAttrs& attrs, std::vector *out_attrs) { CHECK(!in_attrs->empty()); CHECK_EQ(out_attrs->size(), 1U); - bool ret = ElemwiseStorageAttr(attrs, dev_mask, dispatch_mode, - in_attrs, out_attrs); -#if MXNET_USE_MKLDNN == 1 - // We should always use FComputeEx. - if (dev_mask == mshadow::cpu::kDevMask - && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage) - && out_attrs->at(0) == kDefaultStorage) { - *dispatch_mode = DispatchMode::kFComputeEx; - } -#endif - return ret; -} - -#if MXNET_USE_MKLDNN == 1 -static inline bool IsMKLDNNData(const std::vector &arrs) { - for (auto &arr : arrs) { - if (!arr.IsMKLDNNData()) - return false; - } - return true; + return ElemwiseStorageAttr(attrs, dev_mask, dispatch_mode, + in_attrs, out_attrs); } -#endif void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -113,28 +92,13 @@ void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs, CHECK_EQ(outputs.size(), 1U); CHECK_EQ(req.size(), 1U); if (req[0] == kNullOp) return; + CHECK_EQ(req[0], kWriteTo) << "ElementWiseSumComputeExCPU only supports req = kWriteTo"; if (inputs[0].storage_type() == kRowSparseStorage) { mshadow::Stream* s = ctx.get_stream(); Resource rsc = ResourceManager::Get()->Request(ctx.run_ctx.get_ctx(), ResourceRequest(ResourceRequest::kTempSpace)); NDArray out_nd = outputs[0]; mxnet::ndarray::ElementwiseSum(s, rsc, inputs, &out_nd); -#if MXNET_USE_MKLDNN == 1 - } else if (IsMKLDNNData(inputs)) { - MKLDNNSumForward(attrs, ctx, inputs, req[0], outputs[0]); -#endif - } else if (common::ContainsOnlyStorage(inputs, kDefaultStorage)) { - // This case happens when we want to create an MKLDNN NDArray but the type - // or the shape isn't supported by MKLDNN. In this case, NDArray falls back - // to the default storage type and, thus, we have to handle the default - // storage in FComputeEx. 
- std::vector in_blobs(inputs.size()); - std::vector out_blobs(outputs.size()); - for (size_t i = 0; i < in_blobs.size(); i++) - in_blobs[i] = inputs[i].data(); - for (size_t i = 0; i < out_blobs.size(); i++) - out_blobs[i] = outputs[i].data(); - ElementWiseSumCompute(attrs, ctx, in_blobs, req, out_blobs); } else { LogUnimplementedOp(attrs, ctx, inputs, req, outputs); } diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index cca3b2c9ff90..13a58d0165a8 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ -24,7 +24,6 @@ #include #include "elemwise_unary_op.h" #include "./elemwise_binary_op-inl.h" -#include "../nn/mkldnn/mkldnn_ops-inl.h" namespace mxnet { namespace op { @@ -108,64 +107,12 @@ MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(_backward_sigmoid, unary_bwd); // copy -static void CopyEx(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); - const auto in_stype = inputs[0].storage_type(); - const auto out_stype = outputs[0].storage_type(); -#if MXNET_USE_MKLDNN == 1 - if (inputs[0].IsMKLDNNData()) { - MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]); - return; - } else if (in_stype == kDefaultStorage && out_stype == kDefaultStorage) { - // This happens if inputs are supposed to be in MKLDNN format - // but MKLDNN doesn't support the data type or the shape. We're - // forced to convert it to the default format. - std::vector in_blobs {inputs[0].data()}; - std::vector out_blobs {outputs[0].data()}; - UnaryOp::IdentityCompute(attrs, ctx, in_blobs, req, out_blobs); - return; - } -#endif - UnaryOp::IdentityComputeEx(attrs, ctx, inputs, req, outputs); -} - -static inline bool CopyStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK_EQ(in_attrs->size(), 1); - CHECK_EQ(out_attrs->size(), 1); - bool ret = ElemwiseStorageType<1, 1, false, true, true>(attrs, dev_mask, dispatch_mode, - in_attrs, out_attrs); -#if MXNET_USE_MKLDNN == 1 - // We have to make sure all inputs are default layouts. Otherwise, we might - // want to fallback. 
- if (dev_mask == mshadow::cpu::kDevMask - && in_attrs->at(0) == kDefaultStorage - && out_attrs->at(0) == kDefaultStorage) { - *dispatch_mode = DispatchMode::kFComputeEx; - } -#endif - return ret; -} - MXNET_OPERATOR_REGISTER_UNARY(_copy) .MXNET_DESCRIBE("Returns a copy of the input.") .add_alias("identity") -.set_attr("FInferStorageType", CopyStorageType) +.set_attr("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>) .set_attr("FCompute", UnaryOp::IdentityCompute) -.set_attr("FComputeEx", CopyEx) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; -}) -#endif +.set_attr("FComputeEx", UnaryOp::IdentityComputeEx) .set_attr("FInplaceIdentity", [](const NodeAttrs& attrs){ return std::vector{true}; @@ -180,14 +127,9 @@ NNVM_REGISTER_OP(_backward_copy) [](const NodeAttrs& attrs){ return std::vector >{{0, 0}}; }) -.set_attr("FInferStorageType", CopyStorageType) +.set_attr("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>) .set_attr("FCompute", UnaryOp::IdentityCompute) -.set_attr("FComputeEx", CopyEx) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; -}) -#endif +.set_attr("FComputeEx", UnaryOp::IdentityComputeEx) .set_attr("FInplaceIdentity", [](const NodeAttrs& attrs){ return std::vector{true}; diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc index 25c233318f01..9167fcfe7e34 100644 --- a/src/operator/tensor/matrix_op.cc +++ b/src/operator/tensor/matrix_op.cc @@ -25,8 +25,6 @@ // this will be invoked by gcc and compile CPU version #include "./matrix_op-inl.h" #include "./elemwise_unary_op.h" -#include "../nn/mkldnn/mkldnn_ops-inl.h" -#include "../nn/mkldnn/mkldnn_base-inl.h" namespace mxnet { namespace op { @@ -182,51 +180,6 @@ If the argument `reverse` is set to 1, then the special values are inferred from .add_argument("data", "NDArray-or-Symbol", "Input data to reshape.") .add_arguments(ReshapeParam::__FIELDS__()); -static void FlattenEx(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CHECK_EQ(inputs.size(), 1U); - CHECK_EQ(outputs.size(), 1U); -#if MXNET_USE_MKLDNN == 1 - const auto in_stype = inputs[0].storage_type(); - const auto out_stype = outputs[0].storage_type(); - if (inputs[0].IsMKLDNNData()) { - MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]); - // If the output is a special MKLDNN layout and the number of dimensions - // is larger than 2, we should use the default layout. - if (outputs[0].IsMKLDNNData() && inputs[0].shape().ndim() > 2) - const_cast(outputs[0]).Reorder2Default(); - return; - } else { - // This happens if inputs are supposed to be in MKLDNN format - // but MKLDNN doesn't support the data type or the shape. We're - // forced to convert it to the default format. 
- FallBackCompute(UnaryOp::IdentityCompute, attrs, ctx, inputs, req, outputs); - return; - } -#endif -} - -static inline bool FlattenStorageType(const nnvm::NodeAttrs& attrs, - const int dev_mask, - DispatchMode* dispatch_mode, - std::vector *in_attrs, - std::vector *out_attrs) { - CHECK_EQ(in_attrs->size(), 1); - CHECK_EQ(out_attrs->size(), 1); - bool ret = ElemwiseStorageType<1, 1, false, false, false>(attrs, dev_mask, dispatch_mode, - in_attrs, out_attrs); -#if MXNET_USE_MKLDNN == 1 - if (dev_mask == mshadow::cpu::kDevMask - && in_attrs->at(0) == kDefaultStorage - && out_attrs->at(0) == kDefaultStorage) { - *dispatch_mode = DispatchMode::kFComputeEx; - } -#endif - return ret; -} NNVM_REGISTER_OP(Flatten) .add_alias("flatten") @@ -257,15 +210,8 @@ Example:: .set_num_outputs(1) .set_attr("FInferShape", FlattenShape) .set_attr("FInferType", ElemwiseType<1, 1>) -.set_attr("FInferStorageType", FlattenStorageType) .set_attr("FGradient", ElemwiseGradUseNone{ "_backward_copy" }) .set_attr("FCompute", UnaryOp::IdentityCompute) -.set_attr("FComputeEx", FlattenEx) -#if MXNET_USE_MKLDNN == 1 -.set_attr("FResourceRequest", [](const NodeAttrs& n) { - return std::vector{ResourceRequest::kTempSpace}; -}) -#endif .set_attr("FInplaceOption", [](const NodeAttrs& attrs) { return std::vector >{{0, 0}}; diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h index 52df4dd2bbc8..f0dd61f01ac0 100644 --- a/src/storage/cpu_device_storage.h +++ b/src/storage/cpu_device_storage.h @@ -54,13 +54,7 @@ class CPUDeviceStorage { /*! * \brief Alignment of allocation. */ -#if MXNET_USE_MKLDNN == 1 - // MKLDNN requires special alignment. 4096 is used by the MKLDNN library in - // memory allocation. - static constexpr size_t alignment_ = 4096; -#else static constexpr size_t alignment_ = 16; -#endif }; // class CPUDeviceStorage inline void* CPUDeviceStorage::Alloc(size_t size) { diff --git a/tests/ci_build/ci_build.sh b/tests/ci_build/ci_build.sh index 1d6d64be3862..794a4c55ee45 100755 --- a/tests/ci_build/ci_build.sh +++ b/tests/ci_build/ci_build.sh @@ -178,7 +178,6 @@ ${DOCKER_BINARY} run --rm --pid=host \ -e "CI_BUILD_GID=$(id -g)" \ -e "CUDA_ARCH=-gencode arch=compute_52,code=[sm_52,compute_52] --fatbin-options -compress-all" \ -e "MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0" \ - -e "ARCH_OPT=-mavx2" \ ${CI_DOCKER_EXTRA_PARAMS[@]} \ ${DOCKER_IMG_NAME} \ ${PRE_COMMAND} \ diff --git a/tests/cpp/include/test_core_op.h b/tests/cpp/include/test_core_op.h index 570911c23568..6a220bdad6d7 100644 --- a/tests/cpp/include/test_core_op.h +++ b/tests/cpp/include/test_core_op.h @@ -209,13 +209,6 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer requested.emplace_back(r); } else if (req.type == ResourceRequest::kRandom) { requested.emplace_back(ResourceManager::Get()->Request(ctx->run_ctx.ctx, req)); - } else if (req.type == ResourceRequest::kParallelRandom) { - Resource rm = ResourceManager::Get()->Request(ctx->run_ctx.ctx, req); - if (ctx->run_ctx.ctx.dev_mask() == Context::kCPU) { - common::random::RandGenerator::AllocState( - rm.get_parallel_random()); - } - requested.emplace_back(rm); } else { LOG(FATAL) << "resource type not yet supported"; } @@ -321,9 +314,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer // Set up forward attrs_ = ParseAttrs(op_, args); - int num_inputs = op_->num_inputs; - if (op_->get_num_inputs) - num_inputs = op_->get_num_inputs(attrs_); + const int num_inputs = op_->num_inputs; if (!inputs.empty()) { CHECK_EQ(inputs.size(), static_cast(num_inputs)); @@ 
-349,8 +340,8 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer inputs_.reserve(num_inputs); inputs_p.reserve(num_inputs); - outputs_.reserve(inferred_num_outputs); - outputs_p.reserve(inferred_num_outputs); + outputs_.reserve(num_visible_outputs); + outputs_p.reserve(num_visible_outputs); for (size_t i = 0; i < static_cast(num_inputs); ++i) { CHECK_LT(i, static_cast(shapes.size())); @@ -359,7 +350,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer inputs_p.emplace_back(&*inputs_.rbegin()); } - for (size_t i = 0; i < static_cast(inferred_num_outputs); ++i) { + for (size_t i = 0; i < static_cast(num_visible_outputs); ++i) { // If supplied and valid, pass from the supplied outputs vector // Otherwise use empty for forward pass, or zero-filled for backward pass outputs_.emplace_back(i < outputs.size() diff --git a/tests/cpp/include/test_op_runner.h b/tests/cpp/include/test_op_runner.h index 1e00e30a1b34..0992c41f760e 100644 --- a/tests/cpp/include/test_op_runner.h +++ b/tests/cpp/include/test_op_runner.h @@ -137,8 +137,7 @@ class OperatorRunner { const test::op::kwargs_t& kwargs, int dim = 0, size_t count = 1, - const std::vector& timing_shapes = {}, - bool backward = true) { + const std::vector& timing_shapes = {}) { if (mxnet::test::quick_test) { total_iterations_ = 2; count = 1; @@ -226,7 +225,7 @@ class OperatorRunner { CHECK(false) << "Unsupported dimension count: " << (D + 1); } if (info.executor_) { - if (info.executor_->HasBackward() && backward) { + if (info.executor_->HasBackward()) { RunGenericOperatorBackward(&info, count); } timing += info.executor_->GetTiming(); diff --git a/tests/cpp/operator/activation_perf.cc b/tests/cpp/operator/activation_perf.cc index 1bd8ca89c9f5..e482848705ad 100644 --- a/tests/cpp/operator/activation_perf.cc +++ b/tests/cpp/operator/activation_perf.cc @@ -26,7 +26,7 @@ #include #include #include "../include/test_op_runner.h" -#include "../include/test_core_op.h" +#include "../include/test_legacy_op.h" #include "../../src/operator/nn/activation-inl.h" using namespace mxnet; @@ -41,10 +41,8 @@ TEST(ACTIVATION_PERF, ExecuteBidirectional) { TShape shape({5, 5}); kwargs_t kwargs = basic_activation_args; kwargs.push_back({"act_type", "tanh"}); - - test::op::CoreOperatorRunner runner; - runner.RunBidirectional(false, { shape }, test::op::CoreOpExecutor::ArgsWithOpName( - kwargs, "Activation", "_backward_Activation"), 1); + test::op::LegacyOpRunner runner; + runner.RunBidirectional(false, { shape }, kwargs, 1); } /*! 
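For reference, the perf tests in this area pass operator arguments as plain string pairs. A minimal standalone sketch of that pattern (assuming kwargs_t is the std::vector of std::pair<std::string, std::string> typedef'd at the top of these test files; this is not the actual test harness):

#include <iostream>
#include <string>
#include <utility>
#include <vector>

// Assumed to match the kwargs_t typedef used by the perf tests above.
typedef std::vector<std::pair<std::string, std::string> > kwargs_t;

int main() {
  kwargs_t kwargs;                         // stands in for basic_activation_args
  kwargs.push_back({"act_type", "tanh"});  // the argument the activation tests add
  for (const auto& kv : kwargs)
    std::cout << kv.first << "=" << kv.second << std::endl;
  return 0;
}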
@@ -54,12 +52,10 @@ TEST(ACTIVATION_PERF, TimingCPU) { kwargs_t kwargs = basic_activation_args; // Which math function is arbitrary since it will have roughly constant timing among approaches kwargs.push_back({"act_type", "tanh"}); - kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "Activation", - "_backward_Activation"); - TShape shape({10, 10, 10, 10}); - test::op::CoreOperatorRunner runner; - runner.RunBidirectional(false, { shape }, kwargs, 1); - + test::op::LegacyOpRunner runner; + runner.RunBidirectional(false, + { TShape({10, 10, 10, 10}) }, + kwargs, 1); // prime code and cache std::vector shapes; if (test::performance_run) { shapes = { @@ -88,11 +84,11 @@ TEST(ACTIVATION_PERF, TimingGPU) { kwargs_t kwargs = basic_activation_args; // Which math function is arbitrary since it will have roughly constant timing among approaches kwargs.push_back({"act_type", "tanh"}); - kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "Activation", - "_backward_Activation"); - TShape shape({10, 10, 10, 10}); - test::op::CoreOperatorRunner runner; - runner.RunBidirectional(true, { shape }, kwargs, 1); + test::OperatorRunner> runner; + runner.RunBidirectional(true, + { TShape({10, 10, 10, 10}) }, + kwargs, 1); // prime code and cache std::vector shapes = { {1, 1, 28, 28}, {1, 3, 28, 28}, diff --git a/tests/cpp/operator/batchnorm_test.cc b/tests/cpp/operator/batchnorm_test.cc index 607b9804684a..179e42a3830f 100644 --- a/tests/cpp/operator/batchnorm_test.cc +++ b/tests/cpp/operator/batchnorm_test.cc @@ -24,14 +24,11 @@ * \author Chris Olivier */ -#if 0 - #include #include #include "../../src/operator/nn/batch_norm-inl.h" #include "../../src/operator/batch_norm_v1-inl.h" #include "./test_legacy_op.h" -#include "./test_core_op.h" #include "executor/exec_pass.h" using namespace mxnet; @@ -1830,5 +1827,3 @@ TEST(BATCH_NORM, Test2DBackwardMixedComplex_gpu_cpu_ugs) { } #endif // MXNET_USE_CUDA - -#endif diff --git a/tests/cpp/operator/dropout_perf.cc b/tests/cpp/operator/dropout_perf.cc index c28b9bd48097..90bf6ebb0dfd 100644 --- a/tests/cpp/operator/dropout_perf.cc +++ b/tests/cpp/operator/dropout_perf.cc @@ -26,7 +26,7 @@ #include #include #include "../include/test_op_runner.h" -#include "../include/test_core_op.h" +#include "../include/test_legacy_op.h" #include "../../src/operator/nn/dropout-inl.h" using namespace mxnet; @@ -41,10 +41,8 @@ TEST(DROPOUT_PERF, ExecuteBidirectional) { TShape shape({5, 5}); kwargs_t kwargs = basic_dropout_args; kwargs.push_back({"mode", "always"}); - test::op::CoreOperatorRunner runner; - kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "Dropout", - "_backward_Dropout"); - runner.RunGenericOperatorForward(false, { shape }, kwargs, 1); + test::op::LegacyOpRunner runner; + runner.RunBidirectional(false, { shape }, kwargs, 1); } /*! 
@@ -54,11 +52,10 @@ TEST(DROPOUT_PERF, TimingCPU) { kwargs_t kwargs = basic_dropout_args; // Which math function is arbitrary since it will have roughly constant timing among approaches kwargs.push_back({"mode", "always"}); - TShape shape({10, 10, 10, 10}); - test::op::CoreOperatorRunner runner; - kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "Dropout", - "_backward_Dropout"); - runner.RunGenericOperatorForward(false, { shape }, kwargs, 1); + test::op::LegacyOpRunner runner; + runner.RunBidirectional(false, + { TShape({10, 10, 10, 10}) }, + kwargs, 1); // prime code and cache std::vector shapes; if (test::performance_run) { shapes = { @@ -75,9 +72,7 @@ TEST(DROPOUT_PERF, TimingCPU) { }; } for (const TShape &shape : shapes) { - kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "Dropout", - "_backward_Dropout"); - runner.TimingTest("Dropout Operator CPU", false, false, kwargs, 2, 10, { shape }, false); + runner.TimingTest("Dropout Operator CPU", false, false, kwargs, 2, 10, { shape }); } } @@ -89,11 +84,11 @@ TEST(DROPOUT_PERF, TimingGPU) { kwargs_t kwargs = basic_dropout_args; // Which math function is arbitrary since it will have roughly constant timing among approaches kwargs.push_back({"mode", "always"}); - TShape shape({10, 10, 10, 10}); - test::op::CoreOperatorRunner runner; - kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "Dropout", - "_backward_Dropout"); - runner.RunGenericOperatorForward(true, { shape }, kwargs, 1); + test::OperatorRunner> runner; + runner.RunBidirectional(true, + { TShape({10, 10, 10, 10}) }, + kwargs, 1); // prime code and cache std::vector shapes = { {1, 1, 28, 28}, {1, 3, 28, 28}, @@ -102,9 +97,8 @@ TEST(DROPOUT_PERF, TimingGPU) { {20, 3, 128, 128} }; for (const TShape &shape : shapes) { - kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "Dropout", - "_backward_Dropout"); - runner.TimingTest("Dropout Operator GPU", true, false, kwargs, 2, 10, { shape }, false); + runner.TimingTest("Dropout Operator GPU", true, false, kwargs, 2, 10, { shape }); } } #endif // MXNET_USE_CUDA == 1 + diff --git a/tests/cpp/operator/fully_conn_perf.cc b/tests/cpp/operator/fully_conn_perf.cc index 0ea4082a3fc5..2acfacdddcf0 100644 --- a/tests/cpp/operator/fully_conn_perf.cc +++ b/tests/cpp/operator/fully_conn_perf.cc @@ -29,25 +29,21 @@ #include #include "../../src/operator/nn/fully_connected-inl.h" #include "../include/test_op_runner.h" -#include "../include/test_core_op.h" +#include "../include/test_legacy_op.h" using namespace mxnet; typedef std::vector > kwargs_t; -const kwargs_t basic_fullyconn_args = { {"num_hidden", "250"}, {"no_bias", "true"} }; +const kwargs_t basic_fullyconn_args = { {"num_hidden", "250"} }; /*! * \brief Generic bidirectional sanity test */ TEST(FULLY_CONNECTED, ExecuteBidirectionalFullyConnected) { - TShape shape1({5, 5}); - TShape shape2({250, 5}); + TShape shape({5, 5}); kwargs_t kwargs = basic_fullyconn_args; - test::op::CoreOperatorRunner runner; - runner.set_verbose(true); - kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "FullyConnected", - "_backward_FullyConnected"); - runner.RunGenericOperatorForward(false, { shape1, shape2 }, kwargs, 1); + test::op::LegacyOpRunner runner; + runner.RunBidirectional(false, { shape }, kwargs, 1); } /*! 
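For reference, the removed CoreOperatorRunner variants of the FullyConnected tests below supplied an explicit weight shape of (num_hidden, product of the non-batch input dims), e.g. (250, 1000) for a 10x10x10x10 input. A minimal sketch of that arithmetic, with a hypothetical prod_dims helper standing in for TShape::ProdShape:

#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical stand-in for TShape::ProdShape(begin, ndim): product of dims from `begin` on.
std::size_t prod_dims(const std::vector<std::size_t>& shape, std::size_t begin) {
  std::size_t p = 1;
  for (std::size_t i = begin; i < shape.size(); ++i) p *= shape[i];
  return p;
}

int main() {
  const std::vector<std::size_t> data_shape = {10, 10, 10, 10};  // shape used by the timing test
  const std::size_t num_hidden = 250;                            // from basic_fullyconn_args
  std::cout << "weight shape: (" << num_hidden << ", "
            << prod_dims(data_shape, 1) << ")" << std::endl;     // prints (250, 1000)
  return 0;
}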
@@ -55,12 +51,10 @@ TEST(FULLY_CONNECTED, ExecuteBidirectionalFullyConnected) { */ TEST(FULLY_CONNECTED, FullyConnectedTimingCPU) { kwargs_t kwargs = basic_fullyconn_args; - TShape shape1({10, 10, 10, 10}); - TShape shape2({250, 1000}); - test::op::CoreOperatorRunner runner; - kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "FullyConnected", - "_backward_FullyConnected"); - runner.RunGenericOperatorForward(false, { shape1, shape2 }, kwargs, 1); + test::op::LegacyOpRunner runner; + runner.RunBidirectional(false, + { TShape({10, 10, 10, 10}) }, + kwargs, 1); // prime code and cache std::vector shapes; if (test::performance_run) { shapes = { @@ -77,11 +71,7 @@ TEST(FULLY_CONNECTED, FullyConnectedTimingCPU) { }; } for (const TShape& shape : shapes) { - TShape shape2({250, static_cast(shape.ProdShape(1, shape.ndim()))}); - kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "FullyConnected", - "_backward_FullyConnected"); - runner.TimingTest("Fully connected CPU", false, false, kwargs, 2, 10, - { shape, shape2 }, false); + runner.TimingTest("Fully connected CPU", false, false, kwargs, 2, 10, { shape }); } } @@ -91,12 +81,12 @@ TEST(FULLY_CONNECTED, FullyConnectedTimingCPU) { */ TEST(FULLY_CONNECTED, FullyConnectedTimingGPU) { kwargs_t kwargs = basic_fullyconn_args; - TShape shape1({10, 10, 10, 10}); - TShape shape2({250, 1000}); - test::op::CoreOperatorRunner runner; - kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "FullyConnected", - "_backward_FullyConnected"); - runner.RunGenericOperatorForward(true, { shape1, shape2 }, kwargs, 1); + test::OperatorRunner> + runner; + runner.RunBidirectional(true, + { TShape({10, 10, 10, 10}) }, + kwargs, 1); // prime code and cache std::vector shapes; if (test::performance_run) { shapes = { @@ -113,11 +103,7 @@ TEST(FULLY_CONNECTED, FullyConnectedTimingGPU) { }; } for (const TShape& shape : shapes) { - TShape shape2({250, shape.ProdShape(1, shape.ndim())}); - kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "FullyConnected", - "_backward_FullyConnected"); - runner.TimingTest("Fully connected GPU", true, false, kwargs, 2, 10, - { shape, shape2 }, false); + runner.TimingTest("Fully connected GPU", true, false, kwargs, 2, 10, { shape }); } } #endif // MXNET_USE_CUDA == 1 diff --git a/tests/cpp/operator/mkldnn.cc b/tests/cpp/operator/mkldnn.cc deleted file mode 100644 index a8a3d26fac3d..000000000000 --- a/tests/cpp/operator/mkldnn.cc +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file mkldnn.cc - * \brief test functions in mkldnn. 
- * \author Da Zheng - */ - -#if MXNET_USE_MKLDNN == 1 - -#include "gtest/gtest.h" -#include "../../src/operator/nn/mkldnn/mkldnn_base-inl.h" - -bool test_mem_align(void *mem, size_t size, size_t alignment, size_t space) { - void *ret1, *ret2; - size_t space1, space2; - space1 = space; - space2 = space; - ret1 = mxnet::AlignMem(mem, size, alignment, &space1); - ret2 = std::align(alignment, size, mem, space2); - EXPECT_EQ(ret1, ret2); - EXPECT_EQ(space1, space2); - return ret1 == ret2; -} - -TEST(MKLDNN_UTIL_FUNC, AlignMem) { - size_t alignment = 4096; - void *mem; - size_t size, space; - - // When mem has been aligned. - mem = reinterpret_cast(0x10000); - size = 1000; - space = 10000; - test_mem_align(mem, size, alignment, space); - - // When mem isn't aligned and we have enough space for alignment. - mem = reinterpret_cast(0x10010); - size = 1000; - space = 10000; - test_mem_align(mem, size, alignment, space); - - // When mem isn't aligned and we don't have enough memory for alignment - mem = reinterpret_cast(0x10010); - size = 1000; - space = 1001; - test_mem_align(mem, size, alignment, space); - - for (size_t i = 0; i < 10000; i++) { - mem = reinterpret_cast(random()); - size = random() % 2000; - space = random() % 2000; - test_mem_align(mem, size, alignment, space); - } -} -#endif diff --git a/tests/python/gpu/test_gluon_model_zoo_gpu.py b/tests/python/gpu/test_gluon_model_zoo_gpu.py deleted file mode 100644 index bc35b0b32327..000000000000 --- a/tests/python/gpu/test_gluon_model_zoo_gpu.py +++ /dev/null @@ -1,163 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -from __future__ import print_function -import mxnet as mx -import numpy as np -import copy -from mxnet import autograd -from mxnet.gluon.model_zoo.vision import get_model -from mxnet.test_utils import assert_almost_equal -import sys - -def eprint(*args, **kwargs): - print(*args, file=sys.stderr, **kwargs) - -VAL_DATA='data/val-5k-256.rec' -def download_data(): - return mx.test_utils.download( - 'http://data.mxnet.io/data/val-5k-256.rec', VAL_DATA) - -def test_inference(): - all_models = ['resnet50_v1', 'vgg19_bn', 'alexnet', #'inceptionv3', - 'densenet201', 'squeezenet1.0', 'mobilenet0.25'] - - batch_size = 10 - download_data() - for model_name in all_models: - eprint('testing inference on %s'%model_name) - - data_shape = (3, 224, 224) if 'inception' not in model_name else (3, 299, 299) - dataIter = mx.io.ImageRecordIter( - path_imgrec = VAL_DATA, - label_width = 1, - preprocess_threads = 1, - batch_size = batch_size, - data_shape = data_shape, - label_name = 'softmax_label', - rand_crop = False, - rand_mirror = False) - data_batch = dataIter.next() - data = data_batch.data[0] - label = data_batch.label[0] - gpu_data = data.as_in_context(mx.gpu()) - gpu_label = label.as_in_context(mx.gpu()) - - # This is to create a model and run the model once to initialize - # all parameters. - cpu_model = get_model(model_name) - cpu_model.collect_params().initialize(ctx=mx.cpu()) - cpu_model(mx.nd.array(data, ctx=mx.cpu())) - gpu_model = get_model(model_name) - gpu_model.collect_params().initialize(ctx=mx.gpu()) - gpu_model(mx.nd.array(data, ctx=mx.gpu())) - - # Force the two models have the same parameters. - cpu_params = cpu_model.collect_params() - gpu_params = gpu_model.collect_params() - for k in cpu_params.keys(): - k = k.replace(cpu_params.prefix, '') - cpu_param = cpu_params.get(k) - gpu_param = gpu_params.get(k) - gpu_param.set_data(cpu_param.data().as_in_context(mx.gpu())) - - # Run inference. - with autograd.record(train_mode=False): - cpu_out = cpu_model(mx.nd.array(data, ctx=mx.cpu())) - gpu_out = gpu_model(gpu_data) - out = cpu_out.asnumpy() - max_val = np.max(out) - assert_almost_equal(out / max_val, gpu_out.asnumpy() / max_val, rtol=1e-2, atol=1e-2) - -def get_nn_model(name): - if "densenet" in name: - return get_model(name, dropout=0) - else: - return get_model(name) - -def test_training(): - # We use network models without dropout for testing. - # TODO(zhengda) mobilenet can't pass this test even without MKLDNN. - all_models = ['resnet18_v1', 'densenet121'] - - batch_size = 10 - label = mx.nd.random.uniform(low=0, high=10, shape=(batch_size)).astype('int32') - - download_data() - dataIter = mx.io.ImageRecordIter( - path_imgrec = VAL_DATA, - label_width = 1, - preprocess_threads = 1, - batch_size = batch_size, - data_shape = (3, 224, 224), - label_name = 'softmax_label', - rand_crop = False, - rand_mirror = False) - data_batch = dataIter.next() - data = data_batch.data[0] - label = data_batch.label[0] - gpu_data = data.as_in_context(mx.gpu()) - gpu_label = label.as_in_context(mx.gpu()) - softmax_cross_entropy = mx.gluon.loss.SoftmaxCrossEntropyLoss() - - for model_name in all_models: - eprint('testing %s'%model_name) - #data = mx.nd.random.uniform(shape=(100, 3, 224, 224)) - - # This is to create a model and run the model once to initialize - # all parameters. 
- cpu_model = get_nn_model(model_name) - cpu_model.collect_params().initialize(ctx=mx.cpu()) - cpu_model(mx.nd.array(data, ctx=mx.cpu())) - gpu_model = get_nn_model(model_name) - gpu_model.collect_params().initialize(ctx=mx.gpu()) - gpu_model(mx.nd.array(data, ctx=mx.gpu())) - - # Force the two models have the same parameters. - cpu_params = cpu_model.collect_params() - gpu_params = gpu_model.collect_params() - for k in cpu_params.keys(): - k = k.replace(cpu_params.prefix, '') - cpu_param = cpu_params.get(k) - gpu_param = gpu_params.get(k) - gpu_param.set_data(cpu_param.data().as_in_context(mx.gpu())) - - cpu_trainer = mx.gluon.Trainer(cpu_params, 'sgd', {'learning_rate': 0.1}) - gpu_trainer = mx.gluon.Trainer(gpu_params, 'sgd', {'learning_rate': 0.1}) - - # Run forward and backward once. - with autograd.record(): - cpu_out = cpu_model(mx.nd.array(data, ctx=mx.cpu())) - gpu_out = gpu_model(gpu_data) - cpu_loss = softmax_cross_entropy(cpu_out, label) - gpu_loss = softmax_cross_entropy(gpu_out, gpu_label) - assert_almost_equal(cpu_out.asnumpy(), gpu_out.asnumpy(), rtol=1e-2, atol=1e-2) - cpu_loss.backward() - gpu_loss.backward() - cpu_trainer.step(batch_size) - gpu_trainer.step(batch_size) - - # Compare the parameters of the two models. - for k in cpu_params.keys(): - k = k.replace(cpu_params.prefix, '') - cpu_param = cpu_params.get(k) - gpu_param = gpu_params.get(k) - assert_almost_equal(cpu_param.data().asnumpy(), gpu_param.data().asnumpy(), rtol=1e-2, atol=1e-2) - -if __name__ == '__main__': - import nose - nose.runmodule() diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 5ae489529c33..55bb30cc7d6a 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -987,13 +987,6 @@ def test_activation_with_type(): check_consistency(sym, ctx_list) -def test_lrn(): - sym = mx.sym.LRN(alpha=0.0001, beta=0.75, knorm=2, nsize=5, name='lrn') - ctx_list = [{'ctx': mx.gpu(0), 'lrn_data': (2, 6, 10, 10), 'type_dict': {'lrn_data': np.float32}}, - {'ctx': mx.cpu(0), 'lrn_data': (2, 6, 10, 10), 'type_dict': {'lrn_data': np.float32}}] - check_consistency(sym, ctx_list) - - def test_embedding_with_type(): def test_embedding_helper(data_types, weight_types, low_pad, high_pad): NVD = [[20, 10, 20], [200, 10, 300]]
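For reference, the CreateOp blocks restored above in upsampling.cc and upsampling.cu map the bilinear UpSampling scale onto Deconvolution settings with kernel = 2*scale - scale%2, stride = scale, and pad = ceil((scale-1)/2). A minimal sketch of that arithmetic, independent of the MXNet types:

#include <cmath>
#include <iostream>

int main() {
  // Same formulas as the restored bilinear branch of CreateOp.
  for (int scale = 2; scale <= 4; ++scale) {
    int kernel = 2 * scale - scale % 2;                        // scale=2 -> 4, scale=3 -> 5
    int stride = scale;
    int pad = static_cast<int>(std::ceil((scale - 1) / 2.));   // scale=2 -> 1, scale=4 -> 2
    std::cout << "scale=" << scale << " kernel=" << kernel
              << " stride=" << stride << " pad=" << pad << std::endl;
  }
  return 0;
}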