diff --git a/.gitmodules b/.gitmodules index 170c105a6f48..42f0027505fd 100644 --- a/.gitmodules +++ b/.gitmodules @@ -22,3 +22,7 @@ [submodule "3rdparty/googletest"] path = 3rdparty/googletest url = https://github.com/google/googletest.git +[submodule "3rdparty/mkldnn"] + path = 3rdparty/mkldnn + url = https://github.com/intel/mkl-dnn.git + branch = master diff --git a/3rdparty/mkldnn b/3rdparty/mkldnn new file mode 160000 index 000000000000..3e1f8f53f684 --- /dev/null +++ b/3rdparty/mkldnn @@ -0,0 +1 @@ +Subproject commit 3e1f8f53f6845dce23abf8089501c2eb45420b9e diff --git a/CMakeLists.txt b/CMakeLists.txt index 1d2e8c7aa333..a1ebb0c0eb9b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,8 +33,8 @@ mxnet_option(USE_OPENMP "Build with Openmp support" ON) mxnet_option(USE_CUDNN "Build with cudnn support" ON) # one could set CUDNN_ROOT for search path mxnet_option(USE_LAPACK "Build with lapack support" ON IF NOT MSVC) mxnet_option(USE_MKL_IF_AVAILABLE "Use MKL if found" ON) -mxnet_option(USE_MKLML_MKL "Use MKLML variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE)) -mxnet_option(USE_MKL_EXPERIMENTAL "Use experimental MKL (if MKL enabled and found)" OFF) +mxnet_option(USE_MKLDNN "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE)) +mxnet_option(USE_MKLML_MKL "Use MKLDNN variant of MKL (if MKL found)" ON IF USE_MKL_IF_AVAILABLE AND UNIX AND (NOT APPLE)) mxnet_option(USE_OPERATOR_TUNING "Enable auto-tuning of operators" ON AND NOT MSVC) mxnet_option(USE_GPERFTOOLS "Build with GPerfTools support (if found)" ON) mxnet_option(USE_JEMALLOC "Build with Jemalloc support" ON) @@ -138,14 +138,11 @@ if(USE_VTUNE) endif() if(USE_MKL_IF_AVAILABLE) - if(USE_MKL_EXPERIMENTAL AND NOT USE_MKLML_MKL) - message(ERROR " USE_MKL_EXPERIMENTAL can only be used when USE_MKL_EXPERIMENTAL is enabled") - endif() find_package(MKL) if(MKL_FOUND) include_directories(${MKL_INCLUDE_DIR}) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/src/operator/mkl) - add_definitions(-DMXNET_USE_MKL2017=1) + add_definitions(-DMXNET_USE_MKLDNN=1) add_definitions(-DUSE_MKL=1) add_definitions(-DCUB_MKL=1) list(APPEND mxnet_LINKER_LIBS ${MKL_LIBRARIES}) @@ -154,11 +151,6 @@ if(USE_MKL_IF_AVAILABLE) endif() # If using MKL, use the Intel OMP libraries list(APPEND mxnet_LINKER_LIBS iomp5) - if(USE_MKL_EXPERIMENTAL) - add_definitions(-DMKL_EXPERIMENTAL=1) - else() - add_definitions(-DMKL_EXPERIMENTAL=0) - endif() else() message(STATUS " MKL not found") endif() diff --git a/Jenkinsfile b/Jenkinsfile index 2bffdd4e5aea..b5b6ec671c82 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -6,6 +6,7 @@ mx_lib = 'lib/libmxnet.so, lib/libmxnet.a, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a' // mxnet cmake libraries, in cmake builds we do not produce a libnvvm static library by default. mx_cmake_lib = 'build/libmxnet.so, build/libmxnet.a, build/dmlc-core/libdmlc.a' +mx_mkldnn_lib = 'lib/libmxnet.so, lib/libmxnet.a, lib/libiomp5.so, lib/libmklml_gnu.so, lib/libmkldnn.so, lib/libmkldnn.so.0, lib/libmklml_intel.so, dmlc-core/libdmlc.a, nnvm/lib/libnnvm.a' // command to start a docker container docker_run = 'tests/ci_build/ci_build.sh' // timeout in minutes @@ -143,18 +144,18 @@ def python3_gpu_ut(docker_type) { } // Python 2 -def python2_mklml_ut(docker_type) { +def python2_mkldnn_ut(docker_type) { timeout(time: max_time, unit: 'MINUTES') { sh "${docker_run} ${docker_type} find . 
-name '*.pyc' -type f -delete" - sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-2.7 --with-timer --verbose tests/python/cpu" + sh "${docker_run} ${docker_type} PYTHONPATH=./python/ MXNET_MKLDNN_DEBUG=1 nosetests-2.7 --with-timer --verbose tests/python/cpu" } } // Python 3 -def python3_mklml_ut(docker_type) { +def python3_mkldnn_ut(docker_type) { timeout(time: max_time, unit: 'MINUTES') { sh "${docker_run} ${docker_type} find . -name '*.pyc' -type f -delete" - sh "${docker_run} ${docker_type} PYTHONPATH=./python/ nosetests-3.4 --with-timer --verbose tests/python/cpu" + sh "${docker_run} ${docker_type} PYTHONPATH=./python/ MXNET_MKLDNN_DEBUG=1 nosetests-3.4 --with-timer --verbose tests/python/cpu" } } @@ -225,21 +226,20 @@ try { } } }, - 'CPU: MKLML': { + 'CPU: MKLDNN': { node('mxnetlinux-cpu') { - ws('workspace/build-mklml-cpu') { + ws('workspace/build-mkldnn-cpu') { init_git() def flag = """ \ DEV=1 \ USE_PROFILER=1 \ USE_CPP_PACKAGE=1 \ USE_BLAS=openblas \ - USE_MKL2017=1 \ - USE_MKL2017_EXPERIMENTAL=1 \ + USE_MKLDNN=1 \ -j\$(nproc) """ make("cpu_mklml", flag) - pack_lib('mklml_cpu') + pack_lib('mkldnn_cpu', mx_mkldnn_lib) } } }, @@ -260,24 +260,23 @@ try { } } }, - 'GPU: MKLML': { + 'GPU: MKLDNN': { node('mxnetlinux-cpu') { - ws('workspace/build-mklml-gpu') { + ws('workspace/build-mkldnn-gpu') { init_git() def flag = """ \ DEV=1 \ USE_PROFILER=1 \ USE_CPP_PACKAGE=1 \ USE_BLAS=openblas \ - USE_MKL2017=1 \ - USE_MKL2017_EXPERIMENTAL=1 \ + USE_MKLDNN=1 \ USE_CUDA=1 \ USE_CUDA_PATH=/usr/local/cuda \ USE_CUDNN=1 \ -j\$(nproc) """ make("build_cuda", flag) - pack_lib('mklml_gpu') + pack_lib('mkldnn_gpu', mx_mkldnn_lib) } } }, @@ -424,43 +423,43 @@ try { } } }, - 'Python2: MKLML-CPU': { + 'Python2: MKLDNN-CPU': { node('mxnetlinux-cpu') { - ws('workspace/ut-python2-mklml-cpu') { + ws('workspace/ut-python2-mkldnn-cpu') { init_git() - unpack_lib('mklml_cpu') + unpack_lib('mkldnn_cpu', mx_mkldnn_lib) python2_ut('cpu_mklml') - python2_mklml_ut('cpu_mklml') + python2_mkldnn_ut('cpu_mklml') } } }, - 'Python2: MKLML-GPU': { + 'Python2: MKLDNN-GPU': { node('mxnetlinux-gpu') { - ws('workspace/ut-python2-mklml-gpu') { + ws('workspace/ut-python2-mkldnn-gpu') { init_git() - unpack_lib('mklml_gpu') + unpack_lib('mkldnn_gpu', mx_mkldnn_lib) python2_gpu_ut('gpu_mklml') - python2_mklml_ut('gpu_mklml') + python2_mkldnn_ut('gpu_mklml') } } }, - 'Python3: MKLML-CPU': { + 'Python3: MKLDNN-CPU': { node('mxnetlinux-cpu') { - ws('workspace/ut-python3-mklml-cpu') { + ws('workspace/ut-python3-mkldnn-cpu') { init_git() - unpack_lib('mklml_cpu') + unpack_lib('mkldnn_cpu', mx_mkldnn_lib) python3_ut('cpu_mklml') - python3_mklml_ut('cpu_mklml') + python3_mkldnn_ut('cpu_mklml') } } }, - 'Python3: MKLML-GPU': { + 'Python3: MKLDNN-GPU': { node('mxnetlinux-gpu') { - ws('workspace/ut-python3-mklml-gpu') { + ws('workspace/ut-python3-mkldnn-gpu') { init_git() - unpack_lib('mklml_gpu') + unpack_lib('mkldnn_gpu', mx_mkldnn_lib) python3_gpu_ut('gpu_mklml') - python3_mklml_ut('gpu_mklml') + python3_mkldnn_ut('gpu_mklml') } } }, diff --git a/Makefile b/Makefile index b0cff74e69ea..de38463f8bcd 100644 --- a/Makefile +++ b/Makefile @@ -42,11 +42,11 @@ endif # use customized config file include $(config) -ifeq ($(USE_MKL2017), 1) -# must run ./prepare_mkl before including mshadow.mk - RETURN_STRING := $(shell ./prepare_mkl.sh $(MKLML_ROOT)) - MKLROOT := $(firstword $(RETURN_STRING)) - export USE_MKLML = $(lastword $(RETURN_STRING)) +ifeq ($(USE_MKLDNN), 1) + RETURN_STRING := $(shell ./prepare_mkldnn.sh $(MKLDNN_ROOT)) + 
MKLDNNROOT := $(firstword $(RETURN_STRING)) + MKLROOT := $(lastword $(RETURN_STRING)) + export USE_MKLML = 1 endif include mshadow/make/mshadow.mk @@ -114,23 +114,16 @@ ifeq ($(USE_NNPACK), 1) LDFLAGS += -lnnpack endif -ifeq ($(USE_MKL2017), 1) - CFLAGS += -DMXNET_USE_MKL2017=1 +ifeq ($(USE_MKLDNN), 1) + CFLAGS += -DMXNET_USE_MKLDNN=1 CFLAGS += -DUSE_MKL=1 - CFLAGS += -I$(ROOTDIR)/src/operator/mkl/ - CFLAGS += -I$(MKLML_ROOT)/include - LDFLAGS += -L$(MKLML_ROOT)/lib - ifeq ($(USE_MKL2017_EXPERIMENTAL), 1) - CFLAGS += -DMKL_EXPERIMENTAL=1 - else - CFLAGS += -DMKL_EXPERIMENTAL=0 - endif - ifeq ($(UNAME_S), Darwin) - LDFLAGS += -lmklml - else - LDFLAGS += -Wl,--as-needed -lmklml_intel -lmklml_gnu + CFLAGS += -I$(ROOTDIR)/src/operator/nn/mkldnn/ + ifneq ($(MKLDNNROOT), $(MKLROOT)) + CFLAGS += -I$(MKLROOT)/include + LDFLAGS += -L$(MKLROOT)/lib endif - LDFLAGS += -liomp5 + CFLAGS += -I$(MKLDNNROOT)/include + LDFLAGS += -L$(MKLDNNROOT)/lib -lmkldnn -Wl,-rpath,'$${ORIGIN}' endif ifeq ($(USE_OPERATOR_TUNING), 1) @@ -144,7 +137,7 @@ endif # - for Ubuntu, installing atlas will not automatically install the atlas provided lapack library # silently switching lapack off instead of letting the build fail because of backward compatibility ifeq ($(USE_LAPACK), 1) -ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas)) +ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas mkl)) ifeq (,$(wildcard /lib/liblapack.a)) ifeq (,$(wildcard /usr/lib/liblapack.a)) ifeq (,$(wildcard /usr/lib64/liblapack.a)) @@ -162,7 +155,7 @@ ifeq ($(USE_LAPACK), 1) ifneq ($(USE_LAPACK_PATH), ) LDFLAGS += -L$(USE_LAPACK_PATH) endif - ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas)) + ifeq ($(USE_BLAS),$(filter $(USE_BLAS),blas openblas atlas mkl)) LDFLAGS += -llapack endif CFLAGS += -DMXNET_USE_LAPACK @@ -552,7 +545,8 @@ clean: cyclean $(EXTRA_PACKAGES_CLEAN) else clean: cyclean testclean $(EXTRA_PACKAGES_CLEAN) $(RM) -r build lib bin *~ */*~ */*/*~ */*/*/*~ R-package/NAMESPACE R-package/man R-package/R/mxnet_generated.R \ - R-package/inst R-package/src/image_recordio.h R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz + R-package/inst R-package/src/image_recordio.h R-package/src/*.o R-package/src/*.so mxnet_*.tar.gz \ + external/mkldnn/install/* cd $(DMLC_CORE); $(MAKE) clean; cd - cd $(PS_PATH); $(MAKE) clean; cd - cd $(NNVM_PATH); $(MAKE) clean; cd - diff --git a/amalgamation/mxnet_predict0.cc b/amalgamation/mxnet_predict0.cc index f35591d82b22..cfee60559501 100644 --- a/amalgamation/mxnet_predict0.cc +++ b/amalgamation/mxnet_predict0.cc @@ -66,7 +66,7 @@ #include "src/operator/operator_util.cc" #include "src/operator/nn/activation.cc" #include "src/operator/nn/batch_norm.cc" -#include "src/operator/concat.cc" +#include "src/operator/nn/concat.cc" #include "src/operator/nn/convolution.cc" #include "src/operator/nn/deconvolution.cc" #include "src/operator/nn/dropout.cc" diff --git a/cmake/ChooseBlas.cmake b/cmake/ChooseBlas.cmake index 3a8723a5dd5e..13d7083f3d12 100644 --- a/cmake/ChooseBlas.cmake +++ b/cmake/ChooseBlas.cmake @@ -23,7 +23,7 @@ if(USE_MKL_IF_AVAILABLE) find_package(MKL) endif() if(MKL_FOUND) - if(USE_MKLML_MKL) + if(USE_MKLDNN) set(BLAS "open") else() set(BLAS "MKL") @@ -55,4 +55,4 @@ elseif(BLAS STREQUAL "apple") list(APPEND mshadow_LINKER_LIBS ${Accelerate_LIBRARIES}) add_definitions(-DMSHADOW_USE_MKL=0) add_definitions(-DMSHADOW_USE_CBLAS=1) -endif() \ No newline at end of file +endif() diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake index 
743a871ee7cd..70405566d8ae 100644 --- a/cmake/Modules/FindMKL.cmake +++ b/cmake/Modules/FindMKL.cmake @@ -19,7 +19,7 @@ # # Options: # -# USE_MKLML_MKL : Search for MKL:ML library variant +# USE_MKLDNN : Search for MKL:ML library variant # # MKL_USE_SINGLE_DYNAMIC_LIBRARY : use single dynamic library interface # MKL_USE_STATIC_LIBS : use static libraries @@ -33,7 +33,7 @@ # MKL_INCLUDE_DIR : unclude directory # MKL_LIBRARIES : the libraries to link against. # -# cjolivier01: Changed to also look for MKLML library (subset of mkl) instead of standard MKL package +# cjolivier01: Changed to also look for MKLDNN library (subset of mkl) instead of standard MKL package # if(MKL_FOUND) @@ -43,7 +43,7 @@ endif() # ---[ Root folders set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs") -if(USE_MKLML_MKL) +if(USE_MKLDNN) find_path(MKL_ROOT include/mkl_blas.h PATHS $ENV{MKL_ROOT} @@ -66,13 +66,14 @@ if(USE_MKLML_MKL) set(__mkl_libs "") if(WIN32) - list(APPEND __mkl_libs intel) + list(APPEND __mkl_libs mklml_intel) else() - list(APPEND __mkl_libs gnu) + list(APPEND __mkl_libs mklml_gnu) endif() + list(APPEND __mkl_libs mkldnn) foreach (__lib ${__mkl_libs}) - set(__mkl_lib "mklml_${__lib}") + set(__mkl_lib "${__lib}") string(TOUPPER ${__mkl_lib} __mkl_lib_upper) if(MKL_USE_STATIC_LIBS) @@ -90,8 +91,7 @@ if(USE_MKLML_MKL) list(APPEND MKL_LIBRARIES ${${__mkl_lib_upper}_LIBRARY}) endforeach() - -else(USE_MKLML_MKL) +else(USE_MKLDNN) # ---[ Options mxnet_option(MKL_USE_SINGLE_DYNAMIC_LIBRARY "Use single dynamic library interface" ON) @@ -193,7 +193,7 @@ else(USE_MKLML_MKL) list(APPEND MKL_LIBRARIES ${MKL_RTL_LIBRARY}) endif() -endif(USE_MKLML_MKL) +endif(USE_MKLDNN) include(FindPackageHandleStandardArgs) find_package_handle_standard_args(MKL DEFAULT_MSG ${__looked_for}) diff --git a/example/image-classification/common/data.py b/example/image-classification/common/data.py index dc8915cda4c8..05f5ddc4506e 100755 --- a/example/image-classification/common/data.py +++ b/example/image-classification/common/data.py @@ -112,7 +112,8 @@ def get_rec_iter(args, kv=None): image_shape = tuple([int(l) for l in args.image_shape.split(',')]) if 'benchmark' in args and args.benchmark: data_shape = (args.batch_size,) + image_shape - train = SyntheticDataIter(args.num_classes, data_shape, 500, np.float32) + train = SyntheticDataIter(args.num_classes, data_shape, + args.num_examples / args.batch_size, np.float32) return (train, None) if kv: (rank, nworker) = (kv.rank, kv.num_workers) diff --git a/include/mxnet/ndarray.h b/include/mxnet/ndarray.h index a18d2daec8c3..43bc205944e2 100644 --- a/include/mxnet/ndarray.h +++ b/include/mxnet/ndarray.h @@ -35,12 +35,13 @@ #include #include #include +#include +#if MXNET_USE_MKLDNN == 1 +#include +#endif #include "./base.h" #include "./storage.h" #include "./engine.h" -#if MKL_EXPERIMENTAL == 1 -#include -#endif // check c++11 #if DMLC_USE_CXX11 == 0 #error "cxx11 was required for ndarray module" @@ -72,6 +73,7 @@ enum NDArrayFormatErr { kRSPIdxErr, // indices error for row sparse }; +class MKLDNNMemory; /*! * \brief ndarray interface @@ -80,9 +82,6 @@ class NDArray { public: /*! \brief default constructor */ NDArray() { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = MKLMemHolder::create(); -#endif } /*! 
* \brief constructs a new dynamic NDArray @@ -96,56 +95,14 @@ class NDArray { : ptr_(std::make_shared(shape, ctx, delay_alloc, dtype)), shape_(shape), dtype_(dtype), storage_type_(kDefaultStorage), entry_({nullptr, 0, 0}) { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = std::make_shared(); -#endif } /*! \brief constructor for NDArray with storage type */ NDArray(const NDArrayStorageType stype, const TShape &shape, Context ctx, bool delay_alloc = true, int dtype = mshadow::default_type_flag, std::vector aux_types = {}, std::vector aux_shapes = {}, - TShape storage_shape = TShape(mshadow::Shape1(0))) - : shape_(shape), dtype_(dtype), storage_type_(stype), - entry_({nullptr, 0, 0}) { - // Assign default aux types if not given - if (aux_types.size() == 0) { - if (stype == kRowSparseStorage) { - aux_types = {mshadow::kInt64}; - } else if (stype == kCSRStorage) { - aux_types = {mshadow::kInt64, mshadow::kInt64}; - } else { - LOG(FATAL) << "Unknown storage type " << stype; - } - } - // Assign default shapes if not given - // unknown shapes are intialized as {0} such that Size() would return 0 - if (aux_shapes.size() == 0) { - if (stype == kRowSparseStorage) { - aux_shapes = {TShape(mshadow::Shape1(0))}; - } else if (stype == kCSRStorage) { - // aux shapes for indptr and indices - aux_shapes = {TShape(mshadow::Shape1(0)), TShape(mshadow::Shape1(0))}; - } else { - LOG(FATAL) << "Unknown storage type " << stype; - } - } - if (storage_shape.Size() == 0) { - if (stype == kRowSparseStorage) { - storage_shape = shape; - storage_shape[0] = aux_shapes[rowsparse::kIdx][0]; - } else if (stype == kCSRStorage) { - storage_shape = aux_shapes[csr::kIdx]; - } else { - LOG(FATAL) << "Unknown storage type " << stype; - } - } - ptr_ = std::make_shared(stype, storage_shape, ctx, delay_alloc, - dtype, aux_types, aux_shapes); -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = std::make_shared(); -#endif - } + TShape storage_shape = TShape(mshadow::Shape1(0))); + /*! * \brief constructing a static NDArray that shares data with TBlob * Use with caution: allocate ONLY ONE NDArray for each TBlob, @@ -157,17 +114,11 @@ class NDArray { : ptr_(std::make_shared(data, dev_id)), shape_(data.shape_), dtype_(data.type_flag_), storage_type_(kDefaultStorage), entry_({nullptr, 0, 0}) { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = std::make_shared(); -#endif } /*! \brief create ndarray from shared memory */ NDArray(int shared_pid, int shared_id, const TShape& shape, int dtype) : ptr_(std::make_shared(shared_pid, shared_id, shape, dtype)), shape_(shape), dtype_(dtype), storage_type_(kDefaultStorage), entry_({nullptr, 0, 0}) { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = std::make_shared(); -#endif } /*! @@ -184,11 +135,24 @@ class NDArray { const TBlob &data, const std::vector &aux_data, int dev_id) : ptr_(std::make_shared(stype, data, aux_data, dev_id)), shape_(shape), dtype_(data.type_flag_), storage_type_(stype), entry_({nullptr, 0, 0}) { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = std::make_shared(); -#endif } + /* + * This indicates whether an array is a view of another array (created by + * reshape or slice). If an array is a view and the the data is stored in + * MKLDNN format, we need to convert the data to the default format when + * data in the view is accessed. + */ + inline bool IsView() const { + // View only works on the default storage + if (storage_type() != kDefaultStorage) + return false; + // If the array reuses memory, its shape may be different from the storage + // shape. However, we shouldn't consider it as a view. 
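+  // (reuse_ is set by AsArray() when this array borrows another array's
+  // storage, possibly with a different shape or dtype.)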
+ if (reuse_) + return false; + return byte_offset_ > 0 || shape() != ptr_->storage_shape; + } /*! * \return the shape of current NDArray. @@ -271,9 +235,6 @@ class NDArray { << "Unexpected storage type: " << stype; res = TBlob(dptr, shape, ptr_->aux_handles[i].ctx.dev_mask(), type); }); -#if MKL_EXPERIMENTAL == 1 - res.Mkl_mem_ = Mkl_mem_; -#endif return res; } /*! @@ -534,15 +495,12 @@ class NDArray { CHECK_GE(ptr_->shandle.size, shape.Size() * mshadow::mshadow_sizeof(dtype)) << "NDArray.AsArray: target memory size is bigger"; -#if MKL_EXPERIMENTAL == 1 - if (Mkl_mem_ != nullptr) { - // convert prv to cpu - Mkl_mem_->check_and_prv_to_cpu(ptr_->shandle.dptr); - } -#endif + // We can't reuse memory in a view. + CHECK(!IsView()); NDArray ret = *this; ret.shape_ = shape; ret.dtype_ = dtype; + ret.reuse_ = true; return ret; } /*! @@ -611,6 +569,83 @@ class NDArray { << "CheckAndAllocAuxData is not intended for kDefaultStorage"; ptr_->CheckAndAllocAuxData(i, aux_shape); } + +#if MXNET_USE_MKLDNN == 1 + /* + * Test if the data is stored in one of special MKLDNN format. + */ + bool IsMKLDNNData() const { + return ptr_->IsMKLDNN(); + } + /* + * Test if the data is stored in one of default MXNet formats. + */ + bool IsDefaultData() const { + return ptr_->IsDefault(); + } + /* + * All functions below return a raw pointer to mkldnn memory. Actually there + * is a shared pointer that hold the memory either in NDArray or in MKLDNN + * stream. As long as we call these functions inside an operator, the return + * memory is always valid. + */ + + /* + * This function returns mkldnn::memory with the default primitive_desc. + */ + const mkldnn::memory *GetMKLDNNData() const; + /* + * This function returns mkldnn::memory with the given primitive_desc + * as long as the array size meets the required size in the given primitive_desc. + */ + const mkldnn::memory *GetMKLDNNData( + const mkldnn::memory::primitive_desc &desc) const; + /* + * This function returns mkldnn::memory with the given primitive_desc. + * The returned mkldnn::memory will have the same physical layout as + * the given primitive_desc. + */ + const mkldnn::memory *GetMKLDNNDataReorder( + const mkldnn::memory::primitive_desc &desc) const; + + /* + * This function copies data from mkldnn memory. + */ + void CopyFrom(const mkldnn::memory &mem); + /* + * This function allocates memory for array and creates mkldnn memory + * with the specified format. + */ + mkldnn::memory *CreateMKLDNNData( + const mkldnn::memory::primitive_desc &desc); + + /* + * Reorder the memory to the specified layout. + */ + void MKLDNNDataReorder(const mkldnn::memory::primitive_desc &desc); + void Reorder2Default() { + CHECK_EQ(storage_type(), kDefaultStorage); + ptr_->Reorder2Default(); + } + + void InvalidateMKLDNNData() { + // Removing mkl_mem_ means the NDArray will store data in the default format. + ptr_->mkl_mem_ = nullptr; + } + + /* + * This function is used inside operators to reshape an array. + * It doesn't change the layout of the original array and allocate memory from + * the temporary buffer. The returned array is only valid inside the current + * invocation of this operator. + * This is different from Reshape. Reshape will cause data in the array to be + * converted to the default layout and allocate memory from malloc directly, + * which can be expensive. + * It's used by FullyConnected right now. + */ + NDArray MKLDNNDataReshape(const TShape &shape) const; +#endif + /*! * \brief Save list of ndarray into the Stream.x * \param fo The stream of output. 
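A minimal usage sketch for the MKLDNN accessors declared above, as they might appear inside a hypothetical CPU operator (in_arr, out_arr and out_pd are illustrative names; submission of the queued primitive is assumed to be handled by the surrounding operator code):

  // Get the input in whatever layout it is currently stored in.
  const mkldnn::memory *in_mem = in_arr.GetMKLDNNData();
  // Ask the output NDArray for memory with the layout the primitive expects.
  mkldnn::memory *out_mem = out_arr.CreateMKLDNNData(out_pd);
  // Queue the primitive on the shared MKLDNN stream.
  MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(*in_mem, *out_mem));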
@@ -645,6 +680,12 @@ class NDArray { for csr, aux_handles[0] = indptr, aux_handles[1] = indices */ std::vector aux_handles; + +#if MXNET_USE_MKLDNN == 1 + /*! This is created when data is stored in MKLDNN format. + */ + std::shared_ptr mkl_mem_; +#endif /*! \brief variable from engine */ Engine::VarHandle var; /*! @@ -706,7 +747,7 @@ class NDArray { : static_data(false), delay_alloc(false) { var = Engine::Get()->NewVariable(); ctx = Context::CPUShared(0); - shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype);; + shandle.size = shape.Size() * mshadow::mshadow_sizeof(dtype); shandle.ctx = ctx; shandle.shared_pid = shared_pid; shandle.shared_id = shared_id; @@ -781,6 +822,9 @@ class NDArray { inline void CheckAndAlloc(void) { if (delay_alloc) { shandle = Storage::Get()->Alloc(shandle.size, shandle.ctx); +#if MXNET_USE_MKLDNN == 1 + mkl_mem_ = nullptr; +#endif delay_alloc = false; } } @@ -789,15 +833,22 @@ class NDArray { // size is the number of bytes void CheckAndAlloc(uint64_t dbytes) { CHECK_EQ(kDefaultStorage, storage_type) - << "CheckAndAlloc(dbytes) is not intended for kDefaultStorage"; + << "CheckAndAlloc(dbytes) is only intended for kDefaultStorage"; + dbytes = std::max(dbytes, shandle.size); if (delay_alloc) { shandle = Storage::Get()->Alloc(dbytes, shandle.ctx); +#if MXNET_USE_MKLDNN == 1 + mkl_mem_ = nullptr; +#endif delay_alloc = false; } else if (shandle.size < dbytes) { // free storage if necessary and alloc again if (shandle.size > 0) Storage::Get()->Free(shandle); // init storage shandle = Storage::Get()->Alloc(dbytes, shandle.ctx); +#if MXNET_USE_MKLDNN == 1 + mkl_mem_ = nullptr; +#endif } } @@ -823,20 +874,19 @@ class NDArray { // storage shape is also updated // if data is already allocated, try reuse the storage. Otherwise, free the current one // and allocate new storage - inline void CheckAndAllocData(const TShape &shape, int dtype) { - CHECK_NE(aux_shapes.size(), 0) << "data is expected to be allocated after aux_data"; - auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype); - if (shandle.size < dbytes) { - // free storage if necessary and alloc again - if (shandle.size > 0) Storage::Get()->Free(shandle); - // init storage - shandle = Storage::Get()->Alloc(dbytes, ctx); - } - // init shape - storage_shape = shape; - // delay_alloc is only set when data storage handle is present - delay_alloc = false; - } + void CheckAndAllocData(const TShape &shape, int dtype); + +#if MXNET_USE_MKLDNN == 1 + // Have MKL memory reference to the data in the default storage + // or create memory for MKLDNN. + void SetMKLMem(const TShape &shape, int dtype); + // In the data is stored in MKLDNN layout, we reorder data in mkl_mem_ and + // save the result in shandle. + void Reorder2Default(); + bool IsMKLDNN() const; + bool IsDefault() const; +#endif + // create storage handle for aux data based on shape // this function assumes ctx, aux shapes and aux types are set // aux shape is also updated @@ -862,45 +912,11 @@ class NDArray { set_aux_shape(i, shape); } /*! 
\brief destructor */ - ~Chunk() { - bool skip_free = static_data || delay_alloc; - Storage::Handle h = this->shandle; - std::vector aux_h = this->aux_handles; - Engine::Get()->DeleteVariable([h, aux_h, skip_free](RunContext s) { - if (skip_free == false) { - Storage::Get()->Free(h); - for (size_t i = 0; i < aux_h.size(); i++) { - if (aux_h[i].size > 0) Storage::Get()->Free(aux_h[i]); - } - } - }, shandle.ctx, var); - } + ~Chunk(); }; // struct Chunk - void SetTBlob() const { - CHECK(ptr_ != nullptr); - TShape shape = shape_; - char *dptr = static_cast(ptr_->shandle.dptr); - auto stype = storage_type(); - if (stype == kDefaultStorage) { - dptr += byte_offset_; - } else if (stype == kCSRStorage || stype == kRowSparseStorage) { - shape = storage_shape(); - } else { - LOG(FATAL) << "unknown storage type " << stype; - } - tblob_.dptr_ = dptr; - tblob_.shape_ = shape; - tblob_.type_flag_ = dtype_; - tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id); -#if MKL_EXPERIMENTAL == 1 - tblob_.Mkl_mem_ = Mkl_mem_; -#endif - } + void SetTBlob() const; -#if MKL_EXPERIMENTAL == 1 - std::shared_ptr Mkl_mem_; -#endif /*! \brief internal data of NDArray */ std::shared_ptr ptr_{nullptr}; /*! \brief shape of current NDArray */ @@ -909,6 +925,8 @@ class NDArray { size_t byte_offset_ = 0; /*! \brief type of data */ int dtype_ = -1; + /*! \brief whether the NDArray uses memory of another NDArray. */ + bool reuse_ = false; /*! \brief storage type of data */ NDArrayStorageType storage_type_ = kUndefinedStorage; /*! \brief node entry for autograd */ diff --git a/include/mxnet/tensor_blob.h b/include/mxnet/tensor_blob.h index b65cd2b434e4..168ddcca24b7 100755 --- a/include/mxnet/tensor_blob.h +++ b/include/mxnet/tensor_blob.h @@ -36,9 +36,6 @@ #include #include #include "./base.h" -#if MXNET_USE_MKL2017 == 1 -#include -#endif namespace mxnet { /* Forward declaration for friend declaration in TBlob */ @@ -66,17 +63,10 @@ class TBlob { /*! \brief type flag of the tensor blob */ int type_flag_; - /*! \brief storing mkl chunk buffer blob, use for experimental only */ -#if MKL_EXPERIMENTAL == 1 - std::shared_ptr Mkl_mem_; -#endif /*! \brief default constructor, default copy assign will work */ TBlob(void) : dptr_(NULL), type_flag_(mshadow::DataType::kFlag) { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = NULL; -#endif SetDLTensor(cpu::kDevMask, 0); } /*! @@ -90,9 +80,6 @@ class TBlob { TBlob(DType *dptr, const TShape &shape, int dev_mask, int dev_id = -1) : dptr_(dptr), shape_(shape), type_flag_(mshadow::DataType::kFlag) { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = NULL; -#endif SetDLTensor(dev_mask, dev_id); } /*! @@ -105,9 +92,6 @@ class TBlob { */ TBlob(void *dptr, const TShape &shape, int dev_mask, int type_flag, int dev_id = -1) : dptr_(dptr), shape_(shape), type_flag_(type_flag) { -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = NULL; -#endif SetDLTensor(dev_mask, dev_id); } /*! @@ -135,9 +119,6 @@ class TBlob { shape_ = src.shape_; type_flag_ = mshadow::DataType::kFlag; SetDLTensor(Device::kDevMask, -1); -#if MKL_EXPERIMENTAL == 1 - Mkl_mem_ = NULL; -#endif return *this; } /*! @@ -172,11 +153,6 @@ class TBlob { CHECK(mshadow::DataType::kFlag == type_flag_) << "TBlob.get_with_shape: data type do not match specified type." << "Expected: " << type_flag_ << " v.s. 
given " << mshadow::DataType::kFlag; -#if MKL_EXPERIMENTAL == 1 - if (Mkl_mem_ != nullptr) { - Mkl_mem_->check_and_prv_to_cpu(dptr_); - } -#endif return mshadow::Tensor(static_cast(dptr_), shape_.FlatTo2D(), shape_[shape_.ndim() - 1], @@ -217,11 +193,6 @@ class TBlob { CHECK(mshadow::DataType::kFlag == type_flag_) << "TBlob.get_with_shape: data type do not match specified type." << "Expected: " << type_flag_ << " v.s. given " << mshadow::DataType::kFlag; -#if MKL_EXPERIMENTAL == 1 - if (Mkl_mem_ != nullptr) { - Mkl_mem_->check_and_prv_to_cpu(dptr_); - } -#endif return static_cast(dptr_); } /*! \brief device mask of the corresponding device */ diff --git a/prepare_mkldnn.sh b/prepare_mkldnn.sh new file mode 100755 index 000000000000..7cd7d6af0609 --- /dev/null +++ b/prepare_mkldnn.sh @@ -0,0 +1,118 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# set -ex +# +# All modification made by Intel Corporation: © 2016 Intel Corporation +# +# All contributions by the University of California: +# Copyright (c) 2014, 2015, The Regents of the University of California (Regents) +# All rights reserved. +# +# All other contributions: +# Copyright (c) 2014, 2015, the respective contributors +# All rights reserved. +# For the list of contributors go to https://github.com/BVLC/caffe/blob/master/CONTRIBUTORS.md +# +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, +# this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * Neither the name of Intel Corporation nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +# + +MXNET_ROOTDIR="$(pwd)" +MKLDNN_ROOTDIR="$MXNET_ROOTDIR/3rdparty/mkldnn/" +MKLDNN_SRCDIR="$MKLDNN_ROOTDIR/src" +MKLDNN_BUILDDIR="$MKLDNN_ROOTDIR/build" +MKLDNN_INSTALLDIR="$MKLDNN_ROOTDIR/install" +MKLDNN_LIBDIR="$MXNET_ROOTDIR/lib" + +# MKLDNN install destination +HOME_MKLDNN=$1 +if [ ! -z "$HOME_MKLDNN" ]; then + mkdir -p $HOME_MKLDNN + if [ ! -w $HOME_MKLDNN ]; then + echo "MKLDNN install to $HOME_MKLDNN failed, please try with sudo" >&2 + exit 1 + fi +fi + +if [ -z $MKLDNNROOT ]; then +if [ ! -f "$MKLDNN_INSTALLDIR/lib/libmkldnn.so" ]; then + mkdir -p $MKLDNN_INSTALLDIR + cd $MKLDNN_ROOTDIR + if [ -z $MKLROOT ] && [ ! -f $MKLDNN_INSTALLDIR/include/mkl_cblas.h ]; then + rm -rf external && cd scripts && ./prepare_mkl.sh && cd .. + cp -a external/*/* $MKLDNN_INSTALLDIR/. + fi + echo "Building MKLDNN ..." >&2 + cd $MXNET_ROOTDIR + g++ --version >&2 + if [ -z $ARCH_OPT ]; then + cmake $MKLDNN_ROOTDIR -DCMAKE_INSTALL_PREFIX=$MKLDNN_INSTALLDIR -B$MKLDNN_BUILDDIR + else + cmake $MKLDNN_ROOTDIR -DCMAKE_INSTALL_PREFIX=$MKLDNN_INSTALLDIR -B$MKLDNN_BUILDDIR -DARCH_OPT_FLAGS=$ARCH_OPT + fi + make -C $MKLDNN_BUILDDIR -j$(cat /proc/cpuinfo | grep processor | wc -l) VERBOSE=1 >&2 + make -C $MKLDNN_BUILDDIR install + rm -rf $MKLDNN_BUILDDIR + mkdir -p $MKLDNN_LIBDIR + cp $MKLDNN_INSTALLDIR/lib/* $MKLDNN_LIBDIR +fi +MKLDNNROOT=$MKLDNN_INSTALLDIR +fi + +if [ -z $MKLROOT ] && [ -f $MKLDNNROOT/include/mkl_cblas.h ]; then + MKLROOT=$MKLDNNROOT; +fi + +# user specified MKLDNN install folder +if [ -d "$HOME_MKLDNN" ]; then + # skip if user specificed MKLDNNROOT + [ "$MKLDNNROOT" != "$HOME_MKLDNN" ] && rsync -a $MKLDNNROOT/include $MKLDNNROOT/lib $HOME_MKLDNN/. + [ "$MKLROOT" != "$HOME_MKLDNN" ] && rsync -a $MKLROOT/include $MKLROOT/lib $HOME_MKLDNN/. + # update ldconfig if possible + if [ -w /etc/ld.so.conf.d ]; then + echo "$HOME_MKLDNN/lib" > /etc/ld.so.conf.d/mxnmkldnn.conf && ldconfig + fi +# return value to calling script (Makefile,cmake) + echo $HOME_MKLDNN $HOME_MKLDNN +else + echo $MKLDNNROOT $MKLROOT +fi + diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index 64619044862b..56f4b9c83e77 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -1287,6 +1287,10 @@ def check_consistency(sym, ctx_list, scale=1.0, grad_req='write', arr[:] = arg_params[name] for name, arr in exe.aux_dict.items(): arr[:] = aux_params[name] + # We need to initialize the gradient arrays if it's add. 
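+        # Otherwise the gradients would be accumulated on top of whatever
+        # stale values the arrays already hold.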
+ if (grad_req == "add"): + for arr in exe.grad_arrays: + arr[:] = np.zeros(arr.shape, dtype=arr.dtype) dtypes = [np.dtype(exe.outputs[0].dtype) for exe in exe_list] max_idx = np.argmax(dtypes) diff --git a/src/common/exec_utils.h b/src/common/exec_utils.h index dcd1504fb88e..5fd1a9b1d1b9 100644 --- a/src/common/exec_utils.h +++ b/src/common/exec_utils.h @@ -43,19 +43,61 @@ namespace common { indices are not recorded * \return true if any source NDArray need to cast storage */ -inline bool SetupDefaultBlobs(const std::vector& src, - std::vector *blobs, - std::vector *temp_src, - std::vector *temp_dst, - std::unordered_map *idx_map = nullptr) { +inline bool SetupDefaultBlobsIn(const std::vector& src, + const std::vector *bufs, + std::vector *blobs, + std::vector *temp_src, + std::vector *temp_dst, + std::unordered_map *idx_map) { bool require_cast = false; for (size_t i = 0; i < src.size(); i++) { auto& nd = src[i]; - if (nd.storage_type() != kDefaultStorage) { - if (idx_map != nullptr) { - (*idx_map)[i] = temp_dst->size(); - } - NDArray temp(nd.shape(), nd.ctx(), false, nd.dtype()); + bool is_default = nd.storage_type() == kDefaultStorage; +#if MXNET_USE_MKLDNN == 1 + // We have to make sure it's default storage and default layout. + is_default = nd.IsDefaultData(); +#endif + if (!is_default) { + (*idx_map)[i] = temp_dst->size(); + NDArray temp = bufs != nullptr ? bufs->at(i) : NDArray(nd.shape(), nd.ctx(), + true, nd.dtype()); +#if MXNET_USE_MKLDNN == 1 + CHECK(temp.IsDefaultData()); +#endif + temp_src->emplace_back(nd); + temp_dst->emplace_back(temp); + blobs->emplace_back(temp.data()); + require_cast = true; + } else { + blobs->push_back(nd.data()); + } + } + return require_cast; +} + +inline bool SetupDefaultBlobsOut(const std::vector& src, + const std::vector &req, + const std::vector *bufs, + std::vector *blobs, + std::vector *temp_src, + std::vector *temp_dst) { + bool require_cast = false; + for (size_t i = 0; i < src.size(); i++) { + auto& nd = src[i]; + bool is_default = nd.storage_type() == kDefaultStorage; +#if MXNET_USE_MKLDNN == 1 + // If it's writeTo, we don't need to worry whether it contains valid data. + if (req[i] == kWriteTo && is_default) + const_cast(nd).InvalidateMKLDNNData(); + // We have to make sure it's default storage and default layout. + is_default = nd.IsDefaultData(); +#endif + if (!is_default) { + NDArray temp = bufs != nullptr ? 
bufs->at(i) : NDArray(nd.shape(), nd.ctx(), + true, nd.dtype()); +#if MXNET_USE_MKLDNN == 1 + CHECK(temp.IsDefaultData()); +#endif temp_src->emplace_back(nd); temp_dst->emplace_back(temp); blobs->emplace_back(temp.data()); @@ -76,6 +118,9 @@ inline bool SetupDefaultBlobs(const std::vector& src, */ inline void SetupDefaultBlobsInOut(const std::vector &ndinputs, const std::vector &ndoutputs, + const std::vector &req, + const std::vector *in_bufs, + const std::vector *out_bufs, std::vector *input_blobs, std::vector *output_blobs, std::vector *pre_temp_src, @@ -85,9 +130,11 @@ inline void SetupDefaultBlobsInOut(const std::vector &ndinputs, std::unordered_map *in_temp_idx_map, const std::vector &mutate_idx) { // populate input blobs - SetupDefaultBlobs(ndinputs, input_blobs, pre_temp_src, pre_temp_dst, in_temp_idx_map); + SetupDefaultBlobsIn(ndinputs, in_bufs, input_blobs, pre_temp_src, pre_temp_dst, + in_temp_idx_map); // populate output blobs - SetupDefaultBlobs(ndoutputs, output_blobs, post_temp_dst, post_temp_src); + SetupDefaultBlobsOut(ndoutputs, req, out_bufs, output_blobs, post_temp_dst, + post_temp_src); // add mutable inputs to post temp list for (const auto idx : mutate_idx) { auto map_iter = in_temp_idx_map->find(idx); diff --git a/src/executor/attach_op_execs_pass.cc b/src/executor/attach_op_execs_pass.cc index 1bcc40a894dd..e4d49554620f 100644 --- a/src/executor/attach_op_execs_pass.cc +++ b/src/executor/attach_op_execs_pass.cc @@ -30,11 +30,8 @@ #include "../common/utils.h" #include "../common/exec_utils.h" #include "./exec_pass.h" -#if MXNET_USE_MKL2017 == 1 -#include -#include "../operator/mkl/mkl_memory-inl.h" -#include "../operator/mkl/mkl_util-inl.h" -#endif +#include "../operator/nn/mkldnn/mkldnn_base-inl.h" + namespace mxnet { namespace op { @@ -58,23 +55,34 @@ class StorageFallbackOpExecutor : public OpExecutor { protected: // initialize the data blobs void InitBlobs() { - using namespace common; if (!init_) { - in_data_.clear(); out_data_.clear(); - pre_temp_src_.clear(); pre_temp_dst_.clear(); - post_temp_src_.clear(); post_temp_dst_.clear(); - in_temp_idx_map_.clear(); - SetupDefaultBlobsInOut(in_array, out_array, &in_data_, &out_data_, - &pre_temp_src_, &pre_temp_dst_, - &post_temp_src_, &post_temp_dst_, - &in_temp_idx_map_, mutate_idx_); + pre_temp_buf_.clear(); + post_temp_buf_.clear(); + for (size_t i = 0; i < in_array.size(); i++) { + auto &nd = in_array[i]; + pre_temp_buf_.emplace_back(nd.shape(), nd.ctx(), true, nd.dtype()); + } + for (size_t i = 0; i < out_array.size(); i++) { + auto &nd = out_array[i]; + post_temp_buf_.emplace_back(nd.shape(), nd.ctx(), true, nd.dtype()); + } init_ = true; } } // storage fallback before fcompute is launched void PreFCompute(bool is_gpu) { + using namespace common; InitBlobs(); + in_data_.clear(); out_data_.clear(); + pre_temp_src_.clear(); pre_temp_dst_.clear(); + post_temp_src_.clear(); post_temp_dst_.clear(); + in_temp_idx_map_.clear(); + SetupDefaultBlobsInOut(in_array, out_array, req, &pre_temp_buf_, &post_temp_buf_, + &in_data_, &out_data_, + &pre_temp_src_, &pre_temp_dst_, + &post_temp_src_, &post_temp_dst_, + &in_temp_idx_map_, mutate_idx_); common::CastNonDefaultStorage(pre_temp_src_, pre_temp_dst_, op_ctx, is_gpu); } @@ -85,6 +93,8 @@ class StorageFallbackOpExecutor : public OpExecutor { // default storage tensor blobs for fcompute std::vector in_data_, out_data_; + // These are NDArray buffers for cast storage. 
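+  // They are allocated once in InitBlobs() and then reused on every
+  // PreFCompute() call.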
+ std::vector pre_temp_buf_, post_temp_buf_; // source NDArray for cast storage std::vector pre_temp_src_, post_temp_src_; // destination NDArray for cast storage @@ -106,10 +116,6 @@ class StatefulComputeExecutor : public StorageFallbackOpExecutor { PreFCompute(is_gpu); fcompute_(state_, op_ctx, in_data_, req, out_data_); PostFCompute(is_gpu); -#if MKL_EXPERIMENTAL == 1 - mkl_tblobs_prv_to_cpu(in_data_); - mkl_tblobs_prv_to_cpu(out_data_); -#endif } ExecType exec_type() const override { @@ -175,10 +181,6 @@ class FComputeExecutor : public StorageFallbackOpExecutor { PreFCompute(is_gpu); fcompute_(attrs_, op_ctx, in_data_, req, out_data_); PostFCompute(is_gpu); -#if MKL_EXPERIMENTAL == 1 - mkl_tblobs_prv_to_cpu(in_data_); - mkl_tblobs_prv_to_cpu(out_data_); -#endif } ExecType exec_type() const override { @@ -202,6 +204,9 @@ class FComputeExExecutor : public OpExecutor { public: void Run(RunContext rctx, bool is_gpu) override { op_ctx.run_ctx = rctx; +#if MXNET_USE_MKLDNN == 1 + InvalidateOutputs(out_array, req); +#endif fcompute_(attrs_, op_ctx, in_array, req, out_array); } diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc index 2a7d2b906684..f685370619f2 100644 --- a/src/executor/graph_executor.cc +++ b/src/executor/graph_executor.cc @@ -1209,7 +1209,8 @@ void GraphExecutor::InitDataEntryMemory(std::vector* shared_pool) { const NDArray& src = data_pool_.at(storage_id); data_entry_[i] = src.AsArray(vshape[i], vdtype[i]); } else { - data_entry_[i] = NDArray(storage_type, vshape[i], data_context[i]); + data_entry_[i] = NDArray(storage_type, vshape[i], data_context[i], + true, vdtype[i]); } if (log_verbose_) { LOG(INFO) << "\tinit data entry\t" << i << "\tas " << common::stype_string(storage_type); diff --git a/src/executor/infer_graph_attr_pass.cc b/src/executor/infer_graph_attr_pass.cc index 73a34c8b0f0d..01fab2240952 100644 --- a/src/executor/infer_graph_attr_pass.cc +++ b/src/executor/infer_graph_attr_pass.cc @@ -423,11 +423,6 @@ nnvm::Graph InferStorageType(nnvm::Graph&& graph, DispatchModeVector dispatch_modes(graph.indexed_graph().num_nodes(), DispatchMode::kUndefined); graph.attrs["dispatch_mode"] = std::make_shared(std::move(dispatch_modes)); } - // initialize unknown values for dispatch modes - if (graph.attrs.count("dispatch_mode") == 0) { - DispatchModeVector dispatch_modes(graph.indexed_graph().num_nodes(), DispatchMode::kUndefined); - graph.attrs["dispatch_mode"] = std::make_shared(std::move(dispatch_modes)); - } // initialize the dev_mask vector from the context vector if (graph.attrs.count("dev_mask") == 0) { CHECK_GT(graph.attrs.count("context"), 0); diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc index eaa95a5f2418..93a8bc6c54b2 100644 --- a/src/imperative/cached_op.cc +++ b/src/imperative/cached_op.cc @@ -214,6 +214,12 @@ nnvm::Graph Imperative::CachedOp::GetForwardGraph( StorageVector storage(idx.num_node_entries(), exec::kBadStorageID); for (const auto i : idx.input_nodes()) storage[idx.entry_id(i, 0)] = exec::kExternalStorageID; + const auto& stypes = g.GetAttr("storage_type"); + CHECK_EQ(stypes.size(), storage.size()); + for (size_t i = 0; i < stypes.size(); i++) { + if (stypes[i] != kDefaultStorage) + storage[i] = exec::kDynamicStorageID; + } auto mem_plan = PlanMemory( &g, std::move(storage), g.GetAttr >( @@ -320,6 +326,10 @@ nnvm::Graph Imperative::CachedOp::GetBackwardGraph( for (size_t i = 0; i < num_forward_entries; ++i) storage[i] = exec::kExternalStorageID; for (const auto i : idx.input_nodes()) 
storage[idx.entry_id(i, 0)] = exec::kExternalStorageID; for (const auto i : idx.outputs()) storage[idx.entry_id(i)] = exec::kExternalStorageID; + for (size_t i = 0; i < stypes.size(); i++) { + if (stypes[i] != kDefaultStorage) + storage[i] = exec::kDynamicStorageID; + } auto mem_plan = PlanMemory( &g, std::move(storage), g.GetAttr >("backward_ref_count"), diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index fc28f50103b0..966a753dc120 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -362,9 +362,9 @@ inline void PushFCompute(const FCompute& fn, // mapping from index in input_blobs to index in pre_temp_dst std::unordered_map in_temp_idx_map; // setup blobs - SetupDefaultBlobsInOut(inputs, outputs, &input_blobs, &output_blobs, - &pre_temp_src, &pre_temp_dst, &post_temp_src, - &post_temp_dst, &in_temp_idx_map, mutate_idx); + SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr, + &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst, + &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx); // setup context OpContext opctx{is_train, rctx, engine::CallbackOnComplete(), requested}; bool is_gpu = ctx.dev_mask() == gpu::kDevMask; @@ -460,9 +460,9 @@ inline void PushOperator(const OpStatePtr& state, // mapping from index in input_blobs to index in pre_temp_dst std::unordered_map in_temp_idx_map; // populate input blobs and output blobs - SetupDefaultBlobsInOut(inputs, outputs, &input_blobs, &output_blobs, - &pre_temp_src, &pre_temp_dst, &post_temp_src, &post_temp_dst, - &in_temp_idx_map, mutate_idx); + SetupDefaultBlobsInOut(inputs, outputs, req, nullptr, nullptr, + &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst, + &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx); // setup contexts bool is_gpu = rctx.get_ctx().dev_mask() == gpu::kDevMask; // pre-fcompute fallback @@ -607,6 +607,7 @@ inline bool CheckAndInferStorageType(nnvm::Graph* p_g, exec::DevMaskVector&& dev } if (match) return true; } + g.attrs.erase("dispatch_mode"); g.attrs.erase("storage_type"); g.attrs.erase("storage_type_inputs"); if (node_range.second > node_range.first) { diff --git a/src/kvstore/kvstore_dist.h b/src/kvstore/kvstore_dist.h index b00d0de935f7..d0a968154afb 100644 --- a/src/kvstore/kvstore_dist.h +++ b/src/kvstore/kvstore_dist.h @@ -32,11 +32,6 @@ #include "mxnet/engine.h" #include "ps/ps.h" #include "./kvstore_dist_server.h" -#if MKL_EXPERIMENTAL == 1 -#include -#include "../operator/mkl/mkl_memory-inl.h" -#include "../operator/mkl/mkl_util-inl.h" -#endif namespace mxnet { namespace kvstore { @@ -228,9 +223,6 @@ class KVStoreDist : public KVStoreLocal { PSKV& pskv = (gradient_compression_->get_type() == CompressionType::kNone) ? EncodeDefaultKey(key, size, false) : EncodeCompressedKey(key, size, false); -#if MKL_EXPERIMENTAL == 1 - mkl_set_tblob_eager_mode(recv_buf.data()); -#endif real_t* data = recv_buf.data().dptr(); // false means not to delete data when SArray is deleted auto vals = new ps::SArray(data, size, false); @@ -380,9 +372,6 @@ class KVStoreDist : public KVStoreLocal { [this, key, pskv, small_buf](RunContext rctx, Engine::CallbackOnComplete cb) { size_t size = small_buf.shape().Size(); real_t* data = small_buf.data().dptr(); -#if MKL_EXPERIMENTAL == 1 - mkl_set_tblob_eager_mode(small_buf.data()); -#endif // do push. 
false means no delete ps::SArray vals(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPush( @@ -407,9 +396,6 @@ class KVStoreDist : public KVStoreLocal { // convert to ps keys size_t size = send_buf.shape().Size(); real_t* data = send_buf.data().dptr(); -#if MKL_EXPERIMENTAL == 1 - mkl_set_tblob_eager_mode(send_buf.data()); -#endif // do push. false means no delete ps::SArray vals(data, size, false); CHECK_NOTNULL(ps_worker_)->ZPush( @@ -431,9 +417,6 @@ class KVStoreDist : public KVStoreLocal { using namespace rowsparse; auto push_to_servers = [this, key, send_buf] (RunContext rctx, Engine::CallbackOnComplete cb) { -#if MKL_EXPERIMENTAL == 1 - mkl_set_tblob_eager_mode(send_buf.data()); -#endif real_t* data = send_buf.data().dptr(); const int64_t num_rows = send_buf.aux_shape(kIdx)[0]; const auto offsets = send_buf.aux_data(kIdx).dptr(); @@ -472,9 +455,6 @@ class KVStoreDist : public KVStoreLocal { // allocate memory for the buffer size_t num_rows = indices.shape().Size(); recv_buf.CheckAndAlloc({mshadow::Shape1(num_rows)}); -#if MKL_EXPERIMENTAL == 1 - mkl_set_tblob_eager_mode(recv_buf.data()); -#endif real_t* data = recv_buf.data().dptr(); const auto offsets = indices.data().dptr(); const auto unit_len = recv_buf.shape().ProdShape(1, recv_buf.shape().ndim()); diff --git a/src/ndarray/ndarray.cc b/src/ndarray/ndarray.cc index 4db314f9cf4b..ae7209e272b0 100644 --- a/src/ndarray/ndarray.cc +++ b/src/ndarray/ndarray.cc @@ -31,10 +31,14 @@ #include #include #include +#if MXNET_USE_MKLDNN == 1 +#include +#endif #include "./ndarray_function.h" #include "../common/utils.h" #include "../operator/tensor/matrix_op-inl.h" #include "../operator/tensor/init_op.h" +#include "../operator/nn/mkldnn/mkldnn_base-inl.h" #if MXNET_USE_OPENCV #include @@ -46,6 +50,104 @@ DMLC_REGISTRY_ENABLE(::mxnet::NDArrayFunctionReg); namespace mxnet { +NDArray::NDArray(const NDArrayStorageType stype, const TShape &shape, Context ctx, + bool delay_alloc, int dtype, std::vector aux_types, + std::vector aux_shapes, TShape storage_shape) : shape_(shape), + dtype_(dtype), storage_type_(stype), entry_({nullptr, 0, 0}) { + // Assign default aux types if not given + if (aux_types.size() == 0 + && stype != kDefaultStorage) { + if (stype == kRowSparseStorage) { + aux_types = {mshadow::kInt64}; + } else if (stype == kCSRStorage) { + aux_types = {mshadow::kInt64, mshadow::kInt64}; + } else { + LOG(FATAL) << "Unknown storage type " << stype; + } + } + // Assign default shapes if not given + // unknown shapes are intialized as {0} such that Size() would return 0 + if (aux_shapes.size() == 0 + && stype != kDefaultStorage) { + if (stype == kRowSparseStorage) { + aux_shapes = {TShape(mshadow::Shape1(0))}; + } else if (stype == kCSRStorage) { + // aux shapes for indptr and indices + aux_shapes = {TShape(mshadow::Shape1(0)), TShape(mshadow::Shape1(0))}; + } else { + LOG(FATAL) << "Unknown storage type " << stype; + } + } + if (storage_shape.Size() == 0 + && stype != kDefaultStorage) { + if (stype == kRowSparseStorage) { + storage_shape = shape; + storage_shape[0] = aux_shapes[rowsparse::kIdx][0]; + } else if (stype == kCSRStorage) { + storage_shape = aux_shapes[csr::kIdx]; + } else { + LOG(FATAL) << "Unknown storage type " << stype; + } + } + if (stype == kDefaultStorage) + ptr_ = std::make_shared(shape, ctx, delay_alloc, dtype); + else + ptr_ = std::make_shared(stype, storage_shape, ctx, delay_alloc, + dtype, aux_types, aux_shapes); +} + +struct ChunkMem { + Storage::Handle h; + std::vector aux_h; +#if MXNET_USE_MKLDNN == 1 + 
std::shared_ptr mem; +#endif +}; + +NDArray::Chunk::~Chunk() { + bool skip_free = static_data || delay_alloc; + ChunkMem mem; + mem.h = this->shandle; + mem.aux_h = this->aux_handles; +#if MXNET_USE_MKLDNN == 1 + // We want to delete mkldnn memory after deleting the variable. + mem.mem = this->mkl_mem_; +#endif + Engine::Get()->DeleteVariable([mem, skip_free](RunContext s) { + if (skip_free == false) { +#if MXNET_USE_MKLDNN == 1 + if (mem.mem) { + CHECK_LE(mem.mem->get_primitive_desc().get_size(), mem.h.size); + CHECK_EQ(mem.mem->get_data_handle(), mem.h.dptr); + } +#endif + if (mem.h.size > 0) Storage::Get()->Free(mem.h); + for (size_t i = 0; i < mem.aux_h.size(); i++) { + if (mem.aux_h[i].size > 0) Storage::Get()->Free(mem.aux_h[i]); + } + } + }, shandle.ctx, var); +} + +void NDArray::Chunk::CheckAndAllocData(const TShape &shape, int dtype) { + CHECK_NE(aux_shapes.size(), 0) + << "data is expected to be allocated after aux_data"; + auto dbytes = shape.Size() * mshadow::mshadow_sizeof(dtype); + if (shandle.size < dbytes) { + // free storage if necessary and alloc again + if (shandle.size > 0) Storage::Get()->Free(shandle); + // init storage + shandle = Storage::Get()->Alloc(dbytes, ctx); +#if MXNET_USE_MKLDNN == 1 + mkl_mem_ = nullptr; +#endif + } + // init shape + storage_shape = shape; + // delay_alloc is only set when data storage handle is present + delay_alloc = false; +} + NDArray NDArray::grad() const { if (Imperative::AGInfo::IsNone(*this)) return NDArray(); Imperative::AGInfo& info = Imperative::AGInfo::Get(entry_.node); @@ -64,15 +166,55 @@ nnvm::Symbol NDArray::get_autograd_symbol() const { return ret; } +#if MXNET_USE_MKLDNN == 1 + +NDArray NDArray::MKLDNNDataReshape(const TShape &shape) const { + CHECK(!is_none()) << "NDArray is not initialized"; + CHECK_GE(shape_.Size(), shape.Size()) + << "NDArray.Reshape: target shape size is larger current shape"; + CHECK_EQ(storage_type(), kDefaultStorage); + if (!IsMKLDNNData()) { + NDArray ret = this->Detach(); + ret.shape_ = shape; + return ret; + } else { + NDArray ret(shape, ctx(), true, dtype()); + // We shouldn't submit the reorder primitive here because submit will + // be called in operators. + auto format = GetDefaultFormat(ptr_->mkl_mem_->get_primitive_desc().desc()); + CHECK_NE(format, ptr_->mkl_mem_->get_primitive_desc().desc().data.format); + auto def_pd = GetPrimitiveDesc(ptr_->mkl_mem_->get_primitive_desc(), format); + auto def_mem = TmpMemMgr::Get()->Alloc(def_pd); + MKLDNNStream *stream = MKLDNNStream::Get(); + stream->RegisterMem(ptr_->mkl_mem_); + stream->RegisterPrim(mkldnn::reorder(*ptr_->mkl_mem_, *def_mem)); + // def_mem points to a memory region in the temp space. It's only valid + // inside an operator. As such, the returned NDArray can only be valid + // inside an operator and the shared point doesn't need to do anything + // when it's destroyed. 
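+    // (Hence the no-op deleter below: the buffer belongs to TmpMemMgr,
+    // not to this NDArray.)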
+ ret.ptr_->mkl_mem_ = std::shared_ptr(def_mem, + [](mkldnn::memory *mem){}); + ret.ptr_->shandle.dptr = def_mem->get_data_handle(); + ret.ptr_->shandle.size = def_mem->get_primitive_desc().get_size(); + ret.ptr_->delay_alloc = false; + ret.ptr_->static_data = true; + ret.byte_offset_ = byte_offset_; + return ret; + } +} + +#endif + NDArray NDArray::Reshape(const TShape &shape) const { CHECK(!is_none()) << "NDArray is not initialized"; - auto stype = storage_type(); - // reshape is not supported for non-default ndarray with dismatching shapes - CHECK((shape_ == shape) || stype == kDefaultStorage) - << "Reshape for storage type " << stype << " is not implemented yet"; CHECK_GE(shape_.Size(), shape.Size()) << "NDArray.Reshape: target shape size is larger current shape"; NDArray ret = this->Detach(); + // If the shape doesn't change, we can just return it now. + if (ret.shape_ == shape) + return ret; + // Otherwise, reshape only works on the default layout. + CHECK_EQ(storage_type(), kDefaultStorage); ret.shape_ = shape; return ret; } @@ -95,7 +237,6 @@ NDArray NDArray::ReshapeWithRecord(const TShape &shape) { return ret; } - NDArray NDArray::Slice(index_t begin, index_t end) const { CHECK(!is_none()) << "NDArray is empty"; CHECK_LE(begin, end) @@ -127,8 +268,8 @@ NDArray NDArray::SliceWithRecord(index_t begin, index_t end) { } NDArray NDArray::At(index_t idx) const { - CHECK(storage_type() == kDefaultStorage) << "Storage type " - << storage_type() << " doesn't support At()"; + CHECK(storage_type() == kDefaultStorage) + << "Storage type " << storage_type() << " doesn't support At()"; NDArray ret = this->Slice(idx, idx+1); if (shape_.ndim() > 1) { return ret.Reshape(TShape(shape_.data()+1, shape_.data()+shape_.ndim())); @@ -181,6 +322,400 @@ void NDArray::set_fresh_out_grad(bool state) const { info.fresh_out_grad = state; } +#if MXNET_USE_MKLDNN == 1 +static inline bool same_shape(const TShape &shape, mkldnn_dims_t dims, int ndims) { + if (shape.ndim() != (size_t)ndims) + return false; + for (int i = 0; i < ndims; i++) + if (shape[i] != dims[i]) + return false; + return true; +} + +static inline bool same_shape(const TShape &shape, int dtype, mkldnn::memory::desc desc) { + return same_shape(shape, desc.data.dims, desc.data.ndims) + && get_mkldnn_type(dtype) == desc.data.data_type; +} + +bool NDArray::Chunk::IsMKLDNN() const { + if (storage_type != kDefaultStorage) + return false; + if (mkl_mem_ == nullptr) + return false; + auto desc = mkl_mem_->get_primitive_desc().desc(); + return desc.data.format != GetDefaultFormat(desc); +} + +bool NDArray::Chunk::IsDefault() const { + if (storage_type != kDefaultStorage) + return false; + // If we don't have mkldnn memory yet, we just assume it's not the default + // format. + if (mkl_mem_ == nullptr) + return true; + auto desc = mkl_mem_->get_primitive_desc().desc(); + return desc.data.format == GetDefaultFormat(desc); +} + +void NDArray::Chunk::Reorder2Default() { + if (mkl_mem_ == nullptr) + return; + + auto format = GetDefaultFormat(mkl_mem_->get_primitive_desc().desc()); + CHECK(format != mkl_mem_->get_primitive_desc().desc().data.format); + + auto def_pd = GetPrimitiveDesc(mkl_mem_->get_primitive_desc(), format); + mkldnn_mem_ptr def_mem(new mkldnn::memory(def_pd)); + // This may be called in MKLDNN operators. We can't use MKLDNNStream here. 
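+  // Instead, build and submit a standalone eager stream so the reorder has
+  // finished before the result is copied back into shandle.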
+ std::vector net; + net.push_back(mkldnn::reorder(*mkl_mem_, *def_mem)); + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + + CHECK(shandle.size >= def_pd.get_size()); + CheckAndAlloc(def_pd.get_size()); + // TODO(zhengda) We need to avoid memory copy here. + memcpy(shandle.dptr, def_mem->get_data_handle(), def_pd.get_size()); + mkl_mem_.reset(new mkldnn::memory(def_pd, shandle.dptr)); +} + +void NDArray::Chunk::SetMKLMem(const TShape &shape, int dtype) { + // The shape of the array and the one of the MKL memory may mismatch. + // For example, if the array stores parameters, the MKL memory may store data + // in 5 dimensions while the NDArray stores data in 4 dimensions. + if (mkl_mem_ && mkl_mem_->get_data_handle() == shandle.dptr + && same_shape(shape, dtype, mkl_mem_->get_primitive_desc().desc())) { + return; + } + + mkldnn::memory::dims dims; + // These are shapes supprted by MKLDNN. + if (shape.ndim() == 1 || shape.ndim() == 2 || shape.ndim() == 4 + || shape.ndim() == 5) { + dims.resize(shape.ndim()); + for (size_t i = 0; i < dims.size(); i++) + dims[i] = shape[i]; + } else if (shape.ndim() == 3) { + // If there are 3 dimensions, we'll force it to 4 dimensions. + dims.resize(shape.ndim() + 1); + dims[0] = 1; + for (size_t i = 0; i < shape.ndim(); i++) + dims[i + 1] = shape[i]; + } else { + LOG(FATAL) << "MKLDNN doesn't support " << shape.ndim() << " dimensions"; + } + mkldnn::memory::format layout = mkldnn::memory::format::format_undef; + switch (dims.size()) { + case 1: layout = mkldnn::memory::format::x; break; + case 2: layout = mkldnn::memory::format::nc; break; + case 4: layout = mkldnn::memory::format::nchw; break; + // This isn't the right layout when the data has 5 dimensions in MXNet. + // MXNet interprets 5 dimensions as ncdhw, but MKLDNN doesn't have + // a corresponding format. + case 5: layout = mkldnn::memory::format::goihw; break; + } + mkldnn::memory::desc data_md{dims, get_mkldnn_type(dtype), layout}; + auto cpu_engine = CpuEngine::Get()->get_engine(); + if (shandle.dptr == nullptr) { + CHECK(delay_alloc); + CheckAndAlloc(); + } + mkldnn::memory::primitive_desc pd(data_md, cpu_engine); + CHECK(shandle.size >= pd.get_size()); + mkl_mem_.reset(new mkldnn::memory(pd, shandle.dptr)); +} + +/* + * Here we want to get MKLDNN memory whose primitive desc is exactly the same as + * the given one. operator== can't guarantee that. == can return true even if + * the formats are different. I need to double check its format. + */ +static inline mkldnn::memory *GetMKLDNNExact( + const mkldnn::memory *mem, mkldnn::memory::primitive_desc desc) { + auto src_desc = mem->get_primitive_desc(); + if (desc == src_desc && desc.desc().data.format == src_desc.desc().data.format) { + return const_cast(mem); + } else { + std::shared_ptr ret(new mkldnn::memory( + desc, mem->get_data_handle())); + MKLDNNStream::Get()->RegisterMem(ret); + return ret.get(); + } +} + +const mkldnn::memory *NDArray::GetMKLDNNData( + const mkldnn::memory::primitive_desc &desc) const { + if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { + LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; + return nullptr; + } + auto mem = GetMKLDNNData(); + mkldnn::memory::primitive_desc _desc = desc; + auto desc1 = mem->get_primitive_desc().desc(); + auto desc2 = _desc.desc(); + // The MKL memory has the same format and shape as required, + // or both use the default format, we can return the MKL memory. 
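+  // Otherwise return nullptr; callers that need a specific layout can use
+  // GetMKLDNNDataReorder() instead.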
+ if (mem->get_primitive_desc() == desc + || (desc1.data.format == GetDefaultFormat(desc1) + && desc2.data.format == GetDefaultFormat(desc2))) { + return GetMKLDNNExact(ptr_->mkl_mem_.get(), desc); + } else { + return nullptr; + } +} + +const mkldnn::memory *NDArray::GetMKLDNNDataReorder( + const mkldnn::memory::primitive_desc &desc) const { + if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { + LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; + return nullptr; + } + CHECK(storage_type() == kDefaultStorage); + + auto mem = GetMKLDNNData(); + // If the memory descriptor matches, it's easy. + MKLDNNStream *stream = MKLDNNStream::Get(); + if (mem->get_primitive_desc() == desc) { + return GetMKLDNNExact(mem, desc); + } + + mkldnn::memory::primitive_desc _desc = desc; + // Now we need to determine if we should reorder the memory. + // If both use the default formats, we think we don't need to reorder. + auto desc1 = mem->get_primitive_desc().desc(); + auto desc2 = _desc.desc(); + if (desc1.data.format == GetDefaultFormat(desc1) && + desc2.data.format == GetDefaultFormat(desc2)) { + mkldnn_mem_ptr ret(new mkldnn::memory(desc, mem->get_data_handle())); + stream->RegisterMem(ret); + return ret.get(); + } else { + auto ret = TmpMemMgr::Get()->Alloc(desc); + stream->RegisterPrim(mkldnn::reorder(*mem, *ret)); + return ret; + } +} + +const mkldnn::memory *NDArray::GetMKLDNNData() const { + CHECK(storage_type() == kDefaultStorage); + // If this array uses MKLDNN layout and it's a view, we have to change its + // layout to the default layout. + if (IsMKLDNNData() && IsView()) + ptr_->Reorder2Default(); + ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_, dtype_); + // If shandle has data, the data in shandle and mkl_mem_ should match. + if (ptr_->shandle.dptr) + CHECK(ptr_->shandle.dptr == ptr_->mkl_mem_->get_data_handle()); + MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_); + auto pd = ptr_->mkl_mem_->get_primitive_desc(); + if (IsView()) { + // Sliced array must use the default layout. + CHECK_EQ(GetDefaultFormat(pd.desc()), pd.desc().data.format); + } + if (IsView()) { + void *off_addr = static_cast(ptr_->mkl_mem_->get_data_handle()) + + byte_offset_; + + // Create the primitive desc for the new mkldnn memory. + mkldnn::memory::dims dims(shape().ndim()); + for (size_t i = 0; i < dims.size(); i++) + dims[i] = shape()[i]; + mkldnn::memory::format cpp_format = static_cast( + GetDefaultFormat(shape().ndim())); + mkldnn::memory::data_type cpp_type = static_cast( + pd.desc().data.data_type); + mkldnn::memory::desc data_md(dims, cpp_type, cpp_format); + mkldnn::memory::primitive_desc new_pd(data_md, pd.get_engine()); + + std::shared_ptr ret(new mkldnn::memory(new_pd, off_addr)); + MKLDNNStream::Get()->RegisterMem(ret); + return ret.get(); + } else { + return ptr_->mkl_mem_.get(); + } +} + +void NDArray::MKLDNNDataReorder(const mkldnn::memory::primitive_desc &pd) { + CHECK_EQ(storage_type(), kDefaultStorage); + // If the memory already uses the specified layout, don't do anything. + if (ptr_->mkl_mem_ != nullptr && ptr_->mkl_mem_->get_primitive_desc() == pd) + return; + auto _pd = pd; + auto _desc = _pd.desc(); + auto def_format = GetDefaultFormat(_desc); + // If the memory is default, don't do anything. + if (def_format == _desc.data.format && ptr_->IsDefault()) + return; + // If the specified layout is default, we should use Reorder2Default. 
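+  // (this can only happen if the stored memory is currently in a non-default MKLDNN layout).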
+ if (def_format == _desc.data.format) { + ptr_->Reorder2Default(); + return; + } + + std::shared_ptr new_mem(new mkldnn::memory(pd)); + ptr_->SetMKLMem(shape_, dtype_); + auto old_mem = ptr_->mkl_mem_; + // It's possible that the specified layout has a different number of dimensions. + if (old_mem->get_primitive_desc().desc().data.ndims != _desc.data.ndims) { + // For now, we only support reorder from the default layout. + CHECK(ptr_->IsDefault()); + auto def_pd = GetPrimitiveDesc(pd, def_format); + old_mem.reset(new mkldnn::memory(def_pd, old_mem->get_data_handle())); + } + // This may be called in MKLDNN operators. We can't use MKLDNNStream here. + std::vector net; + net.push_back(mkldnn::reorder(*old_mem, *new_mem)); + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + + CHECK(ptr_->shandle.size >= pd.get_size()); + ptr_->CheckAndAlloc(pd.get_size()); + // TODO(zhengda) We need to avoid memory copy here. + memcpy(ptr_->shandle.dptr, new_mem->get_data_handle(), pd.get_size()); + ptr_->mkl_mem_.reset(new mkldnn::memory(pd, ptr_->shandle.dptr)); +} + +void NDArray::CopyFrom(const mkldnn::memory &mem) { + CHECK(ptr_ != nullptr) << "The NDArray hasn't been initialized"; + if (ptr_->mkl_mem_.get() == &mem) + return; + + CHECK(mem.get_primitive_desc().get_size() == shape().Size() * GetTypeSize(dtype_)) + << "The size of NDArray doesn't match the requested MKLDNN memory desc"; + MKLDNNStream *stream = MKLDNNStream::Get(); + // If this array uses MKLDNN layout and it's a view, we have to change its + // layout to the default layout. + if (IsMKLDNNData() && IsView()) + ptr_->Reorder2Default(); + ptr_->SetMKLMem(IsView() ? ptr_->storage_shape : shape_, + dtype_); + stream->RegisterMem(ptr_->mkl_mem_); + auto from_desc = mem.get_primitive_desc().desc(); + auto this_desc = ptr_->mkl_mem_->get_primitive_desc().desc(); + auto from_def_format = GetDefaultFormat(from_desc); + if (IsView()) { + // Sliced array must use the default layout. + CHECK_EQ(GetDefaultFormat(this_desc), this_desc.data.format); + } + // It's possible that the memory and the NDArray don't have the same shape. + if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims) + // If the source memory uses the default layout, we can reshape directly. + && from_def_format == from_desc.data.format) { + // In this case, we can simply create a new MKLDNN memory for the required + // shape. + mkldnn::memory::dims dims(this_desc.data.dims, + this_desc.data.dims + this_desc.data.ndims); + auto this_dtype = static_cast(this_desc.data.data_type); + auto this_format = static_cast(GetDefaultFormat(this_desc)); + mkldnn::memory::desc data_md(dims, this_dtype, this_format); + mkldnn::memory::primitive_desc pd(data_md, mem.get_primitive_desc().get_engine()); + mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle())); + stream->RegisterMem(tmp_mem); + stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->mkl_mem_)); + } else if (!same_shape(shape_, from_desc.data.dims, from_desc.data.ndims)) { + // In this case, the source memory stores data in a customized layout. We + // need to reorganize the data in memory before we can reshape. 
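+    // First reorder the source into its default layout in a temporary buffer, then
+    // view that buffer with this array's shape and reorder it into mkl_mem_.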
+ auto def_pd = GetPrimitiveDesc(mem.get_primitive_desc(), from_def_format); + auto def_mem = TmpMemMgr::Get()->Alloc(def_pd); + stream->RegisterPrim(mkldnn::reorder(mem, *def_mem)); + // Now we can reshape it + mkldnn::memory::dims dims(this_desc.data.dims, + this_desc.data.dims + this_desc.data.ndims); + auto this_dtype = static_cast(this_desc.data.data_type); + auto this_format = static_cast(GetDefaultFormat(this_desc)); + mkldnn::memory::desc data_md(dims, this_dtype, this_format); + mkldnn::memory::primitive_desc pd(data_md, mem.get_primitive_desc().get_engine()); + mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, def_mem->get_data_handle())); + stream->RegisterMem(tmp_mem); + stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->mkl_mem_)); + } else if (mem.get_primitive_desc() == ptr_->mkl_mem_->get_primitive_desc()) { + // If the layout is the same, we can just copy data. + stream->RegisterPrim(mkldnn::reorder(mem, *ptr_->mkl_mem_)); + } else { + auto src_def = GetDefaultFormat(mem.get_primitive_desc().desc()); + auto dst_def = GetDefaultFormat(ptr_->mkl_mem_->get_primitive_desc().desc()); + // If both are not using the default layouts. There isn't much we can do, + // other than reorder data layout directly. + if (dst_def != ptr_->mkl_mem_->get_primitive_desc().desc().data.format + && src_def != mem.get_primitive_desc().desc().data.format) { + stream->RegisterPrim(mkldnn::reorder(mem, *ptr_->mkl_mem_)); + } else if (dst_def == ptr_->mkl_mem_->get_primitive_desc().desc().data.format) { + // If the dest mem uses the default memory layout, we can simply use + // the default format of the source memory to improve perf of reorder. + auto pd = GetPrimitiveDesc(ptr_->mkl_mem_->get_primitive_desc(), src_def); + mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, ptr_->mkl_mem_->get_data_handle())); + stream->RegisterMem(tmp_mem); + stream->RegisterPrim(mkldnn::reorder(mem, *tmp_mem)); + } else { + // If the src mem uses the default memory layout, we can use + // the default format of the source memory to improve perf. + auto pd = GetPrimitiveDesc(mem.get_primitive_desc(), dst_def); + mkldnn_mem_ptr tmp_mem(new mkldnn::memory(pd, mem.get_data_handle())); + stream->RegisterMem(tmp_mem); + stream->RegisterPrim(mkldnn::reorder(*tmp_mem, *ptr_->mkl_mem_)); + } + } +} +mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd, + mkldnn_memory_format_t format); + +mkldnn::memory *NDArray::CreateMKLDNNData(const mkldnn::memory::primitive_desc &desc) { + // This array shouldn't be a view. + CHECK(!IsView()); + + if (desc.get_size() != shape().Size() * GetTypeSize(dtype_)) { + LOG(FATAL) << "The size of NDArray doesn't match the requested MKLDNN memory desc"; + return nullptr; + } + + mkldnn::memory::primitive_desc _desc = desc; + auto required_format = _desc.desc().data.format; + auto def_format = GetDefaultFormat(_desc.desc()); + // If the required format is a default format, we don't need to worry about the shape. + // If the shape isn't the same, it actually implicitly reshapes data. 
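+  // In that case the array's existing buffer can back the MKLDNN memory directly.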
+ if (required_format == def_format) { + ptr_->SetMKLMem(shape_, dtype_); + MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_); + return GetMKLDNNExact(ptr_->mkl_mem_.get(), desc); + } + + if (ptr_->mkl_mem_) + CHECK(ptr_->mkl_mem_->get_data_handle() == ptr_->shandle.dptr); + if (ptr_->mkl_mem_ && ptr_->mkl_mem_->get_primitive_desc() == desc) { + MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_); + return GetMKLDNNExact(ptr_->mkl_mem_.get(), desc); + } + + CHECK(ptr_->shandle.size >= desc.get_size()); + ptr_->CheckAndAlloc(desc.get_size()); + ptr_->mkl_mem_.reset(new mkldnn::memory(desc, ptr_->shandle.dptr)); + MKLDNNStream::Get()->RegisterMem(ptr_->mkl_mem_); + return ptr_->mkl_mem_.get(); +} +#endif + +void NDArray::SetTBlob() const { + CHECK(ptr_ != nullptr); + TShape shape = shape_; + char *dptr = static_cast(ptr_->shandle.dptr); + auto stype = storage_type(); + if (stype == kDefaultStorage) { +#if MXNET_USE_MKLDNN == 1 + if (IsMKLDNNData()) { + ptr_->Reorder2Default(); + dptr = static_cast(ptr_->shandle.dptr); + } +#endif + dptr += byte_offset_; + } else if (stype == kCSRStorage || stype == kRowSparseStorage) { + CHECK_EQ(byte_offset_, 0); + shape = storage_shape(); + } else { + LOG(FATAL) << "unknown storage type " << stype; + } + tblob_.dptr_ = dptr; + tblob_.shape_ = shape; + tblob_.type_flag_ = dtype_; + tblob_.SetDLTensor(ptr_->shandle.ctx.dev_mask(), ptr_->shandle.ctx.dev_id); +} /*! * \brief run a ternary operation @@ -449,11 +984,51 @@ inline void CopyFromToRspImpl(const NDArray& from, const NDArray& to, RunContext // Make a copy of a dense NDArray template inline void CopyFromToDnsImpl(const NDArray& from, const NDArray& to, RunContext ctx) { - using namespace mshadow; - CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type"; - TBlob tmp = to.data(); - ndarray::Copy(from.data(), &tmp, - from.ctx(), to.ctx(), ctx); +#if MXNET_USE_MKLDNN == 1 + // If neither is MKLDNN, we can copy data normally. + if (!from.IsMKLDNNData() && !to.IsMKLDNNData()) { +#endif + using namespace mshadow; + CHECK_EQ(from.storage_type(), to.storage_type()) << "Copying with different storage type"; + TBlob tmp = to.data(); + ndarray::Copy(from.data(), &tmp, + from.ctx(), to.ctx(), ctx); +#if MXNET_USE_MKLDNN == 1 + } else if (SupportMKLDNN(from.dtype(), from.shape()) + && SupportMKLDNN(to.dtype(), to.shape()) + && from.ctx().dev_mask() == cpu::kDevMask + && to.ctx().dev_mask() == cpu::kDevMask) { + // If we copy data directly, we need to make sure both NDArrays are supported + // by MKLDNN. + auto from_mem = from.GetMKLDNNData(); + auto to_mem = to.GetMKLDNNData(); + if (from_mem->get_primitive_desc() == to_mem->get_primitive_desc()) { + size_t size = std::min(from_mem->get_primitive_desc().get_size(), + to_mem->get_primitive_desc().get_size()); + memcpy(to_mem->get_data_handle(), from_mem->get_data_handle(), size); + } else { + std::vector net; + net.push_back(mkldnn::reorder(*from_mem, *to_mem)); + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + } + } else { + // In this case, one of the NDArray isn't supported by MKLDNN, we need + // to convert the MKLDNN array to the default format first and copy data + // with Copy(). 
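+    // If the source holds an MKLDNN layout, it is first converted into a temporary
+    // default-layout NDArray.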
+ NDArray tmp_from = from; + if (tmp_from.IsMKLDNNData()) { + tmp_from = NDArray(from.shape(), from.ctx(), false, from.dtype()); + auto tmp_mem = from.GetMKLDNNData(); + tmp_from.CopyFrom(*tmp_mem); + MKLDNNStream::Get()->Submit(); + } + CHECK(tmp_from.IsDefaultData()); + CHECK(to.IsDefaultData()); + TBlob tmp = to.data(); + ndarray::Copy(from.data(), &tmp, + from.ctx(), to.ctx(), ctx); + } +#endif } // Make a copy of an NDArray based on storage type diff --git a/src/operator/concat-inl.h b/src/operator/concat-inl.h deleted file mode 100644 index 4225ddf4eac0..000000000000 --- a/src/operator/concat-inl.h +++ /dev/null @@ -1,264 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2015 by Contributors - * \file concat-inl.h - * \brief - * \author Bing Xu -*/ -#ifndef MXNET_OPERATOR_CONCAT_INL_H_ -#define MXNET_OPERATOR_CONCAT_INL_H_ -#include -#include -#include -#include -#include -#include -#include -#include -#include "./operator_common.h" -#include "./channel_op_common.h" -#include "./tensor/broadcast_reduce_op.h" - -namespace mxnet { -namespace op { - -namespace concat_enum { -enum ConcatOpInputs {kData0, kData1, kData2, kData3, kData4}; -enum ConcatOpOutputs {kOut}; -} // namespace concat_enum - -struct ConcatParam : public dmlc::Parameter { - int num_args; - int dim; - DMLC_DECLARE_PARAMETER(ConcatParam) { - DMLC_DECLARE_FIELD(num_args).set_lower_bound(1) - .describe("Number of inputs to be concated."); - DMLC_DECLARE_FIELD(dim).set_default(1) - .describe("the dimension to be concated."); - } -}; // struct ConcatParam - -template -class ConcatOp : public Operator { - public: - explicit ConcatOp(ConcatParam param) - : size_(param.num_args), dimension_(param.dim) {} - - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(static_cast(in_data.size()), size_); - CHECK_EQ(out_data.size(), 1U); - int axis = CheckAxis(dimension_, in_data[concat_enum::kData0].ndim()); - Stream *s = ctx.get_stream(); - std::vector > data(size_); - Tensor out; - size_t leading = 1, trailing = 1; - for (int i = 0; i < axis; ++i) { - leading *= out_data[concat_enum::kOut].shape_[i]; - } - for (int i = axis + 1; i < out_data[concat_enum::kOut].ndim(); ++i) { - trailing *= out_data[concat_enum::kOut].shape_[i]; - } - size_t mid = out_data[concat_enum::kOut].shape_[axis]; - Shape<3> oshape = Shape3(leading, mid, trailing); - out = out_data[concat_enum::kOut].get_with_shape(oshape, s); - - for (int i = 0; i < size_; ++i) { - Shape<3> dshape = Shape3(leading, in_data[i].shape_[axis], trailing); - data[i] = in_data[i].get_with_shape(dshape, s); - } - 
Concatenate(data, &out, 1, req[concat_enum::kOut]); - } - - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(in_grad.size(), static_cast(size_)); - int axis = CheckAxis(dimension_, out_grad[concat_enum::kData0].ndim()); - Stream *s = ctx.get_stream(); - std::vector > grad_in(size_); - Tensor grad; - size_t leading = 1, trailing = 1; - for (int i = 0; i < axis; ++i) { - leading *= out_grad[concat_enum::kOut].shape_[i]; - } - for (int i = axis + 1; i < out_grad[concat_enum::kOut].ndim(); ++i) { - trailing *= out_grad[concat_enum::kOut].shape_[i]; - } - size_t mid = out_grad[concat_enum::kOut].shape_[axis]; - Shape<3> oshape = Shape3(leading, mid, trailing); - grad = out_grad[concat_enum::kOut].get_with_shape(oshape, s); - - for (int i = 0; i < size_; ++i) { - Shape<3> dshape = Shape3(leading, in_grad[i].shape_[axis], trailing); - grad_in[i] = in_grad[i].get_with_shape(dshape, s); - } - Split(grad, &grad_in, 1, req); - } - - private: - int size_; - int dimension_; -}; // class ConcatOp - -template -Operator *CreateOp(ConcatParam param, int dtype, std::vector *in_shape); - -#if DMLC_USE_CXX11 -class ConcatProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - std::vector ListArguments() const override { - std::vector ret; - for (int i = 0; i < param_.num_args; ++i) { - ret.push_back(std::string("arg") + std::to_string(i)); - } - return ret; - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); - TShape dshape; - index_t size = 0; - bool has_zero = false; - int axis = -1; - for (int i = 0; i < param_.num_args; ++i) { - TShape tmp = (*in_shape)[i]; - if (tmp.ndim()) { - axis = CheckAxis(param_.dim, tmp.ndim()); - has_zero = tmp[axis] == 0 || has_zero; - size += tmp[axis]; - tmp[axis] = 0; - shape_assign(&dshape, tmp); - } - } - - TShape tmp = (*out_shape)[0]; - if (tmp.ndim()) { - axis = CheckAxis(param_.dim, tmp.ndim()); - tmp[axis] = 0; - shape_assign(&dshape, tmp); - } - - if (dshape.ndim() == 0) return false; - - for (int i = 0; i < param_.num_args; ++i) { - CHECK(shape_assign(&(*in_shape)[i], dshape)) - << "Incompatible input shape: expected " << dshape << ", got " << (*in_shape)[i]; - } - - if (!has_zero) dshape[axis] = size; - CHECK(shape_assign(&(*out_shape)[0], dshape)) - << "Incompatible output shape: expected " << dshape << ", got " << (*out_shape)[0]; - - return dshape.Size() != 0; - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - int dtype = -1; - - for (size_t i = 0; i < in_type->size(); ++i) { - if (dtype == -1) { - dtype = in_type->at(i); - } else { - CHECK(in_type->at(i) == dtype || - in_type->at(i) == -1) << - "Non-uniform data type in Concat"; - } - } - - if (dtype == -1) { - LOG(FATAL) << "Not enough information to infer type in Concat."; - return false; - } - - size_t nin = this->ListArguments().size(); - in_type->clear(); - for (size_t i = 0; i < nin; ++i) in_type->push_back(dtype); - - size_t naux = this->ListAuxiliaryStates().size(); - 
aux_type->clear(); - for (size_t i = 0; i < naux; ++i) aux_type->push_back(dtype); - - size_t nout = this->ListOutputs().size(); - out_type->clear(); - for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype); - - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new ConcatProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "Concat"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return out_grad; - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not implemented"; - return NULL; - } - - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; - - private: - ConcatParam param_; -}; // class ConcatProp -#endif // DMLC_USE_CXX11 -} // namespace op -} // namespace mxnet - -#endif // MXNET_OPERATOR_CONCAT_INL_H_ diff --git a/src/operator/concat.cc b/src/operator/concat.cc deleted file mode 100644 index 4d3c2fa1661f..000000000000 --- a/src/operator/concat.cc +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2015 by Contributors - * \file concat.cc - * \brief - * \author Bing Xu -*/ - -#include "./concat-inl.h" -#if MXNET_USE_MKL2017 == 1 -#include -#include "./mkl/mkl_memory-inl.h" -#include "./mkl/mkl_concat-inl.h" -#endif // MXNET_USE_MKL2017 - -namespace mxnet { -namespace op { -template<> -Operator* CreateOp(ConcatParam param, int dtype, std::vector *in_shape) { - Operator *op = NULL; -#if MXNET_USE_MKL2017 == 1 - // MKL supports 4D input tensors only for concat operation - // 2D/3D input tensors are reshaped to 4D in mkl_concat-inl.h - // hence MKL supports 2D/3D/4D input tensors for concat operation - size_t dims = (*in_shape)[0].ndim(); - bool supportedDim = (dims >= 2 && dims <= 4); - if ((1 == param.dim) && supportedDim && - (param.num_args < (dnnResourceMultipleDst - dnnResourceMultipleSrc))) { - switch (dtype) { - case mshadow::kFloat32: - return new MKLConcatOp(param); - case mshadow::kFloat64: - return new MKLConcatOp(param); - default: - break; - } - } - if (enableMKLWarnGenerated()) - LOG(INFO) << MKLConcatOp::getName() << " Skip MKL optimization"; -#endif - MSHADOW_TYPE_SWITCH(dtype, DType, { - op = new ConcatOp(param); - }); - return op; -} - -Operator* ConcatProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0), in_shape); -} - -DMLC_REGISTER_PARAMETER(ConcatParam); - -MXNET_REGISTER_OP_PROPERTY(Concat, ConcatProp) -.describe(R"code(Joins input arrays along a given axis. - -.. note:: `Concat` is deprecated. 
Use `concat` instead. - -The dimensions of the input arrays should be the same except the axis along -which they will be concatenated. -The dimension of the output array along the concatenated axis will be equal -to the sum of the corresponding dimensions of the input arrays. - -Example:: - - x = [[1,1],[2,2]] - y = [[3,3],[4,4],[5,5]] - z = [[6,6], [7,7],[8,8]] - - concat(x,y,z,dim=0) = [[ 1., 1.], - [ 2., 2.], - [ 3., 3.], - [ 4., 4.], - [ 5., 5.], - [ 6., 6.], - [ 7., 7.], - [ 8., 8.]] - - Note that you cannot concat x,y,z along dimension 1 since dimension - 0 is not the same for all the input arrays. - - concat(y,z,dim=1) = [[ 3., 3., 6., 6.], - [ 4., 4., 7., 7.], - [ 5., 5., 8., 8.]] - -)code" ADD_FILELINE) -.add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate") -.add_arguments(ConcatParam::__FIELDS__()) -.set_key_var_num_args("num_args"); - -NNVM_REGISTER_OP(Concat).add_alias("concat"); - -} // namespace op -} // namespace mxnet diff --git a/src/operator/convolution_v1.cc b/src/operator/convolution_v1.cc index 7de6a34425f5..86c0fbb33291 100644 --- a/src/operator/convolution_v1.cc +++ b/src/operator/convolution_v1.cc @@ -25,11 +25,6 @@ */ #include "./convolution_v1-inl.h" -#if MXNET_USE_MKL2017 == 1 -#include -#include "./mkl/mkl_memory-inl.h" -#include "./mkl/mkl_convolution-inl.h" -#endif // MXNET_USE_MKL2017 #if MXNET_USE_NNPACK == 1 #include "./nnpack/nnpack_convolution-inl.h" #endif // MXNET_USE_NNPACK diff --git a/src/operator/lrn-inl.h b/src/operator/lrn-inl.h deleted file mode 100644 index adfe4676702d..000000000000 --- a/src/operator/lrn-inl.h +++ /dev/null @@ -1,215 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * Copyright (c) 2015 by Contributors - * \file lrn-inl.h - * \brief - * \author Bing Xu -*/ -#ifndef MXNET_OPERATOR_LRN_INL_H_ -#define MXNET_OPERATOR_LRN_INL_H_ -#include -#include -#include -#include -#include -#include -#include -#include "./operator_common.h" -#include "./mshadow_op.h" - -namespace mxnet { -namespace op { - -namespace lrn_enum { -enum LRNInputs {kData}; -enum LRNOutputs {kOut, kTmpNorm}; -} // namespace lrn_enum - -struct LRNParam : public dmlc::Parameter { - float alpha; - float beta; - float knorm; - uint32_t nsize; - DMLC_DECLARE_PARAMETER(LRNParam) { - DMLC_DECLARE_FIELD(alpha).set_default(1e-4f) - .describe("The variance scaling parameter :math:`\alpha` in the LRN expression."); - DMLC_DECLARE_FIELD(beta).set_default(0.75f) - .describe("The power parameter :math:`\beta` in the LRN expression."); - DMLC_DECLARE_FIELD(knorm).set_default(2.0f) - .describe("The parameter :math:`k` in the LRN expression."); - DMLC_DECLARE_FIELD(nsize) - .describe("normalization window width in elements."); - } -}; // struct LRNParam - -template -class LocalResponseNormOp : public Operator { - public: - explicit LocalResponseNormOp(LRNParam param) { - param_ = param; - } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - // TODO(xxx): Test with gradient chceker - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 2U); - // CHECK_EQ(req.size(), 2); - CHECK_EQ(param_.nsize % 2, 1U) << "LRN only supports odd values for local_size"; - const real_t salpha = param_.alpha / param_.nsize; - Stream *s = ctx.get_stream(); - Tensor data = in_data[lrn_enum::kData].get(s); - Tensor out = out_data[lrn_enum::kOut].get(s); - Tensor tmp_norm = out_data[lrn_enum::kTmpNorm].get(s); - tmp_norm = chpool(F(data) , param_.nsize) * salpha + param_.knorm; - Assign(out, req[lrn_enum::kOut], data * F(tmp_norm, -param_.beta)); - } - - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 2U); - const real_t salpha = param_.alpha / param_.nsize; - Stream *s = ctx.get_stream(); - Tensor grad = out_grad[lrn_enum::kOut].get(s); - Tensor tmp_norm = out_data[lrn_enum::kTmpNorm].get(s); - Tensor data = in_data[lrn_enum::kData].get(s); - Tensor grad_in = in_grad[lrn_enum::kData].get(s); - grad_in = grad * F(tmp_norm, -param_.beta); - grad_in += (- 2.0f * param_.beta * salpha) * - chpool(grad * data * - F(tmp_norm, -param_.beta - 1.0f), - param_.nsize) * data; - } - - private: - LRNParam param_; -}; // class LocalResponseNormOp - -template -Operator *CreateOp(LRNParam param, int dtype); - -#if DMLC_USE_CXX11 -class LocalResponseNormProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - CHECK_EQ(in_shape->size(), 1U) << "Input:[data]"; - const TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; - out_shape->clear(); - 
out_shape->push_back(dshape); - out_shape->push_back(dshape); - return true; - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); - } - } - int n_out = this->ListOutputs().size(); - out_type->clear(); - for (int i = 0; i < n_out; ++i ) out_type->push_back(dtype); - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new LocalResponseNormProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "LRN"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return { - out_grad[lrn_enum::kOut], in_data[lrn_enum::kData], - out_data[lrn_enum::kTmpNorm], out_data[lrn_enum::kOut] - }; - } - - int NumVisibleOutputs() const override { - return 1; - } - - int NumOutputs() const override { - return 2; - } - - std::vector ListArguments() const override { - return {"data"}; - } - - std::vector ListOutputs() const override { - return {"output", "tmp_norm"}; - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; - } - - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; - - private: - LRNParam param_; -}; // LocalResponseNormProp -#endif // DMLC_USE_CXX11 -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_LRN_INL_H_ diff --git a/src/operator/lrn.cc b/src/operator/lrn.cc deleted file mode 100644 index 9b3afd80cd18..000000000000 --- a/src/operator/lrn.cc +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! 
- * Copyright (c) 2015 by Contributors - * \file lrn.cc - * \brief - * \author Bing Xu -*/ - -#include "./lrn-inl.h" -#if MXNET_USE_CUDNN == 1 -#include "./cudnn_lrn-inl.h" -#endif -#if MXNET_USE_MKL2017 == 1 -#include -#include "./mkl/mkl_memory-inl.h" -#include "./mkl/mkl_lrn-inl.h" -#endif - -namespace mxnet { -namespace op { -template<> -Operator* CreateOp(LRNParam param, int dtype) { -#if MXNET_USE_MKL2017 == 1 - return new MKLLRNOp(param); -#endif - return new LocalResponseNormOp(param); -} - -// DO_BIND_DISPATCH comes from operator_common.h -Operator* LocalResponseNormProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); -} - -DMLC_REGISTER_PARAMETER(LRNParam); - -MXNET_REGISTER_OP_PROPERTY(LRN, LocalResponseNormProp) -.add_argument("data", "NDArray-or-Symbol", "Input data.") -.add_arguments(LRNParam::__FIELDS__()) -.describe(R"code(Applies local response normalization to the input. - -The local response normalization layer performs "lateral inhibition" by normalizing -over local input regions. - -If :math:`a_{x,y}^{i}` is the activity of a neuron computed by applying kernel :math:`i` at position -:math:`(x, y)` and then applying the ReLU nonlinearity, the response-normalized -activity :math:`b_{x,y}^{i}` is given by the expression: - -.. math:: - b_{x,y}^{i} = \frac{a_{x,y}^{i}}{\Bigg({k + \alpha \sum_{j=max(0, i-\frac{n}{2})}^{min(N-1, i+\frac{n}{2})} (a_{x,y}^{j})^{2}}\Bigg)^{\beta}} - -where the sum runs over :math:`n` "adjacent" kernel maps at the same spatial position, and :math:`N` is the total -number of kernels in the layer. - -)code" ADD_FILELINE); - -} // namespace op -} // namespace mxnet diff --git a/src/operator/mkl/mkl_batch_norm-inl.h b/src/operator/mkl/mkl_batch_norm-inl.h deleted file mode 100644 index b5967f4de294..000000000000 --- a/src/operator/mkl/mkl_batch_norm-inl.h +++ /dev/null @@ -1,391 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-* -* \file mkl_batch_norm-inl.h -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_ -#include -#include -#include -#include -#include -#include -#include -#include -#include "../operator_common.h" -#include "../mshadow_op.h" -#include "./mkl_util-inl.h" - -namespace mxnet { -namespace op { - -template -class MKLBatchNormOp : public Operator { - public: - explicit MKLBatchNormOp(BatchNormParam param) { - this->param_ = param; - fwd_top_data = MKLData::create(); - fwd_bottom_data = MKLData::create(); - bwd_top_diff = MKLData::create(); - bwd_bottom_diff = MKLData::create(); - scaleShift_space.dptr = NULL; - scaleShiftDiff_space.dptr = NULL; - } - virtual ~MKLBatchNormOp() { - if (batchNormFwdInference != NULL) dnnDelete(batchNormFwdInference); - if (batchNormFwdTraining != NULL) dnnDelete(batchNormFwdTraining); - if (batchNormBwdScaleShift != NULL) dnnDelete(batchNormBwdScaleShift); - dnnLayoutDelete(layout_usr_); - if (scaleShift_space.dptr) - Storage::Get()->Free(scaleShift_space); - if (scaleShiftDiff_space.dptr) - Storage::Get()->Free(scaleShiftDiff_space); - } - static std::string getName() { - return "MKLBatchNormOp"; - } - - private: - void LayerSetUp(const mshadow::Tensor &data, - const mshadow::Tensor &out) { - eps_ = param_.eps; - size_t dim = 4, sizes[4], strides[4]; - channels_ = data.shape_[1]; - height_ = data.shape_[2]; - width_ = data.shape_[3]; - num_ = data.shape_[0]; - - sizes[0] = width_; - sizes[1] = height_; - sizes[2] = channels_; - sizes[3] = num_; - - strides[0] = 1; - strides[1] = sizes[0]; - strides[2] = sizes[0] * sizes[1]; - strides[3] = sizes[0] * sizes[1] * sizes[2]; - - // Names are for debugging only - fwd_bottom_data->name = "fwd_bottom_data @ " + getName(); - fwd_top_data->name = "fwd_top_data @ " + getName(); - bwd_bottom_diff->name = "bwd_bottom_diff @ " + getName(); - bwd_top_diff->name = "bwd_top_diff @ " + getName(); - - dnnError_t e; - e = dnnLayoutCreate(&layout_usr_, dim, sizes, strides); - CHECK_EQ(e, E_SUCCESS); - - fwd_bottom_data->create_user_layout(dim, sizes, strides); - fwd_top_data->create_user_layout(dim, sizes, strides); - bwd_bottom_diff->create_user_layout(dim, sizes, strides); - bwd_top_diff->create_user_layout(dim, sizes, strides); - - // Primitives will be allocated during the first fwd pass - batchNormFwdInference = NULL; - batchNormFwdTraining = NULL; - batchNormBwdScaleShift = NULL; - int scaleShift_size = channels_*2*sizeof(DType); - scaleShift_space = Storage::Get()->Alloc(scaleShift_size, Context::CPU()); - scaleShiftDiff_space = Storage::Get()->Alloc(scaleShift_size, Context::CPU()); - DType * scaleShift_buf = reinterpret_cast(scaleShift_space.dptr); - /*!use_weight_bias_*/ - for (int i = 0; i < channels_; i++) { - scaleShift_buf[i] = 1.0; - scaleShift_buf[channels_ + i] = 0; - } - } - - public: - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 3); - CHECK_EQ(aux_states.size(), 2); - if (ctx.is_train) { - CHECK_EQ(out_data.size(), 3); - CHECK_EQ(req.size(), 3); - } else { - CHECK_GE(out_data.size(), 1); - CHECK_GE(req.size(), 1); - CHECK_EQ(req[batchnorm::kOut], kWriteTo); - } - - Stream *s = ctx.get_stream(); - Tensor data; - 
Tensor out; - if (in_data[batchnorm::kData].ndim() == 2) { - Shape<4> dshape = Shape4(in_data[batchnorm::kData].shape_[0], - in_data[batchnorm::kData].shape_[1], 1, 1); - data = mkl_experimental_direct_get_with_shape( - in_data[batchnorm::kData], dshape, s); - out = mkl_experimental_direct_get_with_shape( - out_data[batchnorm::kOut], dshape, s); - } else { - data = mkl_experimental_direct_get(in_data[batchnorm::kData], s); - out = mkl_experimental_direct_get(out_data[batchnorm::kOut], s); - } - - // const real_t scale = static_cast(in_data[batchnorm::kData].shape_[1]) / - // static_cast(in_data[batchnorm::kData].shape_.Size()); - - Tensor slope = in_data[batchnorm::kGamma].get(s); - Tensor bias = in_data[batchnorm::kBeta].get(s); - Tensor moving_mean = aux_states[batchnorm::kMovingMean].get(s); - Tensor moving_var = aux_states[batchnorm::kMovingVar].get(s); - - if (param_.fix_gamma) - slope = 1.f; - - dnnError_t e; - if (!init_mkldnn_) { - LayerSetUp(data, out); - init_mkldnn_ = true; - } - void* bottom_data = NULL; -#if MKL_EXPERIMENTAL == 1 - bottom_data = - reinterpret_cast(mkl_prv_data(in_data[batchnorm::kData])); -#endif - int bwd_flags = dnnUseScaleShift; - if (param_.use_global_stats) - bwd_flags = dnnUseScaleShift | dnnUseInputMeanVariance; -#if MKL_EXPERIMENTAL == 1 - if (NULL != bottom_data) { - // Is it the first pass? Create a primitive. - if (batchNormFwdInference == NULL) { - std::shared_ptr bottom_data_mem = in_data[batchnorm::kData].Mkl_mem_; - std::shared_ptr bottom_prv_desc = bottom_data_mem->get_prv_descriptor(); - CHECK(bottom_prv_desc->get_descr_type() == PrvMemDescr::PRV_DESCR_MKL2017); - std::shared_ptr > mem_descr - = std::static_pointer_cast>(bottom_prv_desc); - CHECK(mem_descr != NULL); - fwd_bottom_data = mem_descr; - - e = dnnBatchNormalizationCreateForward_v2( - &batchNormFwdInference, NULL, mem_descr->layout_int, eps_, - dnnUseInputMeanVariance | dnnUseScaleShift); - CHECK_EQ(e, E_SUCCESS); - - e = dnnBatchNormalizationCreateForward_v2( - &batchNormFwdTraining, NULL, mem_descr->layout_int, eps_, - dnnUseScaleShift); - CHECK_EQ(e, E_SUCCESS); - - fwd_top_data->create_internal_layout(batchNormFwdInference, dnnResourceDst); - bwd_top_diff->create_internal_layout(batchNormFwdInference, dnnResourceDst); - bwd_bottom_diff->create_internal_layout(batchNormFwdInference, dnnResourceSrc); - - e = dnnBatchNormalizationCreateBackward_v2( - &batchNormBwdScaleShift, NULL, mem_descr->layout_int, eps_, bwd_flags); - CHECK_EQ(e, E_SUCCESS); - } - } -#endif - if (NULL == bottom_data) { - if (batchNormFwdInference == NULL) { - e = dnnBatchNormalizationCreateForward_v2( - &batchNormFwdInference, NULL, layout_usr_, eps_, - dnnUseInputMeanVariance | dnnUseScaleShift); - CHECK_EQ(e, E_SUCCESS); - - e = dnnBatchNormalizationCreateForward_v2( - &batchNormFwdTraining, NULL, layout_usr_, eps_, dnnUseScaleShift); - CHECK_EQ(e, E_SUCCESS); - - e = dnnBatchNormalizationCreateBackward_v2( - &batchNormBwdScaleShift, NULL, layout_usr_, eps_, bwd_flags); - CHECK_EQ(e, E_SUCCESS); - } - bottom_data = reinterpret_cast(data.dptr_); - } - - DType * scaleShift_buf = reinterpret_cast(scaleShift_space.dptr); - // use_weight_bias_ - for (int i = 0; i < channels_; i++) { - scaleShift_buf[i] = (slope.dptr_)[i]; - } - for (int i = 0; i < channels_; i++) { - scaleShift_buf[channels_ + i] = (bias.dptr_)[i]; - } - - void* BatchNorm_res[dnnResourceNumber]; - BatchNorm_res[dnnResourceSrc] = bottom_data; - BatchNorm_res[dnnResourceScaleShift] = scaleShift_space.dptr; - - BatchNorm_res[dnnResourceDst] = 
fwd_top_data->get_output_ptr(out.dptr_, - fwd_top_data, out_data[batchnorm::kOut]); - if (ctx.is_train && !param_.use_global_stats) { - Tensor mean = out_data[batchnorm::kMean].get(s); - Tensor var = out_data[batchnorm::kVar].get(s); - CHECK(req[batchnorm::kMean] == kNullOp || req[batchnorm::kMean] == kWriteTo); - CHECK(req[batchnorm::kVar] == kNullOp || req[batchnorm::kVar] == kWriteTo); - BatchNorm_res[dnnResourceMean] = mean.dptr_; - BatchNorm_res[dnnResourceVariance] = var.dptr_; - e = dnnExecute(batchNormFwdTraining, BatchNorm_res); - CHECK_EQ(e, E_SUCCESS); - } else { - BatchNorm_res[dnnResourceMean] = moving_mean.dptr_; - BatchNorm_res[dnnResourceVariance] = moving_var.dptr_; - e = dnnExecute(batchNormFwdInference, BatchNorm_res); - CHECK_EQ(e, E_SUCCESS); - } - -#if MKL_EXPERIMENTAL == 0 - if (fwd_top_data->conversion_needed()) { - fwd_top_data->convert_from_prv(out.dptr_); - } -#endif - } - - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1); - CHECK_EQ(in_data.size(), 3); - CHECK_EQ(out_data.size(), 3); - CHECK_EQ(in_grad.size(), 3); - Stream *s = ctx.get_stream(); - Tensor data, grad, grad_in; - - if (in_data[batchnorm::kData].ndim() == 2) { - Shape<4> dshape = Shape4(out_grad[batchnorm::kOut].shape_[0], - out_grad[batchnorm::kOut].shape_[1], 1, 1); - data = mkl_experimental_direct_get_with_shape( - in_data[batchnorm::kData], dshape, s); - grad = mkl_experimental_direct_get_with_shape( - out_grad[batchnorm::kOut], dshape, s); - grad_in = mkl_experimental_direct_get_with_shape( - in_grad[batchnorm::kData], dshape, s); - } else { - data = mkl_experimental_direct_get(in_data[batchnorm::kData], s); - grad = mkl_experimental_direct_get(out_grad[batchnorm::kOut], s); - grad_in = mkl_experimental_direct_get(in_grad[batchnorm::kData], s); - } - - Tensor slope = in_data[batchnorm::kGamma].get(s); - Tensor gslope = in_grad[batchnorm::kGamma].get(s); - Tensor gbias = in_grad[batchnorm::kBeta].get(s); - Tensor mean = out_data[batchnorm::kMean].get(s); - Tensor var = out_data[batchnorm::kVar].get(s); - Tensor moving_mean = aux_states[batchnorm::kMovingMean].get(s); - Tensor moving_var = aux_states[batchnorm::kMovingVar].get(s); - - if (param_.fix_gamma) slope = 1.f; - - void* bottom_data = NULL; -#if MKL_EXPERIMENTAL == 1 - bottom_data = reinterpret_cast(mkl_prv_data(in_data[batchnorm::kData])); -#endif - if (NULL == bottom_data) - bottom_data = reinterpret_cast(data.dptr_); - - dnnError_t e; - void* BatchNorm_res[dnnResourceNumber]; - BatchNorm_res[dnnResourceSrc] = bottom_data; - BatchNorm_res[dnnResourceScaleShift] = scaleShift_space.dptr; - if (ctx.is_train && !param_.use_global_stats) { - int size = mean.size(0); // Tensor - float * moving_mean_ptr = reinterpret_cast(moving_mean.dptr_); - float * mean_ptr = reinterpret_cast(mean.dptr_); - float * moving_var_ptr = reinterpret_cast(moving_var.dptr_); - float * var_ptr = reinterpret_cast(var.dptr_); - float minus_mom = (1 - param_.momentum); - for (int i = 0; i < size; i++) { - moving_mean_ptr[i] = moving_mean_ptr[i] * param_.momentum - + mean_ptr[i] * minus_mom; - } - for (int i = 0; i < size; i++) { - moving_var_ptr[i] = moving_var_ptr[i] * param_.momentum - + var_ptr[i] * minus_mom; - } - BatchNorm_res[dnnResourceMean] = mean.dptr_; - BatchNorm_res[dnnResourceVariance] = 
var.dptr_; - } else { - BatchNorm_res[dnnResourceMean] = moving_mean.dptr_; - BatchNorm_res[dnnResourceVariance] = moving_var.dptr_; - } - - - BatchNorm_res[dnnResourceDiffSrc] = bwd_bottom_diff->get_output_ptr(grad_in.dptr_, - bwd_bottom_diff, in_grad[batchnorm::kData]); - BatchNorm_res[dnnResourceDiffDst] = bwd_top_diff->get_converted_prv(grad.dptr_, - true, out_grad[batchnorm::kOut]); - BatchNorm_res[dnnResourceDiffScaleShift] = scaleShiftDiff_space.dptr; - e = dnnExecute(batchNormBwdScaleShift, BatchNorm_res); - CHECK_EQ(e, E_SUCCESS); -#if MKL_EXPERIMENTAL == 0 - if (bwd_bottom_diff->conversion_needed()) { - bwd_bottom_diff->convert_from_prv(grad_in.dptr_); - } -#endif - DType * scaleShiftDiff_buf = reinterpret_cast(scaleShiftDiff_space.dptr); - if (!param_.fix_gamma) { - // Store ScaleShift blobs - DType* diff_scale = gslope.dptr_; - for (int i = 0; i < channels_; i++) { - diff_scale[i] = scaleShiftDiff_buf[i]; - } - } else { - int gslope_size = gslope.size(0); - float * gslope_ptr = reinterpret_cast(gslope.dptr_); - for (int i = 0; i < gslope_size; i++) { - *gslope_ptr++ = 0.0f; - } - } - DType* diff_shift = gbias.dptr_; - for (int i = 0; i < channels_; i++) { - diff_shift[i] = scaleShiftDiff_buf[channels_ + i]; - } - } - - private: - BatchNormParam param_; - DType eps_; - bool use_weight_bias_; - - int num_; - int channels_; - int height_; - int width_; - bool init_mkldnn_ = false; - std::shared_ptr > fwd_top_data; - std::shared_ptr > fwd_bottom_data; - std::shared_ptr > bwd_top_diff; - std::shared_ptr > bwd_bottom_diff; - dnnPrimitive_t batchNormFwdInference = NULL; - dnnPrimitive_t batchNormFwdTraining = NULL; - dnnPrimitive_t batchNormBwdScaleShift = NULL; - Storage::Handle scaleShift_space; - Storage::Handle scaleShiftDiff_space; - dnnLayout_t layout_usr_ = NULL; -}; // class BatchNormOp -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_BATCH_NORM_INL_H_ diff --git a/src/operator/mkl/mkl_concat-inl.h b/src/operator/mkl/mkl_concat-inl.h deleted file mode 100644 index 1ed1e81d1303..000000000000 --- a/src/operator/mkl/mkl_concat-inl.h +++ /dev/null @@ -1,314 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-* -* \file mkl_concat-inl.h -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_ -#include -#include -#include -#include -#include -#include -#include -#include -#include "../operator_common.h" -#include "../channel_op_common.h" -#include "./mkl_util-inl.h" -namespace mxnet { -namespace op { - - -template -class MKLConcatOp : public Operator { - public: - static std::string getName() { - return "MKLConcatOp"; - } - explicit MKLConcatOp(ConcatParam param) - : size_(param.num_args), dimension_(param.dim), init_mkldnn_(false) { - concatFwd_ = static_cast(NULL); - concatBwd_ = static_cast(NULL); - fwd_top_data_ = MKLData::create(); - bwd_top_diff_ = MKLData::create(); - - num_concats_ = param.num_args; - } - virtual ~MKLConcatOp() { - dnnDelete(concatFwd_); - dnnDelete(concatBwd_); - } - - private: - void LayerSetUp(const std::vector > &data, - const mshadow::Tensor &out, - size_t data_shape_size, size_t *split_channels_) { - size_t dim_src = data_shape_size; - size_t dim_dst = dim_src; - num_concats_ = size_; - channels_ = 0; - - for (size_t i = 1; i < num_concats_; ++i) { - for (size_t j = 1; j < data_shape_size; ++j) { - if (j == dimension_) continue; - CHECK_EQ(data[0].shape_[j], data[i].shape_[j]); - } - } - - for (size_t i = 0; i < num_concats_; ++i) { - CHECK_EQ((int)dim_src, data[i].shape_.kDimension); - - fwd_bottom_data_.push_back(MKLData::create()); - bwd_bottom_diff_.push_back(MKLData::create()); - fwd_bottom_data_[i]->name = "fwd_bottom_data_[i]"; - bwd_bottom_diff_[i]->name = "bwd_bottom_data[i]"; - - size_t *sizes_src = new size_t[dim_src]; - size_t *strides_src = new size_t[dim_src]; - for (size_t d = 0; d < dim_src; ++d) { - sizes_src[d] = data[i].shape_[dim_src - d - 1]; - strides_src[d] = (d == 0) ? 1 : strides_src[d - 1] * sizes_src[d - 1]; - } - - split_channels_[i] = data[i].shape_[1]; - channels_ += split_channels_[i]; - fwd_bottom_data_[i]->create_user_layout(dim_src, sizes_src, strides_src); - bwd_bottom_diff_[i]->create_user_layout(dim_src, sizes_src, strides_src); - delete[] sizes_src; - delete[] strides_src; - } - size_t *sizes_dst = new size_t[dim_dst]; - size_t *strides_dst = new size_t[dim_dst]; - for (size_t d = 0; d < dim_dst; ++d) { - if (d == 2) - sizes_dst[d] = channels_; - else - sizes_dst[d] = data[0].shape_[dim_dst - 1 - d]; - strides_dst[d] = (d == 0) ? 
1 : strides_dst[d - 1] * sizes_dst[d - 1]; - } - bwd_top_diff_->create_user_layout(dim_dst, sizes_dst, strides_dst); - fwd_top_data_->create_user_layout(dim_dst, sizes_dst, strides_dst); - delete[] sizes_dst; - delete[] strides_dst; - concatFwd_ = NULL; - concatBwd_ = NULL; - } - - public: - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(static_cast(in_data.size()), size_); - CHECK_EQ(out_data.size(), 1); - CHECK_LT(dimension_, (size_t)in_data[concat_enum::kData0].ndim()); - Stream *s = ctx.get_stream(); - std::vector > data(size_); - Tensor out; - if (in_data[0].ndim() == 2) { - for (int i = 0; i < size_; ++i) { - Shape<4> dshape = Shape4(in_data[i].shape_[0], - in_data[i].shape_[1], 1, 1); - data[i] = mkl_experimental_direct_get_with_shape( - in_data[i], dshape, s); - } - Shape<4> dshape = Shape4(out_data[concat_enum::kOut].shape_[0], - out_data[concat_enum::kOut].shape_[1], 1, 1); - out = mkl_experimental_direct_get_with_shape( - out_data[concat_enum::kOut], dshape, s); - } else if (in_data[0].ndim() == 3) { - for (int i = 0; i < size_; ++i) { - Shape<4> dshape = Shape4(in_data[i].shape_[0], - in_data[i].shape_[1], in_data[i].shape_[2], 1); - data[i] = mkl_experimental_direct_get_with_shape( - in_data[i], dshape, s); - } - Shape<4> dshape = Shape4(out_data[concat_enum::kOut].shape_[0], - out_data[concat_enum::kOut].shape_[1], - out_data[concat_enum::kOut].shape_[2], 1); - out = mkl_experimental_direct_get_with_shape( - out_data[concat_enum::kOut], dshape, s); - } else { - for (int i = 0; i < size_; ++i) { - data[i] = mkl_experimental_direct_get(in_data[i], s); - } - out = mkl_experimental_direct_get(out_data[concat_enum::kOut], s); - } - size_t *split_channels_ = new size_t[num_concats_]; - if (!init_mkldnn_) { - init_mkldnn_ = true; - LayerSetUp(data, out, 4, split_channels_); - } - - dnnError_t e; - std::vector bottom_data; - bool isFirstPass = (concatFwd_ == NULL); - dnnLayout_t *layouts = NULL; - if (isFirstPass) { - layouts = new dnnLayout_t[num_concats_]; - } - - for (size_t i = 0; i < num_concats_; i++) { - void * bottom_i = NULL; -#if MKL_EXPERIMENTAL == 1 - bottom_i = mkl_prv_data(in_data[i]); - if (bottom_i != NULL) { - if (isFirstPass) { - std::shared_ptr > mem_descr = - mkl_get_mem_desc(in_data[i].Mkl_mem_); - fwd_bottom_data_[i] = mem_descr; - layouts[i] = mem_descr->layout_int; - } - } -#endif - if (bottom_i == NULL) { - bottom_i = data[i].dptr_; - if (isFirstPass) { - layouts[i] = fwd_bottom_data_[i]->layout_usr; - } - } - - bottom_data.push_back(reinterpret_cast(bottom_i)); - } - - if (isFirstPass) { - e = dnnConcatCreate(&concatFwd_, NULL, num_concats_, layouts); - CHECK_EQ(e, E_SUCCESS); - - fwd_top_data_->create_internal_layout(concatFwd_, dnnResourceDst); - bwd_top_diff_->create_internal_layout(concatFwd_, dnnResourceDst); - - e = dnnSplitCreate(&concatBwd_, NULL, num_concats_, - bwd_top_diff_->layout_int, split_channels_); - CHECK_EQ(e, E_SUCCESS); - - for (size_t n = 0; n < num_concats_; ++n) { - fwd_bottom_data_[n]->create_internal_layout(concatFwd_, - (dnnResourceType_t)(dnnResourceMultipleSrc + n)); - bwd_bottom_diff_[n]->create_internal_layout(concatBwd_, - (dnnResourceType_t)(dnnResourceMultipleDst + n)); - } - } - delete[] layouts; - - void *concat_res[dnnResourceNumber]; - for (size_t i = 0; i < num_concats_; ++i) { - concat_res[dnnResourceMultipleSrc + i] - = 
reinterpret_cast(bottom_data[i]); - } - - concat_res[dnnResourceDst] = fwd_top_data_->get_output_ptr(out.dptr_, - fwd_top_data_, out_data[concat_enum::kOut]); - e = dnnExecute(concatFwd_, concat_res); - CHECK_EQ(e, E_SUCCESS); - delete[] split_channels_; - } - - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1); - CHECK_EQ(in_grad.size(), static_cast(size_)); - Stream *s = ctx.get_stream(); - std::vector > grad_in(size_); - Tensor grad; - if (in_grad[0].ndim() == 2) { - Shape<4> dshape = Shape4(out_grad[concat_enum::kOut].shape_[0], - out_grad[concat_enum::kOut].shape_[1], 1, 1); - grad = mkl_experimental_direct_get_with_shape( - out_grad[concat_enum::kOut], dshape, s); - for (int i = 0; i < size_; ++i) { - dshape = Shape4(in_grad[i].shape_[0], - in_grad[i].shape_[1], 1, 1); - grad_in[i] = mkl_experimental_direct_get_with_shape( - in_grad[i], dshape, s); - } - } else if (in_grad[0].ndim() == 3) { - Shape<4> dshape = Shape4(out_grad[concat_enum::kOut].shape_[0], - out_grad[concat_enum::kOut].shape_[1], - out_grad[concat_enum::kOut].shape_[2], 1); - grad = mkl_experimental_direct_get_with_shape( - out_grad[concat_enum::kOut], dshape, s); - for (int i = 0; i < size_; ++i) { - dshape = Shape4(in_grad[i].shape_[0], - in_grad[i].shape_[1], in_grad[i].shape_[2], 1); - grad_in[i] = mkl_experimental_direct_get_with_shape( - in_grad[i], dshape, s); - } - } else { - grad = mkl_experimental_direct_get(out_grad[concat_enum::kOut], s); - for (int i = 0; i < size_; ++i) { - grad_in[i] = mkl_experimental_direct_get(in_grad[i], s); - } - } - - int need_bwd = 0; - for (size_t n = 0; n < num_concats_; n++) { - need_bwd += req[n]; - } - if (!need_bwd) { - return; - } - - dnnError_t e; - void *concat_res[dnnResourceNumber]; - concat_res[dnnResourceSrc] = bwd_top_diff_->get_converted_prv(grad.dptr_, true, - out_grad[concat_enum::kOut]); - for (size_t i = 0; i < num_concats_; ++i) { - concat_res[dnnResourceMultipleDst + i] = bwd_bottom_diff_[i]->get_output_ptr( - grad_in[i].dptr_, bwd_bottom_diff_[i], in_grad[i]); - } - e = dnnExecute(concatBwd_, concat_res); - CHECK_EQ(e, E_SUCCESS); - } - - private: - int size_; - size_t dimension_; - - bool init_mkldnn_; - - dnnPrimitive_t concatFwd_; - dnnPrimitive_t concatBwd_; - std::shared_ptr > fwd_top_data_; - std::vector< std::shared_ptr > > fwd_bottom_data_; - std::shared_ptr > bwd_top_diff_; - std::vector< std::shared_ptr > > bwd_bottom_diff_; - - - size_t width_; - size_t height_; - size_t channels_; - size_t num_; - size_t num_concats_; -}; // class MKLConcatOp -} // namespace op -} // namespace mxnet - -#endif // MXNET_OPERATOR_MKL_MKL_CONCAT_INL_H_ diff --git a/src/operator/mkl/mkl_convolution-inl.h b/src/operator/mkl/mkl_convolution-inl.h deleted file mode 100644 index 813d061f172b..000000000000 --- a/src/operator/mkl/mkl_convolution-inl.h +++ /dev/null @@ -1,490 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_convolution-inl.h -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "../operator_common.h" -#include "../nn/convolution-inl.h" -#include "./mkl_util-inl.h" - -namespace mxnet { -namespace op { - -template -class MKLConvolutionOp : public Operator { - public: - static std::string getName() { - return "MKLConvolutionOp"; - } - void SetupBuffer() { - convolutionBwdBias = static_cast(NULL); - convolutionBwdFilter = static_cast(NULL); - convolutionBwdData = static_cast(NULL); - convolutionFwd = static_cast(NULL); - fwd_bottom_data = MKLData::create(); - fwd_top_data = MKLData::create(); - fwd_filter_data = MKLData::create(); - fwd_bias_data = MKLData::create(); - bwdd_top_diff = MKLData::create(); - bwdd_bottom_diff = MKLData::create(); - bwdd_filter_data = MKLData::create(); - bwdf_top_diff = MKLData::create(); - bwdf_filter_diff = MKLData::create(); - bwdf_bottom_data = MKLData::create(); - bwdb_top_diff = MKLData::create(); - bwdb_bias_diff = MKLData::create(); - // Names are for debugging purposes only. - fwd_bottom_data->name = "fwd_bottom_data @ " + this->getName(); - fwd_top_data->name = "fwd_top_data @ " + this->getName(); - fwd_filter_data->name = "fwd_filter_data @ " + this->getName(); - fwd_bias_data->name = "fwd_bias_data @ " + this->getName(); - bwdd_top_diff->name = "bwdd_top_diff @ " + this->getName(); - bwdd_bottom_diff->name = "bwdd_bottom_diff @ " + this->getName(); - bwdd_filter_data->name = "bwdd_filter_data @ " + this->getName(); - bwdf_top_diff->name = "bwdf_top_diff @ " + this->getName(); - bwdf_bottom_data->name = "bwdf_bottom_data @ " + this->getName(); - bwdf_filter_diff->name = "bwdf_filter_diff @ " + this->getName(); - bwdb_top_diff->name = "bwdb_top_diff @ " + this->getName(); - bwdb_bias_diff->name = "bwdb_bias_diff @ " + this->getName(); - } - - explicit MKLConvolutionOp(ConvolutionParam p): - convolutionFwd(NULL), - convolutionBwdData(static_cast(NULL)), - convolutionBwdFilter(static_cast(NULL)), - convolutionBwdBias(static_cast(NULL)) { - this->param_ = p; - init_mkldnn_ = false; - // convert MBytes first to Bytes and then to elements. 
- param_.workspace = (param_.workspace << 20) / sizeof(DType); - SetupBuffer(); - } - void ReleaseBuffer() { - if (convolutionFwd != NULL) { - dnnDelete(convolutionFwd); - convolutionFwd = NULL; - } - if (convolutionBwdData != NULL) { - dnnDelete(convolutionBwdData); - convolutionBwdData = NULL; - } - if (convolutionBwdFilter != NULL) { - dnnDelete(convolutionBwdFilter); - convolutionBwdFilter = NULL; - } - if (!param_.no_bias && convolutionBwdBias != NULL) { - dnnDelete(convolutionBwdBias); - convolutionBwdBias = NULL; - } - } - virtual ~MKLConvolutionOp() { - ReleaseBuffer(); - } - - private: - void LayerSetUp(const mshadow::Tensor &data, - const mshadow::Tensor &out) { - this->width_ = data.shape_[3]; - this->height_ = data.shape_[2]; - this->channels_ = data.shape_[1]; - this->num_ = data.shape_[0]; - this->group_ = param_.num_group; - this->width_out_ = out.shape_[3]; - this->height_out_ = out.shape_[2]; - int channel_out_ = out.shape_[1]; - this->num_output_ = channel_out_; - kernel_w_ = param_.kernel[1]; - kernel_h_ = param_.kernel[0]; - stride_w_ = param_.stride[1]; - stride_h_ = param_.stride[0]; - pad_w_ = param_.pad[1]; - pad_h_ = param_.pad[0]; - int status; - size_t n, g; - size_t iw, ih, ic; - size_t ow, oh, oc; - size_t kw, kh; - size_t dimension = 4; - g = std::max(this->group_, 1); - n = this->num_; - iw = this->width_; - ih = this->height_; - ic = this->channels_; - ow = this->width_out_; - oh = this->height_out_; - oc = this->num_output_; - kw = this->kernel_w_; - kh = this->kernel_h_; - oc = this->num_output_; - size_t bdata_sizes[4] = { iw, ih, ic, n }; - size_t bdata_strides[4] = { 1, iw, iw*ih, iw*ih*ic }; - /* starting with MKL 2017 Gold in case of groups filter layout - * becomes 5D, i.e. groups become a separate dimension */ - size_t g_mkl2017 = g; - size_t f_dimension = dimension + (g != 1); - if (getMKLBuildDate() < 20160701) { - g_mkl2017 = 1; - f_dimension = dimension; - } - size_t fdata_sizes[5] = { kw, kh, ic / g, oc / g_mkl2017, g_mkl2017 }; - size_t fdata_strides[5] = { 1, kw, kw*kh, kw*kh*ic / g, kw*kh*ic / g*oc / g }; - size_t bias_sizes[1] = { oc }; - size_t bias_strides[1] = { 1 }; - size_t tdata_sizes[4] = { ow, oh, oc, n }; - size_t tdata_strides[4] = { 1, ow, ow*oh, ow*oh*oc }; - size_t convolutionStrides[2] = { this->stride_w_, this->stride_h_ }; - int inputOffset[2] = { -this->pad_w_, -this->pad_h_ }; - // Names are for debugging purposes only. 
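For reference, the size/stride arrays built in LayerSetUp above follow the MKL 2017 dnn convention of listing dimensions innermost-first, so a contiguous NCHW activation is described as {W, H, C, N} with strides that are running products of the preceding sizes; the same rule yields the 5-D grouped filter layout {KW, KH, C/G, OC/G, G} used when groups > 1 on MKL builds newer than 2016-07-01. A small worked example with hypothetical numbers (N=32, C=64, H=W=28):

    // Hypothetical shape for illustration only: contiguous NCHW, N=32, C=64, H=W=28.
    size_t sizes[4]   = {28, 28, 64, 32};                 // {W, H, C, N}
    size_t strides[4] = {1, 28, 28 * 28, 28 * 28 * 64};   // {1, W, W*H, W*H*C} = {1, 28, 784, 50176}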
- /*** convolution section ***/ - if (!param_.no_bias) { - status = dnnGroupsConvolutionCreateForwardBias(&convolutionFwd, - NULL, - dnnAlgorithmConvolutionDirect, - g, - dimension, - bdata_sizes, - tdata_sizes, - fdata_sizes, - convolutionStrides, - inputOffset, - dnnBorderZeros); - } else { - status = dnnGroupsConvolutionCreateForward(&convolutionFwd, - NULL, - dnnAlgorithmConvolutionDirect, - g, - dimension, - bdata_sizes, - tdata_sizes, - fdata_sizes, - convolutionStrides, - inputOffset, - dnnBorderZeros); - } - CHECK_EQ(status, 0) - << "Failed dnnCreateConvolution(dnnForward) with status " - << status << "\n"; - fwd_bottom_data->create_layouts(convolutionFwd, dnnResourceSrc, dimension, - bdata_sizes, bdata_strides); - fwd_top_data->create_layouts(convolutionFwd, dnnResourceDst, dimension, - tdata_sizes, tdata_strides); - fwd_filter_data->create_layouts(convolutionFwd, dnnResourceFilter, - f_dimension, fdata_sizes, fdata_strides); - if (!param_.no_bias) - fwd_bias_data->create_layouts(convolutionFwd, dnnResourceBias, 1, - bias_sizes, bias_strides); - /* - * Backward by data layer setup - */ - status = dnnGroupsConvolutionCreateBackwardData(&convolutionBwdData, - NULL, - dnnAlgorithmConvolutionDirect, - g, - dimension, - bdata_sizes, - tdata_sizes, - fdata_sizes, - convolutionStrides, - inputOffset, - dnnBorderZeros); - CHECK_EQ(status, 0) - << "Failed dnnConvolutionCreateBackwardData with status " - << status << "\n"; - bwdd_bottom_diff->create_layouts(convolutionBwdData, dnnResourceDiffSrc, - dimension, bdata_sizes, bdata_strides); - bwdd_top_diff->create_layouts(convolutionBwdData, dnnResourceDiffDst, - dimension, tdata_sizes, tdata_strides); - bwdd_filter_data->create_layouts(convolutionBwdData, dnnResourceFilter, - f_dimension, fdata_sizes, fdata_strides); - /* - * Backward by filter layer setup - */ - status = dnnGroupsConvolutionCreateBackwardFilter(&convolutionBwdFilter, - NULL, - dnnAlgorithmConvolutionDirect, - g, - dimension, - bdata_sizes, - tdata_sizes, - fdata_sizes, - convolutionStrides, - inputOffset, - dnnBorderZeros); - CHECK_EQ(status, 0) - << "Failed dnnConvolutionCreateBackwardFilter with status " - << status << "\n"; - bwdf_bottom_data->create_layouts(convolutionBwdFilter, dnnResourceSrc, - dimension, bdata_sizes, bdata_strides); - bwdf_top_diff->create_layouts(convolutionBwdFilter, dnnResourceDiffDst, - dimension, tdata_sizes, tdata_strides); - bwdf_filter_diff->create_layouts(convolutionBwdFilter, dnnResourceDiffFilter, - f_dimension, fdata_sizes, fdata_strides); - /* - * Backward by bias layer setup - */ - if (!param_.no_bias) { - status = dnnGroupsConvolutionCreateBackwardBias(&convolutionBwdBias, - NULL, - dnnAlgorithmConvolutionDirect, - g, - dimension, - tdata_sizes); - CHECK_EQ(status, 0) - << "Failed dnnConvolutionCreateBackwardBias with status " - << status << "\n"; - bwdb_top_diff->create_layouts(convolutionBwdBias, dnnResourceDiffDst, - dimension, tdata_sizes, tdata_strides); - bwdb_bias_diff->create_layouts(convolutionBwdBias, dnnResourceDiffBias, 1, - bias_sizes, bias_strides); - } - } - - public: - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - Stream *s = ctx.get_stream(); - DType *data_ptr = NULL; - DType *wmat_ptr = NULL; - DType *out_ptr = NULL; - Tensor data = - mkl_experimental_direct_get(in_data[conv::kData], s); - Tensor out = - mkl_experimental_direct_get(out_data[conv::kOut], s); - Tensor wmat = - 
mkl_experimental_direct_get(in_data[conv::kWeight], s); - if (!init_mkldnn_) { - LayerSetUp(data, out); - init_mkldnn_ = true; - } - CHECK_EQ(data.CheckContiguous(), true); - CHECK_EQ(wmat.CheckContiguous(), true); - CHECK_EQ(out.CheckContiguous(), true); - data_ptr = data.dptr_; - wmat_ptr = wmat.dptr_; - out_ptr = out.dptr_; - int status; - void *res_convolutionFwd[dnnResourceNumber]; - res_convolutionFwd[dnnResourceSrc] = - fwd_bottom_data->get_converted_prv(data_ptr, false, in_data[conv::kData]); - res_convolutionFwd[dnnResourceFilter] = - fwd_filter_data->get_converted_prv(wmat_ptr, true, in_data[conv::kWeight]); - if (!param_.no_bias) { - Tensor bias = - mkl_experimental_direct_get(in_data[conv::kBias], s); - res_convolutionFwd[dnnResourceBias] = - fwd_bias_data->get_converted_prv(bias.dptr_, true, in_data[conv::kBias]); - } - - res_convolutionFwd[dnnResourceDst] = fwd_top_data->get_output_ptr(out_ptr, - fwd_top_data, out_data[conv::kOut]); - status = dnnExecute(convolutionFwd, res_convolutionFwd); - CHECK_EQ(status, 0) << "Forward convolution failed with status " << status; -#if MKL_EXPERIMENTAL == 0 - if (fwd_top_data->conversion_needed()) { - fwd_top_data->convert_from_prv(out_ptr); - } -#endif - } - void AddToModeAllocAndStoreBuffer(void *src, int blob_size, Storage::Handle *pws) { - int blob_byte_size = blob_size * sizeof(DType); - *pws = Storage::Get()->Alloc(blob_byte_size, Context::CPU()); - memcpy(pws->dptr, src, blob_byte_size); - } - void AddToModeAddAndReleaseBuffer(Storage::Handle *pws, void *dst_, int blob_size) { - DType *dst = reinterpret_cast(dst_); - DType *src = reinterpret_cast(pws->dptr); -#pragma omp parallel for - for (int i = 0; i < blob_size; i++) { - dst[i] += src[i]; - } - if (pws->dptr) - Storage::Get()->Free(*pws); - pws->dptr = NULL; - } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - using namespace mshadow; - if (param_.kernel.ndim() > 2) { - LOG(FATAL) << "Volume convolution is not implmented in mshadow"; - } - CHECK_EQ(out_grad.size(), 1); - size_t expected = param_.no_bias == 0 ? 
3 : 2; - CHECK(in_data.size() == expected && in_grad.size() == expected); - CHECK_EQ(req.size(), expected); - CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true); - Stream *s = ctx.get_stream(); - Tensor data = - mkl_experimental_direct_get(in_data[conv::kData], s); - Shape<3> wmat_shape = - Shape3(param_.num_group, - param_.num_filter / param_.num_group, - data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1]); - Tensor wmat = - mkl_experimental_direct_get_with_shape( - in_data[conv::kWeight], wmat_shape, s); - Tensor grad = - mkl_experimental_direct_get(out_grad[conv::kOut], s); - Tensor gdata = - mkl_experimental_direct_get(in_grad[conv::kData], s); - Tensor gwmat = - mkl_experimental_direct_get_with_shape( - in_grad[conv::kWeight], wmat_shape, s); - - if (!init_mkldnn_) { - init_mkldnn_ = true; - LayerSetUp(data, grad); - } - int status; - if (req[0]) { - void *res_convolutionBwdData[dnnResourceNumber]; - res_convolutionBwdData[dnnResourceDiffDst] = - bwdd_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]); - - res_convolutionBwdData[dnnResourceFilter] = - bwdd_filter_data->get_converted_prv(wmat.dptr_, false, in_data[conv::kWeight]); - Storage::Handle addtoWorkspace; - if (req[0] == kAddTo) { - // wait mkl support addto mode - AddToModeAllocAndStoreBuffer(gdata.dptr_, in_grad[conv::kData].Size(), &addtoWorkspace); - } - - res_convolutionBwdData[dnnResourceDiffSrc] = bwdd_bottom_diff->get_output_ptr(gdata.dptr_, - bwdd_bottom_diff, in_grad[conv::kData]); - status = dnnExecute(convolutionBwdData, res_convolutionBwdData); - CHECK_EQ(status, 0) << "Backward Data conv failed with status " << status; -#if MKL_EXPERIMENTAL == 0 - if (bwdd_bottom_diff->conversion_needed()) { - bwdd_bottom_diff->convert_from_prv(gdata.dptr_); - } -#endif - if (req[0] == kAddTo) { - if (bwdd_bottom_diff->conversion_needed()) { - bwdd_bottom_diff->convert_from_prv(gdata.dptr_); - } - AddToModeAddAndReleaseBuffer(&addtoWorkspace, gdata.dptr_, in_grad[conv::kData].Size()); - } - } - if (req[1]) { - void *res_convolutionBwdFilter[dnnResourceNumber]; - - res_convolutionBwdFilter[dnnResourceDiffDst] = - bwdf_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]); - - res_convolutionBwdFilter[dnnResourceSrc] = - bwdf_bottom_data->get_converted_prv(data.dptr_, false, - in_data[conv::kData]); - Storage::Handle addtoWorkspace; - if (req[1] == kAddTo) { - // wait mkl support addto mode - AddToModeAllocAndStoreBuffer(gwmat.dptr_, in_grad[conv::kWeight].Size(), &addtoWorkspace); - } - - res_convolutionBwdFilter[dnnResourceDiffFilter] = bwdf_filter_diff->get_output_ptr( - gwmat.dptr_, bwdf_filter_diff, in_grad[conv::kWeight]); - status = dnnExecute(convolutionBwdFilter, res_convolutionBwdFilter); - CHECK_EQ(status, 0) << "Backward Filter conv failed with status " << status; -#if MKL_EXPERIMENTAL == 0 - if (bwdf_filter_diff->conversion_needed()) { - bwdf_filter_diff->convert_from_prv(gwmat.dptr_); - } -#endif - if (req[1] == kAddTo) { - if (bwdf_filter_diff->conversion_needed()) { - bwdf_filter_diff->convert_from_prv(gwmat.dptr_); - } - AddToModeAddAndReleaseBuffer(&addtoWorkspace, gwmat.dptr_, in_grad[conv::kWeight].Size()); - } - } - if (!param_.no_bias) { - Tensor gbias = - mkl_experimental_direct_get(in_grad[conv::kBias], s); - void *res_convolutionBwdBias[dnnResourceNumber]; - res_convolutionBwdBias[dnnResourceDiffDst] = - bwdb_top_diff->get_converted_prv(grad.dptr_, true, out_grad[conv::kOut]); - - res_convolutionBwdBias[dnnResourceDiffBias] = 
bwdb_bias_diff->get_output_ptr(gbias.dptr_, - bwdb_bias_diff, in_grad[conv::kBias]); - status = dnnExecute(convolutionBwdBias, res_convolutionBwdBias); - CHECK_EQ(status, 0) << "Backward Bias failed with status " << status; -#if MKL_EXPERIMENTAL == 0 - if (bwdb_bias_diff->conversion_needed()) { - bwdb_bias_diff->convert_from_prv(gbias.dptr_); - } -#endif - } - } - - private: - ConvolutionParam param_; - size_t width_, - height_, - width_out_, - height_out_, - kernel_w_, - kernel_h_, - stride_w_, - stride_h_; - int group_, - num_, - num_output_; - size_t channels_; - int pad_w_, - pad_h_; - bool init_mkldnn_; - dnnPrimitive_t convolutionFwd; - dnnPrimitive_t convolutionBwdData; - dnnPrimitive_t convolutionBwdFilter; - dnnPrimitive_t convolutionBwdBias; - /* Fwd step */ - std::shared_ptr > fwd_bottom_data, fwd_top_data, fwd_filter_data, - fwd_bias_data; - /* Bwd data step */ - std::shared_ptr > bwdd_top_diff, bwdd_bottom_diff; - std::shared_ptr > bwdd_filter_data; - /* Bwd filter step */ - std::shared_ptr > bwdf_top_diff, bwdf_filter_diff; - std::shared_ptr > bwdf_bottom_data; - std::shared_ptr > bwdf_filter_diff_iter, bwdf2fwd_filter_diff, - bwdb_bias_diff_iter; - /* Bwd bias step */ - std::shared_ptr > bwdb_top_diff, bwdb_bias_diff; -}; // class ConvolutionOp -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_CONVOLUTION_INL_H_ diff --git a/src/operator/mkl/mkl_cppwrapper.cc b/src/operator/mkl/mkl_cppwrapper.cc deleted file mode 100644 index 507e5498c85b..000000000000 --- a/src/operator/mkl/mkl_cppwrapper.cc +++ /dev/null @@ -1,44 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_cppwrapper.cc -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ - - - -#include "mkl_cppwrapper.h" -#include -#if MXNET_USE_MKL2017 == 1 -#include "mkl_service.h" - -int getMKLBuildDate() { - static int build = 0; - if (build == 0) { - MKLVersion v; - mkl_get_version(&v); - build = atoi(v.Build); - printf("MKL Build:%d\n", build); - } - return build; -} - -bool enableMKLWarnGenerated() { - return false; -} -#endif // MSHADOW_USE_MKL2017 diff --git a/src/operator/mkl/mkl_cppwrapper.h b/src/operator/mkl/mkl_cppwrapper.h deleted file mode 100644 index 7d66f20ad308..000000000000 --- a/src/operator/mkl/mkl_cppwrapper.h +++ /dev/null @@ -1,1020 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
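For reference, because the MKL 2017 primitives always overwrite their destination buffer, the convolution backward pass deleted above emulated req == kAddTo with the AddToModeAllocAndStoreBuffer / AddToModeAddAndReleaseBuffer pair: save the existing gradient, run the primitive, then accumulate the saved values back. A condensed sketch of that save/run/accumulate pattern follows; run_primitive is a placeholder for the dnnExecute call that overwrites grad, not a function from the original code.

    #include <cstddef>
    #include <vector>

    // Sketch of the kAddTo workaround used by the removed operators.
    // run_primitive() stands in for the dnnExecute call that overwrites `grad`.
    template <typename DType, typename Fn>
    void execute_with_addto(DType *grad, size_t size, bool addto, Fn run_primitive) {
      std::vector<DType> saved;
      if (addto) saved.assign(grad, grad + size);  // keep the gradient already present
      run_primitive();                             // primitive writes its result into `grad`
      if (addto) {
        for (size_t i = 0; i < size; ++i) grad[i] += saved[i];  // fold the saved values back in
      }
    }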
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_cppwrapper.h -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_ -#define MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_ - - -#include -#include -#if MXNET_USE_MKL2017 == 1 -#include "mkl_dnn_types.h" -#include "mkl_dnn.h" -#include "mkl_version.h" - - -extern int getMKLBuildDate(); -extern bool enableMKLWarnGenerated(); - - -template inline dnnError_t dnnLayoutCreate( - dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]); -template <> inline dnnError_t dnnLayoutCreate( - dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]) { - return dnnLayoutCreate_F32(pLayout, dimension, size, strides); -} -template <> inline dnnError_t dnnLayoutCreate( - dnnLayout_t *pLayout, size_t dimension, const size_t size[], const size_t strides[]) { - return dnnLayoutCreate_F64(pLayout, dimension, size, strides); -} - -template inline dnnError_t dnnLayoutCreateFromPrimitive( - dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type); -template <> inline dnnError_t dnnLayoutCreateFromPrimitive( - dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type) { - return dnnLayoutCreateFromPrimitive_F32(pLayout, primitive, type); -} -template <> inline dnnError_t dnnLayoutCreateFromPrimitive( - dnnLayout_t *pLayout, const dnnPrimitive_t primitive, dnnResourceType_t type) { - return dnnLayoutCreateFromPrimitive_F64(pLayout, primitive, type); -} - -template inline size_t dnnLayoutGetMemorySize( - const dnnLayout_t layout); -template <> inline size_t dnnLayoutGetMemorySize( - const dnnLayout_t layout) { - return dnnLayoutGetMemorySize_F32(layout); -} -template <> inline size_t dnnLayoutGetMemorySize( - const dnnLayout_t layout) { - return dnnLayoutGetMemorySize_F64(layout); -} - -template inline int dnnLayoutCompare( - const dnnLayout_t l1, const dnnLayout_t l2); -template <> inline int dnnLayoutCompare( - const dnnLayout_t l1, const dnnLayout_t l2) { - return dnnLayoutCompare_F32(l1, l2); -} -template <> inline int dnnLayoutCompare( - const dnnLayout_t l1, const dnnLayout_t l2) { - return dnnLayoutCompare_F64(l1, l2); -} - - -template inline dnnError_t dnnAllocateBuffer( - void **pPtr, dnnLayout_t layout); -template <> inline dnnError_t dnnAllocateBuffer( - void **pPtr, dnnLayout_t layout) { - return dnnAllocateBuffer_F32(pPtr, layout); -} -template <> inline dnnError_t dnnAllocateBuffer( - void **pPtr, dnnLayout_t layout) { - return dnnAllocateBuffer_F64(pPtr, layout); -} - -template inline dnnError_t dnnReleaseBuffer( - void *ptr); -template <> inline dnnError_t dnnReleaseBuffer( - void *ptr) { - return dnnReleaseBuffer_F32(ptr); -} -template <> inline dnnError_t dnnReleaseBuffer( - void *ptr) { - return dnnReleaseBuffer_F64(ptr); -} - -template inline dnnError_t dnnLayoutDelete( - dnnLayout_t layout); -template <> inline dnnError_t dnnLayoutDelete( - dnnLayout_t layout) { - return dnnLayoutDelete_F32(layout); -} -template <> inline dnnError_t 
dnnLayoutDelete( - dnnLayout_t layout) { - return dnnLayoutDelete_F64(layout); -} - -template inline dnnError_t dnnPrimitiveAttributesCreate( - dnnPrimitiveAttributes_t *attributes); -template <> inline dnnError_t dnnPrimitiveAttributesCreate( - dnnPrimitiveAttributes_t *attributes) { - return dnnPrimitiveAttributesCreate_F32(attributes); -} -template <> inline dnnError_t dnnPrimitiveAttributesCreate( - dnnPrimitiveAttributes_t *attributes) { - return dnnPrimitiveAttributesCreate_F64(attributes); -} - - -template inline dnnError_t dnnPrimitiveAttributesDestroy( - dnnPrimitiveAttributes_t attributes); -template <> inline dnnError_t dnnPrimitiveAttributesDestroy( - dnnPrimitiveAttributes_t attributes) { - return dnnPrimitiveAttributesDestroy_F32(attributes); -} -template <> inline dnnError_t dnnPrimitiveAttributesDestroy( - dnnPrimitiveAttributes_t attributes) { - return dnnPrimitiveAttributesDestroy_F64(attributes); -} - -template inline dnnError_t dnnPrimitiveGetAttributes( - dnnPrimitive_t primitive, - dnnPrimitiveAttributes_t *attributes); -template <> inline dnnError_t dnnPrimitiveGetAttributes( - dnnPrimitive_t primitive, - dnnPrimitiveAttributes_t *attributes) { - return dnnPrimitiveGetAttributes_F32(primitive, attributes); -} -template <> inline dnnError_t dnnPrimitiveGetAttributes( - dnnPrimitive_t primitive, - dnnPrimitiveAttributes_t *attributes) { - return dnnPrimitiveGetAttributes_F64(primitive, attributes); -} - -template inline dnnError_t dnnExecute( - dnnPrimitive_t primitive, void *resources[]); -template <> inline dnnError_t dnnExecute( - dnnPrimitive_t primitive, void *resources[]) { - return dnnExecute_F32(primitive, resources); -} -template <> inline dnnError_t dnnExecute( - dnnPrimitive_t primitive, void *resources[]) { - return dnnExecute_F64(primitive, resources); -} - -template inline dnnError_t dnnExecuteAsync( - dnnPrimitive_t primitive, void *resources[]); -template <> inline dnnError_t dnnExecuteAsync( - dnnPrimitive_t primitive, void *resources[]) { - return dnnExecuteAsync_F32(primitive, resources); -} -template <> inline dnnError_t dnnExecuteAsync( - dnnPrimitive_t primitive, void *resources[]) { - return dnnExecuteAsync_F64(primitive, resources); -} - -template inline dnnError_t dnnWaitFor( - dnnPrimitive_t primitive); -template <> inline dnnError_t dnnWaitFor( - dnnPrimitive_t primitive) { - return dnnWaitFor_F32(primitive); -} -template <> inline dnnError_t dnnWaitFor( - dnnPrimitive_t primitive) { - return dnnWaitFor_F64(primitive); -} - -template inline dnnError_t dnnDelete( - dnnPrimitive_t primitive); -template <> inline dnnError_t dnnDelete( - dnnPrimitive_t primitive) { - return dnnDelete_F32(primitive); -} -template <> inline dnnError_t dnnDelete( - dnnPrimitive_t primitive) { - return dnnDelete_F64(primitive); -} - - -template inline dnnError_t dnnConversionCreate( - dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to); -template <> inline dnnError_t dnnConversionCreate( - dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to) { - return dnnConversionCreate_F32(pConversion, from, to); -} -template <> inline dnnError_t dnnConversionCreate( - dnnPrimitive_t* pConversion, const dnnLayout_t from, const dnnLayout_t to) { - return dnnConversionCreate_F64(pConversion, from, to); -} - - -template inline dnnError_t dnnConversionExecute( - dnnPrimitive_t conversion, void *from, void *to); -template <> inline dnnError_t dnnConversionExecute( - dnnPrimitive_t conversion, void *from, void *to) { - return 
dnnConversionExecute_F32(conversion, from, to); -} -template <> inline dnnError_t dnnConversionExecute( - dnnPrimitive_t conversion, void *from, void *to) { - return dnnConversionExecute_F64(conversion, from, to); -} - - -template inline dnnError_t dnnConvolutionCreateForward( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); -template <> inline dnnError_t dnnConvolutionCreateForward( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnConvolutionCreateForward_F32( - pConvolution, - attributes, - algorithm, - dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} - -template <> inline dnnError_t dnnConvolutionCreateForward( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnConvolutionCreateForward_F64( - pConvolution, - attributes, - algorithm, - dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} - - -template inline dnnError_t dnnConvolutionCreateForwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); -template <> inline dnnError_t dnnConvolutionCreateForwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnConvolutionCreateForwardBias_F32( - pConvolution, - attributes, - algorithm, - dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} -template <> inline dnnError_t dnnConvolutionCreateForwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnConvolutionCreateForwardBias_F64( - pConvolution, - attributes, - algorithm, - dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} - - -template inline dnnError_t dnnConvolutionCreateBackwardData( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); -template <> inline dnnError_t dnnConvolutionCreateBackwardData( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - 
dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnConvolutionCreateBackwardData_F32( - pConvolution, - attributes, - algorithm, - dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} -template <> inline dnnError_t dnnConvolutionCreateBackwardData( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnConvolutionCreateBackwardData_F64( - pConvolution, - attributes, - algorithm, - dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} - -template inline dnnError_t dnnConvolutionCreateBackwardFilter( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); -template <> inline dnnError_t dnnConvolutionCreateBackwardFilter( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnConvolutionCreateBackwardFilter_F32( - pConvolution, - attributes, - algorithm, - dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} -template <> inline dnnError_t dnnConvolutionCreateBackwardFilter( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t srcSize[], const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnConvolutionCreateBackwardFilter_F64( - pConvolution, - attributes, - algorithm, - dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} - -template inline dnnError_t dnnConvolutionCreateBackwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t dstSize[]); -template <> inline dnnError_t dnnConvolutionCreateBackwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t dstSize[]) { - return dnnConvolutionCreateBackwardBias_F32( - pConvolution, - attributes, - algorithm, - dimension, dstSize); -} -template <> inline dnnError_t dnnConvolutionCreateBackwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t dimension, const size_t dstSize[]) { - return dnnConvolutionCreateBackwardBias_F64( - pConvolution, - attributes, - algorithm, - dimension, dstSize); -} - -template inline dnnError_t dnnGroupsConvolutionCreateForward( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t 
convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); -template <> inline dnnError_t dnnGroupsConvolutionCreateForward( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnGroupsConvolutionCreateForward_F32( - pConvolution, - attributes, - algorithm, - groups, dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} -template <> inline dnnError_t dnnGroupsConvolutionCreateForward( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnGroupsConvolutionCreateForward_F64( - pConvolution, - attributes, - algorithm, - groups, dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} - -template inline dnnError_t dnnGroupsConvolutionCreateForwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); -template <> inline dnnError_t dnnGroupsConvolutionCreateForwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnGroupsConvolutionCreateForwardBias_F32( - pConvolution, - attributes, - algorithm, - groups, dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} -template <> inline dnnError_t dnnGroupsConvolutionCreateForwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnGroupsConvolutionCreateForwardBias_F64( - pConvolution, - attributes, - algorithm, - groups, dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} - -template inline dnnError_t dnnGroupsConvolutionCreateBackwardData( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); -template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardData( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnGroupsConvolutionCreateBackwardData_F32( - pConvolution, 
- attributes, - algorithm, - groups, dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} -template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardData( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnGroupsConvolutionCreateBackwardData_F64( - pConvolution, - attributes, - algorithm, - groups, dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} - - -template inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type); -template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnGroupsConvolutionCreateBackwardFilter_F32( - pConvolution, - attributes, - algorithm, - groups, dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} -template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardFilter( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t srcSize[], - const size_t dstSize[], const size_t filterSize[], - const size_t convolutionStrides[], const int inputOffset[], const dnnBorder_t border_type) { - return dnnGroupsConvolutionCreateBackwardFilter_F64( - pConvolution, - attributes, - algorithm, - groups, dimension, srcSize, dstSize, filterSize, - convolutionStrides, inputOffset, border_type); -} - -template inline dnnError_t dnnGroupsConvolutionCreateBackwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t dstSize[]); -template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t dstSize[]) { - return dnnGroupsConvolutionCreateBackwardBias_F32( - pConvolution, - attributes, - algorithm, - groups, dimension, dstSize); -} -template <> inline dnnError_t dnnGroupsConvolutionCreateBackwardBias( - dnnPrimitive_t* pConvolution, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t algorithm, - size_t groups, size_t dimension, const size_t dstSize[]) { - return dnnGroupsConvolutionCreateBackwardBias_F64( - pConvolution, - attributes, - algorithm, - groups, dimension, dstSize); -} - -template inline dnnError_t dnnReLUCreateForward( - dnnPrimitive_t* pRelu, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, float negativeSlope); -template <> inline dnnError_t dnnReLUCreateForward( - dnnPrimitive_t* pRelu, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, float negativeSlope) { - return 
dnnReLUCreateForward_F32( - pRelu, - attributes, - dataLayout, negativeSlope); -} -template <> inline dnnError_t dnnReLUCreateForward( - dnnPrimitive_t* pRelu, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, float negativeSlope) { - return dnnReLUCreateForward_F64( - pRelu, - attributes, - dataLayout, negativeSlope); -} - -template inline dnnError_t dnnReLUCreateBackward( - dnnPrimitive_t* pRelu, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope); -template <> inline dnnError_t dnnReLUCreateBackward( - dnnPrimitive_t* pRelu, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope) { - return dnnReLUCreateBackward_F32( - pRelu, - attributes, - diffLayout, dataLayout, negativeSlope); -} -template <> inline dnnError_t dnnReLUCreateBackward( - dnnPrimitive_t* pRelu, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, float negativeSlope) { - return dnnReLUCreateBackward_F64( - pRelu, - attributes, - diffLayout, dataLayout, negativeSlope); -} - -template inline dnnError_t dnnLRNCreateForward( - dnnPrimitive_t* pLrn, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k); -template <> inline dnnError_t dnnLRNCreateForward( - dnnPrimitive_t* pLrn, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k) { - return dnnLRNCreateForward_F32( - pLrn, - attributes, - dataLayout, kernel_size, alpha, beta, k); -} -template <> inline dnnError_t dnnLRNCreateForward( - dnnPrimitive_t* pLrn, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, size_t kernel_size, float alpha, float beta, float k) { - return dnnLRNCreateForward_F64( - pLrn, - attributes, - dataLayout, kernel_size, alpha, beta, k); -} - - -template inline dnnError_t dnnLRNCreateBackward( - dnnPrimitive_t* pLrn, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, - size_t kernel_size, float alpha, float beta, float k); -template <> inline dnnError_t dnnLRNCreateBackward( - dnnPrimitive_t* pLrn, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, - size_t kernel_size, float alpha, float beta, float k) { - return dnnLRNCreateBackward_F32( - pLrn, - attributes, - diffLayout, dataLayout, kernel_size, alpha, beta, k); -} -template <> inline dnnError_t dnnLRNCreateBackward( - dnnPrimitive_t* pLrn, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t diffLayout, const dnnLayout_t dataLayout, - size_t kernel_size, float alpha, float beta, float k) { - return dnnLRNCreateBackward_F64( - pLrn, - attributes, - diffLayout, dataLayout, kernel_size, alpha, beta, k); -} - - -template inline dnnError_t dnnPoolingCreateForward( - dnnPrimitive_t* pPooling, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t op, - const dnnLayout_t srcLayout, - const size_t kernelSize[], const size_t kernelStride[], - const int inputOffset[], const dnnBorder_t border_type); -template <> inline dnnError_t dnnPoolingCreateForward( - dnnPrimitive_t* pPooling, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t op, - const dnnLayout_t srcLayout, - const size_t kernelSize[], const size_t kernelStride[], - const int inputOffset[], const dnnBorder_t border_type) { - return dnnPoolingCreateForward_F32( - pPooling, - 
attributes, - op, - srcLayout, - kernelSize, kernelStride, - inputOffset, border_type); -} -template <> inline dnnError_t dnnPoolingCreateForward( - dnnPrimitive_t* pPooling, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t op, - const dnnLayout_t srcLayout, - const size_t kernelSize[], const size_t kernelStride[], - const int inputOffset[], const dnnBorder_t border_type) { - return dnnPoolingCreateForward_F64( - pPooling, - attributes, - op, - srcLayout, - kernelSize, kernelStride, - inputOffset, border_type); -} - - -template inline dnnError_t dnnPoolingCreateBackward( - dnnPrimitive_t* pPooling, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t op, - const dnnLayout_t srcLayout, - const size_t kernelSize[], const size_t kernelStride[], - const int inputOffset[], const dnnBorder_t border_type); -template <> inline dnnError_t dnnPoolingCreateBackward( - dnnPrimitive_t* pPooling, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t op, - const dnnLayout_t srcLayout, - const size_t kernelSize[], const size_t kernelStride[], - const int inputOffset[], const dnnBorder_t border_type) { - return dnnPoolingCreateBackward_F32( - pPooling, - attributes, - op, - srcLayout, - kernelSize, kernelStride, - inputOffset, border_type); -} -template <> inline dnnError_t dnnPoolingCreateBackward( - dnnPrimitive_t* pPooling, - dnnPrimitiveAttributes_t attributes, - dnnAlgorithm_t op, - const dnnLayout_t srcLayout, - const size_t kernelSize[], const size_t kernelStride[], - const int inputOffset[], const dnnBorder_t border_type) { - return dnnPoolingCreateBackward_F64( - pPooling, - attributes, - op, - srcLayout, - kernelSize, kernelStride, - inputOffset, border_type); -} - -template inline dnnError_t dnnConcatCreate( - dnnPrimitive_t *pConcat, - dnnPrimitiveAttributes_t attributes, - const size_t N, - dnnLayout_t src[]); -template <> inline dnnError_t dnnConcatCreate( - dnnPrimitive_t *pConcat, - dnnPrimitiveAttributes_t attributes, - const size_t N, - dnnLayout_t src[]) { - return dnnConcatCreate_F32( - pConcat, - attributes, - N, - src); -} -template <> inline dnnError_t dnnConcatCreate( - dnnPrimitive_t *pConcat, - dnnPrimitiveAttributes_t attributes, - const size_t N, - dnnLayout_t src[]) { - return dnnConcatCreate_F64( - pConcat, - attributes, - N, - src); -} - - -template inline dnnError_t dnnSplitCreate( - dnnPrimitive_t *pSplit, - dnnPrimitiveAttributes_t attributes, - const size_t N, - dnnLayout_t src, - size_t dst[]); -template <> inline dnnError_t dnnSplitCreate( - dnnPrimitive_t *pSplit, - dnnPrimitiveAttributes_t attributes, - const size_t N, - dnnLayout_t src, - size_t dst[]) { - return dnnSplitCreate_F32( - pSplit, - attributes, - N, - src, - dst); -} -template <> inline dnnError_t dnnSplitCreate( - dnnPrimitive_t *pSplit, - dnnPrimitiveAttributes_t attributes, - const size_t N, - dnnLayout_t src, - size_t dst[]) { - return dnnSplitCreate_F64( - pSplit, - attributes, - N, - src, - dst); -} - -template inline dnnError_t dnnSumCreate( - dnnPrimitive_t *pSum, - dnnPrimitiveAttributes_t attributes, - const size_t nSummands, dnnLayout_t layout, Dtype *coefficients); -template <> inline dnnError_t dnnSumCreate( - dnnPrimitive_t *pSum, - dnnPrimitiveAttributes_t attributes, - const size_t nSummands, dnnLayout_t layout, float *coefficients) { - return dnnSumCreate_F32( - pSum, - attributes, - nSummands, - layout, coefficients); -} -template <> inline dnnError_t dnnSumCreate( - dnnPrimitive_t *pSum, - dnnPrimitiveAttributes_t attributes, - const size_t nSummands, dnnLayout_t layout, 
double *coefficients) { - return dnnSumCreate_F64( - pSum, - attributes, - nSummands, - layout, coefficients); -} - -template inline dnnError_t dnnBatchNormalizationCreateForward_v2( - dnnPrimitive_t* pBatchNormalization, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, float eps, - int flags); - -template <> inline dnnError_t dnnBatchNormalizationCreateForward_v2( - dnnPrimitive_t* pBatchNormalization, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, float eps, - int flags) { - return dnnBatchNormalizationCreateForward_v2_F32( - pBatchNormalization, - attributes, - dataLayout, eps, flags); -} -template <> inline dnnError_t dnnBatchNormalizationCreateForward_v2( - dnnPrimitive_t* pBatchNormalization, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, float eps, - int flags) { - return dnnBatchNormalizationCreateForward_v2_F64( - pBatchNormalization, - attributes, - dataLayout, eps, flags); -} - - -template inline dnnError_t dnnBatchNormalizationCreateBackward_v2( - dnnPrimitive_t* pBatchNormalization, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, float eps, - int flags); - -template <> inline dnnError_t dnnBatchNormalizationCreateBackward_v2( - dnnPrimitive_t* pBatchNormalization, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, float eps, - int flags) { - return dnnBatchNormalizationCreateBackward_v2_F32( - pBatchNormalization, - attributes, - dataLayout, eps, flags); -} - -template <> inline dnnError_t dnnBatchNormalizationCreateBackward_v2( - dnnPrimitive_t* pBatchNormalization, - dnnPrimitiveAttributes_t attributes, - const dnnLayout_t dataLayout, float eps, - int flags) { - return dnnBatchNormalizationCreateBackward_v2_F64( - pBatchNormalization, - attributes, - dataLayout, eps, flags); -} - -template inline dnnError_t dnnInnerProductCreateForward( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels); -template <> inline dnnError_t dnnInnerProductCreateForward( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels) { - return dnnInnerProductCreateForward_F32(pInnerProduct, - attributes, dimensions, - srcSize, outputChannels); -} -template <> inline dnnError_t dnnInnerProductCreateForward( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels) { - return dnnInnerProductCreateForward_F64(pInnerProduct, - attributes, dimensions, - srcSize, outputChannels); -} - -template inline dnnError_t dnnInnerProductCreateForwardBias( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels); - -template <> inline dnnError_t dnnInnerProductCreateForwardBias( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels) { - return dnnInnerProductCreateForwardBias_F32(pInnerProduct, - attributes, dimensions, - srcSize, outputChannels); -} -template <> inline dnnError_t dnnInnerProductCreateForwardBias( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels) { - return dnnInnerProductCreateForwardBias_F64(pInnerProduct, - attributes, dimensions, - srcSize, outputChannels); -} - - 
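Every wrapper in the mkl_cppwrapper.h removals follows the same shape: a primary template with no generic definition plus float and double specializations that forward to the _F32 / _F64 C entry points, which lets the operator code stay generic over DType. The pattern, shown here for dnnReleaseBuffer (one of the wrappers deleted earlier in this file):

    // Dispatch pattern used throughout the removed wrapper header.
    template <typename Dtype> inline dnnError_t dnnReleaseBuffer(void *ptr);  // no generic body

    template <> inline dnnError_t dnnReleaseBuffer<float>(void *ptr) {
      return dnnReleaseBuffer_F32(ptr);   // single precision -> _F32 entry point
    }
    template <> inline dnnError_t dnnReleaseBuffer<double>(void *ptr) {
      return dnnReleaseBuffer_F64(ptr);   // double precision -> _F64 entry point
    }

    // Callers written against the template stay type-generic:
    //   dnnReleaseBuffer<DType>(buffer);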
-template inline dnnError_t dnnInnerProductCreateBackwardData( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels); - -template <> inline dnnError_t dnnInnerProductCreateBackwardData( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels) { - return dnnInnerProductCreateBackwardData_F32(pInnerProduct, - attributes, dimensions, - srcSize, outputChannels); -} -template <> inline dnnError_t dnnInnerProductCreateBackwardData( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels) { - return dnnInnerProductCreateBackwardData_F64(pInnerProduct, - attributes, dimensions, - srcSize, outputChannels); -} - - - - -template inline dnnError_t dnnInnerProductCreateBackwardFilter( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels); - -template <> inline dnnError_t dnnInnerProductCreateBackwardFilter( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels) { - return dnnInnerProductCreateBackwardFilter_F32(pInnerProduct, - attributes, dimensions, - srcSize, outputChannels); -} -template <> inline dnnError_t dnnInnerProductCreateBackwardFilter( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t srcSize[], - size_t outputChannels) { - return dnnInnerProductCreateBackwardFilter_F64(pInnerProduct, - attributes, dimensions, - srcSize, outputChannels); -} - - - -template inline dnnError_t dnnInnerProductCreateBackwardBias( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t dstSize[]); - -template <> inline dnnError_t dnnInnerProductCreateBackwardBias( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t dstSize[]) { - return dnnInnerProductCreateBackwardBias_F32(pInnerProduct, - attributes, dimensions, - dstSize); -} -template <> inline dnnError_t dnnInnerProductCreateBackwardBias( - dnnPrimitive_t *pInnerProduct, - dnnPrimitiveAttributes_t attributes, - size_t dimensions, - const size_t dstSize[]) { - return dnnInnerProductCreateBackwardBias_F64(pInnerProduct, - attributes, dimensions, - dstSize); -} -#endif // #MXNET_USE_MKL2017 == 1 -#endif // MXNET_OPERATOR_MKL_MKL_CPPWRAPPER_H_ diff --git a/src/operator/mkl/mkl_elementwise_copy-inl.h b/src/operator/mkl/mkl_elementwise_copy-inl.h deleted file mode 100644 index 48c931291150..000000000000 --- a/src/operator/mkl/mkl_elementwise_copy-inl.h +++ /dev/null @@ -1,69 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-* -* \file mkl_elementwise-inl.h -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "../operator_common.h" -#include "../mshadow_op.h" -#include "./mkl_util-inl.h" - - -namespace mxnet { -namespace op { - -template -void MKLIdentityCompute(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - if (!req[0]) return; -#if MKL_EXPERIMENTAL == 1 - if (op::mkl_prv_data(inputs[0])) { - std::shared_ptr in_data_mem = inputs[0].Mkl_mem_; - // User copy to avoid potential problem - std::shared_ptr > top_data = MKLData::create(); - std::shared_ptr top_mem = outputs[0].Mkl_mem_; - top_data->copy_from(in_data_mem); - top_mem->set_prv_descriptor(top_data); - return; - } -#endif - int in_blob_size = inputs[0].Size(); - int out_blob_size = outputs[0].Size(); - CHECK_EQ(in_blob_size, out_blob_size) << "MKLIdentityCompute CPU Size not Match "; - memcpy(outputs[0].dptr_, inputs[0].dptr_, in_blob_size * sizeof(DType)); -} - - - -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_COPY_INL_H_ diff --git a/src/operator/mkl/mkl_elementwise_sum-inl.h b/src/operator/mkl/mkl_elementwise_sum-inl.h deleted file mode 100644 index d313fd15a5be..000000000000 --- a/src/operator/mkl/mkl_elementwise_sum-inl.h +++ /dev/null @@ -1,117 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_elementwise-inl.h -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "../operator_common.h" -#include "../mshadow_op.h" -#include "./mkl_util-inl.h" - - -namespace mxnet { -namespace op { -template -static void LayerSetUp(const std::vector > &data, - size_t data_shape_size, - std::shared_ptr > fwd_top_data) { - // Whether to use an asymptotically slower (for >2 inputs) but stabler method - // of computing the gradient for the PROD operation. (No effect for SUM op.) - // stable_prod_grad_ = 1; - size_t dim_src = data_shape_size; - size_t *sizes_src = new size_t[dim_src]; - size_t *strides_src = new size_t[dim_src]; - for (size_t d = 0; d < dim_src; ++d) { - sizes_src[d] = data[0].shape_[dim_src - d - 1]; - strides_src[d] = (d == 0) ? 
1 : strides_src[d - 1] * sizes_src[d - 1]; - } - - fwd_top_data->create_user_layout(dim_src, sizes_src, strides_src); - delete[] sizes_src; - delete[] strides_src; -} - -template -void MKLElementWiseSumCompute_(const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data) { - using namespace mshadow; - using namespace mshadow::expr; - if (req[0] == kNullOp) return; - size_t size = in_data.size(); - Stream *s = ctx.get_stream(); - std::vector > data(size); - Tensor out = out_data[0].FlatTo1D(s); - bool in_place_flag = false; - int in_place_idx = 0; - - for (size_t i = 0; i < size; ++i) { - data[i] = in_data[i].FlatTo1D(s); - if (data[i].dptr_ == out.dptr_) { - in_place_idx = i; - in_place_flag = true; - } - } - std::shared_ptr > fwd_top_data = MKLData::create(); - std::vector coeffs_ = std::vector(data.size(), 1); - LayerSetUp(data, 1, fwd_top_data); - - - dnnError_t e; - void *eltwise_res[dnnResourceNumber]; - dnnPrimitive_t sumPrimitive = NULL; - e = dnnSumCreate(&sumPrimitive, NULL, size, fwd_top_data->layout_usr, - &coeffs_[0]); - CHECK_EQ(e, E_SUCCESS); - - eltwise_res[dnnResourceDst] = reinterpret_cast(const_cast(out.dptr_)); - eltwise_res[dnnResourceMultipleSrc] = - reinterpret_cast(reinterpret_cast(in_data[in_place_idx].dptr_)); - for (size_t i = 1; i < size; ++i) { - if (i == in_place_idx) continue; - eltwise_res[dnnResourceMultipleSrc + i] = - reinterpret_cast(reinterpret_cast(in_data[i].dptr_)); - } - - e = dnnExecute(sumPrimitive, eltwise_res); - CHECK_EQ(e, E_SUCCESS); - - if (sumPrimitive != NULL) { - dnnDelete(sumPrimitive); - sumPrimitive = NULL; - } -} - - - -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_ELEMENTWISE_SUM_INL_H_ diff --git a/src/operator/mkl/mkl_fully_connected-inl.h b/src/operator/mkl/mkl_fully_connected-inl.h deleted file mode 100644 index 5e296704b6dd..000000000000 --- a/src/operator/mkl/mkl_fully_connected-inl.h +++ /dev/null @@ -1,192 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
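For reference, the element-wise sum deleted above builds a dnnSumCreate primitive over a flat 1-D user layout and feeds every input through dnnResourceMultipleSrc with unit coefficients. A minimal single-precision sketch of that pattern for two inputs (sum_two_arrays is an illustrative name, not part of the original code):

    #include <mkl_dnn.h>
    #include <cstddef>

    // Sketch only: out[i] = a[i] + b[i] via the MKL 2017 dnn sum primitive (float, flat layout).
    bool sum_two_arrays(const float *a, const float *b, float *out, size_t len) {
      size_t sizes[1]   = {len};
      size_t strides[1] = {1};
      dnnLayout_t layout = NULL;
      if (dnnLayoutCreate_F32(&layout, 1, sizes, strides) != E_SUCCESS) return false;

      float coeffs[2] = {1.0f, 1.0f};
      dnnPrimitive_t sum = NULL;
      if (dnnSumCreate_F32(&sum, NULL, 2, layout, coeffs) != E_SUCCESS) return false;

      void *res[dnnResourceNumber] = {};
      res[dnnResourceMultipleSrc + 0] = const_cast<float *>(a);
      res[dnnResourceMultipleSrc + 1] = const_cast<float *>(b);
      res[dnnResourceDst] = out;
      bool ok = (dnnExecute_F32(sum, res) == E_SUCCESS);

      dnnDelete_F32(sum);
      dnnLayoutDelete_F32(layout);
      return ok;
    }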
-* -* \file mkl_fully_connected-inl.h -* \brief -* \author zhenlin.luo@intel.com -* lingyan.guo@intel.com -* -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_ -#include -#include -#include -#include "../activation-inl.h" -#include "./mkl_util-inl.h" - -namespace mxnet { -namespace op { - -template -class MKLFullyConnectedOp : public Operator { - public: - explicit MKLFullyConnectedOp(const FullyConnectedParam& p, - const std::vector& in_shapes, - const std::vector& out_shapes): - param_(p) { - LayerSetUp(in_shapes, out_shapes); - } - - ~MKLFullyConnectedOp() { - dnnDelete(fullyConnectedFwd); - dnnDelete(fullyConnectedBwdData); - dnnDelete(fullyConnectedBwdFilter); - dnnDelete(fullyConnectedBwdBias); - } - static std::string getName() { - return "MKLFullyConnectedOp"; - } - - private: - void LayerSetUp(const std::vector& in_shapes, - const std::vector& out_shapes) { - const TShape& ishape = in_shapes[fullc::kData]; - - const size_t dim = 4; - const size_t src_sizes[4] = {1, 1, ishape.ProdShape(1, ishape.ndim()), ishape[0]}; - const size_t dst_sizes[2] = {param_.num_hidden, ishape[0]}; - const size_t output_channels = param_.num_hidden; - - dnnPrimitiveAttributes_t attributes = NULL; - MKLDNN_CALL(dnnPrimitiveAttributesCreate(&attributes)); - if (!param_.no_bias) { - MKLDNN_CALL(dnnInnerProductCreateForwardBias( - &fullyConnectedFwd, - attributes, - dim, - src_sizes, - output_channels)); - } else { - MKLDNN_CALL(dnnInnerProductCreateForward( - &fullyConnectedFwd, - attributes, - dim, - src_sizes, - output_channels)); - } - MKLDNN_CALL(dnnInnerProductCreateBackwardData( - &fullyConnectedBwdData, - attributes, - dim, - src_sizes, - output_channels)); - MKLDNN_CALL(dnnInnerProductCreateBackwardFilter( - &fullyConnectedBwdFilter, - attributes, - dim, - src_sizes, - output_channels)); - if (!param_.no_bias) { - MKLDNN_CALL(dnnInnerProductCreateBackwardBias( - &fullyConnectedBwdBias, - attributes, - 2, - dst_sizes)); - } - // TODO(minjie): Shouldn't `attributes` be destroyed? - } - - - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - - void* res_fullyConnected[dnnResourceNumber]; - if (req[fullc::kOut] == kNullOp) return; - CHECK_EQ(req[fullc::kOut], kWriteTo); - CHECK_EQ(in_data.size(), param_.no_bias ? 
2 : 3); - CHECK_EQ(out_data.size(), 1); - Stream *s = ctx.get_stream(); - - const TShape& ishape = in_data[fullc::kData].shape_; - const TShape& oshape = out_data[fullc::kOut].shape_; - - Tensor data; - Tensor out; - - Shape4(in_data[fullc::kData].shape_[0], in_data[fullc::kData].shape_[1], 1, 1); - - Shape<4> dshape = Shape4(ishape[0], ishape.ProdShape(1, ishape.ndim()), 1, 1); - Shape<4> odshape = Shape4(oshape[0], oshape.ProdShape(1, oshape.ndim()), 1, 1); - - data = in_data[fullc::kData].get_with_shape(dshape, s); - out = out_data[fullc::kOut].get_with_shape(odshape, s); - res_fullyConnected[dnnResourceSrc] = - reinterpret_cast(in_data[fullc::kData].dptr_); - res_fullyConnected[dnnResourceDst] = - reinterpret_cast(out_data[fullc::kOut].dptr_); - res_fullyConnected[dnnResourceFilter] = - reinterpret_cast(in_data[fullc::kWeight].dptr_); - if (!param_.no_bias) { - res_fullyConnected[dnnResourceBias] = reinterpret_cast(in_data[fullc::kBias].dptr_); - } - - MKLDNN_CALL(dnnExecute(fullyConnectedFwd, res_fullyConnected)); - } - - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - - void* res_fullyConnected[dnnResourceNumber]; - CHECK_EQ(out_grad.size(), 1); - const size_t expected = param_.no_bias ? 2 : 3; - CHECK(in_data.size() == expected && in_grad.size() == expected); - CHECK_EQ(req.size(), expected); - res_fullyConnected[dnnResourceSrc] = - reinterpret_cast(in_data[fullc::kData].dptr_); - res_fullyConnected[dnnResourceFilter] = - reinterpret_cast(in_data[fullc::kWeight].dptr_); - - res_fullyConnected[dnnResourceDiffDst] = - reinterpret_cast(out_grad[fullc::kOut].dptr_); - res_fullyConnected[dnnResourceDiffSrc] = - reinterpret_cast(in_grad[fullc::kData].dptr_); - res_fullyConnected[dnnResourceDiffFilter] = - reinterpret_cast(in_grad[fullc::kWeight].dptr_); - if (!param_.no_bias) { - res_fullyConnected[dnnResourceDiffBias] = - reinterpret_cast(in_grad[fullc::kBias].dptr_); - } - MKLDNN_CALL(dnnExecute(fullyConnectedBwdFilter, res_fullyConnected)); - if (!param_.no_bias) { - MKLDNN_CALL(dnnExecute(fullyConnectedBwdBias, res_fullyConnected)); - } - MKLDNN_CALL(dnnExecute(fullyConnectedBwdData, res_fullyConnected)); - } - - private: - dnnPrimitive_t fullyConnectedFwd{nullptr}; - dnnPrimitive_t fullyConnectedBwdData{nullptr}; - dnnPrimitive_t fullyConnectedBwdFilter{nullptr}; - dnnPrimitive_t fullyConnectedBwdBias{nullptr}; - const FullyConnectedParam param_; -}; // class MKLFullyConnectedOp -} // namespace op -} // namespace mxnet - -#endif // MXNET_OPERATOR_MKL_MKL_FULLY_CONNECTED_INL_H_ diff --git a/src/operator/mkl/mkl_lrn-inl.h b/src/operator/mkl/mkl_lrn-inl.h deleted file mode 100644 index 90dfad50fa62..000000000000 --- a/src/operator/mkl/mkl_lrn-inl.h +++ /dev/null @@ -1,265 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_lrn-inl.h -* \brief -* \author zhenlin.luo@intel.com -* lingyan.guo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_LRN_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_LRN_INL_H_ -#include -#include -#include -#include -#include -#include -#include -#include "../operator_common.h" -#include "../mshadow_op.h" -#include "./mkl_util-inl.h" - -namespace mxnet { -namespace op { - -template -class MKLLRNOp : public Operator { - public: - static std::string getName() { - return "MKLLRNOp"; - } - - explicit MKLLRNOp(LRNParam param) : - lrnFwd(static_cast(NULL)), - lrnBwd(static_cast(NULL)), - lrn_buffer_(NULL) { - this->param_ = param; - fwd_top_data_ = MKLData::create(); - fwd_bottom_data_ = MKLData::create(); - bwd_top_diff_ = MKLData::create(); - bwd_bottom_diff_ = MKLData::create(); - init_mkldnn_ = false; - } - - virtual ~MKLLRNOp() { - if (lrnFwd != NULL) { - dnnDelete(lrnFwd); - lrnFwd = NULL; - } - if (lrnBwd != NULL) { - dnnDelete(lrnBwd); - lrnBwd = NULL; - } - dnnReleaseBuffer(lrn_buffer_); - } - - private: - void LayerSetup(const mshadow::Tensor &data, - const mshadow::Tensor &out) { - size_ = param_.nsize; - CHECK_EQ(size_ % 2, 1) << "LRN only supports odd values for local size"; - - alpha_ = param_.alpha; - beta_ = param_.beta; - k_ = param_.knorm; - size_t dim = 4, sizes[4], strides[4]; - channels_ = data.shape_[1]; - height_ = data.shape_[2]; - width_ = data.shape_[3]; - num_ = data.shape_[0]; - sizes[0] = width_; - sizes[1] = height_; - sizes[2] = channels_; - sizes[3] = num_; - - strides[0] = 1; - strides[1] = sizes[0]; - strides[2] = sizes[0] * sizes[1]; - strides[3] = sizes[0] * sizes[1] * sizes[2]; - - fwd_bottom_data_->name = "fwd_bottom_data_ @ " + getName(); - fwd_top_data_->name = "fwd_top_data_ @ " + getName(); - bwd_top_diff_->name = "bwd_top_diff_ @ " + getName(); - bwd_bottom_diff_->name = "bwd_bottom_diff_ @ " + getName(); - - fwd_bottom_data_->create_user_layout(dim, sizes, strides); - fwd_top_data_->create_user_layout(dim, sizes, strides); - bwd_bottom_diff_->create_user_layout(dim, sizes, strides); - bwd_top_diff_->create_user_layout(dim, sizes, strides); - } - - public: - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 2U); - CHECK_EQ(param_.nsize % 2, 1U) << "LRN only supports odd values for local_size"; - Stream *s = ctx.get_stream(); - Tensor data = mkl_experimental_direct_get( - in_data[lrn_enum::kData], s); - Tensor out = mkl_experimental_direct_get( - out_data[lrn_enum::kOut], s); - if (!init_mkldnn_) { - LayerSetup(data, out); - init_mkldnn_ = true; - } - - const void* bottom_data = NULL; -#if MKL_EXPERIMENTAL == 1 - bottom_data = - reinterpret_cast(mkl_prv_data(in_data[lrn_enum::kData])); -#endif -#if MKL_EXPERIMENTAL == 1 - if (NULL != bottom_data) { - if (lrnFwd == NULL) { - std::shared_ptr bottom_data_mem = - in_data[lrn_enum::kData].Mkl_mem_; - std::shared_ptr bottom_prv_descriptor = - bottom_data_mem->get_prv_descriptor(); - CHECK_EQ(bottom_prv_descriptor->get_descr_type(), - PrvMemDescr::PRV_DESCR_MKL2017); - std::shared_ptr > mem_descr - = std::static_pointer_cast>(bottom_prv_descriptor); - CHECK(mem_descr != nullptr); - 
fwd_bottom_data_ = mem_descr; - - dnnError_t e; - dnnLayout_t lrn_buffer_l = NULL; - - e = dnnLRNCreateForward(&lrnFwd, NULL, fwd_bottom_data_->layout_int, - size_, alpha_, beta_, k_); - CHECK_EQ(e, E_SUCCESS); - - fwd_top_data_->create_internal_layout(lrnFwd, dnnResourceDst); - - e = dnnLRNCreateBackward(&lrnBwd, NULL, - fwd_bottom_data_->layout_int, fwd_bottom_data_->layout_int, - size_, alpha_, beta_, k_); - CHECK_EQ(e, E_SUCCESS); - - e = dnnLayoutCreateFromPrimitive( - &lrn_buffer_l, lrnFwd, dnnResourceWorkspace); - CHECK_EQ(e, E_SUCCESS); - e = dnnAllocateBuffer( - reinterpret_cast(&lrn_buffer_), lrn_buffer_l); - CHECK_EQ(e, E_SUCCESS); - dnnLayoutDelete(lrn_buffer_l); - - bwd_top_diff_->create_internal_layout(lrnBwd, dnnResourceDiffDst); - bwd_bottom_diff_->create_internal_layout(lrnBwd, dnnResourceDiffSrc); - } - } -#endif - if (bottom_data == NULL) { - if (lrnFwd == NULL) { - dnnError_t e; - dnnLayout_t lrn_buffer_l = NULL; - e = dnnLRNCreateForward(&lrnFwd, NULL, fwd_bottom_data_->layout_usr, - size_, alpha_, beta_, k_); - CHECK_EQ(e, E_SUCCESS); - - e = dnnLayoutCreateFromPrimitive( - &lrn_buffer_l, lrnFwd, dnnResourceWorkspace); - CHECK_EQ(e, E_SUCCESS); - e = dnnAllocateBuffer( - reinterpret_cast(&lrn_buffer_), lrn_buffer_l); - CHECK_EQ(e, E_SUCCESS); - dnnLayoutDelete(lrn_buffer_l); - - e = dnnLRNCreateBackward(&lrnBwd, NULL, - fwd_bottom_data_->layout_usr, fwd_bottom_data_->layout_usr, - size_, alpha_, beta_, k_); - CHECK_EQ(e, E_SUCCESS); - } - bottom_data = data.dptr_; - } - - dnnError_t e; - void* lrn_res[dnnResourceNumber]; - lrn_res[dnnResourceSrc] = const_cast(bottom_data); - - lrn_res[dnnResourceDst] = fwd_top_data_->get_output_ptr( - out.dptr_, fwd_top_data_, out_data[lrn_enum::kOut]); - lrn_res[dnnResourceWorkspace] = lrn_buffer_; - e = dnnExecute(lrnFwd, lrn_res); - CHECK_EQ(e, E_SUCCESS); - } - - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1); - CHECK_EQ(in_data.size(), 1); - CHECK_EQ(out_data.size(), 2); - Stream *s = ctx.get_stream(); - Tensor grad = mkl_experimental_direct_get( - out_grad[lrn_enum::kOut], s); - Tensor data = mkl_experimental_direct_get( - in_data[lrn_enum::kData], s); - Tensor grad_in = mkl_experimental_direct_get( - in_grad[lrn_enum::kData], s); - dnnError_t e; - void* lrn_res[dnnResourceNumber]; - lrn_res[dnnResourceDiffDst] = - bwd_top_diff_->get_converted_prv(grad.dptr_, true, out_grad[lrn_enum::kOut]); - lrn_res[dnnResourceWorkspace] = lrn_buffer_; - lrn_res[dnnResourceSrc] = - fwd_bottom_data_->get_converted_prv(data.dptr_, false, in_data[lrn_enum::kData]); - - lrn_res[dnnResourceDiffSrc] = bwd_bottom_diff_->get_output_ptr( - grad_in.dptr_, bwd_bottom_diff_, in_grad[lrn_enum::kData]); - e = dnnExecute(lrnBwd, lrn_res); - CHECK_EQ(e, E_SUCCESS); - } - - private: - LRNParam param_; - int size_; - int pre_pad_; - DType alpha_; - DType beta_; - DType k_; - int num_; - int channels_; - int height_; - int width_; - bool init_mkldnn_; - - private: - dnnPrimitive_t lrnFwd, lrnBwd; - std::shared_ptr > fwd_top_data_; - std::shared_ptr > fwd_bottom_data_; - - std::shared_ptr > bwd_top_diff_; - std::shared_ptr > bwd_bottom_diff_; - - DType *lrn_buffer_; -}; // class LocalResponseNormOp -} // namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_LRN_INL_H_ - diff --git 
a/src/operator/mkl/mkl_memory-inl.h b/src/operator/mkl/mkl_memory-inl.h deleted file mode 100644 index 71af10254b2a..000000000000 --- a/src/operator/mkl/mkl_memory-inl.h +++ /dev/null @@ -1,137 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_memory-inl.h -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_ - - -#include -#include -#include -#include "mkl_cppwrapper.h" - -namespace mxnet { - -template -struct MKLMemoryDescriptorBase : public PrvMemDescr, - public std::enable_shared_from_this > { - MKLMemoryDescriptorBase() : layout_usr(NULL), layout_int(NULL), - convert_to_int(NULL), convert_from_int(NULL), convert_prv2prv(NULL), - name("UNKNOWN"), internal_ptr(NULL) {} - virtual ~MKLMemoryDescriptorBase() { - dnnLayoutDelete(layout_usr); - dnnLayoutDelete(layout_int); - if (internal_ptr != NULL) { - dnnReleaseBuffer(internal_ptr); - internal_ptr = NULL; - } - if (convert_to_int != NULL) { - dnnDelete(convert_to_int); - convert_to_int = NULL; - } - if (convert_from_int != NULL) { - dnnDelete(convert_from_int); - convert_from_int = NULL; - } - if (convert_prv2prv != NULL) { - dnnDelete(convert_prv2prv); - convert_prv2prv = NULL; - } - } - std::shared_ptr > get_shared_ptr() { - return this->shared_from_this(); - } - - dnnLayout_t layout_usr; - dnnLayout_t layout_int; - dnnPrimitive_t convert_to_int; - dnnPrimitive_t convert_from_int; - dnnPrimitive_t convert_prv2prv; - std::shared_ptr > descr_prv2prv_conversion; - - - std::string name; // for debugging purposes - void allocate() { - if (internal_ptr == NULL) { - int status = dnnAllocateBuffer( - reinterpret_cast(&internal_ptr), layout_int); - CHECK_EQ(status, E_SUCCESS) - << "Failed internal_ptr memory allocation with status " - << status << "\n"; - } - } - virtual void* prv_ptr(bool allocate_when_uninit = true) { - if (internal_ptr == NULL && allocate_when_uninit) - allocate(); - return internal_ptr; - } - inline bool conversion_needed() { - return (convert_to_int != NULL); - } - void create_conversions(); - void create_internal_layout(const dnnPrimitive_t primitive, - dnnResourceType_t type); - void create_user_layout(size_t dimension, const size_t size[], - const size_t strides[]); - void create_layouts( - const dnnPrimitive_t primitive, dnnResourceType_t type, - size_t dimension, const size_t size[], const size_t strides[]); - - virtual PrvDescrType get_descr_type() { - return PRV_DESCR_MKL2017; - } - virtual size_t prv_size() { - return dnnLayoutGetMemorySize(layout_int); - } - virtual size_t prv_count() { - return dnnLayoutGetMemorySize(layout_int) / sizeof(DType); - } - virtual void convert_from_prv(void* cpu_ptr); - virtual void convert_to_prv(void* cpu_ptr); - virtual bool layout_compare(std::shared_ptr 
other); - virtual void convert_from_other(std::shared_ptr other); - protected: - DType* internal_ptr; -}; - -template -struct MKLMemoryDescriptor : MKLMemoryDescriptorBase { - // The last get_converted_prv() argument is a hack for reusing - // in backward a conversion done already in the forward direction. - DType* get_converted_prv(DType *data_ptr, bool set_prv_ptr, - const TBlob &blob); - void* get_output_ptr(DType *data_ptr, std::shared_ptr > self_ptr, - const TBlob &blob, bool in_place = false); - bool copy_from(std::shared_ptr dnn_chunk); - MKLMemoryDescriptor() {} -}; - -template struct MKLData : MKLMemoryDescriptor { - static std::shared_ptr > create() { - return std::make_shared >(); - } -}; - -template struct MKLData; -template struct MKLData; - -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_MEMORY_INL_H_ diff --git a/src/operator/mkl/mkl_memory.cc b/src/operator/mkl/mkl_memory.cc deleted file mode 100644 index 7682fe1c1f37..000000000000 --- a/src/operator/mkl/mkl_memory.cc +++ /dev/null @@ -1,291 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_memory.cc -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#include "../operator_common.h" - -#if MXNET_USE_MKL2017 == 1 -#include -#include "mkl_memory-inl.h" -#include "mkl_util-inl.h" - -namespace mxnet { - -template -void MKLMemoryDescriptorBase::create_conversions() { - int status; - if (this->convert_from_int) { - status = dnnDelete(this->convert_from_int); - CHECK_EQ(status, E_SUCCESS); - this->convert_from_int = NULL; - } - if (this->convert_to_int) { - status = dnnDelete(this->convert_to_int); - CHECK_EQ(status, E_SUCCESS); - this->convert_to_int = NULL; - } - if (layout_int - && !dnnLayoutCompare(layout_usr, layout_int)) { - CHECK(layout_usr); - status = dnnConversionCreate(&convert_to_int, layout_usr, - layout_int); - CHECK_EQ(status, E_SUCCESS) - << "Failed creation convert_to_int with status " - << status << " for buffer: " << this->name << "\n"; - status = dnnConversionCreate(&convert_from_int, layout_int, - layout_usr); - CHECK_EQ(status, E_SUCCESS) - << "Failed creation convert_from_int with status " - << status << " for buffer: " << this->name << "\n"; - } -} - -template -void MKLMemoryDescriptorBase::create_internal_layout( - const dnnPrimitive_t primitive, dnnResourceType_t type) { - int status; - if (this->layout_int) { - status = dnnLayoutDelete(this->layout_int); - CHECK_EQ(status, E_SUCCESS); - } - status = dnnLayoutCreateFromPrimitive( - &this->layout_int, primitive, type); - CHECK_EQ(status, E_SUCCESS) - << "Failed dnnLayoutCreateFromPrimitive with status " - << status << " for buffer: " << this->name << "\n"; - - if (this->layout_usr) - this->create_conversions(); -} - -template -void MKLMemoryDescriptorBase::create_user_layout( - size_t dimension, const size_t 
size[], const size_t strides[]) { - int status; - if (this->layout_usr) { - status = dnnLayoutDelete(this->layout_usr); - CHECK_EQ(status, E_SUCCESS); - } - - status = dnnLayoutCreate( - &this->layout_usr, dimension, size, strides); - CHECK_EQ(status, E_SUCCESS) << "Failed dnnLayoutCreate with status " - << status << " for buffer: " << this->name << "\n"; - - if (this->layout_int) - this->create_conversions(); -} - -template -void MKLMemoryDescriptorBase::create_layouts( - const dnnPrimitive_t primitive, dnnResourceType_t type, - size_t dimension, const size_t size[], const size_t strides[]) { - this->create_internal_layout(primitive, type); - this->create_user_layout(dimension, size, strides); -} - - -template -void MKLMemoryDescriptorBase::convert_from_prv(void* cpu_ptr) { - CHECK(cpu_ptr); - CHECK(this->convert_from_int); - int status; - void *convert_resources[dnnResourceNumber]; - - convert_resources[dnnResourceFrom] = this->prv_ptr(); - convert_resources[dnnResourceTo] = cpu_ptr; - status = dnnExecute(this->convert_from_int, convert_resources); - CHECK_EQ(status, 0) << "Conversion from prv failed with status " << status; -} - -template -void MKLMemoryDescriptorBase::convert_to_prv(void* cpu_ptr) { - CHECK(cpu_ptr); - CHECK(this->convert_to_int); - int status; - void *convert_resources[dnnResourceNumber]; - - convert_resources[dnnResourceFrom] = cpu_ptr; - convert_resources[dnnResourceTo] = this->prv_ptr(); - status = dnnExecute(this->convert_to_int, convert_resources); - CHECK_EQ(status, 0) << "Conversion from prv failed with status " << status; -} - - -template -bool MKLMemoryDescriptorBase::layout_compare( - std::shared_ptr other) { - CHECK_EQ(other->get_descr_type(), - PrvMemDescr::PRV_DESCR_MKL2017); - std::shared_ptr >other_descr = - std::static_pointer_cast > - (other); - - if (dnnLayoutCompare(other_descr->layout_int, - this->layout_int)) - return true; - else - return false; -} - -template -void MKLMemoryDescriptorBase::convert_from_other( - std::shared_ptr other) { - std::shared_ptr > other_descr = - std::static_pointer_cast > - (other); - - int status; - dnnPrimitive_t convert; - status = dnnConversionCreate(&convert, - other_descr->layout_int, this->layout_int); - - void *convert_resources[dnnResourceNumber]; - convert_resources[dnnResourceFrom] = other_descr->prv_ptr(); - convert_resources[dnnResourceTo] = this->prv_ptr(); - status = dnnExecute(convert, convert_resources); - CHECK_EQ(status, 0) << "Conversion from other failed with status " - << status; - - dnnDelete(convert); -} - - -template -Dtype* MKLMemoryDescriptor::get_converted_prv( - Dtype *cpu_ptr, bool set_prv_ptr, const TBlob &blob) { - Dtype* prv_ptr = NULL; - std::shared_ptr dnn_chunk = NULL; -#if MKL_EXPERIMENTAL == 1 - dnn_chunk = blob.Mkl_mem_; -#endif -#if MKL_EXPERIMENTAL == 1 - if (dnn_chunk != NULL) - prv_ptr = static_cast(dnn_chunk->prv_data()); -#endif - - if (this->convert_to_int != NULL) { -#if MKL_EXPERIMENTAL == 1 - int status; - void *convert_resources[dnnResourceNumber]; -#endif - if (prv_ptr == NULL) { - this->allocate(); - this->convert_to_prv(cpu_ptr); -#if MKL_EXPERIMENTAL == 1 - if (set_prv_ptr) { - dnn_chunk->set_prv_descriptor(this->get_shared_ptr(), true); - } -#endif - return this->internal_ptr; - } -#if MKL_EXPERIMENTAL == 1 - if (prv_ptr != NULL) { - std::shared_ptr > current_descr = - op::mkl_get_mem_desc(dnn_chunk); - if (!dnnLayoutCompare(current_descr->layout_int, - this->layout_int)) { - if (this->convert_prv2prv) { - CHECK_EQ(dnnLayoutCompare( - 
this->descr_prv2prv_conversion->layout_int, - this->layout_int), 0); - status = 0; - } else { - status = dnnConversionCreate(&this->convert_prv2prv, - current_descr->layout_int, this->layout_int); - if (status == 0) - this->descr_prv2prv_conversion = current_descr; - } - if (status != 0) { - this->allocate(); - convert_resources[dnnResourceFrom] = cpu_ptr; - convert_resources[dnnResourceTo] = - reinterpret_cast(this->internal_ptr); - status = dnnExecute(this->convert_to_int, convert_resources); - CHECK_EQ(status, 0) << "Conversion failed with status " << status; - } else { - this->allocate(); - convert_resources[dnnResourceFrom] = reinterpret_cast(prv_ptr); - convert_resources[dnnResourceTo] = - reinterpret_cast(this->internal_ptr); - status = dnnExecute(this->convert_prv2prv, convert_resources); - CHECK_EQ(status, 0) << "Conversion failed with status " << status; - } - if (set_prv_ptr) { - dnn_chunk->set_prv_descriptor(this->get_shared_ptr(), true); - } - return this->internal_ptr; - } else if (current_descr.get() != this) { - // MKL_DLOG(INFO) << "layout OK " - // << current_descr->name << " == " << this->name; - } - } -#endif - return const_cast(prv_ptr); - } else { - if (prv_ptr != NULL) { -#if MKL_EXPERIMENTAL == 1 - std::shared_ptr > other_descr = - std::static_pointer_cast > - (dnn_chunk->prv_descriptor_); - dnn_chunk->check_and_prv_to_cpu(cpu_ptr); -#endif - // printf("get_converted_prv release %s\n", other_descr->name.c_str()); - } - } - return cpu_ptr; -} - -template -void* MKLMemoryDescriptor::get_output_ptr(Dtype *data_ptr, - std::shared_ptr > self_ptr, const TBlob &blob, bool in_place) { -#if MKL_EXPERIMENTAL == 1 - std::shared_ptr dnn_chunk = blob.Mkl_mem_; -#endif - if (this->conversion_needed()) { - void * prv_ptr = this->prv_ptr(); -#if MKL_EXPERIMENTAL == 1 - if (!in_place) { - dnn_chunk->set_prv_descriptor(self_ptr); - } else { - Dtype * blob_prv = op::mkl_prv_data(blob); - if (blob_prv != NULL) - return blob_prv; - } -#endif - return prv_ptr; - } else { -#if MKL_EXPERIMENTAL == 1 - std::shared_ptr > other_descr = - std::static_pointer_cast > - (dnn_chunk->prv_descriptor_); - dnn_chunk->check_and_prv_to_cpu(data_ptr); -#endif - return data_ptr; - } -} - -template class MKLMemoryDescriptor; -template class MKLMemoryDescriptor; - -template class MKLMemoryDescriptorBase; -template class MKLMemoryDescriptorBase; -} // namespace mxnet -#endif diff --git a/src/operator/mkl/mkl_memory.h b/src/operator/mkl/mkl_memory.h deleted file mode 100644 index 13f1fd27b12b..000000000000 --- a/src/operator/mkl/mkl_memory.h +++ /dev/null @@ -1,123 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. 
-* -* \file mkl_memory.cc -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_MEMORY_H_ -#define MXNET_OPERATOR_MKL_MKL_MEMORY_H_ - -#include -#include -#include - - -namespace mxnet { -// Base class -struct PrvMemDescr { - virtual void convert_from_prv(void* cpu_ptr) = 0; - virtual void convert_to_prv(void* cpu_ptr) = 0; - virtual void convert_from_other(std::shared_ptr other) = 0; - virtual void* prv_ptr(bool allocate_when_uninit = true) = 0; - // returns true for matching layouts - virtual bool layout_compare(std::shared_ptr other) = 0; - virtual size_t prv_count() = 0; - virtual size_t prv_size() = 0; - // This might help using prv_ptr_ by different accelerators/engines - enum PrvDescrType { - PRV_DESCR_MKL2017, - PRV_DESCR_MKLDNN - }; - virtual PrvDescrType get_descr_type() = 0; -}; - -#if MKL_EXPERIMENTAL == 1 -// Currently HEAD_AT_PRV do not free CPU data -enum SyncedHead { - HEAD_AT_CPU, - HEAD_AT_PRV, -}; -struct MKLMemHolder { - SyncedHead head_; - std::shared_ptr prv_descriptor_; - bool b_disable_prv_2_cpu; - bool b_eager_mode; - void disable_prv_2_cpu(bool flag) { - b_disable_prv_2_cpu = flag; - } - void set_eager_mode(bool eager_mode) { - b_eager_mode = eager_mode; - } - void set_prv_descriptor(std::shared_ptr descriptor, bool same_data = false) { - head_ = HEAD_AT_PRV; - prv_descriptor_ = descriptor; - } - std::shared_ptr get_prv_descriptor() { - return prv_descriptor_; - } - bool head_at_prv() { - return (head_ == HEAD_AT_PRV) ? true : false; - } - void* prv_data(bool allocate_when_uninit = true) { - if (head_ != HEAD_AT_PRV) { - return NULL; - } - if (prv_descriptor_ == NULL) { - LOG(FATAL) << " prv_descriptor_ is NULL"; - } - CHECK(prv_descriptor_.get()); - return reinterpret_cast(prv_descriptor_->prv_ptr(allocate_when_uninit)); - } - - int prv_count() { - if (head_ != HEAD_AT_PRV) { - return 0; - } - if (prv_descriptor_ == NULL) { - LOG(FATAL) << " prv_descriptor_ is NULL"; - } - CHECK(prv_descriptor_.get()); - return prv_descriptor_->prv_count(); - } - static std::shared_ptr create() { - return std::make_shared(); - } - void check_and_prv_to_cpu(void *dptr_) { - if (!b_disable_prv_2_cpu && head_ == HEAD_AT_PRV) { - CHECK(prv_descriptor_ != nullptr); - prv_descriptor_->convert_from_prv(dptr_); - // Because operator use CPU & maybe change it, change to CPU Flag - head_ = HEAD_AT_CPU; - } - if (b_disable_prv_2_cpu) { - b_disable_prv_2_cpu = false; - } - } - MKLMemHolder() : - head_(HEAD_AT_CPU), prv_descriptor_(nullptr), - b_disable_prv_2_cpu(false), b_eager_mode(false) {} -}; -#else -struct MKLMemHolder { - public: - virtual std::shared_ptr get_prv_descriptor() = 0; -}; -#endif - -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_MEMORY_H_ diff --git a/src/operator/mkl/mkl_pooling-inl.h b/src/operator/mkl/mkl_pooling-inl.h deleted file mode 100644 index 5662a61aebd3..000000000000 --- a/src/operator/mkl/mkl_pooling-inl.h +++ /dev/null @@ -1,357 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. 
-* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_pooling-inl.h -* \brief -* \author zhenlin.luo@intel.com -* lingyan.guo@intel.com -* -*******************************************************************************/ - -#ifndef MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_ -#include -#include -#include -#include "../operator_common.h" -#include "../nn/pooling-inl.h" -#include "./mkl_util-inl.h" - -namespace mxnet { -namespace op { - - -template -class MKLPoolingOp : public Operator { - public: - static std::string getName() { - return "MKLPoolingOp"; - } - explicit MKLPoolingOp(PoolingParam p) { - poolingFwd = static_cast(NULL); - poolingBwd = static_cast(NULL); - max_idx_data = static_cast(NULL); - fwd_top_data = MKLData::create(); - fwd_bottom_data = MKLData::create(); - bwd_top_diff = MKLData::create(); - bwd_bottom_diff = MKLData::create(); - this->param_ = p; - init_mkldnn_ = false; - } - virtual ~MKLPoolingOp() { - if (poolingFwd != NULL) { - dnnDelete(poolingFwd); - poolingFwd = NULL; - } - if (poolingBwd != NULL) { - dnnDelete(poolingBwd); - poolingBwd = NULL; - } - if (max_idx_data != NULL) { - dnnReleaseBuffer(max_idx_data); - max_idx_data = NULL; - } - } - - private: - void LayerSetUp(const mshadow::Tensor &data, - const mshadow::Tensor &out) { - channels_ = data.shape_[1]; - height_ = data.shape_[2]; - width_ = data.shape_[3]; - num_ = data.shape_[0]; - global_pooling_ = param_.global_pool; - if (global_pooling_) { - kernel_h_ = height_; - kernel_w_ = width_; - } else { - kernel_h_ = param_.kernel[0]; - kernel_w_ = param_.kernel[1]; - } - CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero."; - CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero."; - pad_h_ = param_.pad[0]; - pad_w_ = param_.pad[1]; - if (global_pooling_) { - stride_h_ = stride_w_ = 1; - } else { - stride_h_ = param_.stride[0]; - stride_w_ = param_.stride[1]; - } - if (global_pooling_) { - CHECK(pad_h_ == 0 && pad_w_ == 0 && stride_h_ == 1 && stride_w_ == 1) - << "With Global_pooling: true; only pad = 0 and stride = 1"; - } - if (pad_h_ != 0 || pad_w_ != 0) { - CHECK(param_.pool_type == pool_enum::kAvgPooling - || param_.pool_type == pool_enum::kMaxPooling) - << "Padding implemented only for average and max pooling."; - CHECK_LT(pad_h_, kernel_h_); - CHECK_LT(pad_w_, kernel_w_); - } - pooled_height_ = out.shape_[2]; - pooled_width_ = out.shape_[3]; - - size_t dim = 4; - size_t src_sizes[4], src_strides[4]; - size_t dst_sizes[4], dst_strides[4]; - src_sizes[0] = width_; - src_sizes[1] = height_; - src_sizes[2] = channels_; - src_sizes[3] = num_; - src_strides[0] = 1; - src_strides[1] = src_sizes[0]; - src_strides[2] = src_sizes[0] * src_sizes[1]; - src_strides[3] = src_sizes[0] * src_sizes[1] * src_sizes[2]; - dst_sizes[0] = pooled_width_; - dst_sizes[1] = pooled_height_; - dst_sizes[2] = src_sizes[2]; - dst_sizes[3] = src_sizes[3]; - dst_strides[0] = 1; - dst_strides[1] = dst_sizes[0]; - dst_strides[2] = dst_sizes[0] * dst_sizes[1]; - dst_strides[3] = dst_sizes[0] * dst_sizes[1] * dst_sizes[2]; - src_offset[0] = -pad_w_; - src_offset[1] = -pad_h_; - src_offset[2] = -pad_w_; - 
src_offset[3] = -pad_h_; - kernel_stride[0] = stride_w_; - kernel_stride[1] = stride_h_; - kernel_size[0] = kernel_w_; - kernel_size[1] = kernel_h_; - - // Names are for debugging only - fwd_bottom_data->name = "fwd_bottom_data @ " + getName(); - fwd_top_data->name = "fwd_top_data @ " + getName(); - bwd_top_diff->name = "bwd_top_diff @ " + getName(); - bwd_bottom_diff->name = "bwd_bottom_diff @ " + getName(); - - fwd_bottom_data->create_user_layout(dim, src_sizes, src_strides); - fwd_top_data->create_user_layout(dim, dst_sizes, dst_strides); - bwd_bottom_diff->create_user_layout(dim, src_sizes, src_strides); - bwd_top_diff->create_user_layout(dim, dst_sizes, dst_strides); - - // Primitives will be allocated during the first fwd pass - poolingFwd = NULL; - poolingBwd = NULL; - max_idx_data = NULL; - } - - public: - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1); - CHECK_EQ(out_data.size(), 1); - Stream *s = ctx.get_stream(); - if (param_.kernel.ndim() >= 3) { - LOG(FATAL) << "Not implmented"; - } - Tensor data = mkl_experimental_direct_get( - in_data[pool_enum::kData], s); - Tensor out = mkl_experimental_direct_get( - out_data[pool_enum::kOut], s); - if (!init_mkldnn_) { - LayerSetUp(data, out); - init_mkldnn_ = true; - } - auto first_pass = false; - if (poolingFwd == NULL) first_pass = true; - - dnnAlgorithm_t algorithm = dnnAlgorithmPoolingMax; - - switch (param_.pool_type) { - case pool_enum::kMaxPooling: - algorithm = dnnAlgorithmPoolingMax; - break; - case pool_enum::kAvgPooling: - algorithm = dnnAlgorithmPoolingAvgIncludePadding; - - break; - default: - LOG(FATAL) << "Unknown pooling method."; - } - - dnnError_t status; - void* pooling_res[dnnResourceNumber]; - - void* bottom_data = NULL; -#if MKL_EXPERIMENTAL == 1 - bottom_data = - reinterpret_cast(mkl_prv_data(in_data[pool_enum::kData])); -#endif - dnnBorder_t border_type = dnnBorderZerosAsymm; - switch (param_.pooling_convention) { - case pool_enum::kFull: - border_type = dnnBorderZeros; - break; - case pool_enum::kValid: - border_type = dnnBorderZerosAsymm; - break; - default: - border_type = dnnBorderZerosAsymm; - break; - } - if (NULL == bottom_data) { - bottom_data = data.dptr_; - if (NULL == poolingFwd) { - status = dnnPoolingCreateForward(&poolingFwd, NULL, - algorithm, fwd_bottom_data->layout_usr, - kernel_size, kernel_stride, - src_offset, border_type); - CHECK_EQ(status, E_SUCCESS); - // Now create poolingBwd - status = dnnPoolingCreateBackward(&poolingBwd, NULL, - algorithm, fwd_bottom_data->layout_usr, - kernel_size, kernel_stride, - src_offset, border_type); - CHECK_EQ(status, E_SUCCESS); - } - } -#if MKL_EXPERIMENTAL == 1 - if (NULL != bottom_data) { - if (NULL == poolingFwd) { - std::shared_ptr bottom_data_mem = in_data[pool_enum::kData].Mkl_mem_; - std::shared_ptr bottom_prv_descriptor = - bottom_data_mem->get_prv_descriptor(); - CHECK_EQ(bottom_prv_descriptor->get_descr_type(), - PrvMemDescr::PRV_DESCR_MKL2017); - std::shared_ptr > mem_descr - = std::static_pointer_cast>(bottom_prv_descriptor); - CHECK(mem_descr != nullptr); - fwd_bottom_data = mem_descr; - - status = dnnPoolingCreateForward(&poolingFwd, NULL, - algorithm, fwd_bottom_data->layout_int, - kernel_size, kernel_stride, - src_offset, border_type); - CHECK_EQ(status, E_SUCCESS); - fwd_top_data->create_internal_layout(poolingFwd, dnnResourceDst); - - // 
Now create poolingBwd - status = dnnPoolingCreateBackward(&poolingBwd, NULL, - algorithm, fwd_bottom_data->layout_int, - kernel_size, kernel_stride, - src_offset, border_type); - CHECK_EQ(status, E_SUCCESS); - bwd_top_diff->create_internal_layout(poolingFwd, dnnResourceDst); - bwd_bottom_diff->create_internal_layout(poolingFwd, dnnResourceSrc); - } - } -#endif - - if (first_pass) { - dnnLayout_t max_idx_datal = NULL; - status = dnnLayoutCreateFromPrimitive( - &max_idx_datal, poolingFwd, dnnResourceWorkspace); - CHECK_EQ(status, E_SUCCESS); - status = dnnAllocateBuffer(reinterpret_cast(&max_idx_data), max_idx_datal); - CHECK_EQ(status, E_SUCCESS); -#if MKL_EXPERIMENTAL == 0 - fwd_bottom_data->create_internal_layout(poolingFwd, dnnResourceSrc); - fwd_top_data->create_internal_layout(poolingFwd, dnnResourceDst); - bwd_top_diff->create_internal_layout(poolingBwd, dnnResourceDiffDst); - bwd_bottom_diff->create_internal_layout(poolingBwd, dnnResourceDiffSrc); -#endif - dnnLayoutDelete(max_idx_datal); - first_pass = false; - } - pooling_res[dnnResourceSrc] = bottom_data; - pooling_res[dnnResourceWorkspace] = max_idx_data; - - pooling_res[dnnResourceDst] = fwd_top_data->get_output_ptr( - out.dptr_, fwd_top_data, out_data[pool_enum::kOut]); - status = dnnExecute(poolingFwd, pooling_res); - CHECK_EQ(status, E_SUCCESS); -#if MKL_EXPERIMENTAL == 0 - if (fwd_top_data->conversion_needed()) { - fwd_top_data->convert_from_prv(out.dptr_); - } -#endif - } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - if (!req[0]) { - return; - } - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1); - CHECK_EQ(in_data.size(), 1); - CHECK_EQ(out_data.size(), 1); - CHECK_EQ(req.size(), 1); - CHECK_EQ(in_grad.size(), 1); - if (param_.kernel.ndim() >= 3) { - LOG(FATAL) << "Not implmented"; - } - Stream *s = ctx.get_stream(); - Tensor grad = mkl_experimental_direct_get( - out_grad[pool_enum::kOut], s); - Tensor input_grad = mkl_experimental_direct_get( - in_grad[pool_enum::kData], s); - dnnError_t e; - void* pooling_res[dnnResourceNumber]; - pooling_res[dnnResourceWorkspace] = reinterpret_cast(max_idx_data); - - pooling_res[dnnResourceDiffDst] = - bwd_top_diff->get_converted_prv(grad.dptr_, true, out_grad[pool_enum::kOut]); - - pooling_res[dnnResourceDiffSrc] = bwd_bottom_diff->get_output_ptr( - input_grad.dptr_, bwd_bottom_diff, in_grad[pool_enum::kData]); - e = dnnExecute(poolingBwd, pooling_res); - CHECK_EQ(e, E_SUCCESS); -#if MKL_EXPERIMENTAL == 0 - if (bwd_bottom_diff->conversion_needed()) { - bwd_bottom_diff->convert_from_prv(input_grad.dptr_); - } -#endif - } - - private: - PoolingParam param_; - int kernel_h_, kernel_w_; - int stride_h_, stride_w_; - int pad_h_, pad_w_; - int channels_, num_; - int height_, width_; - int pooled_height_, pooled_width_; - bool global_pooling_; - - private: - size_t kernel_size[2], - kernel_stride[4]; - int src_offset[4]; // 2*(dimension-2) - dnnPrimitive_t poolingFwd, poolingBwd; - DType *max_idx_data; - - std::shared_ptr > fwd_top_data; - std::shared_ptr > fwd_bottom_data; - std::shared_ptr > bwd_top_diff; - std::shared_ptr > bwd_bottom_diff; - bool init_mkldnn_; -}; // class MKLPoolingOp -} // namespace op -} // namespace mxnet - -#endif // MXNET_OPERATOR_MKL_MKL_POOLING_INL_H_ diff --git a/src/operator/mkl/mkl_relu-inl.h b/src/operator/mkl/mkl_relu-inl.h deleted file 
mode 100644 index 8d7ab5e1e2db..000000000000 --- a/src/operator/mkl/mkl_relu-inl.h +++ /dev/null @@ -1,272 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_relu-inl.h -* \brief -* \author zhenlin.luo@intel.com -* lingyan.guo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_RELU_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_RELU_INL_H_ - - -#include -#include -#include -#include -#include -#include -#include -#include -#include "../operator_common.h" -#include "./mkl_util-inl.h" - -namespace mxnet { -namespace op { - -template -class MKLReluOp : public Operator { - public: - static std::string getName() { - return "MKLReluOp"; - } - MKLReluOp(): - reluFwd_(NULL), - reluBwd_(NULL) { - init_mkldnn_ = false; - fwd_top_data_ = MKLData::create(); - fwd_bottom_data_ = MKLData::create(); - bwd_top_diff_ = MKLData::create(); - bwd_bottom_diff_ = MKLData::create(); - } - - ~MKLReluOp() { - if (reluFwd_ != NULL) { - dnnDelete(reluFwd_); - reluFwd_ = NULL; - } - if (reluBwd_ != NULL) { - dnnDelete(reluBwd_); - reluBwd_ = NULL; - } - } - - private: - void LayerSetUp(const mshadow::Tensor &data, - const mshadow::Tensor &out) { - size_t dim = 4; - size_t *sizes = new size_t[dim]; - size_t *strides = new size_t[dim]; - for (size_t d = 0; d < dim; ++d) { - (sizes)[d] = data.shape_[dim - 1 - d]; - (strides)[d] = (d == 0) ? 
1 : (strides)[d - 1] * (sizes)[d - 1]; - } - // Names are for debugging only - fwd_bottom_data_->name = "fwd_bottom_data @ " + getName(); - fwd_top_data_->name = "fwd_top_data @ " + getName(); - bwd_bottom_diff_->name = "bwd_bottom_diff @ " + getName(); - bwd_top_diff_->name = "bwd_top_diff @ " + getName(); - fwd_bottom_data_->create_user_layout(dim, (sizes), (strides)); - fwd_top_data_->create_user_layout(dim, (sizes), (strides)); - bwd_bottom_diff_->create_user_layout(dim, (sizes), (strides)); - bwd_top_diff_->create_user_layout(dim, (sizes), (strides)); - delete[] sizes; - delete[] strides; - } - - public: - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1); - CHECK_EQ(out_data.size(), 1); - Stream *s = ctx.get_stream(); - Tensor data; - Tensor out; - if (in_data[activation::kData].ndim() == 1) { - Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], 1, 1, 1); - data = mkl_experimental_direct_get_with_shape( - in_data[activation::kData], dshape, s); - out = mkl_experimental_direct_get_with_shape( - out_data[activation::kOut], dshape, s); - } else if (in_data[activation::kData].ndim() == 2) { - Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], - in_data[activation::kData].shape_[1], 1, 1); - data = mkl_experimental_direct_get_with_shape( - in_data[activation::kData], dshape, s); - out = mkl_experimental_direct_get_with_shape( - out_data[activation::kOut], dshape, s); - } else if (in_data[activation::kData].ndim() == 3) { - Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], - in_data[activation::kData].shape_[1], - in_data[activation::kData].shape_[2], 1); - data = mkl_experimental_direct_get_with_shape( - in_data[activation::kData], dshape, s); - out = mkl_experimental_direct_get_with_shape( - out_data[activation::kOut], dshape, s); - } else { - data = mkl_experimental_direct_get(in_data[activation::kData], s); - out = mkl_experimental_direct_get(out_data[activation::kOut], s); - } - if (!init_mkldnn_) { - LayerSetUp(data, out); - init_mkldnn_ = true; - } - void* bottom_data = NULL; -#if MKL_EXPERIMENTAL == 1 - bottom_data = - reinterpret_cast(mkl_prv_data(in_data[activation::kData])); -#endif -#if MKL_EXPERIMENTAL == 1 - if (bottom_data != NULL) { - if (reluFwd_ == NULL) { - std::shared_ptr > mem_descr = - mkl_get_mem_desc(in_data[activation::kData].Mkl_mem_); - DType negative_slope = 0; - dnnError_t e; - e = dnnReLUCreateForward(&reluFwd_, NULL, mem_descr->layout_int, - negative_slope); - CHECK_EQ(e, E_SUCCESS); - e = dnnReLUCreateBackward(&reluBwd_, NULL, mem_descr->layout_int, - mem_descr->layout_int, negative_slope); - CHECK_EQ(e, E_SUCCESS); - - fwd_bottom_data_ = mem_descr; - fwd_top_data_->create_internal_layout(reluFwd_, dnnResourceDst); - bwd_top_diff_->create_internal_layout(reluFwd_, dnnResourceDst); - bwd_bottom_diff_->create_internal_layout(reluFwd_, dnnResourceSrc); - } - } -#endif - if (bottom_data == NULL) { - bottom_data = data.dptr_; - if (reluFwd_ == NULL) { - dnnError_t e; - DType negative_slope = 0; - e = dnnReLUCreateForward(&reluFwd_, NULL, - fwd_bottom_data_->layout_usr, negative_slope); - CHECK_EQ(e, E_SUCCESS); - e = dnnReLUCreateBackward(&reluBwd_, NULL, - fwd_bottom_data_->layout_usr, fwd_bottom_data_->layout_usr, - negative_slope); - CHECK_EQ(e, E_SUCCESS); - } - } - dnnError_t e; - void* relu_res[dnnResourceNumber]; - 
relu_res[dnnResourceSrc] = bottom_data; - - relu_res[dnnResourceDst] = fwd_top_data_->get_output_ptr( - out.dptr_, fwd_top_data_, out_data[activation::kOut], (data.dptr_ == out.dptr_)); - e = dnnExecute(reluFwd_, relu_res); - CHECK_EQ(e, E_SUCCESS); -#if MKL_EXPERIMENTAL == 0 - if (fwd_top_data_->conversion_needed()) { - fwd_top_data_->convert_from_prv(out.dptr_); - } -#endif - } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - if (!req[0]) { - return; - } - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1); - CHECK(in_data.size() == 1 && in_grad.size() == 1); - CHECK_EQ(req.size(), 1); - Stream *s = ctx.get_stream(); - Tensor m_out_grad; - Tensor m_out_data; - Tensor m_in_grad; - - if (out_grad[activation::kOut].ndim() == 1) { - Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0], 1, 1, 1); - m_out_grad = mkl_experimental_direct_get_with_shape( - out_grad[activation::kOut], dshape, s); - m_out_data = mkl_experimental_direct_get_with_shape( - out_data[activation::kOut], dshape, s); - m_in_grad = mkl_experimental_direct_get_with_shape( - in_grad[activation::kData], dshape, s); - } else if (out_grad[activation::kOut].ndim() == 2) { - Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0], - out_grad[activation::kOut].shape_[1], 1, 1); - m_out_grad = mkl_experimental_direct_get_with_shape( - out_grad[activation::kOut], dshape, s); - m_out_data = mkl_experimental_direct_get_with_shape( - out_data[activation::kOut], dshape, s); - m_in_grad = mkl_experimental_direct_get_with_shape( - in_grad[activation::kData], dshape, s); - } else if (out_grad[activation::kOut].ndim() == 3) { - Shape<4> dshape = Shape4(out_grad[activation::kOut].shape_[0], - out_grad[activation::kOut].shape_[1], - out_grad[activation::kOut].shape_[2], 1); - m_out_grad = mkl_experimental_direct_get_with_shape( - out_grad[activation::kOut], dshape, s); - m_out_data = mkl_experimental_direct_get_with_shape( - out_data[activation::kOut], dshape, s); - m_in_grad = mkl_experimental_direct_get_with_shape( - in_grad[activation::kData], dshape, s); - } else { - m_out_grad = mkl_experimental_direct_get(out_grad[activation::kOut], s); - m_out_data = mkl_experimental_direct_get(out_data[activation::kOut], s); - m_in_grad = mkl_experimental_direct_get(in_grad[activation::kData], s); - } - dnnError_t e; - void* relu_res[dnnResourceNumber]; - - void* bottom_data = NULL; -#if MKL_EXPERIMENTAL == 1 - bottom_data = reinterpret_cast(mkl_prv_data(out_data[activation::kOut])); -#endif - if (NULL == bottom_data) { - bottom_data = reinterpret_cast(const_cast(m_out_data.dptr_)); - } - relu_res[dnnResourceSrc] = bottom_data; - relu_res[dnnResourceDiffDst] = bwd_top_diff_->get_converted_prv(m_out_grad.dptr_, - true, out_grad[activation::kOut]); - relu_res[dnnResourceDiffSrc] = bwd_bottom_diff_->get_output_ptr( - m_in_grad.dptr_, bwd_bottom_diff_, in_grad[activation::kData]); - e = dnnExecute(reluBwd_, relu_res); - CHECK_EQ(e, E_SUCCESS); -#if MKL_EXPERIMENTAL == 0 - if (bwd_bottom_diff_->conversion_needed()) { - bwd_bottom_diff_->convert_from_prv(m_in_grad.dptr_); - } -#endif - } - - private: - bool init_mkldnn_; - std::shared_ptr > fwd_top_data_; - std::shared_ptr > fwd_bottom_data_; - std::shared_ptr > bwd_top_diff_; - std::shared_ptr > bwd_bottom_diff_; - dnnPrimitive_t reluFwd_, reluBwd_; -}; // class MKLReluOp -} // 
namespace op -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_RELU_INL_H_ diff --git a/src/operator/mkl/mkl_util-inl.h b/src/operator/mkl/mkl_util-inl.h deleted file mode 100644 index 4ad786a2ce93..000000000000 --- a/src/operator/mkl/mkl_util-inl.h +++ /dev/null @@ -1,110 +0,0 @@ -/******************************************************************************* -* Copyright 2016 Intel Corporation -* -* Licensed under the Apache License, Version 2.0 (the "License"); -* you may not use this file except in compliance with the License. -* You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -* -* \file mkl_util-inl.h -* \brief -* \author lingyan.guo@intel.com -* zhenlin.luo@intel.com -* -*******************************************************************************/ -#ifndef MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_ -#define MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_ -#include -#define MKLDNN_CALL(func) \ - { \ - dnnError_t status = (func); \ - CHECK_EQ(status, E_SUCCESS) << "MKL DNN call failed (status: " << status << ")."; \ - } - - -namespace mxnet { -namespace op { - -#if MKL_EXPERIMENTAL == 1 - template - inline DType * mkl_prv_data(const TBlob &b) { - std::shared_ptr bottom_data_mem = b.Mkl_mem_; - bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv(); - if (mem_valid) { - return reinterpret_cast(bottom_data_mem->prv_data()); - } - return NULL; - } - - template - inline int mkl_prv_count(const TBlob &b) { - std::shared_ptr bottom_data_mem = b.Mkl_mem_; - bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv(); - if (mem_valid) { - return bottom_data_mem->prv_count(); - } - return 0; - } -#endif - inline void mkl_set_priv_flag(const TBlob &b) { -#if MKL_EXPERIMENTAL == 1 - std::shared_ptr bottom_data_mem = b.Mkl_mem_; - bool mem_valid = (bottom_data_mem != nullptr) && bottom_data_mem->head_at_prv(); - if (mem_valid) { - bottom_data_mem->disable_prv_2_cpu(true); - } -#endif - } -#if MKL_EXPERIMENTAL == 1 - template - inline std::shared_ptr > mkl_get_mem_desc( - const std::shared_ptr data_mem) { - std::shared_ptr prv_descriptor = - data_mem->get_prv_descriptor(); - CHECK_EQ(prv_descriptor->get_descr_type(), - PrvMemDescr::PRV_DESCR_MKL2017); - std::shared_ptr > mem_descr - = std::static_pointer_cast> - (prv_descriptor); - CHECK(mem_descr != NULL); - return mem_descr; - } -#endif - template - inline mshadow::Tensor mkl_experimental_direct_get( - const TBlob &b, mshadow::Stream *s) { - mkl_set_priv_flag(b); - return b.get(s); - } - template - inline mshadow::Tensor mkl_experimental_direct_get_with_shape( - const TBlob &b, const mshadow::Shape &shape, mshadow::Stream *s) { - mkl_set_priv_flag(b); - return b.get_with_shape(shape, s); - } -} // namespace op -#if MKL_EXPERIMENTAL == 1 -inline void mkl_tblobs_prv_to_cpu(const std::vector &data) { - for (size_t i = 0; i < data.size(); i++) { - std::shared_ptr mem_holder = data[i].Mkl_mem_; - if (mem_holder != nullptr && mem_holder->b_eager_mode) { - mem_holder->check_and_prv_to_cpu(data[i].dptr_); - } - } -} -inline void mkl_set_tblob_eager_mode(const TBlob &data) { - std::shared_ptr mem_holder = data.Mkl_mem_; - if (mem_holder != nullptr) { - 
mem_holder->set_eager_mode(true); - } -} -#endif -} // namespace mxnet -#endif // MXNET_OPERATOR_MKL_MKL_UTIL_INL_H_ diff --git a/src/operator/nn/activation-inl.h b/src/operator/nn/activation-inl.h index ac8b747f0f39..a440f97e1382 100644 --- a/src/operator/nn/activation-inl.h +++ b/src/operator/nn/activation-inl.h @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file activation-inl.h * \brief Activation operator - * \author Bing Xu + * \author Bing Xu, Da Zheng */ #ifndef MXNET_OPERATOR_NN_ACTIVATION_INL_H_ @@ -37,6 +37,7 @@ #include #include "../operator_common.h" #include "../mxnet_op.h" +#include "../mshadow_op.h" namespace mxnet { namespace op { @@ -45,6 +46,7 @@ namespace op { namespace activation { enum ActivationOpInputs {kData}; enum ActivationOpOutputs {kOut}; +enum ActivationOpResource {kTempSpace}; enum ActivationOpType {kReLU, kSigmoid, kTanh, kSoftReLU}; } // activation @@ -59,160 +61,148 @@ struct ActivationParam : public dmlc::Parameter { .add_enum("softrelu", activation::kSoftReLU) .describe("Activation function to be applied."); } -}; -/** - * \brief This is the implementation of activation operator. - * \tparam xpu The device that the op will be executed on. - */ -template -class ActivationOp : public Operator { - public: - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - Stream *s = ctx.get_stream(); - const TBlob& input = in_data[activation::kData]; - const size_t sz = input.shape_.Size(); - if (sz) { - MXNET_ASSIGN_REQ_SWITCH(req[activation::kOut], Req, { - mxnet_op::Kernel, xpu>::Launch( - s, sz, - out_data[activation::kOut].dptr(), - input.dptr()); - }); - } + bool operator==(const ActivationParam& other) const { + return this->act_type == other.act_type; } +}; - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - CHECK(in_data.size() == 1 && in_grad.size() == 1); - CHECK_EQ(req.size(), 1U); - Stream *s = ctx.get_stream(); - const TBlob& m_out_grad = out_grad[activation::kOut]; - const TBlob& m_out_data = out_data[activation::kOut]; - const TBlob& m_in_grad = in_grad[activation::kData]; - const size_t sz = m_out_data.shape_.Size(); - if (sz) { - MXNET_ASSIGN_REQ_SWITCH(req[activation::kData], Req, { - mxnet_op::Kernel, Req>, xpu>::Launch( - s, sz, - m_in_grad.dptr(), - m_out_grad.dptr(), - m_out_data.dptr()); - }); - } - } -}; // class ActivationOp - -// Declare Factory function, used for dispatch specialization -template -Operator* CreateOp(ActivationParam type, int dtype, const TShape& dshape); +} // namespace op +} // namespace mxnet -#if DMLC_USE_CXX11 -class ActivationProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); +namespace std { +template<> +struct hash { + size_t operator()(const mxnet::op::ActivationParam& val) { + return val.act_type; } +}; +} // namespace std + +namespace mxnet { +namespace op { - std::map GetParams() const override { - return param_.__DICT__(); +template +void ActivationForward(const OpContext &ctx, const TBlob &in_data, + const OpReqType &req, const TBlob 
&out_data) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = ctx.get_stream(); + const size_t sz = in_data.shape_.Size(); + if (sz) { + MXNET_ASSIGN_REQ_SWITCH(req, Req, { + mxnet_op::Kernel, xpu>::Launch( + s, sz, + out_data.dptr(), + in_data.dptr()); + }); } +} - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - CHECK_EQ(in_shape->size(), 1U) << "Input:[data]"; - const TShape &dshape = in_shape->at(activation::kData); - if (dshape.ndim() == 0) return false; - out_shape->clear(); - out_shape->push_back(dshape); - return true; +template +void ActivationBackward(const OpContext &ctx, const TBlob &out_grad, + const TBlob &out_data, const OpReqType &req, + const TBlob &in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + Stream *s = ctx.get_stream(); + const size_t sz = out_data.shape_.Size(); + if (sz) { + MXNET_ASSIGN_REQ_SWITCH(req, Req, { + mxnet_op::Kernel, Req>, xpu>::Launch( + s, sz, + in_grad.dptr(), + out_grad.dptr(), + out_data.dptr()); + }); } +} - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); - } +template +void ActivationComputeImpl(const ActivationParam ¶m, const OpContext &ctx, + const TBlob &input, OpReqType req, const TBlob &output) { + MSHADOW_REAL_TYPE_SWITCH(input.type_flag_, DType, { + switch (param.act_type) { + case activation::kReLU: + ActivationForward( + ctx, input, req, output); + break; + case activation::kSigmoid: + ActivationForward( + ctx, input, req, output); + break; + case activation::kTanh: + ActivationForward( + ctx, input, req, output); + break; + case activation::kSoftReLU: + ActivationForward( + ctx, input, req, output); + break; + default: + LOG(FATAL) << "unknown activation type"; } - out_type->clear(); - out_type->push_back(dtype); - return true; - } + }); +} - OperatorProperty* Copy() const override { - auto ptr = new ActivationProp(); - ptr->param_ = param_; - return ptr; - } +template +void ActivationGradComputeImpl(const ActivationParam ¶m, const OpContext &ctx, + const TBlob &out_grad, const TBlob &out_data, + OpReqType req, const TBlob &output) { + MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, { + switch (param.act_type) { + case activation::kReLU: + ActivationBackward( + ctx, out_grad, out_data, req, output); + break; + case activation::kSigmoid: + ActivationBackward( + ctx, out_grad, out_data, req, output); + break; + case activation::kTanh: + ActivationBackward( + ctx, out_grad, out_data, req, output); + break; + case activation::kSoftReLU: + ActivationBackward( + ctx, out_grad, out_data, req, output); + break; + default: + LOG(FATAL) << "unknown activation type"; + } + }); +} - std::string TypeString() const override { - return "Activation"; - } +template +void ActivationCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + const ActivationParam& param = nnvm::get(attrs.parsed); + ActivationComputeImpl(param, ctx, inputs[0], req[0], outputs[0]); +} - // decalre dependency and inplace 
optimization options - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { +template +void ActivationGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { #if MXNET_USE_CUDNN == 1 - return {out_grad[activation::kOut], out_data[activation::kOut], in_data[activation::kData]}; + CHECK_EQ(inputs.size(), 3U); #else - return {out_grad[activation::kOut], out_data[activation::kOut]}; -#endif // MXNET_USE_CUDNN - } + CHECK_EQ(inputs.size(), 2U); +#endif + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + const ActivationParam& param = nnvm::get(attrs.parsed); + ActivationGradComputeImpl(param, ctx, inputs[0], inputs[1], req[0], outputs[0]); +} - std::vector > BackwardInplaceOption( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &in_grad) const override { - return {{out_grad[activation::kOut], in_grad[activation::kData]}}; - } - - std::vector > ForwardInplaceOption( - const std::vector &in_data, - const std::vector &out_data) const override { - return {{in_data[activation::kData], out_data[activation::kOut]}}; - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; - } - - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; - - private: - ActivationParam param_; -}; -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_NN_ACTIVATION_INL_H_ diff --git a/src/operator/nn/activation.cc b/src/operator/nn/activation.cc index 401a9e3eaa56..0da644cb1f70 100644 --- a/src/operator/nn/activation.cc +++ b/src/operator/nn/activation.cc @@ -17,69 +17,130 @@ * under the License. */ + /*! * Copyright (c) 2015 by Contributors * \file activation.cc * \brief activation op - * \author Bing Xu + * \author Bing Xu, Da Zheng */ #include "./activation-inl.h" #include "../mshadow_op.h" -#if MXNET_USE_MKL2017 == 1 -#include -#include "../mkl/mkl_memory-inl.h" -#include "../mkl/mkl_relu-inl.h" -#endif // MXNET_USE_MKL2017 +#include "../tensor/elemwise_unary_op.h" +#if MXNET_USE_MKLDNN == 1 +#include "./mkldnn/mkldnn_base-inl.h" +#include "./mkldnn/mkldnn_ops-inl.h" +#endif // MXNET_USE_MKLDNN namespace mxnet { namespace op { -template<> -Operator *CreateOp(ActivationParam param, int dtype, const TShape& dshape) { - Operator *op = NULL; -#if MXNET_USE_MKL2017 == 1 - if (param.act_type == activation::kReLU && dshape.ndim() <= 4) { - switch (dtype) { - case mshadow::kFloat32: - return new MKLReluOp(); - case mshadow::kFloat64: - return new MKLReluOp(); - default: - break; - } + +DMLC_REGISTER_PARAMETER(ActivationParam); + +// This will determine the order of the inputs for backward computation. 
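[Editor's note: illustrative sketch, not part of the patch.] The stateful ActivationOp/ActivationProp classes are replaced by the stateless ActivationCompute/ActivationGradCompute kernels, and the ActivationGrad functor that follows is what fixes the argument order those backward kernels receive. Assuming the conventions shown in this diff (the tensor names below are hypothetical), the layout works out to:

// _backward_Activation argument layout produced by ActivationGrad:
//   inputs[0]  = dL/dy  (head gradient)
//   inputs[1]  = y      (forward output, activation::kOut)
//   inputs[2]  = x      (forward input; only present when MXNET_USE_CUDNN == 1)
//   outputs[0] = dL/dx  (written according to req[0])
// e.g. for sigmoid the backward kernel evaluates dL/dx = dL/dy * y * (1 - y),
// which needs only inputs[0] and inputs[1], hence the two-input non-CUDNN path.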
+struct ActivationGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + std::vector heads(ograds.begin(), ograds.end()); + heads.emplace_back(nnvm::NodeEntry{n, activation::kOut, 0}); +#if MXNET_USE_CUDNN == 1 + heads.push_back(n->inputs[activation::kData]); +#endif + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + +#if MXNET_USE_MKLDNN == 1 +static void ActivationComputeExCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const ActivationParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + if (SupportMKLDNN(inputs[0])) { + MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + MKLDNNActivationForward(attrs, ctx, inputs[0], req[0], outputs[0]); + MKLDNN_OPCHECK_RUN(ActivationCompute, attrs, ctx, inputs, req, outputs); + return; } - if (enableMKLWarnGenerated()) - LOG(INFO) << MKLReluOp::getName() << " Skip MKL optimization"; + ActivationComputeImpl(param, ctx, inputs[0].data(), req[0], outputs[0].data()); +} + +void ActivationGradComputeExCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { +#if MXNET_USE_CUDNN == 1 + CHECK_EQ(inputs.size(), 3U); +#else + CHECK_EQ(inputs.size(), 2U); #endif - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - switch (param.act_type) { - case activation::kReLU: - op = new ActivationOp(); - break; - case activation::kSigmoid: - op = new ActivationOp(); - break; - case activation::kTanh: - op = new ActivationOp(); - break; - case activation::kSoftReLU: - op = new ActivationOp(); - break; - default: - LOG(FATAL) << "unknown activation type"; - } - }) - return op; + const ActivationParam& param = nnvm::get(attrs.parsed); + if (SupportMKLDNN(inputs[0])) { + MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); + MKLDNNActivationBackward(attrs, ctx, inputs[0], inputs[1], req[0], + outputs[0]); + MKLDNN_OPCHECK_RUN(ActivationGradCompute, attrs, ctx, inputs, req, outputs); + return; + } + ActivationGradComputeImpl(param, ctx, inputs[0].data(), inputs[1].data(), + req[0], outputs[0].data()); } +#endif -// DO_BIND_DISPATCH comes from operator_common.h -Operator *ActivationProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_shape)[0]); +inline static bool ActivationStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1); + CHECK_EQ(out_attrs->size(), 1); + const ActivationParam& param = nnvm::get(attrs.parsed); + bool ret = ElemwiseStorageType<1, 1, false, false, false>(attrs, dev_mask, + dispatch_mode, + in_attrs, out_attrs); +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) { + *dispatch_mode = DispatchMode::kFComputeEx; + } +#endif + return ret; } -DMLC_REGISTER_PARAMETER(ActivationParam); +inline static bool BackwardActStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { +#if MXNET_USE_CUDNN == 1 + CHECK_EQ(in_attrs->size(), 3U); +#else + CHECK_EQ(in_attrs->size(), 2U); +#endif + CHECK_EQ(out_attrs->size(), 1U); + const ActivationParam& param = nnvm::get(attrs.parsed); +#if MXNET_USE_CUDNN 
== 1 + bool ret = ElemwiseStorageType<3, 1, false, false, false>(attrs, dev_mask, + dispatch_mode, + in_attrs, out_attrs); +#else + bool ret = ElemwiseStorageType<2, 1, false, false, false>(attrs, dev_mask, + dispatch_mode, + in_attrs, out_attrs); +#endif +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNAct(param)) { + *dispatch_mode = DispatchMode::kFComputeEx; + } +#endif + return ret; +} -MXNET_REGISTER_OP_PROPERTY(Activation, ActivationProp) +MXNET_OPERATOR_REGISTER_UNARY(Activation) .describe(R"code(Applies an activation function element-wise to the input. The following activation functions are supported: @@ -90,8 +151,35 @@ The following activation functions are supported: - `softrelu`: Soft ReLU, or SoftPlus, :math:`y = log(1 + exp(x))` )code" ADD_FILELINE) -.add_argument("data", "NDArray-or-Symbol", "Input array to activation function.") +.set_attr_parser(ParamParser) +.set_attr("FInferStorageType", ActivationStorageType) +.set_attr("FCompute", ActivationCompute) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FComputeEx", ActivationComputeExCPU) +#endif +.set_attr("FGradient", ActivationGrad{"_backward_Activation"}) .add_arguments(ActivationParam::__FIELDS__()); +NNVM_REGISTER_OP(_backward_Activation) +.set_num_inputs(3) +.set_num_outputs(1) +.set_attr("TIsBackward", true) +.set_attr("FInferStorageType", BackwardActStorageType) +.set_attr("FInferShape", ElemwiseShape<3, 1>) +.set_attr("FInferType", ElemwiseType<3, 1>) +.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}}; +}) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif +.set_attr_parser(ParamParser) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FComputeEx", ActivationGradComputeExCPU) +#endif +.set_attr("FCompute", ActivationGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/activation.cu b/src/operator/nn/activation.cu index c2f6be9f37c8..dc435b2acc17 100644 --- a/src/operator/nn/activation.cu +++ b/src/operator/nn/activation.cu @@ -31,39 +31,73 @@ namespace mxnet { namespace op { + +#if MXNET_USE_CUDNN == 1 + +template +static CuDNNActivationOp &get_cudnn_op(const ActivationParam& param) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local CuDNNActivationOp cudnn_op; +#else + static MX_THREAD_LOCAL CuDNNActivationOp cudnn_op; +#endif + cudnn_op.Init(param); + return cudnn_op; +} + template<> -Operator *CreateOp(ActivationParam param, int dtype, const TShape& dshape) { - Operator *op = NULL; +void ActivationCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + const ActivationParam& param = nnvm::get(attrs.parsed); + // SoftReLU not supported by CUDNN yet if (param.act_type == activation::kSoftReLU) { - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new ActivationOp(); - }) - return op; + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + ActivationForward(ctx, + inputs[0], req[0], outputs[0]); + }); + } else { + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + get_cudnn_op(param).Forward(ctx, inputs[0], req[0], outputs[0]); + }); } +} -#if MXNET_USE_CUDNN == 1 - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new CuDNNActivationOp(param); - }) -#else - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - switch (param.act_type) { - case activation::kReLU: - op = new 
ActivationOp(); - break; - case activation::kSigmoid: - op = new ActivationOp(); - break; - case activation::kTanh: - op = new ActivationOp(); - break; - default: - LOG(FATAL) << "unknown activation"; - } - }) -#endif // MXNET_USE_CUDNN - return op; +template<> +void ActivationGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + const ActivationParam& param = nnvm::get(attrs.parsed); + + // SoftReLU not supported by CUDNN yet + if (param.act_type == activation::kSoftReLU) { + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + ActivationBackward( + ctx, inputs[0], inputs[1], req[0], outputs[0]); + }); + } else { + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + get_cudnn_op(param).Backward(ctx, inputs[0], inputs[2], inputs[1], req[0], outputs[0]); + }); + } } +#endif + +NNVM_REGISTER_OP(Activation) +.set_attr("FCompute", ActivationCompute); + +NNVM_REGISTER_OP(_backward_Activation) +.set_attr("FCompute", ActivationGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/batch_norm-inl.h b/src/operator/nn/batch_norm-inl.h index 2a9dee2cf845..27e0a8434d77 100644 --- a/src/operator/nn/batch_norm-inl.h +++ b/src/operator/nn/batch_norm-inl.h @@ -21,7 +21,7 @@ * Copyright (c) 2017 by Contributors * \file batch_norm-inl.h * \brief - * \author Bing Xu, Chris Olivier + * \author Bing Xu, Chris Olivier, Da Zheng */ #ifndef MXNET_OPERATOR_NN_BATCH_NORM_INL_H_ #define MXNET_OPERATOR_NN_BATCH_NORM_INL_H_ @@ -47,8 +47,10 @@ namespace mxnet { namespace op { namespace batchnorm { -enum BatchNormOpInputs {kData, kGamma, kBeta}; // kGamma: weights, kBeta: biases +enum BatchNormOpInputs {kData, kGamma, kBeta, kInMovingMean, + kInMovingVar}; // kGamma: weights, kBeta: biases enum BatchNormOpOutputs {kOut, kMean, kVar}; // req, out_data +enum BatchNormOpResource {kTempSpace}; enum BatchNormOpAuxiliary {kMovingMean, kMovingVar}; // aux_states /*! \brief Default channel axis if none specified int he params */ @@ -83,280 +85,203 @@ struct BatchNormParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(cudnn_off).set_default(false) .describe("Do not select CUDNN operator, if available"); } -}; - -/*! \brief Batch normalization operator */ -template -class BatchNormOp : public Operator { - public: - explicit BatchNormOp(BatchNormParam param) { - this->param_ = param; - } - - static inline bool IsWriting(const OpReqType ort) { - return ort == kWriteTo || ort == kWriteInplace; - } - - /*! - * \brief perform a forward operation of Operator, save the output to TBlob. - * \param ctx runtime context available to this call - * \param in_data array of input data, it is const - * \param req the request types of saving operation, can only be kWriteTo or kWriteInplace. - * \param out_data array of output data, pointer is used to indicate that this is holder - * the space of TBlob in out_data must be pre-allocated with InferShape - * \param aux_states Auxiliary states of operator. Normally operator doesn't - * need, epecial case like Batch Norm requires. 
- * \sa OpReqType, OpContext - */ - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { - using namespace mshadow; - using namespace mshadow::expr; - - CHECK_EQ(in_data.size(), 3U); - CHECK_EQ(aux_states.size(), 2U); - if (ctx.is_train) { - CHECK_EQ(out_data.size(), 3U); - CHECK_EQ(req.size(), 3U); - } else { - CHECK_GE(out_data.size(), 1U); - CHECK_GE(req.size(), 1U); - CHECK_EQ(req[batchnorm::kOut], kWriteTo); - } - Stream *s = ctx.get_stream(); - DoForward(s, ctx, in_data, req, out_data, aux_states); - } - - /*! - * \brief Perform a Backward Operation, write gradient to the in_grad. - * - * \note - * Convention: - * out_grad.size() == OperatorProperty.NumVisibleOutputs() - * out_data.size() == OperatorProperty.NumOutputs() - * out_data can contain additional invisible returns that remembers the - * state carried from the Forward pass. For example mask in the dropout. - * The gradients are passed from visible returns in this function. - * - * \par - * Not all the TBlobs in the arguments will be available - * if you override the DeclareBackwardDependency of corresponding OperatorProperty class. - * Only the dependencies you declared will be available at corresponding position, - * the rest of the parameters are simply dummy where you will get a nullptr. - * You will be safe if you use the default DeclareBackwardDependency. - * But only declare what you need will give engine more chance for optimization. - * - * \param ctx runtime context available to this call - * \param out_grad the gradient value we get from of the Operator. - * \param in_data the array of input data. - * \param out_data the array of output data. - * \param req request types of the saving operation, can be all types. - * \param in_grad the array of gradient we need to write to. - * \param aux_states Auxiliary states of operator. Normally operator doesn't need - * \sa OperatorProperty, OpReqType, OpContext - */ - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { - CHECK_EQ(out_grad.size(), param_.output_mean_var ? 3U : 1U); - CHECK_EQ(in_data.size(), 3U); - CHECK_EQ(out_data.size(), 3U); - CHECK_EQ(in_grad.size(), 3U); - mshadow::Stream *s = ctx.get_stream(); - DoBackward(s, ctx, out_grad, in_data, - out_data, req, in_grad, aux_states); - } - - private: - void DoForward(mshadow::Stream *stream, - const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states); - - void DoBackward(mshadow::Stream *stream, - const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states); - -#if MXNET_USE_CUDA - void DoForward(mshadow::Stream *stream, - const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states); - void DoBackward(mshadow::Stream *stream, - const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states); -#endif // MXNET_USE_CUDA - - /*! 
\brief Batch normalization operator parameters */ - BatchNormParam param_; -}; // class BatchNormOp -template -Operator *CreateOp(BatchNormParam param, const int dtype, const TShape& shape); - -#if DMLC_USE_CXX11 -class BatchNormProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, gamma, beta]"; - const TShape &dshape = in_shape->at(0); - - const size_t channelAxis = static_cast(param_.axis < 0 - ? static_cast(dshape.ndim()) + param_.axis - : param_.axis); - CHECK_LT(channelAxis, dshape.ndim()) << "Channel axis out of range: " << param_.axis; - - const int channelCount = dshape[channelAxis]; - - if (dshape.ndim() == 0) { - return false; - } - - in_shape->at(1) = TShape(Shape1(channelCount)); - in_shape->at(2) = TShape(Shape1(channelCount)); - - out_shape->clear(); - out_shape->push_back(dshape); // kOut - out_shape->push_back(Shape1(channelCount)); // kMean - out_shape->push_back(Shape1(channelCount)); // kVar - - aux_shape->clear(); - aux_shape->push_back(Shape1(channelCount)); // kMovingMean - aux_shape->push_back(Shape1(channelCount)); // kMovingVar - return true; - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - using namespace mshadow; - CHECK_GE(in_type->size(), 1U); - const int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - // For float16 input type beta, gamma, mean, and average are stored in float32. - // For other input types, these parameters have the same type as input - // NOTE: This requirement is from cuDNN (v. 
4 and 5) - int dtype_param; - MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, { - dtype_param = mshadow::DataType::kFlag; }); - for (index_t i = 1; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype_param; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype_param, ListArguments()[i]); - } - } - for (index_t i = 0; i < aux_type->size(); ++i) { - if ((*aux_type)[i] != -1) { - UNIFORM_TYPE_CHECK((*aux_type)[i], dtype_param, ListArguments()[i]); - } - } - const size_t n_aux = this->ListAuxiliaryStates().size(); - aux_type->clear(); - for (size_t i = 0; i < n_aux; ++i) { - aux_type->push_back(dtype_param); - } - const size_t n_out = this->ListOutputs().size(); - out_type->clear(); - out_type->push_back(dtype); - for (size_t i = 1; i < n_out; ++i) { - out_type->push_back(dtype_param); - } - return true; + bool operator==(const BatchNormParam& other) const { + return this->eps == other.eps && + this->momentum == other.momentum && + this->fix_gamma == other.fix_gamma && + this->use_global_stats == other.use_global_stats && + this->output_mean_var == other.output_mean_var && + this->axis == other.axis && + this->cudnn_off == other.cudnn_off; } +}; - OperatorProperty* Copy() const override { - auto ptr = new BatchNormProp(); - ptr->param_ = param_; - return ptr; - } +} // namespace op +} // namespace mxnet - std::string TypeString() const override { - return "BatchNorm"; +namespace std { +template<> +struct hash { + size_t operator()(const mxnet::op::BatchNormParam& val) { + size_t ret = 0; + ret = dmlc::HashCombine(ret, val.momentum); + ret = dmlc::HashCombine(ret, val.fix_gamma); + ret = dmlc::HashCombine(ret, val.use_global_stats); + ret = dmlc::HashCombine(ret, val.output_mean_var); + ret = dmlc::HashCombine(ret, val.axis); + return ret; } +}; +} // namespace std - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[batchnorm::kOut], - out_data[batchnorm::kMean], - out_data[batchnorm::kVar], - in_data[batchnorm::kData], - in_data[batchnorm::kGamma] - }; - } +namespace mxnet { +namespace op { - int NumVisibleOutputs() const override { - if (param_.output_mean_var) { - return 3; - } - return 1; - } +static inline bool IsBNWriting(const OpReqType ort) { + return ort == kWriteTo || ort == kWriteInplace; +} - int NumOutputs() const override { - return 3; - } +template +void BatchNormForwardImpl(mshadow::Stream *stream, + const OpContext &ctx, const BatchNormParam& param, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states); - std::vector ListArguments() const override { - return {"data", "gamma", "beta"}; - } +template +void BatchNormBackwardImpl(mshadow::Stream *stream, + const OpContext &ctx, const BatchNormParam& param, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states); - std::vector ListOutputs() const override { - return {"output", "mean", "var"}; - } +#if MXNET_USE_CUDA +template +void BatchNormForwardImpl(mshadow::Stream *stream, + const OpContext &ctx, const BatchNormParam& param, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states); +template +void BatchNormBackwardImpl(mshadow::Stream *stream, + const OpContext &ctx, const BatchNormParam& param, + const std::vector &out_grad, + 
const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states); +#endif // MXNET_USE_CUDA - std::vector ListAuxiliaryStates() const override { - return {"moving_mean", "moving_var"}; - } +/*! + * \brief perform a forward operation of Operator, save the output to TBlob. + * \param ctx runtime context available to this call + * \param in_data array of input data, it is const + * \param req the request types of saving operation, can only be kWriteTo or kWriteInplace. + * \param out_data array of output data, pointer is used to indicate that this is holder + * the space of TBlob in out_data must be pre-allocated with InferShape + * \param aux_states Auxiliary states of operator. Normally operator doesn't + * need, epecial case like Batch Norm requires. + * \sa OpReqType, OpContext + */ +template +void BatchNormForward(const OpContext &ctx, const BatchNormParam& param, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { + using namespace mshadow; + using namespace mshadow::expr; + + CHECK_EQ(in_data.size(), 3U); + CHECK_EQ(aux_states.size(), 2U); + if (ctx.is_train) { + CHECK_EQ(out_data.size(), 3U); + CHECK_EQ(req.size(), 3U); + } else { + CHECK_GE(out_data.size(), 1U); + CHECK_GE(req.size(), 1U); + CHECK_EQ(req[batchnorm::kOut], kWriteTo); + } + Stream *s = ctx.get_stream(); + BatchNormForwardImpl(s, ctx, param, in_data, req, + out_data, aux_states); +} - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; - } +/*! + * \brief Perform a Backward Operation, write gradient to the in_grad. + * + * \note + * Convention: + * out_grad.size() == OperatorProperty.NumVisibleOutputs() + * out_data.size() == OperatorProperty.NumOutputs() + * out_data can contain additional invisible returns that remembers the + * state carried from the Forward pass. For example mask in the dropout. + * The gradients are passed from visible returns in this function. + * + * \par + * Not all the TBlobs in the arguments will be available + * if you override the DeclareBackwardDependency of corresponding OperatorProperty class. + * Only the dependencies you declared will be available at corresponding position, + * the rest of the parameters are simply dummy where you will get a nullptr. + * You will be safe if you use the default DeclareBackwardDependency. + * But only declare what you need will give engine more chance for optimization. + * + * \param ctx runtime context available to this call + * \param out_grad the gradient value we get from of the Operator. + * \param in_data the array of input data. + * \param out_data the array of output data. + * \param req request types of the saving operation, can be all types. + * \param in_grad the array of gradient we need to write to. + * \param aux_states Auxiliary states of operator. Normally operator doesn't need + * \sa OperatorProperty, OpReqType, OpContext + */ +template +void BatchNormBackward(const OpContext &ctx, const BatchNormParam& param, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { + CHECK_EQ(out_grad.size(), param.output_mean_var ? 
3U : 1U); + CHECK_EQ(in_data.size(), 3U); + CHECK_EQ(out_data.size(), 3U); + CHECK_EQ(in_grad.size(), 3U); + mshadow::Stream *s = ctx.get_stream(); + BatchNormBackwardImpl(s, ctx, param, out_grad, in_data, + out_data, req, in_grad, aux_states); +} - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; +template +void BatchNormCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const BatchNormParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 5U); + std::vector in_data(inputs.begin(), + inputs.begin() + batchnorm::kInMovingMean); + std::vector aux_states(inputs.begin() + batchnorm::kInMovingMean, + inputs.end()); + MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, { + BatchNormForward(ctx, param, in_data, req, outputs, + aux_states); + }); +} - inline const BatchNormParam& getParam() const { - return param_; - } +template +void BatchNormGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 11U); + const BatchNormParam& param = nnvm::get(attrs.parsed); + int num_out_grads = param.output_mean_var ? 3U : 1U; + int in_data_start = 3; + int aux_states_start = in_data_start + batchnorm::kInMovingMean; + int out_data_start = in_data_start + batchnorm::kInMovingVar + 1; + std::vector out_grad(inputs.begin(), inputs.begin() + num_out_grads); + std::vector in_data(inputs.begin() + in_data_start, + inputs.begin() + aux_states_start); + std::vector aux_states(inputs.begin() + aux_states_start, + inputs.begin() + out_data_start); + std::vector out_data(inputs.begin() + out_data_start, inputs.end()); + std::vector in_grad(outputs.begin(), outputs.begin() + 3); + + MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, { + BatchNormBackward(ctx, param, out_grad, in_data, out_data, req, + in_grad, aux_states); + }); +} - private: - BatchNormParam param_; -}; // class BatchNormProp +#if DMLC_USE_CXX11 namespace batchnorm { diff --git a/src/operator/nn/batch_norm.cc b/src/operator/nn/batch_norm.cc index ca2883239488..ba6c413819e4 100644 --- a/src/operator/nn/batch_norm.cc +++ b/src/operator/nn/batch_norm.cc @@ -21,16 +21,15 @@ * Copyright (c) 2015 by Contributors * \file batch_norm.cc * \brief - * \author Bing Xu, Chris Olivier + * \author Bing Xu, Chris Olivier, Da Zheng */ #include "batch_norm-inl.h" #include -#if MXNET_USE_MKL2017 == 1 -#include -#include "../mkl/mkl_memory-inl.h" -#include "../mkl/mkl_batch_norm-inl.h" -#endif // MXNET_USE_MKL2017 +#include "../elemwise_op_common.h" +#if MXNET_USE_MKLDNN == 1 +#include "./mkldnn/mkldnn_batch_norm-inl.h" +#endif /*! \brief inverse standard deviation <-> variance */ #define VARIANCE_TO_INVSTD(__var$, __eps$) (1.0/sqrt((__var$) + DType(__eps$))) @@ -89,12 +88,12 @@ static inline void ForEachFast(const BNTensor3 &in_data, /*! 
\brief Forward CPU */ template -void BatchNormOp::DoForward(mshadow::Stream *, - const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { +void BatchNormForwardImpl(mshadow::Stream *, + const OpContext &ctx, const BatchNormParam& param_, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { // Input batchnorm::BNTensor3 inputData(in_data[batchnorm::kData], param_.axis); const TBlob &weights = in_data[batchnorm::kGamma]; @@ -164,7 +163,7 @@ void BatchNormOp::DoForward(mshadow::Stream *, // note that var is still invstd if (!param_.fix_gamma) { - if (IsWriting(req[batchnorm::kData])) { + if (IsBNWriting(req[batchnorm::kData])) { ForEachFast(inputData, outputData, channel, [thisWeight, thisBias, thisMean, thisInvstd](const DType *in_data, DType *out_data) { @@ -173,10 +172,10 @@ void BatchNormOp::DoForward(mshadow::Stream *, }); } } else { - if (IsWriting(req[batchnorm::kGamma])) { + if (IsBNWriting(req[batchnorm::kGamma])) { w[channel] = AccReal(1); } - if (IsWriting(req[batchnorm::kData])) { + if (IsBNWriting(req[batchnorm::kData])) { ForEachFast(inputData, outputData, channel, [thisWeight, thisBias, thisMean, thisInvstd](const DType *in_data, DType *out_data) { @@ -189,14 +188,14 @@ void BatchNormOp::DoForward(mshadow::Stream *, } template -void BatchNormOp::DoBackward(mshadow::Stream *, - const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { +void BatchNormBackwardImpl(mshadow::Stream *, + const OpContext &ctx, const BatchNormParam& param_, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { // Input Data batchnorm::BNTensor3 inputData(in_data[batchnorm::kData], param_.axis); const TBlob &weights = in_data[batchnorm::kGamma]; @@ -264,7 +263,7 @@ void BatchNormOp::DoBackward(mshadow::Stream *, dotp += (*thisInputData - mean) * (*gradOut_data); }); - if (!gradIn.IsEmpty() && IsWriting(req[batchnorm::kData])) { // if there's a grad input + if (!gradIn.IsEmpty() && IsBNWriting(req[batchnorm::kData])) { // if there's a grad input if (is_train_and_not_global_stats) { // when in training mode // Q(X) = X - E[x] ; i.e. 
input centered to zero mean @@ -300,7 +299,7 @@ void BatchNormOp::DoBackward(mshadow::Stream *, // May want to make this a param eventually const AccReal scale = 1.0f; - if (IsWriting(req[batchnorm::kGamma])) { + if (IsBNWriting(req[batchnorm::kGamma])) { if (!param_.fix_gamma) { gradWeightData[channel] = scale * dotp * invstd; } else { @@ -308,51 +307,185 @@ void BatchNormOp::DoBackward(mshadow::Stream *, } } - if (IsWriting(req[batchnorm::kBeta])) { + if (IsBNWriting(req[batchnorm::kBeta])) { gradBiasData[channel] = scale * sumGradOut; } } } -template<> -Operator *CreateOp(BatchNormParam param, const int dtype, const TShape& shape) { - param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis); - Operator *op = nullptr; -#if MXNET_USE_MKL2017 == 1 - if (shape.ndim() == 4 +DMLC_REGISTER_PARAMETER(BatchNormParam); + +static bool BatchNormShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, + std::vector *out_shape) { + const BatchNormParam& param = nnvm::get(attrs.parsed); + using namespace mshadow; + CHECK_EQ(in_shape->size(), 5U) << "Input:[data, gamma, beta, MovingMean, MovingVar]"; + const TShape &dshape = in_shape->at(batchnorm::kData); + + const size_t channelAxis = static_cast(param.axis < 0 + ? static_cast(dshape.ndim()) + param.axis + : param.axis); + CHECK_LT(channelAxis, dshape.ndim()) << "Channel axis out of range: " << param.axis; + + const int channelCount = dshape[channelAxis]; + + if (dshape.ndim() == 0) { + return false; + } + + in_shape->at(batchnorm::kGamma) = TShape(Shape1(channelCount)); + in_shape->at(batchnorm::kBeta) = TShape(Shape1(channelCount)); + in_shape->at(batchnorm::kInMovingMean) = TShape(Shape1(channelCount)); // kMovingMean + in_shape->at(batchnorm::kInMovingVar) = TShape(Shape1(channelCount)); // kMovingVar + + out_shape->clear(); + out_shape->push_back(dshape); // kOut + out_shape->push_back(Shape1(channelCount)); // kMean + out_shape->push_back(Shape1(channelCount)); // kVar + + return true; +} + +static bool BatchNormType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, std::vector *out_type) { + using namespace mshadow; + CHECK_GE(in_type->size(), 1U); + const int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + // For float16 input type beta, gamma, mean, and average are stored in float32. + // For other input types, these parameters have the same type as input + // NOTE: This requirement is from cuDNN (v. 
4 and 5) + int dtype_param; + MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DTypeX, AccRealX, { + dtype_param = mshadow::DataType::kFlag; }); + std::vector args{"data", "gamma", "beta", "mean", "var"}; + CHECK_LE(in_type->size(), args.size()); + for (index_t i = 1; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype_param; + } else { + UNIFORM_TYPE_CHECK((*in_type)[i], dtype_param, args[i]); + } + } + const size_t n_out = 3; + out_type->clear(); + out_type->push_back(dtype); + for (size_t i = 1; i < n_out; ++i) { + out_type->push_back(dtype_param); + } + return true; +} + +#if MXNET_USE_MKLDNN == 1 +static inline bool SupportMKLDNNBN(const NDArray &input, const BatchNormParam ¶m) { + TShape shape = input.shape(); + return SupportMKLDNN(input) && shape.ndim() == 4 && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS - && !mxnet::op::batchnorm::disable_mkl) { - switch (dtype) { - case mshadow::kFloat32: - op = new MKLBatchNormOp(param); - break; - case mshadow::kFloat64: - op = new MKLBatchNormOp(param); - break; - default: - // MKL operator doesn't support half_t, so fall through - break; + && shape[param.axis] % 8 == 0; +} + +void BatchNormComputeExCPU(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + CHECK_EQ(inputs.size(), 5U); + const BatchNormParam ¶m = nnvm::get(attrs.parsed); + // MKLDNN batchnorm only works well on the special MKLDNN layout. + if (SupportMKLDNNBN(inputs[0], param) && inputs[0].IsMKLDNNData()) { + std::vector in_data(inputs.begin(), inputs.begin() + batchnorm::kInMovingMean); + std::vector aux_states(inputs.begin() + batchnorm::kInMovingMean, inputs.end()); + + if (inputs[0].dtype() == mshadow::kFloat32) { + MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + MKLDNNBatchNormForward(ctx, param, in_data, req, outputs, aux_states); + MKLDNN_OPCHECK_RUN(BatchNormCompute, attrs, ctx, inputs, req, outputs); + return; } } -#endif - if (!op) { - MSHADOW_REAL_TYPE_SWITCH_EX(dtype, - DType, - AccReal, { - op = new BatchNormOp(param); }); + FallBackCompute(BatchNormCompute, attrs, ctx, inputs, req, outputs); +} + +void BatchNormGradComputeExCPU(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + CHECK_EQ(inputs.size(), 11U); + const BatchNormParam ¶m = nnvm::get(attrs.parsed); + int num_out_grads = param.output_mean_var ? 3U : 1U; + int in_data_start = 3; + int aux_states_start = in_data_start + batchnorm::kInMovingMean; + int out_data_start = in_data_start + batchnorm::kInMovingVar + 1; + + TShape shape = inputs[0].shape(); + // MKLDNN batchnorm only works well on the special MKLDNN layout. 
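[Editor's note: the sketch below is illustrative and not part of the patch.] Every MKLDNN-enabled operator in this change uses the same CPU dispatch shape: run the MKLDNN kernel when the inputs qualify (float32, supported ndim/layout), wrap it in the OPCHECK macros so the result can be compared against the reference kernel in debug runs, and otherwise fall back to the default implementation. A minimal sketch with a hypothetical operator "Foo" (MKLDNNFooForward is an assumed name, not an existing function):

static void FooComputeExCPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx,
                            const std::vector<NDArray>& inputs,
                            const std::vector<OpReqType>& req,
                            const std::vector<NDArray>& outputs) {
  if (SupportMKLDNN(inputs[0])) {                        // dtype/ndim gate
    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
    MKLDNNFooForward(attrs, ctx, inputs, req, outputs);  // assumed MKLDNN kernel
    MKLDNN_OPCHECK_RUN(FooCompute<cpu>, attrs, ctx, inputs, req, outputs);
    return;
  }
  // Convert any MKLDNN-laid-out inputs back to the default layout and run
  // the reference TBlob kernel.
  FallBackCompute(FooCompute<cpu>, attrs, ctx, inputs, req, outputs);
}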
+ if (SupportMKLDNNBN(inputs[0], param) + && (inputs[in_data_start].IsMKLDNNData() || inputs[0].IsMKLDNNData())) { + std::vector out_grad(inputs.begin(), inputs.begin() + num_out_grads); + std::vector in_data(inputs.begin() + in_data_start, + inputs.begin() + aux_states_start); + std::vector aux_states(inputs.begin() + aux_states_start, + inputs.begin() + out_data_start); + std::vector out_data(inputs.begin() + out_data_start, inputs.end()); + std::vector in_grad(outputs.begin(), outputs.begin() + 3); + + if (inputs[0].dtype() == mshadow::kFloat32) { + MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); + MKLDNNBatchNormBackward(ctx, param, out_grad, in_data, + out_data, req, in_grad, aux_states); + MKLDNN_OPCHECK_RUN(BatchNormGradCompute, attrs, ctx, inputs, req, outputs); + return; + } } - return op; + FallBackCompute(BatchNormGradCompute, attrs, ctx, inputs, req, outputs); } +#endif -// DO_BIND_DISPATCH comes from operator_common.h -Operator *BatchNormProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], (*in_shape)[0]); +static inline bool BatchNormStorageType(const nnvm::NodeAttrs &attrs, + const int dev_mask, + DispatchMode *dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 5); + CHECK_EQ(out_attrs->size(), 3); + DispatchMode wanted_mode; +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) + wanted_mode = DispatchMode::kFComputeEx; + else +#endif + wanted_mode = DispatchMode::kFCompute; + for (int& v : *in_attrs) { + if (v == - 1) v = kDefaultStorage; + } + return storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, wanted_mode); } -DMLC_REGISTER_PARAMETER(BatchNormParam); +static inline bool backward_BatchNormStorageType(const nnvm::NodeAttrs &attrs, + const int dev_mask, + DispatchMode *dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 11); + CHECK_EQ(out_attrs->size(), 5); + DispatchMode wanted_mode; +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) + wanted_mode = DispatchMode::kFComputeEx; + else +#endif + wanted_mode = DispatchMode::kFCompute; + for (int& v : *in_attrs) { + if (v == - 1) v = kDefaultStorage; + } + return storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, wanted_mode); +} -MXNET_REGISTER_OP_PROPERTY(BatchNorm, BatchNormProp) +NNVM_REGISTER_OP(BatchNorm) .describe(R"code(Batch normalization. Normalizes a data batch by mean and variance, and applies a scale ``gamma`` as @@ -398,14 +531,44 @@ Both ``gamma`` and ``beta`` are learnable parameters. But if ``fix_gamma`` is tr then set ``gamma`` to 1 and its gradient to 0. )code" ADD_FILELINE) +.set_num_inputs(5) +.set_num_outputs(3) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data", "gamma", "beta", "moving_mean", "moving_var"}; +}) +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return std::vector{"output", "mean", "var"}; +}) +.set_attr("FNumVisibleOutputs", + [](const NodeAttrs& attrs) { + const BatchNormParam& param = nnvm::get(attrs.parsed); + return param.output_mean_var ? 
3 : 1; +}) +.set_attr("FMutateInputs", [](const nnvm::NodeAttrs& attrs) { + return std::vector{3, 4}; +}) +.set_attr("FInferShape", BatchNormShape) +.set_attr("FInferType", BatchNormType) +.set_attr("FInferStorageType", BatchNormStorageType) +.set_attr("FCompute", BatchNormCompute) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FComputeEx", BatchNormComputeExCPU) +#endif +.set_attr("FGradient", ElemwiseGradUseInOut{"_backward_BatchNorm"}) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif .add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization") .add_argument("gamma", "NDArray-or-Symbol", "gamma array") .add_argument("beta", "NDArray-or-Symbol", "beta array") .add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input") .add_argument("moving_var", "NDArray-or-Symbol", "running variance of input") -.add_arguments(BatchNormParam::__FIELDS__()); - -NNVM_REGISTER_OP(BatchNorm) +.add_arguments(BatchNormParam::__FIELDS__()) .set_attr( "FSetInputVarAttrOnCompose", [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) { @@ -417,5 +580,20 @@ NNVM_REGISTER_OP(BatchNorm) } }); +NNVM_REGISTER_OP(_backward_BatchNorm) +.set_num_outputs(5) +.set_attr("TIsBackward", true) +.set_attr("FInferStorageType", backward_BatchNormStorageType) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif +.set_attr_parser(ParamParser) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FComputeEx", BatchNormGradComputeExCPU) +#endif +.set_attr("FCompute", BatchNormGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/batch_norm.cu b/src/operator/nn/batch_norm.cu index 59317b7fa837..80c15976b65f 100644 --- a/src/operator/nn/batch_norm.cu +++ b/src/operator/nn/batch_norm.cu @@ -21,7 +21,7 @@ * Copyright (c) 2017 by Contributors * \file batch_norm.cu * \brief CUDA Batch Normalization code - * \author Chris Olivier, Bing Xu + * \author Chris Olivier, Bing Xu, Da Zheng * Adapted from Torch */ #include @@ -579,13 +579,13 @@ static inline uint32_t SetupFlags(const OpContext &ctx, flags |= ctx.is_train ? IS_TRAINING_FLAG : 0; flags |= params.fix_gamma ? FIX_GAMMA_FLAG : 0; flags |= params.use_global_stats ? USE_GLOBAL_STATS_FLAG : 0; - if (BatchNormOp::IsWriting(req[batchnorm::kData])) { + if (IsBNWriting(req[batchnorm::kData])) { flags |= WRITE_DATA_FLAG; } - if (BatchNormOp::IsWriting(req[batchnorm::kGamma])) { + if (IsBNWriting(req[batchnorm::kGamma])) { flags |= WRITE_GAMMA_FLAG; } - if (BatchNormOp::IsWriting(req[batchnorm::kBeta])) { + if (IsBNWriting(req[batchnorm::kBeta])) { flags |= WRITE_BETA_FLAG; } return flags; @@ -593,12 +593,12 @@ static inline uint32_t SetupFlags(const OpContext &ctx, /*! \brief Forward batch-norm pass on GPU */ template -void BatchNormOp::DoForward(mshadow::Stream *stream, - const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { +void BatchNormForwardImpl(mshadow::Stream *stream, + const OpContext &ctx, const BatchNormParam& param_, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { batchnorm::cuda::BatchNormalizationUpdateOutput( stream, ctx, @@ -614,14 +614,14 @@ void BatchNormOp::DoForward(mshadow::Stream *stream, /*! 
\brief Backward batch-norm pass on GPU */ template -void BatchNormOp::DoBackward(mshadow::Stream *stream, - const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { +void BatchNormBackwardImpl(mshadow::Stream *stream, + const OpContext &ctx, const BatchNormParam& param_, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { batchnorm::cuda::BatchNormalizationBackward( stream, ctx, @@ -637,30 +637,92 @@ void BatchNormOp::DoBackward(mshadow::Stream *stream, MSHADOW_CUDA_POST_KERNEL_CHECK(BatchNormOp_DoBackward_gpu); } -/*! \brief Create GPU operator for batch normalization */ +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4 +template +static CuDNNBatchNormOp &GetCuDNNOp(const BatchNormParam& param) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local CuDNNBatchNormOp op; +#else + static MX_THREAD_LOCAL CuDNNBatchNormOp op; +#endif + op.Init(param); + return op; +} +#endif + template<> -Operator *CreateOp(BatchNormParam param, const int dtype, const TShape& shape) { +void BatchNormCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + BatchNormParam param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 5U); + std::vector in_data(inputs.begin(), inputs.begin() + 3); + std::vector aux_states(inputs.begin() + 3, inputs.end()); + int dtype = inputs[0].type_flag_; + TShape shape = inputs[0].shape_; + param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis); - Operator *op = NULL; #if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 if (!param.use_global_stats && !param.cudnn_off && shape.ndim() <= 4 && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS) { MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new CuDNNBatchNormOp(param); + GetCuDNNOp(param).Forward(ctx, in_data, req, outputs, aux_states); }) } else { MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, { - op = new BatchNormOp(param); + BatchNormForward(ctx, param, in_data, req, outputs, aux_states); }) } #else - MSHADOW_REAL_TYPE_SWITCH_EX(dtype, - DType, - AccReal, - { op = new BatchNormOp(param); }); + MSHADOW_REAL_TYPE_SWITCH_EX(inputs[0].type_flag_, DType, AccReal, { + BatchNormForward(ctx, param, in_data, req, outputs, aux_states); + }); +#endif +} + +template<> +void BatchNormGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 11U); + BatchNormParam param = nnvm::get(attrs.parsed); + std::vector out_grad(1, inputs[0]); + std::vector in_data(inputs.begin() + 3, inputs.begin() + 6); + std::vector aux_states(inputs.begin() + 6, inputs.begin() + 8); + std::vector out_data(inputs.begin() + 8, inputs.end()); + std::vector in_grad(outputs.begin(), outputs.begin() + 3); + int dtype = inputs[0].type_flag_; + TShape shape = inputs[0].shape_; + + param.axis = mxnet::op::batchnorm::GetRealAxis(shape, param.axis); +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 5 + if (!param.use_global_stats && !param.cudnn_off && shape.ndim() <= 4 + && param.axis == mxnet::op::batchnorm::DEFAULT_AXIS) { + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + GetCuDNNOp(param).Backward(ctx, out_grad, in_data, out_data, + req, in_grad, aux_states); + }) + } else { + 
MSHADOW_REAL_TYPE_SWITCH_EX(dtype, DType, AccReal, { + BatchNormBackward(ctx, param, out_grad, + in_data, out_data, req, in_grad, aux_states); + }) + } +#else + MSHADOW_REAL_TYPE_SWITCH_EX(out_grad[0].type_flag_, DType, AccReal, { + BatchNormBackward(ctx, param, out_grad, + in_data, out_data, req, in_grad, aux_states); + }); #endif - return op; } +NNVM_REGISTER_OP(BatchNorm) +.set_attr("FCompute", BatchNormCompute); + +NNVM_REGISTER_OP(_backward_BatchNorm) +.set_attr("FCompute", BatchNormGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/concat-inl.h b/src/operator/nn/concat-inl.h new file mode 100644 index 000000000000..a7f1fa85f612 --- /dev/null +++ b/src/operator/nn/concat-inl.h @@ -0,0 +1,160 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2015 by Contributors + * \file concat-inl.h + * \brief + * \author Bing Xu +*/ +#ifndef MXNET_OPERATOR_NN_CONCAT_INL_H_ +#define MXNET_OPERATOR_NN_CONCAT_INL_H_ +#include +#include +#include +#include +#include +#include +#include +#include +#include "../operator_common.h" +#include "../channel_op_common.h" +#include "../tensor/broadcast_reduce_op.h" + +namespace mxnet { +namespace op { + +namespace concat_enum { +enum ConcatOpInputs {kData0, kData1, kData2, kData3, kData4}; +enum ConcatOpResource {kTempSpace}; +enum ConcatOpOutputs {kOut}; +} // namespace concat_enum + +struct ConcatParam : public dmlc::Parameter { + int num_args; + int dim; + DMLC_DECLARE_PARAMETER(ConcatParam) { + DMLC_DECLARE_FIELD(num_args).set_lower_bound(1) + .describe("Number of inputs to be concated."); + DMLC_DECLARE_FIELD(dim).set_default(1) + .describe("the dimension to be concated."); + } +}; // struct ConcatParam + +template +class ConcatOp { + public: + void Init(const ConcatParam ¶m) { + this->size_ = param.num_args; + this->dimension_ = param.dim; + } + + void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(static_cast(in_data.size()), size_); + CHECK_EQ(out_data.size(), 1U); + int axis = CheckAxis(dimension_, in_data[concat_enum::kData0].ndim()); + Stream *s = ctx.get_stream(); + std::vector > data(size_); + Tensor out; + size_t leading = 1, trailing = 1; + for (int i = 0; i < axis; ++i) { + leading *= out_data[concat_enum::kOut].shape_[i]; + } + for (int i = axis + 1; i < out_data[concat_enum::kOut].ndim(); ++i) { + trailing *= out_data[concat_enum::kOut].shape_[i]; + } + size_t mid = out_data[concat_enum::kOut].shape_[axis]; + Shape<3> oshape = Shape3(leading, mid, trailing); + out = out_data[concat_enum::kOut].get_with_shape(oshape, s); + + for (int i = 0; i < size_; ++i) { + Shape<3> dshape = 
Shape3(leading, in_data[i].shape_[axis], trailing); + data[i] = in_data[i].get_with_shape(dshape, s); + } + Concatenate(data, &out, 1, req[concat_enum::kOut]); + } + + void Backward(const OpContext &ctx, const TBlob &out_grad, + const std::vector &req, + const std::vector &in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_grad.size(), static_cast(size_)); + int axis = CheckAxis(dimension_, out_grad.ndim()); + Stream *s = ctx.get_stream(); + std::vector > grad_in(size_); + Tensor grad; + size_t leading = 1, trailing = 1; + for (int i = 0; i < axis; ++i) { + leading *= out_grad.shape_[i]; + } + for (int i = axis + 1; i < out_grad.ndim(); ++i) { + trailing *= out_grad.shape_[i]; + } + size_t mid = out_grad.shape_[axis]; + Shape<3> oshape = Shape3(leading, mid, trailing); + grad = out_grad.get_with_shape(oshape, s); + + for (int i = 0; i < size_; ++i) { + Shape<3> dshape = Shape3(leading, in_grad[i].shape_[axis], trailing); + grad_in[i] = in_grad[i].get_with_shape(dshape, s); + } + Split(grad, &grad_in, 1, req); + } + + private: + int size_; + int dimension_; +}; // class ConcatOp + +template +void ConcatCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const ConcatParam& param = nnvm::get(attrs.parsed); + MSHADOW_TYPE_SWITCH(inputs[concat_enum::kData0].type_flag_, DType, { + ConcatOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); + }); +} + +template +void ConcatGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const ConcatParam& param = nnvm::get(attrs.parsed); + MSHADOW_TYPE_SWITCH(inputs[concat_enum::kOut].type_flag_, DType, { + ConcatOp op; + op.Init(param); + op.Backward(ctx, inputs[concat_enum::kOut], req, outputs); + }); +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_OPERATOR_NN_CONCAT_INL_H_ diff --git a/src/operator/nn/concat.cc b/src/operator/nn/concat.cc new file mode 100644 index 000000000000..81dc95f1a5a5 --- /dev/null +++ b/src/operator/nn/concat.cc @@ -0,0 +1,289 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2015 by Contributors + * \file concat.cc + * \brief + * \author Bing Xu +*/ + +#include "./concat-inl.h" +#include "./mkldnn/mkldnn_ops-inl.h" +#include "./mkldnn/mkldnn_base-inl.h" +#include "../../common/utils.h" + +namespace mxnet { +namespace op { + +static bool ConcatShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, + std::vector *out_shape) { + using namespace mshadow; + const ConcatParam& param_ = nnvm::get(attrs.parsed); + CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); + TShape dshape; + index_t size = 0; + bool has_zero = false; + int axis = -1; + for (int i = 0; i < param_.num_args; ++i) { + TShape tmp = (*in_shape)[i]; + if (tmp.ndim()) { + axis = CheckAxis(param_.dim, tmp.ndim()); + has_zero = tmp[axis] == 0 || has_zero; + size += tmp[axis]; + tmp[axis] = 0; + shape_assign(&dshape, tmp); + } + } + + TShape tmp = (*out_shape)[0]; + if (tmp.ndim()) { + axis = CheckAxis(param_.dim, tmp.ndim()); + tmp[axis] = 0; + shape_assign(&dshape, tmp); + } + + if (dshape.ndim() == 0) return false; + + for (int i = 0; i < param_.num_args; ++i) { + CHECK(shape_assign(&(*in_shape)[i], dshape)) + << "Incompatible input shape: expected " << dshape << ", got " << (*in_shape)[i]; + } + + if (!has_zero) dshape[axis] = size; + CHECK(shape_assign(&(*out_shape)[0], dshape)) + << "Incompatible output shape: expected " << dshape << ", got " << (*out_shape)[0]; + + return dshape.Size() != 0; +} + +static bool ConcatType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, + std::vector *out_type) { + const ConcatParam& param_ = nnvm::get(attrs.parsed); + int dtype = -1; + + for (size_t i = 0; i < in_type->size(); ++i) { + if (dtype == -1) { + dtype = in_type->at(i); + } else { + CHECK(in_type->at(i) == dtype || + in_type->at(i) == -1) << + "Non-uniform data type in Concat"; + } + } + + if (dtype == -1) { + LOG(FATAL) << "Not enough information to infer type in Concat."; + return false; + } + + size_t nin = param_.num_args; + in_type->clear(); + for (size_t i = 0; i < nin; ++i) in_type->push_back(dtype); + + out_type->clear(); + out_type->push_back(dtype); + + return true; +} + +inline static bool ConcatForwardInferStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK(!in_attrs->empty()); + CHECK_EQ(out_attrs->size(), 1U); + DispatchMode wanted_mode; +#if MXNET_USE_MKLDNN == 1 + const ConcatParam& param = nnvm::get(attrs.parsed); + if (dev_mask == mshadow::cpu::kDevMask + && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage) + && param.dim > 0) + wanted_mode = DispatchMode::kFComputeEx; + else +#endif + wanted_mode = DispatchMode::kFCompute; + return storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, wanted_mode); +} + +inline static bool BackwardConcatStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + DispatchMode wanted_mode; +#if MXNET_USE_MKLDNN == 1 + const ConcatParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(out_attrs->size(), in_attrs->size() - 1); + if (dev_mask == mshadow::cpu::kDevMask + && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage) + && param.dim > 0) + wanted_mode = DispatchMode::kFComputeEx; + else +#endif + wanted_mode = DispatchMode::kFCompute; + return storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, wanted_mode); +} + +#if MXNET_USE_MKLDNN == 1 +static void ConcatComputeExCPU(const 
nnvm::NodeAttrs& attrs, + const OpContext& op_ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK(!inputs.empty()); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + if (req[0] == kNullOp) return; + // MKLDNN support 2D and 4D concat + if ((inputs[0].shape().ndim() == 2 || inputs[0].shape().ndim() == 4) + && inputs[0].dtype() == mshadow::kFloat32) { + MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + MKLDNNConcatForward(attrs, op_ctx, inputs, req, outputs); + MKLDNN_OPCHECK_RUN(ConcatCompute, attrs, op_ctx, inputs, req, outputs); + return; + } + FallBackCompute(ConcatCompute, attrs, op_ctx, inputs, req, outputs); +} + +static void ConcatGradComputeExCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + if ((inputs[0].shape().ndim() == 2 || inputs[0].shape().ndim() == 4) + && inputs[0].dtype() == mshadow::kFloat32) { + MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); + MKLDNNConcatBackward(attrs, ctx, inputs, req, outputs); + MKLDNN_OPCHECK_RUN(ConcatGradCompute, attrs, ctx, inputs, req, outputs); + return; + } + FallBackCompute(ConcatGradCompute, attrs, ctx, inputs, req, outputs); +} +#endif + +struct ConcatGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + CHECK_EQ(ograds.size(), 1); + std::vector heads(ograds.begin(), ograds.end()); +#if MXNET_USE_MKLDNN == 1 + for (size_t i = 0; i < n->inputs.size(); i++) { + heads.push_back(n->inputs[i]); + } +#endif + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + +DMLC_REGISTER_PARAMETER(ConcatParam); + +NNVM_REGISTER_OP(Concat) +.describe(R"code(Joins input arrays along a given axis. + +.. note:: `Concat` is deprecated. Use `concat` instead. + +The dimensions of the input arrays should be the same except the axis along +which they will be concatenated. +The dimension of the output array along the concatenated axis will be equal +to the sum of the corresponding dimensions of the input arrays. + +Example:: + + x = [[1,1],[2,2]] + y = [[3,3],[4,4],[5,5]] + z = [[6,6], [7,7],[8,8]] + + concat(x,y,z,dim=0) = [[ 1., 1.], + [ 2., 2.], + [ 3., 3.], + [ 4., 4.], + [ 5., 5.], + [ 6., 6.], + [ 7., 7.], + [ 8., 8.]] + + Note that you cannot concat x,y,z along dimension 1 since dimension + 0 is not the same for all the input arrays. 
+ + concat(y,z,dim=1) = [[ 3., 3., 6., 6.], + [ 4., 4., 7., 7.], + [ 5., 5., 8., 8.]] + +)code" ADD_FILELINE) +.set_num_inputs([](const NodeAttrs& attrs) { + const ConcatParam& params = nnvm::get(attrs.parsed); + return params.num_args; +}) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + const ConcatParam& params = nnvm::get(attrs.parsed); + std::vector ret; + for (int i = 0; i < params.num_args; ++i) { + ret.push_back(std::string("arg") + std::to_string(i)); + } + return ret; +}) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif +.set_attr("FInferShape", ConcatShape) +.set_attr("FInferType", ConcatType) +.set_attr("FInferStorageType", ConcatForwardInferStorageType) +.set_attr("FCompute", ConcatCompute) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FComputeEx", ConcatComputeExCPU) +#endif +.set_attr("FGradient", ConcatGrad{"_backward_Concat"}) +.set_attr("key_var_num_args", "num_args") +.add_argument("data", "NDArray-or-Symbol[]", "List of arrays to concatenate") +.add_arguments(ConcatParam::__FIELDS__()); + +NNVM_REGISTER_OP(Concat).add_alias("concat"); + +NNVM_REGISTER_OP(_backward_Concat) +.set_num_outputs([](const NodeAttrs& attrs) { + const ConcatParam& params = nnvm::get(attrs.parsed); + return params.num_args; +}) +.set_attr_parser(ParamParser) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif +.set_attr("TIsBackward", true) +.set_attr("FInferStorageType", BackwardConcatStorageType) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FComputeEx", ConcatGradComputeExCPU) +#endif +.set_attr("FCompute", ConcatGradCompute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/concat.cu b/src/operator/nn/concat.cu similarity index 81% rename from src/operator/concat.cu rename to src/operator/nn/concat.cu index 394fa736ee84..f6bf5ece5c78 100644 --- a/src/operator/concat.cu +++ b/src/operator/nn/concat.cu @@ -28,14 +28,12 @@ namespace mxnet { namespace op { -template<> -Operator* CreateOp(ConcatParam param, int dtype, std::vector *in_shape) { - Operator *op = NULL; - MSHADOW_TYPE_SWITCH(dtype, DType, { - op = new ConcatOp(param); - }); - return op; -} + +NNVM_REGISTER_OP(Concat) +.set_attr("FCompute", ConcatCompute); + +NNVM_REGISTER_OP(_backward_Concat) +.set_attr("FCompute", ConcatGradCompute); } // namespace op } // namespace mxnet diff --git a/src/operator/nn/convolution-inl.h b/src/operator/nn/convolution-inl.h index 1613da6c85d1..6204f75c4697 100644 --- a/src/operator/nn/convolution-inl.h +++ b/src/operator/nn/convolution-inl.h @@ -22,7 +22,7 @@ * \file convolution-inl.h * \brief * \ref: https://github.com/Yangqing/caffe/wiki/Convolution-in-Caffe:-a-memo - * \author Bing Xu, Jun Wu + * \author Bing Xu, Jun Wu, Da Zheng */ #ifndef MXNET_OPERATOR_NN_CONVOLUTION_INL_H_ #define MXNET_OPERATOR_NN_CONVOLUTION_INL_H_ @@ -148,9 +148,9 @@ namespace mxnet { namespace op { template -class ConvolutionOp : public Operator { +class ConvolutionOp { public: - explicit ConvolutionOp(ConvolutionParam p) { + void Init(ConvolutionParam p) { this->param_ = p; // convert MBytes first to Bytes and then to elements. 
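A minimal standalone sketch of the MB-to-elements conversion described in the comment above; the 1024 MB limit and the float32 element type below are illustrative assumptions, not values taken from this patch:

#include <cstdint>
#include <cstdio>

int main() {
  const std::uint64_t workspace_mb = 1024;             // hypothetical per-op workspace limit in MB
  const std::uint64_t bytes = workspace_mb << 20;      // 1 MB == 2^20 bytes
  const std::uint64_t elems = bytes / sizeof(float);   // bytes -> number of float32 elements
  std::printf("%llu bytes -> %llu elements\n",
              static_cast<unsigned long long>(bytes),
              static_cast<unsigned long long>(elems)); // 1073741824 bytes -> 268435456 elements
  return 0;
}

The same shift-and-divide is what the statement below applies to param_.workspace.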
param_.workspace = (param_.workspace << 20) / sizeof(DType); @@ -160,11 +160,10 @@ class ConvolutionOp : public Operator { << "Only support NCW, NCHW and NCDHW layout"; } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(req[conv::kOut], kWriteTo); @@ -233,18 +232,19 @@ class ConvolutionOp : public Operator { } } - virtual void Backward(const OpContext &ctx, - const std::vector& out_grad, - const std::vector& in_data, - const std::vector& out_data, - const std::vector& req, - const std::vector& in_grad, - const std::vector& aux_args) { + void Backward(const OpContext &ctx, + const std::vector& out_grad, + const std::vector& in_data, + const std::vector& req, + const std::vector& in_grad) { using namespace mshadow; using namespace mshadow::expr; CHECK_EQ(out_grad.size(), 1U); + // We expect 2 inputs: in data and weight. We don't need bias for + // computing gradient. size_t expected = param_.no_bias == 0 ? 3 : 2; - CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(in_data.size(), expected); + CHECK_EQ(in_grad.size(), expected); CHECK_EQ(req.size(), expected); CHECK_EQ(in_data[conv::kWeight].CheckContiguous(), true); LayerSetUp(in_grad[conv::kData].shape_, out_grad[conv::kOut].shape_); @@ -386,299 +386,35 @@ class ConvolutionOp : public Operator { }; // class ConvolutionOp template -Operator* CreateOp(ConvolutionParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx); - -#if DMLC_USE_CXX11 -class ConvolutionProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - if (!param_.no_bias) { - return {"data", "weight", "bias"}; - } else { - return {"data", "weight"}; - } - } - - void Init(const std::vector >& kwargs) override { - using namespace mshadow; - param_.Init(kwargs); - if (param_.kernel.ndim() == 1) { - param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW; - if (param_.stride.ndim() == 0) param_.stride = Shape1(1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1); - if (param_.pad.ndim() == 0) param_.pad = Shape1(0); - } else if (param_.kernel.ndim() == 2) { - param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW; - if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); - } else { - CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D convolution not supported"; - param_.layout = param_.layout ? 
param_.layout.value(): mshadow::kNCDHW; - if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); - } - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - if (!param_.no_bias) { - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; - } else { - CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; - } - // CHECK_EQ(out_shape->size(), 1) << "Output: [output]"; - out_shape->resize(1, TShape()); - const TShape &dshp = (*in_shape)[conv::kData]; - if (dshp.ndim() == 0) return false; - - if (param_.kernel.ndim() == 1) { - // 1d conv - CHECK_EQ(dshp.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x"; - Shape<3> dshape = ConvertLayout(dshp.get<3>(), param_.layout.value(), kNCW); - Shape<3> wshape = Shape3(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, - param_.kernel[0]); - wshape = ConvertLayout(wshape, kNCW, param_.layout.value()); - wshape[0] *= param_.num_group; - SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); - } - - const index_t dilated_ksize_x = param_.DilatedKernelSize(0); - CHECK_EQ(dshape[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - Shape<3> oshape; - oshape[0] = dshape[0]; - oshape[1] = param_.num_filter; - oshape[2] = dshape[2] ? - (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : 0; - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value())); - // Perform incomplete shape inference. Fill in the missing values in data shape. - // 1) We can always fill in the batch_size. - // 2) We can back-calculate the input height/width if the corresponding stride is 1. 
- oshape = ConvertLayout((*out_shape)[0].get<3>(), param_.layout.value(), kNCW); - dshape[0] = oshape[0]; - if (oshape[2] && param_.stride[0] == 1) { - dshape[2] = oshape[2] + dilated_ksize_x - 1 - 2 * param_.pad[0]; - } - SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, - ConvertLayout(dshape, kNCW, param_.layout.value())); - // Check whether the kernel sizes are valid - if (dshape[2] != 0) { - CHECK_LE(dilated_ksize_x, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; - } - return true; - } else if (param_.kernel.ndim() == 2) { - // 2d conv - CHECK_EQ(dshp.ndim(), 4U) \ - << "Input data should be 4D in batch-num_filter-y-x"; - Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW); - Shape<4> wshape = Shape4(param_.num_filter / param_.num_group, - dshape[1] / param_.num_group, - param_.kernel[0], param_.kernel[1]); - wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); - wshape[0] *= param_.num_group; - SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); - } - - const index_t dilated_ksize_y = param_.DilatedKernelSize(0); - const index_t dilated_ksize_x = param_.DilatedKernelSize(1); - CHECK_EQ(dshape[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - Shape<4> oshape; - oshape[0] = dshape[0]; - oshape[1] = param_.num_filter; - oshape[2] = dshape[2] ? - (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_y) / param_.stride[0] + 1 : 0; - oshape[3] = dshape[3] ? - (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_x) / param_.stride[1] + 1 : 0; - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); - // Perform incomplete shape inference. Fill in the missing values in data shape. - // 1) We can always fill in the batch_size. - // 2) We can back-calculate the input height/width if the corresponding stride is 1. 
- oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW); - dshape[0] = oshape[0]; - if (oshape[2] && param_.stride[0] == 1) { - dshape[2] = oshape[2] + dilated_ksize_y - 1 - 2 * param_.pad[0]; - } - if (oshape[3] && param_.stride[1] == 1) { - dshape[3] = oshape[3] + dilated_ksize_x - 1 - 2 * param_.pad[1]; - } - SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, - ConvertLayout(dshape, kNCHW, param_.layout.value())); - // Check whether the kernel sizes are valid - if (dshape[2] != 0) { - CHECK_LE(dilated_ksize_y, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; - } - if (dshape[3] != 0) { - CHECK_LE(dilated_ksize_x, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input"; - } - return true; - } else if (param_.kernel.ndim() == 3) { - // 3d conv - CHECK_EQ(dshp.ndim(), 5U) \ - << "Input data should be 5D in batch-num_filter-depth-y-x"; - Shape<5> dshape = ConvertLayout(dshp.get<5>(), param_.layout.value(), kNCDHW); - Shape<5> wshape = Shape5(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, - param_.kernel[0], param_.kernel[1], param_.kernel[2]); - wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value()); - wshape[0] *= param_.num_group; - SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); - } - - // Note: 3D dilation currently not supported. - // Calculations below done to preserve symmetry with 1D/2D code. - const index_t dilated_ksize_d = param_.DilatedKernelSize(0); - const index_t dilated_ksize_y = param_.DilatedKernelSize(1); - const index_t dilated_ksize_x = param_.DilatedKernelSize(2); - CHECK_EQ(dshape[1] % param_.num_group, 0U) - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - CHECK_EQ(param_.dilate.Size(), 1U) - << "Dilate is not supported in 3d convolution"; - Shape<5> oshape; - oshape[0] = dshape[0]; - oshape[1] = param_.num_filter; - oshape[2] = dshape[2] ? - (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_d) / param_.stride[0] + 1 : 0; - oshape[3] = dshape[3] ? - (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_y) / param_.stride[1] + 1 : 0; - oshape[4] = dshape[4] ? - (AddPad(dshape[4], param_.pad[2]) - dilated_ksize_x) / param_.stride[2] + 1 : 0; - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value())); - // Perform incomplete shape inference. Fill in the missing values in data shape. - // 1) We can always fill in the batch_size. - // 2) We can back-calculate the input depth/height/width if the corresponding stride is 1. 
- oshape = ConvertLayout((*out_shape)[0].get<5>(), param_.layout.value(), kNCDHW); - dshape[0] = oshape[0]; - if (oshape[2] && param_.stride[0] == 1) { - dshape[2] = oshape[2] + dilated_ksize_d - 1 - 2 * param_.pad[0]; - } - if (oshape[3] && param_.stride[1] == 1) { - dshape[3] = oshape[3] + dilated_ksize_y - 1 - 2 * param_.pad[1]; - } - if (oshape[4] && param_.stride[2] == 1) { - dshape[4] = oshape[4] + dilated_ksize_x - 1 - 2 * param_.pad[2]; - } - SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, - ConvertLayout(dshape, kNCDHW, param_.layout.value())); - // Check whether the kernel sizes are valid - if (dshape[2] != 0) { - CHECK_LE(dilated_ksize_d, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; - } - if (dshape[3] != 0) { - CHECK_LE(dilated_ksize_y, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input"; - } - if (dshape[4] != 0) { - CHECK_LE(dilated_ksize_x, AddPad(dshape[4], param_.pad[2])) << "kernel size exceed input"; - } - return true; - } else { - LOG(FATAL) << "Unknown convolution type"; - return false; - } - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); - } - } - out_type->clear(); - out_type->push_back(dtype); - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new ConvolutionProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "Convolution"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[conv::kOut], in_data[conv::kData], in_data[conv::kWeight]}; - } - - std::vector ForwardResource( - const std::vector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } - - std::vector BackwardResource( - const std::vector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; - } - - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; +void ConvolutionCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + MSHADOW_REAL_TYPE_SWITCH(inputs[conv::kData].type_flag_, DType, { + ConvolutionOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); + }); +} - private: - // Adds symmetric padding to a data input (in one dimension) - index_t AddPad(index_t dsize, index_t pad) const { - return dsize + 2 * pad; - } +template +void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + std::vector in_data(inputs.begin() + 1, inputs.end()); + const TBlob &out_grad = inputs[0]; + const std::vector &in_grad = outputs; + + MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, { + ConvolutionOp op; + op.Init(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + }); +} - ConvolutionParam 
param_; -}; // class ConvolutionProp -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_NN_CONVOLUTION_INL_H_ diff --git a/src/operator/nn/convolution.cc b/src/operator/nn/convolution.cc index ef8ec9034db2..951063fb4b2f 100644 --- a/src/operator/nn/convolution.cc +++ b/src/operator/nn/convolution.cc @@ -21,15 +21,13 @@ * Copyright (c) 2017 by Contributors * \file convolution.cc * \brief - * \author Bing Xu, Jun Wu + * \author Bing Xu, Jun Wu, Da Zheng */ #include "./convolution-inl.h" -#if MXNET_USE_MKL2017 == 1 -#include -#include "../mkl/mkl_memory-inl.h" -#include "../mkl/mkl_convolution-inl.h" -#endif // MXNET_USE_MKL2017 +#include "../elemwise_op_common.h" +#include "./mkldnn/mkldnn_ops-inl.h" +#include "./mkldnn/mkldnn_base-inl.h" #if MXNET_USE_NNPACK == 1 #include "./nnpack/nnpack_convolution-inl.h" #endif // MXNET_USE_NNPACK @@ -38,63 +36,351 @@ namespace mxnet { namespace op { DMLC_REGISTER_PARAMETER(ConvolutionParam); -template<> -Operator* CreateOp(ConvolutionParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx) { - Operator *op = NULL; - // If 1D convolution, use MXNet implementation - if (param.kernel.ndim() == 1) { - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new ConvolutionOp(param); - }) - return op; +static inline index_t AddPad(index_t dsize, index_t pad) { + return dsize + 2 * pad; +} + +static inline std::vector ListArguments(const ConvolutionParam& param_) { + if (!param_.no_bias) { + return {"data", "weight", "bias"}; + } else { + return {"data", "weight"}; } -#if MXNET_USE_MKL2017 == 1 - if ((param.dilate[0] == 1 && param.dilate[1] == 1) - && param.kernel.ndim() == 2) { - switch (dtype) { - case mshadow::kFloat32: - return new MKLConvolutionOp(param); - case mshadow::kFloat64: - return new MKLConvolutionOp(param); - default: - break; - } +} + +#if MXNET_USE_MKLDNN == 1 +static void ConvolutionComputeExCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + if (SupportMKLDNNConv(inputs[0])) { + MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + MKLDNNConvolutionForward(attrs, ctx, inputs, req, outputs); + MKLDNN_OPCHECK_RUN(ConvolutionCompute, attrs, ctx, inputs, req, outputs); + return; } + FallBackCompute(ConvolutionCompute, attrs, ctx, inputs, req, outputs); +} + +static void ConvolutionGradComputeExCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + if (SupportMKLDNNConv(inputs[0])) { + MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); + MKLDNNConvolutionBackward(attrs, ctx, inputs, req, outputs); + MKLDNN_OPCHECK_RUN(ConvolutionGradCompute, attrs, ctx, inputs, req, outputs); + return; + } + FallBackCompute(ConvolutionGradCompute, attrs, ctx, inputs, req, outputs); +} #endif -#if MXNET_USE_NNPACK == 1 - const size_t batch_size = (*in_shape)[0][0]; - if ((param.dilate[0] == 1 && param.dilate[1] == 1) - && param.kernel.ndim() == 2 && (!param.no_bias) - && param.num_group == 1 && (batch_size == 1 || - ((batch_size > 1) && (param.stride[0] == 1) && - (param.stride[1] == 1)))) { - switch (dtype) { - case mshadow::kFloat32: - return new NNPACKConvolutionOp(param); - default: - break; + +static bool ConvolutionShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, + std::vector *out_shape) { + using namespace mshadow; + const ConvolutionParam& param_ = nnvm::get(attrs.parsed); + if 
(!param_.no_bias) { + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; + } else { + CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; + } + // CHECK_EQ(out_shape->size(), 1) << "Output: [output]"; + out_shape->resize(1, TShape()); + const TShape &dshp = (*in_shape)[conv::kData]; + if (dshp.ndim() == 0) return false; + + if (param_.kernel.ndim() == 1) { + // 1d conv + CHECK_EQ(dshp.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x"; + Shape<3> dshape = ConvertLayout(dshp.get<3>(), param_.layout.value(), kNCW); + Shape<3> wshape = Shape3(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, + param_.kernel[0]); + wshape = ConvertLayout(wshape, kNCW, param_.layout.value()); + wshape[0] *= param_.num_group; + SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); + } + + const index_t dilated_ksize_x = param_.DilatedKernelSize(0); + CHECK_EQ(dshape[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + Shape<3> oshape; + oshape[0] = dshape[0]; + oshape[1] = param_.num_filter; + oshape[2] = dshape[2] ? + (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_x) / param_.stride[0] + 1 : 0; + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value())); + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. + // 2) We can back-calculate the input height/width if the corresponding stride is 1. 
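A self-contained numeric check of the output-length formula computed a few lines above; the sizes are made up for illustration, and the dilated kernel size is assumed to follow the usual 1 + (kernel - 1) * dilate convention:

#include <cassert>
#include <cstddef>

int main() {
  // Hypothetical 1-D case: input width 10, kernel 3, dilation 2, pad 1, stride 2.
  const std::size_t W = 10, kernel = 3, dilate = 2, pad = 1, stride = 2;
  const std::size_t dilated_ksize = 1 + (kernel - 1) * dilate;         // 5
  const std::size_t out = (W + 2 * pad - dilated_ksize) / stride + 1;  // (12 - 5) / 2 + 1 == 4
  assert(dilated_ksize == 5 && out == 4);  // integer division floors, matching the code above
  return 0;
}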
+ oshape = ConvertLayout((*out_shape)[0].get<3>(), param_.layout.value(), kNCW); + dshape[0] = oshape[0]; + if (oshape[2] && param_.stride[0] == 1) { + dshape[2] = oshape[2] + dilated_ksize_x - 1 - 2 * param_.pad[0]; + } + SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, + ConvertLayout(dshape, kNCW, param_.layout.value())); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(dilated_ksize_x, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; + } + return true; + } else if (param_.kernel.ndim() == 2) { + // 2d conv + CHECK_EQ(dshp.ndim(), 4U) \ + << "Input data should be 4D in batch-num_filter-y-x"; + Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW); + Shape<4> wshape = Shape4(param_.num_filter / param_.num_group, + dshape[1] / param_.num_group, + param_.kernel[0], param_.kernel[1]); + wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); + wshape[0] *= param_.num_group; + SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); + } + + const index_t dilated_ksize_y = param_.DilatedKernelSize(0); + const index_t dilated_ksize_x = param_.DilatedKernelSize(1); + CHECK_EQ(dshape[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + Shape<4> oshape; + oshape[0] = dshape[0]; + oshape[1] = param_.num_filter; + oshape[2] = dshape[2] ? + (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_y) / param_.stride[0] + 1 : 0; + oshape[3] = dshape[3] ? + (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_x) / param_.stride[1] + 1 : 0; + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. + // 2) We can back-calculate the input height/width if the corresponding stride is 1. 
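The back-calculation mentioned in the comment above is only exact when the stride is 1, because then the forward relation out = in + 2*pad - dilated_ksize + 1 can be inverted directly; a small sketch with assumed values (224 output, 3x3 kernel, pad 1), not values from this patch:

#include <cassert>
#include <cstddef>

int main() {
  const std::size_t out_h = 224, kernel = 3, dilate = 1, pad = 1;  // hypothetical "same" convolution
  const std::size_t dk = 1 + (kernel - 1) * dilate;                // dilated kernel size == 3
  const std::size_t in_h = out_h + dk - 1 - 2 * pad;               // recovered input height == 224
  assert(in_h + 2 * pad - dk + 1 == out_h);                        // forward relation holds at stride 1
  return 0;
}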
+ oshape = ConvertLayout((*out_shape)[0].get<4>(), param_.layout.value(), kNCHW); + dshape[0] = oshape[0]; + if (oshape[2] && param_.stride[0] == 1) { + dshape[2] = oshape[2] + dilated_ksize_y - 1 - 2 * param_.pad[0]; + } + if (oshape[3] && param_.stride[1] == 1) { + dshape[3] = oshape[3] + dilated_ksize_x - 1 - 2 * param_.pad[1]; + } + SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, + ConvertLayout(dshape, kNCHW, param_.layout.value())); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(dilated_ksize_y, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; + } + if (dshape[3] != 0) { + CHECK_LE(dilated_ksize_x, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input"; + } + return true; + } else if (param_.kernel.ndim() == 3) { + // 3d conv + CHECK_EQ(dshp.ndim(), 5U) \ + << "Input data should be 5D in batch-num_filter-depth-y-x"; + Shape<5> dshape = ConvertLayout(dshp.get<5>(), param_.layout.value(), kNCDHW); + Shape<5> wshape = Shape5(param_.num_filter / param_.num_group, dshape[1] / param_.num_group, + param_.kernel[0], param_.kernel[1], param_.kernel[2]); + wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value()); + wshape[0] *= param_.num_group; + SHAPE_ASSIGN_CHECK(*in_shape, conv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, conv::kBias, Shape1(param_.num_filter)); + } + + // Note: 3D dilation currently not supported. + // Calculations below done to preserve symmetry with 1D/2D code. + const index_t dilated_ksize_d = param_.DilatedKernelSize(0); + const index_t dilated_ksize_y = param_.DilatedKernelSize(1); + const index_t dilated_ksize_x = param_.DilatedKernelSize(2); + CHECK_EQ(dshape[1] % param_.num_group, 0U) + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + CHECK_EQ(param_.dilate.Size(), 1U) + << "Dilate is not supported in 3d convolution"; + Shape<5> oshape; + oshape[0] = dshape[0]; + oshape[1] = param_.num_filter; + oshape[2] = dshape[2] ? + (AddPad(dshape[2], param_.pad[0]) - dilated_ksize_d) / param_.stride[0] + 1 : 0; + oshape[3] = dshape[3] ? + (AddPad(dshape[3], param_.pad[1]) - dilated_ksize_y) / param_.stride[1] + 1 : 0; + oshape[4] = dshape[4] ? + (AddPad(dshape[4], param_.pad[2]) - dilated_ksize_x) / param_.stride[2] + 1 : 0; + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value())); + // Perform incomplete shape inference. Fill in the missing values in data shape. + // 1) We can always fill in the batch_size. + // 2) We can back-calculate the input depth/height/width if the corresponding stride is 1. 
+ oshape = ConvertLayout((*out_shape)[0].get<5>(), param_.layout.value(), kNCDHW); + dshape[0] = oshape[0]; + if (oshape[2] && param_.stride[0] == 1) { + dshape[2] = oshape[2] + dilated_ksize_d - 1 - 2 * param_.pad[0]; } + if (oshape[3] && param_.stride[1] == 1) { + dshape[3] = oshape[3] + dilated_ksize_y - 1 - 2 * param_.pad[1]; + } + if (oshape[4] && param_.stride[2] == 1) { + dshape[4] = oshape[4] + dilated_ksize_x - 1 - 2 * param_.pad[2]; + } + SHAPE_ASSIGN_CHECK(*in_shape, conv::kData, + ConvertLayout(dshape, kNCDHW, param_.layout.value())); + // Check whether the kernel sizes are valid + if (dshape[2] != 0) { + CHECK_LE(dilated_ksize_d, AddPad(dshape[2], param_.pad[0])) << "kernel size exceed input"; + } + if (dshape[3] != 0) { + CHECK_LE(dilated_ksize_y, AddPad(dshape[3], param_.pad[1])) << "kernel size exceed input"; + } + if (dshape[4] != 0) { + CHECK_LE(dilated_ksize_x, AddPad(dshape[4], param_.pad[2])) << "kernel size exceed input"; + } + return true; + } else { + LOG(FATAL) << "Unknown convolution type"; + return false; } +} + +static bool ConvolutionType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, std::vector *out_type) { + const ConvolutionParam& param_ = nnvm::get(attrs.parsed); + CHECK_GE(in_type->size(), 1U); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param_)[i]); + } + } + out_type->clear(); + out_type->push_back(dtype); + return true; +} + +inline static bool ConvStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + uint32_t in_expected = param.no_bias ? 2 : 3; + CHECK_EQ(in_attrs->size(), in_expected); + CHECK_EQ(out_attrs->size(), 1); + + DispatchMode wanted_mode; +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) + wanted_mode = DispatchMode::kFComputeEx; + else #endif - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new ConvolutionOp(param); - }) - return op; + wanted_mode = DispatchMode::kFCompute; + return storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, wanted_mode); } -// DO_BIND_DISPATCH comes from operator_common.h -Operator *ConvolutionProp::CreateOperatorEx(Context ctx, - std::vector *in_shape, - std::vector *in_type) const { - std::vector out_shape, aux_shape; - CHECK(InferShape(in_shape, &out_shape, &aux_shape)); - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx); +inline static bool BackwardConvStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + uint32_t in_expected = param.no_bias ? 3 : 4; + uint32_t out_expected = param.no_bias ? 
2 : 3; + CHECK_EQ(in_attrs->size(), in_expected); + CHECK_EQ(out_attrs->size(), out_expected); + + DispatchMode wanted_mode; +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) + wanted_mode = DispatchMode::kFComputeEx; + else +#endif + wanted_mode = DispatchMode::kFCompute; + return storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, wanted_mode); +} + +static void ConvolutionParamParser(nnvm::NodeAttrs* attrs) { + using namespace mshadow; + ConvolutionParam param_; + try { + param_.Init(attrs->dict); + } catch (const dmlc::ParamError& e) { + std::ostringstream os; + os << e.what(); + os << ", in operator " << attrs->op->name << "(" + << "name=\"" << attrs->name << "\""; + for (const auto& k : attrs->dict) { + os << ", " << k.first << "=\"" << k.second << "\""; + } + os << ")"; + throw dmlc::ParamError(os.str()); + } + + if (param_.kernel.ndim() == 1) { + param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW; + if (param_.stride.ndim() == 0) param_.stride = Shape1(1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1); + if (param_.pad.ndim() == 0) param_.pad = Shape1(0); + } else if (param_.kernel.ndim() == 2) { + param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW; + if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); + } else { + CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D convolution not supported"; + param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW; + if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); + } + attrs->parsed = std::move(param_); } -MXNET_REGISTER_OP_PROPERTY(Convolution, ConvolutionProp) +struct ConvolutionGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + const ConvolutionParam& param = nnvm::get(n->attrs.parsed); + std::vector heads(ograds.begin(), ograds.end()); + heads.push_back(n->inputs[conv::kData]); + heads.push_back(n->inputs[conv::kWeight]); + if (!param.no_bias) + heads.push_back(n->inputs[conv::kBias]); + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + +NNVM_REGISTER_OP(Convolution) .describe(R"code(Compute *N*-D convolution on *(N+2)*-D input. In the 2-D convolution, given input data with shape *(batch_size, @@ -168,10 +454,51 @@ There are other options to tune the performance. the performance. )code" ADD_FILELINE) +.set_num_inputs([](const NodeAttrs& attrs) { + const ConvolutionParam& params = nnvm::get(attrs.parsed); + return params.no_bias ? 
2 : 3; +}) +.set_num_outputs(1) +.set_attr_parser(ConvolutionParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + const ConvolutionParam& params = nnvm::get(attrs.parsed); + if (params.no_bias) + return std::vector{"data", "weight"}; + else + return std::vector{"data", "weight", "bias"}; +}) +.set_attr("FInferShape", ConvolutionShape) +.set_attr("FInferType", ConvolutionType) +.set_attr("FInferStorageType", ConvStorageType) +.set_attr("FCompute", ConvolutionCompute) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FComputeEx", ConvolutionComputeExCPU) +#endif +.set_attr("FGradient", ConvolutionGrad{"_backward_Convolution"}) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) .add_argument("data", "NDArray-or-Symbol", "Input data to the ConvolutionOp.") .add_argument("weight", "NDArray-or-Symbol", "Weight matrix.") .add_argument("bias", "NDArray-or-Symbol", "Bias parameter.") .add_arguments(ConvolutionParam::__FIELDS__()); +NNVM_REGISTER_OP(_backward_Convolution) +.set_num_outputs([](const NodeAttrs& attrs) { + const ConvolutionParam& params = nnvm::get(attrs.parsed); + return params.no_bias ? 2 : 3; +}) +.set_attr("TIsBackward", true) +.set_attr("FInferStorageType", BackwardConvStorageType) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +.set_attr_parser(ConvolutionParamParser) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FComputeEx", ConvolutionGradComputeExCPU) +#endif +.set_attr("FCompute", ConvolutionGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/convolution.cu b/src/operator/nn/convolution.cu index 7234daf0d614..d7f9e564a603 100644 --- a/src/operator/nn/convolution.cu +++ b/src/operator/nn/convolution.cu @@ -21,36 +21,66 @@ * Copyright (c) 2017 by Contributors * \file convolution.cu * \brief - * \author Bing Xu, Jun Wu + * \author Bing Xu, Jun Wu, Da Zheng */ #include "./convolution-inl.h" #include +#include "./depthwise_convolution-inl.h" #if MXNET_USE_CUDNN == 1 #include "./cudnn/cudnn_convolution-inl.h" #endif // MXNET_USE_CUDNN -#include "./depthwise_convolution-inl.h" - namespace mxnet { namespace op { +#if MXNET_USE_CUDNN == 1 +template +static CuDNNConvolutionOp &GetCuDNNConvOp(const ConvolutionParam& param, + int forward_compute_type, int backward_compute_type, + const std::vector& in_shape, const std::vector& out_shape, + const Context& ctx) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local CuDNNConvolutionOp op; +#else + static MX_THREAD_LOCAL CuDNNConvolutionOp op; +#endif + op.Init(param, forward_compute_type, backward_compute_type, + in_shape, out_shape, ctx); + return op; +} +#endif + template<> -Operator* CreateOp(ConvolutionParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx) { - Operator *op = NULL; - - // depth wise conv - if (param.num_filter == param.num_group && +void ConvolutionCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + int dtype = inputs[conv::kData].type_flag_; + + // If 1D convolution, use MXNet implementation + if (param.kernel.ndim() == 1) { + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + ConvolutionOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); + }) + return; + } else if (param.num_filter == param.num_group && param.layout.value() == mshadow::kNCHW && - param.num_filter == 
(*in_shape)[conv::kData][1] && + param.num_filter == inputs[conv::kData].shape_[1] && param.kernel.ndim() == 2 && param.dilate == mshadow::Shape2(1, 1) && dtype == mshadow::kFloat32) { - op = new DepthwiseConvolutionOp(param, *in_shape, *out_shape); - return op; + std::vector in_shape(inputs.size()); + std::vector out_shape(1, outputs[0].shape_); + for (size_t i = 0; i < in_shape.size(); i++) + in_shape[i] = inputs[i].shape_; + DepthwiseConvolutionOp op; + op.Init(param, in_shape, out_shape); + op.Forward(ctx, inputs, req, outputs); + return; } #if MXNET_USE_CUDNN == 1 @@ -59,23 +89,111 @@ Operator* CreateOp(ConvolutionParam param, int dtype, MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { if (param.cudnn_off) { - op = new ConvolutionOp(param); - } else if (!CuDNNConvolutionOp::Supports(param, compute_type, compute_type, ctx)) { + ConvolutionOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); + } else if (!CuDNNConvolutionOp::Supports(param, + compute_type, compute_type, ctx.run_ctx.ctx)) { LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied."; - op = new ConvolutionOp(param); + ConvolutionOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); } else { - op = new CuDNNConvolutionOp(param, compute_type, compute_type, - *in_shape, *out_shape, ctx); + std::vector in_shape(inputs.size()); + std::vector out_shape(1, outputs[0].shape_); + for (size_t i = 0; i < in_shape.size(); i++) + in_shape[i] = inputs[i].shape_; + CuDNNConvolutionOp &op = GetCuDNNConvOp(param, + compute_type, compute_type, in_shape, out_shape, ctx.run_ctx.ctx); + op.Forward(ctx, inputs, req, outputs); } }) #else MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new ConvolutionOp(param); + ConvolutionOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); + }) +#endif // MXNET_USE_CUDNN +} + +template<> +void ConvolutionGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const ConvolutionParam& param = nnvm::get(attrs.parsed); + std::vector in_data(inputs.begin() + 1, inputs.end()); + const TBlob &out_grad = inputs[0]; + const std::vector &in_grad = outputs; + int dtype = out_grad.type_flag_; + + // If 1D convolution, use MXNet implementation + if (param.kernel.ndim() == 1) { + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + ConvolutionOp op; + op.Init(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + }) + return; + } else if (param.num_filter == param.num_group && + param.layout.value() == mshadow::kNCHW && + param.num_filter == in_data[conv::kData].shape_[1] && + param.kernel.ndim() == 2 && + param.dilate == mshadow::Shape2(1, 1) && + dtype == mshadow::kFloat32) { + // The first element stores out grad. + std::vector in_shape(in_data.size()); + std::vector out_shape(1, out_grad.shape_); + for (size_t i = 0; i < in_shape.size(); i++) + in_shape[i] = in_data[i].shape_; + DepthwiseConvolutionOp op; + op.Init(param, in_shape, out_shape); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + return; + } + +#if MXNET_USE_CUDNN == 1 + // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16). + int compute_type = (dtype == mshadow::kFloat16) ? 
mshadow::kFloat32 : dtype; + + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + if (param.cudnn_off) { + ConvolutionOp op; + op.Init(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + } else if (!CuDNNConvolutionOp::Supports(param, + compute_type, compute_type, ctx.run_ctx.ctx)) { + LOG(WARNING) << "This convolution is not supported by cudnn, MXNET convolution is applied."; + ConvolutionOp op; + op.Init(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + } else { + // The first element stores out grad. + std::vector in_shape(in_data.size()); + std::vector out_shape(1, out_grad.shape_); + for (size_t i = 0; i < in_shape.size(); i++) + in_shape[i] = in_data[i].shape_; + CuDNNConvolutionOp &op = GetCuDNNConvOp(param, + compute_type, compute_type, in_shape, out_shape, ctx.run_ctx.ctx); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + } + }) +#else + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + ConvolutionOp op; + op.Init(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); }) #endif // MXNET_USE_CUDNN - return op; } +NNVM_REGISTER_OP(Convolution) +.set_attr("FCompute", ConvolutionCompute); + +NNVM_REGISTER_OP(_backward_Convolution) +.set_attr("FCompute", ConvolutionGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/cudnn/cudnn_activation-inl.h b/src/operator/nn/cudnn/cudnn_activation-inl.h index 888528309cdf..a89e7bfaf080 100644 --- a/src/operator/nn/cudnn/cudnn_activation-inl.h +++ b/src/operator/nn/cudnn/cudnn_activation-inl.h @@ -33,12 +33,19 @@ namespace mxnet { namespace op { template -class CuDNNActivationOp : public Operator { +class CuDNNActivationOp { public: - explicit CuDNNActivationOp(ActivationParam param) { - param_ = param; - init_cudnn_ = false; + CuDNNActivationOp() { dtype_ = mshadow::DataType::kCudnnFlag; + #if CUDNN_MAJOR >= 5 + nan_prop_ = CUDNN_NOT_PROPAGATE_NAN; + CUDNN_CALL(cudnnCreateActivationDescriptor(&desc_)); + #endif + CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_)); + } + + void Init(const ActivationParam ¶m) { + param_ = param; switch (param_.act_type) { case activation::kReLU: mode_ = CUDNN_ACTIVATION_RELU; @@ -54,67 +61,54 @@ class CuDNNActivationOp : public Operator { break; } #if CUDNN_MAJOR >= 5 - nan_prop_ = CUDNN_NOT_PROPAGATE_NAN; - CUDNN_CALL(cudnnCreateActivationDescriptor(&desc_)); CUDNN_CALL(cudnnSetActivationDescriptor(desc_, mode_, nan_prop_, relu_ceil_)); #endif } ~CuDNNActivationOp() { - if (init_cudnn_) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_)); - #if CUDNN_MAJOR >= 5 - CUDNN_CALL(cudnnDestroyActivationDescriptor(desc_)); - #endif - } + CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_)); + #if CUDNN_MAJOR >= 5 + CUDNN_CALL(cudnnDestroyActivationDescriptor(desc_)); + #endif } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + void Forward(const OpContext &ctx, const TBlob &in_data, + const OpReqType &req, const TBlob &out_data) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); Tensor data; Tensor out; - if (in_data[activation::kData].ndim() == 2) { - Shape<4> dshape = Shape4(in_data[activation::kData].shape_[0], - in_data[activation::kData].shape_[1], 1, 1); - data = in_data[activation::kData].get_with_shape(dshape, s); - out = out_data[activation::kOut].get_with_shape(dshape, s); + if 
(in_data.ndim() == 2) { + Shape<4> dshape = Shape4(in_data.shape_[0], + in_data.shape_[1], 1, 1); + data = in_data.get_with_shape(dshape, s); + out = out_data.get_with_shape(dshape, s); } else { Shape<4> dshape; - index_t size_left = in_data[activation::kData].Size(); + index_t size_left = in_data.Size(); for (int i = 0; i < 3; ++i) { - if (i < in_data[activation::kData].ndim()) { - dshape[i] = in_data[activation::kData].shape_[i]; + if (i < in_data.ndim()) { + dshape[i] = in_data.shape_[i]; } else { dshape[i] = 1; } size_left /= dshape[i]; } dshape[3] = size_left; - data = in_data[activation::kData].get_with_shape(dshape, s); - out = out_data[activation::kOut].get_with_shape(dshape, s); + data = in_data.get_with_shape(dshape, s); + out = out_data.get_with_shape(dshape, s); } typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); - if (!init_cudnn_) { - init_cudnn_ = true; - CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_)); - CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - data.shape_[0], - data.shape_[1], - data.shape_[2], - data.shape_[3])); - } + CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + data.shape_[0], + data.shape_[1], + data.shape_[2], + data.shape_[3])); #if CUDNN_MAJOR <= 4 CUDNN_CALL(cudnnActivationForward(s->dnn_handle_, mode_, @@ -136,20 +130,11 @@ class CuDNNActivationOp : public Operator { #endif } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + void Backward(const OpContext &ctx, const TBlob &out_grad, + const TBlob &in_data, const TBlob &out_data, + const OpReqType &req, const TBlob &in_grad) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - CHECK_EQ(req.size(), 1U); - CHECK_EQ(in_grad.size(), 1U); typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; Stream *s = ctx.get_stream(); @@ -157,31 +142,38 @@ class CuDNNActivationOp : public Operator { Tensor data; Tensor output_data; Tensor input_grad; - if (in_grad[activation::kData].ndim() == 2) { - Shape<4> dshape = Shape4(in_grad[activation::kData].shape_[0], - in_grad[activation::kData].shape_[1], 1, 1); - data = in_data[activation::kData].get_with_shape(dshape, s); - grad = out_grad[activation::kOut].get_with_shape(dshape, s); - output_data = out_data[activation::kOut].get_with_shape(dshape, s); - input_grad = in_grad[activation::kData].get_with_shape(dshape, s); + if (in_grad.ndim() == 2) { + Shape<4> dshape = Shape4(in_grad.shape_[0], + in_grad.shape_[1], 1, 1); + data = in_data.get_with_shape(dshape, s); + grad = out_grad.get_with_shape(dshape, s); + output_data = out_data.get_with_shape(dshape, s); + input_grad = in_grad.get_with_shape(dshape, s); } else { Shape<4> dshape; - index_t size_left = in_grad[activation::kData].Size(); + index_t size_left = in_grad.Size(); for (int i = 0; i < 3; ++i) { - if (i < in_grad[activation::kData].ndim()) { - dshape[i] = in_grad[activation::kData].shape_[i]; + if (i < in_grad.ndim()) { + dshape[i] = in_grad.shape_[i]; } else { dshape[i] = 1; } size_left /= dshape[i]; } dshape[3] = size_left; - data = in_data[activation::kData].get_with_shape(dshape, s); - output_data = 
out_data[activation::kOut].get_with_shape(dshape, s); - grad = out_grad[activation::kOut].get_with_shape(dshape, s); - input_grad = in_grad[activation::kData].get_with_shape(dshape, s); + data = in_data.get_with_shape(dshape, s); + output_data = out_data.get_with_shape(dshape, s); + grad = out_grad.get_with_shape(dshape, s); + input_grad = in_grad.get_with_shape(dshape, s); } CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); + CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + data.shape_[0], + data.shape_[1], + data.shape_[2], + data.shape_[3])); #if CUDNN_MAJOR <= 4 CUDNN_CALL(cudnnActivationBackward(s->dnn_handle_, mode_, @@ -212,7 +204,6 @@ class CuDNNActivationOp : public Operator { } private: - bool init_cudnn_; cudnnDataType_t dtype_; cudnnActivationMode_t mode_; cudnnTensorDescriptor_t shape_desc_; diff --git a/src/operator/nn/cudnn/cudnn_batch_norm-inl.h b/src/operator/nn/cudnn/cudnn_batch_norm-inl.h index 3dc9c8353a35..e2337049060e 100644 --- a/src/operator/nn/cudnn/cudnn_batch_norm-inl.h +++ b/src/operator/nn/cudnn/cudnn_batch_norm-inl.h @@ -43,28 +43,30 @@ enum CuDNNBatchNormOpAuxiliary {kMovingMean, kMovingInvVar}; #if defined(__CUDACC__) template -class CuDNNBatchNormOp : public Operator { +class CuDNNBatchNormOp { public: - explicit CuDNNBatchNormOp(BatchNormParam param) { + CuDNNBatchNormOp() { using namespace mshadow; - CHECK_GE(param.eps, CUDNN_BN_MIN_EPSILON) - << "CuDNN requires eps to be no less than " << CUDNN_BN_MIN_EPSILON; - this->param_ = param; - init_cudnn_ = false; dtype_ = DataType::kCudnnFlag; // For float16 input type beta, gamma, mean, and average are stored in float32. // For other input types, these parameters have the same type as input dtype_param_ = (dtype_ == CUDNN_DATA_HALF) ? 
kFloat32 : DataType::kFlag; + CUDNN_CALL(cudnnCreateTensorDescriptor(&io_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&mean_desc_)); + } + + void Init(const BatchNormParam ¶m) { + CHECK_GE(param.eps, CUDNN_BN_MIN_EPSILON) + << "CuDNN requires eps to be no less than " << CUDNN_BN_MIN_EPSILON; + this->param_ = param; } ~CuDNNBatchNormOp() { - if (init_cudnn_) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(io_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(mean_desc_)); - } + CUDNN_CALL(cudnnDestroyTensorDescriptor(io_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(mean_desc_)); } - virtual void Forward(const OpContext &ctx, + void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, const std::vector &out_data, @@ -84,29 +86,7 @@ class CuDNNBatchNormOp : public Operator { CHECK_GE(in_data[cudnnbatchnorm::kData].ndim(), 2); CHECK_LE(in_data[cudnnbatchnorm::kData].ndim(), 4); - if (!init_cudnn_) { - for (int i = 0; i < 4; ++i) { - if (i < in_data[cudnnbatchnorm::kData].ndim()) { - shape_[i] = in_data[cudnnbatchnorm::kData].shape_[i]; - } else { - shape_[i] = 1; - } - } - CUDNN_CALL(cudnnCreateTensorDescriptor(&io_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&mean_desc_)); - CUDNN_CALL(cudnnSetTensor4dDescriptor(io_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - shape_[0], - shape_[1], - shape_[2], - shape_[3])); - CUDNN_CALL(cudnnDeriveBNTensorDescriptor(mean_desc_, - io_desc_, - CUDNN_BATCHNORM_SPATIAL)); - init_cudnn_ = true; - } - + Init(in_data[cudnnbatchnorm::kData]); Stream *s = ctx.get_stream(); Tensor x = in_data[cudnnbatchnorm::kData].get_with_shape(shape_, s); @@ -177,7 +157,7 @@ class CuDNNBatchNormOp : public Operator { }) } - virtual void Backward(const OpContext &ctx, + void Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, const std::vector &out_data, @@ -193,6 +173,7 @@ class CuDNNBatchNormOp : public Operator { CHECK(ctx.is_train && !param_.use_global_stats) << "use global statistics is not yet supported in CuDNNBatchNorm"; + Init(in_data[cudnnbatchnorm::kData]); Stream *s = ctx.get_stream(); Tensor x = in_data[cudnnbatchnorm::kData].get_with_shape(shape_, s); @@ -290,7 +271,27 @@ class CuDNNBatchNormOp : public Operator { } private: - bool init_cudnn_; + void Init(const TBlob &in_data) { + for (int i = 0; i < 4; ++i) { + if (i < in_data.ndim()) { + shape_[i] = in_data.shape_[i]; + } else { + shape_[i] = 1; + } + } + + CUDNN_CALL(cudnnSetTensor4dDescriptor(io_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + shape_[0], + shape_[1], + shape_[2], + shape_[3])); + CUDNN_CALL(cudnnDeriveBNTensorDescriptor(mean_desc_, + io_desc_, + CUDNN_BATCHNORM_SPATIAL)); + } + cudnnDataType_t dtype_; int dtype_param_; cudnnTensorDescriptor_t io_desc_, mean_desc_; @@ -299,91 +300,6 @@ class CuDNNBatchNormOp : public Operator { }; #endif // defined(__CUDACC__) -template -Operator *CreateOp_CuDNNv4(BatchNormParam param); - - -#if DMLC_USE_CXX11 -class CuDNNBatchNormProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, gamma, beta]"; - const TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; - in_shape->at(1) = TShape(Shape1(dshape[1])); - in_shape->at(2) = TShape(Shape1(dshape[1])); - - 
out_shape->clear(); - out_shape->push_back(dshape); - out_shape->push_back(Shape1(dshape[1])); - out_shape->push_back(Shape1(dshape[1])); - - aux_shape->clear(); - aux_shape->push_back(Shape1(dshape[1])); - aux_shape->push_back(Shape1(dshape[1])); - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new CuDNNBatchNormProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "CuDNNBatchNorm"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[cudnnbatchnorm::kOut], - out_data[cudnnbatchnorm::kMean], - out_data[cudnnbatchnorm::kInvVar], - in_data[cudnnbatchnorm::kData], - in_data[cudnnbatchnorm::kGamma] - }; - } - - int NumVisibleOutputs() const override { - return 1; - } - - int NumOutputs() const override { - return 3; - } - - std::vector ListArguments() const override { - return {"data", "gamma", "beta"}; - } - - std::vector ListOutputs() const override { - return {"output", "mean", "inv_var"}; - } - - std::vector ListAuxiliaryStates() const override { - return {"moving_mean", "moving_inv_var"}; - } - - Operator* CreateOperator(Context ctx) const override; - - private: - BatchNormParam param_; -}; // class CuDNNBatchNormProp - -#endif // DMLC_USE_CXX11 #endif // MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4 } // namespace op } // namespace mxnet diff --git a/src/operator/nn/cudnn/cudnn_batch_norm.cc b/src/operator/nn/cudnn/cudnn_batch_norm.cc index e1e0c999b1fb..f1d229dd5421 100644 --- a/src/operator/nn/cudnn/cudnn_batch_norm.cc +++ b/src/operator/nn/cudnn/cudnn_batch_norm.cc @@ -21,46 +21,100 @@ * Copyright (c) 2015 by Contributors * \file cudnn_batch_norm.cc * \brief - * \author Junyuan Xie + * \author Junyuan Xie, Da Zheng */ #include "./cudnn_batch_norm-inl.h" #include +#include "../../elemwise_op_common.h" namespace mxnet { namespace op { -#if CUDNN_MAJOR >= 4 -template<> -Operator *CreateOp_CuDNNv4(BatchNormParam param) { +#if MXNET_USE_CUDNN == 1 && CUDNN_MAJOR >= 4 + +static bool BatchNormShape(const nnvm::NodeAttrs& attrs, std::vector *in_shape, + std::vector *out_shape) { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 5U) << "Input:[data, gamma, beta, moving_mean, moving_var]"; + const TShape &dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + in_shape->at(1) = TShape(Shape1(dshape[1])); + in_shape->at(2) = TShape(Shape1(dshape[1])); + in_shape->at(3) = TShape(Shape1(dshape[1])); + in_shape->at(4) = TShape(Shape1(dshape[1])); + + out_shape->clear(); + out_shape->push_back(dshape); + out_shape->push_back(Shape1(dshape[1])); + out_shape->push_back(Shape1(dshape[1])); + + return true; +} + +static void BatchNormCompute_CPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { LOG(FATAL) << "CuDNNBatchNormOp is only available for gpu."; - return NULL; } -Operator *CuDNNBatchNormProp::CreateOperator(Context ctx) const { -#if CUDNN_MAJOR >= 5 - LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5." 
- "Use the later instead."; - return nullptr; -#else - DO_BIND_DISPATCH(CreateOp_CuDNNv4, param_); -#endif +static void BatchNormGradCompute_CPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + LOG(FATAL) << "CuDNNBatchNormOp is only available for gpu."; } -MXNET_REGISTER_OP_PROPERTY(CuDNNBatchNorm, CuDNNBatchNormProp) +NNVM_REGISTER_OP(CuDNNBatchNorm) .describe("Apply batch normalization to input.") +.set_num_inputs(5) +.set_num_outputs(3) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data", "gamma", "beta", "moving_mean", "moving_var"}; +}) +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return std::vector{"output", "mean", "var"}; +}) +.set_attr("FNumVisibleOutputs", + [](const NodeAttrs& attrs) { + return 1; +}) +.set_attr("FMutateInputs", [](const nnvm::NodeAttrs& attrs) { + return std::vector{3, 4}; +}) +.set_attr("FInferShape", BatchNormShape) +.set_attr("FCompute", BatchNormCompute_CPU) +.set_attr("FGradient", ElemwiseGradUseInOut{"_backward_CuDNNBatchNorm"}) .add_argument("data", "NDArray-or-Symbol", "Input data to batch normalization") -.add_arguments(BatchNormParam::__FIELDS__()); +.add_argument("gamma", "NDArray-or-Symbol", "gamma array") +.add_argument("beta", "NDArray-or-Symbol", "beta array") +.add_argument("moving_mean", "NDArray-or-Symbol", "running mean of input") +.add_argument("moving_var", "NDArray-or-Symbol", "running variance of input") +.add_arguments(BatchNormParam::__FIELDS__()) +.set_attr( + "FSetInputVarAttrOnCompose", + [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) { + if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return; + if (index == 3) { + var->attrs.dict["__init__"] = "[\"zero\", {}]"; + } else if (index == 4) { + var->attrs.dict["__init__"] = "[\"one\", {}]"; + } + }); + +NNVM_REGISTER_OP(_backward_CuDNNBatchNorm) +.set_num_outputs(5) +.set_attr("FMutateInputs", [](const nnvm::NodeAttrs& attrs) { + return std::vector{6, 7}; +}) +.set_attr("TIsBackward", true) +.set_attr_parser(ParamParser) +.set_attr("FCompute", BatchNormGradCompute_CPU); -NNVM_REGISTER_OP(CuDNNBatchNorm) -.set_attr("FSetInputVarAttrOnCompose", - [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) { - if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return; - if (index == 3) { - var->attrs.dict["__init__"] = "[\"zero\", {}]"; - } else if (index == 4) { - var->attrs.dict["__init__"] = "[\"zero\", {}]"; - } - }); #endif // CUDNN_MAJOR >= 4 + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/cudnn/cudnn_batch_norm.cu b/src/operator/nn/cudnn/cudnn_batch_norm.cu index e96db2e5e73f..e07cd1e6c8f6 100644 --- a/src/operator/nn/cudnn/cudnn_batch_norm.cu +++ b/src/operator/nn/cudnn/cudnn_batch_norm.cu @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file cudnn_batch_norm.cu * \brief - * \author Junyuan Xie + * \author Junyuan Xie, Da Zheng */ #include "./cudnn_batch_norm-inl.h" @@ -30,10 +30,60 @@ namespace mxnet { namespace op { #if CUDNN_MAJOR == 4 -template<> -Operator *CreateOp_CuDNNv4(BatchNormParam param) { - return new CuDNNBatchNormOp(param); + +template +static CuDNNBatchNormOp &GetCuDNNOp(const BatchNormParam& param) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local CuDNNBatchNormOp op; +#else + static MX_THREAD_LOCAL CuDNNBatchNormOp op; +#endif + op.Init(param); + return op; +} + +static void 
BatchNormCompute_CuDNNv4(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { +#if CUDNN_MAJOR >= 5 + LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5." + "Use the later instead."; +#else + const BatchNormParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 5U); + std::vector in_data(inputs.begin(), inputs.begin() + 3); + std::vector aux_states(inputs.begin() + 3, inputs.end()); + GetCuDNNOp(param).Forward(ctx, in_data, req, outputs, aux_states); +#endif +} + +static void BatchNormGradCompute_CuDNNv4(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { +#if CUDNN_MAJOR >= 5 + LOG(FATAL) << "CuDNNBatchNorm is merged into BatchNorm for cudnn version above v5." + "Use the later instead."; +#else + CHECK_EQ(inputs.size(), 11U); + const BatchNormParam& param = nnvm::get(attrs.parsed); + std::vector out_grad(1, inputs[0]); + std::vector in_data(inputs.begin() + 3, inputs.begin() + 6); + std::vector aux_states(inputs.begin() + 6, inputs.begin() + 8); + std::vector out_data(inputs.begin() + 8, inputs.end()); + std::vector in_grad(outputs.begin(), outputs.begin() + 3); + GetCuDNNOp(param).Backward(ctx, out_grad, in_data, out_data, + req, in_grad, aux_states); +#endif } + +NNVM_REGISTER_OP(CuDNNBatchNorm) +.set_attr("FCompute", BatchNormCompute_CuDNNv4); + +NNVM_REGISTER_OP(_backward_CuDNNBatchNorm) +.set_attr("FCompute", BatchNormGradCompute_CuDNNv4); + #endif // CUDNN_MAJOR == 4 } // namespace op } // namespace mxnet diff --git a/src/operator/nn/cudnn/cudnn_convolution-inl.h b/src/operator/nn/cudnn/cudnn_convolution-inl.h index 8ffe97d94310..229ba3cb1a8e 100644 --- a/src/operator/nn/cudnn/cudnn_convolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_convolution-inl.h @@ -42,9 +42,19 @@ namespace op { * \brief The Operator used to perform convolution using cuDNN kernels. */ template -class CuDNNConvolutionOp : public Operator { +class CuDNNConvolutionOp { public: - explicit CuDNNConvolutionOp(const ConvolutionParam& param, + CuDNNConvolutionOp() { + CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_)); + CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_)); + } + + void Init(const ConvolutionParam& param, int forward_compute_type, int backward_compute_type, const std::vector& in_shape, @@ -57,8 +67,6 @@ class CuDNNConvolutionOp : public Operator { auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type); // convert MB to words param_.workspace = (param_.workspace << 20) / sizeof(DType); - init_cudnn_ = false; - init_temp_size_ = false; dtype_ = DataType::kCudnnFlag; // TensorCore algos only allowed on fp16-I/O convolutions if permitted by the global policy. 
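(Editorial sketch.) The CUDNN v4 wrappers above bridge the new flat FCompute interface to the legacy op: a thread-local instance is fetched, re-initialised with the parsed parameters, and the single NNVM input vector is re-sliced into the `in_data`/`aux_states` groups (`[data, gamma, beta]` plus `[moving_mean, moving_var]`, the two aux inputs being the ones marked mutable in the registration) that the old Forward signature expects. A rough sketch of that adapter, with illustrative names and a hypothetical `Op` interface:

```cpp
#include <vector>

// One operator instance per thread and per template instantiation; Init() is a
// cheap re-configuration with the current parameters (hypothetical Op interface).
template <typename Op, typename Param>
Op& GetCachedOp(const Param& param) {
  static thread_local Op op;
  op.Init(param);
  return op;
}

// NNVM hands the forward pass one flat vector:
// [data, gamma, beta, moving_mean, moving_var].
template <typename Op, typename Param, typename Blob>
void ForwardAdapter(const Param& param,
                    const std::vector<Blob>& inputs,
                    const std::vector<Blob>& outputs) {
  std::vector<Blob> in_data(inputs.begin(), inputs.begin() + 3);
  std::vector<Blob> aux_states(inputs.begin() + 3, inputs.end());
  // The cached op then runs with the regrouped arguments.
  GetCachedOp<Op>(param).Forward(in_data, aux_states, outputs);
}
```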
cudnn_tensor_core_ = DataType::kFlag == kFloat16 && GetEnvAllowTensorCore(); @@ -102,22 +110,19 @@ class CuDNNConvolutionOp : public Operator { } ~CuDNNConvolutionOp() { - if (init_cudnn_) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_)); - CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_)); - } + CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_)); + CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_)); } - virtual void Forward(const OpContext &ctx, + void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + const std::vector &out_data) { using namespace mshadow; size_t expected = param_.no_bias ? 2 : 3; CHECK_EQ(in_data.size(), expected); @@ -174,18 +179,17 @@ class CuDNNConvolutionOp : public Operator { } } - virtual void Backward(const OpContext &ctx, + void Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, - const std::vector &out_data, const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; size_t expected = param_.no_bias == 0 ? 3 : 2; CHECK_EQ(out_grad.size(), 1U); - CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(in_data.size(), expected); + CHECK_EQ(in_grad.size(), expected); Stream *s = ctx.get_stream(); // I/O's should have 2 more dims than the kernel dim @@ -195,6 +199,7 @@ class CuDNNConvolutionOp : public Operator { DType *data_ptr = GetNdPtr(in_data[conv::kData], param_.kernel.ndim() + 2, s); DType *gdata_ptr = GetNdPtr(in_grad[conv::kData], param_.kernel.ndim() + 2, s); + GetTempSize(ctx); Tensor workspace = AllocateTempWorkspace(ctx, backward_workspace_byte_); size_t workspace_size = TensorSizeBytes(workspace); for (uint32_t g = 0; g < param_.num_group; ++g) { @@ -333,13 +338,6 @@ class CuDNNConvolutionOp : public Operator { size_t expected = param_.no_bias ? 
2 : 3; CHECK_EQ(in_shape.size(), expected); CHECK_EQ(out_shape.size(), 1U); - CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_)); - CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_)); TShape dshape = in_shape[conv::kData]; TShape wshape = in_shape[conv::kWeight]; @@ -512,7 +510,6 @@ class CuDNNConvolutionOp : public Operator { &bias_shape[0], &bias_stride[0])); } - init_cudnn_ = true; } void SelectAlgo(const Context& ctx, @@ -756,7 +753,6 @@ class CuDNNConvolutionOp : public Operator { } void GetTempSize(const OpContext& ctx) { - if (init_temp_size_) return; mshadow::Stream *s = ctx.get_stream(); size_t back_size = 0, back_size_w = 0; CUDNN_CALL(cudnnGetConvolutionBackwardDataWorkspaceSize(s->dnn_handle_, @@ -781,8 +777,6 @@ class CuDNNConvolutionOp : public Operator { out_desc_, forward_algo_.AlgoNumber(), &forward_workspace_byte_)); - - init_temp_size_ = true; } int *CastTShapeToIntPtr(const TShape& s, std::vector *buffer) { @@ -847,8 +841,6 @@ class CuDNNConvolutionOp : public Operator { std::vector param_dilate_; std::vector param_pad_; - bool init_cudnn_; - bool init_temp_size_; // Temp workspace size in bytes needed for Forward() operation. size_t forward_workspace_byte_; // Temp workspace size in bytes needed for Backward() operation. diff --git a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h index bc02d1b73f45..3c80cdcba4c2 100644 --- a/src/operator/nn/cudnn/cudnn_deconvolution-inl.h +++ b/src/operator/nn/cudnn/cudnn_deconvolution-inl.h @@ -39,9 +39,19 @@ namespace op { #if MXNET_USE_CUDNN == 1 template -class CuDNNDeconvolutionOp : public Operator { +class CuDNNDeconvolutionOp { public: - explicit CuDNNDeconvolutionOp(DeconvolutionParam param, + CuDNNDeconvolutionOp() { + CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_)); + CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_)); + CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_)); + } + + void Init(DeconvolutionParam param, int forward_compute_type, int backward_compute_type, const std::vector& in_shape, @@ -54,8 +64,6 @@ class CuDNNDeconvolutionOp : public Operator { auto cudnn_backward_compute_type = convertToCuDNNDataType(backward_compute_type); // convert MB to words param_.workspace = (param_.workspace << 20) / sizeof(DType); - init_cudnn_ = false; - init_temp_size_ = false; dtype_ = mshadow::DataType::kCudnnFlag; // TensorCore algos only allowed on fp16-I/O deconvolutions if permitted by the global policy. 
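(Editorial sketch.) With the convolution op now constructed once and reused (descriptors allocated in the constructor, `init_cudnn_`/`init_temp_size_` dropped), the temporary-workspace requirement can no longer be cached, so `GetTempSize` runs on every Backward. A sketch of what that per-call query looks like, assuming the usual cuDNN workspace functions and omitting error handling:

```cpp
#include <cudnn.h>
#include <algorithm>

// Workspace needed by one Backward() call; re-queried every time because a
// cached op instance may see different shapes between calls.
size_t BackwardWorkspaceBytes(cudnnHandle_t handle,
                              cudnnFilterDescriptor_t filter_desc,
                              cudnnTensorDescriptor_t out_grad_desc,
                              cudnnConvolutionDescriptor_t conv_desc,
                              cudnnTensorDescriptor_t in_desc,
                              cudnnConvolutionBwdDataAlgo_t data_algo,
                              cudnnConvolutionBwdFilterAlgo_t filter_algo) {
  size_t data_bytes = 0, filter_bytes = 0;
  cudnnGetConvolutionBackwardDataWorkspaceSize(handle, filter_desc, out_grad_desc,
                                               conv_desc, in_desc, data_algo,
                                               &data_bytes);
  cudnnGetConvolutionBackwardFilterWorkspaceSize(handle, in_desc, out_grad_desc,
                                                 conv_desc, filter_desc, filter_algo,
                                                 &filter_bytes);
  // One scratch buffer large enough for both the data and the filter gradients.
  return std::max(data_bytes, filter_bytes);
}
```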
cudnn_tensor_core_ = DataType::kFlag == kFloat16 && GetEnvAllowTensorCore(); @@ -99,22 +107,19 @@ class CuDNNDeconvolutionOp : public Operator { } ~CuDNNDeconvolutionOp() { - if (init_cudnn_) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_)); - CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_)); - CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_)); - } + CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(bias_desc_)); + CUDNN_CALL(cudnnDestroyFilterDescriptor(filter_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(forward_conv_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_)); + CUDNN_CALL(cudnnDestroyConvolutionDescriptor(back_conv_desc_w_)); } - virtual void Forward(const OpContext &ctx, + void Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + const std::vector &out_data) { using namespace mshadow; size_t expected = param_.no_bias ? 2 : 3; CHECK_EQ(in_data.size(), expected); @@ -187,18 +192,17 @@ class CuDNNDeconvolutionOp : public Operator { } } - virtual void Backward(const OpContext &ctx, + void Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, - const std::vector &out_data, const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; size_t expected = param_.no_bias == 0 ? 3 : 2; CHECK_EQ(out_grad.size(), 1U); - CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(in_data.size(), param_.no_bias ? 2U : 3U); + CHECK_EQ(in_grad.size(), expected); Stream *s = ctx.get_stream(); // I/O's should have 2 more dims than the kernel dim @@ -213,6 +217,7 @@ class CuDNNDeconvolutionOp : public Operator { CHECK_NE(req[deconv::kBias], kWriteInplace); } CHECK_NE(req[deconv::kData], kWriteInplace); + GetTempSize(ctx); Tensor workspace = AllocateTempWorkspace(ctx, backward_workspace_byte_); size_t workspace_size = TensorSizeBytes(workspace); for (uint32_t g = 0; g < param_.num_group; ++g) { @@ -348,13 +353,6 @@ class CuDNNDeconvolutionOp : public Operator { size_t expected = param_.no_bias ? 
2 : 3; CHECK_EQ(in_shape.size(), expected); CHECK_EQ(out_shape.size(), 1U); - CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&bias_desc_)); - CUDNN_CALL(cudnnCreateFilterDescriptor(&filter_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&forward_conv_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_)); - CUDNN_CALL(cudnnCreateConvolutionDescriptor(&back_conv_desc_w_)); TShape dshape = in_shape[deconv::kData]; TShape wshape = in_shape[deconv::kWeight]; @@ -536,7 +534,6 @@ class CuDNNDeconvolutionOp : public Operator { &bias_shape[0], &bias_stride[0])); } - init_cudnn_ = true; } void SelectAlgo(const Context& ctx, @@ -789,7 +786,6 @@ class CuDNNDeconvolutionOp : public Operator { } void GetTempSize(const OpContext& ctx) { - if (init_temp_size_) return; mshadow::Stream *s = ctx.get_stream(); size_t back_data_algo_workspace_size = 0; size_t back_filter_algo_workspace_size = 0; @@ -819,7 +815,6 @@ class CuDNNDeconvolutionOp : public Operator { forward_workspace_byte_ = back_data_algo_workspace_size; backward_workspace_byte_ = std::max(forward_algo_workspace_size, back_filter_algo_workspace_size); - init_temp_size_ = true; } int *CastTShapeToIntPtr(const TShape& s, std::vector *buffer) { @@ -882,8 +877,11 @@ class CuDNNDeconvolutionOp : public Operator { std::vector param_stride_; std::vector param_dilate_; - bool init_cudnn_; - bool init_temp_size_; + int forward_compute_type_; + int backward_compute_type_; + const std::vector in_shapes_; + const std::vector out_shapes_; + // Temp workspace size in bytes needed for Forward() operation. Note that // in deconvolution, this is handled by the cuDNN backprop-to-data kernel. size_t forward_workspace_byte_; diff --git a/src/operator/nn/cudnn/cudnn_pooling-inl.h b/src/operator/nn/cudnn/cudnn_pooling-inl.h index 104ed8546dca..8442b37058d4 100644 --- a/src/operator/nn/cudnn/cudnn_pooling-inl.h +++ b/src/operator/nn/cudnn/cudnn_pooling-inl.h @@ -34,13 +34,18 @@ namespace mxnet { namespace op { template -class CuDNNPoolingOp : public Operator { +class CuDNNPoolingOp { public: - explicit CuDNNPoolingOp(PoolingParam p) { - param_ = p; - init_cudnn_ = false; + CuDNNPoolingOp() { // TODO(xxx): fp16 dtype_ = mshadow::DataType::kCudnnFlag; + CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); + CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); + } + + void Init(const PoolingParam &p) { + param_ = p; switch (param_.pool_type) { case pool_enum::kMaxPooling: mode_ = CUDNN_POOLING_MAX; @@ -54,33 +59,24 @@ class CuDNNPoolingOp : public Operator { } ~CuDNNPoolingOp() { - if (init_cudnn_) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); - CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); - CUDNN_CALL(cudnnDestroyPoolingDescriptor(pooling_desc_)); - } + CUDNN_CALL(cudnnDestroyTensorDescriptor(in_desc_)); + CUDNN_CALL(cudnnDestroyTensorDescriptor(out_desc_)); + CUDNN_CALL(cudnnDestroyPoolingDescriptor(pooling_desc_)); } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + void Forward(const OpContext &ctx, const TBlob &in_data, + const OpReqType &req, const TBlob &out_data) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); CHECK_EQ(s->dnn_handle_ownership_, 
mshadow::Stream::OwnHandle); typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; + this->Init(s, in_data, out_data); if (param_.kernel.ndim() == 2) { // 2d pool - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); - if (!init_cudnn_) { - this->Init(s, in_data, out_data); - } + Tensor data = in_data.get(s); + Tensor out = out_data.get(s); CHECK_EQ(data.CheckContiguous(), true); CHECK_EQ(out.CheckContiguous(), true); CUDNN_CALL(cudnnPoolingForward(s->dnn_handle_, @@ -93,11 +89,8 @@ class CuDNNPoolingOp : public Operator { out.dptr_)); } else if (param_.kernel.ndim() == 3) { // 3d pool - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); - if (!init_cudnn_) { - this->Init(s, in_data, out_data); - } + Tensor data = in_data.get(s); + Tensor out = out_data.get(s); CHECK_EQ(data.CheckContiguous(), true); CHECK_EQ(out.CheckContiguous(), true); CUDNN_CALL(cudnnPoolingForward(s->dnn_handle_, @@ -113,31 +106,23 @@ class CuDNNPoolingOp : public Operator { } } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + void Backward(const OpContext &ctx, const TBlob &out_grad, + const TBlob &in_data, const TBlob &out_data, + const OpReqType &req, const TBlob &in_grad) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - CHECK_EQ(req.size(), 1U); - CHECK_EQ(in_grad.size(), 1U); Stream *s = ctx.get_stream(); CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); typename DataType::ScaleType alpha = 1.0f; typename DataType::ScaleType beta = 0.0f; + this->Init(s, in_data, out_data); if (param_.kernel.ndim() == 2) { // 2d pool - Tensor m_out_grad = out_grad[pool_enum::kOut].get(s); - Tensor m_in_data = in_data[pool_enum::kData].get(s); - Tensor m_out_data = out_data[pool_enum::kOut].get(s); - Tensor m_in_grad = in_grad[pool_enum::kData].get(s); + Tensor m_out_grad = out_grad.get(s); + Tensor m_in_data = in_data.get(s); + Tensor m_out_data = out_data.get(s); + Tensor m_in_grad = in_grad.get(s); CUDNN_CALL(cudnnPoolingBackward(s->dnn_handle_, pooling_desc_, &alpha, @@ -152,10 +137,10 @@ class CuDNNPoolingOp : public Operator { m_in_grad.dptr_)); } else if (param_.kernel.ndim() == 3) { // 3d pool - Tensor m_out_grad = out_grad[pool_enum::kOut].get(s); - Tensor m_in_data = in_data[pool_enum::kData].get(s); - Tensor m_out_data = out_data[pool_enum::kOut].get(s); - Tensor m_in_grad = in_grad[pool_enum::kData].get(s); + Tensor m_out_grad = out_grad.get(s); + Tensor m_in_data = in_data.get(s); + Tensor m_out_data = out_data.get(s); + Tensor m_in_grad = in_grad.get(s); CUDNN_CALL(cudnnPoolingBackward(s->dnn_handle_, pooling_desc_, &alpha, @@ -174,129 +159,115 @@ class CuDNNPoolingOp : public Operator { } private: - inline void Init(mshadow::Stream *s, - const std::vector &in_data, - const std::vector &out_data) { + inline void Init(mshadow::Stream *s, const TBlob &in_data, + const TBlob &out_data) { using namespace mshadow; #if CUDNN_MAJOR >= 5 nan_prop_ = CUDNN_NOT_PROPAGATE_NAN; #endif - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - if (!init_cudnn_) { - init_cudnn_ = true; - if (param_.kernel.ndim() == 2) { - // 2d conv - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = 
out_data[pool_enum::kOut].get(s); - mshadow::Shape<4> dshape = data.shape_; - CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); - CUDNN_CALL(cudnnSetTensor4dDescriptor(in_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - data.shape_[0], - data.shape_[1], - data.shape_[2], - data.shape_[3])); - CUDNN_CALL(cudnnSetTensor4dDescriptor(out_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - out.shape_[0], - out.shape_[1], - out.shape_[2], - out.shape_[3])); - #if CUDNN_MAJOR >= 5 - CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_, - mode_, - nan_prop_, - param_.global_pool ? dshape[2] : param_.kernel[0], - param_.global_pool ? dshape[3] : param_.kernel[1], - param_.pad[0], - param_.pad[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 1 :param_.stride[1])); - #else - CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_, - mode_, - param_.global_pool ? dshape[2] : param_.kernel[0], - param_.global_pool ? dshape[3] : param_.kernel[1], - param_.pad[0], - param_.pad[1], - param_.global_pool ? 1 : param_.stride[0], - param_.global_pool ? 1 : param_.stride[1])); - #endif - } else { - Tensor data = in_data[pool_enum::kData].get(s); - Tensor out = out_data[pool_enum::kOut].get(s); - CUDNN_CALL(cudnnCreatePoolingDescriptor(&pooling_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&in_desc_)); - CUDNN_CALL(cudnnCreateTensorDescriptor(&out_desc_)); - std::vector ishape = {static_cast(data.shape_[0]), - static_cast(data.shape_[1]), - static_cast(data.shape_[2]), - static_cast(data.shape_[3]), - static_cast(data.shape_[4])}; + if (param_.kernel.ndim() == 2) { + // 2d conv + Tensor data = in_data.get(s); + Tensor out = out_data.get(s); + mshadow::Shape<4> dshape = data.shape_; + CUDNN_CALL(cudnnSetTensor4dDescriptor(in_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + data.shape_[0], + data.shape_[1], + data.shape_[2], + data.shape_[3])); + CUDNN_CALL(cudnnSetTensor4dDescriptor(out_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + out.shape_[0], + out.shape_[1], + out.shape_[2], + out.shape_[3])); + #if CUDNN_MAJOR >= 5 + CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_, + mode_, + nan_prop_, + param_.global_pool ? dshape[2] : param_.kernel[0], + param_.global_pool ? dshape[3] : param_.kernel[1], + param_.pad[0], + param_.pad[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 1 :param_.stride[1])); + #else + CUDNN_CALL(cudnnSetPooling2dDescriptor(pooling_desc_, + mode_, + param_.global_pool ? dshape[2] : param_.kernel[0], + param_.global_pool ? dshape[3] : param_.kernel[1], + param_.pad[0], + param_.pad[1], + param_.global_pool ? 1 : param_.stride[0], + param_.global_pool ? 
1 : param_.stride[1])); + #endif + } else { + Tensor data = in_data.get(s); + Tensor out = out_data.get(s); + std::vector ishape = {static_cast(data.shape_[0]), + static_cast(data.shape_[1]), + static_cast(data.shape_[2]), + static_cast(data.shape_[3]), + static_cast(data.shape_[4])}; - std::vector istride = {static_cast(ishape[1] * ishape[2] * ishape[3] * ishape[4]), - static_cast(ishape[2] * ishape[3] * ishape[4]), - static_cast(ishape[3] * ishape[4]), - static_cast(ishape[4]), - 1}; + std::vector istride = {static_cast(ishape[1] * ishape[2] * ishape[3] * ishape[4]), + static_cast(ishape[2] * ishape[3] * ishape[4]), + static_cast(ishape[3] * ishape[4]), + static_cast(ishape[4]), 1}; - std::vector oshape = {static_cast(out.shape_[0]), - static_cast(out.shape_[1]), - static_cast(out.shape_[2]), - static_cast(out.shape_[3]), - static_cast(out.shape_[4])}; + std::vector oshape = {static_cast(out.shape_[0]), + static_cast(out.shape_[1]), + static_cast(out.shape_[2]), + static_cast(out.shape_[3]), + static_cast(out.shape_[4])}; - std::vector ostride = {static_cast(oshape[1] * oshape[2] * oshape[3] * oshape[4]), - static_cast(oshape[2] * oshape[3] * oshape[4]), - static_cast(oshape[3] * oshape[4]), - static_cast(oshape[4]), - 1}; + std::vector ostride = {static_cast(oshape[1] * oshape[2] * oshape[3] * oshape[4]), + static_cast(oshape[2] * oshape[3] * oshape[4]), + static_cast(oshape[3] * oshape[4]), + static_cast(oshape[4]), 1}; - std::vector kernel_vec = {param_.global_pool ? ishape[2] : - static_cast(param_.kernel[0]), - param_.global_pool ? ishape[3] : - static_cast(param_.kernel[1]), - param_.global_pool ? ishape[4] : - static_cast(param_.kernel[2])}; + std::vector kernel_vec = {param_.global_pool ? ishape[2] : + static_cast(param_.kernel[0]), + param_.global_pool ? ishape[3] : + static_cast(param_.kernel[1]), + param_.global_pool ? ishape[4] : + static_cast(param_.kernel[2])}; - std::vector pad_vec = {param_.global_pool ? 0 : static_cast(param_.pad[0]), - param_.global_pool ? 0 : static_cast(param_.pad[1]), - param_.global_pool ? 0 : static_cast(param_.pad[2])}; + std::vector pad_vec = {param_.global_pool ? 0 : static_cast(param_.pad[0]), + param_.global_pool ? 0 : static_cast(param_.pad[1]), + param_.global_pool ? 0 : static_cast(param_.pad[2])}; - std::vector stride_vec = {param_.global_pool ? 1 : static_cast(param_.stride[0]), - param_.global_pool ? 1 : static_cast(param_.stride[1]), - param_.global_pool ? 1 : static_cast(param_.stride[2])}; + std::vector stride_vec = {param_.global_pool ? 1 : static_cast(param_.stride[0]), + param_.global_pool ? 1 : static_cast(param_.stride[1]), + param_.global_pool ? 
1 : static_cast(param_.stride[2])}; - CUDNN_CALL(cudnnSetTensorNdDescriptor(in_desc_, - dtype_, - static_cast(ishape.size()), - &ishape[0], - &istride[0])); - CUDNN_CALL(cudnnSetTensorNdDescriptor(out_desc_, - dtype_, - static_cast(oshape.size()), - &oshape[0], - &ostride[0])); - #if CUDNN_MAJOR >= 5 - CUDNN_CALL(cudnnSetPoolingNdDescriptor(pooling_desc_, - mode_, - nan_prop_, - static_cast(kernel_vec.size()), - &(kernel_vec[0]), - &(pad_vec[0]), - &(stride_vec[0]))); - #else - LOG(FATAL) << "3D pooling only support CUDNN v5 and abouve"; - #endif - } + CUDNN_CALL(cudnnSetTensorNdDescriptor(in_desc_, + dtype_, + static_cast(ishape.size()), + &ishape[0], + &istride[0])); + CUDNN_CALL(cudnnSetTensorNdDescriptor(out_desc_, + dtype_, + static_cast(oshape.size()), + &oshape[0], + &ostride[0])); + #if CUDNN_MAJOR >= 5 + CUDNN_CALL(cudnnSetPoolingNdDescriptor(pooling_desc_, + mode_, + nan_prop_, + static_cast(kernel_vec.size()), + &(kernel_vec[0]), + &(pad_vec[0]), + &(stride_vec[0]))); + #else + LOG(FATAL) << "3D pooling only support CUDNN v5 and abouve"; + #endif } } - bool init_cudnn_; + cudnnDataType_t dtype_; cudnnHandle_t handle_; cudnnPoolingMode_t mode_; diff --git a/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h b/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h index 5afdb4844364..239da023668d 100644 --- a/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h +++ b/src/operator/nn/cudnn/cudnn_softmax_activation-inl.h @@ -32,73 +32,64 @@ namespace mxnet { namespace op { -class CuDNNSoftmaxActivationOp : public Operator { +class CuDNNSoftmaxActivationOp { public: - explicit CuDNNSoftmaxActivationOp(SoftmaxActivationParam param) { - this->param_ = param; - init_cudnn_ = false; + CuDNNSoftmaxActivationOp() { dtype_ = CUDNN_DATA_FLOAT; + CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_)); + } + + void Init(SoftmaxActivationParam param) { + this->param_ = param; } ~CuDNNSoftmaxActivationOp() { - if (init_cudnn_) { - CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_)); - } + CUDNN_CALL(cudnnDestroyTensorDescriptor(shape_desc_)); } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + void Forward(const OpContext &ctx, const TBlob &in_data, + const OpReqType &req, const TBlob &out_data) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); Stream *s = ctx.get_stream(); Tensor data; Tensor out; cudnnSoftmaxMode_t softmax_mode; if (param_.mode == softmax_activation::kInstance) { - CHECK_EQ(in_data[softmax_activation::kData].ndim(), 2) + CHECK_EQ(in_data.ndim(), 2) << "Input need to have 2 dimensions when mode=instance."; - Shape<4> dshape = Shape4(in_data[softmax_activation::kData].shape_[0], - in_data[softmax_activation::kData].shape_[1], 1, 1); - data = in_data[softmax_activation::kData].get_with_shape(dshape, s); - out = out_data[softmax_activation::kOut].get_with_shape(dshape, s); + Shape<4> dshape = Shape4(in_data.shape_[0], in_data.shape_[1], 1, 1); + data = in_data.get_with_shape(dshape, s); + out = out_data.get_with_shape(dshape, s); softmax_mode = CUDNN_SOFTMAX_MODE_INSTANCE; } else { - CHECK_GE(in_data[softmax_activation::kData].ndim(), 3) + CHECK_GE(in_data.ndim(), 3) << "Input need to have a least 3 dimensions when mode=channel"; Shape<4> dshape; - index_t size_left = in_data[softmax_activation::kData].Size(); + index_t size_left = in_data.Size(); for (int i = 0; i < 3; ++i) { - if (i < 
in_data[softmax_activation::kData].ndim()) { - dshape[i] = in_data[softmax_activation::kData].shape_[i]; + if (i < in_data.ndim()) { + dshape[i] = in_data.shape_[i]; } else { dshape[i] = 1; } size_left /= dshape[i]; } dshape[3] = size_left; - data = in_data[softmax_activation::kData].get_with_shape(dshape, s); - out = out_data[softmax_activation::kOut].get_with_shape(dshape, s); + data = in_data.get_with_shape(dshape, s); + out = out_data.get_with_shape(dshape, s); softmax_mode = CUDNN_SOFTMAX_MODE_CHANNEL; } float alpha = 1.0f; float beta = 0.0f; CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); - if (!init_cudnn_) { - init_cudnn_ = true; - CUDNN_CALL(cudnnCreateTensorDescriptor(&shape_desc_)); - CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_, - CUDNN_TENSOR_NCHW, - dtype_, - data.shape_[0], - data.shape_[1], - data.shape_[2], - data.shape_[3])); - } + CUDNN_CALL(cudnnSetTensor4dDescriptor(shape_desc_, + CUDNN_TENSOR_NCHW, + dtype_, + data.shape_[0], + data.shape_[1], + data.shape_[2], + data.shape_[3])); CUDNN_CALL(cudnnSoftmaxForward(s->dnn_handle_, CUDNN_SOFTMAX_ACCURATE, softmax_mode, @@ -110,19 +101,10 @@ class CuDNNSoftmaxActivationOp : public Operator { out.dptr_)); } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + void Backward(const OpContext &ctx, const TBlob &out_grad, + const TBlob &out_data, const OpReqType &req, const TBlob &in_grad) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - CHECK_EQ(req.size(), 1U); - CHECK_EQ(in_grad.size(), 1U); float alpha = 1.0f; float beta = 0.0f; Stream *s = ctx.get_stream(); @@ -132,31 +114,30 @@ class CuDNNSoftmaxActivationOp : public Operator { Tensor input_grad; cudnnSoftmaxMode_t softmax_mode; if (param_.mode == softmax_activation::kInstance) { - CHECK_EQ(in_grad[softmax_activation::kData].ndim(), 2) + CHECK_EQ(in_grad.ndim(), 2) << "Input need to have 2 dimensions when mode=instance."; - Shape<4> dshape = Shape4(in_grad[softmax_activation::kData].shape_[0], - in_grad[softmax_activation::kData].shape_[1], 1, 1); - grad = out_grad[softmax_activation::kOut].get_with_shape(dshape, s); - output_data = out_data[softmax_activation::kOut].get_with_shape(dshape, s); - input_grad = in_grad[softmax_activation::kData].get_with_shape(dshape, s); + Shape<4> dshape = Shape4(in_grad.shape_[0], in_grad.shape_[1], 1, 1); + grad = out_grad.get_with_shape(dshape, s); + output_data = out_data.get_with_shape(dshape, s); + input_grad = in_grad.get_with_shape(dshape, s); softmax_mode = CUDNN_SOFTMAX_MODE_INSTANCE; } else { - CHECK_GE(in_grad[softmax_activation::kData].ndim(), 3) + CHECK_GE(in_grad.ndim(), 3) << "Input need to have a least 3 dimensions when mode=channel"; Shape<4> dshape; - index_t size_left = in_grad[softmax_activation::kData].Size(); + index_t size_left = in_grad.Size(); for (int i = 0; i < 3; ++i) { - if (i < in_grad[softmax_activation::kData].ndim()) { - dshape[i] = in_grad[softmax_activation::kData].shape_[i]; + if (i < in_grad.ndim()) { + dshape[i] = in_grad.shape_[i]; } else { dshape[i] = 1; } size_left /= dshape[i]; } dshape[3] = size_left; - output_data = out_data[softmax_activation::kOut].get_with_shape(dshape, s); - grad = out_grad[softmax_activation::kOut].get_with_shape(dshape, s); - input_grad = in_grad[softmax_activation::kData].get_with_shape(dshape, s); + 
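(Editorial sketch.) For `mode=channel` the softmax-activation code above collapses an arbitrary-rank input into a 4-D shape, keeping the first three axes and folding everything left over into the last one, so a single `cudnnSetTensor4dDescriptor` call can describe it. A small stand-alone sketch of that collapsing, with hypothetical names:

```cpp
#include <array>
#include <cstddef>
#include <vector>

// Collapse an N-d shape to 4-D: e.g. (2, 3, 4, 5, 6) -> (2, 3, 4, 30).
std::array<std::size_t, 4> CollapseTo4D(const std::vector<std::size_t>& shape) {
  std::array<std::size_t, 4> out = {1, 1, 1, 1};
  std::size_t size_left = 1;
  for (std::size_t d : shape) size_left *= d;  // total element count
  for (int i = 0; i < 3; ++i) {
    if (i < static_cast<int>(shape.size())) out[i] = shape[i];
    size_left /= out[i];
  }
  out[3] = size_left;  // whatever remains goes into the last axis
  return out;
}
```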
output_data = out_data.get_with_shape(dshape, s); + grad = out_grad.get_with_shape(dshape, s); + input_grad = in_grad.get_with_shape(dshape, s); softmax_mode = CUDNN_SOFTMAX_MODE_CHANNEL; } CHECK_EQ(s->dnn_handle_ownership_, mshadow::Stream::OwnHandle); @@ -174,7 +155,6 @@ class CuDNNSoftmaxActivationOp : public Operator { } private: - bool init_cudnn_; cudnnDataType_t dtype_; cudnnTensorDescriptor_t shape_desc_; SoftmaxActivationParam param_; diff --git a/src/operator/nn/deconvolution-inl.h b/src/operator/nn/deconvolution-inl.h index fbdfaa84faab..b6d522b9e6f9 100644 --- a/src/operator/nn/deconvolution-inl.h +++ b/src/operator/nn/deconvolution-inl.h @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file deconvolution-inl.h * \brief - * \author Wei Wu + * \author Wei Wu, Da Zheng */ #ifndef MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_ #define MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_ @@ -195,19 +195,18 @@ namespace mxnet { namespace op { template -class DeconvolutionOp : public Operator { +class DeconvolutionOp { public: - explicit DeconvolutionOp(DeconvolutionParam p) { + void Init(DeconvolutionParam p) { this->param_ = p; // convert MBytes first to Bytes and then to elements. param_.workspace = (param_.workspace << 20) / sizeof(real_t); } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { + void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; @@ -322,19 +321,18 @@ class DeconvolutionOp : public Operator { } } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { + void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &req, + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; // TODO(bing): check the BLAS Handle, be careful CHECK_EQ(out_grad.size(), 1U); size_t expected = param_.no_bias == 0 ? 3 : 2; - CHECK(in_data.size() == expected && in_grad.size() == expected); + CHECK_EQ(in_data.size(), expected); + CHECK_EQ(in_grad.size(), expected); CHECK_EQ(req.size(), expected); CHECK_EQ(in_data[deconv::kWeight].CheckContiguous(), true); // get data @@ -489,300 +487,52 @@ class DeconvolutionOp : public Operator { }; // class DeconvolutionOp template -Operator* CreateOp(DeconvolutionParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx); - -#if DMLC_USE_CXX11 -class DeconvolutionProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - if (!param_.no_bias) { - return {"data", "weight", "bias"}; - } else { - return {"data", "weight"}; - } - } - - void Init(const std::vector >& kwargs) override { - using namespace mshadow; - param_.Init(kwargs); - if (param_.kernel.ndim() == 1) { - param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW; - if (param_.stride.ndim() == 0) param_.stride = Shape1(1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1); - if (param_.pad.ndim() == 0) param_.pad = Shape1(0); - if (param_.adj.ndim() == 0) param_.adj = Shape1(0); - } else if (param_.kernel.ndim() == 2) { - param_.layout = param_.layout ? 
param_.layout.value() : mshadow::kNCHW; - if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); - if (param_.adj.ndim() == 0) param_.adj = Shape2(0, 0); - } else { - CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D deconvolution not supported"; - param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW; - if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); - if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); - if (param_.adj.ndim() == 0) param_.adj = Shape3(0, 0, 0); - } - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { -#if MXNET_USE_CUDNN == 0 - if (param_.kernel.ndim() > 2) { - LOG(FATAL) << "If not using CUDNN, only 1D or 2D Deconvolution is supported"; - return false; - } -#endif // CUDNN - - using namespace mshadow; - if (!param_.no_bias) { - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; - } else { - CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; - } - out_shape->resize(1, TShape()); - const TShape &dshape = (*in_shape)[deconv::kData]; - if (dshape.ndim() == 0) return false; - - if (param_.kernel.ndim() == 1) { - // 1d conv - CHECK_EQ(dshape.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x"; - Shape<3> dshape_ncw = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW); - Shape<3> wshape = Shape3(dshape_ncw[1], param_.num_filter / param_.num_group, - param_.kernel[0]); - wshape = ConvertLayout(wshape, kNCW, param_.layout.value()); - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter)); - } - - const index_t dilated_ksize_x = param_.DilatedKernelSize(0); - - index_t o_pad[1]; - index_t o_adj[1]; - param_.InferPad(dshape_ncw, o_pad, o_adj); - - CHECK_EQ(dshape_ncw[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - - CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(x) must be samller than stride[0]"; - - Shape<3> oshape; - oshape[0] = dshape_ncw[0]; - oshape[1] = param_.num_filter; - oshape[2] = param_.stride[0] * (dshape_ncw[2] - 1) + - dilated_ksize_x - 2 * o_pad[0] + o_adj[0]; - - if (param_.target_shape[0] > 0) { - CHECK_EQ(param_.target_shape[0], oshape[2]) \ - << "param_.target_shape[0] was not reasonable, please set it carefully"; - } - - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value())); - - return true; - } else if (param_.kernel.ndim() == 2) { - // 2d conv - CHECK_EQ(dshape.ndim(), 4U) \ - << "Input data should be 4D in batch-num_filter-y-x"; - Shape<4> dshape_nchw = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW); - Shape<4> wshape = Shape4(dshape_nchw[1], - param_.num_filter / param_.num_group, - param_.kernel[0], param_.kernel[1]); - wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); - 
SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter)); - } - - const index_t dilated_ksize_y = param_.DilatedKernelSize(0); - const index_t dilated_ksize_x = param_.DilatedKernelSize(1); - - index_t o_pad[2]; - index_t o_adj[2]; - param_.InferPad(dshape_nchw, o_pad, o_adj); - - CHECK_EQ(dshape_nchw[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - - CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(y) must be samller than stride[0]"; - CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(x) must be samller than stride[1]"; - - Shape<4> oshape; - oshape[0] = dshape_nchw[0]; - oshape[1] = param_.num_filter; - oshape[2] = param_.stride[0] * (dshape_nchw[2] - 1) + - dilated_ksize_y - 2 * o_pad[0] + o_adj[0]; - oshape[3] = param_.stride[1] * (dshape_nchw[3] - 1) + - dilated_ksize_x - 2 * o_pad[1] + o_adj[1]; - - if (param_.target_shape[0] > 0) { - CHECK_EQ(param_.target_shape[0], oshape[2]) \ - << "param_.target_shape[0] was not reasonable, please set it carefully"; - } - if (param_.target_shape[1] > 0) { - CHECK_EQ(param_.target_shape[1], oshape[3]) \ - << "param_.target_shape[1] was not reasonable, please set it carefully"; - } - - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); - - return true; - } else if (param_.kernel.ndim() == 3) { - // 3d conv - CHECK_EQ(dshape.ndim(), 5U) \ - << "Input data should be 5D in batch-num_filter-depth-y-x"; - Shape<5> dshape_ncdhw = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW); - Shape<5> wshape = Shape5(dshape_ncdhw[1], param_.num_filter / param_.num_group, - param_.kernel[0], param_.kernel[1], param_.kernel[2]); - wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value()); - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter)); - } - - // Note: 3D dilation currently not supported. - // Calculations below done to preserve symmetry with 1D/2D code. 
- const index_t dilated_ksize_d = param_.DilatedKernelSize(0); - const index_t dilated_ksize_y = param_.DilatedKernelSize(1); - const index_t dilated_ksize_x = param_.DilatedKernelSize(2); - - index_t o_pad[3]; - index_t o_adj[3]; - param_.InferPad(dshape_ncdhw, o_pad, o_adj); - - CHECK_EQ(dshape_ncdhw[1] % param_.num_group, 0U) \ - << "input num_filter must divide group size"; - CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ - << "output num_filter must divide group size"; - CHECK_GT(param_.kernel.Size(), 0U) \ - << "incorrect kernel size: " << param_.kernel; - CHECK_GT(param_.stride.Size(), 0U) \ - << "incorrect stride size: " << param_.stride; - CHECK_GT(param_.dilate.Size(), 0U) \ - << "incorrect dilate size: " << param_.dilate; - CHECK_EQ(param_.dilate.Size(), 1U) - << "Dilate is not supported in 3d deconvolution"; - - CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(d) must be samller than stride[0]"; - CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(y) must be samller than stride[1]"; - CHECK_GE(param_.stride[2]-1, o_adj[2]) << "adj(x) must be samller than stride[2]"; - - Shape<5> oshape; - oshape[0] = dshape_ncdhw[0]; - oshape[1] = param_.num_filter; - oshape[2] = param_.stride[0] * (dshape_ncdhw[2] - 1) + - dilated_ksize_d - 2 * o_pad[0] + o_adj[0]; - oshape[3] = param_.stride[1] * (dshape_ncdhw[3] - 1) + - dilated_ksize_y - 2 * o_pad[1] + o_adj[1]; - oshape[4] = param_.stride[2] * (dshape_ncdhw[4] - 1) + - dilated_ksize_x - 2 * o_pad[2] + o_adj[2]; - - if (param_.target_shape[0] > 0) { - CHECK_EQ(param_.target_shape[0], oshape[2]) \ - << "param_.target_shape[0] was not reasonable, please it carefully"; - } - if (param_.target_shape[1] > 0) { - CHECK_EQ(param_.target_shape[1], oshape[3]) \ - << "param_.target_shape[1] was not reasonable, please set it carefully"; - } - if (param_.target_shape[2] > 0) { - CHECK_EQ(param_.target_shape[2], oshape[4]) \ - << "param_.target_shape[2] was not reasonable, please set it carefully"; - } - - SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value())); - - return true; - } else { - LOG(FATAL) << "Unknown convolution type"; - return false; - } - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; - } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); - } - } - out_type->clear(); - out_type->push_back(dtype); - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new DeconvolutionProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "Deconvolution"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[deconv::kOut], in_data[deconv::kData], in_data[deconv::kWeight]}; - } +void _DeconvolutionCompute(const DeconvolutionParam& param, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, { + DeconvolutionOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); + }); +} - std::vector ForwardResource( - const std::vector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } 
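(Editorial sketch.) Both the removed `DeconvolutionProp::InferShape` and the new `DeconvolutionShape` below compute the transposed-convolution output size per axis as `stride * (in - 1) + dilated_kernel - 2 * pad + adj`, where `dilated_kernel = dilate * (kernel - 1) + 1`. A tiny worked example of that formula:

```cpp
#include <cstdio>

// Output length of a transposed convolution along one axis, matching the
// oshape[i] = stride*(in-1) + dilated_ksize - 2*pad + adj expressions above.
int DeconvOutSize(int in, int kernel, int stride, int dilate, int pad, int adj) {
  int dilated_kernel = dilate * (kernel - 1) + 1;
  return stride * (in - 1) + dilated_kernel - 2 * pad + adj;
}

int main() {
  // in=5, kernel=3, stride=2, dilate=1, pad=1, adj=1  ->  2*4 + 3 - 2 + 1 = 10
  std::printf("%d\n", DeconvOutSize(5, 3, 2, 1, 1, 1));
  return 0;
}
```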
+template +void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + _DeconvolutionCompute(param, ctx, inputs, req, outputs); +} - std::vector BackwardResource( - const std::vector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } +template +void _DeconvolutionGradCompute(const DeconvolutionParam& param, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + std::vector in_data(inputs.begin() + 1, inputs.end()); + const TBlob &out_grad = inputs[0]; + const std::vector &in_grad = outputs; + + MSHADOW_REAL_TYPE_SWITCH(out_grad.type_flag_, DType, { + DeconvolutionOp op; + op.Init(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + }); +} - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented"; - return NULL; - } - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; +template +void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + _DeconvolutionGradCompute(param, ctx, inputs, req, outputs); +} - private: - DeconvolutionParam param_; -}; // class DeconvolutionProp -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_NN_DECONVOLUTION_INL_H_ diff --git a/src/operator/nn/deconvolution.cc b/src/operator/nn/deconvolution.cc index 9d3c040c1d63..a3fc915eb0fe 100644 --- a/src/operator/nn/deconvolution.cc +++ b/src/operator/nn/deconvolution.cc @@ -21,45 +21,408 @@ * Copyright (c) 2015 by Contributors * \file deconvolution.cc * \brief - * \author Wei Wu + * \author Wei Wu, Da Zheng */ #include "./deconvolution-inl.h" +#include "./mkldnn/mkldnn_ops-inl.h" +#include "./mkldnn/mkldnn_base-inl.h" namespace mxnet { namespace op { -template<> -Operator* CreateOp(DeconvolutionParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx) { - Operator *op = NULL; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new DeconvolutionOp(param); - }); - return op; + +static bool DeconvolutionShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, + std::vector *out_shape) { + const DeconvolutionParam& param_ = nnvm::get(attrs.parsed); +#if MXNET_USE_CUDNN == 0 + if (param_.kernel.ndim() > 2) { + LOG(FATAL) << "If not using CUDNN, only 1D or 2D Deconvolution is supported"; + return false; + } +#endif // CUDNN + + using namespace mshadow; + if (!param_.no_bias) { + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; + } else { + CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; + } + out_shape->resize(1, TShape()); + const TShape &dshape = (*in_shape)[deconv::kData]; + if (dshape.ndim() == 0) return false; + + if (param_.kernel.ndim() == 1) { + // 1d conv + CHECK_EQ(dshape.ndim(), 3U) << "Input data should be 3D in batch-num_filter-x"; + Shape<3> dshape_ncw = ConvertLayout(dshape.get<3>(), param_.layout.value(), kNCW); + Shape<3> wshape = Shape3(dshape_ncw[1], param_.num_filter / param_.num_group, + param_.kernel[0]); + wshape = ConvertLayout(wshape, kNCW, param_.layout.value()); + SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, 
Shape1(param_.num_filter)); + } + + const index_t dilated_ksize_x = param_.DilatedKernelSize(0); + + index_t o_pad[1]; + index_t o_adj[1]; + param_.InferPad(dshape_ncw, o_pad, o_adj); + + CHECK_EQ(dshape_ncw[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + + CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(x) must be samller than stride[0]"; + + Shape<3> oshape; + oshape[0] = dshape_ncw[0]; + oshape[1] = param_.num_filter; + oshape[2] = param_.stride[0] * (dshape_ncw[2] - 1) + + dilated_ksize_x - 2 * o_pad[0] + o_adj[0]; + + if (param_.target_shape.ndim() > 0) { + if (param_.target_shape[0] > 0) { + CHECK_EQ(param_.target_shape[0], oshape[2]) \ + << "param_.target_shape[0] was not reasonable, please set it carefully"; + } + } + + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCW, param_.layout.value())); + + return true; + } else if (param_.kernel.ndim() == 2) { + // 2d conv + CHECK_EQ(dshape.ndim(), 4U) \ + << "Input data should be 4D in batch-num_filter-y-x"; + Shape<4> dshape_nchw = ConvertLayout(dshape.get<4>(), param_.layout.value(), kNCHW); + Shape<4> wshape = Shape4(dshape_nchw[1], + param_.num_filter / param_.num_group, + param_.kernel[0], param_.kernel[1]); + wshape = ConvertLayout(wshape, kNCHW, param_.layout.value()); + SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter)); + } + + const index_t dilated_ksize_y = param_.DilatedKernelSize(0); + const index_t dilated_ksize_x = param_.DilatedKernelSize(1); + + index_t o_pad[2]; + index_t o_adj[2]; + param_.InferPad(dshape_nchw, o_pad, o_adj); + + CHECK_EQ(dshape_nchw[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + + CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(y) must be samller than stride[0]"; + CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(x) must be samller than stride[1]"; + + Shape<4> oshape; + oshape[0] = dshape_nchw[0]; + oshape[1] = param_.num_filter; + oshape[2] = param_.stride[0] * (dshape_nchw[2] - 1) + + dilated_ksize_y - 2 * o_pad[0] + o_adj[0]; + oshape[3] = param_.stride[1] * (dshape_nchw[3] - 1) + + dilated_ksize_x - 2 * o_pad[1] + o_adj[1]; + + if (param_.target_shape.ndim() > 1) { + if (param_.target_shape[0] > 0) { + CHECK_EQ(param_.target_shape[0], oshape[2]) \ + << "param_.target_shape[0] was not reasonable, please set it carefully"; + } + if (param_.target_shape[1] > 0) { + CHECK_EQ(param_.target_shape[1], oshape[3]) \ + << "param_.target_shape[1] was not reasonable, please set it carefully"; + } + } + + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value())); + + return true; + } else if (param_.kernel.ndim() == 3) { + // 3d conv + CHECK_EQ(dshape.ndim(), 5U) \ + << "Input data should 
be 5D in batch-num_filter-depth-y-x"; + Shape<5> dshape_ncdhw = ConvertLayout(dshape.get<5>(), param_.layout.value(), kNCDHW); + Shape<5> wshape = Shape5(dshape_ncdhw[1], param_.num_filter / param_.num_group, + param_.kernel[0], param_.kernel[1], param_.kernel[2]); + wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value()); + SHAPE_ASSIGN_CHECK(*in_shape, deconv::kWeight, wshape); + if (!param_.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, deconv::kBias, Shape1(param_.num_filter)); + } + + // Note: 3D dilation currently not supported. + // Calculations below done to preserve symmetry with 1D/2D code. + const index_t dilated_ksize_d = param_.DilatedKernelSize(0); + const index_t dilated_ksize_y = param_.DilatedKernelSize(1); + const index_t dilated_ksize_x = param_.DilatedKernelSize(2); + + index_t o_pad[3]; + index_t o_adj[3]; + param_.InferPad(dshape_ncdhw, o_pad, o_adj); + + CHECK_EQ(dshape_ncdhw[1] % param_.num_group, 0U) \ + << "input num_filter must divide group size"; + CHECK_EQ(param_.num_filter % param_.num_group, 0U) \ + << "output num_filter must divide group size"; + CHECK_GT(param_.kernel.Size(), 0U) \ + << "incorrect kernel size: " << param_.kernel; + CHECK_GT(param_.stride.Size(), 0U) \ + << "incorrect stride size: " << param_.stride; + CHECK_GT(param_.dilate.Size(), 0U) \ + << "incorrect dilate size: " << param_.dilate; + CHECK_EQ(param_.dilate.Size(), 1U) + << "Dilate is not supported in 3d deconvolution"; + + CHECK_GE(param_.stride[0]-1, o_adj[0]) << "adj(d) must be samller than stride[0]"; + CHECK_GE(param_.stride[1]-1, o_adj[1]) << "adj(y) must be samller than stride[1]"; + CHECK_GE(param_.stride[2]-1, o_adj[2]) << "adj(x) must be samller than stride[2]"; + + Shape<5> oshape; + oshape[0] = dshape_ncdhw[0]; + oshape[1] = param_.num_filter; + oshape[2] = param_.stride[0] * (dshape_ncdhw[2] - 1) + + dilated_ksize_d - 2 * o_pad[0] + o_adj[0]; + oshape[3] = param_.stride[1] * (dshape_ncdhw[3] - 1) + + dilated_ksize_y - 2 * o_pad[1] + o_adj[1]; + oshape[4] = param_.stride[2] * (dshape_ncdhw[4] - 1) + + dilated_ksize_x - 2 * o_pad[2] + o_adj[2]; + + if (param_.target_shape.ndim() > 2) { + if (param_.target_shape[0] > 0) { + CHECK_EQ(param_.target_shape[0], oshape[2]) \ + << "param_.target_shape[0] was not reasonable, please it carefully"; + } + if (param_.target_shape[1] > 0) { + CHECK_EQ(param_.target_shape[1], oshape[3]) \ + << "param_.target_shape[1] was not reasonable, please set it carefully"; + } + if (param_.target_shape[2] > 0) { + CHECK_EQ(param_.target_shape[2], oshape[4]) \ + << "param_.target_shape[2] was not reasonable, please set it carefully"; + } + } + + SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value())); + + return true; + } else { + LOG(FATAL) << "Unknown convolution type"; + return false; + } +} + +static inline std::vector ListArguments(const DeconvolutionParam& param_) { + if (!param_.no_bias) { + return {"data", "weight", "bias"}; + } else { + return {"data", "weight"}; + } +} + +static bool DeconvolutionType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, std::vector *out_type) { + const DeconvolutionParam& param_ = nnvm::get(attrs.parsed); + CHECK_GE(in_type->size(), 1U); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param_)[i]); + } + } + out_type->clear(); + out_type->push_back(dtype); + 
return true; +} + +inline static bool DeconvStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + uint32_t in_expected = param.no_bias ? 2 : 3; + CHECK_EQ(in_attrs->size(), in_expected); + CHECK_EQ(out_attrs->size(), 1); + + DispatchMode wanted_mode; +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) + wanted_mode = DispatchMode::kFComputeEx; + else +#endif + wanted_mode = DispatchMode::kFCompute; + return storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, wanted_mode); +} + +inline static bool BackwardDeconvStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + uint32_t out_expected = param.no_bias ? 2 : 3; + CHECK_EQ(in_attrs->size(), param.no_bias ? 3U : 4U); + CHECK_EQ(out_attrs->size(), out_expected); + + DispatchMode wanted_mode; +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) + wanted_mode = DispatchMode::kFComputeEx; + else +#endif + wanted_mode = DispatchMode::kFCompute; + return storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, wanted_mode); +} + +#if MXNET_USE_MKLDNN == 1 +static void DeconvolutionComputeExCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + if (SupportMKLDNNConv(inputs[0])) { + MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + MKLDNNDeconvolutionForward(attrs, ctx, inputs, req, outputs); + MKLDNN_OPCHECK_RUN(DeconvolutionCompute, attrs, ctx, inputs, req, + outputs); + return; + } + FallBackCompute(DeconvolutionCompute, attrs, ctx, inputs, req, + outputs); } -Operator* DeconvolutionProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - std::vector out_shape, aux_shape; - CHECK(InferShape(in_shape, &out_shape, &aux_shape)); - DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0), in_shape, &out_shape, ctx); +static void DeconvolutionGradComputeExCPU(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + if (SupportMKLDNNConv(inputs[0])) { + MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); + MKLDNNDeconvolutionBackward(attrs, ctx, inputs, req, outputs); + MKLDNN_OPCHECK_RUN(DeconvolutionGradCompute, attrs, ctx, inputs, req, + outputs); + return; + } + FallBackCompute(DeconvolutionGradCompute, attrs, ctx, inputs, req, + outputs); } +#endif + +static void DeconvolutionParamParser(nnvm::NodeAttrs* attrs) { + using namespace mshadow; + DeconvolutionParam param_; + param_.Init(attrs->dict); + if (param_.kernel.ndim() == 1) { + param_.layout = param_.layout? param_.layout.value() : mshadow::kNCW; + if (param_.stride.ndim() == 0) param_.stride = Shape1(1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape1(1); + if (param_.pad.ndim() == 0) param_.pad = Shape1(0); + if (param_.adj.ndim() == 0) param_.adj = Shape1(0); + } else if (param_.kernel.ndim() == 2) { + param_.layout = param_.layout ? 
param_.layout.value() : mshadow::kNCHW; + if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); + if (param_.adj.ndim() == 0) param_.adj = Shape2(0, 0); + } else { + CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D deconvolution not supported"; + param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW; + if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); + if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1); + if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); + if (param_.adj.ndim() == 0) param_.adj = Shape3(0, 0, 0); + } + attrs->parsed = std::move(param_); +} + +struct DeconvolutionGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + std::vector heads(ograds.begin(), ograds.end()); + heads.push_back(n->inputs[deconv::kData]); + heads.push_back(n->inputs[deconv::kWeight]); + const DeconvolutionParam& param = nnvm::get(n->attrs.parsed); + if (!param.no_bias) + heads.push_back(n->inputs[deconv::kBias]); + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; DMLC_REGISTER_PARAMETER(DeconvolutionParam); -MXNET_REGISTER_OP_PROPERTY(Deconvolution, DeconvolutionProp) -.add_argument("data", "NDArray-or-Symbol", "Input tensor to the deconvolution operation.") -.add_argument("weight", "NDArray-or-Symbol", "Weights representing the kernel.") -.add_argument("bias", "NDArray-or-Symbol", "Bias added to the result after the deconvolution " - "operation.") -.add_arguments(DeconvolutionParam::__FIELDS__()) +NNVM_REGISTER_OP(Deconvolution) .describe("Computes 1D or 2D transposed convolution (aka fractionally strided convolution) of the " "input tensor. This operation can be seen as the gradient of Convolution operation with " "respect to its input. Convolution usually reduces the size of the input. Transposed " "convolution works the other way, going from a smaller input to a larger output while " - "preserving the connectivity pattern."); + "preserving the connectivity pattern.") +.set_num_inputs([](const NodeAttrs& attrs) { + const DeconvolutionParam& params = nnvm::get(attrs.parsed); + return params.no_bias ? 2 : 3; +}) +.set_num_outputs(1) +.set_attr_parser(DeconvolutionParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return ListArguments(nnvm::get(attrs.parsed)); +}) +.set_attr("FInferShape", DeconvolutionShape) +.set_attr("FInferType", DeconvolutionType) +.set_attr("FInferStorageType", DeconvStorageType) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +.set_attr("FCompute", DeconvolutionCompute) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FComputeEx", DeconvolutionComputeExCPU) +#endif +.set_attr("FGradient", DeconvolutionGrad{"_backward_Deconvolution"}) +.add_argument("data", "NDArray-or-Symbol", "Input tensor to the deconvolution operation.") +.add_argument("weight", "NDArray-or-Symbol", "Weights representing the kernel.") +.add_argument("bias", "NDArray-or-Symbol", "Bias added to the result after the deconvolution " + "operation.") +.add_arguments(DeconvolutionParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_Deconvolution) +.set_num_outputs([](const NodeAttrs& attrs) { + const DeconvolutionParam& params = nnvm::get(attrs.parsed); + return params.no_bias ? 
2 : 3; +}) +.set_attr("TIsBackward", true) +.set_attr("FInferStorageType", BackwardDeconvStorageType) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +.set_attr_parser(DeconvolutionParamParser) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FComputeEx", DeconvolutionGradComputeExCPU) +#endif +.set_attr("FCompute", DeconvolutionGradCompute); } // namespace op } // namespace mxnet diff --git a/src/operator/nn/deconvolution.cu b/src/operator/nn/deconvolution.cu index 623770170d50..c7395428c2a0 100644 --- a/src/operator/nn/deconvolution.cu +++ b/src/operator/nn/deconvolution.cu @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file deconvolution.cu * \brief - * \author Wei Wu + * \author Wei Wu, Da Zheng */ #include "./deconvolution-inl.h" @@ -31,13 +31,29 @@ namespace mxnet { namespace op { + +#if MXNET_USE_CUDNN == 1 +template +static CuDNNDeconvolutionOp &GetCuDNNDeconvOp(const DeconvolutionParam& param, + int forward_compute_type, + int backward_compute_type, + const std::vector& in_shape, + const std::vector& out_shape, + const Context& ctx) { + static thread_local CuDNNDeconvolutionOp op; + op.Init(param, forward_compute_type, backward_compute_type, in_shape, out_shape, ctx); + return op; +} +#endif + template<> -Operator* CreateOp(DeconvolutionParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx) { - // Logic here parallels that in Convolution.cu - Operator *op = NULL; +void DeconvolutionCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + int dtype = inputs[0].type_flag_; #if MXNET_USE_CUDNN == 1 // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16). 
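GetCuDNNDeconvOp above shows the caching pattern this refactor uses for stateful operators: instead of heap-allocating an Operator per executor, a function-local static thread_local instance is reused and re-configured on each call. A condensed sketch of the idiom (illustrative only; SomeOp and SomeParam are placeholders, not real MXNet types):

    template <typename DType>
    SomeOp<DType> &GetCachedOp(const SomeParam &param) {
      static thread_local SomeOp<DType> op;  // one instance per thread, built lazily
      op.Init(param);                        // cheap re-initialization on every call
      return op;                             // reference stays valid until the next call
    }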
@@ -45,23 +61,88 @@ Operator* CreateOp(DeconvolutionParam param, int dtype, MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { if (param.cudnn_off) { - op = new DeconvolutionOp(param); - } else if (!CuDNNDeconvolutionOp::Supports(param, compute_type, compute_type, ctx)) { + DeconvolutionOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); + } else if (!CuDNNDeconvolutionOp::Supports(param, + compute_type, compute_type, ctx.run_ctx.ctx)) { LOG(WARNING) << "This deconvolution is not supported by cudnn, MXNET deconvolution is applied."; - op = new DeconvolutionOp(param); + DeconvolutionOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); } else { - op = new CuDNNDeconvolutionOp(param, compute_type, compute_type, - *in_shape, *out_shape, ctx); + std::vector in_shape(inputs.size()); + std::vector out_shape(1, outputs[0].shape_); + for (size_t i = 0; i < in_shape.size(); i++) { + in_shape[i] = inputs[i].shape_; + } + GetCuDNNDeconvOp(param, compute_type, compute_type, + in_shape, out_shape, ctx.run_ctx.ctx).Forward(ctx, inputs, req, outputs); } }) #else MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new DeconvolutionOp(param); + DeconvolutionOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); }) #endif // MXNET_USE_CUDNN - return op; } +template<> +void DeconvolutionGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + std::vector in_data(inputs.begin() + 1, inputs.end()); + const TBlob &out_grad = inputs[0]; + const std::vector &in_grad = outputs; + int dtype = out_grad.type_flag_; + +#if MXNET_USE_CUDNN == 1 + // On fp16-I/O instances, use fp32 compute (i.e. pseudo-fp16). + int compute_type = (dtype == mshadow::kFloat16) ? 
mshadow::kFloat32 : dtype; + + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + if (param.cudnn_off) { + DeconvolutionOp op; + op.Init(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + } else if (!CuDNNDeconvolutionOp::Supports(param, + compute_type, compute_type, ctx.run_ctx.ctx)) { + LOG(WARNING) << + "This deconvolution is not supported by cudnn, MXNET deconvolution is applied."; + DeconvolutionOp op; + op.Init(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + } else { + std::vector in_shape(in_data.size()); + std::vector out_shape(1, out_grad.shape_); + for (size_t i = 0; i < in_shape.size(); i++) { + in_shape[i] = in_data[i].shape_; + } + GetCuDNNDeconvOp(param, compute_type, compute_type, + in_shape, out_shape, ctx.run_ctx.ctx).Backward(ctx, + std::vector{out_grad}, in_data, req, in_grad); + } + }) +#else + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + DeconvolutionOp op; + op.Init(param); + op.Backward(ctx, std::vector{out_grad}, in_data, req, in_grad); + }) +#endif // MXNET_USE_CUDNN +} + +NNVM_REGISTER_OP(Deconvolution) +.set_attr("FCompute", DeconvolutionCompute); + +NNVM_REGISTER_OP(_backward_Deconvolution) +.set_attr("FCompute", DeconvolutionGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/depthwise_convolution-inl.h b/src/operator/nn/depthwise_convolution-inl.h index c4b7a4787554..0af8cae51c84 100644 --- a/src/operator/nn/depthwise_convolution-inl.h +++ b/src/operator/nn/depthwise_convolution-inl.h @@ -39,11 +39,11 @@ namespace mxnet { namespace op { using namespace tf::depthwise_conv; template -class DepthwiseConvolutionOp : public Operator { +class DepthwiseConvolutionOp { public: - explicit DepthwiseConvolutionOp(const ConvolutionParam& param, - const std::vector& in_shape, - const std::vector& out_shape) { + void Init(const ConvolutionParam& param, + const std::vector& in_shape, + const std::vector& out_shape) { args_.batch = in_shape[conv::kData][0]; args_.in_channel = in_shape[conv::kData][1]; args_.in_height = in_shape[conv::kData][2]; @@ -62,19 +62,16 @@ class DepthwiseConvolutionOp : public Operator { ~DepthwiseConvolutionOp() {} - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args); + void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data); - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args); + void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &req, + const std::vector &in_grad); private: DepthwiseArgs args_; @@ -282,8 +279,7 @@ template void DepthwiseConvolutionOp::Forward(const OpContext &ctx, const std::vector &in_data, const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { + const std::vector &out_data) { using namespace mshadow; using namespace mshadow::expr; auto stream = ctx.get_stream(); @@ -305,10 +301,8 @@ template void DepthwiseConvolutionOp::Backward(const OpContext &ctx, const std::vector &out_grad, const std::vector &in_data, - const std::vector &out_data, const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; auto 
stream = ctx.get_stream(); diff --git a/src/operator/nn/depthwise_convolution_tf.cuh b/src/operator/nn/depthwise_convolution_tf.cuh index c7f48e686136..e4dfd8292d2d 100644 --- a/src/operator/nn/depthwise_convolution_tf.cuh +++ b/src/operator/nn/depthwise_convolution_tf.cuh @@ -24,8 +24,8 @@ * are different with origin version. * \author shuqian.qu@hobot.cc */ -#ifndef MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_ -#define MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_ +#ifndef MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_ +#define MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_ #include "../../common/cuda_utils.h" #include "../mxnet_op.h" @@ -730,4 +730,4 @@ bool TryLaunchDepthwiseConv2dBackwardFilterGPUSmall(mshadow::Stream } // namespace depthwise_conv } // namespace tf -#endif // MXNET_OPERATOR_DEPTHWISE_CONVOLUTION_TF_CUH_ +#endif // MXNET_OPERATOR_NN_DEPTHWISE_CONVOLUTION_TF_CUH_ diff --git a/src/operator/nn/dropout-inl.h b/src/operator/nn/dropout-inl.h index 715a6f4ee219..cff35a3cef7f 100644 --- a/src/operator/nn/dropout-inl.h +++ b/src/operator/nn/dropout-inl.h @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file dropout-inl.h * \brief - * \author Bing Xu + * \author Bing Xu, Da Zheng */ #ifndef MXNET_OPERATOR_NN_DROPOUT_INL_H_ @@ -71,7 +71,7 @@ struct DropoutParam : public dmlc::Parameter { }; // struct DropoutParam template -class DropoutOp : public Operator { +class DropoutOp { #if defined(USE_MKL) && defined(_OPENMP) static void BernoulliGenerate(common::random::RandGenerator gen, int n, double p, int* r) { @@ -206,16 +206,15 @@ class DropoutOp : public Operator { } }; - explicit DropoutOp(DropoutParam param) { + void Init(const DropoutParam ¶m) { this->pkeep_ = 1.0f - param.p; this->mode_ = static_cast(param.mode); } - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_states) { + void Forward(const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { if (req[dropout::kOut] != kNullOp) { CHECK_EQ(in_data.size(), 1U); if (ctx.is_train) { @@ -249,17 +248,13 @@ class DropoutOp : public Operator { } } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_states) { + void Backward(const OpContext &ctx, + const std::vector &out_grad, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad) { using namespace mshadow; using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(in_grad.size(), 1U); Stream *s = ctx.get_stream(); if (ctx.is_train || mode_ == dropout::kAlways) { if (!MKLBackward(s, this->pkeep_, in_grad, out_data, out_grad)) { @@ -293,110 +288,42 @@ class DropoutOp : public Operator { dropout::DropoutOpMode mode_; }; // class DropoutOp - template -Operator *CreateOp(DropoutParam param, int dtype); - -#if DMLC_USE_CXX11 -class DropoutProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - CHECK_EQ(in_shape->size(), 1U); - const TShape &dshape = in_shape->at(0); - if (dshape.ndim() == 0) return false; - out_shape->clear(); - 
out_shape->push_back(dshape); - out_shape->push_back(dshape); - return true; - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_EQ(in_type->size(), 1U); - int dtype = in_type->at(0); - - if (dtype == -1) { - LOG(FATAL) << "input type to dropout is not specified."; - return false; - } - - size_t nout = this->ListOutputs().size(); - out_type->clear(); - for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype); - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new DropoutProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "Dropout"; - } +void DropoutCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const DropoutParam& param = nnvm::get(attrs.parsed); + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + static thread_local DropoutOp op; + op.Init(param); + op.Forward(ctx, inputs, req, outputs); + }); +} - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[dropout::kOut], out_data[dropout::kMask]}; - } - - std::vector > BackwardInplaceOption( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &in_grad) const override { - return {{out_grad[dropout::kOut], in_grad[dropout::kData]}}; - } - - std::vector > ForwardInplaceOption( - const std::vector &in_data, - const std::vector &out_data) const override { - return {{in_data[dropout::kData], out_data[dropout::kOut]}}; - } - - std::vector ForwardResource(const std::vector &in_shape) const override { - return { ResourceRequest::kParallelRandom }; - } - - int NumVisibleOutputs() const override { - return 1; - } - - int NumOutputs() const override { - return 2; - } - - std::vector ListOutputs() const override { - return {"output", "mask"}; - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented"; - return NULL; - } - - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; +template +void DropoutGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const DropoutParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1); + CHECK_EQ(req.size(), 1); + std::vector out_grads(2); + std::vector out_data(2); + out_grads[dropout::kOut] = inputs[0]; + out_data[dropout::kMask] = inputs[1]; + + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + static thread_local DropoutOp op; + op.Init(param); + op.Backward(ctx, out_grads, out_data, req, outputs); + }); +} - private: - DropoutParam param_; -}; // class DropoutProp -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_NN_DROPOUT_INL_H_ diff --git a/src/operator/nn/dropout.cc b/src/operator/nn/dropout.cc index 3aa832a71356..dd5f1e58fbe5 100644 --- a/src/operator/nn/dropout.cc +++ b/src/operator/nn/dropout.cc @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file dropout.cc * \brief - * \author Bing Xu + * \author Bing Xu, Da Zheng */ #include "./dropout-inl.h" @@ -29,24 +29,21 @@ namespace mxnet { namespace op { -template<> -Operator *CreateOp(DropoutParam param, int dtype) { - Operator *op = NULL; - 
MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new DropoutOp(param); - }); - return op; -} - -// DO_BIND_DISPATCH comes from operator_common.h -Operator *DropoutProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); -} + +struct DropoutGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + std::vector heads; + heads.push_back(ograds[0]); + heads.emplace_back(nnvm::NodeEntry{n, dropout::kMask, 0}); + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; DMLC_REGISTER_PARAMETER(DropoutParam); -MXNET_REGISTER_OP_PROPERTY(Dropout, DropoutProp) +NNVM_REGISTER_OP(Dropout) .describe(R"(Applies dropout operation to input array. - During training, each element of the input is set to zero with probability p. @@ -77,8 +74,66 @@ Example:: [[ 3. 0.5 -0.5 2. 7. ] [ 2. -0.4 7. 3. 0.2 ]] )" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(2) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data"}; +}) +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return std::vector{"output", "mask"}; +}) +.set_attr("FNumVisibleOutputs", + [](const NodeAttrs& attrs) { + return 1; +}) +.set_attr("FInferShape", [](const nnvm::NodeAttrs& attrs, + std::vector *in_shape, std::vector *out_shape){ + using namespace mshadow; + CHECK_EQ(in_shape->size(), 1U); + const TShape &dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + out_shape->clear(); + out_shape->push_back(dshape); + out_shape->push_back(dshape); + return true; +}) +.set_attr("FInferType", [](const nnvm::NodeAttrs& attrs, + std::vector *in_type, std::vector *out_type) { + CHECK_EQ(in_type->size(), 1U); + int dtype = in_type->at(0); + + if (dtype == -1) { + LOG(FATAL) << "input type to dropout is not specified."; + return false; + } + + size_t nout = 2; + out_type->clear(); + for (size_t i = 0; i < nout; ++i) out_type->push_back(dtype); + return true; +}) +.set_attr("FCompute", DropoutCompute) +.set_attr("FGradient", DropoutGrad{"_backward_Dropout"}) +.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}}; +}) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ ResourceRequest::kParallelRandom }; +}) .add_argument("data", "NDArray-or-Symbol", "Input array to which dropout will be applied.") .add_arguments(DropoutParam::__FIELDS__()); +NNVM_REGISTER_OP(_backward_Dropout) +.set_num_outputs(1) +.set_attr("TIsBackward", true) +.set_attr_parser(ParamParser) +.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}}; +}) +.set_attr("FCompute", DropoutGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/dropout.cu b/src/operator/nn/dropout.cu index f416c5883203..e655278822a4 100644 --- a/src/operator/nn/dropout.cu +++ b/src/operator/nn/dropout.cu @@ -21,21 +21,20 @@ * Copyright (c) 2015 by Contributors * \file dropout.cc * \brief - * \author Bing Xu + * \author Bing Xu, Da Zheng */ #include "./dropout-inl.h" namespace mxnet { namespace op { -template<> -Operator *CreateOp(DropoutParam param, int dtype) { - Operator *op = NULL; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new DropoutOp(param); - }); - return op; -} + +NNVM_REGISTER_OP(Dropout) +.set_attr("FCompute", DropoutCompute); + +NNVM_REGISTER_OP(_backward_Dropout) +.set_attr("FCompute", DropoutGradCompute); + } // namespace op } // namespace 
mxnet diff --git a/src/operator/nn/fully_connected-inl.h b/src/operator/nn/fully_connected-inl.h index 9f3deec2449f..e8e95643e647 100644 --- a/src/operator/nn/fully_connected-inl.h +++ b/src/operator/nn/fully_connected-inl.h @@ -43,6 +43,7 @@ namespace op { // These enums are only visible within this header namespace fullc { enum FullyConnectedOpInputs {kData, kWeight, kBias}; +enum FullyConnectedOpResource {kTempSpace}; enum FullyConnectedOpOutputs {kOut}; } // fullc @@ -61,240 +62,160 @@ struct FullyConnectedParam : public dmlc::Parameter { } }; -/** - * \brief This is the implementation of fully connected operator. - * \tparam xpu The device that the op will be executed on. - */ template -class FullyConnectedOp : public Operator { - public: - explicit FullyConnectedOp(FullyConnectedParam p) { - this->param_ = p; - } - - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - if (req[fullc::kOut] == kNullOp) return; - CHECK_EQ(req[fullc::kOut], kWriteTo); - size_t expected = param_.no_bias ? 2 : 3; - CHECK_EQ(in_data.size(), expected); - CHECK_EQ(out_data.size(), 1U); - // TODO(bing): check the BLAS Handle, be careful - // maybe need blas handle from context - // TODO(bing): judge shape to remove flatten op - Stream *s = ctx.get_stream(); +void FCForward(const OpContext &ctx, const FullyConnectedParam ¶m, + const std::vector &in_data, const std::vector &req, + const std::vector &out_data) { + using namespace mshadow; + using namespace mshadow::expr; + if (req[fullc::kOut] == kNullOp) return; + CHECK_EQ(req[fullc::kOut], kWriteTo); + // TODO(bing): check the BLAS Handle, be careful + // maybe need blas handle from context + // TODO(bing): judge shape to remove flatten op + Stream *s = ctx.get_stream(); #if defined(__CUDACC__) - CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) - << "Must init CuBLAS handle in stream"; + CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) + << "Must init CuBLAS handle in stream"; #endif // __CUDACC__ - const TShape& ishape = in_data[fullc::kData].shape_; - const TShape& oshape = out_data[fullc::kOut].shape_; - - Tensor wmat = in_data[fullc::kWeight].get(s); - Tensor data, out; - if (!param_.flatten) { - data = in_data[fullc::kData].get_with_shape( - Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); - out = out_data[fullc::kOut].get_with_shape( - Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); - } else { - data = in_data[fullc::kData].get_with_shape( - Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); - out = out_data[fullc::kOut].get_with_shape( - Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s); - } - - // Legacy approach shown here for comparison: - // out = dot(data, wmat.T()); - linalg_gemm(data, wmat, out, false, true, s); - if (!param_.no_bias) { - Tensor bias = in_data[fullc::kBias].get(s); - out += repmat(bias, data.size(0)); - } + const TShape& ishape = in_data[fullc::kData].shape_; + const TShape& oshape = out_data[fullc::kOut].shape_; + + Tensor wmat = in_data[fullc::kWeight].get(s); + Tensor data, out; + if (!param.flatten) { + data = in_data[fullc::kData].get_with_shape( + Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); + out = out_data[fullc::kOut].get_with_shape( + Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); + } else { + data = 
in_data[fullc::kData].get_with_shape( + Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); + out = out_data[fullc::kOut].get_with_shape( + Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s); } - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - size_t expected = param_.no_bias ? 2 : 3; - CHECK(in_data.size() == expected && in_grad.size() == expected); - CHECK_EQ(req.size(), expected); - // TODO(bing): check the BLAS Handle, be careful - // maybe need blas handle from context - Stream *s = ctx.get_stream(); - const TShape& ishape = in_data[fullc::kData].shape_; - const TShape& oshape = out_grad[fullc::kOut].shape_; + // Legacy approach shown here for comparison: + // out = dot(data, wmat.T()); + linalg_gemm(data, wmat, out, false, true, s); + if (!param.no_bias) { + Tensor bias = in_data[fullc::kBias].get(s); + out += repmat(bias, data.size(0)); + } +} - Tensor wmat = in_data[fullc::kWeight].get(s); - Tensor data, grad, gdata; - if (!param_.flatten) { - data = in_data[fullc::kData].get_with_shape( - Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); - grad = out_grad[fullc::kOut].get_with_shape( - Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); - gdata = in_grad[fullc::kData].get_with_shape( - Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); - } else { - data = in_data[fullc::kData].get_with_shape( - Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); - grad = out_grad[fullc::kOut].get_with_shape( - Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s); - gdata = in_grad[fullc::kData].get_with_shape( - Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); - } +template +void FCBackward(const OpContext &ctx, const FullyConnectedParam ¶m, + const std::vector &out_grad, const std::vector &in_data, + const std::vector &req, const std::vector &in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + // TODO(bing): check the BLAS Handle, be careful + // maybe need blas handle from context + Stream *s = ctx.get_stream(); + const TShape& ishape = in_data[fullc::kData].shape_; + const TShape& oshape = out_grad[fullc::kOut].shape_; + + Tensor wmat = in_data[fullc::kWeight].get(s); + Tensor data, grad, gdata; + if (!param.flatten) { + data = in_data[fullc::kData].get_with_shape( + Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); + grad = out_grad[fullc::kOut].get_with_shape( + Shape2(oshape.ProdShape(0, oshape.ndim()-1), oshape[oshape.ndim()-1]), s); + gdata = in_grad[fullc::kData].get_with_shape( + Shape2(ishape.ProdShape(0, ishape.ndim()-1), ishape[ishape.ndim()-1]), s); + } else { + data = in_data[fullc::kData].get_with_shape( + Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); + grad = out_grad[fullc::kOut].get_with_shape( + Shape2(oshape[0], oshape.ProdShape(1, oshape.ndim())), s); + gdata = in_grad[fullc::kData].get_with_shape( + Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim())), s); + } #if defined(__CUDACC__) - CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) - << "Must init CuBLAS handle in stream"; + CHECK_EQ(s->blas_handle_ownership_, Stream::OwnHandle) + << "Must init CuBLAS handle in stream"; #endif - // backprop - CHECK_NE(req[fullc::kWeight], 
kWriteInplace) << "cannot write weight inplace"; - // gradient of weight - Tensor gwmat = in_grad[fullc::kWeight].get(s); - // Legacy approach shown here for comparison: - // out = Assign(gwmat, req[fullc::kWeight], dot(grad.T(), data)); - linalg_gemm(grad, data, gwmat, true, false, s, req[fullc::kWeight]); - // gradient of bias - if (!param_.no_bias) { - Tensor gbias = in_grad[fullc::kBias].get(s); - Assign(gbias, req[fullc::kBias], sum_rows(grad)); - } - // gradient of data - // Legacy approach shown here for comparison: - // Assign(gdata, req[fullc::kData], dot(grad, wmat)); - linalg_gemm(grad, wmat, gdata, false, false, s, req[fullc::kData]); + // backprop + CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace"; + // gradient of weight + Tensor gwmat = in_grad[fullc::kWeight].get(s); + // Legacy approach shown here for comparison: + // out = Assign(gwmat, req[fullc::kWeight], dot(grad.T(), data)); + linalg_gemm(grad, data, gwmat, true, false, s, req[fullc::kWeight]); + // gradient of bias + if (!param.no_bias) { + Tensor gbias = in_grad[fullc::kBias].get(s); + Assign(gbias, req[fullc::kBias], sum_rows(grad)); } + // gradient of data + // Legacy approach shown here for comparison: + // Assign(gdata, req[fullc::kData], dot(grad, wmat)); + linalg_gemm(grad, wmat, gdata, false, false, s, req[fullc::kData]); +} - private: - FullyConnectedParam param_; -}; // class FullyConnectedOp - -// Decalre Factory function, used for dispatch specialization template -Operator* CreateOp(FullyConnectedParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx); - -#if DMLC_USE_CXX11 -class FullyConnectedProp : public OperatorProperty { - public: - std::vector ListArguments() const override { - if (!param_.no_bias) { - return {"data", "weight", "bias"}; - } else { - return {"data", "weight"}; - } - } - - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); +void FullyConnectedCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + uint32_t in_expected = param.no_bias ? 
2 : 3; + CHECK_EQ(inputs.size(), in_expected); + CHECK_EQ(outputs.size(), 1U); + int dtype = inputs[0].type_flag_; + + switch (dtype) { + case mshadow::kFloat32: + FCForward(ctx, param, inputs, req, outputs); + break; + case mshadow::kFloat64: + FCForward(ctx, param, inputs, req, outputs); + break; + case mshadow::kFloat16: + LOG(FATAL) << "float16 fully connected layer is currently" + "only supported by CuDNN version."; + break; + default: + LOG(FATAL) << "Unsupported type " << dtype; } +} - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - if (!param_.no_bias) { - CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; - } else { - CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; - } - CHECK_EQ(out_shape->size(), 1U); - TShape dshape = (*in_shape)[fullc::kData]; - TShape oshape = (*out_shape)[0]; - // require data to be known - if (dshape.ndim() == 0) return false; - - index_t num_input; - if (!param_.flatten) { - num_input = dshape[dshape.ndim()-1]; - } else { - num_input = dshape.ProdShape(1, dshape.ndim()); - } - SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param_.num_hidden, num_input)); - if (!param_.no_bias) { - SHAPE_ASSIGN_CHECK(*in_shape, fullc::kBias, Shape1(param_.num_hidden)); - } - - if (!param_.flatten) { - TShape result_shape(dshape); - result_shape[dshape.ndim()-1] = param_.num_hidden; - SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape); - } else { - SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param_.num_hidden)); - } - if (oshape.ndim() != 0) { - dshape[0] = oshape[0]; - SHAPE_ASSIGN_CHECK(*in_shape, fullc::kData, dshape); - } - return true; - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - nnvm::NodeAttrs attrs; - attrs.name = "FullyConnected"; - return ElemwiseAttr( - attrs, in_type, out_type, -1); - } - - OperatorProperty* Copy() const override { - FullyConnectedProp* fc_sym = new FullyConnectedProp(); - fc_sym->param_ = this->param_; - return fc_sym; - } - - std::string TypeString() const override { - return "FullyConnected"; - } - - // decalre dependency and inplace optimization options - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[fullc::kOut], in_data[fullc::kData], in_data[fullc::kWeight]}; - } - - std::vector > BackwardInplaceOption( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &in_grad) const override { - return {{in_data[fullc::kData], in_grad[fullc::kData]}}; - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; +template +void FullyConnectedGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + uint32_t out_expected = param.no_bias ? 
2 : 3; + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), out_expected); + CHECK_EQ(req.size(), out_expected); + + std::vector out_grad{inputs[0]}; + std::vector in_data(inputs.begin() + 1, inputs.end()); + int dtype = inputs[0].type_flag_; + + switch (dtype) { + case mshadow::kFloat32: + FCBackward(ctx, param, out_grad, in_data, req, outputs); + break; + case mshadow::kFloat64: + FCBackward(ctx, param, out_grad, in_data, req, outputs); + break; + case mshadow::kFloat16: + LOG(FATAL) << "float16 fully connected layer is currently" + "only supported by CuDNN version."; + break; + default: + LOG(FATAL) << "Unsupported type " << dtype; } +} - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; - - private: - FullyConnectedParam param_; -}; // class FullyConnectedSymbol -#endif } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_NN_FULLY_CONNECTED_INL_H_ diff --git a/src/operator/nn/fully_connected.cc b/src/operator/nn/fully_connected.cc index 9a978160297d..4362408a23a1 100644 --- a/src/operator/nn/fully_connected.cc +++ b/src/operator/nn/fully_connected.cc @@ -23,58 +23,153 @@ * \brief fully connect operator */ #include "./fully_connected-inl.h" +#include "./mkldnn/mkldnn_ops-inl.h" +#include "./mkldnn/mkldnn_base-inl.h" #if MXNET_USE_NNPACK == 1 #include "./nnpack/nnpack_fully_connected-inl.h" #endif // MXNET_USE_NNPACK namespace mxnet { namespace op { -template<> -Operator* CreateOp(FullyConnectedParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx) { - Operator *op = NULL; -#if MXNET_USE_NNPACK == 1 - const size_t batch_size = (*in_shape)[0][0]; - // nnp_fully_connected_inference will do optimization for batch-size = 1 - // nnp_fully_connected_output will do optimization for batch-size > 1 - switch (dtype) { - case mshadow::kFloat32: - return new NNPACKFullyConnectedOp(param); - default: - break; + +static bool FullyConnectedShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, + std::vector *out_shape) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + using namespace mshadow; + if (!param.no_bias) { + CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]"; + } else { + CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; + } + CHECK_EQ(out_shape->size(), 1U); + TShape dshape = (*in_shape)[fullc::kData]; + TShape oshape = (*out_shape)[0]; + // require data to be known + if (dshape.ndim() == 0) return false; + + index_t num_input; + if (!param.flatten) { + num_input = dshape[dshape.ndim()-1]; + } else { + num_input = dshape.ProdShape(1, dshape.ndim()); + } + SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param.num_hidden, num_input)); + if (!param.no_bias) { + SHAPE_ASSIGN_CHECK(*in_shape, fullc::kBias, Shape1(param.num_hidden)); + } + + if (!param.flatten) { + TShape result_shape(dshape); + result_shape[dshape.ndim()-1] = param.num_hidden; + SHAPE_ASSIGN_CHECK(*out_shape, 0, result_shape); + } else { + SHAPE_ASSIGN_CHECK(*out_shape, 0, Shape2(dshape[0], param.num_hidden)); + } + if (oshape.ndim() != 0) { + dshape[0] = oshape[0]; + SHAPE_ASSIGN_CHECK(*in_shape, fullc::kData, dshape); } + return true; +} + +#if MXNET_USE_MKLDNN == 1 +void FullyConnectedComputeExCPU(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + if (SupportMKLDNN(inputs[0])) { + MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs); + MKLDNNFCForward(attrs, ctx, inputs, req, 
outputs); + MKLDNN_OPCHECK_RUN(FullyConnectedCompute, attrs, ctx, inputs, req, + outputs); + return; + } + FallBackCompute(FullyConnectedCompute, attrs, ctx, inputs, req, outputs); +} + +void FullyConnectedGradComputeExCPU(const nnvm::NodeAttrs& attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + if (SupportMKLDNN(inputs[0])) { + MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); + MKLDNNFCBackward(attrs, ctx, inputs, req, outputs); + MKLDNN_OPCHECK_RUN(FullyConnectedGradCompute, attrs, ctx, inputs, req, + outputs); + return; + } + FallBackCompute(FullyConnectedGradCompute, attrs, ctx, inputs, req, outputs); +} #endif - switch (dtype) { - case mshadow::kFloat32: - op = new FullyConnectedOp(param); - break; - case mshadow::kFloat64: - op = new FullyConnectedOp(param); - break; - case mshadow::kFloat16: - LOG(FATAL) << "float16 fully connected layer is currently" - "only supported by CuDNN version."; - break; - default: - LOG(FATAL) << "Unsupported type " << dtype; + +static bool FullyConnectedType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, std::vector *out_type) { + CHECK_GE(in_type->size(), 1U); + return ElemwiseAttr( + attrs, in_type, out_type, -1); +} + +struct FullyConnectedGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + std::vector heads(ograds.begin(), ograds.end()); + heads.push_back(n->inputs[fullc::kData]); + heads.push_back(n->inputs[fullc::kWeight]); + return MakeGradNode(op_name, n, heads, n->attrs.dict); } +}; + +inline static bool FCStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + uint32_t in_expected = param.no_bias ? 2 : 3; + CHECK_EQ(in_attrs->size(), in_expected); + CHECK_EQ(out_attrs->size(), 1); - return op; + DispatchMode wanted_mode; +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) + wanted_mode = DispatchMode::kFComputeEx; + else +#endif + wanted_mode = DispatchMode::kFCompute; + return storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, wanted_mode); } -// DO_BIND_DISPATCH comes from operator_common.h -Operator *FullyConnectedProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - std::vector out_shape(1, TShape()), aux_shape; - CHECK(InferShape(in_shape, &out_shape, &aux_shape)); - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx); +inline static bool BackwardFCStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + uint32_t out_expected = param.no_bias ? 2 : 3; + CHECK_EQ(in_attrs->size(), 3U); + CHECK_EQ(out_attrs->size(), out_expected); + + DispatchMode wanted_mode; +#if 0 + // TODO(zhengda) let's disable MKLDNN for FullyConnected for now. + // It seems there is a bug. 
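FCStorageType above, like the storage-type functions added for the other operators in this patch, follows one dispatch idiom: prefer the MKLDNN kFComputeEx path on CPU when MKLDNN is compiled in, otherwise fall back to the default dense kFCompute kernel. A condensed sketch of that idiom (illustrative only):

    DispatchMode wanted_mode = DispatchMode::kFCompute;
    #if MXNET_USE_MKLDNN == 1
    if (dev_mask == mshadow::cpu::kDevMask)
      wanted_mode = DispatchMode::kFComputeEx;   // take the MKLDNN FComputeEx path
    #endif
    return storage_type_assign(out_attrs, mxnet::kDefaultStorage,
                               dispatch_mode, wanted_mode);

The backward function here deliberately leaves wanted_mode at kFCompute: its MKLDNN branch is fenced off with #if 0 until the suspected bug mentioned above is resolved.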
+ if (dev_mask == mshadow::cpu::kDevMask) + *dispatch_mode = DispatchMode::kFComputeEx; + else +#endif + wanted_mode = DispatchMode::kFCompute; + return storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, wanted_mode); } DMLC_REGISTER_PARAMETER(FullyConnectedParam); -MXNET_REGISTER_OP_PROPERTY(FullyConnected, FullyConnectedProp) +NNVM_REGISTER_OP(FullyConnected) .describe(R"code(Applies a linear transformation: :math:`Y = XW^T + b`. If ``flatten`` is set to be true, then the shapes are: @@ -96,9 +191,59 @@ The learnable parameters include both ``weight`` and ``bias``. If ``no_bias`` is set to be true, then the ``bias`` term is ignored. )code" ADD_FILELINE) +.set_num_inputs([](const NodeAttrs& attrs) { + const FullyConnectedParam& params = nnvm::get(attrs.parsed); + return params.no_bias ? 2 : 3; +}) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferStorageType", FCStorageType) +.set_attr("FListInputNames", [](const NodeAttrs& attrs) { + const FullyConnectedParam& params = nnvm::get(attrs.parsed); + if (!params.no_bias) { + return std::vector{"data", "weight", "bias"}; + } else { + return std::vector{"data", "weight"}; + } +}) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif +.set_attr("FInferShape", FullyConnectedShape) +.set_attr("FInferType", FullyConnectedType) +.set_attr("FCompute", FullyConnectedCompute) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FComputeEx", FullyConnectedComputeExCPU) +#endif +.set_attr("FGradient", FullyConnectedGrad{"_backward_FullyConnected"}) .add_argument("data", "NDArray-or-Symbol", "Input data.") .add_argument("weight", "NDArray-or-Symbol", "Weight matrix.") .add_argument("bias", "NDArray-or-Symbol", "Bias parameter.") .add_arguments(FullyConnectedParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_FullyConnected) +.set_num_inputs(3) +.set_num_outputs([](const NodeAttrs& attrs) { + const FullyConnectedParam& params = nnvm::get(attrs.parsed); + return params.no_bias ? 2 : 3; +}) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif +.set_attr("TIsBackward", true) +.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ + return std::vector >{{1, 0}}; +}) +.set_attr("FInferStorageType", BackwardFCStorageType) +.set_attr_parser(ParamParser) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FComputeEx", FullyConnectedGradComputeExCPU) +#endif +.set_attr("FCompute", FullyConnectedGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/fully_connected.cu b/src/operator/nn/fully_connected.cu index 279a378e2ad4..c89d37767c4a 100644 --- a/src/operator/nn/fully_connected.cu +++ b/src/operator/nn/fully_connected.cu @@ -25,16 +25,50 @@ #include "./fully_connected-inl.h" namespace mxnet { namespace op { + template<> -Operator* CreateOp(FullyConnectedParam param, int dtype, - std::vector *in_shape, - std::vector *out_shape, - Context ctx) { - Operator *op = NULL; +void FullyConnectedCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + uint32_t in_expected = param.no_bias ? 
2 : 3; + CHECK_EQ(inputs.size(), in_expected); + CHECK_EQ(outputs.size(), 1U); + int dtype = inputs[0].type_flag_; + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new FullyConnectedOp(param); - }) - return op; + FCForward(ctx, param, inputs, req, outputs); + }); } + +template<> +void FullyConnectedGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + uint32_t out_expected = param.no_bias ? 2 : 3; + CHECK_EQ(inputs.size(), 3U); + CHECK_EQ(outputs.size(), out_expected); + CHECK_EQ(req.size(), out_expected); + + std::vector out_grad{inputs[0]}; + std::vector in_data(inputs.begin() + 1, inputs.end()); + int dtype = inputs[0].type_flag_; + + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + FCBackward(ctx, param, out_grad, in_data, req, outputs); + }); +} + +NNVM_REGISTER_OP(FullyConnected) +.set_attr("FCompute", FullyConnectedCompute); + +NNVM_REGISTER_OP(_backward_FullyConnected) +.set_attr("FCompute", FullyConnectedGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/lrn-inl.h b/src/operator/nn/lrn-inl.h new file mode 100644 index 000000000000..fdae1eca0aef --- /dev/null +++ b/src/operator/nn/lrn-inl.h @@ -0,0 +1,127 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * Copyright (c) 2015 by Contributors + * \file lrn-inl.h + * \brief + * \author Bing Xu +*/ +#ifndef MXNET_OPERATOR_NN_LRN_INL_H_ +#define MXNET_OPERATOR_NN_LRN_INL_H_ +#include +#include +#include +#include +#include +#include +#include +#include "../operator_common.h" +#include "../mshadow_op.h" + +namespace mxnet { +namespace op { + +namespace lrn_enum { +enum LRNInputs {kData}; +enum LRNOutputs {kOut, kTmpNorm}; +} // namespace lrn_enum + +struct LRNParam : public dmlc::Parameter { + float alpha; + float beta; + float knorm; + uint32_t nsize; + DMLC_DECLARE_PARAMETER(LRNParam) { + DMLC_DECLARE_FIELD(alpha).set_default(1e-4f) + .describe("The variance scaling parameter :math:`\alpha` in the LRN expression."); + DMLC_DECLARE_FIELD(beta).set_default(0.75f) + .describe("The power parameter :math:`\beta` in the LRN expression."); + DMLC_DECLARE_FIELD(knorm).set_default(2.0f) + .describe("The parameter :math:`k` in the LRN expression."); + DMLC_DECLARE_FIELD(nsize) + .describe("normalization window width in elements."); + } +}; // struct LRNParam + +template +void LRNForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + using namespace mshadow; + using namespace mshadow::expr; + const LRNParam& param_ = nnvm::get(attrs.parsed); + // TODO(xxx): Test with gradient chceker + CHECK_EQ(in_data.size(), 1U); + CHECK_EQ(out_data.size(), 2U); + // CHECK_EQ(req.size(), 2); + CHECK_EQ(param_.nsize % 2, 1U) << "LRN only supports odd values for local_size"; + const real_t salpha = param_.alpha / param_.nsize; + Stream *s = ctx.get_stream(); + Tensor data = in_data[lrn_enum::kData].get(s); + Tensor out = out_data[lrn_enum::kOut].get(s); + Tensor tmp_norm = out_data[lrn_enum::kTmpNorm].get(s); + tmp_norm = chpool(F(data) , param_.nsize) * salpha + param_.knorm; + Assign(out, req[lrn_enum::kOut], data * F(tmp_norm, -param_.beta)); +} + +template +void LRNBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const TBlob &out_grad, const TBlob &in_data, + const TBlob &out_norm, const OpReqType &req, + const TBlob &in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + const LRNParam& param_ = nnvm::get(attrs.parsed); + const real_t salpha = param_.alpha / param_.nsize; + Stream *s = ctx.get_stream(); + Tensor grad = out_grad.get(s); + Tensor tmp_norm = out_norm.get(s); + Tensor data = in_data.get(s); + Tensor grad_in = in_grad.get(s); + grad_in = grad * F(tmp_norm, -param_.beta); + grad_in += (- 2.0f * param_.beta * salpha) * + chpool(grad * data * + F(tmp_norm, -param_.beta - 1.0f), + param_.nsize) * data; +} + +template +void LRNCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + LRNForward(attrs, ctx, inputs, req, outputs); +} + +template +void LRNGradCompute(const nnvm::NodeAttrs& attrs, const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + LRNBackward(attrs, ctx, inputs[0], // out_grad + inputs[1], // in_data + inputs[2], // out_norm + req[lrn_enum::kData], outputs[lrn_enum::kData]); +} + +} // namespace op +} // namespace mxnet +#endif // MXNET_OPERATOR_NN_LRN_INL_H_ diff --git a/src/operator/nn/lrn.cc b/src/operator/nn/lrn.cc new file mode 100644 index 000000000000..2359b49abab6 --- /dev/null +++ b/src/operator/nn/lrn.cc @@ -0,0 +1,203 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor 
license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2015 by Contributors + * \file lrn.cc + * \brief + * \author Bing Xu, Patric Zhao (patric.zhao@intel.com) +*/ + +#include "./lrn-inl.h" +#include "../operator_common.h" +#if MXNET_USE_MKLDNN == 1 +#include "./mkldnn/mkldnn_lrn-inl.h" +#endif + +namespace mxnet { +namespace op { + +bool LRNShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, + std::vector *out_shape) { + using namespace mshadow; + CHECK_EQ(in_shape->size(), 1U) << "Input:[data]"; + const TShape &dshape = in_shape->at(0); + if (dshape.ndim() == 0) return false; + out_shape->clear(); + out_shape->push_back(dshape); + out_shape->push_back(dshape); + return true; +} + +inline std::vector ListArguments() { + return {"data"}; +} + +bool LRNType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, + std::vector *out_type) { + CHECK_GE(in_type->size(), 1U); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); + } + } + int n_out = 2; + out_type->clear(); + for (int i = 0; i < n_out; ++i ) out_type->push_back(dtype); + return true; +} + +struct LRNGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + std::vector heads; + heads.push_back(ograds[0]); // out_grad + heads.push_back(n->inputs[lrn_enum::kData]); + heads.emplace_back(nnvm::NodeEntry{n, lrn_enum::kTmpNorm, 0}); + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + +bool LRNForwardInferStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK(!in_attrs->empty()); +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, DispatchMode::kFComputeEx); + return true; + } +#endif + storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, DispatchMode::kFCompute); + return true; +} + +bool LRNBackwardInferStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK(!in_attrs->empty()); +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, DispatchMode::kFComputeEx); + return true; + } +#endif + storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, DispatchMode::kFCompute); + return true; +} + +#if MXNET_USE_MKLDNN == 1 +void LRNComputeExCPU(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector 
&req, + const std::vector &outputs) { + const LRNParam ¶m = nnvm::get(attrs.parsed); + if (SupportMKLDNN(inputs[0])) { + // We only need to test one output array. + MKLDNN_OPCHECK_INIT(false, 1, inputs, outputs); + MKLDNNLRNForward(ctx, param, inputs[0], req[0], outputs[0]); + MKLDNN_OPCHECK_RUN(LRNCompute, attrs, ctx, inputs, req, outputs); + return; + } + FallBackCompute(LRNCompute, attrs, ctx, inputs, req, outputs); +} + +void LRNGradComputeExCPU(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + const LRNParam ¶m = nnvm::get(attrs.parsed); + const NDArray &out_grad = inputs[0]; + const NDArray &in_data = inputs[1]; + const NDArray &in_grad = outputs[0]; + + if (SupportMKLDNN(inputs[0])) { + MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); + MKLDNNLRNBackward(ctx, param, out_grad, in_data, req[0], in_grad); + MKLDNN_OPCHECK_RUN(LRNGradCompute, attrs, ctx, inputs, req, outputs); + return; + } + FallBackCompute(LRNGradCompute, attrs, ctx, inputs, req, outputs); +} +#endif + +DMLC_REGISTER_PARAMETER(LRNParam); + +NNVM_REGISTER_OP(LRN) +.describe(R"code(Applies local response normalization to the input. + +The local response normalization layer performs "lateral inhibition" by normalizing +over local input regions. + +If :math:`a_{x,y}^{i}` is the activity of a neuron computed by applying kernel :math:`i` at position +:math:`(x, y)` and then applying the ReLU nonlinearity, the response-normalized +activity :math:`b_{x,y}^{i}` is given by the expression: + +.. math:: + b_{x,y}^{i} = \frac{a_{x,y}^{i}}{\Bigg({k + \alpha \sum_{j=max(0, i-\frac{n}{2})}^{min(N-1, i+\frac{n}{2})} (a_{x,y}^{j})^{2}}\Bigg)^{\beta}} + +where the sum runs over :math:`n` "adjacent" kernel maps at the same spatial position, and :math:`N` is the total +number of kernels in the layer. + +)code" ADD_FILELINE) +.set_num_inputs(1) +.set_num_outputs(2) +.set_attr("FNumVisibleOutputs", + [](const NodeAttrs& attrs) { return 1; }) +.set_attr_parser(ParamParser) +.set_attr("FInferShape", LRNShape) +.set_attr("FInferType", LRNType) +.set_attr("FInferStorageType", LRNForwardInferStorageType) +.set_attr("FCompute", LRNCompute) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FComputeEx", LRNComputeExCPU) +#endif +.set_attr("FGradient", LRNGrad{"_backward_LRN"}) +.add_argument("data", "NDArray-or-Symbol", "Input data to LRN") +.add_arguments(LRNParam::__FIELDS__()); + +NNVM_REGISTER_OP(_backward_LRN) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FInferStorageType", LRNBackwardInferStorageType) +.set_attr("TIsBackward", true) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FComputeEx", LRNGradComputeExCPU) +#endif +.set_attr("FCompute", LRNGradCompute); + +} // namespace op +} // namespace mxnet diff --git a/src/operator/lrn.cu b/src/operator/nn/lrn.cu similarity index 64% rename from src/operator/lrn.cu rename to src/operator/nn/lrn.cu index ba872f1d26d0..4c31ca96025c 100644 --- a/src/operator/lrn.cu +++ b/src/operator/nn/lrn.cu @@ -25,29 +25,15 @@ */ #include "./lrn-inl.h" -#if MXNET_USE_CUDNN == 1 -#include "./cudnn_lrn-inl.h" -#endif namespace mxnet { namespace op { -template<> -Operator* CreateOp(LRNParam param, int dtype) { - Operator *op = NULL; -#if MXNET_USE_CUDNN == 1 - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - op = new CuDNNLocalResponseNormOp(param); - }) -#else -#if CUDA_VERSION == 7000 - LOG(FATAL) << "Due to old CUDA compiler bug, LRN is disabled." 
- << "Please upgrade CUDA to 7.5+ or use CUDNN"; -#else - op = new LocalResponseNormOp(param); -#endif // CUDA_VERSION -#endif // MXNET_USE_CUDNN - return op; -} + +NNVM_REGISTER_OP(LRN) +.set_attr("FCompute", LRNCompute); + +NNVM_REGISTER_OP(_backward_LRN) +.set_attr("FCompute", LRNGradCompute); } // namespace op } // namespace mxnet diff --git a/src/operator/nn/mkldnn/mkldnn_act.cc b/src/operator/nn/mkldnn/mkldnn_act.cc new file mode 100644 index 000000000000..71fdf4ca585b --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_act.cc @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_act.cc + * \brief + * \author Da Zheng +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "../../operator_common.h" +#include "../activation-inl.h" +#include "./mkldnn_base-inl.h" + +#if MXNET_USE_MKLDNN == 1 + +#include + +namespace mxnet { +namespace op { + +bool SupportMKLDNNAct(const ActivationParam& param) { + // We only enable ReLU for now. It seems other activations have some precision + // problems. + return param.act_type == activation::kReLU; +#if 0 + || param.act_type == activation::kSigmoid + || param.act_type == activation::kSoftReLU; +#endif +} + +static inline mkldnn::algorithm GetMKLDNNActAlgo(const ActivationParam& param) { + switch (param.act_type) { + case activation::kReLU: + return mkldnn::algorithm::eltwise_relu; + case activation::kSigmoid: + return mkldnn::algorithm::eltwise_logistic; + case activation::kTanh: + return mkldnn::algorithm::eltwise_tanh; + case activation::kSoftReLU: + return mkldnn::algorithm::eltwise_soft_relu; + default: + LOG(FATAL) << "unknown activation type"; + return mkldnn::algorithm::eltwise_relu; + } +} + +typedef std::shared_ptr mkldnn_act_pdesc_ptr; + +static mkldnn::eltwise_forward::primitive_desc GetActFwdDescImpl( + const ActivationParam& param, bool is_train, + const mkldnn::memory &input_mem, int dtype) { + mkldnn::memory::primitive_desc data_mpd = input_mem.get_primitive_desc(); + mkldnn::memory::desc data_md = data_mpd.desc(); + auto cpu_engine = data_mpd.get_engine(); + + auto alg = GetMKLDNNActAlgo(param); + MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + DType alpha = 0; + mkldnn::eltwise_forward::desc desc = is_train + ? 
mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_training, + alg, data_md, alpha) + : mkldnn::eltwise_forward::desc(mkldnn::prop_kind::forward_scoring, + alg, data_md, alpha); + return mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine); + }); + LOG(INFO) << "Unsupported data type for MKLDNN activation"; + mkldnn::eltwise_forward::desc desc = mkldnn::eltwise_forward::desc( + mkldnn::prop_kind::forward_training, alg, data_md, 0.0); + return mkldnn::eltwise_forward::primitive_desc(desc, cpu_engine); +} + +typedef MKLDNNParamOpSign<ActivationParam> MKLDNNActSignature; + +class MKLDNNActForward { + std::shared_ptr<mkldnn::eltwise_forward> fwd; + std::shared_ptr<mkldnn::memory> data; + std::shared_ptr<mkldnn::memory> out; + + public: + const mkldnn::eltwise_forward::primitive_desc fwd_pd; + + MKLDNNActForward(const ActivationParam& param, bool is_train, + const NDArray &data, const mkldnn::memory &mem): fwd_pd( + GetActFwdDescImpl(param, is_train, mem, data.dtype())) { + } + + void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &output) { + if (this->data == nullptr) + this->data = std::shared_ptr<mkldnn::memory>(new mkldnn::memory( + data.get_primitive_desc(), data.get_data_handle())); + else + this->data->set_data_handle(data.get_data_handle()); + + CHECK(fwd_pd.dst_primitive_desc() == output.get_primitive_desc()); + if (this->out == nullptr) + this->out = std::shared_ptr<mkldnn::memory>(new mkldnn::memory( + fwd_pd.dst_primitive_desc(), output.get_data_handle())); + else + this->out->set_data_handle(output.get_data_handle()); + + if (this->fwd == nullptr) { + this->fwd = std::shared_ptr<mkldnn::eltwise_forward>( + new mkldnn::eltwise_forward(fwd_pd, mkldnn::primitive::at(*this->data), + *this->out)); + } + } + + const mkldnn::eltwise_forward &GetFwd() const { + return *fwd; + } +}; + +static MKLDNNActForward &GetActForward(const ActivationParam& param, + const OpContext &ctx, const NDArray &in_data, + const mkldnn::memory &in_mem) { + static thread_local std::unordered_map<MKLDNNActSignature, MKLDNNActForward, MKLDNNOpHash> fwds; + MKLDNNActSignature key(param); + key.AddSign(ctx.is_train); + key.AddSign(param.act_type); + key.AddSign(in_data); + + auto it = fwds.find(key); + if (it == fwds.end()) { + MKLDNNActForward fwd(param, ctx.is_train, in_data, in_mem); + auto ins_ret = fwds.insert(std::pair<MKLDNNActSignature, MKLDNNActForward>( + key, fwd)); + CHECK(ins_ret.second); + it = ins_ret.first; + } + return it->second; +} + +void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const NDArray &in_data, const OpReqType &req, + const NDArray &out_data) { + const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed); + auto input_mem = in_data.GetMKLDNNData(); + MKLDNNActForward &fwd = GetActForward(param, ctx, in_data, *input_mem); + auto out_mem = const_cast<NDArray &>(out_data).CreateMKLDNNData( + fwd.fwd_pd.dst_primitive_desc()); + fwd.SetNewMem(*input_mem, *out_mem); + MKLDNNStream *stream = MKLDNNStream::Get(); + stream->RegisterPrim(fwd.GetFwd()); + stream->Submit(); +} + +void MKLDNNActivationBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const NDArray &out_grad, const NDArray &in_data, + const OpReqType &req, const NDArray &in_grad) { + if (req == kNullOp) { + return; + } + + const ActivationParam& param = nnvm::get<ActivationParam>(attrs.parsed); + TmpMemMgr::Get()->Init(ctx.requested[activation::kTempSpace]); + auto diff_dst_memory = out_grad.GetMKLDNNData(); + auto input_mem = in_data.GetMKLDNNData(); + // We need to make sure the two inputs to eltwise_backward have the same memory + // descriptor. Otherwise, the performance will suffer.
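+ // For instance, if out_grad arrives in a blocked layout such as nChw16c while
+ // in_data is still in the plain nchw layout, the reorder below converts in_data
+ // to the blocked layout so eltwise_backward sees matching descriptors.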
+ if (input_mem->get_primitive_desc() != diff_dst_memory->get_primitive_desc()) + input_mem = in_data.GetMKLDNNDataReorder(diff_dst_memory->get_primitive_desc()); + mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); + mkldnn::memory::desc data_md = data_mpd.desc(); + mkldnn::memory::desc diff_md = diff_dst_memory->get_primitive_desc().desc(); + auto cpu_engine = data_mpd.get_engine(); + + MKLDNNStream *stream = MKLDNNStream::Get(); + auto alg = GetMKLDNNActAlgo(param); + mkldnn_output_t diff_src_memory; + + MSHADOW_REAL_TYPE_SWITCH(in_data.dtype(), DType, { + DType alpha = 0; + mkldnn::eltwise_forward::desc fw_desc(mkldnn::prop_kind::forward_training, + alg, data_md, alpha); + mkldnn::eltwise_forward::primitive_desc fw_pdesc(fw_desc, cpu_engine); + mkldnn::eltwise_backward::desc bw_desc(alg, diff_md, data_md, alpha); + mkldnn::eltwise_backward::primitive_desc bw_pdesc(bw_desc, cpu_engine, + fw_pdesc); + + diff_src_memory = CreateMKLDNNMem(in_grad, + bw_pdesc.diff_src_primitive_desc(), req); + stream->RegisterPrim(mkldnn::eltwise_backward(bw_pdesc, *input_mem, + *diff_dst_memory, + *diff_src_memory.second)); + }); + CommitOutput(in_grad, diff_src_memory); + stream->Submit(); +} + +} // namespace op +} // namespace mxnet + +#endif diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h new file mode 100644 index 000000000000..1c583e1f671e --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h @@ -0,0 +1,488 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/******************************************************************************* +* Copyright 2016-2017 Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. 
+* +* \file mkldnn_base-inl.h +* \brief +* \author young.jin.kim@intel.com +* ashok.emani@intel.com +* deepthi.karkada@intel.com +* louis.feng@intel.com +* adam.d.straw@intel.com +* zhengda1936@gmail.com +* +*******************************************************************************/ + +#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_ +#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_ + +#if MXNET_USE_MKLDNN == 1 +#include +#include +#include +#include +#include +#include +#include +#include "mkldnn.hpp" +#include "mxnet/ndarray.h" +#include "mxnet/resource.h" +#include "mxnet/op_attr_types.h" +using namespace mkldnn; +namespace mxnet { +extern bool EnableMkldnnWarnGenerated(); +// ===== CpuEngine ======================================= +// cpu_engine singleton +class CpuEngine { + public: + static CpuEngine *Get() { + // It's thread-safe in C++11. + static thread_local CpuEngine myInstance; + return &myInstance; + } + CpuEngine(CpuEngine const &) = delete; // Copy construct + CpuEngine(CpuEngine &&) = delete; // Move construct + CpuEngine &operator=(CpuEngine const &) = delete; // Copy assign + CpuEngine &operator=(CpuEngine &&) = delete; // Move assign + + mkldnn::engine &get_engine() { return _cpu_engine; } + + protected: + CpuEngine() : _cpu_engine(mkldnn::engine::cpu, 0) {} + ~CpuEngine() {} + + private: + mkldnn::engine _cpu_engine; +}; + +// type enumerator +template <typename T> +struct data_type_enum {}; + +template <> +struct data_type_enum<float> { + enum { type = mkldnn::memory::data_type::f32 }; +}; + +template <> +struct data_type_enum<int32_t> { + enum { type = mkldnn::memory::data_type::s32 }; +}; + +template <> +struct data_type_enum<int16_t> { + enum { type = mkldnn::memory::data_type::s16 }; +}; + +template <> +struct data_type_enum<int8_t> { + enum { type = mkldnn::memory::data_type::s8 }; +}; + +template <> +struct data_type_enum<uint8_t> { + enum { type = mkldnn::memory::data_type::u8 }; +}; + +static inline bool SupportMKLDNNArray(int dtype, const TShape &shape) { + int ndim = shape.ndim(); + bool support = ndim == 1 || ndim == 2 || ndim == 4; + support = support && (dtype == mshadow::kFloat32 || dtype == mshadow::kInt32 + || dtype == mshadow::kInt8 || dtype == mshadow::kUint8); + return support; +} + +static inline bool SupportStorageMKLDNN(int stype) { + return stype == kDefaultStorage; +} + +static inline bool SupportMKLDNN(int dtype, const TShape &shape) { + int ndim = shape.ndim(); + return dtype == mshadow::kFloat32 && (ndim == 1 || ndim == 2 || ndim == 4); +} + +static inline bool SupportMKLDNN(const NDArray &input) { + return SupportMKLDNN(input.dtype(), input.shape()) + && SupportStorageMKLDNN(input.storage_type()); +} + +static inline bool SupportMKLDNNConv(const NDArray &input) { + return input.dtype() == mshadow::kFloat32 && input.shape().ndim() == 4; +} + +/* + * This is to align address to a certain alignment.
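+ * The function returns nullptr when size plus the required padding does not fit
+ * into *space; otherwise it subtracts the padding from *space and returns the
+ * aligned address.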
+ */ +void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space); + +namespace op { +struct ActivationParam; +bool SupportMKLDNNAct(const op::ActivationParam& param); +} + +static int GetTypeSize(int dtype) { + int size = -1; + MSHADOW_TYPE_SWITCH(dtype, DType, { + size = sizeof(DType); + }); + return size; +} + +static inline size_t GetArraySize(const NDArray &arr) { + return arr.shape().Size() * GetTypeSize(arr.dtype()); +} + +static inline mkldnn::memory::data_type get_mkldnn_type(int dtype) { + switch (dtype) { + case mshadow::kFloat32: + return mkldnn::memory::data_type::f32; + case mshadow::kInt32: + return mkldnn::memory::data_type::s32; + case mshadow::kInt8: + return mkldnn::memory::data_type::s8; + case mshadow::kUint8: + return mkldnn::memory::data_type::u8; + default: + LOG(FATAL) << "unknown type for MKLDNN"; + return mkldnn::memory::data_type::data_undef; + } +} + +inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr, int ndim) { + mkldnn::memory::dims dims(ndim); + for (size_t i = 0; i < dims.size(); i++) dims[i] = arr.shape()[i]; + return mkldnn::memory::desc{dims, get_mkldnn_type(arr.dtype()), + mkldnn::memory::format::any}; +} + +inline static mkldnn::memory::desc GetMemDesc(const NDArray &arr) { + return GetMemDesc(arr, arr.shape().ndim()); +} + +inline static mkldnn::memory::desc GetWeightDesc(const NDArray &arr, + int num_groups) { + if (num_groups == 1) { + return GetMemDesc(arr); + } else { + CHECK_EQ(arr.shape().ndim(), 4U); + mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups, + static_cast<int>(arr.shape()[0] / num_groups), + static_cast<int>(arr.shape()[1]), + static_cast<int>(arr.shape()[2]), + static_cast<int>(arr.shape()[3])}; + return mkldnn::memory::desc{tz, get_mkldnn_type(arr.dtype()), + mkldnn::memory::format::any}; + } +} + +typedef std::shared_ptr<mkldnn::memory> mkldnn_mem_ptr; +typedef std::shared_ptr<const mkldnn::memory> mkldnn_mem_const_ptr; + +/* + * This is to manage the temporary memory provided by MXNet for operators. + * The temp memory is mainly used to keep the reordered data. In an operator, we + * may need multiple pieces of memory for them. But MXNet can only provide + * a single piece of memory. This class helps split the temporary memory + * from MXNet into the pieces needed to store the reordered data. + * The amount of temporary memory used in an operator depends on the layout of + * input arrays and the operator. It's difficult to calculate it manually, so + * the class also estimates the amount of memory automatically. + */ +class TmpMemMgr { + // This points to the memory buffer where we can allocate temp memory. + char *curr_mem; + // The total size of the temp memory. + size_t mem_size; + // This contains the current available memory size. + size_t curr_size; + // This estimates the required temp memory size in an operator. + size_t est_size; + const size_t alignment = 4096; + + public: + static TmpMemMgr *Get() { + static thread_local TmpMemMgr mgr; + return &mgr; + } + + TmpMemMgr() { + Reset(); + est_size = 0; + mem_size = 0; + } + + void Reset() { + curr_mem = nullptr; + curr_size = 0; + // We don't reset est_size and mem_size because est_size contains the + // estimated temp memory size from the last run and mem_size contains the + // memory size allocated in the last run. + } + + void Init(const Resource &r) { + // If we estimated last time that we need more memory, we should use the + // larger memory size. + mem_size = std::max(mem_size, est_size); + if (mem_size > 0) { + // Let's allocate some extra memory.
If we don't use some of them all the time, + // the OS won't physically allocate pages for them anyway. + this->curr_size = mem_size * 2; + this->curr_mem = static_cast<char *>(r.get_host_space_internal(this->curr_size)); + } + // reset est_size, so we can start to estimate the temp memory size. + this->est_size = 0; + } + + mkldnn::memory *Alloc(const mkldnn::memory::primitive_desc &pd); +}; + +class MKLDNNStream { + std::vector<mkldnn::primitive> net; + // Here we hold all memory related to the operators in the stream. + std::vector<std::shared_ptr<const mkldnn::memory> > mem_holder; + + public: + static MKLDNNStream *Get() { + static thread_local MKLDNNStream stream; + return &stream; + } + + void RegisterPrim(const mkldnn::primitive &prim) { net.push_back(prim); } + + void RegisterMem(std::shared_ptr<const mkldnn::memory> mem) { + mem_holder.push_back(mem); + } + + bool HasOps() const { + return !net.empty(); + } + + void Submit() { + if (!net.empty()) + mkldnn::stream(mkldnn::stream::kind::eager).submit(net).wait(); + net.clear(); + mem_holder.clear(); + TmpMemMgr::Get()->Reset(); + } +}; + +class MKLDNNOpSignature { + std::vector<uint64_t> eles; + uint64_t hash; + + public: + MKLDNNOpSignature() { + hash = 0; + } + + explicit MKLDNNOpSignature(uint64_t hash) { + this->hash = hash; + } + + /* + * We provide different methods to add a signature to an op. + * For operations, such as convolution and fully connected, which determine + * the optimal data layout for the op, we only need to use the shape and data + * type to sign the op. For other operations, such as activation, which use + * whatever layout is in the input array, we have to use the shape, the data type + * and the layout to sign the op. + */ + + void AddSign(const mkldnn::memory &mem) { + auto desc = mem.get_primitive_desc().desc(); + hash = hash * 2 + desc.data.format; + eles.push_back(desc.data.format); + hash = hash * 2 + desc.data.data_type; + eles.push_back(desc.data.data_type); + for (int i = 0; i < desc.data.ndims; i++) { + hash = hash * 2 + desc.data.dims[i]; + eles.push_back(desc.data.dims[i]); + } + } + + void AddSign(const std::vector<NDArray> &arrs) { + for (auto &arr : arrs) { + AddSign(arr); + } + } + + void AddSign(const NDArray &arr) { + if (arr.IsMKLDNNData()) { + AddSign(*(arr.GetMKLDNNData())); + } else { + hash = hash * 2 + arr.dtype(); + eles.push_back(arr.dtype()); + AddSign(arr.shape()); + } + } + + void AddSign(const TShape &shape) { + for (size_t i = 0; i < shape.ndim(); i++) { + hash = hash * 2 + shape[i]; + eles.push_back(shape[i]); + } + } + + void AddSign(int val) { + hash = hash * 2 + val; + eles.push_back(val); + } + + bool operator==(const MKLDNNOpSignature &sign) const { + if (hash != sign.hash) + return false; + if (eles.size() != sign.eles.size()) + return false; + for (size_t i = 0; i < eles.size(); i++) + if (eles[i] != sign.eles[i]) + return false; + return true; + } + + uint64_t GetHash() const { + return hash; + } +}; + +struct MKLDNNOpHash { + size_t operator()(const MKLDNNOpSignature &sign) const { + return sign.GetHash(); + } +}; + +template<typename ParamType> +class MKLDNNParamOpSign: public MKLDNNOpSignature { + const ParamType param; + + static size_t hash(const ParamType &param) { + std::hash<ParamType> fn; + return fn(param); + } + + public: + explicit MKLDNNParamOpSign(const ParamType &_param): MKLDNNOpSignature( + hash(_param)), param(_param) { + } + + bool operator==(const MKLDNNParamOpSign &sign) const { + const MKLDNNOpSignature &this_upper = *this; + const MKLDNNOpSignature &other_upper = sign; + return this_upper == other_upper && param == sign.param; + } +}; + +enum OutDataOp { + Noop, + CopyBack, + AddBack,
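+ // Noop: the primitive wrote directly into the output array.
+ // CopyBack: the result sits in a temporary buffer and must be copied back.
+ // AddBack: the result must be summed into the existing output (kAddTo requests).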
+}; + +typedef std::pair mkldnn_output_t; + +/* + * These two functions try to create MKLDNN memory in an NDArray based on `req'. + * The difference is that the first function can create MKLDNN memory with + * special layouts in an NDArray, while the second one can only create MKLDNN + * memory with default layouts. + * If these two functions are used, we have to call CommitOutput to write + * the output back to the output NDArray. + */ +mkldnn_output_t CreateMKLDNNMem(const NDArray &arr, + const mkldnn::memory::primitive_desc &desc, + OpReqType req); +mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &arr, + const mkldnn::memory::primitive_desc &desc, + OpReqType req); +/* This function has to be used with one of the functions above. */ +void CommitOutput(const NDArray &arr, const mkldnn_output_t &res); + +static inline void InvalidateOutputs(const std::vector &arrs, + const std::vector &reqs) { + for (size_t i = 0; i < arrs.size(); i++) { + if (reqs[i] == kWriteTo || reqs[i] == kNullOp) { + const_cast(arrs[i]).InvalidateMKLDNNData(); + } + } +} + +const mkldnn::memory *GetWeights(const NDArray &arr, + const mkldnn::memory::primitive_desc &target_pd, + int num_groups); + +mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc); +mkldnn_memory_format_t GetDefaultFormat(int num_dims); +mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd, + mkldnn_memory_format_t format); + +void FallBackCompute(FCompute fn, const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs); + +/* + * This class is used to check the correctness of MKLDNN operators. + */ +class OpCheck { + std::vector inputs; + std::vector outputs; + bool backward; + size_t num_checks; + + public: + OpCheck(bool backward, size_t num_checks) { + this->backward = backward; + this->num_checks = num_checks; + } + + void Init(const std::vector &inputs_, + const std::vector &outputs_); + + void Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs, + const mxnet::OpContext &ctx, + const std::vector &inputs_, + const std::vector &req, + const std::vector &outputs_); +}; + +#define MKLDNN_OPCHECK_INIT(backward, num_checks, inputs, outputs) \ + static bool debug = dmlc::GetEnv("MXNET_MKLDNN_DEBUG", false); \ + OpCheck check(backward, num_checks); \ + if (debug) check.Init(inputs, outputs); + +#define MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs) \ + if (debug) check.Run(fn, attrs, ctx, inputs, req, outputs); + +} // namespace mxnet +#endif +#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BASE_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_base.cc b/src/operator/nn/mkldnn/mkldnn_base.cc new file mode 100644 index 000000000000..c34ca03a2809 --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_base.cc @@ -0,0 +1,385 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#if MXNET_USE_MKLDNN == 1 + +#include +#include "./mkldnn_base-inl.h" +#include "./mkldnn_ops-inl.h" + +namespace mxnet { + +void *AlignMem(void *mem, size_t size, size_t alignment, size_t *space) { + if (size > *space) + return nullptr; + intptr_t addr = reinterpret_cast(mem); + // If the address has been aligned, don't do anything. + intptr_t last_chunk = addr % alignment; + if (last_chunk == 0) + return mem; + intptr_t padding = alignment - last_chunk; + // If the buffer doesn't have enough space, we should return null here. + if (padding + size > *space) + return nullptr; + addr += padding; + *space -= padding; + CHECK_EQ(addr % alignment, 0); + return reinterpret_cast(addr); +} + +mkldnn::memory *TmpMemMgr::Alloc(const mkldnn::memory::primitive_desc &pd) { + // We need to include the size of the memory used for alignment. + this->est_size += pd.get_size() + alignment; + void *mem = AlignMem(this->curr_mem, pd.get_size(), alignment, &this->curr_size); + if (mem) { + // The memory is allocated from the temporary memory space in the + // operator. It'll only become invalid after we exit from the operator. + mkldnn_mem_ptr ret(new mkldnn::memory(pd, mem)); + MKLDNNStream::Get()->RegisterMem(ret); + CHECK_EQ(mem, mem); + this->curr_size -= pd.get_size(); + this->curr_mem = static_cast(mem) + pd.get_size(); + return ret.get(); + } else { + LOG(WARNING) << "Allocate " << pd.get_size() + << " bytes with malloc directly"; + mkldnn_mem_ptr ret(new mkldnn::memory(pd)); + MKLDNNStream::Get()->RegisterMem(ret); + return ret.get(); + } +} + +mkldnn_output_t CreateMKLDNNMem(const NDArray &arr, + const mkldnn::memory::primitive_desc &desc, + OpReqType req) { + if (kAddTo == req) { + auto tmp = TmpMemMgr::Get()->Alloc(desc); + return mkldnn_output_t(OutDataOp::AddBack, tmp); + } else if (kWriteInplace == req) { + // MKLDNN ops may not support the case that the input and the output uses + // the same memory. Let's use an extra copy to make sure it always works. 
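+ // A typical caller pairs CreateMKLDNNMem with CommitOutput; a rough sketch
+ // (primitive_writing_to is only a placeholder for the actual primitive):
+ //   mkldnn_output_t out = CreateMKLDNNMem(out_arr, pd, req);
+ //   MKLDNNStream::Get()->RegisterPrim(primitive_writing_to(*out.second));
+ //   CommitOutput(out_arr, out);
+ //   MKLDNNStream::Get()->Submit();
+ // CommitOutput is a no-op for Noop, copies the result back for CopyBack and
+ // accumulates it into the array for AddBack.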
+ auto tmp = TmpMemMgr::Get()->Alloc(desc); + return mkldnn_output_t(OutDataOp::CopyBack, tmp); + } else { + mkldnn::memory *mem = const_cast(arr).CreateMKLDNNData(desc); + if (mem == nullptr) { + auto tmp = TmpMemMgr::Get()->Alloc(desc); + return mkldnn_output_t(OutDataOp::CopyBack, tmp); + } else { + return mkldnn_output_t(OutDataOp::Noop, mem); + } + } +} + +mkldnn_output_t CreateMKLDNNWeightGrad(const NDArray &arr, + const mkldnn::memory::primitive_desc &desc, + OpReqType req) { + if (kAddTo == req) { + auto tmp = TmpMemMgr::Get()->Alloc(desc); + return mkldnn_output_t(OutDataOp::AddBack, tmp); + } else if (kWriteInplace == req) { + auto tmp = TmpMemMgr::Get()->Alloc(desc); + return mkldnn_output_t(OutDataOp::CopyBack, tmp); + } else { + auto _desc = desc; + auto def_format = GetDefaultFormat(_desc.desc()); + mkldnn::memory *mem = nullptr; + if (def_format == _desc.desc().data.format) { + mem = const_cast(arr).CreateMKLDNNData(desc); + } + if (mem == nullptr) { + auto tmp = TmpMemMgr::Get()->Alloc(desc); + return mkldnn_output_t(OutDataOp::CopyBack, tmp); + } else { + return mkldnn_output_t(OutDataOp::Noop, mem); + } + } +} + +void CommitOutput(const NDArray &arr, const mkldnn_output_t &res) { + if (res.first == CopyBack) { + const_cast(arr).CopyFrom(*res.second); + } else if (res.first == AddBack) { + auto mem = arr.GetMKLDNNData(res.second->get_primitive_desc()); + CHECK(mem != nullptr); + // We have to allocate new memory for the sum result. + auto sum_res = TmpMemMgr::Get()->Alloc( + res.second->get_primitive_desc()); + op::Sum(*res.second, *mem, *sum_res); + const_cast(arr).CopyFrom(*sum_res); + } +} + +const mkldnn::memory *GetWeights(const NDArray &arr, + const mkldnn::memory::primitive_desc &target_pd, + int num_groups) { + const mkldnn::memory *mem = arr.GetMKLDNNData(target_pd); + // If the weight array already uses the target layout, simply return it + // directly. 
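+ // Otherwise the weights are first described in their default layout (oi, oihw
+ // or goihw below) and then reordered into target_pd through a temporary buffer.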
+ if (mem) + return mem; + + mkldnn::memory::data_type type = get_mkldnn_type(arr.dtype()); + auto engine = CpuEngine::Get()->get_engine(); + if (arr.shape().ndim() == 2) { + mkldnn::memory::dims tz = mkldnn::memory::dims{ + static_cast<int>(arr.shape()[0]), static_cast<int>(arr.shape()[1])}; + mkldnn::memory::desc md = + mkldnn::memory::desc{tz, type, mkldnn::memory::format::oi}; + mkldnn::memory::primitive_desc pd = + mkldnn::memory::primitive_desc{md, engine}; + mem = arr.GetMKLDNNData(pd); + } else if (arr.shape().ndim() == 4 && num_groups == 1) { + mkldnn::memory::dims tz = mkldnn::memory::dims{ + static_cast<int>(arr.shape()[0]), static_cast<int>(arr.shape()[1]), + static_cast<int>(arr.shape()[2]), static_cast<int>(arr.shape()[3])}; + mkldnn::memory::desc md = + mkldnn::memory::desc{tz, type, mkldnn::memory::format::oihw}; + mkldnn::memory::primitive_desc pd = + mkldnn::memory::primitive_desc{md, engine}; + mem = arr.GetMKLDNNData(pd); + } else if (arr.shape().ndim() == 4) { + mkldnn::memory::dims tz = mkldnn::memory::dims{ num_groups, + static_cast<int>(arr.shape()[0] / num_groups), + static_cast<int>(arr.shape()[1]), + static_cast<int>(arr.shape()[2]), + static_cast<int>(arr.shape()[3])}; + mkldnn::memory::desc md = + mkldnn::memory::desc{tz, type, mkldnn::memory::format::goihw}; + mkldnn::memory::primitive_desc pd = + mkldnn::memory::primitive_desc{md, engine}; + mem = arr.GetMKLDNNData(pd); + } else { + LOG(FATAL) << "The weight array has an unsupported number of dimensions"; + return nullptr; + } + if (mem == nullptr) + mem = arr.GetMKLDNNDataReorder(target_pd); + if (mem->get_primitive_desc() == target_pd) return mem; + + auto ret = TmpMemMgr::Get()->Alloc(target_pd); + MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder(*mem, *ret)); + return ret; +} + +mkldnn_memory_format_t GetDefaultFormat(int num_dims) { + switch (num_dims) { + case 1: return mkldnn_x; + case 2: return mkldnn_nc; + case 4: return mkldnn_nchw; + case 5: return mkldnn_goihw; + default: + LOG(FATAL) << "Unsupported MKLDNN dimensions: " << num_dims; + return mkldnn_format_undef; + } +} + +mkldnn_memory_format_t GetDefaultFormat(mkldnn::memory::desc desc) { + if (desc.data.ndims == 1) { + return desc.data.format; + } else if (desc.data.ndims == 2) { + if (desc.data.format == mkldnn_io) + return mkldnn_oi; + else + return desc.data.format; + } else if (desc.data.ndims == 4) { + switch (desc.data.format) { + case mkldnn_nchw: + case mkldnn_nhwc: + case mkldnn_chwn: + case mkldnn_nChw8c: + case mkldnn_nChw16c: + return mkldnn_nchw; + case mkldnn_oihw: + case mkldnn_ihwo: + case mkldnn_hwio: + case mkldnn_OIhw8i8o: + case mkldnn_OIhw16i16o: + case mkldnn_OIhw8i16o2i: + case mkldnn_OIhw8o16i2o: + case mkldnn_OIhw8o8i: + case mkldnn_OIhw16o16i: + case mkldnn_IOhw16o16i: + case mkldnn_Oihw8o: + case mkldnn_Oihw16o: + case mkldnn_Ohwi8o: + case mkldnn_Ohwi16o: + case mkldnn_OhIw16o4i: + return mkldnn_oihw; + default: + LOG(FATAL) << "Unknown MKLDNN format for 4 dimensions: " << desc.data.format; + return mkldnn_format_undef; + } + } else if (desc.data.ndims == 5) { + switch (desc.data.format) { + case mkldnn_goihw: + case mkldnn_gOIhw8i8o: + case mkldnn_gOIhw16i16o: + case mkldnn_gOIhw8i16o2i: + case mkldnn_gOIhw8o16i2o: + case mkldnn_gOIhw8o8i: + case mkldnn_gOIhw16o16i: + case mkldnn_gIOhw16o16i: + case mkldnn_gOihw8o: + case mkldnn_gOihw16o: + case mkldnn_gOhwi8o: + case mkldnn_gOhwi16o: + case mkldnn_gOhIw16o4i: + return mkldnn_goihw; + default: + LOG(FATAL) << "Unknown MKLDNN format for 5 dimensions: " << desc.data.format; + return mkldnn_format_undef; + } + } else
{ + LOG(FATAL) << "Unsupported dimensions: " << desc.data.ndims; + return mkldnn_format_undef; + } +} + +mkldnn::memory::primitive_desc GetPrimitiveDesc(mkldnn::memory::primitive_desc pd, + mkldnn_memory_format_t format) { + mkldnn::memory::dims dims(pd.desc().data.ndims); + for (size_t i = 0; i < dims.size(); i++) + dims[i] = pd.desc().data.dims[i]; + mkldnn::memory::format cpp_format = static_cast(format); + mkldnn::memory::data_type cpp_type = static_cast( + pd.desc().data.data_type); + mkldnn::memory::desc data_md(dims, cpp_type, cpp_format); + return mkldnn::memory::primitive_desc(data_md, pd.get_engine()); +} + +void FallBackCompute(FCompute fn, const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) { + in_blobs[i] = inputs[i].data(); + } + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) { + if (req[i] == kWriteTo) + const_cast(outputs[i]).InvalidateMKLDNNData(); + CHECK(outputs[i].IsDefaultData()); + out_blobs[i] = outputs[i].data(); + } + fn(attrs, ctx, in_blobs, req, out_blobs); +} + +template +void print_diff(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2) { + DType *data1 = reinterpret_cast(arr1.data().dptr_); + DType *data2 = reinterpret_cast(arr2.data().dptr_); + for (size_t i = 0; i < arr1.shape().Size(); i++) + std::cout << data1[i] - data2[i] << ", "; + std::cout << std::endl; +} + +template +static bool SimilarArray(const mxnet::NDArray &arr1, const mxnet::NDArray &arr2, + DType rtol, DType atol) { + if (arr1.shape().Size() != arr2.shape().Size()) + return false; + + // This function should be used outside an MKLDNN operator. + // There shouldn't be any operators in the stream. + CHECK(!MKLDNNStream::Get()->HasOps()); + // We need to reorder data in the arrays to the default layout. + // But we shouldn't reorder data in the original array. + NDArray buf1, buf2; + if (arr1.IsMKLDNNData()) { + buf1 = NDArray(arr1.shape(), arr1.ctx(), false, arr1.dtype()); + auto mem = arr1.GetMKLDNNData(); + buf1.CopyFrom(*mem); + } + if (arr2.IsMKLDNNData()) { + buf2 = NDArray(arr2.shape(), arr2.ctx(), false, arr2.dtype()); + auto mem = arr2.GetMKLDNNData(); + buf2.CopyFrom(*mem); + } + MKLDNNStream::Get()->Submit(); + + DType *data1 = reinterpret_cast( + arr1.IsMKLDNNData() ? buf1.data().dptr_: arr1.data().dptr_); + DType *data2 = reinterpret_cast( + arr2.IsMKLDNNData() ? 
buf2.data().dptr_: arr2.data().dptr_); + std::atomic success(true); +#pragma omp parallel for + for (size_t i = 0; i < arr1.shape().Size(); i++) { + if (std::abs(data1[i] - data2[i]) > atol + rtol * std::abs(data2[i])) + success.store(false); + } + return success.load(); +} + +void OpCheck::Init(const std::vector &inputs_, + const std::vector &outputs_) { + auto ctx = inputs_[0].ctx(); + CHECK(!MKLDNNStream::Get()->HasOps()); + for (size_t i = 0; i < inputs_.size(); i++) { + inputs.emplace_back(inputs_[i].shape(), ctx, + false, inputs_[i].dtype()); + auto mem = inputs_[i].GetMKLDNNData(); + inputs[i].CopyFrom(*mem); + } + for (size_t i = 0; i < outputs_.size(); i++) { + outputs.emplace_back(outputs_[i].shape(), ctx, + false, outputs_[i].dtype()); + if (backward) { + auto mem = outputs_[i].GetMKLDNNData(); + outputs[i].CopyFrom(*mem); + } + } + MKLDNNStream::Get()->Submit(); +} + +void OpCheck::Run(mxnet::FCompute fn, const nnvm::NodeAttrs &attrs, + const mxnet::OpContext &ctx, + const std::vector &inputs_, + const std::vector &req, + const std::vector &outputs_) { + std::vector in_blobs(inputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) in_blobs[i] = inputs[i].data(); + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < out_blobs.size(); i++) + out_blobs[i] = outputs[i].data(); + fn(attrs, ctx, in_blobs, req, out_blobs); + + LOG(INFO) << "test " << attrs.op->name; + size_t num = std::min(outputs.size(), outputs_.size()); + num = std::min(num_checks, num); + for (size_t i = 0; i < num; i++) { + // We don't need to compare if it doesn't need to output data. + if (req[i] == kNullOp) + continue; + MSHADOW_TYPE_SWITCH(outputs[i].dtype(), DType, { + bool similar = SimilarArray(outputs[i], outputs_[i], 1e-3, 1e-4); + if (!similar) { + LOG(ERROR) << attrs.op->name << " fails"; + print_diff(outputs[i], outputs_[i]); + } + CHECK(similar); + }); + } +} + +} // namespace mxnet + +#endif diff --git a/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h new file mode 100644 index 000000000000..19a98da6af83 --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_batch_norm-inl.h @@ -0,0 +1,431 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file mkldnn_batch_norm.cc + * \brief + * \author Tao Lv +*/ + +#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_ +#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_ + +#if MXNET_USE_MKLDNN == 1 +#include +#include +#include +#include "../batch_norm-inl.h" +#include "./mkldnn_ops-inl.h" +#include "./mkldnn_base-inl.h" + +#define VARIANCE_TO_INVSTD(__var$, __eps$) (1.0/sqrt((__var$) + DType(__eps$))) +#define INVSTD_TO_VARIANCE(__invstd$, __eps$) ((1.0 / ((__invstd$) * (__invstd$))) - (__eps$)) +namespace mxnet { +namespace op { + +typedef mkldnn::batch_normalization_forward::primitive_desc t_bn_f_pdesc; +typedef mkldnn::batch_normalization_forward::desc t_bn_f_desc; +typedef mkldnn::batch_normalization_backward::primitive_desc t_bn_b_pdesc; +typedef mkldnn::batch_normalization_backward::desc t_bn_b_desc; + +using mkldnn::use_global_stats; +using mkldnn::use_scale_shift; +using mkldnn::forward_training; +using mkldnn::forward_inference; + +inline static unsigned _GetFlags(const std::vector &in_data, + const std::vector &aux_states, + const BatchNormParam ¶m, bool is_train) { + unsigned flags = 0U; + if (in_data.size() == 3U) { + flags |= use_scale_shift; + } + + // aux_states[0]: inMean + // aux_states[1]: inVariance + if (aux_states.size() == 2U && !is_train) { + flags |= use_global_stats; + } + return flags; +} + +template +inline static t_bn_f_pdesc _GetFwd(const mkldnn::memory &data_mem, + bool is_train, + DType eps, + unsigned flags) { + auto data_mpd = data_mem.get_primitive_desc(); + auto data_md = data_mpd.desc(); + auto engine = CpuEngine::Get()->get_engine(); + + if (is_train) { + t_bn_f_desc bnFwd_desc(forward_training, data_md, eps, flags); + return t_bn_f_pdesc(bnFwd_desc, engine); + } else { + t_bn_f_desc bnFwd_desc(forward_inference, data_md, eps, flags); + return t_bn_f_pdesc(bnFwd_desc, engine); + } +} + +template +inline static t_bn_b_pdesc _GetBwd(const mkldnn::memory &data_mem, + const mkldnn::memory &diff_mem, + DType eps, + unsigned flags) { + auto data_mpd = data_mem.get_primitive_desc(); + auto data_md = data_mpd.desc(); + auto diff_mpd = diff_mem.get_primitive_desc(); + auto diff_md = diff_mpd.desc(); + auto engine = CpuEngine::Get()->get_engine(); + + t_bn_b_desc bnBwd_desc(mkldnn::prop_kind::backward, diff_md, data_md, eps, flags); + return t_bn_b_pdesc(bnBwd_desc, engine, _GetFwd(data_mem, true, eps, flags)); +} + +typedef MKLDNNParamOpSign MKLDNNBNSignature; + +class MKLDNNBNForward { + std::shared_ptr data_m; + std::shared_ptr weight_m; + std::shared_ptr out_m; + std::shared_ptr mean_m; + std::shared_ptr var_m; + std::shared_ptr fwd; + bool is_train; + t_bn_f_pdesc pd; + + public: + MKLDNNBNForward(const t_bn_f_pdesc &_pd, bool is_train): pd(_pd) { + weight_m.reset(new mkldnn::memory(pd.weights_primitive_desc())); + this->is_train = is_train; + } + + const mkldnn::memory &GetWeight() const { + return *weight_m; + } + + const t_bn_f_pdesc &GetPd() const { + return pd; + } + + const mkldnn::memory &GetMean() const { + return *mean_m; + } + + const mkldnn::memory &GetVar() const { + return *var_m; + } + + void SetDataHandle(const NDArray &data, const NDArray &mean, + const NDArray &var, const mkldnn::memory &out) { + auto _data = data.GetMKLDNNData(); + if (data_m) { + data_m->set_data_handle(_data->get_data_handle()); + } else { + data_m.reset(new mkldnn::memory(_data->get_primitive_desc(), + _data->get_data_handle())); + } + if (out_m) { + out_m->set_data_handle(out.get_data_handle()); + } else { + out_m.reset(new 
mkldnn::memory(out.get_primitive_desc(), + out.get_data_handle())); + } + auto mean_ptr = mean.data().dptr_; + if (mean_m) { + mean_m->set_data_handle(mean_ptr); + } else { + mean_m.reset(new mkldnn::memory(pd.mean_primitive_desc(), + mean_ptr)); + } + auto var_ptr = var.data().dptr_; + if (var_m) { + var_m->set_data_handle(var_ptr); + } else { + var_m.reset(new mkldnn::memory(pd.variance_primitive_desc(), + var_ptr)); + } + + if (fwd == nullptr) { + if (!is_train) + fwd.reset(new mkldnn::batch_normalization_forward( + pd, *data_m, mkldnn::primitive::at(*mean_m), + mkldnn::primitive::at(*var_m), *weight_m, *out_m)); + else + fwd.reset(new mkldnn::batch_normalization_forward( + pd, mkldnn::primitive::at(*data_m), + mkldnn::primitive::at(*weight_m), *out_m, + *mean_m, *var_m)); + } + } + + const mkldnn::batch_normalization_forward &GetFwd() const { + return *fwd; + } +}; + +template +static MKLDNNBNForward &GetBNForward(const BatchNormParam& param, + const OpContext &ctx, const NDArray &in_data, + unsigned flags) { + static thread_local std::unordered_map fwds; + MKLDNNBNSignature key(param); + key.AddSign(ctx.is_train); + key.AddSign(in_data); + + auto it = fwds.find(key); + if (it == fwds.end()) { + auto fwd_pd = _GetFwd(*in_data.GetMKLDNNData(), ctx.is_train, + (DType) param.eps, flags); + MKLDNNBNForward fwd(fwd_pd, ctx.is_train); + auto ins_ret = fwds.insert(std::pair( + key, fwd)); + CHECK(ins_ret.second); + it = ins_ret.first; + } + return it->second; +} + +template +void MKLDNNBatchNormForward(const OpContext &ctx, const BatchNormParam ¶m, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data, + const std::vector &aux_states) { + TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]); + unsigned flags = _GetFlags(in_data, aux_states, param, ctx.is_train); + const NDArray &data = in_data[batchnorm::kData]; + + auto &fwd = GetBNForward(param, ctx, data, flags); + const NDArray &out = out_data[batchnorm::kOut]; + + // for output memory + auto out_mem = const_cast(out).CreateMKLDNNData(fwd.GetPd().dst_primitive_desc()); + + // mxnet will always use scale shift. 
+ // But if fix_gamma is true, then all scale elements will be set to 1.0f + if (flags & use_scale_shift) { + const NDArray &gamma = in_data[batchnorm::kGamma]; + const NDArray &beta = in_data[batchnorm::kBeta]; + CHECK_EQ(gamma.storage_type(), mxnet::kDefaultStorage); + CHECK_EQ(beta.storage_type(), mxnet::kDefaultStorage); + + const mkldnn::memory &weight_mem = fwd.GetWeight(); + DType* weight_buf = reinterpret_cast(weight_mem.get_data_handle()); + + nnvm::dim_t channels_ = data.shape()[1]; + CHECK(weight_mem.get_primitive_desc().get_size() == channels_ * sizeof(DType) * 2); + DType* weight_ptr = gamma.data().dptr(); + DType* bias_ptr = beta.data().dptr(); + if (!param.fix_gamma) { +#pragma omp parallel for simd + for (int i = 0; i < channels_; i++) { + weight_buf[i] = weight_ptr[i]; + weight_buf[channels_ + i] = bias_ptr[i]; // bias + } + } else if (IsBNWriting(req[batchnorm::kGamma])) { +#pragma omp parallel for simd + for (int i = 0; i < channels_; i++) { + weight_buf[i] = (DType)1.0f; + weight_ptr[i] = (DType)1.0f; + weight_buf[channels_ + i] = bias_ptr[i]; // bias + } + } else { +#pragma omp parallel for simd + for (int i = 0; i < channels_; i++) { + weight_buf[i] = (DType)1.0f; + weight_buf[channels_ + i] = bias_ptr[i]; // bias + } + } + + if (!ctx.is_train) { + DType* omean = out_data[batchnorm::kMean].data().dptr(); + DType* ovar = out_data[batchnorm::kVar].data().dptr(); + DType* inmean = aux_states[batchnorm::kMovingMean].data().dptr(); + DType* invar = aux_states[batchnorm::kMovingVar].data().dptr(); + // to align with origin implmentation: batch_norm.cc: L164 +#pragma omp parallel for simd + for (int i = 0; i < channels_; i++) { + omean[i] = inmean[i]; + ovar[i] = VARIANCE_TO_INVSTD(invar[i], param.eps); + } + + fwd.SetDataHandle(data, aux_states[batchnorm::kMovingMean], + aux_states[batchnorm::kMovingVar], + *out_mem); + MKLDNNStream::Get()->RegisterPrim(fwd.GetFwd()); + MKLDNNStream::Get()->Submit(); + } else { // training + const NDArray &outMean = out_data[batchnorm::kMean]; + const NDArray &outVar = out_data[batchnorm::kVar]; + DType* omean = outMean.data().dptr(); + DType* ovar = outVar.data().dptr(); + + fwd.SetDataHandle(data, outMean, outVar, *out_mem); + MKLDNNStream::Get()->RegisterPrim(fwd.GetFwd()); + MKLDNNStream::Get()->Submit(); + DType* mean_mem_ptr = reinterpret_cast(fwd.GetMean().get_data_handle()); + DType* var_mem_ptr = reinterpret_cast(fwd.GetVar().get_data_handle()); +#pragma omp parallel for simd + for (int i = 0; i < channels_; i++) { + omean[i] = mean_mem_ptr[i]; + ovar[i] = VARIANCE_TO_INVSTD(var_mem_ptr[i], param.eps); + } + } + } else { // no input gamma and beta + LOG(FATAL) << "MKLDNN batch normalization: should not reach here ..."; + } +} + +template +void MKLDNNBatchNormBackward(const OpContext &ctx, const BatchNormParam ¶m, + const std::vector &out_grad, + const std::vector &in_data, + const std::vector &out_data, + const std::vector &req, + const std::vector &in_grad, + const std::vector &aux_states) { + TmpMemMgr::Get()->Init(ctx.requested[batchnorm::kTempSpace]); + CHECK_EQ(out_grad.size(), param.output_mean_var ? 
3U : 1U); + CHECK_EQ(in_data.size(), 3U); + CHECK_EQ(out_data.size(), 3U); + CHECK_EQ(in_grad.size(), 3U); + unsigned flags = _GetFlags(in_data, aux_states, param, ctx.is_train); + + const NDArray &data = in_data[batchnorm::kData]; + const NDArray &diff = out_grad[batchnorm::kOut]; + const NDArray &gradIn = in_grad[batchnorm::kData]; + const NDArray &moving_mean = aux_states[batchnorm::kMovingMean]; + const NDArray &moving_var = aux_states[batchnorm::kMovingVar]; + const NDArray &out_mean = out_data[batchnorm::kMean]; + const NDArray &out_var = out_data[batchnorm::kVar]; + + CHECK(out_mean.IsDefaultData()); + CHECK(out_var.IsDefaultData()); + CHECK(moving_mean.IsDefaultData()); + CHECK(moving_var.IsDefaultData()); + + auto data_mem = data.GetMKLDNNData(); + auto diff_mem = diff.GetMKLDNNData(); + // MKLDNN batchnorm should run on special layouts. If one of them isn't, we + // should reorder them. + if (data.IsDefaultData()) + data_mem = data.GetMKLDNNDataReorder(diff_mem->get_primitive_desc()); + else if (diff.IsDefaultData()) + diff_mem = diff.GetMKLDNNDataReorder(data_mem->get_primitive_desc()); + auto bwd_pd = _GetBwd(*data_mem, *diff_mem, param.eps, flags); + auto gradi_mem = const_cast(gradIn).CreateMKLDNNData(data_mem->get_primitive_desc()); + + if (flags & use_scale_shift) { + const NDArray &gamma = in_data[batchnorm::kGamma]; + const NDArray &beta = in_data[batchnorm::kBeta]; + // TODO(tao): how to reuse this memory? + std::shared_ptr weight_mem( + new mkldnn::memory(bwd_pd.weights_primitive_desc())); + + DType* weight_buf = reinterpret_cast(weight_mem->get_data_handle()); + nnvm::dim_t channels_ = data.shape()[1]; + for (int i = 0; i < channels_; i++) { + if (!param.fix_gamma) + weight_buf[i] = (gamma.data().dptr())[i]; // weight + else + weight_buf[i] = (DType)1.0f; + } + + for (int i = 0; i < channels_; i++) { + weight_buf[channels_ + i] = (beta.data().dptr())[i]; // bias + } + + std::shared_ptr gradw_mem( + new mkldnn::memory(bwd_pd.diff_weights_primitive_desc())); + // training but no input mean and variance + if (ctx.is_train && !param.use_global_stats) { + DType* moving_mean_ptr = reinterpret_cast(moving_mean.data().dptr()); + DType* moving_var_ptr = reinterpret_cast(moving_var.data().dptr()); + DType* out_mean_ptr = reinterpret_cast(out_mean.data().dptr()); + DType* out_var_ptr = reinterpret_cast(out_var.data().dptr()); + mkldnn::memory var_mem(bwd_pd.variance_primitive_desc()); + DType *tmp_var_ptr = reinterpret_cast(var_mem.get_data_handle()); + + DType minus_mom = (1.0f - param.momentum); + for (int i = 0; i < channels_; i++) { + moving_mean_ptr[i] = moving_mean_ptr[i] * param.momentum + + out_mean_ptr[i] * minus_mom; + float variance = INVSTD_TO_VARIANCE(out_var_ptr[i], param.eps); + tmp_var_ptr[i] = variance; + moving_var_ptr[i] = moving_var_ptr[i] * param.momentum + + variance * minus_mom; + } + + std::shared_ptr out_mean_mem( + new mkldnn::memory(bwd_pd.mean_primitive_desc(), out_mean_ptr)); + std::shared_ptr out_var_mem( + new mkldnn::memory(bwd_pd.variance_primitive_desc(), out_var_ptr)); + + auto bn_bwd = mkldnn::batch_normalization_backward(bwd_pd, + *data_mem, + mkldnn::primitive::at(*out_mean_mem), + mkldnn::primitive::at(var_mem), + *diff_mem, + *weight_mem, + *gradi_mem, + *gradw_mem); + + MKLDNNStream::Get()->RegisterPrim(bn_bwd); + MKLDNNStream::Get()->Submit(); + } else { + std::shared_ptr imean_mem( + new mkldnn::memory(bwd_pd.mean_primitive_desc(), + moving_mean.data().dptr())); + std::shared_ptr ivar_mem( + new 
mkldnn::memory(bwd_pd.variance_primitive_desc(), + moving_var.data().dptr())); + auto bn_bwd = mkldnn::batch_normalization_backward(bwd_pd, + *data_mem, + mkldnn::primitive::at(*imean_mem), + mkldnn::primitive::at(*ivar_mem), + *diff_mem, + *weight_mem, + *gradi_mem, + *gradw_mem); + + MKLDNNStream::Get()->RegisterPrim(bn_bwd); + MKLDNNStream::Get()->Submit(); + } + + // copy data from gradw_mem to in_grad[1] and in_grad[2] + DType* gw_buf = reinterpret_cast(gradw_mem->get_data_handle()); + for (int i = 0; i < channels_; i++) { + if (!param.fix_gamma) + (in_grad[1].data().dptr())[i] = gw_buf[i]; + else + (in_grad[1].data().dptr())[i] = 0.0f; + } + + for (int i = 0; i < channels_; i++) { + (in_grad[2].data().dptr())[i] = gw_buf[i + channels_]; + } + } else { + LOG(FATAL) << "MKLDNN batch normalization backward: should not reach here ..."; + } +} +} // namespace op +} // namespace mxnet +#endif // MXNET_USE_MKLDNN +#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_BATCH_NORM_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_concat.cc b/src/operator/nn/mkldnn/mkldnn_concat.cc new file mode 100644 index 000000000000..d3e6e775020d --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_concat.cc @@ -0,0 +1,95 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file mkldnn_concat.cc + * \brief + * \author Wenting Jiang +*/ +#include "../concat-inl.h" +#include "./mkldnn_ops-inl.h" +#include "./mkldnn_base-inl.h" + +#if MXNET_USE_MKLDNN == 1 +namespace mxnet { +namespace op { + +void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]); + const ConcatParam& param = nnvm::get(attrs.parsed); + int num_in_data = param.num_args; + int concat_dim = param.dim; + std::vector data_md; + std::vector data_mem; + for (int i =0; i < num_in_data; i++) { + auto tmp_mem = in_data[i].GetMKLDNNData(); + auto tmp_pd = tmp_mem->get_primitive_desc(); + data_md.push_back(tmp_pd); + data_mem.push_back(*tmp_mem); + } + mkldnn::concat::primitive_desc fwd_pd(concat_dim, data_md); + auto engine = CpuEngine::Get()->get_engine(); + auto out_mem = CreateMKLDNNMem(out_data[concat_enum::kOut], + fwd_pd.dst_primitive_desc(), req[concat_enum::kOut]); + MKLDNNStream::Get()->RegisterPrim(mkldnn::concat(fwd_pd, data_mem, *out_mem.second)); + CommitOutput(out_data[concat_enum::kOut], out_mem); + MKLDNNStream::Get()->Submit(); +} + +void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + TmpMemMgr::Get()->Init(ctx.requested[concat_enum::kTempSpace]); + const ConcatParam& param = nnvm::get(attrs.parsed); + int num_in_data = param.num_args; + int axis_ = param.dim; + auto engine = CpuEngine::Get()->get_engine(); + auto gz_mem = inputs[0].GetMKLDNNData(); + mkldnn::memory::primitive_desc gz_pd = gz_mem->get_primitive_desc(); + /* init the offset */ + mkldnn::memory::dims offsets = {0, 0, 0, 0}; + for (int i = 0; i < num_in_data; i++) { + mkldnn::memory::dims diff_src_tz + = {static_cast(inputs[i+1].shape()[0]), + static_cast(inputs[i+1].shape()[1]), + static_cast(inputs[i+1].shape()[2]), + static_cast(inputs[i+1].shape()[3])}; + auto diff_src_mpd = inputs[i+1].GetMKLDNNData()->get_primitive_desc(); + auto gradi_mem_ = CreateMKLDNNMem(outputs[i], diff_src_mpd, req[i]); + // create view from gy to gxs[i] + std::shared_ptr view_pd; + view_pd.reset(new mkldnn::view::primitive_desc(gz_pd, diff_src_tz, offsets)); + // create reorder primitive from gy to gxs[i] + mkldnn::reorder::primitive_desc reorder_pd( + view_pd.get()->dst_primitive_desc(), diff_src_mpd); + offsets[axis_] += diff_src_tz[axis_]; + MKLDNNStream::Get()->RegisterPrim(mkldnn::reorder( + reorder_pd, *gz_mem, *gradi_mem_.second)); + CommitOutput(outputs[i], gradi_mem_); + } + MKLDNNStream::Get()->Submit(); +} + +} // namespace op +} // namespace mxnet +#endif diff --git a/src/operator/nn/mkldnn/mkldnn_convolution.cc b/src/operator/nn/mkldnn/mkldnn_convolution.cc new file mode 100644 index 000000000000..b94850aa620b --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_convolution.cc @@ -0,0 +1,357 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_convolution.cc + * \brief + * \author Da Zheng +*/ + +#include "../convolution-inl.h" +#include "./mkldnn_ops-inl.h" +#include "./mkldnn_base-inl.h" + +#if MXNET_USE_MKLDNN == 1 +namespace mxnet { +namespace op { + +static mkldnn::convolution_forward::primitive_desc GetConvFwdImpl( + const ConvolutionParam& param, bool is_train, const NDArray &data, + const NDArray &weights, const NDArray *bias, const NDArray &output) { + auto prop = is_train ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring; + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weights, param.num_group); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Get()->get_engine(); + mkldnn::memory::dims strides{0, 0}; + if (param.stride.ndim() == 2) { + strides[0] = param.stride[0]; + strides[1] = param.stride[1]; + } + mkldnn::memory::dims padding{0, 0}; + if (param.pad.ndim() == 2) { + padding[0] = param.pad[0]; + padding[1] = param.pad[1]; + } + if (param.dilate.ndim() == 0 && bias == nullptr) { + mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, + data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_forward::primitive_desc(desc, engine); + } else if (param.dilate.ndim() == 0) { + auto bias_md = GetMemDesc(*bias); + mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, + data_md, weight_md, bias_md, out_md, strides, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_forward::primitive_desc(desc, engine); + } else { + mkldnn::memory::dims dilates{0, 0}; + if (param.dilate.ndim() == 2) { + dilates[0] = param.dilate[0] - 1; + dilates[1] = param.dilate[1] - 1; + } + if (bias == nullptr) { + mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, + data_md, weight_md, out_md, strides, dilates, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_forward::primitive_desc(desc, engine); + } else { + auto bias_md = GetMemDesc(*bias); + mkldnn::convolution_forward::desc desc(prop, mkldnn::algorithm::convolution_direct, + data_md, weight_md, bias_md, out_md, strides, + dilates, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_forward::primitive_desc(desc, engine); + } + } +} + +static mkldnn::convolution_backward_data::primitive_desc GetConvBwdData( + const ConvolutionParam& param, const NDArray &data, const NDArray &weights, + const NDArray &output, const mkldnn::convolution_forward::primitive_desc &fwd_pd) { + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weights, param.num_group); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Get()->get_engine(); + mkldnn::memory::dims strides{0, 0}; + if (param.stride.ndim() == 2) { + strides[0] = param.stride[0]; + strides[1] = param.stride[1]; + } + mkldnn::memory::dims padding{0, 0}; + if (param.pad.ndim() == 2) { + padding[0] = param.pad[0]; + padding[1] = param.pad[1]; + } + if (param.dilate.ndim() == 0) { + 
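+ // Non-dilated case: build the plain direct-convolution backward-data descriptor.
+ // The dilated branch below passes (dilate - 1) values, following MKLDNN's convention.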
mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, + data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_data::primitive_desc(desc, engine, fwd_pd); + } else { + mkldnn::memory::dims dilates{0, 0}; + if (param.dilate.ndim() == 2) { + dilates[0] = param.dilate[0] - 1; + dilates[1] = param.dilate[1] - 1; + } + mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, + data_md, weight_md, out_md, strides, dilates, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_data::primitive_desc(desc, engine, fwd_pd); + } +} + +static mkldnn::convolution_backward_weights::primitive_desc GetConvBwdWeights( + const ConvolutionParam& param, const NDArray &data, + const NDArray &weights, const NDArray *bias, const NDArray &output, + const mkldnn::convolution_forward::primitive_desc &fwd_pd) { + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weights, param.num_group); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Get()->get_engine(); + mkldnn::memory::dims strides{0, 0}; + if (param.stride.ndim() == 2) { + strides[0] = param.stride[0]; + strides[1] = param.stride[1]; + } + mkldnn::memory::dims padding{0, 0}; + if (param.pad.ndim() == 2) { + padding[0] = param.pad[0]; + padding[1] = param.pad[1]; + } + if (param.dilate.ndim() == 0 && bias == nullptr) { + mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, + data_md, weight_md, out_md, strides, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); + } else if (param.dilate.ndim() == 0) { + auto bias_md = GetMemDesc(*bias); + mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, + data_md, weight_md, bias_md, out_md, strides, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); + } else { + mkldnn::memory::dims dilates{0, 0}; + if (param.dilate.ndim() == 2) { + dilates[0] = param.dilate[0] - 1; + dilates[1] = param.dilate[1] - 1; + } + if (bias == nullptr) { + mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, + data_md, weight_md, out_md, strides, dilates, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); + } else { + auto bias_md = GetMemDesc(*bias); + mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, + data_md, weight_md, bias_md, out_md, + strides, dilates, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); + } + } +} + +class MKLDNNConvForward { + std::shared_ptr fwd; + std::shared_ptr data; + std::shared_ptr weight; + std::shared_ptr bias; + std::shared_ptr out; + + public: + mkldnn::convolution_forward::primitive_desc fwd_pd; + + MKLDNNConvForward(const ConvolutionParam& param, bool is_train, + const NDArray &data, const NDArray &weights, + const NDArray *bias, const NDArray &output): fwd_pd( + GetConvFwdImpl(param, is_train, data, weights, bias, output)) { + } + + void SetNewMem(const mkldnn::memory &data, const mkldnn::memory &weight, + const mkldnn::memory *bias, const mkldnn::memory &output) { + if (this->data == nullptr) + this->data = std::shared_ptr(new mkldnn::memory( + 
fwd_pd.src_primitive_desc(), data.get_data_handle())); + else + this->data->set_data_handle(data.get_data_handle()); + + if (this->weight == nullptr) + this->weight = std::shared_ptr(new mkldnn::memory( + fwd_pd.weights_primitive_desc(), weight.get_data_handle())); + else + this->weight->set_data_handle(weight.get_data_handle()); + + if (this->out == nullptr) + this->out = std::shared_ptr(new mkldnn::memory( + fwd_pd.dst_primitive_desc(), output.get_data_handle())); + else + this->out->set_data_handle(output.get_data_handle()); + + if (bias != nullptr) { + if (this->bias == nullptr) + this->bias = std::shared_ptr(new mkldnn::memory( + fwd_pd.bias_primitive_desc(), bias->get_data_handle())); + else + this->bias->set_data_handle(bias->get_data_handle()); + if (this->fwd == nullptr) + this->fwd = std::shared_ptr( + new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data), + mkldnn::primitive::at(*this->weight), + mkldnn::primitive::at(*this->bias), + *this->out)); + } else if (this->fwd == nullptr) { + this->fwd = std::shared_ptr( + new mkldnn::convolution_forward(fwd_pd, mkldnn::primitive::at(*this->data), + mkldnn::primitive::at(*this->weight), + *this->out)); + } + } + + const mkldnn::convolution_forward &GetFwd() const { + return *fwd; + } +}; + +typedef MKLDNNParamOpSign MKLDNNConvSignature; + +static inline MKLDNNConvForward &GetConvFwd( + const nnvm::NodeAttrs& attrs, bool is_train, + const NDArray &data, const NDArray &weights, + const NDArray *bias, const NDArray &output) { + static thread_local std::unordered_map fwds; + const ConvolutionParam& param = nnvm::get(attrs.parsed); + MKLDNNConvSignature key(param); + key.AddSign(is_train); + // Here we can sign the conv op with NDArray because conv primitive will + // decide the right layout for the, so we only need to get the shape and the + // data type of the arrays. + key.AddSign(data); + key.AddSign(weights); + key.AddSign(output); + if (bias) + key.AddSign(*bias); + + auto it = fwds.find(key); + if (it == fwds.end()) { + MKLDNNConvForward fwd(param, is_train, data, weights, bias, output); + auto ins_ret = fwds.insert( + std::pair(key, fwd)); + CHECK(ins_ret.second); + it = ins_ret.first; + } + return it->second; +} + +void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]); + const ConvolutionParam& param = nnvm::get(attrs.parsed); + MKLDNNConvForward &fwd = GetConvFwd(attrs, + ctx.is_train, in_data[conv::kData], in_data[conv::kWeight], + param.no_bias ? nullptr : &in_data[conv::kBias], out_data[conv::kOut]); + + auto data_mem = in_data[conv::kData].GetMKLDNNDataReorder(fwd.fwd_pd.src_primitive_desc()); + const mkldnn::memory *weight_mem; + if (ctx.is_train) { + // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it + // to the default format for now. + if (in_data[conv::kWeight].IsMKLDNNData()) + const_cast(in_data[conv::kWeight]).Reorder2Default(); + weight_mem = GetWeights(in_data[conv::kWeight], fwd.fwd_pd.weights_primitive_desc(), + param.num_group); + } else { + // For inference, we want to reorder the weight array so we don't need to + // reorder data every time. 
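+    // The in-place reorder below converts the weight NDArray to the layout
+    // the primitive prefers, so later inference calls can reuse it without
+    // reordering the weights again.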
+ const_cast(in_data[conv::kWeight]).MKLDNNDataReorder( + fwd.fwd_pd.weights_primitive_desc()); + weight_mem = in_data[conv::kWeight].GetMKLDNNData(); + } + auto out_mem = CreateMKLDNNMem(out_data[conv::kOut], fwd.fwd_pd.dst_primitive_desc(), + req[conv::kOut]); + const mkldnn::memory *bias_mem = nullptr; + if (!param.no_bias) + bias_mem = in_data[conv::kBias].GetMKLDNNDataReorder(fwd.fwd_pd.bias_primitive_desc()); + fwd.SetNewMem(*data_mem, *weight_mem, bias_mem, *out_mem.second); + MKLDNNStream::Get()->RegisterPrim(fwd.GetFwd()); + + CommitOutput(out_data[conv::kOut], out_mem); + MKLDNNStream::Get()->Submit(); +} + +void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + TmpMemMgr::Get()->Init(ctx.requested[conv::kTempSpace]); + const std::vector &in_grad = outputs; + const ConvolutionParam& param = nnvm::get(attrs.parsed); + mkldnn::convolution_forward::primitive_desc fwd_pd = GetConvFwdImpl(param, ctx.is_train, + inputs[conv::kData + 1], inputs[conv::kWeight + 1], + param.no_bias ? nullptr : &inputs[conv::kBias + 1], inputs[conv::kOut]); + + CHECK_NE(req[conv::kWeight], kWriteInplace) << "cannot write weight inplace"; + mkldnn::convolution_backward_data::primitive_desc bwdData_pd + = GetConvBwdData(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1], + inputs[conv::kOut], fwd_pd); + auto out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder( + bwdData_pd.diff_dst_primitive_desc()); + if (req[conv::kData]) { + auto weight_mem = GetWeights(inputs[conv::kWeight + 1], + bwdData_pd.weights_primitive_desc(), param.num_group); + auto in_grad_mem = CreateMKLDNNMem(in_grad[conv::kData], + bwdData_pd.diff_src_primitive_desc(), req[conv::kData]); + MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_data(bwdData_pd, + *out_grad_mem, *weight_mem, *in_grad_mem.second)); + CommitOutput(in_grad[conv::kData], in_grad_mem); + } + if (req[conv::kWeight]) { + mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd + = GetConvBwdWeights(param, inputs[conv::kData + 1], inputs[conv::kWeight + 1], + param.no_bias ? 
nullptr : &inputs[conv::kBias + 1], + inputs[conv::kOut], fwd_pd); + if (bwdData_pd.diff_dst_primitive_desc() != bwdWeights_pd.diff_dst_primitive_desc()) + out_grad_mem = inputs[conv::kOut].GetMKLDNNDataReorder( + bwdWeights_pd.diff_dst_primitive_desc()); + auto data_mem = inputs[conv::kData + 1].GetMKLDNNDataReorder( + bwdWeights_pd.src_primitive_desc()); + auto in_grad_weight = CreateMKLDNNWeightGrad(in_grad[conv::kWeight], + bwdWeights_pd.diff_weights_primitive_desc(), + req[conv::kWeight]); + mkldnn_output_t in_grad_bias; + if (param.no_bias) { + MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights( + bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second)); + } else { + in_grad_bias = CreateMKLDNNMem(in_grad[conv::kBias], + bwdWeights_pd.diff_bias_primitive_desc(), + req[conv::kBias]); + MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights( + bwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second, + *in_grad_bias.second)); + CommitOutput(in_grad[conv::kBias], in_grad_bias); + } + CommitOutput(in_grad[conv::kWeight], in_grad_weight); + } + MKLDNNStream::Get()->Submit(); +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_copy.cc b/src/operator/nn/mkldnn/mkldnn_copy.cc new file mode 100644 index 000000000000..71d540c969cd --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_copy.cc @@ -0,0 +1,57 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_softmax.cc + * \brief + * \author Da Zheng +*/ + +#include "../softmax-inl.h" +#include "./mkldnn_ops-inl.h" +#include "./mkldnn_base-inl.h" + +#if MXNET_USE_MKLDNN == 1 +namespace mxnet { +namespace op { + +void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const NDArray &in_data, const OpReqType &req, + const NDArray &out_data) { + TmpMemMgr::Get()->Init(ctx.requested[0]); + auto in_mem = in_data.GetMKLDNNData(); + if (req == kAddTo) { + TmpMemMgr::Get()->Init(ctx.requested[0]); + // We should try and force the output memory has the same format + // as the input memory. If not, we'll have to reorder memory. 
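+    // kAddTo: accumulate input + output into a temporary buffer first, then
+    // copy the sum back into the output array.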
+ auto out_mem = out_data.GetMKLDNNData(in_mem->get_primitive_desc()); + if (out_mem == nullptr) + out_mem = out_data.GetMKLDNNData(); + auto sum_res = TmpMemMgr::Get()->Alloc(out_mem->get_primitive_desc()); + Sum(*in_mem, *out_mem, *sum_res); + const_cast(out_data).CopyFrom(*sum_res); + } else { + const_cast(out_data).CopyFrom(*in_mem); + } + MKLDNNStream::Get()->Submit(); +} + +} // namespace op +} // namespace mxnet +#endif diff --git a/src/operator/nn/mkldnn/mkldnn_deconvolution.cc b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc new file mode 100644 index 000000000000..d336d6dedbea --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_deconvolution.cc @@ -0,0 +1,390 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_deconvolution.cc + * \brief + * \author Da Zheng, Rong Zhang (rong.a.zhang@intel.com) +*/ + +#if MXNET_USE_MKLDNN == 1 + +#include "../deconvolution-inl.h" +#include "./mkldnn_ops-inl.h" +#include "./mkldnn_base-inl.h" + +namespace mxnet { +namespace op { + +static inline mkldnn::memory::desc GetBiasDesc(mkldnn::memory::desc md) { + mkldnn::memory::dims dims(1); + // This is convolution on 4D data. The second dimension is the channel. 
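+  // The bias is a 1-D tensor whose length is the channel dimension (dims[1])
+  // of the given 4-D descriptor; format::any lets MKL-DNN pick the layout.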
+ dims[0] = md.data.dims[1]; + return mkldnn::memory::desc(dims, + static_cast(md.data.data_type), + mkldnn::memory::format::any); +} + +static mkldnn::convolution_forward::primitive_desc GetDeconvBwd_( + const mkldnn::memory::desc &data_md, const mkldnn::memory::desc &weights_md, + bool has_bias, const mkldnn::memory::desc &out_md, + const mkldnn::engine &engine, const mkldnn::memory::dims &strides, + const mkldnn::memory::dims &padding, const mkldnn::memory::dims &dilates) { + if (!has_bias) { + mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training, + mkldnn::algorithm::convolution_direct, out_md, weights_md, data_md, strides, + dilates, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_forward::primitive_desc(desc, engine); + } else { + auto bias_md = GetBiasDesc(data_md); + mkldnn::convolution_forward::desc desc(mkldnn::prop_kind::forward_training, + mkldnn::algorithm::convolution_direct, out_md, weights_md, bias_md, + data_md, strides, dilates, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_forward::primitive_desc(desc, engine); + } +} + +static mkldnn::convolution_backward_data::primitive_desc GetDeconvFwdImpl( + const DeconvolutionParam& param, const NDArray &data, const NDArray &weights, + bool has_bias, const NDArray &output) { + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weights, param.num_group); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Get()->get_engine(); + mkldnn::memory::dims strides{0, 0}; + if (param.stride.ndim() == 2) { + strides[0] = param.stride[0]; + strides[1] = param.stride[1]; + } else if (param.stride.ndim() == 1) { + strides[0] = param.stride[0]; + strides[1] = param.stride[0]; + } else { + LOG(FATAL) << "Unsupported stride dim"; + } + mkldnn::memory::dims padding{0, 0}; + if (param.pad.ndim() == 2) { + padding[0] = param.pad[0]; + padding[1] = param.pad[1]; + } else if (param.pad.ndim() == 1) { + padding[0] = param.pad[0]; + padding[1] = param.pad[0]; + } else { + LOG(FATAL) << "Unsupported pad dim"; + } + mkldnn::memory::dims dilate{0, 0}; + if (param.dilate.ndim() == 2) { + dilate[0] = param.dilate[0] - 1; + dilate[1] = param.dilate[1] - 1; + } + auto bwd_pd = GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine, + strides, padding, dilate); + mkldnn::convolution_backward_data::desc desc(mkldnn::algorithm::convolution_direct, + out_md, weight_md, data_md, strides, dilate, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_data::primitive_desc(desc, engine, bwd_pd); +} + +static mkldnn::convolution_forward::primitive_desc GetDeconvBwdData( + const DeconvolutionParam ¶m, const NDArray &data, const NDArray &weights, + bool has_bias, const NDArray &output) { + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weights, param.num_group); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Get()->get_engine(); + mkldnn::memory::dims strides{0, 0}; + if (param.stride.ndim() == 2) { + strides[0] = param.stride[0]; + strides[1] = param.stride[1]; + } else if (param.stride.ndim() == 1) { + strides[0] = param.stride[0]; + strides[1] = param.stride[0]; + } else { + LOG(FATAL) << "Unsupported stride dim"; + } + mkldnn::memory::dims padding{0, 0}; + if (param.pad.ndim() == 2) { + padding[0] = param.pad[0]; + padding[1] = param.pad[1]; + } else if (param.pad.ndim() == 1) { + padding[0] = param.pad[0]; + padding[1] = param.pad[0]; + } else { + LOG(FATAL) << "Unsupported pad dim"; + } + 
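+  // MKL-DNN encodes dilation as (dilate - 1), i.e. 0 means no dilation,
+  // hence the decrement below.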
mkldnn::memory::dims dilate{0, 0}; + if (param.dilate.ndim() == 2) { + dilate[0] = param.dilate[0] - 1; + dilate[1] = param.dilate[1] - 1; + } + return GetDeconvBwd_(data_md, weight_md, has_bias, out_md, engine, + strides, padding, dilate); +} + +static mkldnn::convolution_backward_weights::primitive_desc GetDeconvBwdWeights( + const DeconvolutionParam& param, const NDArray &data, const NDArray &weights, + bool has_bias, const NDArray &output, + const mkldnn::convolution_forward::primitive_desc &fwd_pd) { + auto data_md = GetMemDesc(data); + auto weight_md = GetWeightDesc(weights, param.num_group); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Get()->get_engine(); + mkldnn::memory::dims strides{0, 0}; + if (param.stride.ndim() == 2) { + strides[0] = param.stride[0]; + strides[1] = param.stride[1]; + } else if (param.stride.ndim() == 1) { + strides[0] = param.stride[0]; + strides[1] = param.stride[0]; + } else { + LOG(FATAL) << "Unsupported stride dim"; + } + mkldnn::memory::dims padding{0, 0}; + if (param.pad.ndim() == 2) { + padding[0] = param.pad[0]; + padding[1] = param.pad[1]; + } else if (param.pad.ndim() == 1) { + padding[0] = param.pad[0]; + padding[1] = param.pad[0]; + } else { + LOG(FATAL) << "Unsupported pad dim"; + } + mkldnn::memory::dims dilate{0, 0}; + if (param.dilate.ndim() == 2) { + dilate[0] = param.dilate[0] - 1; + dilate[1] = param.dilate[1] - 1; + } + if (!has_bias) { + mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, + out_md, weight_md, data_md, strides, dilate, padding, padding, mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); + } else { + auto bias_md = GetBiasDesc(data_md); + mkldnn::convolution_backward_weights::desc desc(mkldnn::algorithm::convolution_direct, + out_md, weight_md, bias_md, data_md, strides, dilate, padding, padding, + mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc(desc, engine, fwd_pd); + } +} + +class MKLDNNDeconvForward { + std::shared_ptr fwd; + std::shared_ptr data; + std::shared_ptr weight; + std::shared_ptr bias; + std::shared_ptr out; + OutDataOp data_op; + + public: + MKLDNNDeconvForward(const DeconvolutionParam& param, + const NDArray &data, + const NDArray &weights, + bool has_bias, + const NDArray &output); + void SetDataHandle(const DeconvolutionParam& param, + const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data); + + void Execute(const std::vector &out_data); + + private: + mkldnn::convolution_backward_data::primitive_desc fwd_pd; +}; // class MKLDNNDeconvForward + +MKLDNNDeconvForward::MKLDNNDeconvForward(const DeconvolutionParam& param, + const NDArray &data, + const NDArray &weights, + bool has_bias, + const NDArray &output) + :fwd_pd(GetDeconvFwdImpl(param, data, weights, has_bias, output)) { + this->data = std::shared_ptr(new mkldnn::memory( + fwd_pd.diff_dst_primitive_desc())); + this->weight = std::shared_ptr(new mkldnn::memory( + fwd_pd.weights_primitive_desc())); + this->out = std::shared_ptr(new mkldnn::memory( + fwd_pd.diff_src_primitive_desc())); + this->fwd = std::shared_ptr( + new mkldnn::convolution_backward_data(fwd_pd, + mkldnn::primitive::at(*this->data), + mkldnn::primitive::at(*this->weight), + *this->out)); +} + +void MKLDNNDeconvForward::SetDataHandle(const DeconvolutionParam& param, + const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + 
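+  // Deconvolution forward reuses the convolution backward-data primitive, so
+  // the deconvolution input binds to diff_dst and its output binds to diff_src.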
auto data_mem = in_data[deconv::kData].GetMKLDNNDataReorder( + fwd_pd.diff_dst_primitive_desc()); + const mkldnn::memory *weight_mem; + if (ctx.is_train) { + // TODO(zhengda) kvstore doesn't handle MKLDNN correctly. Let's reorder it + // to the default format for now. + if (in_data[deconv::kWeight].IsMKLDNNData()) + const_cast(in_data[deconv::kWeight]).Reorder2Default(); + weight_mem = GetWeights(in_data[deconv::kWeight], + fwd_pd.weights_primitive_desc(), + param.num_group); + } else { + // For inference, we want to reorder the weight array so we don't need to + // reorder data every time. + const_cast(in_data[deconv::kWeight]).MKLDNNDataReorder( + fwd_pd.weights_primitive_desc()); + weight_mem = in_data[deconv::kWeight].GetMKLDNNData(); + } + auto out_mem = CreateMKLDNNMem(out_data[deconv::kOut], + fwd_pd.diff_src_primitive_desc(), req[deconv::kOut]); + auto output = out_mem.second; + this->data->set_data_handle(data_mem->get_data_handle()); + this->weight->set_data_handle(weight_mem->get_data_handle()); + this->out->set_data_handle(output->get_data_handle()); + this->data_op = out_mem.first; +} + +void MKLDNNDeconvForward::Execute(const std::vector &out_data) { + MKLDNNStream::Get()->RegisterPrim(*fwd); + CommitOutput(out_data[deconv::kOut], mkldnn_output_t(this->data_op, this->out.get())); + MKLDNNStream::Get()->Submit(); +} + +static void MKLDNNDeconvFwdBiasPostProcess(const DeconvolutionParam& param, + const OpContext &ctx, + const std::vector &in_data, + const std::vector &out_data) { + // add bias, broadcast bias to dim 1: channel + if (!param.no_bias) { + // MKLDNN only supports float right now. + typedef float DType; + Stream *s = ctx.get_stream(); + Tensor bias = in_data[deconv::kBias].data().get(s); + // If the output data is stored in a special MKLDNN format, data() + // automatically converts its format to the default format. + // Unfortunately, MKLDNN doesn't support broadcast. + Tensor out_cpu = out_data[deconv::kOut].data().get(s); + out_cpu += mshadow::expr::broadcast<1>(bias, out_cpu.shape_); + } +} + +typedef MKLDNNParamOpSign MKLDNNDeconvSignature; + +static inline MKLDNNDeconvForward &GetDeconvFwd( + const nnvm::NodeAttrs& attrs, const NDArray &data, + const NDArray &weights, const NDArray *bias, + const NDArray &output) { + static thread_local + std::unordered_map fwds; + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + MKLDNNDeconvSignature key(param); + // Here we can sign the conv op with NDArray because conv primitive will + // decide the right layout for the, so we only need to get the shape and the + // data type of the arrays. + key.AddSign(data); + key.AddSign(weights); + key.AddSign(output); + if (bias) + key.AddSign(*bias); + + auto it = fwds.find(key); + if (it == fwds.end()) { + bool has_bias = (bias != nullptr); + MKLDNNDeconvForward fwd(param, data, weights, has_bias, output); + auto ins_ret = fwds.insert( + std::pair(key, fwd)); + CHECK(ins_ret.second); + it = ins_ret.first; + } + return it->second; +} + +void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + + MKLDNNDeconvForward &deconvFwd = GetDeconvFwd( + attrs, in_data[deconv::kData], in_data[deconv::kWeight], + param.no_bias ? 
nullptr : &in_data[deconv::kBias], out_data[deconv::kOut]); + + deconvFwd.SetDataHandle(param, ctx, in_data, req, out_data); + + deconvFwd.Execute(out_data); + + MKLDNNDeconvFwdBiasPostProcess(param, ctx, in_data, out_data); +} + +void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + TmpMemMgr::Get()->Init(ctx.requested[deconv::kTempSpace]); + const std::vector &in_grad = outputs; + const DeconvolutionParam& param = nnvm::get(attrs.parsed); + CHECK_NE(req[deconv::kWeight], kWriteInplace) << "cannot write weight inplace"; + mkldnn::convolution_forward::primitive_desc bwdData_pd = GetDeconvBwdData( + param, inputs[deconv::kData + 1], inputs[deconv::kWeight + 1], false, + inputs[deconv::kOut]); + auto out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder( + bwdData_pd.src_primitive_desc()); + if (req[deconv::kData]) { + auto weight_mem = GetWeights(inputs[deconv::kWeight + 1], + bwdData_pd.weights_primitive_desc(), + param.num_group); + auto in_grad_mem = CreateMKLDNNMem(in_grad[deconv::kData], + bwdData_pd.dst_primitive_desc(), + req[deconv::kData]); + MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_forward(bwdData_pd, + *out_grad_mem, *weight_mem, *in_grad_mem.second)); + CommitOutput(in_grad[deconv::kData], in_grad_mem); + } + if (req[deconv::kWeight]) { + mkldnn::convolution_backward_weights::primitive_desc bwdWeights_pd + = GetDeconvBwdWeights(param, inputs[deconv::kData + 1], + inputs[deconv::kWeight + 1], false, inputs[deconv::kOut], bwdData_pd); + if (bwdData_pd.src_primitive_desc() != bwdWeights_pd.src_primitive_desc()) + out_grad_mem = inputs[deconv::kOut].GetMKLDNNDataReorder( + bwdWeights_pd.src_primitive_desc()); + auto data_mem = inputs[deconv::kData + 1].GetMKLDNNDataReorder( + bwdWeights_pd.diff_dst_primitive_desc()); + auto in_grad_weight = CreateMKLDNNWeightGrad(in_grad[deconv::kWeight], + bwdWeights_pd.diff_weights_primitive_desc(), + req[deconv::kWeight]); + MKLDNNStream::Get()->RegisterPrim(mkldnn::convolution_backward_weights( + bwdWeights_pd, *out_grad_mem, *data_mem, *in_grad_weight.second)); + CommitOutput(in_grad[deconv::kWeight], in_grad_weight); + } + MKLDNNStream::Get()->Submit(); + if (!param.no_bias) { + typedef float DType; + Stream *s = ctx.get_stream(); + Tensor gbias = in_grad[deconv::kBias].data().get(s); + // If there is bias, the out grad has already been converted to the default + // format, so this shouldn't cause any performance issues. + Tensor grad = inputs[deconv::kOut].data().get(s); + Assign(gbias, req[deconv::kBias], mshadow::expr::sumall_except_dim<1>(grad)); + } +} + +} // namespace op +} // namespace mxnet + +#endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_fully_connected.cc b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc new file mode 100644 index 000000000000..a8b85bbeb151 --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_fully_connected.cc @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_fully_connected.cc + * \brief + * \author Da Zheng +*/ + +#include "../fully_connected-inl.h" +#include "./mkldnn_base-inl.h" + +#if MXNET_USE_MKLDNN == 1 +namespace mxnet { +namespace op { + +inline static mkldnn::inner_product_forward::primitive_desc GetIPFwd( + const NDArray &data, const NDArray &weight, const NDArray *bias, + const mkldnn::memory::desc &out_md) { + auto data_md = GetMemDesc(data); + auto weight_md = GetMemDesc(weight); + auto engine = CpuEngine::Get()->get_engine(); + if (bias) { + auto bias_md = GetMemDesc(*bias); + mkldnn::inner_product_forward::desc ipFwd_desc(mkldnn::prop_kind::forward_training, + data_md, weight_md, bias_md, out_md); + return mkldnn::inner_product_forward::primitive_desc(ipFwd_desc, engine); + } else { + mkldnn::inner_product_forward::desc ipFwd_desc(mkldnn::prop_kind::forward_training, + data_md, weight_md, out_md); + return mkldnn::inner_product_forward::primitive_desc(ipFwd_desc, engine); + } +} + +inline static mkldnn::inner_product_backward_data::primitive_desc GetIpBwdData( + const NDArray &data, const NDArray &weight, const NDArray &output, + mkldnn::inner_product_forward::primitive_desc ipFwd_pd) { + auto data_md = GetMemDesc(data); + auto weight_md = GetMemDesc(weight); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Get()->get_engine(); + mkldnn::inner_product_backward_data::desc desc(data_md, weight_md, out_md); + return mkldnn::inner_product_backward_data::primitive_desc(desc, engine, ipFwd_pd); +} + +inline static mkldnn::inner_product_backward_weights::primitive_desc GetIPBwdWeights( + const NDArray &data, const NDArray &weight, const NDArray *bias, + const NDArray &output, mkldnn::inner_product_forward::primitive_desc ipFwd_pd) { + auto data_md = GetMemDesc(data); + auto weight_md = GetMemDesc(weight); + auto out_md = GetMemDesc(output); + auto engine = CpuEngine::Get()->get_engine(); + if (bias) { + auto bias_md = GetMemDesc(*bias); + mkldnn::inner_product_backward_weights::desc ipBwdWeights_desc(data_md, + weight_md, bias_md, out_md); + return mkldnn::inner_product_backward_weights::primitive_desc( + ipBwdWeights_desc, engine, ipFwd_pd); + } else { + mkldnn::inner_product_backward_weights::desc ipBwdWeights_desc(data_md, + weight_md, out_md); + return mkldnn::inner_product_backward_weights::primitive_desc( + ipBwdWeights_desc, engine, ipFwd_pd); + } +} + +void MKLDNNFCForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data) { + TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]); + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + const TShape& ishape = in_data[fullc::kData].shape(); + const TShape& oshape = out_data[fullc::kOut].shape(); + NDArray weight = in_data[fullc::kWeight]; + NDArray data = in_data[fullc::kData]; + auto out_md = GetMemDesc(out_data[fullc::kOut]); + if (data.shape().ndim() != 2 && !param.flatten) { + data = data.MKLDNNDataReshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1), + ishape[ishape.ndim()-1])); + mkldnn::memory::dims 
out_dims{static_cast(oshape.ProdShape(0, oshape.ndim()-1)), + static_cast(oshape[ishape.ndim()-1])}; + out_md = mkldnn::memory::desc(out_dims, get_mkldnn_type(out_data[fullc::kOut].dtype()), + mkldnn::memory::format::any); + } else if (data.shape().ndim() != 2) { + data = data.MKLDNNDataReshape(Shape2(ishape[0], ishape.ProdShape(1, ishape.ndim()))); + mkldnn::memory::dims out_dims{static_cast(oshape[0]), + static_cast(oshape.ProdShape(1, oshape.ndim()))}; + out_md = mkldnn::memory::desc(out_dims, get_mkldnn_type(out_data[fullc::kOut].dtype()), + mkldnn::memory::format::any); + } + + mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data, weight, + param.no_bias ? nullptr : &in_data[fullc::kBias], out_md); + auto data_mem = data.GetMKLDNNDataReorder(ipFwd_pd.src_primitive_desc()); + auto weight_mem = weight.GetMKLDNNDataReorder(ipFwd_pd.weights_primitive_desc()); + auto out_mem = CreateMKLDNNMem(out_data[fullc::kOut], + ipFwd_pd.dst_primitive_desc(), req[fullc::kOut]); + if (param.no_bias) { + MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_forward( + ipFwd_pd, *data_mem, *weight_mem, *out_mem.second)); + } else { + auto bias_mem = in_data[fullc::kBias].GetMKLDNNDataReorder(ipFwd_pd.bias_primitive_desc()); + MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_forward(ipFwd_pd, + *data_mem, *weight_mem, *bias_mem, *out_mem.second)); + } + CommitOutput(out_data[fullc::kOut], out_mem); + MKLDNNStream::Get()->Submit(); +} + +void MKLDNNFCBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + TmpMemMgr::Get()->Init(ctx.requested[fullc::kTempSpace]); + const std::vector &in_grad = outputs; + const FullyConnectedParam& param = nnvm::get(attrs.parsed); + const TShape& ishape = inputs[fullc::kData + 1].shape(); + const TShape& oshape = inputs[fullc::kOut].shape(); + + NDArray weight = inputs[fullc::kWeight + 1]; + NDArray data = inputs[fullc::kData + 1]; + if (data.shape().ndim() != 2 && !param.flatten) + data = data.MKLDNNDataReshape(Shape2(ishape.ProdShape(0, ishape.ndim()-1), + ishape[ishape.ndim()-1])); + else if (data.shape().ndim() != 2) + data = data.MKLDNNDataReshape(Shape2(ishape[0], + ishape.ProdShape(1, ishape.ndim()))); + NDArray out_grad = inputs[fullc::kOut]; + if (out_grad.shape().ndim() != 2 && !param.flatten) + out_grad = out_grad.MKLDNNDataReshape(Shape2(oshape.ProdShape(0, oshape.ndim()-1), + oshape[oshape.ndim()-1])); + else if (out_grad.shape().ndim() != 2) + out_grad = out_grad.MKLDNNDataReshape(Shape2(oshape[0], + oshape.ProdShape(1, oshape.ndim()))); + + mkldnn::inner_product_forward::primitive_desc ipFwd_pd = GetIPFwd(data, weight, + param.no_bias ? 
nullptr : &in_grad[fullc::kBias], GetMemDesc(out_grad)); + + CHECK_NE(req[fullc::kWeight], kWriteInplace) << "cannot write weight inplace"; + if (req[fullc::kData]) { + mkldnn::inner_product_backward_data::primitive_desc ipBwdData_pd = GetIpBwdData( + data, weight, out_grad, ipFwd_pd); + auto out_grad_mem = out_grad.GetMKLDNNDataReorder( + ipBwdData_pd.diff_dst_primitive_desc()); + auto weight_mem = weight.GetMKLDNNDataReorder(ipBwdData_pd.weights_primitive_desc()); + auto in_grad_mem = CreateMKLDNNMem(in_grad[fullc::kData], + ipBwdData_pd.diff_src_primitive_desc(), + req[fullc::kData]); + MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_data( + ipBwdData_pd, *out_grad_mem, *weight_mem, *in_grad_mem.second)); + CommitOutput(in_grad[fullc::kData], in_grad_mem); + } + if (req[fullc::kWeight]) { + mkldnn::inner_product_backward_weights::primitive_desc ipBwdWeights_pd + = GetIPBwdWeights(data, weight, param.no_bias ? nullptr : &in_grad[fullc::kBias], + out_grad, ipFwd_pd); + auto out_grad_mem = out_grad.GetMKLDNNDataReorder( + ipBwdWeights_pd.diff_dst_primitive_desc()); + auto data_mem = data.GetMKLDNNDataReorder(ipBwdWeights_pd.src_primitive_desc()); + auto in_grad_weight = CreateMKLDNNWeightGrad(in_grad[fullc::kWeight], + ipBwdWeights_pd.diff_weights_primitive_desc(), + req[fullc::kWeight]); + mkldnn_output_t in_grad_bias; + if (param.no_bias) { + MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_weights( + ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second)); + } else { + in_grad_bias = CreateMKLDNNMem(in_grad[fullc::kBias], + ipBwdWeights_pd.diff_bias_primitive_desc(), + req[fullc::kBias]); + MKLDNNStream::Get()->RegisterPrim(mkldnn::inner_product_backward_weights( + ipBwdWeights_pd, *data_mem, *out_grad_mem, *in_grad_weight.second, + *in_grad_bias.second)); + } + CommitOutput(in_grad[fullc::kWeight], in_grad_weight); + CommitOutput(in_grad[fullc::kBias], in_grad_bias); + } + MKLDNNStream::Get()->Submit(); +} + +} // namespace op +} // namespace mxnet +#endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_lrn-inl.h b/src/operator/nn/mkldnn/mkldnn_lrn-inl.h new file mode 100644 index 000000000000..9a9bf62b67d0 --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_lrn-inl.h @@ -0,0 +1,143 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file mkldnn_lrn-inl.h + * \brief + * \Author: Patric Zhao, patric.zhao@intel.com +*/ +#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_ +#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H_ + +#if MXNET_USE_MKLDNN == 1 +#include +#include "../lrn-inl.h" +#include "./mkldnn_base-inl.h" + +namespace mxnet { +namespace op { + +inline algorithm GetMKLDNNLRNAlgo(const LRNParam ¶m) { + // TODO(Patric): lrn_within_channel will cause core dump in MKLDNN backward + // Need to confirm with MKLDNN team and fix later + return algorithm::lrn_across_channels; +} + +inline lrn_forward::primitive_desc GetLRNFwd(const LRNParam ¶m, + const bool is_train, + const memory::desc &src_md) { + const auto engine = CpuEngine::Get()->get_engine(); + const auto alg = GetMKLDNNLRNAlgo(param); + const float alpha = param.alpha; + const float beta = param.beta; + const int nsize = param.nsize; + const float k = param.knorm; + auto kind = prop_kind::forward_training; + if (is_train) { + kind = prop_kind::forward_training; + } else { + kind = prop_kind::forward_scoring; + } + lrn_forward::desc fwd_desc(kind, alg, src_md, nsize, alpha, beta, k); + return mkldnn::lrn_forward::primitive_desc(fwd_desc, engine); +} + +inline mkldnn::lrn_backward::primitive_desc +GetLRNBwd(const LRNParam ¶m, + const mkldnn::memory::desc &diff_in_md, + const mkldnn::memory::desc &diff_md, + const lrn_forward::primitive_desc &lrnFwd_desc) { + const auto engine = CpuEngine::Get()->get_engine(); + const auto alg = GetMKLDNNLRNAlgo(param); + const float alpha = param.alpha; + const float beta = param.beta; + const int nsize = param.nsize; + const float k = param.knorm; + + lrn_backward::desc lrnBwd_desc(alg, diff_in_md, + diff_md, nsize, alpha, beta, k); + return mkldnn::lrn_backward::primitive_desc(lrnBwd_desc, + engine, lrnFwd_desc); +} + +void MKLDNNLRNForward(const OpContext &ctx, + const LRNParam ¶m, + const NDArray &in_data, + const OpReqType req, + const NDArray &out_data) { + auto src_mem = in_data.GetMKLDNNData(); + const auto src_md = src_mem->get_primitive_desc().desc(); + const auto pdesc = GetLRNFwd(param, ctx.is_train, src_md); + auto dst_mem = const_cast(out_data).CreateMKLDNNData( + pdesc.dst_primitive_desc()); + if (ctx.is_train) { + std::shared_ptr ws_mem( + new mkldnn::memory(pdesc.workspace_primitive_desc())); + MKLDNNStream::Get()->RegisterPrim( + lrn_forward(pdesc, mkldnn::primitive::at(*src_mem), + *ws_mem, *dst_mem)); + MKLDNNStream::Get()->Submit(); + } else { + MKLDNNStream::Get()->RegisterPrim( + lrn_forward(pdesc, mkldnn::primitive::at(*src_mem), *dst_mem)); + MKLDNNStream::Get()->Submit(); + } +} + +void MKLDNNLRNBackward(const OpContext &ctx, const LRNParam ¶m, + const NDArray &out_grad, + const NDArray &in_data, + const OpReqType req, + const NDArray &in_grad) { + if (req == kNullOp) { + return; + } + // Repeat FW for getting workspace + auto data_mem = in_data.GetMKLDNNData(); + const auto data_md = data_mem->get_primitive_desc().desc(); + const auto pdesc_fwd = GetLRNFwd(param, ctx.is_train, data_md); + + // TODO(Patric): To keep the function stateless, we can't pass workspace + // from LRN forward to backward. We have to re-compute + // LRN forward to get the workspace. + // Will refine this code later. 
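+  // Re-run the forward primitive into a throwaway destination purely to
+  // regenerate the workspace that lrn_backward consumes below.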
+ std::shared_ptr ws_mem( + new mkldnn::memory(pdesc_fwd.workspace_primitive_desc())); + std::shared_ptr dst_temp( + new mkldnn::memory(pdesc_fwd.dst_primitive_desc())); + MKLDNNStream::Get()->RegisterPrim( + lrn_forward(pdesc_fwd, mkldnn::primitive::at(*data_mem), + *ws_mem, *dst_temp)); + + const auto data_in_md = pdesc_fwd.src_primitive_desc().desc(); + auto diff_mem = out_grad.GetMKLDNNData(); + const auto diff_md = diff_mem->get_primitive_desc().desc(); + const auto pdesc_bwd = GetLRNBwd(param, data_in_md, diff_md, pdesc_fwd); + auto diff_src_mem = CreateMKLDNNMem(in_grad, + pdesc_bwd.diff_src_primitive_desc(), req); + + MKLDNNStream::Get()->RegisterPrim( + lrn_backward(pdesc_bwd, mkldnn::primitive::at(*data_mem), + mkldnn::primitive::at(*diff_mem), *ws_mem, *diff_src_mem.second)); + MKLDNNStream::Get()->Submit(); +} +} // namespace op +} // namespace mxnet +#endif // MXNET_USE_MKLDNN == 1 +#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_LRN_INL_H__ diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h new file mode 100644 index 000000000000..9149cb0c6a94 --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_ops-inl.h + * \brief + * \author Da Zheng +*/ + +#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ +#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ + +#if MXNET_USE_MKLDNN == 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace mxnet { +namespace op { + +/* For fully connected. */ +void MKLDNNFCForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data); +void MKLDNNFCBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs); + +/* For convolution. 
*/ +void MKLDNNConvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data); +void MKLDNNConvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); + +/* For deconvolution */ +void MKLDNNDeconvolutionForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data); +void MKLDNNDeconvolutionBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); + +/* For softmax */ +void MKLDNNSoftmaxForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const NDArray &in_data, const OpReqType &req, + const NDArray &out_data); + +/* For sum */ +void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &inputs, const OpReqType &req, + const NDArray &out_data); + +/* For copy */ +void MKLDNNCopy(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const NDArray &in_data, const OpReqType &req, + const NDArray &out_data); + +/* For concat */ +void MKLDNNConcatForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &in_data, + const std::vector &req, + const std::vector &out_data); +void MKLDNNConcatBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs); + +/* For activation */ +void MKLDNNActivationForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const NDArray &in_data, const OpReqType &req, + const NDArray &out_data); +void MKLDNNActivationBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const NDArray &out_grad, const NDArray &in_data, + const OpReqType &req, const NDArray &in_grad); + +void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2, + const mkldnn::memory &out); + +} // namespace op +} // namespace mxnet +#endif // MXNET_USE_MKLDNN == 1 + +#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_OPS_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_pooling-inl.h b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h new file mode 100644 index 000000000000..4f2f71866e14 --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_pooling-inl.h @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file mkldnn_pooling-inl.h + * \brief +*/ +#ifndef MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_ +#define MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_ + +#if MXNET_USE_MKLDNN == 1 + +#include +#include +#include "../pooling-inl.h" +#include "./mkldnn_base-inl.h" + +namespace mxnet { +namespace op { + +class MKLDNNPoolingFwd { + public: + MKLDNNPoolingFwd(const mxnet::NDArray &input, + const mxnet::NDArray &output, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int padding_t, const int padding_b, + const int padding_l, const int padding_r, + const mkldnn::algorithm alg_kind, + const bool with_workspace, const bool is_train) : + is_train_(is_train), + with_workspace_(with_workspace), + alg_kind_(alg_kind), + fwd_(nullptr), data_(nullptr), out_(nullptr), workspace_(nullptr) { + Init(input, output, + kernel_h, kernel_w, stride_h, stride_w, + padding_t, padding_b, padding_l, padding_r); + } + + ~MKLDNNPoolingFwd() {} + void SetDataHandle(const mxnet::NDArray &data, + const mxnet::NDArray &output, + const mxnet::NDArray *workspace = nullptr); + void Execute(); + + private: + bool is_train_; + bool with_workspace_; + mkldnn::algorithm alg_kind_; + std::shared_ptr fwd_pd_; + std::shared_ptr fwd_; + std::shared_ptr data_; + std::shared_ptr out_; + std::shared_ptr workspace_; + + private: + void Init(const mxnet::NDArray &input, + const mxnet::NDArray &output, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int padding_t, const int padding_b, + const int padding_l, const int padding_r); +}; + +inline bool SupportMKLDNNPooling(const PoolingParam ¶m) { + return param.kernel.ndim() == 2 && + (param.pool_type == pool_enum::kMaxPooling || + param.pool_type == pool_enum::kAvgPooling) + // This is a temporary fix. There is a bug in global pooling of MKLDNN. + && !param.global_pool; +} + +inline bool SupportMKLDNNPooling(const PoolingParam ¶m, + const TShape &dshape) { + bool ret = SupportMKLDNNPooling(param); + if (!ret) + return false; + + if (param.pooling_convention == pool_enum::kValid) + return true; + + if (((dshape[2] + 2 * param.pad[0] - param.kernel[0]) % param.stride[0] == 0) && + ((dshape[3] + 2 * param.pad[1] - param.kernel[1]) % param.stride[1] == 0)) + return true; + else + return false; +} + +inline bool MKLDNNRequireWorkspace(const PoolingParam ¶m) { + return param.pool_type != pool_enum::kAvgPooling; +} + +typedef MKLDNNParamOpSign MKLDNNPoolingSignature; +void MKLDNNPoolingCompute(const OpContext &ctx, const PoolingParam ¶m, + const NDArray &in_data, const OpReqType req, + const NDArray &out_data, const NDArray *workspace); + +void MKLDNNPoolingGradCompute(const OpContext &ctx, const PoolingParam ¶m, + const NDArray &out_grad, const NDArray &in_data, + const NDArray *workspace, const OpReqType req, + const NDArray &in_grad); +} // namespace op +} // namespace mxnet +#endif // MXNET_USE_MKLDNN == 1 +#endif // MXNET_OPERATOR_NN_MKLDNN_MKLDNN_POOLING_INL_H_ diff --git a/src/operator/nn/mkldnn/mkldnn_pooling.cc b/src/operator/nn/mkldnn/mkldnn_pooling.cc new file mode 100644 index 000000000000..6eeecaf07271 --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_pooling.cc @@ -0,0 +1,322 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_pooling.cc + * \brief + * \author Tao Lv +*/ + +#if MXNET_USE_MKLDNN == 1 + +#include "./mkldnn_pooling-inl.h" + +namespace mxnet { +namespace op { + +void MKLDNNPoolingFwd::Init(const mxnet::NDArray &input, const mxnet::NDArray &output, + const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, + const int padding_t, const int padding_b, + const int padding_l, const int padding_r) { + // mkldnn::memory::desc + auto src_md = input.GetMKLDNNData()->get_primitive_desc().desc(); + mkldnn::memory::dims dims = {src_md.data.dims[0], + src_md.data.dims[1], + static_cast(output.shape()[2]), + static_cast(output.shape()[3])}; + auto dst_md = mkldnn::memory::desc({dims}, + static_cast(src_md.data.data_type), + static_cast(src_md.data.format)); + const mkldnn::engine engine = CpuEngine::Get()->get_engine(); + const mkldnn::algorithm alg_kind = this->alg_kind_; + if (alg_kind != mkldnn::algorithm::pooling_max && + alg_kind != mkldnn::algorithm::pooling_avg && + alg_kind != mkldnn::algorithm::pooling_avg_include_padding && + alg_kind != mkldnn::algorithm::pooling_avg_exclude_padding) { + LOG(FATAL) << "MKLDNN Pooling: algorithm is not supported"; + } + + mkldnn::prop_kind prop = mkldnn::prop_kind::forward_scoring; + if (this->is_train_ && alg_kind != mkldnn::algorithm::pooling_avg) { + prop = mkldnn::prop_kind::forward_training; + } + if (this->is_train_ && prop == mkldnn::prop_kind::forward_scoring) { + LOG(INFO) << "MKLDNN Pooling: training with prop_kind is forward_scoring"; + } + + const mkldnn::memory::dims strides = {stride_h, stride_w }; + const mkldnn::memory::dims pad_l = {padding_t, padding_l }; + const mkldnn::memory::dims pad_r = {padding_b, padding_r }; + const mkldnn::memory::dims kernel = {kernel_h, kernel_w }; + // mkldnn::pooling_forward::desc + const auto fwd_desc = mkldnn::pooling_forward::desc(prop, alg_kind, src_md, dst_md, + strides, kernel, pad_l, pad_r, + mkldnn::padding_kind::zero); + this->fwd_pd_.reset(new mkldnn::pooling_forward::primitive_desc(fwd_desc, engine)); + this->data_.reset(new mkldnn::memory(input.GetMKLDNNData()->get_primitive_desc())); + this->out_.reset(new mkldnn::memory(this->fwd_pd_->dst_primitive_desc())); + if (this->with_workspace_) { + this->workspace_.reset(new mkldnn::memory(this->fwd_pd_->workspace_primitive_desc())); + this->fwd_.reset(new mkldnn::pooling_forward(*(this->fwd_pd_), + mkldnn::primitive::at(*(this->data_)), + *(this->out_), + *(this->workspace_))); + } else { + this->fwd_.reset(new mkldnn::pooling_forward(*(this->fwd_pd_), + mkldnn::primitive::at(*(this->data_)), + *(this->out_))); + } + return; +} + +void MKLDNNPoolingFwd::SetDataHandle(const mxnet::NDArray &data, + const mxnet::NDArray &output, + const mxnet::NDArray *workspace) { + // mkldnn::memory + auto data_mem = data.GetMKLDNNData(); + auto out_mem = const_cast(output).CreateMKLDNNData( + this->fwd_pd_->dst_primitive_desc()); + 
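+  // Only the data handles are updated here; the primitive and its memory
+  // objects were created once in Init() and are reused across calls.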
  this->data_->set_data_handle(data_mem->get_data_handle());
+  this->out_->set_data_handle(out_mem->get_data_handle());
+  if (this->with_workspace_ && workspace == nullptr) {
+    LOG(FATAL) << "MKLDNN Pooling: incorrect workspace input";
+  }
+
+  if (this->with_workspace_) {
+    // mkldnn::memory
+    auto ws_mem = workspace->GetMKLDNNData();
+    this->workspace_->set_data_handle(ws_mem->get_data_handle());
+  }
+}
+
+void MKLDNNPoolingFwd::Execute() {
+  if (this->fwd_) {
+    MKLDNNStream::Get()->RegisterPrim(*(this->fwd_));
+    MKLDNNStream::Get()->Submit();
+  } else {
+    LOG(FATAL) << "MKLDNN Pooling: forward primitive is nullptr";
+  }
+}
+
+mkldnn::algorithm GetMKLDNNPoolAlgo(const PoolingParam &param) {
+  switch (param.pool_type) {
+    case pool_enum::kMaxPooling:
+      return mkldnn::algorithm::pooling_max;
+      break;
+    case pool_enum::kAvgPooling:
+      return mkldnn::algorithm::pooling_avg_include_padding;
+      break;
+    default:
+      LOG(FATAL) << "MKLDNN Pooling: Unknown pooling method.";
+      return mkldnn::algorithm::pooling_max;
+  }
+}
+
+mkldnn::pooling_forward::primitive_desc GetPoolingFwd(const PoolingParam &param,
+                                                      const bool is_train,
+                                                      const memory::desc &data_md,
+                                                      const memory::desc &out_md) {
+  CHECK_EQ(param.kernel.ndim(), 2) << "Not Implemented";
+  int kernel_h_, kernel_w_;
+  if (param.global_pool) {
+    kernel_h_ = data_md.data.dims[2];
+    kernel_w_ = data_md.data.dims[3];
+  } else {
+    kernel_h_ = param.kernel[0];
+    kernel_w_ = param.kernel[1];
+  }
+
+  CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+  CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+
+  const int pad_t_ = param.pad[0], pad_b_ = param.pad[0];
+  const int pad_l_ = param.pad[1], pad_r_ = param.pad[1];
+  const int stride_h_ = param.stride[0], stride_w_ = param.stride[1];
+
+  const mkldnn::engine engine = CpuEngine::Get()->get_engine();
+  if (param.global_pool) {
+    CHECK(pad_t_ == 0 && pad_l_ == 0 && stride_h_ == 1 && stride_w_ == 1)
+        << "With Global_pooling: true; only pad = 0 and stride = 1";
+  }
+  if (pad_t_ != 0 || pad_l_ != 0) {
+    CHECK(param.pool_type == pool_enum::kAvgPooling ||
+          param.pool_type == pool_enum::kMaxPooling)
+        << "Padding implemented only for average and max pooling.";
+    CHECK_LT(pad_l_, kernel_w_);
+    CHECK_LT(pad_t_, kernel_h_);
+  }
+
+  const mkldnn::algorithm alg = GetMKLDNNPoolAlgo(param);
+  mkldnn::prop_kind kind = mkldnn::prop_kind::forward_scoring;
+  if (is_train && alg != algorithm::pooling_avg) {
+    kind = mkldnn::prop_kind::forward_training;
+  }
+
+  const pooling_forward::desc poolingFwd_desc(kind, alg, data_md, out_md,
+                                              {static_cast<int>(stride_h_),
+                                               static_cast<int>(stride_w_)},
+                                              {kernel_h_, kernel_w_},
+                                              {static_cast<int>(pad_t_),
+                                               static_cast<int>(pad_l_)},
+                                              {static_cast<int>(pad_b_),
+                                               static_cast<int>(pad_r_)},
+                                              padding_kind::zero);
+  return mkldnn::pooling_forward::primitive_desc(poolingFwd_desc, engine);
+}
+
+MKLDNNPoolingFwd &GetPoolingFwd(const PoolingParam &param,
+                                const bool is_train,
+                                const NDArray &data,
+                                const NDArray &output) {
+  static thread_local std::unordered_map<MKLDNNPoolingSignature,
+                                         MKLDNNPoolingFwd,
+                                         OpHash> pooling_fwds;
+
+  bool with_workspace = is_train && MKLDNNRequireWorkspace(param);
+  MKLDNNPoolingSignature key(param);
+  key.AddSign(is_train);
+  key.AddSign(with_workspace);
+  key.AddSign(data);
+  key.AddSign(output);
+
+  auto it = pooling_fwds.find(key);
+  if (it == pooling_fwds.end()) {
+    CHECK_EQ(param.kernel.ndim(), 2) << "Not Implemented";
+    auto data_md = data.GetMKLDNNData()->get_primitive_desc().desc();
+    int kernel_h_, kernel_w_;
+    if (param.global_pool) {
+      kernel_h_ = data_md.data.dims[2];
+      kernel_w_ = data_md.data.dims[3];
+    } else {
+      kernel_h_ = param.kernel[0];
+      kernel_w_ = param.kernel[1];
+    }
+
+    CHECK_GT(kernel_h_, 0) << "Filter dimensions cannot be zero.";
+    CHECK_GT(kernel_w_, 0) << "Filter dimensions cannot be zero.";
+
+    const int pad_t_ = param.pad[0], pad_b_ = param.pad[0];
+    const int pad_l_ = param.pad[1], pad_r_ = param.pad[1];
+    const int stride_h_ = param.stride[0], stride_w_ = param.stride[1];
+
+    if (param.global_pool) {
+      CHECK(pad_t_ == 0 && pad_l_ == 0 && stride_h_ == 1 && stride_w_ == 1)
+          << "With Global_pooling: true; only pad = 0 and stride = 1";
+    }
+
+    if (pad_t_ != 0 || pad_l_ != 0) {
+      CHECK(param.pool_type == pool_enum::kAvgPooling ||
+            param.pool_type == pool_enum::kMaxPooling)
+          << "Padding implemented only for average and max pooling.";
+      CHECK_LT(pad_l_, kernel_w_);
+      CHECK_LT(pad_t_, kernel_h_);
+    }
+
+    const mkldnn::algorithm alg = GetMKLDNNPoolAlgo(param);
+    MKLDNNPoolingFwd fwd(data, output, kernel_h_, kernel_w_, stride_h_, stride_w_,
+                         pad_t_, pad_b_, pad_l_, pad_r_, alg, with_workspace, is_train);
+    auto ins_ret = pooling_fwds.insert(
+        std::pair<MKLDNNPoolingSignature, MKLDNNPoolingFwd>(key, fwd));
+    CHECK(ins_ret.second);
+    it = ins_ret.first;
+  }
+  return it->second;
+}
+
+void MKLDNNPoolingCompute(const OpContext &ctx, const PoolingParam &param,
+                          const NDArray &in_data, const OpReqType req,
+                          const NDArray &out_data, const NDArray *workspace) {
+  auto fwd = GetPoolingFwd(param, ctx.is_train, in_data, out_data);
+  fwd.SetDataHandle(in_data, out_data, workspace);
+  fwd.Execute();
+}
+
+void MKLDNNPoolingGradCompute(const OpContext &ctx, const PoolingParam &param,
+                              const NDArray &out_grad, const NDArray &in_data,
+                              const NDArray *workspace, const OpReqType req,
+                              const NDArray &in_grad) {
+  if (req == kNullOp) {
+    return;
+  }
+
+  TmpMemMgr::Get()->Init(ctx.requested[0]);
+  // mkldnn::memory
+  auto diff_dst_mem = out_grad.GetMKLDNNData();
+  auto input_mem = in_data.GetMKLDNNData();
+  mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc();
+  const mkldnn::memory::desc data_md = data_mpd.desc();
+  const memory::dims dims = {data_md.data.dims[0], data_md.data.dims[1],
+                             static_cast<int>(out_grad.shape()[2]),
+                             static_cast<int>(out_grad.shape()[3])};
+  const memory::desc out_md({dims},
+                            static_cast<memory::data_type>(data_md.data.data_type),
+                            static_cast<memory::format>(data_md.data.format));
+  auto pdesc_fwd = GetPoolingFwd(param, ctx.is_train, data_md, out_md);
+
+  const mkldnn::memory::desc diff_md = diff_dst_mem->get_primitive_desc().desc();
+  const memory::dims dims1 = {diff_md.data.dims[0], diff_md.data.dims[1],
+                              static_cast<int>(in_grad.shape()[2]),
+                              static_cast<int>(in_grad.shape()[3])};
+  const memory::desc diff_in_md(
+      {dims1}, static_cast<memory::data_type>(diff_md.data.data_type),
+      static_cast<memory::format>(diff_md.data.format));
+  const mkldnn::engine cpu_engine = data_mpd.get_engine();
+  const mkldnn::algorithm alg = GetMKLDNNPoolAlgo(param);
+
+  int kernel_h_, kernel_w_;
+  if (param.global_pool) {
+    kernel_h_ = data_md.data.dims[2];
+    kernel_w_ = data_md.data.dims[3];
+  } else {
+    kernel_h_ = param.kernel[0];
+    kernel_w_ = param.kernel[1];
+  }
+  const pooling_backward::desc desc(alg, diff_in_md, diff_md,
+                                    {static_cast<int>(param.stride[0]),
+                                     static_cast<int>(param.stride[1])},
+                                    {kernel_h_, kernel_w_},
+                                    {static_cast<int>(param.pad[0]),
+                                     static_cast<int>(param.pad[1])},
+                                    {static_cast<int>(param.pad[0]),
+                                     static_cast<int>(param.pad[1])},
+                                    mkldnn::padding_kind::zero);
+  const pooling_backward::primitive_desc pdesc(desc, cpu_engine, pdesc_fwd);
+
+  auto diff_src_mem =
+      CreateMKLDNNMem(in_grad, pdesc.diff_src_primitive_desc(), req);
+
+  if
(MKLDNNRequireWorkspace(param)) { + CHECK(workspace != nullptr); + auto workspace_mem = workspace->GetMKLDNNData(); + MKLDNNStream::Get()->RegisterPrim( + pooling_backward(pdesc, *diff_dst_mem, primitive::at(*workspace_mem), + *diff_src_mem.second)); + } else { + MKLDNNStream::Get()->RegisterPrim( + pooling_backward(pdesc, *diff_dst_mem, *diff_src_mem.second)); + } + CommitOutput(in_grad, diff_src_mem); + MKLDNNStream::Get()->Submit(); +} + +} // namespace op +} // namespace mxnet +#endif // MXNET_USE_MKLDNN == 1 diff --git a/src/operator/nn/mkldnn/mkldnn_softmax.cc b/src/operator/nn/mkldnn/mkldnn_softmax.cc new file mode 100644 index 000000000000..aa59f13d06da --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_softmax.cc @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_softmax.cc + * \brief + * \author Da Zheng +*/ + +#include "../softmax-inl.h" +#include "./mkldnn_ops-inl.h" +#include "./mkldnn_base-inl.h" + +#if MXNET_USE_MKLDNN == 1 +namespace mxnet { +namespace op { + +void MKLDNNSoftmaxForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const NDArray &in_data, const OpReqType &req, + const NDArray &out_data) { + const SoftmaxParam& param = nnvm::get(attrs.parsed); + auto input_mem = in_data.GetMKLDNNData(); + mkldnn::memory::primitive_desc data_mpd = input_mem->get_primitive_desc(); + mkldnn::memory::desc data_md = data_mpd.desc(); + auto cpu_engine = data_mpd.get_engine(); + auto prop = ctx.is_train + ? mkldnn::prop_kind::forward_training : mkldnn::prop_kind::forward_scoring; + mkldnn::softmax_forward::desc desc = mkldnn::softmax_forward::desc(prop, + data_md, param.axis); + mkldnn::softmax_forward::primitive_desc pdesc(desc, cpu_engine); + + auto output_memory = out_data.GetMKLDNNData(); + MKLDNNStream *stream = MKLDNNStream::Get(); + stream->RegisterPrim(mkldnn::softmax_forward(pdesc, *input_mem, *output_memory)); + stream->Submit(); +} + +} // namespace op +} // namespace mxnet +#endif diff --git a/src/operator/nn/mkldnn/mkldnn_sum.cc b/src/operator/nn/mkldnn/mkldnn_sum.cc new file mode 100644 index 000000000000..f3aeacf17dd1 --- /dev/null +++ b/src/operator/nn/mkldnn/mkldnn_sum.cc @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn_sum.cc + * \brief + * \author Da Zheng +*/ +#include + +#include "./mkldnn_ops-inl.h" +#include "./mkldnn_base-inl.h" + +#if MXNET_USE_MKLDNN == 1 +namespace mxnet { +namespace op { + +void Sum(const mkldnn::memory &arr1, const mkldnn::memory &arr2, + const mkldnn::memory &out) { + std::vector input_pds(2); + std::vector scales(2, 1); + std::vector inputs; + input_pds[0] = arr1.get_primitive_desc(); + input_pds[1] = arr2.get_primitive_desc(); + CHECK(input_pds[0] == input_pds[1]); + inputs.push_back(arr1); + inputs.push_back(arr2); + // TODO(zhengda) I need to reorder memory here. + mkldnn::sum::primitive_desc sum_pd(scales, input_pds); + MKLDNNStream::Get()->RegisterPrim(mkldnn::sum(sum_pd, inputs, out)); +} + +void MKLDNNSumForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx, + const std::vector &inputs, const OpReqType &req, + const NDArray &out_data) { + TmpMemMgr::Get()->Init(ctx.requested[0]); + std::vector in_prims; + std::vector in_pds(inputs.size()); + std::vector scales(inputs.size(), 1); + in_prims.reserve(inputs.size()); + for (size_t i = 0; i < inputs.size(); i++) { + auto in_mem = inputs[i].GetMKLDNNData(); + in_prims.push_back(*in_mem); + in_pds[i] = in_mem->get_primitive_desc(); + } + mkldnn::sum::primitive_desc pdesc(scales, in_pds); + + auto out_mem = CreateMKLDNNMem(out_data, pdesc.dst_primitive_desc(), req); + MKLDNNStream *stream = MKLDNNStream::Get(); + stream->RegisterPrim(mkldnn::sum(pdesc, in_prims, *out_mem.second)); + CommitOutput(out_data, out_mem); + stream->Submit(); +} + +} // namespace op +} // namespace mxnet +#endif diff --git a/src/operator/nn/pooling-inl.h b/src/operator/nn/pooling-inl.h index a32aaa2152e9..7a20f026f7b9 100644 --- a/src/operator/nn/pooling-inl.h +++ b/src/operator/nn/pooling-inl.h @@ -21,7 +21,7 @@ * Copyright (c) 2017 by Contributors * \file pooling-inl.h * \brief - * \author Bing Xu, Jun Wu + * \author Bing Xu, Jun Wu, Da Zheng */ #ifndef MXNET_OPERATOR_NN_POOLING_INL_H_ @@ -78,257 +78,138 @@ struct PoolingParam : public dmlc::Parameter { DMLC_DECLARE_FIELD(pad).set_default(TShape()) .describe("Pad for pooling: (y, x) or (d, y, x). 
Defaults to no padding."); } -}; -template -class PoolingOp : public Operator { - public: - explicit PoolingOp(PoolingParam p) { - this->param_ = p; + bool operator==(const PoolingParam& other) const { + return this->kernel == other.kernel && + this->stride == other.stride && + this->pad == other.pad && + this->pool_type == other.pool_type && + this->pooling_convention == other.pooling_convention && + this->global_pool == other.global_pool && + this->cudnn_off == other.cudnn_off; } +}; - virtual void Forward(const OpContext& ctx, - const std::vector& in_data, - const std::vector& req, - const std::vector& out_data, - const std::vector& aux_args) { - using namespace mshadow; - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - Stream *s = ctx.get_stream(); - const TShape& ishape = in_data[pool_enum::kData].shape_; +} // namespace op +} // namespace mxnet - pool(s, in_data[pool_enum::kData].dptr(), - in_data[pool_enum::kData].shape_, - out_data[pool_enum::kOut].shape_, - param_.global_pool? - TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim()) - : param_.kernel, - param_.pad, - param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride, - param_.pool_type, - req[pool_enum::kOut], - out_data[pool_enum::kOut].dptr()); +namespace std { +template<> +struct hash { + size_t operator()(const mxnet::op::PoolingParam& val) { + size_t ret = 0; + ret = dmlc::HashCombine(ret, val.kernel); + ret = dmlc::HashCombine(ret, val.stride); + ret = dmlc::HashCombine(ret, val.pad); + ret = dmlc::HashCombine(ret, val.pool_type); + ret = dmlc::HashCombine(ret, val.pooling_convention); + ret = dmlc::HashCombine(ret, val.global_pool); + ret = dmlc::HashCombine(ret, val.cudnn_off); + return ret; } +}; +} // namespace std - virtual void Backward(const OpContext& ctx, - const std::vector& out_grad, - const std::vector& in_data, - const std::vector& out_data, - const std::vector& req, - const std::vector& in_grad, - const std::vector& aux_args) { - using namespace mshadow; - CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - CHECK_EQ(req.size(), 1U); - CHECK_EQ(in_grad.size(), 1U); - Stream *s = ctx.get_stream(); - const TShape& ishape = in_data[pool_enum::kData].shape_; - - unpool(s, out_grad[pool_enum::kOut].dptr(), - in_data[pool_enum::kData].dptr(), - out_data[pool_enum::kOut].dptr(), - in_grad[pool_enum::kData].shape_, - out_grad[pool_enum::kOut].shape_, - param_.global_pool? - TShape(ishape.data()+ishape.ndim()-param_.kernel.ndim(), ishape.data()+ishape.ndim()) - : param_.kernel, - param_.pad, - param_.global_pool? TShape(param_.kernel.ndim()) : param_.stride, - param_.pool_type, - req[pool_enum::kData], - in_grad[pool_enum::kData].dptr()); - } +namespace mxnet { +namespace op { - private: - PoolingParam param_; -}; // class PoolingOp +/* + * When MKLDNN is enabled, we might want 2 outputs instead of one inputs, which + * also changes the number of inputs for backward. + */ +int GetNumOutputs(const PoolingParam ¶m); +int GetNumBackInputs(const PoolingParam ¶m); -template -Operator* CreateOp(PoolingParam param, int dtype); +template +void PoolingForward(const OpContext& ctx, const PoolingParam ¶m, + const TBlob& in_data, const OpReqType& req, + const TBlob& out_data) { + using namespace mshadow; + Stream *s = ctx.get_stream(); + const TShape& ishape = in_data.shape_; + + pool(s, in_data.dptr(), in_data.shape_, out_data.shape_, + param.global_pool? 
+ TShape(ishape.data()+ishape.ndim()-param.kernel.ndim(), ishape.data()+ishape.ndim()) + : param.kernel, + param.pad, + param.global_pool? TShape(param.kernel.ndim()) : param.stride, + param.pool_type, req, out_data.dptr()); +} +template +void PoolingBackward(const OpContext& ctx, const PoolingParam ¶m, + const TBlob& out_grad, const TBlob& in_data, + const TBlob& out_data, const OpReqType& req, + const TBlob& in_grad) { + using namespace mshadow; + Stream *s = ctx.get_stream(); + const TShape& ishape = in_data.shape_; + + unpool(s, out_grad.dptr(), in_data.dptr(), out_data.dptr(), + in_grad.shape_, out_grad.shape_, + param.global_pool? + TShape(ishape.data()+ishape.ndim()-param.kernel.ndim(), ishape.data()+ishape.ndim()) + : param.kernel, + param.pad, + param.global_pool? TShape(param.kernel.ndim()) : param.stride, + param.pool_type, req, in_grad.dptr()); +} -#if DMLC_USE_CXX11 -class PoolingProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - using namespace mshadow; - param_.Init(kwargs); - if (param_.kernel.ndim() == 1) { - if (param_.stride.ndim() == 0) param_.stride = Shape1(1); - if (param_.pad.ndim() == 0) param_.pad = Shape1(0); - } else if (param_.kernel.ndim() == 2) { - if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0); +template +void PoolingCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const PoolingParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), GetNumOutputs(param)); + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + if (pool_enum::kMaxPooling == param.pool_type + || pool_enum::kAvgPooling == param.pool_type + || pool_enum::kSumPooling == param.pool_type) { + PoolingForward(ctx, param, inputs[0], req[0], outputs[0]); } else { - CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D pooling not supported"; - if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1); - if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0); + LOG(FATAL) << "unknown pooling type"; } - CHECK_EQ(param_.stride.ndim(), param_.kernel.ndim()) - << "stride and kernel should have the same length"; - CHECK_EQ(param_.pad.ndim(), param_.kernel.ndim()) - << "pad and kernel should have the same length"; - } - - std::map GetParams() const override { - return param_.__DICT__(); - } + }); +} - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - CHECK_EQ(in_shape->size(), 1U); - const TShape &dshape = (*in_shape)[0]; - CHECK_GE(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)" - << " Or 4D in (batch, channel, y, x) " - << " Or 5D in (batch, channel, d, y, x)"; - TShape oshape = dshape; - if (dshape.ndim() == 0) return false; - if (param_.kernel.ndim() == 1) { - CHECK_EQ(dshape.ndim(), 3U) << "Pooling: Input data should be 3D in (batch, channel, x)"; - if (param_.global_pool) { - oshape[2] = 1; - } else { - CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]) - << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2] - << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")"; - if (param_.pooling_convention == pool_enum::kValid) { - oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / - param_.stride[0]; - } else { - oshape[2] = 1 + static_cast(ceil(static_cast( - dshape[2] + 2 * param_.pad[0] - - 
param_.kernel[0]) / param_.stride[0])); - } - } - out_shape->clear(); - out_shape->push_back(oshape); // save output shape - } else if (param_.kernel.ndim() == 2) { - CHECK_EQ(dshape.ndim(), 4U) << "Pooling: Input data should be 4D in (batch, channel, y, x)"; - if (param_.global_pool) { - oshape[2] = 1; - oshape[3] = 1; - } else { - CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0]) - << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2] - << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")"; - CHECK(param_.kernel[1] <= dshape[3] + 2 * param_.pad[1]) - << "kernel size (" << param_.kernel[1] << ") exceeds input (" << dshape[3] - << " padded to " << (dshape[3] + 2*param_.pad[1]) << ")"; - if (param_.pooling_convention == pool_enum::kValid) { - oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / - param_.stride[0]; - oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / - param_.stride[1]; - } else { - oshape[2] = 1 + static_cast(ceil(static_cast( - dshape[2] + 2 * param_.pad[0] - - param_.kernel[0]) / param_.stride[0])); - oshape[3] = 1 + static_cast(ceil(static_cast( - dshape[3] + 2 * param_.pad[1] - - param_.kernel[1]) / param_.stride[1])); - } - } - out_shape->clear(); - out_shape->push_back(oshape); // save output shape - } else if (param_.kernel.ndim() == 3) { - CHECK_EQ(dshape.ndim(), 5U) - << "Pooling: Input data should be 5D in (batch, channel, d, y, x)"; - CHECK_LE(param_.kernel[0], dshape[2] + 2 * param_.pad[0]) << "kernel size exceeds input"; - CHECK_LE(param_.kernel[1], dshape[3] + 2 * param_.pad[1]) << "kernel size exceeds input"; - CHECK_LE(param_.kernel[2], dshape[4] + 2 * param_.pad[2]) << "kernel size exceeds input"; - if (param_.global_pool) { - oshape[2] = 1; - oshape[3] = 1; - oshape[4] = 1; - } else { - if (param_.pooling_convention == pool_enum::kValid) { - oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) / - param_.stride[0]; - oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) / - param_.stride[1]; - oshape[4] = 1 + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) / - param_.stride[2]; - } else { - oshape[2] = 1 + static_cast(ceil(static_cast( - dshape[2] + 2 * param_.pad[0] - - param_.kernel[0]) / param_.stride[0])); - oshape[3] = 1 + static_cast(ceil(static_cast( - dshape[3] + 2 * param_.pad[1] - - param_.kernel[1]) / param_.stride[1])); - oshape[4] = 1 + static_cast(ceil(static_cast( - dshape[4] + 2 * param_.pad[2] - - param_.kernel[2]) / param_.stride[2])); - } - } - - out_shape->clear(); - out_shape->push_back(oshape); // save output shape - } - return true; +template +void PoolingGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const PoolingParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), GetNumBackInputs(param)); + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(req.size(), 1U); + off_t ograd_idx, in_data_idx, out_data_idx; + // When MKLDNN is enabled, the input data may contains arrays for workspace. 
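+  // With the extra MKLDNN workspace output, backward receives 5 inputs: the
+  // gradients of the two forward outputs, the forward input, and the two
+  // forward outputs themselves; otherwise it receives the usual 3 inputs.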
+ if (GetNumBackInputs(param) == 5) { + ograd_idx = 0; + in_data_idx = 2; + out_data_idx = 3; + } else { + ograd_idx = 0; + in_data_idx = 1; + out_data_idx = 2; } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_EQ(in_type->size(), 1U); - int dtype = (*in_type)[0]; - - if (dtype == -1) { - LOG(FATAL) << "Input type to pooling is not specified."; - return false; + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + if (pool_enum::kMaxPooling == param.pool_type + || pool_enum::kAvgPooling == param.pool_type + || pool_enum::kSumPooling == param.pool_type) { + PoolingBackward(ctx, param, inputs[ograd_idx], + inputs[in_data_idx], inputs[out_data_idx], + req[0], outputs[0]); + } else { + LOG(FATAL) << "unknown pooling type"; } + }); +} - out_type->clear(); - out_type->push_back(dtype); - return true; - } - - OperatorProperty* Copy() const override { - PoolingProp *prop_sym = new PoolingProp(); - prop_sym->param_ = this->param_; - return prop_sym; - } - - std::string TypeString() const override { - return "Pooling"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[pool_enum::kOut], in_data[pool_enum::kData], - out_data[pool_enum::kOut]}; - } - - std::vector > BackwardInplaceOption( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &in_grad) const override { -#if MXNET_USE_CUDNN == 1 - return {}; -#else - return {{in_data[pool_enum::kData], in_grad[pool_enum::kData]}}; -#endif - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented."; - return NULL; - } - - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; - - private: - PoolingParam param_; -}; // class PoolingProp -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet diff --git a/src/operator/nn/pooling.cc b/src/operator/nn/pooling.cc index 8345ea3886d4..f719e0753e08 100644 --- a/src/operator/nn/pooling.cc +++ b/src/operator/nn/pooling.cc @@ -21,78 +21,300 @@ * Copyright (c) 2017 by Contributors * \file pooling.cc * \brief - * \author Bing Xu, Jun Wu + * \author Bing Xu, Jun Wu, Da Zheng */ +#include "../elemwise_op_common.h" #include "./pooling-inl.h" -#if MXNET_USE_MKL2017 == 1 -#include -#include "../mkl/mkl_memory-inl.h" -#include "../mkl/mkl_pooling-inl.h" -#endif // MXNET_USE_MKL2017 #if MXNET_USE_NNPACK == 1 #include "./nnpack/nnpack_pooling-inl.h" #endif // MXNET_USE_NNPACK +#if MXNET_USE_MKLDNN == 1 +#include "./mkldnn/mkldnn_pooling-inl.h" +#endif // MXNET_USE_MKLDNN namespace mxnet { namespace op { -template<> -Operator *CreateOp(PoolingParam param, int dtype) { - Operator *op = NULL; -#if MXNET_USE_MKL2017 == 1 - if (param.kernel.ndim() == 2 - && ((param.pool_type == pool_enum::kMaxPooling) - || (param.pool_type == pool_enum::kAvgPooling))) { - switch (dtype) { - case mshadow::kFloat32: - return new MKLPoolingOp(param); - case mshadow::kFloat64: - return new MKLPoolingOp(param); - default: - break; +static void PoolingParamParser(nnvm::NodeAttrs *attrs) { + using namespace mshadow; + PoolingParam param; + param.Init(attrs->dict); + if (param.kernel.ndim() == 1) { + if (param.stride.ndim() == 0) param.stride = Shape1(1); + if (param.pad.ndim() == 0) param.pad = Shape1(0); + } else if (param.kernel.ndim() == 2) { + if (param.stride.ndim() == 0) param.stride = Shape2(1, 1); + if 
(param.pad.ndim() == 0) param.pad = Shape2(0, 0); + } else { + CHECK_EQ(param.kernel.ndim(), 3U) << param.kernel.ndim() + << "D pooling not supported"; + if (param.stride.ndim() == 0) param.stride = Shape3(1, 1, 1); + if (param.pad.ndim() == 0) param.pad = Shape3(0, 0, 0); + } + CHECK_EQ(param.stride.ndim(), param.kernel.ndim()) + << "stride and kernel should have the same length"; + CHECK_EQ(param.pad.ndim(), param.kernel.ndim()) + << "pad and kernel should have the same length"; + attrs->parsed = std::move(param); +} + +int GetNumOutputs(const PoolingParam ¶m) { +#if MXNET_USE_MKLDNN == 1 + return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 2 : 1; +#else + return 1; +#endif +} + +int GetNumBackInputs(const PoolingParam ¶m) { +#if MXNET_USE_MKLDNN == 1 + return MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param) ? 5 : 3; +#else + return 3; +#endif +} + +static bool PoolingType(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + out_attrs->at(0) = in_attrs->at(0); +#if MXNET_USE_MKLDNN == 1 + const PoolingParam ¶m = nnvm::get(attrs.parsed); + if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) { + CHECK_GT(out_attrs->size(), 1U); + out_attrs->at(1) = mshadow::kInt32; + } +#endif + return true; +} + +static bool PoolingShape(const nnvm::NodeAttrs &attrs, + std::vector *in_shape, + std::vector *out_shape) { + const PoolingParam ¶m = nnvm::get(attrs.parsed); + CHECK_EQ(in_shape->size(), 1U); + const TShape &dshape = (*in_shape)[0]; + CHECK_GE(dshape.ndim(), 3U) + << "Pooling: Input data should be 3D in (batch, channel, x)" + << " Or 4D in (batch, channel, y, x) " + << " Or 5D in (batch, channel, d, y, x)"; + TShape oshape = dshape; + if (dshape.ndim() == 0) return false; + if (param.kernel.ndim() == 1) { + CHECK_EQ(dshape.ndim(), 3U) + << "Pooling: Input data should be 3D in (batch, channel, x)"; + if (param.global_pool) { + oshape[2] = 1; + } else { + CHECK(param.kernel[0] <= dshape[2] + 2 * param.pad[0]) + << "kernel size (" << param.kernel[0] << ") exceeds input (" + << dshape[2] << " padded to " << (dshape[2] + 2 * param.pad[0]) + << ")"; + if (param.pooling_convention == pool_enum::kValid) { + oshape[2] = 1 + + (dshape[2] + 2 * param.pad[0] - param.kernel[0]) / + param.stride[0]; + } else { + oshape[2] = 1 + static_cast(ceil( + static_cast(dshape[2] + 2 * param.pad[0] - + param.kernel[0]) / + param.stride[0])); } } + out_shape->clear(); + out_shape->push_back(oshape); // save output shape +#if MXNET_USE_MKLDNN == 1 + if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) + out_shape->push_back(oshape); // for workspace #endif -#if MXNET_USE_NNPACK == 1 - // NNPACK only support max-pooling with kernel = 2, stride = 2, pooling_convention - // = kFull(note that the default value is kValid in MXNet) - if ((param.pool_type == pool_enum::kMaxPooling) - && (param.pooling_convention == pool_enum::kFull) - && (param.kernel.ndim() == 2) && (param.stride.ndim() == 2) - && (param.kernel[0] == 2) && (param.kernel[1] == 2) - && (param.stride[0] == 2) && (param.stride[1] == 2)) { - switch (dtype) { - case mshadow::kFloat32: - return new NNPACKPoolingOp(param); - default: - break; + } else if (param.kernel.ndim() == 2) { + CHECK_EQ(dshape.ndim(), 4U) + << "Pooling: Input data should be 4D in (batch, channel, y, x)"; + if (param.global_pool) { + oshape[2] = 1; + oshape[3] = 1; + } else { + CHECK(param.kernel[0] <= dshape[2] + 2 * param.pad[0]) + << "kernel size (" << param.kernel[0] << ") exceeds input (" + << dshape[2] 
<< " padded to " << (dshape[2] + 2 * param.pad[0]) + << ")"; + CHECK(param.kernel[1] <= dshape[3] + 2 * param.pad[1]) + << "kernel size (" << param.kernel[1] << ") exceeds input (" + << dshape[3] << " padded to " << (dshape[3] + 2 * param.pad[1]) + << ")"; + if (param.pooling_convention == pool_enum::kValid) { + oshape[2] = 1 + + (dshape[2] + 2 * param.pad[0] - param.kernel[0]) / + param.stride[0]; + oshape[3] = 1 + + (dshape[3] + 2 * param.pad[1] - param.kernel[1]) / + param.stride[1]; + } else { + oshape[2] = 1 + static_cast(ceil( + static_cast(dshape[2] + 2 * param.pad[0] - + param.kernel[0]) / + param.stride[0])); + oshape[3] = 1 + static_cast(ceil( + static_cast(dshape[3] + 2 * param.pad[1] - + param.kernel[1]) / + param.stride[1])); + } } - } + out_shape->clear(); + out_shape->push_back(oshape); // save output shape +#if MXNET_USE_MKLDNN == 1 + if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) + out_shape->push_back(oshape); // for workspace #endif - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - if (pool_enum::kMaxPooling == param.pool_type - || pool_enum::kAvgPooling == param.pool_type - || pool_enum::kSumPooling == param.pool_type) { - op = new PoolingOp(param); + } else if (param.kernel.ndim() == 3) { + CHECK_EQ(dshape.ndim(), 5U) + << "Pooling: Input data should be 5D in (batch, channel, d, y, x)"; + CHECK_LE(param.kernel[0], dshape[2] + 2 * param.pad[0]) + << "kernel size exceeds input"; + CHECK_LE(param.kernel[1], dshape[3] + 2 * param.pad[1]) + << "kernel size exceeds input"; + CHECK_LE(param.kernel[2], dshape[4] + 2 * param.pad[2]) + << "kernel size exceeds input"; + if (param.global_pool) { + oshape[2] = 1; + oshape[3] = 1; + oshape[4] = 1; } else { - LOG(FATAL) << "unknown pooling type"; - return NULL; + if (param.pooling_convention == pool_enum::kValid) { + oshape[2] = 1 + + (dshape[2] + 2 * param.pad[0] - param.kernel[0]) / + param.stride[0]; + oshape[3] = 1 + + (dshape[3] + 2 * param.pad[1] - param.kernel[1]) / + param.stride[1]; + oshape[4] = 1 + + (dshape[4] + 2 * param.pad[2] - param.kernel[2]) / + param.stride[2]; + } else { + oshape[2] = 1 + static_cast(ceil( + static_cast(dshape[2] + 2 * param.pad[0] - + param.kernel[0]) / + param.stride[0])); + oshape[3] = 1 + static_cast(ceil( + static_cast(dshape[3] + 2 * param.pad[1] - + param.kernel[1]) / + param.stride[1])); + oshape[4] = 1 + static_cast(ceil( + static_cast(dshape[4] + 2 * param.pad[2] - + param.kernel[2]) / + param.stride[2])); + } } - }); - return op; + out_shape->clear(); + out_shape->push_back(oshape); // save output shape +#if MXNET_USE_MKLDNN == 1 + if (MKLDNNRequireWorkspace(param) && SupportMKLDNNPooling(param)) + out_shape->push_back(oshape); // for workspace +#endif + } + return true; +} + +#if MXNET_USE_MKLDNN == 1 +void PoolingComputeExCPU(const nnvm::NodeAttrs &attrs, const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + const PoolingParam ¶m = nnvm::get(attrs.parsed); + const NDArray *workspace = nullptr; + if (MKLDNNRequireWorkspace(param)) { + CHECK_GT(outputs.size(), 1U); + workspace = &outputs[1]; + } + if (SupportMKLDNN(inputs[0]) + && SupportMKLDNNPooling(param, inputs[0].shape())) { + MKLDNN_OPCHECK_INIT(false, 1, inputs, outputs); + MKLDNNPoolingCompute(ctx, param, inputs[0], req[0], outputs[0], workspace); + MKLDNN_OPCHECK_RUN(PoolingCompute, attrs, ctx, inputs, req, outputs); + return; + } + FallBackCompute(PoolingCompute, attrs, ctx, inputs, req, outputs); +} + +void PoolingGradComputeExCPU(const nnvm::NodeAttrs 
&attrs, const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + const PoolingParam ¶m = nnvm::get(attrs.parsed); + const NDArray &out_grad = inputs[0]; + const NDArray *workspace = nullptr; + const NDArray *in_data = nullptr; + if (MKLDNNRequireWorkspace(param)) { + // The first two elements are the gradient of the outputs in forward. + // The third is the input of forward. + // The fourth and the fifth are the outputs of forward. + CHECK_EQ(inputs.size(), 5U); + in_data = &inputs[2]; + workspace = &inputs[4]; + } else { + CHECK_EQ(inputs.size(), 3U); + in_data = &inputs[1]; + } + const NDArray &in_grad = outputs[0]; + if (SupportMKLDNN(inputs[0]) + && SupportMKLDNNPooling(param, inputs[0].shape())) { + MKLDNN_OPCHECK_INIT(true, outputs.size(), inputs, outputs); + MKLDNNPoolingGradCompute(ctx, param, out_grad, *in_data, workspace, + req[0], in_grad); + MKLDNN_OPCHECK_RUN(PoolingGradCompute, attrs, ctx, inputs, req, + outputs); + return; + } + FallBackCompute(PoolingGradCompute, attrs, ctx, inputs, req, outputs); +} +#endif + +inline static bool PoolingStorageType(const nnvm::NodeAttrs &attrs, + const int dev_mask, + DispatchMode *dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1); + +#if MXNET_USE_MKLDNN == 1 + const PoolingParam ¶m = nnvm::get(attrs.parsed); + if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) { + return storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, DispatchMode::kFComputeEx); + } +#else + CHECK_EQ(out_attrs->size(), 1); +#endif + return storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, DispatchMode::kFCompute); } -// DO_BIND_DISPATCH comes from operator_common.h -Operator* PoolingProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]); +inline static bool BackwardPoolingStorageType(const nnvm::NodeAttrs &attrs, + const int dev_mask, + DispatchMode *dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + const PoolingParam ¶m = nnvm::get(attrs.parsed); + CHECK_EQ(in_attrs->size(), GetNumBackInputs(param)); + CHECK_EQ(out_attrs->size(), 1); + +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask && SupportMKLDNNPooling(param)) { + return storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, DispatchMode::kFComputeEx); + } +#else + CHECK_EQ(in_attrs->size(), 3); +#endif + return storage_type_assign(out_attrs, mxnet::kDefaultStorage, + dispatch_mode, DispatchMode::kFCompute); } DMLC_REGISTER_PARAMETER(PoolingParam); -MXNET_REGISTER_OP_PROPERTY(Pooling, PoolingProp) -.describe(R"code(Performs pooling on the input. +NNVM_REGISTER_OP(Pooling) + .describe(R"code(Performs pooling on the input. The shapes for 1-D pooling are @@ -131,8 +353,61 @@ For 3-D pooling, an additional *depth* dimension is added before height, width)*. 
)code" ADD_FILELINE) -.add_argument("data", "NDArray-or-Symbol", "Input data to the pooling operator.") +.set_num_inputs(1) +.set_num_outputs([](const NodeAttrs& attrs) { + const PoolingParam ¶m = nnvm::get(attrs.parsed); + return GetNumOutputs(param); +}) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FNumVisibleOutputs", + [](const NodeAttrs& attrs) { return 1; }) +#endif +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return std::vector{"data"}; +}) +.set_attr("FListOutputNames", + [](const NodeAttrs& attrs) { + return std::vector{"output"}; +}) +.set_attr_parser(PoolingParamParser) +.set_attr("FInferStorageType", PoolingStorageType) +.set_attr("FInferType", PoolingType) +.set_attr("FInferShape", PoolingShape) +.set_attr("FCompute", PoolingCompute) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FComputeEx", PoolingComputeExCPU) +#endif +.set_attr("FGradient", + ElemwiseGradUseInOut{"_backward_Pooling"}) +.add_argument("data", "NDArray-or-Symbol", + "Input data to the pooling operator.") .add_arguments(PoolingParam::__FIELDS__()); +NNVM_REGISTER_OP(_backward_Pooling) +.set_num_outputs(1) +.set_attr("TIsBackward", true) +.set_attr( + "FInplaceOption", + [](const NodeAttrs &attrs) { +#if MXNET_USE_CUDNN == 1 + return std::vector >(); +#else + return std::vector >{{1, 0}}; +#endif +}) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif +.set_attr("FInferStorageType", + BackwardPoolingStorageType) +.set_attr_parser(PoolingParamParser) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FComputeEx", PoolingGradComputeExCPU) +#endif +.set_attr("FCompute", PoolingGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/pooling.cu b/src/operator/nn/pooling.cu index dcebe6798263..c3bcecfc77b7 100644 --- a/src/operator/nn/pooling.cu +++ b/src/operator/nn/pooling.cu @@ -21,7 +21,7 @@ * Copyright (c) 2017 by Contributors * \file pooling.cu * \brief - * \author Bing Xu, Jun Wu + * \author Bing Xu, Jun Wu, Da Zheng */ #include #include "./pooling-inl.h" @@ -32,38 +32,112 @@ namespace mxnet { namespace op { +#if MXNET_USE_CUDNN == 1 +template +static CuDNNPoolingOp &GetCuDNNPoolingOp(const PoolingParam ¶m) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local CuDNNPoolingOp op; +#else + static MX_THREAD_LOCAL CuDNNPoolingOp op; +#endif + op.Init(param); + return op; +} +#endif + template<> -Operator *CreateOp(PoolingParam param, int dtype) { - Operator *op = NULL; +void PoolingCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const PoolingParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), GetNumOutputs(param)); + #if MXNET_USE_CUDNN == 1 if (!param.cudnn_off && param.kernel.ndim() > 1) { - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { switch (param.pool_type) { case pool_enum::kMaxPooling: - op = new CuDNNPoolingOp(param); - break; case pool_enum::kAvgPooling: - op = new CuDNNPoolingOp(param); + GetCuDNNPoolingOp(param).Forward(ctx, inputs[0], req[0], outputs[0]); + return; + case pool_enum::kSumPooling: + LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied."; break; + } + }); + } +#endif // MXNET_USE_CUDNN + + MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { + if (pool_enum::kMaxPooling == param.pool_type + || pool_enum::kAvgPooling == param.pool_type + || 
pool_enum::kSumPooling == param.pool_type) {
+      PoolingForward<gpu, DType>(ctx, param, inputs[0], req[0], outputs[0]);
+    } else {
+      LOG(FATAL) << "unknown pooling type";
+    }
+  });
+}
+
+template<>
+void PoolingGradCompute<gpu>(const nnvm::NodeAttrs& attrs,
+                             const OpContext& ctx,
+                             const std::vector<TBlob>& inputs,
+                             const std::vector<OpReqType>& req,
+                             const std::vector<TBlob>& outputs) {
+  const PoolingParam& param = nnvm::get<PoolingParam>(attrs.parsed);
+  CHECK_EQ(inputs.size(), GetNumBackInputs(param));
+  CHECK_EQ(outputs.size(), 1U);
+  CHECK_EQ(req.size(), 1U);
+  off_t ograd_idx, in_data_idx, out_data_idx;
+  // When MKLDNN is enabled, the input data may contain arrays for the workspace.
+  if (GetNumBackInputs(param) == 5) {
+    ograd_idx = 0;
+    in_data_idx = 2;
+    out_data_idx = 3;
+  } else {
+    ograd_idx = 0;
+    in_data_idx = 1;
+    out_data_idx = 2;
+  }
+
+#if MXNET_USE_CUDNN == 1
+  if (!param.cudnn_off && param.kernel.ndim() > 1) {
+    MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
+      switch (param.pool_type) {
+        case pool_enum::kMaxPooling:
+        case pool_enum::kAvgPooling:
+          GetCuDNNPoolingOp<DType>(param).Backward(ctx, inputs[ograd_idx],
+              inputs[in_data_idx], inputs[out_data_idx], req[0], outputs[0]);
+          return;
         case pool_enum::kSumPooling:
           LOG(WARNING) << "Sum pooling is not supported by cudnn, MXNet sum pooling is applied.";
           break;
       }
     });
   }
-  if (op) return op;
 #endif  // MXNET_USE_CUDNN
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
+
+  MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, {
     if (pool_enum::kMaxPooling == param.pool_type
         || pool_enum::kAvgPooling == param.pool_type
         || pool_enum::kSumPooling == param.pool_type) {
-      op = new PoolingOp<gpu, DType>(param);
+      PoolingBackward<gpu, DType>(ctx, param, inputs[ograd_idx],
+          inputs[in_data_idx], inputs[out_data_idx], req[0], outputs[0]);
     } else {
       LOG(FATAL) << "unknown pooling type";
     }
   });
-  return op;
 }
+
+NNVM_REGISTER_OP(Pooling)
+.set_attr<FCompute>("FCompute<gpu>", PoolingCompute<gpu>);
+
+NNVM_REGISTER_OP(_backward_Pooling)
+.set_attr<FCompute>("FCompute<gpu>", PoolingGradCompute<gpu>);
+
 } // namespace op
 } // namespace mxnet
diff --git a/src/operator/nn/softmax.cc b/src/operator/nn/softmax.cc
index 4686fb8c0dc1..0f559475d1c2 100644
--- a/src/operator/nn/softmax.cc
+++ b/src/operator/nn/softmax.cc
@@ -25,11 +25,54 @@
 #include "./softmax-inl.h"
 #include "../tensor/elemwise_unary_op.h"
 #include "../tensor/elemwise_binary_op.h"
+#include "mkldnn/mkldnn_base-inl.h"
+#include "mkldnn/mkldnn_ops-inl.h"
 namespace mxnet {
 namespace op {
 DMLC_REGISTER_PARAMETER(SoftmaxParam);
+#if MXNET_USE_MKLDNN == 1
+static void SoftmaxComputeExCPU(const nnvm::NodeAttrs& attrs,
+                                const OpContext& ctx,
+                                const std::vector<NDArray>& inputs,
+                                const std::vector<OpReqType>& req,
+                                const std::vector<NDArray>& outputs) {
+  const SoftmaxParam& param = nnvm::get<SoftmaxParam>(attrs.parsed);
+  // It seems MKLDNN softmax doesn't support training,
+  // and it only supports a non-negative axis.
+  if (SupportMKLDNN(inputs[0]) && !ctx.is_train && param.axis >= 0) {
+    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
+    MKLDNNSoftmaxForward(attrs, ctx, inputs[0], req[0], outputs[0]);
+    auto fn = SoftmaxCompute<cpu, mxnet_op::softmax_fwd>;
+    MKLDNN_OPCHECK_RUN(fn, attrs, ctx, inputs, req, outputs);
+    return;
+  }
+  FallBackCompute(SoftmaxCompute<cpu, mxnet_op::softmax_fwd>, attrs, ctx,
+                  inputs, req, outputs);
+}
+#endif
+
+inline static bool SoftmaxStorageType(const nnvm::NodeAttrs& attrs,
+                                      const int dev_mask,
+                                      DispatchMode* dispatch_mode,
+                                      std::vector<int> *in_attrs,
+                                      std::vector<int> *out_attrs) {
+  CHECK_EQ(in_attrs->size(), 1);
+  CHECK_EQ(out_attrs->size(), 1);
+
+  DispatchMode wanted_mode;
+#if MXNET_USE_MKLDNN == 1
+  // We only run MKLDNN op if it runs on CPU.
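+  // On CPU we request DispatchMode::kFComputeEx so that SoftmaxComputeExCPU
+  // (the MKLDNN path above) is used; it falls back to the default softmax
+  // whenever MKLDNN cannot handle the input.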
+ if (dev_mask == mshadow::cpu::kDevMask) + wanted_mode = DispatchMode::kFComputeEx; + else +#endif + wanted_mode = DispatchMode::kFCompute; + return storage_type_assign(out_attrs, static_cast((*in_attrs)[0]), + dispatch_mode, wanted_mode); +} + MXNET_OPERATOR_REGISTER_UNARY(softmax) .describe(R"code(Applies the softmax function. @@ -54,6 +97,10 @@ Example:: )code" ADD_FILELINE) .set_attr_parser(ParamParser) .set_attr("FCompute", SoftmaxCompute) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FComputeEx", SoftmaxComputeExCPU) +#endif +.set_attr("FInferStorageType", SoftmaxStorageType) .set_attr("FGradient", ElemwiseGradUseOut{"_backward_softmax"}) .add_arguments(SoftmaxParam::__FIELDS__()); diff --git a/src/operator/nn/softmax_activation-inl.h b/src/operator/nn/softmax_activation-inl.h index 500bf51ccd1f..b1d542e4068c 100644 --- a/src/operator/nn/softmax_activation-inl.h +++ b/src/operator/nn/softmax_activation-inl.h @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file softmax_activation-inl.h * \brief SoftmaxActivation operator - * \author Junyuan Xie + * \author Junyuan Xie, Da Zheng */ #ifndef MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_ #define MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_ @@ -61,153 +61,74 @@ struct SoftmaxActivationParam : public dmlc::Parameter { } }; -/** - * \brief This is the implementation of softmax_activation operator. - * \tparam xpu The device that the op will be executed on. - */ template -class SoftmaxActivationOp : public Operator { - public: - explicit SoftmaxActivationOp(SoftmaxActivationParam p) { - this->param_ = p; - } - - virtual void Forward(const OpContext &ctx, - const std::vector &in_data, - const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), 1U); - CHECK_EQ(out_data.size(), 1U); - Stream *s = ctx.get_stream(); - if (param_.mode == softmax_activation::kInstance) { - Tensor data = in_data[softmax_activation::kData].FlatTo2D(s); - Tensor out = out_data[softmax_activation::kOut].FlatTo2D(s); - Softmax(out, data); - } else { - CHECK_GE(in_data[softmax_activation::kData].ndim(), 3) +void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& reqs, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + const SoftmaxActivationParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + const TBlob &in_data = inputs[softmax_activation::kData]; + const OpReqType &req = reqs[softmax_activation::kOut]; + const TBlob &out_data = outputs[softmax_activation::kOut]; + Stream *s = ctx.get_stream(); + if (param.mode == softmax_activation::kInstance) { + Tensor data = in_data.FlatTo2D(s); + Tensor out = out_data.FlatTo2D(s); + Softmax(out, data); + } else { + CHECK_GE(in_data.ndim(), 3) << "Input need to have a least 3 dimensions when mode=channel"; - int n = in_data[softmax_activation::kData].size(0); - int k = in_data[softmax_activation::kData].size(1); - Shape<3> s3 = Shape3(n, k, static_cast(in_data[softmax_activation::kData].Size()/n/k)); - Tensor data = - in_data[softmax_activation::kData].get_with_shape(s3, s); - Tensor out = - out_data[softmax_activation::kOut].get_with_shape(s3, s); - Softmax(out, data); - } + int n = in_data.size(0); + int k = in_data.size(1); + Shape<3> s3 = Shape3(n, k, static_cast(in_data.Size()/n/k)); + Tensor data = in_data.get_with_shape(s3, s); + Tensor out 
= out_data.get_with_shape(s3, s); + Softmax(out, data); } +} - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - CHECK(in_data.size() == 1 && in_grad.size() == 1); - CHECK_EQ(req.size(), 1U); - // Use 3d tensor for both mode -> {instance, channel}. Get shapes - int total_size = in_grad[softmax_activation::kData].Size(); - int batch_size = in_grad[softmax_activation::kData].shape_[0]; - int channel_num = in_grad[softmax_activation::kData].shape_[1]; - int rest_size = total_size / (batch_size * channel_num); - const Shape<3> data_shape = Shape3(batch_size, channel_num, rest_size); - // Get tensors - Stream *s = ctx.get_stream(); - Tensor m_out_grad = - out_grad[softmax_activation::kOut].get_with_shape(data_shape, s); - Tensor m_out_data = - out_data[softmax_activation::kOut].get_with_shape(data_shape, s); - Tensor m_in_grad = - in_grad[softmax_activation::kData].get_with_shape(data_shape, s); - // get requested temp space - Tensor workspace = ctx.requested[softmax_activation::kTempSpace].get_space( - Shape2(batch_size, rest_size), s); - workspace = reduce_with_axis(m_out_grad * m_out_data, 1); - Assign(m_in_grad, req[softmax_activation::kData], - m_out_data * (m_out_grad - broadcast_with_axis(workspace, 0, channel_num))); - } - - private: - SoftmaxActivationParam param_; -}; // class SoftmaxActivationOp - -// Decalre Factory function, used for dispatch specialization template -Operator* CreateOp(SoftmaxActivationParam type); - -#if DMLC_USE_CXX11 -class SoftmaxActivationProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - using namespace mshadow; - CHECK_EQ(in_shape->size(), 1U) << "Input:[data]"; - const TShape &dshape = in_shape->at(softmax_activation::kData); - if (dshape.ndim() == 0) return false; - out_shape->clear(); - out_shape->push_back(dshape); - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new SoftmaxActivationProp(); - ptr->param_ = param_; - return ptr; - } - - std::string TypeString() const override { - return "SoftmaxActivation"; - } - - // decalre dependency and inplace optimization options - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - return {out_grad[softmax_activation::kOut], out_data[softmax_activation::kOut]}; - } - - std::vector BackwardResource( - const std::vector &in_shape) const override { - return {ResourceRequest::kTempSpace}; - } +void SoftmaxActivationGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& reqs, + const std::vector& outputs) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1); + CHECK_EQ(reqs.size(), 1); + const TBlob &out_grad = inputs[0]; + const TBlob &out_data = inputs[1]; + const OpReqType &req = reqs[0]; + const TBlob &in_grad = outputs[0]; + // Use 3d tensor for both mode -> {instance, channel}. 
Get shapes + int total_size = in_grad.Size(); + int batch_size = in_grad.shape_[0]; + int channel_num = in_grad.shape_[1]; + int rest_size = total_size / (batch_size * channel_num); + const Shape<3> data_shape = Shape3(batch_size, channel_num, rest_size); + // Get tensors + Stream *s = ctx.get_stream(); + Tensor m_out_grad = + out_grad.get_with_shape(data_shape, s); + Tensor m_out_data = + out_data.get_with_shape(data_shape, s); + Tensor m_in_grad = + in_grad.get_with_shape(data_shape, s); + // get requested temp space + Tensor workspace = ctx.requested[softmax_activation::kTempSpace].get_space( + Shape2(batch_size, rest_size), s); + workspace = reduce_with_axis(m_out_grad * m_out_data, 1); + Assign(m_in_grad, req, + m_out_data * (m_out_grad - broadcast_with_axis(workspace, 0, channel_num))); +} - std::vector > BackwardInplaceOption( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &in_grad) const override { - return {{out_grad[softmax_activation::kOut], in_grad[softmax_activation::kData]}}; - } - - std::vector > ForwardInplaceOption( - const std::vector &in_data, - const std::vector &out_data) const override { - return {{in_data[softmax_activation::kData], out_data[softmax_activation::kOut]}}; - } - - Operator* CreateOperator(Context ctx) const override; - - private: - SoftmaxActivationParam param_; -}; -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet #endif // MXNET_OPERATOR_NN_SOFTMAX_ACTIVATION_INL_H_ diff --git a/src/operator/nn/softmax_activation.cc b/src/operator/nn/softmax_activation.cc index 657b382c6e03..bdfd8b065de1 100644 --- a/src/operator/nn/softmax_activation.cc +++ b/src/operator/nn/softmax_activation.cc @@ -21,26 +21,18 @@ * Copyright (c) 2015 by Contributors * \file activation.cc * \brief softmax_activation op - * \author Junyuan Xie + * \author Junyuan Xie, Da Zheng */ #include "./softmax_activation-inl.h" +#include "../tensor/elemwise_unary_op.h" #include "../mshadow_op.h" namespace mxnet { namespace op { -template<> -Operator *CreateOp(SoftmaxActivationParam param) { - return new SoftmaxActivationOp(param); -} - -// DO_BIND_DISPATCH comes from operator_common.h -Operator *SoftmaxActivationProp::CreateOperator(Context ctx) const { - DO_BIND_DISPATCH(CreateOp, param_); -} DMLC_REGISTER_PARAMETER(SoftmaxActivationParam); -MXNET_REGISTER_OP_PROPERTY(SoftmaxActivation, SoftmaxActivationProp) +MXNET_OPERATOR_REGISTER_UNARY(SoftmaxActivation) .describe(R"code(Applies softmax activation to input. This is intended for internal layers. .. 
note:: @@ -65,8 +57,22 @@ Example:: [ 6.56221947e-03 5.95310994e-04 9.73919690e-01 1.78379621e-02 1.08472735e-03]] )code" ADD_FILELINE) -.add_argument("data", "NDArray-or-Symbol", "Input array to activation function.") +.set_attr_parser(ParamParser) +.set_attr("FCompute", SoftmaxActivationCompute) +.set_attr("FGradient", ElemwiseGradUseOut{"_backward_SoftmaxActivation"}) .add_arguments(SoftmaxActivationParam::__FIELDS__()); +NNVM_REGISTER_OP(_backward_SoftmaxActivation) +.set_num_outputs(1) +.set_attr("TIsBackward", true) +.set_attr("FInplaceOption", [](const NodeAttrs& attrs){ + return std::vector >{{0, 0}}; +}) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +.set_attr_parser(ParamParser) +.set_attr("FCompute", SoftmaxActivationGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/softmax_activation.cu b/src/operator/nn/softmax_activation.cu index 0810483e1262..f3997e00052e 100644 --- a/src/operator/nn/softmax_activation.cu +++ b/src/operator/nn/softmax_activation.cu @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file softmax_activation.cu * \brief - * \author Junyuan Xie + * \author Junyuan Xie, Da Zheng */ #include "./softmax_activation-inl.h" #include "../mshadow_op.h" @@ -31,14 +31,51 @@ namespace mxnet { namespace op { -template<> -Operator *CreateOp(SoftmaxActivationParam param) { + #if MXNET_USE_CUDNN == 1 - return new CuDNNSoftmaxActivationOp(param); + +static inline CuDNNSoftmaxActivationOp &GetCuDNNSoftmaxActOp(const SoftmaxActivationParam& param) { +#if DMLC_CXX11_THREAD_LOCAL + static thread_local CuDNNSoftmaxActivationOp op; #else - return new SoftmaxActivationOp(param); -#endif // MXNET_USE_CUDNN + static MX_THREAD_LOCAL CuDNNSoftmaxActivationOp op; +#endif + op.Init(param); + return op; +} + +template<> +void SoftmaxActivationCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const SoftmaxActivationParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + GetCuDNNSoftmaxActOp(param).Forward(ctx, inputs[0], req[0], outputs[0]); } + +template<> +void SoftmaxActivationGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const SoftmaxActivationParam& param = nnvm::get(attrs.parsed); + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1); + CHECK_EQ(req.size(), 1); + GetCuDNNSoftmaxActOp(param).Backward(ctx, inputs[0], inputs[1], req[0], outputs[0]); +} +#endif + +NNVM_REGISTER_OP(SoftmaxActivation) +.set_attr("FCompute", SoftmaxActivationCompute); + +NNVM_REGISTER_OP(_backward_SoftmaxActivation) +.set_attr("FCompute", SoftmaxActivationGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/upsampling-inl.h b/src/operator/nn/upsampling-inl.h index f660609ace28..4b9159edd174 100644 --- a/src/operator/nn/upsampling-inl.h +++ b/src/operator/nn/upsampling-inl.h @@ -35,6 +35,7 @@ #include #include #include "../operator_common.h" +#include "./deconvolution-inl.h" namespace mxnet { namespace op { @@ -82,253 +83,147 @@ struct UpSamplingParam : public dmlc::Parameter { }; // struct UpSamplingParam template -class UpSamplingNearestOp : public Operator { - public: - explicit UpSamplingNearestOp(UpSamplingParam p) { - this->param_ = p; - } - - virtual void Forward(const OpContext &ctx, +void 
UpSamplingForward(const OpContext &ctx, const UpSamplingParam ¶m, const std::vector &in_data, const std::vector &req, - const std::vector &out_data, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(in_data.size(), static_cast(param_.num_args)); - CHECK_EQ(out_data.size(), 1U); - if (req[up_enum::kOut] == kNullOp) { - return; - } - Stream *s = ctx.get_stream(); - Tensor out = out_data[up_enum::kOut].get(s); - if (param_.num_args > 1) { - int begin = 0; - for (int i = 0; i < param_.num_args; ++i) { - Tensor data = in_data[i].get(s); - int end = begin + data.size(1); - int scale = out_data[up_enum::kOut].size(2)/in_data[i].size(2); - if (param_.multi_input_mode == up_enum::kSum) { - if (i == 0) { - Assign(out, req[up_enum::kOut], upsampling_nearest(data, scale)); - } else { - out += upsampling_nearest(data, scale); - } + const std::vector &out_data) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_data.size(), static_cast(param.num_args)); + CHECK_EQ(out_data.size(), 1U); + if (req[up_enum::kOut] == kNullOp) { + return; + } + Stream *s = ctx.get_stream(); + Tensor out = out_data[up_enum::kOut].get(s); + if (param.num_args > 1) { + int begin = 0; + for (int i = 0; i < param.num_args; ++i) { + Tensor data = in_data[i].get(s); + int end = begin + data.size(1); + int scale = out_data[up_enum::kOut].size(2)/in_data[i].size(2); + if (param.multi_input_mode == up_enum::kSum) { + if (i == 0) { + Assign(out, req[up_enum::kOut], upsampling_nearest(data, scale)); } else { - Assign(slice<1>(out, begin, end), req[up_enum::kOut], upsampling_nearest(data, scale)); + out += upsampling_nearest(data, scale); } - begin = end; + } else { + Assign(slice<1>(out, begin, end), req[up_enum::kOut], upsampling_nearest(data, scale)); } - } else { - Tensor data = in_data[up_enum::kData].get(s); - Assign(out, req[up_enum::kOut], upsampling_nearest(data, param_.scale)); + begin = end; } + } else { + Tensor data = in_data[up_enum::kData].get(s); + Assign(out, req[up_enum::kOut], upsampling_nearest(data, param.scale)); } +} - virtual void Backward(const OpContext &ctx, - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &req, - const std::vector &in_grad, - const std::vector &aux_args) { - using namespace mshadow; - using namespace mshadow::expr; - CHECK_EQ(out_grad.size(), 1U); - CHECK_EQ(in_grad.size(), static_cast(param_.num_args)); - Stream *s = ctx.get_stream(); - Tensor grad = out_grad[up_enum::kOut].get(s); - if (param_.num_args > 1) { - int begin = 0; - for (int i = 0; i < param_.num_args; ++i) { - Tensor input_grad = in_grad[i].get(s); - mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]); - int end = begin + input_grad.size(1); - int scale = grad.size(2)/in_shape[0]; - if (param_.multi_input_mode == up_enum::kSum) { - Assign(input_grad, req[i], - pool(grad, - in_shape, - scale, - scale, - scale, - scale)); - } else { - Assign(input_grad, req[i], - pool(slice<1>(grad, begin, end), - in_shape, - scale, - scale, - scale, - scale)); - } - begin = end; - } - } else { - Tensor input_grad = in_grad[up_enum::kData].get(s); +template +void UpSamplingBackward(const OpContext &ctx, const UpSamplingParam ¶m, + const TBlob &out_grad, const std::vector &req, + const std::vector &in_grad) { + using namespace mshadow; + using namespace mshadow::expr; + CHECK_EQ(in_grad.size(), static_cast(param.num_args)); + Stream *s = ctx.get_stream(); + Tensor grad = 
out_grad.get(s); + if (param.num_args > 1) { + int begin = 0; + for (int i = 0; i < param.num_args; ++i) { + Tensor input_grad = in_grad[i].get(s); mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]); - Assign(input_grad, req[up_enum::kData], - pool(grad, - in_shape, - param_.scale, - param_.scale, - param_.scale, - param_.scale)); - } - } - - private: - UpSamplingParam param_; -}; // class UpSamplingNearestOp - -template -Operator *CreateOp(UpSamplingParam param, int dtype); - - -#if DMLC_USE_CXX11 -class UpSamplingProp : public OperatorProperty { - public: - void Init(const std::vector >& kwargs) override { - param_.Init(kwargs); - } - - std::map GetParams() const override { - return param_.__DICT__(); - } - - std::vector ListArguments() const override { - if (param_.sample_type == up_enum::kNearest) { - std::vector ret; - for (int i = 0; i < param_.num_args; ++i) { - ret.push_back(std::string("arg") + std::to_string(i)); - } - return ret; - } else { - return {"data", "weight"}; - } - } - - bool InferShape(std::vector *in_shape, - std::vector *out_shape, - std::vector *aux_shape) const override { - CHECK_GE(in_shape->size(), 1U); - const TShape &dshape = (*in_shape)[0]; - TShape oshape = dshape; - if (param_.sample_type == up_enum::kNearest) { - CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); - oshape[1] = 0; - for (auto& shape : *in_shape) { - CHECK_EQ(shape.ndim(), 4U) << \ - "UpSamplingNearest: Input data should be 4D in (batch, channel, y, x)"; - int oh = dshape[2]*param_.scale, ow = dshape[3]*param_.scale; - CHECK_EQ(oh%shape[2], 0U) << "UpSamplingNearest: input height of " << shape[2] << \ - "does not divide output height of " << oh; - CHECK_EQ(ow%shape[3], 0U) << "UpSamplingNearest: input width of " << shape[3] << \ - "does not divide output width of " << ow; - if (param_.multi_input_mode == up_enum::kSum) { - CHECK(oshape[1] == 0 || oshape[1] == shape[1]) << \ - "Number of channels must be the same when multi_input_mode==sum"; - oshape[1] = shape[1]; - } else { - oshape[1] += shape[1]; - } - } - } else { - CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; - CHECK_EQ(dshape.ndim(), 4U) << \ - "UpSamplingBilinear: Input data should be 4D in (batch, channel, y, x)"; - if (dshape.ndim() == 0) return false; - int kernel = 2 * param_.scale - param_.scale % 2; - SHAPE_ASSIGN_CHECK(*in_shape, - up_enum::kWeight, - mshadow::Shape4(dshape[1], 1, kernel, kernel)); - oshape = dshape; - } - oshape[2] = dshape[2] * param_.scale; - oshape[3] = dshape[3] * param_.scale; - out_shape->clear(); - out_shape->push_back(oshape); - return true; - } - - bool InferType(std::vector *in_type, - std::vector *out_type, - std::vector *aux_type) const override { - CHECK_GE(in_type->size(), 1U); - int dtype = (*in_type)[0]; - CHECK_NE(dtype, -1) << "First input must have specified type"; - for (index_t i = 0; i < in_type->size(); ++i) { - if ((*in_type)[i] == -1) { - (*in_type)[i] = dtype; + int end = begin + input_grad.size(1); + int scale = grad.size(2)/in_shape[0]; + if (param.multi_input_mode == up_enum::kSum) { + Assign(input_grad, req[i], + pool(grad, + in_shape, + scale, + scale, + scale, + scale)); } else { - UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]); + Assign(input_grad, req[i], + pool(slice<1>(grad, begin, end), + in_shape, + scale, + scale, + scale, + scale)); } + begin = end; } - out_type->clear(); - out_type->push_back(dtype); - return true; - } - - OperatorProperty* Copy() const override { - auto ptr = new UpSamplingProp(); - 
ptr->param_ = this->param_; - return ptr; - } - - std::string TypeString() const override { - return "UpSampling"; - } - - std::vector DeclareBackwardDependency( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data) const override { - if (param_.sample_type == up_enum::kNearest) { - return {out_grad[up_enum::kOut]}; - } else { - return {out_grad[up_enum::kOut], in_data[up_enum::kData], in_data[up_enum::kWeight]}; - } - } - - std::vector > BackwardInplaceOption( - const std::vector &out_grad, - const std::vector &in_data, - const std::vector &out_data, - const std::vector &in_grad) const override { - return {}; - } - - std::vector ForwardResource( - const std::vector &in_shape) const override { - if (param_.sample_type == up_enum::kNearest) { - return {}; - } else { - return {ResourceRequest::kTempSpace}; - } - } - - std::vector BackwardResource( - const std::vector &in_shape) const override { - if (param_.sample_type == up_enum::kNearest) { - return {}; - } else { - return {ResourceRequest::kTempSpace}; - } - } - - Operator* CreateOperator(Context ctx) const override { - LOG(FATAL) << "Not Implemented"; - return NULL; - } + } else { + Tensor input_grad = in_grad[up_enum::kData].get(s); + mshadow::Shape<2> in_shape = Shape2(input_grad.shape_[2], input_grad.shape_[3]); + Assign(input_grad, req[up_enum::kData], + pool(grad, + in_shape, + param.scale, + param.scale, + param.scale, + param.scale)); + } +} + +static inline DeconvolutionParam GetDeconvolutionParam(const UpSamplingParam& param) { + DeconvolutionParam p = DeconvolutionParam(); + int kernel = 2 * param.scale - param.scale % 2; + int stride = param.scale; + int pad = static_cast(ceil((param.scale - 1) / 2.)); + p.workspace = param.workspace; + p.num_group = param.num_filter; + p.num_filter = param.num_filter; + p.no_bias = true; + int shape[] = {1, 1}; + p.dilate = TShape(shape, shape + 2); + shape[0] = shape[1] = kernel; + p.kernel = TShape(shape, shape + 2); + shape[0] = shape[1] = stride; + p.stride = TShape(shape, shape + 2); + shape[0] = shape[1] = pad; + p.pad = TShape(shape, shape + 2); + return p; +} - Operator* CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector *in_type) const override; +template +void UpSamplingCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const UpSamplingParam& param = nnvm::get(attrs.parsed); + if (param.sample_type == up_enum::kNearest) { + MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, { + UpSamplingForward(ctx, param, inputs, req, outputs); + }); + } else if (param.sample_type == up_enum::kBilinear) { + DeconvolutionParam p = GetDeconvolutionParam(param); + _DeconvolutionCompute(p, ctx, inputs, req, outputs); + } else { + LOG(FATAL) << "Unknown sample type"; + } +} +template +void UpSamplingGradCompute(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + const UpSamplingParam& param = nnvm::get(attrs.parsed); + if (param.sample_type == up_enum::kNearest) { + MSHADOW_REAL_TYPE_SWITCH(inputs[deconv::kData].type_flag_, DType, { + CHECK_EQ(inputs.size(), 1U); + UpSamplingBackward(ctx, param, inputs[0], req, outputs); + }); + } else if (param.sample_type == up_enum::kBilinear) { + DeconvolutionParam p = GetDeconvolutionParam(param); + _DeconvolutionGradCompute(p, ctx, inputs, req, outputs); + } else { + LOG(FATAL) << "Unknown sample type"; + } +} - 
private: - UpSamplingParam param_; -}; // class UpSamplingProp -#endif // DMLC_USE_CXX11 } // namespace op } // namespace mxnet diff --git a/src/operator/nn/upsampling.cc b/src/operator/nn/upsampling.cc index 8942e35ab325..44b619ac9516 100644 --- a/src/operator/nn/upsampling.cc +++ b/src/operator/nn/upsampling.cc @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file upsampling_nearest.cc * \brief - * \author Bing Xu + * \author Bing Xu, Da Zheng */ #include "./upsampling-inl.h" @@ -30,51 +30,123 @@ namespace mxnet { namespace op { -template<> -Operator *CreateOp(UpSamplingParam param, int dtype) { - Operator *op = NULL; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - if (param.sample_type == up_enum::kNearest) { - op = new UpSamplingNearestOp(param); - } else if (param.sample_type == up_enum::kBilinear) { - DeconvolutionParam p = DeconvolutionParam(); - int kernel = 2 * param.scale - param.scale % 2; - int stride = param.scale; - int pad = static_cast(ceil((param.scale - 1) / 2.)); - p.workspace = param.workspace; - p.num_group = param.num_filter; - p.num_filter = param.num_filter; - p.no_bias = true; - int shape[] = {1, 1}; - p.dilate = TShape(shape, shape + 2); - shape[0] = shape[1] = kernel; - p.kernel = TShape(shape, shape + 2); - shape[0] = shape[1] = stride; - p.stride = TShape(shape, shape + 2); - shape[0] = shape[1] = pad; - p.pad = TShape(shape, shape + 2); - op = new DeconvolutionOp(p); - } else { - LOG(FATAL) << "Unknown sample type"; + +static bool UpSamplingShape(const nnvm::NodeAttrs& attrs, + std::vector *in_shape, std::vector *out_shape) { + const UpSamplingParam& param_ = nnvm::get(attrs.parsed); + CHECK_GE(in_shape->size(), 1U); + const TShape &dshape = (*in_shape)[0]; + TShape oshape = dshape; + if (param_.sample_type == up_enum::kNearest) { + CHECK_EQ(in_shape->size(), static_cast(param_.num_args)); + oshape[1] = 0; + for (auto& shape : *in_shape) { + CHECK_EQ(shape.ndim(), 4U) << \ + "UpSamplingNearest: Input data should be 4D in (batch, channel, y, x)"; + int oh = dshape[2]*param_.scale, ow = dshape[3]*param_.scale; + CHECK_EQ(oh%shape[2], 0U) << "UpSamplingNearest: input height of " << shape[2] << \ + "does not divide output height of " << oh; + CHECK_EQ(ow%shape[3], 0U) << "UpSamplingNearest: input width of " << shape[3] << \ + "does not divide output width of " << ow; + if (param_.multi_input_mode == up_enum::kSum) { + CHECK(oshape[1] == 0 || oshape[1] == shape[1]) << \ + "Number of channels must be the same when multi_input_mode==sum"; + oshape[1] = shape[1]; + } else { + oshape[1] += shape[1]; + } + } + } else { + CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]"; + CHECK_EQ(dshape.ndim(), 4U) << \ + "UpSamplingBilinear: Input data should be 4D in (batch, channel, y, x)"; + if (dshape.ndim() == 0) return false; + int kernel = 2 * param_.scale - param_.scale % 2; + SHAPE_ASSIGN_CHECK(*in_shape, + up_enum::kWeight, + mshadow::Shape4(dshape[1], 1, kernel, kernel)); + oshape = dshape; + } + oshape[2] = dshape[2] * param_.scale; + oshape[3] = dshape[3] * param_.scale; + out_shape->clear(); + out_shape->push_back(oshape); + return true; +} + +static inline std::vector ListArguments(const UpSamplingParam& param) { + if (param.sample_type == up_enum::kNearest) { + std::vector ret; + for (int i = 0; i < param.num_args; ++i) { + ret.push_back(std::string("arg") + std::to_string(i)); } - }); - return op; + return ret; + } else { + return {"data", "weight"}; + } } -Operator* UpSamplingProp::CreateOperatorEx(Context ctx, std::vector *in_shape, - std::vector 
*in_type) const { - DO_BIND_DISPATCH(CreateOp, param_, in_type->at(0)); +static bool UpSamplingType(const nnvm::NodeAttrs& attrs, + std::vector *in_type, std::vector *out_type) { + const UpSamplingParam& param = nnvm::get(attrs.parsed); + CHECK_GE(in_type->size(), 1U); + int dtype = (*in_type)[0]; + CHECK_NE(dtype, -1) << "First input must have specified type"; + for (index_t i = 0; i < in_type->size(); ++i) { + if ((*in_type)[i] == -1) { + (*in_type)[i] = dtype; + } else { + UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments(param)[i]); + } + } + out_type->clear(); + out_type->push_back(dtype); + return true; } +struct UpSamplingGrad { + const char *op_name; + std::vector operator()(const nnvm::NodePtr& n, + const std::vector& ograds) const { + const UpSamplingParam& param_ = nnvm::get(n->attrs.parsed); + std::vector heads(ograds.begin(), ograds.end()); + if (param_.sample_type != up_enum::kNearest) { + heads.push_back(n->inputs[up_enum::kData]); + heads.push_back(n->inputs[up_enum::kWeight]); + } + return MakeGradNode(op_name, n, heads, n->attrs.dict); + } +}; + DMLC_REGISTER_PARAMETER(UpSamplingParam); -MXNET_REGISTER_OP_PROPERTY(UpSampling, UpSamplingProp) +NNVM_REGISTER_OP(UpSampling) .describe("Performs nearest neighbor/bilinear up sampling to inputs.") +.set_num_inputs([](const NodeAttrs& attrs) { + const UpSamplingParam& params = nnvm::get(attrs.parsed); + return params.sample_type == up_enum::kNearest ? params.num_args : 2; +}) +.set_num_outputs(1) +.set_attr_parser(ParamParser) +.set_attr("FListInputNames", + [](const NodeAttrs& attrs) { + return ListArguments(nnvm::get(attrs.parsed)); +}) +.set_attr("FInferShape", UpSamplingShape) +.set_attr("FInferType", UpSamplingType) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + const UpSamplingParam& param = nnvm::get(n.parsed); + if (param.sample_type == up_enum::kNearest) { + return std::vector(); + } else { + return std::vector{ResourceRequest::kTempSpace}; + } +}) +.set_attr("FCompute", UpSamplingCompute) +.set_attr("FGradient", UpSamplingGrad{"_backward_UpSampling"}) +.set_attr("key_var_num_args", "num_args") .add_argument("data", "NDArray-or-Symbol[]", "Array of tensors to upsample") .add_arguments(UpSamplingParam::__FIELDS__()) -.set_key_var_num_args("num_args"); - -NNVM_REGISTER_OP(UpSampling) .set_attr("FSetInputVarAttrOnCompose", [](const nnvm::NodeAttrs& attrs, nnvm::NodePtr var, const int index) { if (var->attrs.dict.find("__init__") != var->attrs.dict.end()) return; @@ -82,5 +154,23 @@ NNVM_REGISTER_OP(UpSampling) var->attrs.dict["__init__"] = "[\"bilinear\", {}]"; } }); + +NNVM_REGISTER_OP(_backward_UpSampling) +.set_num_outputs([](const NodeAttrs& attrs) { + const UpSamplingParam& params = nnvm::get(attrs.parsed); + return params.sample_type == up_enum::kNearest ? 
params.num_args : 2; +}) +.set_attr("TIsBackward", true) +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + const UpSamplingParam& param = nnvm::get(n.parsed); + if (param.sample_type == up_enum::kNearest) { + return std::vector(); + } else { + return std::vector{ResourceRequest::kTempSpace}; + } +}) +.set_attr_parser(ParamParser) +.set_attr("FCompute", UpSamplingGradCompute); + } // namespace op } // namespace mxnet diff --git a/src/operator/nn/upsampling.cu b/src/operator/nn/upsampling.cu index f83535a2b2e6..c5ff2fafd64a 100644 --- a/src/operator/nn/upsampling.cu +++ b/src/operator/nn/upsampling.cu @@ -21,7 +21,7 @@ * Copyright (c) 2015 by Contributors * \file upsampling_nearest.cc * \brief - * \author Bing Xu + * \author Bing Xu, Da Zheng */ #include "./deconvolution-inl.h" @@ -29,36 +29,12 @@ namespace mxnet { namespace op { -template<> -Operator *CreateOp(UpSamplingParam param, int dtype) { - Operator *op = NULL; - MSHADOW_REAL_TYPE_SWITCH(dtype, DType, { - if (param.sample_type == up_enum::kNearest) { - op = new UpSamplingNearestOp(param); - } else if (param.sample_type == up_enum::kBilinear) { - DeconvolutionParam p = DeconvolutionParam(); - int kernel = 2 * param.scale - param.scale % 2; - int stride = param.scale; - int pad = static_cast(ceil((param.scale - 1) / 2.)); - p.workspace = param.workspace; - p.num_group = param.num_filter; - p.num_filter = param.num_filter; - p.no_bias = true; - int shape[] = {1, 1}; - p.dilate = TShape(shape, shape + 2); - shape[0] = shape[1] = kernel; - p.kernel = TShape(shape, shape + 2); - shape[0] = shape[1] = stride; - p.stride = TShape(shape, shape + 2); - shape[0] = shape[1] = pad; - p.pad = TShape(shape, shape + 2); - op = new DeconvolutionOp(p); - } else { - LOG(FATAL) << "Unknown sample type"; - } - }); - return op; -} + +NNVM_REGISTER_OP(UpSampling) +.set_attr("FCompute", UpSamplingCompute); + +NNVM_REGISTER_OP(_backward_UpSampling) +.set_attr("FCompute", UpSamplingGradCompute); } // namespace op } // namespace mxnet diff --git a/src/operator/tensor/cast_storage-inl.h b/src/operator/tensor/cast_storage-inl.h index ed200273854d..e345bb2193f4 100644 --- a/src/operator/tensor/cast_storage-inl.h +++ b/src/operator/tensor/cast_storage-inl.h @@ -27,11 +27,15 @@ #include #include #include +#include #include "../mxnet_op.h" #include "../operator_common.h" #ifdef __CUDACC__ #include "./cast_storage-inl.cuh" #endif // __CUDACC__ +#if MXNET_USE_MKLDNN == 1 +#include "../nn/mkldnn/mkldnn_base-inl.h" +#endif namespace mxnet { @@ -342,8 +346,20 @@ void CastStorageComputeImpl(const OpContext& ctx, } else if (src_stype == kCSRStorage && dst_stype == kDefaultStorage) { TBlob ret = output.data(); CastStorageCsrDnsImpl(ctx, input, &ret); +#if MXNET_USE_MKLDNN == 1 + } else if (src_stype == kDefaultStorage && dst_stype == kDefaultStorage) { + CHECK_EQ(output.ctx().dev_type, input.ctx().dev_type); + // If one of them uses the MKLDNN layout. 
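For the bilinear path, UpSampling is now routed entirely through Deconvolution: GetDeconvolutionParam derives the hyperparameters from the upsampling scale (kernel = 2*scale - scale % 2, stride = scale, pad = ceil((scale - 1) / 2)), with num_group set equal to num_filter so each channel gets its own bilinear kernel. Under those settings the deconvolution output is exactly scale times larger than its input. A minimal standalone sketch of that arithmetic (plain C++ using the usual deconvolution output-size formula, not MXNet code):

#include <cmath>
#include <cstdio>

// Mirrors the arithmetic in GetDeconvolutionParam(): for a given upsampling
// scale, the derived kernel/stride/pad make the deconvolution output exactly
// `scale` times larger (out = (in - 1) * stride - 2 * pad + kernel).
int main() {
  const int in = 8;
  for (int scale = 2; scale <= 4; ++scale) {
    int kernel = 2 * scale - scale % 2;
    int stride = scale;
    int pad = static_cast<int>(std::ceil((scale - 1) / 2.0));
    int out = (in - 1) * stride - 2 * pad + kernel;
    std::printf("scale=%d: kernel=%d stride=%d pad=%d, %d -> %d\n",
                scale, kernel, stride, pad, in, out);
  }
  return 0;
}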
+ if (input.IsMKLDNNData() || output.IsMKLDNNData()) { + auto in_mem = input.GetMKLDNNData(); + const_cast(output).CopyFrom(*in_mem); + MKLDNNStream::Get()->Submit(); + } else { + mxnet_op::copy(ctx.get_stream(), output.data(), input.data()); + } +#endif } else { - LOG(FATAL) << "Not implemented"; + LOG(FATAL) << "Not implemented from " << src_stype << " to " << dst_stype; } } @@ -376,8 +392,14 @@ inline bool CastStorageInferStorageType(const nnvm::NodeAttrs& attrs, // dns -> dns, dns -> rsp, dns -> csr if (!dispatched && in_stype == kDefaultStorage && param_stype == kDefaultStorage) { // dns -> dns - dispatched = storage_type_assign(out_attrs, kDefaultStorage, - dispatch_mode, DispatchMode::kFCompute); + DispatchMode mode = DispatchMode::kFCompute; +#if MXNET_USE_MKLDNN == 1 + // If we use MKLDNN and the arrays are in CPU memory, the array may store + // MKLDNN layout, we should convert its layout explicitly. + if (dev_mask == kCPU) + mode = DispatchMode::kFComputeEx; +#endif + dispatched = storage_type_assign(out_attrs, kDefaultStorage, dispatch_mode, mode); } if (!dispatched && in_stype == kDefaultStorage && (param_stype == kRowSparseStorage || param_stype == kCSRStorage)) { diff --git a/src/operator/tensor/elemwise_binary_op_basic.cc b/src/operator/tensor/elemwise_binary_op_basic.cc index d7e5e04ce87a..d73edc723520 100644 --- a/src/operator/tensor/elemwise_binary_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_op_basic.cc @@ -24,11 +24,68 @@ */ #include "./elemwise_unary_op.h" #include "./elemwise_binary_op-inl.h" +#include "../nn/mkldnn/mkldnn_ops-inl.h" +#include "../nn/mkldnn/mkldnn_base-inl.h" namespace mxnet { namespace op { -MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(elemwise_add, op::mshadow_op::plus) +static void ElemwiseAddEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 2U); + CHECK_EQ(outputs.size(), 1U); +#if MXNET_USE_MKLDNN == 1 + if (SupportMKLDNN(inputs[0]) && SupportMKLDNN(inputs[1])) { + MKLDNNSumForward(attrs, ctx, inputs, req[0], outputs[0]); + return; + } else if (inputs[0].storage_type() == kDefaultStorage + && inputs[1].storage_type() == kDefaultStorage) { + // This happens if inputs are supposed to be in MKLDNN format + // but MKLDNN doesn't support the data type or the shape. We're + // forced to convert it to the default format. 
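The same pattern recurs across the MKLDNN-enabled operators: the FComputeEx entry point first tries the MKLDNN kernel, and when the arrays turn out to be plain dense storage (for example because MKLDNN does not support the data type or shape) it extracts the TBlobs and calls the ordinary dense FCompute, as the block that follows does for elemwise_add. A toy model of that branch order, with illustrative names rather than the real MXNet types:

#include <cassert>

// Toy model of the branch order in ElemwiseAddEx (illustrative names, not the
// real MXNet types): prefer the MKLDNN kernel, fall back to the plain dense
// kernel when both arrays use ordinary dense storage, and otherwise defer to
// the existing sparse FComputeEx path.
enum class Path { MKLDNNSum, DenseFallback, SparseEx };

struct Arr { bool supports_mkldnn; bool default_storage; };

Path ChoosePath(const Arr &lhs, const Arr &rhs) {
  if (lhs.supports_mkldnn && rhs.supports_mkldnn) return Path::MKLDNNSum;
  if (lhs.default_storage && rhs.default_storage) return Path::DenseFallback;
  return Path::SparseEx;
}

int main() {
  assert(ChoosePath({true, true}, {true, true}) == Path::MKLDNNSum);
  assert(ChoosePath({false, true}, {false, true}) == Path::DenseFallback);
  assert(ChoosePath({false, false}, {false, false}) == Path::SparseEx);
  return 0;
}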
+ std::vector in_blobs(2); + std::vector out_blobs(1); + in_blobs[0] = inputs[0].data(); + in_blobs[1] = inputs[1].data(); + out_blobs[0] = outputs[0].data(); + ElemwiseBinaryOp::Compute(attrs, ctx, in_blobs, + req, out_blobs); + return; + } +#endif + ElemwiseBinaryOp::ComputeEx(attrs, ctx, inputs, + req, outputs); +} + +static inline bool ElemwiseAddStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 2); + CHECK_EQ(out_attrs->size(), 1); + bool ret = ElemwiseStorageType<2, 1, true, true, true>(attrs, dev_mask, dispatch_mode, + in_attrs, out_attrs); +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask + && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage) + && out_attrs->at(0) == kDefaultStorage) { + *dispatch_mode = DispatchMode::kFComputeEx; + } +#endif + return ret; +} + +MXNET_OPERATOR_REGISTER_BINARY(elemwise_add) +.set_attr("FInferStorageType", ElemwiseAddStorageType) +.set_attr("FCompute", ElemwiseBinaryOp::Compute) +.set_attr("FComputeEx", ElemwiseAddEx) +.set_attr("FResourceRequest", /* For Sparse CSR */ + [](const NodeAttrs& attrs) { + return std::vector{ResourceRequest::kTempSpace};}) MXNET_ADD_SPARSE_OP_ALIAS(elemwise_add) .add_alias("_add").add_alias("_plus").add_alias("_Plus") .describe(R"code(Adds arguments element-wise. @@ -46,6 +103,41 @@ The storage type of ``elemwise_add`` output depends on storage types of inputs // this must differ from elemwise_add to prevent add to optimization in forward pass. MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(_grad_add, op::mshadow_op::plus); +static void _backward_ElemwiseAddEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 2U); +#if MXNET_USE_MKLDNN == 1 + if (inputs[0].IsMKLDNNData()) { + MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]); + MKLDNNCopy(attrs, ctx, inputs[0], req[1], outputs[1]); + return; + } +#endif + ElemwiseBinaryOp::BackwardUseNoneEx( + attrs, ctx, inputs, req, outputs); +} + +static inline bool ElemwiseAddBackwardStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1); + CHECK_EQ(out_attrs->size(), 2); + bool ret = ElemwiseStorageType<1, 2, true, true, true>(attrs, dev_mask, dispatch_mode, + in_attrs, out_attrs); +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask) { + *dispatch_mode = DispatchMode::kFComputeEx; + } +#endif + return ret; +} + NNVM_REGISTER_OP(_backward_add) .set_num_inputs(1) .set_num_outputs(2) @@ -55,13 +147,15 @@ NNVM_REGISTER_OP(_backward_add) return std::vector >{{0, 0}, {0, 1}}; }) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif .set_attr("FCompute", ElemwiseBinaryOp::BackwardUseNone< cpu, mshadow_op::identity, mshadow_op::identity>) -.set_attr("FComputeEx", - ElemwiseBinaryOp::BackwardUseNoneEx) -.set_attr("FInferStorageType", - ElemwiseStorageType<1, 2, true, true, true>); +.set_attr("FComputeEx", _backward_ElemwiseAddEx) +.set_attr("FInferStorageType", ElemwiseAddBackwardStorageType); MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(elemwise_sub, op::mshadow_op::minus) MXNET_ADD_SPARSE_OP_ALIAS(elemwise_sub) diff --git 
a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc index 8c12218be062..6118ddf19c30 100644 --- a/src/operator/tensor/elemwise_binary_scalar_op_basic.cc +++ b/src/operator/tensor/elemwise_binary_scalar_op_basic.cc @@ -65,7 +65,7 @@ static bool BinaryScalarStorageTypeWithDenseResultStorageType(const NodeAttrs& a const auto dispatch_ex = invalid_ctx ? DispatchMode::kFComputeFallback : DispatchMode::kFComputeEx; const double alpha = nnvm::get(attrs.parsed); - if (instype == kDefaultStorage) { + if (common::ContainsOnlyStorage(*in_attrs, kDefaultStorage)) { dispatched = storage_type_assign(&out_attrs[0], kDefaultStorage, dispatch_mode, DispatchMode::kFCompute); } @@ -89,7 +89,7 @@ static bool BinaryScalarStorageType(const nnvm::NodeAttrs& attrs, const auto in_stype = in_attrs->at(0); auto &out_stype = out_attrs->at(0); bool dispatched = false; - if (!dispatched && in_stype == kDefaultStorage) { + if (!dispatched && (in_stype == kDefaultStorage)) { // dns -> dns dispatched = storage_type_assign(&out_stype, kDefaultStorage, dispatch_mode, DispatchMode::kFCompute); diff --git a/src/operator/tensor/elemwise_sum.cc b/src/operator/tensor/elemwise_sum.cc index b31dbb2598f0..10154bc9646d 100644 --- a/src/operator/tensor/elemwise_sum.cc +++ b/src/operator/tensor/elemwise_sum.cc @@ -24,6 +24,8 @@ */ #include "./elemwise_sum.h" #include "../../ndarray/ndarray_function.h" +#include "../nn/mkldnn/mkldnn_ops-inl.h" +#include "../../common/utils.h" namespace mxnet { namespace op { @@ -79,9 +81,28 @@ bool ElementWiseSumForwardInferStorageType(const nnvm::NodeAttrs& attrs, std::vector *out_attrs) { CHECK(!in_attrs->empty()); CHECK_EQ(out_attrs->size(), 1U); - return ElemwiseStorageAttr(attrs, dev_mask, dispatch_mode, - in_attrs, out_attrs); + bool ret = ElemwiseStorageAttr(attrs, dev_mask, dispatch_mode, + in_attrs, out_attrs); +#if MXNET_USE_MKLDNN == 1 + // We should always use FComputeEx. + if (dev_mask == mshadow::cpu::kDevMask + && common::ContainsOnlyStorage(*in_attrs, kDefaultStorage) + && out_attrs->at(0) == kDefaultStorage) { + *dispatch_mode = DispatchMode::kFComputeEx; + } +#endif + return ret; +} + +#if MXNET_USE_MKLDNN == 1 +static inline bool IsMKLDNNData(const std::vector &arrs) { + for (auto &arr : arrs) { + if (!arr.IsMKLDNNData()) + return false; + } + return true; } +#endif void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs, const OpContext& ctx, @@ -92,13 +113,28 @@ void ElementWiseSumComputeExCPU(const nnvm::NodeAttrs& attrs, CHECK_EQ(outputs.size(), 1U); CHECK_EQ(req.size(), 1U); if (req[0] == kNullOp) return; - CHECK_EQ(req[0], kWriteTo) << "ElementWiseSumComputeExCPU only supports req = kWriteTo"; if (inputs[0].storage_type() == kRowSparseStorage) { mshadow::Stream* s = ctx.get_stream(); Resource rsc = ResourceManager::Get()->Request(ctx.run_ctx.get_ctx(), ResourceRequest(ResourceRequest::kTempSpace)); NDArray out_nd = outputs[0]; mxnet::ndarray::ElementwiseSum(s, rsc, inputs, &out_nd); +#if MXNET_USE_MKLDNN == 1 + } else if (IsMKLDNNData(inputs)) { + MKLDNNSumForward(attrs, ctx, inputs, req[0], outputs[0]); +#endif + } else if (common::ContainsOnlyStorage(inputs, kDefaultStorage)) { + // This case happens when we want to create an MKLDNN NDArray but the type + // or the shape isn't supported by MKLDNN. In this case, NDArray falls back + // to the default storage type and, thus, we have to handle the default + // storage in FComputeEx. 
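On the inference side, the new storage-type functions (ElemwiseAddStorageType, ElementWiseSumForwardInferStorageType, and the copy/Flatten variants below) first run the stock ElemwiseStorageType logic and then, on CPU with all-dense inputs and outputs, switch the dispatch mode to FComputeEx so that NDArrays carrying an MKLDNN layout reach the MKLDNN-aware kernels; that is also why those kernels must still handle plain dense arrays themselves, as the fallback that follows does for ElementWiseSum. A distilled sketch of that decision (illustrative only, not the MXNet API):

// Distilled form of the storage-type hooks added for elemwise_add,
// ElementWiseSum, _copy and Flatten (illustrative only, not the MXNet API):
// once the generic inference has settled on dense storage everywhere, CPU
// execution is redirected to FComputeEx so NDArrays holding an MKLDNN layout
// can be consumed there.
enum class Dispatch { FCompute, FComputeEx };

Dispatch ChooseDispatch(bool mkldnn_enabled, bool on_cpu, bool all_dense) {
  Dispatch mode = Dispatch::FCompute;
  if (mkldnn_enabled && on_cpu && all_dense) mode = Dispatch::FComputeEx;
  return mode;
}

int main() {
  // With MKLDNN built in, a CPU run over dense arrays takes the Ex path.
  return ChooseDispatch(true, true, true) == Dispatch::FComputeEx ? 0 : 1;
}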
+ std::vector in_blobs(inputs.size()); + std::vector out_blobs(outputs.size()); + for (size_t i = 0; i < in_blobs.size(); i++) + in_blobs[i] = inputs[i].data(); + for (size_t i = 0; i < out_blobs.size(); i++) + out_blobs[i] = outputs[i].data(); + ElementWiseSumCompute(attrs, ctx, in_blobs, req, out_blobs); } else { LogUnimplementedOp(attrs, ctx, inputs, req, outputs); } diff --git a/src/operator/tensor/elemwise_unary_op_basic.cc b/src/operator/tensor/elemwise_unary_op_basic.cc index 13a58d0165a8..cca3b2c9ff90 100644 --- a/src/operator/tensor/elemwise_unary_op_basic.cc +++ b/src/operator/tensor/elemwise_unary_op_basic.cc @@ -24,6 +24,7 @@ #include #include "elemwise_unary_op.h" #include "./elemwise_binary_op-inl.h" +#include "../nn/mkldnn/mkldnn_ops-inl.h" namespace mxnet { namespace op { @@ -107,12 +108,64 @@ MXNET_OPERATOR_REGISTER_BINARY_WITH_SPARSE_CPU(_backward_sigmoid, unary_bwd); // copy +static void CopyEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); + const auto in_stype = inputs[0].storage_type(); + const auto out_stype = outputs[0].storage_type(); +#if MXNET_USE_MKLDNN == 1 + if (inputs[0].IsMKLDNNData()) { + MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]); + return; + } else if (in_stype == kDefaultStorage && out_stype == kDefaultStorage) { + // This happens if inputs are supposed to be in MKLDNN format + // but MKLDNN doesn't support the data type or the shape. We're + // forced to convert it to the default format. + std::vector in_blobs {inputs[0].data()}; + std::vector out_blobs {outputs[0].data()}; + UnaryOp::IdentityCompute(attrs, ctx, in_blobs, req, out_blobs); + return; + } +#endif + UnaryOp::IdentityComputeEx(attrs, ctx, inputs, req, outputs); +} + +static inline bool CopyStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1); + CHECK_EQ(out_attrs->size(), 1); + bool ret = ElemwiseStorageType<1, 1, false, true, true>(attrs, dev_mask, dispatch_mode, + in_attrs, out_attrs); +#if MXNET_USE_MKLDNN == 1 + // We have to make sure all inputs are default layouts. Otherwise, we might + // want to fallback. 
+ if (dev_mask == mshadow::cpu::kDevMask + && in_attrs->at(0) == kDefaultStorage + && out_attrs->at(0) == kDefaultStorage) { + *dispatch_mode = DispatchMode::kFComputeEx; + } +#endif + return ret; +} + MXNET_OPERATOR_REGISTER_UNARY(_copy) .MXNET_DESCRIBE("Returns a copy of the input.") .add_alias("identity") -.set_attr("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>) +.set_attr("FInferStorageType", CopyStorageType) .set_attr("FCompute", UnaryOp::IdentityCompute) -.set_attr("FComputeEx", UnaryOp::IdentityComputeEx) +.set_attr("FComputeEx", CopyEx) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif .set_attr("FInplaceIdentity", [](const NodeAttrs& attrs){ return std::vector{true}; @@ -127,9 +180,14 @@ NNVM_REGISTER_OP(_backward_copy) [](const NodeAttrs& attrs){ return std::vector >{{0, 0}}; }) -.set_attr("FInferStorageType", ElemwiseStorageType<1, 1, false, true, true>) +.set_attr("FInferStorageType", CopyStorageType) .set_attr("FCompute", UnaryOp::IdentityCompute) -.set_attr("FComputeEx", UnaryOp::IdentityComputeEx) +.set_attr("FComputeEx", CopyEx) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif .set_attr("FInplaceIdentity", [](const NodeAttrs& attrs){ return std::vector{true}; diff --git a/src/operator/tensor/matrix_op.cc b/src/operator/tensor/matrix_op.cc index 9167fcfe7e34..25c233318f01 100644 --- a/src/operator/tensor/matrix_op.cc +++ b/src/operator/tensor/matrix_op.cc @@ -25,6 +25,8 @@ // this will be invoked by gcc and compile CPU version #include "./matrix_op-inl.h" #include "./elemwise_unary_op.h" +#include "../nn/mkldnn/mkldnn_ops-inl.h" +#include "../nn/mkldnn/mkldnn_base-inl.h" namespace mxnet { namespace op { @@ -180,6 +182,51 @@ If the argument `reverse` is set to 1, then the special values are inferred from .add_argument("data", "NDArray-or-Symbol", "Input data to reshape.") .add_arguments(ReshapeParam::__FIELDS__()); +static void FlattenEx(const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + CHECK_EQ(inputs.size(), 1U); + CHECK_EQ(outputs.size(), 1U); +#if MXNET_USE_MKLDNN == 1 + const auto in_stype = inputs[0].storage_type(); + const auto out_stype = outputs[0].storage_type(); + if (inputs[0].IsMKLDNNData()) { + MKLDNNCopy(attrs, ctx, inputs[0], req[0], outputs[0]); + // If the output is a special MKLDNN layout and the number of dimensions + // is larger than 2, we should use the default layout. + if (outputs[0].IsMKLDNNData() && inputs[0].shape().ndim() > 2) + const_cast(outputs[0]).Reorder2Default(); + return; + } else { + // This happens if inputs are supposed to be in MKLDNN format + // but MKLDNN doesn't support the data type or the shape. We're + // forced to convert it to the default format. 
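Flatten gets the same treatment, with one extra twist visible in FlattenEx above: after copying, if the result still carries an MKLDNN layout and the input had more than two dimensions, it is reordered back to the default layout, presumably because a blocked layout chosen for the original N-D activation no longer matches the collapsed 2-D view; dense inputs instead go through the FallBackCompute call that follows. A small sketch of the shape change involved (hypothetical helper, not MXNet code):

#include <cstddef>
#include <vector>

// Hypothetical helper (not MXNet code) showing what Flatten does to the
// shape: keep axis 0 and collapse the rest, e.g. (N, C, H, W) -> (N, C*H*W).
// It is this collapsed 2-D result that has to live in the default layout.
std::vector<std::size_t> FlattenShape(const std::vector<std::size_t> &shape) {
  std::size_t rest = 1;
  for (std::size_t i = 1; i < shape.size(); ++i) rest *= shape[i];
  return {shape.empty() ? std::size_t(0) : shape[0], rest};
}

int main() {
  std::vector<std::size_t> out = FlattenShape({10, 3, 224, 224});
  return out[0] == 10 && out[1] == 3 * 224 * 224 ? 0 : 1;
}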
+ FallBackCompute(UnaryOp::IdentityCompute, attrs, ctx, inputs, req, outputs); + return; + } +#endif +} + +static inline bool FlattenStorageType(const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) { + CHECK_EQ(in_attrs->size(), 1); + CHECK_EQ(out_attrs->size(), 1); + bool ret = ElemwiseStorageType<1, 1, false, false, false>(attrs, dev_mask, dispatch_mode, + in_attrs, out_attrs); +#if MXNET_USE_MKLDNN == 1 + if (dev_mask == mshadow::cpu::kDevMask + && in_attrs->at(0) == kDefaultStorage + && out_attrs->at(0) == kDefaultStorage) { + *dispatch_mode = DispatchMode::kFComputeEx; + } +#endif + return ret; +} NNVM_REGISTER_OP(Flatten) .add_alias("flatten") @@ -210,8 +257,15 @@ Example:: .set_num_outputs(1) .set_attr("FInferShape", FlattenShape) .set_attr("FInferType", ElemwiseType<1, 1>) +.set_attr("FInferStorageType", FlattenStorageType) .set_attr("FGradient", ElemwiseGradUseNone{ "_backward_copy" }) .set_attr("FCompute", UnaryOp::IdentityCompute) +.set_attr("FComputeEx", FlattenEx) +#if MXNET_USE_MKLDNN == 1 +.set_attr("FResourceRequest", [](const NodeAttrs& n) { + return std::vector{ResourceRequest::kTempSpace}; +}) +#endif .set_attr("FInplaceOption", [](const NodeAttrs& attrs) { return std::vector >{{0, 0}}; diff --git a/src/storage/cpu_device_storage.h b/src/storage/cpu_device_storage.h index f0dd61f01ac0..52df4dd2bbc8 100644 --- a/src/storage/cpu_device_storage.h +++ b/src/storage/cpu_device_storage.h @@ -54,7 +54,13 @@ class CPUDeviceStorage { /*! * \brief Alignment of allocation. */ +#if MXNET_USE_MKLDNN == 1 + // MKLDNN requires special alignment. 4096 is used by the MKLDNN library in + // memory allocation. + static constexpr size_t alignment_ = 4096; +#else static constexpr size_t alignment_ = 16; +#endif }; // class CPUDeviceStorage inline void* CPUDeviceStorage::Alloc(size_t size) { diff --git a/tests/ci_build/ci_build.sh b/tests/ci_build/ci_build.sh index 794a4c55ee45..1d6d64be3862 100755 --- a/tests/ci_build/ci_build.sh +++ b/tests/ci_build/ci_build.sh @@ -178,6 +178,7 @@ ${DOCKER_BINARY} run --rm --pid=host \ -e "CI_BUILD_GID=$(id -g)" \ -e "CUDA_ARCH=-gencode arch=compute_52,code=[sm_52,compute_52] --fatbin-options -compress-all" \ -e "MXNET_STORAGE_FALLBACK_LOG_VERBOSE=0" \ + -e "ARCH_OPT=-mavx2" \ ${CI_DOCKER_EXTRA_PARAMS[@]} \ ${DOCKER_IMG_NAME} \ ${PRE_COMMAND} \ diff --git a/tests/cpp/include/test_core_op.h b/tests/cpp/include/test_core_op.h index 6a220bdad6d7..570911c23568 100644 --- a/tests/cpp/include/test_core_op.h +++ b/tests/cpp/include/test_core_op.h @@ -209,6 +209,13 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer requested.emplace_back(r); } else if (req.type == ResourceRequest::kRandom) { requested.emplace_back(ResourceManager::Get()->Request(ctx->run_ctx.ctx, req)); + } else if (req.type == ResourceRequest::kParallelRandom) { + Resource rm = ResourceManager::Get()->Request(ctx->run_ctx.ctx, req); + if (ctx->run_ctx.ctx.dev_mask() == Context::kCPU) { + common::random::RandGenerator::AllocState( + rm.get_parallel_random()); + } + requested.emplace_back(rm); } else { LOG(FATAL) << "resource type not yet supported"; } @@ -314,7 +321,9 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer // Set up forward attrs_ = ParseAttrs(op_, args); - const int num_inputs = op_->num_inputs; + int num_inputs = op_->num_inputs; + if (op_->get_num_inputs) + num_inputs = op_->get_num_inputs(attrs_); if (!inputs.empty()) { CHECK_EQ(inputs.size(), static_cast(num_inputs)); @@ 
-340,8 +349,8 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer inputs_.reserve(num_inputs); inputs_p.reserve(num_inputs); - outputs_.reserve(num_visible_outputs); - outputs_p.reserve(num_visible_outputs); + outputs_.reserve(inferred_num_outputs); + outputs_p.reserve(inferred_num_outputs); for (size_t i = 0; i < static_cast(num_inputs); ++i) { CHECK_LT(i, static_cast(shapes.size())); @@ -350,7 +359,7 @@ class CoreOpExecutor : public test::op::OperatorDataInitializer inputs_p.emplace_back(&*inputs_.rbegin()); } - for (size_t i = 0; i < static_cast(num_visible_outputs); ++i) { + for (size_t i = 0; i < static_cast(inferred_num_outputs); ++i) { // If supplied and valid, pass from the supplied outputs vector // Otherwise use empty for forward pass, or zero-filled for backward pass outputs_.emplace_back(i < outputs.size() diff --git a/tests/cpp/include/test_op_runner.h b/tests/cpp/include/test_op_runner.h index 0992c41f760e..1e00e30a1b34 100644 --- a/tests/cpp/include/test_op_runner.h +++ b/tests/cpp/include/test_op_runner.h @@ -137,7 +137,8 @@ class OperatorRunner { const test::op::kwargs_t& kwargs, int dim = 0, size_t count = 1, - const std::vector& timing_shapes = {}) { + const std::vector& timing_shapes = {}, + bool backward = true) { if (mxnet::test::quick_test) { total_iterations_ = 2; count = 1; @@ -225,7 +226,7 @@ class OperatorRunner { CHECK(false) << "Unsupported dimension count: " << (D + 1); } if (info.executor_) { - if (info.executor_->HasBackward()) { + if (info.executor_->HasBackward() && backward) { RunGenericOperatorBackward(&info, count); } timing += info.executor_->GetTiming(); diff --git a/tests/cpp/operator/activation_perf.cc b/tests/cpp/operator/activation_perf.cc index e482848705ad..1bd8ca89c9f5 100644 --- a/tests/cpp/operator/activation_perf.cc +++ b/tests/cpp/operator/activation_perf.cc @@ -26,7 +26,7 @@ #include #include #include "../include/test_op_runner.h" -#include "../include/test_legacy_op.h" +#include "../include/test_core_op.h" #include "../../src/operator/nn/activation-inl.h" using namespace mxnet; @@ -41,8 +41,10 @@ TEST(ACTIVATION_PERF, ExecuteBidirectional) { TShape shape({5, 5}); kwargs_t kwargs = basic_activation_args; kwargs.push_back({"act_type", "tanh"}); - test::op::LegacyOpRunner runner; - runner.RunBidirectional(false, { shape }, kwargs, 1); + + test::op::CoreOperatorRunner runner; + runner.RunBidirectional(false, { shape }, test::op::CoreOpExecutor::ArgsWithOpName( + kwargs, "Activation", "_backward_Activation"), 1); } /*! 
@@ -52,10 +54,12 @@ TEST(ACTIVATION_PERF, TimingCPU) { kwargs_t kwargs = basic_activation_args; // Which math function is arbitrary since it will have roughly constant timing among approaches kwargs.push_back({"act_type", "tanh"}); - test::op::LegacyOpRunner runner; - runner.RunBidirectional(false, - { TShape({10, 10, 10, 10}) }, - kwargs, 1); // prime code and cache + kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "Activation", + "_backward_Activation"); + TShape shape({10, 10, 10, 10}); + test::op::CoreOperatorRunner runner; + runner.RunBidirectional(false, { shape }, kwargs, 1); + std::vector shapes; if (test::performance_run) { shapes = { @@ -84,11 +88,11 @@ TEST(ACTIVATION_PERF, TimingGPU) { kwargs_t kwargs = basic_activation_args; // Which math function is arbitrary since it will have roughly constant timing among approaches kwargs.push_back({"act_type", "tanh"}); - test::OperatorRunner> runner; - runner.RunBidirectional(true, - { TShape({10, 10, 10, 10}) }, - kwargs, 1); // prime code and cache + kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "Activation", + "_backward_Activation"); + TShape shape({10, 10, 10, 10}); + test::op::CoreOperatorRunner runner; + runner.RunBidirectional(true, { shape }, kwargs, 1); std::vector shapes = { {1, 1, 28, 28}, {1, 3, 28, 28}, diff --git a/tests/cpp/operator/batchnorm_test.cc b/tests/cpp/operator/batchnorm_test.cc index 179e42a3830f..607b9804684a 100644 --- a/tests/cpp/operator/batchnorm_test.cc +++ b/tests/cpp/operator/batchnorm_test.cc @@ -24,11 +24,14 @@ * \author Chris Olivier */ +#if 0 + #include #include #include "../../src/operator/nn/batch_norm-inl.h" #include "../../src/operator/batch_norm_v1-inl.h" #include "./test_legacy_op.h" +#include "./test_core_op.h" #include "executor/exec_pass.h" using namespace mxnet; @@ -1827,3 +1830,5 @@ TEST(BATCH_NORM, Test2DBackwardMixedComplex_gpu_cpu_ugs) { } #endif // MXNET_USE_CUDA + +#endif diff --git a/tests/cpp/operator/dropout_perf.cc b/tests/cpp/operator/dropout_perf.cc index 90bf6ebb0dfd..c28b9bd48097 100644 --- a/tests/cpp/operator/dropout_perf.cc +++ b/tests/cpp/operator/dropout_perf.cc @@ -26,7 +26,7 @@ #include #include #include "../include/test_op_runner.h" -#include "../include/test_legacy_op.h" +#include "../include/test_core_op.h" #include "../../src/operator/nn/dropout-inl.h" using namespace mxnet; @@ -41,8 +41,10 @@ TEST(DROPOUT_PERF, ExecuteBidirectional) { TShape shape({5, 5}); kwargs_t kwargs = basic_dropout_args; kwargs.push_back({"mode", "always"}); - test::op::LegacyOpRunner runner; - runner.RunBidirectional(false, { shape }, kwargs, 1); + test::op::CoreOperatorRunner runner; + kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "Dropout", + "_backward_Dropout"); + runner.RunGenericOperatorForward(false, { shape }, kwargs, 1); } /*! 
@@ -52,10 +54,11 @@ TEST(DROPOUT_PERF, TimingCPU) { kwargs_t kwargs = basic_dropout_args; // Which math function is arbitrary since it will have roughly constant timing among approaches kwargs.push_back({"mode", "always"}); - test::op::LegacyOpRunner runner; - runner.RunBidirectional(false, - { TShape({10, 10, 10, 10}) }, - kwargs, 1); // prime code and cache + TShape shape({10, 10, 10, 10}); + test::op::CoreOperatorRunner runner; + kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "Dropout", + "_backward_Dropout"); + runner.RunGenericOperatorForward(false, { shape }, kwargs, 1); std::vector shapes; if (test::performance_run) { shapes = { @@ -72,7 +75,9 @@ TEST(DROPOUT_PERF, TimingCPU) { }; } for (const TShape &shape : shapes) { - runner.TimingTest("Dropout Operator CPU", false, false, kwargs, 2, 10, { shape }); + kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "Dropout", + "_backward_Dropout"); + runner.TimingTest("Dropout Operator CPU", false, false, kwargs, 2, 10, { shape }, false); } } @@ -84,11 +89,11 @@ TEST(DROPOUT_PERF, TimingGPU) { kwargs_t kwargs = basic_dropout_args; // Which math function is arbitrary since it will have roughly constant timing among approaches kwargs.push_back({"mode", "always"}); - test::OperatorRunner> runner; - runner.RunBidirectional(true, - { TShape({10, 10, 10, 10}) }, - kwargs, 1); // prime code and cache + TShape shape({10, 10, 10, 10}); + test::op::CoreOperatorRunner runner; + kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "Dropout", + "_backward_Dropout"); + runner.RunGenericOperatorForward(true, { shape }, kwargs, 1); std::vector shapes = { {1, 1, 28, 28}, {1, 3, 28, 28}, @@ -97,8 +102,9 @@ TEST(DROPOUT_PERF, TimingGPU) { {20, 3, 128, 128} }; for (const TShape &shape : shapes) { - runner.TimingTest("Dropout Operator GPU", true, false, kwargs, 2, 10, { shape }); + kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "Dropout", + "_backward_Dropout"); + runner.TimingTest("Dropout Operator GPU", true, false, kwargs, 2, 10, { shape }, false); } } #endif // MXNET_USE_CUDA == 1 - diff --git a/tests/cpp/operator/fully_conn_perf.cc b/tests/cpp/operator/fully_conn_perf.cc index c8d8021f6f6e..829c20385ab5 100644 --- a/tests/cpp/operator/fully_conn_perf.cc +++ b/tests/cpp/operator/fully_conn_perf.cc @@ -28,21 +28,25 @@ #include #include "../../src/operator/nn/fully_connected-inl.h" #include "../include/test_op_runner.h" -#include "../include/test_legacy_op.h" +#include "../include/test_core_op.h" using namespace mxnet; typedef std::vector > kwargs_t; -const kwargs_t basic_fullyconn_args = { {"num_hidden", "250"} }; +const kwargs_t basic_fullyconn_args = { {"num_hidden", "250"}, {"no_bias", "true"} }; /*! * \brief Generic bidirectional sanity test */ TEST(FULLY_CONNECTED, ExecuteBidirectionalFullyConnected) { - TShape shape({5, 5}); + TShape shape1({5, 5}); + TShape shape2({250, 5}); kwargs_t kwargs = basic_fullyconn_args; - test::op::LegacyOpRunner runner; - runner.RunBidirectional(false, { shape }, kwargs, 1); + test::op::CoreOperatorRunner runner; + runner.set_verbose(true); + kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "FullyConnected", + "_backward_FullyConnected"); + runner.RunGenericOperatorForward(false, { shape1, shape2 }, kwargs, 1); } /*! 
@@ -50,10 +54,12 @@ TEST(FULLY_CONNECTED, ExecuteBidirectionalFullyConnected) { */ TEST(FULLY_CONNECTED, FullyConnectedTimingCPU) { kwargs_t kwargs = basic_fullyconn_args; - test::op::LegacyOpRunner runner; - runner.RunBidirectional(false, - { TShape({10, 10, 10, 10}) }, - kwargs, 1); // prime code and cache + TShape shape1({10, 10, 10, 10}); + TShape shape2({250, 1000}); + test::op::CoreOperatorRunner runner; + kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "FullyConnected", + "_backward_FullyConnected"); + runner.RunGenericOperatorForward(false, { shape1, shape2 }, kwargs, 1); std::vector shapes; if (test::performance_run) { shapes = { @@ -70,7 +76,11 @@ TEST(FULLY_CONNECTED, FullyConnectedTimingCPU) { }; } for (const TShape& shape : shapes) { - runner.TimingTest("Fully connected CPU", false, false, kwargs, 2, 10, { shape }); + TShape shape2({250, shape.ProdShape(1, shape.ndim())}); + kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "FullyConnected", + "_backward_FullyConnected"); + runner.TimingTest("Fully connected CPU", false, false, kwargs, 2, 10, + { shape, shape2 }, false); } } @@ -80,12 +90,12 @@ TEST(FULLY_CONNECTED, FullyConnectedTimingCPU) { */ TEST(FULLY_CONNECTED, FullyConnectedTimingGPU) { kwargs_t kwargs = basic_fullyconn_args; - test::OperatorRunner> - runner; - runner.RunBidirectional(true, - { TShape({10, 10, 10, 10}) }, - kwargs, 1); // prime code and cache + TShape shape1({10, 10, 10, 10}); + TShape shape2({250, 1000}); + test::op::CoreOperatorRunner runner; + kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "FullyConnected", + "_backward_FullyConnected"); + runner.RunGenericOperatorForward(true, { shape1, shape2 }, kwargs, 1); std::vector shapes; if (test::performance_run) { shapes = { @@ -102,7 +112,11 @@ TEST(FULLY_CONNECTED, FullyConnectedTimingGPU) { }; } for (const TShape& shape : shapes) { - runner.TimingTest("Fully connected GPU", true, false, kwargs, 2, 10, { shape }); + TShape shape2({250, shape.ProdShape(1, shape.ndim())}); + kwargs = test::op::CoreOpExecutor::ArgsWithOpName(kwargs, "FullyConnected", + "_backward_FullyConnected"); + runner.TimingTest("Fully connected GPU", true, false, kwargs, 2, 10, + { shape, shape2 }, false); } } #endif // MXNET_USE_CUDA == 1 diff --git a/tests/cpp/operator/mkldnn.cc b/tests/cpp/operator/mkldnn.cc new file mode 100644 index 000000000000..a8a3d26fac3d --- /dev/null +++ b/tests/cpp/operator/mkldnn.cc @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file mkldnn.cc + * \brief test functions in mkldnn. 
+ * \author Da Zheng + */ + +#if MXNET_USE_MKLDNN == 1 + +#include "gtest/gtest.h" +#include "../../src/operator/nn/mkldnn/mkldnn_base-inl.h" + +bool test_mem_align(void *mem, size_t size, size_t alignment, size_t space) { + void *ret1, *ret2; + size_t space1, space2; + space1 = space; + space2 = space; + ret1 = mxnet::AlignMem(mem, size, alignment, &space1); + ret2 = std::align(alignment, size, mem, space2); + EXPECT_EQ(ret1, ret2); + EXPECT_EQ(space1, space2); + return ret1 == ret2; +} + +TEST(MKLDNN_UTIL_FUNC, AlignMem) { + size_t alignment = 4096; + void *mem; + size_t size, space; + + // When mem has been aligned. + mem = reinterpret_cast(0x10000); + size = 1000; + space = 10000; + test_mem_align(mem, size, alignment, space); + + // When mem isn't aligned and we have enough space for alignment. + mem = reinterpret_cast(0x10010); + size = 1000; + space = 10000; + test_mem_align(mem, size, alignment, space); + + // When mem isn't aligned and we don't have enough memory for alignment + mem = reinterpret_cast(0x10010); + size = 1000; + space = 1001; + test_mem_align(mem, size, alignment, space); + + for (size_t i = 0; i < 10000; i++) { + mem = reinterpret_cast(random()); + size = random() % 2000; + space = random() % 2000; + test_mem_align(mem, size, alignment, space); + } +} +#endif diff --git a/tests/python/gpu/test_gluon_model_zoo_gpu.py b/tests/python/gpu/test_gluon_model_zoo_gpu.py new file mode 100644 index 000000000000..bc35b0b32327 --- /dev/null +++ b/tests/python/gpu/test_gluon_model_zoo_gpu.py @@ -0,0 +1,163 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
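Two pieces above tie together here: CPUDeviceStorage now aligns CPU allocations to 4096 bytes when MKLDNN is enabled, and the new mkldnn.cc test checks mxnet::AlignMem against std::align for exactly that alignment. A standalone illustration of the std::align side of the comparison, using nothing beyond the standard library:

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <memory>
#include <vector>

// Standalone illustration of what the AlignMem test above compares against:
// std::align bumps a pointer forward to the next 4096-byte boundary (MKLDNN's
// allocation alignment) as long as enough space remains in the buffer.
int main() {
  constexpr std::size_t alignment = 4096;
  std::vector<unsigned char> buffer(3 * alignment);
  void *ptr = buffer.data() + 10;               // deliberately misaligned
  std::size_t space = buffer.size() - 10;
  void *aligned = std::align(alignment, 1000, ptr, space);
  bool ok = aligned != nullptr &&
            reinterpret_cast<std::uintptr_t>(aligned) % alignment == 0;
  std::cout << "aligned to 4096: " << ok << "\n";
  return ok ? 0 : 1;
}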
+ +from __future__ import print_function +import mxnet as mx +import numpy as np +import copy +from mxnet import autograd +from mxnet.gluon.model_zoo.vision import get_model +from mxnet.test_utils import assert_almost_equal +import sys + +def eprint(*args, **kwargs): + print(*args, file=sys.stderr, **kwargs) + +VAL_DATA='data/val-5k-256.rec' +def download_data(): + return mx.test_utils.download( + 'http://data.mxnet.io/data/val-5k-256.rec', VAL_DATA) + +def test_inference(): + all_models = ['resnet50_v1', 'vgg19_bn', 'alexnet', #'inceptionv3', + 'densenet201', 'squeezenet1.0', 'mobilenet0.25'] + + batch_size = 10 + download_data() + for model_name in all_models: + eprint('testing inference on %s'%model_name) + + data_shape = (3, 224, 224) if 'inception' not in model_name else (3, 299, 299) + dataIter = mx.io.ImageRecordIter( + path_imgrec = VAL_DATA, + label_width = 1, + preprocess_threads = 1, + batch_size = batch_size, + data_shape = data_shape, + label_name = 'softmax_label', + rand_crop = False, + rand_mirror = False) + data_batch = dataIter.next() + data = data_batch.data[0] + label = data_batch.label[0] + gpu_data = data.as_in_context(mx.gpu()) + gpu_label = label.as_in_context(mx.gpu()) + + # This is to create a model and run the model once to initialize + # all parameters. + cpu_model = get_model(model_name) + cpu_model.collect_params().initialize(ctx=mx.cpu()) + cpu_model(mx.nd.array(data, ctx=mx.cpu())) + gpu_model = get_model(model_name) + gpu_model.collect_params().initialize(ctx=mx.gpu()) + gpu_model(mx.nd.array(data, ctx=mx.gpu())) + + # Force the two models have the same parameters. + cpu_params = cpu_model.collect_params() + gpu_params = gpu_model.collect_params() + for k in cpu_params.keys(): + k = k.replace(cpu_params.prefix, '') + cpu_param = cpu_params.get(k) + gpu_param = gpu_params.get(k) + gpu_param.set_data(cpu_param.data().as_in_context(mx.gpu())) + + # Run inference. + with autograd.record(train_mode=False): + cpu_out = cpu_model(mx.nd.array(data, ctx=mx.cpu())) + gpu_out = gpu_model(gpu_data) + out = cpu_out.asnumpy() + max_val = np.max(out) + assert_almost_equal(out / max_val, gpu_out.asnumpy() / max_val, rtol=1e-2, atol=1e-2) + +def get_nn_model(name): + if "densenet" in name: + return get_model(name, dropout=0) + else: + return get_model(name) + +def test_training(): + # We use network models without dropout for testing. + # TODO(zhengda) mobilenet can't pass this test even without MKLDNN. + all_models = ['resnet18_v1', 'densenet121'] + + batch_size = 10 + label = mx.nd.random.uniform(low=0, high=10, shape=(batch_size)).astype('int32') + + download_data() + dataIter = mx.io.ImageRecordIter( + path_imgrec = VAL_DATA, + label_width = 1, + preprocess_threads = 1, + batch_size = batch_size, + data_shape = (3, 224, 224), + label_name = 'softmax_label', + rand_crop = False, + rand_mirror = False) + data_batch = dataIter.next() + data = data_batch.data[0] + label = data_batch.label[0] + gpu_data = data.as_in_context(mx.gpu()) + gpu_label = label.as_in_context(mx.gpu()) + softmax_cross_entropy = mx.gluon.loss.SoftmaxCrossEntropyLoss() + + for model_name in all_models: + eprint('testing %s'%model_name) + #data = mx.nd.random.uniform(shape=(100, 3, 224, 224)) + + # This is to create a model and run the model once to initialize + # all parameters. 
+ cpu_model = get_nn_model(model_name) + cpu_model.collect_params().initialize(ctx=mx.cpu()) + cpu_model(mx.nd.array(data, ctx=mx.cpu())) + gpu_model = get_nn_model(model_name) + gpu_model.collect_params().initialize(ctx=mx.gpu()) + gpu_model(mx.nd.array(data, ctx=mx.gpu())) + + # Force the two models have the same parameters. + cpu_params = cpu_model.collect_params() + gpu_params = gpu_model.collect_params() + for k in cpu_params.keys(): + k = k.replace(cpu_params.prefix, '') + cpu_param = cpu_params.get(k) + gpu_param = gpu_params.get(k) + gpu_param.set_data(cpu_param.data().as_in_context(mx.gpu())) + + cpu_trainer = mx.gluon.Trainer(cpu_params, 'sgd', {'learning_rate': 0.1}) + gpu_trainer = mx.gluon.Trainer(gpu_params, 'sgd', {'learning_rate': 0.1}) + + # Run forward and backward once. + with autograd.record(): + cpu_out = cpu_model(mx.nd.array(data, ctx=mx.cpu())) + gpu_out = gpu_model(gpu_data) + cpu_loss = softmax_cross_entropy(cpu_out, label) + gpu_loss = softmax_cross_entropy(gpu_out, gpu_label) + assert_almost_equal(cpu_out.asnumpy(), gpu_out.asnumpy(), rtol=1e-2, atol=1e-2) + cpu_loss.backward() + gpu_loss.backward() + cpu_trainer.step(batch_size) + gpu_trainer.step(batch_size) + + # Compare the parameters of the two models. + for k in cpu_params.keys(): + k = k.replace(cpu_params.prefix, '') + cpu_param = cpu_params.get(k) + gpu_param = gpu_params.get(k) + assert_almost_equal(cpu_param.data().asnumpy(), gpu_param.data().asnumpy(), rtol=1e-2, atol=1e-2) + +if __name__ == '__main__': + import nose + nose.runmodule() diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 52aca091d864..a566b950cc48 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -985,6 +985,13 @@ def test_activation_with_type(): check_consistency(sym, ctx_list) +def test_lrn(): + sym = mx.sym.LRN(alpha=0.0001, beta=0.75, knorm=2, nsize=5, name='lrn') + ctx_list = [{'ctx': mx.gpu(0), 'lrn_data': (2, 6, 10, 10), 'type_dict': {'lrn_data': np.float32}}, + {'ctx': mx.cpu(0), 'lrn_data': (2, 6, 10, 10), 'type_dict': {'lrn_data': np.float32}}] + check_consistency(sym, ctx_list) + + def test_embedding_with_type(): def test_embedding_helper(data_types, weight_types, low_pad, high_pad): NVD = [[20, 10, 20], [200, 10, 300]]